linux-old/net/ipv4/tcp.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 * Version:     $Id: tcp.c,v 1.139 1999/03/17 19:30:34 davem Exp $
   9 *
  10 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14 *              Florian La Roche, <flla@stud.uni-sb.de>
  15 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18 *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20 *              Jorge Cwik, <jorge@laser.satlink.net>
  21 *
  22 * Fixes:
  23 *              Alan Cox        :       Numerous verify_area() calls
  24 *              Alan Cox        :       Set the ACK bit on a reset
  25 *              Alan Cox        :       Stopped it crashing if it closed while
  26 *                                      sk->inuse=1 and was trying to connect
  27 *                                      (tcp_err()).
  28 *              Alan Cox        :       All icmp error handling was broken
  29 *                                      pointers passed were wrong and the
  30 *                                      socket was looked up backwards. Nobody
  31 *                                      tested any icmp error code obviously.
  32 *              Alan Cox        :       tcp_err() now handled properly. It
  33 *                                      wakes people on errors. poll
  34 *                                      behaves and the icmp error race
  35 *                                      has gone by moving it into sock.c
  36 *              Alan Cox        :       tcp_send_reset() fixed to work for
  37 *                                      everything not just packets for
  38 *                                      unknown sockets.
  39 *              Alan Cox        :       tcp option processing.
  40 *              Alan Cox        :       Reset tweaked (still not 100%) [Had
  41 *                                      syn rule wrong]
  42 *              Herp Rosmanith  :       More reset fixes
  43 *              Alan Cox        :       No longer acks invalid rst frames.
  44 *                                      Acking any kind of RST is right out.
  45 *              Alan Cox        :       Sets an ignore me flag on an rst
  46 *                                      receive otherwise odd bits of prattle
  47 *                                      escape still
  48 *              Alan Cox        :       Fixed another acking RST frame bug.
  49 *                                      Should stop LAN workplace lockups.
  50 *              Alan Cox        :       Some tidyups using the new skb list
  51 *                                      facilities
  52 *              Alan Cox        :       sk->keepopen now seems to work
  53 *              Alan Cox        :       Pulls options out correctly on accepts
  54 *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55 *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56 *                                      bit to skb ops.
  57 *              Alan Cox        :       Tidied tcp_data to avoid a potential
  58 *                                      nasty.
  59 *              Alan Cox        :       Added some better commenting, as the
  60 *                                      tcp is hard to follow
  61 *              Alan Cox        :       Removed incorrect check for 20 * psh
  62 *      Michael O'Reilly        :       ack < copied bug fix.
  63 *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64 *              Alan Cox        :       FIN with no memory -> CRASH
  65 *              Alan Cox        :       Added socket option proto entries.
  66 *                                      Also added awareness of them to accept.
  67 *              Alan Cox        :       Added TCP options (SOL_TCP)
  68 *              Alan Cox        :       Switched wakeup calls to callbacks,
  69 *                                      so the kernel can layer network
  70 *                                      sockets.
  71 *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72 *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73 *              Alan Cox        :       RST frames sent on unsynchronised
  74 *                                      state ack error.
  75 *              Alan Cox        :       Put in missing check for SYN bit.
  76 *              Alan Cox        :       Added tcp_select_window() aka NET2E
  77 *                                      window non shrink trick.
  78 *              Alan Cox        :       Added a couple of small NET2E timer
  79 *                                      fixes
  80 *              Charles Hedrick :       TCP fixes
  81 *              Toomas Tamm     :       TCP window fixes
  82 *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83 *              Charles Hedrick :       Rewrote most of it to actually work
  84 *              Linus           :       Rewrote tcp_read() and URG handling
  85 *                                      completely
  86 *              Gerhard Koerting:       Fixed some missing timer handling
  87 *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88 *              Gerhard Koerting:       PC/TCP workarounds
  89 *              Adam Caldwell   :       Assorted timer/timing errors
  90 *              Matthew Dillon  :       Fixed another RST bug
  91 *              Alan Cox        :       Move to kernel side addressing changes.
  92 *              Alan Cox        :       Beginning work on TCP fastpathing
  93 *                                      (not yet usable)
  94 *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95 *              Alan Cox        :       TCP fast path debugging
  96 *              Alan Cox        :       Window clamping
  97 *              Michael Riepe   :       Bug in tcp_check()
  98 *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99 *              Matt Dillon     :       Yet more small nasties remove from the
 100 *                                      TCP code (Be very nice to this man if
 101 *                                      tcp finally works 100%) 8)
 102 *              Alan Cox        :       BSD accept semantics.
 103 *              Alan Cox        :       Reset on closedown bug.
 104 *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105 *              Michael Pall    :       Handle poll() after URG properly in
 106 *                                      all cases.
 107 *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 108 *                                      (multi URG PUSH broke rlogin).
 109 *              Michael Pall    :       Fix the multi URG PUSH problem in
 110 *                                      tcp_readable(), poll() after URG
 111 *                                      works now.
 112 *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 113 *                                      BSD api.
 114 *              Alan Cox        :       Changed the semantics of sk->socket to
 115 *                                      fix a race and a signal problem with
 116 *                                      accept() and async I/O.
 117 *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118 *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119 *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120 *                                      clients/servers which listen in on
 121 *                                      fixed ports.
 122 *              Alan Cox        :       Cleaned the above up and shrank it to
 123 *                                      a sensible code size.
 124 *              Alan Cox        :       Self connect lockup fix.
 125 *              Alan Cox        :       No connect to multicast.
 126 *              Ross Biro       :       Close unaccepted children on master
 127 *                                      socket close.
 128 *              Alan Cox        :       Reset tracing code.
 129 *              Alan Cox        :       Spurious resets on shutdown.
 130 *              Alan Cox        :       Giant 15 minute/60 second timer error
 131 *              Alan Cox        :       Small whoops in polling before an
 132 *                                      accept.
 133 *              Alan Cox        :       Kept the state trace facility since
 134 *                                      it's handy for debugging.
 135 *              Alan Cox        :       More reset handler fixes.
 136 *              Alan Cox        :       Started rewriting the code based on
 137 *                                      the RFC's for other useful protocol
 138 *                                      references see: Comer, KA9Q NOS, and
 139 *                                      for a reference on the difference
 140 *                                      between specifications and how BSD
 141 *                                      works see the 4.4lite source.
 142 *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 143 *                                      close.
 144 *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145 *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146 *              Alan Cox        :       Reimplemented timers as per the RFC
 147 *                                      and using multiple timers for sanity.
 148 *              Alan Cox        :       Small bug fixes, and a lot of new
 149 *                                      comments.
 150 *              Alan Cox        :       Fixed dual reader crash by locking
 151 *                                      the buffers (much like datagram.c)
 152 *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153 *                                      now gets fed up of retrying without
 154 *                                      (even a no space) answer.
 155 *              Alan Cox        :       Extracted closing code better
 156 *              Alan Cox        :       Fixed the closing state machine to
 157 *                                      resemble the RFC.
 158 *              Alan Cox        :       More 'per spec' fixes.
 159 *              Jorge Cwik      :       Even faster checksumming.
 160 *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161 *                                      only frames. At least one pc tcp stack
 162 *                                      generates them.
 163 *              Alan Cox        :       Cache last socket.
 164 *              Alan Cox        :       Per route irtt.
 165 *              Matt Day        :       poll()->select() match BSD precisely on error
 166 *              Alan Cox        :       New buffers
 167 *              Marc Tamsky     :       Various sk->prot->retransmits and
 168 *                                      sk->retransmits misupdating fixed.
 169 *                                      Fixed tcp_write_timeout: stuck close,
 170 *                                      and TCP syn retries gets used now.
 171 *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172 *                                      ack if state is TCP_CLOSED.
 173 *              Alan Cox        :       Look up device on a retransmit - routes may
 174 *                                      change. Doesn't yet cope with MSS shrink right
 175 *                                      but its a start!
 176 *              Marc Tamsky     :       Closing in closing fixes.
 177 *              Mike Shaver     :       RFC1122 verifications.
 178 *              Alan Cox        :       rcv_saddr errors.
 179 *              Alan Cox        :       Block double connect().
 180 *              Alan Cox        :       Small hooks for enSKIP.
 181 *              Alexey Kuznetsov:       Path MTU discovery.
 182 *              Alan Cox        :       Support soft errors.
 183 *              Alan Cox        :       Fix MTU discovery pathological case
 184 *                                      when the remote claims no mtu!
 185 *              Marc Tamsky     :       TCP_CLOSE fix.
 186 *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 187 *                                      window but wrong (fixes NT lpd problems)
 188 *              Pedro Roque     :       Better TCP window handling, delayed ack.
 189 *              Joerg Reuter    :       No modification of locked buffers in
 190 *                                      tcp_do_retransmit()
 191 *              Eric Schenk     :       Changed receiver side silly window
 192 *                                      avoidance algorithm to BSD style
 193 *                                      algorithm. This doubles throughput
 194 *                                      against machines running Solaris,
 195 *                                      and seems to result in general
 196 *                                      improvement.
 197 *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 198 *      Willy Konynenberg       :       Transparent proxying support.
 199 *      Mike McLagan            :       Routing by source
 200 *              Keith Owens     :       Do proper merging with partial SKB's in
 201 *                                      tcp_do_sendmsg to avoid burstiness.
 202 *              Eric Schenk     :       Fix fast close down bug with
 203 *                                      shutdown() followed by close().
 204 *              Andi Kleen :    Make poll agree with SIGIO
 205 *                                      
 206 *              This program is free software; you can redistribute it and/or
 207 *              modify it under the terms of the GNU General Public License
 208 *              as published by the Free Software Foundation; either version
 209 *              2 of the License, or(at your option) any later version.
 210 *
 211 * Description of States:
 212 *
 213 *      TCP_SYN_SENT            sent a connection request, waiting for ack
 214 *
 215 *      TCP_SYN_RECV            received a connection request, sent ack,
 216 *                              waiting for final ack in three-way handshake.
 217 *
 218 *      TCP_ESTABLISHED         connection established
 219 *
 220 *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 221 *                              transmission of remaining buffered data
 222 *
 223 *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 224 *                              to shutdown
 225 *
 226 *      TCP_CLOSING             both sides have shutdown but we still have
 227 *                              data we have to finish sending
 228 *
 229 *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 230 *                              closed, can only be entered from FIN_WAIT2
 231 *                              or CLOSING.  Required because the other end
 232 *                              may not have gotten our last ACK causing it
 233 *                              to retransmit the data packet (which we ignore)
 234 *
 235 *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 236 *                              us to finish writing our data and to shutdown
 237 *                              (we have to close() to move on to LAST_ACK)
 238 *
 239 *      TCP_LAST_ACK            our side has shutdown after remote has
 240 *                              shutdown.  There may still be data in our
 241 *                              buffer that we have to finish sending
 242 *
 243 *      TCP_CLOSE               socket is finished
 244 */
 245
 246/*
 247 * RFC1122 status:
 248 * NOTE: I'm not going to be doing comments in the code for this one except
 249 * for violations and the like.  tcp.c is just too big... If I say something
 250 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 251 * with Alan. -- MS 950903
 252 * [Note: Most of the TCP code has been rewritten/redesigned since this 
 253 *  RFC1122 check. It is probably not correct anymore. It should be redone 
 254 *  before 2.2. -AK]
 255 *
 256 * Use of PSH (4.2.2.2)
 257 *   MAY aggregate data sent without the PSH flag. (does)
 258 *   MAY queue data received without the PSH flag. (does)
 259 *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 260 *   MAY implement PSH on send calls. (doesn't, thus:)
 261 *     MUST NOT buffer data indefinitely (doesn't [1 second])
 262 *     MUST set PSH on last segment (does)
 263 *   MAY pass received PSH to application layer (doesn't)
 264 *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 265 *
 266 * Window Size (4.2.2.3, 4.2.2.16)
 267 *   MUST treat window size as an unsigned number (does)
 268 *   SHOULD treat window size as a 32-bit number (does not)
 269 *   MUST NOT shrink window once it is offered (does not normally)
 270 *
 271 * Urgent Pointer (4.2.2.4)
 272 * **MUST point urgent pointer to last byte of urgent data (not right
 273 *     after). (doesn't, to be like BSD. That's configurable, but defaults
 274 *      to off)
 275 *   MUST inform application layer asynchronously of incoming urgent
 276 *     data. (does)
 277 *   MUST provide application with means of determining the amount of
 278 *     urgent data pending. (does)
 279 * **MUST support urgent data sequence of arbitrary length. (doesn't, but
 280 *   it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 281 *      [Follows BSD 1 byte of urgent data]
 282 *
 283 * TCP Options (4.2.2.5)
 284 *   MUST be able to receive TCP options in any segment. (does)
 285 *   MUST ignore unsupported options (does)
 286 *
 287 * Maximum Segment Size Option (4.2.2.6)
 288 *   MUST implement both sending and receiving MSS. (does, but currently
 289 *      only uses the smaller of both of them)
 290 *   SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 291 *     it always). (does, even when MSS == 536, which is legal)
 292 *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 293 *   MUST calculate "effective send MSS" correctly:
 294 *     min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 295 *     (does - but allows operator override)
 296 *
 297 * TCP Checksum (4.2.2.7)
 298 *   MUST generate and check TCP checksum. (does)
 299 *
 300 * Initial Sequence Number Selection (4.2.2.8)
 301 *   MUST use the RFC 793 clock selection mechanism.  (doesn't, but it's
 302 *     OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 303 *     necessary for 10Mbps networks - and harder than BSD to spoof!
 304 *     With syncookies we don't)
 305 *
 306 * Simultaneous Open Attempts (4.2.2.10)
 307 *   MUST support simultaneous open attempts (does)
 308 *
 309 * Recovery from Old Duplicate SYN (4.2.2.11)
 310 *   MUST keep track of active vs. passive open (does)
 311 *
 312 * RST segment (4.2.2.12)
 313 *   SHOULD allow an RST segment to contain data (does, but doesn't do
 314 *     anything with it, which is standard)
 315 *
 316 * Closing a Connection (4.2.2.13)
 317 *   MUST inform application of whether connection was closed by RST or
 318 *     normal close. (does)
 319 *   MAY allow "half-duplex" close (treat connection as closed for the
 320 *     local app, even before handshake is done). (does)
 321 *   MUST linger in TIME_WAIT for 2 * MSL (does)
 322 *
 323 * Retransmission Timeout (4.2.2.15)
 324 *   MUST implement Jacobson's slow start and congestion avoidance
 325 *     stuff. (does)
 326 *
 327 * Probing Zero Windows (4.2.2.17)
 328 *   MUST support probing of zero windows. (does)
 329 *   MAY keep offered window closed indefinitely. (does)
 330 *   MUST allow remote window to stay closed indefinitely. (does)
 331 *
 332 * Passive Open Calls (4.2.2.18)
 333 *   MUST NOT let new passive open affect other connections. (doesn't)
 334 *   MUST support passive opens (LISTENs) concurrently. (does)
 335 *
 336 * Time to Live (4.2.2.19)
 337 *   MUST make TCP TTL configurable. (does - IP_TTL option)
 338 *
 339 * Event Processing (4.2.2.20)
 340 *   SHOULD queue out-of-order segments. (does)
 341 *   MUST aggregate ACK segments whenever possible. (does but badly)
 342 *
 343 * Retransmission Timeout Calculation (4.2.3.1)
 344 *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 345 *     calculation. (does, or at least explains them in the comments 8*b)
 346 *  SHOULD initialize RTT to 0 and RTO to 3 seconds. (does)
 347 *
 348 * When to Send an ACK Segment (4.2.3.2)
 349 *   SHOULD implement delayed ACK. (does)
 350 *   MUST keep ACK delay < 0.5 sec. (does)
 351 *
 352 * When to Send a Window Update (4.2.3.3)
 353 *   MUST implement receiver-side SWS. (does)
 354 *
 355 * When to Send Data (4.2.3.4)
 356 *   MUST implement sender-side SWS. (does)
 357 *   SHOULD implement Nagle algorithm. (does)
 358 *
 359 * TCP Connection Failures (4.2.3.5)
 360 *  MUST handle excessive retransmissions "properly" (see the RFC). (does)
 361 *   SHOULD inform application layer of soft errors. (does)
 362 *
 363 * TCP Keep-Alives (4.2.3.6)
 364 *   MAY provide keep-alives. (does)
 365 *   MUST make keep-alives configurable on a per-connection basis. (does)
 366 *   MUST default to no keep-alives. (does)
 367 *   MUST make keep-alive interval configurable. (does)
 368 *   MUST make default keep-alive interval > 2 hours. (does)
 369 *   MUST NOT interpret failure to ACK keep-alive packet as dead
 370 *     connection. (doesn't)
 371 *   SHOULD send keep-alive with no data. (does)
 372 *
 373 * TCP Multihoming (4.2.3.7)
 374 *   MUST get source address from IP layer before sending first
 375 *     SYN. (does)
 376 *   MUST use same local address for all segments of a connection. (does)
 377 *
 378 * IP Options (4.2.3.8)
 379 *   MUST ignore unsupported IP options. (does)
 380 *   MAY support Time Stamp and Record Route. (does)
 381 *   MUST allow application to specify a source route. (does)
 382 *   MUST allow received Source Route option to set route for all future
 383 *     segments on this connection. (does not (security issues))
 384 *
 385 * ICMP messages (4.2.3.9)
 386 *   MUST act on ICMP errors. (does)
 387 *   MUST slow transmission upon receipt of a Source Quench. (doesn't anymore 
 388 *   because that is deprecated now by the IETF, can be turned on)
 389 *   MUST NOT abort connection upon receipt of soft Destination
 390 *     Unreachables (0, 1, 5), Time Exceededs and Parameter
 391 *     Problems. (doesn't)
 392 *   SHOULD report soft Destination Unreachables etc. to the
 393 *     application. (does, except during SYN_RECV and may drop messages
 394 *     in some rare cases before accept() - ICMP is unreliable) 
 395 *   SHOULD abort connection upon receipt of hard Destination Unreachable
 396 *     messages (2, 3, 4). (does, but see above)
 397 *
 398 * Remote Address Validation (4.2.3.10)
 399 *   MUST reject as an error OPEN for invalid remote IP address. (does)
 400 *   MUST ignore SYN with invalid source address. (does)
 401 *   MUST silently discard incoming SYN for broadcast/multicast
 402 *     address. (does)
 403 *
 404 * Asynchronous Reports (4.2.4.1)
 405 * MUST provide mechanism for reporting soft errors to application
 406 *     layer. (does)
 407 *
 408 * Type of Service (4.2.4.2)
 409 *   MUST allow application layer to set Type of Service. (does IP_TOS)
 410 *
 411 * (Whew. -- MS 950903)
 412 * (Updated by AK, but not complete yet.)
 413 **/
 414
 415#include <linux/types.h>
 416#include <linux/fcntl.h>
 417#include <linux/poll.h>
 418#include <linux/init.h>
 419
 420#include <net/icmp.h>
 421#include <net/tcp.h>
 422
 423#include <asm/uaccess.h>
 424
 425int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 426
 427struct tcp_mib  tcp_statistics;
 428
 429kmem_cache_t *tcp_openreq_cachep;
 430kmem_cache_t *tcp_bucket_cachep;
 431kmem_cache_t *tcp_timewait_cachep;
 432
 433/*
 434 *      Find someone to 'accept'. Must be called with
 435 *      the socket locked or with interrupts disabled
 436 */
 437
 438static struct open_request *tcp_find_established(struct tcp_opt *tp, 
 439                                                 struct open_request **prevp)
 440{
 441        struct open_request *req = tp->syn_wait_queue;
 442        struct open_request *prev = (struct open_request *)&tp->syn_wait_queue; 
 443        while(req) {
 444                if (req->sk && 
 445                    ((1 << req->sk->state) &
 446                     ~(TCPF_SYN_SENT|TCPF_SYN_RECV)))
 447                        break;
 448                prev = req; 
 449                req = req->dl_next;
 450        }
 451        *prevp = prev; 
 452        return req;
 453}
 454
 455/*
 456 *      Walk down the receive queue counting readable data.
 457 *
 458 *      Must be called with the socket lock held.
 459 */
 460
 461static int tcp_readable(struct sock *sk)
 462{
 463        unsigned long counted;
 464        unsigned long amount;
 465        struct sk_buff *skb;
 466        int sum;
 467
 468        SOCK_DEBUG(sk, "tcp_readable: %p - ",sk);
 469
 470        skb = skb_peek(&sk->receive_queue);
 471        if (skb == NULL) {
 472                SOCK_DEBUG(sk, "empty\n");
 473                return(0);
 474        }
 475
 476        counted = sk->tp_pinfo.af_tcp.copied_seq;       /* Where we are at the moment */
 477        amount = 0;
 478
 479        /* Do until a push or until we are out of data. */
 480        do {
 481                /* Found a hole so stops here. */
 482                if (before(counted, TCP_SKB_CB(skb)->seq))      /* should not happen */
 483                        break;
 484
 485                /* Length - header but start from where we are up to
 486                 * avoid overlaps.
 487                 */
 488                sum = skb->len - (counted - TCP_SKB_CB(skb)->seq);
 489                if (sum >= 0) {
 490                        /* Add it up, move on. */
 491                        amount += sum;
 492                        counted += sum;
 493                        if (skb->h.th->syn)
 494                                counted++;
 495                }
 496
 497                /* Don't count urg data ... but do it in the right place!
 498                 * Consider: "old_data (ptr is here) URG PUSH data"
 499                 * The old code would stop at the first push because
 500                 * it counted the urg (amount==1) and then does amount--
 501                 * *after* the loop.  This means tcp_readable() always
 502                 * returned zero if any URG PUSH was in the queue, even
 503                 * though there was normal data available. If we subtract
 504                 * the urg data right here, we even get it to work for more
 505                 * than one URG PUSH skb without normal data.
 506                 * This means that poll() finally works now with urg data
 507                 * in the queue.  Note that rlogin was never affected
 508                 * because it doesn't use poll(); it uses two processes
 509                 * and a blocking read().  And the queue scan in tcp_read()
 510                 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
 511                 */
 512
 513                /* Don't count urg data. */
 514                if (skb->h.th->urg)
 515                        amount--;
 516#if 0
 517                if (amount && skb->h.th->psh) break;
 518#endif
 519                skb = skb->next;
 520        } while(skb != (struct sk_buff *)&sk->receive_queue);
 521
 522        SOCK_DEBUG(sk, "got %lu bytes.\n",amount);
 523        return(amount);
 524}
 525
 526/*
 527 * LISTEN is a special case for poll..
 528 */
 529static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
 530{
 531        struct open_request *req, *dummy;
 532
 533        lock_sock(sk);
 534        req = tcp_find_established(&sk->tp_pinfo.af_tcp, &dummy);
 535        release_sock(sk);
 536        if (req)
 537                return POLLIN | POLLRDNORM;
 538        return 0;
 539}
 540
 541/*
 542 *      Compute minimal free write space needed to queue new packets. 
 543 */
 544#define tcp_min_write_space(__sk) \
 545        (atomic_read(&(__sk)->wmem_alloc) / 2)
 546
 547/*
 548 *      Wait for a TCP event.
 549 *
 550 *      Note that we don't need to lock the socket, as the upper poll layers
 551 *      take care of normal races (between the test and the event) and we don't
 552 *      go look at any of the socket buffers directly.
 553 */
 554unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
 555{
 556        unsigned int mask;
 557        struct sock *sk = sock->sk;
 558        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 559
 560        poll_wait(file, sk->sleep, wait);
 561        if (sk->state == TCP_LISTEN)
 562                return tcp_listen_poll(sk, wait);
 563
 564        mask = 0;
 565        if (sk->err)
 566                mask = POLLERR;
 567
 568        /*
 569         * POLLHUP is certainly not done right. But poll() doesn't
 570         * have a notion of HUP in just one direction, and for a
 571         * socket the read side is more interesting.
 572         *
 573         * Some poll() documentation says that POLLHUP is incompatible
 574         * with the POLLOUT/POLLWR flags, so somebody should check this
 575         * all. But careful, it tends to be safer to return too many
 576         * bits than too few, and you can easily break real applications
 577         * if you don't tell them that something has hung up!
 578         *
 579         * Check-me.
 580         */
 581        if (sk->shutdown & RCV_SHUTDOWN)
 582                mask |= POLLHUP;
 583
 584        /* Connected? */
 585        if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
 586                if ((tp->rcv_nxt != tp->copied_seq) &&
 587                    (tp->urg_seq != tp->copied_seq ||
 588                     tp->rcv_nxt != tp->copied_seq+1 ||
 589                     sk->urginline || !tp->urg_data))
 590                        mask |= POLLIN | POLLRDNORM;
 591
 592                if (!(sk->shutdown & SEND_SHUTDOWN)) {
 593                        if (sock_wspace(sk) >= tcp_min_write_space(sk)) {
 594                                mask |= POLLOUT | POLLWRNORM;
 595                        } else {  /* send SIGIO later */
 596                                sk->socket->flags |= SO_NOSPACE;
 597                        }
 598                }
 599
 600                if (tp->urg_data & URG_VALID)
 601                        mask |= POLLPRI;
 602        }
 603        return mask;
 604}
 605
 606/*
 607 *      Socket write_space callback.
 608 *      This (or rather the sock_wake_async) should agree with poll. 
 609 */
 610void tcp_write_space(struct sock *sk)
 611{
 612        if (sk->dead)
 613                return; 
 614
 615        wake_up_interruptible(sk->sleep);
 616        if (sock_wspace(sk) >=
 617            tcp_min_write_space(sk))
 618                sock_wake_async(sk->socket, 2);
 619}
 620
 621
 622int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 623{
 624        int answ;
 625
 626        switch(cmd) {
 627        case TIOCINQ:
 628#ifdef FIXME    /* FIXME: */
 629        case FIONREAD:
 630#endif
 631                if (sk->state == TCP_LISTEN)
 632                        return(-EINVAL);
 633                lock_sock(sk);
 634                answ = tcp_readable(sk);
 635                release_sock(sk);
 636                break;
 637        case SIOCATMARK:
 638                {
 639                        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 640                        answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
 641                        break;
 642                }
 643        case TIOCOUTQ:
 644                if (sk->state == TCP_LISTEN)
 645                        return(-EINVAL);
 646                answ = sock_wspace(sk);
 647                break;
 648        default:
 649                return(-ENOIOCTLCMD);
 650        };
 651
 652        return put_user(answ, (int *)arg);
 653}
 654
 655/*
 656 *      Wait for a socket to get into the connected state
 657 *
 658 *      Note: must be called with the socket locked.
 659 */
 660static int wait_for_tcp_connect(struct sock * sk, int flags)
 661{
 662        struct task_struct *tsk = current;
 663        struct wait_queue wait = { tsk, NULL };
 664
 665        while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
 666                if(sk->err)
 667                        return sock_error(sk);
 668                if((1 << sk->state) &
 669                   ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 670                        if(sk->keepopen && !(flags&MSG_NOSIGNAL))
 671                                send_sig(SIGPIPE, tsk, 0);
 672                        return -EPIPE;
 673                }
 674                if(flags & MSG_DONTWAIT)
 675                        return -EAGAIN;
 676                if(signal_pending(tsk))
 677                        return -ERESTARTSYS;
 678
 679                tsk->state = TASK_INTERRUPTIBLE;
 680                add_wait_queue(sk->sleep, &wait);
 681                release_sock(sk);
 682
 683                if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) &&
 684                    sk->err == 0)
 685                        schedule();
 686
 687                tsk->state = TASK_RUNNING;
 688                remove_wait_queue(sk->sleep, &wait);
 689                lock_sock(sk);
 690        }
 691        return 0;
 692}
 693
 694static inline int tcp_memory_free(struct sock *sk)
 695{
 696        return atomic_read(&sk->wmem_alloc) < sk->sndbuf;
 697}
 698
 699/*
 700 *      Wait for more memory for a socket
 701 */
 702static void wait_for_tcp_memory(struct sock * sk)
 703{
 704        release_sock(sk);
 705        if (!tcp_memory_free(sk)) {
 706                struct wait_queue wait = { current, NULL };
 707
 708                sk->socket->flags &= ~SO_NOSPACE;
 709                add_wait_queue(sk->sleep, &wait);
 710                for (;;) {
 711                        if (signal_pending(current))
 712                                break;
 713                        current->state = TASK_INTERRUPTIBLE;
 714                        if (tcp_memory_free(sk))
 715                                break;
 716                        if (sk->shutdown & SEND_SHUTDOWN)
 717                                break;
 718                        if (sk->err)
 719                                break;
 720                        schedule();
 721                }
 722                current->state = TASK_RUNNING;
 723                remove_wait_queue(sk->sleep, &wait);
 724        }
 725        lock_sock(sk);
 726}
 727
 728/* When all user supplied data has been queued set the PSH bit.
     * Expands to an expression over the locals seglen and iovlen of
     * tcp_do_sendmsg(): true once the current iovec entry and all
     * following entries are exhausted.
     */
 729#define PSH_NEEDED (seglen == 0 && iovlen == 0)
 730
 731/*
 732 *      This routine copies from a user buffer into a socket,
 733 *      and starts the transmit system.
     *
     *      Every iovec entry of msg is walked.  While there is unsent
     *      data queued (tp->send_head) and MSG_OOB is not set, bytes are
     *      first appended to the last skb on the write queue if it has
     *      tailroom, is shorter than the current MSS and has not been
     *      sent yet.  Otherwise a new skb is allocated, filled while
     *      checksumming, and handed to tcp_send_skb().
     *
     *      Returns the number of bytes queued or a negative errno:
     *      -EFAULT on a user-copy fault (even if some data was queued
     *      earlier), and otherwise the partial count if any data was
     *      queued before the loop stopped, else -EPIPE (send side shut
     *      down), -EAGAIN / -ERESTARTSYS (no memory and MSG_DONTWAIT /
     *      signal pending), the pending socket error, or the result of
     *      wait_for_tcp_connect().
 734 *
 735 *      Note: must be called with the socket locked.
     *      NOTE(review): the body takes lock_sock()/release_sock()
     *      itself -- confirm what callers are expected to hold.
 736 */
 737
 738int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
 739{
 740        struct iovec *iov;
 741        struct tcp_opt *tp;
 742        struct sk_buff *skb;
 743        int iovlen, flags;
 744        int mss_now;
 745        int err, copied;
 746
 747        lock_sock(sk);
 748
 749        err = 0;
 750        tp = &(sk->tp_pinfo.af_tcp);
 751
 752        /* Wait for a connection to finish. */
 753        flags = msg->msg_flags;
 754        if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 755                if((err = wait_for_tcp_connect(sk, flags)) != 0)
 756                        goto out;
 757
 758        /* This should be in poll */
 759        sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */
 760
 761        mss_now = tcp_current_mss(sk);
 762
 763        /* Ok commence sending. */
 764        iovlen = msg->msg_iovlen;
 765        iov = msg->msg_iov;
 766        copied = 0;
 767        
            /* Outer loop: one iovec entry per pass.  iovlen counts the
             * entries still to come, which is what PSH_NEEDED tests.
             */
 768        while(--iovlen >= 0) {
 769                int seglen=iov->iov_len;
 770                unsigned char * from=iov->iov_base;
 771
 772                iov++;
 773
                    /* Inner loop: runs until this entry is fully queued. */
 774                while(seglen > 0) {
 775                        int copy, tmp, queue_it;
 776
                            /* err can only be set here by a faulting copy
                             * in the append path of the previous pass.
                             */
 777                        if (err)
 778                                goto do_fault2;
 779
 780                        /* Stop on errors. */
 781                        if (sk->err)
 782                                goto do_sock_err;
 783
 784                        /* Make sure that we are established. */
 785                        if (sk->shutdown & SEND_SHUTDOWN)
 786                                goto do_shutdown;
 787        
 788                        /* Now we need to check if we have a half
 789                         * built packet we can tack some data onto.
 790                         */
 791                        if (tp->send_head && !(flags & MSG_OOB)) {
 792                                skb = sk->write_queue.prev;
 793                                copy = skb->len;
 794                                /* If the remote does SWS avoidance we should
 795                                 * queue the best we can if not we should in 
 796                                 * fact send multiple packets...
 797                                 * A method for detecting this would be most
 798                                 * welcome.
 799                                 */
 800                                if (skb_tailroom(skb) > 0 &&
 801                                    (mss_now - copy) > 0 &&
 802                                    tp->snd_nxt < TCP_SKB_CB(skb)->end_seq) {
                                            /* Despite the name this is non-zero
                                             * whenever the current skb length is
                                             * not a multiple of 4.  In that case
                                             * the data is copied plainly and the
                                             * checksum recomputed over the whole
                                             * skb; otherwise the copy continues
                                             * the running skb->csum.
                                             */
 803                                        int last_byte_was_odd = (copy % 4);
 804
                                            /* Room left up to the MSS, limited by
                                             * the tailroom and by what the user
                                             * still has in this entry.
                                             */
 805                                        copy = mss_now - copy;
 806                                        if(copy > skb_tailroom(skb))
 807                                                copy = skb_tailroom(skb);
 808                                        if(copy > seglen)
 809                                                copy = seglen;
 810                                        if(last_byte_was_odd) {
 811                                                if(copy_from_user(skb_put(skb, copy),
 812                                                                  from, copy))
 813                                                        err = -EFAULT;
 814                                                skb->csum = csum_partial(skb->data,
 815                                                                         skb->len, 0);
 816                                        } else {
 817                                                skb->csum =
 818                                                        csum_and_copy_from_user(
 819                                                        from, skb_put(skb, copy),
 820                                                        copy, skb->csum, &err);
 821                                        }
 822                                        /*
 823                                         * FIXME: the *_user functions should
 824                                         *        return how much data was
 825                                         *        copied before the fault
 826                                         *        occurred and then a partial
 827                                         *        packet with this data should
 828                                         *        be sent.  Unfortunately
 829                                         *        csum_and_copy_from_user doesn't
 830                                         *        return this information.
 831                                         *        ATM it might send partly zeroed
 832                                         *        data in this case.
 833                                         */
                                            /* Accounting is advanced even when the
                                             * copy faulted; the fault is acted on
                                             * at the top of the next pass.
                                             */
 834                                        tp->write_seq += copy;
 835                                        TCP_SKB_CB(skb)->end_seq += copy;
 836                                        from += copy;
 837                                        copied += copy;
 838                                        seglen -= copy;
 839                                        if (PSH_NEEDED)
 840                                                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 841                                        continue;
 842                                }
 843                        }
 844
 845                        /* We also need to worry about the window.  If
 846                         * window < 1/2 the maximum window we've seen
 847                         * from this host, don't use it.  This is
 848                         * sender side silly window prevention, as
 849                         * specified in RFC1122.  (Note that this is
 850                         * different than earlier versions of SWS
 851                         * prevention, e.g. RFC813.).  What we
 852                         * actually do is use the whole MSS.  Since
 853                         * the results in the right edge of the packet
 854                         * being outside the window, it will be queued
 855                         * for later rather than sent.
 856                         */
 857                        copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
 858                        if(copy > (tp->max_window >> 1))
 859                                copy = min(copy, mss_now);
 860                        else
 861                                copy = mss_now;
 862                        if(copy > seglen)
 863                                copy = seglen;
 864
 865                        /* Determine how large of a buffer to allocate.  */
 866                        tmp = MAX_HEADER + sk->prot->max_header;
 867                        if (copy < min(mss_now, tp->max_window >> 1) &&
 868                            !(flags & MSG_OOB)) {
 869                                tmp += min(mss_now, tp->max_window);
 870
 871                                /* What is happening here is that we want to
 872                                 * tack on later members of the users iovec
 873                                 * if possible into a single frame.  When we
 874                                 * leave this loop our caller checks to see if
 875                                 * we can send queued frames onto the wire.
 876                                 * See tcp_v[46]_sendmsg() for this.
 877                                 */
 878                                queue_it = 1;
 879                        } else {
 880                                tmp += copy;
 881                                queue_it = 0;
 882                        }
 883                        skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);
 884
 885                        /* If we didn't get any memory, we need to sleep. */
 886                        if (skb == NULL) {
 887                                sk->socket->flags |= SO_NOSPACE;
 888                                if (flags&MSG_DONTWAIT) {
 889                                        err = -EAGAIN;
 890                                        goto do_interrupted;
 891                                }
 892                                if (signal_pending(current)) {
 893                                        err = -ERESTARTSYS;
 894                                        goto do_interrupted;
 895                                }
 896                                wait_for_tcp_memory(sk);
 897
 898                                /* If SACK's were formed or PMTU events happened,
 899                                 * we must find out about it.
 900                                 */
 901                                mss_now = tcp_current_mss(sk);
 902                                continue;
 903                        }
 904
                            /* seglen is reduced before the flags are built so
                             * that PSH_NEEDED sees the post-copy remainder.
                             */
 905                        seglen -= copy;
 906
 907                        /* Prepare control bits for TCP header creation engine. */
 908                        TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK |
 909                                                  (PSH_NEEDED ?
 910                                                   TCPCB_FLAG_PSH : 0));
 911                        TCP_SKB_CB(skb)->sacked = 0;
 912                        if (flags & MSG_OOB) {
 913                                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG;
 914                                TCP_SKB_CB(skb)->urg_ptr = copy;
 915                        } else
 916                                TCP_SKB_CB(skb)->urg_ptr = 0;
 917
 918                        /* TCP data bytes are SKB_PUT() on top, later
 919                         * TCP+IP+DEV headers are SKB_PUSH()'d beneath.
 920                         * Reserve header space and checksum the data.
 921                         */
 922                        skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
 923                        skb->csum = csum_and_copy_from_user(from,
 924                                        skb_put(skb, copy), copy, 0, &err);
 925
                            /* The fresh skb was never queued: do_fault frees it. */
 926                        if (err)
 927                                goto do_fault;
 928
 929                        from += copy;
 930                        copied += copy;
 931
 932                        TCP_SKB_CB(skb)->seq = tp->write_seq;
 933                        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy;
 934
 935                        /* This advances tp->write_seq for us. */
 936                        tcp_send_skb(sk, skb, queue_it);
 937                }
 938        }
            /* Everything was queued: sk->err is cleared and the byte
             * count becomes the return value.
             */
 939        sk->err = 0;
 940        err = copied;
 941        goto out;
 942
            /* Exit paths.  All of them fall into "out", which pushes any
             * pending frames and releases the socket lock.
             */
 943do_sock_err:
 944        if(copied)
 945                err = copied;
 946        else
 947                err = sock_error(sk);
 948        goto out;
 949do_shutdown:
 950        if(copied)
 951                err = copied;
 952        else {
 953                if (!(flags&MSG_NOSIGNAL))
 954                        send_sig(SIGPIPE, current, 0);
 955                err = -EPIPE;
 956        }
 957        goto out;
 958do_interrupted:
 959        if(copied)
 960                err = copied;
 961        goto out;
 962do_fault:
 963        kfree_skb(skb);
 964do_fault2:
 965        err = -EFAULT;
 966out:
 967        tcp_push_pending_frames(sk, tp);
 968        release_sock(sk);
 969        return err;
 970}
 971
 972#undef PSH_NEEDED
 973
 974/*
 975 *      Send an ack if one is backlogged at this point. Ought to merge
 976 *      this with tcp_send_ack().
 977 *      This is called for delayed acks also.
 978 */
 979 
 980void tcp_read_wakeup(struct sock *sk)
 981{
 982        /* If we're closed, don't send an ack, or we'll get a RST
 983         * from the closed destination.
 984         */
 985        if (sk->state != TCP_CLOSE)
 986                tcp_send_ack(sk);
 987}
 988
 989/*
 990 *      Handle reading urgent data. BSD has very simple semantics for
 991 *      this, no blocking and very strange errors 8)
 992 */
 993
 994static int tcp_recv_urg(struct sock * sk, int nonblock,
 995                        struct msghdr *msg, int len, int flags, 
 996                        int *addr_len)
 997{
 998        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 999
1000        /* No URG data to read. */
1001        if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ)
1002                return -EINVAL; /* Yes this is right ! */
1003
1004        if (sk->err)
1005                return sock_error(sk);
1006
1007        if (sk->done)
1008                return -ENOTCONN;
1009
1010        if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) {
1011                sk->done = 1;
1012                return 0;
1013        }
1014
1015        lock_sock(sk);
1016        if (tp->urg_data & URG_VALID) {
1017                int err = 0; 
1018                char c = tp->urg_data;
1019
1020                if (!(flags & MSG_PEEK))
1021                        tp->urg_data = URG_READ;
1022                        
1023                if(msg->msg_name)
1024                        tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
1025                                                       msg->msg_name);       
1026
1027                if(addr_len)
1028                        *addr_len = tp->af_specific->sockaddr_len;
1029
1030                /* Read urgent data. */
1031                msg->msg_flags|=MSG_OOB;
1032                release_sock(sk);
1033
1034                if(len>0)
1035                {
1036                        err = memcpy_toiovec(msg->msg_iov, &c, 1);
1037                        /* N.B. already set above ... */
1038                        msg->msg_flags|=MSG_OOB;
1039                }
1040                else
1041                        msg->msg_flags|=MSG_TRUNC;
1042                        
1043                /* N.B. Is this right?? If len == 0 we didn't read any data */ 
1044                return err ? -EFAULT : 1;
1045        }
1046        release_sock(sk);
1047
1048        /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1049         * the available implementations agree in this case:
1050         * this call should never block, independent of the
1051         * blocking state of the socket.
1052         * Mike <pall@rz.uni-karlsruhe.de>
1053         */
1054        return -EAGAIN;
1055}
1056
1057/*
1058 *      Release a skb if it is no longer needed. This routine
1059 *      must be called with interrupts disabled or with the
1060 *      socket locked so that the sk_buff queue operation is ok.
1061 */
1062
1063static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1064{
1065        __skb_unlink(skb, &sk->receive_queue);
1066        kfree_skb(skb);
1067}
1068
1069/* Clean up the receive buffer for full frames taken by the user,
1070 * then send an ACK if necessary.  COPIED is the number of bytes
1071 * tcp_recvmsg has given to the user so far, it speeds up the
1072 * calculation of whether or not we must ACK for the sake of
1073 * a window update.
1074 */
1075static void cleanup_rbuf(struct sock *sk, int copied)
1076{
1077        struct sk_buff *skb;
1078        
1079        /* NOTE! The socket must be locked, so that we don't get
1080         * a messed-up receive queue.
1081         */
1082        while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
1083                if (!skb->used || atomic_read(&skb->users) > 1)
1084                        break;
1085                tcp_eat_skb(sk, skb);
1086        }
1087
1088        /* We send an ACK if we can now advertise a non-zero window
1089         * which has been raised "significantly".
1090         */
1091        if(copied > 0) {
1092                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1093                __u32 rcv_window_now = tcp_receive_window(tp);
1094                __u32 new_window = __tcp_select_window(sk);
1095
1096                /* We won't be raising the window any further than
1097                 * the window-clamp allows.  Our window selection
1098                 * also keeps things a nice multiple of MSS.  These
1099                 * checks are necessary to prevent spurious ACKs
1100                 * which don't advertize a larger window.
1101                 */
1102                if((new_window && (new_window >= rcv_window_now * 2)) &&
1103                   ((rcv_window_now + tp->mss_cache) <= tp->window_clamp))
1104                        tcp_read_wakeup(sk);
1105        }
1106}
1107
1108
1109/*
1110 *      This routine copies from a sock struct into the user buffer. 
     *
     *      Returns the number of bytes copied, or a negative errno.
     *      MSG_OOB requests are handed to tcp_recv_urg().  With MSG_PEEK
     *      a private copy of the read sequence (peek_seq) is advanced
     *      instead of tp->copied_seq.  With MSG_WAITALL the loop keeps
     *      waiting until len bytes are copied; otherwise one byte is
     *      enough (target).  If nonblock is set, -EAGAIN is returned
     *      instead of sleeping.
     *
     *      The socket lock is taken here and released around schedule().
1111 */
1112 
1113int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
1114                int len, int nonblock, int flags, int *addr_len)
1115{
1116        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1117        struct wait_queue wait = { current, NULL };
1118        int copied = 0;
1119        u32 peek_seq;
1120        volatile u32 *seq;      /* So gcc doesn't overoptimise */
1121        unsigned long used;
1122        int err = 0; 
1123        int target = 1;         /* Read at least this many bytes */
1124
1125        if (sk->err)
1126                return sock_error(sk);
1127
1128        if (sk->state == TCP_LISTEN)
1129                return -ENOTCONN;
1130
1131        /* Urgent data needs to be handled specially. */
1132        if (flags & MSG_OOB)
1133                return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
1134
1135        /*      Copying sequence to update. This is volatile to handle
1136         *      the multi-reader case neatly (memcpy_to/fromfs might be
1137         *      inline and thus not flush cached variables otherwise).
1138         */
1139        peek_seq = tp->copied_seq;
1140        seq = &tp->copied_seq;
1141        if (flags & MSG_PEEK)
1142                seq = &peek_seq;
1143                
1144        /* Handle the POSIX bogosity MSG_WAITALL. */
1145        if (flags & MSG_WAITALL)
1146                target=len;
1147
            /* Queue ourselves before taking the lock; the entry is
             * removed again just before the final cleanup below.
             */
1148        add_wait_queue(sk->sleep, &wait);
1149        lock_sock(sk);
1150        
1151        /*
1152         *      BUG BUG BUG
1153         *      This violates 1003.1g compliance. We must wait for 
1154         *      data to exist even if we read none!
1155         */
1156         
1157        while (len > 0) {
1158                struct sk_buff * skb;
1159                u32 offset;
1160
1161                /* Are we at urgent data? Stop if we have read anything. */
1162                if (copied && tp->urg_data && tp->urg_seq == *seq)
1163                        break;
1164
1165                /* We need to check signals first, to get correct SIGURG
1166                 * handling. FIXME: Need to check this doesnt impact 1003.1g
1167                 * and move it down to the bottom of the loop
1168                 */
1169                if (signal_pending(current)) {
1170                        if (copied)
1171                                break;
1172                        copied = -ERESTARTSYS;
1173                        if (nonblock)
1174                                copied = -EAGAIN;
1175                        break;
1176                }
1177
1178                /* Next get a buffer. */
1179                current->state = TASK_INTERRUPTIBLE;
1180
                    /* Walk the receive queue for the skb that holds the
                     * byte at *seq.  Skbs lying entirely before *seq are
                     * marked used on the way (not when peeking) so that
                     * cleanup_rbuf() can free them.
                     */
1181                skb = skb_peek(&sk->receive_queue);
1182                do {
1183                        if (!skb)
1184                                break;
1185
1186                        /* Now that we have two receive queues this 
1187                         * shouldn't happen.
1188                         */
1189                        if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1190                                printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
1191                                       *seq, TCP_SKB_CB(skb)->seq);
1192                                break;
1193                        }
                            /* Offset of *seq inside this skb's data; a SYN
                             * takes one sequence number but no data byte.
                             */
1194                        offset = *seq - TCP_SKB_CB(skb)->seq;
1195                        if (skb->h.th->syn)
1196                                offset--;
1197                        if (offset < skb->len)
1198                                goto found_ok_skb;
1199                        if (skb->h.th->fin)
1200                                goto found_fin_ok;
1201                        if (!(flags & MSG_PEEK))
1202                                skb->used = 1;
1203                        skb = skb->next;
1204                } while (skb != (struct sk_buff *)&sk->receive_queue);
1205
                    /* Nothing usable is queued.  Decide whether to return
                     * what we have, report a condition, or sleep.
                     */
1206                if (copied >= target)
1207                        break;
1208
1209                /*
1210                   These three lines and clause if (sk->state == TCP_CLOSE)
1211                   are unlikely to be correct, if target > 1.
1212                   I DO NOT FIX IT, because I have no idea, what
1213                   POSIX prescribes to make here. Probably, it really
1214                   wants to lose data 8), if not all target is received.
1215                                                                 --ANK
1216                 */
1217                if (sk->err && !(flags&MSG_PEEK)) {
1218                        copied = sock_error(sk);
1219                        break;
1220                }
1221
1222                if (sk->shutdown & RCV_SHUTDOWN) {
1223                        sk->done = 1;
1224                        break;
1225                }
1226
                    /* First read on a closed socket ends quietly with
                     * sk->done set; a later one gets -ENOTCONN.
                     */
1227                if (sk->state == TCP_CLOSE) {
1228                        if (!sk->done) {
1229                                sk->done = 1;
1230                                break;
1231                        }
1232                        copied = -ENOTCONN;
1233                        break;
1234                }
1235
1236                if (nonblock) {
1237                        copied = -EAGAIN;
1238                        break;
1239                }
1240
                    /* Sleep for more data.  The task state was already
                     * set to TASK_INTERRUPTIBLE above, before the queue
                     * was examined.
                     */
1241                cleanup_rbuf(sk, copied);
1242                release_sock(sk);
1243                sk->socket->flags |= SO_WAITDATA;
1244                schedule();
1245                sk->socket->flags &= ~SO_WAITDATA;
1246                lock_sock(sk);
1247                continue;
1248
1249        found_ok_skb:
1250                /*      Lock the buffer. We can be fairly relaxed as
1251                 *      an interrupt will never steal a buffer we are
1252                 *      using unless I've missed something serious in
1253                 *      tcp_data.
1254                 */
1255                atomic_inc(&skb->users);
1256
1257                /* Ok so how much can we use? */
1258                used = skb->len - offset;
1259                if (len < used)
1260                        used = len;
1261
1262                /* Do we have urgent data here? */
1263                if (tp->urg_data) {
1264                        u32 urg_offset = tp->urg_seq - *seq;
1265                        if (urg_offset < used) {
                                    /* Sitting right on the urgent byte: skip
                                     * it unless it is delivered inline.
                                     * Otherwise read only up to the mark.
                                     */
1266                                if (!urg_offset) {
1267                                        if (!sk->urginline) {
1268                                                ++*seq;
1269                                                offset++;
1270                                                used--;
1271                                        }
1272                                } else
1273                                        used = urg_offset;
1274                        }
1275                }
1276
1277                /*      Copy it - We _MUST_ update *seq first so that we
1278                 *      don't ever double read when we have dual readers
1279                 */
1280                *seq += used;
1281
1282                /*      This memcpy_toiovec can sleep. If it sleeps and we
1283                 *      do a second read it relies on the skb->users to avoid
1284                 *      a crash when cleanup_rbuf() gets called.
1285                 */
1286                err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used);
1287                if (err) {
1288                        /* Exception. Bailout! */
                            /* NOTE(review): any byte count accumulated in
                             * copied is replaced by -EFAULT here, and *seq
                             * has already been advanced past these bytes.
                             */
1289                        atomic_dec(&skb->users);
1290                        copied = -EFAULT;
1291                        break;
1292                }
1293
1294                copied += used;
1295                len -= used;
1296
1297                /*      We now will not sleep again until we are finished
1298                 *      with skb. Sorry if you are doing the SMP port
1299                 *      but you'll just have to fix it neatly ;)
1300                 */
1301                atomic_dec(&skb->users);
1302
                    /* The urgent mark is forgotten once the real read
                     * pointer has moved past it.
                     */
1303                if (after(tp->copied_seq,tp->urg_seq))
1304                        tp->urg_data = 0;
1305                if (used + offset < skb->len)
1306                        continue;
1307
1308                /*      Process the FIN. We may also need to handle PSH
1309                 *      here and make it break out of MSG_WAITALL.
1310                 */
1311                if (skb->h.th->fin)
1312                        goto found_fin_ok;
1313                if (flags & MSG_PEEK)
1314                        continue;
                    /* skb fully consumed: free it now if nobody else
                     * holds a reference, else leave it for cleanup_rbuf().
                     */
1315                skb->used = 1;
1316                if (atomic_read(&skb->users) == 1)
1317                        tcp_eat_skb(sk, skb);
1318                continue;
1319
1320        found_fin_ok:
                    /* The FIN occupies one sequence number. */
1321                ++*seq;
1322                if (flags & MSG_PEEK)
1323                        break;
1324
1325                /* All is done. */
1326                skb->used = 1;
1327                sk->shutdown |= RCV_SHUTDOWN;
1328                break;
1329        }
1330
1331        if(copied > 0 && msg->msg_name)
1332                tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
1333                                               msg->msg_name);       
1334
1335        if(addr_len)
1336                *addr_len = tp->af_specific->sockaddr_len;
1337
1338        remove_wait_queue(sk->sleep, &wait);
1339        current->state = TASK_RUNNING;
1340
1341        /* Clean up data we have read: This will do ACK frames. */
1342        cleanup_rbuf(sk, copied);
1343        release_sock(sk);
1344        return copied;
1345}
1346
1347/*
1348 * Check whether to renew the timer.
1349 */
1350static inline void tcp_check_fin_timer(struct sock *sk)
1351{
1352        if (sk->state == TCP_FIN_WAIT2 && !sk->timer.prev)
1353                tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
1354}
1355
1356/*
1357 *      State processing on a close. This implements the state shift for
1358 *      sending our FIN frame. Note that we only send a FIN for some
1359 *      states. A shutdown() may have already sent the FIN, or we may be
1360 *      closed.
     *
     *      The table is indexed by the current sk->state.  Each entry is
     *      the state to move to, optionally OR'ed with TCP_ACTION_FIN
     *      when a FIN has to be sent as part of the transition;
     *      tcp_close_state() separates the two with TCP_STATE_MASK.
     *      Only the first 12 of the 16 slots are listed; the rest are
     *      zero-initialised.
1361 */
1362
1363static unsigned char new_state[16] = {
1364  /* current state:        new state:      action:      */
1365  /* (Invalid)          */ TCP_CLOSE,
1366  /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1367  /* TCP_SYN_SENT       */ TCP_CLOSE,
1368  /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1369  /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1370  /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1371  /* TCP_TIME_WAIT      */ TCP_CLOSE,
1372  /* TCP_CLOSE          */ TCP_CLOSE,
1373  /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1374  /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1375  /* TCP_LISTEN         */ TCP_CLOSE,
1376  /* TCP_CLOSING        */ TCP_CLOSING,
1377};
1378
1379static int tcp_close_state(struct sock *sk, int dead)
1380{
1381        int next = (int) new_state[sk->state];
1382        int ns = (next & TCP_STATE_MASK);
1383
1384        tcp_set_state(sk, ns);
1385
1386        /*      This is a (useful) BSD violating of the RFC. There is a
1387         *      problem with TCP as specified in that the other end could
1388         *      keep a socket open forever with no application left this end.
1389         *      We use a 3 minute timeout (about the same as BSD) then kill
1390         *      our end. If they send after that then tough - BUT: long enough
1391         *      that we won't make the old 4*rto = almost no time - whoops
1392         *      reset mistake.
1393         */
1394        if (dead)
1395                tcp_check_fin_timer(sk);
1396
1397        return (next & TCP_ACTION_FIN);
1398}
1399
1400/*
1401 *      Shutdown the sending side of a connection. Much like close except
1402 *      that we don't receive shut down or set sk->dead.
1403 */
1404
1405void tcp_shutdown(struct sock *sk, int how)
1406{
1407        /*      We need to grab some memory, and put together a FIN,
1408         *      and then put it into the queue to be sent.
1409         *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1410         */
1411        if (!(how & SEND_SHUTDOWN))
1412                return;
1413
1414        /* If we've already sent a FIN, or it's a closed state, skip this. */
1415        if ((1 << sk->state) &
1416            (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
1417                lock_sock(sk);
1418
1419                /* Clear out any half completed packets.  FIN if needed. */
1420                if (tcp_close_state(sk,0))
1421                        tcp_send_fin(sk);
1422
1423                release_sock(sk);
1424        }
1425}
1426
1427
1428/*
1429 *      Return 1 if we still have things to send in our buffers.
1430 */
1431
1432static inline int closing(struct sock * sk)
1433{
1434        return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
1435}
1436
1437/*
1438 *      This routine closes sockets which have been at least partially
1439 *      opened, but not yet accepted. Currently it is only called by
1440 *      tcp_close, and timeout mirrors the value there.
1441 */
1442
1443static void tcp_close_pending (struct sock *sk)
1444{
1445        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1446        struct open_request *req = tp->syn_wait_queue;
1447
1448        while(req) {
1449                struct open_request *iter;
1450                
1451                if (req->sk)
1452                        tcp_close(req->sk, 0);
1453
1454                iter = req;
1455                req = req->dl_next;
1456                
1457                (*iter->class->destructor)(iter);
1458                tcp_dec_slow_timer(TCP_SLT_SYNACK);
1459                sk->ack_backlog--;
1460                tcp_openreq_free(iter);
1461        }
1462
1463        tcp_synq_init(tp);
1464}
1465
1466void tcp_close(struct sock *sk, long timeout)
1467{
1468        struct sk_buff *skb;
1469        int data_was_unread = 0;
1470
1471        /*
1472         * Check whether the socket is locked ... supposedly
1473         * it's impossible to tcp_close() a locked socket.
1474         */
1475        if (atomic_read(&sk->sock_readers))
1476                printk("tcp_close: socket already locked!\n");
1477
1478        /* We need to grab some memory, and put together a FIN,
1479         * and then put it into the queue to be sent.
1480         */
1481        lock_sock(sk);
1482        if(sk->state == TCP_LISTEN) {
1483                /* Special case. */
1484                tcp_set_state(sk, TCP_CLOSE);
1485                tcp_close_pending(sk);
1486                release_sock(sk);
1487                sk->dead = 1;
1488                return;
1489        }
1490
1491        /* It is questionable, what the role of this is now.
1492         * In any event either it should be removed, or
1493         * increment of SLT_KEEPALIVE be done, this is causing
1494         * big problems.  For now I comment it out.  -DaveM
1495         */
1496        /* sk->keepopen = 1; */
1497        sk->shutdown = SHUTDOWN_MASK;
1498
1499        if (!sk->dead)
1500                sk->state_change(sk);
1501
1502        /*  We need to flush the recv. buffs.  We do this only on the
1503         *  descriptor close, not protocol-sourced closes, because the
1504         *  reader process may not have drained the data yet!
1505         */
1506        while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
1507                u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
1508                data_was_unread += len;
1509                kfree_skb(skb);
1510        }
1511
1512        /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1513         * 3.10, we send a RST here because data was lost.  To
1514         * witness the awful effects of the old behavior of always
1515         * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1516         * a bulk GET in an FTP client, suspend the process, wait
1517         * for the client to advertise a zero window, then kill -9
1518         * the FTP client, wheee...  Note: timeout is always zero
1519         * in such a case.
1520         */
1521        if(data_was_unread != 0) {
1522                /* Unread data was tossed, zap the connection. */
1523                tcp_set_state(sk, TCP_CLOSE);
1524                tcp_send_active_reset(sk);
1525        } else if (tcp_close_state(sk,1)) {
1526                /* We FIN if the application ate all the data before
1527                 * zapping the connection.
1528                 */
1529                tcp_send_fin(sk);
1530        }
1531
1532        if (timeout) {
1533                struct task_struct *tsk = current;
1534                struct wait_queue wait = { tsk, NULL };
1535
1536                add_wait_queue(sk->sleep, &wait);
1537                release_sock(sk);
1538
1539                while (1) {
1540                        tsk->state = TASK_INTERRUPTIBLE;
1541                        if (!closing(sk))
1542                                break;
1543                        timeout = schedule_timeout(timeout);
1544                        if (signal_pending(tsk) || !timeout)
1545                                break;
1546                }
1547
1548                tsk->state = TASK_RUNNING;
1549                remove_wait_queue(sk->sleep, &wait);
1550                
1551                lock_sock(sk);
1552        }
1553
1554        /* Now that the socket is dead, if we are in the FIN_WAIT2 state
1555         * we may need to set up a timer.
1556         */
1557        tcp_check_fin_timer(sk);
1558
1559        release_sock(sk);
1560        sk->dead = 1;
1561}
1562
1563/*
1564 *      Wait for an incoming connection, avoid race
1565 *      conditions. This must be called with the socket locked.
1566 */
1567static struct open_request * wait_for_connect(struct sock * sk,
1568                                              struct open_request **pprev)
1569{
1570        struct wait_queue wait = { current, NULL };
1571        struct open_request *req;
1572
1573        add_wait_queue(sk->sleep, &wait);
1574        for (;;) {
1575                current->state = TASK_INTERRUPTIBLE;
1576                release_sock(sk);
1577                schedule();
1578                lock_sock(sk);
1579                req = tcp_find_established(&(sk->tp_pinfo.af_tcp), pprev);
1580                if (req) 
1581                        break;
1582                if (signal_pending(current))
1583                        break;
1584        }
1585        current->state = TASK_RUNNING;
1586        remove_wait_queue(sk->sleep, &wait);
1587        return req;
1588}
1589
1590/*
1591 *      This will accept the next outstanding connection.
1592 *
1593 *      Be careful about race conditions here - this is subtle.
1594 */
1595
1596struct sock *tcp_accept(struct sock *sk, int flags)
1597{
1598        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1599        struct open_request *req, *prev;
1600        struct sock *newsk = NULL;
1601        int error;
1602
1603        lock_sock(sk); 
1604
1605        /* We need to make sure that this socket is listening,
1606         * and that it has something pending.
1607         */
1608        error = EINVAL;
1609        if (sk->state != TCP_LISTEN)
1610                goto out;
1611
1612        /* Find already established connection */
1613        req = tcp_find_established(tp, &prev);
1614        if (!req) {
1615                /* If this is a non blocking socket don't sleep */
1616                error = EAGAIN;
1617                if (flags & O_NONBLOCK)
1618                        goto out;
1619            
1620                error = ERESTARTSYS;
1621                req = wait_for_connect(sk, &prev);
1622                if (!req) 
1623                        goto out;
1624        }
1625
1626        tcp_synq_unlink(tp, req, prev);
1627        newsk = req->sk;
1628        req->class->destructor(req);
1629        tcp_openreq_free(req);
1630        sk->ack_backlog--; 
1631        if(sk->keepopen)
1632                tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
1633
1634        release_sock(sk);
1635        return newsk;
1636
1637out:
1638        /* sk should be in LISTEN state, thus accept can use sk->err for
1639         * internal purposes without stomping one anyone's feed.
1640         */ 
1641        sk->err = error; 
1642        release_sock(sk);
1643        return newsk;
1644}
1645
1646/*
1647 *      Socket option code for TCP. 
1648 */
1649  
1650int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, 
1651                   int optlen)
1652{
1653        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1654        int val;
1655
1656        if (level != SOL_TCP)
1657                return tp->af_specific->setsockopt(sk, level, optname, 
1658                                                   optval, optlen);
1659        
1660        if(optlen<sizeof(int))
1661                return -EINVAL;
1662
1663        if (get_user(val, (int *)optval))
1664                return -EFAULT;
1665
1666        switch(optname) {
1667        case TCP_MAXSEG:
1668                /* values greater than interface MTU won't take effect.  however at
1669                 * the point when this call is done we typically don't yet know
1670                 * which interface is going to be used
1671                 */
1672                if(val < 1 || val > MAX_WINDOW)
1673                        return -EINVAL;
1674                tp->user_mss = val;
1675                return 0;
1676
1677        case TCP_NODELAY:
1678                /* You cannot try to use this and TCP_CORK in
1679                 * tandem, so let the user know.
1680                 */
1681                if (sk->nonagle == 2)
1682                        return -EINVAL;
1683                sk->nonagle = (val == 0) ? 0 : 1;
1684                return 0;
1685
1686        case TCP_CORK:
1687                /* When set indicates to always queue non-full frames.
1688                 * Later the user clears this option and we transmit
1689                 * any pending partial frames in the queue.  This is
1690                 * meant to be used alongside sendfile() to get properly
1691                 * filled frames when the user (for example) must write
1692                 * out headers with a write() call first and then use
1693                 * sendfile to send out the data parts.
1694                 *
1695                 * You cannot try to use TCP_NODELAY and this mechanism
1696                 * at the same time, so let the user know.
1697                 */
1698                if (sk->nonagle == 1)
1699                        return -EINVAL;
1700                if (val != 0) {
1701                        sk->nonagle = 2;
1702                } else {
1703                        sk->nonagle = 0;
1704
1705                        lock_sock(sk);
1706                        tcp_push_pending_frames(sk, tp);
1707                        release_sock(sk);
1708                }
1709                return 0;
1710
1711        default:
1712                return -ENOPROTOOPT;
1713        };
1714}
1715
1716int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
1717                   int *optlen)
1718{
1719        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1720        int val, len;
1721
1722        if(level != SOL_TCP)
1723                return tp->af_specific->getsockopt(sk, level, optname,
1724                                                   optval, optlen);
1725
1726        if(get_user(len,optlen))
1727                return -EFAULT;
1728
1729        len = min(len, sizeof(int));
1730
1731        switch(optname) {
1732        case TCP_MAXSEG:
1733                val = tp->user_mss;
1734                break;
1735        case TCP_NODELAY:
1736                val = (sk->nonagle == 1);
1737                break;
1738        case TCP_CORK:
1739                val = (sk->nonagle == 2);
1740                break;
1741        default:
1742                return -ENOPROTOOPT;
1743        };
1744
1745        if(put_user(len, optlen))
1746                return -EFAULT;
1747        if(copy_to_user(optval, &val,len))
1748                return -EFAULT;
1749        return 0;
1750}
1751
1752void tcp_set_keepalive(struct sock *sk, int val)
1753{
1754        if (!sk->keepopen && val)
1755                tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
1756        else if (sk->keepopen && !val)
1757                tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
1758}
1759
1760extern void __skb_cb_too_small_for_tcp(int, int);
1761
1762void __init tcp_init(void)
1763{
1764        struct sk_buff *skb = NULL;
1765
1766        if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
1767                __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
1768                                           sizeof(skb->cb));
1769
1770        tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
1771                                                   sizeof(struct open_request),
1772                                               0, SLAB_HWCACHE_ALIGN,
1773                                               NULL, NULL);
1774        if(!tcp_openreq_cachep)
1775                panic("tcp_init: Cannot alloc open_request cache.");
1776
1777        tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
1778                                              sizeof(struct tcp_bind_bucket),
1779                                              0, SLAB_HWCACHE_ALIGN,
1780                                              NULL, NULL);
1781        if(!tcp_bucket_cachep)
1782                panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
1783
1784        tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
1785                                                sizeof(struct tcp_tw_bucket),
1786                                                0, SLAB_HWCACHE_ALIGN,
1787                                                NULL, NULL);
1788        if(!tcp_timewait_cachep)
1789                panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
1790}
1791
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.