1/* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Version: $Id: tcp.c,v 1.139 1999/03/17 19:30:34 davem Exp $ 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * Corey Minyard <wf-rch!minyard@relay.EU.net> 14 * Florian La Roche, <flla@stud.uni-sb.de> 15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 16 * Linus Torvalds, <torvalds@cs.helsinki.fi> 17 * Alan Cox, <gw4pts@gw4pts.ampr.org> 18 * Matthew Dillon, <dillon@apollo.west.oic.com> 19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 20 * Jorge Cwik, <jorge@laser.satlink.net> 21 * 22 * Fixes: 23 * Alan Cox : Numerous verify_area() calls 24 * Alan Cox : Set the ACK bit on a reset 25 * Alan Cox : Stopped it crashing if it closed while 26 * sk->inuse=1 and was trying to connect 27 * (tcp_err()). 28 * Alan Cox : All icmp error handling was broken 29 * pointers passed where wrong and the 30 * socket was looked up backwards. Nobody 31 * tested any icmp error code obviously. 32 * Alan Cox : tcp_err() now handled properly. It 33 * wakes people on errors. poll 34 * behaves and the icmp error race 35 * has gone by moving it into sock.c 36 * Alan Cox : tcp_send_reset() fixed to work for 37 * everything not just packets for 38 * unknown sockets. 39 * Alan Cox : tcp option processing. 40 * Alan Cox : Reset tweaked (still not 100%) [Had 41 * syn rule wrong] 42 * Herp Rosmanith : More reset fixes 43 * Alan Cox : No longer acks invalid rst frames. 44 * Acking any kind of RST is right out. 45 * Alan Cox : Sets an ignore me flag on an rst 46 * receive otherwise odd bits of prattle 47 * escape still 48 * Alan Cox : Fixed another acking RST frame bug. 49 * Should stop LAN workplace lockups. 
50 * Alan Cox : Some tidyups using the new skb list 51 * facilities 52 * Alan Cox : sk->keepopen now seems to work 53 * Alan Cox : Pulls options out correctly on accepts 54 * Alan Cox : Fixed assorted sk->rqueue->next errors 55 * Alan Cox : PSH doesn't end a TCP read. Switched a 56 * bit to skb ops. 57 * Alan Cox : Tidied tcp_data to avoid a potential 58 * nasty. 59 * Alan Cox : Added some better commenting, as the 60 * tcp is hard to follow 61 * Alan Cox : Removed incorrect check for 20 * psh 62 * Michael O'Reilly : ack < copied bug fix. 63 * Johannes Stille : Misc tcp fixes (not all in yet). 64 * Alan Cox : FIN with no memory -> CRASH 65 * Alan Cox : Added socket option proto entries. 66 * Also added awareness of them to accept. 67 * Alan Cox : Added TCP options (SOL_TCP) 68 * Alan Cox : Switched wakeup calls to callbacks, 69 * so the kernel can layer network 70 * sockets. 71 * Alan Cox : Use ip_tos/ip_ttl settings. 72 * Alan Cox : Handle FIN (more) properly (we hope). 73 * Alan Cox : RST frames sent on unsynchronised 74 * state ack error. 75 * Alan Cox : Put in missing check for SYN bit. 76 * Alan Cox : Added tcp_select_window() aka NET2E 77 * window non shrink trick. 78 * Alan Cox : Added a couple of small NET2E timer 79 * fixes 80 * Charles Hedrick : TCP fixes 81 * Toomas Tamm : TCP window fixes 82 * Alan Cox : Small URG fix to rlogin ^C ack fight 83 * Charles Hedrick : Rewrote most of it to actually work 84 * Linus : Rewrote tcp_read() and URG handling 85 * completely 86 * Gerhard Koerting: Fixed some missing timer handling 87 * Matthew Dillon : Reworked TCP machine states as per RFC 88 * Gerhard Koerting: PC/TCP workarounds 89 * Adam Caldwell : Assorted timer/timing errors 90 * Matthew Dillon : Fixed another RST bug 91 * Alan Cox : Move to kernel side addressing changes. 92 * Alan Cox : Beginning work on TCP fastpathing 93 * (not yet usable) 94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
95 * Alan Cox : TCP fast path debugging 96 * Alan Cox : Window clamping 97 * Michael Riepe : Bug in tcp_check() 98 * Matt Dillon : More TCP improvements and RST bug fixes 99 * Matt Dillon : Yet more small nasties remove from the 100 * TCP code (Be very nice to this man if 101 * tcp finally works 100%) 8) 102 * Alan Cox : BSD accept semantics. 103 * Alan Cox : Reset on closedown bug. 104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). 105 * Michael Pall : Handle poll() after URG properly in 106 * all cases. 107 * Michael Pall : Undo the last fix in tcp_read_urg() 108 * (multi URG PUSH broke rlogin). 109 * Michael Pall : Fix the multi URG PUSH problem in 110 * tcp_readable(), poll() after URG 111 * works now. 112 * Michael Pall : recv(...,MSG_OOB) never blocks in the 113 * BSD api. 114 * Alan Cox : Changed the semantics of sk->socket to 115 * fix a race and a signal problem with 116 * accept() and async I/O. 117 * Alan Cox : Relaxed the rules on tcp_sendto(). 118 * Yury Shevchuk : Really fixed accept() blocking problem. 119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for 120 * clients/servers which listen in on 121 * fixed ports. 122 * Alan Cox : Cleaned the above up and shrank it to 123 * a sensible code size. 124 * Alan Cox : Self connect lockup fix. 125 * Alan Cox : No connect to multicast. 126 * Ross Biro : Close unaccepted children on master 127 * socket close. 128 * Alan Cox : Reset tracing code. 129 * Alan Cox : Spurious resets on shutdown. 130 * Alan Cox : Giant 15 minute/60 second timer error 131 * Alan Cox : Small whoops in polling before an 132 * accept. 133 * Alan Cox : Kept the state trace facility since 134 * it's handy for debugging. 135 * Alan Cox : More reset handler fixes. 136 * Alan Cox : Started rewriting the code based on 137 * the RFC's for other useful protocol 138 * references see: Comer, KA9Q NOS, and 139 * for a reference on the difference 140 * between specifications and how BSD 141 * works see the 4.4lite source. 
142 * A.N.Kuznetsov : Don't time wait on completion of tidy 143 * close. 144 * Linus Torvalds : Fin/Shutdown & copied_seq changes. 145 * Linus Torvalds : Fixed BSD port reuse to work first syn 146 * Alan Cox : Reimplemented timers as per the RFC 147 * and using multiple timers for sanity. 148 * Alan Cox : Small bug fixes, and a lot of new 149 * comments. 150 * Alan Cox : Fixed dual reader crash by locking 151 * the buffers (much like datagram.c) 152 * Alan Cox : Fixed stuck sockets in probe. A probe 153 * now gets fed up of retrying without 154 * (even a no space) answer. 155 * Alan Cox : Extracted closing code better 156 * Alan Cox : Fixed the closing state machine to 157 * resemble the RFC. 158 * Alan Cox : More 'per spec' fixes. 159 * Jorge Cwik : Even faster checksumming. 160 * Alan Cox : tcp_data() doesn't ack illegal PSH 161 * only frames. At least one pc tcp stack 162 * generates them. 163 * Alan Cox : Cache last socket. 164 * Alan Cox : Per route irtt. 165 * Matt Day : poll()->select() match BSD precisely on error 166 * Alan Cox : New buffers 167 * Marc Tamsky : Various sk->prot->retransmits and 168 * sk->retransmits misupdating fixed. 169 * Fixed tcp_write_timeout: stuck close, 170 * and TCP syn retries gets used now. 171 * Mark Yarvis : In tcp_read_wakeup(), don't send an 172 * ack if state is TCP_CLOSED. 173 * Alan Cox : Look up device on a retransmit - routes may 174 * change. Doesn't yet cope with MSS shrink right 175 * but its a start! 176 * Marc Tamsky : Closing in closing fixes. 177 * Mike Shaver : RFC1122 verifications. 178 * Alan Cox : rcv_saddr errors. 179 * Alan Cox : Block double connect(). 180 * Alan Cox : Small hooks for enSKIP. 181 * Alexey Kuznetsov: Path MTU discovery. 182 * Alan Cox : Support soft errors. 183 * Alan Cox : Fix MTU discovery pathological case 184 * when the remote claims no mtu! 185 * Marc Tamsky : TCP_CLOSE fix. 
186 * Colin (G3TNE) : Send a reset on syn ack replies in 187 * window but wrong (fixes NT lpd problems) 188 * Pedro Roque : Better TCP window handling, delayed ack. 189 * Joerg Reuter : No modification of locked buffers in 190 * tcp_do_retransmit() 191 * Eric Schenk : Changed receiver side silly window 192 * avoidance algorithm to BSD style 193 * algorithm. This doubles throughput 194 * against machines running Solaris, 195 * and seems to result in general 196 * improvement. 197 * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD 198 * Willy Konynenberg : Transparent proxying support. 199 * Mike McLagan : Routing by source 200 * Keith Owens : Do proper merging with partial SKB's in 201 * tcp_do_sendmsg to avoid burstiness. 202 * Eric Schenk : Fix fast close down bug with 203 * shutdown() followed by close(). 204 * Andi Kleen : Make poll agree with SIGIO 205 * 206 * This program is free software; you can redistribute it and/or 207 * modify it under the terms of the GNU General Public License 208 * as published by the Free Software Foundation; either version 209 * 2 of the License, or(at your option) any later version. 210 * 211 * Description of States: 212 * 213 * TCP_SYN_SENT sent a connection request, waiting for ack 214 * 215 * TCP_SYN_RECV received a connection request, sent ack, 216 * waiting for final ack in three-way handshake. 217 * 218 * TCP_ESTABLISHED connection established 219 * 220 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete 221 * transmission of remaining buffered data 222 * 223 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote 224 * to shutdown 225 * 226 * TCP_CLOSING both sides have shutdown but we still have 227 * data we have to finish sending 228 * 229 * TCP_TIME_WAIT timeout to catch resent junk before entering 230 * closed, can only be entered from FIN_WAIT2 231 * or CLOSING. 
Required because the other end
 *				may not have gotten our last ACK causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown. There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */

/*
 * RFC1122 status:
 * NOTE: I'm not going to be doing comments in the code for this one except
 * for violations and the like. tcp.c is just too big... If I say something
 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 * with Alan. -- MS 950903
 * [Note: Most of the TCP code has been rewritten/redesigned since this
 *  RFC1122 check. It is probably not correct anymore. It should be redone
 *  before 2.2. -AK]
 *
 * Use of PSH (4.2.2.2)
 *   MAY aggregate data sent without the PSH flag. (does)
 *   MAY queue data received without the PSH flag. (does)
 *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 *   MAY implement PSH on send calls. (doesn't, thus:)
 *     MUST NOT buffer data indefinitely (doesn't [1 second])
 *     MUST set PSH on last segment (does)
 *   MAY pass received PSH to application layer (doesn't)
 *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 *
 * Window Size (4.2.2.3, 4.2.2.16)
 *   MUST treat window size as an unsigned number (does)
 *   SHOULD treat window size as a 32-bit number (does not)
 *   MUST NOT shrink window once it is offered (does not normally)
 *
 * Urgent Pointer (4.2.2.4)
 * **MUST point urgent pointer to last byte of urgent data (not right
 *     after). (doesn't, to be like BSD. 
That's configurable, but defaults 274 * to off) 275 * MUST inform application layer asynchronously of incoming urgent 276 * data. (does) 277 * MUST provide application with means of determining the amount of 278 * urgent data pending. (does) 279 * **MUST support urgent data sequence of arbitrary length. (doesn't, but 280 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) 281 * [Follows BSD 1 byte of urgent data] 282 * 283 * TCP Options (4.2.2.5) 284 * MUST be able to receive TCP options in any segment. (does) 285 * MUST ignore unsupported options (does) 286 * 287 * Maximum Segment Size Option (4.2.2.6) 288 * MUST implement both sending and receiving MSS. (does, but currently 289 * only uses the smaller of both of them) 290 * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send 291 * it always). (does, even when MSS == 536, which is legal) 292 * MUST assume MSS == 536 if no MSS received at connection setup (does) 293 * MUST calculate "effective send MSS" correctly: 294 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) 295 * (does - but allows operator override) 296 * 297 * TCP Checksum (4.2.2.7) 298 * MUST generate and check TCP checksum. (does) 299 * 300 * Initial Sequence Number Selection (4.2.2.8) 301 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's 302 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is 303 * necessary for 10Mbps networks - and harder than BSD to spoof! 304 * With syncookies we don't) 305 * 306 * Simultaneous Open Attempts (4.2.2.10) 307 * MUST support simultaneous open attempts (does) 308 * 309 * Recovery from Old Duplicate SYN (4.2.2.11) 310 * MUST keep track of active vs. 
passive open (does) 311 * 312 * RST segment (4.2.2.12) 313 * SHOULD allow an RST segment to contain data (does, but doesn't do 314 * anything with it, which is standard) 315 * 316 * Closing a Connection (4.2.2.13) 317 * MUST inform application of whether connection was closed by RST or 318 * normal close. (does) 319 * MAY allow "half-duplex" close (treat connection as closed for the 320 * local app, even before handshake is done). (does) 321 * MUST linger in TIME_WAIT for 2 * MSL (does) 322 * 323 * Retransmission Timeout (4.2.2.15) 324 * MUST implement Jacobson's slow start and congestion avoidance 325 * stuff. (does) 326 * 327 * Probing Zero Windows (4.2.2.17) 328 * MUST support probing of zero windows. (does) 329 * MAY keep offered window closed indefinitely. (does) 330 * MUST allow remote window to stay closed indefinitely. (does) 331 * 332 * Passive Open Calls (4.2.2.18) 333 * MUST NOT let new passive open affect other connections. (doesn't) 334 * MUST support passive opens (LISTENs) concurrently. (does) 335 * 336 * Time to Live (4.2.2.19) 337 * MUST make TCP TTL configurable. (does - IP_TTL option) 338 * 339 * Event Processing (4.2.2.20) 340 * SHOULD queue out-of-order segments. (does) 341 * MUST aggregate ACK segments whenever possible. (does but badly) 342 * 343 * Retransmission Timeout Calculation (4.2.3.1) 344 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO 345 * calculation. (does, or at least explains them in the comments 8*b) 346 * SHOULD initialize RTO to 0 and RTT to 3. (does) 347 * 348 * When to Send an ACK Segment (4.2.3.2) 349 * SHOULD implement delayed ACK. (does) 350 * MUST keep ACK delay < 0.5 sec. (does) 351 * 352 * When to Send a Window Update (4.2.3.3) 353 * MUST implement receiver-side SWS. (does) 354 * 355 * When to Send Data (4.2.3.4) 356 * MUST implement sender-side SWS. (does) 357 * SHOULD implement Nagle algorithm. 
(does) 358 * 359 * TCP Connection Failures (4.2.3.5) 360 * MUST handle excessive retransmissions "properly" (see the RFC). (does) 361 * SHOULD inform application layer of soft errors. (does) 362 * 363 * TCP Keep-Alives (4.2.3.6) 364 * MAY provide keep-alives. (does) 365 * MUST make keep-alives configurable on a per-connection basis. (does) 366 * MUST default to no keep-alives. (does) 367 * MUST make keep-alive interval configurable. (does) 368 * MUST make default keep-alive interval > 2 hours. (does) 369 * MUST NOT interpret failure to ACK keep-alive packet as dead 370 * connection. (doesn't) 371 * SHOULD send keep-alive with no data. (does) 372 * 373 * TCP Multihoming (4.2.3.7) 374 * MUST get source address from IP layer before sending first 375 * SYN. (does) 376 * MUST use same local address for all segments of a connection. (does) 377 * 378 * IP Options (4.2.3.8) 379 * MUST ignore unsupported IP options. (does) 380 * MAY support Time Stamp and Record Route. (does) 381 * MUST allow application to specify a source route. (does) 382 * MUST allow received Source Route option to set route for all future 383 * segments on this connection. (does not (security issues)) 384 * 385 * ICMP messages (4.2.3.9) 386 * MUST act on ICMP errors. (does) 387 * MUST slow transmission upon receipt of a Source Quench. (doesn't anymore 388 * because that is deprecated now by the IETF, can be turned on) 389 * MUST NOT abort connection upon receipt of soft Destination 390 * Unreachables (0, 1, 5), Time Exceededs and Parameter 391 * Problems. (doesn't) 392 * SHOULD report soft Destination Unreachables etc. to the 393 * application. (does, except during SYN_RECV and may drop messages 394 * in some rare cases before accept() - ICMP is unreliable) 395 * SHOULD abort connection upon receipt of hard Destination Unreachable 396 * messages (2, 3, 4). (does, but see above) 397 * 398 * Remote Address Validation (4.2.3.10) 399 * MUST reject as an error OPEN for invalid remote IP address. 
(does) 400 * MUST ignore SYN with invalid source address. (does) 401 * MUST silently discard incoming SYN for broadcast/multicast 402 * address. (does) 403 * 404 * Asynchronous Reports (4.2.4.1) 405 * MUST provide mechanism for reporting soft errors to application 406 * layer. (does) 407 * 408 * Type of Service (4.2.4.2) 409 * MUST allow application layer to set Type of Service. (does IP_TOS) 410 * 411 * (Whew. -- MS 950903) 412 * (Updated by AK, but not complete yet.) 413 **/ 414 415#include <linux/types.h> 416#include <linux/fcntl.h> 417#include <linux/poll.h> 418#include <linux/init.h> 419 420#include <net/icmp.h> 421#include <net/tcp.h> 422 423#include <asm/uaccess.h> 424 425int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 426 427struct tcp_mib tcp_statistics; 428 429kmem_cache_t *tcp_openreq_cachep; 430kmem_cache_t *tcp_bucket_cachep; 431kmem_cache_t *tcp_timewait_cachep; 432 433/* 434 * Find someone to 'accept'. Must be called with 435 * the socket locked or with interrupts disabled 436 */ 437 438static struct open_request *tcp_find_established(struct tcp_opt *tp, 439 struct open_request **prevp) 440{ 441 struct open_request *req = tp->syn_wait_queue; 442 struct open_request *prev = (struct open_request *)&tp->syn_wait_queue; 443 while(req) { 444 if (req->sk && 445 ((1 << req->sk->state) & 446 ~(TCPF_SYN_SENT|TCPF_SYN_RECV))) 447 break; 448 prev = req; 449 req = req->dl_next; 450 } 451 *prevp = prev; 452 return req; 453} 454 455/* 456 * Walk down the receive queue counting readable data. 457 * 458 * Must be called with the socket lock held. 
459 */ 460 461static int tcp_readable(struct sock *sk) 462{ 463 unsigned long counted; 464 unsigned long amount; 465 struct sk_buff *skb; 466 int sum; 467 468 SOCK_DEBUG(sk, "tcp_readable: %p - ",sk); 469 470 skb = skb_peek(&sk->receive_queue); 471 if (skb == NULL) { 472 SOCK_DEBUG(sk, "empty\n"); 473 return(0); 474 } 475 476 counted = sk->tp_pinfo.af_tcp.copied_seq; /* Where we are at the moment */ 477 amount = 0; 478 479 /* Do until a push or until we are out of data. */ 480 do { 481 /* Found a hole so stops here. */ 482 if (before(counted, TCP_SKB_CB(skb)->seq)) /* should not happen */ 483 break; 484 485 /* Length - header but start from where we are up to 486 * avoid overlaps. 487 */ 488 sum = skb->len - (counted - TCP_SKB_CB(skb)->seq); 489 if (sum >= 0) { 490 /* Add it up, move on. */ 491 amount += sum; 492 counted += sum; 493 if (skb->h.th->syn) 494 counted++; 495 } 496 497 /* Don't count urg data ... but do it in the right place! 498 * Consider: "old_data (ptr is here) URG PUSH data" 499 * The old code would stop at the first push because 500 * it counted the urg (amount==1) and then does amount-- 501 * *after* the loop. This means tcp_readable() always 502 * returned zero if any URG PUSH was in the queue, even 503 * though there was normal data available. If we subtract 504 * the urg data right here, we even get it to work for more 505 * than one URG PUSH skb without normal data. 506 * This means that poll() finally works now with urg data 507 * in the queue. Note that rlogin was never affected 508 * because it doesn't use poll(); it uses two processes 509 * and a blocking read(). And the queue scan in tcp_read() 510 * was correct. Mike <pall@rz.uni-karlsruhe.de> 511 */ 512 513 /* Don't count urg data. 
*/ 514 if (skb->h.th->urg) 515 amount--; 516#if 0 517 if (amount && skb->h.th->psh) break; 518#endif 519 skb = skb->next; 520 } while(skb != (struct sk_buff *)&sk->receive_queue); 521 522 SOCK_DEBUG(sk, "got %lu bytes.\n",amount); 523 return(amount); 524} 525 526/* 527 * LISTEN is a special case for poll.. 528 */ 529static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait) 530{ 531 struct open_request *req, *dummy; 532 533 lock_sock(sk); 534 req = tcp_find_established(&sk->tp_pinfo.af_tcp, &dummy); 535 release_sock(sk); 536 if (req) 537 return POLLIN | POLLRDNORM; 538 return 0; 539} 540 541/* 542 * Compute minimal free write space needed to queue new packets. 543 */ 544#define tcp_min_write_space(__sk) \ 545 (atomic_read(&(__sk)->wmem_alloc) / 2) 546 547/* 548 * Wait for a TCP event. 549 * 550 * Note that we don't need to lock the socket, as the upper poll layers 551 * take care of normal races (between the test and the event) and we don't 552 * go look at any of the socket buffers directly. 553 */ 554unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) 555{ 556 unsigned int mask; 557 struct sock *sk = sock->sk; 558 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); 559 560 poll_wait(file, sk->sleep, wait); 561 if (sk->state == TCP_LISTEN) 562 return tcp_listen_poll(sk, wait); 563 564 mask = 0; 565 if (sk->err) 566 mask = POLLERR; 567 568 /* 569 * POLLHUP is certainly not done right. But poll() doesn't 570 * have a notion of HUP in just one direction, and for a 571 * socket the read side is more interesting. 572 * 573 * Some poll() documentation says that POLLHUP is incompatible 574 * with the POLLOUT/POLLWR flags, so somebody should check this 575 * all. But careful, it tends to be safer to return too many 576 * bits than too few, and you can easily break real applications 577 * if you don't tell them that something has hung up! 578 * 579 * Check-me. 
580 */ 581 if (sk->shutdown & RCV_SHUTDOWN) 582 mask |= POLLHUP; 583 584 /* Connected? */ 585 if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) { 586 if ((tp->rcv_nxt != tp->copied_seq) && 587 (tp->urg_seq != tp->copied_seq || 588 tp->rcv_nxt != tp->copied_seq+1 || 589 sk->urginline || !tp->urg_data)) 590 mask |= POLLIN | POLLRDNORM; 591 592 if (!(sk->shutdown & SEND_SHUTDOWN)) { 593 if (sock_wspace(sk) >= tcp_min_write_space(sk)) { 594 mask |= POLLOUT | POLLWRNORM; 595 } else { /* send SIGIO later */ 596 sk->socket->flags |= SO_NOSPACE; 597 } 598 } 599 600 if (tp->urg_data & URG_VALID) 601 mask |= POLLPRI; 602 } 603 return mask; 604} 605 606/* 607 * Socket write_space callback. 608 * This (or rather the sock_wake_async) should agree with poll. 609 */ 610void tcp_write_space(struct sock *sk) 611{ 612 if (sk->dead) 613 return; 614 615 wake_up_interruptible(sk->sleep); 616 if (sock_wspace(sk) >= 617 tcp_min_write_space(sk)) 618 sock_wake_async(sk->socket, 2); 619} 620 621 622int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) 623{ 624 int answ; 625 626 switch(cmd) { 627 case TIOCINQ: 628#ifdef FIXME /* FIXME: */ 629 case FIONREAD: 630#endif 631 if (sk->state == TCP_LISTEN) 632 return(-EINVAL); 633 lock_sock(sk); 634 answ = tcp_readable(sk); 635 release_sock(sk); 636 break; 637 case SIOCATMARK: 638 { 639 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); 640 answ = tp->urg_data && tp->urg_seq == tp->copied_seq; 641 break; 642 } 643 case TIOCOUTQ: 644 if (sk->state == TCP_LISTEN) 645 return(-EINVAL); 646 answ = sock_wspace(sk); 647 break; 648 default: 649 return(-ENOIOCTLCMD); 650 }; 651 652 return put_user(answ, (int *)arg); 653} 654 655/* 656 * Wait for a socket to get into the connected state 657 * 658 * Note: must be called with the socket locked. 
659 */ 660static int wait_for_tcp_connect(struct sock * sk, int flags) 661{ 662 struct task_struct *tsk = current; 663 struct wait_queue wait = { tsk, NULL }; 664 665 while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { 666 if(sk->err) 667 return sock_error(sk); 668 if((1 << sk->state) & 669 ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { 670 if(sk->keepopen && !(flags&MSG_NOSIGNAL)) 671 send_sig(SIGPIPE, tsk, 0); 672 return -EPIPE; 673 } 674 if(flags & MSG_DONTWAIT) 675 return -EAGAIN; 676 if(signal_pending(tsk)) 677 return -ERESTARTSYS; 678 679 tsk->state = TASK_INTERRUPTIBLE; 680 add_wait_queue(sk->sleep, &wait); 681 release_sock(sk); 682 683 if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) && 684 sk->err == 0) 685 schedule(); 686 687 tsk->state = TASK_RUNNING; 688 remove_wait_queue(sk->sleep, &wait); 689 lock_sock(sk); 690 } 691 return 0; 692} 693 694static inline int tcp_memory_free(struct sock *sk) 695{ 696 return atomic_read(&sk->wmem_alloc) < sk->sndbuf; 697} 698 699/* 700 * Wait for more memory for a socket 701 */ 702static void wait_for_tcp_memory(struct sock * sk) 703{ 704 release_sock(sk); 705 if (!tcp_memory_free(sk)) { 706 struct wait_queue wait = { current, NULL }; 707 708 sk->socket->flags &= ~SO_NOSPACE; 709 add_wait_queue(sk->sleep, &wait); 710 for (;;) { 711 if (signal_pending(current)) 712 break; 713 current->state = TASK_INTERRUPTIBLE; 714 if (tcp_memory_free(sk)) 715 break; 716 if (sk->shutdown & SEND_SHUTDOWN) 717 break; 718 if (sk->err) 719 break; 720 schedule(); 721 } 722 current->state = TASK_RUNNING; 723 remove_wait_queue(sk->sleep, &wait); 724 } 725 lock_sock(sk); 726} 727 728/* When all user supplied data has been queued set the PSH bit */ 729#define PSH_NEEDED (seglen == 0 && iovlen == 0) 730 731/* 732 * This routine copies from a user buffer into a socket, 733 * and starts the transmit system. 734 * 735 * Note: must be called with the socket locked. 
736 */ 737 738int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) 739{ 740 struct iovec *iov; 741 struct tcp_opt *tp; 742 struct sk_buff *skb; 743 int iovlen, flags; 744 int mss_now; 745 int err, copied; 746 747 lock_sock(sk); 748 749 err = 0; 750 tp = &(sk->tp_pinfo.af_tcp); 751 752 /* Wait for a connection to finish. */ 753 flags = msg->msg_flags; 754 if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) 755 if((err = wait_for_tcp_connect(sk, flags)) != 0) 756 goto out; 757 758 /* This should be in poll */ 759 sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */ 760 761 mss_now = tcp_current_mss(sk); 762 763 /* Ok commence sending. */ 764 iovlen = msg->msg_iovlen; 765 iov = msg->msg_iov; 766 copied = 0; 767 768 while(--iovlen >= 0) { 769 int seglen=iov->iov_len; 770 unsigned char * from=iov->iov_base; 771 772 iov++; 773 774 while(seglen > 0) { 775 int copy, tmp, queue_it; 776 777 if (err) 778 goto do_fault2; 779 780 /* Stop on errors. */ 781 if (sk->err) 782 goto do_sock_err; 783 784 /* Make sure that we are established. */ 785 if (sk->shutdown & SEND_SHUTDOWN) 786 goto do_shutdown; 787 788 /* Now we need to check if we have a half 789 * built packet we can tack some data onto. 790 */ 791 if (tp->send_head && !(flags & MSG_OOB)) { 792 skb = sk->write_queue.prev; 793 copy = skb->len; 794 /* If the remote does SWS avoidance we should 795 * queue the best we can if not we should in 796 * fact send multiple packets... 797 * A method for detecting this would be most 798 * welcome. 
799 */ 800 if (skb_tailroom(skb) > 0 && 801 (mss_now - copy) > 0 && 802 tp->snd_nxt < TCP_SKB_CB(skb)->end_seq) { 803 int last_byte_was_odd = (copy % 4); 804 805 copy = mss_now - copy; 806 if(copy > skb_tailroom(skb)) 807 copy = skb_tailroom(skb); 808 if(copy > seglen) 809 copy = seglen; 810 if(last_byte_was_odd) { 811 if(copy_from_user(skb_put(skb, copy), 812 from, copy)) 813 err = -EFAULT; 814 skb->csum = csum_partial(skb->data, 815 skb->len, 0); 816 } else { 817 skb->csum = 818 csum_and_copy_from_user( 819 from, skb_put(skb, copy), 820 copy, skb->csum, &err); 821 } 822 /* 823 * FIXME: the *_user functions should 824 * return how much data was 825 * copied before the fault 826 * occurred and then a partial 827 * packet with this data should 828 * be sent. Unfortunately 829 * csum_and_copy_from_user doesn't 830 * return this information. 831 * ATM it might send partly zeroed 832 * data in this case. 833 */ 834 tp->write_seq += copy; 835 TCP_SKB_CB(skb)->end_seq += copy; 836 from += copy; 837 copied += copy; 838 seglen -= copy; 839 if (PSH_NEEDED) 840 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 841 continue; 842 } 843 } 844 845 /* We also need to worry about the window. If 846 * window < 1/2 the maximum window we've seen 847 * from this host, don't use it. This is 848 * sender side silly window prevention, as 849 * specified in RFC1122. (Note that this is 850 * different than earlier versions of SWS 851 * prevention, e.g. RFC813.). What we 852 * actually do is use the whole MSS. Since 853 * the results in the right edge of the packet 854 * being outside the window, it will be queued 855 * for later rather than sent. 856 */ 857 copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); 858 if(copy > (tp->max_window >> 1)) 859 copy = min(copy, mss_now); 860 else 861 copy = mss_now; 862 if(copy > seglen) 863 copy = seglen; 864 865 /* Determine how large of a buffer to allocate. 
*/ 866 tmp = MAX_HEADER + sk->prot->max_header; 867 if (copy < min(mss_now, tp->max_window >> 1) && 868 !(flags & MSG_OOB)) { 869 tmp += min(mss_now, tp->max_window); 870 871 /* What is happening here is that we want to 872 * tack on later members of the users iovec 873 * if possible into a single frame. When we 874 * leave this loop our caller checks to see if 875 * we can send queued frames onto the wire. 876 * See tcp_v[46]_sendmsg() for this. 877 */ 878 queue_it = 1; 879 } else { 880 tmp += copy; 881 queue_it = 0; 882 } 883 skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL); 884 885 /* If we didn't get any memory, we need to sleep. */ 886 if (skb == NULL) { 887 sk->socket->flags |= SO_NOSPACE; 888 if (flags&MSG_DONTWAIT) { 889 err = -EAGAIN; 890 goto do_interrupted; 891 } 892 if (signal_pending(current)) { 893 err = -ERESTARTSYS; 894 goto do_interrupted; 895 } 896 wait_for_tcp_memory(sk); 897 898 /* If SACK's were formed or PMTU events happened, 899 * we must find out about it. 900 */ 901 mss_now = tcp_current_mss(sk); 902 continue; 903 } 904 905 seglen -= copy; 906 907 /* Prepare control bits for TCP header creation engine. */ 908 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | 909 (PSH_NEEDED ? 910 TCPCB_FLAG_PSH : 0)); 911 TCP_SKB_CB(skb)->sacked = 0; 912 if (flags & MSG_OOB) { 913 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG; 914 TCP_SKB_CB(skb)->urg_ptr = copy; 915 } else 916 TCP_SKB_CB(skb)->urg_ptr = 0; 917 918 /* TCP data bytes are SKB_PUT() on top, later 919 * TCP+IP+DEV headers are SKB_PUSH()'d beneath. 920 * Reserve header space and checksum the data. 921 */ 922 skb_reserve(skb, MAX_HEADER + sk->prot->max_header); 923 skb->csum = csum_and_copy_from_user(from, 924 skb_put(skb, copy), copy, 0, &err); 925 926 if (err) 927 goto do_fault; 928 929 from += copy; 930 copied += copy; 931 932 TCP_SKB_CB(skb)->seq = tp->write_seq; 933 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy; 934 935 /* This advances tp->write_seq for us. 
*/ 936 tcp_send_skb(sk, skb, queue_it); 937 } 938 } 939 sk->err = 0; 940 err = copied; 941 goto out; 942 943do_sock_err: 944 if(copied) 945 err = copied; 946 else 947 err = sock_error(sk); 948 goto out; 949do_shutdown: 950 if(copied) 951 err = copied; 952 else { 953 if (!(flags&MSG_NOSIGNAL)) 954 send_sig(SIGPIPE, current, 0); 955 err = -EPIPE; 956 } 957 goto out; 958do_interrupted: 959 if(copied) 960 err = copied; 961 goto out; 962do_fault: 963 kfree_skb(skb); 964do_fault2: 965 err = -EFAULT; 966out: 967 tcp_push_pending_frames(sk, tp); 968 release_sock(sk); 969 return err; 970} 971 972#undef PSH_NEEDED 973 974/* 975 * Send an ack if one is backlogged at this point. Ought to merge 976 * this with tcp_send_ack(). 977 * This is called for delayed acks also. 978 */ 979 980void tcp_read_wakeup(struct sock *sk) 981{ 982 /* If we're closed, don't send an ack, or we'll get a RST 983 * from the closed destination. 984 */ 985 if (sk->state != TCP_CLOSE) 986 tcp_send_ack(sk); 987} 988 989/* 990 * Handle reading urgent data. BSD has very simple semantics for 991 * this, no blocking and very strange errors 8) 992 */ 993 994static int tcp_recv_urg(struct sock * sk, int nonblock, 995 struct msghdr *msg, int len, int flags, 996 int *addr_len) 997{ 998 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); 999 1000 /* No URG data to read. */
1001 if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ) 1002 return -EINVAL; /* Yes this is right ! */ 1003 1004 if (sk->err) 1005 return sock_error(sk); 1006 1007 if (sk->done) 1008 return -ENOTCONN; 1009 1010 if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) { 1011 sk->done = 1; 1012 return 0; 1013 } 1014 1015 lock_sock(sk); 1016 if (tp->urg_data & URG_VALID) { 1017 int err = 0; 1018 char c = tp->urg_data; 1019 1020 if (!(flags & MSG_PEEK)) 1021 tp->urg_data = URG_READ; 1022 1023 if(msg->msg_name) 1024 tp->af_specific->addr2sockaddr(sk, (struct sockaddr *) 1025 msg->msg_name); 1026 1027 if(addr_len) 1028 *addr_len = tp->af_specific->sockaddr_len; 1029 1030 /* Read urgent data. */ 1031 msg->msg_flags|=MSG_OOB; 1032 release_sock(sk); 1033 1034 if(len>0) 1035 { 1036 err = memcpy_toiovec(msg->msg_iov, &c, 1); 1037 /* N.B. already set above ... */ 1038 msg->msg_flags|=MSG_OOB; 1039 } 1040 else 1041 msg->msg_flags|=MSG_TRUNC; 1042 1043 /* N.B. Is this right?? If len == 0 we didn't read any data */ 1044 return err ? -EFAULT : 1; 1045 } 1046 release_sock(sk); 1047 1048 /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and 1049 * the available implementations agree in this case: 1050 * this call should never block, independent of the 1051 * blocking state of the socket. 1052 * Mike <pall@rz.uni-karlsruhe.de> 1053 */ 1054 return -EAGAIN; 1055} 1056 1057/* 1058 * Release a skb if it is no longer needed. This routine 1059 * must be called with interrupts disabled or with the 1060 * socket locked so that the sk_buff queue operation is ok. 1061 */ 1062 1063static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb) 1064{ 1065 __skb_unlink(skb, &sk->receive_queue); 1066 kfree_skb(skb); 1067} 1068 1069/* Clean up the receive buffer for full frames taken by the user, 1070 * then send an ACK if necessary. 
COPIED is the number of bytes 1071 * tcp_recvmsg has given to the user so far, it speeds up the 1072 * calculation of whether or not we must ACK for the sake of 1073 * a window update. 1074 */ 1075static void cleanup_rbuf(struct sock *sk, int copied) 1076{ 1077 struct sk_buff *skb; 1078 1079 /* NOTE! The socket must be locked, so that we don't get 1080 * a messed-up receive queue. 1081 */ 1082 while ((skb=skb_peek(&sk->receive_queue)) != NULL) { 1083 if (!skb->used || atomic_read(&skb->users) > 1) 1084 break; 1085 tcp_eat_skb(sk, skb); 1086 } 1087 1088 /* We send an ACK if we can now advertise a non-zero window 1089 * which has been raised "significantly". 1090 */ 1091 if(copied > 0) { 1092 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); 1093 __u32 rcv_window_now = tcp_receive_window(tp); 1094 __u32 new_window = __tcp_select_window(sk); 1095 1096 /* We won't be raising the window any further than 1097 * the window-clamp allows. Our window selection 1098 * also keeps things a nice multiple of MSS. These 1099 * checks are necessary to prevent spurious ACKs 1100 * which don't advertize a larger window. 1101 */ 1102 if((new_window && (new_window >= rcv_window_now * 2)) && 1103 ((rcv_window_now + tp->mss_cache) <= tp->window_clamp)) 1104 tcp_read_wakeup(sk); 1105 } 1106} 1107 1108 1109/* 1110 * This routine copies from a sock struct into the user buffer. 1111 */ 1112 1113int tcp_recvmsg(struct sock *sk, struct msghdr *msg, 1114 int len, int nonblock, int flags, int *addr_len) 1115{ 1116 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); 1117 struct wait_queue wait = { current, NULL }; 1118 int copied = 0; 1119 u32 peek_seq; 1120 volatile u32 *seq; /* So gcc doesn't overoptimise */ 1121 unsigned long used; 1122 int err = 0; 1123 int target = 1; /* Read at least this many bytes */ 1124 1125 if (sk->err) 1126 return sock_error(sk); 1127 1128 if (sk->state == TCP_LISTEN) 1129 return -ENOTCONN; 1130 1131 /* Urgent data needs to be handled specially. 
*/ 1132 if (flags & MSG_OOB) 1133 return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len); 1134 1135 /* Copying sequence to update. This is volatile to handle 1136 * the multi-reader case neatly (memcpy_to/fromfs might be 1137 * inline and thus not flush cached variables otherwise). 1138 */ 1139 peek_seq = tp->copied_seq; 1140 seq = &tp->copied_seq; 1141 if (flags & MSG_PEEK) 1142 seq = &peek_seq; 1143 1144 /* Handle the POSIX bogosity MSG_WAITALL. */ 1145 if (flags & MSG_WAITALL) 1146 target=len; 1147 1148 add_wait_queue(sk->sleep, &wait); 1149 lock_sock(sk); 1150 1151 /* 1152 * BUG BUG BUG 1153 * This violates 1003.1g compliance. We must wait for 1154 * data to exist even if we read none! 1155 */ 1156 1157 while (len > 0) { 1158 struct sk_buff * skb; 1159 u32 offset; 1160 1161 /* Are we at urgent data? Stop if we have read anything. */ 1162 if (copied && tp->urg_data && tp->urg_seq == *seq) 1163 break; 1164 1165 /* We need to check signals first, to get correct SIGURG 1166 * handling. FIXME: Need to check this doesnt impact 1003.1g 1167 * and move it down to the bottom of the loop 1168 */ 1169 if (signal_pending(current)) { 1170 if (copied) 1171 break; 1172 copied = -ERESTARTSYS; 1173 if (nonblock) 1174 copied = -EAGAIN; 1175 break; 1176 } 1177 1178 /* Next get a buffer. */ 1179 current->state = TASK_INTERRUPTIBLE; 1180 1181 skb = skb_peek(&sk->receive_queue); 1182 do { 1183 if (!skb) 1184 break; 1185 1186 /* Now that we have two receive queues this 1187 * shouldn't happen. 
1188 */ 1189 if (before(*seq, TCP_SKB_CB(skb)->seq)) { 1190 printk(KERN_INFO "recvmsg bug: copied %X seq %X\n", 1191 *seq, TCP_SKB_CB(skb)->seq); 1192 break; 1193 } 1194 offset = *seq - TCP_SKB_CB(skb)->seq; 1195 if (skb->h.th->syn) 1196 offset--; 1197 if (offset < skb->len) 1198 goto found_ok_skb; 1199 if (skb->h.th->fin) 1200 goto found_fin_ok; 1201 if (!(flags & MSG_PEEK)) 1202 skb->used = 1; 1203 skb = skb->next; 1204 } while (skb != (struct sk_buff *)&sk->receive_queue); 1205 1206 if (copied >= target) 1207 break; 1208 1209 /* 1210 These three lines and clause if (sk->state == TCP_CLOSE) 1211 are unlikely to be correct, if target > 1. 1212 I DO NOT FIX IT, because I have no idea, what 1213 POSIX prescribes to make here. Probably, it really 1214 wants to lose data 8), if not all target is received. 1215 --ANK 1216 */ 1217 if (sk->err && !(flags&MSG_PEEK)) { 1218 copied = sock_error(sk); 1219 break; 1220 } 1221 1222 if (sk->shutdown & RCV_SHUTDOWN) { 1223 sk->done = 1; 1224 break; 1225 } 1226 1227 if (sk->state == TCP_CLOSE) { 1228 if (!sk->done) { 1229 sk->done = 1; 1230 break; 1231 } 1232 copied = -ENOTCONN; 1233 break; 1234 } 1235 1236 if (nonblock) { 1237 copied = -EAGAIN; 1238 break; 1239 } 1240 1241 cleanup_rbuf(sk, copied); 1242 release_sock(sk); 1243 sk->socket->flags |= SO_WAITDATA; 1244 schedule(); 1245 sk->socket->flags &= ~SO_WAITDATA; 1246 lock_sock(sk); 1247 continue; 1248 1249 found_ok_skb: 1250 /* Lock the buffer. We can be fairly relaxed as 1251 * an interrupt will never steal a buffer we are 1252 * using unless I've missed something serious in 1253 * tcp_data. 1254 */ 1255 atomic_inc(&skb->users); 1256 1257 /* Ok so how much can we use? */ 1258 used = skb->len - offset; 1259 if (len < used) 1260 used = len; 1261 1262 /* Do we have urgent data here? 
*/ 1263 if (tp->urg_data) { 1264 u32 urg_offset = tp->urg_seq - *seq; 1265 if (urg_offset < used) { 1266 if (!urg_offset) { 1267 if (!sk->urginline) { 1268 ++*seq; 1269 offset++; 1270 used--; 1271 } 1272 } else 1273 used = urg_offset; 1274 } 1275 } 1276 1277 /* Copy it - We _MUST_ update *seq first so that we 1278 * don't ever double read when we have dual readers 1279 */ 1280 *seq += used; 1281 1282 /* This memcpy_toiovec can sleep. If it sleeps and we 1283 * do a second read it relies on the skb->users to avoid 1284 * a crash when cleanup_rbuf() gets called. 1285 */ 1286 err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used); 1287 if (err) { 1288 /* Exception. Bailout! */ 1289 atomic_dec(&skb->users); 1290 copied = -EFAULT; 1291 break; 1292 } 1293 1294 copied += used; 1295 len -= used; 1296 1297 /* We now will not sleep again until we are finished 1298 * with skb. Sorry if you are doing the SMP port 1299 * but you'll just have to fix it neatly ;) 1300 */ 1301 atomic_dec(&skb->users); 1302 1303 if (after(tp->copied_seq,tp->urg_seq)) 1304 tp->urg_data = 0; 1305 if (used + offset < skb->len) 1306 continue; 1307 1308 /* Process the FIN. We may also need to handle PSH 1309 * here and make it break out of MSG_WAITALL. 1310 */ 1311 if (skb->h.th->fin) 1312 goto found_fin_ok; 1313 if (flags & MSG_PEEK) 1314 continue; 1315 skb->used = 1; 1316 if (atomic_read(&skb->users) == 1) 1317 tcp_eat_skb(sk, skb); 1318 continue; 1319 1320 found_fin_ok: 1321 ++*seq; 1322 if (flags & MSG_PEEK) 1323 break; 1324 1325 /* All is done. 
*/ 1326 skb->used = 1; 1327 sk->shutdown |= RCV_SHUTDOWN; 1328 break; 1329 } 1330 1331 if(copied > 0 && msg->msg_name) 1332 tp->af_specific->addr2sockaddr(sk, (struct sockaddr *) 1333 msg->msg_name); 1334 1335 if(addr_len) 1336 *addr_len = tp->af_specific->sockaddr_len; 1337 1338 remove_wait_queue(sk->sleep, &wait); 1339 current->state = TASK_RUNNING; 1340 1341 /* Clean up data we have read: This will do ACK frames. */ 1342 cleanup_rbuf(sk, copied); 1343 release_sock(sk); 1344 return copied; 1345} 1346 1347/* 1348 * Check whether to renew the timer. 1349 */ 1350static inline void tcp_check_fin_timer(struct sock *sk) 1351{ 1352 if (sk->state == TCP_FIN_WAIT2 && !sk->timer.prev) 1353 tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); 1354} 1355 1356/* 1357 * State processing on a close. This implements the state shift for 1358 * sending our FIN frame. Note that we only send a FIN for some 1359 * states. A shutdown() may have already sent the FIN, or we may be 1360 * closed. 1361 */ 1362 1363static unsigned char new_state[16] = { 1364 /* current state: new state: action: */ 1365 /* (Invalid) */ TCP_CLOSE, 1366 /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, 1367 /* TCP_SYN_SENT */ TCP_CLOSE, 1368 /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, 1369 /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, 1370 /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, 1371 /* TCP_TIME_WAIT */ TCP_CLOSE, 1372 /* TCP_CLOSE */ TCP_CLOSE, 1373 /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, 1374 /* TCP_LAST_ACK */ TCP_LAST_ACK, 1375 /* TCP_LISTEN */ TCP_CLOSE, 1376 /* TCP_CLOSING */ TCP_CLOSING, 1377}; 1378 1379static int tcp_close_state(struct sock *sk, int dead) 1380{ 1381 int next = (int) new_state[sk->state]; 1382 int ns = (next & TCP_STATE_MASK); 1383 1384 tcp_set_state(sk, ns); 1385 1386 /* This is a (useful) BSD violating of the RFC. There is a 1387 * problem with TCP as specified in that the other end could 1388 * keep a socket open forever with no application left this end. 
1389 * We use a 3 minute timeout (about the same as BSD) then kill 1390 * our end. If they send after that then tough - BUT: long enough 1391 * that we won't make the old 4*rto = almost no time - whoops 1392 * reset mistake. 1393 */ 1394 if (dead) 1395 tcp_check_fin_timer(sk); 1396 1397 return (next & TCP_ACTION_FIN); 1398} 1399 1400/* 1401 * Shutdown the sending side of a connection. Much like close except 1402 * that we don't receive shut down or set sk->dead. 1403 */ 1404 1405void tcp_shutdown(struct sock *sk, int how) 1406{ 1407 /* We need to grab some memory, and put together a FIN, 1408 * and then put it into the queue to be sent. 1409 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. 1410 */ 1411 if (!(how & SEND_SHUTDOWN)) 1412 return; 1413 1414 /* If we've already sent a FIN, or it's a closed state, skip this. */ 1415 if ((1 << sk->state) & 1416 (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) { 1417 lock_sock(sk); 1418 1419 /* Clear out any half completed packets. FIN if needed. */ 1420 if (tcp_close_state(sk,0)) 1421 tcp_send_fin(sk); 1422 1423 release_sock(sk); 1424 } 1425} 1426 1427 1428/* 1429 * Return 1 if we still have things to send in our buffers. 1430 */ 1431 1432static inline int closing(struct sock * sk) 1433{ 1434 return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK)); 1435} 1436 1437/* 1438 * This routine closes sockets which have been at least partially 1439 * opened, but not yet accepted. Currently it is only called by 1440 * tcp_close, and timeout mirrors the value there. 
1441 */ 1442 1443static void tcp_close_pending (struct sock *sk) 1444{ 1445 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); 1446 struct open_request *req = tp->syn_wait_queue; 1447 1448 while(req) { 1449 struct open_request *iter; 1450 1451 if (req->sk) 1452 tcp_close(req->sk, 0); 1453 1454 iter = req; 1455 req = req->dl_next; 1456 1457 (*iter->class->destructor)(iter); 1458 tcp_dec_slow_timer(TCP_SLT_SYNACK); 1459 sk->ack_backlog--; 1460 tcp_openreq_free(iter); 1461 } 1462 1463 tcp_synq_init(tp); 1464} 1465 1466void tcp_close(struct sock *sk, long timeout) 1467{ 1468 struct sk_buff *skb; 1469 int data_was_unread = 0; 1470 1471 /* 1472 * Check whether the socket is locked ... supposedly 1473 * it's impossible to tcp_close() a locked socket. 1474 */ 1475 if (atomic_read(&sk->sock_readers)) 1476 printk("tcp_close: socket already locked!\n"); 1477 1478 /* We need to grab some memory, and put together a FIN, 1479 * and then put it into the queue to be sent. 1480 */ 1481 lock_sock(sk); 1482 if(sk->state == TCP_LISTEN) { 1483 /* Special case. */ 1484 tcp_set_state(sk, TCP_CLOSE); 1485 tcp_close_pending(sk); 1486 release_sock(sk); 1487 sk->dead = 1; 1488 return; 1489 } 1490 1491 /* It is questionable, what the role of this is now. 1492 * In any event either it should be removed, or 1493 * increment of SLT_KEEPALIVE be done, this is causing 1494 * big problems. For now I comment it out. -DaveM 1495 */ 1496 /* sk->keepopen = 1; */ 1497 sk->shutdown = SHUTDOWN_MASK; 1498 1499 if (!sk->dead) 1500 sk->state_change(sk); 1501 1502 /* We need to flush the recv. buffs. We do this only on the 1503 * descriptor close, not protocol-sourced closes, because the 1504 * reader process may not have drained the data yet! 
1505 */ 1506 while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) { 1507 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin; 1508 data_was_unread += len; 1509 kfree_skb(skb); 1510 } 1511 1512 /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section 1513 * 3.10, we send a RST here because data was lost. To 1514 * witness the awful effects of the old behavior of always 1515 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start 1516 * a bulk GET in an FTP client, suspend the process, wait 1517 * for the client to advertise a zero window, then kill -9 1518 * the FTP client, wheee... Note: timeout is always zero 1519 * in such a case. 1520 */ 1521 if(data_was_unread != 0) { 1522 /* Unread data was tossed, zap the connection. */ 1523 tcp_set_state(sk, TCP_CLOSE); 1524 tcp_send_active_reset(sk); 1525 } else if (tcp_close_state(sk,1)) { 1526 /* We FIN if the application ate all the data before 1527 * zapping the connection. 1528 */ 1529 tcp_send_fin(sk); 1530 } 1531 1532 if (timeout) { 1533 struct task_struct *tsk = current; 1534 struct wait_queue wait = { tsk, NULL }; 1535 1536 add_wait_queue(sk->sleep, &wait); 1537 release_sock(sk); 1538 1539 while (1) { 1540 tsk->state = TASK_INTERRUPTIBLE; 1541 if (!closing(sk)) 1542 break; 1543 timeout = schedule_timeout(timeout); 1544 if (signal_pending(tsk) || !timeout) 1545 break; 1546 } 1547 1548 tsk->state = TASK_RUNNING; 1549 remove_wait_queue(sk->sleep, &wait); 1550 1551 lock_sock(sk); 1552 } 1553 1554 /* Now that the socket is dead, if we are in the FIN_WAIT2 state 1555 * we may need to set up a timer. 1556 */ 1557 tcp_check_fin_timer(sk); 1558 1559 release_sock(sk); 1560 sk->dead = 1; 1561} 1562 1563/* 1564 * Wait for an incoming connection, avoid race 1565 * conditions. This must be called with the socket locked. 
1566 */ 1567static struct open_request * wait_for_connect(struct sock * sk, 1568 struct open_request **pprev) 1569{ 1570 struct wait_queue wait = { current, NULL }; 1571 struct open_request *req; 1572 1573 add_wait_queue(sk->sleep, &wait); 1574 for (;;) { 1575 current->state = TASK_INTERRUPTIBLE; 1576 release_sock(sk); 1577 schedule(); 1578 lock_sock(sk); 1579 req = tcp_find_established(&(sk->tp_pinfo.af_tcp), pprev); 1580 if (req) 1581 break; 1582 if (signal_pending(current)) 1583 break; 1584 } 1585 current->state = TASK_RUNNING; 1586 remove_wait_queue(sk->sleep, &wait); 1587 return req; 1588} 1589 1590/* 1591 * This will accept the next outstanding connection. 1592 * 1593 * Be careful about race conditions here - this is subtle. 1594 */ 1595 1596struct sock *tcp_accept(struct sock *sk, int flags) 1597{ 1598 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; 1599 struct open_request *req, *prev; 1600 struct sock *newsk = NULL; 1601 int error; 1602 1603 lock_sock(sk); 1604 1605 /* We need to make sure that this socket is listening, 1606 * and that it has something pending. 1607 */ 1608 error = EINVAL; 1609 if (sk->state != TCP_LISTEN) 1610 goto out; 1611 1612 /* Find already established connection */ 1613 req = tcp_find_established(tp, &prev); 1614 if (!req) { 1615 /* If this is a non blocking socket don't sleep */ 1616 error = EAGAIN; 1617 if (flags & O_NONBLOCK) 1618 goto out; 1619 1620 error = ERESTARTSYS; 1621 req = wait_for_connect(sk, &prev); 1622 if (!req) 1623 goto out; 1624 } 1625 1626 tcp_synq_unlink(tp, req, prev); 1627 newsk = req->sk; 1628 req->class->destructor(req); 1629 tcp_openreq_free(req); 1630 sk->ack_backlog--; 1631 if(sk->keepopen) 1632 tcp_inc_slow_timer(TCP_SLT_KEEPALIVE); 1633 1634 release_sock(sk); 1635 return newsk; 1636 1637out: 1638 /* sk should be in LISTEN state, thus accept can use sk->err for 1639 * internal purposes without stomping one anyone's feed. 
1640 */ 1641 sk->err = error; 1642 release_sock(sk); 1643 return newsk; 1644} 1645 1646/* 1647 * Socket option code for TCP. 1648 */ 1649 1650int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, 1651 int optlen) 1652{ 1653 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); 1654 int val; 1655 1656 if (level != SOL_TCP) 1657 return tp->af_specific->setsockopt(sk, level, optname, 1658 optval, optlen); 1659 1660 if(optlen<sizeof(int)) 1661 return -EINVAL; 1662 1663 if (get_user(val, (int *)optval)) 1664 return -EFAULT; 1665 1666 switch(optname) { 1667 case TCP_MAXSEG: 1668 /* values greater than interface MTU won't take effect. however at 1669 * the point when this call is done we typically don't yet know 1670 * which interface is going to be used 1671 */ 1672 if(val < 1 || val > MAX_WINDOW) 1673 return -EINVAL; 1674 tp->user_mss = val; 1675 return 0; 1676 1677 case TCP_NODELAY: 1678 /* You cannot try to use this and TCP_CORK in 1679 * tandem, so let the user know. 1680 */ 1681 if (sk->nonagle == 2) 1682 return -EINVAL; 1683 sk->nonagle = (val == 0) ? 0 : 1; 1684 return 0; 1685 1686 case TCP_CORK: 1687 /* When set indicates to always queue non-full frames. 1688 * Later the user clears this option and we transmit 1689 * any pending partial frames in the queue. This is 1690 * meant to be used alongside sendfile() to get properly 1691 * filled frames when the user (for example) must write 1692 * out headers with a write() call first and then use 1693 * sendfile to send out the data parts. 1694 * 1695 * You cannot try to use TCP_NODELAY and this mechanism 1696 * at the same time, so let the user know. 
1697 */ 1698 if (sk->nonagle == 1) 1699 return -EINVAL; 1700 if (val != 0) { 1701 sk->nonagle = 2; 1702 } else { 1703 sk->nonagle = 0; 1704 1705 lock_sock(sk); 1706 tcp_push_pending_frames(sk, tp); 1707 release_sock(sk); 1708 } 1709 return 0; 1710 1711 default: 1712 return -ENOPROTOOPT; 1713 }; 1714} 1715 1716int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, 1717 int *optlen) 1718{ 1719 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); 1720 int val, len; 1721 1722 if(level != SOL_TCP) 1723 return tp->af_specific->getsockopt(sk, level, optname, 1724 optval, optlen); 1725 1726 if(get_user(len,optlen)) 1727 return -EFAULT; 1728 1729 len = min(len, sizeof(int)); 1730 1731 switch(optname) { 1732 case TCP_MAXSEG: 1733 val = tp->user_mss; 1734 break; 1735 case TCP_NODELAY: 1736 val = (sk->nonagle == 1); 1737 break; 1738 case TCP_CORK: 1739 val = (sk->nonagle == 2); 1740 break; 1741 default: 1742 return -ENOPROTOOPT; 1743 }; 1744 1745 if(put_user(len, optlen)) 1746 return -EFAULT; 1747 if(copy_to_user(optval, &val,len)) 1748 return -EFAULT; 1749 return 0; 1750} 1751 1752void tcp_set_keepalive(struct sock *sk, int val) 1753{ 1754 if (!sk->keepopen && val) 1755 tcp_inc_slow_timer(TCP_SLT_KEEPALIVE); 1756 else if (sk->keepopen && !val) 1757 tcp_dec_slow_timer(TCP_SLT_KEEPALIVE); 1758} 1759 1760extern void __skb_cb_too_small_for_tcp(int, int); 1761 1762void __init tcp_init(void) 1763{ 1764 struct sk_buff *skb = NULL; 1765 1766 if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)) 1767 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), 1768 sizeof(skb->cb)); 1769 1770 tcp_openreq_cachep = kmem_cache_create("tcp_open_request", 1771 sizeof(struct open_request), 1772 0, SLAB_HWCACHE_ALIGN, 1773 NULL, NULL); 1774 if(!tcp_openreq_cachep) 1775 panic("tcp_init: Cannot alloc open_request cache."); 1776 1777 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", 1778 sizeof(struct tcp_bind_bucket), 1779 0, SLAB_HWCACHE_ALIGN, 1780 NULL, NULL); 1781 
if(!tcp_bucket_cachep) 1782 panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); 1783 1784 tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket", 1785 sizeof(struct tcp_tw_bucket), 1786 0, SLAB_HWCACHE_ALIGN, 1787 NULL, NULL); 1788 if(!tcp_timewait_cachep) 1789 panic("tcp_init: Cannot alloc tcp_tw_bucket cache."); 1790} 1791

