#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/crypto.h>

#include <net/icmp.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/netdma.h>
#include <net/sock.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>

int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;

atomic_t tcp_orphan_count = ATOMIC_INIT(0);

EXPORT_SYMBOL_GPL(tcp_orphan_count);

int sysctl_tcp_mem[3] __read_mostly;
int sysctl_tcp_wmem[3] __read_mostly;
int sysctl_tcp_rmem[3] __read_mostly;

EXPORT_SYMBOL(sysctl_tcp_mem);
EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);

atomic_t tcp_memory_allocated;
atomic_t tcp_sockets_allocated;

EXPORT_SYMBOL(tcp_memory_allocated);
EXPORT_SYMBOL(tcp_sockets_allocated);

301struct tcp_splice_state {
302 struct pipe_inode_info *pipe;
303 size_t len;
304 unsigned int flags;
305};
306
307
308
309
310
311
312
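/*
 * Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 * The __sk_mem_schedule() accounting is of the same nature: the
 * accounting is strict, the actions taken are advisory and have
 * some latency.
 */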
313int tcp_memory_pressure __read_mostly;
314
315EXPORT_SYMBOL(tcp_memory_pressure);
316
317void tcp_enter_memory_pressure(struct sock *sk)
318{
319 if (!tcp_memory_pressure) {
320 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
321 tcp_memory_pressure = 1;
322 }
323}
324
325EXPORT_SYMBOL(tcp_enter_memory_pressure);

/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go looking at any of the socket buffers directly.
 */
334unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
335{
336 unsigned int mask;
337 struct sock *sk = sock->sk;
338 struct tcp_sock *tp = tcp_sk(sk);
339
340 poll_wait(file, sk->sk_sleep, wait);
341 if (sk->sk_state == TCP_LISTEN)
342 return inet_csk_listen_poll(sk);

	/* Socket is not locked. We are protected from async events
	 * by poll logic and correct handling of state changes
	 * made by other threads is impossible in any case.
	 */
349 mask = 0;

	/*
	 * POLLHUP is not done quite right here: poll() has no notion of a
	 * hang-up in only one direction, and for a socket the read side is
	 * the more interesting one.  POLLHUP is therefore reported only
	 * once the connection is gone in both directions (SHUTDOWN_MASK or
	 * TCP_CLOSE below), while a receive-side shutdown alone is reported
	 * as POLLIN | POLLRDNORM | POLLRDHUP.  When in doubt it is safer to
	 * return too many ready flags than too few.
	 */
378 if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
379 mask |= POLLHUP;
380 if (sk->sk_shutdown & RCV_SHUTDOWN)
381 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
382
383
384 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		/* Potential race condition: if the reads of tp below are
		 * reordered above the sk->sk_state check, we can be woken
		 * up spuriously while still in a SYN_* state. */
388 if ((tp->rcv_nxt != tp->copied_seq) &&
389 (tp->urg_seq != tp->copied_seq ||
390 tp->rcv_nxt != tp->copied_seq + 1 ||
391 sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
392 mask |= POLLIN | POLLRDNORM;
393
394 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
395 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
396 mask |= POLLOUT | POLLWRNORM;
397 } else {
398 set_bit(SOCK_ASYNC_NOSPACE,
399 &sk->sk_socket->flags);
400 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * the wspace test but before the flags are
				 * set, the IO signal will be lost.
				 */
406 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
407 mask |= POLLOUT | POLLWRNORM;
408 }
409 }
410
411 if (tp->urg_data & TCP_URG_VALID)
412 mask |= POLLPRI;
413 }
414
415 smp_rmb();
416 if (sk->sk_err)
417 mask |= POLLERR;
418
419 return mask;
420}
421
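/*
 *	Socket ioctl() handler: SIOCINQ reports the amount of unread data
 *	queued on the socket, SIOCATMARK whether the next byte to read is
 *	the urgent mark, and SIOCOUTQ how much data in the write queue has
 *	not yet been acknowledged.
 */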
422int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
423{
424 struct tcp_sock *tp = tcp_sk(sk);
425 int answ;
426
427 switch (cmd) {
428 case SIOCINQ:
429 if (sk->sk_state == TCP_LISTEN)
430 return -EINVAL;
431
432 lock_sock(sk);
433 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
434 answ = 0;
435 else if (sock_flag(sk, SOCK_URGINLINE) ||
436 !tp->urg_data ||
437 before(tp->urg_seq, tp->copied_seq) ||
438 !before(tp->urg_seq, tp->rcv_nxt)) {
439 answ = tp->rcv_nxt - tp->copied_seq;
440
441
442 if (answ && !skb_queue_empty(&sk->sk_receive_queue))
443 answ -=
444 tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin;
445 } else
446 answ = tp->urg_seq - tp->copied_seq;
447 release_sock(sk);
448 break;
449 case SIOCATMARK:
450 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
451 break;
452 case SIOCOUTQ:
453 if (sk->sk_state == TCP_LISTEN)
454 return -EINVAL;
455
456 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
457 answ = 0;
458 else
459 answ = tp->write_seq - tp->snd_una;
460 break;
461 default:
462 return -ENOIOCTLCMD;
463 }
464
465 return put_user(answ, (int __user *)arg);
466}
467
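/*
 *	Write-queue helpers: tcp_mark_push() tags an skb with the PSH flag,
 *	forced_push() reports when more than half a maximum window has been
 *	queued since the last push, and skb_entail() appends a fresh skb to
 *	the write queue and charges it against the socket's send buffer.
 */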
468static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
469{
470 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
471 tp->pushed_seq = tp->write_seq;
472}
473
474static inline int forced_push(struct tcp_sock *tp)
475{
476 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
477}
478
479static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
480{
481 struct tcp_sock *tp = tcp_sk(sk);
482 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
483
484 skb->csum = 0;
485 tcb->seq = tcb->end_seq = tp->write_seq;
486 tcb->flags = TCPCB_FLAG_ACK;
487 tcb->sacked = 0;
488 skb_header_release(skb);
489 tcp_add_write_queue_tail(sk, skb);
490 sk->sk_wmem_queued += skb->truesize;
491 sk_mem_charge(sk, skb->truesize);
492 if (tp->nonagle & TCP_NAGLE_PUSH)
493 tp->nonagle &= ~TCP_NAGLE_PUSH;
494}
495
496static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
497 struct sk_buff *skb)
498{
499 if (flags & MSG_OOB) {
500 tp->urg_mode = 1;
501 tp->snd_up = tp->write_seq;
502 }
503}
504
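/*
 *	Push queued segments out, marking the tail skb with PSH unless the
 *	caller asked for more data to be merged (MSG_MORE) and forced_push()
 *	does not apply.  MSG_OOB updates the urgent pointer first.
 */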
505static inline void tcp_push(struct sock *sk, int flags, int mss_now,
506 int nonagle)
507{
508 struct tcp_sock *tp = tcp_sk(sk);
509
510 if (tcp_send_head(sk)) {
511 struct sk_buff *skb = tcp_write_queue_tail(sk);
512 if (!(flags & MSG_MORE) || forced_push(tp))
513 tcp_mark_push(tp, skb);
514 tcp_mark_urg(tp, flags, skb);
515 __tcp_push_pending_frames(sk, mss_now,
516 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
517 }
518}
519
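/*
 *	read_descriptor actor for tcp_splice_read(): feed socket data into
 *	the destination pipe via skb_splice_bits() and account for how much
 *	of the requested length is still outstanding.
 */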
520static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
521 unsigned int offset, size_t len)
522{
523 struct tcp_splice_state *tss = rd_desc->arg.data;
524 int ret;
525
526 ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
527 tss->flags);
528 if (ret > 0)
529 rd_desc->count -= ret;
530 return ret;
531}
532
533static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
534{
535
536 read_descriptor_t rd_desc = {
537 .arg.data = tss,
538 .count = tss->len,
539 };
540
541 return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
542}

/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 *  @sock:	socket to splice from
 *  @ppos:	position (not valid)
 *  @pipe:	pipe to splice to
 *  @len:	number of bytes to splice
 *  @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given socket and fill them into a pipe.
 *
 **/
556ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
557 struct pipe_inode_info *pipe, size_t len,
558 unsigned int flags)
559{
560 struct sock *sk = sock->sk;
561 struct tcp_splice_state tss = {
562 .pipe = pipe,
563 .len = len,
564 .flags = flags,
565 };
566 long timeo;
567 ssize_t spliced;
568 int ret;
569
570
571
572
573 if (unlikely(*ppos))
574 return -ESPIPE;
575
576 ret = spliced = 0;
577
578 lock_sock(sk);
579
580 timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK);
581 while (tss.len) {
582 ret = __tcp_splice_read(sk, &tss);
583 if (ret < 0)
584 break;
585 else if (!ret) {
586 if (spliced)
587 break;
588 if (sock_flag(sk, SOCK_DONE))
589 break;
590 if (sk->sk_err) {
591 ret = sock_error(sk);
592 break;
593 }
594 if (sk->sk_shutdown & RCV_SHUTDOWN)
595 break;
596 if (sk->sk_state == TCP_CLOSE) {
597
598
599
600
601 if (!sock_flag(sk, SOCK_DONE))
602 ret = -ENOTCONN;
603 break;
604 }
605 if (flags & SPLICE_F_NONBLOCK) {
606 ret = -EAGAIN;
607 break;
608 }
609 if (!timeo) {
610 ret = -EAGAIN;
611 break;
612 }
613 sk_wait_data(sk, &timeo);
614 if (signal_pending(current)) {
615 ret = sock_intr_errno(timeo);
616 break;
617 }
618 continue;
619 }
620 tss.len -= ret;
621 spliced += ret;
622
623 if (!timeo)
624 break;
625 release_sock(sk);
626 lock_sock(sk);
627
628 if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
629 (sk->sk_shutdown & RCV_SHUTDOWN) ||
630 signal_pending(current))
631 break;
632 }
633
634 release_sock(sk);
635
636 if (spliced)
637 return spliced;
638
639 return ret;
640}
641
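/*
 *	Allocate a new skb for the write queue, with room for the protocol
 *	headers and @size bytes of linear payload, provided the socket's
 *	send-buffer accounting allows it.  On failure the memory-pressure
 *	flag is raised and the send buffer is moderated instead.
 */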
642struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
643{
644 struct sk_buff *skb;

	/* The TCP header must be at least 32-bit aligned.  */
647 size = ALIGN(size, 4);
648
649 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
650 if (skb) {
651 if (sk_wmem_schedule(sk, skb->truesize)) {
			/*
			 * Make sure that we have exactly size bytes
			 * available to the caller, no more, no less.
			 */
656 skb_reserve(skb, skb_tailroom(skb) - size);
657 return skb;
658 }
659 __kfree_skb(skb);
660 } else {
661 sk->sk_prot->enter_memory_pressure(sk);
662 sk_stream_moderate_sndbuf(sk);
663 }
664 return NULL;
665}
666
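/*
 *	Zero-copy transmit path used by tcp_sendpage(): attach the caller's
 *	pages to write-queue skbs as paged fragments (coalescing with the
 *	last fragment when possible) instead of copying the data, then push
 *	the queue according to the usual Nagle/cork rules.
 */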
667static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
668 size_t psize, int flags)
669{
670 struct tcp_sock *tp = tcp_sk(sk);
671 int mss_now, size_goal;
672 int err;
673 ssize_t copied;
674 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
675
676
677 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
678 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
679 goto out_err;
680
681 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
682
683 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
684 size_goal = tp->xmit_size_goal;
685 copied = 0;
686
687 err = -EPIPE;
688 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
689 goto do_error;
690
691 while (psize > 0) {
692 struct sk_buff *skb = tcp_write_queue_tail(sk);
693 struct page *page = pages[poffset / PAGE_SIZE];
694 int copy, i, can_coalesce;
695 int offset = poffset % PAGE_SIZE;
696 int size = min_t(size_t, psize, PAGE_SIZE - offset);
697
698 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
699new_segment:
700 if (!sk_stream_memory_free(sk))
701 goto wait_for_sndbuf;
702
703 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
704 if (!skb)
705 goto wait_for_memory;
706
707 skb_entail(sk, skb);
708 copy = size_goal;
709 }
710
711 if (copy > size)
712 copy = size;
713
714 i = skb_shinfo(skb)->nr_frags;
715 can_coalesce = skb_can_coalesce(skb, i, page, offset);
716 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
717 tcp_mark_push(tp, skb);
718 goto new_segment;
719 }
720 if (!sk_wmem_schedule(sk, copy))
721 goto wait_for_memory;
722
723 if (can_coalesce) {
724 skb_shinfo(skb)->frags[i - 1].size += copy;
725 } else {
726 get_page(page);
727 skb_fill_page_desc(skb, i, page, offset, copy);
728 }
729
730 skb->len += copy;
731 skb->data_len += copy;
732 skb->truesize += copy;
733 sk->sk_wmem_queued += copy;
734 sk_mem_charge(sk, copy);
735 skb->ip_summed = CHECKSUM_PARTIAL;
736 tp->write_seq += copy;
737 TCP_SKB_CB(skb)->end_seq += copy;
738 skb_shinfo(skb)->gso_segs = 0;
739
740 if (!copied)
741 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
742
743 copied += copy;
744 poffset += copy;
745 if (!(psize -= copy))
746 goto out;
747
748 if (skb->len < size_goal || (flags & MSG_OOB))
749 continue;
750
751 if (forced_push(tp)) {
752 tcp_mark_push(tp, skb);
753 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
754 } else if (skb == tcp_send_head(sk))
755 tcp_push_one(sk, mss_now);
756 continue;
757
758wait_for_sndbuf:
759 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
760wait_for_memory:
761 if (copied)
762 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
763
764 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
765 goto do_error;
766
767 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
768 size_goal = tp->xmit_size_goal;
769 }
770
771out:
772 if (copied)
773 tcp_push(sk, flags, mss_now, tp->nonagle);
774 return copied;
775
776do_error:
777 if (copied)
778 goto out;
779out_err:
780 return sk_stream_error(sk, flags, err);
781}
782
783ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
784 size_t size, int flags)
785{
786 ssize_t res;
787 struct sock *sk = sock->sk;
788
789 if (!(sk->sk_route_caps & NETIF_F_SG) ||
790 !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
791 return sock_no_sendpage(sock, page, offset, size, flags);
792
793 lock_sock(sk);
794 TCP_CHECK_TIMER(sk);
795 res = do_tcp_sendpages(sk, &page, offset, size, flags);
796 TCP_CHECK_TIMER(sk);
797 release_sock(sk);
798 return res;
799}
800
801#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
802#define TCP_OFF(sk) (sk->sk_sndmsg_off)
803
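/*
 *	Decide how much linear (head) space to ask for when allocating a new
 *	skb in tcp_sendmsg(): 0 for GSO-capable routes, where the payload
 *	goes into page fragments, otherwise one MSS, clamped to what fits in
 *	the skb head when scatter-gather is available.
 */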
804static inline int select_size(struct sock *sk)
805{
806 struct tcp_sock *tp = tcp_sk(sk);
807 int tmp = tp->mss_cache;
808
809 if (sk->sk_route_caps & NETIF_F_SG) {
810 if (sk_can_gso(sk))
811 tmp = 0;
812 else {
813 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
814
815 if (tmp >= pgbreak &&
816 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
817 tmp = pgbreak;
818 }
819 }
820
821 return tmp;
822}
823
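/*
 *	Copy data from the user's iovec into write-queue skbs and transmit.
 *	Data goes into the skb's linear area while there is tailroom, then
 *	into per-socket pages attached as fragments; segments are pushed out
 *	based on the current MSS, the xmit size goal and the Nagle/cork
 *	settings.
 */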
824int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
825 size_t size)
826{
827 struct sock *sk = sock->sk;
828 struct iovec *iov;
829 struct tcp_sock *tp = tcp_sk(sk);
830 struct sk_buff *skb;
831 int iovlen, flags;
832 int mss_now, size_goal;
833 int err, copied;
834 long timeo;
835
836 lock_sock(sk);
837 TCP_CHECK_TIMER(sk);
838
839 flags = msg->msg_flags;
840 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
841
842
843 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
844 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
845 goto out_err;
846
847
848 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
849
850 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
851 size_goal = tp->xmit_size_goal;
852
853
854 iovlen = msg->msg_iovlen;
855 iov = msg->msg_iov;
856 copied = 0;
857
858 err = -EPIPE;
859 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
860 goto do_error;
861
862 while (--iovlen >= 0) {
863 int seglen = iov->iov_len;
864 unsigned char __user *from = iov->iov_base;
865
866 iov++;
867
868 while (seglen > 0) {
869 int copy;
870
871 skb = tcp_write_queue_tail(sk);
872
873 if (!tcp_send_head(sk) ||
874 (copy = size_goal - skb->len) <= 0) {
875
876new_segment:
877
878
879
880 if (!sk_stream_memory_free(sk))
881 goto wait_for_sndbuf;
882
883 skb = sk_stream_alloc_skb(sk, select_size(sk),
884 sk->sk_allocation);
885 if (!skb)
886 goto wait_for_memory;
887
888
889
890
891 if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
892 skb->ip_summed = CHECKSUM_PARTIAL;
893
894 skb_entail(sk, skb);
895 copy = size_goal;
896 }
897
898
899 if (copy > seglen)
900 copy = seglen;
901
902
903 if (skb_tailroom(skb) > 0) {
904
905 if (copy > skb_tailroom(skb))
906 copy = skb_tailroom(skb);
907 if ((err = skb_add_data(skb, from, copy)) != 0)
908 goto do_fault;
909 } else {
910 int merge = 0;
911 int i = skb_shinfo(skb)->nr_frags;
912 struct page *page = TCP_PAGE(sk);
913 int off = TCP_OFF(sk);
914
915 if (skb_can_coalesce(skb, i, page, off) &&
916 off != PAGE_SIZE) {
917
918
919 merge = 1;
920 } else if (i == MAX_SKB_FRAGS ||
921 (!i &&
922 !(sk->sk_route_caps & NETIF_F_SG))) {
923
924
925
926
927 tcp_mark_push(tp, skb);
928 goto new_segment;
929 } else if (page) {
930 if (off == PAGE_SIZE) {
931 put_page(page);
932 TCP_PAGE(sk) = page = NULL;
933 off = 0;
934 }
935 } else
936 off = 0;
937
938 if (copy > PAGE_SIZE - off)
939 copy = PAGE_SIZE - off;
940
941 if (!sk_wmem_schedule(sk, copy))
942 goto wait_for_memory;
943
944 if (!page) {
945
946 if (!(page = sk_stream_alloc_page(sk)))
947 goto wait_for_memory;
948 }
949
950
951
952 err = skb_copy_to_page(sk, from, skb, page,
953 off, copy);
954 if (err) {
955
956
957
958 if (!TCP_PAGE(sk)) {
959 TCP_PAGE(sk) = page;
960 TCP_OFF(sk) = 0;
961 }
962 goto do_error;
963 }
964
965
966 if (merge) {
967 skb_shinfo(skb)->frags[i - 1].size +=
968 copy;
969 } else {
970 skb_fill_page_desc(skb, i, page, off, copy);
971 if (TCP_PAGE(sk)) {
972 get_page(page);
973 } else if (off + copy < PAGE_SIZE) {
974 get_page(page);
975 TCP_PAGE(sk) = page;
976 }
977 }
978
979 TCP_OFF(sk) = off + copy;
980 }
981
982 if (!copied)
983 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
984
985 tp->write_seq += copy;
986 TCP_SKB_CB(skb)->end_seq += copy;
987 skb_shinfo(skb)->gso_segs = 0;
988
989 from += copy;
990 copied += copy;
991 if ((seglen -= copy) == 0 && iovlen == 0)
992 goto out;
993
994 if (skb->len < size_goal || (flags & MSG_OOB))
995 continue;
996
997 if (forced_push(tp)) {
998 tcp_mark_push(tp, skb);
999 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1000 } else if (skb == tcp_send_head(sk))
1001 tcp_push_one(sk, mss_now);
1002 continue;
1003
1004wait_for_sndbuf:
1005 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1006wait_for_memory:
1007 if (copied)
1008 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1009
1010 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
1011 goto do_error;
1012
1013 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1014 size_goal = tp->xmit_size_goal;
1015 }
1016 }
1017
1018out:
1019 if (copied)
1020 tcp_push(sk, flags, mss_now, tp->nonagle);
1021 TCP_CHECK_TIMER(sk);
1022 release_sock(sk);
1023 return copied;
1024
1025do_fault:
1026 if (!skb->len) {
1027 tcp_unlink_write_queue(skb, sk);
1028
1029
1030
1031 tcp_check_send_head(sk, skb);
1032 sk_wmem_free_skb(sk, skb);
1033 }
1034
1035do_error:
1036 if (copied)
1037 goto out;
1038out_err:
1039 err = sk_stream_error(sk, flags, err);
1040 TCP_CHECK_TIMER(sk);
1041 release_sock(sk);
1042 return err;
1043}

/*
 *	Handle reading urgent data.  This is the MSG_OOB receive path for
 *	sockets that do not have SO_OOBINLINE set: at most one byte of
 *	urgent data is held in tp->urg_data and handed out here.
 */
1050static int tcp_recv_urg(struct sock *sk, long timeo,
1051 struct msghdr *msg, int len, int flags,
1052 int *addr_len)
1053{
1054 struct tcp_sock *tp = tcp_sk(sk);
1055
1056
1057 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1058 tp->urg_data == TCP_URG_READ)
1059 return -EINVAL;
1060
1061 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1062 return -ENOTCONN;
1063
1064 if (tp->urg_data & TCP_URG_VALID) {
1065 int err = 0;
1066 char c = tp->urg_data;
1067
1068 if (!(flags & MSG_PEEK))
1069 tp->urg_data = TCP_URG_READ;
1070
1071
1072 msg->msg_flags |= MSG_OOB;
1073
1074 if (len > 0) {
1075 if (!(flags & MSG_TRUNC))
1076 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1077 len = 1;
1078 } else
1079 msg->msg_flags |= MSG_TRUNC;
1080
1081 return err ? -EFAULT : len;
1082 }
1083
1084 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1085 return 0;

	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 */
1093 return -EAGAIN;
1094}

/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */
1102void tcp_cleanup_rbuf(struct sock *sk, int copied)
1103{
1104 struct tcp_sock *tp = tcp_sk(sk);
1105 int time_to_ack = 0;
1106
1107#if TCP_DEBUG
1108 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1109
1110 WARN_ON(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1111#endif
1112
1113 if (inet_csk_ack_scheduled(sk)) {
1114 const struct inet_connection_sock *icsk = inet_csk(sk);
1115
1116
1117 if (icsk->icsk_ack.blocked ||
1118
1119 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1120
1121
1122
1123
1124
1125
1126 (copied > 0 &&
1127 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1128 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1129 !icsk->icsk_ack.pingpong)) &&
1130 !atomic_read(&sk->sk_rmem_alloc)))
1131 time_to_ack = 1;
1132 }
1133
1134
1135
1136
1137
1138
1139
1140 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1141 __u32 rcv_window_now = tcp_receive_window(tp);
1142
1143
1144 if (2*rcv_window_now <= tp->window_clamp) {
1145 __u32 new_window = __tcp_select_window(sk);
1146
1147
1148
1149
1150
1151
1152 if (new_window && new_window >= 2 * rcv_window_now)
1153 time_to_ack = 1;
1154 }
1155 }
1156 if (time_to_ack)
1157 tcp_send_ack(sk);
1158}
1159
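/*
 *	Run the backlog receive handler over every skb that was parked on
 *	the prequeue while a process was inside tcp_recvmsg().
 */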
1160static void tcp_prequeue_process(struct sock *sk)
1161{
1162 struct sk_buff *skb;
1163 struct tcp_sock *tp = tcp_sk(sk);
1164
1165 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
1166
1167
1168
1169 local_bh_disable();
1170 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1171 sk->sk_backlog_rcv(sk, skb);
1172 local_bh_enable();
1173
1174
1175 tp->ucopy.memory = 0;
1176}
1177
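/*
 *	Find the skb in the receive queue that contains sequence number @seq
 *	and return the offset of @seq within it; a SYN occupying one sequence
 *	number is accounted for, and a pure FIN skb is returned as well.
 */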
1178static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1179{
1180 struct sk_buff *skb;
1181 u32 offset;
1182
1183 skb_queue_walk(&sk->sk_receive_queue, skb) {
1184 offset = seq - TCP_SKB_CB(skb)->seq;
1185 if (tcp_hdr(skb)->syn)
1186 offset--;
1187 if (offset < skb->len || tcp_hdr(skb)->fin) {
1188 *off = offset;
1189 return skb;
1190 }
1191 }
1192 return NULL;
1193}

/*
 *	This routine provides an alternative to tcp_recvmsg() for routines
 *	that would like to handle copying from skbuffs directly in 'sendfile'
 *	fashion.
 *	Note:
 *	 - It is assumed that the socket was locked by the caller.
 *	 - The routine does not block.
 *	 - At present, there is no support for reading OOB data
 *	   or for 'peeking' the socket using this routine
 *	   (although both would be easy to implement).
 */
1206int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1207 sk_read_actor_t recv_actor)
1208{
1209 struct sk_buff *skb;
1210 struct tcp_sock *tp = tcp_sk(sk);
1211 u32 seq = tp->copied_seq;
1212 u32 offset;
1213 int copied = 0;
1214
1215 if (sk->sk_state == TCP_LISTEN)
1216 return -ENOTCONN;
1217 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1218 if (offset < skb->len) {
1219 int used;
1220 size_t len;
1221
1222 len = skb->len - offset;
1223
1224 if (tp->urg_data) {
1225 u32 urg_offset = tp->urg_seq - seq;
1226 if (urg_offset < len)
1227 len = urg_offset;
1228 if (!len)
1229 break;
1230 }
1231 used = recv_actor(desc, skb, offset, len);
1232 if (used < 0) {
1233 if (!copied)
1234 copied = used;
1235 break;
1236 } else if (used <= len) {
1237 seq += used;
1238 copied += used;
1239 offset += used;
1240 }
			/*
			 * If recv_actor drops the lock (e.g. TCP splice
			 * receive) the skb pointer might be invalid when
			 * getting here: tcp_collapse might have deleted it
			 * while aggregating skbs from the socket queue.
			 */
1247 skb = tcp_recv_skb(sk, seq-1, &offset);
1248 if (!skb || (offset+1 != skb->len))
1249 break;
1250 }
1251 if (tcp_hdr(skb)->fin) {
1252 sk_eat_skb(sk, skb, 0);
1253 ++seq;
1254 break;
1255 }
1256 sk_eat_skb(sk, skb, 0);
1257 if (!desc->count)
1258 break;
1259 tp->copied_seq = seq;
1260 }
1261 tp->copied_seq = seq;
1262
1263 tcp_rcv_space_adjust(sk);
1264
1265
1266 if (copied > 0)
1267 tcp_cleanup_rbuf(sk, copied);
1268 return copied;
1269}

/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Technical note: in 2.3 we work on _locked_ socket, so that
 *	tricks with *seq access order and skb->users are not required.
 *	Probably, code can be easily improved even more.
 */
1279int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1280 size_t len, int nonblock, int flags, int *addr_len)
1281{
1282 struct tcp_sock *tp = tcp_sk(sk);
1283 int copied = 0;
1284 u32 peek_seq;
1285 u32 *seq;
1286 unsigned long used;
1287 int err;
1288 int target;
1289 long timeo;
1290 struct task_struct *user_recv = NULL;
1291 int copied_early = 0;
1292 struct sk_buff *skb;
1293
1294 lock_sock(sk);
1295
1296 TCP_CHECK_TIMER(sk);
1297
1298 err = -ENOTCONN;
1299 if (sk->sk_state == TCP_LISTEN)
1300 goto out;
1301
1302 timeo = sock_rcvtimeo(sk, nonblock);
1303
1304
1305 if (flags & MSG_OOB)
1306 goto recv_urg;
1307
1308 seq = &tp->copied_seq;
1309 if (flags & MSG_PEEK) {
1310 peek_seq = tp->copied_seq;
1311 seq = &peek_seq;
1312 }
1313
1314 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1315
1316#ifdef CONFIG_NET_DMA
1317 tp->ucopy.dma_chan = NULL;
1318 preempt_disable();
1319 skb = skb_peek_tail(&sk->sk_receive_queue);
1320 {
1321 int available = 0;
1322
1323 if (skb)
1324 available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
1325 if ((available < target) &&
1326 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1327 !sysctl_tcp_low_latency &&
1328 __get_cpu_var(softnet_data).net_dma) {
1329 preempt_enable_no_resched();
1330 tp->ucopy.pinned_list =
1331 dma_pin_iovec_pages(msg->msg_iov, len);
1332 } else {
1333 preempt_enable_no_resched();
1334 }
1335 }
1336#endif
1337
1338 do {
1339 u32 offset;
1340
1341
1342 if (tp->urg_data && tp->urg_seq == *seq) {
1343 if (copied)
1344 break;
1345 if (signal_pending(current)) {
1346 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1347 break;
1348 }
1349 }
1350
1351
1352
1353 skb = skb_peek(&sk->sk_receive_queue);
1354 do {
1355 if (!skb)
1356 break;
1357
1358
1359
1360
1361 if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1362 printk(KERN_INFO "recvmsg bug: copied %X "
1363 "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1364 break;
1365 }
1366 offset = *seq - TCP_SKB_CB(skb)->seq;
1367 if (tcp_hdr(skb)->syn)
1368 offset--;
1369 if (offset < skb->len)
1370 goto found_ok_skb;
1371 if (tcp_hdr(skb)->fin)
1372 goto found_fin_ok;
1373 WARN_ON(!(flags & MSG_PEEK));
1374 skb = skb->next;
1375 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1376
1377
1378
1379 if (copied >= target && !sk->sk_backlog.tail)
1380 break;
1381
1382 if (copied) {
1383 if (sk->sk_err ||
1384 sk->sk_state == TCP_CLOSE ||
1385 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1386 !timeo ||
1387 signal_pending(current) ||
1388 (flags & MSG_PEEK))
1389 break;
1390 } else {
1391 if (sock_flag(sk, SOCK_DONE))
1392 break;
1393
1394 if (sk->sk_err) {
1395 copied = sock_error(sk);
1396 break;
1397 }
1398
1399 if (sk->sk_shutdown & RCV_SHUTDOWN)
1400 break;
1401
1402 if (sk->sk_state == TCP_CLOSE) {
1403 if (!sock_flag(sk, SOCK_DONE)) {
1404
1405
1406
1407 copied = -ENOTCONN;
1408 break;
1409 }
1410 break;
1411 }
1412
1413 if (!timeo) {
1414 copied = -EAGAIN;
1415 break;
1416 }
1417
1418 if (signal_pending(current)) {
1419 copied = sock_intr_errno(timeo);
1420 break;
1421 }
1422 }
1423
1424 tcp_cleanup_rbuf(sk, copied);
1425
1426 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1427
1428 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1429 user_recv = current;
1430 tp->ucopy.task = user_recv;
1431 tp->ucopy.iov = msg->msg_iov;
1432 }
1433
1434 tp->ucopy.len = len;
1435
1436 WARN_ON(tp->copied_seq != tp->rcv_nxt &&
1437 !(flags & (MSG_PEEK | MSG_TRUNC)));

			/* Ugly... If the prequeue is not empty, we have to
			 * process it before releasing the socket, otherwise
			 * data ordering will be broken on the next iteration.
			 *
			 * Conceptually there are four (pseudo)queues:
			 *
			 * 1. packets in flight
			 * 2. backlog
			 * 3. prequeue
			 * 4. receive_queue
			 *
			 * Each queue can be processed only if the next ones
			 * are empty.  At this point the receive_queue is
			 * empty, so the prequeue must be drained here, while
			 * we still hold the socket, before we sleep in
			 * sk_wait_data() or release the lock below.
			 */
1465 if (!skb_queue_empty(&tp->ucopy.prequeue))
1466 goto do_prequeue;
1467
1468
1469 }
1470
1471 if (copied >= target) {
1472
1473 release_sock(sk);
1474 lock_sock(sk);
1475 } else
1476 sk_wait_data(sk, &timeo);
1477
1478#ifdef CONFIG_NET_DMA
1479 tp->ucopy.wakeup = 0;
1480#endif
1481
1482 if (user_recv) {
1483 int chunk;
1484
1485
1486
1487 if ((chunk = len - tp->ucopy.len) != 0) {
1488 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1489 len -= chunk;
1490 copied += chunk;
1491 }
1492
1493 if (tp->rcv_nxt == tp->copied_seq &&
1494 !skb_queue_empty(&tp->ucopy.prequeue)) {
1495do_prequeue:
1496 tcp_prequeue_process(sk);
1497
1498 if ((chunk = len - tp->ucopy.len) != 0) {
1499 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1500 len -= chunk;
1501 copied += chunk;
1502 }
1503 }
1504 }
1505 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1506 if (net_ratelimit())
1507 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1508 current->comm, task_pid_nr(current));
1509 peek_seq = tp->copied_seq;
1510 }
1511 continue;
1512
1513 found_ok_skb:
1514
1515 used = skb->len - offset;
1516 if (len < used)
1517 used = len;
1518
1519
1520 if (tp->urg_data) {
1521 u32 urg_offset = tp->urg_seq - *seq;
1522 if (urg_offset < used) {
1523 if (!urg_offset) {
1524 if (!sock_flag(sk, SOCK_URGINLINE)) {
1525 ++*seq;
1526 offset++;
1527 used--;
1528 if (!used)
1529 goto skip_copy;
1530 }
1531 } else
1532 used = urg_offset;
1533 }
1534 }
1535
1536 if (!(flags & MSG_TRUNC)) {
1537#ifdef CONFIG_NET_DMA
1538 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1539 tp->ucopy.dma_chan = get_softnet_dma();
1540
1541 if (tp->ucopy.dma_chan) {
1542 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1543 tp->ucopy.dma_chan, skb, offset,
1544 msg->msg_iov, used,
1545 tp->ucopy.pinned_list);
1546
1547 if (tp->ucopy.dma_cookie < 0) {
1548
1549 printk(KERN_ALERT "dma_cookie < 0\n");
1550
1551
1552 if (!copied)
1553 copied = -EFAULT;
1554 break;
1555 }
1556 if ((offset + used) == skb->len)
1557 copied_early = 1;
1558
1559 } else
1560#endif
1561 {
1562 err = skb_copy_datagram_iovec(skb, offset,
1563 msg->msg_iov, used);
1564 if (err) {
1565
1566 if (!copied)
1567 copied = -EFAULT;
1568 break;
1569 }
1570 }
1571 }
1572
1573 *seq += used;
1574 copied += used;
1575 len -= used;
1576
1577 tcp_rcv_space_adjust(sk);
1578
1579skip_copy:
1580 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1581 tp->urg_data = 0;
1582 tcp_fast_path_check(sk);
1583 }
1584 if (used + offset < skb->len)
1585 continue;
1586
1587 if (tcp_hdr(skb)->fin)
1588 goto found_fin_ok;
1589 if (!(flags & MSG_PEEK)) {
1590 sk_eat_skb(sk, skb, copied_early);
1591 copied_early = 0;
1592 }
1593 continue;
1594
1595 found_fin_ok:
1596
1597 ++*seq;
1598 if (!(flags & MSG_PEEK)) {
1599 sk_eat_skb(sk, skb, copied_early);
1600 copied_early = 0;
1601 }
1602 break;
1603 } while (len > 0);
1604
1605 if (user_recv) {
1606 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1607 int chunk;
1608
1609 tp->ucopy.len = copied > 0 ? len : 0;
1610
1611 tcp_prequeue_process(sk);
1612
1613 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1614 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1615 len -= chunk;
1616 copied += chunk;
1617 }
1618 }
1619
1620 tp->ucopy.task = NULL;
1621 tp->ucopy.len = 0;
1622 }
1623
1624#ifdef CONFIG_NET_DMA
1625 if (tp->ucopy.dma_chan) {
1626 dma_cookie_t done, used;
1627
1628 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1629
1630 while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1631 tp->ucopy.dma_cookie, &done,
1632 &used) == DMA_IN_PROGRESS) {
1633
1634 while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1635 (dma_async_is_complete(skb->dma_cookie, done,
1636 used) == DMA_SUCCESS)) {
1637 __skb_dequeue(&sk->sk_async_wait_queue);
1638 kfree_skb(skb);
1639 }
1640 }
1641
1642
1643 __skb_queue_purge(&sk->sk_async_wait_queue);
1644 dma_chan_put(tp->ucopy.dma_chan);
1645 tp->ucopy.dma_chan = NULL;
1646 }
1647 if (tp->ucopy.pinned_list) {
1648 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1649 tp->ucopy.pinned_list = NULL;
1650 }
1651#endif

	/* According to UNIX98, msg_name/msg_namelen are ignored
	 * on connected sockets.
	 */

	/* Clean up data we have read: This will do ACK frames. */
1658 tcp_cleanup_rbuf(sk, copied);
1659
1660 TCP_CHECK_TIMER(sk);
1661 release_sock(sk);
1662 return copied;
1663
1664out:
1665 TCP_CHECK_TIMER(sk);
1666 release_sock(sk);
1667 return err;
1668
1669recv_urg:
1670 err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1671 goto out;
1672}
1673
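/*
 *	Move the socket to a new TCP state, keeping the established-connection
 *	SNMP counters up to date and unhashing the socket (and releasing its
 *	bound port) when it goes to TCP_CLOSE.
 */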
1674void tcp_set_state(struct sock *sk, int state)
1675{
1676 int oldstate = sk->sk_state;
1677
1678 switch (state) {
1679 case TCP_ESTABLISHED:
1680 if (oldstate != TCP_ESTABLISHED)
1681 TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1682 break;
1683
1684 case TCP_CLOSE:
1685 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
1686 TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
1687
1688 sk->sk_prot->unhash(sk);
1689 if (inet_csk(sk)->icsk_bind_hash &&
1690 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
1691 inet_put_port(sk);
1692
1693 default:
1694 if (oldstate==TCP_ESTABLISHED)
1695 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1696 }

	/* Change state AFTER socket is unhashed to avoid closed
	 * socket sitting in hash tables.
	 */
1701 sk->sk_state = state;
1702
1703#ifdef STATE_TRACE
1704 SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n",sk, statename[oldstate],statename[state]);
1705#endif
1706}
1707EXPORT_SYMBOL_GPL(tcp_set_state);

/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states; a shutdown() may have already sent the FIN, or we may be
 *	closed already.  new_state[] maps the current TCP state to the
 *	next state on close(); TCP_ACTION_FIN is ORed in when a FIN still
 *	has to be sent.
 */
static const unsigned char new_state[16] = {
  /* (Invalid)		*/ TCP_CLOSE,
  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_SYN_SENT	*/ TCP_CLOSE,
  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
  /* TCP_CLOSE		*/ TCP_CLOSE,
  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK | TCP_ACTION_FIN,
  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
  /* TCP_LISTEN		*/ TCP_CLOSE,
  /* TCP_CLOSING	*/ TCP_CLOSING,
};
1731
1732static int tcp_close_state(struct sock *sk)
1733{
1734 int next = (int)new_state[sk->sk_state];
1735 int ns = next & TCP_STATE_MASK;
1736
1737 tcp_set_state(sk, ns);
1738
1739 return next & TCP_ACTION_FIN;
1740}

/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
 */
1747void tcp_shutdown(struct sock *sk, int how)
1748{
1749
1750
1751
1752
1753 if (!(how & SEND_SHUTDOWN))
1754 return;
1755
1756
1757 if ((1 << sk->sk_state) &
1758 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1759 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1760
1761 if (tcp_close_state(sk))
1762 tcp_send_fin(sk);
1763 }
1764}
1765
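/*
 *	Close a socket: flush unread data (sending a RST if there was any),
 *	send a FIN when the state machine calls for one, and then either
 *	destroy the socket immediately or leave the orphan to finish the
 *	FIN_WAIT/TIME_WAIT handshake on its own.
 */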
1766void tcp_close(struct sock *sk, long timeout)
1767{
1768 struct sk_buff *skb;
1769 int data_was_unread = 0;
1770 int state;
1771
1772 lock_sock(sk);
1773 sk->sk_shutdown = SHUTDOWN_MASK;
1774
1775 if (sk->sk_state == TCP_LISTEN) {
1776 tcp_set_state(sk, TCP_CLOSE);
1777
1778
1779 inet_csk_listen_stop(sk);
1780
1781 goto adjudge_to_death;
1782 }

	/*  We need to flush the recv. buffs.  We do this only on the
	 *  descriptor close, not protocol-sourced closes, because the
	 *  reader process may not have drained the data yet!
	 */
1788 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1789 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1790 tcp_hdr(skb)->fin;
1791 data_was_unread += len;
1792 __kfree_skb(skb);
1793 }
1794
1795 sk_mem_reclaim(sk);

	/* As outlined in RFC 2525, section 2.17, we send a RST here because
	 * data was lost: closing with unread data still queued means that
	 * data would otherwise be thrown away silently, so the peer must be
	 * told with a reset rather than an orderly FIN.
	 */
1804 if (data_was_unread) {
1805
1806 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
1807 tcp_set_state(sk, TCP_CLOSE);
1808 tcp_send_active_reset(sk, GFP_KERNEL);
1809 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1810
1811 sk->sk_prot->disconnect(sk, 0);
1812 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
1813 } else if (tcp_close_state(sk)) {
		/* The state transition asked us to send a FIN
		 * (tcp_close_state() returned TCP_ACTION_FIN).  The FIN is
		 * queued at the tail of the write queue, so it goes out only
		 * after all previously queued data, and the normal
		 * retransmission machinery takes care of delivering it.
		 */
1839 tcp_send_fin(sk);
1840 }
1841
1842 sk_stream_wait_close(sk, timeout);
1843
1844adjudge_to_death:
1845 state = sk->sk_state;
1846 sock_hold(sk);
1847 sock_orphan(sk);
1848 atomic_inc(sk->sk_prot->orphan_count);
1849
1850
1851 release_sock(sk);
1852
1853
1854
1855
1856
1857 local_bh_disable();
1858 bh_lock_sock(sk);
1859 WARN_ON(sock_owned_by_user(sk));
1860
1861
1862 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
1863 goto out;

	/* This is a (useful) violation of the TCP specification, shared
	 * with BSD: the spec allows the other end to keep a half-closed
	 * connection open forever even though no application is left on
	 * our side.  For an orphaned socket in FIN_WAIT2 we therefore
	 * either hand it to the timewait machinery after tcp_fin_time()
	 * or, if linger2 is negative, reset the connection right away.
	 */
1879 if (sk->sk_state == TCP_FIN_WAIT2) {
1880 struct tcp_sock *tp = tcp_sk(sk);
1881 if (tp->linger2 < 0) {
1882 tcp_set_state(sk, TCP_CLOSE);
1883 tcp_send_active_reset(sk, GFP_ATOMIC);
1884 NET_INC_STATS_BH(sock_net(sk),
1885 LINUX_MIB_TCPABORTONLINGER);
1886 } else {
1887 const int tmo = tcp_fin_time(sk);
1888
1889 if (tmo > TCP_TIMEWAIT_LEN) {
1890 inet_csk_reset_keepalive_timer(sk,
1891 tmo - TCP_TIMEWAIT_LEN);
1892 } else {
1893 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1894 goto out;
1895 }
1896 }
1897 }
1898 if (sk->sk_state != TCP_CLOSE) {
1899 sk_mem_reclaim(sk);
1900 if (tcp_too_many_orphans(sk,
1901 atomic_read(sk->sk_prot->orphan_count))) {
1902 if (net_ratelimit())
				printk(KERN_INFO "TCP: too many orphaned "
				       "sockets\n");
1905 tcp_set_state(sk, TCP_CLOSE);
1906 tcp_send_active_reset(sk, GFP_ATOMIC);
1907 NET_INC_STATS_BH(sock_net(sk),
1908 LINUX_MIB_TCPABORTONMEMORY);
1909 }
1910 }
1911
1912 if (sk->sk_state == TCP_CLOSE)
1913 inet_csk_destroy_sock(sk);
1914
1915
1916out:
1917 bh_unlock_sock(sk);
1918 local_bh_enable();
1919 sock_put(sk);
1920}

/* These states need RST on ABORT according to RFC793 */

1924static inline int tcp_need_reset(int state)
1925{
1926 return (1 << state) &
1927 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1928 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1929}
1930
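/*
 *	Abort the connection (sending a RST when the state requires it) and
 *	return the socket to a clean, unconnected state so that it can be
 *	reused, e.g. by a new connect() attempt.
 */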
1931int tcp_disconnect(struct sock *sk, int flags)
1932{
1933 struct inet_sock *inet = inet_sk(sk);
1934 struct inet_connection_sock *icsk = inet_csk(sk);
1935 struct tcp_sock *tp = tcp_sk(sk);
1936 int err = 0;
1937 int old_state = sk->sk_state;
1938
1939 if (old_state != TCP_CLOSE)
1940 tcp_set_state(sk, TCP_CLOSE);
1941
1942
1943 if (old_state == TCP_LISTEN) {
1944 inet_csk_listen_stop(sk);
1945 } else if (tcp_need_reset(old_state) ||
1946 (tp->snd_nxt != tp->write_seq &&
1947 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1948
1949
1950
1951 tcp_send_active_reset(sk, gfp_any());
1952 sk->sk_err = ECONNRESET;
1953 } else if (old_state == TCP_SYN_SENT)
1954 sk->sk_err = ECONNRESET;
1955
1956 tcp_clear_xmit_timers(sk);
1957 __skb_queue_purge(&sk->sk_receive_queue);
1958 tcp_write_queue_purge(sk);
1959 __skb_queue_purge(&tp->out_of_order_queue);
1960#ifdef CONFIG_NET_DMA
1961 __skb_queue_purge(&sk->sk_async_wait_queue);
1962#endif
1963
1964 inet->dport = 0;
1965
1966 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1967 inet_reset_saddr(sk);
1968
1969 sk->sk_shutdown = 0;
1970 sock_reset_flag(sk, SOCK_DONE);
1971 tp->srtt = 0;
1972 if ((tp->write_seq += tp->max_window + 2) == 0)
1973 tp->write_seq = 1;
1974 icsk->icsk_backoff = 0;
1975 tp->snd_cwnd = 2;
1976 icsk->icsk_probes_out = 0;
1977 tp->packets_out = 0;
1978 tp->snd_ssthresh = 0x7fffffff;
1979 tp->snd_cwnd_cnt = 0;
1980 tp->bytes_acked = 0;
1981 tcp_set_ca_state(sk, TCP_CA_Open);
1982 tcp_clear_retrans(tp);
1983 inet_csk_delack_init(sk);
1984 tcp_init_send_head(sk);
1985 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
1986 __sk_dst_reset(sk);
1987
1988 WARN_ON(inet->num && !icsk->icsk_bind_hash);
1989
1990 sk->sk_error_report(sk);
1991 return err;
1992}

/*
 *	Socket option code for TCP.
 */
1997static int do_tcp_setsockopt(struct sock *sk, int level,
1998 int optname, char __user *optval, int optlen)
1999{
2000 struct tcp_sock *tp = tcp_sk(sk);
2001 struct inet_connection_sock *icsk = inet_csk(sk);
2002 int val;
2003 int err = 0;
2004
2005
2006 if (optname == TCP_CONGESTION) {
2007 char name[TCP_CA_NAME_MAX];
2008
2009 if (optlen < 1)
2010 return -EINVAL;
2011
2012 val = strncpy_from_user(name, optval,
2013 min(TCP_CA_NAME_MAX-1, optlen));
2014 if (val < 0)
2015 return -EFAULT;
2016 name[val] = 0;
2017
2018 lock_sock(sk);
2019 err = tcp_set_congestion_control(sk, name);
2020 release_sock(sk);
2021 return err;
2022 }
2023
2024 if (optlen < sizeof(int))
2025 return -EINVAL;
2026
2027 if (get_user(val, (int __user *)optval))
2028 return -EFAULT;
2029
2030 lock_sock(sk);
2031
2032 switch (optname) {
2033 case TCP_MAXSEG:
2034
2035
2036
2037 if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
2038 err = -EINVAL;
2039 break;
2040 }
2041 tp->rx_opt.user_mss = val;
2042 break;
2043
2044 case TCP_NODELAY:
2045 if (val) {
			/* TCP_NODELAY is weaker than TCP_CORK, so that
			 * this option on a corked socket is remembered, but
			 * it is not activated until the cork is cleared.
			 *
			 * However, when TCP_NODELAY is set we make
			 * an explicit push, which overrides even TCP_CORK
			 * for currently queued segments.
			 */
2054 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2055 tcp_push_pending_frames(sk);
2056 } else {
2057 tp->nonagle &= ~TCP_NAGLE_OFF;
2058 }
2059 break;
2060
2061 case TCP_CORK:
		/* When set, indicates to always queue non-full frames.
		 * Later the user clears this option and we transmit
		 * any pending partial frames in the queue.  This is
		 * meant to be used alongside sendfile() to get properly
		 * filled frames when the user (for example) must write
		 * out headers with a write() call first and then use
		 * sendfile to send out the data parts.
		 *
		 * TCP_CORK can be set together with TCP_NODELAY and it is
		 * stronger than TCP_NODELAY.
		 */
2073 if (val) {
2074 tp->nonagle |= TCP_NAGLE_CORK;
2075 } else {
2076 tp->nonagle &= ~TCP_NAGLE_CORK;
2077 if (tp->nonagle&TCP_NAGLE_OFF)
2078 tp->nonagle |= TCP_NAGLE_PUSH;
2079 tcp_push_pending_frames(sk);
2080 }
2081 break;
2082
2083 case TCP_KEEPIDLE:
2084 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2085 err = -EINVAL;
2086 else {
2087 tp->keepalive_time = val * HZ;
2088 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2089 !((1 << sk->sk_state) &
2090 (TCPF_CLOSE | TCPF_LISTEN))) {
2091 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2092 if (tp->keepalive_time > elapsed)
2093 elapsed = tp->keepalive_time - elapsed;
2094 else
2095 elapsed = 0;
2096 inet_csk_reset_keepalive_timer(sk, elapsed);
2097 }
2098 }
2099 break;
2100 case TCP_KEEPINTVL:
2101 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2102 err = -EINVAL;
2103 else
2104 tp->keepalive_intvl = val * HZ;
2105 break;
2106 case TCP_KEEPCNT:
2107 if (val < 1 || val > MAX_TCP_KEEPCNT)
2108 err = -EINVAL;
2109 else
2110 tp->keepalive_probes = val;
2111 break;
2112 case TCP_SYNCNT:
2113 if (val < 1 || val > MAX_TCP_SYNCNT)
2114 err = -EINVAL;
2115 else
2116 icsk->icsk_syn_retries = val;
2117 break;
2118
2119 case TCP_LINGER2:
2120 if (val < 0)
2121 tp->linger2 = -1;
2122 else if (val > sysctl_tcp_fin_timeout / HZ)
2123 tp->linger2 = 0;
2124 else
2125 tp->linger2 = val * HZ;
2126 break;
2127
2128 case TCP_DEFER_ACCEPT:
2129 icsk->icsk_accept_queue.rskq_defer_accept = 0;
2130 if (val > 0) {
			/* Translate value in seconds to number of
			 * retransmits */
2133 while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
2134 val > ((TCP_TIMEOUT_INIT / HZ) <<
2135 icsk->icsk_accept_queue.rskq_defer_accept))
2136 icsk->icsk_accept_queue.rskq_defer_accept++;
2137 icsk->icsk_accept_queue.rskq_defer_accept++;
2138 }
2139 break;
2140
2141 case TCP_WINDOW_CLAMP:
2142 if (!val) {
2143 if (sk->sk_state != TCP_CLOSE) {
2144 err = -EINVAL;
2145 break;
2146 }
2147 tp->window_clamp = 0;
2148 } else
2149 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2150 SOCK_MIN_RCVBUF / 2 : val;
2151 break;
2152
2153 case TCP_QUICKACK:
2154 if (!val) {
2155 icsk->icsk_ack.pingpong = 1;
2156 } else {
2157 icsk->icsk_ack.pingpong = 0;
2158 if ((1 << sk->sk_state) &
2159 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2160 inet_csk_ack_scheduled(sk)) {
2161 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2162 tcp_cleanup_rbuf(sk, 1);
2163 if (!(val & 1))
2164 icsk->icsk_ack.pingpong = 1;
2165 }
2166 }
2167 break;
2168
2169#ifdef CONFIG_TCP_MD5SIG
2170 case TCP_MD5SIG:
2171
2172 err = tp->af_specific->md5_parse(sk, optval, optlen);
2173 break;
2174#endif
2175
2176 default:
2177 err = -ENOPROTOOPT;
2178 break;
2179 }
2180
2181 release_sock(sk);
2182 return err;
2183}
2184
2185int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2186 int optlen)
2187{
2188 struct inet_connection_sock *icsk = inet_csk(sk);
2189
2190 if (level != SOL_TCP)
2191 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
2192 optval, optlen);
2193 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2194}
2195
2196#ifdef CONFIG_COMPAT
2197int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2198 char __user *optval, int optlen)
2199{
2200 if (level != SOL_TCP)
2201 return inet_csk_compat_setsockopt(sk, level, optname,
2202 optval, optlen);
2203 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2204}
2205
2206EXPORT_SYMBOL(compat_tcp_setsockopt);
2207#endif
2208
2209
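/* Return information about the state of the socket (getsockopt(TCP_INFO)
 * and the inet_diag interface). */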
2210void tcp_get_info(struct sock *sk, struct tcp_info *info)
2211{
2212 struct tcp_sock *tp = tcp_sk(sk);
2213 const struct inet_connection_sock *icsk = inet_csk(sk);
2214 u32 now = tcp_time_stamp;
2215
2216 memset(info, 0, sizeof(*info));
2217
2218 info->tcpi_state = sk->sk_state;
2219 info->tcpi_ca_state = icsk->icsk_ca_state;
2220 info->tcpi_retransmits = icsk->icsk_retransmits;
2221 info->tcpi_probes = icsk->icsk_probes_out;
2222 info->tcpi_backoff = icsk->icsk_backoff;
2223
2224 if (tp->rx_opt.tstamp_ok)
2225 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2226 if (tcp_is_sack(tp))
2227 info->tcpi_options |= TCPI_OPT_SACK;
2228 if (tp->rx_opt.wscale_ok) {
2229 info->tcpi_options |= TCPI_OPT_WSCALE;
2230 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2231 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2232 }
2233
2234 if (tp->ecn_flags&TCP_ECN_OK)
2235 info->tcpi_options |= TCPI_OPT_ECN;
2236
2237 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2238 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2239 info->tcpi_snd_mss = tp->mss_cache;
2240 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2241
2242 if (sk->sk_state == TCP_LISTEN) {
2243 info->tcpi_unacked = sk->sk_ack_backlog;
2244 info->tcpi_sacked = sk->sk_max_ack_backlog;
2245 } else {
2246 info->tcpi_unacked = tp->packets_out;
2247 info->tcpi_sacked = tp->sacked_out;
2248 }
2249 info->tcpi_lost = tp->lost_out;
2250 info->tcpi_retrans = tp->retrans_out;
2251 info->tcpi_fackets = tp->fackets_out;
2252
2253 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2254 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2255 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2256
2257 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2258 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2259 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2260 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2261 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2262 info->tcpi_snd_cwnd = tp->snd_cwnd;
2263 info->tcpi_advmss = tp->advmss;
2264 info->tcpi_reordering = tp->reordering;
2265
2266 info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2267 info->tcpi_rcv_space = tp->rcvq_space.space;
2268
2269 info->tcpi_total_retrans = tp->total_retrans;
2270}
2271
2272EXPORT_SYMBOL_GPL(tcp_get_info);
2273
2274static int do_tcp_getsockopt(struct sock *sk, int level,
2275 int optname, char __user *optval, int __user *optlen)
2276{
2277 struct inet_connection_sock *icsk = inet_csk(sk);
2278 struct tcp_sock *tp = tcp_sk(sk);
2279 int val, len;
2280
2281 if (get_user(len, optlen))
2282 return -EFAULT;
2283
2284 len = min_t(unsigned int, len, sizeof(int));
2285
2286 if (len < 0)
2287 return -EINVAL;
2288
2289 switch (optname) {
2290 case TCP_MAXSEG:
2291 val = tp->mss_cache;
2292 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2293 val = tp->rx_opt.user_mss;
2294 break;
2295 case TCP_NODELAY:
2296 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2297 break;
2298 case TCP_CORK:
2299 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2300 break;
2301 case TCP_KEEPIDLE:
2302 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2303 break;
2304 case TCP_KEEPINTVL:
2305 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2306 break;
2307 case TCP_KEEPCNT:
2308 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2309 break;
2310 case TCP_SYNCNT:
2311 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2312 break;
2313 case TCP_LINGER2:
2314 val = tp->linger2;
2315 if (val >= 0)
2316 val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2317 break;
2318 case TCP_DEFER_ACCEPT:
2319 val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
2320 ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
2321 break;
2322 case TCP_WINDOW_CLAMP:
2323 val = tp->window_clamp;
2324 break;
2325 case TCP_INFO: {
2326 struct tcp_info info;
2327
2328 if (get_user(len, optlen))
2329 return -EFAULT;
2330
2331 tcp_get_info(sk, &info);
2332
2333 len = min_t(unsigned int, len, sizeof(info));
2334 if (put_user(len, optlen))
2335 return -EFAULT;
2336 if (copy_to_user(optval, &info, len))
2337 return -EFAULT;
2338 return 0;
2339 }
2340 case TCP_QUICKACK:
2341 val = !icsk->icsk_ack.pingpong;
2342 break;
2343
2344 case TCP_CONGESTION:
2345 if (get_user(len, optlen))
2346 return -EFAULT;
2347 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2348 if (put_user(len, optlen))
2349 return -EFAULT;
2350 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2351 return -EFAULT;
2352 return 0;
2353 default:
2354 return -ENOPROTOOPT;
2355 }
2356
2357 if (put_user(len, optlen))
2358 return -EFAULT;
2359 if (copy_to_user(optval, &val, len))
2360 return -EFAULT;
2361 return 0;
2362}
2363
2364int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2365 int __user *optlen)
2366{
2367 struct inet_connection_sock *icsk = inet_csk(sk);
2368
2369 if (level != SOL_TCP)
2370 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
2371 optval, optlen);
2372 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2373}
2374
2375#ifdef CONFIG_COMPAT
2376int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2377 char __user *optval, int __user *optlen)
2378{
2379 if (level != SOL_TCP)
2380 return inet_csk_compat_getsockopt(sk, level, optname,
2381 optval, optlen);
2382 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2383}
2384
2385EXPORT_SYMBOL(compat_tcp_getsockopt);
2386#endif
2387
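/*
 *	GSO callback: split a super-sized TCP skb into MSS-sized segments,
 *	fixing up the sequence number and checksum of each resulting segment
 *	(or just repair gso_segs when the device can take the skb as is).
 */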
2388struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
2389{
2390 struct sk_buff *segs = ERR_PTR(-EINVAL);
2391 struct tcphdr *th;
2392 unsigned thlen;
2393 unsigned int seq;
2394 __be32 delta;
2395 unsigned int oldlen;
2396 unsigned int len;
2397
2398 if (!pskb_may_pull(skb, sizeof(*th)))
2399 goto out;
2400
2401 th = tcp_hdr(skb);
2402 thlen = th->doff * 4;
2403 if (thlen < sizeof(*th))
2404 goto out;
2405
2406 if (!pskb_may_pull(skb, thlen))
2407 goto out;
2408
2409 oldlen = (u16)~skb->len;
2410 __skb_pull(skb, thlen);
2411
2412 if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
		/* Packet is from an untrusted source, reset gso_segs. */
2414 int type = skb_shinfo(skb)->gso_type;
2415 int mss;
2416
2417 if (unlikely(type &
2418 ~(SKB_GSO_TCPV4 |
2419 SKB_GSO_DODGY |
2420 SKB_GSO_TCP_ECN |
2421 SKB_GSO_TCPV6 |
2422 0) ||
2423 !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
2424 goto out;
2425
2426 mss = skb_shinfo(skb)->gso_size;
2427 skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
2428
2429 segs = NULL;
2430 goto out;
2431 }
2432
2433 segs = skb_segment(skb, features);
2434 if (IS_ERR(segs))
2435 goto out;
2436
2437 len = skb_shinfo(skb)->gso_size;
2438 delta = htonl(oldlen + (thlen + len));
2439
2440 skb = segs;
2441 th = tcp_hdr(skb);
2442 seq = ntohl(th->seq);
2443
2444 do {
2445 th->fin = th->psh = 0;
2446
2447 th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
2448 (__force u32)delta));
2449 if (skb->ip_summed != CHECKSUM_PARTIAL)
2450 th->check =
2451 csum_fold(csum_partial(skb_transport_header(skb),
2452 thlen, skb->csum));
2453
2454 seq += len;
2455 skb = skb->next;
2456 th = tcp_hdr(skb);
2457
2458 th->seq = htonl(seq);
2459 th->cwr = 0;
2460 } while (skb->next);
2461
2462 delta = htonl(oldlen + (skb->tail - skb->transport_header) +
2463 skb->data_len);
2464 th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
2465 (__force u32)delta));
2466 if (skb->ip_summed != CHECKSUM_PARTIAL)
2467 th->check = csum_fold(csum_partial(skb_transport_header(skb),
2468 thlen, skb->csum));
2469
2470out:
2471 return segs;
2472}
2473EXPORT_SYMBOL(tcp_tso_segment);
2474
2475#ifdef CONFIG_TCP_MD5SIG
2476static unsigned long tcp_md5sig_users;
2477static struct tcp_md5sig_pool **tcp_md5sig_pool;
2478static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
2479
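/*
 * Per-cpu pools of crypto contexts for TCP MD5 (RFC 2385) signatures.
 * The pool is allocated on first use, reference counted through
 * tcp_md5sig_users and torn down when the last user goes away.
 */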
2480static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
2481{
2482 int cpu;
2483 for_each_possible_cpu(cpu) {
2484 struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu);
2485 if (p) {
2486 if (p->md5_desc.tfm)
2487 crypto_free_hash(p->md5_desc.tfm);
2488 kfree(p);
2489 p = NULL;
2490 }
2491 }
2492 free_percpu(pool);
2493}
2494
2495void tcp_free_md5sig_pool(void)
2496{
2497 struct tcp_md5sig_pool **pool = NULL;
2498
2499 spin_lock_bh(&tcp_md5sig_pool_lock);
2500 if (--tcp_md5sig_users == 0) {
2501 pool = tcp_md5sig_pool;
2502 tcp_md5sig_pool = NULL;
2503 }
2504 spin_unlock_bh(&tcp_md5sig_pool_lock);
2505 if (pool)
2506 __tcp_free_md5sig_pool(pool);
2507}
2508
2509EXPORT_SYMBOL(tcp_free_md5sig_pool);
2510
2511static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void)
2512{
2513 int cpu;
2514 struct tcp_md5sig_pool **pool;
2515
2516 pool = alloc_percpu(struct tcp_md5sig_pool *);
2517 if (!pool)
2518 return NULL;
2519
2520 for_each_possible_cpu(cpu) {
2521 struct tcp_md5sig_pool *p;
2522 struct crypto_hash *hash;
2523
2524 p = kzalloc(sizeof(*p), GFP_KERNEL);
2525 if (!p)
2526 goto out_free;
2527 *per_cpu_ptr(pool, cpu) = p;
2528
2529 hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
2530 if (!hash || IS_ERR(hash))
2531 goto out_free;
2532
2533 p->md5_desc.tfm = hash;
2534 }
2535 return pool;
2536out_free:
2537 __tcp_free_md5sig_pool(pool);
2538 return NULL;
2539}
2540
2541struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(void)
2542{
2543 struct tcp_md5sig_pool **pool;
2544 int alloc = 0;
2545
2546retry:
2547 spin_lock_bh(&tcp_md5sig_pool_lock);
2548 pool = tcp_md5sig_pool;
2549 if (tcp_md5sig_users++ == 0) {
2550 alloc = 1;
2551 spin_unlock_bh(&tcp_md5sig_pool_lock);
2552 } else if (!pool) {
2553 tcp_md5sig_users--;
2554 spin_unlock_bh(&tcp_md5sig_pool_lock);
2555 cpu_relax();
2556 goto retry;
2557 } else
2558 spin_unlock_bh(&tcp_md5sig_pool_lock);
2559
2560 if (alloc) {
2561
2562 struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool();
2563 spin_lock_bh(&tcp_md5sig_pool_lock);
2564 if (!p) {
2565 tcp_md5sig_users--;
2566 spin_unlock_bh(&tcp_md5sig_pool_lock);
2567 return NULL;
2568 }
2569 pool = tcp_md5sig_pool;
2570 if (pool) {
2571
2572 spin_unlock_bh(&tcp_md5sig_pool_lock);
2573 __tcp_free_md5sig_pool(p);
2574 } else {
2575 tcp_md5sig_pool = pool = p;
2576 spin_unlock_bh(&tcp_md5sig_pool_lock);
2577 }
2578 }
2579 return pool;
2580}
2581
2582EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
2583
2584struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu)
2585{
2586 struct tcp_md5sig_pool **p;
2587 spin_lock_bh(&tcp_md5sig_pool_lock);
2588 p = tcp_md5sig_pool;
2589 if (p)
2590 tcp_md5sig_users++;
2591 spin_unlock_bh(&tcp_md5sig_pool_lock);
2592 return (p ? *per_cpu_ptr(p, cpu) : NULL);
2593}
2594
2595EXPORT_SYMBOL(__tcp_get_md5sig_pool);
2596
2597void __tcp_put_md5sig_pool(void)
2598{
2599 tcp_free_md5sig_pool();
2600}
2601
2602EXPORT_SYMBOL(__tcp_put_md5sig_pool);
2603
2604int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
2605 struct tcphdr *th)
2606{
2607 struct scatterlist sg;
2608 int err;
2609
2610 __sum16 old_checksum = th->check;
2611 th->check = 0;
2612
2613 sg_init_one(&sg, th, sizeof(struct tcphdr));
2614 err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr));
2615 th->check = old_checksum;
2616 return err;
2617}
2618
2619EXPORT_SYMBOL(tcp_md5_hash_header);
2620
2621int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
2622 struct sk_buff *skb, unsigned header_len)
2623{
2624 struct scatterlist sg;
2625 const struct tcphdr *tp = tcp_hdr(skb);
2626 struct hash_desc *desc = &hp->md5_desc;
2627 unsigned i;
2628 const unsigned head_data_len = skb_headlen(skb) > header_len ?
2629 skb_headlen(skb) - header_len : 0;
2630 const struct skb_shared_info *shi = skb_shinfo(skb);
2631
2632 sg_init_table(&sg, 1);
2633
2634 sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
2635 if (crypto_hash_update(desc, &sg, head_data_len))
2636 return 1;
2637
2638 for (i = 0; i < shi->nr_frags; ++i) {
2639 const struct skb_frag_struct *f = &shi->frags[i];
2640 sg_set_page(&sg, f->page, f->size, f->page_offset);
2641 if (crypto_hash_update(desc, &sg, f->size))
2642 return 1;
2643 }
2644
2645 return 0;
2646}
2647
2648EXPORT_SYMBOL(tcp_md5_hash_skb_data);
2649
2650int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
2651{
2652 struct scatterlist sg;
2653
2654 sg_init_one(&sg, key->key, key->keylen);
2655 return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
2656}
2657
2658EXPORT_SYMBOL(tcp_md5_hash_key);
2659
2660#endif
2661
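/*
 *	Final bookkeeping when a connection dies: count failed connection
 *	attempts, stop the timers, mark the socket shut down and either wake
 *	the owner or destroy the socket if it is already orphaned.
 */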
2662void tcp_done(struct sock *sk)
2663{
2664 if(sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
2665 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
2666
2667 tcp_set_state(sk, TCP_CLOSE);
2668 tcp_clear_xmit_timers(sk);
2669
2670 sk->sk_shutdown = SHUTDOWN_MASK;
2671
2672 if (!sock_flag(sk, SOCK_DEAD))
2673 sk->sk_state_change(sk);
2674 else
2675 inet_csk_destroy_sock(sk);
2676}
2677EXPORT_SYMBOL_GPL(tcp_done);
2678
2679extern struct tcp_congestion_ops tcp_reno;
2680
2681static __initdata unsigned long thash_entries;
2682static int __init set_thash_entries(char *str)
2683{
2684 if (!str)
2685 return 0;
2686 thash_entries = simple_strtoul(str, &str, 0);
2687 return 1;
2688}
2689__setup("thash_entries=", set_thash_entries);
2690
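/*
 *	Boot-time initialisation: create the bind-bucket cache, size and
 *	allocate the established and bind hash tables, derive the default
 *	memory limits (tcp_mem/tcp_wmem/tcp_rmem) from available RAM and
 *	register the default "reno" congestion control.
 */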
2691void __init tcp_init(void)
2692{
2693 struct sk_buff *skb = NULL;
2694 unsigned long nr_pages, limit;
2695 int order, i, max_share;
2696
2697 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
2698
2699 tcp_hashinfo.bind_bucket_cachep =
2700 kmem_cache_create("tcp_bind_bucket",
2701 sizeof(struct inet_bind_bucket), 0,
2702 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */
2709 tcp_hashinfo.ehash =
2710 alloc_large_system_hash("TCP established",
2711 sizeof(struct inet_ehash_bucket),
2712 thash_entries,
2713 (num_physpages >= 128 * 1024) ?
2714 13 : 15,
2715 0,
2716 &tcp_hashinfo.ehash_size,
2717 NULL,
2718 thash_entries ? 0 : 512 * 1024);
2719 tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
2720 for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
2721 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2722 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain);
2723 }
2724 if (inet_ehash_locks_alloc(&tcp_hashinfo))
2725 panic("TCP: failed to alloc ehash_locks");
2726 tcp_hashinfo.bhash =
2727 alloc_large_system_hash("TCP bind",
2728 sizeof(struct inet_bind_hashbucket),
2729 tcp_hashinfo.ehash_size,
2730 (num_physpages >= 128 * 1024) ?
2731 13 : 15,
2732 0,
2733 &tcp_hashinfo.bhash_size,
2734 NULL,
2735 64 * 1024);
2736 tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2737 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2738 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2739 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2740 }

	/* Try to be a bit smarter and adjust defaults depending
	 * on available memory.
	 */
2745 for (order = 0; ((1 << order) << PAGE_SHIFT) <
2746 (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2747 order++)
2748 ;
2749 if (order >= 4) {
2750 tcp_death_row.sysctl_max_tw_buckets = 180000;
2751 sysctl_tcp_max_orphans = 4096 << (order - 4);
2752 sysctl_max_syn_backlog = 1024;
2753 } else if (order < 3) {
2754 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2755 sysctl_tcp_max_orphans >>= (3 - order);
2756 sysctl_max_syn_backlog = 128;
2757 }
2758
2759
2760
2761
2762
2763
2764 nr_pages = totalram_pages - totalhigh_pages;
2765 limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
2766 limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
2767 limit = max(limit, 128UL);
2768 limit = min(limit, INT_MAX * 4UL / 3 / 2);
2769 sysctl_tcp_mem[0] = limit / 4 * 3;
2770 sysctl_tcp_mem[1] = limit;
2771 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
2772
2773
2774 limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
2775 max_share = min(4UL*1024*1024, limit);
2776
2777 sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
2778 sysctl_tcp_wmem[1] = 16*1024;
2779 sysctl_tcp_wmem[2] = max(64*1024, max_share);
2780
2781 sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
2782 sysctl_tcp_rmem[1] = 87380;
2783 sysctl_tcp_rmem[2] = max(87380, max_share);
2784
2785 printk(KERN_INFO "TCP: Hash tables configured "
2786 "(established %d bind %d)\n",
2787 tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size);
2788
2789 tcp_register_congestion_control(&tcp_reno);
2790}
2791
2792EXPORT_SYMBOL(tcp_close);
2793EXPORT_SYMBOL(tcp_disconnect);
2794EXPORT_SYMBOL(tcp_getsockopt);
2795EXPORT_SYMBOL(tcp_ioctl);
2796EXPORT_SYMBOL(tcp_poll);
2797EXPORT_SYMBOL(tcp_read_sock);
2798EXPORT_SYMBOL(tcp_recvmsg);
2799EXPORT_SYMBOL(tcp_sendmsg);
2800EXPORT_SYMBOL(tcp_splice_read);
2801EXPORT_SYMBOL(tcp_sendpage);
2802EXPORT_SYMBOL(tcp_setsockopt);
2803EXPORT_SYMBOL(tcp_shutdown);
2804