/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 */
248#include <linux/kernel.h>
249#include <linux/module.h>
250#include <linux/types.h>
251#include <linux/fcntl.h>
252#include <linux/poll.h>
253#include <linux/init.h>
254#include <linux/fs.h>
255#include <linux/skbuff.h>
256#include <linux/scatterlist.h>
257#include <linux/splice.h>
258#include <linux/net.h>
259#include <linux/socket.h>
260#include <linux/random.h>
261#include <linux/bootmem.h>
262#include <linux/highmem.h>
263#include <linux/swap.h>
264#include <linux/cache.h>
265#include <linux/err.h>
266#include <linux/crypto.h>
267
268#include <net/icmp.h>
269#include <net/tcp.h>
270#include <net/xfrm.h>
271#include <net/ip.h>
272#include <net/netdma.h>
273#include <net/sock.h>
274
275#include <asm/uaccess.h>
276#include <asm/ioctls.h>
277
278int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
279
280struct percpu_counter tcp_orphan_count;
281EXPORT_SYMBOL_GPL(tcp_orphan_count);
282
283int sysctl_tcp_mem[3] __read_mostly;
284int sysctl_tcp_wmem[3] __read_mostly;
285int sysctl_tcp_rmem[3] __read_mostly;
286
287EXPORT_SYMBOL(sysctl_tcp_mem);
288EXPORT_SYMBOL(sysctl_tcp_rmem);
289EXPORT_SYMBOL(sysctl_tcp_wmem);
290
291atomic_t tcp_memory_allocated;
292EXPORT_SYMBOL(tcp_memory_allocated);
293
294
295
296
297struct percpu_counter tcp_sockets_allocated;
298EXPORT_SYMBOL(tcp_sockets_allocated);
299
300
301
302
303struct tcp_splice_state {
304 struct pipe_inode_info *pipe;
305 size_t len;
306 unsigned int flags;
307};
308
309
310
311
312
313
314
315int tcp_memory_pressure __read_mostly;
316
317EXPORT_SYMBOL(tcp_memory_pressure);
318
319void tcp_enter_memory_pressure(struct sock *sk)
320{
321 if (!tcp_memory_pressure) {
322 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
323 tcp_memory_pressure = 1;
324 }
325}
326
327EXPORT_SYMBOL(tcp_enter_memory_pressure);
328
/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we
 *	don't go looking at any of the socket buffers directly.
 */
336unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
337{
338 unsigned int mask;
339 struct sock *sk = sock->sk;
340 struct tcp_sock *tp = tcp_sk(sk);
341
342 poll_wait(file, sk->sk_sleep, wait);
343 if (sk->sk_state == TCP_LISTEN)
344 return inet_csk_listen_poll(sk);
345
346
347
348
349
350
351 mask = 0;
352 if (sk->sk_err)
353 mask = POLLERR;
354
 /*
  * POLLHUP is level-triggered and can only be reported once the connection
  * is shut down in both directions (or fully closed), so it is derived from
  * the shutdown/TCP_CLOSE check below rather than from sk_err.
  * POLLIN/POLLRDNORM are reported once at least sk_rcvlowat bytes (the
  * "target") are queued, POLLOUT/POLLWRNORM once enough write space is
  * available, and POLLPRI signals pending out-of-band data.  The NOSPACE
  * bits are set before the second write-space test so that a wakeup racing
  * with this check is not lost.
  */
382 if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
383 mask |= POLLHUP;
384 if (sk->sk_shutdown & RCV_SHUTDOWN)
385 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
386
387
388 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
389 int target = sock_rcvlowat(sk, 0, INT_MAX);
390
391 if (tp->urg_seq == tp->copied_seq &&
392 !sock_flag(sk, SOCK_URGINLINE) &&
393 tp->urg_data)
394 target--;
395
396
397
398
399 if (tp->rcv_nxt - tp->copied_seq >= target)
400 mask |= POLLIN | POLLRDNORM;
401
402 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
403 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
404 mask |= POLLOUT | POLLWRNORM;
405 } else {
406 set_bit(SOCK_ASYNC_NOSPACE,
407 &sk->sk_socket->flags);
408 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
409
410
411
412
413
414 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
415 mask |= POLLOUT | POLLWRNORM;
416 }
417 }
418
419 if (tp->urg_data & TCP_URG_VALID)
420 mask |= POLLPRI;
421 }
422 return mask;
423}
424
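/*
 * ioctl() handling: SIOCINQ returns the number of bytes that can be read
 * right now, SIOCATMARK reports whether we are at the urgent-data mark,
 * and SIOCOUTQ returns the amount of not-yet-acknowledged data in the
 * write queue.  A minimal (hypothetical) userspace sketch:
 *
 *	int pending;
 *	if (ioctl(fd, SIOCINQ, &pending) == 0)
 *		printf("%d bytes readable\n", pending);
 */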
425int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
426{
427 struct tcp_sock *tp = tcp_sk(sk);
428 int answ;
429
430 switch (cmd) {
431 case SIOCINQ:
432 if (sk->sk_state == TCP_LISTEN)
433 return -EINVAL;
434
435 lock_sock(sk);
436 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
437 answ = 0;
438 else if (sock_flag(sk, SOCK_URGINLINE) ||
439 !tp->urg_data ||
440 before(tp->urg_seq, tp->copied_seq) ||
441 !before(tp->urg_seq, tp->rcv_nxt)) {
442 answ = tp->rcv_nxt - tp->copied_seq;
 /* Subtract 1 if a FIN is queued: it consumes a sequence
  * number but carries no readable data. */
445 if (answ && !skb_queue_empty(&sk->sk_receive_queue))
446 answ -=
447 tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin;
448 } else
449 answ = tp->urg_seq - tp->copied_seq;
450 release_sock(sk);
451 break;
452 case SIOCATMARK:
453 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
454 break;
455 case SIOCOUTQ:
456 if (sk->sk_state == TCP_LISTEN)
457 return -EINVAL;
458
459 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
460 answ = 0;
461 else
462 answ = tp->write_seq - tp->snd_una;
463 break;
464 default:
465 return -ENOIOCTLCMD;
466 }
467
468 return put_user(answ, (int __user *)arg);
469}
470
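/*
 * Small write-side helpers.  tcp_mark_push() tags an skb with PSH and
 * records the sequence number up to which we have pushed; forced_push()
 * then fires once more than half of the peer's largest advertised window
 * has been queued since the last push, so bulk senders still push data
 * out regularly even when the application never asks for it.
 */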
471static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
472{
473 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
474 tp->pushed_seq = tp->write_seq;
475}
476
477static inline int forced_push(struct tcp_sock *tp)
478{
479 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
480}
481
482static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
483{
484 struct tcp_sock *tp = tcp_sk(sk);
485 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
486
487 skb->csum = 0;
488 tcb->seq = tcb->end_seq = tp->write_seq;
489 tcb->flags = TCPCB_FLAG_ACK;
490 tcb->sacked = 0;
491 skb_header_release(skb);
492 tcp_add_write_queue_tail(sk, skb);
493 sk->sk_wmem_queued += skb->truesize;
494 sk_mem_charge(sk, skb->truesize);
495 if (tp->nonagle & TCP_NAGLE_PUSH)
496 tp->nonagle &= ~TCP_NAGLE_PUSH;
497}
498
499static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
500 struct sk_buff *skb)
501{
502 if (flags & MSG_OOB)
503 tp->snd_up = tp->write_seq;
504}
505
506static inline void tcp_push(struct sock *sk, int flags, int mss_now,
507 int nonagle)
508{
509 struct tcp_sock *tp = tcp_sk(sk);
510
511 if (tcp_send_head(sk)) {
512 struct sk_buff *skb = tcp_write_queue_tail(sk);
513 if (!(flags & MSG_MORE) || forced_push(tp))
514 tcp_mark_push(tp, skb);
515 tcp_mark_urg(tp, flags, skb);
516 __tcp_push_pending_frames(sk, mss_now,
517 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
518 }
519}
520
521static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
522 unsigned int offset, size_t len)
523{
524 struct tcp_splice_state *tss = rd_desc->arg.data;
525 int ret;
526
527 ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
528 tss->flags);
529 if (ret > 0)
530 rd_desc->count -= ret;
531 return ret;
532}
533
534static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
535{
536
537 read_descriptor_t rd_desc = {
538 .arg.data = tss,
539 .count = tss->len,
540 };
541
542 return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
543}
544
/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 *  @sock:	socket to splice from
 *  @ppos:	position (not valid)
 *  @pipe:	pipe to splice to
 *  @len:	number of bytes to splice
 *  @flags:	splice modifier flags
 *
 *  Description:
 *    Will read pages from given socket and fill them into a pipe.
 *
 **/
557ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
558 struct pipe_inode_info *pipe, size_t len,
559 unsigned int flags)
560{
561 struct sock *sk = sock->sk;
562 struct tcp_splice_state tss = {
563 .pipe = pipe,
564 .len = len,
565 .flags = flags,
566 };
567 long timeo;
568 ssize_t spliced;
569 int ret;
570
 /*
  * We can't seek on a socket input
  */
574 if (unlikely(*ppos))
575 return -ESPIPE;
576
577 ret = spliced = 0;
578
579 lock_sock(sk);
580
581 timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK);
582 while (tss.len) {
583 ret = __tcp_splice_read(sk, &tss);
584 if (ret < 0)
585 break;
586 else if (!ret) {
587 if (spliced)
588 break;
589 if (sock_flag(sk, SOCK_DONE))
590 break;
591 if (sk->sk_err) {
592 ret = sock_error(sk);
593 break;
594 }
595 if (sk->sk_shutdown & RCV_SHUTDOWN)
596 break;
597 if (sk->sk_state == TCP_CLOSE) {
 /*
  * This occurs when user tries to read
  * from never connected socket.
  */
602 if (!sock_flag(sk, SOCK_DONE))
603 ret = -ENOTCONN;
604 break;
605 }
606 if (!timeo) {
607 ret = -EAGAIN;
608 break;
609 }
610 sk_wait_data(sk, &timeo);
611 if (signal_pending(current)) {
612 ret = sock_intr_errno(timeo);
613 break;
614 }
615 continue;
616 }
617 tss.len -= ret;
618 spliced += ret;
619
620 if (!timeo)
621 break;
622 release_sock(sk);
623 lock_sock(sk);
624
625 if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
626 (sk->sk_shutdown & RCV_SHUTDOWN) ||
627 signal_pending(current))
628 break;
629 }
630
631 release_sock(sk);
632
633 if (spliced)
634 return spliced;
635
636 return ret;
637}
638
639struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
640{
641 struct sk_buff *skb;
642
 /* The TCP header must be at least 32-bit aligned. */
644 size = ALIGN(size, 4);
645
646 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
647 if (skb) {
648 if (sk_wmem_schedule(sk, skb->truesize)) {
 /*
  * Make sure that we have exactly size bytes
  * available to the caller, no more, no less.
  */
653 skb_reserve(skb, skb_tailroom(skb) - size);
654 return skb;
655 }
656 __kfree_skb(skb);
657 } else {
658 sk->sk_prot->enter_memory_pressure(sk);
659 sk_stream_moderate_sndbuf(sk);
660 }
661 return NULL;
662}
663
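/*
 * tcp_xmit_size_goal() picks the "size goal" used to fill skbs on the send
 * path.  Without GSO the goal is simply the current MSS; with GSO it is a
 * multiple of the MSS derived from sk_gso_max_size minus protocol headers
 * (and at most half the window), with a little hysteresis so the goal is
 * only recomputed when it changes by a full MSS.
 */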
664static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
665 int large_allowed)
666{
667 struct tcp_sock *tp = tcp_sk(sk);
668 u32 xmit_size_goal, old_size_goal;
669
670 xmit_size_goal = mss_now;
671
672 if (large_allowed && sk_can_gso(sk)) {
673 xmit_size_goal = ((sk->sk_gso_max_size - 1) -
674 inet_csk(sk)->icsk_af_ops->net_header_len -
675 inet_csk(sk)->icsk_ext_hdr_len -
676 tp->tcp_header_len);
677
678 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
679
680
681 old_size_goal = tp->xmit_size_goal_segs * mss_now;
682
683 if (likely(old_size_goal <= xmit_size_goal &&
684 old_size_goal + mss_now > xmit_size_goal)) {
685 xmit_size_goal = old_size_goal;
686 } else {
687 tp->xmit_size_goal_segs = xmit_size_goal / mss_now;
688 xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
689 }
690 }
691
692 return max(xmit_size_goal, mss_now);
693}
694
695static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
696{
697 int mss_now;
698
699 mss_now = tcp_current_mss(sk);
700 *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
701
702 return mss_now;
703}
704
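/*
 * do_tcp_sendpages() implements the zero-copy sendpage()/splice transmit
 * path: it appends page references to the tail skb (coalescing with the
 * previous fragment when possible) instead of copying the payload, charges
 * the socket for the bytes, and pushes frames out under the usual
 * Nagle/cork rules once a segment reaches the size goal.
 */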
705static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
706 size_t psize, int flags)
707{
708 struct tcp_sock *tp = tcp_sk(sk);
709 int mss_now, size_goal;
710 int err;
711 ssize_t copied;
712 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
713
714
715 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
716 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
717 goto out_err;
718
719 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
720
721 mss_now = tcp_send_mss(sk, &size_goal, flags);
722 copied = 0;
723
724 err = -EPIPE;
725 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
726 goto out_err;
727
728 while (psize > 0) {
729 struct sk_buff *skb = tcp_write_queue_tail(sk);
730 struct page *page = pages[poffset / PAGE_SIZE];
731 int copy, i, can_coalesce;
732 int offset = poffset % PAGE_SIZE;
733 int size = min_t(size_t, psize, PAGE_SIZE - offset);
734
735 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
736new_segment:
737 if (!sk_stream_memory_free(sk))
738 goto wait_for_sndbuf;
739
740 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
741 if (!skb)
742 goto wait_for_memory;
743
744 skb_entail(sk, skb);
745 copy = size_goal;
746 }
747
748 if (copy > size)
749 copy = size;
750
751 i = skb_shinfo(skb)->nr_frags;
752 can_coalesce = skb_can_coalesce(skb, i, page, offset);
753 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
754 tcp_mark_push(tp, skb);
755 goto new_segment;
756 }
757 if (!sk_wmem_schedule(sk, copy))
758 goto wait_for_memory;
759
760 if (can_coalesce) {
761 skb_shinfo(skb)->frags[i - 1].size += copy;
762 } else {
763 get_page(page);
764 skb_fill_page_desc(skb, i, page, offset, copy);
765 }
766
767 skb->len += copy;
768 skb->data_len += copy;
769 skb->truesize += copy;
770 sk->sk_wmem_queued += copy;
771 sk_mem_charge(sk, copy);
772 skb->ip_summed = CHECKSUM_PARTIAL;
773 tp->write_seq += copy;
774 TCP_SKB_CB(skb)->end_seq += copy;
775 skb_shinfo(skb)->gso_segs = 0;
776
777 if (!copied)
778 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
779
780 copied += copy;
781 poffset += copy;
782 if (!(psize -= copy))
783 goto out;
784
785 if (skb->len < size_goal || (flags & MSG_OOB))
786 continue;
787
788 if (forced_push(tp)) {
789 tcp_mark_push(tp, skb);
790 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
791 } else if (skb == tcp_send_head(sk))
792 tcp_push_one(sk, mss_now);
793 continue;
794
795wait_for_sndbuf:
796 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
797wait_for_memory:
798 if (copied)
799 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
800
801 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
802 goto do_error;
803
804 mss_now = tcp_send_mss(sk, &size_goal, flags);
805 }
806
807out:
808 if (copied)
809 tcp_push(sk, flags, mss_now, tp->nonagle);
810 return copied;
811
812do_error:
813 if (copied)
814 goto out;
815out_err:
816 return sk_stream_error(sk, flags, err);
817}
818
819ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
820 size_t size, int flags)
821{
822 ssize_t res;
823 struct sock *sk = sock->sk;
824
825 if (!(sk->sk_route_caps & NETIF_F_SG) ||
826 !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
827 return sock_no_sendpage(sock, page, offset, size, flags);
828
829 lock_sock(sk);
830 TCP_CHECK_TIMER(sk);
831 res = do_tcp_sendpages(sk, &page, offset, size, flags);
832 TCP_CHECK_TIMER(sk);
833 release_sock(sk);
834 return res;
835}
836
837#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
838#define TCP_OFF(sk) (sk->sk_sndmsg_off)
839
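/*
 * select_size() decides roughly how much linear (non-paged) space to
 * reserve when tcp_sendmsg() allocates a new skb: none for GSO-capable
 * routes (everything goes into page fragments), otherwise up to one MSS,
 * bounded so the remainder still fits in MAX_SKB_FRAGS page fragments.
 */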
840static inline int select_size(struct sock *sk)
841{
842 struct tcp_sock *tp = tcp_sk(sk);
843 int tmp = tp->mss_cache;
844
845 if (sk->sk_route_caps & NETIF_F_SG) {
846 if (sk_can_gso(sk))
847 tmp = 0;
848 else {
849 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
850
851 if (tmp >= pgbreak &&
852 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
853 tmp = pgbreak;
854 }
855 }
856
857 return tmp;
858}
859
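/*
 * tcp_sendmsg() copies user data into the write queue.  For each iovec
 * segment it either appends to the tail skb (first into its linear
 * tailroom, then into the per-socket page tracked by TCP_PAGE/TCP_OFF) or
 * starts a new segment once the current one has reached the size goal,
 * blocking in sk_stream_wait_memory() when the send buffer is full.
 */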
860int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
861 size_t size)
862{
863 struct sock *sk = sock->sk;
864 struct iovec *iov;
865 struct tcp_sock *tp = tcp_sk(sk);
866 struct sk_buff *skb;
867 int iovlen, flags;
868 int mss_now, size_goal;
869 int err, copied;
870 long timeo;
871
872 lock_sock(sk);
873 TCP_CHECK_TIMER(sk);
874
875 flags = msg->msg_flags;
876 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
877
878
879 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
880 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
881 goto out_err;
882
883
884 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
885
886 mss_now = tcp_send_mss(sk, &size_goal, flags);
887
888
889 iovlen = msg->msg_iovlen;
890 iov = msg->msg_iov;
891 copied = 0;
892
893 err = -EPIPE;
894 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
895 goto out_err;
896
897 while (--iovlen >= 0) {
898 int seglen = iov->iov_len;
899 unsigned char __user *from = iov->iov_base;
900
901 iov++;
902
903 while (seglen > 0) {
904 int copy;
905
906 skb = tcp_write_queue_tail(sk);
907
908 if (!tcp_send_head(sk) ||
909 (copy = size_goal - skb->len) <= 0) {
910
911new_segment:
912
913
914
915 if (!sk_stream_memory_free(sk))
916 goto wait_for_sndbuf;
917
918 skb = sk_stream_alloc_skb(sk, select_size(sk),
919 sk->sk_allocation);
920 if (!skb)
921 goto wait_for_memory;
922
923
924
925
926 if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
927 skb->ip_summed = CHECKSUM_PARTIAL;
928
929 skb_entail(sk, skb);
930 copy = size_goal;
931 }
932
933
934 if (copy > seglen)
935 copy = seglen;
936
937
938 if (skb_tailroom(skb) > 0) {
939
940 if (copy > skb_tailroom(skb))
941 copy = skb_tailroom(skb);
942 if ((err = skb_add_data(skb, from, copy)) != 0)
943 goto do_fault;
944 } else {
945 int merge = 0;
946 int i = skb_shinfo(skb)->nr_frags;
947 struct page *page = TCP_PAGE(sk);
948 int off = TCP_OFF(sk);
949
950 if (skb_can_coalesce(skb, i, page, off) &&
951 off != PAGE_SIZE) {
952
953
954 merge = 1;
955 } else if (i == MAX_SKB_FRAGS ||
956 (!i &&
957 !(sk->sk_route_caps & NETIF_F_SG))) {
958
959
960
961
962 tcp_mark_push(tp, skb);
963 goto new_segment;
964 } else if (page) {
965 if (off == PAGE_SIZE) {
966 put_page(page);
967 TCP_PAGE(sk) = page = NULL;
968 off = 0;
969 }
970 } else
971 off = 0;
972
973 if (copy > PAGE_SIZE - off)
974 copy = PAGE_SIZE - off;
975
976 if (!sk_wmem_schedule(sk, copy))
977 goto wait_for_memory;
978
979 if (!page) {
980
981 if (!(page = sk_stream_alloc_page(sk)))
982 goto wait_for_memory;
983 }
984
985
986
987 err = skb_copy_to_page(sk, from, skb, page,
988 off, copy);
989 if (err) {
990
991
992
993 if (!TCP_PAGE(sk)) {
994 TCP_PAGE(sk) = page;
995 TCP_OFF(sk) = 0;
996 }
997 goto do_error;
998 }
999
1000
1001 if (merge) {
1002 skb_shinfo(skb)->frags[i - 1].size +=
1003 copy;
1004 } else {
1005 skb_fill_page_desc(skb, i, page, off, copy);
1006 if (TCP_PAGE(sk)) {
1007 get_page(page);
1008 } else if (off + copy < PAGE_SIZE) {
1009 get_page(page);
1010 TCP_PAGE(sk) = page;
1011 }
1012 }
1013
1014 TCP_OFF(sk) = off + copy;
1015 }
1016
1017 if (!copied)
1018 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1019
1020 tp->write_seq += copy;
1021 TCP_SKB_CB(skb)->end_seq += copy;
1022 skb_shinfo(skb)->gso_segs = 0;
1023
1024 from += copy;
1025 copied += copy;
1026 if ((seglen -= copy) == 0 && iovlen == 0)
1027 goto out;
1028
1029 if (skb->len < size_goal || (flags & MSG_OOB))
1030 continue;
1031
1032 if (forced_push(tp)) {
1033 tcp_mark_push(tp, skb);
1034 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1035 } else if (skb == tcp_send_head(sk))
1036 tcp_push_one(sk, mss_now);
1037 continue;
1038
1039wait_for_sndbuf:
1040 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1041wait_for_memory:
1042 if (copied)
1043 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1044
1045 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
1046 goto do_error;
1047
1048 mss_now = tcp_send_mss(sk, &size_goal, flags);
1049 }
1050 }
1051
1052out:
1053 if (copied)
1054 tcp_push(sk, flags, mss_now, tp->nonagle);
1055 TCP_CHECK_TIMER(sk);
1056 release_sock(sk);
1057 return copied;
1058
1059do_fault:
1060 if (!skb->len) {
1061 tcp_unlink_write_queue(skb, sk);
1062
1063
1064
1065 tcp_check_send_head(sk, skb);
1066 sk_wmem_free_skb(sk, skb);
1067 }
1068
1069do_error:
1070 if (copied)
1071 goto out;
1072out_err:
1073 err = sk_stream_error(sk, flags, err);
1074 TCP_CHECK_TIMER(sk);
1075 release_sock(sk);
1076 return err;
1077}
1078
/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 */
1084static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1085{
1086 struct tcp_sock *tp = tcp_sk(sk);
1087
1088
1089 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1090 tp->urg_data == TCP_URG_READ)
1091 return -EINVAL;
1092
1093 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1094 return -ENOTCONN;
1095
1096 if (tp->urg_data & TCP_URG_VALID) {
1097 int err = 0;
1098 char c = tp->urg_data;
1099
1100 if (!(flags & MSG_PEEK))
1101 tp->urg_data = TCP_URG_READ;
1102
1103
1104 msg->msg_flags |= MSG_OOB;
1105
1106 if (len > 0) {
1107 if (!(flags & MSG_TRUNC))
1108 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1109 len = 1;
1110 } else
1111 msg->msg_flags |= MSG_TRUNC;
1112
1113 return err ? -EFAULT : len;
1114 }
1115
1116 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1117 return 0;
1118
 /* BSD docs and the available implementations agree on the
  * recv(..., MSG_OOB) behaviour in this case: the call should
  * never block, independent of the blocking state of the
  * socket, so fail with -EAGAIN rather than waiting.
  */
1125 return -EAGAIN;
1126}
1127
/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */
1134void tcp_cleanup_rbuf(struct sock *sk, int copied)
1135{
1136 struct tcp_sock *tp = tcp_sk(sk);
1137 int time_to_ack = 0;
1138
1139#if TCP_DEBUG
1140 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1141
1142 WARN_ON(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1143#endif
1144
1145 if (inet_csk_ack_scheduled(sk)) {
1146 const struct inet_connection_sock *icsk = inet_csk(sk);
1147
1148
1149 if (icsk->icsk_ack.blocked ||
1150
1151 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1152
1153
1154
1155
1156
1157
1158 (copied > 0 &&
1159 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1160 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1161 !icsk->icsk_ack.pingpong)) &&
1162 !atomic_read(&sk->sk_rmem_alloc)))
1163 time_to_ack = 1;
1164 }
1165
 /* We send an ACK if we can now advertise a non-zero window
  * which has been raised "significantly".
  *
  * Even if the window was raised up to infinity, do not send a
  * window-opening ACK in states where we will not receive more.
  */
1172 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1173 __u32 rcv_window_now = tcp_receive_window(tp);
1174
1175
1176 if (2*rcv_window_now <= tp->window_clamp) {
1177 __u32 new_window = __tcp_select_window(sk);
1178
1179
1180
1181
1182
1183
1184 if (new_window && new_window >= 2 * rcv_window_now)
1185 time_to_ack = 1;
1186 }
1187 }
1188 if (time_to_ack)
1189 tcp_send_ack(sk);
1190}
1191
1192static void tcp_prequeue_process(struct sock *sk)
1193{
1194 struct sk_buff *skb;
1195 struct tcp_sock *tp = tcp_sk(sk);
1196
1197 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
1198
1199
1200
1201 local_bh_disable();
1202 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1203 sk_backlog_rcv(sk, skb);
1204 local_bh_enable();
1205
1206
1207 tp->ucopy.memory = 0;
1208}
1209
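/*
 * Find the skb in the receive queue that contains sequence number @seq and
 * return the offset of @seq within it.  An skb carrying only a FIN is
 * returned as well, so callers can consume the FIN.
 */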
1210static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1211{
1212 struct sk_buff *skb;
1213 u32 offset;
1214
1215 skb_queue_walk(&sk->sk_receive_queue, skb) {
1216 offset = seq - TCP_SKB_CB(skb)->seq;
1217 if (tcp_hdr(skb)->syn)
1218 offset--;
1219 if (offset < skb->len || tcp_hdr(skb)->fin) {
1220 *off = offset;
1221 return skb;
1222 }
1223 }
1224 return NULL;
1225}
1226
/*
 *	This routine provides an alternative to tcp_recvmsg() for routines
 *	that would like to handle copying from skbuffs directly in 'sendfile'
 *	fashion.
 *	- It is assumed that the socket was locked by the caller.
 *	- The routine does not block.
 *	- At present, there is no support for reading OOB data
 *	  or for 'peeking' the socket using this routine
 *	  (although both would be easy to implement).
 */
1238int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1239 sk_read_actor_t recv_actor)
1240{
1241 struct sk_buff *skb;
1242 struct tcp_sock *tp = tcp_sk(sk);
1243 u32 seq = tp->copied_seq;
1244 u32 offset;
1245 int copied = 0;
1246
1247 if (sk->sk_state == TCP_LISTEN)
1248 return -ENOTCONN;
1249 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1250 if (offset < skb->len) {
1251 int used;
1252 size_t len;
1253
1254 len = skb->len - offset;
1255
1256 if (tp->urg_data) {
1257 u32 urg_offset = tp->urg_seq - seq;
1258 if (urg_offset < len)
1259 len = urg_offset;
1260 if (!len)
1261 break;
1262 }
1263 used = recv_actor(desc, skb, offset, len);
1264 if (used < 0) {
1265 if (!copied)
1266 copied = used;
1267 break;
1268 } else if (used <= len) {
1269 seq += used;
1270 copied += used;
1271 offset += used;
1272 }
 /* If recv_actor drops the lock (e.g. TCP splice
  * receive) the skb pointer might be invalid when
  * getting here: tcp_collapse might have deleted it
  * while aggregating skbs from the socket queue.
  */
1279 skb = tcp_recv_skb(sk, seq-1, &offset);
1280 if (!skb || (offset+1 != skb->len))
1281 break;
1282 }
1283 if (tcp_hdr(skb)->fin) {
1284 sk_eat_skb(sk, skb, 0);
1285 ++seq;
1286 break;
1287 }
1288 sk_eat_skb(sk, skb, 0);
1289 if (!desc->count)
1290 break;
1291 }
1292 tp->copied_seq = seq;
1293
1294 tcp_rcv_space_adjust(sk);
1295
1296
1297 if (copied > 0)
1298 tcp_cleanup_rbuf(sk, copied);
1299 return copied;
1300}
1301
/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Technical note: we work on a _locked_ socket, so that tricks
 *	with *seq access order and skb->users are not required.
 */
1310int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1311 size_t len, int nonblock, int flags, int *addr_len)
1312{
1313 struct tcp_sock *tp = tcp_sk(sk);
1314 int copied = 0;
1315 u32 peek_seq;
1316 u32 *seq;
1317 unsigned long used;
1318 int err;
1319 int target;
1320 long timeo;
1321 struct task_struct *user_recv = NULL;
1322 int copied_early = 0;
1323 struct sk_buff *skb;
1324 u32 urg_hole = 0;
1325
1326 lock_sock(sk);
1327
1328 TCP_CHECK_TIMER(sk);
1329
1330 err = -ENOTCONN;
1331 if (sk->sk_state == TCP_LISTEN)
1332 goto out;
1333
1334 timeo = sock_rcvtimeo(sk, nonblock);
1335
1336
1337 if (flags & MSG_OOB)
1338 goto recv_urg;
1339
1340 seq = &tp->copied_seq;
1341 if (flags & MSG_PEEK) {
1342 peek_seq = tp->copied_seq;
1343 seq = &peek_seq;
1344 }
1345
1346 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1347
1348#ifdef CONFIG_NET_DMA
1349 tp->ucopy.dma_chan = NULL;
1350 preempt_disable();
1351 skb = skb_peek_tail(&sk->sk_receive_queue);
1352 {
1353 int available = 0;
1354
1355 if (skb)
1356 available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
1357 if ((available < target) &&
1358 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1359 !sysctl_tcp_low_latency &&
1360 dma_find_channel(DMA_MEMCPY)) {
1361 preempt_enable_no_resched();
1362 tp->ucopy.pinned_list =
1363 dma_pin_iovec_pages(msg->msg_iov, len);
1364 } else {
1365 preempt_enable_no_resched();
1366 }
1367 }
1368#endif
1369
1370 do {
1371 u32 offset;
1372
1373
1374 if (tp->urg_data && tp->urg_seq == *seq) {
1375 if (copied)
1376 break;
1377 if (signal_pending(current)) {
1378 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1379 break;
1380 }
1381 }
1382
1383
1384
1385 skb = skb_peek(&sk->sk_receive_queue);
1386 do {
1387 if (!skb)
1388 break;
1389
1390
1391
1392
1393 if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1394 printk(KERN_INFO "recvmsg bug: copied %X "
1395 "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1396 break;
1397 }
1398 offset = *seq - TCP_SKB_CB(skb)->seq;
1399 if (tcp_hdr(skb)->syn)
1400 offset--;
1401 if (offset < skb->len)
1402 goto found_ok_skb;
1403 if (tcp_hdr(skb)->fin)
1404 goto found_fin_ok;
1405 WARN_ON(!(flags & MSG_PEEK));
1406 skb = skb->next;
1407 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1408
1409
1410
1411 if (copied >= target && !sk->sk_backlog.tail)
1412 break;
1413
1414 if (copied) {
1415 if (sk->sk_err ||
1416 sk->sk_state == TCP_CLOSE ||
1417 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1418 !timeo ||
1419 signal_pending(current))
1420 break;
1421 } else {
1422 if (sock_flag(sk, SOCK_DONE))
1423 break;
1424
1425 if (sk->sk_err) {
1426 copied = sock_error(sk);
1427 break;
1428 }
1429
1430 if (sk->sk_shutdown & RCV_SHUTDOWN)
1431 break;
1432
1433 if (sk->sk_state == TCP_CLOSE) {
1434 if (!sock_flag(sk, SOCK_DONE)) {
1435
1436
1437
1438 copied = -ENOTCONN;
1439 break;
1440 }
1441 break;
1442 }
1443
1444 if (!timeo) {
1445 copied = -EAGAIN;
1446 break;
1447 }
1448
1449 if (signal_pending(current)) {
1450 copied = sock_intr_errno(timeo);
1451 break;
1452 }
1453 }
1454
1455 tcp_cleanup_rbuf(sk, copied);
1456
1457 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
 /* Install new reader */
1459 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1460 user_recv = current;
1461 tp->ucopy.task = user_recv;
1462 tp->ucopy.iov = msg->msg_iov;
1463 }
1464
1465 tp->ucopy.len = len;
1466
1467 WARN_ON(tp->copied_seq != tp->rcv_nxt &&
1468 !(flags & (MSG_PEEK | MSG_TRUNC)));
1469
 /* Ugly... If the prequeue is not empty, we have to
  * process it before releasing the socket, otherwise
  * ordering will be broken on the next iteration.
  *
  * Look: we have the following (pseudo)queues:
  *
  * 1. packets in flight
  * 2. backlog
  * 3. prequeue
  * 4. receive_queue
  *
  * Each queue can be processed only if the next ones
  * are empty.  At this point the receive_queue is empty,
  * but the prequeue _can_ be non-empty after backlog
  * processing added something to the receive_queue.
  * We cannot release_sock() here, because the backlog
  * contains packets that arrived _after_ the prequeued
  * ones.  In short: process the queues in order.
  */
1496 if (!skb_queue_empty(&tp->ucopy.prequeue))
1497 goto do_prequeue;
1498
1499
1500 }
1501
1502 if (copied >= target) {
1503
1504 release_sock(sk);
1505 lock_sock(sk);
1506 } else
1507 sk_wait_data(sk, &timeo);
1508
1509#ifdef CONFIG_NET_DMA
1510 tp->ucopy.wakeup = 0;
1511#endif
1512
1513 if (user_recv) {
1514 int chunk;
1515
1516
1517
1518 if ((chunk = len - tp->ucopy.len) != 0) {
1519 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1520 len -= chunk;
1521 copied += chunk;
1522 }
1523
1524 if (tp->rcv_nxt == tp->copied_seq &&
1525 !skb_queue_empty(&tp->ucopy.prequeue)) {
1526do_prequeue:
1527 tcp_prequeue_process(sk);
1528
1529 if ((chunk = len - tp->ucopy.len) != 0) {
1530 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1531 len -= chunk;
1532 copied += chunk;
1533 }
1534 }
1535 }
1536 if ((flags & MSG_PEEK) &&
1537 (peek_seq - copied - urg_hole != tp->copied_seq)) {
1538 if (net_ratelimit())
1539 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1540 current->comm, task_pid_nr(current));
1541 peek_seq = tp->copied_seq;
1542 }
1543 continue;
1544
1545 found_ok_skb:
1546
1547 used = skb->len - offset;
1548 if (len < used)
1549 used = len;
1550
1551
1552 if (tp->urg_data) {
1553 u32 urg_offset = tp->urg_seq - *seq;
1554 if (urg_offset < used) {
1555 if (!urg_offset) {
1556 if (!sock_flag(sk, SOCK_URGINLINE)) {
1557 ++*seq;
1558 urg_hole++;
1559 offset++;
1560 used--;
1561 if (!used)
1562 goto skip_copy;
1563 }
1564 } else
1565 used = urg_offset;
1566 }
1567 }
1568
1569 if (!(flags & MSG_TRUNC)) {
1570#ifdef CONFIG_NET_DMA
1571 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1572 tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1573
1574 if (tp->ucopy.dma_chan) {
1575 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1576 tp->ucopy.dma_chan, skb, offset,
1577 msg->msg_iov, used,
1578 tp->ucopy.pinned_list);
1579
1580 if (tp->ucopy.dma_cookie < 0) {
1581
1582 printk(KERN_ALERT "dma_cookie < 0\n");
1583
1584
1585 if (!copied)
1586 copied = -EFAULT;
1587 break;
1588 }
1589 if ((offset + used) == skb->len)
1590 copied_early = 1;
1591
1592 } else
1593#endif
1594 {
1595 err = skb_copy_datagram_iovec(skb, offset,
1596 msg->msg_iov, used);
1597 if (err) {
1598
1599 if (!copied)
1600 copied = -EFAULT;
1601 break;
1602 }
1603 }
1604 }
1605
1606 *seq += used;
1607 copied += used;
1608 len -= used;
1609
1610 tcp_rcv_space_adjust(sk);
1611
1612skip_copy:
1613 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1614 tp->urg_data = 0;
1615 tcp_fast_path_check(sk);
1616 }
1617 if (used + offset < skb->len)
1618 continue;
1619
1620 if (tcp_hdr(skb)->fin)
1621 goto found_fin_ok;
1622 if (!(flags & MSG_PEEK)) {
1623 sk_eat_skb(sk, skb, copied_early);
1624 copied_early = 0;
1625 }
1626 continue;
1627
1628 found_fin_ok:
1629
1630 ++*seq;
1631 if (!(flags & MSG_PEEK)) {
1632 sk_eat_skb(sk, skb, copied_early);
1633 copied_early = 0;
1634 }
1635 break;
1636 } while (len > 0);
1637
1638 if (user_recv) {
1639 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1640 int chunk;
1641
1642 tp->ucopy.len = copied > 0 ? len : 0;
1643
1644 tcp_prequeue_process(sk);
1645
1646 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1647 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1648 len -= chunk;
1649 copied += chunk;
1650 }
1651 }
1652
1653 tp->ucopy.task = NULL;
1654 tp->ucopy.len = 0;
1655 }
1656
1657#ifdef CONFIG_NET_DMA
1658 if (tp->ucopy.dma_chan) {
1659 dma_cookie_t done, used;
1660
1661 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1662
1663 while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1664 tp->ucopy.dma_cookie, &done,
1665 &used) == DMA_IN_PROGRESS) {
1666
1667 while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1668 (dma_async_is_complete(skb->dma_cookie, done,
1669 used) == DMA_SUCCESS)) {
1670 __skb_dequeue(&sk->sk_async_wait_queue);
1671 kfree_skb(skb);
1672 }
1673 }
1674
1675
1676 __skb_queue_purge(&sk->sk_async_wait_queue);
1677 tp->ucopy.dma_chan = NULL;
1678 }
1679 if (tp->ucopy.pinned_list) {
1680 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1681 tp->ucopy.pinned_list = NULL;
1682 }
1683#endif
1684
1685
1686
1687
1688
1689
1690 tcp_cleanup_rbuf(sk, copied);
1691
1692 TCP_CHECK_TIMER(sk);
1693 release_sock(sk);
1694 return copied;
1695
1696out:
1697 TCP_CHECK_TIMER(sk);
1698 release_sock(sk);
1699 return err;
1700
1701recv_urg:
1702 err = tcp_recv_urg(sk, msg, len, flags);
1703 goto out;
1704}
1705
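/*
 * tcp_set_state() is the single place where sk_state changes: it keeps the
 * CurrEstab/EstabResets SNMP counters in sync and, on a move to TCP_CLOSE,
 * unhashes the socket and releases its bound port unless the user locked
 * the binding.
 */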
1706void tcp_set_state(struct sock *sk, int state)
1707{
1708 int oldstate = sk->sk_state;
1709
1710 switch (state) {
1711 case TCP_ESTABLISHED:
1712 if (oldstate != TCP_ESTABLISHED)
1713 TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1714 break;
1715
1716 case TCP_CLOSE:
1717 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
1718 TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
1719
1720 sk->sk_prot->unhash(sk);
1721 if (inet_csk(sk)->icsk_bind_hash &&
1722 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
1723 inet_put_port(sk);
1724
1725 default:
1726 if (oldstate == TCP_ESTABLISHED)
1727 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1728 }
1729
1730
1731
1732
1733 sk->sk_state = state;
1734
1735#ifdef STATE_TRACE
1736 SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
1737#endif
1738}
1739EXPORT_SYMBOL_GPL(tcp_set_state);
1740
/*
 *	State processing on a close.  This implements the state shift for
 *	sending our FIN frame; note that we only send a FIN from some states.
 *
 *	new_state[] is indexed by the current state; the low bits give the
 *	next state and TCP_ACTION_FIN indicates that a FIN must be sent.
 */
1748static const unsigned char new_state[16] = {
1749
1750 TCP_CLOSE,
1751 TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1752 TCP_CLOSE,
1753 TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1754 TCP_FIN_WAIT1,
1755 TCP_FIN_WAIT2,
1756 TCP_CLOSE,
1757 TCP_CLOSE,
1758 TCP_LAST_ACK | TCP_ACTION_FIN,
1759 TCP_LAST_ACK,
1760 TCP_CLOSE,
1761 TCP_CLOSING,
1762};
1763
1764static int tcp_close_state(struct sock *sk)
1765{
1766 int next = (int)new_state[sk->sk_state];
1767 int ns = next & TCP_STATE_MASK;
1768
1769 tcp_set_state(sk, ns);
1770
1771 return next & TCP_ACTION_FIN;
1772}
1773
/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
 */
1779void tcp_shutdown(struct sock *sk, int how)
1780{
1781
1782
1783
1784
1785 if (!(how & SEND_SHUTDOWN))
1786 return;
1787
1788
1789 if ((1 << sk->sk_state) &
1790 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1791 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1792
1793 if (tcp_close_state(sk))
1794 tcp_send_fin(sk);
1795 }
1796}
1797
1798void tcp_close(struct sock *sk, long timeout)
1799{
1800 struct sk_buff *skb;
1801 int data_was_unread = 0;
1802 int state;
1803
1804 lock_sock(sk);
1805 sk->sk_shutdown = SHUTDOWN_MASK;
1806
1807 if (sk->sk_state == TCP_LISTEN) {
1808 tcp_set_state(sk, TCP_CLOSE);
1809
1810
1811 inet_csk_listen_stop(sk);
1812
1813 goto adjudge_to_death;
1814 }
1815
 /*  We need to flush the recv. buffs.  We do this only on the
  *  descriptor close, not protocol-sourced closes, because the
  *  reader process may not have drained the data yet!
  */
1820 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1821 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1822 tcp_hdr(skb)->fin;
1823 data_was_unread += len;
1824 __kfree_skb(skb);
1825 }
1826
1827 sk_mem_reclaim(sk);
1828
 /* As outlined in RFC 2525, section 2.17, we send a RST here
  * because unread data was discarded: a plain FIN would make the
  * peer believe all of its data had been delivered.
  */
1836 if (data_was_unread) {
1837
1838 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
1839 tcp_set_state(sk, TCP_CLOSE);
1840 tcp_send_active_reset(sk, GFP_KERNEL);
1841 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1842
1843 sk->sk_prot->disconnect(sk, 0);
1844 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
1845 } else if (tcp_close_state(sk)) {
 /* We FIN if the user closed the socket while data was still
  * queued for transmission; tcp_send_fin() below queues the FIN
  * behind any remaining data.
  *
  * RED-PEN. Formally speaking this bends the TCP state machine:
  * the transitions
  *
  *	TCP_ESTABLISHED -> TCP_FIN_WAIT1
  *	TCP_CLOSE_WAIT  -> TCP_LAST_ACK
  *
  * are strictly legal only once the FIN has actually been sent
  * (i.e. is in window), rather than merely queued.
  */
1871 tcp_send_fin(sk);
1872 }
1873
1874 sk_stream_wait_close(sk, timeout);
1875
1876adjudge_to_death:
1877 state = sk->sk_state;
1878 sock_hold(sk);
1879 sock_orphan(sk);
1880
1881
1882 release_sock(sk);
1883
1884
1885
1886
1887
1888 local_bh_disable();
1889 bh_lock_sock(sk);
1890 WARN_ON(sock_owned_by_user(sk));
1891
1892 percpu_counter_inc(sk->sk_prot->orphan_count);
1893
1894
1895 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
1896 goto out;
1897
 /* BSD-style deviation from the RFC: the peer could otherwise keep
  * a half-closed socket open forever with no application left on
  * our end.  For FIN_WAIT2 we therefore either re-arm the keepalive
  * timer for the remaining linger2 time or hand the socket over to
  * a timewait-style FIN_WAIT2 with a bounded lifetime (below).
  */
1912 if (sk->sk_state == TCP_FIN_WAIT2) {
1913 struct tcp_sock *tp = tcp_sk(sk);
1914 if (tp->linger2 < 0) {
1915 tcp_set_state(sk, TCP_CLOSE);
1916 tcp_send_active_reset(sk, GFP_ATOMIC);
1917 NET_INC_STATS_BH(sock_net(sk),
1918 LINUX_MIB_TCPABORTONLINGER);
1919 } else {
1920 const int tmo = tcp_fin_time(sk);
1921
1922 if (tmo > TCP_TIMEWAIT_LEN) {
1923 inet_csk_reset_keepalive_timer(sk,
1924 tmo - TCP_TIMEWAIT_LEN);
1925 } else {
1926 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1927 goto out;
1928 }
1929 }
1930 }
1931 if (sk->sk_state != TCP_CLOSE) {
1932 int orphan_count = percpu_counter_read_positive(
1933 sk->sk_prot->orphan_count);
1934
1935 sk_mem_reclaim(sk);
1936 if (tcp_too_many_orphans(sk, orphan_count)) {
1937 if (net_ratelimit())
1938 printk(KERN_INFO "TCP: too many of orphaned "
1939 "sockets\n");
1940 tcp_set_state(sk, TCP_CLOSE);
1941 tcp_send_active_reset(sk, GFP_ATOMIC);
1942 NET_INC_STATS_BH(sock_net(sk),
1943 LINUX_MIB_TCPABORTONMEMORY);
1944 }
1945 }
1946
1947 if (sk->sk_state == TCP_CLOSE)
1948 inet_csk_destroy_sock(sk);
1949
1950
1951out:
1952 bh_unlock_sock(sk);
1953 local_bh_enable();
1954 sock_put(sk);
1955}
1956
/* These states need RST on ABORT according to RFC793 */
1959static inline int tcp_need_reset(int state)
1960{
1961 return (1 << state) &
1962 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1963 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1964}
1965
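/*
 * tcp_disconnect() aborts the connection and returns the socket to a
 * fresh, bound-but-unconnected state: a RST is sent where RFC 793 requires
 * one, every queue is purged, and congestion/RTT state is reset so the
 * socket can be reused (for example by connect() after an AF_UNSPEC
 * disconnect).
 */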
1966int tcp_disconnect(struct sock *sk, int flags)
1967{
1968 struct inet_sock *inet = inet_sk(sk);
1969 struct inet_connection_sock *icsk = inet_csk(sk);
1970 struct tcp_sock *tp = tcp_sk(sk);
1971 int err = 0;
1972 int old_state = sk->sk_state;
1973
1974 if (old_state != TCP_CLOSE)
1975 tcp_set_state(sk, TCP_CLOSE);
1976
1977
1978 if (old_state == TCP_LISTEN) {
1979 inet_csk_listen_stop(sk);
1980 } else if (tcp_need_reset(old_state) ||
1981 (tp->snd_nxt != tp->write_seq &&
1982 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1983
1984
1985
1986 tcp_send_active_reset(sk, gfp_any());
1987 sk->sk_err = ECONNRESET;
1988 } else if (old_state == TCP_SYN_SENT)
1989 sk->sk_err = ECONNRESET;
1990
1991 tcp_clear_xmit_timers(sk);
1992 __skb_queue_purge(&sk->sk_receive_queue);
1993 tcp_write_queue_purge(sk);
1994 __skb_queue_purge(&tp->out_of_order_queue);
1995#ifdef CONFIG_NET_DMA
1996 __skb_queue_purge(&sk->sk_async_wait_queue);
1997#endif
1998
1999 inet->dport = 0;
2000
2001 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2002 inet_reset_saddr(sk);
2003
2004 sk->sk_shutdown = 0;
2005 sock_reset_flag(sk, SOCK_DONE);
2006 tp->srtt = 0;
2007 if ((tp->write_seq += tp->max_window + 2) == 0)
2008 tp->write_seq = 1;
2009 icsk->icsk_backoff = 0;
2010 tp->snd_cwnd = 2;
2011 icsk->icsk_probes_out = 0;
2012 tp->packets_out = 0;
2013 tp->snd_ssthresh = 0x7fffffff;
2014 tp->snd_cwnd_cnt = 0;
2015 tp->bytes_acked = 0;
2016 tcp_set_ca_state(sk, TCP_CA_Open);
2017 tcp_clear_retrans(tp);
2018 inet_csk_delack_init(sk);
2019 tcp_init_send_head(sk);
2020 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2021 __sk_dst_reset(sk);
2022
2023 WARN_ON(inet->num && !icsk->icsk_bind_hash);
2024
2025 sk->sk_error_report(sk);
2026 return err;
2027}
2028
/*
 *	Socket option code for TCP.
 */
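/*
 * A minimal (hypothetical) userspace sketch of the cork/uncork pattern
 * that the TCP_CORK and TCP_NODELAY options below implement:
 *
 *	int on = 1, off = 0;
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
 *	write(fd, hdr, hdr_len);		// queued, no partial frame sent
 *	sendfile(fd, file_fd, NULL, file_len);	// coalesced with the header
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off)); // flush
 */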
2032static int do_tcp_setsockopt(struct sock *sk, int level,
2033 int optname, char __user *optval, int optlen)
2034{
2035 struct tcp_sock *tp = tcp_sk(sk);
2036 struct inet_connection_sock *icsk = inet_csk(sk);
2037 int val;
2038 int err = 0;
2039
2040
2041 if (optname == TCP_CONGESTION) {
2042 char name[TCP_CA_NAME_MAX];
2043
2044 if (optlen < 1)
2045 return -EINVAL;
2046
2047 val = strncpy_from_user(name, optval,
2048 min(TCP_CA_NAME_MAX-1, optlen));
2049 if (val < 0)
2050 return -EFAULT;
2051 name[val] = 0;
2052
2053 lock_sock(sk);
2054 err = tcp_set_congestion_control(sk, name);
2055 release_sock(sk);
2056 return err;
2057 }
2058
2059 if (optlen < sizeof(int))
2060 return -EINVAL;
2061
2062 if (get_user(val, (int __user *)optval))
2063 return -EFAULT;
2064
2065 lock_sock(sk);
2066
2067 switch (optname) {
2068 case TCP_MAXSEG:
2069
2070
2071
2072 if (val < 8 || val > MAX_TCP_WINDOW) {
2073 err = -EINVAL;
2074 break;
2075 }
2076 tp->rx_opt.user_mss = val;
2077 break;
2078
2079 case TCP_NODELAY:
2080 if (val) {
 /* TCP_NODELAY is weaker than TCP_CORK, so that
  * this option on a corked socket is remembered, but
  * it is not activated until the cork is cleared.
  *
  * However, when TCP_NODELAY is set we make
  * an explicit push, which overrides even TCP_CORK
  * for currently queued segments.
  */
2089 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2090 tcp_push_pending_frames(sk);
2091 } else {
2092 tp->nonagle &= ~TCP_NAGLE_OFF;
2093 }
2094 break;
2095
2096 case TCP_CORK:
 /* When set indicates to always queue non-full frames.
  * Later the user clears this option and we transmit
  * any pending partial frames in the queue.  This is
  * meant to be used alongside sendfile() to get properly
  * filled frames when the user (for example) must write
  * out headers with a write() call first and then use
  * sendfile to send out the data parts.
  *
  * TCP_CORK can be set together with TCP_NODELAY and it is
  * stronger than TCP_NODELAY.
  */
2108 if (val) {
2109 tp->nonagle |= TCP_NAGLE_CORK;
2110 } else {
2111 tp->nonagle &= ~TCP_NAGLE_CORK;
2112 if (tp->nonagle&TCP_NAGLE_OFF)
2113 tp->nonagle |= TCP_NAGLE_PUSH;
2114 tcp_push_pending_frames(sk);
2115 }
2116 break;
2117
2118 case TCP_KEEPIDLE:
2119 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2120 err = -EINVAL;
2121 else {
2122 tp->keepalive_time = val * HZ;
2123 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2124 !((1 << sk->sk_state) &
2125 (TCPF_CLOSE | TCPF_LISTEN))) {
2126 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2127 if (tp->keepalive_time > elapsed)
2128 elapsed = tp->keepalive_time - elapsed;
2129 else
2130 elapsed = 0;
2131 inet_csk_reset_keepalive_timer(sk, elapsed);
2132 }
2133 }
2134 break;
2135 case TCP_KEEPINTVL:
2136 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2137 err = -EINVAL;
2138 else
2139 tp->keepalive_intvl = val * HZ;
2140 break;
2141 case TCP_KEEPCNT:
2142 if (val < 1 || val > MAX_TCP_KEEPCNT)
2143 err = -EINVAL;
2144 else
2145 tp->keepalive_probes = val;
2146 break;
2147 case TCP_SYNCNT:
2148 if (val < 1 || val > MAX_TCP_SYNCNT)
2149 err = -EINVAL;
2150 else
2151 icsk->icsk_syn_retries = val;
2152 break;
2153
2154 case TCP_LINGER2:
2155 if (val < 0)
2156 tp->linger2 = -1;
2157 else if (val > sysctl_tcp_fin_timeout / HZ)
2158 tp->linger2 = 0;
2159 else
2160 tp->linger2 = val * HZ;
2161 break;
2162
2163 case TCP_DEFER_ACCEPT:
2164 icsk->icsk_accept_queue.rskq_defer_accept = 0;
2165 if (val > 0) {
 /* Translate value in seconds to number of
  * retransmits */
2168 while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
2169 val > ((TCP_TIMEOUT_INIT / HZ) <<
2170 icsk->icsk_accept_queue.rskq_defer_accept))
2171 icsk->icsk_accept_queue.rskq_defer_accept++;
2172 icsk->icsk_accept_queue.rskq_defer_accept++;
2173 }
2174 break;
2175
2176 case TCP_WINDOW_CLAMP:
2177 if (!val) {
2178 if (sk->sk_state != TCP_CLOSE) {
2179 err = -EINVAL;
2180 break;
2181 }
2182 tp->window_clamp = 0;
2183 } else
2184 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2185 SOCK_MIN_RCVBUF / 2 : val;
2186 break;
2187
2188 case TCP_QUICKACK:
2189 if (!val) {
2190 icsk->icsk_ack.pingpong = 1;
2191 } else {
2192 icsk->icsk_ack.pingpong = 0;
2193 if ((1 << sk->sk_state) &
2194 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2195 inet_csk_ack_scheduled(sk)) {
2196 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2197 tcp_cleanup_rbuf(sk, 1);
2198 if (!(val & 1))
2199 icsk->icsk_ack.pingpong = 1;
2200 }
2201 }
2202 break;
2203
2204#ifdef CONFIG_TCP_MD5SIG
2205 case TCP_MD5SIG:
2206
2207 err = tp->af_specific->md5_parse(sk, optval, optlen);
2208 break;
2209#endif
2210
2211 default:
2212 err = -ENOPROTOOPT;
2213 break;
2214 }
2215
2216 release_sock(sk);
2217 return err;
2218}
2219
2220int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2221 int optlen)
2222{
2223 struct inet_connection_sock *icsk = inet_csk(sk);
2224
2225 if (level != SOL_TCP)
2226 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
2227 optval, optlen);
2228 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2229}
2230
2231#ifdef CONFIG_COMPAT
2232int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2233 char __user *optval, int optlen)
2234{
2235 if (level != SOL_TCP)
2236 return inet_csk_compat_setsockopt(sk, level, optname,
2237 optval, optlen);
2238 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2239}
2240
2241EXPORT_SYMBOL(compat_tcp_setsockopt);
2242#endif
2243
2244
2245void tcp_get_info(struct sock *sk, struct tcp_info *info)
2246{
2247 struct tcp_sock *tp = tcp_sk(sk);
2248 const struct inet_connection_sock *icsk = inet_csk(sk);
2249 u32 now = tcp_time_stamp;
2250
2251 memset(info, 0, sizeof(*info));
2252
2253 info->tcpi_state = sk->sk_state;
2254 info->tcpi_ca_state = icsk->icsk_ca_state;
2255 info->tcpi_retransmits = icsk->icsk_retransmits;
2256 info->tcpi_probes = icsk->icsk_probes_out;
2257 info->tcpi_backoff = icsk->icsk_backoff;
2258
2259 if (tp->rx_opt.tstamp_ok)
2260 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2261 if (tcp_is_sack(tp))
2262 info->tcpi_options |= TCPI_OPT_SACK;
2263 if (tp->rx_opt.wscale_ok) {
2264 info->tcpi_options |= TCPI_OPT_WSCALE;
2265 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2266 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2267 }
2268
2269 if (tp->ecn_flags&TCP_ECN_OK)
2270 info->tcpi_options |= TCPI_OPT_ECN;
2271
2272 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2273 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2274 info->tcpi_snd_mss = tp->mss_cache;
2275 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2276
2277 if (sk->sk_state == TCP_LISTEN) {
2278 info->tcpi_unacked = sk->sk_ack_backlog;
2279 info->tcpi_sacked = sk->sk_max_ack_backlog;
2280 } else {
2281 info->tcpi_unacked = tp->packets_out;
2282 info->tcpi_sacked = tp->sacked_out;
2283 }
2284 info->tcpi_lost = tp->lost_out;
2285 info->tcpi_retrans = tp->retrans_out;
2286 info->tcpi_fackets = tp->fackets_out;
2287
2288 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2289 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2290 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2291
2292 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2293 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2294 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2295 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2296 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2297 info->tcpi_snd_cwnd = tp->snd_cwnd;
2298 info->tcpi_advmss = tp->advmss;
2299 info->tcpi_reordering = tp->reordering;
2300
2301 info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2302 info->tcpi_rcv_space = tp->rcvq_space.space;
2303
2304 info->tcpi_total_retrans = tp->total_retrans;
2305}
2306
2307EXPORT_SYMBOL_GPL(tcp_get_info);
2308
2309static int do_tcp_getsockopt(struct sock *sk, int level,
2310 int optname, char __user *optval, int __user *optlen)
2311{
2312 struct inet_connection_sock *icsk = inet_csk(sk);
2313 struct tcp_sock *tp = tcp_sk(sk);
2314 int val, len;
2315
2316 if (get_user(len, optlen))
2317 return -EFAULT;
2318
2319 len = min_t(unsigned int, len, sizeof(int));
2320
2321 if (len < 0)
2322 return -EINVAL;
2323
2324 switch (optname) {
2325 case TCP_MAXSEG:
2326 val = tp->mss_cache;
2327 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2328 val = tp->rx_opt.user_mss;
2329 break;
2330 case TCP_NODELAY:
2331 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2332 break;
2333 case TCP_CORK:
2334 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2335 break;
2336 case TCP_KEEPIDLE:
2337 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2338 break;
2339 case TCP_KEEPINTVL:
2340 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2341 break;
2342 case TCP_KEEPCNT:
2343 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2344 break;
2345 case TCP_SYNCNT:
2346 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2347 break;
2348 case TCP_LINGER2:
2349 val = tp->linger2;
2350 if (val >= 0)
2351 val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2352 break;
2353 case TCP_DEFER_ACCEPT:
2354 val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
2355 ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
2356 break;
2357 case TCP_WINDOW_CLAMP:
2358 val = tp->window_clamp;
2359 break;
2360 case TCP_INFO: {
2361 struct tcp_info info;
2362
2363 if (get_user(len, optlen))
2364 return -EFAULT;
2365
2366 tcp_get_info(sk, &info);
2367
2368 len = min_t(unsigned int, len, sizeof(info));
2369 if (put_user(len, optlen))
2370 return -EFAULT;
2371 if (copy_to_user(optval, &info, len))
2372 return -EFAULT;
2373 return 0;
2374 }
2375 case TCP_QUICKACK:
2376 val = !icsk->icsk_ack.pingpong;
2377 break;
2378
2379 case TCP_CONGESTION:
2380 if (get_user(len, optlen))
2381 return -EFAULT;
2382 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2383 if (put_user(len, optlen))
2384 return -EFAULT;
2385 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2386 return -EFAULT;
2387 return 0;
2388 default:
2389 return -ENOPROTOOPT;
2390 }
2391
2392 if (put_user(len, optlen))
2393 return -EFAULT;
2394 if (copy_to_user(optval, &val, len))
2395 return -EFAULT;
2396 return 0;
2397}
2398
2399int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2400 int __user *optlen)
2401{
2402 struct inet_connection_sock *icsk = inet_csk(sk);
2403
2404 if (level != SOL_TCP)
2405 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
2406 optval, optlen);
2407 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2408}
2409
2410#ifdef CONFIG_COMPAT
2411int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2412 char __user *optval, int __user *optlen)
2413{
2414 if (level != SOL_TCP)
2415 return inet_csk_compat_getsockopt(sk, level, optname,
2416 optval, optlen);
2417 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2418}
2419
2420EXPORT_SYMBOL(compat_tcp_getsockopt);
2421#endif
2422
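/*
 * tcp_tso_segment() splits an oversized GSO skb into MSS-sized segments in
 * software: skb_segment() does the actual split, then the sequence number,
 * FIN/PSH/CWR flags and checksum of every resulting header are fixed up
 * (the checksum via an incremental update with the length delta).  Packets
 * that the device or GSO_ROBUST path can handle are passed through with
 * only gso_segs set.
 */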
2423struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
2424{
2425 struct sk_buff *segs = ERR_PTR(-EINVAL);
2426 struct tcphdr *th;
2427 unsigned thlen;
2428 unsigned int seq;
2429 __be32 delta;
2430 unsigned int oldlen;
2431 unsigned int mss;
2432
2433 if (!pskb_may_pull(skb, sizeof(*th)))
2434 goto out;
2435
2436 th = tcp_hdr(skb);
2437 thlen = th->doff * 4;
2438 if (thlen < sizeof(*th))
2439 goto out;
2440
2441 if (!pskb_may_pull(skb, thlen))
2442 goto out;
2443
2444 oldlen = (u16)~skb->len;
2445 __skb_pull(skb, thlen);
2446
2447 mss = skb_shinfo(skb)->gso_size;
2448 if (unlikely(skb->len <= mss))
2449 goto out;
2450
2451 if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
 /* Packet is from an untrusted source, reset gso_segs. */
2453 int type = skb_shinfo(skb)->gso_type;
2454
2455 if (unlikely(type &
2456 ~(SKB_GSO_TCPV4 |
2457 SKB_GSO_DODGY |
2458 SKB_GSO_TCP_ECN |
2459 SKB_GSO_TCPV6 |
2460 0) ||
2461 !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
2462 goto out;
2463
2464 skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
2465
2466 segs = NULL;
2467 goto out;
2468 }
2469
2470 segs = skb_segment(skb, features);
2471 if (IS_ERR(segs))
2472 goto out;
2473
2474 delta = htonl(oldlen + (thlen + mss));
2475
2476 skb = segs;
2477 th = tcp_hdr(skb);
2478 seq = ntohl(th->seq);
2479
2480 do {
2481 th->fin = th->psh = 0;
2482
2483 th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
2484 (__force u32)delta));
2485 if (skb->ip_summed != CHECKSUM_PARTIAL)
2486 th->check =
2487 csum_fold(csum_partial(skb_transport_header(skb),
2488 thlen, skb->csum));
2489
2490 seq += mss;
2491 skb = skb->next;
2492 th = tcp_hdr(skb);
2493
2494 th->seq = htonl(seq);
2495 th->cwr = 0;
2496 } while (skb->next);
2497
2498 delta = htonl(oldlen + (skb->tail - skb->transport_header) +
2499 skb->data_len);
2500 th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
2501 (__force u32)delta));
2502 if (skb->ip_summed != CHECKSUM_PARTIAL)
2503 th->check = csum_fold(csum_partial(skb_transport_header(skb),
2504 thlen, skb->csum));
2505
2506out:
2507 return segs;
2508}
2509EXPORT_SYMBOL(tcp_tso_segment);
2510
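/*
 * GRO receive: try to merge this skb into an already-held packet of the
 * same flow.  Candidates must match on ports, carry identical TCP options
 * and ack/window fields, and be exactly in sequence; anything unusual
 * (CWR, URG/SYN/FIN/RST/PSH, a short or over-MSS payload) sets "flush" so
 * the aggregate is pushed up the stack instead of growing further.
 */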
2511struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2512{
2513 struct sk_buff **pp = NULL;
2514 struct sk_buff *p;
2515 struct tcphdr *th;
2516 struct tcphdr *th2;
2517 unsigned int len;
2518 unsigned int thlen;
2519 unsigned int flags;
2520 unsigned int mss = 1;
2521 int flush = 1;
2522 int i;
2523
2524 th = skb_gro_header(skb, sizeof(*th));
2525 if (unlikely(!th))
2526 goto out;
2527
2528 thlen = th->doff * 4;
2529 if (thlen < sizeof(*th))
2530 goto out;
2531
2532 th = skb_gro_header(skb, thlen);
2533 if (unlikely(!th))
2534 goto out;
2535
2536 skb_gro_pull(skb, thlen);
2537
2538 len = skb_gro_len(skb);
2539 flags = tcp_flag_word(th);
2540
2541 for (; (p = *head); head = &p->next) {
2542 if (!NAPI_GRO_CB(p)->same_flow)
2543 continue;
2544
2545 th2 = tcp_hdr(p);
2546
2547 if ((th->source ^ th2->source) | (th->dest ^ th2->dest)) {
2548 NAPI_GRO_CB(p)->same_flow = 0;
2549 continue;
2550 }
2551
2552 goto found;
2553 }
2554
2555 goto out_check_final;
2556
2557found:
2558 flush = NAPI_GRO_CB(p)->flush;
2559 flush |= flags & TCP_FLAG_CWR;
2560 flush |= (flags ^ tcp_flag_word(th2)) &
2561 ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH);
2562 flush |= (th->ack_seq ^ th2->ack_seq) | (th->window ^ th2->window);
2563 for (i = sizeof(*th); !flush && i < thlen; i += 4)
2564 flush |= *(u32 *)((u8 *)th + i) ^
2565 *(u32 *)((u8 *)th2 + i);
2566
2567 mss = skb_shinfo(p)->gso_size;
2568
2569 flush |= (len > mss) | !len;
2570 flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
2571
2572 if (flush || skb_gro_receive(head, skb)) {
2573 mss = 1;
2574 goto out_check_final;
2575 }
2576
2577 p = *head;
2578 th2 = tcp_hdr(p);
2579 tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
2580
2581out_check_final:
2582 flush = len < mss;
2583 flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST |
2584 TCP_FLAG_SYN | TCP_FLAG_FIN);
2585
2586 if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
2587 pp = head;
2588
2589out:
2590 NAPI_GRO_CB(skb)->flush |= flush;
2591
2592 return pp;
2593}
2594EXPORT_SYMBOL(tcp_gro_receive);
2595
2596int tcp_gro_complete(struct sk_buff *skb)
2597{
2598 struct tcphdr *th = tcp_hdr(skb);
2599
2600 skb->csum_start = skb_transport_header(skb) - skb->head;
2601 skb->csum_offset = offsetof(struct tcphdr, check);
2602 skb->ip_summed = CHECKSUM_PARTIAL;
2603
2604 skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
2605
2606 if (th->cwr)
2607 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
2608
2609 return 0;
2610}
2611EXPORT_SYMBOL(tcp_gro_complete);
2612
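/*
 * TCP-MD5 (RFC 2385) signature support: a per-cpu pool of "md5"
 * crypto_hash transforms is allocated on first use and reference-counted
 * via tcp_md5sig_users, so the expensive tfm allocation happens once
 * rather than per packet.  The helpers below feed the TCP header (with a
 * zeroed checksum), the payload pages and the key into the running hash.
 */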
2613#ifdef CONFIG_TCP_MD5SIG
2614static unsigned long tcp_md5sig_users;
2615static struct tcp_md5sig_pool **tcp_md5sig_pool;
2616static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
2617
2618static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
2619{
2620 int cpu;
2621 for_each_possible_cpu(cpu) {
2622 struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu);
2623 if (p) {
2624 if (p->md5_desc.tfm)
2625 crypto_free_hash(p->md5_desc.tfm);
2626 kfree(p);
2627 p = NULL;
2628 }
2629 }
2630 free_percpu(pool);
2631}
2632
2633void tcp_free_md5sig_pool(void)
2634{
2635 struct tcp_md5sig_pool **pool = NULL;
2636
2637 spin_lock_bh(&tcp_md5sig_pool_lock);
2638 if (--tcp_md5sig_users == 0) {
2639 pool = tcp_md5sig_pool;
2640 tcp_md5sig_pool = NULL;
2641 }
2642 spin_unlock_bh(&tcp_md5sig_pool_lock);
2643 if (pool)
2644 __tcp_free_md5sig_pool(pool);
2645}
2646
2647EXPORT_SYMBOL(tcp_free_md5sig_pool);
2648
2649static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void)
2650{
2651 int cpu;
2652 struct tcp_md5sig_pool **pool;
2653
2654 pool = alloc_percpu(struct tcp_md5sig_pool *);
2655 if (!pool)
2656 return NULL;
2657
2658 for_each_possible_cpu(cpu) {
2659 struct tcp_md5sig_pool *p;
2660 struct crypto_hash *hash;
2661
2662 p = kzalloc(sizeof(*p), GFP_KERNEL);
2663 if (!p)
2664 goto out_free;
2665 *per_cpu_ptr(pool, cpu) = p;
2666
2667 hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
2668 if (!hash || IS_ERR(hash))
2669 goto out_free;
2670
2671 p->md5_desc.tfm = hash;
2672 }
2673 return pool;
2674out_free:
2675 __tcp_free_md5sig_pool(pool);
2676 return NULL;
2677}
2678
2679struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(void)
2680{
2681 struct tcp_md5sig_pool **pool;
2682 int alloc = 0;
2683
2684retry:
2685 spin_lock_bh(&tcp_md5sig_pool_lock);
2686 pool = tcp_md5sig_pool;
2687 if (tcp_md5sig_users++ == 0) {
2688 alloc = 1;
2689 spin_unlock_bh(&tcp_md5sig_pool_lock);
2690 } else if (!pool) {
2691 tcp_md5sig_users--;
2692 spin_unlock_bh(&tcp_md5sig_pool_lock);
2693 cpu_relax();
2694 goto retry;
2695 } else
2696 spin_unlock_bh(&tcp_md5sig_pool_lock);
2697
2698 if (alloc) {
2699
2700 struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool();
2701 spin_lock_bh(&tcp_md5sig_pool_lock);
2702 if (!p) {
2703 tcp_md5sig_users--;
2704 spin_unlock_bh(&tcp_md5sig_pool_lock);
2705 return NULL;
2706 }
2707 pool = tcp_md5sig_pool;
2708 if (pool) {
2709
2710 spin_unlock_bh(&tcp_md5sig_pool_lock);
2711 __tcp_free_md5sig_pool(p);
2712 } else {
2713 tcp_md5sig_pool = pool = p;
2714 spin_unlock_bh(&tcp_md5sig_pool_lock);
2715 }
2716 }
2717 return pool;
2718}
2719
2720EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
2721
2722struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu)
2723{
2724 struct tcp_md5sig_pool **p;
2725 spin_lock_bh(&tcp_md5sig_pool_lock);
2726 p = tcp_md5sig_pool;
2727 if (p)
2728 tcp_md5sig_users++;
2729 spin_unlock_bh(&tcp_md5sig_pool_lock);
2730 return (p ? *per_cpu_ptr(p, cpu) : NULL);
2731}
2732
2733EXPORT_SYMBOL(__tcp_get_md5sig_pool);
2734
2735void __tcp_put_md5sig_pool(void)
2736{
2737 tcp_free_md5sig_pool();
2738}
2739
2740EXPORT_SYMBOL(__tcp_put_md5sig_pool);
2741
2742int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
2743 struct tcphdr *th)
2744{
2745 struct scatterlist sg;
2746 int err;
2747
2748 __sum16 old_checksum = th->check;
2749 th->check = 0;
2750
2751 sg_init_one(&sg, th, sizeof(struct tcphdr));
2752 err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr));
2753 th->check = old_checksum;
2754 return err;
2755}
2756
2757EXPORT_SYMBOL(tcp_md5_hash_header);
2758
2759int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
2760 struct sk_buff *skb, unsigned header_len)
2761{
2762 struct scatterlist sg;
2763 const struct tcphdr *tp = tcp_hdr(skb);
2764 struct hash_desc *desc = &hp->md5_desc;
2765 unsigned i;
2766 const unsigned head_data_len = skb_headlen(skb) > header_len ?
2767 skb_headlen(skb) - header_len : 0;
2768 const struct skb_shared_info *shi = skb_shinfo(skb);
2769
2770 sg_init_table(&sg, 1);
2771
2772 sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
2773 if (crypto_hash_update(desc, &sg, head_data_len))
2774 return 1;
2775
2776 for (i = 0; i < shi->nr_frags; ++i) {
2777 const struct skb_frag_struct *f = &shi->frags[i];
2778 sg_set_page(&sg, f->page, f->size, f->page_offset);
2779 if (crypto_hash_update(desc, &sg, f->size))
2780 return 1;
2781 }
2782
2783 return 0;
2784}
2785
2786EXPORT_SYMBOL(tcp_md5_hash_skb_data);
2787
2788int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
2789{
2790 struct scatterlist sg;
2791
2792 sg_init_one(&sg, key->key, key->keylen);
2793 return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
2794}
2795
2796EXPORT_SYMBOL(tcp_md5_hash_key);
2797
2798#endif
2799
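/*
 * tcp_done() finishes off a connection that will not be used again:
 * failed connection attempts are counted, timers are stopped, and the
 * socket is either destroyed immediately (if already orphaned) or its
 * owner is woken up to notice the state change.
 */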
2800void tcp_done(struct sock *sk)
2801{
2802 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
2803 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
2804
2805 tcp_set_state(sk, TCP_CLOSE);
2806 tcp_clear_xmit_timers(sk);
2807
2808 sk->sk_shutdown = SHUTDOWN_MASK;
2809
2810 if (!sock_flag(sk, SOCK_DEAD))
2811 sk->sk_state_change(sk);
2812 else
2813 inet_csk_destroy_sock(sk);
2814}
2815EXPORT_SYMBOL_GPL(tcp_done);
2816
2817extern struct tcp_congestion_ops tcp_reno;
2818
2819static __initdata unsigned long thash_entries;
2820static int __init set_thash_entries(char *str)
2821{
2822 if (!str)
2823 return 0;
2824 thash_entries = simple_strtoul(str, &str, 0);
2825 return 1;
2826}
2827__setup("thash_entries=", set_thash_entries);
2828
2829void __init tcp_init(void)
2830{
2831 struct sk_buff *skb = NULL;
2832 unsigned long nr_pages, limit;
2833 int order, i, max_share;
2834
2835 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
2836
2837 percpu_counter_init(&tcp_sockets_allocated, 0);
2838 percpu_counter_init(&tcp_orphan_count, 0);
2839 tcp_hashinfo.bind_bucket_cachep =
2840 kmem_cache_create("tcp_bind_bucket",
2841 sizeof(struct inet_bind_bucket), 0,
2842 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2843
2844
2845
2846
2847
2848
2849 tcp_hashinfo.ehash =
2850 alloc_large_system_hash("TCP established",
2851 sizeof(struct inet_ehash_bucket),
2852 thash_entries,
2853 (num_physpages >= 128 * 1024) ?
2854 13 : 15,
2855 0,
2856 &tcp_hashinfo.ehash_size,
2857 NULL,
2858 thash_entries ? 0 : 512 * 1024);
2859 tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
2860 for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
2861 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
2862 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
2863 }
2864 if (inet_ehash_locks_alloc(&tcp_hashinfo))
2865 panic("TCP: failed to alloc ehash_locks");
2866 tcp_hashinfo.bhash =
2867 alloc_large_system_hash("TCP bind",
2868 sizeof(struct inet_bind_hashbucket),
2869 tcp_hashinfo.ehash_size,
2870 (num_physpages >= 128 * 1024) ?
2871 13 : 15,
2872 0,
2873 &tcp_hashinfo.bhash_size,
2874 NULL,
2875 64 * 1024);
2876 tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2877 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2878 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2879 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2880 }
2881
2882
2883
2884
2885 for (order = 0; ((1 << order) << PAGE_SHIFT) <
2886 (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2887 order++)
2888 ;
2889 if (order >= 4) {
2890 tcp_death_row.sysctl_max_tw_buckets = 180000;
2891 sysctl_tcp_max_orphans = 4096 << (order - 4);
2892 sysctl_max_syn_backlog = 1024;
2893 } else if (order < 3) {
2894 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2895 sysctl_tcp_max_orphans >>= (3 - order);
2896 sysctl_max_syn_backlog = 128;
2897 }
2898
 /* Size the TCP memory-pressure thresholds from the amount of low
  * memory: the pressure point is a fraction of total pages (capped by
  * the 256 MB term below) and never drops below 128 pages.
  */
2903 nr_pages = totalram_pages - totalhigh_pages;
2904 limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
2905 limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
2906 limit = max(limit, 128UL);
2907 sysctl_tcp_mem[0] = limit / 4 * 3;
2908 sysctl_tcp_mem[1] = limit;
2909 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
2910
2911
2912 limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
2913 max_share = min(4UL*1024*1024, limit);
2914
2915 sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
2916 sysctl_tcp_wmem[1] = 16*1024;
2917 sysctl_tcp_wmem[2] = max(64*1024, max_share);
2918
2919 sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
2920 sysctl_tcp_rmem[1] = 87380;
2921 sysctl_tcp_rmem[2] = max(87380, max_share);
2922
2923 printk(KERN_INFO "TCP: Hash tables configured "
2924 "(established %d bind %d)\n",
2925 tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size);
2926
2927 tcp_register_congestion_control(&tcp_reno);
2928}
2929
2930EXPORT_SYMBOL(tcp_close);
2931EXPORT_SYMBOL(tcp_disconnect);
2932EXPORT_SYMBOL(tcp_getsockopt);
2933EXPORT_SYMBOL(tcp_ioctl);
2934EXPORT_SYMBOL(tcp_poll);
2935EXPORT_SYMBOL(tcp_read_sock);
2936EXPORT_SYMBOL(tcp_recvmsg);
2937EXPORT_SYMBOL(tcp_sendmsg);
2938EXPORT_SYMBOL(tcp_splice_read);
2939EXPORT_SYMBOL(tcp_sendpage);
2940EXPORT_SYMBOL(tcp_setsockopt);
2941EXPORT_SYMBOL(tcp_shutdown);
2942