1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248#define pr_fmt(fmt) "TCP: " fmt
249
250#include <linux/kernel.h>
251#include <linux/module.h>
252#include <linux/types.h>
253#include <linux/fcntl.h>
254#include <linux/poll.h>
255#include <linux/init.h>
256#include <linux/fs.h>
257#include <linux/skbuff.h>
258#include <linux/scatterlist.h>
259#include <linux/splice.h>
260#include <linux/net.h>
261#include <linux/socket.h>
262#include <linux/random.h>
263#include <linux/bootmem.h>
264#include <linux/highmem.h>
265#include <linux/swap.h>
266#include <linux/cache.h>
267#include <linux/err.h>
268#include <linux/crypto.h>
269#include <linux/time.h>
270#include <linux/slab.h>
271
272#include <net/icmp.h>
273#include <net/inet_common.h>
274#include <net/tcp.h>
275#include <net/xfrm.h>
276#include <net/ip.h>
277#include <net/netdma.h>
278#include <net/sock.h>
279
280#include <asm/uaccess.h>
281#include <asm/ioctls.h>
282
/*
 * Tunable initialised to TCP_FIN_TIMEOUT.  It is not referenced in the
 * part of the file reviewed here.
 * NOTE(review): presumably the FIN-WAIT-2 lifetime - confirm at the users.
 */
int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;

/*
 * Per-CPU counter exported (GPL-only) to other modules.
 * NOTE(review): presumably counts orphaned sockets - the updaters are not
 * visible here.
 */
struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

/*
 * Three-element send / receive buffer tunables.  tcp_init_sock() uses
 * element [1] as the initial sk_sndbuf / sk_rcvbuf of a new socket.
 * NOTE(review): elements [0] and [2] are presumably the min and max -
 * confirm against the sysctl table.
 */
int sysctl_tcp_wmem[3] __read_mostly;
int sysctl_tcp_rmem[3] __read_mostly;

EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);

/*
 * Global atomic counter exported to other modules.
 * NOTE(review): presumably the amount of memory charged to TCP sockets -
 * units and updaters are not visible here.
 */
atomic_long_t tcp_memory_allocated;
EXPORT_SYMBOL(tcp_memory_allocated);

/*
 * Per-CPU counter exported to other modules.
 * NOTE(review): presumably bumped through sk_sockets_allocated_inc() as
 * called from tcp_init_sock() - confirm.
 */
struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);
302
303
304
305
/*
 * Per-call state of tcp_splice_read().  A pointer to it travels to the
 * read actor (tcp_splice_data_recv) through read_descriptor_t.arg.data.
 */
struct tcp_splice_state {
 struct pipe_inode_info *pipe; /* destination pipe given to skb_splice_bits() */
 size_t len; /* bytes still to be spliced */
 unsigned int flags; /* splice flags forwarded to skb_splice_bits() */
};
311
312
313
314
315
316
317
318int tcp_memory_pressure __read_mostly;
319EXPORT_SYMBOL(tcp_memory_pressure);
320
321void tcp_enter_memory_pressure(struct sock *sk)
322{
323 if (!tcp_memory_pressure) {
324 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
325 tcp_memory_pressure = 1;
326 }
327}
328EXPORT_SYMBOL(tcp_enter_memory_pressure);
329
330
/*
 * Convert a duration in seconds into a number of retransmissions.
 *
 * The first transmission covers @timeout seconds; every further
 * retransmission doubles the timeout, capped at @rto_max.  Returns the
 * smallest count whose accumulated period is not shorter than @seconds,
 * saturating at 255, or 0 when @seconds is not positive.
 */
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 count;
	int elapsed;

	if (seconds <= 0)
		return 0;

	elapsed = timeout;
	for (count = 1; count < 255 && seconds > elapsed; count++) {
		/* exponential backoff, bounded by rto_max */
		timeout <<= 1;
		if (timeout > rto_max)
			timeout = rto_max;
		elapsed += timeout;
	}

	return count;
}
349
350
/*
 * Convert a number of retransmissions into the time they span.
 *
 * The first attempt contributes @timeout; each following attempt
 * contributes the previous timeout doubled and capped at @rto_max.
 * Returns 0 when @retrans is 0.
 */
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int total;
	u8 left;

	if (retrans == 0)
		return 0;

	total = timeout;
	for (left = retrans - 1; left != 0; left--) {
		timeout <<= 1;
		if (timeout > rto_max)
			timeout = rto_max;
		total += timeout;
	}

	return total;
}
366
367
368
369
370
371
372void tcp_init_sock(struct sock *sk)
373{
374 struct inet_connection_sock *icsk = inet_csk(sk);
375 struct tcp_sock *tp = tcp_sk(sk);
376
377 skb_queue_head_init(&tp->out_of_order_queue);
378 tcp_init_xmit_timers(sk);
379 tcp_prequeue_init(tp);
380 INIT_LIST_HEAD(&tp->tsq_node);
381
382 icsk->icsk_rto = TCP_TIMEOUT_INIT;
383 tp->mdev = TCP_TIMEOUT_INIT;
384
385
386
387
388
389
390 tp->snd_cwnd = TCP_INIT_CWND;
391
392
393
394
395 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
396 tp->snd_cwnd_clamp = ~0;
397 tp->mss_cache = TCP_MSS_DEFAULT;
398
399 tp->reordering = sysctl_tcp_reordering;
400 tcp_enable_early_retrans(tp);
401 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
402
403 sk->sk_state = TCP_CLOSE;
404
405 sk->sk_write_space = sk_stream_write_space;
406 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
407
408 icsk->icsk_sync_mss = tcp_sync_mss;
409
410
411 if (sysctl_tcp_cookie_size > 0) {
412
413 tp->cookie_values =
414 kzalloc(sizeof(*tp->cookie_values),
415 sk->sk_allocation);
416 if (tp->cookie_values != NULL)
417 kref_init(&tp->cookie_values->kref);
418 }
419
420
421
422
423 sk->sk_sndbuf = sysctl_tcp_wmem[1];
424 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
425
426 local_bh_disable();
427 sock_update_memcg(sk);
428 sk_sockets_allocated_inc(sk);
429 local_bh_enable();
430}
431EXPORT_SYMBOL(tcp_init_sock);
432
433
434
435
436
437
438
439
/*
 * Compute the poll()/select() event mask of a TCP socket.
 *
 * Registers the caller on the socket wait queue and then derives the mask
 * from the current socket state.  The socket lock is not taken in this
 * function, so all fields are sampled locklessly.
 *
 * Returns the result of inet_csk_listen_poll() for listening sockets,
 * otherwise a combination of POLLHUP, POLLIN, POLLRDNORM, POLLRDHUP,
 * POLLOUT, POLLWRNORM, POLLPRI and POLLERR.
 */
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
 unsigned int mask;
 struct sock *sk = sock->sk;
 const struct tcp_sock *tp = tcp_sk(sk);

 sock_poll_wait(file, sk_sleep(sk), wait);
 /* Listening sockets have their own readiness rule. */
 if (sk->sk_state == TCP_LISTEN)
 return inet_csk_listen_poll(sk);

 mask = 0;

 /* Hang-up only when both directions are shut down or the socket is closed. */
 if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
 mask |= POLLHUP;
 /* A shut-down receive side is reported as readable plus POLLRDHUP. */
 if (sk->sk_shutdown & RCV_SHUTDOWN)
 mask |= POLLIN | POLLRDNORM | POLLRDHUP;

 /* Data readiness is only evaluated outside SYN_SENT / SYN_RECV. */
 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 int target = sock_rcvlowat(sk, 0, INT_MAX);

 /*
  * Out-of-band urgent data sitting exactly at the read position
  * raises the threshold by one byte.
  */
 if (tp->urg_seq == tp->copied_seq &&
 !sock_flag(sk, SOCK_URGINLINE) &&
 tp->urg_data)
 target++;

 /* Readable once the unread byte count reaches the low-water mark. */
 if (tp->rcv_nxt - tp->copied_seq >= target)
 mask |= POLLIN | POLLRDNORM;

 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
 mask |= POLLOUT | POLLWRNORM;
 } else {
 set_bit(SOCK_ASYNC_NOSPACE,
 &sk->sk_socket->flags);
 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

 /*
  * Test again after the NOSPACE bits are set: space may
  * have been freed between the first test and set_bit().
  * NOTE(review): presumably the write-space wakeup only
  * fires when SOCK_NOSPACE is set - confirm.
  */
 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
 mask |= POLLOUT | POLLWRNORM;
 }
 } else
 /* Send side shut down: always reported writable. */
 mask |= POLLOUT | POLLWRNORM;

 if (tp->urg_data & TCP_URG_VALID)
 mask |= POLLPRI;
 }

 /*
  * Read barrier before sampling sk_err.
  * NOTE(review): presumably pairs with a write barrier where sk_err is
  * set - not visible here.
  */
 smp_rmb();
 if (sk->sk_err)
 mask |= POLLERR;

 return mask;
}
EXPORT_SYMBOL(tcp_poll);
533
534int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
535{
536 struct tcp_sock *tp = tcp_sk(sk);
537 int answ;
538
539 switch (cmd) {
540 case SIOCINQ:
541 if (sk->sk_state == TCP_LISTEN)
542 return -EINVAL;
543
544 lock_sock(sk);
545 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
546 answ = 0;
547 else if (sock_flag(sk, SOCK_URGINLINE) ||
548 !tp->urg_data ||
549 before(tp->urg_seq, tp->copied_seq) ||
550 !before(tp->urg_seq, tp->rcv_nxt)) {
551 struct sk_buff *skb;
552
553 answ = tp->rcv_nxt - tp->copied_seq;
554
555
556 skb = skb_peek_tail(&sk->sk_receive_queue);
557 if (answ && skb)
558 answ -= tcp_hdr(skb)->fin;
559 } else
560 answ = tp->urg_seq - tp->copied_seq;
561 release_sock(sk);
562 break;
563 case SIOCATMARK:
564 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
565 break;
566 case SIOCOUTQ:
567 if (sk->sk_state == TCP_LISTEN)
568 return -EINVAL;
569
570 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
571 answ = 0;
572 else
573 answ = tp->write_seq - tp->snd_una;
574 break;
575 case SIOCOUTQNSD:
576 if (sk->sk_state == TCP_LISTEN)
577 return -EINVAL;
578
579 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
580 answ = 0;
581 else
582 answ = tp->write_seq - tp->snd_nxt;
583 break;
584 default:
585 return -ENOIOCTLCMD;
586 }
587
588 return put_user(answ, (int __user *)arg);
589}
590EXPORT_SYMBOL(tcp_ioctl);
591
592static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
593{
594 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
595 tp->pushed_seq = tp->write_seq;
596}
597
598static inline bool forced_push(const struct tcp_sock *tp)
599{
600 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
601}
602
603static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
604{
605 struct tcp_sock *tp = tcp_sk(sk);
606 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
607
608 skb->csum = 0;
609 tcb->seq = tcb->end_seq = tp->write_seq;
610 tcb->tcp_flags = TCPHDR_ACK;
611 tcb->sacked = 0;
612 skb_header_release(skb);
613 tcp_add_write_queue_tail(sk, skb);
614 sk->sk_wmem_queued += skb->truesize;
615 sk_mem_charge(sk, skb->truesize);
616 if (tp->nonagle & TCP_NAGLE_PUSH)
617 tp->nonagle &= ~TCP_NAGLE_PUSH;
618}
619
620static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
621{
622 if (flags & MSG_OOB)
623 tp->snd_up = tp->write_seq;
624}
625
626static inline void tcp_push(struct sock *sk, int flags, int mss_now,
627 int nonagle)
628{
629 if (tcp_send_head(sk)) {
630 struct tcp_sock *tp = tcp_sk(sk);
631
632 if (!(flags & MSG_MORE) || forced_push(tp))
633 tcp_mark_push(tp, tcp_write_queue_tail(sk));
634
635 tcp_mark_urg(tp, flags);
636 __tcp_push_pending_frames(sk, mss_now,
637 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
638 }
639}
640
641static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
642 unsigned int offset, size_t len)
643{
644 struct tcp_splice_state *tss = rd_desc->arg.data;
645 int ret;
646
647 ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
648 tss->flags);
649 if (ret > 0)
650 rd_desc->count -= ret;
651 return ret;
652}
653
654static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
655{
656
657 read_descriptor_t rd_desc = {
658 .arg.data = tss,
659 .count = tss->len,
660 };
661
662 return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
663}
664
665
666
667
668
669
670
671
672
673
674
675
676
/*
 * Splice up to @len bytes from a TCP socket into @pipe.
 *
 * @sock:  socket to read from
 * @ppos:  file position; must point to 0, sockets are not seekable
 * @pipe:  destination pipe
 * @len:   maximum number of bytes to move
 * @flags: splice flags, forwarded to skb_splice_bits()
 *
 * Returns the number of bytes spliced when that is non-zero.  Otherwise
 * returns the last status: 0 (e.g. SOCK_DONE or receive shutdown),
 * -ESPIPE for a non-zero *ppos, or a negative error such as -ENOTCONN,
 * -EAGAIN, the pending socket error or the signal errno.
 */
ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 struct pipe_inode_info *pipe, size_t len,
 unsigned int flags)
{
 struct sock *sk = sock->sk;
 struct tcp_splice_state tss = {
 .pipe = pipe,
 .len = len,
 .flags = flags,
 };
 long timeo;
 ssize_t spliced;
 int ret;

 sock_rps_record_flow(sk);

 /* A non-zero offset makes no sense on a socket. */
 if (unlikely(*ppos))
 return -ESPIPE;

 ret = spliced = 0;

 lock_sock(sk);

 /* Receive timeout; zero when the file is O_NONBLOCK. */
 timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
 while (tss.len) {
 ret = __tcp_splice_read(sk, &tss);
 if (ret < 0)
 break;
 else if (!ret) {
 /* Nothing moved this pass: decide between stopping and waiting. */
 if (spliced)
 break;
 if (sock_flag(sk, SOCK_DONE))
 break;
 if (sk->sk_err) {
 ret = sock_error(sk);
 break;
 }
 if (sk->sk_shutdown & RCV_SHUTDOWN)
 break;
 if (sk->sk_state == TCP_CLOSE) {
 /*
  * Closed and not marked SOCK_DONE: report -ENOTCONN.
  * (SOCK_DONE was already handled above.)
  */
 if (!sock_flag(sk, SOCK_DONE))
 ret = -ENOTCONN;
 break;
 }
 if (!timeo) {
 ret = -EAGAIN;
 break;
 }
 sk_wait_data(sk, &timeo);
 if (signal_pending(current)) {
 ret = sock_intr_errno(timeo);
 break;
 }
 continue;
 }
 tss.len -= ret;
 spliced += ret;

 /* Non-blocking callers stop after the first successful pass. */
 if (!timeo)
 break;
 /*
  * Drop and retake the socket lock between passes.
  * NOTE(review): presumably to let queued backlog packets be
  * processed - confirm against release_sock().
  */
 release_sock(sk);
 lock_sock(sk);

 if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
 (sk->sk_shutdown & RCV_SHUTDOWN) ||
 signal_pending(current))
 break;
 }

 release_sock(sk);

 /* Progress takes precedence over any status recorded in ret. */
 if (spliced)
 return spliced;

 return ret;
}
EXPORT_SYMBOL(tcp_splice_read);
760
761struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
762{
763 struct sk_buff *skb;
764
765
766 size = ALIGN(size, 4);
767
768 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
769 if (skb) {
770 if (sk_wmem_schedule(sk, skb->truesize)) {
771 skb_reserve(skb, sk->sk_prot->max_header);
772
773
774
775
776 skb->avail_size = size;
777 return skb;
778 }
779 __kfree_skb(skb);
780 } else {
781 sk->sk_prot->enter_memory_pressure(sk);
782 sk_stream_moderate_sndbuf(sk);
783 }
784 return NULL;
785}
786
787static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
788 int large_allowed)
789{
790 struct tcp_sock *tp = tcp_sk(sk);
791 u32 xmit_size_goal, old_size_goal;
792
793 xmit_size_goal = mss_now;
794
795 if (large_allowed && sk_can_gso(sk)) {
796 xmit_size_goal = ((sk->sk_gso_max_size - 1) -
797 inet_csk(sk)->icsk_af_ops->net_header_len -
798 inet_csk(sk)->icsk_ext_hdr_len -
799 tp->tcp_header_len);
800
801
802 xmit_size_goal = min_t(u32, xmit_size_goal,
803 sysctl_tcp_limit_output_bytes >> 1);
804
805 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
806
807
808 old_size_goal = tp->xmit_size_goal_segs * mss_now;
809
810 if (likely(old_size_goal <= xmit_size_goal &&
811 old_size_goal + mss_now > xmit_size_goal)) {
812 xmit_size_goal = old_size_goal;
813 } else {
814 tp->xmit_size_goal_segs =
815 min_t(u16, xmit_size_goal / mss_now,
816 sk->sk_gso_max_segs);
817 xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
818 }
819 }
820
821 return max(xmit_size_goal, mss_now);
822}
823
824static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
825{
826 int mss_now;
827
828 mss_now = tcp_current_mss(sk);
829 *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
830
831 return mss_now;
832}
833
/*
 * Queue page data on the TCP write queue without copying it.
 *
 * @sk:      socket; tcp_sendpage() calls this between lock_sock() and
 *           release_sock()
 * @pages:   page array, indexed by poffset / PAGE_SIZE
 * @poffset: byte offset of the data within the page array
 * @psize:   number of bytes to send
 * @flags:   MSG_* flags (MSG_DONTWAIT, MSG_OOB, MSG_MORE,
 *           MSG_SENDPAGE_NOTLAST are looked at here)
 *
 * Pages are attached to skbs as fragments with an extra page reference.
 * Returns the number of bytes queued if any, otherwise the error produced
 * by sk_stream_error().
 */
static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
 size_t psize, int flags)
{
 struct tcp_sock *tp = tcp_sk(sk);
 int mss_now, size_goal;
 int err;
 ssize_t copied;
 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

 /* Wait for the connection unless already ESTABLISHED or CLOSE_WAIT. */
 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 goto out_err;

 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

 mss_now = tcp_send_mss(sk, &size_goal, flags);
 copied = 0;

 err = -EPIPE;
 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 goto out_err;

 while (psize > 0) {
 struct sk_buff *skb = tcp_write_queue_tail(sk);
 struct page *page = pages[poffset / PAGE_SIZE];
 int copy, i;
 int offset = poffset % PAGE_SIZE;
 /* Never cross a page boundary in one step. */
 int size = min_t(size_t, psize, PAGE_SIZE - offset);
 bool can_coalesce;

 /* Need a new skb if nothing is unsent or the tail skb is full. */
 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
new_segment:
 if (!sk_stream_memory_free(sk))
 goto wait_for_sndbuf;

 /* Zero-size linear area: the payload lives in page fragments. */
 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
 if (!skb)
 goto wait_for_memory;

 skb_entail(sk, skb);
 copy = size_goal;
 }

 if (copy > size)
 copy = size;

 i = skb_shinfo(skb)->nr_frags;
 can_coalesce = skb_can_coalesce(skb, i, page, offset);
 /* Fragment table full and no merge possible: start a new skb. */
 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
 tcp_mark_push(tp, skb);
 goto new_segment;
 }
 if (!sk_wmem_schedule(sk, copy))
 goto wait_for_memory;

 if (can_coalesce) {
 /* Extend the last fragment. */
 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
 } else {
 /* New fragment: it holds its own reference on the page. */
 get_page(page);
 skb_fill_page_desc(skb, i, page, offset, copy);
 }

 /* Account the added bytes on the skb, the socket and the sequence space. */
 skb->len += copy;
 skb->data_len += copy;
 skb->truesize += copy;
 sk->sk_wmem_queued += copy;
 sk_mem_charge(sk, copy);
 skb->ip_summed = CHECKSUM_PARTIAL;
 tp->write_seq += copy;
 TCP_SKB_CB(skb)->end_seq += copy;
 skb_shinfo(skb)->gso_segs = 0;

 /* First bytes of this call: drop PSH from the skb being extended. */
 if (!copied)
 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

 copied += copy;
 poffset += copy;
 if (!(psize -= copy))
 goto out;

 /* Keep filling while the skb is below the goal (or for MSG_OOB). */
 if (skb->len < size_goal || (flags & MSG_OOB))
 continue;

 if (forced_push(tp)) {
 tcp_mark_push(tp, skb);
 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
 } else if (skb == tcp_send_head(sk))
 tcp_push_one(sk, mss_now);
 continue;

wait_for_sndbuf:
 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
 /* Flush what is queued (ignoring MSG_MORE) before sleeping. */
 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 goto do_error;

 /* Recompute after the wait. */
 mss_now = tcp_send_mss(sk, &size_goal, flags);
 }

out:
 /* With MSG_SENDPAGE_NOTLAST the final push is skipped. */
 if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
 tcp_push(sk, flags, mss_now, tp->nonagle);
 return copied;

do_error:
 /* Partial progress is reported as success. */
 if (copied)
 goto out;
out_err:
 return sk_stream_error(sk, flags, err);
}
947
948int tcp_sendpage(struct sock *sk, struct page *page, int offset,
949 size_t size, int flags)
950{
951 ssize_t res;
952
953 if (!(sk->sk_route_caps & NETIF_F_SG) ||
954 !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
955 return sock_no_sendpage(sk->sk_socket, page, offset, size,
956 flags);
957
958 lock_sock(sk);
959 res = do_tcp_sendpages(sk, &page, offset, size, flags);
960 release_sock(sk);
961 return res;
962}
963EXPORT_SYMBOL(tcp_sendpage);
964
965static inline int select_size(const struct sock *sk, bool sg)
966{
967 const struct tcp_sock *tp = tcp_sk(sk);
968 int tmp = tp->mss_cache;
969
970 if (sg) {
971 if (sk_can_gso(sk)) {
972
973
974
975 tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
976 } else {
977 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
978
979 if (tmp >= pgbreak &&
980 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
981 tmp = pgbreak;
982 }
983 }
984
985 return tmp;
986}
987
988void tcp_free_fastopen_req(struct tcp_sock *tp)
989{
990 if (tp->fastopen_req != NULL) {
991 kfree(tp->fastopen_req);
992 tp->fastopen_req = NULL;
993 }
994}
995
996static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
997{
998 struct tcp_sock *tp = tcp_sk(sk);
999 int err, flags;
1000
1001 if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
1002 return -EOPNOTSUPP;
1003 if (tp->fastopen_req != NULL)
1004 return -EALREADY;
1005
1006 tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1007 sk->sk_allocation);
1008 if (unlikely(tp->fastopen_req == NULL))
1009 return -ENOBUFS;
1010 tp->fastopen_req->data = msg;
1011
1012 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1013 err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
1014 msg->msg_namelen, flags);
1015 *size = tp->fastopen_req->copied;
1016 tcp_free_fastopen_req(tp);
1017 return err;
1018}
1019
/*
 * sendmsg() implementation for TCP: copy user data from msg->msg_iov onto
 * the socket write queue and push it out.
 *
 * @iocb: not used in this function
 * @sk:   socket (locked here for the whole call)
 * @msg:  message header; msg_flags, msg_iov and msg_iovlen are consumed
 * @size: only forwarded to tcp_send_rcvq() in repair mode
 *
 * Data goes first into the linear area of the tail skb while room
 * remains, then into page fragments backed by the per-socket
 * sk_sndmsg_page / sk_sndmsg_off cache.
 *
 * Returns the number of bytes accepted (including bytes taken by a Fast
 * Open SYN) if any, otherwise the error produced by sk_stream_error().
 */
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 size_t size)
{
 struct iovec *iov;
 struct tcp_sock *tp = tcp_sk(sk);
 struct sk_buff *skb;
 int iovlen, flags, err, copied = 0;
 int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
 bool sg;
 long timeo;

 lock_sock(sk);

 flags = msg->msg_flags;
 if (flags & MSG_FASTOPEN) {
 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
 /* Connect in progress and the SYN took some data: report that. */
 if (err == -EINPROGRESS && copied_syn > 0)
 goto out;
 else if (err)
 goto out_err;
 /* Skip the bytes already taken by the SYN. */
 offset = copied_syn;
 }

 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

 /* Wait for the connection unless already ESTABLISHED or CLOSE_WAIT. */
 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 goto do_error;

 if (unlikely(tp->repair)) {
 /* Repair mode, receive queue selected: inject into the receive queue. */
 if (tp->repair_queue == TCP_RECV_QUEUE) {
 copied = tcp_send_rcvq(sk, msg, size);
 goto out;
 }

 err = -EINVAL;
 if (tp->repair_queue == TCP_NO_QUEUE)
 goto out_err;

 /* Any other queue selection falls through to the normal path. */
 }

 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

 mss_now = tcp_send_mss(sk, &size_goal, flags);

 iovlen = msg->msg_iovlen;
 iov = msg->msg_iov;
 copied = 0;

 err = -EPIPE;
 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 goto out_err;

 sg = !!(sk->sk_route_caps & NETIF_F_SG);

 /* Outer loop: one iteration per iovec segment. */
 while (--iovlen >= 0) {
 size_t seglen = iov->iov_len;
 unsigned char __user *from = iov->iov_base;

 iov++;
 /* Skip data that was already sent in the Fast Open SYN. */
 if (unlikely(offset > 0)) {
 if (offset >= seglen) {
 offset -= seglen;
 continue;
 }
 seglen -= offset;
 from += offset;
 offset = 0;
 }

 /* Inner loop: until this segment is fully queued. */
 while (seglen > 0) {
 int copy = 0;
 int max = size_goal;

 skb = tcp_write_queue_tail(sk);
 if (tcp_send_head(sk)) {
 /* An skb without checksum offload is limited to one MSS. */
 if (skb->ip_summed == CHECKSUM_NONE)
 max = mss_now;
 copy = max - skb->len;
 }

 if (copy <= 0) {
new_segment:
 /* No room in the tail skb (or no unsent skb): allocate one. */
 if (!sk_stream_memory_free(sk))
 goto wait_for_sndbuf;

 skb = sk_stream_alloc_skb(sk,
 select_size(sk, sg),
 sk->sk_allocation);
 if (!skb)
 goto wait_for_memory;

 /* Let the device checksum when the route allows it. */
 if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
 skb->ip_summed = CHECKSUM_PARTIAL;

 skb_entail(sk, skb);
 copy = size_goal;
 max = size_goal;
 }

 /* Never copy more than what is left of this segment. */
 if (copy > seglen)
 copy = seglen;

 if (skb_availroom(skb) > 0) {
 /* Room left in the linear area: copy there. */
 copy = min_t(int, copy, skb_availroom(skb));
 err = skb_add_data_nocache(sk, skb, from, copy);
 if (err)
 goto do_fault;
 } else {
 bool merge = false;
 int i = skb_shinfo(skb)->nr_frags;
 struct page *page = sk->sk_sndmsg_page;
 int off;

 /* Sole owner of the cached page: reuse it from the start. */
 if (page && page_count(page) == 1)
 sk->sk_sndmsg_off = 0;

 off = sk->sk_sndmsg_off;

 if (skb_can_coalesce(skb, i, page, off) &&
 off != PAGE_SIZE) {
 /* Data can extend the last fragment of this skb. */
 merge = true;
 } else if (i == MAX_SKB_FRAGS || !sg) {
 /*
  * Fragment table full, or no scatter-gather:
  * this skb cannot take more data.
  */
 tcp_mark_push(tp, skb);
 goto new_segment;
 } else if (page) {
 /* Cached page exhausted: drop it so a new one is allocated. */
 if (off == PAGE_SIZE) {
 put_page(page);
 sk->sk_sndmsg_page = page = NULL;
 off = 0;
 }
 } else
 off = 0;

 if (copy > PAGE_SIZE - off)
 copy = PAGE_SIZE - off;

 if (!sk_wmem_schedule(sk, copy))
 goto wait_for_memory;

 if (!page) {
 /* No cached page: allocate a fresh one. */
 if (!(page = sk_stream_alloc_page(sk)))
 goto wait_for_memory;
 }

 err = skb_copy_to_page_nocache(sk, from, skb,
 page, off, copy);
 if (err) {
 /*
  * Copy failed: if the page was freshly allocated, park
  * it in the socket cache so it is not lost.
  */
 if (!sk->sk_sndmsg_page) {
 sk->sk_sndmsg_page = page;
 sk->sk_sndmsg_off = 0;
 }
 goto do_error;
 }

 /* Publish the copied bytes in the skb fragment table. */
 if (merge) {
 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
 } else {
 skb_fill_page_desc(skb, i, page, off, copy);
 if (sk->sk_sndmsg_page) {
 /* Page stays cached: the fragment needs its own reference. */
 get_page(page);
 } else if (off + copy < PAGE_SIZE) {
 /* Fresh page with room left: cache it, taking a reference. */
 get_page(page);
 sk->sk_sndmsg_page = page;
 }
 }

 sk->sk_sndmsg_off = off + copy;
 }

 /* First bytes of this call: drop PSH from the skb being extended. */
 if (!copied)
 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

 tp->write_seq += copy;
 TCP_SKB_CB(skb)->end_seq += copy;
 skb_shinfo(skb)->gso_segs = 0;

 from += copy;
 copied += copy;
 /* Last byte of the last segment: done. */
 if ((seglen -= copy) == 0 && iovlen == 0)
 goto out;

 /* Keep filling; nothing is pushed from here for MSG_OOB or repair. */
 if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
 continue;

 if (forced_push(tp)) {
 tcp_mark_push(tp, skb);
 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
 } else if (skb == tcp_send_head(sk))
 tcp_push_one(sk, mss_now);
 continue;

wait_for_sndbuf:
 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
 /* Flush what is queued (ignoring MSG_MORE) before sleeping. */
 if (copied && likely(!tp->repair))
 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 goto do_error;

 /* Recompute after the wait. */
 mss_now = tcp_send_mss(sk, &size_goal, flags);
 }
 }

out:
 if (copied && likely(!tp->repair))
 tcp_push(sk, flags, mss_now, tp->nonagle);
 release_sock(sk);
 return copied + copied_syn;

do_fault:
 /* A still-empty skb that was queued is unlinked and freed again. */
 if (!skb->len) {
 tcp_unlink_write_queue(skb, sk);
 /* Fix up the send head in case it pointed at this skb. */
 tcp_check_send_head(sk, skb);
 sk_wmem_free_skb(sk, skb);
 }

do_error:
 /* Partial progress is reported as success. */
 if (copied + copied_syn)
 goto out;
out_err:
 err = sk_stream_error(sk, flags, err);
 release_sock(sk);
 return err;
}
EXPORT_SYMBOL(tcp_sendmsg);
1276
1277
1278
1279
1280
1281
1282static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1283{
1284 struct tcp_sock *tp = tcp_sk(sk);
1285
1286
1287 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1288 tp->urg_data == TCP_URG_READ)
1289 return -EINVAL;
1290
1291 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1292 return -ENOTCONN;
1293
1294 if (tp->urg_data & TCP_URG_VALID) {
1295 int err = 0;
1296 char c = tp->urg_data;
1297
1298 if (!(flags & MSG_PEEK))
1299 tp->urg_data = TCP_URG_READ;
1300
1301
1302 msg->msg_flags |= MSG_OOB;
1303
1304 if (len > 0) {
1305 if (!(flags & MSG_TRUNC))
1306 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1307 len = 1;
1308 } else
1309 msg->msg_flags |= MSG_TRUNC;
1310
1311 return err ? -EFAULT : len;
1312 }
1313
1314 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1315 return 0;
1316
1317
1318
1319
1320
1321
1322
1323 return -EAGAIN;
1324}
1325
1326static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1327{
1328 struct sk_buff *skb;
1329 int copied = 0, err = 0;
1330
1331
1332
1333 skb_queue_walk(&sk->sk_write_queue, skb) {
1334 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
1335 if (err)
1336 break;
1337
1338 copied += skb->len;
1339 }
1340
1341 return err ?: copied;
1342}
1343
1344
1345
1346
1347
1348
1349
/*
 * Called after the reader has consumed data from the receive queue:
 * decide whether an ACK should be sent right now, and send it if so.
 *
 * @sk:     socket
 * @copied: number of bytes the caller handed to user space; values <= 0
 *          disable the "reader made progress" heuristics below
 */
void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
 struct tcp_sock *tp = tcp_sk(sk);
 bool time_to_ack = false;

 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

 /* Sanity check: the head skb must still contain unread data. */
 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);

 if (inet_csk_ack_scheduled(sk)) {
 const struct inet_connection_sock *icsk = inet_csk(sk);

 /*
  * An ACK is already scheduled.  Send it now when any of these hold:
  *  - the ack state is flagged "blocked";
  *  - more than rcv_mss bytes arrived since rcv_wup;
  *  - the reader made progress, the ACK is marked PUSHED2 (or
  *    PUSHED while not in pingpong mode) and no receive memory
  *    is allocated any more.
  */
 if (icsk->icsk_ack.blocked ||

 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||

 (copied > 0 &&
 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
 !icsk->icsk_ack.pingpong)) &&
 !atomic_read(&sk->sk_rmem_alloc)))
 time_to_ack = true;
 }

 /*
  * Window update: if the reader freed space and the receive side is
  * not shut down, ACK when the window could at least double.
  */
 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
 __u32 rcv_window_now = tcp_receive_window(tp);

 /* Only consider it while the current window is at most half the clamp. */
 if (2*rcv_window_now <= tp->window_clamp) {
 __u32 new_window = __tcp_select_window(sk);

 /* Require a non-zero window at least twice the current one. */
 if (new_window && new_window >= 2 * rcv_window_now)
 time_to_ack = true;
 }
 }
 if (time_to_ack)
 tcp_send_ack(sk);
}
1407
1408static void tcp_prequeue_process(struct sock *sk)
1409{
1410 struct sk_buff *skb;
1411 struct tcp_sock *tp = tcp_sk(sk);
1412
1413 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
1414
1415
1416
1417 local_bh_disable();
1418 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1419 sk_backlog_rcv(sk, skb);
1420 local_bh_enable();
1421
1422
1423 tp->ucopy.memory = 0;
1424}
1425
#ifdef CONFIG_NET_DMA
/*
 * Reap skbs whose asynchronous DMA copies have finished.
 *
 * Does nothing without a DMA channel.  Pending copies are issued, then
 * completed skbs are released from sk_async_wait_queue.  When the last
 * issued cookie has completed the whole queue is purged.  With @wait set
 * the function keeps polling until that point is reached; otherwise a
 * single pass is made.
 */
static void tcp_service_net_dma(struct sock *sk, bool wait)
{
	struct tcp_sock *tp = tcp_sk(sk);
	dma_cookie_t done, used;
	dma_cookie_t last_issued;
	struct sk_buff *skb;

	if (!tp->ucopy.dma_chan)
		return;

	last_issued = tp->ucopy.dma_cookie;
	dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);

	do {
		if (dma_async_memcpy_complete(tp->ucopy.dma_chan,
					      last_issued, &done,
					      &used) == DMA_SUCCESS) {
			/* Everything issued so far is done: drop all skbs. */
			__skb_queue_purge(&sk->sk_async_wait_queue);
			break;
		}

		/* Otherwise release only the leading, completed skbs. */
		for (;;) {
			skb = skb_peek(&sk->sk_async_wait_queue);
			if (!skb ||
			    dma_async_is_complete(skb->dma_cookie, done,
						  used) != DMA_SUCCESS)
				break;

			__skb_dequeue(&sk->sk_async_wait_queue);
			kfree_skb(skb);
		}
	} while (wait);
}
#endif
1458
1459static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1460{
1461 struct sk_buff *skb;
1462 u32 offset;
1463
1464 skb_queue_walk(&sk->sk_receive_queue, skb) {
1465 offset = seq - TCP_SKB_CB(skb)->seq;
1466 if (tcp_hdr(skb)->syn)
1467 offset--;
1468 if (offset < skb->len || tcp_hdr(skb)->fin) {
1469 *off = offset;
1470 return skb;
1471 }
1472 }
1473 return NULL;
1474}
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1488 sk_read_actor_t recv_actor)
1489{
1490 struct sk_buff *skb;
1491 struct tcp_sock *tp = tcp_sk(sk);
1492 u32 seq = tp->copied_seq;
1493 u32 offset;
1494 int copied = 0;
1495
1496 if (sk->sk_state == TCP_LISTEN)
1497 return -ENOTCONN;
1498 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1499 if (offset < skb->len) {
1500 int used;
1501 size_t len;
1502
1503 len = skb->len - offset;
1504
1505 if (tp->urg_data) {
1506 u32 urg_offset = tp->urg_seq - seq;
1507 if (urg_offset < len)
1508 len = urg_offset;
1509 if (!len)
1510 break;
1511 }
1512 used = recv_actor(desc, skb, offset, len);
1513 if (used < 0) {
1514 if (!copied)
1515 copied = used;
1516 break;
1517 } else if (used <= len) {
1518 seq += used;
1519 copied += used;
1520 offset += used;
1521 }
1522
1523
1524
1525
1526
1527
1528 skb = tcp_recv_skb(sk, seq-1, &offset);
1529 if (!skb || (offset+1 != skb->len))
1530 break;
1531 }
1532 if (tcp_hdr(skb)->fin) {
1533 sk_eat_skb(sk, skb, false);
1534 ++seq;
1535 break;
1536 }
1537 sk_eat_skb(sk, skb, false);
1538 if (!desc->count)
1539 break;
1540 tp->copied_seq = seq;
1541 }
1542 tp->copied_seq = seq;
1543
1544 tcp_rcv_space_adjust(sk);
1545
1546
1547 if (copied > 0)
1548 tcp_cleanup_rbuf(sk, copied);
1549 return copied;
1550}
1551EXPORT_SYMBOL(tcp_read_sock);
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1562 size_t len, int nonblock, int flags, int *addr_len)
1563{
1564 struct tcp_sock *tp = tcp_sk(sk);
1565 int copied = 0;
1566 u32 peek_seq;
1567 u32 *seq;
1568 unsigned long used;
1569 int err;
1570 int target;
1571 long timeo;
1572 struct task_struct *user_recv = NULL;
1573 bool copied_early = false;
1574 struct sk_buff *skb;
1575 u32 urg_hole = 0;
1576
1577 lock_sock(sk);
1578
1579 err = -ENOTCONN;
1580 if (sk->sk_state == TCP_LISTEN)
1581 goto out;
1582
1583 timeo = sock_rcvtimeo(sk, nonblock);
1584
1585
1586 if (flags & MSG_OOB)
1587 goto recv_urg;
1588
1589 if (unlikely(tp->repair)) {
1590 err = -EPERM;
1591 if (!(flags & MSG_PEEK))
1592 goto out;
1593
1594 if (tp->repair_queue == TCP_SEND_QUEUE)
1595 goto recv_sndq;
1596
1597 err = -EINVAL;
1598 if (tp->repair_queue == TCP_NO_QUEUE)
1599 goto out;
1600
1601
1602 }
1603
1604 seq = &tp->copied_seq;
1605 if (flags & MSG_PEEK) {
1606 peek_seq = tp->copied_seq;
1607 seq = &peek_seq;
1608 }
1609
1610 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1611
1612#ifdef CONFIG_NET_DMA
1613 tp->ucopy.dma_chan = NULL;
1614 preempt_disable();
1615 skb = skb_peek_tail(&sk->sk_receive_queue);
1616 {
1617 int available = 0;
1618
1619 if (skb)
1620 available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
1621 if ((available < target) &&
1622 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1623 !sysctl_tcp_low_latency &&
1624 net_dma_find_channel()) {
1625 preempt_enable_no_resched();
1626 tp->ucopy.pinned_list =
1627 dma_pin_iovec_pages(msg->msg_iov, len);
1628 } else {
1629 preempt_enable_no_resched();
1630 }
1631 }
1632#endif
1633
1634 do {
1635 u32 offset;
1636
1637
1638 if (tp->urg_data && tp->urg_seq == *seq) {
1639 if (copied)
1640 break;
1641 if (signal_pending(current)) {
1642 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1643 break;
1644 }
1645 }
1646
1647
1648
1649 skb_queue_walk(&sk->sk_receive_queue, skb) {
1650
1651
1652
1653 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
1654 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
1655 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1656 flags))
1657 break;
1658
1659 offset = *seq - TCP_SKB_CB(skb)->seq;
1660 if (tcp_hdr(skb)->syn)
1661 offset--;
1662 if (offset < skb->len)
1663 goto found_ok_skb;
1664 if (tcp_hdr(skb)->fin)
1665 goto found_fin_ok;
1666 WARN(!(flags & MSG_PEEK),
1667 "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
1668 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
1669 }
1670
1671
1672
1673 if (copied >= target && !sk->sk_backlog.tail)
1674 break;
1675
1676 if (copied) {
1677 if (sk->sk_err ||
1678 sk->sk_state == TCP_CLOSE ||
1679 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1680 !timeo ||
1681 signal_pending(current))
1682 break;
1683 } else {
1684 if (sock_flag(sk, SOCK_DONE))
1685 break;
1686
1687 if (sk->sk_err) {
1688 copied = sock_error(sk);
1689 break;
1690 }
1691
1692 if (sk->sk_shutdown & RCV_SHUTDOWN)
1693 break;
1694
1695 if (sk->sk_state == TCP_CLOSE) {
1696 if (!sock_flag(sk, SOCK_DONE)) {
1697
1698
1699
1700 copied = -ENOTCONN;
1701 break;
1702 }
1703 break;
1704 }
1705
1706 if (!timeo) {
1707 copied = -EAGAIN;
1708 break;
1709 }
1710
1711 if (signal_pending(current)) {
1712 copied = sock_intr_errno(timeo);
1713 break;
1714 }
1715 }
1716
1717 tcp_cleanup_rbuf(sk, copied);
1718
1719 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1720
1721 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1722 user_recv = current;
1723 tp->ucopy.task = user_recv;
1724 tp->ucopy.iov = msg->msg_iov;
1725 }
1726
1727 tp->ucopy.len = len;
1728
1729 WARN_ON(tp->copied_seq != tp->rcv_nxt &&
1730 !(flags & (MSG_PEEK | MSG_TRUNC)));
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758 if (!skb_queue_empty(&tp->ucopy.prequeue))
1759 goto do_prequeue;
1760
1761
1762 }
1763
1764#ifdef CONFIG_NET_DMA
1765 if (tp->ucopy.dma_chan) {
1766 if (tp->rcv_wnd == 0 &&
1767 !skb_queue_empty(&sk->sk_async_wait_queue)) {
1768 tcp_service_net_dma(sk, true);
1769 tcp_cleanup_rbuf(sk, copied);
1770 } else
1771 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1772 }
1773#endif
1774 if (copied >= target) {
1775
1776 release_sock(sk);
1777 lock_sock(sk);
1778 } else
1779 sk_wait_data(sk, &timeo);
1780
1781#ifdef CONFIG_NET_DMA
1782 tcp_service_net_dma(sk, false);
1783 tp->ucopy.wakeup = 0;
1784#endif
1785
1786 if (user_recv) {
1787 int chunk;
1788
1789
1790
1791 if ((chunk = len - tp->ucopy.len) != 0) {
1792 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1793 len -= chunk;
1794 copied += chunk;
1795 }
1796
1797 if (tp->rcv_nxt == tp->copied_seq &&
1798 !skb_queue_empty(&tp->ucopy.prequeue)) {
1799do_prequeue:
1800 tcp_prequeue_process(sk);
1801
1802 if ((chunk = len - tp->ucopy.len) != 0) {
1803 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1804 len -= chunk;
1805 copied += chunk;
1806 }
1807 }
1808 }
1809 if ((flags & MSG_PEEK) &&
1810 (peek_seq - copied - urg_hole != tp->copied_seq)) {
1811 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
1812 current->comm,
1813 task_pid_nr(current));
1814 peek_seq = tp->copied_seq;
1815 }
1816 continue;
1817
1818 found_ok_skb:
1819
1820 used = skb->len - offset;
1821 if (len < used)
1822 used = len;
1823
1824
1825 if (tp->urg_data) {
1826 u32 urg_offset = tp->urg_seq - *seq;
1827 if (urg_offset < used) {
1828 if (!urg_offset) {
1829 if (!sock_flag(sk, SOCK_URGINLINE)) {
1830 ++*seq;
1831 urg_hole++;
1832 offset++;
1833 used--;
1834 if (!used)
1835 goto skip_copy;
1836 }
1837 } else
1838 used = urg_offset;
1839 }
1840 }
1841
1842 if (!(flags & MSG_TRUNC)) {
1843#ifdef CONFIG_NET_DMA
1844 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1845 tp->ucopy.dma_chan = net_dma_find_channel();
1846
1847 if (tp->ucopy.dma_chan) {
1848 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1849 tp->ucopy.dma_chan, skb, offset,
1850 msg->msg_iov, used,
1851 tp->ucopy.pinned_list);
1852
1853 if (tp->ucopy.dma_cookie < 0) {
1854
1855 pr_alert("%s: dma_cookie < 0\n",
1856 __func__);
1857
1858
1859 if (!copied)
1860 copied = -EFAULT;
1861 break;
1862 }
1863
1864 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1865
1866 if ((offset + used) == skb->len)
1867 copied_early = true;
1868
1869 } else
1870#endif
1871 {
1872 err = skb_copy_datagram_iovec(skb, offset,
1873 msg->msg_iov, used);
1874 if (err) {
1875
1876 if (!copied)
1877 copied = -EFAULT;
1878 break;
1879 }
1880 }
1881 }
1882
1883 *seq += used;
1884 copied += used;
1885 len -= used;
1886
1887 tcp_rcv_space_adjust(sk);
1888
1889skip_copy:
1890 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1891 tp->urg_data = 0;
1892 tcp_fast_path_check(sk);
1893 }
1894 if (used + offset < skb->len)
1895 continue;
1896
1897 if (tcp_hdr(skb)->fin)
1898 goto found_fin_ok;
1899 if (!(flags & MSG_PEEK)) {
1900 sk_eat_skb(sk, skb, copied_early);
1901 copied_early = false;
1902 }
1903 continue;
1904
1905 found_fin_ok:
1906
1907 ++*seq;
1908 if (!(flags & MSG_PEEK)) {
1909 sk_eat_skb(sk, skb, copied_early);
1910 copied_early = false;
1911 }
1912 break;
1913 } while (len > 0);
1914
1915 if (user_recv) {
1916 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1917 int chunk;
1918
1919 tp->ucopy.len = copied > 0 ? len : 0;
1920
1921 tcp_prequeue_process(sk);
1922
1923 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1924 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1925 len -= chunk;
1926 copied += chunk;
1927 }
1928 }
1929
1930 tp->ucopy.task = NULL;
1931 tp->ucopy.len = 0;
1932 }
1933
1934#ifdef CONFIG_NET_DMA
1935 tcp_service_net_dma(sk, true);
1936 tp->ucopy.dma_chan = NULL;
1937
1938 if (tp->ucopy.pinned_list) {
1939 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1940 tp->ucopy.pinned_list = NULL;
1941 }
1942#endif
1943
1944
1945
1946
1947
1948
1949 tcp_cleanup_rbuf(sk, copied);
1950
1951 release_sock(sk);
1952 return copied;
1953
1954out:
1955 release_sock(sk);
1956 return err;
1957
1958recv_urg:
1959 err = tcp_recv_urg(sk, msg, len, flags);
1960 goto out;
1961
1962recv_sndq:
1963 err = tcp_peek_sndq(sk, msg, len);
1964 goto out;
1965}
1966EXPORT_SYMBOL(tcp_recvmsg);
1967
1968void tcp_set_state(struct sock *sk, int state)
1969{
1970 int oldstate = sk->sk_state;
1971
1972 switch (state) {
1973 case TCP_ESTABLISHED:
1974 if (oldstate != TCP_ESTABLISHED)
1975 TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1976 break;
1977
1978 case TCP_CLOSE:
1979 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
1980 TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
1981
1982 sk->sk_prot->unhash(sk);
1983 if (inet_csk(sk)->icsk_bind_hash &&
1984 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
1985 inet_put_port(sk);
1986
1987 default:
1988 if (oldstate == TCP_ESTABLISHED)
1989 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1990 }
1991
1992
1993
1994
1995 sk->sk_state = state;
1996
1997#ifdef STATE_TRACE
1998 SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
1999#endif
2000}
2001EXPORT_SYMBOL_GPL(tcp_set_state);
2002
2003
2004
2005
2006
2007
2008
2009
/*
 * Close-time state transition table, indexed by the socket's current
 * sk_state (see tcp_close_state()).  Each entry is the state to move to,
 * optionally OR'ed with TCP_ACTION_FIN when the caller must also send a
 * FIN.  Only the first 12 slots are listed; the remaining entries of the
 * 16-element array are implicitly zero.
 *
 * NOTE(review): the per-entry labels below assume the usual TCP_* state
 * numbering (TCP_ESTABLISHED == 1 ... TCP_CLOSING == 11) -- confirm
 * against the state enum, which is not visible in this file section.
 */
static const unsigned char new_state[16] = {
	/* current state (assumed):	   new state | action */
	/*  0 (invalid)		*/ TCP_CLOSE,
	/*  1 TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
	/*  2 TCP_SYN_SENT	*/ TCP_CLOSE,
	/*  3 TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
	/*  4 TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
	/*  5 TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
	/*  6 TCP_TIME_WAIT	*/ TCP_CLOSE,
	/*  7 TCP_CLOSE		*/ TCP_CLOSE,
	/*  8 TCP_CLOSE_WAIT	*/ TCP_LAST_ACK | TCP_ACTION_FIN,
	/*  9 TCP_LAST_ACK	*/ TCP_LAST_ACK,
	/* 10 TCP_LISTEN	*/ TCP_CLOSE,
	/* 11 TCP_CLOSING	*/ TCP_CLOSING,
};
2025
2026static int tcp_close_state(struct sock *sk)
2027{
2028 int next = (int)new_state[sk->sk_state];
2029 int ns = next & TCP_STATE_MASK;
2030
2031 tcp_set_state(sk, ns);
2032
2033 return next & TCP_ACTION_FIN;
2034}
2035
2036
2037
2038
2039
2040
2041void tcp_shutdown(struct sock *sk, int how)
2042{
2043
2044
2045
2046
2047 if (!(how & SEND_SHUTDOWN))
2048 return;
2049
2050
2051 if ((1 << sk->sk_state) &
2052 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2053 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2054
2055 if (tcp_close_state(sk))
2056 tcp_send_fin(sk);
2057 }
2058}
2059EXPORT_SYMBOL(tcp_shutdown);
2060
2061bool tcp_check_oom(struct sock *sk, int shift)
2062{
2063 bool too_many_orphans, out_of_socket_memory;
2064
2065 too_many_orphans = tcp_too_many_orphans(sk, shift);
2066 out_of_socket_memory = tcp_out_of_memory(sk);
2067
2068 if (too_many_orphans)
2069 net_info_ratelimited("too many orphaned sockets\n");
2070 if (out_of_socket_memory)
2071 net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2072 return too_many_orphans || out_of_socket_memory;
2073}
2074
/*
 * tcp_close - close a TCP socket.
 * @sk:      socket to close
 * @timeout: passed to sk_stream_wait_close() to bound the wait for the
 *           close to make progress
 *
 * Runs in two phases.  First, with the socket lock held, both directions
 * are marked shut down, unread receive data is discarded and the
 * connection is either disconnected, reset or moved along the close
 * state machine (sending a FIN if required).  Second, after dropping the
 * socket lock and taking the BH lock, the socket is orphaned and either
 * destroyed, handed to time-wait handling, or left for timers to finish.
 */
void tcp_close(struct sock *sk, long timeout)
{
	struct sk_buff *skb;
	int data_was_unread = 0;
	int state;

	lock_sock(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == TCP_LISTEN) {
		tcp_set_state(sk, TCP_CLOSE);

		/* Listening socket: tear down the listen state. */
		inet_csk_listen_stop(sk);

		goto adjudge_to_death;
	}

	/*
	 * Drop everything still sitting in the receive queue and count how
	 * many payload bytes the application never read.  The FIN flag
	 * occupies one sequence number, so it is subtracted from the
	 * sequence span of each skb.
	 */
	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
			  tcp_hdr(skb)->fin;
		data_was_unread += len;
		__kfree_skb(skb);
	}

	sk_mem_reclaim(sk);

	/* Already closed: skip the state machine and go straight to cleanup. */
	if (sk->sk_state == TCP_CLOSE)
		goto adjudge_to_death;

	/*
	 * Pick how to end the connection:
	 *  - repair mode: plain disconnect;
	 *  - unread data was discarded: move to CLOSE and send a reset;
	 *  - SO_LINGER with a zero linger time: disconnect;
	 *  - otherwise: normal close transition, sending a FIN when
	 *    tcp_close_state() asks for one.
	 */
	if (unlikely(tcp_sk(sk)->repair)) {
		sk->sk_prot->disconnect(sk, 0);
	} else if (data_was_unread) {
		/* Unread data was tossed, send an active reset. */
		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
		tcp_set_state(sk, TCP_CLOSE);
		tcp_send_active_reset(sk, sk->sk_allocation);
	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Zero linger time: abort through the disconnect path. */
		sk->sk_prot->disconnect(sk, 0);
		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
	} else if (tcp_close_state(sk)) {
		/* The close transition requested a FIN. */
		tcp_send_fin(sk);
	}

	sk_stream_wait_close(sk, timeout);

adjudge_to_death:
	/*
	 * Remember the state seen under the socket lock, pin the socket
	 * with an extra reference and detach it from its owner before the
	 * lock is released.
	 */
	state = sk->sk_state;
	sock_hold(sk);
	sock_orphan(sk);

	/* Releasing the socket lock; the rest runs under the BH lock. */
	release_sock(sk);

	local_bh_disable();
	bh_lock_sock(sk);
	WARN_ON(sock_owned_by_user(sk));

	percpu_counter_inc(sk->sk_prot->orphan_count);

	/*
	 * The socket reached TCP_CLOSE between the snapshot above and
	 * re-locking here.  NOTE(review): presumably whoever closed it has
	 * already handled destruction -- confirm.
	 */
	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
		goto out;

	if (sk->sk_state == TCP_FIN_WAIT2) {
		struct tcp_sock *tp = tcp_sk(sk);
		if (tp->linger2 < 0) {
			/* Negative linger2: abort instead of waiting in FIN_WAIT2. */
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPABORTONLINGER);
		} else {
			const int tmo = tcp_fin_time(sk);

			/*
			 * Long FIN timeout: cover the part beyond
			 * TCP_TIMEWAIT_LEN with the keepalive timer.
			 * Otherwise hand the socket to time-wait handling
			 * right away and finish.
			 */
			if (tmo > TCP_TIMEWAIT_LEN) {
				inet_csk_reset_keepalive_timer(sk,
						tmo - TCP_TIMEWAIT_LEN);
			} else {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
	}
	if (sk->sk_state != TCP_CLOSE) {
		/* Still open as an orphan: reset it if resources are short. */
		sk_mem_reclaim(sk);
		if (tcp_check_oom(sk, 0)) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPABORTONMEMORY);
		}
	}

	if (sk->sk_state == TCP_CLOSE)
		inet_csk_destroy_sock(sk);
	/* Otherwise the socket stays around in its current state. */

out:
	bh_unlock_sock(sk);
	local_bh_enable();
	/* Drop the reference taken by sock_hold() above. */
	sock_put(sk);
}
2233EXPORT_SYMBOL(tcp_close);
2234
2235
2236
2237static inline bool tcp_need_reset(int state)
2238{
2239 return (1 << state) &
2240 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2241 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2242}
2243
/*
 * tcp_disconnect - abort the current connection and reset the socket so
 * it can be reused.
 * @sk:    socket to disconnect
 * @flags: not used by this function
 *
 * Moves the socket to TCP_CLOSE, notifies the peer with a reset where the
 * old state calls for it, purges all queues and timers and reinitialises
 * the per-connection state.  Always returns 0 (err is never changed).
 */
int tcp_disconnect(struct sock *sk, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;
	int old_state = sk->sk_state;

	if (old_state != TCP_CLOSE)
		tcp_set_state(sk, TCP_CLOSE);

	/*
	 * Decide what the previous state requires:
	 *  - LISTEN: stop listening;
	 *  - repair mode: just flag ECONNABORTED, nothing is sent;
	 *  - states selected by tcp_need_reset(), or CLOSING/LAST_ACK with
	 *    snd_nxt != write_seq: send a reset and flag ECONNRESET;
	 *  - SYN_SENT: flag ECONNRESET without sending anything.
	 */
	if (old_state == TCP_LISTEN) {
		inet_csk_listen_stop(sk);
	} else if (unlikely(tp->repair)) {
		sk->sk_err = ECONNABORTED;
	} else if (tcp_need_reset(old_state) ||
		   (tp->snd_nxt != tp->write_seq &&
		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
		tcp_send_active_reset(sk, gfp_any());
		sk->sk_err = ECONNRESET;
	} else if (old_state == TCP_SYN_SENT)
		sk->sk_err = ECONNRESET;

	/* Stop timers and drop every queued buffer. */
	tcp_clear_xmit_timers(sk);
	__skb_queue_purge(&sk->sk_receive_queue);
	tcp_write_queue_purge(sk);
	__skb_queue_purge(&tp->out_of_order_queue);
#ifdef CONFIG_NET_DMA
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	inet->inet_dport = 0;

	/* Keep the source address only if the user bound it explicitly. */
	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		inet_reset_saddr(sk);

	sk->sk_shutdown = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->srtt = 0;
	/* Advance write_seq past the old window; zero is avoided. */
	if ((tp->write_seq += tp->max_window + 2) == 0)
		tp->write_seq = 1;
	/* Reset congestion control, retransmit and option state. */
	icsk->icsk_backoff = 0;
	tp->snd_cwnd = 2;
	icsk->icsk_probes_out = 0;
	tp->packets_out = 0;
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_cnt = 0;
	tp->bytes_acked = 0;
	tp->window_clamp = 0;
	tcp_set_ca_state(sk, TCP_CA_Open);
	tcp_clear_retrans(tp);
	inet_csk_delack_init(sk);
	tcp_init_send_head(sk);
	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
	__sk_dst_reset(sk);

	/* A socket that still has a local port must still be bound. */
	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);

	/* Notify whoever waits on the socket about sk_err. */
	sk->sk_error_report(sk);
	return err;
}
2309EXPORT_SYMBOL(tcp_disconnect);
2310
2311static inline bool tcp_can_repair_sock(const struct sock *sk)
2312{
2313 return capable(CAP_NET_ADMIN) &&
2314 ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
2315}
2316
2317static int tcp_repair_options_est(struct tcp_sock *tp,
2318 struct tcp_repair_opt __user *optbuf, unsigned int len)
2319{
2320 struct tcp_repair_opt opt;
2321
2322 while (len >= sizeof(opt)) {
2323 if (copy_from_user(&opt, optbuf, sizeof(opt)))
2324 return -EFAULT;
2325
2326 optbuf++;
2327 len -= sizeof(opt);
2328
2329 switch (opt.opt_code) {
2330 case TCPOPT_MSS:
2331 tp->rx_opt.mss_clamp = opt.opt_val;
2332 break;
2333 case TCPOPT_WINDOW:
2334 {
2335 u16 snd_wscale = opt.opt_val & 0xFFFF;
2336 u16 rcv_wscale = opt.opt_val >> 16;
2337
2338 if (snd_wscale > 14 || rcv_wscale > 14)
2339 return -EFBIG;
2340
2341 tp->rx_opt.snd_wscale = snd_wscale;
2342 tp->rx_opt.rcv_wscale = rcv_wscale;
2343 tp->rx_opt.wscale_ok = 1;
2344 }
2345 break;
2346 case TCPOPT_SACK_PERM:
2347 if (opt.opt_val != 0)
2348 return -EINVAL;
2349
2350 tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
2351 if (sysctl_tcp_fack)
2352 tcp_enable_fack(tp);
2353 break;
2354 case TCPOPT_TIMESTAMP:
2355 if (opt.opt_val != 0)
2356 return -EINVAL;
2357
2358 tp->rx_opt.tstamp_ok = 1;
2359 break;
2360 }
2361 }
2362
2363 return 0;
2364}
2365
2366
2367
2368
2369static int do_tcp_setsockopt(struct sock *sk, int level,
2370 int optname, char __user *optval, unsigned int optlen)
2371{
2372 struct tcp_sock *tp = tcp_sk(sk);
2373 struct inet_connection_sock *icsk = inet_csk(sk);
2374 int val;
2375 int err = 0;
2376
2377
2378 switch (optname) {
2379 case TCP_CONGESTION: {
2380 char name[TCP_CA_NAME_MAX];
2381
2382 if (optlen < 1)
2383 return -EINVAL;
2384
2385 val = strncpy_from_user(name, optval,
2386 min_t(long, TCP_CA_NAME_MAX-1, optlen));
2387 if (val < 0)
2388 return -EFAULT;
2389 name[val] = 0;
2390
2391 lock_sock(sk);
2392 err = tcp_set_congestion_control(sk, name);
2393 release_sock(sk);
2394 return err;
2395 }
2396 case TCP_COOKIE_TRANSACTIONS: {
2397 struct tcp_cookie_transactions ctd;
2398 struct tcp_cookie_values *cvp = NULL;
2399
2400 if (sizeof(ctd) > optlen)
2401 return -EINVAL;
2402 if (copy_from_user(&ctd, optval, sizeof(ctd)))
2403 return -EFAULT;
2404
2405 if (ctd.tcpct_used > sizeof(ctd.tcpct_value) ||
2406 ctd.tcpct_s_data_desired > TCP_MSS_DESIRED)
2407 return -EINVAL;
2408
2409 if (ctd.tcpct_cookie_desired == 0) {
2410
2411 } else if ((0x1 & ctd.tcpct_cookie_desired) ||
2412 ctd.tcpct_cookie_desired > TCP_COOKIE_MAX ||
2413 ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) {
2414 return -EINVAL;
2415 }
2416
2417 if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) {
2418
2419 lock_sock(sk);
2420 if (tp->cookie_values != NULL) {
2421 kref_put(&tp->cookie_values->kref,
2422 tcp_cookie_values_release);
2423 tp->cookie_values = NULL;
2424 }
2425 tp->rx_opt.cookie_in_always = 0;
2426 tp->rx_opt.cookie_out_never = 1;
2427 release_sock(sk);
2428 return err;
2429 }
2430
2431
2432
2433 if (ctd.tcpct_used > 0 ||
2434 (tp->cookie_values == NULL &&
2435 (sysctl_tcp_cookie_size > 0 ||
2436 ctd.tcpct_cookie_desired > 0 ||
2437 ctd.tcpct_s_data_desired > 0))) {
2438 cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used,
2439 GFP_KERNEL);
2440 if (cvp == NULL)
2441 return -ENOMEM;
2442
2443 kref_init(&cvp->kref);
2444 }
2445 lock_sock(sk);
2446 tp->rx_opt.cookie_in_always =
2447 (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags);
2448 tp->rx_opt.cookie_out_never = 0;
2449
2450 if (tp->cookie_values != NULL) {
2451 if (cvp != NULL) {
2452
2453
2454
2455
2456 kref_put(&tp->cookie_values->kref,
2457 tcp_cookie_values_release);
2458 } else {
2459 cvp = tp->cookie_values;
2460 }
2461 }
2462
2463 if (cvp != NULL) {
2464 cvp->cookie_desired = ctd.tcpct_cookie_desired;
2465
2466 if (ctd.tcpct_used > 0) {
2467 memcpy(cvp->s_data_payload, ctd.tcpct_value,
2468 ctd.tcpct_used);
2469 cvp->s_data_desired = ctd.tcpct_used;
2470 cvp->s_data_constant = 1;
2471 } else {
2472
2473 cvp->s_data_desired = ctd.tcpct_s_data_desired;
2474 cvp->s_data_constant = 0;
2475 }
2476
2477 tp->cookie_values = cvp;
2478 }
2479 release_sock(sk);
2480 return err;
2481 }
2482 default:
2483
2484 break;
2485 }
2486
2487 if (optlen < sizeof(int))
2488 return -EINVAL;
2489
2490 if (get_user(val, (int __user *)optval))
2491 return -EFAULT;
2492
2493 lock_sock(sk);
2494
2495 switch (optname) {
2496 case TCP_MAXSEG:
2497
2498
2499
2500 if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
2501 err = -EINVAL;
2502 break;
2503 }
2504 tp->rx_opt.user_mss = val;
2505 break;
2506
2507 case TCP_NODELAY:
2508 if (val) {
2509
2510
2511
2512
2513
2514
2515
2516
2517 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2518 tcp_push_pending_frames(sk);
2519 } else {
2520 tp->nonagle &= ~TCP_NAGLE_OFF;
2521 }
2522 break;
2523
2524 case TCP_THIN_LINEAR_TIMEOUTS:
2525 if (val < 0 || val > 1)
2526 err = -EINVAL;
2527 else
2528 tp->thin_lto = val;
2529 break;
2530
2531 case TCP_THIN_DUPACK:
2532 if (val < 0 || val > 1)
2533 err = -EINVAL;
2534 else
2535 tp->thin_dupack = val;
2536 if (tp->thin_dupack)
2537 tcp_disable_early_retrans(tp);
2538 break;
2539
2540 case TCP_REPAIR:
2541 if (!tcp_can_repair_sock(sk))
2542 err = -EPERM;
2543 else if (val == 1) {
2544 tp->repair = 1;
2545 sk->sk_reuse = SK_FORCE_REUSE;
2546 tp->repair_queue = TCP_NO_QUEUE;
2547 } else if (val == 0) {
2548 tp->repair = 0;
2549 sk->sk_reuse = SK_NO_REUSE;
2550 tcp_send_window_probe(sk);
2551 } else
2552 err = -EINVAL;
2553
2554 break;
2555
2556 case TCP_REPAIR_QUEUE:
2557 if (!tp->repair)
2558 err = -EPERM;
2559 else if (val < TCP_QUEUES_NR)
2560 tp->repair_queue = val;
2561 else
2562 err = -EINVAL;
2563 break;
2564
2565 case TCP_QUEUE_SEQ:
2566 if (sk->sk_state != TCP_CLOSE)
2567 err = -EPERM;
2568 else if (tp->repair_queue == TCP_SEND_QUEUE)
2569 tp->write_seq = val;
2570 else if (tp->repair_queue == TCP_RECV_QUEUE)
2571 tp->rcv_nxt = val;
2572 else
2573 err = -EINVAL;
2574 break;
2575
2576 case TCP_REPAIR_OPTIONS:
2577 if (!tp->repair)
2578 err = -EINVAL;
2579 else if (sk->sk_state == TCP_ESTABLISHED)
2580 err = tcp_repair_options_est(tp,
2581 (struct tcp_repair_opt __user *)optval,
2582 optlen);
2583 else
2584 err = -EPERM;
2585 break;
2586
2587 case TCP_CORK:
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599 if (val) {
2600 tp->nonagle |= TCP_NAGLE_CORK;
2601 } else {
2602 tp->nonagle &= ~TCP_NAGLE_CORK;
2603 if (tp->nonagle&TCP_NAGLE_OFF)
2604 tp->nonagle |= TCP_NAGLE_PUSH;
2605 tcp_push_pending_frames(sk);
2606 }
2607 break;
2608
2609 case TCP_KEEPIDLE:
2610 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2611 err = -EINVAL;
2612 else {
2613 tp->keepalive_time = val * HZ;
2614 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2615 !((1 << sk->sk_state) &
2616 (TCPF_CLOSE | TCPF_LISTEN))) {
2617 u32 elapsed = keepalive_time_elapsed(tp);
2618 if (tp->keepalive_time > elapsed)
2619 elapsed = tp->keepalive_time - elapsed;
2620 else
2621 elapsed = 0;
2622 inet_csk_reset_keepalive_timer(sk, elapsed);
2623 }
2624 }
2625 break;
2626 case TCP_KEEPINTVL:
2627 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2628 err = -EINVAL;
2629 else
2630 tp->keepalive_intvl = val * HZ;
2631 break;
2632 case TCP_KEEPCNT:
2633 if (val < 1 || val > MAX_TCP_KEEPCNT)
2634 err = -EINVAL;
2635 else
2636 tp->keepalive_probes = val;
2637 break;
2638 case TCP_SYNCNT:
2639 if (val < 1 || val > MAX_TCP_SYNCNT)
2640 err = -EINVAL;
2641 else
2642 icsk->icsk_syn_retries = val;
2643 break;
2644
2645 case TCP_LINGER2:
2646 if (val < 0)
2647 tp->linger2 = -1;
2648 else if (val > sysctl_tcp_fin_timeout / HZ)
2649 tp->linger2 = 0;
2650 else
2651 tp->linger2 = val * HZ;
2652 break;
2653
2654 case TCP_DEFER_ACCEPT:
2655
2656 icsk->icsk_accept_queue.rskq_defer_accept =
2657 secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
2658 TCP_RTO_MAX / HZ);
2659 break;
2660
2661 case TCP_WINDOW_CLAMP:
2662 if (!val) {
2663 if (sk->sk_state != TCP_CLOSE) {
2664 err = -EINVAL;
2665 break;
2666 }
2667 tp->window_clamp = 0;
2668 } else
2669 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2670 SOCK_MIN_RCVBUF / 2 : val;
2671 break;
2672
2673 case TCP_QUICKACK:
2674 if (!val) {
2675 icsk->icsk_ack.pingpong = 1;
2676 } else {
2677 icsk->icsk_ack.pingpong = 0;
2678 if ((1 << sk->sk_state) &
2679 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2680 inet_csk_ack_scheduled(sk)) {
2681 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2682 tcp_cleanup_rbuf(sk, 1);
2683 if (!(val & 1))
2684 icsk->icsk_ack.pingpong = 1;
2685 }
2686 }
2687 break;
2688
2689#ifdef CONFIG_TCP_MD5SIG
2690 case TCP_MD5SIG:
2691
2692 err = tp->af_specific->md5_parse(sk, optval, optlen);
2693 break;
2694#endif
2695 case TCP_USER_TIMEOUT:
2696
2697
2698
2699 if (val < 0)
2700 err = -EINVAL;
2701 else
2702 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2703 break;
2704 default:
2705 err = -ENOPROTOOPT;
2706 break;
2707 }
2708
2709 release_sock(sk);
2710 return err;
2711}
2712
2713int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2714 unsigned int optlen)
2715{
2716 const struct inet_connection_sock *icsk = inet_csk(sk);
2717
2718 if (level != SOL_TCP)
2719 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
2720 optval, optlen);
2721 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2722}
2723EXPORT_SYMBOL(tcp_setsockopt);
2724
2725#ifdef CONFIG_COMPAT
2726int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2727 char __user *optval, unsigned int optlen)
2728{
2729 if (level != SOL_TCP)
2730 return inet_csk_compat_setsockopt(sk, level, optname,
2731 optval, optlen);
2732 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2733}
2734EXPORT_SYMBOL(compat_tcp_setsockopt);
2735#endif
2736
2737
2738void tcp_get_info(const struct sock *sk, struct tcp_info *info)
2739{
2740 const struct tcp_sock *tp = tcp_sk(sk);
2741 const struct inet_connection_sock *icsk = inet_csk(sk);
2742 u32 now = tcp_time_stamp;
2743
2744 memset(info, 0, sizeof(*info));
2745
2746 info->tcpi_state = sk->sk_state;
2747 info->tcpi_ca_state = icsk->icsk_ca_state;
2748 info->tcpi_retransmits = icsk->icsk_retransmits;
2749 info->tcpi_probes = icsk->icsk_probes_out;
2750 info->tcpi_backoff = icsk->icsk_backoff;
2751
2752 if (tp->rx_opt.tstamp_ok)
2753 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2754 if (tcp_is_sack(tp))
2755 info->tcpi_options |= TCPI_OPT_SACK;
2756 if (tp->rx_opt.wscale_ok) {
2757 info->tcpi_options |= TCPI_OPT_WSCALE;
2758 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2759 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2760 }
2761
2762 if (tp->ecn_flags & TCP_ECN_OK)
2763 info->tcpi_options |= TCPI_OPT_ECN;
2764 if (tp->ecn_flags & TCP_ECN_SEEN)
2765 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
2766
2767 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2768 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2769 info->tcpi_snd_mss = tp->mss_cache;
2770 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2771
2772 if (sk->sk_state == TCP_LISTEN) {
2773 info->tcpi_unacked = sk->sk_ack_backlog;
2774 info->tcpi_sacked = sk->sk_max_ack_backlog;
2775 } else {
2776 info->tcpi_unacked = tp->packets_out;
2777 info->tcpi_sacked = tp->sacked_out;
2778 }
2779 info->tcpi_lost = tp->lost_out;
2780 info->tcpi_retrans = tp->retrans_out;
2781 info->tcpi_fackets = tp->fackets_out;
2782
2783 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2784 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2785 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2786
2787 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2788 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2789 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2790 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2791 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2792 info->tcpi_snd_cwnd = tp->snd_cwnd;
2793 info->tcpi_advmss = tp->advmss;
2794 info->tcpi_reordering = tp->reordering;
2795
2796 info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2797 info->tcpi_rcv_space = tp->rcvq_space.space;
2798
2799 info->tcpi_total_retrans = tp->total_retrans;
2800}
2801EXPORT_SYMBOL_GPL(tcp_get_info);
2802
2803static int do_tcp_getsockopt(struct sock *sk, int level,
2804 int optname, char __user *optval, int __user *optlen)
2805{
2806 struct inet_connection_sock *icsk = inet_csk(sk);
2807 struct tcp_sock *tp = tcp_sk(sk);
2808 int val, len;
2809
2810 if (get_user(len, optlen))
2811 return -EFAULT;
2812
2813 len = min_t(unsigned int, len, sizeof(int));
2814
2815 if (len < 0)
2816 return -EINVAL;
2817
2818 switch (optname) {
2819 case TCP_MAXSEG:
2820 val = tp->mss_cache;
2821 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2822 val = tp->rx_opt.user_mss;
2823 if (tp->repair)
2824 val = tp->rx_opt.mss_clamp;
2825 break;
2826 case TCP_NODELAY:
2827 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2828 break;
2829 case TCP_CORK:
2830 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2831 break;
2832 case TCP_KEEPIDLE:
2833 val = keepalive_time_when(tp) / HZ;
2834 break;
2835 case TCP_KEEPINTVL:
2836 val = keepalive_intvl_when(tp) / HZ;
2837 break;
2838 case TCP_KEEPCNT:
2839 val = keepalive_probes(tp);
2840 break;
2841 case TCP_SYNCNT:
2842 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2843 break;
2844 case TCP_LINGER2:
2845 val = tp->linger2;
2846 if (val >= 0)
2847 val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2848 break;
2849 case TCP_DEFER_ACCEPT:
2850 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
2851 TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
2852 break;
2853 case TCP_WINDOW_CLAMP:
2854 val = tp->window_clamp;
2855 break;
2856 case TCP_INFO: {
2857 struct tcp_info info;
2858
2859 if (get_user(len, optlen))
2860 return -EFAULT;
2861
2862 tcp_get_info(sk, &info);
2863
2864 len = min_t(unsigned int, len, sizeof(info));
2865 if (put_user(len, optlen))
2866 return -EFAULT;
2867 if (copy_to_user(optval, &info, len))
2868 return -EFAULT;
2869 return 0;
2870 }
2871 case TCP_QUICKACK:
2872 val = !icsk->icsk_ack.pingpong;
2873 break;
2874
2875 case TCP_CONGESTION:
2876 if (get_user(len, optlen))
2877 return -EFAULT;
2878 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2879 if (put_user(len, optlen))
2880 return -EFAULT;
2881 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2882 return -EFAULT;
2883 return 0;
2884
2885 case TCP_COOKIE_TRANSACTIONS: {
2886 struct tcp_cookie_transactions ctd;
2887 struct tcp_cookie_values *cvp = tp->cookie_values;
2888
2889 if (get_user(len, optlen))
2890 return -EFAULT;
2891 if (len < sizeof(ctd))
2892 return -EINVAL;
2893
2894 memset(&ctd, 0, sizeof(ctd));
2895 ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ?
2896 TCP_COOKIE_IN_ALWAYS : 0)
2897 | (tp->rx_opt.cookie_out_never ?
2898 TCP_COOKIE_OUT_NEVER : 0);
2899
2900 if (cvp != NULL) {
2901 ctd.tcpct_flags |= (cvp->s_data_in ?
2902 TCP_S_DATA_IN : 0)
2903 | (cvp->s_data_out ?
2904 TCP_S_DATA_OUT : 0);
2905
2906 ctd.tcpct_cookie_desired = cvp->cookie_desired;
2907 ctd.tcpct_s_data_desired = cvp->s_data_desired;
2908
2909 memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
2910 cvp->cookie_pair_size);
2911 ctd.tcpct_used = cvp->cookie_pair_size;
2912 }
2913
2914 if (put_user(sizeof(ctd), optlen))
2915 return -EFAULT;
2916 if (copy_to_user(optval, &ctd, sizeof(ctd)))
2917 return -EFAULT;
2918 return 0;
2919 }
2920 case TCP_THIN_LINEAR_TIMEOUTS:
2921 val = tp->thin_lto;
2922 break;
2923 case TCP_THIN_DUPACK:
2924 val = tp->thin_dupack;
2925 break;
2926
2927 case TCP_REPAIR:
2928 val = tp->repair;
2929 break;
2930
2931 case TCP_REPAIR_QUEUE:
2932 if (tp->repair)
2933 val = tp->repair_queue;
2934 else
2935 return -EINVAL;
2936 break;
2937
2938 case TCP_QUEUE_SEQ:
2939 if (tp->repair_queue == TCP_SEND_QUEUE)
2940 val = tp->write_seq;
2941 else if (tp->repair_queue == TCP_RECV_QUEUE)
2942 val = tp->rcv_nxt;
2943 else
2944 return -EINVAL;
2945 break;
2946
2947 case TCP_USER_TIMEOUT:
2948 val = jiffies_to_msecs(icsk->icsk_user_timeout);
2949 break;
2950 default:
2951 return -ENOPROTOOPT;
2952 }
2953
2954 if (put_user(len, optlen))
2955 return -EFAULT;
2956 if (copy_to_user(optval, &val, len))
2957 return -EFAULT;
2958 return 0;
2959}
2960
2961int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2962 int __user *optlen)
2963{
2964 struct inet_connection_sock *icsk = inet_csk(sk);
2965
2966 if (level != SOL_TCP)
2967 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
2968 optval, optlen);
2969 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2970}
2971EXPORT_SYMBOL(tcp_getsockopt);
2972
2973#ifdef CONFIG_COMPAT
2974int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2975 char __user *optval, int __user *optlen)
2976{
2977 if (level != SOL_TCP)
2978 return inet_csk_compat_getsockopt(sk, level, optname,
2979 optval, optlen);
2980 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2981}
2982EXPORT_SYMBOL(compat_tcp_getsockopt);
2983#endif
2984
/*
 * tcp_tso_segment - split an oversized TCP skb into MSS-sized segments.
 * @skb:      packet to segment; its TCP header is pulled by this function
 * @features: device feature mask used to decide whether segmentation is
 *            needed and passed on to skb_segment()
 *
 * Returns:
 *  - ERR_PTR(-EINVAL) if the TCP header is malformed or cannot be pulled,
 *    if the payload is not larger than gso_size, or if gso_type is not a
 *    supported TCP type;
 *  - NULL if skb_gso_ok() accepts the packet as is (only gso_segs is
 *    recomputed);
 *  - the error pointer from skb_segment() if it fails;
 *  - otherwise the head of the list of segments, with sequence numbers,
 *    flags and checksums fixed up per segment.
 */
struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
	netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	struct tcphdr *th;
	unsigned int thlen;
	unsigned int seq;
	__be32 delta;
	unsigned int oldlen;
	unsigned int mss;

	/* Make sure the fixed part of the TCP header is accessible. */
	if (!pskb_may_pull(skb, sizeof(*th)))
		goto out;

	th = tcp_hdr(skb);
	/* doff counts 32-bit words; reject headers shorter than the minimum. */
	thlen = th->doff * 4;
	if (thlen < sizeof(*th))
		goto out;

	/* Now make the full header, options included, accessible. */
	if (!pskb_may_pull(skb, thlen))
		goto out;

	/*
	 * One's-complement of the original length (header still included);
	 * used below to adjust each segment's checksum incrementally.
	 */
	oldlen = (u16)~skb->len;
	__skb_pull(skb, thlen);

	mss = skb_shinfo(skb)->gso_size;
	/* Payload fits in a single segment: nothing to split. */
	if (unlikely(skb->len <= mss))
		goto out;

	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
		/* Packet is accepted as is: only validate and fix the GSO metadata. */
		int type = skb_shinfo(skb)->gso_type;

		/*
		 * Reject any gso_type bit outside the set listed here, and
		 * require at least one of TCPV4/TCPV6 to be set.
		 */
		if (unlikely(type &
			     ~(SKB_GSO_TCPV4 |
			       SKB_GSO_DODGY |
			       SKB_GSO_TCP_ECN |
			       SKB_GSO_TCPV6 |
			       0) ||
			     !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
			goto out;

		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);

		/* NULL tells the caller that no segment list was built. */
		segs = NULL;
		goto out;
	}

	segs = skb_segment(skb, features);
	if (IS_ERR(segs))
		goto out;

	/* Length delta for a full-sized segment: header plus one MSS. */
	delta = htonl(oldlen + (thlen + mss));

	skb = segs;
	th = tcp_hdr(skb);
	seq = ntohl(th->seq);

	/*
	 * Walk every segment except the last.  For the current segment FIN
	 * and PSH are cleared and the checksum is fixed up; then the walk
	 * advances and the next segment gets its sequence number and has
	 * CWR cleared.  As a result FIN/PSH are left untouched on the last
	 * segment and CWR on the first.
	 */
	do {
		th->fin = th->psh = 0;

		th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
				       (__force u32)delta));
		/* Without CHECKSUM_PARTIAL the full checksum is computed here. */
		if (skb->ip_summed != CHECKSUM_PARTIAL)
			th->check =
			     csum_fold(csum_partial(skb_transport_header(skb),
						    thlen, skb->csum));

		seq += mss;
		skb = skb->next;
		th = tcp_hdr(skb);

		th->seq = htonl(seq);
		th->cwr = 0;
	} while (skb->next);

	/*
	 * Last segment: it may be shorter than a full MSS, so its length
	 * delta is derived from its actual linear and paged data.
	 */
	delta = htonl(oldlen + (skb->tail - skb->transport_header) +
		      skb->data_len);
	th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
				(__force u32)delta));
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		th->check = csum_fold(csum_partial(skb_transport_header(skb),
						   thlen, skb->csum));

out:
	return segs;
}
3072EXPORT_SYMBOL(tcp_tso_segment);
3073
/*
 * TCP receive-side GRO hook: try to append @skb to a packet already held
 * on the @head list.
 *
 * @head: list of held packets, linked through ->next.
 * @skb:  newly received segment; its GRO offset must point at the TCP header.
 *
 * Returns NULL, or the address of the list slot holding the candidate
 * packet when that packet exists and either @skb was not marked same_flow
 * or the final flush test fired.  NOTE(review): presumably the GRO core
 * completes the packet found in the returned slot -- confirm with caller.
 * NAPI_GRO_CB(skb)->flush is OR-ed with the local verdict on every exit.
 */
struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct sk_buff *p;
	struct tcphdr *th;
	struct tcphdr *th2;
	unsigned int len;
	unsigned int thlen;
	__be32 flags;
	unsigned int mss = 1;
	unsigned int hlen;
	unsigned int off;
	/* Starts at 1 so that every early "goto out" marks @skb for flushing. */
	int flush = 1;
	int i;

	/* Get at the fixed-size TCP header, using the slow path if needed. */
	off = skb_gro_offset(skb);
	hlen = off + sizeof(*th);
	th = skb_gro_header_fast(skb, off);
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}

	/* doff counts 32-bit words; reject headers shorter than struct tcphdr. */
	thlen = th->doff * 4;
	if (thlen < sizeof(*th))
		goto out;

	/* Repeat the header access for the full length including options. */
	hlen = off + thlen;
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}

	skb_gro_pull(skb, thlen);

	/* Length remaining after the TCP header has been pulled. */
	len = skb_gro_len(skb);
	flags = tcp_flag_word(th);

	/* Look for a held packet still marked as belonging to the same flow. */
	for (; (p = *head); head = &p->next) {
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		th2 = tcp_hdr(p);

		/*
		 * Compare the 32 bits starting at the source port in one go
		 * (NOTE(review): i.e. source and destination port, assuming
		 * the usual struct tcphdr layout -- confirm).
		 */
		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		goto found;
	}

	/* No candidate: p is NULL here, so pp stays NULL below. */
	goto out_check_final;

found:
	/*
	 * Accumulate every reason the two segments cannot be merged: the held
	 * packet is already flagged, CWR is set on the new segment, any flag
	 * other than CWR/FIN/PSH differs, the ACK numbers differ, or any
	 * option word differs.
	 */
	flush = NAPI_GRO_CB(p)->flush;
	flush |= (__force int)(flags & TCP_FLAG_CWR);
	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
	flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
	for (i = sizeof(*th); i < thlen; i += 4)
		flush |= *(u32 *)((u8 *)th + i) ^
			 *(u32 *)((u8 *)th2 + i);

	mss = skb_shinfo(p)->gso_size;

	/* len is unsigned, so this also fires for len == 0 (wraps to UINT_MAX). */
	flush |= (len - 1) >= mss;
	/* The new segment must start exactly where the held data ends. */
	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);

	if (flush || skb_gro_receive(head, skb)) {
		/* Not merged: with mss == 1 the test below only fires on len == 0. */
		mss = 1;
		goto out_check_final;
	}

	/* Merged: re-read the slot and carry FIN/PSH over into the held header. */
	p = *head;
	th2 = tcp_hdr(p);
	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);

out_check_final:
	/* The verdict is recomputed from scratch: short segment or special flags. */
	flush = len < mss;
	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
					TCP_FLAG_RST | TCP_FLAG_SYN |
					TCP_FLAG_FIN));

	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
		pp = head;

out:
	NAPI_GRO_CB(skb)->flush |= flush;

	return pp;
}
EXPORT_SYMBOL(tcp_gro_receive);
3169
3170int tcp_gro_complete(struct sk_buff *skb)
3171{
3172 struct tcphdr *th = tcp_hdr(skb);
3173
3174 skb->csum_start = skb_transport_header(skb) - skb->head;
3175 skb->csum_offset = offsetof(struct tcphdr, check);
3176 skb->ip_summed = CHECKSUM_PARTIAL;
3177
3178 skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
3179
3180 if (th->cwr)
3181 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
3182
3183 return 0;
3184}
3185EXPORT_SYMBOL(tcp_gro_complete);
3186
3187#ifdef CONFIG_TCP_MD5SIG
/* Count of outstanding users of the shared MD5 pool (see alloc/get/free). */
static unsigned long tcp_md5sig_users;
/* The published per-CPU MD5 pool; NULL while no pool is installed. */
static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool;
/* Taken around every access to tcp_md5sig_users and tcp_md5sig_pool below. */
static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
3191
3192static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
3193{
3194 int cpu;
3195
3196 for_each_possible_cpu(cpu) {
3197 struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu);
3198
3199 if (p->md5_desc.tfm)
3200 crypto_free_hash(p->md5_desc.tfm);
3201 }
3202 free_percpu(pool);
3203}
3204
3205void tcp_free_md5sig_pool(void)
3206{
3207 struct tcp_md5sig_pool __percpu *pool = NULL;
3208
3209 spin_lock_bh(&tcp_md5sig_pool_lock);
3210 if (--tcp_md5sig_users == 0) {
3211 pool = tcp_md5sig_pool;
3212 tcp_md5sig_pool = NULL;
3213 }
3214 spin_unlock_bh(&tcp_md5sig_pool_lock);
3215 if (pool)
3216 __tcp_free_md5sig_pool(pool);
3217}
3218EXPORT_SYMBOL(tcp_free_md5sig_pool);
3219
3220static struct tcp_md5sig_pool __percpu *
3221__tcp_alloc_md5sig_pool(struct sock *sk)
3222{
3223 int cpu;
3224 struct tcp_md5sig_pool __percpu *pool;
3225
3226 pool = alloc_percpu(struct tcp_md5sig_pool);
3227 if (!pool)
3228 return NULL;
3229
3230 for_each_possible_cpu(cpu) {
3231 struct crypto_hash *hash;
3232
3233 hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
3234 if (!hash || IS_ERR(hash))
3235 goto out_free;
3236
3237 per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
3238 }
3239 return pool;
3240out_free:
3241 __tcp_free_md5sig_pool(pool);
3242 return NULL;
3243}
3244
/*
 * Take a user reference on the shared MD5 pool, creating the pool if this
 * is the first user.
 *
 * @sk: passed through to __tcp_alloc_md5sig_pool().
 *
 * Returns the per-CPU pool, or NULL if the pool had to be allocated and the
 * allocation failed (the user count is rolled back in that case).
 */
struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
{
	struct tcp_md5sig_pool __percpu *pool;
	bool alloc = false;

retry:
	spin_lock_bh(&tcp_md5sig_pool_lock);
	pool = tcp_md5sig_pool;
	if (tcp_md5sig_users++ == 0) {
		/* We are the first user, so we are responsible for allocating. */
		alloc = true;
		spin_unlock_bh(&tcp_md5sig_pool_lock);
	} else if (!pool) {
		/*
		 * There are users but no pool has been published yet: undo our
		 * increment, back off and try again.
		 */
		tcp_md5sig_users--;
		spin_unlock_bh(&tcp_md5sig_pool_lock);
		cpu_relax();
		goto retry;
	} else
		spin_unlock_bh(&tcp_md5sig_pool_lock);

	if (alloc) {
		/*
		 * The allocation runs with the lock dropped.
		 * NOTE(review): presumably because it may sleep -- confirm.
		 */
		struct tcp_md5sig_pool __percpu *p;

		p = __tcp_alloc_md5sig_pool(sk);
		spin_lock_bh(&tcp_md5sig_pool_lock);
		if (!p) {
			/* Allocation failed: give our reference back. */
			tcp_md5sig_users--;
			spin_unlock_bh(&tcp_md5sig_pool_lock);
			return NULL;
		}
		pool = tcp_md5sig_pool;
		if (pool) {
			/* A pool was published meanwhile: use it, discard ours. */
			spin_unlock_bh(&tcp_md5sig_pool_lock);
			__tcp_free_md5sig_pool(p);
		} else {
			/* Publish the freshly allocated pool. */
			tcp_md5sig_pool = pool = p;
			spin_unlock_bh(&tcp_md5sig_pool_lock);
		}
	}
	return pool;
}
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3298{
3299 struct tcp_md5sig_pool __percpu *p;
3300
3301 local_bh_disable();
3302
3303 spin_lock(&tcp_md5sig_pool_lock);
3304 p = tcp_md5sig_pool;
3305 if (p)
3306 tcp_md5sig_users++;
3307 spin_unlock(&tcp_md5sig_pool_lock);
3308
3309 if (p)
3310 return this_cpu_ptr(p);
3311
3312 local_bh_enable();
3313 return NULL;
3314}
3315EXPORT_SYMBOL(tcp_get_md5sig_pool);
3316
/*
 * Counterpart of tcp_get_md5sig_pool(): re-enable the bottom halves that
 * the get side left disabled, then drop one pool user through
 * tcp_free_md5sig_pool().
 */
void tcp_put_md5sig_pool(void)
{
	local_bh_enable();
	tcp_free_md5sig_pool();
}
EXPORT_SYMBOL(tcp_put_md5sig_pool);
3323
3324int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
3325 const struct tcphdr *th)
3326{
3327 struct scatterlist sg;
3328 struct tcphdr hdr;
3329 int err;
3330
3331
3332 memcpy(&hdr, th, sizeof(hdr));
3333 hdr.check = 0;
3334
3335
3336 sg_init_one(&sg, &hdr, sizeof(hdr));
3337 err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
3338 return err;
3339}
3340EXPORT_SYMBOL(tcp_md5_hash_header);
3341
3342int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3343 const struct sk_buff *skb, unsigned int header_len)
3344{
3345 struct scatterlist sg;
3346 const struct tcphdr *tp = tcp_hdr(skb);
3347 struct hash_desc *desc = &hp->md5_desc;
3348 unsigned int i;
3349 const unsigned int head_data_len = skb_headlen(skb) > header_len ?
3350 skb_headlen(skb) - header_len : 0;
3351 const struct skb_shared_info *shi = skb_shinfo(skb);
3352 struct sk_buff *frag_iter;
3353
3354 sg_init_table(&sg, 1);
3355
3356 sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
3357 if (crypto_hash_update(desc, &sg, head_data_len))
3358 return 1;
3359
3360 for (i = 0; i < shi->nr_frags; ++i) {
3361 const struct skb_frag_struct *f = &shi->frags[i];
3362 struct page *page = skb_frag_page(f);
3363 sg_set_page(&sg, page, skb_frag_size(f), f->page_offset);
3364 if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
3365 return 1;
3366 }
3367
3368 skb_walk_frags(skb, frag_iter)
3369 if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
3370 return 1;
3371
3372 return 0;
3373}
3374EXPORT_SYMBOL(tcp_md5_hash_skb_data);
3375
3376int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
3377{
3378 struct scatterlist sg;
3379
3380 sg_init_one(&sg, key->key, key->keylen);
3381 return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
3382}
3383EXPORT_SYMBOL(tcp_md5_hash_key);
3384
3385#endif
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
/*
 * One slot of cookie secret material.  tcp_cookie_generator() hands
 * @secrets out to callers and compares @expires against jiffies to decide
 * when a refresh is due.
 */
struct tcp_cookie_secret {
	/* COOKIE_WORKSPACE_WORDS 32-bit words of secret data. */
	u32 secrets[COOKIE_WORKSPACE_WORDS];
	/* jiffies value from which this secret counts as expired. */
	unsigned long expires;
};
3419
/* Lifetime offsets (in jiffies) that tcp_cookie_generator() adds to expiry. */
#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
#define TCP_SECRET_LIFE (HZ * 600)

/* The two storage slots that the pointers below refer to. */
static struct tcp_cookie_secret tcp_secret_one;
static struct tcp_cookie_secret tcp_secret_two;

/*
 * Role pointers into the two slots.  tcp_init() points "generating" and
 * "primary" at tcp_secret_one and "retiring" and "secondary" at
 * tcp_secret_two.  "generating" and "retiring" are updated with
 * rcu_assign_pointer() in tcp_cookie_generator().
 */
static struct tcp_cookie_secret *tcp_secret_generating;
static struct tcp_cookie_secret *tcp_secret_primary;
static struct tcp_cookie_secret *tcp_secret_retiring;
static struct tcp_cookie_secret *tcp_secret_secondary;

/* Held by tcp_cookie_generator() while it refreshes an expired secret. */
static DEFINE_SPINLOCK(tcp_secret_locker);
3434
3435
3436
3437static inline u32 tcp_cookie_work(const u32 *ws, const int n)
3438{
3439 return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])];
3440}
3441
3442
3443
3444
3445
3446int tcp_cookie_generator(u32 *bakery)
3447{
3448 unsigned long jiffy = jiffies;
3449
3450 if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
3451 spin_lock_bh(&tcp_secret_locker);
3452 if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
3453
3454 memcpy(bakery,
3455 &tcp_secret_generating->secrets[0],
3456 COOKIE_WORKSPACE_WORDS);
3457 } else {
3458
3459 get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS);
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470 if (unlikely(tcp_secret_primary->expires ==
3471 tcp_secret_secondary->expires)) {
3472 struct timespec tv;
3473
3474 getnstimeofday(&tv);
3475 bakery[COOKIE_DIGEST_WORDS+0] ^=
3476 (u32)tv.tv_nsec;
3477
3478 tcp_secret_secondary->expires = jiffy
3479 + TCP_SECRET_1MSL
3480 + (0x0f & tcp_cookie_work(bakery, 0));
3481 } else {
3482 tcp_secret_secondary->expires = jiffy
3483 + TCP_SECRET_LIFE
3484 + (0xff & tcp_cookie_work(bakery, 1));
3485 tcp_secret_primary->expires = jiffy
3486 + TCP_SECRET_2MSL
3487 + (0x1f & tcp_cookie_work(bakery, 2));
3488 }
3489 memcpy(&tcp_secret_secondary->secrets[0],
3490 bakery, COOKIE_WORKSPACE_WORDS);
3491
3492 rcu_assign_pointer(tcp_secret_generating,
3493 tcp_secret_secondary);
3494 rcu_assign_pointer(tcp_secret_retiring,
3495 tcp_secret_primary);
3496
3497
3498
3499
3500
3501
3502 }
3503 spin_unlock_bh(&tcp_secret_locker);
3504 } else {
3505 rcu_read_lock_bh();
3506 memcpy(bakery,
3507 &rcu_dereference(tcp_secret_generating)->secrets[0],
3508 COOKIE_WORKSPACE_WORDS);
3509 rcu_read_unlock_bh();
3510 }
3511 return 0;
3512}
3513EXPORT_SYMBOL(tcp_cookie_generator);
3514
3515void tcp_done(struct sock *sk)
3516{
3517 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3518 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3519
3520 tcp_set_state(sk, TCP_CLOSE);
3521 tcp_clear_xmit_timers(sk);
3522
3523 sk->sk_shutdown = SHUTDOWN_MASK;
3524
3525 if (!sock_flag(sk, SOCK_DEAD))
3526 sk->sk_state_change(sk);
3527 else
3528 inet_csk_destroy_sock(sk);
3529}
3530EXPORT_SYMBOL_GPL(tcp_done);
3531
/* Defined elsewhere; registered as a congestion control by tcp_init(). */
extern struct tcp_congestion_ops tcp_reno;
3533
3534static __initdata unsigned long thash_entries;
3535static int __init set_thash_entries(char *str)
3536{
3537 ssize_t ret;
3538
3539 if (!str)
3540 return 0;
3541
3542 ret = kstrtoul(str, 0, &thash_entries);
3543 if (ret)
3544 return 0;
3545
3546 return 1;
3547}
3548__setup("thash_entries=", set_thash_entries);
3549
3550void tcp_init_mem(struct net *net)
3551{
3552 unsigned long limit = nr_free_buffer_pages() / 8;
3553 limit = max(limit, 128UL);
3554 net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
3555 net->ipv4.sysctl_tcp_mem[1] = limit;
3556 net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
3557}
3558
/*
 * Boot-time initialisation of the TCP layer: counters, the bind-bucket slab
 * cache, the established and bind hash tables, default sysctl limits, the
 * default congestion control and the cookie secrets.
 */
void __init tcp_init(void)
{
	/* Only used as an operand of sizeof() in the BUILD_BUG_ON() below. */
	struct sk_buff *skb = NULL;
	unsigned long limit;
	int max_rshare, max_wshare, cnt;
	unsigned int i;
	/* Snapshot used as the initial expiry time of both cookie secrets. */
	unsigned long jiffy = jiffies;

	/* TCP's per-skb control block must fit into skb->cb. */
	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));

	percpu_counter_init(&tcp_sockets_allocated, 0);
	percpu_counter_init(&tcp_orphan_count, 0);
	tcp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("tcp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/*
	 * Established-connection hash.  thash_entries (set by the
	 * "thash_entries=" handler, 0 otherwise) is passed as the requested
	 * size; the final argument is 0 when it was given and 512 * 1024
	 * otherwise.  NOTE(review): argument meanings follow the
	 * alloc_large_system_hash() prototype -- confirm there.
	 */
	tcp_hashinfo.ehash =
		alloc_large_system_hash("TCP established",
					sizeof(struct inet_ehash_bucket),
					thash_entries,
					(totalram_pages >= 128 * 1024) ?
					13 : 15,
					0,
					NULL,
					&tcp_hashinfo.ehash_mask,
					0,
					thash_entries ? 0 : 512 * 1024);
	/* Every bucket gets two nulls-lists, each tagged with the bucket index. */
	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
	}
	if (inet_ehash_locks_alloc(&tcp_hashinfo))
		panic("TCP: failed to alloc ehash_locks");
	/* Bind hash, requested with as many entries as the established hash. */
	tcp_hashinfo.bhash =
		alloc_large_system_hash("TCP bind",
					sizeof(struct inet_bind_hashbucket),
					tcp_hashinfo.ehash_mask + 1,
					(totalram_pages >= 128 * 1024) ?
					13 : 15,
					0,
					&tcp_hashinfo.bhash_size,
					NULL,
					0,
					64 * 1024);
	/* bhash_size was filled in as a shift; turn it into a bucket count. */
	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
	}


	/* Derive several defaults from the number of established buckets. */
	cnt = tcp_hashinfo.ehash_mask + 1;

	tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	sysctl_tcp_max_orphans = cnt / 2;
	sysctl_max_syn_backlog = max(128, cnt / 256);

	tcp_init_mem(&init_net);

	/*
	 * Free buffer pages shifted by (PAGE_SHIFT - 7); this caps the maximum
	 * send buffer at 4MB and the maximum receive buffer at 6MB below.
	 */
	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
	max_wshare = min(4UL*1024*1024, limit);
	max_rshare = min(6UL*1024*1024, limit);

	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
	sysctl_tcp_wmem[1] = 16*1024;
	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);

	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
	sysctl_tcp_rmem[1] = 87380;
	sysctl_tcp_rmem[2] = max(87380, max_rshare);

	pr_info("Hash tables configured (established %u bind %u)\n",
		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);

	tcp_metrics_init();

	tcp_register_congestion_control(&tcp_reno);

	/*
	 * Cookie secrets start zeroed with both expiry times set to the jiffies
	 * snapshot taken on entry, so tcp_cookie_generator() sees them as
	 * expired and as having equal expiry times.
	 */
	memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
	memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
	tcp_secret_one.expires = jiffy;
	tcp_secret_two.expires = jiffy;
	tcp_secret_generating = &tcp_secret_one;
	tcp_secret_primary = &tcp_secret_one;
	tcp_secret_retiring = &tcp_secret_two;
	tcp_secret_secondary = &tcp_secret_two;
	tcp_tasklet_init();
}
3653