#define pr_fmt(fmt) "TCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/crypto.h>
#include <linux/time.h>
#include <linux/slab.h>

#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/netdma.h>
#include <net/sock.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>

int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;

struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

int sysctl_tcp_wmem[3] __read_mostly;
int sysctl_tcp_rmem[3] __read_mostly;

EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);

atomic_long_t tcp_memory_allocated;
EXPORT_SYMBOL(tcp_memory_allocated);

struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);

struct tcp_splice_state {
	struct pipe_inode_info *pipe;
	size_t len;
	unsigned int flags;
};

int tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL(tcp_memory_pressure);

void tcp_enter_memory_pressure(struct sock *sk)
{
	if (!tcp_memory_pressure) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
		tcp_memory_pressure = 1;
	}
}
EXPORT_SYMBOL(tcp_enter_memory_pressure);

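/*
 * Convert between a retransmission count and an approximate time in seconds,
 * assuming exponential backoff starting at @timeout and capped at @rto_max.
 * Used by TCP_DEFER_ACCEPT to translate the user supplied timeout.
 */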
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}

static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}

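/*
 * Initialize a freshly created TCP socket: receive/transmit queues, timers,
 * RTO and cwnd defaults, and per-socket memory accounting. Invoked through
 * the protocol's ->init hook at socket creation time.
 */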
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);
	INIT_LIST_HEAD(&tp->tsq_node);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	tp->snd_cwnd = TCP_INIT_CWND;

	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = sysctl_tcp_reordering;
	tcp_enable_early_retrans(tp);
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	if (sysctl_tcp_cookie_size > 0) {
		tp->cookie_values =
			kzalloc(sizeof(*tp->cookie_values),
				sk->sk_allocation);
		if (tp->cookie_values != NULL)
			kref_init(&tp->cookie_values->kref);
	}

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	local_bh_disable();
	sock_update_memcg(sk);
	sk_sockets_allocated_inc(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(tcp_init_sock);

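/*
 * Compute the poll/select event mask for a TCP socket. The state checks
 * below are intentionally lockless; poll() only returns a snapshot and the
 * caller must be prepared for the subsequent read/write to block or fail.
 */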
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	unsigned int mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);

	sock_poll_wait(file, sk_sleep(sk), wait);
	if (sk->sk_state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);

	mask = 0;

	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLIN | POLLRDNORM | POLLRDHUP;

	/* Connected or passive Fast Open socket? */
	if (sk->sk_state != TCP_SYN_SENT &&
	    (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
		int target = sock_rcvlowat(sk, 0, INT_MAX);

		if (tp->urg_seq == tp->copied_seq &&
		    !sock_flag(sk, SOCK_URGINLINE) &&
		    tp->urg_data)
			target++;

		if (tp->rcv_nxt - tp->copied_seq >= target)
			mask |= POLLIN | POLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				mask |= POLLOUT | POLLWRNORM;
			} else {
				set_bit(SOCK_ASYNC_NOSPACE,
					&sk->sk_socket->flags);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker: if space is freed after the
				 * wspace test but before the flags are set,
				 * the IO signal would be lost, so re-test.
				 */
				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
					mask |= POLLOUT | POLLWRNORM;
			}
		} else
			mask |= POLLOUT | POLLWRNORM;

		if (tp->urg_data & TCP_URG_VALID)
			mask |= POLLPRI;
	}

	/* This barrier is coupled with smp_wmb() in tcp_reset() */
	smp_rmb();
	if (sk->sk_err)
		mask |= POLLERR;

	return mask;
}
EXPORT_SYMBOL(tcp_poll);

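/*
 * ioctl(2) support. SIOCINQ reports bytes available for reading, SIOCATMARK
 * tests whether the next byte is the urgent mark, and SIOCOUTQ/SIOCOUTQNSD
 * report unacknowledged/unsent bytes in the write queue. A rough userspace
 * sketch (illustrative only):
 *
 *	int queued;
 *	if (ioctl(fd, SIOCINQ, &queued) == 0)
 *		printf("%d bytes readable\n", queued);
 */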
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int answ;
	bool slow;

	switch (cmd) {
	case SIOCINQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		slow = lock_sock_fast(sk);
		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else if (sock_flag(sk, SOCK_URGINLINE) ||
			 !tp->urg_data ||
			 before(tp->urg_seq, tp->copied_seq) ||
			 !before(tp->urg_seq, tp->rcv_nxt)) {

			answ = tp->rcv_nxt - tp->copied_seq;

			/* Subtract 1, if FIN was received */
			if (answ && sock_flag(sk, SOCK_DONE))
				answ--;
		} else
			answ = tp->urg_seq - tp->copied_seq;
		unlock_sock_fast(sk, slow);
		break;
	case SIOCATMARK:
		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
		break;
	case SIOCOUTQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_una;
		break;
	case SIOCOUTQNSD:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_nxt;
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}
EXPORT_SYMBOL(tcp_ioctl);

static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
	tp->pushed_seq = tp->write_seq;
}

static inline bool forced_push(const struct tcp_sock *tp)
{
	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}

static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	skb->csum = 0;
	tcb->seq = tcb->end_seq = tp->write_seq;
	tcb->tcp_flags = TCPHDR_ACK;
	tcb->sacked = 0;
	skb_header_release(skb);
	tcp_add_write_queue_tail(sk, skb);
	sk->sk_wmem_queued += skb->truesize;
	sk_mem_charge(sk, skb->truesize);
	if (tp->nonagle & TCP_NAGLE_PUSH)
		tp->nonagle &= ~TCP_NAGLE_PUSH;
}

static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
{
	if (flags & MSG_OOB)
		tp->snd_up = tp->write_seq;
}

static inline void tcp_push(struct sock *sk, int flags, int mss_now,
			    int nonagle)
{
	if (tcp_send_head(sk)) {
		struct tcp_sock *tp = tcp_sk(sk);

		if (!(flags & MSG_MORE) || forced_push(tp))
			tcp_mark_push(tp, tcp_write_queue_tail(sk));

		tcp_mark_urg(tp, flags);
		__tcp_push_pending_frames(sk, mss_now,
					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
	}
}

static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
				unsigned int offset, size_t len)
{
	struct tcp_splice_state *tss = rd_desc->arg.data;
	int ret;

	ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
			      tss->flags);
	if (ret > 0)
		rd_desc->count -= ret;
	return ret;
}

static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
{
	/* Store TCP splice context information in read_descriptor_t. */
	read_descriptor_t rd_desc = {
		.arg.data = tss,
		.count	  = tss->len,
	};

	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
}

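/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 *  @sock:	socket to splice from
 *  @ppos:	position (not valid)
 *  @pipe:	pipe to splice to
 *  @len:	number of bytes to splice
 *  @flags:	splice modifier flags
 *
 *  Reads pages of received data from the socket and fills them into a pipe,
 *  blocking or returning -EAGAIN according to the socket's receive timeout.
 */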
ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct tcp_splice_state tss = {
		.pipe = pipe,
		.len = len,
		.flags = flags,
	};
	long timeo;
	ssize_t spliced;
	int ret;

	sock_rps_record_flow(sk);

	/* We can't seek on a socket input */
	if (unlikely(*ppos))
		return -ESPIPE;

	ret = spliced = 0;

	lock_sock(sk);

	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
	while (tss.len) {
		ret = __tcp_splice_read(sk, &tss);
		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (sock_flag(sk, SOCK_DONE))
				break;
			if (sk->sk_err) {
				ret = sock_error(sk);
				break;
			}
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			if (sk->sk_state == TCP_CLOSE) {
				/*
				 * This occurs when the user tries to read
				 * from a never connected socket.
				 */
				if (!sock_flag(sk, SOCK_DONE))
					ret = -ENOTCONN;
				break;
			}
			if (!timeo) {
				ret = -EAGAIN;
				break;
			}
			sk_wait_data(sk, &timeo);
			if (signal_pending(current)) {
				ret = sock_intr_errno(timeo);
				break;
			}
			continue;
		}
		tss.len -= ret;
		spliced += ret;

		if (!timeo)
			break;
		release_sock(sk);
		lock_sock(sk);

		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current))
			break;
	}

	release_sock(sk);

	if (spliced)
		return spliced;

	return ret;
}
EXPORT_SYMBOL(tcp_splice_read);

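/*
 * Allocate an fclone skb for transmission with room reserved for the maximum
 * protocol header, charged against the socket's send-buffer accounting.
 * Returns NULL (and moderates sndbuf) when memory cannot be scheduled.
 */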
struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
{
	struct sk_buff *skb;

	/* The TCP header must be at least 32-bit aligned. */
	size = ALIGN(size, 4);

	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
	if (skb) {
		if (sk_wmem_schedule(sk, skb->truesize)) {
			skb_reserve(skb, sk->sk_prot->max_header);
			/*
			 * Make sure that we have exactly size bytes
			 * available to the caller, no more, no less.
			 */
			skb->reserved_tailroom = skb->end - skb->tail - size;
			return skb;
		}
		__kfree_skb(skb);
	} else {
		sk->sk_prot->enter_memory_pressure(sk);
		sk_stream_moderate_sndbuf(sk);
	}
	return NULL;
}

static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
				       int large_allowed)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 xmit_size_goal, old_size_goal;

	xmit_size_goal = mss_now;

	if (large_allowed && sk_can_gso(sk)) {
		xmit_size_goal = ((sk->sk_gso_max_size - 1) -
				  inet_csk(sk)->icsk_af_ops->net_header_len -
				  inet_csk(sk)->icsk_ext_hdr_len -
				  tp->tcp_header_len);

		/* TSQ: try to have at least two TSO segments in flight */
		xmit_size_goal = min_t(u32, xmit_size_goal,
				       sysctl_tcp_limit_output_bytes >> 1);

		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);

		/* We try hard to avoid divides here */
		old_size_goal = tp->xmit_size_goal_segs * mss_now;

		if (likely(old_size_goal <= xmit_size_goal &&
			   old_size_goal + mss_now > xmit_size_goal)) {
			xmit_size_goal = old_size_goal;
		} else {
			tp->xmit_size_goal_segs =
				min_t(u16, xmit_size_goal / mss_now,
				      sk->sk_gso_max_segs);
			xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
		}
	}

	return max(xmit_size_goal, mss_now);
}

static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
	int mss_now;

	mss_now = tcp_current_mss(sk);
	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));

	return mss_now;
}

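/*
 * Zero-copy transmit path backing sendfile()/sendpage(): page references are
 * attached to write-queue skbs as fragments instead of copying the payload,
 * so the route must support scatter-gather and hardware checksumming
 * (see tcp_sendpage() below for the fallback).
 */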
static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
				size_t size, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int mss_now, size_goal;
	int err;
	ssize_t copied;
	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto out_err;
	}

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	mss_now = tcp_send_mss(sk, &size_goal, flags);
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	while (size > 0) {
		struct sk_buff *skb = tcp_write_queue_tail(sk);
		int copy, i;
		bool can_coalesce;

		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
new_segment:
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
			if (!skb)
				goto wait_for_memory;

			skb_entail(sk, skb);
			copy = size_goal;
		}

		if (copy > size)
			copy = size;

		i = skb_shinfo(skb)->nr_frags;
		can_coalesce = skb_can_coalesce(skb, i, page, offset);
		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
			tcp_mark_push(tp, skb);
			goto new_segment;
		}
		if (!sk_wmem_schedule(sk, copy))
			goto wait_for_memory;

		if (can_coalesce) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
		} else {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, copy);
		}

		skb->len += copy;
		skb->data_len += copy;
		skb->truesize += copy;
		sk->sk_wmem_queued += copy;
		sk_mem_charge(sk, copy);
		skb->ip_summed = CHECKSUM_PARTIAL;
		tp->write_seq += copy;
		TCP_SKB_CB(skb)->end_seq += copy;
		skb_shinfo(skb)->gso_segs = 0;

		if (!copied)
			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

		copied += copy;
		offset += copy;
		if (!(size -= copy))
			goto out;

		if (skb->len < size_goal || (flags & MSG_OOB))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == tcp_send_head(sk))
			tcp_push_one(sk, mss_now);
		continue;

wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
			goto do_error;

		mss_now = tcp_send_mss(sk, &size_goal, flags);
	}

out:
	if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
		tcp_push(sk, flags, mss_now, tp->nonagle);
	return copied;

do_error:
	if (copied)
		goto out;
out_err:
	return sk_stream_error(sk, flags, err);
}

int tcp_sendpage(struct sock *sk, struct page *page, int offset,
		 size_t size, int flags)
{
	ssize_t res;

	if (!(sk->sk_route_caps & NETIF_F_SG) ||
	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
		return sock_no_sendpage(sk->sk_socket, page, offset, size,
					flags);

	lock_sock(sk);
	res = do_tcp_sendpages(sk, page, offset, size, flags);
	release_sock(sk);
	return res;
}
EXPORT_SYMBOL(tcp_sendpage);

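/*
 * select_size() decides how much linear headroom a new skb should get in
 * tcp_sendmsg(): with scatter-gather capable hardware only a small linear
 * area is needed (payload goes into page frags), otherwise try to fit a
 * whole MSS in the linear part.
 */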
static inline int select_size(const struct sock *sk, bool sg)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	int tmp = tp->mss_cache;

	if (sg) {
		if (sk_can_gso(sk)) {
			/* Small frames won't use a full page:
			 * payload will immediately follow the TCP header.
			 */
			tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
		} else {
			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);

			if (tmp >= pgbreak &&
			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
				tmp = pgbreak;
		}
	}

	return tmp;
}

void tcp_free_fastopen_req(struct tcp_sock *tp)
{
	if (tp->fastopen_req != NULL) {
		kfree(tp->fastopen_req);
		tp->fastopen_req = NULL;
	}
}

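/*
 * TCP Fast Open (client side): stash the message so the data can be carried
 * in the SYN, then start the connect. On return, *size tells the caller how
 * many bytes were consumed by the SYN.
 */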
static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int err, flags;

	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
		return -EOPNOTSUPP;
	if (tp->fastopen_req != NULL)
		return -EALREADY; /* Another Fast Open is in progress */

	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
				   sk->sk_allocation);
	if (unlikely(tp->fastopen_req == NULL))
		return -ENOBUFS;
	tp->fastopen_req->data = msg;

	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
				    msg->msg_namelen, flags);
	*size = tp->fastopen_req->copied;
	tcp_free_fastopen_req(tp);
	return err;
}

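/*
 * Main sendmsg() path: copy user data into skbs on the write queue (or into
 * per-socket page frags when the route supports scatter-gather) and push
 * frames out subject to Nagle/cork state. MSG_FASTOPEN hands the first chunk
 * of data to tcp_sendmsg_fastopen() so it can ride on the SYN.
 */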
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t size)
{
	struct iovec *iov;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int iovlen, flags, err, copied = 0;
	int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
	bool sg;
	long timeo;

	lock_sock(sk);

	flags = msg->msg_flags;
	if (flags & MSG_FASTOPEN) {
		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
		if (err == -EINPROGRESS && copied_syn > 0)
			goto out;
		else if (err)
			goto out_err;
		offset = copied_syn;
	}

	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto do_error;
	}

	if (unlikely(tp->repair)) {
		if (tp->repair_queue == TCP_RECV_QUEUE) {
			copied = tcp_send_rcvq(sk, msg, size);
			goto out;
		}

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out_err;

		/* 'common' sending to sendq */
	}

	/* This should be in poll */
	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	mss_now = tcp_send_mss(sk, &size_goal, flags);

	/* Ok commence sending. */
	iovlen = msg->msg_iovlen;
	iov = msg->msg_iov;
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	sg = !!(sk->sk_route_caps & NETIF_F_SG);

	while (--iovlen >= 0) {
		size_t seglen = iov->iov_len;
		unsigned char __user *from = iov->iov_base;

		iov++;
		if (unlikely(offset > 0)) {  /* Skip bytes already sent in the SYN */
			if (offset >= seglen) {
				offset -= seglen;
				continue;
			}
			seglen -= offset;
			from += offset;
			offset = 0;
		}

		while (seglen > 0) {
			int copy = 0;
			int max = size_goal;

			skb = tcp_write_queue_tail(sk);
			if (tcp_send_head(sk)) {
				if (skb->ip_summed == CHECKSUM_NONE)
					max = mss_now;
				copy = max - skb->len;
			}

			if (copy <= 0) {
new_segment:
				/* Allocate a new segment. If the interface is SG,
				 * allocate an skb fitting to a single page.
				 */
				if (!sk_stream_memory_free(sk))
					goto wait_for_sndbuf;

				skb = sk_stream_alloc_skb(sk,
							  select_size(sk, sg),
							  sk->sk_allocation);
				if (!skb)
					goto wait_for_memory;

				/*
				 * Check whether we can use HW checksum.
				 */
				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
					skb->ip_summed = CHECKSUM_PARTIAL;

				skb_entail(sk, skb);
				copy = size_goal;
				max = size_goal;
			}

			/* Try to append data to the end of skb. */
			if (copy > seglen)
				copy = seglen;

			/* Where to copy to? */
			if (skb_availroom(skb) > 0) {
				/* We have some space in skb head. Superb! */
				copy = min_t(int, copy, skb_availroom(skb));
				err = skb_add_data_nocache(sk, skb, from, copy);
				if (err)
					goto do_fault;
			} else {
				bool merge = true;
				int i = skb_shinfo(skb)->nr_frags;
				struct page_frag *pfrag = sk_page_frag(sk);

				if (!sk_page_frag_refill(sk, pfrag))
					goto wait_for_memory;

				if (!skb_can_coalesce(skb, i, pfrag->page,
						      pfrag->offset)) {
					if (i == MAX_SKB_FRAGS || !sg) {
						tcp_mark_push(tp, skb);
						goto new_segment;
					}
					merge = false;
				}

				copy = min_t(int, copy, pfrag->size - pfrag->offset);

				if (!sk_wmem_schedule(sk, copy))
					goto wait_for_memory;

				err = skb_copy_to_page_nocache(sk, from, skb,
							       pfrag->page,
							       pfrag->offset,
							       copy);
				if (err)
					goto do_error;

				/* Update the skb. */
				if (merge) {
					skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
				} else {
					skb_fill_page_desc(skb, i, pfrag->page,
							   pfrag->offset, copy);
					get_page(pfrag->page);
				}
				pfrag->offset += copy;
			}

			if (!copied)
				TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

			tp->write_seq += copy;
			TCP_SKB_CB(skb)->end_seq += copy;
			skb_shinfo(skb)->gso_segs = 0;

			from += copy;
			copied += copy;
			if ((seglen -= copy) == 0 && iovlen == 0)
				goto out;

			if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
				continue;

			if (forced_push(tp)) {
				tcp_mark_push(tp, skb);
				__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
			} else if (skb == tcp_send_head(sk))
				tcp_push_one(sk, mss_now);
			continue;

wait_for_sndbuf:
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
			if (copied)
				tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				goto do_error;

			mss_now = tcp_send_mss(sk, &size_goal, flags);
		}
	}

out:
	if (copied)
		tcp_push(sk, flags, mss_now, tp->nonagle);
	release_sock(sk);
	return copied + copied_syn;

do_fault:
	if (!skb->len) {
		tcp_unlink_write_queue(skb, sk);
		/* It is the one place in all of TCP, except connection
		 * reset, where we can be unlinking the send_head.
		 */
		tcp_check_send_head(sk, skb);
		sk_wmem_free_skb(sk, skb);
	}

do_error:
	if (copied + copied_syn)
		goto out;
out_err:
	err = sk_stream_error(sk, flags, err);
	release_sock(sk);
	return err;
}
EXPORT_SYMBOL(tcp_sendmsg);

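/*
 * Handle reading of urgent (out-of-band) data. TCP keeps at most one byte
 * of urgent data per connection; it can be consumed exactly once, either
 * inline (SO_OOBINLINE) or via recv(..., MSG_OOB).
 */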
1254static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1255{
1256 struct tcp_sock *tp = tcp_sk(sk);
1257
1258
1259 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1260 tp->urg_data == TCP_URG_READ)
1261 return -EINVAL;
1262
1263 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1264 return -ENOTCONN;
1265
1266 if (tp->urg_data & TCP_URG_VALID) {
1267 int err = 0;
1268 char c = tp->urg_data;
1269
1270 if (!(flags & MSG_PEEK))
1271 tp->urg_data = TCP_URG_READ;
1272
1273
1274 msg->msg_flags |= MSG_OOB;
1275
1276 if (len > 0) {
1277 if (!(flags & MSG_TRUNC))
1278 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1279 len = 1;
1280 } else
1281 msg->msg_flags |= MSG_TRUNC;
1282
1283 return err ? -EFAULT : len;
1284 }
1285
1286 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1287 return 0;
1288
1289
1290
1291
1292
1293
1294
1295 return -EAGAIN;
1296}
1297
1298static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1299{
1300 struct sk_buff *skb;
1301 int copied = 0, err = 0;
1302
1303
1304
1305 skb_queue_walk(&sk->sk_write_queue, skb) {
1306 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
1307 if (err)
1308 break;
1309
1310 copied += skb->len;
1311 }
1312
1313 return err ?: copied;
1314}
1315
1316
1317
1318
1319
1320
1321
1322void tcp_cleanup_rbuf(struct sock *sk, int copied)
1323{
1324 struct tcp_sock *tp = tcp_sk(sk);
1325 bool time_to_ack = false;
1326
1327 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1328
1329 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1330 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1331 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1332
1333 if (inet_csk_ack_scheduled(sk)) {
1334 const struct inet_connection_sock *icsk = inet_csk(sk);
1335
1336
1337 if (icsk->icsk_ack.blocked ||
1338
1339 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1340
1341
1342
1343
1344
1345
1346 (copied > 0 &&
1347 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1348 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1349 !icsk->icsk_ack.pingpong)) &&
1350 !atomic_read(&sk->sk_rmem_alloc)))
1351 time_to_ack = true;
1352 }
1353
1354
1355
1356
1357
1358
1359
1360 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1361 __u32 rcv_window_now = tcp_receive_window(tp);
1362
1363
1364 if (2*rcv_window_now <= tp->window_clamp) {
1365 __u32 new_window = __tcp_select_window(sk);
1366
1367
1368
1369
1370
1371
1372 if (new_window && new_window >= 2 * rcv_window_now)
1373 time_to_ack = true;
1374 }
1375 }
1376 if (time_to_ack)
1377 tcp_send_ack(sk);
1378}
1379
1380static void tcp_prequeue_process(struct sock *sk)
1381{
1382 struct sk_buff *skb;
1383 struct tcp_sock *tp = tcp_sk(sk);
1384
1385 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
1386
1387
1388
1389 local_bh_disable();
1390 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1391 sk_backlog_rcv(sk, skb);
1392 local_bh_enable();
1393
1394
1395 tp->ucopy.memory = 0;
1396}
1397
1398#ifdef CONFIG_NET_DMA
1399static void tcp_service_net_dma(struct sock *sk, bool wait)
1400{
1401 dma_cookie_t done, used;
1402 dma_cookie_t last_issued;
1403 struct tcp_sock *tp = tcp_sk(sk);
1404
1405 if (!tp->ucopy.dma_chan)
1406 return;
1407
1408 last_issued = tp->ucopy.dma_cookie;
1409 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1410
1411 do {
1412 if (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1413 last_issued, &done,
1414 &used) == DMA_SUCCESS) {
1415
1416 __skb_queue_purge(&sk->sk_async_wait_queue);
1417 break;
1418 } else {
1419 struct sk_buff *skb;
1420 while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1421 (dma_async_is_complete(skb->dma_cookie, done,
1422 used) == DMA_SUCCESS)) {
1423 __skb_dequeue(&sk->sk_async_wait_queue);
1424 kfree_skb(skb);
1425 }
1426 }
1427 } while (wait);
1428}
1429#endif
1430
1431static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1432{
1433 struct sk_buff *skb;
1434 u32 offset;
1435
1436 while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
1437 offset = seq - TCP_SKB_CB(skb)->seq;
1438 if (tcp_hdr(skb)->syn)
1439 offset--;
1440 if (offset < skb->len || tcp_hdr(skb)->fin) {
1441 *off = offset;
1442 return skb;
1443 }
1444
1445
1446
1447
1448 sk_eat_skb(sk, skb, false);
1449 }
1450 return NULL;
1451}
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1465 sk_read_actor_t recv_actor)
1466{
1467 struct sk_buff *skb;
1468 struct tcp_sock *tp = tcp_sk(sk);
1469 u32 seq = tp->copied_seq;
1470 u32 offset;
1471 int copied = 0;
1472
1473 if (sk->sk_state == TCP_LISTEN)
1474 return -ENOTCONN;
1475 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1476 if (offset < skb->len) {
1477 int used;
1478 size_t len;
1479
1480 len = skb->len - offset;
1481
1482 if (tp->urg_data) {
1483 u32 urg_offset = tp->urg_seq - seq;
1484 if (urg_offset < len)
1485 len = urg_offset;
1486 if (!len)
1487 break;
1488 }
1489 used = recv_actor(desc, skb, offset, len);
1490 if (used <= 0) {
1491 if (!copied)
1492 copied = used;
1493 break;
1494 } else if (used <= len) {
1495 seq += used;
1496 copied += used;
1497 offset += used;
1498 }
1499
1500
1501
1502
1503
1504 skb = tcp_recv_skb(sk, seq - 1, &offset);
1505 if (!skb)
1506 break;
1507
1508
1509
1510 if (offset + 1 != skb->len)
1511 continue;
1512 }
1513 if (tcp_hdr(skb)->fin) {
1514 sk_eat_skb(sk, skb, false);
1515 ++seq;
1516 break;
1517 }
1518 sk_eat_skb(sk, skb, false);
1519 if (!desc->count)
1520 break;
1521 tp->copied_seq = seq;
1522 }
1523 tp->copied_seq = seq;
1524
1525 tcp_rcv_space_adjust(sk);
1526
1527
1528 if (copied > 0) {
1529 tcp_recv_skb(sk, seq, &offset);
1530 tcp_cleanup_rbuf(sk, copied);
1531 }
1532 return copied;
1533}
1534EXPORT_SYMBOL(tcp_read_sock);
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1545 size_t len, int nonblock, int flags, int *addr_len)
1546{
1547 struct tcp_sock *tp = tcp_sk(sk);
1548 int copied = 0;
1549 u32 peek_seq;
1550 u32 *seq;
1551 unsigned long used;
1552 int err;
1553 int target;
1554 long timeo;
1555 struct task_struct *user_recv = NULL;
1556 bool copied_early = false;
1557 struct sk_buff *skb;
1558 u32 urg_hole = 0;
1559
1560 lock_sock(sk);
1561
1562 err = -ENOTCONN;
1563 if (sk->sk_state == TCP_LISTEN)
1564 goto out;
1565
1566 timeo = sock_rcvtimeo(sk, nonblock);
1567
1568
1569 if (flags & MSG_OOB)
1570 goto recv_urg;
1571
1572 if (unlikely(tp->repair)) {
1573 err = -EPERM;
1574 if (!(flags & MSG_PEEK))
1575 goto out;
1576
1577 if (tp->repair_queue == TCP_SEND_QUEUE)
1578 goto recv_sndq;
1579
1580 err = -EINVAL;
1581 if (tp->repair_queue == TCP_NO_QUEUE)
1582 goto out;
1583
1584
1585 }
1586
1587 seq = &tp->copied_seq;
1588 if (flags & MSG_PEEK) {
1589 peek_seq = tp->copied_seq;
1590 seq = &peek_seq;
1591 }
1592
1593 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1594
1595#ifdef CONFIG_NET_DMA
1596 tp->ucopy.dma_chan = NULL;
1597 preempt_disable();
1598 skb = skb_peek_tail(&sk->sk_receive_queue);
1599 {
1600 int available = 0;
1601
1602 if (skb)
1603 available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
1604 if ((available < target) &&
1605 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1606 !sysctl_tcp_low_latency &&
1607 net_dma_find_channel()) {
1608 preempt_enable_no_resched();
1609 tp->ucopy.pinned_list =
1610 dma_pin_iovec_pages(msg->msg_iov, len);
1611 } else {
1612 preempt_enable_no_resched();
1613 }
1614 }
1615#endif
1616
1617 do {
1618 u32 offset;
1619
1620
1621 if (tp->urg_data && tp->urg_seq == *seq) {
1622 if (copied)
1623 break;
1624 if (signal_pending(current)) {
1625 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1626 break;
1627 }
1628 }
1629
1630
1631
1632 skb_queue_walk(&sk->sk_receive_queue, skb) {
1633
1634
1635
1636 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
1637 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
1638 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1639 flags))
1640 break;
1641
1642 offset = *seq - TCP_SKB_CB(skb)->seq;
1643 if (tcp_hdr(skb)->syn)
1644 offset--;
1645 if (offset < skb->len)
1646 goto found_ok_skb;
1647 if (tcp_hdr(skb)->fin)
1648 goto found_fin_ok;
1649 WARN(!(flags & MSG_PEEK),
1650 "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
1651 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
1652 }
1653
1654
1655
1656 if (copied >= target && !sk->sk_backlog.tail)
1657 break;
1658
1659 if (copied) {
1660 if (sk->sk_err ||
1661 sk->sk_state == TCP_CLOSE ||
1662 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1663 !timeo ||
1664 signal_pending(current))
1665 break;
1666 } else {
1667 if (sock_flag(sk, SOCK_DONE))
1668 break;
1669
1670 if (sk->sk_err) {
1671 copied = sock_error(sk);
1672 break;
1673 }
1674
1675 if (sk->sk_shutdown & RCV_SHUTDOWN)
1676 break;
1677
1678 if (sk->sk_state == TCP_CLOSE) {
1679 if (!sock_flag(sk, SOCK_DONE)) {
1680
1681
1682
1683 copied = -ENOTCONN;
1684 break;
1685 }
1686 break;
1687 }
1688
1689 if (!timeo) {
1690 copied = -EAGAIN;
1691 break;
1692 }
1693
1694 if (signal_pending(current)) {
1695 copied = sock_intr_errno(timeo);
1696 break;
1697 }
1698 }
1699
1700 tcp_cleanup_rbuf(sk, copied);
1701
1702 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1703
1704 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1705 user_recv = current;
1706 tp->ucopy.task = user_recv;
1707 tp->ucopy.iov = msg->msg_iov;
1708 }
1709
1710 tp->ucopy.len = len;
1711
1712 WARN_ON(tp->copied_seq != tp->rcv_nxt &&
1713 !(flags & (MSG_PEEK | MSG_TRUNC)));
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741 if (!skb_queue_empty(&tp->ucopy.prequeue))
1742 goto do_prequeue;
1743
1744
1745 }
1746
1747#ifdef CONFIG_NET_DMA
1748 if (tp->ucopy.dma_chan) {
1749 if (tp->rcv_wnd == 0 &&
1750 !skb_queue_empty(&sk->sk_async_wait_queue)) {
1751 tcp_service_net_dma(sk, true);
1752 tcp_cleanup_rbuf(sk, copied);
1753 } else
1754 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1755 }
1756#endif
1757 if (copied >= target) {
1758
1759 release_sock(sk);
1760 lock_sock(sk);
1761 } else
1762 sk_wait_data(sk, &timeo);
1763
1764#ifdef CONFIG_NET_DMA
1765 tcp_service_net_dma(sk, false);
1766 tp->ucopy.wakeup = 0;
1767#endif
1768
1769 if (user_recv) {
1770 int chunk;
1771
1772
1773
1774 if ((chunk = len - tp->ucopy.len) != 0) {
1775 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1776 len -= chunk;
1777 copied += chunk;
1778 }
1779
1780 if (tp->rcv_nxt == tp->copied_seq &&
1781 !skb_queue_empty(&tp->ucopy.prequeue)) {
1782do_prequeue:
1783 tcp_prequeue_process(sk);
1784
1785 if ((chunk = len - tp->ucopy.len) != 0) {
1786 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1787 len -= chunk;
1788 copied += chunk;
1789 }
1790 }
1791 }
1792 if ((flags & MSG_PEEK) &&
1793 (peek_seq - copied - urg_hole != tp->copied_seq)) {
1794 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
1795 current->comm,
1796 task_pid_nr(current));
1797 peek_seq = tp->copied_seq;
1798 }
1799 continue;
1800
1801 found_ok_skb:
1802
1803 used = skb->len - offset;
1804 if (len < used)
1805 used = len;
1806
1807
1808 if (tp->urg_data) {
1809 u32 urg_offset = tp->urg_seq - *seq;
1810 if (urg_offset < used) {
1811 if (!urg_offset) {
1812 if (!sock_flag(sk, SOCK_URGINLINE)) {
1813 ++*seq;
1814 urg_hole++;
1815 offset++;
1816 used--;
1817 if (!used)
1818 goto skip_copy;
1819 }
1820 } else
1821 used = urg_offset;
1822 }
1823 }
1824
1825 if (!(flags & MSG_TRUNC)) {
1826#ifdef CONFIG_NET_DMA
1827 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1828 tp->ucopy.dma_chan = net_dma_find_channel();
1829
1830 if (tp->ucopy.dma_chan) {
1831 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1832 tp->ucopy.dma_chan, skb, offset,
1833 msg->msg_iov, used,
1834 tp->ucopy.pinned_list);
1835
1836 if (tp->ucopy.dma_cookie < 0) {
1837
1838 pr_alert("%s: dma_cookie < 0\n",
1839 __func__);
1840
1841
1842 if (!copied)
1843 copied = -EFAULT;
1844 break;
1845 }
1846
1847 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1848
1849 if ((offset + used) == skb->len)
1850 copied_early = true;
1851
1852 } else
1853#endif
1854 {
1855 err = skb_copy_datagram_iovec(skb, offset,
1856 msg->msg_iov, used);
1857 if (err) {
1858
1859 if (!copied)
1860 copied = -EFAULT;
1861 break;
1862 }
1863 }
1864 }
1865
1866 *seq += used;
1867 copied += used;
1868 len -= used;
1869
1870 tcp_rcv_space_adjust(sk);
1871
1872skip_copy:
1873 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1874 tp->urg_data = 0;
1875 tcp_fast_path_check(sk);
1876 }
1877 if (used + offset < skb->len)
1878 continue;
1879
1880 if (tcp_hdr(skb)->fin)
1881 goto found_fin_ok;
1882 if (!(flags & MSG_PEEK)) {
1883 sk_eat_skb(sk, skb, copied_early);
1884 copied_early = false;
1885 }
1886 continue;
1887
1888 found_fin_ok:
1889
1890 ++*seq;
1891 if (!(flags & MSG_PEEK)) {
1892 sk_eat_skb(sk, skb, copied_early);
1893 copied_early = false;
1894 }
1895 break;
1896 } while (len > 0);
1897
1898 if (user_recv) {
1899 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1900 int chunk;
1901
1902 tp->ucopy.len = copied > 0 ? len : 0;
1903
1904 tcp_prequeue_process(sk);
1905
1906 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1907 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1908 len -= chunk;
1909 copied += chunk;
1910 }
1911 }
1912
1913 tp->ucopy.task = NULL;
1914 tp->ucopy.len = 0;
1915 }
1916
1917#ifdef CONFIG_NET_DMA
1918 tcp_service_net_dma(sk, true);
1919 tp->ucopy.dma_chan = NULL;
1920
1921 if (tp->ucopy.pinned_list) {
1922 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1923 tp->ucopy.pinned_list = NULL;
1924 }
1925#endif
1926
1927
1928
1929
1930
1931
1932 tcp_cleanup_rbuf(sk, copied);
1933
1934 release_sock(sk);
1935 return copied;
1936
1937out:
1938 release_sock(sk);
1939 return err;
1940
1941recv_urg:
1942 err = tcp_recv_urg(sk, msg, len, flags);
1943 goto out;
1944
1945recv_sndq:
1946 err = tcp_peek_sndq(sk, msg, len);
1947 goto out;
1948}
1949EXPORT_SYMBOL(tcp_recvmsg);
1950
1951void tcp_set_state(struct sock *sk, int state)
1952{
1953 int oldstate = sk->sk_state;
1954
1955 switch (state) {
1956 case TCP_ESTABLISHED:
1957 if (oldstate != TCP_ESTABLISHED)
1958 TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1959 break;
1960
1961 case TCP_CLOSE:
1962 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
1963 TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
1964
1965 sk->sk_prot->unhash(sk);
1966 if (inet_csk(sk)->icsk_bind_hash &&
1967 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
1968 inet_put_port(sk);
1969
1970 default:
1971 if (oldstate == TCP_ESTABLISHED)
1972 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1973 }
1974
1975
1976
1977
1978 sk->sk_state = state;
1979
1980#ifdef STATE_TRACE
1981 SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
1982#endif
1983}
1984EXPORT_SYMBOL_GPL(tcp_set_state);
1985
1986
1987
1988
1989
1990
1991
1992
1993static const unsigned char new_state[16] = {
1994
1995 TCP_CLOSE,
1996 TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1997 TCP_CLOSE,
1998 TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1999 TCP_FIN_WAIT1,
2000 TCP_FIN_WAIT2,
2001 TCP_CLOSE,
2002 TCP_CLOSE,
2003 TCP_LAST_ACK | TCP_ACTION_FIN,
2004 TCP_LAST_ACK,
2005 TCP_CLOSE,
2006 TCP_CLOSING,
2007};
2008
2009static int tcp_close_state(struct sock *sk)
2010{
2011 int next = (int)new_state[sk->sk_state];
2012 int ns = next & TCP_STATE_MASK;
2013
2014 tcp_set_state(sk, ns);
2015
2016 return next & TCP_ACTION_FIN;
2017}
2018
2019
2020
2021
2022
2023
2024void tcp_shutdown(struct sock *sk, int how)
2025{
2026
2027
2028
2029
2030 if (!(how & SEND_SHUTDOWN))
2031 return;
2032
2033
2034 if ((1 << sk->sk_state) &
2035 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2036 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2037
2038 if (tcp_close_state(sk))
2039 tcp_send_fin(sk);
2040 }
2041}
2042EXPORT_SYMBOL(tcp_shutdown);
2043
2044bool tcp_check_oom(struct sock *sk, int shift)
2045{
2046 bool too_many_orphans, out_of_socket_memory;
2047
2048 too_many_orphans = tcp_too_many_orphans(sk, shift);
2049 out_of_socket_memory = tcp_out_of_memory(sk);
2050
2051 if (too_many_orphans)
2052 net_info_ratelimited("too many orphaned sockets\n");
2053 if (out_of_socket_memory)
2054 net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2055 return too_many_orphans || out_of_socket_memory;
2056}
2057
2058void tcp_close(struct sock *sk, long timeout)
2059{
2060 struct sk_buff *skb;
2061 int data_was_unread = 0;
2062 int state;
2063
2064 lock_sock(sk);
2065 sk->sk_shutdown = SHUTDOWN_MASK;
2066
2067 if (sk->sk_state == TCP_LISTEN) {
2068 tcp_set_state(sk, TCP_CLOSE);
2069
2070
2071 inet_csk_listen_stop(sk);
2072
2073 goto adjudge_to_death;
2074 }
2075
2076
2077
2078
2079
2080 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2081 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
2082 tcp_hdr(skb)->fin;
2083 data_was_unread += len;
2084 __kfree_skb(skb);
2085 }
2086
2087 sk_mem_reclaim(sk);
2088
2089
2090 if (sk->sk_state == TCP_CLOSE)
2091 goto adjudge_to_death;
2092
2093
2094
2095
2096
2097
2098
2099
2100 if (unlikely(tcp_sk(sk)->repair)) {
2101 sk->sk_prot->disconnect(sk, 0);
2102 } else if (data_was_unread) {
2103
2104 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2105 tcp_set_state(sk, TCP_CLOSE);
2106 tcp_send_active_reset(sk, sk->sk_allocation);
2107 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2108
2109 sk->sk_prot->disconnect(sk, 0);
2110 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2111 } else if (tcp_close_state(sk)) {
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141 tcp_send_fin(sk);
2142 }
2143
2144 sk_stream_wait_close(sk, timeout);
2145
2146adjudge_to_death:
2147 state = sk->sk_state;
2148 sock_hold(sk);
2149 sock_orphan(sk);
2150
2151
2152 release_sock(sk);
2153
2154
2155
2156
2157
2158 local_bh_disable();
2159 bh_lock_sock(sk);
2160 WARN_ON(sock_owned_by_user(sk));
2161
2162 percpu_counter_inc(sk->sk_prot->orphan_count);
2163
2164
2165 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2166 goto out;
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182 if (sk->sk_state == TCP_FIN_WAIT2) {
2183 struct tcp_sock *tp = tcp_sk(sk);
2184 if (tp->linger2 < 0) {
2185 tcp_set_state(sk, TCP_CLOSE);
2186 tcp_send_active_reset(sk, GFP_ATOMIC);
2187 NET_INC_STATS_BH(sock_net(sk),
2188 LINUX_MIB_TCPABORTONLINGER);
2189 } else {
2190 const int tmo = tcp_fin_time(sk);
2191
2192 if (tmo > TCP_TIMEWAIT_LEN) {
2193 inet_csk_reset_keepalive_timer(sk,
2194 tmo - TCP_TIMEWAIT_LEN);
2195 } else {
2196 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2197 goto out;
2198 }
2199 }
2200 }
2201 if (sk->sk_state != TCP_CLOSE) {
2202 sk_mem_reclaim(sk);
2203 if (tcp_check_oom(sk, 0)) {
2204 tcp_set_state(sk, TCP_CLOSE);
2205 tcp_send_active_reset(sk, GFP_ATOMIC);
2206 NET_INC_STATS_BH(sock_net(sk),
2207 LINUX_MIB_TCPABORTONMEMORY);
2208 }
2209 }
2210
2211 if (sk->sk_state == TCP_CLOSE) {
2212 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
2213
2214
2215
2216
2217 if (req != NULL)
2218 reqsk_fastopen_remove(sk, req, false);
2219 inet_csk_destroy_sock(sk);
2220 }
2221
2222
2223out:
2224 bh_unlock_sock(sk);
2225 local_bh_enable();
2226 sock_put(sk);
2227}
2228EXPORT_SYMBOL(tcp_close);
2229
2230
2231
2232static inline bool tcp_need_reset(int state)
2233{
2234 return (1 << state) &
2235 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2236 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2237}
2238
2239int tcp_disconnect(struct sock *sk, int flags)
2240{
2241 struct inet_sock *inet = inet_sk(sk);
2242 struct inet_connection_sock *icsk = inet_csk(sk);
2243 struct tcp_sock *tp = tcp_sk(sk);
2244 int err = 0;
2245 int old_state = sk->sk_state;
2246
2247 if (old_state != TCP_CLOSE)
2248 tcp_set_state(sk, TCP_CLOSE);
2249
2250
2251 if (old_state == TCP_LISTEN) {
2252 inet_csk_listen_stop(sk);
2253 } else if (unlikely(tp->repair)) {
2254 sk->sk_err = ECONNABORTED;
2255 } else if (tcp_need_reset(old_state) ||
2256 (tp->snd_nxt != tp->write_seq &&
2257 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2258
2259
2260
2261 tcp_send_active_reset(sk, gfp_any());
2262 sk->sk_err = ECONNRESET;
2263 } else if (old_state == TCP_SYN_SENT)
2264 sk->sk_err = ECONNRESET;
2265
2266 tcp_clear_xmit_timers(sk);
2267 __skb_queue_purge(&sk->sk_receive_queue);
2268 tcp_write_queue_purge(sk);
2269 __skb_queue_purge(&tp->out_of_order_queue);
2270#ifdef CONFIG_NET_DMA
2271 __skb_queue_purge(&sk->sk_async_wait_queue);
2272#endif
2273
2274 inet->inet_dport = 0;
2275
2276 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2277 inet_reset_saddr(sk);
2278
2279 sk->sk_shutdown = 0;
2280 sock_reset_flag(sk, SOCK_DONE);
2281 tp->srtt = 0;
2282 if ((tp->write_seq += tp->max_window + 2) == 0)
2283 tp->write_seq = 1;
2284 icsk->icsk_backoff = 0;
2285 tp->snd_cwnd = 2;
2286 icsk->icsk_probes_out = 0;
2287 tp->packets_out = 0;
2288 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2289 tp->snd_cwnd_cnt = 0;
2290 tp->bytes_acked = 0;
2291 tp->window_clamp = 0;
2292 tcp_set_ca_state(sk, TCP_CA_Open);
2293 tcp_clear_retrans(tp);
2294 inet_csk_delack_init(sk);
2295 tcp_init_send_head(sk);
2296 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2297 __sk_dst_reset(sk);
2298
2299 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
2300
2301 sk->sk_error_report(sk);
2302 return err;
2303}
2304EXPORT_SYMBOL(tcp_disconnect);
2305
2306void tcp_sock_destruct(struct sock *sk)
2307{
2308 inet_sock_destruct(sk);
2309
2310 kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
2311}
2312
2313static inline bool tcp_can_repair_sock(const struct sock *sk)
2314{
2315 return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
2316 ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
2317}
2318
2319static int tcp_repair_options_est(struct tcp_sock *tp,
2320 struct tcp_repair_opt __user *optbuf, unsigned int len)
2321{
2322 struct tcp_repair_opt opt;
2323
2324 while (len >= sizeof(opt)) {
2325 if (copy_from_user(&opt, optbuf, sizeof(opt)))
2326 return -EFAULT;
2327
2328 optbuf++;
2329 len -= sizeof(opt);
2330
2331 switch (opt.opt_code) {
2332 case TCPOPT_MSS:
2333 tp->rx_opt.mss_clamp = opt.opt_val;
2334 break;
2335 case TCPOPT_WINDOW:
2336 {
2337 u16 snd_wscale = opt.opt_val & 0xFFFF;
2338 u16 rcv_wscale = opt.opt_val >> 16;
2339
2340 if (snd_wscale > 14 || rcv_wscale > 14)
2341 return -EFBIG;
2342
2343 tp->rx_opt.snd_wscale = snd_wscale;
2344 tp->rx_opt.rcv_wscale = rcv_wscale;
2345 tp->rx_opt.wscale_ok = 1;
2346 }
2347 break;
2348 case TCPOPT_SACK_PERM:
2349 if (opt.opt_val != 0)
2350 return -EINVAL;
2351
2352 tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
2353 if (sysctl_tcp_fack)
2354 tcp_enable_fack(tp);
2355 break;
2356 case TCPOPT_TIMESTAMP:
2357 if (opt.opt_val != 0)
2358 return -EINVAL;
2359
2360 tp->rx_opt.tstamp_ok = 1;
2361 break;
2362 }
2363 }
2364
2365 return 0;
2366}
2367
2368
2369
2370
2371static int do_tcp_setsockopt(struct sock *sk, int level,
2372 int optname, char __user *optval, unsigned int optlen)
2373{
2374 struct tcp_sock *tp = tcp_sk(sk);
2375 struct inet_connection_sock *icsk = inet_csk(sk);
2376 int val;
2377 int err = 0;
2378
2379
2380 switch (optname) {
2381 case TCP_CONGESTION: {
2382 char name[TCP_CA_NAME_MAX];
2383
2384 if (optlen < 1)
2385 return -EINVAL;
2386
2387 val = strncpy_from_user(name, optval,
2388 min_t(long, TCP_CA_NAME_MAX-1, optlen));
2389 if (val < 0)
2390 return -EFAULT;
2391 name[val] = 0;
2392
2393 lock_sock(sk);
2394 err = tcp_set_congestion_control(sk, name);
2395 release_sock(sk);
2396 return err;
2397 }
2398 case TCP_COOKIE_TRANSACTIONS: {
2399 struct tcp_cookie_transactions ctd;
2400 struct tcp_cookie_values *cvp = NULL;
2401
2402 if (sizeof(ctd) > optlen)
2403 return -EINVAL;
2404 if (copy_from_user(&ctd, optval, sizeof(ctd)))
2405 return -EFAULT;
2406
2407 if (ctd.tcpct_used > sizeof(ctd.tcpct_value) ||
2408 ctd.tcpct_s_data_desired > TCP_MSS_DESIRED)
2409 return -EINVAL;
2410
2411 if (ctd.tcpct_cookie_desired == 0) {
2412
2413 } else if ((0x1 & ctd.tcpct_cookie_desired) ||
2414 ctd.tcpct_cookie_desired > TCP_COOKIE_MAX ||
2415 ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) {
2416 return -EINVAL;
2417 }
2418
2419 if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) {
2420
2421 lock_sock(sk);
2422 if (tp->cookie_values != NULL) {
2423 kref_put(&tp->cookie_values->kref,
2424 tcp_cookie_values_release);
2425 tp->cookie_values = NULL;
2426 }
2427 tp->rx_opt.cookie_in_always = 0;
2428 tp->rx_opt.cookie_out_never = 1;
2429 release_sock(sk);
2430 return err;
2431 }
2432
2433
2434
2435 if (ctd.tcpct_used > 0 ||
2436 (tp->cookie_values == NULL &&
2437 (sysctl_tcp_cookie_size > 0 ||
2438 ctd.tcpct_cookie_desired > 0 ||
2439 ctd.tcpct_s_data_desired > 0))) {
2440 cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used,
2441 GFP_KERNEL);
2442 if (cvp == NULL)
2443 return -ENOMEM;
2444
2445 kref_init(&cvp->kref);
2446 }
2447 lock_sock(sk);
2448 tp->rx_opt.cookie_in_always =
2449 (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags);
2450 tp->rx_opt.cookie_out_never = 0;
2451
2452 if (tp->cookie_values != NULL) {
2453 if (cvp != NULL) {
2454
2455
2456
2457
2458 kref_put(&tp->cookie_values->kref,
2459 tcp_cookie_values_release);
2460 } else {
2461 cvp = tp->cookie_values;
2462 }
2463 }
2464
2465 if (cvp != NULL) {
2466 cvp->cookie_desired = ctd.tcpct_cookie_desired;
2467
2468 if (ctd.tcpct_used > 0) {
2469 memcpy(cvp->s_data_payload, ctd.tcpct_value,
2470 ctd.tcpct_used);
2471 cvp->s_data_desired = ctd.tcpct_used;
2472 cvp->s_data_constant = 1;
2473 } else {
2474
2475 cvp->s_data_desired = ctd.tcpct_s_data_desired;
2476 cvp->s_data_constant = 0;
2477 }
2478
2479 tp->cookie_values = cvp;
2480 }
2481 release_sock(sk);
2482 return err;
2483 }
2484 default:
2485
2486 break;
2487 }
2488
2489 if (optlen < sizeof(int))
2490 return -EINVAL;
2491
2492 if (get_user(val, (int __user *)optval))
2493 return -EFAULT;
2494
2495 lock_sock(sk);
2496
2497 switch (optname) {
2498 case TCP_MAXSEG:
2499
2500
2501
2502 if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
2503 err = -EINVAL;
2504 break;
2505 }
2506 tp->rx_opt.user_mss = val;
2507 break;
2508
2509 case TCP_NODELAY:
2510 if (val) {
2511
2512
2513
2514
2515
2516
2517
2518
2519 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2520 tcp_push_pending_frames(sk);
2521 } else {
2522 tp->nonagle &= ~TCP_NAGLE_OFF;
2523 }
2524 break;
2525
2526 case TCP_THIN_LINEAR_TIMEOUTS:
2527 if (val < 0 || val > 1)
2528 err = -EINVAL;
2529 else
2530 tp->thin_lto = val;
2531 break;
2532
2533 case TCP_THIN_DUPACK:
2534 if (val < 0 || val > 1)
2535 err = -EINVAL;
2536 else
2537 tp->thin_dupack = val;
2538 if (tp->thin_dupack)
2539 tcp_disable_early_retrans(tp);
2540 break;
2541
2542 case TCP_REPAIR:
2543 if (!tcp_can_repair_sock(sk))
2544 err = -EPERM;
2545 else if (val == 1) {
2546 tp->repair = 1;
2547 sk->sk_reuse = SK_FORCE_REUSE;
2548 tp->repair_queue = TCP_NO_QUEUE;
2549 } else if (val == 0) {
2550 tp->repair = 0;
2551 sk->sk_reuse = SK_NO_REUSE;
2552 tcp_send_window_probe(sk);
2553 } else
2554 err = -EINVAL;
2555
2556 break;
2557
2558 case TCP_REPAIR_QUEUE:
2559 if (!tp->repair)
2560 err = -EPERM;
2561 else if (val < TCP_QUEUES_NR)
2562 tp->repair_queue = val;
2563 else
2564 err = -EINVAL;
2565 break;
2566
2567 case TCP_QUEUE_SEQ:
2568 if (sk->sk_state != TCP_CLOSE)
2569 err = -EPERM;
2570 else if (tp->repair_queue == TCP_SEND_QUEUE)
2571 tp->write_seq = val;
2572 else if (tp->repair_queue == TCP_RECV_QUEUE)
2573 tp->rcv_nxt = val;
2574 else
2575 err = -EINVAL;
2576 break;
2577
2578 case TCP_REPAIR_OPTIONS:
2579 if (!tp->repair)
2580 err = -EINVAL;
2581 else if (sk->sk_state == TCP_ESTABLISHED)
2582 err = tcp_repair_options_est(tp,
2583 (struct tcp_repair_opt __user *)optval,
2584 optlen);
2585 else
2586 err = -EPERM;
2587 break;
2588
2589 case TCP_CORK:
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601 if (val) {
2602 tp->nonagle |= TCP_NAGLE_CORK;
2603 } else {
2604 tp->nonagle &= ~TCP_NAGLE_CORK;
2605 if (tp->nonagle&TCP_NAGLE_OFF)
2606 tp->nonagle |= TCP_NAGLE_PUSH;
2607 tcp_push_pending_frames(sk);
2608 }
2609 break;
2610
2611 case TCP_KEEPIDLE:
2612 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2613 err = -EINVAL;
2614 else {
2615 tp->keepalive_time = val * HZ;
2616 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2617 !((1 << sk->sk_state) &
2618 (TCPF_CLOSE | TCPF_LISTEN))) {
2619 u32 elapsed = keepalive_time_elapsed(tp);
2620 if (tp->keepalive_time > elapsed)
2621 elapsed = tp->keepalive_time - elapsed;
2622 else
2623 elapsed = 0;
2624 inet_csk_reset_keepalive_timer(sk, elapsed);
2625 }
2626 }
2627 break;
2628 case TCP_KEEPINTVL:
2629 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2630 err = -EINVAL;
2631 else
2632 tp->keepalive_intvl = val * HZ;
2633 break;
2634 case TCP_KEEPCNT:
2635 if (val < 1 || val > MAX_TCP_KEEPCNT)
2636 err = -EINVAL;
2637 else
2638 tp->keepalive_probes = val;
2639 break;
2640 case TCP_SYNCNT:
2641 if (val < 1 || val > MAX_TCP_SYNCNT)
2642 err = -EINVAL;
2643 else
2644 icsk->icsk_syn_retries = val;
2645 break;
2646
2647 case TCP_LINGER2:
2648 if (val < 0)
2649 tp->linger2 = -1;
2650 else if (val > sysctl_tcp_fin_timeout / HZ)
2651 tp->linger2 = 0;
2652 else
2653 tp->linger2 = val * HZ;
2654 break;
2655
2656 case TCP_DEFER_ACCEPT:
2657
2658 icsk->icsk_accept_queue.rskq_defer_accept =
2659 secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
2660 TCP_RTO_MAX / HZ);
2661 break;
2662
2663 case TCP_WINDOW_CLAMP:
2664 if (!val) {
2665 if (sk->sk_state != TCP_CLOSE) {
2666 err = -EINVAL;
2667 break;
2668 }
2669 tp->window_clamp = 0;
2670 } else
2671 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2672 SOCK_MIN_RCVBUF / 2 : val;
2673 break;
2674
2675 case TCP_QUICKACK:
2676 if (!val) {
2677 icsk->icsk_ack.pingpong = 1;
2678 } else {
2679 icsk->icsk_ack.pingpong = 0;
2680 if ((1 << sk->sk_state) &
2681 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2682 inet_csk_ack_scheduled(sk)) {
2683 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2684 tcp_cleanup_rbuf(sk, 1);
2685 if (!(val & 1))
2686 icsk->icsk_ack.pingpong = 1;
2687 }
2688 }
2689 break;
2690
2691#ifdef CONFIG_TCP_MD5SIG
2692 case TCP_MD5SIG:
2693
2694 err = tp->af_specific->md5_parse(sk, optval, optlen);
2695 break;
2696#endif
2697 case TCP_USER_TIMEOUT:
2698
2699
2700
2701 if (val < 0)
2702 err = -EINVAL;
2703 else
2704 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2705 break;
2706
2707 case TCP_FASTOPEN:
2708 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
2709 TCPF_LISTEN)))
2710 err = fastopen_init_queue(sk, val);
2711 else
2712 err = -EINVAL;
2713 break;
2714 default:
2715 err = -ENOPROTOOPT;
2716 break;
2717 }
2718
2719 release_sock(sk);
2720 return err;
2721}
2722
2723int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2724 unsigned int optlen)
2725{
2726 const struct inet_connection_sock *icsk = inet_csk(sk);
2727
2728 if (level != SOL_TCP)
2729 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
2730 optval, optlen);
2731 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2732}
2733EXPORT_SYMBOL(tcp_setsockopt);
2734
2735#ifdef CONFIG_COMPAT
2736int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2737 char __user *optval, unsigned int optlen)
2738{
2739 if (level != SOL_TCP)
2740 return inet_csk_compat_setsockopt(sk, level, optname,
2741 optval, optlen);
2742 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2743}
2744EXPORT_SYMBOL(compat_tcp_setsockopt);
2745#endif
2746
2747
2748void tcp_get_info(const struct sock *sk, struct tcp_info *info)
2749{
2750 const struct tcp_sock *tp = tcp_sk(sk);
2751 const struct inet_connection_sock *icsk = inet_csk(sk);
2752 u32 now = tcp_time_stamp;
2753
2754 memset(info, 0, sizeof(*info));
2755
2756 info->tcpi_state = sk->sk_state;
2757 info->tcpi_ca_state = icsk->icsk_ca_state;
2758 info->tcpi_retransmits = icsk->icsk_retransmits;
2759 info->tcpi_probes = icsk->icsk_probes_out;
2760 info->tcpi_backoff = icsk->icsk_backoff;
2761
2762 if (tp->rx_opt.tstamp_ok)
2763 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2764 if (tcp_is_sack(tp))
2765 info->tcpi_options |= TCPI_OPT_SACK;
2766 if (tp->rx_opt.wscale_ok) {
2767 info->tcpi_options |= TCPI_OPT_WSCALE;
2768 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2769 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2770 }
2771
2772 if (tp->ecn_flags & TCP_ECN_OK)
2773 info->tcpi_options |= TCPI_OPT_ECN;
2774 if (tp->ecn_flags & TCP_ECN_SEEN)
2775 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
2776 if (tp->syn_data_acked)
2777 info->tcpi_options |= TCPI_OPT_SYN_DATA;
2778
2779 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2780 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2781 info->tcpi_snd_mss = tp->mss_cache;
2782 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2783
2784 if (sk->sk_state == TCP_LISTEN) {
2785 info->tcpi_unacked = sk->sk_ack_backlog;
2786 info->tcpi_sacked = sk->sk_max_ack_backlog;
2787 } else {
2788 info->tcpi_unacked = tp->packets_out;
2789 info->tcpi_sacked = tp->sacked_out;
2790 }
2791 info->tcpi_lost = tp->lost_out;
2792 info->tcpi_retrans = tp->retrans_out;
2793 info->tcpi_fackets = tp->fackets_out;
2794
2795 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2796 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2797 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2798
2799 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2800 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2801 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2802 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2803 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2804 info->tcpi_snd_cwnd = tp->snd_cwnd;
2805 info->tcpi_advmss = tp->advmss;
2806 info->tcpi_reordering = tp->reordering;
2807
2808 info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2809 info->tcpi_rcv_space = tp->rcvq_space.space;
2810
2811 info->tcpi_total_retrans = tp->total_retrans;
2812}
2813EXPORT_SYMBOL_GPL(tcp_get_info);
2814
2815static int do_tcp_getsockopt(struct sock *sk, int level,
2816 int optname, char __user *optval, int __user *optlen)
2817{
2818 struct inet_connection_sock *icsk = inet_csk(sk);
2819 struct tcp_sock *tp = tcp_sk(sk);
2820 int val, len;
2821
2822 if (get_user(len, optlen))
2823 return -EFAULT;
2824
2825 len = min_t(unsigned int, len, sizeof(int));
2826
2827 if (len < 0)
2828 return -EINVAL;
2829
2830 switch (optname) {
2831 case TCP_MAXSEG:
2832 val = tp->mss_cache;
2833 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2834 val = tp->rx_opt.user_mss;
2835 if (tp->repair)
2836 val = tp->rx_opt.mss_clamp;
2837 break;
2838 case TCP_NODELAY:
2839 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2840 break;
2841 case TCP_CORK:
2842 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2843 break;
2844 case TCP_KEEPIDLE:
2845 val = keepalive_time_when(tp) / HZ;
2846 break;
2847 case TCP_KEEPINTVL:
2848 val = keepalive_intvl_when(tp) / HZ;
2849 break;
2850 case TCP_KEEPCNT:
2851 val = keepalive_probes(tp);
2852 break;
2853 case TCP_SYNCNT:
2854 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2855 break;
2856 case TCP_LINGER2:
2857 val = tp->linger2;
2858 if (val >= 0)
2859 val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2860 break;
2861 case TCP_DEFER_ACCEPT:
2862 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
2863 TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
2864 break;
2865 case TCP_WINDOW_CLAMP:
2866 val = tp->window_clamp;
2867 break;
2868 case TCP_INFO: {
2869 struct tcp_info info;
2870
2871 if (get_user(len, optlen))
2872 return -EFAULT;
2873
2874 tcp_get_info(sk, &info);
2875
2876 len = min_t(unsigned int, len, sizeof(info));
2877 if (put_user(len, optlen))
2878 return -EFAULT;
2879 if (copy_to_user(optval, &info, len))
2880 return -EFAULT;
2881 return 0;
2882 }
2883 case TCP_QUICKACK:
2884 val = !icsk->icsk_ack.pingpong;
2885 break;
2886
2887 case TCP_CONGESTION:
2888 if (get_user(len, optlen))
2889 return -EFAULT;
2890 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2891 if (put_user(len, optlen))
2892 return -EFAULT;
2893 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2894 return -EFAULT;
2895 return 0;
2896
2897 case TCP_COOKIE_TRANSACTIONS: {
2898 struct tcp_cookie_transactions ctd;
2899 struct tcp_cookie_values *cvp = tp->cookie_values;
2900
2901 if (get_user(len, optlen))
2902 return -EFAULT;
2903 if (len < sizeof(ctd))
2904 return -EINVAL;
2905
2906 memset(&ctd, 0, sizeof(ctd));
2907 ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ?
2908 TCP_COOKIE_IN_ALWAYS : 0)
2909 | (tp->rx_opt.cookie_out_never ?
2910 TCP_COOKIE_OUT_NEVER : 0);
2911
2912 if (cvp != NULL) {
2913 ctd.tcpct_flags |= (cvp->s_data_in ?
2914 TCP_S_DATA_IN : 0)
2915 | (cvp->s_data_out ?
2916 TCP_S_DATA_OUT : 0);
2917
2918 ctd.tcpct_cookie_desired = cvp->cookie_desired;
2919 ctd.tcpct_s_data_desired = cvp->s_data_desired;
2920
2921 memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
2922 cvp->cookie_pair_size);
2923 ctd.tcpct_used = cvp->cookie_pair_size;
2924 }
2925
2926 if (put_user(sizeof(ctd), optlen))
2927 return -EFAULT;
2928 if (copy_to_user(optval, &ctd, sizeof(ctd)))
2929 return -EFAULT;
2930 return 0;
2931 }
2932 case TCP_THIN_LINEAR_TIMEOUTS:
2933 val = tp->thin_lto;
2934 break;
2935 case TCP_THIN_DUPACK:
2936 val = tp->thin_dupack;
2937 break;
2938
2939 case TCP_REPAIR:
2940 val = tp->repair;
2941 break;
2942
2943 case TCP_REPAIR_QUEUE:
2944 if (tp->repair)
2945 val = tp->repair_queue;
2946 else
2947 return -EINVAL;
2948 break;
2949
2950 case TCP_QUEUE_SEQ:
2951 if (tp->repair_queue == TCP_SEND_QUEUE)
2952 val = tp->write_seq;
2953 else if (tp->repair_queue == TCP_RECV_QUEUE)
2954 val = tp->rcv_nxt;
2955 else
2956 return -EINVAL;
2957 break;
2958
2959 case TCP_USER_TIMEOUT:
2960 val = jiffies_to_msecs(icsk->icsk_user_timeout);
2961 break;
2962 default:
2963 return -ENOPROTOOPT;
2964 }
2965
2966 if (put_user(len, optlen))
2967 return -EFAULT;
2968 if (copy_to_user(optval, &val, len))
2969 return -EFAULT;
2970 return 0;
2971}
2972
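/* As with tcp_setsockopt(), non-SOL_TCP levels are handled by the
 * address-family specific getsockopt handler.
 */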
2973int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2974 int __user *optlen)
2975{
2976 struct inet_connection_sock *icsk = inet_csk(sk);
2977
2978 if (level != SOL_TCP)
2979 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
2980 optval, optlen);
2981 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2982}
2983EXPORT_SYMBOL(tcp_getsockopt);
2984
2985#ifdef CONFIG_COMPAT
2986int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2987 char __user *optval, int __user *optlen)
2988{
2989 if (level != SOL_TCP)
2990 return inet_csk_compat_getsockopt(sk, level, optname,
2991 optval, optlen);
2992 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2993}
2994EXPORT_SYMBOL(compat_tcp_getsockopt);
2995#endif
2996
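/* GSO: split an oversized TCP skb into gso_size'd segments, fixing up
 * sequence numbers, FIN/PSH/CWR flags and checksums on each segment.
 */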
2997struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
2998 netdev_features_t features)
2999{
3000 struct sk_buff *segs = ERR_PTR(-EINVAL);
3001 struct tcphdr *th;
3002 unsigned int thlen;
3003 unsigned int seq;
3004 __be32 delta;
3005 unsigned int oldlen;
3006 unsigned int mss;
3007
3008 if (!pskb_may_pull(skb, sizeof(*th)))
3009 goto out;
3010
3011 th = tcp_hdr(skb);
3012 thlen = th->doff * 4;
3013 if (thlen < sizeof(*th))
3014 goto out;
3015
3016 if (!pskb_may_pull(skb, thlen))
3017 goto out;
3018
3019 oldlen = (u16)~skb->len;
3020 __skb_pull(skb, thlen);
3021
3022 mss = skb_shinfo(skb)->gso_size;
3023 if (unlikely(skb->len <= mss))
3024 goto out;
3025
3026 if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
 /* Packet is from an untrusted source, reset gso_segs. */
3028 int type = skb_shinfo(skb)->gso_type;
3029
3030 if (unlikely(type &
3031 ~(SKB_GSO_TCPV4 |
3032 SKB_GSO_DODGY |
3033 SKB_GSO_TCP_ECN |
3034 SKB_GSO_TCPV6 |
3035 0) ||
3036 !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
3037 goto out;
3038
3039 skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
3040
3041 segs = NULL;
3042 goto out;
3043 }
3044
3045 segs = skb_segment(skb, features);
3046 if (IS_ERR(segs))
3047 goto out;
3048
3049 delta = htonl(oldlen + (thlen + mss));
3050
3051 skb = segs;
3052 th = tcp_hdr(skb);
3053 seq = ntohl(th->seq);
3054
3055 do {
3056 th->fin = th->psh = 0;
3057
3058 th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
3059 (__force u32)delta));
3060 if (skb->ip_summed != CHECKSUM_PARTIAL)
3061 th->check =
3062 csum_fold(csum_partial(skb_transport_header(skb),
3063 thlen, skb->csum));
3064
3065 seq += mss;
3066 skb = skb->next;
3067 th = tcp_hdr(skb);
3068
3069 th->seq = htonl(seq);
3070 th->cwr = 0;
3071 } while (skb->next);
3072
3073 delta = htonl(oldlen + (skb->tail - skb->transport_header) +
3074 skb->data_len);
3075 th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
3076 (__force u32)delta));
3077 if (skb->ip_summed != CHECKSUM_PARTIAL)
3078 th->check = csum_fold(csum_partial(skb_transport_header(skb),
3079 thlen, skb->csum));
3080
3081out:
3082 return segs;
3083}
3084EXPORT_SYMBOL(tcp_tso_segment);
3085
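/* GRO receive: look for a held packet belonging to the same flow and try
 * to coalesce this segment with it; raise the flush hint when differing
 * headers or special flags (CWR, URG, RST, SYN, FIN) prevent merging.
 */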
3086struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
3087{
3088 struct sk_buff **pp = NULL;
3089 struct sk_buff *p;
3090 struct tcphdr *th;
3091 struct tcphdr *th2;
3092 unsigned int len;
3093 unsigned int thlen;
3094 __be32 flags;
3095 unsigned int mss = 1;
3096 unsigned int hlen;
3097 unsigned int off;
3098 int flush = 1;
3099 int i;
3100
3101 off = skb_gro_offset(skb);
3102 hlen = off + sizeof(*th);
3103 th = skb_gro_header_fast(skb, off);
3104 if (skb_gro_header_hard(skb, hlen)) {
3105 th = skb_gro_header_slow(skb, hlen, off);
3106 if (unlikely(!th))
3107 goto out;
3108 }
3109
3110 thlen = th->doff * 4;
3111 if (thlen < sizeof(*th))
3112 goto out;
3113
3114 hlen = off + thlen;
3115 if (skb_gro_header_hard(skb, hlen)) {
3116 th = skb_gro_header_slow(skb, hlen, off);
3117 if (unlikely(!th))
3118 goto out;
3119 }
3120
3121 skb_gro_pull(skb, thlen);
3122
3123 len = skb_gro_len(skb);
3124 flags = tcp_flag_word(th);
3125
3126 for (; (p = *head); head = &p->next) {
3127 if (!NAPI_GRO_CB(p)->same_flow)
3128 continue;
3129
3130 th2 = tcp_hdr(p);
3131
3132 if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
3133 NAPI_GRO_CB(p)->same_flow = 0;
3134 continue;
3135 }
3136
3137 goto found;
3138 }
3139
3140 goto out_check_final;
3141
3142found:
3143 flush = NAPI_GRO_CB(p)->flush;
3144 flush |= (__force int)(flags & TCP_FLAG_CWR);
3145 flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
3146 ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
3147 flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
3148 for (i = sizeof(*th); i < thlen; i += 4)
3149 flush |= *(u32 *)((u8 *)th + i) ^
3150 *(u32 *)((u8 *)th2 + i);
3151
3152 mss = skb_shinfo(p)->gso_size;
3153
3154 flush |= (len - 1) >= mss;
3155 flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
3156
3157 if (flush || skb_gro_receive(head, skb)) {
3158 mss = 1;
3159 goto out_check_final;
3160 }
3161
3162 p = *head;
3163 th2 = tcp_hdr(p);
3164 tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
3165
3166out_check_final:
3167 flush = len < mss;
3168 flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
3169 TCP_FLAG_RST | TCP_FLAG_SYN |
3170 TCP_FLAG_FIN));
3171
3172 if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
3173 pp = head;
3174
3175out:
3176 NAPI_GRO_CB(skb)->flush |= flush;
3177
3178 return pp;
3179}
3180EXPORT_SYMBOL(tcp_gro_receive);
3181
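/* Finish a merged GRO packet: point checksum offload at the TCP checksum
 * field, record the segment count and propagate CWR into the gso_type.
 */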
3182int tcp_gro_complete(struct sk_buff *skb)
3183{
3184 struct tcphdr *th = tcp_hdr(skb);
3185
3186 skb->csum_start = skb_transport_header(skb) - skb->head;
3187 skb->csum_offset = offsetof(struct tcphdr, check);
3188 skb->ip_summed = CHECKSUM_PARTIAL;
3189
3190 skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
3191
3192 if (th->cwr)
3193 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
3194
3195 return 0;
3196}
3197EXPORT_SYMBOL(tcp_gro_complete);
3198
3199#ifdef CONFIG_TCP_MD5SIG
3200static unsigned long tcp_md5sig_users;
3201static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool;
3202static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
3203
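/* Free the per-cpu hash transforms and the percpu pool itself. */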
3204static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
3205{
3206 int cpu;
3207
3208 for_each_possible_cpu(cpu) {
3209 struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu);
3210
3211 if (p->md5_desc.tfm)
3212 crypto_free_hash(p->md5_desc.tfm);
3213 }
3214 free_percpu(pool);
3215}
3216
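/* Drop one user reference; the shared pool is freed once the last user
 * is gone.
 */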
3217void tcp_free_md5sig_pool(void)
3218{
3219 struct tcp_md5sig_pool __percpu *pool = NULL;
3220
3221 spin_lock_bh(&tcp_md5sig_pool_lock);
3222 if (--tcp_md5sig_users == 0) {
3223 pool = tcp_md5sig_pool;
3224 tcp_md5sig_pool = NULL;
3225 }
3226 spin_unlock_bh(&tcp_md5sig_pool_lock);
3227 if (pool)
3228 __tcp_free_md5sig_pool(pool);
3229}
3230EXPORT_SYMBOL(tcp_free_md5sig_pool);
3231
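/* Allocate a percpu pool with an "md5" hash transform for each possible CPU. */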
3232static struct tcp_md5sig_pool __percpu *
3233__tcp_alloc_md5sig_pool(struct sock *sk)
3234{
3235 int cpu;
3236 struct tcp_md5sig_pool __percpu *pool;
3237
3238 pool = alloc_percpu(struct tcp_md5sig_pool);
3239 if (!pool)
3240 return NULL;
3241
3242 for_each_possible_cpu(cpu) {
3243 struct crypto_hash *hash;
3244
3245 hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
3246 if (!hash || IS_ERR(hash))
3247 goto out_free;
3248
3249 per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
3250 }
3251 return pool;
3252out_free:
3253 __tcp_free_md5sig_pool(pool);
3254 return NULL;
3255}
3256
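/* Take a reference on the shared MD5 pool, allocating it on first use.
 * The allocation may sleep, so it is performed with the spinlock dropped
 * and the result is re-checked against a concurrent allocator.
 */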
3257struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
3258{
3259 struct tcp_md5sig_pool __percpu *pool;
3260 bool alloc = false;
3261
3262retry:
3263 spin_lock_bh(&tcp_md5sig_pool_lock);
3264 pool = tcp_md5sig_pool;
3265 if (tcp_md5sig_users++ == 0) {
3266 alloc = true;
3267 spin_unlock_bh(&tcp_md5sig_pool_lock);
3268 } else if (!pool) {
3269 tcp_md5sig_users--;
3270 spin_unlock_bh(&tcp_md5sig_pool_lock);
3271 cpu_relax();
3272 goto retry;
3273 } else
3274 spin_unlock_bh(&tcp_md5sig_pool_lock);
3275
3276 if (alloc) {
 /* The allocation may sleep, so the spinlock cannot be held here. */
3278 struct tcp_md5sig_pool __percpu *p;
3279
3280 p = __tcp_alloc_md5sig_pool(sk);
3281 spin_lock_bh(&tcp_md5sig_pool_lock);
3282 if (!p) {
3283 tcp_md5sig_users--;
3284 spin_unlock_bh(&tcp_md5sig_pool_lock);
3285 return NULL;
3286 }
3287 pool = tcp_md5sig_pool;
3288 if (pool) {
 /* Another user installed a pool in the meantime; free ours. */
3290 spin_unlock_bh(&tcp_md5sig_pool_lock);
3291 __tcp_free_md5sig_pool(p);
3292 } else {
3293 tcp_md5sig_pool = pool = p;
3294 spin_unlock_bh(&tcp_md5sig_pool_lock);
3295 }
3296 }
3297 return pool;
3298}
3299EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
3300
/**
 *	tcp_get_md5sig_pool - get an md5sig_pool for this user
 *
 *	The pool is a percpu structure, so on success we return with BH
 *	(and thus preemption) disabled, making sure that another thread or
 *	softirq calling tcp_put_md5sig_pool() cannot free the pool under us.
 */
3309struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3310{
3311 struct tcp_md5sig_pool __percpu *p;
3312
3313 local_bh_disable();
3314
3315 spin_lock(&tcp_md5sig_pool_lock);
3316 p = tcp_md5sig_pool;
3317 if (p)
3318 tcp_md5sig_users++;
3319 spin_unlock(&tcp_md5sig_pool_lock);
3320
3321 if (p)
3322 return this_cpu_ptr(p);
3323
3324 local_bh_enable();
3325 return NULL;
3326}
3327EXPORT_SYMBOL(tcp_get_md5sig_pool);
3328
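/* Release a pool obtained with tcp_get_md5sig_pool(): re-enable BH and
 * drop the user reference.
 */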
3329void tcp_put_md5sig_pool(void)
3330{
3331 local_bh_enable();
3332 tcp_free_md5sig_pool();
3333}
3334EXPORT_SYMBOL(tcp_put_md5sig_pool);
3335
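/* Feed the fixed TCP header, with the checksum field zeroed, into the hash. */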
3336int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
3337 const struct tcphdr *th)
3338{
3339 struct scatterlist sg;
3340 struct tcphdr hdr;
3341 int err;
3342
 /* We are not allowed to change the tcphdr, so make a local copy. */
3344 memcpy(&hdr, th, sizeof(hdr));
3345 hdr.check = 0;
3346
 /* TCP options are not included in the hash, only the fixed header. */
3348 sg_init_one(&sg, &hdr, sizeof(hdr));
3349 err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
3350 return err;
3351}
3352EXPORT_SYMBOL(tcp_md5_hash_header);
3353
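/* Hash the skb payload beyond header_len: linear data, page fragments and
 * any frag_list skbs.
 */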
3354int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3355 const struct sk_buff *skb, unsigned int header_len)
3356{
3357 struct scatterlist sg;
3358 const struct tcphdr *tp = tcp_hdr(skb);
3359 struct hash_desc *desc = &hp->md5_desc;
3360 unsigned int i;
3361 const unsigned int head_data_len = skb_headlen(skb) > header_len ?
3362 skb_headlen(skb) - header_len : 0;
3363 const struct skb_shared_info *shi = skb_shinfo(skb);
3364 struct sk_buff *frag_iter;
3365
3366 sg_init_table(&sg, 1);
3367
3368 sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
3369 if (crypto_hash_update(desc, &sg, head_data_len))
3370 return 1;
3371
3372 for (i = 0; i < shi->nr_frags; ++i) {
3373 const struct skb_frag_struct *f = &shi->frags[i];
3374 struct page *page = skb_frag_page(f);
3375 sg_set_page(&sg, page, skb_frag_size(f), f->page_offset);
3376 if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
3377 return 1;
3378 }
3379
3380 skb_walk_frags(skb, frag_iter)
3381 if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
3382 return 1;
3383
3384 return 0;
3385}
3386EXPORT_SYMBOL(tcp_md5_hash_skb_data);
3387
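/* Feed the configured MD5 key material into the hash. */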
3388int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
3389{
3390 struct scatterlist sg;
3391
3392 sg_init_one(&sg, key->key, key->keylen);
3393 return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
3394}
3395EXPORT_SYMBOL(tcp_md5_hash_key);
3396
3397#endif
3398
/* Responder cookies are derived from two secret workspaces that are
 * rotated rather than replaced atomically.  The pointers below describe
 * the role each secret currently plays:
 *
 *   Generating (tcp_secret_generating != tcp_secret_primary):
 *	produces new cookies but is not yet the primary verifier.
 *   Primary (tcp_secret_generating == tcp_secret_primary):
 *	used both for generation and primary verification.
 *   Retiring (tcp_secret_retiring != tcp_secret_secondary):
 *	still accepted for verification while the new secret takes over.
 *   Secondary (tcp_secret_retiring == tcp_secret_secondary):
 *	accepted only after a primary verification failure, for at most
 *	2MSL, and then discarded.
 */
3422struct tcp_cookie_secret {
 /* Random workspace of COOKIE_WORKSPACE_WORDS 32-bit words used to
  * seed cookie generation; refreshed once 'expires' has passed.
  */
3428 u32 secrets[COOKIE_WORKSPACE_WORDS];
3429 unsigned long expires;
3430};
3431
3432#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
3433#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
3434#define TCP_SECRET_LIFE (HZ * 600)
3435
3436static struct tcp_cookie_secret tcp_secret_one;
3437static struct tcp_cookie_secret tcp_secret_two;
3438
/* The roles rotate between the two static secrets above; nothing is
 * dynamically allocated.
 */
3440static struct tcp_cookie_secret *tcp_secret_generating;
3441static struct tcp_cookie_secret *tcp_secret_primary;
3442static struct tcp_cookie_secret *tcp_secret_retiring;
3443static struct tcp_cookie_secret *tcp_secret_secondary;
3444
3445static DEFINE_SPINLOCK(tcp_secret_locker);
3446
/* Select a pseudo-randomly chosen word from the message part of the
 * cookie workspace.
 */
3449static inline u32 tcp_cookie_work(const u32 *ws, const int n)
3450{
3451 return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])];
3452}
3453
/* Fill bakery[COOKIE_WORKSPACE_WORDS] with the current generating secret,
 * refreshing the secrets first if they have expired.  Called in softirq
 * context.  Returns 0 on success.
 */
3458int tcp_cookie_generator(u32 *bakery)
3459{
3460 unsigned long jiffy = jiffies;
3461
3462 if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
3463 spin_lock_bh(&tcp_secret_locker);
3464 if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
 /* Already refreshed by another CPU while we waited for the lock. */
3466 memcpy(bakery,
3467 &tcp_secret_generating->secrets[0],
3468 COOKIE_WORKSPACE_WORDS);
3469 } else {
 /* Still needs refreshing. */
3471 get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS);
3472
 /* The first time through, both secrets still carry their
  * boot-time expiry value.  Mix the most volatile clock bits
  * available into the workspace and expire this first secret
  * extra quickly.
  */
3482 if (unlikely(tcp_secret_primary->expires ==
3483 tcp_secret_secondary->expires)) {
3484 struct timespec tv;
3485
3486 getnstimeofday(&tv);
3487 bakery[COOKIE_DIGEST_WORDS+0] ^=
3488 (u32)tv.tv_nsec;
3489
3490 tcp_secret_secondary->expires = jiffy
3491 + TCP_SECRET_1MSL
3492 + (0x0f & tcp_cookie_work(bakery, 0));
3493 } else {
3494 tcp_secret_secondary->expires = jiffy
3495 + TCP_SECRET_LIFE
3496 + (0xff & tcp_cookie_work(bakery, 1));
3497 tcp_secret_primary->expires = jiffy
3498 + TCP_SECRET_2MSL
3499 + (0x1f & tcp_cookie_work(bakery, 2));
3500 }
3501 memcpy(&tcp_secret_secondary->secrets[0],
3502 bakery, COOKIE_WORKSPACE_WORDS);
3503
3504 rcu_assign_pointer(tcp_secret_generating,
3505 tcp_secret_secondary);
3506 rcu_assign_pointer(tcp_secret_retiring,
3507 tcp_secret_primary);
3508
 /* Neither call_rcu() nor synchronize_rcu() is needed here: the
  * retiring secret is never freed, it is only reused after further
  * locked pointer updates and a quiet period.
  */
3514 }
3515 spin_unlock_bh(&tcp_secret_locker);
3516 } else {
3517 rcu_read_lock_bh();
3518 memcpy(bakery,
3519 &rcu_dereference(tcp_secret_generating)->secrets[0],
3520 COOKIE_WORKSPACE_WORDS);
3521 rcu_read_unlock_bh();
3522 }
3523 return 0;
3524}
3525EXPORT_SYMBOL(tcp_cookie_generator);
3526
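/* Move the socket to TCP_CLOSE: stop the transmit timers, detach a pending
 * Fast Open request, and either wake the owner or destroy an already
 * orphaned socket.
 */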
3527void tcp_done(struct sock *sk)
3528{
3529 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
3530
3531 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3532 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3533
3534 tcp_set_state(sk, TCP_CLOSE);
3535 tcp_clear_xmit_timers(sk);
3536 if (req != NULL)
3537 reqsk_fastopen_remove(sk, req, false);
3538
3539 sk->sk_shutdown = SHUTDOWN_MASK;
3540
3541 if (!sock_flag(sk, SOCK_DEAD))
3542 sk->sk_state_change(sk);
3543 else
3544 inet_csk_destroy_sock(sk);
3545}
3546EXPORT_SYMBOL_GPL(tcp_done);
3547
3548extern struct tcp_congestion_ops tcp_reno;
3549
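/* Optional override for the size of the established hash table, set with
 * the "thash_entries=" boot parameter.
 */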
3550static __initdata unsigned long thash_entries;
3551static int __init set_thash_entries(char *str)
3552{
3553 ssize_t ret;
3554
3555 if (!str)
3556 return 0;
3557
3558 ret = kstrtoul(str, 0, &thash_entries);
3559 if (ret)
3560 return 0;
3561
3562 return 1;
3563}
3564__setup("thash_entries=", set_thash_entries);
3565
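/* Derive the per-namespace tcp_mem[] pressure thresholds (in pages) from
 * the buffer memory currently available.
 */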
3566void tcp_init_mem(struct net *net)
3567{
3568 unsigned long limit = nr_free_buffer_pages() / 8;
3569 limit = max(limit, 128UL);
3570 net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
3571 net->ipv4.sysctl_tcp_mem[1] = limit;
3572 net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
3573}
3574
3575void __init tcp_init(void)
3576{
3577 struct sk_buff *skb = NULL;
3578 unsigned long limit;
3579 int max_rshare, max_wshare, cnt;
3580 unsigned int i;
3581 unsigned long jiffy = jiffies;
3582
3583 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
3584
3585 percpu_counter_init(&tcp_sockets_allocated, 0);
3586 percpu_counter_init(&tcp_orphan_count, 0);
3587 tcp_hashinfo.bind_bucket_cachep =
3588 kmem_cache_create("tcp_bind_bucket",
3589 sizeof(struct inet_bind_bucket), 0,
3590 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3591
 /* Size and allocate the main established and bind bucket hash
  * tables.  The methodology is similar to that of the buffer cache.
  */
3597 tcp_hashinfo.ehash =
3598 alloc_large_system_hash("TCP established",
3599 sizeof(struct inet_ehash_bucket),
3600 thash_entries,
3601 17,
3602 0,
3603 NULL,
3604 &tcp_hashinfo.ehash_mask,
3605 0,
3606 thash_entries ? 0 : 512 * 1024);
3607 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
3608 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
3609 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
3610 }
3611 if (inet_ehash_locks_alloc(&tcp_hashinfo))
3612 panic("TCP: failed to alloc ehash_locks");
3613 tcp_hashinfo.bhash =
3614 alloc_large_system_hash("TCP bind",
3615 sizeof(struct inet_bind_hashbucket),
3616 tcp_hashinfo.ehash_mask + 1,
3617 17,
3618 0,
3619 &tcp_hashinfo.bhash_size,
3620 NULL,
3621 0,
3622 64 * 1024);
3623 tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
3624 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
3625 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
3626 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
3627 }
3628
3629
3630 cnt = tcp_hashinfo.ehash_mask + 1;
3631
3632 tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3633 sysctl_tcp_max_orphans = cnt / 2;
3634 sysctl_max_syn_backlog = max(128, cnt / 256);
3635
3636 tcp_init_mem(&init_net);
 /* Set per-socket limits to no more than 1/128 the pressure threshold. */
3638 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
3639 max_wshare = min(4UL*1024*1024, limit);
3640 max_rshare = min(6UL*1024*1024, limit);
3641
3642 sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
3643 sysctl_tcp_wmem[1] = 16*1024;
3644 sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
3645
3646 sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
3647 sysctl_tcp_rmem[1] = 87380;
3648 sysctl_tcp_rmem[2] = max(87380, max_rshare);
3649
3650 pr_info("Hash tables configured (established %u bind %u)\n",
3651 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3652
3653 tcp_metrics_init();
3654
3655 tcp_register_congestion_control(&tcp_reno);
3656
3657 memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
3658 memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
3659 tcp_secret_one.expires = jiffy;
3660 tcp_secret_two.expires = jiffy;
3661 tcp_secret_generating = &tcp_secret_one;
3662 tcp_secret_primary = &tcp_secret_one;
3663 tcp_secret_retiring = &tcp_secret_two;
3664 tcp_secret_secondary = &tcp_secret_two;
3665 tcp_tasklet_init();
3666}
3667