1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248#define pr_fmt(fmt) "TCP: " fmt
249
250#include <linux/kernel.h>
251#include <linux/module.h>
252#include <linux/types.h>
253#include <linux/fcntl.h>
254#include <linux/poll.h>
255#include <linux/init.h>
256#include <linux/fs.h>
257#include <linux/skbuff.h>
258#include <linux/scatterlist.h>
259#include <linux/splice.h>
260#include <linux/net.h>
261#include <linux/socket.h>
262#include <linux/random.h>
263#include <linux/bootmem.h>
264#include <linux/highmem.h>
265#include <linux/swap.h>
266#include <linux/cache.h>
267#include <linux/err.h>
268#include <linux/crypto.h>
269#include <linux/time.h>
270#include <linux/slab.h>
271
272#include <net/icmp.h>
273#include <net/inet_common.h>
274#include <net/tcp.h>
275#include <net/xfrm.h>
276#include <net/ip.h>
277#include <net/netdma.h>
278#include <net/sock.h>
279
280#include <asm/uaccess.h>
281#include <asm/ioctls.h>
282
/* FIN timeout tunable; its compile-time default is TCP_FIN_TIMEOUT. */
int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;

/*
 * Per-cpu counter exported to GPL modules.
 * NOTE(review): by its name this counts orphaned TCP sockets - confirm
 * against the code that updates it (not visible in this part of the file).
 */
struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

/*
 * Three-element send (wmem) and receive (rmem) buffer tunables.
 * tcp_init_sock() copies element [1] of each into sk_sndbuf / sk_rcvbuf
 * of every new socket.
 */
int sysctl_tcp_wmem[3] __read_mostly;
int sysctl_tcp_rmem[3] __read_mostly;

EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);

/*
 * NOTE(review): presumably the amount of memory currently charged to TCP;
 * the unit is not visible here - confirm.
 */
atomic_long_t tcp_memory_allocated;
EXPORT_SYMBOL(tcp_memory_allocated);

/*
 * Per-cpu counter of allocated TCP sockets (per its name).
 * NOTE(review): tcp_init_sock() calls sk_sockets_allocated_inc(), which
 * presumably ends up incrementing this - confirm.
 */
struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);
302
303
304
305
/*
 * Context for one tcp_splice_read() call.  A pointer to it travels through
 * read_descriptor_t (arg.data) so tcp_splice_data_recv() can reach it.
 */
struct tcp_splice_state {
	struct pipe_inode_info *pipe;	/* destination handed to skb_splice_bits() */
	size_t len;			/* bytes still wanted; reduced after each round */
	unsigned int flags;		/* caller's splice flags, passed through unchanged */
};
311
312
313
314
315
316
317
/*
 * Global flag: non-zero once tcp_enter_memory_pressure() has latched it.
 * Nothing in the visible part of this file clears it again.
 */
int tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL(tcp_memory_pressure);
320
321void tcp_enter_memory_pressure(struct sock *sk)
322{
323 if (!tcp_memory_pressure) {
324 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
325 tcp_memory_pressure = 1;
326 }
327}
328EXPORT_SYMBOL(tcp_enter_memory_pressure);
329
330
/*
 * Convert a duration in seconds into a retransmission count.
 *
 * @seconds: duration to cover; values <= 0 yield 0.
 * @timeout: length of the first retransmission interval.
 * @rto_max: ceiling applied to every later (doubled) interval.
 *
 * The first interval lasts @timeout; each following one is twice the
 * previous, clamped to @rto_max.  The result is the number of intervals
 * needed for their sum to reach @seconds, saturating at 255.
 */
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 count;
	int elapsed;

	if (seconds <= 0)
		return 0;

	elapsed = timeout;
	for (count = 1; seconds > elapsed && count < 255; count++) {
		/* exponential backoff, capped */
		timeout <<= 1;
		if (timeout > rto_max)
			timeout = rto_max;
		elapsed += timeout;
	}
	return count;
}
349
350
/*
 * Convert a retransmission count back into a duration in seconds.
 *
 * @retrans: number of retransmission intervals; 0 yields 0.
 * @timeout: length of the first interval.
 * @rto_max: ceiling applied to every later (doubled) interval.
 *
 * Returns the sum of @retrans intervals, where the first lasts @timeout
 * and each following one is twice the previous, clamped to @rto_max.
 */
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int total;
	u8 remaining;

	if (retrans == 0)
		return 0;

	/* the first interval is taken as-is; the rest back off */
	total = timeout;
	for (remaining = retrans - 1; remaining != 0; remaining--) {
		timeout <<= 1;
		if (timeout > rto_max)
			timeout = rto_max;
		total += timeout;
	}
	return total;
}
366
367
368
369
370
371
/*
 * Put a new TCP socket into its initial state.
 *
 * @sk: socket to initialise; the TCP and connection-socket views of it are
 *      obtained with tcp_sk() and inet_csk().
 *
 * Sets up queues and timers, seeds timing / congestion fields with their
 * compile-time or sysctl defaults, installs the TCP callbacks, and accounts
 * the socket.  There is no return value: a failed cookie allocation simply
 * leaves tp->cookie_values as whatever kzalloc() returned (NULL).
 */
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* Queues, transmit timers, prequeue and the tsq list node. */
	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);
	INIT_LIST_HEAD(&tp->tsq_node);

	/* Both timing fields start from the initial timeout constant. */
	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* Initial congestion window. */
	tp->snd_cwnd = TCP_INIT_CWND;

	/*
	 * Slow-start threshold starts "infinite", the cwnd clamp is all-ones
	 * (i.e. no clamp), and the cached MSS is the protocol default.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = sysctl_tcp_reordering;
	tcp_enable_early_retrans(tp);
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	/* Stream-socket write-space callback; mark the socket as using the write queue. */
	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	/* Cookie state is only allocated when the cookie-size sysctl is positive. */
	if (sysctl_tcp_cookie_size > 0) {
		/* Zeroed allocation; the kref is initialised only on success. */
		tp->cookie_values =
			kzalloc(sizeof(*tp->cookie_values),
				sk->sk_allocation);
		if (tp->cookie_values != NULL)
			kref_init(&tp->cookie_values->kref);
	}

	/* Default buffer sizes come from element [1] of the wmem/rmem sysctls. */
	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	/* Socket accounting is done with bottom halves disabled. */
	local_bh_disable();
	sock_update_memcg(sk);
	sk_sockets_allocated_inc(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(tcp_init_sock);
432
433
434
435
436
437
438
439
/*
 * Compute the poll()/select() event mask for a TCP socket.
 *
 * @file, @wait: forwarded to sock_poll_wait() together with sk_sleep(sk).
 * @sock:        socket whose ->sk is inspected.
 *
 * Returns a bitmask of POLL* flags.  The socket lock is not taken here;
 * state, shutdown and sequence fields are sampled as they are, which is
 * why the write-space test below is repeated after the flags are set.
 */
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	unsigned int mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);

	sock_poll_wait(file, sk_sleep(sk), wait);
	/* Listening sockets have their own readiness rule. */
	if (sk->sk_state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);

	mask = 0;

	/*
	 * POLLHUP only when both directions are shut down or the socket is
	 * closed.  A receive-side shutdown alone reports readable + POLLRDHUP.
	 */
	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLIN | POLLRDNORM | POLLRDHUP;

	/*
	 * Data readiness is evaluated for every state except SYN_SENT, and
	 * for SYN_RECV only when a fastopen request socket is attached.
	 */
	if (sk->sk_state != TCP_SYN_SENT &&
	    (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
		int target = sock_rcvlowat(sk, 0, INT_MAX);

		/*
		 * If the next byte to read is the urgent byte and urgent data
		 * is not delivered inline, one more byte is required before
		 * the socket counts as readable.
		 */
		if (tp->urg_seq == tp->copied_seq &&
		    !sock_flag(sk, SOCK_URGINLINE) &&
		    tp->urg_data)
			target++;

		/* Readable once the unread span reaches the low-water target. */
		if (tp->rcv_nxt - tp->copied_seq >= target)
			mask |= POLLIN | POLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				mask |= POLLOUT | POLLWRNORM;
			} else {
				/* Not writable: ask to be notified when space appears. */
				set_bit(SOCK_ASYNC_NOSPACE,
					&sk->sk_socket->flags);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/*
				 * Re-test after setting the flags: space freed
				 * between the first test and the set_bit() calls
				 * would otherwise go unnoticed.
				 */
				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
					mask |= POLLOUT | POLLWRNORM;
			}
		} else
			/* Send side already shut down: always reported writable. */
			mask |= POLLOUT | POLLWRNORM;

		if (tp->urg_data & TCP_URG_VALID)
			mask |= POLLPRI;
	}

	/*
	 * Read barrier before sampling sk_err.
	 * NOTE(review): presumably pairs with a write barrier on the path
	 * that sets sk_err - confirm which one.
	 */
	smp_rmb();
	if (sk->sk_err)
		mask |= POLLERR;

	return mask;
}
EXPORT_SYMBOL(tcp_poll);
534
535int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
536{
537 struct tcp_sock *tp = tcp_sk(sk);
538 int answ;
539
540 switch (cmd) {
541 case SIOCINQ:
542 if (sk->sk_state == TCP_LISTEN)
543 return -EINVAL;
544
545 lock_sock(sk);
546 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
547 answ = 0;
548 else if (sock_flag(sk, SOCK_URGINLINE) ||
549 !tp->urg_data ||
550 before(tp->urg_seq, tp->copied_seq) ||
551 !before(tp->urg_seq, tp->rcv_nxt)) {
552
553 answ = tp->rcv_nxt - tp->copied_seq;
554
555
556 if (answ && sock_flag(sk, SOCK_DONE))
557 answ--;
558 } else
559 answ = tp->urg_seq - tp->copied_seq;
560 release_sock(sk);
561 break;
562 case SIOCATMARK:
563 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
564 break;
565 case SIOCOUTQ:
566 if (sk->sk_state == TCP_LISTEN)
567 return -EINVAL;
568
569 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
570 answ = 0;
571 else
572 answ = tp->write_seq - tp->snd_una;
573 break;
574 case SIOCOUTQNSD:
575 if (sk->sk_state == TCP_LISTEN)
576 return -EINVAL;
577
578 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
579 answ = 0;
580 else
581 answ = tp->write_seq - tp->snd_nxt;
582 break;
583 default:
584 return -ENOIOCTLCMD;
585 }
586
587 return put_user(answ, (int __user *)arg);
588}
589EXPORT_SYMBOL(tcp_ioctl);
590
591static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
592{
593 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
594 tp->pushed_seq = tp->write_seq;
595}
596
597static inline bool forced_push(const struct tcp_sock *tp)
598{
599 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
600}
601
602static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
603{
604 struct tcp_sock *tp = tcp_sk(sk);
605 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
606
607 skb->csum = 0;
608 tcb->seq = tcb->end_seq = tp->write_seq;
609 tcb->tcp_flags = TCPHDR_ACK;
610 tcb->sacked = 0;
611 skb_header_release(skb);
612 tcp_add_write_queue_tail(sk, skb);
613 sk->sk_wmem_queued += skb->truesize;
614 sk_mem_charge(sk, skb->truesize);
615 if (tp->nonagle & TCP_NAGLE_PUSH)
616 tp->nonagle &= ~TCP_NAGLE_PUSH;
617}
618
619static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
620{
621 if (flags & MSG_OOB)
622 tp->snd_up = tp->write_seq;
623}
624
625static inline void tcp_push(struct sock *sk, int flags, int mss_now,
626 int nonagle)
627{
628 if (tcp_send_head(sk)) {
629 struct tcp_sock *tp = tcp_sk(sk);
630
631 if (!(flags & MSG_MORE) || forced_push(tp))
632 tcp_mark_push(tp, tcp_write_queue_tail(sk));
633
634 tcp_mark_urg(tp, flags);
635 __tcp_push_pending_frames(sk, mss_now,
636 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
637 }
638}
639
640static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
641 unsigned int offset, size_t len)
642{
643 struct tcp_splice_state *tss = rd_desc->arg.data;
644 int ret;
645
646 ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
647 tss->flags);
648 if (ret > 0)
649 rd_desc->count -= ret;
650 return ret;
651}
652
653static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
654{
655
656 read_descriptor_t rd_desc = {
657 .arg.data = tss,
658 .count = tss->len,
659 };
660
661 return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
662}
663
664
665
666
667
668
669
670
671
672
673
674
675
/*
 * Splice data from a TCP socket into a pipe.
 *
 * @sock:  socket to read from.
 * @ppos:  must point to 0; any other position yields -ESPIPE.
 * @pipe:  destination pipe.
 * @len:   maximum number of bytes to move.
 * @flags: splice flags, passed through to skb_splice_bits().
 *
 * Returns the number of bytes spliced if any were moved; otherwise 0 or a
 * negative errno describing why nothing could be read.
 */
ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct tcp_splice_state tss = {
		.pipe = pipe,
		.len = len,
		.flags = flags,
	};
	long timeo;
	ssize_t spliced;
	int ret;

	sock_rps_record_flow(sk);

	/* A non-zero position is rejected. */
	if (unlikely(*ppos))
		return -ESPIPE;

	ret = spliced = 0;

	lock_sock(sk);

	/* Zero timeout when the file is O_NONBLOCK. */
	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
	while (tss.len) {
		ret = __tcp_splice_read(sk, &tss);
		if (ret < 0)
			break;
		else if (!ret) {
			/*
			 * Nothing moved in this round.  Stop if data was
			 * already spliced or the socket can never deliver
			 * more; otherwise wait for data.
			 */
			if (spliced)
				break;
			if (sock_flag(sk, SOCK_DONE))
				break;
			if (sk->sk_err) {
				ret = sock_error(sk);
				break;
			}
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			if (sk->sk_state == TCP_CLOSE) {
				/*
				 * Closed without SOCK_DONE: reported as
				 * "not connected".
				 */
				if (!sock_flag(sk, SOCK_DONE))
					ret = -ENOTCONN;
				break;
			}
			if (!timeo) {
				ret = -EAGAIN;
				break;
			}
			sk_wait_data(sk, &timeo);
			if (signal_pending(current)) {
				ret = sock_intr_errno(timeo);
				break;
			}
			continue;
		}
		tss.len -= ret;
		spliced += ret;

		/* With a zero timeout, return after the first successful round. */
		if (!timeo)
			break;
		/* Drop and retake the socket lock between rounds. */
		release_sock(sk);
		lock_sock(sk);

		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current))
			break;
	}

	release_sock(sk);

	/* Partial success takes precedence over a later error. */
	if (spliced)
		return spliced;

	return ret;
}
EXPORT_SYMBOL(tcp_splice_read);
759
760struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
761{
762 struct sk_buff *skb;
763
764
765 size = ALIGN(size, 4);
766
767 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
768 if (skb) {
769 if (sk_wmem_schedule(sk, skb->truesize)) {
770 skb_reserve(skb, sk->sk_prot->max_header);
771
772
773
774
775 skb->avail_size = size;
776 return skb;
777 }
778 __kfree_skb(skb);
779 } else {
780 sk->sk_prot->enter_memory_pressure(sk);
781 sk_stream_moderate_sndbuf(sk);
782 }
783 return NULL;
784}
785
786static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
787 int large_allowed)
788{
789 struct tcp_sock *tp = tcp_sk(sk);
790 u32 xmit_size_goal, old_size_goal;
791
792 xmit_size_goal = mss_now;
793
794 if (large_allowed && sk_can_gso(sk)) {
795 xmit_size_goal = ((sk->sk_gso_max_size - 1) -
796 inet_csk(sk)->icsk_af_ops->net_header_len -
797 inet_csk(sk)->icsk_ext_hdr_len -
798 tp->tcp_header_len);
799
800
801 xmit_size_goal = min_t(u32, xmit_size_goal,
802 sysctl_tcp_limit_output_bytes >> 1);
803
804 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
805
806
807 old_size_goal = tp->xmit_size_goal_segs * mss_now;
808
809 if (likely(old_size_goal <= xmit_size_goal &&
810 old_size_goal + mss_now > xmit_size_goal)) {
811 xmit_size_goal = old_size_goal;
812 } else {
813 tp->xmit_size_goal_segs =
814 min_t(u16, xmit_size_goal / mss_now,
815 sk->sk_gso_max_segs);
816 xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
817 }
818 }
819
820 return max(xmit_size_goal, mss_now);
821}
822
823static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
824{
825 int mss_now;
826
827 mss_now = tcp_current_mss(sk);
828 *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
829
830 return mss_now;
831}
832
/*
 * Queue @size bytes of @page, starting at @offset, on the socket's write
 * queue by reference (the page is attached as an skb fragment, not
 * copied).
 *
 * The only visible caller, tcp_sendpage(), holds the socket lock.
 *
 * Returns the number of bytes queued if any were, otherwise the result of
 * sk_stream_error() for the failure.
 */
static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
				size_t size, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int mss_now, size_goal;
	int err;
	ssize_t copied;
	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/*
	 * Outside ESTABLISHED / CLOSE_WAIT, wait for the connection to
	 * complete, unless this is a passive fastopen socket.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto out_err;
	}

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	mss_now = tcp_send_mss(sk, &size_goal, flags);
	copied = 0;

	/* A pending error or a send-side shutdown fails with -EPIPE. */
	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	while (size > 0) {
		struct sk_buff *skb = tcp_write_queue_tail(sk);
		int copy, i;
		bool can_coalesce;

		/*
		 * Start a new skb when nothing is pending or the tail skb
		 * has already reached the size goal.
		 */
		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
new_segment:
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			/* Size 0: the payload lives entirely in page fragments. */
			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
			if (!skb)
				goto wait_for_memory;

			skb_entail(sk, skb);
			copy = size_goal;
		}

		if (copy > size)
			copy = size;

		/*
		 * Either extend the last fragment (same page, adjacent
		 * offset) or take a new fragment slot; with no slot left,
		 * close this skb and start another.
		 */
		i = skb_shinfo(skb)->nr_frags;
		can_coalesce = skb_can_coalesce(skb, i, page, offset);
		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
			tcp_mark_push(tp, skb);
			goto new_segment;
		}
		if (!sk_wmem_schedule(sk, copy))
			goto wait_for_memory;

		if (can_coalesce) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
		} else {
			/* A new fragment holds its own page reference. */
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, copy);
		}

		/* Account the added bytes on the skb, the socket and the sequence space. */
		skb->len += copy;
		skb->data_len += copy;
		skb->truesize += copy;
		sk->sk_wmem_queued += copy;
		sk_mem_charge(sk, copy);
		skb->ip_summed = CHECKSUM_PARTIAL;
		tp->write_seq += copy;
		TCP_SKB_CB(skb)->end_seq += copy;
		skb_shinfo(skb)->gso_segs = 0;

		/* The first chunk of this call clears PSH on the skb it landed in. */
		if (!copied)
			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

		copied += copy;
		offset += copy;
		if (!(size -= copy))
			goto out;

		/* Keep filling this skb until it reaches the goal (always for OOB). */
		if (skb->len < size_goal || (flags & MSG_OOB))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == tcp_send_head(sk))
			tcp_push_one(sk, mss_now);
		continue;

wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		/* Flush what is queued (ignoring MSG_MORE) before sleeping. */
		tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
			goto do_error;

		/* The MSS / size goal may have changed while waiting. */
		mss_now = tcp_send_mss(sk, &size_goal, flags);
	}

out:
	/* The final push is skipped when the caller says more pages follow. */
	if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
		tcp_push(sk, flags, mss_now, tp->nonagle);
	return copied;

do_error:
	/* Partial success wins over the error. */
	if (copied)
		goto out;
out_err:
	return sk_stream_error(sk, flags, err);
}
948
949int tcp_sendpage(struct sock *sk, struct page *page, int offset,
950 size_t size, int flags)
951{
952 ssize_t res;
953
954 if (!(sk->sk_route_caps & NETIF_F_SG) ||
955 !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
956 return sock_no_sendpage(sk->sk_socket, page, offset, size,
957 flags);
958
959 lock_sock(sk);
960 res = do_tcp_sendpages(sk, page, offset, size, flags);
961 release_sock(sk);
962 return res;
963}
964EXPORT_SYMBOL(tcp_sendpage);
965
966static inline int select_size(const struct sock *sk, bool sg)
967{
968 const struct tcp_sock *tp = tcp_sk(sk);
969 int tmp = tp->mss_cache;
970
971 if (sg) {
972 if (sk_can_gso(sk)) {
973
974
975
976 tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
977 } else {
978 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
979
980 if (tmp >= pgbreak &&
981 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
982 tmp = pgbreak;
983 }
984 }
985
986 return tmp;
987}
988
989void tcp_free_fastopen_req(struct tcp_sock *tp)
990{
991 if (tp->fastopen_req != NULL) {
992 kfree(tp->fastopen_req);
993 tp->fastopen_req = NULL;
994 }
995}
996
997static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
998{
999 struct tcp_sock *tp = tcp_sk(sk);
1000 int err, flags;
1001
1002 if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
1003 return -EOPNOTSUPP;
1004 if (tp->fastopen_req != NULL)
1005 return -EALREADY;
1006
1007 tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1008 sk->sk_allocation);
1009 if (unlikely(tp->fastopen_req == NULL))
1010 return -ENOBUFS;
1011 tp->fastopen_req->data = msg;
1012
1013 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1014 err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
1015 msg->msg_namelen, flags);
1016 *size = tp->fastopen_req->copied;
1017 tcp_free_fastopen_req(tp);
1018 return err;
1019}
1020
/*
 * sendmsg() implementation for TCP: copy user data from @msg's iovec into
 * skbs on the socket's write queue and push them out.
 *
 * @iocb: not used in the body.
 * @sk:   sending socket; locked for the duration of the call.
 * @msg:  message header carrying the iovec, flags and (for fastopen) the
 *        destination address.
 * @size: total length; only consulted on the repair receive-queue path.
 *
 * Returns the number of bytes accepted (including bytes already sent in a
 * fastopen SYN), or a negative errno from sk_stream_error() when nothing
 * was accepted.
 */
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t size)
{
	struct iovec *iov;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int iovlen, flags, err, copied = 0;
	int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
	bool sg;
	long timeo;

	lock_sock(sk);

	flags = msg->msg_flags;
	if (flags & MSG_FASTOPEN) {
		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
		/* Connect in progress with data accepted in the SYN: report those bytes. */
		if (err == -EINPROGRESS && copied_syn > 0)
			goto out;
		else if (err)
			goto out_err;
		/* Skip what the SYN already carried when walking the iovec below. */
		offset = copied_syn;
	}

	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/*
	 * Outside ESTABLISHED / CLOSE_WAIT, wait for the connection to
	 * complete, unless this is a passive fastopen socket.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto do_error;
	}

	if (unlikely(tp->repair)) {
		/* Repair mode, receive queue selected: data goes to the receive side. */
		if (tp->repair_queue == TCP_RECV_QUEUE) {
			copied = tcp_send_rcvq(sk, msg, size);
			goto out;
		}

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out_err;

		/* Any other repair queue falls through to the normal send path. */
	}

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	mss_now = tcp_send_mss(sk, &size_goal, flags);

	/* Start walking the user iovec. */
	iovlen = msg->msg_iovlen;
	iov = msg->msg_iov;
	copied = 0;

	/* A pending error or a send-side shutdown fails with -EPIPE. */
	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	sg = !!(sk->sk_route_caps & NETIF_F_SG);

	while (--iovlen >= 0) {
		size_t seglen = iov->iov_len;
		unsigned char __user *from = iov->iov_base;

		iov++;
		/* Skip the bytes that were already sent in the fastopen SYN. */
		if (unlikely(offset > 0)) {
			if (offset >= seglen) {
				offset -= seglen;
				continue;
			}
			seglen -= offset;
			from += offset;
			offset = 0;
		}

		while (seglen > 0) {
			int copy = 0;
			int max = size_goal;

			/*
			 * Room left in the tail skb, if unsent data is
			 * pending.  An skb without checksum offload is only
			 * filled up to one MSS.
			 */
			skb = tcp_write_queue_tail(sk);
			if (tcp_send_head(sk)) {
				if (skb->ip_summed == CHECKSUM_NONE)
					max = mss_now;
				copy = max - skb->len;
			}

			if (copy <= 0) {
new_segment:
				/* Tail is full or absent: allocate a new skb. */
				if (!sk_stream_memory_free(sk))
					goto wait_for_sndbuf;

				skb = sk_stream_alloc_skb(sk,
							  select_size(sk, sg),
							  sk->sk_allocation);
				if (!skb)
					goto wait_for_memory;

				/* Use checksum offload when the route supports it. */
				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
					skb->ip_summed = CHECKSUM_PARTIAL;

				skb_entail(sk, skb);
				copy = size_goal;
				max = size_goal;
			}

			/* Never copy more than what is left of this iovec entry. */
			if (copy > seglen)
				copy = seglen;

			if (skb_availroom(skb) > 0) {
				/* The skb still reports available room: append there. */
				copy = min_t(int, copy, skb_availroom(skb));
				err = skb_add_data_nocache(sk, skb, from, copy);
				if (err)
					goto do_fault;
			} else {
				/* Otherwise copy into the socket's page fragment. */
				bool merge = true;
				int i = skb_shinfo(skb)->nr_frags;
				struct page_frag *pfrag = sk_page_frag(sk);

				if (!sk_page_frag_refill(sk, pfrag))
					goto wait_for_memory;

				/*
				 * If the data cannot extend the last fragment
				 * it needs a new slot; with no slot left, or
				 * without scatter-gather, start a new skb.
				 */
				if (!skb_can_coalesce(skb, i, pfrag->page,
						      pfrag->offset)) {
					if (i == MAX_SKB_FRAGS || !sg) {
						tcp_mark_push(tp, skb);
						goto new_segment;
					}
					merge = false;
				}

				copy = min_t(int, copy, pfrag->size - pfrag->offset);

				if (!sk_wmem_schedule(sk, copy))
					goto wait_for_memory;

				err = skb_copy_to_page_nocache(sk, from, skb,
							       pfrag->page,
							       pfrag->offset,
							       copy);
				if (err)
					goto do_error;

				/* Attach the copied bytes to the skb. */
				if (merge) {
					skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
				} else {
					skb_fill_page_desc(skb, i, pfrag->page,
							   pfrag->offset, copy);
					/* The new fragment holds its own page reference. */
					get_page(pfrag->page);
				}
				pfrag->offset += copy;
			}

			/* The first chunk of this call clears PSH on the skb it landed in. */
			if (!copied)
				TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

			tp->write_seq += copy;
			TCP_SKB_CB(skb)->end_seq += copy;
			skb_shinfo(skb)->gso_segs = 0;

			from += copy;
			copied += copy;
			/* All iovec entries consumed: done. */
			if ((seglen -= copy) == 0 && iovlen == 0)
				goto out;

			/* Keep filling this skb; OOB and repair sends never push from here. */
			if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
				continue;

			if (forced_push(tp)) {
				tcp_mark_push(tp, skb);
				__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
			} else if (skb == tcp_send_head(sk))
				tcp_push_one(sk, mss_now);
			continue;

wait_for_sndbuf:
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
			/* Flush what this call queued (ignoring MSG_MORE) before sleeping. */
			if (copied)
				tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				goto do_error;

			/* The MSS / size goal may have changed while waiting. */
			mss_now = tcp_send_mss(sk, &size_goal, flags);
		}
	}

out:
	if (copied)
		tcp_push(sk, flags, mss_now, tp->nonagle);
	release_sock(sk);
	return copied + copied_syn;

do_fault:
	/*
	 * The copy into the skb failed.  If the skb is still empty it was
	 * just entailed for this data: unlink it, fix up the send head, and
	 * free it.
	 */
	if (!skb->len) {
		tcp_unlink_write_queue(skb, sk);
		tcp_check_send_head(sk, skb);
		sk_wmem_free_skb(sk, skb);
	}

do_error:
	/* Partial success (including SYN data) wins over the error. */
	if (copied + copied_syn)
		goto out;
out_err:
	err = sk_stream_error(sk, flags, err);
	release_sock(sk);
	return err;
}
EXPORT_SYMBOL(tcp_sendmsg);
1247
1248
1249
1250
1251
1252
1253static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1254{
1255 struct tcp_sock *tp = tcp_sk(sk);
1256
1257
1258 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1259 tp->urg_data == TCP_URG_READ)
1260 return -EINVAL;
1261
1262 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1263 return -ENOTCONN;
1264
1265 if (tp->urg_data & TCP_URG_VALID) {
1266 int err = 0;
1267 char c = tp->urg_data;
1268
1269 if (!(flags & MSG_PEEK))
1270 tp->urg_data = TCP_URG_READ;
1271
1272
1273 msg->msg_flags |= MSG_OOB;
1274
1275 if (len > 0) {
1276 if (!(flags & MSG_TRUNC))
1277 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1278 len = 1;
1279 } else
1280 msg->msg_flags |= MSG_TRUNC;
1281
1282 return err ? -EFAULT : len;
1283 }
1284
1285 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1286 return 0;
1287
1288
1289
1290
1291
1292
1293
1294 return -EAGAIN;
1295}
1296
1297static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1298{
1299 struct sk_buff *skb;
1300 int copied = 0, err = 0;
1301
1302
1303
1304 skb_queue_walk(&sk->sk_write_queue, skb) {
1305 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
1306 if (err)
1307 break;
1308
1309 copied += skb->len;
1310 }
1311
1312 return err ?: copied;
1313}
1314
1315
1316
1317
1318
1319
1320
/*
 * Called after the reader has consumed data: decide whether an ACK should
 * be sent right now, and send it if so.
 *
 * @sk:     receiving socket.
 * @copied: number of bytes just handed to the reader (may be <= 0).
 */
void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
	struct tcp_sock *tp = tcp_sk(sk);
	bool time_to_ack = false;

	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

	/* Sanity check: the head skb must not already be fully consumed. */
	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);

	if (inet_csk_ack_scheduled(sk)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);

		/*
		 * An ACK is already scheduled.  Send it now if any of:
		 *  - the ack state is marked blocked;
		 *  - more than one rcv_mss has arrived since rcv_wup;
		 *  - data was just consumed, the receive memory count is
		 *    zero, and the pending ACK is flagged PUSHED2, or
		 *    flagged PUSHED while not in pingpong mode.
		 */
		if (icsk->icsk_ack.blocked ||
		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
		    (copied > 0 &&
		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
		       !icsk->icsk_ack.pingpong)) &&
		      !atomic_read(&sk->sk_rmem_alloc)))
			time_to_ack = true;
	}

	/*
	 * Otherwise, if data was consumed and the receive side is still
	 * open, check whether the window that could be advertised has grown
	 * enough to be worth an ACK.
	 */
	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
		__u32 rcv_window_now = tcp_receive_window(tp);

		/* Only consider it while the current window is at most half the clamp. */
		if (2*rcv_window_now <= tp->window_clamp) {
			__u32 new_window = __tcp_select_window(sk);

			/*
			 * ACK only if the new window is non-zero and at
			 * least twice the current one.
			 */
			if (new_window && new_window >= 2 * rcv_window_now)
				time_to_ack = true;
		}
	}
	if (time_to_ack)
		tcp_send_ack(sk);
}
1378
1379static void tcp_prequeue_process(struct sock *sk)
1380{
1381 struct sk_buff *skb;
1382 struct tcp_sock *tp = tcp_sk(sk);
1383
1384 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
1385
1386
1387
1388 local_bh_disable();
1389 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1390 sk_backlog_rcv(sk, skb);
1391 local_bh_enable();
1392
1393
1394 tp->ucopy.memory = 0;
1395}
1396
1397#ifdef CONFIG_NET_DMA
/*
 * Issue pending DMA copies for this socket and release the skbs whose
 * copies have completed.
 *
 * @sk:   socket whose tp->ucopy DMA channel is serviced; does nothing if
 *        no channel is set.
 * @wait: when true, keep polling until the last issued cookie completes;
 *        when false, make a single pass.
 */
static void tcp_service_net_dma(struct sock *sk, bool wait)
{
	dma_cookie_t done, used;
	dma_cookie_t last_issued;
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tp->ucopy.dma_chan)
		return;

	last_issued = tp->ucopy.dma_cookie;
	dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);

	do {
		if (dma_async_memcpy_complete(tp->ucopy.dma_chan,
					      last_issued, &done,
					      &used) == DMA_SUCCESS) {
			/* Everything issued has completed: drop all waiting skbs. */
			__skb_queue_purge(&sk->sk_async_wait_queue);
			break;
		} else {
			struct sk_buff *skb;
			/*
			 * Not everything is done yet: free skbs from the
			 * head of the queue for as long as their own cookie
			 * is reported complete.
			 */
			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
			       (dma_async_is_complete(skb->dma_cookie, done,
						      used) == DMA_SUCCESS)) {
				__skb_dequeue(&sk->sk_async_wait_queue);
				kfree_skb(skb);
			}
		}
	} while (wait);
}
1428#endif
1429
1430static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1431{
1432 struct sk_buff *skb;
1433 u32 offset;
1434
1435 skb_queue_walk(&sk->sk_receive_queue, skb) {
1436 offset = seq - TCP_SKB_CB(skb)->seq;
1437 if (tcp_hdr(skb)->syn)
1438 offset--;
1439 if (offset < skb->len || tcp_hdr(skb)->fin) {
1440 *off = offset;
1441 return skb;
1442 }
1443 }
1444 return NULL;
1445}
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
/*
 * Feed queued receive data to a caller-supplied actor without copying it
 * to user space.
 *
 * @sk:         socket to read from; -ENOTCONN is returned for a listening
 *              socket.
 * @desc:       read descriptor passed to the actor; reading stops when its
 *              count reaches zero.
 * @recv_actor: called as recv_actor(desc, skb, offset, len); returns the
 *              number of bytes it consumed or a negative error.
 *
 * Returns the total number of bytes consumed, or the actor's negative
 * result if it failed before consuming anything.
 */
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
		  sk_read_actor_t recv_actor)
{
	struct sk_buff *skb;
	struct tcp_sock *tp = tcp_sk(sk);
	u32 seq = tp->copied_seq;
	u32 offset;
	int copied = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -ENOTCONN;
	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
		if (offset < skb->len) {
			int used;
			size_t len;

			len = skb->len - offset;
			/* Do not read past the urgent byte; stop if positioned on it. */
			if (tp->urg_data) {
				u32 urg_offset = tp->urg_seq - seq;
				if (urg_offset < len)
					len = urg_offset;
				if (!len)
					break;
			}
			used = recv_actor(desc, skb, offset, len);
			if (used < 0) {
				/* Report the error only if nothing was consumed before. */
				if (!copied)
					copied = used;
				break;
			} else if (used <= len) {
				seq += used;
				copied += used;
				offset += used;
			}

			/*
			 * Look the skb up again, by the last consumed byte,
			 * rather than trusting the old pointer.
			 * NOTE(review): presumably because the actor may
			 * drop the socket lock and the queue can change
			 * meanwhile - confirm.  Stop unless that skb is
			 * still queued and has been consumed to its end.
			 */
			skb = tcp_recv_skb(sk, seq-1, &offset);
			if (!skb || (offset+1 != skb->len))
				break;
		}
		/* A FIN consumes one sequence number and ends the read. */
		if (tcp_hdr(skb)->fin) {
			sk_eat_skb(sk, skb, false);
			++seq;
			break;
		}
		sk_eat_skb(sk, skb, false);
		if (!desc->count)
			break;
		/* Publish progress after each skb that has been eaten. */
		tp->copied_seq = seq;
	}
	tp->copied_seq = seq;

	tcp_rcv_space_adjust(sk);

	/* Data was consumed: let the ACK / window logic run. */
	if (copied > 0)
		tcp_cleanup_rbuf(sk, copied);
	return copied;
}
EXPORT_SYMBOL(tcp_read_sock);
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1533 size_t len, int nonblock, int flags, int *addr_len)
1534{
1535 struct tcp_sock *tp = tcp_sk(sk);
1536 int copied = 0;
1537 u32 peek_seq;
1538 u32 *seq;
1539 unsigned long used;
1540 int err;
1541 int target;
1542 long timeo;
1543 struct task_struct *user_recv = NULL;
1544 bool copied_early = false;
1545 struct sk_buff *skb;
1546 u32 urg_hole = 0;
1547
1548 lock_sock(sk);
1549
1550 err = -ENOTCONN;
1551 if (sk->sk_state == TCP_LISTEN)
1552 goto out;
1553
1554 timeo = sock_rcvtimeo(sk, nonblock);
1555
1556
1557 if (flags & MSG_OOB)
1558 goto recv_urg;
1559
1560 if (unlikely(tp->repair)) {
1561 err = -EPERM;
1562 if (!(flags & MSG_PEEK))
1563 goto out;
1564
1565 if (tp->repair_queue == TCP_SEND_QUEUE)
1566 goto recv_sndq;
1567
1568 err = -EINVAL;
1569 if (tp->repair_queue == TCP_NO_QUEUE)
1570 goto out;
1571
1572
1573 }
1574
1575 seq = &tp->copied_seq;
1576 if (flags & MSG_PEEK) {
1577 peek_seq = tp->copied_seq;
1578 seq = &peek_seq;
1579 }
1580
1581 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1582
1583#ifdef CONFIG_NET_DMA
1584 tp->ucopy.dma_chan = NULL;
1585 preempt_disable();
1586 skb = skb_peek_tail(&sk->sk_receive_queue);
1587 {
1588 int available = 0;
1589
1590 if (skb)
1591 available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
1592 if ((available < target) &&
1593 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1594 !sysctl_tcp_low_latency &&
1595 net_dma_find_channel()) {
1596 preempt_enable_no_resched();
1597 tp->ucopy.pinned_list =
1598 dma_pin_iovec_pages(msg->msg_iov, len);
1599 } else {
1600 preempt_enable_no_resched();
1601 }
1602 }
1603#endif
1604
1605 do {
1606 u32 offset;
1607
1608
1609 if (tp->urg_data && tp->urg_seq == *seq) {
1610 if (copied)
1611 break;
1612 if (signal_pending(current)) {
1613 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1614 break;
1615 }
1616 }
1617
1618
1619
1620 skb_queue_walk(&sk->sk_receive_queue, skb) {
1621
1622
1623
1624 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
1625 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
1626 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1627 flags))
1628 break;
1629
1630 offset = *seq - TCP_SKB_CB(skb)->seq;
1631 if (tcp_hdr(skb)->syn)
1632 offset--;
1633 if (offset < skb->len)
1634 goto found_ok_skb;
1635 if (tcp_hdr(skb)->fin)
1636 goto found_fin_ok;
1637 WARN(!(flags & MSG_PEEK),
1638 "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
1639 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
1640 }
1641
1642
1643
1644 if (copied >= target && !sk->sk_backlog.tail)
1645 break;
1646
1647 if (copied) {
1648 if (sk->sk_err ||
1649 sk->sk_state == TCP_CLOSE ||
1650 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1651 !timeo ||
1652 signal_pending(current))
1653 break;
1654 } else {
1655 if (sock_flag(sk, SOCK_DONE))
1656 break;
1657
1658 if (sk->sk_err) {
1659 copied = sock_error(sk);
1660 break;
1661 }
1662
1663 if (sk->sk_shutdown & RCV_SHUTDOWN)
1664 break;
1665
1666 if (sk->sk_state == TCP_CLOSE) {
1667 if (!sock_flag(sk, SOCK_DONE)) {
1668
1669
1670
1671 copied = -ENOTCONN;
1672 break;
1673 }
1674 break;
1675 }
1676
1677 if (!timeo) {
1678 copied = -EAGAIN;
1679 break;
1680 }
1681
1682 if (signal_pending(current)) {
1683 copied = sock_intr_errno(timeo);
1684 break;
1685 }
1686 }
1687
1688 tcp_cleanup_rbuf(sk, copied);
1689
1690 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1691
1692 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1693 user_recv = current;
1694 tp->ucopy.task = user_recv;
1695 tp->ucopy.iov = msg->msg_iov;
1696 }
1697
1698 tp->ucopy.len = len;
1699
1700 WARN_ON(tp->copied_seq != tp->rcv_nxt &&
1701 !(flags & (MSG_PEEK | MSG_TRUNC)));
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729 if (!skb_queue_empty(&tp->ucopy.prequeue))
1730 goto do_prequeue;
1731
1732
1733 }
1734
1735#ifdef CONFIG_NET_DMA
1736 if (tp->ucopy.dma_chan) {
1737 if (tp->rcv_wnd == 0 &&
1738 !skb_queue_empty(&sk->sk_async_wait_queue)) {
1739 tcp_service_net_dma(sk, true);
1740 tcp_cleanup_rbuf(sk, copied);
1741 } else
1742 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1743 }
1744#endif
1745 if (copied >= target) {
1746
1747 release_sock(sk);
1748 lock_sock(sk);
1749 } else
1750 sk_wait_data(sk, &timeo);
1751
1752#ifdef CONFIG_NET_DMA
1753 tcp_service_net_dma(sk, false);
1754 tp->ucopy.wakeup = 0;
1755#endif
1756
1757 if (user_recv) {
1758 int chunk;
1759
1760
1761
1762 if ((chunk = len - tp->ucopy.len) != 0) {
1763 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1764 len -= chunk;
1765 copied += chunk;
1766 }
1767
1768 if (tp->rcv_nxt == tp->copied_seq &&
1769 !skb_queue_empty(&tp->ucopy.prequeue)) {
1770do_prequeue:
1771 tcp_prequeue_process(sk);
1772
1773 if ((chunk = len - tp->ucopy.len) != 0) {
1774 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1775 len -= chunk;
1776 copied += chunk;
1777 }
1778 }
1779 }
1780 if ((flags & MSG_PEEK) &&
1781 (peek_seq - copied - urg_hole != tp->copied_seq)) {
1782 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
1783 current->comm,
1784 task_pid_nr(current));
1785 peek_seq = tp->copied_seq;
1786 }
1787 continue;
1788
1789 found_ok_skb:
1790
1791 used = skb->len - offset;
1792 if (len < used)
1793 used = len;
1794
1795
1796 if (tp->urg_data) {
1797 u32 urg_offset = tp->urg_seq - *seq;
1798 if (urg_offset < used) {
1799 if (!urg_offset) {
1800 if (!sock_flag(sk, SOCK_URGINLINE)) {
1801 ++*seq;
1802 urg_hole++;
1803 offset++;
1804 used--;
1805 if (!used)
1806 goto skip_copy;
1807 }
1808 } else
1809 used = urg_offset;
1810 }
1811 }
1812
1813 if (!(flags & MSG_TRUNC)) {
1814#ifdef CONFIG_NET_DMA
1815 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1816 tp->ucopy.dma_chan = net_dma_find_channel();
1817
1818 if (tp->ucopy.dma_chan) {
1819 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1820 tp->ucopy.dma_chan, skb, offset,
1821 msg->msg_iov, used,
1822 tp->ucopy.pinned_list);
1823
1824 if (tp->ucopy.dma_cookie < 0) {
1825
1826 pr_alert("%s: dma_cookie < 0\n",
1827 __func__);
1828
1829
1830 if (!copied)
1831 copied = -EFAULT;
1832 break;
1833 }
1834
1835 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1836
1837 if ((offset + used) == skb->len)
1838 copied_early = true;
1839
1840 } else
1841#endif
1842 {
1843 err = skb_copy_datagram_iovec(skb, offset,
1844 msg->msg_iov, used);
1845 if (err) {
1846
1847 if (!copied)
1848 copied = -EFAULT;
1849 break;
1850 }
1851 }
1852 }
1853
1854 *seq += used;
1855 copied += used;
1856 len -= used;
1857
1858 tcp_rcv_space_adjust(sk);
1859
1860skip_copy:
1861 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1862 tp->urg_data = 0;
1863 tcp_fast_path_check(sk);
1864 }
1865 if (used + offset < skb->len)
1866 continue;
1867
1868 if (tcp_hdr(skb)->fin)
1869 goto found_fin_ok;
1870 if (!(flags & MSG_PEEK)) {
1871 sk_eat_skb(sk, skb, copied_early);
1872 copied_early = false;
1873 }
1874 continue;
1875
1876 found_fin_ok:
1877
1878 ++*seq;
1879 if (!(flags & MSG_PEEK)) {
1880 sk_eat_skb(sk, skb, copied_early);
1881 copied_early = false;
1882 }
1883 break;
1884 } while (len > 0);
1885
1886 if (user_recv) {
1887 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1888 int chunk;
1889
1890 tp->ucopy.len = copied > 0 ? len : 0;
1891
1892 tcp_prequeue_process(sk);
1893
1894 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1895 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1896 len -= chunk;
1897 copied += chunk;
1898 }
1899 }
1900
1901 tp->ucopy.task = NULL;
1902 tp->ucopy.len = 0;
1903 }
1904
1905#ifdef CONFIG_NET_DMA
1906 tcp_service_net_dma(sk, true);
1907 tp->ucopy.dma_chan = NULL;
1908
1909 if (tp->ucopy.pinned_list) {
1910 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1911 tp->ucopy.pinned_list = NULL;
1912 }
1913#endif
1914
1915
1916
1917
1918
1919
1920 tcp_cleanup_rbuf(sk, copied);
1921
1922 release_sock(sk);
1923 return copied;
1924
1925out:
1926 release_sock(sk);
1927 return err;
1928
1929recv_urg:
1930 err = tcp_recv_urg(sk, msg, len, flags);
1931 goto out;
1932
1933recv_sndq:
1934 err = tcp_peek_sndq(sk, msg, len);
1935 goto out;
1936}
1937EXPORT_SYMBOL(tcp_recvmsg);
1938
1939void tcp_set_state(struct sock *sk, int state)
1940{
1941 int oldstate = sk->sk_state;
1942
1943 switch (state) {
1944 case TCP_ESTABLISHED:
1945 if (oldstate != TCP_ESTABLISHED)
1946 TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1947 break;
1948
1949 case TCP_CLOSE:
1950 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
1951 TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
1952
1953 sk->sk_prot->unhash(sk);
1954 if (inet_csk(sk)->icsk_bind_hash &&
1955 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
1956 inet_put_port(sk);
1957
1958 default:
1959 if (oldstate == TCP_ESTABLISHED)
1960 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1961 }
1962
1963
1964
1965
1966 sk->sk_state = state;
1967
1968#ifdef STATE_TRACE
1969 SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
1970#endif
1971}
1972EXPORT_SYMBOL_GPL(tcp_set_state);
1973
1974
1975
1976
1977
1978
1979
1980
1981static const unsigned char new_state[16] = {
1982
1983 TCP_CLOSE,
1984 TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1985 TCP_CLOSE,
1986 TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1987 TCP_FIN_WAIT1,
1988 TCP_FIN_WAIT2,
1989 TCP_CLOSE,
1990 TCP_CLOSE,
1991 TCP_LAST_ACK | TCP_ACTION_FIN,
1992 TCP_LAST_ACK,
1993 TCP_CLOSE,
1994 TCP_CLOSING,
1995};
1996
1997static int tcp_close_state(struct sock *sk)
1998{
1999 int next = (int)new_state[sk->sk_state];
2000 int ns = next & TCP_STATE_MASK;
2001
2002 tcp_set_state(sk, ns);
2003
2004 return next & TCP_ACTION_FIN;
2005}
2006
2007
2008
2009
2010
2011
2012void tcp_shutdown(struct sock *sk, int how)
2013{
2014
2015
2016
2017
2018 if (!(how & SEND_SHUTDOWN))
2019 return;
2020
2021
2022 if ((1 << sk->sk_state) &
2023 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2024 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2025
2026 if (tcp_close_state(sk))
2027 tcp_send_fin(sk);
2028 }
2029}
2030EXPORT_SYMBOL(tcp_shutdown);
2031
2032bool tcp_check_oom(struct sock *sk, int shift)
2033{
2034 bool too_many_orphans, out_of_socket_memory;
2035
2036 too_many_orphans = tcp_too_many_orphans(sk, shift);
2037 out_of_socket_memory = tcp_out_of_memory(sk);
2038
2039 if (too_many_orphans)
2040 net_info_ratelimited("too many orphaned sockets\n");
2041 if (out_of_socket_memory)
2042 net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2043 return too_many_orphans || out_of_socket_memory;
2044}
2045
/*
 * tcp_close - close a TCP socket on behalf of its owner.
 * @sk:      socket being closed
 * @timeout: passed to sk_stream_wait_close() to bound the wait after
 *           the close has been initiated
 *
 * The function runs in two phases.  Under the socket lock it discards
 * unread receive data and starts the shutdown (FIN, RST or disconnect,
 * depending on socket state and options).  It then orphans the socket,
 * drops the socket lock, and finishes under the BH lock, where the
 * socket is either destroyed, moved to time-wait, or left to the timers.
 */
void tcp_close(struct sock *sk, long timeout)
{
	struct sk_buff *skb;
	int data_was_unread = 0;
	int state;

	lock_sock(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == TCP_LISTEN) {
		tcp_set_state(sk, TCP_CLOSE);

		/* Listening sockets have no data path: just stop listening. */
		inet_csk_listen_stop(sk);

		goto adjudge_to_death;
	}

	/* Flush the receive queue, counting the payload bytes (sequence
	 * space minus the FIN flag) that the application never read.
	 */
	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
			  tcp_hdr(skb)->fin;
		data_was_unread += len;
		__kfree_skb(skb);
	}

	sk_mem_reclaim(sk);

	/* Socket is already closed: skip the shutdown handshake. */
	if (sk->sk_state == TCP_CLOSE)
		goto adjudge_to_death;

	/* Pick how to start the shutdown.  The order of the tests matters:
	 * repair mode first, then unread data, then zero linger, and only
	 * then the normal FIN path.
	 */
	if (unlikely(tcp_sk(sk)->repair)) {
		sk->sk_prot->disconnect(sk, 0);
	} else if (data_was_unread) {
		/* Unread data was thrown away, so abort with a reset. */
		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
		tcp_set_state(sk, TCP_CLOSE);
		tcp_send_active_reset(sk, sk->sk_allocation);
	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Zero linger is checked only after the unread-data test. */
		sk->sk_prot->disconnect(sk, 0);
		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
	} else if (tcp_close_state(sk)) {
		/* The state machine moved on and asked for a FIN to be sent. */
		tcp_send_fin(sk);
	}

	sk_stream_wait_close(sk, timeout);

adjudge_to_death:
	/* Remember the state seen under the socket lock, take a reference
	 * for the remainder of this function, and detach the socket from
	 * its owner.
	 */
	state = sk->sk_state;
	sock_hold(sk);
	sock_orphan(sk);

	/* Last release_sock() done by the owner on this socket. */
	release_sock(sk);

	/* Finish the close under the BH lock; no user context should own
	 * the socket at this point (checked by the WARN_ON).
	 */
	local_bh_disable();
	bh_lock_sock(sk);
	WARN_ON(sock_owned_by_user(sk));

	percpu_counter_inc(sk->sk_prot->orphan_count);

	/* The state became TCP_CLOSE between release_sock() and here:
	 * somebody else already dealt with the socket, nothing left to do.
	 */
	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
		goto out;

	if (sk->sk_state == TCP_FIN_WAIT2) {
		struct tcp_sock *tp = tcp_sk(sk);
		if (tp->linger2 < 0) {
			/* Negative linger2: do not wait in FIN_WAIT2, abort. */
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			NET_INC_STATS_BH(sock_net(sk),
					LINUX_MIB_TCPABORTONLINGER);
		} else {
			const int tmo = tcp_fin_time(sk);

			if (tmo > TCP_TIMEWAIT_LEN) {
				/* Longer than a time-wait period: cover the
				 * excess with the keepalive timer.
				 */
				inet_csk_reset_keepalive_timer(sk,
						tmo - TCP_TIMEWAIT_LEN);
			} else {
				/* Short enough to hand over to time-wait. */
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
	}
	if (sk->sk_state != TCP_CLOSE) {
		sk_mem_reclaim(sk);
		/* Under orphan/memory pressure, kill the orphan right away. */
		if (tcp_check_oom(sk, 0)) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			NET_INC_STATS_BH(sock_net(sk),
					LINUX_MIB_TCPABORTONMEMORY);
		}
	}

	if (sk->sk_state == TCP_CLOSE) {
		struct request_sock *req = tcp_sk(sk)->fastopen_rsk;

		/* Drop any fastopen request still attached to the socket
		 * before destroying it.
		 */
		if (req != NULL)
			reqsk_fastopen_remove(sk, req, false);
		inet_csk_destroy_sock(sk);
	}
	/* Otherwise the socket stays around; it is not destroyed here. */

out:
	bh_unlock_sock(sk);
	local_bh_enable();
	sock_put(sk);
}
2216EXPORT_SYMBOL(tcp_close);
2217
2218
2219
2220static inline bool tcp_need_reset(int state)
2221{
2222 return (1 << state) &
2223 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2224 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2225}
2226
/*
 * tcp_disconnect - abort the current connection and return the socket to
 * a reusable, closed state.
 * @sk:    socket to disconnect
 * @flags: not used by this function
 *
 * Moves the socket to TCP_CLOSE, notifies the peer with a reset where the
 * old state requires it, purges all queues and timers, and reinitialises
 * the per-connection transmit state.  Always returns 0 (err is never
 * changed after initialisation).
 */
int tcp_disconnect(struct sock *sk, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;
	int old_state = sk->sk_state;

	if (old_state != TCP_CLOSE)
		tcp_set_state(sk, TCP_CLOSE);

	/* Decide what the old state obliges us to do; sk_err records the
	 * reason that is later reported through sk_error_report().
	 */
	if (old_state == TCP_LISTEN) {
		inet_csk_listen_stop(sk);
	} else if (unlikely(tp->repair)) {
		/* Repair mode: no reset is sent to the peer. */
		sk->sk_err = ECONNABORTED;
	} else if (tcp_need_reset(old_state) ||
		   (tp->snd_nxt != tp->write_seq &&
		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
		/* Either the state always needs a reset, or we were in
		 * CLOSING/LAST_ACK with snd_nxt != write_seq, i.e. part of
		 * the write sequence space was never sent.
		 */
		tcp_send_active_reset(sk, gfp_any());
		sk->sk_err = ECONNRESET;
	} else if (old_state == TCP_SYN_SENT)
		sk->sk_err = ECONNRESET;

	/* Stop timers and drop everything that is still queued. */
	tcp_clear_xmit_timers(sk);
	__skb_queue_purge(&sk->sk_receive_queue);
	tcp_write_queue_purge(sk);
	__skb_queue_purge(&tp->out_of_order_queue);
#ifdef CONFIG_NET_DMA
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	inet->inet_dport = 0;

	/* Keep the source address only if the user explicitly bound it. */
	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		inet_reset_saddr(sk);

	sk->sk_shutdown = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->srtt = 0;
	/* Jump write_seq past the old window; 0 is avoided as a value. */
	if ((tp->write_seq += tp->max_window + 2) == 0)
		tp->write_seq = 1;
	/* Reset retransmission and congestion state to initial values. */
	icsk->icsk_backoff = 0;
	tp->snd_cwnd = 2;
	icsk->icsk_probes_out = 0;
	tp->packets_out = 0;
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_cnt = 0;
	tp->bytes_acked = 0;
	tp->window_clamp = 0;
	tcp_set_ca_state(sk, TCP_CA_Open);
	tcp_clear_retrans(tp);
	inet_csk_delack_init(sk);
	tcp_init_send_head(sk);
	/* Forget every option negotiated with the previous peer. */
	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
	__sk_dst_reset(sk);

	/* A socket that still has a local port must still be in the bind hash. */
	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);

	sk->sk_error_report(sk);
	return err;
}
2292EXPORT_SYMBOL(tcp_disconnect);
2293
2294void tcp_sock_destruct(struct sock *sk)
2295{
2296 inet_sock_destruct(sk);
2297
2298 kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
2299}
2300
2301static inline bool tcp_can_repair_sock(const struct sock *sk)
2302{
2303 return capable(CAP_NET_ADMIN) &&
2304 ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
2305}
2306
2307static int tcp_repair_options_est(struct tcp_sock *tp,
2308 struct tcp_repair_opt __user *optbuf, unsigned int len)
2309{
2310 struct tcp_repair_opt opt;
2311
2312 while (len >= sizeof(opt)) {
2313 if (copy_from_user(&opt, optbuf, sizeof(opt)))
2314 return -EFAULT;
2315
2316 optbuf++;
2317 len -= sizeof(opt);
2318
2319 switch (opt.opt_code) {
2320 case TCPOPT_MSS:
2321 tp->rx_opt.mss_clamp = opt.opt_val;
2322 break;
2323 case TCPOPT_WINDOW:
2324 {
2325 u16 snd_wscale = opt.opt_val & 0xFFFF;
2326 u16 rcv_wscale = opt.opt_val >> 16;
2327
2328 if (snd_wscale > 14 || rcv_wscale > 14)
2329 return -EFBIG;
2330
2331 tp->rx_opt.snd_wscale = snd_wscale;
2332 tp->rx_opt.rcv_wscale = rcv_wscale;
2333 tp->rx_opt.wscale_ok = 1;
2334 }
2335 break;
2336 case TCPOPT_SACK_PERM:
2337 if (opt.opt_val != 0)
2338 return -EINVAL;
2339
2340 tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
2341 if (sysctl_tcp_fack)
2342 tcp_enable_fack(tp);
2343 break;
2344 case TCPOPT_TIMESTAMP:
2345 if (opt.opt_val != 0)
2346 return -EINVAL;
2347
2348 tp->rx_opt.tstamp_ok = 1;
2349 break;
2350 }
2351 }
2352
2353 return 0;
2354}
2355
2356
2357
2358
2359static int do_tcp_setsockopt(struct sock *sk, int level,
2360 int optname, char __user *optval, unsigned int optlen)
2361{
2362 struct tcp_sock *tp = tcp_sk(sk);
2363 struct inet_connection_sock *icsk = inet_csk(sk);
2364 int val;
2365 int err = 0;
2366
2367
2368 switch (optname) {
2369 case TCP_CONGESTION: {
2370 char name[TCP_CA_NAME_MAX];
2371
2372 if (optlen < 1)
2373 return -EINVAL;
2374
2375 val = strncpy_from_user(name, optval,
2376 min_t(long, TCP_CA_NAME_MAX-1, optlen));
2377 if (val < 0)
2378 return -EFAULT;
2379 name[val] = 0;
2380
2381 lock_sock(sk);
2382 err = tcp_set_congestion_control(sk, name);
2383 release_sock(sk);
2384 return err;
2385 }
2386 case TCP_COOKIE_TRANSACTIONS: {
2387 struct tcp_cookie_transactions ctd;
2388 struct tcp_cookie_values *cvp = NULL;
2389
2390 if (sizeof(ctd) > optlen)
2391 return -EINVAL;
2392 if (copy_from_user(&ctd, optval, sizeof(ctd)))
2393 return -EFAULT;
2394
2395 if (ctd.tcpct_used > sizeof(ctd.tcpct_value) ||
2396 ctd.tcpct_s_data_desired > TCP_MSS_DESIRED)
2397 return -EINVAL;
2398
2399 if (ctd.tcpct_cookie_desired == 0) {
2400
2401 } else if ((0x1 & ctd.tcpct_cookie_desired) ||
2402 ctd.tcpct_cookie_desired > TCP_COOKIE_MAX ||
2403 ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) {
2404 return -EINVAL;
2405 }
2406
2407 if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) {
2408
2409 lock_sock(sk);
2410 if (tp->cookie_values != NULL) {
2411 kref_put(&tp->cookie_values->kref,
2412 tcp_cookie_values_release);
2413 tp->cookie_values = NULL;
2414 }
2415 tp->rx_opt.cookie_in_always = 0;
2416 tp->rx_opt.cookie_out_never = 1;
2417 release_sock(sk);
2418 return err;
2419 }
2420
2421
2422
2423 if (ctd.tcpct_used > 0 ||
2424 (tp->cookie_values == NULL &&
2425 (sysctl_tcp_cookie_size > 0 ||
2426 ctd.tcpct_cookie_desired > 0 ||
2427 ctd.tcpct_s_data_desired > 0))) {
2428 cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used,
2429 GFP_KERNEL);
2430 if (cvp == NULL)
2431 return -ENOMEM;
2432
2433 kref_init(&cvp->kref);
2434 }
2435 lock_sock(sk);
2436 tp->rx_opt.cookie_in_always =
2437 (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags);
2438 tp->rx_opt.cookie_out_never = 0;
2439
2440 if (tp->cookie_values != NULL) {
2441 if (cvp != NULL) {
2442
2443
2444
2445
2446 kref_put(&tp->cookie_values->kref,
2447 tcp_cookie_values_release);
2448 } else {
2449 cvp = tp->cookie_values;
2450 }
2451 }
2452
2453 if (cvp != NULL) {
2454 cvp->cookie_desired = ctd.tcpct_cookie_desired;
2455
2456 if (ctd.tcpct_used > 0) {
2457 memcpy(cvp->s_data_payload, ctd.tcpct_value,
2458 ctd.tcpct_used);
2459 cvp->s_data_desired = ctd.tcpct_used;
2460 cvp->s_data_constant = 1;
2461 } else {
2462
2463 cvp->s_data_desired = ctd.tcpct_s_data_desired;
2464 cvp->s_data_constant = 0;
2465 }
2466
2467 tp->cookie_values = cvp;
2468 }
2469 release_sock(sk);
2470 return err;
2471 }
2472 default:
2473
2474 break;
2475 }
2476
2477 if (optlen < sizeof(int))
2478 return -EINVAL;
2479
2480 if (get_user(val, (int __user *)optval))
2481 return -EFAULT;
2482
2483 lock_sock(sk);
2484
2485 switch (optname) {
2486 case TCP_MAXSEG:
2487
2488
2489
2490 if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
2491 err = -EINVAL;
2492 break;
2493 }
2494 tp->rx_opt.user_mss = val;
2495 break;
2496
2497 case TCP_NODELAY:
2498 if (val) {
2499
2500
2501
2502
2503
2504
2505
2506
2507 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2508 tcp_push_pending_frames(sk);
2509 } else {
2510 tp->nonagle &= ~TCP_NAGLE_OFF;
2511 }
2512 break;
2513
2514 case TCP_THIN_LINEAR_TIMEOUTS:
2515 if (val < 0 || val > 1)
2516 err = -EINVAL;
2517 else
2518 tp->thin_lto = val;
2519 break;
2520
2521 case TCP_THIN_DUPACK:
2522 if (val < 0 || val > 1)
2523 err = -EINVAL;
2524 else
2525 tp->thin_dupack = val;
2526 if (tp->thin_dupack)
2527 tcp_disable_early_retrans(tp);
2528 break;
2529
2530 case TCP_REPAIR:
2531 if (!tcp_can_repair_sock(sk))
2532 err = -EPERM;
2533 else if (val == 1) {
2534 tp->repair = 1;
2535 sk->sk_reuse = SK_FORCE_REUSE;
2536 tp->repair_queue = TCP_NO_QUEUE;
2537 } else if (val == 0) {
2538 tp->repair = 0;
2539 sk->sk_reuse = SK_NO_REUSE;
2540 tcp_send_window_probe(sk);
2541 } else
2542 err = -EINVAL;
2543
2544 break;
2545
2546 case TCP_REPAIR_QUEUE:
2547 if (!tp->repair)
2548 err = -EPERM;
2549 else if (val < TCP_QUEUES_NR)
2550 tp->repair_queue = val;
2551 else
2552 err = -EINVAL;
2553 break;
2554
2555 case TCP_QUEUE_SEQ:
2556 if (sk->sk_state != TCP_CLOSE)
2557 err = -EPERM;
2558 else if (tp->repair_queue == TCP_SEND_QUEUE)
2559 tp->write_seq = val;
2560 else if (tp->repair_queue == TCP_RECV_QUEUE)
2561 tp->rcv_nxt = val;
2562 else
2563 err = -EINVAL;
2564 break;
2565
2566 case TCP_REPAIR_OPTIONS:
2567 if (!tp->repair)
2568 err = -EINVAL;
2569 else if (sk->sk_state == TCP_ESTABLISHED)
2570 err = tcp_repair_options_est(tp,
2571 (struct tcp_repair_opt __user *)optval,
2572 optlen);
2573 else
2574 err = -EPERM;
2575 break;
2576
2577 case TCP_CORK:
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589 if (val) {
2590 tp->nonagle |= TCP_NAGLE_CORK;
2591 } else {
2592 tp->nonagle &= ~TCP_NAGLE_CORK;
2593 if (tp->nonagle&TCP_NAGLE_OFF)
2594 tp->nonagle |= TCP_NAGLE_PUSH;
2595 tcp_push_pending_frames(sk);
2596 }
2597 break;
2598
2599 case TCP_KEEPIDLE:
2600 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2601 err = -EINVAL;
2602 else {
2603 tp->keepalive_time = val * HZ;
2604 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2605 !((1 << sk->sk_state) &
2606 (TCPF_CLOSE | TCPF_LISTEN))) {
2607 u32 elapsed = keepalive_time_elapsed(tp);
2608 if (tp->keepalive_time > elapsed)
2609 elapsed = tp->keepalive_time - elapsed;
2610 else
2611 elapsed = 0;
2612 inet_csk_reset_keepalive_timer(sk, elapsed);
2613 }
2614 }
2615 break;
2616 case TCP_KEEPINTVL:
2617 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2618 err = -EINVAL;
2619 else
2620 tp->keepalive_intvl = val * HZ;
2621 break;
2622 case TCP_KEEPCNT:
2623 if (val < 1 || val > MAX_TCP_KEEPCNT)
2624 err = -EINVAL;
2625 else
2626 tp->keepalive_probes = val;
2627 break;
2628 case TCP_SYNCNT:
2629 if (val < 1 || val > MAX_TCP_SYNCNT)
2630 err = -EINVAL;
2631 else
2632 icsk->icsk_syn_retries = val;
2633 break;
2634
2635 case TCP_LINGER2:
2636 if (val < 0)
2637 tp->linger2 = -1;
2638 else if (val > sysctl_tcp_fin_timeout / HZ)
2639 tp->linger2 = 0;
2640 else
2641 tp->linger2 = val * HZ;
2642 break;
2643
2644 case TCP_DEFER_ACCEPT:
2645
2646 icsk->icsk_accept_queue.rskq_defer_accept =
2647 secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
2648 TCP_RTO_MAX / HZ);
2649 break;
2650
2651 case TCP_WINDOW_CLAMP:
2652 if (!val) {
2653 if (sk->sk_state != TCP_CLOSE) {
2654 err = -EINVAL;
2655 break;
2656 }
2657 tp->window_clamp = 0;
2658 } else
2659 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2660 SOCK_MIN_RCVBUF / 2 : val;
2661 break;
2662
2663 case TCP_QUICKACK:
2664 if (!val) {
2665 icsk->icsk_ack.pingpong = 1;
2666 } else {
2667 icsk->icsk_ack.pingpong = 0;
2668 if ((1 << sk->sk_state) &
2669 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2670 inet_csk_ack_scheduled(sk)) {
2671 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2672 tcp_cleanup_rbuf(sk, 1);
2673 if (!(val & 1))
2674 icsk->icsk_ack.pingpong = 1;
2675 }
2676 }
2677 break;
2678
2679#ifdef CONFIG_TCP_MD5SIG
2680 case TCP_MD5SIG:
2681
2682 err = tp->af_specific->md5_parse(sk, optval, optlen);
2683 break;
2684#endif
2685 case TCP_USER_TIMEOUT:
2686
2687
2688
2689 if (val < 0)
2690 err = -EINVAL;
2691 else
2692 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2693 break;
2694
2695 case TCP_FASTOPEN:
2696 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
2697 TCPF_LISTEN)))
2698 err = fastopen_init_queue(sk, val);
2699 else
2700 err = -EINVAL;
2701 break;
2702 default:
2703 err = -ENOPROTOOPT;
2704 break;
2705 }
2706
2707 release_sock(sk);
2708 return err;
2709}
2710
2711int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2712 unsigned int optlen)
2713{
2714 const struct inet_connection_sock *icsk = inet_csk(sk);
2715
2716 if (level != SOL_TCP)
2717 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
2718 optval, optlen);
2719 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2720}
2721EXPORT_SYMBOL(tcp_setsockopt);
2722
2723#ifdef CONFIG_COMPAT
2724int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2725 char __user *optval, unsigned int optlen)
2726{
2727 if (level != SOL_TCP)
2728 return inet_csk_compat_setsockopt(sk, level, optname,
2729 optval, optlen);
2730 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2731}
2732EXPORT_SYMBOL(compat_tcp_setsockopt);
2733#endif
2734
2735
2736void tcp_get_info(const struct sock *sk, struct tcp_info *info)
2737{
2738 const struct tcp_sock *tp = tcp_sk(sk);
2739 const struct inet_connection_sock *icsk = inet_csk(sk);
2740 u32 now = tcp_time_stamp;
2741
2742 memset(info, 0, sizeof(*info));
2743
2744 info->tcpi_state = sk->sk_state;
2745 info->tcpi_ca_state = icsk->icsk_ca_state;
2746 info->tcpi_retransmits = icsk->icsk_retransmits;
2747 info->tcpi_probes = icsk->icsk_probes_out;
2748 info->tcpi_backoff = icsk->icsk_backoff;
2749
2750 if (tp->rx_opt.tstamp_ok)
2751 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2752 if (tcp_is_sack(tp))
2753 info->tcpi_options |= TCPI_OPT_SACK;
2754 if (tp->rx_opt.wscale_ok) {
2755 info->tcpi_options |= TCPI_OPT_WSCALE;
2756 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2757 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2758 }
2759
2760 if (tp->ecn_flags & TCP_ECN_OK)
2761 info->tcpi_options |= TCPI_OPT_ECN;
2762 if (tp->ecn_flags & TCP_ECN_SEEN)
2763 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
2764 if (tp->syn_data_acked)
2765 info->tcpi_options |= TCPI_OPT_SYN_DATA;
2766
2767 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2768 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2769 info->tcpi_snd_mss = tp->mss_cache;
2770 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2771
2772 if (sk->sk_state == TCP_LISTEN) {
2773 info->tcpi_unacked = sk->sk_ack_backlog;
2774 info->tcpi_sacked = sk->sk_max_ack_backlog;
2775 } else {
2776 info->tcpi_unacked = tp->packets_out;
2777 info->tcpi_sacked = tp->sacked_out;
2778 }
2779 info->tcpi_lost = tp->lost_out;
2780 info->tcpi_retrans = tp->retrans_out;
2781 info->tcpi_fackets = tp->fackets_out;
2782
2783 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2784 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2785 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2786
2787 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2788 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2789 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2790 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2791 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2792 info->tcpi_snd_cwnd = tp->snd_cwnd;
2793 info->tcpi_advmss = tp->advmss;
2794 info->tcpi_reordering = tp->reordering;
2795
2796 info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2797 info->tcpi_rcv_space = tp->rcvq_space.space;
2798
2799 info->tcpi_total_retrans = tp->total_retrans;
2800}
2801EXPORT_SYMBOL_GPL(tcp_get_info);
2802
2803static int do_tcp_getsockopt(struct sock *sk, int level,
2804 int optname, char __user *optval, int __user *optlen)
2805{
2806 struct inet_connection_sock *icsk = inet_csk(sk);
2807 struct tcp_sock *tp = tcp_sk(sk);
2808 int val, len;
2809
2810 if (get_user(len, optlen))
2811 return -EFAULT;
2812
2813 len = min_t(unsigned int, len, sizeof(int));
2814
2815 if (len < 0)
2816 return -EINVAL;
2817
2818 switch (optname) {
2819 case TCP_MAXSEG:
2820 val = tp->mss_cache;
2821 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2822 val = tp->rx_opt.user_mss;
2823 if (tp->repair)
2824 val = tp->rx_opt.mss_clamp;
2825 break;
2826 case TCP_NODELAY:
2827 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2828 break;
2829 case TCP_CORK:
2830 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2831 break;
2832 case TCP_KEEPIDLE:
2833 val = keepalive_time_when(tp) / HZ;
2834 break;
2835 case TCP_KEEPINTVL:
2836 val = keepalive_intvl_when(tp) / HZ;
2837 break;
2838 case TCP_KEEPCNT:
2839 val = keepalive_probes(tp);
2840 break;
2841 case TCP_SYNCNT:
2842 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2843 break;
2844 case TCP_LINGER2:
2845 val = tp->linger2;
2846 if (val >= 0)
2847 val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2848 break;
2849 case TCP_DEFER_ACCEPT:
2850 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
2851 TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
2852 break;
2853 case TCP_WINDOW_CLAMP:
2854 val = tp->window_clamp;
2855 break;
2856 case TCP_INFO: {
2857 struct tcp_info info;
2858
2859 if (get_user(len, optlen))
2860 return -EFAULT;
2861
2862 tcp_get_info(sk, &info);
2863
2864 len = min_t(unsigned int, len, sizeof(info));
2865 if (put_user(len, optlen))
2866 return -EFAULT;
2867 if (copy_to_user(optval, &info, len))
2868 return -EFAULT;
2869 return 0;
2870 }
2871 case TCP_QUICKACK:
2872 val = !icsk->icsk_ack.pingpong;
2873 break;
2874
2875 case TCP_CONGESTION:
2876 if (get_user(len, optlen))
2877 return -EFAULT;
2878 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2879 if (put_user(len, optlen))
2880 return -EFAULT;
2881 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2882 return -EFAULT;
2883 return 0;
2884
2885 case TCP_COOKIE_TRANSACTIONS: {
2886 struct tcp_cookie_transactions ctd;
2887 struct tcp_cookie_values *cvp = tp->cookie_values;
2888
2889 if (get_user(len, optlen))
2890 return -EFAULT;
2891 if (len < sizeof(ctd))
2892 return -EINVAL;
2893
2894 memset(&ctd, 0, sizeof(ctd));
2895 ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ?
2896 TCP_COOKIE_IN_ALWAYS : 0)
2897 | (tp->rx_opt.cookie_out_never ?
2898 TCP_COOKIE_OUT_NEVER : 0);
2899
2900 if (cvp != NULL) {
2901 ctd.tcpct_flags |= (cvp->s_data_in ?
2902 TCP_S_DATA_IN : 0)
2903 | (cvp->s_data_out ?
2904 TCP_S_DATA_OUT : 0);
2905
2906 ctd.tcpct_cookie_desired = cvp->cookie_desired;
2907 ctd.tcpct_s_data_desired = cvp->s_data_desired;
2908
2909 memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
2910 cvp->cookie_pair_size);
2911 ctd.tcpct_used = cvp->cookie_pair_size;
2912 }
2913
2914 if (put_user(sizeof(ctd), optlen))
2915 return -EFAULT;
2916 if (copy_to_user(optval, &ctd, sizeof(ctd)))
2917 return -EFAULT;
2918 return 0;
2919 }
2920 case TCP_THIN_LINEAR_TIMEOUTS:
2921 val = tp->thin_lto;
2922 break;
2923 case TCP_THIN_DUPACK:
2924 val = tp->thin_dupack;
2925 break;
2926
2927 case TCP_REPAIR:
2928 val = tp->repair;
2929 break;
2930
2931 case TCP_REPAIR_QUEUE:
2932 if (tp->repair)
2933 val = tp->repair_queue;
2934 else
2935 return -EINVAL;
2936 break;
2937
2938 case TCP_QUEUE_SEQ:
2939 if (tp->repair_queue == TCP_SEND_QUEUE)
2940 val = tp->write_seq;
2941 else if (tp->repair_queue == TCP_RECV_QUEUE)
2942 val = tp->rcv_nxt;
2943 else
2944 return -EINVAL;
2945 break;
2946
2947 case TCP_USER_TIMEOUT:
2948 val = jiffies_to_msecs(icsk->icsk_user_timeout);
2949 break;
2950 default:
2951 return -ENOPROTOOPT;
2952 }
2953
2954 if (put_user(len, optlen))
2955 return -EFAULT;
2956 if (copy_to_user(optval, &val, len))
2957 return -EFAULT;
2958 return 0;
2959}
2960
2961int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2962 int __user *optlen)
2963{
2964 struct inet_connection_sock *icsk = inet_csk(sk);
2965
2966 if (level != SOL_TCP)
2967 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
2968 optval, optlen);
2969 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2970}
2971EXPORT_SYMBOL(tcp_getsockopt);
2972
2973#ifdef CONFIG_COMPAT
2974int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2975 char __user *optval, int __user *optlen)
2976{
2977 if (level != SOL_TCP)
2978 return inet_csk_compat_getsockopt(sk, level, optname,
2979 optval, optlen);
2980 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2981}
2982EXPORT_SYMBOL(compat_tcp_getsockopt);
2983#endif
2984
/*
 * tcp_tso_segment - software segmentation of an oversized TCP packet.
 * @skb:      packet to split; its TCP header is pulled off in the process
 * @features: device feature flags, passed to skb_gso_ok()/skb_segment()
 *
 * Returns the list of segments produced by skb_segment(), with sequence
 * numbers, flags and checksums fixed up per segment.  Returns NULL when
 * the packet passes skb_gso_ok() and only gso_segs had to be recomputed.
 * Returns an ERR_PTR() (initially -EINVAL, or whatever skb_segment()
 * reported) on failure.
 */
struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
	netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	struct tcphdr *th;
	unsigned int thlen;
	unsigned int seq;
	__be32 delta;
	unsigned int oldlen;
	unsigned int mss;

	/* Make sure the fixed part of the TCP header is in the linear area. */
	if (!pskb_may_pull(skb, sizeof(*th)))
		goto out;

	th = tcp_hdr(skb);
	thlen = th->doff * 4;
	if (thlen < sizeof(*th))
		goto out;

	/* Same for the full header including options. */
	if (!pskb_may_pull(skb, thlen))
		goto out;

	/* Complement of the original length, kept to 16 bits; it is added
	 * together with each segment's own length to th->check below.
	 */
	oldlen = (u16)~skb->len;
	__skb_pull(skb, thlen);

	mss = skb_shinfo(skb)->gso_size;
	/* Payload already fits in one segment: nothing to split. */
	if (unlikely(skb->len <= mss))
		goto out;

	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
		/* No software split needed: validate the GSO type bits and
		 * recompute gso_segs from the actual payload length.
		 */
		int type = skb_shinfo(skb)->gso_type;

		if (unlikely(type &
			     ~(SKB_GSO_TCPV4 |
			       SKB_GSO_DODGY |
			       SKB_GSO_TCP_ECN |
			       SKB_GSO_TCPV6 |
			       0) ||
			     !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
			goto out;

		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);

		segs = NULL;
		goto out;
	}

	segs = skb_segment(skb, features);
	if (IS_ERR(segs))
		goto out;

	/* Checksum adjustment for every full-sized segment. */
	delta = htonl(oldlen + (thlen + mss));

	skb = segs;
	th = tcp_hdr(skb);
	seq = ntohl(th->seq);

	/* Walk all segments but the last: clear FIN/PSH on the current one,
	 * fix its checksum, then stamp the next segment's sequence number
	 * and clear its CWR bit.
	 */
	do {
		th->fin = th->psh = 0;

		th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
				       (__force u32)delta));
		/* Without checksum offload, compute the checksum here. */
		if (skb->ip_summed != CHECKSUM_PARTIAL)
			th->check =
			     csum_fold(csum_partial(skb_transport_header(skb),
						    thlen, skb->csum));

		seq += mss;
		skb = skb->next;
		th = tcp_hdr(skb);

		th->seq = htonl(seq);
		th->cwr = 0;
	} while (skb->next);

	/* The last segment may be shorter than mss, so its checksum delta
	 * is built from its own length; its FIN/PSH bits are left as is.
	 */
	delta = htonl(oldlen + (skb->tail - skb->transport_header) +
		      skb->data_len);
	th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
				(__force u32)delta));
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		th->check = csum_fold(csum_partial(skb_transport_header(skb),
						   thlen, skb->csum));

out:
	return segs;
}
3072EXPORT_SYMBOL(tcp_tso_segment);
3073
/*
 * tcp_gro_receive - try to merge an incoming TCP segment into a packet
 * already held on the GRO list.
 * @head: list of held packets
 * @skb:  newly received segment
 *
 * Looks for a held packet of the same flow with matching ports.  If the
 * headers are compatible and the new segment directly follows it in
 * sequence space, the segment is merged via skb_gro_receive().
 *
 * Returns a pointer to the list slot of a held packet the caller should
 * flush, or NULL.  The flush decision for @skb itself is OR-ed into
 * NAPI_GRO_CB(skb)->flush.
 */
struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct sk_buff *p;
	struct tcphdr *th;
	struct tcphdr *th2;
	unsigned int len;
	unsigned int thlen;
	__be32 flags;
	unsigned int mss = 1;
	unsigned int hlen;
	unsigned int off;
	int flush = 1;
	int i;

	/* Get at the fixed TCP header, via the slow path if necessary. */
	off = skb_gro_offset(skb);
	hlen = off + sizeof(*th);
	th = skb_gro_header_fast(skb, off);
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}

	thlen = th->doff * 4;
	if (thlen < sizeof(*th))
		goto out;

	/* Now make the complete header, options included, accessible. */
	hlen = off + thlen;
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}

	skb_gro_pull(skb, thlen);

	len = skb_gro_len(skb);
	flags = tcp_flag_word(th);

	/* Search the held packets for the same flow.  Source and
	 * destination port are compared together as one 32-bit word.
	 * When the loop runs off the end, p is NULL.
	 */
	for (; (p = *head); head = &p->next) {
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		th2 = tcp_hdr(p);

		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		goto found;
	}

	goto out_check_final;

found:
	/* Any non-zero bit in flush forbids merging: CWR set, a flag
	 * difference other than CWR/FIN/PSH, a different ack_seq, or
	 * differing TCP options.
	 */
	flush = NAPI_GRO_CB(p)->flush;
	flush |= (__force int)(flags & TCP_FLAG_CWR);
	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
	flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
	for (i = sizeof(*th); i < thlen; i += 4)
		flush |= *(u32 *)((u8 *)th + i) ^
			 *(u32 *)((u8 *)th2 + i);

	mss = skb_shinfo(p)->gso_size;

	/* Also refuse empty or oversized payloads (len - 1 wraps for
	 * len == 0) and segments that do not directly follow the held data.
	 */
	flush |= (len - 1) >= mss;
	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);

	if (flush || skb_gro_receive(head, skb)) {
		/* No merge: with mss back at 1, "len < mss" below holds
		 * only for an empty segment.
		 */
		mss = 1;
		goto out_check_final;
	}

	/* Merged: propagate FIN/PSH to the header of the held packet. */
	p = *head;
	th2 = tcp_hdr(p);
	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);

out_check_final:
	/* Flush on a short segment or on any flag that ends aggregation. */
	flush = len < mss;
	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
					TCP_FLAG_RST | TCP_FLAG_SYN |
					TCP_FLAG_FIN));

	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
		pp = head;

out:
	NAPI_GRO_CB(skb)->flush |= flush;

	return pp;
}
3168EXPORT_SYMBOL(tcp_gro_receive);
3169
3170int tcp_gro_complete(struct sk_buff *skb)
3171{
3172 struct tcphdr *th = tcp_hdr(skb);
3173
3174 skb->csum_start = skb_transport_header(skb) - skb->head;
3175 skb->csum_offset = offsetof(struct tcphdr, check);
3176 skb->ip_summed = CHECKSUM_PARTIAL;
3177
3178 skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
3179
3180 if (th->cwr)
3181 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
3182
3183 return 0;
3184}
3185EXPORT_SYMBOL(tcp_gro_complete);
3186
3187#ifdef CONFIG_TCP_MD5SIG
3188static unsigned long tcp_md5sig_users;
3189static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool;
3190static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
3191
3192static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
3193{
3194 int cpu;
3195
3196 for_each_possible_cpu(cpu) {
3197 struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu);
3198
3199 if (p->md5_desc.tfm)
3200 crypto_free_hash(p->md5_desc.tfm);
3201 }
3202 free_percpu(pool);
3203}
3204
3205void tcp_free_md5sig_pool(void)
3206{
3207 struct tcp_md5sig_pool __percpu *pool = NULL;
3208
3209 spin_lock_bh(&tcp_md5sig_pool_lock);
3210 if (--tcp_md5sig_users == 0) {
3211 pool = tcp_md5sig_pool;
3212 tcp_md5sig_pool = NULL;
3213 }
3214 spin_unlock_bh(&tcp_md5sig_pool_lock);
3215 if (pool)
3216 __tcp_free_md5sig_pool(pool);
3217}
3218EXPORT_SYMBOL(tcp_free_md5sig_pool);
3219
3220static struct tcp_md5sig_pool __percpu *
3221__tcp_alloc_md5sig_pool(struct sock *sk)
3222{
3223 int cpu;
3224 struct tcp_md5sig_pool __percpu *pool;
3225
3226 pool = alloc_percpu(struct tcp_md5sig_pool);
3227 if (!pool)
3228 return NULL;
3229
3230 for_each_possible_cpu(cpu) {
3231 struct crypto_hash *hash;
3232
3233 hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
3234 if (!hash || IS_ERR(hash))
3235 goto out_free;
3236
3237 per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
3238 }
3239 return pool;
3240out_free:
3241 __tcp_free_md5sig_pool(pool);
3242 return NULL;
3243}
3244
3245struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
3246{
3247 struct tcp_md5sig_pool __percpu *pool;
3248 bool alloc = false;
3249
3250retry:
3251 spin_lock_bh(&tcp_md5sig_pool_lock);
3252 pool = tcp_md5sig_pool;
3253 if (tcp_md5sig_users++ == 0) {
3254 alloc = true;
3255 spin_unlock_bh(&tcp_md5sig_pool_lock);
3256 } else if (!pool) {
3257 tcp_md5sig_users--;
3258 spin_unlock_bh(&tcp_md5sig_pool_lock);
3259 cpu_relax();
3260 goto retry;
3261 } else
3262 spin_unlock_bh(&tcp_md5sig_pool_lock);
3263
3264 if (alloc) {
3265
3266 struct tcp_md5sig_pool __percpu *p;
3267
3268 p = __tcp_alloc_md5sig_pool(sk);
3269 spin_lock_bh(&tcp_md5sig_pool_lock);
3270 if (!p) {
3271 tcp_md5sig_users--;
3272 spin_unlock_bh(&tcp_md5sig_pool_lock);
3273 return NULL;
3274 }
3275 pool = tcp_md5sig_pool;
3276 if (pool) {
3277
3278 spin_unlock_bh(&tcp_md5sig_pool_lock);
3279 __tcp_free_md5sig_pool(p);
3280 } else {
3281 tcp_md5sig_pool = pool = p;
3282 spin_unlock_bh(&tcp_md5sig_pool_lock);
3283 }
3284 }
3285 return pool;
3286}
3287EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3298{
3299 struct tcp_md5sig_pool __percpu *p;
3300
3301 local_bh_disable();
3302
3303 spin_lock(&tcp_md5sig_pool_lock);
3304 p = tcp_md5sig_pool;
3305 if (p)
3306 tcp_md5sig_users++;
3307 spin_unlock(&tcp_md5sig_pool_lock);
3308
3309 if (p)
3310 return this_cpu_ptr(p);
3311
3312 local_bh_enable();
3313 return NULL;
3314}
3315EXPORT_SYMBOL(tcp_get_md5sig_pool);
3316
/*
 * Counterpart of a successful tcp_get_md5sig_pool(): re-enable bottom
 * halves first, then drop the pool reference (which may free the pool
 * if this was the last user).
 */
void tcp_put_md5sig_pool(void)
{
	local_bh_enable();

	tcp_free_md5sig_pool();
}
EXPORT_SYMBOL(tcp_put_md5sig_pool);
3323
3324int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
3325 const struct tcphdr *th)
3326{
3327 struct scatterlist sg;
3328 struct tcphdr hdr;
3329 int err;
3330
3331
3332 memcpy(&hdr, th, sizeof(hdr));
3333 hdr.check = 0;
3334
3335
3336 sg_init_one(&sg, &hdr, sizeof(hdr));
3337 err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
3338 return err;
3339}
3340EXPORT_SYMBOL(tcp_md5_hash_header);
3341
3342int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3343 const struct sk_buff *skb, unsigned int header_len)
3344{
3345 struct scatterlist sg;
3346 const struct tcphdr *tp = tcp_hdr(skb);
3347 struct hash_desc *desc = &hp->md5_desc;
3348 unsigned int i;
3349 const unsigned int head_data_len = skb_headlen(skb) > header_len ?
3350 skb_headlen(skb) - header_len : 0;
3351 const struct skb_shared_info *shi = skb_shinfo(skb);
3352 struct sk_buff *frag_iter;
3353
3354 sg_init_table(&sg, 1);
3355
3356 sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
3357 if (crypto_hash_update(desc, &sg, head_data_len))
3358 return 1;
3359
3360 for (i = 0; i < shi->nr_frags; ++i) {
3361 const struct skb_frag_struct *f = &shi->frags[i];
3362 struct page *page = skb_frag_page(f);
3363 sg_set_page(&sg, page, skb_frag_size(f), f->page_offset);
3364 if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
3365 return 1;
3366 }
3367
3368 skb_walk_frags(skb, frag_iter)
3369 if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
3370 return 1;
3371
3372 return 0;
3373}
3374EXPORT_SYMBOL(tcp_md5_hash_skb_data);
3375
3376int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
3377{
3378 struct scatterlist sg;
3379
3380 sg_init_one(&sg, key->key, key->keylen);
3381 return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
3382}
3383EXPORT_SYMBOL(tcp_md5_hash_key);
3384
3385#endif
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410struct tcp_cookie_secret {
3411
3412
3413
3414
3415
3416 u32 secrets[COOKIE_WORKSPACE_WORDS];
3417 unsigned long expires;
3418};
3419
3420#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
3421#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
3422#define TCP_SECRET_LIFE (HZ * 600)
3423
3424static struct tcp_cookie_secret tcp_secret_one;
3425static struct tcp_cookie_secret tcp_secret_two;
3426
3427
3428static struct tcp_cookie_secret *tcp_secret_generating;
3429static struct tcp_cookie_secret *tcp_secret_primary;
3430static struct tcp_cookie_secret *tcp_secret_retiring;
3431static struct tcp_cookie_secret *tcp_secret_secondary;
3432
3433static DEFINE_SPINLOCK(tcp_secret_locker);
3434
3435
3436
3437static inline u32 tcp_cookie_work(const u32 *ws, const int n)
3438{
3439 return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])];
3440}
3441
3442
3443
3444
3445
3446int tcp_cookie_generator(u32 *bakery)
3447{
3448 unsigned long jiffy = jiffies;
3449
3450 if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
3451 spin_lock_bh(&tcp_secret_locker);
3452 if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
3453
3454 memcpy(bakery,
3455 &tcp_secret_generating->secrets[0],
3456 COOKIE_WORKSPACE_WORDS);
3457 } else {
3458
3459 get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS);
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470 if (unlikely(tcp_secret_primary->expires ==
3471 tcp_secret_secondary->expires)) {
3472 struct timespec tv;
3473
3474 getnstimeofday(&tv);
3475 bakery[COOKIE_DIGEST_WORDS+0] ^=
3476 (u32)tv.tv_nsec;
3477
3478 tcp_secret_secondary->expires = jiffy
3479 + TCP_SECRET_1MSL
3480 + (0x0f & tcp_cookie_work(bakery, 0));
3481 } else {
3482 tcp_secret_secondary->expires = jiffy
3483 + TCP_SECRET_LIFE
3484 + (0xff & tcp_cookie_work(bakery, 1));
3485 tcp_secret_primary->expires = jiffy
3486 + TCP_SECRET_2MSL
3487 + (0x1f & tcp_cookie_work(bakery, 2));
3488 }
3489 memcpy(&tcp_secret_secondary->secrets[0],
3490 bakery, COOKIE_WORKSPACE_WORDS);
3491
3492 rcu_assign_pointer(tcp_secret_generating,
3493 tcp_secret_secondary);
3494 rcu_assign_pointer(tcp_secret_retiring,
3495 tcp_secret_primary);
3496
3497
3498
3499
3500
3501
3502 }
3503 spin_unlock_bh(&tcp_secret_locker);
3504 } else {
3505 rcu_read_lock_bh();
3506 memcpy(bakery,
3507 &rcu_dereference(tcp_secret_generating)->secrets[0],
3508 COOKIE_WORKSPACE_WORDS);
3509 rcu_read_unlock_bh();
3510 }
3511 return 0;
3512}
3513EXPORT_SYMBOL(tcp_cookie_generator);
3514
3515void tcp_done(struct sock *sk)
3516{
3517 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
3518
3519 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3520 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3521
3522 tcp_set_state(sk, TCP_CLOSE);
3523 tcp_clear_xmit_timers(sk);
3524 if (req != NULL)
3525 reqsk_fastopen_remove(sk, req, false);
3526
3527 sk->sk_shutdown = SHUTDOWN_MASK;
3528
3529 if (!sock_flag(sk, SOCK_DEAD))
3530 sk->sk_state_change(sk);
3531 else
3532 inet_csk_destroy_sock(sk);
3533}
3534EXPORT_SYMBOL_GPL(tcp_done);
3535
3536extern struct tcp_congestion_ops tcp_reno;
3537
3538static __initdata unsigned long thash_entries;
3539static int __init set_thash_entries(char *str)
3540{
3541 ssize_t ret;
3542
3543 if (!str)
3544 return 0;
3545
3546 ret = kstrtoul(str, 0, &thash_entries);
3547 if (ret)
3548 return 0;
3549
3550 return 1;
3551}
3552__setup("thash_entries=", set_thash_entries);
3553
3554void tcp_init_mem(struct net *net)
3555{
3556 unsigned long limit = nr_free_buffer_pages() / 8;
3557 limit = max(limit, 128UL);
3558 net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
3559 net->ipv4.sysctl_tcp_mem[1] = limit;
3560 net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
3561}
3562
3563void __init tcp_init(void)
3564{
3565 struct sk_buff *skb = NULL;
3566 unsigned long limit;
3567 int max_rshare, max_wshare, cnt;
3568 unsigned int i;
3569 unsigned long jiffy = jiffies;
3570
3571 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
3572
3573 percpu_counter_init(&tcp_sockets_allocated, 0);
3574 percpu_counter_init(&tcp_orphan_count, 0);
3575 tcp_hashinfo.bind_bucket_cachep =
3576 kmem_cache_create("tcp_bind_bucket",
3577 sizeof(struct inet_bind_bucket), 0,
3578 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3579
3580
3581
3582
3583
3584
3585 tcp_hashinfo.ehash =
3586 alloc_large_system_hash("TCP established",
3587 sizeof(struct inet_ehash_bucket),
3588 thash_entries,
3589 (totalram_pages >= 128 * 1024) ?
3590 13 : 15,
3591 0,
3592 NULL,
3593 &tcp_hashinfo.ehash_mask,
3594 0,
3595 thash_entries ? 0 : 512 * 1024);
3596 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
3597 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
3598 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
3599 }
3600 if (inet_ehash_locks_alloc(&tcp_hashinfo))
3601 panic("TCP: failed to alloc ehash_locks");
3602 tcp_hashinfo.bhash =
3603 alloc_large_system_hash("TCP bind",
3604 sizeof(struct inet_bind_hashbucket),
3605 tcp_hashinfo.ehash_mask + 1,
3606 (totalram_pages >= 128 * 1024) ?
3607 13 : 15,
3608 0,
3609 &tcp_hashinfo.bhash_size,
3610 NULL,
3611 0,
3612 64 * 1024);
3613 tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
3614 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
3615 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
3616 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
3617 }
3618
3619
3620 cnt = tcp_hashinfo.ehash_mask + 1;
3621
3622 tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3623 sysctl_tcp_max_orphans = cnt / 2;
3624 sysctl_max_syn_backlog = max(128, cnt / 256);
3625
3626 tcp_init_mem(&init_net);
3627
3628 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
3629 max_wshare = min(4UL*1024*1024, limit);
3630 max_rshare = min(6UL*1024*1024, limit);
3631
3632 sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
3633 sysctl_tcp_wmem[1] = 16*1024;
3634 sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
3635
3636 sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
3637 sysctl_tcp_rmem[1] = 87380;
3638 sysctl_tcp_rmem[2] = max(87380, max_rshare);
3639
3640 pr_info("Hash tables configured (established %u bind %u)\n",
3641 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3642
3643 tcp_metrics_init();
3644
3645 tcp_register_congestion_control(&tcp_reno);
3646
3647 memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
3648 memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
3649 tcp_secret_one.expires = jiffy;
3650 tcp_secret_two.expires = jiffy;
3651 tcp_secret_generating = &tcp_secret_one;
3652 tcp_secret_primary = &tcp_secret_one;
3653 tcp_secret_retiring = &tcp_secret_two;
3654 tcp_secret_secondary = &tcp_secret_two;
3655 tcp_tasklet_init();
3656}
3657