1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48#define pr_fmt(fmt) "TCP: " fmt
49
50#include <linux/bottom_half.h>
51#include <linux/types.h>
52#include <linux/fcntl.h>
53#include <linux/module.h>
54#include <linux/random.h>
55#include <linux/cache.h>
56#include <linux/jhash.h>
57#include <linux/init.h>
58#include <linux/times.h>
59#include <linux/slab.h>
60#include <linux/sched.h>
61
62#include <net/net_namespace.h>
63#include <net/icmp.h>
64#include <net/inet_hashtables.h>
65#include <net/tcp.h>
66#include <net/transp_v6.h>
67#include <net/ipv6.h>
68#include <net/inet_common.h>
69#include <net/timewait_sock.h>
70#include <net/xfrm.h>
71#include <net/secure_seq.h>
72#include <net/busy_poll.h>
73
74#include <linux/inet.h>
75#include <linux/ipv6.h>
76#include <linux/stddef.h>
77#include <linux/proc_fs.h>
78#include <linux/seq_file.h>
79#include <linux/inetdevice.h>
80#include <linux/btf_ids.h>
81
82#include <crypto/hash.h>
83#include <linux/scatterlist.h>
84
85#include <trace/events/tcp.h>
86
#ifdef CONFIG_TCP_MD5SIG
/* Forward declaration: the RST/ACK reply builders below call this helper.
 * Its definition is not in this part of the file.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

/* TCP's struct inet_hashinfo instance, exported to other modules. */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

/* Per-CPU control socket; tcp_v4_send_reset() and tcp_v4_send_ack() read it
 * with this_cpu_read() and hand it to ip_send_unicast_reply().
 */
static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96
97static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98{
99 return secure_tcp_seq(ip_hdr(skb)->daddr,
100 ip_hdr(skb)->saddr,
101 tcp_hdr(skb)->dest,
102 tcp_hdr(skb)->source);
103}
104
105static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106{
107 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
108}
109
110int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111{
112 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
113 const struct inet_timewait_sock *tw = inet_twsk(sktw);
114 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115 struct tcp_sock *tp = tcp_sk(sk);
116
117 if (reuse == 2) {
118
119
120
121
122 bool loopback = false;
123 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124 loopback = true;
125#if IS_ENABLED(CONFIG_IPV6)
126 if (tw->tw_family == AF_INET6) {
127 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
129 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
130 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
131 loopback = true;
132 } else
133#endif
134 {
135 if (ipv4_is_loopback(tw->tw_daddr) ||
136 ipv4_is_loopback(tw->tw_rcv_saddr))
137 loopback = true;
138 }
139 if (!loopback)
140 reuse = 0;
141 }
142
143
144
145
146
147
148
149
150
151
152
153
154 if (tcptw->tw_ts_recent_stamp &&
155 (!twp || (reuse && time_after32(ktime_get_seconds(),
156 tcptw->tw_ts_recent_stamp)))) {
157
158
159
160
161
162
163
164
165
166
167
168 if (likely(!tp->repair)) {
169 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
170
171 if (!seq)
172 seq = 1;
173 WRITE_ONCE(tp->write_seq, seq);
174 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
175 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
176 }
177 sock_hold(sktw);
178 return 1;
179 }
180
181 return 0;
182}
183EXPORT_SYMBOL_GPL(tcp_twsk_unique);
184
185static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
186 int addr_len)
187{
188
189
190
191
192 if (addr_len < sizeof(struct sockaddr_in))
193 return -EINVAL;
194
195 sock_owned_by_me(sk);
196
197 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
198}
199
200
/* Initiate an outgoing IPv4 TCP connection on @sk to the address in @uaddr.
 *
 * Returns 0 on success (or whatever tcp_fastopen_defer_connect() stored in
 * err when it defers the connect), otherwise a negative errno. Failures
 * after the socket entered SYN_SENT unwind through the "failure" label.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	/* With a source-route option the route lookup targets the option's
	 * faddr rather than the final destination.
	 */
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* TCP cannot connect to multicast or broadcast routes. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	/* Without source routing, use the destination chosen by routing. */
	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	/* No source address bound yet: adopt the one selected by routing. */
	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Timestamp state was recorded for a different peer: reset it */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Enter SYN_SENT before hashing the socket; inet_hash_connect() also
	 * picks the local port if none is set yet (inet_sport is re-read
	 * afterwards for ip_route_newports()). On error everything is undone
	 * at the failure label.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Re-resolve the route now that the final ports are known. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		/* Nothing to release at the failure label. */
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	/* The route is now referenced through the socket; do not put it. */
	rt = NULL;

	if (likely(!tp->repair)) {
		/* write_seq may already be set (e.g. by tcp_twsk_unique()). */
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	/* Fast Open may defer sending the SYN; err then holds the result. */
	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/* Undo the state change and address/port assignments made above;
	 * tcp_set_state() to TCP_CLOSE runs first, before the source
	 * address is reset.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
344
345
346
347
348
349
350void tcp_v4_mtu_reduced(struct sock *sk)
351{
352 struct inet_sock *inet = inet_sk(sk);
353 struct dst_entry *dst;
354 u32 mtu;
355
356 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
357 return;
358 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
359 dst = inet_csk_update_pmtu(sk, mtu);
360 if (!dst)
361 return;
362
363
364
365
366 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
367 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
368
369 mtu = dst_mtu(dst);
370
371 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
372 ip_sk_accept_pmtu(sk) &&
373 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
374 tcp_sync_mss(sk, mtu);
375
376
377
378
379
380
381 tcp_simple_retransmit(sk);
382 }
383}
384EXPORT_SYMBOL(tcp_v4_mtu_reduced);
385
386static void do_redirect(struct sk_buff *skb, struct sock *sk)
387{
388 struct dst_entry *dst = __sk_dst_check(sk, 0);
389
390 if (dst)
391 dst->ops->redirect(dst, sk, skb);
392}
393
394
395
396void tcp_req_err(struct sock *sk, u32 seq, bool abort)
397{
398 struct request_sock *req = inet_reqsk(sk);
399 struct net *net = sock_net(sk);
400
401
402
403
404 if (seq != tcp_rsk(req)->snt_isn) {
405 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
406 } else if (abort) {
407
408
409
410
411
412
413 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
414 tcp_listendrop(req->rsk_listener);
415 }
416 reqsk_put(req);
417}
418EXPORT_SYMBOL(tcp_req_err);
419
420
421void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
422{
423 struct inet_connection_sock *icsk = inet_csk(sk);
424 struct tcp_sock *tp = tcp_sk(sk);
425 struct sk_buff *skb;
426 s32 remaining;
427 u32 delta_us;
428
429 if (sock_owned_by_user(sk))
430 return;
431
432 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
433 !icsk->icsk_backoff)
434 return;
435
436 skb = tcp_rtx_queue_head(sk);
437 if (WARN_ON_ONCE(!skb))
438 return;
439
440 icsk->icsk_backoff--;
441 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
442 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
443
444 tcp_mstamp_refresh(tp);
445 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
446 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
447
448 if (remaining > 0) {
449 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
450 remaining, TCP_RTO_MAX);
451 } else {
452
453
454
455 tcp_retransmit_timer(sk);
456 }
457}
458EXPORT_SYMBOL(tcp_ld_RTO_revert);
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
/* ICMP error handler for TCP over IPv4.
 *
 * @skb->data points at the IP header quoted inside the ICMP message, followed
 * by the start of the offending TCP header; @info is the ICMP-specific value
 * (stored as the new MTU for FRAG_NEEDED).
 *
 * Looks up the socket the quoted segment belongs to and, depending on the
 * socket state and the ICMP type/code, updates the path MTU, follows a
 * redirect, reverts RTO backoff, or reports a hard/soft error on the socket.
 *
 * Returns -ENOENT when no socket matches, 0 otherwise.
 */
int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	/* The quoted packet was sent by us, so its destination is the remote
	 * end and its source is the local end of the connection.
	 */
	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* Return value deliberately ignored; presumably called only
		 * for its TCP-AO accounting side effect - TODO confirm.
		 */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* tcp_req_err() drops the reference on the request socket. */
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If the socket is owned by user context the ICMP cannot be fully
	 * processed here. Count it as dropped, except for FRAG_NEEDED which
	 * is deferred further below.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* While a Fast Open request is pending, the window starts at its ISN. */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* Ignore errors quoting a sequence number outside what is in flight. */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* A listening socket has no path MTU of its own to
			 * adjust.
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Defer via the TSQ flag; the extra reference
				 * is taken only when the flag was newly set.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* Net/host unreachable may be transient: try reverting the
		 * RTO backoff, but not while a Fast Open request is pending.
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in the fast or simultaneous open case; skip when the
		 * Fast Open request has no child socket attached.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			/* Hard error during handshake: report and close. */
			WRITE_ONCE(sk->sk_err, err);

			sk_error_report(sk);

			tcp_done(sk);
		} else {
			WRITE_ONCE(sk->sk_err_soft, err);
		}
		goto out;
	}

	/* Other states (and the Fast Open break above): report a hard error
	 * only if the socket is not owned by user context and has the
	 * RECVERR bit set; otherwise just remember it as a soft error.
	 */
	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
648
649void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
650{
651 struct tcphdr *th = tcp_hdr(skb);
652
653 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
654 skb->csum_start = skb_transport_header(skb) - skb->head;
655 skb->csum_offset = offsetof(struct tcphdr, check);
656}
657
658
659void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
660{
661 const struct inet_sock *inet = inet_sk(sk);
662
663 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
664}
665EXPORT_SYMBOL(tcp_v4_send_check);
666
667#define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32))
668
669static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
670 const struct tcp_ao_hdr *aoh,
671 struct ip_reply_arg *arg, struct tcphdr *reply,
672 __be32 reply_options[REPLY_OPTIONS_LEN])
673{
674#ifdef CONFIG_TCP_AO
675 int sdif = tcp_v4_sdif(skb);
676 int dif = inet_iif(skb);
677 int l3index = sdif ? dif : 0;
678 bool allocated_traffic_key;
679 struct tcp_ao_key *key;
680 char *traffic_key;
681 bool drop = true;
682 u32 ao_sne = 0;
683 u8 keyid;
684
685 rcu_read_lock();
686 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
687 &key, &traffic_key, &allocated_traffic_key,
688 &keyid, &ao_sne))
689 goto out;
690
691 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
692 (aoh->rnext_keyid << 8) | keyid);
693 arg->iov[0].iov_len += tcp_ao_len_aligned(key);
694 reply->doff = arg->iov[0].iov_len / 4;
695
696 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
697 key, traffic_key,
698 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
699 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
700 reply, ao_sne))
701 goto out;
702 drop = false;
703out:
704 rcu_read_unlock();
705 if (allocated_traffic_key)
706 kfree(traffic_key);
707 return drop;
708#else
709 return true;
710#endif
711}
712
713
714
715
716
717
718
719
720
721
722
723
724
725
/* Send a TCP RST in response to @skb.
 *
 * @sk may be NULL (no matching socket), a full socket, or a non-full socket
 * such as a TIME-WAIT one; the code checks sk_fullsock() and sk_state before
 * touching fields that only some of them have. The reply is built on the
 * stack and transmitted through this CPU's control socket.
 */
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
	int genhash;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. Without a socket, only answer packets
	 * that were routed to a local address.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	/* If the offending segment carried an ACK, reuse its ack_seq as our
	 * sequence number. Otherwise ACK everything the segment occupied
	 * (payload length plus SYN/FIN flags).
	 */
	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	/* An AO-signed segment gets an AO-signed reset, or no reset at all. */
	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* l3index is the ingress interface when tcp_v4_sdif() reports
		 * a secondary ifindex, else 0.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * No full socket, but the segment carries an MD5 option:
		 * look for a listener that may hold a key for this peer.
		 * The incoming hash is verified with that key below and no
		 * RST is generated unless it matches.
		 */
		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
					     NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* Same l3index rule as above. */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	/* Pseudo-header checksum; addresses are swapped for the reply. */
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When a socket is known, send the reply through the device it is
	 * bound to. The BUILD_BUG_ON below guarantees the field sits at the
	 * same offset in a TIME-WAIT socket.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	/* The per-CPU control socket is used with bottom halves disabled and
	 * temporarily moved into the packet's network namespace.
	 */
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	if (sk) {
		/* Inherit mark, priority and txhash, reading the TIME-WAIT
		 * variants of those fields when @sk is a TIME-WAIT socket.
		 */
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	/* Restore the control socket for its next user. */
	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
913
914
915
916
917
918static void tcp_v4_send_ack(const struct sock *sk,
919 struct sk_buff *skb, u32 seq, u32 ack,
920 u32 win, u32 tsval, u32 tsecr, int oif,
921 struct tcp_key *key,
922 int reply_flags, u8 tos, u32 txhash)
923{
924 const struct tcphdr *th = tcp_hdr(skb);
925 struct {
926 struct tcphdr th;
927 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
928 } rep;
929 struct net *net = sock_net(sk);
930 struct ip_reply_arg arg;
931 struct sock *ctl_sk;
932 u64 transmit_time;
933
934 memset(&rep.th, 0, sizeof(struct tcphdr));
935 memset(&arg, 0, sizeof(arg));
936
937 arg.iov[0].iov_base = (unsigned char *)&rep;
938 arg.iov[0].iov_len = sizeof(rep.th);
939 if (tsecr) {
940 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
941 (TCPOPT_TIMESTAMP << 8) |
942 TCPOLEN_TIMESTAMP);
943 rep.opt[1] = htonl(tsval);
944 rep.opt[2] = htonl(tsecr);
945 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
946 }
947
948
949 rep.th.dest = th->source;
950 rep.th.source = th->dest;
951 rep.th.doff = arg.iov[0].iov_len / 4;
952 rep.th.seq = htonl(seq);
953 rep.th.ack_seq = htonl(ack);
954 rep.th.ack = 1;
955 rep.th.window = htons(win);
956
957#ifdef CONFIG_TCP_MD5SIG
958 if (tcp_key_is_md5(key)) {
959 int offset = (tsecr) ? 3 : 0;
960
961 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
962 (TCPOPT_NOP << 16) |
963 (TCPOPT_MD5SIG << 8) |
964 TCPOLEN_MD5SIG);
965 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
966 rep.th.doff = arg.iov[0].iov_len/4;
967
968 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
969 key->md5_key, ip_hdr(skb)->saddr,
970 ip_hdr(skb)->daddr, &rep.th);
971 }
972#endif
973#ifdef CONFIG_TCP_AO
974 if (tcp_key_is_ao(key)) {
975 int offset = (tsecr) ? 3 : 0;
976
977 rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
978 (tcp_ao_len(key->ao_key) << 16) |
979 (key->ao_key->sndid << 8) |
980 key->rcv_next);
981 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
982 rep.th.doff = arg.iov[0].iov_len / 4;
983
984 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
985 key->ao_key, key->traffic_key,
986 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
987 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
988 &rep.th, key->sne);
989 }
990#endif
991 arg.flags = reply_flags;
992 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
993 ip_hdr(skb)->saddr,
994 arg.iov[0].iov_len, IPPROTO_TCP, 0);
995 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
996 if (oif)
997 arg.bound_dev_if = oif;
998 arg.tos = tos;
999 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1000 local_bh_disable();
1001 ctl_sk = this_cpu_read(ipv4_tcp_sk);
1002 sock_net_set(ctl_sk, net);
1003 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1004 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1005 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1006 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1007 transmit_time = tcp_transmit_time(sk);
1008 ip_send_unicast_reply(ctl_sk,
1009 skb, &TCP_SKB_CB(skb)->header.h4.opt,
1010 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1011 &arg, arg.iov[0].iov_len,
1012 transmit_time, txhash);
1013
1014 sock_net_set(ctl_sk, &init_net);
1015 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1016 local_bh_enable();
1017}
1018
/* Send an ACK on behalf of the TIME-WAIT socket @sk in reply to @skb,
 * signed with TCP-AO or MD5 when the TIME-WAIT socket carries such a key.
 * Drops the caller's reference on the TIME-WAIT socket on every path.
 *
 * Note: the if/else-if chain below is stitched together across
 * preprocessor conditionals; "if (0) {" keeps the braces balanced when
 * CONFIG_TCP_AO is disabled.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			/* Malformed auth options: send nothing. */
			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			/* Sign with the key the peer asked for (rnext_keyid). */
			if (aoh)
				key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
		}
	}
	/* ao_key is only non-NULL if ao_info was set above. */
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	if (0) {
#endif
#ifdef CONFIG_TCP_MD5SIG
	} else if (static_branch_unlikely(&tcp_md5_needed.key)) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
#endif
	}

	/* The advertised window is scaled down by the saved window scale. */
	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}
1073
/* Send an ACK for request socket @req in reply to @skb. @sk is either the
 * listener or, as the sk_state test below shows, a socket in another state
 * whose snd_nxt is then used as the sequence number.
 *
 * Note: as in tcp_v4_timewait_ack(), the if/else-if chain is stitched
 * together across preprocessor conditionals.
 */
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		/* The request used AO, so an unsigned segment is not ACKed. */
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* The matching key disappeared (e.g. removed by the
			 * user); without any key for this peer no ACK is sent.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		/* Temporary traffic key, freed after the ACK is sent. */
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	if (0) {
#endif
#ifdef CONFIG_TCP_MD5SIG
	} else if (static_branch_unlikely(&tcp_md5_needed.key)) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
#endif
	}

	/* The window is scaled by the request's receive window scale; oif
	 * is passed as 0 (no device binding).
	 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			READ_ONCE(req->ts_recent),
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos,
			READ_ONCE(tcp_rsk(req)->txhash));
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}
1157
1158
1159
1160
1161
1162
1163static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1164 struct flowi *fl,
1165 struct request_sock *req,
1166 struct tcp_fastopen_cookie *foc,
1167 enum tcp_synack_type synack_type,
1168 struct sk_buff *syn_skb)
1169{
1170 const struct inet_request_sock *ireq = inet_rsk(req);
1171 struct flowi4 fl4;
1172 int err = -1;
1173 struct sk_buff *skb;
1174 u8 tos;
1175
1176
1177 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1178 return -1;
1179
1180 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1181
1182 if (skb) {
1183 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1184
1185 tos = READ_ONCE(inet_sk(sk)->tos);
1186
1187 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1188 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1189 (tos & INET_ECN_MASK);
1190
1191 if (!INET_ECN_is_capable(tos) &&
1192 tcp_bpf_ca_needs_ecn((struct sock *)req))
1193 tos |= INET_ECN_ECT_0;
1194
1195 rcu_read_lock();
1196 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1197 ireq->ir_rmt_addr,
1198 rcu_dereference(ireq->ireq_opt),
1199 tos);
1200 rcu_read_unlock();
1201 err = net_xmit_eval(err);
1202 }
1203
1204 return err;
1205}
1206
1207
1208
1209
1210static void tcp_v4_reqsk_destructor(struct request_sock *req)
1211{
1212 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1213}
1214
1215#ifdef CONFIG_TCP_MD5SIG
1216
1217
1218
1219
1220
1221
/* Static key, initially false, gating the MD5 code paths in this file.
 * tcp_md5_do_add() increments it when a socket gets its first MD5 key.
 * HZ is the second macro argument (presumably the deferral period -
 * TODO confirm).
 */
DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_SYMBOL(tcp_md5_needed);
1224
1225static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1226{
1227 if (!old)
1228 return true;
1229
1230
1231 if (old->l3index && new->l3index == 0)
1232 return false;
1233 if (old->l3index == 0 && new->l3index)
1234 return true;
1235
1236 return old->prefixlen < new->prefixlen;
1237}
1238
1239
1240struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1241 const union tcp_md5_addr *addr,
1242 int family, bool any_l3index)
1243{
1244 const struct tcp_sock *tp = tcp_sk(sk);
1245 struct tcp_md5sig_key *key;
1246 const struct tcp_md5sig_info *md5sig;
1247 __be32 mask;
1248 struct tcp_md5sig_key *best_match = NULL;
1249 bool match;
1250
1251
1252 md5sig = rcu_dereference_check(tp->md5sig_info,
1253 lockdep_sock_is_held(sk));
1254 if (!md5sig)
1255 return NULL;
1256
1257 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1258 lockdep_sock_is_held(sk)) {
1259 if (key->family != family)
1260 continue;
1261 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1262 key->l3index != l3index)
1263 continue;
1264 if (family == AF_INET) {
1265 mask = inet_make_mask(key->prefixlen);
1266 match = (key->addr.a4.s_addr & mask) ==
1267 (addr->a4.s_addr & mask);
1268#if IS_ENABLED(CONFIG_IPV6)
1269 } else if (family == AF_INET6) {
1270 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1271 key->prefixlen);
1272#endif
1273 } else {
1274 match = false;
1275 }
1276
1277 if (match && better_md5_match(best_match, key))
1278 best_match = key;
1279 }
1280 return best_match;
1281}
1282EXPORT_SYMBOL(__tcp_md5_do_lookup);
1283
1284static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1285 const union tcp_md5_addr *addr,
1286 int family, u8 prefixlen,
1287 int l3index, u8 flags)
1288{
1289 const struct tcp_sock *tp = tcp_sk(sk);
1290 struct tcp_md5sig_key *key;
1291 unsigned int size = sizeof(struct in_addr);
1292 const struct tcp_md5sig_info *md5sig;
1293
1294
1295 md5sig = rcu_dereference_check(tp->md5sig_info,
1296 lockdep_sock_is_held(sk));
1297 if (!md5sig)
1298 return NULL;
1299#if IS_ENABLED(CONFIG_IPV6)
1300 if (family == AF_INET6)
1301 size = sizeof(struct in6_addr);
1302#endif
1303 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1304 lockdep_sock_is_held(sk)) {
1305 if (key->family != family)
1306 continue;
1307 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1308 continue;
1309 if (key->l3index != l3index)
1310 continue;
1311 if (!memcmp(&key->addr, addr, size) &&
1312 key->prefixlen == prefixlen)
1313 return key;
1314 }
1315 return NULL;
1316}
1317
1318struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1319 const struct sock *addr_sk)
1320{
1321 const union tcp_md5_addr *addr;
1322 int l3index;
1323
1324 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1325 addr_sk->sk_bound_dev_if);
1326 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1327 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1328}
1329EXPORT_SYMBOL(tcp_v4_md5_lookup);
1330
1331static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1332{
1333 struct tcp_sock *tp = tcp_sk(sk);
1334 struct tcp_md5sig_info *md5sig;
1335
1336 md5sig = kmalloc(sizeof(*md5sig), gfp);
1337 if (!md5sig)
1338 return -ENOMEM;
1339
1340 sk_gso_disable(sk);
1341 INIT_HLIST_HEAD(&md5sig->head);
1342 rcu_assign_pointer(tp->md5sig_info, md5sig);
1343 return 0;
1344}
1345
1346
1347static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1348 int family, u8 prefixlen, int l3index, u8 flags,
1349 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1350{
1351
1352 struct tcp_md5sig_key *key;
1353 struct tcp_sock *tp = tcp_sk(sk);
1354 struct tcp_md5sig_info *md5sig;
1355
1356 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1357 if (key) {
1358
1359
1360
1361
1362
1363
1364 data_race(memcpy(key->key, newkey, newkeylen));
1365
1366
1367
1368
1369
1370
1371 WRITE_ONCE(key->keylen, newkeylen);
1372
1373 return 0;
1374 }
1375
1376 md5sig = rcu_dereference_protected(tp->md5sig_info,
1377 lockdep_sock_is_held(sk));
1378
1379 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1380 if (!key)
1381 return -ENOMEM;
1382
1383 memcpy(key->key, newkey, newkeylen);
1384 key->keylen = newkeylen;
1385 key->family = family;
1386 key->prefixlen = prefixlen;
1387 key->l3index = l3index;
1388 key->flags = flags;
1389 memcpy(&key->addr, addr,
1390 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1391 sizeof(struct in_addr));
1392 hlist_add_head_rcu(&key->node, &md5sig->head);
1393 return 0;
1394}
1395
1396int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1397 int family, u8 prefixlen, int l3index, u8 flags,
1398 const u8 *newkey, u8 newkeylen)
1399{
1400 struct tcp_sock *tp = tcp_sk(sk);
1401
1402 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1403 if (tcp_md5_alloc_sigpool())
1404 return -ENOMEM;
1405
1406 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1407 tcp_md5_release_sigpool();
1408 return -ENOMEM;
1409 }
1410
1411 if (!static_branch_inc(&tcp_md5_needed.key)) {
1412 struct tcp_md5sig_info *md5sig;
1413
1414 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1415 rcu_assign_pointer(tp->md5sig_info, NULL);
1416 kfree_rcu(md5sig, rcu);
1417 tcp_md5_release_sigpool();
1418 return -EUSERS;
1419 }
1420 }
1421
1422 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1423 newkey, newkeylen, GFP_KERNEL);
1424}
1425EXPORT_SYMBOL(tcp_md5_do_add);
1426
1427int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1428 int family, u8 prefixlen, int l3index,
1429 struct tcp_md5sig_key *key)
1430{
1431 struct tcp_sock *tp = tcp_sk(sk);
1432
1433 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1434 tcp_md5_add_sigpool();
1435
1436 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1437 tcp_md5_release_sigpool();
1438 return -ENOMEM;
1439 }
1440
1441 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1442 struct tcp_md5sig_info *md5sig;
1443
1444 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1445 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1446 rcu_assign_pointer(tp->md5sig_info, NULL);
1447 kfree_rcu(md5sig, rcu);
1448 tcp_md5_release_sigpool();
1449 return -EUSERS;
1450 }
1451 }
1452
1453 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1454 key->flags, key->key, key->keylen,
1455 sk_gfp_mask(sk, GFP_ATOMIC));
1456}
1457EXPORT_SYMBOL(tcp_md5_key_copy);
1458
1459int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1460 u8 prefixlen, int l3index, u8 flags)
1461{
1462 struct tcp_md5sig_key *key;
1463
1464 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1465 if (!key)
1466 return -ENOENT;
1467 hlist_del_rcu(&key->node);
1468 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1469 kfree_rcu(key, rcu);
1470 return 0;
1471}
1472EXPORT_SYMBOL(tcp_md5_do_del);
1473
1474void tcp_clear_md5_list(struct sock *sk)
1475{
1476 struct tcp_sock *tp = tcp_sk(sk);
1477 struct tcp_md5sig_key *key;
1478 struct hlist_node *n;
1479 struct tcp_md5sig_info *md5sig;
1480
1481 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1482
1483 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1484 hlist_del_rcu(&key->node);
1485 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1486 kfree_rcu(key, rcu);
1487 }
1488}
1489
1490static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1491 sockptr_t optval, int optlen)
1492{
1493 struct tcp_md5sig cmd;
1494 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1495 const union tcp_md5_addr *addr;
1496 u8 prefixlen = 32;
1497 int l3index = 0;
1498 bool l3flag;
1499 u8 flags;
1500
1501 if (optlen < sizeof(cmd))
1502 return -EINVAL;
1503
1504 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1505 return -EFAULT;
1506
1507 if (sin->sin_family != AF_INET)
1508 return -EINVAL;
1509
1510 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1511 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1512
1513 if (optname == TCP_MD5SIG_EXT &&
1514 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1515 prefixlen = cmd.tcpm_prefixlen;
1516 if (prefixlen > 32)
1517 return -EINVAL;
1518 }
1519
1520 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1521 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1522 struct net_device *dev;
1523
1524 rcu_read_lock();
1525 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1526 if (dev && netif_is_l3_master(dev))
1527 l3index = dev->ifindex;
1528
1529 rcu_read_unlock();
1530
1531
1532
1533
1534 if (!dev || !l3index)
1535 return -EINVAL;
1536 }
1537
1538 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1539
1540 if (!cmd.tcpm_keylen)
1541 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1542
1543 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1544 return -EINVAL;
1545
1546
1547
1548
1549 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1550 return -EKEYREJECTED;
1551
1552 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1553 cmd.tcpm_key, cmd.tcpm_keylen);
1554}
1555
1556static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1557 __be32 daddr, __be32 saddr,
1558 const struct tcphdr *th, int nbytes)
1559{
1560 struct tcp4_pseudohdr *bp;
1561 struct scatterlist sg;
1562 struct tcphdr *_th;
1563
1564 bp = hp->scratch;
1565 bp->saddr = saddr;
1566 bp->daddr = daddr;
1567 bp->pad = 0;
1568 bp->protocol = IPPROTO_TCP;
1569 bp->len = cpu_to_be16(nbytes);
1570
1571 _th = (struct tcphdr *)(bp + 1);
1572 memcpy(_th, th, sizeof(*th));
1573 _th->check = 0;
1574
1575 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1576 ahash_request_set_crypt(hp->req, &sg, NULL,
1577 sizeof(*bp) + sizeof(*th));
1578 return crypto_ahash_update(hp->req);
1579}
1580
1581static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1582 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1583{
1584 struct tcp_sigpool hp;
1585
1586 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1587 goto clear_hash_nostart;
1588
1589 if (crypto_ahash_init(hp.req))
1590 goto clear_hash;
1591 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1592 goto clear_hash;
1593 if (tcp_md5_hash_key(&hp, key))
1594 goto clear_hash;
1595 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1596 if (crypto_ahash_final(hp.req))
1597 goto clear_hash;
1598
1599 tcp_sigpool_end(&hp);
1600 return 0;
1601
1602clear_hash:
1603 tcp_sigpool_end(&hp);
1604clear_hash_nostart:
1605 memset(md5_hash, 0, 16);
1606 return 1;
1607}
1608
1609int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1610 const struct sock *sk,
1611 const struct sk_buff *skb)
1612{
1613 const struct tcphdr *th = tcp_hdr(skb);
1614 struct tcp_sigpool hp;
1615 __be32 saddr, daddr;
1616
1617 if (sk) {
1618 saddr = sk->sk_rcv_saddr;
1619 daddr = sk->sk_daddr;
1620 } else {
1621 const struct iphdr *iph = ip_hdr(skb);
1622 saddr = iph->saddr;
1623 daddr = iph->daddr;
1624 }
1625
1626 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1627 goto clear_hash_nostart;
1628
1629 if (crypto_ahash_init(hp.req))
1630 goto clear_hash;
1631
1632 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1633 goto clear_hash;
1634 if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1635 goto clear_hash;
1636 if (tcp_md5_hash_key(&hp, key))
1637 goto clear_hash;
1638 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1639 if (crypto_ahash_final(hp.req))
1640 goto clear_hash;
1641
1642 tcp_sigpool_end(&hp);
1643 return 0;
1644
1645clear_hash:
1646 tcp_sigpool_end(&hp);
1647clear_hash_nostart:
1648 memset(md5_hash, 0, 16);
1649 return 1;
1650}
1651EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1652
1653#endif
1654
1655static void tcp_v4_init_req(struct request_sock *req,
1656 const struct sock *sk_listener,
1657 struct sk_buff *skb)
1658{
1659 struct inet_request_sock *ireq = inet_rsk(req);
1660 struct net *net = sock_net(sk_listener);
1661
1662 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1663 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1664 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1665}
1666
1667static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1668 struct sk_buff *skb,
1669 struct flowi *fl,
1670 struct request_sock *req)
1671{
1672 tcp_v4_init_req(req, sk, skb);
1673
1674 if (security_inet_conn_request(sk, skb, req))
1675 return NULL;
1676
1677 return inet_csk_route_req(sk, &fl->u.ip4, req);
1678}
1679
1680struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1681 .family = PF_INET,
1682 .obj_size = sizeof(struct tcp_request_sock),
1683 .rtx_syn_ack = tcp_rtx_synack,
1684 .send_ack = tcp_v4_reqsk_send_ack,
1685 .destructor = tcp_v4_reqsk_destructor,
1686 .send_reset = tcp_v4_send_reset,
1687 .syn_ack_timeout = tcp_syn_ack_timeout,
1688};
1689
1690const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1691 .mss_clamp = TCP_MSS_DEFAULT,
1692#ifdef CONFIG_TCP_MD5SIG
1693 .req_md5_lookup = tcp_v4_md5_lookup,
1694 .calc_md5_hash = tcp_v4_md5_hash_skb,
1695#endif
1696#ifdef CONFIG_TCP_AO
1697 .ao_lookup = tcp_v4_ao_lookup_rsk,
1698 .ao_calc_key = tcp_v4_ao_calc_key_rsk,
1699 .ao_synack_hash = tcp_v4_ao_synack_hash,
1700#endif
1701#ifdef CONFIG_SYN_COOKIES
1702 .cookie_init_seq = cookie_v4_init_sequence,
1703#endif
1704 .route_req = tcp_v4_route_req,
1705 .init_seq = tcp_v4_init_seq,
1706 .init_ts_off = tcp_v4_init_ts_off,
1707 .send_synack = tcp_v4_send_synack,
1708};
1709
1710int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1711{
1712
1713 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1714 goto drop;
1715
1716 return tcp_conn_request(&tcp_request_sock_ops,
1717 &tcp_request_sock_ipv4_ops, sk, skb);
1718
1719drop:
1720 tcp_listendrop(sk);
1721 return 0;
1722}
1723EXPORT_SYMBOL(tcp_v4_conn_request);
1724
1725
1726
1727
1728
1729
/* Create the child socket for connection request @req on listener @sk.
 *
 * @dst may be NULL, in which case a route is looked up here.
 * *@own_req is set from inet_ehash_nolisten() once the child has been
 * inserted into the established hash.
 *
 * Returns the new socket, or NULL when the accept queue is full, the
 * child cannot be created/routed, key copying or port inheritance fails,
 * or a duplicate socket was found with no @req_unhash.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	/* Populate the child's addressing and IP-level state from the
	 * request sock and the incoming packet.
	 */
	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr = ireq->ir_loc_addr;
	inet_opt = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = ip_hdr(skb)->ttl;
	newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* When sysctl_tcp_reflect_tos is set, take the ToS from the value
	 * recorded for the SYN, with the ECN bits masked out.
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* The caller already supplied a route; nothing to do.
		 * NOTE(review): presumably the syncookie path - confirm.
		 */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy the listener's MD5 key for this peer onto the child, unless
	 * the request used TCP-AO.
	 */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key && !tcp_rsk_used_ao(req)) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif
#ifdef CONFIG_TCP_AO
	/* A non-zero return aborts child creation. */
	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
		goto put_and_exit;
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		/* The options pointer now lives in newinet->inet_opt. */
		ireq->ireq_opt = NULL;
	} else {
		/* The request keeps its options pointer; clear the copy. */
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* Duplicate found and no request to unhash: unlock
			 * and drop the child and report failure to the caller.
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	/* Clear the child's copy of the options pointer (the request sock
	 * still holds it) before the child is torn down.
	 */
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1848
/* With CONFIG_SYN_COOKIES, run non-SYN segments through
 * cookie_v4_check() and return its result; SYN segments, and all
 * segments when syncookies are compiled out, return @sk unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1859
1860u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1861 struct tcphdr *th, u32 *cookie)
1862{
1863 u16 mss = 0;
1864#ifdef CONFIG_SYN_COOKIES
1865 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1866 &tcp_request_sock_ipv4_ops, sk, th);
1867 if (mss) {
1868 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1869 tcp_synq_overflow(sk);
1870 }
1871#endif
1872 return mss;
1873}
1874
1875INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1876 u32));
1877
1878
1879
1880
1881
1882
1883
1884
/* Per-socket receive processing for one segment.
 *
 * Established sockets take the fast path through tcp_rcv_established().
 * Listeners first go through the syncookie check, which may yield a
 * child socket to process instead. Everything else is fed to
 * tcp_rcv_state_process().
 *
 * In tcp_v4_rcv() this is called with the socket BH-locked, except for
 * TCP_LISTEN sockets.
 * Always returns 0; on error paths the skb is freed here.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			/* Drop the cached rx dst if the packet arrived on a
			 * different interface or the dst fails its check.
			 */
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			/* The cookie check returned a different socket:
			 * process the segment on it.
			 */
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	/* rsk is the socket on whose behalf the reset is sent. */
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb_reason(skb, reason);
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
1954
1955int tcp_v4_early_demux(struct sk_buff *skb)
1956{
1957 struct net *net = dev_net(skb->dev);
1958 const struct iphdr *iph;
1959 const struct tcphdr *th;
1960 struct sock *sk;
1961
1962 if (skb->pkt_type != PACKET_HOST)
1963 return 0;
1964
1965 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1966 return 0;
1967
1968 iph = ip_hdr(skb);
1969 th = tcp_hdr(skb);
1970
1971 if (th->doff < sizeof(struct tcphdr) / 4)
1972 return 0;
1973
1974 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1975 iph->saddr, th->source,
1976 iph->daddr, ntohs(th->dest),
1977 skb->skb_iif, inet_sdif(skb));
1978 if (sk) {
1979 skb->sk = sk;
1980 skb->destructor = sock_edemux;
1981 if (sk_fullsock(sk)) {
1982 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1983
1984 if (dst)
1985 dst = dst_check(dst, 0);
1986 if (dst &&
1987 sk->sk_rx_dst_ifindex == skb->skb_iif)
1988 skb_dst_set_noref(skb, dst);
1989 }
1990 }
1991 return 0;
1992}
1993
/* Queue @skb on the backlog of @sk, coalescing it into the last
 * backlogged skb when possible.
 *
 * Returns false when the skb was queued or merged. Returns true when it
 * must be dropped by the caller (bad checksum or backlog limit hit); in
 * that case *@reason is set and the socket has already been unlocked
 * with bh_unlock_sock().
 */
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
{
	u32 limit, tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	int delta;

	/* NOTE(review): skb_condense() presumably trims skb->truesize before
	 * the skb is charged to the backlog - confirm against its definition.
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Try to coalesce with the last skb in the backlog first; the
	 * backlog limit is only applied on the no_coalesce path below.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	/* Coalescing requires: contiguous sequence space, same DS field,
	 * no SYN/RST/URG on either skb, ACK on both, identical ECE/CWR,
	 * same decryption state (TLS device), MPTCP agreeing, and
	 * byte-identical TCP options of the same length.
	 */
	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    !mptcp_skb_can_collapse(tail, skb) ||
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	/* Strip the TCP header so only payload is merged; it is pushed back
	 * below if the merge fails.
	 */
	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	/* From here on shinfo refers to the tail skb. */
	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		/* Only move ack_seq/window forward, never backward. */
		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* Propagate FIN and the flag bits of the appended skb into
		 * both the tail's header and its control block.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Keep the larger gso_size; gso_segs is capped at 0xFFFF. */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	/* Merge failed: restore the header pulled above. */
	__skb_push(skb, hdrlen);

no_coalesce:
	/* Backlog limit: rcvbuf plus half of sndbuf ... */
	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);

	/* ... plus a fixed 64KB of headroom. */
	limit += 64 * 1024;

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
2120
2121int tcp_filter(struct sock *sk, struct sk_buff *skb)
2122{
2123 struct tcphdr *th = (struct tcphdr *)skb->data;
2124
2125 return sk_filter_trim_cap(sk, skb, th->doff * 4);
2126}
2127EXPORT_SYMBOL(tcp_filter);
2128
2129static void tcp_v4_restore_cb(struct sk_buff *skb)
2130{
2131 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2132 sizeof(struct inet_skb_parm));
2133}
2134
/* Fill TCP_SKB_CB(skb) from the IP and TCP headers of an incoming
 * segment. The IP control block is preserved in ->header.h4 first.
 */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* Move IPCB into its slot inside TCP_SKB_CB() before any TCP field
	 * is written. The barrier() keeps the compiler from reordering the
	 * stores below ahead of the move.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* SYN and FIN each consume one sequence number. */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked = 0;
	/* True if either a software or a hardware timestamp is present. */
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
2156
2157
2158
2159
2160
/* Main IPv4 TCP receive entry point.
 *
 * Validates the header, looks up the owning socket and dispatches on its
 * state: TIME_WAIT and NEW_SYN_RECV (request sock) get dedicated
 * handling, listeners are processed directly, and all other sockets are
 * processed under the BH socket lock or queued to the backlog when user
 * context owns the socket.
 *
 * Returns the result of tcp_v4_do_rcv() on the normal paths and 0 on all
 * drop paths. The skb is freed on the discard paths.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	enum skb_drop_reason drop_reason;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count the segment even if it turns out to be bad. */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* Only the checksum state is initialised here; full verification is
	 * done later via tcp_checksum_complete() on the individual paths.
	 */
	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Reload the header pointers after the pskb_may_pull() calls above. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
			       skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		/* The lookup returned a request sock; continue on its
		 * listener.
		 */
		sk = req->rsk_listener;
		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		else
			drop_reason = tcp_inbound_hash(sk, req, skb,
						       &iph->saddr, &iph->daddr,
						       AF_INET, dif, sdif);
		if (unlikely(drop_reason)) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			/* Listener is no longer listening: try to migrate
			 * the request to another reuseport socket.
			 */
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
			if (!nsk) {
				inet_csk_reqsk_queue_drop_and_put(sk, req);
				goto lookup;
			}
			sk = nsk;
			/* No sock_hold() on this branch.
			 * NOTE(review): presumably reuseport_migrate_sock()
			 * returns nsk with a reference held - confirm.
			 */
		} else {
			/* Take our own reference on the listener; it is
			 * dropped via sock_put() on every exit path.
			 */
			sock_hold(sk);
		}
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			/* The header pointers are refreshed after the filter
			 * ran on the skb.
			 */
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		} else {
			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* tcp_check_req() flagged the request as
				 * stolen: restore the IP control block and
				 * redo the lookup instead of dropping.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		nf_reset_ct(skb);
		if (nsk == sk) {
			/* Continue processing on the listener itself. */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl is read with READ_ONCE(), so it is treated as
		 * possibly changing concurrently.
		 */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
			goto discard_and_relse;
		}
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		goto discard_and_relse;
	}

	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
				       AF_INET, dif, sdif);
	if (drop_reason)
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb)) {
		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		goto discard_and_relse;
	}
	/* Refresh the header pointers after the filter ran on the skb. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	/* Listeners are processed without taking the BH socket lock. */
	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		/* On failure tcp_add_backlog() has already unlocked sk. */
		if (tcp_add_backlog(sk, skb, &drop_reason))
			goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	/* csum_error and bad_packet are jumped to from the paths above; they
	 * sit inside this if-body so they share the error counters below.
	 */
	if (tcp_checksum_complete(skb)) {
csum_error:
		drop_reason = SKB_DROP_REASON_TCP_CSUM;
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		/* Valid segment but no socket: answer with a reset. */
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
	/* Discard the frame. */
	kfree_skb_reason(skb, drop_reason);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* A SYN hit a TIME_WAIT socket: if a listener exists, retire
		 * the timewait sock and process the SYN on the listener.
		 */
		struct sock *sk2 = inet_lookup_listener(net,
							net->ipv4.tcp_death_row.hashinfo,
							skb, __tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* No listener: fall through and ACK. */
		fallthrough;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
2415
2416static struct timewait_sock_ops tcp_timewait_sock_ops = {
2417 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2418 .twsk_unique = tcp_twsk_unique,
2419 .twsk_destructor= tcp_twsk_destructor,
2420};
2421
2422void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2423{
2424 struct dst_entry *dst = skb_dst(skb);
2425
2426 if (dst && dst_hold_safe(dst)) {
2427 rcu_assign_pointer(sk->sk_rx_dst, dst);
2428 sk->sk_rx_dst_ifindex = skb->skb_iif;
2429 }
2430}
2431EXPORT_SYMBOL(inet_sk_rx_dst_set);
2432
2433const struct inet_connection_sock_af_ops ipv4_specific = {
2434 .queue_xmit = ip_queue_xmit,
2435 .send_check = tcp_v4_send_check,
2436 .rebuild_header = inet_sk_rebuild_header,
2437 .sk_rx_dst_set = inet_sk_rx_dst_set,
2438 .conn_request = tcp_v4_conn_request,
2439 .syn_recv_sock = tcp_v4_syn_recv_sock,
2440 .net_header_len = sizeof(struct iphdr),
2441 .setsockopt = ip_setsockopt,
2442 .getsockopt = ip_getsockopt,
2443 .addr2sockaddr = inet_csk_addr2sockaddr,
2444 .sockaddr_len = sizeof(struct sockaddr_in),
2445 .mtu_reduced = tcp_v4_mtu_reduced,
2446};
2447EXPORT_SYMBOL(ipv4_specific);
2448
2449#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2450static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2451#ifdef CONFIG_TCP_MD5SIG
2452 .md5_lookup = tcp_v4_md5_lookup,
2453 .calc_md5_hash = tcp_v4_md5_hash_skb,
2454 .md5_parse = tcp_v4_parse_md5_keys,
2455#endif
2456#ifdef CONFIG_TCP_AO
2457 .ao_lookup = tcp_v4_ao_lookup,
2458 .calc_ao_hash = tcp_v4_ao_hash_skb,
2459 .ao_parse = tcp_v4_parse_ao,
2460 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
2461#endif
2462};
2463#endif
2464
2465
2466
2467
2468static int tcp_v4_init_sock(struct sock *sk)
2469{
2470 struct inet_connection_sock *icsk = inet_csk(sk);
2471
2472 tcp_init_sock(sk);
2473
2474 icsk->icsk_af_ops = &ipv4_specific;
2475
2476#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2477 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2478#endif
2479
2480 return 0;
2481}
2482
2483#ifdef CONFIG_TCP_MD5SIG
2484static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2485{
2486 struct tcp_md5sig_info *md5sig;
2487
2488 md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2489 kfree(md5sig);
2490 static_branch_slow_dec_deferred(&tcp_md5_needed);
2491 tcp_md5_release_sigpool();
2492}
2493#endif
2494
/* Release the TCP-specific resources of @sk as part of socket
 * destruction: timers, congestion control, ULP, queued data, MD5 and AO
 * keys, the bound port and fastopen state.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Let the fastopen code inspect the out-of-order queue before it is
	 * purged below.
	 */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Purge the out-of-order queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any. The container itself is freed
	 * from an RCU callback.
	 */
	if (tp->md5sig_info) {
		struct tcp_md5sig_info *md5sig;

		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
		tcp_clear_md5_list(sk);
		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
		rcu_assign_pointer(tp->md5sig_info, NULL);
	}
#endif
	tcp_ao_destroy_sock(sk, false);

	/* Release the bind bucket if the socket still references one. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/* No fastopen request sock may remain at this point. */
	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* Free any remaining fastopen request, cipher and saved SYN. */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
2543
2544#ifdef CONFIG_PROC_FS
2545
2546
2547static unsigned short seq_file_family(const struct seq_file *seq);
2548
2549static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2550{
2551 unsigned short family = seq_file_family(seq);
2552
2553
2554 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2555 net_eq(sock_net(sk), seq_file_net(seq)));
2556}
2557
2558
2559
2560
2561static void *listening_get_first(struct seq_file *seq)
2562{
2563 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2564 struct tcp_iter_state *st = seq->private;
2565
2566 st->offset = 0;
2567 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2568 struct inet_listen_hashbucket *ilb2;
2569 struct hlist_nulls_node *node;
2570 struct sock *sk;
2571
2572 ilb2 = &hinfo->lhash2[st->bucket];
2573 if (hlist_nulls_empty(&ilb2->nulls_head))
2574 continue;
2575
2576 spin_lock(&ilb2->lock);
2577 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2578 if (seq_sk_match(seq, sk))
2579 return sk;
2580 }
2581 spin_unlock(&ilb2->lock);
2582 }
2583
2584 return NULL;
2585}
2586
2587
2588
2589
2590
2591
2592static void *listening_get_next(struct seq_file *seq, void *cur)
2593{
2594 struct tcp_iter_state *st = seq->private;
2595 struct inet_listen_hashbucket *ilb2;
2596 struct hlist_nulls_node *node;
2597 struct inet_hashinfo *hinfo;
2598 struct sock *sk = cur;
2599
2600 ++st->num;
2601 ++st->offset;
2602
2603 sk = sk_nulls_next(sk);
2604 sk_nulls_for_each_from(sk, node) {
2605 if (seq_sk_match(seq, sk))
2606 return sk;
2607 }
2608
2609 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2610 ilb2 = &hinfo->lhash2[st->bucket];
2611 spin_unlock(&ilb2->lock);
2612 ++st->bucket;
2613 return listening_get_first(seq);
2614}
2615
2616static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2617{
2618 struct tcp_iter_state *st = seq->private;
2619 void *rc;
2620
2621 st->bucket = 0;
2622 st->offset = 0;
2623 rc = listening_get_first(seq);
2624
2625 while (rc && *pos) {
2626 rc = listening_get_next(seq, rc);
2627 --*pos;
2628 }
2629 return rc;
2630}
2631
2632static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2633 const struct tcp_iter_state *st)
2634{
2635 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2636}
2637
2638
2639
2640
2641
2642static void *established_get_first(struct seq_file *seq)
2643{
2644 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2645 struct tcp_iter_state *st = seq->private;
2646
2647 st->offset = 0;
2648 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2649 struct sock *sk;
2650 struct hlist_nulls_node *node;
2651 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2652
2653 cond_resched();
2654
2655
2656 if (empty_bucket(hinfo, st))
2657 continue;
2658
2659 spin_lock_bh(lock);
2660 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2661 if (seq_sk_match(seq, sk))
2662 return sk;
2663 }
2664 spin_unlock_bh(lock);
2665 }
2666
2667 return NULL;
2668}
2669
2670static void *established_get_next(struct seq_file *seq, void *cur)
2671{
2672 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2673 struct tcp_iter_state *st = seq->private;
2674 struct hlist_nulls_node *node;
2675 struct sock *sk = cur;
2676
2677 ++st->num;
2678 ++st->offset;
2679
2680 sk = sk_nulls_next(sk);
2681
2682 sk_nulls_for_each_from(sk, node) {
2683 if (seq_sk_match(seq, sk))
2684 return sk;
2685 }
2686
2687 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2688 ++st->bucket;
2689 return established_get_first(seq);
2690}
2691
2692static void *established_get_idx(struct seq_file *seq, loff_t pos)
2693{
2694 struct tcp_iter_state *st = seq->private;
2695 void *rc;
2696
2697 st->bucket = 0;
2698 rc = established_get_first(seq);
2699
2700 while (rc && pos) {
2701 rc = established_get_next(seq, rc);
2702 --pos;
2703 }
2704 return rc;
2705}
2706
2707static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2708{
2709 void *rc;
2710 struct tcp_iter_state *st = seq->private;
2711
2712 st->state = TCP_SEQ_STATE_LISTENING;
2713 rc = listening_get_idx(seq, &pos);
2714
2715 if (!rc) {
2716 st->state = TCP_SEQ_STATE_ESTABLISHED;
2717 rc = established_get_idx(seq, pos);
2718 }
2719
2720 return rc;
2721}
2722
2723static void *tcp_seek_last_pos(struct seq_file *seq)
2724{
2725 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2726 struct tcp_iter_state *st = seq->private;
2727 int bucket = st->bucket;
2728 int offset = st->offset;
2729 int orig_num = st->num;
2730 void *rc = NULL;
2731
2732 switch (st->state) {
2733 case TCP_SEQ_STATE_LISTENING:
2734 if (st->bucket > hinfo->lhash2_mask)
2735 break;
2736 rc = listening_get_first(seq);
2737 while (offset-- && rc && bucket == st->bucket)
2738 rc = listening_get_next(seq, rc);
2739 if (rc)
2740 break;
2741 st->bucket = 0;
2742 st->state = TCP_SEQ_STATE_ESTABLISHED;
2743 fallthrough;
2744 case TCP_SEQ_STATE_ESTABLISHED:
2745 if (st->bucket > hinfo->ehash_mask)
2746 break;
2747 rc = established_get_first(seq);
2748 while (offset-- && rc && bucket == st->bucket)
2749 rc = established_get_next(seq, rc);
2750 }
2751
2752 st->num = orig_num;
2753
2754 return rc;
2755}
2756
2757void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2758{
2759 struct tcp_iter_state *st = seq->private;
2760 void *rc;
2761
2762 if (*pos && *pos == st->last_pos) {
2763 rc = tcp_seek_last_pos(seq);
2764 if (rc)
2765 goto out;
2766 }
2767
2768 st->state = TCP_SEQ_STATE_LISTENING;
2769 st->num = 0;
2770 st->bucket = 0;
2771 st->offset = 0;
2772 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2773
2774out:
2775 st->last_pos = *pos;
2776 return rc;
2777}
2778EXPORT_SYMBOL(tcp_seq_start);
2779
2780void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2781{
2782 struct tcp_iter_state *st = seq->private;
2783 void *rc = NULL;
2784
2785 if (v == SEQ_START_TOKEN) {
2786 rc = tcp_get_idx(seq, 0);
2787 goto out;
2788 }
2789
2790 switch (st->state) {
2791 case TCP_SEQ_STATE_LISTENING:
2792 rc = listening_get_next(seq, v);
2793 if (!rc) {
2794 st->state = TCP_SEQ_STATE_ESTABLISHED;
2795 st->bucket = 0;
2796 st->offset = 0;
2797 rc = established_get_first(seq);
2798 }
2799 break;
2800 case TCP_SEQ_STATE_ESTABLISHED:
2801 rc = established_get_next(seq, v);
2802 break;
2803 }
2804out:
2805 ++*pos;
2806 st->last_pos = *pos;
2807 return rc;
2808}
2809EXPORT_SYMBOL(tcp_seq_next);
2810
2811void tcp_seq_stop(struct seq_file *seq, void *v)
2812{
2813 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2814 struct tcp_iter_state *st = seq->private;
2815
2816 switch (st->state) {
2817 case TCP_SEQ_STATE_LISTENING:
2818 if (v != SEQ_START_TOKEN)
2819 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2820 break;
2821 case TCP_SEQ_STATE_ESTABLISHED:
2822 if (v)
2823 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2824 break;
2825 }
2826}
2827EXPORT_SYMBOL(tcp_seq_stop);
2828
2829static void get_openreq4(const struct request_sock *req,
2830 struct seq_file *f, int i)
2831{
2832 const struct inet_request_sock *ireq = inet_rsk(req);
2833 long delta = req->rsk_timer.expires - jiffies;
2834
2835 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2836 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2837 i,
2838 ireq->ir_loc_addr,
2839 ireq->ir_num,
2840 ireq->ir_rmt_addr,
2841 ntohs(ireq->ir_rmt_port),
2842 TCP_SYN_RECV,
2843 0, 0,
2844 1,
2845 jiffies_delta_to_clock_t(delta),
2846 req->num_timeout,
2847 from_kuid_munged(seq_user_ns(f),
2848 sock_i_uid(req->rsk_listener)),
2849 0,
2850 0,
2851 0,
2852 req);
2853}
2854
2855static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2856{
2857 int timer_active;
2858 unsigned long timer_expires;
2859 const struct tcp_sock *tp = tcp_sk(sk);
2860 const struct inet_connection_sock *icsk = inet_csk(sk);
2861 const struct inet_sock *inet = inet_sk(sk);
2862 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2863 __be32 dest = inet->inet_daddr;
2864 __be32 src = inet->inet_rcv_saddr;
2865 __u16 destp = ntohs(inet->inet_dport);
2866 __u16 srcp = ntohs(inet->inet_sport);
2867 int rx_queue;
2868 int state;
2869
2870 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2871 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2872 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2873 timer_active = 1;
2874 timer_expires = icsk->icsk_timeout;
2875 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2876 timer_active = 4;
2877 timer_expires = icsk->icsk_timeout;
2878 } else if (timer_pending(&sk->sk_timer)) {
2879 timer_active = 2;
2880 timer_expires = sk->sk_timer.expires;
2881 } else {
2882 timer_active = 0;
2883 timer_expires = jiffies;
2884 }
2885
2886 state = inet_sk_state_load(sk);
2887 if (state == TCP_LISTEN)
2888 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2889 else
2890
2891
2892
2893 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2894 READ_ONCE(tp->copied_seq), 0);
2895
2896 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2897 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2898 i, src, srcp, dest, destp, state,
2899 READ_ONCE(tp->write_seq) - tp->snd_una,
2900 rx_queue,
2901 timer_active,
2902 jiffies_delta_to_clock_t(timer_expires - jiffies),
2903 icsk->icsk_retransmits,
2904 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2905 icsk->icsk_probes_out,
2906 sock_i_ino(sk),
2907 refcount_read(&sk->sk_refcnt), sk,
2908 jiffies_to_clock_t(icsk->icsk_rto),
2909 jiffies_to_clock_t(icsk->icsk_ack.ato),
2910 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2911 tcp_snd_cwnd(tp),
2912 state == TCP_LISTEN ?
2913 fastopenq->max_qlen :
2914 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2915}
2916
2917static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2918 struct seq_file *f, int i)
2919{
2920 long delta = tw->tw_timer.expires - jiffies;
2921 __be32 dest, src;
2922 __u16 destp, srcp;
2923
2924 dest = tw->tw_daddr;
2925 src = tw->tw_rcv_saddr;
2926 destp = ntohs(tw->tw_dport);
2927 srcp = ntohs(tw->tw_sport);
2928
2929 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2930 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2931 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2932 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2933 refcount_read(&tw->tw_refcnt), tw);
2934}
2935
2936#define TMPSZ 150
2937
2938static int tcp4_seq_show(struct seq_file *seq, void *v)
2939{
2940 struct tcp_iter_state *st;
2941 struct sock *sk = v;
2942
2943 seq_setwidth(seq, TMPSZ - 1);
2944 if (v == SEQ_START_TOKEN) {
2945 seq_puts(seq, " sl local_address rem_address st tx_queue "
2946 "rx_queue tr tm->when retrnsmt uid timeout "
2947 "inode");
2948 goto out;
2949 }
2950 st = seq->private;
2951
2952 if (sk->sk_state == TCP_TIME_WAIT)
2953 get_timewait4_sock(v, seq, st->num);
2954 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2955 get_openreq4(v, seq, st->num);
2956 else
2957 get_tcp4_sock(v, seq, st->num);
2958out:
2959 seq_pad(seq, '\n');
2960 return 0;
2961}
2962
2963#ifdef CONFIG_BPF_SYSCALL
/*
 * Private state of the BPF TCP iterator.  Sockets of one hash bucket are
 * collected into @batch (each with a reference held) while the bucket lock
 * is taken, and are then shown one by one without the lock.
 */
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;	/* bucket/offset cursor shared with the /proc walker */
	unsigned int cur_sk;		/* index in @batch of the socket currently being shown */
	unsigned int end_sk;		/* one past the last filled slot of @batch */
	unsigned int max_sk;		/* number of slots allocated in @batch */
	struct sock **batch;		/* referenced sockets of the current bucket */
	bool st_bucket_done;		/* @batch holds every matching socket of the bucket */
};
2972
/* Context handed to the BPF program for each visited TCP socket. */
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);		/* iterator metadata */
	__bpf_md_ptr(struct sock_common *, sk_common);	/* visited socket; NULL on the final call from ->stop */
	uid_t uid __aligned(8);				/* owner uid computed by the ->show handler */
};
2978
2979static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2980 struct sock_common *sk_common, uid_t uid)
2981{
2982 struct bpf_iter__tcp ctx;
2983
2984 meta->seq_num--;
2985 ctx.meta = meta;
2986 ctx.sk_common = sk_common;
2987 ctx.uid = uid;
2988 return bpf_iter_run_prog(prog, &ctx);
2989}
2990
2991static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2992{
2993 while (iter->cur_sk < iter->end_sk)
2994 sock_gen_put(iter->batch[iter->cur_sk++]);
2995}
2996
2997static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2998 unsigned int new_batch_sz)
2999{
3000 struct sock **new_batch;
3001
3002 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3003 GFP_USER | __GFP_NOWARN);
3004 if (!new_batch)
3005 return -ENOMEM;
3006
3007 bpf_iter_tcp_put_batch(iter);
3008 kvfree(iter->batch);
3009 iter->batch = new_batch;
3010 iter->max_sk = new_batch_sz;
3011
3012 return 0;
3013}
3014
3015static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3016 struct sock *start_sk)
3017{
3018 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3019 struct bpf_tcp_iter_state *iter = seq->private;
3020 struct tcp_iter_state *st = &iter->state;
3021 struct hlist_nulls_node *node;
3022 unsigned int expected = 1;
3023 struct sock *sk;
3024
3025 sock_hold(start_sk);
3026 iter->batch[iter->end_sk++] = start_sk;
3027
3028 sk = sk_nulls_next(start_sk);
3029 sk_nulls_for_each_from(sk, node) {
3030 if (seq_sk_match(seq, sk)) {
3031 if (iter->end_sk < iter->max_sk) {
3032 sock_hold(sk);
3033 iter->batch[iter->end_sk++] = sk;
3034 }
3035 expected++;
3036 }
3037 }
3038 spin_unlock(&hinfo->lhash2[st->bucket].lock);
3039
3040 return expected;
3041}
3042
3043static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3044 struct sock *start_sk)
3045{
3046 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3047 struct bpf_tcp_iter_state *iter = seq->private;
3048 struct tcp_iter_state *st = &iter->state;
3049 struct hlist_nulls_node *node;
3050 unsigned int expected = 1;
3051 struct sock *sk;
3052
3053 sock_hold(start_sk);
3054 iter->batch[iter->end_sk++] = start_sk;
3055
3056 sk = sk_nulls_next(start_sk);
3057 sk_nulls_for_each_from(sk, node) {
3058 if (seq_sk_match(seq, sk)) {
3059 if (iter->end_sk < iter->max_sk) {
3060 sock_hold(sk);
3061 iter->batch[iter->end_sk++] = sk;
3062 }
3063 expected++;
3064 }
3065 }
3066 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3067
3068 return expected;
3069}
3070
3071static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3072{
3073 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3074 struct bpf_tcp_iter_state *iter = seq->private;
3075 struct tcp_iter_state *st = &iter->state;
3076 unsigned int expected;
3077 bool resized = false;
3078 struct sock *sk;
3079
3080
3081
3082
3083
3084
3085 if (iter->st_bucket_done) {
3086 st->offset = 0;
3087 st->bucket++;
3088 if (st->state == TCP_SEQ_STATE_LISTENING &&
3089 st->bucket > hinfo->lhash2_mask) {
3090 st->state = TCP_SEQ_STATE_ESTABLISHED;
3091 st->bucket = 0;
3092 }
3093 }
3094
3095again:
3096
3097 iter->cur_sk = 0;
3098 iter->end_sk = 0;
3099 iter->st_bucket_done = false;
3100
3101 sk = tcp_seek_last_pos(seq);
3102 if (!sk)
3103 return NULL;
3104
3105 if (st->state == TCP_SEQ_STATE_LISTENING)
3106 expected = bpf_iter_tcp_listening_batch(seq, sk);
3107 else
3108 expected = bpf_iter_tcp_established_batch(seq, sk);
3109
3110 if (iter->end_sk == expected) {
3111 iter->st_bucket_done = true;
3112 return sk;
3113 }
3114
3115 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3116 resized = true;
3117 goto again;
3118 }
3119
3120 return sk;
3121}
3122
3123static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3124{
3125
3126
3127
3128 if (*pos)
3129 return bpf_iter_tcp_batch(seq);
3130
3131 return SEQ_START_TOKEN;
3132}
3133
3134static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3135{
3136 struct bpf_tcp_iter_state *iter = seq->private;
3137 struct tcp_iter_state *st = &iter->state;
3138 struct sock *sk;
3139
3140
3141
3142
3143
3144 if (iter->cur_sk < iter->end_sk) {
3145
3146
3147
3148
3149 st->num++;
3150
3151
3152
3153
3154 st->offset++;
3155 sock_gen_put(iter->batch[iter->cur_sk++]);
3156 }
3157
3158 if (iter->cur_sk < iter->end_sk)
3159 sk = iter->batch[iter->cur_sk];
3160 else
3161 sk = bpf_iter_tcp_batch(seq);
3162
3163 ++*pos;
3164
3165
3166
3167 st->last_pos = *pos;
3168 return sk;
3169}
3170
3171static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3172{
3173 struct bpf_iter_meta meta;
3174 struct bpf_prog *prog;
3175 struct sock *sk = v;
3176 uid_t uid;
3177 int ret;
3178
3179 if (v == SEQ_START_TOKEN)
3180 return 0;
3181
3182 if (sk_fullsock(sk))
3183 lock_sock(sk);
3184
3185 if (unlikely(sk_unhashed(sk))) {
3186 ret = SEQ_SKIP;
3187 goto unlock;
3188 }
3189
3190 if (sk->sk_state == TCP_TIME_WAIT) {
3191 uid = 0;
3192 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3193 const struct request_sock *req = v;
3194
3195 uid = from_kuid_munged(seq_user_ns(seq),
3196 sock_i_uid(req->rsk_listener));
3197 } else {
3198 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3199 }
3200
3201 meta.seq = seq;
3202 prog = bpf_iter_get_info(&meta, false);
3203 ret = tcp_prog_seq_show(prog, &meta, v, uid);
3204
3205unlock:
3206 if (sk_fullsock(sk))
3207 release_sock(sk);
3208 return ret;
3209
3210}
3211
3212static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3213{
3214 struct bpf_tcp_iter_state *iter = seq->private;
3215 struct bpf_iter_meta meta;
3216 struct bpf_prog *prog;
3217
3218 if (!v) {
3219 meta.seq = seq;
3220 prog = bpf_iter_get_info(&meta, true);
3221 if (prog)
3222 (void)tcp_prog_seq_show(prog, &meta, v, 0);
3223 }
3224
3225 if (iter->cur_sk < iter->end_sk) {
3226 bpf_iter_tcp_put_batch(iter);
3227 iter->st_bucket_done = false;
3228 }
3229}
3230
/*
 * seq_file callbacks of the BPF TCP iterator.  seq_file_family() compares
 * seq->op against this table to recognise a BPF iterator seq_file.
 */
static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
3237#endif
/*
 * Return the address family the iterator behind @seq is restricted to:
 * AF_UNSPEC (no restriction) for the BPF TCP iterator, otherwise the
 * family stored in the tcp_seq_afinfo attached to the proc entry.
 */
static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* The BPF iterator is identified by its seq_operations table. */
	if (seq->op == &bpf_iter_tcp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Otherwise the afinfo is taken from the proc entry's private data. */
	afinfo = pde_data(file_inode(seq->file));
	return afinfo->family;
}
3252
/* seq_file callbacks registered for the per-netns "tcp" proc entry. */
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};
3259
/*
 * Private data of the "tcp" proc entry; seq_file_family() reads .family
 * from it, limiting that file to AF_INET sockets.
 */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
3263
3264static int __net_init tcp4_proc_init_net(struct net *net)
3265{
3266 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3267 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3268 return -ENOMEM;
3269 return 0;
3270}
3271
/* Per-netns exit: remove the "tcp" entry created by tcp4_proc_init_net(). */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
3276
/* Per-netns hooks that create and remove the "tcp" proc entry. */
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};
3281
/*
 * Register the per-netns proc hooks (tcp4_net_ops).
 * Returns the result of register_pernet_subsys().
 */
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}
3286
/* Undo tcp4_proc_init(): unregister the per-netns proc hooks. */
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
3291#endif
3292
3293
3294
3295
3296
3297bool tcp_stream_memory_free(const struct sock *sk, int wake)
3298{
3299 const struct tcp_sock *tp = tcp_sk(sk);
3300 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3301 READ_ONCE(tp->snd_nxt);
3302
3303 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3304}
3305EXPORT_SYMBOL(tcp_stream_memory_free);
3306
/*
 * Protocol descriptor for IPv4 TCP sockets: wires the generic socket
 * layer callbacks to their TCP implementations and points at the TCP-wide
 * accounting variables and sysctl tables.
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	/* connection lifecycle and socket calls */
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	/* data path */
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.splice_eof		= tcp_splice_eof,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	/* hashing and port management */
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	/* memory accounting */
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,

	.memory_allocated	= &tcp_memory_allocated,
	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,

	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	/* rmem/wmem limits are looked up per netns through these offsets */
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	/* object allocation */
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= NULL,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
3358
3359static void __net_exit tcp_sk_exit(struct net *net)
3360{
3361 if (net->ipv4.tcp_congestion_control)
3362 bpf_module_put(net->ipv4.tcp_congestion_control,
3363 net->ipv4.tcp_congestion_control->owner);
3364}
3365
3366static void __net_init tcp_set_hashinfo(struct net *net)
3367{
3368 struct inet_hashinfo *hinfo;
3369 unsigned int ehash_entries;
3370 struct net *old_net;
3371
3372 if (net_eq(net, &init_net))
3373 goto fallback;
3374
3375 old_net = current->nsproxy->net_ns;
3376 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3377 if (!ehash_entries)
3378 goto fallback;
3379
3380 ehash_entries = roundup_pow_of_two(ehash_entries);
3381 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3382 if (!hinfo) {
3383 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3384 "for a netns, fallback to the global one\n",
3385 ehash_entries);
3386fallback:
3387 hinfo = &tcp_hashinfo;
3388 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3389 }
3390
3391 net->ipv4.tcp_death_row.hashinfo = hinfo;
3392 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3393 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3394}
3395
3396static int __net_init tcp_sk_init(struct net *net)
3397{
3398 net->ipv4.sysctl_tcp_ecn = 2;
3399 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3400
3401 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3402 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3403 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3404 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3405 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3406
3407 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3408 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3409 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3410
3411 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3412 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3413 net->ipv4.sysctl_tcp_syncookies = 1;
3414 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3415 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3416 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3417 net->ipv4.sysctl_tcp_orphan_retries = 0;
3418 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3419 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3420 net->ipv4.sysctl_tcp_tw_reuse = 2;
3421 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3422
3423 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3424 tcp_set_hashinfo(net);
3425
3426 net->ipv4.sysctl_tcp_sack = 1;
3427 net->ipv4.sysctl_tcp_window_scaling = 1;
3428 net->ipv4.sysctl_tcp_timestamps = 1;
3429 net->ipv4.sysctl_tcp_early_retrans = 3;
3430 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3431 net->ipv4.sysctl_tcp_slow_start_after_idle = 1;
3432 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3433 net->ipv4.sysctl_tcp_max_reordering = 300;
3434 net->ipv4.sysctl_tcp_dsack = 1;
3435 net->ipv4.sysctl_tcp_app_win = 31;
3436 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3437 net->ipv4.sysctl_tcp_frto = 2;
3438 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3439
3440
3441
3442
3443 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3444
3445 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3446
3447
3448 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3449
3450 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3451 net->ipv4.sysctl_tcp_tso_rtt_log = 9;
3452 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3453 net->ipv4.sysctl_tcp_autocorking = 1;
3454 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3455 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3456 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3457 if (net != &init_net) {
3458 memcpy(net->ipv4.sysctl_tcp_rmem,
3459 init_net.ipv4.sysctl_tcp_rmem,
3460 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3461 memcpy(net->ipv4.sysctl_tcp_wmem,
3462 init_net.ipv4.sysctl_tcp_wmem,
3463 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3464 }
3465 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3466 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3467 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3468 net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3469 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3470 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3471 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3472
3473
3474 net->ipv4.sysctl_tcp_plb_enabled = 0;
3475 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3476 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3477 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3478
3479 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3480
3481
3482 if (!net_eq(net, &init_net) &&
3483 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3484 init_net.ipv4.tcp_congestion_control->owner))
3485 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3486 else
3487 net->ipv4.tcp_congestion_control = &tcp_reno;
3488
3489 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3490 net->ipv4.sysctl_tcp_shrink_window = 0;
3491
3492 net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3493
3494 return 0;
3495}
3496
3497static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3498{
3499 struct net *net;
3500
3501 tcp_twsk_purge(net_exit_list, AF_INET);
3502
3503 list_for_each_entry(net, net_exit_list, exit_list) {
3504 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3505 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3506 tcp_fastopen_ctx_destroy(net);
3507 }
3508}
3509
/* Per-netns TCP setup/teardown hooks, registered from tcp_v4_init(). */
static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init	   = tcp_sk_init,
       .exit	   = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};
3515
3516#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/*
 * Declare the "tcp" BPF iterator target; the parameter list matches the
 * fields of struct bpf_iter__tcp (meta, sk_common, uid).
 */
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)
3519
/* Number of batch slots allocated when an iterator is created. */
#define INIT_BATCH_SZ 16

/*
 * Set up the private data of a new BPF TCP iterator: initialise the
 * netns part, then allocate the initial socket batch.  On batch allocation
 * failure the netns part is torn down again.  Returns 0 or a negative
 * errno.
 */
static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_tcp_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
	if (err)
		bpf_iter_fini_seq_net(priv_data);

	return err;
}
3539
/*
 * Tear down the private data of a BPF TCP iterator: release the netns
 * part and free the batch array allocated by bpf_iter_init_tcp().
 */
static void bpf_iter_fini_tcp(void *priv_data)
{
	struct bpf_tcp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}
3547
/*
 * seq_file description of the BPF TCP iterator: callbacks, private data
 * constructor/destructor and the size of the private data.
 */
static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
};
3554
3555static const struct bpf_func_proto *
3556bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3557 const struct bpf_prog *prog)
3558{
3559 switch (func_id) {
3560 case BPF_FUNC_setsockopt:
3561 return &bpf_sk_setsockopt_proto;
3562 case BPF_FUNC_getsockopt:
3563 return &bpf_sk_getsockopt_proto;
3564 default:
3565 return NULL;
3566 }
3567}
3568
/*
 * Registration record of the "tcp" BPF iterator target.  The btf_id of
 * the single context argument is filled in by bpf_iter_register().
 */
static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		/* sk_common may be NULL and is otherwise a trusted pointer */
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};
3579
3580static void __init bpf_iter_register(void)
3581{
3582 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3583 if (bpf_iter_reg_target(&tcp_reg_info))
3584 pr_warn("Warning: could not register bpf iterator tcp\n");
3585}
3586
3587#endif
3588
3589void __init tcp_v4_init(void)
3590{
3591 int cpu, res;
3592
3593 for_each_possible_cpu(cpu) {
3594 struct sock *sk;
3595
3596 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3597 IPPROTO_TCP, &init_net);
3598 if (res)
3599 panic("Failed to create the TCP control socket.\n");
3600 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3601
3602
3603
3604
3605 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3606
3607 per_cpu(ipv4_tcp_sk, cpu) = sk;
3608 }
3609 if (register_pernet_subsys(&tcp_sk_ops))
3610 panic("Failed to create the TCP control socket.\n");
3611
3612#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3613 bpf_iter_register();
3614#endif
3615}
3616