1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48#define pr_fmt(fmt) "TCP: " fmt
49
50#include <linux/bottom_half.h>
51#include <linux/types.h>
52#include <linux/fcntl.h>
53#include <linux/module.h>
54#include <linux/random.h>
55#include <linux/cache.h>
56#include <linux/jhash.h>
57#include <linux/init.h>
58#include <linux/times.h>
59#include <linux/slab.h>
60
61#include <net/net_namespace.h>
62#include <net/icmp.h>
63#include <net/inet_hashtables.h>
64#include <net/tcp.h>
65#include <net/transp_v6.h>
66#include <net/ipv6.h>
67#include <net/inet_common.h>
68#include <net/timewait_sock.h>
69#include <net/xfrm.h>
70#include <net/secure_seq.h>
71#include <net/busy_poll.h>
72
73#include <linux/inet.h>
74#include <linux/ipv6.h>
75#include <linux/stddef.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
78#include <linux/inetdevice.h>
79#include <linux/btf_ids.h>
80
81#include <crypto/hash.h>
82#include <linux/scatterlist.h>
83
84#include <trace/events/tcp.h>
85
86#ifdef CONFIG_TCP_MD5SIG
87static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
89#endif
90
91struct inet_hashinfo tcp_hashinfo;
92EXPORT_SYMBOL(tcp_hashinfo);
93
94static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95
96static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97{
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
99 ip_hdr(skb)->saddr,
100 tcp_hdr(skb)->dest,
101 tcp_hdr(skb)->source);
102}
103
104static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105{
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107}
108
109int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110{
111 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk);
115
116 if (reuse == 2) {
117
118
119
120
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123 loopback = true;
124#if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130 loopback = true;
131 } else
132#endif
133 {
134 if (ipv4_is_loopback(tw->tw_daddr) ||
135 ipv4_is_loopback(tw->tw_rcv_saddr))
136 loopback = true;
137 }
138 if (!loopback)
139 reuse = 0;
140 }
141
142
143
144
145
146
147
148
149
150
151
152
153 if (tcptw->tw_ts_recent_stamp &&
154 (!twp || (reuse && time_after32(ktime_get_seconds(),
155 tcptw->tw_ts_recent_stamp)))) {
156
157
158
159
160
161
162
163
164
165
166
167 if (likely(!tp->repair)) {
168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169
170 if (!seq)
171 seq = 1;
172 WRITE_ONCE(tp->write_seq, seq);
173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175 }
176 sock_hold(sktw);
177 return 1;
178 }
179
180 return 0;
181}
182EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183
184static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 int addr_len)
186{
187
188
189
190
191 if (addr_len < sizeof(struct sockaddr_in))
192 return -EINVAL;
193
194 sock_owned_by_me(sk);
195
196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197}
198
199
200int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201{
202 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 struct inet_sock *inet = inet_sk(sk);
204 struct tcp_sock *tp = tcp_sk(sk);
205 __be16 orig_sport, orig_dport;
206 __be32 daddr, nexthop;
207 struct flowi4 *fl4;
208 struct rtable *rt;
209 int err;
210 struct ip_options_rcu *inet_opt;
211 struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
212
213 if (addr_len < sizeof(struct sockaddr_in))
214 return -EINVAL;
215
216 if (usin->sin_family != AF_INET)
217 return -EAFNOSUPPORT;
218
219 nexthop = daddr = usin->sin_addr.s_addr;
220 inet_opt = rcu_dereference_protected(inet->inet_opt,
221 lockdep_sock_is_held(sk));
222 if (inet_opt && inet_opt->opt.srr) {
223 if (!daddr)
224 return -EINVAL;
225 nexthop = inet_opt->opt.faddr;
226 }
227
228 orig_sport = inet->inet_sport;
229 orig_dport = usin->sin_port;
230 fl4 = &inet->cork.fl.u.ip4;
231 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
233 orig_dport, sk);
234 if (IS_ERR(rt)) {
235 err = PTR_ERR(rt);
236 if (err == -ENETUNREACH)
237 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
238 return err;
239 }
240
241 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
242 ip_rt_put(rt);
243 return -ENETUNREACH;
244 }
245
246 if (!inet_opt || !inet_opt->opt.srr)
247 daddr = fl4->daddr;
248
249 if (!inet->inet_saddr)
250 inet->inet_saddr = fl4->saddr;
251 sk_rcv_saddr_set(sk, inet->inet_saddr);
252
253 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
254
255 tp->rx_opt.ts_recent = 0;
256 tp->rx_opt.ts_recent_stamp = 0;
257 if (likely(!tp->repair))
258 WRITE_ONCE(tp->write_seq, 0);
259 }
260
261 inet->inet_dport = usin->sin_port;
262 sk_daddr_set(sk, daddr);
263
264 inet_csk(sk)->icsk_ext_hdr_len = 0;
265 if (inet_opt)
266 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
267
268 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
269
270
271
272
273
274
275 tcp_set_state(sk, TCP_SYN_SENT);
276 err = inet_hash_connect(tcp_death_row, sk);
277 if (err)
278 goto failure;
279
280 sk_set_txhash(sk);
281
282 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
283 inet->inet_sport, inet->inet_dport, sk);
284 if (IS_ERR(rt)) {
285 err = PTR_ERR(rt);
286 rt = NULL;
287 goto failure;
288 }
289
290 sk->sk_gso_type = SKB_GSO_TCPV4;
291 sk_setup_caps(sk, &rt->dst);
292 rt = NULL;
293
294 if (likely(!tp->repair)) {
295 if (!tp->write_seq)
296 WRITE_ONCE(tp->write_seq,
297 secure_tcp_seq(inet->inet_saddr,
298 inet->inet_daddr,
299 inet->inet_sport,
300 usin->sin_port));
301 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
302 inet->inet_saddr,
303 inet->inet_daddr);
304 }
305
306 inet->inet_id = prandom_u32();
307
308 if (tcp_fastopen_defer_connect(sk, &err))
309 return err;
310 if (err)
311 goto failure;
312
313 err = tcp_connect(sk);
314
315 if (err)
316 goto failure;
317
318 return 0;
319
320failure:
321
322
323
324
325 tcp_set_state(sk, TCP_CLOSE);
326 ip_rt_put(rt);
327 sk->sk_route_caps = 0;
328 inet->inet_dport = 0;
329 return err;
330}
331EXPORT_SYMBOL(tcp_v4_connect);
332
333
334
335
336
337
338void tcp_v4_mtu_reduced(struct sock *sk)
339{
340 struct inet_sock *inet = inet_sk(sk);
341 struct dst_entry *dst;
342 u32 mtu;
343
344 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
345 return;
346 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
347 dst = inet_csk_update_pmtu(sk, mtu);
348 if (!dst)
349 return;
350
351
352
353
354 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
355 sk->sk_err_soft = EMSGSIZE;
356
357 mtu = dst_mtu(dst);
358
359 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
360 ip_sk_accept_pmtu(sk) &&
361 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
362 tcp_sync_mss(sk, mtu);
363
364
365
366
367
368
369 tcp_simple_retransmit(sk);
370 }
371}
372EXPORT_SYMBOL(tcp_v4_mtu_reduced);
373
374static void do_redirect(struct sk_buff *skb, struct sock *sk)
375{
376 struct dst_entry *dst = __sk_dst_check(sk, 0);
377
378 if (dst)
379 dst->ops->redirect(dst, sk, skb);
380}
381
382
383
384void tcp_req_err(struct sock *sk, u32 seq, bool abort)
385{
386 struct request_sock *req = inet_reqsk(sk);
387 struct net *net = sock_net(sk);
388
389
390
391
392 if (seq != tcp_rsk(req)->snt_isn) {
393 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
394 } else if (abort) {
395
396
397
398
399
400
401 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
402 tcp_listendrop(req->rsk_listener);
403 }
404 reqsk_put(req);
405}
406EXPORT_SYMBOL(tcp_req_err);
407
408
409void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
410{
411 struct inet_connection_sock *icsk = inet_csk(sk);
412 struct tcp_sock *tp = tcp_sk(sk);
413 struct sk_buff *skb;
414 s32 remaining;
415 u32 delta_us;
416
417 if (sock_owned_by_user(sk))
418 return;
419
420 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
421 !icsk->icsk_backoff)
422 return;
423
424 skb = tcp_rtx_queue_head(sk);
425 if (WARN_ON_ONCE(!skb))
426 return;
427
428 icsk->icsk_backoff--;
429 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
430 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
431
432 tcp_mstamp_refresh(tp);
433 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
434 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
435
436 if (remaining > 0) {
437 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
438 remaining, TCP_RTO_MAX);
439 } else {
440
441
442
443 tcp_retransmit_timer(sk);
444 }
445}
446EXPORT_SYMBOL(tcp_ld_RTO_revert);
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464int tcp_v4_err(struct sk_buff *skb, u32 info)
465{
466 const struct iphdr *iph = (const struct iphdr *)skb->data;
467 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
468 struct tcp_sock *tp;
469 struct inet_sock *inet;
470 const int type = icmp_hdr(skb)->type;
471 const int code = icmp_hdr(skb)->code;
472 struct sock *sk;
473 struct request_sock *fastopen;
474 u32 seq, snd_una;
475 int err;
476 struct net *net = dev_net(skb->dev);
477
478 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
479 th->dest, iph->saddr, ntohs(th->source),
480 inet_iif(skb), 0);
481 if (!sk) {
482 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
483 return -ENOENT;
484 }
485 if (sk->sk_state == TCP_TIME_WAIT) {
486 inet_twsk_put(inet_twsk(sk));
487 return 0;
488 }
489 seq = ntohl(th->seq);
490 if (sk->sk_state == TCP_NEW_SYN_RECV) {
491 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
492 type == ICMP_TIME_EXCEEDED ||
493 (type == ICMP_DEST_UNREACH &&
494 (code == ICMP_NET_UNREACH ||
495 code == ICMP_HOST_UNREACH)));
496 return 0;
497 }
498
499 bh_lock_sock(sk);
500
501
502
503
504
505 if (sock_owned_by_user(sk)) {
506 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
507 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
508 }
509 if (sk->sk_state == TCP_CLOSE)
510 goto out;
511
512 if (static_branch_unlikely(&ip4_min_ttl)) {
513
514 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
515 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
516 goto out;
517 }
518 }
519
520 tp = tcp_sk(sk);
521
522 fastopen = rcu_dereference(tp->fastopen_rsk);
523 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
524 if (sk->sk_state != TCP_LISTEN &&
525 !between(seq, snd_una, tp->snd_nxt)) {
526 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
527 goto out;
528 }
529
530 switch (type) {
531 case ICMP_REDIRECT:
532 if (!sock_owned_by_user(sk))
533 do_redirect(skb, sk);
534 goto out;
535 case ICMP_SOURCE_QUENCH:
536
537 goto out;
538 case ICMP_PARAMETERPROB:
539 err = EPROTO;
540 break;
541 case ICMP_DEST_UNREACH:
542 if (code > NR_ICMP_UNREACH)
543 goto out;
544
545 if (code == ICMP_FRAG_NEEDED) {
546
547
548
549
550 if (sk->sk_state == TCP_LISTEN)
551 goto out;
552
553 WRITE_ONCE(tp->mtu_info, info);
554 if (!sock_owned_by_user(sk)) {
555 tcp_v4_mtu_reduced(sk);
556 } else {
557 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
558 sock_hold(sk);
559 }
560 goto out;
561 }
562
563 err = icmp_err_convert[code].errno;
564
565
566
567 if (!fastopen &&
568 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
569 tcp_ld_RTO_revert(sk, seq);
570 break;
571 case ICMP_TIME_EXCEEDED:
572 err = EHOSTUNREACH;
573 break;
574 default:
575 goto out;
576 }
577
578 switch (sk->sk_state) {
579 case TCP_SYN_SENT:
580 case TCP_SYN_RECV:
581
582
583
584 if (fastopen && !fastopen->sk)
585 break;
586
587 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
588
589 if (!sock_owned_by_user(sk)) {
590 sk->sk_err = err;
591
592 sk_error_report(sk);
593
594 tcp_done(sk);
595 } else {
596 sk->sk_err_soft = err;
597 }
598 goto out;
599 }
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617 inet = inet_sk(sk);
618 if (!sock_owned_by_user(sk) && inet->recverr) {
619 sk->sk_err = err;
620 sk_error_report(sk);
621 } else {
622 sk->sk_err_soft = err;
623 }
624
625out:
626 bh_unlock_sock(sk);
627 sock_put(sk);
628 return 0;
629}
630
631void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
632{
633 struct tcphdr *th = tcp_hdr(skb);
634
635 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
636 skb->csum_start = skb_transport_header(skb) - skb->head;
637 skb->csum_offset = offsetof(struct tcphdr, check);
638}
639
640
641void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
642{
643 const struct inet_sock *inet = inet_sk(sk);
644
645 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
646}
647EXPORT_SYMBOL(tcp_v4_send_check);
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662#ifdef CONFIG_TCP_MD5SIG
663#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
664#else
665#define OPTION_BYTES sizeof(__be32)
666#endif
667
668static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
669{
670 const struct tcphdr *th = tcp_hdr(skb);
671 struct {
672 struct tcphdr th;
673 __be32 opt[OPTION_BYTES / sizeof(__be32)];
674 } rep;
675 struct ip_reply_arg arg;
676#ifdef CONFIG_TCP_MD5SIG
677 struct tcp_md5sig_key *key = NULL;
678 const __u8 *hash_location = NULL;
679 unsigned char newhash[16];
680 int genhash;
681 struct sock *sk1 = NULL;
682#endif
683 u64 transmit_time = 0;
684 struct sock *ctl_sk;
685 struct net *net;
686
687
688 if (th->rst)
689 return;
690
691
692
693
694 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
695 return;
696
697
698 memset(&rep, 0, sizeof(rep));
699 rep.th.dest = th->source;
700 rep.th.source = th->dest;
701 rep.th.doff = sizeof(struct tcphdr) / 4;
702 rep.th.rst = 1;
703
704 if (th->ack) {
705 rep.th.seq = th->ack_seq;
706 } else {
707 rep.th.ack = 1;
708 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
709 skb->len - (th->doff << 2));
710 }
711
712 memset(&arg, 0, sizeof(arg));
713 arg.iov[0].iov_base = (unsigned char *)&rep;
714 arg.iov[0].iov_len = sizeof(rep.th);
715
716 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
717#ifdef CONFIG_TCP_MD5SIG
718 rcu_read_lock();
719 hash_location = tcp_parse_md5sig_option(th);
720 if (sk && sk_fullsock(sk)) {
721 const union tcp_md5_addr *addr;
722 int l3index;
723
724
725
726
727 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
728 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
729 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
730 } else if (hash_location) {
731 const union tcp_md5_addr *addr;
732 int sdif = tcp_v4_sdif(skb);
733 int dif = inet_iif(skb);
734 int l3index;
735
736
737
738
739
740
741
742
743 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
744 ip_hdr(skb)->saddr,
745 th->source, ip_hdr(skb)->daddr,
746 ntohs(th->source), dif, sdif);
747
748 if (!sk1)
749 goto out;
750
751
752
753
754 l3index = sdif ? dif : 0;
755 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
756 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
757 if (!key)
758 goto out;
759
760
761 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
762 if (genhash || memcmp(hash_location, newhash, 16) != 0)
763 goto out;
764
765 }
766
767 if (key) {
768 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
769 (TCPOPT_NOP << 16) |
770 (TCPOPT_MD5SIG << 8) |
771 TCPOLEN_MD5SIG);
772
773 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
774 rep.th.doff = arg.iov[0].iov_len / 4;
775
776 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
777 key, ip_hdr(skb)->saddr,
778 ip_hdr(skb)->daddr, &rep.th);
779 }
780#endif
781
782 if (rep.opt[0] == 0) {
783 __be32 mrst = mptcp_reset_option(skb);
784
785 if (mrst) {
786 rep.opt[0] = mrst;
787 arg.iov[0].iov_len += sizeof(mrst);
788 rep.th.doff = arg.iov[0].iov_len / 4;
789 }
790 }
791
792 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
793 ip_hdr(skb)->saddr,
794 arg.iov[0].iov_len, IPPROTO_TCP, 0);
795 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
796 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
797
798
799
800
801
802 if (sk) {
803 arg.bound_dev_if = sk->sk_bound_dev_if;
804 if (sk_fullsock(sk))
805 trace_tcp_send_reset(sk, skb);
806 }
807
808 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
809 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
810
811 arg.tos = ip_hdr(skb)->tos;
812 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
813 local_bh_disable();
814 ctl_sk = this_cpu_read(ipv4_tcp_sk);
815 sock_net_set(ctl_sk, net);
816 if (sk) {
817 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
818 inet_twsk(sk)->tw_mark : sk->sk_mark;
819 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
820 inet_twsk(sk)->tw_priority : sk->sk_priority;
821 transmit_time = tcp_transmit_time(sk);
822 xfrm_sk_clone_policy(ctl_sk, sk);
823 }
824 ip_send_unicast_reply(ctl_sk,
825 skb, &TCP_SKB_CB(skb)->header.h4.opt,
826 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
827 &arg, arg.iov[0].iov_len,
828 transmit_time);
829
830 ctl_sk->sk_mark = 0;
831 xfrm_sk_free_policy(ctl_sk);
832 sock_net_set(ctl_sk, &init_net);
833 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
834 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
835 local_bh_enable();
836
837#ifdef CONFIG_TCP_MD5SIG
838out:
839 rcu_read_unlock();
840#endif
841}
842
843
844
845
846
847static void tcp_v4_send_ack(const struct sock *sk,
848 struct sk_buff *skb, u32 seq, u32 ack,
849 u32 win, u32 tsval, u32 tsecr, int oif,
850 struct tcp_md5sig_key *key,
851 int reply_flags, u8 tos)
852{
853 const struct tcphdr *th = tcp_hdr(skb);
854 struct {
855 struct tcphdr th;
856 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
857#ifdef CONFIG_TCP_MD5SIG
858 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
859#endif
860 ];
861 } rep;
862 struct net *net = sock_net(sk);
863 struct ip_reply_arg arg;
864 struct sock *ctl_sk;
865 u64 transmit_time;
866
867 memset(&rep.th, 0, sizeof(struct tcphdr));
868 memset(&arg, 0, sizeof(arg));
869
870 arg.iov[0].iov_base = (unsigned char *)&rep;
871 arg.iov[0].iov_len = sizeof(rep.th);
872 if (tsecr) {
873 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
874 (TCPOPT_TIMESTAMP << 8) |
875 TCPOLEN_TIMESTAMP);
876 rep.opt[1] = htonl(tsval);
877 rep.opt[2] = htonl(tsecr);
878 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
879 }
880
881
882 rep.th.dest = th->source;
883 rep.th.source = th->dest;
884 rep.th.doff = arg.iov[0].iov_len / 4;
885 rep.th.seq = htonl(seq);
886 rep.th.ack_seq = htonl(ack);
887 rep.th.ack = 1;
888 rep.th.window = htons(win);
889
890#ifdef CONFIG_TCP_MD5SIG
891 if (key) {
892 int offset = (tsecr) ? 3 : 0;
893
894 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
895 (TCPOPT_NOP << 16) |
896 (TCPOPT_MD5SIG << 8) |
897 TCPOLEN_MD5SIG);
898 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
899 rep.th.doff = arg.iov[0].iov_len/4;
900
901 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
902 key, ip_hdr(skb)->saddr,
903 ip_hdr(skb)->daddr, &rep.th);
904 }
905#endif
906 arg.flags = reply_flags;
907 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
908 ip_hdr(skb)->saddr,
909 arg.iov[0].iov_len, IPPROTO_TCP, 0);
910 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
911 if (oif)
912 arg.bound_dev_if = oif;
913 arg.tos = tos;
914 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
915 local_bh_disable();
916 ctl_sk = this_cpu_read(ipv4_tcp_sk);
917 sock_net_set(ctl_sk, net);
918 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
919 inet_twsk(sk)->tw_mark : sk->sk_mark;
920 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
921 inet_twsk(sk)->tw_priority : sk->sk_priority;
922 transmit_time = tcp_transmit_time(sk);
923 ip_send_unicast_reply(ctl_sk,
924 skb, &TCP_SKB_CB(skb)->header.h4.opt,
925 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
926 &arg, arg.iov[0].iov_len,
927 transmit_time);
928
929 ctl_sk->sk_mark = 0;
930 sock_net_set(ctl_sk, &init_net);
931 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
932 local_bh_enable();
933}
934
935static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
936{
937 struct inet_timewait_sock *tw = inet_twsk(sk);
938 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
939
940 tcp_v4_send_ack(sk, skb,
941 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
942 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
943 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
944 tcptw->tw_ts_recent,
945 tw->tw_bound_dev_if,
946 tcp_twsk_md5_key(tcptw),
947 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
948 tw->tw_tos
949 );
950
951 inet_twsk_put(tw);
952}
953
954static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
955 struct request_sock *req)
956{
957 const union tcp_md5_addr *addr;
958 int l3index;
959
960
961
962
963 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
964 tcp_sk(sk)->snd_nxt;
965
966
967
968
969
970
971 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
972 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
973 tcp_v4_send_ack(sk, skb, seq,
974 tcp_rsk(req)->rcv_nxt,
975 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
976 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
977 req->ts_recent,
978 0,
979 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
980 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
981 ip_hdr(skb)->tos);
982}
983
984
985
986
987
988
989static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
990 struct flowi *fl,
991 struct request_sock *req,
992 struct tcp_fastopen_cookie *foc,
993 enum tcp_synack_type synack_type,
994 struct sk_buff *syn_skb)
995{
996 const struct inet_request_sock *ireq = inet_rsk(req);
997 struct flowi4 fl4;
998 int err = -1;
999 struct sk_buff *skb;
1000 u8 tos;
1001
1002
1003 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1004 return -1;
1005
1006 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1007
1008 if (skb) {
1009 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1010
1011 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1012 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1013 (inet_sk(sk)->tos & INET_ECN_MASK) :
1014 inet_sk(sk)->tos;
1015
1016 if (!INET_ECN_is_capable(tos) &&
1017 tcp_bpf_ca_needs_ecn((struct sock *)req))
1018 tos |= INET_ECN_ECT_0;
1019
1020 rcu_read_lock();
1021 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1022 ireq->ir_rmt_addr,
1023 rcu_dereference(ireq->ireq_opt),
1024 tos);
1025 rcu_read_unlock();
1026 err = net_xmit_eval(err);
1027 }
1028
1029 return err;
1030}
1031
1032
1033
1034
1035static void tcp_v4_reqsk_destructor(struct request_sock *req)
1036{
1037 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1038}
1039
1040#ifdef CONFIG_TCP_MD5SIG
1041
1042
1043
1044
1045
1046
1047DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1048EXPORT_SYMBOL(tcp_md5_needed);
1049
1050static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1051{
1052 if (!old)
1053 return true;
1054
1055
1056 if (old->l3index && new->l3index == 0)
1057 return false;
1058 if (old->l3index == 0 && new->l3index)
1059 return true;
1060
1061 return old->prefixlen < new->prefixlen;
1062}
1063
1064
1065struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1066 const union tcp_md5_addr *addr,
1067 int family)
1068{
1069 const struct tcp_sock *tp = tcp_sk(sk);
1070 struct tcp_md5sig_key *key;
1071 const struct tcp_md5sig_info *md5sig;
1072 __be32 mask;
1073 struct tcp_md5sig_key *best_match = NULL;
1074 bool match;
1075
1076
1077 md5sig = rcu_dereference_check(tp->md5sig_info,
1078 lockdep_sock_is_held(sk));
1079 if (!md5sig)
1080 return NULL;
1081
1082 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1083 lockdep_sock_is_held(sk)) {
1084 if (key->family != family)
1085 continue;
1086 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1087 continue;
1088 if (family == AF_INET) {
1089 mask = inet_make_mask(key->prefixlen);
1090 match = (key->addr.a4.s_addr & mask) ==
1091 (addr->a4.s_addr & mask);
1092#if IS_ENABLED(CONFIG_IPV6)
1093 } else if (family == AF_INET6) {
1094 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1095 key->prefixlen);
1096#endif
1097 } else {
1098 match = false;
1099 }
1100
1101 if (match && better_md5_match(best_match, key))
1102 best_match = key;
1103 }
1104 return best_match;
1105}
1106EXPORT_SYMBOL(__tcp_md5_do_lookup);
1107
1108static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1109 const union tcp_md5_addr *addr,
1110 int family, u8 prefixlen,
1111 int l3index, u8 flags)
1112{
1113 const struct tcp_sock *tp = tcp_sk(sk);
1114 struct tcp_md5sig_key *key;
1115 unsigned int size = sizeof(struct in_addr);
1116 const struct tcp_md5sig_info *md5sig;
1117
1118
1119 md5sig = rcu_dereference_check(tp->md5sig_info,
1120 lockdep_sock_is_held(sk));
1121 if (!md5sig)
1122 return NULL;
1123#if IS_ENABLED(CONFIG_IPV6)
1124 if (family == AF_INET6)
1125 size = sizeof(struct in6_addr);
1126#endif
1127 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1128 lockdep_sock_is_held(sk)) {
1129 if (key->family != family)
1130 continue;
1131 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1132 continue;
1133 if (key->l3index != l3index)
1134 continue;
1135 if (!memcmp(&key->addr, addr, size) &&
1136 key->prefixlen == prefixlen)
1137 return key;
1138 }
1139 return NULL;
1140}
1141
1142struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1143 const struct sock *addr_sk)
1144{
1145 const union tcp_md5_addr *addr;
1146 int l3index;
1147
1148 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1149 addr_sk->sk_bound_dev_if);
1150 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1151 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1152}
1153EXPORT_SYMBOL(tcp_v4_md5_lookup);
1154
1155
1156int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1157 int family, u8 prefixlen, int l3index, u8 flags,
1158 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1159{
1160
1161 struct tcp_md5sig_key *key;
1162 struct tcp_sock *tp = tcp_sk(sk);
1163 struct tcp_md5sig_info *md5sig;
1164
1165 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1166 if (key) {
1167
1168
1169
1170
1171
1172
1173 data_race(memcpy(key->key, newkey, newkeylen));
1174
1175
1176
1177
1178
1179
1180 WRITE_ONCE(key->keylen, newkeylen);
1181
1182 return 0;
1183 }
1184
1185 md5sig = rcu_dereference_protected(tp->md5sig_info,
1186 lockdep_sock_is_held(sk));
1187 if (!md5sig) {
1188 md5sig = kmalloc(sizeof(*md5sig), gfp);
1189 if (!md5sig)
1190 return -ENOMEM;
1191
1192 sk_gso_disable(sk);
1193 INIT_HLIST_HEAD(&md5sig->head);
1194 rcu_assign_pointer(tp->md5sig_info, md5sig);
1195 }
1196
1197 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1198 if (!key)
1199 return -ENOMEM;
1200 if (!tcp_alloc_md5sig_pool()) {
1201 sock_kfree_s(sk, key, sizeof(*key));
1202 return -ENOMEM;
1203 }
1204
1205 memcpy(key->key, newkey, newkeylen);
1206 key->keylen = newkeylen;
1207 key->family = family;
1208 key->prefixlen = prefixlen;
1209 key->l3index = l3index;
1210 key->flags = flags;
1211 memcpy(&key->addr, addr,
1212 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1213 sizeof(struct in_addr));
1214 hlist_add_head_rcu(&key->node, &md5sig->head);
1215 return 0;
1216}
1217EXPORT_SYMBOL(tcp_md5_do_add);
1218
1219int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1220 u8 prefixlen, int l3index, u8 flags)
1221{
1222 struct tcp_md5sig_key *key;
1223
1224 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1225 if (!key)
1226 return -ENOENT;
1227 hlist_del_rcu(&key->node);
1228 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1229 kfree_rcu(key, rcu);
1230 return 0;
1231}
1232EXPORT_SYMBOL(tcp_md5_do_del);
1233
1234static void tcp_clear_md5_list(struct sock *sk)
1235{
1236 struct tcp_sock *tp = tcp_sk(sk);
1237 struct tcp_md5sig_key *key;
1238 struct hlist_node *n;
1239 struct tcp_md5sig_info *md5sig;
1240
1241 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1242
1243 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1244 hlist_del_rcu(&key->node);
1245 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1246 kfree_rcu(key, rcu);
1247 }
1248}
1249
1250static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1251 sockptr_t optval, int optlen)
1252{
1253 struct tcp_md5sig cmd;
1254 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1255 const union tcp_md5_addr *addr;
1256 u8 prefixlen = 32;
1257 int l3index = 0;
1258 u8 flags;
1259
1260 if (optlen < sizeof(cmd))
1261 return -EINVAL;
1262
1263 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1264 return -EFAULT;
1265
1266 if (sin->sin_family != AF_INET)
1267 return -EINVAL;
1268
1269 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1270
1271 if (optname == TCP_MD5SIG_EXT &&
1272 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1273 prefixlen = cmd.tcpm_prefixlen;
1274 if (prefixlen > 32)
1275 return -EINVAL;
1276 }
1277
1278 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1279 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1280 struct net_device *dev;
1281
1282 rcu_read_lock();
1283 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1284 if (dev && netif_is_l3_master(dev))
1285 l3index = dev->ifindex;
1286
1287 rcu_read_unlock();
1288
1289
1290
1291
1292 if (!dev || !l3index)
1293 return -EINVAL;
1294 }
1295
1296 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1297
1298 if (!cmd.tcpm_keylen)
1299 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1300
1301 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1302 return -EINVAL;
1303
1304 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1305 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1306}
1307
1308static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1309 __be32 daddr, __be32 saddr,
1310 const struct tcphdr *th, int nbytes)
1311{
1312 struct tcp4_pseudohdr *bp;
1313 struct scatterlist sg;
1314 struct tcphdr *_th;
1315
1316 bp = hp->scratch;
1317 bp->saddr = saddr;
1318 bp->daddr = daddr;
1319 bp->pad = 0;
1320 bp->protocol = IPPROTO_TCP;
1321 bp->len = cpu_to_be16(nbytes);
1322
1323 _th = (struct tcphdr *)(bp + 1);
1324 memcpy(_th, th, sizeof(*th));
1325 _th->check = 0;
1326
1327 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1328 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1329 sizeof(*bp) + sizeof(*th));
1330 return crypto_ahash_update(hp->md5_req);
1331}
1332
1333static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1334 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1335{
1336 struct tcp_md5sig_pool *hp;
1337 struct ahash_request *req;
1338
1339 hp = tcp_get_md5sig_pool();
1340 if (!hp)
1341 goto clear_hash_noput;
1342 req = hp->md5_req;
1343
1344 if (crypto_ahash_init(req))
1345 goto clear_hash;
1346 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1347 goto clear_hash;
1348 if (tcp_md5_hash_key(hp, key))
1349 goto clear_hash;
1350 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1351 if (crypto_ahash_final(req))
1352 goto clear_hash;
1353
1354 tcp_put_md5sig_pool();
1355 return 0;
1356
1357clear_hash:
1358 tcp_put_md5sig_pool();
1359clear_hash_noput:
1360 memset(md5_hash, 0, 16);
1361 return 1;
1362}
1363
1364int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1365 const struct sock *sk,
1366 const struct sk_buff *skb)
1367{
1368 struct tcp_md5sig_pool *hp;
1369 struct ahash_request *req;
1370 const struct tcphdr *th = tcp_hdr(skb);
1371 __be32 saddr, daddr;
1372
1373 if (sk) {
1374 saddr = sk->sk_rcv_saddr;
1375 daddr = sk->sk_daddr;
1376 } else {
1377 const struct iphdr *iph = ip_hdr(skb);
1378 saddr = iph->saddr;
1379 daddr = iph->daddr;
1380 }
1381
1382 hp = tcp_get_md5sig_pool();
1383 if (!hp)
1384 goto clear_hash_noput;
1385 req = hp->md5_req;
1386
1387 if (crypto_ahash_init(req))
1388 goto clear_hash;
1389
1390 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1391 goto clear_hash;
1392 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1393 goto clear_hash;
1394 if (tcp_md5_hash_key(hp, key))
1395 goto clear_hash;
1396 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1397 if (crypto_ahash_final(req))
1398 goto clear_hash;
1399
1400 tcp_put_md5sig_pool();
1401 return 0;
1402
1403clear_hash:
1404 tcp_put_md5sig_pool();
1405clear_hash_noput:
1406 memset(md5_hash, 0, 16);
1407 return 1;
1408}
1409EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1410
1411#endif
1412
1413static void tcp_v4_init_req(struct request_sock *req,
1414 const struct sock *sk_listener,
1415 struct sk_buff *skb)
1416{
1417 struct inet_request_sock *ireq = inet_rsk(req);
1418 struct net *net = sock_net(sk_listener);
1419
1420 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1421 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1422 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1423}
1424
1425static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1426 struct sk_buff *skb,
1427 struct flowi *fl,
1428 struct request_sock *req)
1429{
1430 tcp_v4_init_req(req, sk, skb);
1431
1432 if (security_inet_conn_request(sk, skb, req))
1433 return NULL;
1434
1435 return inet_csk_route_req(sk, &fl->u.ip4, req);
1436}
1437
1438struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1439 .family = PF_INET,
1440 .obj_size = sizeof(struct tcp_request_sock),
1441 .rtx_syn_ack = tcp_rtx_synack,
1442 .send_ack = tcp_v4_reqsk_send_ack,
1443 .destructor = tcp_v4_reqsk_destructor,
1444 .send_reset = tcp_v4_send_reset,
1445 .syn_ack_timeout = tcp_syn_ack_timeout,
1446};
1447
1448const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1449 .mss_clamp = TCP_MSS_DEFAULT,
1450#ifdef CONFIG_TCP_MD5SIG
1451 .req_md5_lookup = tcp_v4_md5_lookup,
1452 .calc_md5_hash = tcp_v4_md5_hash_skb,
1453#endif
1454#ifdef CONFIG_SYN_COOKIES
1455 .cookie_init_seq = cookie_v4_init_sequence,
1456#endif
1457 .route_req = tcp_v4_route_req,
1458 .init_seq = tcp_v4_init_seq,
1459 .init_ts_off = tcp_v4_init_ts_off,
1460 .send_synack = tcp_v4_send_synack,
1461};
1462
1463int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1464{
1465
1466 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1467 goto drop;
1468
1469 return tcp_conn_request(&tcp_request_sock_ops,
1470 &tcp_request_sock_ipv4_ops, sk, skb);
1471
1472drop:
1473 tcp_listendrop(sk);
1474 return 0;
1475}
1476EXPORT_SYMBOL(tcp_v4_conn_request);
1477
1478
1479
1480
1481
1482
1483struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1484 struct request_sock *req,
1485 struct dst_entry *dst,
1486 struct request_sock *req_unhash,
1487 bool *own_req)
1488{
1489 struct inet_request_sock *ireq;
1490 bool found_dup_sk = false;
1491 struct inet_sock *newinet;
1492 struct tcp_sock *newtp;
1493 struct sock *newsk;
1494#ifdef CONFIG_TCP_MD5SIG
1495 const union tcp_md5_addr *addr;
1496 struct tcp_md5sig_key *key;
1497 int l3index;
1498#endif
1499 struct ip_options_rcu *inet_opt;
1500
1501 if (sk_acceptq_is_full(sk))
1502 goto exit_overflow;
1503
1504 newsk = tcp_create_openreq_child(sk, req, skb);
1505 if (!newsk)
1506 goto exit_nonewsk;
1507
1508 newsk->sk_gso_type = SKB_GSO_TCPV4;
1509 inet_sk_rx_dst_set(newsk, skb);
1510
1511 newtp = tcp_sk(newsk);
1512 newinet = inet_sk(newsk);
1513 ireq = inet_rsk(req);
1514 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1515 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1516 newsk->sk_bound_dev_if = ireq->ir_iif;
1517 newinet->inet_saddr = ireq->ir_loc_addr;
1518 inet_opt = rcu_dereference(ireq->ireq_opt);
1519 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1520 newinet->mc_index = inet_iif(skb);
1521 newinet->mc_ttl = ip_hdr(skb)->ttl;
1522 newinet->rcv_tos = ip_hdr(skb)->tos;
1523 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1524 if (inet_opt)
1525 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1526 newinet->inet_id = prandom_u32();
1527
1528
1529
1530
1531 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1532 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1533
1534 if (!dst) {
1535 dst = inet_csk_route_child_sock(sk, newsk, req);
1536 if (!dst)
1537 goto put_and_exit;
1538 } else {
1539
1540 }
1541 sk_setup_caps(newsk, dst);
1542
1543 tcp_ca_openreq_child(newsk, dst);
1544
1545 tcp_sync_mss(newsk, dst_mtu(dst));
1546 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1547
1548 tcp_initialize_rcv_mss(newsk);
1549
1550#ifdef CONFIG_TCP_MD5SIG
1551 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1552
1553 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1554 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1555 if (key) {
1556
1557
1558
1559
1560
1561
1562 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1563 key->key, key->keylen, GFP_ATOMIC);
1564 sk_gso_disable(newsk);
1565 }
1566#endif
1567
1568 if (__inet_inherit_port(sk, newsk) < 0)
1569 goto put_and_exit;
1570 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1571 &found_dup_sk);
1572 if (likely(*own_req)) {
1573 tcp_move_syn(newtp, req);
1574 ireq->ireq_opt = NULL;
1575 } else {
1576 newinet->inet_opt = NULL;
1577
1578 if (!req_unhash && found_dup_sk) {
1579
1580
1581
1582 bh_unlock_sock(newsk);
1583 sock_put(newsk);
1584 newsk = NULL;
1585 }
1586 }
1587 return newsk;
1588
1589exit_overflow:
1590 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1591exit_nonewsk:
1592 dst_release(dst);
1593exit:
1594 tcp_listendrop(sk);
1595 return NULL;
1596put_and_exit:
1597 newinet->inet_opt = NULL;
1598 inet_csk_prepare_forced_close(newsk);
1599 tcp_done(newsk);
1600 goto exit;
1601}
1602EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1603
1604static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1605{
1606#ifdef CONFIG_SYN_COOKIES
1607 const struct tcphdr *th = tcp_hdr(skb);
1608
1609 if (!th->syn)
1610 sk = cookie_v4_check(sk, skb);
1611#endif
1612 return sk;
1613}
1614
1615u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1616 struct tcphdr *th, u32 *cookie)
1617{
1618 u16 mss = 0;
1619#ifdef CONFIG_SYN_COOKIES
1620 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1621 &tcp_request_sock_ipv4_ops, sk, th);
1622 if (mss) {
1623 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1624 tcp_synq_overflow(sk);
1625 }
1626#endif
1627 return mss;
1628}
1629
1630INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1631 u32));
1632
1633
1634
1635
1636
1637
1638
1639
1640int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1641{
1642 enum skb_drop_reason reason;
1643 struct sock *rsk;
1644
1645 if (sk->sk_state == TCP_ESTABLISHED) {
1646 struct dst_entry *dst;
1647
1648 dst = rcu_dereference_protected(sk->sk_rx_dst,
1649 lockdep_sock_is_held(sk));
1650
1651 sock_rps_save_rxhash(sk, skb);
1652 sk_mark_napi_id(sk, skb);
1653 if (dst) {
1654 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1655 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1656 dst, 0)) {
1657 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1658 dst_release(dst);
1659 }
1660 }
1661 tcp_rcv_established(sk, skb);
1662 return 0;
1663 }
1664
1665 reason = SKB_DROP_REASON_NOT_SPECIFIED;
1666 if (tcp_checksum_complete(skb))
1667 goto csum_err;
1668
1669 if (sk->sk_state == TCP_LISTEN) {
1670 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1671
1672 if (!nsk)
1673 goto discard;
1674 if (nsk != sk) {
1675 if (tcp_child_process(sk, nsk, skb)) {
1676 rsk = nsk;
1677 goto reset;
1678 }
1679 return 0;
1680 }
1681 } else
1682 sock_rps_save_rxhash(sk, skb);
1683
1684 if (tcp_rcv_state_process(sk, skb)) {
1685 rsk = sk;
1686 goto reset;
1687 }
1688 return 0;
1689
1690reset:
1691 tcp_v4_send_reset(rsk, skb);
1692discard:
1693 kfree_skb_reason(skb, reason);
1694
1695
1696
1697
1698
1699 return 0;
1700
1701csum_err:
1702 reason = SKB_DROP_REASON_TCP_CSUM;
1703 trace_tcp_bad_csum(skb);
1704 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1705 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1706 goto discard;
1707}
1708EXPORT_SYMBOL(tcp_v4_do_rcv);
1709
1710int tcp_v4_early_demux(struct sk_buff *skb)
1711{
1712 const struct iphdr *iph;
1713 const struct tcphdr *th;
1714 struct sock *sk;
1715
1716 if (skb->pkt_type != PACKET_HOST)
1717 return 0;
1718
1719 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1720 return 0;
1721
1722 iph = ip_hdr(skb);
1723 th = tcp_hdr(skb);
1724
1725 if (th->doff < sizeof(struct tcphdr) / 4)
1726 return 0;
1727
1728 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1729 iph->saddr, th->source,
1730 iph->daddr, ntohs(th->dest),
1731 skb->skb_iif, inet_sdif(skb));
1732 if (sk) {
1733 skb->sk = sk;
1734 skb->destructor = sock_edemux;
1735 if (sk_fullsock(sk)) {
1736 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1737
1738 if (dst)
1739 dst = dst_check(dst, 0);
1740 if (dst &&
1741 sk->sk_rx_dst_ifindex == skb->skb_iif)
1742 skb_dst_set_noref(skb, dst);
1743 }
1744 }
1745 return 0;
1746}
1747
1748bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1749 enum skb_drop_reason *reason)
1750{
1751 u32 limit, tail_gso_size, tail_gso_segs;
1752 struct skb_shared_info *shinfo;
1753 const struct tcphdr *th;
1754 struct tcphdr *thtail;
1755 struct sk_buff *tail;
1756 unsigned int hdrlen;
1757 bool fragstolen;
1758 u32 gso_segs;
1759 u32 gso_size;
1760 int delta;
1761
1762
1763
1764
1765
1766
1767
1768 skb_condense(skb);
1769
1770 skb_dst_drop(skb);
1771
1772 if (unlikely(tcp_checksum_complete(skb))) {
1773 bh_unlock_sock(sk);
1774 trace_tcp_bad_csum(skb);
1775 *reason = SKB_DROP_REASON_TCP_CSUM;
1776 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1777 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1778 return true;
1779 }
1780
1781
1782
1783
1784
1785 th = (const struct tcphdr *)skb->data;
1786 hdrlen = th->doff * 4;
1787
1788 tail = sk->sk_backlog.tail;
1789 if (!tail)
1790 goto no_coalesce;
1791 thtail = (struct tcphdr *)tail->data;
1792
1793 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1794 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1795 ((TCP_SKB_CB(tail)->tcp_flags |
1796 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1797 !((TCP_SKB_CB(tail)->tcp_flags &
1798 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1799 ((TCP_SKB_CB(tail)->tcp_flags ^
1800 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1801#ifdef CONFIG_TLS_DEVICE
1802 tail->decrypted != skb->decrypted ||
1803#endif
1804 thtail->doff != th->doff ||
1805 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1806 goto no_coalesce;
1807
1808 __skb_pull(skb, hdrlen);
1809
1810 shinfo = skb_shinfo(skb);
1811 gso_size = shinfo->gso_size ?: skb->len;
1812 gso_segs = shinfo->gso_segs ?: 1;
1813
1814 shinfo = skb_shinfo(tail);
1815 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1816 tail_gso_segs = shinfo->gso_segs ?: 1;
1817
1818 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1819 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1820
1821 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1822 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1823 thtail->window = th->window;
1824 }
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834 thtail->fin |= th->fin;
1835 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1836
1837 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1838 TCP_SKB_CB(tail)->has_rxtstamp = true;
1839 tail->tstamp = skb->tstamp;
1840 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1841 }
1842
1843
1844 shinfo->gso_size = max(gso_size, tail_gso_size);
1845 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1846
1847 sk->sk_backlog.len += delta;
1848 __NET_INC_STATS(sock_net(sk),
1849 LINUX_MIB_TCPBACKLOGCOALESCE);
1850 kfree_skb_partial(skb, fragstolen);
1851 return false;
1852 }
1853 __skb_push(skb, hdrlen);
1854
1855no_coalesce:
1856 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1857
1858
1859
1860
1861
1862 limit += 64 * 1024;
1863
1864 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1865 bh_unlock_sock(sk);
1866 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1867 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1868 return true;
1869 }
1870 return false;
1871}
1872EXPORT_SYMBOL(tcp_add_backlog);
1873
1874int tcp_filter(struct sock *sk, struct sk_buff *skb)
1875{
1876 struct tcphdr *th = (struct tcphdr *)skb->data;
1877
1878 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1879}
1880EXPORT_SYMBOL(tcp_filter);
1881
1882static void tcp_v4_restore_cb(struct sk_buff *skb)
1883{
1884 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1885 sizeof(struct inet_skb_parm));
1886}
1887
1888static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1889 const struct tcphdr *th)
1890{
1891
1892
1893
1894 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1895 sizeof(struct inet_skb_parm));
1896 barrier();
1897
1898 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1899 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1900 skb->len - th->doff * 4);
1901 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1902 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1903 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1904 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1905 TCP_SKB_CB(skb)->sacked = 0;
1906 TCP_SKB_CB(skb)->has_rxtstamp =
1907 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1908}
1909
1910
1911
1912
1913
1914int tcp_v4_rcv(struct sk_buff *skb)
1915{
1916 struct net *net = dev_net(skb->dev);
1917 enum skb_drop_reason drop_reason;
1918 int sdif = inet_sdif(skb);
1919 int dif = inet_iif(skb);
1920 const struct iphdr *iph;
1921 const struct tcphdr *th;
1922 bool refcounted;
1923 struct sock *sk;
1924 int ret;
1925
1926 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1927 if (skb->pkt_type != PACKET_HOST)
1928 goto discard_it;
1929
1930
1931 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1932
1933 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1934 goto discard_it;
1935
1936 th = (const struct tcphdr *)skb->data;
1937
1938 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1939 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1940 goto bad_packet;
1941 }
1942 if (!pskb_may_pull(skb, th->doff * 4))
1943 goto discard_it;
1944
1945
1946
1947
1948
1949
1950 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1951 goto csum_error;
1952
1953 th = (const struct tcphdr *)skb->data;
1954 iph = ip_hdr(skb);
1955lookup:
1956 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1957 th->dest, sdif, &refcounted);
1958 if (!sk)
1959 goto no_tcp_socket;
1960
1961process:
1962 if (sk->sk_state == TCP_TIME_WAIT)
1963 goto do_time_wait;
1964
1965 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1966 struct request_sock *req = inet_reqsk(sk);
1967 bool req_stolen = false;
1968 struct sock *nsk;
1969
1970 sk = req->rsk_listener;
1971 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1972 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1973 else
1974 drop_reason = tcp_inbound_md5_hash(sk, skb,
1975 &iph->saddr, &iph->daddr,
1976 AF_INET, dif, sdif);
1977 if (unlikely(drop_reason)) {
1978 sk_drops_add(sk, skb);
1979 reqsk_put(req);
1980 goto discard_it;
1981 }
1982 if (tcp_checksum_complete(skb)) {
1983 reqsk_put(req);
1984 goto csum_error;
1985 }
1986 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1987 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
1988 if (!nsk) {
1989 inet_csk_reqsk_queue_drop_and_put(sk, req);
1990 goto lookup;
1991 }
1992 sk = nsk;
1993
1994
1995
1996 } else {
1997
1998
1999
2000 sock_hold(sk);
2001 }
2002 refcounted = true;
2003 nsk = NULL;
2004 if (!tcp_filter(sk, skb)) {
2005 th = (const struct tcphdr *)skb->data;
2006 iph = ip_hdr(skb);
2007 tcp_v4_fill_cb(skb, iph, th);
2008 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2009 } else {
2010 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2011 }
2012 if (!nsk) {
2013 reqsk_put(req);
2014 if (req_stolen) {
2015
2016
2017
2018
2019
2020 tcp_v4_restore_cb(skb);
2021 sock_put(sk);
2022 goto lookup;
2023 }
2024 goto discard_and_relse;
2025 }
2026 nf_reset_ct(skb);
2027 if (nsk == sk) {
2028 reqsk_put(req);
2029 tcp_v4_restore_cb(skb);
2030 } else if (tcp_child_process(sk, nsk, skb)) {
2031 tcp_v4_send_reset(nsk, skb);
2032 goto discard_and_relse;
2033 } else {
2034 sock_put(sk);
2035 return 0;
2036 }
2037 }
2038
2039 if (static_branch_unlikely(&ip4_min_ttl)) {
2040
2041 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2042 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2043 goto discard_and_relse;
2044 }
2045 }
2046
2047 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2048 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2049 goto discard_and_relse;
2050 }
2051
2052 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2053 &iph->daddr, AF_INET, dif, sdif);
2054 if (drop_reason)
2055 goto discard_and_relse;
2056
2057 nf_reset_ct(skb);
2058
2059 if (tcp_filter(sk, skb)) {
2060 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2061 goto discard_and_relse;
2062 }
2063 th = (const struct tcphdr *)skb->data;
2064 iph = ip_hdr(skb);
2065 tcp_v4_fill_cb(skb, iph, th);
2066
2067 skb->dev = NULL;
2068
2069 if (sk->sk_state == TCP_LISTEN) {
2070 ret = tcp_v4_do_rcv(sk, skb);
2071 goto put_and_return;
2072 }
2073
2074 sk_incoming_cpu_update(sk);
2075
2076 bh_lock_sock_nested(sk);
2077 tcp_segs_in(tcp_sk(sk), skb);
2078 ret = 0;
2079 if (!sock_owned_by_user(sk)) {
2080 ret = tcp_v4_do_rcv(sk, skb);
2081 } else {
2082 if (tcp_add_backlog(sk, skb, &drop_reason))
2083 goto discard_and_relse;
2084 }
2085 bh_unlock_sock(sk);
2086
2087put_and_return:
2088 if (refcounted)
2089 sock_put(sk);
2090
2091 return ret;
2092
2093no_tcp_socket:
2094 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2095 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2096 goto discard_it;
2097
2098 tcp_v4_fill_cb(skb, iph, th);
2099
2100 if (tcp_checksum_complete(skb)) {
2101csum_error:
2102 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2103 trace_tcp_bad_csum(skb);
2104 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2105bad_packet:
2106 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2107 } else {
2108 tcp_v4_send_reset(NULL, skb);
2109 }
2110
2111discard_it:
2112 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2113
2114 kfree_skb_reason(skb, drop_reason);
2115 return 0;
2116
2117discard_and_relse:
2118 sk_drops_add(sk, skb);
2119 if (refcounted)
2120 sock_put(sk);
2121 goto discard_it;
2122
2123do_time_wait:
2124 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2125 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2126 inet_twsk_put(inet_twsk(sk));
2127 goto discard_it;
2128 }
2129
2130 tcp_v4_fill_cb(skb, iph, th);
2131
2132 if (tcp_checksum_complete(skb)) {
2133 inet_twsk_put(inet_twsk(sk));
2134 goto csum_error;
2135 }
2136 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2137 case TCP_TW_SYN: {
2138 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2139 &tcp_hashinfo, skb,
2140 __tcp_hdrlen(th),
2141 iph->saddr, th->source,
2142 iph->daddr, th->dest,
2143 inet_iif(skb),
2144 sdif);
2145 if (sk2) {
2146 inet_twsk_deschedule_put(inet_twsk(sk));
2147 sk = sk2;
2148 tcp_v4_restore_cb(skb);
2149 refcounted = false;
2150 goto process;
2151 }
2152 }
2153
2154 fallthrough;
2155 case TCP_TW_ACK:
2156 tcp_v4_timewait_ack(sk, skb);
2157 break;
2158 case TCP_TW_RST:
2159 tcp_v4_send_reset(sk, skb);
2160 inet_twsk_deschedule_put(inet_twsk(sk));
2161 goto discard_it;
2162 case TCP_TW_SUCCESS:;
2163 }
2164 goto discard_it;
2165}
2166
2167static struct timewait_sock_ops tcp_timewait_sock_ops = {
2168 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2169 .twsk_unique = tcp_twsk_unique,
2170 .twsk_destructor= tcp_twsk_destructor,
2171};
2172
2173void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2174{
2175 struct dst_entry *dst = skb_dst(skb);
2176
2177 if (dst && dst_hold_safe(dst)) {
2178 rcu_assign_pointer(sk->sk_rx_dst, dst);
2179 sk->sk_rx_dst_ifindex = skb->skb_iif;
2180 }
2181}
2182EXPORT_SYMBOL(inet_sk_rx_dst_set);
2183
2184const struct inet_connection_sock_af_ops ipv4_specific = {
2185 .queue_xmit = ip_queue_xmit,
2186 .send_check = tcp_v4_send_check,
2187 .rebuild_header = inet_sk_rebuild_header,
2188 .sk_rx_dst_set = inet_sk_rx_dst_set,
2189 .conn_request = tcp_v4_conn_request,
2190 .syn_recv_sock = tcp_v4_syn_recv_sock,
2191 .net_header_len = sizeof(struct iphdr),
2192 .setsockopt = ip_setsockopt,
2193 .getsockopt = ip_getsockopt,
2194 .addr2sockaddr = inet_csk_addr2sockaddr,
2195 .sockaddr_len = sizeof(struct sockaddr_in),
2196 .mtu_reduced = tcp_v4_mtu_reduced,
2197};
2198EXPORT_SYMBOL(ipv4_specific);
2199
2200#ifdef CONFIG_TCP_MD5SIG
2201static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2202 .md5_lookup = tcp_v4_md5_lookup,
2203 .calc_md5_hash = tcp_v4_md5_hash_skb,
2204 .md5_parse = tcp_v4_parse_md5_keys,
2205};
2206#endif
2207
2208
2209
2210
2211static int tcp_v4_init_sock(struct sock *sk)
2212{
2213 struct inet_connection_sock *icsk = inet_csk(sk);
2214
2215 tcp_init_sock(sk);
2216
2217 icsk->icsk_af_ops = &ipv4_specific;
2218
2219#ifdef CONFIG_TCP_MD5SIG
2220 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2221#endif
2222
2223 return 0;
2224}
2225
2226void tcp_v4_destroy_sock(struct sock *sk)
2227{
2228 struct tcp_sock *tp = tcp_sk(sk);
2229
2230 trace_tcp_destroy_sock(sk);
2231
2232 tcp_clear_xmit_timers(sk);
2233
2234 tcp_cleanup_congestion_control(sk);
2235
2236 tcp_cleanup_ulp(sk);
2237
2238
2239 tcp_write_queue_purge(sk);
2240
2241
2242 tcp_fastopen_active_disable_ofo_check(sk);
2243
2244
2245 skb_rbtree_purge(&tp->out_of_order_queue);
2246
2247#ifdef CONFIG_TCP_MD5SIG
2248
2249 if (tp->md5sig_info) {
2250 tcp_clear_md5_list(sk);
2251 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2252 tp->md5sig_info = NULL;
2253 }
2254#endif
2255
2256
2257 if (inet_csk(sk)->icsk_bind_hash)
2258 inet_put_port(sk);
2259
2260 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2261
2262
2263 tcp_free_fastopen_req(tp);
2264 tcp_fastopen_destroy_cipher(sk);
2265 tcp_saved_syn_free(tp);
2266
2267 sk_sockets_allocated_dec(sk);
2268}
2269EXPORT_SYMBOL(tcp_v4_destroy_sock);
2270
2271#ifdef CONFIG_PROC_FS
2272
2273
2274static unsigned short seq_file_family(const struct seq_file *seq);
2275
2276static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2277{
2278 unsigned short family = seq_file_family(seq);
2279
2280
2281 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2282 net_eq(sock_net(sk), seq_file_net(seq)));
2283}
2284
2285
2286
2287
2288static void *listening_get_first(struct seq_file *seq)
2289{
2290 struct tcp_iter_state *st = seq->private;
2291
2292 st->offset = 0;
2293 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2294 struct inet_listen_hashbucket *ilb2;
2295 struct hlist_nulls_node *node;
2296 struct sock *sk;
2297
2298 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2299 if (hlist_nulls_empty(&ilb2->nulls_head))
2300 continue;
2301
2302 spin_lock(&ilb2->lock);
2303 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2304 if (seq_sk_match(seq, sk))
2305 return sk;
2306 }
2307 spin_unlock(&ilb2->lock);
2308 }
2309
2310 return NULL;
2311}
2312
2313
2314
2315
2316
2317
2318static void *listening_get_next(struct seq_file *seq, void *cur)
2319{
2320 struct tcp_iter_state *st = seq->private;
2321 struct inet_listen_hashbucket *ilb2;
2322 struct hlist_nulls_node *node;
2323 struct sock *sk = cur;
2324
2325 ++st->num;
2326 ++st->offset;
2327
2328 sk = sk_nulls_next(sk);
2329 sk_nulls_for_each_from(sk, node) {
2330 if (seq_sk_match(seq, sk))
2331 return sk;
2332 }
2333
2334 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2335 spin_unlock(&ilb2->lock);
2336 ++st->bucket;
2337 return listening_get_first(seq);
2338}
2339
2340static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2341{
2342 struct tcp_iter_state *st = seq->private;
2343 void *rc;
2344
2345 st->bucket = 0;
2346 st->offset = 0;
2347 rc = listening_get_first(seq);
2348
2349 while (rc && *pos) {
2350 rc = listening_get_next(seq, rc);
2351 --*pos;
2352 }
2353 return rc;
2354}
2355
2356static inline bool empty_bucket(const struct tcp_iter_state *st)
2357{
2358 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2359}
2360
2361
2362
2363
2364
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock_bh(lock);
	}

	return NULL;
}

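/* Return the matching sk after "cur" in the same ehash bucket, or fall
 * through to established_get_first() on the next bucket once this one is
 * exhausted.
 */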
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

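/* Return the pos'th matching established socket, counting from bucket 0. */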
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

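/* Return the pos'th socket of the whole dump: listening sockets first,
 * then the established hash (which also holds timewait sockets).
 */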
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}

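/* Try to resume the walk at the bucket and in-bucket offset recorded in st
 * by a previous pass, so a sequential read does not have to rescan from the
 * beginning.  Returns NULL if the saved position can no longer be reached.
 */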
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > tcp_hashinfo.lhash2_mask)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);

void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);

void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);

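/* Format one pending connection request (reported in TCP_SYN_RECV state)
 * as a /proc/net/tcp line.
 */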
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}

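/* Format one full TCP socket as a /proc/net/tcp line. */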
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active = 1;
		timer_expires = icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active = 4;
		timer_expires = icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active = 2;
		timer_expires = sk->sk_timer.expires;
	} else {
		timer_active = 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tcp_snd_cwnd(tp),
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

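/* Format one timewait socket as a /proc/net/tcp line; tw_substate
 * distinguishes TIME_WAIT from FIN_WAIT2.
 */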
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

#ifdef CONFIG_BPF_SYSCALL
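/* State for the BPF TCP iterator: the matching sockets of the current
 * bucket are collected into "batch" (with a reference held on each) so the
 * bucket lock can be dropped before the BPF program runs on them.
 */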
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	struct sock **batch;
	bool st_bucket_done;
};

struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};

static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.sk_common = sk_common;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}

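/* Drop the references still held on the not-yet-shown sockets of the batch. */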
static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
{
	while (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);
}

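/* Resize the socket batch to new_batch_sz entries.  Any sockets still held
 * in the old batch are released before the old array is freed.
 */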
static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
				      unsigned int new_batch_sz)
{
	struct sock **new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	bpf_iter_tcp_put_batch(iter);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;

	return 0;
}

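/* Take a reference on start_sk and on every later matching socket in the
 * same listening bucket, then drop the bucket lock.  The return value is
 * the number of matching sockets seen, which can exceed the batch capacity.
 */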
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
						 struct sock *start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	sk = sk_nulls_next(start_sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			expected++;
		}
	}
	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);

	return expected;
}

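/* Same as bpf_iter_tcp_listening_batch() but for an established-hash
 * bucket: batch the matching sockets after start_sk and release the ehash
 * bucket lock.
 */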
static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
						   struct sock *start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	sk = sk_nulls_next(start_sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			expected++;
		}
	}
	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));

	return expected;
}

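/* Refill the batch from the bucket recorded in st, advancing to the next
 * bucket if the previous one was fully consumed.  If the batch array turns
 * out to be too small for the bucket, it is grown to 1.5x the number of
 * matching sockets and the bucket is re-scanned once.
 */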
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	/* The st->bucket is done.  Directly advance to the next
	 * bucket instead of having the tcp_seek_last_pos() to skip
	 * one by one in the current bucket and eventually find out
	 * it has to advance to the next bucket.
	 */
	if (iter->st_bucket_done) {
		st->offset = 0;
		st->bucket++;
		if (st->state == TCP_SEQ_STATE_LISTENING &&
		    st->bucket > tcp_hashinfo.lhash2_mask) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
		}
	}

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;
	iter->st_bucket_done = false;

	sk = tcp_seek_last_pos(seq);
	if (!sk)
		return NULL; /* Done */

	if (st->state == TCP_SEQ_STATE_LISTENING)
		expected = bpf_iter_tcp_listening_batch(seq, sk);
	else
		expected = bpf_iter_tcp_established_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}

static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	/* bpf iter does not support lseek, so it always
	 * continue from where it was stop()-ped.
	 */
	if (*pos)
		return bpf_iter_tcp_batch(seq);

	return SEQ_START_TOKEN;
}

static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* Keeping st->num consistent in tcp_iter_state.
		 * bpf_iter_tcp does not use st->num.
		 * meta.seq_num is used instead.
		 */
		st->num++;
		/* Move st->offset to the next sk in the bucket such that
		 * the future start() will resume at st->offset in
		 * st->bucket.  See tcp_seek_last_pos().
		 */
		st->offset++;
		sock_put(iter->batch[iter->cur_sk++]);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
	 */
	st->last_pos = *pos;
	return sk;
}

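/* Run the attached BPF program for one batched socket.  Full sockets are
 * shown under lock_sock_fast(); sockets that got unhashed in the meantime
 * are skipped.
 */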
static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	bool slow;
	uid_t uid;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk_fullsock(sk))
		slow = lock_sock_fast(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sock_i_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = tcp_prog_seq_show(prog, &meta, v, uid);

unlock:
	if (sk_fullsock(sk))
		unlock_sock_fast(sk, slow);
	return ret;
}

static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk) {
		bpf_iter_tcp_put_batch(iter);
		iter->st_bucket_done = false;
	}
}

static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
#endif

static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
	if (seq->op == &bpf_iter_tcp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Iterated from proc fs */
	afinfo = pde_data(file_inode(seq->file));
	return afinfo->family;
}

static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

/* @wake is one when sk_stream_write_space() calls us.
 * This sends EPOLLOUT only if notsent_bytes is half the limit.
 * This mimics the strategy used in sock_def_write_space().
 */
bool tcp_stream_memory_free(const struct sock *sk, int wake)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
			    READ_ONCE(tp->snd_nxt);

	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
}
EXPORT_SYMBOL(tcp_stream_memory_free);

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,

	.memory_allocated	= &tcp_memory_allocated,
	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,

	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

static void __net_exit tcp_sk_exit(struct net *net)
{
	struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;

	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);
	if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
		kfree(tcp_death_row);
}

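/* Per-netns TCP initialization: set the sysctl defaults, allocate the
 * timewait "death row" and pick the initial congestion control module.
 */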
static int __net_init tcp_sk_init(struct net *net)
{
	int cnt;

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
	if (!net->ipv4.tcp_death_row)
		return -ENOMEM;
	refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1;
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;

	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;

	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_tso_rtt_log = 9;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

#define INIT_BATCH_SZ 16

static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_tcp_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
	if (err) {
		bpf_iter_fini_seq_net(priv_data);
		return err;
	}

	return 0;
}

static void bpf_iter_fini_tcp(void *priv_data)
{
	struct bpf_tcp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}

static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
};

static const struct bpf_func_proto *
bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
			    const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}

static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};
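
/* Illustrative sketch (not part of this file): a BPF program consumes the
 * "tcp" iterator registered above roughly as follows.  The program name
 * dump_tcp and the printed fields are arbitrary; only SEC("iter/tcp"),
 * struct bpf_iter__tcp and BPF_SEQ_PRINTF() come from the kernel/libbpf API.
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family %u uid %u\n",
 *			       skc->skc_family, ctx->uid);
 *		return 0;
 *	}
 */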

static void __init bpf_iter_register(void)
{
	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
}

#endif

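/* Boot-time initialization: create the per-CPU control sockets used for
 * sending RSTs and ACKs, register the per-netns operations and, when
 * enabled, the BPF "tcp" iterator target.
 */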
void __init tcp_v4_init(void)
{
	int cpu, res;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, &init_net);
		if (res)
			panic("Failed to create the TCP control socket.\n");
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		per_cpu(ipv4_tcp_sk, cpu) = sk;
	}
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}