linux/net/ipv4/tcp_ipv4.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              Implementation of the Transmission Control Protocol(TCP).
   8 *
   9 *              IPv4 specific functions
  10 *
  11 *              code split from:
  12 *              linux/ipv4/tcp.c
  13 *              linux/ipv4/tcp_input.c
  14 *              linux/ipv4/tcp_output.c
  15 *
  16 *              See tcp.c for author information
  17 */
  18
  19/*
  20 * Changes:
  21 *              David S. Miller :       New socket lookup architecture.
  22 *                                      This code is dedicated to John Dyson.
  23 *              David S. Miller :       Change semantics of established hash,
  24 *                                      half is devoted to TIME_WAIT sockets
  25 *                                      and the rest go in the other half.
  26 *              Andi Kleen :            Add support for syncookies and fixed
  27 *                                      some bugs: ip options weren't passed to
  28 *                                      the TCP layer, missed a check for an
  29 *                                      ACK bit.
  30 *              Andi Kleen :            Implemented fast path mtu discovery.
  31 *                                      Fixed many serious bugs in the
  32 *                                      request_sock handling and moved
  33 *                                      most of it into the af independent code.
  34 *                                      Added tail drop and some other bugfixes.
  35 *                                      Added new listen semantics.
  36 *              Mike McLagan    :       Routing by source
  37 *      Juan Jose Ciarlante:            ip_dynaddr bits
  38 *              Andi Kleen:             various fixes.
  39 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  40 *                                      coma.
  41 *      Andi Kleen              :       Fix new listen.
  42 *      Andi Kleen              :       Fix accept error reporting.
  43 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
   44 *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
  45 *                                      a single port at the same time.
  46 */
  47
  48#define pr_fmt(fmt) "TCP: " fmt
  49
  50#include <linux/bottom_half.h>
  51#include <linux/types.h>
  52#include <linux/fcntl.h>
  53#include <linux/module.h>
  54#include <linux/random.h>
  55#include <linux/cache.h>
  56#include <linux/jhash.h>
  57#include <linux/init.h>
  58#include <linux/times.h>
  59#include <linux/slab.h>
  60#include <linux/sched.h>
  61
  62#include <net/net_namespace.h>
  63#include <net/icmp.h>
  64#include <net/inet_hashtables.h>
  65#include <net/tcp.h>
  66#include <net/transp_v6.h>
  67#include <net/ipv6.h>
  68#include <net/inet_common.h>
  69#include <net/timewait_sock.h>
  70#include <net/xfrm.h>
  71#include <net/secure_seq.h>
  72#include <net/busy_poll.h>
  73
  74#include <linux/inet.h>
  75#include <linux/ipv6.h>
  76#include <linux/stddef.h>
  77#include <linux/proc_fs.h>
  78#include <linux/seq_file.h>
  79#include <linux/inetdevice.h>
  80#include <linux/btf_ids.h>
  81
  82#include <crypto/hash.h>
  83#include <linux/scatterlist.h>
  84
  85#include <trace/events/tcp.h>
  86
  87#ifdef CONFIG_TCP_MD5SIG
  88static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  89                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
  90#endif
  91
  92struct inet_hashinfo tcp_hashinfo;
  93EXPORT_SYMBOL(tcp_hashinfo);
  94
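/* Per-CPU kernel control socket, used by tcp_v4_send_reset() and
 * tcp_v4_send_ack() below to build and emit replies outside the context
 * of a full socket.
 */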
  95static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
  96
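/* Derive the initial sequence number for an incoming connection from the
 * 4-tuple of the received SYN.  Since skb is the packet sent by the peer,
 * its daddr/dest are our local address/port when handed to secure_tcp_seq().
 */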
  97static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  98{
  99        return secure_tcp_seq(ip_hdr(skb)->daddr,
 100                              ip_hdr(skb)->saddr,
 101                              tcp_hdr(skb)->dest,
 102                              tcp_hdr(skb)->source);
 103}
 104
 105static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 106{
 107        return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 108}
 109
 110int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 111{
 112        int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
 113        const struct inet_timewait_sock *tw = inet_twsk(sktw);
 114        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 115        struct tcp_sock *tp = tcp_sk(sk);
 116
 117        if (reuse == 2) {
 118                /* Still does not detect *everything* that goes through
 119                 * lo, since we require a loopback src or dst address
  120                 * or a direct binding to the 'lo' interface.
 121                 */
 122                bool loopback = false;
 123                if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 124                        loopback = true;
 125#if IS_ENABLED(CONFIG_IPV6)
 126                if (tw->tw_family == AF_INET6) {
 127                        if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 128                            ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
 129                            ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 130                            ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
 131                                loopback = true;
 132                } else
 133#endif
 134                {
 135                        if (ipv4_is_loopback(tw->tw_daddr) ||
 136                            ipv4_is_loopback(tw->tw_rcv_saddr))
 137                                loopback = true;
 138                }
 139                if (!loopback)
 140                        reuse = 0;
 141        }
 142
  143        /* With PAWS, it is safe from the viewpoint
  144           of data integrity. Even without PAWS it is safe provided the sequence
  145           spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
  146
  147           Actually, the idea is close to VJ's, only the timestamp cache is
  148           held not per host but per port pair, and the TW bucket is used as the
  149           state holder.
  150
  151           If the TW bucket has already been destroyed, we fall back to VJ's
  152           scheme and use the initial timestamp retrieved from the peer table.
  153         */
 154        if (tcptw->tw_ts_recent_stamp &&
 155            (!twp || (reuse && time_after32(ktime_get_seconds(),
 156                                            tcptw->tw_ts_recent_stamp)))) {
 157                /* In case of repair and re-using TIME-WAIT sockets we still
 158                 * want to be sure that it is safe as above but honor the
 159                 * sequence numbers and time stamps set as part of the repair
 160                 * process.
 161                 *
 162                 * Without this check re-using a TIME-WAIT socket with TCP
 163                 * repair would accumulate a -1 on the repair assigned
 164                 * sequence number. The first time it is reused the sequence
 165                 * is -1, the second time -2, etc. This fixes that issue
 166                 * without appearing to create any others.
 167                 */
 168                if (likely(!tp->repair)) {
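                        /* Pick a write_seq well past the old connection's
                         * snd_nxt (65535 covers the largest unscaled window),
                         * so the new incarnation cannot overlap the old
                         * sequence space.  Avoid 0, which tcp_v4_connect()
                         * treats as "pick a fresh random ISN".
                         */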
 169                        u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
 170
 171                        if (!seq)
 172                                seq = 1;
 173                        WRITE_ONCE(tp->write_seq, seq);
 174                        tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 175                        tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 176                }
 177                sock_hold(sktw);
 178                return 1;
 179        }
 180
 181        return 0;
 182}
 183EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 184
 185static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 186                              int addr_len)
 187{
 188        /* This check is replicated from tcp_v4_connect() and intended to
  189         * prevent the BPF program called below from accessing bytes that are
  190         * outside the bound specified by the user in addr_len.
 191         */
 192        if (addr_len < sizeof(struct sockaddr_in))
 193                return -EINVAL;
 194
 195        sock_owned_by_me(sk);
 196
 197        return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
 198}
 199
 200/* This will initiate an outgoing connection. */
 201int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 202{
 203        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 204        struct inet_timewait_death_row *tcp_death_row;
 205        struct inet_sock *inet = inet_sk(sk);
 206        struct tcp_sock *tp = tcp_sk(sk);
 207        struct ip_options_rcu *inet_opt;
 208        struct net *net = sock_net(sk);
 209        __be16 orig_sport, orig_dport;
 210        __be32 daddr, nexthop;
 211        struct flowi4 *fl4;
 212        struct rtable *rt;
 213        int err;
 214
 215        if (addr_len < sizeof(struct sockaddr_in))
 216                return -EINVAL;
 217
 218        if (usin->sin_family != AF_INET)
 219                return -EAFNOSUPPORT;
 220
 221        nexthop = daddr = usin->sin_addr.s_addr;
 222        inet_opt = rcu_dereference_protected(inet->inet_opt,
 223                                             lockdep_sock_is_held(sk));
 224        if (inet_opt && inet_opt->opt.srr) {
 225                if (!daddr)
 226                        return -EINVAL;
 227                nexthop = inet_opt->opt.faddr;
 228        }
 229
 230        orig_sport = inet->inet_sport;
 231        orig_dport = usin->sin_port;
 232        fl4 = &inet->cork.fl.u.ip4;
 233        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 234                              sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
 235                              orig_dport, sk);
 236        if (IS_ERR(rt)) {
 237                err = PTR_ERR(rt);
 238                if (err == -ENETUNREACH)
 239                        IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
 240                return err;
 241        }
 242
 243        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 244                ip_rt_put(rt);
 245                return -ENETUNREACH;
 246        }
 247
 248        if (!inet_opt || !inet_opt->opt.srr)
 249                daddr = fl4->daddr;
 250
 251        tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 252
 253        if (!inet->inet_saddr) {
 254                err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
 255                if (err) {
 256                        ip_rt_put(rt);
 257                        return err;
 258                }
 259        } else {
 260                sk_rcv_saddr_set(sk, inet->inet_saddr);
 261        }
 262
 263        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 264                /* Reset inherited state */
 265                tp->rx_opt.ts_recent       = 0;
 266                tp->rx_opt.ts_recent_stamp = 0;
 267                if (likely(!tp->repair))
 268                        WRITE_ONCE(tp->write_seq, 0);
 269        }
 270
 271        inet->inet_dport = usin->sin_port;
 272        sk_daddr_set(sk, daddr);
 273
 274        inet_csk(sk)->icsk_ext_hdr_len = 0;
 275        if (inet_opt)
 276                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 277
 278        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 279
 280        /* Socket identity is still unknown (sport may be zero).
  281         * However, we set the state to SYN-SENT and, without releasing the
  282         * socket lock, select a source port, enter ourselves into the hash
  283         * tables, and complete initialization after this.
 284         */
 285        tcp_set_state(sk, TCP_SYN_SENT);
 286        err = inet_hash_connect(tcp_death_row, sk);
 287        if (err)
 288                goto failure;
 289
 290        sk_set_txhash(sk);
 291
 292        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 293                               inet->inet_sport, inet->inet_dport, sk);
 294        if (IS_ERR(rt)) {
 295                err = PTR_ERR(rt);
 296                rt = NULL;
 297                goto failure;
 298        }
 299        tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
 300        /* OK, now commit destination to socket.  */
 301        sk->sk_gso_type = SKB_GSO_TCPV4;
 302        sk_setup_caps(sk, &rt->dst);
 303        rt = NULL;
 304
 305        if (likely(!tp->repair)) {
 306                if (!tp->write_seq)
 307                        WRITE_ONCE(tp->write_seq,
 308                                   secure_tcp_seq(inet->inet_saddr,
 309                                                  inet->inet_daddr,
 310                                                  inet->inet_sport,
 311                                                  usin->sin_port));
 312                WRITE_ONCE(tp->tsoffset,
 313                           secure_tcp_ts_off(net, inet->inet_saddr,
 314                                             inet->inet_daddr));
 315        }
 316
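        /* Start the IP ID counter at a random value; presumably this keeps
         * the IDs used by this connection unpredictable to off-path
         * observers.
         */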
 317        atomic_set(&inet->inet_id, get_random_u16());
 318
 319        if (tcp_fastopen_defer_connect(sk, &err))
 320                return err;
 321        if (err)
 322                goto failure;
 323
 324        err = tcp_connect(sk);
 325
 326        if (err)
 327                goto failure;
 328
 329        return 0;
 330
 331failure:
 332        /*
 333         * This unhashes the socket and releases the local port,
 334         * if necessary.
 335         */
 336        tcp_set_state(sk, TCP_CLOSE);
 337        inet_bhash2_reset_saddr(sk);
 338        ip_rt_put(rt);
 339        sk->sk_route_caps = 0;
 340        inet->inet_dport = 0;
 341        return err;
 342}
 343EXPORT_SYMBOL(tcp_v4_connect);
 344
 345/*
 346 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
  347 * It can be called through tcp_release_cb() if the socket was owned by the
  348 * user at the time tcp_v4_err() was called to handle the ICMP message.
 349 */
 350void tcp_v4_mtu_reduced(struct sock *sk)
 351{
 352        struct inet_sock *inet = inet_sk(sk);
 353        struct dst_entry *dst;
 354        u32 mtu;
 355
 356        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 357                return;
 358        mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
 359        dst = inet_csk_update_pmtu(sk, mtu);
 360        if (!dst)
 361                return;
 362
  363        /* Something is about to go wrong... Remember the soft error
  364         * in case this connection is not able to recover.
 365         */
 366        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 367                WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
 368
 369        mtu = dst_mtu(dst);
 370
 371        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 372            ip_sk_accept_pmtu(sk) &&
 373            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 374                tcp_sync_mss(sk, mtu);
 375
 376                /* Resend the TCP packet because it's
 377                 * clear that the old packet has been
 378                 * dropped. This is the new "fast" path mtu
 379                 * discovery.
 380                 */
 381                tcp_simple_retransmit(sk);
 382        } /* else let the usual retransmit timer handle it */
 383}
 384EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 385
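/* Hand an ICMP redirect to the route attached to this socket, if any. */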
 386static void do_redirect(struct sk_buff *skb, struct sock *sk)
 387{
 388        struct dst_entry *dst = __sk_dst_check(sk, 0);
 389
 390        if (dst)
 391                dst->ops->redirect(dst, sk, skb);
 392}
 393
 394
 395/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 396void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 397{
 398        struct request_sock *req = inet_reqsk(sk);
 399        struct net *net = sock_net(sk);
 400
 401        /* ICMPs are not backlogged, hence we cannot get
 402         * an established socket here.
 403         */
 404        if (seq != tcp_rsk(req)->snt_isn) {
 405                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 406        } else if (abort) {
 407                /*
 408                 * Still in SYN_RECV, just remove it silently.
 409                 * There is no good way to pass the error to the newly
 410                 * created socket, and POSIX does not want network
 411                 * errors returned from accept().
 412                 */
 413                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 414                tcp_listendrop(req->rsk_listener);
 415        }
 416        reqsk_put(req);
 417}
 418EXPORT_SYMBOL(tcp_req_err);
 419
 420/* TCP-LD (RFC 6069) logic */
 421void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
 422{
 423        struct inet_connection_sock *icsk = inet_csk(sk);
 424        struct tcp_sock *tp = tcp_sk(sk);
 425        struct sk_buff *skb;
 426        s32 remaining;
 427        u32 delta_us;
 428
 429        if (sock_owned_by_user(sk))
 430                return;
 431
 432        if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 433            !icsk->icsk_backoff)
 434                return;
 435
 436        skb = tcp_rtx_queue_head(sk);
 437        if (WARN_ON_ONCE(!skb))
 438                return;
 439
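        /* Undo one step of exponential backoff, recompute the RTO, and then
         * either re-arm the retransmit timer with whatever time remains or
         * retransmit immediately (TCP-LD, RFC 6069).
         */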
 440        icsk->icsk_backoff--;
 441        icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
 442        icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 443
 444        tcp_mstamp_refresh(tp);
 445        delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 446        remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
 447
 448        if (remaining > 0) {
 449                inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 450                                          remaining, TCP_RTO_MAX);
 451        } else {
 452                /* RTO revert clocked out retransmission.
 453                 * Will retransmit now.
 454                 */
 455                tcp_retransmit_timer(sk);
 456        }
 457}
 458EXPORT_SYMBOL(tcp_ld_RTO_revert);
 459
 460/*
 461 * This routine is called by the ICMP module when it gets some
 462 * sort of error condition.  If err < 0 then the socket should
 463 * be closed and the error returned to the user.  If err > 0
  464 * it's just the icmp type << 8 | icmp code.  After adjustment the
 465 * header points to the first 8 bytes of the tcp header.  We need
 466 * to find the appropriate port.
 467 *
 468 * The locking strategy used here is very "optimistic". When
  469 * someone else accesses the socket, the ICMP is just dropped,
 470 * and for some paths there is no check at all.
  471 * A more general error queue that queues errors for later handling
 472 * is probably better.
 473 *
 474 */
 475
 476int tcp_v4_err(struct sk_buff *skb, u32 info)
 477{
 478        const struct iphdr *iph = (const struct iphdr *)skb->data;
 479        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 480        struct tcp_sock *tp;
 481        const int type = icmp_hdr(skb)->type;
 482        const int code = icmp_hdr(skb)->code;
 483        struct sock *sk;
 484        struct request_sock *fastopen;
 485        u32 seq, snd_una;
 486        int err;
 487        struct net *net = dev_net(skb->dev);
 488
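        /* The ICMP payload quotes the header of a packet *we* sent, so in the
         * inner header daddr/dest identify the remote peer and saddr/source
         * identify the local socket being looked up.
         */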
 489        sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
 490                                       iph->daddr, th->dest, iph->saddr,
 491                                       ntohs(th->source), inet_iif(skb), 0);
 492        if (!sk) {
 493                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 494                return -ENOENT;
 495        }
 496        if (sk->sk_state == TCP_TIME_WAIT) {
 497                /* To increase the counter of ignored icmps for TCP-AO */
 498                tcp_ao_ignore_icmp(sk, AF_INET, type, code);
 499                inet_twsk_put(inet_twsk(sk));
 500                return 0;
 501        }
 502        seq = ntohl(th->seq);
 503        if (sk->sk_state == TCP_NEW_SYN_RECV) {
 504                tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
 505                                     type == ICMP_TIME_EXCEEDED ||
 506                                     (type == ICMP_DEST_UNREACH &&
 507                                      (code == ICMP_NET_UNREACH ||
 508                                       code == ICMP_HOST_UNREACH)));
 509                return 0;
 510        }
 511
 512        if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
 513                sock_put(sk);
 514                return 0;
 515        }
 516
 517        bh_lock_sock(sk);
 518        /* If too many ICMPs get dropped on busy
 519         * servers this needs to be solved differently.
  520         * We do take care of the PMTU discovery (RFC1191) special case:
  521         * we can receive locally generated ICMP messages while the socket is held.
 522         */
 523        if (sock_owned_by_user(sk)) {
 524                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 525                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 526        }
 527        if (sk->sk_state == TCP_CLOSE)
 528                goto out;
 529
 530        if (static_branch_unlikely(&ip4_min_ttl)) {
 531                /* min_ttl can be changed concurrently from do_ip_setsockopt() */
 532                if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
 533                        __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 534                        goto out;
 535                }
 536        }
 537
 538        tp = tcp_sk(sk);
  539        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
 540        fastopen = rcu_dereference(tp->fastopen_rsk);
 541        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 542        if (sk->sk_state != TCP_LISTEN &&
 543            !between(seq, snd_una, tp->snd_nxt)) {
 544                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 545                goto out;
 546        }
 547
 548        switch (type) {
 549        case ICMP_REDIRECT:
 550                if (!sock_owned_by_user(sk))
 551                        do_redirect(skb, sk);
 552                goto out;
 553        case ICMP_SOURCE_QUENCH:
 554                /* Just silently ignore these. */
 555                goto out;
 556        case ICMP_PARAMETERPROB:
 557                err = EPROTO;
 558                break;
 559        case ICMP_DEST_UNREACH:
 560                if (code > NR_ICMP_UNREACH)
 561                        goto out;
 562
 563                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 564                        /* We are not interested in TCP_LISTEN and open_requests
  565                         * (SYN-ACKs sent out by Linux are always < 576 bytes, so
 566                         * they should go through unfragmented).
 567                         */
 568                        if (sk->sk_state == TCP_LISTEN)
 569                                goto out;
 570
 571                        WRITE_ONCE(tp->mtu_info, info);
 572                        if (!sock_owned_by_user(sk)) {
 573                                tcp_v4_mtu_reduced(sk);
 574                        } else {
 575                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 576                                        sock_hold(sk);
 577                        }
 578                        goto out;
 579                }
 580
 581                err = icmp_err_convert[code].errno;
 582                /* check if this ICMP message allows revert of backoff.
 583                 * (see RFC 6069)
 584                 */
 585                if (!fastopen &&
 586                    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
 587                        tcp_ld_RTO_revert(sk, seq);
 588                break;
 589        case ICMP_TIME_EXCEEDED:
 590                err = EHOSTUNREACH;
 591                break;
 592        default:
 593                goto out;
 594        }
 595
 596        switch (sk->sk_state) {
 597        case TCP_SYN_SENT:
 598        case TCP_SYN_RECV:
 599                /* Only in fast or simultaneous open. If a fast open socket is
 600                 * already accepted it is treated as a connected one below.
 601                 */
 602                if (fastopen && !fastopen->sk)
 603                        break;
 604
 605                ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
 606
 607                if (!sock_owned_by_user(sk)) {
 608                        WRITE_ONCE(sk->sk_err, err);
 609
 610                        sk_error_report(sk);
 611
 612                        tcp_done(sk);
 613                } else {
 614                        WRITE_ONCE(sk->sk_err_soft, err);
 615                }
 616                goto out;
 617        }
 618
 619        /* If we've already connected we will keep trying
 620         * until we time out, or the user gives up.
 621         *
  622         * rfc1122 4.2.3.9 allows us to consider as hard errors
 623         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 624         * but it is obsoleted by pmtu discovery).
 625         *
  626         * Note that in the modern internet, where routing is unreliable
  627         * and broken firewalls sit in each dark corner, sending random
  628         * errors ordered by their masters, even these two messages finally lose
  629         * their original sense (even Linux sends invalid PORT_UNREACHs).
 630         *
 631         * Now we are in compliance with RFCs.
 632         *                                                      --ANK (980905)
 633         */
 634
 635        if (!sock_owned_by_user(sk) &&
 636            inet_test_bit(RECVERR, sk)) {
 637                WRITE_ONCE(sk->sk_err, err);
 638                sk_error_report(sk);
 639        } else  { /* Only an error on timeout */
 640                WRITE_ONCE(sk->sk_err_soft, err);
 641        }
 642
 643out:
 644        bh_unlock_sock(sk);
 645        sock_put(sk);
 646        return 0;
 647}
 648
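/* Prepare the TCP checksum for later completion (e.g. by checksum offload):
 * store the complemented pseudo-header sum in th->check and record where the
 * checksum field lives within the packet.
 */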
 649void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 650{
 651        struct tcphdr *th = tcp_hdr(skb);
 652
 653        th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 654        skb->csum_start = skb_transport_header(skb) - skb->head;
 655        skb->csum_offset = offsetof(struct tcphdr, check);
 656}
 657
 658/* This routine computes an IPv4 TCP checksum. */
 659void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 660{
 661        const struct inet_sock *inet = inet_sk(sk);
 662
 663        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 664}
 665EXPORT_SYMBOL(tcp_v4_send_check);
 666
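/* Number of 32-bit words needed to hold a maximal (40-byte) TCP option block
 * in the replies built below.
 */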
 667#define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
 668
 669static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
 670                                 const struct tcp_ao_hdr *aoh,
 671                                 struct ip_reply_arg *arg, struct tcphdr *reply,
 672                                 __be32 reply_options[REPLY_OPTIONS_LEN])
 673{
 674#ifdef CONFIG_TCP_AO
 675        int sdif = tcp_v4_sdif(skb);
 676        int dif = inet_iif(skb);
 677        int l3index = sdif ? dif : 0;
 678        bool allocated_traffic_key;
 679        struct tcp_ao_key *key;
 680        char *traffic_key;
 681        bool drop = true;
 682        u32 ao_sne = 0;
 683        u8 keyid;
 684
 685        rcu_read_lock();
 686        if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
 687                                 &key, &traffic_key, &allocated_traffic_key,
 688                                 &keyid, &ao_sne))
 689                goto out;
 690
 691        reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
 692                                 (aoh->rnext_keyid << 8) | keyid);
 693        arg->iov[0].iov_len += tcp_ao_len_aligned(key);
 694        reply->doff = arg->iov[0].iov_len / 4;
 695
 696        if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
 697                            key, traffic_key,
 698                            (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
 699                            (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
 700                            reply, ao_sne))
 701                goto out;
 702        drop = false;
 703out:
 704        rcu_read_unlock();
 705        if (allocated_traffic_key)
 706                kfree(traffic_key);
 707        return drop;
 708#else
 709        return true;
 710#endif
 711}
 712
 713/*
 714 *      This routine will send an RST to the other tcp.
 715 *
  716 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
  717 *                    for the reset?
  718 *      Answer: if a packet caused an RST, it is not for a socket
  719 *              existing in our system; if it is matched to a socket,
  720 *              it is just a duplicate segment or a bug in the other side's TCP.
  721 *              So we build the reply based only on the parameters that
  722 *              arrived with the segment.
 723 *      Exception: precedence violation. We do not implement it in any case.
 724 */
 725
 726static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 727{
 728        const struct tcphdr *th = tcp_hdr(skb);
 729        struct {
 730                struct tcphdr th;
 731                __be32 opt[REPLY_OPTIONS_LEN];
 732        } rep;
 733        const __u8 *md5_hash_location = NULL;
 734        const struct tcp_ao_hdr *aoh;
 735        struct ip_reply_arg arg;
 736#ifdef CONFIG_TCP_MD5SIG
 737        struct tcp_md5sig_key *key = NULL;
 738        unsigned char newhash[16];
 739        struct sock *sk1 = NULL;
 740        int genhash;
 741#endif
 742        u64 transmit_time = 0;
 743        struct sock *ctl_sk;
 744        struct net *net;
 745        u32 txhash = 0;
 746
 747        /* Never send a reset in response to a reset. */
 748        if (th->rst)
 749                return;
 750
  751        /* If sk is not NULL, it means we did a successful lookup and the
  752         * incoming route had to be correct. prequeue might have dropped our dst.
 753         */
 754        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 755                return;
 756
 757        /* Swap the send and the receive. */
 758        memset(&rep, 0, sizeof(rep));
 759        rep.th.dest   = th->source;
 760        rep.th.source = th->dest;
 761        rep.th.doff   = sizeof(struct tcphdr) / 4;
 762        rep.th.rst    = 1;
 763
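        /* RFC 793 reset generation: if the offending segment carried an ACK,
         * the RST reuses that ACK value as its sequence number; otherwise the
         * RST keeps sequence number 0 and acknowledges everything the segment
         * occupied (its data length, plus one each for SYN and FIN).
         */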
 764        if (th->ack) {
 765                rep.th.seq = th->ack_seq;
 766        } else {
 767                rep.th.ack = 1;
 768                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 769                                       skb->len - (th->doff << 2));
 770        }
 771
 772        memset(&arg, 0, sizeof(arg));
 773        arg.iov[0].iov_base = (unsigned char *)&rep;
 774        arg.iov[0].iov_len  = sizeof(rep.th);
 775
 776        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 777
 778        /* Invalid TCP option size or twice included auth */
 779        if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
 780                return;
 781
 782        if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
 783                return;
 784
 785#ifdef CONFIG_TCP_MD5SIG
 786        rcu_read_lock();
 787        if (sk && sk_fullsock(sk)) {
 788                const union tcp_md5_addr *addr;
 789                int l3index;
 790
  791                /* If sdif is set, the packet ingressed via a device
 792                 * in an L3 domain and inet_iif is set to it.
 793                 */
 794                l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 795                addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 796                key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
 797        } else if (md5_hash_location) {
 798                const union tcp_md5_addr *addr;
 799                int sdif = tcp_v4_sdif(skb);
 800                int dif = inet_iif(skb);
 801                int l3index;
 802
 803                /*
  804                 * The active side is lost. Try to find the listening socket through
  805                 * the source port, and then find the md5 key through the listening socket.
  806                 * We do not lose security here:
  807                 * the incoming packet is checked against the md5 hash of the key we find,
  808                 * and no RST is generated if the md5 hash doesn't match.
 809                 */
 810                sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
 811                                             NULL, 0, ip_hdr(skb)->saddr,
 812                                             th->source, ip_hdr(skb)->daddr,
 813                                             ntohs(th->source), dif, sdif);
  814                /* don't send an rst if we can't find a key */
 815                if (!sk1)
 816                        goto out;
 817
  818                /* If sdif is set, the packet ingressed via a device
 819                 * in an L3 domain and dif is set to it.
 820                 */
 821                l3index = sdif ? dif : 0;
 822                addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 823                key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
 824                if (!key)
 825                        goto out;
 826
 827
 828                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 829                if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
 830                        goto out;
 831
 832        }
 833
 834        if (key) {
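                /* MD5 option layout: NOP, NOP, kind (TCPOPT_MD5SIG), length
                 * (TCPOLEN_MD5SIG), followed by the 16-byte digest that
                 * tcp_v4_md5_hash_hdr() writes into rep.opt[1..4].
                 */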
 835                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 836                                   (TCPOPT_NOP << 16) |
 837                                   (TCPOPT_MD5SIG << 8) |
 838                                   TCPOLEN_MD5SIG);
  839                /* Update the reply length and the length the header advertises (doff) */
 840                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 841                rep.th.doff = arg.iov[0].iov_len / 4;
 842
 843                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 844                                     key, ip_hdr(skb)->saddr,
 845                                     ip_hdr(skb)->daddr, &rep.th);
 846        }
 847#endif
 848        /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
 849        if (rep.opt[0] == 0) {
 850                __be32 mrst = mptcp_reset_option(skb);
 851
 852                if (mrst) {
 853                        rep.opt[0] = mrst;
 854                        arg.iov[0].iov_len += sizeof(mrst);
 855                        rep.th.doff = arg.iov[0].iov_len / 4;
 856                }
 857        }
 858
 859        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 860                                      ip_hdr(skb)->saddr, /* XXX */
 861                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 862        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 863        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 864
  865        /* When the socket is gone, all binding information is lost and
  866         * routing might fail in this case. No choice here: if we choose to force
  867         * the input interface, we will misroute in the case of an asymmetric route.
 868         */
 869        if (sk) {
 870                arg.bound_dev_if = sk->sk_bound_dev_if;
 871                if (sk_fullsock(sk))
 872                        trace_tcp_send_reset(sk, skb);
 873        }
 874
 875        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 876                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 877
 878        arg.tos = ip_hdr(skb)->tos;
 879        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 880        local_bh_disable();
 881        ctl_sk = this_cpu_read(ipv4_tcp_sk);
 882        sock_net_set(ctl_sk, net);
 883        if (sk) {
 884                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 885                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
 886                ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 887                                   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
 888                transmit_time = tcp_transmit_time(sk);
 889                xfrm_sk_clone_policy(ctl_sk, sk);
 890                txhash = (sk->sk_state == TCP_TIME_WAIT) ?
 891                         inet_twsk(sk)->tw_txhash : sk->sk_txhash;
 892        } else {
 893                ctl_sk->sk_mark = 0;
 894                ctl_sk->sk_priority = 0;
 895        }
 896        ip_send_unicast_reply(ctl_sk,
 897                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 898                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 899                              &arg, arg.iov[0].iov_len,
 900                              transmit_time, txhash);
 901
 902        xfrm_sk_free_policy(ctl_sk);
 903        sock_net_set(ctl_sk, &init_net);
 904        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 905        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 906        local_bh_enable();
 907
 908#ifdef CONFIG_TCP_MD5SIG
 909out:
 910        rcu_read_unlock();
 911#endif
 912}
 913
  914/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
  915   outside socket context, is certainly ugly. What can I do?
 916 */
 917
 918static void tcp_v4_send_ack(const struct sock *sk,
 919                            struct sk_buff *skb, u32 seq, u32 ack,
 920                            u32 win, u32 tsval, u32 tsecr, int oif,
 921                            struct tcp_key *key,
 922                            int reply_flags, u8 tos, u32 txhash)
 923{
 924        const struct tcphdr *th = tcp_hdr(skb);
 925        struct {
 926                struct tcphdr th;
 927                __be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
 928        } rep;
 929        struct net *net = sock_net(sk);
 930        struct ip_reply_arg arg;
 931        struct sock *ctl_sk;
 932        u64 transmit_time;
 933
 934        memset(&rep.th, 0, sizeof(struct tcphdr));
 935        memset(&arg, 0, sizeof(arg));
 936
 937        arg.iov[0].iov_base = (unsigned char *)&rep;
 938        arg.iov[0].iov_len  = sizeof(rep.th);
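        /* When echoing a timestamp, the option block is NOP, NOP, kind
         * (TCPOPT_TIMESTAMP), length (TCPOLEN_TIMESTAMP), TSval, TSecr:
         * 12 bytes in total, which keeps any following option 4-byte aligned.
         */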
 939        if (tsecr) {
 940                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 941                                   (TCPOPT_TIMESTAMP << 8) |
 942                                   TCPOLEN_TIMESTAMP);
 943                rep.opt[1] = htonl(tsval);
 944                rep.opt[2] = htonl(tsecr);
 945                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 946        }
 947
 948        /* Swap the send and the receive. */
 949        rep.th.dest    = th->source;
 950        rep.th.source  = th->dest;
 951        rep.th.doff    = arg.iov[0].iov_len / 4;
 952        rep.th.seq     = htonl(seq);
 953        rep.th.ack_seq = htonl(ack);
 954        rep.th.ack     = 1;
 955        rep.th.window  = htons(win);
 956
 957#ifdef CONFIG_TCP_MD5SIG
 958        if (tcp_key_is_md5(key)) {
 959                int offset = (tsecr) ? 3 : 0;
 960
 961                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 962                                          (TCPOPT_NOP << 16) |
 963                                          (TCPOPT_MD5SIG << 8) |
 964                                          TCPOLEN_MD5SIG);
 965                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 966                rep.th.doff = arg.iov[0].iov_len/4;
 967
 968                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 969                                    key->md5_key, ip_hdr(skb)->saddr,
 970                                    ip_hdr(skb)->daddr, &rep.th);
 971        }
 972#endif
 973#ifdef CONFIG_TCP_AO
 974        if (tcp_key_is_ao(key)) {
 975                int offset = (tsecr) ? 3 : 0;
 976
 977                rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
 978                                          (tcp_ao_len(key->ao_key) << 16) |
 979                                          (key->ao_key->sndid << 8) |
 980                                          key->rcv_next);
 981                arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
 982                rep.th.doff = arg.iov[0].iov_len / 4;
 983
 984                tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
 985                                key->ao_key, key->traffic_key,
 986                                (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
 987                                (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
 988                                &rep.th, key->sne);
 989        }
 990#endif
 991        arg.flags = reply_flags;
 992        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 993                                      ip_hdr(skb)->saddr, /* XXX */
 994                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 995        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 996        if (oif)
 997                arg.bound_dev_if = oif;
 998        arg.tos = tos;
 999        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1000        local_bh_disable();
1001        ctl_sk = this_cpu_read(ipv4_tcp_sk);
1002        sock_net_set(ctl_sk, net);
1003        ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1004                           inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1005        ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1006                           inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1007        transmit_time = tcp_transmit_time(sk);
1008        ip_send_unicast_reply(ctl_sk,
1009                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
1010                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1011                              &arg, arg.iov[0].iov_len,
1012                              transmit_time, txhash);
1013
1014        sock_net_set(ctl_sk, &init_net);
1015        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1016        local_bh_enable();
1017}
1018
1019static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1020{
1021        struct inet_timewait_sock *tw = inet_twsk(sk);
1022        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1023        struct tcp_key key = {};
1024#ifdef CONFIG_TCP_AO
1025        struct tcp_ao_info *ao_info;
1026
1027        if (static_branch_unlikely(&tcp_ao_needed.key)) {
1028                /* FIXME: the segment to-be-acked is not verified yet */
1029                ao_info = rcu_dereference(tcptw->ao_info);
1030                if (ao_info) {
1031                        const struct tcp_ao_hdr *aoh;
1032
1033                        if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1034                                inet_twsk_put(tw);
1035                                return;
1036                        }
1037
1038                        if (aoh)
1039                                key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
1040                }
1041        }
1042        if (key.ao_key) {
1043                struct tcp_ao_key *rnext_key;
1044
1045                key.traffic_key = snd_other_key(key.ao_key);
1046                key.sne = READ_ONCE(ao_info->snd_sne);
1047                rnext_key = READ_ONCE(ao_info->rnext_key);
1048                key.rcv_next = rnext_key->rcvid;
1049                key.type = TCP_KEY_AO;
1050#else
1051        if (0) {
1052#endif
1053#ifdef CONFIG_TCP_MD5SIG
1054        } else if (static_branch_unlikely(&tcp_md5_needed.key)) {
1055                key.md5_key = tcp_twsk_md5_key(tcptw);
1056                if (key.md5_key)
1057                        key.type = TCP_KEY_MD5;
1058#endif
1059        }
1060
1061        tcp_v4_send_ack(sk, skb,
1062                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
1063                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1064                        tcp_tw_tsval(tcptw),
1065                        tcptw->tw_ts_recent,
1066                        tw->tw_bound_dev_if, &key,
1067                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1068                        tw->tw_tos,
1069                        tw->tw_txhash);
1070
1071        inet_twsk_put(tw);
1072}
1073
1074static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1075                                  struct request_sock *req)
1076{
1077        struct tcp_key key = {};
1078
1079        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1080         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1081         */
1082        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1083                                             tcp_sk(sk)->snd_nxt;
1084
1085#ifdef CONFIG_TCP_AO
1086        if (static_branch_unlikely(&tcp_ao_needed.key) &&
1087            tcp_rsk_used_ao(req)) {
1088                const union tcp_md5_addr *addr;
1089                const struct tcp_ao_hdr *aoh;
1090                int l3index;
1091
1092                /* Invalid TCP option size or twice included auth */
1093                if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1094                        return;
1095                if (!aoh)
1096                        return;
1097
1098                addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1099                l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1100                key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1101                                              aoh->rnext_keyid, -1);
1102                if (unlikely(!key.ao_key)) {
1103                        /* Send ACK with any matching MKT for the peer */
1104                        key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1105                        /* Matching key disappeared (user removed the key?)
 1106                         * let the handshake time out.
1107                         */
1108                        if (!key.ao_key) {
1109                                net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1110                                                     addr,
1111                                                     ntohs(tcp_hdr(skb)->source),
1112                                                     &ip_hdr(skb)->daddr,
1113                                                     ntohs(tcp_hdr(skb)->dest));
1114                                return;
1115                        }
1116                }
1117                key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1118                if (!key.traffic_key)
1119                        return;
1120
1121                key.type = TCP_KEY_AO;
1122                key.rcv_next = aoh->keyid;
1123                tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1124#else
1125        if (0) {
1126#endif
1127#ifdef CONFIG_TCP_MD5SIG
1128        } else if (static_branch_unlikely(&tcp_md5_needed.key)) {
1129                const union tcp_md5_addr *addr;
1130                int l3index;
1131
1132                addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1133                l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1134                key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1135                if (key.md5_key)
1136                        key.type = TCP_KEY_MD5;
1137#endif
1138        }
1139
1140        /* RFC 7323 2.3
1141         * The window field (SEG.WND) of every outgoing segment, with the
1142         * exception of <SYN> segments, MUST be right-shifted by
1143         * Rcv.Wind.Shift bits:
1144         */
1145        tcp_v4_send_ack(sk, skb, seq,
1146                        tcp_rsk(req)->rcv_nxt,
1147                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
1148                        tcp_rsk_tsval(tcp_rsk(req)),
1149                        READ_ONCE(req->ts_recent),
1150                        0, &key,
1151                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1152                        ip_hdr(skb)->tos,
1153                        READ_ONCE(tcp_rsk(req)->txhash));
1154        if (tcp_key_is_ao(&key))
1155                kfree(key.traffic_key);
1156}
1157
1158/*
1159 *      Send a SYN-ACK after having received a SYN.
1160 *      This still operates on a request_sock only, not on a big
1161 *      socket.
1162 */
1163static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1164                              struct flowi *fl,
1165                              struct request_sock *req,
1166                              struct tcp_fastopen_cookie *foc,
1167                              enum tcp_synack_type synack_type,
1168                              struct sk_buff *syn_skb)
1169{
1170        const struct inet_request_sock *ireq = inet_rsk(req);
1171        struct flowi4 fl4;
1172        int err = -1;
1173        struct sk_buff *skb;
1174        u8 tos;
1175
1176        /* First, grab a route. */
1177        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1178                return -1;
1179
1180        skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1181
1182        if (skb) {
1183                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1184
1185                tos = READ_ONCE(inet_sk(sk)->tos);
1186
1187                if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1188                        tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1189                              (tos & INET_ECN_MASK);
1190
1191                if (!INET_ECN_is_capable(tos) &&
1192                    tcp_bpf_ca_needs_ecn((struct sock *)req))
1193                        tos |= INET_ECN_ECT_0;
1194
1195                rcu_read_lock();
1196                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1197                                            ireq->ir_rmt_addr,
1198                                            rcu_dereference(ireq->ireq_opt),
1199                                            tos);
1200                rcu_read_unlock();
1201                err = net_xmit_eval(err);
1202        }
1203
1204        return err;
1205}
1206
1207/*
1208 *      IPv4 request_sock destructor.
1209 */
1210static void tcp_v4_reqsk_destructor(struct request_sock *req)
1211{
1212        kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1213}
1214
1215#ifdef CONFIG_TCP_MD5SIG
1216/*
1217 * RFC2385 MD5 checksumming requires a mapping of
1218 * IP address->MD5 Key.
1219 * We need to maintain these in the sk structure.
1220 */
1221
1222DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1223EXPORT_SYMBOL(tcp_md5_needed);
1224
1225static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1226{
1227        if (!old)
1228                return true;
1229
1230        /* l3index always overrides non-l3index */
1231        if (old->l3index && new->l3index == 0)
1232                return false;
1233        if (old->l3index == 0 && new->l3index)
1234                return true;
1235
1236        return old->prefixlen < new->prefixlen;
1237}
1238
1239/* Find the Key structure for an address.  */
1240struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1241                                           const union tcp_md5_addr *addr,
1242                                           int family, bool any_l3index)
1243{
1244        const struct tcp_sock *tp = tcp_sk(sk);
1245        struct tcp_md5sig_key *key;
1246        const struct tcp_md5sig_info *md5sig;
1247        __be32 mask;
1248        struct tcp_md5sig_key *best_match = NULL;
1249        bool match;
1250
1251        /* caller either holds rcu_read_lock() or socket lock */
1252        md5sig = rcu_dereference_check(tp->md5sig_info,
1253                                       lockdep_sock_is_held(sk));
1254        if (!md5sig)
1255                return NULL;
1256
1257        hlist_for_each_entry_rcu(key, &md5sig->head, node,
1258                                 lockdep_sock_is_held(sk)) {
1259                if (key->family != family)
1260                        continue;
1261                if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1262                    key->l3index != l3index)
1263                        continue;
1264                if (family == AF_INET) {
1265                        mask = inet_make_mask(key->prefixlen);
1266                        match = (key->addr.a4.s_addr & mask) ==
1267                                (addr->a4.s_addr & mask);
1268#if IS_ENABLED(CONFIG_IPV6)
1269                } else if (family == AF_INET6) {
1270                        match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1271                                                  key->prefixlen);
1272#endif
1273                } else {
1274                        match = false;
1275                }
1276
1277                if (match && better_md5_match(best_match, key))
1278                        best_match = key;
1279        }
1280        return best_match;
1281}
1282EXPORT_SYMBOL(__tcp_md5_do_lookup);
1283
1284static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1285                                                      const union tcp_md5_addr *addr,
1286                                                      int family, u8 prefixlen,
1287                                                      int l3index, u8 flags)
1288{
1289        const struct tcp_sock *tp = tcp_sk(sk);
1290        struct tcp_md5sig_key *key;
1291        unsigned int size = sizeof(struct in_addr);
1292        const struct tcp_md5sig_info *md5sig;
1293
1294        /* caller either holds rcu_read_lock() or socket lock */
1295        md5sig = rcu_dereference_check(tp->md5sig_info,
1296                                       lockdep_sock_is_held(sk));
1297        if (!md5sig)
1298                return NULL;
1299#if IS_ENABLED(CONFIG_IPV6)
1300        if (family == AF_INET6)
1301                size = sizeof(struct in6_addr);
1302#endif
1303        hlist_for_each_entry_rcu(key, &md5sig->head, node,
1304                                 lockdep_sock_is_held(sk)) {
1305                if (key->family != family)
1306                        continue;
1307                if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1308                        continue;
1309                if (key->l3index != l3index)
1310                        continue;
1311                if (!memcmp(&key->addr, addr, size) &&
1312                    key->prefixlen == prefixlen)
1313                        return key;
1314        }
1315        return NULL;
1316}
1317
1318struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1319                                         const struct sock *addr_sk)
1320{
1321        const union tcp_md5_addr *addr;
1322        int l3index;
1323
1324        l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1325                                                 addr_sk->sk_bound_dev_if);
1326        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1327        return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1328}
1329EXPORT_SYMBOL(tcp_v4_md5_lookup);
1330
1331static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1332{
1333        struct tcp_sock *tp = tcp_sk(sk);
1334        struct tcp_md5sig_info *md5sig;
1335
1336        md5sig = kmalloc(sizeof(*md5sig), gfp);
1337        if (!md5sig)
1338                return -ENOMEM;
1339
1340        sk_gso_disable(sk);
1341        INIT_HLIST_HEAD(&md5sig->head);
1342        rcu_assign_pointer(tp->md5sig_info, md5sig);
1343        return 0;
1344}
1345
1346/* This can be called on a newly created socket, from other files */
1347static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1348                            int family, u8 prefixlen, int l3index, u8 flags,
1349                            const u8 *newkey, u8 newkeylen, gfp_t gfp)
1350{
1351        /* Add Key to the list */
1352        struct tcp_md5sig_key *key;
1353        struct tcp_sock *tp = tcp_sk(sk);
1354        struct tcp_md5sig_info *md5sig;
1355
1356        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1357        if (key) {
1358                /* Pre-existing entry - just update that one.
1359                 * Note that the key might be used concurrently.
1360                 * data_race() is telling kcsan that we do not care about
1361                 * key mismatches, since changing the MD5 key on live flows
1362                 * can lead to packet drops.
1363                 */
1364                data_race(memcpy(key->key, newkey, newkeylen));
1365
1366                /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1367                 * Also note that a reader could catch the new key->keylen value
1368                 * but the old key->key[] contents; this is why we use __GFP_ZERO
1369                 * at sock_kmalloc() time below these lines.
1370                 */
1371                WRITE_ONCE(key->keylen, newkeylen);
1372
1373                return 0;
1374        }
1375
1376        md5sig = rcu_dereference_protected(tp->md5sig_info,
1377                                           lockdep_sock_is_held(sk));
1378
1379        key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1380        if (!key)
1381                return -ENOMEM;
1382
1383        memcpy(key->key, newkey, newkeylen);
1384        key->keylen = newkeylen;
1385        key->family = family;
1386        key->prefixlen = prefixlen;
1387        key->l3index = l3index;
1388        key->flags = flags;
1389        memcpy(&key->addr, addr,
1390               (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1391                                                                 sizeof(struct in_addr));
1392        hlist_add_head_rcu(&key->node, &md5sig->head);
1393        return 0;
1394}
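
/* A minimal sketch (not part of this file) of the reader side that the
 * data_race()/WRITE_ONCE() pairing above is written against: a reader such as
 * tcp_md5_hash_key() is assumed to sample the length once with READ_ONCE()
 * (pairing with the WRITE_ONCE() above) and to hash that many bytes of
 * key->key[].  Because the key was allocated with __GFP_ZERO, a reader that
 * races with an update and sees a larger keylen together with old key bytes
 * hashes zeroed padding rather than uninitialized memory.
 *
 *	static int example_hash_key(struct tcp_sigpool *hp,
 *				    const struct tcp_md5sig_key *key)
 *	{
 *		u8 keylen = READ_ONCE(key->keylen);
 *		struct scatterlist sg;
 *
 *		sg_init_one(&sg, key->key, keylen);
 *		ahash_request_set_crypt(hp->req, &sg, NULL, keylen);
 *		return crypto_ahash_update(hp->req);
 *	}
 */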
1395
1396int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1397                   int family, u8 prefixlen, int l3index, u8 flags,
1398                   const u8 *newkey, u8 newkeylen)
1399{
1400        struct tcp_sock *tp = tcp_sk(sk);
1401
1402        if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1403                if (tcp_md5_alloc_sigpool())
1404                        return -ENOMEM;
1405
1406                if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1407                        tcp_md5_release_sigpool();
1408                        return -ENOMEM;
1409                }
1410
1411                if (!static_branch_inc(&tcp_md5_needed.key)) {
1412                        struct tcp_md5sig_info *md5sig;
1413
1414                        md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1415                        rcu_assign_pointer(tp->md5sig_info, NULL);
1416                        kfree_rcu(md5sig, rcu);
1417                        tcp_md5_release_sigpool();
1418                        return -EUSERS;
1419                }
1420        }
1421
1422        return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1423                                newkey, newkeylen, GFP_KERNEL);
1424}
1425EXPORT_SYMBOL(tcp_md5_do_add);
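
/* Why the static key dance above: sockets that never configure an MD5 key
 * should pay only a patched-out branch on the fast paths.  A rough sketch of
 * the gate (the real one is the tcp_md5_do_lookup() wrapper in
 * include/net/tcp.h):
 *
 *	if (!static_branch_unlikely(&tcp_md5_needed.key))
 *		return NULL;
 *
 * i.e. while no key is configured anywhere, a lookup costs a single
 * jump-label branch.  That is why the first key added to a socket bumps
 * tcp_md5_needed here, and the last free (see tcp_md5sig_info_free_rcu()
 * below) lets it drop again.
 */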
1426
1427int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1428                     int family, u8 prefixlen, int l3index,
1429                     struct tcp_md5sig_key *key)
1430{
1431        struct tcp_sock *tp = tcp_sk(sk);
1432
1433        if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1434                tcp_md5_add_sigpool();
1435
1436                if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1437                        tcp_md5_release_sigpool();
1438                        return -ENOMEM;
1439                }
1440
1441                if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1442                        struct tcp_md5sig_info *md5sig;
1443
1444                        md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1445                        net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1446                        rcu_assign_pointer(tp->md5sig_info, NULL);
1447                        kfree_rcu(md5sig, rcu);
1448                        tcp_md5_release_sigpool();
1449                        return -EUSERS;
1450                }
1451        }
1452
1453        return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1454                                key->flags, key->key, key->keylen,
1455                                sk_gfp_mask(sk, GFP_ATOMIC));
1456}
1457EXPORT_SYMBOL(tcp_md5_key_copy);
1458
1459int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1460                   u8 prefixlen, int l3index, u8 flags)
1461{
1462        struct tcp_md5sig_key *key;
1463
1464        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1465        if (!key)
1466                return -ENOENT;
1467        hlist_del_rcu(&key->node);
1468        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1469        kfree_rcu(key, rcu);
1470        return 0;
1471}
1472EXPORT_SYMBOL(tcp_md5_do_del);
1473
1474void tcp_clear_md5_list(struct sock *sk)
1475{
1476        struct tcp_sock *tp = tcp_sk(sk);
1477        struct tcp_md5sig_key *key;
1478        struct hlist_node *n;
1479        struct tcp_md5sig_info *md5sig;
1480
1481        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1482
1483        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1484                hlist_del_rcu(&key->node);
1485                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1486                kfree_rcu(key, rcu);
1487        }
1488}
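
/* The hlist_del_rcu()/kfree_rcu() pairs above and in tcp_md5_do_del() exist
 * because lookups run under rcu_read_lock() only, without the socket lock.
 * A rough sketch of such a reader (error handling omitted):
 *
 *	rcu_read_lock();
 *	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
 *	if (key)
 *		keylen = READ_ONCE(key->keylen);
 *	rcu_read_unlock();
 *
 * A reader that raced with a deletion may still dereference the key; its
 * memory is only returned after a grace period has elapsed.
 */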
1489
1490static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1491                                 sockptr_t optval, int optlen)
1492{
1493        struct tcp_md5sig cmd;
1494        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1495        const union tcp_md5_addr *addr;
1496        u8 prefixlen = 32;
1497        int l3index = 0;
1498        bool l3flag;
1499        u8 flags;
1500
1501        if (optlen < sizeof(cmd))
1502                return -EINVAL;
1503
1504        if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1505                return -EFAULT;
1506
1507        if (sin->sin_family != AF_INET)
1508                return -EINVAL;
1509
1510        flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1511        l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1512
1513        if (optname == TCP_MD5SIG_EXT &&
1514            cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1515                prefixlen = cmd.tcpm_prefixlen;
1516                if (prefixlen > 32)
1517                        return -EINVAL;
1518        }
1519
1520        if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1521            cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1522                struct net_device *dev;
1523
1524                rcu_read_lock();
1525                dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1526                if (dev && netif_is_l3_master(dev))
1527                        l3index = dev->ifindex;
1528
1529                rcu_read_unlock();
1530
1531                /* ok to check the set/not-set result outside of rcu;
1532                 * right now the device MUST be an L3 master
1533                 */
1534                if (!dev || !l3index)
1535                        return -EINVAL;
1536        }
1537
1538        addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1539
1540        if (!cmd.tcpm_keylen)
1541                return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1542
1543        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1544                return -EINVAL;
1545
1546        /* Don't allow keys for peers that have a matching TCP-AO key.
1547         * See the comment in tcp_ao_add_cmd()
1548         */
1549        if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1550                return -EKEYREJECTED;
1551
1552        return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1553                              cmd.tcpm_key, cmd.tcpm_keylen);
1554}
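
/* A minimal userspace sketch of what the parser above consumes, assuming a
 * libc that exposes the TCP_MD5SIG_EXT uapi (struct tcp_md5sig and the
 * TCP_MD5SIG_FLAG_* bits) through <netinet/tcp.h>; set_md5_key() is an
 * illustrative helper name and length/error checking is omitted:
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *
 *	static int set_md5_key(int fd, const struct sockaddr_in *peer,
 *			       const void *secret, unsigned int len)
 *	{
 *		struct tcp_md5sig cmd = {};
 *
 *		memcpy(&cmd.tcpm_addr, peer, sizeof(*peer));
 *		cmd.tcpm_flags = TCP_MD5SIG_FLAG_PREFIX;
 *		cmd.tcpm_prefixlen = 24;
 *		cmd.tcpm_keylen = len;
 *		memcpy(cmd.tcpm_key, secret, len);
 *		return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG_EXT,
 *				  &cmd, sizeof(cmd));
 *	}
 *
 * tcpm_prefixlen = 24 makes the key cover the peer's whole /24, len must not
 * exceed TCP_MD5SIG_MAXKEYLEN, and passing tcpm_keylen == 0 instead deletes
 * the matching key (see tcp_md5_do_del() above).
 */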
1555
1556static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1557                                   __be32 daddr, __be32 saddr,
1558                                   const struct tcphdr *th, int nbytes)
1559{
1560        struct tcp4_pseudohdr *bp;
1561        struct scatterlist sg;
1562        struct tcphdr *_th;
1563
1564        bp = hp->scratch;
1565        bp->saddr = saddr;
1566        bp->daddr = daddr;
1567        bp->pad = 0;
1568        bp->protocol = IPPROTO_TCP;
1569        bp->len = cpu_to_be16(nbytes);
1570
1571        _th = (struct tcphdr *)(bp + 1);
1572        memcpy(_th, th, sizeof(*th));
1573        _th->check = 0;
1574
1575        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1576        ahash_request_set_crypt(hp->req, &sg, NULL,
1577                                sizeof(*bp) + sizeof(*th));
1578        return crypto_ahash_update(hp->req);
1579}
1580
1581static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1582                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
1583{
1584        struct tcp_sigpool hp;
1585
1586        if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1587                goto clear_hash_nostart;
1588
1589        if (crypto_ahash_init(hp.req))
1590                goto clear_hash;
1591        if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1592                goto clear_hash;
1593        if (tcp_md5_hash_key(&hp, key))
1594                goto clear_hash;
1595        ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1596        if (crypto_ahash_final(hp.req))
1597                goto clear_hash;
1598
1599        tcp_sigpool_end(&hp);
1600        return 0;
1601
1602clear_hash:
1603        tcp_sigpool_end(&hp);
1604clear_hash_nostart:
1605        memset(md5_hash, 0, 16);
1606        return 1;
1607}
1608
1609int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1610                        const struct sock *sk,
1611                        const struct sk_buff *skb)
1612{
1613        const struct tcphdr *th = tcp_hdr(skb);
1614        struct tcp_sigpool hp;
1615        __be32 saddr, daddr;
1616
1617        if (sk) { /* valid for establish/request sockets */
1618                saddr = sk->sk_rcv_saddr;
1619                daddr = sk->sk_daddr;
1620        } else {
1621                const struct iphdr *iph = ip_hdr(skb);
1622                saddr = iph->saddr;
1623                daddr = iph->daddr;
1624        }
1625
1626        if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1627                goto clear_hash_nostart;
1628
1629        if (crypto_ahash_init(hp.req))
1630                goto clear_hash;
1631
1632        if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1633                goto clear_hash;
1634        if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1635                goto clear_hash;
1636        if (tcp_md5_hash_key(&hp, key))
1637                goto clear_hash;
1638        ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1639        if (crypto_ahash_final(hp.req))
1640                goto clear_hash;
1641
1642        tcp_sigpool_end(&hp);
1643        return 0;
1644
1645clear_hash:
1646        tcp_sigpool_end(&hp);
1647clear_hash_nostart:
1648        memset(md5_hash, 0, 16);
1649        return 1;
1650}
1651EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
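
/* What the helpers above feed into MD5, in order (this follows RFC 2385; the
 * pseudo-header layout is struct tcp4_pseudohdr):
 *
 *	1. the IPv4 pseudo-header built in tcp_v4_md5_hash_headers():
 *	   saddr, daddr, a zero pad byte, IPPROTO_TCP, and the TCP length
 *	   (header-only for tcp_v4_md5_hash_hdr(), header + payload for
 *	   tcp_v4_md5_hash_skb());
 *	2. the fixed 20-byte TCP header, options excluded, with th->check
 *	   forced to 0;
 *	3. the segment payload (tcp_sigpool_hash_skb_data(), skb case only);
 *	4. the configured key (tcp_md5_hash_key()).
 */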
1652
1653#endif
1654
1655static void tcp_v4_init_req(struct request_sock *req,
1656                            const struct sock *sk_listener,
1657                            struct sk_buff *skb)
1658{
1659        struct inet_request_sock *ireq = inet_rsk(req);
1660        struct net *net = sock_net(sk_listener);
1661
1662        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1663        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1664        RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1665}
1666
1667static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1668                                          struct sk_buff *skb,
1669                                          struct flowi *fl,
1670                                          struct request_sock *req)
1671{
1672        tcp_v4_init_req(req, sk, skb);
1673
1674        if (security_inet_conn_request(sk, skb, req))
1675                return NULL;
1676
1677        return inet_csk_route_req(sk, &fl->u.ip4, req);
1678}
1679
1680struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1681        .family         =       PF_INET,
1682        .obj_size       =       sizeof(struct tcp_request_sock),
1683        .rtx_syn_ack    =       tcp_rtx_synack,
1684        .send_ack       =       tcp_v4_reqsk_send_ack,
1685        .destructor     =       tcp_v4_reqsk_destructor,
1686        .send_reset     =       tcp_v4_send_reset,
1687        .syn_ack_timeout =      tcp_syn_ack_timeout,
1688};
1689
1690const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1691        .mss_clamp      =       TCP_MSS_DEFAULT,
1692#ifdef CONFIG_TCP_MD5SIG
1693        .req_md5_lookup =       tcp_v4_md5_lookup,
1694        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1695#endif
1696#ifdef CONFIG_TCP_AO
1697        .ao_lookup      =       tcp_v4_ao_lookup_rsk,
1698        .ao_calc_key    =       tcp_v4_ao_calc_key_rsk,
1699        .ao_synack_hash =       tcp_v4_ao_synack_hash,
1700#endif
1701#ifdef CONFIG_SYN_COOKIES
1702        .cookie_init_seq =      cookie_v4_init_sequence,
1703#endif
1704        .route_req      =       tcp_v4_route_req,
1705        .init_seq       =       tcp_v4_init_seq,
1706        .init_ts_off    =       tcp_v4_init_ts_off,
1707        .send_synack    =       tcp_v4_send_synack,
1708};
1709
1710int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1711{
1712        /* Never answer SYNs sent to broadcast or multicast */
1713        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1714                goto drop;
1715
1716        return tcp_conn_request(&tcp_request_sock_ops,
1717                                &tcp_request_sock_ipv4_ops, sk, skb);
1718
1719drop:
1720        tcp_listendrop(sk);
1721        return 0;
1722}
1723EXPORT_SYMBOL(tcp_v4_conn_request);
1724
1725
1726/*
1727 * The three-way handshake has completed - we got a valid ACK -
1728 * now create the new socket.
1729 */
1730struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1731                                  struct request_sock *req,
1732                                  struct dst_entry *dst,
1733                                  struct request_sock *req_unhash,
1734                                  bool *own_req)
1735{
1736        struct inet_request_sock *ireq;
1737        bool found_dup_sk = false;
1738        struct inet_sock *newinet;
1739        struct tcp_sock *newtp;
1740        struct sock *newsk;
1741#ifdef CONFIG_TCP_MD5SIG
1742        const union tcp_md5_addr *addr;
1743        struct tcp_md5sig_key *key;
1744        int l3index;
1745#endif
1746        struct ip_options_rcu *inet_opt;
1747
1748        if (sk_acceptq_is_full(sk))
1749                goto exit_overflow;
1750
1751        newsk = tcp_create_openreq_child(sk, req, skb);
1752        if (!newsk)
1753                goto exit_nonewsk;
1754
1755        newsk->sk_gso_type = SKB_GSO_TCPV4;
1756        inet_sk_rx_dst_set(newsk, skb);
1757
1758        newtp                 = tcp_sk(newsk);
1759        newinet               = inet_sk(newsk);
1760        ireq                  = inet_rsk(req);
1761        sk_daddr_set(newsk, ireq->ir_rmt_addr);
1762        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1763        newsk->sk_bound_dev_if = ireq->ir_iif;
1764        newinet->inet_saddr   = ireq->ir_loc_addr;
1765        inet_opt              = rcu_dereference(ireq->ireq_opt);
1766        RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1767        newinet->mc_index     = inet_iif(skb);
1768        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1769        newinet->rcv_tos      = ip_hdr(skb)->tos;
1770        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1771        if (inet_opt)
1772                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1773        atomic_set(&newinet->inet_id, get_random_u16());
1774
1775        /* Set ToS of the new socket based upon the value of incoming SYN.
1776         * ECT bits are set later in tcp_init_transfer().
1777         */
1778        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1779                newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1780
1781        if (!dst) {
1782                dst = inet_csk_route_child_sock(sk, newsk, req);
1783                if (!dst)
1784                        goto put_and_exit;
1785        } else {
1786                /* syncookie case : see end of cookie_v4_check() */
1787        }
1788        sk_setup_caps(newsk, dst);
1789
1790        tcp_ca_openreq_child(newsk, dst);
1791
1792        tcp_sync_mss(newsk, dst_mtu(dst));
1793        newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1794
1795        tcp_initialize_rcv_mss(newsk);
1796
1797#ifdef CONFIG_TCP_MD5SIG
1798        l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1799        /* Copy over the MD5 key from the original socket */
1800        addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1801        key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1802        if (key && !tcp_rsk_used_ao(req)) {
1803                if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1804                        goto put_and_exit;
1805                sk_gso_disable(newsk);
1806        }
1807#endif
1808#ifdef CONFIG_TCP_AO
1809        if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1810                goto put_and_exit; /* OOM, release back memory */
1811#endif
1812
1813        if (__inet_inherit_port(sk, newsk) < 0)
1814                goto put_and_exit;
1815        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1816                                       &found_dup_sk);
1817        if (likely(*own_req)) {
1818                tcp_move_syn(newtp, req);
1819                ireq->ireq_opt = NULL;
1820        } else {
1821                newinet->inet_opt = NULL;
1822
1823                if (!req_unhash && found_dup_sk) {
1824                        /* This code path should only be executed in the
1825                         * syncookie case
1826                         */
1827                        bh_unlock_sock(newsk);
1828                        sock_put(newsk);
1829                        newsk = NULL;
1830                }
1831        }
1832        return newsk;
1833
1834exit_overflow:
1835        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1836exit_nonewsk:
1837        dst_release(dst);
1838exit:
1839        tcp_listendrop(sk);
1840        return NULL;
1841put_and_exit:
1842        newinet->inet_opt = NULL;
1843        inet_csk_prepare_forced_close(newsk);
1844        tcp_done(newsk);
1845        goto exit;
1846}
1847EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1848
1849static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1850{
1851#ifdef CONFIG_SYN_COOKIES
1852        const struct tcphdr *th = tcp_hdr(skb);
1853
1854        if (!th->syn)
1855                sk = cookie_v4_check(sk, skb);
1856#endif
1857        return sk;
1858}
1859
1860u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1861                         struct tcphdr *th, u32 *cookie)
1862{
1863        u16 mss = 0;
1864#ifdef CONFIG_SYN_COOKIES
1865        mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1866                                    &tcp_request_sock_ipv4_ops, sk, th);
1867        if (mss) {
1868                *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1869                tcp_synq_overflow(sk);
1870        }
1871#endif
1872        return mss;
1873}
1874
1875INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1876                                                           u32));
1877/* The socket must have its spinlock held when we get
1878 * here, unless it is a TCP_LISTEN socket.
1879 *
1880 * We have a potential double-lock case here, so even when
1881 * doing backlog processing we use the BH locking scheme.
1882 * This is because we cannot sleep with the original spinlock
1883 * held.
1884 */
1885int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1886{
1887        enum skb_drop_reason reason;
1888        struct sock *rsk;
1889
1890        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1891                struct dst_entry *dst;
1892
1893                dst = rcu_dereference_protected(sk->sk_rx_dst,
1894                                                lockdep_sock_is_held(sk));
1895
1896                sock_rps_save_rxhash(sk, skb);
1897                sk_mark_napi_id(sk, skb);
1898                if (dst) {
1899                        if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1900                            !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1901                                             dst, 0)) {
1902                                RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1903                                dst_release(dst);
1904                        }
1905                }
1906                tcp_rcv_established(sk, skb);
1907                return 0;
1908        }
1909
1910        reason = SKB_DROP_REASON_NOT_SPECIFIED;
1911        if (tcp_checksum_complete(skb))
1912                goto csum_err;
1913
1914        if (sk->sk_state == TCP_LISTEN) {
1915                struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1916
1917                if (!nsk)
1918                        goto discard;
1919                if (nsk != sk) {
1920                        if (tcp_child_process(sk, nsk, skb)) {
1921                                rsk = nsk;
1922                                goto reset;
1923                        }
1924                        return 0;
1925                }
1926        } else
1927                sock_rps_save_rxhash(sk, skb);
1928
1929        if (tcp_rcv_state_process(sk, skb)) {
1930                rsk = sk;
1931                goto reset;
1932        }
1933        return 0;
1934
1935reset:
1936        tcp_v4_send_reset(rsk, skb);
1937discard:
1938        kfree_skb_reason(skb, reason);
1939        /* Be careful here. If this function gets more complicated and
1940         * gcc suffers from register pressure on the x86, sk (in %ebx)
1941         * might be destroyed here. This current version compiles correctly,
1942         * but you have been warned.
1943         */
1944        return 0;
1945
1946csum_err:
1947        reason = SKB_DROP_REASON_TCP_CSUM;
1948        trace_tcp_bad_csum(skb);
1949        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1950        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1951        goto discard;
1952}
1953EXPORT_SYMBOL(tcp_v4_do_rcv);
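
/* How the locking comment above plays out (sketch; the softirq side is the
 * tail of tcp_v4_rcv() below, the process-context side is release_sock()):
 *
 *	softirq:
 *		bh_lock_sock_nested(sk);
 *		if (!sock_owned_by_user(sk))
 *			tcp_v4_do_rcv(sk, skb);
 *		else
 *			tcp_add_backlog(sk, skb, &drop_reason);
 *		bh_unlock_sock(sk);
 *
 *	process context: when the owner later calls release_sock(), the
 *	backlogged skbs are replayed through sk->sk_backlog_rcv, which for
 *	IPv4 TCP is this very function.
 */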
1954
1955int tcp_v4_early_demux(struct sk_buff *skb)
1956{
1957        struct net *net = dev_net(skb->dev);
1958        const struct iphdr *iph;
1959        const struct tcphdr *th;
1960        struct sock *sk;
1961
1962        if (skb->pkt_type != PACKET_HOST)
1963                return 0;
1964
1965        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1966                return 0;
1967
1968        iph = ip_hdr(skb);
1969        th = tcp_hdr(skb);
1970
1971        if (th->doff < sizeof(struct tcphdr) / 4)
1972                return 0;
1973
1974        sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1975                                       iph->saddr, th->source,
1976                                       iph->daddr, ntohs(th->dest),
1977                                       skb->skb_iif, inet_sdif(skb));
1978        if (sk) {
1979                skb->sk = sk;
1980                skb->destructor = sock_edemux;
1981                if (sk_fullsock(sk)) {
1982                        struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1983
1984                        if (dst)
1985                                dst = dst_check(dst, 0);
1986                        if (dst &&
1987                            sk->sk_rx_dst_ifindex == skb->skb_iif)
1988                                skb_dst_set_noref(skb, dst);
1989                }
1990        }
1991        return 0;
1992}
1993
1994bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1995                     enum skb_drop_reason *reason)
1996{
1997        u32 limit, tail_gso_size, tail_gso_segs;
1998        struct skb_shared_info *shinfo;
1999        const struct tcphdr *th;
2000        struct tcphdr *thtail;
2001        struct sk_buff *tail;
2002        unsigned int hdrlen;
2003        bool fragstolen;
2004        u32 gso_segs;
2005        u32 gso_size;
2006        int delta;
2007
2008        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2009         * we can fix skb->truesize to its real value to avoid future drops.
2010         * This is valid because skb is not yet charged to the socket.
2011         * It has been noticed that pure SACK packets were sometimes dropped
2012         * (if cooked by drivers without the copybreak feature).
2013         */
2014        skb_condense(skb);
2015
2016        skb_dst_drop(skb);
2017
2018        if (unlikely(tcp_checksum_complete(skb))) {
2019                bh_unlock_sock(sk);
2020                trace_tcp_bad_csum(skb);
2021                *reason = SKB_DROP_REASON_TCP_CSUM;
2022                __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2023                __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2024                return true;
2025        }
2026
2027        /* Attempt coalescing to last skb in backlog, even if we are
2028         * above the limits.
2029         * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2030         */
2031        th = (const struct tcphdr *)skb->data;
2032        hdrlen = th->doff * 4;
2033
2034        tail = sk->sk_backlog.tail;
2035        if (!tail)
2036                goto no_coalesce;
2037        thtail = (struct tcphdr *)tail->data;
2038
2039        if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2040            TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2041            ((TCP_SKB_CB(tail)->tcp_flags |
2042              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2043            !((TCP_SKB_CB(tail)->tcp_flags &
2044              TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2045            ((TCP_SKB_CB(tail)->tcp_flags ^
2046              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2047#ifdef CONFIG_TLS_DEVICE
2048            tail->decrypted != skb->decrypted ||
2049#endif
2050            !mptcp_skb_can_collapse(tail, skb) ||
2051            thtail->doff != th->doff ||
2052            memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2053                goto no_coalesce;
2054
2055        __skb_pull(skb, hdrlen);
2056
2057        shinfo = skb_shinfo(skb);
2058        gso_size = shinfo->gso_size ?: skb->len;
2059        gso_segs = shinfo->gso_segs ?: 1;
2060
2061        shinfo = skb_shinfo(tail);
2062        tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2063        tail_gso_segs = shinfo->gso_segs ?: 1;
2064
2065        if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2066                TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2067
2068                if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2069                        TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2070                        thtail->window = th->window;
2071                }
2072
2073                /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2074                 * thtail->fin, so that the fast path in tcp_rcv_established()
2075                 * is not entered if we append a packet with a FIN.
2076                 * SYN, RST, URG are not present.
2077                 * ACK is set on both packets.
2078                 * PSH : we do not really care in TCP stack,
2079                 *       at least for 'GRO' packets.
2080                 */
2081                thtail->fin |= th->fin;
2082                TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2083
2084                if (TCP_SKB_CB(skb)->has_rxtstamp) {
2085                        TCP_SKB_CB(tail)->has_rxtstamp = true;
2086                        tail->tstamp = skb->tstamp;
2087                        skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2088                }
2089
2090                /* Not as strict as GRO. We only need to carry the max mss value */
2091                shinfo->gso_size = max(gso_size, tail_gso_size);
2092                shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2093
2094                sk->sk_backlog.len += delta;
2095                __NET_INC_STATS(sock_net(sk),
2096                                LINUX_MIB_TCPBACKLOGCOALESCE);
2097                kfree_skb_partial(skb, fragstolen);
2098                return false;
2099        }
2100        __skb_push(skb, hdrlen);
2101
2102no_coalesce:
2103        limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
2104
2105        /* Only the socket owner can try to collapse/prune rx queues
2106         * to reduce memory overhead, so add a little headroom here.
2107         * Only a few socket backlogs are likely to be non-empty concurrently.
2108         */
2109        limit += 64 * 1024;
2110
2111        if (unlikely(sk_add_backlog(sk, skb, limit))) {
2112                bh_unlock_sock(sk);
2113                *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2114                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2115                return true;
2116        }
2117        return false;
2118}
2119EXPORT_SYMBOL(tcp_add_backlog);
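
/* Worked example of the "no_coalesce" limit above, assuming the common
 * defaults of tcp_rmem[1] = 131072 and tcp_wmem[1] = 16384 and no receive
 * buffer autotuning yet:
 *
 *	limit = sk_rcvbuf + sk_sndbuf / 2 + 64 KB
 *	      = 131072 + 8192 + 65536 = 204800 bytes of skb truesize
 *
 * Anything beyond that while the owner holds the socket lock is dropped and
 * accounted as LINUX_MIB_TCPBACKLOGDROP.
 */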
2120
2121int tcp_filter(struct sock *sk, struct sk_buff *skb)
2122{
2123        struct tcphdr *th = (struct tcphdr *)skb->data;
2124
2125        return sk_filter_trim_cap(sk, skb, th->doff * 4);
2126}
2127EXPORT_SYMBOL(tcp_filter);
2128
2129static void tcp_v4_restore_cb(struct sk_buff *skb)
2130{
2131        memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2132                sizeof(struct inet_skb_parm));
2133}
2134
2135static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2136                           const struct tcphdr *th)
2137{
2138        /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2139         * barrier() makes sure the compiler won't play fool^Waliasing games.
2140         */
2141        memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2142                sizeof(struct inet_skb_parm));
2143        barrier();
2144
2145        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2146        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2147                                    skb->len - th->doff * 4);
2148        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2149        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2150        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
2151        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2152        TCP_SKB_CB(skb)->sacked  = 0;
2153        TCP_SKB_CB(skb)->has_rxtstamp =
2154                        skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2155}
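
/* Worked example of the end_seq arithmetic above: a segment with seq = 1000,
 * a 32-byte TCP header (doff = 8), 500 bytes of payload and FIN set has
 * skb->len = 532, so
 *
 *	end_seq = 1000 + 0 (syn) + 1 (fin) + 532 - 32 = 1501
 *
 * i.e. the payload occupies sequence numbers 1000..1499 and the FIN occupies
 * 1500, which is the convention the sequence checks expect.
 */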
2156
2157/*
2158 *      From tcp_input.c
2159 */
2160
2161int tcp_v4_rcv(struct sk_buff *skb)
2162{
2163        struct net *net = dev_net(skb->dev);
2164        enum skb_drop_reason drop_reason;
2165        int sdif = inet_sdif(skb);
2166        int dif = inet_iif(skb);
2167        const struct iphdr *iph;
2168        const struct tcphdr *th;
2169        bool refcounted;
2170        struct sock *sk;
2171        int ret;
2172
2173        drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2174        if (skb->pkt_type != PACKET_HOST)
2175                goto discard_it;
2176
2177        /* Count it even if it's bad */
2178        __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2179
2180        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2181                goto discard_it;
2182
2183        th = (const struct tcphdr *)skb->data;
2184
2185        if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2186                drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2187                goto bad_packet;
2188        }
2189        if (!pskb_may_pull(skb, th->doff * 4))
2190                goto discard_it;
2191
2192        /* An explanation is required here, I think.
2193         * Packet length and doff are validated by header prediction,
2194         * provided the case of th->doff == 0 is eliminated.
2195         * So, we defer the checks. */
2196
2197        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2198                goto csum_error;
2199
2200        th = (const struct tcphdr *)skb->data;
2201        iph = ip_hdr(skb);
2202lookup:
2203        sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2204                               skb, __tcp_hdrlen(th), th->source,
2205                               th->dest, sdif, &refcounted);
2206        if (!sk)
2207                goto no_tcp_socket;
2208
2209process:
2210        if (sk->sk_state == TCP_TIME_WAIT)
2211                goto do_time_wait;
2212
2213        if (sk->sk_state == TCP_NEW_SYN_RECV) {
2214                struct request_sock *req = inet_reqsk(sk);
2215                bool req_stolen = false;
2216                struct sock *nsk;
2217
2218                sk = req->rsk_listener;
2219                if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2220                        drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2221                else
2222                        drop_reason = tcp_inbound_hash(sk, req, skb,
2223                                                       &iph->saddr, &iph->daddr,
2224                                                       AF_INET, dif, sdif);
2225                if (unlikely(drop_reason)) {
2226                        sk_drops_add(sk, skb);
2227                        reqsk_put(req);
2228                        goto discard_it;
2229                }
2230                if (tcp_checksum_complete(skb)) {
2231                        reqsk_put(req);
2232                        goto csum_error;
2233                }
2234                if (unlikely(sk->sk_state != TCP_LISTEN)) {
2235                        nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2236                        if (!nsk) {
2237                                inet_csk_reqsk_queue_drop_and_put(sk, req);
2238                                goto lookup;
2239                        }
2240                        sk = nsk;
2241                        /* reuseport_migrate_sock() has already held one sk_refcnt
2242                         * before returning.
2243                         */
2244                } else {
2245                        /* We own a reference on the listener, increase it again
2246                         * as we might lose it too soon.
2247                         */
2248                        sock_hold(sk);
2249                }
2250                refcounted = true;
2251                nsk = NULL;
2252                if (!tcp_filter(sk, skb)) {
2253                        th = (const struct tcphdr *)skb->data;
2254                        iph = ip_hdr(skb);
2255                        tcp_v4_fill_cb(skb, iph, th);
2256                        nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2257                } else {
2258                        drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2259                }
2260                if (!nsk) {
2261                        reqsk_put(req);
2262                        if (req_stolen) {
2263                                /* Another cpu got exclusive access to req
2264                                 * and created a full-blown socket.
2265                                 * Try to feed this packet to this socket
2266                                 * instead of discarding it.
2267                                 */
2268                                tcp_v4_restore_cb(skb);
2269                                sock_put(sk);
2270                                goto lookup;
2271                        }
2272                        goto discard_and_relse;
2273                }
2274                nf_reset_ct(skb);
2275                if (nsk == sk) {
2276                        reqsk_put(req);
2277                        tcp_v4_restore_cb(skb);
2278                } else if (tcp_child_process(sk, nsk, skb)) {
2279                        tcp_v4_send_reset(nsk, skb);
2280                        goto discard_and_relse;
2281                } else {
2282                        sock_put(sk);
2283                        return 0;
2284                }
2285        }
2286
2287        if (static_branch_unlikely(&ip4_min_ttl)) {
2288                /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2289                if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2290                        __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2291                        drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2292                        goto discard_and_relse;
2293                }
2294        }
2295
2296        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2297                drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2298                goto discard_and_relse;
2299        }
2300
2301        drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2302                                       AF_INET, dif, sdif);
2303        if (drop_reason)
2304                goto discard_and_relse;
2305
2306        nf_reset_ct(skb);
2307
2308        if (tcp_filter(sk, skb)) {
2309                drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2310                goto discard_and_relse;
2311        }
2312        th = (const struct tcphdr *)skb->data;
2313        iph = ip_hdr(skb);
2314        tcp_v4_fill_cb(skb, iph, th);
2315
2316        skb->dev = NULL;
2317
2318        if (sk->sk_state == TCP_LISTEN) {
2319                ret = tcp_v4_do_rcv(sk, skb);
2320                goto put_and_return;
2321        }
2322
2323        sk_incoming_cpu_update(sk);
2324
2325        bh_lock_sock_nested(sk);
2326        tcp_segs_in(tcp_sk(sk), skb);
2327        ret = 0;
2328        if (!sock_owned_by_user(sk)) {
2329                ret = tcp_v4_do_rcv(sk, skb);
2330        } else {
2331                if (tcp_add_backlog(sk, skb, &drop_reason))
2332                        goto discard_and_relse;
2333        }
2334        bh_unlock_sock(sk);
2335
2336put_and_return:
2337        if (refcounted)
2338                sock_put(sk);
2339
2340        return ret;
2341
2342no_tcp_socket:
2343        drop_reason = SKB_DROP_REASON_NO_SOCKET;
2344        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2345                goto discard_it;
2346
2347        tcp_v4_fill_cb(skb, iph, th);
2348
2349        if (tcp_checksum_complete(skb)) {
2350csum_error:
2351                drop_reason = SKB_DROP_REASON_TCP_CSUM;
2352                trace_tcp_bad_csum(skb);
2353                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2354bad_packet:
2355                __TCP_INC_STATS(net, TCP_MIB_INERRS);
2356        } else {
2357                tcp_v4_send_reset(NULL, skb);
2358        }
2359
2360discard_it:
2361        SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2362        /* Discard frame. */
2363        kfree_skb_reason(skb, drop_reason);
2364        return 0;
2365
2366discard_and_relse:
2367        sk_drops_add(sk, skb);
2368        if (refcounted)
2369                sock_put(sk);
2370        goto discard_it;
2371
2372do_time_wait:
2373        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2374                drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2375                inet_twsk_put(inet_twsk(sk));
2376                goto discard_it;
2377        }
2378
2379        tcp_v4_fill_cb(skb, iph, th);
2380
2381        if (tcp_checksum_complete(skb)) {
2382                inet_twsk_put(inet_twsk(sk));
2383                goto csum_error;
2384        }
2385        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2386        case TCP_TW_SYN: {
2387                struct sock *sk2 = inet_lookup_listener(net,
2388                                                        net->ipv4.tcp_death_row.hashinfo,
2389                                                        skb, __tcp_hdrlen(th),
2390                                                        iph->saddr, th->source,
2391                                                        iph->daddr, th->dest,
2392                                                        inet_iif(skb),
2393                                                        sdif);
2394                if (sk2) {
2395                        inet_twsk_deschedule_put(inet_twsk(sk));
2396                        sk = sk2;
2397                        tcp_v4_restore_cb(skb);
2398                        refcounted = false;
2399                        goto process;
2400                }
2401        }
2402                /* to ACK */
2403                fallthrough;
2404        case TCP_TW_ACK:
2405                tcp_v4_timewait_ack(sk, skb);
2406                break;
2407        case TCP_TW_RST:
2408                tcp_v4_send_reset(sk, skb);
2409                inet_twsk_deschedule_put(inet_twsk(sk));
2410                goto discard_it;
2411        case TCP_TW_SUCCESS:;
2412        }
2413        goto discard_it;
2414}
2415
2416static struct timewait_sock_ops tcp_timewait_sock_ops = {
2417        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2418        .twsk_unique    = tcp_twsk_unique,
2419        .twsk_destructor= tcp_twsk_destructor,
2420};
2421
2422void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2423{
2424        struct dst_entry *dst = skb_dst(skb);
2425
2426        if (dst && dst_hold_safe(dst)) {
2427                rcu_assign_pointer(sk->sk_rx_dst, dst);
2428                sk->sk_rx_dst_ifindex = skb->skb_iif;
2429        }
2430}
2431EXPORT_SYMBOL(inet_sk_rx_dst_set);
2432
2433const struct inet_connection_sock_af_ops ipv4_specific = {
2434        .queue_xmit        = ip_queue_xmit,
2435        .send_check        = tcp_v4_send_check,
2436        .rebuild_header    = inet_sk_rebuild_header,
2437        .sk_rx_dst_set     = inet_sk_rx_dst_set,
2438        .conn_request      = tcp_v4_conn_request,
2439        .syn_recv_sock     = tcp_v4_syn_recv_sock,
2440        .net_header_len    = sizeof(struct iphdr),
2441        .setsockopt        = ip_setsockopt,
2442        .getsockopt        = ip_getsockopt,
2443        .addr2sockaddr     = inet_csk_addr2sockaddr,
2444        .sockaddr_len      = sizeof(struct sockaddr_in),
2445        .mtu_reduced       = tcp_v4_mtu_reduced,
2446};
2447EXPORT_SYMBOL(ipv4_specific);
2448
2449#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2450static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2451#ifdef CONFIG_TCP_MD5SIG
2452        .md5_lookup             = tcp_v4_md5_lookup,
2453        .calc_md5_hash          = tcp_v4_md5_hash_skb,
2454        .md5_parse              = tcp_v4_parse_md5_keys,
2455#endif
2456#ifdef CONFIG_TCP_AO
2457        .ao_lookup              = tcp_v4_ao_lookup,
2458        .calc_ao_hash           = tcp_v4_ao_hash_skb,
2459        .ao_parse               = tcp_v4_parse_ao,
2460        .ao_calc_key_sk         = tcp_v4_ao_calc_key_sk,
2461#endif
2462};
2463#endif
2464
2465/* NOTE: A lot of things are set to zero explicitly by the call to
2466 *       sk_alloc(), so they need not be done here.
2467 */
2468static int tcp_v4_init_sock(struct sock *sk)
2469{
2470        struct inet_connection_sock *icsk = inet_csk(sk);
2471
2472        tcp_init_sock(sk);
2473
2474        icsk->icsk_af_ops = &ipv4_specific;
2475
2476#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2477        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2478#endif
2479
2480        return 0;
2481}
2482
2483#ifdef CONFIG_TCP_MD5SIG
2484static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2485{
2486        struct tcp_md5sig_info *md5sig;
2487
2488        md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2489        kfree(md5sig);
2490        static_branch_slow_dec_deferred(&tcp_md5_needed);
2491        tcp_md5_release_sigpool();
2492}
2493#endif
2494
2495void tcp_v4_destroy_sock(struct sock *sk)
2496{
2497        struct tcp_sock *tp = tcp_sk(sk);
2498
2499        trace_tcp_destroy_sock(sk);
2500
2501        tcp_clear_xmit_timers(sk);
2502
2503        tcp_cleanup_congestion_control(sk);
2504
2505        tcp_cleanup_ulp(sk);
2506
2507        /* Clean up the write buffer. */
2508        tcp_write_queue_purge(sk);
2509
2510        /* Check if we want to disable active TFO */
2511        tcp_fastopen_active_disable_ofo_check(sk);
2512
2513        /* Cleans up our, hopefully empty, out_of_order_queue. */
2514        skb_rbtree_purge(&tp->out_of_order_queue);
2515
2516#ifdef CONFIG_TCP_MD5SIG
2517        /* Clean up the MD5 key list, if any */
2518        if (tp->md5sig_info) {
2519                struct tcp_md5sig_info *md5sig;
2520
2521                md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2522                tcp_clear_md5_list(sk);
2523                call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2524                rcu_assign_pointer(tp->md5sig_info, NULL);
2525        }
2526#endif
2527        tcp_ao_destroy_sock(sk, false);
2528
2529        /* Clean up a referenced TCP bind bucket. */
2530        if (inet_csk(sk)->icsk_bind_hash)
2531                inet_put_port(sk);
2532
2533        BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2534
2535        /* If socket is aborted during connect operation */
2536        tcp_free_fastopen_req(tp);
2537        tcp_fastopen_destroy_cipher(sk);
2538        tcp_saved_syn_free(tp);
2539
2540        sk_sockets_allocated_dec(sk);
2541}
2542EXPORT_SYMBOL(tcp_v4_destroy_sock);
2543
2544#ifdef CONFIG_PROC_FS
2545/* Proc filesystem TCP sock list dumping. */
2546
2547static unsigned short seq_file_family(const struct seq_file *seq);
2548
2549static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2550{
2551        unsigned short family = seq_file_family(seq);
2552
2553        /* AF_UNSPEC is used as a match all */
2554        return ((family == AF_UNSPEC || family == sk->sk_family) &&
2555                net_eq(sock_net(sk), seq_file_net(seq)));
2556}
2557
2558/* Find a non-empty bucket (starting from st->bucket)
2559 * and return the first sk from it.
2560 */
2561static void *listening_get_first(struct seq_file *seq)
2562{
2563        struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2564        struct tcp_iter_state *st = seq->private;
2565
2566        st->offset = 0;
2567        for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2568                struct inet_listen_hashbucket *ilb2;
2569                struct hlist_nulls_node *node;
2570                struct sock *sk;
2571
2572                ilb2 = &hinfo->lhash2[st->bucket];
2573                if (hlist_nulls_empty(&ilb2->nulls_head))
2574                        continue;
2575
2576                spin_lock(&ilb2->lock);
2577                sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2578                        if (seq_sk_match(seq, sk))
2579                                return sk;
2580                }
2581                spin_unlock(&ilb2->lock);
2582        }
2583
2584        return NULL;
2585}
2586
2587/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2588 * If "cur" is the last one in st->bucket,
2589 * call listening_get_first() to return the first sk of the next
2590 * non-empty bucket.
2591 */
2592static void *listening_get_next(struct seq_file *seq, void *cur)
2593{
2594        struct tcp_iter_state *st = seq->private;
2595        struct inet_listen_hashbucket *ilb2;
2596        struct hlist_nulls_node *node;
2597        struct inet_hashinfo *hinfo;
2598        struct sock *sk = cur;
2599
2600        ++st->num;
2601        ++st->offset;
2602
2603        sk = sk_nulls_next(sk);
2604        sk_nulls_for_each_from(sk, node) {
2605                if (seq_sk_match(seq, sk))
2606                        return sk;
2607        }
2608
2609        hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2610        ilb2 = &hinfo->lhash2[st->bucket];
2611        spin_unlock(&ilb2->lock);
2612        ++st->bucket;
2613        return listening_get_first(seq);
2614}
2615
2616static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2617{
2618        struct tcp_iter_state *st = seq->private;
2619        void *rc;
2620
2621        st->bucket = 0;
2622        st->offset = 0;
2623        rc = listening_get_first(seq);
2624
2625        while (rc && *pos) {
2626                rc = listening_get_next(seq, rc);
2627                --*pos;
2628        }
2629        return rc;
2630}
2631
2632static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2633                                const struct tcp_iter_state *st)
2634{
2635        return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2636}
2637
2638/*
2639 * Get first established socket starting from bucket given in st->bucket.
2640 * If st->bucket is zero, the very first socket in the hash is returned.
2641 */
2642static void *established_get_first(struct seq_file *seq)
2643{
2644        struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2645        struct tcp_iter_state *st = seq->private;
2646
2647        st->offset = 0;
2648        for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2649                struct sock *sk;
2650                struct hlist_nulls_node *node;
2651                spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2652
2653                cond_resched();
2654
2655                /* Lockless fast path for the common case of empty buckets */
2656                if (empty_bucket(hinfo, st))
2657                        continue;
2658
2659                spin_lock_bh(lock);
2660                sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2661                        if (seq_sk_match(seq, sk))
2662                                return sk;
2663                }
2664                spin_unlock_bh(lock);
2665        }
2666
2667        return NULL;
2668}
2669
2670static void *established_get_next(struct seq_file *seq, void *cur)
2671{
2672        struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2673        struct tcp_iter_state *st = seq->private;
2674        struct hlist_nulls_node *node;
2675        struct sock *sk = cur;
2676
2677        ++st->num;
2678        ++st->offset;
2679
2680        sk = sk_nulls_next(sk);
2681
2682        sk_nulls_for_each_from(sk, node) {
2683                if (seq_sk_match(seq, sk))
2684                        return sk;
2685        }
2686
2687        spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2688        ++st->bucket;
2689        return established_get_first(seq);
2690}
2691
2692static void *established_get_idx(struct seq_file *seq, loff_t pos)
2693{
2694        struct tcp_iter_state *st = seq->private;
2695        void *rc;
2696
2697        st->bucket = 0;
2698        rc = established_get_first(seq);
2699
2700        while (rc && pos) {
2701                rc = established_get_next(seq, rc);
2702                --pos;
2703        }
2704        return rc;
2705}
2706
2707static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2708{
2709        void *rc;
2710        struct tcp_iter_state *st = seq->private;
2711
2712        st->state = TCP_SEQ_STATE_LISTENING;
2713        rc        = listening_get_idx(seq, &pos);
2714
2715        if (!rc) {
2716                st->state = TCP_SEQ_STATE_ESTABLISHED;
2717                rc        = established_get_idx(seq, pos);
2718        }
2719
2720        return rc;
2721}
2722
2723static void *tcp_seek_last_pos(struct seq_file *seq)
2724{
2725        struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2726        struct tcp_iter_state *st = seq->private;
2727        int bucket = st->bucket;
2728        int offset = st->offset;
2729        int orig_num = st->num;
2730        void *rc = NULL;
2731
2732        switch (st->state) {
2733        case TCP_SEQ_STATE_LISTENING:
2734                if (st->bucket > hinfo->lhash2_mask)
2735                        break;
2736                rc = listening_get_first(seq);
2737                while (offset-- && rc && bucket == st->bucket)
2738                        rc = listening_get_next(seq, rc);
2739                if (rc)
2740                        break;
2741                st->bucket = 0;
2742                st->state = TCP_SEQ_STATE_ESTABLISHED;
2743                fallthrough;
2744        case TCP_SEQ_STATE_ESTABLISHED:
2745                if (st->bucket > hinfo->ehash_mask)
2746                        break;
2747                rc = established_get_first(seq);
2748                while (offset-- && rc && bucket == st->bucket)
2749                        rc = established_get_next(seq, rc);
2750        }
2751
2752        st->num = orig_num;
2753
2754        return rc;
2755}
2756
2757void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2758{
2759        struct tcp_iter_state *st = seq->private;
2760        void *rc;
2761
2762        if (*pos && *pos == st->last_pos) {
2763                rc = tcp_seek_last_pos(seq);
2764                if (rc)
2765                        goto out;
2766        }
2767
2768        st->state = TCP_SEQ_STATE_LISTENING;
2769        st->num = 0;
2770        st->bucket = 0;
2771        st->offset = 0;
2772        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2773
2774out:
2775        st->last_pos = *pos;
2776        return rc;
2777}
2778EXPORT_SYMBOL(tcp_seq_start);
2779
2780void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2781{
2782        struct tcp_iter_state *st = seq->private;
2783        void *rc = NULL;
2784
2785        if (v == SEQ_START_TOKEN) {
2786                rc = tcp_get_idx(seq, 0);
2787                goto out;
2788        }
2789
2790        switch (st->state) {
2791        case TCP_SEQ_STATE_LISTENING:
2792                rc = listening_get_next(seq, v);
2793                if (!rc) {
2794                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2795                        st->bucket = 0;
2796                        st->offset = 0;
2797                        rc        = established_get_first(seq);
2798                }
2799                break;
2800        case TCP_SEQ_STATE_ESTABLISHED:
2801                rc = established_get_next(seq, v);
2802                break;
2803        }
2804out:
2805        ++*pos;
2806        st->last_pos = *pos;
2807        return rc;
2808}
2809EXPORT_SYMBOL(tcp_seq_next);
2810
2811void tcp_seq_stop(struct seq_file *seq, void *v)
2812{
2813        struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2814        struct tcp_iter_state *st = seq->private;
2815
2816        switch (st->state) {
2817        case TCP_SEQ_STATE_LISTENING:
2818                if (v != SEQ_START_TOKEN)
2819                        spin_unlock(&hinfo->lhash2[st->bucket].lock);
2820                break;
2821        case TCP_SEQ_STATE_ESTABLISHED:
2822                if (v)
2823                        spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2824                break;
2825        }
2826}
2827EXPORT_SYMBOL(tcp_seq_stop);
2828
2829static void get_openreq4(const struct request_sock *req,
2830                         struct seq_file *f, int i)
2831{
2832        const struct inet_request_sock *ireq = inet_rsk(req);
2833        long delta = req->rsk_timer.expires - jiffies;
2834
2835        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2836                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2837                i,
2838                ireq->ir_loc_addr,
2839                ireq->ir_num,
2840                ireq->ir_rmt_addr,
2841                ntohs(ireq->ir_rmt_port),
2842                TCP_SYN_RECV,
2843                0, 0, /* could print option size, but that is af dependent. */
2844                1,    /* timers active (only the expire timer) */
2845                jiffies_delta_to_clock_t(delta),
2846                req->num_timeout,
2847                from_kuid_munged(seq_user_ns(f),
2848                                 sock_i_uid(req->rsk_listener)),
2849                0,  /* non standard timer */
2850                0, /* open_requests have no inode */
2851                0,
2852                req);
2853}
2854
2855static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2856{
2857        int timer_active;
2858        unsigned long timer_expires;
2859        const struct tcp_sock *tp = tcp_sk(sk);
2860        const struct inet_connection_sock *icsk = inet_csk(sk);
2861        const struct inet_sock *inet = inet_sk(sk);
2862        const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2863        __be32 dest = inet->inet_daddr;
2864        __be32 src = inet->inet_rcv_saddr;
2865        __u16 destp = ntohs(inet->inet_dport);
2866        __u16 srcp = ntohs(inet->inet_sport);
2867        int rx_queue;
2868        int state;
2869
2870        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2871            icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2872            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2873                timer_active    = 1;
2874                timer_expires   = icsk->icsk_timeout;
2875        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2876                timer_active    = 4;
2877                timer_expires   = icsk->icsk_timeout;
2878        } else if (timer_pending(&sk->sk_timer)) {
2879                timer_active    = 2;
2880                timer_expires   = sk->sk_timer.expires;
2881        } else {
2882                timer_active    = 0;
2883                timer_expires = jiffies;
2884        }
2885
2886        state = inet_sk_state_load(sk);
2887        if (state == TCP_LISTEN)
2888                rx_queue = READ_ONCE(sk->sk_ack_backlog);
2889        else
2890                /* Because we don't lock the socket,
2891                 * we might find a transient negative value.
2892                 */
2893                rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2894                                      READ_ONCE(tp->copied_seq), 0);
2895
2896        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2897                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2898                i, src, srcp, dest, destp, state,
2899                READ_ONCE(tp->write_seq) - tp->snd_una,
2900                rx_queue,
2901                timer_active,
2902                jiffies_delta_to_clock_t(timer_expires - jiffies),
2903                icsk->icsk_retransmits,
2904                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2905                icsk->icsk_probes_out,
2906                sock_i_ino(sk),
2907                refcount_read(&sk->sk_refcnt), sk,
2908                jiffies_to_clock_t(icsk->icsk_rto),
2909                jiffies_to_clock_t(icsk->icsk_ack.ato),
2910                (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2911                tcp_snd_cwnd(tp),
2912                state == TCP_LISTEN ?
2913                    fastopenq->max_qlen :
2914                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2915}
2916
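    /* Print one /proc/net/tcp row for a TIME_WAIT socket: tw_substate as
     * the state, timer code 3 with the remaining timewait timer, and the
     * refcount plus %pK pointer; the queue, uid and inode columns are
     * zero.
     */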
2917static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2918                               struct seq_file *f, int i)
2919{
2920        long delta = tw->tw_timer.expires - jiffies;
2921        __be32 dest, src;
2922        __u16 destp, srcp;
2923
2924        dest  = tw->tw_daddr;
2925        src   = tw->tw_rcv_saddr;
2926        destp = ntohs(tw->tw_dport);
2927        srcp  = ntohs(tw->tw_sport);
2928
2929        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2930                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2931                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2932                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2933                refcount_read(&tw->tw_refcnt), tw);
2934}
2935
2936#define TMPSZ 150
2937
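    /* Each record, header included, is padded to TMPSZ - 1 characters
     * plus the trailing newline via seq_setwidth()/seq_pad(), so the
     * file reads as fixed-width lines.
     */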
2938static int tcp4_seq_show(struct seq_file *seq, void *v)
2939{
2940        struct tcp_iter_state *st;
2941        struct sock *sk = v;
2942
2943        seq_setwidth(seq, TMPSZ - 1);
2944        if (v == SEQ_START_TOKEN) {
2945                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2946                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2947                           "inode");
2948                goto out;
2949        }
2950        st = seq->private;
2951
2952        if (sk->sk_state == TCP_TIME_WAIT)
2953                get_timewait4_sock(v, seq, st->num);
2954        else if (sk->sk_state == TCP_NEW_SYN_RECV)
2955                get_openreq4(v, seq, st->num);
2956        else
2957                get_tcp4_sock(v, seq, st->num);
2958out:
2959        seq_pad(seq, '\n');
2960        return 0;
2961}
2962
2963#ifdef CONFIG_BPF_SYSCALL
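    /* State for the bpf "tcp" iterator.  Unlike the plain procfs walk,
     * the bpf iterator copies a whole bucket into @batch (holding a
     * reference on every socket) and drops the bucket lock before the
     * bpf prog runs, so the prog can lock_sock() and use the *sockopt
     * helpers.  @cur_sk/@end_sk track progress through the batch,
     * @max_sk is the allocated capacity, and @st_bucket_done records
     * whether the whole bucket fitted into the batch.
     */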
2964struct bpf_tcp_iter_state {
2965        struct tcp_iter_state state;
2966        unsigned int cur_sk;
2967        unsigned int end_sk;
2968        unsigned int max_sk;
2969        struct sock **batch;
2970        bool st_bucket_done;
2971};
2972
2973struct bpf_iter__tcp {
2974        __bpf_md_ptr(struct bpf_iter_meta *, meta);
2975        __bpf_md_ptr(struct sock_common *, sk_common);
2976        uid_t uid __aligned(8);
2977};
2978
2979static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2980                             struct sock_common *sk_common, uid_t uid)
2981{
2982        struct bpf_iter__tcp ctx;
2983
2984        meta->seq_num--;  /* skip SEQ_START_TOKEN */
2985        ctx.meta = meta;
2986        ctx.sk_common = sk_common;
2987        ctx.uid = uid;
2988        return bpf_iter_run_prog(prog, &ctx);
2989}
2990
2991static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2992{
2993        while (iter->cur_sk < iter->end_sk)
2994                sock_gen_put(iter->batch[iter->cur_sk++]);
2995}
2996
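    /* Grow iter->batch to @new_batch_sz pointers.  References still held
     * on unconsumed sockets in the old batch are dropped first, so the
     * caller is expected to re-batch the current bucket afterwards.
     */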
2997static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2998                                      unsigned int new_batch_sz)
2999{
3000        struct sock **new_batch;
3001
3002        new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3003                             GFP_USER | __GFP_NOWARN);
3004        if (!new_batch)
3005                return -ENOMEM;
3006
3007        bpf_iter_tcp_put_batch(iter);
3008        kvfree(iter->batch);
3009        iter->batch = new_batch;
3010        iter->max_sk = new_batch_sz;
3011
3012        return 0;
3013}
3014
3015static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3016                                                 struct sock *start_sk)
3017{
3018        struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3019        struct bpf_tcp_iter_state *iter = seq->private;
3020        struct tcp_iter_state *st = &iter->state;
3021        struct hlist_nulls_node *node;
3022        unsigned int expected = 1;
3023        struct sock *sk;
3024
3025        sock_hold(start_sk);
3026        iter->batch[iter->end_sk++] = start_sk;
3027
3028        sk = sk_nulls_next(start_sk);
3029        sk_nulls_for_each_from(sk, node) {
3030                if (seq_sk_match(seq, sk)) {
3031                        if (iter->end_sk < iter->max_sk) {
3032                                sock_hold(sk);
3033                                iter->batch[iter->end_sk++] = sk;
3034                        }
3035                        expected++;
3036                }
3037        }
3038        spin_unlock(&hinfo->lhash2[st->bucket].lock);
3039
3040        return expected;
3041}
3042
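    /* Established-hash counterpart of bpf_iter_tcp_listening_batch():
     * walks one ehash bucket (established and TIME_WAIT sockets) and
     * drops the ehash bucket lock, re-enabling BHs, when done.
     */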
3043static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3044                                                   struct sock *start_sk)
3045{
3046        struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3047        struct bpf_tcp_iter_state *iter = seq->private;
3048        struct tcp_iter_state *st = &iter->state;
3049        struct hlist_nulls_node *node;
3050        unsigned int expected = 1;
3051        struct sock *sk;
3052
3053        sock_hold(start_sk);
3054        iter->batch[iter->end_sk++] = start_sk;
3055
3056        sk = sk_nulls_next(start_sk);
3057        sk_nulls_for_each_from(sk, node) {
3058                if (seq_sk_match(seq, sk)) {
3059                        if (iter->end_sk < iter->max_sk) {
3060                                sock_hold(sk);
3061                                iter->batch[iter->end_sk++] = sk;
3062                        }
3063                        expected++;
3064                }
3065        }
3066        spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3067
3068        return expected;
3069}
3070
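    /* Collect the next batch of sockets.  tcp_seek_last_pos() resumes at
     * the bucket/offset recorded by seq_next(); every matching socket in
     * that bucket is then batched.  If the batch array is too small, it
     * is grown once to 3/2 of the observed bucket size and the bucket is
     * re-scanned; should it still not fit, iteration continues with a
     * partial batch and st_bucket_done stays false.
     */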
3071static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3072{
3073        struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3074        struct bpf_tcp_iter_state *iter = seq->private;
3075        struct tcp_iter_state *st = &iter->state;
3076        unsigned int expected;
3077        bool resized = false;
3078        struct sock *sk;
3079
3080        /* The st->bucket is done.  Directly advance to the next
3081         * bucket instead of having tcp_seek_last_pos() skip the
3082         * sockets in the current bucket one by one, only to find
3083         * out that it has to advance to the next bucket.
3084         */
3085        if (iter->st_bucket_done) {
3086                st->offset = 0;
3087                st->bucket++;
3088                if (st->state == TCP_SEQ_STATE_LISTENING &&
3089                    st->bucket > hinfo->lhash2_mask) {
3090                        st->state = TCP_SEQ_STATE_ESTABLISHED;
3091                        st->bucket = 0;
3092                }
3093        }
3094
3095again:
3096        /* Get a new batch */
3097        iter->cur_sk = 0;
3098        iter->end_sk = 0;
3099        iter->st_bucket_done = false;
3100
3101        sk = tcp_seek_last_pos(seq);
3102        if (!sk)
3103                return NULL; /* Done */
3104
3105        if (st->state == TCP_SEQ_STATE_LISTENING)
3106                expected = bpf_iter_tcp_listening_batch(seq, sk);
3107        else
3108                expected = bpf_iter_tcp_established_batch(seq, sk);
3109
3110        if (iter->end_sk == expected) {
3111                iter->st_bucket_done = true;
3112                return sk;
3113        }
3114
3115        if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3116                resized = true;
3117                goto again;
3118        }
3119
3120        return sk;
3121}
3122
3123static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3124{
3125        /* bpf iter does not support lseek, so it always
3126         * continues from where it was stop()-ped.
3127         */
3128        if (*pos)
3129                return bpf_iter_tcp_batch(seq);
3130
3131        return SEQ_START_TOKEN;
3132}
3133
3134static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3135{
3136        struct bpf_tcp_iter_state *iter = seq->private;
3137        struct tcp_iter_state *st = &iter->state;
3138        struct sock *sk;
3139
3140        /* Whenever seq_next() is called, the sk at iter->cur_sk has
3141         * already been shown by seq_show(), so advance to the next
3142         * sk in the batch.
3143         */
3144        if (iter->cur_sk < iter->end_sk) {
3145                /* Keeping st->num consistent in tcp_iter_state.
3146                 * bpf_iter_tcp does not use st->num.
3147                 * meta.seq_num is used instead.
3148                 */
3149                st->num++;
3150                /* Move st->offset to the next sk in the bucket such that
3151                 * the future start() will resume at st->offset in
3152                 * st->bucket.  See tcp_seek_last_pos().
3153                 */
3154                st->offset++;
3155                sock_gen_put(iter->batch[iter->cur_sk++]);
3156        }
3157
3158        if (iter->cur_sk < iter->end_sk)
3159                sk = iter->batch[iter->cur_sk];
3160        else
3161                sk = bpf_iter_tcp_batch(seq);
3162
3163        ++*pos;
3164        /* Keeping st->last_pos consistent in tcp_iter_state.
3165         * bpf iter does not do lseek, so st->last_pos always equals *pos.
3166         */
3167        st->last_pos = *pos;
3168        return sk;
3169}
3170
3171static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3172{
3173        struct bpf_iter_meta meta;
3174        struct bpf_prog *prog;
3175        struct sock *sk = v;
3176        uid_t uid;
3177        int ret;
3178
3179        if (v == SEQ_START_TOKEN)
3180                return 0;
3181
3182        if (sk_fullsock(sk))
3183                lock_sock(sk);
3184
3185        if (unlikely(sk_unhashed(sk))) {
3186                ret = SEQ_SKIP;
3187                goto unlock;
3188        }
3189
3190        if (sk->sk_state == TCP_TIME_WAIT) {
3191                uid = 0;
3192        } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3193                const struct request_sock *req = v;
3194
3195                uid = from_kuid_munged(seq_user_ns(seq),
3196                                       sock_i_uid(req->rsk_listener));
3197        } else {
3198                uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3199        }
3200
3201        meta.seq = seq;
3202        prog = bpf_iter_get_info(&meta, false);
3203        ret = tcp_prog_seq_show(prog, &meta, v, uid);
3204
3205unlock:
3206        if (sk_fullsock(sk))
3207                release_sock(sk);
3208        return ret;
3209
3210}
3211
3212static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3213{
3214        struct bpf_tcp_iter_state *iter = seq->private;
3215        struct bpf_iter_meta meta;
3216        struct bpf_prog *prog;
3217
3218        if (!v) {
3219                meta.seq = seq;
3220                prog = bpf_iter_get_info(&meta, true);
3221                if (prog)
3222                        (void)tcp_prog_seq_show(prog, &meta, v, 0);
3223        }
3224
3225        if (iter->cur_sk < iter->end_sk) {
3226                bpf_iter_tcp_put_batch(iter);
3227                iter->st_bucket_done = false;
3228        }
3229}
3230
3231static const struct seq_operations bpf_iter_tcp_seq_ops = {
3232        .show           = bpf_iter_tcp_seq_show,
3233        .start          = bpf_iter_tcp_seq_start,
3234        .next           = bpf_iter_tcp_seq_next,
3235        .stop           = bpf_iter_tcp_seq_stop,
3236};
3237#endif

3238static unsigned short seq_file_family(const struct seq_file *seq)
3239{
3240        const struct tcp_seq_afinfo *afinfo;
3241
3242#ifdef CONFIG_BPF_SYSCALL
3243        /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3244        if (seq->op == &bpf_iter_tcp_seq_ops)
3245                return AF_UNSPEC;
3246#endif
3247
3248        /* Iterated from proc fs */
3249        afinfo = pde_data(file_inode(seq->file));
3250        return afinfo->family;
3251}
3252
3253static const struct seq_operations tcp4_seq_ops = {
3254        .show           = tcp4_seq_show,
3255        .start          = tcp_seq_start,
3256        .next           = tcp_seq_next,
3257        .stop           = tcp_seq_stop,
3258};
3259
3260static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3261        .family         = AF_INET,
3262};
3263
3264static int __net_init tcp4_proc_init_net(struct net *net)
3265{
3266        if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3267                        sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3268                return -ENOMEM;
3269        return 0;
3270}
3271
3272static void __net_exit tcp4_proc_exit_net(struct net *net)
3273{
3274        remove_proc_entry("tcp", net->proc_net);
3275}
3276
3277static struct pernet_operations tcp4_net_ops = {
3278        .init = tcp4_proc_init_net,
3279        .exit = tcp4_proc_exit_net,
3280};
3281
3282int __init tcp4_proc_init(void)
3283{
3284        return register_pernet_subsys(&tcp4_net_ops);
3285}
3286
3287void tcp4_proc_exit(void)
3288{
3289        unregister_pernet_subsys(&tcp4_net_ops);
3290}
3291#endif /* CONFIG_PROC_FS */
3292
3293/* @wake is one when sk_stream_write_space() calls us.
3294 * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3295 * This mimics the strategy used in sock_def_write_space().
3296 */
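    /* Illustration (numbers are made up): with tcp_notsent_lowat() equal
     * to 131072 and @wake == 1, (notsent_bytes << 1) < 131072 holds only
     * while fewer than 65536 bytes sit unsent, i.e. EPOLLOUT is signalled
     * once the queue of not-yet-sent data drops below half the limit.
     */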
3297bool tcp_stream_memory_free(const struct sock *sk, int wake)
3298{
3299        const struct tcp_sock *tp = tcp_sk(sk);
3300        u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3301                            READ_ONCE(tp->snd_nxt);
3302
3303        return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3304}
3305EXPORT_SYMBOL(tcp_stream_memory_free);
3306
3307struct proto tcp_prot = {
3308        .name                   = "TCP",
3309        .owner                  = THIS_MODULE,
3310        .close                  = tcp_close,
3311        .pre_connect            = tcp_v4_pre_connect,
3312        .connect                = tcp_v4_connect,
3313        .disconnect             = tcp_disconnect,
3314        .accept                 = inet_csk_accept,
3315        .ioctl                  = tcp_ioctl,
3316        .init                   = tcp_v4_init_sock,
3317        .destroy                = tcp_v4_destroy_sock,
3318        .shutdown               = tcp_shutdown,
3319        .setsockopt             = tcp_setsockopt,
3320        .getsockopt             = tcp_getsockopt,
3321        .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3322        .keepalive              = tcp_set_keepalive,
3323        .recvmsg                = tcp_recvmsg,
3324        .sendmsg                = tcp_sendmsg,
3325        .splice_eof             = tcp_splice_eof,
3326        .backlog_rcv            = tcp_v4_do_rcv,
3327        .release_cb             = tcp_release_cb,
3328        .hash                   = inet_hash,
3329        .unhash                 = inet_unhash,
3330        .get_port               = inet_csk_get_port,
3331        .put_port               = inet_put_port,
3332#ifdef CONFIG_BPF_SYSCALL
3333        .psock_update_sk_prot   = tcp_bpf_update_proto,
3334#endif
3335        .enter_memory_pressure  = tcp_enter_memory_pressure,
3336        .leave_memory_pressure  = tcp_leave_memory_pressure,
3337        .stream_memory_free     = tcp_stream_memory_free,
3338        .sockets_allocated      = &tcp_sockets_allocated,
3339        .orphan_count           = &tcp_orphan_count,
3340
3341        .memory_allocated       = &tcp_memory_allocated,
3342        .per_cpu_fw_alloc       = &tcp_memory_per_cpu_fw_alloc,
3343
3344        .memory_pressure        = &tcp_memory_pressure,
3345        .sysctl_mem             = sysctl_tcp_mem,
3346        .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3347        .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3348        .max_header             = MAX_TCP_HEADER,
3349        .obj_size               = sizeof(struct tcp_sock),
3350        .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3351        .twsk_prot              = &tcp_timewait_sock_ops,
3352        .rsk_prot               = &tcp_request_sock_ops,
3353        .h.hashinfo             = NULL,
3354        .no_autobind            = true,
3355        .diag_destroy           = tcp_abort,
3356};
3357EXPORT_SYMBOL(tcp_prot);
3358
3359static void __net_exit tcp_sk_exit(struct net *net)
3360{
3361        if (net->ipv4.tcp_congestion_control)
3362                bpf_module_put(net->ipv4.tcp_congestion_control,
3363                               net->ipv4.tcp_congestion_control->owner);
3364}
3365
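    /* Pick the ehash for a new netns.  A child netns gets its own table,
     * sized by the creating netns' net.ipv4.tcp_child_ehash_entries
     * rounded up to a power of two; init_net, a zero sysctl or an
     * allocation failure fall back to the global tcp_hashinfo.  The
     * TIME_WAIT bucket limit and max_syn_backlog are then scaled from
     * the chosen table size.
     */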
3366static void __net_init tcp_set_hashinfo(struct net *net)
3367{
3368        struct inet_hashinfo *hinfo;
3369        unsigned int ehash_entries;
3370        struct net *old_net;
3371
3372        if (net_eq(net, &init_net))
3373                goto fallback;
3374
3375        old_net = current->nsproxy->net_ns;
3376        ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3377        if (!ehash_entries)
3378                goto fallback;
3379
3380        ehash_entries = roundup_pow_of_two(ehash_entries);
3381        hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3382        if (!hinfo) {
3383                pr_warn("Failed to allocate TCP ehash (entries: %u) "
3384                        "for a netns, falling back to the global one\n",
3385                        ehash_entries);
3386fallback:
3387                hinfo = &tcp_hashinfo;
3388                ehash_entries = tcp_hashinfo.ehash_mask + 1;
3389        }
3390
3391        net->ipv4.tcp_death_row.hashinfo = hinfo;
3392        net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3393        net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3394}
3395
3396static int __net_init tcp_sk_init(struct net *net)
3397{
3398        net->ipv4.sysctl_tcp_ecn = 2;
3399        net->ipv4.sysctl_tcp_ecn_fallback = 1;
3400
3401        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3402        net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3403        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3404        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3405        net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3406
3407        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3408        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3409        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3410
3411        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3412        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3413        net->ipv4.sysctl_tcp_syncookies = 1;
3414        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3415        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3416        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3417        net->ipv4.sysctl_tcp_orphan_retries = 0;
3418        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3419        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3420        net->ipv4.sysctl_tcp_tw_reuse = 2;
3421        net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3422
3423        refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3424        tcp_set_hashinfo(net);
3425
3426        net->ipv4.sysctl_tcp_sack = 1;
3427        net->ipv4.sysctl_tcp_window_scaling = 1;
3428        net->ipv4.sysctl_tcp_timestamps = 1;
3429        net->ipv4.sysctl_tcp_early_retrans = 3;
3430        net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3431        net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3432        net->ipv4.sysctl_tcp_retrans_collapse = 1;
3433        net->ipv4.sysctl_tcp_max_reordering = 300;
3434        net->ipv4.sysctl_tcp_dsack = 1;
3435        net->ipv4.sysctl_tcp_app_win = 31;
3436        net->ipv4.sysctl_tcp_adv_win_scale = 1;
3437        net->ipv4.sysctl_tcp_frto = 2;
3438        net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3439        /* This limits the percentage of the congestion window which we
3440         * will allow a single TSO frame to consume.  Building TSO frames
3441         * which are too large can cause TCP streams to be bursty.
3442         */
3443        net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3444        /* Default TSQ limit of 16 TSO segments */
3445        net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3446
3447        /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3448        net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3449
3450        net->ipv4.sysctl_tcp_min_tso_segs = 2;
3451        net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3452        net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3453        net->ipv4.sysctl_tcp_autocorking = 1;
3454        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3455        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3456        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3457        if (net != &init_net) {
3458                memcpy(net->ipv4.sysctl_tcp_rmem,
3459                       init_net.ipv4.sysctl_tcp_rmem,
3460                       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3461                memcpy(net->ipv4.sysctl_tcp_wmem,
3462                       init_net.ipv4.sysctl_tcp_wmem,
3463                       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3464        }
3465        net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3466        net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3467        net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3468        net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3469        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3470        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3471        atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3472
3473        /* Set default values for PLB */
3474        net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3475        net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3476        net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3477        net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3478        /* Default congestion threshold for PLB to mark a round is 50% */
3479        net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3480
3481        /* Reno is always built in */
3482        if (!net_eq(net, &init_net) &&
3483            bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3484                               init_net.ipv4.tcp_congestion_control->owner))
3485                net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3486        else
3487                net->ipv4.tcp_congestion_control = &tcp_reno;
3488
3489        net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3490        net->ipv4.sysctl_tcp_shrink_window = 0;
3491
3492        net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3493
3494        return 0;
3495}
3496
3497static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3498{
3499        struct net *net;
3500
3501        tcp_twsk_purge(net_exit_list, AF_INET);
3502
3503        list_for_each_entry(net, net_exit_list, exit_list) {
3504                inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3505                WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3506                tcp_fastopen_ctx_destroy(net);
3507        }
3508}
3509
3510static struct pernet_operations __net_initdata tcp_sk_ops = {
3511       .init       = tcp_sk_init,
3512       .exit       = tcp_sk_exit,
3513       .exit_batch = tcp_sk_exit_batch,
3514};
3515
3516#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
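    /* Hook up the "tcp" bpf iterator target.  DEFINE_BPF_ITER_FUNC()
     * declares the bpf_iter_tcp() attach signature; iterator programs
     * (typically attached via a section name such as SEC("iter/tcp"))
     * receive a struct bpf_iter__tcp context: the seq_file metadata, the
     * current sock_common (NULL on the final stop() call) and the munged
     * uid computed in bpf_iter_tcp_seq_show().
     */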
3517DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3518                     struct sock_common *sk_common, uid_t uid)
3519
3520#define INIT_BATCH_SZ 16
3521
3522static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3523{
3524        struct bpf_tcp_iter_state *iter = priv_data;
3525        int err;
3526
3527        err = bpf_iter_init_seq_net(priv_data, aux);
3528        if (err)
3529                return err;
3530
3531        err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3532        if (err) {
3533                bpf_iter_fini_seq_net(priv_data);
3534                return err;
3535        }
3536
3537        return 0;
3538}
3539
3540static void bpf_iter_fini_tcp(void *priv_data)
3541{
3542        struct bpf_tcp_iter_state *iter = priv_data;
3543
3544        bpf_iter_fini_seq_net(priv_data);
3545        kvfree(iter->batch);
3546}
3547
3548static const struct bpf_iter_seq_info tcp_seq_info = {
3549        .seq_ops                = &bpf_iter_tcp_seq_ops,
3550        .init_seq_private       = bpf_iter_init_tcp,
3551        .fini_seq_private       = bpf_iter_fini_tcp,
3552        .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3553};
3554
3555static const struct bpf_func_proto *
3556bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3557                            const struct bpf_prog *prog)
3558{
3559        switch (func_id) {
3560        case BPF_FUNC_setsockopt:
3561                return &bpf_sk_setsockopt_proto;
3562        case BPF_FUNC_getsockopt:
3563                return &bpf_sk_getsockopt_proto;
3564        default:
3565                return NULL;
3566        }
3567}
3568
3569static struct bpf_iter_reg tcp_reg_info = {
3570        .target                 = "tcp",
3571        .ctx_arg_info_size      = 1,
3572        .ctx_arg_info           = {
3573                { offsetof(struct bpf_iter__tcp, sk_common),
3574                  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3575        },
3576        .get_func_proto         = bpf_iter_tcp_get_func_proto,
3577        .seq_info               = &tcp_seq_info,
3578};
3579
3580static void __init bpf_iter_register(void)
3581{
3582        tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3583        if (bpf_iter_reg_target(&tcp_reg_info))
3584                pr_warn("Warning: could not register bpf iterator tcp\n");
3585}
3586
3587#endif
3588
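    /* Boot-time init: create one kernel control socket per possible CPU
     * (ipv4_tcp_sk), used to send RSTs and ACKs on behalf of sockets we
     * do not own (see tcp_v4_send_reset()/tcp_v4_send_ack()), hence the
     * forced PMTU discovery below.  Then register the pernet ops and,
     * when configured, the bpf iterator.
     */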
3589void __init tcp_v4_init(void)
3590{
3591        int cpu, res;
3592
3593        for_each_possible_cpu(cpu) {
3594                struct sock *sk;
3595
3596                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3597                                           IPPROTO_TCP, &init_net);
3598                if (res)
3599                        panic("Failed to create the TCP control socket.\n");
3600                sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3601
3602                /* Enforce IP_DF and IPID==0 for the RSTs and ACKs
3603                 * sent in SYN-RECV and TIME-WAIT state.
3604                 */
3605                inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3606
3607                per_cpu(ipv4_tcp_sk, cpu) = sk;
3608        }
3609        if (register_pernet_subsys(&tcp_sk_ops))
3610                panic("Failed to create the TCP control socket.\n");
3611
3612#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3613        bpf_iter_register();
3614#endif
3615}
3616