linux/net/ipv4/tcp_ipv4.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              Implementation of the Transmission Control Protocol(TCP).
   8 *
   9 *              IPv4 specific functions
  10 *
  11 *              code split from:
  12 *              linux/ipv4/tcp.c
  13 *              linux/ipv4/tcp_input.c
  14 *              linux/ipv4/tcp_output.c
  15 *
  16 *              See tcp.c for author information
  17 */
  18
  19/*
  20 * Changes:
  21 *              David S. Miller :       New socket lookup architecture.
  22 *                                      This code is dedicated to John Dyson.
  23 *              David S. Miller :       Change semantics of established hash,
  24 *                                      half is devoted to TIME_WAIT sockets
  25 *                                      and the rest go in the other half.
  26 *              Andi Kleen :            Add support for syncookies and fixed
  27 *                                      some bugs: ip options weren't passed to
  28 *                                      the TCP layer, missed a check for an
  29 *                                      ACK bit.
  30 *              Andi Kleen :            Implemented fast path mtu discovery.
  31 *                                      Fixed many serious bugs in the
  32 *                                      request_sock handling and moved
  33 *                                      most of it into the af independent code.
  34 *                                      Added tail drop and some other bugfixes.
  35 *                                      Added new listen semantics.
  36 *              Mike McLagan    :       Routing by source
  37 *      Juan Jose Ciarlante:            ip_dynaddr bits
  38 *              Andi Kleen:             various fixes.
  39 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  40 *                                      coma.
  41 *      Andi Kleen              :       Fix new listen.
  42 *      Andi Kleen              :       Fix accept error reporting.
  43 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  44 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  45 *                                      a single port at the same time.
  46 */
  47
  48#define pr_fmt(fmt) "TCP: " fmt
  49
  50#include <linux/bottom_half.h>
  51#include <linux/types.h>
  52#include <linux/fcntl.h>
  53#include <linux/module.h>
  54#include <linux/random.h>
  55#include <linux/cache.h>
  56#include <linux/jhash.h>
  57#include <linux/init.h>
  58#include <linux/times.h>
  59#include <linux/slab.h>
  60
  61#include <net/net_namespace.h>
  62#include <net/icmp.h>
  63#include <net/inet_hashtables.h>
  64#include <net/tcp.h>
  65#include <net/transp_v6.h>
  66#include <net/ipv6.h>
  67#include <net/inet_common.h>
  68#include <net/timewait_sock.h>
  69#include <net/xfrm.h>
  70#include <net/secure_seq.h>
  71#include <net/busy_poll.h>
  72
  73#include <linux/inet.h>
  74#include <linux/ipv6.h>
  75#include <linux/stddef.h>
  76#include <linux/proc_fs.h>
  77#include <linux/seq_file.h>
  78#include <linux/inetdevice.h>
  79#include <linux/btf_ids.h>
  80
  81#include <crypto/hash.h>
  82#include <linux/scatterlist.h>
  83
  84#include <trace/events/tcp.h>
  85
  86#ifdef CONFIG_TCP_MD5SIG
  87static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  88                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
  89#endif
  90
  91struct inet_hashinfo tcp_hashinfo;
  92EXPORT_SYMBOL(tcp_hashinfo);
  93
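     /* Per-CPU kernel control sockets, used by tcp_v4_send_reset() and
      * tcp_v4_send_ack() below to transmit replies that are not tied to a
      * full socket of their own.
      */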
  94static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
  95
  96static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  97{
  98        return secure_tcp_seq(ip_hdr(skb)->daddr,
  99                              ip_hdr(skb)->saddr,
 100                              tcp_hdr(skb)->dest,
 101                              tcp_hdr(skb)->source);
 102}
 103
 104static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 105{
 106        return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 107}
 108
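     /* Decide whether a connect() whose four-tuple collides with a
      * TIME-WAIT socket may reuse that tuple.  A nonzero return tells the
      * caller (the connect-time hash insertion path) to go ahead; the new
      * socket then inherits the old timestamp state unless TCP repair is
      * in use.
      */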
 109int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 110{
 111        int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
 112        const struct inet_timewait_sock *tw = inet_twsk(sktw);
 113        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 114        struct tcp_sock *tp = tcp_sk(sk);
 115
 116        if (reuse == 2) {
 117                /* Still does not detect *everything* that goes through
 118                 * lo, since we require a loopback src or dst address
 119                 * or direct binding to 'lo' interface.
 120                 */
 121                bool loopback = false;
 122                if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 123                        loopback = true;
 124#if IS_ENABLED(CONFIG_IPV6)
 125                if (tw->tw_family == AF_INET6) {
 126                        if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 127                            ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
 128                            ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 129                            ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
 130                                loopback = true;
 131                } else
 132#endif
 133                {
 134                        if (ipv4_is_loopback(tw->tw_daddr) ||
 135                            ipv4_is_loopback(tw->tw_rcv_saddr))
 136                                loopback = true;
 137                }
 138                if (!loopback)
 139                        reuse = 0;
 140        }
 141
  142        /* With PAWS, it is safe from the viewpoint
  143           of data integrity. Even without PAWS it is safe provided sequence
  144           spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
  145
  146           Actually, the idea is close to VJ's, only the timestamp cache is
  147           held not per host but per port pair, and the TW bucket is used as
  148           the state holder.
  149
  150           If the TW bucket has already been destroyed, we fall back to VJ's
  151           scheme and use the initial timestamp retrieved from the peer table.
  152         */
 153        if (tcptw->tw_ts_recent_stamp &&
 154            (!twp || (reuse && time_after32(ktime_get_seconds(),
 155                                            tcptw->tw_ts_recent_stamp)))) {
 156                /* In case of repair and re-using TIME-WAIT sockets we still
 157                 * want to be sure that it is safe as above but honor the
 158                 * sequence numbers and time stamps set as part of the repair
 159                 * process.
 160                 *
 161                 * Without this check re-using a TIME-WAIT socket with TCP
 162                 * repair would accumulate a -1 on the repair assigned
 163                 * sequence number. The first time it is reused the sequence
 164                 * is -1, the second time -2, etc. This fixes that issue
 165                 * without appearing to create any others.
 166                 */
 167                if (likely(!tp->repair)) {
 168                        u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
 169
 170                        if (!seq)
 171                                seq = 1;
 172                        WRITE_ONCE(tp->write_seq, seq);
 173                        tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 174                        tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 175                }
 176                sock_hold(sktw);
 177                return 1;
 178        }
 179
 180        return 0;
 181}
 182EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 183
 184static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 185                              int addr_len)
 186{
  187        /* This check is replicated from tcp_v4_connect() and intended to
  188         * prevent the BPF program called below from accessing bytes that are
  189         * outside of the bound specified by the user in addr_len.
  190         */
 191        if (addr_len < sizeof(struct sockaddr_in))
 192                return -EINVAL;
 193
 194        sock_owned_by_me(sk);
 195
 196        return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 197}
 198
 199/* This will initiate an outgoing connection. */
 200int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 201{
 202        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 203        struct inet_sock *inet = inet_sk(sk);
 204        struct tcp_sock *tp = tcp_sk(sk);
 205        __be16 orig_sport, orig_dport;
 206        __be32 daddr, nexthop;
 207        struct flowi4 *fl4;
 208        struct rtable *rt;
 209        int err;
 210        struct ip_options_rcu *inet_opt;
 211        struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
 212
 213        if (addr_len < sizeof(struct sockaddr_in))
 214                return -EINVAL;
 215
 216        if (usin->sin_family != AF_INET)
 217                return -EAFNOSUPPORT;
 218
 219        nexthop = daddr = usin->sin_addr.s_addr;
 220        inet_opt = rcu_dereference_protected(inet->inet_opt,
 221                                             lockdep_sock_is_held(sk));
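             /* With an IP source-route option, route towards its first hop
              * (faddr) rather than towards the final destination.
              */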
 222        if (inet_opt && inet_opt->opt.srr) {
 223                if (!daddr)
 224                        return -EINVAL;
 225                nexthop = inet_opt->opt.faddr;
 226        }
 227
 228        orig_sport = inet->inet_sport;
 229        orig_dport = usin->sin_port;
 230        fl4 = &inet->cork.fl.u.ip4;
 231        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 232                              sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
 233                              orig_dport, sk);
 234        if (IS_ERR(rt)) {
 235                err = PTR_ERR(rt);
 236                if (err == -ENETUNREACH)
 237                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 238                return err;
 239        }
 240
 241        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 242                ip_rt_put(rt);
 243                return -ENETUNREACH;
 244        }
 245
 246        if (!inet_opt || !inet_opt->opt.srr)
 247                daddr = fl4->daddr;
 248
 249        if (!inet->inet_saddr)
 250                inet->inet_saddr = fl4->saddr;
 251        sk_rcv_saddr_set(sk, inet->inet_saddr);
 252
 253        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 254                /* Reset inherited state */
 255                tp->rx_opt.ts_recent       = 0;
 256                tp->rx_opt.ts_recent_stamp = 0;
 257                if (likely(!tp->repair))
 258                        WRITE_ONCE(tp->write_seq, 0);
 259        }
 260
 261        inet->inet_dport = usin->sin_port;
 262        sk_daddr_set(sk, daddr);
 263
 264        inet_csk(sk)->icsk_ext_hdr_len = 0;
 265        if (inet_opt)
 266                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 267
 268        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 269
  270        /* Socket identity is still unknown (sport may be zero).
  271         * However we set state to SYN-SENT and, without releasing the
  272         * socket lock, select a source port, enter ourselves into the
  273         * hash tables and complete initialization after this.
  274         */
 275        tcp_set_state(sk, TCP_SYN_SENT);
 276        err = inet_hash_connect(tcp_death_row, sk);
 277        if (err)
 278                goto failure;
 279
 280        sk_set_txhash(sk);
 281
 282        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 283                               inet->inet_sport, inet->inet_dport, sk);
 284        if (IS_ERR(rt)) {
 285                err = PTR_ERR(rt);
 286                rt = NULL;
 287                goto failure;
 288        }
 289        /* OK, now commit destination to socket.  */
 290        sk->sk_gso_type = SKB_GSO_TCPV4;
 291        sk_setup_caps(sk, &rt->dst);
 292        rt = NULL;
 293
 294        if (likely(!tp->repair)) {
 295                if (!tp->write_seq)
 296                        WRITE_ONCE(tp->write_seq,
 297                                   secure_tcp_seq(inet->inet_saddr,
 298                                                  inet->inet_daddr,
 299                                                  inet->inet_sport,
 300                                                  usin->sin_port));
 301                tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
 302                                                 inet->inet_saddr,
 303                                                 inet->inet_daddr);
 304        }
 305
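             /* Start the IP ID sequence for this connection at a random value. */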
 306        inet->inet_id = prandom_u32();
 307
 308        if (tcp_fastopen_defer_connect(sk, &err))
 309                return err;
 310        if (err)
 311                goto failure;
 312
 313        err = tcp_connect(sk);
 314
 315        if (err)
 316                goto failure;
 317
 318        return 0;
 319
 320failure:
 321        /*
 322         * This unhashes the socket and releases the local port,
 323         * if necessary.
 324         */
 325        tcp_set_state(sk, TCP_CLOSE);
 326        ip_rt_put(rt);
 327        sk->sk_route_caps = 0;
 328        inet->inet_dport = 0;
 329        return err;
 330}
 331EXPORT_SYMBOL(tcp_v4_connect);
 332
 333/*
 334 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 335 * It can be called through tcp_release_cb() if socket was owned by user
 336 * at the time tcp_v4_err() was called to handle ICMP message.
 337 */
 338void tcp_v4_mtu_reduced(struct sock *sk)
 339{
 340        struct inet_sock *inet = inet_sk(sk);
 341        struct dst_entry *dst;
 342        u32 mtu;
 343
 344        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 345                return;
 346        mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
 347        dst = inet_csk_update_pmtu(sk, mtu);
 348        if (!dst)
 349                return;
 350
  351        /* Something is about to go wrong... Remember the soft error
  352         * in case this connection is not able to recover.
  353         */
 354        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 355                sk->sk_err_soft = EMSGSIZE;
 356
 357        mtu = dst_mtu(dst);
 358
 359        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 360            ip_sk_accept_pmtu(sk) &&
 361            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 362                tcp_sync_mss(sk, mtu);
 363
 364                /* Resend the TCP packet because it's
 365                 * clear that the old packet has been
 366                 * dropped. This is the new "fast" path mtu
 367                 * discovery.
 368                 */
 369                tcp_simple_retransmit(sk);
 370        } /* else let the usual retransmit timer handle it */
 371}
 372EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 373
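     /* Apply an ICMP redirect to the socket's cached route, if the socket
      * still holds a valid dst.
      */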
 374static void do_redirect(struct sk_buff *skb, struct sock *sk)
 375{
 376        struct dst_entry *dst = __sk_dst_check(sk, 0);
 377
 378        if (dst)
 379                dst->ops->redirect(dst, sk, skb);
 380}
 381
 382
 383/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 384void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 385{
 386        struct request_sock *req = inet_reqsk(sk);
 387        struct net *net = sock_net(sk);
 388
 389        /* ICMPs are not backlogged, hence we cannot get
 390         * an established socket here.
 391         */
 392        if (seq != tcp_rsk(req)->snt_isn) {
 393                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 394        } else if (abort) {
 395                /*
 396                 * Still in SYN_RECV, just remove it silently.
 397                 * There is no good way to pass the error to the newly
 398                 * created socket, and POSIX does not want network
 399                 * errors returned from accept().
 400                 */
 401                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 402                tcp_listendrop(req->rsk_listener);
 403        }
 404        reqsk_put(req);
 405}
 406EXPORT_SYMBOL(tcp_req_err);
 407
 408/* TCP-LD (RFC 6069) logic */
 409void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
 410{
 411        struct inet_connection_sock *icsk = inet_csk(sk);
 412        struct tcp_sock *tp = tcp_sk(sk);
 413        struct sk_buff *skb;
 414        s32 remaining;
 415        u32 delta_us;
 416
 417        if (sock_owned_by_user(sk))
 418                return;
 419
 420        if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 421            !icsk->icsk_backoff)
 422                return;
 423
 424        skb = tcp_rtx_queue_head(sk);
 425        if (WARN_ON_ONCE(!skb))
 426                return;
 427
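             /* Undo one exponential backoff step, recompute the RTO from the
              * current SRTT, and below either re-arm the retransmit timer with
              * the remaining time or retransmit immediately.
              */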
 428        icsk->icsk_backoff--;
 429        icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
 430        icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 431
 432        tcp_mstamp_refresh(tp);
 433        delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 434        remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
 435
 436        if (remaining > 0) {
 437                inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 438                                          remaining, TCP_RTO_MAX);
 439        } else {
 440                /* RTO revert clocked out retransmission.
 441                 * Will retransmit now.
 442                 */
 443                tcp_retransmit_timer(sk);
 444        }
 445}
 446EXPORT_SYMBOL(tcp_ld_RTO_revert);
 447
 448/*
 449 * This routine is called by the ICMP module when it gets some
 450 * sort of error condition.  If err < 0 then the socket should
 451 * be closed and the error returned to the user.  If err > 0
 452 * it's just the icmp type << 8 | icmp code.  After adjustment
 453 * header points to the first 8 bytes of the tcp header.  We need
 454 * to find the appropriate port.
 455 *
 456 * The locking strategy used here is very "optimistic". When
 457 * someone else accesses the socket the ICMP is just dropped
 458 * and for some paths there is no check at all.
 459 * A more general error queue to queue errors for later handling
 460 * is probably better.
 461 *
 462 */
 463
 464int tcp_v4_err(struct sk_buff *skb, u32 info)
 465{
 466        const struct iphdr *iph = (const struct iphdr *)skb->data;
 467        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 468        struct tcp_sock *tp;
 469        struct inet_sock *inet;
 470        const int type = icmp_hdr(skb)->type;
 471        const int code = icmp_hdr(skb)->code;
 472        struct sock *sk;
 473        struct request_sock *fastopen;
 474        u32 seq, snd_una;
 475        int err;
 476        struct net *net = dev_net(skb->dev);
 477
 478        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 479                                       th->dest, iph->saddr, ntohs(th->source),
 480                                       inet_iif(skb), 0);
 481        if (!sk) {
 482                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 483                return -ENOENT;
 484        }
 485        if (sk->sk_state == TCP_TIME_WAIT) {
 486                inet_twsk_put(inet_twsk(sk));
 487                return 0;
 488        }
 489        seq = ntohl(th->seq);
 490        if (sk->sk_state == TCP_NEW_SYN_RECV) {
 491                tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
 492                                     type == ICMP_TIME_EXCEEDED ||
 493                                     (type == ICMP_DEST_UNREACH &&
 494                                      (code == ICMP_NET_UNREACH ||
 495                                       code == ICMP_HOST_UNREACH)));
 496                return 0;
 497        }
 498
 499        bh_lock_sock(sk);
  500        /* If too many ICMPs get dropped on busy
  501         * servers this needs to be solved differently.
  502         * We do take care of the PMTU discovery (RFC1191) special case:
  503         * we can receive locally generated ICMP messages while the socket is held.
  504         */
 505        if (sock_owned_by_user(sk)) {
 506                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 507                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 508        }
 509        if (sk->sk_state == TCP_CLOSE)
 510                goto out;
 511
 512        if (static_branch_unlikely(&ip4_min_ttl)) {
 513                /* min_ttl can be changed concurrently from do_ip_setsockopt() */
 514                if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
 515                        __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 516                        goto out;
 517                }
 518        }
 519
 520        tp = tcp_sk(sk);
 521        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 522        fastopen = rcu_dereference(tp->fastopen_rsk);
 523        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 524        if (sk->sk_state != TCP_LISTEN &&
 525            !between(seq, snd_una, tp->snd_nxt)) {
 526                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 527                goto out;
 528        }
 529
 530        switch (type) {
 531        case ICMP_REDIRECT:
 532                if (!sock_owned_by_user(sk))
 533                        do_redirect(skb, sk);
 534                goto out;
 535        case ICMP_SOURCE_QUENCH:
 536                /* Just silently ignore these. */
 537                goto out;
 538        case ICMP_PARAMETERPROB:
 539                err = EPROTO;
 540                break;
 541        case ICMP_DEST_UNREACH:
 542                if (code > NR_ICMP_UNREACH)
 543                        goto out;
 544
 545                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
  546                        /* We are not interested in TCP_LISTEN and open_requests
  547                         * (SYN-ACKs sent out by Linux are always < 576 bytes so
  548                         * they should go through unfragmented).
  549                         */
 550                        if (sk->sk_state == TCP_LISTEN)
 551                                goto out;
 552
 553                        WRITE_ONCE(tp->mtu_info, info);
 554                        if (!sock_owned_by_user(sk)) {
 555                                tcp_v4_mtu_reduced(sk);
 556                        } else {
 557                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 558                                        sock_hold(sk);
 559                        }
 560                        goto out;
 561                }
 562
 563                err = icmp_err_convert[code].errno;
 564                /* check if this ICMP message allows revert of backoff.
 565                 * (see RFC 6069)
 566                 */
 567                if (!fastopen &&
 568                    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
 569                        tcp_ld_RTO_revert(sk, seq);
 570                break;
 571        case ICMP_TIME_EXCEEDED:
 572                err = EHOSTUNREACH;
 573                break;
 574        default:
 575                goto out;
 576        }
 577
 578        switch (sk->sk_state) {
 579        case TCP_SYN_SENT:
 580        case TCP_SYN_RECV:
 581                /* Only in fast or simultaneous open. If a fast open socket is
 582                 * already accepted it is treated as a connected one below.
 583                 */
 584                if (fastopen && !fastopen->sk)
 585                        break;
 586
 587                ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
 588
 589                if (!sock_owned_by_user(sk)) {
 590                        sk->sk_err = err;
 591
 592                        sk_error_report(sk);
 593
 594                        tcp_done(sk);
 595                } else {
 596                        sk->sk_err_soft = err;
 597                }
 598                goto out;
 599        }
 600
 601        /* If we've already connected we will keep trying
 602         * until we time out, or the user gives up.
 603         *
  604         * rfc1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
  605         * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
  606         * obsoleted by pmtu discovery).
  607         *
  608         * Note that in the modern internet, where routing is unreliable
  609         * and broken firewalls sit in every dark corner sending random
  610         * errors ordered by their masters, even these two messages
  611         * finally lose their original sense (even Linux sends invalid PORT_UNREACHs).
 612         *
 613         * Now we are in compliance with RFCs.
 614         *                                                      --ANK (980905)
 615         */
 616
 617        inet = inet_sk(sk);
 618        if (!sock_owned_by_user(sk) && inet->recverr) {
 619                sk->sk_err = err;
 620                sk_error_report(sk);
 621        } else  { /* Only an error on timeout */
 622                sk->sk_err_soft = err;
 623        }
 624
 625out:
 626        bh_unlock_sock(sk);
 627        sock_put(sk);
 628        return 0;
 629}
 630
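     /* Fill in the complement of the pseudo-header checksum and record where
      * the TCP checksum field lives, so that the NIC or the GSO path can
      * complete it (CHECKSUM_PARTIAL).
      */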
 631void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 632{
 633        struct tcphdr *th = tcp_hdr(skb);
 634
 635        th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 636        skb->csum_start = skb_transport_header(skb) - skb->head;
 637        skb->csum_offset = offsetof(struct tcphdr, check);
 638}
 639
 640/* This routine computes an IPv4 TCP checksum. */
 641void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 642{
 643        const struct inet_sock *inet = inet_sk(sk);
 644
 645        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 646}
 647EXPORT_SYMBOL(tcp_v4_send_check);
 648
 649/*
 650 *      This routine will send an RST to the other tcp.
 651 *
  652 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
  653 *                    for the reset?
  654 *      Answer: if a packet caused the RST, it is not for a socket
  655 *              existing in our system; if it does match a socket,
  656 *              it is just a duplicate segment or a bug in the other side's TCP.
  657 *              So we build the reply based only on the parameters
  658 *              that arrived with the segment.
 659 *      Exception: precedence violation. We do not implement it in any case.
 660 */
 661
 662#ifdef CONFIG_TCP_MD5SIG
 663#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
 664#else
 665#define OPTION_BYTES sizeof(__be32)
 666#endif
 667
 668static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 669{
 670        const struct tcphdr *th = tcp_hdr(skb);
 671        struct {
 672                struct tcphdr th;
 673                __be32 opt[OPTION_BYTES / sizeof(__be32)];
 674        } rep;
 675        struct ip_reply_arg arg;
 676#ifdef CONFIG_TCP_MD5SIG
 677        struct tcp_md5sig_key *key = NULL;
 678        const __u8 *hash_location = NULL;
 679        unsigned char newhash[16];
 680        int genhash;
 681        struct sock *sk1 = NULL;
 682#endif
 683        u64 transmit_time = 0;
 684        struct sock *ctl_sk;
 685        struct net *net;
 686
 687        /* Never send a reset in response to a reset. */
 688        if (th->rst)
 689                return;
 690
 691        /* If sk not NULL, it means we did a successful lookup and incoming
 692         * route had to be correct. prequeue might have dropped our dst.
 693         */
 694        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 695                return;
 696
 697        /* Swap the send and the receive. */
 698        memset(&rep, 0, sizeof(rep));
 699        rep.th.dest   = th->source;
 700        rep.th.source = th->dest;
 701        rep.th.doff   = sizeof(struct tcphdr) / 4;
 702        rep.th.rst    = 1;
 703
 704        if (th->ack) {
 705                rep.th.seq = th->ack_seq;
 706        } else {
 707                rep.th.ack = 1;
 708                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 709                                       skb->len - (th->doff << 2));
 710        }
 711
 712        memset(&arg, 0, sizeof(arg));
 713        arg.iov[0].iov_base = (unsigned char *)&rep;
 714        arg.iov[0].iov_len  = sizeof(rep.th);
 715
 716        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 717#ifdef CONFIG_TCP_MD5SIG
 718        rcu_read_lock();
 719        hash_location = tcp_parse_md5sig_option(th);
 720        if (sk && sk_fullsock(sk)) {
 721                const union tcp_md5_addr *addr;
 722                int l3index;
 723
 724                /* sdif set, means packet ingressed via a device
 725                 * in an L3 domain and inet_iif is set to it.
 726                 */
 727                l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 728                addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 729                key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
 730        } else if (hash_location) {
 731                const union tcp_md5_addr *addr;
 732                int sdif = tcp_v4_sdif(skb);
 733                int dif = inet_iif(skb);
 734                int l3index;
 735
  736                /*
  737                 * The active side is gone. Try to find the listening socket
  738                 * via the source port, and then find the md5 key through
  739                 * that listening socket. We do not lose any security here:
  740                 * the incoming packet is checked against the md5 hash of the
  741                 * found key, and no RST is generated if the hash doesn't match.
  742                 */
 743                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 744                                             ip_hdr(skb)->saddr,
 745                                             th->source, ip_hdr(skb)->daddr,
 746                                             ntohs(th->source), dif, sdif);
 747                /* don't send rst if it can't find key */
 748                if (!sk1)
 749                        goto out;
 750
 751                /* sdif set, means packet ingressed via a device
 752                 * in an L3 domain and dif is set to it.
 753                 */
 754                l3index = sdif ? dif : 0;
 755                addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 756                key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
 757                if (!key)
 758                        goto out;
 759
 760
 761                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 762                if (genhash || memcmp(hash_location, newhash, 16) != 0)
 763                        goto out;
 764
 765        }
 766
 767        if (key) {
 768                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 769                                   (TCPOPT_NOP << 16) |
 770                                   (TCPOPT_MD5SIG << 8) |
 771                                   TCPOLEN_MD5SIG);
 772                /* Update length and the length the header thinks exists */
 773                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 774                rep.th.doff = arg.iov[0].iov_len / 4;
 775
 776                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 777                                     key, ip_hdr(skb)->saddr,
 778                                     ip_hdr(skb)->daddr, &rep.th);
 779        }
 780#endif
 781        /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
 782        if (rep.opt[0] == 0) {
 783                __be32 mrst = mptcp_reset_option(skb);
 784
 785                if (mrst) {
 786                        rep.opt[0] = mrst;
 787                        arg.iov[0].iov_len += sizeof(mrst);
 788                        rep.th.doff = arg.iov[0].iov_len / 4;
 789                }
 790        }
 791
 792        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 793                                      ip_hdr(skb)->saddr, /* XXX */
 794                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 795        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 796        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 797
  798        /* When the socket is gone, all binding information is lost and
  799         * routing might fail. There is no choice here: if we force the
  800         * input interface, we will misroute in case of an asymmetric route.
  801         */
 802        if (sk) {
 803                arg.bound_dev_if = sk->sk_bound_dev_if;
 804                if (sk_fullsock(sk))
 805                        trace_tcp_send_reset(sk, skb);
 806        }
 807
 808        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 809                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 810
 811        arg.tos = ip_hdr(skb)->tos;
 812        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 813        local_bh_disable();
 814        ctl_sk = this_cpu_read(ipv4_tcp_sk);
 815        sock_net_set(ctl_sk, net);
 816        if (sk) {
 817                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 818                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
 819                ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 820                                   inet_twsk(sk)->tw_priority : sk->sk_priority;
 821                transmit_time = tcp_transmit_time(sk);
 822                xfrm_sk_clone_policy(ctl_sk, sk);
 823        }
 824        ip_send_unicast_reply(ctl_sk,
 825                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 826                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 827                              &arg, arg.iov[0].iov_len,
 828                              transmit_time);
 829
 830        ctl_sk->sk_mark = 0;
 831        xfrm_sk_free_policy(ctl_sk);
 832        sock_net_set(ctl_sk, &init_net);
 833        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 834        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 835        local_bh_enable();
 836
 837#ifdef CONFIG_TCP_MD5SIG
 838out:
 839        rcu_read_unlock();
 840#endif
 841}
 842
  843/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
  844   outside of socket context, is certainly ugly. What can I do?
  845 */
 846
 847static void tcp_v4_send_ack(const struct sock *sk,
 848                            struct sk_buff *skb, u32 seq, u32 ack,
 849                            u32 win, u32 tsval, u32 tsecr, int oif,
 850                            struct tcp_md5sig_key *key,
 851                            int reply_flags, u8 tos)
 852{
 853        const struct tcphdr *th = tcp_hdr(skb);
 854        struct {
 855                struct tcphdr th;
 856                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 857#ifdef CONFIG_TCP_MD5SIG
 858                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 859#endif
 860                        ];
 861        } rep;
 862        struct net *net = sock_net(sk);
 863        struct ip_reply_arg arg;
 864        struct sock *ctl_sk;
 865        u64 transmit_time;
 866
 867        memset(&rep.th, 0, sizeof(struct tcphdr));
 868        memset(&arg, 0, sizeof(arg));
 869
 870        arg.iov[0].iov_base = (unsigned char *)&rep;
 871        arg.iov[0].iov_len  = sizeof(rep.th);
 872        if (tsecr) {
 873                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 874                                   (TCPOPT_TIMESTAMP << 8) |
 875                                   TCPOLEN_TIMESTAMP);
 876                rep.opt[1] = htonl(tsval);
 877                rep.opt[2] = htonl(tsecr);
 878                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 879        }
 880
 881        /* Swap the send and the receive. */
 882        rep.th.dest    = th->source;
 883        rep.th.source  = th->dest;
 884        rep.th.doff    = arg.iov[0].iov_len / 4;
 885        rep.th.seq     = htonl(seq);
 886        rep.th.ack_seq = htonl(ack);
 887        rep.th.ack     = 1;
 888        rep.th.window  = htons(win);
 889
 890#ifdef CONFIG_TCP_MD5SIG
 891        if (key) {
 892                int offset = (tsecr) ? 3 : 0;
 893
 894                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 895                                          (TCPOPT_NOP << 16) |
 896                                          (TCPOPT_MD5SIG << 8) |
 897                                          TCPOLEN_MD5SIG);
 898                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 899                rep.th.doff = arg.iov[0].iov_len/4;
 900
 901                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 902                                    key, ip_hdr(skb)->saddr,
 903                                    ip_hdr(skb)->daddr, &rep.th);
 904        }
 905#endif
 906        arg.flags = reply_flags;
 907        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 908                                      ip_hdr(skb)->saddr, /* XXX */
 909                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 910        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 911        if (oif)
 912                arg.bound_dev_if = oif;
 913        arg.tos = tos;
 914        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 915        local_bh_disable();
 916        ctl_sk = this_cpu_read(ipv4_tcp_sk);
 917        sock_net_set(ctl_sk, net);
 918        ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 919                           inet_twsk(sk)->tw_mark : sk->sk_mark;
 920        ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 921                           inet_twsk(sk)->tw_priority : sk->sk_priority;
 922        transmit_time = tcp_transmit_time(sk);
 923        ip_send_unicast_reply(ctl_sk,
 924                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 925                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 926                              &arg, arg.iov[0].iov_len,
 927                              transmit_time);
 928
 929        ctl_sk->sk_mark = 0;
 930        sock_net_set(ctl_sk, &init_net);
 931        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 932        local_bh_enable();
 933}
 934
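     /* ACK on behalf of a TIME-WAIT socket, using the window, timestamp and
      * sequence state preserved in its tcp_timewait_sock.
      */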
 935static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 936{
 937        struct inet_timewait_sock *tw = inet_twsk(sk);
 938        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 939
 940        tcp_v4_send_ack(sk, skb,
 941                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 942                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 943                        tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 944                        tcptw->tw_ts_recent,
 945                        tw->tw_bound_dev_if,
 946                        tcp_twsk_md5_key(tcptw),
 947                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 948                        tw->tw_tos
 949                        );
 950
 951        inet_twsk_put(tw);
 952}
 953
 954static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 955                                  struct request_sock *req)
 956{
 957        const union tcp_md5_addr *addr;
 958        int l3index;
 959
 960        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 961         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 962         */
 963        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 964                                             tcp_sk(sk)->snd_nxt;
 965
 966        /* RFC 7323 2.3
 967         * The window field (SEG.WND) of every outgoing segment, with the
 968         * exception of <SYN> segments, MUST be right-shifted by
 969         * Rcv.Wind.Shift bits:
 970         */
 971        addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 972        l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 973        tcp_v4_send_ack(sk, skb, seq,
 974                        tcp_rsk(req)->rcv_nxt,
 975                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 976                        tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 977                        req->ts_recent,
 978                        0,
 979                        tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
 980                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 981                        ip_hdr(skb)->tos);
 982}
 983
 984/*
 985 *      Send a SYN-ACK after having received a SYN.
 986 *      This still operates on a request_sock only, not on a big
 987 *      socket.
 988 */
 989static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 990                              struct flowi *fl,
 991                              struct request_sock *req,
 992                              struct tcp_fastopen_cookie *foc,
 993                              enum tcp_synack_type synack_type,
 994                              struct sk_buff *syn_skb)
 995{
 996        const struct inet_request_sock *ireq = inet_rsk(req);
 997        struct flowi4 fl4;
 998        int err = -1;
 999        struct sk_buff *skb;
1000        u8 tos;
1001
1002        /* First, grab a route. */
1003        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1004                return -1;
1005
1006        skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1007
1008        if (skb) {
1009                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1010
1011                tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1012                                (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1013                                (inet_sk(sk)->tos & INET_ECN_MASK) :
1014                                inet_sk(sk)->tos;
1015
1016                if (!INET_ECN_is_capable(tos) &&
1017                    tcp_bpf_ca_needs_ecn((struct sock *)req))
1018                        tos |= INET_ECN_ECT_0;
1019
1020                rcu_read_lock();
1021                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1022                                            ireq->ir_rmt_addr,
1023                                            rcu_dereference(ireq->ireq_opt),
1024                                            tos);
1025                rcu_read_unlock();
1026                err = net_xmit_eval(err);
1027        }
1028
1029        return err;
1030}
1031
1032/*
1033 *      IPv4 request_sock destructor.
1034 */
1035static void tcp_v4_reqsk_destructor(struct request_sock *req)
1036{
1037        kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1038}
1039
1040#ifdef CONFIG_TCP_MD5SIG
1041/*
1042 * RFC2385 MD5 checksumming requires a mapping of
1043 * IP address->MD5 Key.
1044 * We need to maintain these in the sk structure.
1045 */
1046
1047DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1048EXPORT_SYMBOL(tcp_md5_needed);
1049
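     /* Choose between two candidate keys: a key bound to an L3 domain always
      * beats an unbound one; otherwise the longer prefix wins.
      */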
1050static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1051{
1052        if (!old)
1053                return true;
1054
1055        /* l3index always overrides non-l3index */
1056        if (old->l3index && new->l3index == 0)
1057                return false;
1058        if (old->l3index == 0 && new->l3index)
1059                return true;
1060
1061        return old->prefixlen < new->prefixlen;
1062}
1063
1064/* Find the Key structure for an address.  */
1065struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1066                                           const union tcp_md5_addr *addr,
1067                                           int family)
1068{
1069        const struct tcp_sock *tp = tcp_sk(sk);
1070        struct tcp_md5sig_key *key;
1071        const struct tcp_md5sig_info *md5sig;
1072        __be32 mask;
1073        struct tcp_md5sig_key *best_match = NULL;
1074        bool match;
1075
1076        /* caller either holds rcu_read_lock() or socket lock */
1077        md5sig = rcu_dereference_check(tp->md5sig_info,
1078                                       lockdep_sock_is_held(sk));
1079        if (!md5sig)
1080                return NULL;
1081
1082        hlist_for_each_entry_rcu(key, &md5sig->head, node,
1083                                 lockdep_sock_is_held(sk)) {
1084                if (key->family != family)
1085                        continue;
1086                if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1087                        continue;
1088                if (family == AF_INET) {
1089                        mask = inet_make_mask(key->prefixlen);
1090                        match = (key->addr.a4.s_addr & mask) ==
1091                                (addr->a4.s_addr & mask);
1092#if IS_ENABLED(CONFIG_IPV6)
1093                } else if (family == AF_INET6) {
1094                        match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1095                                                  key->prefixlen);
1096#endif
1097                } else {
1098                        match = false;
1099                }
1100
1101                if (match && better_md5_match(best_match, key))
1102                        best_match = key;
1103        }
1104        return best_match;
1105}
1106EXPORT_SYMBOL(__tcp_md5_do_lookup);
1107
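     /* Exact-match lookup: family, ifindex flag, l3index, address and prefix
      * length must all match, in contrast to the best-match semantics of
      * __tcp_md5_do_lookup() above.
      */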
1108static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1109                                                      const union tcp_md5_addr *addr,
1110                                                      int family, u8 prefixlen,
1111                                                      int l3index, u8 flags)
1112{
1113        const struct tcp_sock *tp = tcp_sk(sk);
1114        struct tcp_md5sig_key *key;
1115        unsigned int size = sizeof(struct in_addr);
1116        const struct tcp_md5sig_info *md5sig;
1117
1118        /* caller either holds rcu_read_lock() or socket lock */
1119        md5sig = rcu_dereference_check(tp->md5sig_info,
1120                                       lockdep_sock_is_held(sk));
1121        if (!md5sig)
1122                return NULL;
1123#if IS_ENABLED(CONFIG_IPV6)
1124        if (family == AF_INET6)
1125                size = sizeof(struct in6_addr);
1126#endif
1127        hlist_for_each_entry_rcu(key, &md5sig->head, node,
1128                                 lockdep_sock_is_held(sk)) {
1129                if (key->family != family)
1130                        continue;
1131                if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1132                        continue;
1133                if (key->l3index != l3index)
1134                        continue;
1135                if (!memcmp(&key->addr, addr, size) &&
1136                    key->prefixlen == prefixlen)
1137                        return key;
1138        }
1139        return NULL;
1140}
1141
1142struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1143                                         const struct sock *addr_sk)
1144{
1145        const union tcp_md5_addr *addr;
1146        int l3index;
1147
1148        l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1149                                                 addr_sk->sk_bound_dev_if);
1150        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1151        return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1152}
1153EXPORT_SYMBOL(tcp_v4_md5_lookup);
1154
1155/* This can be called on a newly created socket, from other files */
1156int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1157                   int family, u8 prefixlen, int l3index, u8 flags,
1158                   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1159{
1160        /* Add Key to the list */
1161        struct tcp_md5sig_key *key;
1162        struct tcp_sock *tp = tcp_sk(sk);
1163        struct tcp_md5sig_info *md5sig;
1164
1165        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1166        if (key) {
1167                /* Pre-existing entry - just update that one.
1168                 * Note that the key might be used concurrently.
 1169                 * data_race() is telling KCSAN that we do not care about
 1170                 * key mismatches, since changing the MD5 key on live flows
 1171                 * can lead to packet drops.
1172                 */
1173                data_race(memcpy(key->key, newkey, newkeylen));
1174
1175                /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1176                 * Also note that a reader could catch new key->keylen value
1177                 * but old key->key[], this is the reason we use __GFP_ZERO
1178                 * at sock_kmalloc() time below these lines.
1179                 */
1180                WRITE_ONCE(key->keylen, newkeylen);
1181
1182                return 0;
1183        }
1184
1185        md5sig = rcu_dereference_protected(tp->md5sig_info,
1186                                           lockdep_sock_is_held(sk));
1187        if (!md5sig) {
1188                md5sig = kmalloc(sizeof(*md5sig), gfp);
1189                if (!md5sig)
1190                        return -ENOMEM;
1191
1192                sk_gso_disable(sk);
1193                INIT_HLIST_HEAD(&md5sig->head);
1194                rcu_assign_pointer(tp->md5sig_info, md5sig);
1195        }
1196
1197        key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1198        if (!key)
1199                return -ENOMEM;
1200        if (!tcp_alloc_md5sig_pool()) {
1201                sock_kfree_s(sk, key, sizeof(*key));
1202                return -ENOMEM;
1203        }
1204
1205        memcpy(key->key, newkey, newkeylen);
1206        key->keylen = newkeylen;
1207        key->family = family;
1208        key->prefixlen = prefixlen;
1209        key->l3index = l3index;
1210        key->flags = flags;
1211        memcpy(&key->addr, addr,
1212               (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1213                                                                 sizeof(struct in_addr));
1214        hlist_add_head_rcu(&key->node, &md5sig->head);
1215        return 0;
1216}
1217EXPORT_SYMBOL(tcp_md5_do_add);
1218
1219int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1220                   u8 prefixlen, int l3index, u8 flags)
1221{
1222        struct tcp_md5sig_key *key;
1223
1224        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1225        if (!key)
1226                return -ENOENT;
1227        hlist_del_rcu(&key->node);
1228        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1229        kfree_rcu(key, rcu);
1230        return 0;
1231}
1232EXPORT_SYMBOL(tcp_md5_do_del);
1233
1234static void tcp_clear_md5_list(struct sock *sk)
1235{
1236        struct tcp_sock *tp = tcp_sk(sk);
1237        struct tcp_md5sig_key *key;
1238        struct hlist_node *n;
1239        struct tcp_md5sig_info *md5sig;
1240
1241        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1242
1243        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1244                hlist_del_rcu(&key->node);
1245                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1246                kfree_rcu(key, rcu);
1247        }
1248}
1249
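     /* Handle the TCP_MD5SIG / TCP_MD5SIG_EXT setsockopt()s: parse the
      * struct tcp_md5sig supplied by userspace and add the key, or delete it
      * when tcpm_keylen is zero.
      *
      * Userspace usage sketch (illustration only, not part of this file):
      *
      *	struct tcp_md5sig md5 = { };
      *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
      *
      *	sin->sin_family = AF_INET;
      *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");
      *	md5.tcpm_keylen = 6;
      *	memcpy(md5.tcpm_key, "secret", 6);
      *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
      */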
1250static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1251                                 sockptr_t optval, int optlen)
1252{
1253        struct tcp_md5sig cmd;
1254        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1255        const union tcp_md5_addr *addr;
1256        u8 prefixlen = 32;
1257        int l3index = 0;
1258        u8 flags;
1259
1260        if (optlen < sizeof(cmd))
1261                return -EINVAL;
1262
1263        if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1264                return -EFAULT;
1265
1266        if (sin->sin_family != AF_INET)
1267                return -EINVAL;
1268
1269        flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1270
1271        if (optname == TCP_MD5SIG_EXT &&
1272            cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1273                prefixlen = cmd.tcpm_prefixlen;
1274                if (prefixlen > 32)
1275                        return -EINVAL;
1276        }
1277
1278        if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1279            cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1280                struct net_device *dev;
1281
1282                rcu_read_lock();
1283                dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1284                if (dev && netif_is_l3_master(dev))
1285                        l3index = dev->ifindex;
1286
1287                rcu_read_unlock();
1288
1289                /* ok to reference set/not set outside of rcu;
1290                 * right now device MUST be an L3 master
1291                 */
1292                if (!dev || !l3index)
1293                        return -EINVAL;
1294        }
1295
1296        addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1297
1298        if (!cmd.tcpm_keylen)
1299                return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1300
1301        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1302                return -EINVAL;
1303
1304        return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1305                              cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1306}
1307
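     /* Hash the RFC 2385 pseudo-header: source and destination addresses,
      * protocol and length, followed by the TCP header with its checksum
      * field zeroed.
      */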
1308static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1309                                   __be32 daddr, __be32 saddr,
1310                                   const struct tcphdr *th, int nbytes)
1311{
1312        struct tcp4_pseudohdr *bp;
1313        struct scatterlist sg;
1314        struct tcphdr *_th;
1315
1316        bp = hp->scratch;
1317        bp->saddr = saddr;
1318        bp->daddr = daddr;
1319        bp->pad = 0;
1320        bp->protocol = IPPROTO_TCP;
1321        bp->len = cpu_to_be16(nbytes);
1322
1323        _th = (struct tcphdr *)(bp + 1);
1324        memcpy(_th, th, sizeof(*th));
1325        _th->check = 0;
1326
1327        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1328        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1329                                sizeof(*bp) + sizeof(*th));
1330        return crypto_ahash_update(hp->md5_req);
1331}
1332
1333static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1334                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
1335{
1336        struct tcp_md5sig_pool *hp;
1337        struct ahash_request *req;
1338
1339        hp = tcp_get_md5sig_pool();
1340        if (!hp)
1341                goto clear_hash_noput;
1342        req = hp->md5_req;
1343
1344        if (crypto_ahash_init(req))
1345                goto clear_hash;
1346        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1347                goto clear_hash;
1348        if (tcp_md5_hash_key(hp, key))
1349                goto clear_hash;
1350        ahash_request_set_crypt(req, NULL, md5_hash, 0);
1351        if (crypto_ahash_final(req))
1352                goto clear_hash;
1353
1354        tcp_put_md5sig_pool();
1355        return 0;
1356
1357clear_hash:
1358        tcp_put_md5sig_pool();
1359clear_hash_noput:
1360        memset(md5_hash, 0, 16);
1361        return 1;
1362}
1363
1364int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1365                        const struct sock *sk,
1366                        const struct sk_buff *skb)
1367{
1368        struct tcp_md5sig_pool *hp;
1369        struct ahash_request *req;
1370        const struct tcphdr *th = tcp_hdr(skb);
1371        __be32 saddr, daddr;
1372
1373        if (sk) { /* valid for establish/request sockets */
1374                saddr = sk->sk_rcv_saddr;
1375                daddr = sk->sk_daddr;
1376        } else {
1377                const struct iphdr *iph = ip_hdr(skb);
1378                saddr = iph->saddr;
1379                daddr = iph->daddr;
1380        }
1381
1382        hp = tcp_get_md5sig_pool();
1383        if (!hp)
1384                goto clear_hash_noput;
1385        req = hp->md5_req;
1386
1387        if (crypto_ahash_init(req))
1388                goto clear_hash;
1389
1390        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1391                goto clear_hash;
1392        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1393                goto clear_hash;
1394        if (tcp_md5_hash_key(hp, key))
1395                goto clear_hash;
1396        ahash_request_set_crypt(req, NULL, md5_hash, 0);
1397        if (crypto_ahash_final(req))
1398                goto clear_hash;
1399
1400        tcp_put_md5sig_pool();
1401        return 0;
1402
1403clear_hash:
1404        tcp_put_md5sig_pool();
1405clear_hash_noput:
1406        memset(md5_hash, 0, 16);
1407        return 1;
1408}
1409EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1410
1411#endif
1412
1413static void tcp_v4_init_req(struct request_sock *req,
1414                            const struct sock *sk_listener,
1415                            struct sk_buff *skb)
1416{
1417        struct inet_request_sock *ireq = inet_rsk(req);
1418        struct net *net = sock_net(sk_listener);
1419
1420        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1421        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1422        RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1423}
1424
1425static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1426                                          struct sk_buff *skb,
1427                                          struct flowi *fl,
1428                                          struct request_sock *req)
1429{
1430        tcp_v4_init_req(req, sk, skb);
1431
1432        if (security_inet_conn_request(sk, skb, req))
1433                return NULL;
1434
1435        return inet_csk_route_req(sk, &fl->u.ip4, req);
1436}
1437
1438struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1439        .family         =       PF_INET,
1440        .obj_size       =       sizeof(struct tcp_request_sock),
1441        .rtx_syn_ack    =       tcp_rtx_synack,
1442        .send_ack       =       tcp_v4_reqsk_send_ack,
1443        .destructor     =       tcp_v4_reqsk_destructor,
1444        .send_reset     =       tcp_v4_send_reset,
1445        .syn_ack_timeout =      tcp_syn_ack_timeout,
1446};
1447
1448const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1449        .mss_clamp      =       TCP_MSS_DEFAULT,
1450#ifdef CONFIG_TCP_MD5SIG
1451        .req_md5_lookup =       tcp_v4_md5_lookup,
1452        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1453#endif
1454#ifdef CONFIG_SYN_COOKIES
1455        .cookie_init_seq =      cookie_v4_init_sequence,
1456#endif
1457        .route_req      =       tcp_v4_route_req,
1458        .init_seq       =       tcp_v4_init_seq,
1459        .init_ts_off    =       tcp_v4_init_ts_off,
1460        .send_synack    =       tcp_v4_send_synack,
1461};
1462
1463int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1464{
1465        /* Never answer to SYNs sent to broadcast or multicast */
1466        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1467                goto drop;
1468
1469        return tcp_conn_request(&tcp_request_sock_ops,
1470                                &tcp_request_sock_ipv4_ops, sk, skb);
1471
1472drop:
1473        tcp_listendrop(sk);
1474        return 0;
1475}
1476EXPORT_SYMBOL(tcp_v4_conn_request);
1477
1478
1479/*
1480 * The three-way handshake has completed - we got a valid ACK -
1481 * now create the new socket.
1482 */
1483struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1484                                  struct request_sock *req,
1485                                  struct dst_entry *dst,
1486                                  struct request_sock *req_unhash,
1487                                  bool *own_req)
1488{
1489        struct inet_request_sock *ireq;
1490        bool found_dup_sk = false;
1491        struct inet_sock *newinet;
1492        struct tcp_sock *newtp;
1493        struct sock *newsk;
1494#ifdef CONFIG_TCP_MD5SIG
1495        const union tcp_md5_addr *addr;
1496        struct tcp_md5sig_key *key;
1497        int l3index;
1498#endif
1499        struct ip_options_rcu *inet_opt;
1500
1501        if (sk_acceptq_is_full(sk))
1502                goto exit_overflow;
1503
1504        newsk = tcp_create_openreq_child(sk, req, skb);
1505        if (!newsk)
1506                goto exit_nonewsk;
1507
1508        newsk->sk_gso_type = SKB_GSO_TCPV4;
1509        inet_sk_rx_dst_set(newsk, skb);
1510
1511        newtp                 = tcp_sk(newsk);
1512        newinet               = inet_sk(newsk);
1513        ireq                  = inet_rsk(req);
1514        sk_daddr_set(newsk, ireq->ir_rmt_addr);
1515        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1516        newsk->sk_bound_dev_if = ireq->ir_iif;
1517        newinet->inet_saddr   = ireq->ir_loc_addr;
1518        inet_opt              = rcu_dereference(ireq->ireq_opt);
1519        RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1520        newinet->mc_index     = inet_iif(skb);
1521        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1522        newinet->rcv_tos      = ip_hdr(skb)->tos;
1523        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1524        if (inet_opt)
1525                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1526        newinet->inet_id = prandom_u32();
1527
1528        /* Set ToS of the new socket based upon the value of incoming SYN.
1529         * ECT bits are set later in tcp_init_transfer().
1530         */
1531        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1532                newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1533
1534        if (!dst) {
1535                dst = inet_csk_route_child_sock(sk, newsk, req);
1536                if (!dst)
1537                        goto put_and_exit;
1538        } else {
1539                /* syncookie case : see end of cookie_v4_check() */
1540        }
1541        sk_setup_caps(newsk, dst);
1542
1543        tcp_ca_openreq_child(newsk, dst);
1544
1545        tcp_sync_mss(newsk, dst_mtu(dst));
1546        newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1547
1548        tcp_initialize_rcv_mss(newsk);
1549
1550#ifdef CONFIG_TCP_MD5SIG
1551        l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1552        /* Copy over the MD5 key from the original socket */
1553        addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1554        key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1555        if (key) {
1556                /*
1557                 * We're using one, so create a matching key
1558                 * on the newsk structure. If we fail to get
1559                 * memory, then we end up not copying the key
1560                 * across. Shucks.
1561                 */
1562                tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1563                               key->key, key->keylen, GFP_ATOMIC);
1564                sk_gso_disable(newsk);
1565        }
1566#endif
1567
1568        if (__inet_inherit_port(sk, newsk) < 0)
1569                goto put_and_exit;
1570        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1571                                       &found_dup_sk);
1572        if (likely(*own_req)) {
1573                tcp_move_syn(newtp, req);
1574                ireq->ireq_opt = NULL;
1575        } else {
1576                newinet->inet_opt = NULL;
1577
1578                if (!req_unhash && found_dup_sk) {
1579                        /* This code path should only be executed in the
1580                         * syncookie case
1581                         */
1582                        bh_unlock_sock(newsk);
1583                        sock_put(newsk);
1584                        newsk = NULL;
1585                }
1586        }
1587        return newsk;
1588
1589exit_overflow:
1590        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1591exit_nonewsk:
1592        dst_release(dst);
1593exit:
1594        tcp_listendrop(sk);
1595        return NULL;
1596put_and_exit:
1597        newinet->inet_opt = NULL;
1598        inet_csk_prepare_forced_close(newsk);
1599        tcp_done(newsk);
1600        goto exit;
1601}
1602EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1603
1604static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1605{
1606#ifdef CONFIG_SYN_COOKIES
1607        const struct tcphdr *th = tcp_hdr(skb);
1608
1609        if (!th->syn)
1610                sk = cookie_v4_check(sk, skb);
1611#endif
1612        return sk;
1613}
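/* Note on the !th->syn test above: a SYN cookie is only decoded and checked
 * on the returning ACK of the handshake.  The SYN itself is handled by
 * tcp_conn_request(), which is where a cookie gets generated when the
 * listener's queue overflows.
 */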
1614
1615u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1616                         struct tcphdr *th, u32 *cookie)
1617{
1618        u16 mss = 0;
1619#ifdef CONFIG_SYN_COOKIES
1620        mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1621                                    &tcp_request_sock_ipv4_ops, sk, th);
1622        if (mss) {
1623                *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1624                tcp_synq_overflow(sk);
1625        }
1626#endif
1627        return mss;
1628}
1629
1630INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1631                                                           u32));
1632/* The socket must have its spinlock held when we get
1633 * here, unless it is a TCP_LISTEN socket.
1634 *
1635 * We have a potential double-lock case here, so even when
1636 * doing backlog processing we use the BH locking scheme.
1637 * This is because we cannot sleep with the original spinlock
1638 * held.
1639 */
1640int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1641{
1642        enum skb_drop_reason reason;
1643        struct sock *rsk;
1644
1645        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1646                struct dst_entry *dst;
1647
1648                dst = rcu_dereference_protected(sk->sk_rx_dst,
1649                                                lockdep_sock_is_held(sk));
1650
1651                sock_rps_save_rxhash(sk, skb);
1652                sk_mark_napi_id(sk, skb);
1653                if (dst) {
1654                        if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1655                            !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1656                                             dst, 0)) {
1657                                RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1658                                dst_release(dst);
1659                        }
1660                }
1661                tcp_rcv_established(sk, skb);
1662                return 0;
1663        }
1664
1665        reason = SKB_DROP_REASON_NOT_SPECIFIED;
1666        if (tcp_checksum_complete(skb))
1667                goto csum_err;
1668
1669        if (sk->sk_state == TCP_LISTEN) {
1670                struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1671
1672                if (!nsk)
1673                        goto discard;
1674                if (nsk != sk) {
1675                        if (tcp_child_process(sk, nsk, skb)) {
1676                                rsk = nsk;
1677                                goto reset;
1678                        }
1679                        return 0;
1680                }
1681        } else
1682                sock_rps_save_rxhash(sk, skb);
1683
1684        if (tcp_rcv_state_process(sk, skb)) {
1685                rsk = sk;
1686                goto reset;
1687        }
1688        return 0;
1689
1690reset:
1691        tcp_v4_send_reset(rsk, skb);
1692discard:
1693        kfree_skb_reason(skb, reason);
1694        /* Be careful here. If this function gets more complicated and
1695         * gcc suffers from register pressure on the x86, sk (in %ebx)
1696         * might be destroyed here. This current version compiles correctly,
1697         * but you have been warned.
1698         */
1699        return 0;
1700
1701csum_err:
1702        reason = SKB_DROP_REASON_TCP_CSUM;
1703        trace_tcp_bad_csum(skb);
1704        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1705        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1706        goto discard;
1707}
1708EXPORT_SYMBOL(tcp_v4_do_rcv);
1709
1710int tcp_v4_early_demux(struct sk_buff *skb)
1711{
1712        const struct iphdr *iph;
1713        const struct tcphdr *th;
1714        struct sock *sk;
1715
1716        if (skb->pkt_type != PACKET_HOST)
1717                return 0;
1718
1719        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1720                return 0;
1721
1722        iph = ip_hdr(skb);
1723        th = tcp_hdr(skb);
1724
1725        if (th->doff < sizeof(struct tcphdr) / 4)
1726                return 0;
1727
1728        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1729                                       iph->saddr, th->source,
1730                                       iph->daddr, ntohs(th->dest),
1731                                       skb->skb_iif, inet_sdif(skb));
1732        if (sk) {
1733                skb->sk = sk;
1734                skb->destructor = sock_edemux;
1735                if (sk_fullsock(sk)) {
1736                        struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1737
1738                        if (dst)
1739                                dst = dst_check(dst, 0);
1740                        if (dst &&
1741                            sk->sk_rx_dst_ifindex == skb->skb_iif)
1742                                skb_dst_set_noref(skb, dst);
1743                }
1744        }
1745        return 0;
1746}
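/* Behavioural note for tcp_v4_early_demux() above: it is wired up as the
 * IPPROTO_TCP early-demux handler and runs before the IPv4 routing decision
 * (it can be toggled with the net.ipv4.tcp_early_demux sysctl).  Finding an
 * established socket this early lets the cached sk->sk_rx_dst be reused and
 * a full route lookup be skipped for the packet.
 */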
1747
1748bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1749                     enum skb_drop_reason *reason)
1750{
1751        u32 limit, tail_gso_size, tail_gso_segs;
1752        struct skb_shared_info *shinfo;
1753        const struct tcphdr *th;
1754        struct tcphdr *thtail;
1755        struct sk_buff *tail;
1756        unsigned int hdrlen;
1757        bool fragstolen;
1758        u32 gso_segs;
1759        u32 gso_size;
1760        int delta;
1761
1762        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1763         * we can fix skb->truesize to its real value to avoid future drops.
1764         * This is valid because skb is not yet charged to the socket.
1765         * It has been noticed that pure SACK packets were sometimes dropped
1766         * (if cooked by drivers without copybreak feature).
1767         */
1768        skb_condense(skb);
1769
1770        skb_dst_drop(skb);
1771
1772        if (unlikely(tcp_checksum_complete(skb))) {
1773                bh_unlock_sock(sk);
1774                trace_tcp_bad_csum(skb);
1775                *reason = SKB_DROP_REASON_TCP_CSUM;
1776                __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1777                __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1778                return true;
1779        }
1780
1781        /* Attempt coalescing to last skb in backlog, even if we are
1782         * above the limits.
1783         * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1784         */
1785        th = (const struct tcphdr *)skb->data;
1786        hdrlen = th->doff * 4;
1787
1788        tail = sk->sk_backlog.tail;
1789        if (!tail)
1790                goto no_coalesce;
1791        thtail = (struct tcphdr *)tail->data;
1792
1793        if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1794            TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1795            ((TCP_SKB_CB(tail)->tcp_flags |
1796              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1797            !((TCP_SKB_CB(tail)->tcp_flags &
1798              TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1799            ((TCP_SKB_CB(tail)->tcp_flags ^
1800              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1801#ifdef CONFIG_TLS_DEVICE
1802            tail->decrypted != skb->decrypted ||
1803#endif
1804            thtail->doff != th->doff ||
1805            memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1806                goto no_coalesce;
1807
1808        __skb_pull(skb, hdrlen);
1809
1810        shinfo = skb_shinfo(skb);
1811        gso_size = shinfo->gso_size ?: skb->len;
1812        gso_segs = shinfo->gso_segs ?: 1;
1813
1814        shinfo = skb_shinfo(tail);
1815        tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1816        tail_gso_segs = shinfo->gso_segs ?: 1;
1817
1818        if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1819                TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1820
1821                if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1822                        TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1823                        thtail->window = th->window;
1824                }
1825
1826                /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1827                 * thtail->fin, so that the fast path in tcp_rcv_established()
1828                 * is not entered if we append a packet with a FIN.
1829                 * SYN, RST, URG are not present.
1830                 * ACK is set on both packets.
1831                 * PSH: we do not really care in the TCP stack,
1832                 *      at least for 'GRO' packets.
1833                 */
1834                thtail->fin |= th->fin;
1835                TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1836
1837                if (TCP_SKB_CB(skb)->has_rxtstamp) {
1838                        TCP_SKB_CB(tail)->has_rxtstamp = true;
1839                        tail->tstamp = skb->tstamp;
1840                        skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1841                }
1842
1843                /* Not as strict as GRO. We only need to carry mss max value */
1844                shinfo->gso_size = max(gso_size, tail_gso_size);
1845                shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1846
1847                sk->sk_backlog.len += delta;
1848                __NET_INC_STATS(sock_net(sk),
1849                                LINUX_MIB_TCPBACKLOGCOALESCE);
1850                kfree_skb_partial(skb, fragstolen);
1851                return false;
1852        }
1853        __skb_push(skb, hdrlen);
1854
1855no_coalesce:
1856        limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1857
1858        /* Only the socket owner can try to collapse/prune rx queues
1859         * to reduce memory overhead, so add a little headroom here.
1860         * Only a few socket backlogs are likely to be non-empty at the same time.
1861         */
1862        limit += 64 * 1024;
1863
1864        if (unlikely(sk_add_backlog(sk, skb, limit))) {
1865                bh_unlock_sock(sk);
1866                *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1867                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1868                return true;
1869        }
1870        return false;
1871}
1872EXPORT_SYMBOL(tcp_add_backlog);
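/* A worked example of the no_coalesce limit above (numbers purely
 * illustrative): with sk_rcvbuf = 131072 and sk_sndbuf = 87380,
 *
 *	limit = 131072 + (87380 >> 1) + 64 * 1024
 *	      = 131072 + 43690 + 65536
 *	      = 240298 bytes
 *
 * so roughly rcvbuf + sndbuf/2 plus 64KB of headroom may be queued before
 * sk_add_backlog() starts rejecting packets and LINUX_MIB_TCPBACKLOGDROP is
 * incremented.
 */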
1873
1874int tcp_filter(struct sock *sk, struct sk_buff *skb)
1875{
1876        struct tcphdr *th = (struct tcphdr *)skb->data;
1877
1878        return sk_filter_trim_cap(sk, skb, th->doff * 4);
1879}
1880EXPORT_SYMBOL(tcp_filter);
1881
1882static void tcp_v4_restore_cb(struct sk_buff *skb)
1883{
1884        memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1885                sizeof(struct inet_skb_parm));
1886}
1887
1888static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1889                           const struct tcphdr *th)
1890{
1891        /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1892         * barrier() makes sure the compiler won't play fool^Waliasing games.
1893         */
1894        memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1895                sizeof(struct inet_skb_parm));
1896        barrier();
1897
1898        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1899        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1900                                    skb->len - th->doff * 4);
1901        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1902        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1903        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1904        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1905        TCP_SKB_CB(skb)->sacked  = 0;
1906        TCP_SKB_CB(skb)->has_rxtstamp =
1907                        skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1908}
1909
1910/*
1911 *      From tcp_input.c
1912 */
1913
1914int tcp_v4_rcv(struct sk_buff *skb)
1915{
1916        struct net *net = dev_net(skb->dev);
1917        enum skb_drop_reason drop_reason;
1918        int sdif = inet_sdif(skb);
1919        int dif = inet_iif(skb);
1920        const struct iphdr *iph;
1921        const struct tcphdr *th;
1922        bool refcounted;
1923        struct sock *sk;
1924        int ret;
1925
1926        drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1927        if (skb->pkt_type != PACKET_HOST)
1928                goto discard_it;
1929
1930        /* Count it even if it's bad */
1931        __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1932
1933        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1934                goto discard_it;
1935
1936        th = (const struct tcphdr *)skb->data;
1937
1938        if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1939                drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1940                goto bad_packet;
1941        }
1942        if (!pskb_may_pull(skb, th->doff * 4))
1943                goto discard_it;
1944
1945        /* An explanation is required here, I think.
1946         * Packet length and doff are validated by header prediction,
1947         * provided the case of th->doff == 0 is eliminated.
1948         * So, we defer the checks. */
1949
1950        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1951                goto csum_error;
1952
1953        th = (const struct tcphdr *)skb->data;
1954        iph = ip_hdr(skb);
1955lookup:
1956        sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1957                               th->dest, sdif, &refcounted);
1958        if (!sk)
1959                goto no_tcp_socket;
1960
1961process:
1962        if (sk->sk_state == TCP_TIME_WAIT)
1963                goto do_time_wait;
1964
1965        if (sk->sk_state == TCP_NEW_SYN_RECV) {
1966                struct request_sock *req = inet_reqsk(sk);
1967                bool req_stolen = false;
1968                struct sock *nsk;
1969
1970                sk = req->rsk_listener;
1971                if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1972                        drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1973                else
1974                        drop_reason = tcp_inbound_md5_hash(sk, skb,
1975                                                   &iph->saddr, &iph->daddr,
1976                                                   AF_INET, dif, sdif);
1977                if (unlikely(drop_reason)) {
1978                        sk_drops_add(sk, skb);
1979                        reqsk_put(req);
1980                        goto discard_it;
1981                }
1982                if (tcp_checksum_complete(skb)) {
1983                        reqsk_put(req);
1984                        goto csum_error;
1985                }
1986                if (unlikely(sk->sk_state != TCP_LISTEN)) {
1987                        nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
1988                        if (!nsk) {
1989                                inet_csk_reqsk_queue_drop_and_put(sk, req);
1990                                goto lookup;
1991                        }
1992                        sk = nsk;
1993                        /* reuseport_migrate_sock() has already held one sk_refcnt
1994                         * before returning.
1995                         */
1996                } else {
1997                        /* We own a reference on the listener, increase it again
1998                         * as we might lose it too soon.
1999                         */
2000                        sock_hold(sk);
2001                }
2002                refcounted = true;
2003                nsk = NULL;
2004                if (!tcp_filter(sk, skb)) {
2005                        th = (const struct tcphdr *)skb->data;
2006                        iph = ip_hdr(skb);
2007                        tcp_v4_fill_cb(skb, iph, th);
2008                        nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2009                } else {
2010                        drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2011                }
2012                if (!nsk) {
2013                        reqsk_put(req);
2014                        if (req_stolen) {
2015                                /* Another cpu got exclusive access to req
2016                                 * and created a full blown socket.
2017                                 * Try to feed this packet to this socket
2018                                 * instead of discarding it.
2019                                 */
2020                                tcp_v4_restore_cb(skb);
2021                                sock_put(sk);
2022                                goto lookup;
2023                        }
2024                        goto discard_and_relse;
2025                }
2026                nf_reset_ct(skb);
2027                if (nsk == sk) {
2028                        reqsk_put(req);
2029                        tcp_v4_restore_cb(skb);
2030                } else if (tcp_child_process(sk, nsk, skb)) {
2031                        tcp_v4_send_reset(nsk, skb);
2032                        goto discard_and_relse;
2033                } else {
2034                        sock_put(sk);
2035                        return 0;
2036                }
2037        }
2038
2039        if (static_branch_unlikely(&ip4_min_ttl)) {
2040                /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2041                if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2042                        __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2043                        goto discard_and_relse;
2044                }
2045        }
2046
2047        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2048                drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2049                goto discard_and_relse;
2050        }
2051
2052        drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2053                                           &iph->daddr, AF_INET, dif, sdif);
2054        if (drop_reason)
2055                goto discard_and_relse;
2056
2057        nf_reset_ct(skb);
2058
2059        if (tcp_filter(sk, skb)) {
2060                drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2061                goto discard_and_relse;
2062        }
2063        th = (const struct tcphdr *)skb->data;
2064        iph = ip_hdr(skb);
2065        tcp_v4_fill_cb(skb, iph, th);
2066
2067        skb->dev = NULL;
2068
2069        if (sk->sk_state == TCP_LISTEN) {
2070                ret = tcp_v4_do_rcv(sk, skb);
2071                goto put_and_return;
2072        }
2073
2074        sk_incoming_cpu_update(sk);
2075
2076        bh_lock_sock_nested(sk);
2077        tcp_segs_in(tcp_sk(sk), skb);
2078        ret = 0;
2079        if (!sock_owned_by_user(sk)) {
2080                ret = tcp_v4_do_rcv(sk, skb);
2081        } else {
2082                if (tcp_add_backlog(sk, skb, &drop_reason))
2083                        goto discard_and_relse;
2084        }
2085        bh_unlock_sock(sk);
2086
2087put_and_return:
2088        if (refcounted)
2089                sock_put(sk);
2090
2091        return ret;
2092
2093no_tcp_socket:
2094        drop_reason = SKB_DROP_REASON_NO_SOCKET;
2095        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2096                goto discard_it;
2097
2098        tcp_v4_fill_cb(skb, iph, th);
2099
2100        if (tcp_checksum_complete(skb)) {
2101csum_error:
2102                drop_reason = SKB_DROP_REASON_TCP_CSUM;
2103                trace_tcp_bad_csum(skb);
2104                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2105bad_packet:
2106                __TCP_INC_STATS(net, TCP_MIB_INERRS);
2107        } else {
2108                tcp_v4_send_reset(NULL, skb);
2109        }
2110
2111discard_it:
2112        SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2113        /* Discard frame. */
2114        kfree_skb_reason(skb, drop_reason);
2115        return 0;
2116
2117discard_and_relse:
2118        sk_drops_add(sk, skb);
2119        if (refcounted)
2120                sock_put(sk);
2121        goto discard_it;
2122
2123do_time_wait:
2124        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2125                drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2126                inet_twsk_put(inet_twsk(sk));
2127                goto discard_it;
2128        }
2129
2130        tcp_v4_fill_cb(skb, iph, th);
2131
2132        if (tcp_checksum_complete(skb)) {
2133                inet_twsk_put(inet_twsk(sk));
2134                goto csum_error;
2135        }
2136        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2137        case TCP_TW_SYN: {
2138                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2139                                                        &tcp_hashinfo, skb,
2140                                                        __tcp_hdrlen(th),
2141                                                        iph->saddr, th->source,
2142                                                        iph->daddr, th->dest,
2143                                                        inet_iif(skb),
2144                                                        sdif);
2145                if (sk2) {
2146                        inet_twsk_deschedule_put(inet_twsk(sk));
2147                        sk = sk2;
2148                        tcp_v4_restore_cb(skb);
2149                        refcounted = false;
2150                        goto process;
2151                }
2152        }
2153                /* to ACK */
2154                fallthrough;
2155        case TCP_TW_ACK:
2156                tcp_v4_timewait_ack(sk, skb);
2157                break;
2158        case TCP_TW_RST:
2159                tcp_v4_send_reset(sk, skb);
2160                inet_twsk_deschedule_put(inet_twsk(sk));
2161                goto discard_it;
2162        case TCP_TW_SUCCESS:;
2163        }
2164        goto discard_it;
2165}
2166
2167static struct timewait_sock_ops tcp_timewait_sock_ops = {
2168        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2169        .twsk_unique    = tcp_twsk_unique,
2170        .twsk_destructor= tcp_twsk_destructor,
2171};
2172
2173void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2174{
2175        struct dst_entry *dst = skb_dst(skb);
2176
2177        if (dst && dst_hold_safe(dst)) {
2178                rcu_assign_pointer(sk->sk_rx_dst, dst);
2179                sk->sk_rx_dst_ifindex = skb->skb_iif;
2180        }
2181}
2182EXPORT_SYMBOL(inet_sk_rx_dst_set);
2183
2184const struct inet_connection_sock_af_ops ipv4_specific = {
2185        .queue_xmit        = ip_queue_xmit,
2186        .send_check        = tcp_v4_send_check,
2187        .rebuild_header    = inet_sk_rebuild_header,
2188        .sk_rx_dst_set     = inet_sk_rx_dst_set,
2189        .conn_request      = tcp_v4_conn_request,
2190        .syn_recv_sock     = tcp_v4_syn_recv_sock,
2191        .net_header_len    = sizeof(struct iphdr),
2192        .setsockopt        = ip_setsockopt,
2193        .getsockopt        = ip_getsockopt,
2194        .addr2sockaddr     = inet_csk_addr2sockaddr,
2195        .sockaddr_len      = sizeof(struct sockaddr_in),
2196        .mtu_reduced       = tcp_v4_mtu_reduced,
2197};
2198EXPORT_SYMBOL(ipv4_specific);
2199
2200#ifdef CONFIG_TCP_MD5SIG
2201static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2202        .md5_lookup             = tcp_v4_md5_lookup,
2203        .calc_md5_hash          = tcp_v4_md5_hash_skb,
2204        .md5_parse              = tcp_v4_parse_md5_keys,
2205};
2206#endif
2207
2208/* NOTE: A lot of things are set to zero explicitly by the call to
2209 *       sk_alloc(), so they need not be done here.
2210 */
2211static int tcp_v4_init_sock(struct sock *sk)
2212{
2213        struct inet_connection_sock *icsk = inet_csk(sk);
2214
2215        tcp_init_sock(sk);
2216
2217        icsk->icsk_af_ops = &ipv4_specific;
2218
2219#ifdef CONFIG_TCP_MD5SIG
2220        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2221#endif
2222
2223        return 0;
2224}
2225
2226void tcp_v4_destroy_sock(struct sock *sk)
2227{
2228        struct tcp_sock *tp = tcp_sk(sk);
2229
2230        trace_tcp_destroy_sock(sk);
2231
2232        tcp_clear_xmit_timers(sk);
2233
2234        tcp_cleanup_congestion_control(sk);
2235
2236        tcp_cleanup_ulp(sk);
2237
2238        /* Clean up the write buffer. */
2239        tcp_write_queue_purge(sk);
2240
2241        /* Check if we want to disable active TFO */
2242        tcp_fastopen_active_disable_ofo_check(sk);
2243
2244        /* Cleans up our, hopefully empty, out_of_order_queue. */
2245        skb_rbtree_purge(&tp->out_of_order_queue);
2246
2247#ifdef CONFIG_TCP_MD5SIG
2248        /* Clean up the MD5 key list, if any */
2249        if (tp->md5sig_info) {
2250                tcp_clear_md5_list(sk);
2251                kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2252                tp->md5sig_info = NULL;
2253        }
2254#endif
2255
2256        /* Clean up a referenced TCP bind bucket. */
2257        if (inet_csk(sk)->icsk_bind_hash)
2258                inet_put_port(sk);
2259
2260        BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2261
2262        /* If socket is aborted during connect operation */
2263        tcp_free_fastopen_req(tp);
2264        tcp_fastopen_destroy_cipher(sk);
2265        tcp_saved_syn_free(tp);
2266
2267        sk_sockets_allocated_dec(sk);
2268}
2269EXPORT_SYMBOL(tcp_v4_destroy_sock);
2270
2271#ifdef CONFIG_PROC_FS
2272/* Proc filesystem TCP sock list dumping. */
2273
2274static unsigned short seq_file_family(const struct seq_file *seq);
2275
2276static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2277{
2278        unsigned short family = seq_file_family(seq);
2279
2280        /* AF_UNSPEC is used as a match-all */
2281        return ((family == AF_UNSPEC || family == sk->sk_family) &&
2282                net_eq(sock_net(sk), seq_file_net(seq)));
2283}
2284
2285/* Find a non-empty bucket (starting from st->bucket)
2286 * and return the first sk from it.
2287 */
2288static void *listening_get_first(struct seq_file *seq)
2289{
2290        struct tcp_iter_state *st = seq->private;
2291
2292        st->offset = 0;
2293        for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2294                struct inet_listen_hashbucket *ilb2;
2295                struct hlist_nulls_node *node;
2296                struct sock *sk;
2297
2298                ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2299                if (hlist_nulls_empty(&ilb2->nulls_head))
2300                        continue;
2301
2302                spin_lock(&ilb2->lock);
2303                sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2304                        if (seq_sk_match(seq, sk))
2305                                return sk;
2306                }
2307                spin_unlock(&ilb2->lock);
2308        }
2309
2310        return NULL;
2311}
2312
2313/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2314 * If "cur" is the last one in the st->bucket,
2315 * call listening_get_first() to return the first sk of the next
2316 * non empty bucket.
2317 */
2318static void *listening_get_next(struct seq_file *seq, void *cur)
2319{
2320        struct tcp_iter_state *st = seq->private;
2321        struct inet_listen_hashbucket *ilb2;
2322        struct hlist_nulls_node *node;
2323        struct sock *sk = cur;
2324
2325        ++st->num;
2326        ++st->offset;
2327
2328        sk = sk_nulls_next(sk);
2329        sk_nulls_for_each_from(sk, node) {
2330                if (seq_sk_match(seq, sk))
2331                        return sk;
2332        }
2333
2334        ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2335        spin_unlock(&ilb2->lock);
2336        ++st->bucket;
2337        return listening_get_first(seq);
2338}
2339
2340static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2341{
2342        struct tcp_iter_state *st = seq->private;
2343        void *rc;
2344
2345        st->bucket = 0;
2346        st->offset = 0;
2347        rc = listening_get_first(seq);
2348
2349        while (rc && *pos) {
2350                rc = listening_get_next(seq, rc);
2351                --*pos;
2352        }
2353        return rc;
2354}
2355
2356static inline bool empty_bucket(const struct tcp_iter_state *st)
2357{
2358        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2359}
2360
2361/*
2362 * Get first established socket starting from bucket given in st->bucket.
2363 * If st->bucket is zero, the very first socket in the hash is returned.
2364 */
2365static void *established_get_first(struct seq_file *seq)
2366{
2367        struct tcp_iter_state *st = seq->private;
2368
2369        st->offset = 0;
2370        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2371                struct sock *sk;
2372                struct hlist_nulls_node *node;
2373                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2374
2375                /* Lockless fast path for the common case of empty buckets */
2376                if (empty_bucket(st))
2377                        continue;
2378
2379                spin_lock_bh(lock);
2380                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2381                        if (seq_sk_match(seq, sk))
2382                                return sk;
2383                }
2384                spin_unlock_bh(lock);
2385        }
2386
2387        return NULL;
2388}
2389
2390static void *established_get_next(struct seq_file *seq, void *cur)
2391{
2392        struct sock *sk = cur;
2393        struct hlist_nulls_node *node;
2394        struct tcp_iter_state *st = seq->private;
2395
2396        ++st->num;
2397        ++st->offset;
2398
2399        sk = sk_nulls_next(sk);
2400
2401        sk_nulls_for_each_from(sk, node) {
2402                if (seq_sk_match(seq, sk))
2403                        return sk;
2404        }
2405
2406        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2407        ++st->bucket;
2408        return established_get_first(seq);
2409}
2410
2411static void *established_get_idx(struct seq_file *seq, loff_t pos)
2412{
2413        struct tcp_iter_state *st = seq->private;
2414        void *rc;
2415
2416        st->bucket = 0;
2417        rc = established_get_first(seq);
2418
2419        while (rc && pos) {
2420                rc = established_get_next(seq, rc);
2421                --pos;
2422        }
2423        return rc;
2424}
2425
2426static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2427{
2428        void *rc;
2429        struct tcp_iter_state *st = seq->private;
2430
2431        st->state = TCP_SEQ_STATE_LISTENING;
2432        rc        = listening_get_idx(seq, &pos);
2433
2434        if (!rc) {
2435                st->state = TCP_SEQ_STATE_ESTABLISHED;
2436                rc        = established_get_idx(seq, pos);
2437        }
2438
2439        return rc;
2440}
2441
2442static void *tcp_seek_last_pos(struct seq_file *seq)
2443{
2444        struct tcp_iter_state *st = seq->private;
2445        int bucket = st->bucket;
2446        int offset = st->offset;
2447        int orig_num = st->num;
2448        void *rc = NULL;
2449
2450        switch (st->state) {
2451        case TCP_SEQ_STATE_LISTENING:
2452                if (st->bucket > tcp_hashinfo.lhash2_mask)
2453                        break;
2454                st->state = TCP_SEQ_STATE_LISTENING;
2455                rc = listening_get_first(seq);
2456                while (offset-- && rc && bucket == st->bucket)
2457                        rc = listening_get_next(seq, rc);
2458                if (rc)
2459                        break;
2460                st->bucket = 0;
2461                st->state = TCP_SEQ_STATE_ESTABLISHED;
2462                fallthrough;
2463        case TCP_SEQ_STATE_ESTABLISHED:
2464                if (st->bucket > tcp_hashinfo.ehash_mask)
2465                        break;
2466                rc = established_get_first(seq);
2467                while (offset-- && rc && bucket == st->bucket)
2468                        rc = established_get_next(seq, rc);
2469        }
2470
2471        st->num = orig_num;
2472
2473        return rc;
2474}
2475
2476void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2477{
2478        struct tcp_iter_state *st = seq->private;
2479        void *rc;
2480
2481        if (*pos && *pos == st->last_pos) {
2482                rc = tcp_seek_last_pos(seq);
2483                if (rc)
2484                        goto out;
2485        }
2486
2487        st->state = TCP_SEQ_STATE_LISTENING;
2488        st->num = 0;
2489        st->bucket = 0;
2490        st->offset = 0;
2491        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2492
2493out:
2494        st->last_pos = *pos;
2495        return rc;
2496}
2497EXPORT_SYMBOL(tcp_seq_start);
2498
2499void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2500{
2501        struct tcp_iter_state *st = seq->private;
2502        void *rc = NULL;
2503
2504        if (v == SEQ_START_TOKEN) {
2505                rc = tcp_get_idx(seq, 0);
2506                goto out;
2507        }
2508
2509        switch (st->state) {
2510        case TCP_SEQ_STATE_LISTENING:
2511                rc = listening_get_next(seq, v);
2512                if (!rc) {
2513                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2514                        st->bucket = 0;
2515                        st->offset = 0;
2516                        rc        = established_get_first(seq);
2517                }
2518                break;
2519        case TCP_SEQ_STATE_ESTABLISHED:
2520                rc = established_get_next(seq, v);
2521                break;
2522        }
2523out:
2524        ++*pos;
2525        st->last_pos = *pos;
2526        return rc;
2527}
2528EXPORT_SYMBOL(tcp_seq_next);
2529
2530void tcp_seq_stop(struct seq_file *seq, void *v)
2531{
2532        struct tcp_iter_state *st = seq->private;
2533
2534        switch (st->state) {
2535        case TCP_SEQ_STATE_LISTENING:
2536                if (v != SEQ_START_TOKEN)
2537                        spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2538                break;
2539        case TCP_SEQ_STATE_ESTABLISHED:
2540                if (v)
2541                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2542                break;
2543        }
2544}
2545EXPORT_SYMBOL(tcp_seq_stop);
2546
2547static void get_openreq4(const struct request_sock *req,
2548                         struct seq_file *f, int i)
2549{
2550        const struct inet_request_sock *ireq = inet_rsk(req);
2551        long delta = req->rsk_timer.expires - jiffies;
2552
2553        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2554                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2555                i,
2556                ireq->ir_loc_addr,
2557                ireq->ir_num,
2558                ireq->ir_rmt_addr,
2559                ntohs(ireq->ir_rmt_port),
2560                TCP_SYN_RECV,
2561                0, 0, /* could print option size, but that is af dependent. */
2562                1,    /* timers active (only the expire timer) */
2563                jiffies_delta_to_clock_t(delta),
2564                req->num_timeout,
2565                from_kuid_munged(seq_user_ns(f),
2566                                 sock_i_uid(req->rsk_listener)),
2567                0,  /* non standard timer */
2568                0, /* open_requests have no inode */
2569                0,
2570                req);
2571}
2572
2573static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2574{
2575        int timer_active;
2576        unsigned long timer_expires;
2577        const struct tcp_sock *tp = tcp_sk(sk);
2578        const struct inet_connection_sock *icsk = inet_csk(sk);
2579        const struct inet_sock *inet = inet_sk(sk);
2580        const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2581        __be32 dest = inet->inet_daddr;
2582        __be32 src = inet->inet_rcv_saddr;
2583        __u16 destp = ntohs(inet->inet_dport);
2584        __u16 srcp = ntohs(inet->inet_sport);
2585        int rx_queue;
2586        int state;
2587
2588        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2589            icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2590            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2591                timer_active    = 1;
2592                timer_expires   = icsk->icsk_timeout;
2593        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2594                timer_active    = 4;
2595                timer_expires   = icsk->icsk_timeout;
2596        } else if (timer_pending(&sk->sk_timer)) {
2597                timer_active    = 2;
2598                timer_expires   = sk->sk_timer.expires;
2599        } else {
2600                timer_active    = 0;
2601                timer_expires = jiffies;
2602        }
2603
2604        state = inet_sk_state_load(sk);
2605        if (state == TCP_LISTEN)
2606                rx_queue = READ_ONCE(sk->sk_ack_backlog);
2607        else
2608                /* Because we don't lock the socket,
2609                 * we might find a transient negative value.
2610                 */
2611                rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2612                                      READ_ONCE(tp->copied_seq), 0);
2613
2614        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2615                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2616                i, src, srcp, dest, destp, state,
2617                READ_ONCE(tp->write_seq) - tp->snd_una,
2618                rx_queue,
2619                timer_active,
2620                jiffies_delta_to_clock_t(timer_expires - jiffies),
2621                icsk->icsk_retransmits,
2622                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2623                icsk->icsk_probes_out,
2624                sock_i_ino(sk),
2625                refcount_read(&sk->sk_refcnt), sk,
2626                jiffies_to_clock_t(icsk->icsk_rto),
2627                jiffies_to_clock_t(icsk->icsk_ack.ato),
2628                (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2629                tcp_snd_cwnd(tp),
2630                state == TCP_LISTEN ?
2631                    fastopenq->max_qlen :
2632                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2633}
2634
2635static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2636                               struct seq_file *f, int i)
2637{
2638        long delta = tw->tw_timer.expires - jiffies;
2639        __be32 dest, src;
2640        __u16 destp, srcp;
2641
2642        dest  = tw->tw_daddr;
2643        src   = tw->tw_rcv_saddr;
2644        destp = ntohs(tw->tw_dport);
2645        srcp  = ntohs(tw->tw_sport);
2646
2647        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2648                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2649                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2650                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2651                refcount_read(&tw->tw_refcnt), tw);
2652}
2653
2654#define TMPSZ 150
2655
2656static int tcp4_seq_show(struct seq_file *seq, void *v)
2657{
2658        struct tcp_iter_state *st;
2659        struct sock *sk = v;
2660
2661        seq_setwidth(seq, TMPSZ - 1);
2662        if (v == SEQ_START_TOKEN) {
2663                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2664                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2665                           "inode");
2666                goto out;
2667        }
2668        st = seq->private;
2669
2670        if (sk->sk_state == TCP_TIME_WAIT)
2671                get_timewait4_sock(v, seq, st->num);
2672        else if (sk->sk_state == TCP_NEW_SYN_RECV)
2673                get_openreq4(v, seq, st->num);
2674        else
2675                get_tcp4_sock(v, seq, st->num);
2676out:
2677        seq_pad(seq, '\n');
2678        return 0;
2679}
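/* The address words in the resulting /proc/net/tcp lines are raw __be32
 * values printed with %08X, so they read "byte reversed" on little-endian
 * hosts, while ports and the state are in host byte order.  A hypothetical
 * listener entry such as
 *
 *	0: 0100007F:1F90 00000000:0000 0A ...
 *
 * decodes to local address 127.0.0.1 (0x0100007F on a little-endian host),
 * local port 0x1F90 = 8080, no remote peer yet, and state 0x0A = TCP_LISTEN.
 * netstat(8) and similar procps tools parse exactly this layout; ss(8)
 * normally prefers the inet_diag netlink interface instead.
 */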
2680
2681#ifdef CONFIG_BPF_SYSCALL
2682struct bpf_tcp_iter_state {
2683        struct tcp_iter_state state;
2684        unsigned int cur_sk;
2685        unsigned int end_sk;
2686        unsigned int max_sk;
2687        struct sock **batch;
2688        bool st_bucket_done;
2689};
2690
2691struct bpf_iter__tcp {
2692        __bpf_md_ptr(struct bpf_iter_meta *, meta);
2693        __bpf_md_ptr(struct sock_common *, sk_common);
2694        uid_t uid __aligned(8);
2695};
2696
2697static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2698                             struct sock_common *sk_common, uid_t uid)
2699{
2700        struct bpf_iter__tcp ctx;
2701
2702        meta->seq_num--;  /* skip SEQ_START_TOKEN */
2703        ctx.meta = meta;
2704        ctx.sk_common = sk_common;
2705        ctx.uid = uid;
2706        return bpf_iter_run_prog(prog, &ctx);
2707}
2708
2709static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2710{
2711        while (iter->cur_sk < iter->end_sk)
2712                sock_put(iter->batch[iter->cur_sk++]);
2713}
2714
2715static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2716                                      unsigned int new_batch_sz)
2717{
2718        struct sock **new_batch;
2719
2720        new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2721                             GFP_USER | __GFP_NOWARN);
2722        if (!new_batch)
2723                return -ENOMEM;
2724
2725        bpf_iter_tcp_put_batch(iter);
2726        kvfree(iter->batch);
2727        iter->batch = new_batch;
2728        iter->max_sk = new_batch_sz;
2729
2730        return 0;
2731}
2732
2733static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2734                                                 struct sock *start_sk)
2735{
2736        struct bpf_tcp_iter_state *iter = seq->private;
2737        struct tcp_iter_state *st = &iter->state;
2738        struct hlist_nulls_node *node;
2739        unsigned int expected = 1;
2740        struct sock *sk;
2741
2742        sock_hold(start_sk);
2743        iter->batch[iter->end_sk++] = start_sk;
2744
2745        sk = sk_nulls_next(start_sk);
2746        sk_nulls_for_each_from(sk, node) {
2747                if (seq_sk_match(seq, sk)) {
2748                        if (iter->end_sk < iter->max_sk) {
2749                                sock_hold(sk);
2750                                iter->batch[iter->end_sk++] = sk;
2751                        }
2752                        expected++;
2753                }
2754        }
2755        spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2756
2757        return expected;
2758}
2759
2760static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2761                                                   struct sock *start_sk)
2762{
2763        struct bpf_tcp_iter_state *iter = seq->private;
2764        struct tcp_iter_state *st = &iter->state;
2765        struct hlist_nulls_node *node;
2766        unsigned int expected = 1;
2767        struct sock *sk;
2768
2769        sock_hold(start_sk);
2770        iter->batch[iter->end_sk++] = start_sk;
2771
2772        sk = sk_nulls_next(start_sk);
2773        sk_nulls_for_each_from(sk, node) {
2774                if (seq_sk_match(seq, sk)) {
2775                        if (iter->end_sk < iter->max_sk) {
2776                                sock_hold(sk);
2777                                iter->batch[iter->end_sk++] = sk;
2778                        }
2779                        expected++;
2780                }
2781        }
2782        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2783
2784        return expected;
2785}
2786
2787static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2788{
2789        struct bpf_tcp_iter_state *iter = seq->private;
2790        struct tcp_iter_state *st = &iter->state;
2791        unsigned int expected;
2792        bool resized = false;
2793        struct sock *sk;
2794
2795        /* The st->bucket is done.  Directly advance to the next
2796         * bucket instead of having tcp_seek_last_pos() skip sockets
2797         * one by one in the current bucket, only to eventually find out
2798         * it has to advance to the next bucket.
2799         */
2800        if (iter->st_bucket_done) {
2801                st->offset = 0;
2802                st->bucket++;
2803                if (st->state == TCP_SEQ_STATE_LISTENING &&
2804                    st->bucket > tcp_hashinfo.lhash2_mask) {
2805                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2806                        st->bucket = 0;
2807                }
2808        }
2809
2810again:
2811        /* Get a new batch */
2812        iter->cur_sk = 0;
2813        iter->end_sk = 0;
2814        iter->st_bucket_done = false;
2815
2816        sk = tcp_seek_last_pos(seq);
2817        if (!sk)
2818                return NULL; /* Done */
2819
2820        if (st->state == TCP_SEQ_STATE_LISTENING)
2821                expected = bpf_iter_tcp_listening_batch(seq, sk);
2822        else
2823                expected = bpf_iter_tcp_established_batch(seq, sk);
2824
2825        if (iter->end_sk == expected) {
2826                iter->st_bucket_done = true;
2827                return sk;
2828        }
2829
2830        if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2831                resized = true;
2832                goto again;
2833        }
2834
2835        return sk;
2836}
2837
2838static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2839{
2840        /* bpf iter does not support lseek, so it always
2841         * continues from where it was stop()-ped.
2842         */
2843        if (*pos)
2844                return bpf_iter_tcp_batch(seq);
2845
2846        return SEQ_START_TOKEN;
2847}
2848
2849static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2850{
2851        struct bpf_tcp_iter_state *iter = seq->private;
2852        struct tcp_iter_state *st = &iter->state;
2853        struct sock *sk;
2854
2855        /* Whenever seq_next() is called, the iter->cur_sk is
2856         * done with seq_show(), so advance to the next sk in
2857         * the batch.
2858         */
2859        if (iter->cur_sk < iter->end_sk) {
2860                /* Keeping st->num consistent in tcp_iter_state.
2861                 * bpf_iter_tcp does not use st->num.
2862                 * meta.seq_num is used instead.
2863                 */
2864                st->num++;
2865                /* Move st->offset to the next sk in the bucket such that
2866                 * the future start() will resume at st->offset in
2867                 * st->bucket.  See tcp_seek_last_pos().
2868                 */
2869                st->offset++;
2870                sock_put(iter->batch[iter->cur_sk++]);
2871        }
2872
2873        if (iter->cur_sk < iter->end_sk)
2874                sk = iter->batch[iter->cur_sk];
2875        else
2876                sk = bpf_iter_tcp_batch(seq);
2877
2878        ++*pos;
2879        /* Keeping st->last_pos consistent in tcp_iter_state.
2880         * bpf iter does not do lseek, so st->last_pos always equals *pos.
2881         */
2882        st->last_pos = *pos;
2883        return sk;
2884}
2885
2886static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2887{
2888        struct bpf_iter_meta meta;
2889        struct bpf_prog *prog;
2890        struct sock *sk = v;
2891        bool slow;
2892        uid_t uid;
2893        int ret;
2894
2895        if (v == SEQ_START_TOKEN)
2896                return 0;
2897
2898        if (sk_fullsock(sk))
2899                slow = lock_sock_fast(sk);
2900
2901        if (unlikely(sk_unhashed(sk))) {
2902                ret = SEQ_SKIP;
2903                goto unlock;
2904        }
2905
2906        if (sk->sk_state == TCP_TIME_WAIT) {
2907                uid = 0;
2908        } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2909                const struct request_sock *req = v;
2910
2911                uid = from_kuid_munged(seq_user_ns(seq),
2912                                       sock_i_uid(req->rsk_listener));
2913        } else {
2914                uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2915        }
2916
2917        meta.seq = seq;
2918        prog = bpf_iter_get_info(&meta, false);
2919        ret = tcp_prog_seq_show(prog, &meta, v, uid);
2920
2921unlock:
2922        if (sk_fullsock(sk))
2923                unlock_sock_fast(sk, slow);
2924        return ret;
2926}
2927
2928static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2929{
2930        struct bpf_tcp_iter_state *iter = seq->private;
2931        struct bpf_iter_meta meta;
2932        struct bpf_prog *prog;
2933
2934        if (!v) {
2935                meta.seq = seq;
2936                prog = bpf_iter_get_info(&meta, true);
2937                if (prog)
2938                        (void)tcp_prog_seq_show(prog, &meta, v, 0);
2939        }
2940
2941        if (iter->cur_sk < iter->end_sk) {
2942                bpf_iter_tcp_put_batch(iter);
2943                iter->st_bucket_done = false;
2944        }
2945}
2946
2947static const struct seq_operations bpf_iter_tcp_seq_ops = {
2948        .show           = bpf_iter_tcp_seq_show,
2949        .start          = bpf_iter_tcp_seq_start,
2950        .next           = bpf_iter_tcp_seq_next,
2951        .stop           = bpf_iter_tcp_seq_stop,
2952};
2953#endif
2954static unsigned short seq_file_family(const struct seq_file *seq)
2955{
2956        const struct tcp_seq_afinfo *afinfo;
2957
2958#ifdef CONFIG_BPF_SYSCALL
2959        /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
2960        if (seq->op == &bpf_iter_tcp_seq_ops)
2961                return AF_UNSPEC;
2962#endif
2963
2964        /* Iterated from proc fs */
2965        afinfo = pde_data(file_inode(seq->file));
2966        return afinfo->family;
2967}
2968
2969static const struct seq_operations tcp4_seq_ops = {
2970        .show           = tcp4_seq_show,
2971        .start          = tcp_seq_start,
2972        .next           = tcp_seq_next,
2973        .stop           = tcp_seq_stop,
2974};
2975
2976static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2977        .family         = AF_INET,
2978};
2979
2980static int __net_init tcp4_proc_init_net(struct net *net)
2981{
2982        if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2983                        sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2984                return -ENOMEM;
2985        return 0;
2986}
2987
2988static void __net_exit tcp4_proc_exit_net(struct net *net)
2989{
2990        remove_proc_entry("tcp", net->proc_net);
2991}
2992
2993static struct pernet_operations tcp4_net_ops = {
2994        .init = tcp4_proc_init_net,
2995        .exit = tcp4_proc_exit_net,
2996};
2997
2998int __init tcp4_proc_init(void)
2999{
3000        return register_pernet_subsys(&tcp4_net_ops);
3001}
3002
3003void tcp4_proc_exit(void)
3004{
3005        unregister_pernet_subsys(&tcp4_net_ops);
3006}
3007#endif /* CONFIG_PROC_FS */
3008
3009/* @wake is one when sk_stream_write_space() calls us.
3010 * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3011 * This mimics the strategy used in sock_def_write_space().
3012 */
3013bool tcp_stream_memory_free(const struct sock *sk, int wake)
3014{
3015        const struct tcp_sock *tp = tcp_sk(sk);
3016        u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3017                            READ_ONCE(tp->snd_nxt);
3018
3019        return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3020}
3021EXPORT_SYMBOL(tcp_stream_memory_free);
3022
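/* tcp_prot is the struct proto that af_inet registers for
 * SOCK_STREAM/IPPROTO_TCP sockets (see inetsw_array in net/ipv4/af_inet.c):
 * it maps the generic socket calls (connect, sendmsg, setsockopt, ...) onto
 * the TCP implementation and points the core at TCP's memory accounting,
 * request/timewait ops and hash tables.
 */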
3023struct proto tcp_prot = {
3024        .name                   = "TCP",
3025        .owner                  = THIS_MODULE,
3026        .close                  = tcp_close,
3027        .pre_connect            = tcp_v4_pre_connect,
3028        .connect                = tcp_v4_connect,
3029        .disconnect             = tcp_disconnect,
3030        .accept                 = inet_csk_accept,
3031        .ioctl                  = tcp_ioctl,
3032        .init                   = tcp_v4_init_sock,
3033        .destroy                = tcp_v4_destroy_sock,
3034        .shutdown               = tcp_shutdown,
3035        .setsockopt             = tcp_setsockopt,
3036        .getsockopt             = tcp_getsockopt,
3037        .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3038        .keepalive              = tcp_set_keepalive,
3039        .recvmsg                = tcp_recvmsg,
3040        .sendmsg                = tcp_sendmsg,
3041        .sendpage               = tcp_sendpage,
3042        .backlog_rcv            = tcp_v4_do_rcv,
3043        .release_cb             = tcp_release_cb,
3044        .hash                   = inet_hash,
3045        .unhash                 = inet_unhash,
3046        .get_port               = inet_csk_get_port,
3047        .put_port               = inet_put_port,
3048#ifdef CONFIG_BPF_SYSCALL
3049        .psock_update_sk_prot   = tcp_bpf_update_proto,
3050#endif
3051        .enter_memory_pressure  = tcp_enter_memory_pressure,
3052        .leave_memory_pressure  = tcp_leave_memory_pressure,
3053        .stream_memory_free     = tcp_stream_memory_free,
3054        .sockets_allocated      = &tcp_sockets_allocated,
3055        .orphan_count           = &tcp_orphan_count,
3056
3057        .memory_allocated       = &tcp_memory_allocated,
3058        .per_cpu_fw_alloc       = &tcp_memory_per_cpu_fw_alloc,
3059
3060        .memory_pressure        = &tcp_memory_pressure,
3061        .sysctl_mem             = sysctl_tcp_mem,
3062        .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3063        .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3064        .max_header             = MAX_TCP_HEADER,
3065        .obj_size               = sizeof(struct tcp_sock),
3066        .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3067        .twsk_prot              = &tcp_timewait_sock_ops,
3068        .rsk_prot               = &tcp_request_sock_ops,
3069        .h.hashinfo             = &tcp_hashinfo,
3070        .no_autobind            = true,
3071        .diag_destroy           = tcp_abort,
3072};
3073EXPORT_SYMBOL(tcp_prot);
3074
3075static void __net_exit tcp_sk_exit(struct net *net)
3076{
3077        struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
3078
3079        if (net->ipv4.tcp_congestion_control)
3080                bpf_module_put(net->ipv4.tcp_congestion_control,
3081                               net->ipv4.tcp_congestion_control->owner);
3082        if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
3083                kfree(tcp_death_row);
3084}
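/* tw_refcount starts at 1 (the netns' own reference, set in tcp_sk_init())
 * and each TIME_WAIT socket in the netns holds an extra reference, so the
 * per-netns tcp_death_row is freed only once the netns has exited and its
 * last timewait socket has been released (the matching decrements live in
 * the timewait code in net/ipv4/inet_timewait_sock.c).
 */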
3085
3086static int __net_init tcp_sk_init(struct net *net)
3087{
3088        int cnt;
3089
3090        net->ipv4.sysctl_tcp_ecn = 2;
3091        net->ipv4.sysctl_tcp_ecn_fallback = 1;
3092
3093        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3094        net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3095        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3096        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3097        net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3098
3099        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3100        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3101        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3102
3103        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3104        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3105        net->ipv4.sysctl_tcp_syncookies = 1;
3106        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3107        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3108        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3109        net->ipv4.sysctl_tcp_orphan_retries = 0;
3110        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3111        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3112        net->ipv4.sysctl_tcp_tw_reuse = 2;
3113        net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3114
3115        net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
3116        if (!net->ipv4.tcp_death_row)
3117                return -ENOMEM;
3118        refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3119        cnt = tcp_hashinfo.ehash_mask + 1;
3120        net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3121        net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3122
3123        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3124        net->ipv4.sysctl_tcp_sack = 1;
3125        net->ipv4.sysctl_tcp_window_scaling = 1;
3126        net->ipv4.sysctl_tcp_timestamps = 1;
3127        net->ipv4.sysctl_tcp_early_retrans = 3;
3128        net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3129        net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3130        net->ipv4.sysctl_tcp_retrans_collapse = 1;
3131        net->ipv4.sysctl_tcp_max_reordering = 300;
3132        net->ipv4.sysctl_tcp_dsack = 1;
3133        net->ipv4.sysctl_tcp_app_win = 31;
3134        net->ipv4.sysctl_tcp_adv_win_scale = 1;
3135        net->ipv4.sysctl_tcp_frto = 2;
3136        net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3137        /* This limits the percentage of the congestion window which we
3138         * will allow a single TSO frame to consume.  Building TSO frames
3139         * which are too large can cause TCP streams to be bursty.
3140         */
3141        net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3142        /* Default TSQ limit of 16 TSO segments */
3143        net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3144
3145        /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3146        net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3147
3148        net->ipv4.sysctl_tcp_min_tso_segs = 2;
3149        net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3150        net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3151        net->ipv4.sysctl_tcp_autocorking = 1;
3152        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3153        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3154        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3155        if (net != &init_net) {
3156                memcpy(net->ipv4.sysctl_tcp_rmem,
3157                       init_net.ipv4.sysctl_tcp_rmem,
3158                       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3159                memcpy(net->ipv4.sysctl_tcp_wmem,
3160                       init_net.ipv4.sysctl_tcp_wmem,
3161                       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3162        }
3163        net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3164        net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3165        net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3166        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3167        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3168        atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3169
3170        /* Reno is always built in */
3171        if (!net_eq(net, &init_net) &&
3172            bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3173                               init_net.ipv4.tcp_congestion_control->owner))
3174                net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3175        else
3176                net->ipv4.tcp_congestion_control = &tcp_reno;
3177
3178        return 0;
3179}
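/* The values above are per-netns defaults exposed under /proc/sys/net/ipv4/
 * (see net/ipv4/sysctl_net_ipv4.c); e.g. a freshly created netns reports
 * net.ipv4.tcp_syncookies = 1 and net.ipv4.tcp_ecn = 2 regardless of how
 * init_net has been tuned, while tcp_rmem/tcp_wmem and the congestion
 * control are instead inherited from init_net as done above.
 */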
3180
3181static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3182{
3183        struct net *net;
3184
3185        inet_twsk_purge(&tcp_hashinfo, AF_INET);
3186
3187        list_for_each_entry(net, net_exit_list, exit_list)
3188                tcp_fastopen_ctx_destroy(net);
3189}
3190
3191static struct pernet_operations __net_initdata tcp_sk_ops = {
3192       .init       = tcp_sk_init,
3193       .exit       = tcp_sk_exit,
3194       .exit_batch = tcp_sk_exit_batch,
3195};
3196
3197#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3198DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3199                     struct sock_common *sk_common, uid_t uid)
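/* DEFINE_BPF_ITER_FUNC() declares the attach point for SEC("iter/tcp")
 * programs; the arguments listed above become the fields of the
 * struct bpf_iter__tcp context the program receives.  A minimal sketch of
 * such a program (it lives in a separate BPF object, not in this file, and
 * assumes a generated vmlinux.h plus libbpf's BPF_SEQ_PRINTF helper macro):
 *
 *        SEC("iter/tcp")
 *        int dump_tcp(struct bpf_iter__tcp *ctx)
 *        {
 *                struct sock_common *sk_common = ctx->sk_common;
 *                struct seq_file *seq = ctx->meta->seq;
 *
 *                if (!sk_common)
 *                        return 0;
 *                BPF_SEQ_PRINTF(seq, "family=%d state=%d uid=%u\n",
 *                               sk_common->skc_family, sk_common->skc_state,
 *                               ctx->uid);
 *                return 0;
 *        }
 */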
3200
3201#define INIT_BATCH_SZ 16
3202
3203static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3204{
3205        struct bpf_tcp_iter_state *iter = priv_data;
3206        int err;
3207
3208        err = bpf_iter_init_seq_net(priv_data, aux);
3209        if (err)
3210                return err;
3211
3212        err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3213        if (err) {
3214                bpf_iter_fini_seq_net(priv_data);
3215                return err;
3216        }
3217
3218        return 0;
3219}
3220
3221static void bpf_iter_fini_tcp(void *priv_data)
3222{
3223        struct bpf_tcp_iter_state *iter = priv_data;
3224
3225        bpf_iter_fini_seq_net(priv_data);
3226        kvfree(iter->batch);
3227}
3228
3229static const struct bpf_iter_seq_info tcp_seq_info = {
3230        .seq_ops                = &bpf_iter_tcp_seq_ops,
3231        .init_seq_private       = bpf_iter_init_tcp,
3232        .fini_seq_private       = bpf_iter_fini_tcp,
3233        .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3234};
3235
3236static const struct bpf_func_proto *
3237bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3238                            const struct bpf_prog *prog)
3239{
3240        switch (func_id) {
3241        case BPF_FUNC_setsockopt:
3242                return &bpf_sk_setsockopt_proto;
3243        case BPF_FUNC_getsockopt:
3244                return &bpf_sk_getsockopt_proto;
3245        default:
3246                return NULL;
3247        }
3248}
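/* Exposing bpf_sk_setsockopt()/bpf_sk_getsockopt() lets an iter/tcp program
 * not only dump the sockets it visits but also adjust per-socket options on
 * them (for example TCP_CONGESTION), which is why bpf_iter_tcp_seq_show()
 * takes the socket lock before running the program on a full socket.
 */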
3249
3250static struct bpf_iter_reg tcp_reg_info = {
3251        .target                 = "tcp",
3252        .ctx_arg_info_size      = 1,
3253        .ctx_arg_info           = {
3254                { offsetof(struct bpf_iter__tcp, sk_common),
3255                  PTR_TO_BTF_ID_OR_NULL },
3256        },
3257        .get_func_proto         = bpf_iter_tcp_get_func_proto,
3258        .seq_info               = &tcp_seq_info,
3259};
3260
3261static void __init bpf_iter_register(void)
3262{
3263        tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3264        if (bpf_iter_reg_target(&tcp_reg_info))
3265                pr_warn("Warning: could not register bpf iterator tcp\n");
3266}
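/* Once the "tcp" target is registered, the iterator is typically consumed
 * from userspace by pinning an attached iter program and reading the pinned
 * file, e.g. (illustrative object name and paths):
 *
 *        bpftool iter pin ./tcp_iter.bpf.o /sys/fs/bpf/tcp_iter
 *        cat /sys/fs/bpf/tcp_iter
 */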
3267
3268#endif
3269
3270void __init tcp_v4_init(void)
3271{
3272        int cpu, res;
3273
3274        for_each_possible_cpu(cpu) {
3275                struct sock *sk;
3276
3277                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3278                                           IPPROTO_TCP, &init_net);
3279                if (res)
3280                        panic("Failed to create the TCP control socket.\n");
3281                sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3282
3283                /* Enforce IP_DF and IPID==0 for RST and ACK packets
3284                 * sent in SYN-RECV and TIME-WAIT states.
3285                 */
3286                inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3287
3288                per_cpu(ipv4_tcp_sk, cpu) = sk;
3289        }
3290        if (register_pernet_subsys(&tcp_sk_ops))
3291                panic("Failed to register the TCP pernet operations.\n");
3292
3293#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3294        bpf_iter_register();
3295#endif
3296}
3297