linux/net/ipv4/tcp_ipv4.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 *              IPv4 specific functions
   9 *
  10 *
  11 *              code split from:
  12 *              linux/ipv4/tcp.c
  13 *              linux/ipv4/tcp_input.c
  14 *              linux/ipv4/tcp_output.c
  15 *
  16 *              See tcp.c for author information
  17 *
  18 *      This program is free software; you can redistribute it and/or
  19 *      modify it under the terms of the GNU General Public License
  20 *      as published by the Free Software Foundation; either version
  21 *      2 of the License, or (at your option) any later version.
  22 */
  23
  24/*
  25 * Changes:
  26 *              David S. Miller :       New socket lookup architecture.
  27 *                                      This code is dedicated to John Dyson.
  28 *              David S. Miller :       Change semantics of established hash,
  29 *                                      half is devoted to TIME_WAIT sockets
  30 *                                      and the rest go in the other half.
  31 *              Andi Kleen :            Add support for syncookies and fixed
  32 *                                      some bugs: ip options weren't passed to
  33 *                                      the TCP layer, missed a check for an
  34 *                                      ACK bit.
  35 *              Andi Kleen :            Implemented fast path mtu discovery.
  36 *                                      Fixed many serious bugs in the
  37 *                                      request_sock handling and moved
  38 *                                      most of it into the af independent code.
  39 *                                      Added tail drop and some other bugfixes.
  40 *                                      Added new listen semantics.
  41 *              Mike McLagan    :       Routing by source
  42 *      Juan Jose Ciarlante:            ip_dynaddr bits
  43 *              Andi Kleen:             various fixes.
  44 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45 *                                      coma.
  46 *      Andi Kleen              :       Fix new listen.
  47 *      Andi Kleen              :       Fix accept error reporting.
  48 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50 *                                      a single port at the same time.
  51 */
  52
  53#define pr_fmt(fmt) "TCP: " fmt
  54
  55#include <linux/bottom_half.h>
  56#include <linux/types.h>
  57#include <linux/fcntl.h>
  58#include <linux/module.h>
  59#include <linux/random.h>
  60#include <linux/cache.h>
  61#include <linux/jhash.h>
  62#include <linux/init.h>
  63#include <linux/times.h>
  64#include <linux/slab.h>
  65
  66#include <net/net_namespace.h>
  67#include <net/icmp.h>
  68#include <net/inet_hashtables.h>
  69#include <net/tcp.h>
  70#include <net/transp_v6.h>
  71#include <net/ipv6.h>
  72#include <net/inet_common.h>
  73#include <net/timewait_sock.h>
  74#include <net/xfrm.h>
  75#include <net/netdma.h>
  76#include <net/secure_seq.h>
  77#include <net/tcp_memcontrol.h>
  78
  79#include <linux/inet.h>
  80#include <linux/ipv6.h>
  81#include <linux/stddef.h>
  82#include <linux/proc_fs.h>
  83#include <linux/seq_file.h>
  84
  85#include <linux/crypto.h>
  86#include <linux/scatterlist.h>
  87
  88int sysctl_tcp_tw_reuse __read_mostly;
  89int sysctl_tcp_low_latency __read_mostly;
  90EXPORT_SYMBOL(sysctl_tcp_low_latency);
  91
  92
  93#ifdef CONFIG_TCP_MD5SIG
  94static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  95                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
  96#endif
  97
  98struct inet_hashinfo tcp_hashinfo;
  99EXPORT_SYMBOL(tcp_hashinfo);
 100
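     /* Pick the initial sequence number for a connection from the 4-tuple of
      * the received SYN (our address and port first, then the peer's).
      * secure_tcp_sequence_number() mixes these with a secret and a clock so
      * that ISNs are hard to predict (in the spirit of RFC 6528).
      */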
 101static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
 102{
 103        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 104                                          ip_hdr(skb)->saddr,
 105                                          tcp_hdr(skb)->dest,
 106                                          tcp_hdr(skb)->source);
 107}
 108
 109int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 110{
 111        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 112        struct tcp_sock *tp = tcp_sk(sk);
 113
 114        /* With PAWS, it is safe from the viewpoint
 115           of data integrity. Even without PAWS it is safe provided sequence
 116           spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 117
 118           Actually, the idea is close to VJ's one, only timestamp cache is
 119           held not per host, but per port pair and TW bucket is used as state
 120           holder.
 121
 122           If TW bucket has been already destroyed we fall back to VJ's scheme
 123           and use initial timestamp retrieved from peer table.
 124         */
 125        if (tcptw->tw_ts_recent_stamp &&
 126            (twp == NULL || (sysctl_tcp_tw_reuse &&
 127                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
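                     /* Start the new connection's sequence space well past the
                      * old one: the old snd_nxt plus a full 64K window, plus a
                      * little slack, so segments from the two incarnations
                      * cannot be confused.
                      */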
 128                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 129                if (tp->write_seq == 0)
 130                        tp->write_seq = 1;
 131                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 132                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 133                sock_hold(sktw);
 134                return 1;
 135        }
 136
 137        return 0;
 138}
 139EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 140
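     /* Used by TCP_REPAIR (e.g. checkpoint/restore): put the socket straight
      * into the established state without performing a handshake; the caller
      * has already restored sequence numbers and options.
      */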
 141static int tcp_repair_connect(struct sock *sk)
 142{
 143        tcp_connect_init(sk);
 144        tcp_finish_connect(sk, NULL);
 145
 146        return 0;
 147}
 148
 149/* This will initiate an outgoing connection. */
 150int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 151{
 152        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 153        struct inet_sock *inet = inet_sk(sk);
 154        struct tcp_sock *tp = tcp_sk(sk);
 155        __be16 orig_sport, orig_dport;
 156        __be32 daddr, nexthop;
 157        struct flowi4 *fl4;
 158        struct rtable *rt;
 159        int err;
 160        struct ip_options_rcu *inet_opt;
 161
 162        if (addr_len < sizeof(struct sockaddr_in))
 163                return -EINVAL;
 164
 165        if (usin->sin_family != AF_INET)
 166                return -EAFNOSUPPORT;
 167
 168        nexthop = daddr = usin->sin_addr.s_addr;
 169        inet_opt = rcu_dereference_protected(inet->inet_opt,
 170                                             sock_owned_by_user(sk));
 171        if (inet_opt && inet_opt->opt.srr) {
 172                if (!daddr)
 173                        return -EINVAL;
 174                nexthop = inet_opt->opt.faddr;
 175        }
 176
 177        orig_sport = inet->inet_sport;
 178        orig_dport = usin->sin_port;
 179        fl4 = &inet->cork.fl.u.ip4;
 180        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 181                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 182                              IPPROTO_TCP,
 183                              orig_sport, orig_dport, sk, true);
 184        if (IS_ERR(rt)) {
 185                err = PTR_ERR(rt);
 186                if (err == -ENETUNREACH)
 187                        IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 188                return err;
 189        }
 190
 191        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 192                ip_rt_put(rt);
 193                return -ENETUNREACH;
 194        }
 195
 196        if (!inet_opt || !inet_opt->opt.srr)
 197                daddr = fl4->daddr;
 198
 199        if (!inet->inet_saddr)
 200                inet->inet_saddr = fl4->saddr;
 201        inet->inet_rcv_saddr = inet->inet_saddr;
 202
 203        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 204                /* Reset inherited state */
 205                tp->rx_opt.ts_recent       = 0;
 206                tp->rx_opt.ts_recent_stamp = 0;
 207                if (likely(!tp->repair))
 208                        tp->write_seq      = 0;
 209        }
 210
 211        if (tcp_death_row.sysctl_tw_recycle &&
 212            !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
 213                tcp_fetch_timewait_stamp(sk, &rt->dst);
 214
 215        inet->inet_dport = usin->sin_port;
 216        inet->inet_daddr = daddr;
 217
 218        inet_csk(sk)->icsk_ext_hdr_len = 0;
 219        if (inet_opt)
 220                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 221
 222        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 223
 224        /* Socket identity is still unknown (sport may be zero).
  225         * However we set state to SYN-SENT and, without releasing the socket
  226         * lock, select a source port, enter ourselves into the hash tables and
 227         * complete initialization after this.
 228         */
 229        tcp_set_state(sk, TCP_SYN_SENT);
 230        err = inet_hash_connect(&tcp_death_row, sk);
 231        if (err)
 232                goto failure;
 233
 234        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 235                               inet->inet_sport, inet->inet_dport, sk);
 236        if (IS_ERR(rt)) {
 237                err = PTR_ERR(rt);
 238                rt = NULL;
 239                goto failure;
 240        }
 241        /* OK, now commit destination to socket.  */
 242        sk->sk_gso_type = SKB_GSO_TCPV4;
 243        sk_setup_caps(sk, &rt->dst);
 244
 245        if (!tp->write_seq && likely(!tp->repair))
 246                tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 247                                                           inet->inet_daddr,
 248                                                           inet->inet_sport,
 249                                                           usin->sin_port);
 250
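             /* Seed the per-socket IP ID counter; mixing the initial sequence
              * number with jiffies keeps the starting point from being
              * trivially predictable.
              */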
 251        inet->inet_id = tp->write_seq ^ jiffies;
 252
 253        if (likely(!tp->repair))
 254                err = tcp_connect(sk);
 255        else
 256                err = tcp_repair_connect(sk);
 257
 258        rt = NULL;
 259        if (err)
 260                goto failure;
 261
 262        return 0;
 263
 264failure:
 265        /*
 266         * This unhashes the socket and releases the local port,
 267         * if necessary.
 268         */
 269        tcp_set_state(sk, TCP_CLOSE);
 270        ip_rt_put(rt);
 271        sk->sk_route_caps = 0;
 272        inet->inet_dport = 0;
 273        return err;
 274}
 275EXPORT_SYMBOL(tcp_v4_connect);
 276
 277/*
 278 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
  279 * It can be called through tcp_release_cb() if the socket was owned by the user
  280 * at the time tcp_v4_err() was called to handle the ICMP message.
 281 */
 282static void tcp_v4_mtu_reduced(struct sock *sk)
 283{
 284        struct dst_entry *dst;
 285        struct inet_sock *inet = inet_sk(sk);
 286        u32 mtu = tcp_sk(sk)->mtu_info;
 287
 288        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
  289         * sent out by Linux are always < 576 bytes so they should go through
 290         * unfragmented).
 291         */
 292        if (sk->sk_state == TCP_LISTEN)
 293                return;
 294
 295        dst = inet_csk_update_pmtu(sk, mtu);
 296        if (!dst)
 297                return;
 298
  299        /* Something is about to go wrong... Remember the soft error
  300         * in case this connection is not able to recover.
 301         */
 302        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 303                sk->sk_err_soft = EMSGSIZE;
 304
 305        mtu = dst_mtu(dst);
 306
 307        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 308            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 309                tcp_sync_mss(sk, mtu);
 310
 311                /* Resend the TCP packet because it's
 312                 * clear that the old packet has been
 313                 * dropped. This is the new "fast" path mtu
 314                 * discovery.
 315                 */
 316                tcp_simple_retransmit(sk);
 317        } /* else let the usual retransmit timer handle it */
 318}
 319
 320static void do_redirect(struct sk_buff *skb, struct sock *sk)
 321{
 322        struct dst_entry *dst = __sk_dst_check(sk, 0);
 323
 324        if (dst)
 325                dst->ops->redirect(dst, sk, skb);
 326}
 327
 328/*
 329 * This routine is called by the ICMP module when it gets some
 330 * sort of error condition.  If err < 0 then the socket should
 331 * be closed and the error returned to the user.  If err > 0
  332 * it's just the icmp type << 8 | icmp code.  After adjustment, the
 333 * header points to the first 8 bytes of the tcp header.  We need
 334 * to find the appropriate port.
 335 *
 336 * The locking strategy used here is very "optimistic". When
 337 * someone else accesses the socket the ICMP is just dropped
 338 * and for some paths there is no check at all.
 339 * A more general error queue to queue errors for later handling
 340 * is probably better.
 341 *
 342 */
 343
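     /* Note: by the time we get here the ICMP layer has already pulled the
      * outer ICMP header, so icmp_skb->data points at the embedded IP header
      * of the original segment, while icmp_hdr(icmp_skb) still reaches the
      * outer type/code.
      */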
 344void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 345{
 346        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 347        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 348        struct inet_connection_sock *icsk;
 349        struct tcp_sock *tp;
 350        struct inet_sock *inet;
 351        const int type = icmp_hdr(icmp_skb)->type;
 352        const int code = icmp_hdr(icmp_skb)->code;
 353        struct sock *sk;
 354        struct sk_buff *skb;
 355        struct request_sock *req;
 356        __u32 seq;
 357        __u32 remaining;
 358        int err;
 359        struct net *net = dev_net(icmp_skb->dev);
 360
 361        if (icmp_skb->len < (iph->ihl << 2) + 8) {
 362                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 363                return;
 364        }
 365
 366        sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 367                        iph->saddr, th->source, inet_iif(icmp_skb));
 368        if (!sk) {
 369                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 370                return;
 371        }
 372        if (sk->sk_state == TCP_TIME_WAIT) {
 373                inet_twsk_put(inet_twsk(sk));
 374                return;
 375        }
 376
 377        bh_lock_sock(sk);
 378        /* If too many ICMPs get dropped on busy
 379         * servers this needs to be solved differently.
 380         * We do take care of PMTU discovery (RFC1191) special case :
 381         * we can receive locally generated ICMP messages while socket is held.
 382         */
 383        if (sock_owned_by_user(sk)) {
 384                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 385                        NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 386        }
 387        if (sk->sk_state == TCP_CLOSE)
 388                goto out;
 389
 390        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 391                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 392                goto out;
 393        }
 394
 395        icsk = inet_csk(sk);
 396        tp = tcp_sk(sk);
 397        req = tp->fastopen_rsk;
 398        seq = ntohl(th->seq);
 399        if (sk->sk_state != TCP_LISTEN &&
 400            !between(seq, tp->snd_una, tp->snd_nxt) &&
 401            (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
 402                /* For a Fast Open socket, allow seq to be snt_isn. */
 403                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 404                goto out;
 405        }
 406
 407        switch (type) {
 408        case ICMP_REDIRECT:
 409                do_redirect(icmp_skb, sk);
 410                goto out;
 411        case ICMP_SOURCE_QUENCH:
 412                /* Just silently ignore these. */
 413                goto out;
 414        case ICMP_PARAMETERPROB:
 415                err = EPROTO;
 416                break;
 417        case ICMP_DEST_UNREACH:
 418                if (code > NR_ICMP_UNREACH)
 419                        goto out;
 420
 421                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 422                        tp->mtu_info = info;
 423                        if (!sock_owned_by_user(sk)) {
 424                                tcp_v4_mtu_reduced(sk);
 425                        } else {
 426                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
 427                                        sock_hold(sk);
 428                        }
 429                        goto out;
 430                }
 431
 432                err = icmp_err_convert[code].errno;
 433                /* check if icmp_skb allows revert of backoff
 434                 * (see draft-zimmermann-tcp-lcd) */
 435                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 436                        break;
 437                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 438                    !icsk->icsk_backoff)
 439                        break;
 440
 441                /* XXX (TFO) - revisit the following logic for TFO */
 442
 443                if (sock_owned_by_user(sk))
 444                        break;
 445
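                     /* The ICMP indicates a transient connectivity problem
                      * rather than a bad RTT estimate, so undo one step of
                      * exponential backoff and re-arm the retransmit timer with
                      * whatever time is left for the head segment (per the
                      * draft cited above).
                      */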
 446                icsk->icsk_backoff--;
 447                inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
 448                        TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
 449                tcp_bound_rto(sk);
 450
 451                skb = tcp_write_queue_head(sk);
 452                BUG_ON(!skb);
 453
 454                remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 455                                tcp_time_stamp - TCP_SKB_CB(skb)->when);
 456
 457                if (remaining) {
 458                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 459                                                  remaining, TCP_RTO_MAX);
 460                } else {
 461                        /* RTO revert clocked out retransmission.
 462                         * Will retransmit now */
 463                        tcp_retransmit_timer(sk);
 464                }
 465
 466                break;
 467        case ICMP_TIME_EXCEEDED:
 468                err = EHOSTUNREACH;
 469                break;
 470        default:
 471                goto out;
 472        }
 473
 474        /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
 475         * than following the TCP_SYN_RECV case and closing the socket,
 476         * we ignore the ICMP error and keep trying like a fully established
 477         * socket. Is this the right thing to do?
 478         */
 479        if (req && req->sk == NULL)
 480                goto out;
 481
 482        switch (sk->sk_state) {
 483                struct request_sock *req, **prev;
 484        case TCP_LISTEN:
 485                if (sock_owned_by_user(sk))
 486                        goto out;
 487
 488                req = inet_csk_search_req(sk, &prev, th->dest,
 489                                          iph->daddr, iph->saddr);
 490                if (!req)
 491                        goto out;
 492
 493                /* ICMPs are not backlogged, hence we cannot get
 494                   an established socket here.
 495                 */
 496                WARN_ON(req->sk);
 497
 498                if (seq != tcp_rsk(req)->snt_isn) {
 499                        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 500                        goto out;
 501                }
 502
 503                /*
 504                 * Still in SYN_RECV, just remove it silently.
 505                 * There is no good way to pass the error to the newly
 506                 * created socket, and POSIX does not want network
 507                 * errors returned from accept().
 508                 */
 509                inet_csk_reqsk_queue_drop(sk, req, prev);
 510                goto out;
 511
 512        case TCP_SYN_SENT:
  513        case TCP_SYN_RECV:  /* Normally cannot happen.
  514                               It can, e.g., if SYNs crossed,
  515                               or with Fast Open.
 516                             */
 517                if (!sock_owned_by_user(sk)) {
 518                        sk->sk_err = err;
 519
 520                        sk->sk_error_report(sk);
 521
 522                        tcp_done(sk);
 523                } else {
 524                        sk->sk_err_soft = err;
 525                }
 526                goto out;
 527        }
 528
 529        /* If we've already connected we will keep trying
 530         * until we time out, or the user gives up.
 531         *
  532         * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
  533         * considered hard errors (well, FRAG_FAILED too,
  534         * but it is obsoleted by pmtu discovery).
  535         *
  536         * Note that in the modern internet, where routing is unreliable
  537         * and broken firewalls sit in every dark corner, sending random
  538         * errors ordered by their masters, even these two messages finally lose
  539         * their original sense (even Linux sends invalid PORT_UNREACHs).
 540         *
 541         * Now we are in compliance with RFCs.
 542         *                                                      --ANK (980905)
 543         */
 544
 545        inet = inet_sk(sk);
 546        if (!sock_owned_by_user(sk) && inet->recverr) {
 547                sk->sk_err = err;
 548                sk->sk_error_report(sk);
 549        } else  { /* Only an error on timeout */
 550                sk->sk_err_soft = err;
 551        }
 552
 553out:
 554        bh_unlock_sock(sk);
 555        sock_put(sk);
 556}
 557
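     /* Fill in the TCP checksum of an outgoing segment.  With CHECKSUM_PARTIAL
      * only the pseudo-header sum is stored and csum_start/csum_offset tell the
      * device (or the software fallback) where to finish the job; otherwise the
      * checksum is completed here from the header plus the payload sum already
      * accumulated in skb->csum.
      */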
 558static void __tcp_v4_send_check(struct sk_buff *skb,
 559                                __be32 saddr, __be32 daddr)
 560{
 561        struct tcphdr *th = tcp_hdr(skb);
 562
 563        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 564                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 565                skb->csum_start = skb_transport_header(skb) - skb->head;
 566                skb->csum_offset = offsetof(struct tcphdr, check);
 567        } else {
 568                th->check = tcp_v4_check(skb->len, saddr, daddr,
 569                                         csum_partial(th,
 570                                                      th->doff << 2,
 571                                                      skb->csum));
 572        }
 573}
 574
 575/* This routine computes an IPv4 TCP checksum. */
 576void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 577{
 578        const struct inet_sock *inet = inet_sk(sk);
 579
 580        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 581}
 582EXPORT_SYMBOL(tcp_v4_send_check);
 583
 584int tcp_v4_gso_send_check(struct sk_buff *skb)
 585{
 586        const struct iphdr *iph;
 587        struct tcphdr *th;
 588
 589        if (!pskb_may_pull(skb, sizeof(*th)))
 590                return -EINVAL;
 591
 592        iph = ip_hdr(skb);
 593        th = tcp_hdr(skb);
 594
 595        th->check = 0;
 596        skb->ip_summed = CHECKSUM_PARTIAL;
 597        __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
 598        return 0;
 599}
 600
 601/*
 602 *      This routine will send an RST to the other tcp.
 603 *
  604 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
  605 *                    for the reset?
  606 *      Answer: if a packet caused an RST, it is not for a socket
  607 *              existing in our system; if it is matched to a socket,
  608 *              it is just a duplicate segment or a bug in the other side's TCP.
  609 *              So we build the reply based only on the parameters
  610 *              that arrived with the segment.
 611 *      Exception: precedence violation. We do not implement it in any case.
 612 */
 613
 614static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 615{
 616        const struct tcphdr *th = tcp_hdr(skb);
 617        struct {
 618                struct tcphdr th;
 619#ifdef CONFIG_TCP_MD5SIG
 620                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 621#endif
 622        } rep;
 623        struct ip_reply_arg arg;
 624#ifdef CONFIG_TCP_MD5SIG
 625        struct tcp_md5sig_key *key;
 626        const __u8 *hash_location = NULL;
 627        unsigned char newhash[16];
 628        int genhash;
 629        struct sock *sk1 = NULL;
 630#endif
 631        struct net *net;
 632
 633        /* Never send a reset in response to a reset. */
 634        if (th->rst)
 635                return;
 636
 637        if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 638                return;
 639
 640        /* Swap the send and the receive. */
 641        memset(&rep, 0, sizeof(rep));
 642        rep.th.dest   = th->source;
 643        rep.th.source = th->dest;
 644        rep.th.doff   = sizeof(struct tcphdr) / 4;
 645        rep.th.rst    = 1;
 646
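             /* RFC 793 reset generation: if the offending segment carried an
              * ACK, the RST takes that acknowledgment number as its sequence
              * number; otherwise send RST|ACK and acknowledge everything the
              * segment occupied (SYN and FIN each count for one sequence
              * number).
              */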
 647        if (th->ack) {
 648                rep.th.seq = th->ack_seq;
 649        } else {
 650                rep.th.ack = 1;
 651                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 652                                       skb->len - (th->doff << 2));
 653        }
 654
 655        memset(&arg, 0, sizeof(arg));
 656        arg.iov[0].iov_base = (unsigned char *)&rep;
 657        arg.iov[0].iov_len  = sizeof(rep.th);
 658
 659#ifdef CONFIG_TCP_MD5SIG
 660        hash_location = tcp_parse_md5sig_option(th);
 661        if (!sk && hash_location) {
 662                /*
  663                 * The active side is gone. Try to find the listening socket
  664                 * through the source port, and then find the md5 key through
  665                 * the listening socket. We do not lose security here: the
  666                 * incoming packet is checked against the md5 hash of the found
  667                 * key; no RST is generated if the md5 hash doesn't match.
 668                 */
 669                sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
 670                                             &tcp_hashinfo, ip_hdr(skb)->daddr,
 671                                             ntohs(th->source), inet_iif(skb));
 672                /* don't send rst if it can't find key */
 673                if (!sk1)
 674                        return;
 675                rcu_read_lock();
 676                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 677                                        &ip_hdr(skb)->saddr, AF_INET);
 678                if (!key)
 679                        goto release_sk1;
 680
 681                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
 682                if (genhash || memcmp(hash_location, newhash, 16) != 0)
 683                        goto release_sk1;
 684        } else {
 685                key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 686                                             &ip_hdr(skb)->saddr,
 687                                             AF_INET) : NULL;
 688        }
 689
 690        if (key) {
 691                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 692                                   (TCPOPT_NOP << 16) |
 693                                   (TCPOPT_MD5SIG << 8) |
 694                                   TCPOLEN_MD5SIG);
 695                /* Update length and the length the header thinks exists */
 696                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 697                rep.th.doff = arg.iov[0].iov_len / 4;
 698
 699                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 700                                     key, ip_hdr(skb)->saddr,
 701                                     ip_hdr(skb)->daddr, &rep.th);
 702        }
 703#endif
 704        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 705                                      ip_hdr(skb)->saddr, /* XXX */
 706                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 707        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 708        arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
  709        /* When the socket is gone, all binding information is lost and
  710         * routing might fail in this case. No choice here: if we choose to force
  711         * the input interface, we will misroute in case of an asymmetric route.
 712         */
 713        if (sk)
 714                arg.bound_dev_if = sk->sk_bound_dev_if;
 715
 716        net = dev_net(skb_dst(skb)->dev);
 717        arg.tos = ip_hdr(skb)->tos;
 718        ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
 719                              ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 720
 721        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 722        TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 723
 724#ifdef CONFIG_TCP_MD5SIG
 725release_sk1:
 726        if (sk1) {
 727                rcu_read_unlock();
 728                sock_put(sk1);
 729        }
 730#endif
 731}
 732
  733/* The code below, which sends ACKs in the SYN-RECV and TIME-WAIT states
  734   outside socket context, is certainly ugly. What can I do?
 735 */
 736
 737static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 738                            u32 win, u32 ts, int oif,
 739                            struct tcp_md5sig_key *key,
 740                            int reply_flags, u8 tos)
 741{
 742        const struct tcphdr *th = tcp_hdr(skb);
 743        struct {
 744                struct tcphdr th;
 745                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 746#ifdef CONFIG_TCP_MD5SIG
 747                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 748#endif
 749                        ];
 750        } rep;
 751        struct ip_reply_arg arg;
 752        struct net *net = dev_net(skb_dst(skb)->dev);
 753
 754        memset(&rep.th, 0, sizeof(struct tcphdr));
 755        memset(&arg, 0, sizeof(arg));
 756
 757        arg.iov[0].iov_base = (unsigned char *)&rep;
 758        arg.iov[0].iov_len  = sizeof(rep.th);
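             /* If a timestamp must be echoed, lay it out as NOP,NOP,TIMESTAMP so
              * the 10-byte option stays 32-bit aligned; rep.opt[1] carries our
              * current timestamp and rep.opt[2] echoes the peer's.
              */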
 759        if (ts) {
 760                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 761                                   (TCPOPT_TIMESTAMP << 8) |
 762                                   TCPOLEN_TIMESTAMP);
 763                rep.opt[1] = htonl(tcp_time_stamp);
 764                rep.opt[2] = htonl(ts);
 765                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 766        }
 767
 768        /* Swap the send and the receive. */
 769        rep.th.dest    = th->source;
 770        rep.th.source  = th->dest;
 771        rep.th.doff    = arg.iov[0].iov_len / 4;
 772        rep.th.seq     = htonl(seq);
 773        rep.th.ack_seq = htonl(ack);
 774        rep.th.ack     = 1;
 775        rep.th.window  = htons(win);
 776
 777#ifdef CONFIG_TCP_MD5SIG
 778        if (key) {
 779                int offset = (ts) ? 3 : 0;
 780
 781                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 782                                          (TCPOPT_NOP << 16) |
 783                                          (TCPOPT_MD5SIG << 8) |
 784                                          TCPOLEN_MD5SIG);
 785                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 786                rep.th.doff = arg.iov[0].iov_len/4;
 787
 788                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 789                                    key, ip_hdr(skb)->saddr,
 790                                    ip_hdr(skb)->daddr, &rep.th);
 791        }
 792#endif
 793        arg.flags = reply_flags;
 794        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 795                                      ip_hdr(skb)->saddr, /* XXX */
 796                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 797        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 798        if (oif)
 799                arg.bound_dev_if = oif;
 800        arg.tos = tos;
 801        ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
 802                              ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 803
 804        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 805}
 806
 807static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 808{
 809        struct inet_timewait_sock *tw = inet_twsk(sk);
 810        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 811
 812        tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 813                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 814                        tcptw->tw_ts_recent,
 815                        tw->tw_bound_dev_if,
 816                        tcp_twsk_md5_key(tcptw),
 817                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 818                        tw->tw_tos
 819                        );
 820
 821        inet_twsk_put(tw);
 822}
 823
 824static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 825                                  struct request_sock *req)
 826{
 827        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 828         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 829         */
 830        tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
 831                        tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
 832                        tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
 833                        req->ts_recent,
 834                        0,
 835                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
 836                                          AF_INET),
 837                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 838                        ip_hdr(skb)->tos);
 839}
 840
 841/*
 842 *      Send a SYN-ACK after having received a SYN.
 843 *      This still operates on a request_sock only, not on a big
 844 *      socket.
 845 */
 846static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 847                              struct request_sock *req,
 848                              struct request_values *rvp,
 849                              u16 queue_mapping,
 850                              bool nocache)
 851{
 852        const struct inet_request_sock *ireq = inet_rsk(req);
 853        struct flowi4 fl4;
 854        int err = -1;
  855        struct sk_buff *skb;
 856
 857        /* First, grab a route. */
 858        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 859                return -1;
 860
 861        skb = tcp_make_synack(sk, dst, req, rvp, NULL);
 862
 863        if (skb) {
 864                __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 865
 866                skb_set_queue_mapping(skb, queue_mapping);
 867                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 868                                            ireq->rmt_addr,
 869                                            ireq->opt);
 870                err = net_xmit_eval(err);
 871                if (!tcp_rsk(req)->snt_synack && !err)
 872                        tcp_rsk(req)->snt_synack = tcp_time_stamp;
 873        }
 874
 875        return err;
 876}
 877
 878static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 879                              struct request_values *rvp)
 880{
 881        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 882        return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
 883}
 884
 885/*
 886 *      IPv4 request_sock destructor.
 887 */
 888static void tcp_v4_reqsk_destructor(struct request_sock *req)
 889{
 890        kfree(inet_rsk(req)->opt);
 891}
 892
 893/*
 894 * Return true if a syncookie should be sent
 895 */
 896bool tcp_syn_flood_action(struct sock *sk,
 897                         const struct sk_buff *skb,
 898                         const char *proto)
 899{
 900        const char *msg = "Dropping request";
 901        bool want_cookie = false;
 902        struct listen_sock *lopt;
 903
 904
 905
 906#ifdef CONFIG_SYN_COOKIES
 907        if (sysctl_tcp_syncookies) {
 908                msg = "Sending cookies";
 909                want_cookie = true;
 910                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
 911        } else
 912#endif
 913                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
 914
 915        lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
 916        if (!lopt->synflood_warned) {
 917                lopt->synflood_warned = 1;
 918                pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
 919                        proto, ntohs(tcp_hdr(skb)->dest), msg);
 920        }
 921        return want_cookie;
 922}
 923EXPORT_SYMBOL(tcp_syn_flood_action);
 924
 925/*
 926 * Save and compile IPv4 options into the request_sock if needed.
 927 */
 928static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
 929{
 930        const struct ip_options *opt = &(IPCB(skb)->opt);
 931        struct ip_options_rcu *dopt = NULL;
 932
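             /* Copy the received IP options and let ip_options_echo() turn them
              * into the option set for our replies (e.g. reversing a source
              * route), so the SYN-ACK retraces the path the SYN took.
              */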
 933        if (opt && opt->optlen) {
 934                int opt_size = sizeof(*dopt) + opt->optlen;
 935
 936                dopt = kmalloc(opt_size, GFP_ATOMIC);
 937                if (dopt) {
 938                        if (ip_options_echo(&dopt->opt, skb)) {
 939                                kfree(dopt);
 940                                dopt = NULL;
 941                        }
 942                }
 943        }
 944        return dopt;
 945}
 946
 947#ifdef CONFIG_TCP_MD5SIG
 948/*
 949 * RFC2385 MD5 checksumming requires a mapping of
 950 * IP address->MD5 Key.
 951 * We need to maintain these in the sk structure.
 952 */
 953
 954/* Find the Key structure for an address.  */
 955struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
 956                                         const union tcp_md5_addr *addr,
 957                                         int family)
 958{
 959        struct tcp_sock *tp = tcp_sk(sk);
 960        struct tcp_md5sig_key *key;
 961        struct hlist_node *pos;
 962        unsigned int size = sizeof(struct in_addr);
 963        struct tcp_md5sig_info *md5sig;
 964
 965        /* caller either holds rcu_read_lock() or socket lock */
 966        md5sig = rcu_dereference_check(tp->md5sig_info,
 967                                       sock_owned_by_user(sk) ||
 968                                       lockdep_is_held(&sk->sk_lock.slock));
 969        if (!md5sig)
 970                return NULL;
 971#if IS_ENABLED(CONFIG_IPV6)
 972        if (family == AF_INET6)
 973                size = sizeof(struct in6_addr);
 974#endif
 975        hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
 976                if (key->family != family)
 977                        continue;
 978                if (!memcmp(&key->addr, addr, size))
 979                        return key;
 980        }
 981        return NULL;
 982}
 983EXPORT_SYMBOL(tcp_md5_do_lookup);
 984
 985struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 986                                         struct sock *addr_sk)
 987{
 988        union tcp_md5_addr *addr;
 989
 990        addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
 991        return tcp_md5_do_lookup(sk, addr, AF_INET);
 992}
 993EXPORT_SYMBOL(tcp_v4_md5_lookup);
 994
 995static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 996                                                      struct request_sock *req)
 997{
 998        union tcp_md5_addr *addr;
 999
1000        addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
1001        return tcp_md5_do_lookup(sk, addr, AF_INET);
1002}
1003
1004/* This can be called on a newly created socket, from other files */
1005int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1006                   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
1007{
1008        /* Add Key to the list */
1009        struct tcp_md5sig_key *key;
1010        struct tcp_sock *tp = tcp_sk(sk);
1011        struct tcp_md5sig_info *md5sig;
1012
 1013        key = tcp_md5_do_lookup(sk, addr, family);
1014        if (key) {
1015                /* Pre-existing entry - just update that one. */
1016                memcpy(key->key, newkey, newkeylen);
1017                key->keylen = newkeylen;
1018                return 0;
1019        }
1020
1021        md5sig = rcu_dereference_protected(tp->md5sig_info,
1022                                           sock_owned_by_user(sk));
1023        if (!md5sig) {
1024                md5sig = kmalloc(sizeof(*md5sig), gfp);
1025                if (!md5sig)
1026                        return -ENOMEM;
1027
1028                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1029                INIT_HLIST_HEAD(&md5sig->head);
1030                rcu_assign_pointer(tp->md5sig_info, md5sig);
1031        }
1032
1033        key = sock_kmalloc(sk, sizeof(*key), gfp);
1034        if (!key)
1035                return -ENOMEM;
1036        if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1037                sock_kfree_s(sk, key, sizeof(*key));
1038                return -ENOMEM;
1039        }
1040
1041        memcpy(key->key, newkey, newkeylen);
1042        key->keylen = newkeylen;
1043        key->family = family;
1044        memcpy(&key->addr, addr,
1045               (family == AF_INET6) ? sizeof(struct in6_addr) :
1046                                      sizeof(struct in_addr));
1047        hlist_add_head_rcu(&key->node, &md5sig->head);
1048        return 0;
1049}
1050EXPORT_SYMBOL(tcp_md5_do_add);
1051
1052int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1053{
1054        struct tcp_sock *tp = tcp_sk(sk);
1055        struct tcp_md5sig_key *key;
1056        struct tcp_md5sig_info *md5sig;
1057
 1058        key = tcp_md5_do_lookup(sk, addr, family);
1059        if (!key)
1060                return -ENOENT;
1061        hlist_del_rcu(&key->node);
1062        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1063        kfree_rcu(key, rcu);
1064        md5sig = rcu_dereference_protected(tp->md5sig_info,
1065                                           sock_owned_by_user(sk));
1066        if (hlist_empty(&md5sig->head))
1067                tcp_free_md5sig_pool();
1068        return 0;
1069}
1070EXPORT_SYMBOL(tcp_md5_do_del);
1071
1072void tcp_clear_md5_list(struct sock *sk)
1073{
1074        struct tcp_sock *tp = tcp_sk(sk);
1075        struct tcp_md5sig_key *key;
1076        struct hlist_node *pos, *n;
1077        struct tcp_md5sig_info *md5sig;
1078
1079        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1080
1081        if (!hlist_empty(&md5sig->head))
1082                tcp_free_md5sig_pool();
1083        hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
1084                hlist_del_rcu(&key->node);
1085                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1086                kfree_rcu(key, rcu);
1087        }
1088}
1089
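     /* setsockopt(IPPROTO_TCP, TCP_MD5SIG) handler.  Userspace passes a
      * struct tcp_md5sig holding the peer address and key; a rough sketch of
      * typical usage (illustrative only, peer_addr is whatever address the
      * application talks to):
      *
      *     struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
      *     struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
      *
      *     a->sin_family = AF_INET;
      *     a->sin_addr.s_addr = peer_addr;
      *     memcpy(md5.tcpm_key, "secret", 6);
      *     setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
      *
      * A zero key length deletes the key for that address instead.
      */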
1090static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1091                                 int optlen)
1092{
1093        struct tcp_md5sig cmd;
1094        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1095
1096        if (optlen < sizeof(cmd))
1097                return -EINVAL;
1098
1099        if (copy_from_user(&cmd, optval, sizeof(cmd)))
1100                return -EFAULT;
1101
1102        if (sin->sin_family != AF_INET)
1103                return -EINVAL;
1104
1105        if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1106                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1107                                      AF_INET);
1108
1109        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1110                return -EINVAL;
1111
1112        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1113                              AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1114                              GFP_KERNEL);
1115}
1116
1117static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1118                                        __be32 daddr, __be32 saddr, int nbytes)
1119{
1120        struct tcp4_pseudohdr *bp;
1121        struct scatterlist sg;
1122
1123        bp = &hp->md5_blk.ip4;
1124
1125        /*
1126         * 1. the TCP pseudo-header (in the order: source IP address,
1127         * destination IP address, zero-padded protocol number, and
1128         * segment length)
1129         */
1130        bp->saddr = saddr;
1131        bp->daddr = daddr;
1132        bp->pad = 0;
1133        bp->protocol = IPPROTO_TCP;
1134        bp->len = cpu_to_be16(nbytes);
1135
1136        sg_init_one(&sg, bp, sizeof(*bp));
1137        return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1138}
1139
1140static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1141                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
1142{
1143        struct tcp_md5sig_pool *hp;
1144        struct hash_desc *desc;
1145
1146        hp = tcp_get_md5sig_pool();
1147        if (!hp)
1148                goto clear_hash_noput;
1149        desc = &hp->md5_desc;
1150
1151        if (crypto_hash_init(desc))
1152                goto clear_hash;
1153        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1154                goto clear_hash;
1155        if (tcp_md5_hash_header(hp, th))
1156                goto clear_hash;
1157        if (tcp_md5_hash_key(hp, key))
1158                goto clear_hash;
1159        if (crypto_hash_final(desc, md5_hash))
1160                goto clear_hash;
1161
1162        tcp_put_md5sig_pool();
1163        return 0;
1164
1165clear_hash:
1166        tcp_put_md5sig_pool();
1167clear_hash_noput:
1168        memset(md5_hash, 0, 16);
1169        return 1;
1170}
1171
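     /* Compute the RFC 2385 signature for a full segment.  The digest covers,
      * in order: the pseudo-header, the TCP header with its checksum zeroed and
      * options excluded, the segment payload, and finally the key itself.
      */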
1172int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1173                        const struct sock *sk, const struct request_sock *req,
1174                        const struct sk_buff *skb)
1175{
1176        struct tcp_md5sig_pool *hp;
1177        struct hash_desc *desc;
1178        const struct tcphdr *th = tcp_hdr(skb);
1179        __be32 saddr, daddr;
1180
1181        if (sk) {
1182                saddr = inet_sk(sk)->inet_saddr;
1183                daddr = inet_sk(sk)->inet_daddr;
1184        } else if (req) {
1185                saddr = inet_rsk(req)->loc_addr;
1186                daddr = inet_rsk(req)->rmt_addr;
1187        } else {
1188                const struct iphdr *iph = ip_hdr(skb);
1189                saddr = iph->saddr;
1190                daddr = iph->daddr;
1191        }
1192
1193        hp = tcp_get_md5sig_pool();
1194        if (!hp)
1195                goto clear_hash_noput;
1196        desc = &hp->md5_desc;
1197
1198        if (crypto_hash_init(desc))
1199                goto clear_hash;
1200
1201        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1202                goto clear_hash;
1203        if (tcp_md5_hash_header(hp, th))
1204                goto clear_hash;
1205        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1206                goto clear_hash;
1207        if (tcp_md5_hash_key(hp, key))
1208                goto clear_hash;
1209        if (crypto_hash_final(desc, md5_hash))
1210                goto clear_hash;
1211
1212        tcp_put_md5sig_pool();
1213        return 0;
1214
1215clear_hash:
1216        tcp_put_md5sig_pool();
1217clear_hash_noput:
1218        memset(md5_hash, 0, 16);
1219        return 1;
1220}
1221EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1222
1223static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1224{
1225        /*
1226         * This gets called for each TCP segment that arrives
1227         * so we want to be efficient.
1228         * We have 3 drop cases:
1229         * o No MD5 hash and one expected.
1230         * o MD5 hash and we're not expecting one.
 1231         * o MD5 hash and it's wrong.
1232         */
1233        const __u8 *hash_location = NULL;
1234        struct tcp_md5sig_key *hash_expected;
1235        const struct iphdr *iph = ip_hdr(skb);
1236        const struct tcphdr *th = tcp_hdr(skb);
1237        int genhash;
1238        unsigned char newhash[16];
1239
1240        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1241                                          AF_INET);
1242        hash_location = tcp_parse_md5sig_option(th);
1243
1244        /* We've parsed the options - do we have a hash? */
1245        if (!hash_expected && !hash_location)
1246                return false;
1247
1248        if (hash_expected && !hash_location) {
1249                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1250                return true;
1251        }
1252
1253        if (!hash_expected && hash_location) {
1254                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1255                return true;
1256        }
1257
1258        /* Okay, so this is hash_expected and hash_location -
1259         * so we need to calculate the checksum.
1260         */
1261        genhash = tcp_v4_md5_hash_skb(newhash,
1262                                      hash_expected,
1263                                      NULL, NULL, skb);
1264
1265        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1266                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1267                                     &iph->saddr, ntohs(th->source),
1268                                     &iph->daddr, ntohs(th->dest),
1269                                     genhash ? " tcp_v4_calc_md5_hash failed"
1270                                     : "");
1271                return true;
1272        }
1273        return false;
1274}
1275
1276#endif
1277
1278struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1279        .family         =       PF_INET,
1280        .obj_size       =       sizeof(struct tcp_request_sock),
1281        .rtx_syn_ack    =       tcp_v4_rtx_synack,
1282        .send_ack       =       tcp_v4_reqsk_send_ack,
1283        .destructor     =       tcp_v4_reqsk_destructor,
1284        .send_reset     =       tcp_v4_send_reset,
1285        .syn_ack_timeout =      tcp_syn_ack_timeout,
1286};
1287
1288#ifdef CONFIG_TCP_MD5SIG
1289static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1290        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1291        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1292};
1293#endif
1294
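     /* Decide whether this SYN may be accepted via TCP Fast Open.  Returns true
      * if the data in the SYN can be taken immediately, either because no cookie
      * is required by the sysctl settings or because the cookie in the SYN
      * matches the one we would generate for the source address; otherwise a
      * valid cookie may be handed back in valid_foc for the client to use on a
      * later attempt.
      */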
1295static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1296                               struct request_sock *req,
1297                               struct tcp_fastopen_cookie *foc,
1298                               struct tcp_fastopen_cookie *valid_foc)
1299{
1300        bool skip_cookie = false;
1301        struct fastopen_queue *fastopenq;
1302
1303        if (likely(!fastopen_cookie_present(foc))) {
1304                /* See include/net/tcp.h for the meaning of these knobs */
1305                if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1306                    ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1307                    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1308                        skip_cookie = true; /* no cookie to validate */
1309                else
1310                        return false;
1311        }
1312        fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1313        /* A FO option is present; bump the counter. */
1314        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1315
1316        /* Make sure the listener has enabled fastopen, and we don't
1317         * exceed the max # of pending TFO requests allowed before trying
 1318         * to validate the cookie, in order to avoid burning CPU cycles
1319         * unnecessarily.
1320         *
1321         * XXX (TFO) - The implication of checking the max_qlen before
1322         * processing a cookie request is that clients can't differentiate
1323         * between qlen overflow causing Fast Open to be disabled
1324         * temporarily vs a server not supporting Fast Open at all.
1325         */
1326        if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1327            fastopenq == NULL || fastopenq->max_qlen == 0)
1328                return false;
1329
1330        if (fastopenq->qlen >= fastopenq->max_qlen) {
1331                struct request_sock *req1;
1332                spin_lock(&fastopenq->lock);
1333                req1 = fastopenq->rskq_rst_head;
1334                if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1335                        spin_unlock(&fastopenq->lock);
1336                        NET_INC_STATS_BH(sock_net(sk),
1337                            LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1338                        /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1339                        foc->len = -1;
1340                        return false;
1341                }
1342                fastopenq->rskq_rst_head = req1->dl_next;
1343                fastopenq->qlen--;
1344                spin_unlock(&fastopenq->lock);
1345                reqsk_free(req1);
1346        }
1347        if (skip_cookie) {
1348                tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1349                return true;
1350        }
1351        if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1352                if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1353                        tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1354                        if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1355                            memcmp(&foc->val[0], &valid_foc->val[0],
1356                            TCP_FASTOPEN_COOKIE_SIZE) != 0)
1357                                return false;
1358                        valid_foc->len = -1;
1359                }
1360                /* Acknowledge the data received from the peer. */
1361                tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1362                return true;
1363        } else if (foc->len == 0) { /* Client requesting a cookie */
1364                tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1365                NET_INC_STATS_BH(sock_net(sk),
1366                    LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1367        } else {
1368                /* Client sent a cookie with wrong size. Treat it
1369                 * the same as invalid and return a valid one.
1370                 */
1371                tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1372        }
1373        return false;
1374}
1375
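     /* Complete a Fast Open passive open: create the child socket directly from
      * the SYN (before the three-way handshake finishes), transmit the prepared
      * SYN-ACK, queue any data carried in the SYN, and put the child straight
      * on the accept queue while the request keeps a reference to the listener.
      */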
1376static int tcp_v4_conn_req_fastopen(struct sock *sk,
1377                                    struct sk_buff *skb,
1378                                    struct sk_buff *skb_synack,
1379                                    struct request_sock *req,
1380                                    struct request_values *rvp)
1381{
1382        struct tcp_sock *tp = tcp_sk(sk);
1383        struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1384        const struct inet_request_sock *ireq = inet_rsk(req);
1385        struct sock *child;
1386        int err;
1387
1388        req->retrans = 0;
1389        req->sk = NULL;
1390
1391        child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1392        if (child == NULL) {
1393                NET_INC_STATS_BH(sock_net(sk),
1394                                 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1395                kfree_skb(skb_synack);
1396                return -1;
1397        }
1398        err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1399                                    ireq->rmt_addr, ireq->opt);
1400        err = net_xmit_eval(err);
1401        if (!err)
1402                tcp_rsk(req)->snt_synack = tcp_time_stamp;
1403        /* XXX (TFO) - is it ok to ignore error and continue? */
1404
1405        spin_lock(&queue->fastopenq->lock);
1406        queue->fastopenq->qlen++;
1407        spin_unlock(&queue->fastopenq->lock);
1408
1409        /* Initialize the child socket. We have to fix up some values to
1410         * account for the fact that the child is a Fast Open socket and is
1411         * created only from the bits carried in the SYN packet.
1412         */
1413        tp = tcp_sk(child);
1414
1415        tp->fastopen_rsk = req;
1416        /* Take a hold on the listener sk so that if the listener is being
1417         * closed, the child that has been accepted can live on and still
1418         * access listen_lock.
1419         */
1420        sock_hold(sk);
1421        tcp_rsk(req)->listener = sk;
1422
1423        /* RFC1323: The window in SYN & SYN/ACK segments is never
1424         * scaled. So correct it appropriately.
1425         */
1426        tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1427
1428        /* Activate the retrans timer so that SYNACK can be retransmitted.
1429         * The request socket is not added to the SYN table of the parent
1430         * because it's been added to the accept queue directly.
1431         */
1432        inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1433            TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1434
1435        /* Add the child socket directly into the accept queue */
1436        inet_csk_reqsk_queue_add(sk, req, child);
1437
1438        /* Now finish processing the fastopen child socket. */
1439        inet_csk(child)->icsk_af_ops->rebuild_header(child);
1440        tcp_init_congestion_control(child);
1441        tcp_mtup_init(child);
1442        tcp_init_buffer_space(child);
1443        tcp_init_metrics(child);
1444
1445        /* Queue the data carried in the SYN packet. We need to first
1446         * bump skb's refcnt because the caller will attempt to free it.
1447         *
1448         * XXX (TFO) - we honor a zero-payload TFO request for now.
1449         * (Any reason not to?)
1450         */
1451        if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1452                /* Don't queue the skb if there is no payload in SYN.
1453                 * XXX (TFO) - How about SYN+FIN?
1454                 */
1455                tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1456        } else {
1457                skb = skb_get(skb);
1458                skb_dst_drop(skb);
1459                __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1460                skb_set_owner_r(skb, child);
1461                __skb_queue_tail(&child->sk_receive_queue, skb);
1462                tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1463                tp->syn_data_acked = 1;
1464        }
1465        sk->sk_data_ready(sk, 0);
1466        bh_unlock_sock(child);
1467        sock_put(child);
1468        WARN_ON(req->sk == NULL);
1469        return 0;
1470}
1471
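/* Handle an incoming SYN on a listening socket: allocate a request sock,
 * parse the TCP options (including any Fast Open cookie), pick an initial
 * sequence number (or a syncookie under SYN flood), and either hash the
 * request into the SYN table or hand it to the Fast Open path above.
 */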
1472int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1473{
1474        struct tcp_extend_values tmp_ext;
1475        struct tcp_options_received tmp_opt;
1476        const u8 *hash_location;
1477        struct request_sock *req;
1478        struct inet_request_sock *ireq;
1479        struct tcp_sock *tp = tcp_sk(sk);
1480        struct dst_entry *dst = NULL;
1481        __be32 saddr = ip_hdr(skb)->saddr;
1482        __be32 daddr = ip_hdr(skb)->daddr;
1483        __u32 isn = TCP_SKB_CB(skb)->when;
1484        bool want_cookie = false;
1485        struct flowi4 fl4;
1486        struct tcp_fastopen_cookie foc = { .len = -1 };
1487        struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1488        struct sk_buff *skb_synack;
1489        int do_fastopen;
1490
1491        /* Never answer SYNs sent to broadcast or multicast addresses */
1492        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1493                goto drop;
1494
1495        /* TW buckets are converted to open requests without
1496         * limitation; they conserve resources and the peer is
1497         * evidently a real one.
1498         */
1499        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1500                want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1501                if (!want_cookie)
1502                        goto drop;
1503        }
1504
1505        /* Accept backlog is full. If we have already queued enough
1506         * warm entries in the SYN queue, drop the request. It is better than
1507         * clogging the SYN queue with open requests whose timeouts
1508         * increase exponentially.
1509         */
1510        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1511                goto drop;
1512
1513        req = inet_reqsk_alloc(&tcp_request_sock_ops);
1514        if (!req)
1515                goto drop;
1516
1517#ifdef CONFIG_TCP_MD5SIG
1518        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1519#endif
1520
1521        tcp_clear_options(&tmp_opt);
1522        tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1523        tmp_opt.user_mss  = tp->rx_opt.user_mss;
1524        tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
1525            want_cookie ? NULL : &foc);
1526
1527        if (tmp_opt.cookie_plus > 0 &&
1528            tmp_opt.saw_tstamp &&
1529            !tp->rx_opt.cookie_out_never &&
1530            (sysctl_tcp_cookie_size > 0 ||
1531             (tp->cookie_values != NULL &&
1532              tp->cookie_values->cookie_desired > 0))) {
1533                u8 *c;
1534                u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1535                int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1536
1537                if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1538                        goto drop_and_release;
1539
1540                /* Secret recipe starts with IP addresses */
1541                *mess++ ^= (__force u32)daddr;
1542                *mess++ ^= (__force u32)saddr;
1543
1544                /* plus variable length Initiator Cookie */
1545                c = (u8 *)mess;
1546                while (l-- > 0)
1547                        *c++ ^= *hash_location++;
1548
1549                want_cookie = false;    /* not our kind of cookie */
1550                tmp_ext.cookie_out_never = 0; /* false */
1551                tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1552        } else if (!tp->rx_opt.cookie_in_always) {
1553                /* redundant indications, but ensure initialization. */
1554                tmp_ext.cookie_out_never = 1; /* true */
1555                tmp_ext.cookie_plus = 0;
1556        } else {
1557                goto drop_and_release;
1558        }
1559        tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1560
1561        if (want_cookie && !tmp_opt.saw_tstamp)
1562                tcp_clear_options(&tmp_opt);
1563
1564        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1565        tcp_openreq_init(req, &tmp_opt, skb);
1566
1567        ireq = inet_rsk(req);
1568        ireq->loc_addr = daddr;
1569        ireq->rmt_addr = saddr;
1570        ireq->no_srccheck = inet_sk(sk)->transparent;
1571        ireq->opt = tcp_v4_save_options(skb);
1572
1573        if (security_inet_conn_request(sk, skb, req))
1574                goto drop_and_free;
1575
1576        if (!want_cookie || tmp_opt.tstamp_ok)
1577                TCP_ECN_create_request(req, skb);
1578
1579        if (want_cookie) {
1580                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1581                req->cookie_ts = tmp_opt.tstamp_ok;
1582        } else if (!isn) {
1583                /* VJ's idea. We save the last timestamp seen
1584                 * from the destination in the peer table when entering
1585                 * TIME-WAIT state, and check against it before
1586                 * accepting a new connection request.
1587                 *
1588                 * If "isn" is not zero, this request hit a live
1589                 * timewait bucket, so all the necessary checks
1590                 * are made in the function processing the timewait state.
1591                 */
1592                if (tmp_opt.saw_tstamp &&
1593                    tcp_death_row.sysctl_tw_recycle &&
1594                    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1595                    fl4.daddr == saddr) {
1596                        if (!tcp_peer_is_proven(req, dst, true)) {
1597                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1598                                goto drop_and_release;
1599                        }
1600                }
1601                /* Kill the following clause if you dislike this approach. */
1602                else if (!sysctl_tcp_syncookies &&
1603                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1604                          (sysctl_max_syn_backlog >> 2)) &&
1605                         !tcp_peer_is_proven(req, dst, false)) {
1606                        /* Without syncookies, the last quarter of the
1607                         * backlog is filled only with destinations
1608                         * proven to be alive.
1609                         * It means that we keep communicating with
1610                         * destinations that were already known
1611                         * before the synflood began.
1612                         */
1613                        LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1614                                       &saddr, ntohs(tcp_hdr(skb)->source));
1615                        goto drop_and_release;
1616                }
1617
1618                isn = tcp_v4_init_sequence(skb);
1619        }
1620        tcp_rsk(req)->snt_isn = isn;
1621
1622        if (dst == NULL) {
1623                dst = inet_csk_route_req(sk, &fl4, req);
1624                if (dst == NULL)
1625                        goto drop_and_free;
1626        }
1627        do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1628
1629        /* We don't call tcp_v4_send_synack() directly because we need
1630         * to make sure a child socket can be created successfully before
1631         * sending back synack!
1632         *
1633         * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1634         * (or better yet, call tcp_send_synack() in the child context
1635         * directly, but we will have to fix a bunch of other code first)
1636         * after syn_recv_sock() except one will need to first fix the
1637         * latter to remove its dependency on the current implementation
1638         * of tcp_v4_send_synack()->tcp_select_initial_window().
1639         */
1640        skb_synack = tcp_make_synack(sk, dst, req,
1641            (struct request_values *)&tmp_ext,
1642            fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1643
1644        if (skb_synack) {
1645                __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1646                skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1647        } else
1648                goto drop_and_free;
1649
1650        if (likely(!do_fastopen)) {
1651                int err;
1652                err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1653                     ireq->rmt_addr, ireq->opt);
1654                err = net_xmit_eval(err);
1655                if (err || want_cookie)
1656                        goto drop_and_free;
1657
1658                tcp_rsk(req)->snt_synack = tcp_time_stamp;
1659                tcp_rsk(req)->listener = NULL;
1660                /* Add the request_sock to the SYN table */
1661                inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1662                if (fastopen_cookie_present(&foc) && foc.len != 0)
1663                        NET_INC_STATS_BH(sock_net(sk),
1664                            LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1665        } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
1666            (struct request_values *)&tmp_ext))
1667                goto drop_and_free;
1668
1669        return 0;
1670
1671drop_and_release:
1672        dst_release(dst);
1673drop_and_free:
1674        reqsk_free(req);
1675drop:
1676        return 0;
1677}
1678EXPORT_SYMBOL(tcp_v4_conn_request);
1679
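/* For reference, a minimal userspace sketch of the server side served by the
 * Fast Open path above. This is only an illustration: it assumes Linux >= 3.7
 * uapi headers and that net.ipv4.tcp_fastopen has the server-side bit set;
 * port and queue length are arbitrary and error handling is omitted.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	int qlen = 16;		(maximum pending TFO requests, cf. max_qlen)
 *	struct sockaddr_in a = { .sin_family = AF_INET,
 *				 .sin_port = htons(8080),
 *				 .sin_addr = { htonl(INADDR_ANY) } };
 *
 *	bind(fd, (struct sockaddr *)&a, sizeof(a));
 *	setsockopt(fd, SOL_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));
 *	listen(fd, 128);
 *
 * accept() may then return a child that tcp_v4_conn_req_fastopen() created
 * before the handshake completed, with the SYN payload already queued on the
 * child's receive queue.
 */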
1680
1681/*
1682 * The three-way handshake has completed - we got a valid ACK -
1683 * now create the new socket.
1684 */
1685struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1686                                  struct request_sock *req,
1687                                  struct dst_entry *dst)
1688{
1689        struct inet_request_sock *ireq;
1690        struct inet_sock *newinet;
1691        struct tcp_sock *newtp;
1692        struct sock *newsk;
1693#ifdef CONFIG_TCP_MD5SIG
1694        struct tcp_md5sig_key *key;
1695#endif
1696        struct ip_options_rcu *inet_opt;
1697
1698        if (sk_acceptq_is_full(sk))
1699                goto exit_overflow;
1700
1701        newsk = tcp_create_openreq_child(sk, req, skb);
1702        if (!newsk)
1703                goto exit_nonewsk;
1704
1705        newsk->sk_gso_type = SKB_GSO_TCPV4;
1706        inet_sk_rx_dst_set(newsk, skb);
1707
1708        newtp                 = tcp_sk(newsk);
1709        newinet               = inet_sk(newsk);
1710        ireq                  = inet_rsk(req);
1711        newinet->inet_daddr   = ireq->rmt_addr;
1712        newinet->inet_rcv_saddr = ireq->loc_addr;
1713        newinet->inet_saddr           = ireq->loc_addr;
1714        inet_opt              = ireq->opt;
1715        rcu_assign_pointer(newinet->inet_opt, inet_opt);
1716        ireq->opt             = NULL;
1717        newinet->mc_index     = inet_iif(skb);
1718        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1719        newinet->rcv_tos      = ip_hdr(skb)->tos;
1720        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1721        if (inet_opt)
1722                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1723        newinet->inet_id = newtp->write_seq ^ jiffies;
1724
1725        if (!dst) {
1726                dst = inet_csk_route_child_sock(sk, newsk, req);
1727                if (!dst)
1728                        goto put_and_exit;
1729        } else {
1730                /* syncookie case: see end of cookie_v4_check() */
1731        }
1732        sk_setup_caps(newsk, dst);
1733
1734        tcp_mtup_init(newsk);
1735        tcp_sync_mss(newsk, dst_mtu(dst));
1736        newtp->advmss = dst_metric_advmss(dst);
1737        if (tcp_sk(sk)->rx_opt.user_mss &&
1738            tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1739                newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1740
1741        tcp_initialize_rcv_mss(newsk);
1742        tcp_synack_rtt_meas(newsk, req);
1743        newtp->total_retrans = req->retrans;
1744
1745#ifdef CONFIG_TCP_MD5SIG
1746        /* Copy over the MD5 key from the original socket */
1747        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1748                                AF_INET);
1749        if (key != NULL) {
1750                /*
1751                 * We're using one, so create a matching key
1752                 * on the newsk structure. If we fail to get
1753                 * memory, then we end up not copying the key
1754                 * across. Shucks.
1755                 */
1756                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1757                               AF_INET, key->key, key->keylen, GFP_ATOMIC);
1758                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1759        }
1760#endif
1761
1762        if (__inet_inherit_port(sk, newsk) < 0)
1763                goto put_and_exit;
1764        __inet_hash_nolisten(newsk, NULL);
1765
1766        return newsk;
1767
1768exit_overflow:
1769        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1770exit_nonewsk:
1771        dst_release(dst);
1772exit:
1773        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1774        return NULL;
1775put_and_exit:
1776        inet_csk_prepare_forced_close(newsk);
1777        tcp_done(newsk);
1778        goto exit;
1779}
1780EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1781
1782static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1783{
1784        struct tcphdr *th = tcp_hdr(skb);
1785        const struct iphdr *iph = ip_hdr(skb);
1786        struct sock *nsk;
1787        struct request_sock **prev;
1788        /* Find possible connection requests. */
1789        struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1790                                                       iph->saddr, iph->daddr);
1791        if (req)
1792                return tcp_check_req(sk, skb, req, prev, false);
1793
1794        nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1795                        th->source, iph->daddr, th->dest, inet_iif(skb));
1796
1797        if (nsk) {
1798                if (nsk->sk_state != TCP_TIME_WAIT) {
1799                        bh_lock_sock(nsk);
1800                        return nsk;
1801                }
1802                inet_twsk_put(inet_twsk(nsk));
1803                return NULL;
1804        }
1805
1806#ifdef CONFIG_SYN_COOKIES
1807        if (!th->syn)
1808                sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1809#endif
1810        return sk;
1811}
1812
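/* Validate or prime the TCP checksum on receive: accept a hardware-verified
 * CHECKSUM_COMPLETE, otherwise seed skb->csum with the pseudo-header sum.
 * Short packets (<= 76 bytes) are checksummed in full right here; larger
 * ones are completed later, e.g. while copying to user space.
 */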
1813static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1814{
1815        const struct iphdr *iph = ip_hdr(skb);
1816
1817        if (skb->ip_summed == CHECKSUM_COMPLETE) {
1818                if (!tcp_v4_check(skb->len, iph->saddr,
1819                                  iph->daddr, skb->csum)) {
1820                        skb->ip_summed = CHECKSUM_UNNECESSARY;
1821                        return 0;
1822                }
1823        }
1824
1825        skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1826                                       skb->len, IPPROTO_TCP, 0);
1827
1828        if (skb->len <= 76) {
1829                return __skb_checksum_complete(skb);
1830        }
1831        return 0;
1832}
1833
1834
1835/* The socket must have its spinlock held when we get
1836 * here.
1837 *
1838 * We have a potential double-lock case here, so even when
1839 * doing backlog processing we use the BH locking scheme.
1840 * This is because we cannot sleep with the original spinlock
1841 * held.
1842 */
1843int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1844{
1845        struct sock *rsk;
1846#ifdef CONFIG_TCP_MD5SIG
1847        /*
1848         * We really want to reject the packet as early as possible
1849         * if:
1850         *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1851         *  o There is an MD5 option and we're not expecting one
1852         */
1853        if (tcp_v4_inbound_md5_hash(sk, skb))
1854                goto discard;
1855#endif
1856
1857        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1858                struct dst_entry *dst = sk->sk_rx_dst;
1859
1860                sock_rps_save_rxhash(sk, skb);
1861                if (dst) {
1862                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1863                            dst->ops->check(dst, 0) == NULL) {
1864                                dst_release(dst);
1865                                sk->sk_rx_dst = NULL;
1866                        }
1867                }
1868                if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1869                        rsk = sk;
1870                        goto reset;
1871                }
1872                return 0;
1873        }
1874
1875        if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1876                goto csum_err;
1877
1878        if (sk->sk_state == TCP_LISTEN) {
1879                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1880                if (!nsk)
1881                        goto discard;
1882
1883                if (nsk != sk) {
1884                        sock_rps_save_rxhash(nsk, skb);
1885                        if (tcp_child_process(sk, nsk, skb)) {
1886                                rsk = nsk;
1887                                goto reset;
1888                        }
1889                        return 0;
1890                }
1891        } else
1892                sock_rps_save_rxhash(sk, skb);
1893
1894        if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1895                rsk = sk;
1896                goto reset;
1897        }
1898        return 0;
1899
1900reset:
1901        tcp_v4_send_reset(rsk, skb);
1902discard:
1903        kfree_skb(skb);
1904        /* Be careful here. If this function gets more complicated and
1905         * gcc suffers from register pressure on the x86, sk (in %ebx)
1906         * might be destroyed here. This current version compiles correctly,
1907         * but you have been warned.
1908         */
1909        return 0;
1910
1911csum_err:
1912        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1913        goto discard;
1914}
1915EXPORT_SYMBOL(tcp_v4_do_rcv);
1916
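/* Early demux: look the segment up in the established hash while it is still
 * on the IP receive path and attach the socket (and, if the cached rx dst
 * still matches the ingress device, its cached route) to the skb, so the
 * regular socket and route lookups can be skipped later.
 */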
1917void tcp_v4_early_demux(struct sk_buff *skb)
1918{
1919        struct net *net = dev_net(skb->dev);
1920        const struct iphdr *iph;
1921        const struct tcphdr *th;
1922        struct sock *sk;
1923
1924        if (skb->pkt_type != PACKET_HOST)
1925                return;
1926
1927        if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
1928                return;
1929
1930        iph = ip_hdr(skb);
1931        th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
1932
1933        if (th->doff < sizeof(struct tcphdr) / 4)
1934                return;
1935
1936        sk = __inet_lookup_established(net, &tcp_hashinfo,
1937                                       iph->saddr, th->source,
1938                                       iph->daddr, ntohs(th->dest),
1939                                       skb->skb_iif);
1940        if (sk) {
1941                skb->sk = sk;
1942                skb->destructor = sock_edemux;
1943                if (sk->sk_state != TCP_TIME_WAIT) {
1944                        struct dst_entry *dst = sk->sk_rx_dst;
1945
1946                        if (dst)
1947                                dst = dst_check(dst, 0);
1948                        if (dst &&
1949                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1950                                skb_dst_set_noref(skb, dst);
1951                }
1952        }
1953}
1954
1955/*
1956 *      From tcp_input.c
1957 */
1958
1959int tcp_v4_rcv(struct sk_buff *skb)
1960{
1961        const struct iphdr *iph;
1962        const struct tcphdr *th;
1963        struct sock *sk;
1964        int ret;
1965        struct net *net = dev_net(skb->dev);
1966
1967        if (skb->pkt_type != PACKET_HOST)
1968                goto discard_it;
1969
1970        /* Count it even if it's bad */
1971        TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1972
1973        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1974                goto discard_it;
1975
1976        th = tcp_hdr(skb);
1977
1978        if (th->doff < sizeof(struct tcphdr) / 4)
1979                goto bad_packet;
1980        if (!pskb_may_pull(skb, th->doff * 4))
1981                goto discard_it;
1982
1983        /* An explanation is required here, I think.
1984         * Packet length and doff are validated by header prediction,
1985         * provided the case of th->doff==0 is eliminated.
1986         * So, we defer the checks. */
1987        if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1988                goto bad_packet;
1989
1990        th = tcp_hdr(skb);
1991        iph = ip_hdr(skb);
1992        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1993        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1994                                    skb->len - th->doff * 4);
1995        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1996        TCP_SKB_CB(skb)->when    = 0;
1997        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1998        TCP_SKB_CB(skb)->sacked  = 0;
1999
2000        sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
2001        if (!sk)
2002                goto no_tcp_socket;
2003
2004process:
2005        if (sk->sk_state == TCP_TIME_WAIT)
2006                goto do_time_wait;
2007
2008        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2009                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
2010                goto discard_and_relse;
2011        }
2012
2013        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2014                goto discard_and_relse;
2015        nf_reset(skb);
2016
2017        if (sk_filter(sk, skb))
2018                goto discard_and_relse;
2019
2020        skb->dev = NULL;
2021
2022        bh_lock_sock_nested(sk);
2023        ret = 0;
2024        if (!sock_owned_by_user(sk)) {
2025#ifdef CONFIG_NET_DMA
2026                struct tcp_sock *tp = tcp_sk(sk);
2027                if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2028                        tp->ucopy.dma_chan = net_dma_find_channel();
2029                if (tp->ucopy.dma_chan)
2030                        ret = tcp_v4_do_rcv(sk, skb);
2031                else
2032#endif
2033                {
2034                        if (!tcp_prequeue(sk, skb))
2035                                ret = tcp_v4_do_rcv(sk, skb);
2036                }
2037        } else if (unlikely(sk_add_backlog(sk, skb,
2038                                           sk->sk_rcvbuf + sk->sk_sndbuf))) {
2039                bh_unlock_sock(sk);
2040                NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2041                goto discard_and_relse;
2042        }
2043        bh_unlock_sock(sk);
2044
2045        sock_put(sk);
2046
2047        return ret;
2048
2049no_tcp_socket:
2050        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2051                goto discard_it;
2052
2053        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2054bad_packet:
2055                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2056        } else {
2057                tcp_v4_send_reset(NULL, skb);
2058        }
2059
2060discard_it:
2061        /* Discard frame. */
2062        kfree_skb(skb);
2063        return 0;
2064
2065discard_and_relse:
2066        sock_put(sk);
2067        goto discard_it;
2068
2069do_time_wait:
2070        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2071                inet_twsk_put(inet_twsk(sk));
2072                goto discard_it;
2073        }
2074
2075        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2076                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2077                inet_twsk_put(inet_twsk(sk));
2078                goto discard_it;
2079        }
2080        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2081        case TCP_TW_SYN: {
2082                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2083                                                        &tcp_hashinfo,
2084                                                        iph->daddr, th->dest,
2085                                                        inet_iif(skb));
2086                if (sk2) {
2087                        inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
2088                        inet_twsk_put(inet_twsk(sk));
2089                        sk = sk2;
2090                        goto process;
2091                }
2092                /* Fall through to ACK */
2093        }
2094        case TCP_TW_ACK:
2095                tcp_v4_timewait_ack(sk, skb);
2096                break;
2097        case TCP_TW_RST:
2098                goto no_tcp_socket;
2099        case TCP_TW_SUCCESS:;
2100        }
2101        goto discard_it;
2102}
2103
2104static struct timewait_sock_ops tcp_timewait_sock_ops = {
2105        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2106        .twsk_unique    = tcp_twsk_unique,
2107        .twsk_destructor= tcp_twsk_destructor,
2108};
2109
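/* Cache the inbound route and ingress ifindex on the socket so the receive
 * fast path and early demux can reuse them without a fresh route lookup.
 */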
2110void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2111{
2112        struct dst_entry *dst = skb_dst(skb);
2113
2114        dst_hold(dst);
2115        sk->sk_rx_dst = dst;
2116        inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2117}
2118EXPORT_SYMBOL(inet_sk_rx_dst_set);
2119
2120const struct inet_connection_sock_af_ops ipv4_specific = {
2121        .queue_xmit        = ip_queue_xmit,
2122        .send_check        = tcp_v4_send_check,
2123        .rebuild_header    = inet_sk_rebuild_header,
2124        .sk_rx_dst_set     = inet_sk_rx_dst_set,
2125        .conn_request      = tcp_v4_conn_request,
2126        .syn_recv_sock     = tcp_v4_syn_recv_sock,
2127        .net_header_len    = sizeof(struct iphdr),
2128        .setsockopt        = ip_setsockopt,
2129        .getsockopt        = ip_getsockopt,
2130        .addr2sockaddr     = inet_csk_addr2sockaddr,
2131        .sockaddr_len      = sizeof(struct sockaddr_in),
2132        .bind_conflict     = inet_csk_bind_conflict,
2133#ifdef CONFIG_COMPAT
2134        .compat_setsockopt = compat_ip_setsockopt,
2135        .compat_getsockopt = compat_ip_getsockopt,
2136#endif
2137};
2138EXPORT_SYMBOL(ipv4_specific);
2139
2140#ifdef CONFIG_TCP_MD5SIG
2141static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2142        .md5_lookup             = tcp_v4_md5_lookup,
2143        .calc_md5_hash          = tcp_v4_md5_hash_skb,
2144        .md5_parse              = tcp_v4_parse_md5_keys,
2145};
2146#endif
2147
2148/* NOTE: A lot of things are set to zero explicitly by the call to
2149 *       sk_alloc(), so they need not be done here.
2150 */
2151static int tcp_v4_init_sock(struct sock *sk)
2152{
2153        struct inet_connection_sock *icsk = inet_csk(sk);
2154
2155        tcp_init_sock(sk);
2156
2157        icsk->icsk_af_ops = &ipv4_specific;
2158
2159#ifdef CONFIG_TCP_MD5SIG
2160        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2161#endif
2162
2163        return 0;
2164}
2165
2166void tcp_v4_destroy_sock(struct sock *sk)
2167{
2168        struct tcp_sock *tp = tcp_sk(sk);
2169
2170        tcp_clear_xmit_timers(sk);
2171
2172        tcp_cleanup_congestion_control(sk);
2173
2174        /* Clean up the write buffer. */
2175        tcp_write_queue_purge(sk);
2176
2177        /* Cleans up our, hopefully empty, out_of_order_queue. */
2178        __skb_queue_purge(&tp->out_of_order_queue);
2179
2180#ifdef CONFIG_TCP_MD5SIG
2181        /* Clean up the MD5 key list, if any */
2182        if (tp->md5sig_info) {
2183                tcp_clear_md5_list(sk);
2184                kfree_rcu(tp->md5sig_info, rcu);
2185                tp->md5sig_info = NULL;
2186        }
2187#endif
2188
2189#ifdef CONFIG_NET_DMA
2190        /* Cleans up our sk_async_wait_queue */
2191        __skb_queue_purge(&sk->sk_async_wait_queue);
2192#endif
2193
2194        /* Clean up the prequeue; it really must be empty. */
2195        __skb_queue_purge(&tp->ucopy.prequeue);
2196
2197        /* Clean up a referenced TCP bind bucket. */
2198        if (inet_csk(sk)->icsk_bind_hash)
2199                inet_put_port(sk);
2200
2201        /* TCP Cookie Transactions */
2202        if (tp->cookie_values != NULL) {
2203                kref_put(&tp->cookie_values->kref,
2204                         tcp_cookie_values_release);
2205                tp->cookie_values = NULL;
2206        }
2207        BUG_ON(tp->fastopen_rsk != NULL);
2208
2209        /* If socket is aborted during connect operation */
2210        tcp_free_fastopen_req(tp);
2211
2212        sk_sockets_allocated_dec(sk);
2213        sock_release_memcg(sk);
2214}
2215EXPORT_SYMBOL(tcp_v4_destroy_sock);
2216
2217#ifdef CONFIG_PROC_FS
2218/* Proc filesystem TCP sock list dumping. */
2219
2220static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
2221{
2222        return hlist_nulls_empty(head) ? NULL :
2223                list_entry(head->first, struct inet_timewait_sock, tw_node);
2224}
2225
2226static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2227{
2228        return !is_a_nulls(tw->tw_node.next) ?
2229                hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2230}
2231
2232/*
2233 * Get the next listener socket following cur.  If cur is NULL, get the first
2234 * socket starting from the bucket given in st->bucket; when st->bucket is zero the
2235 * very first socket in the hash table is returned.
2236 */
2237static void *listening_get_next(struct seq_file *seq, void *cur)
2238{
2239        struct inet_connection_sock *icsk;
2240        struct hlist_nulls_node *node;
2241        struct sock *sk = cur;
2242        struct inet_listen_hashbucket *ilb;
2243        struct tcp_iter_state *st = seq->private;
2244        struct net *net = seq_file_net(seq);
2245
2246        if (!sk) {
2247                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2248                spin_lock_bh(&ilb->lock);
2249                sk = sk_nulls_head(&ilb->head);
2250                st->offset = 0;
2251                goto get_sk;
2252        }
2253        ilb = &tcp_hashinfo.listening_hash[st->bucket];
2254        ++st->num;
2255        ++st->offset;
2256
2257        if (st->state == TCP_SEQ_STATE_OPENREQ) {
2258                struct request_sock *req = cur;
2259
2260                icsk = inet_csk(st->syn_wait_sk);
2261                req = req->dl_next;
2262                while (1) {
2263                        while (req) {
2264                                if (req->rsk_ops->family == st->family) {
2265                                        cur = req;
2266                                        goto out;
2267                                }
2268                                req = req->dl_next;
2269                        }
2270                        if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2271                                break;
2272get_req:
2273                        req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2274                }
2275                sk        = sk_nulls_next(st->syn_wait_sk);
2276                st->state = TCP_SEQ_STATE_LISTENING;
2277                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2278        } else {
2279                icsk = inet_csk(sk);
2280                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2281                if (reqsk_queue_len(&icsk->icsk_accept_queue))
2282                        goto start_req;
2283                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2284                sk = sk_nulls_next(sk);
2285        }
2286get_sk:
2287        sk_nulls_for_each_from(sk, node) {
2288                if (!net_eq(sock_net(sk), net))
2289                        continue;
2290                if (sk->sk_family == st->family) {
2291                        cur = sk;
2292                        goto out;
2293                }
2294                icsk = inet_csk(sk);
2295                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2296                if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2297start_req:
2298                        st->uid         = sock_i_uid(sk);
2299                        st->syn_wait_sk = sk;
2300                        st->state       = TCP_SEQ_STATE_OPENREQ;
2301                        st->sbucket     = 0;
2302                        goto get_req;
2303                }
2304                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2305        }
2306        spin_unlock_bh(&ilb->lock);
2307        st->offset = 0;
2308        if (++st->bucket < INET_LHTABLE_SIZE) {
2309                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2310                spin_lock_bh(&ilb->lock);
2311                sk = sk_nulls_head(&ilb->head);
2312                goto get_sk;
2313        }
2314        cur = NULL;
2315out:
2316        return cur;
2317}
2318
2319static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2320{
2321        struct tcp_iter_state *st = seq->private;
2322        void *rc;
2323
2324        st->bucket = 0;
2325        st->offset = 0;
2326        rc = listening_get_next(seq, NULL);
2327
2328        while (rc && *pos) {
2329                rc = listening_get_next(seq, rc);
2330                --*pos;
2331        }
2332        return rc;
2333}
2334
2335static inline bool empty_bucket(struct tcp_iter_state *st)
2336{
2337        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2338                hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2339}
2340
2341/*
2342 * Get first established socket starting from bucket given in st->bucket.
2343 * If st->bucket is zero, the very first socket in the hash is returned.
2344 */
2345static void *established_get_first(struct seq_file *seq)
2346{
2347        struct tcp_iter_state *st = seq->private;
2348        struct net *net = seq_file_net(seq);
2349        void *rc = NULL;
2350
2351        st->offset = 0;
2352        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2353                struct sock *sk;
2354                struct hlist_nulls_node *node;
2355                struct inet_timewait_sock *tw;
2356                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2357
2358                /* Lockless fast path for the common case of empty buckets */
2359                if (empty_bucket(st))
2360                        continue;
2361
2362                spin_lock_bh(lock);
2363                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2364                        if (sk->sk_family != st->family ||
2365                            !net_eq(sock_net(sk), net)) {
2366                                continue;
2367                        }
2368                        rc = sk;
2369                        goto out;
2370                }
2371                st->state = TCP_SEQ_STATE_TIME_WAIT;
2372                inet_twsk_for_each(tw, node,
2373                                   &tcp_hashinfo.ehash[st->bucket].twchain) {
2374                        if (tw->tw_family != st->family ||
2375                            !net_eq(twsk_net(tw), net)) {
2376                                continue;
2377                        }
2378                        rc = tw;
2379                        goto out;
2380                }
2381                spin_unlock_bh(lock);
2382                st->state = TCP_SEQ_STATE_ESTABLISHED;
2383        }
2384out:
2385        return rc;
2386}
2387
2388static void *established_get_next(struct seq_file *seq, void *cur)
2389{
2390        struct sock *sk = cur;
2391        struct inet_timewait_sock *tw;
2392        struct hlist_nulls_node *node;
2393        struct tcp_iter_state *st = seq->private;
2394        struct net *net = seq_file_net(seq);
2395
2396        ++st->num;
2397        ++st->offset;
2398
2399        if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2400                tw = cur;
2401                tw = tw_next(tw);
2402get_tw:
2403                while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2404                        tw = tw_next(tw);
2405                }
2406                if (tw) {
2407                        cur = tw;
2408                        goto out;
2409                }
2410                spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2411                st->state = TCP_SEQ_STATE_ESTABLISHED;
2412
2413                /* Look for the next non-empty bucket */
2414                st->offset = 0;
2415                while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2416                                empty_bucket(st))
2417                        ;
2418                if (st->bucket > tcp_hashinfo.ehash_mask)
2419                        return NULL;
2420
2421                spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2422                sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2423        } else
2424                sk = sk_nulls_next(sk);
2425
2426        sk_nulls_for_each_from(sk, node) {
2427                if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2428                        goto found;
2429        }
2430
2431        st->state = TCP_SEQ_STATE_TIME_WAIT;
2432        tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2433        goto get_tw;
2434found:
2435        cur = sk;
2436out:
2437        return cur;
2438}
2439
2440static void *established_get_idx(struct seq_file *seq, loff_t pos)
2441{
2442        struct tcp_iter_state *st = seq->private;
2443        void *rc;
2444
2445        st->bucket = 0;
2446        rc = established_get_first(seq);
2447
2448        while (rc && pos) {
2449                rc = established_get_next(seq, rc);
2450                --pos;
2451        }
2452        return rc;
2453}
2454
2455static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2456{
2457        void *rc;
2458        struct tcp_iter_state *st = seq->private;
2459
2460        st->state = TCP_SEQ_STATE_LISTENING;
2461        rc        = listening_get_idx(seq, &pos);
2462
2463        if (!rc) {
2464                st->state = TCP_SEQ_STATE_ESTABLISHED;
2465                rc        = established_get_idx(seq, pos);
2466        }
2467
2468        return rc;
2469}
2470
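/* Re-walk to the bucket/offset recorded by the previous read so that
 * sequential reads of the seq file resume where they left off instead of
 * rescanning the hash tables from the start.
 */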
2471static void *tcp_seek_last_pos(struct seq_file *seq)
2472{
2473        struct tcp_iter_state *st = seq->private;
2474        int offset = st->offset;
2475        int orig_num = st->num;
2476        void *rc = NULL;
2477
2478        switch (st->state) {
2479        case TCP_SEQ_STATE_OPENREQ:
2480        case TCP_SEQ_STATE_LISTENING:
2481                if (st->bucket >= INET_LHTABLE_SIZE)
2482                        break;
2483                st->state = TCP_SEQ_STATE_LISTENING;
2484                rc = listening_get_next(seq, NULL);
2485                while (offset-- && rc)
2486                        rc = listening_get_next(seq, rc);
2487                if (rc)
2488                        break;
2489                st->bucket = 0;
2490                /* Fallthrough */
2491        case TCP_SEQ_STATE_ESTABLISHED:
2492        case TCP_SEQ_STATE_TIME_WAIT:
2493                st->state = TCP_SEQ_STATE_ESTABLISHED;
2494                if (st->bucket > tcp_hashinfo.ehash_mask)
2495                        break;
2496                rc = established_get_first(seq);
2497                while (offset-- && rc)
2498                        rc = established_get_next(seq, rc);
2499        }
2500
2501        st->num = orig_num;
2502
2503        return rc;
2504}
2505
2506static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2507{
2508        struct tcp_iter_state *st = seq->private;
2509        void *rc;
2510
2511        if (*pos && *pos == st->last_pos) {
2512                rc = tcp_seek_last_pos(seq);
2513                if (rc)
2514                        goto out;
2515        }
2516
2517        st->state = TCP_SEQ_STATE_LISTENING;
2518        st->num = 0;
2519        st->bucket = 0;
2520        st->offset = 0;
2521        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2522
2523out:
2524        st->last_pos = *pos;
2525        return rc;
2526}
2527
2528static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2529{
2530        struct tcp_iter_state *st = seq->private;
2531        void *rc = NULL;
2532
2533        if (v == SEQ_START_TOKEN) {
2534                rc = tcp_get_idx(seq, 0);
2535                goto out;
2536        }
2537
2538        switch (st->state) {
2539        case TCP_SEQ_STATE_OPENREQ:
2540        case TCP_SEQ_STATE_LISTENING:
2541                rc = listening_get_next(seq, v);
2542                if (!rc) {
2543                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2544                        st->bucket = 0;
2545                        st->offset = 0;
2546                        rc        = established_get_first(seq);
2547                }
2548                break;
2549        case TCP_SEQ_STATE_ESTABLISHED:
2550        case TCP_SEQ_STATE_TIME_WAIT:
2551                rc = established_get_next(seq, v);
2552                break;
2553        }
2554out:
2555        ++*pos;
2556        st->last_pos = *pos;
2557        return rc;
2558}
2559
2560static void tcp_seq_stop(struct seq_file *seq, void *v)
2561{
2562        struct tcp_iter_state *st = seq->private;
2563
2564        switch (st->state) {
2565        case TCP_SEQ_STATE_OPENREQ:
2566                if (v) {
2567                        struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2568                        read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2569                }
2570        case TCP_SEQ_STATE_LISTENING:
2571                if (v != SEQ_START_TOKEN)
2572                        spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2573                break;
2574        case TCP_SEQ_STATE_TIME_WAIT:
2575        case TCP_SEQ_STATE_ESTABLISHED:
2576                if (v)
2577                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2578                break;
2579        }
2580}
2581
2582int tcp_seq_open(struct inode *inode, struct file *file)
2583{
2584        struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2585        struct tcp_iter_state *s;
2586        int err;
2587
2588        err = seq_open_net(inode, file, &afinfo->seq_ops,
2589                          sizeof(struct tcp_iter_state));
2590        if (err < 0)
2591                return err;
2592
2593        s = ((struct seq_file *)file->private_data)->private;
2594        s->family               = afinfo->family;
2595        s->last_pos             = 0;
2596        return 0;
2597}
2598EXPORT_SYMBOL(tcp_seq_open);
2599
2600int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2601{
2602        int rc = 0;
2603        struct proc_dir_entry *p;
2604
2605        afinfo->seq_ops.start           = tcp_seq_start;
2606        afinfo->seq_ops.next            = tcp_seq_next;
2607        afinfo->seq_ops.stop            = tcp_seq_stop;
2608
2609        p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2610                             afinfo->seq_fops, afinfo);
2611        if (!p)
2612                rc = -ENOMEM;
2613        return rc;
2614}
2615EXPORT_SYMBOL(tcp_proc_register);
2616
2617void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2618{
2619        proc_net_remove(net, afinfo->name);
2620}
2621EXPORT_SYMBOL(tcp_proc_unregister);
2622
2623static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2624                         struct seq_file *f, int i, kuid_t uid, int *len)
2625{
2626        const struct inet_request_sock *ireq = inet_rsk(req);
2627        long delta = req->expires - jiffies;
2628
2629        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2630                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2631                i,
2632                ireq->loc_addr,
2633                ntohs(inet_sk(sk)->inet_sport),
2634                ireq->rmt_addr,
2635                ntohs(ireq->rmt_port),
2636                TCP_SYN_RECV,
2637                0, 0, /* could print option size, but that is af dependent. */
2638                1,    /* timers active (only the expire timer) */
2639                jiffies_delta_to_clock_t(delta),
2640                req->retrans,
2641                from_kuid_munged(seq_user_ns(f), uid),
2642                0,  /* non standard timer */
2643                0, /* open_requests have no inode */
2644                atomic_read(&sk->sk_refcnt),
2645                req,
2646                len);
2647}
2648
2649static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2650{
2651        int timer_active;
2652        unsigned long timer_expires;
2653        const struct tcp_sock *tp = tcp_sk(sk);
2654        const struct inet_connection_sock *icsk = inet_csk(sk);
2655        const struct inet_sock *inet = inet_sk(sk);
2656        struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2657        __be32 dest = inet->inet_daddr;
2658        __be32 src = inet->inet_rcv_saddr;
2659        __u16 destp = ntohs(inet->inet_dport);
2660        __u16 srcp = ntohs(inet->inet_sport);
2661        int rx_queue;
2662
2663        if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2664                timer_active    = 1;
2665                timer_expires   = icsk->icsk_timeout;
2666        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2667                timer_active    = 4;
2668                timer_expires   = icsk->icsk_timeout;
2669        } else if (timer_pending(&sk->sk_timer)) {
2670                timer_active    = 2;
2671                timer_expires   = sk->sk_timer.expires;
2672        } else {
2673                timer_active    = 0;
2674                timer_expires = jiffies;
2675        }
2676
2677        if (sk->sk_state == TCP_LISTEN)
2678                rx_queue = sk->sk_ack_backlog;
2679        else
2680                /*
2681                 * because we don't lock the socket, we might find a transient negative value
2682                 */
2683                rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2684
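        /* One /proc/net/tcp row; the fields match the header printed by
         * tcp4_seq_show(): sl, local/remote address:port, state, tx/rx
         * queues, timer state and expiry, retransmits, uid, probes, inode,
         * refcount, socket pointer and a few TCP details (rto, ato, cwnd).
         */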
2685        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2686                        "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2687                i, src, srcp, dest, destp, sk->sk_state,
2688                tp->write_seq - tp->snd_una,
2689                rx_queue,
2690                timer_active,
2691                jiffies_delta_to_clock_t(timer_expires - jiffies),
2692                icsk->icsk_retransmits,
2693                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2694                icsk->icsk_probes_out,
2695                sock_i_ino(sk),
2696                atomic_read(&sk->sk_refcnt), sk,
2697                jiffies_to_clock_t(icsk->icsk_rto),
2698                jiffies_to_clock_t(icsk->icsk_ack.ato),
2699                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2700                tp->snd_cwnd,
2701                sk->sk_state == TCP_LISTEN ?
2702                    (fastopenq ? fastopenq->max_qlen : 0) :
2703                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2704                len);
2705}
2706
2707static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2708                               struct seq_file *f, int i, int *len)
2709{
2710        __be32 dest, src;
2711        __u16 destp, srcp;
2712        long delta = tw->tw_ttd - jiffies;
2713
2714        dest  = tw->tw_daddr;
2715        src   = tw->tw_rcv_saddr;
2716        destp = ntohs(tw->tw_dport);
2717        srcp  = ntohs(tw->tw_sport);
2718
2719        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2720                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2721                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2722                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2723                atomic_read(&tw->tw_refcnt), tw, len);
2724}
2725
2726#define TMPSZ 150
2727
2728static int tcp4_seq_show(struct seq_file *seq, void *v)
2729{
2730        struct tcp_iter_state *st;
2731        int len;
2732
2733        if (v == SEQ_START_TOKEN) {
2734                seq_printf(seq, "%-*s\n", TMPSZ - 1,
2735                           "  sl  local_address rem_address   st tx_queue "
2736                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2737                           "inode");
2738                goto out;
2739        }
2740        st = seq->private;
2741
2742        switch (st->state) {
2743        case TCP_SEQ_STATE_LISTENING:
2744        case TCP_SEQ_STATE_ESTABLISHED:
2745                get_tcp4_sock(v, seq, st->num, &len);
2746                break;
2747        case TCP_SEQ_STATE_OPENREQ:
2748                get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2749                break;
2750        case TCP_SEQ_STATE_TIME_WAIT:
2751                get_timewait4_sock(v, seq, st->num, &len);
2752                break;
2753        }
2754        seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2755out:
2756        return 0;
2757}
2758
2759static const struct file_operations tcp_afinfo_seq_fops = {
2760        .owner   = THIS_MODULE,
2761        .open    = tcp_seq_open,
2762        .read    = seq_read,
2763        .llseek  = seq_lseek,
2764        .release = seq_release_net
2765};
2766
2767static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2768        .name           = "tcp",
2769        .family         = AF_INET,
2770        .seq_fops       = &tcp_afinfo_seq_fops,
2771        .seq_ops        = {
2772                .show           = tcp4_seq_show,
2773        },
2774};
2775
2776static int __net_init tcp4_proc_init_net(struct net *net)
2777{
2778        return tcp_proc_register(net, &tcp4_seq_afinfo);
2779}
2780
2781static void __net_exit tcp4_proc_exit_net(struct net *net)
2782{
2783        tcp_proc_unregister(net, &tcp4_seq_afinfo);
2784}
2785
2786static struct pernet_operations tcp4_net_ops = {
2787        .init = tcp4_proc_init_net,
2788        .exit = tcp4_proc_exit_net,
2789};
2790
2791int __init tcp4_proc_init(void)
2792{
2793        return register_pernet_subsys(&tcp4_net_ops);
2794}
2795
2796void tcp4_proc_exit(void)
2797{
2798        unregister_pernet_subsys(&tcp4_net_ops);
2799}
2800#endif /* CONFIG_PROC_FS */
2801
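/* GRO receive hook for TCP over IPv4: verify (or compute) the checksum of
 * the candidate segment first, flushing it on failure, before handing it to
 * the generic tcp_gro_receive() for coalescing.
 */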
2802struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2803{
2804        const struct iphdr *iph = skb_gro_network_header(skb);
2805        __wsum wsum;
2806        __sum16 sum;
2807
2808        switch (skb->ip_summed) {
2809        case CHECKSUM_COMPLETE:
2810                if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2811                                  skb->csum)) {
2812                        skb->ip_summed = CHECKSUM_UNNECESSARY;
2813                        break;
2814                }
2815flush:
2816                NAPI_GRO_CB(skb)->flush = 1;
2817                return NULL;
2818
2819        case CHECKSUM_NONE:
2820                wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
2821                                          skb_gro_len(skb), IPPROTO_TCP, 0);
2822                sum = csum_fold(skb_checksum(skb,
2823                                             skb_gro_offset(skb),
2824                                             skb_gro_len(skb),
2825                                             wsum));
2826                if (sum)
2827                        goto flush;
2828
2829                skb->ip_summed = CHECKSUM_UNNECESSARY;
2830                break;
2831        }
2832
2833        return tcp_gro_receive(head, skb);
2834}
2835
2836int tcp4_gro_complete(struct sk_buff *skb)
2837{
2838        const struct iphdr *iph = ip_hdr(skb);
2839        struct tcphdr *th = tcp_hdr(skb);
2840
2841        th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2842                                  iph->saddr, iph->daddr, 0);
2843        skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2844
2845        return tcp_gro_complete(skb);
2846}
2847
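/* The AF_INET TCP protocol descriptor: wires the generic socket layer to the
 * TCP entry points implemented in this file and in tcp.c.
 */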
2848struct proto tcp_prot = {
2849        .name                   = "TCP",
2850        .owner                  = THIS_MODULE,
2851        .close                  = tcp_close,
2852        .connect                = tcp_v4_connect,
2853        .disconnect             = tcp_disconnect,
2854        .accept                 = inet_csk_accept,
2855        .ioctl                  = tcp_ioctl,
2856        .init                   = tcp_v4_init_sock,
2857        .destroy                = tcp_v4_destroy_sock,
2858        .shutdown               = tcp_shutdown,
2859        .setsockopt             = tcp_setsockopt,
2860        .getsockopt             = tcp_getsockopt,
2861        .recvmsg                = tcp_recvmsg,
2862        .sendmsg                = tcp_sendmsg,
2863        .sendpage               = tcp_sendpage,
2864        .backlog_rcv            = tcp_v4_do_rcv,
2865        .release_cb             = tcp_release_cb,
2866        .mtu_reduced            = tcp_v4_mtu_reduced,
2867        .hash                   = inet_hash,
2868        .unhash                 = inet_unhash,
2869        .get_port               = inet_csk_get_port,
2870        .enter_memory_pressure  = tcp_enter_memory_pressure,
2871        .sockets_allocated      = &tcp_sockets_allocated,
2872        .orphan_count           = &tcp_orphan_count,
2873        .memory_allocated       = &tcp_memory_allocated,
2874        .memory_pressure        = &tcp_memory_pressure,
2875        .sysctl_wmem            = sysctl_tcp_wmem,
2876        .sysctl_rmem            = sysctl_tcp_rmem,
2877        .max_header             = MAX_TCP_HEADER,
2878        .obj_size               = sizeof(struct tcp_sock),
2879        .slab_flags             = SLAB_DESTROY_BY_RCU,
2880        .twsk_prot              = &tcp_timewait_sock_ops,
2881        .rsk_prot               = &tcp_request_sock_ops,
2882        .h.hashinfo             = &tcp_hashinfo,
2883        .no_autobind            = true,
2884#ifdef CONFIG_COMPAT
2885        .compat_setsockopt      = compat_tcp_setsockopt,
2886        .compat_getsockopt      = compat_tcp_getsockopt,
2887#endif
2888#ifdef CONFIG_MEMCG_KMEM
2889        .init_cgroup            = tcp_init_cgroup,
2890        .destroy_cgroup         = tcp_destroy_cgroup,
2891        .proto_cgroup           = tcp_proto_cgroup,
2892#endif
2893};
2894EXPORT_SYMBOL(tcp_prot);
2895
2896static int __net_init tcp_sk_init(struct net *net)
2897{
2898        return 0;
2899}
2900
2901static void __net_exit tcp_sk_exit(struct net *net)
2902{
2903}
2904
2905static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2906{
2907        inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2908}
2909
2910static struct pernet_operations __net_initdata tcp_sk_ops = {
2911       .init       = tcp_sk_init,
2912       .exit       = tcp_sk_exit,
2913       .exit_batch = tcp_sk_exit_batch,
2914};
2915
2916void __init tcp_v4_init(void)
2917{
2918        inet_hashinfo_init(&tcp_hashinfo);
2919        if (register_pernet_subsys(&tcp_sk_ops))
2920                panic("Failed to create the TCP control socket.\n");
2921}
2922