linux/net/ipv4/tcp_ipv4.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 *              IPv4 specific functions
   9 *
  10 *
  11 *              code split from:
  12 *              linux/ipv4/tcp.c
  13 *              linux/ipv4/tcp_input.c
  14 *              linux/ipv4/tcp_output.c
  15 *
  16 *              See tcp.c for author information
  17 *
  18 *      This program is free software; you can redistribute it and/or
  19 *      modify it under the terms of the GNU General Public License
  20 *      as published by the Free Software Foundation; either version
  21 *      2 of the License, or (at your option) any later version.
  22 */
  23
  24/*
  25 * Changes:
  26 *              David S. Miller :       New socket lookup architecture.
  27 *                                      This code is dedicated to John Dyson.
  28 *              David S. Miller :       Change semantics of established hash,
  29 *                                      half is devoted to TIME_WAIT sockets
  30 *                                      and the rest go in the other half.
  31 *              Andi Kleen :            Add support for syncookies and fixed
  32 *                                      some bugs: ip options weren't passed to
  33 *                                      the TCP layer, missed a check for an
  34 *                                      ACK bit.
  35 *              Andi Kleen :            Implemented fast path mtu discovery.
  36 *                                      Fixed many serious bugs in the
  37 *                                      request_sock handling and moved
  38 *                                      most of it into the af independent code.
  39 *                                      Added tail drop and some other bugfixes.
  40 *                                      Added new listen semantics.
  41 *              Mike McLagan    :       Routing by source
  42 *      Juan Jose Ciarlante:            ip_dynaddr bits
  43 *              Andi Kleen:             various fixes.
  44 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45 *                                      coma.
  46 *      Andi Kleen              :       Fix new listen.
  47 *      Andi Kleen              :       Fix accept error reporting.
  48 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50 *                                      a single port at the same time.
  51 */
  52
  53#define pr_fmt(fmt) "TCP: " fmt
  54
  55#include <linux/bottom_half.h>
  56#include <linux/types.h>
  57#include <linux/fcntl.h>
  58#include <linux/module.h>
  59#include <linux/random.h>
  60#include <linux/cache.h>
  61#include <linux/jhash.h>
  62#include <linux/init.h>
  63#include <linux/times.h>
  64#include <linux/slab.h>
  65
  66#include <net/net_namespace.h>
  67#include <net/icmp.h>
  68#include <net/inet_hashtables.h>
  69#include <net/tcp.h>
  70#include <net/transp_v6.h>
  71#include <net/ipv6.h>
  72#include <net/inet_common.h>
  73#include <net/timewait_sock.h>
  74#include <net/xfrm.h>
  75#include <net/netdma.h>
  76#include <net/secure_seq.h>
  77#include <net/tcp_memcontrol.h>
  78
  79#include <linux/inet.h>
  80#include <linux/ipv6.h>
  81#include <linux/stddef.h>
  82#include <linux/proc_fs.h>
  83#include <linux/seq_file.h>
  84
  85#include <linux/crypto.h>
  86#include <linux/scatterlist.h>
  87
  88int sysctl_tcp_tw_reuse __read_mostly;
  89int sysctl_tcp_low_latency __read_mostly;
  90EXPORT_SYMBOL(sysctl_tcp_low_latency);
  91
  92
  93#ifdef CONFIG_TCP_MD5SIG
  94static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  95                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
  96#endif
  97
  98struct inet_hashinfo tcp_hashinfo;
  99EXPORT_SYMBOL(tcp_hashinfo);
 100
 101static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
 102{
 103        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 104                                          ip_hdr(skb)->saddr,
 105                                          tcp_hdr(skb)->dest,
 106                                          tcp_hdr(skb)->source);
 107}
 108
 109int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 110{
 111        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 112        struct tcp_sock *tp = tcp_sk(sk);
 113
 114        /* With PAWS, it is safe from the viewpoint
 115           of data integrity. Even without PAWS it is safe provided sequence
 116           spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 117
 118           Actually, the idea is close to VJ's one, only timestamp cache is
 119           held not per host, but per port pair and TW bucket is used as state
 120           holder.
 121
 122           If TW bucket has been already destroyed we fall back to VJ's scheme
 123           and use initial timestamp retrieved from peer table.
 124         */
 125        if (tcptw->tw_ts_recent_stamp &&
 126            (twp == NULL || (sysctl_tcp_tw_reuse &&
 127                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 128                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 129                if (tp->write_seq == 0)
 130                        tp->write_seq = 1;
 131                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 132                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 133                sock_hold(sktw);
 134                return 1;
 135        }
 136
 137        return 0;
 138}
 139EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 140
 141static int tcp_repair_connect(struct sock *sk)
 142{
 143        tcp_connect_init(sk);
 144        tcp_finish_connect(sk, NULL);
 145
 146        return 0;
 147}
 148
 149/* This will initiate an outgoing connection. */
 150int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 151{
 152        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 153        struct inet_sock *inet = inet_sk(sk);
 154        struct tcp_sock *tp = tcp_sk(sk);
 155        __be16 orig_sport, orig_dport;
 156        __be32 daddr, nexthop;
 157        struct flowi4 *fl4;
 158        struct rtable *rt;
 159        int err;
 160        struct ip_options_rcu *inet_opt;
 161
 162        if (addr_len < sizeof(struct sockaddr_in))
 163                return -EINVAL;
 164
 165        if (usin->sin_family != AF_INET)
 166                return -EAFNOSUPPORT;
 167
 168        nexthop = daddr = usin->sin_addr.s_addr;
 169        inet_opt = rcu_dereference_protected(inet->inet_opt,
 170                                             sock_owned_by_user(sk));
 171        if (inet_opt && inet_opt->opt.srr) {
 172                if (!daddr)
 173                        return -EINVAL;
 174                nexthop = inet_opt->opt.faddr;
 175        }
 176
 177        orig_sport = inet->inet_sport;
 178        orig_dport = usin->sin_port;
 179        fl4 = &inet->cork.fl.u.ip4;
 180        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 181                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 182                              IPPROTO_TCP,
 183                              orig_sport, orig_dport, sk, true);
 184        if (IS_ERR(rt)) {
 185                err = PTR_ERR(rt);
 186                if (err == -ENETUNREACH)
 187                        IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 188                return err;
 189        }
 190
 191        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 192                ip_rt_put(rt);
 193                return -ENETUNREACH;
 194        }
 195
 196        if (!inet_opt || !inet_opt->opt.srr)
 197                daddr = fl4->daddr;
 198
 199        if (!inet->inet_saddr)
 200                inet->inet_saddr = fl4->saddr;
 201        inet->inet_rcv_saddr = inet->inet_saddr;
 202
 203        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 204                /* Reset inherited state */
 205                tp->rx_opt.ts_recent       = 0;
 206                tp->rx_opt.ts_recent_stamp = 0;
 207                if (likely(!tp->repair))
 208                        tp->write_seq      = 0;
 209        }
 210
 211        if (tcp_death_row.sysctl_tw_recycle &&
 212            !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
 213                tcp_fetch_timewait_stamp(sk, &rt->dst);
 214
 215        inet->inet_dport = usin->sin_port;
 216        inet->inet_daddr = daddr;
 217
 218        inet_csk(sk)->icsk_ext_hdr_len = 0;
 219        if (inet_opt)
 220                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 221
 222        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 223
 224        /* Socket identity is still unknown (sport may be zero).
 225         * However we set state to SYN-SENT and not releasing socket
 226         * lock select source port, enter ourselves into the hash tables and
 227         * complete initialization after this.
 228         */
 229        tcp_set_state(sk, TCP_SYN_SENT);
 230        err = inet_hash_connect(&tcp_death_row, sk);
 231        if (err)
 232                goto failure;
 233
 234        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 235                               inet->inet_sport, inet->inet_dport, sk);
 236        if (IS_ERR(rt)) {
 237                err = PTR_ERR(rt);
 238                rt = NULL;
 239                goto failure;
 240        }
 241        /* OK, now commit destination to socket.  */
 242        sk->sk_gso_type = SKB_GSO_TCPV4;
 243        sk_setup_caps(sk, &rt->dst);
 244
 245        if (!tp->write_seq && likely(!tp->repair))
 246                tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 247                                                           inet->inet_daddr,
 248                                                           inet->inet_sport,
 249                                                           usin->sin_port);
 250
 251        inet->inet_id = tp->write_seq ^ jiffies;
 252
 253        if (likely(!tp->repair))
 254                err = tcp_connect(sk);
 255        else
 256                err = tcp_repair_connect(sk);
 257
 258        rt = NULL;
 259        if (err)
 260                goto failure;
 261
 262        return 0;
 263
 264failure:
 265        /*
 266         * This unhashes the socket and releases the local port,
 267         * if necessary.
 268         */
 269        tcp_set_state(sk, TCP_CLOSE);
 270        ip_rt_put(rt);
 271        sk->sk_route_caps = 0;
 272        inet->inet_dport = 0;
 273        return err;
 274}
 275EXPORT_SYMBOL(tcp_v4_connect);
 276
 277/*
 278 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 279 * It can be called through tcp_release_cb() if socket was owned by user
 280 * at the time tcp_v4_err() was called to handle ICMP message.
 281 */
 282static void tcp_v4_mtu_reduced(struct sock *sk)
 283{
 284        struct dst_entry *dst;
 285        struct inet_sock *inet = inet_sk(sk);
 286        u32 mtu = tcp_sk(sk)->mtu_info;
 287
 288        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 289         * send out by Linux are always <576bytes so they should go through
 290         * unfragmented).
 291         */
 292        if (sk->sk_state == TCP_LISTEN)
 293                return;
 294
 295        dst = inet_csk_update_pmtu(sk, mtu);
 296        if (!dst)
 297                return;
 298
 299        /* Something is about to be wrong... Remember soft error
 300         * for the case, if this connection will not able to recover.
 301         */
 302        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 303                sk->sk_err_soft = EMSGSIZE;
 304
 305        mtu = dst_mtu(dst);
 306
 307        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 308            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 309                tcp_sync_mss(sk, mtu);
 310
 311                /* Resend the TCP packet because it's
 312                 * clear that the old packet has been
 313                 * dropped. This is the new "fast" path mtu
 314                 * discovery.
 315                 */
 316                tcp_simple_retransmit(sk);
 317        } /* else let the usual retransmit timer handle it */
 318}
 319
 320static void do_redirect(struct sk_buff *skb, struct sock *sk)
 321{
 322        struct dst_entry *dst = __sk_dst_check(sk, 0);
 323
 324        if (dst)
 325                dst->ops->redirect(dst, sk, skb);
 326}
 327
 328/*
 329 * This routine is called by the ICMP module when it gets some
 330 * sort of error condition.  If err < 0 then the socket should
 331 * be closed and the error returned to the user.  If err > 0
 332 * it's just the icmp type << 8 | icmp code.  After adjustment
 333 * header points to the first 8 bytes of the tcp header.  We need
 334 * to find the appropriate port.
 335 *
 336 * The locking strategy used here is very "optimistic". When
 337 * someone else accesses the socket the ICMP is just dropped
 338 * and for some paths there is no check at all.
 339 * A more general error queue to queue errors for later handling
 340 * is probably better.
 341 *
 342 */
 343
 344void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 345{
 346        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 347        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 348        struct inet_connection_sock *icsk;
 349        struct tcp_sock *tp;
 350        struct inet_sock *inet;
 351        const int type = icmp_hdr(icmp_skb)->type;
 352        const int code = icmp_hdr(icmp_skb)->code;
 353        struct sock *sk;
 354        struct sk_buff *skb;
 355        struct request_sock *req;
 356        __u32 seq;
 357        __u32 remaining;
 358        int err;
 359        struct net *net = dev_net(icmp_skb->dev);
 360
 361        if (icmp_skb->len < (iph->ihl << 2) + 8) {
 362                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 363                return;
 364        }
 365
 366        sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 367                        iph->saddr, th->source, inet_iif(icmp_skb));
 368        if (!sk) {
 369                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 370                return;
 371        }
 372        if (sk->sk_state == TCP_TIME_WAIT) {
 373                inet_twsk_put(inet_twsk(sk));
 374                return;
 375        }
 376
 377        bh_lock_sock(sk);
 378        /* If too many ICMPs get dropped on busy
 379         * servers this needs to be solved differently.
 380         * We do take care of PMTU discovery (RFC1191) special case :
 381         * we can receive locally generated ICMP messages while socket is held.
 382         */
 383        if (sock_owned_by_user(sk) &&
 384            type != ICMP_DEST_UNREACH &&
 385            code != ICMP_FRAG_NEEDED)
 386                NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 387
 388        if (sk->sk_state == TCP_CLOSE)
 389                goto out;
 390
 391        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 392                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 393                goto out;
 394        }
 395
 396        icsk = inet_csk(sk);
 397        tp = tcp_sk(sk);
 398        req = tp->fastopen_rsk;
 399        seq = ntohl(th->seq);
 400        if (sk->sk_state != TCP_LISTEN &&
 401            !between(seq, tp->snd_una, tp->snd_nxt) &&
 402            (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
 403                /* For a Fast Open socket, allow seq to be snt_isn. */
 404                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 405                goto out;
 406        }
 407
 408        switch (type) {
 409        case ICMP_REDIRECT:
 410                do_redirect(icmp_skb, sk);
 411                goto out;
 412        case ICMP_SOURCE_QUENCH:
 413                /* Just silently ignore these. */
 414                goto out;
 415        case ICMP_PARAMETERPROB:
 416                err = EPROTO;
 417                break;
 418        case ICMP_DEST_UNREACH:
 419                if (code > NR_ICMP_UNREACH)
 420                        goto out;
 421
 422                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 423                        tp->mtu_info = info;
 424                        if (!sock_owned_by_user(sk)) {
 425                                tcp_v4_mtu_reduced(sk);
 426                        } else {
 427                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
 428                                        sock_hold(sk);
 429                        }
 430                        goto out;
 431                }
 432
 433                err = icmp_err_convert[code].errno;
 434                /* check if icmp_skb allows revert of backoff
 435                 * (see draft-zimmermann-tcp-lcd) */
 436                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 437                        break;
 438                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 439                    !icsk->icsk_backoff)
 440                        break;
 441
 442                /* XXX (TFO) - revisit the following logic for TFO */
 443
 444                if (sock_owned_by_user(sk))
 445                        break;
 446
 447                icsk->icsk_backoff--;
 448                inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
 449                        TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
 450                tcp_bound_rto(sk);
 451
 452                skb = tcp_write_queue_head(sk);
 453                BUG_ON(!skb);
 454
 455                remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 456                                tcp_time_stamp - TCP_SKB_CB(skb)->when);
 457
 458                if (remaining) {
 459                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 460                                                  remaining, TCP_RTO_MAX);
 461                } else {
 462                        /* RTO revert clocked out retransmission.
 463                         * Will retransmit now */
 464                        tcp_retransmit_timer(sk);
 465                }
 466
 467                break;
 468        case ICMP_TIME_EXCEEDED:
 469                err = EHOSTUNREACH;
 470                break;
 471        default:
 472                goto out;
 473        }
 474
 475        /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
 476         * than following the TCP_SYN_RECV case and closing the socket,
 477         * we ignore the ICMP error and keep trying like a fully established
 478         * socket. Is this the right thing to do?
 479         */
 480        if (req && req->sk == NULL)
 481                goto out;
 482
 483        switch (sk->sk_state) {
 484                struct request_sock *req, **prev;
 485        case TCP_LISTEN:
 486                if (sock_owned_by_user(sk))
 487                        goto out;
 488
 489                req = inet_csk_search_req(sk, &prev, th->dest,
 490                                          iph->daddr, iph->saddr);
 491                if (!req)
 492                        goto out;
 493
 494                /* ICMPs are not backlogged, hence we cannot get
 495                   an established socket here.
 496                 */
 497                WARN_ON(req->sk);
 498
 499                if (seq != tcp_rsk(req)->snt_isn) {
 500                        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 501                        goto out;
 502                }
 503
 504                /*
 505                 * Still in SYN_RECV, just remove it silently.
 506                 * There is no good way to pass the error to the newly
 507                 * created socket, and POSIX does not want network
 508                 * errors returned from accept().
 509                 */
 510                inet_csk_reqsk_queue_drop(sk, req, prev);
 511                goto out;
 512
 513        case TCP_SYN_SENT:
 514        case TCP_SYN_RECV:  /* Cannot happen.
 515                               It can f.e. if SYNs crossed,
 516                               or Fast Open.
 517                             */
 518                if (!sock_owned_by_user(sk)) {
 519                        sk->sk_err = err;
 520
 521                        sk->sk_error_report(sk);
 522
 523                        tcp_done(sk);
 524                } else {
 525                        sk->sk_err_soft = err;
 526                }
 527                goto out;
 528        }
 529
 530        /* If we've already connected we will keep trying
 531         * until we time out, or the user gives up.
 532         *
 533         * rfc1122 4.2.3.9 allows to consider as hard errors
 534         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 535         * but it is obsoleted by pmtu discovery).
 536         *
 537         * Note, that in modern internet, where routing is unreliable
 538         * and in each dark corner broken firewalls sit, sending random
 539         * errors ordered by their masters even this two messages finally lose
 540         * their original sense (even Linux sends invalid PORT_UNREACHs)
 541         *
 542         * Now we are in compliance with RFCs.
 543         *                                                      --ANK (980905)
 544         */
 545
 546        inet = inet_sk(sk);
 547        if (!sock_owned_by_user(sk) && inet->recverr) {
 548                sk->sk_err = err;
 549                sk->sk_error_report(sk);
 550        } else  { /* Only an error on timeout */
 551                sk->sk_err_soft = err;
 552        }
 553
 554out:
 555        bh_unlock_sock(sk);
 556        sock_put(sk);
 557}
 558
 559static void __tcp_v4_send_check(struct sk_buff *skb,
 560                                __be32 saddr, __be32 daddr)
 561{
 562        struct tcphdr *th = tcp_hdr(skb);
 563
 564        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 565                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 566                skb->csum_start = skb_transport_header(skb) - skb->head;
 567                skb->csum_offset = offsetof(struct tcphdr, check);
 568        } else {
 569                th->check = tcp_v4_check(skb->len, saddr, daddr,
 570                                         csum_partial(th,
 571                                                      th->doff << 2,
 572                                                      skb->csum));
 573        }
 574}
 575
 576/* This routine computes an IPv4 TCP checksum. */
 577void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 578{
 579        const struct inet_sock *inet = inet_sk(sk);
 580
 581        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 582}
 583EXPORT_SYMBOL(tcp_v4_send_check);
 584
 585int tcp_v4_gso_send_check(struct sk_buff *skb)
 586{
 587        const struct iphdr *iph;
 588        struct tcphdr *th;
 589
 590        if (!pskb_may_pull(skb, sizeof(*th)))
 591                return -EINVAL;
 592
 593        iph = ip_hdr(skb);
 594        th = tcp_hdr(skb);
 595
 596        th->check = 0;
 597        skb->ip_summed = CHECKSUM_PARTIAL;
 598        __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
 599        return 0;
 600}
 601
 602/*
 603 *      This routine will send an RST to the other tcp.
 604 *
 605 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 606 *                    for reset.
 607 *      Answer: if a packet caused RST, it is not for a socket
 608 *              existing in our system, if it is matched to a socket,
 609 *              it is just duplicate segment or bug in other side's TCP.
 610 *              So that we build reply only basing on parameters
 611 *              arrived with segment.
 612 *      Exception: precedence violation. We do not implement it in any case.
 613 */
 614
 615static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 616{
 617        const struct tcphdr *th = tcp_hdr(skb);
 618        struct {
 619                struct tcphdr th;
 620#ifdef CONFIG_TCP_MD5SIG
 621                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 622#endif
 623        } rep;
 624        struct ip_reply_arg arg;
 625#ifdef CONFIG_TCP_MD5SIG
 626        struct tcp_md5sig_key *key;
 627        const __u8 *hash_location = NULL;
 628        unsigned char newhash[16];
 629        int genhash;
 630        struct sock *sk1 = NULL;
 631#endif
 632        struct net *net;
 633
 634        /* Never send a reset in response to a reset. */
 635        if (th->rst)
 636                return;
 637
 638        if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 639                return;
 640
 641        /* Swap the send and the receive. */
 642        memset(&rep, 0, sizeof(rep));
 643        rep.th.dest   = th->source;
 644        rep.th.source = th->dest;
 645        rep.th.doff   = sizeof(struct tcphdr) / 4;
 646        rep.th.rst    = 1;
 647
 648        if (th->ack) {
 649                rep.th.seq = th->ack_seq;
 650        } else {
 651                rep.th.ack = 1;
 652                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 653                                       skb->len - (th->doff << 2));
 654        }
 655
 656        memset(&arg, 0, sizeof(arg));
 657        arg.iov[0].iov_base = (unsigned char *)&rep;
 658        arg.iov[0].iov_len  = sizeof(rep.th);
 659
 660#ifdef CONFIG_TCP_MD5SIG
 661        hash_location = tcp_parse_md5sig_option(th);
 662        if (!sk && hash_location) {
 663                /*
 664                 * active side is lost. Try to find listening socket through
 665                 * source port, and then find md5 key through listening socket.
 666                 * we are not loose security here:
 667                 * Incoming packet is checked with md5 hash with finding key,
 668                 * no RST generated if md5 hash doesn't match.
 669                 */
 670                sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
 671                                             &tcp_hashinfo, ip_hdr(skb)->daddr,
 672                                             ntohs(th->source), inet_iif(skb));
 673                /* don't send rst if it can't find key */
 674                if (!sk1)
 675                        return;
 676                rcu_read_lock();
 677                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 678                                        &ip_hdr(skb)->saddr, AF_INET);
 679                if (!key)
 680                        goto release_sk1;
 681
 682                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
 683                if (genhash || memcmp(hash_location, newhash, 16) != 0)
 684                        goto release_sk1;
 685        } else {
 686                key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 687                                             &ip_hdr(skb)->saddr,
 688                                             AF_INET) : NULL;
 689        }
 690
 691        if (key) {
 692                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 693                                   (TCPOPT_NOP << 16) |
 694                                   (TCPOPT_MD5SIG << 8) |
 695                                   TCPOLEN_MD5SIG);
 696                /* Update length and the length the header thinks exists */
 697                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 698                rep.th.doff = arg.iov[0].iov_len / 4;
 699
 700                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 701                                     key, ip_hdr(skb)->saddr,
 702                                     ip_hdr(skb)->daddr, &rep.th);
 703        }
 704#endif
 705        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 706                                      ip_hdr(skb)->saddr, /* XXX */
 707                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 708        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 709        arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 710        /* When socket is gone, all binding information is lost.
 711         * routing might fail in this case. No choice here, if we choose to force
 712         * input interface, we will misroute in case of asymmetric route.
 713         */
 714        if (sk)
 715                arg.bound_dev_if = sk->sk_bound_dev_if;
 716
 717        net = dev_net(skb_dst(skb)->dev);
 718        arg.tos = ip_hdr(skb)->tos;
 719        ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
 720                              ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 721
 722        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 723        TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 724
 725#ifdef CONFIG_TCP_MD5SIG
 726release_sk1:
 727        if (sk1) {
 728                rcu_read_unlock();
 729                sock_put(sk1);
 730        }
 731#endif
 732}
 733
 734/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 735   outside socket context is ugly, certainly. What can I do?
 736 */
 737
 738static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 739                            u32 win, u32 ts, int oif,
 740                            struct tcp_md5sig_key *key,
 741                            int reply_flags, u8 tos)
 742{
 743        const struct tcphdr *th = tcp_hdr(skb);
 744        struct {
 745                struct tcphdr th;
 746                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 747#ifdef CONFIG_TCP_MD5SIG
 748                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 749#endif
 750                        ];
 751        } rep;
 752        struct ip_reply_arg arg;
 753        struct net *net = dev_net(skb_dst(skb)->dev);
 754
 755        memset(&rep.th, 0, sizeof(struct tcphdr));
 756        memset(&arg, 0, sizeof(arg));
 757
 758        arg.iov[0].iov_base = (unsigned char *)&rep;
 759        arg.iov[0].iov_len  = sizeof(rep.th);
 760        if (ts) {
 761                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 762                                   (TCPOPT_TIMESTAMP << 8) |
 763                                   TCPOLEN_TIMESTAMP);
 764                rep.opt[1] = htonl(tcp_time_stamp);
 765                rep.opt[2] = htonl(ts);
 766                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 767        }
 768
 769        /* Swap the send and the receive. */
 770        rep.th.dest    = th->source;
 771        rep.th.source  = th->dest;
 772        rep.th.doff    = arg.iov[0].iov_len / 4;
 773        rep.th.seq     = htonl(seq);
 774        rep.th.ack_seq = htonl(ack);
 775        rep.th.ack     = 1;
 776        rep.th.window  = htons(win);
 777
 778#ifdef CONFIG_TCP_MD5SIG
 779        if (key) {
 780                int offset = (ts) ? 3 : 0;
 781
 782                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 783                                          (TCPOPT_NOP << 16) |
 784                                          (TCPOPT_MD5SIG << 8) |
 785                                          TCPOLEN_MD5SIG);
 786                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 787                rep.th.doff = arg.iov[0].iov_len/4;
 788
 789                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 790                                    key, ip_hdr(skb)->saddr,
 791                                    ip_hdr(skb)->daddr, &rep.th);
 792        }
 793#endif
 794        arg.flags = reply_flags;
 795        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 796                                      ip_hdr(skb)->saddr, /* XXX */
 797                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 798        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 799        if (oif)
 800                arg.bound_dev_if = oif;
 801        arg.tos = tos;
 802        ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
 803                              ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 804
 805        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 806}
 807
 808static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 809{
 810        struct inet_timewait_sock *tw = inet_twsk(sk);
 811        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 812
 813        tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 814                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 815                        tcptw->tw_ts_recent,
 816                        tw->tw_bound_dev_if,
 817                        tcp_twsk_md5_key(tcptw),
 818                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 819                        tw->tw_tos
 820                        );
 821
 822        inet_twsk_put(tw);
 823}
 824
 825static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 826                                  struct request_sock *req)
 827{
 828        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 829         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 830         */
 831        tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
 832                        tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
 833                        tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
 834                        req->ts_recent,
 835                        0,
 836                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
 837                                          AF_INET),
 838                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 839                        ip_hdr(skb)->tos);
 840}
 841
 842/*
 843 *      Send a SYN-ACK after having received a SYN.
 844 *      This still operates on a request_sock only, not on a big
 845 *      socket.
 846 */
 847static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 848                              struct request_sock *req,
 849                              struct request_values *rvp,
 850                              u16 queue_mapping,
 851                              bool nocache)
 852{
 853        const struct inet_request_sock *ireq = inet_rsk(req);
 854        struct flowi4 fl4;
 855        int err = -1;
 856        struct sk_buff * skb;
 857
 858        /* First, grab a route. */
 859        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 860                return -1;
 861
 862        skb = tcp_make_synack(sk, dst, req, rvp, NULL);
 863
 864        if (skb) {
 865                __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 866
 867                skb_set_queue_mapping(skb, queue_mapping);
 868                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 869                                            ireq->rmt_addr,
 870                                            ireq->opt);
 871                err = net_xmit_eval(err);
 872                if (!tcp_rsk(req)->snt_synack && !err)
 873                        tcp_rsk(req)->snt_synack = tcp_time_stamp;
 874        }
 875
 876        return err;
 877}
 878
 879static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 880                              struct request_values *rvp)
 881{
 882        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 883        return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
 884}
 885
 886/*
 887 *      IPv4 request_sock destructor.
 888 */
 889static void tcp_v4_reqsk_destructor(struct request_sock *req)
 890{
 891        kfree(inet_rsk(req)->opt);
 892}
 893
 894/*
 895 * Return true if a syncookie should be sent
 896 */
 897bool tcp_syn_flood_action(struct sock *sk,
 898                         const struct sk_buff *skb,
 899                         const char *proto)
 900{
 901        const char *msg = "Dropping request";
 902        bool want_cookie = false;
 903        struct listen_sock *lopt;
 904
 905
 906
 907#ifdef CONFIG_SYN_COOKIES
 908        if (sysctl_tcp_syncookies) {
 909                msg = "Sending cookies";
 910                want_cookie = true;
 911                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
 912        } else
 913#endif
 914                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
 915
 916        lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
 917        if (!lopt->synflood_warned) {
 918                lopt->synflood_warned = 1;
 919                pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
 920                        proto, ntohs(tcp_hdr(skb)->dest), msg);
 921        }
 922        return want_cookie;
 923}
 924EXPORT_SYMBOL(tcp_syn_flood_action);
 925
 926/*
 927 * Save and compile IPv4 options into the request_sock if needed.
 928 */
 929static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
 930{
 931        const struct ip_options *opt = &(IPCB(skb)->opt);
 932        struct ip_options_rcu *dopt = NULL;
 933
 934        if (opt && opt->optlen) {
 935                int opt_size = sizeof(*dopt) + opt->optlen;
 936
 937                dopt = kmalloc(opt_size, GFP_ATOMIC);
 938                if (dopt) {
 939                        if (ip_options_echo(&dopt->opt, skb)) {
 940                                kfree(dopt);
 941                                dopt = NULL;
 942                        }
 943                }
 944        }
 945        return dopt;
 946}
 947
 948#ifdef CONFIG_TCP_MD5SIG
 949/*
 950 * RFC2385 MD5 checksumming requires a mapping of
 951 * IP address->MD5 Key.
 952 * We need to maintain these in the sk structure.
 953 */
 954
 955/* Find the Key structure for an address.  */
 956struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
 957                                         const union tcp_md5_addr *addr,
 958                                         int family)
 959{
 960        struct tcp_sock *tp = tcp_sk(sk);
 961        struct tcp_md5sig_key *key;
 962        struct hlist_node *pos;
 963        unsigned int size = sizeof(struct in_addr);
 964        struct tcp_md5sig_info *md5sig;
 965
 966        /* caller either holds rcu_read_lock() or socket lock */
 967        md5sig = rcu_dereference_check(tp->md5sig_info,
 968                                       sock_owned_by_user(sk) ||
 969                                       lockdep_is_held(&sk->sk_lock.slock));
 970        if (!md5sig)
 971                return NULL;
 972#if IS_ENABLED(CONFIG_IPV6)
 973        if (family == AF_INET6)
 974                size = sizeof(struct in6_addr);
 975#endif
 976        hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
 977                if (key->family != family)
 978                        continue;
 979                if (!memcmp(&key->addr, addr, size))
 980                        return key;
 981        }
 982        return NULL;
 983}
 984EXPORT_SYMBOL(tcp_md5_do_lookup);
 985
 986struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 987                                         struct sock *addr_sk)
 988{
 989        union tcp_md5_addr *addr;
 990
 991        addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
 992        return tcp_md5_do_lookup(sk, addr, AF_INET);
 993}
 994EXPORT_SYMBOL(tcp_v4_md5_lookup);
 995
 996static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 997                                                      struct request_sock *req)
 998{
 999        union tcp_md5_addr *addr;
1000
1001        addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
1002        return tcp_md5_do_lookup(sk, addr, AF_INET);
1003}
1004
1005/* This can be called on a newly created socket, from other files */
1006int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1007                   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
1008{
1009        /* Add Key to the list */
1010        struct tcp_md5sig_key *key;
1011        struct tcp_sock *tp = tcp_sk(sk);
1012        struct tcp_md5sig_info *md5sig;
1013
1014        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1015        if (key) {
1016                /* Pre-existing entry - just update that one. */
1017                memcpy(key->key, newkey, newkeylen);
1018                key->keylen = newkeylen;
1019                return 0;
1020        }
1021
1022        md5sig = rcu_dereference_protected(tp->md5sig_info,
1023                                           sock_owned_by_user(sk));
1024        if (!md5sig) {
1025                md5sig = kmalloc(sizeof(*md5sig), gfp);
1026                if (!md5sig)
1027                        return -ENOMEM;
1028
1029                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1030                INIT_HLIST_HEAD(&md5sig->head);
1031                rcu_assign_pointer(tp->md5sig_info, md5sig);
1032        }
1033
1034        key = sock_kmalloc(sk, sizeof(*key), gfp);
1035        if (!key)
1036                return -ENOMEM;
1037        if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1038                sock_kfree_s(sk, key, sizeof(*key));
1039                return -ENOMEM;
1040        }
1041
1042        memcpy(key->key, newkey, newkeylen);
1043        key->keylen = newkeylen;
1044        key->family = family;
1045        memcpy(&key->addr, addr,
1046               (family == AF_INET6) ? sizeof(struct in6_addr) :
1047                                      sizeof(struct in_addr));
1048        hlist_add_head_rcu(&key->node, &md5sig->head);
1049        return 0;
1050}
1051EXPORT_SYMBOL(tcp_md5_do_add);
1052
1053int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1054{
1055        struct tcp_sock *tp = tcp_sk(sk);
1056        struct tcp_md5sig_key *key;
1057        struct tcp_md5sig_info *md5sig;
1058
1059        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1060        if (!key)
1061                return -ENOENT;
1062        hlist_del_rcu(&key->node);
1063        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1064        kfree_rcu(key, rcu);
1065        md5sig = rcu_dereference_protected(tp->md5sig_info,
1066                                           sock_owned_by_user(sk));
1067        if (hlist_empty(&md5sig->head))
1068                tcp_free_md5sig_pool();
1069        return 0;
1070}
1071EXPORT_SYMBOL(tcp_md5_do_del);
1072
1073void tcp_clear_md5_list(struct sock *sk)
1074{
1075        struct tcp_sock *tp = tcp_sk(sk);
1076        struct tcp_md5sig_key *key;
1077        struct hlist_node *pos, *n;
1078        struct tcp_md5sig_info *md5sig;
1079
1080        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1081
1082        if (!hlist_empty(&md5sig->head))
1083                tcp_free_md5sig_pool();
1084        hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
1085                hlist_del_rcu(&key->node);
1086                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1087                kfree_rcu(key, rcu);
1088        }
1089}
1090
1091static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1092                                 int optlen)
1093{
1094        struct tcp_md5sig cmd;
1095        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1096
1097        if (optlen < sizeof(cmd))
1098                return -EINVAL;
1099
1100        if (copy_from_user(&cmd, optval, sizeof(cmd)))
1101                return -EFAULT;
1102
1103        if (sin->sin_family != AF_INET)
1104                return -EINVAL;
1105
1106        if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1107                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1108                                      AF_INET);
1109
1110        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1111                return -EINVAL;
1112
1113        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1114                              AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1115                              GFP_KERNEL);
1116}
1117
1118static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1119                                        __be32 daddr, __be32 saddr, int nbytes)
1120{
1121        struct tcp4_pseudohdr *bp;
1122        struct scatterlist sg;
1123
1124        bp = &hp->md5_blk.ip4;
1125
1126        /*
1127         * 1. the TCP pseudo-header (in the order: source IP address,
1128         * destination IP address, zero-padded protocol number, and
1129         * segment length)
1130         */
1131        bp->saddr = saddr;
1132        bp->daddr = daddr;
1133        bp->pad = 0;
1134        bp->protocol = IPPROTO_TCP;
1135        bp->len = cpu_to_be16(nbytes);
1136
1137        sg_init_one(&sg, bp, sizeof(*bp));
1138        return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1139}
1140
1141static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1142                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
1143{
1144        struct tcp_md5sig_pool *hp;
1145        struct hash_desc *desc;
1146
1147        hp = tcp_get_md5sig_pool();
1148        if (!hp)
1149                goto clear_hash_noput;
1150        desc = &hp->md5_desc;
1151
1152        if (crypto_hash_init(desc))
1153                goto clear_hash;
1154        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1155                goto clear_hash;
1156        if (tcp_md5_hash_header(hp, th))
1157                goto clear_hash;
1158        if (tcp_md5_hash_key(hp, key))
1159                goto clear_hash;
1160        if (crypto_hash_final(desc, md5_hash))
1161                goto clear_hash;
1162
1163        tcp_put_md5sig_pool();
1164        return 0;
1165
1166clear_hash:
1167        tcp_put_md5sig_pool();
1168clear_hash_noput:
1169        memset(md5_hash, 0, 16);
1170        return 1;
1171}
1172
1173int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1174                        const struct sock *sk, const struct request_sock *req,
1175                        const struct sk_buff *skb)
1176{
1177        struct tcp_md5sig_pool *hp;
1178        struct hash_desc *desc;
1179        const struct tcphdr *th = tcp_hdr(skb);
1180        __be32 saddr, daddr;
1181
1182        if (sk) {
1183                saddr = inet_sk(sk)->inet_saddr;
1184                daddr = inet_sk(sk)->inet_daddr;
1185        } else if (req) {
1186                saddr = inet_rsk(req)->loc_addr;
1187                daddr = inet_rsk(req)->rmt_addr;
1188        } else {
1189                const struct iphdr *iph = ip_hdr(skb);
1190                saddr = iph->saddr;
1191                daddr = iph->daddr;
1192        }
1193
1194        hp = tcp_get_md5sig_pool();
1195        if (!hp)
1196                goto clear_hash_noput;
1197        desc = &hp->md5_desc;
1198
1199        if (crypto_hash_init(desc))
1200                goto clear_hash;
1201
1202        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1203                goto clear_hash;
1204        if (tcp_md5_hash_header(hp, th))
1205                goto clear_hash;
1206        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1207                goto clear_hash;
1208        if (tcp_md5_hash_key(hp, key))
1209                goto clear_hash;
1210        if (crypto_hash_final(desc, md5_hash))
1211                goto clear_hash;
1212
1213        tcp_put_md5sig_pool();
1214        return 0;
1215
1216clear_hash:
1217        tcp_put_md5sig_pool();
1218clear_hash_noput:
1219        memset(md5_hash, 0, 16);
1220        return 1;
1221}
1222EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1223
1224static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1225{
1226        /*
1227         * This gets called for each TCP segment that arrives
1228         * so we want to be efficient.
1229         * We have 3 drop cases:
1230         * o No MD5 hash and one expected.
1231         * o MD5 hash and we're not expecting one.
1232         * o MD5 hash and its wrong.
1233         */
1234        const __u8 *hash_location = NULL;
1235        struct tcp_md5sig_key *hash_expected;
1236        const struct iphdr *iph = ip_hdr(skb);
1237        const struct tcphdr *th = tcp_hdr(skb);
1238        int genhash;
1239        unsigned char newhash[16];
1240
1241        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1242                                          AF_INET);
1243        hash_location = tcp_parse_md5sig_option(th);
1244
1245        /* We've parsed the options - do we have a hash? */
1246        if (!hash_expected && !hash_location)
1247                return false;
1248
1249        if (hash_expected && !hash_location) {
1250                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1251                return true;
1252        }
1253
1254        if (!hash_expected && hash_location) {
1255                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1256                return true;
1257        }
1258
1259        /* Okay, so this is hash_expected and hash_location -
1260         * so we need to calculate the checksum.
1261         */
1262        genhash = tcp_v4_md5_hash_skb(newhash,
1263                                      hash_expected,
1264                                      NULL, NULL, skb);
1265
1266        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1267                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1268                                     &iph->saddr, ntohs(th->source),
1269                                     &iph->daddr, ntohs(th->dest),
1270                                     genhash ? " tcp_v4_calc_md5_hash failed"
1271                                     : "");
1272                return true;
1273        }
1274        return false;
1275}
1276
1277#endif
1278
1279struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1280        .family         =       PF_INET,
1281        .obj_size       =       sizeof(struct tcp_request_sock),
1282        .rtx_syn_ack    =       tcp_v4_rtx_synack,
1283        .send_ack       =       tcp_v4_reqsk_send_ack,
1284        .destructor     =       tcp_v4_reqsk_destructor,
1285        .send_reset     =       tcp_v4_send_reset,
1286        .syn_ack_timeout =      tcp_syn_ack_timeout,
1287};
1288
1289#ifdef CONFIG_TCP_MD5SIG
1290static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1291        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1292        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1293};
1294#endif
1295
1296static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1297                               struct request_sock *req,
1298                               struct tcp_fastopen_cookie *foc,
1299                               struct tcp_fastopen_cookie *valid_foc)
1300{
1301        bool skip_cookie = false;
1302        struct fastopen_queue *fastopenq;
1303
1304        if (likely(!fastopen_cookie_present(foc))) {
1305                /* See include/net/tcp.h for the meaning of these knobs */
1306                if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1307                    ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1308                    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1309                        skip_cookie = true; /* no cookie to validate */
1310                else
1311                        return false;
1312        }
1313        fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1314        /* A FO option is present; bump the counter. */
1315        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1316
1317        /* Make sure the listener has enabled fastopen, and we don't
1318         * exceed the max # of pending TFO requests allowed before trying
1319         * to validating the cookie in order to avoid burning CPU cycles
1320         * unnecessarily.
1321         *
1322         * XXX (TFO) - The implication of checking the max_qlen before
1323         * processing a cookie request is that clients can't differentiate
1324         * between qlen overflow causing Fast Open to be disabled
1325         * temporarily vs a server not supporting Fast Open at all.
1326         */
1327        if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1328            fastopenq == NULL || fastopenq->max_qlen == 0)
1329                return false;
1330
1331        if (fastopenq->qlen >= fastopenq->max_qlen) {
1332                struct request_sock *req1;
1333                spin_lock(&fastopenq->lock);
1334                req1 = fastopenq->rskq_rst_head;
1335                if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1336                        spin_unlock(&fastopenq->lock);
1337                        NET_INC_STATS_BH(sock_net(sk),
1338                            LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1339                        /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1340                        foc->len = -1;
1341                        return false;
1342                }
1343                fastopenq->rskq_rst_head = req1->dl_next;
1344                fastopenq->qlen--;
1345                spin_unlock(&fastopenq->lock);
1346                reqsk_free(req1);
1347        }
1348        if (skip_cookie) {
1349                tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1350                return true;
1351        }
1352        if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1353                if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1354                        tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1355                        if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1356                            memcmp(&foc->val[0], &valid_foc->val[0],
1357                            TCP_FASTOPEN_COOKIE_SIZE) != 0)
1358                                return false;
1359                        valid_foc->len = -1;
1360                }
1361                /* Acknowledge the data received from the peer. */
1362                tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1363                return true;
1364        } else if (foc->len == 0) { /* Client requesting a cookie */
1365                tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1366                NET_INC_STATS_BH(sock_net(sk),
1367                    LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1368        } else {
1369                /* Client sent a cookie with wrong size. Treat it
1370                 * the same as invalid and return a valid one.
1371                 */
1372                tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1373        }
1374        return false;
1375}
1376
1377static int tcp_v4_conn_req_fastopen(struct sock *sk,
1378                                    struct sk_buff *skb,
1379                                    struct sk_buff *skb_synack,
1380                                    struct request_sock *req,
1381                                    struct request_values *rvp)
1382{
1383        struct tcp_sock *tp = tcp_sk(sk);
1384        struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1385        const struct inet_request_sock *ireq = inet_rsk(req);
1386        struct sock *child;
1387        int err;
1388
1389        req->retrans = 0;
1390        req->sk = NULL;
1391
1392        child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1393        if (child == NULL) {
1394                NET_INC_STATS_BH(sock_net(sk),
1395                                 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1396                kfree_skb(skb_synack);
1397                return -1;
1398        }
1399        err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1400                                    ireq->rmt_addr, ireq->opt);
1401        err = net_xmit_eval(err);
1402        if (!err)
1403                tcp_rsk(req)->snt_synack = tcp_time_stamp;
1404        /* XXX (TFO) - is it ok to ignore error and continue? */
1405
1406        spin_lock(&queue->fastopenq->lock);
1407        queue->fastopenq->qlen++;
1408        spin_unlock(&queue->fastopenq->lock);
1409
1410        /* Initialize the child socket. Have to fix some values to take
1411         * into account the child is a Fast Open socket and is created
1412         * only out of the bits carried in the SYN packet.
1413         */
1414        tp = tcp_sk(child);
1415
1416        tp->fastopen_rsk = req;
1417        /* Do a hold on the listner sk so that if the listener is being
1418         * closed, the child that has been accepted can live on and still
1419         * access listen_lock.
1420         */
1421        sock_hold(sk);
1422        tcp_rsk(req)->listener = sk;
1423
1424        /* RFC1323: The window in SYN & SYN/ACK segments is never
1425         * scaled. So correct it appropriately.
1426         */
1427        tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1428
1429        /* Activate the retrans timer so that SYNACK can be retransmitted.
1430         * The request socket is not added to the SYN table of the parent
1431         * because it's been added to the accept queue directly.
1432         */
1433        inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1434            TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1435
1436        /* Add the child socket directly into the accept queue */
1437        inet_csk_reqsk_queue_add(sk, req, child);
1438
1439        /* Now finish processing the fastopen child socket. */
1440        inet_csk(child)->icsk_af_ops->rebuild_header(child);
1441        tcp_init_congestion_control(child);
1442        tcp_mtup_init(child);
1443        tcp_init_buffer_space(child);
1444        tcp_init_metrics(child);
1445
1446        /* Queue the data carried in the SYN packet. We need to first
1447         * bump skb's refcnt because the caller will attempt to free it.
1448         *
1449         * XXX (TFO) - we honor a zero-payload TFO request for now.
1450         * (Any reason not to?)
1451         */
1452        if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1453                /* Don't queue the skb if there is no payload in SYN.
1454                 * XXX (TFO) - How about SYN+FIN?
1455                 */
1456                tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1457        } else {
1458                skb = skb_get(skb);
1459                skb_dst_drop(skb);
1460                __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1461                skb_set_owner_r(skb, child);
1462                __skb_queue_tail(&child->sk_receive_queue, skb);
1463                tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1464                tp->syn_data_acked = 1;
1465        }
1466        sk->sk_data_ready(sk, 0);
1467        bh_unlock_sock(child);
1468        sock_put(child);
1469        WARN_ON(req->sk == NULL);
1470        return 0;
1471}
1472
1473int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1474{
1475        struct tcp_extend_values tmp_ext;
1476        struct tcp_options_received tmp_opt;
1477        const u8 *hash_location;
1478        struct request_sock *req;
1479        struct inet_request_sock *ireq;
1480        struct tcp_sock *tp = tcp_sk(sk);
1481        struct dst_entry *dst = NULL;
1482        __be32 saddr = ip_hdr(skb)->saddr;
1483        __be32 daddr = ip_hdr(skb)->daddr;
1484        __u32 isn = TCP_SKB_CB(skb)->when;
1485        bool want_cookie = false;
1486        struct flowi4 fl4;
1487        struct tcp_fastopen_cookie foc = { .len = -1 };
1488        struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1489        struct sk_buff *skb_synack;
1490        int do_fastopen;
1491
1492        /* Never answer to SYNs send to broadcast or multicast */
1493        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1494                goto drop;
1495
1496        /* TW buckets are converted to open requests without
1497         * limitations, they conserve resources and peer is
1498         * evidently real one.
1499         */
1500        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1501                want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1502                if (!want_cookie)
1503                        goto drop;
1504        }
1505
1506        /* Accept backlog is full. If we have already queued enough
1507         * of warm entries in syn queue, drop request. It is better than
1508         * clogging syn queue with openreqs with exponentially increasing
1509         * timeout.
1510         */
1511        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1512                goto drop;
1513
1514        req = inet_reqsk_alloc(&tcp_request_sock_ops);
1515        if (!req)
1516                goto drop;
1517
1518#ifdef CONFIG_TCP_MD5SIG
1519        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1520#endif
1521
1522        tcp_clear_options(&tmp_opt);
1523        tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1524        tmp_opt.user_mss  = tp->rx_opt.user_mss;
1525        tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
1526            want_cookie ? NULL : &foc);
1527
1528        if (tmp_opt.cookie_plus > 0 &&
1529            tmp_opt.saw_tstamp &&
1530            !tp->rx_opt.cookie_out_never &&
1531            (sysctl_tcp_cookie_size > 0 ||
1532             (tp->cookie_values != NULL &&
1533              tp->cookie_values->cookie_desired > 0))) {
1534                u8 *c;
1535                u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1536                int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1537
1538                if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1539                        goto drop_and_release;
1540
1541                /* Secret recipe starts with IP addresses */
1542                *mess++ ^= (__force u32)daddr;
1543                *mess++ ^= (__force u32)saddr;
1544
1545                /* plus variable length Initiator Cookie */
1546                c = (u8 *)mess;
1547                while (l-- > 0)
1548                        *c++ ^= *hash_location++;
1549
1550                want_cookie = false;    /* not our kind of cookie */
1551                tmp_ext.cookie_out_never = 0; /* false */
1552                tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1553        } else if (!tp->rx_opt.cookie_in_always) {
1554                /* redundant indications, but ensure initialization. */
1555                tmp_ext.cookie_out_never = 1; /* true */
1556                tmp_ext.cookie_plus = 0;
1557        } else {
1558                goto drop_and_release;
1559        }
1560        tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1561
1562        if (want_cookie && !tmp_opt.saw_tstamp)
1563                tcp_clear_options(&tmp_opt);
1564
1565        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1566        tcp_openreq_init(req, &tmp_opt, skb);
1567
1568        ireq = inet_rsk(req);
1569        ireq->loc_addr = daddr;
1570        ireq->rmt_addr = saddr;
1571        ireq->no_srccheck = inet_sk(sk)->transparent;
1572        ireq->opt = tcp_v4_save_options(skb);
1573
1574        if (security_inet_conn_request(sk, skb, req))
1575                goto drop_and_free;
1576
1577        if (!want_cookie || tmp_opt.tstamp_ok)
1578                TCP_ECN_create_request(req, skb);
1579
1580        if (want_cookie) {
1581                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1582                req->cookie_ts = tmp_opt.tstamp_ok;
1583        } else if (!isn) {
1584                /* VJ's idea. We save last timestamp seen
1585                 * from the destination in peer table, when entering
1586                 * state TIME-WAIT, and check against it before
1587                 * accepting new connection request.
1588                 *
1589                 * If "isn" is not zero, this request hit alive
1590                 * timewait bucket, so that all the necessary checks
1591                 * are made in the function processing timewait state.
1592                 */
1593                if (tmp_opt.saw_tstamp &&
1594                    tcp_death_row.sysctl_tw_recycle &&
1595                    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1596                    fl4.daddr == saddr) {
1597                        if (!tcp_peer_is_proven(req, dst, true)) {
1598                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1599                                goto drop_and_release;
1600                        }
1601                }
1602                /* Kill the following clause, if you dislike this way. */
1603                else if (!sysctl_tcp_syncookies &&
1604                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1605                          (sysctl_max_syn_backlog >> 2)) &&
1606                         !tcp_peer_is_proven(req, dst, false)) {
1607                        /* Without syncookies last quarter of
1608                         * backlog is filled with destinations,
1609                         * proven to be alive.
1610                         * It means that we continue to communicate
1611                         * to destinations, already remembered
1612                         * to the moment of synflood.
1613                         */
1614                        LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1615                                       &saddr, ntohs(tcp_hdr(skb)->source));
1616                        goto drop_and_release;
1617                }
1618
1619                isn = tcp_v4_init_sequence(skb);
1620        }
1621        tcp_rsk(req)->snt_isn = isn;
1622
1623        if (dst == NULL) {
1624                dst = inet_csk_route_req(sk, &fl4, req);
1625                if (dst == NULL)
1626                        goto drop_and_free;
1627        }
1628        do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1629
1630        /* We don't call tcp_v4_send_synack() directly because we need
1631         * to make sure a child socket can be created successfully before
1632         * sending back synack!
1633         *
1634         * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1635         * (or better yet, call tcp_send_synack() in the child context
1636         * directly, but will have to fix bunch of other code first)
1637         * after syn_recv_sock() except one will need to first fix the
1638         * latter to remove its dependency on the current implementation
1639         * of tcp_v4_send_synack()->tcp_select_initial_window().
1640         */
1641        skb_synack = tcp_make_synack(sk, dst, req,
1642            (struct request_values *)&tmp_ext,
1643            fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1644
1645        if (skb_synack) {
1646                __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1647                skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1648        } else
1649                goto drop_and_free;
1650
1651        if (likely(!do_fastopen)) {
1652                int err;
1653                err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1654                     ireq->rmt_addr, ireq->opt);
1655                err = net_xmit_eval(err);
1656                if (err || want_cookie)
1657                        goto drop_and_free;
1658
1659                tcp_rsk(req)->snt_synack = tcp_time_stamp;
1660                tcp_rsk(req)->listener = NULL;
1661                /* Add the request_sock to the SYN table */
1662                inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1663                if (fastopen_cookie_present(&foc) && foc.len != 0)
1664                        NET_INC_STATS_BH(sock_net(sk),
1665                            LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1666        } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
1667            (struct request_values *)&tmp_ext))
1668                goto drop_and_free;
1669
1670        return 0;
1671
1672drop_and_release:
1673        dst_release(dst);
1674drop_and_free:
1675        reqsk_free(req);
1676drop:
1677        return 0;
1678}
1679EXPORT_SYMBOL(tcp_v4_conn_request);
1680
1681
1682/*
1683 * The three way handshake has completed - we got a valid synack -
1684 * now create the new socket.
1685 */
1686struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1687                                  struct request_sock *req,
1688                                  struct dst_entry *dst)
1689{
1690        struct inet_request_sock *ireq;
1691        struct inet_sock *newinet;
1692        struct tcp_sock *newtp;
1693        struct sock *newsk;
1694#ifdef CONFIG_TCP_MD5SIG
1695        struct tcp_md5sig_key *key;
1696#endif
1697        struct ip_options_rcu *inet_opt;
1698
1699        if (sk_acceptq_is_full(sk))
1700                goto exit_overflow;
1701
1702        newsk = tcp_create_openreq_child(sk, req, skb);
1703        if (!newsk)
1704                goto exit_nonewsk;
1705
1706        newsk->sk_gso_type = SKB_GSO_TCPV4;
1707        inet_sk_rx_dst_set(newsk, skb);
1708
1709        newtp                 = tcp_sk(newsk);
1710        newinet               = inet_sk(newsk);
1711        ireq                  = inet_rsk(req);
1712        newinet->inet_daddr   = ireq->rmt_addr;
1713        newinet->inet_rcv_saddr = ireq->loc_addr;
1714        newinet->inet_saddr           = ireq->loc_addr;
1715        inet_opt              = ireq->opt;
1716        rcu_assign_pointer(newinet->inet_opt, inet_opt);
1717        ireq->opt             = NULL;
1718        newinet->mc_index     = inet_iif(skb);
1719        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1720        newinet->rcv_tos      = ip_hdr(skb)->tos;
1721        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1722        if (inet_opt)
1723                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1724        newinet->inet_id = newtp->write_seq ^ jiffies;
1725
1726        if (!dst) {
1727                dst = inet_csk_route_child_sock(sk, newsk, req);
1728                if (!dst)
1729                        goto put_and_exit;
1730        } else {
1731                /* syncookie case : see end of cookie_v4_check() */
1732        }
1733        sk_setup_caps(newsk, dst);
1734
1735        tcp_mtup_init(newsk);
1736        tcp_sync_mss(newsk, dst_mtu(dst));
1737        newtp->advmss = dst_metric_advmss(dst);
1738        if (tcp_sk(sk)->rx_opt.user_mss &&
1739            tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1740                newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1741
1742        tcp_initialize_rcv_mss(newsk);
1743        tcp_synack_rtt_meas(newsk, req);
1744        newtp->total_retrans = req->retrans;
1745
1746#ifdef CONFIG_TCP_MD5SIG
1747        /* Copy over the MD5 key from the original socket */
1748        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1749                                AF_INET);
1750        if (key != NULL) {
1751                /*
1752                 * We're using one, so create a matching key
1753                 * on the newsk structure. If we fail to get
1754                 * memory, then we end up not copying the key
1755                 * across. Shucks.
1756                 */
1757                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1758                               AF_INET, key->key, key->keylen, GFP_ATOMIC);
1759                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1760        }
1761#endif
1762
1763        if (__inet_inherit_port(sk, newsk) < 0)
1764                goto put_and_exit;
1765        __inet_hash_nolisten(newsk, NULL);
1766
1767        return newsk;
1768
1769exit_overflow:
1770        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1771exit_nonewsk:
1772        dst_release(dst);
1773exit:
1774        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1775        return NULL;
1776put_and_exit:
1777        inet_csk_prepare_forced_close(newsk);
1778        tcp_done(newsk);
1779        goto exit;
1780}
1781EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1782
1783static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1784{
1785        struct tcphdr *th = tcp_hdr(skb);
1786        const struct iphdr *iph = ip_hdr(skb);
1787        struct sock *nsk;
1788        struct request_sock **prev;
1789        /* Find possible connection requests. */
1790        struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1791                                                       iph->saddr, iph->daddr);
1792        if (req)
1793                return tcp_check_req(sk, skb, req, prev, false);
1794
1795        nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1796                        th->source, iph->daddr, th->dest, inet_iif(skb));
1797
1798        if (nsk) {
1799                if (nsk->sk_state != TCP_TIME_WAIT) {
1800                        bh_lock_sock(nsk);
1801                        return nsk;
1802                }
1803                inet_twsk_put(inet_twsk(nsk));
1804                return NULL;
1805        }
1806
1807#ifdef CONFIG_SYN_COOKIES
1808        if (!th->syn)
1809                sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1810#endif
1811        return sk;
1812}
1813
1814static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1815{
1816        const struct iphdr *iph = ip_hdr(skb);
1817
1818        if (skb->ip_summed == CHECKSUM_COMPLETE) {
1819                if (!tcp_v4_check(skb->len, iph->saddr,
1820                                  iph->daddr, skb->csum)) {
1821                        skb->ip_summed = CHECKSUM_UNNECESSARY;
1822                        return 0;
1823                }
1824        }
1825
1826        skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1827                                       skb->len, IPPROTO_TCP, 0);
1828
1829        if (skb->len <= 76) {
1830                return __skb_checksum_complete(skb);
1831        }
1832        return 0;
1833}
1834
1835
1836/* The socket must have it's spinlock held when we get
1837 * here.
1838 *
1839 * We have a potential double-lock case here, so even when
1840 * doing backlog processing we use the BH locking scheme.
1841 * This is because we cannot sleep with the original spinlock
1842 * held.
1843 */
1844int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1845{
1846        struct sock *rsk;
1847#ifdef CONFIG_TCP_MD5SIG
1848        /*
1849         * We really want to reject the packet as early as possible
1850         * if:
1851         *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1852         *  o There is an MD5 option and we're not expecting one
1853         */
1854        if (tcp_v4_inbound_md5_hash(sk, skb))
1855                goto discard;
1856#endif
1857
1858        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1859                struct dst_entry *dst = sk->sk_rx_dst;
1860
1861                sock_rps_save_rxhash(sk, skb);
1862                if (dst) {
1863                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1864                            dst->ops->check(dst, 0) == NULL) {
1865                                dst_release(dst);
1866                                sk->sk_rx_dst = NULL;
1867                        }
1868                }
1869                if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1870                        rsk = sk;
1871                        goto reset;
1872                }
1873                return 0;
1874        }
1875
1876        if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1877                goto csum_err;
1878
1879        if (sk->sk_state == TCP_LISTEN) {
1880                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1881                if (!nsk)
1882                        goto discard;
1883
1884                if (nsk != sk) {
1885                        sock_rps_save_rxhash(nsk, skb);
1886                        if (tcp_child_process(sk, nsk, skb)) {
1887                                rsk = nsk;
1888                                goto reset;
1889                        }
1890                        return 0;
1891                }
1892        } else
1893                sock_rps_save_rxhash(sk, skb);
1894
1895        if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1896                rsk = sk;
1897                goto reset;
1898        }
1899        return 0;
1900
1901reset:
1902        tcp_v4_send_reset(rsk, skb);
1903discard:
1904        kfree_skb(skb);
1905        /* Be careful here. If this function gets more complicated and
1906         * gcc suffers from register pressure on the x86, sk (in %ebx)
1907         * might be destroyed here. This current version compiles correctly,
1908         * but you have been warned.
1909         */
1910        return 0;
1911
1912csum_err:
1913        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1914        goto discard;
1915}
1916EXPORT_SYMBOL(tcp_v4_do_rcv);
1917
1918void tcp_v4_early_demux(struct sk_buff *skb)
1919{
1920        struct net *net = dev_net(skb->dev);
1921        const struct iphdr *iph;
1922        const struct tcphdr *th;
1923        struct sock *sk;
1924
1925        if (skb->pkt_type != PACKET_HOST)
1926                return;
1927
1928        if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
1929                return;
1930
1931        iph = ip_hdr(skb);
1932        th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
1933
1934        if (th->doff < sizeof(struct tcphdr) / 4)
1935                return;
1936
1937        sk = __inet_lookup_established(net, &tcp_hashinfo,
1938                                       iph->saddr, th->source,
1939                                       iph->daddr, ntohs(th->dest),
1940                                       skb->skb_iif);
1941        if (sk) {
1942                skb->sk = sk;
1943                skb->destructor = sock_edemux;
1944                if (sk->sk_state != TCP_TIME_WAIT) {
1945                        struct dst_entry *dst = sk->sk_rx_dst;
1946
1947                        if (dst)
1948                                dst = dst_check(dst, 0);
1949                        if (dst &&
1950                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1951                                skb_dst_set_noref(skb, dst);
1952                }
1953        }
1954}
1955
1956/*
1957 *      From tcp_input.c
1958 */
1959
1960int tcp_v4_rcv(struct sk_buff *skb)
1961{
1962        const struct iphdr *iph;
1963        const struct tcphdr *th;
1964        struct sock *sk;
1965        int ret;
1966        struct net *net = dev_net(skb->dev);
1967
1968        if (skb->pkt_type != PACKET_HOST)
1969                goto discard_it;
1970
1971        /* Count it even if it's bad */
1972        TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1973
1974        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1975                goto discard_it;
1976
1977        th = tcp_hdr(skb);
1978
1979        if (th->doff < sizeof(struct tcphdr) / 4)
1980                goto bad_packet;
1981        if (!pskb_may_pull(skb, th->doff * 4))
1982                goto discard_it;
1983
1984        /* An explanation is required here, I think.
1985         * Packet length and doff are validated by header prediction,
1986         * provided case of th->doff==0 is eliminated.
1987         * So, we defer the checks. */
1988        if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1989                goto bad_packet;
1990
1991        th = tcp_hdr(skb);
1992        iph = ip_hdr(skb);
1993        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1994        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1995                                    skb->len - th->doff * 4);
1996        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1997        TCP_SKB_CB(skb)->when    = 0;
1998        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1999        TCP_SKB_CB(skb)->sacked  = 0;
2000
2001        sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
2002        if (!sk)
2003                goto no_tcp_socket;
2004
2005process:
2006        if (sk->sk_state == TCP_TIME_WAIT)
2007                goto do_time_wait;
2008
2009        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2010                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
2011                goto discard_and_relse;
2012        }
2013
2014        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2015                goto discard_and_relse;
2016        nf_reset(skb);
2017
2018        if (sk_filter(sk, skb))
2019                goto discard_and_relse;
2020
2021        skb->dev = NULL;
2022
2023        bh_lock_sock_nested(sk);
2024        ret = 0;
2025        if (!sock_owned_by_user(sk)) {
2026#ifdef CONFIG_NET_DMA
2027                struct tcp_sock *tp = tcp_sk(sk);
2028                if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2029                        tp->ucopy.dma_chan = net_dma_find_channel();
2030                if (tp->ucopy.dma_chan)
2031                        ret = tcp_v4_do_rcv(sk, skb);
2032                else
2033#endif
2034                {
2035                        if (!tcp_prequeue(sk, skb))
2036                                ret = tcp_v4_do_rcv(sk, skb);
2037                }
2038        } else if (unlikely(sk_add_backlog(sk, skb,
2039                                           sk->sk_rcvbuf + sk->sk_sndbuf))) {
2040                bh_unlock_sock(sk);
2041                NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2042                goto discard_and_relse;
2043        }
2044        bh_unlock_sock(sk);
2045
2046        sock_put(sk);
2047
2048        return ret;
2049
2050no_tcp_socket:
2051        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2052                goto discard_it;
2053
2054        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2055bad_packet:
2056                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2057        } else {
2058                tcp_v4_send_reset(NULL, skb);
2059        }
2060
2061discard_it:
2062        /* Discard frame. */
2063        kfree_skb(skb);
2064        return 0;
2065
2066discard_and_relse:
2067        sock_put(sk);
2068        goto discard_it;
2069
2070do_time_wait:
2071        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2072                inet_twsk_put(inet_twsk(sk));
2073                goto discard_it;
2074        }
2075
2076        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2077                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2078                inet_twsk_put(inet_twsk(sk));
2079                goto discard_it;
2080        }
2081        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2082        case TCP_TW_SYN: {
2083                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2084                                                        &tcp_hashinfo,
2085                                                        iph->daddr, th->dest,
2086                                                        inet_iif(skb));
2087                if (sk2) {
2088                        inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
2089                        inet_twsk_put(inet_twsk(sk));
2090                        sk = sk2;
2091                        goto process;
2092                }
2093                /* Fall through to ACK */
2094        }
2095        case TCP_TW_ACK:
2096                tcp_v4_timewait_ack(sk, skb);
2097                break;
2098        case TCP_TW_RST:
2099                goto no_tcp_socket;
2100        case TCP_TW_SUCCESS:;
2101        }
2102        goto discard_it;
2103}
2104
2105static struct timewait_sock_ops tcp_timewait_sock_ops = {
2106        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2107        .twsk_unique    = tcp_twsk_unique,
2108        .twsk_destructor= tcp_twsk_destructor,
2109};
2110
2111void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2112{
2113        struct dst_entry *dst = skb_dst(skb);
2114
2115        dst_hold(dst);
2116        sk->sk_rx_dst = dst;
2117        inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2118}
2119EXPORT_SYMBOL(inet_sk_rx_dst_set);
2120
2121const struct inet_connection_sock_af_ops ipv4_specific = {
2122        .queue_xmit        = ip_queue_xmit,
2123        .send_check        = tcp_v4_send_check,
2124        .rebuild_header    = inet_sk_rebuild_header,
2125        .sk_rx_dst_set     = inet_sk_rx_dst_set,
2126        .conn_request      = tcp_v4_conn_request,
2127        .syn_recv_sock     = tcp_v4_syn_recv_sock,
2128        .net_header_len    = sizeof(struct iphdr),
2129        .setsockopt        = ip_setsockopt,
2130        .getsockopt        = ip_getsockopt,
2131        .addr2sockaddr     = inet_csk_addr2sockaddr,
2132        .sockaddr_len      = sizeof(struct sockaddr_in),
2133        .bind_conflict     = inet_csk_bind_conflict,
2134#ifdef CONFIG_COMPAT
2135        .compat_setsockopt = compat_ip_setsockopt,
2136        .compat_getsockopt = compat_ip_getsockopt,
2137#endif
2138};
2139EXPORT_SYMBOL(ipv4_specific);
2140
2141#ifdef CONFIG_TCP_MD5SIG
2142static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2143        .md5_lookup             = tcp_v4_md5_lookup,
2144        .calc_md5_hash          = tcp_v4_md5_hash_skb,
2145        .md5_parse              = tcp_v4_parse_md5_keys,
2146};
2147#endif
2148
2149/* NOTE: A lot of things set to zero explicitly by call to
2150 *       sk_alloc() so need not be done here.
2151 */
2152static int tcp_v4_init_sock(struct sock *sk)
2153{
2154        struct inet_connection_sock *icsk = inet_csk(sk);
2155
2156        tcp_init_sock(sk);
2157
2158        icsk->icsk_af_ops = &ipv4_specific;
2159
2160#ifdef CONFIG_TCP_MD5SIG
2161        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2162#endif
2163
2164        return 0;
2165}
2166
2167void tcp_v4_destroy_sock(struct sock *sk)
2168{
2169        struct tcp_sock *tp = tcp_sk(sk);
2170
2171        tcp_clear_xmit_timers(sk);
2172
2173        tcp_cleanup_congestion_control(sk);
2174
2175        /* Cleanup up the write buffer. */
2176        tcp_write_queue_purge(sk);
2177
2178        /* Cleans up our, hopefully empty, out_of_order_queue. */
2179        __skb_queue_purge(&tp->out_of_order_queue);
2180
2181#ifdef CONFIG_TCP_MD5SIG
2182        /* Clean up the MD5 key list, if any */
2183        if (tp->md5sig_info) {
2184                tcp_clear_md5_list(sk);
2185                kfree_rcu(tp->md5sig_info, rcu);
2186                tp->md5sig_info = NULL;
2187        }
2188#endif
2189
2190#ifdef CONFIG_NET_DMA
2191        /* Cleans up our sk_async_wait_queue */
2192        __skb_queue_purge(&sk->sk_async_wait_queue);
2193#endif
2194
2195        /* Clean prequeue, it must be empty really */
2196        __skb_queue_purge(&tp->ucopy.prequeue);
2197
2198        /* Clean up a referenced TCP bind bucket. */
2199        if (inet_csk(sk)->icsk_bind_hash)
2200                inet_put_port(sk);
2201
2202        /* TCP Cookie Transactions */
2203        if (tp->cookie_values != NULL) {
2204                kref_put(&tp->cookie_values->kref,
2205                         tcp_cookie_values_release);
2206                tp->cookie_values = NULL;
2207        }
2208        BUG_ON(tp->fastopen_rsk != NULL);
2209
2210        /* If socket is aborted during connect operation */
2211        tcp_free_fastopen_req(tp);
2212
2213        sk_sockets_allocated_dec(sk);
2214        sock_release_memcg(sk);
2215}
2216EXPORT_SYMBOL(tcp_v4_destroy_sock);
2217
2218#ifdef CONFIG_PROC_FS
2219/* Proc filesystem TCP sock list dumping. */
2220
2221static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
2222{
2223        return hlist_nulls_empty(head) ? NULL :
2224                list_entry(head->first, struct inet_timewait_sock, tw_node);
2225}
2226
2227static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2228{
2229        return !is_a_nulls(tw->tw_node.next) ?
2230                hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2231}
2232
2233/*
2234 * Get next listener socket follow cur.  If cur is NULL, get first socket
2235 * starting from bucket given in st->bucket; when st->bucket is zero the
2236 * very first socket in the hash table is returned.
2237 */
2238static void *listening_get_next(struct seq_file *seq, void *cur)
2239{
2240        struct inet_connection_sock *icsk;
2241        struct hlist_nulls_node *node;
2242        struct sock *sk = cur;
2243        struct inet_listen_hashbucket *ilb;
2244        struct tcp_iter_state *st = seq->private;
2245        struct net *net = seq_file_net(seq);
2246
2247        if (!sk) {
2248                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2249                spin_lock_bh(&ilb->lock);
2250                sk = sk_nulls_head(&ilb->head);
2251                st->offset = 0;
2252                goto get_sk;
2253        }
2254        ilb = &tcp_hashinfo.listening_hash[st->bucket];
2255        ++st->num;
2256        ++st->offset;
2257
2258        if (st->state == TCP_SEQ_STATE_OPENREQ) {
2259                struct request_sock *req = cur;
2260
2261                icsk = inet_csk(st->syn_wait_sk);
2262                req = req->dl_next;
2263                while (1) {
2264                        while (req) {
2265                                if (req->rsk_ops->family == st->family) {
2266                                        cur = req;
2267                                        goto out;
2268                                }
2269                                req = req->dl_next;
2270                        }
2271                        if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2272                                break;
2273get_req:
2274                        req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2275                }
2276                sk        = sk_nulls_next(st->syn_wait_sk);
2277                st->state = TCP_SEQ_STATE_LISTENING;
2278                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2279        } else {
2280                icsk = inet_csk(sk);
2281                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2282                if (reqsk_queue_len(&icsk->icsk_accept_queue))
2283                        goto start_req;
2284                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2285                sk = sk_nulls_next(sk);
2286        }
2287get_sk:
2288        sk_nulls_for_each_from(sk, node) {
2289                if (!net_eq(sock_net(sk), net))
2290                        continue;
2291                if (sk->sk_family == st->family) {
2292                        cur = sk;
2293                        goto out;
2294                }
2295                icsk = inet_csk(sk);
2296                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2297                if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2298start_req:
2299                        st->uid         = sock_i_uid(sk);
2300                        st->syn_wait_sk = sk;
2301                        st->state       = TCP_SEQ_STATE_OPENREQ;
2302                        st->sbucket     = 0;
2303                        goto get_req;
2304                }
2305                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2306        }
2307        spin_unlock_bh(&ilb->lock);
2308        st->offset = 0;
2309        if (++st->bucket < INET_LHTABLE_SIZE) {
2310                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2311                spin_lock_bh(&ilb->lock);
2312                sk = sk_nulls_head(&ilb->head);
2313                goto get_sk;
2314        }
2315        cur = NULL;
2316out:
2317        return cur;
2318}
2319
2320static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2321{
2322        struct tcp_iter_state *st = seq->private;
2323        void *rc;
2324
2325        st->bucket = 0;
2326        st->offset = 0;
2327        rc = listening_get_next(seq, NULL);
2328
2329        while (rc && *pos) {
2330                rc = listening_get_next(seq, rc);
2331                --*pos;
2332        }
2333        return rc;
2334}
2335
2336static inline bool empty_bucket(struct tcp_iter_state *st)
2337{
2338        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2339                hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2340}
2341
2342/*
2343 * Get first established socket starting from bucket given in st->bucket.
2344 * If st->bucket is zero, the very first socket in the hash is returned.
2345 */
2346static void *established_get_first(struct seq_file *seq)
2347{
2348        struct tcp_iter_state *st = seq->private;
2349        struct net *net = seq_file_net(seq);
2350        void *rc = NULL;
2351
2352        st->offset = 0;
2353        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2354                struct sock *sk;
2355                struct hlist_nulls_node *node;
2356                struct inet_timewait_sock *tw;
2357                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2358
2359                /* Lockless fast path for the common case of empty buckets */
2360                if (empty_bucket(st))
2361                        continue;
2362
2363                spin_lock_bh(lock);
2364                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2365                        if (sk->sk_family != st->family ||
2366                            !net_eq(sock_net(sk), net)) {
2367                                continue;
2368                        }
2369                        rc = sk;
2370                        goto out;
2371                }
2372                st->state = TCP_SEQ_STATE_TIME_WAIT;
2373                inet_twsk_for_each(tw, node,
2374                                   &tcp_hashinfo.ehash[st->bucket].twchain) {
2375                        if (tw->tw_family != st->family ||
2376                            !net_eq(twsk_net(tw), net)) {
2377                                continue;
2378                        }
2379                        rc = tw;
2380                        goto out;
2381                }
2382                spin_unlock_bh(lock);
2383                st->state = TCP_SEQ_STATE_ESTABLISHED;
2384        }
2385out:
2386        return rc;
2387}
2388
2389static void *established_get_next(struct seq_file *seq, void *cur)
2390{
2391        struct sock *sk = cur;
2392        struct inet_timewait_sock *tw;
2393        struct hlist_nulls_node *node;
2394        struct tcp_iter_state *st = seq->private;
2395        struct net *net = seq_file_net(seq);
2396
2397        ++st->num;
2398        ++st->offset;
2399
2400        if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2401                tw = cur;
2402                tw = tw_next(tw);
2403get_tw:
2404                while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2405                        tw = tw_next(tw);
2406                }
2407                if (tw) {
2408                        cur = tw;
2409                        goto out;
2410                }
2411                spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2412                st->state = TCP_SEQ_STATE_ESTABLISHED;
2413
2414                /* Look for next non empty bucket */
2415                st->offset = 0;
2416                while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2417                                empty_bucket(st))
2418                        ;
2419                if (st->bucket > tcp_hashinfo.ehash_mask)
2420                        return NULL;
2421
2422                spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2423                sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2424        } else
2425                sk = sk_nulls_next(sk);
2426
2427        sk_nulls_for_each_from(sk, node) {
2428                if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2429                        goto found;
2430        }
2431
2432        st->state = TCP_SEQ_STATE_TIME_WAIT;
2433        tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2434        goto get_tw;
2435found:
2436        cur = sk;
2437out:
2438        return cur;
2439}
2440
2441static void *established_get_idx(struct seq_file *seq, loff_t pos)
2442{
2443        struct tcp_iter_state *st = seq->private;
2444        void *rc;
2445
2446        st->bucket = 0;
2447        rc = established_get_first(seq);
2448
2449        while (rc && pos) {
2450                rc = established_get_next(seq, rc);
2451                --pos;
2452        }
2453        return rc;
2454}
2455
2456static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2457{
2458        void *rc;
2459        struct tcp_iter_state *st = seq->private;
2460
2461        st->state = TCP_SEQ_STATE_LISTENING;
2462        rc        = listening_get_idx(seq, &pos);
2463
2464        if (!rc) {
2465                st->state = TCP_SEQ_STATE_ESTABLISHED;
2466                rc        = established_get_idx(seq, pos);
2467        }
2468
2469        return rc;
2470}
2471
2472static void *tcp_seek_last_pos(struct seq_file *seq)
2473{
2474        struct tcp_iter_state *st = seq->private;
2475        int offset = st->offset;
2476        int orig_num = st->num;
2477        void *rc = NULL;
2478
2479        switch (st->state) {
2480        case TCP_SEQ_STATE_OPENREQ:
2481        case TCP_SEQ_STATE_LISTENING:
2482                if (st->bucket >= INET_LHTABLE_SIZE)
2483                        break;
2484                st->state = TCP_SEQ_STATE_LISTENING;
2485                rc = listening_get_next(seq, NULL);
2486                while (offset-- && rc)
2487                        rc = listening_get_next(seq, rc);
2488                if (rc)
2489                        break;
2490                st->bucket = 0;
2491                /* Fallthrough */
2492        case TCP_SEQ_STATE_ESTABLISHED:
2493        case TCP_SEQ_STATE_TIME_WAIT:
2494                st->state = TCP_SEQ_STATE_ESTABLISHED;
2495                if (st->bucket > tcp_hashinfo.ehash_mask)
2496                        break;
2497                rc = established_get_first(seq);
2498                while (offset-- && rc)
2499                        rc = established_get_next(seq, rc);
2500        }
2501
2502        st->num = orig_num;
2503
2504        return rc;
2505}
2506
2507static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2508{
2509        struct tcp_iter_state *st = seq->private;
2510        void *rc;
2511
2512        if (*pos && *pos == st->last_pos) {
2513                rc = tcp_seek_last_pos(seq);
2514                if (rc)
2515                        goto out;
2516        }
2517
2518        st->state = TCP_SEQ_STATE_LISTENING;
2519        st->num = 0;
2520        st->bucket = 0;
2521        st->offset = 0;
2522        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2523
2524out:
2525        st->last_pos = *pos;
2526        return rc;
2527}
2528
2529static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2530{
2531        struct tcp_iter_state *st = seq->private;
2532        void *rc = NULL;
2533
2534        if (v == SEQ_START_TOKEN) {
2535                rc = tcp_get_idx(seq, 0);
2536                goto out;
2537        }
2538
2539        switch (st->state) {
2540        case TCP_SEQ_STATE_OPENREQ:
2541        case TCP_SEQ_STATE_LISTENING:
2542                rc = listening_get_next(seq, v);
2543                if (!rc) {
2544                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2545                        st->bucket = 0;
2546                        st->offset = 0;
2547                        rc        = established_get_first(seq);
2548                }
2549                break;
2550        case TCP_SEQ_STATE_ESTABLISHED:
2551        case TCP_SEQ_STATE_TIME_WAIT:
2552                rc = established_get_next(seq, v);
2553                break;
2554        }
2555out:
2556        ++*pos;
2557        st->last_pos = *pos;
2558        return rc;
2559}
2560
2561static void tcp_seq_stop(struct seq_file *seq, void *v)
2562{
2563        struct tcp_iter_state *st = seq->private;
2564
2565        switch (st->state) {
2566        case TCP_SEQ_STATE_OPENREQ:
2567                if (v) {
2568                        struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2569                        read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2570                }
2571        case TCP_SEQ_STATE_LISTENING:
2572                if (v != SEQ_START_TOKEN)
2573                        spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2574                break;
2575        case TCP_SEQ_STATE_TIME_WAIT:
2576        case TCP_SEQ_STATE_ESTABLISHED:
2577                if (v)
2578                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2579                break;
2580        }
2581}
2582
2583int tcp_seq_open(struct inode *inode, struct file *file)
2584{
2585        struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2586        struct tcp_iter_state *s;
2587        int err;
2588
2589        err = seq_open_net(inode, file, &afinfo->seq_ops,
2590                          sizeof(struct tcp_iter_state));
2591        if (err < 0)
2592                return err;
2593
2594        s = ((struct seq_file *)file->private_data)->private;
2595        s->family               = afinfo->family;
2596        s->last_pos             = 0;
2597        return 0;
2598}
2599EXPORT_SYMBOL(tcp_seq_open);
2600
2601int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2602{
2603        int rc = 0;
2604        struct proc_dir_entry *p;
2605
2606        afinfo->seq_ops.start           = tcp_seq_start;
2607        afinfo->seq_ops.next            = tcp_seq_next;
2608        afinfo->seq_ops.stop            = tcp_seq_stop;
2609
2610        p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2611                             afinfo->seq_fops, afinfo);
2612        if (!p)
2613                rc = -ENOMEM;
2614        return rc;
2615}
2616EXPORT_SYMBOL(tcp_proc_register);
2617
2618void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2619{
2620        proc_net_remove(net, afinfo->name);
2621}
2622EXPORT_SYMBOL(tcp_proc_unregister);
2623
2624static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2625                         struct seq_file *f, int i, kuid_t uid, int *len)
2626{
2627        const struct inet_request_sock *ireq = inet_rsk(req);
2628        long delta = req->expires - jiffies;
2629
2630        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2631                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2632                i,
2633                ireq->loc_addr,
2634                ntohs(inet_sk(sk)->inet_sport),
2635                ireq->rmt_addr,
2636                ntohs(ireq->rmt_port),
2637                TCP_SYN_RECV,
2638                0, 0, /* could print option size, but that is af dependent. */
2639                1,    /* timers active (only the expire timer) */
2640                jiffies_delta_to_clock_t(delta),
2641                req->retrans,
2642                from_kuid_munged(seq_user_ns(f), uid),
2643                0,  /* non standard timer */
2644                0, /* open_requests have no inode */
2645                atomic_read(&sk->sk_refcnt),
2646                req,
2647                len);
2648}
2649
2650static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2651{
2652        int timer_active;
2653        unsigned long timer_expires;
2654        const struct tcp_sock *tp = tcp_sk(sk);
2655        const struct inet_connection_sock *icsk = inet_csk(sk);
2656        const struct inet_sock *inet = inet_sk(sk);
2657        struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2658        __be32 dest = inet->inet_daddr;
2659        __be32 src = inet->inet_rcv_saddr;
2660        __u16 destp = ntohs(inet->inet_dport);
2661        __u16 srcp = ntohs(inet->inet_sport);
2662        int rx_queue;
2663
2664        if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2665                timer_active    = 1;
2666                timer_expires   = icsk->icsk_timeout;
2667        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2668                timer_active    = 4;
2669                timer_expires   = icsk->icsk_timeout;
2670        } else if (timer_pending(&sk->sk_timer)) {
2671                timer_active    = 2;
2672                timer_expires   = sk->sk_timer.expires;
2673        } else {
2674                timer_active    = 0;
2675                timer_expires = jiffies;
2676        }
2677
2678        if (sk->sk_state == TCP_LISTEN)
2679                rx_queue = sk->sk_ack_backlog;
2680        else
2681                /*
2682                 * because we dont lock socket, we might find a transient negative value
2683                 */
2684                rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2685
2686        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2687                        "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2688                i, src, srcp, dest, destp, sk->sk_state,
2689                tp->write_seq - tp->snd_una,
2690                rx_queue,
2691                timer_active,
2692                jiffies_delta_to_clock_t(timer_expires - jiffies),
2693                icsk->icsk_retransmits,
2694                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2695                icsk->icsk_probes_out,
2696                sock_i_ino(sk),
2697                atomic_read(&sk->sk_refcnt), sk,
2698                jiffies_to_clock_t(icsk->icsk_rto),
2699                jiffies_to_clock_t(icsk->icsk_ack.ato),
2700                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2701                tp->snd_cwnd,
2702                sk->sk_state == TCP_LISTEN ?
2703                    (fastopenq ? fastopenq->max_qlen : 0) :
2704                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2705                len);
2706}
2707
2708static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2709                               struct seq_file *f, int i, int *len)
2710{
2711        __be32 dest, src;
2712        __u16 destp, srcp;
2713        long delta = tw->tw_ttd - jiffies;
2714
2715        dest  = tw->tw_daddr;
2716        src   = tw->tw_rcv_saddr;
2717        destp = ntohs(tw->tw_dport);
2718        srcp  = ntohs(tw->tw_sport);
2719
2720        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2721                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2722                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2723                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2724                atomic_read(&tw->tw_refcnt), tw, len);
2725}
2726
2727#define TMPSZ 150
2728
2729static int tcp4_seq_show(struct seq_file *seq, void *v)
2730{
2731        struct tcp_iter_state *st;
2732        int len;
2733
2734        if (v == SEQ_START_TOKEN) {
2735                seq_printf(seq, "%-*s\n", TMPSZ - 1,
2736                           "  sl  local_address rem_address   st tx_queue "
2737                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2738                           "inode");
2739                goto out;
2740        }
2741        st = seq->private;
2742
2743        switch (st->state) {
2744        case TCP_SEQ_STATE_LISTENING:
2745        case TCP_SEQ_STATE_ESTABLISHED:
2746                get_tcp4_sock(v, seq, st->num, &len);
2747                break;
2748        case TCP_SEQ_STATE_OPENREQ:
2749                get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2750                break;
2751        case TCP_SEQ_STATE_TIME_WAIT:
2752                get_timewait4_sock(v, seq, st->num, &len);
2753                break;
2754        }
2755        seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2756out:
2757        return 0;
2758}
2759
2760static const struct file_operations tcp_afinfo_seq_fops = {
2761        .owner   = THIS_MODULE,
2762        .open    = tcp_seq_open,
2763        .read    = seq_read,
2764        .llseek  = seq_lseek,
2765        .release = seq_release_net
2766};
2767
2768static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2769        .name           = "tcp",
2770        .family         = AF_INET,
2771        .seq_fops       = &tcp_afinfo_seq_fops,
2772        .seq_ops        = {
2773                .show           = tcp4_seq_show,
2774        },
2775};
2776
2777static int __net_init tcp4_proc_init_net(struct net *net)
2778{
2779        return tcp_proc_register(net, &tcp4_seq_afinfo);
2780}
2781
2782static void __net_exit tcp4_proc_exit_net(struct net *net)
2783{
2784        tcp_proc_unregister(net, &tcp4_seq_afinfo);
2785}
2786
2787static struct pernet_operations tcp4_net_ops = {
2788        .init = tcp4_proc_init_net,
2789        .exit = tcp4_proc_exit_net,
2790};
2791
2792int __init tcp4_proc_init(void)
2793{
2794        return register_pernet_subsys(&tcp4_net_ops);
2795}
2796
2797void tcp4_proc_exit(void)
2798{
2799        unregister_pernet_subsys(&tcp4_net_ops);
2800}
2801#endif /* CONFIG_PROC_FS */
2802
2803struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2804{
2805        const struct iphdr *iph = skb_gro_network_header(skb);
2806        __wsum wsum;
2807        __sum16 sum;
2808
2809        switch (skb->ip_summed) {
2810        case CHECKSUM_COMPLETE:
2811                if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2812                                  skb->csum)) {
2813                        skb->ip_summed = CHECKSUM_UNNECESSARY;
2814                        break;
2815                }
2816flush:
2817                NAPI_GRO_CB(skb)->flush = 1;
2818                return NULL;
2819
2820        case CHECKSUM_NONE:
2821                wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
2822                                          skb_gro_len(skb), IPPROTO_TCP, 0);
2823                sum = csum_fold(skb_checksum(skb,
2824                                             skb_gro_offset(skb),
2825                                             skb_gro_len(skb),
2826                                             wsum));
2827                if (sum)
2828                        goto flush;
2829
2830                skb->ip_summed = CHECKSUM_UNNECESSARY;
2831                break;
2832        }
2833
2834        return tcp_gro_receive(head, skb);
2835}
2836
2837int tcp4_gro_complete(struct sk_buff *skb)
2838{
2839        const struct iphdr *iph = ip_hdr(skb);
2840        struct tcphdr *th = tcp_hdr(skb);
2841
2842        th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2843                                  iph->saddr, iph->daddr, 0);
2844        skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2845
2846        return tcp_gro_complete(skb);
2847}
2848
2849struct proto tcp_prot = {
2850        .name                   = "TCP",
2851        .owner                  = THIS_MODULE,
2852        .close                  = tcp_close,
2853        .connect                = tcp_v4_connect,
2854        .disconnect             = tcp_disconnect,
2855        .accept                 = inet_csk_accept,
2856        .ioctl                  = tcp_ioctl,
2857        .init                   = tcp_v4_init_sock,
2858        .destroy                = tcp_v4_destroy_sock,
2859        .shutdown               = tcp_shutdown,
2860        .setsockopt             = tcp_setsockopt,
2861        .getsockopt             = tcp_getsockopt,
2862        .recvmsg                = tcp_recvmsg,
2863        .sendmsg                = tcp_sendmsg,
2864        .sendpage               = tcp_sendpage,
2865        .backlog_rcv            = tcp_v4_do_rcv,
2866        .release_cb             = tcp_release_cb,
2867        .mtu_reduced            = tcp_v4_mtu_reduced,
2868        .hash                   = inet_hash,
2869        .unhash                 = inet_unhash,
2870        .get_port               = inet_csk_get_port,
2871        .enter_memory_pressure  = tcp_enter_memory_pressure,
2872        .sockets_allocated      = &tcp_sockets_allocated,
2873        .orphan_count           = &tcp_orphan_count,
2874        .memory_allocated       = &tcp_memory_allocated,
2875        .memory_pressure        = &tcp_memory_pressure,
2876        .sysctl_wmem            = sysctl_tcp_wmem,
2877        .sysctl_rmem            = sysctl_tcp_rmem,
2878        .max_header             = MAX_TCP_HEADER,
2879        .obj_size               = sizeof(struct tcp_sock),
2880        .slab_flags             = SLAB_DESTROY_BY_RCU,
2881        .twsk_prot              = &tcp_timewait_sock_ops,
2882        .rsk_prot               = &tcp_request_sock_ops,
2883        .h.hashinfo             = &tcp_hashinfo,
2884        .no_autobind            = true,
2885#ifdef CONFIG_COMPAT
2886        .compat_setsockopt      = compat_tcp_setsockopt,
2887        .compat_getsockopt      = compat_tcp_getsockopt,
2888#endif
2889#ifdef CONFIG_MEMCG_KMEM
2890        .init_cgroup            = tcp_init_cgroup,
2891        .destroy_cgroup         = tcp_destroy_cgroup,
2892        .proto_cgroup           = tcp_proto_cgroup,
2893#endif
2894};
2895EXPORT_SYMBOL(tcp_prot);
2896
2897static int __net_init tcp_sk_init(struct net *net)
2898{
2899        return 0;
2900}
2901
2902static void __net_exit tcp_sk_exit(struct net *net)
2903{
2904}
2905
2906static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2907{
2908        inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2909}
2910
2911static struct pernet_operations __net_initdata tcp_sk_ops = {
2912       .init       = tcp_sk_init,
2913       .exit       = tcp_sk_exit,
2914       .exit_batch = tcp_sk_exit_batch,
2915};
2916
2917void __init tcp_v4_init(void)
2918{
2919        inet_hashinfo_init(&tcp_hashinfo);
2920        if (register_pernet_subsys(&tcp_sk_ops))
2921                panic("Failed to create the TCP control socket.\n");
2922}
2923
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.