linux/net/ipv4/tcp_ipv4.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 *              IPv4 specific functions
   9 *
  10 *
  11 *              code split from:
  12 *              linux/ipv4/tcp.c
  13 *              linux/ipv4/tcp_input.c
  14 *              linux/ipv4/tcp_output.c
  15 *
  16 *              See tcp.c for author information
  17 *
  18 *      This program is free software; you can redistribute it and/or
  19 *      modify it under the terms of the GNU General Public License
  20 *      as published by the Free Software Foundation; either version
  21 *      2 of the License, or (at your option) any later version.
  22 */
  23
  24/*
  25 * Changes:
  26 *              David S. Miller :       New socket lookup architecture.
  27 *                                      This code is dedicated to John Dyson.
  28 *              David S. Miller :       Change semantics of established hash,
  29 *                                      half is devoted to TIME_WAIT sockets
  30 *                                      and the rest go in the other half.
  31 *              Andi Kleen :            Add support for syncookies and fixed
  32 *                                      some bugs: ip options weren't passed to
  33 *                                      the TCP layer, missed a check for an
  34 *                                      ACK bit.
  35 *              Andi Kleen :            Implemented fast path mtu discovery.
  36 *                                      Fixed many serious bugs in the
  37 *                                      request_sock handling and moved
  38 *                                      most of it into the af independent code.
  39 *                                      Added tail drop and some other bugfixes.
  40 *                                      Added new listen semantics.
  41 *              Mike McLagan    :       Routing by source
  42 *      Juan Jose Ciarlante:            ip_dynaddr bits
  43 *              Andi Kleen:             various fixes.
  44 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45 *                                      coma.
  46 *      Andi Kleen              :       Fix new listen.
  47 *      Andi Kleen              :       Fix accept error reporting.
  48 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50 *                                      a single port at the same time.
  51 */
  52
  53#define pr_fmt(fmt) "TCP: " fmt
  54
  55#include <linux/bottom_half.h>
  56#include <linux/types.h>
  57#include <linux/fcntl.h>
  58#include <linux/module.h>
  59#include <linux/random.h>
  60#include <linux/cache.h>
  61#include <linux/jhash.h>
  62#include <linux/init.h>
  63#include <linux/times.h>
  64#include <linux/slab.h>
  65
  66#include <net/net_namespace.h>
  67#include <net/icmp.h>
  68#include <net/inet_hashtables.h>
  69#include <net/tcp.h>
  70#include <net/transp_v6.h>
  71#include <net/ipv6.h>
  72#include <net/inet_common.h>
  73#include <net/timewait_sock.h>
  74#include <net/xfrm.h>
  75#include <net/secure_seq.h>
  76#include <net/busy_poll.h>
  77
  78#include <linux/inet.h>
  79#include <linux/ipv6.h>
  80#include <linux/stddef.h>
  81#include <linux/proc_fs.h>
  82#include <linux/seq_file.h>
  83#include <linux/inetdevice.h>
  84
  85#include <crypto/hash.h>
  86#include <linux/scatterlist.h>
  87
  88#include <trace/events/tcp.h>
  89
  90#ifdef CONFIG_TCP_MD5SIG
  91static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  92                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
  93#endif
  94
  95struct inet_hashinfo tcp_hashinfo;
  96EXPORT_SYMBOL(tcp_hashinfo);
  97
  98static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  99{
 100        return secure_tcp_seq(ip_hdr(skb)->daddr,
 101                              ip_hdr(skb)->saddr,
 102                              tcp_hdr(skb)->dest,
 103                              tcp_hdr(skb)->source);
 104}
 105
 106static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 107{
 108        return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 109}
 110
 111int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 112{
 113        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 114        struct tcp_sock *tp = tcp_sk(sk);
 115
 116        /* With PAWS, it is safe from the viewpoint
 117           of data integrity. Even without PAWS it is safe provided sequence
  118           spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
  119
  120           Actually, the idea is close to VJ's, only the timestamp cache is
  121           held not per host but per port pair, and the TW bucket is used as
  122           the state holder.
  123
  124           If the TW bucket has already been destroyed we fall back to VJ's
  125           scheme and use the initial timestamp retrieved from the peer table.
 126         */
 127        if (tcptw->tw_ts_recent_stamp &&
 128            (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
 129                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 130                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 131                if (tp->write_seq == 0)
 132                        tp->write_seq = 1;
 133                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 134                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 135                sock_hold(sktw);
 136                return 1;
 137        }
 138
 139        return 0;
 140}
 141EXPORT_SYMBOL_GPL(tcp_twsk_unique);
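
/*
 * Illustrative sketch, not part of tcp_ipv4.c: the reuse test in
 * tcp_twsk_unique() above reduced to its inputs. The helper and its
 * parameter names are hypothetical; only the logic mirrors the code above.
 */
#if 0
static bool tw_pair_reusable(u32 ts_recent_stamp, int sysctl_tw_reuse,
			     u32 now_seconds, bool from_timewait_recycle)
{
	/* Reuse is allowed when the TIME-WAIT socket recorded a peer
	 * timestamp (so PAWS can reject stray old segments), and either we
	 * come from the time-wait recycling path itself (twp == NULL above),
	 * or net.ipv4.tcp_tw_reuse is enabled and more than one second has
	 * passed since the last segment of the old connection.
	 */
	return ts_recent_stamp &&
	       (from_timewait_recycle ||
		(sysctl_tw_reuse && now_seconds - ts_recent_stamp > 1));
}
#endif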
 142
 143/* This will initiate an outgoing connection. */
 144int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 145{
 146        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 147        struct inet_sock *inet = inet_sk(sk);
 148        struct tcp_sock *tp = tcp_sk(sk);
 149        __be16 orig_sport, orig_dport;
 150        __be32 daddr, nexthop;
 151        struct flowi4 *fl4;
 152        struct rtable *rt;
 153        int err;
 154        struct ip_options_rcu *inet_opt;
 155        struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 156
 157        if (addr_len < sizeof(struct sockaddr_in))
 158                return -EINVAL;
 159
 160        if (usin->sin_family != AF_INET)
 161                return -EAFNOSUPPORT;
 162
 163        nexthop = daddr = usin->sin_addr.s_addr;
 164        inet_opt = rcu_dereference_protected(inet->inet_opt,
 165                                             lockdep_sock_is_held(sk));
 166        if (inet_opt && inet_opt->opt.srr) {
 167                if (!daddr)
 168                        return -EINVAL;
 169                nexthop = inet_opt->opt.faddr;
 170        }
 171
 172        orig_sport = inet->inet_sport;
 173        orig_dport = usin->sin_port;
 174        fl4 = &inet->cork.fl.u.ip4;
 175        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 176                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 177                              IPPROTO_TCP,
 178                              orig_sport, orig_dport, sk);
 179        if (IS_ERR(rt)) {
 180                err = PTR_ERR(rt);
 181                if (err == -ENETUNREACH)
 182                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 183                return err;
 184        }
 185
 186        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 187                ip_rt_put(rt);
 188                return -ENETUNREACH;
 189        }
 190
 191        if (!inet_opt || !inet_opt->opt.srr)
 192                daddr = fl4->daddr;
 193
 194        if (!inet->inet_saddr)
 195                inet->inet_saddr = fl4->saddr;
 196        sk_rcv_saddr_set(sk, inet->inet_saddr);
 197
 198        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 199                /* Reset inherited state */
 200                tp->rx_opt.ts_recent       = 0;
 201                tp->rx_opt.ts_recent_stamp = 0;
 202                if (likely(!tp->repair))
 203                        tp->write_seq      = 0;
 204        }
 205
 206        inet->inet_dport = usin->sin_port;
 207        sk_daddr_set(sk, daddr);
 208
 209        inet_csk(sk)->icsk_ext_hdr_len = 0;
 210        if (inet_opt)
 211                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 212
 213        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 214
 215        /* Socket identity is still unknown (sport may be zero).
  216         * However, we set the state to SYN-SENT and, without releasing the
  217         * socket lock, select a source port, enter ourselves into the hash
  218         * tables and complete initialization after this.
 219         */
 220        tcp_set_state(sk, TCP_SYN_SENT);
 221        err = inet_hash_connect(tcp_death_row, sk);
 222        if (err)
 223                goto failure;
 224
 225        sk_set_txhash(sk);
 226
 227        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 228                               inet->inet_sport, inet->inet_dport, sk);
 229        if (IS_ERR(rt)) {
 230                err = PTR_ERR(rt);
 231                rt = NULL;
 232                goto failure;
 233        }
 234        /* OK, now commit destination to socket.  */
 235        sk->sk_gso_type = SKB_GSO_TCPV4;
 236        sk_setup_caps(sk, &rt->dst);
 237        rt = NULL;
 238
 239        if (likely(!tp->repair)) {
 240                if (!tp->write_seq)
 241                        tp->write_seq = secure_tcp_seq(inet->inet_saddr,
 242                                                       inet->inet_daddr,
 243                                                       inet->inet_sport,
 244                                                       usin->sin_port);
 245                tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
 246                                                 inet->inet_saddr,
 247                                                 inet->inet_daddr);
 248        }
 249
 250        inet->inet_id = tp->write_seq ^ jiffies;
 251
 252        if (tcp_fastopen_defer_connect(sk, &err))
 253                return err;
 254        if (err)
 255                goto failure;
 256
 257        err = tcp_connect(sk);
 258
 259        if (err)
 260                goto failure;
 261
 262        return 0;
 263
 264failure:
 265        /*
 266         * This unhashes the socket and releases the local port,
 267         * if necessary.
 268         */
 269        tcp_set_state(sk, TCP_CLOSE);
 270        ip_rt_put(rt);
 271        sk->sk_route_caps = 0;
 272        inet->inet_dport = 0;
 273        return err;
 274}
 275EXPORT_SYMBOL(tcp_v4_connect);
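
/*
 * Illustrative userspace sketch, not part of tcp_ipv4.c: the connect()
 * call that, for an AF_INET stream socket, ends up in tcp_v4_connect()
 * above. The destination 192.0.2.1:80 is a made-up example address.
 */
#if 0
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

static int example_connect(void)
{
	struct sockaddr_in dst = { .sin_family = AF_INET,
				   .sin_port   = htons(80) };
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
	/* addr_len must be >= sizeof(struct sockaddr_in), as checked above */
	return connect(fd, (struct sockaddr *)&dst, sizeof(dst));
}
#endif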
 276
 277/*
 278 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
  279 * It can be called through tcp_release_cb() if the socket was owned by the
  280 * user at the time tcp_v4_err() was called to handle the ICMP message.
 281 */
 282void tcp_v4_mtu_reduced(struct sock *sk)
 283{
 284        struct inet_sock *inet = inet_sk(sk);
 285        struct dst_entry *dst;
 286        u32 mtu;
 287
 288        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 289                return;
 290        mtu = tcp_sk(sk)->mtu_info;
 291        dst = inet_csk_update_pmtu(sk, mtu);
 292        if (!dst)
 293                return;
 294
  295        /* Something is about to go wrong... Remember the soft error
  296         * in case this connection is not able to recover.
 297         */
 298        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 299                sk->sk_err_soft = EMSGSIZE;
 300
 301        mtu = dst_mtu(dst);
 302
 303        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 304            ip_sk_accept_pmtu(sk) &&
 305            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 306                tcp_sync_mss(sk, mtu);
 307
 308                /* Resend the TCP packet because it's
 309                 * clear that the old packet has been
 310                 * dropped. This is the new "fast" path mtu
 311                 * discovery.
 312                 */
 313                tcp_simple_retransmit(sk);
 314        } /* else let the usual retransmit timer handle it */
 315}
 316EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 317
 318static void do_redirect(struct sk_buff *skb, struct sock *sk)
 319{
 320        struct dst_entry *dst = __sk_dst_check(sk, 0);
 321
 322        if (dst)
 323                dst->ops->redirect(dst, sk, skb);
 324}
 325
 326
 327/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 328void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 329{
 330        struct request_sock *req = inet_reqsk(sk);
 331        struct net *net = sock_net(sk);
 332
 333        /* ICMPs are not backlogged, hence we cannot get
 334         * an established socket here.
 335         */
 336        if (seq != tcp_rsk(req)->snt_isn) {
 337                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 338        } else if (abort) {
 339                /*
 340                 * Still in SYN_RECV, just remove it silently.
 341                 * There is no good way to pass the error to the newly
 342                 * created socket, and POSIX does not want network
 343                 * errors returned from accept().
 344                 */
 345                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 346                tcp_listendrop(req->rsk_listener);
 347        }
 348        reqsk_put(req);
 349}
 350EXPORT_SYMBOL(tcp_req_err);
 351
 352/*
 353 * This routine is called by the ICMP module when it gets some
 354 * sort of error condition.  If err < 0 then the socket should
 355 * be closed and the error returned to the user.  If err > 0
 356 * it's just the icmp type << 8 | icmp code.  After adjustment
 357 * header points to the first 8 bytes of the tcp header.  We need
 358 * to find the appropriate port.
 359 *
 360 * The locking strategy used here is very "optimistic". When
 361 * someone else accesses the socket the ICMP is just dropped
 362 * and for some paths there is no check at all.
 363 * A more general error queue to queue errors for later handling
 364 * is probably better.
 365 *
  366 */
 367
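/*
 * Illustrative sketch, not part of tcp_ipv4.c: how the positive err values
 * described in the comment above pack the ICMP type and code into one int.
 * The helper names are hypothetical.
 */
#if 0
static inline int icmp_err_pack(u8 type, u8 code)
{
	/* e.g. ICMP_DEST_UNREACH (3) / ICMP_PORT_UNREACH (3) -> 0x0303 */
	return (type << 8) | code;
}

static inline void icmp_err_unpack(int err, u8 *type, u8 *code)
{
	*type = err >> 8;
	*code = err & 0xff;
}
#endif
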
 368void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 369{
 370        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 371        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 372        struct inet_connection_sock *icsk;
 373        struct tcp_sock *tp;
 374        struct inet_sock *inet;
 375        const int type = icmp_hdr(icmp_skb)->type;
 376        const int code = icmp_hdr(icmp_skb)->code;
 377        struct sock *sk;
 378        struct sk_buff *skb;
 379        struct request_sock *fastopen;
 380        u32 seq, snd_una;
 381        s32 remaining;
 382        u32 delta_us;
 383        int err;
 384        struct net *net = dev_net(icmp_skb->dev);
 385
 386        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 387                                       th->dest, iph->saddr, ntohs(th->source),
 388                                       inet_iif(icmp_skb), 0);
 389        if (!sk) {
 390                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 391                return;
 392        }
 393        if (sk->sk_state == TCP_TIME_WAIT) {
 394                inet_twsk_put(inet_twsk(sk));
 395                return;
 396        }
 397        seq = ntohl(th->seq);
 398        if (sk->sk_state == TCP_NEW_SYN_RECV)
 399                return tcp_req_err(sk, seq,
 400                                  type == ICMP_PARAMETERPROB ||
 401                                  type == ICMP_TIME_EXCEEDED ||
 402                                  (type == ICMP_DEST_UNREACH &&
 403                                   (code == ICMP_NET_UNREACH ||
 404                                    code == ICMP_HOST_UNREACH)));
 405
 406        bh_lock_sock(sk);
 407        /* If too many ICMPs get dropped on busy
 408         * servers this needs to be solved differently.
  409         * We do take care of the PMTU discovery (RFC 1191) special case:
  410         * we can receive locally generated ICMP messages while the socket is held.
 411         */
 412        if (sock_owned_by_user(sk)) {
 413                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 414                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 415        }
 416        if (sk->sk_state == TCP_CLOSE)
 417                goto out;
 418
 419        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 420                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 421                goto out;
 422        }
 423
 424        icsk = inet_csk(sk);
 425        tp = tcp_sk(sk);
  426        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()). */
 427        fastopen = tp->fastopen_rsk;
 428        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 429        if (sk->sk_state != TCP_LISTEN &&
 430            !between(seq, snd_una, tp->snd_nxt)) {
 431                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 432                goto out;
 433        }
 434
 435        switch (type) {
 436        case ICMP_REDIRECT:
 437                if (!sock_owned_by_user(sk))
 438                        do_redirect(icmp_skb, sk);
 439                goto out;
 440        case ICMP_SOURCE_QUENCH:
 441                /* Just silently ignore these. */
 442                goto out;
 443        case ICMP_PARAMETERPROB:
 444                err = EPROTO;
 445                break;
 446        case ICMP_DEST_UNREACH:
 447                if (code > NR_ICMP_UNREACH)
 448                        goto out;
 449
 450                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 451                        /* We are not interested in TCP_LISTEN and open_requests
  452                         * (SYN-ACKs sent out by Linux are always < 576 bytes so
 453                         * they should go through unfragmented).
 454                         */
 455                        if (sk->sk_state == TCP_LISTEN)
 456                                goto out;
 457
 458                        tp->mtu_info = info;
 459                        if (!sock_owned_by_user(sk)) {
 460                                tcp_v4_mtu_reduced(sk);
 461                        } else {
 462                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 463                                        sock_hold(sk);
 464                        }
 465                        goto out;
 466                }
 467
 468                err = icmp_err_convert[code].errno;
 469                /* check if icmp_skb allows revert of backoff
 470                 * (see draft-zimmermann-tcp-lcd) */
 471                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 472                        break;
 473                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 474                    !icsk->icsk_backoff || fastopen)
 475                        break;
 476
 477                if (sock_owned_by_user(sk))
 478                        break;
 479
 480                icsk->icsk_backoff--;
 481                icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 482                                               TCP_TIMEOUT_INIT;
 483                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 484
 485                skb = tcp_rtx_queue_head(sk);
 486                BUG_ON(!skb);
 487
 488                tcp_mstamp_refresh(tp);
 489                delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
 490                remaining = icsk->icsk_rto -
 491                            usecs_to_jiffies(delta_us);
 492
 493                if (remaining > 0) {
 494                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 495                                                  remaining, TCP_RTO_MAX);
 496                } else {
 497                        /* RTO revert clocked out retransmission.
 498                         * Will retransmit now */
 499                        tcp_retransmit_timer(sk);
 500                }
 501
 502                break;
 503        case ICMP_TIME_EXCEEDED:
 504                err = EHOSTUNREACH;
 505                break;
 506        default:
 507                goto out;
 508        }
 509
 510        switch (sk->sk_state) {
 511        case TCP_SYN_SENT:
 512        case TCP_SYN_RECV:
 513                /* Only in fast or simultaneous open. If a fast open socket is
  514                 * already accepted it is treated as a connected one below.
 515                 */
 516                if (fastopen && !fastopen->sk)
 517                        break;
 518
 519                if (!sock_owned_by_user(sk)) {
 520                        sk->sk_err = err;
 521
 522                        sk->sk_error_report(sk);
 523
 524                        tcp_done(sk);
 525                } else {
 526                        sk->sk_err_soft = err;
 527                }
 528                goto out;
 529        }
 530
 531        /* If we've already connected we will keep trying
 532         * until we time out, or the user gives up.
 533         *
  534         * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
  535         * treated as hard errors (well, FRAG_FAILED too,
  536         * but it is obsoleted by PMTU discovery).
  537         *
  538         * Note that in the modern internet, where routing is unreliable and
  539         * broken firewalls sit in every dark corner sending random errors as
  540         * ordered by their masters, even these two messages have finally lost
  541         * their original sense (even Linux sends invalid PORT_UNREACHs).
 542         *
 543         * Now we are in compliance with RFCs.
 544         *                                                      --ANK (980905)
 545         */
 546
 547        inet = inet_sk(sk);
 548        if (!sock_owned_by_user(sk) && inet->recverr) {
 549                sk->sk_err = err;
 550                sk->sk_error_report(sk);
 551        } else  { /* Only an error on timeout */
 552                sk->sk_err_soft = err;
 553        }
 554
 555out:
 556        bh_unlock_sock(sk);
 557        sock_put(sk);
 558}
 559
 560void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 561{
 562        struct tcphdr *th = tcp_hdr(skb);
 563
 564        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 565                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 566                skb->csum_start = skb_transport_header(skb) - skb->head;
 567                skb->csum_offset = offsetof(struct tcphdr, check);
 568        } else {
 569                th->check = tcp_v4_check(skb->len, saddr, daddr,
 570                                         csum_partial(th,
 571                                                      th->doff << 2,
 572                                                      skb->csum));
 573        }
 574}
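
/*
 * Illustrative sketch, not part of this function: the IPv4 pseudo-header
 * that tcp_v4_check()/csum_tcpudp_nofold() fold into the checksum above.
 * The same layout appears further down as struct tcp4_pseudohdr, which is
 * reused for the MD5 signature computation.
 */
#if 0
struct tcp4_pseudohdr_example {
	__be32	saddr;		/* IPv4 source address */
	__be32	daddr;		/* IPv4 destination address */
	__u8	pad;		/* always zero */
	__u8	protocol;	/* IPPROTO_TCP */
	__be16	len;		/* TCP header + payload length */
};
#endif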
 575
 576/* This routine computes an IPv4 TCP checksum. */
 577void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 578{
 579        const struct inet_sock *inet = inet_sk(sk);
 580
 581        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 582}
 583EXPORT_SYMBOL(tcp_v4_send_check);
 584
 585/*
 586 *      This routine will send an RST to the other tcp.
 587 *
  588 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
  589 *                    for the reset?
  590 *      Answer: if a packet caused an RST, it is not for a socket
  591 *              existing in our system; if it is matched to a socket,
  592 *              it is just a duplicate segment or a bug in the other side's TCP.
  593 *              So we build the reply based only on the parameters that
  594 *              arrived with the segment.
 595 *      Exception: precedence violation. We do not implement it in any case.
 596 */
 597
 598static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 599{
 600        const struct tcphdr *th = tcp_hdr(skb);
 601        struct {
 602                struct tcphdr th;
 603#ifdef CONFIG_TCP_MD5SIG
 604                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 605#endif
 606        } rep;
 607        struct ip_reply_arg arg;
 608#ifdef CONFIG_TCP_MD5SIG
 609        struct tcp_md5sig_key *key = NULL;
 610        const __u8 *hash_location = NULL;
 611        unsigned char newhash[16];
 612        int genhash;
 613        struct sock *sk1 = NULL;
 614#endif
 615        struct net *net;
 616
 617        /* Never send a reset in response to a reset. */
 618        if (th->rst)
 619                return;
 620
  621        /* If sk is not NULL, it means we did a successful lookup and the
  622         * incoming route had to be correct. The prequeue might have dropped our dst.
 623         */
 624        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 625                return;
 626
 627        /* Swap the send and the receive. */
 628        memset(&rep, 0, sizeof(rep));
 629        rep.th.dest   = th->source;
 630        rep.th.source = th->dest;
 631        rep.th.doff   = sizeof(struct tcphdr) / 4;
 632        rep.th.rst    = 1;
 633
 634        if (th->ack) {
 635                rep.th.seq = th->ack_seq;
 636        } else {
 637                rep.th.ack = 1;
 638                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 639                                       skb->len - (th->doff << 2));
 640        }
 641
 642        memset(&arg, 0, sizeof(arg));
 643        arg.iov[0].iov_base = (unsigned char *)&rep;
 644        arg.iov[0].iov_len  = sizeof(rep.th);
 645
 646        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 647#ifdef CONFIG_TCP_MD5SIG
 648        rcu_read_lock();
 649        hash_location = tcp_parse_md5sig_option(th);
 650        if (sk && sk_fullsock(sk)) {
 651                key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 652                                        &ip_hdr(skb)->saddr, AF_INET);
 653        } else if (hash_location) {
 654                /*
  655                 * The active side is lost. Try to find the listening socket
  656                 * through the source port, and then find the md5 key through
  657                 * the listening socket. We do not loosen security here:
  658                 * the incoming packet is checked against the md5 hash of the
  659                 * found key, and no RST is generated if the hash doesn't match.
 660                 */
 661                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 662                                             ip_hdr(skb)->saddr,
 663                                             th->source, ip_hdr(skb)->daddr,
 664                                             ntohs(th->source), inet_iif(skb),
 665                                             tcp_v4_sdif(skb));
 666                /* don't send rst if it can't find key */
 667                if (!sk1)
 668                        goto out;
 669
 670                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 671                                        &ip_hdr(skb)->saddr, AF_INET);
 672                if (!key)
 673                        goto out;
 674
 675
 676                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 677                if (genhash || memcmp(hash_location, newhash, 16) != 0)
 678                        goto out;
 679
 680        }
 681
 682        if (key) {
 683                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 684                                   (TCPOPT_NOP << 16) |
 685                                   (TCPOPT_MD5SIG << 8) |
 686                                   TCPOLEN_MD5SIG);
 687                /* Update length and the length the header thinks exists */
 688                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 689                rep.th.doff = arg.iov[0].iov_len / 4;
 690
 691                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 692                                     key, ip_hdr(skb)->saddr,
 693                                     ip_hdr(skb)->daddr, &rep.th);
 694        }
 695#endif
 696        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 697                                      ip_hdr(skb)->saddr, /* XXX */
 698                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 699        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 700        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 701
  702        /* When the socket is gone, all binding information is lost and
  703         * routing might fail in this case. No choice here: if we choose to force
  704         * the input interface, we will misroute in case of an asymmetric route.
 705         */
 706        if (sk) {
 707                arg.bound_dev_if = sk->sk_bound_dev_if;
 708                if (sk_fullsock(sk))
 709                        trace_tcp_send_reset(sk, skb);
 710        }
 711
 712        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 713                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 714
 715        arg.tos = ip_hdr(skb)->tos;
 716        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 717        local_bh_disable();
 718        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
 719                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 720                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 721                              &arg, arg.iov[0].iov_len);
 722
 723        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 724        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 725        local_bh_enable();
 726
 727#ifdef CONFIG_TCP_MD5SIG
 728out:
 729        rcu_read_unlock();
 730#endif
 731}
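
/*
 * Illustrative sketch, not part of tcp_ipv4.c: the ACK number chosen above
 * for a reset answering a segment that carried no ACK. It acknowledges
 * everything the offending segment occupied in sequence space: one unit
 * each for SYN and FIN, plus the payload bytes. The helper is hypothetical.
 */
#if 0
static u32 rst_ack_seq(const struct tcphdr *th, unsigned int seg_len)
{
	/* seg_len is the TCP header plus payload; th->doff << 2 removes
	 * the header so only payload bytes are counted.
	 */
	return ntohl(th->seq) + th->syn + th->fin + seg_len - (th->doff << 2);
}
#endif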
 732
  733/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
  734   outside socket context, is certainly ugly. What can I do?
 735 */
 736
 737static void tcp_v4_send_ack(const struct sock *sk,
 738                            struct sk_buff *skb, u32 seq, u32 ack,
 739                            u32 win, u32 tsval, u32 tsecr, int oif,
 740                            struct tcp_md5sig_key *key,
 741                            int reply_flags, u8 tos)
 742{
 743        const struct tcphdr *th = tcp_hdr(skb);
 744        struct {
 745                struct tcphdr th;
 746                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 747#ifdef CONFIG_TCP_MD5SIG
 748                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 749#endif
 750                        ];
 751        } rep;
 752        struct net *net = sock_net(sk);
 753        struct ip_reply_arg arg;
 754
 755        memset(&rep.th, 0, sizeof(struct tcphdr));
 756        memset(&arg, 0, sizeof(arg));
 757
 758        arg.iov[0].iov_base = (unsigned char *)&rep;
 759        arg.iov[0].iov_len  = sizeof(rep.th);
 760        if (tsecr) {
 761                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 762                                   (TCPOPT_TIMESTAMP << 8) |
 763                                   TCPOLEN_TIMESTAMP);
 764                rep.opt[1] = htonl(tsval);
 765                rep.opt[2] = htonl(tsecr);
 766                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 767        }
 768
 769        /* Swap the send and the receive. */
 770        rep.th.dest    = th->source;
 771        rep.th.source  = th->dest;
 772        rep.th.doff    = arg.iov[0].iov_len / 4;
 773        rep.th.seq     = htonl(seq);
 774        rep.th.ack_seq = htonl(ack);
 775        rep.th.ack     = 1;
 776        rep.th.window  = htons(win);
 777
 778#ifdef CONFIG_TCP_MD5SIG
 779        if (key) {
 780                int offset = (tsecr) ? 3 : 0;
 781
 782                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 783                                          (TCPOPT_NOP << 16) |
 784                                          (TCPOPT_MD5SIG << 8) |
 785                                          TCPOLEN_MD5SIG);
 786                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 787                rep.th.doff = arg.iov[0].iov_len/4;
 788
 789                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 790                                    key, ip_hdr(skb)->saddr,
 791                                    ip_hdr(skb)->daddr, &rep.th);
 792        }
 793#endif
 794        arg.flags = reply_flags;
 795        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 796                                      ip_hdr(skb)->saddr, /* XXX */
 797                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 798        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 799        if (oif)
 800                arg.bound_dev_if = oif;
 801        arg.tos = tos;
 802        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 803        local_bh_disable();
 804        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
 805                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 806                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 807                              &arg, arg.iov[0].iov_len);
 808
 809        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 810        local_bh_enable();
 811}
 812
 813static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 814{
 815        struct inet_timewait_sock *tw = inet_twsk(sk);
 816        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 817
 818        tcp_v4_send_ack(sk, skb,
 819                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 820                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 821                        tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 822                        tcptw->tw_ts_recent,
 823                        tw->tw_bound_dev_if,
 824                        tcp_twsk_md5_key(tcptw),
 825                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 826                        tw->tw_tos
 827                        );
 828
 829        inet_twsk_put(tw);
 830}
 831
 832static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 833                                  struct request_sock *req)
 834{
 835        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 836         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 837         */
 838        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 839                                             tcp_sk(sk)->snd_nxt;
 840
 841        /* RFC 7323 2.3
 842         * The window field (SEG.WND) of every outgoing segment, with the
 843         * exception of <SYN> segments, MUST be right-shifted by
 844         * Rcv.Wind.Shift bits:
 845         */
 846        tcp_v4_send_ack(sk, skb, seq,
 847                        tcp_rsk(req)->rcv_nxt,
 848                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 849                        tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 850                        req->ts_recent,
 851                        0,
 852                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
 853                                          AF_INET),
 854                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 855                        ip_hdr(skb)->tos);
 856}
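
/*
 * Illustrative sketch, not part of tcp_ipv4.c: the RFC 7323 rule cited in
 * the comment above. With a 256 KB receive window and Rcv.Wind.Shift = 7,
 * the 16-bit window field carries 262144 >> 7 = 2048; the peer re-scales
 * it by shifting left again. The helper name is hypothetical.
 */
#if 0
static inline u16 seg_wnd_field(u32 rcv_wnd, u8 rcv_wscale)
{
	return rcv_wnd >> rcv_wscale;
}
#endif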
 857
 858/*
 859 *      Send a SYN-ACK after having received a SYN.
 860 *      This still operates on a request_sock only, not on a big
 861 *      socket.
 862 */
 863static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 864                              struct flowi *fl,
 865                              struct request_sock *req,
 866                              struct tcp_fastopen_cookie *foc,
 867                              enum tcp_synack_type synack_type)
 868{
 869        const struct inet_request_sock *ireq = inet_rsk(req);
 870        struct flowi4 fl4;
 871        int err = -1;
 872        struct sk_buff *skb;
 873
 874        /* First, grab a route. */
 875        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 876                return -1;
 877
 878        skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 879
 880        if (skb) {
 881                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 882
 883                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 884                                            ireq->ir_rmt_addr,
 885                                            ireq_opt_deref(ireq));
 886                err = net_xmit_eval(err);
 887        }
 888
 889        return err;
 890}
 891
 892/*
 893 *      IPv4 request_sock destructor.
 894 */
 895static void tcp_v4_reqsk_destructor(struct request_sock *req)
 896{
 897        kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
 898}
 899
 900#ifdef CONFIG_TCP_MD5SIG
 901/*
 902 * RFC2385 MD5 checksumming requires a mapping of
 903 * IP address->MD5 Key.
 904 * We need to maintain these in the sk structure.
 905 */
 906
 907/* Find the Key structure for an address.  */
 908struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
 909                                         const union tcp_md5_addr *addr,
 910                                         int family)
 911{
 912        const struct tcp_sock *tp = tcp_sk(sk);
 913        struct tcp_md5sig_key *key;
 914        const struct tcp_md5sig_info *md5sig;
 915        __be32 mask;
 916        struct tcp_md5sig_key *best_match = NULL;
 917        bool match;
 918
 919        /* caller either holds rcu_read_lock() or socket lock */
 920        md5sig = rcu_dereference_check(tp->md5sig_info,
 921                                       lockdep_sock_is_held(sk));
 922        if (!md5sig)
 923                return NULL;
 924
 925        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 926                if (key->family != family)
 927                        continue;
 928
 929                if (family == AF_INET) {
 930                        mask = inet_make_mask(key->prefixlen);
 931                        match = (key->addr.a4.s_addr & mask) ==
 932                                (addr->a4.s_addr & mask);
 933#if IS_ENABLED(CONFIG_IPV6)
 934                } else if (family == AF_INET6) {
 935                        match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
 936                                                  key->prefixlen);
 937#endif
 938                } else {
 939                        match = false;
 940                }
 941
 942                if (match && (!best_match ||
 943                              key->prefixlen > best_match->prefixlen))
 944                        best_match = key;
 945        }
 946        return best_match;
 947}
 948EXPORT_SYMBOL(tcp_md5_do_lookup);
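
/*
 * Illustrative sketch, not part of tcp_ipv4.c: the IPv4 prefix match used
 * in the lookup above. With keys configured for 10.0.0.0/8 and 10.1.2.0/24,
 * a peer of 10.1.2.3 matches both and the /24 key wins because the loop
 * above prefers the larger prefixlen. The helper name is hypothetical.
 */
#if 0
static bool v4_prefix_match(__be32 key_addr, __be32 peer_addr, u8 prefixlen)
{
	__be32 mask = inet_make_mask(prefixlen);	/* e.g. /24 -> 255.255.255.0 */

	return (key_addr & mask) == (peer_addr & mask);
}
#endif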
 949
 950static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
 951                                                      const union tcp_md5_addr *addr,
 952                                                      int family, u8 prefixlen)
 953{
 954        const struct tcp_sock *tp = tcp_sk(sk);
 955        struct tcp_md5sig_key *key;
 956        unsigned int size = sizeof(struct in_addr);
 957        const struct tcp_md5sig_info *md5sig;
 958
 959        /* caller either holds rcu_read_lock() or socket lock */
 960        md5sig = rcu_dereference_check(tp->md5sig_info,
 961                                       lockdep_sock_is_held(sk));
 962        if (!md5sig)
 963                return NULL;
 964#if IS_ENABLED(CONFIG_IPV6)
 965        if (family == AF_INET6)
 966                size = sizeof(struct in6_addr);
 967#endif
 968        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 969                if (key->family != family)
 970                        continue;
 971                if (!memcmp(&key->addr, addr, size) &&
 972                    key->prefixlen == prefixlen)
 973                        return key;
 974        }
 975        return NULL;
 976}
 977
 978struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
 979                                         const struct sock *addr_sk)
 980{
 981        const union tcp_md5_addr *addr;
 982
 983        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
 984        return tcp_md5_do_lookup(sk, addr, AF_INET);
 985}
 986EXPORT_SYMBOL(tcp_v4_md5_lookup);
 987
 988/* This can be called on a newly created socket, from other files */
 989int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 990                   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
 991                   gfp_t gfp)
 992{
 993        /* Add Key to the list */
 994        struct tcp_md5sig_key *key;
 995        struct tcp_sock *tp = tcp_sk(sk);
 996        struct tcp_md5sig_info *md5sig;
 997
 998        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
 999        if (key) {
1000                /* Pre-existing entry - just update that one. */
1001                memcpy(key->key, newkey, newkeylen);
1002                key->keylen = newkeylen;
1003                return 0;
1004        }
1005
1006        md5sig = rcu_dereference_protected(tp->md5sig_info,
1007                                           lockdep_sock_is_held(sk));
1008        if (!md5sig) {
1009                md5sig = kmalloc(sizeof(*md5sig), gfp);
1010                if (!md5sig)
1011                        return -ENOMEM;
1012
1013                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1014                INIT_HLIST_HEAD(&md5sig->head);
1015                rcu_assign_pointer(tp->md5sig_info, md5sig);
1016        }
1017
1018        key = sock_kmalloc(sk, sizeof(*key), gfp);
1019        if (!key)
1020                return -ENOMEM;
1021        if (!tcp_alloc_md5sig_pool()) {
1022                sock_kfree_s(sk, key, sizeof(*key));
1023                return -ENOMEM;
1024        }
1025
1026        memcpy(key->key, newkey, newkeylen);
1027        key->keylen = newkeylen;
1028        key->family = family;
1029        key->prefixlen = prefixlen;
1030        memcpy(&key->addr, addr,
1031               (family == AF_INET6) ? sizeof(struct in6_addr) :
1032                                      sizeof(struct in_addr));
1033        hlist_add_head_rcu(&key->node, &md5sig->head);
1034        return 0;
1035}
1036EXPORT_SYMBOL(tcp_md5_do_add);
1037
1038int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1039                   u8 prefixlen)
1040{
1041        struct tcp_md5sig_key *key;
1042
1043        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1044        if (!key)
1045                return -ENOENT;
1046        hlist_del_rcu(&key->node);
1047        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1048        kfree_rcu(key, rcu);
1049        return 0;
1050}
1051EXPORT_SYMBOL(tcp_md5_do_del);
1052
1053static void tcp_clear_md5_list(struct sock *sk)
1054{
1055        struct tcp_sock *tp = tcp_sk(sk);
1056        struct tcp_md5sig_key *key;
1057        struct hlist_node *n;
1058        struct tcp_md5sig_info *md5sig;
1059
1060        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1061
1062        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1063                hlist_del_rcu(&key->node);
1064                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1065                kfree_rcu(key, rcu);
1066        }
1067}
1068
1069static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1070                                 char __user *optval, int optlen)
1071{
1072        struct tcp_md5sig cmd;
1073        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1074        u8 prefixlen = 32;
1075
1076        if (optlen < sizeof(cmd))
1077                return -EINVAL;
1078
1079        if (copy_from_user(&cmd, optval, sizeof(cmd)))
1080                return -EFAULT;
1081
1082        if (sin->sin_family != AF_INET)
1083                return -EINVAL;
1084
1085        if (optname == TCP_MD5SIG_EXT &&
1086            cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1087                prefixlen = cmd.tcpm_prefixlen;
1088                if (prefixlen > 32)
1089                        return -EINVAL;
1090        }
1091
1092        if (!cmd.tcpm_keylen)
1093                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1094                                      AF_INET, prefixlen);
1095
1096        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1097                return -EINVAL;
1098
1099        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1100                              AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1101                              GFP_KERNEL);
1102}
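
/*
 * Illustrative userspace sketch, not part of tcp_ipv4.c: the setsockopt()
 * call that lands in tcp_v4_parse_md5_keys() above. struct tcp_md5sig and
 * TCP_MD5SIG come from the UAPI <linux/tcp.h>; the peer address and key
 * below are made up.
 */
#if 0
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <linux/tcp.h>

static int set_tcp_md5_key(int fd, const char *peer_ip, const char *key)
{
	struct tcp_md5sig md5 = {};
	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;

	sin->sin_family = AF_INET;
	inet_pton(AF_INET, peer_ip, &sin->sin_addr);
	md5.tcpm_keylen = strlen(key);		/* at most TCP_MD5SIG_MAXKEYLEN */
	memcpy(md5.tcpm_key, key, md5.tcpm_keylen);

	/* A zero tcpm_keylen deletes the key instead (see the code above). */
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif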
1103
1104static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1105                                   __be32 daddr, __be32 saddr,
1106                                   const struct tcphdr *th, int nbytes)
1107{
1108        struct tcp4_pseudohdr *bp;
1109        struct scatterlist sg;
1110        struct tcphdr *_th;
1111
1112        bp = hp->scratch;
1113        bp->saddr = saddr;
1114        bp->daddr = daddr;
1115        bp->pad = 0;
1116        bp->protocol = IPPROTO_TCP;
1117        bp->len = cpu_to_be16(nbytes);
1118
1119        _th = (struct tcphdr *)(bp + 1);
1120        memcpy(_th, th, sizeof(*th));
1121        _th->check = 0;
1122
1123        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1124        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1125                                sizeof(*bp) + sizeof(*th));
1126        return crypto_ahash_update(hp->md5_req);
1127}
1128
1129static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1130                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
1131{
1132        struct tcp_md5sig_pool *hp;
1133        struct ahash_request *req;
1134
1135        hp = tcp_get_md5sig_pool();
1136        if (!hp)
1137                goto clear_hash_noput;
1138        req = hp->md5_req;
1139
1140        if (crypto_ahash_init(req))
1141                goto clear_hash;
1142        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1143                goto clear_hash;
1144        if (tcp_md5_hash_key(hp, key))
1145                goto clear_hash;
1146        ahash_request_set_crypt(req, NULL, md5_hash, 0);
1147        if (crypto_ahash_final(req))
1148                goto clear_hash;
1149
1150        tcp_put_md5sig_pool();
1151        return 0;
1152
1153clear_hash:
1154        tcp_put_md5sig_pool();
1155clear_hash_noput:
1156        memset(md5_hash, 0, 16);
1157        return 1;
1158}
1159
1160int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1161                        const struct sock *sk,
1162                        const struct sk_buff *skb)
1163{
1164        struct tcp_md5sig_pool *hp;
1165        struct ahash_request *req;
1166        const struct tcphdr *th = tcp_hdr(skb);
1167        __be32 saddr, daddr;
1168
1169        if (sk) { /* valid for establish/request sockets */
1170                saddr = sk->sk_rcv_saddr;
1171                daddr = sk->sk_daddr;
1172        } else {
1173                const struct iphdr *iph = ip_hdr(skb);
1174                saddr = iph->saddr;
1175                daddr = iph->daddr;
1176        }
1177
1178        hp = tcp_get_md5sig_pool();
1179        if (!hp)
1180                goto clear_hash_noput;
1181        req = hp->md5_req;
1182
1183        if (crypto_ahash_init(req))
1184                goto clear_hash;
1185
1186        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1187                goto clear_hash;
1188        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1189                goto clear_hash;
1190        if (tcp_md5_hash_key(hp, key))
1191                goto clear_hash;
1192        ahash_request_set_crypt(req, NULL, md5_hash, 0);
1193        if (crypto_ahash_final(req))
1194                goto clear_hash;
1195
1196        tcp_put_md5sig_pool();
1197        return 0;
1198
1199clear_hash:
1200        tcp_put_md5sig_pool();
1201clear_hash_noput:
1202        memset(md5_hash, 0, 16);
1203        return 1;
1204}
1205EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1206
1207#endif
1208
1209/* Called with rcu_read_lock() */
1210static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1211                                    const struct sk_buff *skb)
1212{
1213#ifdef CONFIG_TCP_MD5SIG
1214        /*
1215         * This gets called for each TCP segment that arrives
1216         * so we want to be efficient.
1217         * We have 3 drop cases:
1218         * o No MD5 hash and one expected.
1219         * o MD5 hash and we're not expecting one.
 1220         * o MD5 hash and it's wrong.
1221         */
1222        const __u8 *hash_location = NULL;
1223        struct tcp_md5sig_key *hash_expected;
1224        const struct iphdr *iph = ip_hdr(skb);
1225        const struct tcphdr *th = tcp_hdr(skb);
1226        int genhash;
1227        unsigned char newhash[16];
1228
1229        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1230                                          AF_INET);
1231        hash_location = tcp_parse_md5sig_option(th);
1232
1233        /* We've parsed the options - do we have a hash? */
1234        if (!hash_expected && !hash_location)
1235                return false;
1236
1237        if (hash_expected && !hash_location) {
1238                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1239                return true;
1240        }
1241
1242        if (!hash_expected && hash_location) {
1243                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1244                return true;
1245        }
1246
1247        /* Okay, so this is hash_expected and hash_location -
1248         * so we need to calculate the checksum.
1249         */
1250        genhash = tcp_v4_md5_hash_skb(newhash,
1251                                      hash_expected,
1252                                      NULL, skb);
1253
1254        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1255                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1256                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1257                                     &iph->saddr, ntohs(th->source),
1258                                     &iph->daddr, ntohs(th->dest),
1259                                     genhash ? " tcp_v4_calc_md5_hash failed"
1260                                     : "");
1261                return true;
1262        }
1263        return false;
1264#endif
1265        return false;
1266}
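
/*
 * Illustrative sketch, not part of tcp_ipv4.c: the three drop cases listed
 * in the comment above, expressed as one predicate. The helper and its
 * parameter names are hypothetical.
 */
#if 0
static bool md5_segment_should_drop(bool hash_expected, bool hash_present,
				    bool hash_matches)
{
	if (!hash_expected && !hash_present)
		return false;			/* nothing to check */
	if (hash_expected != hash_present)
		return true;			/* missing or unexpected MD5 option */
	return !hash_matches;			/* both present: the hashes must match */
}
#endif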
1267
1268static void tcp_v4_init_req(struct request_sock *req,
1269                            const struct sock *sk_listener,
1270                            struct sk_buff *skb)
1271{
1272        struct inet_request_sock *ireq = inet_rsk(req);
1273        struct net *net = sock_net(sk_listener);
1274
1275        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1276        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1277        RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1278}
1279
1280static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1281                                          struct flowi *fl,
1282                                          const struct request_sock *req)
1283{
1284        return inet_csk_route_req(sk, &fl->u.ip4, req);
1285}
1286
1287struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1288        .family         =       PF_INET,
1289        .obj_size       =       sizeof(struct tcp_request_sock),
1290        .rtx_syn_ack    =       tcp_rtx_synack,
1291        .send_ack       =       tcp_v4_reqsk_send_ack,
1292        .destructor     =       tcp_v4_reqsk_destructor,
1293        .send_reset     =       tcp_v4_send_reset,
1294        .syn_ack_timeout =      tcp_syn_ack_timeout,
1295};
1296
1297static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1298        .mss_clamp      =       TCP_MSS_DEFAULT,
1299#ifdef CONFIG_TCP_MD5SIG
1300        .req_md5_lookup =       tcp_v4_md5_lookup,
1301        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1302#endif
1303        .init_req       =       tcp_v4_init_req,
1304#ifdef CONFIG_SYN_COOKIES
1305        .cookie_init_seq =      cookie_v4_init_sequence,
1306#endif
1307        .route_req      =       tcp_v4_route_req,
1308        .init_seq       =       tcp_v4_init_seq,
1309        .init_ts_off    =       tcp_v4_init_ts_off,
1310        .send_synack    =       tcp_v4_send_synack,
1311};
1312
1313int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1314{
 1315        /* Never answer SYNs sent to broadcast or multicast */
1316        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1317                goto drop;
1318
1319        return tcp_conn_request(&tcp_request_sock_ops,
1320                                &tcp_request_sock_ipv4_ops, sk, skb);
1321
1322drop:
1323        tcp_listendrop(sk);
1324        return 0;
1325}
1326EXPORT_SYMBOL(tcp_v4_conn_request);
1327
1328
1329/*
1330 * The three way handshake has completed - we got a valid synack -
1331 * now create the new socket.
1332 */
1333struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1334                                  struct request_sock *req,
1335                                  struct dst_entry *dst,
1336                                  struct request_sock *req_unhash,
1337                                  bool *own_req)
1338{
1339        struct inet_request_sock *ireq;
1340        struct inet_sock *newinet;
1341        struct tcp_sock *newtp;
1342        struct sock *newsk;
1343#ifdef CONFIG_TCP_MD5SIG
1344        struct tcp_md5sig_key *key;
1345#endif
1346        struct ip_options_rcu *inet_opt;
1347
1348        if (sk_acceptq_is_full(sk))
1349                goto exit_overflow;
1350
1351        newsk = tcp_create_openreq_child(sk, req, skb);
1352        if (!newsk)
1353                goto exit_nonewsk;
1354
1355        newsk->sk_gso_type = SKB_GSO_TCPV4;
1356        inet_sk_rx_dst_set(newsk, skb);
1357
1358        newtp                 = tcp_sk(newsk);
1359        newinet               = inet_sk(newsk);
1360        ireq                  = inet_rsk(req);
1361        sk_daddr_set(newsk, ireq->ir_rmt_addr);
1362        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1363        newsk->sk_bound_dev_if = ireq->ir_iif;
1364        newinet->inet_saddr   = ireq->ir_loc_addr;
1365        inet_opt              = rcu_dereference(ireq->ireq_opt);
1366        RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1367        newinet->mc_index     = inet_iif(skb);
1368        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1369        newinet->rcv_tos      = ip_hdr(skb)->tos;
1370        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1371        if (inet_opt)
1372                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1373        newinet->inet_id = newtp->write_seq ^ jiffies;
1374
1375        if (!dst) {
1376                dst = inet_csk_route_child_sock(sk, newsk, req);
1377                if (!dst)
1378                        goto put_and_exit;
1379        } else {
1380                /* syncookie case : see end of cookie_v4_check() */
1381        }
1382        sk_setup_caps(newsk, dst);
1383
1384        tcp_ca_openreq_child(newsk, dst);
1385
1386        tcp_sync_mss(newsk, dst_mtu(dst));
1387        newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1388
1389        tcp_initialize_rcv_mss(newsk);
1390
1391#ifdef CONFIG_TCP_MD5SIG
1392        /* Copy over the MD5 key from the original socket */
1393        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1394                                AF_INET);
1395        if (key) {
1396                /*
1397                 * We're using one, so create a matching key
1398                 * on the newsk structure. If we fail to get
1399                 * memory, then we end up not copying the key
1400                 * across. Shucks.
1401                 */
1402                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1403                               AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1404                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1405        }
1406#endif
1407
1408        if (__inet_inherit_port(sk, newsk) < 0)
1409                goto put_and_exit;
1410        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1411        if (likely(*own_req)) {
1412                tcp_move_syn(newtp, req);
1413                ireq->ireq_opt = NULL;
1414        } else {
1415                newinet->inet_opt = NULL;
1416        }
1417        return newsk;
1418
1419exit_overflow:
1420        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1421exit_nonewsk:
1422        dst_release(dst);
1423exit:
1424        tcp_listendrop(sk);
1425        return NULL;
1426put_and_exit:
1427        newinet->inet_opt = NULL;
1428        inet_csk_prepare_forced_close(newsk);
1429        tcp_done(newsk);
1430        goto exit;
1431}
1432EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1433
1434static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1435{
1436#ifdef CONFIG_SYN_COOKIES
1437        const struct tcphdr *th = tcp_hdr(skb);
1438
1439        if (!th->syn)
1440                sk = cookie_v4_check(sk, skb);
1441#endif
1442        return sk;
1443}
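
/* Editor's note -- illustrative sketch, not part of tcp_ipv4.c: when the
 * SYN backlog is under pressure and syncookies are enabled, the listener
 * encodes the connection parameters into the ISN of its SYN-ACK instead of
 * keeping a request_sock around; the final ACK (the !th->syn case above) is
 * then validated by cookie_v4_check().  The user-space sketch below shows
 * only the idea with a toy keyed mix; the real kernel uses siphash plus a
 * coarse time counter and also encodes an MSS index.  Build separately.
 */
#include <stdint.h>
#include <stdio.h>

/* Toy keyed mix -- NOT the kernel's cookie hash. */
static uint32_t toy_mix(uint32_t saddr, uint32_t daddr, uint16_t sport,
                        uint16_t dport, uint32_t count, uint32_t secret)
{
        uint32_t h = secret ^ saddr;

        h = (h ^ daddr) * 2654435761u;
        h ^= ((uint32_t)sport << 16) | dport;
        h = (h ^ count) * 2246822519u;
        return h ^ (h >> 16);
}

/* The cookie becomes the ISN of the SYN-ACK; no per-connection state kept. */
static uint32_t make_cookie(uint32_t saddr, uint32_t daddr, uint16_t sport,
                            uint16_t dport, uint32_t client_isn,
                            uint32_t count, uint32_t secret)
{
        return client_isn + toy_mix(saddr, daddr, sport, dport, count, secret);
}

int main(void)
{
        uint32_t secret = 0xdeadbeefu, count = 7;
        uint32_t c = make_cookie(0x0a000001, 0x0a000002, 40000, 80,
                                 12345, count, secret);

        /* On the final ACK, ack_seq - 1 must match a recently issued cookie. */
        printf("cookie valid: %d\n",
               c == make_cookie(0x0a000001, 0x0a000002, 40000, 80,
                                12345, count, secret));
        return 0;
}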
1444
1445/* The socket must have its spinlock held when we get
1446 * here, unless it is a TCP_LISTEN socket.
1447 *
1448 * We have a potential double-lock case here, so even when
1449 * doing backlog processing we use the BH locking scheme.
1450 * This is because we cannot sleep with the original spinlock
1451 * held.
1452 */
1453int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1454{
1455        struct sock *rsk;
1456
1457        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1458                struct dst_entry *dst = sk->sk_rx_dst;
1459
1460                sock_rps_save_rxhash(sk, skb);
1461                sk_mark_napi_id(sk, skb);
1462                if (dst) {
1463                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1464                            !dst->ops->check(dst, 0)) {
1465                                dst_release(dst);
1466                                sk->sk_rx_dst = NULL;
1467                        }
1468                }
1469                tcp_rcv_established(sk, skb, tcp_hdr(skb));
1470                return 0;
1471        }
1472
1473        if (tcp_checksum_complete(skb))
1474                goto csum_err;
1475
1476        if (sk->sk_state == TCP_LISTEN) {
1477                struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1478
1479                if (!nsk)
1480                        goto discard;
1481                if (nsk != sk) {
1482                        if (tcp_child_process(sk, nsk, skb)) {
1483                                rsk = nsk;
1484                                goto reset;
1485                        }
1486                        return 0;
1487                }
1488        } else
1489                sock_rps_save_rxhash(sk, skb);
1490
1491        if (tcp_rcv_state_process(sk, skb)) {
1492                rsk = sk;
1493                goto reset;
1494        }
1495        return 0;
1496
1497reset:
1498        tcp_v4_send_reset(rsk, skb);
1499discard:
1500        kfree_skb(skb);
1501        /* Be careful here. If this function gets more complicated and
1502         * gcc suffers from register pressure on the x86, sk (in %ebx)
1503         * might be destroyed here. This current version compiles correctly,
1504         * but you have been warned.
1505         */
1506        return 0;
1507
1508csum_err:
1509        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1510        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1511        goto discard;
1512}
1513EXPORT_SYMBOL(tcp_v4_do_rcv);
1514
1515int tcp_v4_early_demux(struct sk_buff *skb)
1516{
1517        const struct iphdr *iph;
1518        const struct tcphdr *th;
1519        struct sock *sk;
1520
1521        if (skb->pkt_type != PACKET_HOST)
1522                return 0;
1523
1524        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1525                return 0;
1526
1527        iph = ip_hdr(skb);
1528        th = tcp_hdr(skb);
1529
1530        if (th->doff < sizeof(struct tcphdr) / 4)
1531                return 0;
1532
1533        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1534                                       iph->saddr, th->source,
1535                                       iph->daddr, ntohs(th->dest),
1536                                       skb->skb_iif, inet_sdif(skb));
1537        if (sk) {
1538                skb->sk = sk;
1539                skb->destructor = sock_edemux;
1540                if (sk_fullsock(sk)) {
1541                        struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1542
1543                        if (dst)
1544                                dst = dst_check(dst, 0);
1545                        if (dst &&
1546                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1547                                skb_dst_set_noref(skb, dst);
1548                }
1549        }
1550        return 0;
1551}
1552
1553bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1554{
1555        u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1556
1557        /* Only the socket owner can try to collapse/prune rx queues
1558         * to reduce memory overhead, so add a little headroom here.
1559         * Only a few socket backlogs are expected to be non-empty at the same time.
1560         */
1561        limit += 64*1024;
1562
1563        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1564         * we can fix skb->truesize to its real value to avoid future drops.
1565         * This is valid because skb is not yet charged to the socket.
1566         * It has been noticed that pure SACK packets were sometimes dropped
1567         * (if cooked by drivers without a copybreak feature).
1568         */
1569        skb_condense(skb);
1570
1571        if (unlikely(sk_add_backlog(sk, skb, limit))) {
1572                bh_unlock_sock(sk);
1573                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1574                return true;
1575        }
1576        return false;
1577}
1578EXPORT_SYMBOL(tcp_add_backlog);
1579
1580int tcp_filter(struct sock *sk, struct sk_buff *skb)
1581{
1582        struct tcphdr *th = (struct tcphdr *)skb->data;
1583        unsigned int eaten = skb->len;
1584        int err;
1585
1586        err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1587        if (!err) {
1588                eaten -= skb->len;
1589                TCP_SKB_CB(skb)->end_seq -= eaten;
1590        }
1591        return err;
1592}
1593EXPORT_SYMBOL(tcp_filter);
1594
1595static void tcp_v4_restore_cb(struct sk_buff *skb)
1596{
1597        memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1598                sizeof(struct inet_skb_parm));
1599}
1600
1601static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1602                           const struct tcphdr *th)
1603{
1604        /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1605         * barrier() makes sure the compiler won't play fool^Waliasing games.
1606         */
1607        memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1608                sizeof(struct inet_skb_parm));
1609        barrier();
1610
1611        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1612        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1613                                    skb->len - th->doff * 4);
1614        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1615        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1616        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1617        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1618        TCP_SKB_CB(skb)->sacked  = 0;
1619        TCP_SKB_CB(skb)->has_rxtstamp =
1620                        skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1621}
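
/* Editor's note -- illustrative sketch, not part of tcp_ipv4.c: end_seq
 * above advances by one for the SYN flag, one for the FIN flag, and by the
 * payload length (skb->len minus the header length, th->doff * 4).  A
 * minimal user-space rendering of the same sequence-space arithmetic
 * (build separately):
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t tcp_end_seq(uint32_t seq, int syn, int fin,
                            uint32_t skb_len, uint32_t doff_words)
{
        uint32_t payload = skb_len - doff_words * 4;

        return seq + (syn ? 1 : 0) + (fin ? 1 : 0) + payload;
}

int main(void)
{
        /* A bare SYN (20-byte header, no data) occupies one sequence number. */
        printf("SYN occupies      %u\n", tcp_end_seq(1000, 1, 0, 20, 5) - 1000);
        /* 100 bytes of data plus FIN occupy 101 sequence numbers. */
        printf("data+FIN occupies %u\n", tcp_end_seq(1000, 0, 1, 120, 5) - 1000);
        return 0;
}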
1622
1623/*
1624 *      From tcp_input.c
1625 */
1626
1627int tcp_v4_rcv(struct sk_buff *skb)
1628{
1629        struct net *net = dev_net(skb->dev);
1630        int sdif = inet_sdif(skb);
1631        const struct iphdr *iph;
1632        const struct tcphdr *th;
1633        bool refcounted;
1634        struct sock *sk;
1635        int ret;
1636
1637        if (skb->pkt_type != PACKET_HOST)
1638                goto discard_it;
1639
1640        /* Count it even if it's bad */
1641        __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1642
1643        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1644                goto discard_it;
1645
1646        th = (const struct tcphdr *)skb->data;
1647
1648        if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1649                goto bad_packet;
1650        if (!pskb_may_pull(skb, th->doff * 4))
1651                goto discard_it;
1652
1653        /* An explanation is required here, I think.
1654         * Packet length and doff are validated by header prediction,
1655         * provided the case of th->doff==0 is eliminated.
1656         * So, we defer the checks. */
1657
1658        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1659                goto csum_error;
1660
1661        th = (const struct tcphdr *)skb->data;
1662        iph = ip_hdr(skb);
1663lookup:
1664        sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1665                               th->dest, sdif, &refcounted);
1666        if (!sk)
1667                goto no_tcp_socket;
1668
1669process:
1670        if (sk->sk_state == TCP_TIME_WAIT)
1671                goto do_time_wait;
1672
1673        if (sk->sk_state == TCP_NEW_SYN_RECV) {
1674                struct request_sock *req = inet_reqsk(sk);
1675                struct sock *nsk;
1676
1677                sk = req->rsk_listener;
1678                if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1679                        sk_drops_add(sk, skb);
1680                        reqsk_put(req);
1681                        goto discard_it;
1682                }
1683                if (unlikely(sk->sk_state != TCP_LISTEN)) {
1684                        inet_csk_reqsk_queue_drop_and_put(sk, req);
1685                        goto lookup;
1686                }
1687                /* We own a reference on the listener; increase it again
1688                 * as we might lose it too soon.
1689                 */
1690                sock_hold(sk);
1691                refcounted = true;
1692                nsk = NULL;
1693                if (!tcp_filter(sk, skb)) {
1694                        th = (const struct tcphdr *)skb->data;
1695                        iph = ip_hdr(skb);
1696                        tcp_v4_fill_cb(skb, iph, th);
1697                        nsk = tcp_check_req(sk, skb, req, false);
1698                }
1699                if (!nsk) {
1700                        reqsk_put(req);
1701                        goto discard_and_relse;
1702                }
1703                if (nsk == sk) {
1704                        reqsk_put(req);
1705                        tcp_v4_restore_cb(skb);
1706                } else if (tcp_child_process(sk, nsk, skb)) {
1707                        tcp_v4_send_reset(nsk, skb);
1708                        goto discard_and_relse;
1709                } else {
1710                        sock_put(sk);
1711                        return 0;
1712                }
1713        }
1714        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1715                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1716                goto discard_and_relse;
1717        }
1718
1719        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1720                goto discard_and_relse;
1721
1722        if (tcp_v4_inbound_md5_hash(sk, skb))
1723                goto discard_and_relse;
1724
1725        nf_reset(skb);
1726
1727        if (tcp_filter(sk, skb))
1728                goto discard_and_relse;
1729        th = (const struct tcphdr *)skb->data;
1730        iph = ip_hdr(skb);
1731        tcp_v4_fill_cb(skb, iph, th);
1732
1733        skb->dev = NULL;
1734
1735        if (sk->sk_state == TCP_LISTEN) {
1736                ret = tcp_v4_do_rcv(sk, skb);
1737                goto put_and_return;
1738        }
1739
1740        sk_incoming_cpu_update(sk);
1741
1742        bh_lock_sock_nested(sk);
1743        tcp_segs_in(tcp_sk(sk), skb);
1744        ret = 0;
1745        if (!sock_owned_by_user(sk)) {
1746                ret = tcp_v4_do_rcv(sk, skb);
1747        } else if (tcp_add_backlog(sk, skb)) {
1748                goto discard_and_relse;
1749        }
1750        bh_unlock_sock(sk);
1751
1752put_and_return:
1753        if (refcounted)
1754                sock_put(sk);
1755
1756        return ret;
1757
1758no_tcp_socket:
1759        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1760                goto discard_it;
1761
1762        tcp_v4_fill_cb(skb, iph, th);
1763
1764        if (tcp_checksum_complete(skb)) {
1765csum_error:
1766                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1767bad_packet:
1768                __TCP_INC_STATS(net, TCP_MIB_INERRS);
1769        } else {
1770                tcp_v4_send_reset(NULL, skb);
1771        }
1772
1773discard_it:
1774        /* Discard frame. */
1775        kfree_skb(skb);
1776        return 0;
1777
1778discard_and_relse:
1779        sk_drops_add(sk, skb);
1780        if (refcounted)
1781                sock_put(sk);
1782        goto discard_it;
1783
1784do_time_wait:
1785        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1786                inet_twsk_put(inet_twsk(sk));
1787                goto discard_it;
1788        }
1789
1790        tcp_v4_fill_cb(skb, iph, th);
1791
1792        if (tcp_checksum_complete(skb)) {
1793                inet_twsk_put(inet_twsk(sk));
1794                goto csum_error;
1795        }
1796        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1797        case TCP_TW_SYN: {
1798                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1799                                                        &tcp_hashinfo, skb,
1800                                                        __tcp_hdrlen(th),
1801                                                        iph->saddr, th->source,
1802                                                        iph->daddr, th->dest,
1803                                                        inet_iif(skb),
1804                                                        sdif);
1805                if (sk2) {
1806                        inet_twsk_deschedule_put(inet_twsk(sk));
1807                        sk = sk2;
1808                        tcp_v4_restore_cb(skb);
1809                        refcounted = false;
1810                        goto process;
1811                }
1812        }
1813                /* to ACK */
1814                /* fall through */
1815        case TCP_TW_ACK:
1816                tcp_v4_timewait_ack(sk, skb);
1817                break;
1818        case TCP_TW_RST:
1819                tcp_v4_send_reset(sk, skb);
1820                inet_twsk_deschedule_put(inet_twsk(sk));
1821                goto discard_it;
1822        case TCP_TW_SUCCESS:;
1823        }
1824        goto discard_it;
1825}
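
/* Editor's summary of tcp_v4_rcv() above (descriptive comment only):
 *  1. Sanity checks: PACKET_HOST, header pull, doff bounds, checksum init.
 *  2. Socket lookup via __inet_lookup_skb() in the established/listener
 *     hash tables.
 *  3. TCP_NEW_SYN_RECV: validate the packet against the request sock and
 *     possibly create the child (tcp_check_req()/tcp_child_process()).
 *  4. TCP_TIME_WAIT: tcp_timewait_state_process() may ACK, send a reset,
 *     or hand a fresh SYN over to a current listener.
 *  5. Otherwise: min TTL, xfrm policy, MD5 and socket-filter checks, then
 *     either handle a listener directly, process under the socket lock,
 *     or queue the skb to the backlog.
 */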
1826
1827static struct timewait_sock_ops tcp_timewait_sock_ops = {
1828        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1829        .twsk_unique    = tcp_twsk_unique,
1830        .twsk_destructor= tcp_twsk_destructor,
1831};
1832
1833void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1834{
1835        struct dst_entry *dst = skb_dst(skb);
1836
1837        if (dst && dst_hold_safe(dst)) {
1838                sk->sk_rx_dst = dst;
1839                inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1840        }
1841}
1842EXPORT_SYMBOL(inet_sk_rx_dst_set);
1843
1844const struct inet_connection_sock_af_ops ipv4_specific = {
1845        .queue_xmit        = ip_queue_xmit,
1846        .send_check        = tcp_v4_send_check,
1847        .rebuild_header    = inet_sk_rebuild_header,
1848        .sk_rx_dst_set     = inet_sk_rx_dst_set,
1849        .conn_request      = tcp_v4_conn_request,
1850        .syn_recv_sock     = tcp_v4_syn_recv_sock,
1851        .net_header_len    = sizeof(struct iphdr),
1852        .setsockopt        = ip_setsockopt,
1853        .getsockopt        = ip_getsockopt,
1854        .addr2sockaddr     = inet_csk_addr2sockaddr,
1855        .sockaddr_len      = sizeof(struct sockaddr_in),
1856#ifdef CONFIG_COMPAT
1857        .compat_setsockopt = compat_ip_setsockopt,
1858        .compat_getsockopt = compat_ip_getsockopt,
1859#endif
1860        .mtu_reduced       = tcp_v4_mtu_reduced,
1861};
1862EXPORT_SYMBOL(ipv4_specific);
1863
1864#ifdef CONFIG_TCP_MD5SIG
1865static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1866        .md5_lookup             = tcp_v4_md5_lookup,
1867        .calc_md5_hash          = tcp_v4_md5_hash_skb,
1868        .md5_parse              = tcp_v4_parse_md5_keys,
1869};
1870#endif
1871
1872/* NOTE: A lot of things are set to zero explicitly by the call to
1873 *       sk_alloc(), so they need not be done here.
1874 */
1875static int tcp_v4_init_sock(struct sock *sk)
1876{
1877        struct inet_connection_sock *icsk = inet_csk(sk);
1878
1879        tcp_init_sock(sk);
1880
1881        icsk->icsk_af_ops = &ipv4_specific;
1882
1883#ifdef CONFIG_TCP_MD5SIG
1884        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1885#endif
1886
1887        return 0;
1888}
1889
1890void tcp_v4_destroy_sock(struct sock *sk)
1891{
1892        struct tcp_sock *tp = tcp_sk(sk);
1893
1894        trace_tcp_destroy_sock(sk);
1895
1896        tcp_clear_xmit_timers(sk);
1897
1898        tcp_cleanup_congestion_control(sk);
1899
1900        tcp_cleanup_ulp(sk);
1901
1902        /* Clean up the write buffer. */
1903        tcp_write_queue_purge(sk);
1904
1905        /* Check if we want to disable active TFO */
1906        tcp_fastopen_active_disable_ofo_check(sk);
1907
1908        /* Cleans up our, hopefully empty, out_of_order_queue. */
1909        skb_rbtree_purge(&tp->out_of_order_queue);
1910
1911#ifdef CONFIG_TCP_MD5SIG
1912        /* Clean up the MD5 key list, if any */
1913        if (tp->md5sig_info) {
1914                tcp_clear_md5_list(sk);
1915                kfree_rcu(tp->md5sig_info, rcu);
1916                tp->md5sig_info = NULL;
1917        }
1918#endif
1919
1920        /* Clean up a referenced TCP bind bucket. */
1921        if (inet_csk(sk)->icsk_bind_hash)
1922                inet_put_port(sk);
1923
1924        BUG_ON(tp->fastopen_rsk);
1925
1926        /* If socket is aborted during connect operation */
1927        tcp_free_fastopen_req(tp);
1928        tcp_fastopen_destroy_cipher(sk);
1929        tcp_saved_syn_free(tp);
1930
1931        sk_sockets_allocated_dec(sk);
1932}
1933EXPORT_SYMBOL(tcp_v4_destroy_sock);
1934
1935#ifdef CONFIG_PROC_FS
1936/* Proc filesystem TCP sock list dumping. */
1937
1938/*
1939 * Get the next listener socket following cur.  If cur is NULL, get the first socket
1940 * starting from the bucket given in st->bucket; when st->bucket is zero, the
1941 * very first socket in the hash table is returned.
1942 */
1943static void *listening_get_next(struct seq_file *seq, void *cur)
1944{
1945        struct tcp_iter_state *st = seq->private;
1946        struct net *net = seq_file_net(seq);
1947        struct inet_listen_hashbucket *ilb;
1948        struct sock *sk = cur;
1949
1950        if (!sk) {
1951get_head:
1952                ilb = &tcp_hashinfo.listening_hash[st->bucket];
1953                spin_lock(&ilb->lock);
1954                sk = sk_head(&ilb->head);
1955                st->offset = 0;
1956                goto get_sk;
1957        }
1958        ilb = &tcp_hashinfo.listening_hash[st->bucket];
1959        ++st->num;
1960        ++st->offset;
1961
1962        sk = sk_next(sk);
1963get_sk:
1964        sk_for_each_from(sk) {
1965                if (!net_eq(sock_net(sk), net))
1966                        continue;
1967                if (sk->sk_family == st->family)
1968                        return sk;
1969        }
1970        spin_unlock(&ilb->lock);
1971        st->offset = 0;
1972        if (++st->bucket < INET_LHTABLE_SIZE)
1973                goto get_head;
1974        return NULL;
1975}
1976
1977static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1978{
1979        struct tcp_iter_state *st = seq->private;
1980        void *rc;
1981
1982        st->bucket = 0;
1983        st->offset = 0;
1984        rc = listening_get_next(seq, NULL);
1985
1986        while (rc && *pos) {
1987                rc = listening_get_next(seq, rc);
1988                --*pos;
1989        }
1990        return rc;
1991}
1992
1993static inline bool empty_bucket(const struct tcp_iter_state *st)
1994{
1995        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1996}
1997
1998/*
1999 * Get the first established socket starting from the bucket given in st->bucket.
2000 * If st->bucket is zero, the very first socket in the hash is returned.
2001 */
2002static void *established_get_first(struct seq_file *seq)
2003{
2004        struct tcp_iter_state *st = seq->private;
2005        struct net *net = seq_file_net(seq);
2006        void *rc = NULL;
2007
2008        st->offset = 0;
2009        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2010                struct sock *sk;
2011                struct hlist_nulls_node *node;
2012                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2013
2014                /* Lockless fast path for the common case of empty buckets */
2015                if (empty_bucket(st))
2016                        continue;
2017
2018                spin_lock_bh(lock);
2019                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2020                        if (sk->sk_family != st->family ||
2021                            !net_eq(sock_net(sk), net)) {
2022                                continue;
2023                        }
2024                        rc = sk;
2025                        goto out;
2026                }
2027                spin_unlock_bh(lock);
2028        }
2029out:
2030        return rc;
2031}
2032
2033static void *established_get_next(struct seq_file *seq, void *cur)
2034{
2035        struct sock *sk = cur;
2036        struct hlist_nulls_node *node;
2037        struct tcp_iter_state *st = seq->private;
2038        struct net *net = seq_file_net(seq);
2039
2040        ++st->num;
2041        ++st->offset;
2042
2043        sk = sk_nulls_next(sk);
2044
2045        sk_nulls_for_each_from(sk, node) {
2046                if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2047                        return sk;
2048        }
2049
2050        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2051        ++st->bucket;
2052        return established_get_first(seq);
2053}
2054
2055static void *established_get_idx(struct seq_file *seq, loff_t pos)
2056{
2057        struct tcp_iter_state *st = seq->private;
2058        void *rc;
2059
2060        st->bucket = 0;
2061        rc = established_get_first(seq);
2062
2063        while (rc && pos) {
2064                rc = established_get_next(seq, rc);
2065                --pos;
2066        }
2067        return rc;
2068}
2069
2070static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2071{
2072        void *rc;
2073        struct tcp_iter_state *st = seq->private;
2074
2075        st->state = TCP_SEQ_STATE_LISTENING;
2076        rc        = listening_get_idx(seq, &pos);
2077
2078        if (!rc) {
2079                st->state = TCP_SEQ_STATE_ESTABLISHED;
2080                rc        = established_get_idx(seq, pos);
2081        }
2082
2083        return rc;
2084}
2085
2086static void *tcp_seek_last_pos(struct seq_file *seq)
2087{
2088        struct tcp_iter_state *st = seq->private;
2089        int offset = st->offset;
2090        int orig_num = st->num;
2091        void *rc = NULL;
2092
2093        switch (st->state) {
2094        case TCP_SEQ_STATE_LISTENING:
2095                if (st->bucket >= INET_LHTABLE_SIZE)
2096                        break;
2097                st->state = TCP_SEQ_STATE_LISTENING;
2098                rc = listening_get_next(seq, NULL);
2099                while (offset-- && rc)
2100                        rc = listening_get_next(seq, rc);
2101                if (rc)
2102                        break;
2103                st->bucket = 0;
2104                st->state = TCP_SEQ_STATE_ESTABLISHED;
2105                /* Fallthrough */
2106        case TCP_SEQ_STATE_ESTABLISHED:
2107                if (st->bucket > tcp_hashinfo.ehash_mask)
2108                        break;
2109                rc = established_get_first(seq);
2110                while (offset-- && rc)
2111                        rc = established_get_next(seq, rc);
2112        }
2113
2114        st->num = orig_num;
2115
2116        return rc;
2117}
2118
2119static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2120{
2121        struct tcp_iter_state *st = seq->private;
2122        void *rc;
2123
2124        if (*pos && *pos == st->last_pos) {
2125                rc = tcp_seek_last_pos(seq);
2126                if (rc)
2127                        goto out;
2128        }
2129
2130        st->state = TCP_SEQ_STATE_LISTENING;
2131        st->num = 0;
2132        st->bucket = 0;
2133        st->offset = 0;
2134        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2135
2136out:
2137        st->last_pos = *pos;
2138        return rc;
2139}
2140
2141static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2142{
2143        struct tcp_iter_state *st = seq->private;
2144        void *rc = NULL;
2145
2146        if (v == SEQ_START_TOKEN) {
2147                rc = tcp_get_idx(seq, 0);
2148                goto out;
2149        }
2150
2151        switch (st->state) {
2152        case TCP_SEQ_STATE_LISTENING:
2153                rc = listening_get_next(seq, v);
2154                if (!rc) {
2155                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2156                        st->bucket = 0;
2157                        st->offset = 0;
2158                        rc        = established_get_first(seq);
2159                }
2160                break;
2161        case TCP_SEQ_STATE_ESTABLISHED:
2162                rc = established_get_next(seq, v);
2163                break;
2164        }
2165out:
2166        ++*pos;
2167        st->last_pos = *pos;
2168        return rc;
2169}
2170
2171static void tcp_seq_stop(struct seq_file *seq, void *v)
2172{
2173        struct tcp_iter_state *st = seq->private;
2174
2175        switch (st->state) {
2176        case TCP_SEQ_STATE_LISTENING:
2177                if (v != SEQ_START_TOKEN)
2178                        spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2179                break;
2180        case TCP_SEQ_STATE_ESTABLISHED:
2181                if (v)
2182                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2183                break;
2184        }
2185}
2186
2187int tcp_seq_open(struct inode *inode, struct file *file)
2188{
2189        struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2190        struct tcp_iter_state *s;
2191        int err;
2192
2193        err = seq_open_net(inode, file, &afinfo->seq_ops,
2194                          sizeof(struct tcp_iter_state));
2195        if (err < 0)
2196                return err;
2197
2198        s = ((struct seq_file *)file->private_data)->private;
2199        s->family               = afinfo->family;
2200        s->last_pos             = 0;
2201        return 0;
2202}
2203EXPORT_SYMBOL(tcp_seq_open);
2204
2205int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2206{
2207        int rc = 0;
2208        struct proc_dir_entry *p;
2209
2210        afinfo->seq_ops.start           = tcp_seq_start;
2211        afinfo->seq_ops.next            = tcp_seq_next;
2212        afinfo->seq_ops.stop            = tcp_seq_stop;
2213
2214        p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2215                             afinfo->seq_fops, afinfo);
2216        if (!p)
2217                rc = -ENOMEM;
2218        return rc;
2219}
2220EXPORT_SYMBOL(tcp_proc_register);
2221
2222void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2223{
2224        remove_proc_entry(afinfo->name, net->proc_net);
2225}
2226EXPORT_SYMBOL(tcp_proc_unregister);
2227
2228static void get_openreq4(const struct request_sock *req,
2229                         struct seq_file *f, int i)
2230{
2231        const struct inet_request_sock *ireq = inet_rsk(req);
2232        long delta = req->rsk_timer.expires - jiffies;
2233
2234        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2235                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2236                i,
2237                ireq->ir_loc_addr,
2238                ireq->ir_num,
2239                ireq->ir_rmt_addr,
2240                ntohs(ireq->ir_rmt_port),
2241                TCP_SYN_RECV,
2242                0, 0, /* could print option size, but that is af dependent. */
2243                1,    /* timers active (only the expire timer) */
2244                jiffies_delta_to_clock_t(delta),
2245                req->num_timeout,
2246                from_kuid_munged(seq_user_ns(f),
2247                                 sock_i_uid(req->rsk_listener)),
2248                0,  /* non standard timer */
2249                0, /* open_requests have no inode */
2250                0,
2251                req);
2252}
2253
2254static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2255{
2256        int timer_active;
2257        unsigned long timer_expires;
2258        const struct tcp_sock *tp = tcp_sk(sk);
2259        const struct inet_connection_sock *icsk = inet_csk(sk);
2260        const struct inet_sock *inet = inet_sk(sk);
2261        const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2262        __be32 dest = inet->inet_daddr;
2263        __be32 src = inet->inet_rcv_saddr;
2264        __u16 destp = ntohs(inet->inet_dport);
2265        __u16 srcp = ntohs(inet->inet_sport);
2266        int rx_queue;
2267        int state;
2268
2269        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2270            icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2271            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2272                timer_active    = 1;
2273                timer_expires   = icsk->icsk_timeout;
2274        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2275                timer_active    = 4;
2276                timer_expires   = icsk->icsk_timeout;
2277        } else if (timer_pending(&sk->sk_timer)) {
2278                timer_active    = 2;
2279                timer_expires   = sk->sk_timer.expires;
2280        } else {
2281                timer_active    = 0;
2282                timer_expires = jiffies;
2283        }
2284
2285        state = sk_state_load(sk);
2286        if (state == TCP_LISTEN)
2287                rx_queue = sk->sk_ack_backlog;
2288        else
2289                /* Because we don't lock the socket,
2290                 * we might find a transient negative value.
2291                 */
2292                rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2293
2294        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2295                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2296                i, src, srcp, dest, destp, state,
2297                tp->write_seq - tp->snd_una,
2298                rx_queue,
2299                timer_active,
2300                jiffies_delta_to_clock_t(timer_expires - jiffies),
2301                icsk->icsk_retransmits,
2302                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2303                icsk->icsk_probes_out,
2304                sock_i_ino(sk),
2305                refcount_read(&sk->sk_refcnt), sk,
2306                jiffies_to_clock_t(icsk->icsk_rto),
2307                jiffies_to_clock_t(icsk->icsk_ack.ato),
2308                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2309                tp->snd_cwnd,
2310                state == TCP_LISTEN ?
2311                    fastopenq->max_qlen :
2312                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2313}
2314
2315static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2316                               struct seq_file *f, int i)
2317{
2318        long delta = tw->tw_timer.expires - jiffies;
2319        __be32 dest, src;
2320        __u16 destp, srcp;
2321
2322        dest  = tw->tw_daddr;
2323        src   = tw->tw_rcv_saddr;
2324        destp = ntohs(tw->tw_dport);
2325        srcp  = ntohs(tw->tw_sport);
2326
2327        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2328                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2329                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2330                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2331                refcount_read(&tw->tw_refcnt), tw);
2332}
2333
2334#define TMPSZ 150
2335
2336static int tcp4_seq_show(struct seq_file *seq, void *v)
2337{
2338        struct tcp_iter_state *st;
2339        struct sock *sk = v;
2340
2341        seq_setwidth(seq, TMPSZ - 1);
2342        if (v == SEQ_START_TOKEN) {
2343                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2344                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2345                           "inode");
2346                goto out;
2347        }
2348        st = seq->private;
2349
2350        if (sk->sk_state == TCP_TIME_WAIT)
2351                get_timewait4_sock(v, seq, st->num);
2352        else if (sk->sk_state == TCP_NEW_SYN_RECV)
2353                get_openreq4(v, seq, st->num);
2354        else
2355                get_tcp4_sock(v, seq, st->num);
2356out:
2357        seq_pad(seq, '\n');
2358        return 0;
2359}
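
/* Editor's note -- illustrative sketch, not part of tcp_ipv4.c: the
 * seq_printf() formats in get_openreq4(), get_tcp4_sock() and
 * get_timewait4_sock() define the column layout of /proc/net/tcp.  A
 * minimal user-space reader of the address/port/state columns; addresses
 * are printed as raw hex in network byte order.  Build separately.
 */
#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
        char line[512];
        FILE *f = fopen("/proc/net/tcp", "r");

        if (!f)
                return 1;
        fgets(line, sizeof(line), f);           /* skip the header line */
        while (fgets(line, sizeof(line), f)) {
                unsigned int sl, lip, lport, rip, rport, state;
                struct in_addr laddr, raddr;

                /* Matches "%4d: %08X:%04X %08X:%04X %02X ..." above. */
                if (sscanf(line, "%u: %X:%X %X:%X %X",
                           &sl, &lip, &lport, &rip, &rport, &state) != 6)
                        continue;
                laddr.s_addr = lip;
                raddr.s_addr = rip;
                printf("%s:%u -> ", inet_ntoa(laddr), lport);
                printf("%s:%u state %02X\n", inet_ntoa(raddr), rport, state);
        }
        fclose(f);
        return 0;
}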
2360
2361static const struct file_operations tcp_afinfo_seq_fops = {
2362        .owner   = THIS_MODULE,
2363        .open    = tcp_seq_open,
2364        .read    = seq_read,
2365        .llseek  = seq_lseek,
2366        .release = seq_release_net
2367};
2368
2369static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2370        .name           = "tcp",
2371        .family         = AF_INET,
2372        .seq_fops       = &tcp_afinfo_seq_fops,
2373        .seq_ops        = {
2374                .show           = tcp4_seq_show,
2375        },
2376};
2377
2378static int __net_init tcp4_proc_init_net(struct net *net)
2379{
2380        return tcp_proc_register(net, &tcp4_seq_afinfo);
2381}
2382
2383static void __net_exit tcp4_proc_exit_net(struct net *net)
2384{
2385        tcp_proc_unregister(net, &tcp4_seq_afinfo);
2386}
2387
2388static struct pernet_operations tcp4_net_ops = {
2389        .init = tcp4_proc_init_net,
2390        .exit = tcp4_proc_exit_net,
2391};
2392
2393int __init tcp4_proc_init(void)
2394{
2395        return register_pernet_subsys(&tcp4_net_ops);
2396}
2397
2398void tcp4_proc_exit(void)
2399{
2400        unregister_pernet_subsys(&tcp4_net_ops);
2401}
2402#endif /* CONFIG_PROC_FS */
2403
2404struct proto tcp_prot = {
2405        .name                   = "TCP",
2406        .owner                  = THIS_MODULE,
2407        .close                  = tcp_close,
2408        .connect                = tcp_v4_connect,
2409        .disconnect             = tcp_disconnect,
2410        .accept                 = inet_csk_accept,
2411        .ioctl                  = tcp_ioctl,
2412        .init                   = tcp_v4_init_sock,
2413        .destroy                = tcp_v4_destroy_sock,
2414        .shutdown               = tcp_shutdown,
2415        .setsockopt             = tcp_setsockopt,
2416        .getsockopt             = tcp_getsockopt,
2417        .keepalive              = tcp_set_keepalive,
2418        .recvmsg                = tcp_recvmsg,
2419        .sendmsg                = tcp_sendmsg,
2420        .sendpage               = tcp_sendpage,
2421        .backlog_rcv            = tcp_v4_do_rcv,
2422        .release_cb             = tcp_release_cb,
2423        .hash                   = inet_hash,
2424        .unhash                 = inet_unhash,
2425        .get_port               = inet_csk_get_port,
2426        .enter_memory_pressure  = tcp_enter_memory_pressure,
2427        .leave_memory_pressure  = tcp_leave_memory_pressure,
2428        .stream_memory_free     = tcp_stream_memory_free,
2429        .sockets_allocated      = &tcp_sockets_allocated,
2430        .orphan_count           = &tcp_orphan_count,
2431        .memory_allocated       = &tcp_memory_allocated,
2432        .memory_pressure        = &tcp_memory_pressure,
2433        .sysctl_mem             = sysctl_tcp_mem,
2434        .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2435        .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2436        .max_header             = MAX_TCP_HEADER,
2437        .obj_size               = sizeof(struct tcp_sock),
2438        .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2439        .twsk_prot              = &tcp_timewait_sock_ops,
2440        .rsk_prot               = &tcp_request_sock_ops,
2441        .h.hashinfo             = &tcp_hashinfo,
2442        .no_autobind            = true,
2443#ifdef CONFIG_COMPAT
2444        .compat_setsockopt      = compat_tcp_setsockopt,
2445        .compat_getsockopt      = compat_tcp_getsockopt,
2446#endif
2447        .diag_destroy           = tcp_abort,
2448};
2449EXPORT_SYMBOL(tcp_prot);
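
/* Editor's note -- illustrative sketch, not part of tcp_ipv4.c: the
 * struct proto hooks above are what the generic socket layer dispatches to
 * for AF_INET/SOCK_STREAM sockets, e.g. connect(2) reaches .connect
 * (tcp_v4_connect) and close(2) reaches .close (tcp_close).  A minimal
 * user-space program exercising that path (the destination address is just
 * an example; build separately):
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
        /* socket() -> inet_create() selects tcp_prot for SOCK_STREAM. */
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        struct sockaddr_in dst;

        if (fd < 0) {
                perror("socket");
                return 1;
        }
        memset(&dst, 0, sizeof(dst));
        dst.sin_family = AF_INET;
        dst.sin_port = htons(80);
        inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);  /* example address */

        /* connect() ends up in tcp_prot.connect == tcp_v4_connect(). */
        if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
                perror("connect");

        /* close() ends up in tcp_prot.close == tcp_close(). */
        close(fd);
        return 0;
}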
2450
2451static void __net_exit tcp_sk_exit(struct net *net)
2452{
2453        int cpu;
2454
2455        module_put(net->ipv4.tcp_congestion_control->owner);
2456
2457        for_each_possible_cpu(cpu)
2458                inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2459        free_percpu(net->ipv4.tcp_sk);
2460}
2461
2462static int __net_init tcp_sk_init(struct net *net)
2463{
2464        int res, cpu, cnt;
2465
2466        net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2467        if (!net->ipv4.tcp_sk)
2468                return -ENOMEM;
2469
2470        for_each_possible_cpu(cpu) {
2471                struct sock *sk;
2472
2473                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2474                                           IPPROTO_TCP, net);
2475                if (res)
2476                        goto fail;
2477                sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2478                *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2479        }
2480
2481        net->ipv4.sysctl_tcp_ecn = 2;
2482        net->ipv4.sysctl_tcp_ecn_fallback = 1;
2483
2484        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2485        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2486        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2487
2488        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2489        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2490        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2491
2492        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2493        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2494        net->ipv4.sysctl_tcp_syncookies = 1;
2495        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2496        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2497        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2498        net->ipv4.sysctl_tcp_orphan_retries = 0;
2499        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2500        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2501        net->ipv4.sysctl_tcp_tw_reuse = 0;
2502
2503        cnt = tcp_hashinfo.ehash_mask + 1;
2504        net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2505        net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2506
2507        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2508        net->ipv4.sysctl_tcp_sack = 1;
2509        net->ipv4.sysctl_tcp_window_scaling = 1;
2510        net->ipv4.sysctl_tcp_timestamps = 1;
2511        net->ipv4.sysctl_tcp_early_retrans = 3;
2512        net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2513        net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2514        net->ipv4.sysctl_tcp_retrans_collapse = 1;
2515        net->ipv4.sysctl_tcp_max_reordering = 300;
2516        net->ipv4.sysctl_tcp_dsack = 1;
2517        net->ipv4.sysctl_tcp_app_win = 31;
2518        net->ipv4.sysctl_tcp_adv_win_scale = 1;
2519        net->ipv4.sysctl_tcp_frto = 2;
2520        net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2521        /* This limits the percentage of the congestion window which we
2522         * will allow a single TSO frame to consume.  Building TSO frames
2523         * which are too large can cause TCP streams to be bursty.
2524         */
2525        net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2526        /* Default TSQ limit of four TSO segments */
2527        net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2528        /* rfc5961 challenge ack rate limiting */
2529        net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2530        net->ipv4.sysctl_tcp_min_tso_segs = 2;
2531        net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2532        net->ipv4.sysctl_tcp_autocorking = 1;
2533        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2534        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2535        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2536        if (net != &init_net) {
2537                memcpy(net->ipv4.sysctl_tcp_rmem,
2538                       init_net.ipv4.sysctl_tcp_rmem,
2539                       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2540                memcpy(net->ipv4.sysctl_tcp_wmem,
2541                       init_net.ipv4.sysctl_tcp_wmem,
2542                       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2543        }
2544        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2545        spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2546        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2547        atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2548
2549        /* Reno is always built in */
2550        if (!net_eq(net, &init_net) &&
2551            try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2552                net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2553        else
2554                net->ipv4.tcp_congestion_control = &tcp_reno;
2555
2556        return 0;
2557fail:
2558        tcp_sk_exit(net);
2559
2560        return res;
2561}
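
/* Editor's note -- illustrative sketch, not part of tcp_ipv4.c: the
 * per-netns defaults initialized above are exposed as files under
 * /proc/sys/net/ipv4/.  A small user-space reader for one of them
 * (build separately):
 */
#include <stdio.h>

int main(void)
{
        char buf[64];
        /* Reflects net->ipv4.sysctl_tcp_syncookies set in tcp_sk_init(). */
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("tcp_syncookies = %s", buf);
        fclose(f);
        return 0;
}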
2562
2563static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2564{
2565        struct net *net;
2566
2567        inet_twsk_purge(&tcp_hashinfo, AF_INET);
2568
2569        list_for_each_entry(net, net_exit_list, exit_list)
2570                tcp_fastopen_ctx_destroy(net);
2571}
2572
2573static struct pernet_operations __net_initdata tcp_sk_ops = {
2574       .init       = tcp_sk_init,
2575       .exit       = tcp_sk_exit,
2576       .exit_batch = tcp_sk_exit_batch,
2577};
2578
2579void __init tcp_v4_init(void)
2580{
2581        if (register_pernet_subsys(&tcp_sk_ops))
2582                panic("Failed to create the TCP control socket.\n");
2583}
2584