linux/net/ipv4/tcp_ipv4.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 *              IPv4 specific functions
   9 *
  10 *
  11 *              code split from:
  12 *              linux/ipv4/tcp.c
  13 *              linux/ipv4/tcp_input.c
  14 *              linux/ipv4/tcp_output.c
  15 *
  16 *              See tcp.c for author information
  17 *
  18 *      This program is free software; you can redistribute it and/or
  19 *      modify it under the terms of the GNU General Public License
  20 *      as published by the Free Software Foundation; either version
  21 *      2 of the License, or (at your option) any later version.
  22 */
  23
  24/*
  25 * Changes:
  26 *              David S. Miller :       New socket lookup architecture.
  27 *                                      This code is dedicated to John Dyson.
  28 *              David S. Miller :       Change semantics of established hash,
  29 *                                      half is devoted to TIME_WAIT sockets
  30 *                                      and the rest go in the other half.
  31 *              Andi Kleen :            Add support for syncookies and fixed
  32 *                                      some bugs: ip options weren't passed to
  33 *                                      the TCP layer, missed a check for an
  34 *                                      ACK bit.
  35 *              Andi Kleen :            Implemented fast path mtu discovery.
  36 *                                      Fixed many serious bugs in the
  37 *                                      request_sock handling and moved
  38 *                                      most of it into the af independent code.
  39 *                                      Added tail drop and some other bugfixes.
  40 *                                      Added new listen semantics.
  41 *              Mike McLagan    :       Routing by source
  42 *      Juan Jose Ciarlante:            ip_dynaddr bits
  43 *              Andi Kleen:             various fixes.
  44 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45 *                                      coma.
  46 *      Andi Kleen              :       Fix new listen.
  47 *      Andi Kleen              :       Fix accept error reporting.
  48 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50 *                                      a single port at the same time.
  51 */
  52
  53#define pr_fmt(fmt) "TCP: " fmt
  54
  55#include <linux/bottom_half.h>
  56#include <linux/types.h>
  57#include <linux/fcntl.h>
  58#include <linux/module.h>
  59#include <linux/random.h>
  60#include <linux/cache.h>
  61#include <linux/jhash.h>
  62#include <linux/init.h>
  63#include <linux/times.h>
  64#include <linux/slab.h>
  65
  66#include <net/net_namespace.h>
  67#include <net/icmp.h>
  68#include <net/inet_hashtables.h>
  69#include <net/tcp.h>
  70#include <net/transp_v6.h>
  71#include <net/ipv6.h>
  72#include <net/inet_common.h>
  73#include <net/timewait_sock.h>
  74#include <net/xfrm.h>
  75#include <net/secure_seq.h>
  76#include <net/tcp_memcontrol.h>
  77#include <net/busy_poll.h>
  78
  79#include <linux/inet.h>
  80#include <linux/ipv6.h>
  81#include <linux/stddef.h>
  82#include <linux/proc_fs.h>
  83#include <linux/seq_file.h>
  84
  85#include <linux/crypto.h>
  86#include <linux/scatterlist.h>
  87
  88int sysctl_tcp_tw_reuse __read_mostly;
  89int sysctl_tcp_low_latency __read_mostly;
  90EXPORT_SYMBOL(sysctl_tcp_low_latency);
  91
  92#ifdef CONFIG_TCP_MD5SIG
  93static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  94                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
  95#endif
  96
  97struct inet_hashinfo tcp_hashinfo;
  98EXPORT_SYMBOL(tcp_hashinfo);
  99
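/* Initial sequence numbers are derived from a keyed hash of the connection
 * 4-tuple plus a clock component (see secure_tcp_sequence_number()), in the
 * spirit of RFC 6528, so off-path attackers cannot easily predict them.
 */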
 100static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
 101{
 102        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 103                                          ip_hdr(skb)->saddr,
 104                                          tcp_hdr(skb)->dest,
 105                                          tcp_hdr(skb)->source);
 106}
 107
 108int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 109{
 110        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 111        struct tcp_sock *tp = tcp_sk(sk);
 112
 113        /* With PAWS, it is safe from the viewpoint
 114           of data integrity. Even without PAWS it is safe provided sequence
 115           spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 116
  117           Actually, the idea is close to VJ's one, only the timestamp cache is
  118           held not per host but per port pair, and the TW bucket is used as the
  119           state holder.
  120
  121           If the TW bucket has already been destroyed, we fall back to VJ's
  122           scheme and use the initial timestamp retrieved from the peer table.
 123         */
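        /* Roughly: the TIME-WAIT port pair may be reused only if the old
         * socket recorded timestamps, and either the caller passed twp == NULL
         * or tcp_tw_reuse is set and more than one second has passed since the
         * last seen timestamp.  write_seq is then placed well past tw_snd_nxt
         * (65535 + 2), presumably so segments from the old incarnation cannot
         * be confused with the new one.
         */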
 124        if (tcptw->tw_ts_recent_stamp &&
 125            (twp == NULL || (sysctl_tcp_tw_reuse &&
 126                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 127                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 128                if (tp->write_seq == 0)
 129                        tp->write_seq = 1;
 130                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 131                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 132                sock_hold(sktw);
 133                return 1;
 134        }
 135
 136        return 0;
 137}
 138EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 139
 140/* This will initiate an outgoing connection. */
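/* Normally reached from connect(2) on an IPv4 TCP socket via
 * inet_stream_connect() -> sk->sk_prot->connect, with the socket lock held.
 */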
 141int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 142{
 143        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 144        struct inet_sock *inet = inet_sk(sk);
 145        struct tcp_sock *tp = tcp_sk(sk);
 146        __be16 orig_sport, orig_dport;
 147        __be32 daddr, nexthop;
 148        struct flowi4 *fl4;
 149        struct rtable *rt;
 150        int err;
 151        struct ip_options_rcu *inet_opt;
 152
 153        if (addr_len < sizeof(struct sockaddr_in))
 154                return -EINVAL;
 155
 156        if (usin->sin_family != AF_INET)
 157                return -EAFNOSUPPORT;
 158
 159        nexthop = daddr = usin->sin_addr.s_addr;
 160        inet_opt = rcu_dereference_protected(inet->inet_opt,
 161                                             sock_owned_by_user(sk));
 162        if (inet_opt && inet_opt->opt.srr) {
 163                if (!daddr)
 164                        return -EINVAL;
 165                nexthop = inet_opt->opt.faddr;
 166        }
 167
 168        orig_sport = inet->inet_sport;
 169        orig_dport = usin->sin_port;
 170        fl4 = &inet->cork.fl.u.ip4;
 171        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 172                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 173                              IPPROTO_TCP,
 174                              orig_sport, orig_dport, sk);
 175        if (IS_ERR(rt)) {
 176                err = PTR_ERR(rt);
 177                if (err == -ENETUNREACH)
 178                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 179                return err;
 180        }
 181
 182        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 183                ip_rt_put(rt);
 184                return -ENETUNREACH;
 185        }
 186
 187        if (!inet_opt || !inet_opt->opt.srr)
 188                daddr = fl4->daddr;
 189
 190        if (!inet->inet_saddr)
 191                inet->inet_saddr = fl4->saddr;
 192        inet->inet_rcv_saddr = inet->inet_saddr;
 193
 194        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 195                /* Reset inherited state */
 196                tp->rx_opt.ts_recent       = 0;
 197                tp->rx_opt.ts_recent_stamp = 0;
 198                if (likely(!tp->repair))
 199                        tp->write_seq      = 0;
 200        }
 201
 202        if (tcp_death_row.sysctl_tw_recycle &&
 203            !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
 204                tcp_fetch_timewait_stamp(sk, &rt->dst);
 205
 206        inet->inet_dport = usin->sin_port;
 207        inet->inet_daddr = daddr;
 208
 209        inet_csk(sk)->icsk_ext_hdr_len = 0;
 210        if (inet_opt)
 211                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 212
 213        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 214
  215        /* Socket identity is still unknown (sport may be zero).
  216         * However, we set the state to SYN-SENT and, without releasing the
  217         * socket lock, select a source port, enter ourselves into the hash
  218         * tables and complete initialization after this.
  219         */
 220        tcp_set_state(sk, TCP_SYN_SENT);
 221        err = inet_hash_connect(&tcp_death_row, sk);
 222        if (err)
 223                goto failure;
 224
 225        inet_set_txhash(sk);
 226
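        /* Re-validate the route now that a real source port has been chosen;
         * the flow may differ from the one used for the initial lookup above.
         */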
 227        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 228                               inet->inet_sport, inet->inet_dport, sk);
 229        if (IS_ERR(rt)) {
 230                err = PTR_ERR(rt);
 231                rt = NULL;
 232                goto failure;
 233        }
 234        /* OK, now commit destination to socket.  */
 235        sk->sk_gso_type = SKB_GSO_TCPV4;
 236        sk_setup_caps(sk, &rt->dst);
 237
 238        if (!tp->write_seq && likely(!tp->repair))
 239                tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 240                                                           inet->inet_daddr,
 241                                                           inet->inet_sport,
 242                                                           usin->sin_port);
 243
 244        inet->inet_id = tp->write_seq ^ jiffies;
 245
 246        err = tcp_connect(sk);
 247
 248        rt = NULL;
 249        if (err)
 250                goto failure;
 251
 252        return 0;
 253
 254failure:
 255        /*
 256         * This unhashes the socket and releases the local port,
 257         * if necessary.
 258         */
 259        tcp_set_state(sk, TCP_CLOSE);
 260        ip_rt_put(rt);
 261        sk->sk_route_caps = 0;
 262        inet->inet_dport = 0;
 263        return err;
 264}
 265EXPORT_SYMBOL(tcp_v4_connect);
 266
 267/*
 268 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 269 * It can be called through tcp_release_cb() if socket was owned by user
 270 * at the time tcp_v4_err() was called to handle ICMP message.
 271 */
 272void tcp_v4_mtu_reduced(struct sock *sk)
 273{
 274        struct dst_entry *dst;
 275        struct inet_sock *inet = inet_sk(sk);
 276        u32 mtu = tcp_sk(sk)->mtu_info;
 277
 278        dst = inet_csk_update_pmtu(sk, mtu);
 279        if (!dst)
 280                return;
 281
  282        /* Something is about to go wrong... Remember the soft error
  283         * in case this connection is not able to recover.
  284         */
 285        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 286                sk->sk_err_soft = EMSGSIZE;
 287
 288        mtu = dst_mtu(dst);
 289
 290        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 291            ip_sk_accept_pmtu(sk) &&
 292            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 293                tcp_sync_mss(sk, mtu);
 294
 295                /* Resend the TCP packet because it's
 296                 * clear that the old packet has been
 297                 * dropped. This is the new "fast" path mtu
 298                 * discovery.
 299                 */
 300                tcp_simple_retransmit(sk);
 301        } /* else let the usual retransmit timer handle it */
 302}
 303EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 304
 305static void do_redirect(struct sk_buff *skb, struct sock *sk)
 306{
 307        struct dst_entry *dst = __sk_dst_check(sk, 0);
 308
 309        if (dst)
 310                dst->ops->redirect(dst, sk, skb);
 311}
 312
 313/*
 314 * This routine is called by the ICMP module when it gets some
 315 * sort of error condition.  If err < 0 then the socket should
 316 * be closed and the error returned to the user.  If err > 0
 317 * it's just the icmp type << 8 | icmp code.  After adjustment
 318 * header points to the first 8 bytes of the tcp header.  We need
 319 * to find the appropriate port.
 320 *
 321 * The locking strategy used here is very "optimistic". When
 322 * someone else accesses the socket the ICMP is just dropped
 323 * and for some paths there is no check at all.
 324 * A more general error queue to queue errors for later handling
 325 * is probably better.
 326 *
 327 */
 328
 329void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 330{
 331        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 332        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 333        struct inet_connection_sock *icsk;
 334        struct tcp_sock *tp;
 335        struct inet_sock *inet;
 336        const int type = icmp_hdr(icmp_skb)->type;
 337        const int code = icmp_hdr(icmp_skb)->code;
 338        struct sock *sk;
 339        struct sk_buff *skb;
 340        struct request_sock *fastopen;
 341        __u32 seq, snd_una;
 342        __u32 remaining;
 343        int err;
 344        struct net *net = dev_net(icmp_skb->dev);
 345
 346        sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 347                        iph->saddr, th->source, inet_iif(icmp_skb));
 348        if (!sk) {
 349                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 350                return;
 351        }
 352        if (sk->sk_state == TCP_TIME_WAIT) {
 353                inet_twsk_put(inet_twsk(sk));
 354                return;
 355        }
 356
 357        bh_lock_sock(sk);
 358        /* If too many ICMPs get dropped on busy
 359         * servers this needs to be solved differently.
  360         * We do take care of the PMTU discovery (RFC 1191) special case:
  361         * we can receive locally generated ICMP messages while the socket is held.
 362         */
 363        if (sock_owned_by_user(sk)) {
 364                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 365                        NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 366        }
 367        if (sk->sk_state == TCP_CLOSE)
 368                goto out;
 369
 370        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 371                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 372                goto out;
 373        }
 374
 375        icsk = inet_csk(sk);
 376        tp = tcp_sk(sk);
 377        seq = ntohl(th->seq);
 378        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 379        fastopen = tp->fastopen_rsk;
 380        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 381        if (sk->sk_state != TCP_LISTEN &&
 382            !between(seq, snd_una, tp->snd_nxt)) {
 383                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 384                goto out;
 385        }
 386
 387        switch (type) {
 388        case ICMP_REDIRECT:
 389                do_redirect(icmp_skb, sk);
 390                goto out;
 391        case ICMP_SOURCE_QUENCH:
 392                /* Just silently ignore these. */
 393                goto out;
 394        case ICMP_PARAMETERPROB:
 395                err = EPROTO;
 396                break;
 397        case ICMP_DEST_UNREACH:
 398                if (code > NR_ICMP_UNREACH)
 399                        goto out;
 400
 401                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 402                        /* We are not interested in TCP_LISTEN and open_requests
  403                         * (SYN-ACKs sent out by Linux are always < 576 bytes so
 404                         * they should go through unfragmented).
 405                         */
 406                        if (sk->sk_state == TCP_LISTEN)
 407                                goto out;
 408
 409                        tp->mtu_info = info;
 410                        if (!sock_owned_by_user(sk)) {
 411                                tcp_v4_mtu_reduced(sk);
 412                        } else {
 413                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
 414                                        sock_hold(sk);
 415                        }
 416                        goto out;
 417                }
 418
 419                err = icmp_err_convert[code].errno;
 420                /* check if icmp_skb allows revert of backoff
 421                 * (see draft-zimmermann-tcp-lcd) */
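                /* If this ICMP refers to the earliest unacked segment and we
                 * are in exponential backoff, undo one backoff step and re-arm
                 * the retransmit timer with whatever remains of the shortened
                 * RTO, retransmitting immediately if nothing remains.
                 */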
 422                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 423                        break;
 424                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 425                    !icsk->icsk_backoff || fastopen)
 426                        break;
 427
 428                if (sock_owned_by_user(sk))
 429                        break;
 430
 431                icsk->icsk_backoff--;
 432                icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 433                                               TCP_TIMEOUT_INIT;
 434                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 435
 436                skb = tcp_write_queue_head(sk);
 437                BUG_ON(!skb);
 438
 439                remaining = icsk->icsk_rto -
 440                            min(icsk->icsk_rto,
 441                                tcp_time_stamp - tcp_skb_timestamp(skb));
 442
 443                if (remaining) {
 444                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 445                                                  remaining, TCP_RTO_MAX);
 446                } else {
  447                        /* The reverted RTO has already expired;
  448                         * retransmit immediately. */
 449                        tcp_retransmit_timer(sk);
 450                }
 451
 452                break;
 453        case ICMP_TIME_EXCEEDED:
 454                err = EHOSTUNREACH;
 455                break;
 456        default:
 457                goto out;
 458        }
 459
 460        switch (sk->sk_state) {
 461                struct request_sock *req, **prev;
 462        case TCP_LISTEN:
 463                if (sock_owned_by_user(sk))
 464                        goto out;
 465
 466                req = inet_csk_search_req(sk, &prev, th->dest,
 467                                          iph->daddr, iph->saddr);
 468                if (!req)
 469                        goto out;
 470
 471                /* ICMPs are not backlogged, hence we cannot get
 472                   an established socket here.
 473                 */
 474                WARN_ON(req->sk);
 475
 476                if (seq != tcp_rsk(req)->snt_isn) {
 477                        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 478                        goto out;
 479                }
 480
 481                /*
 482                 * Still in SYN_RECV, just remove it silently.
 483                 * There is no good way to pass the error to the newly
 484                 * created socket, and POSIX does not want network
 485                 * errors returned from accept().
 486                 */
 487                inet_csk_reqsk_queue_drop(sk, req, prev);
 488                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
 489                goto out;
 490
 491        case TCP_SYN_SENT:
 492        case TCP_SYN_RECV:
  493                /* Only in fast or simultaneous open. If a fast open socket
  494                 * is already accepted, it is treated as a connected one below.
  495                 */
 496                if (fastopen && fastopen->sk == NULL)
 497                        break;
 498
 499                if (!sock_owned_by_user(sk)) {
 500                        sk->sk_err = err;
 501
 502                        sk->sk_error_report(sk);
 503
 504                        tcp_done(sk);
 505                } else {
 506                        sk->sk_err_soft = err;
 507                }
 508                goto out;
 509        }
 510
 511        /* If we've already connected we will keep trying
 512         * until we time out, or the user gives up.
 513         *
  514         * RFC 1122 4.2.3.9 allows us to consider as hard errors
  515         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
  516         * but it is obsoleted by PMTU discovery).
  517         *
  518         * Note that on the modern internet, where routing is unreliable
  519         * and broken firewalls sit in every dark corner sending random
  520         * errors ordered by their masters, even these two messages have
  521         * lost their original sense (even Linux sends invalid PORT_UNREACHs).
 522         *
 523         * Now we are in compliance with RFCs.
 524         *                                                      --ANK (980905)
 525         */
 526
 527        inet = inet_sk(sk);
 528        if (!sock_owned_by_user(sk) && inet->recverr) {
 529                sk->sk_err = err;
 530                sk->sk_error_report(sk);
 531        } else  { /* Only an error on timeout */
 532                sk->sk_err_soft = err;
 533        }
 534
 535out:
 536        bh_unlock_sock(sk);
 537        sock_put(sk);
 538}
 539
 540void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 541{
 542        struct tcphdr *th = tcp_hdr(skb);
 543
 544        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 545                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 546                skb->csum_start = skb_transport_header(skb) - skb->head;
 547                skb->csum_offset = offsetof(struct tcphdr, check);
 548        } else {
 549                th->check = tcp_v4_check(skb->len, saddr, daddr,
 550                                         csum_partial(th,
 551                                                      th->doff << 2,
 552                                                      skb->csum));
 553        }
 554}
 555
 556/* This routine computes an IPv4 TCP checksum. */
 557void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 558{
 559        const struct inet_sock *inet = inet_sk(sk);
 560
 561        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 562}
 563EXPORT_SYMBOL(tcp_v4_send_check);
 564
 565/*
 566 *      This routine will send an RST to the other tcp.
 567 *
  568 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
  569 *                    for the reset?
  570 *      Answer: if a packet caused the RST, it is not for a socket
  571 *              existing in our system; if it does match a socket,
  572 *              it is just a duplicate segment or a bug in the other side's TCP.
  573 *              So we build the reply based only on the parameters that
  574 *              arrived with the segment.
  575 *      Exception: precedence violation. We do not implement it in any case.
 576 */
 577
 578static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 579{
 580        const struct tcphdr *th = tcp_hdr(skb);
 581        struct {
 582                struct tcphdr th;
 583#ifdef CONFIG_TCP_MD5SIG
 584                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 585#endif
 586        } rep;
 587        struct ip_reply_arg arg;
 588#ifdef CONFIG_TCP_MD5SIG
 589        struct tcp_md5sig_key *key;
 590        const __u8 *hash_location = NULL;
 591        unsigned char newhash[16];
 592        int genhash;
 593        struct sock *sk1 = NULL;
 594#endif
 595        struct net *net;
 596
 597        /* Never send a reset in response to a reset. */
 598        if (th->rst)
 599                return;
 600
  601        /* If sk is not NULL, it means we did a successful lookup and the
  602         * incoming route had to be correct. prequeue might have dropped our dst.
 603         */
 604        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 605                return;
 606
 607        /* Swap the send and the receive. */
 608        memset(&rep, 0, sizeof(rep));
 609        rep.th.dest   = th->source;
 610        rep.th.source = th->dest;
 611        rep.th.doff   = sizeof(struct tcphdr) / 4;
 612        rep.th.rst    = 1;
 613
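        /* Per RFC 793 reset generation: if the offending segment carried an
         * ACK, the RST takes its sequence number from that ACK field;
         * otherwise we send SEQ=0 with an ACK covering the received segment.
         */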
 614        if (th->ack) {
 615                rep.th.seq = th->ack_seq;
 616        } else {
 617                rep.th.ack = 1;
 618                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 619                                       skb->len - (th->doff << 2));
 620        }
 621
 622        memset(&arg, 0, sizeof(arg));
 623        arg.iov[0].iov_base = (unsigned char *)&rep;
 624        arg.iov[0].iov_len  = sizeof(rep.th);
 625
 626        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 627#ifdef CONFIG_TCP_MD5SIG
 628        hash_location = tcp_parse_md5sig_option(th);
 629        if (!sk && hash_location) {
  630                /*
  631                 * The active side is lost. Try to find the listening socket
  632                 * through the source port, and then find the md5 key through
  633                 * the listening socket. We do not lose security here:
  634                 * the incoming packet is checked against the md5 hash with the
  635                 * found key; no RST is generated if the md5 hash doesn't match.
  636                 */
 637                sk1 = __inet_lookup_listener(net,
 638                                             &tcp_hashinfo, ip_hdr(skb)->saddr,
 639                                             th->source, ip_hdr(skb)->daddr,
 640                                             ntohs(th->source), inet_iif(skb));
  641                /* don't send a RST if we can't find the key */
 642                if (!sk1)
 643                        return;
 644                rcu_read_lock();
 645                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 646                                        &ip_hdr(skb)->saddr, AF_INET);
 647                if (!key)
 648                        goto release_sk1;
 649
 650                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
 651                if (genhash || memcmp(hash_location, newhash, 16) != 0)
 652                        goto release_sk1;
 653        } else {
 654                key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 655                                             &ip_hdr(skb)->saddr,
 656                                             AF_INET) : NULL;
 657        }
 658
 659        if (key) {
 660                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 661                                   (TCPOPT_NOP << 16) |
 662                                   (TCPOPT_MD5SIG << 8) |
 663                                   TCPOLEN_MD5SIG);
 664                /* Update length and the length the header thinks exists */
 665                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 666                rep.th.doff = arg.iov[0].iov_len / 4;
 667
 668                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 669                                     key, ip_hdr(skb)->saddr,
 670                                     ip_hdr(skb)->daddr, &rep.th);
 671        }
 672#endif
 673        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 674                                      ip_hdr(skb)->saddr, /* XXX */
 675                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 676        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 677        arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
  678        /* When the socket is gone, all binding information is lost and
  679         * routing might fail. No choice here: if we force the input
  680         * interface, we will misroute in the case of an asymmetric route.
  681         */
 682        if (sk)
 683                arg.bound_dev_if = sk->sk_bound_dev_if;
 684
 685        arg.tos = ip_hdr(skb)->tos;
 686        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
 687                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 688                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 689                              &arg, arg.iov[0].iov_len);
 690
 691        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 692        TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 693
 694#ifdef CONFIG_TCP_MD5SIG
 695release_sk1:
 696        if (sk1) {
 697                rcu_read_unlock();
 698                sock_put(sk1);
 699        }
 700#endif
 701}
 702
  703/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
  704   outside of socket context, is ugly, certainly. What can I do?
  705 */
 706
 707static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 708                            u32 win, u32 tsval, u32 tsecr, int oif,
 709                            struct tcp_md5sig_key *key,
 710                            int reply_flags, u8 tos)
 711{
 712        const struct tcphdr *th = tcp_hdr(skb);
 713        struct {
 714                struct tcphdr th;
 715                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 716#ifdef CONFIG_TCP_MD5SIG
 717                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 718#endif
 719                        ];
 720        } rep;
 721        struct ip_reply_arg arg;
 722        struct net *net = dev_net(skb_dst(skb)->dev);
 723
 724        memset(&rep.th, 0, sizeof(struct tcphdr));
 725        memset(&arg, 0, sizeof(arg));
 726
 727        arg.iov[0].iov_base = (unsigned char *)&rep;
 728        arg.iov[0].iov_len  = sizeof(rep.th);
 729        if (tsecr) {
 730                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 731                                   (TCPOPT_TIMESTAMP << 8) |
 732                                   TCPOLEN_TIMESTAMP);
 733                rep.opt[1] = htonl(tsval);
 734                rep.opt[2] = htonl(tsecr);
 735                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 736        }
 737
 738        /* Swap the send and the receive. */
 739        rep.th.dest    = th->source;
 740        rep.th.source  = th->dest;
 741        rep.th.doff    = arg.iov[0].iov_len / 4;
 742        rep.th.seq     = htonl(seq);
 743        rep.th.ack_seq = htonl(ack);
 744        rep.th.ack     = 1;
 745        rep.th.window  = htons(win);
 746
 747#ifdef CONFIG_TCP_MD5SIG
 748        if (key) {
 749                int offset = (tsecr) ? 3 : 0;
 750
 751                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 752                                          (TCPOPT_NOP << 16) |
 753                                          (TCPOPT_MD5SIG << 8) |
 754                                          TCPOLEN_MD5SIG);
 755                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 756                rep.th.doff = arg.iov[0].iov_len/4;
 757
 758                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 759                                    key, ip_hdr(skb)->saddr,
 760                                    ip_hdr(skb)->daddr, &rep.th);
 761        }
 762#endif
 763        arg.flags = reply_flags;
 764        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 765                                      ip_hdr(skb)->saddr, /* XXX */
 766                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 767        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 768        if (oif)
 769                arg.bound_dev_if = oif;
 770        arg.tos = tos;
 771        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
 772                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 773                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 774                              &arg, arg.iov[0].iov_len);
 775
 776        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 777}
 778
 779static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 780{
 781        struct inet_timewait_sock *tw = inet_twsk(sk);
 782        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 783
 784        tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 785                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 786                        tcp_time_stamp + tcptw->tw_ts_offset,
 787                        tcptw->tw_ts_recent,
 788                        tw->tw_bound_dev_if,
 789                        tcp_twsk_md5_key(tcptw),
 790                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 791                        tw->tw_tos
 792                        );
 793
 794        inet_twsk_put(tw);
 795}
 796
 797static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 798                                  struct request_sock *req)
 799{
 800        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 801         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 802         */
 803        tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
 804                        tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
 805                        tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
 806                        tcp_time_stamp,
 807                        req->ts_recent,
 808                        0,
 809                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
 810                                          AF_INET),
 811                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 812                        ip_hdr(skb)->tos);
 813}
 814
 815/*
 816 *      Send a SYN-ACK after having received a SYN.
 817 *      This still operates on a request_sock only, not on a big
 818 *      socket.
 819 */
 820static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 821                              struct flowi *fl,
 822                              struct request_sock *req,
 823                              u16 queue_mapping,
 824                              struct tcp_fastopen_cookie *foc)
 825{
 826        const struct inet_request_sock *ireq = inet_rsk(req);
 827        struct flowi4 fl4;
 828        int err = -1;
 829        struct sk_buff *skb;
 830
 831        /* First, grab a route. */
 832        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 833                return -1;
 834
 835        skb = tcp_make_synack(sk, dst, req, foc);
 836
 837        if (skb) {
 838                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 839
 840                skb_set_queue_mapping(skb, queue_mapping);
 841                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 842                                            ireq->ir_rmt_addr,
 843                                            ireq->opt);
 844                err = net_xmit_eval(err);
 845        }
 846
 847        return err;
 848}
 849
 850/*
 851 *      IPv4 request_sock destructor.
 852 */
 853static void tcp_v4_reqsk_destructor(struct request_sock *req)
 854{
 855        kfree(inet_rsk(req)->opt);
 856}
 857
 858/*
 859 * Return true if a syncookie should be sent
 860 */
 861bool tcp_syn_flood_action(struct sock *sk,
 862                         const struct sk_buff *skb,
 863                         const char *proto)
 864{
 865        const char *msg = "Dropping request";
 866        bool want_cookie = false;
 867        struct listen_sock *lopt;
 868
 869#ifdef CONFIG_SYN_COOKIES
 870        if (sysctl_tcp_syncookies) {
 871                msg = "Sending cookies";
 872                want_cookie = true;
 873                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
 874        } else
 875#endif
 876                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
 877
 878        lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
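        /* sysctl_tcp_syncookies == 2 means cookies are sent unconditionally,
         * so the flood warning below is suppressed in that mode.
         */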
 879        if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
 880                lopt->synflood_warned = 1;
 881                pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
 882                        proto, ntohs(tcp_hdr(skb)->dest), msg);
 883        }
 884        return want_cookie;
 885}
 886EXPORT_SYMBOL(tcp_syn_flood_action);
 887
 888#ifdef CONFIG_TCP_MD5SIG
 889/*
 890 * RFC2385 MD5 checksumming requires a mapping of
 891 * IP address->MD5 Key.
 892 * We need to maintain these in the sk structure.
 893 */
 894
 895/* Find the Key structure for an address.  */
 896struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
 897                                         const union tcp_md5_addr *addr,
 898                                         int family)
 899{
 900        struct tcp_sock *tp = tcp_sk(sk);
 901        struct tcp_md5sig_key *key;
 902        unsigned int size = sizeof(struct in_addr);
 903        struct tcp_md5sig_info *md5sig;
 904
 905        /* caller either holds rcu_read_lock() or socket lock */
 906        md5sig = rcu_dereference_check(tp->md5sig_info,
 907                                       sock_owned_by_user(sk) ||
 908                                       lockdep_is_held(&sk->sk_lock.slock));
 909        if (!md5sig)
 910                return NULL;
 911#if IS_ENABLED(CONFIG_IPV6)
 912        if (family == AF_INET6)
 913                size = sizeof(struct in6_addr);
 914#endif
 915        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 916                if (key->family != family)
 917                        continue;
 918                if (!memcmp(&key->addr, addr, size))
 919                        return key;
 920        }
 921        return NULL;
 922}
 923EXPORT_SYMBOL(tcp_md5_do_lookup);
 924
 925struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 926                                         struct sock *addr_sk)
 927{
 928        union tcp_md5_addr *addr;
 929
 930        addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
 931        return tcp_md5_do_lookup(sk, addr, AF_INET);
 932}
 933EXPORT_SYMBOL(tcp_v4_md5_lookup);
 934
 935static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 936                                                      struct request_sock *req)
 937{
 938        union tcp_md5_addr *addr;
 939
 940        addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
 941        return tcp_md5_do_lookup(sk, addr, AF_INET);
 942}
 943
 944/* This can be called on a newly created socket, from other files */
 945int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 946                   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
 947{
 948        /* Add Key to the list */
 949        struct tcp_md5sig_key *key;
 950        struct tcp_sock *tp = tcp_sk(sk);
 951        struct tcp_md5sig_info *md5sig;
 952
 953        key = tcp_md5_do_lookup(sk, addr, family);
 954        if (key) {
 955                /* Pre-existing entry - just update that one. */
 956                memcpy(key->key, newkey, newkeylen);
 957                key->keylen = newkeylen;
 958                return 0;
 959        }
 960
 961        md5sig = rcu_dereference_protected(tp->md5sig_info,
 962                                           sock_owned_by_user(sk));
 963        if (!md5sig) {
 964                md5sig = kmalloc(sizeof(*md5sig), gfp);
 965                if (!md5sig)
 966                        return -ENOMEM;
 967
 968                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 969                INIT_HLIST_HEAD(&md5sig->head);
 970                rcu_assign_pointer(tp->md5sig_info, md5sig);
 971        }
 972
 973        key = sock_kmalloc(sk, sizeof(*key), gfp);
 974        if (!key)
 975                return -ENOMEM;
 976        if (!tcp_alloc_md5sig_pool()) {
 977                sock_kfree_s(sk, key, sizeof(*key));
 978                return -ENOMEM;
 979        }
 980
 981        memcpy(key->key, newkey, newkeylen);
 982        key->keylen = newkeylen;
 983        key->family = family;
 984        memcpy(&key->addr, addr,
 985               (family == AF_INET6) ? sizeof(struct in6_addr) :
 986                                      sizeof(struct in_addr));
 987        hlist_add_head_rcu(&key->node, &md5sig->head);
 988        return 0;
 989}
 990EXPORT_SYMBOL(tcp_md5_do_add);
 991
 992int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
 993{
 994        struct tcp_md5sig_key *key;
 995
 996        key = tcp_md5_do_lookup(sk, addr, family);
 997        if (!key)
 998                return -ENOENT;
 999        hlist_del_rcu(&key->node);
1000        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1001        kfree_rcu(key, rcu);
1002        return 0;
1003}
1004EXPORT_SYMBOL(tcp_md5_do_del);
1005
1006static void tcp_clear_md5_list(struct sock *sk)
1007{
1008        struct tcp_sock *tp = tcp_sk(sk);
1009        struct tcp_md5sig_key *key;
1010        struct hlist_node *n;
1011        struct tcp_md5sig_info *md5sig;
1012
1013        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1014
1015        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1016                hlist_del_rcu(&key->node);
1017                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1018                kfree_rcu(key, rcu);
1019        }
1020}
1021
1022static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1023                                 int optlen)
1024{
1025        struct tcp_md5sig cmd;
1026        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1027
1028        if (optlen < sizeof(cmd))
1029                return -EINVAL;
1030
1031        if (copy_from_user(&cmd, optval, sizeof(cmd)))
1032                return -EFAULT;
1033
1034        if (sin->sin_family != AF_INET)
1035                return -EINVAL;
1036
1037        if (!cmd.tcpm_keylen)
1038                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1039                                      AF_INET);
1040
1041        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1042                return -EINVAL;
1043
1044        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1045                              AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1046                              GFP_KERNEL);
1047}
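/* A rough userspace sketch of configuring a key that the function above will
 * parse (TCP_MD5SIG socket option; address and key below are illustrative):
 *
 *	struct tcp_md5sig md5sig = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5sig.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5sig.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5sig, sizeof(md5sig));
 */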
1048
1049static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1050                                        __be32 daddr, __be32 saddr, int nbytes)
1051{
1052        struct tcp4_pseudohdr *bp;
1053        struct scatterlist sg;
1054
1055        bp = &hp->md5_blk.ip4;
1056
1057        /*
1058         * 1. the TCP pseudo-header (in the order: source IP address,
1059         * destination IP address, zero-padded protocol number, and
1060         * segment length)
1061         */
1062        bp->saddr = saddr;
1063        bp->daddr = daddr;
1064        bp->pad = 0;
1065        bp->protocol = IPPROTO_TCP;
1066        bp->len = cpu_to_be16(nbytes);
1067
1068        sg_init_one(&sg, bp, sizeof(*bp));
1069        return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1070}
1071
1072static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1073                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
1074{
1075        struct tcp_md5sig_pool *hp;
1076        struct hash_desc *desc;
1077
1078        hp = tcp_get_md5sig_pool();
1079        if (!hp)
1080                goto clear_hash_noput;
1081        desc = &hp->md5_desc;
1082
1083        if (crypto_hash_init(desc))
1084                goto clear_hash;
1085        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1086                goto clear_hash;
1087        if (tcp_md5_hash_header(hp, th))
1088                goto clear_hash;
1089        if (tcp_md5_hash_key(hp, key))
1090                goto clear_hash;
1091        if (crypto_hash_final(desc, md5_hash))
1092                goto clear_hash;
1093
1094        tcp_put_md5sig_pool();
1095        return 0;
1096
1097clear_hash:
1098        tcp_put_md5sig_pool();
1099clear_hash_noput:
1100        memset(md5_hash, 0, 16);
1101        return 1;
1102}
1103
1104int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1105                        const struct sock *sk, const struct request_sock *req,
1106                        const struct sk_buff *skb)
1107{
1108        struct tcp_md5sig_pool *hp;
1109        struct hash_desc *desc;
1110        const struct tcphdr *th = tcp_hdr(skb);
1111        __be32 saddr, daddr;
1112
1113        if (sk) {
1114                saddr = inet_sk(sk)->inet_saddr;
1115                daddr = inet_sk(sk)->inet_daddr;
1116        } else if (req) {
1117                saddr = inet_rsk(req)->ir_loc_addr;
1118                daddr = inet_rsk(req)->ir_rmt_addr;
1119        } else {
1120                const struct iphdr *iph = ip_hdr(skb);
1121                saddr = iph->saddr;
1122                daddr = iph->daddr;
1123        }
1124
1125        hp = tcp_get_md5sig_pool();
1126        if (!hp)
1127                goto clear_hash_noput;
1128        desc = &hp->md5_desc;
1129
1130        if (crypto_hash_init(desc))
1131                goto clear_hash;
1132
1133        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1134                goto clear_hash;
1135        if (tcp_md5_hash_header(hp, th))
1136                goto clear_hash;
1137        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1138                goto clear_hash;
1139        if (tcp_md5_hash_key(hp, key))
1140                goto clear_hash;
1141        if (crypto_hash_final(desc, md5_hash))
1142                goto clear_hash;
1143
1144        tcp_put_md5sig_pool();
1145        return 0;
1146
1147clear_hash:
1148        tcp_put_md5sig_pool();
1149clear_hash_noput:
1150        memset(md5_hash, 0, 16);
1151        return 1;
1152}
1153EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1154
1155static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
1156                                      const struct sk_buff *skb)
1157{
1158        /*
1159         * This gets called for each TCP segment that arrives
1160         * so we want to be efficient.
1161         * We have 3 drop cases:
1162         * o No MD5 hash and one expected.
1163         * o MD5 hash and we're not expecting one.
 1164         * o MD5 hash and it's wrong.
1165         */
1166        const __u8 *hash_location = NULL;
1167        struct tcp_md5sig_key *hash_expected;
1168        const struct iphdr *iph = ip_hdr(skb);
1169        const struct tcphdr *th = tcp_hdr(skb);
1170        int genhash;
1171        unsigned char newhash[16];
1172
1173        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1174                                          AF_INET);
1175        hash_location = tcp_parse_md5sig_option(th);
1176
1177        /* We've parsed the options - do we have a hash? */
1178        if (!hash_expected && !hash_location)
1179                return false;
1180
1181        if (hash_expected && !hash_location) {
1182                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1183                return true;
1184        }
1185
1186        if (!hash_expected && hash_location) {
1187                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1188                return true;
1189        }
1190
1191        /* Okay, so this is hash_expected and hash_location -
1192         * so we need to calculate the checksum.
1193         */
1194        genhash = tcp_v4_md5_hash_skb(newhash,
1195                                      hash_expected,
1196                                      NULL, NULL, skb);
1197
1198        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1199                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1200                                     &iph->saddr, ntohs(th->source),
1201                                     &iph->daddr, ntohs(th->dest),
1202                                     genhash ? " tcp_v4_calc_md5_hash failed"
1203                                     : "");
1204                return true;
1205        }
1206        return false;
1207}
1208
1209static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1210{
1211        bool ret;
1212
1213        rcu_read_lock();
1214        ret = __tcp_v4_inbound_md5_hash(sk, skb);
1215        rcu_read_unlock();
1216
1217        return ret;
1218}
1219
1220#endif
1221
1222static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
1223                            struct sk_buff *skb)
1224{
1225        struct inet_request_sock *ireq = inet_rsk(req);
1226
1227        ireq->ir_loc_addr = ip_hdr(skb)->daddr;
1228        ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
1229        ireq->no_srccheck = inet_sk(sk)->transparent;
1230        ireq->opt = tcp_v4_save_options(skb);
1231}
1232
1233static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
1234                                          const struct request_sock *req,
1235                                          bool *strict)
1236{
1237        struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1238
1239        if (strict) {
1240                if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1241                        *strict = true;
1242                else
1243                        *strict = false;
1244        }
1245
1246        return dst;
1247}
1248
1249struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1250        .family         =       PF_INET,
1251        .obj_size       =       sizeof(struct tcp_request_sock),
1252        .rtx_syn_ack    =       tcp_rtx_synack,
1253        .send_ack       =       tcp_v4_reqsk_send_ack,
1254        .destructor     =       tcp_v4_reqsk_destructor,
1255        .send_reset     =       tcp_v4_send_reset,
1256        .syn_ack_timeout =      tcp_syn_ack_timeout,
1257};
1258
1259static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1260        .mss_clamp      =       TCP_MSS_DEFAULT,
1261#ifdef CONFIG_TCP_MD5SIG
1262        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1263        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1264#endif
1265        .init_req       =       tcp_v4_init_req,
1266#ifdef CONFIG_SYN_COOKIES
1267        .cookie_init_seq =      cookie_v4_init_sequence,
1268#endif
1269        .route_req      =       tcp_v4_route_req,
1270        .init_seq       =       tcp_v4_init_sequence,
1271        .send_synack    =       tcp_v4_send_synack,
1272        .queue_hash_add =       inet_csk_reqsk_queue_hash_add,
1273};
1274
1275int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1276{
 1277        /* Never answer SYNs sent to broadcast or multicast addresses */
1278        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1279                goto drop;
1280
1281        return tcp_conn_request(&tcp_request_sock_ops,
1282                                &tcp_request_sock_ipv4_ops, sk, skb);
1283
1284drop:
1285        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1286        return 0;
1287}
1288EXPORT_SYMBOL(tcp_v4_conn_request);
1289
1290
1291/*
1292 * The three way handshake has completed - we got a valid synack -
1293 * now create the new socket.
1294 */
1295struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1296                                  struct request_sock *req,
1297                                  struct dst_entry *dst)
1298{
1299        struct inet_request_sock *ireq;
1300        struct inet_sock *newinet;
1301        struct tcp_sock *newtp;
1302        struct sock *newsk;
1303#ifdef CONFIG_TCP_MD5SIG
1304        struct tcp_md5sig_key *key;
1305#endif
1306        struct ip_options_rcu *inet_opt;
1307
1308        if (sk_acceptq_is_full(sk))
1309                goto exit_overflow;
1310
1311        newsk = tcp_create_openreq_child(sk, req, skb);
1312        if (!newsk)
1313                goto exit_nonewsk;
1314
1315        newsk->sk_gso_type = SKB_GSO_TCPV4;
1316        inet_sk_rx_dst_set(newsk, skb);
1317
1318        newtp                 = tcp_sk(newsk);
1319        newinet               = inet_sk(newsk);
1320        ireq                  = inet_rsk(req);
1321        newinet->inet_daddr   = ireq->ir_rmt_addr;
1322        newinet->inet_rcv_saddr = ireq->ir_loc_addr;
1323        newinet->inet_saddr           = ireq->ir_loc_addr;
1324        inet_opt              = ireq->opt;
1325        rcu_assign_pointer(newinet->inet_opt, inet_opt);
1326        ireq->opt             = NULL;
1327        newinet->mc_index     = inet_iif(skb);
1328        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1329        newinet->rcv_tos      = ip_hdr(skb)->tos;
1330        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1331        inet_set_txhash(newsk);
1332        if (inet_opt)
1333                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1334        newinet->inet_id = newtp->write_seq ^ jiffies;
1335
1336        if (!dst) {
1337                dst = inet_csk_route_child_sock(sk, newsk, req);
1338                if (!dst)
1339                        goto put_and_exit;
1340        } else {
1341                /* syncookie case : see end of cookie_v4_check() */
1342        }
1343        sk_setup_caps(newsk, dst);
1344
1345        tcp_sync_mss(newsk, dst_mtu(dst));
1346        newtp->advmss = dst_metric_advmss(dst);
1347        if (tcp_sk(sk)->rx_opt.user_mss &&
1348            tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1349                newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1350
1351        tcp_initialize_rcv_mss(newsk);
1352
1353#ifdef CONFIG_TCP_MD5SIG
1354        /* Copy over the MD5 key from the original socket */
1355        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1356                                AF_INET);
1357        if (key != NULL) {
1358                /*
1359                 * We're using one, so create a matching key
1360                 * on the newsk structure. If we fail to get
1361                 * memory, then we end up not copying the key
1362                 * across. Shucks.
1363                 */
1364                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1365                               AF_INET, key->key, key->keylen, GFP_ATOMIC);
1366                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1367        }
1368#endif
1369
1370        if (__inet_inherit_port(sk, newsk) < 0)
1371                goto put_and_exit;
1372        __inet_hash_nolisten(newsk, NULL);
1373
1374        return newsk;
1375
1376exit_overflow:
1377        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1378exit_nonewsk:
1379        dst_release(dst);
1380exit:
1381        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1382        return NULL;
1383put_and_exit:
1384        inet_csk_prepare_forced_close(newsk);
1385        tcp_done(newsk);
1386        goto exit;
1387}
1388EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1389
1390static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1391{
1392        struct tcphdr *th = tcp_hdr(skb);
1393        const struct iphdr *iph = ip_hdr(skb);
1394        struct sock *nsk;
1395        struct request_sock **prev;
1396        /* Find possible connection requests. */
1397        struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1398                                                       iph->saddr, iph->daddr);
1399        if (req)
1400                return tcp_check_req(sk, skb, req, prev, false);
1401
1402        nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1403                        th->source, iph->daddr, th->dest, inet_iif(skb));
1404
1405        if (nsk) {
1406                if (nsk->sk_state != TCP_TIME_WAIT) {
1407                        bh_lock_sock(nsk);
1408                        return nsk;
1409                }
1410                inet_twsk_put(inet_twsk(nsk));
1411                return NULL;
1412        }
1413
1414#ifdef CONFIG_SYN_COOKIES
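        /* No pending request sock matched: a bare ACK may still carry a valid
         * SYN cookie; cookie_v4_check() tries to reconstruct the request from
         * it and, on success, returns the newly created child socket.
         */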
1415        if (!th->syn)
1416                sk = cookie_v4_check(sk, skb);
1417#endif
1418        return sk;
1419}
1420
 1421/* The socket must have its spinlock held when we get
1422 * here.
1423 *
1424 * We have a potential double-lock case here, so even when
1425 * doing backlog processing we use the BH locking scheme.
1426 * This is because we cannot sleep with the original spinlock
1427 * held.
1428 */
1429int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1430{
1431        struct sock *rsk;
1432
1433        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1434                struct dst_entry *dst = sk->sk_rx_dst;
1435
1436                sock_rps_save_rxhash(sk, skb);
1437                sk_mark_napi_id(sk, skb);
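                    /*
                     * Validate the cached input route: if this packet came in
                     * on a different interface, or the cached dst is no longer
                     * valid, drop the reference and clear sk_rx_dst so it can
                     * be re-learned from a later packet.
                     */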
1438                if (dst) {
1439                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1440                            dst->ops->check(dst, 0) == NULL) {
1441                                dst_release(dst);
1442                                sk->sk_rx_dst = NULL;
1443                        }
1444                }
1445                tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1446                return 0;
1447        }
1448
1449        if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1450                goto csum_err;
1451
1452        if (sk->sk_state == TCP_LISTEN) {
1453                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1454                if (!nsk)
1455                        goto discard;
1456
1457                if (nsk != sk) {
1458                        sock_rps_save_rxhash(nsk, skb);
1459                        sk_mark_napi_id(sk, skb);
1460                        if (tcp_child_process(sk, nsk, skb)) {
1461                                rsk = nsk;
1462                                goto reset;
1463                        }
1464                        return 0;
1465                }
1466        } else
1467                sock_rps_save_rxhash(sk, skb);
1468
1469        if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1470                rsk = sk;
1471                goto reset;
1472        }
1473        return 0;
1474
1475reset:
1476        tcp_v4_send_reset(rsk, skb);
1477discard:
1478        kfree_skb(skb);
1479        /* Be careful here. If this function gets more complicated and
1480         * gcc suffers from register pressure on the x86, sk (in %ebx)
1481         * might be destroyed here. This current version compiles correctly,
1482         * but you have been warned.
1483         */
1484        return 0;
1485
1486csum_err:
1487        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1488        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1489        goto discard;
1490}
1491EXPORT_SYMBOL(tcp_v4_do_rcv);
1492
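    /*
     * Early demux: called from the IP receive path before the route lookup,
     * this tries to match the packet to an established socket so that the
     * socket's cached rx dst can be reused, saving a per-packet route lookup.
     */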
1493void tcp_v4_early_demux(struct sk_buff *skb)
1494{
1495        const struct iphdr *iph;
1496        const struct tcphdr *th;
1497        struct sock *sk;
1498
1499        if (skb->pkt_type != PACKET_HOST)
1500                return;
1501
1502        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1503                return;
1504
1505        iph = ip_hdr(skb);
1506        th = tcp_hdr(skb);
1507
1508        if (th->doff < sizeof(struct tcphdr) / 4)
1509                return;
1510
1511        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1512                                       iph->saddr, th->source,
1513                                       iph->daddr, ntohs(th->dest),
1514                                       skb->skb_iif);
1515        if (sk) {
1516                skb->sk = sk;
1517                skb->destructor = sock_edemux;
1518                if (sk->sk_state != TCP_TIME_WAIT) {
1519                        struct dst_entry *dst = sk->sk_rx_dst;
1520
1521                        if (dst)
1522                                dst = dst_check(dst, 0);
1523                        if (dst &&
1524                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1525                                skb_dst_set_noref(skb, dst);
1526                }
1527        }
1528}
1529
1530/* Packet is added to VJ-style prequeue for processing in process
1531 * context, if a reader task is waiting. Apparently, this exciting
1532 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1533 * failed somewhere. Latency? Burstiness? Well, at least now we will
1534 * see why it failed. 8)8)                               --ANK
1535 *
1536 */
1537bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1538{
1539        struct tcp_sock *tp = tcp_sk(sk);
1540
1541        if (sysctl_tcp_low_latency || !tp->ucopy.task)
1542                return false;
1543
1544        if (skb->len <= tcp_hdrlen(skb) &&
1545            skb_queue_len(&tp->ucopy.prequeue) == 0)
1546                return false;
1547
1548        /* Before escaping RCU protected region, we need to take care of skb
1549         * dst. Prequeue is only enabled for established sockets.
1550         * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1551         * Instead of doing a full sk_rx_dst validity check here, let's perform
1552         * an optimistic check.
1553         */
1554        if (likely(sk->sk_rx_dst))
1555                skb_dst_drop(skb);
1556        else
1557                skb_dst_force(skb);
1558
1559        __skb_queue_tail(&tp->ucopy.prequeue, skb);
1560        tp->ucopy.memory += skb->truesize;
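            /*
             * If the queued data now exceeds the receive buffer, flush the
             * whole prequeue right here in softirq context instead of waiting
             * for the reader; each flushed skb is counted as TCPPrequeueDropped.
             */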
1561        if (tp->ucopy.memory > sk->sk_rcvbuf) {
1562                struct sk_buff *skb1;
1563
1564                BUG_ON(sock_owned_by_user(sk));
1565
1566                while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1567                        sk_backlog_rcv(sk, skb1);
1568                        NET_INC_STATS_BH(sock_net(sk),
1569                                         LINUX_MIB_TCPPREQUEUEDROPPED);
1570                }
1571
1572                tp->ucopy.memory = 0;
1573        } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1574                wake_up_interruptible_sync_poll(sk_sleep(sk),
1575                                           POLLIN | POLLRDNORM | POLLRDBAND);
1576                if (!inet_csk_ack_scheduled(sk))
1577                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1578                                                  (3 * tcp_rto_min(sk)) / 4,
1579                                                  TCP_RTO_MAX);
1580        }
1581        return true;
1582}
1583EXPORT_SYMBOL(tcp_prequeue);
1584
1585/*
1586 *      From tcp_input.c
1587 */
1588
1589int tcp_v4_rcv(struct sk_buff *skb)
1590{
1591        const struct iphdr *iph;
1592        const struct tcphdr *th;
1593        struct sock *sk;
1594        int ret;
1595        struct net *net = dev_net(skb->dev);
1596
1597        if (skb->pkt_type != PACKET_HOST)
1598                goto discard_it;
1599
1600        /* Count it even if it's bad */
1601        TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1602
1603        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1604                goto discard_it;
1605
1606        th = tcp_hdr(skb);
1607
1608        if (th->doff < sizeof(struct tcphdr) / 4)
1609                goto bad_packet;
1610        if (!pskb_may_pull(skb, th->doff * 4))
1611                goto discard_it;
1612
1613        /* An explanation is required here, I think.
1614         * Packet length and doff are validated by header prediction,
1615         * provided the case of th->doff==0 is eliminated.
1616         * So, we defer the checks. */
1617
1618        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1619                goto csum_error;
1620
1621        th = tcp_hdr(skb);
1622        iph = ip_hdr(skb);
1623        /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1624         * barrier() makes sure the compiler won't play fool^Waliasing games.
1625         */
1626        memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1627                sizeof(struct inet_skb_parm));
1628        barrier();
1629
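            /*
             * Fill the TCP control block: SYN and FIN each occupy one unit of
             * sequence space, which is why they are added when computing end_seq.
             */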
1630        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1631        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1632                                    skb->len - th->doff * 4);
1633        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1634        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1635        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1636        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1637        TCP_SKB_CB(skb)->sacked  = 0;
1638
1639        sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1640        if (!sk)
1641                goto no_tcp_socket;
1642
1643process:
1644        if (sk->sk_state == TCP_TIME_WAIT)
1645                goto do_time_wait;
1646
1647        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1648                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1649                goto discard_and_relse;
1650        }
1651
1652        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1653                goto discard_and_relse;
1654
1655#ifdef CONFIG_TCP_MD5SIG
1656        /*
1657         * We really want to reject the packet as early as possible
1658         * if:
1659         *  o We're expecting an MD5'd packet and there is no MD5 tcp option
1660         *  o There is an MD5 option and we're not expecting one
1661         */
1662        if (tcp_v4_inbound_md5_hash(sk, skb))
1663                goto discard_and_relse;
1664#endif
1665
1666        nf_reset(skb);
1667
1668        if (sk_filter(sk, skb))
1669                goto discard_and_relse;
1670
1671        sk_incoming_cpu_update(sk);
1672        skb->dev = NULL;
1673
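            /*
             * Three delivery paths from here: if the socket is not owned by a
             * user task we try the prequeue and fall back to tcp_v4_do_rcv();
             * if it is owned, the skb goes onto the socket backlog and is
             * processed when the owner releases the lock (subject to the
             * rcvbuf + sndbuf limit).
             */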
1674        bh_lock_sock_nested(sk);
1675        ret = 0;
1676        if (!sock_owned_by_user(sk)) {
1677                if (!tcp_prequeue(sk, skb))
1678                        ret = tcp_v4_do_rcv(sk, skb);
1679        } else if (unlikely(sk_add_backlog(sk, skb,
1680                                           sk->sk_rcvbuf + sk->sk_sndbuf))) {
1681                bh_unlock_sock(sk);
1682                NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1683                goto discard_and_relse;
1684        }
1685        bh_unlock_sock(sk);
1686
1687        sock_put(sk);
1688
1689        return ret;
1690
1691no_tcp_socket:
1692        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1693                goto discard_it;
1694
1695        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1696csum_error:
1697                TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1698bad_packet:
1699                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1700        } else {
1701                tcp_v4_send_reset(NULL, skb);
1702        }
1703
1704discard_it:
1705        /* Discard frame. */
1706        kfree_skb(skb);
1707        return 0;
1708
1709discard_and_relse:
1710        sock_put(sk);
1711        goto discard_it;
1712
1713do_time_wait:
1714        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1715                inet_twsk_put(inet_twsk(sk));
1716                goto discard_it;
1717        }
1718
1719        if (skb->len < (th->doff << 2)) {
1720                inet_twsk_put(inet_twsk(sk));
1721                goto bad_packet;
1722        }
1723        if (tcp_checksum_complete(skb)) {
1724                inet_twsk_put(inet_twsk(sk));
1725                goto csum_error;
1726        }
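            /*
             * tcp_timewait_state_process() decides what to do with a segment
             * that hits a TIME_WAIT socket: TCP_TW_SYN means the 4-tuple may
             * be reused for a new connection, so the SYN is handed to a
             * matching listener (after the timewait entry is removed);
             * TCP_TW_ACK answers with an ACK, TCP_TW_RST leads to a reset,
             * and TCP_TW_SUCCESS simply drops the segment.
             */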
1727        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1728        case TCP_TW_SYN: {
1729                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1730                                                        &tcp_hashinfo,
1731                                                        iph->saddr, th->source,
1732                                                        iph->daddr, th->dest,
1733                                                        inet_iif(skb));
1734                if (sk2) {
1735                        inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1736                        inet_twsk_put(inet_twsk(sk));
1737                        sk = sk2;
1738                        goto process;
1739                }
1740                /* Fall through to ACK */
1741        }
1742        case TCP_TW_ACK:
1743                tcp_v4_timewait_ack(sk, skb);
1744                break;
1745        case TCP_TW_RST:
1746                goto no_tcp_socket;
1747        case TCP_TW_SUCCESS:;
1748        }
1749        goto discard_it;
1750}
1751
1752static struct timewait_sock_ops tcp_timewait_sock_ops = {
1753        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1754        .twsk_unique    = tcp_twsk_unique,
1755        .twsk_destructor= tcp_twsk_destructor,
1756};
1757
1758void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1759{
1760        struct dst_entry *dst = skb_dst(skb);
1761
1762        if (dst) {
1763                dst_hold(dst);
1764                sk->sk_rx_dst = dst;
1765                inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1766        }
1767}
1768EXPORT_SYMBOL(inet_sk_rx_dst_set);
1769
1770const struct inet_connection_sock_af_ops ipv4_specific = {
1771        .queue_xmit        = ip_queue_xmit,
1772        .send_check        = tcp_v4_send_check,
1773        .rebuild_header    = inet_sk_rebuild_header,
1774        .sk_rx_dst_set     = inet_sk_rx_dst_set,
1775        .conn_request      = tcp_v4_conn_request,
1776        .syn_recv_sock     = tcp_v4_syn_recv_sock,
1777        .net_header_len    = sizeof(struct iphdr),
1778        .setsockopt        = ip_setsockopt,
1779        .getsockopt        = ip_getsockopt,
1780        .addr2sockaddr     = inet_csk_addr2sockaddr,
1781        .sockaddr_len      = sizeof(struct sockaddr_in),
1782        .bind_conflict     = inet_csk_bind_conflict,
1783#ifdef CONFIG_COMPAT
1784        .compat_setsockopt = compat_ip_setsockopt,
1785        .compat_getsockopt = compat_ip_getsockopt,
1786#endif
1787        .mtu_reduced       = tcp_v4_mtu_reduced,
1788};
1789EXPORT_SYMBOL(ipv4_specific);
1790
1791#ifdef CONFIG_TCP_MD5SIG
1792static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1793        .md5_lookup             = tcp_v4_md5_lookup,
1794        .calc_md5_hash          = tcp_v4_md5_hash_skb,
1795        .md5_parse              = tcp_v4_parse_md5_keys,
1796};
1797#endif
1798
1799/* NOTE: A lot of things are set to zero explicitly by the call to
1800 *       sk_alloc(), so they need not be done here.
1801 */
1802static int tcp_v4_init_sock(struct sock *sk)
1803{
1804        struct inet_connection_sock *icsk = inet_csk(sk);
1805
1806        tcp_init_sock(sk);
1807
1808        icsk->icsk_af_ops = &ipv4_specific;
1809
1810#ifdef CONFIG_TCP_MD5SIG
1811        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1812#endif
1813
1814        return 0;
1815}
1816
1817void tcp_v4_destroy_sock(struct sock *sk)
1818{
1819        struct tcp_sock *tp = tcp_sk(sk);
1820
1821        tcp_clear_xmit_timers(sk);
1822
1823        tcp_cleanup_congestion_control(sk);
1824
1825        /* Clean up the write buffer. */
1826        tcp_write_queue_purge(sk);
1827
1828        /* Cleans up our, hopefully empty, out_of_order_queue. */
1829        __skb_queue_purge(&tp->out_of_order_queue);
1830
1831#ifdef CONFIG_TCP_MD5SIG
1832        /* Clean up the MD5 key list, if any */
1833        if (tp->md5sig_info) {
1834                tcp_clear_md5_list(sk);
1835                kfree_rcu(tp->md5sig_info, rcu);
1836                tp->md5sig_info = NULL;
1837        }
1838#endif
1839
1840        /* Clean up the prequeue; it really should be empty already. */
1841        __skb_queue_purge(&tp->ucopy.prequeue);
1842
1843        /* Clean up a referenced TCP bind bucket. */
1844        if (inet_csk(sk)->icsk_bind_hash)
1845                inet_put_port(sk);
1846
1847        BUG_ON(tp->fastopen_rsk != NULL);
1848
1849        /* If the socket was aborted during a connect operation, free any pending Fast Open request. */
1850        tcp_free_fastopen_req(tp);
1851
1852        sk_sockets_allocated_dec(sk);
1853        sock_release_memcg(sk);
1854}
1855EXPORT_SYMBOL(tcp_v4_destroy_sock);
1856
1857#ifdef CONFIG_PROC_FS
1858/* Proc filesystem TCP sock list dumping. */
1859
1860/*
1861 * Get the next listener socket following cur.  If cur is NULL, get the first socket
1862 * starting from bucket given in st->bucket; when st->bucket is zero the
1863 * very first socket in the hash table is returned.
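     * The iterator also descends into each listening socket's SYN queue
     * (st->state == TCP_SEQ_STATE_OPENREQ) so that open requests show up in
     * the listing; syn_wait_lock is held while a listener's request table is
     * being walked.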
1864 */
1865static void *listening_get_next(struct seq_file *seq, void *cur)
1866{
1867        struct inet_connection_sock *icsk;
1868        struct hlist_nulls_node *node;
1869        struct sock *sk = cur;
1870        struct inet_listen_hashbucket *ilb;
1871        struct tcp_iter_state *st = seq->private;
1872        struct net *net = seq_file_net(seq);
1873
1874        if (!sk) {
1875                ilb = &tcp_hashinfo.listening_hash[st->bucket];
1876                spin_lock_bh(&ilb->lock);
1877                sk = sk_nulls_head(&ilb->head);
1878                st->offset = 0;
1879                goto get_sk;
1880        }
1881        ilb = &tcp_hashinfo.listening_hash[st->bucket];
1882        ++st->num;
1883        ++st->offset;
1884
1885        if (st->state == TCP_SEQ_STATE_OPENREQ) {
1886                struct request_sock *req = cur;
1887
1888                icsk = inet_csk(st->syn_wait_sk);
1889                req = req->dl_next;
1890                while (1) {
1891                        while (req) {
1892                                if (req->rsk_ops->family == st->family) {
1893                                        cur = req;
1894                                        goto out;
1895                                }
1896                                req = req->dl_next;
1897                        }
1898                        if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1899                                break;
1900get_req:
1901                        req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1902                }
1903                sk        = sk_nulls_next(st->syn_wait_sk);
1904                st->state = TCP_SEQ_STATE_LISTENING;
1905                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1906        } else {
1907                icsk = inet_csk(sk);
1908                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1909                if (reqsk_queue_len(&icsk->icsk_accept_queue))
1910                        goto start_req;
1911                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1912                sk = sk_nulls_next(sk);
1913        }
1914get_sk:
1915        sk_nulls_for_each_from(sk, node) {
1916                if (!net_eq(sock_net(sk), net))
1917                        continue;
1918                if (sk->sk_family == st->family) {
1919                        cur = sk;
1920                        goto out;
1921                }
1922                icsk = inet_csk(sk);
1923                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1924                if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1925start_req:
1926                        st->uid         = sock_i_uid(sk);
1927                        st->syn_wait_sk = sk;
1928                        st->state       = TCP_SEQ_STATE_OPENREQ;
1929                        st->sbucket     = 0;
1930                        goto get_req;
1931                }
1932                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1933        }
1934        spin_unlock_bh(&ilb->lock);
1935        st->offset = 0;
1936        if (++st->bucket < INET_LHTABLE_SIZE) {
1937                ilb = &tcp_hashinfo.listening_hash[st->bucket];
1938                spin_lock_bh(&ilb->lock);
1939                sk = sk_nulls_head(&ilb->head);
1940                goto get_sk;
1941        }
1942        cur = NULL;
1943out:
1944        return cur;
1945}
1946
1947static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1948{
1949        struct tcp_iter_state *st = seq->private;
1950        void *rc;
1951
1952        st->bucket = 0;
1953        st->offset = 0;
1954        rc = listening_get_next(seq, NULL);
1955
1956        while (rc && *pos) {
1957                rc = listening_get_next(seq, rc);
1958                --*pos;
1959        }
1960        return rc;
1961}
1962
1963static inline bool empty_bucket(const struct tcp_iter_state *st)
1964{
1965        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1966}
1967
1968/*
1969 * Get first established socket starting from bucket given in st->bucket.
1970 * If st->bucket is zero, the very first socket in the hash is returned.
1971 */
1972static void *established_get_first(struct seq_file *seq)
1973{
1974        struct tcp_iter_state *st = seq->private;
1975        struct net *net = seq_file_net(seq);
1976        void *rc = NULL;
1977
1978        st->offset = 0;
1979        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1980                struct sock *sk;
1981                struct hlist_nulls_node *node;
1982                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1983
1984                /* Lockless fast path for the common case of empty buckets */
1985                if (empty_bucket(st))
1986                        continue;
1987
1988                spin_lock_bh(lock);
1989                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1990                        if (sk->sk_family != st->family ||
1991                            !net_eq(sock_net(sk), net)) {
1992                                continue;
1993                        }
1994                        rc = sk;
1995                        goto out;
1996                }
1997                spin_unlock_bh(lock);
1998        }
1999out:
2000        return rc;
2001}
2002
2003static void *established_get_next(struct seq_file *seq, void *cur)
2004{
2005        struct sock *sk = cur;
2006        struct hlist_nulls_node *node;
2007        struct tcp_iter_state *st = seq->private;
2008        struct net *net = seq_file_net(seq);
2009
2010        ++st->num;
2011        ++st->offset;
2012
2013        sk = sk_nulls_next(sk);
2014
2015        sk_nulls_for_each_from(sk, node) {
2016                if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2017                        return sk;
2018        }
2019
2020        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2021        ++st->bucket;
2022        return established_get_first(seq);
2023}
2024
2025static void *established_get_idx(struct seq_file *seq, loff_t pos)
2026{
2027        struct tcp_iter_state *st = seq->private;
2028        void *rc;
2029
2030        st->bucket = 0;
2031        rc = established_get_first(seq);
2032
2033        while (rc && pos) {
2034                rc = established_get_next(seq, rc);
2035                --pos;
2036        }
2037        return rc;
2038}
2039
2040static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2041{
2042        void *rc;
2043        struct tcp_iter_state *st = seq->private;
2044
2045        st->state = TCP_SEQ_STATE_LISTENING;
2046        rc        = listening_get_idx(seq, &pos);
2047
2048        if (!rc) {
2049                st->state = TCP_SEQ_STATE_ESTABLISHED;
2050                rc        = established_get_idx(seq, pos);
2051        }
2052
2053        return rc;
2054}
2055
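    /*
     * Resume an interrupted /proc read: st->state, st->bucket and st->offset
     * were saved by the previous iteration, so we can walk forward to the
     * last position handed out without rescanning every bucket from zero.
     */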
2056static void *tcp_seek_last_pos(struct seq_file *seq)
2057{
2058        struct tcp_iter_state *st = seq->private;
2059        int offset = st->offset;
2060        int orig_num = st->num;
2061        void *rc = NULL;
2062
2063        switch (st->state) {
2064        case TCP_SEQ_STATE_OPENREQ:
2065        case TCP_SEQ_STATE_LISTENING:
2066                if (st->bucket >= INET_LHTABLE_SIZE)
2067                        break;
2068                st->state = TCP_SEQ_STATE_LISTENING;
2069                rc = listening_get_next(seq, NULL);
2070                while (offset-- && rc)
2071                        rc = listening_get_next(seq, rc);
2072                if (rc)
2073                        break;
2074                st->bucket = 0;
2075                st->state = TCP_SEQ_STATE_ESTABLISHED;
2076                /* Fallthrough */
2077        case TCP_SEQ_STATE_ESTABLISHED:
2078                if (st->bucket > tcp_hashinfo.ehash_mask)
2079                        break;
2080                rc = established_get_first(seq);
2081                while (offset-- && rc)
2082                        rc = established_get_next(seq, rc);
2083        }
2084
2085        st->num = orig_num;
2086
2087        return rc;
2088}
2089
2090static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2091{
2092        struct tcp_iter_state *st = seq->private;
2093        void *rc;
2094
2095        if (*pos && *pos == st->last_pos) {
2096                rc = tcp_seek_last_pos(seq);
2097                if (rc)
2098                        goto out;
2099        }
2100
2101        st->state = TCP_SEQ_STATE_LISTENING;
2102        st->num = 0;
2103        st->bucket = 0;
2104        st->offset = 0;
2105        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2106
2107out:
2108        st->last_pos = *pos;
2109        return rc;
2110}
2111
2112static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2113{
2114        struct tcp_iter_state *st = seq->private;
2115        void *rc = NULL;
2116
2117        if (v == SEQ_START_TOKEN) {
2118                rc = tcp_get_idx(seq, 0);
2119                goto out;
2120        }
2121
2122        switch (st->state) {
2123        case TCP_SEQ_STATE_OPENREQ:
2124        case TCP_SEQ_STATE_LISTENING:
2125                rc = listening_get_next(seq, v);
2126                if (!rc) {
2127                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2128                        st->bucket = 0;
2129                        st->offset = 0;
2130                        rc        = established_get_first(seq);
2131                }
2132                break;
2133        case TCP_SEQ_STATE_ESTABLISHED:
2134                rc = established_get_next(seq, v);
2135                break;
2136        }
2137out:
2138        ++*pos;
2139        st->last_pos = *pos;
2140        return rc;
2141}
2142
2143static void tcp_seq_stop(struct seq_file *seq, void *v)
2144{
2145        struct tcp_iter_state *st = seq->private;
2146
2147        switch (st->state) {
2148        case TCP_SEQ_STATE_OPENREQ:
2149                if (v) {
2150                        struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2151                        read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2152                }
2153        case TCP_SEQ_STATE_LISTENING:
2154                if (v != SEQ_START_TOKEN)
2155                        spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2156                break;
2157        case TCP_SEQ_STATE_ESTABLISHED:
2158                if (v)
2159                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2160                break;
2161        }
2162}
2163
2164int tcp_seq_open(struct inode *inode, struct file *file)
2165{
2166        struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2167        struct tcp_iter_state *s;
2168        int err;
2169
2170        err = seq_open_net(inode, file, &afinfo->seq_ops,
2171                          sizeof(struct tcp_iter_state));
2172        if (err < 0)
2173                return err;
2174
2175        s = ((struct seq_file *)file->private_data)->private;
2176        s->family               = afinfo->family;
2177        s->last_pos             = 0;
2178        return 0;
2179}
2180EXPORT_SYMBOL(tcp_seq_open);
2181
2182int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2183{
2184        int rc = 0;
2185        struct proc_dir_entry *p;
2186
2187        afinfo->seq_ops.start           = tcp_seq_start;
2188        afinfo->seq_ops.next            = tcp_seq_next;
2189        afinfo->seq_ops.stop            = tcp_seq_stop;
2190
2191        p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2192                             afinfo->seq_fops, afinfo);
2193        if (!p)
2194                rc = -ENOMEM;
2195        return rc;
2196}
2197EXPORT_SYMBOL(tcp_proc_register);
2198
2199void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2200{
2201        remove_proc_entry(afinfo->name, net->proc_net);
2202}
2203EXPORT_SYMBOL(tcp_proc_unregister);
2204
2205static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2206                         struct seq_file *f, int i, kuid_t uid)
2207{
2208        const struct inet_request_sock *ireq = inet_rsk(req);
2209        long delta = req->expires - jiffies;
2210
2211        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2212                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2213                i,
2214                ireq->ir_loc_addr,
2215                ntohs(inet_sk(sk)->inet_sport),
2216                ireq->ir_rmt_addr,
2217                ntohs(ireq->ir_rmt_port),
2218                TCP_SYN_RECV,
2219                0, 0, /* could print option size, but that is af dependent. */
2220                1,    /* timers active (only the expire timer) */
2221                jiffies_delta_to_clock_t(delta),
2222                req->num_timeout,
2223                from_kuid_munged(seq_user_ns(f), uid),
2224                0,  /* non standard timer */
2225                0, /* open_requests have no inode */
2226                atomic_read(&sk->sk_refcnt),
2227                req);
2228}
2229
2230static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2231{
2232        int timer_active;
2233        unsigned long timer_expires;
2234        const struct tcp_sock *tp = tcp_sk(sk);
2235        const struct inet_connection_sock *icsk = inet_csk(sk);
2236        const struct inet_sock *inet = inet_sk(sk);
2237        struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2238        __be32 dest = inet->inet_daddr;
2239        __be32 src = inet->inet_rcv_saddr;
2240        __u16 destp = ntohs(inet->inet_dport);
2241        __u16 srcp = ntohs(inet->inet_sport);
2242        int rx_queue;
2243
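            /*
             * This becomes the "tr" column of /proc/net/tcp: 1 = retransmit
             * (or early-retransmit / loss-probe) timer, 4 = zero window probe
             * timer, 2 = something pending on sk_timer (typically keepalive),
             * 0 = no timer pending.
             */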
2244        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2245            icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2246            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2247                timer_active    = 1;
2248                timer_expires   = icsk->icsk_timeout;
2249        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2250                timer_active    = 4;
2251                timer_expires   = icsk->icsk_timeout;
2252        } else if (timer_pending(&sk->sk_timer)) {
2253                timer_active    = 2;
2254                timer_expires   = sk->sk_timer.expires;
2255        } else {
2256                timer_active    = 0;
2257                timer_expires = jiffies;
2258        }
2259
2260        if (sk->sk_state == TCP_LISTEN)
2261                rx_queue = sk->sk_ack_backlog;
2262        else
2263                /*
2264                 * Because we don't lock the socket, we might find a transient negative value.
2265                 */
2266                rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2267
2268        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2269                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2270                i, src, srcp, dest, destp, sk->sk_state,
2271                tp->write_seq - tp->snd_una,
2272                rx_queue,
2273                timer_active,
2274                jiffies_delta_to_clock_t(timer_expires - jiffies),
2275                icsk->icsk_retransmits,
2276                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2277                icsk->icsk_probes_out,
2278                sock_i_ino(sk),
2279                atomic_read(&sk->sk_refcnt), sk,
2280                jiffies_to_clock_t(icsk->icsk_rto),
2281                jiffies_to_clock_t(icsk->icsk_ack.ato),
2282                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2283                tp->snd_cwnd,
2284                sk->sk_state == TCP_LISTEN ?
2285                    (fastopenq ? fastopenq->max_qlen : 0) :
2286                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2287}
2288
2289static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2290                               struct seq_file *f, int i)
2291{
2292        __be32 dest, src;
2293        __u16 destp, srcp;
2294        s32 delta = tw->tw_ttd - inet_tw_time_stamp();
2295
2296        dest  = tw->tw_daddr;
2297        src   = tw->tw_rcv_saddr;
2298        destp = ntohs(tw->tw_dport);
2299        srcp  = ntohs(tw->tw_sport);
2300
2301        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2302                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2303                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2304                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2305                atomic_read(&tw->tw_refcnt), tw);
2306}
2307
2308#define TMPSZ 150
2309
2310static int tcp4_seq_show(struct seq_file *seq, void *v)
2311{
2312        struct tcp_iter_state *st;
2313        struct sock *sk = v;
2314
2315        seq_setwidth(seq, TMPSZ - 1);
2316        if (v == SEQ_START_TOKEN) {
2317                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2318                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2319                           "inode");
2320                goto out;
2321        }
2322        st = seq->private;
2323
2324        switch (st->state) {
2325        case TCP_SEQ_STATE_LISTENING:
2326        case TCP_SEQ_STATE_ESTABLISHED:
2327                if (sk->sk_state == TCP_TIME_WAIT)
2328                        get_timewait4_sock(v, seq, st->num);
2329                else
2330                        get_tcp4_sock(v, seq, st->num);
2331                break;
2332        case TCP_SEQ_STATE_OPENREQ:
2333                get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
2334                break;
2335        }
2336out:
2337        seq_pad(seq, '\n');
2338        return 0;
2339}
2340
2341static const struct file_operations tcp_afinfo_seq_fops = {
2342        .owner   = THIS_MODULE,
2343        .open    = tcp_seq_open,
2344        .read    = seq_read,
2345        .llseek  = seq_lseek,
2346        .release = seq_release_net
2347};
2348
2349static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2350        .name           = "tcp",
2351        .family         = AF_INET,
2352        .seq_fops       = &tcp_afinfo_seq_fops,
2353        .seq_ops        = {
2354                .show           = tcp4_seq_show,
2355        },
2356};
2357
2358static int __net_init tcp4_proc_init_net(struct net *net)
2359{
2360        return tcp_proc_register(net, &tcp4_seq_afinfo);
2361}
2362
2363static void __net_exit tcp4_proc_exit_net(struct net *net)
2364{
2365        tcp_proc_unregister(net, &tcp4_seq_afinfo);
2366}
2367
2368static struct pernet_operations tcp4_net_ops = {
2369        .init = tcp4_proc_init_net,
2370        .exit = tcp4_proc_exit_net,
2371};
2372
2373int __init tcp4_proc_init(void)
2374{
2375        return register_pernet_subsys(&tcp4_net_ops);
2376}
2377
2378void tcp4_proc_exit(void)
2379{
2380        unregister_pernet_subsys(&tcp4_net_ops);
2381}
2382#endif /* CONFIG_PROC_FS */
2383
2384struct proto tcp_prot = {
2385        .name                   = "TCP",
2386        .owner                  = THIS_MODULE,
2387        .close                  = tcp_close,
2388        .connect                = tcp_v4_connect,
2389        .disconnect             = tcp_disconnect,
2390        .accept                 = inet_csk_accept,
2391        .ioctl                  = tcp_ioctl,
2392        .init                   = tcp_v4_init_sock,
2393        .destroy                = tcp_v4_destroy_sock,
2394        .shutdown               = tcp_shutdown,
2395        .setsockopt             = tcp_setsockopt,
2396        .getsockopt             = tcp_getsockopt,
2397        .recvmsg                = tcp_recvmsg,
2398        .sendmsg                = tcp_sendmsg,
2399        .sendpage               = tcp_sendpage,
2400        .backlog_rcv            = tcp_v4_do_rcv,
2401        .release_cb             = tcp_release_cb,
2402        .hash                   = inet_hash,
2403        .unhash                 = inet_unhash,
2404        .get_port               = inet_csk_get_port,
2405        .enter_memory_pressure  = tcp_enter_memory_pressure,
2406        .stream_memory_free     = tcp_stream_memory_free,
2407        .sockets_allocated      = &tcp_sockets_allocated,
2408        .orphan_count           = &tcp_orphan_count,
2409        .memory_allocated       = &tcp_memory_allocated,
2410        .memory_pressure        = &tcp_memory_pressure,
2411        .sysctl_mem             = sysctl_tcp_mem,
2412        .sysctl_wmem            = sysctl_tcp_wmem,
2413        .sysctl_rmem            = sysctl_tcp_rmem,
2414        .max_header             = MAX_TCP_HEADER,
2415        .obj_size               = sizeof(struct tcp_sock),
2416        .slab_flags             = SLAB_DESTROY_BY_RCU,
2417        .twsk_prot              = &tcp_timewait_sock_ops,
2418        .rsk_prot               = &tcp_request_sock_ops,
2419        .h.hashinfo             = &tcp_hashinfo,
2420        .no_autobind            = true,
2421#ifdef CONFIG_COMPAT
2422        .compat_setsockopt      = compat_tcp_setsockopt,
2423        .compat_getsockopt      = compat_tcp_getsockopt,
2424#endif
2425#ifdef CONFIG_MEMCG_KMEM
2426        .init_cgroup            = tcp_init_cgroup,
2427        .destroy_cgroup         = tcp_destroy_cgroup,
2428        .proto_cgroup           = tcp_proto_cgroup,
2429#endif
2430};
2431EXPORT_SYMBOL(tcp_prot);
2432
2433static void __net_exit tcp_sk_exit(struct net *net)
2434{
2435        int cpu;
2436
2437        for_each_possible_cpu(cpu)
2438                inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2439        free_percpu(net->ipv4.tcp_sk);
2440}
2441
2442static int __net_init tcp_sk_init(struct net *net)
2443{
2444        int res, cpu;
2445
2446        net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2447        if (!net->ipv4.tcp_sk)
2448                return -ENOMEM;
2449
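            /*
             * One raw control socket per possible CPU; these are used to send
             * TCP resets and ACKs (e.g. from tcp_v4_send_reset()) when no
             * user socket is involved.
             */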
2450        for_each_possible_cpu(cpu) {
2451                struct sock *sk;
2452
2453                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2454                                           IPPROTO_TCP, net);
2455                if (res)
2456                        goto fail;
2457                *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2458        }
2459        net->ipv4.sysctl_tcp_ecn = 2;
2460        return 0;
2461
2462fail:
2463        tcp_sk_exit(net);
2464
2465        return res;
2466}
2467
2468static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2469{
2470        inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2471}
2472
2473static struct pernet_operations __net_initdata tcp_sk_ops = {
2474       .init       = tcp_sk_init,
2475       .exit       = tcp_sk_exit,
2476       .exit_batch = tcp_sk_exit_batch,
2477};
2478
2479void __init tcp_v4_init(void)
2480{
2481        inet_hashinfo_init(&tcp_hashinfo);
2482        if (register_pernet_subsys(&tcp_sk_ops))
2483                panic("Failed to create the TCP control socket.\n");
2484}
2485