linux/net/ipv4/tcp_ipv4.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 *              IPv4 specific functions
   9 *
  10 *
  11 *              code split from:
  12 *              linux/ipv4/tcp.c
  13 *              linux/ipv4/tcp_input.c
  14 *              linux/ipv4/tcp_output.c
  15 *
  16 *              See tcp.c for author information
  17 *
  18 *      This program is free software; you can redistribute it and/or
  19 *      modify it under the terms of the GNU General Public License
  20 *      as published by the Free Software Foundation; either version
  21 *      2 of the License, or (at your option) any later version.
  22 */
  23
  24/*
  25 * Changes:
  26 *              David S. Miller :       New socket lookup architecture.
  27 *                                      This code is dedicated to John Dyson.
  28 *              David S. Miller :       Change semantics of established hash,
  29 *                                      half is devoted to TIME_WAIT sockets
  30 *                                      and the rest go in the other half.
  31 *              Andi Kleen :            Add support for syncookies and fixed
  32 *                                      some bugs: ip options weren't passed to
  33 *                                      the TCP layer, missed a check for an
  34 *                                      ACK bit.
  35 *              Andi Kleen :            Implemented fast path mtu discovery.
  36 *                                      Fixed many serious bugs in the
  37 *                                      request_sock handling and moved
  38 *                                      most of it into the af independent code.
  39 *                                      Added tail drop and some other bugfixes.
  40 *                                      Added new listen semantics.
  41 *              Mike McLagan    :       Routing by source
  42 *      Juan Jose Ciarlante:            ip_dynaddr bits
  43 *              Andi Kleen:             various fixes.
  44 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45 *                                      coma.
  46 *      Andi Kleen              :       Fix new listen.
  47 *      Andi Kleen              :       Fix accept error reporting.
  48 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49 *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
  50 *                                      a single port at the same time.
  51 */
  52
  53
  54#include <linux/bottom_half.h>
  55#include <linux/types.h>
  56#include <linux/fcntl.h>
  57#include <linux/module.h>
  58#include <linux/random.h>
  59#include <linux/cache.h>
  60#include <linux/jhash.h>
  61#include <linux/init.h>
  62#include <linux/times.h>
  63#include <linux/slab.h>
  64
  65#include <net/net_namespace.h>
  66#include <net/icmp.h>
  67#include <net/inet_hashtables.h>
  68#include <net/tcp.h>
  69#include <net/transp_v6.h>
  70#include <net/ipv6.h>
  71#include <net/inet_common.h>
  72#include <net/timewait_sock.h>
  73#include <net/xfrm.h>
  74#include <net/netdma.h>
  75
  76#include <linux/inet.h>
  77#include <linux/ipv6.h>
  78#include <linux/stddef.h>
  79#include <linux/proc_fs.h>
  80#include <linux/seq_file.h>
  81
  82#include <linux/crypto.h>
  83#include <linux/scatterlist.h>
  84
  85int sysctl_tcp_tw_reuse __read_mostly;
  86int sysctl_tcp_low_latency __read_mostly;
  87
  88
  89#ifdef CONFIG_TCP_MD5SIG
  90static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
  91                                                   __be32 addr);
  92static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
  93                               __be32 daddr, __be32 saddr, struct tcphdr *th);
  94#else
  95static inline
  96struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
  97{
  98        return NULL;
  99}
 100#endif
 101
 102struct inet_hashinfo tcp_hashinfo;
 103
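    /* Derive the initial sequence number for a passive open from the
     * incoming segment's 4-tuple (local address/port first, then the
     * remote address/port).
     */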
 104static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 105{
 106        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 107                                          ip_hdr(skb)->saddr,
 108                                          tcp_hdr(skb)->dest,
 109                                          tcp_hdr(skb)->source);
 110}
 111
 112int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 113{
 114        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 115        struct tcp_sock *tp = tcp_sk(sk);
 116
 117        /* With PAWS, it is safe from the viewpoint
 118           of data integrity. Even without PAWS it is safe provided sequence
 119           spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 120
 121           Actually, the idea is close to VJ's one, only timestamp cache is
 122           held not per host, but per port pair and TW bucket is used as state
 123           holder.
 124
 125           If TW bucket has been already destroyed we fall back to VJ's scheme
 126           and use initial timestamp retrieved from peer table.
 127         */
 128        if (tcptw->tw_ts_recent_stamp &&
 129            (twp == NULL || (sysctl_tcp_tw_reuse &&
 130                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 131                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 132                if (tp->write_seq == 0)
 133                        tp->write_seq = 1;
 134                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 135                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 136                sock_hold(sktw);
 137                return 1;
 138        }
 139
 140        return 0;
 141}
 142
 143EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 144
 145/* This will initiate an outgoing connection. */
 146int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 147{
 148        struct inet_sock *inet = inet_sk(sk);
 149        struct tcp_sock *tp = tcp_sk(sk);
 150        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 151        struct rtable *rt;
 152        __be32 daddr, nexthop;
 153        int tmp;
 154        int err;
 155
 156        if (addr_len < sizeof(struct sockaddr_in))
 157                return -EINVAL;
 158
 159        if (usin->sin_family != AF_INET)
 160                return -EAFNOSUPPORT;
 161
 162        nexthop = daddr = usin->sin_addr.s_addr;
 163        if (inet->opt && inet->opt->srr) {
 164                if (!daddr)
 165                        return -EINVAL;
 166                nexthop = inet->opt->faddr;
 167        }
 168
 169        tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
 170                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 171                               IPPROTO_TCP,
 172                               inet->inet_sport, usin->sin_port, sk, 1);
 173        if (tmp < 0) {
 174                if (tmp == -ENETUNREACH)
 175                        IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 176                return tmp;
 177        }
 178
 179        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 180                ip_rt_put(rt);
 181                return -ENETUNREACH;
 182        }
 183
 184        if (!inet->opt || !inet->opt->srr)
 185                daddr = rt->rt_dst;
 186
 187        if (!inet->inet_saddr)
 188                inet->inet_saddr = rt->rt_src;
 189        inet->inet_rcv_saddr = inet->inet_saddr;
 190
 191        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 192                /* Reset inherited state */
 193                tp->rx_opt.ts_recent       = 0;
 194                tp->rx_opt.ts_recent_stamp = 0;
 195                tp->write_seq              = 0;
 196        }
 197
 198        if (tcp_death_row.sysctl_tw_recycle &&
 199            !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 200                struct inet_peer *peer = rt_get_peer(rt);
 201                /*
 202                 * VJ's idea. We save last timestamp seen from
 203                 * the destination in peer table, when entering state
 204                 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
 205                 * when trying new connection.
 206                 */
 207                if (peer != NULL &&
 208                    (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
 209                        tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 210                        tp->rx_opt.ts_recent = peer->tcp_ts;
 211                }
 212        }
 213
 214        inet->inet_dport = usin->sin_port;
 215        inet->inet_daddr = daddr;
 216
 217        inet_csk(sk)->icsk_ext_hdr_len = 0;
 218        if (inet->opt)
 219                inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 220
 221        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 222
 223        /* Socket identity is still unknown (sport may be zero).
 224         * However we set state to SYN-SENT and, without releasing the
 225         * socket lock, select a source port, enter ourselves into the
 226         * hash tables and complete initialization after this.
 227         */
 228        tcp_set_state(sk, TCP_SYN_SENT);
 229        err = inet_hash_connect(&tcp_death_row, sk);
 230        if (err)
 231                goto failure;
 232
 233        err = ip_route_newports(&rt, IPPROTO_TCP,
 234                                inet->inet_sport, inet->inet_dport, sk);
 235        if (err)
 236                goto failure;
 237
 238        /* OK, now commit destination to socket.  */
 239        sk->sk_gso_type = SKB_GSO_TCPV4;
 240        sk_setup_caps(sk, &rt->u.dst);
 241
 242        if (!tp->write_seq)
 243                tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 244                                                           inet->inet_daddr,
 245                                                           inet->inet_sport,
 246                                                           usin->sin_port);
 247
 248        inet->inet_id = tp->write_seq ^ jiffies;
 249
 250        err = tcp_connect(sk);
 251        rt = NULL;
 252        if (err)
 253                goto failure;
 254
 255        return 0;
 256
 257failure:
 258        /*
 259         * This unhashes the socket and releases the local port,
 260         * if necessary.
 261         */
 262        tcp_set_state(sk, TCP_CLOSE);
 263        ip_rt_put(rt);
 264        sk->sk_route_caps = 0;
 265        inet->inet_dport = 0;
 266        return err;
 267}
 268
 269/*
 270 * This routine does path mtu discovery as defined in RFC1191.
 271 */
 272static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
 273{
 274        struct dst_entry *dst;
 275        struct inet_sock *inet = inet_sk(sk);
 276
 277        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 278         * sent out by Linux are always < 576 bytes, so they should go
 279         * through unfragmented).
 280         */
 281        if (sk->sk_state == TCP_LISTEN)
 282                return;
 283
 284        /* We don't check in the dst entry if pmtu discovery is forbidden
 285         * on this route. We just assume that no packet-too-big packets
 286         * are sent back when pmtu discovery is not active.
 287         * There is a small race when the user changes this flag in the
 288         * route, but I think that's acceptable.
 289         */
 290        if ((dst = __sk_dst_check(sk, 0)) == NULL)
 291                return;
 292
 293        dst->ops->update_pmtu(dst, mtu);
 294
 295        /* Something is about to go wrong... Remember the soft error
 296         * in case this connection is not able to recover.
 297         */
 298        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 299                sk->sk_err_soft = EMSGSIZE;
 300
 301        mtu = dst_mtu(dst);
 302
 303        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 304            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 305                tcp_sync_mss(sk, mtu);
 306
 307                /* Resend the TCP packet because it's
 308                 * clear that the old packet has been
 309                 * dropped. This is the new "fast" path mtu
 310                 * discovery.
 311                 */
 312                tcp_simple_retransmit(sk);
 313        } /* else let the usual retransmit timer handle it */
 314}
 315
 316/*
 317 * This routine is called by the ICMP module when it gets some
 318 * sort of error condition.  If err < 0 then the socket should
 319 * be closed and the error returned to the user.  If err > 0
 320 * it's just the icmp type << 8 | icmp code.  After adjustment
 321 * header points to the first 8 bytes of the tcp header.  We need
 322 * to find the appropriate port.
 323 *
 324 * The locking strategy used here is very "optimistic". When
 325 * someone else accesses the socket the ICMP is just dropped
 326 * and for some paths there is no check at all.
 327 * A more general error queue to queue errors for later handling
 328 * is probably better.
 329 *
 330 */
 331
 332void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 333{
 334        struct iphdr *iph = (struct iphdr *)icmp_skb->data;
 335        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 336        struct inet_connection_sock *icsk;
 337        struct tcp_sock *tp;
 338        struct inet_sock *inet;
 339        const int type = icmp_hdr(icmp_skb)->type;
 340        const int code = icmp_hdr(icmp_skb)->code;
 341        struct sock *sk;
 342        struct sk_buff *skb;
 343        __u32 seq;
 344        __u32 remaining;
 345        int err;
 346        struct net *net = dev_net(icmp_skb->dev);
 347
 348        if (icmp_skb->len < (iph->ihl << 2) + 8) {
 349                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 350                return;
 351        }
 352
 353        sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 354                        iph->saddr, th->source, inet_iif(icmp_skb));
 355        if (!sk) {
 356                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 357                return;
 358        }
 359        if (sk->sk_state == TCP_TIME_WAIT) {
 360                inet_twsk_put(inet_twsk(sk));
 361                return;
 362        }
 363
 364        bh_lock_sock(sk);
 365        /* If too many ICMPs get dropped on busy
 366         * servers this needs to be solved differently.
 367         */
 368        if (sock_owned_by_user(sk))
 369                NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 370
 371        if (sk->sk_state == TCP_CLOSE)
 372                goto out;
 373
 374        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 375                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 376                goto out;
 377        }
 378
 379        icsk = inet_csk(sk);
 380        tp = tcp_sk(sk);
 381        seq = ntohl(th->seq);
 382        if (sk->sk_state != TCP_LISTEN &&
 383            !between(seq, tp->snd_una, tp->snd_nxt)) {
 384                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 385                goto out;
 386        }
 387
 388        switch (type) {
 389        case ICMP_SOURCE_QUENCH:
 390                /* Just silently ignore these. */
 391                goto out;
 392        case ICMP_PARAMETERPROB:
 393                err = EPROTO;
 394                break;
 395        case ICMP_DEST_UNREACH:
 396                if (code > NR_ICMP_UNREACH)
 397                        goto out;
 398
 399                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 400                        if (!sock_owned_by_user(sk))
 401                                do_pmtu_discovery(sk, iph, info);
 402                        goto out;
 403                }
 404
 405                err = icmp_err_convert[code].errno;
 406                /* check if icmp_skb allows revert of backoff
 407                 * (see draft-zimmermann-tcp-lcd) */
 408                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 409                        break;
 410                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 411                    !icsk->icsk_backoff)
 412                        break;
 413
 414                icsk->icsk_backoff--;
 415                inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
 416                                         icsk->icsk_backoff;
 417                tcp_bound_rto(sk);
 418
 419                skb = tcp_write_queue_head(sk);
 420                BUG_ON(!skb);
 421
 422                remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 423                                tcp_time_stamp - TCP_SKB_CB(skb)->when);
 424
 425                if (remaining) {
 426                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 427                                                  remaining, TCP_RTO_MAX);
 428                } else if (sock_owned_by_user(sk)) {
 429                        /* RTO revert clocked out retransmission,
 430                         * but socket is locked. Will defer. */
 431                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 432                                                  HZ/20, TCP_RTO_MAX);
 433                } else {
 434                        /* RTO revert clocked out retransmission.
 435                         * Will retransmit now */
 436                        tcp_retransmit_timer(sk);
 437                }
 438
 439                break;
 440        case ICMP_TIME_EXCEEDED:
 441                err = EHOSTUNREACH;
 442                break;
 443        default:
 444                goto out;
 445        }
 446
 447        switch (sk->sk_state) {
 448                struct request_sock *req, **prev;
 449        case TCP_LISTEN:
 450                if (sock_owned_by_user(sk))
 451                        goto out;
 452
 453                req = inet_csk_search_req(sk, &prev, th->dest,
 454                                          iph->daddr, iph->saddr);
 455                if (!req)
 456                        goto out;
 457
 458                /* ICMPs are not backlogged, hence we cannot get
 459                   an established socket here.
 460                 */
 461                WARN_ON(req->sk);
 462
 463                if (seq != tcp_rsk(req)->snt_isn) {
 464                        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 465                        goto out;
 466                }
 467
 468                /*
 469                 * Still in SYN_RECV, just remove it silently.
 470                 * There is no good way to pass the error to the newly
 471                 * created socket, and POSIX does not want network
 472                 * errors returned from accept().
 473                 */
 474                inet_csk_reqsk_queue_drop(sk, req, prev);
 475                goto out;
 476
 477        case TCP_SYN_SENT:
 478        case TCP_SYN_RECV:  /* Cannot happen normally.
 479                               It can, for example, if SYNs crossed.
 480                             */
 481                if (!sock_owned_by_user(sk)) {
 482                        sk->sk_err = err;
 483
 484                        sk->sk_error_report(sk);
 485
 486                        tcp_done(sk);
 487                } else {
 488                        sk->sk_err_soft = err;
 489                }
 490                goto out;
 491        }
 492
 493        /* If we've already connected we will keep trying
 494         * until we time out, or the user gives up.
 495         *
 496         * rfc1122 4.2.3.9 allows us to consider as hard errors
 497         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 498         * but it is obsoleted by pmtu discovery).
 499         *
 500         * Note that in the modern internet, where routing is unreliable
 501         * and broken firewalls sit in every dark corner sending random
 502         * errors ordered by their masters, even these two messages have
 503         * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
 504         *
 505         * Now we are in compliance with RFCs.
 506         *                                                      --ANK (980905)
 507         */
 508
 509        inet = inet_sk(sk);
 510        if (!sock_owned_by_user(sk) && inet->recverr) {
 511                sk->sk_err = err;
 512                sk->sk_error_report(sk);
 513        } else  { /* Only an error on timeout */
 514                sk->sk_err_soft = err;
 515        }
 516
 517out:
 518        bh_unlock_sock(sk);
 519        sock_put(sk);
 520}
 521
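    /* Fill in the TCP checksum of an outgoing segment: leave a partial
     * checksum for the NIC to finish when checksum offload is in use
     * (CHECKSUM_PARTIAL), otherwise fold the full checksum in software.
     */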
 522static void __tcp_v4_send_check(struct sk_buff *skb,
 523                                __be32 saddr, __be32 daddr)
 524{
 525        struct tcphdr *th = tcp_hdr(skb);
 526
 527        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 528                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 529                skb->csum_start = skb_transport_header(skb) - skb->head;
 530                skb->csum_offset = offsetof(struct tcphdr, check);
 531        } else {
 532                th->check = tcp_v4_check(skb->len, saddr, daddr,
 533                                         csum_partial(th,
 534                                                      th->doff << 2,
 535                                                      skb->csum));
 536        }
 537}
 538
 539/* This routine computes an IPv4 TCP checksum. */
 540void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 541{
 542        struct inet_sock *inet = inet_sk(sk);
 543
 544        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 545}
 546
 547int tcp_v4_gso_send_check(struct sk_buff *skb)
 548{
 549        const struct iphdr *iph;
 550        struct tcphdr *th;
 551
 552        if (!pskb_may_pull(skb, sizeof(*th)))
 553                return -EINVAL;
 554
 555        iph = ip_hdr(skb);
 556        th = tcp_hdr(skb);
 557
 558        th->check = 0;
 559        skb->ip_summed = CHECKSUM_PARTIAL;
 560        __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
 561        return 0;
 562}
 563
 564/*
 565 *      This routine will send an RST to the other tcp.
 566 *
 567 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 568 *                    for the reset?
 569 *      Answer: if a packet caused an RST, it is not for a socket
 570 *              existing in our system; if it is matched to a socket,
 571 *              it is just a duplicate segment or a bug in the other side's TCP.
 572 *              So we build the reply based only on the parameters that
 573 *              arrived with the segment.
 574 *      Exception: precedence violation. We do not implement it in any case.
 575 */
 576
 577static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 578{
 579        struct tcphdr *th = tcp_hdr(skb);
 580        struct {
 581                struct tcphdr th;
 582#ifdef CONFIG_TCP_MD5SIG
 583                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 584#endif
 585        } rep;
 586        struct ip_reply_arg arg;
 587#ifdef CONFIG_TCP_MD5SIG
 588        struct tcp_md5sig_key *key;
 589#endif
 590        struct net *net;
 591
 592        /* Never send a reset in response to a reset. */
 593        if (th->rst)
 594                return;
 595
 596        if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 597                return;
 598
 599        /* Swap the send and the receive. */
 600        memset(&rep, 0, sizeof(rep));
 601        rep.th.dest   = th->source;
 602        rep.th.source = th->dest;
 603        rep.th.doff   = sizeof(struct tcphdr) / 4;
 604        rep.th.rst    = 1;
 605
 606        if (th->ack) {
 607                rep.th.seq = th->ack_seq;
 608        } else {
 609                rep.th.ack = 1;
 610                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 611                                       skb->len - (th->doff << 2));
 612        }
 613
 614        memset(&arg, 0, sizeof(arg));
 615        arg.iov[0].iov_base = (unsigned char *)&rep;
 616        arg.iov[0].iov_len  = sizeof(rep.th);
 617
 618#ifdef CONFIG_TCP_MD5SIG
 619        key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
 620        if (key) {
 621                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 622                                   (TCPOPT_NOP << 16) |
 623                                   (TCPOPT_MD5SIG << 8) |
 624                                   TCPOLEN_MD5SIG);
 625                /* Update length and the length the header thinks exists */
 626                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 627                rep.th.doff = arg.iov[0].iov_len / 4;
 628
 629                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 630                                     key, ip_hdr(skb)->saddr,
 631                                     ip_hdr(skb)->daddr, &rep.th);
 632        }
 633#endif
 634        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 635                                      ip_hdr(skb)->saddr, /* XXX */
 636                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 637        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 638        arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 639
 640        net = dev_net(skb_dst(skb)->dev);
 641        ip_send_reply(net->ipv4.tcp_sock, skb,
 642                      &arg, arg.iov[0].iov_len);
 643
 644        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 645        TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 646}
 647
 648/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 649   outside of socket context, is certainly ugly. What can I do?
 650 */
 651
 652static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 653                            u32 win, u32 ts, int oif,
 654                            struct tcp_md5sig_key *key,
 655                            int reply_flags)
 656{
 657        struct tcphdr *th = tcp_hdr(skb);
 658        struct {
 659                struct tcphdr th;
 660                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 661#ifdef CONFIG_TCP_MD5SIG
 662                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 663#endif
 664                        ];
 665        } rep;
 666        struct ip_reply_arg arg;
 667        struct net *net = dev_net(skb_dst(skb)->dev);
 668
 669        memset(&rep.th, 0, sizeof(struct tcphdr));
 670        memset(&arg, 0, sizeof(arg));
 671
 672        arg.iov[0].iov_base = (unsigned char *)&rep;
 673        arg.iov[0].iov_len  = sizeof(rep.th);
 674        if (ts) {
 675                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 676                                   (TCPOPT_TIMESTAMP << 8) |
 677                                   TCPOLEN_TIMESTAMP);
 678                rep.opt[1] = htonl(tcp_time_stamp);
 679                rep.opt[2] = htonl(ts);
 680                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 681        }
 682
 683        /* Swap the send and the receive. */
 684        rep.th.dest    = th->source;
 685        rep.th.source  = th->dest;
 686        rep.th.doff    = arg.iov[0].iov_len / 4;
 687        rep.th.seq     = htonl(seq);
 688        rep.th.ack_seq = htonl(ack);
 689        rep.th.ack     = 1;
 690        rep.th.window  = htons(win);
 691
 692#ifdef CONFIG_TCP_MD5SIG
 693        if (key) {
 694                int offset = (ts) ? 3 : 0;
 695
 696                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 697                                          (TCPOPT_NOP << 16) |
 698                                          (TCPOPT_MD5SIG << 8) |
 699                                          TCPOLEN_MD5SIG);
 700                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 701                rep.th.doff = arg.iov[0].iov_len/4;
 702
 703                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 704                                    key, ip_hdr(skb)->saddr,
 705                                    ip_hdr(skb)->daddr, &rep.th);
 706        }
 707#endif
 708        arg.flags = reply_flags;
 709        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 710                                      ip_hdr(skb)->saddr, /* XXX */
 711                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 712        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 713        if (oif)
 714                arg.bound_dev_if = oif;
 715
 716        ip_send_reply(net->ipv4.tcp_sock, skb,
 717                      &arg, arg.iov[0].iov_len);
 718
 719        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 720}
 721
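    /* A segment arrived for a TIME_WAIT socket and the timewait code asked
     * for an ACK; build it from the timewait state and drop our reference.
     */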
 722static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 723{
 724        struct inet_timewait_sock *tw = inet_twsk(sk);
 725        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 726
 727        tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 728                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 729                        tcptw->tw_ts_recent,
 730                        tw->tw_bound_dev_if,
 731                        tcp_twsk_md5_key(tcptw),
 732                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
 733                        );
 734
 735        inet_twsk_put(tw);
 736}
 737
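    /* ACK on behalf of a request_sock still in SYN_RECV, e.g. in reply to
     * an out-of-window or PAWS-failing segment for an embryonic connection.
     */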
 738static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 739                                  struct request_sock *req)
 740{
 741        tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 742                        tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 743                        req->ts_recent,
 744                        0,
 745                        tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
 746                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
 747}
 748
 749/*
 750 *      Send a SYN-ACK after having received a SYN.
 751 *      This still operates on a request_sock only, not on a big
 752 *      socket.
 753 */
 754static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 755                              struct request_sock *req,
 756                              struct request_values *rvp)
 757{
 758        const struct inet_request_sock *ireq = inet_rsk(req);
 759        int err = -1;
 760        struct sk_buff * skb;
 761
 762        /* First, grab a route. */
 763        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 764                return -1;
 765
 766        skb = tcp_make_synack(sk, dst, req, rvp);
 767
 768        if (skb) {
 769                __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 770
 771                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 772                                            ireq->rmt_addr,
 773                                            ireq->opt);
 774                err = net_xmit_eval(err);
 775        }
 776
 777        dst_release(dst);
 778        return err;
 779}
 780
 781static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 782                              struct request_values *rvp)
 783{
 784        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 785        return tcp_v4_send_synack(sk, NULL, req, rvp);
 786}
 787
 788/*
 789 *      IPv4 request_sock destructor.
 790 */
 791static void tcp_v4_reqsk_destructor(struct request_sock *req)
 792{
 793        kfree(inet_rsk(req)->opt);
 794}
 795
 796#ifdef CONFIG_SYN_COOKIES
 797static void syn_flood_warning(struct sk_buff *skb)
 798{
 799        static unsigned long warntime;
 800
 801        if (time_after(jiffies, (warntime + HZ * 60))) {
 802                warntime = jiffies;
 803                printk(KERN_INFO
 804                       "possible SYN flooding on port %d. Sending cookies.\n",
 805                       ntohs(tcp_hdr(skb)->dest));
 806        }
 807}
 808#endif
 809
 810/*
 811 * Save and compile IPv4 options into the request_sock if needed.
 812 */
 813static struct ip_options *tcp_v4_save_options(struct sock *sk,
 814                                              struct sk_buff *skb)
 815{
 816        struct ip_options *opt = &(IPCB(skb)->opt);
 817        struct ip_options *dopt = NULL;
 818
 819        if (opt && opt->optlen) {
 820                int opt_size = optlength(opt);
 821                dopt = kmalloc(opt_size, GFP_ATOMIC);
 822                if (dopt) {
 823                        if (ip_options_echo(dopt, skb)) {
 824                                kfree(dopt);
 825                                dopt = NULL;
 826                        }
 827                }
 828        }
 829        return dopt;
 830}
 831
 832#ifdef CONFIG_TCP_MD5SIG
 833/*
 834 * RFC2385 MD5 checksumming requires a mapping of
 835 * IP address->MD5 Key.
 836 * We need to maintain these in the sk structure.
 837 */
 838
 839/* Find the Key structure for an address.  */
 840static struct tcp_md5sig_key *
 841                        tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 842{
 843        struct tcp_sock *tp = tcp_sk(sk);
 844        int i;
 845
 846        if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 847                return NULL;
 848        for (i = 0; i < tp->md5sig_info->entries4; i++) {
 849                if (tp->md5sig_info->keys4[i].addr == addr)
 850                        return &tp->md5sig_info->keys4[i].base;
 851        }
 852        return NULL;
 853}
 854
 855struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 856                                         struct sock *addr_sk)
 857{
 858        return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
 859}
 860
 861EXPORT_SYMBOL(tcp_v4_md5_lookup);
 862
 863static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 864                                                      struct request_sock *req)
 865{
 866        return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 867}
 868
 869/* This can be called on a newly created socket, from other files */
 870int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 871                      u8 *newkey, u8 newkeylen)
 872{
 873        /* Add Key to the list */
 874        struct tcp_md5sig_key *key;
 875        struct tcp_sock *tp = tcp_sk(sk);
 876        struct tcp4_md5sig_key *keys;
 877
 878        key = tcp_v4_md5_do_lookup(sk, addr);
 879        if (key) {
 880                /* Pre-existing entry - just update that one. */
 881                kfree(key->key);
 882                key->key = newkey;
 883                key->keylen = newkeylen;
 884        } else {
 885                struct tcp_md5sig_info *md5sig;
 886
 887                if (!tp->md5sig_info) {
 888                        tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 889                                                  GFP_ATOMIC);
 890                        if (!tp->md5sig_info) {
 891                                kfree(newkey);
 892                                return -ENOMEM;
 893                        }
 894                        sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 895                }
 896                if (tcp_alloc_md5sig_pool(sk) == NULL) {
 897                        kfree(newkey);
 898                        return -ENOMEM;
 899                }
 900                md5sig = tp->md5sig_info;
 901
 902                if (md5sig->alloced4 == md5sig->entries4) {
 903                        keys = kmalloc((sizeof(*keys) *
 904                                        (md5sig->entries4 + 1)), GFP_ATOMIC);
 905                        if (!keys) {
 906                                kfree(newkey);
 907                                tcp_free_md5sig_pool();
 908                                return -ENOMEM;
 909                        }
 910
 911                        if (md5sig->entries4)
 912                                memcpy(keys, md5sig->keys4,
 913                                       sizeof(*keys) * md5sig->entries4);
 914
 915                        /* Free old key list, and reference new one */
 916                        kfree(md5sig->keys4);
 917                        md5sig->keys4 = keys;
 918                        md5sig->alloced4++;
 919                }
 920                md5sig->entries4++;
 921                md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 922                md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 923                md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 924        }
 925        return 0;
 926}
 927
 928EXPORT_SYMBOL(tcp_v4_md5_do_add);
 929
 930static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 931                               u8 *newkey, u8 newkeylen)
 932{
 933        return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
 934                                 newkey, newkeylen);
 935}
 936
 937int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 938{
 939        struct tcp_sock *tp = tcp_sk(sk);
 940        int i;
 941
 942        for (i = 0; i < tp->md5sig_info->entries4; i++) {
 943                if (tp->md5sig_info->keys4[i].addr == addr) {
 944                        /* Free the key */
 945                        kfree(tp->md5sig_info->keys4[i].base.key);
 946                        tp->md5sig_info->entries4--;
 947
 948                        if (tp->md5sig_info->entries4 == 0) {
 949                                kfree(tp->md5sig_info->keys4);
 950                                tp->md5sig_info->keys4 = NULL;
 951                                tp->md5sig_info->alloced4 = 0;
 952                        } else if (tp->md5sig_info->entries4 != i) {
 953                                /* Need to do some manipulation */
 954                                memmove(&tp->md5sig_info->keys4[i],
 955                                        &tp->md5sig_info->keys4[i+1],
 956                                        (tp->md5sig_info->entries4 - i) *
 957                                         sizeof(struct tcp4_md5sig_key));
 958                        }
 959                        tcp_free_md5sig_pool();
 960                        return 0;
 961                }
 962        }
 963        return -ENOENT;
 964}
 965
 966EXPORT_SYMBOL(tcp_v4_md5_do_del);
 967
 968static void tcp_v4_clear_md5_list(struct sock *sk)
 969{
 970        struct tcp_sock *tp = tcp_sk(sk);
 971
 972        /* Free each key, then the set of keys,
 973         * the crypto element, and then decrement our
 974         * hold on the last resort crypto.
 975         */
 976        if (tp->md5sig_info->entries4) {
 977                int i;
 978                for (i = 0; i < tp->md5sig_info->entries4; i++)
 979                        kfree(tp->md5sig_info->keys4[i].base.key);
 980                tp->md5sig_info->entries4 = 0;
 981                tcp_free_md5sig_pool();
 982        }
 983        if (tp->md5sig_info->keys4) {
 984                kfree(tp->md5sig_info->keys4);
 985                tp->md5sig_info->keys4 = NULL;
 986                tp->md5sig_info->alloced4  = 0;
 987        }
 988}
 989
 990static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 991                                 int optlen)
 992{
 993        struct tcp_md5sig cmd;
 994        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
 995        u8 *newkey;
 996
 997        if (optlen < sizeof(cmd))
 998                return -EINVAL;
 999
1000        if (copy_from_user(&cmd, optval, sizeof(cmd)))
1001                return -EFAULT;
1002
1003        if (sin->sin_family != AF_INET)
1004                return -EINVAL;
1005
1006        if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1007                if (!tcp_sk(sk)->md5sig_info)
1008                        return -ENOENT;
1009                return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1010        }
1011
1012        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1013                return -EINVAL;
1014
1015        if (!tcp_sk(sk)->md5sig_info) {
1016                struct tcp_sock *tp = tcp_sk(sk);
1017                struct tcp_md5sig_info *p;
1018
1019                p = kzalloc(sizeof(*p), sk->sk_allocation);
1020                if (!p)
1021                        return -EINVAL;
1022
1023                tp->md5sig_info = p;
1024                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1025        }
1026
1027        newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1028        if (!newkey)
1029                return -ENOMEM;
1030        return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1031                                 newkey, cmd.tcpm_keylen);
1032}
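    /* For reference, a minimal user-space sketch (not part of this file) of
     * how a key reaches the parser above via the TCP_MD5SIG socket option;
     * the peer address, key and socket fd below are purely illustrative:
     *
     *      struct tcp_md5sig md5 = { };
     *      struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
     *
     *      sin->sin_family = AF_INET;
     *      inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
     *      md5.tcpm_keylen = 6;
     *      memcpy(md5.tcpm_key, "secret", 6);
     *      setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
     */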
1033
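    /* Step 1 of the RFC 2385 digest: mix the TCP pseudo-header into the
     * MD5 state before the TCP header, payload and key are hashed.
     */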
1034static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1035                                        __be32 daddr, __be32 saddr, int nbytes)
1036{
1037        struct tcp4_pseudohdr *bp;
1038        struct scatterlist sg;
1039
1040        bp = &hp->md5_blk.ip4;
1041
1042        /*
1043         * 1. the TCP pseudo-header (in the order: source IP address,
1044         * destination IP address, zero-padded protocol number, and
1045         * segment length)
1046         */
1047        bp->saddr = saddr;
1048        bp->daddr = daddr;
1049        bp->pad = 0;
1050        bp->protocol = IPPROTO_TCP;
1051        bp->len = cpu_to_be16(nbytes);
1052
1053        sg_init_one(&sg, bp, sizeof(*bp));
1054        return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1055}
1056
1057static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1058                               __be32 daddr, __be32 saddr, struct tcphdr *th)
1059{
1060        struct tcp_md5sig_pool *hp;
1061        struct hash_desc *desc;
1062
1063        hp = tcp_get_md5sig_pool();
1064        if (!hp)
1065                goto clear_hash_noput;
1066        desc = &hp->md5_desc;
1067
1068        if (crypto_hash_init(desc))
1069                goto clear_hash;
1070        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1071                goto clear_hash;
1072        if (tcp_md5_hash_header(hp, th))
1073                goto clear_hash;
1074        if (tcp_md5_hash_key(hp, key))
1075                goto clear_hash;
1076        if (crypto_hash_final(desc, md5_hash))
1077                goto clear_hash;
1078
1079        tcp_put_md5sig_pool();
1080        return 0;
1081
1082clear_hash:
1083        tcp_put_md5sig_pool();
1084clear_hash_noput:
1085        memset(md5_hash, 0, 16);
1086        return 1;
1087}
1088
1089int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1090                        struct sock *sk, struct request_sock *req,
1091                        struct sk_buff *skb)
1092{
1093        struct tcp_md5sig_pool *hp;
1094        struct hash_desc *desc;
1095        struct tcphdr *th = tcp_hdr(skb);
1096        __be32 saddr, daddr;
1097
1098        if (sk) {
1099                saddr = inet_sk(sk)->inet_saddr;
1100                daddr = inet_sk(sk)->inet_daddr;
1101        } else if (req) {
1102                saddr = inet_rsk(req)->loc_addr;
1103                daddr = inet_rsk(req)->rmt_addr;
1104        } else {
1105                const struct iphdr *iph = ip_hdr(skb);
1106                saddr = iph->saddr;
1107                daddr = iph->daddr;
1108        }
1109
1110        hp = tcp_get_md5sig_pool();
1111        if (!hp)
1112                goto clear_hash_noput;
1113        desc = &hp->md5_desc;
1114
1115        if (crypto_hash_init(desc))
1116                goto clear_hash;
1117
1118        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1119                goto clear_hash;
1120        if (tcp_md5_hash_header(hp, th))
1121                goto clear_hash;
1122        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1123                goto clear_hash;
1124        if (tcp_md5_hash_key(hp, key))
1125                goto clear_hash;
1126        if (crypto_hash_final(desc, md5_hash))
1127                goto clear_hash;
1128
1129        tcp_put_md5sig_pool();
1130        return 0;
1131
1132clear_hash:
1133        tcp_put_md5sig_pool();
1134clear_hash_noput:
1135        memset(md5_hash, 0, 16);
1136        return 1;
1137}
1138
1139EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1140
1141static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1142{
1143        /*
1144         * This gets called for each TCP segment that arrives
1145         * so we want to be efficient.
1146         * We have 3 drop cases:
1147         * o No MD5 hash and one expected.
1148         * o MD5 hash and we're not expecting one.
1149         * o MD5 hash and it's wrong.
1150         */
1151        __u8 *hash_location = NULL;
1152        struct tcp_md5sig_key *hash_expected;
1153        const struct iphdr *iph = ip_hdr(skb);
1154        struct tcphdr *th = tcp_hdr(skb);
1155        int genhash;
1156        unsigned char newhash[16];
1157
1158        hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1159        hash_location = tcp_parse_md5sig_option(th);
1160
1161        /* We've parsed the options - do we have a hash? */
1162        if (!hash_expected && !hash_location)
1163                return 0;
1164
1165        if (hash_expected && !hash_location) {
1166                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1167                return 1;
1168        }
1169
1170        if (!hash_expected && hash_location) {
1171                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1172                return 1;
1173        }
1174
1175        /* Okay, so this is hash_expected and hash_location -
1176         * so we need to calculate the checksum.
1177         */
1178        genhash = tcp_v4_md5_hash_skb(newhash,
1179                                      hash_expected,
1180                                      NULL, NULL, skb);
1181
1182        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1183                if (net_ratelimit()) {
1184                        printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1185                               &iph->saddr, ntohs(th->source),
1186                               &iph->daddr, ntohs(th->dest),
1187                               genhash ? " tcp_v4_calc_md5_hash failed" : "");
1188                }
1189                return 1;
1190        }
1191        return 0;
1192}
1193
1194#endif
1195
1196struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1197        .family         =       PF_INET,
1198        .obj_size       =       sizeof(struct tcp_request_sock),
1199        .rtx_syn_ack    =       tcp_v4_rtx_synack,
1200        .send_ack       =       tcp_v4_reqsk_send_ack,
1201        .destructor     =       tcp_v4_reqsk_destructor,
1202        .send_reset     =       tcp_v4_send_reset,
1203        .syn_ack_timeout =      tcp_syn_ack_timeout,
1204};
1205
1206#ifdef CONFIG_TCP_MD5SIG
1207static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1208        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1209        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1210};
1211#endif
1212
1213static struct timewait_sock_ops tcp_timewait_sock_ops = {
1214        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1215        .twsk_unique    = tcp_twsk_unique,
1216        .twsk_destructor= tcp_twsk_destructor,
1217};
1218
1219int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1220{
1221        struct tcp_extend_values tmp_ext;
1222        struct tcp_options_received tmp_opt;
1223        u8 *hash_location;
1224        struct request_sock *req;
1225        struct inet_request_sock *ireq;
1226        struct tcp_sock *tp = tcp_sk(sk);
1227        struct dst_entry *dst = NULL;
1228        __be32 saddr = ip_hdr(skb)->saddr;
1229        __be32 daddr = ip_hdr(skb)->daddr;
1230        __u32 isn = TCP_SKB_CB(skb)->when;
1231#ifdef CONFIG_SYN_COOKIES
1232        int want_cookie = 0;
1233#else
1234#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1235#endif
1236
1237        /* Never answer SYNs sent to broadcast or multicast addresses */
1238        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1239                goto drop;
1240
1241        /* TW buckets are converted to open requests without
1242         * limitations, they conserve resources and the peer is
1243         * evidently a real one.
1244         */
1245        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1246#ifdef CONFIG_SYN_COOKIES
1247                if (sysctl_tcp_syncookies) {
1248                        want_cookie = 1;
1249                } else
1250#endif
1251                goto drop;
1252        }
1253
1254        /* Accept backlog is full. If we have already queued enough
1255         * warm entries in the syn queue, drop the request. It is better
1256         * than clogging the syn queue with openreqs with exponentially
1257         * increasing timeout.
1258         */
1259        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1260                goto drop;
1261
1262        req = inet_reqsk_alloc(&tcp_request_sock_ops);
1263        if (!req)
1264                goto drop;
1265
1266#ifdef CONFIG_TCP_MD5SIG
1267        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1268#endif
1269
1270        tcp_clear_options(&tmp_opt);
1271        tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1272        tmp_opt.user_mss  = tp->rx_opt.user_mss;
1273        tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1274
1275        if (tmp_opt.cookie_plus > 0 &&
1276            tmp_opt.saw_tstamp &&
1277            !tp->rx_opt.cookie_out_never &&
1278            (sysctl_tcp_cookie_size > 0 ||
1279             (tp->cookie_values != NULL &&
1280              tp->cookie_values->cookie_desired > 0))) {
1281                u8 *c;
1282                u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1283                int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1284
1285                if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1286                        goto drop_and_release;
1287
1288                /* Secret recipe starts with IP addresses */
1289                *mess++ ^= (__force u32)daddr;
1290                *mess++ ^= (__force u32)saddr;
1291
1292                /* plus variable length Initiator Cookie */
1293                c = (u8 *)mess;
1294                while (l-- > 0)
1295                        *c++ ^= *hash_location++;
1296
1297#ifdef CONFIG_SYN_COOKIES
1298                want_cookie = 0;        /* not our kind of cookie */
1299#endif
1300                tmp_ext.cookie_out_never = 0; /* false */
1301                tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1302        } else if (!tp->rx_opt.cookie_in_always) {
1303                /* redundant indications, but ensure initialization. */
1304                tmp_ext.cookie_out_never = 1; /* true */
1305                tmp_ext.cookie_plus = 0;
1306        } else {
1307                goto drop_and_release;
1308        }
1309        tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1310
1311        if (want_cookie && !tmp_opt.saw_tstamp)
1312                tcp_clear_options(&tmp_opt);
1313
1314        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1315        tcp_openreq_init(req, &tmp_opt, skb);
1316
1317        ireq = inet_rsk(req);
1318        ireq->loc_addr = daddr;
1319        ireq->rmt_addr = saddr;
1320        ireq->no_srccheck = inet_sk(sk)->transparent;
1321        ireq->opt = tcp_v4_save_options(sk, skb);
1322
1323        if (security_inet_conn_request(sk, skb, req))
1324                goto drop_and_free;
1325
1326        if (!want_cookie)
1327                TCP_ECN_create_request(req, tcp_hdr(skb));
1328
1329        if (want_cookie) {
1330#ifdef CONFIG_SYN_COOKIES
1331                syn_flood_warning(skb);
1332                req->cookie_ts = tmp_opt.tstamp_ok;
1333#endif
1334                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1335        } else if (!isn) {
1336                struct inet_peer *peer = NULL;
1337
1338                /* VJ's idea. We save last timestamp seen
1339                 * from the destination in peer table, when entering
1340                 * state TIME-WAIT, and check against it before
1341                 * accepting new connection request.
1342                 *
1343                 * If "isn" is not zero, this request hit alive
1344                 * timewait bucket, so that all the necessary checks
1345                 * are made in the function processing timewait state.
1346                 */
1347                if (tmp_opt.saw_tstamp &&
1348                    tcp_death_row.sysctl_tw_recycle &&
1349                    (dst = inet_csk_route_req(sk, req)) != NULL &&
1350                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1351                    peer->v4daddr == saddr) {
1352                        if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1353                            (s32)(peer->tcp_ts - req->ts_recent) >
1354                                                        TCP_PAWS_WINDOW) {
1355                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1356                                goto drop_and_release;
1357                        }
1358                }
1359                /* Kill the following clause, if you dislike this way. */
1360                else if (!sysctl_tcp_syncookies &&
1361                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1362                          (sysctl_max_syn_backlog >> 2)) &&
1363                         (!peer || !peer->tcp_ts_stamp) &&
1364                         (!dst || !dst_metric(dst, RTAX_RTT))) {
1365                        /* Without syncookies the last quarter of
1366                         * the backlog is filled only with destinations
1367                         * proven to be alive.
1368                         * It means that we continue to communicate
1369                         * with destinations already remembered
1370                         * at the moment of the synflood.
1371                         */
1372                        LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1373                                       &saddr, ntohs(tcp_hdr(skb)->source));
1374                        goto drop_and_release;
1375                }
1376
1377                isn = tcp_v4_init_sequence(skb);
1378        }
1379        tcp_rsk(req)->snt_isn = isn;
1380
1381        if (tcp_v4_send_synack(sk, dst, req,
1382                               (struct request_values *)&tmp_ext) ||
1383            want_cookie)
1384                goto drop_and_free;
1385
1386        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1387        return 0;
1388
1389drop_and_release:
1390        dst_release(dst);
1391drop_and_free:
1392        reqsk_free(req);
1393drop:
1394        return 0;
1395}
1396
1397
1398/*
1399 * The three-way handshake has completed - we got a valid ACK of our
1400 * SYN-ACK - now create the new socket.
1401 */
1402struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1403                                  struct request_sock *req,
1404                                  struct dst_entry *dst)
1405{
1406        struct inet_request_sock *ireq;
1407        struct inet_sock *newinet;
1408        struct tcp_sock *newtp;
1409        struct sock *newsk;
1410#ifdef CONFIG_TCP_MD5SIG
1411        struct tcp_md5sig_key *key;
1412#endif
1413
1414        if (sk_acceptq_is_full(sk))
1415                goto exit_overflow;
1416
1417        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1418                goto exit;
1419
1420        newsk = tcp_create_openreq_child(sk, req, skb);
1421        if (!newsk)
1422                goto exit;
1423
1424        newsk->sk_gso_type = SKB_GSO_TCPV4;
1425        sk_setup_caps(newsk, dst);
1426
1427        newtp                 = tcp_sk(newsk);
1428        newinet               = inet_sk(newsk);
1429        ireq                  = inet_rsk(req);
1430        newinet->inet_daddr   = ireq->rmt_addr;
1431        newinet->inet_rcv_saddr = ireq->loc_addr;
1432        newinet->inet_saddr           = ireq->loc_addr;
1433        newinet->opt          = ireq->opt;
1434        ireq->opt             = NULL;
1435        newinet->mc_index     = inet_iif(skb);
1436        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1437        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1438        if (newinet->opt)
1439                inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1440        newinet->inet_id = newtp->write_seq ^ jiffies;
1441
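            /* Set up MTU probing state and clamp the advertised MSS to the
             * route metric and any user-specified MSS.
             */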
1442        tcp_mtup_init(newsk);
1443        tcp_sync_mss(newsk, dst_mtu(dst));
1444        newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1445        if (tcp_sk(sk)->rx_opt.user_mss &&
1446            tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1447                newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1448
1449        tcp_initialize_rcv_mss(newsk);
1450
1451#ifdef CONFIG_TCP_MD5SIG
1452        /* Copy over the MD5 key from the original socket */
1453        key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1454        if (key != NULL) {
1455                /*
1456                 * We're using one, so create a matching key
1457                 * on the newsk structure. If we fail to get
1458                 * memory, then we end up not copying the key
1459                 * across. Shucks.
1460                 */
1461                char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1462                if (newkey != NULL)
1463                        tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1464                                          newkey, key->keylen);
1465                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1466        }
1467#endif
1468
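            /* Hash the child socket into the established table and let it
             * share the listener's local port binding.
             */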
1469        __inet_hash_nolisten(newsk, NULL);
1470        __inet_inherit_port(sk, newsk);
1471
1472        return newsk;
1473
1474exit_overflow:
1475        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1476exit:
1477        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1478        dst_release(dst);
1479        return NULL;
1480}
1481
1482static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1483{
1484        struct tcphdr *th = tcp_hdr(skb);
1485        const struct iphdr *iph = ip_hdr(skb);
1486        struct sock *nsk;
1487        struct request_sock **prev;
1488        /* Find possible connection requests. */
1489        struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1490                                                       iph->saddr, iph->daddr);
1491        if (req)
1492                return tcp_check_req(sk, skb, req, prev);
1493
1494        nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1495                        th->source, iph->daddr, th->dest, inet_iif(skb));
1496
1497        if (nsk) {
1498                if (nsk->sk_state != TCP_TIME_WAIT) {
1499                        bh_lock_sock(nsk);
1500                        return nsk;
1501                }
1502                inet_twsk_put(inet_twsk(nsk));
1503                return NULL;
1504        }
1505
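            /* No request or established socket matched: a bare ACK may still
             * belong to a connection that was answered with a SYN cookie.
             */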
1506#ifdef CONFIG_SYN_COOKIES
1507        if (!th->rst && !th->syn && th->ack)
1508                sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1509#endif
1510        return sk;
1511}
1512
1513static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1514{
1515        const struct iphdr *iph = ip_hdr(skb);
1516
1517        if (skb->ip_summed == CHECKSUM_COMPLETE) {
1518                if (!tcp_v4_check(skb->len, iph->saddr,
1519                                  iph->daddr, skb->csum)) {
1520                        skb->ip_summed = CHECKSUM_UNNECESSARY;
1521                        return 0;
1522                }
1523        }
1524
1525        skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1526                                       skb->len, IPPROTO_TCP, 0);
1527
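            /* Verify short packets right away; longer ones keep the
             * pseudo-header seed in skb->csum and are completed later,
             * e.g. while the payload is copied to user space.
             */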
1528        if (skb->len <= 76) {
1529                return __skb_checksum_complete(skb);
1530        }
1531        return 0;
1532}
1533
1534
1535/* The socket must have its spinlock held when we get
1536 * here.
1537 *
1538 * We have a potential double-lock case here, so even when
1539 * doing backlog processing we use the BH locking scheme.
1540 * This is because we cannot sleep with the original spinlock
1541 * held.
1542 */
1543int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1544{
1545        struct sock *rsk;
1546#ifdef CONFIG_TCP_MD5SIG
1547        /*
1548         * We really want to reject the packet as early as possible
1549         * if:
1550         *  o We're expecting an MD5-signed packet and there is no MD5 TCP option
1551         *  o There is an MD5 option and we're not expecting one
1552         */
1553        if (tcp_v4_inbound_md5_hash(sk, skb))
1554                goto discard;
1555#endif
1556
1557        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1558                sock_rps_save_rxhash(sk, skb->rxhash);
1559                TCP_CHECK_TIMER(sk);
1560                if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1561                        rsk = sk;
1562                        goto reset;
1563                }
1564                TCP_CHECK_TIMER(sk);
1565                return 0;
1566        }
1567
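            /* Not the established fast path: validate the header length and
             * checksum before any further state processing.
             */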
1568        if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1569                goto csum_err;
1570
1571        if (sk->sk_state == TCP_LISTEN) {
1572                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1573                if (!nsk)
1574                        goto discard;
1575
1576                if (nsk != sk) {
1577                        if (tcp_child_process(sk, nsk, skb)) {
1578                                rsk = nsk;
1579                                goto reset;
1580                        }
1581                        return 0;
1582                }
1583        } else
1584                sock_rps_save_rxhash(sk, skb->rxhash);
1585
1586
1587        TCP_CHECK_TIMER(sk);
1588        if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1589                rsk = sk;
1590                goto reset;
1591        }
1592        TCP_CHECK_TIMER(sk);
1593        return 0;
1594
1595reset:
1596        tcp_v4_send_reset(rsk, skb);
1597discard:
1598        kfree_skb(skb);
1599        /* Be careful here. If this function gets more complicated and
1600         * gcc suffers from register pressure on the x86, sk (in %ebx)
1601         * might be destroyed here. This current version compiles correctly,
1602         * but you have been warned.
1603         */
1604        return 0;
1605
1606csum_err:
1607        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1608        goto discard;
1609}
1610
1611/*
1612 *      From tcp_input.c
1613 */
1614
1615int tcp_v4_rcv(struct sk_buff *skb)
1616{
1617        const struct iphdr *iph;
1618        struct tcphdr *th;
1619        struct sock *sk;
1620        int ret;
1621        struct net *net = dev_net(skb->dev);
1622
1623        if (skb->pkt_type != PACKET_HOST)
1624                goto discard_it;
1625
1626        /* Count it even if it's bad */
1627        TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1628
1629        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1630                goto discard_it;
1631
1632        th = tcp_hdr(skb);
1633
1634        if (th->doff < sizeof(struct tcphdr) / 4)
1635                goto bad_packet;
1636        if (!pskb_may_pull(skb, th->doff * 4))
1637                goto discard_it;
1638
1639        /* An explanation is required here, I think.
1640         * Packet length and doff are validated by header prediction,
1641         * provided the case of th->doff == 0 is eliminated.
1642         * So, we defer the checks. */
1643        if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1644                goto bad_packet;
1645
1646        th = tcp_hdr(skb);
1647        iph = ip_hdr(skb);
1648        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1649        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1650                                    skb->len - th->doff * 4);
1651        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1652        TCP_SKB_CB(skb)->when    = 0;
1653        TCP_SKB_CB(skb)->flags   = iph->tos;
1654        TCP_SKB_CB(skb)->sacked  = 0;
1655
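            /* Find the socket (established, TIME_WAIT or listening) that owns
             * this segment, keyed on the address/port 4-tuple.
             */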
1656        sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1657        if (!sk)
1658                goto no_tcp_socket;
1659
1660process:
1661        if (sk->sk_state == TCP_TIME_WAIT)
1662                goto do_time_wait;
1663
1664        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1665                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1666                goto discard_and_relse;
1667        }
1668
1669        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1670                goto discard_and_relse;
1671        nf_reset(skb);
1672
1673        if (sk_filter(sk, skb))
1674                goto discard_and_relse;
1675
1676        skb->dev = NULL;
1677
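            /* If no user context owns the socket, process the segment now
             * (possibly via the prequeue); otherwise queue it on the socket
             * backlog for the lock owner, dropping it if the backlog is full.
             */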
1678        bh_lock_sock_nested(sk);
1679        ret = 0;
1680        if (!sock_owned_by_user(sk)) {
1681#ifdef CONFIG_NET_DMA
1682                struct tcp_sock *tp = tcp_sk(sk);
1683                if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1684                        tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1685                if (tp->ucopy.dma_chan)
1686                        ret = tcp_v4_do_rcv(sk, skb);
1687                else
1688#endif
1689                {
1690                        if (!tcp_prequeue(sk, skb))
1691                                ret = tcp_v4_do_rcv(sk, skb);
1692                }
1693        } else if (unlikely(sk_add_backlog(sk, skb))) {
1694                bh_unlock_sock(sk);
1695                NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1696                goto discard_and_relse;
1697        }
1698        bh_unlock_sock(sk);
1699
1700        sock_put(sk);
1701
1702        return ret;
1703
1704no_tcp_socket:
1705        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1706                goto discard_it;
1707
1708        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1709bad_packet:
1710                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1711        } else {
1712                tcp_v4_send_reset(NULL, skb);
1713        }
1714
1715discard_it:
1716        /* Discard frame. */
1717        kfree_skb(skb);
1718        return 0;
1719
1720discard_and_relse:
1721        sock_put(sk);
1722        goto discard_it;
1723
1724do_time_wait:
1725        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1726                inet_twsk_put(inet_twsk(sk));
1727                goto discard_it;
1728        }
1729
1730        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1731                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1732                inet_twsk_put(inet_twsk(sk));
1733                goto discard_it;
1734        }
1735        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1736        case TCP_TW_SYN: {
1737                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1738                                                        &tcp_hashinfo,
1739                                                        iph->daddr, th->dest,
1740                                                        inet_iif(skb));
1741                if (sk2) {
1742                        inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1743                        inet_twsk_put(inet_twsk(sk));
1744                        sk = sk2;
1745                        goto process;
1746                }
1747                /* Fall through to ACK */
1748        }
1749        case TCP_TW_ACK:
1750                tcp_v4_timewait_ack(sk, skb);
1751                break;
1752        case TCP_TW_RST:
1753                goto no_tcp_socket;
1754        case TCP_TW_SUCCESS:;
1755        }
1756        goto discard_it;
1757}
1758
1759/* VJ's idea. Save last timestamp seen from this destination
1760 * and hold it for at least the normal timewait interval, to use for duplicate
1761 * segment detection in subsequent connections, before they enter synchronized
1762 * state.
1763 */
1764
1765int tcp_v4_remember_stamp(struct sock *sk)
1766{
1767        struct inet_sock *inet = inet_sk(sk);
1768        struct tcp_sock *tp = tcp_sk(sk);
1769        struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1770        struct inet_peer *peer = NULL;
1771        int release_it = 0;
1772
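            /* Prefer the peer bound to the cached route when it matches the
             * destination; otherwise look the peer up and remember to drop the
             * extra reference afterwards.
             */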
1773        if (!rt || rt->rt_dst != inet->inet_daddr) {
1774                peer = inet_getpeer(inet->inet_daddr, 1);
1775                release_it = 1;
1776        } else {
1777                if (!rt->peer)
1778                        rt_bind_peer(rt, 1);
1779                peer = rt->peer;
1780        }
1781
1782        if (peer) {
1783                if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1784                    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1785                     peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1786                        peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1787                        peer->tcp_ts = tp->rx_opt.ts_recent;
1788                }
1789                if (release_it)
1790                        inet_putpeer(peer);
1791                return 1;
1792        }
1793
1794        return 0;
1795}
1796
1797int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1798{
1799        struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1800
1801        if (peer) {
1802                const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1803
1804                if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1805                    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1806                     peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1807                        peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1808                        peer->tcp_ts       = tcptw->tw_ts_recent;
1809                }
1810                inet_putpeer(peer);
1811                return 1;
1812        }
1813
1814        return 0;
1815}
1816
1817const struct inet_connection_sock_af_ops ipv4_specific = {
1818        .queue_xmit        = ip_queue_xmit,
1819        .send_check        = tcp_v4_send_check,
1820        .rebuild_header    = inet_sk_rebuild_header,
1821        .conn_request      = tcp_v4_conn_request,
1822        .syn_recv_sock     = tcp_v4_syn_recv_sock,
1823        .remember_stamp    = tcp_v4_remember_stamp,
1824        .net_header_len    = sizeof(struct iphdr),
1825        .setsockopt        = ip_setsockopt,
1826        .getsockopt        = ip_getsockopt,
1827        .addr2sockaddr     = inet_csk_addr2sockaddr,
1828        .sockaddr_len      = sizeof(struct sockaddr_in),
1829        .bind_conflict     = inet_csk_bind_conflict,
1830#ifdef CONFIG_COMPAT
1831        .compat_setsockopt = compat_ip_setsockopt,
1832        .compat_getsockopt = compat_ip_getsockopt,
1833#endif
1834};
1835
1836#ifdef CONFIG_TCP_MD5SIG
1837static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1838        .md5_lookup             = tcp_v4_md5_lookup,
1839        .calc_md5_hash          = tcp_v4_md5_hash_skb,
1840        .md5_add                = tcp_v4_md5_add_func,
1841        .md5_parse              = tcp_v4_parse_md5_keys,
1842};
1843#endif
1844
1845/* NOTE: A lot of things are set to zero explicitly by the call to
1846 *       sk_alloc(), so they need not be done here.
1847 */
1848static int tcp_v4_init_sock(struct sock *sk)
1849{
1850        struct inet_connection_sock *icsk = inet_csk(sk);
1851        struct tcp_sock *tp = tcp_sk(sk);
1852
1853        skb_queue_head_init(&tp->out_of_order_queue);
1854        tcp_init_xmit_timers(sk);
1855        tcp_prequeue_init(tp);
1856
1857        icsk->icsk_rto = TCP_TIMEOUT_INIT;
1858        tp->mdev = TCP_TIMEOUT_INIT;
1859
1860        /* So many TCP implementations out there (incorrectly) count the
1861         * initial SYN frame in their delayed-ACK and congestion control
1862         * algorithms that we must have the following bandaid to talk
1863         * efficiently to them.  -DaveM
1864         */
1865        tp->snd_cwnd = 2;
1866
1867        /* See draft-stevens-tcpca-spec-01 for discussion of the
1868         * initialization of these values.
1869         */
1870        tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1871        tp->snd_cwnd_clamp = ~0;
1872        tp->mss_cache = TCP_MSS_DEFAULT;
1873
1874        tp->reordering = sysctl_tcp_reordering;
1875        icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1876
1877        sk->sk_state = TCP_CLOSE;
1878
1879        sk->sk_write_space = sk_stream_write_space;
1880        sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1881
1882        icsk->icsk_af_ops = &ipv4_specific;
1883        icsk->icsk_sync_mss = tcp_sync_mss;
1884#ifdef CONFIG_TCP_MD5SIG
1885        tp->af_specific = &tcp_sock_ipv4_specific;
1886#endif
1887
1888        /* TCP Cookie Transactions */
1889        if (sysctl_tcp_cookie_size > 0) {
1890                /* Default, cookies without s_data_payload. */
1891                tp->cookie_values =
1892                        kzalloc(sizeof(*tp->cookie_values),
1893                                sk->sk_allocation);
1894                if (tp->cookie_values != NULL)
1895                        kref_init(&tp->cookie_values->kref);
1896        }
1897        /* Presumed zeroed, in order of appearance:
1898         *      cookie_in_always, cookie_out_never,
1899         *      s_data_constant, s_data_in, s_data_out
1900         */
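            /* Buffer sizes start from the default (middle) entries of the
             * tcp_wmem/tcp_rmem sysctls.
             */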
1901        sk->sk_sndbuf = sysctl_tcp_wmem[1];
1902        sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1903
1904        local_bh_disable();
1905        percpu_counter_inc(&tcp_sockets_allocated);
1906        local_bh_enable();
1907
1908        return 0;
1909}
1910
1911void tcp_v4_destroy_sock(struct sock *sk)
1912{
1913        struct tcp_sock *tp = tcp_sk(sk);
1914
1915        tcp_clear_xmit_timers(sk);
1916
1917        tcp_cleanup_congestion_control(sk);
1918
1919        /* Clean up the write buffer. */
1920        tcp_write_queue_purge(sk);
1921
1922        /* Cleans up our, hopefully empty, out_of_order_queue. */
1923        __skb_queue_purge(&tp->out_of_order_queue);
1924
1925#ifdef CONFIG_TCP_MD5SIG
1926        /* Clean up the MD5 key list, if any */
1927        if (tp->md5sig_info) {
1928                tcp_v4_clear_md5_list(sk);
1929                kfree(tp->md5sig_info);
1930                tp->md5sig_info = NULL;
1931        }
1932#endif
1933
1934#ifdef CONFIG_NET_DMA
1935        /* Cleans up our sk_async_wait_queue */
1936        __skb_queue_purge(&sk->sk_async_wait_queue);
1937#endif
1938
1939        /* Clean up the prequeue; it really should be empty by now. */
1940        __skb_queue_purge(&tp->ucopy.prequeue);
1941
1942        /* Clean up a referenced TCP bind bucket. */
1943        if (inet_csk(sk)->icsk_bind_hash)
1944                inet_put_port(sk);
1945
1946        /*
1947         * If a cached sendmsg page exists, toss it.
1948         */
1949        if (sk->sk_sndmsg_page) {
1950                __free_page(sk->sk_sndmsg_page);
1951                sk->sk_sndmsg_page = NULL;
1952        }
1953
1954        /* TCP Cookie Transactions */
1955        if (tp->cookie_values != NULL) {
1956                kref_put(&tp->cookie_values->kref,
1957                         tcp_cookie_values_release);
1958                tp->cookie_values = NULL;
1959        }
1960
1961        percpu_counter_dec(&tcp_sockets_allocated);
1962}
1963
1964EXPORT_SYMBOL(tcp_v4_destroy_sock);
1965
1966#ifdef CONFIG_PROC_FS
1967/* Proc filesystem TCP sock list dumping. */
1968
1969static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1970{
1971        return hlist_nulls_empty(head) ? NULL :
1972                list_entry(head->first, struct inet_timewait_sock, tw_node);
1973}
1974
1975static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1976{
1977        return !is_a_nulls(tw->tw_node.next) ?
1978                hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1979}
1980
1981static void *listening_get_next(struct seq_file *seq, void *cur)
1982{
1983        struct inet_connection_sock *icsk;
1984        struct hlist_nulls_node *node;
1985        struct sock *sk = cur;
1986        struct inet_listen_hashbucket *ilb;
1987        struct tcp_iter_state *st = seq->private;
1988        struct net *net = seq_file_net(seq);
1989
1990        if (!sk) {
1991                st->bucket = 0;
1992                ilb = &tcp_hashinfo.listening_hash[0];
1993                spin_lock_bh(&ilb->lock);
1994                sk = sk_nulls_head(&ilb->head);
1995                goto get_sk;
1996        }
1997        ilb = &tcp_hashinfo.listening_hash[st->bucket];
1998        ++st->num;
1999
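            /* If the walk stopped inside a listener's SYN table, continue with
             * the next open request there before moving on to other listeners.
             */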
2000        if (st->state == TCP_SEQ_STATE_OPENREQ) {
2001                struct request_sock *req = cur;
2002
2003                icsk = inet_csk(st->syn_wait_sk);
2004                req = req->dl_next;
2005                while (1) {
2006                        while (req) {
2007                                if (req->rsk_ops->family == st->family) {
2008                                        cur = req;
2009                                        goto out;
2010                                }
2011                                req = req->dl_next;
2012                        }
2013                        if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2014                                break;
2015get_req:
2016                        req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2017                }
2018                sk        = sk_next(st->syn_wait_sk);
2019                st->state = TCP_SEQ_STATE_LISTENING;
2020                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2021        } else {
2022                icsk = inet_csk(sk);
2023                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2024                if (reqsk_queue_len(&icsk->icsk_accept_queue))
2025                        goto start_req;
2026                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2027                sk = sk_next(sk);
2028        }
2029get_sk:
2030        sk_nulls_for_each_from(sk, node) {
2031                if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
2032                        cur = sk;
2033                        goto out;
2034                }
2035                icsk = inet_csk(sk);
2036                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2037                if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2038start_req:
2039                        st->uid         = sock_i_uid(sk);
2040                        st->syn_wait_sk = sk;
2041                        st->state       = TCP_SEQ_STATE_OPENREQ;
2042                        st->sbucket     = 0;
2043                        goto get_req;
2044                }
2045                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2046        }
2047        spin_unlock_bh(&ilb->lock);
2048        if (++st->bucket < INET_LHTABLE_SIZE) {
2049                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2050                spin_lock_bh(&ilb->lock);
2051                sk = sk_nulls_head(&ilb->head);
2052                goto get_sk;
2053        }
2054        cur = NULL;
2055out:
2056        return cur;
2057}
2058
2059static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2060{
2061        void *rc = listening_get_next(seq, NULL);
2062
2063        while (rc && *pos) {
2064                rc = listening_get_next(seq, rc);
2065                --*pos;
2066        }
2067        return rc;
2068}
2069
2070static inline int empty_bucket(struct tcp_iter_state *st)
2071{
2072        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2073                hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2074}
2075
2076static void *established_get_first(struct seq_file *seq)
2077{
2078        struct tcp_iter_state *st = seq->private;
2079        struct net *net = seq_file_net(seq);
2080        void *rc = NULL;
2081
2082        for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2083                struct sock *sk;
2084                struct hlist_nulls_node *node;
2085                struct inet_timewait_sock *tw;
2086                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2087
2088                /* Lockless fast path for the common case of empty buckets */
2089                if (empty_bucket(st))
2090                        continue;
2091
2092                spin_lock_bh(lock);
2093                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2094                        if (sk->sk_family != st->family ||
2095                            !net_eq(sock_net(sk), net)) {
2096                                continue;
2097                        }
2098                        rc = sk;
2099                        goto out;
2100                }
2101                st->state = TCP_SEQ_STATE_TIME_WAIT;
2102                inet_twsk_for_each(tw, node,
2103                                   &tcp_hashinfo.ehash[st->bucket].twchain) {
2104                        if (tw->tw_family != st->family ||
2105                            !net_eq(twsk_net(tw), net)) {
2106                                continue;
2107                        }
2108                        rc = tw;
2109                        goto out;
2110                }
2111                spin_unlock_bh(lock);
2112                st->state = TCP_SEQ_STATE_ESTABLISHED;
2113        }
2114out:
2115        return rc;
2116}
2117
2118static void *established_get_next(struct seq_file *seq, void *cur)
2119{
2120        struct sock *sk = cur;
2121        struct inet_timewait_sock *tw;
2122        struct hlist_nulls_node *node;
2123        struct tcp_iter_state *st = seq->private;
2124        struct net *net = seq_file_net(seq);
2125
2126        ++st->num;
2127
2128        if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2129                tw = cur;
2130                tw = tw_next(tw);
2131get_tw:
2132                while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2133                        tw = tw_next(tw);
2134                }
2135                if (tw) {
2136                        cur = tw;
2137                        goto out;
2138                }
2139                spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2140                st->state = TCP_SEQ_STATE_ESTABLISHED;
2141
2142                /* Look for the next non-empty bucket */
2143                while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2144                                empty_bucket(st))
2145                        ;
2146                if (st->bucket > tcp_hashinfo.ehash_mask)
2147                        return NULL;
2148
2149                spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2150                sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2151        } else
2152                sk = sk_nulls_next(sk);
2153
2154        sk_nulls_for_each_from(sk, node) {
2155                if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2156                        goto found;
2157        }
2158
2159        st->state = TCP_SEQ_STATE_TIME_WAIT;
2160        tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2161        goto get_tw;
2162found:
2163        cur = sk;
2164out:
2165        return cur;
2166}
2167
2168static void *established_get_idx(struct seq_file *seq, loff_t pos)
2169{
2170        void *rc = established_get_first(seq);
2171
2172        while (rc && pos) {
2173                rc = established_get_next(seq, rc);
2174                --pos;
2175        }
2176        return rc;
2177}
2178
2179static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2180{
2181        void *rc;
2182        struct tcp_iter_state *st = seq->private;
2183
2184        st->state = TCP_SEQ_STATE_LISTENING;
2185        rc        = listening_get_idx(seq, &pos);
2186
2187        if (!rc) {
2188                st->state = TCP_SEQ_STATE_ESTABLISHED;
2189                rc        = established_get_idx(seq, pos);
2190        }
2191
2192        return rc;
2193}
2194
2195static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2196{
2197        struct tcp_iter_state *st = seq->private;
2198        st->state = TCP_SEQ_STATE_LISTENING;
2199        st->num = 0;
2200        return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2201}
2202
2203static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2204{
2205        void *rc = NULL;
2206        struct tcp_iter_state *st;
2207
2208        if (v == SEQ_START_TOKEN) {
2209                rc = tcp_get_idx(seq, 0);
2210                goto out;
2211        }
2212        st = seq->private;
2213
2214        switch (st->state) {
2215        case TCP_SEQ_STATE_OPENREQ:
2216        case TCP_SEQ_STATE_LISTENING:
2217                rc = listening_get_next(seq, v);
2218                if (!rc) {
2219                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2220                        rc        = established_get_first(seq);
2221                }
2222                break;
2223        case TCP_SEQ_STATE_ESTABLISHED:
2224        case TCP_SEQ_STATE_TIME_WAIT:
2225                rc = established_get_next(seq, v);
2226                break;
2227        }
2228out:
2229        ++*pos;
2230        return rc;
2231}
2232
2233static void tcp_seq_stop(struct seq_file *seq, void *v)
2234{
2235        struct tcp_iter_state *st = seq->private;
2236
2237        switch (st->state) {
2238        case TCP_SEQ_STATE_OPENREQ:
2239                if (v) {
2240                        struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2241                        read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2242                }
2243        case TCP_SEQ_STATE_LISTENING:
2244                if (v != SEQ_START_TOKEN)
2245                        spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2246                break;
2247        case TCP_SEQ_STATE_TIME_WAIT:
2248        case TCP_SEQ_STATE_ESTABLISHED:
2249                if (v)
2250                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2251                break;
2252        }
2253}
2254
2255static int tcp_seq_open(struct inode *inode, struct file *file)
2256{
2257        struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2258        struct tcp_iter_state *s;
2259        int err;
2260
2261        err = seq_open_net(inode, file, &afinfo->seq_ops,
2262                          sizeof(struct tcp_iter_state));
2263        if (err < 0)
2264                return err;
2265
2266        s = ((struct seq_file *)file->private_data)->private;
2267        s->family               = afinfo->family;
2268        return 0;
2269}
2270
2271int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2272{
2273        int rc = 0;
2274        struct proc_dir_entry *p;
2275
2276        afinfo->seq_fops.open           = tcp_seq_open;
2277        afinfo->seq_fops.read           = seq_read;
2278        afinfo->seq_fops.llseek         = seq_lseek;
2279        afinfo->seq_fops.release        = seq_release_net;
2280
2281        afinfo->seq_ops.start           = tcp_seq_start;
2282        afinfo->seq_ops.next            = tcp_seq_next;
2283        afinfo->seq_ops.stop            = tcp_seq_stop;
2284
2285        p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2286                             &afinfo->seq_fops, afinfo);
2287        if (!p)
2288                rc = -ENOMEM;
2289        return rc;
2290}
2291
2292void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2293{
2294        proc_net_remove(net, afinfo->name);
2295}
2296
2297static void get_openreq4(struct sock *sk, struct request_sock *req,
2298                         struct seq_file *f, int i, int uid, int *len)
2299{
2300        const struct inet_request_sock *ireq = inet_rsk(req);
2301        int ttd = req->expires - jiffies;
2302
2303        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2304                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2305                i,
2306                ireq->loc_addr,
2307                ntohs(inet_sk(sk)->inet_sport),
2308                ireq->rmt_addr,
2309                ntohs(ireq->rmt_port),
2310                TCP_SYN_RECV,
2311                0, 0, /* could print option size, but that is af dependent. */
2312                1,    /* timers active (only the expire timer) */
2313                jiffies_to_clock_t(ttd),
2314                req->retrans,
2315                uid,
2316                0,  /* non-standard timer */
2317                0, /* open_requests have no inode */
2318                atomic_read(&sk->sk_refcnt),
2319                req,
2320                len);
2321}
2322
2323static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2324{
2325        int timer_active;
2326        unsigned long timer_expires;
2327        struct tcp_sock *tp = tcp_sk(sk);
2328        const struct inet_connection_sock *icsk = inet_csk(sk);
2329        struct inet_sock *inet = inet_sk(sk);
2330        __be32 dest = inet->inet_daddr;
2331        __be32 src = inet->inet_rcv_saddr;
2332        __u16 destp = ntohs(inet->inet_dport);
2333        __u16 srcp = ntohs(inet->inet_sport);
2334        int rx_queue;
2335
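            /* Timer code reported in /proc/net/tcp: 1 retransmit, 4 zero window
             * probe, 2 some other timer pending on sk_timer (typically keepalive),
             * 0 none.
             */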
2336        if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2337                timer_active    = 1;
2338                timer_expires   = icsk->icsk_timeout;
2339        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2340                timer_active    = 4;
2341                timer_expires   = icsk->icsk_timeout;
2342        } else if (timer_pending(&sk->sk_timer)) {
2343                timer_active    = 2;
2344                timer_expires   = sk->sk_timer.expires;
2345        } else {
2346                timer_active    = 0;
2347                timer_expires = jiffies;
2348        }
2349
2350        if (sk->sk_state == TCP_LISTEN)
2351                rx_queue = sk->sk_ack_backlog;
2352        else
2353                /*
2354                 * because we don't lock the socket, we might find a transient negative value
2355                 */
2356                rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2357
2358        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2359                        "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2360                i, src, srcp, dest, destp, sk->sk_state,
2361                tp->write_seq - tp->snd_una,
2362                rx_queue,
2363                timer_active,
2364                jiffies_to_clock_t(timer_expires - jiffies),
2365                icsk->icsk_retransmits,
2366                sock_i_uid(sk),
2367                icsk->icsk_probes_out,
2368                sock_i_ino(sk),
2369                atomic_read(&sk->sk_refcnt), sk,
2370                jiffies_to_clock_t(icsk->icsk_rto),
2371                jiffies_to_clock_t(icsk->icsk_ack.ato),
2372                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2373                tp->snd_cwnd,
2374                tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2375                len);
2376}
2377
2378static void get_timewait4_sock(struct inet_timewait_sock *tw,
2379                               struct seq_file *f, int i, int *len)
2380{
2381        __be32 dest, src;
2382        __u16 destp, srcp;
2383        int ttd = tw->tw_ttd - jiffies;
2384
2385        if (ttd < 0)
2386                ttd = 0;
2387
2388        dest  = tw->tw_daddr;
2389        src   = tw->tw_rcv_saddr;
2390        destp = ntohs(tw->tw_dport);
2391        srcp  = ntohs(tw->tw_sport);
2392
2393        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2394                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2395                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2396                3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2397                atomic_read(&tw->tw_refcnt), tw, len);
2398}
2399
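    /* Each row below is padded out to TMPSZ - 1 characters.  A purely
     * illustrative (made-up) established-socket line looks roughly like:
     *
     *   0: 0100007F:0016 0100007F:C350 01 00000000:00000000 02:000004A3 00000000  1000 0 12345 ...
     *
     * Addresses and ports are printed in hexadecimal; on a little-endian
     * machine 127.0.0.1 shows up as 0100007F.
     */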
2400#define TMPSZ 150
2401
2402static int tcp4_seq_show(struct seq_file *seq, void *v)
2403{
2404        struct tcp_iter_state *st;
2405        int len;
2406
2407        if (v == SEQ_START_TOKEN) {
2408                seq_printf(seq, "%-*s\n", TMPSZ - 1,
2409                           "  sl  local_address rem_address   st tx_queue "
2410                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2411                           "inode");
2412                goto out;
2413        }
2414        st = seq->private;
2415
2416        switch (st->state) {
2417        case TCP_SEQ_STATE_LISTENING:
2418        case TCP_SEQ_STATE_ESTABLISHED:
2419                get_tcp4_sock(v, seq, st->num, &len);
2420                break;
2421        case TCP_SEQ_STATE_OPENREQ:
2422                get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2423                break;
2424        case TCP_SEQ_STATE_TIME_WAIT:
2425                get_timewait4_sock(v, seq, st->num, &len);
2426                break;
2427        }
2428        seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2429out:
2430        return 0;
2431}
2432
2433static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2434        .name           = "tcp",
2435        .family         = AF_INET,
2436        .seq_fops       = {
2437                .owner          = THIS_MODULE,
2438        },
2439        .seq_ops        = {
2440                .show           = tcp4_seq_show,
2441        },
2442};
2443
2444static int __net_init tcp4_proc_init_net(struct net *net)
2445{
2446        return tcp_proc_register(net, &tcp4_seq_afinfo);
2447}
2448
2449static void __net_exit tcp4_proc_exit_net(struct net *net)
2450{
2451        tcp_proc_unregister(net, &tcp4_seq_afinfo);
2452}
2453
2454static struct pernet_operations tcp4_net_ops = {
2455        .init = tcp4_proc_init_net,
2456        .exit = tcp4_proc_exit_net,
2457};
2458
2459int __init tcp4_proc_init(void)
2460{
2461        return register_pernet_subsys(&tcp4_net_ops);
2462}
2463
2464void tcp4_proc_exit(void)
2465{
2466        unregister_pernet_subsys(&tcp4_net_ops);
2467}
2468#endif /* CONFIG_PROC_FS */
2469
2470struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2471{
2472        struct iphdr *iph = skb_gro_network_header(skb);
2473
2474        switch (skb->ip_summed) {
2475        case CHECKSUM_COMPLETE:
2476                if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2477                                  skb->csum)) {
2478                        skb->ip_summed = CHECKSUM_UNNECESSARY;
2479                        break;
2480                }
2481
2482                /* fall through */
2483        case CHECKSUM_NONE:
2484                NAPI_GRO_CB(skb)->flush = 1;
2485                return NULL;
2486        }
2487
2488        return tcp_gro_receive(head, skb);
2489}
2490EXPORT_SYMBOL(tcp4_gro_receive);
2491
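    /* Finish GRO for a merged TCPv4 packet: recompute the pseudo-header
     * checksum seed for the new total length and mark the skb as TCPv4 GSO
     * so it can be resegmented on output.
     */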
2492int tcp4_gro_complete(struct sk_buff *skb)
2493{
2494        struct iphdr *iph = ip_hdr(skb);
2495        struct tcphdr *th = tcp_hdr(skb);
2496
2497        th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2498                                  iph->saddr, iph->daddr, 0);
2499        skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2500
2501        return tcp_gro_complete(skb);
2502}
2503EXPORT_SYMBOL(tcp4_gro_complete);
2504
2505struct proto tcp_prot = {
2506        .name                   = "TCP",
2507        .owner                  = THIS_MODULE,
2508        .close                  = tcp_close,
2509        .connect                = tcp_v4_connect,
2510        .disconnect             = tcp_disconnect,
2511        .accept                 = inet_csk_accept,
2512        .ioctl                  = tcp_ioctl,
2513        .init                   = tcp_v4_init_sock,
2514        .destroy                = tcp_v4_destroy_sock,
2515        .shutdown               = tcp_shutdown,
2516        .setsockopt             = tcp_setsockopt,
2517        .getsockopt             = tcp_getsockopt,
2518        .recvmsg                = tcp_recvmsg,
2519        .backlog_rcv            = tcp_v4_do_rcv,
2520        .hash                   = inet_hash,
2521        .unhash                 = inet_unhash,
2522        .get_port               = inet_csk_get_port,
2523        .enter_memory_pressure  = tcp_enter_memory_pressure,
2524        .sockets_allocated      = &tcp_sockets_allocated,
2525        .orphan_count           = &tcp_orphan_count,
2526        .memory_allocated       = &tcp_memory_allocated,
2527        .memory_pressure        = &tcp_memory_pressure,
2528        .sysctl_mem             = sysctl_tcp_mem,
2529        .sysctl_wmem            = sysctl_tcp_wmem,
2530        .sysctl_rmem            = sysctl_tcp_rmem,
2531        .max_header             = MAX_TCP_HEADER,
2532        .obj_size               = sizeof(struct tcp_sock),
2533        .slab_flags             = SLAB_DESTROY_BY_RCU,
2534        .twsk_prot              = &tcp_timewait_sock_ops,
2535        .rsk_prot               = &tcp_request_sock_ops,
2536        .h.hashinfo             = &tcp_hashinfo,
2537#ifdef CONFIG_COMPAT
2538        .compat_setsockopt      = compat_tcp_setsockopt,
2539        .compat_getsockopt      = compat_tcp_getsockopt,
2540#endif
2541};
2542
2543
2544static int __net_init tcp_sk_init(struct net *net)
2545{
2546        return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2547                                    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2548}
2549
2550static void __net_exit tcp_sk_exit(struct net *net)
2551{
2552        inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2553}
2554
2555static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2556{
2557        inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2558}
2559
2560static struct pernet_operations __net_initdata tcp_sk_ops = {
2561       .init       = tcp_sk_init,
2562       .exit       = tcp_sk_exit,
2563       .exit_batch = tcp_sk_exit_batch,
2564};
2565
2566void __init tcp_v4_init(void)
2567{
2568        inet_hashinfo_init(&tcp_hashinfo);
2569        if (register_pernet_subsys(&tcp_sk_ops))
2570                panic("Failed to create the TCP control socket.\n");
2571}
2572
2573EXPORT_SYMBOL(ipv4_specific);
2574EXPORT_SYMBOL(tcp_hashinfo);
2575EXPORT_SYMBOL(tcp_prot);
2576EXPORT_SYMBOL(tcp_v4_conn_request);
2577EXPORT_SYMBOL(tcp_v4_connect);
2578EXPORT_SYMBOL(tcp_v4_do_rcv);
2579EXPORT_SYMBOL(tcp_v4_remember_stamp);
2580EXPORT_SYMBOL(tcp_v4_send_check);
2581EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2582
2583#ifdef CONFIG_PROC_FS
2584EXPORT_SYMBOL(tcp_proc_register);
2585EXPORT_SYMBOL(tcp_proc_unregister);
2586#endif
2587EXPORT_SYMBOL(sysctl_tcp_low_latency);
2588
2589