linux/net/ipv4/tcp_ipv4.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 *              IPv4 specific functions
   9 *
  10 *
  11 *              code split from:
  12 *              linux/ipv4/tcp.c
  13 *              linux/ipv4/tcp_input.c
  14 *              linux/ipv4/tcp_output.c
  15 *
  16 *              See tcp.c for author information
  17 *
  18 *      This program is free software; you can redistribute it and/or
  19 *      modify it under the terms of the GNU General Public License
  20 *      as published by the Free Software Foundation; either version
  21 *      2 of the License, or (at your option) any later version.
  22 */
  23
  24/*
  25 * Changes:
  26 *              David S. Miller :       New socket lookup architecture.
  27 *                                      This code is dedicated to John Dyson.
  28 *              David S. Miller :       Change semantics of established hash,
  29 *                                      half is devoted to TIME_WAIT sockets
  30 *                                      and the rest go in the other half.
  31 *              Andi Kleen :            Add support for syncookies and fixed
  32 *                                      some bugs: ip options weren't passed to
  33 *                                      the TCP layer, missed a check for an
  34 *                                      ACK bit.
  35 *              Andi Kleen :            Implemented fast path mtu discovery.
  36 *                                      Fixed many serious bugs in the
  37 *                                      request_sock handling and moved
  38 *                                      most of it into the af independent code.
  39 *                                      Added tail drop and some other bugfixes.
  40 *                                      Added new listen semantics.
  41 *              Mike McLagan    :       Routing by source
  42 *      Juan Jose Ciarlante:            ip_dynaddr bits
  43 *              Andi Kleen:             various fixes.
   44 *      Vitaly E. Lavrov        :       Transparent proxy revived after a
   45 *                                      year-long coma.
  46 *      Andi Kleen              :       Fix new listen.
  47 *      Andi Kleen              :       Fix accept error reporting.
   48 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
   49 *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
   50 *                                      to a single port at the same time.
  51 */
  52
  53
  54#include <linux/bottom_half.h>
  55#include <linux/types.h>
  56#include <linux/fcntl.h>
  57#include <linux/module.h>
  58#include <linux/random.h>
  59#include <linux/cache.h>
  60#include <linux/jhash.h>
  61#include <linux/init.h>
  62#include <linux/times.h>
  63#include <linux/slab.h>
  64
  65#include <net/net_namespace.h>
  66#include <net/icmp.h>
  67#include <net/inet_hashtables.h>
  68#include <net/tcp.h>
  69#include <net/transp_v6.h>
  70#include <net/ipv6.h>
  71#include <net/inet_common.h>
  72#include <net/timewait_sock.h>
  73#include <net/xfrm.h>
  74#include <net/netdma.h>
  75
  76#include <linux/inet.h>
  77#include <linux/ipv6.h>
  78#include <linux/stddef.h>
  79#include <linux/proc_fs.h>
  80#include <linux/seq_file.h>
  81
  82#include <linux/crypto.h>
  83#include <linux/scatterlist.h>
  84
  85int sysctl_tcp_tw_reuse __read_mostly;
  86int sysctl_tcp_low_latency __read_mostly;
  87EXPORT_SYMBOL(sysctl_tcp_low_latency);
  88
  89
  90#ifdef CONFIG_TCP_MD5SIG
  91static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
  92                                                   __be32 addr);
  93static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
  94                               __be32 daddr, __be32 saddr, struct tcphdr *th);
  95#else
  96static inline
  97struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
  98{
  99        return NULL;
 100}
 101#endif
 102
 103struct inet_hashinfo tcp_hashinfo;
 104EXPORT_SYMBOL(tcp_hashinfo);
 105
 106static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 107{
 108        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 109                                          ip_hdr(skb)->saddr,
 110                                          tcp_hdr(skb)->dest,
 111                                          tcp_hdr(skb)->source);
 112}
 113
 114int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 115{
 116        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 117        struct tcp_sock *tp = tcp_sk(sk);
 118
  119        /* With PAWS, it is safe from the viewpoint
  120           of data integrity. Even without PAWS it is safe provided sequence
  121           spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
  122
  123           Actually, the idea is close to VJ's: only the timestamp cache is
  124           held not per host but per port pair, and the TW bucket is used as
  125           the state holder.
  126
  127           If the TW bucket has already been destroyed, we fall back to VJ's
  128           scheme and use the initial timestamp retrieved from the peer table.
  129         */
 130        if (tcptw->tw_ts_recent_stamp &&
 131            (twp == NULL || (sysctl_tcp_tw_reuse &&
 132                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
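                /* Start the new connection's sequence space well past the
                 * old one (tw_snd_nxt plus a full 64K window), so stray
                 * segments from the old connection cannot be mistaken for
                 * new data.
                 */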
 133                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 134                if (tp->write_seq == 0)
 135                        tp->write_seq = 1;
 136                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 137                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 138                sock_hold(sktw);
 139                return 1;
 140        }
 141
 142        return 0;
 143}
 144EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 145
 146/* This will initiate an outgoing connection. */
 147int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 148{
 149        struct inet_sock *inet = inet_sk(sk);
 150        struct tcp_sock *tp = tcp_sk(sk);
 151        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 152        struct rtable *rt;
 153        __be32 daddr, nexthop;
 154        int tmp;
 155        int err;
 156
 157        if (addr_len < sizeof(struct sockaddr_in))
 158                return -EINVAL;
 159
 160        if (usin->sin_family != AF_INET)
 161                return -EAFNOSUPPORT;
 162
 163        nexthop = daddr = usin->sin_addr.s_addr;
 164        if (inet->opt && inet->opt->srr) {
 165                if (!daddr)
 166                        return -EINVAL;
 167                nexthop = inet->opt->faddr;
 168        }
 169
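        /* With an IP source route set on the socket, the route lookup below
         * targets the first hop (inet->opt->faddr), not the final destination.
         */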
 170        tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
 171                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 172                               IPPROTO_TCP,
 173                               inet->inet_sport, usin->sin_port, sk, 1);
 174        if (tmp < 0) {
 175                if (tmp == -ENETUNREACH)
 176                        IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 177                return tmp;
 178        }
 179
 180        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 181                ip_rt_put(rt);
 182                return -ENETUNREACH;
 183        }
 184
 185        if (!inet->opt || !inet->opt->srr)
 186                daddr = rt->rt_dst;
 187
 188        if (!inet->inet_saddr)
 189                inet->inet_saddr = rt->rt_src;
 190        inet->inet_rcv_saddr = inet->inet_saddr;
 191
 192        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 193                /* Reset inherited state */
 194                tp->rx_opt.ts_recent       = 0;
 195                tp->rx_opt.ts_recent_stamp = 0;
 196                tp->write_seq              = 0;
 197        }
 198
 199        if (tcp_death_row.sysctl_tw_recycle &&
 200            !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 201                struct inet_peer *peer = rt_get_peer(rt);
  202                /*
  203                 * VJ's idea. We save the last timestamp seen from
  204                 * the destination in the peer table when entering
  205                 * TIME-WAIT state, and initialize rx_opt.ts_recent
  206                 * from it when trying a new connection.
  207                 */
 208                if (peer) {
 209                        inet_peer_refcheck(peer);
 210                        if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
 211                                tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 212                                tp->rx_opt.ts_recent = peer->tcp_ts;
 213                        }
 214                }
 215        }
 216
 217        inet->inet_dport = usin->sin_port;
 218        inet->inet_daddr = daddr;
 219
 220        inet_csk(sk)->icsk_ext_hdr_len = 0;
 221        if (inet->opt)
 222                inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 223
 224        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 225
  226        /* Socket identity is still unknown (sport may be zero).
  227         * However we set state to SYN-SENT and, without releasing the
  228         * socket lock, select a source port, enter ourselves into the
  229         * hash tables and complete initialization afterwards.
  230         */
 231        tcp_set_state(sk, TCP_SYN_SENT);
 232        err = inet_hash_connect(&tcp_death_row, sk);
 233        if (err)
 234                goto failure;
 235
 236        err = ip_route_newports(&rt, IPPROTO_TCP,
 237                                inet->inet_sport, inet->inet_dport, sk);
 238        if (err)
 239                goto failure;
 240
 241        /* OK, now commit destination to socket.  */
 242        sk->sk_gso_type = SKB_GSO_TCPV4;
 243        sk_setup_caps(sk, &rt->dst);
 244
 245        if (!tp->write_seq)
 246                tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 247                                                           inet->inet_daddr,
 248                                                           inet->inet_sport,
 249                                                           usin->sin_port);
 250
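        /* Seed the per-socket IP identification counter from the initial
         * sequence number mixed with jiffies, so consecutive connections do
         * not start with identical IP IDs.
         */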
 251        inet->inet_id = tp->write_seq ^ jiffies;
 252
 253        err = tcp_connect(sk);
 254        rt = NULL;
 255        if (err)
 256                goto failure;
 257
 258        return 0;
 259
 260failure:
 261        /*
 262         * This unhashes the socket and releases the local port,
 263         * if necessary.
 264         */
 265        tcp_set_state(sk, TCP_CLOSE);
 266        ip_rt_put(rt);
 267        sk->sk_route_caps = 0;
 268        inet->inet_dport = 0;
 269        return err;
 270}
 271EXPORT_SYMBOL(tcp_v4_connect);
 272
 273/*
 274 * This routine does path mtu discovery as defined in RFC1191.
 275 */
 276static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
 277{
 278        struct dst_entry *dst;
 279        struct inet_sock *inet = inet_sk(sk);
 280
  281        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
  282         * sent out by Linux are always < 576 bytes, so they should go
  283         * through unfragmented).
  284         */
 285        if (sk->sk_state == TCP_LISTEN)
 286                return;
 287
  288        /* We don't check in the dst entry whether pmtu discovery is
  289         * forbidden on this route. We just assume that no packet-too-big
  290         * packets are sent back when pmtu discovery is not active.
  291         * There is a small race when the user changes this flag in the
  292         * route, but I think that's acceptable.
  293         */
 294        if ((dst = __sk_dst_check(sk, 0)) == NULL)
 295                return;
 296
 297        dst->ops->update_pmtu(dst, mtu);
 298
  299        /* Something is about to go wrong... Remember the soft error
  300         * in case this connection is not able to recover.
  301         */
 302        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 303                sk->sk_err_soft = EMSGSIZE;
 304
 305        mtu = dst_mtu(dst);
 306
 307        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 308            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 309                tcp_sync_mss(sk, mtu);
 310
 311                /* Resend the TCP packet because it's
 312                 * clear that the old packet has been
 313                 * dropped. This is the new "fast" path mtu
 314                 * discovery.
 315                 */
 316                tcp_simple_retransmit(sk);
 317        } /* else let the usual retransmit timer handle it */
 318}
 319
 320/*
 321 * This routine is called by the ICMP module when it gets some
 322 * sort of error condition.  If err < 0 then the socket should
 323 * be closed and the error returned to the user.  If err > 0
 324 * it's just the icmp type << 8 | icmp code.  After adjustment
 325 * header points to the first 8 bytes of the tcp header.  We need
 326 * to find the appropriate port.
 327 *
 328 * The locking strategy used here is very "optimistic". When
 329 * someone else accesses the socket the ICMP is just dropped
 330 * and for some paths there is no check at all.
  331 * A more general error queue for holding errors until they can be
  332 * handled would probably be better.
 333 *
 334 */
 335
 336void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 337{
 338        struct iphdr *iph = (struct iphdr *)icmp_skb->data;
 339        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 340        struct inet_connection_sock *icsk;
 341        struct tcp_sock *tp;
 342        struct inet_sock *inet;
 343        const int type = icmp_hdr(icmp_skb)->type;
 344        const int code = icmp_hdr(icmp_skb)->code;
 345        struct sock *sk;
 346        struct sk_buff *skb;
 347        __u32 seq;
 348        __u32 remaining;
 349        int err;
 350        struct net *net = dev_net(icmp_skb->dev);
 351
 352        if (icmp_skb->len < (iph->ihl << 2) + 8) {
 353                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 354                return;
 355        }
 356
 357        sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 358                        iph->saddr, th->source, inet_iif(icmp_skb));
 359        if (!sk) {
 360                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 361                return;
 362        }
 363        if (sk->sk_state == TCP_TIME_WAIT) {
 364                inet_twsk_put(inet_twsk(sk));
 365                return;
 366        }
 367
 368        bh_lock_sock(sk);
 369        /* If too many ICMPs get dropped on busy
 370         * servers this needs to be solved differently.
 371         */
 372        if (sock_owned_by_user(sk))
 373                NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 374
 375        if (sk->sk_state == TCP_CLOSE)
 376                goto out;
 377
 378        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 379                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 380                goto out;
 381        }
 382
 383        icsk = inet_csk(sk);
 384        tp = tcp_sk(sk);
 385        seq = ntohl(th->seq);
 386        if (sk->sk_state != TCP_LISTEN &&
 387            !between(seq, tp->snd_una, tp->snd_nxt)) {
 388                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 389                goto out;
 390        }
 391
 392        switch (type) {
 393        case ICMP_SOURCE_QUENCH:
 394                /* Just silently ignore these. */
 395                goto out;
 396        case ICMP_PARAMETERPROB:
 397                err = EPROTO;
 398                break;
 399        case ICMP_DEST_UNREACH:
 400                if (code > NR_ICMP_UNREACH)
 401                        goto out;
 402
 403                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 404                        if (!sock_owned_by_user(sk))
 405                                do_pmtu_discovery(sk, iph, info);
 406                        goto out;
 407                }
 408
 409                err = icmp_err_convert[code].errno;
 410                /* check if icmp_skb allows revert of backoff
 411                 * (see draft-zimmermann-tcp-lcd) */
 412                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 413                        break;
 414                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 415                    !icsk->icsk_backoff)
 416                        break;
 417
 418                if (sock_owned_by_user(sk))
 419                        break;
 420
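                /* A net/host unreachable arrived for the head-of-line
                 * segment while we are in exponential backoff: undo one
                 * backoff step, rebuild the RTO, and either re-arm the
                 * retransmit timer with the time remaining or retransmit
                 * right away.
                 */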
 421                icsk->icsk_backoff--;
 422                inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
 423                                         icsk->icsk_backoff;
 424                tcp_bound_rto(sk);
 425
 426                skb = tcp_write_queue_head(sk);
 427                BUG_ON(!skb);
 428
 429                remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 430                                tcp_time_stamp - TCP_SKB_CB(skb)->when);
 431
 432                if (remaining) {
 433                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 434                                                  remaining, TCP_RTO_MAX);
 435                } else {
 436                        /* RTO revert clocked out retransmission.
 437                         * Will retransmit now */
 438                        tcp_retransmit_timer(sk);
 439                }
 440
 441                break;
 442        case ICMP_TIME_EXCEEDED:
 443                err = EHOSTUNREACH;
 444                break;
 445        default:
 446                goto out;
 447        }
 448
 449        switch (sk->sk_state) {
 450                struct request_sock *req, **prev;
 451        case TCP_LISTEN:
 452                if (sock_owned_by_user(sk))
 453                        goto out;
 454
 455                req = inet_csk_search_req(sk, &prev, th->dest,
 456                                          iph->daddr, iph->saddr);
 457                if (!req)
 458                        goto out;
 459
 460                /* ICMPs are not backlogged, hence we cannot get
 461                   an established socket here.
 462                 */
 463                WARN_ON(req->sk);
 464
 465                if (seq != tcp_rsk(req)->snt_isn) {
 466                        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 467                        goto out;
 468                }
 469
 470                /*
 471                 * Still in SYN_RECV, just remove it silently.
 472                 * There is no good way to pass the error to the newly
 473                 * created socket, and POSIX does not want network
 474                 * errors returned from accept().
 475                 */
 476                inet_csk_reqsk_queue_drop(sk, req, prev);
 477                goto out;
 478
 479        case TCP_SYN_SENT:
  480        case TCP_SYN_RECV:  /* Normally cannot happen, but it can,
  481                               e.g., if SYNs crossed.
  482                             */
 483                if (!sock_owned_by_user(sk)) {
 484                        sk->sk_err = err;
 485
 486                        sk->sk_error_report(sk);
 487
 488                        tcp_done(sk);
 489                } else {
 490                        sk->sk_err_soft = err;
 491                }
 492                goto out;
 493        }
 494
  495        /* If we've already connected we will keep trying
  496         * until we time out, or the user gives up.
  497         *
  498         * rfc1122 4.2.3.9 allows considering as hard errors
  499         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
  500         * but it is obsoleted by pmtu discovery).
  501         *
  502         * Note that in the modern internet, where routing is unreliable
  503         * and broken firewalls sit in every dark corner sending random
  504         * errors ordered by their masters, even these two messages finally
  505         * lose their original sense (even Linux sends invalid PORT_UNREACHs).
  506         *
  507         * Now we are in compliance with RFCs.
  508         *                                                      --ANK (980905)
  509         */
 510
 511        inet = inet_sk(sk);
 512        if (!sock_owned_by_user(sk) && inet->recverr) {
 513                sk->sk_err = err;
 514                sk->sk_error_report(sk);
 515        } else  { /* Only an error on timeout */
 516                sk->sk_err_soft = err;
 517        }
 518
 519out:
 520        bh_unlock_sock(sk);
 521        sock_put(sk);
 522}
 523
 524static void __tcp_v4_send_check(struct sk_buff *skb,
 525                                __be32 saddr, __be32 daddr)
 526{
 527        struct tcphdr *th = tcp_hdr(skb);
 528
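        /* With CHECKSUM_PARTIAL the device finishes the checksum: seed
         * th->check with only the pseudo-header sum and record where the
         * final checksum must be written.  Otherwise compute the complete
         * checksum in software over the header and payload.
         */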
 529        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 530                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 531                skb->csum_start = skb_transport_header(skb) - skb->head;
 532                skb->csum_offset = offsetof(struct tcphdr, check);
 533        } else {
 534                th->check = tcp_v4_check(skb->len, saddr, daddr,
 535                                         csum_partial(th,
 536                                                      th->doff << 2,
 537                                                      skb->csum));
 538        }
 539}
 540
 541/* This routine computes an IPv4 TCP checksum. */
 542void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 543{
 544        struct inet_sock *inet = inet_sk(sk);
 545
 546        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 547}
 548EXPORT_SYMBOL(tcp_v4_send_check);
 549
 550int tcp_v4_gso_send_check(struct sk_buff *skb)
 551{
 552        const struct iphdr *iph;
 553        struct tcphdr *th;
 554
 555        if (!pskb_may_pull(skb, sizeof(*th)))
 556                return -EINVAL;
 557
 558        iph = ip_hdr(skb);
 559        th = tcp_hdr(skb);
 560
 561        th->check = 0;
 562        skb->ip_summed = CHECKSUM_PARTIAL;
 563        __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
 564        return 0;
 565}
 566
  567/*
  568 *      This routine will send an RST to the other tcp.
  569 *
  570 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
  571 *                    for the reset?
  572 *      Answer: if a packet caused an RST, it is not for a socket
  573 *              existing in our system; if it is matched to a socket,
  574 *              it is just a duplicate segment or a bug in the other side's
  575 *              TCP. So we build the reply based only on the parameters
  576 *              that arrived with the segment.
  577 *      Exception: precedence violation. We do not implement it in any case.
  578 */
 579
 580static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 581{
 582        struct tcphdr *th = tcp_hdr(skb);
 583        struct {
 584                struct tcphdr th;
 585#ifdef CONFIG_TCP_MD5SIG
 586                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 587#endif
 588        } rep;
 589        struct ip_reply_arg arg;
 590#ifdef CONFIG_TCP_MD5SIG
 591        struct tcp_md5sig_key *key;
 592#endif
 593        struct net *net;
 594
 595        /* Never send a reset in response to a reset. */
 596        if (th->rst)
 597                return;
 598
 599        if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 600                return;
 601
 602        /* Swap the send and the receive. */
 603        memset(&rep, 0, sizeof(rep));
 604        rep.th.dest   = th->source;
 605        rep.th.source = th->dest;
 606        rep.th.doff   = sizeof(struct tcphdr) / 4;
 607        rep.th.rst    = 1;
 608
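        /* RFC 793 reset generation: if the offending segment carried an
         * ACK, the RST takes its sequence number from that acknowledgment;
         * otherwise the RST carries an ACK covering everything the segment
         * occupied in sequence space (SYN and FIN each count as one).
         */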
 609        if (th->ack) {
 610                rep.th.seq = th->ack_seq;
 611        } else {
 612                rep.th.ack = 1;
 613                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 614                                       skb->len - (th->doff << 2));
 615        }
 616
 617        memset(&arg, 0, sizeof(arg));
 618        arg.iov[0].iov_base = (unsigned char *)&rep;
 619        arg.iov[0].iov_len  = sizeof(rep.th);
 620
 621#ifdef CONFIG_TCP_MD5SIG
 622        key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
 623        if (key) {
 624                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 625                                   (TCPOPT_NOP << 16) |
 626                                   (TCPOPT_MD5SIG << 8) |
 627                                   TCPOLEN_MD5SIG);
 628                /* Update length and the length the header thinks exists */
 629                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 630                rep.th.doff = arg.iov[0].iov_len / 4;
 631
 632                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 633                                     key, ip_hdr(skb)->saddr,
 634                                     ip_hdr(skb)->daddr, &rep.th);
 635        }
 636#endif
 637        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 638                                      ip_hdr(skb)->saddr, /* XXX */
 639                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 640        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 641        arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 642
 643        net = dev_net(skb_dst(skb)->dev);
 644        ip_send_reply(net->ipv4.tcp_sock, skb,
 645                      &arg, arg.iov[0].iov_len);
 646
 647        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 648        TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 649}
 650
  651/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
  652   outside of socket context, is certainly ugly. What can I do?
  653 */
 654
 655static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 656                            u32 win, u32 ts, int oif,
 657                            struct tcp_md5sig_key *key,
 658                            int reply_flags)
 659{
 660        struct tcphdr *th = tcp_hdr(skb);
 661        struct {
 662                struct tcphdr th;
 663                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 664#ifdef CONFIG_TCP_MD5SIG
 665                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 666#endif
 667                        ];
 668        } rep;
 669        struct ip_reply_arg arg;
 670        struct net *net = dev_net(skb_dst(skb)->dev);
 671
 672        memset(&rep.th, 0, sizeof(struct tcphdr));
 673        memset(&arg, 0, sizeof(arg));
 674
 675        arg.iov[0].iov_base = (unsigned char *)&rep;
 676        arg.iov[0].iov_len  = sizeof(rep.th);
 677        if (ts) {
 678                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 679                                   (TCPOPT_TIMESTAMP << 8) |
 680                                   TCPOLEN_TIMESTAMP);
 681                rep.opt[1] = htonl(tcp_time_stamp);
 682                rep.opt[2] = htonl(ts);
 683                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 684        }
 685
 686        /* Swap the send and the receive. */
 687        rep.th.dest    = th->source;
 688        rep.th.source  = th->dest;
 689        rep.th.doff    = arg.iov[0].iov_len / 4;
 690        rep.th.seq     = htonl(seq);
 691        rep.th.ack_seq = htonl(ack);
 692        rep.th.ack     = 1;
 693        rep.th.window  = htons(win);
 694
 695#ifdef CONFIG_TCP_MD5SIG
 696        if (key) {
 697                int offset = (ts) ? 3 : 0;
 698
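                /* The MD5 option is placed right after the timestamp
                 * option, which occupies rep.opt[0..2] when present.
                 */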
 699                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 700                                          (TCPOPT_NOP << 16) |
 701                                          (TCPOPT_MD5SIG << 8) |
 702                                          TCPOLEN_MD5SIG);
 703                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 704                rep.th.doff = arg.iov[0].iov_len/4;
 705
 706                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 707                                    key, ip_hdr(skb)->saddr,
 708                                    ip_hdr(skb)->daddr, &rep.th);
 709        }
 710#endif
 711        arg.flags = reply_flags;
 712        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 713                                      ip_hdr(skb)->saddr, /* XXX */
 714                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 715        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 716        if (oif)
 717                arg.bound_dev_if = oif;
 718
 719        ip_send_reply(net->ipv4.tcp_sock, skb,
 720                      &arg, arg.iov[0].iov_len);
 721
 722        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 723}
 724
 725static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 726{
 727        struct inet_timewait_sock *tw = inet_twsk(sk);
 728        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 729
 730        tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 731                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 732                        tcptw->tw_ts_recent,
 733                        tw->tw_bound_dev_if,
 734                        tcp_twsk_md5_key(tcptw),
 735                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
 736                        );
 737
 738        inet_twsk_put(tw);
 739}
 740
 741static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 742                                  struct request_sock *req)
 743{
 744        tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 745                        tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 746                        req->ts_recent,
 747                        0,
 748                        tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
 749                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
 750}
 751
 752/*
 753 *      Send a SYN-ACK after having received a SYN.
 754 *      This still operates on a request_sock only, not on a big
 755 *      socket.
 756 */
 757static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 758                              struct request_sock *req,
 759                              struct request_values *rvp)
 760{
 761        const struct inet_request_sock *ireq = inet_rsk(req);
 762        int err = -1;
 763        struct sk_buff * skb;
 764
 765        /* First, grab a route. */
 766        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 767                return -1;
 768
 769        skb = tcp_make_synack(sk, dst, req, rvp);
 770
 771        if (skb) {
 772                __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 773
 774                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 775                                            ireq->rmt_addr,
 776                                            ireq->opt);
 777                err = net_xmit_eval(err);
 778        }
 779
 780        dst_release(dst);
 781        return err;
 782}
 783
 784static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 785                              struct request_values *rvp)
 786{
 787        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 788        return tcp_v4_send_synack(sk, NULL, req, rvp);
 789}
 790
 791/*
 792 *      IPv4 request_sock destructor.
 793 */
 794static void tcp_v4_reqsk_destructor(struct request_sock *req)
 795{
 796        kfree(inet_rsk(req)->opt);
 797}
 798
 799static void syn_flood_warning(const struct sk_buff *skb)
 800{
 801        const char *msg;
 802
 803#ifdef CONFIG_SYN_COOKIES
 804        if (sysctl_tcp_syncookies)
 805                msg = "Sending cookies";
 806        else
 807#endif
 808                msg = "Dropping request";
 809
 810        pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
 811                                ntohs(tcp_hdr(skb)->dest), msg);
 812}
 813
 814/*
 815 * Save and compile IPv4 options into the request_sock if needed.
 816 */
 817static struct ip_options *tcp_v4_save_options(struct sock *sk,
 818                                              struct sk_buff *skb)
 819{
 820        struct ip_options *opt = &(IPCB(skb)->opt);
 821        struct ip_options *dopt = NULL;
 822
 823        if (opt && opt->optlen) {
 824                int opt_size = optlength(opt);
 825                dopt = kmalloc(opt_size, GFP_ATOMIC);
 826                if (dopt) {
 827                        if (ip_options_echo(dopt, skb)) {
 828                                kfree(dopt);
 829                                dopt = NULL;
 830                        }
 831                }
 832        }
 833        return dopt;
 834}
 835
 836#ifdef CONFIG_TCP_MD5SIG
 837/*
 838 * RFC2385 MD5 checksumming requires a mapping of
 839 * IP address->MD5 Key.
 840 * We need to maintain these in the sk structure.
 841 */
 842
 843/* Find the Key structure for an address.  */
 844static struct tcp_md5sig_key *
 845                        tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 846{
 847        struct tcp_sock *tp = tcp_sk(sk);
 848        int i;
 849
 850        if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 851                return NULL;
 852        for (i = 0; i < tp->md5sig_info->entries4; i++) {
 853                if (tp->md5sig_info->keys4[i].addr == addr)
 854                        return &tp->md5sig_info->keys4[i].base;
 855        }
 856        return NULL;
 857}
 858
 859struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 860                                         struct sock *addr_sk)
 861{
 862        return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
 863}
 864EXPORT_SYMBOL(tcp_v4_md5_lookup);
 865
 866static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 867                                                      struct request_sock *req)
 868{
 869        return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 870}
 871
 872/* This can be called on a newly created socket, from other files */
 873int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 874                      u8 *newkey, u8 newkeylen)
 875{
 876        /* Add Key to the list */
 877        struct tcp_md5sig_key *key;
 878        struct tcp_sock *tp = tcp_sk(sk);
 879        struct tcp4_md5sig_key *keys;
 880
 881        key = tcp_v4_md5_do_lookup(sk, addr);
 882        if (key) {
 883                /* Pre-existing entry - just update that one. */
 884                kfree(key->key);
 885                key->key = newkey;
 886                key->keylen = newkeylen;
 887        } else {
 888                struct tcp_md5sig_info *md5sig;
 889
 890                if (!tp->md5sig_info) {
 891                        tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 892                                                  GFP_ATOMIC);
 893                        if (!tp->md5sig_info) {
 894                                kfree(newkey);
 895                                return -ENOMEM;
 896                        }
 897                        sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 898                }
 899                if (tcp_alloc_md5sig_pool(sk) == NULL) {
 900                        kfree(newkey);
 901                        return -ENOMEM;
 902                }
 903                md5sig = tp->md5sig_info;
 904
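                /* The key array grows one slot at a time: when it is full,
                 * allocate a larger array, copy the existing entries across
                 * and free the old array before appending the new key.
                 */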
 905                if (md5sig->alloced4 == md5sig->entries4) {
 906                        keys = kmalloc((sizeof(*keys) *
 907                                        (md5sig->entries4 + 1)), GFP_ATOMIC);
 908                        if (!keys) {
 909                                kfree(newkey);
 910                                tcp_free_md5sig_pool();
 911                                return -ENOMEM;
 912                        }
 913
 914                        if (md5sig->entries4)
 915                                memcpy(keys, md5sig->keys4,
 916                                       sizeof(*keys) * md5sig->entries4);
 917
 918                        /* Free old key list, and reference new one */
 919                        kfree(md5sig->keys4);
 920                        md5sig->keys4 = keys;
 921                        md5sig->alloced4++;
 922                }
 923                md5sig->entries4++;
 924                md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 925                md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 926                md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 927        }
 928        return 0;
 929}
 930EXPORT_SYMBOL(tcp_v4_md5_do_add);
 931
 932static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 933                               u8 *newkey, u8 newkeylen)
 934{
 935        return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
 936                                 newkey, newkeylen);
 937}
 938
 939int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 940{
 941        struct tcp_sock *tp = tcp_sk(sk);
 942        int i;
 943
 944        for (i = 0; i < tp->md5sig_info->entries4; i++) {
 945                if (tp->md5sig_info->keys4[i].addr == addr) {
 946                        /* Free the key */
 947                        kfree(tp->md5sig_info->keys4[i].base.key);
 948                        tp->md5sig_info->entries4--;
 949
 950                        if (tp->md5sig_info->entries4 == 0) {
 951                                kfree(tp->md5sig_info->keys4);
 952                                tp->md5sig_info->keys4 = NULL;
 953                                tp->md5sig_info->alloced4 = 0;
 954                        } else if (tp->md5sig_info->entries4 != i) {
  955                                /* Shift the remaining keys down to fill the hole */
 956                                memmove(&tp->md5sig_info->keys4[i],
 957                                        &tp->md5sig_info->keys4[i+1],
 958                                        (tp->md5sig_info->entries4 - i) *
 959                                         sizeof(struct tcp4_md5sig_key));
 960                        }
 961                        tcp_free_md5sig_pool();
 962                        return 0;
 963                }
 964        }
 965        return -ENOENT;
 966}
 967EXPORT_SYMBOL(tcp_v4_md5_do_del);
 968
 969static void tcp_v4_clear_md5_list(struct sock *sk)
 970{
 971        struct tcp_sock *tp = tcp_sk(sk);
 972
  973        /* Free each key, then the key array itself,
  974         * and then drop our reference on the shared
  975         * MD5 crypto pool.
  976         */
 977        if (tp->md5sig_info->entries4) {
 978                int i;
 979                for (i = 0; i < tp->md5sig_info->entries4; i++)
 980                        kfree(tp->md5sig_info->keys4[i].base.key);
 981                tp->md5sig_info->entries4 = 0;
 982                tcp_free_md5sig_pool();
 983        }
 984        if (tp->md5sig_info->keys4) {
 985                kfree(tp->md5sig_info->keys4);
 986                tp->md5sig_info->keys4 = NULL;
 987                tp->md5sig_info->alloced4  = 0;
 988        }
 989}
 990
 991static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 992                                 int optlen)
 993{
 994        struct tcp_md5sig cmd;
 995        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
 996        u8 *newkey;
 997
 998        if (optlen < sizeof(cmd))
 999                return -EINVAL;
1000
1001        if (copy_from_user(&cmd, optval, sizeof(cmd)))
1002                return -EFAULT;
1003
1004        if (sin->sin_family != AF_INET)
1005                return -EINVAL;
1006
1007        if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1008                if (!tcp_sk(sk)->md5sig_info)
1009                        return -ENOENT;
1010                return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1011        }
1012
1013        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1014                return -EINVAL;
1015
1016        if (!tcp_sk(sk)->md5sig_info) {
1017                struct tcp_sock *tp = tcp_sk(sk);
1018                struct tcp_md5sig_info *p;
1019
1020                p = kzalloc(sizeof(*p), sk->sk_allocation);
 1021                if (!p)
 1022                        return -ENOMEM;
1023
1024                tp->md5sig_info = p;
1025                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1026        }
1027
1028        newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1029        if (!newkey)
1030                return -ENOMEM;
1031        return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1032                                 newkey, cmd.tcpm_keylen);
1033}
1034
1035static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1036                                        __be32 daddr, __be32 saddr, int nbytes)
1037{
1038        struct tcp4_pseudohdr *bp;
1039        struct scatterlist sg;
1040
1041        bp = &hp->md5_blk.ip4;
1042
1043        /*
1044         * 1. the TCP pseudo-header (in the order: source IP address,
1045         * destination IP address, zero-padded protocol number, and
1046         * segment length)
1047         */
1048        bp->saddr = saddr;
1049        bp->daddr = daddr;
1050        bp->pad = 0;
1051        bp->protocol = IPPROTO_TCP;
1052        bp->len = cpu_to_be16(nbytes);
1053
1054        sg_init_one(&sg, bp, sizeof(*bp));
1055        return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1056}
1057
1058static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1059                               __be32 daddr, __be32 saddr, struct tcphdr *th)
1060{
1061        struct tcp_md5sig_pool *hp;
1062        struct hash_desc *desc;
1063
1064        hp = tcp_get_md5sig_pool();
1065        if (!hp)
1066                goto clear_hash_noput;
1067        desc = &hp->md5_desc;
1068
1069        if (crypto_hash_init(desc))
1070                goto clear_hash;
1071        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1072                goto clear_hash;
1073        if (tcp_md5_hash_header(hp, th))
1074                goto clear_hash;
1075        if (tcp_md5_hash_key(hp, key))
1076                goto clear_hash;
1077        if (crypto_hash_final(desc, md5_hash))
1078                goto clear_hash;
1079
1080        tcp_put_md5sig_pool();
1081        return 0;
1082
1083clear_hash:
1084        tcp_put_md5sig_pool();
1085clear_hash_noput:
1086        memset(md5_hash, 0, 16);
1087        return 1;
1088}
1089
1090int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1091                        struct sock *sk, struct request_sock *req,
1092                        struct sk_buff *skb)
1093{
1094        struct tcp_md5sig_pool *hp;
1095        struct hash_desc *desc;
1096        struct tcphdr *th = tcp_hdr(skb);
1097        __be32 saddr, daddr;
1098
1099        if (sk) {
1100                saddr = inet_sk(sk)->inet_saddr;
1101                daddr = inet_sk(sk)->inet_daddr;
1102        } else if (req) {
1103                saddr = inet_rsk(req)->loc_addr;
1104                daddr = inet_rsk(req)->rmt_addr;
1105        } else {
1106                const struct iphdr *iph = ip_hdr(skb);
1107                saddr = iph->saddr;
1108                daddr = iph->daddr;
1109        }
1110
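        /* Per RFC 2385 the signature covers, in order: the TCP
         * pseudo-header, the fixed TCP header with its checksum field
         * zeroed (options are not covered), the segment payload, and
         * finally the connection key.
         */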
1111        hp = tcp_get_md5sig_pool();
1112        if (!hp)
1113                goto clear_hash_noput;
1114        desc = &hp->md5_desc;
1115
1116        if (crypto_hash_init(desc))
1117                goto clear_hash;
1118
1119        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1120                goto clear_hash;
1121        if (tcp_md5_hash_header(hp, th))
1122                goto clear_hash;
1123        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1124                goto clear_hash;
1125        if (tcp_md5_hash_key(hp, key))
1126                goto clear_hash;
1127        if (crypto_hash_final(desc, md5_hash))
1128                goto clear_hash;
1129
1130        tcp_put_md5sig_pool();
1131        return 0;
1132
1133clear_hash:
1134        tcp_put_md5sig_pool();
1135clear_hash_noput:
1136        memset(md5_hash, 0, 16);
1137        return 1;
1138}
1139EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1140
1141static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1142{
 1143        /*
 1144         * This gets called for each TCP segment that arrives,
 1145         * so we want to be efficient.
 1146         * We have 3 drop cases:
 1147         * o No MD5 hash and one expected.
 1148         * o MD5 hash and we're not expecting one.
 1149         * o MD5 hash and it's wrong.
 1150         */
1151        __u8 *hash_location = NULL;
1152        struct tcp_md5sig_key *hash_expected;
1153        const struct iphdr *iph = ip_hdr(skb);
1154        struct tcphdr *th = tcp_hdr(skb);
1155        int genhash;
1156        unsigned char newhash[16];
1157
1158        hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1159        hash_location = tcp_parse_md5sig_option(th);
1160
1161        /* We've parsed the options - do we have a hash? */
1162        if (!hash_expected && !hash_location)
1163                return 0;
1164
1165        if (hash_expected && !hash_location) {
1166                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1167                return 1;
1168        }
1169
1170        if (!hash_expected && hash_location) {
1171                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1172                return 1;
1173        }
1174
 1175        /* Okay, so we have both hash_expected and hash_location -
 1176         * now we need to calculate the hash and compare.
 1177         */
1178        genhash = tcp_v4_md5_hash_skb(newhash,
1179                                      hash_expected,
1180                                      NULL, NULL, skb);
1181
1182        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1183                if (net_ratelimit()) {
1184                        printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1185                               &iph->saddr, ntohs(th->source),
1186                               &iph->daddr, ntohs(th->dest),
1187                               genhash ? " tcp_v4_calc_md5_hash failed" : "");
1188                }
1189                return 1;
1190        }
1191        return 0;
1192}
1193
1194#endif
1195
1196struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1197        .family         =       PF_INET,
1198        .obj_size       =       sizeof(struct tcp_request_sock),
1199        .rtx_syn_ack    =       tcp_v4_rtx_synack,
1200        .send_ack       =       tcp_v4_reqsk_send_ack,
1201        .destructor     =       tcp_v4_reqsk_destructor,
1202        .send_reset     =       tcp_v4_send_reset,
1203        .syn_ack_timeout =      tcp_syn_ack_timeout,
1204};
1205
1206#ifdef CONFIG_TCP_MD5SIG
1207static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1208        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1209        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1210};
1211#endif
1212
1213static struct timewait_sock_ops tcp_timewait_sock_ops = {
1214        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1215        .twsk_unique    = tcp_twsk_unique,
1216        .twsk_destructor= tcp_twsk_destructor,
1217};
1218
1219int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1220{
1221        struct tcp_extend_values tmp_ext;
1222        struct tcp_options_received tmp_opt;
1223        u8 *hash_location;
1224        struct request_sock *req;
1225        struct inet_request_sock *ireq;
1226        struct tcp_sock *tp = tcp_sk(sk);
1227        struct dst_entry *dst = NULL;
1228        __be32 saddr = ip_hdr(skb)->saddr;
1229        __be32 daddr = ip_hdr(skb)->daddr;
1230        __u32 isn = TCP_SKB_CB(skb)->when;
1231#ifdef CONFIG_SYN_COOKIES
1232        int want_cookie = 0;
1233#else
1234#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1235#endif
1236
 1237        /* Never answer SYNs sent to broadcast or multicast */
1238        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1239                goto drop;
1240
 1241        /* TW buckets are converted to open requests without
 1242         * limitation: they conserve resources and the peer is
 1243         * evidently a real one.
 1244         */
1245        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1246                if (net_ratelimit())
1247                        syn_flood_warning(skb);
1248#ifdef CONFIG_SYN_COOKIES
1249                if (sysctl_tcp_syncookies) {
1250                        want_cookie = 1;
1251                } else
1252#endif
1253                goto drop;
1254        }
1255
 1256        /* The accept backlog is full. If we have already queued enough
 1257         * warm entries in the syn queue, drop the request. That is better
 1258         * than clogging the syn queue with openreqs whose timeouts grow
 1259         * exponentially.
 1260         */
1261        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1262                goto drop;
1263
1264        req = inet_reqsk_alloc(&tcp_request_sock_ops);
1265        if (!req)
1266                goto drop;
1267
1268#ifdef CONFIG_TCP_MD5SIG
1269        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1270#endif
1271
1272        tcp_clear_options(&tmp_opt);
1273        tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1274        tmp_opt.user_mss  = tp->rx_opt.user_mss;
1275        tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1276
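        /* TCP Cookie Transactions (experimental): if the SYN carried a
         * cookie option, mix the peer addresses and the initiator's cookie
         * into the cookie bakery so the SYN-ACK can answer in kind.
         */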
1277        if (tmp_opt.cookie_plus > 0 &&
1278            tmp_opt.saw_tstamp &&
1279            !tp->rx_opt.cookie_out_never &&
1280            (sysctl_tcp_cookie_size > 0 ||
1281             (tp->cookie_values != NULL &&
1282              tp->cookie_values->cookie_desired > 0))) {
1283                u8 *c;
1284                u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1285                int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1286
1287                if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1288                        goto drop_and_release;
1289
1290                /* Secret recipe starts with IP addresses */
1291                *mess++ ^= (__force u32)daddr;
1292                *mess++ ^= (__force u32)saddr;
1293
1294                /* plus variable length Initiator Cookie */
1295                c = (u8 *)mess;
1296                while (l-- > 0)
1297                        *c++ ^= *hash_location++;
1298
1299#ifdef CONFIG_SYN_COOKIES
1300                want_cookie = 0;        /* not our kind of cookie */
1301#endif
1302                tmp_ext.cookie_out_never = 0; /* false */
1303                tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1304        } else if (!tp->rx_opt.cookie_in_always) {
1305                /* redundant indications, but ensure initialization. */
1306                tmp_ext.cookie_out_never = 1; /* true */
1307                tmp_ext.cookie_plus = 0;
1308        } else {
1309                goto drop_and_release;
1310        }
1311        tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1312
1313        if (want_cookie && !tmp_opt.saw_tstamp)
1314                tcp_clear_options(&tmp_opt);
1315
1316        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1317        tcp_openreq_init(req, &tmp_opt, skb);
1318
1319        ireq = inet_rsk(req);
1320        ireq->loc_addr = daddr;
1321        ireq->rmt_addr = saddr;
1322        ireq->no_srccheck = inet_sk(sk)->transparent;
1323        ireq->opt = tcp_v4_save_options(sk, skb);
1324
1325        if (security_inet_conn_request(sk, skb, req))
1326                goto drop_and_free;
1327
1328        if (!want_cookie || tmp_opt.tstamp_ok)
1329                TCP_ECN_create_request(req, tcp_hdr(skb));
1330
1331        if (want_cookie) {
1332                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1333                req->cookie_ts = tmp_opt.tstamp_ok;
1334        } else if (!isn) {
1335                struct inet_peer *peer = NULL;
1336
 1337                /* VJ's idea. We save the last timestamp seen
 1338                 * from the destination in the peer table when entering
 1339                 * TIME-WAIT state, and check against it before
 1340                 * accepting a new connection request.
 1341                 *
 1342                 * If "isn" is not zero, this request hit a live
 1343                 * timewait bucket, so all the necessary checks were
 1344                 * already made by the code processing the timewait state.
 1345                 */
1346                if (tmp_opt.saw_tstamp &&
1347                    tcp_death_row.sysctl_tw_recycle &&
1348                    (dst = inet_csk_route_req(sk, req)) != NULL &&
1349                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1350                    peer->v4daddr == saddr) {
1351                        inet_peer_refcheck(peer);
1352                        if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1353                            (s32)(peer->tcp_ts - req->ts_recent) >
1354                                                        TCP_PAWS_WINDOW) {
1355                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1356                                goto drop_and_release;
1357                        }
1358                }
1359                /* Kill the following clause, if you dislike this way. */
1360                else if (!sysctl_tcp_syncookies &&
1361                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1362                          (sysctl_max_syn_backlog >> 2)) &&
1363                         (!peer || !peer->tcp_ts_stamp) &&
1364                         (!dst || !dst_metric(dst, RTAX_RTT))) {
 1365                        /* Without syncookies the last quarter of the
 1366                         * backlog is reserved for destinations proven
 1367                         * to be alive.
 1368                         * It means that we continue to communicate only
 1369                         * with destinations already remembered at the
 1370                         * moment the synflood started.
 1371                         */
1372                        LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1373                                       &saddr, ntohs(tcp_hdr(skb)->source));
1374                        goto drop_and_release;
1375                }
1376
1377                isn = tcp_v4_init_sequence(skb);
1378        }
1379        tcp_rsk(req)->snt_isn = isn;
1380
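        /* Send the SYN-ACK.  If it cannot be sent, or if we answered with a
         * syncookie (no state is kept), free the request instead of queueing
         * it; otherwise add it to the SYN queue with the initial
         * retransmission timeout.
         */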
1381        if (tcp_v4_send_synack(sk, dst, req,
1382                               (struct request_values *)&tmp_ext) ||
1383            want_cookie)
1384                goto drop_and_free;
1385
1386        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1387        return 0;
1388
1389drop_and_release:
1390        dst_release(dst);
1391drop_and_free:
1392        reqsk_free(req);
1393drop:
1394        return 0;
1395}
1396EXPORT_SYMBOL(tcp_v4_conn_request);
1397
1398
 1399/*
 1400 * The three-way handshake has completed - we got a valid final ACK -
 1401 * now create the new socket.
 1402 */
1403struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1404                                  struct request_sock *req,
1405                                  struct dst_entry *dst)
1406{
1407        struct inet_request_sock *ireq;
1408        struct inet_sock *newinet;
1409        struct tcp_sock *newtp;
1410        struct sock *newsk;
1411#ifdef CONFIG_TCP_MD5SIG
1412        struct tcp_md5sig_key *key;
1413#endif
1414
1415        if (sk_acceptq_is_full(sk))
1416                goto exit_overflow;
1417
1418        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1419                goto exit;
1420
1421        newsk = tcp_create_openreq_child(sk, req, skb);
1422        if (!newsk)
1423                goto exit_nonewsk;
1424
1425        newsk->sk_gso_type = SKB_GSO_TCPV4;
1426        sk_setup_caps(newsk, dst);
1427
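        /* Copy addressing and the saved IP options from the request into
         * the child socket; the request gives up its reference to the
         * options (ireq->opt is cleared below).
         */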
1428        newtp                 = tcp_sk(newsk);
1429        newinet               = inet_sk(newsk);
1430        ireq                  = inet_rsk(req);
1431        newinet->inet_daddr   = ireq->rmt_addr;
1432        newinet->inet_rcv_saddr = ireq->loc_addr;
1433        newinet->inet_saddr           = ireq->loc_addr;
1434        newinet->opt          = ireq->opt;
1435        ireq->opt             = NULL;
1436        newinet->mc_index     = inet_iif(skb);
1437        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1438        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1439        if (newinet->opt)
1440                inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1441        newinet->inet_id = newtp->write_seq ^ jiffies;
1442
1443        tcp_mtup_init(newsk);
1444        tcp_sync_mss(newsk, dst_mtu(dst));
1445        newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1446        if (tcp_sk(sk)->rx_opt.user_mss &&
1447            tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1448                newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1449
1450        tcp_initialize_rcv_mss(newsk);
1451
1452#ifdef CONFIG_TCP_MD5SIG
1453        /* Copy over the MD5 key from the original socket */
1454        key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1455        if (key != NULL) {
1456                /*
1457                 * We're using one, so create a matching key
1458                 * on the newsk structure. If we fail to get
1459                 * memory, then we end up not copying the key
1460                 * across. Shucks.
1461                 */
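                /* GFP_ATOMIC because this can run from the softirq
                 * receive path, where sleeping is not allowed.
                 */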
1462                char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1463                if (newkey != NULL)
1464                        tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1465                                          newkey, key->keylen);
1466                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1467        }
1468#endif
1469
1470        if (__inet_inherit_port(sk, newsk) < 0) {
1471                sock_put(newsk);
1472                goto exit;
1473        }
1474        __inet_hash_nolisten(newsk, NULL);
1475
1476        return newsk;
1477
1478exit_overflow:
1479        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1480exit_nonewsk:
1481        dst_release(dst);
1482exit:
1483        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1484        return NULL;
1485}
1486EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1487
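/* Decide which socket should handle a segment arriving on a listening
 * socket: a pending request sock for the same 4-tuple (via tcp_check_req()),
 * an already established or TIME_WAIT socket, or - when syncookies are
 * compiled in - a connection recovered from a valid cookie in a non-SYN
 * segment.  Returns the socket that should process the segment, or NULL if
 * it should be discarded; when nothing matches, the listening socket itself
 * is returned.
 */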
1488static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1489{
1490        struct tcphdr *th = tcp_hdr(skb);
1491        const struct iphdr *iph = ip_hdr(skb);
1492        struct sock *nsk;
1493        struct request_sock **prev;
1494        /* Find possible connection requests. */
1495        struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1496                                                       iph->saddr, iph->daddr);
1497        if (req)
1498                return tcp_check_req(sk, skb, req, prev);
1499
1500        nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1501                        th->source, iph->daddr, th->dest, inet_iif(skb));
1502
1503        if (nsk) {
1504                if (nsk->sk_state != TCP_TIME_WAIT) {
1505                        bh_lock_sock(nsk);
1506                        return nsk;
1507                }
1508                inet_twsk_put(inet_twsk(nsk));
1509                return NULL;
1510        }
1511
1512#ifdef CONFIG_SYN_COOKIES
1513        if (!th->syn)
1514                sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1515#endif
1516        return sk;
1517}
1518
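/* Validate or prepare the TCP checksum of an incoming segment.  With
 * CHECKSUM_COMPLETE the device-provided sum is checked against the
 * pseudo-header right away; otherwise skb->csum is seeded with the
 * pseudo-header sum and the data checksum is completed later.  Very short
 * segments (<= 76 bytes here) are simply checksummed in full immediately,
 * presumably because deferring buys nothing for them.
 */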
1519static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1520{
1521        const struct iphdr *iph = ip_hdr(skb);
1522
1523        if (skb->ip_summed == CHECKSUM_COMPLETE) {
1524                if (!tcp_v4_check(skb->len, iph->saddr,
1525                                  iph->daddr, skb->csum)) {
1526                        skb->ip_summed = CHECKSUM_UNNECESSARY;
1527                        return 0;
1528                }
1529        }
1530
1531        skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1532                                       skb->len, IPPROTO_TCP, 0);
1533
1534        if (skb->len <= 76) {
1535                return __skb_checksum_complete(skb);
1536        }
1537        return 0;
1538}
1539
1540
1541/* The socket must have its spinlock held when we get
1542 * here.
1543 *
1544 * We have a potential double-lock case here, so even when
1545 * doing backlog processing we use the BH locking scheme.
1546 * This is because we cannot sleep with the original spinlock
1547 * held.
1548 */
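/* This is reached both from softirq context (tcp_v4_rcv() below, under
 * bh_lock_sock()) and from process context when the socket backlog is
 * drained - it is wired up as .backlog_rcv in tcp_prot - hence the BH
 * locking conventions described above.
 */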
1549int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1550{
1551        struct sock *rsk;
1552#ifdef CONFIG_TCP_MD5SIG
1553        /*
1554         * We really want to reject the packet as early as possible
1555         * if:
1556         *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1557         *  o There is an MD5 option and we're not expecting one
1558         */
1559        if (tcp_v4_inbound_md5_hash(sk, skb))
1560                goto discard;
1561#endif
1562
1563        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1564                sock_rps_save_rxhash(sk, skb->rxhash);
1565                TCP_CHECK_TIMER(sk);
1566                if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1567                        rsk = sk;
1568                        goto reset;
1569                }
1570                TCP_CHECK_TIMER(sk);
1571                return 0;
1572        }
1573
1574        if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1575                goto csum_err;
1576
1577        if (sk->sk_state == TCP_LISTEN) {
1578                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1579                if (!nsk)
1580                        goto discard;
1581
1582                if (nsk != sk) {
1583                        if (tcp_child_process(sk, nsk, skb)) {
1584                                rsk = nsk;
1585                                goto reset;
1586                        }
1587                        return 0;
1588                }
1589        } else
1590                sock_rps_save_rxhash(sk, skb->rxhash);
1591
1592
1593        TCP_CHECK_TIMER(sk);
1594        if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1595                rsk = sk;
1596                goto reset;
1597        }
1598        TCP_CHECK_TIMER(sk);
1599        return 0;
1600
1601reset:
1602        tcp_v4_send_reset(rsk, skb);
1603discard:
1604        kfree_skb(skb);
1605        /* Be careful here. If this function gets more complicated and
1606         * gcc suffers from register pressure on the x86, sk (in %ebx)
1607         * might be destroyed here. This current version compiles correctly,
1608         * but you have been warned.
1609         */
1610        return 0;
1611
1612csum_err:
1613        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1614        goto discard;
1615}
1616EXPORT_SYMBOL(tcp_v4_do_rcv);
1617
1618/*
1619 *      From tcp_input.c
1620 */
1621
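/* Main receive entry point for IPv4 TCP segments, called from the IP layer
 * in softirq context.  It validates the header and checksum, looks up the
 * owning socket, applies XFRM policy and socket filter checks, and then
 * either processes the segment directly, prequeues it for a reader blocked
 * in tcp_recvmsg(), or appends it to the backlog when the socket is
 * currently owned by user context.
 */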
1622int tcp_v4_rcv(struct sk_buff *skb)
1623{
1624        const struct iphdr *iph;
1625        struct tcphdr *th;
1626        struct sock *sk;
1627        int ret;
1628        struct net *net = dev_net(skb->dev);
1629
1630        if (skb->pkt_type != PACKET_HOST)
1631                goto discard_it;
1632
1633        /* Count it even if it's bad */
1634        TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1635
1636        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1637                goto discard_it;
1638
1639        th = tcp_hdr(skb);
1640
1641        if (th->doff < sizeof(struct tcphdr) / 4)
1642                goto bad_packet;
1643        if (!pskb_may_pull(skb, th->doff * 4))
1644                goto discard_it;
1645
1646        /* An explanation is required here, I think.
1647         * Packet length and doff are validated by header prediction,
1648         * provided the case of th->doff == 0 is eliminated.
1649         * So, we defer the checks. */
1650        if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1651                goto bad_packet;
1652
1653        th = tcp_hdr(skb);
1654        iph = ip_hdr(skb);
1655        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1656        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1657                                    skb->len - th->doff * 4);
1658        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1659        TCP_SKB_CB(skb)->when    = 0;
1660        TCP_SKB_CB(skb)->flags   = iph->tos;
1661        TCP_SKB_CB(skb)->sacked  = 0;
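        /* Note that SYN and FIN each occupy one unit of sequence space, so
         * e.g. a bare SYN carrying no payload ends up with
         * end_seq == seq + 1.
         */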
1662
1663        sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1664        if (!sk)
1665                goto no_tcp_socket;
1666
1667process:
1668        if (sk->sk_state == TCP_TIME_WAIT)
1669                goto do_time_wait;
1670
1671        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1672                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1673                goto discard_and_relse;
1674        }
1675
1676        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1677                goto discard_and_relse;
1678        nf_reset(skb);
1679
1680        if (sk_filter(sk, skb))
1681                goto discard_and_relse;
1682
1683        skb->dev = NULL;
1684
1685        bh_lock_sock_nested(sk);
1686        ret = 0;
1687        if (!sock_owned_by_user(sk)) {
1688#ifdef CONFIG_NET_DMA
1689                struct tcp_sock *tp = tcp_sk(sk);
1690                if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1691                        tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1692                if (tp->ucopy.dma_chan)
1693                        ret = tcp_v4_do_rcv(sk, skb);
1694                else
1695#endif
1696                {
1697                        if (!tcp_prequeue(sk, skb))
1698                                ret = tcp_v4_do_rcv(sk, skb);
1699                }
1700        } else if (unlikely(sk_add_backlog(sk, skb))) {
1701                bh_unlock_sock(sk);
1702                NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1703                goto discard_and_relse;
1704        }
1705        bh_unlock_sock(sk);
1706
1707        sock_put(sk);
1708
1709        return ret;
1710
1711no_tcp_socket:
1712        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1713                goto discard_it;
1714
1715        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1716bad_packet:
1717                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1718        } else {
1719                tcp_v4_send_reset(NULL, skb);
1720        }
1721
1722discard_it:
1723        /* Discard frame. */
1724        kfree_skb(skb);
1725        return 0;
1726
1727discard_and_relse:
1728        sock_put(sk);
1729        goto discard_it;
1730
1731do_time_wait:
1732        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1733                inet_twsk_put(inet_twsk(sk));
1734                goto discard_it;
1735        }
1736
1737        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1738                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1739                inet_twsk_put(inet_twsk(sk));
1740                goto discard_it;
1741        }
1742        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1743        case TCP_TW_SYN: {
1744                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1745                                                        &tcp_hashinfo,
1746                                                        iph->daddr, th->dest,
1747                                                        inet_iif(skb));
1748                if (sk2) {
1749                        inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1750                        inet_twsk_put(inet_twsk(sk));
1751                        sk = sk2;
1752                        goto process;
1753                }
1754                /* Fall through to ACK */
1755        }
1756        case TCP_TW_ACK:
1757                tcp_v4_timewait_ack(sk, skb);
1758                break;
1759        case TCP_TW_RST:
1760                goto no_tcp_socket;
1761        case TCP_TW_SUCCESS:;
1762        }
1763        goto discard_it;
1764}
1765
1766/* VJ's idea. Save the last timestamp seen from this destination
1767 * and hold it for at least the normal timewait interval, for use in
1768 * duplicate segment detection in subsequent connections before they enter
1769 * the synchronized state.
1770 */
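/* The timestamp is stored in the shared inet_peer entry for the destination
 * and is only ever advanced (or refreshed once the stored stamp has gone
 * stale), so later connections to the same peer can check incoming
 * timestamps against the newest value we have seen.
 */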
1771
1772int tcp_v4_remember_stamp(struct sock *sk)
1773{
1774        struct inet_sock *inet = inet_sk(sk);
1775        struct tcp_sock *tp = tcp_sk(sk);
1776        struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1777        struct inet_peer *peer = NULL;
1778        int release_it = 0;
1779
1780        if (!rt || rt->rt_dst != inet->inet_daddr) {
1781                peer = inet_getpeer(inet->inet_daddr, 1);
1782                release_it = 1;
1783        } else {
1784                if (!rt->peer)
1785                        rt_bind_peer(rt, 1);
1786                peer = rt->peer;
1787        }
1788
1789        if (peer) {
1790                if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1791                    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1792                     peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1793                        peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1794                        peer->tcp_ts = tp->rx_opt.ts_recent;
1795                }
1796                if (release_it)
1797                        inet_putpeer(peer);
1798                return 1;
1799        }
1800
1801        return 0;
1802}
1803EXPORT_SYMBOL(tcp_v4_remember_stamp);
1804
1805int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1806{
1807        struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1808
1809        if (peer) {
1810                const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1811
1812                if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1813                    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1814                     peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1815                        peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1816                        peer->tcp_ts       = tcptw->tw_ts_recent;
1817                }
1818                inet_putpeer(peer);
1819                return 1;
1820        }
1821
1822        return 0;
1823}
1824
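/* Address-family specific operations for IPv4 TCP sockets.  This table is
 * installed as icsk->icsk_af_ops by tcp_v4_init_sock() below; the
 * AF-independent TCP code calls through it for transmitting segments,
 * handling incoming connection requests, creating child sockets and
 * converting addresses.
 */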
1825const struct inet_connection_sock_af_ops ipv4_specific = {
1826        .queue_xmit        = ip_queue_xmit,
1827        .send_check        = tcp_v4_send_check,
1828        .rebuild_header    = inet_sk_rebuild_header,
1829        .conn_request      = tcp_v4_conn_request,
1830        .syn_recv_sock     = tcp_v4_syn_recv_sock,
1831        .remember_stamp    = tcp_v4_remember_stamp,
1832        .net_header_len    = sizeof(struct iphdr),
1833        .setsockopt        = ip_setsockopt,
1834        .getsockopt        = ip_getsockopt,
1835        .addr2sockaddr     = inet_csk_addr2sockaddr,
1836        .sockaddr_len      = sizeof(struct sockaddr_in),
1837        .bind_conflict     = inet_csk_bind_conflict,
1838#ifdef CONFIG_COMPAT
1839        .compat_setsockopt = compat_ip_setsockopt,
1840        .compat_getsockopt = compat_ip_getsockopt,
1841#endif
1842};
1843EXPORT_SYMBOL(ipv4_specific);
1844
1845#ifdef CONFIG_TCP_MD5SIG
1846static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1847        .md5_lookup             = tcp_v4_md5_lookup,
1848        .calc_md5_hash          = tcp_v4_md5_hash_skb,
1849        .md5_add                = tcp_v4_md5_add_func,
1850        .md5_parse              = tcp_v4_parse_md5_keys,
1851};
1852#endif
1853
1854/* NOTE: A lot of things are set to zero explicitly by the call to
1855 *       sk_alloc(), so they need not be done here.
1856 */
1857static int tcp_v4_init_sock(struct sock *sk)
1858{
1859        struct inet_connection_sock *icsk = inet_csk(sk);
1860        struct tcp_sock *tp = tcp_sk(sk);
1861
1862        skb_queue_head_init(&tp->out_of_order_queue);
1863        tcp_init_xmit_timers(sk);
1864        tcp_prequeue_init(tp);
1865
1866        icsk->icsk_rto = TCP_TIMEOUT_INIT;
1867        tp->mdev = TCP_TIMEOUT_INIT;
1868
1869        /* So many TCP implementations out there (incorrectly) count the
1870         * initial SYN frame in their delayed-ACK and congestion control
1871         * algorithms that we must have the following bandaid to talk
1872         * efficiently to them.  -DaveM
1873         */
1874        tp->snd_cwnd = 2;
1875
1876        /* See draft-stevens-tcpca-spec-01 for discussion of the
1877         * initialization of these values.
1878         */
1879        tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1880        tp->snd_cwnd_clamp = ~0;
1881        tp->mss_cache = TCP_MSS_DEFAULT;
1882
1883        tp->reordering = sysctl_tcp_reordering;
1884        icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1885
1886        sk->sk_state = TCP_CLOSE;
1887
1888        sk->sk_write_space = sk_stream_write_space;
1889        sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1890
1891        icsk->icsk_af_ops = &ipv4_specific;
1892        icsk->icsk_sync_mss = tcp_sync_mss;
1893#ifdef CONFIG_TCP_MD5SIG
1894        tp->af_specific = &tcp_sock_ipv4_specific;
1895#endif
1896
1897        /* TCP Cookie Transactions */
1898        if (sysctl_tcp_cookie_size > 0) {
1899                /* Default, cookies without s_data_payload. */
1900                tp->cookie_values =
1901                        kzalloc(sizeof(*tp->cookie_values),
1902                                sk->sk_allocation);
1903                if (tp->cookie_values != NULL)
1904                        kref_init(&tp->cookie_values->kref);
1905        }
1906        /* Presumed zeroed, in order of appearance:
1907         *      cookie_in_always, cookie_out_never,
1908         *      s_data_constant, s_data_in, s_data_out
1909         */
1910        sk->sk_sndbuf = sysctl_tcp_wmem[1];
1911        sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1912
1913        local_bh_disable();
1914        percpu_counter_inc(&tcp_sockets_allocated);
1915        local_bh_enable();
1916
1917        return 0;
1918}
1919
1920void tcp_v4_destroy_sock(struct sock *sk)
1921{
1922        struct tcp_sock *tp = tcp_sk(sk);
1923
1924        tcp_clear_xmit_timers(sk);
1925
1926        tcp_cleanup_congestion_control(sk);
1927
1928        /* Clean up the write buffer. */
1929        tcp_write_queue_purge(sk);
1930
1931        /* Cleans up our, hopefully empty, out_of_order_queue. */
1932        __skb_queue_purge(&tp->out_of_order_queue);
1933
1934#ifdef CONFIG_TCP_MD5SIG
1935        /* Clean up the MD5 key list, if any */
1936        if (tp->md5sig_info) {
1937                tcp_v4_clear_md5_list(sk);
1938                kfree(tp->md5sig_info);
1939                tp->md5sig_info = NULL;
1940        }
1941#endif
1942
1943#ifdef CONFIG_NET_DMA
1944        /* Cleans up our sk_async_wait_queue */
1945        __skb_queue_purge(&sk->sk_async_wait_queue);
1946#endif
1947
1948        /* Clean up the prequeue; it really should be empty */
1949        __skb_queue_purge(&tp->ucopy.prequeue);
1950
1951        /* Clean up a referenced TCP bind bucket. */
1952        if (inet_csk(sk)->icsk_bind_hash)
1953                inet_put_port(sk);
1954
1955        /*
1956         * If sendmsg cached page exists, toss it.
1957         */
1958        if (sk->sk_sndmsg_page) {
1959                __free_page(sk->sk_sndmsg_page);
1960                sk->sk_sndmsg_page = NULL;
1961        }
1962
1963        /* TCP Cookie Transactions */
1964        if (tp->cookie_values != NULL) {
1965                kref_put(&tp->cookie_values->kref,
1966                         tcp_cookie_values_release);
1967                tp->cookie_values = NULL;
1968        }
1969
1970        percpu_counter_dec(&tcp_sockets_allocated);
1971}
1972EXPORT_SYMBOL(tcp_v4_destroy_sock);
1973
1974#ifdef CONFIG_PROC_FS
1975/* Proc filesystem TCP sock list dumping. */
1976
1977static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1978{
1979        return hlist_nulls_empty(head) ? NULL :
1980                list_entry(head->first, struct inet_timewait_sock, tw_node);
1981}
1982
1983static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1984{
1985        return !is_a_nulls(tw->tw_node.next) ?
1986                hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1987}
1988
1989/*
1990 * Get the next listener socket following cur.  If cur is NULL, get the first socket
1991 * starting from bucket given in st->bucket; when st->bucket is zero the
1992 * very first socket in the hash table is returned.
1993 */
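/* The iterator state doubles as a resume point: st->bucket and st->offset
 * record where in the listening hash we are, st->num counts entries seen so
 * far, and st->state flips to TCP_SEQ_STATE_OPENREQ while a listener's SYN
 * queue is being walked under its syn_wait_lock.
 */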
1994static void *listening_get_next(struct seq_file *seq, void *cur)
1995{
1996        struct inet_connection_sock *icsk;
1997        struct hlist_nulls_node *node;
1998        struct sock *sk = cur;
1999        struct inet_listen_hashbucket *ilb;
2000        struct tcp_iter_state *st = seq->private;
2001        struct net *net = seq_file_net(seq);
2002
2003        if (!sk) {
2004                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2005                spin_lock_bh(&ilb->lock);
2006                sk = sk_nulls_head(&ilb->head);
2007                st->offset = 0;
2008                goto get_sk;
2009        }
2010        ilb = &tcp_hashinfo.listening_hash[st->bucket];
2011        ++st->num;
2012        ++st->offset;
2013
2014        if (st->state == TCP_SEQ_STATE_OPENREQ) {
2015                struct request_sock *req = cur;
2016
2017                icsk = inet_csk(st->syn_wait_sk);
2018                req = req->dl_next;
2019                while (1) {
2020                        while (req) {
2021                                if (req->rsk_ops->family == st->family) {
2022                                        cur = req;
2023                                        goto out;
2024                                }
2025                                req = req->dl_next;
2026                        }
2027                        st->offset = 0;
2028                        if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2029                                break;
2030get_req:
2031                        req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2032                }
2033                sk        = sk_nulls_next(st->syn_wait_sk);
2034                st->state = TCP_SEQ_STATE_LISTENING;
2035                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2036        } else {
2037                icsk = inet_csk(sk);
2038                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2039                if (reqsk_queue_len(&icsk->icsk_accept_queue))
2040                        goto start_req;
2041                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2042                sk = sk_nulls_next(sk);
2043        }
2044get_sk:
2045        sk_nulls_for_each_from(sk, node) {
2046                if (!net_eq(sock_net(sk), net))
2047                        continue;
2048                if (sk->sk_family == st->family) {
2049                        cur = sk;
2050                        goto out;
2051                }
2052                icsk = inet_csk(sk);
2053                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2054                if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2055start_req:
2056                        st->uid         = sock_i_uid(sk);
2057                        st->syn_wait_sk = sk;
2058                        st->state       = TCP_SEQ_STATE_OPENREQ;
2059                        st->sbucket     = 0;
2060                        goto get_req;
2061                }
2062                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2063        }
2064        spin_unlock_bh(&ilb->lock);
2065        st->offset = 0;
2066        if (++st->bucket < INET_LHTABLE_SIZE) {
2067                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2068                spin_lock_bh(&ilb->lock);
2069                sk = sk_nulls_head(&ilb->head);
2070                goto get_sk;
2071        }
2072        cur = NULL;
2073out:
2074        return cur;
2075}
2076
2077static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2078{
2079        struct tcp_iter_state *st = seq->private;
2080        void *rc;
2081
2082        st->bucket = 0;
2083        st->offset = 0;
2084        rc = listening_get_next(seq, NULL);
2085
2086        while (rc && *pos) {
2087                rc = listening_get_next(seq, rc);
2088                --*pos;
2089        }
2090        return rc;
2091}
2092
2093static inline int empty_bucket(struct tcp_iter_state *st)
2094{
2095        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2096                hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2097}
2098
2099/*
2100 * Get the first established socket, starting from the bucket given in st->bucket.
2101 * If st->bucket is zero, the very first socket in the hash is returned.
2102 */
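/* Each ehash bucket carries two chains - established sockets on .chain and
 * TIME_WAIT sockets on .twchain - and both are scanned here (and in
 * established_get_next()) under the bucket's spinlock, established entries
 * first.
 */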
2103static void *established_get_first(struct seq_file *seq)
2104{
2105        struct tcp_iter_state *st = seq->private;
2106        struct net *net = seq_file_net(seq);
2107        void *rc = NULL;
2108
2109        st->offset = 0;
2110        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2111                struct sock *sk;
2112                struct hlist_nulls_node *node;
2113                struct inet_timewait_sock *tw;
2114                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2115
2116                /* Lockless fast path for the common case of empty buckets */
2117                if (empty_bucket(st))
2118                        continue;
2119
2120                spin_lock_bh(lock);
2121                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2122                        if (sk->sk_family != st->family ||
2123                            !net_eq(sock_net(sk), net)) {
2124                                continue;
2125                        }
2126                        rc = sk;
2127                        goto out;
2128                }
2129                st->state = TCP_SEQ_STATE_TIME_WAIT;
2130                inet_twsk_for_each(tw, node,
2131                                   &tcp_hashinfo.ehash[st->bucket].twchain) {
2132                        if (tw->tw_family != st->family ||
2133                            !net_eq(twsk_net(tw), net)) {
2134                                continue;
2135                        }
2136                        rc = tw;
2137                        goto out;
2138                }
2139                spin_unlock_bh(lock);
2140                st->state = TCP_SEQ_STATE_ESTABLISHED;
2141        }
2142out:
2143        return rc;
2144}
2145
2146static void *established_get_next(struct seq_file *seq, void *cur)
2147{
2148        struct sock *sk = cur;
2149        struct inet_timewait_sock *tw;
2150        struct hlist_nulls_node *node;
2151        struct tcp_iter_state *st = seq->private;
2152        struct net *net = seq_file_net(seq);
2153
2154        ++st->num;
2155        ++st->offset;
2156
2157        if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2158                tw = cur;
2159                tw = tw_next(tw);
2160get_tw:
2161                while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2162                        tw = tw_next(tw);
2163                }
2164                if (tw) {
2165                        cur = tw;
2166                        goto out;
2167                }
2168                spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2169                st->state = TCP_SEQ_STATE_ESTABLISHED;
2170
2171                /* Look for the next non-empty bucket */
2172                st->offset = 0;
2173                while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2174                                empty_bucket(st))
2175                        ;
2176                if (st->bucket > tcp_hashinfo.ehash_mask)
2177                        return NULL;
2178
2179                spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2180                sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2181        } else
2182                sk = sk_nulls_next(sk);
2183
2184        sk_nulls_for_each_from(sk, node) {
2185                if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2186                        goto found;
2187        }
2188
2189        st->state = TCP_SEQ_STATE_TIME_WAIT;
2190        tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2191        goto get_tw;
2192found:
2193        cur = sk;
2194out:
2195        return cur;
2196}
2197
2198static void *established_get_idx(struct seq_file *seq, loff_t pos)
2199{
2200        struct tcp_iter_state *st = seq->private;
2201        void *rc;
2202
2203        st->bucket = 0;
2204        rc = established_get_first(seq);
2205
2206        while (rc && pos) {
2207                rc = established_get_next(seq, rc);
2208                --pos;
2209        }
2210        return rc;
2211}
2212
2213static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2214{
2215        void *rc;
2216        struct tcp_iter_state *st = seq->private;
2217
2218        st->state = TCP_SEQ_STATE_LISTENING;
2219        rc        = listening_get_idx(seq, &pos);
2220
2221        if (!rc) {
2222                st->state = TCP_SEQ_STATE_ESTABLISHED;
2223                rc        = established_get_idx(seq, pos);
2224        }
2225
2226        return rc;
2227}
2228
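/* Fast-forward to the position recorded by the previous pass over the seq
 * file: rather than rescanning from the start, resume from the remembered
 * st->bucket and skip st->offset entries within it, leaving st->num (the
 * running entry count) unchanged.
 */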
2229static void *tcp_seek_last_pos(struct seq_file *seq)
2230{
2231        struct tcp_iter_state *st = seq->private;
2232        int offset = st->offset;
2233        int orig_num = st->num;
2234        void *rc = NULL;
2235
2236        switch (st->state) {
2237        case TCP_SEQ_STATE_OPENREQ:
2238        case TCP_SEQ_STATE_LISTENING:
2239                if (st->bucket >= INET_LHTABLE_SIZE)
2240                        break;
2241                st->state = TCP_SEQ_STATE_LISTENING;
2242                rc = listening_get_next(seq, NULL);
2243                while (offset-- && rc)
2244                        rc = listening_get_next(seq, rc);
2245                if (rc)
2246                        break;
2247                st->bucket = 0;
2248                /* Fallthrough */
2249        case TCP_SEQ_STATE_ESTABLISHED:
2250        case TCP_SEQ_STATE_TIME_WAIT:
2251                st->state = TCP_SEQ_STATE_ESTABLISHED;
2252                if (st->bucket > tcp_hashinfo.ehash_mask)
2253                        break;
2254                rc = established_get_first(seq);
2255                while (offset-- && rc)
2256                        rc = established_get_next(seq, rc);
2257        }
2258
2259        st->num = orig_num;
2260
2261        return rc;
2262}
2263
2264static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2265{
2266        struct tcp_iter_state *st = seq->private;
2267        void *rc;
2268
2269        if (*pos && *pos == st->last_pos) {
2270                rc = tcp_seek_last_pos(seq);
2271                if (rc)
2272                        goto out;
2273        }
2274
2275        st->state = TCP_SEQ_STATE_LISTENING;
2276        st->num = 0;
2277        st->bucket = 0;
2278        st->offset = 0;
2279        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2280
2281out:
2282        st->last_pos = *pos;
2283        return rc;
2284}
2285
2286static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2287{
2288        struct tcp_iter_state *st = seq->private;
2289        void *rc = NULL;
2290
2291        if (v == SEQ_START_TOKEN) {
2292                rc = tcp_get_idx(seq, 0);
2293                goto out;
2294        }
2295
2296        switch (st->state) {
2297        case TCP_SEQ_STATE_OPENREQ:
2298        case TCP_SEQ_STATE_LISTENING:
2299                rc = listening_get_next(seq, v);
2300                if (!rc) {
2301                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2302                        st->bucket = 0;
2303                        st->offset = 0;
2304                        rc        = established_get_first(seq);
2305                }
2306                break;
2307        case TCP_SEQ_STATE_ESTABLISHED:
2308        case TCP_SEQ_STATE_TIME_WAIT:
2309                rc = established_get_next(seq, v);
2310                break;
2311        }
2312out:
2313        ++*pos;
2314        st->last_pos = *pos;
2315        return rc;
2316}
2317
2318static void tcp_seq_stop(struct seq_file *seq, void *v)
2319{
2320        struct tcp_iter_state *st = seq->private;
2321
2322        switch (st->state) {
2323        case TCP_SEQ_STATE_OPENREQ:
2324                if (v) {
2325                        struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2326                        read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2327                }
2328        case TCP_SEQ_STATE_LISTENING:
2329                if (v != SEQ_START_TOKEN)
2330                        spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2331                break;
2332        case TCP_SEQ_STATE_TIME_WAIT:
2333        case TCP_SEQ_STATE_ESTABLISHED:
2334                if (v)
2335                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2336                break;
2337        }
2338}
2339
2340static int tcp_seq_open(struct inode *inode, struct file *file)
2341{
2342        struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2343        struct tcp_iter_state *s;
2344        int err;
2345
2346        err = seq_open_net(inode, file, &afinfo->seq_ops,
2347                          sizeof(struct tcp_iter_state));
2348        if (err < 0)
2349                return err;
2350
2351        s = ((struct seq_file *)file->private_data)->private;
2352        s->family               = afinfo->family;
2353        s->last_pos             = 0;
2354        return 0;
2355}
2356
2357int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2358{
2359        int rc = 0;
2360        struct proc_dir_entry *p;
2361
2362        afinfo->seq_fops.open           = tcp_seq_open;
2363        afinfo->seq_fops.read           = seq_read;
2364        afinfo->seq_fops.llseek         = seq_lseek;
2365        afinfo->seq_fops.release        = seq_release_net;
2366
2367        afinfo->seq_ops.start           = tcp_seq_start;
2368        afinfo->seq_ops.next            = tcp_seq_next;
2369        afinfo->seq_ops.stop            = tcp_seq_stop;
2370
2371        p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2372                             &afinfo->seq_fops, afinfo);
2373        if (!p)
2374                rc = -ENOMEM;
2375        return rc;
2376}
2377EXPORT_SYMBOL(tcp_proc_register);
2378
2379void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2380{
2381        proc_net_remove(net, afinfo->name);
2382}
2383EXPORT_SYMBOL(tcp_proc_unregister);
2384
2385static void get_openreq4(struct sock *sk, struct request_sock *req,
2386                         struct seq_file *f, int i, int uid, int *len)
2387{
2388        const struct inet_request_sock *ireq = inet_rsk(req);
2389        int ttd = req->expires - jiffies;
2390
2391        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2392                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2393                i,
2394                ireq->loc_addr,
2395                ntohs(inet_sk(sk)->inet_sport),
2396                ireq->rmt_addr,
2397                ntohs(ireq->rmt_port),
2398                TCP_SYN_RECV,
2399                0, 0, /* could print option size, but that is af dependent. */
2400                1,    /* timers active (only the expire timer) */
2401                jiffies_to_clock_t(ttd),
2402                req->retrans,
2403                uid,
2404                0,  /* non standard timer */
2405                0, /* open_requests have no inode */
2406                atomic_read(&sk->sk_refcnt),
2407                req,
2408                len);
2409}
2410
2411static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2412{
2413        int timer_active;
2414        unsigned long timer_expires;
2415        struct tcp_sock *tp = tcp_sk(sk);
2416        const struct inet_connection_sock *icsk = inet_csk(sk);
2417        struct inet_sock *inet = inet_sk(sk);
2418        __be32 dest = inet->inet_daddr;
2419        __be32 src = inet->inet_rcv_saddr;
2420        __u16 destp = ntohs(inet->inet_dport);
2421        __u16 srcp = ntohs(inet->inet_sport);
2422        int rx_queue;
2423
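        /* The "tr" column encodes which timer is pending: 1 for the
         * retransmit timer, 4 for the zero window probe timer, 2 when
         * sk_timer (normally the keepalive timer) is armed, 0 when none
         * is.  TIME_WAIT entries report 3, see get_timewait4_sock().
         */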
2424        if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2425                timer_active    = 1;
2426                timer_expires   = icsk->icsk_timeout;
2427        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2428                timer_active    = 4;
2429                timer_expires   = icsk->icsk_timeout;
2430        } else if (timer_pending(&sk->sk_timer)) {
2431                timer_active    = 2;
2432                timer_expires   = sk->sk_timer.expires;
2433        } else {
2434                timer_active    = 0;
2435                timer_expires = jiffies;
2436        }
2437
2438        if (sk->sk_state == TCP_LISTEN)
2439                rx_queue = sk->sk_ack_backlog;
2440        else
2441                /*
2442                 * Because we don't lock the socket, we might find a transient negative value
2443                 */
2444                rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2445
2446        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2447                        "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2448                i, src, srcp, dest, destp, sk->sk_state,
2449                tp->write_seq - tp->snd_una,
2450                rx_queue,
2451                timer_active,
2452                jiffies_to_clock_t(timer_expires - jiffies),
2453                icsk->icsk_retransmits,
2454                sock_i_uid(sk),
2455                icsk->icsk_probes_out,
2456                sock_i_ino(sk),
2457                atomic_read(&sk->sk_refcnt), sk,
2458                jiffies_to_clock_t(icsk->icsk_rto),
2459                jiffies_to_clock_t(icsk->icsk_ack.ato),
2460                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2461                tp->snd_cwnd,
2462                tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2463                len);
2464}
2465
2466static void get_timewait4_sock(struct inet_timewait_sock *tw,
2467                               struct seq_file *f, int i, int *len)
2468{
2469        __be32 dest, src;
2470        __u16 destp, srcp;
2471        int ttd = tw->tw_ttd - jiffies;
2472
2473        if (ttd < 0)
2474                ttd = 0;
2475
2476        dest  = tw->tw_daddr;
2477        src   = tw->tw_rcv_saddr;
2478        destp = ntohs(tw->tw_dport);
2479        srcp  = ntohs(tw->tw_sport);
2480
2481        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2482                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2483                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2484                3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2485                atomic_read(&tw->tw_refcnt), tw, len);
2486}
2487
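/* TMPSZ is the fixed width of one /proc/net/tcp record: tcp4_seq_show()
 * pads every line out to TMPSZ - 1 characters with trailing spaces.
 */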
2488#define TMPSZ 150
2489
2490static int tcp4_seq_show(struct seq_file *seq, void *v)
2491{
2492        struct tcp_iter_state *st;
2493        int len;
2494
2495        if (v == SEQ_START_TOKEN) {
2496                seq_printf(seq, "%-*s\n", TMPSZ - 1,
2497                           "  sl  local_address rem_address   st tx_queue "
2498                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2499                           "inode");
2500                goto out;
2501        }
2502        st = seq->private;
2503
2504        switch (st->state) {
2505        case TCP_SEQ_STATE_LISTENING:
2506        case TCP_SEQ_STATE_ESTABLISHED:
2507                get_tcp4_sock(v, seq, st->num, &len);
2508                break;
2509        case TCP_SEQ_STATE_OPENREQ:
2510                get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2511                break;
2512        case TCP_SEQ_STATE_TIME_WAIT:
2513                get_timewait4_sock(v, seq, st->num, &len);
2514                break;
2515        }
2516        seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2517out:
2518        return 0;
2519}
2520
2521static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2522        .name           = "tcp",
2523        .family         = AF_INET,
2524        .seq_fops       = {
2525                .owner          = THIS_MODULE,
2526        },
2527        .seq_ops        = {
2528                .show           = tcp4_seq_show,
2529        },
2530};
2531
2532static int __net_init tcp4_proc_init_net(struct net *net)
2533{
2534        return tcp_proc_register(net, &tcp4_seq_afinfo);
2535}
2536
2537static void __net_exit tcp4_proc_exit_net(struct net *net)
2538{
2539        tcp_proc_unregister(net, &tcp4_seq_afinfo);
2540}
2541
2542static struct pernet_operations tcp4_net_ops = {
2543        .init = tcp4_proc_init_net,
2544        .exit = tcp4_proc_exit_net,
2545};
2546
2547int __init tcp4_proc_init(void)
2548{
2549        return register_pernet_subsys(&tcp4_net_ops);
2550}
2551
2552void tcp4_proc_exit(void)
2553{
2554        unregister_pernet_subsys(&tcp4_net_ops);
2555}
2556#endif /* CONFIG_PROC_FS */
2557
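/* GRO receive hook for TCP over IPv4.  When the device supplies a full
 * packet checksum it is verified against the pseudo-header here and the skb
 * is marked CHECKSUM_UNNECESSARY; segments whose checksum cannot be
 * validated are flagged for flushing instead of being aggregated.  The
 * protocol-level merging itself is done by the generic tcp_gro_receive().
 */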
2558struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2559{
2560        struct iphdr *iph = skb_gro_network_header(skb);
2561
2562        switch (skb->ip_summed) {
2563        case CHECKSUM_COMPLETE:
2564                if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2565                                  skb->csum)) {
2566                        skb->ip_summed = CHECKSUM_UNNECESSARY;
2567                        break;
2568                }
2569
2570                /* fall through */
2571        case CHECKSUM_NONE:
2572                NAPI_GRO_CB(skb)->flush = 1;
2573                return NULL;
2574        }
2575
2576        return tcp_gro_receive(head, skb);
2577}
2578
2579int tcp4_gro_complete(struct sk_buff *skb)
2580{
2581        struct iphdr *iph = ip_hdr(skb);
2582        struct tcphdr *th = tcp_hdr(skb);
2583
2584        th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2585                                  iph->saddr, iph->daddr, 0);
2586        skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2587
2588        return tcp_gro_complete(skb);
2589}
2590
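/* The proto entry for IPv4 TCP sockets.  Note .backlog_rcv in particular:
 * segments queued while the socket was owned by user context are fed back
 * through tcp_v4_do_rcv() once the owner releases the socket.
 */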
2591struct proto tcp_prot = {
2592        .name                   = "TCP",
2593        .owner                  = THIS_MODULE,
2594        .close                  = tcp_close,
2595        .connect                = tcp_v4_connect,
2596        .disconnect             = tcp_disconnect,
2597        .accept                 = inet_csk_accept,
2598        .ioctl                  = tcp_ioctl,
2599        .init                   = tcp_v4_init_sock,
2600        .destroy                = tcp_v4_destroy_sock,
2601        .shutdown               = tcp_shutdown,
2602        .setsockopt             = tcp_setsockopt,
2603        .getsockopt             = tcp_getsockopt,
2604        .recvmsg                = tcp_recvmsg,
2605        .sendmsg                = tcp_sendmsg,
2606        .sendpage               = tcp_sendpage,
2607        .backlog_rcv            = tcp_v4_do_rcv,
2608        .hash                   = inet_hash,
2609        .unhash                 = inet_unhash,
2610        .get_port               = inet_csk_get_port,
2611        .enter_memory_pressure  = tcp_enter_memory_pressure,
2612        .sockets_allocated      = &tcp_sockets_allocated,
2613        .orphan_count           = &tcp_orphan_count,
2614        .memory_allocated       = &tcp_memory_allocated,
2615        .memory_pressure        = &tcp_memory_pressure,
2616        .sysctl_mem             = sysctl_tcp_mem,
2617        .sysctl_wmem            = sysctl_tcp_wmem,
2618        .sysctl_rmem            = sysctl_tcp_rmem,
2619        .max_header             = MAX_TCP_HEADER,
2620        .obj_size               = sizeof(struct tcp_sock),
2621        .slab_flags             = SLAB_DESTROY_BY_RCU,
2622        .twsk_prot              = &tcp_timewait_sock_ops,
2623        .rsk_prot               = &tcp_request_sock_ops,
2624        .h.hashinfo             = &tcp_hashinfo,
2625        .no_autobind            = true,
2626#ifdef CONFIG_COMPAT
2627        .compat_setsockopt      = compat_tcp_setsockopt,
2628        .compat_getsockopt      = compat_tcp_getsockopt,
2629#endif
2630};
2631EXPORT_SYMBOL(tcp_prot);
2632
2633
2634static int __net_init tcp_sk_init(struct net *net)
2635{
2636        return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2637                                    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2638}
2639
2640static void __net_exit tcp_sk_exit(struct net *net)
2641{
2642        inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2643}
2644
2645static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2646{
2647        inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2648}
2649
2650static struct pernet_operations __net_initdata tcp_sk_ops = {
2651       .init       = tcp_sk_init,
2652       .exit       = tcp_sk_exit,
2653       .exit_batch = tcp_sk_exit_batch,
2654};
2655
2656void __init tcp_v4_init(void)
2657{
2658        inet_hashinfo_init(&tcp_hashinfo);
2659        if (register_pernet_subsys(&tcp_sk_ops))
2660                panic("Failed to create the TCP control socket.\n");
2661}
2662