linux/net/ipv4/tcp_ipv4.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 *              IPv4 specific functions
   9 *
  10 *
  11 *              code split from:
  12 *              linux/ipv4/tcp.c
  13 *              linux/ipv4/tcp_input.c
  14 *              linux/ipv4/tcp_output.c
  15 *
  16 *              See tcp.c for author information
  17 *
  18 *      This program is free software; you can redistribute it and/or
  19 *      modify it under the terms of the GNU General Public License
  20 *      as published by the Free Software Foundation; either version
  21 *      2 of the License, or (at your option) any later version.
  22 */
  23
  24/*
  25 * Changes:
  26 *              David S. Miller :       New socket lookup architecture.
  27 *                                      This code is dedicated to John Dyson.
  28 *              David S. Miller :       Change semantics of established hash,
  29 *                                      half is devoted to TIME_WAIT sockets
  30 *                                      and the rest go in the other half.
  31 *              Andi Kleen :            Add support for syncookies and fixed
  32 *                                      some bugs: ip options weren't passed to
  33 *                                      the TCP layer, missed a check for an
  34 *                                      ACK bit.
  35 *              Andi Kleen :            Implemented fast path mtu discovery.
  36 *                                      Fixed many serious bugs in the
  37 *                                      request_sock handling and moved
  38 *                                      most of it into the af independent code.
  39 *                                      Added tail drop and some other bugfixes.
  40 *                                      Added new listen semantics.
  41 *              Mike McLagan    :       Routing by source
  42 *      Juan Jose Ciarlante:            ip_dynaddr bits
  43 *              Andi Kleen:             various fixes.
  44 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45 *                                      coma.
  46 *      Andi Kleen              :       Fix new listen.
  47 *      Andi Kleen              :       Fix accept error reporting.
  48 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
   49 *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
  50 *                                      a single port at the same time.
  51 */
  52
  53
  54#include <linux/types.h>
  55#include <linux/fcntl.h>
  56#include <linux/module.h>
  57#include <linux/random.h>
  58#include <linux/cache.h>
  59#include <linux/jhash.h>
  60#include <linux/init.h>
  61#include <linux/times.h>
  62
  63#include <net/net_namespace.h>
  64#include <net/icmp.h>
  65#include <net/inet_hashtables.h>
  66#include <net/tcp.h>
  67#include <net/transp_v6.h>
  68#include <net/ipv6.h>
  69#include <net/inet_common.h>
  70#include <net/timewait_sock.h>
  71#include <net/xfrm.h>
  72#include <net/netdma.h>
  73
  74#include <linux/inet.h>
  75#include <linux/ipv6.h>
  76#include <linux/stddef.h>
  77#include <linux/proc_fs.h>
  78#include <linux/seq_file.h>
  79
  80#include <linux/crypto.h>
  81#include <linux/scatterlist.h>
  82
  83int sysctl_tcp_tw_reuse __read_mostly;
  84int sysctl_tcp_low_latency __read_mostly;
  85
  86
  87#ifdef CONFIG_TCP_MD5SIG
  88static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
  89                                                   __be32 addr);
  90static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
  91                               __be32 daddr, __be32 saddr, struct tcphdr *th);
  92#else
  93static inline
  94struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
  95{
  96        return NULL;
  97}
  98#endif
  99
 100struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
 101        .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
 102        .lhash_users = ATOMIC_INIT(0),
 103        .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
 104};
 105
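/* Pick the initial sequence number for a passively opened connection from the
 * addresses and ports of the incoming segment (see secure_tcp_sequence_number()).
 */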
 106static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 107{
 108        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 109                                          ip_hdr(skb)->saddr,
 110                                          tcp_hdr(skb)->dest,
 111                                          tcp_hdr(skb)->source);
 112}
 113
 114int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 115{
 116        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 117        struct tcp_sock *tp = tcp_sk(sk);
 118
 119        /* With PAWS, it is safe from the viewpoint
 120           of data integrity. Even without PAWS it is safe provided sequence
 121           spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 122
  123           Actually, the idea is close to VJ's, only the timestamp cache is
  124           held not per host but per port pair, and the TW bucket is used as
  125           the state holder.
  126
  127           If the TW bucket has already been destroyed we fall back to VJ's
  128           scheme and use the initial timestamp retrieved from the peer table.
 129         */
 130        if (tcptw->tw_ts_recent_stamp &&
 131            (twp == NULL || (sysctl_tcp_tw_reuse &&
 132                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 133                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 134                if (tp->write_seq == 0)
 135                        tp->write_seq = 1;
 136                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 137                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 138                sock_hold(sktw);
 139                return 1;
 140        }
 141
 142        return 0;
 143}
 144
 145EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 146
 147/* This will initiate an outgoing connection. */
 148int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 149{
 150        struct inet_sock *inet = inet_sk(sk);
 151        struct tcp_sock *tp = tcp_sk(sk);
 152        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 153        struct rtable *rt;
 154        __be32 daddr, nexthop;
 155        int tmp;
 156        int err;
 157
 158        if (addr_len < sizeof(struct sockaddr_in))
 159                return -EINVAL;
 160
 161        if (usin->sin_family != AF_INET)
 162                return -EAFNOSUPPORT;
 163
 164        nexthop = daddr = usin->sin_addr.s_addr;
 165        if (inet->opt && inet->opt->srr) {
 166                if (!daddr)
 167                        return -EINVAL;
 168                nexthop = inet->opt->faddr;
 169        }
 170
 171        tmp = ip_route_connect(&rt, nexthop, inet->saddr,
 172                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 173                               IPPROTO_TCP,
 174                               inet->sport, usin->sin_port, sk, 1);
 175        if (tmp < 0) {
 176                if (tmp == -ENETUNREACH)
 177                        IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 178                return tmp;
 179        }
 180
 181        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 182                ip_rt_put(rt);
 183                return -ENETUNREACH;
 184        }
 185
 186        if (!inet->opt || !inet->opt->srr)
 187                daddr = rt->rt_dst;
 188
 189        if (!inet->saddr)
 190                inet->saddr = rt->rt_src;
 191        inet->rcv_saddr = inet->saddr;
 192
 193        if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
 194                /* Reset inherited state */
 195                tp->rx_opt.ts_recent       = 0;
 196                tp->rx_opt.ts_recent_stamp = 0;
 197                tp->write_seq              = 0;
 198        }
 199
 200        if (tcp_death_row.sysctl_tw_recycle &&
 201            !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 202                struct inet_peer *peer = rt_get_peer(rt);
 203                /*
  204                 * VJ's idea. We save the last timestamp seen from
  205                 * the destination in the peer table when entering the
  206                 * TIME-WAIT state, and initialize rx_opt.ts_recent from it
  207                 * when trying a new connection.
  208                 */
 209                if (peer != NULL &&
 210                    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
 211                        tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 212                        tp->rx_opt.ts_recent = peer->tcp_ts;
 213                }
 214        }
 215
 216        inet->dport = usin->sin_port;
 217        inet->daddr = daddr;
 218
 219        inet_csk(sk)->icsk_ext_hdr_len = 0;
 220        if (inet->opt)
 221                inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 222
 223        tp->rx_opt.mss_clamp = 536;
 224
 225        /* Socket identity is still unknown (sport may be zero).
  226         * However, we set the state to SYN-SENT and, without releasing the
  227         * socket lock, select a source port, enter ourselves into the hash
  228         * tables and complete initialization after this.
 229         */
 230        tcp_set_state(sk, TCP_SYN_SENT);
 231        err = inet_hash_connect(&tcp_death_row, sk);
 232        if (err)
 233                goto failure;
 234
 235        err = ip_route_newports(&rt, IPPROTO_TCP,
 236                                inet->sport, inet->dport, sk);
 237        if (err)
 238                goto failure;
 239
 240        /* OK, now commit destination to socket.  */
 241        sk->sk_gso_type = SKB_GSO_TCPV4;
 242        sk_setup_caps(sk, &rt->u.dst);
 243
 244        if (!tp->write_seq)
 245                tp->write_seq = secure_tcp_sequence_number(inet->saddr,
 246                                                           inet->daddr,
 247                                                           inet->sport,
 248                                                           usin->sin_port);
 249
 250        inet->id = tp->write_seq ^ jiffies;
 251
 252        err = tcp_connect(sk);
 253        rt = NULL;
 254        if (err)
 255                goto failure;
 256
 257        return 0;
 258
 259failure:
 260        /*
 261         * This unhashes the socket and releases the local port,
 262         * if necessary.
 263         */
 264        tcp_set_state(sk, TCP_CLOSE);
 265        ip_rt_put(rt);
 266        sk->sk_route_caps = 0;
 267        inet->dport = 0;
 268        return err;
 269}
 270
 271/*
 272 * This routine does path mtu discovery as defined in RFC1191.
 273 */
 274static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
 275{
 276        struct dst_entry *dst;
 277        struct inet_sock *inet = inet_sk(sk);
 278
 279        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
  280         * sent out by Linux are always < 576 bytes, so they should go through
 281         * unfragmented).
 282         */
 283        if (sk->sk_state == TCP_LISTEN)
 284                return;
 285
  286        /* We don't check in the dst entry whether pmtu discovery is forbidden
  287         * on this route. We just assume that no packet-too-big messages
  288         * are sent back when pmtu discovery is not active.
 289         * There is a small race when the user changes this flag in the
 290         * route, but I think that's acceptable.
 291         */
 292        if ((dst = __sk_dst_check(sk, 0)) == NULL)
 293                return;
 294
 295        dst->ops->update_pmtu(dst, mtu);
 296
  297        /* Something is about to go wrong... Remember the soft error
  298         * in case this connection is not able to recover.
 299         */
 300        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 301                sk->sk_err_soft = EMSGSIZE;
 302
 303        mtu = dst_mtu(dst);
 304
 305        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 306            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 307                tcp_sync_mss(sk, mtu);
 308
 309                /* Resend the TCP packet because it's
 310                 * clear that the old packet has been
 311                 * dropped. This is the new "fast" path mtu
 312                 * discovery.
 313                 */
 314                tcp_simple_retransmit(sk);
 315        } /* else let the usual retransmit timer handle it */
 316}
 317
 318/*
 319 * This routine is called by the ICMP module when it gets some
 320 * sort of error condition.  If err < 0 then the socket should
 321 * be closed and the error returned to the user.  If err > 0
 322 * it's just the icmp type << 8 | icmp code.  After adjustment
 323 * header points to the first 8 bytes of the tcp header.  We need
 324 * to find the appropriate port.
 325 *
 326 * The locking strategy used here is very "optimistic". When
 327 * someone else accesses the socket the ICMP is just dropped
 328 * and for some paths there is no check at all.
 329 * A more general error queue to queue errors for later handling
 330 * is probably better.
 331 *
 332 */
 333
 334void tcp_v4_err(struct sk_buff *skb, u32 info)
 335{
 336        struct iphdr *iph = (struct iphdr *)skb->data;
 337        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 338        struct tcp_sock *tp;
 339        struct inet_sock *inet;
 340        const int type = icmp_hdr(skb)->type;
 341        const int code = icmp_hdr(skb)->code;
 342        struct sock *sk;
 343        __u32 seq;
 344        int err;
 345        struct net *net = dev_net(skb->dev);
 346
 347        if (skb->len < (iph->ihl << 2) + 8) {
 348                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 349                return;
 350        }
 351
 352        sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 353                        iph->saddr, th->source, inet_iif(skb));
 354        if (!sk) {
 355                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 356                return;
 357        }
 358        if (sk->sk_state == TCP_TIME_WAIT) {
 359                inet_twsk_put(inet_twsk(sk));
 360                return;
 361        }
 362
 363        bh_lock_sock(sk);
 364        /* If too many ICMPs get dropped on busy
 365         * servers this needs to be solved differently.
 366         */
 367        if (sock_owned_by_user(sk))
 368                NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 369
 370        if (sk->sk_state == TCP_CLOSE)
 371                goto out;
 372
 373        tp = tcp_sk(sk);
 374        seq = ntohl(th->seq);
 375        if (sk->sk_state != TCP_LISTEN &&
 376            !between(seq, tp->snd_una, tp->snd_nxt)) {
 377                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 378                goto out;
 379        }
 380
 381        switch (type) {
 382        case ICMP_SOURCE_QUENCH:
 383                /* Just silently ignore these. */
 384                goto out;
 385        case ICMP_PARAMETERPROB:
 386                err = EPROTO;
 387                break;
 388        case ICMP_DEST_UNREACH:
 389                if (code > NR_ICMP_UNREACH)
 390                        goto out;
 391
 392                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 393                        if (!sock_owned_by_user(sk))
 394                                do_pmtu_discovery(sk, iph, info);
 395                        goto out;
 396                }
 397
 398                err = icmp_err_convert[code].errno;
 399                break;
 400        case ICMP_TIME_EXCEEDED:
 401                err = EHOSTUNREACH;
 402                break;
 403        default:
 404                goto out;
 405        }
 406
 407        switch (sk->sk_state) {
 408                struct request_sock *req, **prev;
 409        case TCP_LISTEN:
 410                if (sock_owned_by_user(sk))
 411                        goto out;
 412
 413                req = inet_csk_search_req(sk, &prev, th->dest,
 414                                          iph->daddr, iph->saddr);
 415                if (!req)
 416                        goto out;
 417
 418                /* ICMPs are not backlogged, hence we cannot get
 419                   an established socket here.
 420                 */
 421                WARN_ON(req->sk);
 422
 423                if (seq != tcp_rsk(req)->snt_isn) {
 424                        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 425                        goto out;
 426                }
 427
 428                /*
 429                 * Still in SYN_RECV, just remove it silently.
 430                 * There is no good way to pass the error to the newly
 431                 * created socket, and POSIX does not want network
 432                 * errors returned from accept().
 433                 */
 434                inet_csk_reqsk_queue_drop(sk, req, prev);
 435                goto out;
 436
 437        case TCP_SYN_SENT:
 438        case TCP_SYN_RECV:  /* Cannot happen.
  439                               It can, for example, if SYNs crossed.
 440                             */
 441                if (!sock_owned_by_user(sk)) {
 442                        sk->sk_err = err;
 443
 444                        sk->sk_error_report(sk);
 445
 446                        tcp_done(sk);
 447                } else {
 448                        sk->sk_err_soft = err;
 449                }
 450                goto out;
 451        }
 452
 453        /* If we've already connected we will keep trying
 454         * until we time out, or the user gives up.
 455         *
  456         * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
  457         * to be considered hard errors (well, FRAG_FAILED too,
  458         * but it is obsoleted by pmtu discovery).
  459         *
  460         * Note that in the modern Internet, where routing is unreliable
  461         * and broken firewalls sit in every dark corner sending random
  462         * errors ordered by their masters, even these two messages finally
  463         * lose their original sense (even Linux sends invalid PORT_UNREACHs).
 464         *
 465         * Now we are in compliance with RFCs.
 466         *                                                      --ANK (980905)
 467         */
 468
 469        inet = inet_sk(sk);
 470        if (!sock_owned_by_user(sk) && inet->recverr) {
 471                sk->sk_err = err;
 472                sk->sk_error_report(sk);
 473        } else  { /* Only an error on timeout */
 474                sk->sk_err_soft = err;
 475        }
 476
 477out:
 478        bh_unlock_sock(sk);
 479        sock_put(sk);
 480}
 481
 482/* This routine computes an IPv4 TCP checksum. */
 483void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
 484{
 485        struct inet_sock *inet = inet_sk(sk);
 486        struct tcphdr *th = tcp_hdr(skb);
 487
 488        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 489                th->check = ~tcp_v4_check(len, inet->saddr,
 490                                          inet->daddr, 0);
 491                skb->csum_start = skb_transport_header(skb) - skb->head;
 492                skb->csum_offset = offsetof(struct tcphdr, check);
 493        } else {
 494                th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
 495                                         csum_partial((char *)th,
 496                                                      th->doff << 2,
 497                                                      skb->csum));
 498        }
 499}
 500
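/* Prepare the checksum of an outgoing (GSO) segment: store the pseudo-header
 * sum in th->check and record where the final checksum must be written
 * (CHECKSUM_PARTIAL).
 */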
 501int tcp_v4_gso_send_check(struct sk_buff *skb)
 502{
 503        const struct iphdr *iph;
 504        struct tcphdr *th;
 505
 506        if (!pskb_may_pull(skb, sizeof(*th)))
 507                return -EINVAL;
 508
 509        iph = ip_hdr(skb);
 510        th = tcp_hdr(skb);
 511
 512        th->check = 0;
 513        th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
 514        skb->csum_start = skb_transport_header(skb) - skb->head;
 515        skb->csum_offset = offsetof(struct tcphdr, check);
 516        skb->ip_summed = CHECKSUM_PARTIAL;
 517        return 0;
 518}
 519
 520/*
 521 *      This routine will send an RST to the other tcp.
 522 *
  523 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
  524 *                    for the reset?
  525 *      Answer: if a packet caused the RST, it is not for a socket
  526 *              existing in our system; if it is matched to a socket,
  527 *              it is just a duplicate segment or a bug in the other side's TCP.
  528 *              So we build the reply based only on parameters
  529 *              that arrived with the segment.
 530 *      Exception: precedence violation. We do not implement it in any case.
 531 */
 532
 533static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 534{
 535        struct tcphdr *th = tcp_hdr(skb);
 536        struct {
 537                struct tcphdr th;
 538#ifdef CONFIG_TCP_MD5SIG
 539                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 540#endif
 541        } rep;
 542        struct ip_reply_arg arg;
 543#ifdef CONFIG_TCP_MD5SIG
 544        struct tcp_md5sig_key *key;
 545#endif
 546        struct net *net;
 547
 548        /* Never send a reset in response to a reset. */
 549        if (th->rst)
 550                return;
 551
 552        if (skb->rtable->rt_type != RTN_LOCAL)
 553                return;
 554
 555        /* Swap the send and the receive. */
 556        memset(&rep, 0, sizeof(rep));
 557        rep.th.dest   = th->source;
 558        rep.th.source = th->dest;
 559        rep.th.doff   = sizeof(struct tcphdr) / 4;
 560        rep.th.rst    = 1;
 561
 562        if (th->ack) {
 563                rep.th.seq = th->ack_seq;
 564        } else {
 565                rep.th.ack = 1;
 566                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 567                                       skb->len - (th->doff << 2));
 568        }
 569
 570        memset(&arg, 0, sizeof(arg));
 571        arg.iov[0].iov_base = (unsigned char *)&rep;
 572        arg.iov[0].iov_len  = sizeof(rep.th);
 573
 574#ifdef CONFIG_TCP_MD5SIG
 575        key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
 576        if (key) {
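                /* MD5 option layout in rep.opt[]: two NOP bytes for alignment,
                 * then kind (TCPOPT_MD5SIG) and length (TCPOLEN_MD5SIG),
                 * followed by the 16-byte digest written into rep.opt[1..4]
                 * by tcp_v4_md5_hash_hdr() below.
                 */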
 577                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 578                                   (TCPOPT_NOP << 16) |
 579                                   (TCPOPT_MD5SIG << 8) |
 580                                   TCPOLEN_MD5SIG);
 581                /* Update length and the length the header thinks exists */
 582                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 583                rep.th.doff = arg.iov[0].iov_len / 4;
 584
 585                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 586                                     key, ip_hdr(skb)->daddr,
 587                                     ip_hdr(skb)->saddr, &rep.th);
 588        }
 589#endif
 590        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 591                                      ip_hdr(skb)->saddr, /* XXX */
 592                                      sizeof(struct tcphdr), IPPROTO_TCP, 0);
 593        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 594
 595        net = dev_net(skb->dst->dev);
 596        ip_send_reply(net->ipv4.tcp_sock, skb,
 597                      &arg, arg.iov[0].iov_len);
 598
 599        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 600        TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 601}
 602
  603/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
  604   outside of socket context, is certainly ugly. What can I do?
 605 */
 606
 607static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 608                            u32 win, u32 ts, int oif,
 609                            struct tcp_md5sig_key *key)
 610{
 611        struct tcphdr *th = tcp_hdr(skb);
 612        struct {
 613                struct tcphdr th;
 614                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 615#ifdef CONFIG_TCP_MD5SIG
 616                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 617#endif
 618                        ];
 619        } rep;
 620        struct ip_reply_arg arg;
 621        struct net *net = dev_net(skb->dst->dev);
 622
 623        memset(&rep.th, 0, sizeof(struct tcphdr));
 624        memset(&arg, 0, sizeof(arg));
 625
 626        arg.iov[0].iov_base = (unsigned char *)&rep;
 627        arg.iov[0].iov_len  = sizeof(rep.th);
 628        if (ts) {
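                /* Timestamp option layout in rep.opt[]: two NOP bytes for
                 * alignment, then kind (TCPOPT_TIMESTAMP) and length
                 * (TCPOLEN_TIMESTAMP), followed by TSval (our clock) and
                 * TSecr (the peer's value passed in as "ts").
                 */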
 629                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 630                                   (TCPOPT_TIMESTAMP << 8) |
 631                                   TCPOLEN_TIMESTAMP);
 632                rep.opt[1] = htonl(tcp_time_stamp);
 633                rep.opt[2] = htonl(ts);
 634                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 635        }
 636
 637        /* Swap the send and the receive. */
 638        rep.th.dest    = th->source;
 639        rep.th.source  = th->dest;
 640        rep.th.doff    = arg.iov[0].iov_len / 4;
 641        rep.th.seq     = htonl(seq);
 642        rep.th.ack_seq = htonl(ack);
 643        rep.th.ack     = 1;
 644        rep.th.window  = htons(win);
 645
 646#ifdef CONFIG_TCP_MD5SIG
 647        if (key) {
 648                int offset = (ts) ? 3 : 0;
 649
 650                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 651                                          (TCPOPT_NOP << 16) |
 652                                          (TCPOPT_MD5SIG << 8) |
 653                                          TCPOLEN_MD5SIG);
 654                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 655                rep.th.doff = arg.iov[0].iov_len/4;
 656
 657                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 658                                    key, ip_hdr(skb)->saddr,
 659                                    ip_hdr(skb)->daddr, &rep.th);
 660        }
 661#endif
 662        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 663                                      ip_hdr(skb)->saddr, /* XXX */
 664                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 665        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 666        if (oif)
 667                arg.bound_dev_if = oif;
 668
 669        ip_send_reply(net->ipv4.tcp_sock, skb,
 670                      &arg, arg.iov[0].iov_len);
 671
 672        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 673}
 674
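/* Acknowledge a segment that arrived for a TIME-WAIT socket, using the
 * sequence numbers, window and timestamp remembered in the timewait bucket.
 */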
 675static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 676{
 677        struct inet_timewait_sock *tw = inet_twsk(sk);
 678        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 679
 680        tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 681                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 682                        tcptw->tw_ts_recent,
 683                        tw->tw_bound_dev_if,
 684                        tcp_twsk_md5_key(tcptw)
 685                        );
 686
 687        inet_twsk_put(tw);
 688}
 689
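/* Acknowledge a segment for a connection still in SYN-RECV, building the ACK
 * from the initial sequence numbers and window stored in the request_sock.
 */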
 690static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 691                                  struct request_sock *req)
 692{
 693        tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 694                        tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 695                        req->ts_recent,
 696                        0,
 697                        tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr));
 698}
 699
 700/*
 701 *      Send a SYN-ACK after having received a SYN.
 702 *      This still operates on a request_sock only, not on a big
 703 *      socket.
 704 */
 705static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
 706                                struct dst_entry *dst)
 707{
 708        const struct inet_request_sock *ireq = inet_rsk(req);
 709        int err = -1;
 710        struct sk_buff * skb;
 711
 712        /* First, grab a route. */
 713        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 714                return -1;
 715
 716        skb = tcp_make_synack(sk, dst, req);
 717
 718        if (skb) {
 719                struct tcphdr *th = tcp_hdr(skb);
 720
 721                th->check = tcp_v4_check(skb->len,
 722                                         ireq->loc_addr,
 723                                         ireq->rmt_addr,
 724                                         csum_partial((char *)th, skb->len,
 725                                                      skb->csum));
 726
 727                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 728                                            ireq->rmt_addr,
 729                                            ireq->opt);
 730                err = net_xmit_eval(err);
 731        }
 732
 733        dst_release(dst);
 734        return err;
 735}
 736
 737static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
 738{
 739        return __tcp_v4_send_synack(sk, req, NULL);
 740}
 741
 742/*
 743 *      IPv4 request_sock destructor.
 744 */
 745static void tcp_v4_reqsk_destructor(struct request_sock *req)
 746{
 747        kfree(inet_rsk(req)->opt);
 748}
 749
 750#ifdef CONFIG_SYN_COOKIES
 751static void syn_flood_warning(struct sk_buff *skb)
 752{
 753        static unsigned long warntime;
 754
 755        if (time_after(jiffies, (warntime + HZ * 60))) {
 756                warntime = jiffies;
 757                printk(KERN_INFO
 758                       "possible SYN flooding on port %d. Sending cookies.\n",
 759                       ntohs(tcp_hdr(skb)->dest));
 760        }
 761}
 762#endif
 763
 764/*
 765 * Save and compile IPv4 options into the request_sock if needed.
 766 */
 767static struct ip_options *tcp_v4_save_options(struct sock *sk,
 768                                              struct sk_buff *skb)
 769{
 770        struct ip_options *opt = &(IPCB(skb)->opt);
 771        struct ip_options *dopt = NULL;
 772
 773        if (opt && opt->optlen) {
 774                int opt_size = optlength(opt);
 775                dopt = kmalloc(opt_size, GFP_ATOMIC);
 776                if (dopt) {
 777                        if (ip_options_echo(dopt, skb)) {
 778                                kfree(dopt);
 779                                dopt = NULL;
 780                        }
 781                }
 782        }
 783        return dopt;
 784}
 785
 786#ifdef CONFIG_TCP_MD5SIG
 787/*
 788 * RFC2385 MD5 checksumming requires a mapping of
 789 * IP address->MD5 Key.
 790 * We need to maintain these in the sk structure.
 791 */
 792
 793/* Find the Key structure for an address.  */
 794static struct tcp_md5sig_key *
 795                        tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 796{
 797        struct tcp_sock *tp = tcp_sk(sk);
 798        int i;
 799
 800        if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 801                return NULL;
 802        for (i = 0; i < tp->md5sig_info->entries4; i++) {
 803                if (tp->md5sig_info->keys4[i].addr == addr)
 804                        return &tp->md5sig_info->keys4[i].base;
 805        }
 806        return NULL;
 807}
 808
 809struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 810                                         struct sock *addr_sk)
 811{
 812        return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
 813}
 814
 815EXPORT_SYMBOL(tcp_v4_md5_lookup);
 816
 817static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 818                                                      struct request_sock *req)
 819{
 820        return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 821}
 822
 823/* This can be called on a newly created socket, from other files */
 824int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 825                      u8 *newkey, u8 newkeylen)
 826{
 827        /* Add Key to the list */
 828        struct tcp_md5sig_key *key;
 829        struct tcp_sock *tp = tcp_sk(sk);
 830        struct tcp4_md5sig_key *keys;
 831
 832        key = tcp_v4_md5_do_lookup(sk, addr);
 833        if (key) {
 834                /* Pre-existing entry - just update that one. */
 835                kfree(key->key);
 836                key->key = newkey;
 837                key->keylen = newkeylen;
 838        } else {
 839                struct tcp_md5sig_info *md5sig;
 840
 841                if (!tp->md5sig_info) {
 842                        tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 843                                                  GFP_ATOMIC);
 844                        if (!tp->md5sig_info) {
 845                                kfree(newkey);
 846                                return -ENOMEM;
 847                        }
 848                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 849                }
 850                if (tcp_alloc_md5sig_pool() == NULL) {
 851                        kfree(newkey);
 852                        return -ENOMEM;
 853                }
 854                md5sig = tp->md5sig_info;
 855
 856                if (md5sig->alloced4 == md5sig->entries4) {
 857                        keys = kmalloc((sizeof(*keys) *
 858                                        (md5sig->entries4 + 1)), GFP_ATOMIC);
 859                        if (!keys) {
 860                                kfree(newkey);
 861                                tcp_free_md5sig_pool();
 862                                return -ENOMEM;
 863                        }
 864
 865                        if (md5sig->entries4)
 866                                memcpy(keys, md5sig->keys4,
 867                                       sizeof(*keys) * md5sig->entries4);
 868
 869                        /* Free old key list, and reference new one */
 870                        kfree(md5sig->keys4);
 871                        md5sig->keys4 = keys;
 872                        md5sig->alloced4++;
 873                }
 874                md5sig->entries4++;
 875                md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 876                md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 877                md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 878        }
 879        return 0;
 880}
 881
 882EXPORT_SYMBOL(tcp_v4_md5_do_add);
 883
 884static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 885                               u8 *newkey, u8 newkeylen)
 886{
 887        return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
 888                                 newkey, newkeylen);
 889}
 890
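/* Delete the MD5 key for the given peer address: free the key, compact the
 * per-socket key array if the entry was not the last one, and drop our
 * reference on the md5sig pool. Returns -ENOENT if no key matches.
 */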
 891int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 892{
 893        struct tcp_sock *tp = tcp_sk(sk);
 894        int i;
 895
 896        for (i = 0; i < tp->md5sig_info->entries4; i++) {
 897                if (tp->md5sig_info->keys4[i].addr == addr) {
 898                        /* Free the key */
 899                        kfree(tp->md5sig_info->keys4[i].base.key);
 900                        tp->md5sig_info->entries4--;
 901
 902                        if (tp->md5sig_info->entries4 == 0) {
 903                                kfree(tp->md5sig_info->keys4);
 904                                tp->md5sig_info->keys4 = NULL;
 905                                tp->md5sig_info->alloced4 = 0;
 906                        } else if (tp->md5sig_info->entries4 != i) {
  907                                /* Compact the array: shift the remaining entries down */
 908                                memmove(&tp->md5sig_info->keys4[i],
 909                                        &tp->md5sig_info->keys4[i+1],
 910                                        (tp->md5sig_info->entries4 - i) *
 911                                         sizeof(struct tcp4_md5sig_key));
 912                        }
 913                        tcp_free_md5sig_pool();
 914                        return 0;
 915                }
 916        }
 917        return -ENOENT;
 918}
 919
 920EXPORT_SYMBOL(tcp_v4_md5_do_del);
 921
 922static void tcp_v4_clear_md5_list(struct sock *sk)
 923{
 924        struct tcp_sock *tp = tcp_sk(sk);
 925
  926        /* Free each key, then the key array itself,
 927         * the crypto element, and then decrement our
 928         * hold on the last resort crypto.
 929         */
 930        if (tp->md5sig_info->entries4) {
 931                int i;
 932                for (i = 0; i < tp->md5sig_info->entries4; i++)
 933                        kfree(tp->md5sig_info->keys4[i].base.key);
 934                tp->md5sig_info->entries4 = 0;
 935                tcp_free_md5sig_pool();
 936        }
 937        if (tp->md5sig_info->keys4) {
 938                kfree(tp->md5sig_info->keys4);
 939                tp->md5sig_info->keys4 = NULL;
 940                tp->md5sig_info->alloced4  = 0;
 941        }
 942}
 943
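/* setsockopt(TCP_MD5SIG) handler: copy the request from user space; an empty
 * key deletes the entry for the given peer address, otherwise the key is
 * duplicated and installed via tcp_v4_md5_do_add().
 */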
 944static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 945                                 int optlen)
 946{
 947        struct tcp_md5sig cmd;
 948        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
 949        u8 *newkey;
 950
 951        if (optlen < sizeof(cmd))
 952                return -EINVAL;
 953
 954        if (copy_from_user(&cmd, optval, sizeof(cmd)))
 955                return -EFAULT;
 956
 957        if (sin->sin_family != AF_INET)
 958                return -EINVAL;
 959
 960        if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
 961                if (!tcp_sk(sk)->md5sig_info)
 962                        return -ENOENT;
 963                return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
 964        }
 965
 966        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
 967                return -EINVAL;
 968
 969        if (!tcp_sk(sk)->md5sig_info) {
 970                struct tcp_sock *tp = tcp_sk(sk);
 971                struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
 972
 973                if (!p)
 974                        return -EINVAL;
 975
 976                tp->md5sig_info = p;
 977                sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 978        }
 979
 980        newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
 981        if (!newkey)
 982                return -ENOMEM;
 983        return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
 984                                 newkey, cmd.tcpm_keylen);
 985}
 986
 987static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
 988                                        __be32 daddr, __be32 saddr, int nbytes)
 989{
 990        struct tcp4_pseudohdr *bp;
 991        struct scatterlist sg;
 992
 993        bp = &hp->md5_blk.ip4;
 994
 995        /*
 996         * 1. the TCP pseudo-header (in the order: source IP address,
 997         * destination IP address, zero-padded protocol number, and
 998         * segment length)
 999         */
1000        bp->saddr = saddr;
1001        bp->daddr = daddr;
1002        bp->pad = 0;
1003        bp->protocol = IPPROTO_TCP;
1004        bp->len = cpu_to_be16(nbytes);
1005
1006        sg_init_one(&sg, bp, sizeof(*bp));
1007        return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1008}
1009
1010static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1011                               __be32 daddr, __be32 saddr, struct tcphdr *th)
1012{
1013        struct tcp_md5sig_pool *hp;
1014        struct hash_desc *desc;
1015
1016        hp = tcp_get_md5sig_pool();
1017        if (!hp)
1018                goto clear_hash_noput;
1019        desc = &hp->md5_desc;
1020
1021        if (crypto_hash_init(desc))
1022                goto clear_hash;
1023        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1024                goto clear_hash;
1025        if (tcp_md5_hash_header(hp, th))
1026                goto clear_hash;
1027        if (tcp_md5_hash_key(hp, key))
1028                goto clear_hash;
1029        if (crypto_hash_final(desc, md5_hash))
1030                goto clear_hash;
1031
1032        tcp_put_md5sig_pool();
1033        return 0;
1034
1035clear_hash:
1036        tcp_put_md5sig_pool();
1037clear_hash_noput:
1038        memset(md5_hash, 0, 16);
1039        return 1;
1040}
1041
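/* Compute the RFC 2385 MD5 signature over a whole segment. The pseudo-header
 * addresses are taken from the full socket, the request_sock, or, failing
 * both, from the IP header of the skb itself.
 */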
1042int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1043                        struct sock *sk, struct request_sock *req,
1044                        struct sk_buff *skb)
1045{
1046        struct tcp_md5sig_pool *hp;
1047        struct hash_desc *desc;
1048        struct tcphdr *th = tcp_hdr(skb);
1049        __be32 saddr, daddr;
1050
1051        if (sk) {
1052                saddr = inet_sk(sk)->saddr;
1053                daddr = inet_sk(sk)->daddr;
1054        } else if (req) {
1055                saddr = inet_rsk(req)->loc_addr;
1056                daddr = inet_rsk(req)->rmt_addr;
1057        } else {
1058                const struct iphdr *iph = ip_hdr(skb);
1059                saddr = iph->saddr;
1060                daddr = iph->daddr;
1061        }
1062
1063        hp = tcp_get_md5sig_pool();
1064        if (!hp)
1065                goto clear_hash_noput;
1066        desc = &hp->md5_desc;
1067
1068        if (crypto_hash_init(desc))
1069                goto clear_hash;
1070
1071        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1072                goto clear_hash;
1073        if (tcp_md5_hash_header(hp, th))
1074                goto clear_hash;
1075        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1076                goto clear_hash;
1077        if (tcp_md5_hash_key(hp, key))
1078                goto clear_hash;
1079        if (crypto_hash_final(desc, md5_hash))
1080                goto clear_hash;
1081
1082        tcp_put_md5sig_pool();
1083        return 0;
1084
1085clear_hash:
1086        tcp_put_md5sig_pool();
1087clear_hash_noput:
1088        memset(md5_hash, 0, 16);
1089        return 1;
1090}
1091
1092EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1093
1094static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1095{
1096        /*
1097         * This gets called for each TCP segment that arrives
1098         * so we want to be efficient.
1099         * We have 3 drop cases:
1100         * o No MD5 hash and one expected.
1101         * o MD5 hash and we're not expecting one.
 1102         * o MD5 hash and it's wrong.
1103         */
1104        __u8 *hash_location = NULL;
1105        struct tcp_md5sig_key *hash_expected;
1106        const struct iphdr *iph = ip_hdr(skb);
1107        struct tcphdr *th = tcp_hdr(skb);
1108        int genhash;
1109        unsigned char newhash[16];
1110
1111        hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1112        hash_location = tcp_parse_md5sig_option(th);
1113
1114        /* We've parsed the options - do we have a hash? */
1115        if (!hash_expected && !hash_location)
1116                return 0;
1117
1118        if (hash_expected && !hash_location) {
1119                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1120                return 1;
1121        }
1122
1123        if (!hash_expected && hash_location) {
1124                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1125                return 1;
1126        }
1127
 1128        /* Okay, so we have both hash_expected and hash_location -
 1129         * we need to calculate the hash and compare.
1130         */
1131        genhash = tcp_v4_md5_hash_skb(newhash,
1132                                      hash_expected,
1133                                      NULL, NULL, skb);
1134
1135        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1136                if (net_ratelimit()) {
1137                        printk(KERN_INFO "MD5 Hash failed for "
1138                               "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1139                               NIPQUAD(iph->saddr), ntohs(th->source),
1140                               NIPQUAD(iph->daddr), ntohs(th->dest),
1141                               genhash ? " tcp_v4_calc_md5_hash failed" : "");
1142                }
1143                return 1;
1144        }
1145        return 0;
1146}
1147
1148#endif
1149
1150struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1151        .family         =       PF_INET,
1152        .obj_size       =       sizeof(struct tcp_request_sock),
1153        .rtx_syn_ack    =       tcp_v4_send_synack,
1154        .send_ack       =       tcp_v4_reqsk_send_ack,
1155        .destructor     =       tcp_v4_reqsk_destructor,
1156        .send_reset     =       tcp_v4_send_reset,
1157};
1158
1159#ifdef CONFIG_TCP_MD5SIG
1160static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1161        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1162};
1163#endif
1164
1165static struct timewait_sock_ops tcp_timewait_sock_ops = {
1166        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1167        .twsk_unique    = tcp_twsk_unique,
1168        .twsk_destructor= tcp_twsk_destructor,
1169};
1170
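/* Handle a SYN arriving on a listening socket: allocate a request_sock, parse
 * the TCP options, fall back to SYN cookies if the SYN queue is full (and
 * syncookies are enabled), choose an ISN and send the SYN-ACK.
 */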
1171int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1172{
1173        struct inet_request_sock *ireq;
1174        struct tcp_options_received tmp_opt;
1175        struct request_sock *req;
1176        __be32 saddr = ip_hdr(skb)->saddr;
1177        __be32 daddr = ip_hdr(skb)->daddr;
1178        __u32 isn = TCP_SKB_CB(skb)->when;
1179        struct dst_entry *dst = NULL;
1180#ifdef CONFIG_SYN_COOKIES
1181        int want_cookie = 0;
1182#else
1183#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1184#endif
1185
 1186        /* Never answer SYNs sent to broadcast or multicast addresses */
1187        if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1188                goto drop;
1189
 1190        /* TW buckets are converted to open requests without
 1191         * limitation: they conserve resources and the peer is
 1192         * evidently a real one.
1193         */
1194        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1195#ifdef CONFIG_SYN_COOKIES
1196                if (sysctl_tcp_syncookies) {
1197                        want_cookie = 1;
1198                } else
1199#endif
1200                goto drop;
1201        }
1202
 1203        /* The accept backlog is full. If we have already queued enough
 1204         * warm entries in the syn queue, drop the request. It is better than
 1205         * clogging the syn queue with openreqs with exponentially increasing
 1206         * timeouts.
1207         */
1208        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1209                goto drop;
1210
1211        req = inet_reqsk_alloc(&tcp_request_sock_ops);
1212        if (!req)
1213                goto drop;
1214
1215#ifdef CONFIG_TCP_MD5SIG
1216        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1217#endif
1218
1219        tcp_clear_options(&tmp_opt);
1220        tmp_opt.mss_clamp = 536;
1221        tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1222
1223        tcp_parse_options(skb, &tmp_opt, 0);
1224
1225        if (want_cookie && !tmp_opt.saw_tstamp)
1226                tcp_clear_options(&tmp_opt);
1227
1228        if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
 1229                /* Some OSes (unknown ones, but I see them on a web server that
 1230                 * contains information interesting only to Windows
 1231                 * users) do not send their timestamp in the SYN. It is an easy
 1232                 * case: we simply do not advertise TS support.
 1233                 */
1234                tmp_opt.saw_tstamp = 0;
1235                tmp_opt.tstamp_ok  = 0;
1236        }
1237        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1238
1239        tcp_openreq_init(req, &tmp_opt, skb);
1240
1241        if (security_inet_conn_request(sk, skb, req))
1242                goto drop_and_free;
1243
1244        ireq = inet_rsk(req);
1245        ireq->loc_addr = daddr;
1246        ireq->rmt_addr = saddr;
1247        ireq->opt = tcp_v4_save_options(sk, skb);
1248        if (!want_cookie)
1249                TCP_ECN_create_request(req, tcp_hdr(skb));
1250
1251        if (want_cookie) {
1252#ifdef CONFIG_SYN_COOKIES
1253                syn_flood_warning(skb);
1254                req->cookie_ts = tmp_opt.tstamp_ok;
1255#endif
1256                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1257        } else if (!isn) {
1258                struct inet_peer *peer = NULL;
1259
 1260                /* VJ's idea. We save the last timestamp seen
 1261                 * from the destination in the peer table when entering
 1262                 * the TIME-WAIT state, and check against it before
 1263                 * accepting a new connection request.
 1264                 *
 1265                 * If "isn" is not zero, this request hit an alive
 1266                 * timewait bucket, so all the necessary checks
 1267                 * are made in the function processing the timewait state.
1268                 */
1269                if (tmp_opt.saw_tstamp &&
1270                    tcp_death_row.sysctl_tw_recycle &&
1271                    (dst = inet_csk_route_req(sk, req)) != NULL &&
1272                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1273                    peer->v4daddr == saddr) {
1274                        if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1275                            (s32)(peer->tcp_ts - req->ts_recent) >
1276                                                        TCP_PAWS_WINDOW) {
1277                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1278                                goto drop_and_release;
1279                        }
1280                }
1281                /* Kill the following clause, if you dislike this way. */
1282                else if (!sysctl_tcp_syncookies &&
1283                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1284                          (sysctl_max_syn_backlog >> 2)) &&
1285                         (!peer || !peer->tcp_ts_stamp) &&
1286                         (!dst || !dst_metric(dst, RTAX_RTT))) {
 1287                        /* Without syncookies, the last quarter of the
 1288                         * backlog is filled only with destinations
 1289                         * proven to be alive.
 1290                         * It means that we continue to communicate
 1291                         * with destinations already remembered
 1292                         * at the moment of the synflood.
1293                         */
1294                        LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1295                                       "request from " NIPQUAD_FMT "/%u\n",
1296                                       NIPQUAD(saddr),
1297                                       ntohs(tcp_hdr(skb)->source));
1298                        goto drop_and_release;
1299                }
1300
1301                isn = tcp_v4_init_sequence(skb);
1302        }
1303        tcp_rsk(req)->snt_isn = isn;
1304
1305        if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1306                goto drop_and_free;
1307
1308        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1309        return 0;
1310
1311drop_and_release:
1312        dst_release(dst);
1313drop_and_free:
1314        reqsk_free(req);
1315drop:
1316        return 0;
1317}
1318
1319
1320/*
1321 * The three way handshake has completed - we got a valid synack -
1322 * now create the new socket.
1323 */
1324struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1325                                  struct request_sock *req,
1326                                  struct dst_entry *dst)
1327{
1328        struct inet_request_sock *ireq;
1329        struct inet_sock *newinet;
1330        struct tcp_sock *newtp;
1331        struct sock *newsk;
1332#ifdef CONFIG_TCP_MD5SIG
1333        struct tcp_md5sig_key *key;
1334#endif
1335
1336        if (sk_acceptq_is_full(sk))
1337                goto exit_overflow;
1338
1339        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1340                goto exit;
1341
1342        newsk = tcp_create_openreq_child(sk, req, skb);
1343        if (!newsk)
1344                goto exit;
1345
1346        newsk->sk_gso_type = SKB_GSO_TCPV4;
1347        sk_setup_caps(newsk, dst);
1348
1349        newtp                 = tcp_sk(newsk);
1350        newinet               = inet_sk(newsk);
1351        ireq                  = inet_rsk(req);
1352        newinet->daddr        = ireq->rmt_addr;
1353        newinet->rcv_saddr    = ireq->loc_addr;
1354        newinet->saddr        = ireq->loc_addr;
1355        newinet->opt          = ireq->opt;
1356        ireq->opt             = NULL;
1357        newinet->mc_index     = inet_iif(skb);
1358        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1359        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1360        if (newinet->opt)
1361                inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1362        newinet->id = newtp->write_seq ^ jiffies;
1363
1364        tcp_mtup_init(newsk);
1365        tcp_sync_mss(newsk, dst_mtu(dst));
1366        newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1367        if (tcp_sk(sk)->rx_opt.user_mss &&
1368            tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1369                newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1370
1371        tcp_initialize_rcv_mss(newsk);
1372
1373#ifdef CONFIG_TCP_MD5SIG
1374        /* Copy over the MD5 key from the original socket */
1375        if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1376                /*
1377                 * We're using one, so create a matching key
1378                 * on the newsk structure. If we fail to get
1379                 * memory, then we end up not copying the key
1380                 * across. Shucks.
1381                 */
1382                char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1383                if (newkey != NULL)
1384                        tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1385                                          newkey, key->keylen);
1386                newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1387        }
1388#endif
1389
1390        __inet_hash_nolisten(newsk);
1391        __inet_inherit_port(sk, newsk);
1392
1393        return newsk;
1394
1395exit_overflow:
1396        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1397exit:
1398        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1399        dst_release(dst);
1400        return NULL;
1401}
1402
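/* Find the owner of a segment arriving on a listening socket: first the SYN
 * queue of pending request_socks, then the established hash, and finally, with
 * syncookies, a bare ACK may be validated against a previously sent cookie.
 */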
1403static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1404{
1405        struct tcphdr *th = tcp_hdr(skb);
1406        const struct iphdr *iph = ip_hdr(skb);
1407        struct sock *nsk;
1408        struct request_sock **prev;
1409        /* Find possible connection requests. */
1410        struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1411                                                       iph->saddr, iph->daddr);
1412        if (req)
1413                return tcp_check_req(sk, skb, req, prev);
1414
1415        nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1416                        th->source, iph->daddr, th->dest, inet_iif(skb));
1417
1418        if (nsk) {
1419                if (nsk->sk_state != TCP_TIME_WAIT) {
1420                        bh_lock_sock(nsk);
1421                        return nsk;
1422                }
1423                inet_twsk_put(inet_twsk(nsk));
1424                return NULL;
1425        }
1426
1427#ifdef CONFIG_SYN_COOKIES
1428        if (!th->rst && !th->syn && th->ack)
1429                sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1430#endif
1431        return sk;
1432}
1433
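/* Verify the checksum of an incoming segment when the hardware has already
 * summed the payload (CHECKSUM_COMPLETE); otherwise seed skb->csum with the
 * pseudo-header sum, checking short packets (<= 76 bytes) immediately and
 * deferring the rest until the data is actually touched.
 */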
1434static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1435{
1436        const struct iphdr *iph = ip_hdr(skb);
1437
1438        if (skb->ip_summed == CHECKSUM_COMPLETE) {
1439                if (!tcp_v4_check(skb->len, iph->saddr,
1440                                  iph->daddr, skb->csum)) {
1441                        skb->ip_summed = CHECKSUM_UNNECESSARY;
1442                        return 0;
1443                }
1444        }
1445
1446        skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1447                                       skb->len, IPPROTO_TCP, 0);
1448
1449        if (skb->len <= 76) {
1450                return __skb_checksum_complete(skb);
1451        }
1452        return 0;
1453}
1454
1455
 1456/* The socket must have its spinlock held when we get
1457 * here.
1458 *
1459 * We have a potential double-lock case here, so even when
1460 * doing backlog processing we use the BH locking scheme.
1461 * This is because we cannot sleep with the original spinlock
1462 * held.
1463 */
1464int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1465{
1466        struct sock *rsk;
1467#ifdef CONFIG_TCP_MD5SIG
1468        /*
1469         * We really want to reject the packet as early as possible
1470         * if:
1471         *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1472         *  o There is an MD5 option and we're not expecting one
1473         */
1474        if (tcp_v4_inbound_md5_hash(sk, skb))
1475                goto discard;
1476#endif
1477
1478        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1479                TCP_CHECK_TIMER(sk);
1480                if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1481                        rsk = sk;
1482                        goto reset;
1483                }
1484                TCP_CHECK_TIMER(sk);
1485                return 0;
1486        }
1487
1488        if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1489                goto csum_err;
1490
1491        if (sk->sk_state == TCP_LISTEN) {
1492                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1493                if (!nsk)
1494                        goto discard;
1495
1496                if (nsk != sk) {
1497                        if (tcp_child_process(sk, nsk, skb)) {
1498                                rsk = nsk;
1499                                goto reset;
1500                        }
1501                        return 0;
1502                }
1503        }
1504
1505        TCP_CHECK_TIMER(sk);
1506        if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1507                rsk = sk;
1508                goto reset;
1509        }
1510        TCP_CHECK_TIMER(sk);
1511        return 0;
1512
1513reset:
1514        tcp_v4_send_reset(rsk, skb);
1515discard:
1516        kfree_skb(skb);
1517        /* Be careful here. If this function gets more complicated and
1518         * gcc suffers from register pressure on the x86, sk (in %ebx)
1519         * might be destroyed here. This current version compiles correctly,
1520         * but you have been warned.
1521         */
1522        return 0;
1523
1524csum_err:
1525        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1526        goto discard;
1527}
1528
1529/*
1530 *      From tcp_input.c
1531 */
1532
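    /* Main receive routine for IPv4 TCP segments, called from the IP layer.
     * Validates the header and checksum, fills in the TCP control block,
     * looks up the owning socket and then either processes the segment in
     * softirq context or queues it (prequeue/backlog) for the process
     * context that currently owns the socket.
     */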
1533int tcp_v4_rcv(struct sk_buff *skb)
1534{
1535        const struct iphdr *iph;
1536        struct tcphdr *th;
1537        struct sock *sk;
1538        int ret;
1539        struct net *net = dev_net(skb->dev);
1540
1541        if (skb->pkt_type != PACKET_HOST)
1542                goto discard_it;
1543
1544        /* Count it even if it's bad */
1545        TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1546
1547        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1548                goto discard_it;
1549
1550        th = tcp_hdr(skb);
1551
1552        if (th->doff < sizeof(struct tcphdr) / 4)
1553                goto bad_packet;
1554        if (!pskb_may_pull(skb, th->doff * 4))
1555                goto discard_it;
1556
1557        /* An explanation is required here, I think.
1558         * Packet length and doff are validated by header prediction,
1559         * provided the case of th->doff == 0 has been eliminated above.
1560         * So, we defer the checks. */
1561        if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1562                goto bad_packet;
1563
1564        th = tcp_hdr(skb);
1565        iph = ip_hdr(skb);
1566        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1567        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1568                                    skb->len - th->doff * 4);
1569        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1570        TCP_SKB_CB(skb)->when    = 0;
1571        TCP_SKB_CB(skb)->flags   = iph->tos;
1572        TCP_SKB_CB(skb)->sacked  = 0;
1573
1574        sk = __inet_lookup(net, &tcp_hashinfo, iph->saddr,
1575                        th->source, iph->daddr, th->dest, inet_iif(skb));
1576        if (!sk)
1577                goto no_tcp_socket;
1578
1579process:
1580        if (sk->sk_state == TCP_TIME_WAIT)
1581                goto do_time_wait;
1582
1583        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1584                goto discard_and_relse;
1585        nf_reset(skb);
1586
1587        if (sk_filter(sk, skb))
1588                goto discard_and_relse;
1589
1590        skb->dev = NULL;
1591
1592        bh_lock_sock_nested(sk);
1593        ret = 0;
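            /* If no user process holds the socket lock, handle the segment
             * right here in softirq context (optionally through the prequeue
             * or a NET_DMA copy channel); otherwise queue it on the backlog
             * to be processed when the socket lock is released.
             */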
1594        if (!sock_owned_by_user(sk)) {
1595#ifdef CONFIG_NET_DMA
1596                struct tcp_sock *tp = tcp_sk(sk);
1597                if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1598                        tp->ucopy.dma_chan = get_softnet_dma();
1599                if (tp->ucopy.dma_chan)
1600                        ret = tcp_v4_do_rcv(sk, skb);
1601                else
1602#endif
1603                {
1604                        if (!tcp_prequeue(sk, skb))
1605                                ret = tcp_v4_do_rcv(sk, skb);
1606                }
1607        } else
1608                sk_add_backlog(sk, skb);
1609        bh_unlock_sock(sk);
1610
1611        sock_put(sk);
1612
1613        return ret;
1614
1615no_tcp_socket:
1616        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1617                goto discard_it;
1618
1619        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1620bad_packet:
1621                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1622        } else {
1623                tcp_v4_send_reset(NULL, skb);
1624        }
1625
1626discard_it:
1627        /* Discard frame. */
1628        kfree_skb(skb);
1629        return 0;
1630
1631discard_and_relse:
1632        sock_put(sk);
1633        goto discard_it;
1634
1635do_time_wait:
1636        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1637                inet_twsk_put(inet_twsk(sk));
1638                goto discard_it;
1639        }
1640
1641        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1642                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1643                inet_twsk_put(inet_twsk(sk));
1644                goto discard_it;
1645        }
1646        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1647        case TCP_TW_SYN: {
1648                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1649                                                        &tcp_hashinfo,
1650                                                        iph->daddr, th->dest,
1651                                                        inet_iif(skb));
1652                if (sk2) {
1653                        inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1654                        inet_twsk_put(inet_twsk(sk));
1655                        sk = sk2;
1656                        goto process;
1657                }
1658                /* Fall through to ACK */
1659        }
1660        case TCP_TW_ACK:
1661                tcp_v4_timewait_ack(sk, skb);
1662                break;
1663        case TCP_TW_RST:
1664                goto no_tcp_socket;
1665        case TCP_TW_SUCCESS:;
1666        }
1667        goto discard_it;
1668}
1669
1670/* VJ's idea. Save the last timestamp seen from this destination and hold
1671 * it for at least the normal TIME-WAIT interval, so it can be used for
1672 * duplicate segment detection in subsequent connections before they enter
1673 * the synchronized state.
1674 */
1675
1676int tcp_v4_remember_stamp(struct sock *sk)
1677{
1678        struct inet_sock *inet = inet_sk(sk);
1679        struct tcp_sock *tp = tcp_sk(sk);
1680        struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1681        struct inet_peer *peer = NULL;
1682        int release_it = 0;
1683
1684        if (!rt || rt->rt_dst != inet->daddr) {
1685                peer = inet_getpeer(inet->daddr, 1);
1686                release_it = 1;
1687        } else {
1688                if (!rt->peer)
1689                        rt_bind_peer(rt, 1);
1690                peer = rt->peer;
1691        }
1692
1693        if (peer) {
1694                if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1695                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1696                     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1697                        peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1698                        peer->tcp_ts = tp->rx_opt.ts_recent;
1699                }
1700                if (release_it)
1701                        inet_putpeer(peer);
1702                return 1;
1703        }
1704
1705        return 0;
1706}
1707
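    /* As tcp_v4_remember_stamp(), but for a socket already in TIME_WAIT:
     * the peer entry is always looked up by destination address.
     */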
1708int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1709{
1710        struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1711
1712        if (peer) {
1713                const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1714
1715                if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1716                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1717                     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1718                        peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1719                        peer->tcp_ts       = tcptw->tw_ts_recent;
1720                }
1721                inet_putpeer(peer);
1722                return 1;
1723        }
1724
1725        return 0;
1726}
1727
1728struct inet_connection_sock_af_ops ipv4_specific = {
1729        .queue_xmit        = ip_queue_xmit,
1730        .send_check        = tcp_v4_send_check,
1731        .rebuild_header    = inet_sk_rebuild_header,
1732        .conn_request      = tcp_v4_conn_request,
1733        .syn_recv_sock     = tcp_v4_syn_recv_sock,
1734        .remember_stamp    = tcp_v4_remember_stamp,
1735        .net_header_len    = sizeof(struct iphdr),
1736        .setsockopt        = ip_setsockopt,
1737        .getsockopt        = ip_getsockopt,
1738        .addr2sockaddr     = inet_csk_addr2sockaddr,
1739        .sockaddr_len      = sizeof(struct sockaddr_in),
1740        .bind_conflict     = inet_csk_bind_conflict,
1741#ifdef CONFIG_COMPAT
1742        .compat_setsockopt = compat_ip_setsockopt,
1743        .compat_getsockopt = compat_ip_getsockopt,
1744#endif
1745};
1746
1747#ifdef CONFIG_TCP_MD5SIG
1748static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1749        .md5_lookup             = tcp_v4_md5_lookup,
1750        .calc_md5_hash          = tcp_v4_md5_hash_skb,
1751        .md5_add                = tcp_v4_md5_add_func,
1752        .md5_parse              = tcp_v4_parse_md5_keys,
1753};
1754#endif
1755
1756/* NOTE: A lot of things are set to zero explicitly by the call to
1757 *       sk_alloc(), so they need not be done here.
1758 */
1759static int tcp_v4_init_sock(struct sock *sk)
1760{
1761        struct inet_connection_sock *icsk = inet_csk(sk);
1762        struct tcp_sock *tp = tcp_sk(sk);
1763
1764        skb_queue_head_init(&tp->out_of_order_queue);
1765        tcp_init_xmit_timers(sk);
1766        tcp_prequeue_init(tp);
1767
1768        icsk->icsk_rto = TCP_TIMEOUT_INIT;
1769        tp->mdev = TCP_TIMEOUT_INIT;
1770
1771        /* So many TCP implementations out there (incorrectly) count the
1772         * initial SYN frame in their delayed-ACK and congestion control
1773         * algorithms that we must have the following bandaid to talk
1774         * efficiently to them.  -DaveM
1775         */
1776        tp->snd_cwnd = 2;
1777
1778        /* See draft-stevens-tcpca-spec-01 for discussion of the
1779         * initialization of these values.
1780         */
1781        tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1782        tp->snd_cwnd_clamp = ~0;
1783        tp->mss_cache = 536;
1784
1785        tp->reordering = sysctl_tcp_reordering;
1786        icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1787
1788        sk->sk_state = TCP_CLOSE;
1789
1790        sk->sk_write_space = sk_stream_write_space;
1791        sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1792
1793        icsk->icsk_af_ops = &ipv4_specific;
1794        icsk->icsk_sync_mss = tcp_sync_mss;
1795#ifdef CONFIG_TCP_MD5SIG
1796        tp->af_specific = &tcp_sock_ipv4_specific;
1797#endif
1798
1799        sk->sk_sndbuf = sysctl_tcp_wmem[1];
1800        sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1801
1802        atomic_inc(&tcp_sockets_allocated);
1803
1804        return 0;
1805}
1806
1807void tcp_v4_destroy_sock(struct sock *sk)
1808{
1809        struct tcp_sock *tp = tcp_sk(sk);
1810
1811        tcp_clear_xmit_timers(sk);
1812
1813        tcp_cleanup_congestion_control(sk);
1814
1815        /* Clean up the write buffer. */
1816        tcp_write_queue_purge(sk);
1817
1818        /* Cleans up our, hopefully empty, out_of_order_queue. */
1819        __skb_queue_purge(&tp->out_of_order_queue);
1820
1821#ifdef CONFIG_TCP_MD5SIG
1822        /* Clean up the MD5 key list, if any */
1823        if (tp->md5sig_info) {
1824                tcp_v4_clear_md5_list(sk);
1825                kfree(tp->md5sig_info);
1826                tp->md5sig_info = NULL;
1827        }
1828#endif
1829
1830#ifdef CONFIG_NET_DMA
1831        /* Cleans up our sk_async_wait_queue */
1832        __skb_queue_purge(&sk->sk_async_wait_queue);
1833#endif
1834
1835        /* Clean up the prequeue; it really should be empty already. */
1836        __skb_queue_purge(&tp->ucopy.prequeue);
1837
1838        /* Clean up a referenced TCP bind bucket. */
1839        if (inet_csk(sk)->icsk_bind_hash)
1840                inet_put_port(sk);
1841
1842        /*
1843         * If a sendmsg cached page exists, toss it.
1844         */
1845        if (sk->sk_sndmsg_page) {
1846                __free_page(sk->sk_sndmsg_page);
1847                sk->sk_sndmsg_page = NULL;
1848        }
1849
1850        atomic_dec(&tcp_sockets_allocated);
1851}
1852
1853EXPORT_SYMBOL(tcp_v4_destroy_sock);
1854
1855#ifdef CONFIG_PROC_FS
1856/* Proc filesystem TCP sock list dumping. */
1857
1858static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1859{
1860        return hlist_empty(head) ? NULL :
1861                hlist_entry(head->first, struct inet_timewait_sock, tw_node);
1862}
1863
1864static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1865{
1866        return tw->tw_node.next ?
1867                hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1868}
1869
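    /* Advance the iterator over the listening hash.  While positioned on a
     * listening socket we also walk its SYN queue of open requests, toggling
     * st->state between TCP_SEQ_STATE_LISTENING and TCP_SEQ_STATE_OPENREQ
     * and taking syn_wait_lock around the request walk.
     */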
1870static void *listening_get_next(struct seq_file *seq, void *cur)
1871{
1872        struct inet_connection_sock *icsk;
1873        struct hlist_node *node;
1874        struct sock *sk = cur;
1875        struct tcp_iter_state *st = seq->private;
1876        struct net *net = seq_file_net(seq);
1877
1878        if (!sk) {
1879                st->bucket = 0;
1880                sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1881                goto get_sk;
1882        }
1883
1884        ++st->num;
1885
1886        if (st->state == TCP_SEQ_STATE_OPENREQ) {
1887                struct request_sock *req = cur;
1888
1889                icsk = inet_csk(st->syn_wait_sk);
1890                req = req->dl_next;
1891                while (1) {
1892                        while (req) {
1893                                if (req->rsk_ops->family == st->family) {
1894                                        cur = req;
1895                                        goto out;
1896                                }
1897                                req = req->dl_next;
1898                        }
1899                        if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1900                                break;
1901get_req:
1902                        req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1903                }
1904                sk        = sk_next(st->syn_wait_sk);
1905                st->state = TCP_SEQ_STATE_LISTENING;
1906                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1907        } else {
1908                icsk = inet_csk(sk);
1909                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1910                if (reqsk_queue_len(&icsk->icsk_accept_queue))
1911                        goto start_req;
1912                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1913                sk = sk_next(sk);
1914        }
1915get_sk:
1916        sk_for_each_from(sk, node) {
1917                if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1918                        cur = sk;
1919                        goto out;
1920                }
1921                icsk = inet_csk(sk);
1922                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1923                if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1924start_req:
1925                        st->uid         = sock_i_uid(sk);
1926                        st->syn_wait_sk = sk;
1927                        st->state       = TCP_SEQ_STATE_OPENREQ;
1928                        st->sbucket     = 0;
1929                        goto get_req;
1930                }
1931                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1932        }
1933        if (++st->bucket < INET_LHTABLE_SIZE) {
1934                sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1935                goto get_sk;
1936        }
1937        cur = NULL;
1938out:
1939        return cur;
1940}
1941
1942static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1943{
1944        void *rc = listening_get_next(seq, NULL);
1945
1946        while (rc && *pos) {
1947                rc = listening_get_next(seq, rc);
1948                --*pos;
1949        }
1950        return rc;
1951}
1952
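    /* Find the first socket in the established hash that matches the
     * requested family and namespace, scanning each bucket's TIME_WAIT chain
     * as well.  On success the bucket lock is left held for the iterator.
     */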
1953static void *established_get_first(struct seq_file *seq)
1954{
1955        struct tcp_iter_state *st = seq->private;
1956        struct net *net = seq_file_net(seq);
1957        void *rc = NULL;
1958
1959        for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1960                struct sock *sk;
1961                struct hlist_node *node;
1962                struct inet_timewait_sock *tw;
1963                rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1964
1965                read_lock_bh(lock);
1966                sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1967                        if (sk->sk_family != st->family ||
1968                            !net_eq(sock_net(sk), net)) {
1969                                continue;
1970                        }
1971                        rc = sk;
1972                        goto out;
1973                }
1974                st->state = TCP_SEQ_STATE_TIME_WAIT;
1975                inet_twsk_for_each(tw, node,
1976                                   &tcp_hashinfo.ehash[st->bucket].twchain) {
1977                        if (tw->tw_family != st->family ||
1978                            !net_eq(twsk_net(tw), net)) {
1979                                continue;
1980                        }
1981                        rc = tw;
1982                        goto out;
1983                }
1984                read_unlock_bh(lock);
1985                st->state = TCP_SEQ_STATE_ESTABLISHED;
1986        }
1987out:
1988        return rc;
1989}
1990
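    /* Move to the next matching established or TIME_WAIT socket, dropping
     * and re-taking the per-bucket lock as the walk crosses buckets.
     */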
1991static void *established_get_next(struct seq_file *seq, void *cur)
1992{
1993        struct sock *sk = cur;
1994        struct inet_timewait_sock *tw;
1995        struct hlist_node *node;
1996        struct tcp_iter_state *st = seq->private;
1997        struct net *net = seq_file_net(seq);
1998
1999        ++st->num;
2000
2001        if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2002                tw = cur;
2003                tw = tw_next(tw);
2004get_tw:
2005                while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2006                        tw = tw_next(tw);
2007                }
2008                if (tw) {
2009                        cur = tw;
2010                        goto out;
2011                }
2012                read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2013                st->state = TCP_SEQ_STATE_ESTABLISHED;
2014
2015                if (++st->bucket < tcp_hashinfo.ehash_size) {
2016                        read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2017                        sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2018                } else {
2019                        cur = NULL;
2020                        goto out;
2021                }
2022        } else
2023                sk = sk_next(sk);
2024
2025        sk_for_each_from(sk, node) {
2026                if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2027                        goto found;
2028        }
2029
2030        st->state = TCP_SEQ_STATE_TIME_WAIT;
2031        tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2032        goto get_tw;
2033found:
2034        cur = sk;
2035out:
2036        return cur;
2037}
2038
2039static void *established_get_idx(struct seq_file *seq, loff_t pos)
2040{
2041        void *rc = established_get_first(seq);
2042
2043        while (rc && pos) {
2044                rc = established_get_next(seq, rc);
2045                --pos;
2046        }
2047        return rc;
2048}
2049
2050static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2051{
2052        void *rc;
2053        struct tcp_iter_state *st = seq->private;
2054
2055        inet_listen_lock(&tcp_hashinfo);
2056        st->state = TCP_SEQ_STATE_LISTENING;
2057        rc        = listening_get_idx(seq, &pos);
2058
2059        if (!rc) {
2060                inet_listen_unlock(&tcp_hashinfo);
2061                st->state = TCP_SEQ_STATE_ESTABLISHED;
2062                rc        = established_get_idx(seq, pos);
2063        }
2064
2065        return rc;
2066}
2067
2068static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2069{
2070        struct tcp_iter_state *st = seq->private;
2071        st->state = TCP_SEQ_STATE_LISTENING;
2072        st->num = 0;
2073        return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2074}
2075
2076static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2077{
2078        void *rc = NULL;
2079        struct tcp_iter_state *st;
2080
2081        if (v == SEQ_START_TOKEN) {
2082                rc = tcp_get_idx(seq, 0);
2083                goto out;
2084        }
2085        st = seq->private;
2086
2087        switch (st->state) {
2088        case TCP_SEQ_STATE_OPENREQ:
2089        case TCP_SEQ_STATE_LISTENING:
2090                rc = listening_get_next(seq, v);
2091                if (!rc) {
2092                        inet_listen_unlock(&tcp_hashinfo);
2093                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2094                        rc        = established_get_first(seq);
2095                }
2096                break;
2097        case TCP_SEQ_STATE_ESTABLISHED:
2098        case TCP_SEQ_STATE_TIME_WAIT:
2099                rc = established_get_next(seq, v);
2100                break;
2101        }
2102out:
2103        ++*pos;
2104        return rc;
2105}
2106
2107static void tcp_seq_stop(struct seq_file *seq, void *v)
2108{
2109        struct tcp_iter_state *st = seq->private;
2110
2111        switch (st->state) {
2112        case TCP_SEQ_STATE_OPENREQ:
2113                if (v) {
2114                        struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2115                        read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2116                }
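                    /* fall through: the listening hash lock must also be released */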
2117        case TCP_SEQ_STATE_LISTENING:
2118                if (v != SEQ_START_TOKEN)
2119                        inet_listen_unlock(&tcp_hashinfo);
2120                break;
2121        case TCP_SEQ_STATE_TIME_WAIT:
2122        case TCP_SEQ_STATE_ESTABLISHED:
2123                if (v)
2124                        read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2125                break;
2126        }
2127}
2128
2129static int tcp_seq_open(struct inode *inode, struct file *file)
2130{
2131        struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2132        struct tcp_iter_state *s;
2133        int err;
2134
2135        err = seq_open_net(inode, file, &afinfo->seq_ops,
2136                          sizeof(struct tcp_iter_state));
2137        if (err < 0)
2138                return err;
2139
2140        s = ((struct seq_file *)file->private_data)->private;
2141        s->family               = afinfo->family;
2142        return 0;
2143}
2144
2145int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2146{
2147        int rc = 0;
2148        struct proc_dir_entry *p;
2149
2150        afinfo->seq_fops.open           = tcp_seq_open;
2151        afinfo->seq_fops.read           = seq_read;
2152        afinfo->seq_fops.llseek         = seq_lseek;
2153        afinfo->seq_fops.release        = seq_release_net;
2154
2155        afinfo->seq_ops.start           = tcp_seq_start;
2156        afinfo->seq_ops.next            = tcp_seq_next;
2157        afinfo->seq_ops.stop            = tcp_seq_stop;
2158
2159        p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2160                             &afinfo->seq_fops, afinfo);
2161        if (!p)
2162                rc = -ENOMEM;
2163        return rc;
2164}
2165
2166void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2167{
2168        proc_net_remove(net, afinfo->name);
2169}
2170
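    /* Format one /proc/net/tcp line for an embryonic (SYN_RECV) request. */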
2171static void get_openreq4(struct sock *sk, struct request_sock *req,
2172                         struct seq_file *f, int i, int uid, int *len)
2173{
2174        const struct inet_request_sock *ireq = inet_rsk(req);
2175        int ttd = req->expires - jiffies;
2176
2177        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2178                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2179                i,
2180                ireq->loc_addr,
2181                ntohs(inet_sk(sk)->sport),
2182                ireq->rmt_addr,
2183                ntohs(ireq->rmt_port),
2184                TCP_SYN_RECV,
2185                0, 0, /* could print option size, but that is af dependent. */
2186                1,    /* timers active (only the expire timer) */
2187                jiffies_to_clock_t(ttd),
2188                req->retrans,
2189                uid,
2190                0,  /* non standard timer */
2191                0, /* open_requests have no inode */
2192                atomic_read(&sk->sk_refcnt),
2193                req,
2194                len);
2195}
2196
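    /* Format one /proc/net/tcp line for a full socket (listening or
     * established).
     */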
2197static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2198{
2199        int timer_active;
2200        unsigned long timer_expires;
2201        struct tcp_sock *tp = tcp_sk(sk);
2202        const struct inet_connection_sock *icsk = inet_csk(sk);
2203        struct inet_sock *inet = inet_sk(sk);
2204        __be32 dest = inet->daddr;
2205        __be32 src = inet->rcv_saddr;
2206        __u16 destp = ntohs(inet->dport);
2207        __u16 srcp = ntohs(inet->sport);
2208
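            /* Encode the pending timer for the "tr" field: 1 = retransmit,
             * 4 = zero window probe, 2 = sk_timer (e.g. keepalive), 0 = none.
             */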
2209        if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2210                timer_active    = 1;
2211                timer_expires   = icsk->icsk_timeout;
2212        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2213                timer_active    = 4;
2214                timer_expires   = icsk->icsk_timeout;
2215        } else if (timer_pending(&sk->sk_timer)) {
2216                timer_active    = 2;
2217                timer_expires   = sk->sk_timer.expires;
2218        } else {
2219                timer_active    = 0;
2220                timer_expires   = jiffies;
2221        }
2222
2223        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2224                        "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2225                i, src, srcp, dest, destp, sk->sk_state,
2226                tp->write_seq - tp->snd_una,
2227                sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2228                                             (tp->rcv_nxt - tp->copied_seq),
2229                timer_active,
2230                jiffies_to_clock_t(timer_expires - jiffies),
2231                icsk->icsk_retransmits,
2232                sock_i_uid(sk),
2233                icsk->icsk_probes_out,
2234                sock_i_ino(sk),
2235                atomic_read(&sk->sk_refcnt), sk,
2236                jiffies_to_clock_t(icsk->icsk_rto),
2237                jiffies_to_clock_t(icsk->icsk_ack.ato),
2238                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2239                tp->snd_cwnd,
2240                tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2241                len);
2242}
2243
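    /* Format one /proc/net/tcp line for a TIME_WAIT socket. */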
2244static void get_timewait4_sock(struct inet_timewait_sock *tw,
2245                               struct seq_file *f, int i, int *len)
2246{
2247        __be32 dest, src;
2248        __u16 destp, srcp;
2249        int ttd = tw->tw_ttd - jiffies;
2250
2251        if (ttd < 0)
2252                ttd = 0;
2253
2254        dest  = tw->tw_daddr;
2255        src   = tw->tw_rcv_saddr;
2256        destp = ntohs(tw->tw_dport);
2257        srcp  = ntohs(tw->tw_sport);
2258
2259        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2260                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2261                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2262                3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2263                atomic_read(&tw->tw_refcnt), tw, len);
2264}
2265
2266#define TMPSZ 150
2267
2268static int tcp4_seq_show(struct seq_file *seq, void *v)
2269{
2270        struct tcp_iter_state *st;
2271        int len;
2272
2273        if (v == SEQ_START_TOKEN) {
2274                seq_printf(seq, "%-*s\n", TMPSZ - 1,
2275                           "  sl  local_address rem_address   st tx_queue "
2276                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2277                           "inode");
2278                goto out;
2279        }
2280        st = seq->private;
2281
2282        switch (st->state) {
2283        case TCP_SEQ_STATE_LISTENING:
2284        case TCP_SEQ_STATE_ESTABLISHED:
2285                get_tcp4_sock(v, seq, st->num, &len);
2286                break;
2287        case TCP_SEQ_STATE_OPENREQ:
2288                get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2289                break;
2290        case TCP_SEQ_STATE_TIME_WAIT:
2291                get_timewait4_sock(v, seq, st->num, &len);
2292                break;
2293        }
2294        seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2295out:
2296        return 0;
2297}
2298
2299static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2300        .name           = "tcp",
2301        .family         = AF_INET,
2302        .seq_fops       = {
2303                .owner          = THIS_MODULE,
2304        },
2305        .seq_ops        = {
2306                .show           = tcp4_seq_show,
2307        },
2308};
2309
2310static int tcp4_proc_init_net(struct net *net)
2311{
2312        return tcp_proc_register(net, &tcp4_seq_afinfo);
2313}
2314
2315static void tcp4_proc_exit_net(struct net *net)
2316{
2317        tcp_proc_unregister(net, &tcp4_seq_afinfo);
2318}
2319
2320static struct pernet_operations tcp4_net_ops = {
2321        .init = tcp4_proc_init_net,
2322        .exit = tcp4_proc_exit_net,
2323};
2324
2325int __init tcp4_proc_init(void)
2326{
2327        return register_pernet_subsys(&tcp4_net_ops);
2328}
2329
2330void tcp4_proc_exit(void)
2331{
2332        unregister_pernet_subsys(&tcp4_net_ops);
2333}
2334#endif /* CONFIG_PROC_FS */
2335
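    /* Transport protocol operations for IPv4 TCP sockets, hooked into the
     * socket layer.
     */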
2336struct proto tcp_prot = {
2337        .name                   = "TCP",
2338        .owner                  = THIS_MODULE,
2339        .close                  = tcp_close,
2340        .connect                = tcp_v4_connect,
2341        .disconnect             = tcp_disconnect,
2342        .accept                 = inet_csk_accept,
2343        .ioctl                  = tcp_ioctl,
2344        .init                   = tcp_v4_init_sock,
2345        .destroy                = tcp_v4_destroy_sock,
2346        .shutdown               = tcp_shutdown,
2347        .setsockopt             = tcp_setsockopt,
2348        .getsockopt             = tcp_getsockopt,
2349        .recvmsg                = tcp_recvmsg,
2350        .backlog_rcv            = tcp_v4_do_rcv,
2351        .hash                   = inet_hash,
2352        .unhash                 = inet_unhash,
2353        .get_port               = inet_csk_get_port,
2354        .enter_memory_pressure  = tcp_enter_memory_pressure,
2355        .sockets_allocated      = &tcp_sockets_allocated,
2356        .orphan_count           = &tcp_orphan_count,
2357        .memory_allocated       = &tcp_memory_allocated,
2358        .memory_pressure        = &tcp_memory_pressure,
2359        .sysctl_mem             = sysctl_tcp_mem,
2360        .sysctl_wmem            = sysctl_tcp_wmem,
2361        .sysctl_rmem            = sysctl_tcp_rmem,
2362        .max_header             = MAX_TCP_HEADER,
2363        .obj_size               = sizeof(struct tcp_sock),
2364        .twsk_prot              = &tcp_timewait_sock_ops,
2365        .rsk_prot               = &tcp_request_sock_ops,
2366        .h.hashinfo             = &tcp_hashinfo,
2367#ifdef CONFIG_COMPAT
2368        .compat_setsockopt      = compat_tcp_setsockopt,
2369        .compat_getsockopt      = compat_tcp_getsockopt,
2370#endif
2371};
2372
2373
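    /* Per-namespace setup: create the kernel-internal control socket used to
     * send replies (e.g. RSTs and TIME_WAIT ACKs) that are not associated
     * with a full socket.
     */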
2374static int __net_init tcp_sk_init(struct net *net)
2375{
2376        return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2377                                    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2378}
2379
2380static void __net_exit tcp_sk_exit(struct net *net)
2381{
2382        inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2383        inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
2384}
2385
2386static struct pernet_operations __net_initdata tcp_sk_ops = {
2387       .init = tcp_sk_init,
2388       .exit = tcp_sk_exit,
2389};
2390
2391void __init tcp_v4_init(void)
2392{
2393        if (register_pernet_device(&tcp_sk_ops))
2394                panic("Failed to create the TCP control socket.\n");
2395}
2396
2397EXPORT_SYMBOL(ipv4_specific);
2398EXPORT_SYMBOL(tcp_hashinfo);
2399EXPORT_SYMBOL(tcp_prot);
2400EXPORT_SYMBOL(tcp_v4_conn_request);
2401EXPORT_SYMBOL(tcp_v4_connect);
2402EXPORT_SYMBOL(tcp_v4_do_rcv);
2403EXPORT_SYMBOL(tcp_v4_remember_stamp);
2404EXPORT_SYMBOL(tcp_v4_send_check);
2405EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2406
2407#ifdef CONFIG_PROC_FS
2408EXPORT_SYMBOL(tcp_proc_register);
2409EXPORT_SYMBOL(tcp_proc_unregister);
2410#endif
2411EXPORT_SYMBOL(sysctl_tcp_low_latency);
2412
2413