linux-bk/net/ipv4/tcp_ipv4.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
   9 *
  10 *              IPv4 specific functions
  11 *
  12 *
  13 *              code split from:
  14 *              linux/ipv4/tcp.c
  15 *              linux/ipv4/tcp_input.c
  16 *              linux/ipv4/tcp_output.c
  17 *
  18 *              See tcp.c for author information
  19 *
  20 *      This program is free software; you can redistribute it and/or
  21 *      modify it under the terms of the GNU General Public License
  22 *      as published by the Free Software Foundation; either version
  23 *      2 of the License, or (at your option) any later version.
  24 */
  25
  26/*
  27 * Changes:
  28 *              David S. Miller :       New socket lookup architecture.
  29 *                                      This code is dedicated to John Dyson.
  30 *              David S. Miller :       Change semantics of established hash,
  31 *                                      half is devoted to TIME_WAIT sockets
  32 *                                      and the rest go in the other half.
  33 *              Andi Kleen :            Add support for syncookies and fixed
  34 *                                      some bugs: ip options weren't passed to
  35 *                                      the TCP layer, missed a check for an
  36 *                                      ACK bit.
  37 *              Andi Kleen :            Implemented fast path mtu discovery.
  38 *                                      Fixed many serious bugs in the
  39 *                                      open_request handling and moved
  40 *                                      most of it into the af independent code.
  41 *                                      Added tail drop and some other bugfixes.
  42 *                                      Added new listen semantics.
  43 *              Mike McLagan    :       Routing by source
  44 *      Juan Jose Ciarlante:            ip_dynaddr bits
  45 *              Andi Kleen:             various fixes.
  46 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  47 *                                      coma.
  48 *      Andi Kleen              :       Fix new listen.
  49 *      Andi Kleen              :       Fix accept error reporting.
  50 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  51 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  52 *                                      a single port at the same time.
  53 */
  54
  55#include <linux/config.h>
  56
  57#include <linux/types.h>
  58#include <linux/fcntl.h>
  59#include <linux/module.h>
  60#include <linux/random.h>
  61#include <linux/cache.h>
  62#include <linux/jhash.h>
  63#include <linux/init.h>
  64#include <linux/times.h>
  65
  66#include <net/icmp.h>
  67#include <net/tcp.h>
  68#include <net/ipv6.h>
  69#include <net/inet_common.h>
  70#include <net/xfrm.h>
  71
  72#include <linux/inet.h>
  73#include <linux/ipv6.h>
  74#include <linux/stddef.h>
  75#include <linux/proc_fs.h>
  76#include <linux/seq_file.h>
  77
  78extern int sysctl_ip_dynaddr;
  79int sysctl_tcp_tw_reuse;
/* sysctl_tcp_tw_reuse: when non-zero, __tcp_v4_check_established() may hand
 * a matching TIME-WAIT bucket over to a new outgoing connection.
 */
  80int sysctl_tcp_low_latency;
/* NOTE(review): sysctl_ip_dynaddr and sysctl_tcp_low_latency are not used in
 * the part of the file reviewed here; confirm their consumers before
 * documenting them further.
 */
  81
  82/* Check TCP sequence numbers in ICMP packets. */
  83#define ICMP_MIN_LENGTH 8
  84
  85/* Socket used for sending RSTs */
  86static struct socket *tcp_socket;
  87
/* Forward declaration; the definition is not in the part of the file
 * reviewed here.
 */
  88void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
  89                       struct sk_buff *skb);
  90
  91struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
/* Only the synchronisation members are given explicit initial values here:
 * the listening-hash rwlock and the port-allocation spinlock start
 * unlocked, the listening-hash user count starts at zero, and the wait
 * queue head used by tcp_listen_wlock() is set up statically.
 */
  92        .__tcp_lhash_lock       =       RW_LOCK_UNLOCKED,
  93        .__tcp_lhash_users      =       ATOMIC_INIT(0),
  94        .__tcp_lhash_wait
  95          = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
  96        .__tcp_portalloc_lock   =       SPIN_LOCK_UNLOCKED
  97};
  98
  99/*
 100 * This array holds the first and last local port number.
 101 * For high-usage systems, use sysctl to change this to
 102 * 32768-61000
 103 */
 104int sysctl_local_port_range[2] = { 1024, 4999 };
/* Last port tried by the automatic port search.  The searches in
 * tcp_v4_get_port() and tcp_v4_hash_connect() increment it before use, so
 * starting one below the default low bound makes 1024 the first candidate.
 * Both searches read and write it while holding tcp_portalloc_lock.
 */
 105int tcp_port_rover = 1024 - 1;
 106
 107static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
 108                                 __u32 faddr, __u16 fport)
 109{
 110        int h = (laddr ^ lport) ^ (faddr ^ fport);
 111        h ^= h >> 16;
 112        h ^= h >> 8;
 113        return h & (tcp_ehash_size - 1);
 114}
 115
 116static __inline__ int tcp_sk_hashfn(struct sock *sk)
 117{
 118        struct inet_opt *inet = inet_sk(sk);
 119        __u32 laddr = inet->rcv_saddr;
 120        __u16 lport = inet->num;
 121        __u32 faddr = inet->daddr;
 122        __u16 fport = inet->dport;
 123
 124        return tcp_hashfn(laddr, lport, faddr, fport);
 125}
 126
 127/* Allocate and initialize a new TCP local port bind bucket.
 128 * The bindhash mutex for snum's hash chain must be held here.
 129 */
 130struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
 131                                          unsigned short snum)
 132{
 133        struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
 134                                                      SLAB_ATOMIC);
 135        if (tb) {
 136                tb->port = snum;
 137                tb->fastreuse = 0;
 138                INIT_HLIST_HEAD(&tb->owners);
 139                hlist_add_head(&tb->node, &head->chain);
 140        }
 141        return tb;
 142}
 143
 144/* Caller must hold hashbucket lock for this tb with local BH disabled */
 145void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
 146{
 147        if (hlist_empty(&tb->owners)) {
 148                __hlist_del(&tb->node);
 149                kmem_cache_free(tcp_bucket_cachep, tb);
 150        }
 151}
 152
 153/* Caller must disable local BH processing. */
 154static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
 155{
 156        struct tcp_bind_hashbucket *head =
 157                                &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
 158        struct tcp_bind_bucket *tb;
 159
 160        spin_lock(&head->lock);
 161        tb = tcp_sk(sk)->bind_hash;
 162        sk_add_bind_node(child, &tb->owners);
 163        tcp_sk(child)->bind_hash = tb;
 164        spin_unlock(&head->lock);
 165}
 166
 167inline void tcp_inherit_port(struct sock *sk, struct sock *child)
 168{
 169        local_bh_disable();
 170        __tcp_inherit_port(sk, child);
 171        local_bh_enable();
 172}
 173
 174void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
 175                   unsigned short snum)
 176{
 177        inet_sk(sk)->num = snum;
 178        sk_add_bind_node(sk, &tb->owners);
 179        tcp_sk(sk)->bind_hash = tb;
 180}
 181
 182static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
 183{
 184        const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
 185        struct sock *sk2;
 186        struct hlist_node *node;
 187        int reuse = sk->sk_reuse;
 188
 189        sk_for_each_bound(sk2, node, &tb->owners) {
 190                if (sk != sk2 &&
 191                    !tcp_v6_ipv6only(sk2) &&
 192                    (!sk->sk_bound_dev_if ||
 193                     !sk2->sk_bound_dev_if ||
 194                     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
 195                        if (!reuse || !sk2->sk_reuse ||
 196                            sk2->sk_state == TCP_LISTEN) {
 197                                const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
 198                                if (!sk2_rcv_saddr || !sk_rcv_saddr ||
 199                                    sk2_rcv_saddr == sk_rcv_saddr)
 200                                        break;
 201                        }
 202                }
 203        }
 204        return node != NULL;
 205}
 206
 207/* Obtain a reference to a local port for the given sock,
 208 * if snum is zero it means select any available local port.
 209 */
/* Returns 0 on success and 1 on failure (port range exhausted, bind
 * conflict, or bind bucket allocation failure).  Local BHs are disabled
 * for the duration of the call and re-enabled before returning.
 */
 210static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
 211{
 212        struct tcp_bind_hashbucket *head;
 213        struct hlist_node *node;
 214        struct tcp_bind_bucket *tb;
 215        int ret;
 216
 217        local_bh_disable();
 218        if (!snum) {
 219                int low = sysctl_local_port_range[0];
 220                int high = sysctl_local_port_range[1];
 221                int remaining = (high - low) + 1;
 222                int rover;
 223
 224                spin_lock(&tcp_portalloc_lock);
 225                rover = tcp_port_rover;
                /* Starting after the last port handed out, look for a port
                 * that has no bind bucket at all.  Each candidate's chain is
                 * locked while it is scanned; on a free port the loop is left
                 * via break with head->lock still held, otherwise the chain is
                 * unlocked at next: and the following port is tried.
                 */
 226                do {
 227                        rover++;
 228                        if (rover < low || rover > high)
 229                                rover = low;
 230                        head = &tcp_bhash[tcp_bhashfn(rover)];
 231                        spin_lock(&head->lock);
 232                        tb_for_each(tb, node, &head->chain)
 233                                if (tb->port == rover)
 234                                        goto next;
 235                        break;
 236                next:
 237                        spin_unlock(&head->lock);
 238                } while (--remaining > 0);
 239                tcp_port_rover = rover;
 240                spin_unlock(&tcp_portalloc_lock);
 241
 242                /* Exhausted local port range during search? */
 243                ret = 1;
 244                if (remaining <= 0)
 245                        goto fail;
 246
 247                /* OK, here is the one we will use.  HEAD is
 248                 * non-NULL and we hold its mutex.
 249                 */
 250                snum = rover;
 251        } else {
 252                head = &tcp_bhash[tcp_bhashfn(snum)];
 253                spin_lock(&head->lock);
 254                tb_for_each(tb, node, &head->chain)
 255                        if (tb->port == snum)
 256                                goto tb_found;
 257        }
        /* No bucket exists for snum; tb is cleared so that tb_not_found
         * creates one.
         */
 258        tb = NULL;
 259        goto tb_not_found;
 260tb_found:
 261        if (!hlist_empty(&tb->owners)) {
                /* sk_reuse above 1 skips every conflict check. */
 262                if (sk->sk_reuse > 1)
 263                        goto success;
                /* Bucket flagged as fast-reusable and this socket is a
                 * non-listening sk_reuse socket: accept without walking
                 * the owner list.
                 */
 264                if (tb->fastreuse > 0 &&
 265                    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
 266                        goto success;
 267                } else {
 268                        ret = 1;
 269                        if (tcp_bind_conflict(sk, tb))
 270                                goto fail_unlock;
 271                }
 272        }
        /* Reached with tb == NULL when no bucket existed, and by fall-through
         * from tb_found when the existing bucket raised no conflict.
         */
 273tb_not_found:
 274        ret = 1;
 275        if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
 276                goto fail_unlock;
        /* Maintain fastreuse: an ownerless bucket takes its value from this
         * socket; a bucket with owners keeps it set only while every new
         * owner is a non-listening sk_reuse socket.
         */
 277        if (hlist_empty(&tb->owners)) {
 278                if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
 279                        tb->fastreuse = 1;
 280                else
 281                        tb->fastreuse = 0;
 282        } else if (tb->fastreuse &&
 283                   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
 284                tb->fastreuse = 0;
 285success:
        /* Bind only if the socket does not already own a bucket. */
 286        if (!tcp_sk(sk)->bind_hash)
 287                tcp_bind_hash(sk, tb, snum);
 288        BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
 289        ret = 0;
 290
 291fail_unlock:
 292        spin_unlock(&head->lock);
 293fail:
 294        local_bh_enable();
 295        return ret;
 296}
 297
 298/* Get rid of any references to a local port held by the
 299 * given sock.
 300 */
 301static void __tcp_put_port(struct sock *sk)
 302{
 303        struct inet_opt *inet = inet_sk(sk);
 304        struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
 305        struct tcp_bind_bucket *tb;
 306
 307        spin_lock(&head->lock);
 308        tb = tcp_sk(sk)->bind_hash;
 309        __sk_del_bind_node(sk);
 310        tcp_sk(sk)->bind_hash = NULL;
 311        inet->num = 0;
 312        tcp_bucket_destroy(tb);
 313        spin_unlock(&head->lock);
 314}
 315
 316void tcp_put_port(struct sock *sk)
 317{
 318        local_bh_disable();
 319        __tcp_put_port(sk);
 320        local_bh_enable();
 321}
 322
 323/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 324 * Look, when several writers sleep and reader wakes them up, all but one
 325 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 326 * this, _but_ remember, it adds useless work on UP machines (wake up each
 327 * exclusive lock release). It should be ifdefed really.
 328 */
 329
 330void tcp_listen_wlock(void)
 331{
 332        write_lock(&tcp_lhash_lock);
 333
 334        if (atomic_read(&tcp_lhash_users)) {
 335                DEFINE_WAIT(wait);
 336
 337                for (;;) {
 338                        prepare_to_wait_exclusive(&tcp_lhash_wait,
 339                                                &wait, TASK_UNINTERRUPTIBLE);
 340                        if (!atomic_read(&tcp_lhash_users))
 341                                break;
 342                        write_unlock_bh(&tcp_lhash_lock);
 343                        schedule();
 344                        write_lock_bh(&tcp_lhash_lock);
 345                }
 346
 347                finish_wait(&tcp_lhash_wait, &wait);
 348        }
 349}
 350
 351static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
 352{
 353        struct hlist_head *list;
 354        rwlock_t *lock;
 355
 356        BUG_TRAP(sk_unhashed(sk));
 357        if (listen_possible && sk->sk_state == TCP_LISTEN) {
 358                list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
 359                lock = &tcp_lhash_lock;
 360                tcp_listen_wlock();
 361        } else {
 362                list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
 363                lock = &tcp_ehash[sk->sk_hashent].lock;
 364                write_lock(lock);
 365        }
 366        __sk_add_node(sk, list);
 367        sock_prot_inc_use(sk->sk_prot);
 368        write_unlock(lock);
 369        if (listen_possible && sk->sk_state == TCP_LISTEN)
 370                wake_up(&tcp_lhash_wait);
 371}
 372
 373static void tcp_v4_hash(struct sock *sk)
 374{
 375        if (sk->sk_state != TCP_CLOSE) {
 376                local_bh_disable();
 377                __tcp_v4_hash(sk, 1);
 378                local_bh_enable();
 379        }
 380}
 381
 382void tcp_unhash(struct sock *sk)
 383{
 384        rwlock_t *lock;
 385
 386        if (sk_unhashed(sk))
 387                goto ende;
 388
 389        if (sk->sk_state == TCP_LISTEN) {
 390                local_bh_disable();
 391                tcp_listen_wlock();
 392                lock = &tcp_lhash_lock;
 393        } else {
 394                struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
 395                lock = &head->lock;
 396                write_lock_bh(&head->lock);
 397        }
 398
 399        if (__sk_del_node_init(sk))
 400                sock_prot_dec_use(sk->sk_prot);
 401        write_unlock_bh(lock);
 402
 403 ende:
 404        if (sk->sk_state == TCP_LISTEN)
 405                wake_up(&tcp_lhash_wait);
 406}
 407
 408/* Don't inline this cruft.  Here are some nice properties to
 409 * exploit here.  The BSD API does not allow a listening TCP
 410 * to specify the remote port nor the remote address for the
 411 * connection.  So always assume those are both wildcarded
 412 * during the search since they can never be otherwise.
 413 */
 414static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
 415                                             unsigned short hnum, int dif)
 416{
 417        struct sock *result = NULL, *sk;
 418        struct hlist_node *node;
 419        int score, hiscore;
 420
 421        hiscore=-1;
 422        sk_for_each(sk, node, head) {
 423                struct inet_opt *inet = inet_sk(sk);
 424
 425                if (inet->num == hnum && !ipv6_only_sock(sk)) {
 426                        __u32 rcv_saddr = inet->rcv_saddr;
 427
 428                        score = (sk->sk_family == PF_INET ? 1 : 0);
 429                        if (rcv_saddr) {
 430                                if (rcv_saddr != daddr)
 431                                        continue;
 432                                score+=2;
 433                        }
 434                        if (sk->sk_bound_dev_if) {
 435                                if (sk->sk_bound_dev_if != dif)
 436                                        continue;
 437                                score+=2;
 438                        }
 439                        if (score == 5)
 440                                return sk;
 441                        if (score > hiscore) {
 442                                hiscore = score;
 443                                result = sk;
 444                        }
 445                }
 446        }
 447        return result;
 448}
 449
 450/* Optimize the common listener case. */
 451inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum,
 452                                           int dif)
 453{
 454        struct sock *sk = NULL;
 455        struct hlist_head *head;
 456
 457        read_lock(&tcp_lhash_lock);
 458        head = &tcp_listening_hash[tcp_lhashfn(hnum)];
 459        if (!hlist_empty(head)) {
 460                struct inet_opt *inet = inet_sk((sk = __sk_head(head)));
 461
 462                if (inet->num == hnum && !sk->sk_node.next &&
 463                    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
 464                    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
 465                    !sk->sk_bound_dev_if)
 466                        goto sherry_cache;
 467                sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
 468        }
 469        if (sk) {
 470sherry_cache:
 471                sock_hold(sk);
 472        }
 473        read_unlock(&tcp_lhash_lock);
 474        return sk;
 475}
 476
 477/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 478 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 479 *
 480 * Local BH must be disabled here.
 481 */
 482
 483static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
 484                                                       u32 daddr, u16 hnum,
 485                                                       int dif)
 486{
 487        struct tcp_ehash_bucket *head;
 488        TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
 489        __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
 490        struct sock *sk;
 491        struct hlist_node *node;
 492        /* Optimize here for direct hit, only listening connections can
 493         * have wildcards anyways.
 494         */
 495        int hash = tcp_hashfn(daddr, hnum, saddr, sport);
 496        head = &tcp_ehash[hash];
 497        read_lock(&head->lock);
 498        sk_for_each(sk, node, &head->chain) {
 499                if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
 500                        goto hit; /* You sunk my battleship! */
 501        }
 502
 503        /* Must check for a TIME_WAIT'er before going to listener hash. */
 504        sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
 505                if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
 506                        goto hit;
 507        }
 508        sk = NULL;
 509out:
 510        read_unlock(&head->lock);
 511        return sk;
 512hit:
 513        sock_hold(sk);
 514        goto out;
 515}
 516
 517static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
 518                                           u32 daddr, u16 hnum, int dif)
 519{
 520        struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
 521                                                      daddr, hnum, dif);
 522
 523        return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
 524}
 525
 526inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
 527                                  u16 dport, int dif)
 528{
 529        struct sock *sk;
 530
 531        local_bh_disable();
 532        sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
 533        local_bh_enable();
 534
 535        return sk;
 536}
 537
 538static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 539{
 540        return secure_tcp_sequence_number(skb->nh.iph->daddr,
 541                                          skb->nh.iph->saddr,
 542                                          skb->h.th->dest,
 543                                          skb->h.th->source);
 544}
 545
 546/* called with local bh disabled */
 547static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 548                                      struct tcp_tw_bucket **twp)
 549{
 550        struct inet_opt *inet = inet_sk(sk);
 551        u32 daddr = inet->rcv_saddr;
 552        u32 saddr = inet->daddr;
 553        int dif = sk->sk_bound_dev_if;
 554        TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
 555        __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
 556        int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
 557        struct tcp_ehash_bucket *head = &tcp_ehash[hash];
 558        struct sock *sk2;
 559        struct hlist_node *node;
 560        struct tcp_tw_bucket *tw;
 561
 562        write_lock(&head->lock);
 563
 564        /* Check TIME-WAIT sockets first. */
 565        sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
 566                tw = (struct tcp_tw_bucket *)sk2;
 567
 568                if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
 569                        struct tcp_opt *tp = tcp_sk(sk);
 570
 571                        /* With PAWS, it is safe from the viewpoint
 572                           of data integrity. Even without PAWS it
 573                           is safe provided sequence spaces do not
 574                           overlap i.e. at data rates <= 80Mbit/sec.
 575
 576                           Actually, the idea is close to VJ's one,
 577                           only timestamp cache is held not per host,
 578                           but per port pair and TW bucket is used
 579                           as state holder.
 580
 581                           If TW bucket has been already destroyed we
 582                           fall back to VJ's scheme and use initial
 583                           timestamp retrieved from peer table.
 584                         */
 585                        if (tw->tw_ts_recent_stamp &&
 586                            (!twp || (sysctl_tcp_tw_reuse &&
 587                                      xtime.tv_sec -
 588                                      tw->tw_ts_recent_stamp > 1))) {
 589                                if ((tp->write_seq =
 590                                                tw->tw_snd_nxt + 65535 + 2) == 0)
 591                                        tp->write_seq = 1;
 592                                tp->ts_recent       = tw->tw_ts_recent;
 593                                tp->ts_recent_stamp = tw->tw_ts_recent_stamp;
 594                                sock_hold(sk2);
 595                                goto unique;
 596                        } else
 597                                goto not_unique;
 598                }
 599        }
 600        tw = NULL;
 601
 602        /* And established part... */
 603        sk_for_each(sk2, node, &head->chain) {
 604                if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
 605                        goto not_unique;
 606        }
 607
 608unique:
 609        /* Must record num and sport now. Otherwise we will see
 610         * in hash table socket with a funny identity. */
 611        inet->num = lport;
 612        inet->sport = htons(lport);
 613        sk->sk_hashent = hash;
 614        BUG_TRAP(sk_unhashed(sk));
 615        __sk_add_node(sk, &head->chain);
 616        sock_prot_inc_use(sk->sk_prot);
 617        write_unlock(&head->lock);
 618
 619        if (twp) {
 620                *twp = tw;
 621                NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 622        } else if (tw) {
 623                /* Silly. Should hash-dance instead... */
 624                tcp_tw_deschedule(tw);
 625                NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 626
 627                tcp_tw_put(tw);
 628        }
 629
 630        return 0;
 631
 632not_unique:
 633        write_unlock(&head->lock);
 634        return -EADDRNOTAVAIL;
 635}
 636
 637/*
 638 * Bind a port for a connect operation and hash it.
 639 */
/* Returns 0 on success, or -EADDRNOTAVAIL when no usable local port or
 * connection identity could be found.
 *
 * If the socket has no local port yet, one is searched for in
 * sysctl_local_port_range; otherwise the already bound port is checked for
 * uniqueness of the resulting connection.
 */
 640static int tcp_v4_hash_connect(struct sock *sk)
 641{
 642        unsigned short snum = inet_sk(sk)->num;
 643        struct tcp_bind_hashbucket *head;
 644        struct tcp_bind_bucket *tb;
 645        int ret;
 646
 647        if (!snum) {
 648                int rover;
 649                int low = sysctl_local_port_range[0];
 650                int high = sysctl_local_port_range[1];
 651                int remaining = (high - low) + 1;
 652                struct hlist_node *node;
 653                struct tcp_tw_bucket *tw = NULL;
 654
 655                local_bh_disable();
 656
 657                /* TODO. Actually it is not so bad idea to remove
 658                 * tcp_portalloc_lock before next submission to Linus.
 659                 * As soon as we touch this place at all it is time to think.
 660                 *
 661                 * Now it protects single _advisory_ variable tcp_port_rover,
 662                 * hence it is mostly useless.
 663                 * Code will work nicely if we just delete it, but
 664                 * I am afraid in contended case it will work not better or
 665                 * even worse: another cpu just will hit the same bucket
 666                 * and spin there.
 667                 * So some cpu salt could remove both contention and
 668                 * memory pingpong. Any ideas how to do this in a nice way?
 669                 */
 670                spin_lock(&tcp_portalloc_lock);
 671                rover = tcp_port_rover;
 672
 673                do {
 674                        rover++;
 675                        if ((rover < low) || (rover > high))
 676                                rover = low;
 677                        head = &tcp_bhash[tcp_bhashfn(rover)];
 678                        spin_lock(&head->lock);
 679
 680                        /* Does not bother with rcv_saddr checks,
 681                         * because the established check is already
 682                         * unique enough.
 683                         */
 684                        tb_for_each(tb, node, &head->chain) {
 685                                if (tb->port == rover) {
 686                                        BUG_TRAP(!hlist_empty(&tb->owners));
                                        /* Buckets with fastreuse >= 0 are
                                         * skipped; only buckets marked -1
                                         * (as this function marks the ones
                                         * it creates, below) may be shared.
                                         */
 687                                        if (tb->fastreuse >= 0)
 688                                                goto next_port;
                                        /* A zero return means sk is now in
                                         * the established hash; tw may have
                                         * been set to a TIME-WAIT bucket to
                                         * release.
                                         */
 689                                        if (!__tcp_v4_check_established(sk,
 690                                                                        rover,
 691                                                                        &tw))
 692                                                goto ok;
 693                                        goto next_port;
 694                                }
 695                        }
 696
                        /* No bucket for this port yet: create one. */
 697                        tb = tcp_bucket_create(head, rover);
 698                        if (!tb) {
 699                                spin_unlock(&head->lock);
 700                                break;
 701                        }
 702                        tb->fastreuse = -1;
 703                        goto ok;
 704
 705                next_port:
 706                        spin_unlock(&head->lock);
 707                } while (--remaining > 0);
                /* Range exhausted or bucket allocation failed. */
 708                tcp_port_rover = rover;
 709                spin_unlock(&tcp_portalloc_lock);
 710
 711                local_bh_enable();
 712
 713                return -EADDRNOTAVAIL;
 714
 715ok:
 716                /* All locks still held and bhs disabled */
 717                tcp_port_rover = rover;
 718                spin_unlock(&tcp_portalloc_lock);
 719
 720                tcp_bind_hash(sk, tb, rover);
                /* sk is still unhashed only when a fresh bucket was created;
                 * a successful __tcp_v4_check_established() has already
                 * hashed it and set sport.
                 */
 721                if (sk_unhashed(sk)) {
 722                        inet_sk(sk)->sport = htons(rover);
 723                        __tcp_v4_hash(sk, 0);
 724                }
 725                spin_unlock(&head->lock);
 726
 727                if (tw) {
 728                        tcp_tw_deschedule(tw);
 729                        tcp_tw_put(tw);
 730                }
 731
 732                ret = 0;
                /* Jumps into the else branch below to re-enable BHs. */
 733                goto out;
 734        }
 735
 736        head  = &tcp_bhash[tcp_bhashfn(snum)];
 737        tb  = tcp_sk(sk)->bind_hash;
 738        spin_lock_bh(&head->lock);
        /* sk is the first and only owner of its bind bucket: it can be
         * hashed without consulting the established table.
         */
 739        if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
 740                __tcp_v4_hash(sk, 0);
 741                spin_unlock_bh(&head->lock);
 742                return 0;
 743        } else {
                /* Plain unlock: BHs stay disabled for the established check
                 * and are re-enabled at out:.
                 */
 744                spin_unlock(&head->lock);
 745                /* No definite answer... Walk to established hash table */
 746                ret = __tcp_v4_check_established(sk, snum, NULL);
 747out:
 748                local_bh_enable();
 749                return ret;
 750        }
 751}
 752
 753/* This will initiate an outgoing connection. */
 754int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 755{
 756        struct inet_opt *inet = inet_sk(sk);
 757        struct tcp_opt *tp = tcp_sk(sk);
 758        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 759        struct rtable *rt;
 760        u32 daddr, nexthop;
 761        int tmp;
 762        int err;
 763
 764        if (addr_len < sizeof(struct sockaddr_in))
 765                return -EINVAL;
 766
 767        if (usin->sin_family != AF_INET)
 768                return -EAFNOSUPPORT;
 769
 770        nexthop = daddr = usin->sin_addr.s_addr;
 771        if (inet->opt && inet->opt->srr) {
 772                if (!daddr)
 773                        return -EINVAL;
 774                nexthop = inet->opt->faddr;
 775        }
 776
 777        tmp = ip_route_connect(&rt, nexthop, inet->saddr,
 778                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 779                               IPPROTO_TCP,
 780                               inet->sport, usin->sin_port, sk);
 781        if (tmp < 0)
 782                return tmp;
 783
 784        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 785                ip_rt_put(rt);
 786                return -ENETUNREACH;
 787        }
 788
 789        if (!inet->opt || !inet->opt->srr)
 790                daddr = rt->rt_dst;
 791
 792        if (!inet->saddr)
 793                inet->saddr = rt->rt_src;
 794        inet->rcv_saddr = inet->saddr;
 795
 796        if (tp->ts_recent_stamp && inet->daddr != daddr) {
 797                /* Reset inherited state */
 798                tp->ts_recent       = 0;
 799                tp->ts_recent_stamp = 0;
 800                tp->write_seq       = 0;
 801        }
 802
 803        if (sysctl_tcp_tw_recycle &&
 804            !tp->ts_recent_stamp && rt->rt_dst == daddr) {
 805                struct inet_peer *peer = rt_get_peer(rt);
 806
 807                /* VJ's idea. We save last timestamp seen from
 808                 * the destination in peer table, when entering state TIME-WAIT
 809                 * and initialize ts_recent from it, when trying new connection.
 810                 */
 811
 812                if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
 813                        tp->ts_recent_stamp = peer->tcp_ts_stamp;
 814                        tp->ts_recent = peer->tcp_ts;
 815                }
 816        }
 817
 818        inet->dport = usin->sin_port;
 819        inet->daddr = daddr;
 820
 821        tp->ext_header_len = 0;
 822        if (inet->opt)
 823                tp->ext_header_len = inet->opt->optlen;
 824
 825        tp->mss_clamp = 536;
 826
 827        /* Socket identity is still unknown (sport may be zero).
 828         * However we set state to SYN-SENT and not releasing socket
 829         * lock select source port, enter ourselves into the hash tables and
 830         * complete initialization after this.
 831         */
 832        tcp_set_state(sk, TCP_SYN_SENT);
 833        err = tcp_v4_hash_connect(sk);
 834        if (err)
 835                goto failure;
 836
 837        err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
 838        if (err)
 839                goto failure;
 840
 841        /* OK, now commit destination to socket.  */
 842        __sk_dst_set(sk, &rt->u.dst);
 843        tcp_v4_setup_caps(sk, &rt->u.dst);
 844        tp->ext2_header_len = rt->u.dst.header_len;
 845
 846        if (!tp->write_seq)
 847                tp->write_seq = secure_tcp_sequence_number(inet->saddr,
 848                                                           inet->daddr,
 849                                                           inet->sport,
 850                                                           usin->sin_port);
 851
 852        inet->id = tp->write_seq ^ jiffies;
 853
 854        err = tcp_connect(sk);
 855        rt = NULL;
 856        if (err)
 857                goto failure;
 858
 859        return 0;
 860
 861failure:
 862        /* This unhashes the socket and releases the local port, if necessary. */
 863        tcp_set_state(sk, TCP_CLOSE);
 864        ip_rt_put(rt);
 865        sk->sk_route_caps = 0;
 866        inet->dport = 0;
 867        return err;
 868}
 869
 870static __inline__ int tcp_v4_iif(struct sk_buff *skb)
 871{
 872        return ((struct rtable *)skb->dst)->rt_iif;
 873}
 874
 875static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
 876{
 877        return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
 878}
 879
 880static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
 881                                              struct open_request ***prevp,
 882                                              __u16 rport,
 883                                              __u32 raddr, __u32 laddr)
 884{
 885        struct tcp_listen_opt *lopt = tp->listen_opt;
 886        struct open_request *req, **prev;
 887
 888        for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
 889             (req = *prev) != NULL;
 890             prev = &req->dl_next) {
 891                if (req->rmt_port == rport &&
 892                    req->af.v4_req.rmt_addr == raddr &&
 893                    req->af.v4_req.loc_addr == laddr &&
 894                    TCP_INET_FAMILY(req->class->family)) {
 895                        BUG_TRAP(!req->sk);
 896                        *prevp = prev;
 897                        break;
 898                }
 899        }
 900
 901        return req;
 902}
 903
 904static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
 905{
 906        struct tcp_opt *tp = tcp_sk(sk);
 907        struct tcp_listen_opt *lopt = tp->listen_opt;
 908        u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
 909
 910        req->expires = jiffies + TCP_TIMEOUT_INIT;
 911        req->retrans = 0;
 912        req->sk = NULL;
 913        req->dl_next = lopt->syn_table[h];
 914
 915        write_lock(&tp->syn_wait_lock);
 916        lopt->syn_table[h] = req;
 917        write_unlock(&tp->syn_wait_lock);
 918
 919        tcp_synq_added(sk);
 920}
 921
 922
 923/*
 924 * This routine does path mtu discovery as defined in RFC1191.
 925 */
 926static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
 927                                     u32 mtu)
 928{
 929        struct dst_entry *dst;
 930        struct inet_opt *inet = inet_sk(sk);
 931        struct tcp_opt *tp = tcp_sk(sk);
 932
 933        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 934         * send out by Linux are always <576bytes so they should go through
 935         * unfragmented).
 936         */
 937        if (sk->sk_state == TCP_LISTEN)
 938                return;
 939
 940        /* We don't check in the destentry if pmtu discovery is forbidden
 941         * on this route. We just assume that no packet_to_big packets
 942         * are send back when pmtu discovery is not active.
 943         * There is a small race when the user changes this flag in the
 944         * route, but I think that's acceptable.
 945         */
 946        if ((dst = __sk_dst_check(sk, 0)) == NULL)
 947                return;
 948
 949        dst->ops->update_pmtu(dst, mtu);
 950
 951        /* Something is about to be wrong... Remember soft error
 952         * for the case, if this connection will not able to recover.
 953         */
 954        if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst))
 955                sk->sk_err_soft = EMSGSIZE;
 956
 957        mtu = dst_pmtu(dst);
 958
 959        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 960            tp->pmtu_cookie > mtu) {
 961                tcp_sync_mss(sk, mtu);
 962
 963                /* Resend the TCP packet because it's
 964                 * clear that the old packet has been
 965                 * dropped. This is the new "fast" path mtu
 966                 * discovery.
 967                 */
 968                tcp_simple_retransmit(sk);
 969        } /* else let the usual retransmit timer handle it */
 970}
 971
 972/*
 973 * This routine is called by the ICMP module when it gets some
 974 * sort of error condition.  If err < 0 then the socket should
 975 * be closed and the error returned to the user.  If err > 0
 976 * it's just the icmp type << 8 | icmp code.  After adjustment
 977 * header points to the first 8 bytes of the tcp header.  We need
 978 * to find the appropriate port.
 979 *
 980 * The locking strategy used here is very "optimistic". When
 981 * someone else accesses the socket the ICMP is just dropped
 982 * and for some paths there is no check at all.
 983 * A more general error queue to queue errors for later handling
 984 * is probably better.
 985 *
 986 */
 987
 988void tcp_v4_err(struct sk_buff *skb, u32 info)
 989{
 990        struct iphdr *iph = (struct iphdr *)skb->data;
 991        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 992        struct tcp_opt *tp;
 993        struct inet_opt *inet;
 994        int type = skb->h.icmph->type;
 995        int code = skb->h.icmph->code;
 996        struct sock *sk;
 997        __u32 seq;
 998        int err;
 999
1000        if (skb->len < (iph->ihl << 2) + 8) {
1001                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1002                return;
1003        }
1004
1005        sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
1006                           th->source, tcp_v4_iif(skb));
1007        if (!sk) {
1008                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1009                return;
1010        }
1011        if (sk->sk_state == TCP_TIME_WAIT) {
1012                tcp_tw_put((struct tcp_tw_bucket *)sk);
1013                return;
1014        }
1015
1016        bh_lock_sock(sk);
1017        /* If too many ICMPs get dropped on busy
1018         * servers this needs to be solved differently.
1019         */
1020        if (sock_owned_by_user(sk))
1021                NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1022
1023        if (sk->sk_state == TCP_CLOSE)
1024                goto out;
1025
1026        tp = tcp_sk(sk);
1027        seq = ntohl(th->seq);
1028        if (sk->sk_state != TCP_LISTEN &&
1029            !between(seq, tp->snd_una, tp->snd_nxt)) {
1030                NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1031                goto out;
1032        }
1033
1034        switch (type) {
1035        case ICMP_SOURCE_QUENCH:
1036                /* This is deprecated, but if someone generated it,
1037                 * we have no reasons to ignore it.
1038                 */
1039                if (!sock_owned_by_user(sk))
1040                        tcp_enter_cwr(tp);
1041                goto out;
1042        case ICMP_PARAMETERPROB:
1043                err = EPROTO;
1044                break;
1045        case ICMP_DEST_UNREACH:
1046                if (code > NR_ICMP_UNREACH)
1047                        goto out;
1048
1049                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1050                        if (!sock_owned_by_user(sk))
1051                                do_pmtu_discovery(sk, iph, info);
1052                        goto out;
1053                }
1054
1055                err = icmp_err_convert[code].errno;
1056                break;
1057        case ICMP_TIME_EXCEEDED:
1058                err = EHOSTUNREACH;
1059                break;
1060        default:
1061                goto out;
1062        }
1063
1064        switch (sk->sk_state) {
1065                struct open_request *req, **prev;
1066        case TCP_LISTEN:
1067                if (sock_owned_by_user(sk))
1068                        goto out;
1069
1070                req = tcp_v4_search_req(tp, &prev, th->dest,
1071                                        iph->daddr, iph->saddr);
1072                if (!req)
1073                        goto out;
1074
1075                /* ICMPs are not backlogged, hence we cannot get
1076                   an established socket here.
1077                 */
1078                BUG_TRAP(!req->sk);
1079
1080                if (seq != req->snt_isn) {
1081                        NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1082                        goto out;
1083                }
1084
1085                /*
1086                 * Still in SYN_RECV, just remove it silently.
1087                 * There is no good way to pass the error to the newly
1088                 * created socket, and POSIX does not want network
1089                 * errors returned from accept().
1090                 */
1091                tcp_synq_drop(sk, req, prev);
1092                goto out;
1093
1094        case TCP_SYN_SENT:
1095        case TCP_SYN_RECV:  /* Cannot happen.
1096                               It can f.e. if SYNs crossed.
1097                             */
1098                if (!sock_owned_by_user(sk)) {
1099                        TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1100                        sk->sk_err = err;
1101
1102                        sk->sk_error_report(sk);
1103
1104                        tcp_done(sk);
1105                } else {
1106                        sk->sk_err_soft = err;
1107                }
1108                goto out;
1109        }
1110
1111        /* If we've already connected we will keep trying
1112         * until we time out, or the user gives up.
1113         *
1114         * rfc1122 4.2.3.9 allows to consider as hard errors
1115         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1116         * but it is obsoleted by pmtu discovery).
1117         *
1118         * Note, that in modern internet, where routing is unreliable
1119         * and in each dark corner broken firewalls sit, sending random
1120         * errors ordered by their masters even this two messages finally lose
1121         * their original sense (even Linux sends invalid PORT_UNREACHs)
1122         *
1123         * Now we are in compliance with RFCs.
1124         *                                                      --ANK (980905)
1125         */
1126
1127        inet = inet_sk(sk);
1128        if (!sock_owned_by_user(sk) && inet->recverr) {
1129                sk->sk_err = err;
1130                sk->sk_error_report(sk);
1131        } else  { /* Only an error on timeout */
1132                sk->sk_err_soft = err;
1133        }
1134
1135out:
1136        bh_unlock_sock(sk);
1137        sock_put(sk);
1138}
1139
1140/* This routine computes an IPv4 TCP checksum. */
1141void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1142                       struct sk_buff *skb)
1143{
1144        struct inet_opt *inet = inet_sk(sk);
1145
1146        if (skb->ip_summed == CHECKSUM_HW) {
1147                th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1148                skb->csum = offsetof(struct tcphdr, check);
1149        } else {
1150                th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1151                                         csum_partial((char *)th,
1152                                                      th->doff << 2,
1153                                                      skb->csum));
1154        }
1155}
1156
1157/*
1158 *      This routine will send an RST to the other tcp.
1159 *
1160 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1161 *                    for reset.
1162 *      Answer: if a packet caused RST, it is not for a socket
1163 *              existing in our system, if it is matched to a socket,
1164 *              it is just duplicate segment or bug in other side's TCP.
1165 *              So that we build reply only basing on parameters
1166 *              arrived with segment.
1167 *      Exception: precedence violation. We do not implement it in any case.
1168 */
1169
1170static void tcp_v4_send_reset(struct sk_buff *skb)
1171{
1172        struct tcphdr *th = skb->h.th;
1173        struct tcphdr rth;
1174        struct ip_reply_arg arg;
1175
1176        /* Never send a reset in response to a reset. */
1177        if (th->rst)
1178                return;
1179
1180        if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1181                return;
1182
1183        /* Swap the send and the receive. */
1184        memset(&rth, 0, sizeof(struct tcphdr));
1185        rth.dest   = th->source;
1186        rth.source = th->dest;
1187        rth.doff   = sizeof(struct tcphdr) / 4;
1188        rth.rst    = 1;
1189
1190        if (th->ack) {
1191                rth.seq = th->ack_seq;
1192        } else {
1193                rth.ack = 1;
1194                rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1195                                    skb->len - (th->doff << 2));
1196        }
1197
1198        memset(&arg, 0, sizeof arg);
1199        arg.iov[0].iov_base = (unsigned char *)&rth;
1200        arg.iov[0].iov_len  = sizeof rth;
1201        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1202                                      skb->nh.iph->saddr, /*XXX*/
1203                                      sizeof(struct tcphdr), IPPROTO_TCP, 0);
1204        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1205
1206        ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1207
1208        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1209        TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1210}
1211
1212/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
1213   outside socket context is ugly, certainly. What can I do?
1214 */
1215
1216static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1217                            u32 win, u32 ts)
1218{
1219        struct tcphdr *th = skb->h.th;
1220        struct {
1221                struct tcphdr th;
1222                u32 tsopt[3];
1223        } rep;
1224        struct ip_reply_arg arg;
1225
1226        memset(&rep.th, 0, sizeof(struct tcphdr));
1227        memset(&arg, 0, sizeof arg);
1228
1229        arg.iov[0].iov_base = (unsigned char *)&rep;
1230        arg.iov[0].iov_len  = sizeof(rep.th);
1231        if (ts) {
1232                rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1233                                     (TCPOPT_TIMESTAMP << 8) |
1234                                     TCPOLEN_TIMESTAMP);
1235                rep.tsopt[1] = htonl(tcp_time_stamp);
1236                rep.tsopt[2] = htonl(ts);
1237                arg.iov[0].iov_len = sizeof(rep);
1238        }
1239
1240        /* Swap the send and the receive. */
1241        rep.th.dest    = th->source;
1242        rep.th.source  = th->dest;
1243        rep.th.doff    = arg.iov[0].iov_len / 4;
1244        rep.th.seq     = htonl(seq);
1245        rep.th.ack_seq = htonl(ack);
1246        rep.th.ack     = 1;
1247        rep.th.window  = htons(win);
1248
1249        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1250                                      skb->nh.iph->saddr, /*XXX*/
1251                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1252        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1253
1254        ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1255
1256        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1257}
1258
1259static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1260{
1261        struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1262
1263        tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1264                        tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1265
1266        tcp_tw_put(tw);
1267}
1268
1269static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1270{
1271        tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
1272                        req->ts_recent);
1273}
1274
1275static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1276                                          struct open_request *req)
1277{
1278        struct rtable *rt;
1279        struct ip_options *opt = req->af.v4_req.opt;
1280        struct flowi fl = { .oif = sk->sk_bound_dev_if,
1281                            .nl_u = { .ip4_u =
1282                                      { .daddr = ((opt && opt->srr) ?
1283                                                  opt->faddr :
1284                                                  req->af.v4_req.rmt_addr),
1285                                        .saddr = req->af.v4_req.loc_addr,
1286                                        .tos = RT_CONN_FLAGS(sk) } },
1287                            .proto = IPPROTO_TCP,
1288                            .uli_u = { .ports =
1289                                       { .sport = inet_sk(sk)->sport,
1290                                         .dport = req->rmt_port } } };
1291
1292        if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1293                IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1294                return NULL;
1295        }
1296        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1297                ip_rt_put(rt);
1298                IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1299                return NULL;
1300        }
1301        return &rt->u.dst;
1302}
1303
1304/*
1305 *      Send a SYN-ACK after having received an ACK.
1306 *      This still operates on a open_request only, not on a big
1307 *      socket.
1308 */
1309static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1310                              struct dst_entry *dst)
1311{
1312        int err = -1;
1313        struct sk_buff * skb;
1314
1315        /* First, grab a route. */
1316        if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1317                goto out;
1318
1319        skb = tcp_make_synack(sk, dst, req);
1320
1321        if (skb) {
1322                struct tcphdr *th = skb->h.th;
1323
1324                th->check = tcp_v4_check(th, skb->len,
1325                                         req->af.v4_req.loc_addr,
1326                                         req->af.v4_req.rmt_addr,
1327                                         csum_partial((char *)th, skb->len,
1328                                                      skb->csum));
1329
1330                err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1331                                            req->af.v4_req.rmt_addr,
1332                                            req->af.v4_req.opt);
1333                if (err == NET_XMIT_CN)
1334                        err = 0;
1335        }
1336
1337out:
1338        dst_release(dst);
1339        return err;
1340}
1341
1342/*
1343 *      IPv4 open_request destructor.
1344 */
1345static void tcp_v4_or_free(struct open_request *req)
1346{
1347        if (req->af.v4_req.opt)
1348                kfree(req->af.v4_req.opt);
1349}
1350
1351static inline void syn_flood_warning(struct sk_buff *skb)
1352{
1353        static unsigned long warntime;
1354
1355        if (time_after(jiffies, (warntime + HZ * 60))) {
1356                warntime = jiffies;
1357                printk(KERN_INFO
1358                       "possible SYN flooding on port %d. Sending cookies.\n",
1359                       ntohs(skb->h.th->dest));
1360        }
1361}
1362
1363/*
1364 * Save and compile IPv4 options into the open_request if needed.
1365 */
1366static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1367                                                     struct sk_buff *skb)
1368{
1369        struct ip_options *opt = &(IPCB(skb)->opt);
1370        struct ip_options *dopt = NULL;
1371
1372        if (opt && opt->optlen) {
1373                int opt_size = optlength(opt);
1374                dopt = kmalloc(opt_size, GFP_ATOMIC);
1375                if (dopt) {
1376                        if (ip_options_echo(dopt, skb)) {
1377                                kfree(dopt);
1378                                dopt = NULL;
1379                        }
1380                }
1381        }
1382        return dopt;
1383}
1384
/*
 * Maximum number of SYN_RECV sockets queued per LISTEN socket.
 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
 * A global counter shared by all sockets would be better, but it would
 * need some protection against one socket starving all the others.
 *
 * The default used to be 128.  Experiments with real servers show that
 * this is nowhere near enough even at 100 conn/sec; 256 cures most of
 * the problems.  The value is adjusted to 128 on very small machines
 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
 * Raising it further requires changing the hash table size.
 */
int sysctl_max_syn_backlog = 256;
1399
1400struct or_calltable or_ipv4 = {
1401        .family         =       PF_INET,
1402        .rtx_syn_ack    =       tcp_v4_send_synack,
1403        .send_ack       =       tcp_v4_or_send_ack,
1404        .destructor     =       tcp_v4_or_free,
1405        .send_reset     =       tcp_v4_send_reset,
1406};
1407
1408int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1409{
1410        struct tcp_opt tp;
1411        struct open_request *req;
1412        __u32 saddr = skb->nh.iph->saddr;
1413        __u32 daddr = skb->nh.iph->daddr;
1414        __u32 isn = TCP_SKB_CB(skb)->when;
1415        struct dst_entry *dst = NULL;
1416#ifdef CONFIG_SYN_COOKIES
1417        int want_cookie = 0;
1418#else
1419#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1420#endif
1421
1422        /* Never answer to SYNs send to broadcast or multicast */
1423        if (((struct rtable *)skb->dst)->rt_flags &
1424            (RTCF_BROADCAST | RTCF_MULTICAST))
1425                goto drop;
1426
1427        /* TW buckets are converted to open requests without
1428         * limitations, they conserve resources and peer is
1429         * evidently real one.
1430         */
1431        if (tcp_synq_is_full(sk) && !isn) {
1432#ifdef CONFIG_SYN_COOKIES
1433                if (sysctl_tcp_syncookies) {
1434                        want_cookie = 1;
1435                } else
1436#endif
1437                goto drop;
1438        }
1439
1440        /* Accept backlog is full. If we have already queued enough
1441         * of warm entries in syn queue, drop request. It is better than
1442         * clogging syn queue with openreqs with exponentially increasing
1443         * timeout.
1444         */
1445        if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1446                goto drop;
1447
1448        req = tcp_openreq_alloc();
1449        if (!req)
1450                goto drop;
1451
1452        tcp_clear_options(&tp);
1453        tp.mss_clamp = 536;
1454        tp.user_mss  = tcp_sk(sk)->user_mss;
1455
1456        tcp_parse_options(skb, &tp, 0);
1457
1458        if (want_cookie) {
1459                tcp_clear_options(&tp);
1460                tp.saw_tstamp = 0;
1461        }
1462
1463        if (tp.saw_tstamp && !tp.rcv_tsval) {
1464                /* Some OSes (unknown ones, but I see them on web server, which
1465                 * contains information interesting only for windows'
1466                 * users) do not send their stamp in SYN. It is easy case.
1467                 * We simply do not advertise TS support.
1468                 */
1469                tp.saw_tstamp = 0;
1470                tp.tstamp_ok  = 0;
1471        }
1472        tp.tstamp_ok = tp.saw_tstamp;
1473
1474        tcp_openreq_init(req, &tp, skb);
1475
1476        req->af.v4_req.loc_addr = daddr;
1477        req->af.v4_req.rmt_addr = saddr;
1478        req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1479        req->class = &or_ipv4;
1480        if (!want_cookie)
1481                TCP_ECN_create_request(req, skb->h.th);
1482
1483        if (want_cookie) {
1484#ifdef CONFIG_SYN_COOKIES
1485                syn_flood_warning(skb);
1486#endif
1487                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1488        } else if (!isn) {
1489                struct inet_peer *peer = NULL;
1490
1491                /* VJ's idea. We save last timestamp seen
1492                 * from the destination in peer table, when entering
1493                 * state TIME-WAIT, and check against it before
1494                 * accepting new connection request.
1495                 *
1496                 * If "isn" is not zero, this request hit alive
1497                 * timewait bucket, so that all the necessary checks
1498                 * are made in the function processing timewait state.
1499                 */
1500                if (tp.saw_tstamp &&
1501                    sysctl_tcp_tw_recycle &&
1502                    (dst = tcp_v4_route_req(sk, req)) != NULL &&
1503                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1504                    peer->v4daddr == saddr) {
1505                        if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1506                            (s32)(peer->tcp_ts - req->ts_recent) >
1507                                                        TCP_PAWS_WINDOW) {
1508                                NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1509                                dst_release(dst);
1510                                goto drop_and_free;
1511                        }
1512                }
1513                /* Kill the following clause, if you dislike this way. */
1514                else if (!sysctl_tcp_syncookies &&
1515                         (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1516                          (sysctl_max_syn_backlog >> 2)) &&
1517                         (!peer || !peer->tcp_ts_stamp) &&
1518                         (!dst || !dst_metric(dst, RTAX_RTT))) {
1519                        /* Without syncookies last quarter of
1520                         * backlog is filled with destinations,
1521                         * proven to be alive.
1522                         * It means that we continue to communicate
1523                         * to destinations, already remembered
1524                         * to the moment of synflood.
1525                         */
1526                        NETDEBUG(if (net_ratelimit()) \
1527                                        printk(KERN_DEBUG "TCP: drop open "
1528                                                          "request from %u.%u."
1529                                                          "%u.%u/%u\n", \
1530                                               NIPQUAD(saddr),
1531                                               ntohs(skb->h.th->source)));
1532                        dst_release(dst);
1533                        goto drop_and_free;
1534                }
1535
1536                isn = tcp_v4_init_sequence(sk, skb);
1537        }
1538        req->snt_isn = isn;
1539
1540        if (tcp_v4_send_synack(sk, req, dst))
1541                goto drop_and_free;
1542
1543        if (want_cookie) {
1544                tcp_openreq_free(req);
1545        } else {
1546                tcp_v4_synq_add(sk, req);
1547        }
1548        return 0;
1549
1550drop_and_free:
1551        tcp_openreq_free(req);
1552drop:
1553        TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1554        return 0;
1555}
1556
1557
1558/*
1559 * The three way handshake has completed - we got a valid synack -
1560 * now create the new socket.
1561 */
1562struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1563                                  struct open_request *req,
1564                                  struct dst_entry *dst)
1565{
1566        struct inet_opt *newinet;
1567        struct tcp_opt *newtp;
1568        struct sock *newsk;
1569
1570        if (sk_acceptq_is_full(sk))
1571                goto exit_overflow;
1572
1573        if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1574                goto exit;
1575
1576        newsk = tcp_create_openreq_child(sk, req, skb);
1577        if (!newsk)
1578                goto exit;
1579
1580        newsk->sk_dst_cache = dst;
1581        tcp_v4_setup_caps(newsk, dst);
1582
1583        newtp                 = tcp_sk(newsk);
1584        newinet               = inet_sk(newsk);
1585        newinet->daddr        = req->af.v4_req.rmt_addr;
1586        newinet->rcv_saddr    = req->af.v4_req.loc_addr;
1587        newinet->saddr        = req->af.v4_req.loc_addr;
1588        newinet->opt          = req->af.v4_req.opt;
1589        req->af.v4_req.opt    = NULL;
1590        newinet->mc_index     = tcp_v4_iif(skb);
1591        newinet->mc_ttl       = skb->nh.iph->ttl;
1592        newtp->ext_header_len = 0;
1593        if (newinet->opt)
1594                newtp->ext_header_len = newinet->opt->optlen;
1595        newtp->ext2_header_len = dst->header_len;
1596        newinet->id = newtp->write_seq ^ jiffies;
1597
1598        tcp_sync_mss(newsk, dst_pmtu(dst));
1599        newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1600        tcp_initialize_rcv_mss(newsk);
1601
1602        __tcp_v4_hash(newsk, 0);
1603        __tcp_inherit_port(sk, newsk);
1604
1605        return newsk;
1606
1607exit_overflow:
1608        NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1609exit:
1610        NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1611        dst_release(dst);
1612        return NULL;
1613}
1614
1615static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1616{
1617        struct tcphdr *th = skb->h.th;
1618        struct iphdr *iph = skb->nh.iph;
1619        struct tcp_opt *tp = tcp_sk(sk);
1620        struct sock *nsk;
1621        struct open_request **prev;
1622        /* Find possible connection requests. */
1623        struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
1624                                                     iph->saddr, iph->daddr);
1625        if (req)
1626                return tcp_check_req(sk, skb, req, prev);
1627
1628        nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1629                                          th->source,
1630                                          skb->nh.iph->daddr,
1631                                          ntohs(th->dest),
1632                                          tcp_v4_iif(skb));
1633
1634        if (nsk) {
1635                if (nsk->sk_state != TCP_TIME_WAIT) {
1636                        bh_lock_sock(nsk);
1637                        return nsk;
1638                }
1639                tcp_tw_put((struct tcp_tw_bucket *)nsk);
1640                return NULL;
1641        }
1642
1643#ifdef CONFIG_SYN_COOKIES
1644        if (!th->rst && !th->syn && th->ack)
1645                sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1646#endif
1647        return sk;
1648}
1649
1650static int tcp_v4_checksum_init(struct sk_buff *skb)
1651{
1652        if (skb->ip_summed == CHECKSUM_HW) {
1653                skb->ip_summed = CHECKSUM_UNNECESSARY;
1654                if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1655                                  skb->nh.iph->daddr, skb->csum))
1656                        return 0;
1657
1658                NETDEBUG(if (net_ratelimit())
1659                                printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1660                skb->ip_summed = CHECKSUM_NONE;
1661        }
1662        if (skb->len <= 76) {
1663                if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1664                                 skb->nh.iph->daddr,
1665                                 skb_checksum(skb, 0, skb->len, 0)))
1666                        return -1;
1667                skb->ip_summed = CHECKSUM_UNNECESSARY;
1668        } else {
1669                skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1670                                          skb->nh.iph->saddr,
1671                                          skb->nh.iph->daddr, 0);
1672        }
1673        return 0;
1674}
1675
1676
1677/* Process one received segment for socket sk.
1678 *
 * The socket must have its spinlock held when we get here.
1679 *
1680 * We have a potential double-lock case here, so even when
1681 * doing backlog processing we use the BH locking scheme.
1682 * This is because we cannot sleep with the original spinlock
1683 * held.
 *
 * ESTABLISHED sockets go straight to tcp_rcv_established().  For every
 * other state the segment length and checksum are verified first; a
 * LISTEN socket is then resolved through tcp_v4_hnd_req(), which may
 * hand the segment to a different (child) socket.  Anything left is fed
 * to tcp_rcv_state_process().
 *
 * Always returns 0.  On the reset, discard and csum_err paths the skb
 * is freed here with kfree_skb().
1684 */
1685int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1686{
1687        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1688                TCP_CHECK_TIMER(sk);
1689                if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1690                        goto reset;
1691                TCP_CHECK_TIMER(sk);
1692                return 0;
1693        }
1694
        /* Slow path: reject truncated headers and bad checksums. */
1695        if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1696                goto csum_err;
1697
1698        if (sk->sk_state == TCP_LISTEN) {
1699                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1700                if (!nsk)
1701                        goto discard;
1702
                /* A different socket was selected: let it process the skb. */
1703                if (nsk != sk) {
1704                        if (tcp_child_process(sk, nsk, skb))
1705                                goto reset;
1706                        return 0;
1707                }
1708        }
1709
1710        TCP_CHECK_TIMER(sk);
1711        if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1712                goto reset;
1713        TCP_CHECK_TIMER(sk);
1714        return 0;
1715
1716reset:
1717        tcp_v4_send_reset(skb);
1718discard:
1719        kfree_skb(skb);
1720        /* Be careful here. If this function gets more complicated and
1721         * gcc suffers from register pressure on the x86, sk (in %ebx)
1722         * might be destroyed here. This current version compiles correctly,
1723         * but you have been warned.
1724         */
1725        return 0;
1726
1727csum_err:
1728        TCP_INC_STATS_BH(TCP_MIB_INERRS);
1729        goto discard;
1730}
1731
1732/*
1733 *      From tcp_input.c
1734 */
1735
/*
 * IPv4 receive entry point for a TCP segment.
 *
 * Segments whose pkt_type is not PACKET_HOST are dropped.  The TCP
 * header is pulled and its data offset checked, checksum state is
 * initialised, TCP_SKB_CB(skb) is filled in and the owning socket is
 * looked up with __tcp_v4_lookup().
 *
 *  - Socket found, not TIME_WAIT: after the xfrm policy check and
 *    sk_filter(), the segment is either processed now (tcp_prequeue() /
 *    tcp_v4_do_rcv()) or, if user context owns the socket, added to the
 *    backlog.  sock_put(sk) is called on every exit from this path.
 *  - TIME_WAIT bucket: handled under do_time_wait.
 *  - No socket: a reset is sent unless the segment is too short or
 *    fails its checksum, in which case TCP_MIB_INERRS is bumped.
 *
 * Returns the value of tcp_v4_do_rcv() when that is called directly,
 * 0 on every other path.
 */
1736int tcp_v4_rcv(struct sk_buff *skb)
1737{
1738        struct tcphdr *th;
1739        struct sock *sk;
1740        int ret;
1741
1742        if (skb->pkt_type != PACKET_HOST)
1743                goto discard_it;
1744
1745        /* Count it even if it's bad */
1746        TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1747
1748        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1749                goto discard_it;
1750
1751        th = skb->h.th;
1752
        /* doff counts 32-bit words; it must cover at least the base header. */
1753        if (th->doff < sizeof(struct tcphdr) / 4)
1754                goto bad_packet;
1755        if (!pskb_may_pull(skb, th->doff * 4))
1756                goto discard_it;
1757
1758        /* An explanation is required here, I think.
1759         * Packet length and doff are validated by header prediction,
1760         * provided case of th->doff==0 is eliminated.
1761         * So, we defer the checks. */
1762        if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1763             tcp_v4_checksum_init(skb) < 0))
1764                goto bad_packet;
1765
        /* NOTE(review): th is reloaded here, presumably because the
         * pskb_may_pull() above can relocate the header - confirm.
         */
1766        th = skb->h.th;
1767        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
        /* end_seq = seq + payload length, plus one each for SYN and FIN. */
1768        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1769                                    skb->len - th->doff * 4);
1770        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1771        TCP_SKB_CB(skb)->when    = 0;
        /* The control block's flags field carries the IP TOS byte here. */
1772        TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1773        TCP_SKB_CB(skb)->sacked  = 0;
1774
1775        sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1776                             skb->nh.iph->daddr, ntohs(th->dest),
1777                             tcp_v4_iif(skb));
1778
1779        if (!sk)
1780                goto no_tcp_socket;
1781
        /* Also re-entered from do_time_wait with sk replaced by a listener. */
1782process:
1783        if (sk->sk_state == TCP_TIME_WAIT)
1784                goto do_time_wait;
1785
1786        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1787                goto discard_and_relse;
1788
1789        if (sk_filter(sk, skb, 0))
1790                goto discard_and_relse;
1791
1792        skb->dev = NULL;
1793
1794        bh_lock_sock(sk);
1795        ret = 0;
        /* Socket not owned by user context: process now unless
         * tcp_prequeue() took the segment (non-zero return).
         * Otherwise park it on the backlog.
         */
1796        if (!sock_owned_by_user(sk)) {
1797                if (!tcp_prequeue(sk, skb))
1798                        ret = tcp_v4_do_rcv(sk, skb);
1799        } else
1800                sk_add_backlog(sk, skb);
1801        bh_unlock_sock(sk);
1802
1803        sock_put(sk);
1804
1805        return ret;
1806
1807no_tcp_socket:
1808        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1809                goto discard_it;
1810
        /* bad_packet sits inside the true branch on purpose: header and
         * checksum failures detected above share the INERRS accounting
         * and skip the reset.
         */
1811        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1812bad_packet:
1813                TCP_INC_STATS_BH(TCP_MIB_INERRS);
1814        } else {
1815                tcp_v4_send_reset(skb);
1816        }
1817
1818discard_it:
1819        /* Discard frame. */
1820        kfree_skb(skb);
1821        return 0;
1822
1823discard_and_relse:
1824        sock_put(sk);
1825        goto discard_it;
1826
        /* sk is really a struct tcp_tw_bucket from here on. */
1827do_time_wait:
1828        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1829                tcp_tw_put((struct tcp_tw_bucket *) sk);
1830                goto discard_it;
1831        }
1832
1833        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1834                TCP_INC_STATS_BH(TCP_MIB_INERRS);
1835                tcp_tw_put((struct tcp_tw_bucket *) sk);
1836                goto discard_it;
1837        }
1838        switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1839                                           skb, th, skb->len)) {
1840        case TCP_TW_SYN: {
                /* If a listener exists for the destination, retire the
                 * TIME_WAIT bucket and restart processing on the listener.
                 */
1841                struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1842                                                          ntohs(th->dest),
1843                                                          tcp_v4_iif(skb));
1844                if (sk2) {
1845                        tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1846                        tcp_tw_put((struct tcp_tw_bucket *)sk);
1847                        sk = sk2;
1848                        goto process;
1849                }
1850                /* Fall through to ACK */
1851        }
1852        case TCP_TW_ACK:
1853                tcp_v4_timewait_ack(sk, skb);
1854                break;
1855        case TCP_TW_RST:
1856                goto no_tcp_socket;
1857        case TCP_TW_SUCCESS:;
1858        }
1859        goto discard_it;
1860}
1861
1862/* Re-insert sk into its protocol's hash tables by calling the
 * protocol's unhash() and then hash() operations.
 *
 * With per-bucket locks a rehash is not atomic anyway, so doing it as
1863 * two separate steps is no worse.
1864 */
1865static void __tcp_v4_rehash(struct sock *sk)
1866{
1867        sk->sk_prot->unhash(sk);
1868        sk->sk_prot->hash(sk);
1869}
1870
1871static int tcp_v4_reselect_saddr(struct sock *sk)
1872{
1873        struct inet_opt *inet = inet_sk(sk);
1874        int err;
1875        struct rtable *rt;
1876        __u32 old_saddr = inet->saddr;
1877        __u32 new_saddr;
1878        __u32 daddr = inet->daddr;
1879
1880        if (inet->opt && inet->opt->srr)
1881                daddr = inet->opt->faddr;
1882
1883        /* Query new route. */
1884        err = ip_route_connect(&rt, daddr, 0,
1885                               RT_TOS(inet->tos) | sk->sk_localroute,
1886                               sk->sk_bound_dev_if,
1887                               IPPROTO_TCP,
1888                               inet->sport, inet->dport, sk);
1889        if (err)
1890                return err;
1891
1892        __sk_dst_set(sk, &rt->u.dst);
1893        tcp_v4_setup_caps(sk, &rt->u.dst);
1894        tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
1895
1896        new_saddr = rt->rt_src;
1897
1898        if (new_saddr == old_saddr)
1899                return 0;
1900
1901        if (sysctl_ip_dynaddr > 1) {
1902                printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1903                                 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1904                       NIPQUAD(old_saddr),
1905                       NIPQUAD(new_saddr));
1906        }
1907
1908        inet->saddr = new_saddr;
1909        inet->rcv_saddr = new_saddr;
1910
1911        /* XXX The only one ugly spot where we need to
1912         * XXX really change the sockets identity after
1913         * XXX it has entered the hashes. -DaveM
1914         *
1915         * Besides that, it does not check for connection
1916         * uniqueness. Wait for troubles.
1917         */
1918        __tcp_v4_rehash(sk);
1919        return 0;
1920}
1921
1922int tcp_v4_rebuild_header(struct sock *sk)
1923{
1924        struct inet_opt *inet = inet_sk(sk);
1925        struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1926        u32 daddr;
1927        int err;
1928
1929        /* Route is OK, nothing to do. */
1930        if (rt)
1931                return 0;
1932
1933        /* Reroute. */
1934        daddr = inet->daddr;
1935        if (inet->opt && inet->opt->srr)
1936                daddr = inet->opt->faddr;
1937
1938        {
1939                struct flowi fl = { .oif = sk->sk_bound_dev_if,
1940                                    .nl_u = { .ip4_u =
1941                                              { .daddr = daddr,
1942                                                .saddr = inet->saddr,
1943                                                .tos = RT_CONN_FLAGS(sk) } },
1944                                    .proto = IPPROTO_TCP,
1945                                    .uli_u = { .ports =
1946                                               { .sport = inet->sport,
1947                                                 .dport = inet->dport } } };
1948                                                
1949                err = ip_route_output_flow(&rt, &fl, sk, 0);
1950        }
1951        if (!err) {
1952                __sk_dst_set(sk, &rt->u.dst);
1953                tcp_v4_setup_caps(sk, &rt->u.dst);
1954                tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
1955                return 0;
1956        }
1957
1958        /* Routing failed... */
1959        sk->sk_route_caps = 0;
1960
1961        if (!sysctl_ip_dynaddr ||
1962            sk->sk_state != TCP_SYN_SENT ||
1963            (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1964            (err = tcp_v4_reselect_saddr(sk)) != 0)
1965                sk->sk_err_soft = -err;
1966
1967        return err;
1968}
1969
1970static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1971{
1972        struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1973        struct inet_opt *inet = inet_sk(sk);
1974
1975        sin->sin_family         = AF_INET;
1976        sin->sin_addr.s_addr    = inet->daddr;
1977        sin->sin_port           = inet->dport;
1978}
1979
1980/* VJ's idea. Save last timestamp seen from this destination
1981 * and hold it at least for normal timewait interval to use for duplicate
1982 * segment detection in subsequent connections, before they enter synchronized
1983 * state.
1984 */
1985
1986int tcp_v4_remember_stamp(struct sock *sk)
1987{
1988        struct inet_opt *inet = inet_sk(sk);
1989        struct tcp_opt *tp = tcp_sk(sk);
1990        struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1991        struct inet_peer *peer = NULL;
1992        int release_it = 0;
1993
1994        if (!rt || rt->rt_dst != inet->daddr) {
1995                peer = inet_getpeer(inet->daddr, 1);
1996                release_it = 1;
1997        } else {
1998                if (!rt->peer)
1999                        rt_bind_peer(rt, 1);
2000                peer = rt->peer;
2001        }
2002
2003        if (peer) {
2004                if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
2005                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2006                     peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
2007                        peer->tcp_ts_stamp = tp->ts_recent_stamp;
2008                        peer->tcp_ts = tp->ts_recent;
2009                }
2010                if (release_it)
2011                        inet_putpeer(peer);
2012                return 1;
2013        }
2014
2015        return 0;
2016}
2017
2018int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2019{
2020        struct inet_peer *peer = NULL;
2021
2022        peer = inet_getpeer(tw->tw_daddr, 1);
2023
2024        if (peer) {
2025                if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
2026                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2027                     peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2028                        peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2029                        peer->tcp_ts = tw->tw_ts_recent;
2030                }
2031                inet_putpeer(peer);
2032                return 1;
2033        }
2034
2035        return 0;
2036}
2037
/*
 * Address-family operations table for TCP over IPv4.
 * tcp_v4_init_sock() installs it as tp->af_specific.
 */
2038struct tcp_func ipv4_specific = {
2039        .queue_xmit     =       ip_queue_xmit,
2040        .send_check     =       tcp_v4_send_check,
2041        .rebuild_header =       tcp_v4_rebuild_header,
2042        .conn_request   =       tcp_v4_conn_request,
2043        .syn_recv_sock  =       tcp_v4_syn_recv_sock,
2044        .remember_stamp =       tcp_v4_remember_stamp,
2045        .net_header_len =       sizeof(struct iphdr),
2046        .setsockopt     =       ip_setsockopt,
2047        .getsockopt     =       ip_getsockopt,
2048        .addr2sockaddr  =       v4_addr2sockaddr,
2049        .sockaddr_len   =       sizeof(struct sockaddr_in),
2050};
2051
2052/* NOTE: A lot of things set to zero explicitly by call to
2053 *       sk_alloc() so need not be done here.
2054 */
2055static int tcp_v4_init_sock(struct sock *sk)
2056{
2057        struct tcp_opt *tp = tcp_sk(sk);
2058
2059        skb_queue_head_init(&tp->out_of_order_queue);
2060        tcp_init_xmit_timers(sk);
2061        tcp_prequeue_init(tp);
2062
2063        tp->rto  = TCP_TIMEOUT_INIT;
2064        tp->mdev = TCP_TIMEOUT_INIT;
2065
2066        /* So many TCP implementations out there (incorrectly) count the
2067         * initial SYN frame in their delayed-ACK and congestion control
2068         * algorithms that we must have the following bandaid to talk
2069         * efficiently to them.  -DaveM
2070         */
2071        tp->snd_cwnd = 2;
2072
2073        /* See draft-stevens-tcpca-spec-01 for discussion of the
2074         * initialization of these values.
2075         */
2076        tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
2077        tp->snd_cwnd_clamp = ~0;
2078        tp->mss_cache = 536;
2079
2080        tp->reordering = sysctl_tcp_reordering;
2081
2082        sk->sk_state = TCP_CLOSE;
2083
2084        sk->sk_write_space = sk_stream_write_space;
2085        sk->sk_use_write_queue = 1;
2086
2087        tp->af_specific = &ipv4_specific;
2088
2089        sk->sk_sndbuf = sysctl_tcp_wmem[1];
2090        sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2091
2092        atomic_inc(&tcp_sockets_allocated);
2093
2094        return 0;
2095}
2096
/*
 * Release the TCP-specific resources of sk: stop the transmit timers,
 * purge the write, out-of-order and prequeue queues, drop the bind
 * bucket reference if one is held, free the cached sendmsg page and
 * decrement tcp_sockets_allocated.
 *
 * Always returns 0.
 */
2097int tcp_v4_destroy_sock(struct sock *sk)
2098{
2099        struct tcp_opt *tp = tcp_sk(sk);
2100
2101        tcp_clear_xmit_timers(sk);
2102
2103        /* Clean up the write buffer. */
2104        sk_stream_writequeue_purge(sk);
2105
2106        /* Cleans up our, hopefully empty, out_of_order_queue. */
2107        __skb_queue_purge(&tp->out_of_order_queue);
2108
2109        /* Clean the prequeue; it really should be empty already. */
2110        __skb_queue_purge(&tp->ucopy.prequeue);
2111
2112        /* Clean up a referenced TCP bind bucket. */
2113        if (tp->bind_hash)
2114                tcp_put_port(sk);
2115
2116        /*
2117         * If sendmsg cached page exists, toss it.
2118         */
2119        if (sk->sk_sndmsg_page) {
2120                __free_page(sk->sk_sndmsg_page);
2121                sk->sk_sndmsg_page = NULL;
2122        }
2123
        /* Balances the atomic_inc() done in tcp_v4_init_sock(). */
2124        atomic_dec(&tcp_sockets_allocated);
2125
2126        return 0;
2127}
2128
2129EXPORT_SYMBOL(tcp_v4_destroy_sock);
2130
2131#ifdef CONFIG_PROC_FS
2132/* Proc filesystem TCP sock list dumping. */
2133
2134static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2135{
2136        return hlist_empty(head) ? NULL :
2137                list_entry(head->first, struct tcp_tw_bucket, tw_node);
2138}
2139
2140static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2141{
2142        return tw->tw_node.next ?
2143                hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2144}
2145
2146static void *listening_get_next(struct seq_file *seq, void *cur)
2147{
2148        struct tcp_opt *tp;
2149        struct hlist_node *node;
2150        struct sock *sk = cur;
2151        struct tcp_iter_state* st = seq->private;
2152
2153        if (!sk) {
2154                st->bucket = 0;
2155                sk = sk_head(&tcp_listening_hash[0]);
2156                goto get_sk;
2157        }
2158
2159        ++st->num;
2160
2161        if (st->state == TCP_SEQ_STATE_OPENREQ) {
2162                struct open_request *req = cur;
2163
2164                tp = tcp_sk(st->syn_wait_sk);
2165                req = req->dl_next;
2166                while (1) {
2167                        while (req) {
2168                                if (req->class->family == st->family) {
2169                                        cur = req;
2170                                        goto out;
2171                                }
2172                                req = req->dl_next;
2173                        }
2174                        if (++st->sbucket >= TCP_SYNQ_HSIZE)
2175                                break;
2176get_req:
2177                        req = tp->listen_opt->syn_table[st->sbucket];
2178                }
2179                sk        = sk_next(st->syn_wait_sk);
2180                st->state = TCP_SEQ_STATE_LISTENING;
2181                read_unlock_bh(&tp->syn_wait_lock);
2182        } else
2183                sk = sk_next(sk);
2184get_sk:
2185        sk_for_each_from(sk, node) {
2186                if (sk->sk_family == st->family) {
2187                        cur = sk;
2188                        goto out;
2189                }
2190                tp = tcp_sk(sk);
2191                read_lock_bh(&tp->syn_wait_lock);
2192                if (tp->listen_opt && tp->listen_opt->qlen) {
2193                        st->uid         = sock_i_uid(sk);
2194                        st->syn_wait_sk = sk;
2195                        st->state       = TCP_SEQ_STATE_OPENREQ;
2196                        st->sbucket     = 0;
2197                        goto get_req;
2198                }
2199                read_unlock_bh(&tp->syn_wait_lock);
2200        }
2201        if (++st->bucket < TCP_LHTABLE_SIZE) {
2202                sk = sk_head(&tcp_listening_hash[st->bucket]);
2203                goto get_sk;
2204        }
2205        cur = NULL;
2206out:
2207        return cur;
2208}
2209
2210static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2211{
2212        void *rc = listening_get_next(seq, NULL);
2213
2214        while (rc && *pos) {
2215                rc = listening_get_next(seq, rc);
2216                --*pos;
2217        }
2218        return rc;
2219}
2220
2221static void *established_get_first(struct seq_file *seq)
2222{
2223        struct tcp_iter_state* st = seq->private;
2224        void *rc = NULL;
2225
2226        for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2227                struct sock *sk;
2228                struct hlist_node *node;
2229                struct tcp_tw_bucket *tw;
2230               
2231                read_lock(&tcp_ehash[st->bucket].lock);
2232                sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2233                        if (sk->sk_family != st->family) {
2234                                continue;
2235                        }
2236                        rc = sk;
2237                        goto out;
2238                }
2239                st->state = TCP_SEQ_STATE_TIME_WAIT;
2240                tw_for_each(tw, node,
2241                            &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2242                        if (tw->tw_family != st->family) {
2243                                continue;
2244                        }
2245                        rc = tw;
2246                        goto out;
2247                }
2248                read_unlock(&tcp_ehash[st->bucket].lock);
2249                st->state = TCP_SEQ_STATE_ESTABLISHED;
2250        }
2251out:
2252        return rc;
2253}
2254
/*
 * Advance the iterator over the established hash.
 *
 * cur is the entry returned previously: a struct sock while st->state
 * is TCP_SEQ_STATE_ESTABLISHED, a struct tcp_tw_bucket while it is
 * TCP_SEQ_STATE_TIME_WAIT.  Within a bucket the established chain is
 * walked first, then the TIME_WAIT chain; when both are exhausted the
 * bucket lock is dropped and the next bucket is locked.
 *
 * Returns the next entry of family st->family, with the lock of bucket
 * st->bucket read-held, or NULL with no bucket lock held once the last
 * bucket has been walked.
 */
2255static void *established_get_next(struct seq_file *seq, void *cur)
2256{
2257        struct sock *sk = cur;
2258        struct tcp_tw_bucket *tw;
2259        struct hlist_node *node;
2260        struct tcp_iter_state* st = seq->private;
2261
2262        ++st->num;
2263
2264        if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2265                tw = cur;
2266                tw = tw_next(tw);
                /* Also entered from below with tw at the head of the
                 * current bucket's TIME_WAIT chain.
                 */
2267get_tw:
2268                while (tw && tw->tw_family != st->family) {
2269                        tw = tw_next(tw);
2270                }
2271                if (tw) {
2272                        cur = tw;
2273                        goto out;
2274                }
                /* Bucket finished: release it and move to the next one. */
2275                read_unlock(&tcp_ehash[st->bucket].lock);
2276                st->state = TCP_SEQ_STATE_ESTABLISHED;
2277                if (++st->bucket < tcp_ehash_size) {
2278                        read_lock(&tcp_ehash[st->bucket].lock);
2279                        sk = sk_head(&tcp_ehash[st->bucket].chain);
2280                } else {
2281                        cur = NULL;
2282                        goto out;
2283                }
2284        } else
2285                sk = sk_next(sk);
2286
2287        sk_for_each_from(sk, node) {
2288                if (sk->sk_family == st->family)
2289                        goto found;
2290        }
2291
        /* Established chain exhausted: switch to the TIME_WAIT chain of
         * the same bucket.
         */
2292        st->state = TCP_SEQ_STATE_TIME_WAIT;
2293        tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2294        goto get_tw;
2295found:
2296        cur = sk;
2297out:
2298        return cur;
2299}
2300
2301static void *established_get_idx(struct seq_file *seq, loff_t pos)
2302{
2303        void *rc = established_get_first(seq);
2304
2305        while (rc && pos) {
2306                rc = established_get_next(seq, rc);
2307                --pos;
2308        }               
2309        return rc;
2310}
2311
2312static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2313{
2314        void *rc;
2315        struct tcp_iter_state* st = seq->private;
2316
2317        tcp_listen_lock();
2318        st->state = TCP_SEQ_STATE_LISTENING;
2319        rc        = listening_get_idx(seq, &pos);
2320
2321        if (!rc) {
2322                tcp_listen_unlock();
2323                local_bh_disable();
2324                st->state = TCP_SEQ_STATE_ESTABLISHED;
2325                rc        = established_get_idx(seq, pos);
2326        }
2327
2328        return rc;
2329}
2330
2331static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2332{
2333        struct tcp_iter_state* st = seq->private;
2334        st->state = TCP_SEQ_STATE_LISTENING;
2335        st->num = 0;
2336        return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2337}
2338
2339static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2340{
2341        void *rc = NULL;
2342        struct tcp_iter_state* st;
2343
2344        if (v == SEQ_START_TOKEN) {
2345                rc = tcp_get_idx(seq, 0);
2346                goto out;
2347        }
2348        st = seq->private;
2349
2350        switch (st->state) {
2351        case TCP_SEQ_STATE_OPENREQ:
2352        case TCP_SEQ_STATE_LISTENING:
2353                rc = listening_get_next(seq, v);
2354                if (!rc) {
2355                        tcp_listen_unlock();
2356                        local_bh_disable();
2357                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2358                        rc        = established_get_first(seq);
2359                }
2360                break;
2361        case TCP_SEQ_STATE_ESTABLISHED:
2362        case TCP_SEQ_STATE_TIME_WAIT:
2363                rc = established_get_next(seq, v);
2364                break;
2365        }
2366out:
2367        ++*pos;
2368        return rc;
2369}
2370
/*
 * seq_file ->stop: release whatever the iterator still holds, based on
 * the state it stopped in.
 *
 *  - OPENREQ: the syn_wait_lock of st->syn_wait_sk (only if v is
 *    non-NULL), then the same as LISTENING.
 *  - LISTENING: the listen lock, unless v is SEQ_START_TOKEN.
 *  - TIME_WAIT / ESTABLISHED: the lock of bucket st->bucket (only if v
 *    is non-NULL), then bottom halves are re-enabled.
 */
2371static void tcp_seq_stop(struct seq_file *seq, void *v)
2372{
2373        struct tcp_iter_state* st = seq->private;
2374
2375        switch (st->state) {
2376        case TCP_SEQ_STATE_OPENREQ:
2377                if (v) {
2378                        struct tcp_opt *tp = tcp_sk(st->syn_wait_sk);
2379                        read_unlock_bh(&tp->syn_wait_lock);
2380                }
                /* fall through: the listen lock is held in this state too */
2381        case TCP_SEQ_STATE_LISTENING:
2382                if (v != SEQ_START_TOKEN)
2383                        tcp_listen_unlock();
2384                break;
2385        case TCP_SEQ_STATE_TIME_WAIT:
2386        case TCP_SEQ_STATE_ESTABLISHED:
2387                if (v)
2388                        read_unlock(&tcp_ehash[st->bucket].lock);
2389                local_bh_enable();
2390                break;
2391        }
2392}
2393
2394static int tcp_seq_open(struct inode *inode, struct file *file)
2395{
2396        struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2397        struct seq_file *seq;
2398        struct tcp_iter_state *s;
2399        int rc;
2400
2401        if (unlikely(afinfo == NULL))
2402                return -EINVAL;
2403
2404        s = kmalloc(sizeof(*s), GFP_KERNEL);
2405        if (!s)
2406                return -ENOMEM;
2407        memset(s, 0, sizeof(*s));
2408        s->family               = afinfo->family;
2409        s->seq_ops.start        = tcp_seq_start;
2410        s->seq_ops.next         = tcp_seq_next;
2411        s->seq_ops.show         = afinfo->seq_show;
2412        s->seq_ops.stop         = tcp_seq_stop;
2413
2414        rc = seq_open(file, &s->seq_ops);
2415        if (rc)
2416                goto out_kfree;
2417        seq          = file->private_data;
2418        seq->private = s;
2419out:
2420        return rc;
2421out_kfree:
2422        kfree(s);
2423        goto out;
2424}
2425
2426int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2427{
2428        int rc = 0;
2429        struct proc_dir_entry *p;
2430
2431        if (!afinfo)
2432                return -EINVAL;
2433        afinfo->seq_fops->owner         = afinfo->owner;
2434        afinfo->seq_fops->open          = tcp_seq_open;
2435        afinfo->seq_fops->read          = seq_read;
2436        afinfo->seq_fops->llseek        = seq_lseek;
2437        afinfo->seq_fops->release       = seq_release_private;
2438        
2439        p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2440        if (p)
2441                p->data = afinfo;
2442        else
2443                rc = -ENOMEM;
2444        return rc;
2445}
2446
2447void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2448{
2449        if (!afinfo)
2450                return;
2451        proc_net_remove(afinfo->name);
2452        memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
2453}
2454
2455static void get_openreq4(struct sock *sk, struct open_request *req,
2456                         char *tmpbuf, int i, int uid)
2457{
2458        int ttd = req->expires - jiffies;
2459
2460        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2461                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2462                i,
2463                req->af.v4_req.loc_addr,
2464                ntohs(inet_sk(sk)->sport),
2465                req->af.v4_req.rmt_addr,
2466                ntohs(req->rmt_port),
2467                TCP_SYN_RECV,
2468                0, 0, /* could print option size, but that is af dependent. */
2469                1,    /* timers active (only the expire timer) */
2470                jiffies_to_clock_t(ttd),
2471                req->retrans,
2472                uid,
2473                0,  /* non standard timer */
2474                0, /* open_requests have no inode */
2475                atomic_read(&sk->sk_refcnt),
2476                req);
2477}
2478
2479static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2480{
2481        int timer_active;
2482        unsigned long timer_expires;
2483        struct tcp_opt *tp = tcp_sk(sp);
2484        struct inet_opt *inet = inet_sk(sp);
2485        unsigned int dest = inet->daddr;
2486        unsigned int src = inet->rcv_saddr;
2487        __u16 destp = ntohs(inet->dport);
2488        __u16 srcp = ntohs(inet->sport);
2489
2490        if (tp->pending == TCP_TIME_RETRANS) {
2491                timer_active    = 1;
2492                timer_expires   = tp->timeout;
2493        } else if (tp->pending == TCP_TIME_PROBE0) {
2494                timer_active    = 4;
2495                timer_expires   = tp->timeout;
2496        } else if (timer_pending(&sp->sk_timer)) {
2497                timer_active    = 2;
2498                timer_expires   = sp->sk_timer.expires;
2499        } else {
2500                timer_active    = 0;
2501                timer_expires = jiffies;
2502        }
2503
2504        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2505                        "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2506                i, src, srcp, dest, destp, sp->sk_state,
2507                tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2508                timer_active,
2509                jiffies_to_clock_t(timer_expires - jiffies),
2510                tp->retransmits,
2511                sock_i_uid(sp),
2512                tp->probes_out,
2513                sock_i_ino(sp),
2514                atomic_read(&sp->sk_refcnt), sp,
2515                tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2516                tp->snd_cwnd,
2517                tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2518}
2519
2520static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2521{
2522        unsigned int dest, src;
2523        __u16 destp, srcp;
2524        int ttd = tw->tw_ttd - jiffies;
2525
2526        if (ttd < 0)
2527                ttd = 0;
2528
2529        dest  = tw->tw_daddr;
2530        src   = tw->tw_rcv_saddr;
2531        destp = ntohs(tw->tw_dport);
2532        srcp  = ntohs(tw->tw_sport);
2533
2534        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2535                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2536                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2537                3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2538                atomic_read(&tw->tw_refcnt), tw);
2539}
2540
2541#define TMPSZ 150
2542
2543static int tcp4_seq_show(struct seq_file *seq, void *v)
2544{
2545        struct tcp_iter_state* st;
2546        char tmpbuf[TMPSZ + 1];
2547
2548        if (v == SEQ_START_TOKEN) {
2549                seq_printf(seq, "%-*s\n", TMPSZ - 1,
2550                           "  sl  local_address rem_address   st tx_queue "
2551                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2552                           "inode");
2553                goto out;
2554        }
2555        st = seq->private;
2556
2557        switch (st->state) {
2558        case TCP_SEQ_STATE_LISTENING:
2559        case TCP_SEQ_STATE_ESTABLISHED:
2560                get_tcp4_sock(v, tmpbuf, st->num);
2561                break;
2562        case TCP_SEQ_STATE_OPENREQ:
2563                get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2564                break;
2565        case TCP_SEQ_STATE_TIME_WAIT:
2566                get_timewait4_sock(v, tmpbuf, st->num);
2567                break;
2568        }
2569        seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2570out:
2571        return 0;
2572}
2573
2574static struct file_operations tcp4_seq_fops;
2575static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2576        .owner          = THIS_MODULE,
2577        .name           = "tcp",
2578        .family         = AF_INET,
2579        .seq_show       = tcp4_seq_show,
2580        .seq_fops       = &tcp4_seq_fops,
2581};
2582
2583int __init tcp4_proc_init(void)
2584{
2585        return tcp_proc_register(&tcp4_seq_afinfo);
2586}
2587
2588void tcp4_proc_exit(void)
2589{
2590        tcp_proc_unregister(&tcp4_seq_afinfo);
2591}
2592#endif /* CONFIG_PROC_FS */
2593
2594struct proto tcp_prot = {
2595        .name                   = "TCP",
2596        .close                  = tcp_close,
2597        .connect                = tcp_v4_connect,
2598        .disconnect             = tcp_disconnect,
2599        .accept                 = tcp_accept,
2600        .ioctl                  = tcp_ioctl,
2601        .init                   = tcp_v4_init_sock,
2602        .destroy                = tcp_v4_destroy_sock,
2603        .shutdown               = tcp_shutdown,
2604        .setsockopt             = tcp_setsockopt,
2605        .getsockopt             = tcp_getsockopt,
2606        .sendmsg                = tcp_sendmsg,
2607        .recvmsg                = tcp_recvmsg,
2608        .backlog_rcv            = tcp_v4_do_rcv,
2609        .hash                   = tcp_v4_hash,
2610        .unhash                 = tcp_unhash,
2611        .get_port               = tcp_v4_get_port,
2612        .enter_memory_pressure  = tcp_enter_memory_pressure,
2613        .sockets_allocated      = &tcp_sockets_allocated,
2614        .memory_allocated       = &tcp_memory_allocated,
2615        .memory_pressure        = &tcp_memory_pressure,
2616        .sysctl_mem             = sysctl_tcp_mem,
2617        .sysctl_wmem            = sysctl_tcp_wmem,
2618        .sysctl_rmem            = sysctl_tcp_rmem,
2619        .max_header             = MAX_TCP_HEADER,
2620};
2621
2622
2623
2624void __init tcp_v4_init(struct net_proto_family *ops)
2625{
2626        int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2627        if (err < 0)
2628                panic("Failed to create the TCP control socket.\n");
2629        tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2630        inet_sk(tcp_socket->sk)->uc_ttl = -1;
2631
2632        /* Unhash it so that IP input processing does not even
2633         * see it, we do not wish this socket to see incoming
2634         * packets.
2635         */
2636        tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2637}
2638
/*
 * Symbols exported from this file, kept in alphabetical order within each
 * group.  The first group is exported unconditionally.
 */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_bind_hash);
EXPORT_SYMBOL(tcp_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_inherit_port);
EXPORT_SYMBOL(tcp_listen_wlock);
EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_lookup_listener);
EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* Exported only when CONFIG_PROC_FS is enabled. */
#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif

/* Exported only when CONFIG_SYSCTL is enabled. */
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_max_syn_backlog);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
#endif
2667
/* Listing footer: lxr.linux.no is kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995. */