linux-bk/net/ipv4/tcp_ipv4.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
   9 *
  10 *              IPv4 specific functions
  11 *
  12 *
  13 *              code split from:
  14 *              linux/ipv4/tcp.c
  15 *              linux/ipv4/tcp_input.c
  16 *              linux/ipv4/tcp_output.c
  17 *
  18 *              See tcp.c for author information
  19 *
  20 *      This program is free software; you can redistribute it and/or
  21 *      modify it under the terms of the GNU General Public License
  22 *      as published by the Free Software Foundation; either version
  23 *      2 of the License, or (at your option) any later version.
  24 */
  25
  26/*
  27 * Changes:
  28 *              David S. Miller :       New socket lookup architecture.
  29 *                                      This code is dedicated to John Dyson.
  30 *              David S. Miller :       Change semantics of established hash,
  31 *                                      half is devoted to TIME_WAIT sockets
  32 *                                      and the rest go in the other half.
  33 *              Andi Kleen :            Add support for syncookies and fixed
  34 *                                      some bugs: ip options weren't passed to
  35 *                                      the TCP layer, missed a check for an
  36 *                                      ACK bit.
  37 *              Andi Kleen :            Implemented fast path mtu discovery.
  38 *                                      Fixed many serious bugs in the
  39 *                                      open_request handling and moved
  40 *                                      most of it into the af independent code.
  41 *                                      Added tail drop and some other bugfixes.
  42 *                                      Added new listen semantics.
  43 *              Mike McLagan    :       Routing by source
  44 *      Juan Jose Ciarlante:            ip_dynaddr bits
  45 *              Andi Kleen:             various fixes.
  46 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  47 *                                      coma.
  48 *      Andi Kleen              :       Fix new listen.
  49 *      Andi Kleen              :       Fix accept error reporting.
  50 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  51 *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
  52 *                                      a single port at the same time.
  53 */
  54
  55#include <linux/config.h>
  56
  57#include <linux/types.h>
  58#include <linux/fcntl.h>
  59#include <linux/module.h>
  60#include <linux/random.h>
  61#include <linux/cache.h>
  62#include <linux/jhash.h>
  63#include <linux/init.h>
  64#include <linux/times.h>
  65
  66#include <net/icmp.h>
  67#include <net/tcp.h>
  68#include <net/ipv6.h>
  69#include <net/inet_common.h>
  70#include <net/xfrm.h>
  71
  72#include <linux/inet.h>
  73#include <linux/ipv6.h>
  74#include <linux/stddef.h>
  75#include <linux/proc_fs.h>
  76#include <linux/seq_file.h>
  77
  78extern int sysctl_ip_dynaddr;
    /* Consulted by __tcp_v4_check_established(): when a caller passes a
     * non-NULL twp, a matching TIME-WAIT bucket may only be taken over if
     * this is non-zero and the bucket's timestamp is more than a second old.
     */
  79int sysctl_tcp_tw_reuse;
  80int sysctl_tcp_low_latency;
  81
  82/* Check TCP sequence numbers in ICMP packets. */
  83#define ICMP_MIN_LENGTH 8
  84
  85/* Socket used for sending RSTs */
  86static struct socket *tcp_socket;
  87
    /* Forward declaration; the definition is not in this part of the file. */
  88void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
  89                       struct sk_buff *skb);
  90
    /* Global TCP hash-table state.  Only the listening-hash lock, its user
     * count and wait queue, and the port allocation lock are given explicit
     * initializers here; as an object with static storage duration, every
     * member not named below starts out zeroed.
     */
  91struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
  92        .__tcp_lhash_lock       =       RW_LOCK_UNLOCKED,
  93        .__tcp_lhash_users      =       ATOMIC_INIT(0),
  94        .__tcp_lhash_wait
  95          = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
  96        .__tcp_portalloc_lock   =       SPIN_LOCK_UNLOCKED
  97};
  98
  99/*
 100 * This array holds the first and last local port number.
 101 * For high-usage systems, use sysctl to change this to
 102 * 32768-61000
 103 */
 104int sysctl_local_port_range[2] = { 1024, 4999 };
    /* Where the last automatic port search stopped.  The searches in
     * tcp_v4_get_port() and tcp_v4_hash_connect() increment it before use,
     * so starting one below 1024 makes the first candidate port 1024.
     */
 105int tcp_port_rover = 1024 - 1;
 106
 107static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
 108                                 __u32 faddr, __u16 fport)
 109{
 110        int h = (laddr ^ lport) ^ (faddr ^ fport);
 111        h ^= h >> 16;
 112        h ^= h >> 8;
 113        return h & (tcp_ehash_size - 1);
 114}
 115
 116static __inline__ int tcp_sk_hashfn(struct sock *sk)
 117{
 118        struct inet_opt *inet = inet_sk(sk);
 119        __u32 laddr = inet->rcv_saddr;
 120        __u16 lport = inet->num;
 121        __u32 faddr = inet->daddr;
 122        __u16 fport = inet->dport;
 123
 124        return tcp_hashfn(laddr, lport, faddr, fport);
 125}
 126
 127/* Allocate and initialize a new TCP local port bind bucket.
 128 * The bindhash mutex for snum's hash chain must be held here.
 129 */
 130struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
 131                                          unsigned short snum)
 132{
 133        struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
 134                                                      SLAB_ATOMIC);
 135        if (tb) {
 136                tb->port = snum;
 137                tb->fastreuse = 0;
 138                INIT_HLIST_HEAD(&tb->owners);
 139                hlist_add_head(&tb->node, &head->chain);
 140        }
 141        return tb;
 142}
 143
 144/* Caller must hold hashbucket lock for this tb with local BH disabled */
 145void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
 146{
 147        if (hlist_empty(&tb->owners)) {
 148                __hlist_del(&tb->node);
 149                kmem_cache_free(tcp_bucket_cachep, tb);
 150        }
 151}
 152
 153/* Caller must disable local BH processing. */
 154static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
 155{
 156        struct tcp_bind_hashbucket *head =
 157                                &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
 158        struct tcp_bind_bucket *tb;
 159
 160        spin_lock(&head->lock);
 161        tb = tcp_sk(sk)->bind_hash;
 162        sk_add_bind_node(child, &tb->owners);
 163        tcp_sk(child)->bind_hash = tb;
 164        spin_unlock(&head->lock);
 165}
 166
 167inline void tcp_inherit_port(struct sock *sk, struct sock *child)
 168{
 169        local_bh_disable();
 170        __tcp_inherit_port(sk, child);
 171        local_bh_enable();
 172}
 173
 174void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
 175                   unsigned short snum)
 176{
 177        inet_sk(sk)->num = snum;
 178        sk_add_bind_node(sk, &tb->owners);
 179        tcp_sk(sk)->bind_hash = tb;
 180}
 181
 182static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
 183{
 184        const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
 185        struct sock *sk2;
 186        struct hlist_node *node;
 187        int reuse = sk->sk_reuse;
 188
 189        sk_for_each_bound(sk2, node, &tb->owners) {
 190                if (sk != sk2 &&
 191                    !tcp_v6_ipv6only(sk2) &&
 192                    (!sk->sk_bound_dev_if ||
 193                     !sk2->sk_bound_dev_if ||
 194                     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
 195                        if (!reuse || !sk2->sk_reuse ||
 196                            sk2->sk_state == TCP_LISTEN) {
 197                                const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
 198                                if (!sk2_rcv_saddr || !sk_rcv_saddr ||
 199                                    sk2_rcv_saddr == sk_rcv_saddr)
 200                                        break;
 201                        }
 202                }
 203        }
 204        return node != NULL;
 205}
 206
 207/* Obtain a reference to a local port for the given sock,
 208 * if snum is zero it means select any available local port.
     *
     * With snum == 0 the search starts one past tcp_port_rover and walks
     * sysctl_local_port_range (wrapping to the low end) until it reaches a
     * port that has no bind bucket at all; tcp_port_rover is then updated
     * to wherever the search stopped.  With an explicit snum, a bucket that
     * already has owners is accepted either through the sk_reuse/fastreuse
     * shortcuts or when tcp_bind_conflict() reports no clash.
     *
     * Local BHs are disabled for the whole call.
     *
     * Returns 0 on success and 1 on failure (port range exhausted, bind
     * conflict, or bucket allocation failure).
 209 */
 210static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
 211{
 212        struct tcp_bind_hashbucket *head;
 213        struct hlist_node *node;
 214        struct tcp_bind_bucket *tb;
 215        int ret;
 216
 217        local_bh_disable();
 218        if (!snum) {
 219                int low = sysctl_local_port_range[0];
 220                int high = sysctl_local_port_range[1];
 221                int remaining = (high - low) + 1;
 222                int rover;
 223
 224                spin_lock(&tcp_portalloc_lock);
 225                rover = tcp_port_rover;
 226                do {
 227                        rover++;
 228                        if (rover < low || rover > high)
 229                                rover = low;
 230                        head = &tcp_bhash[tcp_bhashfn(rover)];
 231                        spin_lock(&head->lock);
 232                        tb_for_each(tb, node, &head->chain)
 233                                if (tb->port == rover)
 234                                        goto next;
                            /* No bucket for rover: the port is free.  Leave
                             * the loop with head->lock still held.
                             */
 235                        break;
 236                next:
 237                        spin_unlock(&head->lock);
 238                } while (--remaining > 0);
 239                tcp_port_rover = rover;
 240                spin_unlock(&tcp_portalloc_lock);
 241
 242                /* Exhausted local port range during search? */
 243                ret = 1;
 244                if (remaining <= 0)
 245                        goto fail;
 246
 247                /* OK, here is the one we will use.  HEAD is
 248                 * non-NULL and we hold its mutex.
 249                 */
 250                snum = rover;
 251        } else {
 252                head = &tcp_bhash[tcp_bhashfn(snum)];
 253                spin_lock(&head->lock);
 254                tb_for_each(tb, node, &head->chain)
 255                        if (tb->port == snum)
 256                                goto tb_found;
 257        }
            /* Both branches arrive here only when no bucket for snum exists;
             * tb was just a loop cursor, so reset it before it is tested below.
             */
 258        tb = NULL;
 259        goto tb_not_found;
 260tb_found:
 261        if (!hlist_empty(&tb->owners)) {
                    /* sk_reuse values above 1 skip the conflict check. */
 262                if (sk->sk_reuse > 1)
 263                        goto success;
                    /* Bucket is marked fast-reusable and this socket wants
                     * reuse and is not listening: accept without walking
                     * the owner list.
                     */
 264                if (tb->fastreuse > 0 &&
 265                    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
 266                        goto success;
 267                } else {
 268                        ret = 1;
 269                        if (tcp_bind_conflict(sk, tb))
 270                                goto fail_unlock;
 271                }
 272        }
        /* Reached with tb == NULL (no bucket yet) and also by falling through
         * from tb_found with an existing, conflict-free bucket.
         */
 273tb_not_found:
 274        ret = 1;
 275        if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
 276                goto fail_unlock;
            /* Maintain fastreuse: the first owner sets it according to its own
             * reuse/listen status; a later owner that does not want reuse, or
             * is listening, clears it.
             */
 277        if (hlist_empty(&tb->owners)) {
 278                if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
 279                        tb->fastreuse = 1;
 280                else
 281                        tb->fastreuse = 0;
 282        } else if (tb->fastreuse &&
 283                   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
 284                tb->fastreuse = 0;
 285success:
            /* Only attach the socket if it is not bound to a bucket already. */
 286        if (!tcp_sk(sk)->bind_hash)
 287                tcp_bind_hash(sk, tb, snum);
 288        BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
 289        ret = 0;
 290
 291fail_unlock:
 292        spin_unlock(&head->lock);
 293fail:
 294        local_bh_enable();
 295        return ret;
 296}
 297
 298/* Get rid of any references to a local port held by the
 299 * given sock.
 300 */
 301static void __tcp_put_port(struct sock *sk)
 302{
 303        struct inet_opt *inet = inet_sk(sk);
 304        struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
 305        struct tcp_bind_bucket *tb;
 306
 307        spin_lock(&head->lock);
 308        tb = tcp_sk(sk)->bind_hash;
 309        __sk_del_bind_node(sk);
 310        tcp_sk(sk)->bind_hash = NULL;
 311        inet->num = 0;
 312        tcp_bucket_destroy(tb);
 313        spin_unlock(&head->lock);
 314}
 315
 316inline void tcp_put_port(struct sock *sk)
 317{
 318        local_bh_disable();
 319        __tcp_put_port(sk);
 320        local_bh_enable();
 321}
 322
 323/* Sleeping here without WQ_FLAG_EXCLUSIVE is fine on UP but can be very bad
 324 * on SMP: when several writers sleep and a reader wakes them up, all but one
 325 * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
 326 * this, _but_ remember, it adds useless work on UP machines (a wake up on each
 327 * exclusive lock release). It should really be ifdefed.
 328 */
 329
    /* Acquire tcp_lhash_lock for writing and return with it held, after
     * waiting (uninterruptibly, as an exclusive waiter on tcp_lhash_wait)
     * until tcp_lhash_users reads as zero.
     *
     * The first acquisition is a plain write_lock(), while the wait loop
     * drops and retakes the lock with the _bh variants.  That only balances
     * if the caller entered with local BHs already disabled; tcp_unhash()
     * disables them before calling, and tcp_v4_hash() does so around
     * __tcp_v4_hash().
     */
 330void tcp_listen_wlock(void)
 331{
 332        write_lock(&tcp_lhash_lock);
 333
 334        if (atomic_read(&tcp_lhash_users)) {
 335                DEFINE_WAIT(wait);
 336
 337                for (;;) {
                            /* Queue ourselves before re-checking the counter,
                             * then sleep with the lock released.
                             */
 338                        prepare_to_wait_exclusive(&tcp_lhash_wait,
 339                                                &wait, TASK_UNINTERRUPTIBLE);
 340                        if (!atomic_read(&tcp_lhash_users))
 341                                break;
 342                        write_unlock_bh(&tcp_lhash_lock);
 343                        schedule();
 344                        write_lock_bh(&tcp_lhash_lock);
 345                }
 346
 347                finish_wait(&tcp_lhash_wait, &wait);
 348        }
 349}
 350
 351static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
 352{
 353        struct hlist_head *list;
 354        rwlock_t *lock;
 355
 356        BUG_TRAP(sk_unhashed(sk));
 357        if (listen_possible && sk->sk_state == TCP_LISTEN) {
 358                list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
 359                lock = &tcp_lhash_lock;
 360                tcp_listen_wlock();
 361        } else {
 362                list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
 363                lock = &tcp_ehash[sk->sk_hashent].lock;
 364                write_lock(lock);
 365        }
 366        __sk_add_node(sk, list);
 367        sock_prot_inc_use(sk->sk_prot);
 368        write_unlock(lock);
 369        if (listen_possible && sk->sk_state == TCP_LISTEN)
 370                wake_up(&tcp_lhash_wait);
 371}
 372
 373static void tcp_v4_hash(struct sock *sk)
 374{
 375        if (sk->sk_state != TCP_CLOSE) {
 376                local_bh_disable();
 377                __tcp_v4_hash(sk, 1);
 378                local_bh_enable();
 379        }
 380}
 381
 382void tcp_unhash(struct sock *sk)
 383{
 384        rwlock_t *lock;
 385
 386        if (sk_unhashed(sk))
 387                goto ende;
 388
 389        if (sk->sk_state == TCP_LISTEN) {
 390                local_bh_disable();
 391                tcp_listen_wlock();
 392                lock = &tcp_lhash_lock;
 393        } else {
 394                struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
 395                lock = &head->lock;
 396                write_lock_bh(&head->lock);
 397        }
 398
 399        if (__sk_del_node_init(sk))
 400                sock_prot_dec_use(sk->sk_prot);
 401        write_unlock_bh(lock);
 402
 403 ende:
 404        if (sk->sk_state == TCP_LISTEN)
 405                wake_up(&tcp_lhash_wait);
 406}
 407
 408/* Don't inline this cruft.  Here are some nice properties to
 409 * exploit here.  The BSD API does not allow a listening TCP
 410 * to specify the remote port nor the remote address for the
 411 * connection.  So always assume those are both wildcarded
 412 * during the search since they can never be otherwise.
 413 */
 414static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
 415                                             unsigned short hnum, int dif)
 416{
 417        struct sock *result = NULL, *sk;
 418        struct hlist_node *node;
 419        int score, hiscore;
 420
 421        hiscore=-1;
 422        sk_for_each(sk, node, head) {
 423                struct inet_opt *inet = inet_sk(sk);
 424
 425                if (inet->num == hnum && !ipv6_only_sock(sk)) {
 426                        __u32 rcv_saddr = inet->rcv_saddr;
 427
 428                        score = (sk->sk_family == PF_INET ? 1 : 0);
 429                        if (rcv_saddr) {
 430                                if (rcv_saddr != daddr)
 431                                        continue;
 432                                score+=2;
 433                        }
 434                        if (sk->sk_bound_dev_if) {
 435                                if (sk->sk_bound_dev_if != dif)
 436                                        continue;
 437                                score+=2;
 438                        }
 439                        if (score == 5)
 440                                return sk;
 441                        if (score > hiscore) {
 442                                hiscore = score;
 443                                result = sk;
 444                        }
 445                }
 446        }
 447        return result;
 448}
 449
 450/* Optimize the common listener case. */
 451inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum,
 452                                           int dif)
 453{
 454        struct sock *sk = NULL;
 455        struct hlist_head *head;
 456
 457        read_lock(&tcp_lhash_lock);
 458        head = &tcp_listening_hash[tcp_lhashfn(hnum)];
 459        if (!hlist_empty(head)) {
 460                struct inet_opt *inet = inet_sk((sk = __sk_head(head)));
 461
 462                if (inet->num == hnum && !sk->sk_node.next &&
 463                    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
 464                    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
 465                    !sk->sk_bound_dev_if)
 466                        goto sherry_cache;
 467                sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
 468        }
 469        if (sk) {
 470sherry_cache:
 471                sock_hold(sk);
 472        }
 473        read_unlock(&tcp_lhash_lock);
 474        return sk;
 475}
 476
 477/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 478 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 479 *
 480 * Local BH must be disabled here.
 481 */
 482
 483static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
 484                                                       u32 daddr, u16 hnum,
 485                                                       int dif)
 486{
 487        struct tcp_ehash_bucket *head;
 488        TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
 489        __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
 490        struct sock *sk;
 491        struct hlist_node *node;
 492        /* Optimize here for direct hit, only listening connections can
 493         * have wildcards anyways.
 494         */
 495        int hash = tcp_hashfn(daddr, hnum, saddr, sport);
 496        head = &tcp_ehash[hash];
 497        read_lock(&head->lock);
 498        sk_for_each(sk, node, &head->chain) {
 499                if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
 500                        goto hit; /* You sunk my battleship! */
 501        }
 502
 503        /* Must check for a TIME_WAIT'er before going to listener hash. */
 504        sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
 505                if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
 506                        goto hit;
 507        }
 508        sk = NULL;
 509out:
 510        read_unlock(&head->lock);
 511        return sk;
 512hit:
 513        sock_hold(sk);
 514        goto out;
 515}
 516
 517static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
 518                                           u32 daddr, u16 hnum, int dif)
 519{
 520        struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
 521                                                      daddr, hnum, dif);
 522
 523        return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
 524}
 525
 526inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
 527                                  u16 dport, int dif)
 528{
 529        struct sock *sk;
 530
 531        local_bh_disable();
 532        sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
 533        local_bh_enable();
 534
 535        return sk;
 536}
 537
 538static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 539{
 540        return secure_tcp_sequence_number(skb->nh.iph->daddr,
 541                                          skb->nh.iph->saddr,
 542                                          skb->h.th->dest,
 543                                          skb->h.th->source);
 544}
 545
 546/* Check that connecting @sk from local port @lport would give it an
     * identity not already present in the established hash and, if so, hash
     * the socket there (recording lport in inet->num / inet->sport).
     *
     * A matching TIME-WAIT bucket does not necessarily block the connect: if
     * it has a recorded timestamp it may be taken over, in which case the
     * new socket inherits its timestamp state and starts its sequence space
     * beyond the old connection's.
     *   - @twp == NULL: such a bucket is always taken over, and it is
     *     descheduled and released here.
     *   - @twp != NULL: takeover additionally requires sysctl_tcp_tw_reuse
     *     and a timestamp more than one second old; the bucket (or NULL) is
     *     handed back through *twp with a reference held, for the caller to
     *     deschedule and put.
     *
     * Returns 0 on success, -EADDRNOTAVAIL if the identity is in use.
     *
     * called with local bh disabled
     */
 547static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 548                                      struct tcp_tw_bucket **twp)
 549{
 550        struct inet_opt *inet = inet_sk(sk);
            /* Named from the point of view of an incoming packet, matching the
             * argument order of the lookup macros: daddr is this socket's own
             * address, saddr is the peer's.
             */
 551        u32 daddr = inet->rcv_saddr;
 552        u32 saddr = inet->daddr;
 553        int dif = sk->sk_bound_dev_if;
 554        TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
 555        __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
 556        int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
 557        struct tcp_ehash_bucket *head = &tcp_ehash[hash];
 558        struct sock *sk2;
 559        struct hlist_node *node;
 560        struct tcp_tw_bucket *tw;
 561
 562        write_lock(&head->lock);
 563
 564        /* Check TIME-WAIT sockets first. */
 565        sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
 566                tw = (struct tcp_tw_bucket *)sk2;
 567
 568                if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
 569                        struct tcp_opt *tp = tcp_sk(sk);
 570
 571                        /* With PAWS, it is safe from the viewpoint
 572                           of data integrity. Even without PAWS it
 573                           is safe provided sequence spaces do not
 574                           overlap i.e. at data rates <= 80Mbit/sec.
 575
 576                           Actually, the idea is close to VJ's one,
 577                           only timestamp cache is held not per host,
 578                           but per port pair and TW bucket is used
 579                           as state holder.
 580
 581                           If TW bucket has been already destroyed we
 582                           fall back to VJ's scheme and use initial
 583                           timestamp retrieved from peer table.
 584                         */
 585                        if (tw->tw_ts_recent_stamp &&
 586                            (!twp || (sysctl_tcp_tw_reuse &&
 587                                      xtime.tv_sec -
 588                                      tw->tw_ts_recent_stamp > 1))) {
                                    /* Never leave write_seq at 0:
                                     * tcp_v4_connect() treats 0 as "not
                                     * chosen yet" and would pick a fresh
                                     * sequence number.
                                     */
 589                                if ((tp->write_seq =
 590                                                tw->tw_snd_nxt + 65535 + 2) == 0)
 591                                        tp->write_seq = 1;
 592                                tp->ts_recent       = tw->tw_ts_recent;
 593                                tp->ts_recent_stamp = tw->tw_ts_recent_stamp;
                                    /* Pin the bucket; the reference is dropped
                                     * below or by the caller via *twp.
                                     */
 594                                sock_hold(sk2);
 595                                goto unique;
 596                        } else
 597                                goto not_unique;
 598                }
 599        }
            /* tw doubled as the loop cursor above; clear it so the code after
             * "unique" does not mistake the last bucket scanned for a match.
             */
 600        tw = NULL;
 601
 602        /* And established part... */
 603        sk_for_each(sk2, node, &head->chain) {
 604                if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
 605                        goto not_unique;
 606        }
 607
 608unique:
 609        /* Must record num and sport now. Otherwise we will see
 610         * in hash table socket with a funny identity. */
 611        inet->num = lport;
 612        inet->sport = htons(lport);
 613        sk->sk_hashent = hash;
 614        BUG_TRAP(sk_unhashed(sk));
 615        __sk_add_node(sk, &head->chain);
 616        sock_prot_inc_use(sk->sk_prot);
 617        write_unlock(&head->lock);
 618
            /* NOTE(review): with a non-NULL twp the TimeWaitRecycled counter
             * is bumped even when tw is NULL, i.e. when nothing was recycled
             * -- confirm this is intended.
             */
 619        if (twp) {
 620                *twp = tw;
 621                NET_INC_STATS_BH(TimeWaitRecycled);
 622        } else if (tw) {
 623                /* Silly. Should hash-dance instead... */
 624                tcp_tw_deschedule(tw);
 625                NET_INC_STATS_BH(TimeWaitRecycled);
 626
 627                tcp_tw_put(tw);
 628        }
 629
 630        return 0;
 631
 632not_unique:
 633        write_unlock(&head->lock);
 634        return -EADDRNOTAVAIL;
 635}
 636
 637/*
 638 * Bind a port for a connect operation and hash it.
     *
     * If the socket has no local port yet (inet->num == 0), ports in
     * sysctl_local_port_range are tried in turn, starting after
     * tcp_port_rover.  A port with no bind bucket gets a new bucket marked
     * fastreuse = -1.  A port whose bucket has fastreuse >= 0 is skipped
     * (tcp_bucket_create() and tcp_v4_get_port() only ever store 0 or 1),
     * while a bucket already marked -1 can be shared provided
     * __tcp_v4_check_established() finds the resulting identity unique.
     *
     * If the socket is already bound, it is hashed directly when it is the
     * only owner of its bind bucket; otherwise uniqueness is verified
     * through __tcp_v4_check_established().
     *
     * Returns 0 on success, -EADDRNOTAVAIL otherwise (this includes a
     * failed bucket allocation during the port search).
 639 */
 640static int tcp_v4_hash_connect(struct sock *sk)
 641{
 642        unsigned short snum = inet_sk(sk)->num;
 643        struct tcp_bind_hashbucket *head;
 644        struct tcp_bind_bucket *tb;
 645        int ret;
 646
 647        if (!snum) {
 648                int rover;
 649                int low = sysctl_local_port_range[0];
 650                int high = sysctl_local_port_range[1];
 651                int remaining = (high - low) + 1;
 652                struct hlist_node *node;
 653                struct tcp_tw_bucket *tw = NULL;
 654
 655                local_bh_disable();
 656
 657                /* TODO. Actually it is not so bad idea to remove
 658                 * tcp_portalloc_lock before next submission to Linus.
 659                 * As soon as we touch this place at all it is time to think.
 660                 *
 661                 * Now it protects single _advisory_ variable tcp_port_rover,
 662                 * hence it is mostly useless.
 663                 * Code will work nicely if we just delete it, but
 664                 * I am afraid in contended case it will work not better or
 665                 * even worse: another cpu just will hit the same bucket
 666                 * and spin there.
 667                 * So some cpu salt could remove both contention and
 668                 * memory pingpong. Any ideas how to do this in a nice way?
 669                 */
 670                spin_lock(&tcp_portalloc_lock);
 671                rover = tcp_port_rover;
 672
 673                do {
 674                        rover++;
 675                        if ((rover < low) || (rover > high))
 676                                rover = low;
 677                        head = &tcp_bhash[tcp_bhashfn(rover)];
 678                        spin_lock(&head->lock);
 679
 680                        /* Does not bother with rcv_saddr checks,
 681                         * because the established check is already
 682                         * unique enough.
 683                         */
 684                        tb_for_each(tb, node, &head->chain) {
 685                                if (tb->port == rover) {
 686                                        BUG_TRAP(!hlist_empty(&tb->owners));
                                            /* Bucket set up by the bind path:
                                             * never share it.
                                             */
 687                                        if (tb->fastreuse >= 0)
 688                                                goto next_port;
                                            /* On success the socket has been
                                             * hashed and tw may hold a
                                             * TIME-WAIT bucket to retire.
                                             */
 689                                        if (!__tcp_v4_check_established(sk,
 690                                                                        rover,
 691                                                                        &tw))
 692                                                goto ok;
 693                                        goto next_port;
 694                                }
 695                        }
 696
                            /* No bucket for this port yet: claim it. */
 697                        tb = tcp_bucket_create(head, rover);
 698                        if (!tb) {
                                    /* Allocation failed; give up the search.
                                     * The caller sees -EADDRNOTAVAIL.
                                     */
 699                                spin_unlock(&head->lock);
 700                                break;
 701                        }
 702                        tb->fastreuse = -1;
 703                        goto ok;
 704
 705                next_port:
 706                        spin_unlock(&head->lock);
 707                } while (--remaining > 0);
 708                tcp_port_rover = rover;
 709                spin_unlock(&tcp_portalloc_lock);
 710
 711                local_bh_enable();
 712
 713                return -EADDRNOTAVAIL;
 714
 715ok:
 716                /* All locks still held and bhs disabled */
 717                tcp_port_rover = rover;
 718                spin_unlock(&tcp_portalloc_lock);
 719
 720                tcp_bind_hash(sk, tb, rover);
                    /* Still unhashed means the port came from a fresh bucket;
                     * on the shared-bucket path
                     * __tcp_v4_check_established() has already hashed the
                     * socket and set sport.
                     */
 721                if (sk_unhashed(sk)) {
 722                        inet_sk(sk)->sport = htons(rover);
 723                        __tcp_v4_hash(sk, 0);
 724                }
 725                spin_unlock(&head->lock);
 726
 727                if (tw) {
 728                        tcp_tw_deschedule(tw);
 729                        tcp_tw_put(tw);
 730                }
 731
 732                ret = 0;
                    /* Jumps into the else-branch at the bottom of the function
                     * to share its local_bh_enable()/return.
                     */
 733                goto out;
 734        }
 735
 736        head  = &tcp_bhash[tcp_bhashfn(snum)];
 737        tb  = tcp_sk(sk)->bind_hash;
 738        spin_lock_bh(&head->lock);
            /* Sole owner of the bind bucket: no other socket can share this
             * port, so hash without consulting the established table.
             */
 739        if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
 740                __tcp_v4_hash(sk, 0);
 741                spin_unlock_bh(&head->lock);
 742                return 0;
 743        } else {
                    /* Plain unlock: BHs stay disabled for the established
                     * check and are re-enabled at "out".
                     */
 744                spin_unlock(&head->lock);
 745                /* No definite answer... Walk to established hash table */
 746                ret = __tcp_v4_check_established(sk, snum, NULL);
 747out:
 748                local_bh_enable();
 749                return ret;
 750        }
 751}
 752
 753/* This will initiate an outgoing connection. */
 754int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 755{
 756        struct inet_opt *inet = inet_sk(sk);
 757        struct tcp_opt *tp = tcp_sk(sk);
 758        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 759        struct rtable *rt;
 760        u32 daddr, nexthop;
 761        int tmp;
 762        int err;
 763
 764        if (addr_len < sizeof(struct sockaddr_in))
 765                return -EINVAL;
 766
 767        if (usin->sin_family != AF_INET)
 768                return -EAFNOSUPPORT;
 769
 770        nexthop = daddr = usin->sin_addr.s_addr;
 771        if (inet->opt && inet->opt->srr) {
 772                if (!daddr)
 773                        return -EINVAL;
 774                nexthop = inet->opt->faddr;
 775        }
 776
 777        tmp = ip_route_connect(&rt, nexthop, inet->saddr,
 778                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 779                               IPPROTO_TCP,
 780                               inet->sport, usin->sin_port, sk);
 781        if (tmp < 0)
 782                return tmp;
 783
 784        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 785                ip_rt_put(rt);
 786                return -ENETUNREACH;
 787        }
 788
 789        if (!inet->opt || !inet->opt->srr)
 790                daddr = rt->rt_dst;
 791
 792        if (!inet->saddr)
 793                inet->saddr = rt->rt_src;
 794        inet->rcv_saddr = inet->saddr;
 795
 796        if (tp->ts_recent_stamp && inet->daddr != daddr) {
 797                /* Reset inherited state */
 798                tp->ts_recent       = 0;
 799                tp->ts_recent_stamp = 0;
 800                tp->write_seq       = 0;
 801        }
 802
 803        if (sysctl_tcp_tw_recycle &&
 804            !tp->ts_recent_stamp && rt->rt_dst == daddr) {
 805                struct inet_peer *peer = rt_get_peer(rt);
 806
 807                /* VJ's idea. We save last timestamp seen from
 808                 * the destination in peer table, when entering state TIME-WAIT
 809                 * and initialize ts_recent from it, when trying new connection.
 810                 */
 811
 812                if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
 813                        tp->ts_recent_stamp = peer->tcp_ts_stamp;
 814                        tp->ts_recent = peer->tcp_ts;
 815                }
 816        }
 817
 818        inet->dport = usin->sin_port;
 819        inet->daddr = daddr;
 820
 821        tp->ext_header_len = 0;
 822        if (inet->opt)
 823                tp->ext_header_len = inet->opt->optlen;
 824
 825        tp->mss_clamp = 536;
 826
 827        /* Socket identity is still unknown (sport may be zero).
 828         * However we set state to SYN-SENT and not releasing socket
 829         * lock select source port, enter ourselves into the hash tables and
 830         * complete initialization after this.
 831         */
 832        tcp_set_state(sk, TCP_SYN_SENT);
 833        err = tcp_v4_hash_connect(sk);
 834        if (err)
 835                goto failure;
 836
 837        err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
 838        if (err)
 839                goto failure;
 840
 841        /* OK, now commit destination to socket.  */
 842        __sk_dst_set(sk, &rt->u.dst);
 843        tcp_v4_setup_caps(sk, &rt->u.dst);
 844        tp->ext2_header_len = rt->u.dst.header_len;
 845
 846        if (!tp->write_seq)
 847                tp->write_seq = secure_tcp_sequence_number(inet->saddr,
 848                                                           inet->daddr,
 849                                                           inet->sport,
 850                                                           usin->sin_port);
 851
 852        inet->id = tp->write_seq ^ jiffies;
 853
 854        err = tcp_connect(sk);
 855        rt = NULL;
 856        if (err)
 857                goto failure;
 858
 859        return 0;
 860
 861failure:
 862        /* This unhashes the socket and releases the local port, if necessary. */
 863        tcp_set_state(sk, TCP_CLOSE);
 864        ip_rt_put(rt);
 865        sk->sk_route_caps = 0;
 866        inet->dport = 0;
 867        return err;
 868}
 869
 870static __inline__ int tcp_v4_iif(struct sk_buff *skb)
 871{
 872        return ((struct rtable *)skb->dst)->rt_iif;
 873}
 874
 875static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
 876{
 877        return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
 878}
 879
 880static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
 881                                              struct open_request ***prevp,
 882                                              __u16 rport,
 883                                              __u32 raddr, __u32 laddr)
 884{
 885        struct tcp_listen_opt *lopt = tp->listen_opt;
 886        struct open_request *req, **prev;
 887
 888        for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
 889             (req = *prev) != NULL;
 890             prev = &req->dl_next) {
 891                if (req->rmt_port == rport &&
 892                    req->af.v4_req.rmt_addr == raddr &&
 893                    req->af.v4_req.loc_addr == laddr &&
 894                    TCP_INET_FAMILY(req->class->family)) {
 895                        BUG_TRAP(!req->sk);
 896                        *prevp = prev;
 897                        break;
 898                }
 899        }
 900
 901        return req;
 902}
 903
 904static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
 905{
 906        struct tcp_opt *tp = tcp_sk(sk);
 907        struct tcp_listen_opt *lopt = tp->listen_opt;
 908        u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
 909
 910        req->expires = jiffies + TCP_TIMEOUT_INIT;
 911        req->retrans = 0;
 912        req->sk = NULL;
 913        req->dl_next = lopt->syn_table[h];
 914
 915        write_lock(&tp->syn_wait_lock);
 916        lopt->syn_table[h] = req;
 917        write_unlock(&tp->syn_wait_lock);
 918
 919        tcp_synq_added(sk);
 920}
 921
 922
 923/*
 924 * This routine does path mtu discovery as defined in RFC1191.
 925 */
 926static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
 927                                     u32 mtu)
 928{
 929        struct dst_entry *dst;
 930        struct inet_opt *inet = inet_sk(sk);
 931        struct tcp_opt *tp = tcp_sk(sk);
 932
 933        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 934         * send out by Linux are always <576bytes so they should go through
 935         * unfragmented).
 936         */
 937        if (sk->sk_state == TCP_LISTEN)
 938                return;
 939
 940        /* We don't check in the destentry if pmtu discovery is forbidden
 941         * on this route. We just assume that no packet_to_big packets
 942         * are send back when pmtu discovery is not active.
 943         * There is a small race when the user changes this flag in the
 944         * route, but I think that's acceptable.
 945         */
 946        if ((dst = __sk_dst_check(sk, 0)) == NULL)
 947                return;
 948
 949        dst->ops->update_pmtu(dst, mtu);
 950
 951        /* Something is about to be wrong... Remember soft error
 952         * for the case, if this connection will not able to recover.
 953         */
 954        if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst))
 955                sk->sk_err_soft = EMSGSIZE;
 956
 957        mtu = dst_pmtu(dst);
 958
 959        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 960            tp->pmtu_cookie > mtu) {
 961                tcp_sync_mss(sk, mtu);
 962
 963                /* Resend the TCP packet because it's
 964                 * clear that the old packet has been
 965                 * dropped. This is the new "fast" path mtu
 966                 * discovery.
 967                 */
 968                tcp_simple_retransmit(sk);
 969        } /* else let the usual retransmit timer handle it */
 970}
 971
 972/*
 973 * This routine is called by the ICMP module when it gets some
 974 * sort of error condition.  If err < 0 then the socket should
 975 * be closed and the error returned to the user.  If err > 0
 976 * it's just the icmp type << 8 | icmp code.  After adjustment
 977 * header points to the first 8 bytes of the tcp header.  We need
 978 * to find the appropriate port.
 979 *
 980 * The locking strategy used here is very "optimistic". When
 981 * someone else accesses the socket the ICMP is just dropped
 982 * and for some paths there is no check at all.
 983 * A more general error queue to queue errors for later handling
 984 * is probably better.
 985 *
 986 */
 987
 988void tcp_v4_err(struct sk_buff *skb, u32 info)
 989{
 990        struct iphdr *iph = (struct iphdr *)skb->data;
 991        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 992        struct tcp_opt *tp;
 993        struct inet_opt *inet;
 994        int type = skb->h.icmph->type;
 995        int code = skb->h.icmph->code;
 996        struct sock *sk;
 997        __u32 seq;
 998        int err;
 999
1000        if (skb->len < (iph->ihl << 2) + 8) {
1001                ICMP_INC_STATS_BH(IcmpInErrors);
1002                return;
1003        }
1004
1005        sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
1006                           th->source, tcp_v4_iif(skb));
1007        if (!sk) {
1008                ICMP_INC_STATS_BH(IcmpInErrors);
1009                return;
1010        }
1011        if (sk->sk_state == TCP_TIME_WAIT) {
1012                tcp_tw_put((struct tcp_tw_bucket *)sk);
1013                return;
1014        }
1015
1016        bh_lock_sock(sk);
1017        /* If too many ICMPs get dropped on busy
1018         * servers this needs to be solved differently.
1019         */
1020        if (sock_owned_by_user(sk))
1021                NET_INC_STATS_BH(LockDroppedIcmps);
1022
1023        if (sk->sk_state == TCP_CLOSE)
1024                goto out;
1025
1026        tp = tcp_sk(sk);
1027        seq = ntohl(th->seq);
1028        if (sk->sk_state != TCP_LISTEN &&
1029            !between(seq, tp->snd_una, tp->snd_nxt)) {
1030                NET_INC_STATS(OutOfWindowIcmps);
1031                goto out;
1032        }
1033
1034        switch (type) {
1035        case ICMP_SOURCE_QUENCH:
1036                /* This is deprecated, but if someone generated it,
1037                 * we have no reasons to ignore it.
1038                 */
1039                if (!sock_owned_by_user(sk))
1040                        tcp_enter_cwr(tp);
1041                goto out;
1042        case ICMP_PARAMETERPROB:
1043                err = EPROTO;
1044                break;
1045        case ICMP_DEST_UNREACH:
1046                if (code > NR_ICMP_UNREACH)
1047                        goto out;
1048
1049                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1050                        if (!sock_owned_by_user(sk))
1051                                do_pmtu_discovery(sk, iph, info);
1052                        goto out;
1053                }
1054
1055                err = icmp_err_convert[code].errno;
1056                break;
1057        case ICMP_TIME_EXCEEDED:
1058                err = EHOSTUNREACH;
1059                break;
1060        default:
1061                goto out;
1062        }
1063
1064        switch (sk->sk_state) {
1065                struct open_request *req, **prev;
1066        case TCP_LISTEN:
1067                if (sock_owned_by_user(sk))
1068                        goto out;
1069
1070                req = tcp_v4_search_req(tp, &prev, th->dest,
1071                                        iph->daddr, iph->saddr);
1072                if (!req)
1073                        goto out;
1074
1075                /* ICMPs are not backlogged, hence we cannot get
1076                   an established socket here.
1077                 */
1078                BUG_TRAP(!req->sk);
1079
1080                if (seq != req->snt_isn) {
1081                        NET_INC_STATS_BH(OutOfWindowIcmps);
1082                        goto out;
1083                }
1084
1085                /*
1086                 * Still in SYN_RECV, just remove it silently.
1087                 * There is no good way to pass the error to the newly
1088                 * created socket, and POSIX does not want network
1089                 * errors returned from accept().
1090                 */
1091                tcp_synq_drop(sk, req, prev);
1092                goto out;
1093
1094        case TCP_SYN_SENT:
1095        case TCP_SYN_RECV:  /* Cannot happen.
1096                               It can f.e. if SYNs crossed.
1097                             */
1098                if (!sock_owned_by_user(sk)) {
1099                        TCP_INC_STATS_BH(TcpAttemptFails);
1100                        sk->sk_err = err;
1101
1102                        sk->sk_error_report(sk);
1103
1104                        tcp_done(sk);
1105                } else {
1106                        sk->sk_err_soft = err;
1107                }
1108                goto out;
1109        }
1110
1111        /* If we've already connected we will keep trying
1112         * until we time out, or the user gives up.
1113         *
1114         * rfc1122 4.2.3.9 allows to consider as hard errors
1115         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1116         * but it is obsoleted by pmtu discovery).
1117         *
1118         * Note, that in modern internet, where routing is unreliable
1119         * and in each dark corner broken firewalls sit, sending random
1120         * errors ordered by their masters even this two messages finally lose
1121         * their original sense (even Linux sends invalid PORT_UNREACHs)
1122         *
1123         * Now we are in compliance with RFCs.
1124         *                                                      --ANK (980905)
1125         */
1126
1127        inet = inet_sk(sk);
1128        if (!sock_owned_by_user(sk) && inet->recverr) {
1129                sk->sk_err = err;
1130                sk->sk_error_report(sk);
1131        } else  { /* Only an error on timeout */
1132                sk->sk_err_soft = err;
1133        }
1134
1135out:
1136        bh_unlock_sock(sk);
1137        sock_put(sk);
1138}
1139
1140/* This routine computes an IPv4 TCP checksum. */
1141void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1142                       struct sk_buff *skb)
1143{
1144        struct inet_opt *inet = inet_sk(sk);
1145
1146        if (skb->ip_summed == CHECKSUM_HW) {
1147                th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1148                skb->csum = offsetof(struct tcphdr, check);
1149        } else {
1150                th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1151                                         csum_partial((char *)th,
1152                                                      th->doff << 2,
1153                                                      skb->csum));
1154        }
1155}
1156
1157/*
1158 *      This routine will send an RST to the other tcp.
1159 *
1160 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1161 *                    for reset.
1162 *      Answer: if a packet caused RST, it is not for a socket
1163 *              existing in our system, if it is matched to a socket,
1164 *              it is just duplicate segment or bug in other side's TCP.
1165 *              So that we build reply only basing on parameters
1166 *              arrived with segment.
1167 *      Exception: precedence violation. We do not implement it in any case.
1168 */
1169
1170static void tcp_v4_send_reset(struct sk_buff *skb)
1171{
1172        struct tcphdr *th = skb->h.th;
1173        struct tcphdr rth;
1174        struct ip_reply_arg arg;
1175
1176        /* Never send a reset in response to a reset. */
1177        if (th->rst)
1178                return;
1179
1180        if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1181                return;
1182
1183        /* Swap the send and the receive. */
1184        memset(&rth, 0, sizeof(struct tcphdr));
1185        rth.dest   = th->source;
1186        rth.source = th->dest;
1187        rth.doff   = sizeof(struct tcphdr) / 4;
1188        rth.rst    = 1;
1189
1190        if (th->ack) {
1191                rth.seq = th->ack_seq;
1192        } else {
1193                rth.ack = 1;
1194                rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1195                                    skb->len - (th->doff << 2));
1196        }
1197
1198        memset(&arg, 0, sizeof arg);
1199        arg.iov[0].iov_base = (unsigned char *)&rth;
1200        arg.iov[0].iov_len  = sizeof rth;
1201        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1202                                      skb->nh.iph->saddr, /*XXX*/
1203                                      sizeof(struct tcphdr), IPPROTO_TCP, 0);
1204        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1205
1206        ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1207
1208        TCP_INC_STATS_BH(TcpOutSegs);
1209        TCP_INC_STATS_BH(TcpOutRsts);
1210}
1211
1212/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
1213   outside socket context is ugly, certainly. What can I do?
1214 */
1215
1216static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1217                            u32 win, u32 ts)
1218{
1219        struct tcphdr *th = skb->h.th;
1220        struct {
1221                struct tcphdr th;
1222                u32 tsopt[3];
1223        } rep;
1224        struct ip_reply_arg arg;
1225
1226        memset(&rep.th, 0, sizeof(struct tcphdr));
1227        memset(&arg, 0, sizeof arg);
1228
1229        arg.iov[0].iov_base = (unsigned char *)&rep;
1230        arg.iov[0].iov_len  = sizeof(rep.th);
1231        if (ts) {
1232                rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1233                                     (TCPOPT_TIMESTAMP << 8) |
1234                                     TCPOLEN_TIMESTAMP);
1235                rep.tsopt[1] = htonl(tcp_time_stamp);
1236                rep.tsopt[2] = htonl(ts);
1237                arg.iov[0].iov_len = sizeof(rep);
1238        }
1239
1240        /* Swap the send and the receive. */
1241        rep.th.dest    = th->source;
1242        rep.th.source  = th->dest;
1243        rep.th.doff    = arg.iov[0].iov_len / 4;
1244        rep.th.seq     = htonl(seq);
1245        rep.th.ack_seq = htonl(ack);
1246        rep.th.ack     = 1;
1247        rep.th.window  = htons(win);
1248
1249        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1250                                      skb->nh.iph->saddr, /*XXX*/
1251                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1252        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1253
1254        ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1255
1256        TCP_INC_STATS_BH(TcpOutSegs);
1257}
1258
1259static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1260{
1261        struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1262
1263        tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1264                        tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1265
1266        tcp_tw_put(tw);
1267}
1268
1269static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1270{
1271        tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
1272                        req->ts_recent);
1273}
1274
1275static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1276                                          struct open_request *req)
1277{
1278        struct rtable *rt;
1279        struct ip_options *opt = req->af.v4_req.opt;
1280        struct flowi fl = { .oif = sk->sk_bound_dev_if,
1281                            .nl_u = { .ip4_u =
1282                                      { .daddr = ((opt && opt->srr) ?
1283                                                  opt->faddr :
1284                                                  req->af.v4_req.rmt_addr),
1285                                        .saddr = req->af.v4_req.loc_addr,
1286                                        .tos = RT_CONN_FLAGS(sk) } },
1287                            .proto = IPPROTO_TCP,
1288                            .uli_u = { .ports =
1289                                       { .sport = inet_sk(sk)->sport,
1290                                         .dport = req->rmt_port } } };
1291
1292        if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1293                IP_INC_STATS_BH(IpOutNoRoutes);
1294                return NULL;
1295        }
1296        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1297                ip_rt_put(rt);
1298                IP_INC_STATS_BH(IpOutNoRoutes);
1299                return NULL;
1300        }
1301        return &rt->u.dst;
1302}
1303
1304/*
1305 *      Send a SYN-ACK after having received an ACK.
1306 *      This still operates on a open_request only, not on a big
1307 *      socket.
1308 */
1309static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1310                              struct dst_entry *dst)
1311{
1312        int err = -1;
1313        struct sk_buff * skb;
1314
1315        /* First, grab a route. */
1316        if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1317                goto out;
1318
1319        skb = tcp_make_synack(sk, dst, req);
1320
1321        if (skb) {
1322                struct tcphdr *th = skb->h.th;
1323
1324                th->check = tcp_v4_check(th, skb->len,
1325                                         req->af.v4_req.loc_addr,
1326                                         req->af.v4_req.rmt_addr,
1327                                         csum_partial((char *)th, skb->len,
1328                                                      skb->csum));
1329
1330                err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1331                                            req->af.v4_req.rmt_addr,
1332                                            req->af.v4_req.opt);
1333                if (err == NET_XMIT_CN)
1334                        err = 0;
1335        }
1336
1337out:
1338        dst_release(dst);
1339        return err;
1340}
1341
1342/*
1343 *      IPv4 open_request destructor.
1344 */
1345static void tcp_v4_or_free(struct open_request *req)
1346{
1347        if (req->af.v4_req.opt)
1348                kfree(req->af.v4_req.opt);
1349}
1350
1351static inline void syn_flood_warning(struct sk_buff *skb)
1352{
1353        static unsigned long warntime;
1354
1355        if (time_after(jiffies, (warntime + HZ * 60))) {
1356                warntime = jiffies;
1357                printk(KERN_INFO
1358                       "possible SYN flooding on port %d. Sending cookies.\n",
1359                       ntohs(skb->h.th->dest));
1360        }
1361}
1362
1363/*
1364 * Save and compile IPv4 options into the open_request if needed.
1365 */
1366static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1367                                                     struct sk_buff *skb)
1368{
1369        struct ip_options *opt = &(IPCB(skb)->opt);
1370        struct ip_options *dopt = NULL;
1371
1372        if (opt && opt->optlen) {
1373                int opt_size = optlength(opt);
1374                dopt = kmalloc(opt_size, GFP_ATOMIC);
1375                if (dopt) {
1376                        if (ip_options_echo(dopt, skb)) {
1377                                kfree(dopt);
1378                                dopt = NULL;
1379                        }
1380                }
1381        }
1382        return dopt;
1383}
1384
/*
 * Maximum number of SYN_RECV sockets queued per LISTEN socket.
 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
 * A global counter shared by all sockets would be better, but it would
 * then need some protection against one socket starving all the others.
 *
 * The default used to be 128.  Experiments with real servers showed
 * that this is nowhere near enough even at 100 conn/sec; 256 cures most
 * of the problems.  According to the original note, the value is
 * adjusted to 128 on very small machines (<= 32MB of memory) and to
 * 1024 on normal or better ones (>= 256MB).
 * Increasing it further requires changing the hash table size.
 */
int sysctl_max_syn_backlog = 256;
1399
1400struct or_calltable or_ipv4 = {
1401        .family         =       PF_INET,
1402        .rtx_syn_ack    =       tcp_v4_send_synack,
1403        .send_ack       =       tcp_v4_or_send_ack,
1404        .destructor     =       tcp_v4_or_free,
1405        .send_reset     =       tcp_v4_send_reset,
1406};
1407
1408int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1409{
1410        struct tcp_opt tp;
1411        struct open_request *req;
1412        __u32 saddr = skb->nh.iph->saddr;
1413        __u32 daddr = skb->nh.iph->daddr;
1414        __u32 isn = TCP_SKB_CB(skb)->when;
1415        struct dst_entry *dst = NULL;
1416#ifdef CONFIG_SYN_COOKIES
1417        int want_cookie = 0;
1418#else
1419#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1420#endif
1421
1422        /* Never answer to SYNs send to broadcast or multicast */
1423        if (((struct rtable *)skb->dst)->rt_flags &
1424            (RTCF_BROADCAST | RTCF_MULTICAST))
1425                goto drop;
1426
1427        /* TW buckets are converted to open requests without
1428         * limitations, they conserve resources and peer is
1429         * evidently real one.
1430         */
1431        if (tcp_synq_is_full(sk) && !isn) {
1432#ifdef CONFIG_SYN_COOKIES
1433                if (sysctl_tcp_syncookies) {
1434                        want_cookie = 1;
1435                } else
1436#endif
1437                goto drop;
1438        }
1439
1440        /* Accept backlog is full. If we have already queued enough
1441         * of warm entries in syn queue, drop request. It is better than
1442         * clogging syn queue with openreqs with exponentially increasing
1443         * timeout.
1444         */
1445        if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1446                goto drop;
1447
1448        req = tcp_openreq_alloc();
1449        if (!req)
1450                goto drop;
1451
1452        tcp_clear_options(&tp);
1453        tp.mss_clamp = 536;
1454        tp.user_mss  = tcp_sk(sk)->user_mss;
1455
1456        tcp_parse_options(skb, &tp, 0);
1457
1458        if (want_cookie) {
1459                tcp_clear_options(&tp);
1460                tp.saw_tstamp = 0;
1461        }
1462
1463        if (tp.saw_tstamp && !tp.rcv_tsval) {
1464                /* Some OSes (unknown ones, but I see them on web server, which
1465                 * contains information interesting only for windows'
1466                 * users) do not send their stamp in SYN. It is easy case.
1467                 * We simply do not advertise TS support.
1468                 */
1469                tp.saw_tstamp = 0;
1470                tp.tstamp_ok  = 0;
1471        }
1472        tp.tstamp_ok = tp.saw_tstamp;
1473
1474        tcp_openreq_init(req, &tp, skb);
1475
1476        req->af.v4_req.loc_addr = daddr;
1477        req->af.v4_req.rmt_addr = saddr;
1478        req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1479        req->class = &or_ipv4;
1480        if (!want_cookie)
1481                TCP_ECN_create_request(req, skb->h.th);
1482
1483        if (want_cookie) {
1484#ifdef CONFIG_SYN_COOKIES
1485                syn_flood_warning(skb);
1486#endif
1487                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1488        } else if (!isn) {
1489                struct inet_peer *peer = NULL;
1490
1491                /* VJ's idea. We save last timestamp seen
1492                 * from the destination in peer table, when entering
1493                 * state TIME-WAIT, and check against it before
1494                 * accepting new connection request.
1495                 *
1496                 * If "isn" is not zero, this request hit alive
1497                 * timewait bucket, so that all the necessary checks
1498                 * are made in the function processing timewait state.
1499                 */
1500                if (tp.saw_tstamp &&
1501                    sysctl_tcp_tw_recycle &&
1502                    (dst = tcp_v4_route_req(sk, req)) != NULL &&
1503                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1504                    peer->v4daddr == saddr) {
1505                        if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1506                            (s32)(peer->tcp_ts - req->ts_recent) >
1507                                                        TCP_PAWS_WINDOW) {
1508                                NET_INC_STATS_BH(PAWSPassiveRejected);
1509                                dst_release(dst);
1510                                goto drop_and_free;
1511                        }
1512                }
1513                /* Kill the following clause, if you dislike this way. */
1514                else if (!sysctl_tcp_syncookies &&
1515                         (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1516                          (sysctl_max_syn_backlog >> 2)) &&
1517                         (!peer || !peer->tcp_ts_stamp) &&
1518                         (!dst || !dst_metric(dst, RTAX_RTT))) {
1519                        /* Without syncookies last quarter of
1520                         * backlog is filled with destinations,
1521                         * proven to be alive.
1522                         * It means that we continue to communicate
1523                         * to destinations, already remembered
1524                         * to the moment of synflood.
1525                         */
1526                        NETDEBUG(if (net_ratelimit()) \
1527                                        printk(KERN_DEBUG "TCP: drop open "
1528                                                          "request from %u.%u."
1529                                                          "%u.%u/%u\n", \
1530                                               NIPQUAD(saddr),
1531                                               ntohs(skb->h.th->source)));
1532                        dst_release(dst);
1533                        goto drop_and_free;
1534                }
1535
1536                isn = tcp_v4_init_sequence(sk, skb);
1537        }
1538        req->snt_isn = isn;
1539
1540        if (tcp_v4_send_synack(sk, req, dst))
1541                goto drop_and_free;
1542
1543        if (want_cookie) {
1544                tcp_openreq_free(req);
1545        } else {
1546                tcp_v4_synq_add(sk, req);
1547        }
1548        return 0;
1549
1550drop_and_free:
1551        tcp_openreq_free(req);
1552drop:
1553        TCP_INC_STATS_BH(TcpAttemptFails);
1554        return 0;
1555}
1556
1557
1558/*
1559 * The three way handshake has completed - we got a valid synack -
1560 * now create the new socket.
1561 */
1562struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1563                                  struct open_request *req,
1564                                  struct dst_entry *dst)
1565{
1566        struct inet_opt *newinet;
1567        struct tcp_opt *newtp;
1568        struct sock *newsk;
1569
1570        if (tcp_acceptq_is_full(sk))
1571                goto exit_overflow;
1572
1573        if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1574                goto exit;
1575
1576        newsk = tcp_create_openreq_child(sk, req, skb);
1577        if (!newsk)
1578                goto exit;
1579
1580        newsk->sk_dst_cache = dst;
1581        tcp_v4_setup_caps(newsk, dst);
1582
1583        newtp                 = tcp_sk(newsk);
1584        newinet               = inet_sk(newsk);
1585        newinet->daddr        = req->af.v4_req.rmt_addr;
1586        newinet->rcv_saddr    = req->af.v4_req.loc_addr;
1587        newinet->saddr        = req->af.v4_req.loc_addr;
1588        newinet->opt          = req->af.v4_req.opt;
1589        req->af.v4_req.opt    = NULL;
1590        newinet->mc_index     = tcp_v4_iif(skb);
1591        newinet->mc_ttl       = skb->nh.iph->ttl;
1592        newtp->ext_header_len = 0;
1593        if (newinet->opt)
1594                newtp->ext_header_len = newinet->opt->optlen;
1595        newtp->ext2_header_len = dst->header_len;
1596        newinet->id = newtp->write_seq ^ jiffies;
1597
1598        tcp_sync_mss(newsk, dst_pmtu(dst));
1599        newtp->advmss = dst_metric(dst, RTAX_ADVMSS);;
1600        tcp_initialize_rcv_mss(newsk);
1601
1602        __tcp_v4_hash(newsk, 0);
1603        __tcp_inherit_port(sk, newsk);
1604
1605        return newsk;
1606
1607exit_overflow:
1608        NET_INC_STATS_BH(ListenOverflows);
1609exit:
1610        NET_INC_STATS_BH(ListenDrops);
1611        dst_release(dst);
1612        return NULL;
1613}
1614
1615static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1616{
1617        struct tcphdr *th = skb->h.th;
1618        struct iphdr *iph = skb->nh.iph;
1619        struct tcp_opt *tp = tcp_sk(sk);
1620        struct sock *nsk;
1621        struct open_request **prev;
1622        /* Find possible connection requests. */
1623        struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
1624                                                     iph->saddr, iph->daddr);
1625        if (req)
1626                return tcp_check_req(sk, skb, req, prev);
1627
1628        nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1629                                          th->source,
1630                                          skb->nh.iph->daddr,
1631                                          ntohs(th->dest),
1632                                          tcp_v4_iif(skb));
1633
1634        if (nsk) {
1635                if (nsk->sk_state != TCP_TIME_WAIT) {
1636                        bh_lock_sock(nsk);
1637                        return nsk;
1638                }
1639                tcp_tw_put((struct tcp_tw_bucket *)nsk);
1640                return NULL;
1641        }
1642
1643#ifdef CONFIG_SYN_COOKIES
1644        if (!th->rst && !th->syn && th->ack)
1645                sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1646#endif
1647        return sk;
1648}
1649
1650static int tcp_v4_checksum_init(struct sk_buff *skb)
1651{
1652        if (skb->ip_summed == CHECKSUM_HW) {
1653                skb->ip_summed = CHECKSUM_UNNECESSARY;
1654                if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1655                                  skb->nh.iph->daddr, skb->csum))
1656                        return 0;
1657
1658                NETDEBUG(if (net_ratelimit())
1659                                printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1660                skb->ip_summed = CHECKSUM_NONE;
1661        }
1662        if (skb->len <= 76) {
1663                if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1664                                 skb->nh.iph->daddr,
1665                                 skb_checksum(skb, 0, skb->len, 0)))
1666                        return -1;
1667                skb->ip_summed = CHECKSUM_UNNECESSARY;
1668        } else {
1669                skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1670                                          skb->nh.iph->saddr,
1671                                          skb->nh.iph->daddr, 0);
1672        }
1673        return 0;
1674}
1675
1676
1677/* The socket must have it's spinlock held when we get
1678 * here.
1679 *
1680 * We have a potential double-lock case here, so even when
1681 * doing backlog processing we use the BH locking scheme.
1682 * This is because we cannot sleep with the original spinlock
1683 * held.
1684 */
1685int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1686{
1687        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1688                TCP_CHECK_TIMER(sk);
1689                if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1690                        goto reset;
1691                TCP_CHECK_TIMER(sk);
1692                return 0;
1693        }
1694
1695        if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1696                goto csum_err;
1697
1698        if (sk->sk_state == TCP_LISTEN) {
1699                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1700                if (!nsk)
1701                        goto discard;
1702
1703                if (nsk != sk) {
1704                        if (tcp_child_process(sk, nsk, skb))
1705                                goto reset;
1706                        return 0;
1707                }
1708        }
1709
1710        TCP_CHECK_TIMER(sk);
1711        if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1712                goto reset;
1713        TCP_CHECK_TIMER(sk);
1714        return 0;
1715
1716reset:
1717        tcp_v4_send_reset(skb);
1718discard:
1719        kfree_skb(skb);
1720        /* Be careful here. If this function gets more complicated and
1721         * gcc suffers from register pressure on the x86, sk (in %ebx)
1722         * might be destroyed here. This current version compiles correctly,
1723         * but you have been warned.
1724         */
1725        return 0;
1726
1727csum_err:
1728        TCP_INC_STATS_BH(TcpInErrs);
1729        goto discard;
1730}
1731
1732/*
1733 *      From tcp_input.c
1734 */
1735
1736int tcp_v4_rcv(struct sk_buff *skb)
1737{
1738        struct tcphdr *th;
1739        struct sock *sk;
1740        int ret;
1741
1742        if (skb->pkt_type != PACKET_HOST)
1743                goto discard_it;
1744
1745        /* Count it even if it's bad */
1746        TCP_INC_STATS_BH(TcpInSegs);
1747
1748        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1749                goto discard_it;
1750
1751        th = skb->h.th;
1752
1753        if (th->doff < sizeof(struct tcphdr) / 4)
1754                goto bad_packet;
1755        if (!pskb_may_pull(skb, th->doff * 4))
1756                goto discard_it;
1757
1758        /* An explanation is required here, I think.
1759         * Packet length and doff are validated by header prediction,
1760         * provided case of th->doff==0 is elimineted.
1761         * So, we defer the checks. */
1762        if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1763             tcp_v4_checksum_init(skb) < 0))
1764                goto bad_packet;
1765
1766        th = skb->h.th;
1767        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1768        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1769                                    skb->len - th->doff * 4);
1770        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1771        TCP_SKB_CB(skb)->when    = 0;
1772        TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1773        TCP_SKB_CB(skb)->sacked  = 0;
1774
1775        sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1776                             skb->nh.iph->daddr, ntohs(th->dest),
1777                             tcp_v4_iif(skb));
1778
1779        if (!sk)
1780                goto no_tcp_socket;
1781
1782process:
1783        if (sk->sk_state == TCP_TIME_WAIT)
1784                goto do_time_wait;
1785
1786        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1787                goto discard_and_relse;
1788
1789        if (sk_filter(sk, skb, 0))
1790                goto discard_and_relse;
1791
1792        skb->dev = NULL;
1793
1794        bh_lock_sock(sk);
1795        ret = 0;
1796        if (!sock_owned_by_user(sk)) {
1797                if (!tcp_prequeue(sk, skb))
1798                        ret = tcp_v4_do_rcv(sk, skb);
1799        } else
1800                sk_add_backlog(sk, skb);
1801        bh_unlock_sock(sk);
1802
1803        sock_put(sk);
1804
1805        return ret;
1806
1807no_tcp_socket:
1808        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1809                goto discard_it;
1810
1811        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1812bad_packet:
1813                TCP_INC_STATS_BH(TcpInErrs);
1814        } else {
1815                tcp_v4_send_reset(skb);
1816        }
1817
1818discard_it:
1819        /* Discard frame. */
1820        kfree_skb(skb);
1821        return 0;
1822
1823discard_and_relse:
1824        sock_put(sk);
1825        goto discard_it;
1826
1827do_time_wait:
1828        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1829                goto discard_and_relse;
1830
1831        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1832                TCP_INC_STATS_BH(TcpInErrs);
1833                goto discard_and_relse;
1834        }
1835        switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1836                                           skb, th, skb->len)) {
1837        case TCP_TW_SYN: {
1838                struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1839                                                          ntohs(th->dest),
1840                                                          tcp_v4_iif(skb));
1841                if (sk2) {
1842                        tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1843                        tcp_tw_put((struct tcp_tw_bucket *)sk);
1844                        sk = sk2;
1845                        goto process;
1846                }
1847                /* Fall through to ACK */
1848        }
1849        case TCP_TW_ACK:
1850                tcp_v4_timewait_ack(sk, skb);
1851                break;
1852        case TCP_TW_RST:
1853                goto no_tcp_socket;
1854        case TCP_TW_SUCCESS:;
1855        }
1856        goto discard_it;
1857}
1858
1859/* With per-bucket locks this operation is not-atomic, so that
1860 * this version is not worse.
1861 */
1862static void __tcp_v4_rehash(struct sock *sk)
1863{
1864        sk->sk_prot->unhash(sk);
1865        sk->sk_prot->hash(sk);
1866}
1867
1868static int tcp_v4_reselect_saddr(struct sock *sk)
1869{
1870        struct inet_opt *inet = inet_sk(sk);
1871        int err;
1872        struct rtable *rt;
1873        __u32 old_saddr = inet->saddr;
1874        __u32 new_saddr;
1875        __u32 daddr = inet->daddr;
1876
1877        if (inet->opt && inet->opt->srr)
1878                daddr = inet->opt->faddr;
1879
1880        /* Query new route. */
1881        err = ip_route_connect(&rt, daddr, 0,
1882                               RT_TOS(inet->tos) | sk->sk_localroute,
1883                               sk->sk_bound_dev_if,
1884                               IPPROTO_TCP,
1885                               inet->sport, inet->dport, sk);
1886        if (err)
1887                return err;
1888
1889        __sk_dst_set(sk, &rt->u.dst);
1890        tcp_v4_setup_caps(sk, &rt->u.dst);
1891        tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
1892
1893        new_saddr = rt->rt_src;
1894
1895        if (new_saddr == old_saddr)
1896                return 0;
1897
1898        if (sysctl_ip_dynaddr > 1) {
1899                printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1900                                 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1901                       NIPQUAD(old_saddr),
1902                       NIPQUAD(new_saddr));
1903        }
1904
1905        inet->saddr = new_saddr;
1906        inet->rcv_saddr = new_saddr;
1907
1908        /* XXX The only one ugly spot where we need to
1909         * XXX really change the sockets identity after
1910         * XXX it has entered the hashes. -DaveM
1911         *
1912         * Besides that, it does not check for connection
1913         * uniqueness. Wait for troubles.
1914         */
1915        __tcp_v4_rehash(sk);
1916        return 0;
1917}
1918
1919int tcp_v4_rebuild_header(struct sock *sk)
1920{
1921        struct inet_opt *inet = inet_sk(sk);
1922        struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1923        u32 daddr;
1924        int err;
1925
1926        /* Route is OK, nothing to do. */
1927        if (rt)
1928                return 0;
1929
1930        /* Reroute. */
1931        daddr = inet->daddr;
1932        if (inet->opt && inet->opt->srr)
1933                daddr = inet->opt->faddr;
1934
1935        {
1936                struct flowi fl = { .oif = sk->sk_bound_dev_if,
1937                                    .nl_u = { .ip4_u =
1938                                              { .daddr = daddr,
1939                                                .saddr = inet->saddr,
1940                                                .tos = RT_CONN_FLAGS(sk) } },
1941                                    .proto = IPPROTO_TCP,
1942                                    .uli_u = { .ports =
1943                                               { .sport = inet->sport,
1944                                                 .dport = inet->dport } } };
1945                                                
1946                err = ip_route_output_flow(&rt, &fl, sk, 0);
1947        }
1948        if (!err) {
1949                __sk_dst_set(sk, &rt->u.dst);
1950                tcp_v4_setup_caps(sk, &rt->u.dst);
1951                tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
1952                return 0;
1953        }
1954
1955        /* Routing failed... */
1956        sk->sk_route_caps = 0;
1957
1958        if (!sysctl_ip_dynaddr ||
1959            sk->sk_state != TCP_SYN_SENT ||
1960            (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1961            (err = tcp_v4_reselect_saddr(sk)) != 0)
1962                sk->sk_err_soft = -err;
1963
1964        return err;
1965}
1966
1967static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1968{
1969        struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1970        struct inet_opt *inet = inet_sk(sk);
1971
1972        sin->sin_family         = AF_INET;
1973        sin->sin_addr.s_addr    = inet->daddr;
1974        sin->sin_port           = inet->dport;
1975}
1976
1977/* VJ's idea. Save last timestamp seen from this destination
1978 * and hold it at least for normal timewait interval to use for duplicate
1979 * segment detection in subsequent connections, before they enter synchronized
1980 * state.
1981 */
1982
1983int tcp_v4_remember_stamp(struct sock *sk)
1984{
1985        struct inet_opt *inet = inet_sk(sk);
1986        struct tcp_opt *tp = tcp_sk(sk);
1987        struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1988        struct inet_peer *peer = NULL;
1989        int release_it = 0;
1990
1991        if (!rt || rt->rt_dst != inet->daddr) {
1992                peer = inet_getpeer(inet->daddr, 1);
1993                release_it = 1;
1994        } else {
1995                if (!rt->peer)
1996                        rt_bind_peer(rt, 1);
1997                peer = rt->peer;
1998        }
1999
2000        if (peer) {
2001                if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
2002                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2003                     peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
2004                        peer->tcp_ts_stamp = tp->ts_recent_stamp;
2005                        peer->tcp_ts = tp->ts_recent;
2006                }
2007                if (release_it)
2008                        inet_putpeer(peer);
2009                return 1;
2010        }
2011
2012        return 0;
2013}
2014
2015int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2016{
2017        struct inet_peer *peer = NULL;
2018
2019        peer = inet_getpeer(tw->tw_daddr, 1);
2020
2021        if (peer) {
2022                if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
2023                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2024                     peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2025                        peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2026                        peer->tcp_ts = tw->tw_ts_recent;
2027                }
2028                inet_putpeer(peer);
2029                return 1;
2030        }
2031
2032        return 0;
2033}
2034
2035struct tcp_func ipv4_specific = {
2036        .queue_xmit     =       ip_queue_xmit,
2037        .send_check     =       tcp_v4_send_check,
2038        .rebuild_header =       tcp_v4_rebuild_header,
2039        .conn_request   =       tcp_v4_conn_request,
2040        .syn_recv_sock  =       tcp_v4_syn_recv_sock,
2041        .remember_stamp =       tcp_v4_remember_stamp,
2042        .net_header_len =       sizeof(struct iphdr),
2043        .setsockopt     =       ip_setsockopt,
2044        .getsockopt     =       ip_getsockopt,
2045        .addr2sockaddr  =       v4_addr2sockaddr,
2046        .sockaddr_len   =       sizeof(struct sockaddr_in),
2047};
2048
2049/* NOTE: A lot of things set to zero explicitly by call to
2050 *       sk_alloc() so need not be done here.
2051 */
2052static int tcp_v4_init_sock(struct sock *sk)
2053{
2054        struct tcp_opt *tp = tcp_sk(sk);
2055
2056        skb_queue_head_init(&tp->out_of_order_queue);
2057        tcp_init_xmit_timers(sk);
2058        tcp_prequeue_init(tp);
2059
2060        tp->rto  = TCP_TIMEOUT_INIT;
2061        tp->mdev = TCP_TIMEOUT_INIT;
2062
2063        /* So many TCP implementations out there (incorrectly) count the
2064         * initial SYN frame in their delayed-ACK and congestion control
2065         * algorithms that we must have the following bandaid to talk
2066         * efficiently to them.  -DaveM
2067         */
2068        tp->snd_cwnd = 2;
2069
2070        /* See draft-stevens-tcpca-spec-01 for discussion of the
2071         * initialization of these values.
2072         */
2073        tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
2074        tp->snd_cwnd_clamp = ~0;
2075        tp->mss_cache = 536;
2076
2077        tp->reordering = sysctl_tcp_reordering;
2078
2079        sk->sk_state = TCP_CLOSE;
2080
2081        sk->sk_write_space = tcp_write_space;
2082        sk->sk_use_write_queue = 1;
2083
2084        tp->af_specific = &ipv4_specific;
2085
2086        sk->sk_sndbuf = sysctl_tcp_wmem[1];
2087        sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2088
2089        atomic_inc(&tcp_sockets_allocated);
2090
2091        return 0;
2092}
2093
2094static int tcp_v4_destroy_sock(struct sock *sk)
2095{
2096        struct tcp_opt *tp = tcp_sk(sk);
2097
2098        tcp_clear_xmit_timers(sk);
2099
2100        /* Cleanup up the write buffer. */
2101        tcp_writequeue_purge(sk);
2102
2103        /* Cleans up our, hopefully empty, out_of_order_queue. */
2104        __skb_queue_purge(&tp->out_of_order_queue);
2105
2106        /* Clean prequeue, it must be empty really */
2107        __skb_queue_purge(&tp->ucopy.prequeue);
2108
2109        /* Clean up a referenced TCP bind bucket. */
2110        if (tp->bind_hash)
2111                tcp_put_port(sk);
2112
2113        /* If sendmsg cached page exists, toss it. */
2114        if (inet_sk(sk)->sndmsg_page)
2115                __free_page(inet_sk(sk)->sndmsg_page);
2116
2117        atomic_dec(&tcp_sockets_allocated);
2118
2119        return 0;
2120}
2121
2122#ifdef CONFIG_PROC_FS
2123/* Proc filesystem TCP sock list dumping. */
2124
2125static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2126{
2127        return hlist_empty(head) ? NULL :
2128                list_entry(head->first, struct tcp_tw_bucket, tw_node);
2129}
2130
2131static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2132{
2133        return tw->tw_node.next ?
2134                hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2135}
2136
2137static void *listening_get_first(struct seq_file *seq)
2138{
2139        struct tcp_iter_state* st = seq->private;
2140        void *rc = NULL;
2141
2142        for (st->bucket = 0; st->bucket < TCP_LHTABLE_SIZE; ++st->bucket) {
2143                struct open_request *req;
2144                struct tcp_opt *tp;
2145                struct sock *sk = sk_head(&tcp_listening_hash[st->bucket]);
2146
2147                if (!sk)
2148                        continue;
2149                if (sk->sk_family == st->family) {
2150                        rc = sk;
2151                        goto out;
2152                }
2153                tp = tcp_sk(sk);
2154                read_lock_bh(&tp->syn_wait_lock);
2155                if (tp->listen_opt && tp->listen_opt->qlen) {
2156                        st->uid         = sock_i_uid(sk);
2157                        st->syn_wait_sk = sk;
2158                        st->state       = TCP_SEQ_STATE_OPENREQ;
2159                        for (st->sbucket = 0; st->sbucket < TCP_SYNQ_HSIZE;
2160                             ++st->sbucket) {
2161                                for (req = tp->listen_opt->syn_table[st->sbucket];
2162                                     req; req = req->dl_next) {
2163                                        if (req->class->family != st->family)
2164                                                continue;
2165                                        rc = req;
2166                                        goto out;
2167                                }
2168                        }
2169                        st->state = TCP_SEQ_STATE_LISTENING;
2170                }
2171                read_unlock_bh(&tp->syn_wait_lock);
2172        }
2173out:
2174        return rc;
2175}
2176
/*
 * Advance the /proc iterator over the listening hash by one entry.
 *
 * @cur is the entry produced by the previous step: a struct open_request
 * when st->state is TCP_SEQ_STATE_OPENREQ, otherwise a struct sock.
 *
 * Returns the next socket or open request whose family equals st->family,
 * or NULL once all TCP_LHTABLE_SIZE buckets have been walked.  Whenever an
 * open request is returned, the syn_wait_lock of st->syn_wait_sk is left
 * read-locked; it is dropped here once that listener's request table is
 * exhausted.
 */
2177static void *listening_get_next(struct seq_file *seq, void *cur)
2178{
2179        struct tcp_opt *tp;
2180        struct hlist_node *node;
2181        struct sock *sk = cur;
2182        struct tcp_iter_state* st = seq->private;
2183
        /* One more entry has been consumed by the iterator. */
2184        ++st->num;
2185
2186        if (st->state == TCP_SEQ_STATE_OPENREQ) {
2187                struct open_request *req = cur;
2188
                /* Continue in the request table of the current listener. */
2189                tp = tcp_sk(st->syn_wait_sk);
2190                req = req->dl_next;
2191                while (1) {
                        /* Scan the rest of the current request chain. */
2192                        while (req) {
2193                                if (req->class->family == st->family) {
2194                                        cur = req;
2195                                        goto out;
2196                                }
2197                                req = req->dl_next;
2198                        }
                        /* Chain done: move on to the next SYN table slot. */
2199                        if (++st->sbucket >= TCP_SYNQ_HSIZE)
2200                                break;
                /* Also entered from the socket walk below, with tp's
                 * syn_wait_lock read-locked and st->sbucket reset to 0.
                 */
2201get_req:
2202                        req = tp->listen_opt->syn_table[st->sbucket];
2203                }
                /* Request table exhausted: resume the socket walk after the
                 * listener and release its syn_wait_lock.
                 */
2204                sk        = sk_next(st->syn_wait_sk);
2205                st->state = TCP_SEQ_STATE_LISTENING;
2206                read_unlock_bh(&tp->syn_wait_lock);
2207        } else
2208                sk = sk_next(sk);
2209get_sk:
        /* Walk the remaining sockets of the current bucket. */
2210        sk_for_each_from(sk, node) {
2211                if (sk->sk_family == st->family) {
2212                        cur = sk;
2213                        goto out;
2214                }
                /* Other family: still look through its pending requests. */
2215                tp = tcp_sk(sk);
2216                read_lock_bh(&tp->syn_wait_lock);
2217                if (tp->listen_opt && tp->listen_opt->qlen) {
2218                        st->uid         = sock_i_uid(sk);
2219                        st->syn_wait_sk = sk;
2220                        st->state       = TCP_SEQ_STATE_OPENREQ;
2221                        st->sbucket     = 0;
2222                        goto get_req;
2223                }
2224                read_unlock_bh(&tp->syn_wait_lock);
2225        }
        /* Bucket finished: restart the walk at the head of the next one. */
2226        if (++st->bucket < TCP_LHTABLE_SIZE) {
2227                sk = sk_head(&tcp_listening_hash[st->bucket]);
2228                goto get_sk;
2229        }
2230        cur = NULL;
2231out:
2232        return cur;
2233}
2234
2235static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2236{
2237        void *rc = listening_get_first(seq);
2238
2239        while (rc && *pos) {
2240                rc = listening_get_next(seq, rc);
2241                --*pos;
2242        }
2243        return rc;
2244}
2245
2246static void *established_get_first(struct seq_file *seq)
2247{
2248        struct tcp_iter_state* st = seq->private;
2249        void *rc = NULL;
2250
2251        for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2252                struct sock *sk;
2253                struct hlist_node *node;
2254                struct tcp_tw_bucket *tw;
2255               
2256                read_lock(&tcp_ehash[st->bucket].lock);
2257                sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2258                        if (sk->sk_family != st->family) {
2259                                continue;
2260                        }
2261                        rc = sk;
2262                        goto out;
2263                }
2264                st->state = TCP_SEQ_STATE_TIME_WAIT;
2265                tw_for_each(tw, node,
2266                            &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2267                        if (tw->tw_family != st->family) {
2268                                continue;
2269                        }
2270                        rc = tw;
2271                        goto out;
2272                }
2273                read_unlock(&tcp_ehash[st->bucket].lock);
2274                st->state = TCP_SEQ_STATE_ESTABLISHED;
2275        }
2276out:
2277        return rc;
2278}
2279
/*
 * Advance the /proc iterator over the established hash by one entry.
 *
 * @cur is the previous entry: a struct tcp_tw_bucket when st->state is
 * TCP_SEQ_STATE_TIME_WAIT, otherwise a struct sock.
 *
 * Returns the next entry whose family equals st->family, or NULL after the
 * last bucket.  The read lock of tcp_ehash[st->bucket] is not taken on
 * entry, so the caller is expected to hold it; it stays held whenever a
 * non-NULL entry is returned and has been released when NULL is returned.
 */
2280static void *established_get_next(struct seq_file *seq, void *cur)
2281{
2282        struct sock *sk = cur;
2283        struct tcp_tw_bucket *tw;
2284        struct hlist_node *node;
2285        struct tcp_iter_state* st = seq->private;
2286
        /* One more entry has been consumed by the iterator. */
2287        ++st->num;
2288
2289        if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2290                tw = cur;
2291                tw = tw_next(tw);
                /* Also entered from below once the socket chain of the
                 * current bucket is exhausted.
                 */
2292get_tw:
2293                while (tw && tw->tw_family != st->family) {
2294                        tw = tw_next(tw);
2295                }
2296                if (tw) {
2297                        cur = tw;
2298                        goto out;
2299                }
                /* Timewait chain done: this bucket is finished, so swap its
                 * lock for the next bucket's and restart with sockets.
                 */
2300                read_unlock(&tcp_ehash[st->bucket].lock);
2301                st->state = TCP_SEQ_STATE_ESTABLISHED;
2302                if (++st->bucket < tcp_ehash_size) {
2303                        read_lock(&tcp_ehash[st->bucket].lock);
2304                        sk = sk_head(&tcp_ehash[st->bucket].chain);
2305                } else {
2306                        cur = NULL;
2307                        goto out;
2308                }
2309        } else
2310                sk = sk_next(sk);
2311
        /* Look for the next socket of our family on the current chain. */
2312        sk_for_each_from(sk, node) {
2313                if (sk->sk_family == st->family)
2314                        goto found;
2315        }
2316
        /* No more sockets here: continue with this bucket's timewait chain,
         * which lives tcp_ehash_size slots further on.
         */
2317        st->state = TCP_SEQ_STATE_TIME_WAIT;
2318        tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2319        goto get_tw;
2320found:
2321        cur = sk;
2322out:
2323        return cur;
2324}
2325
2326static void *established_get_idx(struct seq_file *seq, loff_t pos)
2327{
2328        void *rc = established_get_first(seq);
2329
2330        while (rc && pos) {
2331                rc = established_get_next(seq, rc);
2332                --pos;
2333        }               
2334        return rc;
2335}
2336
2337static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2338{
2339        void *rc;
2340        struct tcp_iter_state* st = seq->private;
2341
2342        tcp_listen_lock();
2343        st->state = TCP_SEQ_STATE_LISTENING;
2344        rc        = listening_get_idx(seq, &pos);
2345
2346        if (!rc) {
2347                tcp_listen_unlock();
2348                local_bh_disable();
2349                st->state = TCP_SEQ_STATE_ESTABLISHED;
2350                rc        = established_get_idx(seq, pos);
2351        }
2352
2353        return rc;
2354}
2355
2356static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2357{
2358        struct tcp_iter_state* st = seq->private;
2359        st->state = TCP_SEQ_STATE_LISTENING;
2360        st->num = 0;
2361        return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2362}
2363
2364static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2365{
2366        void *rc = NULL;
2367        struct tcp_iter_state* st;
2368
2369        if (v == SEQ_START_TOKEN) {
2370                rc = tcp_get_idx(seq, 0);
2371                goto out;
2372        }
2373        st = seq->private;
2374
2375        switch (st->state) {
2376        case TCP_SEQ_STATE_OPENREQ:
2377        case TCP_SEQ_STATE_LISTENING:
2378                rc = listening_get_next(seq, v);
2379                if (!rc) {
2380                        tcp_listen_unlock();
2381                        local_bh_disable();
2382                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2383                        rc        = established_get_first(seq);
2384                }
2385                break;
2386        case TCP_SEQ_STATE_ESTABLISHED:
2387        case TCP_SEQ_STATE_TIME_WAIT:
2388                rc = established_get_next(seq, v);
2389                break;
2390        }
2391out:
2392        ++*pos;
2393        return rc;
2394}
2395
2396static void tcp_seq_stop(struct seq_file *seq, void *v)
2397{
2398        struct tcp_iter_state* st = seq->private;
2399
2400        switch (st->state) {
2401        case TCP_SEQ_STATE_OPENREQ:
2402                if (v) {
2403                        struct tcp_opt *tp = tcp_sk(st->syn_wait_sk);
2404                        read_unlock_bh(&tp->syn_wait_lock);
2405                }
2406        case TCP_SEQ_STATE_LISTENING:
2407                if (v != SEQ_START_TOKEN)
2408                        tcp_listen_unlock();
2409                break;
2410        case TCP_SEQ_STATE_TIME_WAIT:
2411        case TCP_SEQ_STATE_ESTABLISHED:
2412                if (v)
2413                        read_unlock(&tcp_ehash[st->bucket].lock);
2414                local_bh_enable();
2415                break;
2416        }
2417}
2418
2419static int tcp_seq_open(struct inode *inode, struct file *file)
2420{
2421        struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2422        struct seq_file *seq;
2423        struct tcp_iter_state *s;
2424        int rc;
2425
2426        if (unlikely(afinfo == NULL))
2427                return -EINVAL;
2428
2429        s = kmalloc(sizeof(*s), GFP_KERNEL);
2430        if (!s)
2431                return -ENOMEM;
2432        memset(s, 0, sizeof(*s));
2433        s->family               = afinfo->family;
2434        s->seq_ops.start        = tcp_seq_start;
2435        s->seq_ops.next         = tcp_seq_next;
2436        s->seq_ops.show         = afinfo->seq_show;
2437        s->seq_ops.stop         = tcp_seq_stop;
2438
2439        rc = seq_open(file, &s->seq_ops);
2440        if (rc)
2441                goto out_kfree;
2442        seq          = file->private_data;
2443        seq->private = s;
2444out:
2445        return rc;
2446out_kfree:
2447        kfree(s);
2448        goto out;
2449}
2450
2451int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2452{
2453        int rc = 0;
2454        struct proc_dir_entry *p;
2455
2456        if (!afinfo)
2457                return -EINVAL;
2458        afinfo->seq_fops->owner         = afinfo->owner;
2459        afinfo->seq_fops->open          = tcp_seq_open;
2460        afinfo->seq_fops->read          = seq_read;
2461        afinfo->seq_fops->llseek        = seq_lseek;
2462        afinfo->seq_fops->release       = seq_release_private;
2463        
2464        p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2465        if (p)
2466                p->data = afinfo;
2467        else
2468                rc = -ENOMEM;
2469        return rc;
2470}
2471
2472void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2473{
2474        if (!afinfo)
2475                return;
2476        proc_net_remove(afinfo->name);
2477        memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
2478}
2479
2480static void get_openreq4(struct sock *sk, struct open_request *req,
2481                         char *tmpbuf, int i, int uid)
2482{
2483        int ttd = req->expires - jiffies;
2484
2485        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2486                " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
2487                i,
2488                req->af.v4_req.loc_addr,
2489                ntohs(inet_sk(sk)->sport),
2490                req->af.v4_req.rmt_addr,
2491                ntohs(req->rmt_port),
2492                TCP_SYN_RECV,
2493                0, 0, /* could print option size, but that is af dependent. */
2494                1,    /* timers active (only the expire timer) */
2495                jiffies_to_clock_t(ttd),
2496                req->retrans,
2497                uid,
2498                0,  /* non standard timer */
2499                0, /* open_requests have no inode */
2500                atomic_read(&sk->sk_refcnt),
2501                req);
2502}
2503
2504static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2505{
2506        int timer_active;
2507        unsigned long timer_expires;
2508        struct tcp_opt *tp = tcp_sk(sp);
2509        struct inet_opt *inet = inet_sk(sp);
2510        unsigned int dest = inet->daddr;
2511        unsigned int src = inet->rcv_saddr;
2512        __u16 destp = ntohs(inet->dport);
2513        __u16 srcp = ntohs(inet->sport);
2514
2515        if (tp->pending == TCP_TIME_RETRANS) {
2516                timer_active    = 1;
2517                timer_expires   = tp->timeout;
2518        } else if (tp->pending == TCP_TIME_PROBE0) {
2519                timer_active    = 4;
2520                timer_expires   = tp->timeout;
2521        } else if (timer_pending(&sp->sk_timer)) {
2522                timer_active    = 2;
2523                timer_expires   = sp->sk_timer.expires;
2524        } else {
2525                timer_active    = 0;
2526                timer_expires = jiffies;
2527        }
2528
2529        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2530                        "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2531                i, src, srcp, dest, destp, sp->sk_state,
2532                tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2533                timer_active,
2534                jiffies_to_clock_t(timer_expires - jiffies),
2535                tp->retransmits,
2536                sock_i_uid(sp),
2537                tp->probes_out,
2538                sock_i_ino(sp),
2539                atomic_read(&sp->sk_refcnt), sp,
2540                tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2541                tp->snd_cwnd,
2542                tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2543}
2544
2545static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2546{
2547        unsigned int dest, src;
2548        __u16 destp, srcp;
2549        int ttd = tw->tw_ttd - jiffies;
2550
2551        if (ttd < 0)
2552                ttd = 0;
2553
2554        dest  = tw->tw_daddr;
2555        src   = tw->tw_rcv_saddr;
2556        destp = ntohs(tw->tw_dport);
2557        srcp  = ntohs(tw->tw_sport);
2558
2559        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2560                " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2561                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2562                3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2563                atomic_read(&tw->tw_refcnt), tw);
2564}
2565
2566#define TMPSZ 150
2567
2568static int tcp4_seq_show(struct seq_file *seq, void *v)
2569{
2570        struct tcp_iter_state* st;
2571        char tmpbuf[TMPSZ + 1];
2572
2573        if (v == SEQ_START_TOKEN) {
2574                seq_printf(seq, "%-*s\n", TMPSZ - 1,
2575                           "  sl  local_address rem_address   st tx_queue "
2576                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2577                           "inode");
2578                goto out;
2579        }
2580        st = seq->private;
2581
2582        switch (st->state) {
2583        case TCP_SEQ_STATE_LISTENING:
2584        case TCP_SEQ_STATE_ESTABLISHED:
2585                get_tcp4_sock(v, tmpbuf, st->num);
2586                break;
2587        case TCP_SEQ_STATE_OPENREQ:
2588                get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2589                break;
2590        case TCP_SEQ_STATE_TIME_WAIT:
2591                get_timewait4_sock(v, tmpbuf, st->num);
2592                break;
2593        }
2594        seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2595out:
2596        return 0;
2597}
2598
2599static struct file_operations tcp4_seq_fops;
2600static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2601        .owner          = THIS_MODULE,
2602        .name           = "tcp",
2603        .family         = AF_INET,
2604        .seq_show       = tcp4_seq_show,
2605        .seq_fops       = &tcp4_seq_fops,
2606};
2607
2608int __init tcp4_proc_init(void)
2609{
2610        return tcp_proc_register(&tcp4_seq_afinfo);
2611}
2612
2613void tcp4_proc_exit(void)
2614{
2615        tcp_proc_unregister(&tcp4_seq_afinfo);
2616}
2617#endif /* CONFIG_PROC_FS */
2618
2619struct proto tcp_prot = {
2620        .name           =       "TCP",
2621        .close          =       tcp_close,
2622        .connect        =       tcp_v4_connect,
2623        .disconnect     =       tcp_disconnect,
2624        .accept         =       tcp_accept,
2625        .ioctl          =       tcp_ioctl,
2626        .init           =       tcp_v4_init_sock,
2627        .destroy        =       tcp_v4_destroy_sock,
2628        .shutdown       =       tcp_shutdown,
2629        .setsockopt     =       tcp_setsockopt,
2630        .getsockopt     =       tcp_getsockopt,
2631        .sendmsg        =       tcp_sendmsg,
2632        .recvmsg        =       tcp_recvmsg,
2633        .backlog_rcv    =       tcp_v4_do_rcv,
2634        .hash           =       tcp_v4_hash,
2635        .unhash         =       tcp_unhash,
2636        .get_port       =       tcp_v4_get_port,
2637};
2638
2639
2640
2641void __init tcp_v4_init(struct net_proto_family *ops)
2642{
2643        int err = sock_create(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2644        if (err < 0)
2645                panic("Failed to create the TCP control socket.\n");
2646        tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2647        inet_sk(tcp_socket->sk)->uc_ttl = -1;
2648
2649        /* Unhash it so that IP input processing does not even
2650         * see it, we do not wish this socket to see incoming
2651         * packets.
2652         */
2653        tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2654}
2655
/* tcp_v4_* symbols. */
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_lookup_listener);
EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* Remaining exported symbols. */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_bind_hash);
EXPORT_SYMBOL(tcp_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_inherit_port);
EXPORT_SYMBOL(tcp_listen_wlock);
EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);

#ifdef CONFIG_PROC_FS
/* The registration pair used by tcp4_proc_init()/tcp4_proc_exit(). */
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_max_syn_backlog);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
#endif
2684
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.