linux-bk/net/ipv4/tcp_ipv4.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
   9 *
  10 *              IPv4 specific functions
  11 *
  12 *
  13 *              code split from:
  14 *              linux/ipv4/tcp.c
  15 *              linux/ipv4/tcp_input.c
  16 *              linux/ipv4/tcp_output.c
  17 *
  18 *              See tcp.c for author information
  19 *
  20 *      This program is free software; you can redistribute it and/or
  21 *      modify it under the terms of the GNU General Public License
  22 *      as published by the Free Software Foundation; either version
  23 *      2 of the License, or (at your option) any later version.
  24 */
  25
  26/*
  27 * Changes:
  28 *              David S. Miller :       New socket lookup architecture.
  29 *                                      This code is dedicated to John Dyson.
  30 *              David S. Miller :       Change semantics of established hash,
  31 *                                      half is devoted to TIME_WAIT sockets
  32 *                                      and the rest go in the other half.
  33 *              Andi Kleen :            Add support for syncookies and fixed
  34 *                                      some bugs: ip options weren't passed to
  35 *                                      the TCP layer, missed a check for an
  36 *                                      ACK bit.
  37 *              Andi Kleen :            Implemented fast path mtu discovery.
  38 *                                      Fixed many serious bugs in the
  39 *                                      open_request handling and moved
  40 *                                      most of it into the af independent code.
  41 *                                      Added tail drop and some other bugfixes.
   42 *                                      Added new listen semantics.
  43 *              Mike McLagan    :       Routing by source
  44 *      Juan Jose Ciarlante:            ip_dynaddr bits
  45 *              Andi Kleen:             various fixes.
  46 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  47 *                                      coma.
  48 *      Andi Kleen              :       Fix new listen.
  49 *      Andi Kleen              :       Fix accept error reporting.
  50 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  51 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  52 *                                      a single port at the same time.
  53 */
  54
  55#include <linux/config.h>
  56
  57#include <linux/types.h>
  58#include <linux/fcntl.h>
  59#include <linux/module.h>
  60#include <linux/random.h>
  61#include <linux/cache.h>
  62#include <linux/jhash.h>
  63#include <linux/init.h>
  64#include <linux/times.h>
  65
  66#include <net/icmp.h>
  67#include <net/tcp.h>
  68#include <net/ipv6.h>
  69#include <net/inet_common.h>
  70#include <net/xfrm.h>
  71
  72#include <linux/inet.h>
  73#include <linux/ipv6.h>
  74#include <linux/stddef.h>
  75#include <linux/proc_fs.h>
  76#include <linux/seq_file.h>
  77
  78extern int sysctl_ip_dynaddr;
  79int sysctl_tcp_tw_reuse;
  80int sysctl_tcp_low_latency;
  81
  82/* Check TCP sequence numbers in ICMP packets. */
  83#define ICMP_MIN_LENGTH 8
  84
  85/* Socket used for sending RSTs */
  86static struct socket *tcp_socket;
  87
  88void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
  89                       struct sk_buff *skb);
  90
  91struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
  92        .__tcp_lhash_lock       =       RW_LOCK_UNLOCKED,
  93        .__tcp_lhash_users      =       ATOMIC_INIT(0),
  94        .__tcp_lhash_wait
  95          = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
  96        .__tcp_portalloc_lock   =       SPIN_LOCK_UNLOCKED
  97};
  98
  99/*
 100 * This array holds the first and last local port number.
 101 * For high-usage systems, use sysctl to change this to
 102 * 32768-61000
 103 */
 104int sysctl_local_port_range[2] = { 1024, 4999 };
 105int tcp_port_rover = 1024 - 1;
 106
 107static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
 108                                 __u32 faddr, __u16 fport)
 109{
 110        int h = (laddr ^ lport) ^ (faddr ^ fport);
 111        h ^= h >> 16;
 112        h ^= h >> 8;
 113        return h & (tcp_ehash_size - 1);
 114}
 115
 116static __inline__ int tcp_sk_hashfn(struct sock *sk)
 117{
 118        struct inet_opt *inet = inet_sk(sk);
 119        __u32 laddr = inet->rcv_saddr;
 120        __u16 lport = inet->num;
 121        __u32 faddr = inet->daddr;
 122        __u16 fport = inet->dport;
 123
 124        return tcp_hashfn(laddr, lport, faddr, fport);
 125}
 126
 127/* Allocate and initialize a new TCP local port bind bucket.
 128 * The bindhash mutex for snum's hash chain must be held here.
 129 */
 130struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
 131                                          unsigned short snum)
 132{
 133        struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
 134                                                      SLAB_ATOMIC);
 135        if (tb) {
 136                tb->port = snum;
 137                tb->fastreuse = 0;
 138                INIT_HLIST_HEAD(&tb->owners);
 139                hlist_add_head(&tb->node, &head->chain);
 140        }
 141        return tb;
 142}
 143
 144/* Caller must hold hashbucket lock for this tb with local BH disabled */
 145void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
 146{
 147        if (hlist_empty(&tb->owners)) {
 148                __hlist_del(&tb->node);
 149                kmem_cache_free(tcp_bucket_cachep, tb);
 150        }
 151}
 152
 153/* Caller must disable local BH processing. */
 154static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
 155{
 156        struct tcp_bind_hashbucket *head =
 157                                &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
 158        struct tcp_bind_bucket *tb;
 159
 160        spin_lock(&head->lock);
 161        tb = tcp_sk(sk)->bind_hash;
 162        sk_add_bind_node(child, &tb->owners);
 163        tcp_sk(child)->bind_hash = tb;
 164        spin_unlock(&head->lock);
 165}
 166
/* BH-safe wrapper around __tcp_inherit_port(): disables local bottom
 * halves for the duration of the bind-bucket update.
 */
inline void tcp_inherit_port(struct sock *sk, struct sock *child)
{
        local_bh_disable();
        __tcp_inherit_port(sk, child);
        local_bh_enable();
}
 173
 174void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
 175                   unsigned short snum)
 176{
 177        inet_sk(sk)->num = snum;
 178        sk_add_bind_node(sk, &tb->owners);
 179        tcp_sk(sk)->bind_hash = tb;
 180}
 181
 182static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
 183{
 184        const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
 185        struct sock *sk2;
 186        struct hlist_node *node;
 187        int reuse = sk->sk_reuse;
 188
 189        sk_for_each_bound(sk2, node, &tb->owners) {
 190                if (sk != sk2 &&
 191                    !tcp_v6_ipv6only(sk2) &&
 192                    (!sk->sk_bound_dev_if ||
 193                     !sk2->sk_bound_dev_if ||
 194                     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
 195                        if (!reuse || !sk2->sk_reuse ||
 196                            sk2->sk_state == TCP_LISTEN) {
 197                                const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
 198                                if (!sk2_rcv_saddr || !sk_rcv_saddr ||
 199                                    sk2_rcv_saddr == sk_rcv_saddr)
 200                                        break;
 201                        }
 202                }
 203        }
 204        return node != NULL;
 205}
 206
 207/* Obtain a reference to a local port for the given sock,
 208 * if snum is zero it means select any available local port.
 209 */
 210static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
 211{
 212        struct tcp_bind_hashbucket *head;
 213        struct hlist_node *node;
 214        struct tcp_bind_bucket *tb;
 215        int ret;
 216
 217        local_bh_disable();
 218        if (!snum) {
 219                int low = sysctl_local_port_range[0];
 220                int high = sysctl_local_port_range[1];
 221                int remaining = (high - low) + 1;
 222                int rover;
 223
 224                spin_lock(&tcp_portalloc_lock);
 225                rover = tcp_port_rover;
 226                do {
 227                        rover++;
 228                        if (rover < low || rover > high)
 229                                rover = low;
 230                        head = &tcp_bhash[tcp_bhashfn(rover)];
 231                        spin_lock(&head->lock);
 232                        tb_for_each(tb, node, &head->chain)
 233                                if (tb->port == rover)
 234                                        goto next;
 235                        break;
 236                next:
 237                        spin_unlock(&head->lock);
 238                } while (--remaining > 0);
 239                tcp_port_rover = rover;
 240                spin_unlock(&tcp_portalloc_lock);
 241
 242                /* Exhausted local port range during search? */
 243                ret = 1;
 244                if (remaining <= 0)
 245                        goto fail;
 246
 247                /* OK, here is the one we will use.  HEAD is
 248                 * non-NULL and we hold it's mutex.
 249                 */
 250                snum = rover;
 251        } else {
 252                head = &tcp_bhash[tcp_bhashfn(snum)];
 253                spin_lock(&head->lock);
 254                tb_for_each(tb, node, &head->chain)
 255                        if (tb->port == snum)
 256                                goto tb_found;
 257        }
 258        tb = NULL;
 259        goto tb_not_found;
 260tb_found:
 261        if (!hlist_empty(&tb->owners)) {
 262                if (sk->sk_reuse > 1)
 263                        goto success;
 264                if (tb->fastreuse > 0 &&
 265                    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
 266                        goto success;
 267                } else {
 268                        ret = 1;
 269                        if (tcp_bind_conflict(sk, tb))
 270                                goto fail_unlock;
 271                }
 272        }
 273tb_not_found:
 274        ret = 1;
 275        if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
 276                goto fail_unlock;
 277        if (hlist_empty(&tb->owners)) {
 278                if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
 279                        tb->fastreuse = 1;
 280                else
 281                        tb->fastreuse = 0;
 282        } else if (tb->fastreuse &&
 283                   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
 284                tb->fastreuse = 0;
 285success:
 286        if (!tcp_sk(sk)->bind_hash)
 287                tcp_bind_hash(sk, tb, snum);
 288        BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
 289        ret = 0;
 290
 291fail_unlock:
 292        spin_unlock(&head->lock);
 293fail:
 294        local_bh_enable();
 295        return ret;
 296}
 297
 298/* Get rid of any references to a local port held by the
 299 * given sock.
 300 */
 301static void __tcp_put_port(struct sock *sk)
 302{
 303        struct inet_opt *inet = inet_sk(sk);
 304        struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
 305        struct tcp_bind_bucket *tb;
 306
 307        spin_lock(&head->lock);
 308        tb = tcp_sk(sk)->bind_hash;
 309        __sk_del_bind_node(sk);
 310        tcp_sk(sk)->bind_hash = NULL;
 311        inet->num = 0;
 312        tcp_bucket_destroy(tb);
 313        spin_unlock(&head->lock);
 314}
 315
/* BH-safe wrapper around __tcp_put_port(): releases sk's local port with
 * local bottom halves disabled.
 */
void tcp_put_port(struct sock *sk)
{
        local_bh_disable();
        __tcp_put_port(sk);
        local_bh_enable();
}
 322
 323/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 324 * Look, when several writers sleep and reader wakes them up, all but one
 325 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 326 * this, _but_ remember, it adds useless work on UP machines (wake up each
 327 * exclusive lock release). It should be ifdefed really.
 328 */
 329
 330void tcp_listen_wlock(void)
 331{
 332        write_lock(&tcp_lhash_lock);
 333
 334        if (atomic_read(&tcp_lhash_users)) {
 335                DEFINE_WAIT(wait);
 336
 337                for (;;) {
 338                        prepare_to_wait_exclusive(&tcp_lhash_wait,
 339                                                &wait, TASK_UNINTERRUPTIBLE);
 340                        if (!atomic_read(&tcp_lhash_users))
 341                                break;
 342                        write_unlock_bh(&tcp_lhash_lock);
 343                        schedule();
 344                        write_lock_bh(&tcp_lhash_lock);
 345                }
 346
 347                finish_wait(&tcp_lhash_wait, &wait);
 348        }
 349}
 350
 351static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
 352{
 353        struct hlist_head *list;
 354        rwlock_t *lock;
 355
 356        BUG_TRAP(sk_unhashed(sk));
 357        if (listen_possible && sk->sk_state == TCP_LISTEN) {
 358                list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
 359                lock = &tcp_lhash_lock;
 360                tcp_listen_wlock();
 361        } else {
 362                list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
 363                lock = &tcp_ehash[sk->sk_hashent].lock;
 364                write_lock(lock);
 365        }
 366        __sk_add_node(sk, list);
 367        sock_prot_inc_use(sk->sk_prot);
 368        write_unlock(lock);
 369        if (listen_possible && sk->sk_state == TCP_LISTEN)
 370                wake_up(&tcp_lhash_wait);
 371}
 372
 373static void tcp_v4_hash(struct sock *sk)
 374{
 375        if (sk->sk_state != TCP_CLOSE) {
 376                local_bh_disable();
 377                __tcp_v4_hash(sk, 1);
 378                local_bh_enable();
 379        }
 380}
 381
 382void tcp_unhash(struct sock *sk)
 383{
 384        rwlock_t *lock;
 385
 386        if (sk_unhashed(sk))
 387                goto ende;
 388
 389        if (sk->sk_state == TCP_LISTEN) {
 390                local_bh_disable();
 391                tcp_listen_wlock();
 392                lock = &tcp_lhash_lock;
 393        } else {
 394                struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
 395                lock = &head->lock;
 396                write_lock_bh(&head->lock);
 397        }
 398
 399        if (__sk_del_node_init(sk))
 400                sock_prot_dec_use(sk->sk_prot);
 401        write_unlock_bh(lock);
 402
 403 ende:
 404        if (sk->sk_state == TCP_LISTEN)
 405                wake_up(&tcp_lhash_wait);
 406}
 407
 408/* Don't inline this cruft.  Here are some nice properties to
 409 * exploit here.  The BSD API does not allow a listening TCP
 410 * to specify the remote port nor the remote address for the
 411 * connection.  So always assume those are both wildcarded
 412 * during the search since they can never be otherwise.
 413 */
 414static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
 415                                             unsigned short hnum, int dif)
 416{
 417        struct sock *result = NULL, *sk;
 418        struct hlist_node *node;
 419        int score, hiscore;
 420
 421        hiscore=-1;
 422        sk_for_each(sk, node, head) {
 423                struct inet_opt *inet = inet_sk(sk);
 424
 425                if (inet->num == hnum && !ipv6_only_sock(sk)) {
 426                        __u32 rcv_saddr = inet->rcv_saddr;
 427
 428                        score = (sk->sk_family == PF_INET ? 1 : 0);
 429                        if (rcv_saddr) {
 430                                if (rcv_saddr != daddr)
 431                                        continue;
 432                                score+=2;
 433                        }
 434                        if (sk->sk_bound_dev_if) {
 435                                if (sk->sk_bound_dev_if != dif)
 436                                        continue;
 437                                score+=2;
 438                        }
 439                        if (score == 5)
 440                                return sk;
 441                        if (score > hiscore) {
 442                                hiscore = score;
 443                                result = sk;
 444                        }
 445                }
 446        }
 447        return result;
 448}
 449
 450/* Optimize the common listener case. */
 451inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum,
 452                                           int dif)
 453{
 454        struct sock *sk = NULL;
 455        struct hlist_head *head;
 456
 457        read_lock(&tcp_lhash_lock);
 458        head = &tcp_listening_hash[tcp_lhashfn(hnum)];
 459        if (!hlist_empty(head)) {
 460                struct inet_opt *inet = inet_sk((sk = __sk_head(head)));
 461
 462                if (inet->num == hnum && !sk->sk_node.next &&
 463                    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
 464                    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
 465                    !sk->sk_bound_dev_if)
 466                        goto sherry_cache;
 467                sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
 468        }
 469        if (sk) {
 470sherry_cache:
 471                sock_hold(sk);
 472        }
 473        read_unlock(&tcp_lhash_lock);
 474        return sk;
 475}
 476
 477/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 478 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 479 *
 480 * Local BH must be disabled here.
 481 */
 482
 483static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
 484                                                       u32 daddr, u16 hnum,
 485                                                       int dif)
 486{
 487        struct tcp_ehash_bucket *head;
 488        TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
 489        __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
 490        struct sock *sk;
 491        struct hlist_node *node;
 492        /* Optimize here for direct hit, only listening connections can
 493         * have wildcards anyways.
 494         */
 495        int hash = tcp_hashfn(daddr, hnum, saddr, sport);
 496        head = &tcp_ehash[hash];
 497        read_lock(&head->lock);
 498        sk_for_each(sk, node, &head->chain) {
 499                if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
 500                        goto hit; /* You sunk my battleship! */
 501        }
 502
 503        /* Must check for a TIME_WAIT'er before going to listener hash. */
 504        sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
 505                if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
 506                        goto hit;
 507        }
 508        sk = NULL;
 509out:
 510        read_unlock(&head->lock);
 511        return sk;
 512hit:
 513        sock_hold(sk);
 514        goto out;
 515}
 516
/* Full socket lookup: try the established/TIME_WAIT hash first and fall
 * back to the listening hash only when that finds nothing.  Both lookups
 * return the socket with a reference held.  The established lookup
 * requires local BHs to be disabled by the caller.
 */
static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
                                           u32 daddr, u16 hnum, int dif)
{
        struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
                                                      daddr, hnum, dif);

        /* GNU "?:" shorthand: yields sk itself when it is non-NULL. */
        return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
}
 525
/* BH-safe entry point for socket lookup.  dport is converted with
 * ntohs() before the lookup, while sport is passed through unchanged.
 * Returns the socket with a reference held, or NULL.
 */
inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
                                  u16 dport, int dif)
{
        struct sock *sk;

        local_bh_disable();
        sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
        local_bh_enable();

        return sk;
}
 537
/* Pick an initial sequence number for a connection, derived from the
 * addresses and ports in skb's IP and TCP headers.  The destination
 * fields of the packet are passed first and the source fields second.
 * The sk argument is not used.
 */
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
        return secure_tcp_sequence_number(skb->nh.iph->daddr,
                                          skb->nh.iph->saddr,
                                          skb->h.th->dest,
                                          skb->h.th->source);
}
 545
/* Check that the 4-tuple sk would have with local port lport is not
 * already used by an established or TIME-WAIT socket and, if it is
 * free, hash sk into the established table straight away.
 *
 * A matching TIME-WAIT bucket may be recycled instead of causing a
 * failure; see the conditions below.  When twp is non-NULL the recycled
 * bucket (or NULL) is handed back through it with a reference held, and
 * the caller must deschedule and release it.  When twp is NULL that is
 * done here.
 *
 * Returns 0 on success, -EADDRNOTAVAIL if the tuple is in use.
 *
 * called with local bh disabled
 */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
                                      struct tcp_tw_bucket **twp)
{
        struct inet_opt *inet = inet_sk(sk);
        /* Named from the point of view of an incoming packet: daddr is
         * our local address, saddr is the peer's.
         */
        u32 daddr = inet->rcv_saddr;
        u32 saddr = inet->daddr;
        int dif = sk->sk_bound_dev_if;
        TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
        __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
        int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
        struct tcp_ehash_bucket *head = &tcp_ehash[hash];
        struct sock *sk2;
        struct hlist_node *node;
        struct tcp_tw_bucket *tw;

        write_lock(&head->lock);

        /* Check TIME-WAIT sockets first. */
        sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
                tw = (struct tcp_tw_bucket *)sk2;

                if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
                        struct tcp_opt *tp = tcp_sk(sk);

                        /* With PAWS, it is safe from the viewpoint
                           of data integrity. Even without PAWS it
                           is safe provided sequence spaces do not
                           overlap i.e. at data rates <= 80Mbit/sec.

                           Actually, the idea is close to VJ's one,
                           only timestamp cache is held not per host,
                           but per port pair and TW bucket is used
                           as state holder.

                           If TW bucket has been already destroyed we
                           fall back to VJ's scheme and use initial
                           timestamp retrieved from peer table.
                         */
                        /* Recycle only if the bucket saw a timestamp and
                         * either the caller passed no twp, or tw_reuse
                         * is enabled and the stamp is over a second old.
                         */
                        if (tw->tw_ts_recent_stamp &&
                            (!twp || (sysctl_tcp_tw_reuse &&
                                      xtime.tv_sec -
                                      tw->tw_ts_recent_stamp > 1))) {
                                /* Start past the old connection's send
                                 * sequence; zero is avoided because
                                 * tcp_v4_connect() treats a zero write_seq
                                 * as "not chosen yet".
                                 */
                                if ((tp->write_seq =
                                                tw->tw_snd_nxt + 65535 + 2) == 0)
                                        tp->write_seq = 1;
                                tp->ts_recent       = tw->tw_ts_recent;
                                tp->ts_recent_stamp = tw->tw_ts_recent_stamp;
                                /* Reference is dropped by whoever ends up
                                 * descheduling the bucket.
                                 */
                                sock_hold(sk2);
                                goto unique;
                        } else
                                goto not_unique;
                }
        }
        /* No TIME-WAIT match: tw must not keep pointing at the last
         * bucket visited by the loop.
         */
        tw = NULL;

        /* And established part... */
        sk_for_each(sk2, node, &head->chain) {
                if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
                        goto not_unique;
        }

unique:
        /* Must record num and sport now. Otherwise we will see
         * in hash table socket with a funny identity. */
        inet->num = lport;
        inet->sport = htons(lport);
        sk->sk_hashent = hash;
        BUG_TRAP(sk_unhashed(sk));
        __sk_add_node(sk, &head->chain);
        sock_prot_inc_use(sk->sk_prot);
        write_unlock(&head->lock);

        if (twp) {
                *twp = tw;
                /* NOTE(review): counted even when tw is NULL, i.e. when
                 * nothing was actually recycled -- confirm this is intended.
                 */
                NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
        } else if (tw) {
                /* Silly. Should hash-dance instead... */
                tcp_tw_deschedule(tw);
                NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

                tcp_tw_put(tw);
        }

        return 0;

not_unique:
        write_unlock(&head->lock);
        return -EADDRNOTAVAIL;
}
 636
/*
 * Bind a port for a connect operation and hash it.
 *
 * If the socket has no local port yet, search sysctl_local_port_range
 * (continuing from tcp_port_rover) for a port whose resulting 4-tuple is
 * unique, bind to it and enter the established hash.  If the socket is
 * already bound, only the uniqueness check and hashing are done.
 *
 * Returns 0 on success, -EADDRNOTAVAIL if no usable port or tuple exists.
 */
static int tcp_v4_hash_connect(struct sock *sk)
{
        unsigned short snum = inet_sk(sk)->num;
        struct tcp_bind_hashbucket *head;
        struct tcp_bind_bucket *tb;
        int ret;

        if (!snum) {
                int rover;
                int low = sysctl_local_port_range[0];
                int high = sysctl_local_port_range[1];
                int remaining = (high - low) + 1;
                struct hlist_node *node;
                struct tcp_tw_bucket *tw = NULL;

                local_bh_disable();

                /* TODO. Actually it is not so bad idea to remove
                 * tcp_portalloc_lock before next submission to Linus.
                 * As soon as we touch this place at all it is time to think.
                 *
                 * Now it protects single _advisory_ variable tcp_port_rover,
                 * hence it is mostly useless.
                 * Code will work nicely if we just delete it, but
                 * I am afraid in contended case it will work not better or
                 * even worse: another cpu just will hit the same bucket
                 * and spin there.
                 * So some cpu salt could remove both contention and
                 * memory pingpong. Any ideas how to do this in a nice way?
                 */
                spin_lock(&tcp_portalloc_lock);
                rover = tcp_port_rover;

                do {
                        rover++;
                        if ((rover < low) || (rover > high))
                                rover = low;
                        head = &tcp_bhash[tcp_bhashfn(rover)];
                        spin_lock(&head->lock);

                        /* Does not bother with rcv_saddr checks,
                         * because the established check is already
                         * unique enough.
                         */
                        tb_for_each(tb, node, &head->chain) {
                                if (tb->port == rover) {
                                        BUG_TRAP(!hlist_empty(&tb->owners));
                                        /* fastreuse is set to -1 only for
                                         * buckets created below by this
                                         * function; tcp_v4_get_port() sets
                                         * it to 0 or 1.  Ports with such a
                                         * bucket are skipped.
                                         */
                                        if (tb->fastreuse >= 0)
                                                goto next_port;
                                        /* Shared with other connect()ed
                                         * sockets: usable if our 4-tuple
                                         * is unique.  On success sk is
                                         * already in the established hash.
                                         */
                                        if (!__tcp_v4_check_established(sk,
                                                                        rover,
                                                                        &tw))
                                                goto ok;
                                        goto next_port;
                                }
                        }

                        /* No bucket for this port yet: create one marked
                         * as connect-only.  Allocation failure ends the
                         * search and falls into the error return below.
                         */
                        tb = tcp_bucket_create(head, rover);
                        if (!tb) {
                                spin_unlock(&head->lock);
                                break;
                        }
                        tb->fastreuse = -1;
                        goto ok;

                next_port:
                        spin_unlock(&head->lock);
                } while (--remaining > 0);
                tcp_port_rover = rover;
                spin_unlock(&tcp_portalloc_lock);

                local_bh_enable();

                return -EADDRNOTAVAIL;

ok:
                /* All locks still held and bhs disabled */
                tcp_port_rover = rover;
                spin_unlock(&tcp_portalloc_lock);

                tcp_bind_hash(sk, tb, rover);
                /* Still unhashed only when a fresh bucket was created;
                 * the check_established path has hashed sk already.
                 */
                if (sk_unhashed(sk)) {
                        inet_sk(sk)->sport = htons(rover);
                        __tcp_v4_hash(sk, 0);
                }
                spin_unlock(&head->lock);

                /* Retire the TIME-WAIT bucket we were handed, if any, and
                 * drop the reference taken on it.
                 */
                if (tw) {
                        tcp_tw_deschedule(tw);
                        tcp_tw_put(tw);
                }

                ret = 0;
                goto out;
        }

        /* Socket is already bound to snum. */
        head  = &tcp_bhash[tcp_bhashfn(snum)];
        tb  = tcp_sk(sk)->bind_hash;
        spin_lock_bh(&head->lock);
        /* Sole owner of the bind bucket: nobody else can hold the same
         * tuple, so hash directly.
         */
        if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
                __tcp_v4_hash(sk, 0);
                spin_unlock_bh(&head->lock);
                return 0;
        } else {
                /* Plain unlock: BHs stay disabled until "out" below. */
                spin_unlock(&head->lock);
                /* No definite answer... Walk to established hash table */
                ret = __tcp_v4_check_established(sk, snum, NULL);
out:
                local_bh_enable();
                return ret;
        }
}
 752
 753/* This will initiate an outgoing connection. */
 754int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 755{
 756        struct inet_opt *inet = inet_sk(sk);
 757        struct tcp_opt *tp = tcp_sk(sk);
 758        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 759        struct rtable *rt;
 760        u32 daddr, nexthop;
 761        int tmp;
 762        int err;
 763
 764        if (addr_len < sizeof(struct sockaddr_in))
 765                return -EINVAL;
 766
 767        if (usin->sin_family != AF_INET)
 768                return -EAFNOSUPPORT;
 769
 770        nexthop = daddr = usin->sin_addr.s_addr;
 771        if (inet->opt && inet->opt->srr) {
 772                if (!daddr)
 773                        return -EINVAL;
 774                nexthop = inet->opt->faddr;
 775        }
 776
 777        tmp = ip_route_connect(&rt, nexthop, inet->saddr,
 778                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 779                               IPPROTO_TCP,
 780                               inet->sport, usin->sin_port, sk);
 781        if (tmp < 0)
 782                return tmp;
 783
 784        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 785                ip_rt_put(rt);
 786                return -ENETUNREACH;
 787        }
 788
 789        if (!inet->opt || !inet->opt->srr)
 790                daddr = rt->rt_dst;
 791
 792        if (!inet->saddr)
 793                inet->saddr = rt->rt_src;
 794        inet->rcv_saddr = inet->saddr;
 795
 796        if (tp->ts_recent_stamp && inet->daddr != daddr) {
 797                /* Reset inherited state */
 798                tp->ts_recent       = 0;
 799                tp->ts_recent_stamp = 0;
 800                tp->write_seq       = 0;
 801        }
 802
 803        if (sysctl_tcp_tw_recycle &&
 804            !tp->ts_recent_stamp && rt->rt_dst == daddr) {
 805                struct inet_peer *peer = rt_get_peer(rt);
 806
 807                /* VJ's idea. We save last timestamp seen from
 808                 * the destination in peer table, when entering state TIME-WAIT
 809                 * and initialize ts_recent from it, when trying new connection.
 810                 */
 811
 812                if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
 813                        tp->ts_recent_stamp = peer->tcp_ts_stamp;
 814                        tp->ts_recent = peer->tcp_ts;
 815                }
 816        }
 817
 818        inet->dport = usin->sin_port;
 819        inet->daddr = daddr;
 820
 821        tp->ext_header_len = 0;
 822        if (inet->opt)
 823                tp->ext_header_len = inet->opt->optlen;
 824
 825        tp->mss_clamp = 536;
 826
 827        /* Socket identity is still unknown (sport may be zero).
 828         * However we set state to SYN-SENT and not releasing socket
 829         * lock select source port, enter ourselves into the hash tables and
 830         * complete initialization after this.
 831         */
 832        tcp_set_state(sk, TCP_SYN_SENT);
 833        err = tcp_v4_hash_connect(sk);
 834        if (err)
 835                goto failure;
 836
 837        err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
 838        if (err)
 839                goto failure;
 840
 841        /* OK, now commit destination to socket.  */
 842        __sk_dst_set(sk, &rt->u.dst);
 843        tcp_v4_setup_caps(sk, &rt->u.dst);
 844        tp->ext2_header_len = rt->u.dst.header_len;
 845
 846        if (!tp->write_seq)
 847                tp->write_seq = secure_tcp_sequence_number(inet->saddr,
 848                                                           inet->daddr,
 849                                                           inet->sport,
 850                                                           usin->sin_port);
 851
 852        inet->id = tp->write_seq ^ jiffies;
 853
 854        err = tcp_connect(sk);
 855        rt = NULL;
 856        if (err)
 857                goto failure;
 858
 859        return 0;
 860
 861failure:
 862        /* This unhashes the socket and releases the local port, if necessary. */
 863        tcp_set_state(sk, TCP_CLOSE);
 864        ip_rt_put(rt);
 865        sk->sk_route_caps = 0;
 866        inet->dport = 0;
 867        return err;
 868}
 869
 870static __inline__ int tcp_v4_iif(struct sk_buff *skb)
 871{
 872        return ((struct rtable *)skb->dst)->rt_iif;
 873}
 874
 875static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
 876{
 877        return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
 878}
 879
 880static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
 881                                              struct open_request ***prevp,
 882                                              __u16 rport,
 883                                              __u32 raddr, __u32 laddr)
 884{
 885        struct tcp_listen_opt *lopt = tp->listen_opt;
 886        struct open_request *req, **prev;
 887
 888        for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
 889             (req = *prev) != NULL;
 890             prev = &req->dl_next) {
 891                if (req->rmt_port == rport &&
 892                    req->af.v4_req.rmt_addr == raddr &&
 893                    req->af.v4_req.loc_addr == laddr &&
 894                    TCP_INET_FAMILY(req->class->family)) {
 895                        BUG_TRAP(!req->sk);
 896                        *prevp = prev;
 897                        break;
 898                }
 899        }
 900
 901        return req;
 902}
 903
 904static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
 905{
 906        struct tcp_opt *tp = tcp_sk(sk);
 907        struct tcp_listen_opt *lopt = tp->listen_opt;
 908        u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
 909
 910        req->expires = jiffies + TCP_TIMEOUT_INIT;
 911        req->retrans = 0;
 912        req->sk = NULL;
 913        req->dl_next = lopt->syn_table[h];
 914
 915        write_lock(&tp->syn_wait_lock);
 916        lopt->syn_table[h] = req;
 917        write_unlock(&tp->syn_wait_lock);
 918
 919        tcp_synq_added(sk);
 920}
 921
 922
 923/*
 924 * This routine does path mtu discovery as defined in RFC1191.
 925 */
 926static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
 927                                     u32 mtu)
 928{
 929        struct dst_entry *dst;
 930        struct inet_opt *inet = inet_sk(sk);
 931        struct tcp_opt *tp = tcp_sk(sk);
 932
 933        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 934         * send out by Linux are always <576bytes so they should go through
 935         * unfragmented).
 936         */
 937        if (sk->sk_state == TCP_LISTEN)
 938                return;
 939
 940        /* We don't check in the destentry if pmtu discovery is forbidden
 941         * on this route. We just assume that no packet_to_big packets
 942         * are send back when pmtu discovery is not active.
 943         * There is a small race when the user changes this flag in the
 944         * route, but I think that's acceptable.
 945         */
 946        if ((dst = __sk_dst_check(sk, 0)) == NULL)
 947                return;
 948
 949        dst->ops->update_pmtu(dst, mtu);
 950
 951        /* Something is about to be wrong... Remember soft error
 952         * for the case, if this connection will not able to recover.
 953         */
 954        if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst))
 955                sk->sk_err_soft = EMSGSIZE;
 956
 957        mtu = dst_pmtu(dst);
 958
 959        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 960            tp->pmtu_cookie > mtu) {
 961                tcp_sync_mss(sk, mtu);
 962
 963                /* Resend the TCP packet because it's
 964                 * clear that the old packet has been
 965                 * dropped. This is the new "fast" path mtu
 966                 * discovery.
 967                 */
 968                tcp_simple_retransmit(sk);
 969        } /* else let the usual retransmit timer handle it */
 970}
 971
 972/*
 973 * This routine is called by the ICMP module when it gets some
 974 * sort of error condition.  If err < 0 then the socket should
 975 * be closed and the error returned to the user.  If err > 0
 976 * it's just the icmp type << 8 | icmp code.  After adjustment
 977 * header points to the first 8 bytes of the tcp header.  We need
 978 * to find the appropriate port.
 979 *
 980 * The locking strategy used here is very "optimistic". When
 981 * someone else accesses the socket the ICMP is just dropped
 982 * and for some paths there is no check at all.
 983 * A more general error queue to queue errors for later handling
 984 * is probably better.
 985 *
 986 */
 987
 988void tcp_v4_err(struct sk_buff *skb, u32 info)
 989{
 990        struct iphdr *iph = (struct iphdr *)skb->data;
 991        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 992        struct tcp_opt *tp;
 993        struct inet_opt *inet;
 994        int type = skb->h.icmph->type;
 995        int code = skb->h.icmph->code;
 996        struct sock *sk;
 997        __u32 seq;
 998        int err;
 999
1000        if (skb->len < (iph->ihl << 2) + 8) {
1001                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1002                return;
1003        }
1004
1005        sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
1006                           th->source, tcp_v4_iif(skb));
1007        if (!sk) {
1008                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1009                return;
1010        }
1011        if (sk->sk_state == TCP_TIME_WAIT) {
1012                tcp_tw_put((struct tcp_tw_bucket *)sk);
1013                return;
1014        }
1015
1016        bh_lock_sock(sk);
1017        /* If too many ICMPs get dropped on busy
1018         * servers this needs to be solved differently.
1019         */
1020        if (sock_owned_by_user(sk))
1021                NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1022
1023        if (sk->sk_state == TCP_CLOSE)
1024                goto out;
1025
1026        tp = tcp_sk(sk);
1027        seq = ntohl(th->seq);
1028        if (sk->sk_state != TCP_LISTEN &&
1029            !between(seq, tp->snd_una, tp->snd_nxt)) {
1030                NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1031                goto out;
1032        }
1033
1034        switch (type) {
1035        case ICMP_SOURCE_QUENCH:
1036                /* Just silently ignore these. */
1037                goto out;
1038        case ICMP_PARAMETERPROB:
1039                err = EPROTO;
1040                break;
1041        case ICMP_DEST_UNREACH:
1042                if (code > NR_ICMP_UNREACH)
1043                        goto out;
1044
1045                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1046                        if (!sock_owned_by_user(sk))
1047                                do_pmtu_discovery(sk, iph, info);
1048                        goto out;
1049                }
1050
1051                err = icmp_err_convert[code].errno;
1052                break;
1053        case ICMP_TIME_EXCEEDED:
1054                err = EHOSTUNREACH;
1055                break;
1056        default:
1057                goto out;
1058        }
1059
1060        switch (sk->sk_state) {
1061                struct open_request *req, **prev;
1062        case TCP_LISTEN:
1063                if (sock_owned_by_user(sk))
1064                        goto out;
1065
1066                req = tcp_v4_search_req(tp, &prev, th->dest,
1067                                        iph->daddr, iph->saddr);
1068                if (!req)
1069                        goto out;
1070
1071                /* ICMPs are not backlogged, hence we cannot get
1072                   an established socket here.
1073                 */
1074                BUG_TRAP(!req->sk);
1075
1076                if (seq != req->snt_isn) {
1077                        NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1078                        goto out;
1079                }
1080
1081                /*
1082                 * Still in SYN_RECV, just remove it silently.
1083                 * There is no good way to pass the error to the newly
1084                 * created socket, and POSIX does not want network
1085                 * errors returned from accept().
1086                 */
1087                tcp_synq_drop(sk, req, prev);
1088                goto out;
1089
1090        case TCP_SYN_SENT:
1091        case TCP_SYN_RECV:  /* Cannot happen.
1092                               It can f.e. if SYNs crossed.
1093                             */
1094                if (!sock_owned_by_user(sk)) {
1095                        TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1096                        sk->sk_err = err;
1097
1098                        sk->sk_error_report(sk);
1099
1100                        tcp_done(sk);
1101                } else {
1102                        sk->sk_err_soft = err;
1103                }
1104                goto out;
1105        }
1106
1107        /* If we've already connected we will keep trying
1108         * until we time out, or the user gives up.
1109         *
1110         * rfc1122 4.2.3.9 allows to consider as hard errors
1111         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1112         * but it is obsoleted by pmtu discovery).
1113         *
1114         * Note, that in modern internet, where routing is unreliable
1115         * and in each dark corner broken firewalls sit, sending random
1116         * errors ordered by their masters even this two messages finally lose
1117         * their original sense (even Linux sends invalid PORT_UNREACHs)
1118         *
1119         * Now we are in compliance with RFCs.
1120         *                                                      --ANK (980905)
1121         */
1122
1123        inet = inet_sk(sk);
1124        if (!sock_owned_by_user(sk) && inet->recverr) {
1125                sk->sk_err = err;
1126                sk->sk_error_report(sk);
1127        } else  { /* Only an error on timeout */
1128                sk->sk_err_soft = err;
1129        }
1130
1131out:
1132        bh_unlock_sock(sk);
1133        sock_put(sk);
1134}
1135
1136/* This routine computes an IPv4 TCP checksum. */
1137void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1138                       struct sk_buff *skb)
1139{
1140        struct inet_opt *inet = inet_sk(sk);
1141
1142        if (skb->ip_summed == CHECKSUM_HW) {
1143                th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1144                skb->csum = offsetof(struct tcphdr, check);
1145        } else {
1146                th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1147                                         csum_partial((char *)th,
1148                                                      th->doff << 2,
1149                                                      skb->csum));
1150        }
1151}
1152
1153/*
1154 *      This routine will send an RST to the other tcp.
1155 *
1156 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1157 *                    for reset.
1158 *      Answer: if a packet caused RST, it is not for a socket
1159 *              existing in our system, if it is matched to a socket,
1160 *              it is just duplicate segment or bug in other side's TCP.
1161 *              So that we build reply only basing on parameters
1162 *              arrived with segment.
1163 *      Exception: precedence violation. We do not implement it in any case.
1164 */
1165
1166static void tcp_v4_send_reset(struct sk_buff *skb)
1167{
1168        struct tcphdr *th = skb->h.th;
1169        struct tcphdr rth;
1170        struct ip_reply_arg arg;
1171
1172        /* Never send a reset in response to a reset. */
1173        if (th->rst)
1174                return;
1175
1176        if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1177                return;
1178
1179        /* Swap the send and the receive. */
1180        memset(&rth, 0, sizeof(struct tcphdr));
1181        rth.dest   = th->source;
1182        rth.source = th->dest;
1183        rth.doff   = sizeof(struct tcphdr) / 4;
1184        rth.rst    = 1;
1185
1186        if (th->ack) {
1187                rth.seq = th->ack_seq;
1188        } else {
1189                rth.ack = 1;
1190                rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1191                                    skb->len - (th->doff << 2));
1192        }
1193
1194        memset(&arg, 0, sizeof arg);
1195        arg.iov[0].iov_base = (unsigned char *)&rth;
1196        arg.iov[0].iov_len  = sizeof rth;
1197        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1198                                      skb->nh.iph->saddr, /*XXX*/
1199                                      sizeof(struct tcphdr), IPPROTO_TCP, 0);
1200        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1201
1202        ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1203
1204        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1205        TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1206}
1207
1208/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
1209   outside socket context is ugly, certainly. What can I do?
1210 */
1211
1212static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1213                            u32 win, u32 ts)
1214{
1215        struct tcphdr *th = skb->h.th;
1216        struct {
1217                struct tcphdr th;
1218                u32 tsopt[3];
1219        } rep;
1220        struct ip_reply_arg arg;
1221
1222        memset(&rep.th, 0, sizeof(struct tcphdr));
1223        memset(&arg, 0, sizeof arg);
1224
1225        arg.iov[0].iov_base = (unsigned char *)&rep;
1226        arg.iov[0].iov_len  = sizeof(rep.th);
1227        if (ts) {
1228                rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1229                                     (TCPOPT_TIMESTAMP << 8) |
1230                                     TCPOLEN_TIMESTAMP);
1231                rep.tsopt[1] = htonl(tcp_time_stamp);
1232                rep.tsopt[2] = htonl(ts);
1233                arg.iov[0].iov_len = sizeof(rep);
1234        }
1235
1236        /* Swap the send and the receive. */
1237        rep.th.dest    = th->source;
1238        rep.th.source  = th->dest;
1239        rep.th.doff    = arg.iov[0].iov_len / 4;
1240        rep.th.seq     = htonl(seq);
1241        rep.th.ack_seq = htonl(ack);
1242        rep.th.ack     = 1;
1243        rep.th.window  = htons(win);
1244
1245        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1246                                      skb->nh.iph->saddr, /*XXX*/
1247                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1248        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1249
1250        ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1251
1252        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1253}
1254
1255static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1256{
1257        struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1258
1259        tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1260                        tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1261
1262        tcp_tw_put(tw);
1263}
1264
1265static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1266{
1267        tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
1268                        req->ts_recent);
1269}
1270
1271static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1272                                          struct open_request *req)
1273{
1274        struct rtable *rt;
1275        struct ip_options *opt = req->af.v4_req.opt;
1276        struct flowi fl = { .oif = sk->sk_bound_dev_if,
1277                            .nl_u = { .ip4_u =
1278                                      { .daddr = ((opt && opt->srr) ?
1279                                                  opt->faddr :
1280                                                  req->af.v4_req.rmt_addr),
1281                                        .saddr = req->af.v4_req.loc_addr,
1282                                        .tos = RT_CONN_FLAGS(sk) } },
1283                            .proto = IPPROTO_TCP,
1284                            .uli_u = { .ports =
1285                                       { .sport = inet_sk(sk)->sport,
1286                                         .dport = req->rmt_port } } };
1287
1288        if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1289                IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1290                return NULL;
1291        }
1292        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1293                ip_rt_put(rt);
1294                IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1295                return NULL;
1296        }
1297        return &rt->u.dst;
1298}
1299
1300/*
1301 *      Send a SYN-ACK after having received a SYN.
1302 *      This still operates on an open_request only, not on a big
1303 *      socket.
1304 */
1305static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1306                              struct dst_entry *dst)
1307{
1308        int err = -1;
1309        struct sk_buff * skb;
1310
1311        /* First, grab a route. */
1312        if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1313                goto out;
1314
1315        skb = tcp_make_synack(sk, dst, req);
1316
1317        if (skb) {
1318                struct tcphdr *th = skb->h.th;
1319
1320                th->check = tcp_v4_check(th, skb->len,
1321                                         req->af.v4_req.loc_addr,
1322                                         req->af.v4_req.rmt_addr,
1323                                         csum_partial((char *)th, skb->len,
1324                                                      skb->csum));
1325
1326                err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1327                                            req->af.v4_req.rmt_addr,
1328                                            req->af.v4_req.opt);
1329                if (err == NET_XMIT_CN)
1330                        err = 0;
1331        }
1332
1333out:
1334        dst_release(dst);
1335        return err;
1336}
1337
1338/*
1339 *      IPv4 open_request destructor.
1340 */
1341static void tcp_v4_or_free(struct open_request *req)
1342{
1343        if (req->af.v4_req.opt)
1344                kfree(req->af.v4_req.opt);
1345}
1346
1347static inline void syn_flood_warning(struct sk_buff *skb)
1348{
1349        static unsigned long warntime;
1350
1351        if (time_after(jiffies, (warntime + HZ * 60))) {
1352                warntime = jiffies;
1353                printk(KERN_INFO
1354                       "possible SYN flooding on port %d. Sending cookies.\n",
1355                       ntohs(skb->h.th->dest));
1356        }
1357}
1358
1359/*
1360 * Save and compile IPv4 options into the open_request if needed.
1361 */
1362static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1363                                                     struct sk_buff *skb)
1364{
1365        struct ip_options *opt = &(IPCB(skb)->opt);
1366        struct ip_options *dopt = NULL;
1367
1368        if (opt && opt->optlen) {
1369                int opt_size = optlength(opt);
1370                dopt = kmalloc(opt_size, GFP_ATOMIC);
1371                if (dopt) {
1372                        if (ip_options_echo(dopt, skb)) {
1373                                kfree(dopt);
1374                                dopt = NULL;
1375                        }
1376                }
1377        }
1378        return dopt;
1379}
1380
1381/*
1382 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1383 * One SYN_RECV socket costs about 80bytes on a 32bit machine.
1384 * It would be better to replace it with a global counter for all sockets
1385 * but then some measure against one socket starving all other sockets
1386 * would be needed.
1387 *
1388 * It was 128 by default. Experiments with real servers show, that
1389 * it is absolutely not enough even at 100conn/sec. 256 cures most
1390 * of problems. This value is adjusted to 128 for very small machines
1391 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1392 * Further increasing requires to change hash table size.
1393 */
1394int sysctl_max_syn_backlog = 256;
1395
1396struct or_calltable or_ipv4 = {
1397        .family         =       PF_INET,
1398        .rtx_syn_ack    =       tcp_v4_send_synack,
1399        .send_ack       =       tcp_v4_or_send_ack,
1400        .destructor     =       tcp_v4_or_free,
1401        .send_reset     =       tcp_v4_send_reset,
1402};
1403
1404int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1405{
1406        struct tcp_opt tp;
1407        struct open_request *req;
1408        __u32 saddr = skb->nh.iph->saddr;
1409        __u32 daddr = skb->nh.iph->daddr;
1410        __u32 isn = TCP_SKB_CB(skb)->when;
1411        struct dst_entry *dst = NULL;
1412#ifdef CONFIG_SYN_COOKIES
1413        int want_cookie = 0;
1414#else
1415#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1416#endif
1417
1418        /* Never answer to SYNs send to broadcast or multicast */
1419        if (((struct rtable *)skb->dst)->rt_flags &
1420            (RTCF_BROADCAST | RTCF_MULTICAST))
1421                goto drop;
1422
1423        /* TW buckets are converted to open requests without
1424         * limitations, they conserve resources and peer is
1425         * evidently real one.
1426         */
1427        if (tcp_synq_is_full(sk) && !isn) {
1428#ifdef CONFIG_SYN_COOKIES
1429                if (sysctl_tcp_syncookies) {
1430                        want_cookie = 1;
1431                } else
1432#endif
1433                goto drop;
1434        }
1435
1436        /* Accept backlog is full. If we have already queued enough
1437         * of warm entries in syn queue, drop request. It is better than
1438         * clogging syn queue with openreqs with exponentially increasing
1439         * timeout.
1440         */
1441        if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1442                goto drop;
1443
1444        req = tcp_openreq_alloc();
1445        if (!req)
1446                goto drop;
1447
1448        tcp_clear_options(&tp);
1449        tp.mss_clamp = 536;
1450        tp.user_mss  = tcp_sk(sk)->user_mss;
1451
1452        tcp_parse_options(skb, &tp, 0);
1453
1454        if (want_cookie) {
1455                tcp_clear_options(&tp);
1456                tp.saw_tstamp = 0;
1457        }
1458
1459        if (tp.saw_tstamp && !tp.rcv_tsval) {
1460                /* Some OSes (unknown ones, but I see them on web server, which
1461                 * contains information interesting only for windows'
1462                 * users) do not send their stamp in SYN. It is easy case.
1463                 * We simply do not advertise TS support.
1464                 */
1465                tp.saw_tstamp = 0;
1466                tp.tstamp_ok  = 0;
1467        }
1468        tp.tstamp_ok = tp.saw_tstamp;
1469
1470        tcp_openreq_init(req, &tp, skb);
1471
1472        req->af.v4_req.loc_addr = daddr;
1473        req->af.v4_req.rmt_addr = saddr;
1474        req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1475        req->class = &or_ipv4;
1476        if (!want_cookie)
1477                TCP_ECN_create_request(req, skb->h.th);
1478
1479        if (want_cookie) {
1480#ifdef CONFIG_SYN_COOKIES
1481                syn_flood_warning(skb);
1482#endif
1483                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1484        } else if (!isn) {
1485                struct inet_peer *peer = NULL;
1486
1487                /* VJ's idea. We save last timestamp seen
1488                 * from the destination in peer table, when entering
1489                 * state TIME-WAIT, and check against it before
1490                 * accepting new connection request.
1491                 *
1492                 * If "isn" is not zero, this request hit alive
1493                 * timewait bucket, so that all the necessary checks
1494                 * are made in the function processing timewait state.
1495                 */
1496                if (tp.saw_tstamp &&
1497                    sysctl_tcp_tw_recycle &&
1498                    (dst = tcp_v4_route_req(sk, req)) != NULL &&
1499                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1500                    peer->v4daddr == saddr) {
1501                        if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1502                            (s32)(peer->tcp_ts - req->ts_recent) >
1503                                                        TCP_PAWS_WINDOW) {
1504                                NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1505                                dst_release(dst);
1506                                goto drop_and_free;
1507                        }
1508                }
1509                /* Kill the following clause, if you dislike this way. */
1510                else if (!sysctl_tcp_syncookies &&
1511                         (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1512                          (sysctl_max_syn_backlog >> 2)) &&
1513                         (!peer || !peer->tcp_ts_stamp) &&
1514                         (!dst || !dst_metric(dst, RTAX_RTT))) {
1515                        /* Without syncookies last quarter of
1516                         * backlog is filled with destinations,
1517                         * proven to be alive.
1518                         * It means that we continue to communicate
1519                         * to destinations, already remembered
1520                         * to the moment of synflood.
1521                         */
1522                        NETDEBUG(if (net_ratelimit()) \
1523                                        printk(KERN_DEBUG "TCP: drop open "
1524                                                          "request from %u.%u."
1525                                                          "%u.%u/%u\n", \
1526                                               NIPQUAD(saddr),
1527                                               ntohs(skb->h.th->source)));
1528                        dst_release(dst);
1529                        goto drop_and_free;
1530                }
1531
1532                isn = tcp_v4_init_sequence(sk, skb);
1533        }
1534        req->snt_isn = isn;
1535
1536        if (tcp_v4_send_synack(sk, req, dst))
1537                goto drop_and_free;
1538
1539        if (want_cookie) {
1540                tcp_openreq_free(req);
1541        } else {
1542                tcp_v4_synq_add(sk, req);
1543        }
1544        return 0;
1545
1546drop_and_free:
1547        tcp_openreq_free(req);
1548drop:
1549        TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1550        return 0;
1551}
1552
1553
1554/*
1555 * The three way handshake has completed - we got a valid ACK for our
1556 * SYN-ACK - now create the new socket.
1557 */
1558struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1559                                  struct open_request *req,
1560                                  struct dst_entry *dst)
1561{
1562        struct inet_opt *newinet;
1563        struct tcp_opt *newtp;
1564        struct sock *newsk;
1565
1566        if (sk_acceptq_is_full(sk))
1567                goto exit_overflow;
1568
1569        if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1570                goto exit;
1571
1572        newsk = tcp_create_openreq_child(sk, req, skb);
1573        if (!newsk)
1574                goto exit;
1575
1576        newsk->sk_dst_cache = dst;
1577        tcp_v4_setup_caps(newsk, dst);
1578
1579        newtp                 = tcp_sk(newsk);
1580        newinet               = inet_sk(newsk);
1581        newinet->daddr        = req->af.v4_req.rmt_addr;
1582        newinet->rcv_saddr    = req->af.v4_req.loc_addr;
1583        newinet->saddr        = req->af.v4_req.loc_addr;
1584        newinet->opt          = req->af.v4_req.opt;
1585        req->af.v4_req.opt    = NULL;
1586        newinet->mc_index     = tcp_v4_iif(skb);
1587        newinet->mc_ttl       = skb->nh.iph->ttl;
1588        newtp->ext_header_len = 0;
1589        if (newinet->opt)
1590                newtp->ext_header_len = newinet->opt->optlen;
1591        newtp->ext2_header_len = dst->header_len;
1592        newinet->id = newtp->write_seq ^ jiffies;
1593
1594        tcp_sync_mss(newsk, dst_pmtu(dst));
1595        newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1596        tcp_initialize_rcv_mss(newsk);
1597
1598        __tcp_v4_hash(newsk, 0);
1599        __tcp_inherit_port(sk, newsk);
1600
1601        return newsk;
1602
1603exit_overflow:
1604        NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1605exit:
1606        NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1607        dst_release(dst);
1608        return NULL;
1609}
1610
1611static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1612{
1613        struct tcphdr *th = skb->h.th;
1614        struct iphdr *iph = skb->nh.iph;
1615        struct tcp_opt *tp = tcp_sk(sk);
1616        struct sock *nsk;
1617        struct open_request **prev;
1618        /* Find possible connection requests. */
1619        struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
1620                                                     iph->saddr, iph->daddr);
1621        if (req)
1622                return tcp_check_req(sk, skb, req, prev);
1623
1624        nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1625                                          th->source,
1626                                          skb->nh.iph->daddr,
1627                                          ntohs(th->dest),
1628                                          tcp_v4_iif(skb));
1629
1630        if (nsk) {
1631                if (nsk->sk_state != TCP_TIME_WAIT) {
1632                        bh_lock_sock(nsk);
1633                        return nsk;
1634                }
1635                tcp_tw_put((struct tcp_tw_bucket *)nsk);
1636                return NULL;
1637        }
1638
1639#ifdef CONFIG_SYN_COOKIES
1640        if (!th->rst && !th->syn && th->ack)
1641                sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1642#endif
1643        return sk;
1644}
1645
1646static int tcp_v4_checksum_init(struct sk_buff *skb)
1647{
1648        if (skb->ip_summed == CHECKSUM_HW) {
1649                skb->ip_summed = CHECKSUM_UNNECESSARY;
1650                if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1651                                  skb->nh.iph->daddr, skb->csum))
1652                        return 0;
1653
1654                NETDEBUG(if (net_ratelimit())
1655                                printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1656                skb->ip_summed = CHECKSUM_NONE;
1657        }
1658        if (skb->len <= 76) {
1659                if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1660                                 skb->nh.iph->daddr,
1661                                 skb_checksum(skb, 0, skb->len, 0)))
1662                        return -1;
1663                skb->ip_summed = CHECKSUM_UNNECESSARY;
1664        } else {
1665                skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1666                                          skb->nh.iph->saddr,
1667                                          skb->nh.iph->daddr, 0);
1668        }
1669        return 0;
1670}
1671
1672
1673/* The socket must have it's spinlock held when we get
1674 * here.
1675 *
1676 * We have a potential double-lock case here, so even when
1677 * doing backlog processing we use the BH locking scheme.
1678 * This is because we cannot sleep with the original spinlock
1679 * held.
1680 */
1681int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1682{
1683        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1684                TCP_CHECK_TIMER(sk);
1685                if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1686                        goto reset;
1687                TCP_CHECK_TIMER(sk);
1688                return 0;
1689        }
1690
1691        if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1692                goto csum_err;
1693
1694        if (sk->sk_state == TCP_LISTEN) {
1695                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1696                if (!nsk)
1697                        goto discard;
1698
1699                if (nsk != sk) {
1700                        if (tcp_child_process(sk, nsk, skb))
1701                                goto reset;
1702                        return 0;
1703                }
1704        }
1705
1706        TCP_CHECK_TIMER(sk);
1707        if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1708                goto reset;
1709        TCP_CHECK_TIMER(sk);
1710        return 0;
1711
1712reset:
1713        tcp_v4_send_reset(skb);
1714discard:
1715        kfree_skb(skb);
1716        /* Be careful here. If this function gets more complicated and
1717         * gcc suffers from register pressure on the x86, sk (in %ebx)
1718         * might be destroyed here. This current version compiles correctly,
1719         * but you have been warned.
1720         */
1721        return 0;
1722
1723csum_err:
1724        TCP_INC_STATS_BH(TCP_MIB_INERRS);
1725        goto discard;
1726}
1727
1728/*
1729 *      From tcp_input.c
1730 */
1731
1732int tcp_v4_rcv(struct sk_buff *skb)
1733{
1734        struct tcphdr *th;
1735        struct sock *sk;
1736        int ret;
1737
1738        if (skb->pkt_type != PACKET_HOST)
1739                goto discard_it;
1740
1741        /* Count it even if it's bad */
1742        TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1743
1744        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1745                goto discard_it;
1746
1747        th = skb->h.th;
1748
1749        if (th->doff < sizeof(struct tcphdr) / 4)
1750                goto bad_packet;
1751        if (!pskb_may_pull(skb, th->doff * 4))
1752                goto discard_it;
1753
1754        /* An explanation is required here, I think.
1755         * Packet length and doff are validated by header prediction,
1756         * provided case of th->doff==0 is elimineted.
1757         * So, we defer the checks. */
1758        if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1759             tcp_v4_checksum_init(skb) < 0))
1760                goto bad_packet;
1761
1762        th = skb->h.th;
1763        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1764        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1765                                    skb->len - th->doff * 4);
1766        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1767        TCP_SKB_CB(skb)->when    = 0;
1768        TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1769        TCP_SKB_CB(skb)->sacked  = 0;
1770
1771        sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1772                             skb->nh.iph->daddr, ntohs(th->dest),
1773                             tcp_v4_iif(skb));
1774
1775        if (!sk)
1776                goto no_tcp_socket;
1777
1778process:
1779        if (sk->sk_state == TCP_TIME_WAIT)
1780                goto do_time_wait;
1781
1782        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1783                goto discard_and_relse;
1784
1785        if (sk_filter(sk, skb, 0))
1786                goto discard_and_relse;
1787
1788        skb->dev = NULL;
1789
1790        bh_lock_sock(sk);
1791        ret = 0;
1792        if (!sock_owned_by_user(sk)) {
1793                if (!tcp_prequeue(sk, skb))
1794                        ret = tcp_v4_do_rcv(sk, skb);
1795        } else
1796                sk_add_backlog(sk, skb);
1797        bh_unlock_sock(sk);
1798
1799        sock_put(sk);
1800
1801        return ret;
1802
1803no_tcp_socket:
1804        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1805                goto discard_it;
1806
1807        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1808bad_packet:
1809                TCP_INC_STATS_BH(TCP_MIB_INERRS);
1810        } else {
1811                tcp_v4_send_reset(skb);
1812        }
1813
1814discard_it:
1815        /* Discard frame. */
1816        kfree_skb(skb);
1817        return 0;
1818
1819discard_and_relse:
1820        sock_put(sk);
1821        goto discard_it;
1822
1823do_time_wait:
1824        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1825                tcp_tw_put((struct tcp_tw_bucket *) sk);
1826                goto discard_it;
1827        }
1828
1829        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1830                TCP_INC_STATS_BH(TCP_MIB_INERRS);
1831                tcp_tw_put((struct tcp_tw_bucket *) sk);
1832                goto discard_it;
1833        }
1834        switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1835                                           skb, th, skb->len)) {
1836        case TCP_TW_SYN: {
1837                struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1838                                                          ntohs(th->dest),
1839                                                          tcp_v4_iif(skb));
1840                if (sk2) {
1841                        tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1842                        tcp_tw_put((struct tcp_tw_bucket *)sk);
1843                        sk = sk2;
1844                        goto process;
1845                }
1846                /* Fall through to ACK */
1847        }
1848        case TCP_TW_ACK:
1849                tcp_v4_timewait_ack(sk, skb);
1850                break;
1851        case TCP_TW_RST:
1852                goto no_tcp_socket;
1853        case TCP_TW_SUCCESS:;
1854        }
1855        goto discard_it;
1856}
1857
1858/* With per-bucket locks this operation is not-atomic, so that
1859 * this version is not worse.
1860 */
1861static void __tcp_v4_rehash(struct sock *sk)
1862{
1863        sk->sk_prot->unhash(sk);
1864        sk->sk_prot->hash(sk);
1865}
1866
1867static int tcp_v4_reselect_saddr(struct sock *sk)
1868{
1869        struct inet_opt *inet = inet_sk(sk);
1870        int err;
1871        struct rtable *rt;
1872        __u32 old_saddr = inet->saddr;
1873        __u32 new_saddr;
1874        __u32 daddr = inet->daddr;
1875
1876        if (inet->opt && inet->opt->srr)
1877                daddr = inet->opt->faddr;
1878
1879        /* Query new route. */
1880        err = ip_route_connect(&rt, daddr, 0,
1881                               RT_TOS(inet->tos) | sk->sk_localroute,
1882                               sk->sk_bound_dev_if,
1883                               IPPROTO_TCP,
1884                               inet->sport, inet->dport, sk);
1885        if (err)
1886                return err;
1887
1888        __sk_dst_set(sk, &rt->u.dst);
1889        tcp_v4_setup_caps(sk, &rt->u.dst);
1890        tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
1891
1892        new_saddr = rt->rt_src;
1893
1894        if (new_saddr == old_saddr)
1895                return 0;
1896
1897        if (sysctl_ip_dynaddr > 1) {
1898                printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1899                                 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1900                       NIPQUAD(old_saddr),
1901                       NIPQUAD(new_saddr));
1902        }
1903
1904        inet->saddr = new_saddr;
1905        inet->rcv_saddr = new_saddr;
1906
1907        /* XXX The only one ugly spot where we need to
1908         * XXX really change the sockets identity after
1909         * XXX it has entered the hashes. -DaveM
1910         *
1911         * Besides that, it does not check for connection
1912         * uniqueness. Wait for troubles.
1913         */
1914        __tcp_v4_rehash(sk);
1915        return 0;
1916}
1917
1918int tcp_v4_rebuild_header(struct sock *sk)
1919{
1920        struct inet_opt *inet = inet_sk(sk);
1921        struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1922        u32 daddr;
1923        int err;
1924
1925        /* Route is OK, nothing to do. */
1926        if (rt)
1927                return 0;
1928
1929        /* Reroute. */
1930        daddr = inet->daddr;
1931        if (inet->opt && inet->opt->srr)
1932                daddr = inet->opt->faddr;
1933
1934        {
1935                struct flowi fl = { .oif = sk->sk_bound_dev_if,
1936                                    .nl_u = { .ip4_u =
1937                                              { .daddr = daddr,
1938                                                .saddr = inet->saddr,
1939                                                .tos = RT_CONN_FLAGS(sk) } },
1940                                    .proto = IPPROTO_TCP,
1941                                    .uli_u = { .ports =
1942                                               { .sport = inet->sport,
1943                                                 .dport = inet->dport } } };
1944                                                
1945                err = ip_route_output_flow(&rt, &fl, sk, 0);
1946        }
1947        if (!err) {
1948                __sk_dst_set(sk, &rt->u.dst);
1949                tcp_v4_setup_caps(sk, &rt->u.dst);
1950                tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
1951                return 0;
1952        }
1953
1954        /* Routing failed... */
1955        sk->sk_route_caps = 0;
1956
1957        if (!sysctl_ip_dynaddr ||
1958            sk->sk_state != TCP_SYN_SENT ||
1959            (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1960            (err = tcp_v4_reselect_saddr(sk)) != 0)
1961                sk->sk_err_soft = -err;
1962
1963        return err;
1964}
1965
1966static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1967{
1968        struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1969        struct inet_opt *inet = inet_sk(sk);
1970
1971        sin->sin_family         = AF_INET;
1972        sin->sin_addr.s_addr    = inet->daddr;
1973        sin->sin_port           = inet->dport;
1974}
1975
1976/* VJ's idea. Save last timestamp seen from this destination
1977 * and hold it at least for normal timewait interval to use for duplicate
1978 * segment detection in subsequent connections, before they enter synchronized
1979 * state.
1980 */
1981
1982int tcp_v4_remember_stamp(struct sock *sk)
1983{
1984        struct inet_opt *inet = inet_sk(sk);
1985        struct tcp_opt *tp = tcp_sk(sk);
1986        struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1987        struct inet_peer *peer = NULL;
1988        int release_it = 0;
1989
1990        if (!rt || rt->rt_dst != inet->daddr) {
1991                peer = inet_getpeer(inet->daddr, 1);
1992                release_it = 1;
1993        } else {
1994                if (!rt->peer)
1995                        rt_bind_peer(rt, 1);
1996                peer = rt->peer;
1997        }
1998
1999        if (peer) {
2000                if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
2001                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2002                     peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
2003                        peer->tcp_ts_stamp = tp->ts_recent_stamp;
2004                        peer->tcp_ts = tp->ts_recent;
2005                }
2006                if (release_it)
2007                        inet_putpeer(peer);
2008                return 1;
2009        }
2010
2011        return 0;
2012}
2013
2014int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2015{
2016        struct inet_peer *peer = NULL;
2017
2018        peer = inet_getpeer(tw->tw_daddr, 1);
2019
2020        if (peer) {
2021                if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
2022                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2023                     peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2024                        peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2025                        peer->tcp_ts = tw->tw_ts_recent;
2026                }
2027                inet_putpeer(peer);
2028                return 1;
2029        }
2030
2031        return 0;
2032}
2033
2034struct tcp_func ipv4_specific = {
2035        .queue_xmit     =       ip_queue_xmit,
2036        .send_check     =       tcp_v4_send_check,
2037        .rebuild_header =       tcp_v4_rebuild_header,
2038        .conn_request   =       tcp_v4_conn_request,
2039        .syn_recv_sock  =       tcp_v4_syn_recv_sock,
2040        .remember_stamp =       tcp_v4_remember_stamp,
2041        .net_header_len =       sizeof(struct iphdr),
2042        .setsockopt     =       ip_setsockopt,
2043        .getsockopt     =       ip_getsockopt,
2044        .addr2sockaddr  =       v4_addr2sockaddr,
2045        .sockaddr_len   =       sizeof(struct sockaddr_in),
2046};
2047
2048/* NOTE: A lot of things set to zero explicitly by call to
2049 *       sk_alloc() so need not be done here.
2050 */
2051static int tcp_v4_init_sock(struct sock *sk)
2052{
2053        struct tcp_opt *tp = tcp_sk(sk);
2054
2055        skb_queue_head_init(&tp->out_of_order_queue);
2056        tcp_init_xmit_timers(sk);
2057        tcp_prequeue_init(tp);
2058
2059        tp->rto  = TCP_TIMEOUT_INIT;
2060        tp->mdev = TCP_TIMEOUT_INIT;
2061
2062        /* So many TCP implementations out there (incorrectly) count the
2063         * initial SYN frame in their delayed-ACK and congestion control
2064         * algorithms that we must have the following bandaid to talk
2065         * efficiently to them.  -DaveM
2066         */
2067        tp->snd_cwnd = 2;
2068
2069        /* See draft-stevens-tcpca-spec-01 for discussion of the
2070         * initialization of these values.
2071         */
2072        tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
2073        tp->snd_cwnd_clamp = ~0;
2074        tp->mss_cache_std = tp->mss_cache = 536;
2075
2076        tp->reordering = sysctl_tcp_reordering;
2077
2078        sk->sk_state = TCP_CLOSE;
2079
2080        sk->sk_write_space = sk_stream_write_space;
2081        sk->sk_use_write_queue = 1;
2082
2083        tp->af_specific = &ipv4_specific;
2084
2085        sk->sk_sndbuf = sysctl_tcp_wmem[1];
2086        sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2087
2088        atomic_inc(&tcp_sockets_allocated);
2089
2090        return 0;
2091}
2092
2093int tcp_v4_destroy_sock(struct sock *sk)
2094{
2095        struct tcp_opt *tp = tcp_sk(sk);
2096
2097        tcp_clear_xmit_timers(sk);
2098
2099        /* Cleanup up the write buffer. */
2100        sk_stream_writequeue_purge(sk);
2101
2102        /* Cleans up our, hopefully empty, out_of_order_queue. */
2103        __skb_queue_purge(&tp->out_of_order_queue);
2104
2105        /* Clean prequeue, it must be empty really */
2106        __skb_queue_purge(&tp->ucopy.prequeue);
2107
2108        /* Clean up a referenced TCP bind bucket. */
2109        if (tp->bind_hash)
2110                tcp_put_port(sk);
2111
2112        /*
2113         * If sendmsg cached page exists, toss it.
2114         */
2115        if (sk->sk_sndmsg_page) {
2116                __free_page(sk->sk_sndmsg_page);
2117                sk->sk_sndmsg_page = NULL;
2118        }
2119
2120        atomic_dec(&tcp_sockets_allocated);
2121
2122        return 0;
2123}
2124
2125EXPORT_SYMBOL(tcp_v4_destroy_sock);
2126
2127#ifdef CONFIG_PROC_FS
2128/* Proc filesystem TCP sock list dumping. */
2129
2130static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2131{
2132        return hlist_empty(head) ? NULL :
2133                list_entry(head->first, struct tcp_tw_bucket, tw_node);
2134}
2135
2136static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2137{
2138        return tw->tw_node.next ?
2139                hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2140}
2141
2142static void *listening_get_next(struct seq_file *seq, void *cur)
2143{
2144        struct tcp_opt *tp;
2145        struct hlist_node *node;
2146        struct sock *sk = cur;
2147        struct tcp_iter_state* st = seq->private;
2148
2149        if (!sk) {
2150                st->bucket = 0;
2151                sk = sk_head(&tcp_listening_hash[0]);
2152                goto get_sk;
2153        }
2154
2155        ++st->num;
2156
2157        if (st->state == TCP_SEQ_STATE_OPENREQ) {
2158                struct open_request *req = cur;
2159
2160                tp = tcp_sk(st->syn_wait_sk);
2161                req = req->dl_next;
2162                while (1) {
2163                        while (req) {
2164                                if (req->class->family == st->family) {
2165                                        cur = req;
2166                                        goto out;
2167                                }
2168                                req = req->dl_next;
2169                        }
2170                        if (++st->sbucket >= TCP_SYNQ_HSIZE)
2171                                break;
2172get_req:
2173                        req = tp->listen_opt->syn_table[st->sbucket];
2174                }
2175                sk        = sk_next(st->syn_wait_sk);
2176                st->state = TCP_SEQ_STATE_LISTENING;
2177                read_unlock_bh(&tp->syn_wait_lock);
2178        } else {
2179                tp = tcp_sk(sk);
2180                read_lock_bh(&tp->syn_wait_lock);
2181                if (tp->listen_opt && tp->listen_opt->qlen)
2182                        goto start_req;
2183                read_unlock_bh(&tp->syn_wait_lock);
2184                sk = sk_next(sk);
2185        }
2186get_sk:
2187        sk_for_each_from(sk, node) {
2188                if (sk->sk_family == st->family) {
2189                        cur = sk;
2190                        goto out;
2191                }
2192                tp = tcp_sk(sk);
2193                read_lock_bh(&tp->syn_wait_lock);
2194                if (tp->listen_opt && tp->listen_opt->qlen) {
2195start_req:
2196                        st->uid         = sock_i_uid(sk);
2197                        st->syn_wait_sk = sk;
2198                        st->state       = TCP_SEQ_STATE_OPENREQ;
2199                        st->sbucket     = 0;
2200                        goto get_req;
2201                }
2202                read_unlock_bh(&tp->syn_wait_lock);
2203        }
2204        if (++st->bucket < TCP_LHTABLE_SIZE) {
2205                sk = sk_head(&tcp_listening_hash[st->bucket]);
2206                goto get_sk;
2207        }
2208        cur = NULL;
2209out:
2210        return cur;
2211}
2212
2213static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2214{
2215        void *rc = listening_get_next(seq, NULL);
2216
2217        while (rc && *pos) {
2218                rc = listening_get_next(seq, rc);
2219                --*pos;
2220        }
2221        return rc;
2222}
2223
2224static void *established_get_first(struct seq_file *seq)
2225{
2226        struct tcp_iter_state* st = seq->private;
2227        void *rc = NULL;
2228
2229        for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2230                struct sock *sk;
2231                struct hlist_node *node;
2232                struct tcp_tw_bucket *tw;
2233               
2234                read_lock(&tcp_ehash[st->bucket].lock);
2235                sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2236                        if (sk->sk_family != st->family) {
2237                                continue;
2238                        }
2239                        rc = sk;
2240                        goto out;
2241                }
2242                st->state = TCP_SEQ_STATE_TIME_WAIT;
2243                tw_for_each(tw, node,
2244                            &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2245                        if (tw->tw_family != st->family) {
2246                                continue;
2247                        }
2248                        rc = tw;
2249                        goto out;
2250                }
2251                read_unlock(&tcp_ehash[st->bucket].lock);
2252                st->state = TCP_SEQ_STATE_ESTABLISHED;
2253        }
2254out:
2255        return rc;
2256}
2257
2258static void *established_get_next(struct seq_file *seq, void *cur)
2259{
2260        struct sock *sk = cur;
2261        struct tcp_tw_bucket *tw;
2262        struct hlist_node *node;
2263        struct tcp_iter_state* st = seq->private;
2264
2265        ++st->num;
2266
2267        if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2268                tw = cur;
2269                tw = tw_next(tw);
2270get_tw:
2271                while (tw && tw->tw_family != st->family) {
2272                        tw = tw_next(tw);
2273                }
2274                if (tw) {
2275                        cur = tw;
2276                        goto out;
2277                }
2278                read_unlock(&tcp_ehash[st->bucket].lock);
2279                st->state = TCP_SEQ_STATE_ESTABLISHED;
2280                if (++st->bucket < tcp_ehash_size) {
2281                        read_lock(&tcp_ehash[st->bucket].lock);
2282                        sk = sk_head(&tcp_ehash[st->bucket].chain);
2283                } else {
2284                        cur = NULL;
2285                        goto out;
2286                }
2287        } else
2288                sk = sk_next(sk);
2289
2290        sk_for_each_from(sk, node) {
2291                if (sk->sk_family == st->family)
2292                        goto found;
2293        }
2294
2295        st->state = TCP_SEQ_STATE_TIME_WAIT;
2296        tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2297        goto get_tw;
2298found:
2299        cur = sk;
2300out:
2301        return cur;
2302}
2303
2304static void *established_get_idx(struct seq_file *seq, loff_t pos)
2305{
2306        void *rc = established_get_first(seq);
2307
2308        while (rc && pos) {
2309                rc = established_get_next(seq, rc);
2310                --pos;
2311        }               
2312        return rc;
2313}
2314
2315static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2316{
2317        void *rc;
2318        struct tcp_iter_state* st = seq->private;
2319
2320        tcp_listen_lock();
2321        st->state = TCP_SEQ_STATE_LISTENING;
2322        rc        = listening_get_idx(seq, &pos);
2323
2324        if (!rc) {
2325                tcp_listen_unlock();
2326                local_bh_disable();
2327                st->state = TCP_SEQ_STATE_ESTABLISHED;
2328                rc        = established_get_idx(seq, pos);
2329        }
2330
2331        return rc;
2332}
2333
2334static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2335{
2336        struct tcp_iter_state* st = seq->private;
2337        st->state = TCP_SEQ_STATE_LISTENING;
2338        st->num = 0;
2339        return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2340}
2341
2342static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2343{
2344        void *rc = NULL;
2345        struct tcp_iter_state* st;
2346
2347        if (v == SEQ_START_TOKEN) {
2348                rc = tcp_get_idx(seq, 0);
2349                goto out;
2350        }
2351        st = seq->private;
2352
2353        switch (st->state) {
2354        case TCP_SEQ_STATE_OPENREQ:
2355        case TCP_SEQ_STATE_LISTENING:
2356                rc = listening_get_next(seq, v);
2357                if (!rc) {
2358                        tcp_listen_unlock();
2359                        local_bh_disable();
2360                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2361                        rc        = established_get_first(seq);
2362                }
2363                break;
2364        case TCP_SEQ_STATE_ESTABLISHED:
2365        case TCP_SEQ_STATE_TIME_WAIT:
2366                rc = established_get_next(seq, v);
2367                break;
2368        }
2369out:
2370        ++*pos;
2371        return rc;
2372}
2373
2374static void tcp_seq_stop(struct seq_file *seq, void *v)
2375{
2376        struct tcp_iter_state* st = seq->private;
2377
2378        switch (st->state) {
2379        case TCP_SEQ_STATE_OPENREQ:
2380                if (v) {
2381                        struct tcp_opt *tp = tcp_sk(st->syn_wait_sk);
2382                        read_unlock_bh(&tp->syn_wait_lock);
2383                }
2384        case TCP_SEQ_STATE_LISTENING:
2385                if (v != SEQ_START_TOKEN)
2386                        tcp_listen_unlock();
2387                break;
2388        case TCP_SEQ_STATE_TIME_WAIT:
2389        case TCP_SEQ_STATE_ESTABLISHED:
2390                if (v)
2391                        read_unlock(&tcp_ehash[st->bucket].lock);
2392                local_bh_enable();
2393                break;
2394        }
2395}
2396
2397static int tcp_seq_open(struct inode *inode, struct file *file)
2398{
2399        struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2400        struct seq_file *seq;
2401        struct tcp_iter_state *s;
2402        int rc;
2403
2404        if (unlikely(afinfo == NULL))
2405                return -EINVAL;
2406
2407        s = kmalloc(sizeof(*s), GFP_KERNEL);
2408        if (!s)
2409                return -ENOMEM;
2410        memset(s, 0, sizeof(*s));
2411        s->family               = afinfo->family;
2412        s->seq_ops.start        = tcp_seq_start;
2413        s->seq_ops.next         = tcp_seq_next;
2414        s->seq_ops.show         = afinfo->seq_show;
2415        s->seq_ops.stop         = tcp_seq_stop;
2416
2417        rc = seq_open(file, &s->seq_ops);
2418        if (rc)
2419                goto out_kfree;
2420        seq          = file->private_data;
2421        seq->private = s;
2422out:
2423        return rc;
2424out_kfree:
2425        kfree(s);
2426        goto out;
2427}
2428
2429int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2430{
2431        int rc = 0;
2432        struct proc_dir_entry *p;
2433
2434        if (!afinfo)
2435                return -EINVAL;
2436        afinfo->seq_fops->owner         = afinfo->owner;
2437        afinfo->seq_fops->open          = tcp_seq_open;
2438        afinfo->seq_fops->read          = seq_read;
2439        afinfo->seq_fops->llseek        = seq_lseek;
2440        afinfo->seq_fops->release       = seq_release_private;
2441        
2442        p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2443        if (p)
2444                p->data = afinfo;
2445        else
2446                rc = -ENOMEM;
2447        return rc;
2448}
2449
2450void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2451{
2452        if (!afinfo)
2453                return;
2454        proc_net_remove(afinfo->name);
2455        memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
2456}
2457
2458static void get_openreq4(struct sock *sk, struct open_request *req,
2459                         char *tmpbuf, int i, int uid)
2460{
2461        int ttd = req->expires - jiffies;
2462
2463        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2464                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2465                i,
2466                req->af.v4_req.loc_addr,
2467                ntohs(inet_sk(sk)->sport),
2468                req->af.v4_req.rmt_addr,
2469                ntohs(req->rmt_port),
2470                TCP_SYN_RECV,
2471                0, 0, /* could print option size, but that is af dependent. */
2472                1,    /* timers active (only the expire timer) */
2473                jiffies_to_clock_t(ttd),
2474                req->retrans,
2475                uid,
2476                0,  /* non standard timer */
2477                0, /* open_requests have no inode */
2478                atomic_read(&sk->sk_refcnt),
2479                req);
2480}
2481
2482static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2483{
2484        int timer_active;
2485        unsigned long timer_expires;
2486        struct tcp_opt *tp = tcp_sk(sp);
2487        struct inet_opt *inet = inet_sk(sp);
2488        unsigned int dest = inet->daddr;
2489        unsigned int src = inet->rcv_saddr;
2490        __u16 destp = ntohs(inet->dport);
2491        __u16 srcp = ntohs(inet->sport);
2492
2493        if (tp->pending == TCP_TIME_RETRANS) {
2494                timer_active    = 1;
2495                timer_expires   = tp->timeout;
2496        } else if (tp->pending == TCP_TIME_PROBE0) {
2497                timer_active    = 4;
2498                timer_expires   = tp->timeout;
2499        } else if (timer_pending(&sp->sk_timer)) {
2500                timer_active    = 2;
2501                timer_expires   = sp->sk_timer.expires;
2502        } else {
2503                timer_active    = 0;
2504                timer_expires = jiffies;
2505        }
2506
2507        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2508                        "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2509                i, src, srcp, dest, destp, sp->sk_state,
2510                tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2511                timer_active,
2512                jiffies_to_clock_t(timer_expires - jiffies),
2513                tp->retransmits,
2514                sock_i_uid(sp),
2515                tp->probes_out,
2516                sock_i_ino(sp),
2517                atomic_read(&sp->sk_refcnt), sp,
2518                tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2519                tp->snd_cwnd,
2520                tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2521}
2522
2523static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2524{
2525        unsigned int dest, src;
2526        __u16 destp, srcp;
2527        int ttd = tw->tw_ttd - jiffies;
2528
2529        if (ttd < 0)
2530                ttd = 0;
2531
2532        dest  = tw->tw_daddr;
2533        src   = tw->tw_rcv_saddr;
2534        destp = ntohs(tw->tw_dport);
2535        srcp  = ntohs(tw->tw_sport);
2536
2537        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2538                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2539                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2540                3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2541                atomic_read(&tw->tw_refcnt), tw);
2542}
2543
2544#define TMPSZ 150
2545
2546static int tcp4_seq_show(struct seq_file *seq, void *v)
2547{
2548        struct tcp_iter_state* st;
2549        char tmpbuf[TMPSZ + 1];
2550
2551        if (v == SEQ_START_TOKEN) {
2552                seq_printf(seq, "%-*s\n", TMPSZ - 1,
2553                           "  sl  local_address rem_address   st tx_queue "
2554                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2555                           "inode");
2556                goto out;
2557        }
2558        st = seq->private;
2559
2560        switch (st->state) {
2561        case TCP_SEQ_STATE_LISTENING:
2562        case TCP_SEQ_STATE_ESTABLISHED:
2563                get_tcp4_sock(v, tmpbuf, st->num);
2564                break;
2565        case TCP_SEQ_STATE_OPENREQ:
2566                get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2567                break;
2568        case TCP_SEQ_STATE_TIME_WAIT:
2569                get_timewait4_sock(v, tmpbuf, st->num);
2570                break;
2571        }
2572        seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2573out:
2574        return 0;
2575}
2576
2577static struct file_operations tcp4_seq_fops;
2578static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2579        .owner          = THIS_MODULE,
2580        .name           = "tcp",
2581        .family         = AF_INET,
2582        .seq_show       = tcp4_seq_show,
2583        .seq_fops       = &tcp4_seq_fops,
2584};
2585
2586int __init tcp4_proc_init(void)
2587{
2588        return tcp_proc_register(&tcp4_seq_afinfo);
2589}
2590
2591void tcp4_proc_exit(void)
2592{
2593        tcp_proc_unregister(&tcp4_seq_afinfo);
2594}
2595#endif /* CONFIG_PROC_FS */
2596
2597struct proto tcp_prot = {
2598        .name                   = "TCP",
2599        .close                  = tcp_close,
2600        .connect                = tcp_v4_connect,
2601        .disconnect             = tcp_disconnect,
2602        .accept                 = tcp_accept,
2603        .ioctl                  = tcp_ioctl,
2604        .init                   = tcp_v4_init_sock,
2605        .destroy                = tcp_v4_destroy_sock,
2606        .shutdown               = tcp_shutdown,
2607        .setsockopt             = tcp_setsockopt,
2608        .getsockopt             = tcp_getsockopt,
2609        .sendmsg                = tcp_sendmsg,
2610        .recvmsg                = tcp_recvmsg,
2611        .backlog_rcv            = tcp_v4_do_rcv,
2612        .hash                   = tcp_v4_hash,
2613        .unhash                 = tcp_unhash,
2614        .get_port               = tcp_v4_get_port,
2615        .enter_memory_pressure  = tcp_enter_memory_pressure,
2616        .sockets_allocated      = &tcp_sockets_allocated,
2617        .memory_allocated       = &tcp_memory_allocated,
2618        .memory_pressure        = &tcp_memory_pressure,
2619        .sysctl_mem             = sysctl_tcp_mem,
2620        .sysctl_wmem            = sysctl_tcp_wmem,
2621        .sysctl_rmem            = sysctl_tcp_rmem,
2622        .max_header             = MAX_TCP_HEADER,
2623        .slab_obj_size          = sizeof(struct tcp_sock),
2624};
2625
2626
2627
2628void __init tcp_v4_init(struct net_proto_family *ops)
2629{
2630        int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2631        if (err < 0)
2632                panic("Failed to create the TCP control socket.\n");
2633        tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2634        inet_sk(tcp_socket->sk)->uc_ttl = -1;
2635
2636        /* Unhash it so that IP input processing does not even
2637         * see it, we do not wish this socket to see incoming
2638         * packets.
2639         */
2640        tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2641}
2642
/* Symbols exported unconditionally. */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_bind_hash);
EXPORT_SYMBOL(tcp_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_inherit_port);
EXPORT_SYMBOL(tcp_listen_wlock);
EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_lookup_listener);
EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* Exported only when procfs support is configured. */
#if defined(CONFIG_PROC_FS)
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
/* Exported only when sysctl support is configured. */
#if defined(CONFIG_SYSCTL)
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_max_syn_backlog);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
#endif
2671
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.