linux/net/netfilter/nf_conntrack_core.c
   1/* Connection state tracking for netfilter.  This is separated from,
   2   but required by, the NAT layer; it can also be used by an iptables
   3   extension. */
   4
   5/* (C) 1999-2001 Paul `Rusty' Russell
   6 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
   7 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of the GNU General Public License version 2 as
  11 * published by the Free Software Foundation.
  12 */
  13
  14#include <linux/types.h>
  15#include <linux/netfilter.h>
  16#include <linux/module.h>
  17#include <linux/skbuff.h>
  18#include <linux/proc_fs.h>
  19#include <linux/vmalloc.h>
  20#include <linux/stddef.h>
  21#include <linux/slab.h>
  22#include <linux/random.h>
  23#include <linux/jhash.h>
  24#include <linux/err.h>
  25#include <linux/percpu.h>
  26#include <linux/moduleparam.h>
  27#include <linux/notifier.h>
  28#include <linux/kernel.h>
  29#include <linux/netdevice.h>
  30#include <linux/socket.h>
  31#include <linux/mm.h>
  32
  33#include <net/netfilter/nf_conntrack.h>
  34#include <net/netfilter/nf_conntrack_l3proto.h>
  35#include <net/netfilter/nf_conntrack_l4proto.h>
  36#include <net/netfilter/nf_conntrack_expect.h>
  37#include <net/netfilter/nf_conntrack_helper.h>
  38#include <net/netfilter/nf_conntrack_core.h>
  39#include <net/netfilter/nf_conntrack_extend.h>
  40#include <net/netfilter/nf_conntrack_acct.h>
  41#include <net/netfilter/nf_nat.h>
  42
  43#define NF_CONNTRACK_VERSION    "0.5.0"
  44
  45unsigned int
  46(*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
  47                                  enum nf_nat_manip_type manip,
  48                                  struct nlattr *attr) __read_mostly;
  49EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
  50
  51DEFINE_SPINLOCK(nf_conntrack_lock);
  52EXPORT_SYMBOL_GPL(nf_conntrack_lock);
  53
  54unsigned int nf_conntrack_htable_size __read_mostly;
  55EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
  56
  57int nf_conntrack_max __read_mostly;
  58EXPORT_SYMBOL_GPL(nf_conntrack_max);
  59
  60struct nf_conn nf_conntrack_untracked __read_mostly;
  61EXPORT_SYMBOL_GPL(nf_conntrack_untracked);
  62
  63static struct kmem_cache *nf_conntrack_cachep __read_mostly;
  64
  65static int nf_conntrack_hash_rnd_initted;
  66static unsigned int nf_conntrack_hash_rnd;
  67
  68static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
  69                                  unsigned int size, unsigned int rnd)
  70{
  71        unsigned int n;
  72        u_int32_t h;
  73
  74        /* The direction must be ignored, so we hash everything up to the
  75         * destination ports (which is a multiple of 4) and treat the last
  76         * three bytes manually.
  77         */
  78        n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
  79        h = jhash2((u32 *)tuple, n,
  80                   rnd ^ (((__force __u16)tuple->dst.u.all << 16) |
  81                          tuple->dst.protonum));
  82
  83        return ((u64)h * size) >> 32;
  84}
  85
  86static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
  87{
  88        return __hash_conntrack(tuple, nf_conntrack_htable_size,
  89                                nf_conntrack_hash_rnd);
  90}
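
#if 0
/* Illustrative sketch (hypothetical helper, not part of this file): the
 * bucket index above is taken with a multiply-shift instead of "h % size",
 * scaling the full 32-bit hash range proportionally onto [0, size).  For
 * example, h = 0x80000000 with size = 16384 lands in bucket 8192. */
static unsigned int example_bucket(u_int32_t h, unsigned int size)
{
        return ((u64)h * size) >> 32;
}
#endif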
  91
  92bool
  93nf_ct_get_tuple(const struct sk_buff *skb,
  94                unsigned int nhoff,
  95                unsigned int dataoff,
  96                u_int16_t l3num,
  97                u_int8_t protonum,
  98                struct nf_conntrack_tuple *tuple,
  99                const struct nf_conntrack_l3proto *l3proto,
 100                const struct nf_conntrack_l4proto *l4proto)
 101{
 102        memset(tuple, 0, sizeof(*tuple));
 103
 104        tuple->src.l3num = l3num;
 105        if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
 106                return false;
 107
 108        tuple->dst.protonum = protonum;
 109        tuple->dst.dir = IP_CT_DIR_ORIGINAL;
 110
 111        return l4proto->pkt_to_tuple(skb, dataoff, tuple);
 112}
 113EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
 114
 115bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
 116                       u_int16_t l3num, struct nf_conntrack_tuple *tuple)
 117{
 118        struct nf_conntrack_l3proto *l3proto;
 119        struct nf_conntrack_l4proto *l4proto;
 120        unsigned int protoff;
 121        u_int8_t protonum;
 122        int ret;
 123
 124        rcu_read_lock();
 125
 126        l3proto = __nf_ct_l3proto_find(l3num);
 127        ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
 128        if (ret != NF_ACCEPT) {
 129                rcu_read_unlock();
 130                return false;
 131        }
 132
 133        l4proto = __nf_ct_l4proto_find(l3num, protonum);
 134
 135        ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple,
 136                              l3proto, l4proto);
 137
 138        rcu_read_unlock();
 139        return ret;
 140}
 141EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
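
#if 0
/* Illustrative sketch (hypothetical caller, not part of this file): pull the
 * original-direction tuple out of an IPv4 packet.  Assumes the skb has its
 * network header offset set, as it is in the netfilter hooks. */
static bool example_extract_tuple(const struct sk_buff *skb,
                                  struct nf_conntrack_tuple *tuple)
{
        if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), PF_INET, tuple))
                return false;   /* headers could not be parsed */
        /* tuple->src.u3.ip, tuple->dst.u.tcp.port etc. are now filled in */
        return true;
}
#endif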
 142
 143bool
 144nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
 145                   const struct nf_conntrack_tuple *orig,
 146                   const struct nf_conntrack_l3proto *l3proto,
 147                   const struct nf_conntrack_l4proto *l4proto)
 148{
 149        memset(inverse, 0, sizeof(*inverse));
 150
 151        inverse->src.l3num = orig->src.l3num;
 152        if (l3proto->invert_tuple(inverse, orig) == 0)
 153                return false;
 154
 155        inverse->dst.dir = !orig->dst.dir;
 156
 157        inverse->dst.protonum = orig->dst.protonum;
 158        return l4proto->invert_tuple(inverse, orig);
 159}
 160EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
 161
 162static void
 163clean_from_lists(struct nf_conn *ct)
 164{
 165        pr_debug("clean_from_lists(%p)\n", ct);
 166        hlist_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode);
 167        hlist_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnode);
 168
 169        /* Destroy all pending expectations */
 170        nf_ct_remove_expectations(ct);
 171}
 172
 173static void
 174destroy_conntrack(struct nf_conntrack *nfct)
 175{
 176        struct nf_conn *ct = (struct nf_conn *)nfct;
 177        struct net *net = nf_ct_net(ct);
 178        struct nf_conntrack_l4proto *l4proto;
 179
 180        pr_debug("destroy_conntrack(%p)\n", ct);
 181        NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
 182        NF_CT_ASSERT(!timer_pending(&ct->timeout));
 183
 184        nf_conntrack_event(IPCT_DESTROY, ct);
 185        set_bit(IPS_DYING_BIT, &ct->status);
 186
 187        /* To make sure we don't get any weird locking issues here:
 188         * destroy_conntrack() MUST NOT be called with a write lock
 189         * to nf_conntrack_lock!!! -HW */
 190        rcu_read_lock();
 191        l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
 192        if (l4proto && l4proto->destroy)
 193                l4proto->destroy(ct);
 194
 195        rcu_read_unlock();
 196
 197        spin_lock_bh(&nf_conntrack_lock);
  198        /* Expectations will have been removed in clean_from_lists,
  199         * except that TFTP can create an expectation on the first
  200         * packet, before the connection is in the list, so we need to
  201         * clean here, too. */
 202        nf_ct_remove_expectations(ct);
 203
 204        /* We overload first tuple to link into unconfirmed list. */
 205        if (!nf_ct_is_confirmed(ct)) {
 206                BUG_ON(hlist_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode));
 207                hlist_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode);
 208        }
 209
 210        NF_CT_STAT_INC(net, delete);
 211        spin_unlock_bh(&nf_conntrack_lock);
 212
 213        if (ct->master)
 214                nf_ct_put(ct->master);
 215
 216        pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
 217        nf_conntrack_free(ct);
 218}
 219
 220static void death_by_timeout(unsigned long ul_conntrack)
 221{
 222        struct nf_conn *ct = (void *)ul_conntrack;
 223        struct net *net = nf_ct_net(ct);
 224        struct nf_conn_help *help = nfct_help(ct);
 225        struct nf_conntrack_helper *helper;
 226
 227        if (help) {
 228                rcu_read_lock();
 229                helper = rcu_dereference(help->helper);
 230                if (helper && helper->destroy)
 231                        helper->destroy(ct);
 232                rcu_read_unlock();
 233        }
 234
 235        spin_lock_bh(&nf_conntrack_lock);
 236        /* Inside lock so preempt is disabled on module removal path.
 237         * Otherwise we can get spurious warnings. */
 238        NF_CT_STAT_INC(net, delete_list);
 239        clean_from_lists(ct);
 240        spin_unlock_bh(&nf_conntrack_lock);
 241        nf_ct_put(ct);
 242}
 243
 244struct nf_conntrack_tuple_hash *
 245__nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple)
 246{
 247        struct nf_conntrack_tuple_hash *h;
 248        struct hlist_node *n;
 249        unsigned int hash = hash_conntrack(tuple);
 250
 251        /* Disable BHs the entire time since we normally need to disable them
 252         * at least once for the stats anyway.
 253         */
 254        local_bh_disable();
 255        hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnode) {
 256                if (nf_ct_tuple_equal(tuple, &h->tuple)) {
 257                        NF_CT_STAT_INC(net, found);
 258                        local_bh_enable();
 259                        return h;
 260                }
 261                NF_CT_STAT_INC(net, searched);
 262        }
 263        local_bh_enable();
 264
 265        return NULL;
 266}
 267EXPORT_SYMBOL_GPL(__nf_conntrack_find);
 268
 269/* Find a connection corresponding to a tuple. */
 270struct nf_conntrack_tuple_hash *
 271nf_conntrack_find_get(struct net *net, const struct nf_conntrack_tuple *tuple)
 272{
 273        struct nf_conntrack_tuple_hash *h;
 274        struct nf_conn *ct;
 275
 276        rcu_read_lock();
 277        h = __nf_conntrack_find(net, tuple);
 278        if (h) {
 279                ct = nf_ct_tuplehash_to_ctrack(h);
 280                if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
 281                        h = NULL;
 282        }
 283        rcu_read_unlock();
 284
 285        return h;
 286}
 287EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
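
#if 0
/* Illustrative sketch (hypothetical helper, not part of this file): a lookup
 * via nf_conntrack_find_get() returns an entry with its refcount already
 * bumped, so the caller must drop the reference again with nf_ct_put(). */
static bool example_tuple_is_tracked(struct net *net,
                                     const struct nf_conntrack_tuple *tuple)
{
        struct nf_conntrack_tuple_hash *h;

        h = nf_conntrack_find_get(net, tuple);
        if (!h)
                return false;
        nf_ct_put(nf_ct_tuplehash_to_ctrack(h));
        return true;
}
#endif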
 288
 289static void __nf_conntrack_hash_insert(struct nf_conn *ct,
 290                                       unsigned int hash,
 291                                       unsigned int repl_hash)
 292{
 293        struct net *net = nf_ct_net(ct);
 294
 295        hlist_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode,
 296                           &net->ct.hash[hash]);
 297        hlist_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnode,
 298                           &net->ct.hash[repl_hash]);
 299}
 300
 301void nf_conntrack_hash_insert(struct nf_conn *ct)
 302{
 303        unsigned int hash, repl_hash;
 304
 305        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 306        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 307
 308        __nf_conntrack_hash_insert(ct, hash, repl_hash);
 309}
 310EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert);
 311
 312/* Confirm a connection given skb; places it in hash table */
 313int
 314__nf_conntrack_confirm(struct sk_buff *skb)
 315{
 316        unsigned int hash, repl_hash;
 317        struct nf_conntrack_tuple_hash *h;
 318        struct nf_conn *ct;
 319        struct nf_conn_help *help;
 320        struct hlist_node *n;
 321        enum ip_conntrack_info ctinfo;
 322        struct net *net;
 323
 324        ct = nf_ct_get(skb, &ctinfo);
 325        net = nf_ct_net(ct);
 326
  327        /* ipt_REJECT uses nf_conntrack_attach to attach related
  328           ICMP/TCP RST packets in the other direction.  The actual
  329           packet which created the connection will be IP_CT_NEW or,
  330           for an expected connection, IP_CT_RELATED. */
 331        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
 332                return NF_ACCEPT;
 333
 334        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 335        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 336
  337        /* We're not in the hash table, and we refuse to set up related
  338           connections for unconfirmed conns.  But packet copies and
  339           REJECT will give spurious warnings here. */
 340        /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
 341
  342        /* No external references means no one else could have
 343           confirmed us. */
 344        NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
 345        pr_debug("Confirming conntrack %p\n", ct);
 346
 347        spin_lock_bh(&nf_conntrack_lock);
 348
 349        /* See if there's one in the list already, including reverse:
 350           NAT could have grabbed it without realizing, since we're
 351           not in the hash.  If there is, we lost race. */
 352        hlist_for_each_entry(h, n, &net->ct.hash[hash], hnode)
 353                if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 354                                      &h->tuple))
 355                        goto out;
 356        hlist_for_each_entry(h, n, &net->ct.hash[repl_hash], hnode)
 357                if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 358                                      &h->tuple))
 359                        goto out;
 360
 361        /* Remove from unconfirmed list */
 362        hlist_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode);
 363
 364        __nf_conntrack_hash_insert(ct, hash, repl_hash);
 365        /* Timer relative to confirmation time, not original
 366           setting time, otherwise we'd get timer wrap in
 367           weird delay cases. */
 368        ct->timeout.expires += jiffies;
 369        add_timer(&ct->timeout);
 370        atomic_inc(&ct->ct_general.use);
 371        set_bit(IPS_CONFIRMED_BIT, &ct->status);
 372        NF_CT_STAT_INC(net, insert);
 373        spin_unlock_bh(&nf_conntrack_lock);
 374        help = nfct_help(ct);
 375        if (help && help->helper)
 376                nf_conntrack_event_cache(IPCT_HELPER, ct);
 377#ifdef CONFIG_NF_NAT_NEEDED
 378        if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
 379            test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
 380                nf_conntrack_event_cache(IPCT_NATINFO, ct);
 381#endif
 382        nf_conntrack_event_cache(master_ct(ct) ?
 383                                 IPCT_RELATED : IPCT_NEW, ct);
 384        return NF_ACCEPT;
 385
 386out:
 387        NF_CT_STAT_INC(net, insert_failed);
 388        spin_unlock_bh(&nf_conntrack_lock);
 389        return NF_DROP;
 390}
 391EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
 392
  393/* Returns true if a connection other than ignored_conntrack
  394   corresponds to the tuple (required for NAT). */
 395int
 396nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 397                         const struct nf_conn *ignored_conntrack)
 398{
 399        struct net *net = nf_ct_net(ignored_conntrack);
 400        struct nf_conntrack_tuple_hash *h;
 401        struct hlist_node *n;
 402        unsigned int hash = hash_conntrack(tuple);
 403
 404        /* Disable BHs the entire time since we need to disable them at
 405         * least once for the stats anyway.
 406         */
 407        rcu_read_lock_bh();
 408        hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnode) {
 409                if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack &&
 410                    nf_ct_tuple_equal(tuple, &h->tuple)) {
 411                        NF_CT_STAT_INC(net, found);
 412                        rcu_read_unlock_bh();
 413                        return 1;
 414                }
 415                NF_CT_STAT_INC(net, searched);
 416        }
 417        rcu_read_unlock_bh();
 418
 419        return 0;
 420}
 421EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
 422
 423#define NF_CT_EVICTION_RANGE    8
 424
 425/* There's a small race here where we may free a just-assured
 426   connection.  Too bad: we're in trouble anyway. */
 427static noinline int early_drop(struct net *net, unsigned int hash)
 428{
 429        /* Use oldest entry, which is roughly LRU */
 430        struct nf_conntrack_tuple_hash *h;
 431        struct nf_conn *ct = NULL, *tmp;
 432        struct hlist_node *n;
 433        unsigned int i, cnt = 0;
 434        int dropped = 0;
 435
 436        rcu_read_lock();
 437        for (i = 0; i < nf_conntrack_htable_size; i++) {
 438                hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash],
 439                                         hnode) {
 440                        tmp = nf_ct_tuplehash_to_ctrack(h);
 441                        if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
 442                                ct = tmp;
 443                        cnt++;
 444                }
 445
 446                if (ct && unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
 447                        ct = NULL;
 448                if (ct || cnt >= NF_CT_EVICTION_RANGE)
 449                        break;
 450                hash = (hash + 1) % nf_conntrack_htable_size;
 451        }
 452        rcu_read_unlock();
 453
 454        if (!ct)
 455                return dropped;
 456
 457        if (del_timer(&ct->timeout)) {
 458                death_by_timeout((unsigned long)ct);
 459                dropped = 1;
 460                NF_CT_STAT_INC_ATOMIC(net, early_drop);
 461        }
 462        nf_ct_put(ct);
 463        return dropped;
 464}
 465
 466struct nf_conn *nf_conntrack_alloc(struct net *net,
 467                                   const struct nf_conntrack_tuple *orig,
 468                                   const struct nf_conntrack_tuple *repl,
 469                                   gfp_t gfp)
 470{
 471        struct nf_conn *ct = NULL;
 472
 473        if (unlikely(!nf_conntrack_hash_rnd_initted)) {
 474                get_random_bytes(&nf_conntrack_hash_rnd, 4);
 475                nf_conntrack_hash_rnd_initted = 1;
 476        }
 477
 478        /* We don't want any race condition at early drop stage */
 479        atomic_inc(&net->ct.count);
 480
 481        if (nf_conntrack_max &&
 482            unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
 483                unsigned int hash = hash_conntrack(orig);
 484                if (!early_drop(net, hash)) {
 485                        atomic_dec(&net->ct.count);
 486                        if (net_ratelimit())
 487                                printk(KERN_WARNING
 488                                       "nf_conntrack: table full, dropping"
 489                                       " packet.\n");
 490                        return ERR_PTR(-ENOMEM);
 491                }
 492        }
 493
 494        ct = kmem_cache_zalloc(nf_conntrack_cachep, gfp);
 495        if (ct == NULL) {
 496                pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
 497                atomic_dec(&net->ct.count);
 498                return ERR_PTR(-ENOMEM);
 499        }
 500
 501        atomic_set(&ct->ct_general.use, 1);
 502        ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
 503        ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
 504        /* Don't set timer yet: wait for confirmation */
 505        setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
 506#ifdef CONFIG_NET_NS
 507        ct->ct_net = net;
 508#endif
 509        INIT_RCU_HEAD(&ct->rcu);
 510
 511        return ct;
 512}
 513EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
 514
 515static void nf_conntrack_free_rcu(struct rcu_head *head)
 516{
 517        struct nf_conn *ct = container_of(head, struct nf_conn, rcu);
 518        struct net *net = nf_ct_net(ct);
 519
 520        nf_ct_ext_free(ct);
 521        kmem_cache_free(nf_conntrack_cachep, ct);
 522        atomic_dec(&net->ct.count);
 523}
 524
 525void nf_conntrack_free(struct nf_conn *ct)
 526{
 527        nf_ct_ext_destroy(ct);
 528        call_rcu(&ct->rcu, nf_conntrack_free_rcu);
 529}
 530EXPORT_SYMBOL_GPL(nf_conntrack_free);
 531
 532/* Allocate a new conntrack: we return -ENOMEM if classification
 533   failed due to stress.  Otherwise it really is unclassifiable. */
 534static struct nf_conntrack_tuple_hash *
 535init_conntrack(struct net *net,
 536               const struct nf_conntrack_tuple *tuple,
 537               struct nf_conntrack_l3proto *l3proto,
 538               struct nf_conntrack_l4proto *l4proto,
 539               struct sk_buff *skb,
 540               unsigned int dataoff)
 541{
 542        struct nf_conn *ct;
 543        struct nf_conn_help *help;
 544        struct nf_conntrack_tuple repl_tuple;
 545        struct nf_conntrack_expect *exp;
 546
 547        if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
 548                pr_debug("Can't invert tuple.\n");
 549                return NULL;
 550        }
 551
 552        ct = nf_conntrack_alloc(net, tuple, &repl_tuple, GFP_ATOMIC);
 553        if (ct == NULL || IS_ERR(ct)) {
 554                pr_debug("Can't allocate conntrack.\n");
 555                return (struct nf_conntrack_tuple_hash *)ct;
 556        }
 557
 558        if (!l4proto->new(ct, skb, dataoff)) {
 559                nf_conntrack_free(ct);
 560                pr_debug("init conntrack: can't track with proto module\n");
 561                return NULL;
 562        }
 563
 564        nf_ct_acct_ext_add(ct, GFP_ATOMIC);
 565
 566        spin_lock_bh(&nf_conntrack_lock);
 567        exp = nf_ct_find_expectation(net, tuple);
 568        if (exp) {
 569                pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
 570                         ct, exp);
 571                /* Welcome, Mr. Bond.  We've been expecting you... */
 572                __set_bit(IPS_EXPECTED_BIT, &ct->status);
 573                ct->master = exp->master;
 574                if (exp->helper) {
 575                        help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
 576                        if (help)
 577                                rcu_assign_pointer(help->helper, exp->helper);
 578                }
 579
 580#ifdef CONFIG_NF_CONNTRACK_MARK
 581                ct->mark = exp->master->mark;
 582#endif
 583#ifdef CONFIG_NF_CONNTRACK_SECMARK
 584                ct->secmark = exp->master->secmark;
 585#endif
 586                nf_conntrack_get(&ct->master->ct_general);
 587                NF_CT_STAT_INC(net, expect_new);
 588        } else {
 589                struct nf_conntrack_helper *helper;
 590
 591                helper = __nf_ct_helper_find(&repl_tuple);
 592                if (helper) {
 593                        help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
 594                        if (help)
 595                                rcu_assign_pointer(help->helper, helper);
 596                }
 597                NF_CT_STAT_INC(net, new);
 598        }
 599
 600        /* Overload tuple linked list to put us in unconfirmed list. */
 601        hlist_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode,
 602                       &net->ct.unconfirmed);
 603
 604        spin_unlock_bh(&nf_conntrack_lock);
 605
 606        if (exp) {
 607                if (exp->expectfn)
 608                        exp->expectfn(ct, exp);
 609                nf_ct_expect_put(exp);
 610        }
 611
 612        return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
 613}
 614
 615/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
 616static inline struct nf_conn *
 617resolve_normal_ct(struct net *net,
 618                  struct sk_buff *skb,
 619                  unsigned int dataoff,
 620                  u_int16_t l3num,
 621                  u_int8_t protonum,
 622                  struct nf_conntrack_l3proto *l3proto,
 623                  struct nf_conntrack_l4proto *l4proto,
 624                  int *set_reply,
 625                  enum ip_conntrack_info *ctinfo)
 626{
 627        struct nf_conntrack_tuple tuple;
 628        struct nf_conntrack_tuple_hash *h;
 629        struct nf_conn *ct;
 630
 631        if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
 632                             dataoff, l3num, protonum, &tuple, l3proto,
 633                             l4proto)) {
 634                pr_debug("resolve_normal_ct: Can't get tuple\n");
 635                return NULL;
 636        }
 637
 638        /* look for tuple match */
 639        h = nf_conntrack_find_get(net, &tuple);
 640        if (!h) {
 641                h = init_conntrack(net, &tuple, l3proto, l4proto, skb, dataoff);
 642                if (!h)
 643                        return NULL;
 644                if (IS_ERR(h))
 645                        return (void *)h;
 646        }
 647        ct = nf_ct_tuplehash_to_ctrack(h);
 648
  649        /* It exists; we have a (non-exclusive) reference. */
 650        if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
 651                *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
 652                /* Please set reply bit if this packet OK */
 653                *set_reply = 1;
 654        } else {
 655                /* Once we've had two way comms, always ESTABLISHED. */
 656                if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
 657                        pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
 658                        *ctinfo = IP_CT_ESTABLISHED;
 659                } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
 660                        pr_debug("nf_conntrack_in: related packet for %p\n",
 661                                 ct);
 662                        *ctinfo = IP_CT_RELATED;
 663                } else {
 664                        pr_debug("nf_conntrack_in: new packet for %p\n", ct);
 665                        *ctinfo = IP_CT_NEW;
 666                }
 667                *set_reply = 0;
 668        }
 669        skb->nfct = &ct->ct_general;
 670        skb->nfctinfo = *ctinfo;
 671        return ct;
 672}
 673
 674unsigned int
 675nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 676                struct sk_buff *skb)
 677{
 678        struct nf_conn *ct;
 679        enum ip_conntrack_info ctinfo;
 680        struct nf_conntrack_l3proto *l3proto;
 681        struct nf_conntrack_l4proto *l4proto;
 682        unsigned int dataoff;
 683        u_int8_t protonum;
 684        int set_reply = 0;
 685        int ret;
 686
 687        /* Previously seen (loopback or untracked)?  Ignore. */
 688        if (skb->nfct) {
 689                NF_CT_STAT_INC_ATOMIC(net, ignore);
 690                return NF_ACCEPT;
 691        }
 692
 693        /* rcu_read_lock()ed by nf_hook_slow */
 694        l3proto = __nf_ct_l3proto_find(pf);
 695        ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
 696                                   &dataoff, &protonum);
 697        if (ret <= 0) {
  698                pr_debug("not prepared to track yet or error occurred\n");
 699                NF_CT_STAT_INC_ATOMIC(net, error);
 700                NF_CT_STAT_INC_ATOMIC(net, invalid);
 701                return -ret;
 702        }
 703
 704        l4proto = __nf_ct_l4proto_find(pf, protonum);
 705
  706        /* It may be a special packet, error, unclean...
  707         * the inverse of the return code tells the netfilter
  708         * core what to do with the packet. */
 709        if (l4proto->error != NULL) {
 710                ret = l4proto->error(net, skb, dataoff, &ctinfo, pf, hooknum);
 711                if (ret <= 0) {
 712                        NF_CT_STAT_INC_ATOMIC(net, error);
 713                        NF_CT_STAT_INC_ATOMIC(net, invalid);
 714                        return -ret;
 715                }
 716        }
 717
 718        ct = resolve_normal_ct(net, skb, dataoff, pf, protonum,
 719                               l3proto, l4proto, &set_reply, &ctinfo);
 720        if (!ct) {
 721                /* Not valid part of a connection */
 722                NF_CT_STAT_INC_ATOMIC(net, invalid);
 723                return NF_ACCEPT;
 724        }
 725
 726        if (IS_ERR(ct)) {
 727                /* Too stressed to deal. */
 728                NF_CT_STAT_INC_ATOMIC(net, drop);
 729                return NF_DROP;
 730        }
 731
 732        NF_CT_ASSERT(skb->nfct);
 733
 734        ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
 735        if (ret < 0) {
 736                /* Invalid: inverse of the return code tells
 737                 * the netfilter core what to do */
 738                pr_debug("nf_conntrack_in: Can't track with proto module\n");
 739                nf_conntrack_put(skb->nfct);
 740                skb->nfct = NULL;
 741                NF_CT_STAT_INC_ATOMIC(net, invalid);
 742                return -ret;
 743        }
 744
 745        if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
 746                nf_conntrack_event_cache(IPCT_STATUS, ct);
 747
 748        return ret;
 749}
 750EXPORT_SYMBOL_GPL(nf_conntrack_in);
 751
 752bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
 753                          const struct nf_conntrack_tuple *orig)
 754{
 755        bool ret;
 756
 757        rcu_read_lock();
 758        ret = nf_ct_invert_tuple(inverse, orig,
 759                                 __nf_ct_l3proto_find(orig->src.l3num),
 760                                 __nf_ct_l4proto_find(orig->src.l3num,
 761                                                      orig->dst.protonum));
 762        rcu_read_unlock();
 763        return ret;
 764}
 765EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);
 766
 767/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
 768   implicitly racy: see __nf_conntrack_confirm */
 769void nf_conntrack_alter_reply(struct nf_conn *ct,
 770                              const struct nf_conntrack_tuple *newreply)
 771{
 772        struct nf_conn_help *help = nfct_help(ct);
 773        struct nf_conntrack_helper *helper;
 774
 775        /* Should be unconfirmed, so not in hash table yet */
 776        NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
 777
 778        pr_debug("Altering reply tuple of %p to ", ct);
 779        nf_ct_dump_tuple(newreply);
 780
 781        ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
 782        if (ct->master || (help && !hlist_empty(&help->expectations)))
 783                return;
 784
 785        rcu_read_lock();
 786        helper = __nf_ct_helper_find(newreply);
 787        if (helper == NULL) {
 788                if (help)
 789                        rcu_assign_pointer(help->helper, NULL);
 790                goto out;
 791        }
 792
 793        if (help == NULL) {
 794                help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
 795                if (help == NULL)
 796                        goto out;
 797        } else {
 798                memset(&help->help, 0, sizeof(help->help));
 799        }
 800
 801        rcu_assign_pointer(help->helper, helper);
 802out:
 803        rcu_read_unlock();
 804}
 805EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
 806
 807/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
 808void __nf_ct_refresh_acct(struct nf_conn *ct,
 809                          enum ip_conntrack_info ctinfo,
 810                          const struct sk_buff *skb,
 811                          unsigned long extra_jiffies,
 812                          int do_acct)
 813{
 814        int event = 0;
 815
 816        NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
 817        NF_CT_ASSERT(skb);
 818
 819        spin_lock_bh(&nf_conntrack_lock);
 820
 821        /* Only update if this is not a fixed timeout */
 822        if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
 823                goto acct;
 824
 825        /* If not in hash table, timer will not be active yet */
 826        if (!nf_ct_is_confirmed(ct)) {
 827                ct->timeout.expires = extra_jiffies;
 828                event = IPCT_REFRESH;
 829        } else {
 830                unsigned long newtime = jiffies + extra_jiffies;
 831
 832                /* Only update the timeout if the new timeout is at least
 833                   HZ jiffies from the old timeout. Need del_timer for race
 834                   avoidance (may already be dying). */
 835                if (newtime - ct->timeout.expires >= HZ
 836                    && del_timer(&ct->timeout)) {
 837                        ct->timeout.expires = newtime;
 838                        add_timer(&ct->timeout);
 839                        event = IPCT_REFRESH;
 840                }
 841        }
 842
 843acct:
 844        if (do_acct) {
 845                struct nf_conn_counter *acct;
 846
 847                acct = nf_conn_acct_find(ct);
 848                if (acct) {
 849                        acct[CTINFO2DIR(ctinfo)].packets++;
 850                        acct[CTINFO2DIR(ctinfo)].bytes +=
 851                                skb->len - skb_network_offset(skb);
 852                }
 853        }
 854
 855        spin_unlock_bh(&nf_conntrack_lock);
 856
 857        /* must be unlocked when calling event cache */
 858        if (event)
 859                nf_conntrack_event_cache(event, ct);
 860}
 861EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
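
/* Usage note (hedged): l4 protocol trackers normally reach
 * __nf_ct_refresh_acct() through the nf_ct_refresh()/nf_ct_refresh_acct()
 * wrappers (static inlines in nf_conntrack.h), typically from their
 * ->packet() handler, along the lines of
 * nf_ct_refresh_acct(ct, ctinfo, skb, some_timeout_in_jiffies). */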
 862
 863bool __nf_ct_kill_acct(struct nf_conn *ct,
 864                       enum ip_conntrack_info ctinfo,
 865                       const struct sk_buff *skb,
 866                       int do_acct)
 867{
 868        if (do_acct) {
 869                struct nf_conn_counter *acct;
 870
 871                spin_lock_bh(&nf_conntrack_lock);
 872                acct = nf_conn_acct_find(ct);
 873                if (acct) {
 874                        acct[CTINFO2DIR(ctinfo)].packets++;
 875                        acct[CTINFO2DIR(ctinfo)].bytes +=
 876                                skb->len - skb_network_offset(skb);
 877                }
 878                spin_unlock_bh(&nf_conntrack_lock);
 879        }
 880
 881        if (del_timer(&ct->timeout)) {
 882                ct->timeout.function((unsigned long)ct);
 883                return true;
 884        }
 885        return false;
 886}
 887EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
 888
 889#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
 890
 891#include <linux/netfilter/nfnetlink.h>
 892#include <linux/netfilter/nfnetlink_conntrack.h>
 893#include <linux/mutex.h>
 894
  895/* Generic function for tcp/udp/sctp/dccp and the like.  This needs to be
  896 * in nf_conntrack_core, since we don't want the protocols to autoload
  897 * or depend on ctnetlink. */
 898int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
 899                               const struct nf_conntrack_tuple *tuple)
 900{
 901        NLA_PUT_BE16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port);
 902        NLA_PUT_BE16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port);
 903        return 0;
 904
 905nla_put_failure:
 906        return -1;
 907}
 908EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
 909
 910const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
 911        [CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
 912        [CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
 913};
 914EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
 915
 916int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
 917                               struct nf_conntrack_tuple *t)
 918{
 919        if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
 920                return -EINVAL;
 921
 922        t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
 923        t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
 924
 925        return 0;
 926}
 927EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
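
#if 0
/* Illustrative sketch (hypothetical, field names as used by the in-tree
 * port-based trackers of this era): a tcp/udp-style l4proto wires the
 * generic port helpers above into its own ops. */
static struct nf_conntrack_l4proto example_l4proto = {
        .l3proto         = PF_INET,
        .l4proto         = IPPROTO_UDP,
        .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
        .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
        .nla_policy      = nf_ct_port_nla_policy,
        /* ... plus .pkt_to_tuple, .invert_tuple, .packet, .new, ... */
};
#endif
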
 928#endif
 929
 930/* Used by ipt_REJECT and ip6t_REJECT. */
 931static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
 932{
 933        struct nf_conn *ct;
 934        enum ip_conntrack_info ctinfo;
 935
  936        /* This ICMP is in the reverse direction to the packet which caused it */
 937        ct = nf_ct_get(skb, &ctinfo);
 938        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
 939                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
 940        else
 941                ctinfo = IP_CT_RELATED;
 942
 943        /* Attach to new skbuff, and increment count */
 944        nskb->nfct = &ct->ct_general;
 945        nskb->nfctinfo = ctinfo;
 946        nf_conntrack_get(nskb->nfct);
 947}
 948
 949/* Bring out ya dead! */
 950static struct nf_conn *
 951get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 952                void *data, unsigned int *bucket)
 953{
 954        struct nf_conntrack_tuple_hash *h;
 955        struct nf_conn *ct;
 956        struct hlist_node *n;
 957
 958        spin_lock_bh(&nf_conntrack_lock);
 959        for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
 960                hlist_for_each_entry(h, n, &net->ct.hash[*bucket], hnode) {
 961                        ct = nf_ct_tuplehash_to_ctrack(h);
 962                        if (iter(ct, data))
 963                                goto found;
 964                }
 965        }
 966        hlist_for_each_entry(h, n, &net->ct.unconfirmed, hnode) {
 967                ct = nf_ct_tuplehash_to_ctrack(h);
 968                if (iter(ct, data))
 969                        set_bit(IPS_DYING_BIT, &ct->status);
 970        }
 971        spin_unlock_bh(&nf_conntrack_lock);
 972        return NULL;
 973found:
 974        atomic_inc(&ct->ct_general.use);
 975        spin_unlock_bh(&nf_conntrack_lock);
 976        return ct;
 977}
 978
 979void nf_ct_iterate_cleanup(struct net *net,
 980                           int (*iter)(struct nf_conn *i, void *data),
 981                           void *data)
 982{
 983        struct nf_conn *ct;
 984        unsigned int bucket = 0;
 985
 986        while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
  987                /* Time to push up daisies... */
 988                if (del_timer(&ct->timeout))
 989                        death_by_timeout((unsigned long)ct);
 990                /* ... else the timer will get him soon. */
 991
 992                nf_ct_put(ct);
 993        }
 994}
 995EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
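
#if 0
/* Illustrative sketch (hypothetical helpers, not part of this file): the
 * iterator is called for every conntrack entry with an opaque data pointer;
 * returning non-zero selects that entry for removal.  Here: flush everything
 * using a given l4 protocol number. */
static int example_kill_l4proto(struct nf_conn *i, void *data)
{
        return nf_ct_protonum(i) == *(u_int8_t *)data;
}

static void example_flush_l4proto(struct net *net, u_int8_t proto)
{
        nf_ct_iterate_cleanup(net, example_kill_l4proto, &proto);
}
#endif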
 996
 997static int kill_all(struct nf_conn *i, void *data)
 998{
 999        return 1;
1000}
1001
1002void nf_ct_free_hashtable(struct hlist_head *hash, int vmalloced, unsigned int size)
1003{
1004        if (vmalloced)
1005                vfree(hash);
1006        else
1007                free_pages((unsigned long)hash,
1008                           get_order(sizeof(struct hlist_head) * size));
1009}
1010EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
1011
1012void nf_conntrack_flush(struct net *net)
1013{
1014        nf_ct_iterate_cleanup(net, kill_all, NULL);
1015}
1016EXPORT_SYMBOL_GPL(nf_conntrack_flush);
1017
1018static void nf_conntrack_cleanup_init_net(void)
1019{
1020        nf_conntrack_helper_fini();
1021        nf_conntrack_proto_fini();
1022        kmem_cache_destroy(nf_conntrack_cachep);
1023}
1024
1025static void nf_conntrack_cleanup_net(struct net *net)
1026{
1027        nf_ct_event_cache_flush(net);
1028        nf_conntrack_ecache_fini(net);
1029 i_see_dead_people:
1030        nf_conntrack_flush(net);
1031        if (atomic_read(&net->ct.count) != 0) {
1032                schedule();
1033                goto i_see_dead_people;
1034        }
1035        /* wait until all references to nf_conntrack_untracked are dropped */
1036        while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
1037                schedule();
1038
1039        nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
1040                             nf_conntrack_htable_size);
1041        nf_conntrack_acct_fini(net);
1042        nf_conntrack_expect_fini(net);
1043        free_percpu(net->ct.stat);
1044}
1045
1046/* Mishearing the voices in his head, our hero wonders how he's
1047   supposed to kill the mall. */
1048void nf_conntrack_cleanup(struct net *net)
1049{
1050        if (net_eq(net, &init_net))
1051                rcu_assign_pointer(ip_ct_attach, NULL);
1052
1053        /* This makes sure all current packets have passed through
1054           netfilter framework.  Roll on, two-stage module
1055           delete... */
1056        synchronize_net();
1057
1058        nf_conntrack_cleanup_net(net);
1059
1060        if (net_eq(net, &init_net)) {
1061                rcu_assign_pointer(nf_ct_destroy, NULL);
1062                nf_conntrack_cleanup_init_net();
1063        }
1064}
1065
1066struct hlist_head *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced)
1067{
1068        struct hlist_head *hash;
1069        unsigned int size, i;
1070
1071        *vmalloced = 0;
1072
1073        size = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_head));
1074        hash = (void*)__get_free_pages(GFP_KERNEL|__GFP_NOWARN,
1075                                       get_order(sizeof(struct hlist_head)
1076                                                 * size));
1077        if (!hash) {
1078                *vmalloced = 1;
1079                printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1080                hash = vmalloc(sizeof(struct hlist_head) * size);
1081        }
1082
1083        if (hash)
1084                for (i = 0; i < size; i++)
1085                        INIT_HLIST_HEAD(&hash[i]);
1086
1087        return hash;
1088}
1089EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
1090
1091int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
1092{
1093        int i, bucket, vmalloced, old_vmalloced;
1094        unsigned int hashsize, old_size;
1095        int rnd;
1096        struct hlist_head *hash, *old_hash;
1097        struct nf_conntrack_tuple_hash *h;
1098
1099        /* On boot, we can set this without any fancy locking. */
1100        if (!nf_conntrack_htable_size)
1101                return param_set_uint(val, kp);
1102
1103        hashsize = simple_strtoul(val, NULL, 0);
1104        if (!hashsize)
1105                return -EINVAL;
1106
1107        hash = nf_ct_alloc_hashtable(&hashsize, &vmalloced);
1108        if (!hash)
1109                return -ENOMEM;
1110
 1111        /* We have to rehash for the new table anyway, so we can also
 1112         * use a new random seed. */
1113        get_random_bytes(&rnd, 4);
1114
1115        /* Lookups in the old hash might happen in parallel, which means we
1116         * might get false negatives during connection lookup. New connections
1117         * created because of a false negative won't make it into the hash
1118         * though since that required taking the lock.
1119         */
1120        spin_lock_bh(&nf_conntrack_lock);
1121        for (i = 0; i < nf_conntrack_htable_size; i++) {
1122                while (!hlist_empty(&init_net.ct.hash[i])) {
1123                        h = hlist_entry(init_net.ct.hash[i].first,
1124                                        struct nf_conntrack_tuple_hash, hnode);
1125                        hlist_del_rcu(&h->hnode);
1126                        bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1127                        hlist_add_head(&h->hnode, &hash[bucket]);
1128                }
1129        }
1130        old_size = nf_conntrack_htable_size;
1131        old_vmalloced = init_net.ct.hash_vmalloc;
1132        old_hash = init_net.ct.hash;
1133
1134        nf_conntrack_htable_size = hashsize;
1135        init_net.ct.hash_vmalloc = vmalloced;
1136        init_net.ct.hash = hash;
1137        nf_conntrack_hash_rnd = rnd;
1138        spin_unlock_bh(&nf_conntrack_lock);
1139
1140        nf_ct_free_hashtable(old_hash, old_vmalloced, old_size);
1141        return 0;
1142}
1143EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
1144
1145module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
1146                  &nf_conntrack_htable_size, 0600);
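
/* Note: with the 0600 mode above, the bucket count is also writable at
 * runtime, e.g. through /sys/module/nf_conntrack/parameters/hashsize
 * (assuming sysfs is mounted); such writes end up in
 * nf_conntrack_set_hashsize() and trigger the rehash loop there. */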
1147
1148static int nf_conntrack_init_init_net(void)
1149{
1150        int max_factor = 8;
1151        int ret;
1152
1153        /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1154         * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
1155        if (!nf_conntrack_htable_size) {
1156                nf_conntrack_htable_size
1157                        = (((num_physpages << PAGE_SHIFT) / 16384)
1158                           / sizeof(struct hlist_head));
1159                if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1160                        nf_conntrack_htable_size = 16384;
1161                if (nf_conntrack_htable_size < 32)
1162                        nf_conntrack_htable_size = 32;
1163
1164                /* Use a max. factor of four by default to get the same max as
1165                 * with the old struct list_heads. When a table size is given
1166                 * we use the old value of 8 to avoid reducing the max.
1167                 * entries. */
1168                max_factor = 4;
1169        }
1170        nf_conntrack_max = max_factor * nf_conntrack_htable_size;
1171
1172        printk("nf_conntrack version %s (%u buckets, %d max)\n",
1173               NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1174               nf_conntrack_max);
1175
1176        nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
1177                                                sizeof(struct nf_conn),
1178                                                0, 0, NULL);
1179        if (!nf_conntrack_cachep) {
1180                printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1181                ret = -ENOMEM;
1182                goto err_cache;
1183        }
1184
1185        ret = nf_conntrack_proto_init();
1186        if (ret < 0)
1187                goto err_proto;
1188
1189        ret = nf_conntrack_helper_init();
1190        if (ret < 0)
1191                goto err_helper;
1192
1193        return 0;
1194
1195err_helper:
1196        nf_conntrack_proto_fini();
1197err_proto:
1198        kmem_cache_destroy(nf_conntrack_cachep);
1199err_cache:
1200        return ret;
1201}
1202
1203static int nf_conntrack_init_net(struct net *net)
1204{
1205        int ret;
1206
1207        atomic_set(&net->ct.count, 0);
1208        INIT_HLIST_HEAD(&net->ct.unconfirmed);
1209        net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
1210        if (!net->ct.stat) {
1211                ret = -ENOMEM;
1212                goto err_stat;
1213        }
1214        ret = nf_conntrack_ecache_init(net);
1215        if (ret < 0)
1216                goto err_ecache;
1217        net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
1218                                                  &net->ct.hash_vmalloc);
1219        if (!net->ct.hash) {
1220                ret = -ENOMEM;
1221                printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1222                goto err_hash;
1223        }
1224        ret = nf_conntrack_expect_init(net);
1225        if (ret < 0)
1226                goto err_expect;
1227        ret = nf_conntrack_acct_init(net);
1228        if (ret < 0)
1229                goto err_acct;
1230
1231        /* Set up fake conntrack:
1232            - to never be deleted, not in any hashes */
1233#ifdef CONFIG_NET_NS
1234        nf_conntrack_untracked.ct_net = &init_net;
1235#endif
1236        atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
 1237        /*  - and make it look like a confirmed connection */
1238        set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
1239
1240        return 0;
1241
1242err_acct:
1243        nf_conntrack_expect_fini(net);
1244err_expect:
1245        nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
1246                             nf_conntrack_htable_size);
1247err_hash:
1248        nf_conntrack_ecache_fini(net);
1249err_ecache:
1250        free_percpu(net->ct.stat);
1251err_stat:
1252        return ret;
1253}
1254
1255int nf_conntrack_init(struct net *net)
1256{
1257        int ret;
1258
1259        if (net_eq(net, &init_net)) {
1260                ret = nf_conntrack_init_init_net();
1261                if (ret < 0)
1262                        goto out_init_net;
1263        }
1264        ret = nf_conntrack_init_net(net);
1265        if (ret < 0)
1266                goto out_net;
1267
1268        if (net_eq(net, &init_net)) {
1269                /* For use by REJECT target */
1270                rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach);
1271                rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);
1272        }
1273        return 0;
1274
1275out_net:
1276        if (net_eq(net, &init_net))
1277                nf_conntrack_cleanup_init_net();
1278out_init_net:
1279        return ret;
1280}
1281