linux/net/ipv4/ipvs/ip_vs_conn.c
<<
>>
Prefs
   1/*
   2 * IPVS         An implementation of the IP virtual server support for the
   3 *              LINUX operating system.  IPVS is now implemented as a module
   4 *              over the Netfilter framework. IPVS can be used to build a
   5 *              high-performance and highly available server based on a
   6 *              cluster of servers.
   7 *
   8 * Version:     $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $
   9 *
  10 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
  11 *              Peter Kese <peter.kese@ijs.si>
  12 *              Julian Anastasov <ja@ssi.bg>
  13 *
  14 *              This program is free software; you can redistribute it and/or
  15 *              modify it under the terms of the GNU General Public License
  16 *              as published by the Free Software Foundation; either version
  17 *              2 of the License, or (at your option) any later version.
  18 *
  19 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
  20 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
   21 * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
  22 *
  23 * Changes:
  24 *
  25 */
  26
  27#include <linux/interrupt.h>
  28#include <linux/in.h>
  29#include <linux/net.h>
  30#include <linux/kernel.h>
  31#include <linux/module.h>
  32#include <linux/vmalloc.h>
  33#include <linux/proc_fs.h>              /* for proc_net_* */
  34#include <linux/seq_file.h>
  35#include <linux/jhash.h>
  36#include <linux/random.h>
  37
  38#include <net/net_namespace.h>
  39#include <net/ip_vs.h>
  40
  41
  42/*
  43 *  Connection hash table: for input and output packets lookups of IPVS
  44 */
  45static struct list_head *ip_vs_conn_tab;
  46
  47/*  SLAB cache for IPVS connections */
  48static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
  49
  50/*  counter for current IPVS connections */
  51static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
  52
  53/*  counter for no client port connections */
  54static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
  55
  56/* random value for IPVS connection hash */
  57static unsigned int ip_vs_conn_rnd;
  58
  59/*
  60 *  Fine locking granularity for big connection hash table
  61 */
  62#define CT_LOCKARRAY_BITS  4
  63#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
  64#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)
  65
  66struct ip_vs_aligned_lock
  67{
  68        rwlock_t        l;
  69} __attribute__((__aligned__(SMP_CACHE_BYTES)));
  70
  71/* lock array for conn table */
  72static struct ip_vs_aligned_lock
  73__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
  74
  75static inline void ct_read_lock(unsigned key)
  76{
  77        read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
  78}
  79
  80static inline void ct_read_unlock(unsigned key)
  81{
  82        read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
  83}
  84
  85static inline void ct_write_lock(unsigned key)
  86{
  87        write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
  88}
  89
  90static inline void ct_write_unlock(unsigned key)
  91{
  92        write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
  93}
  94
  95static inline void ct_read_lock_bh(unsigned key)
  96{
  97        read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
  98}
  99
 100static inline void ct_read_unlock_bh(unsigned key)
 101{
 102        read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
 103}
 104
 105static inline void ct_write_lock_bh(unsigned key)
 106{
 107        write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
 108}
 109
 110static inline void ct_write_unlock_bh(unsigned key)
 111{
 112        write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
 113}
 114
 115
 116/*
 117 *      Returns hash value for IPVS connection entry
 118 */
 119static unsigned int ip_vs_conn_hashkey(unsigned proto, __be32 addr, __be16 port)
 120{
 121        return jhash_3words((__force u32)addr, (__force u32)port, proto, ip_vs_conn_rnd)
 122                & IP_VS_CONN_TAB_MASK;
 123}
 124
 125
 126/*
 127 *      Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
 128 *      returns bool success.
 129 */
 130static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 131{
 132        unsigned hash;
 133        int ret;
 134
 135        /* Hash by protocol, client address and port */
 136        hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
 137
 138        ct_write_lock(hash);
 139
 140        if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
 141                list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
 142                cp->flags |= IP_VS_CONN_F_HASHED;
 143                atomic_inc(&cp->refcnt);
 144                ret = 1;
 145        } else {
 146                IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
 147                          "called from %p\n", __builtin_return_address(0));
 148                ret = 0;
 149        }
 150
 151        ct_write_unlock(hash);
 152
 153        return ret;
 154}
 155
 156
 157/*
 158 *      UNhashes ip_vs_conn from ip_vs_conn_tab.
 159 *      returns bool success.
 160 */
 161static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 162{
 163        unsigned hash;
 164        int ret;
 165
 166        /* unhash it and decrease its reference counter */
 167        hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
 168
 169        ct_write_lock(hash);
 170
 171        if (cp->flags & IP_VS_CONN_F_HASHED) {
 172                list_del(&cp->c_list);
 173                cp->flags &= ~IP_VS_CONN_F_HASHED;
 174                atomic_dec(&cp->refcnt);
 175                ret = 1;
 176        } else
 177                ret = 0;
 178
 179        ct_write_unlock(hash);
 180
 181        return ret;
 182}
 183
 184
 185/*
 186 *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
 187 *  Called for pkts coming from OUTside-to-INside.
 188 *      s_addr, s_port: pkt source address (foreign host)
 189 *      d_addr, d_port: pkt dest address (load balancer)
 190 */
 191static inline struct ip_vs_conn *__ip_vs_conn_in_get
 192(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port)
 193{
 194        unsigned hash;
 195        struct ip_vs_conn *cp;
 196
 197        hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
 198
 199        ct_read_lock(hash);
 200
 201        list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
 202                if (s_addr==cp->caddr && s_port==cp->cport &&
 203                    d_port==cp->vport && d_addr==cp->vaddr &&
 204                    ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
 205                    protocol==cp->protocol) {
 206                        /* HIT */
 207                        atomic_inc(&cp->refcnt);
 208                        ct_read_unlock(hash);
 209                        return cp;
 210                }
 211        }
 212
 213        ct_read_unlock(hash);
 214
 215        return NULL;
 216}
 217
 218struct ip_vs_conn *ip_vs_conn_in_get
 219(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port)
 220{
 221        struct ip_vs_conn *cp;
 222
 223        cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
 224        if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
 225                cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
 226
 227        IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
 228                  ip_vs_proto_name(protocol),
 229                  NIPQUAD(s_addr), ntohs(s_port),
 230                  NIPQUAD(d_addr), ntohs(d_port),
 231                  cp?"hit":"not hit");
 232
 233        return cp;
 234}
 235
 236/* Get reference to connection template */
 237struct ip_vs_conn *ip_vs_ct_in_get
 238(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port)
 239{
 240        unsigned hash;
 241        struct ip_vs_conn *cp;
 242
 243        hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
 244
 245        ct_read_lock(hash);
 246
 247        list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
 248                if (s_addr==cp->caddr && s_port==cp->cport &&
 249                    d_port==cp->vport && d_addr==cp->vaddr &&
 250                    cp->flags & IP_VS_CONN_F_TEMPLATE &&
 251                    protocol==cp->protocol) {
 252                        /* HIT */
 253                        atomic_inc(&cp->refcnt);
 254                        goto out;
 255                }
 256        }
 257        cp = NULL;
 258
 259  out:
 260        ct_read_unlock(hash);
 261
 262        IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
 263                  ip_vs_proto_name(protocol),
 264                  NIPQUAD(s_addr), ntohs(s_port),
 265                  NIPQUAD(d_addr), ntohs(d_port),
 266                  cp?"hit":"not hit");
 267
 268        return cp;
 269}
 270
 271/*
 272 *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
 273 *  Called for pkts coming from inside-to-OUTside.
 274 *      s_addr, s_port: pkt source address (inside host)
 275 *      d_addr, d_port: pkt dest address (foreign host)
 276 */
 277struct ip_vs_conn *ip_vs_conn_out_get
 278(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port)
 279{
 280        unsigned hash;
 281        struct ip_vs_conn *cp, *ret=NULL;
 282
 283        /*
 284         *      Check for "full" addressed entries
 285         */
 286        hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
 287
 288        ct_read_lock(hash);
 289
 290        list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
 291                if (d_addr == cp->caddr && d_port == cp->cport &&
 292                    s_port == cp->dport && s_addr == cp->daddr &&
 293                    protocol == cp->protocol) {
 294                        /* HIT */
 295                        atomic_inc(&cp->refcnt);
 296                        ret = cp;
 297                        break;
 298                }
 299        }
 300
 301        ct_read_unlock(hash);
 302
 303        IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
 304                  ip_vs_proto_name(protocol),
 305                  NIPQUAD(s_addr), ntohs(s_port),
 306                  NIPQUAD(d_addr), ntohs(d_port),
 307                  ret?"hit":"not hit");
 308
 309        return ret;
 310}
 311
 312
 313/*
 314 *      Put back the conn and restart its timer with its timeout
 315 */
 316void ip_vs_conn_put(struct ip_vs_conn *cp)
 317{
 318        /* reset it expire in its timeout */
 319        mod_timer(&cp->timer, jiffies+cp->timeout);
 320
 321        __ip_vs_conn_put(cp);
 322}
 323
 324
 325/*
 326 *      Fill a no_client_port connection with a client port number
 327 */
 328void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
 329{
 330        if (ip_vs_conn_unhash(cp)) {
 331                spin_lock(&cp->lock);
 332                if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
 333                        atomic_dec(&ip_vs_conn_no_cport_cnt);
 334                        cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
 335                        cp->cport = cport;
 336                }
 337                spin_unlock(&cp->lock);
 338
 339                /* hash on new dport */
 340                ip_vs_conn_hash(cp);
 341        }
 342}
 343
 344
 345/*
 346 *      Bind a connection entry with the corresponding packet_xmit.
 347 *      Called by ip_vs_conn_new.
 348 */
 349static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
 350{
 351        switch (IP_VS_FWD_METHOD(cp)) {
 352        case IP_VS_CONN_F_MASQ:
 353                cp->packet_xmit = ip_vs_nat_xmit;
 354                break;
 355
 356        case IP_VS_CONN_F_TUNNEL:
 357                cp->packet_xmit = ip_vs_tunnel_xmit;
 358                break;
 359
 360        case IP_VS_CONN_F_DROUTE:
 361                cp->packet_xmit = ip_vs_dr_xmit;
 362                break;
 363
 364        case IP_VS_CONN_F_LOCALNODE:
 365                cp->packet_xmit = ip_vs_null_xmit;
 366                break;
 367
 368        case IP_VS_CONN_F_BYPASS:
 369                cp->packet_xmit = ip_vs_bypass_xmit;
 370                break;
 371        }
 372}
 373
 374
 375static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
 376{
 377        return atomic_read(&dest->activeconns)
 378                + atomic_read(&dest->inactconns);
 379}
 380
 381/*
 382 *      Bind a connection entry with a virtual service destination
 383 *      Called just after a new connection entry is created.
 384 */
 385static inline void
 386ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
 387{
 388        /* if dest is NULL, then return directly */
 389        if (!dest)
 390                return;
 391
 392        /* Increase the refcnt counter of the dest */
 393        atomic_inc(&dest->refcnt);
 394
 395        /* Bind with the destination and its corresponding transmitter */
 396        if ((cp->flags & IP_VS_CONN_F_SYNC) &&
 397            (!(cp->flags & IP_VS_CONN_F_TEMPLATE)))
 398                /* if the connection is not template and is created
 399                 * by sync, preserve the activity flag.
 400                 */
 401                cp->flags |= atomic_read(&dest->conn_flags) &
 402                             (~IP_VS_CONN_F_INACTIVE);
 403        else
 404                cp->flags |= atomic_read(&dest->conn_flags);
 405        cp->dest = dest;
 406
 407        IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
 408                  "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
 409                  "dest->refcnt:%d\n",
 410                  ip_vs_proto_name(cp->protocol),
 411                  NIPQUAD(cp->caddr), ntohs(cp->cport),
 412                  NIPQUAD(cp->vaddr), ntohs(cp->vport),
 413                  NIPQUAD(cp->daddr), ntohs(cp->dport),
 414                  ip_vs_fwd_tag(cp), cp->state,
 415                  cp->flags, atomic_read(&cp->refcnt),
 416                  atomic_read(&dest->refcnt));
 417
 418        /* Update the connection counters */
 419        if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
 420                /* It is a normal connection, so increase the inactive
 421                   connection counter because it is in TCP SYNRECV
 422                   state (inactive) or other protocol inacive state */
 423                if ((cp->flags & IP_VS_CONN_F_SYNC) &&
 424                    (!(cp->flags & IP_VS_CONN_F_INACTIVE)))
 425                        atomic_inc(&dest->activeconns);
 426                else
 427                        atomic_inc(&dest->inactconns);
 428        } else {
 429                /* It is a persistent connection/template, so increase
 430                   the peristent connection counter */
 431                atomic_inc(&dest->persistconns);
 432        }
 433
 434        if (dest->u_threshold != 0 &&
 435            ip_vs_dest_totalconns(dest) >= dest->u_threshold)
 436                dest->flags |= IP_VS_DEST_F_OVERLOAD;
 437}
 438
 439
 440/*
 441 * Check if there is a destination for the connection, if so
 442 * bind the connection to the destination.
 443 */
 444struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
 445{
 446        struct ip_vs_dest *dest;
 447
 448        if ((cp) && (!cp->dest)) {
 449                dest = ip_vs_find_dest(cp->daddr, cp->dport,
 450                                       cp->vaddr, cp->vport, cp->protocol);
 451                ip_vs_bind_dest(cp, dest);
 452                return dest;
 453        } else
 454                return NULL;
 455}
 456
 457
 458/*
 459 *      Unbind a connection entry with its VS destination
 460 *      Called by the ip_vs_conn_expire function.
 461 */
 462static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
 463{
 464        struct ip_vs_dest *dest = cp->dest;
 465
 466        if (!dest)
 467                return;
 468
 469        IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
 470                  "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
 471                  "dest->refcnt:%d\n",
 472                  ip_vs_proto_name(cp->protocol),
 473                  NIPQUAD(cp->caddr), ntohs(cp->cport),
 474                  NIPQUAD(cp->vaddr), ntohs(cp->vport),
 475                  NIPQUAD(cp->daddr), ntohs(cp->dport),
 476                  ip_vs_fwd_tag(cp), cp->state,
 477                  cp->flags, atomic_read(&cp->refcnt),
 478                  atomic_read(&dest->refcnt));
 479
 480        /* Update the connection counters */
 481        if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
 482                /* It is a normal connection, so decrease the inactconns
 483                   or activeconns counter */
 484                if (cp->flags & IP_VS_CONN_F_INACTIVE) {
 485                        atomic_dec(&dest->inactconns);
 486                } else {
 487                        atomic_dec(&dest->activeconns);
 488                }
 489        } else {
 490                /* It is a persistent connection/template, so decrease
 491                   the peristent connection counter */
 492                atomic_dec(&dest->persistconns);
 493        }
 494
 495        if (dest->l_threshold != 0) {
 496                if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
 497                        dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
 498        } else if (dest->u_threshold != 0) {
 499                if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
 500                        dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
 501        } else {
 502                if (dest->flags & IP_VS_DEST_F_OVERLOAD)
 503                        dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
 504        }
 505
 506        /*
 507         * Simply decrease the refcnt of the dest, because the
 508         * dest will be either in service's destination list
 509         * or in the trash.
 510         */
 511        atomic_dec(&dest->refcnt);
 512}
 513
 514
 515/*
 516 *      Checking if the destination of a connection template is available.
 517 *      If available, return 1, otherwise invalidate this connection
 518 *      template and return 0.
 519 */
 520int ip_vs_check_template(struct ip_vs_conn *ct)
 521{
 522        struct ip_vs_dest *dest = ct->dest;
 523
 524        /*
 525         * Checking the dest server status.
 526         */
 527        if ((dest == NULL) ||
 528            !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
 529            (sysctl_ip_vs_expire_quiescent_template &&
 530             (atomic_read(&dest->weight) == 0))) {
 531                IP_VS_DBG(9, "check_template: dest not available for "
 532                          "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
 533                          "-> d:%u.%u.%u.%u:%d\n",
 534                          ip_vs_proto_name(ct->protocol),
 535                          NIPQUAD(ct->caddr), ntohs(ct->cport),
 536                          NIPQUAD(ct->vaddr), ntohs(ct->vport),
 537                          NIPQUAD(ct->daddr), ntohs(ct->dport));
 538
 539                /*
 540                 * Invalidate the connection template
 541                 */
 542                if (ct->vport != htons(0xffff)) {
 543                        if (ip_vs_conn_unhash(ct)) {
 544                                ct->dport = htons(0xffff);
 545                                ct->vport = htons(0xffff);
 546                                ct->cport = 0;
 547                                ip_vs_conn_hash(ct);
 548                        }
 549                }
 550
 551                /*
 552                 * Simply decrease the refcnt of the template,
 553                 * don't restart its timer.
 554                 */
 555                atomic_dec(&ct->refcnt);
 556                return 0;
 557        }
 558        return 1;
 559}
 560
 561static void ip_vs_conn_expire(unsigned long data)
 562{
 563        struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
 564
 565        cp->timeout = 60*HZ;
 566
 567        /*
 568         *      hey, I'm using it
 569         */
 570        atomic_inc(&cp->refcnt);
 571
 572        /*
 573         *      do I control anybody?
 574         */
 575        if (atomic_read(&cp->n_control))
 576                goto expire_later;
 577
 578        /*
 579         *      unhash it if it is hashed in the conn table
 580         */
 581        if (!ip_vs_conn_unhash(cp))
 582                goto expire_later;
 583
 584        /*
 585         *      refcnt==1 implies I'm the only one referrer
 586         */
 587        if (likely(atomic_read(&cp->refcnt) == 1)) {
 588                /* delete the timer if it is activated by other users */
 589                if (timer_pending(&cp->timer))
 590                        del_timer(&cp->timer);
 591
 592                /* does anybody control me? */
 593                if (cp->control)
 594                        ip_vs_control_del(cp);
 595
 596                if (unlikely(cp->app != NULL))
 597                        ip_vs_unbind_app(cp);
 598                ip_vs_unbind_dest(cp);
 599                if (cp->flags & IP_VS_CONN_F_NO_CPORT)
 600                        atomic_dec(&ip_vs_conn_no_cport_cnt);
 601                atomic_dec(&ip_vs_conn_count);
 602
 603                kmem_cache_free(ip_vs_conn_cachep, cp);
 604                return;
 605        }
 606
 607        /* hash it back to the table */
 608        ip_vs_conn_hash(cp);
 609
 610  expire_later:
 611        IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
 612                  atomic_read(&cp->refcnt)-1,
 613                  atomic_read(&cp->n_control));
 614
 615        ip_vs_conn_put(cp);
 616}
 617
 618
 619void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
 620{
 621        if (del_timer(&cp->timer))
 622                mod_timer(&cp->timer, jiffies);
 623}
 624
 625
 626/*
 627 *      Create a new connection entry and hash it into the ip_vs_conn_tab
 628 */
 629struct ip_vs_conn *
 630ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport,
 631               __be32 daddr, __be16 dport, unsigned flags,
 632               struct ip_vs_dest *dest)
 633{
 634        struct ip_vs_conn *cp;
 635        struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
 636
 637        cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
 638        if (cp == NULL) {
 639                IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
 640                return NULL;
 641        }
 642
 643        INIT_LIST_HEAD(&cp->c_list);
 644        setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
 645        cp->protocol       = proto;
 646        cp->caddr          = caddr;
 647        cp->cport          = cport;
 648        cp->vaddr          = vaddr;
 649        cp->vport          = vport;
 650        cp->daddr          = daddr;
 651        cp->dport          = dport;
 652        cp->flags          = flags;
 653        spin_lock_init(&cp->lock);
 654
 655        /*
 656         * Set the entry is referenced by the current thread before hashing
 657         * it in the table, so that other thread run ip_vs_random_dropentry
 658         * but cannot drop this entry.
 659         */
 660        atomic_set(&cp->refcnt, 1);
 661
 662        atomic_set(&cp->n_control, 0);
 663        atomic_set(&cp->in_pkts, 0);
 664
 665        atomic_inc(&ip_vs_conn_count);
 666        if (flags & IP_VS_CONN_F_NO_CPORT)
 667                atomic_inc(&ip_vs_conn_no_cport_cnt);
 668
 669        /* Bind the connection with a destination server */
 670        ip_vs_bind_dest(cp, dest);
 671
 672        /* Set its state and timeout */
 673        cp->state = 0;
 674        cp->timeout = 3*HZ;
 675
 676        /* Bind its packet transmitter */
 677        ip_vs_bind_xmit(cp);
 678
 679        if (unlikely(pp && atomic_read(&pp->appcnt)))
 680                ip_vs_bind_app(cp, pp);
 681
 682        /* Hash it in the ip_vs_conn_tab finally */
 683        ip_vs_conn_hash(cp);
 684
 685        return cp;
 686}
 687
 688
 689/*
 690 *      /proc/net/ip_vs_conn entries
 691 */
 692#ifdef CONFIG_PROC_FS
 693
 694static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
 695{
 696        int idx;
 697        struct ip_vs_conn *cp;
 698
 699        for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
 700                ct_read_lock_bh(idx);
 701                list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
 702                        if (pos-- == 0) {
 703                                seq->private = &ip_vs_conn_tab[idx];
 704                                return cp;
 705                        }
 706                }
 707                ct_read_unlock_bh(idx);
 708        }
 709
 710        return NULL;
 711}
 712
 713static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
 714{
 715        seq->private = NULL;
 716        return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
 717}
 718
 719static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 720{
 721        struct ip_vs_conn *cp = v;
 722        struct list_head *e, *l = seq->private;
 723        int idx;
 724
 725        ++*pos;
 726        if (v == SEQ_START_TOKEN)
 727                return ip_vs_conn_array(seq, 0);
 728
 729        /* more on same hash chain? */
 730        if ((e = cp->c_list.next) != l)
 731                return list_entry(e, struct ip_vs_conn, c_list);
 732
 733        idx = l - ip_vs_conn_tab;
 734        ct_read_unlock_bh(idx);
 735
 736        while (++idx < IP_VS_CONN_TAB_SIZE) {
 737                ct_read_lock_bh(idx);
 738                list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
 739                        seq->private = &ip_vs_conn_tab[idx];
 740                        return cp;
 741                }
 742                ct_read_unlock_bh(idx);
 743        }
 744        seq->private = NULL;
 745        return NULL;
 746}
 747
 748static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
 749{
 750        struct list_head *l = seq->private;
 751
 752        if (l)
 753                ct_read_unlock_bh(l - ip_vs_conn_tab);
 754}
 755
 756static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
 757{
 758
 759        if (v == SEQ_START_TOKEN)
 760                seq_puts(seq,
 761   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires\n");
 762        else {
 763                const struct ip_vs_conn *cp = v;
 764
 765                seq_printf(seq,
 766                        "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n",
 767                                ip_vs_proto_name(cp->protocol),
 768                                ntohl(cp->caddr), ntohs(cp->cport),
 769                                ntohl(cp->vaddr), ntohs(cp->vport),
 770                                ntohl(cp->daddr), ntohs(cp->dport),
 771                                ip_vs_state_name(cp->protocol, cp->state),
 772                                (cp->timer.expires-jiffies)/HZ);
 773        }
 774        return 0;
 775}
 776
 777static const struct seq_operations ip_vs_conn_seq_ops = {
 778        .start = ip_vs_conn_seq_start,
 779        .next  = ip_vs_conn_seq_next,
 780        .stop  = ip_vs_conn_seq_stop,
 781        .show  = ip_vs_conn_seq_show,
 782};
 783
 784static int ip_vs_conn_open(struct inode *inode, struct file *file)
 785{
 786        return seq_open(file, &ip_vs_conn_seq_ops);
 787}
 788
 789static const struct file_operations ip_vs_conn_fops = {
 790        .owner   = THIS_MODULE,
 791        .open    = ip_vs_conn_open,
 792        .read    = seq_read,
 793        .llseek  = seq_lseek,
 794        .release = seq_release,
 795};
 796
 797static const char *ip_vs_origin_name(unsigned flags)
 798{
 799        if (flags & IP_VS_CONN_F_SYNC)
 800                return "SYNC";
 801        else
 802                return "LOCAL";
 803}
 804
 805static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
 806{
 807
 808        if (v == SEQ_START_TOKEN)
 809                seq_puts(seq,
 810   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Origin Expires\n");
 811        else {
 812                const struct ip_vs_conn *cp = v;
 813
 814                seq_printf(seq,
 815                        "%-3s %08X %04X %08X %04X %08X %04X %-11s %-6s %7lu\n",
 816                                ip_vs_proto_name(cp->protocol),
 817                                ntohl(cp->caddr), ntohs(cp->cport),
 818                                ntohl(cp->vaddr), ntohs(cp->vport),
 819                                ntohl(cp->daddr), ntohs(cp->dport),
 820                                ip_vs_state_name(cp->protocol, cp->state),
 821                                ip_vs_origin_name(cp->flags),
 822                                (cp->timer.expires-jiffies)/HZ);
 823        }
 824        return 0;
 825}
 826
 827static const struct seq_operations ip_vs_conn_sync_seq_ops = {
 828        .start = ip_vs_conn_seq_start,
 829        .next  = ip_vs_conn_seq_next,
 830        .stop  = ip_vs_conn_seq_stop,
 831        .show  = ip_vs_conn_sync_seq_show,
 832};
 833
 834static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
 835{
 836        return seq_open(file, &ip_vs_conn_sync_seq_ops);
 837}
 838
 839static const struct file_operations ip_vs_conn_sync_fops = {
 840        .owner   = THIS_MODULE,
 841        .open    = ip_vs_conn_sync_open,
 842        .read    = seq_read,
 843        .llseek  = seq_lseek,
 844        .release = seq_release,
 845};
 846
 847#endif
 848
 849
 850/*
 851 *      Randomly drop connection entries before running out of memory
 852 */
 853static inline int todrop_entry(struct ip_vs_conn *cp)
 854{
 855        /*
 856         * The drop rate array needs tuning for real environments.
 857         * Called from timer bh only => no locking
 858         */
 859        static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
 860        static char todrop_counter[9] = {0};
 861        int i;
 862
 863        /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
 864           This will leave enough time for normal connection to get
 865           through. */
 866        if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
 867                return 0;
 868
 869        /* Don't drop the entry if its number of incoming packets is not
 870           located in [0, 8] */
 871        i = atomic_read(&cp->in_pkts);
 872        if (i > 8 || i < 0) return 0;
 873
 874        if (!todrop_rate[i]) return 0;
 875        if (--todrop_counter[i] > 0) return 0;
 876
 877        todrop_counter[i] = todrop_rate[i];
 878        return 1;
 879}
 880
 881/* Called from keventd and must protect itself from softirqs */
 882void ip_vs_random_dropentry(void)
 883{
 884        int idx;
 885        struct ip_vs_conn *cp;
 886
 887        /*
 888         * Randomly scan 1/32 of the whole table every second
 889         */
 890        for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
 891                unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
 892
 893                /*
 894                 *  Lock is actually needed in this loop.
 895                 */
 896                ct_write_lock_bh(hash);
 897
 898                list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
 899                        if (cp->flags & IP_VS_CONN_F_TEMPLATE)
 900                                /* connection template */
 901                                continue;
 902
 903                        if (cp->protocol == IPPROTO_TCP) {
 904                                switch(cp->state) {
 905                                case IP_VS_TCP_S_SYN_RECV:
 906                                case IP_VS_TCP_S_SYNACK:
 907                                        break;
 908
 909                                case IP_VS_TCP_S_ESTABLISHED:
 910                                        if (todrop_entry(cp))
 911                                                break;
 912                                        continue;
 913
 914                                default:
 915                                        continue;
 916                                }
 917                        } else {
 918                                if (!todrop_entry(cp))
 919                                        continue;
 920                        }
 921
 922                        IP_VS_DBG(4, "del connection\n");
 923                        ip_vs_conn_expire_now(cp);
 924                        if (cp->control) {
 925                                IP_VS_DBG(4, "del conn template\n");
 926                                ip_vs_conn_expire_now(cp->control);
 927                        }
 928                }
 929                ct_write_unlock_bh(hash);
 930        }
 931}
 932
 933
 934/*
 935 *      Flush all the connection entries in the ip_vs_conn_tab
 936 */
 937static void ip_vs_conn_flush(void)
 938{
 939        int idx;
 940        struct ip_vs_conn *cp;
 941
 942  flush_again:
 943        for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
 944                /*
 945                 *  Lock is actually needed in this loop.
 946                 */
 947                ct_write_lock_bh(idx);
 948
 949                list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
 950
 951                        IP_VS_DBG(4, "del connection\n");
 952                        ip_vs_conn_expire_now(cp);
 953                        if (cp->control) {
 954                                IP_VS_DBG(4, "del conn template\n");
 955                                ip_vs_conn_expire_now(cp->control);
 956                        }
 957                }
 958                ct_write_unlock_bh(idx);
 959        }
 960
 961        /* the counter may be not NULL, because maybe some conn entries
 962           are run by slow timer handler or unhashed but still referred */
 963        if (atomic_read(&ip_vs_conn_count) != 0) {
 964                schedule();
 965                goto flush_again;
 966        }
 967}
 968
 969
 970int ip_vs_conn_init(void)
 971{
 972        int idx;
 973
 974        /*
 975         * Allocate the connection hash table and initialize its list heads
 976         */
 977        ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
 978        if (!ip_vs_conn_tab)
 979                return -ENOMEM;
 980
 981        /* Allocate ip_vs_conn slab cache */
 982        ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
 983                                              sizeof(struct ip_vs_conn), 0,
 984                                              SLAB_HWCACHE_ALIGN, NULL);
 985        if (!ip_vs_conn_cachep) {
 986                vfree(ip_vs_conn_tab);
 987                return -ENOMEM;
 988        }
 989
 990        IP_VS_INFO("Connection hash table configured "
 991                   "(size=%d, memory=%ldKbytes)\n",
 992                   IP_VS_CONN_TAB_SIZE,
 993                   (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
 994        IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
 995                  sizeof(struct ip_vs_conn));
 996
 997        for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
 998                INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
 999        }
1000
1001        for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
1002                rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
1003        }
1004
1005        proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
1006        proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
1007
1008        /* calculate the random value for connection hash */
1009        get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
1010
1011        return 0;
1012}
1013
1014
1015void ip_vs_conn_cleanup(void)
1016{
1017        /* flush all the connection entries first */
1018        ip_vs_conn_flush();
1019
1020        /* Release the empty cache */
1021        kmem_cache_destroy(ip_vs_conn_cachep);
1022        proc_net_remove(&init_net, "ip_vs_conn");
1023        proc_net_remove(&init_net, "ip_vs_conn_sync");
1024        vfree(ip_vs_conn_tab);
1025}
1026
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.