/* linux/net/netfilter/ipvs/ip_vs_proto_tcp.c */
   1/*
   2 * ip_vs_proto_tcp.c:   TCP load balancing support for IPVS
   3 *
   4 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
   5 *              Julian Anastasov <ja@ssi.bg>
   6 *
   7 *              This program is free software; you can redistribute it and/or
   8 *              modify it under the terms of the GNU General Public License
   9 *              as published by the Free Software Foundation; either version
  10 *              2 of the License, or (at your option) any later version.
  11 *
  12 * Changes:
  13 *
  14 */
  15
  16#include <linux/kernel.h>
  17#include <linux/ip.h>
  18#include <linux/tcp.h>                  /* for tcphdr */
  19#include <net/ip.h>
  20#include <net/tcp.h>                    /* for csum_tcpudp_magic */
  21#include <net/ip6_checksum.h>
  22#include <linux/netfilter.h>
  23#include <linux/netfilter_ipv4.h>
  24
  25#include <net/ip_vs.h>
  26
  27
  28static struct ip_vs_conn *
  29tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
  30                const struct ip_vs_iphdr *iph, unsigned int proto_off,
  31                int inverse)
  32{
  33        __be16 _ports[2], *pptr;
  34
  35        pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
  36        if (pptr == NULL)
  37                return NULL;
  38
  39        if (likely(!inverse)) {
  40                return ip_vs_conn_in_get(af, iph->protocol,
  41                                         &iph->saddr, pptr[0],
  42                                         &iph->daddr, pptr[1]);
  43        } else {
  44                return ip_vs_conn_in_get(af, iph->protocol,
  45                                         &iph->daddr, pptr[1],
  46                                         &iph->saddr, pptr[0]);
  47        }
  48}
  49
  50static struct ip_vs_conn *
  51tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
  52                 const struct ip_vs_iphdr *iph, unsigned int proto_off,
  53                 int inverse)
  54{
  55        __be16 _ports[2], *pptr;
  56
  57        pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
  58        if (pptr == NULL)
  59                return NULL;
  60
  61        if (likely(!inverse)) {
  62                return ip_vs_conn_out_get(af, iph->protocol,
  63                                          &iph->saddr, pptr[0],
  64                                          &iph->daddr, pptr[1]);
  65        } else {
  66                return ip_vs_conn_out_get(af, iph->protocol,
  67                                          &iph->daddr, pptr[1],
  68                                          &iph->saddr, pptr[0]);
  69        }
  70}
  71
  72
  73static int
  74tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
  75                  int *verdict, struct ip_vs_conn **cpp)
  76{
  77        struct ip_vs_service *svc;
  78        struct tcphdr _tcph, *th;
  79        struct ip_vs_iphdr iph;
  80
  81        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
  82
  83        th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
  84        if (th == NULL) {
  85                *verdict = NF_DROP;
  86                return 0;
  87        }
  88
  89        if (th->syn &&
  90            (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
  91                                     th->dest))) {
  92                if (ip_vs_todrop()) {
  93                        /*
  94                         * It seems that we are very loaded.
  95                         * We have to drop this packet :(
  96                         */
  97                        ip_vs_service_put(svc);
  98                        *verdict = NF_DROP;
  99                        return 0;
 100                }
 101
 102                /*
 103                 * Let the virtual server select a real server for the
 104                 * incoming connection, and create a connection entry.
 105                 */
 106                *cpp = ip_vs_schedule(svc, skb);
 107                if (!*cpp) {
 108                        *verdict = ip_vs_leave(svc, skb, pp);
 109                        return 0;
 110                }
 111                ip_vs_service_put(svc);
 112        }
 113        return 1;
 114}
 115
 116
 117static inline void
 118tcp_fast_csum_update(int af, struct tcphdr *tcph,
 119                     const union nf_inet_addr *oldip,
 120                     const union nf_inet_addr *newip,
 121                     __be16 oldport, __be16 newport)
 122{
 123#ifdef CONFIG_IP_VS_IPV6
 124        if (af == AF_INET6)
 125                tcph->check =
 126                        csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
 127                                         ip_vs_check_diff2(oldport, newport,
 128                                                ~csum_unfold(tcph->check))));
 129        else
 130#endif
 131        tcph->check =
 132                csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
 133                                 ip_vs_check_diff2(oldport, newport,
 134                                                ~csum_unfold(tcph->check))));
 135}
 136
 137
 138static inline void
 139tcp_partial_csum_update(int af, struct tcphdr *tcph,
 140                     const union nf_inet_addr *oldip,
 141                     const union nf_inet_addr *newip,
 142                     __be16 oldlen, __be16 newlen)
 143{
 144#ifdef CONFIG_IP_VS_IPV6
 145        if (af == AF_INET6)
 146                tcph->check =
 147                        csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
 148                                         ip_vs_check_diff2(oldlen, newlen,
 149                                                ~csum_unfold(tcph->check))));
 150        else
 151#endif
 152        tcph->check =
 153                csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
 154                                ip_vs_check_diff2(oldlen, newlen,
 155                                                ~csum_unfold(tcph->check))));
 156}
 157
 158
 159static int
 160tcp_snat_handler(struct sk_buff *skb,
 161                 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
 162{
 163        struct tcphdr *tcph;
 164        unsigned int tcphoff;
 165        int oldlen;
 166
 167#ifdef CONFIG_IP_VS_IPV6
 168        if (cp->af == AF_INET6)
 169                tcphoff = sizeof(struct ipv6hdr);
 170        else
 171#endif
 172                tcphoff = ip_hdrlen(skb);
 173        oldlen = skb->len - tcphoff;
 174
 175        /* csum_check requires unshared skb */
 176        if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
 177                return 0;
 178
 179        if (unlikely(cp->app != NULL)) {
 180                /* Some checks before mangling */
 181                if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 182                        return 0;
 183
 184                /* Call application helper if needed */
 185                if (!ip_vs_app_pkt_out(cp, skb))
 186                        return 0;
 187        }
 188
 189        tcph = (void *)skb_network_header(skb) + tcphoff;
 190        tcph->source = cp->vport;
 191
 192        /* Adjust TCP checksums */
 193        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 194                tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
 195                                        htonl(oldlen),
 196                                        htonl(skb->len - tcphoff));
 197        } else if (!cp->app) {
 198                /* Only port and addr are changed, do fast csum update */
 199                tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
 200                                     cp->dport, cp->vport);
 201                if (skb->ip_summed == CHECKSUM_COMPLETE)
 202                        skb->ip_summed = CHECKSUM_NONE;
 203        } else {
 204                /* full checksum calculation */
 205                tcph->check = 0;
 206                skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
 207#ifdef CONFIG_IP_VS_IPV6
 208                if (cp->af == AF_INET6)
 209                        tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
 210                                                      &cp->caddr.in6,
 211                                                      skb->len - tcphoff,
 212                                                      cp->protocol, skb->csum);
 213                else
 214#endif
 215                        tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
 216                                                        cp->caddr.ip,
 217                                                        skb->len - tcphoff,
 218                                                        cp->protocol,
 219                                                        skb->csum);
 220
 221                IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
 222                          pp->name, tcph->check,
 223                          (char*)&(tcph->check) - (char*)tcph);
 224        }
 225        return 1;
 226}
 227
 228
 229static int
 230tcp_dnat_handler(struct sk_buff *skb,
 231                 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
 232{
 233        struct tcphdr *tcph;
 234        unsigned int tcphoff;
 235        int oldlen;
 236
 237#ifdef CONFIG_IP_VS_IPV6
 238        if (cp->af == AF_INET6)
 239                tcphoff = sizeof(struct ipv6hdr);
 240        else
 241#endif
 242                tcphoff = ip_hdrlen(skb);
 243        oldlen = skb->len - tcphoff;
 244
 245        /* csum_check requires unshared skb */
 246        if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
 247                return 0;
 248
 249        if (unlikely(cp->app != NULL)) {
 250                /* Some checks before mangling */
 251                if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 252                        return 0;
 253
 254                /*
 255                 *      Attempt ip_vs_app call.
 256                 *      It will fix ip_vs_conn and iph ack_seq stuff
 257                 */
 258                if (!ip_vs_app_pkt_in(cp, skb))
 259                        return 0;
 260        }
 261
 262        tcph = (void *)skb_network_header(skb) + tcphoff;
 263        tcph->dest = cp->dport;
 264
 265        /*
 266         *      Adjust TCP checksums
 267         */
 268        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 269                tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
 270                                        htonl(oldlen),
 271                                        htonl(skb->len - tcphoff));
 272        } else if (!cp->app) {
 273                /* Only port and addr are changed, do fast csum update */
 274                tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
 275                                     cp->vport, cp->dport);
 276                if (skb->ip_summed == CHECKSUM_COMPLETE)
 277                        skb->ip_summed = CHECKSUM_NONE;
 278        } else {
 279                /* full checksum calculation */
 280                tcph->check = 0;
 281                skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
 282#ifdef CONFIG_IP_VS_IPV6
 283                if (cp->af == AF_INET6)
 284                        tcph->check = csum_ipv6_magic(&cp->caddr.in6,
 285                                                      &cp->daddr.in6,
 286                                                      skb->len - tcphoff,
 287                                                      cp->protocol, skb->csum);
 288                else
 289#endif
 290                        tcph->check = csum_tcpudp_magic(cp->caddr.ip,
 291                                                        cp->daddr.ip,
 292                                                        skb->len - tcphoff,
 293                                                        cp->protocol,
 294                                                        skb->csum);
 295                skb->ip_summed = CHECKSUM_UNNECESSARY;
 296        }
 297        return 1;
 298}
 299
 300
 301static int
 302tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 303{
 304        unsigned int tcphoff;
 305
 306#ifdef CONFIG_IP_VS_IPV6
 307        if (af == AF_INET6)
 308                tcphoff = sizeof(struct ipv6hdr);
 309        else
 310#endif
 311                tcphoff = ip_hdrlen(skb);
 312
 313        switch (skb->ip_summed) {
 314        case CHECKSUM_NONE:
 315                skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
 316        case CHECKSUM_COMPLETE:
 317#ifdef CONFIG_IP_VS_IPV6
 318                if (af == AF_INET6) {
 319                        if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
 320                                            &ipv6_hdr(skb)->daddr,
 321                                            skb->len - tcphoff,
 322                                            ipv6_hdr(skb)->nexthdr,
 323                                            skb->csum)) {
 324                                IP_VS_DBG_RL_PKT(0, pp, skb, 0,
 325                                                 "Failed checksum for");
 326                                return 0;
 327                        }
 328                } else
 329#endif
 330                        if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
 331                                              ip_hdr(skb)->daddr,
 332                                              skb->len - tcphoff,
 333                                              ip_hdr(skb)->protocol,
 334                                              skb->csum)) {
 335                                IP_VS_DBG_RL_PKT(0, pp, skb, 0,
 336                                                 "Failed checksum for");
 337                                return 0;
 338                        }
 339                break;
 340        default:
 341                /* No need to checksum. */
 342                break;
 343        }
 344
 345        return 1;
 346}
 347
 348
 349#define TCP_DIR_INPUT           0
 350#define TCP_DIR_OUTPUT          4
 351#define TCP_DIR_INPUT_ONLY      8
 352
 353static const int tcp_state_off[IP_VS_DIR_LAST] = {
 354        [IP_VS_DIR_INPUT]               =       TCP_DIR_INPUT,
 355        [IP_VS_DIR_OUTPUT]              =       TCP_DIR_OUTPUT,
 356        [IP_VS_DIR_INPUT_ONLY]          =       TCP_DIR_INPUT_ONLY,
 357};
 358
 359/*
 360 *      Timeout table[state]
 361 */
 362static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
 363        [IP_VS_TCP_S_NONE]              =       2*HZ,
 364        [IP_VS_TCP_S_ESTABLISHED]       =       15*60*HZ,
 365        [IP_VS_TCP_S_SYN_SENT]          =       2*60*HZ,
 366        [IP_VS_TCP_S_SYN_RECV]          =       1*60*HZ,
 367        [IP_VS_TCP_S_FIN_WAIT]          =       2*60*HZ,
 368        [IP_VS_TCP_S_TIME_WAIT]         =       2*60*HZ,
 369        [IP_VS_TCP_S_CLOSE]             =       10*HZ,
 370        [IP_VS_TCP_S_CLOSE_WAIT]        =       60*HZ,
 371        [IP_VS_TCP_S_LAST_ACK]          =       30*HZ,
 372        [IP_VS_TCP_S_LISTEN]            =       2*60*HZ,
 373        [IP_VS_TCP_S_SYNACK]            =       120*HZ,
 374        [IP_VS_TCP_S_LAST]              =       2*HZ,
 375};
 376
 377static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
 378        [IP_VS_TCP_S_NONE]              =       "NONE",
 379        [IP_VS_TCP_S_ESTABLISHED]       =       "ESTABLISHED",
 380        [IP_VS_TCP_S_SYN_SENT]          =       "SYN_SENT",
 381        [IP_VS_TCP_S_SYN_RECV]          =       "SYN_RECV",
 382        [IP_VS_TCP_S_FIN_WAIT]          =       "FIN_WAIT",
 383        [IP_VS_TCP_S_TIME_WAIT]         =       "TIME_WAIT",
 384        [IP_VS_TCP_S_CLOSE]             =       "CLOSE",
 385        [IP_VS_TCP_S_CLOSE_WAIT]        =       "CLOSE_WAIT",
 386        [IP_VS_TCP_S_LAST_ACK]          =       "LAST_ACK",
 387        [IP_VS_TCP_S_LISTEN]            =       "LISTEN",
 388        [IP_VS_TCP_S_SYNACK]            =       "SYNACK",
 389        [IP_VS_TCP_S_LAST]              =       "BUG!",
 390};
 391
 392#define sNO IP_VS_TCP_S_NONE
 393#define sES IP_VS_TCP_S_ESTABLISHED
 394#define sSS IP_VS_TCP_S_SYN_SENT
 395#define sSR IP_VS_TCP_S_SYN_RECV
 396#define sFW IP_VS_TCP_S_FIN_WAIT
 397#define sTW IP_VS_TCP_S_TIME_WAIT
 398#define sCL IP_VS_TCP_S_CLOSE
 399#define sCW IP_VS_TCP_S_CLOSE_WAIT
 400#define sLA IP_VS_TCP_S_LAST_ACK
 401#define sLI IP_VS_TCP_S_LISTEN
 402#define sSA IP_VS_TCP_S_SYNACK
 403
 404struct tcp_states_t {
 405        int next_state[IP_VS_TCP_S_LAST];
 406};
 407
 408static const char * tcp_state_name(int state)
 409{
 410        if (state >= IP_VS_TCP_S_LAST)
 411                return "ERR!";
 412        return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
 413}
 414
 415static struct tcp_states_t tcp_states [] = {
 416/*      INPUT */
 417/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
 418/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
 419/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
 420/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
 421/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
 422
 423/*      OUTPUT */
 424/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
 425/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
 426/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
 427/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
 428/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
 429
 430/*      INPUT-ONLY */
 431/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
 432/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
 433/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
 434/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
 435/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 436};
 437
 438static struct tcp_states_t tcp_states_dos [] = {
 439/*      INPUT */
 440/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
 441/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
 442/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
 443/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
 444/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 445
 446/*      OUTPUT */
 447/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
 448/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
 449/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
 450/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
 451/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
 452
 453/*      INPUT-ONLY */
 454/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
 455/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
 456/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
 457/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
 458/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 459};
 460
 461static struct tcp_states_t *tcp_state_table = tcp_states;
 462
 463
 464static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
 465{
 466        int on = (flags & 1);           /* secure_tcp */
 467
 468        /*
 469        ** FIXME: change secure_tcp to independent sysctl var
 470        ** or make it per-service or per-app because it is valid
 471        ** for most if not for all of the applications. Something
 472        ** like "capabilities" (flags) for each object.
 473        */
 474        tcp_state_table = (on? tcp_states_dos : tcp_states);
 475}
 476
 477static int
 478tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
 479{
 480        return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
 481                                       tcp_state_name_table, sname, to);
 482}
 483
 484static inline int tcp_state_idx(struct tcphdr *th)
 485{
 486        if (th->rst)
 487                return 3;
 488        if (th->syn)
 489                return 0;
 490        if (th->fin)
 491                return 1;
 492        if (th->ack)
 493                return 2;
 494        return -1;
 495}
 496
 497static inline void
 498set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 499              int direction, struct tcphdr *th)
 500{
 501        int state_idx;
 502        int new_state = IP_VS_TCP_S_CLOSE;
 503        int state_off = tcp_state_off[direction];
 504
 505        /*
 506         *    Update state offset to INPUT_ONLY if necessary
 507         *    or delete NO_OUTPUT flag if output packet detected
 508         */
 509        if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
 510                if (state_off == TCP_DIR_OUTPUT)
 511                        cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
 512                else
 513                        state_off = TCP_DIR_INPUT_ONLY;
 514        }
 515
 516        if ((state_idx = tcp_state_idx(th)) < 0) {
 517                IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
 518                goto tcp_state_out;
 519        }
 520
 521        new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
 522
 523  tcp_state_out:
 524        if (new_state != cp->state) {
 525                struct ip_vs_dest *dest = cp->dest;
 526
 527                IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
 528                              "%s:%d state: %s->%s conn->refcnt:%d\n",
 529                              pp->name,
 530                              ((state_off == TCP_DIR_OUTPUT) ?
 531                               "output " : "input "),
 532                              th->syn ? 'S' : '.',
 533                              th->fin ? 'F' : '.',
 534                              th->ack ? 'A' : '.',
 535                              th->rst ? 'R' : '.',
 536                              IP_VS_DBG_ADDR(cp->af, &cp->daddr),
 537                              ntohs(cp->dport),
 538                              IP_VS_DBG_ADDR(cp->af, &cp->caddr),
 539                              ntohs(cp->cport),
 540                              tcp_state_name(cp->state),
 541                              tcp_state_name(new_state),
 542                              atomic_read(&cp->refcnt));
 543
 544                if (dest) {
 545                        if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
 546                            (new_state != IP_VS_TCP_S_ESTABLISHED)) {
 547                                atomic_dec(&dest->activeconns);
 548                                atomic_inc(&dest->inactconns);
 549                                cp->flags |= IP_VS_CONN_F_INACTIVE;
 550                        } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
 551                                   (new_state == IP_VS_TCP_S_ESTABLISHED)) {
 552                                atomic_inc(&dest->activeconns);
 553                                atomic_dec(&dest->inactconns);
 554                                cp->flags &= ~IP_VS_CONN_F_INACTIVE;
 555                        }
 556                }
 557        }
 558
 559        cp->timeout = pp->timeout_table[cp->state = new_state];
 560}
 561
 562
 563/*
 564 *      Handle state transitions
 565 */
 566static int
 567tcp_state_transition(struct ip_vs_conn *cp, int direction,
 568                     const struct sk_buff *skb,
 569                     struct ip_vs_protocol *pp)
 570{
 571        struct tcphdr _tcph, *th;
 572
 573#ifdef CONFIG_IP_VS_IPV6
 574        int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
 575#else
 576        int ihl = ip_hdrlen(skb);
 577#endif
 578
 579        th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
 580        if (th == NULL)
 581                return 0;
 582
 583        spin_lock(&cp->lock);
 584        set_tcp_state(pp, cp, direction, th);
 585        spin_unlock(&cp->lock);
 586
 587        return 1;
 588}
 589
 590
 591/*
 592 *      Hash table for TCP application incarnations
 593 */
 594#define TCP_APP_TAB_BITS        4
 595#define TCP_APP_TAB_SIZE        (1 << TCP_APP_TAB_BITS)
 596#define TCP_APP_TAB_MASK        (TCP_APP_TAB_SIZE - 1)
 597
 598static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
 599static DEFINE_SPINLOCK(tcp_app_lock);
 600
 601static inline __u16 tcp_app_hashkey(__be16 port)
 602{
 603        return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
 604                & TCP_APP_TAB_MASK;
 605}
 606
 607
 608static int tcp_register_app(struct ip_vs_app *inc)
 609{
 610        struct ip_vs_app *i;
 611        __u16 hash;
 612        __be16 port = inc->port;
 613        int ret = 0;
 614
 615        hash = tcp_app_hashkey(port);
 616
 617        spin_lock_bh(&tcp_app_lock);
 618        list_for_each_entry(i, &tcp_apps[hash], p_list) {
 619                if (i->port == port) {
 620                        ret = -EEXIST;
 621                        goto out;
 622                }
 623        }
 624        list_add(&inc->p_list, &tcp_apps[hash]);
 625        atomic_inc(&ip_vs_protocol_tcp.appcnt);
 626
 627  out:
 628        spin_unlock_bh(&tcp_app_lock);
 629        return ret;
 630}
 631
 632
 633static void
 634tcp_unregister_app(struct ip_vs_app *inc)
 635{
 636        spin_lock_bh(&tcp_app_lock);
 637        atomic_dec(&ip_vs_protocol_tcp.appcnt);
 638        list_del(&inc->p_list);
 639        spin_unlock_bh(&tcp_app_lock);
 640}
 641
 642
 643static int
 644tcp_app_conn_bind(struct ip_vs_conn *cp)
 645{
 646        int hash;
 647        struct ip_vs_app *inc;
 648        int result = 0;
 649
 650        /* Default binding: bind app only for NAT */
 651        if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
 652                return 0;
 653
 654        /* Lookup application incarnations and bind the right one */
 655        hash = tcp_app_hashkey(cp->vport);
 656
 657        spin_lock(&tcp_app_lock);
 658        list_for_each_entry(inc, &tcp_apps[hash], p_list) {
 659                if (inc->port == cp->vport) {
 660                        if (unlikely(!ip_vs_app_inc_get(inc)))
 661                                break;
 662                        spin_unlock(&tcp_app_lock);
 663
 664                        IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
 665                                      "%s:%u to app %s on port %u\n",
 666                                      __func__,
 667                                      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
 668                                      ntohs(cp->cport),
 669                                      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
 670                                      ntohs(cp->vport),
 671                                      inc->name, ntohs(inc->port));
 672
 673                        cp->app = inc;
 674                        if (inc->init_conn)
 675                                result = inc->init_conn(inc, cp);
 676                        goto out;
 677                }
 678        }
 679        spin_unlock(&tcp_app_lock);
 680
 681  out:
 682        return result;
 683}
 684
 685
 686/*
 687 *      Set LISTEN timeout. (ip_vs_conn_put will setup timer)
 688 */
 689void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
 690{
 691        spin_lock(&cp->lock);
 692        cp->state = IP_VS_TCP_S_LISTEN;
 693        cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
 694        spin_unlock(&cp->lock);
 695}
 696
 697
 698static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
 699{
 700        IP_VS_INIT_HASH_TABLE(tcp_apps);
 701        pp->timeout_table = tcp_timeouts;
 702}
 703
 704
 705static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
 706{
 707}
 708
 709
 710struct ip_vs_protocol ip_vs_protocol_tcp = {
 711        .name =                 "TCP",
 712        .protocol =             IPPROTO_TCP,
 713        .num_states =           IP_VS_TCP_S_LAST,
 714        .dont_defrag =          0,
 715        .appcnt =               ATOMIC_INIT(0),
 716        .init =                 ip_vs_tcp_init,
 717        .exit =                 ip_vs_tcp_exit,
 718        .register_app =         tcp_register_app,
 719        .unregister_app =       tcp_unregister_app,
 720        .conn_schedule =        tcp_conn_schedule,
 721        .conn_in_get =          tcp_conn_in_get,
 722        .conn_out_get =         tcp_conn_out_get,
 723        .snat_handler =         tcp_snat_handler,
 724        .dnat_handler =         tcp_dnat_handler,
 725        .csum_check =           tcp_csum_check,
 726        .state_name =           tcp_state_name,
 727        .state_transition =     tcp_state_transition,
 728        .app_conn_bind =        tcp_app_conn_bind,
 729        .debug_packet =         ip_vs_tcpudp_debug_packet,
 730        .timeout_change =       tcp_timeout_change,
 731        .set_state_timeout =    tcp_set_state_timeout,
 732};
 733