linux/net/netfilter/ipvs/ip_vs_core.c
<<
>>
Prefs
   1/*
   2 * IPVS         An implementation of the IP virtual server support for the
   3 *              LINUX operating system.  IPVS is now implemented as a module
   4 *              over the Netfilter framework. IPVS can be used to build a
   5 *              high-performance and highly available server based on a
   6 *              cluster of servers.
   7 *
   8 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
   9 *              Peter Kese <peter.kese@ijs.si>
  10 *              Julian Anastasov <ja@ssi.bg>
  11 *
  12 *              This program is free software; you can redistribute it and/or
  13 *              modify it under the terms of the GNU General Public License
  14 *              as published by the Free Software Foundation; either version
  15 *              2 of the License, or (at your option) any later version.
  16 *
  17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
  18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
  19 * and others.
  20 *
  21 * Changes:
  22 *      Paul `Rusty' Russell            properly handle non-linear skbs
  23 *      Harald Welte                    don't use nfcache
  24 *
  25 */
  26
  27#define KMSG_COMPONENT "IPVS"
  28#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  29
  30#include <linux/module.h>
  31#include <linux/kernel.h>
  32#include <linux/ip.h>
  33#include <linux/tcp.h>
  34#include <linux/sctp.h>
  35#include <linux/icmp.h>
  36#include <linux/slab.h>
  37
  38#include <net/ip.h>
  39#include <net/tcp.h>
  40#include <net/udp.h>
  41#include <net/icmp.h>                   /* for icmp_send */
  42#include <net/route.h>
  43#include <net/ip6_checksum.h>
  44#include <net/netns/generic.h>          /* net_generic() */
  45
  46#include <linux/netfilter.h>
  47#include <linux/netfilter_ipv4.h>
  48
  49#ifdef CONFIG_IP_VS_IPV6
  50#include <net/ipv6.h>
  51#include <linux/netfilter_ipv6.h>
  52#include <net/ip6_route.h>
  53#endif
  54
  55#include <net/ip_vs.h>
  56
  57
  58EXPORT_SYMBOL(register_ip_vs_scheduler);
  59EXPORT_SYMBOL(unregister_ip_vs_scheduler);
  60EXPORT_SYMBOL(ip_vs_proto_name);
  61EXPORT_SYMBOL(ip_vs_conn_new);
  62EXPORT_SYMBOL(ip_vs_conn_in_get);
  63EXPORT_SYMBOL(ip_vs_conn_out_get);
  64#ifdef CONFIG_IP_VS_PROTO_TCP
  65EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
  66#endif
  67EXPORT_SYMBOL(ip_vs_conn_put);
  68#ifdef CONFIG_IP_VS_DEBUG
  69EXPORT_SYMBOL(ip_vs_get_debug_level);
  70#endif
  71
  72int ip_vs_net_id __read_mostly;
  73#ifdef IP_VS_GENERIC_NETNS
  74EXPORT_SYMBOL(ip_vs_net_id);
  75#endif
  76/* netns cnt used for uniqueness */
  77static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
  78
  79/* ID used in ICMP lookups */
  80#define icmp_id(icmph)          (((icmph)->un).echo.id)
  81#define icmpv6_id(icmph)        (icmph->icmp6_dataun.u_echo.identifier)
  82
  83const char *ip_vs_proto_name(unsigned proto)
  84{
  85        static char buf[20];
  86
  87        switch (proto) {
  88        case IPPROTO_IP:
  89                return "IP";
  90        case IPPROTO_UDP:
  91                return "UDP";
  92        case IPPROTO_TCP:
  93                return "TCP";
  94        case IPPROTO_SCTP:
  95                return "SCTP";
  96        case IPPROTO_ICMP:
  97                return "ICMP";
  98#ifdef CONFIG_IP_VS_IPV6
  99        case IPPROTO_ICMPV6:
 100                return "ICMPv6";
 101#endif
 102        default:
 103                sprintf(buf, "IP_%d", proto);
 104                return buf;
 105        }
 106}
 107
 108void ip_vs_init_hash_table(struct list_head *table, int rows)
 109{
 110        while (--rows >= 0)
 111                INIT_LIST_HEAD(&table[rows]);
 112}
 113
 114static inline void
 115ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 116{
 117        struct ip_vs_dest *dest = cp->dest;
 118        struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
 119
 120        if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 121                struct ip_vs_cpu_stats *s;
 122
 123                s = this_cpu_ptr(dest->stats.cpustats);
 124                s->ustats.inpkts++;
 125                u64_stats_update_begin(&s->syncp);
 126                s->ustats.inbytes += skb->len;
 127                u64_stats_update_end(&s->syncp);
 128
 129                s = this_cpu_ptr(dest->svc->stats.cpustats);
 130                s->ustats.inpkts++;
 131                u64_stats_update_begin(&s->syncp);
 132                s->ustats.inbytes += skb->len;
 133                u64_stats_update_end(&s->syncp);
 134
 135                s = this_cpu_ptr(ipvs->tot_stats.cpustats);
 136                s->ustats.inpkts++;
 137                u64_stats_update_begin(&s->syncp);
 138                s->ustats.inbytes += skb->len;
 139                u64_stats_update_end(&s->syncp);
 140        }
 141}
 142
 143
 144static inline void
 145ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 146{
 147        struct ip_vs_dest *dest = cp->dest;
 148        struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
 149
 150        if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 151                struct ip_vs_cpu_stats *s;
 152
 153                s = this_cpu_ptr(dest->stats.cpustats);
 154                s->ustats.outpkts++;
 155                u64_stats_update_begin(&s->syncp);
 156                s->ustats.outbytes += skb->len;
 157                u64_stats_update_end(&s->syncp);
 158
 159                s = this_cpu_ptr(dest->svc->stats.cpustats);
 160                s->ustats.outpkts++;
 161                u64_stats_update_begin(&s->syncp);
 162                s->ustats.outbytes += skb->len;
 163                u64_stats_update_end(&s->syncp);
 164
 165                s = this_cpu_ptr(ipvs->tot_stats.cpustats);
 166                s->ustats.outpkts++;
 167                u64_stats_update_begin(&s->syncp);
 168                s->ustats.outbytes += skb->len;
 169                u64_stats_update_end(&s->syncp);
 170        }
 171}
 172
 173
 174static inline void
 175ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
 176{
 177        struct netns_ipvs *ipvs = net_ipvs(svc->net);
 178        struct ip_vs_cpu_stats *s;
 179
 180        s = this_cpu_ptr(cp->dest->stats.cpustats);
 181        s->ustats.conns++;
 182
 183        s = this_cpu_ptr(svc->stats.cpustats);
 184        s->ustats.conns++;
 185
 186        s = this_cpu_ptr(ipvs->tot_stats.cpustats);
 187        s->ustats.conns++;
 188}
 189
 190
 191static inline void
 192ip_vs_set_state(struct ip_vs_conn *cp, int direction,
 193                const struct sk_buff *skb,
 194                struct ip_vs_proto_data *pd)
 195{
 196        if (likely(pd->pp->state_transition))
 197                pd->pp->state_transition(cp, direction, skb, pd);
 198}
 199
 200static inline int
 201ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
 202                              struct sk_buff *skb, int protocol,
 203                              const union nf_inet_addr *caddr, __be16 cport,
 204                              const union nf_inet_addr *vaddr, __be16 vport,
 205                              struct ip_vs_conn_param *p)
 206{
 207        ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
 208                              vport, p);
 209        p->pe = svc->pe;
 210        if (p->pe && p->pe->fill_param)
 211                return p->pe->fill_param(p, skb);
 212
 213        return 0;
 214}
 215
 216/*
 217 *  IPVS persistent scheduling function
 218 *  It creates a connection entry according to its template if exists,
 219 *  or selects a server and creates a connection entry plus a template.
 220 *  Locking: we are svc user (svc->refcnt), so we hold all dests too
 221 *  Protocols supported: TCP, UDP
 222 */
 223static struct ip_vs_conn *
 224ip_vs_sched_persist(struct ip_vs_service *svc,
 225                    struct sk_buff *skb,
 226                    __be16 src_port, __be16 dst_port, int *ignored)
 227{
 228        struct ip_vs_conn *cp = NULL;
 229        struct ip_vs_iphdr iph;
 230        struct ip_vs_dest *dest;
 231        struct ip_vs_conn *ct;
 232        __be16 dport = 0;               /* destination port to forward */
 233        unsigned int flags;
 234        struct ip_vs_conn_param param;
 235        const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
 236        union nf_inet_addr snet;        /* source network of the client,
 237                                           after masking */
 238
 239        ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 240
 241        /* Mask saddr with the netmask to adjust template granularity */
 242#ifdef CONFIG_IP_VS_IPV6
 243        if (svc->af == AF_INET6)
 244                ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
 245        else
 246#endif
 247                snet.ip = iph.saddr.ip & svc->netmask;
 248
 249        IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
 250                      "mnet %s\n",
 251                      IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port),
 252                      IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port),
 253                      IP_VS_DBG_ADDR(svc->af, &snet));
 254
 255        /*
 256         * As far as we know, FTP is a very complicated network protocol, and
 257         * it uses control connection and data connections. For active FTP,
 258         * FTP server initialize data connection to the client, its source port
 259         * is often 20. For passive FTP, FTP server tells the clients the port
 260         * that it passively listens to,  and the client issues the data
 261         * connection. In the tunneling or direct routing mode, the load
 262         * balancer is on the client-to-server half of connection, the port
 263         * number is unknown to the load balancer. So, a conn template like
 264         * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
 265         * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
 266         * is created for other persistent services.
 267         */
 268        {
 269                int protocol = iph.protocol;
 270                const union nf_inet_addr *vaddr = &iph.daddr;
 271                __be16 vport = 0;
 272
 273                if (dst_port == svc->port) {
 274                        /* non-FTP template:
 275                         * <protocol, caddr, 0, vaddr, vport, daddr, dport>
 276                         * FTP template:
 277                         * <protocol, caddr, 0, vaddr, 0, daddr, 0>
 278                         */
 279                        if (svc->port != FTPPORT)
 280                                vport = dst_port;
 281                } else {
 282                        /* Note: persistent fwmark-based services and
 283                         * persistent port zero service are handled here.
 284                         * fwmark template:
 285                         * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
 286                         * port zero template:
 287                         * <protocol,caddr,0,vaddr,0,daddr,0>
 288                         */
 289                        if (svc->fwmark) {
 290                                protocol = IPPROTO_IP;
 291                                vaddr = &fwmark;
 292                        }
 293                }
 294                /* return *ignored = -1 so NF_DROP can be used */
 295                if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
 296                                                  vaddr, vport, &param) < 0) {
 297                        *ignored = -1;
 298                        return NULL;
 299                }
 300        }
 301
 302        /* Check if a template already exists */
 303        ct = ip_vs_ct_in_get(&param);
 304        if (!ct || !ip_vs_check_template(ct)) {
 305                /*
 306                 * No template found or the dest of the connection
 307                 * template is not available.
 308                 * return *ignored=0 i.e. ICMP and NF_DROP
 309                 */
 310                dest = svc->scheduler->schedule(svc, skb);
 311                if (!dest) {
 312                        IP_VS_DBG(1, "p-schedule: no dest found.\n");
 313                        kfree(param.pe_data);
 314                        *ignored = 0;
 315                        return NULL;
 316                }
 317
 318                if (dst_port == svc->port && svc->port != FTPPORT)
 319                        dport = dest->port;
 320
 321                /* Create a template
 322                 * This adds param.pe_data to the template,
 323                 * and thus param.pe_data will be destroyed
 324                 * when the template expires */
 325                ct = ip_vs_conn_new(&param, &dest->addr, dport,
 326                                    IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
 327                if (ct == NULL) {
 328                        kfree(param.pe_data);
 329                        *ignored = -1;
 330                        return NULL;
 331                }
 332
 333                ct->timeout = svc->timeout;
 334        } else {
 335                /* set destination with the found template */
 336                dest = ct->dest;
 337                kfree(param.pe_data);
 338        }
 339
 340        dport = dst_port;
 341        if (dport == svc->port && dest->port)
 342                dport = dest->port;
 343
 344        flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
 345                 && iph.protocol == IPPROTO_UDP)?
 346                IP_VS_CONN_F_ONE_PACKET : 0;
 347
 348        /*
 349         *    Create a new connection according to the template
 350         */
 351        ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr,
 352                              src_port, &iph.daddr, dst_port, &param);
 353
 354        cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
 355        if (cp == NULL) {
 356                ip_vs_conn_put(ct);
 357                *ignored = -1;
 358                return NULL;
 359        }
 360
 361        /*
 362         *    Add its control
 363         */
 364        ip_vs_control_add(cp, ct);
 365        ip_vs_conn_put(ct);
 366
 367        ip_vs_conn_stats(cp, svc);
 368        return cp;
 369}
 370
 371
 372/*
 373 *  IPVS main scheduling function
 374 *  It selects a server according to the virtual service, and
 375 *  creates a connection entry.
 376 *  Protocols supported: TCP, UDP
 377 *
 378 *  Usage of *ignored
 379 *
 380 * 1 :   protocol tried to schedule (eg. on SYN), found svc but the
 381 *       svc/scheduler decides that this packet should be accepted with
 382 *       NF_ACCEPT because it must not be scheduled.
 383 *
 384 * 0 :   scheduler can not find destination, so try bypass or
 385 *       return ICMP and then NF_DROP (ip_vs_leave).
 386 *
 387 * -1 :  scheduler tried to schedule but fatal error occurred, eg.
 388 *       ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
 389 *       failure such as missing Call-ID, ENOMEM on skb_linearize
 390 *       or pe_data. In this case we should return NF_DROP without
 391 *       any attempts to send ICMP with ip_vs_leave.
 392 */
 393struct ip_vs_conn *
 394ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 395               struct ip_vs_proto_data *pd, int *ignored)
 396{
 397        struct ip_vs_protocol *pp = pd->pp;
 398        struct ip_vs_conn *cp = NULL;
 399        struct ip_vs_iphdr iph;
 400        struct ip_vs_dest *dest;
 401        __be16 _ports[2], *pptr;
 402        unsigned int flags;
 403
 404        *ignored = 1;
 405        ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 406        pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
 407        if (pptr == NULL)
 408                return NULL;
 409
 410        /*
 411         * FTPDATA needs this check when using local real server.
 412         * Never schedule Active FTPDATA connections from real server.
 413         * For LVS-NAT they must be already created. For other methods
 414         * with persistence the connection is created on SYN+ACK.
 415         */
 416        if (pptr[0] == FTPDATA) {
 417                IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
 418                              "Not scheduling FTPDATA");
 419                return NULL;
 420        }
 421
 422        /*
 423         *    Do not schedule replies from local real server.
 424         */
 425        if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
 426            (cp = pp->conn_in_get(svc->af, skb, &iph, iph.len, 1))) {
 427                IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
 428                              "Not scheduling reply for existing connection");
 429                __ip_vs_conn_put(cp);
 430                return NULL;
 431        }
 432
 433        /*
 434         *    Persistent service
 435         */
 436        if (svc->flags & IP_VS_SVC_F_PERSISTENT)
 437                return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored);
 438
 439        *ignored = 0;
 440
 441        /*
 442         *    Non-persistent service
 443         */
 444        if (!svc->fwmark && pptr[1] != svc->port) {
 445                if (!svc->port)
 446                        pr_err("Schedule: port zero only supported "
 447                               "in persistent services, "
 448                               "check your ipvs configuration\n");
 449                return NULL;
 450        }
 451
 452        dest = svc->scheduler->schedule(svc, skb);
 453        if (dest == NULL) {
 454                IP_VS_DBG(1, "Schedule: no dest found.\n");
 455                return NULL;
 456        }
 457
 458        flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
 459                 && iph.protocol == IPPROTO_UDP)?
 460                IP_VS_CONN_F_ONE_PACKET : 0;
 461
 462        /*
 463         *    Create a connection entry.
 464         */
 465        {
 466                struct ip_vs_conn_param p;
 467
 468                ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
 469                                      &iph.saddr, pptr[0], &iph.daddr, pptr[1],
 470                                      &p);
 471                cp = ip_vs_conn_new(&p, &dest->addr,
 472                                    dest->port ? dest->port : pptr[1],
 473                                    flags, dest, skb->mark);
 474                if (!cp) {
 475                        *ignored = -1;
 476                        return NULL;
 477                }
 478        }
 479
 480        IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
 481                      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
 482                      ip_vs_fwd_tag(cp),
 483                      IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
 484                      IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
 485                      IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
 486                      cp->flags, atomic_read(&cp->refcnt));
 487
 488        ip_vs_conn_stats(cp, svc);
 489        return cp;
 490}
 491
 492
 493/*
 494 *  Pass or drop the packet.
 495 *  Called by ip_vs_in, when the virtual service is available but
 496 *  no destination is available for a new connection.
 497 */
 498int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 499                struct ip_vs_proto_data *pd)
 500{
 501        __be16 _ports[2], *pptr;
 502        struct ip_vs_iphdr iph;
 503#ifdef CONFIG_SYSCTL
 504        struct net *net;
 505        struct netns_ipvs *ipvs;
 506        int unicast;
 507#endif
 508
 509        ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 510
 511        pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
 512        if (pptr == NULL) {
 513                ip_vs_service_put(svc);
 514                return NF_DROP;
 515        }
 516
 517#ifdef CONFIG_SYSCTL
 518        net = skb_net(skb);
 519
 520#ifdef CONFIG_IP_VS_IPV6
 521        if (svc->af == AF_INET6)
 522                unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
 523        else
 524#endif
 525                unicast = (inet_addr_type(net, iph.daddr.ip) == RTN_UNICAST);
 526
 527        /* if it is fwmark-based service, the cache_bypass sysctl is up
 528           and the destination is a non-local unicast, then create
 529           a cache_bypass connection entry */
 530        ipvs = net_ipvs(net);
 531        if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
 532                int ret;
 533                struct ip_vs_conn *cp;
 534                unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
 535                                      iph.protocol == IPPROTO_UDP)?
 536                                      IP_VS_CONN_F_ONE_PACKET : 0;
 537                union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
 538
 539                ip_vs_service_put(svc);
 540
 541                /* create a new connection entry */
 542                IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
 543                {
 544                        struct ip_vs_conn_param p;
 545                        ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
 546                                              &iph.saddr, pptr[0],
 547                                              &iph.daddr, pptr[1], &p);
 548                        cp = ip_vs_conn_new(&p, &daddr, 0,
 549                                            IP_VS_CONN_F_BYPASS | flags,
 550                                            NULL, skb->mark);
 551                        if (!cp)
 552                                return NF_DROP;
 553                }
 554
 555                /* statistics */
 556                ip_vs_in_stats(cp, skb);
 557
 558                /* set state */
 559                ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
 560
 561                /* transmit the first SYN packet */
 562                ret = cp->packet_xmit(skb, cp, pd->pp);
 563                /* do not touch skb anymore */
 564
 565                atomic_inc(&cp->in_pkts);
 566                ip_vs_conn_put(cp);
 567                return ret;
 568        }
 569#endif
 570
 571        /*
 572         * When the virtual ftp service is presented, packets destined
 573         * for other services on the VIP may get here (except services
 574         * listed in the ipvs table), pass the packets, because it is
 575         * not ipvs job to decide to drop the packets.
 576         */
 577        if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
 578                ip_vs_service_put(svc);
 579                return NF_ACCEPT;
 580        }
 581
 582        ip_vs_service_put(svc);
 583
 584        /*
 585         * Notify the client that the destination is unreachable, and
 586         * release the socket buffer.
 587         * Since it is in IP layer, the TCP socket is not actually
 588         * created, the TCP RST packet cannot be sent, instead that
 589         * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
 590         */
 591#ifdef CONFIG_IP_VS_IPV6
 592        if (svc->af == AF_INET6) {
 593                if (!skb->dev) {
 594                        struct net *net = dev_net(skb_dst(skb)->dev);
 595
 596                        skb->dev = net->loopback_dev;
 597                }
 598                icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
 599        } else
 600#endif
 601                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 602
 603        return NF_DROP;
 604}
 605
 606#ifdef CONFIG_SYSCTL
 607
 608static int sysctl_snat_reroute(struct sk_buff *skb)
 609{
 610        struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
 611        return ipvs->sysctl_snat_reroute;
 612}
 613
 614static int sysctl_nat_icmp_send(struct net *net)
 615{
 616        struct netns_ipvs *ipvs = net_ipvs(net);
 617        return ipvs->sysctl_nat_icmp_send;
 618}
 619
 620static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
 621{
 622        return ipvs->sysctl_expire_nodest_conn;
 623}
 624
 625#else
 626
 627static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; }
 628static int sysctl_nat_icmp_send(struct net *net) { return 0; }
 629static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; }
 630
 631#endif
 632
 633__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
 634{
 635        return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
 636}
 637
 638static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
 639{
 640        if (NF_INET_LOCAL_IN == hooknum)
 641                return IP_DEFRAG_VS_IN;
 642        if (NF_INET_FORWARD == hooknum)
 643                return IP_DEFRAG_VS_FWD;
 644        return IP_DEFRAG_VS_OUT;
 645}
 646
 647static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
 648{
 649        int err = ip_defrag(skb, user);
 650
 651        if (!err)
 652                ip_send_check(ip_hdr(skb));
 653
 654        return err;
 655}
 656
 657#ifdef CONFIG_IP_VS_IPV6
 658static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
 659{
 660        /* TODO IPv6: Find out what to do here for IPv6 */
 661        return 0;
 662}
 663#endif
 664
 665static int ip_vs_route_me_harder(int af, struct sk_buff *skb)
 666{
 667#ifdef CONFIG_IP_VS_IPV6
 668        if (af == AF_INET6) {
 669                if (sysctl_snat_reroute(skb) && ip6_route_me_harder(skb) != 0)
 670                        return 1;
 671        } else
 672#endif
 673                if ((sysctl_snat_reroute(skb) ||
 674                     skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
 675                    ip_route_me_harder(skb, RTN_LOCAL) != 0)
 676                        return 1;
 677
 678        return 0;
 679}
 680
 681/*
 682 * Packet has been made sufficiently writable in caller
 683 * - inout: 1=in->out, 0=out->in
 684 */
 685void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
 686                    struct ip_vs_conn *cp, int inout)
 687{
 688        struct iphdr *iph        = ip_hdr(skb);
 689        unsigned int icmp_offset = iph->ihl*4;
 690        struct icmphdr *icmph    = (struct icmphdr *)(skb_network_header(skb) +
 691                                                      icmp_offset);
 692        struct iphdr *ciph       = (struct iphdr *)(icmph + 1);
 693
 694        if (inout) {
 695                iph->saddr = cp->vaddr.ip;
 696                ip_send_check(iph);
 697                ciph->daddr = cp->vaddr.ip;
 698                ip_send_check(ciph);
 699        } else {
 700                iph->daddr = cp->daddr.ip;
 701                ip_send_check(iph);
 702                ciph->saddr = cp->daddr.ip;
 703                ip_send_check(ciph);
 704        }
 705
 706        /* the TCP/UDP/SCTP port */
 707        if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
 708            IPPROTO_SCTP == ciph->protocol) {
 709                __be16 *ports = (void *)ciph + ciph->ihl*4;
 710
 711                if (inout)
 712                        ports[1] = cp->vport;
 713                else
 714                        ports[0] = cp->dport;
 715        }
 716
 717        /* And finally the ICMP checksum */
 718        icmph->checksum = 0;
 719        icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
 720        skb->ip_summed = CHECKSUM_UNNECESSARY;
 721
 722        if (inout)
 723                IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
 724                        "Forwarding altered outgoing ICMP");
 725        else
 726                IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
 727                        "Forwarding altered incoming ICMP");
 728}
 729
 730#ifdef CONFIG_IP_VS_IPV6
 731void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
 732                    struct ip_vs_conn *cp, int inout)
 733{
 734        struct ipv6hdr *iph      = ipv6_hdr(skb);
 735        unsigned int icmp_offset = sizeof(struct ipv6hdr);
 736        struct icmp6hdr *icmph   = (struct icmp6hdr *)(skb_network_header(skb) +
 737                                                      icmp_offset);
 738        struct ipv6hdr *ciph     = (struct ipv6hdr *)(icmph + 1);
 739
 740        if (inout) {
 741                iph->saddr = cp->vaddr.in6;
 742                ciph->daddr = cp->vaddr.in6;
 743        } else {
 744                iph->daddr = cp->daddr.in6;
 745                ciph->saddr = cp->daddr.in6;
 746        }
 747
 748        /* the TCP/UDP/SCTP port */
 749        if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr ||
 750            IPPROTO_SCTP == ciph->nexthdr) {
 751                __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
 752
 753                if (inout)
 754                        ports[1] = cp->vport;
 755                else
 756                        ports[0] = cp->dport;
 757        }
 758
 759        /* And finally the ICMP checksum */
 760        icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
 761                                              skb->len - icmp_offset,
 762                                              IPPROTO_ICMPV6, 0);
 763        skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
 764        skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
 765        skb->ip_summed = CHECKSUM_PARTIAL;
 766
 767        if (inout)
 768                IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
 769                              (void *)ciph - (void *)iph,
 770                              "Forwarding altered outgoing ICMPv6");
 771        else
 772                IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
 773                              (void *)ciph - (void *)iph,
 774                              "Forwarding altered incoming ICMPv6");
 775}
 776#endif
 777
 778/* Handle relevant response ICMP messages - forward to the right
 779 * destination host.
 780 */
 781static int handle_response_icmp(int af, struct sk_buff *skb,
 782                                union nf_inet_addr *snet,
 783                                __u8 protocol, struct ip_vs_conn *cp,
 784                                struct ip_vs_protocol *pp,
 785                                unsigned int offset, unsigned int ihl)
 786{
 787        unsigned int verdict = NF_DROP;
 788
 789        if (IP_VS_FWD_METHOD(cp) != 0) {
 790                pr_err("shouldn't reach here, because the box is on the "
 791                       "half connection in the tun/dr module.\n");
 792        }
 793
 794        /* Ensure the checksum is correct */
 795        if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
 796                /* Failed checksum! */
 797                IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
 798                              IP_VS_DBG_ADDR(af, snet));
 799                goto out;
 800        }
 801
 802        if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
 803            IPPROTO_SCTP == protocol)
 804                offset += 2 * sizeof(__u16);
 805        if (!skb_make_writable(skb, offset))
 806                goto out;
 807
 808#ifdef CONFIG_IP_VS_IPV6
 809        if (af == AF_INET6)
 810                ip_vs_nat_icmp_v6(skb, pp, cp, 1);
 811        else
 812#endif
 813                ip_vs_nat_icmp(skb, pp, cp, 1);
 814
 815        if (ip_vs_route_me_harder(af, skb))
 816                goto out;
 817
 818        /* do the statistics and put it back */
 819        ip_vs_out_stats(cp, skb);
 820
 821        skb->ipvs_property = 1;
 822        if (!(cp->flags & IP_VS_CONN_F_NFCT))
 823                ip_vs_notrack(skb);
 824        else
 825                ip_vs_update_conntrack(skb, cp, 0);
 826        verdict = NF_ACCEPT;
 827
 828out:
 829        __ip_vs_conn_put(cp);
 830
 831        return verdict;
 832}
 833
 834/*
 835 *      Handle ICMP messages in the inside-to-outside direction (outgoing).
 836 *      Find any that might be relevant, check against existing connections.
 837 *      Currently handles error types - unreachable, quench, ttl exceeded.
 838 */
 839static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
 840                          unsigned int hooknum)
 841{
 842        struct iphdr *iph;
 843        struct icmphdr  _icmph, *ic;
 844        struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
 845        struct ip_vs_iphdr ciph;
 846        struct ip_vs_conn *cp;
 847        struct ip_vs_protocol *pp;
 848        unsigned int offset, ihl;
 849        union nf_inet_addr snet;
 850
 851        *related = 1;
 852
 853        /* reassemble IP fragments */
 854        if (ip_is_fragment(ip_hdr(skb))) {
 855                if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
 856                        return NF_STOLEN;
 857        }
 858
 859        iph = ip_hdr(skb);
 860        offset = ihl = iph->ihl * 4;
 861        ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
 862        if (ic == NULL)
 863                return NF_DROP;
 864
 865        IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
 866                  ic->type, ntohs(icmp_id(ic)),
 867                  &iph->saddr, &iph->daddr);
 868
 869        /*
 870         * Work through seeing if this is for us.
 871         * These checks are supposed to be in an order that means easy
 872         * things are checked first to speed up processing.... however
 873         * this means that some packets will manage to get a long way
 874         * down this stack and then be rejected, but that's life.
 875         */
 876        if ((ic->type != ICMP_DEST_UNREACH) &&
 877            (ic->type != ICMP_SOURCE_QUENCH) &&
 878            (ic->type != ICMP_TIME_EXCEEDED)) {
 879                *related = 0;
 880                return NF_ACCEPT;
 881        }
 882
 883        /* Now find the contained IP header */
 884        offset += sizeof(_icmph);
 885        cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
 886        if (cih == NULL)
 887                return NF_ACCEPT; /* The packet looks wrong, ignore */
 888
 889        pp = ip_vs_proto_get(cih->protocol);
 890        if (!pp)
 891                return NF_ACCEPT;
 892
 893        /* Is the embedded protocol header present? */
 894        if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
 895                     pp->dont_defrag))
 896                return NF_ACCEPT;
 897
 898        IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
 899                      "Checking outgoing ICMP for");
 900
 901        offset += cih->ihl * 4;
 902
 903        ip_vs_fill_iphdr(AF_INET, cih, &ciph);
 904        /* The embedded headers contain source and dest in reverse order */
 905        cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1);
 906        if (!cp)
 907                return NF_ACCEPT;
 908
 909        snet.ip = iph->saddr;
 910        return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
 911                                    pp, offset, ihl);
 912}
 913
 914#ifdef CONFIG_IP_VS_IPV6
 915static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
 916                             unsigned int hooknum)
 917{
 918        struct ipv6hdr *iph;
 919        struct icmp6hdr _icmph, *ic;
 920        struct ipv6hdr  _ciph, *cih;    /* The ip header contained
 921                                           within the ICMP */
 922        struct ip_vs_iphdr ciph;
 923        struct ip_vs_conn *cp;
 924        struct ip_vs_protocol *pp;
 925        unsigned int offset;
 926        union nf_inet_addr snet;
 927
 928        *related = 1;
 929
 930        /* reassemble IP fragments */
 931        if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
 932                if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
 933                        return NF_STOLEN;
 934        }
 935
 936        iph = ipv6_hdr(skb);
 937        offset = sizeof(struct ipv6hdr);
 938        ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
 939        if (ic == NULL)
 940                return NF_DROP;
 941
 942        IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
 943                  ic->icmp6_type, ntohs(icmpv6_id(ic)),
 944                  &iph->saddr, &iph->daddr);
 945
 946        /*
 947         * Work through seeing if this is for us.
 948         * These checks are supposed to be in an order that means easy
 949         * things are checked first to speed up processing.... however
 950         * this means that some packets will manage to get a long way
 951         * down this stack and then be rejected, but that's life.
 952         */
 953        if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
 954            (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
 955            (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
 956                *related = 0;
 957                return NF_ACCEPT;
 958        }
 959
 960        /* Now find the contained IP header */
 961        offset += sizeof(_icmph);
 962        cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
 963        if (cih == NULL)
 964                return NF_ACCEPT; /* The packet looks wrong, ignore */
 965
 966        pp = ip_vs_proto_get(cih->nexthdr);
 967        if (!pp)
 968                return NF_ACCEPT;
 969
 970        /* Is the embedded protocol header present? */
 971        /* TODO: we don't support fragmentation at the moment anyways */
 972        if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
 973                return NF_ACCEPT;
 974
 975        IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
 976                      "Checking outgoing ICMPv6 for");
 977
 978        offset += sizeof(struct ipv6hdr);
 979
 980        ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
 981        /* The embedded headers contain source and dest in reverse order */
 982        cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1);
 983        if (!cp)
 984                return NF_ACCEPT;
 985
 986        snet.in6 = iph->saddr;
 987        return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
 988                                    pp, offset, sizeof(struct ipv6hdr));
 989}
 990#endif
 991
 992/*
 993 * Check if sctp chunc is ABORT chunk
 994 */
 995static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
 996{
 997        sctp_chunkhdr_t *sch, schunk;
 998        sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
 999                        sizeof(schunk), &schunk);
1000        if (sch == NULL)
1001                return 0;
1002        if (sch->type == SCTP_CID_ABORT)
1003                return 1;
1004        return 0;
1005}
1006
1007static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
1008{
1009        struct tcphdr _tcph, *th;
1010
1011        th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
1012        if (th == NULL)
1013                return 0;
1014        return th->rst;
1015}
1016
1017/* Handle response packets: rewrite addresses and send away...
1018 */
1019static unsigned int
1020handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
1021                struct ip_vs_conn *cp, int ihl)
1022{
1023        struct ip_vs_protocol *pp = pd->pp;
1024
1025        IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
1026
1027        if (!skb_make_writable(skb, ihl))
1028                goto drop;
1029
1030        /* mangle the packet */
1031        if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
1032                goto drop;
1033
1034#ifdef CONFIG_IP_VS_IPV6
1035        if (af == AF_INET6)
1036                ipv6_hdr(skb)->saddr = cp->vaddr.in6;
1037        else
1038#endif
1039        {
1040                ip_hdr(skb)->saddr = cp->vaddr.ip;
1041                ip_send_check(ip_hdr(skb));
1042        }
1043
1044        /*
1045         * nf_iterate does not expect change in the skb->dst->dev.
1046         * It looks like it is not fatal to enable this code for hooks
1047         * where our handlers are at the end of the chain list and
1048         * when all next handlers use skb->dst->dev and not outdev.
1049         * It will definitely route properly the inout NAT traffic
1050         * when multiple paths are used.
1051         */
1052
1053        /* For policy routing, packets originating from this
1054         * machine itself may be routed differently to packets
1055         * passing through.  We want this packet to be routed as
1056         * if it came from this machine itself.  So re-compute
1057         * the routing information.
1058         */
1059        if (ip_vs_route_me_harder(af, skb))
1060                goto drop;
1061
1062        IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
1063
1064        ip_vs_out_stats(cp, skb);
1065        ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
1066        skb->ipvs_property = 1;
1067        if (!(cp->flags & IP_VS_CONN_F_NFCT))
1068                ip_vs_notrack(skb);
1069        else
1070                ip_vs_update_conntrack(skb, cp, 0);
1071        ip_vs_conn_put(cp);
1072
1073        LeaveFunction(11);
1074        return NF_ACCEPT;
1075
1076drop:
1077        ip_vs_conn_put(cp);
1078        kfree_skb(skb);
1079        LeaveFunction(11);
1080        return NF_STOLEN;
1081}
1082
1083/*
1084 *      Check if outgoing packet belongs to the established ip_vs_conn.
1085 */
1086static unsigned int
1087ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
1088{
1089        struct net *net = NULL;
1090        struct ip_vs_iphdr iph;
1091        struct ip_vs_protocol *pp;
1092        struct ip_vs_proto_data *pd;
1093        struct ip_vs_conn *cp;
1094
1095        EnterFunction(11);
1096
1097        /* Already marked as IPVS request or reply? */
1098        if (skb->ipvs_property)
1099                return NF_ACCEPT;
1100
1101        /* Bad... Do not break raw sockets */
1102        if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
1103                     af == AF_INET)) {
1104                struct sock *sk = skb->sk;
1105                struct inet_sock *inet = inet_sk(skb->sk);
1106
1107                if (inet && sk->sk_family == PF_INET && inet->nodefrag)
1108                        return NF_ACCEPT;
1109        }
1110
1111        if (unlikely(!skb_dst(skb)))
1112                return NF_ACCEPT;
1113
1114        net = skb_net(skb);
1115        if (!net_ipvs(net)->enable)
1116                return NF_ACCEPT;
1117
1118        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1119#ifdef CONFIG_IP_VS_IPV6
1120        if (af == AF_INET6) {
1121                if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1122                        int related;
1123                        int verdict = ip_vs_out_icmp_v6(skb, &related,
1124                                                        hooknum);
1125
1126                        if (related)
1127                                return verdict;
1128                        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1129                }
1130        } else
1131#endif
1132                if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1133                        int related;
1134                        int verdict = ip_vs_out_icmp(skb, &related, hooknum);
1135
1136                        if (related)
1137                                return verdict;
1138                        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1139                }
1140
1141        pd = ip_vs_proto_data_get(net, iph.protocol);
1142        if (unlikely(!pd))
1143                return NF_ACCEPT;
1144        pp = pd->pp;
1145
1146        /* reassemble IP fragments */
1147#ifdef CONFIG_IP_VS_IPV6
1148        if (af == AF_INET6) {
1149                if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1150                        if (ip_vs_gather_frags_v6(skb,
1151                                                  ip_vs_defrag_user(hooknum)))
1152                                return NF_STOLEN;
1153                }
1154
1155                ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1156        } else
1157#endif
1158                if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
1159                        if (ip_vs_gather_frags(skb,
1160                                               ip_vs_defrag_user(hooknum)))
1161                                return NF_STOLEN;
1162
1163                        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1164                }
1165
1166        /*
1167         * Check if the packet belongs to an existing entry
1168         */
1169        cp = pp->conn_out_get(af, skb, &iph, iph.len, 0);
1170
1171        if (likely(cp))
1172                return handle_response(af, skb, pd, cp, iph.len);
1173        if (sysctl_nat_icmp_send(net) &&
1174            (pp->protocol == IPPROTO_TCP ||
1175             pp->protocol == IPPROTO_UDP ||
1176             pp->protocol == IPPROTO_SCTP)) {
1177                __be16 _ports[2], *pptr;
1178
1179                pptr = skb_header_pointer(skb, iph.len,
1180                                          sizeof(_ports), _ports);
1181                if (pptr == NULL)
1182                        return NF_ACCEPT;       /* Not for me */
1183                if (ip_vs_lookup_real_service(net, af, iph.protocol,
1184                                              &iph.saddr,
1185                                              pptr[0])) {
1186                        /*
1187                         * Notify the real server: there is no
1188                         * existing entry if it is not RST
1189                         * packet or not TCP packet.
1190                         */
1191                        if ((iph.protocol != IPPROTO_TCP &&
1192                             iph.protocol != IPPROTO_SCTP)
1193                             || ((iph.protocol == IPPROTO_TCP
1194                                  && !is_tcp_reset(skb, iph.len))
1195                                 || (iph.protocol == IPPROTO_SCTP
1196                                        && !is_sctp_abort(skb,
1197                                                iph.len)))) {
1198#ifdef CONFIG_IP_VS_IPV6
1199                                if (af == AF_INET6) {
1200                                        struct net *net =
1201                                                dev_net(skb_dst(skb)->dev);
1202
1203                                        if (!skb->dev)
1204                                                skb->dev = net->loopback_dev;
1205                                        icmpv6_send(skb,
1206                                                    ICMPV6_DEST_UNREACH,
1207                                                    ICMPV6_PORT_UNREACH,
1208                                                    0);
1209                                } else
1210#endif
1211                                        icmp_send(skb,
1212                                                  ICMP_DEST_UNREACH,
1213                                                  ICMP_PORT_UNREACH, 0);
1214                                return NF_DROP;
1215                        }
1216                }
1217        }
1218        IP_VS_DBG_PKT(12, af, pp, skb, 0,
1219                      "ip_vs_out: packet continues traversal as normal");
1220        return NF_ACCEPT;
1221}
1222
1223/*
1224 *      It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1225 *      used only for VS/NAT.
1226 *      Check if packet is reply for established ip_vs_conn.
1227 */
1228static unsigned int
1229ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
1230             const struct net_device *in, const struct net_device *out,
1231             int (*okfn)(struct sk_buff *))
1232{
1233        return ip_vs_out(hooknum, skb, AF_INET);
1234}
1235
1236/*
1237 *      It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1238 *      Check if packet is reply for established ip_vs_conn.
1239 */
1240static unsigned int
1241ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
1242                   const struct net_device *in, const struct net_device *out,
1243                   int (*okfn)(struct sk_buff *))
1244{
1245        unsigned int verdict;
1246
1247        /* Disable BH in LOCAL_OUT until all places are fixed */
1248        local_bh_disable();
1249        verdict = ip_vs_out(hooknum, skb, AF_INET);
1250        local_bh_enable();
1251        return verdict;
1252}
1253
1254#ifdef CONFIG_IP_VS_IPV6
1255
1256/*
1257 *      It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1258 *      used only for VS/NAT.
1259 *      Check if packet is reply for established ip_vs_conn.
1260 */
1261static unsigned int
1262ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
1263             const struct net_device *in, const struct net_device *out,
1264             int (*okfn)(struct sk_buff *))
1265{
1266        return ip_vs_out(hooknum, skb, AF_INET6);
1267}
1268
1269/*
1270 *      It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1271 *      Check if packet is reply for established ip_vs_conn.
1272 */
1273static unsigned int
1274ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
1275                   const struct net_device *in, const struct net_device *out,
1276                   int (*okfn)(struct sk_buff *))
1277{
1278        unsigned int verdict;
1279
1280        /* Disable BH in LOCAL_OUT until all places are fixed */
1281        local_bh_disable();
1282        verdict = ip_vs_out(hooknum, skb, AF_INET6);
1283        local_bh_enable();
1284        return verdict;
1285}
1286
1287#endif
1288
1289/*
1290 *      Handle ICMP messages in the outside-to-inside direction (incoming).
1291 *      Find any that might be relevant, check against existing connections,
1292 *      forward to the right destination host if relevant.
1293 *      Currently handles error types - unreachable, quench, ttl exceeded.
1294 */
1295static int
1296ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1297{
1298        struct net *net = NULL;
1299        struct iphdr *iph;
1300        struct icmphdr  _icmph, *ic;
1301        struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
1302        struct ip_vs_iphdr ciph;
1303        struct ip_vs_conn *cp;
1304        struct ip_vs_protocol *pp;
1305        struct ip_vs_proto_data *pd;
1306        unsigned int offset, ihl, verdict;
1307
1308        *related = 1;
1309
1310        /* reassemble IP fragments */
1311        if (ip_is_fragment(ip_hdr(skb))) {
1312                if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
1313                        return NF_STOLEN;
1314        }
1315
1316        iph = ip_hdr(skb);
1317        offset = ihl = iph->ihl * 4;
1318        ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1319        if (ic == NULL)
1320                return NF_DROP;
1321
1322        IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1323                  ic->type, ntohs(icmp_id(ic)),
1324                  &iph->saddr, &iph->daddr);
1325
1326        /*
1327         * Work through seeing if this is for us.
1328         * These checks are supposed to be in an order that means easy
1329         * things are checked first to speed up processing.... however
1330         * this means that some packets will manage to get a long way
1331         * down this stack and then be rejected, but that's life.
1332         */
1333        if ((ic->type != ICMP_DEST_UNREACH) &&
1334            (ic->type != ICMP_SOURCE_QUENCH) &&
1335            (ic->type != ICMP_TIME_EXCEEDED)) {
1336                *related = 0;
1337                return NF_ACCEPT;
1338        }
1339
1340        /* Now find the contained IP header */
1341        offset += sizeof(_icmph);
1342        cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1343        if (cih == NULL)
1344                return NF_ACCEPT; /* The packet looks wrong, ignore */
1345
1346        net = skb_net(skb);
1347
1348        pd = ip_vs_proto_data_get(net, cih->protocol);
1349        if (!pd)
1350                return NF_ACCEPT;
1351        pp = pd->pp;
1352
1353        /* Is the embedded protocol header present? */
1354        if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1355                     pp->dont_defrag))
1356                return NF_ACCEPT;
1357
1358        IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
1359                      "Checking incoming ICMP for");
1360
1361        offset += cih->ihl * 4;
1362
1363        ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1364        /* The embedded headers contain source and dest in reverse order */
1365        cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1);
1366        if (!cp)
1367                return NF_ACCEPT;
1368
1369        verdict = NF_DROP;
1370
1371        /* Ensure the checksum is correct */
1372        if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1373                /* Failed checksum! */
1374                IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1375                          &iph->saddr);
1376                goto out;
1377        }
1378
1379        /* do the statistics and put it back */
1380        ip_vs_in_stats(cp, skb);
1381        if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1382                offset += 2 * sizeof(__u16);
1383        verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum);
1384
1385out:
1386        __ip_vs_conn_put(cp);
1387
1388        return verdict;
1389}
1390
#ifdef CONFIG_IP_VS_IPV6
/*
 *	Incoming (outside-to-inside) ICMPv6 handling.
 *
 *	Only destination unreachable, packet too big and time exceeded are
 *	considered; for any other type *related is cleared and the packet
 *	is accepted.  For the handled types the embedded IPv6 header is
 *	looked up among the existing connections and, on a hit, the packet
 *	is transmitted with ip_vs_icmp_xmit_v6().
 *
 *	@skb:     the ICMPv6 packet (fragments are gathered first)
 *	@related: set to 1 on entry, reset to 0 for unhandled ICMPv6 types
 *	@hooknum: netfilter hook, used for defragmentation and transmission
 *
 *	Returns a netfilter verdict.
 */
static int
ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
{
	struct icmp6hdr icmp_buf, *icmph;
	struct ipv6hdr inner_buf, *inner;	/* header carried inside the ICMP */
	struct ip_vs_iphdr inner_info;
	struct ip_vs_proto_data *pd;
	struct ip_vs_protocol *pp;
	struct ip_vs_conn *cp;
	unsigned int offset, ret;
	struct net *net = NULL;
	struct ipv6hdr *iph;

	*related = 1;

	/* Gather IP fragments before looking any further */
	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT &&
	    ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
		return NF_STOLEN;

	iph = ipv6_hdr(skb);
	offset = sizeof(struct ipv6hdr);
	icmph = skb_header_pointer(skb, offset, sizeof(icmp_buf), &icmp_buf);
	if (!icmph)
		return NF_DROP;

	IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
		  icmph->icmp6_type, ntohs(icmpv6_id(icmph)),
		  &iph->saddr, &iph->daddr);

	/*
	 * Cheap test first: everything but the three error types below
	 * is of no interest here.
	 */
	switch (icmph->icmp6_type) {
	case ICMPV6_DEST_UNREACH:
	case ICMPV6_PKT_TOOBIG:
	case ICMPV6_TIME_EXCEED:
		break;
	default:
		*related = 0;
		return NF_ACCEPT;
	}

	/* Locate the IPv6 header embedded in the ICMPv6 payload */
	offset += sizeof(icmp_buf);
	inner = skb_header_pointer(skb, offset, sizeof(inner_buf), &inner_buf);
	if (!inner)
		return NF_ACCEPT;	/* The packet looks wrong, ignore */

	net = skb_net(skb);
	pd = ip_vs_proto_data_get(net, inner->nexthdr);
	if (!pd)
		return NF_ACCEPT;
	pp = pd->pp;

	/* Is the embedded protocol header present? */
	/* TODO: we don't support fragmentation at the moment anyways */
	if (unlikely(inner->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
		return NF_ACCEPT;

	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
		      "Checking incoming ICMPv6 for");

	offset += sizeof(struct ipv6hdr);

	/* The embedded headers contain source and dest in reverse order */
	ip_vs_fill_iphdr(AF_INET6, inner, &inner_info);
	cp = pp->conn_in_get(AF_INET6, skb, &inner_info, offset, 1);
	if (!cp)
		return NF_ACCEPT;

	/* Account the packet, then send it on */
	ip_vs_in_stats(cp, skb);

	/* Extend the offset over the embedded port pair */
	switch (inner->nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
		offset += 2 * sizeof(__u16);
		break;
	default:
		break;
	}
	ret = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum);

	__ip_vs_conn_put(cp);

	return ret;
}
#endif
1478
1479
1480/*
1481 *      Check if it's for virtual services, look it up,
1482 *      and send it on its way...
1483 */
1484static unsigned int
1485ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
1486{
1487        struct net *net;
1488        struct ip_vs_iphdr iph;
1489        struct ip_vs_protocol *pp;
1490        struct ip_vs_proto_data *pd;
1491        struct ip_vs_conn *cp;
1492        int ret, pkts;
1493        struct netns_ipvs *ipvs;
1494
1495        /* Already marked as IPVS request or reply? */
1496        if (skb->ipvs_property)
1497                return NF_ACCEPT;
1498
1499        /*
1500         *      Big tappo:
1501         *      - remote client: only PACKET_HOST
1502         *      - route: used for struct net when skb->dev is unset
1503         */
1504        if (unlikely((skb->pkt_type != PACKET_HOST &&
1505                      hooknum != NF_INET_LOCAL_OUT) ||
1506                     !skb_dst(skb))) {
1507                ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1508                IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
1509                              " ignored in hook %u\n",
1510                              skb->pkt_type, iph.protocol,
1511                              IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
1512                return NF_ACCEPT;
1513        }
1514        /* ipvs enabled in this netns ? */
1515        net = skb_net(skb);
1516        if (!net_ipvs(net)->enable)
1517                return NF_ACCEPT;
1518
1519        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1520
1521        /* Bad... Do not break raw sockets */
1522        if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
1523                     af == AF_INET)) {
1524                struct sock *sk = skb->sk;
1525                struct inet_sock *inet = inet_sk(skb->sk);
1526
1527                if (inet && sk->sk_family == PF_INET && inet->nodefrag)
1528                        return NF_ACCEPT;
1529        }
1530
1531#ifdef CONFIG_IP_VS_IPV6
1532        if (af == AF_INET6) {
1533                if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1534                        int related;
1535                        int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
1536
1537                        if (related)
1538                                return verdict;
1539                        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1540                }
1541        } else
1542#endif
1543                if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1544                        int related;
1545                        int verdict = ip_vs_in_icmp(skb, &related, hooknum);
1546
1547                        if (related)
1548                                return verdict;
1549                        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1550                }
1551
1552        /* Protocol supported? */
1553        pd = ip_vs_proto_data_get(net, iph.protocol);
1554        if (unlikely(!pd))
1555                return NF_ACCEPT;
1556        pp = pd->pp;
1557        /*
1558         * Check if the packet belongs to an existing connection entry
1559         */
1560        cp = pp->conn_in_get(af, skb, &iph, iph.len, 0);
1561
1562        if (unlikely(!cp)) {
1563                int v;
1564
1565                if (!pp->conn_schedule(af, skb, pd, &v, &cp))
1566                        return v;
1567        }
1568
1569        if (unlikely(!cp)) {
1570                /* sorry, all this trouble for a no-hit :) */
1571                IP_VS_DBG_PKT(12, af, pp, skb, 0,
1572                              "ip_vs_in: packet continues traversal as normal");
1573                return NF_ACCEPT;
1574        }
1575
1576        IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
1577        ipvs = net_ipvs(net);
1578        /* Check the server status */
1579        if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1580                /* the destination server is not available */
1581
1582                if (sysctl_expire_nodest_conn(ipvs)) {
1583                        /* try to expire the connection immediately */
1584                        ip_vs_conn_expire_now(cp);
1585                }
1586                /* don't restart its timer, and silently
1587                   drop the packet. */
1588                __ip_vs_conn_put(cp);
1589                return NF_DROP;
1590        }
1591
1592        ip_vs_in_stats(cp, skb);
1593        ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
1594        if (cp->packet_xmit)
1595                ret = cp->packet_xmit(skb, cp, pp);
1596                /* do not touch skb anymore */
1597        else {
1598                IP_VS_DBG_RL("warning: packet_xmit is null");
1599                ret = NF_ACCEPT;
1600        }
1601
1602        /* Increase its packet counter and check if it is needed
1603         * to be synchronized
1604         *
1605         * Sync connection if it is about to close to
1606         * encorage the standby servers to update the connections timeout
1607         *
1608         * For ONE_PKT let ip_vs_sync_conn() do the filter work.
1609         */
1610
1611        if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
1612                pkts = sysctl_sync_threshold(ipvs);
1613        else
1614                pkts = atomic_add_return(1, &cp->in_pkts);
1615
1616        if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
1617            cp->protocol == IPPROTO_SCTP) {
1618                if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
1619                        (pkts % sysctl_sync_period(ipvs)
1620                         == sysctl_sync_threshold(ipvs))) ||
1621                                (cp->old_state != cp->state &&
1622                                 ((cp->state == IP_VS_SCTP_S_CLOSED) ||
1623                                  (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
1624                                  (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
1625                        ip_vs_sync_conn(net, cp);
1626                        goto out;
1627                }
1628        }
1629
1630        /* Keep this block last: TCP and others with pp->num_states <= 1 */
1631        else if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
1632            (((cp->protocol != IPPROTO_TCP ||
1633               cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1634              (pkts % sysctl_sync_period(ipvs)
1635               == sysctl_sync_threshold(ipvs))) ||
1636             ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
1637              ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
1638               (cp->state == IP_VS_TCP_S_CLOSE) ||
1639               (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
1640               (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
1641                ip_vs_sync_conn(net, cp);
1642out:
1643        cp->old_state = cp->state;
1644
1645        ip_vs_conn_put(cp);
1646        return ret;
1647}
1648
1649/*
1650 *      AF_INET handler in NF_INET_LOCAL_IN chain
1651 *      Schedule and forward packets from remote clients
1652 */
1653static unsigned int
1654ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
1655                      const struct net_device *in,
1656                      const struct net_device *out,
1657                      int (*okfn)(struct sk_buff *))
1658{
1659        return ip_vs_in(hooknum, skb, AF_INET);
1660}
1661
1662/*
1663 *      AF_INET handler in NF_INET_LOCAL_OUT chain
1664 *      Schedule and forward packets from local clients
1665 */
1666static unsigned int
1667ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
1668                     const struct net_device *in, const struct net_device *out,
1669                     int (*okfn)(struct sk_buff *))
1670{
1671        unsigned int verdict;
1672
1673        /* Disable BH in LOCAL_OUT until all places are fixed */
1674        local_bh_disable();
1675        verdict = ip_vs_in(hooknum, skb, AF_INET);
1676        local_bh_enable();
1677        return verdict;
1678}
1679
1680#ifdef CONFIG_IP_VS_IPV6
1681
1682/*
1683 *      AF_INET6 handler in NF_INET_LOCAL_IN chain
1684 *      Schedule and forward packets from remote clients
1685 */
1686static unsigned int
1687ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
1688                      const struct net_device *in,
1689                      const struct net_device *out,
1690                      int (*okfn)(struct sk_buff *))
1691{
1692        return ip_vs_in(hooknum, skb, AF_INET6);
1693}
1694
1695/*
1696 *      AF_INET6 handler in NF_INET_LOCAL_OUT chain
1697 *      Schedule and forward packets from local clients
1698 */
1699static unsigned int
1700ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
1701                     const struct net_device *in, const struct net_device *out,
1702                     int (*okfn)(struct sk_buff *))
1703{
1704        unsigned int verdict;
1705
1706        /* Disable BH in LOCAL_OUT until all places are fixed */
1707        local_bh_disable();
1708        verdict = ip_vs_in(hooknum, skb, AF_INET6);
1709        local_bh_enable();
1710        return verdict;
1711}
1712
1713#endif
1714
1715
1716/*
1717 *      It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1718 *      related packets destined for 0.0.0.0/0.
1719 *      When fwmark-based virtual service is used, such as transparent
1720 *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
1721 *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
1722 *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1723 *      and send them to ip_vs_in_icmp.
1724 */
1725static unsigned int
1726ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1727                   const struct net_device *in, const struct net_device *out,
1728                   int (*okfn)(struct sk_buff *))
1729{
1730        int r;
1731        struct net *net;
1732
1733        if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1734                return NF_ACCEPT;
1735
1736        /* ipvs enabled in this netns ? */
1737        net = skb_net(skb);
1738        if (!net_ipvs(net)->enable)
1739                return NF_ACCEPT;
1740
1741        return ip_vs_in_icmp(skb, &r, hooknum);
1742}
1743
1744#ifdef CONFIG_IP_VS_IPV6
1745static unsigned int
1746ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1747                      const struct net_device *in, const struct net_device *out,
1748                      int (*okfn)(struct sk_buff *))
1749{
1750        int r;
1751        struct net *net;
1752
1753        if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1754                return NF_ACCEPT;
1755
1756        /* ipvs enabled in this netns ? */
1757        net = skb_net(skb);
1758        if (!net_ipvs(net)->enable)
1759                return NF_ACCEPT;
1760
1761        return ip_vs_in_icmp_v6(skb, &r, hooknum);
1762}
1763#endif
1764
1765
1766static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1767        /* After packet filtering, change source only for VS/NAT */
1768        {
1769                .hook           = ip_vs_reply4,
1770                .owner          = THIS_MODULE,
1771                .pf             = PF_INET,
1772                .hooknum        = NF_INET_LOCAL_IN,
1773                .priority       = NF_IP_PRI_NAT_SRC - 2,
1774        },
1775        /* After packet filtering, forward packet through VS/DR, VS/TUN,
1776         * or VS/NAT(change destination), so that filtering rules can be
1777         * applied to IPVS. */
1778        {
1779                .hook           = ip_vs_remote_request4,
1780                .owner          = THIS_MODULE,
1781                .pf             = PF_INET,
1782                .hooknum        = NF_INET_LOCAL_IN,
1783                .priority       = NF_IP_PRI_NAT_SRC - 1,
1784        },
1785        /* Before ip_vs_in, change source only for VS/NAT */
1786        {
1787                .hook           = ip_vs_local_reply4,
1788                .owner          = THIS_MODULE,
1789                .pf             = PF_INET,
1790                .hooknum        = NF_INET_LOCAL_OUT,
1791                .priority       = NF_IP_PRI_NAT_DST + 1,
1792        },
1793        /* After mangle, schedule and forward local requests */
1794        {
1795                .hook           = ip_vs_local_request4,
1796                .owner          = THIS_MODULE,
1797                .pf             = PF_INET,
1798                .hooknum        = NF_INET_LOCAL_OUT,
1799                .priority       = NF_IP_PRI_NAT_DST + 2,
1800        },
1801        /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1802         * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1803        {
1804                .hook           = ip_vs_forward_icmp,
1805                .owner          = THIS_MODULE,
1806                .pf             = PF_INET,
1807                .hooknum        = NF_INET_FORWARD,
1808                .priority       = 99,
1809        },
1810        /* After packet filtering, change source only for VS/NAT */
1811        {
1812                .hook           = ip_vs_reply4,
1813                .owner          = THIS_MODULE,
1814                .pf             = PF_INET,
1815                .hooknum        = NF_INET_FORWARD,
1816                .priority       = 100,
1817        },
1818#ifdef CONFIG_IP_VS_IPV6
1819        /* After packet filtering, change source only for VS/NAT */
1820        {
1821                .hook           = ip_vs_reply6,
1822                .owner          = THIS_MODULE,
1823                .pf             = PF_INET6,
1824                .hooknum        = NF_INET_LOCAL_IN,
1825                .priority       = NF_IP6_PRI_NAT_SRC - 2,
1826        },
1827        /* After packet filtering, forward packet through VS/DR, VS/TUN,
1828         * or VS/NAT(change destination), so that filtering rules can be
1829         * applied to IPVS. */
1830        {
1831                .hook           = ip_vs_remote_request6,
1832                .owner          = THIS_MODULE,
1833                .pf             = PF_INET6,
1834                .hooknum        = NF_INET_LOCAL_IN,
1835                .priority       = NF_IP6_PRI_NAT_SRC - 1,
1836        },
1837        /* Before ip_vs_in, change source only for VS/NAT */
1838        {
1839                .hook           = ip_vs_local_reply6,
1840                .owner          = THIS_MODULE,
1841                .pf             = PF_INET,
1842                .hooknum        = NF_INET_LOCAL_OUT,
1843                .priority       = NF_IP6_PRI_NAT_DST + 1,
1844        },
1845        /* After mangle, schedule and forward local requests */
1846        {
1847                .hook           = ip_vs_local_request6,
1848                .owner          = THIS_MODULE,
1849                .pf             = PF_INET6,
1850                .hooknum        = NF_INET_LOCAL_OUT,
1851                .priority       = NF_IP6_PRI_NAT_DST + 2,
1852        },
1853        /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1854         * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1855        {
1856                .hook           = ip_vs_forward_icmp_v6,
1857                .owner          = THIS_MODULE,
1858                .pf             = PF_INET6,
1859                .hooknum        = NF_INET_FORWARD,
1860                .priority       = 99,
1861        },
1862        /* After packet filtering, change source only for VS/NAT */
1863        {
1864                .hook           = ip_vs_reply6,
1865                .owner          = THIS_MODULE,
1866                .pf             = PF_INET6,
1867                .hooknum        = NF_INET_FORWARD,
1868                .priority       = 100,
1869        },
1870#endif
1871};
1872/*
1873 *      Initialize IP Virtual Server netns mem.
1874 */
1875static int __net_init __ip_vs_init(struct net *net)
1876{
1877        struct netns_ipvs *ipvs;
1878
1879        ipvs = net_generic(net, ip_vs_net_id);
1880        if (ipvs == NULL)
1881                return -ENOMEM;
1882
1883        /* Hold the beast until a service is registerd */
1884        ipvs->enable = 0;
1885        ipvs->net = net;
1886        /* Counters used for creating unique names */
1887        ipvs->gen = atomic_read(&ipvs_netns_cnt);
1888        atomic_inc(&ipvs_netns_cnt);
1889        net->ipvs = ipvs;
1890
1891        if (ip_vs_estimator_net_init(net) < 0)
1892                goto estimator_fail;
1893
1894        if (ip_vs_control_net_init(net) < 0)
1895                goto control_fail;
1896
1897        if (ip_vs_protocol_net_init(net) < 0)
1898                goto protocol_fail;
1899
1900        if (ip_vs_app_net_init(net) < 0)
1901                goto app_fail;
1902
1903        if (ip_vs_conn_net_init(net) < 0)
1904                goto conn_fail;
1905
1906        if (ip_vs_sync_net_init(net) < 0)
1907                goto sync_fail;
1908
1909        printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
1910                         sizeof(struct netns_ipvs), ipvs->gen);
1911        return 0;
1912/*
1913 * Error handling
1914 */
1915
1916sync_fail:
1917        ip_vs_conn_net_cleanup(net);
1918conn_fail:
1919        ip_vs_app_net_cleanup(net);
1920app_fail:
1921        ip_vs_protocol_net_cleanup(net);
1922protocol_fail:
1923        ip_vs_control_net_cleanup(net);
1924control_fail:
1925        ip_vs_estimator_net_cleanup(net);
1926estimator_fail:
1927        return -ENOMEM;
1928}
1929
1930static void __net_exit __ip_vs_cleanup(struct net *net)
1931{
1932        ip_vs_service_net_cleanup(net); /* ip_vs_flush() with locks */
1933        ip_vs_conn_net_cleanup(net);
1934        ip_vs_app_net_cleanup(net);
1935        ip_vs_protocol_net_cleanup(net);
1936        ip_vs_control_net_cleanup(net);
1937        ip_vs_estimator_net_cleanup(net);
1938        IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen);
1939}
1940
1941static void __net_exit __ip_vs_dev_cleanup(struct net *net)
1942{
1943        EnterFunction(2);
1944        net_ipvs(net)->enable = 0;      /* Disable packet reception */
1945        smp_wmb();
1946        ip_vs_sync_net_cleanup(net);
1947        LeaveFunction(2);
1948}
1949
1950static struct pernet_operations ipvs_core_ops = {
1951        .init = __ip_vs_init,
1952        .exit = __ip_vs_cleanup,
1953        .id   = &ip_vs_net_id,
1954        .size = sizeof(struct netns_ipvs),
1955};
1956
1957static struct pernet_operations ipvs_core_dev_ops = {
1958        .exit = __ip_vs_dev_cleanup,
1959};
1960
1961/*
1962 *      Initialize IP Virtual Server
1963 */
1964static int __init ip_vs_init(void)
1965{
1966        int ret;
1967
1968        ret = ip_vs_control_init();
1969        if (ret < 0) {
1970                pr_err("can't setup control.\n");
1971                goto exit;
1972        }
1973
1974        ip_vs_protocol_init();
1975
1976        ret = ip_vs_conn_init();
1977        if (ret < 0) {
1978                pr_err("can't setup connection table.\n");
1979                goto cleanup_protocol;
1980        }
1981
1982        ret = register_pernet_subsys(&ipvs_core_ops);   /* Alloc ip_vs struct */
1983        if (ret < 0)
1984                goto cleanup_conn;
1985
1986        ret = register_pernet_device(&ipvs_core_dev_ops);
1987        if (ret < 0)
1988                goto cleanup_sub;
1989
1990        ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1991        if (ret < 0) {
1992                pr_err("can't register hooks.\n");
1993                goto cleanup_dev;
1994        }
1995
1996        pr_info("ipvs loaded.\n");
1997
1998        return ret;
1999
2000cleanup_dev:
2001        unregister_pernet_device(&ipvs_core_dev_ops);
2002cleanup_sub:
2003        unregister_pernet_subsys(&ipvs_core_ops);
2004cleanup_conn:
2005        ip_vs_conn_cleanup();
2006cleanup_protocol:
2007        ip_vs_protocol_cleanup();
2008        ip_vs_control_cleanup();
2009exit:
2010        return ret;
2011}
2012
2013static void __exit ip_vs_cleanup(void)
2014{
2015        nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
2016        unregister_pernet_device(&ipvs_core_dev_ops);
2017        unregister_pernet_subsys(&ipvs_core_ops);       /* free ip_vs struct */
2018        ip_vs_conn_cleanup();
2019        ip_vs_protocol_cleanup();
2020        ip_vs_control_cleanup();
2021        pr_info("ipvs unloaded.\n");
2022}
2023
2024module_init(ip_vs_init);
2025module_exit(ip_vs_cleanup);
2026MODULE_LICENSE("GPL");
2027