linux/net/ipv4/ipvs/ip_vs_xmit.c
<<
>>
Prefs
   1/*
   2 * ip_vs_xmit.c: various packet transmitters for IPVS
   3 *
   4 * Version:     $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $
   5 *
   6 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
   7 *              Julian Anastasov <ja@ssi.bg>
   8 *
   9 *              This program is free software; you can redistribute it and/or
  10 *              modify it under the terms of the GNU General Public License
  11 *              as published by the Free Software Foundation; either version
  12 *              2 of the License, or (at your option) any later version.
  13 *
  14 * Changes:
  15 *
  16 */
  17
  18#include <linux/kernel.h>
  19#include <linux/tcp.h>                  /* for tcphdr */
  20#include <net/ip.h>
  21#include <net/tcp.h>                    /* for csum_tcpudp_magic */
  22#include <net/udp.h>
  23#include <net/icmp.h>                   /* for icmp_send */
  24#include <net/route.h>                  /* for ip_route_output */
  25#include <linux/netfilter.h>
  26#include <linux/netfilter_ipv4.h>
  27
  28#include <net/ip_vs.h>
  29
  30
  31/*
  32 *      Destination cache to speed up outgoing route lookup
  33 */
  34static inline void
  35__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
  36{
  37        struct dst_entry *old_dst;
  38
  39        old_dst = dest->dst_cache;
  40        dest->dst_cache = dst;
  41        dest->dst_rtos = rtos;
  42        dst_release(old_dst);
  43}
  44
  45static inline struct dst_entry *
  46__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
  47{
  48        struct dst_entry *dst = dest->dst_cache;
  49
  50        if (!dst)
  51                return NULL;
  52        if ((dst->obsolete || rtos != dest->dst_rtos) &&
  53            dst->ops->check(dst, cookie) == NULL) {
  54                dest->dst_cache = NULL;
  55                dst_release(dst);
  56                return NULL;
  57        }
  58        dst_hold(dst);
  59        return dst;
  60}
  61
  62static struct rtable *
  63__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
  64{
  65        struct rtable *rt;                      /* Route to the other host */
  66        struct ip_vs_dest *dest = cp->dest;
  67
  68        if (dest) {
  69                spin_lock(&dest->dst_lock);
  70                if (!(rt = (struct rtable *)
  71                      __ip_vs_dst_check(dest, rtos, 0))) {
  72                        struct flowi fl = {
  73                                .oif = 0,
  74                                .nl_u = {
  75                                        .ip4_u = {
  76                                                .daddr = dest->addr,
  77                                                .saddr = 0,
  78                                                .tos = rtos, } },
  79                        };
  80
  81                        if (ip_route_output_key(&init_net, &rt, &fl)) {
  82                                spin_unlock(&dest->dst_lock);
  83                                IP_VS_DBG_RL("ip_route_output error, "
  84                                             "dest: %u.%u.%u.%u\n",
  85                                             NIPQUAD(dest->addr));
  86                                return NULL;
  87                        }
  88                        __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
  89                        IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
  90                                  NIPQUAD(dest->addr),
  91                                  atomic_read(&rt->u.dst.__refcnt), rtos);
  92                }
  93                spin_unlock(&dest->dst_lock);
  94        } else {
  95                struct flowi fl = {
  96                        .oif = 0,
  97                        .nl_u = {
  98                                .ip4_u = {
  99                                        .daddr = cp->daddr,
 100                                        .saddr = 0,
 101                                        .tos = rtos, } },
 102                };
 103
 104                if (ip_route_output_key(&init_net, &rt, &fl)) {
 105                        IP_VS_DBG_RL("ip_route_output error, dest: "
 106                                     "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
 107                        return NULL;
 108                }
 109        }
 110
 111        return rt;
 112}
 113
 114
 115/*
 116 *      Release dest->dst_cache before a dest is removed
 117 */
 118void
 119ip_vs_dst_reset(struct ip_vs_dest *dest)
 120{
 121        struct dst_entry *old_dst;
 122
 123        old_dst = dest->dst_cache;
 124        dest->dst_cache = NULL;
 125        dst_release(old_dst);
 126}
 127
 128#define IP_VS_XMIT(skb, rt)                             \
 129do {                                                    \
 130        (skb)->ipvs_property = 1;                       \
 131        skb_forward_csum(skb);                          \
 132        NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, (skb), NULL,        \
 133                (rt)->u.dst.dev, dst_output);           \
 134} while (0)
 135
 136
 137/*
 138 *      NULL transmitter (do nothing except return NF_ACCEPT)
 139 */
 140int
 141ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 142                struct ip_vs_protocol *pp)
 143{
 144        /* we do not touch skb and do not need pskb ptr */
 145        return NF_ACCEPT;
 146}
 147
 148
 149/*
 150 *      Bypass transmitter
 151 *      Let packets bypass the destination when the destination is not
 152 *      available, it may be only used in transparent cache cluster.
 153 */
 154int
 155ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 156                  struct ip_vs_protocol *pp)
 157{
 158        struct rtable *rt;                      /* Route to the other host */
 159        struct iphdr  *iph = ip_hdr(skb);
 160        u8     tos = iph->tos;
 161        int    mtu;
 162        struct flowi fl = {
 163                .oif = 0,
 164                .nl_u = {
 165                        .ip4_u = {
 166                                .daddr = iph->daddr,
 167                                .saddr = 0,
 168                                .tos = RT_TOS(tos), } },
 169        };
 170
 171        EnterFunction(10);
 172
 173        if (ip_route_output_key(&init_net, &rt, &fl)) {
 174                IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
 175                             "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
 176                goto tx_error_icmp;
 177        }
 178
 179        /* MTU checking */
 180        mtu = dst_mtu(&rt->u.dst);
 181        if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
 182                ip_rt_put(rt);
 183                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
 184                IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
 185                goto tx_error;
 186        }
 187
 188        /*
 189         * Call ip_send_check because we are not sure it is called
 190         * after ip_defrag. Is copy-on-write needed?
 191         */
 192        if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
 193                ip_rt_put(rt);
 194                return NF_STOLEN;
 195        }
 196        ip_send_check(ip_hdr(skb));
 197
 198        /* drop old route */
 199        dst_release(skb->dst);
 200        skb->dst = &rt->u.dst;
 201
 202        /* Another hack: avoid icmp_send in ip_fragment */
 203        skb->local_df = 1;
 204
 205        IP_VS_XMIT(skb, rt);
 206
 207        LeaveFunction(10);
 208        return NF_STOLEN;
 209
 210 tx_error_icmp:
 211        dst_link_failure(skb);
 212 tx_error:
 213        kfree_skb(skb);
 214        LeaveFunction(10);
 215        return NF_STOLEN;
 216}
 217
 218
 219/*
 220 *      NAT transmitter (only for outside-to-inside nat forwarding)
 221 *      Not used for related ICMP
 222 */
 223int
 224ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 225               struct ip_vs_protocol *pp)
 226{
 227        struct rtable *rt;              /* Route to the other host */
 228        int mtu;
 229        struct iphdr *iph = ip_hdr(skb);
 230
 231        EnterFunction(10);
 232
 233        /* check if it is a connection of no-client-port */
 234        if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
 235                __be16 _pt, *p;
 236                p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
 237                if (p == NULL)
 238                        goto tx_error;
 239                ip_vs_conn_fill_cport(cp, *p);
 240                IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 241        }
 242
 243        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
 244                goto tx_error_icmp;
 245
 246        /* MTU checking */
 247        mtu = dst_mtu(&rt->u.dst);
 248        if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
 249                ip_rt_put(rt);
 250                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
 251                IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
 252                goto tx_error;
 253        }
 254
 255        /* copy-on-write the packet before mangling it */
 256        if (!skb_make_writable(skb, sizeof(struct iphdr)))
 257                goto tx_error_put;
 258
 259        if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
 260                goto tx_error_put;
 261
 262        /* drop old route */
 263        dst_release(skb->dst);
 264        skb->dst = &rt->u.dst;
 265
 266        /* mangle the packet */
 267        if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
 268                goto tx_error;
 269        ip_hdr(skb)->daddr = cp->daddr;
 270        ip_send_check(ip_hdr(skb));
 271
 272        IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
 273
 274        /* FIXME: when application helper enlarges the packet and the length
 275           is larger than the MTU of outgoing device, there will be still
 276           MTU problem. */
 277
 278        /* Another hack: avoid icmp_send in ip_fragment */
 279        skb->local_df = 1;
 280
 281        IP_VS_XMIT(skb, rt);
 282
 283        LeaveFunction(10);
 284        return NF_STOLEN;
 285
 286  tx_error_icmp:
 287        dst_link_failure(skb);
 288  tx_error:
 289        LeaveFunction(10);
 290        kfree_skb(skb);
 291        return NF_STOLEN;
 292  tx_error_put:
 293        ip_rt_put(rt);
 294        goto tx_error;
 295}
 296
 297
 298/*
 299 *   IP Tunneling transmitter
 300 *
 301 *   This function encapsulates the packet in a new IP packet, its
 302 *   destination will be set to cp->daddr. Most code of this function
 303 *   is taken from ipip.c.
 304 *
 305 *   It is used in VS/TUN cluster. The load balancer selects a real
 306 *   server from a cluster based on a scheduling algorithm,
 307 *   encapsulates the request packet and forwards it to the selected
 308 *   server. For example, all real servers are configured with
 309 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 310 *   the encapsulated packet, it will decapsulate the packet, processe
 311 *   the request and return the response packets directly to the client
 312 *   without passing the load balancer. This can greatly increase the
 313 *   scalability of virtual server.
 314 *
 315 *   Used for ANY protocol
 316 */
 317int
 318ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 319                  struct ip_vs_protocol *pp)
 320{
 321        struct rtable *rt;                      /* Route to the other host */
 322        struct net_device *tdev;                /* Device to other host */
 323        struct iphdr  *old_iph = ip_hdr(skb);
 324        u8     tos = old_iph->tos;
 325        __be16 df = old_iph->frag_off;
 326        sk_buff_data_t old_transport_header = skb->transport_header;
 327        struct iphdr  *iph;                     /* Our new IP header */
 328        unsigned int max_headroom;              /* The extra header space needed */
 329        int    mtu;
 330
 331        EnterFunction(10);
 332
 333        if (skb->protocol != htons(ETH_P_IP)) {
 334                IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
 335                             "ETH_P_IP: %d, skb protocol: %d\n",
 336                             htons(ETH_P_IP), skb->protocol);
 337                goto tx_error;
 338        }
 339
 340        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
 341                goto tx_error_icmp;
 342
 343        tdev = rt->u.dst.dev;
 344
 345        mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
 346        if (mtu < 68) {
 347                ip_rt_put(rt);
 348                IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
 349                goto tx_error;
 350        }
 351        if (skb->dst)
 352                skb->dst->ops->update_pmtu(skb->dst, mtu);
 353
 354        df |= (old_iph->frag_off & htons(IP_DF));
 355
 356        if ((old_iph->frag_off & htons(IP_DF))
 357            && mtu < ntohs(old_iph->tot_len)) {
 358                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
 359                ip_rt_put(rt);
 360                IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
 361                goto tx_error;
 362        }
 363
 364        /*
 365         * Okay, now see if we can stuff it in the buffer as-is.
 366         */
 367        max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
 368
 369        if (skb_headroom(skb) < max_headroom
 370            || skb_cloned(skb) || skb_shared(skb)) {
 371                struct sk_buff *new_skb =
 372                        skb_realloc_headroom(skb, max_headroom);
 373                if (!new_skb) {
 374                        ip_rt_put(rt);
 375                        kfree_skb(skb);
 376                        IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
 377                        return NF_STOLEN;
 378                }
 379                kfree_skb(skb);
 380                skb = new_skb;
 381                old_iph = ip_hdr(skb);
 382        }
 383
 384        skb->transport_header = old_transport_header;
 385
 386        /* fix old IP header checksum */
 387        ip_send_check(old_iph);
 388
 389        skb_push(skb, sizeof(struct iphdr));
 390        skb_reset_network_header(skb);
 391        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 392
 393        /* drop old route */
 394        dst_release(skb->dst);
 395        skb->dst = &rt->u.dst;
 396
 397        /*
 398         *      Push down and install the IPIP header.
 399         */
 400        iph                     =       ip_hdr(skb);
 401        iph->version            =       4;
 402        iph->ihl                =       sizeof(struct iphdr)>>2;
 403        iph->frag_off           =       df;
 404        iph->protocol           =       IPPROTO_IPIP;
 405        iph->tos                =       tos;
 406        iph->daddr              =       rt->rt_dst;
 407        iph->saddr              =       rt->rt_src;
 408        iph->ttl                =       old_iph->ttl;
 409        ip_select_ident(iph, &rt->u.dst, NULL);
 410
 411        /* Another hack: avoid icmp_send in ip_fragment */
 412        skb->local_df = 1;
 413
 414        ip_local_out(skb);
 415
 416        LeaveFunction(10);
 417
 418        return NF_STOLEN;
 419
 420  tx_error_icmp:
 421        dst_link_failure(skb);
 422  tx_error:
 423        kfree_skb(skb);
 424        LeaveFunction(10);
 425        return NF_STOLEN;
 426}
 427
 428
 429/*
 430 *      Direct Routing transmitter
 431 *      Used for ANY protocol
 432 */
 433int
 434ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 435              struct ip_vs_protocol *pp)
 436{
 437        struct rtable *rt;                      /* Route to the other host */
 438        struct iphdr  *iph = ip_hdr(skb);
 439        int    mtu;
 440
 441        EnterFunction(10);
 442
 443        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
 444                goto tx_error_icmp;
 445
 446        /* MTU checking */
 447        mtu = dst_mtu(&rt->u.dst);
 448        if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
 449                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
 450                ip_rt_put(rt);
 451                IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
 452                goto tx_error;
 453        }
 454
 455        /*
 456         * Call ip_send_check because we are not sure it is called
 457         * after ip_defrag. Is copy-on-write needed?
 458         */
 459        if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
 460                ip_rt_put(rt);
 461                return NF_STOLEN;
 462        }
 463        ip_send_check(ip_hdr(skb));
 464
 465        /* drop old route */
 466        dst_release(skb->dst);
 467        skb->dst = &rt->u.dst;
 468
 469        /* Another hack: avoid icmp_send in ip_fragment */
 470        skb->local_df = 1;
 471
 472        IP_VS_XMIT(skb, rt);
 473
 474        LeaveFunction(10);
 475        return NF_STOLEN;
 476
 477  tx_error_icmp:
 478        dst_link_failure(skb);
 479  tx_error:
 480        kfree_skb(skb);
 481        LeaveFunction(10);
 482        return NF_STOLEN;
 483}
 484
 485
 486/*
 487 *      ICMP packet transmitter
 488 *      called by the ip_vs_in_icmp
 489 */
 490int
 491ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 492                struct ip_vs_protocol *pp, int offset)
 493{
 494        struct rtable   *rt;    /* Route to the other host */
 495        int mtu;
 496        int rc;
 497
 498        EnterFunction(10);
 499
 500        /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
 501           forwarded directly here, because there is no need to
 502           translate address/port back */
 503        if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
 504                if (cp->packet_xmit)
 505                        rc = cp->packet_xmit(skb, cp, pp);
 506                else
 507                        rc = NF_ACCEPT;
 508                /* do not touch skb anymore */
 509                atomic_inc(&cp->in_pkts);
 510                goto out;
 511        }
 512
 513        /*
 514         * mangle and send the packet here (only for VS/NAT)
 515         */
 516
 517        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
 518                goto tx_error_icmp;
 519
 520        /* MTU checking */
 521        mtu = dst_mtu(&rt->u.dst);
 522        if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
 523                ip_rt_put(rt);
 524                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 525                IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
 526                goto tx_error;
 527        }
 528
 529        /* copy-on-write the packet before mangling it */
 530        if (!skb_make_writable(skb, offset))
 531                goto tx_error_put;
 532
 533        if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
 534                goto tx_error_put;
 535
 536        /* drop the old route when skb is not shared */
 537        dst_release(skb->dst);
 538        skb->dst = &rt->u.dst;
 539
 540        ip_vs_nat_icmp(skb, pp, cp, 0);
 541
 542        /* Another hack: avoid icmp_send in ip_fragment */
 543        skb->local_df = 1;
 544
 545        IP_VS_XMIT(skb, rt);
 546
 547        rc = NF_STOLEN;
 548        goto out;
 549
 550  tx_error_icmp:
 551        dst_link_failure(skb);
 552  tx_error:
 553        dev_kfree_skb(skb);
 554        rc = NF_STOLEN;
 555  out:
 556        LeaveFunction(10);
 557        return rc;
 558  tx_error_put:
 559        ip_rt_put(rt);
 560        goto tx_error;
 561}
 562
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.