/*
 * net/sched/sch_netem.c        Network emulator
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License.
 *
 *              Many of the algorithms and ideas for this came from
 *              NIST Net which is not copyrighted.
 *
 * Authors:     Stephen Hemminger <shemminger@osdl.org>
 *              Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>

#include <net/netlink.h>
#include <net/pkt_sched.h>

#define VERSION "1.2"

/*      Network Emulation Queuing algorithm.
        ====================================

        Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
                 Network Emulation Tool"
                 [2] Luigi Rizzo, DummyNet for FreeBSD

         ----------------------------------------------------------------

         This started out as a simple way to delay outgoing packets to
         test TCP but has grown to include most of the functionality
         of a full-blown network emulator like NISTnet. It can delay
         packets and add random jitter (and correlation). The random
         distribution can also be loaded from a table to provide
         normal, Pareto, or experimental curves. Packet loss,
         duplication, and reordering can also be emulated.

         This qdisc does not do classification; that can be handled by
         layering other disciplines. It does not need to do bandwidth
         control either, since that can be handled by using token
         bucket or other rate control.
*/

struct netem_sched_data {
        struct Qdisc    *qdisc;         /* inner tfifo child qdisc */
        struct qdisc_watchdog watchdog;

        psched_tdiff_t latency;         /* base delay */
        psched_tdiff_t jitter;          /* random delay variation */

        /* loss, duplicate, reorder, and corrupt are probabilities in
         * 32-bit fixed point: 0 means never, ~0 means always. */
        u32 loss;
        u32 limit;
        u32 counter;                    /* packets since last reorder */
        u32 gap;                        /* reorder every gap-th packet */
        u32 duplicate;
        u32 reorder;
        u32 corrupt;

        struct crndstate {
                u32 last;
                u32 rho;
        } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

        struct disttable {
                u32  size;
                s16 table[0];
        } *delay_dist;
};

/* Time stamp put into socket buffer control block */
struct netem_skb_cb {
        psched_time_t   time_to_send;
};

static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
        BUILD_BUG_ON(sizeof(skb->cb) <
                sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb));
        return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
        state->rho = rho;
        state->last = net_random();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static u32 get_crandom(struct crndstate *state)
{
        u64 value, rho;
        unsigned long answer;

        if (state->rho == 0)    /* no correlation */
                return net_random();

        value = net_random();
        rho = (u64)state->rho + 1;
        answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
        state->last = answer;
        return answer;
}

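/*
 * Illustrative note (not part of the original source): get_crandom()
 * computes a first-order blend of fresh and previous randomness,
 *
 *      answer = (value * (2^32 - rho) + last * rho) / 2^32
 *
 * i.e. a fixed-point weighted average in which rho/2^32 is the
 * correlation coefficient.  For example, with rho = 0x80000000
 * (roughly how a 50% correlation is encoded), each output is about the
 * average of a new uniform sample and the previous output; with
 * rho = 0 it falls through to plain net_random().
 */
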
/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
                                struct crndstate *state,
                                const struct disttable *dist)
{
        psched_tdiff_t x;
        long t;
        u32 rnd;

        if (sigma == 0)
                return mu;

        rnd = get_crandom(state);

        /* default uniform distribution */
        if (dist == NULL)
                return (rnd % (2*sigma)) - sigma + mu;

        t = dist->table[rnd % dist->size];
        x = (sigma % NETEM_DIST_SCALE) * t;
        if (x >= 0)
                x += NETEM_DIST_SCALE/2;
        else
                x -= NETEM_DIST_SCALE/2;

        return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}

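/*
 * Illustrative note (not part of the original source): the table holds
 * samples of the distribution's inverse CDF scaled by NETEM_DIST_SCALE
 * (8192 in linux/pkt_sched.h), so the function above is evaluating
 *
 *      mu + sigma * t / NETEM_DIST_SCALE
 *
 * The split into (sigma % SCALE) and (sigma / SCALE) terms keeps the
 * intermediate product from overflowing, and the +/- SCALE/2 step
 * rounds the division to nearest.  E.g. with mu = 100000, sigma = 10000
 * (100ms +/- 10ms in usec ticks) and a table entry t = -4096, the
 * result is 100000 + 10000 * -4096 / 8192 = 95000.
 */
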
/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *      NET_XMIT_DROP: queue length didn't change.
 *      NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        struct netem_sched_data *q = qdisc_priv(sch);
        /* We don't fill cb now as skb_unshare() may invalidate it */
        struct netem_skb_cb *cb;
        struct sk_buff *skb2;
        int ret;
        int count = 1;

        pr_debug("netem_enqueue skb=%p\n", skb);

        /* Random duplication */
        if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
                ++count;

        /* Random packet drop 0 => none, ~0 => all */
        if (q->loss && q->loss >= get_crandom(&q->loss_cor))
                --count;

        if (count == 0) {
                sch->qstats.drops++;
                kfree_skb(skb);
                return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
        }

        skb_orphan(skb);

        /*
         * If we need to duplicate packet, then re-insert at top of the
         * qdisc tree, since parent queuer expects that only one
         * skb will be queued.
         */
        if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
                struct Qdisc *rootq = qdisc_root(sch);
                u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
                q->duplicate = 0;

                qdisc_enqueue_root(skb2, rootq);
                q->duplicate = dupsave;
        }

        /*
         * Randomized packet corruption.
         * Make a copy if needed since we are modifying it.
         * If packet is going to be hardware checksummed, then
         * do it now in software before we mangle it.
         */
        if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
                if (!(skb = skb_unshare(skb, GFP_ATOMIC))
                    || (skb->ip_summed == CHECKSUM_PARTIAL
                        && skb_checksum_help(skb))) {
                        sch->qstats.drops++;
                        return NET_XMIT_DROP;
                }

                /* flip one random bit of one random byte in the linear header */
                skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
        }

        cb = netem_skb_cb(skb);
        if (q->gap == 0                 /* not doing reordering */
            || q->counter < q->gap      /* inside last reordering gap */
            || q->reorder < get_crandom(&q->reorder_cor)) {
                psched_time_t now;
                psched_tdiff_t delay;

                delay = tabledist(q->latency, q->jitter,
                                  &q->delay_cor, q->delay_dist);

                now = psched_get_time();
                cb->time_to_send = now + delay;
                ++q->counter;
                ret = qdisc_enqueue(skb, q->qdisc);
        } else {
                /*
                 * Do re-ordering by putting one out of N packets at the front
                 * of the queue.
                 */
                cb->time_to_send = psched_get_time();
                q->counter = 0;

                __skb_queue_head(&q->qdisc->q, skb);
                q->qdisc->qstats.backlog += qdisc_pkt_len(skb);
                q->qdisc->qstats.requeues++;
                ret = NET_XMIT_SUCCESS;
        }

        if (likely(ret == NET_XMIT_SUCCESS)) {
                sch->q.qlen++;
                sch->bstats.bytes += qdisc_pkt_len(skb);
                sch->bstats.packets++;
        } else if (net_xmit_drop_count(ret)) {
                sch->qstats.drops++;
        }

        pr_debug("netem: enqueue ret %d\n", ret);
        return ret;
}

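/*
 * Illustrative note (not part of the original source): the probability
 * fields compared against get_crandom() above are 32-bit fixed point,
 * with 0 meaning "never" and ~0 meaning "always".  Userspace (e.g.
 * iproute2's tc) converts a percentage p to roughly p/100 * 2^32, so
 * "25%" arrives as about 0x40000000 and "1%" as about 0x028F5C29.
 */
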
static unsigned int netem_drop(struct Qdisc* sch)
{
        struct netem_sched_data *q = qdisc_priv(sch);
        unsigned int len = 0;

        if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
                sch->q.qlen--;
                sch->qstats.drops++;
        }
        return len;
}

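/*
 * Dequeue one skb whose scheduled send time has arrived.  Peek at the
 * head of the inner tfifo (which is kept sorted by time_to_send); if
 * that time has passed, dequeue it for real, otherwise arm the qdisc
 * watchdog to fire at time_to_send and return nothing for now.
 */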
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
        struct netem_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;

        if (sch->flags & TCQ_F_THROTTLED)
                return NULL;

        skb = q->qdisc->ops->peek(q->qdisc);
        if (skb) {
                const struct netem_skb_cb *cb = netem_skb_cb(skb);
                psched_time_t now = psched_get_time();

                /* is it time to send this packet yet? */
                if (cb->time_to_send <= now) {
                        skb = qdisc_dequeue_peeked(q->qdisc);
                        if (unlikely(!skb))
                                return NULL;

                        pr_debug("netem_dequeue: return skb=%p\n", skb);
                        sch->q.qlen--;
                        return skb;
                }

                qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
        }

        return NULL;
}

static void netem_reset(struct Qdisc *sch)
{
        struct netem_sched_data *q = qdisc_priv(sch);

        qdisc_reset(q->qdisc);
        sch->q.qlen = 0;
        qdisc_watchdog_cancel(&q->watchdog);
}

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
{
        struct netem_sched_data *q = qdisc_priv(sch);
        unsigned long n = nla_len(attr)/sizeof(__s16);
        const __s16 *data = nla_data(attr);
        spinlock_t *root_lock;
        struct disttable *d;
        int i;

        if (n > 65536)
                return -EINVAL;

        d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
        if (!d)
                return -ENOMEM;

        d->size = n;
        for (i = 0; i < n; i++)
                d->table[i] = data[i];

        /* swap in the new table under the qdisc tree lock so that
         * tabledist() never sees a half-initialized table */
        root_lock = qdisc_root_sleeping_lock(sch);

        spin_lock_bh(root_lock);
        kfree(q->delay_dist);
        q->delay_dist = d;
        spin_unlock_bh(root_lock);
        return 0;
}

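/*
 * Illustrative note (not part of the original source): userspace is
 * expected to supply the table; iproute2 ships precomputed inverse-CDF
 * tables (e.g. normal.dist, pareto.dist, paretonormal.dist, typically
 * installed under /usr/lib/tc) that tc loads for a command such as
 * "tc qdisc ... netem delay 100ms 20ms distribution normal".
 */
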
static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
{
        struct netem_sched_data *q = qdisc_priv(sch);
        const struct tc_netem_corr *c = nla_data(attr);

        init_crandom(&q->delay_cor, c->delay_corr);
        init_crandom(&q->loss_cor, c->loss_corr);
        init_crandom(&q->dup_cor, c->dup_corr);
}

static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
{
        struct netem_sched_data *q = qdisc_priv(sch);
        const struct tc_netem_reorder *r = nla_data(attr);

        q->reorder = r->probability;
        init_crandom(&q->reorder_cor, r->correlation);
}

static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
{
        struct netem_sched_data *q = qdisc_priv(sch);
        const struct tc_netem_corrupt *r = nla_data(attr);

        q->corrupt = r->probability;
        init_crandom(&q->corrupt_cor, r->correlation);
}

static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
        [TCA_NETEM_CORR]        = { .len = sizeof(struct tc_netem_corr) },
        [TCA_NETEM_REORDER]     = { .len = sizeof(struct tc_netem_reorder) },
        [TCA_NETEM_CORRUPT]     = { .len = sizeof(struct tc_netem_corrupt) },
};

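/*
 * netem's TCA_OPTIONS payload is, for historical reasons, a struct
 * tc_netem_qopt followed by optional nested attributes, so the generic
 * nested-attribute parsers cannot be used directly.  This helper skips
 * the leading structure (len bytes, aligned) and parses whatever
 * attributes follow it, if any.
 */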
static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
                      const struct nla_policy *policy, int len)
{
        int nested_len = nla_len(nla) - NLA_ALIGN(len);

        if (nested_len < 0)
                return -EINVAL;
        if (nested_len >= nla_attr_size(0))
                return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
                                 nested_len, policy);
        memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
        return 0;
}

/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt)
{
        struct netem_sched_data *q = qdisc_priv(sch);
        struct nlattr *tb[TCA_NETEM_MAX + 1];
        struct tc_netem_qopt *qopt;
        int ret;

        if (opt == NULL)
                return -EINVAL;

        qopt = nla_data(opt);
        ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
        if (ret < 0)
                return ret;

        ret = fifo_set_limit(q->qdisc, qopt->limit);
        if (ret) {
                pr_debug("netem: can't set fifo limit\n");
                return ret;
        }

        q->latency = qopt->latency;
        q->jitter = qopt->jitter;
        q->limit = qopt->limit;
        q->gap = qopt->gap;
        q->counter = 0;
        q->loss = qopt->loss;
        q->duplicate = qopt->duplicate;

        /* For compatibility with earlier versions:
         * if gap is set, need to assume 100% probability.
         */
        if (q->gap)
                q->reorder = ~0;

        if (tb[TCA_NETEM_CORR])
                get_correlation(sch, tb[TCA_NETEM_CORR]);

        if (tb[TCA_NETEM_DELAY_DIST]) {
                ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
                if (ret)
                        return ret;
        }

        if (tb[TCA_NETEM_REORDER])
                get_reorder(sch, tb[TCA_NETEM_REORDER]);

        if (tb[TCA_NETEM_CORRUPT])
                get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);

        return 0;
}

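/*
 * Illustrative note (not part of the original source): these options
 * are normally set from userspace with iproute2, e.g.
 *
 *      tc qdisc add dev eth0 root netem delay 100ms 10ms 25% \
 *              loss 0.1% duplicate 1% corrupt 0.1% \
 *              reorder 25% 50% gap 5
 *
 * which fills in tc_netem_qopt plus the CORR, REORDER, and CORRUPT
 * attributes parsed above.
 */
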
/*
 * Special case version of FIFO queue for use by netem.
 * It queues in order based on timestamps in skbs.
 */
struct fifo_sched_data {
        u32 limit;
        psched_time_t oldest;
};

static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
        struct fifo_sched_data *q = qdisc_priv(sch);
        struct sk_buff_head *list = &sch->q;
        psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
        struct sk_buff *skb;

        if (likely(skb_queue_len(list) < q->limit)) {
                /* Optimize for add at tail */
                if (likely(skb_queue_empty(list) || tnext >= q->oldest)) {
                        q->oldest = tnext;
                        return qdisc_enqueue_tail(nskb, sch);
                }

                /* otherwise walk backwards from the tail to find the
                 * insertion point; later send times cluster at the end */
                skb_queue_reverse_walk(list, skb) {
                        const struct netem_skb_cb *cb = netem_skb_cb(skb);

                        if (tnext >= cb->time_to_send)
                                break;
                }

                __skb_queue_after(list, skb, nskb);

                sch->qstats.backlog += qdisc_pkt_len(nskb);
                sch->bstats.bytes += qdisc_pkt_len(nskb);
                sch->bstats.packets++;

                return NET_XMIT_SUCCESS;
        }

        return qdisc_reshape_fail(nskb, sch);
}

static int tfifo_init(struct Qdisc *sch, struct nlattr *opt)
{
        struct fifo_sched_data *q = qdisc_priv(sch);

        if (opt) {
                struct tc_fifo_qopt *ctl = nla_data(opt);
                if (nla_len(opt) < sizeof(*ctl))
                        return -EINVAL;

                q->limit = ctl->limit;
        } else
                q->limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);

        q->oldest = PSCHED_PASTPERFECT;
        return 0;
}

static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct fifo_sched_data *q = qdisc_priv(sch);
        struct tc_fifo_qopt opt = { .limit = q->limit };

        NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
        return skb->len;

nla_put_failure:
        return -1;
}

static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = {
        .id             =       "tfifo",
        .priv_size      =       sizeof(struct fifo_sched_data),
        .enqueue        =       tfifo_enqueue,
        .dequeue        =       qdisc_dequeue_head,
        .peek           =       qdisc_peek_head,
        .drop           =       qdisc_queue_drop,
        .init           =       tfifo_init,
        .reset          =       qdisc_reset_queue,
        .change         =       tfifo_init,
        .dump           =       tfifo_dump,
};

static int netem_init(struct Qdisc *sch, struct nlattr *opt)
{
        struct netem_sched_data *q = qdisc_priv(sch);
        int ret;

        if (!opt)
                return -EINVAL;

        qdisc_watchdog_init(&q->watchdog, sch);

        q->qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
                                     &tfifo_qdisc_ops,
                                     TC_H_MAKE(sch->handle, 1));
        if (!q->qdisc) {
                pr_debug("netem: qdisc create failed\n");
                return -ENOMEM;
        }

        ret = netem_change(sch, opt);
        if (ret) {
                pr_debug("netem: change failed\n");
                qdisc_destroy(q->qdisc);
        }
        return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
        struct netem_sched_data *q = qdisc_priv(sch);

        qdisc_watchdog_cancel(&q->watchdog);
        qdisc_destroy(q->qdisc);
        kfree(q->delay_dist);
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        const struct netem_sched_data *q = qdisc_priv(sch);
        unsigned char *b = skb_tail_pointer(skb);
        struct nlattr *nla = (struct nlattr *) b;
        struct tc_netem_qopt qopt;
        struct tc_netem_corr cor;
        struct tc_netem_reorder reorder;
        struct tc_netem_corrupt corrupt;

        qopt.latency = q->latency;
        qopt.jitter = q->jitter;
        qopt.limit = q->limit;
        qopt.loss = q->loss;
        qopt.gap = q->gap;
        qopt.duplicate = q->duplicate;
        NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);

        cor.delay_corr = q->delay_cor.rho;
        cor.loss_corr = q->loss_cor.rho;
        cor.dup_corr = q->dup_cor.rho;
        NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);

        reorder.probability = q->reorder;
        reorder.correlation = q->reorder_cor.rho;
        NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);

        corrupt.probability = q->corrupt;
        corrupt.correlation = q->corrupt_cor.rho;
        NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);

        /* fix up the length of TCA_OPTIONS to cover the attributes
         * appended after it; this mirrors the layout parse_attr()
         * expects on the way in */
        nla->nla_len = skb_tail_pointer(skb) - b;

        return skb->len;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}

static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
        .id             =       "netem",
        .priv_size      =       sizeof(struct netem_sched_data),
        .enqueue        =       netem_enqueue,
        .dequeue        =       netem_dequeue,
        .peek           =       qdisc_peek_dequeued,
        .drop           =       netem_drop,
        .init           =       netem_init,
        .reset          =       netem_reset,
        .destroy        =       netem_destroy,
        .change         =       netem_change,
        .dump           =       netem_dump,
        .owner          =       THIS_MODULE,
};


static int __init netem_module_init(void)
{
        pr_info("netem: version " VERSION "\n");
        return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
        unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");