linux/net/sched/sch_sfq.c
<<
>>
Prefs
   1/*
   2 * net/sched/sch_sfq.c  Stochastic Fairness Queueing discipline.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 */
  11
  12#include <linux/module.h>
  13#include <linux/types.h>
  14#include <linux/kernel.h>
  15#include <linux/jiffies.h>
  16#include <linux/string.h>
  17#include <linux/in.h>
  18#include <linux/errno.h>
  19#include <linux/init.h>
  20#include <linux/ipv6.h>
  21#include <linux/skbuff.h>
  22#include <linux/jhash.h>
  23#include <linux/slab.h>
  24#include <net/ip.h>
  25#include <net/netlink.h>
  26#include <net/pkt_sched.h>
  27
  28
  29/*      Stochastic Fairness Queuing algorithm.
  30        =======================================
  31
  32        Source:
  33        Paul E. McKenney "Stochastic Fairness Queuing",
  34        IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
  35
  36        Paul E. McKenney "Stochastic Fairness Queuing",
  37        "Interworking: Research and Experience", v.2, 1991, p.113-131.
  38
  39
  40        See also:
  41        M. Shreedhar and George Varghese "Efficient Fair
  42        Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
  43
  44
  45        This is not the thing that is usually called (W)FQ nowadays.
  46        It does not use any timestamp mechanism, but instead
  47        processes queues in round-robin order.
  48
  49        ADVANTAGE:
  50
  51        - It is very cheap. Both CPU and memory requirements are minimal.
  52
  53        DRAWBACKS:
  54
  55        - "Stochastic" -> It is not 100% fair.
  56        When hash collisions occur, several flows are considered as one.
  57
  58        - "Round-robin" -> It introduces larger delays than virtual clock
  59        based schemes, and should not be used for isolating interactive
  60        traffic from non-interactive. It means, that this scheduler
  61        should be used as leaf of CBQ or P3, which put interactive traffic
  62        to higher priority band.
  63
  64        We still need true WFQ for top level CSZ, but using WFQ
  65        for the best effort traffic is absolutely pointless:
  66        SFQ is superior for this purpose.
  67
  68        IMPLEMENTATION:
  69        This implementation limits maximal queue length to 128;
  70        maximal mtu to 2^15-1; number of hash buckets to 1024.
  71        The only goal of this restrictions was that all data
  72        fit into one 4K page :-). Struct sfq_sched_data is
  73        organized in anti-cache manner: all the data for a bucket
  74        are scattered over different locations. This is not good,
  75        but it allowed me to put it into 4K.
  76
  77        It is easy to increase these values, but not in flight.  */
  78
  79#define SFQ_DEPTH               128
  80#define SFQ_HASH_DIVISOR        1024
  81
  82/* This type should contain at least SFQ_DEPTH*2 values */
  83typedef unsigned char sfq_index;
  84
  85struct sfq_head
  86{
  87        sfq_index       next;
  88        sfq_index       prev;
  89};
  90
  91struct sfq_sched_data
  92{
  93/* Parameters */
  94        int             perturb_period;
  95        unsigned        quantum;        /* Allotment per round: MUST BE >= MTU */
  96        int             limit;
  97
  98/* Variables */
  99        struct tcf_proto *filter_list;
 100        struct timer_list perturb_timer;
 101        u32             perturbation;
 102        sfq_index       tail;           /* Index of current slot in round */
 103        sfq_index       max_depth;      /* Maximal depth */
 104
 105        sfq_index       ht[SFQ_HASH_DIVISOR];   /* Hash table */
 106        sfq_index       next[SFQ_DEPTH];        /* Active slots link */
 107        short           allot[SFQ_DEPTH];       /* Current allotment per slot */
 108        unsigned short  hash[SFQ_DEPTH];        /* Hash value indexed by slots */
 109        struct sk_buff_head     qs[SFQ_DEPTH];          /* Slot queue */
 110        struct sfq_head dep[SFQ_DEPTH*2];       /* Linked list of slots, indexed by depth */
 111};
 112
 113static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
 114{
 115        return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
 116}
 117
 118static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
 119{
 120        u32 h, h2;
 121
 122        switch (skb->protocol) {
 123        case htons(ETH_P_IP):
 124        {
 125                const struct iphdr *iph = ip_hdr(skb);
 126                h = (__force u32)iph->daddr;
 127                h2 = (__force u32)iph->saddr ^ iph->protocol;
 128                if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
 129                    (iph->protocol == IPPROTO_TCP ||
 130                     iph->protocol == IPPROTO_UDP ||
 131                     iph->protocol == IPPROTO_UDPLITE ||
 132                     iph->protocol == IPPROTO_SCTP ||
 133                     iph->protocol == IPPROTO_DCCP ||
 134                     iph->protocol == IPPROTO_ESP))
 135                        h2 ^= *(((u32*)iph) + iph->ihl);
 136                break;
 137        }
 138        case htons(ETH_P_IPV6):
 139        {
 140                struct ipv6hdr *iph = ipv6_hdr(skb);
 141                h = (__force u32)iph->daddr.s6_addr32[3];
 142                h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
 143                if (iph->nexthdr == IPPROTO_TCP ||
 144                    iph->nexthdr == IPPROTO_UDP ||
 145                    iph->nexthdr == IPPROTO_UDPLITE ||
 146                    iph->nexthdr == IPPROTO_SCTP ||
 147                    iph->nexthdr == IPPROTO_DCCP ||
 148                    iph->nexthdr == IPPROTO_ESP)
 149                        h2 ^= *(u32*)&iph[1];
 150                break;
 151        }
 152        default:
 153                h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol;
 154                h2 = (unsigned long)skb->sk;
 155        }
 156
 157        return sfq_fold_hash(q, h, h2);
 158}
 159
 160static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 161                                 int *qerr)
 162{
 163        struct sfq_sched_data *q = qdisc_priv(sch);
 164        struct tcf_result res;
 165        int result;
 166
 167        if (TC_H_MAJ(skb->priority) == sch->handle &&
 168            TC_H_MIN(skb->priority) > 0 &&
 169            TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
 170                return TC_H_MIN(skb->priority);
 171
 172        if (!q->filter_list)
 173                return sfq_hash(q, skb) + 1;
 174
 175        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 176        result = tc_classify(skb, q->filter_list, &res);
 177        if (result >= 0) {
 178#ifdef CONFIG_NET_CLS_ACT
 179                switch (result) {
 180                case TC_ACT_STOLEN:
 181                case TC_ACT_QUEUED:
 182                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 183                case TC_ACT_SHOT:
 184                        return 0;
 185                }
 186#endif
 187                if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
 188                        return TC_H_MIN(res.classid);
 189        }
 190        return 0;
 191}
 192
 193static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
 194{
 195        sfq_index p, n;
 196        int d = q->qs[x].qlen + SFQ_DEPTH;
 197
 198        p = d;
 199        n = q->dep[d].next;
 200        q->dep[x].next = n;
 201        q->dep[x].prev = p;
 202        q->dep[p].next = q->dep[n].prev = x;
 203}
 204
 205static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
 206{
 207        sfq_index p, n;
 208
 209        n = q->dep[x].next;
 210        p = q->dep[x].prev;
 211        q->dep[p].next = n;
 212        q->dep[n].prev = p;
 213
 214        if (n == p && q->max_depth == q->qs[x].qlen + 1)
 215                q->max_depth--;
 216
 217        sfq_link(q, x);
 218}
 219
 220static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
 221{
 222        sfq_index p, n;
 223        int d;
 224
 225        n = q->dep[x].next;
 226        p = q->dep[x].prev;
 227        q->dep[p].next = n;
 228        q->dep[n].prev = p;
 229        d = q->qs[x].qlen;
 230        if (q->max_depth < d)
 231                q->max_depth = d;
 232
 233        sfq_link(q, x);
 234}
 235
 236static unsigned int sfq_drop(struct Qdisc *sch)
 237{
 238        struct sfq_sched_data *q = qdisc_priv(sch);
 239        sfq_index d = q->max_depth;
 240        struct sk_buff *skb;
 241        unsigned int len;
 242
 243        /* Queue is full! Find the longest slot and
 244           drop a packet from it */
 245
 246        if (d > 1) {
 247                sfq_index x = q->dep[d + SFQ_DEPTH].next;
 248                skb = q->qs[x].prev;
 249                len = qdisc_pkt_len(skb);
 250                __skb_unlink(skb, &q->qs[x]);
 251                kfree_skb(skb);
 252                sfq_dec(q, x);
 253                sch->q.qlen--;
 254                sch->qstats.drops++;
 255                sch->qstats.backlog -= len;
 256                return len;
 257        }
 258
 259        if (d == 1) {
 260                /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
 261                d = q->next[q->tail];
 262                q->next[q->tail] = q->next[d];
 263                q->allot[q->next[d]] += q->quantum;
 264                skb = q->qs[d].prev;
 265                len = qdisc_pkt_len(skb);
 266                __skb_unlink(skb, &q->qs[d]);
 267                kfree_skb(skb);
 268                sfq_dec(q, d);
 269                sch->q.qlen--;
 270                q->ht[q->hash[d]] = SFQ_DEPTH;
 271                sch->qstats.drops++;
 272                sch->qstats.backlog -= len;
 273                return len;
 274        }
 275
 276        return 0;
 277}
 278
 279static int
 280sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 281{
 282        struct sfq_sched_data *q = qdisc_priv(sch);
 283        unsigned int hash;
 284        sfq_index x;
 285        int uninitialized_var(ret);
 286
 287        hash = sfq_classify(skb, sch, &ret);
 288        if (hash == 0) {
 289                if (ret & __NET_XMIT_BYPASS)
 290                        sch->qstats.drops++;
 291                kfree_skb(skb);
 292                return ret;
 293        }
 294        hash--;
 295
 296        x = q->ht[hash];
 297        if (x == SFQ_DEPTH) {
 298                q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
 299                q->hash[x] = hash;
 300        }
 301
 302        /* If selected queue has length q->limit, this means that
 303         * all another queues are empty and that we do simple tail drop,
 304         * i.e. drop _this_ packet.
 305         */
 306        if (q->qs[x].qlen >= q->limit)
 307                return qdisc_drop(skb, sch);
 308
 309        sch->qstats.backlog += qdisc_pkt_len(skb);
 310        __skb_queue_tail(&q->qs[x], skb);
 311        sfq_inc(q, x);
 312        if (q->qs[x].qlen == 1) {               /* The flow is new */
 313                if (q->tail == SFQ_DEPTH) {     /* It is the first flow */
 314                        q->tail = x;
 315                        q->next[x] = x;
 316                        q->allot[x] = q->quantum;
 317                } else {
 318                        q->next[x] = q->next[q->tail];
 319                        q->next[q->tail] = x;
 320                        q->tail = x;
 321                }
 322        }
 323        if (++sch->q.qlen <= q->limit) {
 324                sch->bstats.bytes += qdisc_pkt_len(skb);
 325                sch->bstats.packets++;
 326                return 0;
 327        }
 328
 329        sfq_drop(sch);
 330        return NET_XMIT_CN;
 331}
 332
 333static struct sk_buff *
 334sfq_peek(struct Qdisc *sch)
 335{
 336        struct sfq_sched_data *q = qdisc_priv(sch);
 337        sfq_index a;
 338
 339        /* No active slots */
 340        if (q->tail == SFQ_DEPTH)
 341                return NULL;
 342
 343        a = q->next[q->tail];
 344        return skb_peek(&q->qs[a]);
 345}
 346
 347static struct sk_buff *
 348sfq_dequeue(struct Qdisc *sch)
 349{
 350        struct sfq_sched_data *q = qdisc_priv(sch);
 351        struct sk_buff *skb;
 352        sfq_index a, old_a;
 353
 354        /* No active slots */
 355        if (q->tail == SFQ_DEPTH)
 356                return NULL;
 357
 358        a = old_a = q->next[q->tail];
 359
 360        /* Grab packet */
 361        skb = __skb_dequeue(&q->qs[a]);
 362        sfq_dec(q, a);
 363        sch->q.qlen--;
 364        sch->qstats.backlog -= qdisc_pkt_len(skb);
 365
 366        /* Is the slot empty? */
 367        if (q->qs[a].qlen == 0) {
 368                q->ht[q->hash[a]] = SFQ_DEPTH;
 369                a = q->next[a];
 370                if (a == old_a) {
 371                        q->tail = SFQ_DEPTH;
 372                        return skb;
 373                }
 374                q->next[q->tail] = a;
 375                q->allot[a] += q->quantum;
 376        } else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
 377                q->tail = a;
 378                a = q->next[a];
 379                q->allot[a] += q->quantum;
 380        }
 381        return skb;
 382}
 383
 384static void
 385sfq_reset(struct Qdisc *sch)
 386{
 387        struct sk_buff *skb;
 388
 389        while ((skb = sfq_dequeue(sch)) != NULL)
 390                kfree_skb(skb);
 391}
 392
 393static void sfq_perturbation(unsigned long arg)
 394{
 395        struct Qdisc *sch = (struct Qdisc *)arg;
 396        struct sfq_sched_data *q = qdisc_priv(sch);
 397
 398        q->perturbation = net_random();
 399
 400        if (q->perturb_period)
 401                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 402}
 403
 404static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 405{
 406        struct sfq_sched_data *q = qdisc_priv(sch);
 407        struct tc_sfq_qopt *ctl = nla_data(opt);
 408        unsigned int qlen;
 409
 410        if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
 411                return -EINVAL;
 412
 413        sch_tree_lock(sch);
 414        q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
 415        q->perturb_period = ctl->perturb_period * HZ;
 416        if (ctl->limit)
 417                q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);
 418
 419        qlen = sch->q.qlen;
 420        while (sch->q.qlen > q->limit)
 421                sfq_drop(sch);
 422        qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
 423
 424        del_timer(&q->perturb_timer);
 425        if (q->perturb_period) {
 426                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 427                q->perturbation = net_random();
 428        }
 429        sch_tree_unlock(sch);
 430        return 0;
 431}
 432
 433static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 434{
 435        struct sfq_sched_data *q = qdisc_priv(sch);
 436        int i;
 437
 438        q->perturb_timer.function = sfq_perturbation;
 439        q->perturb_timer.data = (unsigned long)sch;
 440        init_timer_deferrable(&q->perturb_timer);
 441
 442        for (i = 0; i < SFQ_HASH_DIVISOR; i++)
 443                q->ht[i] = SFQ_DEPTH;
 444
 445        for (i = 0; i < SFQ_DEPTH; i++) {
 446                skb_queue_head_init(&q->qs[i]);
 447                q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
 448                q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
 449        }
 450
 451        q->limit = SFQ_DEPTH - 1;
 452        q->max_depth = 0;
 453        q->tail = SFQ_DEPTH;
 454        if (opt == NULL) {
 455                q->quantum = psched_mtu(qdisc_dev(sch));
 456                q->perturb_period = 0;
 457                q->perturbation = net_random();
 458        } else {
 459                int err = sfq_change(sch, opt);
 460                if (err)
 461                        return err;
 462        }
 463
 464        for (i = 0; i < SFQ_DEPTH; i++)
 465                sfq_link(q, i);
 466        return 0;
 467}
 468
 469static void sfq_destroy(struct Qdisc *sch)
 470{
 471        struct sfq_sched_data *q = qdisc_priv(sch);
 472
 473        tcf_destroy_chain(&q->filter_list);
 474        q->perturb_period = 0;
 475        del_timer_sync(&q->perturb_timer);
 476}
 477
 478static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 479{
 480        struct sfq_sched_data *q = qdisc_priv(sch);
 481        unsigned char *b = skb_tail_pointer(skb);
 482        struct tc_sfq_qopt opt;
 483
 484        opt.quantum = q->quantum;
 485        opt.perturb_period = q->perturb_period / HZ;
 486
 487        opt.limit = q->limit;
 488        opt.divisor = SFQ_HASH_DIVISOR;
 489        opt.flows = q->limit;
 490
 491        NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
 492
 493        return skb->len;
 494
 495nla_put_failure:
 496        nlmsg_trim(skb, b);
 497        return -1;
 498}
 499
 500static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
 501{
 502        return NULL;
 503}
 504
 505static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
 506{
 507        return 0;
 508}
 509
 510static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
 511                              u32 classid)
 512{
 513        return 0;
 514}
 515
 516static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
 517{
 518        struct sfq_sched_data *q = qdisc_priv(sch);
 519
 520        if (cl)
 521                return NULL;
 522        return &q->filter_list;
 523}
 524
 525static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
 526                          struct sk_buff *skb, struct tcmsg *tcm)
 527{
 528        tcm->tcm_handle |= TC_H_MIN(cl);
 529        return 0;
 530}
 531
 532static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 533                                struct gnet_dump *d)
 534{
 535        struct sfq_sched_data *q = qdisc_priv(sch);
 536        sfq_index idx = q->ht[cl-1];
 537        struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
 538        struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };
 539
 540        if (gnet_stats_copy_queue(d, &qs) < 0)
 541                return -1;
 542        return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
 543}
 544
 545static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 546{
 547        struct sfq_sched_data *q = qdisc_priv(sch);
 548        unsigned int i;
 549
 550        if (arg->stop)
 551                return;
 552
 553        for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
 554                if (q->ht[i] == SFQ_DEPTH ||
 555                    arg->count < arg->skip) {
 556                        arg->count++;
 557                        continue;
 558                }
 559                if (arg->fn(sch, i + 1, arg) < 0) {
 560                        arg->stop = 1;
 561                        break;
 562                }
 563                arg->count++;
 564        }
 565}
 566
 567static const struct Qdisc_class_ops sfq_class_ops = {
 568        .leaf           =       sfq_leaf,
 569        .get            =       sfq_get,
 570        .tcf_chain      =       sfq_find_tcf,
 571        .bind_tcf       =       sfq_bind,
 572        .dump           =       sfq_dump_class,
 573        .dump_stats     =       sfq_dump_class_stats,
 574        .walk           =       sfq_walk,
 575};
 576
 577static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
 578        .cl_ops         =       &sfq_class_ops,
 579        .id             =       "sfq",
 580        .priv_size      =       sizeof(struct sfq_sched_data),
 581        .enqueue        =       sfq_enqueue,
 582        .dequeue        =       sfq_dequeue,
 583        .peek           =       sfq_peek,
 584        .drop           =       sfq_drop,
 585        .init           =       sfq_init,
 586        .reset          =       sfq_reset,
 587        .destroy        =       sfq_destroy,
 588        .change         =       NULL,
 589        .dump           =       sfq_dump,
 590        .owner          =       THIS_MODULE,
 591};
 592
 593static int __init sfq_module_init(void)
 594{
 595        return register_qdisc(&sfq_qdisc_ops);
 596}
 597static void __exit sfq_module_exit(void)
 598{
 599        unregister_qdisc(&sfq_qdisc_ops);
 600}
 601module_init(sfq_module_init)
 602module_exit(sfq_module_exit)
 603MODULE_LICENSE("GPL");
 604
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.