/*
 * net/sched/sch_sfq.c  Stochastic Fairness Queueing discipline.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>


/*      Stochastic Fairness Queuing algorithm.
        =======================================

        Source:
        Paul E. McKenney "Stochastic Fairness Queuing",
        IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

        Paul E. McKenney "Stochastic Fairness Queuing",
        "Interworking: Research and Experience", v.2, 1991, p.113-131.


        See also:
        M. Shreedhar and George Varghese "Efficient Fair
        Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


        This is not the thing that is usually called (W)FQ nowadays.
        It does not use any timestamp mechanism, but instead
        processes queues in round-robin order.

        ADVANTAGE:

        - It is very cheap. Both CPU and memory requirements are minimal.

        DRAWBACKS:

        - "Stochastic" -> It is not 100% fair.
        When hash collisions occur, several flows are considered as one.

        - "Round-robin" -> It introduces larger delays than virtual clock
        based schemes, and should not be used for isolating interactive
        traffic from non-interactive. This means that this scheduler
        should be used as a leaf of CBQ or P3, which put interactive
        traffic into a higher priority band.

        We still need true WFQ for the top level CSZ, but using WFQ
        for best effort traffic is absolutely pointless:
        SFQ is superior for this purpose.

        IMPLEMENTATION:
        This implementation limits the maximal queue length to 128,
        the maximal mtu to 2^15-1, and the number of hash buckets to 1024.
        The only goal of these restrictions was that all data
        fit into one 4K page :-). Struct sfq_sched_data is
        organized in an anti-cache manner: all the data for a bucket
        are scattered over different locations. This is not good,
        but it allowed me to put it into 4K.

        It is easy to increase these values, but not on the fly.  */

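/*      A minimal usage sketch from userspace via the iproute2 "tc" tool
        (the device name "eth0" is only illustrative):

            # attach SFQ as the root qdisc, re-seeding the hash every 10s
            tc qdisc add dev eth0 root sfq perturb 10

            # or as a leaf of a classful qdisc, with an explicit quantum
            tc qdisc add dev eth0 parent 1:10 handle 10: sfq quantum 1514

        "perturb" sets perturb_period and "quantum" sets quantum below.  */
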
#define SFQ_DEPTH               128
#define SFQ_HASH_DIVISOR        1024

/* This type should contain at least SFQ_DEPTH*2 values */
typedef unsigned char sfq_index;

struct sfq_head
{
        sfq_index       next;
        sfq_index       prev;
};

struct sfq_sched_data
{
/* Parameters */
        int             perturb_period;
        unsigned        quantum;        /* Allotment per round: MUST BE >= MTU */
        int             limit;

/* Variables */
        struct tcf_proto *filter_list;
        struct timer_list perturb_timer;
        u32             perturbation;
        sfq_index       tail;           /* Index of current slot in round */
        sfq_index       max_depth;      /* Maximal depth */

        sfq_index       ht[SFQ_HASH_DIVISOR];   /* Hash table */
        sfq_index       next[SFQ_DEPTH];        /* Active slots link */
        short           allot[SFQ_DEPTH];       /* Current allotment per slot */
        unsigned short  hash[SFQ_DEPTH];        /* Hash value indexed by slots */
        struct sk_buff_head     qs[SFQ_DEPTH];          /* Slot queue */
        struct sfq_head dep[SFQ_DEPTH*2];       /* Linked list of slots, indexed by depth */
};

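/* Fold the two per-flow hash words into a bucket index.  The perturbation
 * value seeds jhash, so re-seeding remaps every flow; the final mask
 * relies on SFQ_HASH_DIVISOR being a power of two.
 */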
static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
        return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
}

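/* Build the flow key.  For IPv4 and IPv6 this hashes the addresses and
 * the transport protocol, additionally mixing in the first 32 bits past
 * the network header (the port pair for TCP/UDP-style headers) for a
 * known transport protocol (and, for IPv4, only when the packet is not a
 * fragment); everything else falls back to the route and the socket.
 */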
static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
        u32 h, h2;

        switch (skb->protocol) {
        case htons(ETH_P_IP):
        {
                const struct iphdr *iph = ip_hdr(skb);
                h = iph->daddr;
                h2 = iph->saddr ^ iph->protocol;
                if (!(iph->frag_off & htons(IP_MF | IP_OFFSET)) &&
                    (iph->protocol == IPPROTO_TCP ||
                     iph->protocol == IPPROTO_UDP ||
                     iph->protocol == IPPROTO_UDPLITE ||
                     iph->protocol == IPPROTO_SCTP ||
                     iph->protocol == IPPROTO_DCCP ||
                     iph->protocol == IPPROTO_ESP))
                        h2 ^= *(((u32 *)iph) + iph->ihl);
                break;
        }
        case htons(ETH_P_IPV6):
        {
                struct ipv6hdr *iph = ipv6_hdr(skb);
                h = iph->daddr.s6_addr32[3];
                h2 = iph->saddr.s6_addr32[3] ^ iph->nexthdr;
                if (iph->nexthdr == IPPROTO_TCP ||
                    iph->nexthdr == IPPROTO_UDP ||
                    iph->nexthdr == IPPROTO_UDPLITE ||
                    iph->nexthdr == IPPROTO_SCTP ||
                    iph->nexthdr == IPPROTO_DCCP ||
                    iph->nexthdr == IPPROTO_ESP)
                        h2 ^= *(u32 *)&iph[1];
                break;
        }
        default:
                h = (unsigned long)skb->dst ^ skb->protocol;
                h2 = (unsigned long)skb->sk;
        }

        return sfq_fold_hash(q, h, h2);
}

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
                                 int *qerr)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct tcf_result res;
        int result;

        if (TC_H_MAJ(skb->priority) == sch->handle &&
            TC_H_MIN(skb->priority) > 0 &&
            TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
                return TC_H_MIN(skb->priority);

        if (!q->filter_list)
                return sfq_hash(q, skb) + 1;

        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
        result = tc_classify(skb, q->filter_list, &res);
        if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
                switch (result) {
                case TC_ACT_STOLEN:
                case TC_ACT_QUEUED:
                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
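                        /* fall through: like TC_ACT_SHOT, report no slot */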
                case TC_ACT_SHOT:
                        return 0;
                }
#endif
                if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
                        return TC_H_MIN(res.classid);
        }
        return 0;
}

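/* Slots live on doubly linked lists keyed by queue depth: dep[x] for
 * x < SFQ_DEPTH is slot x's link, while dep[SFQ_DEPTH + d] heads the list
 * of slots whose queue currently holds d packets.  For example, every
 * empty slot sits on the list headed by dep[SFQ_DEPTH], and
 * dep[q->max_depth + SFQ_DEPTH].next is always one of the longest slots,
 * which is how sfq_drop() finds a victim in O(1).
 */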
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        int d = q->qs[x].qlen + SFQ_DEPTH;

        p = d;
        n = q->dep[d].next;
        q->dep[x].next = n;
        q->dep[x].prev = p;
        q->dep[p].next = q->dep[n].prev = x;
}

static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;

        n = q->dep[x].next;
        p = q->dep[x].prev;
        q->dep[p].next = n;
        q->dep[n].prev = p;

        if (n == p && q->max_depth == q->qs[x].qlen + 1)
                q->max_depth--;

        sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        int d;

        n = q->dep[x].next;
        p = q->dep[x].prev;
        q->dep[p].next = n;
        q->dep[n].prev = p;
        d = q->qs[x].qlen;
        if (q->max_depth < d)
                q->max_depth = d;

        sfq_link(q, x);
}

static unsigned int sfq_drop(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index d = q->max_depth;
        struct sk_buff *skb;
        unsigned int len;

        /* Queue is full! Find the longest slot and
           drop a packet from it. */

        if (d > 1) {
                sfq_index x = q->dep[d + SFQ_DEPTH].next;
                skb = q->qs[x].prev;
                len = qdisc_pkt_len(skb);
                __skb_unlink(skb, &q->qs[x]);
                kfree_skb(skb);
                sfq_dec(q, x);
                sch->q.qlen--;
                sch->qstats.drops++;
                sch->qstats.backlog -= len;
                return len;
        }

        if (d == 1) {
                /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
                d = q->next[q->tail];
                q->next[q->tail] = q->next[d];
                q->allot[q->next[d]] += q->quantum;
                skb = q->qs[d].prev;
                len = qdisc_pkt_len(skb);
                __skb_unlink(skb, &q->qs[d]);
                kfree_skb(skb);
                sfq_dec(q, d);
                sch->q.qlen--;
                q->ht[q->hash[d]] = SFQ_DEPTH;
                sch->qstats.drops++;
                sch->qstats.backlog -= len;
                return len;
        }

        return 0;
}

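/* Enqueue: map the packet to a slot (taking a fresh slot from the
 * depth-0 list for a previously unseen hash), tail-drop if that slot is
 * already at the limit, and splice a newly active slot into the circular
 * round-robin list, where it becomes the new tail.
 */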
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned int hash;
        sfq_index x;
        int uninitialized_var(ret);

        hash = sfq_classify(skb, sch, &ret);
        if (hash == 0) {
                if (ret & __NET_XMIT_BYPASS)
                        sch->qstats.drops++;
                kfree_skb(skb);
                return ret;
        }
        hash--;

        x = q->ht[hash];
        if (x == SFQ_DEPTH) {
                q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
                q->hash[x] = hash;
        }

        /* If the selected queue has length q->limit, all other queues
         * are empty and we do simple tail drop, i.e. we drop _this_ packet.
         */
        if (q->qs[x].qlen >= q->limit)
                return qdisc_drop(skb, sch);

        sch->qstats.backlog += qdisc_pkt_len(skb);
        __skb_queue_tail(&q->qs[x], skb);
        sfq_inc(q, x);
        if (q->qs[x].qlen == 1) {               /* The flow is new */
                if (q->tail == SFQ_DEPTH) {     /* It is the first flow */
                        q->tail = x;
                        q->next[x] = x;
                        q->allot[x] = q->quantum;
                } else {
                        q->next[x] = q->next[q->tail];
                        q->next[q->tail] = x;
                        q->tail = x;
                }
        }
        if (++sch->q.qlen <= q->limit) {
                sch->bstats.bytes += qdisc_pkt_len(skb);
                sch->bstats.packets++;
                return 0;
        }

        sfq_drop(sch);
        return NET_XMIT_CN;
}

static struct sk_buff *
sfq_peek(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index a;

        /* No active slots */
        if (q->tail == SFQ_DEPTH)
                return NULL;

        a = q->next[q->tail];
        return skb_peek(&q->qs[a]);
}

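/* Dequeue implements deficit round robin in the style of
 * Shreedhar/Varghese: the slot after q->tail is served until its allot
 * (topped up by one quantum per round) is exhausted by packet lengths,
 * then the turn passes to the next active slot.
 */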
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;
        sfq_index a, old_a;

        /* No active slots */
        if (q->tail == SFQ_DEPTH)
                return NULL;

        a = old_a = q->next[q->tail];

        /* Grab packet */
        skb = __skb_dequeue(&q->qs[a]);
        sfq_dec(q, a);
        sch->q.qlen--;
        sch->qstats.backlog -= qdisc_pkt_len(skb);

        /* Is the slot empty? */
        if (q->qs[a].qlen == 0) {
                q->ht[q->hash[a]] = SFQ_DEPTH;
                a = q->next[a];
                if (a == old_a) {
                        q->tail = SFQ_DEPTH;
                        return skb;
                }
                q->next[q->tail] = a;
                q->allot[a] += q->quantum;
        } else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
                q->tail = a;
                a = q->next[a];
                q->allot[a] += q->quantum;
        }
        return skb;
}

static void
sfq_reset(struct Qdisc *sch)
{
        struct sk_buff *skb;

        while ((skb = sfq_dequeue(sch)) != NULL)
                kfree_skb(skb);
}

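/* Periodically re-seed the hash so that colliding flows do not stay
 * unlucky forever.  Packets already queued keep their old slots, so a
 * flow may briefly be split across two slots after a re-seed.
 */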
static void sfq_perturbation(unsigned long arg)
{
        struct Qdisc *sch = (struct Qdisc *)arg;
        struct sfq_sched_data *q = qdisc_priv(sch);

        q->perturbation = net_random();

        if (q->perturb_period)
                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct tc_sfq_qopt *ctl = nla_data(opt);
        unsigned int qlen;

        if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
                return -EINVAL;

        sch_tree_lock(sch);
        q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
        q->perturb_period = ctl->perturb_period * HZ;
        if (ctl->limit)
                q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);

        qlen = sch->q.qlen;
        while (sch->q.qlen > q->limit)
                sfq_drop(sch);
        qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

        del_timer(&q->perturb_timer);
        if (q->perturb_period) {
                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
                q->perturbation = net_random();
        }
        sch_tree_unlock(sch);
        return 0;
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        int i;

        q->perturb_timer.function = sfq_perturbation;
        q->perturb_timer.data = (unsigned long)sch;
        init_timer_deferrable(&q->perturb_timer);

        for (i = 0; i < SFQ_HASH_DIVISOR; i++)
                q->ht[i] = SFQ_DEPTH;

        for (i = 0; i < SFQ_DEPTH; i++) {
                skb_queue_head_init(&q->qs[i]);
                q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
                q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
        }

        q->limit = SFQ_DEPTH - 1;
        q->max_depth = 0;
        q->tail = SFQ_DEPTH;
        if (opt == NULL) {
                q->quantum = psched_mtu(qdisc_dev(sch));
                q->perturb_period = 0;
                q->perturbation = net_random();
        } else {
                int err = sfq_change(sch, opt);
                if (err)
                        return err;
        }

        for (i = 0; i < SFQ_DEPTH; i++)
                sfq_link(q, i);
        return 0;
}

static void sfq_destroy(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);

        tcf_destroy_chain(&q->filter_list);
        q->perturb_period = 0;
        del_timer_sync(&q->perturb_timer);
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned char *b = skb_tail_pointer(skb);
        struct tc_sfq_qopt opt;

        opt.quantum = q->quantum;
        opt.perturb_period = q->perturb_period / HZ;

        opt.limit = q->limit;
        opt.divisor = SFQ_HASH_DIVISOR;
        opt.flows = q->limit;

        NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

        return skb->len;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}

static int sfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
                            struct nlattr **tca, unsigned long *arg)
{
        return -EOPNOTSUPP;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
        return 0;
}

static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
        struct sfq_sched_data *q = qdisc_priv(sch);

        if (cl)
                return NULL;
        return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
                          struct sk_buff *skb, struct tcmsg *tcm)
{
        tcm->tcm_handle |= TC_H_MIN(cl);
        return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
                                struct gnet_dump *d)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index idx = q->ht[cl - 1];
        struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
        struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };

        if (gnet_stats_copy_queue(d, &qs) < 0)
                return -1;
        return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned int i;

        if (arg->stop)
                return;

        for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
                if (q->ht[i] == SFQ_DEPTH ||
                    arg->count < arg->skip) {
                        arg->count++;
                        continue;
                }
                if (arg->fn(sch, i + 1, arg) < 0) {
                        arg->stop = 1;
                        break;
                }
                arg->count++;
        }
}

static const struct Qdisc_class_ops sfq_class_ops = {
        .get            =       sfq_get,
        .change         =       sfq_change_class,
        .tcf_chain      =       sfq_find_tcf,
        .dump           =       sfq_dump_class,
        .dump_stats     =       sfq_dump_class_stats,
        .walk           =       sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
        .cl_ops         =       &sfq_class_ops,
        .id             =       "sfq",
        .priv_size      =       sizeof(struct sfq_sched_data),
        .enqueue        =       sfq_enqueue,
        .dequeue        =       sfq_dequeue,
        .peek           =       sfq_peek,
        .drop           =       sfq_drop,
        .init           =       sfq_init,
        .reset          =       sfq_reset,
        .destroy        =       sfq_destroy,
        .change         =       NULL,
        .dump           =       sfq_dump,
        .owner          =       THIS_MODULE,
};

static int __init sfq_module_init(void)
{
        return register_qdisc(&sfq_qdisc_ops);
}

static void __exit sfq_module_exit(void)
{
        unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");