/*
 * net/sched/sch_sfq.c  Stochastic Fairness Queueing discipline.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>


/*      Stochastic Fairness Queuing algorithm.
        =======================================

        Source:
        Paul E. McKenney "Stochastic Fairness Queuing",
        IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

        Paul E. McKenney "Stochastic Fairness Queuing",
        "Interworking: Research and Experience", v.2, 1991, p.113-131.


        See also:
        M. Shreedhar and George Varghese "Efficient Fair
        Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


        This is not the thing that is usually called (W)FQ nowadays.
        It does not use any timestamp mechanism, but instead
        processes queues in round-robin order.

        ADVANTAGE:

        - It is very cheap. Both CPU and memory requirements are minimal.

        DRAWBACKS:

        - "Stochastic" -> It is not 100% fair.
        When hash collisions occur, several flows are considered as one.

        - "Round-robin" -> It introduces larger delays than virtual clock
        based schemes, and should not be used for isolating interactive
        traffic from non-interactive traffic. This means that this scheduler
        should be used as a leaf of CBQ or P3, which put interactive traffic
        into the higher priority band.

        We still need true WFQ for the top level CSZ, but using WFQ
        for the best effort traffic is absolutely pointless:
        SFQ is superior for this purpose.

        IMPLEMENTATION:
        This implementation limits the maximal queue length to 128;
        the maximal mtu to 2^15-1; and the number of hash buckets to 1024.
        The only goal of these restrictions was that all data
        fit into one 4K page :-). Struct sfq_sched_data is
        organized in an anti-cache manner: all the data for a bucket
        are scattered over different locations. This is not good,
        but it allowed me to put it into 4K.

        It is easy to increase these values, but not in flight.  */
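
/*      Editorial illustration of the "stochastic" caveat above: with 1024
        buckets and n active flows hashed uniformly, the probability that at
        least two flows share a bucket is roughly the birthday bound

                P(collision) ~= 1 - exp(-n*(n-1)/(2*1024)),

        i.e. already about 0.70 for n = 50 flows. The perturbation below
        does not prevent collisions; it only re-randomizes which flows
        collide, so no pair of flows stays merged indefinitely.  */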

#define SFQ_DEPTH               128
#define SFQ_HASH_DIVISOR        1024

/* This type should contain at least SFQ_DEPTH*2 values */
typedef unsigned char sfq_index;

struct sfq_head
{
        sfq_index       next;
        sfq_index       prev;
};

struct sfq_sched_data
{
/* Parameters */
        int             perturb_period;
        unsigned        quantum;        /* Allotment per round: MUST BE >= MTU */
        int             limit;

/* Variables */
        struct tcf_proto *filter_list;
        struct timer_list perturb_timer;
        u32             perturbation;
        sfq_index       tail;           /* Index of current slot in round */
        sfq_index       max_depth;      /* Maximal depth */

        sfq_index       ht[SFQ_HASH_DIVISOR];   /* Hash table */
        sfq_index       next[SFQ_DEPTH];        /* Active slots link */
        short           allot[SFQ_DEPTH];       /* Current allotment per slot */
        unsigned short  hash[SFQ_DEPTH];        /* Hash value indexed by slots */
        struct sk_buff_head     qs[SFQ_DEPTH];          /* Slot queue */
        struct sfq_head dep[SFQ_DEPTH*2];       /* Linked list of slots, indexed by depth */
};

static inline unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
        return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
}
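
/*      Editorial sketch of the fold above, with illustrative values (byte
        order ignored for brevity); the names mirror the function's
        parameters:

                u32 h  = 0x0a000001;            // daddr 10.0.0.1
                u32 h1 = 0x0a000002 ^ 6;        // saddr 10.0.0.2 ^ IPPROTO_TCP
                u32 bucket = jhash_2words(h, h1, q->perturbation)
                             & (SFQ_HASH_DIVISOR - 1);

        Re-seeding q->perturbation sends the same flow to an unrelated
        bucket; sfq_perturbation() below relies on exactly this.  */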

static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
        u32 h, h2;

        switch (skb->protocol) {
        case htons(ETH_P_IP):
        {
                const struct iphdr *iph;
                int poff;

                if (!pskb_network_may_pull(skb, sizeof(*iph)))
                        goto err;
                iph = ip_hdr(skb);
                h = (__force u32)iph->daddr;
                h2 = (__force u32)iph->saddr ^ iph->protocol;
                if (iph->frag_off & htons(IP_MF|IP_OFFSET))
                        break;
                poff = proto_ports_offset(iph->protocol);
                if (poff >= 0 &&
                    pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
                        iph = ip_hdr(skb);
                        h2 ^= *(u32 *)((void *)iph + iph->ihl * 4 + poff);
                }
                break;
        }
        case htons(ETH_P_IPV6):
        {
                const struct ipv6hdr *iph;
                int poff;

                if (!pskb_network_may_pull(skb, sizeof(*iph)))
                        goto err;
                iph = ipv6_hdr(skb);
                h = (__force u32)iph->daddr.s6_addr32[3];
                h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
                poff = proto_ports_offset(iph->nexthdr);
                if (poff >= 0 &&
                    pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) {
                        iph = ipv6_hdr(skb);
                        h2 ^= *(u32 *)((void *)iph + sizeof(*iph) + poff);
                }
                break;
        }
        default:
err:
                h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol;
                h2 = (unsigned long)skb->sk;
        }

        return sfq_fold_hash(q, h, h2);
}
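
/*      Editorial example of the resulting flow key for TCP over IPv4:
        proto_ports_offset() returns 0 for TCP, so the 32-bit word XORed
        into h2 is the source and destination ports taken together, and the
        bucket is derived from { daddr, saddr, protocol, sport, dport } plus
        the perturbation. Fragments fail the frag_off test above and fall
        back to the pure address/protocol key, so all fragments of one
        datagram still land in the same bucket.  */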

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
                                 int *qerr)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct tcf_result res;
        int result;

        if (TC_H_MAJ(skb->priority) == sch->handle &&
            TC_H_MIN(skb->priority) > 0 &&
            TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
                return TC_H_MIN(skb->priority);

        if (!q->filter_list)
                return sfq_hash(q, skb) + 1;

        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
        result = tc_classify(skb, q->filter_list, &res);
        if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
                switch (result) {
                case TC_ACT_STOLEN:
                case TC_ACT_QUEUED:
                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
                        /* fall through */
                case TC_ACT_SHOT:
                        return 0;
                }
#endif
                if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
                        return TC_H_MIN(res.classid);
        }
        return 0;
}

static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        int d = q->qs[x].qlen + SFQ_DEPTH;

        p = d;
        n = q->dep[d].next;
        q->dep[x].next = n;
        q->dep[x].prev = p;
        q->dep[p].next = q->dep[n].prev = x;
}
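
/*      Editorial note on dep[]: entries 0 .. SFQ_DEPTH-1 are the list nodes
        of the slots themselves, entries SFQ_DEPTH .. 2*SFQ_DEPTH-1 are the
        heads of the per-depth lists, and a slot holding d packets sits on
        the list headed at dep[d + SFQ_DEPTH]. For example, slot 5 with two
        queued packets is linked on dep[130]. Free slots (d == 0) collect on
        dep[SFQ_DEPTH], which is where sfq_enqueue() allocates from, and
        sfq_drop() finds a victim of maximal depth d via
        q->dep[d + SFQ_DEPTH].next.  */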

static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;

        n = q->dep[x].next;
        p = q->dep[x].prev;
        q->dep[p].next = n;
        q->dep[n].prev = p;

        if (n == p && q->max_depth == q->qs[x].qlen + 1)
                q->max_depth--;

        sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        int d;

        n = q->dep[x].next;
        p = q->dep[x].prev;
        q->dep[p].next = n;
        q->dep[n].prev = p;
        d = q->qs[x].qlen;
        if (q->max_depth < d)
                q->max_depth = d;

        sfq_link(q, x);
}

static unsigned int sfq_drop(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index d = q->max_depth;
        struct sk_buff *skb;
        unsigned int len;

        /* Queue is full! Find the longest slot and
           drop a packet from it */

        if (d > 1) {
                sfq_index x = q->dep[d + SFQ_DEPTH].next;
                skb = q->qs[x].prev;
                len = qdisc_pkt_len(skb);
                __skb_unlink(skb, &q->qs[x]);
                kfree_skb(skb);
                sfq_dec(q, x);
                sch->q.qlen--;
                sch->qstats.drops++;
                sch->qstats.backlog -= len;
                return len;
        }

        if (d == 1) {
                /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
                d = q->next[q->tail];
                q->next[q->tail] = q->next[d];
                skb = q->qs[d].prev;
                len = qdisc_pkt_len(skb);
                __skb_unlink(skb, &q->qs[d]);
                kfree_skb(skb);
                sfq_dec(q, d);
                sch->q.qlen--;
                q->ht[q->hash[d]] = SFQ_DEPTH;
                sch->qstats.drops++;
                sch->qstats.backlog -= len;
                return len;
        }

        return 0;
}

static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned int hash;
        sfq_index x;
        int uninitialized_var(ret);

        hash = sfq_classify(skb, sch, &ret);
        if (hash == 0) {
                if (ret & __NET_XMIT_BYPASS)
                        sch->qstats.drops++;
                kfree_skb(skb);
                return ret;
        }
        hash--;

        x = q->ht[hash];
        if (x == SFQ_DEPTH) {
                q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
                q->hash[x] = hash;
        }

        /* If the selected queue has length q->limit, this means that
         * all the other queues are empty and that we do a simple tail drop,
         * i.e. drop _this_ packet.
         */
        if (q->qs[x].qlen >= q->limit)
                return qdisc_drop(skb, sch);

        sch->qstats.backlog += qdisc_pkt_len(skb);
        __skb_queue_tail(&q->qs[x], skb);
        sfq_inc(q, x);
        if (q->qs[x].qlen == 1) {               /* The flow is new */
                if (q->tail == SFQ_DEPTH) {     /* It is the first flow */
                        q->next[x] = x;
                } else {
                        q->next[x] = q->next[q->tail];
                        q->next[q->tail] = x;
                }
                q->tail = x;
                q->allot[x] = q->quantum;
        }
        if (++sch->q.qlen <= q->limit) {
                sch->bstats.bytes += qdisc_pkt_len(skb);
                sch->bstats.packets++;
                return NET_XMIT_SUCCESS;
        }

        sfq_drop(sch);
        return NET_XMIT_CN;
}
 340
 341static struct sk_buff *
 342sfq_peek(struct Qdisc *sch)
 343{
 344        struct sfq_sched_data *q = qdisc_priv(sch);
 345        sfq_index a;
 346
 347        /* No active slots */
 348        if (q->tail == SFQ_DEPTH)
 349                return NULL;
 350
 351        a = q->next[q->tail];
 352        return skb_peek(&q->qs[a]);
 353}
 354
 355static struct sk_buff *
 356sfq_dequeue(struct Qdisc *sch)
 357{
 358        struct sfq_sched_data *q = qdisc_priv(sch);
 359        struct sk_buff *skb;
 360        sfq_index a, next_a;
 361
 362        /* No active slots */
 363        if (q->tail == SFQ_DEPTH)
 364                return NULL;
 365
 366        a = q->next[q->tail];
 367
 368        /* Grab packet */
 369        skb = __skb_dequeue(&q->qs[a]);
 370        sfq_dec(q, a);
 371        sch->q.qlen--;
 372        sch->qstats.backlog -= qdisc_pkt_len(skb);
 373
 374        /* Is the slot empty? */
 375        if (q->qs[a].qlen == 0) {
 376                q->ht[q->hash[a]] = SFQ_DEPTH;
 377                next_a = q->next[a];
 378                if (a == next_a) {
 379                        q->tail = SFQ_DEPTH;
 380                        return skb;
 381                }
 382                q->next[q->tail] = next_a;
 383        } else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
 384                q->allot[a] += q->quantum;
 385                q->tail = a;
 386        }
 387        return skb;
 388}
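
/*      Editorial walk-through of the allotment arithmetic above, with
        illustrative numbers: assume quantum = 1514. A newly active slot
        starts with allot = 1514. Dequeueing a 1500-byte packet leaves
        allot = 14 > 0, so the slot keeps the head position and may send
        again. The next 1500-byte packet drives allot to -1486; the slot is
        recharged by one quantum (allot = 28) and becomes the new tail, so
        every other active slot gets a turn first. Each flow thus averages
        one quantum of bytes per round, which is the deficit round robin
        idea cited in the header comment.  */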

static void
sfq_reset(struct Qdisc *sch)
{
        struct sk_buff *skb;

        while ((skb = sfq_dequeue(sch)) != NULL)
                kfree_skb(skb);
}

static void sfq_perturbation(unsigned long arg)
{
        struct Qdisc *sch = (struct Qdisc *)arg;
        struct sfq_sched_data *q = qdisc_priv(sch);

        q->perturbation = net_random();

        if (q->perturb_period)
                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct tc_sfq_qopt *ctl = nla_data(opt);
        unsigned int qlen;

        if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
                return -EINVAL;

        sch_tree_lock(sch);
        q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
        q->perturb_period = ctl->perturb_period * HZ;
        if (ctl->limit)
                q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);

        qlen = sch->q.qlen;
        while (sch->q.qlen > q->limit)
                sfq_drop(sch);
        qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

        del_timer(&q->perturb_timer);
        if (q->perturb_period) {
                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
                q->perturbation = net_random();
        }
        sch_tree_unlock(sch);
        return 0;
}
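
/*      For reference (editorial note, assuming the iproute2 tc utility),
        the parameters parsed above correspond to a command line such as:

                tc qdisc add dev eth0 root sfq perturb 10 quantum 1514 limit 64

        A quantum of 0 falls back to the device MTU via psched_mtu(),
        perturb is given in seconds and converted to jiffies here, and
        limit is clamped to SFQ_DEPTH - 1 (127) packets.  */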

static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        int i;

        q->perturb_timer.function = sfq_perturbation;
        q->perturb_timer.data = (unsigned long)sch;
        init_timer_deferrable(&q->perturb_timer);

        for (i = 0; i < SFQ_HASH_DIVISOR; i++)
                q->ht[i] = SFQ_DEPTH;

        for (i = 0; i < SFQ_DEPTH; i++) {
                skb_queue_head_init(&q->qs[i]);
                q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
                q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
        }

        q->limit = SFQ_DEPTH - 1;
        q->max_depth = 0;
        q->tail = SFQ_DEPTH;
        if (opt == NULL) {
                q->quantum = psched_mtu(qdisc_dev(sch));
                q->perturb_period = 0;
                q->perturbation = net_random();
        } else {
                int err = sfq_change(sch, opt);
                if (err)
                        return err;
        }

        for (i = 0; i < SFQ_DEPTH; i++)
                sfq_link(q, i);
        return 0;
}

static void sfq_destroy(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);

        tcf_destroy_chain(&q->filter_list);
        q->perturb_period = 0;
        del_timer_sync(&q->perturb_timer);
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned char *b = skb_tail_pointer(skb);
        struct tc_sfq_qopt opt;

        opt.quantum = q->quantum;
        opt.perturb_period = q->perturb_period / HZ;

        opt.limit = q->limit;
        opt.divisor = SFQ_HASH_DIVISOR;
        opt.flows = q->limit;

        NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

        return skb->len;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
        return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
        return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
                              u32 classid)
{
        return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
        struct sfq_sched_data *q = qdisc_priv(sch);

        if (cl)
                return NULL;
        return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
                          struct sk_buff *skb, struct tcmsg *tcm)
{
        tcm->tcm_handle |= TC_H_MIN(cl);
        return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
                                struct gnet_dump *d)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index idx = q->ht[cl - 1];
        struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
        struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };

        if (gnet_stats_copy_queue(d, &qs) < 0)
                return -1;
        return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned int i;

        if (arg->stop)
                return;

        for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
                if (q->ht[i] == SFQ_DEPTH ||
                    arg->count < arg->skip) {
                        arg->count++;
                        continue;
                }
                if (arg->fn(sch, i + 1, arg) < 0) {
                        arg->stop = 1;
                        break;
                }
                arg->count++;
        }
}

static const struct Qdisc_class_ops sfq_class_ops = {
        .leaf           =       sfq_leaf,
        .get            =       sfq_get,
        .put            =       sfq_put,
        .tcf_chain      =       sfq_find_tcf,
        .bind_tcf       =       sfq_bind,
        .unbind_tcf     =       sfq_put,
        .dump           =       sfq_dump_class,
        .dump_stats     =       sfq_dump_class_stats,
        .walk           =       sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
        .cl_ops         =       &sfq_class_ops,
        .id             =       "sfq",
        .priv_size      =       sizeof(struct sfq_sched_data),
        .enqueue        =       sfq_enqueue,
        .dequeue        =       sfq_dequeue,
        .peek           =       sfq_peek,
        .drop           =       sfq_drop,
        .init           =       sfq_init,
        .reset          =       sfq_reset,
        .destroy        =       sfq_destroy,
        .change         =       NULL,
        .dump           =       sfq_dump,
        .owner          =       THIS_MODULE,
};

static int __init sfq_module_init(void)
{
        return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
        unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");