/*
 * net/sched/sch_sfq.c  Stochastic Fairness Queueing discipline.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>


/*      Stochastic Fairness Queuing algorithm.
        =======================================

        Source:
        Paul E. McKenney "Stochastic Fairness Queuing",
        IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.

        Paul E. McKenney "Stochastic Fairness Queuing",
        "Interworking: Research and Experience", v.2, 1991, p.113-131.


        See also:
        M. Shreedhar and George Varghese "Efficient Fair
        Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


        This is not the thing that is usually called (W)FQ nowadays.
        It does not use any timestamp mechanism, but instead
        processes queues in round-robin order.

        ADVANTAGE:

        - It is very cheap. Both CPU and memory requirements are minimal.

        DRAWBACKS:

        - "Stochastic" -> It is not 100% fair.
        When hash collisions occur, several flows are considered as one.

        - "Round-robin" -> It introduces larger delays than virtual clock
        based schemes, and should not be used for isolating interactive
        traffic from non-interactive. This means that this scheduler
        should be used as a leaf of CBQ or P3, which put interactive
        traffic into a higher priority band.

        We still need true WFQ for top level CSZ, but using WFQ
        for the best effort traffic is absolutely pointless:
        SFQ is superior for this purpose.

        IMPLEMENTATION:
        This implementation limits the maximal queue length to 128;
        the maximal mtu to 2^15-1; and the number of hash buckets to 1024.
        The only goal of these restrictions was that all data
        fit into one 4K page :-). Struct sfq_sched_data is
        organized in anti-cache manner: all the data for a bucket
        are scattered over different locations. This is not good,
        but it allowed me to put it into 4K.

        It is easy to increase these values, but not in flight.  */

#define SFQ_DEPTH               128
#define SFQ_HASH_DIVISOR        1024

/* This type should contain at least SFQ_DEPTH*2 values */
typedef unsigned char sfq_index;

struct sfq_head
{
        sfq_index       next;
        sfq_index       prev;
};

struct sfq_sched_data
{
/* Parameters */
        int             perturb_period;
        unsigned        quantum;        /* Allotment per round: MUST BE >= MTU */
        int             limit;

/* Variables */
        struct tcf_proto *filter_list;
        struct timer_list perturb_timer;
        u32             perturbation;
        sfq_index       tail;           /* Index of current slot in round */
        sfq_index       max_depth;      /* Maximal depth */

        sfq_index       ht[SFQ_HASH_DIVISOR];   /* Hash table */
        sfq_index       next[SFQ_DEPTH];        /* Active slots link */
        short           allot[SFQ_DEPTH];       /* Current allotment per slot */
        unsigned short  hash[SFQ_DEPTH];        /* Hash value indexed by slots */
        struct sk_buff_head     qs[SFQ_DEPTH];          /* Slot queue */
        struct sfq_head dep[SFQ_DEPTH*2];       /* Linked list of slots, indexed by depth */
};

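/*
 * Data layout: each hash bucket in ht[] points at one of SFQ_DEPTH slots;
 * a slot holds that flow's packet queue (qs[]), its hash value (hash[])
 * and its remaining byte allotment (allot[]).  The value SFQ_DEPTH serves
 * as a "nil" index: an empty hash bucket, or q->tail when no slot is active.
 * dep[] chains slots into doubly linked lists keyed by their current queue
 * length, with list heads at dep[SFQ_DEPTH + qlen]; sfq_drop() walks the
 * non-empty list of greatest depth (q->max_depth) to find a victim quickly.
 */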
static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
        return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
}

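/*
 * Hash a packet onto one of SFQ_HASH_DIVISOR buckets.  For IPv4 and IPv6
 * the key is built from the addresses, the protocol and, when the header
 * can safely be pulled, the first 32 bits of the transport header (the
 * port pair for TCP/UDP, the SPI for ESP, ...).  Anything else falls back
 * to the dst entry and the socket pointer.  The jhash is salted with
 * q->perturbation, which the perturbation timer re-randomizes to break
 * unlucky collisions.
 */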
static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
        u32 h, h2;

        switch (skb->protocol) {
        case htons(ETH_P_IP):
        {
                const struct iphdr *iph;

                if (!pskb_network_may_pull(skb, sizeof(*iph)))
                        goto err;
                iph = ip_hdr(skb);
                h = (__force u32)iph->daddr;
                h2 = (__force u32)iph->saddr ^ iph->protocol;
                if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
                    (iph->protocol == IPPROTO_TCP ||
                     iph->protocol == IPPROTO_UDP ||
                     iph->protocol == IPPROTO_UDPLITE ||
                     iph->protocol == IPPROTO_SCTP ||
                     iph->protocol == IPPROTO_DCCP ||
                     iph->protocol == IPPROTO_ESP) &&
                     pskb_network_may_pull(skb, iph->ihl * 4 + 4))
                        h2 ^= *(((u32*)iph) + iph->ihl);
                break;
        }
        case htons(ETH_P_IPV6):
        {
                struct ipv6hdr *iph;

                if (!pskb_network_may_pull(skb, sizeof(*iph)))
                        goto err;
                iph = ipv6_hdr(skb);
                h = (__force u32)iph->daddr.s6_addr32[3];
                h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
                if ((iph->nexthdr == IPPROTO_TCP ||
                     iph->nexthdr == IPPROTO_UDP ||
                     iph->nexthdr == IPPROTO_UDPLITE ||
                     iph->nexthdr == IPPROTO_SCTP ||
                     iph->nexthdr == IPPROTO_DCCP ||
                     iph->nexthdr == IPPROTO_ESP) &&
                    pskb_network_may_pull(skb, sizeof(*iph) + 4))
                        h2 ^= *(u32*)&iph[1];
                break;
        }
        default:
err:
                h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol;
                h2 = (unsigned long)skb->sk;
        }

        return sfq_fold_hash(q, h, h2);
}

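/*
 * Map a packet to a hash bucket.  skb->priority may select a bucket
 * directly when it addresses this qdisc; otherwise the attached filters
 * are consulted, and finally sfq_hash() is used.  Returns bucket + 1,
 * or 0 if the packet should be dropped.
 */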
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
                                 int *qerr)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct tcf_result res;
        int result;

        if (TC_H_MAJ(skb->priority) == sch->handle &&
            TC_H_MIN(skb->priority) > 0 &&
            TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
                return TC_H_MIN(skb->priority);

        if (!q->filter_list)
                return sfq_hash(q, skb) + 1;

        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
        result = tc_classify(skb, q->filter_list, &res);
        if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
                switch (result) {
                case TC_ACT_STOLEN:
                case TC_ACT_QUEUED:
                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
                case TC_ACT_SHOT:
                        return 0;
                }
#endif
                if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
                        return TC_H_MIN(res.classid);
        }
        return 0;
}

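/*
 * Insert slot x at the head of the list of slots that currently hold
 * q->qs[x].qlen packets (list head dep[qlen + SFQ_DEPTH]).
 */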
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        int d = q->qs[x].qlen + SFQ_DEPTH;

        p = d;
        n = q->dep[d].next;
        q->dep[x].next = n;
        q->dep[x].prev = p;
        q->dep[p].next = q->dep[n].prev = x;
}

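/*
 * A packet has just been removed from slot x: unlink the slot from its
 * current depth list, lower max_depth if the deepest list became empty,
 * and relink the slot according to its new (smaller) queue length.
 */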
static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;

        n = q->dep[x].next;
        p = q->dep[x].prev;
        q->dep[p].next = n;
        q->dep[n].prev = p;

        if (n == p && q->max_depth == q->qs[x].qlen + 1)
                q->max_depth--;

        sfq_link(q, x);
}

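/*
 * A packet has just been added to slot x: move the slot to the depth
 * list matching its new queue length and raise max_depth if needed.
 */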
static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
        sfq_index p, n;
        int d;

        n = q->dep[x].next;
        p = q->dep[x].prev;
        q->dep[p].next = n;
        q->dep[n].prev = p;
        d = q->qs[x].qlen;
        if (q->max_depth < d)
                q->max_depth = d;

        sfq_link(q, x);
}

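/*
 * The qdisc is over limit: drop one packet from the tail of the deepest
 * slot or, if every active slot holds a single packet, from the slot at
 * the head of the round-robin ring.  Returns the dropped packet's length,
 * or 0 if there was nothing to drop.
 */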
static unsigned int sfq_drop(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index d = q->max_depth;
        struct sk_buff *skb;
        unsigned int len;

        /* Queue is full! Find the longest slot and
           drop a packet from it */

        if (d > 1) {
                sfq_index x = q->dep[d + SFQ_DEPTH].next;
                skb = q->qs[x].prev;
                len = qdisc_pkt_len(skb);
                __skb_unlink(skb, &q->qs[x]);
                kfree_skb(skb);
                sfq_dec(q, x);
                sch->q.qlen--;
                sch->qstats.drops++;
                sch->qstats.backlog -= len;
                return len;
        }

        if (d == 1) {
                /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
                d = q->next[q->tail];
                q->next[q->tail] = q->next[d];
                q->allot[q->next[d]] += q->quantum;
                skb = q->qs[d].prev;
                len = qdisc_pkt_len(skb);
                __skb_unlink(skb, &q->qs[d]);
                kfree_skb(skb);
                sfq_dec(q, d);
                sch->q.qlen--;
                q->ht[q->hash[d]] = SFQ_DEPTH;
                sch->qstats.drops++;
                sch->qstats.backlog -= len;
                return len;
        }

        return 0;
}

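/*
 * Enqueue: classify the packet, append it to the tail of its slot's queue
 * (allocating a free slot for a new flow and linking the flow into the
 * round-robin ring), and if the qdisc is now over its limit drop a packet
 * from the longest slot and report congestion.
 */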
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned int hash;
        sfq_index x;
        int uninitialized_var(ret);

        hash = sfq_classify(skb, sch, &ret);
        if (hash == 0) {
                if (ret & __NET_XMIT_BYPASS)
                        sch->qstats.drops++;
                kfree_skb(skb);
                return ret;
        }
        hash--;

        x = q->ht[hash];
        if (x == SFQ_DEPTH) {
                q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
                q->hash[x] = hash;
        }

        /* If the selected queue has length q->limit, this means that
         * all the other queues are empty and that we do simple tail drop,
         * i.e. drop _this_ packet.
         */
        if (q->qs[x].qlen >= q->limit)
                return qdisc_drop(skb, sch);

        sch->qstats.backlog += qdisc_pkt_len(skb);
        __skb_queue_tail(&q->qs[x], skb);
        sfq_inc(q, x);
        if (q->qs[x].qlen == 1) {               /* The flow is new */
                if (q->tail == SFQ_DEPTH) {     /* It is the first flow */
                        q->tail = x;
                        q->next[x] = x;
                        q->allot[x] = q->quantum;
                } else {
                        q->next[x] = q->next[q->tail];
                        q->next[q->tail] = x;
                        q->tail = x;
                }
        }
        if (++sch->q.qlen <= q->limit) {
                sch->bstats.bytes += qdisc_pkt_len(skb);
                sch->bstats.packets++;
                return NET_XMIT_SUCCESS;
        }

        sfq_drop(sch);
        return NET_XMIT_CN;
}

static struct sk_buff *
sfq_peek(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index a;

        /* No active slots */
        if (q->tail == SFQ_DEPTH)
                return NULL;

        a = q->next[q->tail];
        return skb_peek(&q->qs[a]);
}

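/*
 * Dequeue: take the packet at the head of the slot currently being served.
 * A slot that becomes empty is unlinked from the round-robin ring and its
 * hash bucket released; a slot that exhausts its byte allotment is rotated
 * to the back of the ring.  In both cases the next slot to be served is
 * credited with a fresh quantum.
 */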
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;
        sfq_index a, old_a;

        /* No active slots */
        if (q->tail == SFQ_DEPTH)
                return NULL;

        a = old_a = q->next[q->tail];

        /* Grab packet */
        skb = __skb_dequeue(&q->qs[a]);
        sfq_dec(q, a);
        sch->q.qlen--;
        sch->qstats.backlog -= qdisc_pkt_len(skb);

        /* Is the slot empty? */
        if (q->qs[a].qlen == 0) {
                q->ht[q->hash[a]] = SFQ_DEPTH;
                a = q->next[a];
                if (a == old_a) {
                        q->tail = SFQ_DEPTH;
                        return skb;
                }
                q->next[q->tail] = a;
                q->allot[a] += q->quantum;
        } else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
                q->tail = a;
                a = q->next[a];
                q->allot[a] += q->quantum;
        }
        return skb;
}

static void
sfq_reset(struct Qdisc *sch)
{
        struct sk_buff *skb;

        while ((skb = sfq_dequeue(sch)) != NULL)
                kfree_skb(skb);
}

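/*
 * Timer callback: pick a new random hash perturbation so that colliding
 * flows get separated, and re-arm the timer if a period is configured.
 */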
static void sfq_perturbation(unsigned long arg)
{
        struct Qdisc *sch = (struct Qdisc *)arg;
        struct sfq_sched_data *q = qdisc_priv(sch);

        q->perturbation = net_random();

        if (q->perturb_period)
                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

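/*
 * Apply new parameters from netlink (quantum, perturbation period, limit),
 * dropping packets as needed to honour a reduced limit and restarting the
 * perturbation timer.
 */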
static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        struct tc_sfq_qopt *ctl = nla_data(opt);
        unsigned int qlen;

        if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
                return -EINVAL;

        sch_tree_lock(sch);
        q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
        q->perturb_period = ctl->perturb_period * HZ;
        if (ctl->limit)
                q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);

        qlen = sch->q.qlen;
        while (sch->q.qlen > q->limit)
                sfq_drop(sch);
        qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

        del_timer(&q->perturb_timer);
        if (q->perturb_period) {
                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
                q->perturbation = net_random();
        }
        sch_tree_unlock(sch);
        return 0;
}

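/*
 * Initialise the qdisc: empty hash table, per-slot queues and depth lists,
 * default limit/quantum, then apply any netlink options via sfq_change().
 */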
static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        int i;

        q->perturb_timer.function = sfq_perturbation;
        q->perturb_timer.data = (unsigned long)sch;
        init_timer_deferrable(&q->perturb_timer);

        for (i = 0; i < SFQ_HASH_DIVISOR; i++)
                q->ht[i] = SFQ_DEPTH;

        for (i = 0; i < SFQ_DEPTH; i++) {
                skb_queue_head_init(&q->qs[i]);
                q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
                q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
        }

        q->limit = SFQ_DEPTH - 1;
        q->max_depth = 0;
        q->tail = SFQ_DEPTH;
        if (opt == NULL) {
                q->quantum = psched_mtu(qdisc_dev(sch));
                q->perturb_period = 0;
                q->perturbation = net_random();
        } else {
                int err = sfq_change(sch, opt);
                if (err)
                        return err;
        }

        for (i = 0; i < SFQ_DEPTH; i++)
                sfq_link(q, i);
        return 0;
}

static void sfq_destroy(struct Qdisc *sch)
{
        struct sfq_sched_data *q = qdisc_priv(sch);

        tcf_destroy_chain(&q->filter_list);
        q->perturb_period = 0;
        del_timer_sync(&q->perturb_timer);
}

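/* Dump the current parameters as a TCA_OPTIONS tc_sfq_qopt blob. */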
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned char *b = skb_tail_pointer(skb);
        struct tc_sfq_qopt opt;

        opt.quantum = q->quantum;
        opt.perturb_period = q->perturb_period / HZ;

        opt.limit = q->limit;
        opt.divisor = SFQ_HASH_DIVISOR;
        opt.flows = q->limit;

        NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

        return skb->len;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}

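/*
 * SFQ exposes each hash bucket as a lightweight class so that filters can
 * be attached and per-flow statistics dumped; the buckets cannot be
 * configured or given child qdiscs, hence the stub class operations below.
 */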
static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
        return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
        return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
                              u32 classid)
{
        return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
        struct sfq_sched_data *q = qdisc_priv(sch);

        if (cl)
                return NULL;
        return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
                          struct sk_buff *skb, struct tcmsg *tcm)
{
        tcm->tcm_handle |= TC_H_MIN(cl);
        return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
                                struct gnet_dump *d)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        sfq_index idx = q->ht[cl-1];
        struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
        struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };

        if (gnet_stats_copy_queue(d, &qs) < 0)
                return -1;
        return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
        struct sfq_sched_data *q = qdisc_priv(sch);
        unsigned int i;

        if (arg->stop)
                return;

        for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
                if (q->ht[i] == SFQ_DEPTH ||
                    arg->count < arg->skip) {
                        arg->count++;
                        continue;
                }
                if (arg->fn(sch, i + 1, arg) < 0) {
                        arg->stop = 1;
                        break;
                }
                arg->count++;
        }
}

static const struct Qdisc_class_ops sfq_class_ops = {
        .leaf           =       sfq_leaf,
        .get            =       sfq_get,
        .put            =       sfq_put,
        .tcf_chain      =       sfq_find_tcf,
        .bind_tcf       =       sfq_bind,
        .unbind_tcf     =       sfq_put,
        .dump           =       sfq_dump_class,
        .dump_stats     =       sfq_dump_class_stats,
        .walk           =       sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
        .cl_ops         =       &sfq_class_ops,
        .id             =       "sfq",
        .priv_size      =       sizeof(struct sfq_sched_data),
        .enqueue        =       sfq_enqueue,
        .dequeue        =       sfq_dequeue,
        .peek           =       sfq_peek,
        .drop           =       sfq_drop,
        .init           =       sfq_init,
        .reset          =       sfq_reset,
        .destroy        =       sfq_destroy,
        .change         =       NULL,
        .dump           =       sfq_dump,
        .owner          =       THIS_MODULE,
};

static int __init sfq_module_init(void)
{
        return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
        unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");