linux/net/sched/sch_api.c
<<
>>
Prefs
   1/*
   2 * net/sched/sch_api.c  Packet scheduler API.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Fixes:
  12 *
  13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  16 */
  17
  18#include <linux/module.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/string.h>
  22#include <linux/errno.h>
  23#include <linux/skbuff.h>
  24#include <linux/init.h>
  25#include <linux/proc_fs.h>
  26#include <linux/seq_file.h>
  27#include <linux/kmod.h>
  28#include <linux/list.h>
  29#include <linux/hrtimer.h>
  30#include <linux/lockdep.h>
  31#include <linux/slab.h>
  32
  33#include <net/net_namespace.h>
  34#include <net/sock.h>
  35#include <net/netlink.h>
  36#include <net/pkt_sched.h>
  37
  38static int qdisc_notify(struct net *net, struct sk_buff *oskb,
  39                        struct nlmsghdr *n, u32 clid,
  40                        struct Qdisc *old, struct Qdisc *new);
  41static int tclass_notify(struct net *net, struct sk_buff *oskb,
  42                         struct nlmsghdr *n, struct Qdisc *q,
  43                         unsigned long cl, int event);
  44
  45/*
  46
  47   Short review.
  48   -------------
  49
  50   This file consists of two interrelated parts:
  51
  52   1. queueing disciplines manager frontend.
  53   2. traffic classes manager frontend.
  54
  55   Generally, queueing discipline ("qdisc") is a black box,
  56   which is able to enqueue packets and to dequeue them (when
  57   device is ready to send something) in order and at times
  58   determined by algorithm hidden in it.
  59
   60   qdiscs are divided into two categories:
  61   - "queues", which have no internal structure visible from outside.
  62   - "schedulers", which split all the packets to "traffic classes",
  63     using "packet classifiers" (look at cls_api.c)
  64
   65   In turn, classes may have child qdiscs (as a rule, queues)
   66   attached to them, and so on.
  67
   68   The goal of the routines in this file is to translate the
   69   information supplied by the user in the form of handles into
   70   a form the kernel can work with, to perform sanity checks and
   71   the part of the work that is common to all qdiscs, and to
   72   provide rtnetlink notifications.
  73
  74   All real intelligent work is done inside qdisc modules.
  75
  76
  77
  78   Every discipline has two major routines: enqueue and dequeue.
  79
  80   ---dequeue
  81
  82   dequeue usually returns a skb to send. It is allowed to return NULL,
  83   but it does not mean that queue is empty, it just means that
  84   discipline does not want to send anything this time.
  85   Queue is really empty if q->q.qlen == 0.
   86   For complicated disciplines with multiple queues, q->q is not a
   87   real packet queue; nevertheless, q->q.qlen must be valid.
  88
  89   ---enqueue
  90
   91   enqueue returns 0 if the packet was enqueued successfully.
   92   If a packet (this one or another one) was dropped, it returns
   93   a non-zero error code.
  94   NET_XMIT_DROP        - this packet dropped
  95     Expected action: do not backoff, but wait until queue will clear.
  96   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  97     Expected action: backoff or ignore
  98   NET_XMIT_POLICED     - dropped by police.
  99     Expected action: backoff or error to real-time apps.
 100
 101   Auxiliary routines:
 102
 103   ---peek
 104
 105   like dequeue but without removing a packet from the queue
 106
 107   ---reset
 108
 109   returns qdisc to initial state: purge all buffers, clear all
 110   timers, counters (except for statistics) etc.
 111
 112   ---init
 113
 114   initializes newly created qdisc.
 115
 116   ---destroy
 117
 118   destroys resources allocated by init and during lifetime of qdisc.
 119
 120   ---change
 121
 122   changes qdisc parameters.
 123 */
 124
 125/* Protects list of registered TC modules. It is pure SMP lock. */
 126static DEFINE_RWLOCK(qdisc_mod_lock);
 127
 128
 129/************************************************
 130 *      Queueing disciplines manipulation.      *
 131 ************************************************/
 132
 133
 134/* The list of all installed queueing disciplines. */
 135
 136static struct Qdisc_ops *qdisc_base;
 137
 138/* Register/uregister queueing discipline */
 139
 140int register_qdisc(struct Qdisc_ops *qops)
 141{
 142        struct Qdisc_ops *q, **qp;
 143        int rc = -EEXIST;
 144
 145        write_lock(&qdisc_mod_lock);
 146        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 147                if (!strcmp(qops->id, q->id))
 148                        goto out;
 149
 150        if (qops->enqueue == NULL)
 151                qops->enqueue = noop_qdisc_ops.enqueue;
 152        if (qops->peek == NULL) {
 153                if (qops->dequeue == NULL)
 154                        qops->peek = noop_qdisc_ops.peek;
 155                else
 156                        goto out_einval;
 157        }
 158        if (qops->dequeue == NULL)
 159                qops->dequeue = noop_qdisc_ops.dequeue;
 160
 161        if (qops->cl_ops) {
 162                const struct Qdisc_class_ops *cops = qops->cl_ops;
 163
 164                if (!(cops->get && cops->put && cops->walk && cops->leaf))
 165                        goto out_einval;
 166
 167                if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
 168                        goto out_einval;
 169        }
 170
 171        qops->next = NULL;
 172        *qp = qops;
 173        rc = 0;
 174out:
 175        write_unlock(&qdisc_mod_lock);
 176        return rc;
 177
 178out_einval:
 179        rc = -EINVAL;
 180        goto out;
 181}
 182EXPORT_SYMBOL(register_qdisc);
 183
 184int unregister_qdisc(struct Qdisc_ops *qops)
 185{
 186        struct Qdisc_ops *q, **qp;
 187        int err = -ENOENT;
 188
 189        write_lock(&qdisc_mod_lock);
 190        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 191                if (q == qops)
 192                        break;
 193        if (q) {
 194                *qp = q->next;
 195                q->next = NULL;
 196                err = 0;
 197        }
 198        write_unlock(&qdisc_mod_lock);
 199        return err;
 200}
 201EXPORT_SYMBOL(unregister_qdisc);
 202
 203/* We know handle. Find qdisc among all qdisc's attached to device
 204   (root qdisc, all its children, children of children etc.)
 205 */
 206
 207static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 208{
 209        struct Qdisc *q;
 210
 211        if (!(root->flags & TCQ_F_BUILTIN) &&
 212            root->handle == handle)
 213                return root;
 214
 215        list_for_each_entry(q, &root->list, list) {
 216                if (q->handle == handle)
 217                        return q;
 218        }
 219        return NULL;
 220}
 221
 222static void qdisc_list_add(struct Qdisc *q)
 223{
 224        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 225                list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
 226}
 227
 228void qdisc_list_del(struct Qdisc *q)
 229{
 230        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 231                list_del(&q->list);
 232}
 233EXPORT_SYMBOL(qdisc_list_del);
 234
 235struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 236{
 237        struct Qdisc *q;
 238
 239        q = qdisc_match_from_root(dev->qdisc, handle);
 240        if (q)
 241                goto out;
 242
 243        if (dev_ingress_queue(dev))
 244                q = qdisc_match_from_root(
 245                        dev_ingress_queue(dev)->qdisc_sleeping,
 246                        handle);
 247out:
 248        return q;
 249}
 250
 251static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 252{
 253        unsigned long cl;
 254        struct Qdisc *leaf;
 255        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 256
 257        if (cops == NULL)
 258                return NULL;
 259        cl = cops->get(p, classid);
 260
 261        if (cl == 0)
 262                return NULL;
 263        leaf = cops->leaf(p, cl);
 264        cops->put(p, cl);
 265        return leaf;
 266}
 267
 268/* Find queueing discipline by name */
 269
 270static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 271{
 272        struct Qdisc_ops *q = NULL;
 273
 274        if (kind) {
 275                read_lock(&qdisc_mod_lock);
 276                for (q = qdisc_base; q; q = q->next) {
 277                        if (nla_strcmp(kind, q->id) == 0) {
 278                                if (!try_module_get(q->owner))
 279                                        q = NULL;
 280                                break;
 281                        }
 282                }
 283                read_unlock(&qdisc_mod_lock);
 284        }
 285        return q;
 286}
 287
 288static struct qdisc_rate_table *qdisc_rtab_list;
 289
 290struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
 291{
 292        struct qdisc_rate_table *rtab;
 293
 294        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 295                if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
 296                        rtab->refcnt++;
 297                        return rtab;
 298                }
 299        }
 300
 301        if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
 302            nla_len(tab) != TC_RTAB_SIZE)
 303                return NULL;
 304
 305        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 306        if (rtab) {
 307                rtab->rate = *r;
 308                rtab->refcnt = 1;
 309                memcpy(rtab->data, nla_data(tab), 1024);
 310                rtab->next = qdisc_rtab_list;
 311                qdisc_rtab_list = rtab;
 312        }
 313        return rtab;
 314}
 315EXPORT_SYMBOL(qdisc_get_rtab);
 316
 317void qdisc_put_rtab(struct qdisc_rate_table *tab)
 318{
 319        struct qdisc_rate_table *rtab, **rtabp;
 320
 321        if (!tab || --tab->refcnt)
 322                return;
 323
 324        for (rtabp = &qdisc_rtab_list;
 325             (rtab = *rtabp) != NULL;
 326             rtabp = &rtab->next) {
 327                if (rtab == tab) {
 328                        *rtabp = rtab->next;
 329                        kfree(rtab);
 330                        return;
 331                }
 332        }
 333}
 334EXPORT_SYMBOL(qdisc_put_rtab);
 335
 336static LIST_HEAD(qdisc_stab_list);
 337static DEFINE_SPINLOCK(qdisc_stab_lock);
 338
 339static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
 340        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
 341        [TCA_STAB_DATA] = { .type = NLA_BINARY },
 342};
 343
 344static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 345{
 346        struct nlattr *tb[TCA_STAB_MAX + 1];
 347        struct qdisc_size_table *stab;
 348        struct tc_sizespec *s;
 349        unsigned int tsize = 0;
 350        u16 *tab = NULL;
 351        int err;
 352
 353        err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
 354        if (err < 0)
 355                return ERR_PTR(err);
 356        if (!tb[TCA_STAB_BASE])
 357                return ERR_PTR(-EINVAL);
 358
 359        s = nla_data(tb[TCA_STAB_BASE]);
 360
 361        if (s->tsize > 0) {
 362                if (!tb[TCA_STAB_DATA])
 363                        return ERR_PTR(-EINVAL);
 364                tab = nla_data(tb[TCA_STAB_DATA]);
 365                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 366        }
 367
 368        if (tsize != s->tsize || (!tab && tsize > 0))
 369                return ERR_PTR(-EINVAL);
 370
 371        spin_lock(&qdisc_stab_lock);
 372
 373        list_for_each_entry(stab, &qdisc_stab_list, list) {
 374                if (memcmp(&stab->szopts, s, sizeof(*s)))
 375                        continue;
 376                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 377                        continue;
 378                stab->refcnt++;
 379                spin_unlock(&qdisc_stab_lock);
 380                return stab;
 381        }
 382
 383        spin_unlock(&qdisc_stab_lock);
 384
 385        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 386        if (!stab)
 387                return ERR_PTR(-ENOMEM);
 388
 389        stab->refcnt = 1;
 390        stab->szopts = *s;
 391        if (tsize > 0)
 392                memcpy(stab->data, tab, tsize * sizeof(u16));
 393
 394        spin_lock(&qdisc_stab_lock);
 395        list_add_tail(&stab->list, &qdisc_stab_list);
 396        spin_unlock(&qdisc_stab_lock);
 397
 398        return stab;
 399}
 400
 401static void stab_kfree_rcu(struct rcu_head *head)
 402{
 403        kfree(container_of(head, struct qdisc_size_table, rcu));
 404}
 405
 406void qdisc_put_stab(struct qdisc_size_table *tab)
 407{
 408        if (!tab)
 409                return;
 410
 411        spin_lock(&qdisc_stab_lock);
 412
 413        if (--tab->refcnt == 0) {
 414                list_del(&tab->list);
 415                call_rcu_bh(&tab->rcu, stab_kfree_rcu);
 416        }
 417
 418        spin_unlock(&qdisc_stab_lock);
 419}
 420EXPORT_SYMBOL(qdisc_put_stab);
 421
 422static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 423{
 424        struct nlattr *nest;
 425
 426        nest = nla_nest_start(skb, TCA_STAB);
 427        if (nest == NULL)
 428                goto nla_put_failure;
 429        NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
 430        nla_nest_end(skb, nest);
 431
 432        return skb->len;
 433
 434nla_put_failure:
 435        return -1;
 436}
 437
 438void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
 439{
 440        int pkt_len, slot;
 441
 442        pkt_len = skb->len + stab->szopts.overhead;
 443        if (unlikely(!stab->szopts.tsize))
 444                goto out;
 445
 446        slot = pkt_len + stab->szopts.cell_align;
 447        if (unlikely(slot < 0))
 448                slot = 0;
 449
 450        slot >>= stab->szopts.cell_log;
 451        if (likely(slot < stab->szopts.tsize))
 452                pkt_len = stab->data[slot];
 453        else
 454                pkt_len = stab->data[stab->szopts.tsize - 1] *
 455                                (slot / stab->szopts.tsize) +
 456                                stab->data[slot % stab->szopts.tsize];
 457
 458        pkt_len <<= stab->szopts.size_log;
 459out:
 460        if (unlikely(pkt_len < 1))
 461                pkt_len = 1;
 462        qdisc_skb_cb(skb)->pkt_len = pkt_len;
 463}
 464EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
 465
 466void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
 467{
 468        if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
 469                pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
 470                        txt, qdisc->ops->id, qdisc->handle >> 16);
 471                qdisc->flags |= TCQ_F_WARN_NONWC;
 472        }
 473}
 474EXPORT_SYMBOL(qdisc_warn_nonwc);
 475
 476static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 477{
 478        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 479                                                 timer);
 480
 481        qdisc_unthrottled(wd->qdisc);
 482        __netif_schedule(qdisc_root(wd->qdisc));
 483
 484        return HRTIMER_NORESTART;
 485}
 486
 487void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
 488{
 489        hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 490        wd->timer.function = qdisc_watchdog;
 491        wd->qdisc = qdisc;
 492}
 493EXPORT_SYMBOL(qdisc_watchdog_init);
 494
 495void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
 496{
 497        ktime_t time;
 498
 499        if (test_bit(__QDISC_STATE_DEACTIVATED,
 500                     &qdisc_root_sleeping(wd->qdisc)->state))
 501                return;
 502
 503        qdisc_throttled(wd->qdisc);
 504        time = ktime_set(0, 0);
 505        time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
 506        hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
 507}
 508EXPORT_SYMBOL(qdisc_watchdog_schedule);
 509
 510void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
 511{
 512        hrtimer_cancel(&wd->timer);
 513        qdisc_unthrottled(wd->qdisc);
 514}
 515EXPORT_SYMBOL(qdisc_watchdog_cancel);
 516
 517static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 518{
 519        unsigned int size = n * sizeof(struct hlist_head), i;
 520        struct hlist_head *h;
 521
 522        if (size <= PAGE_SIZE)
 523                h = kmalloc(size, GFP_KERNEL);
 524        else
 525                h = (struct hlist_head *)
 526                        __get_free_pages(GFP_KERNEL, get_order(size));
 527
 528        if (h != NULL) {
 529                for (i = 0; i < n; i++)
 530                        INIT_HLIST_HEAD(&h[i]);
 531        }
 532        return h;
 533}
 534
 535static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
 536{
 537        unsigned int size = n * sizeof(struct hlist_head);
 538
 539        if (size <= PAGE_SIZE)
 540                kfree(h);
 541        else
 542                free_pages((unsigned long)h, get_order(size));
 543}
 544
 545void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
 546{
 547        struct Qdisc_class_common *cl;
 548        struct hlist_node *n, *next;
 549        struct hlist_head *nhash, *ohash;
 550        unsigned int nsize, nmask, osize;
 551        unsigned int i, h;
 552
 553        /* Rehash when load factor exceeds 0.75 */
 554        if (clhash->hashelems * 4 <= clhash->hashsize * 3)
 555                return;
 556        nsize = clhash->hashsize * 2;
 557        nmask = nsize - 1;
 558        nhash = qdisc_class_hash_alloc(nsize);
 559        if (nhash == NULL)
 560                return;
 561
 562        ohash = clhash->hash;
 563        osize = clhash->hashsize;
 564
 565        sch_tree_lock(sch);
 566        for (i = 0; i < osize; i++) {
 567                hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
 568                        h = qdisc_class_hash(cl->classid, nmask);
 569                        hlist_add_head(&cl->hnode, &nhash[h]);
 570                }
 571        }
 572        clhash->hash     = nhash;
 573        clhash->hashsize = nsize;
 574        clhash->hashmask = nmask;
 575        sch_tree_unlock(sch);
 576
 577        qdisc_class_hash_free(ohash, osize);
 578}
 579EXPORT_SYMBOL(qdisc_class_hash_grow);
 580
 581int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 582{
 583        unsigned int size = 4;
 584
 585        clhash->hash = qdisc_class_hash_alloc(size);
 586        if (clhash->hash == NULL)
 587                return -ENOMEM;
 588        clhash->hashsize  = size;
 589        clhash->hashmask  = size - 1;
 590        clhash->hashelems = 0;
 591        return 0;
 592}
 593EXPORT_SYMBOL(qdisc_class_hash_init);
 594
 595void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
 596{
 597        qdisc_class_hash_free(clhash->hash, clhash->hashsize);
 598}
 599EXPORT_SYMBOL(qdisc_class_hash_destroy);
 600
 601void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
 602                             struct Qdisc_class_common *cl)
 603{
 604        unsigned int h;
 605
 606        INIT_HLIST_NODE(&cl->hnode);
 607        h = qdisc_class_hash(cl->classid, clhash->hashmask);
 608        hlist_add_head(&cl->hnode, &clhash->hash[h]);
 609        clhash->hashelems++;
 610}
 611EXPORT_SYMBOL(qdisc_class_hash_insert);
 612
 613void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
 614                             struct Qdisc_class_common *cl)
 615{
 616        hlist_del(&cl->hnode);
 617        clhash->hashelems--;
 618}
 619EXPORT_SYMBOL(qdisc_class_hash_remove);
 620
 621/* Allocate an unique handle from space managed by kernel
 622 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 623 */
 624static u32 qdisc_alloc_handle(struct net_device *dev)
 625{
 626        int i = 0x8000;
 627        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 628
 629        do {
 630                autohandle += TC_H_MAKE(0x10000U, 0);
 631                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
 632                        autohandle = TC_H_MAKE(0x80000000U, 0);
 633                if (!qdisc_lookup(dev, autohandle))
 634                        return autohandle;
 635                cond_resched();
 636        } while (--i > 0);
 637
 638        return 0;
 639}
 640
 641void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
 642{
 643        const struct Qdisc_class_ops *cops;
 644        unsigned long cl;
 645        u32 parentid;
 646
 647        if (n == 0)
 648                return;
 649        while ((parentid = sch->parent)) {
 650                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
 651                        return;
 652
 653                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
 654                if (sch == NULL) {
 655                        WARN_ON(parentid != TC_H_ROOT);
 656                        return;
 657                }
 658                cops = sch->ops->cl_ops;
 659                if (cops->qlen_notify) {
 660                        cl = cops->get(sch, parentid);
 661                        cops->qlen_notify(sch, cl);
 662                        cops->put(sch, cl);
 663                }
 664                sch->q.qlen -= n;
 665        }
 666}
 667EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
 668
 669static void notify_and_destroy(struct net *net, struct sk_buff *skb,
 670                               struct nlmsghdr *n, u32 clid,
 671                               struct Qdisc *old, struct Qdisc *new)
 672{
 673        if (new || old)
 674                qdisc_notify(net, skb, n, clid, old, new);
 675
 676        if (old)
 677                qdisc_destroy(old);
 678}
 679
 680/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 681 * to device "dev".
 682 *
 683 * When appropriate send a netlink notification using 'skb'
 684 * and "n".
 685 *
 686 * On success, destroy old qdisc.
 687 */
 688
 689static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 690                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
 691                       struct Qdisc *new, struct Qdisc *old)
 692{
 693        struct Qdisc *q = old;
 694        struct net *net = dev_net(dev);
 695        int err = 0;
 696
 697        if (parent == NULL) {
 698                unsigned int i, num_q, ingress;
 699
 700                ingress = 0;
 701                num_q = dev->num_tx_queues;
 702                if ((q && q->flags & TCQ_F_INGRESS) ||
 703                    (new && new->flags & TCQ_F_INGRESS)) {
 704                        num_q = 1;
 705                        ingress = 1;
 706                        if (!dev_ingress_queue(dev))
 707                                return -ENOENT;
 708                }
 709
 710                if (dev->flags & IFF_UP)
 711                        dev_deactivate(dev);
 712
 713                if (new && new->ops->attach) {
 714                        new->ops->attach(new);
 715                        num_q = 0;
 716                }
 717
 718                for (i = 0; i < num_q; i++) {
 719                        struct netdev_queue *dev_queue = dev_ingress_queue(dev);
 720
 721                        if (!ingress)
 722                                dev_queue = netdev_get_tx_queue(dev, i);
 723
 724                        old = dev_graft_qdisc(dev_queue, new);
 725                        if (new && i > 0)
 726                                atomic_inc(&new->refcnt);
 727
 728                        if (!ingress)
 729                                qdisc_destroy(old);
 730                }
 731
 732                if (!ingress) {
 733                        notify_and_destroy(net, skb, n, classid,
 734                                           dev->qdisc, new);
 735                        if (new && !new->ops->attach)
 736                                atomic_inc(&new->refcnt);
 737                        dev->qdisc = new ? : &noop_qdisc;
 738                } else {
 739                        notify_and_destroy(net, skb, n, classid, old, new);
 740                }
 741
 742                if (dev->flags & IFF_UP)
 743                        dev_activate(dev);
 744        } else {
 745                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
 746
 747                err = -EOPNOTSUPP;
 748                if (cops && cops->graft) {
 749                        unsigned long cl = cops->get(parent, classid);
 750                        if (cl) {
 751                                err = cops->graft(parent, cl, new, &old);
 752                                cops->put(parent, cl);
 753                        } else
 754                                err = -ENOENT;
 755                }
 756                if (!err)
 757                        notify_and_destroy(net, skb, n, classid, old, new);
 758        }
 759        return err;
 760}
 761
 762/* lockdep annotation is needed for ingress; egress gets it only for name */
 763static struct lock_class_key qdisc_tx_lock;
 764static struct lock_class_key qdisc_rx_lock;
 765
 766/*
 767   Allocate and initialize new qdisc.
 768
 769   Parameters are passed via opt.
 770 */
 771
 772static struct Qdisc *
 773qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 774             struct Qdisc *p, u32 parent, u32 handle,
 775             struct nlattr **tca, int *errp)
 776{
 777        int err;
 778        struct nlattr *kind = tca[TCA_KIND];
 779        struct Qdisc *sch;
 780        struct Qdisc_ops *ops;
 781        struct qdisc_size_table *stab;
 782
 783        ops = qdisc_lookup_ops(kind);
 784#ifdef CONFIG_MODULES
 785        if (ops == NULL && kind != NULL) {
 786                char name[IFNAMSIZ];
 787                if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
 788                        /* We dropped the RTNL semaphore in order to
 789                         * perform the module load.  So, even if we
 790                         * succeeded in loading the module we have to
 791                         * tell the caller to replay the request.  We
 792                         * indicate this using -EAGAIN.
 793                         * We replay the request because the device may
 794                         * go away in the mean time.
 795                         */
 796                        rtnl_unlock();
 797                        request_module("sch_%s", name);
 798                        rtnl_lock();
 799                        ops = qdisc_lookup_ops(kind);
 800                        if (ops != NULL) {
 801                                /* We will try again qdisc_lookup_ops,
 802                                 * so don't keep a reference.
 803                                 */
 804                                module_put(ops->owner);
 805                                err = -EAGAIN;
 806                                goto err_out;
 807                        }
 808                }
 809        }
 810#endif
 811
 812        err = -ENOENT;
 813        if (ops == NULL)
 814                goto err_out;
 815
 816        sch = qdisc_alloc(dev_queue, ops);
 817        if (IS_ERR(sch)) {
 818                err = PTR_ERR(sch);
 819                goto err_out2;
 820        }
 821
 822        sch->parent = parent;
 823
 824        if (handle == TC_H_INGRESS) {
 825                sch->flags |= TCQ_F_INGRESS;
 826                handle = TC_H_MAKE(TC_H_INGRESS, 0);
 827                lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
 828        } else {
 829                if (handle == 0) {
 830                        handle = qdisc_alloc_handle(dev);
 831                        err = -ENOMEM;
 832                        if (handle == 0)
 833                                goto err_out3;
 834                }
 835                lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
 836        }
 837
 838        sch->handle = handle;
 839
 840        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
 841                if (tca[TCA_STAB]) {
 842                        stab = qdisc_get_stab(tca[TCA_STAB]);
 843                        if (IS_ERR(stab)) {
 844                                err = PTR_ERR(stab);
 845                                goto err_out4;
 846                        }
 847                        rcu_assign_pointer(sch->stab, stab);
 848                }
 849                if (tca[TCA_RATE]) {
 850                        spinlock_t *root_lock;
 851
 852                        err = -EOPNOTSUPP;
 853                        if (sch->flags & TCQ_F_MQROOT)
 854                                goto err_out4;
 855
 856                        if ((sch->parent != TC_H_ROOT) &&
 857                            !(sch->flags & TCQ_F_INGRESS) &&
 858                            (!p || !(p->flags & TCQ_F_MQROOT)))
 859                                root_lock = qdisc_root_sleeping_lock(sch);
 860                        else
 861                                root_lock = qdisc_lock(sch);
 862
 863                        err = gen_new_estimator(&sch->bstats, &sch->rate_est,
 864                                                root_lock, tca[TCA_RATE]);
 865                        if (err)
 866                                goto err_out4;
 867                }
 868
 869                qdisc_list_add(sch);
 870
 871                return sch;
 872        }
 873err_out3:
 874        dev_put(dev);
 875        kfree((char *) sch - sch->padded);
 876err_out2:
 877        module_put(ops->owner);
 878err_out:
 879        *errp = err;
 880        return NULL;
 881
 882err_out4:
 883        /*
 884         * Any broken qdiscs that would require a ops->reset() here?
 885         * The qdisc was never in action so it shouldn't be necessary.
 886         */
 887        qdisc_put_stab(rtnl_dereference(sch->stab));
 888        if (ops->destroy)
 889                ops->destroy(sch);
 890        goto err_out3;
 891}
 892
 893static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
 894{
 895        struct qdisc_size_table *ostab, *stab = NULL;
 896        int err = 0;
 897
 898        if (tca[TCA_OPTIONS]) {
 899                if (sch->ops->change == NULL)
 900                        return -EINVAL;
 901                err = sch->ops->change(sch, tca[TCA_OPTIONS]);
 902                if (err)
 903                        return err;
 904        }
 905
 906        if (tca[TCA_STAB]) {
 907                stab = qdisc_get_stab(tca[TCA_STAB]);
 908                if (IS_ERR(stab))
 909                        return PTR_ERR(stab);
 910        }
 911
 912        ostab = rtnl_dereference(sch->stab);
 913        rcu_assign_pointer(sch->stab, stab);
 914        qdisc_put_stab(ostab);
 915
 916        if (tca[TCA_RATE]) {
 917                /* NB: ignores errors from replace_estimator
 918                   because change can't be undone. */
 919                if (sch->flags & TCQ_F_MQROOT)
 920                        goto out;
 921                gen_replace_estimator(&sch->bstats, &sch->rate_est,
 922                                            qdisc_root_sleeping_lock(sch),
 923                                            tca[TCA_RATE]);
 924        }
 925out:
 926        return 0;
 927}
 928
 929struct check_loop_arg {
 930        struct qdisc_walker     w;
 931        struct Qdisc            *p;
 932        int                     depth;
 933};
 934
 935static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
 936
 937static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
 938{
 939        struct check_loop_arg   arg;
 940
 941        if (q->ops->cl_ops == NULL)
 942                return 0;
 943
 944        arg.w.stop = arg.w.skip = arg.w.count = 0;
 945        arg.w.fn = check_loop_fn;
 946        arg.depth = depth;
 947        arg.p = p;
 948        q->ops->cl_ops->walk(q, &arg.w);
 949        return arg.w.stop ? -ELOOP : 0;
 950}
 951
 952static int
 953check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
 954{
 955        struct Qdisc *leaf;
 956        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
 957        struct check_loop_arg *arg = (struct check_loop_arg *)w;
 958
 959        leaf = cops->leaf(q, cl);
 960        if (leaf) {
 961                if (leaf == arg->p || arg->depth > 7)
 962                        return -ELOOP;
 963                return check_loop(leaf, arg->p, arg->depth + 1);
 964        }
 965        return 0;
 966}
 967
 968/*
 969 * Delete/get qdisc.
 970 */
 971
 972static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 973{
 974        struct net *net = sock_net(skb->sk);
 975        struct tcmsg *tcm = NLMSG_DATA(n);
 976        struct nlattr *tca[TCA_MAX + 1];
 977        struct net_device *dev;
 978        u32 clid = tcm->tcm_parent;
 979        struct Qdisc *q = NULL;
 980        struct Qdisc *p = NULL;
 981        int err;
 982
 983        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
 984        if (!dev)
 985                return -ENODEV;
 986
 987        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
 988        if (err < 0)
 989                return err;
 990
 991        if (clid) {
 992                if (clid != TC_H_ROOT) {
 993                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
 994                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
 995                                if (!p)
 996                                        return -ENOENT;
 997                                q = qdisc_leaf(p, clid);
 998                        } else if (dev_ingress_queue(dev)) {
 999                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1000                        }
1001                } else {
1002                        q = dev->qdisc;
1003                }
1004                if (!q)
1005                        return -ENOENT;
1006
1007                if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1008                        return -EINVAL;
1009        } else {
1010                q = qdisc_lookup(dev, tcm->tcm_handle);
1011                if (!q)
1012                        return -ENOENT;
1013        }
1014
1015        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1016                return -EINVAL;
1017
1018        if (n->nlmsg_type == RTM_DELQDISC) {
1019                if (!clid)
1020                        return -EINVAL;
1021                if (q->handle == 0)
1022                        return -ENOENT;
1023                err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1024                if (err != 0)
1025                        return err;
1026        } else {
1027                qdisc_notify(net, skb, n, clid, NULL, q);
1028        }
1029        return 0;
1030}
1031
1032/*
1033 * Create/change qdisc.
1034 */
1035
1036static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1037{
1038        struct net *net = sock_net(skb->sk);
1039        struct tcmsg *tcm;
1040        struct nlattr *tca[TCA_MAX + 1];
1041        struct net_device *dev;
1042        u32 clid;
1043        struct Qdisc *q, *p;
1044        int err;
1045
1046replay:
1047        /* Reinit, just in case something touches this. */
1048        tcm = NLMSG_DATA(n);
1049        clid = tcm->tcm_parent;
1050        q = p = NULL;
1051
1052        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1053        if (!dev)
1054                return -ENODEV;
1055
1056        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1057        if (err < 0)
1058                return err;
1059
1060        if (clid) {
1061                if (clid != TC_H_ROOT) {
1062                        if (clid != TC_H_INGRESS) {
1063                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
1064                                if (!p)
1065                                        return -ENOENT;
1066                                q = qdisc_leaf(p, clid);
1067                        } else if (dev_ingress_queue_create(dev)) {
1068                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1069                        }
1070                } else {
1071                        q = dev->qdisc;
1072                }
1073
1074                /* It may be default qdisc, ignore it */
1075                if (q && q->handle == 0)
1076                        q = NULL;
1077
1078                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1079                        if (tcm->tcm_handle) {
1080                                if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1081                                        return -EEXIST;
1082                                if (TC_H_MIN(tcm->tcm_handle))
1083                                        return -EINVAL;
1084                                q = qdisc_lookup(dev, tcm->tcm_handle);
1085                                if (!q)
1086                                        goto create_n_graft;
1087                                if (n->nlmsg_flags & NLM_F_EXCL)
1088                                        return -EEXIST;
1089                                if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1090                                        return -EINVAL;
1091                                if (q == p ||
1092                                    (p && check_loop(q, p, 0)))
1093                                        return -ELOOP;
1094                                atomic_inc(&q->refcnt);
1095                                goto graft;
1096                        } else {
1097                                if (!q)
1098                                        goto create_n_graft;
1099
1100                                /* This magic test requires explanation.
1101                                 *
1102                                 *   We know, that some child q is already
1103                                 *   attached to this parent and have choice:
1104                                 *   either to change it or to create/graft new one.
1105                                 *
1106                                 *   1. We are allowed to create/graft only
1107                                 *   if CREATE and REPLACE flags are set.
1108                                 *
1109                                 *   2. If EXCL is set, requestor wanted to say,
1110                                 *   that qdisc tcm_handle is not expected
1111                                 *   to exist, so that we choose create/graft too.
1112                                 *
1113                                 *   3. The last case is when no flags are set.
1114                                 *   Alas, it is sort of hole in API, we
1115                                 *   cannot decide what to do unambiguously.
1116                                 *   For now we select create/graft, if
1117                                 *   user gave KIND, which does not match existing.
1118                                 */
1119                                if ((n->nlmsg_flags & NLM_F_CREATE) &&
1120                                    (n->nlmsg_flags & NLM_F_REPLACE) &&
1121                                    ((n->nlmsg_flags & NLM_F_EXCL) ||
1122                                     (tca[TCA_KIND] &&
1123                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1124                                        goto create_n_graft;
1125                        }
1126                }
1127        } else {
1128                if (!tcm->tcm_handle)
1129                        return -EINVAL;
1130                q = qdisc_lookup(dev, tcm->tcm_handle);
1131        }
1132
1133        /* Change qdisc parameters */
1134        if (q == NULL)
1135                return -ENOENT;
1136        if (n->nlmsg_flags & NLM_F_EXCL)
1137                return -EEXIST;
1138        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1139                return -EINVAL;
1140        err = qdisc_change(q, tca);
1141        if (err == 0)
1142                qdisc_notify(net, skb, n, clid, NULL, q);
1143        return err;
1144
1145create_n_graft:
1146        if (!(n->nlmsg_flags & NLM_F_CREATE))
1147                return -ENOENT;
1148        if (clid == TC_H_INGRESS) {
1149                if (dev_ingress_queue(dev))
1150                        q = qdisc_create(dev, dev_ingress_queue(dev), p,
1151                                         tcm->tcm_parent, tcm->tcm_parent,
1152                                         tca, &err);
1153                else
1154                        err = -ENOENT;
1155        } else {
1156                struct netdev_queue *dev_queue;
1157
1158                if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1159                        dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1160                else if (p)
1161                        dev_queue = p->dev_queue;
1162                else
1163                        dev_queue = netdev_get_tx_queue(dev, 0);
1164
1165                q = qdisc_create(dev, dev_queue, p,
1166                                 tcm->tcm_parent, tcm->tcm_handle,
1167                                 tca, &err);
1168        }
1169        if (q == NULL) {
1170                if (err == -EAGAIN)
1171                        goto replay;
1172                return err;
1173        }
1174
1175graft:
1176        err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1177        if (err) {
1178                if (q)
1179                        qdisc_destroy(q);
1180                return err;
1181        }
1182
1183        return 0;
1184}
1185
1186static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1187                         u32 pid, u32 seq, u16 flags, int event)
1188{
1189        struct tcmsg *tcm;
1190        struct nlmsghdr  *nlh;
1191        unsigned char *b = skb_tail_pointer(skb);
1192        struct gnet_dump d;
1193        struct qdisc_size_table *stab;
1194
1195        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1196        tcm = NLMSG_DATA(nlh);
1197        tcm->tcm_family = AF_UNSPEC;
1198        tcm->tcm__pad1 = 0;
1199        tcm->tcm__pad2 = 0;
1200        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1201        tcm->tcm_parent = clid;
1202        tcm->tcm_handle = q->handle;
1203        tcm->tcm_info = atomic_read(&q->refcnt);
1204        NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1205        if (q->ops->dump && q->ops->dump(q, skb) < 0)
1206                goto nla_put_failure;
1207        q->qstats.qlen = q->q.qlen;
1208
1209        stab = rtnl_dereference(q->stab);
1210        if (stab && qdisc_dump_stab(skb, stab) < 0)
1211                goto nla_put_failure;
1212
1213        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1214                                         qdisc_root_sleeping_lock(q), &d) < 0)
1215                goto nla_put_failure;
1216
1217        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1218                goto nla_put_failure;
1219
1220        if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1221            gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1222            gnet_stats_copy_queue(&d, &q->qstats) < 0)
1223                goto nla_put_failure;
1224
1225        if (gnet_stats_finish_copy(&d) < 0)
1226                goto nla_put_failure;
1227
1228        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1229        return skb->len;
1230
1231nlmsg_failure:
1232nla_put_failure:
1233        nlmsg_trim(skb, b);
1234        return -1;
1235}
1236
1237static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1238{
1239        return (q->flags & TCQ_F_BUILTIN) ? true : false;
1240}
1241
1242static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1243                        struct nlmsghdr *n, u32 clid,
1244                        struct Qdisc *old, struct Qdisc *new)
1245{
1246        struct sk_buff *skb;
1247        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1248
1249        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1250        if (!skb)
1251                return -ENOBUFS;
1252
1253        if (old && !tc_qdisc_dump_ignore(old)) {
1254                if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq,
1255                                  0, RTM_DELQDISC) < 0)
1256                        goto err_out;
1257        }
1258        if (new && !tc_qdisc_dump_ignore(new)) {
1259                if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq,
1260                                  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1261                        goto err_out;
1262        }
1263
1264        if (skb->len)
1265                return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
1266                                      n->nlmsg_flags & NLM_F_ECHO);
1267
1268err_out:
1269        kfree_skb(skb);
1270        return -EINVAL;
1271}
1272
1273static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1274                              struct netlink_callback *cb,
1275                              int *q_idx_p, int s_q_idx)
1276{
1277        int ret = 0, q_idx = *q_idx_p;
1278        struct Qdisc *q;
1279
1280        if (!root)
1281                return 0;
1282
1283        q = root;
1284        if (q_idx < s_q_idx) {
1285                q_idx++;
1286        } else {
1287                if (!tc_qdisc_dump_ignore(q) &&
1288                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1289                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1290                        goto done;
1291                q_idx++;
1292        }
1293        list_for_each_entry(q, &root->list, list) {
1294                if (q_idx < s_q_idx) {
1295                        q_idx++;
1296                        continue;
1297                }
1298                if (!tc_qdisc_dump_ignore(q) &&
1299                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1300                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1301                        goto done;
1302                q_idx++;
1303        }
1304
1305out:
1306        *q_idx_p = q_idx;
1307        return ret;
1308done:
1309        ret = -1;
1310        goto out;
1311}
1312
1313static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1314{
1315        struct net *net = sock_net(skb->sk);
1316        int idx, q_idx;
1317        int s_idx, s_q_idx;
1318        struct net_device *dev;
1319
1320        s_idx = cb->args[0];
1321        s_q_idx = q_idx = cb->args[1];
1322
1323        rcu_read_lock();
1324        idx = 0;
1325        for_each_netdev_rcu(net, dev) {
1326                struct netdev_queue *dev_queue;
1327
1328                if (idx < s_idx)
1329                        goto cont;
1330                if (idx > s_idx)
1331                        s_q_idx = 0;
1332                q_idx = 0;
1333
1334                if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1335                        goto done;
1336
1337                dev_queue = dev_ingress_queue(dev);
1338                if (dev_queue &&
1339                    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1340                                       &q_idx, s_q_idx) < 0)
1341                        goto done;
1342
1343cont:
1344                idx++;
1345        }
1346
1347done:
1348        rcu_read_unlock();
1349
1350        cb->args[0] = idx;
1351        cb->args[1] = q_idx;
1352
1353        return skb->len;
1354}
1355
1356
1357
1358/************************************************
1359 *      Traffic classes manipulation.           *
1360 ************************************************/
1361
1362
1363
1364static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1365{
1366        struct net *net = sock_net(skb->sk);
1367        struct tcmsg *tcm = NLMSG_DATA(n);
1368        struct nlattr *tca[TCA_MAX + 1];
1369        struct net_device *dev;
1370        struct Qdisc *q = NULL;
1371        const struct Qdisc_class_ops *cops;
1372        unsigned long cl = 0;
1373        unsigned long new_cl;
1374        u32 pid = tcm->tcm_parent;
1375        u32 clid = tcm->tcm_handle;
1376        u32 qid = TC_H_MAJ(clid);
1377        int err;
1378
1379        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1380        if (!dev)
1381                return -ENODEV;
1382
1383        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1384        if (err < 0)
1385                return err;
1386
1387        /*
1388           parent == TC_H_UNSPEC - unspecified parent.
1389           parent == TC_H_ROOT   - class is root, which has no parent.
1390           parent == X:0         - parent is root class.
1391           parent == X:Y         - parent is a node in hierarchy.
1392           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1393
1394           handle == 0:0         - generate handle from kernel pool.
1395           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1396           handle == X:Y         - clear.
1397           handle == X:0         - root class.
1398         */
1399
1400        /* Step 1. Determine qdisc handle X:0 */
1401
1402        if (pid != TC_H_ROOT) {
1403                u32 qid1 = TC_H_MAJ(pid);
1404
1405                if (qid && qid1) {
1406                        /* If both majors are known, they must be identical. */
1407                        if (qid != qid1)
1408                                return -EINVAL;
1409                } else if (qid1) {
1410                        qid = qid1;
1411                } else if (qid == 0)
1412                        qid = dev->qdisc->handle;
1413
1414                /* Now qid is genuine qdisc handle consistent
1415                 * both with parent and child.
1416                 *
1417                 * TC_H_MAJ(pid) still may be unspecified, complete it now.
1418                 */
1419                if (pid)
1420                        pid = TC_H_MAKE(qid, pid);
1421        } else {
1422                if (qid == 0)
1423                        qid = dev->qdisc->handle;
1424        }
1425
1426        /* OK. Locate qdisc */
1427        q = qdisc_lookup(dev, qid);
1428        if (!q)
1429                return -ENOENT;
1430
1431        /* An check that it supports classes */
1432        cops = q->ops->cl_ops;
1433        if (cops == NULL)
1434                return -EINVAL;
1435
1436        /* Now try to get class */
1437        if (clid == 0) {
1438                if (pid == TC_H_ROOT)
1439                        clid = qid;
1440        } else
1441                clid = TC_H_MAKE(qid, clid);
1442
1443        if (clid)
1444                cl = cops->get(q, clid);
1445
1446        if (cl == 0) {
1447                err = -ENOENT;
1448                if (n->nlmsg_type != RTM_NEWTCLASS ||
1449                    !(n->nlmsg_flags & NLM_F_CREATE))
1450                        goto out;
1451        } else {
1452                switch (n->nlmsg_type) {
1453                case RTM_NEWTCLASS:
1454                        err = -EEXIST;
1455                        if (n->nlmsg_flags & NLM_F_EXCL)
1456                                goto out;
1457                        break;
1458                case RTM_DELTCLASS:
1459                        err = -EOPNOTSUPP;
1460                        if (cops->delete)
1461                                err = cops->delete(q, cl);
1462                        if (err == 0)
1463                                tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1464                        goto out;
1465                case RTM_GETTCLASS:
1466                        err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1467                        goto out;
1468                default:
1469                        err = -EINVAL;
1470                        goto out;
1471                }
1472        }
1473
1474        new_cl = cl;
1475        err = -EOPNOTSUPP;
1476        if (cops->change)
1477                err = cops->change(q, clid, pid, tca, &new_cl);
1478        if (err == 0)
1479                tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1480
1481out:
1482        if (cl)
1483                cops->put(q, cl);
1484
1485        return err;
1486}
1487
1488
1489static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1490                          unsigned long cl,
1491                          u32 pid, u32 seq, u16 flags, int event)
1492{
1493        struct tcmsg *tcm;
1494        struct nlmsghdr  *nlh;
1495        unsigned char *b = skb_tail_pointer(skb);
1496        struct gnet_dump d;
1497        const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1498
1499        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1500        tcm = NLMSG_DATA(nlh);
1501        tcm->tcm_family = AF_UNSPEC;
1502        tcm->tcm__pad1 = 0;
1503        tcm->tcm__pad2 = 0;
1504        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1505        tcm->tcm_parent = q->handle;
1506        tcm->tcm_handle = q->handle;
1507        tcm->tcm_info = 0;
1508        NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1509        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1510                goto nla_put_failure;
1511
1512        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1513                                         qdisc_root_sleeping_lock(q), &d) < 0)
1514                goto nla_put_failure;
1515
1516        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1517                goto nla_put_failure;
1518
1519        if (gnet_stats_finish_copy(&d) < 0)
1520                goto nla_put_failure;
1521
1522        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1523        return skb->len;
1524
1525nlmsg_failure:
1526nla_put_failure:
1527        nlmsg_trim(skb, b);
1528        return -1;
1529}
1530
1531static int tclass_notify(struct net *net, struct sk_buff *oskb,
1532                         struct nlmsghdr *n, struct Qdisc *q,
1533                         unsigned long cl, int event)
1534{
1535        struct sk_buff *skb;
1536        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1537
1538        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1539        if (!skb)
1540                return -ENOBUFS;
1541
1542        if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1543                kfree_skb(skb);
1544                return -EINVAL;
1545        }
1546
1547        return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
1548                              n->nlmsg_flags & NLM_F_ECHO);
1549}
1550
1551struct qdisc_dump_args {
1552        struct qdisc_walker     w;
1553        struct sk_buff          *skb;
1554        struct netlink_callback *cb;
1555};
1556
1557static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1558{
1559        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1560
1561        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1562                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1563}
1564
1565static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1566                                struct tcmsg *tcm, struct netlink_callback *cb,
1567                                int *t_p, int s_t)
1568{
1569        struct qdisc_dump_args arg;
1570
1571        if (tc_qdisc_dump_ignore(q) ||
1572            *t_p < s_t || !q->ops->cl_ops ||
1573            (tcm->tcm_parent &&
1574             TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1575                (*t_p)++;
1576                return 0;
1577        }
1578        if (*t_p > s_t)
1579                memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1580        arg.w.fn = qdisc_class_dump;
1581        arg.skb = skb;
1582        arg.cb = cb;
1583        arg.w.stop  = 0;
1584        arg.w.skip = cb->args[1];
1585        arg.w.count = 0;
1586        q->ops->cl_ops->walk(q, &arg.w);
1587        cb->args[1] = arg.w.count;
1588        if (arg.w.stop)
1589                return -1;
1590        (*t_p)++;
1591        return 0;
1592}
1593
1594static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1595                               struct tcmsg *tcm, struct netlink_callback *cb,
1596                               int *t_p, int s_t)
1597{
1598        struct Qdisc *q;
1599
1600        if (!root)
1601                return 0;
1602
1603        if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1604                return -1;
1605
1606        list_for_each_entry(q, &root->list, list) {
1607                if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1608                        return -1;
1609        }
1610
1611        return 0;
1612}
1613
1614static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1615{
1616        struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
1617        struct net *net = sock_net(skb->sk);
1618        struct netdev_queue *dev_queue;
1619        struct net_device *dev;
1620        int t, s_t;
1621
1622        if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1623                return 0;
1624        dev = dev_get_by_index(net, tcm->tcm_ifindex);
1625        if (!dev)
1626                return 0;
1627
1628        s_t = cb->args[0];
1629        t = 0;
1630
1631        if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1632                goto done;
1633
1634        dev_queue = dev_ingress_queue(dev);
1635        if (dev_queue &&
1636            tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1637                                &t, s_t) < 0)
1638                goto done;
1639
1640done:
1641        cb->args[0] = t;
1642
1643        dev_put(dev);
1644        return skb->len;
1645}
1646
1647/* Main classifier routine: scans classifier chain attached
1648 * to this qdisc, (optionally) tests for protocol and asks
1649 * specific classifiers.
1650 */
1651int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1652                       struct tcf_result *res)
1653{
1654        __be16 protocol = skb->protocol;
1655        int err;
1656
1657        for (; tp; tp = tp->next) {
1658                if (tp->protocol != protocol &&
1659                    tp->protocol != htons(ETH_P_ALL))
1660                        continue;
1661                err = tp->classify(skb, tp, res);
1662
1663                if (err >= 0) {
1664#ifdef CONFIG_NET_CLS_ACT
1665                        if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1666                                skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1667#endif
1668                        return err;
1669                }
1670        }
1671        return -1;
1672}
1673EXPORT_SYMBOL(tc_classify_compat);
1674
/*
 * Classify @skb against the chain starting at @tp.
 *
 * Without CONFIG_NET_CLS_ACT this is just tc_classify_compat().  With it,
 * a TC_ACT_RECLASSIFY result restarts classification from the head of the
 * chain.  The number of restarts is tracked in the verdict bits of
 * skb->tc_verd; once it has reached MAX_REC_LOOP a rate-limited notice is
 * logged and TC_ACT_SHOT is returned.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
#ifdef CONFIG_NET_CLS_ACT
	const struct tcf_proto *head = tp;
	u32 verd;

reclassify:
#endif
	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err != TC_ACT_RECLASSIFY)
		return err;

	verd = G_TC_VERD(skb->tc_verd);
	tp = head;

	if (verd >= MAX_REC_LOOP) {
		if (net_ratelimit())
			pr_notice("%s: packet reclassify loop"
				  " rule prio %u protocol %02x\n",
				  tp->q->ops->id,
				  tp->prio & 0xffff,
				  ntohs(tp->protocol));
		return TC_ACT_SHOT;
	}

	/* Record one more pass and start over from the head. */
	verd++;
	skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
	goto reclassify;
#else
	return err;
#endif
}
EXPORT_SYMBOL(tc_classify);
1706
1707void tcf_destroy(struct tcf_proto *tp)
1708{
1709        tp->ops->destroy(tp);
1710        module_put(tp->ops->owner);
1711        kfree(tp);
1712}
1713
1714void tcf_destroy_chain(struct tcf_proto **fl)
1715{
1716        struct tcf_proto *tp;
1717
1718        while ((tp = *fl) != NULL) {
1719                *fl = tp->next;
1720                tcf_destroy(tp);
1721        }
1722}
1723EXPORT_SYMBOL(tcf_destroy_chain);
1724
1725#ifdef CONFIG_PROC_FS
1726static int psched_show(struct seq_file *seq, void *v)
1727{
1728        struct timespec ts;
1729
1730        hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1731        seq_printf(seq, "%08x %08x %08x %08x\n",
1732                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1733                   1000000,
1734                   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1735
1736        return 0;
1737}
1738
1739static int psched_open(struct inode *inode, struct file *file)
1740{
1741        return single_open(file, psched_show, NULL);
1742}
1743
1744static const struct file_operations psched_fops = {
1745        .owner = THIS_MODULE,
1746        .open = psched_open,
1747        .read  = seq_read,
1748        .llseek = seq_lseek,
1749        .release = single_release,
1750};
1751
1752static int __net_init psched_net_init(struct net *net)
1753{
1754        struct proc_dir_entry *e;
1755
1756        e = proc_net_fops_create(net, "psched", 0, &psched_fops);
1757        if (e == NULL)
1758                return -ENOMEM;
1759
1760        return 0;
1761}
1762
1763static void __net_exit psched_net_exit(struct net *net)
1764{
1765        proc_net_remove(net, "psched");
1766}
1767#else
1768static int __net_init psched_net_init(struct net *net)
1769{
1770        return 0;
1771}
1772
1773static void __net_exit psched_net_exit(struct net *net)
1774{
1775}
1776#endif
1777
1778static struct pernet_operations psched_net_ops = {
1779        .init = psched_net_init,
1780        .exit = psched_net_exit,
1781};
1782
1783static int __init pktsched_init(void)
1784{
1785        int err;
1786
1787        err = register_pernet_subsys(&psched_net_ops);
1788        if (err) {
1789                pr_err("pktsched_init: "
1790                       "cannot initialize per netns operations\n");
1791                return err;
1792        }
1793
1794        register_qdisc(&pfifo_qdisc_ops);
1795        register_qdisc(&bfifo_qdisc_ops);
1796        register_qdisc(&pfifo_head_drop_qdisc_ops);
1797        register_qdisc(&mq_qdisc_ops);
1798
1799        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1800        rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1801        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1802        rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1803        rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1804        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1805
1806        return 0;
1807}
1808
1809subsys_initcall(pktsched_init);
1810
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.