linux/net/sched/sch_api.c
<<
>>
Prefs
   1/*
   2 * net/sched/sch_api.c  Packet scheduler API.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Fixes:
  12 *
  13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  16 */
  17
  18#include <linux/module.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/string.h>
  22#include <linux/errno.h>
  23#include <linux/skbuff.h>
  24#include <linux/init.h>
  25#include <linux/proc_fs.h>
  26#include <linux/seq_file.h>
  27#include <linux/kmod.h>
  28#include <linux/list.h>
  29#include <linux/hrtimer.h>
  30#include <linux/lockdep.h>
  31
  32#include <net/net_namespace.h>
  33#include <net/sock.h>
  34#include <net/netlink.h>
  35#include <net/pkt_sched.h>
  36
  37static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
  38                        struct Qdisc *old, struct Qdisc *new);
  39static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
  40                         struct Qdisc *q, unsigned long cl, int event);
  41
  42/*
  43
  44   Short review.
  45   -------------
  46
  47   This file consists of two interrelated parts:
  48
  49   1. queueing disciplines manager frontend.
  50   2. traffic classes manager frontend.
  51
  52   Generally, queueing discipline ("qdisc") is a black box,
  53   which is able to enqueue packets and to dequeue them (when
  54   device is ready to send something) in order and at times
  55   determined by algorithm hidden in it.
  56
   qdiscs are divided into two categories:
  58   - "queues", which have no internal structure visible from outside.
  59   - "schedulers", which split all the packets to "traffic classes",
  60     using "packet classifiers" (look at cls_api.c)
  61
   In turn, classes may have child qdiscs (as a rule, queues)
  63   attached to them etc. etc. etc.
  64
  65   The goal of the routines in this file is to translate
  66   information supplied by user in the form of handles
  67   to more intelligible for kernel form, to make some sanity
  68   checks and part of work, which is common to all qdiscs
  69   and to provide rtnetlink notifications.
  70
  71   All real intelligent work is done inside qdisc modules.
  72
  73
  74
  75   Every discipline has two major routines: enqueue and dequeue.
  76
  77   ---dequeue
  78
  79   dequeue usually returns a skb to send. It is allowed to return NULL,
  80   but it does not mean that queue is empty, it just means that
  81   discipline does not want to send anything this time.
  82   Queue is really empty if q->q.qlen == 0.
  83   For complicated disciplines with multiple queues q->q is not
  84   real packet queue, but however q->q.qlen must be valid.
  85
  86   ---enqueue
  87
  88   enqueue returns 0, if packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
  91   NET_XMIT_DROP        - this packet dropped
  92     Expected action: do not backoff, but wait until queue will clear.
  93   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  94     Expected action: backoff or ignore
  95   NET_XMIT_POLICED     - dropped by police.
  96     Expected action: backoff or error to real-time apps.
  97
  98   Auxiliary routines:
  99
 100   ---peek
 101
 102   like dequeue but without removing a packet from the queue
 103
 104   ---reset
 105
 106   returns qdisc to initial state: purge all buffers, clear all
 107   timers, counters (except for statistics) etc.
 108
 109   ---init
 110
 111   initializes newly created qdisc.
 112
 113   ---destroy
 114
 115   destroys resources allocated by init and during lifetime of qdisc.
 116
 117   ---change
 118
 119   changes qdisc parameters.
 120 */
 121
 122/* Protects list of registered TC modules. It is pure SMP lock. */
 123static DEFINE_RWLOCK(qdisc_mod_lock);
 124
 125
 126/************************************************
 127 *      Queueing disciplines manipulation.      *
 128 ************************************************/
 129
 130
 131/* The list of all installed queueing disciplines. */
 132
 133static struct Qdisc_ops *qdisc_base;
 134
/* Register/unregister queueing discipline */
 136
 137int register_qdisc(struct Qdisc_ops *qops)
 138{
 139        struct Qdisc_ops *q, **qp;
 140        int rc = -EEXIST;
 141
 142        write_lock(&qdisc_mod_lock);
 143        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 144                if (!strcmp(qops->id, q->id))
 145                        goto out;
 146
 147        if (qops->enqueue == NULL)
 148                qops->enqueue = noop_qdisc_ops.enqueue;
 149        if (qops->peek == NULL) {
 150                if (qops->dequeue == NULL) {
 151                        qops->peek = noop_qdisc_ops.peek;
 152                } else {
 153                        rc = -EINVAL;
 154                        goto out;
 155                }
 156        }
 157        if (qops->dequeue == NULL)
 158                qops->dequeue = noop_qdisc_ops.dequeue;
 159
 160        qops->next = NULL;
 161        *qp = qops;
 162        rc = 0;
 163out:
 164        write_unlock(&qdisc_mod_lock);
 165        return rc;
 166}
 167EXPORT_SYMBOL(register_qdisc);
 168
 169int unregister_qdisc(struct Qdisc_ops *qops)
 170{
 171        struct Qdisc_ops *q, **qp;
 172        int err = -ENOENT;
 173
 174        write_lock(&qdisc_mod_lock);
 175        for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
 176                if (q == qops)
 177                        break;
 178        if (q) {
 179                *qp = q->next;
 180                q->next = NULL;
 181                err = 0;
 182        }
 183        write_unlock(&qdisc_mod_lock);
 184        return err;
 185}
 186EXPORT_SYMBOL(unregister_qdisc);
 187
 188/* We know handle. Find qdisc among all qdisc's attached to device
 189   (root qdisc, all its children, children of children etc.)
 190 */
 191
 192static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 193{
 194        struct Qdisc *q;
 195
 196        if (!(root->flags & TCQ_F_BUILTIN) &&
 197            root->handle == handle)
 198                return root;
 199
 200        list_for_each_entry(q, &root->list, list) {
 201                if (q->handle == handle)
 202                        return q;
 203        }
 204        return NULL;
 205}
 206
 207static void qdisc_list_add(struct Qdisc *q)
 208{
 209        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 210                list_add_tail(&q->list, &qdisc_root_sleeping(q)->list);
 211}
 212
 213void qdisc_list_del(struct Qdisc *q)
 214{
 215        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 216                list_del(&q->list);
 217}
 218EXPORT_SYMBOL(qdisc_list_del);
 219
 220struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 221{
 222        unsigned int i;
 223        struct Qdisc *q;
 224
 225        for (i = 0; i < dev->num_tx_queues; i++) {
 226                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 227                struct Qdisc *txq_root = txq->qdisc_sleeping;
 228
 229                q = qdisc_match_from_root(txq_root, handle);
 230                if (q)
 231                        goto out;
 232        }
 233
 234        q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
 235out:
 236        return q;
 237}
 238
 239static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 240{
 241        unsigned long cl;
 242        struct Qdisc *leaf;
 243        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 244
 245        if (cops == NULL)
 246                return NULL;
 247        cl = cops->get(p, classid);
 248
 249        if (cl == 0)
 250                return NULL;
 251        leaf = cops->leaf(p, cl);
 252        cops->put(p, cl);
 253        return leaf;
 254}
 255
 256/* Find queueing discipline by name */
 257
 258static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 259{
 260        struct Qdisc_ops *q = NULL;
 261
 262        if (kind) {
 263                read_lock(&qdisc_mod_lock);
 264                for (q = qdisc_base; q; q = q->next) {
 265                        if (nla_strcmp(kind, q->id) == 0) {
 266                                if (!try_module_get(q->owner))
 267                                        q = NULL;
 268                                break;
 269                        }
 270                }
 271                read_unlock(&qdisc_mod_lock);
 272        }
 273        return q;
 274}
 275
 276static struct qdisc_rate_table *qdisc_rtab_list;
 277
 278struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
 279{
 280        struct qdisc_rate_table *rtab;
 281
 282        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 283                if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
 284                        rtab->refcnt++;
 285                        return rtab;
 286                }
 287        }
 288
 289        if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
 290            nla_len(tab) != TC_RTAB_SIZE)
 291                return NULL;
 292
 293        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 294        if (rtab) {
 295                rtab->rate = *r;
 296                rtab->refcnt = 1;
 297                memcpy(rtab->data, nla_data(tab), 1024);
 298                rtab->next = qdisc_rtab_list;
 299                qdisc_rtab_list = rtab;
 300        }
 301        return rtab;
 302}
 303EXPORT_SYMBOL(qdisc_get_rtab);
 304
 305void qdisc_put_rtab(struct qdisc_rate_table *tab)
 306{
 307        struct qdisc_rate_table *rtab, **rtabp;
 308
 309        if (!tab || --tab->refcnt)
 310                return;
 311
 312        for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
 313                if (rtab == tab) {
 314                        *rtabp = rtab->next;
 315                        kfree(rtab);
 316                        return;
 317                }
 318        }
 319}
 320EXPORT_SYMBOL(qdisc_put_rtab);
 321
 322static LIST_HEAD(qdisc_stab_list);
 323static DEFINE_SPINLOCK(qdisc_stab_lock);
 324
/* Netlink policy for the TCA_STAB nest: a fixed-size tc_sizespec
 * header plus an optional binary slot table. */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
 329
 330static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 331{
 332        struct nlattr *tb[TCA_STAB_MAX + 1];
 333        struct qdisc_size_table *stab;
 334        struct tc_sizespec *s;
 335        unsigned int tsize = 0;
 336        u16 *tab = NULL;
 337        int err;
 338
 339        err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
 340        if (err < 0)
 341                return ERR_PTR(err);
 342        if (!tb[TCA_STAB_BASE])
 343                return ERR_PTR(-EINVAL);
 344
 345        s = nla_data(tb[TCA_STAB_BASE]);
 346
 347        if (s->tsize > 0) {
 348                if (!tb[TCA_STAB_DATA])
 349                        return ERR_PTR(-EINVAL);
 350                tab = nla_data(tb[TCA_STAB_DATA]);
 351                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 352        }
 353
 354        if (!s || tsize != s->tsize || (!tab && tsize > 0))
 355                return ERR_PTR(-EINVAL);
 356
 357        spin_lock(&qdisc_stab_lock);
 358
 359        list_for_each_entry(stab, &qdisc_stab_list, list) {
 360                if (memcmp(&stab->szopts, s, sizeof(*s)))
 361                        continue;
 362                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 363                        continue;
 364                stab->refcnt++;
 365                spin_unlock(&qdisc_stab_lock);
 366                return stab;
 367        }
 368
 369        spin_unlock(&qdisc_stab_lock);
 370
 371        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 372        if (!stab)
 373                return ERR_PTR(-ENOMEM);
 374
 375        stab->refcnt = 1;
 376        stab->szopts = *s;
 377        if (tsize > 0)
 378                memcpy(stab->data, tab, tsize * sizeof(u16));
 379
 380        spin_lock(&qdisc_stab_lock);
 381        list_add_tail(&stab->list, &qdisc_stab_list);
 382        spin_unlock(&qdisc_stab_lock);
 383
 384        return stab;
 385}
 386
/* Drop a reference on size table @tab (NULL is a no-op).  The last
 * put unlinks the table from the global stab list and frees it.
 * Refcount and list are protected by qdisc_stab_lock. */
void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree(tab);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);
 402
/* Dump the size-table parameters as a TCA_STAB nested attribute.
 * Only the tc_sizespec header is emitted, not the raw slot data.
 * Returns skb->len on success, -1 if the skb ran out of room (the
 * NLA_PUT macro jumps to nla_put_failure on overflow). */
static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}
 418
/* Override qdisc_skb_cb(skb)->pkt_len using the qdisc's size table:
 *
 *   slot    = (skb->len + overhead + cell_align) >> cell_log
 *   pkt_len = stab->data[slot] << size_log
 *
 * Slots beyond the table are extrapolated linearly from the last
 * entry.  The result is clamped to a minimum of 1. */
void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	/* empty table: only the overhead adjustment applies */
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		/* out of range: extrapolate from the last entry */
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(qdisc_calculate_pkt_len);
 446
 447static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 448{
 449        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 450                                                 timer);
 451
 452        wd->qdisc->flags &= ~TCQ_F_THROTTLED;
 453        __netif_schedule(qdisc_root(wd->qdisc));
 454
 455        return HRTIMER_NORESTART;
 456}
 457
 458void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
 459{
 460        hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 461        wd->timer.function = qdisc_watchdog;
 462        wd->qdisc = qdisc;
 463}
 464EXPORT_SYMBOL(qdisc_watchdog_init);
 465
 466void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
 467{
 468        ktime_t time;
 469
 470        if (test_bit(__QDISC_STATE_DEACTIVATED,
 471                     &qdisc_root_sleeping(wd->qdisc)->state))
 472                return;
 473
 474        wd->qdisc->flags |= TCQ_F_THROTTLED;
 475        time = ktime_set(0, 0);
 476        time = ktime_add_ns(time, PSCHED_US2NS(expires));
 477        hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
 478}
 479EXPORT_SYMBOL(qdisc_watchdog_schedule);
 480
/* Cancel a pending watchdog timer (waiting out a running callback)
 * and clear the qdisc's throttled flag. */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
 487
 488static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 489{
 490        unsigned int size = n * sizeof(struct hlist_head), i;
 491        struct hlist_head *h;
 492
 493        if (size <= PAGE_SIZE)
 494                h = kmalloc(size, GFP_KERNEL);
 495        else
 496                h = (struct hlist_head *)
 497                        __get_free_pages(GFP_KERNEL, get_order(size));
 498
 499        if (h != NULL) {
 500                for (i = 0; i < n; i++)
 501                        INIT_HLIST_HEAD(&h[i]);
 502        }
 503        return h;
 504}
 505
 506static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
 507{
 508        unsigned int size = n * sizeof(struct hlist_head);
 509
 510        if (size <= PAGE_SIZE)
 511                kfree(h);
 512        else
 513                free_pages((unsigned long)h, get_order(size));
 514}
 515
/* Double the class hash table when the load factor exceeds 0.75.
 * The new bucket array is allocated outside the qdisc tree lock;
 * rehashing and the pointer swap happen under sch_tree_lock() so
 * readers never see a half-built table, and the old array is freed
 * only after the lock is dropped.  Allocation failure simply keeps
 * the old (denser) table. */
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		/* _safe: each entry is moved to the new table */
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
 551
 552int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 553{
 554        unsigned int size = 4;
 555
 556        clhash->hash = qdisc_class_hash_alloc(size);
 557        if (clhash->hash == NULL)
 558                return -ENOMEM;
 559        clhash->hashsize  = size;
 560        clhash->hashmask  = size - 1;
 561        clhash->hashelems = 0;
 562        return 0;
 563}
 564EXPORT_SYMBOL(qdisc_class_hash_init);
 565
/* Release the bucket array of a class hash; entries must already have
 * been removed by the owning qdisc. */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
 571
 572void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
 573                             struct Qdisc_class_common *cl)
 574{
 575        unsigned int h;
 576
 577        INIT_HLIST_NODE(&cl->hnode);
 578        h = qdisc_class_hash(cl->classid, clhash->hashmask);
 579        hlist_add_head(&cl->hnode, &clhash->hash[h]);
 580        clhash->hashelems++;
 581}
 582EXPORT_SYMBOL(qdisc_class_hash_insert);
 583
/* Unlink class @cl from its hash bucket and update the element count. */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
 591
 592/* Allocate an unique handle from space managed by kernel */
 593
 594static u32 qdisc_alloc_handle(struct net_device *dev)
 595{
 596        int i = 0x10000;
 597        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 598
 599        do {
 600                autohandle += TC_H_MAKE(0x10000U, 0);
 601                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
 602                        autohandle = TC_H_MAKE(0x80000000U, 0);
 603        } while (qdisc_lookup(dev, autohandle) && --i > 0);
 604
 605        return i>0 ? autohandle : 0;
 606}
 607
/* Attach toplevel qdisc to device queue.
 *
 * Install @qdisc (or &noop_qdisc when NULL) as the queue's sleeping
 * qdisc and park the active pointer on &noop_qdisc; a later
 * dev_activate() makes the new qdisc live.  Returns the previous
 * sleeping qdisc, whose reference the caller now owns. */
static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
				     struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune old scheduler */
	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
		qdisc_reset(oqdisc);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
 633
/* Propagate a queue-length decrease of @n packets from @sch up
 * through every ancestor qdisc, giving classful parents a
 * qlen_notify() callback so they can deactivate now-empty classes.
 * Stops at the root or when the parent is the ingress pseudo-qdisc.
 *
 * NOTE(review): cops is dereferenced unconditionally; this assumes
 * any qdisc that has a child is classful (cl_ops != NULL) -- confirm. */
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			/* a non-root parent must be findable by handle */
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
 661
 662static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
 663                               struct Qdisc *old, struct Qdisc *new)
 664{
 665        if (new || old)
 666                qdisc_notify(skb, n, clid, old, new);
 667
 668        if (old)
 669                qdisc_destroy(old);
 670}
 671
 672/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 673 * to device "dev".
 674 *
 675 * When appropriate send a netlink notification using 'skb'
 676 * and "n".
 677 *
 678 * On success, destroy old qdisc.
 679 */
 680
/* Graft qdisc @new in place of @old, either at the device root
 * (parent == NULL: replace the qdisc on every tx queue, or on the rx
 * queue for ingress) or as the leaf of class @classid of @parent.
 * Sends a notification and destroys the old qdisc on success.
 * Returns 0 or a negative errno. */
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		/* ingress grafts touch only the single rx queue */
		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
		}

		/* quiesce the device while swapping root qdiscs */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = &dev->rx_queue;

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* one reference per queue beyond the first */
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			notify_and_destroy(skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		/* inner graft: delegate to the parent class's ops */
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			}
		}
		if (!err)
			notify_and_destroy(skb, n, classid, old, new);
	}
	return err;
}
 734
 735/* lockdep annotation is needed for ingress; egress gets it only for name */
 736static struct lock_class_key qdisc_tx_lock;
 737static struct lock_class_key qdisc_rx_lock;
 738
 739/*
 740   Allocate and initialize new qdisc.
 741
 742   Parameters are passed via opt.
 743 */
 744
/*
 * Allocate and initialize a new qdisc of the kind named in
 * tca[TCA_KIND], attached under @parent with the given @handle
 * (0 = allocate one; TC_H_INGRESS = ingress qdisc).
 *
 * On failure returns NULL and stores a negative errno in *errp.
 * -EAGAIN is special: RTNL was dropped to load the scheduler module,
 * so the caller must replay the entire request.
 */
static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     u32 parent, u32 handle, struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		/* distinct lockdep class: rx qdisc lock nests differently */
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out3;
			}
			sch->stab = stab;
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			/* pick the lock the estimator timer must take:
			 * the root's for inner qdiscs, our own otherwise */
			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * a ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	/* NOTE(review): assumes qdisc_alloc() took a device reference
	 * that must be dropped here -- confirm against qdisc_alloc(). */
	qdisc_put_stab(sch->stab);
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}
 858
/* Apply a change request to an existing qdisc: TCA_OPTIONS via
 * ops->change, then swap in the new size table and (best effort)
 * replace the rate estimator.  Returns 0 or a negative errno; once
 * ops->change has succeeded, later steps are not rolled back. */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	/* swap: the old table is dropped even when no new one was given */
	qdisc_put_stab(sch->stab);
	sch->stab = stab;

	if (tca[TCA_RATE])
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
					    qdisc_root_sleeping_lock(sch),
					    tca[TCA_RATE]);

	return 0;
}
 890
/* Walker state for check_loop(): @p is the prospective new parent we
 * search for below a candidate child, @depth bounds the recursion. */
struct check_loop_arg
{
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};
 897
 898static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
 899
 900static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
 901{
 902        struct check_loop_arg   arg;
 903
 904        if (q->ops->cl_ops == NULL)
 905                return 0;
 906
 907        arg.w.stop = arg.w.skip = arg.w.count = 0;
 908        arg.w.fn = check_loop_fn;
 909        arg.depth = depth;
 910        arg.p = p;
 911        q->ops->cl_ops->walk(q, &arg.w);
 912        return arg.w.stop ? -ELOOP : 0;
 913}
 914
 915static int
 916check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
 917{
 918        struct Qdisc *leaf;
 919        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
 920        struct check_loop_arg *arg = (struct check_loop_arg *)w;
 921
 922        leaf = cops->leaf(q, cl);
 923        if (leaf) {
 924                if (leaf == arg->p || arg->depth > 7)
 925                        return -ELOOP;
 926                return check_loop(leaf, arg->p, arg->depth + 1);
 927        }
 928        return 0;
 929}
 930
/*
 * Delete/get qdisc (RTM_DELQDISC / RTM_GETQDISC handler).
 *
 * tcm_parent (clid) selects how the qdisc is located:
 *   TC_H_ROOT      -> the device's root (tx queue 0) qdisc,
 *   ingress major  -> the ingress qdisc,
 *   other major    -> the leaf grafted under that parent qdisc,
 *   0              -> direct lookup by tcm_handle.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	/* Packet scheduling is not namespace-aware at this point. */
	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			/* NOTE(review): this compares majors, while
			 * tc_modify_qdisc() compares the full handle
			 * against TC_H_INGRESS - confirm the asymmetry
			 * is intentional. */
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				/* Resolve the parent, then the child
				 * grafted at clid's minor. */
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		/* If the caller also supplied a handle it must match. */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	/* Optional kind attribute must match the found qdisc. */
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		/* Deleting requires an explicit parent and a real
		 * (non-builtin, handle != 0) qdisc. */
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		/* Grafting NULL in place of q unlinks and destroys it;
		 * qdisc_graft() sends the notification itself. */
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
			return err;
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}
 995
/*
   Create/change qdisc.

   RTM_NEWQDISC handler.  Depending on the NLM_F_* flags and whether a
   matching qdisc already exists, this either changes the existing
   qdisc in place, grafts an existing one onto a new parent, or
   creates a fresh qdisc and grafts it (create_n_graft path).
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	/* Packet scheduling is not namespace-aware at this point. */
	if (net != &init_net)
		return -EINVAL;

replay:
	/* Reinit, just in case something touches this.
	 * (qdisc_create() may return -EAGAIN after a module request,
	 * in which case the whole message is reprocessed.) */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				/* Locate parent and the child currently
				 * grafted at clid's minor. */
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /*ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				/* A different child is attached; replacing
				 * it must be requested explicitly. */
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				/* Qdisc handles are always X:0. */
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				/* Moving an existing qdisc: make sure the
				 * graft does not create a cycle. */
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				/* Reference handed over to qdisc_graft(). */
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   either to change it or to create/graft new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, requestor wanted to say,
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft, if
				 *   user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		/* No parent given: address the qdisc directly. */
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, &dev->rx_queue,
				 tcm->tcm_parent, tcm->tcm_parent,
				 tca, &err);
	else
		q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	if (q == NULL) {
		/* -EAGAIN: a qdisc module was just loaded; retry. */
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}
1138
/*
 * Fill one RTM_NEWQDISC/RTM_DELQDISC message for @q into @skb,
 * including kind, qdisc-specific options, size table and statistics.
 * Returns skb->len on success, -1 if the skb ran out of room (the
 * partial message is trimmed back to @b).
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point on failure */
	struct gnet_dump d;

	/* NB: NLMSG_NEW() jumps to nlmsg_failure when the skb is full. */
	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	/* tcm_info carries the current reference count to user space. */
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	/* Refresh the queue length for the stats copy below. */
	q->qstats.qlen = q->q.qlen;

	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	/* Patch the final message length into the header. */
	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1187
1188static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1189                        u32 clid, struct Qdisc *old, struct Qdisc *new)
1190{
1191        struct sk_buff *skb;
1192        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1193
1194        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1195        if (!skb)
1196                return -ENOBUFS;
1197
1198        if (old && old->handle) {
1199                if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1200                        goto err_out;
1201        }
1202        if (new) {
1203                if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1204                        goto err_out;
1205        }
1206
1207        if (skb->len)
1208                return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1209
1210err_out:
1211        kfree_skb(skb);
1212        return -EINVAL;
1213}
1214
1215static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1216{
1217        return (q->flags & TCQ_F_BUILTIN) ? true : false;
1218}
1219
1220static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1221                              struct netlink_callback *cb,
1222                              int *q_idx_p, int s_q_idx)
1223{
1224        int ret = 0, q_idx = *q_idx_p;
1225        struct Qdisc *q;
1226
1227        if (!root)
1228                return 0;
1229
1230        q = root;
1231        if (q_idx < s_q_idx) {
1232                q_idx++;
1233        } else {
1234                if (!tc_qdisc_dump_ignore(q) &&
1235                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1236                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1237                        goto done;
1238                q_idx++;
1239        }
1240        list_for_each_entry(q, &root->list, list) {
1241                if (q_idx < s_q_idx) {
1242                        q_idx++;
1243                        continue;
1244                }
1245                if (!tc_qdisc_dump_ignore(q) && 
1246                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1247                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1248                        goto done;
1249                q_idx++;
1250        }
1251
1252out:
1253        *q_idx_p = q_idx;
1254        return ret;
1255done:
1256        ret = -1;
1257        goto out;
1258}
1259
/*
 * RTM_GETQDISC dump handler: walk every device in init_net and dump
 * the qdiscs of its egress root (tx queue 0) and ingress queue.
 * cb->args[0] holds the device index to resume from, cb->args[1] the
 * qdisc index within that device.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	/* Packet scheduling is not namespace-aware at this point. */
	if (net != &init_net)
		return 0;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);	/* stabilize the device list */
	idx = 0;
	for_each_netdev(&init_net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;	/* new device: restart the qdisc cursor */
		q_idx = 0;

		dev_queue = netdev_get_tx_queue(dev, 0);
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = &dev->rx_queue;
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	read_unlock(&dev_base_lock);

	/* Save the resume cursor for the next dump round. */
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1303
1304
1305
1306/************************************************
1307 *      Traffic classes manipulation.           *
1308 ************************************************/
1309
1310
1311
/*
 * RTM_{NEW,DEL,GET}TCLASS handler: resolve the owning qdisc from
 * tcm_parent/tcm_handle, look the class up through the qdisc's class
 * ops and dispatch on the message type (create/change, delete, get).
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	/* Packet scheduling is not namespace-aware at this point. */
	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0         - parent is root class.
	   parent == X:Y         - parent is a node in hierarchy.
	   parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0         - generate handle from kernel pool.
	   handle == 0:Y         - class is X:Y, where X:0 is qdisc.
	   handle == X:Y         - clear.
	   handle == X:0         - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;

		/* Now qid is genuine qdisc handle consistent
		   both with parent and child.

		   TC_H_MAJ(pid) still may be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);	/* takes a class reference */

	if (cl == 0) {
		err = -ENOENT;
		/* Only RTM_NEWTCLASS with NLM_F_CREATE may proceed
		 * without an existing class. */
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	/* Create or change: delegate to the qdisc's class ops. */
	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);	/* release the cops->get() reference */

	return err;
}
1433
1434
/*
 * Fill one RTM_*TCLASS message for class @cl of @q into @skb.
 * Returns skb->len on success, -1 if the skb ran out of room (the
 * partial message is trimmed back to @b).
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point on failure */
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	/* NB: NLMSG_NEW() jumps to nlmsg_failure when the skb is full. */
	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	/* Placeholder: cl_ops->dump() below is presumably expected to
	 * overwrite tcm_handle/tcm_parent with the class identifiers
	 * - TODO confirm against the class implementations. */
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	/* Patch the final message length into the header. */
	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1474
1475static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1476                          struct Qdisc *q, unsigned long cl, int event)
1477{
1478        struct sk_buff *skb;
1479        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1480
1481        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1482        if (!skb)
1483                return -ENOBUFS;
1484
1485        if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1486                kfree_skb(skb);
1487                return -EINVAL;
1488        }
1489
1490        return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1491}
1492
/* State threaded through the class walker while dumping classes.
 * The generic walker must stay first so callbacks can cast back. */
struct qdisc_dump_args
{
	struct qdisc_walker w;		/* generic walk cursor + callback */
	struct sk_buff *skb;		/* dump skb being filled */
	struct netlink_callback *cb;	/* netlink dump state (pid, seq, args) */
};
1499
1500static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1501{
1502        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1503
1504        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1505                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1506}
1507
/*
 * Dump the classes of one qdisc if it matches the dump cursor and the
 * optional tcm_parent filter.  *t_p counts qdiscs visited, s_t is the
 * qdisc index to resume from (netlink dump restart).  Returns 0 to
 * continue with the next qdisc, -1 once the skb filled up.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	/* Skip builtin qdiscs, already-dumped ones, classless qdiscs
	 * and those not matching the requested parent major. */
	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	/* Entering a fresh qdisc: clear the per-qdisc class cursors. */
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];	/* classes already dumped last round */
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;	/* resume point for the next round */
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
1536
1537static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1538                               struct tcmsg *tcm, struct netlink_callback *cb,
1539                               int *t_p, int s_t)
1540{
1541        struct Qdisc *q;
1542
1543        if (!root)
1544                return 0;
1545
1546        if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1547                return -1;
1548
1549        list_for_each_entry(q, &root->list, list) {
1550                if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1551                        return -1;
1552        }
1553
1554        return 0;
1555}
1556
/*
 * RTM_GETTCLASS dump handler: dump the classes of the egress root and
 * the ingress qdisc of one device.  cb->args[0] holds the qdisc index
 * to resume from; cb->args[1] the class cursor within that qdisc.
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	/* Packet scheduling is not namespace-aware at this point. */
	if (net != &init_net)
		return 0;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = &dev->rx_queue;
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	/* Drop the reference taken by dev_get_by_index(). */
	dev_put(dev);
	return skb->len;
}
1590
1591/* Main classifier routine: scans classifier chain attached
1592   to this qdisc, (optionally) tests for protocol and asks
1593   specific classifiers.
1594 */
1595int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1596                       struct tcf_result *res)
1597{
1598        __be16 protocol = skb->protocol;
1599        int err = 0;
1600
1601        for (; tp; tp = tp->next) {
1602                if ((tp->protocol == protocol ||
1603                     tp->protocol == htons(ETH_P_ALL)) &&
1604                    (err = tp->classify(skb, tp, res)) >= 0) {
1605#ifdef CONFIG_NET_CLS_ACT
1606                        if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1607                                skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1608#endif
1609                        return err;
1610                }
1611        }
1612        return -1;
1613}
1614EXPORT_SYMBOL(tc_classify_compat);
1615
1616int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1617                struct tcf_result *res)
1618{
1619        int err = 0;
1620        __be16 protocol;
1621#ifdef CONFIG_NET_CLS_ACT
1622        struct tcf_proto *otp = tp;
1623reclassify:
1624#endif
1625        protocol = skb->protocol;
1626
1627        err = tc_classify_compat(skb, tp, res);
1628#ifdef CONFIG_NET_CLS_ACT
1629        if (err == TC_ACT_RECLASSIFY) {
1630                u32 verd = G_TC_VERD(skb->tc_verd);
1631                tp = otp;
1632
1633                if (verd++ >= MAX_REC_LOOP) {
1634                        printk("rule prio %u protocol %02x reclassify loop, "
1635                               "packet dropped\n",
1636                               tp->prio&0xffff, ntohs(tp->protocol));
1637                        return TC_ACT_SHOT;
1638                }
1639                skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
1640                goto reclassify;
1641        }
1642#endif
1643        return err;
1644}
1645EXPORT_SYMBOL(tc_classify);
1646
/* Free a single classifier: let it release its internal state, drop
 * the module reference pinning its ops, then free the struct itself. */
void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}
1653
1654void tcf_destroy_chain(struct tcf_proto **fl)
1655{
1656        struct tcf_proto *tp;
1657
1658        while ((tp = *fl) != NULL) {
1659                *fl = tp->next;
1660                tcf_destroy(tp);
1661        }
1662}
1663EXPORT_SYMBOL(tcf_destroy_chain);
1664
1665#ifdef CONFIG_PROC_FS
/*
 * Seq handler for /proc/net/psched: four hex words that tc(8) reads to
 * convert between its tick units and real time:
 *   1. NSEC_PER_USEC, 2. nanoseconds per psched tick (PSCHED_US2NS(1)),
 *   3. legacy clock-resolution field, hard-wired to 1000000,
 *   4. hrtimer-granularity figure derived from the CLOCK_MONOTONIC
 *      resolution (NOTE(review): exact user-space interpretation of
 *      fields 3/4 is by convention with tc(8) - verify there).
 */
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}
1678
/* Open /proc/net/psched as a single-shot seq_file. */
static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}
1683
/* File operations for /proc/net/psched (read-only, seq_file backed). */
static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
1691#endif
1692
/*
 * Subsystem init: register the built-in fifo qdiscs, create the
 * /proc/net/psched file and hook up the rtnetlink handlers for qdisc
 * and class management.
 * NOTE(review): return values of register_qdisc() and
 * proc_net_fops_create() are ignored; a failure would leave the
 * subsystem partially initialized - confirm this is acceptable here.
 */
static int __init pktsched_init(void)
{
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);

	/* Qdisc create/delete/get (+ dump on GET). */
	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	/* Class create/delete/get (+ dump on GET). */
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);
1710