linux/net/sched/sch_api.c
<<
>>
Prefs
   1/*
   2 * net/sched/sch_api.c  Packet scheduler API.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Fixes:
  12 *
  13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  16 */
  17
  18#include <linux/module.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/string.h>
  22#include <linux/errno.h>
  23#include <linux/skbuff.h>
  24#include <linux/init.h>
  25#include <linux/proc_fs.h>
  26#include <linux/seq_file.h>
  27#include <linux/kmod.h>
  28#include <linux/list.h>
  29#include <linux/hrtimer.h>
  30#include <linux/lockdep.h>
  31
  32#include <net/net_namespace.h>
  33#include <net/sock.h>
  34#include <net/netlink.h>
  35#include <net/pkt_sched.h>
  36
  37static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
  38                        struct Qdisc *old, struct Qdisc *new);
  39static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
  40                         struct Qdisc *q, unsigned long cl, int event);
  41
  42/*
  43
  44   Short review.
  45   -------------
  46
  47   This file consists of two interrelated parts:
  48
  49   1. queueing disciplines manager frontend.
  50   2. traffic classes manager frontend.
  51
  52   Generally, queueing discipline ("qdisc") is a black box,
  53   which is able to enqueue packets and to dequeue them (when
  54   device is ready to send something) in order and at times
  55   determined by algorithm hidden in it.
  56
  57   qdisc's are divided to two categories:
  58   - "queues", which have no internal structure visible from outside.
  59   - "schedulers", which split all the packets to "traffic classes",
  60     using "packet classifiers" (look at cls_api.c)
  61
  62   In turn, classes may have child qdiscs (as rule, queues)
  63   attached to them etc. etc. etc.
  64
     The goal of the routines in this file is to translate
     information supplied by the user in the form of handles
     into a form more intelligible to the kernel, to perform sanity
     checks and the part of the work that is common to all qdiscs,
     and to provide rtnetlink notifications.
  70
  71   All real intelligent work is done inside qdisc modules.
  72
  73
  74
  75   Every discipline has two major routines: enqueue and dequeue.
  76
  77   ---dequeue
  78
  79   dequeue usually returns a skb to send. It is allowed to return NULL,
  80   but it does not mean that queue is empty, it just means that
  81   discipline does not want to send anything this time.
  82   Queue is really empty if q->q.qlen == 0.
  83   For complicated disciplines with multiple queues q->q is not
  84   real packet queue, but however q->q.qlen must be valid.
  85
  86   ---enqueue
  87
     enqueue returns 0 if the packet was enqueued successfully.
     If a packet (this one or another one) was dropped, it returns
     a non-zero error code.
  91   NET_XMIT_DROP        - this packet dropped
  92     Expected action: do not backoff, but wait until queue will clear.
  93   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  94     Expected action: backoff or ignore
  95   NET_XMIT_POLICED     - dropped by police.
  96     Expected action: backoff or error to real-time apps.
  97
  98   Auxiliary routines:
  99
 100   ---peek
 101
 102   like dequeue but without removing a packet from the queue
 103
 104   ---reset
 105
 106   returns qdisc to initial state: purge all buffers, clear all
 107   timers, counters (except for statistics) etc.
 108
 109   ---init
 110
 111   initializes newly created qdisc.
 112
 113   ---destroy
 114
 115   destroys resources allocated by init and during lifetime of qdisc.
 116
 117   ---change
 118
 119   changes qdisc parameters.
 120 */
 121
 /*
  * Guards the list of registered TC modules (qdisc_base).
  * It is a pure SMP lock.
  */
 static DEFINE_RWLOCK(qdisc_mod_lock);
 124
 125
 126/************************************************
 127 *      Queueing disciplines manipulation.      *
 128 ************************************************/
 129
 130
 /*
  * Head of the singly linked list (chained through ->next) of all
  * installed queueing disciplines.
  */
 static struct Qdisc_ops *qdisc_base;
 134
 135/* Register/uregister queueing discipline */
 136
 137int register_qdisc(struct Qdisc_ops *qops)
 138{
 139        struct Qdisc_ops *q, **qp;
 140        int rc = -EEXIST;
 141
 142        write_lock(&qdisc_mod_lock);
 143        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 144                if (!strcmp(qops->id, q->id))
 145                        goto out;
 146
 147        if (qops->enqueue == NULL)
 148                qops->enqueue = noop_qdisc_ops.enqueue;
 149        if (qops->peek == NULL) {
 150                if (qops->dequeue == NULL) {
 151                        qops->peek = noop_qdisc_ops.peek;
 152                } else {
 153                        rc = -EINVAL;
 154                        goto out;
 155                }
 156        }
 157        if (qops->dequeue == NULL)
 158                qops->dequeue = noop_qdisc_ops.dequeue;
 159
 160        qops->next = NULL;
 161        *qp = qops;
 162        rc = 0;
 163out:
 164        write_unlock(&qdisc_mod_lock);
 165        return rc;
 166}
 167EXPORT_SYMBOL(register_qdisc);
 168
 169int unregister_qdisc(struct Qdisc_ops *qops)
 170{
 171        struct Qdisc_ops *q, **qp;
 172        int err = -ENOENT;
 173
 174        write_lock(&qdisc_mod_lock);
 175        for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
 176                if (q == qops)
 177                        break;
 178        if (q) {
 179                *qp = q->next;
 180                q->next = NULL;
 181                err = 0;
 182        }
 183        write_unlock(&qdisc_mod_lock);
 184        return err;
 185}
 186EXPORT_SYMBOL(unregister_qdisc);
 187
 188/* We know handle. Find qdisc among all qdisc's attached to device
 189   (root qdisc, all its children, children of children etc.)
 190 */
 191
 192static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 193{
 194        struct Qdisc *q;
 195
 196        if (!(root->flags & TCQ_F_BUILTIN) &&
 197            root->handle == handle)
 198                return root;
 199
 200        list_for_each_entry(q, &root->list, list) {
 201                if (q->handle == handle)
 202                        return q;
 203        }
 204        return NULL;
 205}
 206
 207static void qdisc_list_add(struct Qdisc *q)
 208{
 209        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 210                list_add_tail(&q->list, &qdisc_root_sleeping(q)->list);
 211}
 212
 213void qdisc_list_del(struct Qdisc *q)
 214{
 215        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 216                list_del(&q->list);
 217}
 218EXPORT_SYMBOL(qdisc_list_del);
 219
 220struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 221{
 222        unsigned int i;
 223        struct Qdisc *q;
 224
 225        for (i = 0; i < dev->num_tx_queues; i++) {
 226                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 227                struct Qdisc *txq_root = txq->qdisc_sleeping;
 228
 229                q = qdisc_match_from_root(txq_root, handle);
 230                if (q)
 231                        goto out;
 232        }
 233
 234        q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
 235out:
 236        return q;
 237}
 238
 239static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 240{
 241        unsigned long cl;
 242        struct Qdisc *leaf;
 243        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 244
 245        if (cops == NULL)
 246                return NULL;
 247        cl = cops->get(p, classid);
 248
 249        if (cl == 0)
 250                return NULL;
 251        leaf = cops->leaf(p, cl);
 252        cops->put(p, cl);
 253        return leaf;
 254}
 255
 256/* Find queueing discipline by name */
 257
 258static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 259{
 260        struct Qdisc_ops *q = NULL;
 261
 262        if (kind) {
 263                read_lock(&qdisc_mod_lock);
 264                for (q = qdisc_base; q; q = q->next) {
 265                        if (nla_strcmp(kind, q->id) == 0) {
 266                                if (!try_module_get(q->owner))
 267                                        q = NULL;
 268                                break;
 269                        }
 270                }
 271                read_unlock(&qdisc_mod_lock);
 272        }
 273        return q;
 274}
 275
 /* Head of the singly linked list of shared, reference-counted rate tables. */
 static struct qdisc_rate_table *qdisc_rtab_list;
 277
 278struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
 279{
 280        struct qdisc_rate_table *rtab;
 281
 282        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 283                if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
 284                        rtab->refcnt++;
 285                        return rtab;
 286                }
 287        }
 288
 289        if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
 290            nla_len(tab) != TC_RTAB_SIZE)
 291                return NULL;
 292
 293        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 294        if (rtab) {
 295                rtab->rate = *r;
 296                rtab->refcnt = 1;
 297                memcpy(rtab->data, nla_data(tab), 1024);
 298                rtab->next = qdisc_rtab_list;
 299                qdisc_rtab_list = rtab;
 300        }
 301        return rtab;
 302}
 303EXPORT_SYMBOL(qdisc_get_rtab);
 304
 305void qdisc_put_rtab(struct qdisc_rate_table *tab)
 306{
 307        struct qdisc_rate_table *rtab, **rtabp;
 308
 309        if (!tab || --tab->refcnt)
 310                return;
 311
 312        for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
 313                if (rtab == tab) {
 314                        *rtabp = rtab->next;
 315                        kfree(rtab);
 316                        return;
 317                }
 318        }
 319}
 320EXPORT_SYMBOL(qdisc_put_rtab);
 321
 322static LIST_HEAD(qdisc_stab_list);
 323static DEFINE_SPINLOCK(qdisc_stab_lock);
 324
 325static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
 326        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
 327        [TCA_STAB_DATA] = { .type = NLA_BINARY },
 328};
 329
 330static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 331{
 332        struct nlattr *tb[TCA_STAB_MAX + 1];
 333        struct qdisc_size_table *stab;
 334        struct tc_sizespec *s;
 335        unsigned int tsize = 0;
 336        u16 *tab = NULL;
 337        int err;
 338
 339        err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
 340        if (err < 0)
 341                return ERR_PTR(err);
 342        if (!tb[TCA_STAB_BASE])
 343                return ERR_PTR(-EINVAL);
 344
 345        s = nla_data(tb[TCA_STAB_BASE]);
 346
 347        if (s->tsize > 0) {
 348                if (!tb[TCA_STAB_DATA])
 349                        return ERR_PTR(-EINVAL);
 350                tab = nla_data(tb[TCA_STAB_DATA]);
 351                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 352        }
 353
 354        if (!s || tsize != s->tsize || (!tab && tsize > 0))
 355                return ERR_PTR(-EINVAL);
 356
 357        spin_lock(&qdisc_stab_lock);
 358
 359        list_for_each_entry(stab, &qdisc_stab_list, list) {
 360                if (memcmp(&stab->szopts, s, sizeof(*s)))
 361                        continue;
 362                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 363                        continue;
 364                stab->refcnt++;
 365                spin_unlock(&qdisc_stab_lock);
 366                return stab;
 367        }
 368
 369        spin_unlock(&qdisc_stab_lock);
 370
 371        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 372        if (!stab)
 373                return ERR_PTR(-ENOMEM);
 374
 375        stab->refcnt = 1;
 376        stab->szopts = *s;
 377        if (tsize > 0)
 378                memcpy(stab->data, tab, tsize * sizeof(u16));
 379
 380        spin_lock(&qdisc_stab_lock);
 381        list_add_tail(&stab->list, &qdisc_stab_list);
 382        spin_unlock(&qdisc_stab_lock);
 383
 384        return stab;
 385}
 386
 387void qdisc_put_stab(struct qdisc_size_table *tab)
 388{
 389        if (!tab)
 390                return;
 391
 392        spin_lock(&qdisc_stab_lock);
 393
 394        if (--tab->refcnt == 0) {
 395                list_del(&tab->list);
 396                kfree(tab);
 397        }
 398
 399        spin_unlock(&qdisc_stab_lock);
 400}
 401EXPORT_SYMBOL(qdisc_put_stab);
 402
 403static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 404{
 405        struct nlattr *nest;
 406
 407        nest = nla_nest_start(skb, TCA_STAB);
 408        if (nest == NULL)
 409                goto nla_put_failure;
 410        NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
 411        nla_nest_end(skb, nest);
 412
 413        return skb->len;
 414
 415nla_put_failure:
 416        return -1;
 417}
 418
 419void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
 420{
 421        int pkt_len, slot;
 422
 423        pkt_len = skb->len + stab->szopts.overhead;
 424        if (unlikely(!stab->szopts.tsize))
 425                goto out;
 426
 427        slot = pkt_len + stab->szopts.cell_align;
 428        if (unlikely(slot < 0))
 429                slot = 0;
 430
 431        slot >>= stab->szopts.cell_log;
 432        if (likely(slot < stab->szopts.tsize))
 433                pkt_len = stab->data[slot];
 434        else
 435                pkt_len = stab->data[stab->szopts.tsize - 1] *
 436                                (slot / stab->szopts.tsize) +
 437                                stab->data[slot % stab->szopts.tsize];
 438
 439        pkt_len <<= stab->szopts.size_log;
 440out:
 441        if (unlikely(pkt_len < 1))
 442                pkt_len = 1;
 443        qdisc_skb_cb(skb)->pkt_len = pkt_len;
 444}
 445EXPORT_SYMBOL(qdisc_calculate_pkt_len);
 446
 447void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
 448{
 449        if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
 450                printk(KERN_WARNING
 451                       "%s: %s qdisc %X: is non-work-conserving?\n",
 452                       txt, qdisc->ops->id, qdisc->handle >> 16);
 453                qdisc->flags |= TCQ_F_WARN_NONWC;
 454        }
 455}
 456EXPORT_SYMBOL(qdisc_warn_nonwc);
 457
 458static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 459{
 460        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 461                                                 timer);
 462
 463        wd->qdisc->flags &= ~TCQ_F_THROTTLED;
 464        __netif_schedule(qdisc_root(wd->qdisc));
 465
 466        return HRTIMER_NORESTART;
 467}
 468
 469void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
 470{
 471        hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 472        wd->timer.function = qdisc_watchdog;
 473        wd->qdisc = qdisc;
 474}
 475EXPORT_SYMBOL(qdisc_watchdog_init);
 476
 477void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
 478{
 479        ktime_t time;
 480
 481        if (test_bit(__QDISC_STATE_DEACTIVATED,
 482                     &qdisc_root_sleeping(wd->qdisc)->state))
 483                return;
 484
 485        wd->qdisc->flags |= TCQ_F_THROTTLED;
 486        time = ktime_set(0, 0);
 487        time = ktime_add_ns(time, PSCHED_US2NS(expires));
 488        hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
 489}
 490EXPORT_SYMBOL(qdisc_watchdog_schedule);
 491
 492void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
 493{
 494        hrtimer_cancel(&wd->timer);
 495        wd->qdisc->flags &= ~TCQ_F_THROTTLED;
 496}
 497EXPORT_SYMBOL(qdisc_watchdog_cancel);
 498
 499static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 500{
 501        unsigned int size = n * sizeof(struct hlist_head), i;
 502        struct hlist_head *h;
 503
 504        if (size <= PAGE_SIZE)
 505                h = kmalloc(size, GFP_KERNEL);
 506        else
 507                h = (struct hlist_head *)
 508                        __get_free_pages(GFP_KERNEL, get_order(size));
 509
 510        if (h != NULL) {
 511                for (i = 0; i < n; i++)
 512                        INIT_HLIST_HEAD(&h[i]);
 513        }
 514        return h;
 515}
 516
 517static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
 518{
 519        unsigned int size = n * sizeof(struct hlist_head);
 520
 521        if (size <= PAGE_SIZE)
 522                kfree(h);
 523        else
 524                free_pages((unsigned long)h, get_order(size));
 525}
 526
 527void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
 528{
 529        struct Qdisc_class_common *cl;
 530        struct hlist_node *n, *next;
 531        struct hlist_head *nhash, *ohash;
 532        unsigned int nsize, nmask, osize;
 533        unsigned int i, h;
 534
 535        /* Rehash when load factor exceeds 0.75 */
 536        if (clhash->hashelems * 4 <= clhash->hashsize * 3)
 537                return;
 538        nsize = clhash->hashsize * 2;
 539        nmask = nsize - 1;
 540        nhash = qdisc_class_hash_alloc(nsize);
 541        if (nhash == NULL)
 542                return;
 543
 544        ohash = clhash->hash;
 545        osize = clhash->hashsize;
 546
 547        sch_tree_lock(sch);
 548        for (i = 0; i < osize; i++) {
 549                hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
 550                        h = qdisc_class_hash(cl->classid, nmask);
 551                        hlist_add_head(&cl->hnode, &nhash[h]);
 552                }
 553        }
 554        clhash->hash     = nhash;
 555        clhash->hashsize = nsize;
 556        clhash->hashmask = nmask;
 557        sch_tree_unlock(sch);
 558
 559        qdisc_class_hash_free(ohash, osize);
 560}
 561EXPORT_SYMBOL(qdisc_class_hash_grow);
 562
 563int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 564{
 565        unsigned int size = 4;
 566
 567        clhash->hash = qdisc_class_hash_alloc(size);
 568        if (clhash->hash == NULL)
 569                return -ENOMEM;
 570        clhash->hashsize  = size;
 571        clhash->hashmask  = size - 1;
 572        clhash->hashelems = 0;
 573        return 0;
 574}
 575EXPORT_SYMBOL(qdisc_class_hash_init);
 576
 577void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
 578{
 579        qdisc_class_hash_free(clhash->hash, clhash->hashsize);
 580}
 581EXPORT_SYMBOL(qdisc_class_hash_destroy);
 582
 583void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
 584                             struct Qdisc_class_common *cl)
 585{
 586        unsigned int h;
 587
 588        INIT_HLIST_NODE(&cl->hnode);
 589        h = qdisc_class_hash(cl->classid, clhash->hashmask);
 590        hlist_add_head(&cl->hnode, &clhash->hash[h]);
 591        clhash->hashelems++;
 592}
 593EXPORT_SYMBOL(qdisc_class_hash_insert);
 594
 595void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
 596                             struct Qdisc_class_common *cl)
 597{
 598        hlist_del(&cl->hnode);
 599        clhash->hashelems--;
 600}
 601EXPORT_SYMBOL(qdisc_class_hash_remove);
 602
 603/* Allocate an unique handle from space managed by kernel */
 604
 605static u32 qdisc_alloc_handle(struct net_device *dev)
 606{
 607        int i = 0x10000;
 608        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 609
 610        do {
 611                autohandle += TC_H_MAKE(0x10000U, 0);
 612                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
 613                        autohandle = TC_H_MAKE(0x80000000U, 0);
 614        } while (qdisc_lookup(dev, autohandle) && --i > 0);
 615
 616        return i>0 ? autohandle : 0;
 617}
 618
 619/* Attach toplevel qdisc to device queue. */
 620
 621static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
 622                                     struct Qdisc *qdisc)
 623{
 624        struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
 625        spinlock_t *root_lock;
 626
 627        root_lock = qdisc_lock(oqdisc);
 628        spin_lock_bh(root_lock);
 629
 630        /* Prune old scheduler */
 631        if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
 632                qdisc_reset(oqdisc);
 633
 634        /* ... and graft new one */
 635        if (qdisc == NULL)
 636                qdisc = &noop_qdisc;
 637        dev_queue->qdisc_sleeping = qdisc;
 638        rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
 639
 640        spin_unlock_bh(root_lock);
 641
 642        return oqdisc;
 643}
 644
 645void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
 646{
 647        const struct Qdisc_class_ops *cops;
 648        unsigned long cl;
 649        u32 parentid;
 650
 651        if (n == 0)
 652                return;
 653        while ((parentid = sch->parent)) {
 654                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
 655                        return;
 656
 657                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
 658                if (sch == NULL) {
 659                        WARN_ON(parentid != TC_H_ROOT);
 660                        return;
 661                }
 662                cops = sch->ops->cl_ops;
 663                if (cops->qlen_notify) {
 664                        cl = cops->get(sch, parentid);
 665                        cops->qlen_notify(sch, cl);
 666                        cops->put(sch, cl);
 667                }
 668                sch->q.qlen -= n;
 669        }
 670}
 671EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
 672
 673static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
 674                               struct Qdisc *old, struct Qdisc *new)
 675{
 676        if (new || old)
 677                qdisc_notify(skb, n, clid, old, new);
 678
 679        if (old)
 680                qdisc_destroy(old);
 681}
 682
 683/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 684 * to device "dev".
 685 *
 686 * When appropriate send a netlink notification using 'skb'
 687 * and "n".
 688 *
 689 * On success, destroy old qdisc.
 690 */
 691
 692static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 693                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
 694                       struct Qdisc *new, struct Qdisc *old)
 695{
 696        struct Qdisc *q = old;
 697        int err = 0;
 698
 699        if (parent == NULL) {
 700                unsigned int i, num_q, ingress;
 701
 702                ingress = 0;
 703                num_q = dev->num_tx_queues;
 704                if ((q && q->flags & TCQ_F_INGRESS) ||
 705                    (new && new->flags & TCQ_F_INGRESS)) {
 706                        num_q = 1;
 707                        ingress = 1;
 708                }
 709
 710                if (dev->flags & IFF_UP)
 711                        dev_deactivate(dev);
 712
 713                for (i = 0; i < num_q; i++) {
 714                        struct netdev_queue *dev_queue = &dev->rx_queue;
 715
 716                        if (!ingress)
 717                                dev_queue = netdev_get_tx_queue(dev, i);
 718
 719                        old = dev_graft_qdisc(dev_queue, new);
 720                        if (new && i > 0)
 721                                atomic_inc(&new->refcnt);
 722
 723                        notify_and_destroy(skb, n, classid, old, new);
 724                }
 725
 726                if (dev->flags & IFF_UP)
 727                        dev_activate(dev);
 728        } else {
 729                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
 730
 731                err = -EINVAL;
 732
 733                if (cops) {
 734                        unsigned long cl = cops->get(parent, classid);
 735                        if (cl) {
 736                                err = cops->graft(parent, cl, new, &old);
 737                                cops->put(parent, cl);
 738                        }
 739                }
 740                if (!err)
 741                        notify_and_destroy(skb, n, classid, old, new);
 742        }
 743        return err;
 744}
 745
 746/* lockdep annotation is needed for ingress; egress gets it only for name */
 747static struct lock_class_key qdisc_tx_lock;
 748static struct lock_class_key qdisc_rx_lock;
 749
 750/*
 751   Allocate and initialize new qdisc.
 752
 753   Parameters are passed via opt.
 754 */
 755
 756static struct Qdisc *
 757qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 758             u32 parent, u32 handle, struct nlattr **tca, int *errp)
 759{
 760        int err;
 761        struct nlattr *kind = tca[TCA_KIND];
 762        struct Qdisc *sch;
 763        struct Qdisc_ops *ops;
 764        struct qdisc_size_table *stab;
 765
 766        ops = qdisc_lookup_ops(kind);
 767#ifdef CONFIG_MODULES
 768        if (ops == NULL && kind != NULL) {
 769                char name[IFNAMSIZ];
 770                if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
 771                        /* We dropped the RTNL semaphore in order to
 772                         * perform the module load.  So, even if we
 773                         * succeeded in loading the module we have to
 774                         * tell the caller to replay the request.  We
 775                         * indicate this using -EAGAIN.
 776                         * We replay the request because the device may
 777                         * go away in the mean time.
 778                         */
 779                        rtnl_unlock();
 780                        request_module("sch_%s", name);
 781                        rtnl_lock();
 782                        ops = qdisc_lookup_ops(kind);
 783                        if (ops != NULL) {
 784                                /* We will try again qdisc_lookup_ops,
 785                                 * so don't keep a reference.
 786                                 */
 787                                module_put(ops->owner);
 788                                err = -EAGAIN;
 789                                goto err_out;
 790                        }
 791                }
 792        }
 793#endif
 794
 795        err = -ENOENT;
 796        if (ops == NULL)
 797                goto err_out;
 798
 799        sch = qdisc_alloc(dev_queue, ops);
 800        if (IS_ERR(sch)) {
 801                err = PTR_ERR(sch);
 802                goto err_out2;
 803        }
 804
 805        sch->parent = parent;
 806
 807        if (handle == TC_H_INGRESS) {
 808                sch->flags |= TCQ_F_INGRESS;
 809                handle = TC_H_MAKE(TC_H_INGRESS, 0);
 810                lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
 811        } else {
 812                if (handle == 0) {
 813                        handle = qdisc_alloc_handle(dev);
 814                        err = -ENOMEM;
 815                        if (handle == 0)
 816                                goto err_out3;
 817                }
 818                lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
 819        }
 820
 821        sch->handle = handle;
 822
 823        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
 824                if (tca[TCA_STAB]) {
 825                        stab = qdisc_get_stab(tca[TCA_STAB]);
 826                        if (IS_ERR(stab)) {
 827                                err = PTR_ERR(stab);
 828                                goto err_out3;
 829                        }
 830                        sch->stab = stab;
 831                }
 832                if (tca[TCA_RATE]) {
 833                        spinlock_t *root_lock;
 834
 835                        if ((sch->parent != TC_H_ROOT) &&
 836                            !(sch->flags & TCQ_F_INGRESS))
 837                                root_lock = qdisc_root_sleeping_lock(sch);
 838                        else
 839                                root_lock = qdisc_lock(sch);
 840
 841                        err = gen_new_estimator(&sch->bstats, &sch->rate_est,
 842                                                root_lock, tca[TCA_RATE]);
 843                        if (err) {
 844                                /*
 845                                 * Any broken qdiscs that would require
 846                                 * a ops->reset() here? The qdisc was never
 847                                 * in action so it shouldn't be necessary.
 848                                 */
 849                                if (ops->destroy)
 850                                        ops->destroy(sch);
 851                                goto err_out3;
 852                        }
 853                }
 854
 855                qdisc_list_add(sch);
 856
 857                return sch;
 858        }
 859err_out3:
 860        qdisc_put_stab(sch->stab);
 861        dev_put(dev);
 862        kfree((char *) sch - sch->padded);
 863err_out2:
 864        module_put(ops->owner);
 865err_out:
 866        *errp = err;
 867        return NULL;
 868}
 869
 870static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
 871{
 872        struct qdisc_size_table *stab = NULL;
 873        int err = 0;
 874
 875        if (tca[TCA_OPTIONS]) {
 876                if (sch->ops->change == NULL)
 877                        return -EINVAL;
 878                err = sch->ops->change(sch, tca[TCA_OPTIONS]);
 879                if (err)
 880                        return err;
 881        }
 882
 883        if (tca[TCA_STAB]) {
 884                stab = qdisc_get_stab(tca[TCA_STAB]);
 885                if (IS_ERR(stab))
 886                        return PTR_ERR(stab);
 887        }
 888
 889        qdisc_put_stab(sch->stab);
 890        sch->stab = stab;
 891
 892        if (tca[TCA_RATE])
 893                /* NB: ignores errors from replace_estimator
 894                   because change can't be undone. */
 895                gen_replace_estimator(&sch->bstats, &sch->rate_est,
 896                                            qdisc_root_sleeping_lock(sch),
 897                                            tca[TCA_RATE]);
 898
 899        return 0;
 900}
 901
 902struct check_loop_arg
 903{
 904        struct qdisc_walker     w;
 905        struct Qdisc            *p;
 906        int                     depth;
 907};
 908
 909static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
 910
 911static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
 912{
 913        struct check_loop_arg   arg;
 914
 915        if (q->ops->cl_ops == NULL)
 916                return 0;
 917
 918        arg.w.stop = arg.w.skip = arg.w.count = 0;
 919        arg.w.fn = check_loop_fn;
 920        arg.depth = depth;
 921        arg.p = p;
 922        q->ops->cl_ops->walk(q, &arg.w);
 923        return arg.w.stop ? -ELOOP : 0;
 924}
 925
 926static int
 927check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
 928{
 929        struct Qdisc *leaf;
 930        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
 931        struct check_loop_arg *arg = (struct check_loop_arg *)w;
 932
 933        leaf = cops->leaf(q, cl);
 934        if (leaf) {
 935                if (leaf == arg->p || arg->depth > 7)
 936                        return -ELOOP;
 937                return check_loop(leaf, arg->p, arg->depth + 1);
 938        }
 939        return 0;
 940}
 941
 942/*
 943 * Delete/get qdisc.
 944 */
 945
 946static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 947{
 948        struct net *net = sock_net(skb->sk);
 949        struct tcmsg *tcm = NLMSG_DATA(n);
 950        struct nlattr *tca[TCA_MAX + 1];
 951        struct net_device *dev;
 952        u32 clid = tcm->tcm_parent;
 953        struct Qdisc *q = NULL;
 954        struct Qdisc *p = NULL;
 955        int err;
 956
 957        if (net != &init_net)
 958                return -EINVAL;
 959
 960        if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
 961                return -ENODEV;
 962
 963        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
 964        if (err < 0)
 965                return err;
 966
 967        if (clid) {
 968                if (clid != TC_H_ROOT) {
 969                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
 970                                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
 971                                        return -ENOENT;
 972                                q = qdisc_leaf(p, clid);
 973                        } else { /* ingress */
 974                                q = dev->rx_queue.qdisc_sleeping;
 975                        }
 976                } else {
 977                        struct netdev_queue *dev_queue;
 978                        dev_queue = netdev_get_tx_queue(dev, 0);
 979                        q = dev_queue->qdisc_sleeping;
 980                }
 981                if (!q)
 982                        return -ENOENT;
 983
 984                if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
 985                        return -EINVAL;
 986        } else {
 987                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
 988                        return -ENOENT;
 989        }
 990
 991        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
 992                return -EINVAL;
 993
 994        if (n->nlmsg_type == RTM_DELQDISC) {
 995                if (!clid)
 996                        return -EINVAL;
 997                if (q->handle == 0)
 998                        return -ENOENT;
 999                if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
1000                        return err;
1001        } else {
1002                qdisc_notify(skb, n, clid, NULL, q);
1003        }
1004        return 0;
1005}
1006
1007/*
1008   Create/change qdisc.
1009 */
1010
1011static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1012{
1013        struct net *net = sock_net(skb->sk);
1014        struct tcmsg *tcm;
1015        struct nlattr *tca[TCA_MAX + 1];
1016        struct net_device *dev;
1017        u32 clid;
1018        struct Qdisc *q, *p;
1019        int err;
1020
1021        if (net != &init_net)
1022                return -EINVAL;
1023
1024replay:
1025        /* Reinit, just in case something touches this. */
1026        tcm = NLMSG_DATA(n);
1027        clid = tcm->tcm_parent;
1028        q = p = NULL;
1029
1030        if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1031                return -ENODEV;
1032
1033        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1034        if (err < 0)
1035                return err;
1036
1037        if (clid) {
1038                if (clid != TC_H_ROOT) {
1039                        if (clid != TC_H_INGRESS) {
1040                                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
1041                                        return -ENOENT;
1042                                q = qdisc_leaf(p, clid);
1043                        } else { /*ingress */
1044                                q = dev->rx_queue.qdisc_sleeping;
1045                        }
1046                } else {
1047                        struct netdev_queue *dev_queue;
1048                        dev_queue = netdev_get_tx_queue(dev, 0);
1049                        q = dev_queue->qdisc_sleeping;
1050                }
1051
1052                /* It may be default qdisc, ignore it */
1053                if (q && q->handle == 0)
1054                        q = NULL;
1055
1056                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1057                        if (tcm->tcm_handle) {
1058                                if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
1059                                        return -EEXIST;
1060                                if (TC_H_MIN(tcm->tcm_handle))
1061                                        return -EINVAL;
1062                                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
1063                                        goto create_n_graft;
1064                                if (n->nlmsg_flags&NLM_F_EXCL)
1065                                        return -EEXIST;
1066                                if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1067                                        return -EINVAL;
1068                                if (q == p ||
1069                                    (p && check_loop(q, p, 0)))
1070                                        return -ELOOP;
1071                                atomic_inc(&q->refcnt);
1072                                goto graft;
1073                        } else {
1074                                if (q == NULL)
1075                                        goto create_n_graft;
1076
1077                                /* This magic test requires explanation.
1078                                 *
1079                                 *   We know, that some child q is already
1080                                 *   attached to this parent and have choice:
1081                                 *   either to change it or to create/graft new one.
1082                                 *
1083                                 *   1. We are allowed to create/graft only
1084                                 *   if CREATE and REPLACE flags are set.
1085                                 *
1086                                 *   2. If EXCL is set, requestor wanted to say,
1087                                 *   that qdisc tcm_handle is not expected
1088                                 *   to exist, so that we choose create/graft too.
1089                                 *
1090                                 *   3. The last case is when no flags are set.
1091                                 *   Alas, it is sort of hole in API, we
1092                                 *   cannot decide what to do unambiguously.
1093                                 *   For now we select create/graft, if
1094                                 *   user gave KIND, which does not match existing.
1095                                 */
1096                                if ((n->nlmsg_flags&NLM_F_CREATE) &&
1097                                    (n->nlmsg_flags&NLM_F_REPLACE) &&
1098                                    ((n->nlmsg_flags&NLM_F_EXCL) ||
1099                                     (tca[TCA_KIND] &&
1100                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1101                                        goto create_n_graft;
1102                        }
1103                }
1104        } else {
1105                if (!tcm->tcm_handle)
1106                        return -EINVAL;
1107                q = qdisc_lookup(dev, tcm->tcm_handle);
1108        }
1109
1110        /* Change qdisc parameters */
1111        if (q == NULL)
1112                return -ENOENT;
1113        if (n->nlmsg_flags&NLM_F_EXCL)
1114                return -EEXIST;
1115        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1116                return -EINVAL;
1117        err = qdisc_change(q, tca);
1118        if (err == 0)
1119                qdisc_notify(skb, n, clid, NULL, q);
1120        return err;
1121
1122create_n_graft:
1123        if (!(n->nlmsg_flags&NLM_F_CREATE))
1124                return -ENOENT;
1125        if (clid == TC_H_INGRESS)
1126                q = qdisc_create(dev, &dev->rx_queue,
1127                                 tcm->tcm_parent, tcm->tcm_parent,
1128                                 tca, &err);
1129        else
1130                q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
1131                                 tcm->tcm_parent, tcm->tcm_handle,
1132                                 tca, &err);
1133        if (q == NULL) {
1134                if (err == -EAGAIN)
1135                        goto replay;
1136                return err;
1137        }
1138
1139graft:
1140        err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1141        if (err) {
1142                if (q)
1143                        qdisc_destroy(q);
1144                return err;
1145        }
1146
1147        return 0;
1148}
1149
1150static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1151                         u32 pid, u32 seq, u16 flags, int event)
1152{
1153        struct tcmsg *tcm;
1154        struct nlmsghdr  *nlh;
1155        unsigned char *b = skb_tail_pointer(skb);
1156        struct gnet_dump d;
1157
1158        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1159        tcm = NLMSG_DATA(nlh);
1160        tcm->tcm_family = AF_UNSPEC;
1161        tcm->tcm__pad1 = 0;
1162        tcm->tcm__pad2 = 0;
1163        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1164        tcm->tcm_parent = clid;
1165        tcm->tcm_handle = q->handle;
1166        tcm->tcm_info = atomic_read(&q->refcnt);
1167        NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1168        if (q->ops->dump && q->ops->dump(q, skb) < 0)
1169                goto nla_put_failure;
1170        q->qstats.qlen = q->q.qlen;
1171
1172        if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
1173                goto nla_put_failure;
1174
1175        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1176                                         qdisc_root_sleeping_lock(q), &d) < 0)
1177                goto nla_put_failure;
1178
1179        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1180                goto nla_put_failure;
1181
1182        if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1183            gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
1184            gnet_stats_copy_queue(&d, &q->qstats) < 0)
1185                goto nla_put_failure;
1186
1187        if (gnet_stats_finish_copy(&d) < 0)
1188                goto nla_put_failure;
1189
1190        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1191        return skb->len;
1192
1193nlmsg_failure:
1194nla_put_failure:
1195        nlmsg_trim(skb, b);
1196        return -1;
1197}
1198
1199static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1200                        u32 clid, struct Qdisc *old, struct Qdisc *new)
1201{
1202        struct sk_buff *skb;
1203        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1204
1205        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1206        if (!skb)
1207                return -ENOBUFS;
1208
1209        if (old && old->handle) {
1210                if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1211                        goto err_out;
1212        }
1213        if (new) {
1214                if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1215                        goto err_out;
1216        }
1217
1218        if (skb->len)
1219                return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1220
1221err_out:
1222        kfree_skb(skb);
1223        return -EINVAL;
1224}
1225
1226static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1227{
1228        return (q->flags & TCQ_F_BUILTIN) ? true : false;
1229}
1230
1231static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1232                              struct netlink_callback *cb,
1233                              int *q_idx_p, int s_q_idx)
1234{
1235        int ret = 0, q_idx = *q_idx_p;
1236        struct Qdisc *q;
1237
1238        if (!root)
1239                return 0;
1240
1241        q = root;
1242        if (q_idx < s_q_idx) {
1243                q_idx++;
1244        } else {
1245                if (!tc_qdisc_dump_ignore(q) &&
1246                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1247                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1248                        goto done;
1249                q_idx++;
1250        }
1251        list_for_each_entry(q, &root->list, list) {
1252                if (q_idx < s_q_idx) {
1253                        q_idx++;
1254                        continue;
1255                }
1256                if (!tc_qdisc_dump_ignore(q) && 
1257                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1258                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1259                        goto done;
1260                q_idx++;
1261        }
1262
1263out:
1264        *q_idx_p = q_idx;
1265        return ret;
1266done:
1267        ret = -1;
1268        goto out;
1269}
1270
1271static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1272{
1273        struct net *net = sock_net(skb->sk);
1274        int idx, q_idx;
1275        int s_idx, s_q_idx;
1276        struct net_device *dev;
1277
1278        if (net != &init_net)
1279                return 0;
1280
1281        s_idx = cb->args[0];
1282        s_q_idx = q_idx = cb->args[1];
1283        read_lock(&dev_base_lock);
1284        idx = 0;
1285        for_each_netdev(&init_net, dev) {
1286                struct netdev_queue *dev_queue;
1287
1288                if (idx < s_idx)
1289                        goto cont;
1290                if (idx > s_idx)
1291                        s_q_idx = 0;
1292                q_idx = 0;
1293
1294                dev_queue = netdev_get_tx_queue(dev, 0);
1295                if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
1296                        goto done;
1297
1298                dev_queue = &dev->rx_queue;
1299                if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
1300                        goto done;
1301
1302cont:
1303                idx++;
1304        }
1305
1306done:
1307        read_unlock(&dev_base_lock);
1308
1309        cb->args[0] = idx;
1310        cb->args[1] = q_idx;
1311
1312        return skb->len;
1313}
1314
1315
1316
1317/************************************************
1318 *      Traffic classes manipulation.           *
1319 ************************************************/
1320
1321
1322
1323static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1324{
1325        struct net *net = sock_net(skb->sk);
1326        struct netdev_queue *dev_queue;
1327        struct tcmsg *tcm = NLMSG_DATA(n);
1328        struct nlattr *tca[TCA_MAX + 1];
1329        struct net_device *dev;
1330        struct Qdisc *q = NULL;
1331        const struct Qdisc_class_ops *cops;
1332        unsigned long cl = 0;
1333        unsigned long new_cl;
1334        u32 pid = tcm->tcm_parent;
1335        u32 clid = tcm->tcm_handle;
1336        u32 qid = TC_H_MAJ(clid);
1337        int err;
1338
1339        if (net != &init_net)
1340                return -EINVAL;
1341
1342        if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1343                return -ENODEV;
1344
1345        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1346        if (err < 0)
1347                return err;
1348
1349        /*
1350           parent == TC_H_UNSPEC - unspecified parent.
1351           parent == TC_H_ROOT   - class is root, which has no parent.
1352           parent == X:0         - parent is root class.
1353           parent == X:Y         - parent is a node in hierarchy.
1354           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1355
1356           handle == 0:0         - generate handle from kernel pool.
1357           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1358           handle == X:Y         - clear.
1359           handle == X:0         - root class.
1360         */
1361
1362        /* Step 1. Determine qdisc handle X:0 */
1363
1364        dev_queue = netdev_get_tx_queue(dev, 0);
1365        if (pid != TC_H_ROOT) {
1366                u32 qid1 = TC_H_MAJ(pid);
1367
1368                if (qid && qid1) {
1369                        /* If both majors are known, they must be identical. */
1370                        if (qid != qid1)
1371                                return -EINVAL;
1372                } else if (qid1) {
1373                        qid = qid1;
1374                } else if (qid == 0)
1375                        qid = dev_queue->qdisc_sleeping->handle;
1376
1377                /* Now qid is genuine qdisc handle consistent
1378                   both with parent and child.
1379
1380                   TC_H_MAJ(pid) still may be unspecified, complete it now.
1381                 */
1382                if (pid)
1383                        pid = TC_H_MAKE(qid, pid);
1384        } else {
1385                if (qid == 0)
1386                        qid = dev_queue->qdisc_sleeping->handle;
1387        }
1388
1389        /* OK. Locate qdisc */
1390        if ((q = qdisc_lookup(dev, qid)) == NULL)
1391                return -ENOENT;
1392
1393        /* An check that it supports classes */
1394        cops = q->ops->cl_ops;
1395        if (cops == NULL)
1396                return -EINVAL;
1397
1398        /* Now try to get class */
1399        if (clid == 0) {
1400                if (pid == TC_H_ROOT)
1401                        clid = qid;
1402        } else
1403                clid = TC_H_MAKE(qid, clid);
1404
1405        if (clid)
1406                cl = cops->get(q, clid);
1407
1408        if (cl == 0) {
1409                err = -ENOENT;
1410                if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
1411                        goto out;
1412        } else {
1413                switch (n->nlmsg_type) {
1414                case RTM_NEWTCLASS:
1415                        err = -EEXIST;
1416                        if (n->nlmsg_flags&NLM_F_EXCL)
1417                                goto out;
1418                        break;
1419                case RTM_DELTCLASS:
1420                        err = cops->delete(q, cl);
1421                        if (err == 0)
1422                                tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
1423                        goto out;
1424                case RTM_GETTCLASS:
1425                        err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
1426                        goto out;
1427                default:
1428                        err = -EINVAL;
1429                        goto out;
1430                }
1431        }
1432
1433        new_cl = cl;
1434        err = cops->change(q, clid, pid, tca, &new_cl);
1435        if (err == 0)
1436                tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
1437
1438out:
1439        if (cl)
1440                cops->put(q, cl);
1441
1442        return err;
1443}
1444
1445
1446static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1447                          unsigned long cl,
1448                          u32 pid, u32 seq, u16 flags, int event)
1449{
1450        struct tcmsg *tcm;
1451        struct nlmsghdr  *nlh;
1452        unsigned char *b = skb_tail_pointer(skb);
1453        struct gnet_dump d;
1454        const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1455
1456        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1457        tcm = NLMSG_DATA(nlh);
1458        tcm->tcm_family = AF_UNSPEC;
1459        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1460        tcm->tcm_parent = q->handle;
1461        tcm->tcm_handle = q->handle;
1462        tcm->tcm_info = 0;
1463        NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1464        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1465                goto nla_put_failure;
1466
1467        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1468                                         qdisc_root_sleeping_lock(q), &d) < 0)
1469                goto nla_put_failure;
1470
1471        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1472                goto nla_put_failure;
1473
1474        if (gnet_stats_finish_copy(&d) < 0)
1475                goto nla_put_failure;
1476
1477        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1478        return skb->len;
1479
1480nlmsg_failure:
1481nla_put_failure:
1482        nlmsg_trim(skb, b);
1483        return -1;
1484}
1485
1486static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1487                          struct Qdisc *q, unsigned long cl, int event)
1488{
1489        struct sk_buff *skb;
1490        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1491
1492        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1493        if (!skb)
1494                return -ENOBUFS;
1495
1496        if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1497                kfree_skb(skb);
1498                return -EINVAL;
1499        }
1500
1501        return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1502}
1503
1504struct qdisc_dump_args
1505{
1506        struct qdisc_walker w;
1507        struct sk_buff *skb;
1508        struct netlink_callback *cb;
1509};
1510
1511static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1512{
1513        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1514
1515        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1516                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1517}
1518
1519static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1520                                struct tcmsg *tcm, struct netlink_callback *cb,
1521                                int *t_p, int s_t)
1522{
1523        struct qdisc_dump_args arg;
1524
1525        if (tc_qdisc_dump_ignore(q) ||
1526            *t_p < s_t || !q->ops->cl_ops ||
1527            (tcm->tcm_parent &&
1528             TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1529                (*t_p)++;
1530                return 0;
1531        }
1532        if (*t_p > s_t)
1533                memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1534        arg.w.fn = qdisc_class_dump;
1535        arg.skb = skb;
1536        arg.cb = cb;
1537        arg.w.stop  = 0;
1538        arg.w.skip = cb->args[1];
1539        arg.w.count = 0;
1540        q->ops->cl_ops->walk(q, &arg.w);
1541        cb->args[1] = arg.w.count;
1542        if (arg.w.stop)
1543                return -1;
1544        (*t_p)++;
1545        return 0;
1546}
1547
1548static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1549                               struct tcmsg *tcm, struct netlink_callback *cb,
1550                               int *t_p, int s_t)
1551{
1552        struct Qdisc *q;
1553
1554        if (!root)
1555                return 0;
1556
1557        if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1558                return -1;
1559
1560        list_for_each_entry(q, &root->list, list) {
1561                if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1562                        return -1;
1563        }
1564
1565        return 0;
1566}
1567
1568static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1569{
1570        struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1571        struct net *net = sock_net(skb->sk);
1572        struct netdev_queue *dev_queue;
1573        struct net_device *dev;
1574        int t, s_t;
1575
1576        if (net != &init_net)
1577                return 0;
1578
1579        if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1580                return 0;
1581        if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1582                return 0;
1583
1584        s_t = cb->args[0];
1585        t = 0;
1586
1587        dev_queue = netdev_get_tx_queue(dev, 0);
1588        if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
1589                goto done;
1590
1591        dev_queue = &dev->rx_queue;
1592        if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
1593                goto done;
1594
1595done:
1596        cb->args[0] = t;
1597
1598        dev_put(dev);
1599        return skb->len;
1600}
1601
1602/* Main classifier routine: scans classifier chain attached
1603   to this qdisc, (optionally) tests for protocol and asks
1604   specific classifiers.
1605 */
1606int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1607                       struct tcf_result *res)
1608{
1609        __be16 protocol = skb->protocol;
1610        int err = 0;
1611
1612        for (; tp; tp = tp->next) {
1613                if ((tp->protocol == protocol ||
1614                     tp->protocol == htons(ETH_P_ALL)) &&
1615                    (err = tp->classify(skb, tp, res)) >= 0) {
1616#ifdef CONFIG_NET_CLS_ACT
1617                        if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1618                                skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1619#endif
1620                        return err;
1621                }
1622        }
1623        return -1;
1624}
1625EXPORT_SYMBOL(tc_classify_compat);
1626
/*
 * Classify @skb against the chain starting at @tp.
 *
 * Thin wrapper around tc_classify_compat().  With CONFIG_NET_CLS_ACT a
 * TC_ACT_RECLASSIFY verdict restarts classification from the head of
 * the chain; the number of restarts is tracked in skb->tc_verd and once
 * it reaches MAX_REC_LOOP the packet is reported as TC_ACT_SHOT.
 *
 * Returns the verdict of tc_classify_compat(), or TC_ACT_SHOT on a
 * reclassify loop.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			/*
			 * NOTE(review): this runs per packet and is not rate
			 * limited; consider guarding it with a ratelimit
			 * helper to avoid flooding the log.
			 */
			printk(KERN_WARNING
			       "rule prio %u protocol %02x reclassify loop, "
			       "packet dropped\n",
			       tp->prio&0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
1657
1658void tcf_destroy(struct tcf_proto *tp)
1659{
1660        tp->ops->destroy(tp);
1661        module_put(tp->ops->owner);
1662        kfree(tp);
1663}
1664
1665void tcf_destroy_chain(struct tcf_proto **fl)
1666{
1667        struct tcf_proto *tp;
1668
1669        while ((tp = *fl) != NULL) {
1670                *fl = tp->next;
1671                tcf_destroy(tp);
1672        }
1673}
1674EXPORT_SYMBOL(tcf_destroy_chain);
1675
1676#ifdef CONFIG_PROC_FS
1677static int psched_show(struct seq_file *seq, void *v)
1678{
1679        struct timespec ts;
1680
1681        hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1682        seq_printf(seq, "%08x %08x %08x %08x\n",
1683                   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
1684                   1000000,
1685                   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1686
1687        return 0;
1688}
1689
1690static int psched_open(struct inode *inode, struct file *file)
1691{
1692        return single_open(file, psched_show, PDE(inode)->data);
1693}
1694
1695static const struct file_operations psched_fops = {
1696        .owner = THIS_MODULE,
1697        .open = psched_open,
1698        .read  = seq_read,
1699        .llseek = seq_lseek,
1700        .release = single_release,
1701};
1702#endif
1703
1704static int __init pktsched_init(void)
1705{
1706        register_qdisc(&pfifo_qdisc_ops);
1707        register_qdisc(&bfifo_qdisc_ops);
1708        proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
1709
1710        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1711        rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1712        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1713        rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1714        rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1715        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1716
1717        return 0;
1718}
1719
1720subsys_initcall(pktsched_init);
1721