linux/net/sched/sch_api.c
<<
>>
Prefs
   1/*
   2 * net/sched/sch_api.c  Packet scheduler API.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Fixes:
  12 *
  13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  16 */
  17
  18#include <linux/module.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/string.h>
  22#include <linux/errno.h>
  23#include <linux/skbuff.h>
  24#include <linux/init.h>
  25#include <linux/proc_fs.h>
  26#include <linux/seq_file.h>
  27#include <linux/kmod.h>
  28#include <linux/list.h>
  29#include <linux/hrtimer.h>
  30#include <linux/lockdep.h>
  31
  32#include <net/net_namespace.h>
  33#include <net/sock.h>
  34#include <net/netlink.h>
  35#include <net/pkt_sched.h>
  36
  37static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
  38                        struct Qdisc *old, struct Qdisc *new);
  39static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
  40                         struct Qdisc *q, unsigned long cl, int event);
  41
  42/*
  43
  44   Short review.
  45   -------------
  46
  47   This file consists of two interrelated parts:
  48
  49   1. queueing disciplines manager frontend.
  50   2. traffic classes manager frontend.
  51
  52   Generally, queueing discipline ("qdisc") is a black box,
  53   which is able to enqueue packets and to dequeue them (when
  54   device is ready to send something) in order and at times
  55   determined by algorithm hidden in it.
  56
   57   qdiscs are divided into two categories:
  58   - "queues", which have no internal structure visible from outside.
  59   - "schedulers", which split all the packets to "traffic classes",
  60     using "packet classifiers" (look at cls_api.c)
  61
  62   In turn, classes may have child qdiscs (as rule, queues)
  63   attached to them etc. etc. etc.
  64
   65   The goal of the routines in this file is to translate
   66   information supplied by the user in the form of handles
   67   into a form more intelligible to the kernel, to perform some
   68   sanity checks and the part of the work that is common to all
   69   qdiscs, and to provide rtnetlink notifications.
  70
  71   All real intelligent work is done inside qdisc modules.
  72
  73
  74
  75   Every discipline has two major routines: enqueue and dequeue.
  76
  77   ---dequeue
  78
  79   dequeue usually returns a skb to send. It is allowed to return NULL,
  80   but it does not mean that queue is empty, it just means that
  81   discipline does not want to send anything this time.
  82   Queue is really empty if q->q.qlen == 0.
  83   For complicated disciplines with multiple queues q->q is not
  84   real packet queue, but however q->q.qlen must be valid.
  85
  86   ---enqueue
  87
   88   enqueue returns 0 if the packet was enqueued successfully.
   89   If a packet (this one or another one) was dropped, it returns
   90   a non-zero error code.
  91   NET_XMIT_DROP        - this packet dropped
  92     Expected action: do not backoff, but wait until queue will clear.
  93   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  94     Expected action: backoff or ignore
  95   NET_XMIT_POLICED     - dropped by police.
  96     Expected action: backoff or error to real-time apps.
  97
  98   Auxiliary routines:
  99
 100   ---requeue
 101
 102   requeues once dequeued packet. It is used for non-standard or
 103   just buggy devices, which can defer output even if netif_queue_stopped()=0.
 104
 105   ---reset
 106
 107   returns qdisc to initial state: purge all buffers, clear all
 108   timers, counters (except for statistics) etc.
 109
 110   ---init
 111
 112   initializes newly created qdisc.
 113
 114   ---destroy
 115
 116   destroys resources allocated by init and during lifetime of qdisc.
 117
 118   ---change
 119
 120   changes qdisc parameters.
 121 */
 122
 123/* Protects list of registered TC modules. It is pure SMP lock. */
 124static DEFINE_RWLOCK(qdisc_mod_lock);
 125
 126
 127/************************************************
 128 *      Queueing disciplines manipulation.      *
 129 ************************************************/
 130
 131
 132/* The list of all installed queueing disciplines. */
 133
 134static struct Qdisc_ops *qdisc_base;
 135
 136/* Register/uregister queueing discipline */
 137
 138int register_qdisc(struct Qdisc_ops *qops)
 139{
 140        struct Qdisc_ops *q, **qp;
 141        int rc = -EEXIST;
 142
 143        write_lock(&qdisc_mod_lock);
 144        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 145                if (!strcmp(qops->id, q->id))
 146                        goto out;
 147
 148        if (qops->enqueue == NULL)
 149                qops->enqueue = noop_qdisc_ops.enqueue;
 150        if (qops->requeue == NULL)
 151                qops->requeue = noop_qdisc_ops.requeue;
 152        if (qops->dequeue == NULL)
 153                qops->dequeue = noop_qdisc_ops.dequeue;
 154
 155        qops->next = NULL;
 156        *qp = qops;
 157        rc = 0;
 158out:
 159        write_unlock(&qdisc_mod_lock);
 160        return rc;
 161}
 162EXPORT_SYMBOL(register_qdisc);
 163
 164int unregister_qdisc(struct Qdisc_ops *qops)
 165{
 166        struct Qdisc_ops *q, **qp;
 167        int err = -ENOENT;
 168
 169        write_lock(&qdisc_mod_lock);
 170        for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
 171                if (q == qops)
 172                        break;
 173        if (q) {
 174                *qp = q->next;
 175                q->next = NULL;
 176                err = 0;
 177        }
 178        write_unlock(&qdisc_mod_lock);
 179        return err;
 180}
 181EXPORT_SYMBOL(unregister_qdisc);
 182
 183/* We know handle. Find qdisc among all qdisc's attached to device
 184   (root qdisc, all its children, children of children etc.)
 185 */
 186
 187struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 188{
 189        struct Qdisc *q;
 190
 191        if (!(root->flags & TCQ_F_BUILTIN) &&
 192            root->handle == handle)
 193                return root;
 194
 195        list_for_each_entry(q, &root->list, list) {
 196                if (q->handle == handle)
 197                        return q;
 198        }
 199        return NULL;
 200}
 201
 202/*
 203 * This lock is needed until some qdiscs stop calling qdisc_tree_decrease_qlen()
 204 * without rtnl_lock(); currently hfsc_dequeue(), netem_dequeue(), tbf_dequeue()
 205 */
 206static DEFINE_SPINLOCK(qdisc_list_lock);
 207
 208static void qdisc_list_add(struct Qdisc *q)
 209{
 210        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
 211                spin_lock_bh(&qdisc_list_lock);
 212                list_add_tail(&q->list, &qdisc_root_sleeping(q)->list);
 213                spin_unlock_bh(&qdisc_list_lock);
 214        }
 215}
 216
 217void qdisc_list_del(struct Qdisc *q)
 218{
 219        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
 220                spin_lock_bh(&qdisc_list_lock);
 221                list_del(&q->list);
 222                spin_unlock_bh(&qdisc_list_lock);
 223        }
 224}
 225EXPORT_SYMBOL(qdisc_list_del);
 226
 227struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 228{
 229        unsigned int i;
 230        struct Qdisc *q;
 231
 232        spin_lock_bh(&qdisc_list_lock);
 233
 234        for (i = 0; i < dev->num_tx_queues; i++) {
 235                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 236                struct Qdisc *txq_root = txq->qdisc_sleeping;
 237
 238                q = qdisc_match_from_root(txq_root, handle);
 239                if (q)
 240                        goto unlock;
 241        }
 242
 243        q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
 244
 245unlock:
 246        spin_unlock_bh(&qdisc_list_lock);
 247
 248        return q;
 249}
 250
 251static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 252{
 253        unsigned long cl;
 254        struct Qdisc *leaf;
 255        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 256
 257        if (cops == NULL)
 258                return NULL;
 259        cl = cops->get(p, classid);
 260
 261        if (cl == 0)
 262                return NULL;
 263        leaf = cops->leaf(p, cl);
 264        cops->put(p, cl);
 265        return leaf;
 266}
 267
 268/* Find queueing discipline by name */
 269
 270static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 271{
 272        struct Qdisc_ops *q = NULL;
 273
 274        if (kind) {
 275                read_lock(&qdisc_mod_lock);
 276                for (q = qdisc_base; q; q = q->next) {
 277                        if (nla_strcmp(kind, q->id) == 0) {
 278                                if (!try_module_get(q->owner))
 279                                        q = NULL;
 280                                break;
 281                        }
 282                }
 283                read_unlock(&qdisc_mod_lock);
 284        }
 285        return q;
 286}
 287
 288static struct qdisc_rate_table *qdisc_rtab_list;
 289
 290struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
 291{
 292        struct qdisc_rate_table *rtab;
 293
 294        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 295                if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
 296                        rtab->refcnt++;
 297                        return rtab;
 298                }
 299        }
 300
 301        if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
 302            nla_len(tab) != TC_RTAB_SIZE)
 303                return NULL;
 304
 305        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 306        if (rtab) {
 307                rtab->rate = *r;
 308                rtab->refcnt = 1;
 309                memcpy(rtab->data, nla_data(tab), 1024);
 310                rtab->next = qdisc_rtab_list;
 311                qdisc_rtab_list = rtab;
 312        }
 313        return rtab;
 314}
 315EXPORT_SYMBOL(qdisc_get_rtab);
 316
 317void qdisc_put_rtab(struct qdisc_rate_table *tab)
 318{
 319        struct qdisc_rate_table *rtab, **rtabp;
 320
 321        if (!tab || --tab->refcnt)
 322                return;
 323
 324        for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
 325                if (rtab == tab) {
 326                        *rtabp = rtab->next;
 327                        kfree(rtab);
 328                        return;
 329                }
 330        }
 331}
 332EXPORT_SYMBOL(qdisc_put_rtab);
 333
 334static LIST_HEAD(qdisc_stab_list);
 335static DEFINE_SPINLOCK(qdisc_stab_lock);
 336
 337static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
 338        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
 339        [TCA_STAB_DATA] = { .type = NLA_BINARY },
 340};
 341
 342static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 343{
 344        struct nlattr *tb[TCA_STAB_MAX + 1];
 345        struct qdisc_size_table *stab;
 346        struct tc_sizespec *s;
 347        unsigned int tsize = 0;
 348        u16 *tab = NULL;
 349        int err;
 350
 351        err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
 352        if (err < 0)
 353                return ERR_PTR(err);
 354        if (!tb[TCA_STAB_BASE])
 355                return ERR_PTR(-EINVAL);
 356
 357        s = nla_data(tb[TCA_STAB_BASE]);
 358
 359        if (s->tsize > 0) {
 360                if (!tb[TCA_STAB_DATA])
 361                        return ERR_PTR(-EINVAL);
 362                tab = nla_data(tb[TCA_STAB_DATA]);
 363                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 364        }
 365
 366        if (!s || tsize != s->tsize || (!tab && tsize > 0))
 367                return ERR_PTR(-EINVAL);
 368
 369        spin_lock(&qdisc_stab_lock);
 370
 371        list_for_each_entry(stab, &qdisc_stab_list, list) {
 372                if (memcmp(&stab->szopts, s, sizeof(*s)))
 373                        continue;
 374                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 375                        continue;
 376                stab->refcnt++;
 377                spin_unlock(&qdisc_stab_lock);
 378                return stab;
 379        }
 380
 381        spin_unlock(&qdisc_stab_lock);
 382
 383        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 384        if (!stab)
 385                return ERR_PTR(-ENOMEM);
 386
 387        stab->refcnt = 1;
 388        stab->szopts = *s;
 389        if (tsize > 0)
 390                memcpy(stab->data, tab, tsize * sizeof(u16));
 391
 392        spin_lock(&qdisc_stab_lock);
 393        list_add_tail(&stab->list, &qdisc_stab_list);
 394        spin_unlock(&qdisc_stab_lock);
 395
 396        return stab;
 397}
 398
 399void qdisc_put_stab(struct qdisc_size_table *tab)
 400{
 401        if (!tab)
 402                return;
 403
 404        spin_lock(&qdisc_stab_lock);
 405
 406        if (--tab->refcnt == 0) {
 407                list_del(&tab->list);
 408                kfree(tab);
 409        }
 410
 411        spin_unlock(&qdisc_stab_lock);
 412}
 413EXPORT_SYMBOL(qdisc_put_stab);
 414
 415static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 416{
 417        struct nlattr *nest;
 418
 419        nest = nla_nest_start(skb, TCA_STAB);
 420        if (nest == NULL)
 421                goto nla_put_failure;
 422        NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
 423        nla_nest_end(skb, nest);
 424
 425        return skb->len;
 426
 427nla_put_failure:
 428        return -1;
 429}
 430
 431void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
 432{
 433        int pkt_len, slot;
 434
 435        pkt_len = skb->len + stab->szopts.overhead;
 436        if (unlikely(!stab->szopts.tsize))
 437                goto out;
 438
 439        slot = pkt_len + stab->szopts.cell_align;
 440        if (unlikely(slot < 0))
 441                slot = 0;
 442
 443        slot >>= stab->szopts.cell_log;
 444        if (likely(slot < stab->szopts.tsize))
 445                pkt_len = stab->data[slot];
 446        else
 447                pkt_len = stab->data[stab->szopts.tsize - 1] *
 448                                (slot / stab->szopts.tsize) +
 449                                stab->data[slot % stab->szopts.tsize];
 450
 451        pkt_len <<= stab->szopts.size_log;
 452out:
 453        if (unlikely(pkt_len < 1))
 454                pkt_len = 1;
 455        qdisc_skb_cb(skb)->pkt_len = pkt_len;
 456}
 457EXPORT_SYMBOL(qdisc_calculate_pkt_len);
 458
 459static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 460{
 461        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 462                                                 timer);
 463
 464        wd->qdisc->flags &= ~TCQ_F_THROTTLED;
 465        smp_wmb();
 466        __netif_schedule(qdisc_root(wd->qdisc));
 467
 468        return HRTIMER_NORESTART;
 469}
 470
 471void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
 472{
 473        hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 474        wd->timer.function = qdisc_watchdog;
 475        wd->qdisc = qdisc;
 476}
 477EXPORT_SYMBOL(qdisc_watchdog_init);
 478
 479void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
 480{
 481        ktime_t time;
 482
 483        if (test_bit(__QDISC_STATE_DEACTIVATED,
 484                     &qdisc_root_sleeping(wd->qdisc)->state))
 485                return;
 486
 487        wd->qdisc->flags |= TCQ_F_THROTTLED;
 488        time = ktime_set(0, 0);
 489        time = ktime_add_ns(time, PSCHED_US2NS(expires));
 490        hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
 491}
 492EXPORT_SYMBOL(qdisc_watchdog_schedule);
 493
 494void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
 495{
 496        hrtimer_cancel(&wd->timer);
 497        wd->qdisc->flags &= ~TCQ_F_THROTTLED;
 498}
 499EXPORT_SYMBOL(qdisc_watchdog_cancel);
 500
 501static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 502{
 503        unsigned int size = n * sizeof(struct hlist_head), i;
 504        struct hlist_head *h;
 505
 506        if (size <= PAGE_SIZE)
 507                h = kmalloc(size, GFP_KERNEL);
 508        else
 509                h = (struct hlist_head *)
 510                        __get_free_pages(GFP_KERNEL, get_order(size));
 511
 512        if (h != NULL) {
 513                for (i = 0; i < n; i++)
 514                        INIT_HLIST_HEAD(&h[i]);
 515        }
 516        return h;
 517}
 518
 519static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
 520{
 521        unsigned int size = n * sizeof(struct hlist_head);
 522
 523        if (size <= PAGE_SIZE)
 524                kfree(h);
 525        else
 526                free_pages((unsigned long)h, get_order(size));
 527}
 528
 529void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
 530{
 531        struct Qdisc_class_common *cl;
 532        struct hlist_node *n, *next;
 533        struct hlist_head *nhash, *ohash;
 534        unsigned int nsize, nmask, osize;
 535        unsigned int i, h;
 536
 537        /* Rehash when load factor exceeds 0.75 */
 538        if (clhash->hashelems * 4 <= clhash->hashsize * 3)
 539                return;
 540        nsize = clhash->hashsize * 2;
 541        nmask = nsize - 1;
 542        nhash = qdisc_class_hash_alloc(nsize);
 543        if (nhash == NULL)
 544                return;
 545
 546        ohash = clhash->hash;
 547        osize = clhash->hashsize;
 548
 549        sch_tree_lock(sch);
 550        for (i = 0; i < osize; i++) {
 551                hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
 552                        h = qdisc_class_hash(cl->classid, nmask);
 553                        hlist_add_head(&cl->hnode, &nhash[h]);
 554                }
 555        }
 556        clhash->hash     = nhash;
 557        clhash->hashsize = nsize;
 558        clhash->hashmask = nmask;
 559        sch_tree_unlock(sch);
 560
 561        qdisc_class_hash_free(ohash, osize);
 562}
 563EXPORT_SYMBOL(qdisc_class_hash_grow);
 564
 565int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 566{
 567        unsigned int size = 4;
 568
 569        clhash->hash = qdisc_class_hash_alloc(size);
 570        if (clhash->hash == NULL)
 571                return -ENOMEM;
 572        clhash->hashsize  = size;
 573        clhash->hashmask  = size - 1;
 574        clhash->hashelems = 0;
 575        return 0;
 576}
 577EXPORT_SYMBOL(qdisc_class_hash_init);
 578
 579void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
 580{
 581        qdisc_class_hash_free(clhash->hash, clhash->hashsize);
 582}
 583EXPORT_SYMBOL(qdisc_class_hash_destroy);
 584
 585void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
 586                             struct Qdisc_class_common *cl)
 587{
 588        unsigned int h;
 589
 590        INIT_HLIST_NODE(&cl->hnode);
 591        h = qdisc_class_hash(cl->classid, clhash->hashmask);
 592        hlist_add_head(&cl->hnode, &clhash->hash[h]);
 593        clhash->hashelems++;
 594}
 595EXPORT_SYMBOL(qdisc_class_hash_insert);
 596
 597void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
 598                             struct Qdisc_class_common *cl)
 599{
 600        hlist_del(&cl->hnode);
 601        clhash->hashelems--;
 602}
 603EXPORT_SYMBOL(qdisc_class_hash_remove);
 604
 605/* Allocate an unique handle from space managed by kernel */
 606
 607static u32 qdisc_alloc_handle(struct net_device *dev)
 608{
 609        int i = 0x10000;
 610        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 611
 612        do {
 613                autohandle += TC_H_MAKE(0x10000U, 0);
 614                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
 615                        autohandle = TC_H_MAKE(0x80000000U, 0);
 616        } while (qdisc_lookup(dev, autohandle) && --i > 0);
 617
 618        return i>0 ? autohandle : 0;
 619}
 620
 621/* Attach toplevel qdisc to device queue. */
 622
 623static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
 624                                     struct Qdisc *qdisc)
 625{
 626        struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
 627        spinlock_t *root_lock;
 628
 629        root_lock = qdisc_lock(oqdisc);
 630        spin_lock_bh(root_lock);
 631
 632        /* Prune old scheduler */
 633        if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
 634                qdisc_reset(oqdisc);
 635
 636        /* ... and graft new one */
 637        if (qdisc == NULL)
 638                qdisc = &noop_qdisc;
 639        dev_queue->qdisc_sleeping = qdisc;
 640        rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
 641
 642        spin_unlock_bh(root_lock);
 643
 644        return oqdisc;
 645}
 646
 647void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
 648{
 649        const struct Qdisc_class_ops *cops;
 650        unsigned long cl;
 651        u32 parentid;
 652
 653        if (n == 0)
 654                return;
 655        while ((parentid = sch->parent)) {
 656                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
 657                        return;
 658
 659                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
 660                if (sch == NULL) {
 661                        WARN_ON(parentid != TC_H_ROOT);
 662                        return;
 663                }
 664                cops = sch->ops->cl_ops;
 665                if (cops->qlen_notify) {
 666                        cl = cops->get(sch, parentid);
 667                        cops->qlen_notify(sch, cl);
 668                        cops->put(sch, cl);
 669                }
 670                sch->q.qlen -= n;
 671        }
 672}
 673EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
 674
 675static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
 676                               struct Qdisc *old, struct Qdisc *new)
 677{
 678        if (new || old)
 679                qdisc_notify(skb, n, clid, old, new);
 680
 681        if (old)
 682                qdisc_destroy(old);
 683}
 684
 685/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 686 * to device "dev".
 687 *
 688 * When appropriate send a netlink notification using 'skb'
 689 * and "n".
 690 *
 691 * On success, destroy old qdisc.
 692 */
 693
 694static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 695                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
 696                       struct Qdisc *new, struct Qdisc *old)
 697{
 698        struct Qdisc *q = old;
 699        int err = 0;
 700
 701        if (parent == NULL) {
 702                unsigned int i, num_q, ingress;
 703
 704                ingress = 0;
 705                num_q = dev->num_tx_queues;
 706                if ((q && q->flags & TCQ_F_INGRESS) ||
 707                    (new && new->flags & TCQ_F_INGRESS)) {
 708                        num_q = 1;
 709                        ingress = 1;
 710                }
 711
 712                if (dev->flags & IFF_UP)
 713                        dev_deactivate(dev);
 714
 715                for (i = 0; i < num_q; i++) {
 716                        struct netdev_queue *dev_queue = &dev->rx_queue;
 717
 718                        if (!ingress)
 719                                dev_queue = netdev_get_tx_queue(dev, i);
 720
 721                        old = dev_graft_qdisc(dev_queue, new);
 722                        if (new && i > 0)
 723                                atomic_inc(&new->refcnt);
 724
 725                        notify_and_destroy(skb, n, classid, old, new);
 726                }
 727
 728                if (dev->flags & IFF_UP)
 729                        dev_activate(dev);
 730        } else {
 731                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
 732
 733                err = -EINVAL;
 734
 735                if (cops) {
 736                        unsigned long cl = cops->get(parent, classid);
 737                        if (cl) {
 738                                err = cops->graft(parent, cl, new, &old);
 739                                cops->put(parent, cl);
 740                        }
 741                }
 742                if (!err)
 743                        notify_and_destroy(skb, n, classid, old, new);
 744        }
 745        return err;
 746}
 747
 748/* lockdep annotation is needed for ingress; egress gets it only for name */
 749static struct lock_class_key qdisc_tx_lock;
 750static struct lock_class_key qdisc_rx_lock;
 751
 752/*
 753   Allocate and initialize new qdisc.
 754
 755   Parameters are passed via opt.
 756 */
 757
 758static struct Qdisc *
 759qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 760             u32 parent, u32 handle, struct nlattr **tca, int *errp)
 761{
 762        int err;
 763        struct nlattr *kind = tca[TCA_KIND];
 764        struct Qdisc *sch;
 765        struct Qdisc_ops *ops;
 766        struct qdisc_size_table *stab;
 767
 768        ops = qdisc_lookup_ops(kind);
 769#ifdef CONFIG_MODULES
 770        if (ops == NULL && kind != NULL) {
 771                char name[IFNAMSIZ];
 772                if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
 773                        /* We dropped the RTNL semaphore in order to
 774                         * perform the module load.  So, even if we
 775                         * succeeded in loading the module we have to
 776                         * tell the caller to replay the request.  We
 777                         * indicate this using -EAGAIN.
 778                         * We replay the request because the device may
 779                         * go away in the mean time.
 780                         */
 781                        rtnl_unlock();
 782                        request_module("sch_%s", name);
 783                        rtnl_lock();
 784                        ops = qdisc_lookup_ops(kind);
 785                        if (ops != NULL) {
 786                                /* We will try again qdisc_lookup_ops,
 787                                 * so don't keep a reference.
 788                                 */
 789                                module_put(ops->owner);
 790                                err = -EAGAIN;
 791                                goto err_out;
 792                        }
 793                }
 794        }
 795#endif
 796
 797        err = -ENOENT;
 798        if (ops == NULL)
 799                goto err_out;
 800
 801        sch = qdisc_alloc(dev_queue, ops);
 802        if (IS_ERR(sch)) {
 803                err = PTR_ERR(sch);
 804                goto err_out2;
 805        }
 806
 807        sch->parent = parent;
 808
 809        if (handle == TC_H_INGRESS) {
 810                sch->flags |= TCQ_F_INGRESS;
 811                handle = TC_H_MAKE(TC_H_INGRESS, 0);
 812                lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
 813        } else {
 814                if (handle == 0) {
 815                        handle = qdisc_alloc_handle(dev);
 816                        err = -ENOMEM;
 817                        if (handle == 0)
 818                                goto err_out3;
 819                }
 820                lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
 821        }
 822
 823        sch->handle = handle;
 824
 825        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
 826                if (tca[TCA_STAB]) {
 827                        stab = qdisc_get_stab(tca[TCA_STAB]);
 828                        if (IS_ERR(stab)) {
 829                                err = PTR_ERR(stab);
 830                                goto err_out3;
 831                        }
 832                        sch->stab = stab;
 833                }
 834                if (tca[TCA_RATE]) {
 835                        spinlock_t *root_lock;
 836
 837                        if ((sch->parent != TC_H_ROOT) &&
 838                            !(sch->flags & TCQ_F_INGRESS))
 839                                root_lock = qdisc_root_sleeping_lock(sch);
 840                        else
 841                                root_lock = qdisc_lock(sch);
 842
 843                        err = gen_new_estimator(&sch->bstats, &sch->rate_est,
 844                                                root_lock, tca[TCA_RATE]);
 845                        if (err) {
 846                                /*
 847                                 * Any broken qdiscs that would require
 848                                 * a ops->reset() here? The qdisc was never
 849                                 * in action so it shouldn't be necessary.
 850                                 */
 851                                if (ops->destroy)
 852                                        ops->destroy(sch);
 853                                goto err_out3;
 854                        }
 855                }
 856
 857                qdisc_list_add(sch);
 858
 859                return sch;
 860        }
 861err_out3:
 862        qdisc_put_stab(sch->stab);
 863        dev_put(dev);
 864        kfree((char *) sch - sch->padded);
 865err_out2:
 866        module_put(ops->owner);
 867err_out:
 868        *errp = err;
 869        return NULL;
 870}
 871
 872static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
 873{
 874        struct qdisc_size_table *stab = NULL;
 875        int err = 0;
 876
 877        if (tca[TCA_OPTIONS]) {
 878                if (sch->ops->change == NULL)
 879                        return -EINVAL;
 880                err = sch->ops->change(sch, tca[TCA_OPTIONS]);
 881                if (err)
 882                        return err;
 883        }
 884
 885        if (tca[TCA_STAB]) {
 886                stab = qdisc_get_stab(tca[TCA_STAB]);
 887                if (IS_ERR(stab))
 888                        return PTR_ERR(stab);
 889        }
 890
 891        qdisc_put_stab(sch->stab);
 892        sch->stab = stab;
 893
 894        if (tca[TCA_RATE])
 895                gen_replace_estimator(&sch->bstats, &sch->rate_est,
 896                                      qdisc_root_sleeping_lock(sch),
 897                                      tca[TCA_RATE]);
 898        return 0;
 899}
 900
 901struct check_loop_arg
 902{
 903        struct qdisc_walker     w;
 904        struct Qdisc            *p;
 905        int                     depth;
 906};
 907
 908static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
 909
 910static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
 911{
 912        struct check_loop_arg   arg;
 913
 914        if (q->ops->cl_ops == NULL)
 915                return 0;
 916
 917        arg.w.stop = arg.w.skip = arg.w.count = 0;
 918        arg.w.fn = check_loop_fn;
 919        arg.depth = depth;
 920        arg.p = p;
 921        q->ops->cl_ops->walk(q, &arg.w);
 922        return arg.w.stop ? -ELOOP : 0;
 923}
 924
 925static int
 926check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
 927{
 928        struct Qdisc *leaf;
 929        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
 930        struct check_loop_arg *arg = (struct check_loop_arg *)w;
 931
 932        leaf = cops->leaf(q, cl);
 933        if (leaf) {
 934                if (leaf == arg->p || arg->depth > 7)
 935                        return -ELOOP;
 936                return check_loop(leaf, arg->p, arg->depth + 1);
 937        }
 938        return 0;
 939}
 940
 941/*
 942 * Delete/get qdisc.
 943 */
 944
 945static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 946{
 947        struct net *net = sock_net(skb->sk);
 948        struct tcmsg *tcm = NLMSG_DATA(n);
 949        struct nlattr *tca[TCA_MAX + 1];
 950        struct net_device *dev;
 951        u32 clid = tcm->tcm_parent;
 952        struct Qdisc *q = NULL;
 953        struct Qdisc *p = NULL;
 954        int err;
 955
 956        if (net != &init_net)
 957                return -EINVAL;
 958
 959        if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
 960                return -ENODEV;
 961
 962        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
 963        if (err < 0)
 964                return err;
 965
 966        if (clid) {
 967                if (clid != TC_H_ROOT) {
 968                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
 969                                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
 970                                        return -ENOENT;
 971                                q = qdisc_leaf(p, clid);
 972                        } else { /* ingress */
 973                                q = dev->rx_queue.qdisc_sleeping;
 974                        }
 975                } else {
 976                        struct netdev_queue *dev_queue;
 977                        dev_queue = netdev_get_tx_queue(dev, 0);
 978                        q = dev_queue->qdisc_sleeping;
 979                }
 980                if (!q)
 981                        return -ENOENT;
 982
 983                if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
 984                        return -EINVAL;
 985        } else {
 986                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
 987                        return -ENOENT;
 988        }
 989
 990        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
 991                return -EINVAL;
 992
 993        if (n->nlmsg_type == RTM_DELQDISC) {
 994                if (!clid)
 995                        return -EINVAL;
 996                if (q->handle == 0)
 997                        return -ENOENT;
 998                if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
 999                        return err;
1000        } else {
1001                qdisc_notify(skb, n, clid, NULL, q);
1002        }
1003        return 0;
1004}
1005
1006/*
1007   Create/change qdisc.
1008 */
1009
1010static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1011{
1012        struct net *net = sock_net(skb->sk);
1013        struct tcmsg *tcm;
1014        struct nlattr *tca[TCA_MAX + 1];
1015        struct net_device *dev;
1016        u32 clid;
1017        struct Qdisc *q, *p;
1018        int err;
1019
1020        if (net != &init_net)
1021                return -EINVAL;
1022
1023replay:
1024        /* Reinit, just in case something touches this. */
1025        tcm = NLMSG_DATA(n);
1026        clid = tcm->tcm_parent;
1027        q = p = NULL;
1028
1029        if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1030                return -ENODEV;
1031
1032        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1033        if (err < 0)
1034                return err;
1035
1036        if (clid) {
1037                if (clid != TC_H_ROOT) {
1038                        if (clid != TC_H_INGRESS) {
1039                                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
1040                                        return -ENOENT;
1041                                q = qdisc_leaf(p, clid);
1042                        } else { /*ingress */
1043                                q = dev->rx_queue.qdisc_sleeping;
1044                        }
1045                } else {
1046                        struct netdev_queue *dev_queue;
1047                        dev_queue = netdev_get_tx_queue(dev, 0);
1048                        q = dev_queue->qdisc_sleeping;
1049                }
1050
1051                /* It may be default qdisc, ignore it */
1052                if (q && q->handle == 0)
1053                        q = NULL;
1054
1055                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1056                        if (tcm->tcm_handle) {
1057                                if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
1058                                        return -EEXIST;
1059                                if (TC_H_MIN(tcm->tcm_handle))
1060                                        return -EINVAL;
1061                                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
1062                                        goto create_n_graft;
1063                                if (n->nlmsg_flags&NLM_F_EXCL)
1064                                        return -EEXIST;
1065                                if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1066                                        return -EINVAL;
1067                                if (q == p ||
1068                                    (p && check_loop(q, p, 0)))
1069                                        return -ELOOP;
1070                                atomic_inc(&q->refcnt);
1071                                goto graft;
1072                        } else {
1073                                if (q == NULL)
1074                                        goto create_n_graft;
1075
1076                                /* This magic test requires explanation.
1077                                 *
1078                                 *   We know, that some child q is already
1079                                 *   attached to this parent and have choice:
1080                                 *   either to change it or to create/graft new one.
1081                                 *
1082                                 *   1. We are allowed to create/graft only
1083                                 *   if CREATE and REPLACE flags are set.
1084                                 *
1085                                 *   2. If EXCL is set, requestor wanted to say,
1086                                 *   that qdisc tcm_handle is not expected
1087                                 *   to exist, so that we choose create/graft too.
1088                                 *
1089                                 *   3. The last case is when no flags are set.
1090                                 *   Alas, it is sort of hole in API, we
1091                                 *   cannot decide what to do unambiguously.
1092                                 *   For now we select create/graft, if
1093                                 *   user gave KIND, which does not match existing.
1094                                 */
1095                                if ((n->nlmsg_flags&NLM_F_CREATE) &&
1096                                    (n->nlmsg_flags&NLM_F_REPLACE) &&
1097                                    ((n->nlmsg_flags&NLM_F_EXCL) ||
1098                                     (tca[TCA_KIND] &&
1099                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1100                                        goto create_n_graft;
1101                        }
1102                }
1103        } else {
1104                if (!tcm->tcm_handle)
1105                        return -EINVAL;
1106                q = qdisc_lookup(dev, tcm->tcm_handle);
1107        }
1108
1109        /* Change qdisc parameters */
1110        if (q == NULL)
1111                return -ENOENT;
1112        if (n->nlmsg_flags&NLM_F_EXCL)
1113                return -EEXIST;
1114        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1115                return -EINVAL;
1116        err = qdisc_change(q, tca);
1117        if (err == 0)
1118                qdisc_notify(skb, n, clid, NULL, q);
1119        return err;
1120
1121create_n_graft:
1122        if (!(n->nlmsg_flags&NLM_F_CREATE))
1123                return -ENOENT;
1124        if (clid == TC_H_INGRESS)
1125                q = qdisc_create(dev, &dev->rx_queue,
1126                                 tcm->tcm_parent, tcm->tcm_parent,
1127                                 tca, &err);
1128        else
1129                q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
1130                                 tcm->tcm_parent, tcm->tcm_handle,
1131                                 tca, &err);
1132        if (q == NULL) {
1133                if (err == -EAGAIN)
1134                        goto replay;
1135                return err;
1136        }
1137
1138graft:
1139        err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1140        if (err) {
1141                if (q)
1142                        qdisc_destroy(q);
1143                return err;
1144        }
1145
1146        return 0;
1147}
1148
1149static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1150                         u32 pid, u32 seq, u16 flags, int event)
1151{
1152        struct tcmsg *tcm;
1153        struct nlmsghdr  *nlh;
1154        unsigned char *b = skb_tail_pointer(skb);
1155        struct gnet_dump d;
1156
1157        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1158        tcm = NLMSG_DATA(nlh);
1159        tcm->tcm_family = AF_UNSPEC;
1160        tcm->tcm__pad1 = 0;
1161        tcm->tcm__pad2 = 0;
1162        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1163        tcm->tcm_parent = clid;
1164        tcm->tcm_handle = q->handle;
1165        tcm->tcm_info = atomic_read(&q->refcnt);
1166        NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1167        if (q->ops->dump && q->ops->dump(q, skb) < 0)
1168                goto nla_put_failure;
1169        q->qstats.qlen = q->q.qlen;
1170
1171        if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
1172                goto nla_put_failure;
1173
1174        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1175                                         qdisc_root_sleeping_lock(q), &d) < 0)
1176                goto nla_put_failure;
1177
1178        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1179                goto nla_put_failure;
1180
1181        if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1182            gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
1183            gnet_stats_copy_queue(&d, &q->qstats) < 0)
1184                goto nla_put_failure;
1185
1186        if (gnet_stats_finish_copy(&d) < 0)
1187                goto nla_put_failure;
1188
1189        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1190        return skb->len;
1191
1192nlmsg_failure:
1193nla_put_failure:
1194        nlmsg_trim(skb, b);
1195        return -1;
1196}
1197
1198static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1199                        u32 clid, struct Qdisc *old, struct Qdisc *new)
1200{
1201        struct sk_buff *skb;
1202        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1203
1204        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1205        if (!skb)
1206                return -ENOBUFS;
1207
1208        if (old && old->handle) {
1209                if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1210                        goto err_out;
1211        }
1212        if (new) {
1213                if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1214                        goto err_out;
1215        }
1216
1217        if (skb->len)
1218                return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1219
1220err_out:
1221        kfree_skb(skb);
1222        return -EINVAL;
1223}
1224
1225static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1226{
1227        return (q->flags & TCQ_F_BUILTIN) ? true : false;
1228}
1229
1230static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1231                              struct netlink_callback *cb,
1232                              int *q_idx_p, int s_q_idx)
1233{
1234        int ret = 0, q_idx = *q_idx_p;
1235        struct Qdisc *q;
1236
1237        if (!root)
1238                return 0;
1239
1240        q = root;
1241        if (q_idx < s_q_idx) {
1242                q_idx++;
1243        } else {
1244                if (!tc_qdisc_dump_ignore(q) &&
1245                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1246                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1247                        goto done;
1248                q_idx++;
1249        }
1250        list_for_each_entry(q, &root->list, list) {
1251                if (q_idx < s_q_idx) {
1252                        q_idx++;
1253                        continue;
1254                }
1255                if (!tc_qdisc_dump_ignore(q) && 
1256                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1257                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1258                        goto done;
1259                q_idx++;
1260        }
1261
1262out:
1263        *q_idx_p = q_idx;
1264        return ret;
1265done:
1266        ret = -1;
1267        goto out;
1268}
1269
1270static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1271{
1272        struct net *net = sock_net(skb->sk);
1273        int idx, q_idx;
1274        int s_idx, s_q_idx;
1275        struct net_device *dev;
1276
1277        if (net != &init_net)
1278                return 0;
1279
1280        s_idx = cb->args[0];
1281        s_q_idx = q_idx = cb->args[1];
1282        read_lock(&dev_base_lock);
1283        idx = 0;
1284        for_each_netdev(&init_net, dev) {
1285                struct netdev_queue *dev_queue;
1286
1287                if (idx < s_idx)
1288                        goto cont;
1289                if (idx > s_idx)
1290                        s_q_idx = 0;
1291                q_idx = 0;
1292
1293                dev_queue = netdev_get_tx_queue(dev, 0);
1294                if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
1295                        goto done;
1296
1297                dev_queue = &dev->rx_queue;
1298                if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
1299                        goto done;
1300
1301cont:
1302                idx++;
1303        }
1304
1305done:
1306        read_unlock(&dev_base_lock);
1307
1308        cb->args[0] = idx;
1309        cb->args[1] = q_idx;
1310
1311        return skb->len;
1312}
1313
1314
1315
1316/************************************************
1317 *      Traffic classes manipulation.           *
1318 ************************************************/
1319
1320
1321
1322static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1323{
1324        struct net *net = sock_net(skb->sk);
1325        struct netdev_queue *dev_queue;
1326        struct tcmsg *tcm = NLMSG_DATA(n);
1327        struct nlattr *tca[TCA_MAX + 1];
1328        struct net_device *dev;
1329        struct Qdisc *q = NULL;
1330        const struct Qdisc_class_ops *cops;
1331        unsigned long cl = 0;
1332        unsigned long new_cl;
1333        u32 pid = tcm->tcm_parent;
1334        u32 clid = tcm->tcm_handle;
1335        u32 qid = TC_H_MAJ(clid);
1336        int err;
1337
1338        if (net != &init_net)
1339                return -EINVAL;
1340
1341        if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1342                return -ENODEV;
1343
1344        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1345        if (err < 0)
1346                return err;
1347
1348        /*
1349           parent == TC_H_UNSPEC - unspecified parent.
1350           parent == TC_H_ROOT   - class is root, which has no parent.
1351           parent == X:0         - parent is root class.
1352           parent == X:Y         - parent is a node in hierarchy.
1353           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1354
1355           handle == 0:0         - generate handle from kernel pool.
1356           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1357           handle == X:Y         - clear.
1358           handle == X:0         - root class.
1359         */
1360
1361        /* Step 1. Determine qdisc handle X:0 */
1362
1363        dev_queue = netdev_get_tx_queue(dev, 0);
1364        if (pid != TC_H_ROOT) {
1365                u32 qid1 = TC_H_MAJ(pid);
1366
1367                if (qid && qid1) {
1368                        /* If both majors are known, they must be identical. */
1369                        if (qid != qid1)
1370                                return -EINVAL;
1371                } else if (qid1) {
1372                        qid = qid1;
1373                } else if (qid == 0)
1374                        qid = dev_queue->qdisc_sleeping->handle;
1375
1376                /* Now qid is genuine qdisc handle consistent
1377                   both with parent and child.
1378
1379                   TC_H_MAJ(pid) still may be unspecified, complete it now.
1380                 */
1381                if (pid)
1382                        pid = TC_H_MAKE(qid, pid);
1383        } else {
1384                if (qid == 0)
1385                        qid = dev_queue->qdisc_sleeping->handle;
1386        }
1387
1388        /* OK. Locate qdisc */
1389        if ((q = qdisc_lookup(dev, qid)) == NULL)
1390                return -ENOENT;
1391
1392        /* An check that it supports classes */
1393        cops = q->ops->cl_ops;
1394        if (cops == NULL)
1395                return -EINVAL;
1396
1397        /* Now try to get class */
1398        if (clid == 0) {
1399                if (pid == TC_H_ROOT)
1400                        clid = qid;
1401        } else
1402                clid = TC_H_MAKE(qid, clid);
1403
1404        if (clid)
1405                cl = cops->get(q, clid);
1406
1407        if (cl == 0) {
1408                err = -ENOENT;
1409                if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
1410                        goto out;
1411        } else {
1412                switch (n->nlmsg_type) {
1413                case RTM_NEWTCLASS:
1414                        err = -EEXIST;
1415                        if (n->nlmsg_flags&NLM_F_EXCL)
1416                                goto out;
1417                        break;
1418                case RTM_DELTCLASS:
1419                        err = cops->delete(q, cl);
1420                        if (err == 0)
1421                                tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
1422                        goto out;
1423                case RTM_GETTCLASS:
1424                        err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
1425                        goto out;
1426                default:
1427                        err = -EINVAL;
1428                        goto out;
1429                }
1430        }
1431
1432        new_cl = cl;
1433        err = cops->change(q, clid, pid, tca, &new_cl);
1434        if (err == 0)
1435                tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
1436
1437out:
1438        if (cl)
1439                cops->put(q, cl);
1440
1441        return err;
1442}
1443
1444
1445static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1446                          unsigned long cl,
1447                          u32 pid, u32 seq, u16 flags, int event)
1448{
1449        struct tcmsg *tcm;
1450        struct nlmsghdr  *nlh;
1451        unsigned char *b = skb_tail_pointer(skb);
1452        struct gnet_dump d;
1453        const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1454
1455        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1456        tcm = NLMSG_DATA(nlh);
1457        tcm->tcm_family = AF_UNSPEC;
1458        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1459        tcm->tcm_parent = q->handle;
1460        tcm->tcm_handle = q->handle;
1461        tcm->tcm_info = 0;
1462        NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1463        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1464                goto nla_put_failure;
1465
1466        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1467                                         qdisc_root_sleeping_lock(q), &d) < 0)
1468                goto nla_put_failure;
1469
1470        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1471                goto nla_put_failure;
1472
1473        if (gnet_stats_finish_copy(&d) < 0)
1474                goto nla_put_failure;
1475
1476        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1477        return skb->len;
1478
1479nlmsg_failure:
1480nla_put_failure:
1481        nlmsg_trim(skb, b);
1482        return -1;
1483}
1484
1485static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1486                          struct Qdisc *q, unsigned long cl, int event)
1487{
1488        struct sk_buff *skb;
1489        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1490
1491        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1492        if (!skb)
1493                return -ENOBUFS;
1494
1495        if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1496                kfree_skb(skb);
1497                return -EINVAL;
1498        }
1499
1500        return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1501}
1502
1503struct qdisc_dump_args
1504{
1505        struct qdisc_walker w;
1506        struct sk_buff *skb;
1507        struct netlink_callback *cb;
1508};
1509
1510static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1511{
1512        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1513
1514        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1515                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1516}
1517
1518static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1519                                struct tcmsg *tcm, struct netlink_callback *cb,
1520                                int *t_p, int s_t)
1521{
1522        struct qdisc_dump_args arg;
1523
1524        if (tc_qdisc_dump_ignore(q) ||
1525            *t_p < s_t || !q->ops->cl_ops ||
1526            (tcm->tcm_parent &&
1527             TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1528                (*t_p)++;
1529                return 0;
1530        }
1531        if (*t_p > s_t)
1532                memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1533        arg.w.fn = qdisc_class_dump;
1534        arg.skb = skb;
1535        arg.cb = cb;
1536        arg.w.stop  = 0;
1537        arg.w.skip = cb->args[1];
1538        arg.w.count = 0;
1539        q->ops->cl_ops->walk(q, &arg.w);
1540        cb->args[1] = arg.w.count;
1541        if (arg.w.stop)
1542                return -1;
1543        (*t_p)++;
1544        return 0;
1545}
1546
1547static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1548                               struct tcmsg *tcm, struct netlink_callback *cb,
1549                               int *t_p, int s_t)
1550{
1551        struct Qdisc *q;
1552
1553        if (!root)
1554                return 0;
1555
1556        if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1557                return -1;
1558
1559        list_for_each_entry(q, &root->list, list) {
1560                if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1561                        return -1;
1562        }
1563
1564        return 0;
1565}
1566
1567static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1568{
1569        struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1570        struct net *net = sock_net(skb->sk);
1571        struct netdev_queue *dev_queue;
1572        struct net_device *dev;
1573        int t, s_t;
1574
1575        if (net != &init_net)
1576                return 0;
1577
1578        if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1579                return 0;
1580        if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1581                return 0;
1582
1583        s_t = cb->args[0];
1584        t = 0;
1585
1586        dev_queue = netdev_get_tx_queue(dev, 0);
1587        if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
1588                goto done;
1589
1590        dev_queue = &dev->rx_queue;
1591        if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
1592                goto done;
1593
1594done:
1595        cb->args[0] = t;
1596
1597        dev_put(dev);
1598        return skb->len;
1599}
1600
1601/* Main classifier routine: scans classifier chain attached
1602   to this qdisc, (optionally) tests for protocol and asks
1603   specific classifiers.
1604 */
1605int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1606                       struct tcf_result *res)
1607{
1608        __be16 protocol = skb->protocol;
1609        int err = 0;
1610
1611        for (; tp; tp = tp->next) {
1612                if ((tp->protocol == protocol ||
1613                     tp->protocol == htons(ETH_P_ALL)) &&
1614                    (err = tp->classify(skb, tp, res)) >= 0) {
1615#ifdef CONFIG_NET_CLS_ACT
1616                        if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1617                                skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1618#endif
1619                        return err;
1620                }
1621        }
1622        return -1;
1623}
1624EXPORT_SYMBOL(tc_classify_compat);
1625
/*
 * Classify @skb against the chain starting at @tp.
 *
 * Wraps tc_classify_compat().  With CONFIG_NET_CLS_ACT, a result of
 * TC_ACT_RECLASSIFY restarts classification from the head of the chain;
 * the number of restarts is tracked in skb->tc_verd and, once it reaches
 * MAX_REC_LOOP, the packet is reported and TC_ACT_SHOT is returned.
 *
 * Returns the classification result from tc_classify_compat(), or
 * TC_ACT_SHOT on a reclassify loop.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
                struct tcf_result *res)
{
        int err = 0;
#ifdef CONFIG_NET_CLS_ACT
        struct tcf_proto *otp = tp;
reclassify:
#endif
        err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
        if (err == TC_ACT_RECLASSIFY) {
                u32 verd = G_TC_VERD(skb->tc_verd);
                tp = otp;

                /*
                 * NOTE(review): this message has no log level and is not
                 * rate limited although it is triggered per packet.
                 */
                if (verd++ >= MAX_REC_LOOP) {
                        printk("rule prio %u protocol %02x reclassify loop, "
                               "packet dropped\n",
                               tp->prio&0xffff, ntohs(tp->protocol));
                        return TC_ACT_SHOT;
                }
                skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
                goto reclassify;
        }
#endif
        return err;
}
EXPORT_SYMBOL(tc_classify);
1656
1657void tcf_destroy(struct tcf_proto *tp)
1658{
1659        tp->ops->destroy(tp);
1660        module_put(tp->ops->owner);
1661        kfree(tp);
1662}
1663
1664void tcf_destroy_chain(struct tcf_proto **fl)
1665{
1666        struct tcf_proto *tp;
1667
1668        while ((tp = *fl) != NULL) {
1669                *fl = tp->next;
1670                tcf_destroy(tp);
1671        }
1672}
1673EXPORT_SYMBOL(tcf_destroy_chain);
1674
1675#ifdef CONFIG_PROC_FS
1676static int psched_show(struct seq_file *seq, void *v)
1677{
1678        struct timespec ts;
1679
1680        hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1681        seq_printf(seq, "%08x %08x %08x %08x\n",
1682                   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
1683                   1000000,
1684                   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1685
1686        return 0;
1687}
1688
1689static int psched_open(struct inode *inode, struct file *file)
1690{
1691        return single_open(file, psched_show, PDE(inode)->data);
1692}
1693
1694static const struct file_operations psched_fops = {
1695        .owner = THIS_MODULE,
1696        .open = psched_open,
1697        .read  = seq_read,
1698        .llseek = seq_lseek,
1699        .release = single_release,
1700};
1701#endif
1702
1703static int __init pktsched_init(void)
1704{
1705        register_qdisc(&pfifo_qdisc_ops);
1706        register_qdisc(&bfifo_qdisc_ops);
1707        proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
1708
1709        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1710        rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1711        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1712        rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1713        rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1714        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1715
1716        return 0;
1717}
1718
1719subsys_initcall(pktsched_init);
1720
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.