linux/net/sched/sch_api.c
<<
>>
Prefs
   1/*
   2 * net/sched/sch_api.c  Packet scheduler API.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Fixes:
  12 *
  13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  16 */
  17
  18#include <linux/module.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/string.h>
  22#include <linux/errno.h>
  23#include <linux/skbuff.h>
  24#include <linux/init.h>
  25#include <linux/proc_fs.h>
  26#include <linux/seq_file.h>
  27#include <linux/kmod.h>
  28#include <linux/list.h>
  29#include <linux/hrtimer.h>
  30#include <linux/lockdep.h>
  31#include <linux/slab.h>
  32
  33#include <net/net_namespace.h>
  34#include <net/sock.h>
  35#include <net/netlink.h>
  36#include <net/pkt_sched.h>
  37
  38static int qdisc_notify(struct net *net, struct sk_buff *oskb,
  39                        struct nlmsghdr *n, u32 clid,
  40                        struct Qdisc *old, struct Qdisc *new);
  41static int tclass_notify(struct net *net, struct sk_buff *oskb,
  42                         struct nlmsghdr *n, struct Qdisc *q,
  43                         unsigned long cl, int event);
  44
  45/*
  46
  47   Short review.
  48   -------------
  49
  50   This file consists of two interrelated parts:
  51
  52   1. queueing disciplines manager frontend.
  53   2. traffic classes manager frontend.
  54
  55   Generally, queueing discipline ("qdisc") is a black box,
  56   which is able to enqueue packets and to dequeue them (when
  57   device is ready to send something) in order and at times
  58   determined by algorithm hidden in it.
  59
  60   qdiscs are divided into two categories:
  61   - "queues", which have no internal structure visible from outside.
  62   - "schedulers", which split all the packets to "traffic classes",
  63     using "packet classifiers" (look at cls_api.c)
  64
  65   In turn, classes may have child qdiscs (as rule, queues)
  66   attached to them etc. etc. etc.
  67
  68   The goal of the routines in this file is to translate the
  69   information supplied by the user in the form of handles into
  70   a form more intelligible to the kernel, to perform some sanity
  71   checks and the part of the work that is common to all qdiscs,
  72   and to provide rtnetlink notifications.
  73
  74   All real intelligent work is done inside qdisc modules.
  75
  76
  77
  78   Every discipline has two major routines: enqueue and dequeue.
  79
  80   ---dequeue
  81
  82   dequeue usually returns a skb to send. It is allowed to return NULL,
  83   but it does not mean that queue is empty, it just means that
  84   discipline does not want to send anything this time.
  85   Queue is really empty if q->q.qlen == 0.
  86   For complicated disciplines with multiple queues q->q is not
  87   a real packet queue; nevertheless q->q.qlen must be valid.
  88
  89   ---enqueue
  90
  91   enqueue returns 0 if the packet was enqueued successfully.
  92   If a packet (this one or another one) was dropped, it returns
  93   a non-zero error code.
  94   NET_XMIT_DROP        - this packet dropped
  95     Expected action: do not backoff, but wait until queue will clear.
  96   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  97     Expected action: backoff or ignore
  98   NET_XMIT_POLICED     - dropped by police.
  99     Expected action: backoff or error to real-time apps.
 100
 101   Auxiliary routines:
 102
 103   ---peek
 104
 105   like dequeue but without removing a packet from the queue
 106
 107   ---reset
 108
 109   returns qdisc to initial state: purge all buffers, clear all
 110   timers, counters (except for statistics) etc.
 111
 112   ---init
 113
 114   initializes newly created qdisc.
 115
 116   ---destroy
 117
 118   destroys resources allocated by init and during lifetime of qdisc.
 119
 120   ---change
 121
 122   changes qdisc parameters.
 123 */
 124
 125/* Protects list of registered TC modules. It is pure SMP lock. */
 126static DEFINE_RWLOCK(qdisc_mod_lock);
 127
 128
 129/************************************************
 130 *      Queueing disciplines manipulation.      *
 131 ************************************************/
 132
 133
 134/* The list of all installed queueing disciplines. */
 135
 136static struct Qdisc_ops *qdisc_base;
 137
 138/* Register/uregister queueing discipline */
 139
 140int register_qdisc(struct Qdisc_ops *qops)
 141{
 142        struct Qdisc_ops *q, **qp;
 143        int rc = -EEXIST;
 144
 145        write_lock(&qdisc_mod_lock);
 146        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 147                if (!strcmp(qops->id, q->id))
 148                        goto out;
 149
 150        if (qops->enqueue == NULL)
 151                qops->enqueue = noop_qdisc_ops.enqueue;
 152        if (qops->peek == NULL) {
 153                if (qops->dequeue == NULL)
 154                        qops->peek = noop_qdisc_ops.peek;
 155                else
 156                        goto out_einval;
 157        }
 158        if (qops->dequeue == NULL)
 159                qops->dequeue = noop_qdisc_ops.dequeue;
 160
 161        if (qops->cl_ops) {
 162                const struct Qdisc_class_ops *cops = qops->cl_ops;
 163
 164                if (!(cops->get && cops->put && cops->walk && cops->leaf))
 165                        goto out_einval;
 166
 167                if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
 168                        goto out_einval;
 169        }
 170
 171        qops->next = NULL;
 172        *qp = qops;
 173        rc = 0;
 174out:
 175        write_unlock(&qdisc_mod_lock);
 176        return rc;
 177
 178out_einval:
 179        rc = -EINVAL;
 180        goto out;
 181}
 182EXPORT_SYMBOL(register_qdisc);
 183
 184int unregister_qdisc(struct Qdisc_ops *qops)
 185{
 186        struct Qdisc_ops *q, **qp;
 187        int err = -ENOENT;
 188
 189        write_lock(&qdisc_mod_lock);
 190        for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
 191                if (q == qops)
 192                        break;
 193        if (q) {
 194                *qp = q->next;
 195                q->next = NULL;
 196                err = 0;
 197        }
 198        write_unlock(&qdisc_mod_lock);
 199        return err;
 200}
 201EXPORT_SYMBOL(unregister_qdisc);
 202
 203/* We know handle. Find qdisc among all qdisc's attached to device
 204   (root qdisc, all its children, children of children etc.)
 205 */
 206
 207static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 208{
 209        struct Qdisc *q;
 210
 211        if (!(root->flags & TCQ_F_BUILTIN) &&
 212            root->handle == handle)
 213                return root;
 214
 215        list_for_each_entry(q, &root->list, list) {
 216                if (q->handle == handle)
 217                        return q;
 218        }
 219        return NULL;
 220}
 221
 222static void qdisc_list_add(struct Qdisc *q)
 223{
 224        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 225                list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
 226}
 227
 228void qdisc_list_del(struct Qdisc *q)
 229{
 230        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 231                list_del(&q->list);
 232}
 233EXPORT_SYMBOL(qdisc_list_del);
 234
 235struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 236{
 237        struct Qdisc *q;
 238
 239        q = qdisc_match_from_root(dev->qdisc, handle);
 240        if (q)
 241                goto out;
 242
 243        q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
 244out:
 245        return q;
 246}
 247
 248static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 249{
 250        unsigned long cl;
 251        struct Qdisc *leaf;
 252        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 253
 254        if (cops == NULL)
 255                return NULL;
 256        cl = cops->get(p, classid);
 257
 258        if (cl == 0)
 259                return NULL;
 260        leaf = cops->leaf(p, cl);
 261        cops->put(p, cl);
 262        return leaf;
 263}
 264
 265/* Find queueing discipline by name */
 266
 267static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 268{
 269        struct Qdisc_ops *q = NULL;
 270
 271        if (kind) {
 272                read_lock(&qdisc_mod_lock);
 273                for (q = qdisc_base; q; q = q->next) {
 274                        if (nla_strcmp(kind, q->id) == 0) {
 275                                if (!try_module_get(q->owner))
 276                                        q = NULL;
 277                                break;
 278                        }
 279                }
 280                read_unlock(&qdisc_mod_lock);
 281        }
 282        return q;
 283}
 284
 285static struct qdisc_rate_table *qdisc_rtab_list;
 286
 287struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
 288{
 289        struct qdisc_rate_table *rtab;
 290
 291        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 292                if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
 293                        rtab->refcnt++;
 294                        return rtab;
 295                }
 296        }
 297
 298        if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
 299            nla_len(tab) != TC_RTAB_SIZE)
 300                return NULL;
 301
 302        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 303        if (rtab) {
 304                rtab->rate = *r;
 305                rtab->refcnt = 1;
 306                memcpy(rtab->data, nla_data(tab), 1024);
 307                rtab->next = qdisc_rtab_list;
 308                qdisc_rtab_list = rtab;
 309        }
 310        return rtab;
 311}
 312EXPORT_SYMBOL(qdisc_get_rtab);
 313
 314void qdisc_put_rtab(struct qdisc_rate_table *tab)
 315{
 316        struct qdisc_rate_table *rtab, **rtabp;
 317
 318        if (!tab || --tab->refcnt)
 319                return;
 320
 321        for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
 322                if (rtab == tab) {
 323                        *rtabp = rtab->next;
 324                        kfree(rtab);
 325                        return;
 326                }
 327        }
 328}
 329EXPORT_SYMBOL(qdisc_put_rtab);
 330
 331static LIST_HEAD(qdisc_stab_list);
 332static DEFINE_SPINLOCK(qdisc_stab_lock);
 333
 334static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
 335        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
 336        [TCA_STAB_DATA] = { .type = NLA_BINARY },
 337};
 338
 339static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 340{
 341        struct nlattr *tb[TCA_STAB_MAX + 1];
 342        struct qdisc_size_table *stab;
 343        struct tc_sizespec *s;
 344        unsigned int tsize = 0;
 345        u16 *tab = NULL;
 346        int err;
 347
 348        err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
 349        if (err < 0)
 350                return ERR_PTR(err);
 351        if (!tb[TCA_STAB_BASE])
 352                return ERR_PTR(-EINVAL);
 353
 354        s = nla_data(tb[TCA_STAB_BASE]);
 355
 356        if (s->tsize > 0) {
 357                if (!tb[TCA_STAB_DATA])
 358                        return ERR_PTR(-EINVAL);
 359                tab = nla_data(tb[TCA_STAB_DATA]);
 360                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 361        }
 362
 363        if (!s || tsize != s->tsize || (!tab && tsize > 0))
 364                return ERR_PTR(-EINVAL);
 365
 366        spin_lock(&qdisc_stab_lock);
 367
 368        list_for_each_entry(stab, &qdisc_stab_list, list) {
 369                if (memcmp(&stab->szopts, s, sizeof(*s)))
 370                        continue;
 371                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 372                        continue;
 373                stab->refcnt++;
 374                spin_unlock(&qdisc_stab_lock);
 375                return stab;
 376        }
 377
 378        spin_unlock(&qdisc_stab_lock);
 379
 380        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 381        if (!stab)
 382                return ERR_PTR(-ENOMEM);
 383
 384        stab->refcnt = 1;
 385        stab->szopts = *s;
 386        if (tsize > 0)
 387                memcpy(stab->data, tab, tsize * sizeof(u16));
 388
 389        spin_lock(&qdisc_stab_lock);
 390        list_add_tail(&stab->list, &qdisc_stab_list);
 391        spin_unlock(&qdisc_stab_lock);
 392
 393        return stab;
 394}
 395
 396void qdisc_put_stab(struct qdisc_size_table *tab)
 397{
 398        if (!tab)
 399                return;
 400
 401        spin_lock(&qdisc_stab_lock);
 402
 403        if (--tab->refcnt == 0) {
 404                list_del(&tab->list);
 405                kfree(tab);
 406        }
 407
 408        spin_unlock(&qdisc_stab_lock);
 409}
 410EXPORT_SYMBOL(qdisc_put_stab);
 411
 412static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 413{
 414        struct nlattr *nest;
 415
 416        nest = nla_nest_start(skb, TCA_STAB);
 417        if (nest == NULL)
 418                goto nla_put_failure;
 419        NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
 420        nla_nest_end(skb, nest);
 421
 422        return skb->len;
 423
 424nla_put_failure:
 425        return -1;
 426}
 427
 428void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
 429{
 430        int pkt_len, slot;
 431
 432        pkt_len = skb->len + stab->szopts.overhead;
 433        if (unlikely(!stab->szopts.tsize))
 434                goto out;
 435
 436        slot = pkt_len + stab->szopts.cell_align;
 437        if (unlikely(slot < 0))
 438                slot = 0;
 439
 440        slot >>= stab->szopts.cell_log;
 441        if (likely(slot < stab->szopts.tsize))
 442                pkt_len = stab->data[slot];
 443        else
 444                pkt_len = stab->data[stab->szopts.tsize - 1] *
 445                                (slot / stab->szopts.tsize) +
 446                                stab->data[slot % stab->szopts.tsize];
 447
 448        pkt_len <<= stab->szopts.size_log;
 449out:
 450        if (unlikely(pkt_len < 1))
 451                pkt_len = 1;
 452        qdisc_skb_cb(skb)->pkt_len = pkt_len;
 453}
 454EXPORT_SYMBOL(qdisc_calculate_pkt_len);
 455
 456void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
 457{
 458        if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
 459                printk(KERN_WARNING
 460                       "%s: %s qdisc %X: is non-work-conserving?\n",
 461                       txt, qdisc->ops->id, qdisc->handle >> 16);
 462                qdisc->flags |= TCQ_F_WARN_NONWC;
 463        }
 464}
 465EXPORT_SYMBOL(qdisc_warn_nonwc);
 466
 467static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 468{
 469        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 470                                                 timer);
 471
 472        wd->qdisc->flags &= ~TCQ_F_THROTTLED;
 473        __netif_schedule(qdisc_root(wd->qdisc));
 474
 475        return HRTIMER_NORESTART;
 476}
 477
 478void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
 479{
 480        hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 481        wd->timer.function = qdisc_watchdog;
 482        wd->qdisc = qdisc;
 483}
 484EXPORT_SYMBOL(qdisc_watchdog_init);
 485
 486void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
 487{
 488        ktime_t time;
 489
 490        if (test_bit(__QDISC_STATE_DEACTIVATED,
 491                     &qdisc_root_sleeping(wd->qdisc)->state))
 492                return;
 493
 494        wd->qdisc->flags |= TCQ_F_THROTTLED;
 495        time = ktime_set(0, 0);
 496        time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
 497        hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
 498}
 499EXPORT_SYMBOL(qdisc_watchdog_schedule);
 500
 501void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
 502{
 503        hrtimer_cancel(&wd->timer);
 504        wd->qdisc->flags &= ~TCQ_F_THROTTLED;
 505}
 506EXPORT_SYMBOL(qdisc_watchdog_cancel);
 507
 508static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 509{
 510        unsigned int size = n * sizeof(struct hlist_head), i;
 511        struct hlist_head *h;
 512
 513        if (size <= PAGE_SIZE)
 514                h = kmalloc(size, GFP_KERNEL);
 515        else
 516                h = (struct hlist_head *)
 517                        __get_free_pages(GFP_KERNEL, get_order(size));
 518
 519        if (h != NULL) {
 520                for (i = 0; i < n; i++)
 521                        INIT_HLIST_HEAD(&h[i]);
 522        }
 523        return h;
 524}
 525
 526static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
 527{
 528        unsigned int size = n * sizeof(struct hlist_head);
 529
 530        if (size <= PAGE_SIZE)
 531                kfree(h);
 532        else
 533                free_pages((unsigned long)h, get_order(size));
 534}
 535
 536void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
 537{
 538        struct Qdisc_class_common *cl;
 539        struct hlist_node *n, *next;
 540        struct hlist_head *nhash, *ohash;
 541        unsigned int nsize, nmask, osize;
 542        unsigned int i, h;
 543
 544        /* Rehash when load factor exceeds 0.75 */
 545        if (clhash->hashelems * 4 <= clhash->hashsize * 3)
 546                return;
 547        nsize = clhash->hashsize * 2;
 548        nmask = nsize - 1;
 549        nhash = qdisc_class_hash_alloc(nsize);
 550        if (nhash == NULL)
 551                return;
 552
 553        ohash = clhash->hash;
 554        osize = clhash->hashsize;
 555
 556        sch_tree_lock(sch);
 557        for (i = 0; i < osize; i++) {
 558                hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
 559                        h = qdisc_class_hash(cl->classid, nmask);
 560                        hlist_add_head(&cl->hnode, &nhash[h]);
 561                }
 562        }
 563        clhash->hash     = nhash;
 564        clhash->hashsize = nsize;
 565        clhash->hashmask = nmask;
 566        sch_tree_unlock(sch);
 567
 568        qdisc_class_hash_free(ohash, osize);
 569}
 570EXPORT_SYMBOL(qdisc_class_hash_grow);
 571
 572int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 573{
 574        unsigned int size = 4;
 575
 576        clhash->hash = qdisc_class_hash_alloc(size);
 577        if (clhash->hash == NULL)
 578                return -ENOMEM;
 579        clhash->hashsize  = size;
 580        clhash->hashmask  = size - 1;
 581        clhash->hashelems = 0;
 582        return 0;
 583}
 584EXPORT_SYMBOL(qdisc_class_hash_init);
 585
 586void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
 587{
 588        qdisc_class_hash_free(clhash->hash, clhash->hashsize);
 589}
 590EXPORT_SYMBOL(qdisc_class_hash_destroy);
 591
 592void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
 593                             struct Qdisc_class_common *cl)
 594{
 595        unsigned int h;
 596
 597        INIT_HLIST_NODE(&cl->hnode);
 598        h = qdisc_class_hash(cl->classid, clhash->hashmask);
 599        hlist_add_head(&cl->hnode, &clhash->hash[h]);
 600        clhash->hashelems++;
 601}
 602EXPORT_SYMBOL(qdisc_class_hash_insert);
 603
 604void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
 605                             struct Qdisc_class_common *cl)
 606{
 607        hlist_del(&cl->hnode);
 608        clhash->hashelems--;
 609}
 610EXPORT_SYMBOL(qdisc_class_hash_remove);
 611
 612/* Allocate an unique handle from space managed by kernel */
 613
 614static u32 qdisc_alloc_handle(struct net_device *dev)
 615{
 616        int i = 0x10000;
 617        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 618
 619        do {
 620                autohandle += TC_H_MAKE(0x10000U, 0);
 621                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
 622                        autohandle = TC_H_MAKE(0x80000000U, 0);
 623        } while (qdisc_lookup(dev, autohandle) && --i > 0);
 624
 625        return i>0 ? autohandle : 0;
 626}
 627
 628void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
 629{
 630        const struct Qdisc_class_ops *cops;
 631        unsigned long cl;
 632        u32 parentid;
 633
 634        if (n == 0)
 635                return;
 636        while ((parentid = sch->parent)) {
 637                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
 638                        return;
 639
 640                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
 641                if (sch == NULL) {
 642                        WARN_ON(parentid != TC_H_ROOT);
 643                        return;
 644                }
 645                cops = sch->ops->cl_ops;
 646                if (cops->qlen_notify) {
 647                        cl = cops->get(sch, parentid);
 648                        cops->qlen_notify(sch, cl);
 649                        cops->put(sch, cl);
 650                }
 651                sch->q.qlen -= n;
 652        }
 653}
 654EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
 655
 656static void notify_and_destroy(struct net *net, struct sk_buff *skb,
 657                               struct nlmsghdr *n, u32 clid,
 658                               struct Qdisc *old, struct Qdisc *new)
 659{
 660        if (new || old)
 661                qdisc_notify(net, skb, n, clid, old, new);
 662
 663        if (old)
 664                qdisc_destroy(old);
 665}
 666
 667/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 668 * to device "dev".
 669 *
 670 * When appropriate send a netlink notification using 'skb'
 671 * and "n".
 672 *
 673 * On success, destroy old qdisc.
 674 */
 675
 676static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 677                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
 678                       struct Qdisc *new, struct Qdisc *old)
 679{
 680        struct Qdisc *q = old;
 681        struct net *net = dev_net(dev);
 682        int err = 0;
 683
 684        if (parent == NULL) {
 685                unsigned int i, num_q, ingress;
 686
 687                ingress = 0;
 688                num_q = dev->num_tx_queues;
 689                if ((q && q->flags & TCQ_F_INGRESS) ||
 690                    (new && new->flags & TCQ_F_INGRESS)) {
 691                        num_q = 1;
 692                        ingress = 1;
 693                }
 694
 695                if (dev->flags & IFF_UP)
 696                        dev_deactivate(dev);
 697
 698                if (new && new->ops->attach) {
 699                        new->ops->attach(new);
 700                        num_q = 0;
 701                }
 702
 703                for (i = 0; i < num_q; i++) {
 704                        struct netdev_queue *dev_queue = &dev->rx_queue;
 705
 706                        if (!ingress)
 707                                dev_queue = netdev_get_tx_queue(dev, i);
 708
 709                        old = dev_graft_qdisc(dev_queue, new);
 710                        if (new && i > 0)
 711                                atomic_inc(&new->refcnt);
 712
 713                        if (!ingress)
 714                                qdisc_destroy(old);
 715                }
 716
 717                if (!ingress) {
 718                        notify_and_destroy(net, skb, n, classid,
 719                                           dev->qdisc, new);
 720                        if (new && !new->ops->attach)
 721                                atomic_inc(&new->refcnt);
 722                        dev->qdisc = new ? : &noop_qdisc;
 723                } else {
 724                        notify_and_destroy(net, skb, n, classid, old, new);
 725                }
 726
 727                if (dev->flags & IFF_UP)
 728                        dev_activate(dev);
 729        } else {
 730                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
 731
 732                err = -EOPNOTSUPP;
 733                if (cops && cops->graft) {
 734                        unsigned long cl = cops->get(parent, classid);
 735                        if (cl) {
 736                                err = cops->graft(parent, cl, new, &old);
 737                                cops->put(parent, cl);
 738                        } else
 739                                err = -ENOENT;
 740                }
 741                if (!err)
 742                        notify_and_destroy(net, skb, n, classid, old, new);
 743        }
 744        return err;
 745}
 746
 747/* lockdep annotation is needed for ingress; egress gets it only for name */
 748static struct lock_class_key qdisc_tx_lock;
 749static struct lock_class_key qdisc_rx_lock;
 750
 751/*
 752   Allocate and initialize new qdisc.
 753
 754   Parameters are passed via opt.
 755 */
 756
 757static struct Qdisc *
 758qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 759             struct Qdisc *p, u32 parent, u32 handle,
 760             struct nlattr **tca, int *errp)
 761{
 762        int err;
 763        struct nlattr *kind = tca[TCA_KIND];
 764        struct Qdisc *sch;
 765        struct Qdisc_ops *ops;
 766        struct qdisc_size_table *stab;
 767
 768        ops = qdisc_lookup_ops(kind);
 769#ifdef CONFIG_MODULES
 770        if (ops == NULL && kind != NULL) {
 771                char name[IFNAMSIZ];
 772                if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
 773                        /* We dropped the RTNL semaphore in order to
 774                         * perform the module load.  So, even if we
 775                         * succeeded in loading the module we have to
 776                         * tell the caller to replay the request.  We
 777                         * indicate this using -EAGAIN.
 778                         * We replay the request because the device may
 779                         * go away in the mean time.
 780                         */
 781                        rtnl_unlock();
 782                        request_module("sch_%s", name);
 783                        rtnl_lock();
 784                        ops = qdisc_lookup_ops(kind);
 785                        if (ops != NULL) {
 786                                /* We will try again qdisc_lookup_ops,
 787                                 * so don't keep a reference.
 788                                 */
 789                                module_put(ops->owner);
 790                                err = -EAGAIN;
 791                                goto err_out;
 792                        }
 793                }
 794        }
 795#endif
 796
 797        err = -ENOENT;
 798        if (ops == NULL)
 799                goto err_out;
 800
 801        sch = qdisc_alloc(dev_queue, ops);
 802        if (IS_ERR(sch)) {
 803                err = PTR_ERR(sch);
 804                goto err_out2;
 805        }
 806
 807        sch->parent = parent;
 808
 809        if (handle == TC_H_INGRESS) {
 810                sch->flags |= TCQ_F_INGRESS;
 811                handle = TC_H_MAKE(TC_H_INGRESS, 0);
 812                lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
 813        } else {
 814                if (handle == 0) {
 815                        handle = qdisc_alloc_handle(dev);
 816                        err = -ENOMEM;
 817                        if (handle == 0)
 818                                goto err_out3;
 819                }
 820                lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
 821        }
 822
 823        sch->handle = handle;
 824
 825        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
 826                if (tca[TCA_STAB]) {
 827                        stab = qdisc_get_stab(tca[TCA_STAB]);
 828                        if (IS_ERR(stab)) {
 829                                err = PTR_ERR(stab);
 830                                goto err_out4;
 831                        }
 832                        sch->stab = stab;
 833                }
 834                if (tca[TCA_RATE]) {
 835                        spinlock_t *root_lock;
 836
 837                        err = -EOPNOTSUPP;
 838                        if (sch->flags & TCQ_F_MQROOT)
 839                                goto err_out4;
 840
 841                        if ((sch->parent != TC_H_ROOT) &&
 842                            !(sch->flags & TCQ_F_INGRESS) &&
 843                            (!p || !(p->flags & TCQ_F_MQROOT)))
 844                                root_lock = qdisc_root_sleeping_lock(sch);
 845                        else
 846                                root_lock = qdisc_lock(sch);
 847
 848                        err = gen_new_estimator(&sch->bstats, &sch->rate_est,
 849                                                root_lock, tca[TCA_RATE]);
 850                        if (err)
 851                                goto err_out4;
 852                }
 853
 854                qdisc_list_add(sch);
 855
 856                return sch;
 857        }
 858err_out3:
 859        dev_put(dev);
 860        kfree((char *) sch - sch->padded);
 861err_out2:
 862        module_put(ops->owner);
 863err_out:
 864        *errp = err;
 865        return NULL;
 866
 867err_out4:
 868        /*
 869         * Any broken qdiscs that would require a ops->reset() here?
 870         * The qdisc was never in action so it shouldn't be necessary.
 871         */
 872        qdisc_put_stab(sch->stab);
 873        if (ops->destroy)
 874                ops->destroy(sch);
 875        goto err_out3;
 876}
 877
 878static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
 879{
 880        struct qdisc_size_table *stab = NULL;
 881        int err = 0;
 882
 883        if (tca[TCA_OPTIONS]) {
 884                if (sch->ops->change == NULL)
 885                        return -EINVAL;
 886                err = sch->ops->change(sch, tca[TCA_OPTIONS]);
 887                if (err)
 888                        return err;
 889        }
 890
 891        if (tca[TCA_STAB]) {
 892                stab = qdisc_get_stab(tca[TCA_STAB]);
 893                if (IS_ERR(stab))
 894                        return PTR_ERR(stab);
 895        }
 896
 897        qdisc_put_stab(sch->stab);
 898        sch->stab = stab;
 899
 900        if (tca[TCA_RATE]) {
 901                /* NB: ignores errors from replace_estimator
 902                   because change can't be undone. */
 903                if (sch->flags & TCQ_F_MQROOT)
 904                        goto out;
 905                gen_replace_estimator(&sch->bstats, &sch->rate_est,
 906                                            qdisc_root_sleeping_lock(sch),
 907                                            tca[TCA_RATE]);
 908        }
 909out:
 910        return 0;
 911}
 912
 913struct check_loop_arg
 914{
 915        struct qdisc_walker     w;
 916        struct Qdisc            *p;
 917        int                     depth;
 918};
 919
 920static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
 921
 922static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
 923{
 924        struct check_loop_arg   arg;
 925
 926        if (q->ops->cl_ops == NULL)
 927                return 0;
 928
 929        arg.w.stop = arg.w.skip = arg.w.count = 0;
 930        arg.w.fn = check_loop_fn;
 931        arg.depth = depth;
 932        arg.p = p;
 933        q->ops->cl_ops->walk(q, &arg.w);
 934        return arg.w.stop ? -ELOOP : 0;
 935}
 936
 937static int
 938check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
 939{
 940        struct Qdisc *leaf;
 941        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
 942        struct check_loop_arg *arg = (struct check_loop_arg *)w;
 943
 944        leaf = cops->leaf(q, cl);
 945        if (leaf) {
 946                if (leaf == arg->p || arg->depth > 7)
 947                        return -ELOOP;
 948                return check_loop(leaf, arg->p, arg->depth + 1);
 949        }
 950        return 0;
 951}
 952
/*
 * Delete/get qdisc.
 */
 956
 957static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 958{
 959        struct net *net = sock_net(skb->sk);
 960        struct tcmsg *tcm = NLMSG_DATA(n);
 961        struct nlattr *tca[TCA_MAX + 1];
 962        struct net_device *dev;
 963        u32 clid = tcm->tcm_parent;
 964        struct Qdisc *q = NULL;
 965        struct Qdisc *p = NULL;
 966        int err;
 967
 968        if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
 969                return -ENODEV;
 970
 971        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
 972        if (err < 0)
 973                return err;
 974
 975        if (clid) {
 976                if (clid != TC_H_ROOT) {
 977                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
 978                                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
 979                                        return -ENOENT;
 980                                q = qdisc_leaf(p, clid);
 981                        } else { /* ingress */
 982                                q = dev->rx_queue.qdisc_sleeping;
 983                        }
 984                } else {
 985                        q = dev->qdisc;
 986                }
 987                if (!q)
 988                        return -ENOENT;
 989
 990                if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
 991                        return -EINVAL;
 992        } else {
 993                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
 994                        return -ENOENT;
 995        }
 996
 997        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
 998                return -EINVAL;
 999
1000        if (n->nlmsg_type == RTM_DELQDISC) {
1001                if (!clid)
1002                        return -EINVAL;
1003                if (q->handle == 0)
1004                        return -ENOENT;
1005                if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
1006                        return err;
1007        } else {
1008                qdisc_notify(net, skb, n, clid, NULL, q);
1009        }
1010        return 0;
1011}
1012
/*
 * Create/change qdisc.
 */
1016
1017static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1018{
1019        struct net *net = sock_net(skb->sk);
1020        struct tcmsg *tcm;
1021        struct nlattr *tca[TCA_MAX + 1];
1022        struct net_device *dev;
1023        u32 clid;
1024        struct Qdisc *q, *p;
1025        int err;
1026
1027replay:
1028        /* Reinit, just in case something touches this. */
1029        tcm = NLMSG_DATA(n);
1030        clid = tcm->tcm_parent;
1031        q = p = NULL;
1032
1033        if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
1034                return -ENODEV;
1035
1036        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1037        if (err < 0)
1038                return err;
1039
1040        if (clid) {
1041                if (clid != TC_H_ROOT) {
1042                        if (clid != TC_H_INGRESS) {
1043                                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
1044                                        return -ENOENT;
1045                                q = qdisc_leaf(p, clid);
1046                        } else { /*ingress */
1047                                q = dev->rx_queue.qdisc_sleeping;
1048                        }
1049                } else {
1050                        q = dev->qdisc;
1051                }
1052
1053                /* It may be default qdisc, ignore it */
1054                if (q && q->handle == 0)
1055                        q = NULL;
1056
1057                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1058                        if (tcm->tcm_handle) {
1059                                if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
1060                                        return -EEXIST;
1061                                if (TC_H_MIN(tcm->tcm_handle))
1062                                        return -EINVAL;
1063                                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
1064                                        goto create_n_graft;
1065                                if (n->nlmsg_flags&NLM_F_EXCL)
1066                                        return -EEXIST;
1067                                if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1068                                        return -EINVAL;
1069                                if (q == p ||
1070                                    (p && check_loop(q, p, 0)))
1071                                        return -ELOOP;
1072                                atomic_inc(&q->refcnt);
1073                                goto graft;
1074                        } else {
1075                                if (q == NULL)
1076                                        goto create_n_graft;
1077
1078                                /* This magic test requires explanation.
1079                                 *
1080                                 *   We know, that some child q is already
1081                                 *   attached to this parent and have choice:
1082                                 *   either to change it or to create/graft new one.
1083                                 *
1084                                 *   1. We are allowed to create/graft only
1085                                 *   if CREATE and REPLACE flags are set.
1086                                 *
1087                                 *   2. If EXCL is set, requestor wanted to say,
1088                                 *   that qdisc tcm_handle is not expected
1089                                 *   to exist, so that we choose create/graft too.
1090                                 *
1091                                 *   3. The last case is when no flags are set.
1092                                 *   Alas, it is sort of hole in API, we
1093                                 *   cannot decide what to do unambiguously.
1094                                 *   For now we select create/graft, if
1095                                 *   user gave KIND, which does not match existing.
1096                                 */
1097                                if ((n->nlmsg_flags&NLM_F_CREATE) &&
1098                                    (n->nlmsg_flags&NLM_F_REPLACE) &&
1099                                    ((n->nlmsg_flags&NLM_F_EXCL) ||
1100                                     (tca[TCA_KIND] &&
1101                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1102                                        goto create_n_graft;
1103                        }
1104                }
1105        } else {
1106                if (!tcm->tcm_handle)
1107                        return -EINVAL;
1108                q = qdisc_lookup(dev, tcm->tcm_handle);
1109        }
1110
1111        /* Change qdisc parameters */
1112        if (q == NULL)
1113                return -ENOENT;
1114        if (n->nlmsg_flags&NLM_F_EXCL)
1115                return -EEXIST;
1116        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1117                return -EINVAL;
1118        err = qdisc_change(q, tca);
1119        if (err == 0)
1120                qdisc_notify(net, skb, n, clid, NULL, q);
1121        return err;
1122
1123create_n_graft:
1124        if (!(n->nlmsg_flags&NLM_F_CREATE))
1125                return -ENOENT;
1126        if (clid == TC_H_INGRESS)
1127                q = qdisc_create(dev, &dev->rx_queue, p,
1128                                 tcm->tcm_parent, tcm->tcm_parent,
1129                                 tca, &err);
1130        else {
1131                struct netdev_queue *dev_queue;
1132
1133                if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1134                        dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1135                else if (p)
1136                        dev_queue = p->dev_queue;
1137                else
1138                        dev_queue = netdev_get_tx_queue(dev, 0);
1139
1140                q = qdisc_create(dev, dev_queue, p,
1141                                 tcm->tcm_parent, tcm->tcm_handle,
1142                                 tca, &err);
1143        }
1144        if (q == NULL) {
1145                if (err == -EAGAIN)
1146                        goto replay;
1147                return err;
1148        }
1149
1150graft:
1151        err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1152        if (err) {
1153                if (q)
1154                        qdisc_destroy(q);
1155                return err;
1156        }
1157
1158        return 0;
1159}
1160
1161static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1162                         u32 pid, u32 seq, u16 flags, int event)
1163{
1164        struct tcmsg *tcm;
1165        struct nlmsghdr  *nlh;
1166        unsigned char *b = skb_tail_pointer(skb);
1167        struct gnet_dump d;
1168
1169        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1170        tcm = NLMSG_DATA(nlh);
1171        tcm->tcm_family = AF_UNSPEC;
1172        tcm->tcm__pad1 = 0;
1173        tcm->tcm__pad2 = 0;
1174        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1175        tcm->tcm_parent = clid;
1176        tcm->tcm_handle = q->handle;
1177        tcm->tcm_info = atomic_read(&q->refcnt);
1178        NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1179        if (q->ops->dump && q->ops->dump(q, skb) < 0)
1180                goto nla_put_failure;
1181        q->qstats.qlen = q->q.qlen;
1182
1183        if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
1184                goto nla_put_failure;
1185
1186        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1187                                         qdisc_root_sleeping_lock(q), &d) < 0)
1188                goto nla_put_failure;
1189
1190        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1191                goto nla_put_failure;
1192
1193        if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1194            gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1195            gnet_stats_copy_queue(&d, &q->qstats) < 0)
1196                goto nla_put_failure;
1197
1198        if (gnet_stats_finish_copy(&d) < 0)
1199                goto nla_put_failure;
1200
1201        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1202        return skb->len;
1203
1204nlmsg_failure:
1205nla_put_failure:
1206        nlmsg_trim(skb, b);
1207        return -1;
1208}
1209
1210static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1211{
1212        return (q->flags & TCQ_F_BUILTIN) ? true : false;
1213}
1214
1215static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1216                        struct nlmsghdr *n, u32 clid,
1217                        struct Qdisc *old, struct Qdisc *new)
1218{
1219        struct sk_buff *skb;
1220        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1221
1222        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1223        if (!skb)
1224                return -ENOBUFS;
1225
1226        if (old && !tc_qdisc_dump_ignore(old)) {
1227                if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1228                        goto err_out;
1229        }
1230        if (new && !tc_qdisc_dump_ignore(new)) {
1231                if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1232                        goto err_out;
1233        }
1234
1235        if (skb->len)
1236                return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1237
1238err_out:
1239        kfree_skb(skb);
1240        return -EINVAL;
1241}
1242
1243static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1244                              struct netlink_callback *cb,
1245                              int *q_idx_p, int s_q_idx)
1246{
1247        int ret = 0, q_idx = *q_idx_p;
1248        struct Qdisc *q;
1249
1250        if (!root)
1251                return 0;
1252
1253        q = root;
1254        if (q_idx < s_q_idx) {
1255                q_idx++;
1256        } else {
1257                if (!tc_qdisc_dump_ignore(q) &&
1258                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1259                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1260                        goto done;
1261                q_idx++;
1262        }
1263        list_for_each_entry(q, &root->list, list) {
1264                if (q_idx < s_q_idx) {
1265                        q_idx++;
1266                        continue;
1267                }
1268                if (!tc_qdisc_dump_ignore(q) && 
1269                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1270                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1271                        goto done;
1272                q_idx++;
1273        }
1274
1275out:
1276        *q_idx_p = q_idx;
1277        return ret;
1278done:
1279        ret = -1;
1280        goto out;
1281}
1282
1283static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1284{
1285        struct net *net = sock_net(skb->sk);
1286        int idx, q_idx;
1287        int s_idx, s_q_idx;
1288        struct net_device *dev;
1289
1290        s_idx = cb->args[0];
1291        s_q_idx = q_idx = cb->args[1];
1292
1293        rcu_read_lock();
1294        idx = 0;
1295        for_each_netdev_rcu(net, dev) {
1296                struct netdev_queue *dev_queue;
1297
1298                if (idx < s_idx)
1299                        goto cont;
1300                if (idx > s_idx)
1301                        s_q_idx = 0;
1302                q_idx = 0;
1303
1304                if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1305                        goto done;
1306
1307                dev_queue = &dev->rx_queue;
1308                if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
1309                        goto done;
1310
1311cont:
1312                idx++;
1313        }
1314
1315done:
1316        rcu_read_unlock();
1317
1318        cb->args[0] = idx;
1319        cb->args[1] = q_idx;
1320
1321        return skb->len;
1322}
1323
1324
1325
/************************************************
 *      Traffic classes manipulation.           *
 ************************************************/
1329
1330
1331
1332static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1333{
1334        struct net *net = sock_net(skb->sk);
1335        struct tcmsg *tcm = NLMSG_DATA(n);
1336        struct nlattr *tca[TCA_MAX + 1];
1337        struct net_device *dev;
1338        struct Qdisc *q = NULL;
1339        const struct Qdisc_class_ops *cops;
1340        unsigned long cl = 0;
1341        unsigned long new_cl;
1342        u32 pid = tcm->tcm_parent;
1343        u32 clid = tcm->tcm_handle;
1344        u32 qid = TC_H_MAJ(clid);
1345        int err;
1346
1347        if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
1348                return -ENODEV;
1349
1350        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1351        if (err < 0)
1352                return err;
1353
1354        /*
1355           parent == TC_H_UNSPEC - unspecified parent.
1356           parent == TC_H_ROOT   - class is root, which has no parent.
1357           parent == X:0         - parent is root class.
1358           parent == X:Y         - parent is a node in hierarchy.
1359           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1360
1361           handle == 0:0         - generate handle from kernel pool.
1362           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1363           handle == X:Y         - clear.
1364           handle == X:0         - root class.
1365         */
1366
1367        /* Step 1. Determine qdisc handle X:0 */
1368
1369        if (pid != TC_H_ROOT) {
1370                u32 qid1 = TC_H_MAJ(pid);
1371
1372                if (qid && qid1) {
1373                        /* If both majors are known, they must be identical. */
1374                        if (qid != qid1)
1375                                return -EINVAL;
1376                } else if (qid1) {
1377                        qid = qid1;
1378                } else if (qid == 0)
1379                        qid = dev->qdisc->handle;
1380
1381                /* Now qid is genuine qdisc handle consistent
1382                   both with parent and child.
1383
1384                   TC_H_MAJ(pid) still may be unspecified, complete it now.
1385                 */
1386                if (pid)
1387                        pid = TC_H_MAKE(qid, pid);
1388        } else {
1389                if (qid == 0)
1390                        qid = dev->qdisc->handle;
1391        }
1392
1393        /* OK. Locate qdisc */
1394        if ((q = qdisc_lookup(dev, qid)) == NULL)
1395                return -ENOENT;
1396
1397        /* An check that it supports classes */
1398        cops = q->ops->cl_ops;
1399        if (cops == NULL)
1400                return -EINVAL;
1401
1402        /* Now try to get class */
1403        if (clid == 0) {
1404                if (pid == TC_H_ROOT)
1405                        clid = qid;
1406        } else
1407                clid = TC_H_MAKE(qid, clid);
1408
1409        if (clid)
1410                cl = cops->get(q, clid);
1411
1412        if (cl == 0) {
1413                err = -ENOENT;
1414                if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
1415                        goto out;
1416        } else {
1417                switch (n->nlmsg_type) {
1418                case RTM_NEWTCLASS:
1419                        err = -EEXIST;
1420                        if (n->nlmsg_flags&NLM_F_EXCL)
1421                                goto out;
1422                        break;
1423                case RTM_DELTCLASS:
1424                        err = -EOPNOTSUPP;
1425                        if (cops->delete)
1426                                err = cops->delete(q, cl);
1427                        if (err == 0)
1428                                tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1429                        goto out;
1430                case RTM_GETTCLASS:
1431                        err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1432                        goto out;
1433                default:
1434                        err = -EINVAL;
1435                        goto out;
1436                }
1437        }
1438
1439        new_cl = cl;
1440        err = -EOPNOTSUPP;
1441        if (cops->change)
1442                err = cops->change(q, clid, pid, tca, &new_cl);
1443        if (err == 0)
1444                tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1445
1446out:
1447        if (cl)
1448                cops->put(q, cl);
1449
1450        return err;
1451}
1452
1453
1454static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1455                          unsigned long cl,
1456                          u32 pid, u32 seq, u16 flags, int event)
1457{
1458        struct tcmsg *tcm;
1459        struct nlmsghdr  *nlh;
1460        unsigned char *b = skb_tail_pointer(skb);
1461        struct gnet_dump d;
1462        const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1463
1464        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1465        tcm = NLMSG_DATA(nlh);
1466        tcm->tcm_family = AF_UNSPEC;
1467        tcm->tcm__pad1 = 0;
1468        tcm->tcm__pad2 = 0;
1469        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1470        tcm->tcm_parent = q->handle;
1471        tcm->tcm_handle = q->handle;
1472        tcm->tcm_info = 0;
1473        NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1474        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1475                goto nla_put_failure;
1476
1477        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1478                                         qdisc_root_sleeping_lock(q), &d) < 0)
1479                goto nla_put_failure;
1480
1481        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1482                goto nla_put_failure;
1483
1484        if (gnet_stats_finish_copy(&d) < 0)
1485                goto nla_put_failure;
1486
1487        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1488        return skb->len;
1489
1490nlmsg_failure:
1491nla_put_failure:
1492        nlmsg_trim(skb, b);
1493        return -1;
1494}
1495
1496static int tclass_notify(struct net *net, struct sk_buff *oskb,
1497                         struct nlmsghdr *n, struct Qdisc *q,
1498                         unsigned long cl, int event)
1499{
1500        struct sk_buff *skb;
1501        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1502
1503        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1504        if (!skb)
1505                return -ENOBUFS;
1506
1507        if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1508                kfree_skb(skb);
1509                return -EINVAL;
1510        }
1511
1512        return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1513}
1514
1515struct qdisc_dump_args
1516{
1517        struct qdisc_walker w;
1518        struct sk_buff *skb;
1519        struct netlink_callback *cb;
1520};
1521
1522static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1523{
1524        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1525
1526        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1527                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1528}
1529
1530static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1531                                struct tcmsg *tcm, struct netlink_callback *cb,
1532                                int *t_p, int s_t)
1533{
1534        struct qdisc_dump_args arg;
1535
1536        if (tc_qdisc_dump_ignore(q) ||
1537            *t_p < s_t || !q->ops->cl_ops ||
1538            (tcm->tcm_parent &&
1539             TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1540                (*t_p)++;
1541                return 0;
1542        }
1543        if (*t_p > s_t)
1544                memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1545        arg.w.fn = qdisc_class_dump;
1546        arg.skb = skb;
1547        arg.cb = cb;
1548        arg.w.stop  = 0;
1549        arg.w.skip = cb->args[1];
1550        arg.w.count = 0;
1551        q->ops->cl_ops->walk(q, &arg.w);
1552        cb->args[1] = arg.w.count;
1553        if (arg.w.stop)
1554                return -1;
1555        (*t_p)++;
1556        return 0;
1557}
1558
1559static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1560                               struct tcmsg *tcm, struct netlink_callback *cb,
1561                               int *t_p, int s_t)
1562{
1563        struct Qdisc *q;
1564
1565        if (!root)
1566                return 0;
1567
1568        if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1569                return -1;
1570
1571        list_for_each_entry(q, &root->list, list) {
1572                if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1573                        return -1;
1574        }
1575
1576        return 0;
1577}
1578
1579static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1580{
1581        struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1582        struct net *net = sock_net(skb->sk);
1583        struct netdev_queue *dev_queue;
1584        struct net_device *dev;
1585        int t, s_t;
1586
1587        if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1588                return 0;
1589        if ((dev = dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
1590                return 0;
1591
1592        s_t = cb->args[0];
1593        t = 0;
1594
1595        if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1596                goto done;
1597
1598        dev_queue = &dev->rx_queue;
1599        if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
1600                goto done;
1601
1602done:
1603        cb->args[0] = t;
1604
1605        dev_put(dev);
1606        return skb->len;
1607}
1608
1609/* Main classifier routine: scans classifier chain attached
1610   to this qdisc, (optionally) tests for protocol and asks
1611   specific classifiers.
1612 */
1613int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1614                       struct tcf_result *res)
1615{
1616        __be16 protocol = skb->protocol;
1617        int err = 0;
1618
1619        for (; tp; tp = tp->next) {
1620                if ((tp->protocol == protocol ||
1621                     tp->protocol == htons(ETH_P_ALL)) &&
1622                    (err = tp->classify(skb, tp, res)) >= 0) {
1623#ifdef CONFIG_NET_CLS_ACT
1624                        if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1625                                skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1626#endif
1627                        return err;
1628                }
1629        }
1630        return -1;
1631}
1632EXPORT_SYMBOL(tc_classify_compat);
1633
/*
 * Classify @skb against the chain starting at @tp.
 *
 * Thin wrapper around tc_classify_compat().  With CONFIG_NET_CLS_ACT,
 * a TC_ACT_RECLASSIFY verdict restarts classification from the head of
 * the chain; the number of restarts is tracked in the verdict field of
 * skb->tc_verd.  Once that count has reached MAX_REC_LOOP the packet is
 * given the TC_ACT_SHOT verdict and a rate-limited notice is logged.
 *
 * Returns the verdict from tc_classify_compat(), or TC_ACT_SHOT when the
 * reclassify limit is hit.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
                struct tcf_result *res)
{
        int err = 0;
#ifdef CONFIG_NET_CLS_ACT
        struct tcf_proto *otp = tp;
reclassify:
#endif
        err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
        if (err == TC_ACT_RECLASSIFY) {
                u32 verd = G_TC_VERD(skb->tc_verd);

                /* Restart from the head of the chain. */
                tp = otp;

                /* verd is compared before the increment, stored after it. */
                if (verd++ >= MAX_REC_LOOP) {
                        if (net_ratelimit())
                                printk(KERN_NOTICE
                                       "%s: packet reclassify loop"
                                          " rule prio %u protocol %02x\n",
                                       tp->q->ops->id,
                                       tp->prio & 0xffff, ntohs(tp->protocol));
                        return TC_ACT_SHOT;
                }
                skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
                goto reclassify;
        }
#endif
        return err;
}
EXPORT_SYMBOL(tc_classify);
1667
1668void tcf_destroy(struct tcf_proto *tp)
1669{
1670        tp->ops->destroy(tp);
1671        module_put(tp->ops->owner);
1672        kfree(tp);
1673}
1674
1675void tcf_destroy_chain(struct tcf_proto **fl)
1676{
1677        struct tcf_proto *tp;
1678
1679        while ((tp = *fl) != NULL) {
1680                *fl = tp->next;
1681                tcf_destroy(tp);
1682        }
1683}
1684EXPORT_SYMBOL(tcf_destroy_chain);
1685
1686#ifdef CONFIG_PROC_FS
1687static int psched_show(struct seq_file *seq, void *v)
1688{
1689        struct timespec ts;
1690
1691        hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1692        seq_printf(seq, "%08x %08x %08x %08x\n",
1693                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1694                   1000000,
1695                   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1696
1697        return 0;
1698}
1699
1700static int psched_open(struct inode *inode, struct file *file)
1701{
1702        return single_open(file, psched_show, NULL);
1703}
1704
1705static const struct file_operations psched_fops = {
1706        .owner = THIS_MODULE,
1707        .open = psched_open,
1708        .read  = seq_read,
1709        .llseek = seq_lseek,
1710        .release = single_release,
1711};
1712
1713static int __net_init psched_net_init(struct net *net)
1714{
1715        struct proc_dir_entry *e;
1716
1717        e = proc_net_fops_create(net, "psched", 0, &psched_fops);
1718        if (e == NULL)
1719                return -ENOMEM;
1720
1721        return 0;
1722}
1723
1724static void __net_exit psched_net_exit(struct net *net)
1725{
1726        proc_net_remove(net, "psched");
1727}
1728#else
1729static int __net_init psched_net_init(struct net *net)
1730{
1731        return 0;
1732}
1733
1734static void __net_exit psched_net_exit(struct net *net)
1735{
1736}
1737#endif
1738
1739static struct pernet_operations psched_net_ops = {
1740        .init = psched_net_init,
1741        .exit = psched_net_exit,
1742};
1743
1744static int __init pktsched_init(void)
1745{
1746        int err;
1747
1748        err = register_pernet_subsys(&psched_net_ops);
1749        if (err) {
1750                printk(KERN_ERR "pktsched_init: "
1751                       "cannot initialize per netns operations\n");
1752                return err;
1753        }
1754
1755        register_qdisc(&pfifo_qdisc_ops);
1756        register_qdisc(&bfifo_qdisc_ops);
1757        register_qdisc(&pfifo_head_drop_qdisc_ops);
1758        register_qdisc(&mq_qdisc_ops);
1759
1760        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1761        rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1762        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1763        rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1764        rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1765        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1766
1767        return 0;
1768}
1769
1770subsys_initcall(pktsched_init);
1771