linux/net/sched/sch_api.c
<<
>>
Prefs
   1/*
   2 * net/sched/sch_api.c  Packet scheduler API.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Fixes:
  12 *
  13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  16 */
  17
  18#include <linux/module.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/string.h>
  22#include <linux/errno.h>
  23#include <linux/skbuff.h>
  24#include <linux/init.h>
  25#include <linux/proc_fs.h>
  26#include <linux/seq_file.h>
  27#include <linux/kmod.h>
  28#include <linux/list.h>
  29#include <linux/hrtimer.h>
  30#include <linux/lockdep.h>
  31#include <linux/slab.h>
  32
  33#include <net/net_namespace.h>
  34#include <net/sock.h>
  35#include <net/netlink.h>
  36#include <net/pkt_sched.h>
  37
  38static int qdisc_notify(struct net *net, struct sk_buff *oskb,
  39                        struct nlmsghdr *n, u32 clid,
  40                        struct Qdisc *old, struct Qdisc *new);  /* forward declaration: called by notify_and_destroy() before its definition */
  41static int tclass_notify(struct net *net, struct sk_buff *oskb,
  42                         struct nlmsghdr *n, struct Qdisc *q,
  43                         unsigned long cl, int event);  /* forward declaration of a static helper defined elsewhere in this file */
  44
  45/*
  46
  47   Short review.
  48   -------------
  49
  50   This file consists of two interrelated parts:
  51
  52   1. queueing disciplines manager frontend.
  53   2. traffic classes manager frontend.
  54
  55   Generally, queueing discipline ("qdisc") is a black box,
  56   which is able to enqueue packets and to dequeue them (when
  57   device is ready to send something) in order and at times
  58   determined by algorithm hidden in it.
  59
  60   qdiscs are divided into two categories:
  61   - "queues", which have no internal structure visible from outside.
  62   - "schedulers", which split all the packets to "traffic classes",
  63     using "packet classifiers" (look at cls_api.c)
  64
  65   In turn, classes may have child qdiscs (as rule, queues)
  66   attached to them etc. etc. etc.
  67
  68   The goal of the routines in this file is to translate
  69   information supplied by the user in the form of handles
  70   into a form more intelligible to the kernel, to perform sanity
  71   checks and the part of the work that is common to all qdiscs,
  72   and to provide rtnetlink notifications.
  73
  74   All real intelligent work is done inside qdisc modules.
  75
  76
  77
  78   Every discipline has two major routines: enqueue and dequeue.
  79
  80   ---dequeue
  81
  82   dequeue usually returns a skb to send. It is allowed to return NULL,
  83   but it does not mean that queue is empty, it just means that
  84   discipline does not want to send anything this time.
  85   Queue is really empty if q->q.qlen == 0.
  86   For complicated disciplines with multiple queues q->q is not
  87   real packet queue, but however q->q.qlen must be valid.
  88
  89   ---enqueue
  90
  91   enqueue returns 0, if packet was enqueued successfully.
  92   If packet (this one or another one) was dropped, it returns
  93   not zero error code.
  94   NET_XMIT_DROP        - this packet dropped
  95     Expected action: do not backoff, but wait until queue will clear.
  96   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  97     Expected action: backoff or ignore
  98   NET_XMIT_POLICED     - dropped by police.
  99     Expected action: backoff or error to real-time apps.
 100
 101   Auxiliary routines:
 102
 103   ---peek
 104
 105   like dequeue but without removing a packet from the queue
 106
 107   ---reset
 108
 109   returns qdisc to initial state: purge all buffers, clear all
 110   timers, counters (except for statistics) etc.
 111
 112   ---init
 113
 114   initializes newly created qdisc.
 115
 116   ---destroy
 117
 118   destroys resources allocated by init and during lifetime of qdisc.
 119
 120   ---change
 121
 122   changes qdisc parameters.
 123 */
 124
 125/* Protects the list of registered TC modules (qdisc_base): taken for
     * writing by register_qdisc()/unregister_qdisc() and for reading by
     * qdisc_lookup_ops(). It is a pure SMP lock. */
 126static DEFINE_RWLOCK(qdisc_mod_lock);
 127
 128
 129/************************************************
 130 *      Queueing disciplines manipulation.      *
 131 ************************************************/
 132
 133
 134/* The list of all installed queueing disciplines: head of a singly
     * linked chain threaded through Qdisc_ops->next, accessed under
     * qdisc_mod_lock. */
 135
 136static struct Qdisc_ops *qdisc_base;
 137
 138/* Register/uregister queueing discipline */
 139
 140int register_qdisc(struct Qdisc_ops *qops)
 141{
 142        struct Qdisc_ops *q, **qp;
 143        int rc = -EEXIST;
 144
 145        write_lock(&qdisc_mod_lock);
 146        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 147                if (!strcmp(qops->id, q->id))
 148                        goto out;
 149
 150        if (qops->enqueue == NULL)
 151                qops->enqueue = noop_qdisc_ops.enqueue;
 152        if (qops->peek == NULL) {
 153                if (qops->dequeue == NULL)
 154                        qops->peek = noop_qdisc_ops.peek;
 155                else
 156                        goto out_einval;
 157        }
 158        if (qops->dequeue == NULL)
 159                qops->dequeue = noop_qdisc_ops.dequeue;
 160
 161        if (qops->cl_ops) {
 162                const struct Qdisc_class_ops *cops = qops->cl_ops;
 163
 164                if (!(cops->get && cops->put && cops->walk && cops->leaf))
 165                        goto out_einval;
 166
 167                if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
 168                        goto out_einval;
 169        }
 170
 171        qops->next = NULL;
 172        *qp = qops;
 173        rc = 0;
 174out:
 175        write_unlock(&qdisc_mod_lock);
 176        return rc;
 177
 178out_einval:
 179        rc = -EINVAL;
 180        goto out;
 181}
 182EXPORT_SYMBOL(register_qdisc);
 183
 184int unregister_qdisc(struct Qdisc_ops *qops)
 185{
 186        struct Qdisc_ops *q, **qp;
 187        int err = -ENOENT;
 188
 189        write_lock(&qdisc_mod_lock);
 190        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 191                if (q == qops)
 192                        break;
 193        if (q) {
 194                *qp = q->next;
 195                q->next = NULL;
 196                err = 0;
 197        }
 198        write_unlock(&qdisc_mod_lock);
 199        return err;
 200}
 201EXPORT_SYMBOL(unregister_qdisc);
 202
 203/* We know handle. Find qdisc among all qdisc's attached to device
 204   (root qdisc, all its children, children of children etc.)
 205 */
 206
 207static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 208{
 209        struct Qdisc *q;
 210
 211        if (!(root->flags & TCQ_F_BUILTIN) &&
 212            root->handle == handle)
 213                return root;
 214
 215        list_for_each_entry(q, &root->list, list) {
 216                if (q->handle == handle)
 217                        return q;
 218        }
 219        return NULL;
 220}
 221
 222static void qdisc_list_add(struct Qdisc *q)
 223{
 224        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 225                list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
 226}
 227
 228void qdisc_list_del(struct Qdisc *q)
 229{
 230        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
 231                list_del(&q->list);
 232}
 233EXPORT_SYMBOL(qdisc_list_del);
 234
 235struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 236{
 237        struct Qdisc *q;
 238
 239        q = qdisc_match_from_root(dev->qdisc, handle);
 240        if (q)
 241                goto out;
 242
 243        if (dev_ingress_queue(dev))
 244                q = qdisc_match_from_root(
 245                        dev_ingress_queue(dev)->qdisc_sleeping,
 246                        handle);
 247out:
 248        return q;
 249}
 250
 251static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 252{
 253        unsigned long cl;
 254        struct Qdisc *leaf;
 255        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 256
 257        if (cops == NULL)
 258                return NULL;
 259        cl = cops->get(p, classid);
 260
 261        if (cl == 0)
 262                return NULL;
 263        leaf = cops->leaf(p, cl);
 264        cops->put(p, cl);
 265        return leaf;
 266}
 267
 268/* Find queueing discipline by name */
 269
 270static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 271{
 272        struct Qdisc_ops *q = NULL;
 273
 274        if (kind) {
 275                read_lock(&qdisc_mod_lock);
 276                for (q = qdisc_base; q; q = q->next) {
 277                        if (nla_strcmp(kind, q->id) == 0) {
 278                                if (!try_module_get(q->owner))
 279                                        q = NULL;
 280                                break;
 281                        }
 282                }
 283                read_unlock(&qdisc_mod_lock);
 284        }
 285        return q;
 286}
 287
 288static struct qdisc_rate_table *qdisc_rtab_list;  /* head of the ->next-linked list of refcounted rate tables handed out by qdisc_get_rtab() */
 289
 290struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
 291{
 292        struct qdisc_rate_table *rtab;
 293
 294        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 295                if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
 296                        rtab->refcnt++;
 297                        return rtab;
 298                }
 299        }
 300
 301        if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
 302            nla_len(tab) != TC_RTAB_SIZE)
 303                return NULL;
 304
 305        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 306        if (rtab) {
 307                rtab->rate = *r;
 308                rtab->refcnt = 1;
 309                memcpy(rtab->data, nla_data(tab), 1024);
 310                rtab->next = qdisc_rtab_list;
 311                qdisc_rtab_list = rtab;
 312        }
 313        return rtab;
 314}
 315EXPORT_SYMBOL(qdisc_get_rtab);
 316
 317void qdisc_put_rtab(struct qdisc_rate_table *tab)
 318{
 319        struct qdisc_rate_table *rtab, **rtabp;
 320
 321        if (!tab || --tab->refcnt)
 322                return;
 323
 324        for (rtabp = &qdisc_rtab_list;
 325             (rtab = *rtabp) != NULL;
 326             rtabp = &rtab->next) {
 327                if (rtab == tab) {
 328                        *rtabp = rtab->next;
 329                        kfree(rtab);
 330                        return;
 331                }
 332        }
 333}
 334EXPORT_SYMBOL(qdisc_put_rtab);
 335
 336static LIST_HEAD(qdisc_stab_list);  /* all size tables created by qdisc_get_stab(), shared through their refcnt */
 337static DEFINE_SPINLOCK(qdisc_stab_lock);  /* held around qdisc_stab_list walks/updates and refcnt changes of listed tables */
 338
    /* Attribute policy handed to nla_parse_nested() in qdisc_get_stab(). */
 339static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
 340        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
 341        [TCA_STAB_DATA] = { .type = NLA_BINARY },
 342};
 343
 344static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 345{
 346        struct nlattr *tb[TCA_STAB_MAX + 1];
 347        struct qdisc_size_table *stab;
 348        struct tc_sizespec *s;
 349        unsigned int tsize = 0;
 350        u16 *tab = NULL;
 351        int err;
 352
 353        err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
 354        if (err < 0)
 355                return ERR_PTR(err);
 356        if (!tb[TCA_STAB_BASE])
 357                return ERR_PTR(-EINVAL);
 358
 359        s = nla_data(tb[TCA_STAB_BASE]);
 360
 361        if (s->tsize > 0) {
 362                if (!tb[TCA_STAB_DATA])
 363                        return ERR_PTR(-EINVAL);
 364                tab = nla_data(tb[TCA_STAB_DATA]);
 365                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 366        }
 367
 368        if (tsize != s->tsize || (!tab && tsize > 0))
 369                return ERR_PTR(-EINVAL);
 370
 371        spin_lock(&qdisc_stab_lock);
 372
 373        list_for_each_entry(stab, &qdisc_stab_list, list) {
 374                if (memcmp(&stab->szopts, s, sizeof(*s)))
 375                        continue;
 376                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 377                        continue;
 378                stab->refcnt++;
 379                spin_unlock(&qdisc_stab_lock);
 380                return stab;
 381        }
 382
 383        spin_unlock(&qdisc_stab_lock);
 384
 385        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 386        if (!stab)
 387                return ERR_PTR(-ENOMEM);
 388
 389        stab->refcnt = 1;
 390        stab->szopts = *s;
 391        if (tsize > 0)
 392                memcpy(stab->data, tab, tsize * sizeof(u16));
 393
 394        spin_lock(&qdisc_stab_lock);
 395        list_add_tail(&stab->list, &qdisc_stab_list);
 396        spin_unlock(&qdisc_stab_lock);
 397
 398        return stab;
 399}
 400
 401static void stab_kfree_rcu(struct rcu_head *head)
 402{
 403        kfree(container_of(head, struct qdisc_size_table, rcu));
 404}
 405
 406void qdisc_put_stab(struct qdisc_size_table *tab)
 407{
 408        if (!tab)
 409                return;
 410
 411        spin_lock(&qdisc_stab_lock);
 412
 413        if (--tab->refcnt == 0) {
 414                list_del(&tab->list);
 415                call_rcu_bh(&tab->rcu, stab_kfree_rcu);
 416        }
 417
 418        spin_unlock(&qdisc_stab_lock);
 419}
 420EXPORT_SYMBOL(qdisc_put_stab);
 421
 422static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 423{
 424        struct nlattr *nest;
 425
 426        nest = nla_nest_start(skb, TCA_STAB);
 427        if (nest == NULL)
 428                goto nla_put_failure;
 429        NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
 430        nla_nest_end(skb, nest);
 431
 432        return skb->len;
 433
 434nla_put_failure:
 435        return -1;
 436}
 437
 438void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
 439{
 440        int pkt_len, slot;
 441
 442        pkt_len = skb->len + stab->szopts.overhead;
 443        if (unlikely(!stab->szopts.tsize))
 444                goto out;
 445
 446        slot = pkt_len + stab->szopts.cell_align;
 447        if (unlikely(slot < 0))
 448                slot = 0;
 449
 450        slot >>= stab->szopts.cell_log;
 451        if (likely(slot < stab->szopts.tsize))
 452                pkt_len = stab->data[slot];
 453        else
 454                pkt_len = stab->data[stab->szopts.tsize - 1] *
 455                                (slot / stab->szopts.tsize) +
 456                                stab->data[slot % stab->szopts.tsize];
 457
 458        pkt_len <<= stab->szopts.size_log;
 459out:
 460        if (unlikely(pkt_len < 1))
 461                pkt_len = 1;
 462        qdisc_skb_cb(skb)->pkt_len = pkt_len;
 463}
 464EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
 465
 466void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
 467{
 468        if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
 469                pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
 470                        txt, qdisc->ops->id, qdisc->handle >> 16);
 471                qdisc->flags |= TCQ_F_WARN_NONWC;
 472        }
 473}
 474EXPORT_SYMBOL(qdisc_warn_nonwc);
 475
 476static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 477{
 478        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 479                                                 timer);
 480
 481        qdisc_unthrottled(wd->qdisc);
 482        __netif_schedule(qdisc_root(wd->qdisc));
 483
 484        return HRTIMER_NORESTART;
 485}
 486
 487void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
 488{
 489        hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 490        wd->timer.function = qdisc_watchdog;
 491        wd->qdisc = qdisc;
 492}
 493EXPORT_SYMBOL(qdisc_watchdog_init);
 494
 495void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
 496{
 497        ktime_t time;
 498
 499        if (test_bit(__QDISC_STATE_DEACTIVATED,
 500                     &qdisc_root_sleeping(wd->qdisc)->state))
 501                return;
 502
 503        qdisc_throttled(wd->qdisc);
 504        time = ktime_set(0, 0);
 505        time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
 506        hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
 507}
 508EXPORT_SYMBOL(qdisc_watchdog_schedule);
 509
 510void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
 511{
 512        hrtimer_cancel(&wd->timer);
 513        qdisc_unthrottled(wd->qdisc);
 514}
 515EXPORT_SYMBOL(qdisc_watchdog_cancel);
 516
 517static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 518{
 519        unsigned int size = n * sizeof(struct hlist_head), i;
 520        struct hlist_head *h;
 521
 522        if (size <= PAGE_SIZE)
 523                h = kmalloc(size, GFP_KERNEL);
 524        else
 525                h = (struct hlist_head *)
 526                        __get_free_pages(GFP_KERNEL, get_order(size));
 527
 528        if (h != NULL) {
 529                for (i = 0; i < n; i++)
 530                        INIT_HLIST_HEAD(&h[i]);
 531        }
 532        return h;
 533}
 534
 535static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
 536{
 537        unsigned int size = n * sizeof(struct hlist_head);
 538
 539        if (size <= PAGE_SIZE)
 540                kfree(h);
 541        else
 542                free_pages((unsigned long)h, get_order(size));
 543}
 544
 545void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
 546{
 547        struct Qdisc_class_common *cl;
 548        struct hlist_node *n, *next;
 549        struct hlist_head *nhash, *ohash;
 550        unsigned int nsize, nmask, osize;
 551        unsigned int i, h;
 552
 553        /* Rehash when load factor exceeds 0.75 */
 554        if (clhash->hashelems * 4 <= clhash->hashsize * 3)
 555                return;
 556        nsize = clhash->hashsize * 2;
 557        nmask = nsize - 1;
 558        nhash = qdisc_class_hash_alloc(nsize);
 559        if (nhash == NULL)
 560                return;
 561
 562        ohash = clhash->hash;
 563        osize = clhash->hashsize;
 564
 565        sch_tree_lock(sch);
 566        for (i = 0; i < osize; i++) {
 567                hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
 568                        h = qdisc_class_hash(cl->classid, nmask);
 569                        hlist_add_head(&cl->hnode, &nhash[h]);
 570                }
 571        }
 572        clhash->hash     = nhash;
 573        clhash->hashsize = nsize;
 574        clhash->hashmask = nmask;
 575        sch_tree_unlock(sch);
 576
 577        qdisc_class_hash_free(ohash, osize);
 578}
 579EXPORT_SYMBOL(qdisc_class_hash_grow);
 580
 581int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 582{
 583        unsigned int size = 4;
 584
 585        clhash->hash = qdisc_class_hash_alloc(size);
 586        if (clhash->hash == NULL)
 587                return -ENOMEM;
 588        clhash->hashsize  = size;
 589        clhash->hashmask  = size - 1;
 590        clhash->hashelems = 0;
 591        return 0;
 592}
 593EXPORT_SYMBOL(qdisc_class_hash_init);
 594
 595void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
 596{
 597        qdisc_class_hash_free(clhash->hash, clhash->hashsize);
 598}
 599EXPORT_SYMBOL(qdisc_class_hash_destroy);
 600
 601void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
 602                             struct Qdisc_class_common *cl)
 603{
 604        unsigned int h;
 605
 606        INIT_HLIST_NODE(&cl->hnode);
 607        h = qdisc_class_hash(cl->classid, clhash->hashmask);
 608        hlist_add_head(&cl->hnode, &clhash->hash[h]);
 609        clhash->hashelems++;
 610}
 611EXPORT_SYMBOL(qdisc_class_hash_insert);
 612
 613void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
 614                             struct Qdisc_class_common *cl)
 615{
 616        hlist_del(&cl->hnode);
 617        clhash->hashelems--;
 618}
 619EXPORT_SYMBOL(qdisc_class_hash_remove);
 620
 621/* Allocate an unique handle from space managed by kernel */
 622
 623static u32 qdisc_alloc_handle(struct net_device *dev)
 624{
 625        int i = 0x10000;
 626        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 627
 628        do {
 629                autohandle += TC_H_MAKE(0x10000U, 0);
 630                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
 631                        autohandle = TC_H_MAKE(0x80000000U, 0);
 632        } while (qdisc_lookup(dev, autohandle) && --i > 0);
 633
 634        return i > 0 ? autohandle : 0;
 635}
 636
 637void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
 638{
 639        const struct Qdisc_class_ops *cops;
 640        unsigned long cl;
 641        u32 parentid;
 642
 643        if (n == 0)
 644                return;
 645        while ((parentid = sch->parent)) {
 646                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
 647                        return;
 648
 649                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
 650                if (sch == NULL) {
 651                        WARN_ON(parentid != TC_H_ROOT);
 652                        return;
 653                }
 654                cops = sch->ops->cl_ops;
 655                if (cops->qlen_notify) {
 656                        cl = cops->get(sch, parentid);
 657                        cops->qlen_notify(sch, cl);
 658                        cops->put(sch, cl);
 659                }
 660                sch->q.qlen -= n;
 661        }
 662}
 663EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
 664
 665static void notify_and_destroy(struct net *net, struct sk_buff *skb,
 666                               struct nlmsghdr *n, u32 clid,
 667                               struct Qdisc *old, struct Qdisc *new)
 668{
 669        if (new || old)
 670                qdisc_notify(net, skb, n, clid, old, new);
 671
 672        if (old)
 673                qdisc_destroy(old);
 674}
 675
 676/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 677 * to device "dev".
 678 *
 679 * When appropriate send a netlink notification using 'skb'
 680 * and "n".
 681 *
 682 * On success, destroy old qdisc.
 683 */
 684
 685static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 686                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
 687                       struct Qdisc *new, struct Qdisc *old)
 688{
 689        struct Qdisc *q = old;
 690        struct net *net = dev_net(dev);
 691        int err = 0;
 692
 693        if (parent == NULL) {
 694                unsigned int i, num_q, ingress;
 695
 696                ingress = 0;
 697                num_q = dev->num_tx_queues;
 698                if ((q && q->flags & TCQ_F_INGRESS) ||
 699                    (new && new->flags & TCQ_F_INGRESS)) {
 700                        num_q = 1;
 701                        ingress = 1;
 702                        if (!dev_ingress_queue(dev))
 703                                return -ENOENT;
 704                }
 705
 706                if (dev->flags & IFF_UP)
 707                        dev_deactivate(dev);
 708
 709                if (new && new->ops->attach) {
 710                        new->ops->attach(new);
 711                        num_q = 0;
 712                }
 713
 714                for (i = 0; i < num_q; i++) {
 715                        struct netdev_queue *dev_queue = dev_ingress_queue(dev);
 716
 717                        if (!ingress)
 718                                dev_queue = netdev_get_tx_queue(dev, i);
 719
 720                        old = dev_graft_qdisc(dev_queue, new);
 721                        if (new && i > 0)
 722                                atomic_inc(&new->refcnt);
 723
 724                        if (!ingress)
 725                                qdisc_destroy(old);
 726                }
 727
 728                if (!ingress) {
 729                        notify_and_destroy(net, skb, n, classid,
 730                                           dev->qdisc, new);
 731                        if (new && !new->ops->attach)
 732                                atomic_inc(&new->refcnt);
 733                        dev->qdisc = new ? : &noop_qdisc;
 734                } else {
 735                        notify_and_destroy(net, skb, n, classid, old, new);
 736                }
 737
 738                if (dev->flags & IFF_UP)
 739                        dev_activate(dev);
 740        } else {
 741                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
 742
 743                err = -EOPNOTSUPP;
 744                if (cops && cops->graft) {
 745                        unsigned long cl = cops->get(parent, classid);
 746                        if (cl) {
 747                                err = cops->graft(parent, cl, new, &old);
 748                                cops->put(parent, cl);
 749                        } else
 750                                err = -ENOENT;
 751                }
 752                if (!err)
 753                        notify_and_destroy(net, skb, n, classid, old, new);
 754        }
 755        return err;
 756}
 757
 758/* Lock classes assigned to a new qdisc's lock in qdisc_create():
     * qdisc_rx_lock for ingress qdiscs, qdisc_tx_lock for all others.
     * The lockdep annotation is needed for ingress; egress gets it only
     * for the name. */
 759static struct lock_class_key qdisc_tx_lock;
 760static struct lock_class_key qdisc_rx_lock;
 761
 762/*
 763   Allocate and initialize new qdisc.
 764
 765   Parameters are passed via opt.
 766 */
 767
 768static struct Qdisc *
 769qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 770             struct Qdisc *p, u32 parent, u32 handle,
 771             struct nlattr **tca, int *errp)
 772{
 773        int err;
 774        struct nlattr *kind = tca[TCA_KIND];
 775        struct Qdisc *sch;
 776        struct Qdisc_ops *ops;
 777        struct qdisc_size_table *stab;
 778
 779        ops = qdisc_lookup_ops(kind);
 780#ifdef CONFIG_MODULES
 781        if (ops == NULL && kind != NULL) {
 782                char name[IFNAMSIZ];
 783                if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
 784                        /* We dropped the RTNL semaphore in order to
 785                         * perform the module load.  So, even if we
 786                         * succeeded in loading the module we have to
 787                         * tell the caller to replay the request.  We
 788                         * indicate this using -EAGAIN.
 789                         * We replay the request because the device may
 790                         * go away in the mean time.
 791                         */
 792                        rtnl_unlock();
 793                        request_module("sch_%s", name);
 794                        rtnl_lock();
 795                        ops = qdisc_lookup_ops(kind);
 796                        if (ops != NULL) {
 797                                /* We will try again qdisc_lookup_ops,
 798                                 * so don't keep a reference.
 799                                 */
 800                                module_put(ops->owner);
 801                                err = -EAGAIN;
 802                                goto err_out;
 803                        }
 804                }
 805        }
 806#endif
 807
 808        err = -ENOENT;
 809        if (ops == NULL)
 810                goto err_out;
 811
 812        sch = qdisc_alloc(dev_queue, ops);
 813        if (IS_ERR(sch)) {
 814                err = PTR_ERR(sch);
 815                goto err_out2;
 816        }
 817
 818        sch->parent = parent;
 819
 820        if (handle == TC_H_INGRESS) {
 821                sch->flags |= TCQ_F_INGRESS;
 822                handle = TC_H_MAKE(TC_H_INGRESS, 0);
 823                lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
 824        } else {
 825                if (handle == 0) {
 826                        handle = qdisc_alloc_handle(dev);
 827                        err = -ENOMEM;
 828                        if (handle == 0)
 829                                goto err_out3;
 830                }
 831                lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
 832        }
 833
 834        sch->handle = handle;
 835
 836        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
 837                if (tca[TCA_STAB]) {
 838                        stab = qdisc_get_stab(tca[TCA_STAB]);
 839                        if (IS_ERR(stab)) {
 840                                err = PTR_ERR(stab);
 841                                goto err_out4;
 842                        }
 843                        rcu_assign_pointer(sch->stab, stab);
 844                }
 845                if (tca[TCA_RATE]) {
 846                        spinlock_t *root_lock;
 847
 848                        err = -EOPNOTSUPP;
 849                        if (sch->flags & TCQ_F_MQROOT)
 850                                goto err_out4;
 851
 852                        if ((sch->parent != TC_H_ROOT) &&
 853                            !(sch->flags & TCQ_F_INGRESS) &&
 854                            (!p || !(p->flags & TCQ_F_MQROOT)))
 855                                root_lock = qdisc_root_sleeping_lock(sch);
 856                        else
 857                                root_lock = qdisc_lock(sch);
 858
 859                        err = gen_new_estimator(&sch->bstats, &sch->rate_est,
 860                                                root_lock, tca[TCA_RATE]);
 861                        if (err)
 862                                goto err_out4;
 863                }
 864
 865                qdisc_list_add(sch);
 866
 867                return sch;
 868        }
 869err_out3:
 870        dev_put(dev);
 871        kfree((char *) sch - sch->padded);
 872err_out2:
 873        module_put(ops->owner);
 874err_out:
 875        *errp = err;
 876        return NULL;
 877
 878err_out4:
 879        /*
 880         * Any broken qdiscs that would require a ops->reset() here?
 881         * The qdisc was never in action so it shouldn't be necessary.
 882         */
 883        qdisc_put_stab(rtnl_dereference(sch->stab));
 884        if (ops->destroy)
 885                ops->destroy(sch);
 886        goto err_out3;
 887}
 888
 889static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
 890{
 891        struct qdisc_size_table *ostab, *stab = NULL;
 892        int err = 0;
 893
 894        if (tca[TCA_OPTIONS]) {
 895                if (sch->ops->change == NULL)
 896                        return -EINVAL;
 897                err = sch->ops->change(sch, tca[TCA_OPTIONS]);
 898                if (err)
 899                        return err;
 900        }
 901
 902        if (tca[TCA_STAB]) {
 903                stab = qdisc_get_stab(tca[TCA_STAB]);
 904                if (IS_ERR(stab))
 905                        return PTR_ERR(stab);
 906        }
 907
 908        ostab = rtnl_dereference(sch->stab);
 909        rcu_assign_pointer(sch->stab, stab);
 910        qdisc_put_stab(ostab);
 911
 912        if (tca[TCA_RATE]) {
 913                /* NB: ignores errors from replace_estimator
 914                   because change can't be undone. */
 915                if (sch->flags & TCQ_F_MQROOT)
 916                        goto out;
 917                gen_replace_estimator(&sch->bstats, &sch->rate_est,
 918                                            qdisc_root_sleeping_lock(sch),
 919                                            tca[TCA_RATE]);
 920        }
 921out:
 922        return 0;
 923}
 924
 925struct check_loop_arg {
 926        struct qdisc_walker     w;
 927        struct Qdisc            *p;
 928        int                     depth;
 929};
 930
 931static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
 932
 933static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
 934{
 935        struct check_loop_arg   arg;
 936
 937        if (q->ops->cl_ops == NULL)
 938                return 0;
 939
 940        arg.w.stop = arg.w.skip = arg.w.count = 0;
 941        arg.w.fn = check_loop_fn;
 942        arg.depth = depth;
 943        arg.p = p;
 944        q->ops->cl_ops->walk(q, &arg.w);
 945        return arg.w.stop ? -ELOOP : 0;
 946}
 947
 948static int
 949check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
 950{
 951        struct Qdisc *leaf;
 952        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
 953        struct check_loop_arg *arg = (struct check_loop_arg *)w;
 954
 955        leaf = cops->leaf(q, cl);
 956        if (leaf) {
 957                if (leaf == arg->p || arg->depth > 7)
 958                        return -ELOOP;
 959                return check_loop(leaf, arg->p, arg->depth + 1);
 960        }
 961        return 0;
 962}
 963
 964/*
 965 * Delete/get qdisc.
 966 */
 967
 968static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 969{
 970        struct net *net = sock_net(skb->sk);
 971        struct tcmsg *tcm = NLMSG_DATA(n);
 972        struct nlattr *tca[TCA_MAX + 1];
 973        struct net_device *dev;
 974        u32 clid = tcm->tcm_parent;
 975        struct Qdisc *q = NULL;
 976        struct Qdisc *p = NULL;
 977        int err;
 978
 979        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
 980        if (!dev)
 981                return -ENODEV;
 982
 983        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
 984        if (err < 0)
 985                return err;
 986
 987        if (clid) {
 988                if (clid != TC_H_ROOT) {
 989                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
 990                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
 991                                if (!p)
 992                                        return -ENOENT;
 993                                q = qdisc_leaf(p, clid);
 994                        } else if (dev_ingress_queue(dev)) {
 995                                q = dev_ingress_queue(dev)->qdisc_sleeping;
 996                        }
 997                } else {
 998                        q = dev->qdisc;
 999                }
1000                if (!q)
1001                        return -ENOENT;
1002
1003                if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1004                        return -EINVAL;
1005        } else {
1006                q = qdisc_lookup(dev, tcm->tcm_handle);
1007                if (!q)
1008                        return -ENOENT;
1009        }
1010
1011        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1012                return -EINVAL;
1013
1014        if (n->nlmsg_type == RTM_DELQDISC) {
1015                if (!clid)
1016                        return -EINVAL;
1017                if (q->handle == 0)
1018                        return -ENOENT;
1019                err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1020                if (err != 0)
1021                        return err;
1022        } else {
1023                qdisc_notify(net, skb, n, clid, NULL, q);
1024        }
1025        return 0;
1026}
1027
1028/*
1029 * Create/change qdisc.
1030 */
1031
1032static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1033{
1034        struct net *net = sock_net(skb->sk);
1035        struct tcmsg *tcm;
1036        struct nlattr *tca[TCA_MAX + 1];
1037        struct net_device *dev;
1038        u32 clid;
1039        struct Qdisc *q, *p;
1040        int err;
1041
1042replay:
1043        /* Reinit, just in case something touches this. */
1044        tcm = NLMSG_DATA(n);
1045        clid = tcm->tcm_parent;
1046        q = p = NULL;
1047
1048        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1049        if (!dev)
1050                return -ENODEV;
1051
1052        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1053        if (err < 0)
1054                return err;
1055
1056        if (clid) {
1057                if (clid != TC_H_ROOT) {
1058                        if (clid != TC_H_INGRESS) {
1059                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
1060                                if (!p)
1061                                        return -ENOENT;
1062                                q = qdisc_leaf(p, clid);
1063                        } else if (dev_ingress_queue_create(dev)) {
1064                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1065                        }
1066                } else {
1067                        q = dev->qdisc;
1068                }
1069
1070                /* It may be default qdisc, ignore it */
1071                if (q && q->handle == 0)
1072                        q = NULL;
1073
1074                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1075                        if (tcm->tcm_handle) {
1076                                if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1077                                        return -EEXIST;
1078                                if (TC_H_MIN(tcm->tcm_handle))
1079                                        return -EINVAL;
1080                                q = qdisc_lookup(dev, tcm->tcm_handle);
1081                                if (!q)
1082                                        goto create_n_graft;
1083                                if (n->nlmsg_flags & NLM_F_EXCL)
1084                                        return -EEXIST;
1085                                if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1086                                        return -EINVAL;
1087                                if (q == p ||
1088                                    (p && check_loop(q, p, 0)))
1089                                        return -ELOOP;
1090                                atomic_inc(&q->refcnt);
1091                                goto graft;
1092                        } else {
1093                                if (!q)
1094                                        goto create_n_graft;
1095
1096                                /* This magic test requires explanation.
1097                                 *
1098                                 *   We know, that some child q is already
1099                                 *   attached to this parent and have choice:
1100                                 *   either to change it or to create/graft new one.
1101                                 *
1102                                 *   1. We are allowed to create/graft only
1103                                 *   if CREATE and REPLACE flags are set.
1104                                 *
1105                                 *   2. If EXCL is set, requestor wanted to say,
1106                                 *   that qdisc tcm_handle is not expected
1107                                 *   to exist, so that we choose create/graft too.
1108                                 *
1109                                 *   3. The last case is when no flags are set.
1110                                 *   Alas, it is sort of hole in API, we
1111                                 *   cannot decide what to do unambiguously.
1112                                 *   For now we select create/graft, if
1113                                 *   user gave KIND, which does not match existing.
1114                                 */
1115                                if ((n->nlmsg_flags & NLM_F_CREATE) &&
1116                                    (n->nlmsg_flags & NLM_F_REPLACE) &&
1117                                    ((n->nlmsg_flags & NLM_F_EXCL) ||
1118                                     (tca[TCA_KIND] &&
1119                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1120                                        goto create_n_graft;
1121                        }
1122                }
1123        } else {
1124                if (!tcm->tcm_handle)
1125                        return -EINVAL;
1126                q = qdisc_lookup(dev, tcm->tcm_handle);
1127        }
1128
1129        /* Change qdisc parameters */
1130        if (q == NULL)
1131                return -ENOENT;
1132        if (n->nlmsg_flags & NLM_F_EXCL)
1133                return -EEXIST;
1134        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1135                return -EINVAL;
1136        err = qdisc_change(q, tca);
1137        if (err == 0)
1138                qdisc_notify(net, skb, n, clid, NULL, q);
1139        return err;
1140
1141create_n_graft:
1142        if (!(n->nlmsg_flags & NLM_F_CREATE))
1143                return -ENOENT;
1144        if (clid == TC_H_INGRESS) {
1145                if (dev_ingress_queue(dev))
1146                        q = qdisc_create(dev, dev_ingress_queue(dev), p,
1147                                         tcm->tcm_parent, tcm->tcm_parent,
1148                                         tca, &err);
1149                else
1150                        err = -ENOENT;
1151        } else {
1152                struct netdev_queue *dev_queue;
1153
1154                if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1155                        dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1156                else if (p)
1157                        dev_queue = p->dev_queue;
1158                else
1159                        dev_queue = netdev_get_tx_queue(dev, 0);
1160
1161                q = qdisc_create(dev, dev_queue, p,
1162                                 tcm->tcm_parent, tcm->tcm_handle,
1163                                 tca, &err);
1164        }
1165        if (q == NULL) {
1166                if (err == -EAGAIN)
1167                        goto replay;
1168                return err;
1169        }
1170
1171graft:
1172        err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1173        if (err) {
1174                if (q)
1175                        qdisc_destroy(q);
1176                return err;
1177        }
1178
1179        return 0;
1180}
1181
1182static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1183                         u32 pid, u32 seq, u16 flags, int event)
1184{
1185        struct tcmsg *tcm;
1186        struct nlmsghdr  *nlh;
1187        unsigned char *b = skb_tail_pointer(skb);
1188        struct gnet_dump d;
1189        struct qdisc_size_table *stab;
1190
1191        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1192        tcm = NLMSG_DATA(nlh);
1193        tcm->tcm_family = AF_UNSPEC;
1194        tcm->tcm__pad1 = 0;
1195        tcm->tcm__pad2 = 0;
1196        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1197        tcm->tcm_parent = clid;
1198        tcm->tcm_handle = q->handle;
1199        tcm->tcm_info = atomic_read(&q->refcnt);
1200        NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1201        if (q->ops->dump && q->ops->dump(q, skb) < 0)
1202                goto nla_put_failure;
1203        q->qstats.qlen = q->q.qlen;
1204
1205        stab = rtnl_dereference(q->stab);
1206        if (stab && qdisc_dump_stab(skb, stab) < 0)
1207                goto nla_put_failure;
1208
1209        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1210                                         qdisc_root_sleeping_lock(q), &d) < 0)
1211                goto nla_put_failure;
1212
1213        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1214                goto nla_put_failure;
1215
1216        if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1217            gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1218            gnet_stats_copy_queue(&d, &q->qstats) < 0)
1219                goto nla_put_failure;
1220
1221        if (gnet_stats_finish_copy(&d) < 0)
1222                goto nla_put_failure;
1223
1224        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1225        return skb->len;
1226
1227nlmsg_failure:
1228nla_put_failure:
1229        nlmsg_trim(skb, b);
1230        return -1;
1231}
1232
1233static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1234{
1235        return (q->flags & TCQ_F_BUILTIN) ? true : false;
1236}
1237
1238static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1239                        struct nlmsghdr *n, u32 clid,
1240                        struct Qdisc *old, struct Qdisc *new)
1241{
1242        struct sk_buff *skb;
1243        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1244
1245        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1246        if (!skb)
1247                return -ENOBUFS;
1248
1249        if (old && !tc_qdisc_dump_ignore(old)) {
1250                if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq,
1251                                  0, RTM_DELQDISC) < 0)
1252                        goto err_out;
1253        }
1254        if (new && !tc_qdisc_dump_ignore(new)) {
1255                if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq,
1256                                  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1257                        goto err_out;
1258        }
1259
1260        if (skb->len)
1261                return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
1262                                      n->nlmsg_flags & NLM_F_ECHO);
1263
1264err_out:
1265        kfree_skb(skb);
1266        return -EINVAL;
1267}
1268
1269static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1270                              struct netlink_callback *cb,
1271                              int *q_idx_p, int s_q_idx)
1272{
1273        int ret = 0, q_idx = *q_idx_p;
1274        struct Qdisc *q;
1275
1276        if (!root)
1277                return 0;
1278
1279        q = root;
1280        if (q_idx < s_q_idx) {
1281                q_idx++;
1282        } else {
1283                if (!tc_qdisc_dump_ignore(q) &&
1284                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1285                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1286                        goto done;
1287                q_idx++;
1288        }
1289        list_for_each_entry(q, &root->list, list) {
1290                if (q_idx < s_q_idx) {
1291                        q_idx++;
1292                        continue;
1293                }
1294                if (!tc_qdisc_dump_ignore(q) &&
1295                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1296                                  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1297                        goto done;
1298                q_idx++;
1299        }
1300
1301out:
1302        *q_idx_p = q_idx;
1303        return ret;
1304done:
1305        ret = -1;
1306        goto out;
1307}
1308
1309static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1310{
1311        struct net *net = sock_net(skb->sk);
1312        int idx, q_idx;
1313        int s_idx, s_q_idx;
1314        struct net_device *dev;
1315
1316        s_idx = cb->args[0];
1317        s_q_idx = q_idx = cb->args[1];
1318
1319        rcu_read_lock();
1320        idx = 0;
1321        for_each_netdev_rcu(net, dev) {
1322                struct netdev_queue *dev_queue;
1323
1324                if (idx < s_idx)
1325                        goto cont;
1326                if (idx > s_idx)
1327                        s_q_idx = 0;
1328                q_idx = 0;
1329
1330                if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1331                        goto done;
1332
1333                dev_queue = dev_ingress_queue(dev);
1334                if (dev_queue &&
1335                    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1336                                       &q_idx, s_q_idx) < 0)
1337                        goto done;
1338
1339cont:
1340                idx++;
1341        }
1342
1343done:
1344        rcu_read_unlock();
1345
1346        cb->args[0] = idx;
1347        cb->args[1] = q_idx;
1348
1349        return skb->len;
1350}
1351
1352
1353
1354/************************************************
1355 *      Traffic classes manipulation.           *
1356 ************************************************/
1357
1358
1359
1360static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1361{
1362        struct net *net = sock_net(skb->sk);
1363        struct tcmsg *tcm = NLMSG_DATA(n);
1364        struct nlattr *tca[TCA_MAX + 1];
1365        struct net_device *dev;
1366        struct Qdisc *q = NULL;
1367        const struct Qdisc_class_ops *cops;
1368        unsigned long cl = 0;
1369        unsigned long new_cl;
1370        u32 pid = tcm->tcm_parent;
1371        u32 clid = tcm->tcm_handle;
1372        u32 qid = TC_H_MAJ(clid);
1373        int err;
1374
1375        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1376        if (!dev)
1377                return -ENODEV;
1378
1379        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1380        if (err < 0)
1381                return err;
1382
1383        /*
1384           parent == TC_H_UNSPEC - unspecified parent.
1385           parent == TC_H_ROOT   - class is root, which has no parent.
1386           parent == X:0         - parent is root class.
1387           parent == X:Y         - parent is a node in hierarchy.
1388           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1389
1390           handle == 0:0         - generate handle from kernel pool.
1391           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1392           handle == X:Y         - clear.
1393           handle == X:0         - root class.
1394         */
1395
1396        /* Step 1. Determine qdisc handle X:0 */
1397
1398        if (pid != TC_H_ROOT) {
1399                u32 qid1 = TC_H_MAJ(pid);
1400
1401                if (qid && qid1) {
1402                        /* If both majors are known, they must be identical. */
1403                        if (qid != qid1)
1404                                return -EINVAL;
1405                } else if (qid1) {
1406                        qid = qid1;
1407                } else if (qid == 0)
1408                        qid = dev->qdisc->handle;
1409
1410                /* Now qid is genuine qdisc handle consistent
1411                 * both with parent and child.
1412                 *
1413                 * TC_H_MAJ(pid) still may be unspecified, complete it now.
1414                 */
1415                if (pid)
1416                        pid = TC_H_MAKE(qid, pid);
1417        } else {
1418                if (qid == 0)
1419                        qid = dev->qdisc->handle;
1420        }
1421
1422        /* OK. Locate qdisc */
1423        q = qdisc_lookup(dev, qid);
1424        if (!q)
1425                return -ENOENT;
1426
1427        /* An check that it supports classes */
1428        cops = q->ops->cl_ops;
1429        if (cops == NULL)
1430                return -EINVAL;
1431
1432        /* Now try to get class */
1433        if (clid == 0) {
1434                if (pid == TC_H_ROOT)
1435                        clid = qid;
1436        } else
1437                clid = TC_H_MAKE(qid, clid);
1438
1439        if (clid)
1440                cl = cops->get(q, clid);
1441
1442        if (cl == 0) {
1443                err = -ENOENT;
1444                if (n->nlmsg_type != RTM_NEWTCLASS ||
1445                    !(n->nlmsg_flags & NLM_F_CREATE))
1446                        goto out;
1447        } else {
1448                switch (n->nlmsg_type) {
1449                case RTM_NEWTCLASS:
1450                        err = -EEXIST;
1451                        if (n->nlmsg_flags & NLM_F_EXCL)
1452                                goto out;
1453                        break;
1454                case RTM_DELTCLASS:
1455                        err = -EOPNOTSUPP;
1456                        if (cops->delete)
1457                                err = cops->delete(q, cl);
1458                        if (err == 0)
1459                                tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1460                        goto out;
1461                case RTM_GETTCLASS:
1462                        err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1463                        goto out;
1464                default:
1465                        err = -EINVAL;
1466                        goto out;
1467                }
1468        }
1469
1470        new_cl = cl;
1471        err = -EOPNOTSUPP;
1472        if (cops->change)
1473                err = cops->change(q, clid, pid, tca, &new_cl);
1474        if (err == 0)
1475                tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1476
1477out:
1478        if (cl)
1479                cops->put(q, cl);
1480
1481        return err;
1482}
1483
1484
1485static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1486                          unsigned long cl,
1487                          u32 pid, u32 seq, u16 flags, int event)
1488{
1489        struct tcmsg *tcm;
1490        struct nlmsghdr  *nlh;
1491        unsigned char *b = skb_tail_pointer(skb);
1492        struct gnet_dump d;
1493        const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1494
1495        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1496        tcm = NLMSG_DATA(nlh);
1497        tcm->tcm_family = AF_UNSPEC;
1498        tcm->tcm__pad1 = 0;
1499        tcm->tcm__pad2 = 0;
1500        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1501        tcm->tcm_parent = q->handle;
1502        tcm->tcm_handle = q->handle;
1503        tcm->tcm_info = 0;
1504        NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1505        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1506                goto nla_put_failure;
1507
1508        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1509                                         qdisc_root_sleeping_lock(q), &d) < 0)
1510                goto nla_put_failure;
1511
1512        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1513                goto nla_put_failure;
1514
1515        if (gnet_stats_finish_copy(&d) < 0)
1516                goto nla_put_failure;
1517
1518        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1519        return skb->len;
1520
1521nlmsg_failure:
1522nla_put_failure:
1523        nlmsg_trim(skb, b);
1524        return -1;
1525}
1526
1527static int tclass_notify(struct net *net, struct sk_buff *oskb,
1528                         struct nlmsghdr *n, struct Qdisc *q,
1529                         unsigned long cl, int event)
1530{
1531        struct sk_buff *skb;
1532        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1533
1534        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1535        if (!skb)
1536                return -ENOBUFS;
1537
1538        if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1539                kfree_skb(skb);
1540                return -EINVAL;
1541        }
1542
1543        return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
1544                              n->nlmsg_flags & NLM_F_ECHO);
1545}
1546
1547struct qdisc_dump_args {
1548        struct qdisc_walker     w;
1549        struct sk_buff          *skb;
1550        struct netlink_callback *cb;
1551};
1552
1553static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1554{
1555        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1556
1557        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1558                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1559}
1560
1561static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1562                                struct tcmsg *tcm, struct netlink_callback *cb,
1563                                int *t_p, int s_t)
1564{
1565        struct qdisc_dump_args arg;
1566
1567        if (tc_qdisc_dump_ignore(q) ||
1568            *t_p < s_t || !q->ops->cl_ops ||
1569            (tcm->tcm_parent &&
1570             TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1571                (*t_p)++;
1572                return 0;
1573        }
1574        if (*t_p > s_t)
1575                memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1576        arg.w.fn = qdisc_class_dump;
1577        arg.skb = skb;
1578        arg.cb = cb;
1579        arg.w.stop  = 0;
1580        arg.w.skip = cb->args[1];
1581        arg.w.count = 0;
1582        q->ops->cl_ops->walk(q, &arg.w);
1583        cb->args[1] = arg.w.count;
1584        if (arg.w.stop)
1585                return -1;
1586        (*t_p)++;
1587        return 0;
1588}
1589
1590static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1591                               struct tcmsg *tcm, struct netlink_callback *cb,
1592                               int *t_p, int s_t)
1593{
1594        struct Qdisc *q;
1595
1596        if (!root)
1597                return 0;
1598
1599        if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1600                return -1;
1601
1602        list_for_each_entry(q, &root->list, list) {
1603                if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1604                        return -1;
1605        }
1606
1607        return 0;
1608}
1609
1610static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1611{
1612        struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
1613        struct net *net = sock_net(skb->sk);
1614        struct netdev_queue *dev_queue;
1615        struct net_device *dev;
1616        int t, s_t;
1617
1618        if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1619                return 0;
1620        dev = dev_get_by_index(net, tcm->tcm_ifindex);
1621        if (!dev)
1622                return 0;
1623
1624        s_t = cb->args[0];
1625        t = 0;
1626
1627        if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1628                goto done;
1629
1630        dev_queue = dev_ingress_queue(dev);
1631        if (dev_queue &&
1632            tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1633                                &t, s_t) < 0)
1634                goto done;
1635
1636done:
1637        cb->args[0] = t;
1638
1639        dev_put(dev);
1640        return skb->len;
1641}
1642
1643/* Main classifier routine: scans classifier chain attached
1644 * to this qdisc, (optionally) tests for protocol and asks
1645 * specific classifiers.
1646 */
1647int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1648                       struct tcf_result *res)
1649{
1650        __be16 protocol = skb->protocol;
1651        int err;
1652
1653        for (; tp; tp = tp->next) {
1654                if (tp->protocol != protocol &&
1655                    tp->protocol != htons(ETH_P_ALL))
1656                        continue;
1657                err = tp->classify(skb, tp, res);
1658
1659                if (err >= 0) {
1660#ifdef CONFIG_NET_CLS_ACT
1661                        if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1662                                skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1663#endif
1664                        return err;
1665                }
1666        }
1667        return -1;
1668}
1669EXPORT_SYMBOL(tc_classify_compat);
1670
/*
 * Classify @skb against the chain @tp.
 *
 * Without CONFIG_NET_CLS_ACT this is just tc_classify_compat().  With
 * it, a TC_ACT_RECLASSIFY result restarts classification from the head
 * of the chain; the restart count is kept in skb->tc_verd and once it
 * has reached MAX_REC_LOOP the packet gets TC_ACT_SHOT (with a
 * rate-limited notice).
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
#ifdef CONFIG_NET_CLS_ACT
	const struct tcf_proto *head = tp;

	for (;;) {
		int err = tc_classify_compat(skb, tp, res);
		u32 verd;

		if (err != TC_ACT_RECLASSIFY)
			return err;

		verd = G_TC_VERD(skb->tc_verd);
		tp = head;

		if (verd++ >= MAX_REC_LOOP) {
			if (net_ratelimit())
				pr_notice("%s: packet reclassify loop"
					  " rule prio %u protocol %02x\n",
					  tp->q->ops->id,
					  tp->prio & 0xffff,
					  ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
	}
#else
	return tc_classify_compat(skb, tp, res);
#endif
}
EXPORT_SYMBOL(tc_classify);
1702
1703void tcf_destroy(struct tcf_proto *tp)
1704{
1705        tp->ops->destroy(tp);
1706        module_put(tp->ops->owner);
1707        kfree(tp);
1708}
1709
1710void tcf_destroy_chain(struct tcf_proto **fl)
1711{
1712        struct tcf_proto *tp;
1713
1714        while ((tp = *fl) != NULL) {
1715                *fl = tp->next;
1716                tcf_destroy(tp);
1717        }
1718}
1719EXPORT_SYMBOL(tcf_destroy_chain);
1720
1721#ifdef CONFIG_PROC_FS
1722static int psched_show(struct seq_file *seq, void *v)
1723{
1724        struct timespec ts;
1725
1726        hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1727        seq_printf(seq, "%08x %08x %08x %08x\n",
1728                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1729                   1000000,
1730                   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1731
1732        return 0;
1733}
1734
1735static int psched_open(struct inode *inode, struct file *file)
1736{
1737        return single_open(file, psched_show, NULL);
1738}
1739
1740static const struct file_operations psched_fops = {
1741        .owner = THIS_MODULE,
1742        .open = psched_open,
1743        .read  = seq_read,
1744        .llseek = seq_lseek,
1745        .release = single_release,
1746};
1747
1748static int __net_init psched_net_init(struct net *net)
1749{
1750        struct proc_dir_entry *e;
1751
1752        e = proc_net_fops_create(net, "psched", 0, &psched_fops);
1753        if (e == NULL)
1754                return -ENOMEM;
1755
1756        return 0;
1757}
1758
1759static void __net_exit psched_net_exit(struct net *net)
1760{
1761        proc_net_remove(net, "psched");
1762}
1763#else
1764static int __net_init psched_net_init(struct net *net)
1765{
1766        return 0;
1767}
1768
1769static void __net_exit psched_net_exit(struct net *net)
1770{
1771}
1772#endif
1773
1774static struct pernet_operations psched_net_ops = {
1775        .init = psched_net_init,
1776        .exit = psched_net_exit,
1777};
1778
1779static int __init pktsched_init(void)
1780{
1781        int err;
1782
1783        err = register_pernet_subsys(&psched_net_ops);
1784        if (err) {
1785                pr_err("pktsched_init: "
1786                       "cannot initialize per netns operations\n");
1787                return err;
1788        }
1789
1790        register_qdisc(&pfifo_qdisc_ops);
1791        register_qdisc(&bfifo_qdisc_ops);
1792        register_qdisc(&pfifo_head_drop_qdisc_ops);
1793        register_qdisc(&mq_qdisc_ops);
1794
1795        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1796        rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1797        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1798        rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1799        rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1800        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1801
1802        return 0;
1803}
1804
1805subsys_initcall(pktsched_init);
1806
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.