linux/net/sched/sch_api.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * net/sched/sch_api.c  Packet scheduler API.
   4 *
   5 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
   6 *
   7 * Fixes:
   8 *
   9 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  10 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  11 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  12 */
  13
  14#include <linux/module.h>
  15#include <linux/types.h>
  16#include <linux/kernel.h>
  17#include <linux/string.h>
  18#include <linux/errno.h>
  19#include <linux/skbuff.h>
  20#include <linux/init.h>
  21#include <linux/proc_fs.h>
  22#include <linux/seq_file.h>
  23#include <linux/kmod.h>
  24#include <linux/list.h>
  25#include <linux/hrtimer.h>
  26#include <linux/slab.h>
  27#include <linux/hashtable.h>
  28
  29#include <net/net_namespace.h>
  30#include <net/sock.h>
  31#include <net/netlink.h>
  32#include <net/pkt_sched.h>
  33#include <net/pkt_cls.h>
  34
  35#include <trace/events/qdisc.h>
  36
  37/*
  38
  39   Short review.
  40   -------------
  41
  42   This file consists of two interrelated parts:
  43
  44   1. queueing disciplines manager frontend.
  45   2. traffic classes manager frontend.
  46
  47   Generally, queueing discipline ("qdisc") is a black box,
  48   which is able to enqueue packets and to dequeue them (when
  49   device is ready to send something) in order and at times
  50   determined by algorithm hidden in it.
  51
   Qdiscs are divided into two categories:
  53   - "queues", which have no internal structure visible from outside.
  54   - "schedulers", which split all the packets to "traffic classes",
  55     using "packet classifiers" (look at cls_api.c)
  56
   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.
  59
  60   The goal of the routines in this file is to translate
  61   information supplied by user in the form of handles
  62   to more intelligible for kernel form, to make some sanity
  63   checks and part of work, which is common to all qdiscs
  64   and to provide rtnetlink notifications.
  65
  66   All real intelligent work is done inside qdisc modules.
  67
  68
  69
  70   Every discipline has two major routines: enqueue and dequeue.
  71
  72   ---dequeue
  73
  74   dequeue usually returns a skb to send. It is allowed to return NULL,
  75   but it does not mean that queue is empty, it just means that
  76   discipline does not want to send anything this time.
  77   Queue is really empty if q->q.qlen == 0.
  78   For complicated disciplines with multiple queues q->q is not
  79   real packet queue, but however q->q.qlen must be valid.
  80
  81   ---enqueue
  82
   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
  86   NET_XMIT_DROP        - this packet dropped
  87     Expected action: do not backoff, but wait until queue will clear.
  88   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
  89     Expected action: backoff or ignore
  90
  91   Auxiliary routines:
  92
  93   ---peek
  94
  95   like dequeue but without removing a packet from the queue
  96
  97   ---reset
  98
  99   returns qdisc to initial state: purge all buffers, clear all
 100   timers, counters (except for statistics) etc.
 101
 102   ---init
 103
 104   initializes newly created qdisc.
 105
 106   ---destroy
 107
 108   destroys resources allocated by init and during lifetime of qdisc.
 109
 110   ---change
 111
 112   changes qdisc parameters.
 113 */
 114
/* Protects the list of registered TC modules (qdisc_base). It is a pure
 * SMP lock: writers are register/unregister and the default-qdisc setter,
 * readers are the by-name lookups.
 */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *      Queueing disciplines manipulation.      *
 ************************************************/


/* Head of the singly linked list (chained through ->next) of all
 * installed queueing disciplines.
 */

static struct Qdisc_ops *qdisc_base;
 127
 128/* Register/unregister queueing discipline */
 129
 130int register_qdisc(struct Qdisc_ops *qops)
 131{
 132        struct Qdisc_ops *q, **qp;
 133        int rc = -EEXIST;
 134
 135        write_lock(&qdisc_mod_lock);
 136        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 137                if (!strcmp(qops->id, q->id))
 138                        goto out;
 139
 140        if (qops->enqueue == NULL)
 141                qops->enqueue = noop_qdisc_ops.enqueue;
 142        if (qops->peek == NULL) {
 143                if (qops->dequeue == NULL)
 144                        qops->peek = noop_qdisc_ops.peek;
 145                else
 146                        goto out_einval;
 147        }
 148        if (qops->dequeue == NULL)
 149                qops->dequeue = noop_qdisc_ops.dequeue;
 150
 151        if (qops->cl_ops) {
 152                const struct Qdisc_class_ops *cops = qops->cl_ops;
 153
 154                if (!(cops->find && cops->walk && cops->leaf))
 155                        goto out_einval;
 156
 157                if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
 158                        goto out_einval;
 159        }
 160
 161        qops->next = NULL;
 162        *qp = qops;
 163        rc = 0;
 164out:
 165        write_unlock(&qdisc_mod_lock);
 166        return rc;
 167
 168out_einval:
 169        rc = -EINVAL;
 170        goto out;
 171}
 172EXPORT_SYMBOL(register_qdisc);
 173
 174int unregister_qdisc(struct Qdisc_ops *qops)
 175{
 176        struct Qdisc_ops *q, **qp;
 177        int err = -ENOENT;
 178
 179        write_lock(&qdisc_mod_lock);
 180        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
 181                if (q == qops)
 182                        break;
 183        if (q) {
 184                *qp = q->next;
 185                q->next = NULL;
 186                err = 0;
 187        }
 188        write_unlock(&qdisc_mod_lock);
 189        return err;
 190}
 191EXPORT_SYMBOL(unregister_qdisc);
 192
 193/* Get default qdisc if not otherwise specified */
 194void qdisc_get_default(char *name, size_t len)
 195{
 196        read_lock(&qdisc_mod_lock);
 197        strlcpy(name, default_qdisc_ops->id, len);
 198        read_unlock(&qdisc_mod_lock);
 199}
 200
 201static struct Qdisc_ops *qdisc_lookup_default(const char *name)
 202{
 203        struct Qdisc_ops *q = NULL;
 204
 205        for (q = qdisc_base; q; q = q->next) {
 206                if (!strcmp(name, q->id)) {
 207                        if (!try_module_get(q->owner))
 208                                q = NULL;
 209                        break;
 210                }
 211        }
 212
 213        return q;
 214}
 215
 216/* Set new default qdisc to use */
 217int qdisc_set_default(const char *name)
 218{
 219        const struct Qdisc_ops *ops;
 220
 221        if (!capable(CAP_NET_ADMIN))
 222                return -EPERM;
 223
 224        write_lock(&qdisc_mod_lock);
 225        ops = qdisc_lookup_default(name);
 226        if (!ops) {
 227                /* Not found, drop lock and try to load module */
 228                write_unlock(&qdisc_mod_lock);
 229                request_module("sch_%s", name);
 230                write_lock(&qdisc_mod_lock);
 231
 232                ops = qdisc_lookup_default(name);
 233        }
 234
 235        if (ops) {
 236                /* Set new default */
 237                module_put(default_qdisc_ops->owner);
 238                default_qdisc_ops = ops;
 239        }
 240        write_unlock(&qdisc_mod_lock);
 241
 242        return ops ? 0 : -ENOENT;
 243}
 244
#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config.
 *
 * Runs as a late initcall and forwards CONFIG_DEFAULT_NET_SCH to
 * qdisc_set_default(), returning its result.
 */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif
 253
 254/* We know handle. Find qdisc among all qdisc's attached to device
 255 * (root qdisc, all its children, children of children etc.)
 256 * Note: caller either uses rtnl or rcu_read_lock()
 257 */
 258
 259static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 260{
 261        struct Qdisc *q;
 262
 263        if (!qdisc_dev(root))
 264                return (root->handle == handle ? root : NULL);
 265
 266        if (!(root->flags & TCQ_F_BUILTIN) &&
 267            root->handle == handle)
 268                return root;
 269
 270        hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
 271                                   lockdep_rtnl_is_held()) {
 272                if (q->handle == handle)
 273                        return q;
 274        }
 275        return NULL;
 276}
 277
 278void qdisc_hash_add(struct Qdisc *q, bool invisible)
 279{
 280        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
 281                ASSERT_RTNL();
 282                hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
 283                if (invisible)
 284                        q->flags |= TCQ_F_INVISIBLE;
 285        }
 286}
 287EXPORT_SYMBOL(qdisc_hash_add);
 288
 289void qdisc_hash_del(struct Qdisc *q)
 290{
 291        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
 292                ASSERT_RTNL();
 293                hash_del_rcu(&q->hash);
 294        }
 295}
 296EXPORT_SYMBOL(qdisc_hash_del);
 297
 298struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 299{
 300        struct Qdisc *q;
 301
 302        if (!handle)
 303                return NULL;
 304        q = qdisc_match_from_root(dev->qdisc, handle);
 305        if (q)
 306                goto out;
 307
 308        if (dev_ingress_queue(dev))
 309                q = qdisc_match_from_root(
 310                        dev_ingress_queue(dev)->qdisc_sleeping,
 311                        handle);
 312out:
 313        return q;
 314}
 315
 316struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
 317{
 318        struct netdev_queue *nq;
 319        struct Qdisc *q;
 320
 321        if (!handle)
 322                return NULL;
 323        q = qdisc_match_from_root(dev->qdisc, handle);
 324        if (q)
 325                goto out;
 326
 327        nq = dev_ingress_queue_rcu(dev);
 328        if (nq)
 329                q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
 330out:
 331        return q;
 332}
 333
 334static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 335{
 336        unsigned long cl;
 337        const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 338
 339        if (cops == NULL)
 340                return NULL;
 341        cl = cops->find(p, classid);
 342
 343        if (cl == 0)
 344                return NULL;
 345        return cops->leaf(p, cl);
 346}
 347
 348/* Find queueing discipline by name */
 349
 350static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 351{
 352        struct Qdisc_ops *q = NULL;
 353
 354        if (kind) {
 355                read_lock(&qdisc_mod_lock);
 356                for (q = qdisc_base; q; q = q->next) {
 357                        if (nla_strcmp(kind, q->id) == 0) {
 358                                if (!try_module_get(q->owner))
 359                                        q = NULL;
 360                                break;
 361                        }
 362                }
 363                read_unlock(&qdisc_mod_lock);
 364        }
 365        return q;
 366}
 367
 368/* The linklayer setting were not transferred from iproute2, in older
 369 * versions, and the rate tables lookup systems have been dropped in
 370 * the kernel. To keep backward compatible with older iproute2 tc
 371 * utils, we detect the linklayer setting by detecting if the rate
 372 * table were modified.
 373 *
 374 * For linklayer ATM table entries, the rate table will be aligned to
 375 * 48 bytes, thus some table entries will contain the same value.  The
 376 * mpu (min packet unit) is also encoded into the old rate table, thus
 377 * starting from the mpu, we find low and high table entries for
 378 * mapping this cell.  If these entries contain the same value, when
 379 * the rate tables have been modified for linklayer ATM.
 380 *
 381 * This is done by rounding mpu to the nearest 48 bytes cell/entry,
 382 * and then roundup to the next cell, calc the table entry one below,
 383 * and compare.
 384 */
 385static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
 386{
 387        int low       = roundup(r->mpu, 48);
 388        int high      = roundup(low+1, 48);
 389        int cell_low  = low >> r->cell_log;
 390        int cell_high = (high >> r->cell_log) - 1;
 391
 392        /* rtab is too inaccurate at rates > 100Mbit/s */
 393        if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
 394                pr_debug("TC linklayer: Giving up ATM detection\n");
 395                return TC_LINKLAYER_ETHERNET;
 396        }
 397
 398        if ((cell_high > cell_low) && (cell_high < 256)
 399            && (rtab[cell_low] == rtab[cell_high])) {
 400                pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
 401                         cell_low, cell_high, rtab[cell_high]);
 402                return TC_LINKLAYER_ATM;
 403        }
 404        return TC_LINKLAYER_ETHERNET;
 405}
 406
 407static struct qdisc_rate_table *qdisc_rtab_list;
 408
 409struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
 410                                        struct nlattr *tab,
 411                                        struct netlink_ext_ack *extack)
 412{
 413        struct qdisc_rate_table *rtab;
 414
 415        if (tab == NULL || r->rate == 0 ||
 416            r->cell_log == 0 || r->cell_log >= 32 ||
 417            nla_len(tab) != TC_RTAB_SIZE) {
 418                NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
 419                return NULL;
 420        }
 421
 422        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 423                if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
 424                    !memcmp(&rtab->data, nla_data(tab), 1024)) {
 425                        rtab->refcnt++;
 426                        return rtab;
 427                }
 428        }
 429
 430        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 431        if (rtab) {
 432                rtab->rate = *r;
 433                rtab->refcnt = 1;
 434                memcpy(rtab->data, nla_data(tab), 1024);
 435                if (r->linklayer == TC_LINKLAYER_UNAWARE)
 436                        r->linklayer = __detect_linklayer(r, rtab->data);
 437                rtab->next = qdisc_rtab_list;
 438                qdisc_rtab_list = rtab;
 439        } else {
 440                NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
 441        }
 442        return rtab;
 443}
 444EXPORT_SYMBOL(qdisc_get_rtab);
 445
 446void qdisc_put_rtab(struct qdisc_rate_table *tab)
 447{
 448        struct qdisc_rate_table *rtab, **rtabp;
 449
 450        if (!tab || --tab->refcnt)
 451                return;
 452
 453        for (rtabp = &qdisc_rtab_list;
 454             (rtab = *rtabp) != NULL;
 455             rtabp = &rtab->next) {
 456                if (rtab == tab) {
 457                        *rtabp = rtab->next;
 458                        kfree(rtab);
 459                        return;
 460                }
 461        }
 462}
 463EXPORT_SYMBOL(qdisc_put_rtab);
 464
 465static LIST_HEAD(qdisc_stab_list);
 466
 467static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
 468        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
 469        [TCA_STAB_DATA] = { .type = NLA_BINARY },
 470};
 471
 472static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
 473                                               struct netlink_ext_ack *extack)
 474{
 475        struct nlattr *tb[TCA_STAB_MAX + 1];
 476        struct qdisc_size_table *stab;
 477        struct tc_sizespec *s;
 478        unsigned int tsize = 0;
 479        u16 *tab = NULL;
 480        int err;
 481
 482        err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
 483                                          extack);
 484        if (err < 0)
 485                return ERR_PTR(err);
 486        if (!tb[TCA_STAB_BASE]) {
 487                NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
 488                return ERR_PTR(-EINVAL);
 489        }
 490
 491        s = nla_data(tb[TCA_STAB_BASE]);
 492
 493        if (s->tsize > 0) {
 494                if (!tb[TCA_STAB_DATA]) {
 495                        NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
 496                        return ERR_PTR(-EINVAL);
 497                }
 498                tab = nla_data(tb[TCA_STAB_DATA]);
 499                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 500        }
 501
 502        if (tsize != s->tsize || (!tab && tsize > 0)) {
 503                NL_SET_ERR_MSG(extack, "Invalid size of size table");
 504                return ERR_PTR(-EINVAL);
 505        }
 506
 507        list_for_each_entry(stab, &qdisc_stab_list, list) {
 508                if (memcmp(&stab->szopts, s, sizeof(*s)))
 509                        continue;
 510                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
 511                        continue;
 512                stab->refcnt++;
 513                return stab;
 514        }
 515
 516        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
 517        if (!stab)
 518                return ERR_PTR(-ENOMEM);
 519
 520        stab->refcnt = 1;
 521        stab->szopts = *s;
 522        if (tsize > 0)
 523                memcpy(stab->data, tab, tsize * sizeof(u16));
 524
 525        list_add_tail(&stab->list, &qdisc_stab_list);
 526
 527        return stab;
 528}
 529
 530void qdisc_put_stab(struct qdisc_size_table *tab)
 531{
 532        if (!tab)
 533                return;
 534
 535        if (--tab->refcnt == 0) {
 536                list_del(&tab->list);
 537                kfree_rcu(tab, rcu);
 538        }
 539}
 540EXPORT_SYMBOL(qdisc_put_stab);
 541
 542static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 543{
 544        struct nlattr *nest;
 545
 546        nest = nla_nest_start_noflag(skb, TCA_STAB);
 547        if (nest == NULL)
 548                goto nla_put_failure;
 549        if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
 550                goto nla_put_failure;
 551        nla_nest_end(skb, nest);
 552
 553        return skb->len;
 554
 555nla_put_failure:
 556        return -1;
 557}
 558
 559void __qdisc_calculate_pkt_len(struct sk_buff *skb,
 560                               const struct qdisc_size_table *stab)
 561{
 562        int pkt_len, slot;
 563
 564        pkt_len = skb->len + stab->szopts.overhead;
 565        if (unlikely(!stab->szopts.tsize))
 566                goto out;
 567
 568        slot = pkt_len + stab->szopts.cell_align;
 569        if (unlikely(slot < 0))
 570                slot = 0;
 571
 572        slot >>= stab->szopts.cell_log;
 573        if (likely(slot < stab->szopts.tsize))
 574                pkt_len = stab->data[slot];
 575        else
 576                pkt_len = stab->data[stab->szopts.tsize - 1] *
 577                                (slot / stab->szopts.tsize) +
 578                                stab->data[slot % stab->szopts.tsize];
 579
 580        pkt_len <<= stab->szopts.size_log;
 581out:
 582        if (unlikely(pkt_len < 1))
 583                pkt_len = 1;
 584        qdisc_skb_cb(skb)->pkt_len = pkt_len;
 585}
 586EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
 587
 588void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
 589{
 590        if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
 591                pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
 592                        txt, qdisc->ops->id, qdisc->handle >> 16);
 593                qdisc->flags |= TCQ_F_WARN_NONWC;
 594        }
 595}
 596EXPORT_SYMBOL(qdisc_warn_nonwc);
 597
 598static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 599{
 600        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 601                                                 timer);
 602
 603        rcu_read_lock();
 604        __netif_schedule(qdisc_root(wd->qdisc));
 605        rcu_read_unlock();
 606
 607        return HRTIMER_NORESTART;
 608}
 609
/* Initialise watchdog @wd for @qdisc using clock @clockid.
 *
 * Sets up the hrtimer in absolute, pinned mode with qdisc_watchdog() as
 * its callback and records the qdisc the watchdog belongs to.
 */
void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
                                 clockid_t clockid)
{
        hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
        wd->timer.function = qdisc_watchdog;
        wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
 618
/* Initialise watchdog @wd for @qdisc on CLOCK_MONOTONIC. */
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
        qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);
 624
 625void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
 626                                      u64 delta_ns)
 627{
 628        if (test_bit(__QDISC_STATE_DEACTIVATED,
 629                     &qdisc_root_sleeping(wd->qdisc)->state))
 630                return;
 631
 632        if (hrtimer_is_queued(&wd->timer)) {
 633                /* If timer is already set in [expires, expires + delta_ns],
 634                 * do not reprogram it.
 635                 */
 636                if (wd->last_expires - expires <= delta_ns)
 637                        return;
 638        }
 639
 640        wd->last_expires = expires;
 641        hrtimer_start_range_ns(&wd->timer,
 642                               ns_to_ktime(expires),
 643                               delta_ns,
 644                               HRTIMER_MODE_ABS_PINNED);
 645}
 646EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
 647
/* Cancel the watchdog's hrtimer via hrtimer_cancel(). */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
        hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
 653
 654static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
 655{
 656        struct hlist_head *h;
 657        unsigned int i;
 658
 659        h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
 660
 661        if (h != NULL) {
 662                for (i = 0; i < n; i++)
 663                        INIT_HLIST_HEAD(&h[i]);
 664        }
 665        return h;
 666}
 667
 668void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
 669{
 670        struct Qdisc_class_common *cl;
 671        struct hlist_node *next;
 672        struct hlist_head *nhash, *ohash;
 673        unsigned int nsize, nmask, osize;
 674        unsigned int i, h;
 675
 676        /* Rehash when load factor exceeds 0.75 */
 677        if (clhash->hashelems * 4 <= clhash->hashsize * 3)
 678                return;
 679        nsize = clhash->hashsize * 2;
 680        nmask = nsize - 1;
 681        nhash = qdisc_class_hash_alloc(nsize);
 682        if (nhash == NULL)
 683                return;
 684
 685        ohash = clhash->hash;
 686        osize = clhash->hashsize;
 687
 688        sch_tree_lock(sch);
 689        for (i = 0; i < osize; i++) {
 690                hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
 691                        h = qdisc_class_hash(cl->classid, nmask);
 692                        hlist_add_head(&cl->hnode, &nhash[h]);
 693                }
 694        }
 695        clhash->hash     = nhash;
 696        clhash->hashsize = nsize;
 697        clhash->hashmask = nmask;
 698        sch_tree_unlock(sch);
 699
 700        kvfree(ohash);
 701}
 702EXPORT_SYMBOL(qdisc_class_hash_grow);
 703
 704int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
 705{
 706        unsigned int size = 4;
 707
 708        clhash->hash = qdisc_class_hash_alloc(size);
 709        if (!clhash->hash)
 710                return -ENOMEM;
 711        clhash->hashsize  = size;
 712        clhash->hashmask  = size - 1;
 713        clhash->hashelems = 0;
 714        return 0;
 715}
 716EXPORT_SYMBOL(qdisc_class_hash_init);
 717
/* Free the bucket array of @clhash. The classes themselves are not
 * touched here.
 */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
        kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
 723
 724void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
 725                             struct Qdisc_class_common *cl)
 726{
 727        unsigned int h;
 728
 729        INIT_HLIST_NODE(&cl->hnode);
 730        h = qdisc_class_hash(cl->classid, clhash->hashmask);
 731        hlist_add_head(&cl->hnode, &clhash->hash[h]);
 732        clhash->hashelems++;
 733}
 734EXPORT_SYMBOL(qdisc_class_hash_insert);
 735
/* Unlink class @cl from its hash chain and decrement the element count
 * of @clhash.
 */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
                             struct Qdisc_class_common *cl)
{
        hlist_del(&cl->hnode);
        clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
 743
 744/* Allocate an unique handle from space managed by kernel
 745 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 746 */
 747static u32 qdisc_alloc_handle(struct net_device *dev)
 748{
 749        int i = 0x8000;
 750        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 751
 752        do {
 753                autohandle += TC_H_MAKE(0x10000U, 0);
 754                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
 755                        autohandle = TC_H_MAKE(0x80000000U, 0);
 756                if (!qdisc_lookup(dev, autohandle))
 757                        return autohandle;
 758                cond_resched();
 759        } while (--i > 0);
 760
 761        return 0;
 762}
 763
 764void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
 765{
 766        bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
 767        const struct Qdisc_class_ops *cops;
 768        unsigned long cl;
 769        u32 parentid;
 770        bool notify;
 771        int drops;
 772
 773        if (n == 0 && len == 0)
 774                return;
 775        drops = max_t(int, n, 0);
 776        rcu_read_lock();
 777        while ((parentid = sch->parent)) {
 778                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
 779                        break;
 780
 781                if (sch->flags & TCQ_F_NOPARENT)
 782                        break;
 783                /* Notify parent qdisc only if child qdisc becomes empty.
 784                 *
 785                 * If child was empty even before update then backlog
 786                 * counter is screwed and we skip notification because
 787                 * parent class is already passive.
 788                 *
 789                 * If the original child was offloaded then it is allowed
 790                 * to be seem as empty, so the parent is notified anyway.
 791                 */
 792                notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
 793                                                       !qdisc_is_offloaded);
 794                /* TODO: perform the search on a per txq basis */
 795                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
 796                if (sch == NULL) {
 797                        WARN_ON_ONCE(parentid != TC_H_ROOT);
 798                        break;
 799                }
 800                cops = sch->ops->cl_ops;
 801                if (notify && cops->qlen_notify) {
 802                        cl = cops->find(sch, parentid);
 803                        cops->qlen_notify(sch, cl);
 804                }
 805                sch->q.qlen -= n;
 806                sch->qstats.backlog -= len;
 807                __qdisc_qstats_drop(sch, drops);
 808        }
 809        rcu_read_unlock();
 810}
 811EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
 812
 813int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
 814                              void *type_data)
 815{
 816        struct net_device *dev = qdisc_dev(sch);
 817        int err;
 818
 819        sch->flags &= ~TCQ_F_OFFLOADED;
 820        if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
 821                return 0;
 822
 823        err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
 824        if (err == -EOPNOTSUPP)
 825                return 0;
 826
 827        if (!err)
 828                sch->flags |= TCQ_F_OFFLOADED;
 829
 830        return err;
 831}
 832EXPORT_SYMBOL(qdisc_offload_dump_helper);
 833
 834void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
 835                                struct Qdisc *new, struct Qdisc *old,
 836                                enum tc_setup_type type, void *type_data,
 837                                struct netlink_ext_ack *extack)
 838{
 839        bool any_qdisc_is_offloaded;
 840        int err;
 841
 842        if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
 843                return;
 844
 845        err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
 846
 847        /* Don't report error if the graft is part of destroy operation. */
 848        if (!err || !new || new == &noop_qdisc)
 849                return;
 850
 851        /* Don't report error if the parent, the old child and the new
 852         * one are not offloaded.
 853         */
 854        any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
 855        any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
 856        any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
 857
 858        if (any_qdisc_is_offloaded)
 859                NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
 860}
 861EXPORT_SYMBOL(qdisc_offload_graft_helper);
 862
 863static void qdisc_offload_graft_root(struct net_device *dev,
 864                                     struct Qdisc *new, struct Qdisc *old,
 865                                     struct netlink_ext_ack *extack)
 866{
 867        struct tc_root_qopt_offload graft_offload = {
 868                .command        = TC_ROOT_GRAFT,
 869                .handle         = new ? new->handle : 0,
 870                .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
 871                                  (old && old->flags & TCQ_F_INGRESS),
 872        };
 873
 874        qdisc_offload_graft_helper(dev, NULL, new, old,
 875                                   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
 876}
 877
 878static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 879                         u32 portid, u32 seq, u16 flags, int event)
 880{
 881        struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
 882        struct gnet_stats_queue __percpu *cpu_qstats = NULL;
 883        struct tcmsg *tcm;
 884        struct nlmsghdr  *nlh;
 885        unsigned char *b = skb_tail_pointer(skb);
 886        struct gnet_dump d;
 887        struct qdisc_size_table *stab;
 888        u32 block_index;
 889        __u32 qlen;
 890
 891        cond_resched();
 892        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
 893        if (!nlh)
 894                goto out_nlmsg_trim;
 895        tcm = nlmsg_data(nlh);
 896        tcm->tcm_family = AF_UNSPEC;
 897        tcm->tcm__pad1 = 0;
 898        tcm->tcm__pad2 = 0;
 899        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
 900        tcm->tcm_parent = clid;
 901        tcm->tcm_handle = q->handle;
 902        tcm->tcm_info = refcount_read(&q->refcnt);
 903        if (nla_put_string(skb, TCA_KIND, q->ops->id))
 904                goto nla_put_failure;
 905        if (q->ops->ingress_block_get) {
 906                block_index = q->ops->ingress_block_get(q);
 907                if (block_index &&
 908                    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
 909                        goto nla_put_failure;
 910        }
 911        if (q->ops->egress_block_get) {
 912                block_index = q->ops->egress_block_get(q);
 913                if (block_index &&
 914                    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
 915                        goto nla_put_failure;
 916        }
 917        if (q->ops->dump && q->ops->dump(q, skb) < 0)
 918                goto nla_put_failure;
 919        if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
 920                goto nla_put_failure;
 921        qlen = qdisc_qlen_sum(q);
 922
 923        stab = rtnl_dereference(q->stab);
 924        if (stab && qdisc_dump_stab(skb, stab) < 0)
 925                goto nla_put_failure;
 926
 927        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
 928                                         NULL, &d, TCA_PAD) < 0)
 929                goto nla_put_failure;
 930
 931        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
 932                goto nla_put_failure;
 933
 934        if (qdisc_is_percpu_stats(q)) {
 935                cpu_bstats = q->cpu_bstats;
 936                cpu_qstats = q->cpu_qstats;
 937        }
 938
 939        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
 940                                  &d, cpu_bstats, &q->bstats) < 0 ||
 941            gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
 942            gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
 943                goto nla_put_failure;
 944
 945        if (gnet_stats_finish_copy(&d) < 0)
 946                goto nla_put_failure;
 947
 948        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 949        return skb->len;
 950
 951out_nlmsg_trim:
 952nla_put_failure:
 953        nlmsg_trim(skb, b);
 954        return -1;
 955}
 956
 957static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
 958{
 959        if (q->flags & TCQ_F_BUILTIN)
 960                return true;
 961        if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
 962                return true;
 963
 964        return false;
 965}
 966
 967static int qdisc_notify(struct net *net, struct sk_buff *oskb,
 968                        struct nlmsghdr *n, u32 clid,
 969                        struct Qdisc *old, struct Qdisc *new)
 970{
 971        struct sk_buff *skb;
 972        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 973
 974        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 975        if (!skb)
 976                return -ENOBUFS;
 977
 978        if (old && !tc_qdisc_dump_ignore(old, false)) {
 979                if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
 980                                  0, RTM_DELQDISC) < 0)
 981                        goto err_out;
 982        }
 983        if (new && !tc_qdisc_dump_ignore(new, false)) {
 984                if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
 985                                  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
 986                        goto err_out;
 987        }
 988
 989        if (skb->len)
 990                return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 991                                      n->nlmsg_flags & NLM_F_ECHO);
 992
 993err_out:
 994        kfree_skb(skb);
 995        return -EINVAL;
 996}
 997
 998static void notify_and_destroy(struct net *net, struct sk_buff *skb,
 999                               struct nlmsghdr *n, u32 clid,
1000                               struct Qdisc *old, struct Qdisc *new)
1001{
1002        if (new || old)
1003                qdisc_notify(net, skb, n, clid, old, new);
1004
1005        if (old)
1006                qdisc_put(old);
1007}
1008
1009static void qdisc_clear_nolock(struct Qdisc *sch)
1010{
1011        sch->flags &= ~TCQ_F_NOLOCK;
1012        if (!(sch->flags & TCQ_F_CPUSTATS))
1013                return;
1014
1015        free_percpu(sch->cpu_bstats);
1016        free_percpu(sch->cpu_qstats);
1017        sch->cpu_bstats = NULL;
1018        sch->cpu_qstats = NULL;
1019        sch->flags &= ~TCQ_F_CPUSTATS;
1020}
1021
1022/* Graft qdisc "new" to class "classid" of qdisc "parent" or
1023 * to device "dev".
1024 *
1025 * When appropriate send a netlink notification using 'skb'
1026 * and "n".
1027 *
1028 * On success, destroy old qdisc.
1029 */
1030
1031static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1032                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1033                       struct Qdisc *new, struct Qdisc *old,
1034                       struct netlink_ext_ack *extack)
1035{
1036        struct Qdisc *q = old;
1037        struct net *net = dev_net(dev);
1038
1039        if (parent == NULL) {
1040                unsigned int i, num_q, ingress;
1041
1042                ingress = 0;
1043                num_q = dev->num_tx_queues;
1044                if ((q && q->flags & TCQ_F_INGRESS) ||
1045                    (new && new->flags & TCQ_F_INGRESS)) {
1046                        num_q = 1;
1047                        ingress = 1;
1048                        if (!dev_ingress_queue(dev)) {
1049                                NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1050                                return -ENOENT;
1051                        }
1052                }
1053
1054                if (dev->flags & IFF_UP)
1055                        dev_deactivate(dev);
1056
1057                qdisc_offload_graft_root(dev, new, old, extack);
1058
1059                if (new && new->ops->attach)
1060                        goto skip;
1061
1062                for (i = 0; i < num_q; i++) {
1063                        struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1064
1065                        if (!ingress)
1066                                dev_queue = netdev_get_tx_queue(dev, i);
1067
1068                        old = dev_graft_qdisc(dev_queue, new);
1069                        if (new && i > 0)
1070                                qdisc_refcount_inc(new);
1071
1072                        if (!ingress)
1073                                qdisc_put(old);
1074                }
1075
1076skip:
1077                if (!ingress) {
1078                        notify_and_destroy(net, skb, n, classid,
1079                                           dev->qdisc, new);
1080                        if (new && !new->ops->attach)
1081                                qdisc_refcount_inc(new);
1082                        dev->qdisc = new ? : &noop_qdisc;
1083
1084                        if (new && new->ops->attach)
1085                                new->ops->attach(new);
1086                } else {
1087                        notify_and_destroy(net, skb, n, classid, old, new);
1088                }
1089
1090                if (dev->flags & IFF_UP)
1091                        dev_activate(dev);
1092        } else {
1093                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1094                unsigned long cl;
1095                int err;
1096
1097                /* Only support running class lockless if parent is lockless */
1098                if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1099                        qdisc_clear_nolock(new);
1100
1101                if (!cops || !cops->graft)
1102                        return -EOPNOTSUPP;
1103
1104                cl = cops->find(parent, classid);
1105                if (!cl) {
1106                        NL_SET_ERR_MSG(extack, "Specified class not found");
1107                        return -ENOENT;
1108                }
1109
1110                err = cops->graft(parent, cl, new, &old, extack);
1111                if (err)
1112                        return err;
1113                notify_and_destroy(net, skb, n, classid, old, new);
1114        }
1115        return 0;
1116}
1117
1118static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1119                                   struct netlink_ext_ack *extack)
1120{
1121        u32 block_index;
1122
1123        if (tca[TCA_INGRESS_BLOCK]) {
1124                block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1125
1126                if (!block_index) {
1127                        NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1128                        return -EINVAL;
1129                }
1130                if (!sch->ops->ingress_block_set) {
1131                        NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1132                        return -EOPNOTSUPP;
1133                }
1134                sch->ops->ingress_block_set(sch, block_index);
1135        }
1136        if (tca[TCA_EGRESS_BLOCK]) {
1137                block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1138
1139                if (!block_index) {
1140                        NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1141                        return -EINVAL;
1142                }
1143                if (!sch->ops->egress_block_set) {
1144                        NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1145                        return -EOPNOTSUPP;
1146                }
1147                sch->ops->egress_block_set(sch, block_index);
1148        }
1149        return 0;
1150}
1151
1152/*
1153   Allocate and initialize new qdisc.
1154
1155   Parameters are passed via opt.
1156 */
1157
1158static struct Qdisc *qdisc_create(struct net_device *dev,
1159                                  struct netdev_queue *dev_queue,
1160                                  struct Qdisc *p, u32 parent, u32 handle,
1161                                  struct nlattr **tca, int *errp,
1162                                  struct netlink_ext_ack *extack)
1163{
1164        int err;
1165        struct nlattr *kind = tca[TCA_KIND];
1166        struct Qdisc *sch;
1167        struct Qdisc_ops *ops;
1168        struct qdisc_size_table *stab;
1169
1170        ops = qdisc_lookup_ops(kind);
1171#ifdef CONFIG_MODULES
1172        if (ops == NULL && kind != NULL) {
1173                char name[IFNAMSIZ];
1174                if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
1175                        /* We dropped the RTNL semaphore in order to
1176                         * perform the module load.  So, even if we
1177                         * succeeded in loading the module we have to
1178                         * tell the caller to replay the request.  We
1179                         * indicate this using -EAGAIN.
1180                         * We replay the request because the device may
1181                         * go away in the mean time.
1182                         */
1183                        rtnl_unlock();
1184                        request_module("sch_%s", name);
1185                        rtnl_lock();
1186                        ops = qdisc_lookup_ops(kind);
1187                        if (ops != NULL) {
1188                                /* We will try again qdisc_lookup_ops,
1189                                 * so don't keep a reference.
1190                                 */
1191                                module_put(ops->owner);
1192                                err = -EAGAIN;
1193                                goto err_out;
1194                        }
1195                }
1196        }
1197#endif
1198
1199        err = -ENOENT;
1200        if (!ops) {
1201                NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1202                goto err_out;
1203        }
1204
1205        sch = qdisc_alloc(dev_queue, ops, extack);
1206        if (IS_ERR(sch)) {
1207                err = PTR_ERR(sch);
1208                goto err_out2;
1209        }
1210
1211        sch->parent = parent;
1212
1213        if (handle == TC_H_INGRESS) {
1214                sch->flags |= TCQ_F_INGRESS;
1215                handle = TC_H_MAKE(TC_H_INGRESS, 0);
1216        } else {
1217                if (handle == 0) {
1218                        handle = qdisc_alloc_handle(dev);
1219                        if (handle == 0) {
1220                                NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1221                                err = -ENOSPC;
1222                                goto err_out3;
1223                        }
1224                }
1225                if (!netif_is_multiqueue(dev))
1226                        sch->flags |= TCQ_F_ONETXQUEUE;
1227        }
1228
1229        sch->handle = handle;
1230
1231        /* This exist to keep backward compatible with a userspace
1232         * loophole, what allowed userspace to get IFF_NO_QUEUE
1233         * facility on older kernels by setting tx_queue_len=0 (prior
1234         * to qdisc init), and then forgot to reinit tx_queue_len
1235         * before again attaching a qdisc.
1236         */
1237        if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1238                dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1239                netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1240        }
1241
1242        err = qdisc_block_indexes_set(sch, tca, extack);
1243        if (err)
1244                goto err_out3;
1245
1246        if (ops->init) {
1247                err = ops->init(sch, tca[TCA_OPTIONS], extack);
1248                if (err != 0)
1249                        goto err_out5;
1250        }
1251
1252        if (tca[TCA_STAB]) {
1253                stab = qdisc_get_stab(tca[TCA_STAB], extack);
1254                if (IS_ERR(stab)) {
1255                        err = PTR_ERR(stab);
1256                        goto err_out4;
1257                }
1258                rcu_assign_pointer(sch->stab, stab);
1259        }
1260        if (tca[TCA_RATE]) {
1261                seqcount_t *running;
1262
1263                err = -EOPNOTSUPP;
1264                if (sch->flags & TCQ_F_MQROOT) {
1265                        NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1266                        goto err_out4;
1267                }
1268
1269                if (sch->parent != TC_H_ROOT &&
1270                    !(sch->flags & TCQ_F_INGRESS) &&
1271                    (!p || !(p->flags & TCQ_F_MQROOT)))
1272                        running = qdisc_root_sleeping_running(sch);
1273                else
1274                        running = &sch->running;
1275
1276                err = gen_new_estimator(&sch->bstats,
1277                                        sch->cpu_bstats,
1278                                        &sch->rate_est,
1279                                        NULL,
1280                                        running,
1281                                        tca[TCA_RATE]);
1282                if (err) {
1283                        NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1284                        goto err_out4;
1285                }
1286        }
1287
1288        qdisc_hash_add(sch, false);
1289        trace_qdisc_create(ops, dev, parent);
1290
1291        return sch;
1292
1293err_out5:
1294        /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1295        if (ops->destroy)
1296                ops->destroy(sch);
1297err_out3:
1298        dev_put(dev);
1299        qdisc_free(sch);
1300err_out2:
1301        module_put(ops->owner);
1302err_out:
1303        *errp = err;
1304        return NULL;
1305
1306err_out4:
1307        /*
1308         * Any broken qdiscs that would require a ops->reset() here?
1309         * The qdisc was never in action so it shouldn't be necessary.
1310         */
1311        qdisc_put_stab(rtnl_dereference(sch->stab));
1312        if (ops->destroy)
1313                ops->destroy(sch);
1314        goto err_out3;
1315}
1316
1317static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1318                        struct netlink_ext_ack *extack)
1319{
1320        struct qdisc_size_table *ostab, *stab = NULL;
1321        int err = 0;
1322
1323        if (tca[TCA_OPTIONS]) {
1324                if (!sch->ops->change) {
1325                        NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1326                        return -EINVAL;
1327                }
1328                if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1329                        NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1330                        return -EOPNOTSUPP;
1331                }
1332                err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1333                if (err)
1334                        return err;
1335        }
1336
1337        if (tca[TCA_STAB]) {
1338                stab = qdisc_get_stab(tca[TCA_STAB], extack);
1339                if (IS_ERR(stab))
1340                        return PTR_ERR(stab);
1341        }
1342
1343        ostab = rtnl_dereference(sch->stab);
1344        rcu_assign_pointer(sch->stab, stab);
1345        qdisc_put_stab(ostab);
1346
1347        if (tca[TCA_RATE]) {
1348                /* NB: ignores errors from replace_estimator
1349                   because change can't be undone. */
1350                if (sch->flags & TCQ_F_MQROOT)
1351                        goto out;
1352                gen_replace_estimator(&sch->bstats,
1353                                      sch->cpu_bstats,
1354                                      &sch->rate_est,
1355                                      NULL,
1356                                      qdisc_root_sleeping_running(sch),
1357                                      tca[TCA_RATE]);
1358        }
1359out:
1360        return 0;
1361}
1362
1363struct check_loop_arg {
1364        struct qdisc_walker     w;
1365        struct Qdisc            *p;
1366        int                     depth;
1367};
1368
1369static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1370                         struct qdisc_walker *w);
1371
1372static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1373{
1374        struct check_loop_arg   arg;
1375
1376        if (q->ops->cl_ops == NULL)
1377                return 0;
1378
1379        arg.w.stop = arg.w.skip = arg.w.count = 0;
1380        arg.w.fn = check_loop_fn;
1381        arg.depth = depth;
1382        arg.p = p;
1383        q->ops->cl_ops->walk(q, &arg.w);
1384        return arg.w.stop ? -ELOOP : 0;
1385}
1386
1387static int
1388check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1389{
1390        struct Qdisc *leaf;
1391        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1392        struct check_loop_arg *arg = (struct check_loop_arg *)w;
1393
1394        leaf = cops->leaf(q, cl);
1395        if (leaf) {
1396                if (leaf == arg->p || arg->depth > 7)
1397                        return -ELOOP;
1398                return check_loop(leaf, arg->p, arg->depth + 1);
1399        }
1400        return 0;
1401}
1402
1403const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1404        [TCA_KIND]              = { .type = NLA_STRING },
1405        [TCA_RATE]              = { .type = NLA_BINARY,
1406                                    .len = sizeof(struct tc_estimator) },
1407        [TCA_STAB]              = { .type = NLA_NESTED },
1408        [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1409        [TCA_CHAIN]             = { .type = NLA_U32 },
1410        [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1411        [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1412};
1413
1414/*
1415 * Delete/get qdisc.
1416 */
1417
1418static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1419                        struct netlink_ext_ack *extack)
1420{
1421        struct net *net = sock_net(skb->sk);
1422        struct tcmsg *tcm = nlmsg_data(n);
1423        struct nlattr *tca[TCA_MAX + 1];
1424        struct net_device *dev;
1425        u32 clid;
1426        struct Qdisc *q = NULL;
1427        struct Qdisc *p = NULL;
1428        int err;
1429
1430        if ((n->nlmsg_type != RTM_GETQDISC) &&
1431            !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1432                return -EPERM;
1433
1434        err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1435                                     rtm_tca_policy, extack);
1436        if (err < 0)
1437                return err;
1438
1439        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1440        if (!dev)
1441                return -ENODEV;
1442
1443        clid = tcm->tcm_parent;
1444        if (clid) {
1445                if (clid != TC_H_ROOT) {
1446                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1447                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
1448                                if (!p) {
1449                                        NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1450                                        return -ENOENT;
1451                                }
1452                                q = qdisc_leaf(p, clid);
1453                        } else if (dev_ingress_queue(dev)) {
1454                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1455                        }
1456                } else {
1457                        q = dev->qdisc;
1458                }
1459                if (!q) {
1460                        NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1461                        return -ENOENT;
1462                }
1463
1464                if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1465                        NL_SET_ERR_MSG(extack, "Invalid handle");
1466                        return -EINVAL;
1467                }
1468        } else {
1469                q = qdisc_lookup(dev, tcm->tcm_handle);
1470                if (!q) {
1471                        NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1472                        return -ENOENT;
1473                }
1474        }
1475
1476        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1477                NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1478                return -EINVAL;
1479        }
1480
1481        if (n->nlmsg_type == RTM_DELQDISC) {
1482                if (!clid) {
1483                        NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1484                        return -EINVAL;
1485                }
1486                if (q->handle == 0) {
1487                        NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1488                        return -ENOENT;
1489                }
1490                err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1491                if (err != 0)
1492                        return err;
1493        } else {
1494                qdisc_notify(net, skb, n, clid, NULL, q);
1495        }
1496        return 0;
1497}
1498
1499/*
1500 * Create/change qdisc.
1501 */
1502
1503static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1504                           struct netlink_ext_ack *extack)
1505{
1506        struct net *net = sock_net(skb->sk);
1507        struct tcmsg *tcm;
1508        struct nlattr *tca[TCA_MAX + 1];
1509        struct net_device *dev;
1510        u32 clid;
1511        struct Qdisc *q, *p;
1512        int err;
1513
1514        if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1515                return -EPERM;
1516
1517replay:
1518        /* Reinit, just in case something touches this. */
1519        err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1520                                     rtm_tca_policy, extack);
1521        if (err < 0)
1522                return err;
1523
1524        tcm = nlmsg_data(n);
1525        clid = tcm->tcm_parent;
1526        q = p = NULL;
1527
1528        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1529        if (!dev)
1530                return -ENODEV;
1531
1532
1533        if (clid) {
1534                if (clid != TC_H_ROOT) {
1535                        if (clid != TC_H_INGRESS) {
1536                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
1537                                if (!p) {
1538                                        NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1539                                        return -ENOENT;
1540                                }
1541                                q = qdisc_leaf(p, clid);
1542                        } else if (dev_ingress_queue_create(dev)) {
1543                                q = dev_ingress_queue(dev)->qdisc_sleeping;
1544                        }
1545                } else {
1546                        q = dev->qdisc;
1547                }
1548
1549                /* It may be default qdisc, ignore it */
1550                if (q && q->handle == 0)
1551                        q = NULL;
1552
1553                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1554                        if (tcm->tcm_handle) {
1555                                if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1556                                        NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1557                                        return -EEXIST;
1558                                }
1559                                if (TC_H_MIN(tcm->tcm_handle)) {
1560                                        NL_SET_ERR_MSG(extack, "Invalid minor handle");
1561                                        return -EINVAL;
1562                                }
1563                                q = qdisc_lookup(dev, tcm->tcm_handle);
1564                                if (!q)
1565                                        goto create_n_graft;
1566                                if (n->nlmsg_flags & NLM_F_EXCL) {
1567                                        NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1568                                        return -EEXIST;
1569                                }
1570                                if (tca[TCA_KIND] &&
1571                                    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1572                                        NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1573                                        return -EINVAL;
1574                                }
1575                                if (q == p ||
1576                                    (p && check_loop(q, p, 0))) {
1577                                        NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1578                                        return -ELOOP;
1579                                }
1580                                qdisc_refcount_inc(q);
1581                                goto graft;
1582                        } else {
1583                                if (!q)
1584                                        goto create_n_graft;
1585
1586                                /* This magic test requires explanation.
1587                                 *
1588                                 *   We know, that some child q is already
1589                                 *   attached to this parent and have choice:
1590                                 *   either to change it or to create/graft new one.
1591                                 *
1592                                 *   1. We are allowed to create/graft only
1593                                 *   if CREATE and REPLACE flags are set.
1594                                 *
1595                                 *   2. If EXCL is set, requestor wanted to say,
1596                                 *   that qdisc tcm_handle is not expected
1597                                 *   to exist, so that we choose create/graft too.
1598                                 *
1599                                 *   3. The last case is when no flags are set.
1600                                 *   Alas, it is sort of hole in API, we
1601                                 *   cannot decide what to do unambiguously.
1602                                 *   For now we select create/graft, if
1603                                 *   user gave KIND, which does not match existing.
1604                                 */
1605                                if ((n->nlmsg_flags & NLM_F_CREATE) &&
1606                                    (n->nlmsg_flags & NLM_F_REPLACE) &&
1607                                    ((n->nlmsg_flags & NLM_F_EXCL) ||
1608                                     (tca[TCA_KIND] &&
1609                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1610                                        goto create_n_graft;
1611                        }
1612                }
1613        } else {
1614                if (!tcm->tcm_handle) {
1615                        NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1616                        return -EINVAL;
1617                }
1618                q = qdisc_lookup(dev, tcm->tcm_handle);
1619        }
1620
1621        /* Change qdisc parameters */
1622        if (!q) {
1623                NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1624                return -ENOENT;
1625        }
1626        if (n->nlmsg_flags & NLM_F_EXCL) {
1627                NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1628                return -EEXIST;
1629        }
1630        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1631                NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1632                return -EINVAL;
1633        }
1634        err = qdisc_change(q, tca, extack);
1635        if (err == 0)
1636                qdisc_notify(net, skb, n, clid, NULL, q);
1637        return err;
1638
1639create_n_graft:
1640        if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1641                NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1642                return -ENOENT;
1643        }
1644        if (clid == TC_H_INGRESS) {
1645                if (dev_ingress_queue(dev)) {
1646                        q = qdisc_create(dev, dev_ingress_queue(dev), p,
1647                                         tcm->tcm_parent, tcm->tcm_parent,
1648                                         tca, &err, extack);
1649                } else {
1650                        NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1651                        err = -ENOENT;
1652                }
1653        } else {
1654                struct netdev_queue *dev_queue;
1655
1656                if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1657                        dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1658                else if (p)
1659                        dev_queue = p->dev_queue;
1660                else
1661                        dev_queue = netdev_get_tx_queue(dev, 0);
1662
1663                q = qdisc_create(dev, dev_queue, p,
1664                                 tcm->tcm_parent, tcm->tcm_handle,
1665                                 tca, &err, extack);
1666        }
1667        if (q == NULL) {
1668                if (err == -EAGAIN)
1669                        goto replay;
1670                return err;
1671        }
1672
1673graft:
1674        err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1675        if (err) {
1676                if (q)
1677                        qdisc_put(q);
1678                return err;
1679        }
1680
1681        return 0;
1682}
1683
1684static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1685                              struct netlink_callback *cb,
1686                              int *q_idx_p, int s_q_idx, bool recur,
1687                              bool dump_invisible)
1688{
1689        int ret = 0, q_idx = *q_idx_p;
1690        struct Qdisc *q;
1691        int b;
1692
1693        if (!root)
1694                return 0;
1695
1696        q = root;
1697        if (q_idx < s_q_idx) {
1698                q_idx++;
1699        } else {
1700                if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1701                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1702                                  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1703                                  RTM_NEWQDISC) <= 0)
1704                        goto done;
1705                q_idx++;
1706        }
1707
1708        /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1709         * itself has already been dumped.
1710         *
1711         * If we've already dumped the top-level (ingress) qdisc above and the global
1712         * qdisc hashtable, we don't want to hit it again
1713         */
1714        if (!qdisc_dev(root) || !recur)
1715                goto out;
1716
1717        hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1718                if (q_idx < s_q_idx) {
1719                        q_idx++;
1720                        continue;
1721                }
1722                if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1723                    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1724                                  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1725                                  RTM_NEWQDISC) <= 0)
1726                        goto done;
1727                q_idx++;
1728        }
1729
1730out:
1731        *q_idx_p = q_idx;
1732        return ret;
1733done:
1734        ret = -1;
1735        goto out;
1736}
1737
1738static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1739{
1740        struct net *net = sock_net(skb->sk);
1741        int idx, q_idx;
1742        int s_idx, s_q_idx;
1743        struct net_device *dev;
1744        const struct nlmsghdr *nlh = cb->nlh;
1745        struct nlattr *tca[TCA_MAX + 1];
1746        int err;
1747
1748        s_idx = cb->args[0];
1749        s_q_idx = q_idx = cb->args[1];
1750
1751        idx = 0;
1752        ASSERT_RTNL();
1753
1754        err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1755                                     rtm_tca_policy, cb->extack);
1756        if (err < 0)
1757                return err;
1758
1759        for_each_netdev(net, dev) {
1760                struct netdev_queue *dev_queue;
1761
1762                if (idx < s_idx)
1763                        goto cont;
1764                if (idx > s_idx)
1765                        s_q_idx = 0;
1766                q_idx = 0;
1767
1768                if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1769                                       true, tca[TCA_DUMP_INVISIBLE]) < 0)
1770                        goto done;
1771
1772                dev_queue = dev_ingress_queue(dev);
1773                if (dev_queue &&
1774                    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1775                                       &q_idx, s_q_idx, false,
1776                                       tca[TCA_DUMP_INVISIBLE]) < 0)
1777                        goto done;
1778
1779cont:
1780                idx++;
1781        }
1782
1783done:
1784        cb->args[0] = idx;
1785        cb->args[1] = q_idx;
1786
1787        return skb->len;
1788}
1789
1790
1791
1792/************************************************
1793 *      Traffic classes manipulation.           *
1794 ************************************************/
1795
1796static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1797                          unsigned long cl,
1798                          u32 portid, u32 seq, u16 flags, int event)
1799{
1800        struct tcmsg *tcm;
1801        struct nlmsghdr  *nlh;
1802        unsigned char *b = skb_tail_pointer(skb);
1803        struct gnet_dump d;
1804        const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1805
1806        cond_resched();
1807        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1808        if (!nlh)
1809                goto out_nlmsg_trim;
1810        tcm = nlmsg_data(nlh);
1811        tcm->tcm_family = AF_UNSPEC;
1812        tcm->tcm__pad1 = 0;
1813        tcm->tcm__pad2 = 0;
1814        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1815        tcm->tcm_parent = q->handle;
1816        tcm->tcm_handle = q->handle;
1817        tcm->tcm_info = 0;
1818        if (nla_put_string(skb, TCA_KIND, q->ops->id))
1819                goto nla_put_failure;
1820        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1821                goto nla_put_failure;
1822
1823        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1824                                         NULL, &d, TCA_PAD) < 0)
1825                goto nla_put_failure;
1826
1827        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1828                goto nla_put_failure;
1829
1830        if (gnet_stats_finish_copy(&d) < 0)
1831                goto nla_put_failure;
1832
1833        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1834        return skb->len;
1835
1836out_nlmsg_trim:
1837nla_put_failure:
1838        nlmsg_trim(skb, b);
1839        return -1;
1840}
1841
1842static int tclass_notify(struct net *net, struct sk_buff *oskb,
1843                         struct nlmsghdr *n, struct Qdisc *q,
1844                         unsigned long cl, int event)
1845{
1846        struct sk_buff *skb;
1847        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1848        int err = 0;
1849
1850        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1851        if (!skb)
1852                return -ENOBUFS;
1853
1854        if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1855                kfree_skb(skb);
1856                return -EINVAL;
1857        }
1858
1859        err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1860                             n->nlmsg_flags & NLM_F_ECHO);
1861        if (err > 0)
1862                err = 0;
1863        return err;
1864}
1865
1866static int tclass_del_notify(struct net *net,
1867                             const struct Qdisc_class_ops *cops,
1868                             struct sk_buff *oskb, struct nlmsghdr *n,
1869                             struct Qdisc *q, unsigned long cl,
1870                             struct netlink_ext_ack *extack)
1871{
1872        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1873        struct sk_buff *skb;
1874        int err = 0;
1875
1876        if (!cops->delete)
1877                return -EOPNOTSUPP;
1878
1879        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1880        if (!skb)
1881                return -ENOBUFS;
1882
1883        if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1884                           RTM_DELTCLASS) < 0) {
1885                kfree_skb(skb);
1886                return -EINVAL;
1887        }
1888
1889        err = cops->delete(q, cl, extack);
1890        if (err) {
1891                kfree_skb(skb);
1892                return err;
1893        }
1894
1895        err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1896                             n->nlmsg_flags & NLM_F_ECHO);
1897        if (err > 0)
1898                err = 0;
1899        return err;
1900}
1901
1902#ifdef CONFIG_NET_CLS
1903
1904struct tcf_bind_args {
1905        struct tcf_walker w;
1906        unsigned long base;
1907        unsigned long cl;
1908        u32 classid;
1909};
1910
1911static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1912{
1913        struct tcf_bind_args *a = (void *)arg;
1914
1915        if (tp->ops->bind_class) {
1916                struct Qdisc *q = tcf_block_q(tp->chain->block);
1917
1918                sch_tree_lock(q);
1919                tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1920                sch_tree_unlock(q);
1921        }
1922        return 0;
1923}
1924
1925struct tc_bind_class_args {
1926        struct qdisc_walker w;
1927        unsigned long new_cl;
1928        u32 portid;
1929        u32 clid;
1930};
1931
1932static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1933                                struct qdisc_walker *w)
1934{
1935        struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1936        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1937        struct tcf_block *block;
1938        struct tcf_chain *chain;
1939
1940        block = cops->tcf_block(q, cl, NULL);
1941        if (!block)
1942                return 0;
1943        for (chain = tcf_get_next_chain(block, NULL);
1944             chain;
1945             chain = tcf_get_next_chain(block, chain)) {
1946                struct tcf_proto *tp;
1947
1948                for (tp = tcf_get_next_proto(chain, NULL);
1949                     tp; tp = tcf_get_next_proto(chain, tp)) {
1950                        struct tcf_bind_args arg = {};
1951
1952                        arg.w.fn = tcf_node_bind;
1953                        arg.classid = a->clid;
1954                        arg.base = cl;
1955                        arg.cl = a->new_cl;
1956                        tp->ops->walk(tp, &arg.w, true);
1957                }
1958        }
1959
1960        return 0;
1961}
1962
1963static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1964                           unsigned long new_cl)
1965{
1966        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1967        struct tc_bind_class_args args = {};
1968
1969        if (!cops->tcf_block)
1970                return;
1971        args.portid = portid;
1972        args.clid = clid;
1973        args.new_cl = new_cl;
1974        args.w.fn = tc_bind_class_walker;
1975        q->ops->cl_ops->walk(q, &args.w);
1976}
1977
1978#else
1979
1980static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1981                           unsigned long new_cl)
1982{
1983}
1984
1985#endif
1986
1987static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1988                         struct netlink_ext_ack *extack)
1989{
1990        struct net *net = sock_net(skb->sk);
1991        struct tcmsg *tcm = nlmsg_data(n);
1992        struct nlattr *tca[TCA_MAX + 1];
1993        struct net_device *dev;
1994        struct Qdisc *q = NULL;
1995        const struct Qdisc_class_ops *cops;
1996        unsigned long cl = 0;
1997        unsigned long new_cl;
1998        u32 portid;
1999        u32 clid;
2000        u32 qid;
2001        int err;
2002
2003        if ((n->nlmsg_type != RTM_GETTCLASS) &&
2004            !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
2005                return -EPERM;
2006
2007        err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2008                                     rtm_tca_policy, extack);
2009        if (err < 0)
2010                return err;
2011
2012        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2013        if (!dev)
2014                return -ENODEV;
2015
2016        /*
2017           parent == TC_H_UNSPEC - unspecified parent.
2018           parent == TC_H_ROOT   - class is root, which has no parent.
2019           parent == X:0         - parent is root class.
2020           parent == X:Y         - parent is a node in hierarchy.
2021           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2022
2023           handle == 0:0         - generate handle from kernel pool.
2024           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2025           handle == X:Y         - clear.
2026           handle == X:0         - root class.
2027         */
2028
2029        /* Step 1. Determine qdisc handle X:0 */
2030
2031        portid = tcm->tcm_parent;
2032        clid = tcm->tcm_handle;
2033        qid = TC_H_MAJ(clid);
2034
2035        if (portid != TC_H_ROOT) {
2036                u32 qid1 = TC_H_MAJ(portid);
2037
2038                if (qid && qid1) {
2039                        /* If both majors are known, they must be identical. */
2040                        if (qid != qid1)
2041                                return -EINVAL;
2042                } else if (qid1) {
2043                        qid = qid1;
2044                } else if (qid == 0)
2045                        qid = dev->qdisc->handle;
2046
2047                /* Now qid is genuine qdisc handle consistent
2048                 * both with parent and child.
2049                 *
2050                 * TC_H_MAJ(portid) still may be unspecified, complete it now.
2051                 */
2052                if (portid)
2053                        portid = TC_H_MAKE(qid, portid);
2054        } else {
2055                if (qid == 0)
2056                        qid = dev->qdisc->handle;
2057        }
2058
2059        /* OK. Locate qdisc */
2060        q = qdisc_lookup(dev, qid);
2061        if (!q)
2062                return -ENOENT;
2063
2064        /* An check that it supports classes */
2065        cops = q->ops->cl_ops;
2066        if (cops == NULL)
2067                return -EINVAL;
2068
2069        /* Now try to get class */
2070        if (clid == 0) {
2071                if (portid == TC_H_ROOT)
2072                        clid = qid;
2073        } else
2074                clid = TC_H_MAKE(qid, clid);
2075
2076        if (clid)
2077                cl = cops->find(q, clid);
2078
2079        if (cl == 0) {
2080                err = -ENOENT;
2081                if (n->nlmsg_type != RTM_NEWTCLASS ||
2082                    !(n->nlmsg_flags & NLM_F_CREATE))
2083                        goto out;
2084        } else {
2085                switch (n->nlmsg_type) {
2086                case RTM_NEWTCLASS:
2087                        err = -EEXIST;
2088                        if (n->nlmsg_flags & NLM_F_EXCL)
2089                                goto out;
2090                        break;
2091                case RTM_DELTCLASS:
2092                        err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
2093                        /* Unbind the class with flilters with 0 */
2094                        tc_bind_tclass(q, portid, clid, 0);
2095                        goto out;
2096                case RTM_GETTCLASS:
2097                        err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2098                        goto out;
2099                default:
2100                        err = -EINVAL;
2101                        goto out;
2102                }
2103        }
2104
2105        if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2106                NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2107                return -EOPNOTSUPP;
2108        }
2109
2110        new_cl = cl;
2111        err = -EOPNOTSUPP;
2112        if (cops->change)
2113                err = cops->change(q, clid, portid, tca, &new_cl, extack);
2114        if (err == 0) {
2115                tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2116                /* We just create a new class, need to do reverse binding. */
2117                if (cl != new_cl)
2118                        tc_bind_tclass(q, portid, clid, new_cl);
2119        }
2120out:
2121        return err;
2122}
2123
2124struct qdisc_dump_args {
2125        struct qdisc_walker     w;
2126        struct sk_buff          *skb;
2127        struct netlink_callback *cb;
2128};
2129
2130static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2131                            struct qdisc_walker *arg)
2132{
2133        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2134
2135        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2136                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2137                              RTM_NEWTCLASS);
2138}
2139
2140static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2141                                struct tcmsg *tcm, struct netlink_callback *cb,
2142                                int *t_p, int s_t)
2143{
2144        struct qdisc_dump_args arg;
2145
2146        if (tc_qdisc_dump_ignore(q, false) ||
2147            *t_p < s_t || !q->ops->cl_ops ||
2148            (tcm->tcm_parent &&
2149             TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2150                (*t_p)++;
2151                return 0;
2152        }
2153        if (*t_p > s_t)
2154                memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2155        arg.w.fn = qdisc_class_dump;
2156        arg.skb = skb;
2157        arg.cb = cb;
2158        arg.w.stop  = 0;
2159        arg.w.skip = cb->args[1];
2160        arg.w.count = 0;
2161        q->ops->cl_ops->walk(q, &arg.w);
2162        cb->args[1] = arg.w.count;
2163        if (arg.w.stop)
2164                return -1;
2165        (*t_p)++;
2166        return 0;
2167}
2168
2169static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2170                               struct tcmsg *tcm, struct netlink_callback *cb,
2171                               int *t_p, int s_t, bool recur)
2172{
2173        struct Qdisc *q;
2174        int b;
2175
2176        if (!root)
2177                return 0;
2178
2179        if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2180                return -1;
2181
2182        if (!qdisc_dev(root) || !recur)
2183                return 0;
2184
2185        if (tcm->tcm_parent) {
2186                q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2187                if (q && q != root &&
2188                    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2189                        return -1;
2190                return 0;
2191        }
2192        hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2193                if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2194                        return -1;
2195        }
2196
2197        return 0;
2198}
2199
2200static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2201{
2202        struct tcmsg *tcm = nlmsg_data(cb->nlh);
2203        struct net *net = sock_net(skb->sk);
2204        struct netdev_queue *dev_queue;
2205        struct net_device *dev;
2206        int t, s_t;
2207
2208        if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2209                return 0;
2210        dev = dev_get_by_index(net, tcm->tcm_ifindex);
2211        if (!dev)
2212                return 0;
2213
2214        s_t = cb->args[0];
2215        t = 0;
2216
2217        if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t, true) < 0)
2218                goto done;
2219
2220        dev_queue = dev_ingress_queue(dev);
2221        if (dev_queue &&
2222            tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2223                                &t, s_t, false) < 0)
2224                goto done;
2225
2226done:
2227        cb->args[0] = t;
2228
2229        dev_put(dev);
2230        return skb->len;
2231}
2232
2233#ifdef CONFIG_PROC_FS
2234static int psched_show(struct seq_file *seq, void *v)
2235{
2236        seq_printf(seq, "%08x %08x %08x %08x\n",
2237                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2238                   1000000,
2239                   (u32)NSEC_PER_SEC / hrtimer_resolution);
2240
2241        return 0;
2242}
2243
2244static int __net_init psched_net_init(struct net *net)
2245{
2246        struct proc_dir_entry *e;
2247
2248        e = proc_create_single("psched", 0, net->proc_net, psched_show);
2249        if (e == NULL)
2250                return -ENOMEM;
2251
2252        return 0;
2253}
2254
2255static void __net_exit psched_net_exit(struct net *net)
2256{
2257        remove_proc_entry("psched", net->proc_net);
2258}
2259#else
2260static int __net_init psched_net_init(struct net *net)
2261{
2262        return 0;
2263}
2264
2265static void __net_exit psched_net_exit(struct net *net)
2266{
2267}
2268#endif
2269
2270static struct pernet_operations psched_net_ops = {
2271        .init = psched_net_init,
2272        .exit = psched_net_exit,
2273};
2274
2275static int __init pktsched_init(void)
2276{
2277        int err;
2278
2279        err = register_pernet_subsys(&psched_net_ops);
2280        if (err) {
2281                pr_err("pktsched_init: "
2282                       "cannot initialize per netns operations\n");
2283                return err;
2284        }
2285
2286        register_qdisc(&pfifo_fast_ops);
2287        register_qdisc(&pfifo_qdisc_ops);
2288        register_qdisc(&bfifo_qdisc_ops);
2289        register_qdisc(&pfifo_head_drop_qdisc_ops);
2290        register_qdisc(&mq_qdisc_ops);
2291        register_qdisc(&noqueue_qdisc_ops);
2292
2293        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2294        rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2295        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2296                      0);
2297        rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2298        rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2299        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2300                      0);
2301
2302        return 0;
2303}
2304
2305subsys_initcall(pktsched_init);
2306