linux-old/net/sched/sch_api.c
<<
>>
Prefs
   1/*
   2 * net/sched/sch_api.c  Packet scheduler API.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10 *
  11 * Fixes:
  12 *
  13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
  14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
  15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
  16 */
  17
  18#include <linux/config.h>
  19#include <linux/types.h>
  20#include <linux/kernel.h>
  21#include <linux/sched.h>
  22#include <linux/string.h>
  23#include <linux/mm.h>
  24#include <linux/socket.h>
  25#include <linux/sockios.h>
  26#include <linux/in.h>
  27#include <linux/errno.h>
  28#include <linux/interrupt.h>
  29#include <linux/netdevice.h>
  30#include <linux/skbuff.h>
  31#include <linux/rtnetlink.h>
  32#include <linux/init.h>
  33#include <linux/proc_fs.h>
  34#include <linux/kmod.h>
  35#include <linux/list.h>
  36
  37#include <net/sock.h>
  38#include <net/pkt_sched.h>
  39
  40#include <asm/processor.h>
  41#include <asm/uaccess.h>
  42#include <asm/system.h>
  43#include <asm/bitops.h>
  44
/*
 * Forward declarations of the rtnetlink notification helpers.
 * qdisc_notify() is defined further down in this file; both helpers are
 * called by the request handlers that appear before their definitions.
 */
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
                        struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
                         struct Qdisc *q, unsigned long cl, int event);
  49
  50/*
  51
  52   Short review.
  53   -------------
  54
  55   This file consists of two interrelated parts:
  56
  57   1. queueing disciplines manager frontend.
  58   2. traffic classes manager frontend.
  59
  60   Generally, queueing discipline ("qdisc") is a black box,
  61   which is able to enqueue packets and to dequeue them (when
  62   device is ready to send something) in order and at times
  63   determined by algorithm hidden in it.
  64
   65   qdiscs are divided into two categories:
  66   - "queues", which have no internal structure visible from outside.
  67   - "schedulers", which split all the packets to "traffic classes",
  68     using "packet classifiers" (look at cls_api.c)
  69
  70   In turn, classes may have child qdiscs (as rule, queues)
  71   attached to them etc. etc. etc.
  72
  73   The goal of the routines in this file is to translate
  74   information supplied by user in the form of handles
  75   to more intelligible for kernel form, to make some sanity
  76   checks and part of work, which is common to all qdiscs
  77   and to provide rtnetlink notifications.
  78
  79   All real intelligent work is done inside qdisc modules.
  80
  81
  82
  83   Every discipline has two major routines: enqueue and dequeue.
  84
  85   ---dequeue
  86
  87   dequeue usually returns a skb to send. It is allowed to return NULL,
  88   but it does not mean that queue is empty, it just means that
  89   discipline does not want to send anything this time.
  90   Queue is really empty if q->q.qlen == 0.
  91   For complicated disciplines with multiple queues q->q is not
  92   real packet queue, but however q->q.qlen must be valid.
  93
  94   ---enqueue
  95
   96   enqueue returns 0 if the packet was enqueued successfully.
   97   If a packet (this one or another one) was dropped, it returns
   98   a non-zero error code:
  99   NET_XMIT_DROP        - this packet dropped
 100     Expected action: do not backoff, but wait until queue will clear.
 101   NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
 102     Expected action: backoff or ignore
 103   NET_XMIT_POLICED     - dropped by police.
 104     Expected action: backoff or error to real-time apps.
 105
 106   Auxiliary routines:
 107
 108   ---requeue
 109
 110   requeues once dequeued packet. It is used for non-standard or
 111   just buggy devices, which can defer output even if dev->tbusy=0.
 112
 113   ---reset
 114
 115   returns qdisc to initial state: purge all buffers, clear all
 116   timers, counters (except for statistics) etc.
 117
 118   ---init
 119
 120   initializes newly created qdisc.
 121
 122   ---destroy
 123
 124   destroys resources allocated by init and during lifetime of qdisc.
 125
 126   ---change
 127
 128   changes qdisc parameters.
 129 */
 130
/*
 * Protects the list of registered TC modules (qdisc_base). It is a pure
 * SMP lock: register_qdisc()/unregister_qdisc() take it for writing,
 * qdisc_lookup_ops() takes it for reading.
 */
static rwlock_t qdisc_mod_lock = RW_LOCK_UNLOCKED;
 133
 134
 135/************************************************
 136 *      Queueing disciplines manipulation.      *
 137 ************************************************/
 138
 139
 140/* The list of all installed queueing disciplines. */
 141
 142static struct Qdisc_ops *qdisc_base = NULL;
 143
 144/* Register/uregister queueing discipline */
 145
 146int register_qdisc(struct Qdisc_ops *qops)
 147{
 148        struct Qdisc_ops *q, **qp;
 149
 150        write_lock(&qdisc_mod_lock);
 151        for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) {
 152                if (strcmp(qops->id, q->id) == 0) {
 153                        write_unlock(&qdisc_mod_lock);
 154                        return -EEXIST;
 155                }
 156        }
 157
 158        if (qops->enqueue == NULL)
 159                qops->enqueue = noop_qdisc_ops.enqueue;
 160        if (qops->requeue == NULL)
 161                qops->requeue = noop_qdisc_ops.requeue;
 162        if (qops->dequeue == NULL)
 163                qops->dequeue = noop_qdisc_ops.dequeue;
 164
 165        qops->next = NULL;
 166        *qp = qops;
 167        write_unlock(&qdisc_mod_lock);
 168        return 0;
 169}
 170
 171int unregister_qdisc(struct Qdisc_ops *qops)
 172{
 173        struct Qdisc_ops *q, **qp;
 174        int err = -ENOENT;
 175
 176        write_lock(&qdisc_mod_lock);
 177        for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
 178                if (q == qops)
 179                        break;
 180        if (q) {
 181                *qp = q->next;
 182                q->next = NULL;
 183                err = 0;
 184        }
 185        write_unlock(&qdisc_mod_lock);
 186        return err;
 187}
 188
 189/* We know handle. Find qdisc among all qdisc's attached to device
 190   (root qdisc, all its children, children of children etc.)
 191 */
 192
 193struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 194{
 195        struct Qdisc *q;
 196
 197        list_for_each_entry(q, &dev->qdisc_list, list) {
 198                if (q->handle == handle)
 199                        return q;
 200        }
 201        return NULL;
 202}
 203
 204struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 205{
 206        unsigned long cl;
 207        struct Qdisc *leaf;
 208        struct Qdisc_class_ops *cops = p->ops->cl_ops;
 209
 210        if (cops == NULL)
 211                return NULL;
 212        cl = cops->get(p, classid);
 213
 214        if (cl == 0)
 215                return NULL;
 216        leaf = cops->leaf(p, cl);
 217        cops->put(p, cl);
 218        return leaf;
 219}
 220
 221/* Find queueing discipline by name */
 222
 223struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
 224{
 225        struct Qdisc_ops *q = NULL;
 226
 227        if (kind) {
 228                read_lock(&qdisc_mod_lock);
 229                for (q = qdisc_base; q; q = q->next) {
 230                        if (rtattr_strcmp(kind, q->id) == 0)
 231                                break;
 232                }
 233                read_unlock(&qdisc_mod_lock);
 234        }
 235        return q;
 236}
 237
 238static struct qdisc_rate_table *qdisc_rtab_list;
 239
 240struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
 241{
 242        struct qdisc_rate_table *rtab;
 243
 244        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
 245                if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
 246                        rtab->refcnt++;
 247                        return rtab;
 248                }
 249        }
 250
 251        if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
 252                return NULL;
 253
 254        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 255        if (rtab) {
 256                rtab->rate = *r;
 257                rtab->refcnt = 1;
 258                memcpy(rtab->data, RTA_DATA(tab), 1024);
 259                rtab->next = qdisc_rtab_list;
 260                qdisc_rtab_list = rtab;
 261        }
 262        return rtab;
 263}
 264
 265void qdisc_put_rtab(struct qdisc_rate_table *tab)
 266{
 267        struct qdisc_rate_table *rtab, **rtabp;
 268
 269        if (!tab || --tab->refcnt)
 270                return;
 271
 272        for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
 273                if (rtab == tab) {
 274                        *rtabp = rtab->next;
 275                        kfree(rtab);
 276                        return;
 277                }
 278        }
 279}
 280
 281
 282/* Allocate an unique handle from space managed by kernel */
 283
 284u32 qdisc_alloc_handle(struct net_device *dev)
 285{
 286        int i = 0x10000;
 287        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
 288
 289        do {
 290                autohandle += TC_H_MAKE(0x10000U, 0);
 291                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
 292                        autohandle = TC_H_MAKE(0x80000000U, 0);
 293        } while (qdisc_lookup(dev, autohandle) && --i > 0);
 294
 295        return i>0 ? autohandle : 0;
 296}
 297
/*
 * Attach toplevel qdisc to device dev.
 *
 * If the device is up it is deactivated before the swap and activated
 * again afterwards. The swap itself runs with qdisc_tree_lock held for
 * writing and dev->queue_lock held with bottom halves disabled.
 *
 * qdisc - qdisc to attach; NULL installs &noop_qdisc as the sleeping
 *         qdisc. A qdisc flagged TCQ_F_INGRESS is handled on the
 *         dev->qdisc_ingress slot instead of dev->qdisc_sleeping.
 *
 * Returns the qdisc that previously occupied the slot (may be NULL).
 * It is reset here if its refcnt is <= 1 but it is not destroyed.
 */

static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
        struct Qdisc *oqdisc;

        if (dev->flags & IFF_UP)
                dev_deactivate(dev);

        write_lock(&qdisc_tree_lock);
        spin_lock_bh(&dev->queue_lock);
        if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
                oqdisc = dev->qdisc_ingress;
                /* Prune old scheduler */
                if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
                        /* delete: an ingress qdisc is present and not
                         * referenced elsewhere, so reset it and clear the
                         * slot; "qdisc" is NOT installed on this path. */
                        qdisc_reset(oqdisc);
                        dev->qdisc_ingress = NULL;
                } else {  /* new: slot empty or old one still referenced */
                        dev->qdisc_ingress = qdisc;
                }

        } else {

                oqdisc = dev->qdisc_sleeping;

                /* Prune old scheduler */
                if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
                        qdisc_reset(oqdisc);

                /* ... and graft new one */
                if (qdisc == NULL)
                        qdisc = &noop_qdisc;
                dev->qdisc_sleeping = qdisc;
                /* The active qdisc is parked on noop here; the
                 * dev_activate() call below runs only if the device is up. */
                dev->qdisc = &noop_qdisc;
        }

        spin_unlock_bh(&dev->queue_lock);
        write_unlock(&qdisc_tree_lock);

        if (dev->flags & IFF_UP)
                dev_activate(dev);

        return oqdisc;
}
 344
 345
 346/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 347   to device "dev".
 348
 349   Old qdisc is not destroyed but returned in *old.
 350 */
 351
 352int qdisc_graft(struct net_device *dev, struct Qdisc *parent, u32 classid,
 353                struct Qdisc *new, struct Qdisc **old)
 354{
 355        int err = 0;
 356        struct Qdisc *q = *old;
 357
 358
 359        if (parent == NULL) { 
 360                if (q && q->flags&TCQ_F_INGRESS) {
 361                        *old = dev_graft_qdisc(dev, q);
 362                } else {
 363                        *old = dev_graft_qdisc(dev, new);
 364                }
 365        } else {
 366                struct Qdisc_class_ops *cops = parent->ops->cl_ops;
 367
 368                err = -EINVAL;
 369
 370                if (cops) {
 371                        unsigned long cl = cops->get(parent, classid);
 372                        if (cl) {
 373                                err = cops->graft(parent, cl, new, old);
 374                                if (new)
 375                                        new->parent = classid;
 376                                cops->put(parent, cl);
 377                        }
 378                }
 379        }
 380        return err;
 381}
 382
 383/*
 384   Allocate and initialize new qdisc.
 385
 386   Parameters are passed via opt.
 387 */
 388
 389static struct Qdisc *
 390qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
 391{
 392        int err;
 393        struct rtattr *kind = tca[TCA_KIND-1];
 394        struct Qdisc *sch = NULL;
 395        struct Qdisc_ops *ops;
 396        int size;
 397
 398        ops = qdisc_lookup_ops(kind);
 399#ifdef CONFIG_KMOD
 400        if (ops==NULL && tca[TCA_KIND-1] != NULL) {
 401                char module_name[4 + IFNAMSIZ + 1];
 402
 403                if (RTA_PAYLOAD(kind) <= IFNAMSIZ) {
 404                        sprintf(module_name, "sch_%s", (char*)RTA_DATA(kind));
 405                        request_module (module_name);
 406                        ops = qdisc_lookup_ops(kind);
 407                }
 408        }
 409#endif
 410
 411        err = -EINVAL;
 412        if (ops == NULL)
 413                goto err_out;
 414
 415        size = sizeof(*sch) + ops->priv_size;
 416
 417        sch = kmalloc(size, GFP_KERNEL);
 418        err = -ENOBUFS;
 419        if (!sch)
 420                goto err_out;
 421
 422        /* Grrr... Resolve race condition with module unload */
 423
 424        err = -EINVAL;
 425        if (ops != qdisc_lookup_ops(kind))
 426                goto err_out;
 427
 428        memset(sch, 0, size);
 429
 430        INIT_LIST_HEAD(&sch->list);
 431        skb_queue_head_init(&sch->q);
 432
 433        if (handle == TC_H_INGRESS)
 434                sch->flags |= TCQ_F_INGRESS;
 435
 436        sch->ops = ops;
 437        sch->enqueue = ops->enqueue;
 438        sch->dequeue = ops->dequeue;
 439        sch->dev = dev;
 440        atomic_set(&sch->refcnt, 1);
 441        sch->stats.lock = &dev->queue_lock;
 442        if (handle == 0) {
 443                handle = qdisc_alloc_handle(dev);
 444                err = -ENOMEM;
 445                if (handle == 0)
 446                        goto err_out;
 447        }
 448
 449        if (handle == TC_H_INGRESS)
 450                sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
 451        else
 452                sch->handle = handle;
 453
 454        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
 455                write_lock(&qdisc_tree_lock);
 456                list_add_tail(&sch->list, &dev->qdisc_list);
 457                write_unlock(&qdisc_tree_lock);
 458#ifdef CONFIG_NET_ESTIMATOR
 459                if (tca[TCA_RATE-1])
 460                        qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
 461#endif
 462                return sch;
 463        }
 464
 465err_out:
 466        *errp = err;
 467        if (sch)
 468                kfree(sch);
 469        return NULL;
 470}
 471
 472static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
 473{
 474        if (tca[TCA_OPTIONS-1]) {
 475                int err;
 476
 477                if (sch->ops->change == NULL)
 478                        return -EINVAL;
 479                err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
 480                if (err)
 481                        return err;
 482        }
 483#ifdef CONFIG_NET_ESTIMATOR
 484        if (tca[TCA_RATE-1]) {
 485                qdisc_kill_estimator(&sch->stats);
 486                qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
 487        }
 488#endif
 489        return 0;
 490}
 491
 492struct check_loop_arg
 493{
 494        struct qdisc_walker     w;
 495        struct Qdisc            *p;
 496        int                     depth;
 497};
 498
 499static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
 500
 501static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
 502{
 503        struct check_loop_arg   arg;
 504
 505        if (q->ops->cl_ops == NULL)
 506                return 0;
 507
 508        arg.w.stop = arg.w.skip = arg.w.count = 0;
 509        arg.w.fn = check_loop_fn;
 510        arg.depth = depth;
 511        arg.p = p;
 512        q->ops->cl_ops->walk(q, &arg.w);
 513        return arg.w.stop ? -ELOOP : 0;
 514}
 515
 516static int
 517check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
 518{
 519        struct Qdisc *leaf;
 520        struct Qdisc_class_ops *cops = q->ops->cl_ops;
 521        struct check_loop_arg *arg = (struct check_loop_arg *)w;
 522
 523        leaf = cops->leaf(q, cl);
 524        if (leaf) {
 525                if (leaf == arg->p || arg->depth > 7)
 526                        return -ELOOP;
 527                return check_loop(leaf, arg->p, arg->depth + 1);
 528        }
 529        return 0;
 530}
 531
 532/*
 533 * Delete/get qdisc.
 534 */
 535
 536static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 537{
 538        struct tcmsg *tcm = NLMSG_DATA(n);
 539        struct rtattr **tca = arg;
 540        struct net_device *dev;
 541        u32 clid = tcm->tcm_parent;
 542        struct Qdisc *q = NULL;
 543        struct Qdisc *p = NULL;
 544        int err;
 545
 546        if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
 547                return -ENODEV;
 548
 549        if (clid) {
 550                if (clid != TC_H_ROOT) {
 551                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
 552                                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
 553                                        return -ENOENT;
 554                                q = qdisc_leaf(p, clid);
 555                        } else { /* ingress */
 556                                q = dev->qdisc_ingress;
 557                        }
 558                } else {
 559                        q = dev->qdisc_sleeping;
 560                }
 561                if (!q)
 562                        return -ENOENT;
 563
 564                if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
 565                        return -EINVAL;
 566        } else {
 567                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
 568                        return -ENOENT;
 569        }
 570
 571        if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
 572                return -EINVAL;
 573
 574        if (n->nlmsg_type == RTM_DELQDISC) {
 575                if (!clid)
 576                        return -EINVAL;
 577                if (q->handle == 0)
 578                        return -ENOENT;
 579                if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
 580                        return err;
 581                if (q) {
 582                        qdisc_notify(skb, n, clid, q, NULL);
 583                        spin_lock_bh(&dev->queue_lock);
 584                        qdisc_destroy(q);
 585                        spin_unlock_bh(&dev->queue_lock);
 586                }
 587        } else {
 588                qdisc_notify(skb, n, clid, NULL, q);
 589        }
 590        return 0;
 591}
 592
/*
   Create/change qdisc.

   Depending on the request and on what is currently attached, this
   handler takes one of three paths:

   - change:         an existing qdisc is reconfigured via qdisc_change();
   - create_n_graft: a new qdisc is created with qdisc_create() and then
                     grafted;
   - graft:          an already existing qdisc (found by handle) is
                     grafted at the requested parent.

   Returns 0 on success or a negative errno.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
        struct tcmsg *tcm = NLMSG_DATA(n);
        struct rtattr **tca = arg;
        struct net_device *dev;
        u32 clid = tcm->tcm_parent;
        struct Qdisc *q = NULL;         /* qdisc to change or graft */
        struct Qdisc *p = NULL;         /* parent qdisc, if clid names a class */
        int err;

        if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
                return -ENODEV;

        if (clid) {
                /* A parent was given: find what is attached there now. */
                if (clid != TC_H_ROOT) {
                        if (clid != TC_H_INGRESS) {
                                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
                                        return -ENOENT;
                                q = qdisc_leaf(p, clid);
                        } else { /*ingress */
                                q = dev->qdisc_ingress;
                        }
                } else {
                        q = dev->qdisc_sleeping;
                }

                /* It may be default qdisc, ignore it */
                if (q && q->handle == 0)
                        q = NULL;

                /* Nothing attached, no handle given, or the handle names a
                 * different qdisc than the one attached: decide between
                 * create, graft and change. */
                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
                        if (tcm->tcm_handle) {
                                /* Something else is attached; replacing it
                                 * requires NLM_F_REPLACE. */
                                if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
                                        return -EEXIST;
                                /* A qdisc handle must have a zero minor. */
                                if (TC_H_MIN(tcm->tcm_handle))
                                        return -EINVAL;
                                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
                                        goto create_n_graft;
                                /* The handle exists elsewhere on this device:
                                 * graft that qdisc to the new location. */
                                if (n->nlmsg_flags&NLM_F_EXCL)
                                        return -EEXIST;
                                if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
                                        return -EINVAL;
                                /* Refuse to attach a qdisc underneath itself. */
                                if (q == p ||
                                    (p && check_loop(q, p, 0)))
                                        return -ELOOP;
                                /* Take an extra reference before grafting. */
                                atomic_inc(&q->refcnt);
                                goto graft;
                        } else {
                                if (q == NULL)
                                        goto create_n_graft;

                                /* This magic test requires explanation.
                                 *
                                 *   We know, that some child q is already
                                 *   attached to this parent and have choice:
                                 *   either to change it or to create/graft new one.
                                 *
                                 *   1. We are allowed to create/graft only
                                 *   if CREATE and REPLACE flags are set.
                                 *
                                 *   2. If EXCL is set, requestor wanted to say,
                                 *   that qdisc tcm_handle is not expected
                                 *   to exist, so that we choose create/graft too.
                                 *
                                 *   3. The last case is when no flags are set.
                                 *   Alas, it is sort of hole in API, we
                                 *   cannot decide what to do unambiguously.
                                 *   For now we select create/graft, if
                                 *   user gave KIND, which does not match existing.
                                 */
                                if ((n->nlmsg_flags&NLM_F_CREATE) &&
                                    (n->nlmsg_flags&NLM_F_REPLACE) &&
                                    ((n->nlmsg_flags&NLM_F_EXCL) ||
                                     (tca[TCA_KIND-1] &&
                                      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
                                        goto create_n_graft;
                        }
                }
        } else {
                /* No parent given: the qdisc must be named by its handle. */
                if (!tcm->tcm_handle)
                        return -EINVAL;
                q = qdisc_lookup(dev, tcm->tcm_handle);
        }

        /* Change qdisc parameters */
        if (q == NULL)
                return -ENOENT;
        if (n->nlmsg_flags&NLM_F_EXCL)
                return -EEXIST;
        if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
                return -EINVAL;
        err = qdisc_change(q, tca);
        if (err == 0)
                qdisc_notify(skb, n, clid, NULL, q);
        return err;

create_n_graft:
        if (!(n->nlmsg_flags&NLM_F_CREATE))
                return -ENOENT;
        /* For ingress the parent value doubles as the handle passed to
         * qdisc_create(). */
        if (clid == TC_H_INGRESS)
                q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
        else
                q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
        if (q == NULL)
                return err;

graft:
        if (1) {
                struct Qdisc *old_q = NULL;
                err = qdisc_graft(dev, p, clid, q, &old_q);
                if (err) {
                        /* Graft failed: destroy q (created above, or
                         * referenced via atomic_inc on the graft path). */
                        if (q) {
                                spin_lock_bh(&dev->queue_lock);
                                qdisc_destroy(q);
                                spin_unlock_bh(&dev->queue_lock);
                        }
                        return err;
                }
                /* Announce the change before the displaced qdisc goes away. */
                qdisc_notify(skb, n, clid, old_q, q);
                if (old_q) {
                        spin_lock_bh(&dev->queue_lock);
                        qdisc_destroy(old_q);
                        spin_unlock_bh(&dev->queue_lock);
                }
        }
        return 0;
}
 724
 725int qdisc_copy_stats(struct sk_buff *skb, struct tc_stats *st)
 726{
 727        spin_lock_bh(st->lock);
 728        RTA_PUT(skb, TCA_STATS, (char*)&st->lock - (char*)st, st);
 729        spin_unlock_bh(st->lock);
 730        return 0;
 731
 732rtattr_failure:
 733        spin_unlock_bh(st->lock);
 734        return -1;
 735}
 736
 737
 738static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 739                         u32 pid, u32 seq, unsigned flags, int event)
 740{
 741        struct tcmsg *tcm;
 742        struct nlmsghdr  *nlh;
 743        unsigned char    *b = skb->tail;
 744
 745        nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
 746        nlh->nlmsg_flags = flags;
 747        tcm = NLMSG_DATA(nlh);
 748        tcm->tcm_family = AF_UNSPEC;
 749        tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
 750        tcm->tcm_parent = clid;
 751        tcm->tcm_handle = q->handle;
 752        tcm->tcm_info = atomic_read(&q->refcnt);
 753        RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
 754        if (q->ops->dump && q->ops->dump(q, skb) < 0)
 755                goto rtattr_failure;
 756        q->stats.qlen = q->q.qlen;
 757        if (qdisc_copy_stats(skb, &q->stats))
 758                goto rtattr_failure;
 759        nlh->nlmsg_len = skb->tail - b;
 760        return skb->len;
 761
 762nlmsg_failure:
 763rtattr_failure:
 764        skb_trim(skb, b - skb->data);
 765        return -1;
 766}
 767
 768static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
 769                        u32 clid, struct Qdisc *old, struct Qdisc *new)
 770{
 771        struct sk_buff *skb;
 772        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
 773
 774        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 775        if (!skb)
 776                return -ENOBUFS;
 777
 778        if (old && old->handle) {
 779                if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
 780                        goto err_out;
 781        }
 782        if (new) {
 783                if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
 784                        goto err_out;
 785        }
 786
 787        if (skb->len)
 788                return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
 789
 790err_out:
 791        kfree_skb(skb);
 792        return -EINVAL;
 793}
 794
 795static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
 796{
 797        int idx, q_idx;
 798        int s_idx, s_q_idx;
 799        struct net_device *dev;
 800        struct Qdisc *q;
 801
 802        s_idx = cb->args[0];
 803        s_q_idx = q_idx = cb->args[1];
 804        read_lock(&dev_base_lock);
 805        for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
 806                if (idx < s_idx)
 807                        continue;
 808                if (idx > s_idx)
 809                        s_q_idx = 0;
 810                read_lock(&qdisc_tree_lock);
 811                q_idx = 0;
 812                list_for_each_entry(q, &dev->qdisc_list, list) {
 813                        if (q_idx < s_q_idx) {
 814                                q_idx++;
 815                                continue;
 816                        }
 817                        if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
 818                                          cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
 819                                read_unlock(&qdisc_tree_lock);
 820                                goto done;
 821                        }
 822                        q_idx++;
 823                }
 824                read_unlock(&qdisc_tree_lock);
 825        }
 826
 827done:
 828        read_unlock(&dev_base_lock);
 829
 830        cb->args[0] = idx;
 831        cb->args[1] = q_idx;
 832
 833        return skb->len;
 834}
 835
 836
 837
 838/************************************************
 839 *      Traffic classes manipulation.           *
 840 ************************************************/
 841
 842
 843
 844static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 845{
 846        struct tcmsg *tcm = NLMSG_DATA(n);
 847        struct rtattr **tca = arg;
 848        struct net_device *dev;
 849        struct Qdisc *q = NULL;
 850        struct Qdisc_class_ops *cops;
 851        unsigned long cl = 0;
 852        unsigned long new_cl;
 853        u32 pid = tcm->tcm_parent;
 854        u32 clid = tcm->tcm_handle;
 855        u32 qid = TC_H_MAJ(clid);
 856        int err;
 857
 858        if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
 859                return -ENODEV;
 860
 861        /*
 862           parent == TC_H_UNSPEC - unspecified parent.
 863           parent == TC_H_ROOT   - class is root, which has no parent.
 864           parent == X:0         - parent is root class.
 865           parent == X:Y         - parent is a node in hierarchy.
 866           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
 867
 868           handle == 0:0         - generate handle from kernel pool.
 869           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
 870           handle == X:Y         - clear.
 871           handle == X:0         - root class.
 872         */
 873
 874        /* Step 1. Determine qdisc handle X:0 */
 875
 876        if (pid != TC_H_ROOT) {
 877                u32 qid1 = TC_H_MAJ(pid);
 878
 879                if (qid && qid1) {
 880                        /* If both majors are known, they must be identical. */
 881                        if (qid != qid1)
 882                                return -EINVAL;
 883                } else if (qid1) {
 884                        qid = qid1;
 885                } else if (qid == 0)
 886                        qid = dev->qdisc_sleeping->handle;
 887
 888                /* Now qid is genuine qdisc handle consistent
 889                   both with parent and child.
 890
 891                   TC_H_MAJ(pid) still may be unspecified, complete it now.
 892                 */
 893                if (pid)
 894                        pid = TC_H_MAKE(qid, pid);
 895        } else {
 896                if (qid == 0)
 897                        qid = dev->qdisc_sleeping->handle;
 898        }
 899
 900        /* OK. Locate qdisc */
 901        if ((q = qdisc_lookup(dev, qid)) == NULL) 
 902                return -ENOENT;
 903
 904        /* An check that it supports classes */
 905        cops = q->ops->cl_ops;
 906        if (cops == NULL)
 907                return -EINVAL;
 908
 909        /* Now try to get class */
 910        if (clid == 0) {
 911                if (pid == TC_H_ROOT)
 912                        clid = qid;
 913        } else
 914                clid = TC_H_MAKE(qid, clid);
 915
 916        if (clid)
 917                cl = cops->get(q, clid);
 918
 919        if (cl == 0) {
 920                err = -ENOENT;
 921                if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
 922                        goto out;
 923        } else {
 924                switch (n->nlmsg_type) {
 925                case RTM_NEWTCLASS:     
 926                        err = -EEXIST;
 927                        if (n->nlmsg_flags&NLM_F_EXCL)
 928                                goto out;
 929                        break;
 930                case RTM_DELTCLASS:
 931                        err = cops->delete(q, cl);
 932                        if (err == 0)
 933                                tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
 934                        goto out;
 935                case RTM_GETTCLASS:
 936                        err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
 937                        goto out;
 938                default:
 939                        err = -EINVAL;
 940                        goto out;
 941                }
 942        }
 943
 944        new_cl = cl;
 945        err = cops->change(q, clid, pid, tca, &new_cl);
 946        if (err == 0)
 947                tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
 948
 949out:
 950        if (cl)
 951                cops->put(q, cl);
 952
 953        return err;
 954}
 955
 956
 957static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
 958                          unsigned long cl,
 959                          u32 pid, u32 seq, unsigned flags, int event)
 960{
 961        struct tcmsg *tcm;
 962        struct nlmsghdr  *nlh;
 963        unsigned char    *b = skb->tail;
 964
 965        nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
 966        nlh->nlmsg_flags = flags;
 967        tcm = NLMSG_DATA(nlh);
 968        tcm->tcm_family = AF_UNSPEC;
 969        tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
 970        tcm->tcm_parent = q->handle;
 971        tcm->tcm_handle = q->handle;
 972        tcm->tcm_info = 0;
 973        RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
 974        if (q->ops->cl_ops->dump && q->ops->cl_ops->dump(q, cl, skb, tcm) < 0)
 975                goto rtattr_failure;
 976        nlh->nlmsg_len = skb->tail - b;
 977        return skb->len;
 978
 979nlmsg_failure:
 980rtattr_failure:
 981        skb_trim(skb, b - skb->data);
 982        return -1;
 983}
 984
 985static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
 986                          struct Qdisc *q, unsigned long cl, int event)
 987{
 988        struct sk_buff *skb;
 989        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
 990
 991        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 992        if (!skb)
 993                return -ENOBUFS;
 994
 995        if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
 996                kfree_skb(skb);
 997                return -EINVAL;
 998        }
 999
1000        return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1001}
1002
/*
 * Context handed to cl_ops->walk() while dumping the classes of one qdisc.
 *
 * The walker must stay the first member: qdisc_class_dump() receives a
 * pointer to it and casts that pointer back to struct qdisc_dump_args.
 */
struct qdisc_dump_args
{
        struct qdisc_walker w;          /* walker state (fn, stop, skip, count) */
        struct sk_buff *skb;            /* buffer the class messages are appended to */
        struct netlink_callback *cb;    /* dump request: source of pid and sequence number */
};
1009
1010static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1011{
1012        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1013
1014        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1015                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1016}
1017
1018static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1019{
1020        int t;
1021        int s_t;
1022        struct net_device *dev;
1023        struct Qdisc *q;
1024        struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1025        struct qdisc_dump_args arg;
1026
1027        if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1028                return 0;
1029        if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1030                return 0;
1031
1032        s_t = cb->args[0];
1033        t = 0;
1034
1035        read_lock(&qdisc_tree_lock);
1036        list_for_each_entry(q, &dev->qdisc_list, list) {
1037                if (t < s_t || !q->ops->cl_ops ||
1038                    (tcm->tcm_parent &&
1039                     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1040                        t++;
1041                        continue;
1042                }
1043                if (t > s_t)
1044                        memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1045                arg.w.fn = qdisc_class_dump;
1046                arg.skb = skb;
1047                arg.cb = cb;
1048                arg.w.stop  = 0;
1049                arg.w.skip = cb->args[1];
1050                arg.w.count = 0;
1051                q->ops->cl_ops->walk(q, &arg.w);
1052                cb->args[1] = arg.w.count;
1053                if (arg.w.stop)
1054                        break;
1055                t++;
1056        }
1057        read_unlock(&qdisc_tree_lock);
1058
1059        cb->args[0] = t;
1060
1061        dev_put(dev);
1062        return skb->len;
1063}
1064
/*
 * Scheduler clock conversion parameters, reported to user space by
 * psched_read_proc().  The defaults of 1 are overwritten at init time for
 * the CPU clock source (psched_calibrate_clock()) and for the jiffies clock
 * source (pktsched_init()).
 */
int psched_us_per_tick = 1;
int psched_tick_per_us = 1;
1067
#ifdef CONFIG_PROC_FS
/*
 * /proc read handler registered as "net/psched" by pktsched_init().
 *
 * Formats four 8-digit hex words into buffer: psched_tick_per_us,
 * psched_us_per_tick, the constant 1000000 and HZ.  Returns the number of
 * bytes available from offset, clamped to the range [0, length]; *start is
 * set to buffer + offset and *eof is always set.  data is unused.
 */
static int psched_read_proc(char *buffer, char **start, off_t offset,
                            int length, int *eof, void *data)
{
        int avail;

        avail = sprintf(buffer, "%08x %08x %08x %08x\n",
                        psched_tick_per_us, psched_us_per_tick,
                        1000000, HZ);

        /* Bytes left once the caller's offset has been consumed. */
        avail -= offset;

        if (avail > length)
                avail = length;
        if (avail < 0)
                avail = 0;

        *eof = 1;
        *start = buffer + offset;

        return avail;
}
#endif
1091
#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY
/*
 * Convert a difference of whole seconds into microseconds, capped at bound.
 *
 * delta_sec: difference in seconds (may be negative).
 * bound:     upper limit for the result, in microseconds.
 *
 * Returns bound when bound is at most one second's worth of microseconds or
 * when delta_sec is too large to be converted; otherwise
 * min(delta_sec * 1000000, bound).
 *
 * The multiplication is done in a signed int.  Large positive values were
 * already rejected; large negative values are saturated at the most negative
 * whole-second microsecond count that fits, because letting the product
 * overflow is undefined behaviour and produced an arbitrary result.
 */
int psched_tod_diff(int delta_sec, int bound)
{
        /* Largest number of seconds whose microsecond count fits in 31 bits. */
        const int max_sec = 0x7FFFFFFF/1000000;
        int delta;

        if (bound <= 1000000 || delta_sec > max_sec-1)
                return bound;
        if (delta_sec < -max_sec)
                return -max_sec * 1000000;
        delta = delta_sec * 1000000;
        if (delta > bound)
                delta = bound;
        return delta;
}
#endif
1105
/*
 * Accumulated scheduler time base.  In this file it is advanced only by
 * psched_tick(), in the branch used when the clock source is not PSCHED_CPU.
 */
psched_time_t psched_time_base;

#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
/*
 * Both values are computed by psched_calibrate_clock():
 * psched_clock_scale is incremented once per halving of the measured
 * ticks-per-microsecond ratio until it reaches zero, and
 * psched_clock_per_hz is (psched_tick_per_us * (1000000/HZ)) shifted right
 * by that scale.
 */
psched_tdiff_t psched_clock_per_hz;
int psched_clock_scale;
#endif
1112
#ifdef PSCHED_WATCHER
/*
 * In the non-CPU branch of psched_tick() this holds the jiffies value seen
 * by the previous tick.  PSCHED_WATCHER doubles as the variable's type.
 */
PSCHED_WATCHER psched_time_mark;

static void psched_tick(unsigned long);

/* Timer that runs psched_tick(); the handler re-arms it on every run. */
static struct timer_list psched_timer =
        { function: psched_tick };

/*
 * Periodic housekeeping for the scheduler clock.  The argument is unused.
 *
 * CPU clock source: sample the clock once per second (HZ jiffies) and throw
 * the value away.  NOTE(review): presumably PSCHED_GET_TIME updates internal
 * wrap-around state as a side effect -- confirm against its definition.
 *
 * Other clock sources: add the jiffies elapsed since the previous run,
 * shifted left by PSCHED_JSCALE, to psched_time_base, remember the current
 * jiffies value, and run again in one hour (60*60*HZ jiffies).
 */
static void psched_tick(unsigned long dummy)
{
#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
        psched_time_t dummy_stamp;
        PSCHED_GET_TIME(dummy_stamp);
        /* It is OK up to 4GHz cpu */
        psched_timer.expires = jiffies + 1*HZ;
#else
        unsigned long now = jiffies;
        psched_time_base += ((u64)(now-psched_time_mark))<<PSCHED_JSCALE;
        psched_time_mark = now;
        psched_timer.expires = now + 60*60*HZ;
#endif
        add_timer(&psched_timer);
}
#endif
1137
1138#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
1139int __init psched_calibrate_clock(void)
1140{
1141        psched_time_t stamp, stamp1;
1142        struct timeval tv, tv1;
1143        psched_tdiff_t delay;
1144        long rdelay;
1145        unsigned long stop;
1146
1147#ifdef PSCHED_WATCHER
1148        psched_tick(0);
1149#endif
1150        stop = jiffies + HZ/10;
1151        PSCHED_GET_TIME(stamp);
1152        do_gettimeofday(&tv);
1153        while (time_before(jiffies, stop)) {
1154                barrier();
1155                cpu_relax();
1156        }
1157        PSCHED_GET_TIME(stamp1);
1158        do_gettimeofday(&tv1);
1159
1160        delay = PSCHED_TDIFF(stamp1, stamp);
1161        rdelay = tv1.tv_usec - tv.tv_usec;
1162        rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
1163        if (rdelay > delay)
1164                return -1;
1165        delay /= rdelay;
1166        psched_tick_per_us = delay;
1167        while ((delay>>=1) != 0)
1168                psched_clock_scale++;
1169        psched_us_per_tick = 1<<psched_clock_scale;
1170        psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
1171        return 0;
1172}
1173#endif
1174
1175int __init pktsched_init(void)
1176{
1177        struct rtnetlink_link *link_p;
1178
1179#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
1180        if (psched_calibrate_clock() < 0)
1181                return -1;
1182#elif PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES
1183        psched_tick_per_us = HZ<<PSCHED_JSCALE;
1184        psched_us_per_tick = 1000000;
1185#ifdef PSCHED_WATCHER
1186        psched_tick(0);
1187#endif
1188#endif
1189
1190        link_p = rtnetlink_links[PF_UNSPEC];
1191
1192        /* Setup rtnetlink links. It is made here to avoid
1193           exporting large number of public symbols.
1194         */
1195
1196        if (link_p) {
1197                link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
1198                link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
1199                link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
1200                link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
1201                link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1202                link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1203                link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1204                link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
1205        }
1206
1207#define INIT_QDISC(name) { \
1208          extern struct Qdisc_ops name##_qdisc_ops; \
1209          register_qdisc(& name##_qdisc_ops);       \
1210        }
1211
1212        INIT_QDISC(pfifo);
1213        INIT_QDISC(bfifo);
1214
1215#ifdef CONFIG_NET_SCH_CBQ
1216        INIT_QDISC(cbq);
1217#endif
1218#ifdef CONFIG_NET_SCH_HTB
1219        INIT_QDISC(htb);
1220#endif
1221#ifdef CONFIG_NET_SCH_CSZ
1222        INIT_QDISC(csz);
1223#endif
1224#ifdef CONFIG_NET_SCH_HPFQ
1225        INIT_QDISC(hpfq);
1226#endif
1227#ifdef CONFIG_NET_SCH_HFSC
1228        INIT_QDISC(hfsc);
1229#endif
1230#ifdef CONFIG_NET_SCH_RED
1231        INIT_QDISC(red);
1232#endif
1233#ifdef CONFIG_NET_SCH_GRED
1234       INIT_QDISC(gred);
1235#endif
1236#ifdef CONFIG_NET_SCH_INGRESS
1237       INIT_QDISC(ingress);
1238#endif
1239#ifdef CONFIG_NET_SCH_DSMARK
1240       INIT_QDISC(dsmark);
1241#endif
1242#ifdef CONFIG_NET_SCH_SFQ
1243        INIT_QDISC(sfq);
1244#endif
1245#ifdef CONFIG_NET_SCH_TBF
1246        INIT_QDISC(tbf);
1247#endif
1248#ifdef CONFIG_NET_SCH_TEQL
1249        teql_init();
1250#endif
1251#ifdef CONFIG_NET_SCH_PRIO
1252        INIT_QDISC(prio);
1253#endif
1254#ifdef CONFIG_NET_SCH_ATM
1255        INIT_QDISC(atm);
1256#endif
1257#ifdef CONFIG_NET_CLS
1258        tc_filter_init();
1259#endif
1260
1261#ifdef CONFIG_PROC_FS
1262        create_proc_read_entry("net/psched", 0, 0, psched_read_proc, NULL);
1263#endif
1264
1265        return 0;
1266}
1267
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.