linux/net/sched/sch_generic.c
/*
 * net/sched/sch_generic.c      Generic packet scheduler routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Main qdisc structure lock.

   However, modifications to data participating in scheduling
   must additionally be protected by the dev->queue_lock spinlock.

   The idea is the following:
   - enqueue and dequeue are serialized via the top-level device
     spinlock dev->queue_lock.
   - tree walking is protected by read_lock(qdisc_tree_lock),
     and this lock is used only in process context.
   - updates to the tree are made only under the rtnl semaphore,
     hence the write lock may be taken without disabling local BH.

   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
 */
DEFINE_RWLOCK(qdisc_tree_lock);

void qdisc_lock_tree(struct net_device *dev)
{
        write_lock(&qdisc_tree_lock);
        spin_lock_bh(&dev->queue_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
        spin_unlock_bh(&dev->queue_lock);
        write_unlock(&qdisc_tree_lock);
}

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is held, the other must be free.
 */
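
/* Illustrative sketch (condensed from qdisc_restart() below, not a separate
   code path): the transmit path honours the rule above by never holding
   both locks at once:

        spin_lock(&dev->queue_lock);       dequeue under the queue lock
        skb = q->dequeue(q);
        spin_unlock(&dev->queue_lock);     release it before touching the driver
        netif_tx_lock(dev);                skipped for LLTX drivers
        dev_hard_start_xmit(skb, dev);
        netif_tx_unlock(dev);
        spin_lock(&dev->queue_lock);       reacquire to look at the qdisc again
 */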


/* Kick the device.
   Note that this procedure can be called by a watchdog timer, so
   we do not check the dev->tbusy flag here.

   Returns:  0  - queue is empty.
            >0  - queue is not empty, but throttled.
            <0  - queue is not empty. Device is throttled, if dev->tbusy != 0.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

static inline int qdisc_restart(struct net_device *dev)
{
        struct Qdisc *q = dev->qdisc;
        struct sk_buff *skb;

        /* Dequeue packet */
        if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
                unsigned nolock = (dev->features & NETIF_F_LLTX);

                dev->gso_skb = NULL;

                /*
                 * When the driver has LLTX set it does its own locking
                 * in start_xmit. No need to add additional overhead by
                 * locking again. These checks are worth it because
                 * even uncongested locks can be quite expensive.
                 * The driver can do a trylock, as we do here; on lock
                 * contention it should return NETDEV_TX_LOCKED and the
                 * packet will be requeued.
                 */
                if (!nolock) {
                        if (!netif_tx_trylock(dev)) {
                        collision:
                                /* So, someone grabbed the driver. */

                                /* It may be a transient configuration error,
                                   when hard_start_xmit() recurses. We detect
                                   it by checking the xmit owner and drop the
                                   packet when a dead loop is found.
                                */
                                if (dev->xmit_lock_owner == smp_processor_id()) {
                                        kfree_skb(skb);
                                        if (net_ratelimit())
                                                printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
                                        return -1;
                                }
                                __get_cpu_var(netdev_rx_stat).cpu_collision++;
                                goto requeue;
                        }
                }

                {
                        /* And release the queue lock */
                        spin_unlock(&dev->queue_lock);

                        if (!netif_queue_stopped(dev)) {
                                int ret;

                                ret = dev_hard_start_xmit(skb, dev);
                                if (ret == NETDEV_TX_OK) {
                                        if (!nolock) {
                                                netif_tx_unlock(dev);
                                        }
                                        spin_lock(&dev->queue_lock);
                                        return -1;
                                }
                                if (ret == NETDEV_TX_LOCKED && nolock) {
                                        spin_lock(&dev->queue_lock);
                                        goto collision;
                                }
                        }

                        /* NETDEV_TX_BUSY - we need to requeue */
                        /* Release the driver */
                        if (!nolock) {
                                netif_tx_unlock(dev);
                        }
                        spin_lock(&dev->queue_lock);
                        q = dev->qdisc;
                }

                /* Device kicked us out :(
                   This is possible in the following cases:

                   0. driver is locked
                   1. fastroute is enabled
                   2. device cannot determine busy state
                      before start of transmission (f.e. dialout)
                   3. device is buggy (ppp)
                 */

requeue:
                if (skb->next)
                        dev->gso_skb = skb;
                else
                        q->ops->requeue(skb, q);
                netif_schedule(dev);
                return 1;
        }
        BUG_ON((int) q->q.qlen < 0);
        return q->q.qlen;
}

void __qdisc_run(struct net_device *dev)
{
        if (unlikely(dev->qdisc == &noop_qdisc))
                goto out;

        while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
                /* NOTHING */;

out:
        clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}
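
/* For context, a minimal sketch of the calling side (it lives in
   include/net/pkt_sched.h, not in this file, so the exact wording may
   differ): the __LINK_STATE_QDISC_RUNNING bit guarantees that only one
   CPU runs a device's qdisc at a time; it is set before entering
   __qdisc_run() and cleared above when the loop finishes:

        static inline void qdisc_run(struct net_device *dev)
        {
                if (!netif_queue_stopped(dev) &&
                    !test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
                        __qdisc_run(dev);
        }
 */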

static void dev_watchdog(unsigned long arg)
{
        struct net_device *dev = (struct net_device *)arg;

        netif_tx_lock(dev);
        if (dev->qdisc != &noop_qdisc) {
                if (netif_device_present(dev) &&
                    netif_running(dev) &&
                    netif_carrier_ok(dev)) {
                        if (netif_queue_stopped(dev) &&
                            time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

                                printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
                                       dev->name);
                                dev->tx_timeout(dev);
                        }
                        if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                                dev_hold(dev);
                }
        }
        netif_tx_unlock(dev);

        dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
        init_timer(&dev->watchdog_timer);
        dev->watchdog_timer.data = (unsigned long)dev;
        dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
        if (dev->tx_timeout) {
                if (dev->watchdog_timeo <= 0)
                        dev->watchdog_timeo = 5*HZ;
                if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                        dev_hold(dev);
        }
}

static void dev_watchdog_up(struct net_device *dev)
{
        __netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
        netif_tx_lock_bh(dev);
        if (del_timer(&dev->watchdog_timer))
                dev_put(dev);
        netif_tx_unlock_bh(dev);
}

void netif_carrier_on(struct net_device *dev)
{
        if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
        if (netif_running(dev))
                __netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
        if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
}
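
/* Usage sketch (hypothetical driver, not part of this file): a NIC driver
   opts into the watchdog and carrier handling above by filling in
   tx_timeout/watchdog_timeo and signalling link transitions, roughly:

        static void mynic_tx_timeout(struct net_device *dev)
        {
                reset the hardware TX path, then
                netif_wake_queue(dev);
        }

        in the probe/setup path:
                dev->tx_timeout = mynic_tx_timeout;
                dev->watchdog_timeo = 5 * HZ;

        in the link-change interrupt handler:
                if (link_up)
                        netif_carrier_on(dev);     also rearms the watchdog
                else
                        netif_carrier_off(dev);

   "mynic_tx_timeout" and "link_up" are placeholder names for illustration.
 */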

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
        kfree_skb(skb);
        return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
        return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        if (net_ratelimit())
                printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
                       skb->dev->name);
        kfree_skb(skb);
        return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
        .id             =       "noop",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

struct Qdisc noop_qdisc = {
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noop_qdisc_ops,
        .list           =       LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
        .id             =       "noqueue",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
        .enqueue        =       NULL,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noqueue_qdisc_ops,
        .list           =       LIST_HEAD_INIT(noqueue_qdisc.list),
};
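
/* Note: noqueue_qdisc deliberately leaves .enqueue NULL. dev_queue_xmit()
   only hands packets to a qdisc when q->enqueue is non-NULL, so traffic on
   a "noqueue" device bypasses queueing and goes straight to the driver,
   whereas the noop entries above simply drop whatever they are given. */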


static const u8 prio2band[TC_PRIO_MAX+1] =
        { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };
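
/* Worked example of the mapping above (values from linux/pkt_sched.h):
   skb->priority TC_PRIO_BESTEFFORT (0)  -> band 1,
                 TC_PRIO_BULK (2)        -> band 2,
                 TC_PRIO_INTERACTIVE (6)
                 and TC_PRIO_CONTROL (7) -> band 0.
   Lower-numbered bands are dequeued first, so interactive and control
   traffic jumps ahead of best-effort and bulk traffic. */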

/* 3-band FIFO queue: old style, but should be a bit faster than
   the generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
                                             struct Qdisc *qdisc)
{
        struct sk_buff_head *list = qdisc_priv(qdisc);
        return list + prio2band[skb->priority & TC_PRIO_MAX];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        struct sk_buff_head *list = prio2list(skb, qdisc);

        if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
                qdisc->q.qlen++;
                return __qdisc_enqueue_tail(skb, qdisc, list);
        }

        return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
                if (!skb_queue_empty(list + prio)) {
                        qdisc->q.qlen--;
                        return __qdisc_dequeue_head(qdisc, list + prio);
                }
        }

        return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        qdisc->q.qlen++;
        return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                __qdisc_reset_queue(qdisc, list + prio);

        qdisc->qstats.backlog = 0;
        qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
        struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

        memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
        RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
        return skb->len;

rtattr_failure:
        return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                skb_queue_head_init(list + prio);

        return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
        .id             =       "pfifo_fast",
        .priv_size      =       PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
        .enqueue        =       pfifo_fast_enqueue,
        .dequeue        =       pfifo_fast_dequeue,
        .requeue        =       pfifo_fast_requeue,
        .init           =       pfifo_fast_init,
        .reset          =       pfifo_fast_reset,
        .dump           =       pfifo_fast_dump,
        .owner          =       THIS_MODULE,
};

struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
        void *p;
        struct Qdisc *sch;
        unsigned int size;
        int err = -ENOBUFS;

        /* ensure that the Qdisc and the private data are 32-byte aligned */
        size = QDISC_ALIGN(sizeof(*sch));
        size += ops->priv_size + (QDISC_ALIGNTO - 1);

        p = kzalloc(size, GFP_KERNEL);
        if (!p)
                goto errout;
        sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
        sch->padded = (char *) sch - (char *) p;

        INIT_LIST_HEAD(&sch->list);
        skb_queue_head_init(&sch->q);
        sch->ops = ops;
        sch->enqueue = ops->enqueue;
        sch->dequeue = ops->dequeue;
        sch->dev = dev;
        dev_hold(dev);
        sch->stats_lock = &dev->queue_lock;
        atomic_set(&sch->refcnt, 1);

        return sch;
errout:
        return ERR_PTR(err);    /* err is already negative (-ENOBUFS) */
}

struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
                                 unsigned int parentid)
{
        struct Qdisc *sch;

        sch = qdisc_alloc(dev, ops);
        if (IS_ERR(sch))
                goto errout;
        sch->parent = parentid;

        if (!ops->init || ops->init(sch, NULL) == 0)
                return sch;

        qdisc_destroy(sch);
errout:
        return NULL;
}
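
/* Usage sketch (simplified, not taken verbatim from any caller): a classful
   qdisc typically uses qdisc_create_dflt() to give each of its classes a
   default plain-FIFO child, along the lines of:

        struct Qdisc *child;

        child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
                                  TC_H_MAKE(sch->handle, band + 1));
        if (child == NULL)
                fall back to noop_qdisc or fail the configuration;

   "band" is just an illustrative loop variable; pfifo_qdisc_ops is the
   plain FIFO from net/sched/sch_fifo.c. dev_activate() below does the
   equivalent for the device root, using pfifo_fast_ops instead. */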

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (ops->reset)
                ops->reset(qdisc);
}

/* This is the RCU callback function to clean up a qdisc when there
 * are no further references to it */

static void __qdisc_destroy(struct rcu_head *head)
{
        struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
        kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
        struct Qdisc_ops  *ops = qdisc->ops;

        if (qdisc->flags & TCQ_F_BUILTIN ||
            !atomic_dec_and_test(&qdisc->refcnt))
                return;

        list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
        gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
        if (ops->reset)
                ops->reset(qdisc);
        if (ops->destroy)
                ops->destroy(qdisc);

        module_put(ops->owner);
        dev_put(qdisc->dev);
        call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
        /* If no queueing discipline is attached to the device, create a
           default one: pfifo_fast for devices that need queueing, and
           noqueue_qdisc for virtual interfaces.
         */

        if (dev->qdisc_sleeping == &noop_qdisc) {
                struct Qdisc *qdisc;
                if (dev->tx_queue_len) {
                        qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
                                                  TC_H_ROOT);
                        if (qdisc == NULL) {
                                printk(KERN_INFO "%s: activation failed\n", dev->name);
                                return;
                        }
                        write_lock(&qdisc_tree_lock);
                        list_add_tail(&qdisc->list, &dev->qdisc_list);
                        write_unlock(&qdisc_tree_lock);
                } else {
                        qdisc = &noqueue_qdisc;
                }
                write_lock(&qdisc_tree_lock);
                dev->qdisc_sleeping = qdisc;
                write_unlock(&qdisc_tree_lock);
        }

        if (!netif_carrier_ok(dev))
                /* Delay activation until the next carrier-on event */
                return;

        spin_lock_bh(&dev->queue_lock);
        rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
        if (dev->qdisc != &noqueue_qdisc) {
                dev->trans_start = jiffies;
                dev_watchdog_up(dev);
        }
        spin_unlock_bh(&dev->queue_lock);
}

void dev_deactivate(struct net_device *dev)
{
        struct Qdisc *qdisc;

        spin_lock_bh(&dev->queue_lock);
        qdisc = dev->qdisc;
        dev->qdisc = &noop_qdisc;

        qdisc_reset(qdisc);

        spin_unlock_bh(&dev->queue_lock);

        dev_watchdog_down(dev);

        /* Wait for outstanding dev_queue_xmit calls. */
        synchronize_rcu();

        /* Wait for outstanding qdisc_run calls. */
        while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
                yield();

        if (dev->gso_skb) {
                kfree_skb(dev->gso_skb);
                dev->gso_skb = NULL;
        }
}

void dev_init_scheduler(struct net_device *dev)
{
        qdisc_lock_tree(dev);
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        INIT_LIST_HEAD(&dev->qdisc_list);
        qdisc_unlock_tree(dev);

        dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
        struct Qdisc *qdisc;

        qdisc_lock_tree(dev);
        qdisc = dev->qdisc_sleeping;
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
        if ((qdisc = dev->qdisc_ingress) != NULL) {
                dev->qdisc_ingress = NULL;
                qdisc_destroy(qdisc);
        }
#endif
        BUG_TRAP(!timer_pending(&dev->watchdog_timer));
        qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);