linux/drivers/block/cfq-iosched.c
<<
>>
Prefs
   1/*
   2 *  linux/drivers/block/cfq-iosched.c
   3 *
   4 *  CFQ, or complete fairness queueing, disk scheduler.
   5 *
   6 *  Based on ideas from a previously unfinished io
   7 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
   8 *
   9 *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
  10 */
  11#include <linux/kernel.h>
  12#include <linux/fs.h>
  13#include <linux/blkdev.h>
  14#include <linux/elevator.h>
  15#include <linux/bio.h>
  16#include <linux/config.h>
  17#include <linux/module.h>
  18#include <linux/slab.h>
  19#include <linux/init.h>
  20#include <linux/compiler.h>
  21#include <linux/hash.h>
  22#include <linux/rbtree.h>
  23#include <linux/mempool.h>
  24
  25static unsigned long max_elapsed_crq;
  26static unsigned long max_elapsed_dispatch;
  27
  28/*
  29 * tunables
  30 */
  31static int cfq_quantum = 4;             /* max queue in one round of service */
  32static int cfq_queued = 8;              /* minimum rq allocate limit per-queue*/
  33static int cfq_service = HZ;            /* period over which service is avg */
  34static int cfq_fifo_expire_r = HZ / 2;  /* fifo timeout for sync requests */
  35static int cfq_fifo_expire_w = 5 * HZ;  /* fifo timeout for async requests */
  36static int cfq_fifo_rate = HZ / 8;      /* fifo expiry rate */
  37static int cfq_back_max = 16 * 1024;    /* maximum backwards seek, in KiB */
  38static int cfq_back_penalty = 2;        /* penalty of a backwards seek */
  39
  40/*
  41 * for the hash of cfqq inside the cfqd
  42 */
  43#define CFQ_QHASH_SHIFT         6
  44#define CFQ_QHASH_ENTRIES       (1 << CFQ_QHASH_SHIFT)
  45#define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash)
  46
  47/*
  48 * for the hash of crq inside the cfqq
  49 */
  50#define CFQ_MHASH_SHIFT         6
  51#define CFQ_MHASH_BLOCK(sec)    ((sec) >> 3)
  52#define CFQ_MHASH_ENTRIES       (1 << CFQ_MHASH_SHIFT)
  53#define CFQ_MHASH_FN(sec)       hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT)
  54#define rq_hash_key(rq)         ((rq)->sector + (rq)->nr_sectors)
  55#define list_entry_hash(ptr)    hlist_entry((ptr), struct cfq_rq, hash)
  56
  57#define list_entry_cfqq(ptr)    list_entry((ptr), struct cfq_queue, cfq_list)
  58
  59#define RQ_DATA(rq)             (rq)->elevator_private
  60
  61/*
  62 * rb-tree defines
  63 */
  64#define RB_NONE                 (2)
  65#define RB_EMPTY(node)          ((node)->rb_node == NULL)
  66#define RB_CLEAR_COLOR(node)    (node)->rb_color = RB_NONE
  67#define RB_CLEAR(node)          do {    \
  68        (node)->rb_parent = NULL;       \
  69        RB_CLEAR_COLOR((node));         \
  70        (node)->rb_right = NULL;        \
  71        (node)->rb_left = NULL;         \
  72} while (0)
  73#define RB_CLEAR_ROOT(root)     ((root)->rb_node = NULL)
  74#define ON_RB(node)             ((node)->rb_color != RB_NONE)
  75#define rb_entry_crq(node)      rb_entry((node), struct cfq_rq, rb_node)
  76#define rq_rb_key(rq)           (rq)->sector
  77
  78/*
  79 * threshold for switching off non-tag accounting
  80 */
  81#define CFQ_MAX_TAG             (4)
  82
  83/*
  84 * sort key types and names
  85 */
  86enum {
  87        CFQ_KEY_PGID,
  88        CFQ_KEY_TGID,
  89        CFQ_KEY_UID,
  90        CFQ_KEY_GID,
  91        CFQ_KEY_LAST,
  92};
  93
  94static char *cfq_key_types[] = { "pgid", "tgid", "uid", "gid", NULL };
  95
  96static kmem_cache_t *crq_pool;
  97static kmem_cache_t *cfq_pool;
  98static kmem_cache_t *cfq_ioc_pool;
  99
 100struct cfq_data {
 101        struct list_head rr_list;
 102        struct list_head empty_list;
 103
 104        struct hlist_head *cfq_hash;
 105        struct hlist_head *crq_hash;
 106
 107        /* queues on rr_list (ie they have pending requests */
 108        unsigned int busy_queues;
 109
 110        unsigned int max_queued;
 111
 112        atomic_t ref;
 113
 114        int key_type;
 115
 116        mempool_t *crq_pool;
 117
 118        request_queue_t *queue;
 119
 120        sector_t last_sector;
 121
 122        int rq_in_driver;
 123
 124        /*
 125         * tunables, see top of file
 126         */
 127        unsigned int cfq_quantum;
 128        unsigned int cfq_queued;
 129        unsigned int cfq_fifo_expire_r;
 130        unsigned int cfq_fifo_expire_w;
 131        unsigned int cfq_fifo_batch_expire;
 132        unsigned int cfq_back_penalty;
 133        unsigned int cfq_back_max;
 134        unsigned int find_best_crq;
 135
 136        unsigned int cfq_tagged;
 137};
 138
 139struct cfq_queue {
 140        /* reference count */
 141        atomic_t ref;
 142        /* parent cfq_data */
 143        struct cfq_data *cfqd;
 144        /* hash of mergeable requests */
 145        struct hlist_node cfq_hash;
 146        /* hash key */
 147        unsigned long key;
 148        /* whether queue is on rr (or empty) list */
 149        int on_rr;
 150        /* on either rr or empty list of cfqd */
 151        struct list_head cfq_list;
 152        /* sorted list of pending requests */
 153        struct rb_root sort_list;
 154        /* if fifo isn't expired, next request to serve */
 155        struct cfq_rq *next_crq;
 156        /* requests queued in sort_list */
 157        int queued[2];
 158        /* currently allocated requests */
 159        int allocated[2];
 160        /* fifo list of requests in sort_list */
 161        struct list_head fifo[2];
 162        /* last time fifo expired */
 163        unsigned long last_fifo_expire;
 164
 165        int key_type;
 166
 167        unsigned long service_start;
 168        unsigned long service_used;
 169
 170        unsigned int max_rate;
 171
 172        /* number of requests that have been handed to the driver */
 173        int in_flight;
 174        /* number of currently allocated requests */
 175        int alloc_limit[2];
 176};
 177
 178struct cfq_rq {
 179        struct rb_node rb_node;
 180        sector_t rb_key;
 181        struct request *request;
 182        struct hlist_node hash;
 183
 184        struct cfq_queue *cfq_queue;
 185        struct cfq_io_context *io_context;
 186
 187        unsigned long service_start;
 188        unsigned long queue_start;
 189
 190        unsigned int in_flight : 1;
 191        unsigned int accounted : 1;
 192        unsigned int is_sync   : 1;
 193        unsigned int is_write  : 1;
 194};
 195
 196static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned long);
 197static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *);
 198static void cfq_update_next_crq(struct cfq_rq *);
 199static void cfq_put_cfqd(struct cfq_data *cfqd);
 200
 201/*
 202 * what the fairness is based on (ie how processes are grouped and
 203 * differentiated)
 204 */
 205static inline unsigned long
 206cfq_hash_key(struct cfq_data *cfqd, struct task_struct *tsk)
 207{
 208        /*
 209         * optimize this so that ->key_type is the offset into the struct
 210         */
 211        switch (cfqd->key_type) {
 212                case CFQ_KEY_PGID:
 213                        return process_group(tsk);
 214                default:
 215                case CFQ_KEY_TGID:
 216                        return tsk->tgid;
 217                case CFQ_KEY_UID:
 218                        return tsk->uid;
 219                case CFQ_KEY_GID:
 220                        return tsk->gid;
 221        }
 222}
 223
 224/*
 225 * lots of deadline iosched dupes, can be abstracted later...
 226 */
 227static inline void cfq_del_crq_hash(struct cfq_rq *crq)
 228{
 229        hlist_del_init(&crq->hash);
 230}
 231
 232static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq)
 233{
 234        cfq_del_crq_hash(crq);
 235
 236        if (q->last_merge == crq->request)
 237                q->last_merge = NULL;
 238
 239        cfq_update_next_crq(crq);
 240}
 241
 242static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
 243{
 244        const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request));
 245
 246        BUG_ON(!hlist_unhashed(&crq->hash));
 247
 248        hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
 249}
 250
 251static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
 252{
 253        struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
 254        struct hlist_node *entry, *next;
 255
 256        hlist_for_each_safe(entry, next, hash_list) {
 257                struct cfq_rq *crq = list_entry_hash(entry);
 258                struct request *__rq = crq->request;
 259
 260                BUG_ON(hlist_unhashed(&crq->hash));
 261
 262                if (!rq_mergeable(__rq)) {
 263                        cfq_del_crq_hash(crq);
 264                        continue;
 265                }
 266
 267                if (rq_hash_key(__rq) == offset)
 268                        return __rq;
 269        }
 270
 271        return NULL;
 272}
 273
 274/*
 275 * Lifted from AS - choose which of crq1 and crq2 that is best served now.
 276 * We choose the request that is closest to the head right now. Distance
 277 * behind the head are penalized and only allowed to a certain extent.
 278 */
 279static struct cfq_rq *
 280cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
 281{
 282        sector_t last, s1, s2, d1 = 0, d2 = 0;
 283        int r1_wrap = 0, r2_wrap = 0;   /* requests are behind the disk head */
 284        unsigned long back_max;
 285
 286        if (crq1 == NULL || crq1 == crq2)
 287                return crq2;
 288        if (crq2 == NULL)
 289                return crq1;
 290
 291        s1 = crq1->request->sector;
 292        s2 = crq2->request->sector;
 293
 294        last = cfqd->last_sector;
 295
 296#if 0
 297        if (!list_empty(&cfqd->queue->queue_head)) {
 298                struct list_head *entry = &cfqd->queue->queue_head;
 299                unsigned long distance = ~0UL;
 300                struct request *rq;
 301
 302                while ((entry = entry->prev) != &cfqd->queue->queue_head) {
 303                        rq = list_entry_rq(entry);
 304
 305                        if (blk_barrier_rq(rq))
 306                                break;
 307
 308                        if (distance < abs(s1 - rq->sector + rq->nr_sectors)) {
 309                                distance = abs(s1 - rq->sector +rq->nr_sectors);
 310                                last = rq->sector + rq->nr_sectors;
 311                        }
 312                        if (distance < abs(s2 - rq->sector + rq->nr_sectors)) {
 313                                distance = abs(s2 - rq->sector +rq->nr_sectors);
 314                                last = rq->sector + rq->nr_sectors;
 315                        }
 316                }
 317        }
 318#endif
 319
 320        /*
 321         * by definition, 1KiB is 2 sectors
 322         */
 323        back_max = cfqd->cfq_back_max * 2;
 324
 325        /*
 326         * Strict one way elevator _except_ in the case where we allow
 327         * short backward seeks which are biased as twice the cost of a
 328         * similar forward seek.
 329         */
 330        if (s1 >= last)
 331                d1 = s1 - last;
 332        else if (s1 + back_max >= last)
 333                d1 = (last - s1) * cfqd->cfq_back_penalty;
 334        else
 335                r1_wrap = 1;
 336
 337        if (s2 >= last)
 338                d2 = s2 - last;
 339        else if (s2 + back_max >= last)
 340                d2 = (last - s2) * cfqd->cfq_back_penalty;
 341        else
 342                r2_wrap = 1;
 343
 344        /* Found required data */
 345        if (!r1_wrap && r2_wrap)
 346                return crq1;
 347        else if (!r2_wrap && r1_wrap)
 348                return crq2;
 349        else if (r1_wrap && r2_wrap) {
 350                /* both behind the head */
 351                if (s1 <= s2)
 352                        return crq1;
 353                else
 354                        return crq2;
 355        }
 356
 357        /* Both requests in front of the head */
 358        if (d1 < d2)
 359                return crq1;
 360        else if (d2 < d1)
 361                return crq2;
 362        else {
 363                if (s1 >= s2)
 364                        return crq1;
 365                else
 366                        return crq2;
 367        }
 368}
 369
 370/*
 371 * would be nice to take fifo expire time into account as well
 372 */
 373static struct cfq_rq *
 374cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 375                  struct cfq_rq *last)
 376{
 377        struct cfq_rq *crq_next = NULL, *crq_prev = NULL;
 378        struct rb_node *rbnext, *rbprev;
 379
 380        if (!ON_RB(&last->rb_node))
 381                return NULL;
 382
 383        if ((rbnext = rb_next(&last->rb_node)) == NULL)
 384                rbnext = rb_first(&cfqq->sort_list);
 385
 386        rbprev = rb_prev(&last->rb_node);
 387
 388        if (rbprev)
 389                crq_prev = rb_entry_crq(rbprev);
 390        if (rbnext)
 391                crq_next = rb_entry_crq(rbnext);
 392
 393        return cfq_choose_req(cfqd, crq_next, crq_prev);
 394}
 395
 396static void cfq_update_next_crq(struct cfq_rq *crq)
 397{
 398        struct cfq_queue *cfqq = crq->cfq_queue;
 399
 400        if (cfqq->next_crq == crq)
 401                cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq);
 402}
 403
 404static int cfq_check_sort_rr_list(struct cfq_queue *cfqq)
 405{
 406        struct list_head *head = &cfqq->cfqd->rr_list;
 407        struct list_head *next, *prev;
 408
 409        /*
 410         * list might still be ordered
 411         */
 412        next = cfqq->cfq_list.next;
 413        if (next != head) {
 414                struct cfq_queue *cnext = list_entry_cfqq(next);
 415
 416                if (cfqq->service_used > cnext->service_used)
 417                        return 1;
 418        }
 419
 420        prev = cfqq->cfq_list.prev;
 421        if (prev != head) {
 422                struct cfq_queue *cprev = list_entry_cfqq(prev);
 423
 424                if (cfqq->service_used < cprev->service_used)
 425                        return 1;
 426        }
 427
 428        return 0;
 429}
 430
 431static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue)
 432{
 433        struct list_head *entry = &cfqq->cfqd->rr_list;
 434
 435        if (!cfqq->on_rr)
 436                return;
 437        if (!new_queue && !cfq_check_sort_rr_list(cfqq))
 438                return;
 439
 440        list_del(&cfqq->cfq_list);
 441
 442        /*
 443         * sort by our mean service_used, sub-sort by in-flight requests
 444         */
 445        while ((entry = entry->prev) != &cfqq->cfqd->rr_list) {
 446                struct cfq_queue *__cfqq = list_entry_cfqq(entry);
 447
 448                if (cfqq->service_used > __cfqq->service_used)
 449                        break;
 450                else if (cfqq->service_used == __cfqq->service_used) {
 451                        struct list_head *prv;
 452
 453                        while ((prv = entry->prev) != &cfqq->cfqd->rr_list) {
 454                                __cfqq = list_entry_cfqq(prv);
 455
 456                                WARN_ON(__cfqq->service_used > cfqq->service_used);
 457                                if (cfqq->service_used != __cfqq->service_used)
 458                                        break;
 459                                if (cfqq->in_flight > __cfqq->in_flight)
 460                                        break;
 461
 462                                entry = prv;
 463                        }
 464                }
 465        }
 466
 467        list_add(&cfqq->cfq_list, entry);
 468}
 469
 470/*
 471 * add to busy list of queues for service, trying to be fair in ordering
 472 * the pending list according to requests serviced
 473 */
 474static inline void
 475cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 476{
 477        /*
 478         * it's currently on the empty list
 479         */
 480        cfqq->on_rr = 1;
 481        cfqd->busy_queues++;
 482
 483        if (time_after(jiffies, cfqq->service_start + cfq_service))
 484                cfqq->service_used >>= 3;
 485
 486        cfq_sort_rr_list(cfqq, 1);
 487}
 488
 489static inline void
 490cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 491{
 492        list_move(&cfqq->cfq_list, &cfqd->empty_list);
 493        cfqq->on_rr = 0;
 494
 495        BUG_ON(!cfqd->busy_queues);
 496        cfqd->busy_queues--;
 497}
 498
 499/*
 500 * rb tree support functions
 501 */
 502static inline void cfq_del_crq_rb(struct cfq_rq *crq)
 503{
 504        struct cfq_queue *cfqq = crq->cfq_queue;
 505
 506        if (ON_RB(&crq->rb_node)) {
 507                struct cfq_data *cfqd = cfqq->cfqd;
 508
 509                BUG_ON(!cfqq->queued[crq->is_sync]);
 510
 511                cfq_update_next_crq(crq);
 512
 513                cfqq->queued[crq->is_sync]--;
 514                rb_erase(&crq->rb_node, &cfqq->sort_list);
 515                RB_CLEAR_COLOR(&crq->rb_node);
 516
 517                if (RB_EMPTY(&cfqq->sort_list) && cfqq->on_rr)
 518                        cfq_del_cfqq_rr(cfqd, cfqq);
 519        }
 520}
 521
 522static struct cfq_rq *
 523__cfq_add_crq_rb(struct cfq_rq *crq)
 524{
 525        struct rb_node **p = &crq->cfq_queue->sort_list.rb_node;
 526        struct rb_node *parent = NULL;
 527        struct cfq_rq *__crq;
 528
 529        while (*p) {
 530                parent = *p;
 531                __crq = rb_entry_crq(parent);
 532
 533                if (crq->rb_key < __crq->rb_key)
 534                        p = &(*p)->rb_left;
 535                else if (crq->rb_key > __crq->rb_key)
 536                        p = &(*p)->rb_right;
 537                else
 538                        return __crq;
 539        }
 540
 541        rb_link_node(&crq->rb_node, parent, p);
 542        return NULL;
 543}
 544
 545static void cfq_add_crq_rb(struct cfq_rq *crq)
 546{
 547        struct cfq_queue *cfqq = crq->cfq_queue;
 548        struct cfq_data *cfqd = cfqq->cfqd;
 549        struct request *rq = crq->request;
 550        struct cfq_rq *__alias;
 551
 552        crq->rb_key = rq_rb_key(rq);
 553        cfqq->queued[crq->is_sync]++;
 554
 555        /*
 556         * looks a little odd, but the first insert might return an alias.
 557         * if that happens, put the alias on the dispatch list
 558         */
 559        while ((__alias = __cfq_add_crq_rb(crq)) != NULL)
 560                cfq_dispatch_sort(cfqd->queue, __alias);
 561
 562        rb_insert_color(&crq->rb_node, &cfqq->sort_list);
 563
 564        if (!cfqq->on_rr)
 565                cfq_add_cfqq_rr(cfqd, cfqq);
 566
 567        /*
 568         * check if this request is a better next-serve candidate
 569         */
 570        cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
 571}
 572
 573static inline void
 574cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
 575{
 576        if (ON_RB(&crq->rb_node)) {
 577                rb_erase(&crq->rb_node, &cfqq->sort_list);
 578                cfqq->queued[crq->is_sync]--;
 579        }
 580
 581        cfq_add_crq_rb(crq);
 582}
 583
 584static struct request *
 585cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
 586{
 587        const unsigned long key = cfq_hash_key(cfqd, current);
 588        struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, key);
 589        struct rb_node *n;
 590
 591        if (!cfqq)
 592                goto out;
 593
 594        n = cfqq->sort_list.rb_node;
 595        while (n) {
 596                struct cfq_rq *crq = rb_entry_crq(n);
 597
 598                if (sector < crq->rb_key)
 599                        n = n->rb_left;
 600                else if (sector > crq->rb_key)
 601                        n = n->rb_right;
 602                else
 603                        return crq->request;
 604        }
 605
 606out:
 607        return NULL;
 608}
 609
 610/*
 611 * make sure the service time gets corrected on reissue of this request
 612 */
 613static void cfq_requeue_request(request_queue_t *q, struct request *rq)
 614{
 615        struct cfq_rq *crq = RQ_DATA(rq);
 616
 617        if (crq) {
 618                struct cfq_queue *cfqq = crq->cfq_queue;
 619
 620                if (cfqq->cfqd->cfq_tagged) {
 621                        cfqq->service_used--;
 622                        cfq_sort_rr_list(cfqq, 0);
 623                }
 624
 625                if (crq->accounted) {
 626                        crq->accounted = 0;
 627                        cfqq->cfqd->rq_in_driver--;
 628                }
 629        }
 630        list_add(&rq->queuelist, &q->queue_head);
 631}
 632
 633static void cfq_remove_request(request_queue_t *q, struct request *rq)
 634{
 635        struct cfq_rq *crq = RQ_DATA(rq);
 636
 637        if (crq) {
 638                cfq_remove_merge_hints(q, crq);
 639                list_del_init(&rq->queuelist);
 640
 641                if (crq->cfq_queue)
 642                        cfq_del_crq_rb(crq);
 643        }
 644}
 645
 646static int
 647cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
 648{
 649        struct cfq_data *cfqd = q->elevator->elevator_data;
 650        struct request *__rq;
 651        int ret;
 652
 653        ret = elv_try_last_merge(q, bio);
 654        if (ret != ELEVATOR_NO_MERGE) {
 655                __rq = q->last_merge;
 656                goto out_insert;
 657        }
 658
 659        __rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
 660        if (__rq) {
 661                BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
 662
 663                if (elv_rq_merge_ok(__rq, bio)) {
 664                        ret = ELEVATOR_BACK_MERGE;
 665                        goto out;
 666                }
 667        }
 668
 669        __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
 670        if (__rq) {
 671                if (elv_rq_merge_ok(__rq, bio)) {
 672                        ret = ELEVATOR_FRONT_MERGE;
 673                        goto out;
 674                }
 675        }
 676
 677        return ELEVATOR_NO_MERGE;
 678out:
 679        q->last_merge = __rq;
 680out_insert:
 681        *req = __rq;
 682        return ret;
 683}
 684
 685static void cfq_merged_request(request_queue_t *q, struct request *req)
 686{
 687        struct cfq_data *cfqd = q->elevator->elevator_data;
 688        struct cfq_rq *crq = RQ_DATA(req);
 689
 690        cfq_del_crq_hash(crq);
 691        cfq_add_crq_hash(cfqd, crq);
 692
 693        if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) {
 694                struct cfq_queue *cfqq = crq->cfq_queue;
 695
 696                cfq_update_next_crq(crq);
 697                cfq_reposition_crq_rb(cfqq, crq);
 698        }
 699
 700        q->last_merge = req;
 701}
 702
 703static void
 704cfq_merged_requests(request_queue_t *q, struct request *rq,
 705                    struct request *next)
 706{
 707        struct cfq_rq *crq = RQ_DATA(rq);
 708        struct cfq_rq *cnext = RQ_DATA(next);
 709
 710        cfq_merged_request(q, rq);
 711
 712        if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist)) {
 713                if (time_before(cnext->queue_start, crq->queue_start)) {
 714                        list_move(&rq->queuelist, &next->queuelist);
 715                        crq->queue_start = cnext->queue_start;
 716                }
 717        }
 718
 719        cfq_update_next_crq(cnext);
 720        cfq_remove_request(q, next);
 721}
 722
 723/*
 724 * we dispatch cfqd->cfq_quantum requests in total from the rr_list queues,
 725 * this function sector sorts the selected request to minimize seeks. we start
 726 * at cfqd->last_sector, not 0.
 727 */
 728static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq)
 729{
 730        struct cfq_data *cfqd = q->elevator->elevator_data;
 731        struct cfq_queue *cfqq = crq->cfq_queue;
 732        struct list_head *head = &q->queue_head, *entry = head;
 733        struct request *__rq;
 734        sector_t last;
 735
 736        cfq_del_crq_rb(crq);
 737        cfq_remove_merge_hints(q, crq);
 738        list_del(&crq->request->queuelist);
 739
 740        last = cfqd->last_sector;
 741        while ((entry = entry->prev) != head) {
 742                __rq = list_entry_rq(entry);
 743
 744                if (blk_barrier_rq(crq->request))
 745                        break;
 746                if (!blk_fs_request(crq->request))
 747                        break;
 748
 749                if (crq->request->sector > __rq->sector)
 750                        break;
 751                if (__rq->sector > last && crq->request->sector < last) {
 752                        last = crq->request->sector;
 753                        break;
 754                }
 755        }
 756
 757        cfqd->last_sector = last;
 758        crq->in_flight = 1;
 759        cfqq->in_flight++;
 760        list_add(&crq->request->queuelist, entry);
 761}
 762
 763/*
 764 * return expired entry, or NULL to just start from scratch in rbtree
 765 */
 766static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq)
 767{
 768        struct cfq_data *cfqd = cfqq->cfqd;
 769        const int reads = !list_empty(&cfqq->fifo[0]);
 770        const int writes = !list_empty(&cfqq->fifo[1]);
 771        unsigned long now = jiffies;
 772        struct cfq_rq *crq;
 773
 774        if (time_before(now, cfqq->last_fifo_expire + cfqd->cfq_fifo_batch_expire))
 775                return NULL;
 776
 777        crq = RQ_DATA(list_entry(cfqq->fifo[0].next, struct request, queuelist));
 778        if (reads && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_r)) {
 779                cfqq->last_fifo_expire = now;
 780                return crq;
 781        }
 782
 783        crq = RQ_DATA(list_entry(cfqq->fifo[1].next, struct request, queuelist));
 784        if (writes && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_w)) {
 785                cfqq->last_fifo_expire = now;
 786                return crq;
 787        }
 788
 789        return NULL;
 790}
 791
 792/*
 793 * dispatch a single request from given queue
 794 */
 795static inline void
 796cfq_dispatch_request(request_queue_t *q, struct cfq_data *cfqd,
 797                     struct cfq_queue *cfqq)
 798{
 799        struct cfq_rq *crq;
 800
 801        /*
 802         * follow expired path, else get first next available
 803         */
 804        if ((crq = cfq_check_fifo(cfqq)) == NULL) {
 805                if (cfqd->find_best_crq)
 806                        crq = cfqq->next_crq;
 807                else
 808                        crq = rb_entry_crq(rb_first(&cfqq->sort_list));
 809        }
 810
 811        cfqd->last_sector = crq->request->sector + crq->request->nr_sectors;
 812
 813        /*
 814         * finally, insert request into driver list
 815         */
 816        cfq_dispatch_sort(q, crq);
 817}
 818
 819static int cfq_dispatch_requests(request_queue_t *q, int max_dispatch)
 820{
 821        struct cfq_data *cfqd = q->elevator->elevator_data;
 822        struct cfq_queue *cfqq;
 823        struct list_head *entry, *tmp;
 824        int queued, busy_queues, first_round;
 825
 826        if (list_empty(&cfqd->rr_list))
 827                return 0;
 828
 829        queued = 0;
 830        first_round = 1;
 831restart:
 832        busy_queues = 0;
 833        list_for_each_safe(entry, tmp, &cfqd->rr_list) {
 834                cfqq = list_entry_cfqq(entry);
 835
 836                BUG_ON(RB_EMPTY(&cfqq->sort_list));
 837
 838                /*
 839                 * first round of queueing, only select from queues that
 840                 * don't already have io in-flight
 841                 */
 842                if (first_round && cfqq->in_flight)
 843                        continue;
 844
 845                cfq_dispatch_request(q, cfqd, cfqq);
 846
 847                if (!RB_EMPTY(&cfqq->sort_list))
 848                        busy_queues++;
 849
 850                queued++;
 851        }
 852
 853        if ((queued < max_dispatch) && (busy_queues || first_round)) {
 854                first_round = 0;
 855                goto restart;
 856        }
 857
 858        return queued;
 859}
 860
 861static inline void cfq_account_dispatch(struct cfq_rq *crq)
 862{
 863        struct cfq_queue *cfqq = crq->cfq_queue;
 864        struct cfq_data *cfqd = cfqq->cfqd;
 865        unsigned long now, elapsed;
 866
 867        if (!blk_fs_request(crq->request))
 868                return;
 869
 870        /*
 871         * accounted bit is necessary since some drivers will call
 872         * elv_next_request() many times for the same request (eg ide)
 873         */
 874        if (crq->accounted)
 875                return;
 876
 877        now = jiffies;
 878        if (cfqq->service_start == ~0UL)
 879                cfqq->service_start = now;
 880
 881        /*
 882         * on drives with tagged command queueing, command turn-around time
 883         * doesn't necessarily reflect the time spent processing this very
 884         * command inside the drive. so do the accounting differently there,
 885         * by just sorting on the number of requests
 886         */
 887        if (cfqd->cfq_tagged) {
 888                if (time_after(now, cfqq->service_start + cfq_service)) {
 889                        cfqq->service_start = now;
 890                        cfqq->service_used /= 10;
 891                }
 892
 893                cfqq->service_used++;
 894                cfq_sort_rr_list(cfqq, 0);
 895        }
 896
 897        elapsed = now - crq->queue_start;
 898        if (elapsed > max_elapsed_dispatch)
 899                max_elapsed_dispatch = elapsed;
 900
 901        crq->accounted = 1;
 902        crq->service_start = now;
 903
 904        if (++cfqd->rq_in_driver >= CFQ_MAX_TAG && !cfqd->cfq_tagged) {
 905                cfqq->cfqd->cfq_tagged = 1;
 906                printk("cfq: depth %d reached, tagging now on\n", CFQ_MAX_TAG);
 907        }
 908}
 909
 910static inline void
 911cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq)
 912{
 913        struct cfq_data *cfqd = cfqq->cfqd;
 914
 915        if (!crq->accounted)
 916                return;
 917
 918        WARN_ON(!cfqd->rq_in_driver);
 919        cfqd->rq_in_driver--;
 920
 921        if (!cfqd->cfq_tagged) {
 922                unsigned long now = jiffies;
 923                unsigned long duration = now - crq->service_start;
 924
 925                if (time_after(now, cfqq->service_start + cfq_service)) {
 926                        cfqq->service_start = now;
 927                        cfqq->service_used >>= 3;
 928                }
 929
 930                cfqq->service_used += duration;
 931                cfq_sort_rr_list(cfqq, 0);
 932
 933                if (duration > max_elapsed_crq)
 934                        max_elapsed_crq = duration;
 935        }
 936}
 937
 938static struct request *cfq_next_request(request_queue_t *q)
 939{
 940        struct cfq_data *cfqd = q->elevator->elevator_data;
 941        struct request *rq;
 942
 943        if (!list_empty(&q->queue_head)) {
 944                struct cfq_rq *crq;
 945dispatch:
 946                rq = list_entry_rq(q->queue_head.next);
 947
 948                if ((crq = RQ_DATA(rq)) != NULL) {
 949                        cfq_remove_merge_hints(q, crq);
 950                        cfq_account_dispatch(crq);
 951                }
 952
 953                return rq;
 954        }
 955
 956        if (cfq_dispatch_requests(q, cfqd->cfq_quantum))
 957                goto dispatch;
 958
 959        return NULL;
 960}
 961
 962/*
 963 * task holds one reference to the queue, dropped when task exits. each crq
 964 * in-flight on this queue also holds a reference, dropped when crq is freed.
 965 *
 966 * queue lock must be held here.
 967 */
 968static void cfq_put_queue(struct cfq_queue *cfqq)
 969{
 970        BUG_ON(!atomic_read(&cfqq->ref));
 971
 972        if (!atomic_dec_and_test(&cfqq->ref))
 973                return;
 974
 975        BUG_ON(rb_first(&cfqq->sort_list));
 976        BUG_ON(cfqq->on_rr);
 977
 978        cfq_put_cfqd(cfqq->cfqd);
 979
 980        /*
 981         * it's on the empty list and still hashed
 982         */
 983        list_del(&cfqq->cfq_list);
 984        hlist_del(&cfqq->cfq_hash);
 985        kmem_cache_free(cfq_pool, cfqq);
 986}
 987
 988static inline struct cfq_queue *
 989__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval)
 990{
 991        struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
 992        struct hlist_node *entry, *next;
 993
 994        hlist_for_each_safe(entry, next, hash_list) {
 995                struct cfq_queue *__cfqq = list_entry_qhash(entry);
 996
 997                if (__cfqq->key == key)
 998                        return __cfqq;
 999        }
1000
1001        return NULL;
1002}
1003
1004static struct cfq_queue *
1005cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key)
1006{
1007        return __cfq_find_cfq_hash(cfqd, key, hash_long(key, CFQ_QHASH_SHIFT));
1008}
1009
1010static inline void
1011cfq_rehash_cfqq(struct cfq_data *cfqd, struct cfq_queue **cfqq,
1012                struct cfq_io_context *cic)
1013{
1014        unsigned long hashkey = cfq_hash_key(cfqd, current);
1015        unsigned long hashval = hash_long(hashkey, CFQ_QHASH_SHIFT);
1016        struct cfq_queue *__cfqq;
1017        unsigned long flags;
1018
1019        spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1020
1021        hlist_del(&(*cfqq)->cfq_hash);
1022
1023        __cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval);
1024        if (!__cfqq || __cfqq == *cfqq) {
1025                __cfqq = *cfqq;
1026                hlist_add_head(&__cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1027                __cfqq->key_type = cfqd->key_type;
1028        } else {
1029                atomic_inc(&__cfqq->ref);
1030                cic->cfqq = __cfqq;
1031                cfq_put_queue(*cfqq);
1032                *cfqq = __cfqq;
1033        }
1034
1035        cic->cfqq = __cfqq;
1036        spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1037}
1038
1039static void cfq_free_io_context(struct cfq_io_context *cic)
1040{
1041        kmem_cache_free(cfq_ioc_pool, cic);
1042}
1043
1044/*
1045 * locking hierarchy is: io_context lock -> queue locks
1046 */
1047static void cfq_exit_io_context(struct cfq_io_context *cic)
1048{
1049        struct cfq_queue *cfqq = cic->cfqq;
1050        struct list_head *entry = &cic->list;
1051        request_queue_t *q;
1052        unsigned long flags;
1053
1054        /*
1055         * put the reference this task is holding to the various queues
1056         */
1057        spin_lock_irqsave(&cic->ioc->lock, flags);
1058        while ((entry = cic->list.next) != &cic->list) {
1059                struct cfq_io_context *__cic;
1060
1061                __cic = list_entry(entry, struct cfq_io_context, list);
1062                list_del(entry);
1063
1064                q = __cic->cfqq->cfqd->queue;
1065                spin_lock(q->queue_lock);
1066                cfq_put_queue(__cic->cfqq);
1067                spin_unlock(q->queue_lock);
1068        }
1069
1070        q = cfqq->cfqd->queue;
1071        spin_lock(q->queue_lock);
1072        cfq_put_queue(cfqq);
1073        spin_unlock(q->queue_lock);
1074
1075        cic->cfqq = NULL;
1076        spin_unlock_irqrestore(&cic->ioc->lock, flags);
1077}
1078
1079static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags)
1080{
1081        struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_flags);
1082
1083        if (cic) {
1084                cic->dtor = cfq_free_io_context;
1085                cic->exit = cfq_exit_io_context;
1086                INIT_LIST_HEAD(&cic->list);
1087                cic->cfqq = NULL;
1088        }
1089
1090        return cic;
1091}
1092
1093/*
1094 * Setup general io context and cfq io context. There can be several cfq
1095 * io contexts per general io context, if this process is doing io to more
1096 * than one device managed by cfq. Note that caller is holding a reference to
1097 * cfqq, so we don't need to worry about it disappearing
1098 */
1099static struct cfq_io_context *
1100cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags)
1101{
1102        struct cfq_data *cfqd = (*cfqq)->cfqd;
1103        struct cfq_queue *__cfqq = *cfqq;
1104        struct cfq_io_context *cic;
1105        struct io_context *ioc;
1106
1107        might_sleep_if(gfp_flags & __GFP_WAIT);
1108
1109        ioc = get_io_context(gfp_flags);
1110        if (!ioc)
1111                return NULL;
1112
1113        if ((cic = ioc->cic) == NULL) {
1114                cic = cfq_alloc_io_context(gfp_flags);
1115
1116                if (cic == NULL)
1117                        goto err;
1118
1119                ioc->cic = cic;
1120                cic->ioc = ioc;
1121                cic->cfqq = __cfqq;
1122                atomic_inc(&__cfqq->ref);
1123        } else {
1124                struct cfq_io_context *__cic;
1125                unsigned long flags;
1126
1127                /*
1128                 * since the first cic on the list is actually the head
1129                 * itself, need to check this here or we'll duplicate an
1130                 * cic per ioc for no reason
1131                 */
1132                if (cic->cfqq == __cfqq)
1133                        goto out;
1134
1135                /*
1136                 * cic exists, check if we already are there. linear search
1137                 * should be ok here, the list will usually not be more than
1138                 * 1 or a few entries long
1139                 */
1140                spin_lock_irqsave(&ioc->lock, flags);
1141                list_for_each_entry(__cic, &cic->list, list) {
1142                        /*
1143                         * this process is already holding a reference to
1144                         * this queue, so no need to get one more
1145                         */
1146                        if (__cic->cfqq == __cfqq) {
1147                                cic = __cic;
1148                                spin_unlock_irqrestore(&ioc->lock, flags);
1149                                goto out;
1150                        }
1151                }
1152                spin_unlock_irqrestore(&ioc->lock, flags);
1153
1154                /*
1155                 * nope, process doesn't have a cic assoicated with this
1156                 * cfqq yet. get a new one and add to list
1157                 */
1158                __cic = cfq_alloc_io_context(gfp_flags);
1159                if (__cic == NULL)
1160                        goto err;
1161
1162                __cic->ioc = ioc;
1163                __cic->cfqq = __cfqq;
1164                atomic_inc(&__cfqq->ref);
1165                spin_lock_irqsave(&ioc->lock, flags);
1166                list_add(&__cic->list, &cic->list);
1167                spin_unlock_irqrestore(&ioc->lock, flags);
1168
1169                cic = __cic;
1170                *cfqq = __cfqq;
1171        }
1172
1173out:
1174        /*
1175         * if key_type has been changed on the fly, we lazily rehash
1176         * each queue at lookup time
1177         */
1178        if ((*cfqq)->key_type != cfqd->key_type)
1179                cfq_rehash_cfqq(cfqd, cfqq, cic);
1180
1181        return cic;
1182err:
1183        put_io_context(ioc);
1184        return NULL;
1185}
1186
1187static struct cfq_queue *
1188__cfq_get_queue(struct cfq_data *cfqd, unsigned long key, int gfp_mask)
1189{
1190        const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1191        struct cfq_queue *cfqq, *new_cfqq = NULL;
1192
1193retry:
1194        cfqq = __cfq_find_cfq_hash(cfqd, key, hashval);
1195
1196        if (!cfqq) {
1197                if (new_cfqq) {
1198                        cfqq = new_cfqq;
1199                        new_cfqq = NULL;
1200                } else if (gfp_mask & __GFP_WAIT) {
1201                        spin_unlock_irq(cfqd->queue->queue_lock);
1202                        new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1203                        spin_lock_irq(cfqd->queue->queue_lock);
1204                        goto retry;
1205                } else
1206                        goto out;
1207
1208                memset(cfqq, 0, sizeof(*cfqq));
1209
1210                INIT_HLIST_NODE(&cfqq->cfq_hash);
1211                INIT_LIST_HEAD(&cfqq->cfq_list);
1212                RB_CLEAR_ROOT(&cfqq->sort_list);
1213                INIT_LIST_HEAD(&cfqq->fifo[0]);
1214                INIT_LIST_HEAD(&cfqq->fifo[1]);
1215
1216                cfqq->key = key;
1217                hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1218                atomic_set(&cfqq->ref, 0);
1219                cfqq->cfqd = cfqd;
1220                atomic_inc(&cfqd->ref);
1221                cfqq->key_type = cfqd->key_type;
1222                cfqq->service_start = ~0UL;
1223        }
1224
1225        if (new_cfqq)
1226                kmem_cache_free(cfq_pool, new_cfqq);
1227
1228        atomic_inc(&cfqq->ref);
1229out:
1230        WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1231        return cfqq;
1232}
1233
1234static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq)
1235{
1236        crq->is_sync = 0;
1237        if (rq_data_dir(crq->request) == READ || current->flags & PF_SYNCWRITE)
1238                crq->is_sync = 1;
1239
1240        cfq_add_crq_rb(crq);
1241        crq->queue_start = jiffies;
1242
1243        list_add_tail(&crq->request->queuelist, &crq->cfq_queue->fifo[crq->is_sync]);
1244}
1245
1246static void
1247cfq_insert_request(request_queue_t *q, struct request *rq, int where)
1248{
1249        struct cfq_data *cfqd = q->elevator->elevator_data;
1250        struct cfq_rq *crq = RQ_DATA(rq);
1251
1252        switch (where) {
1253                case ELEVATOR_INSERT_BACK:
1254                        while (cfq_dispatch_requests(q, cfqd->cfq_quantum))
1255                                ;
1256                        list_add_tail(&rq->queuelist, &q->queue_head);
1257                        break;
1258                case ELEVATOR_INSERT_FRONT:
1259                        list_add(&rq->queuelist, &q->queue_head);
1260                        break;
1261                case ELEVATOR_INSERT_SORT:
1262                        BUG_ON(!blk_fs_request(rq));
1263                        cfq_enqueue(cfqd, crq);
1264                        break;
1265                default:
1266                        printk("%s: bad insert point %d\n", __FUNCTION__,where);
1267                        return;
1268        }
1269
1270        if (rq_mergeable(rq)) {
1271                cfq_add_crq_hash(cfqd, crq);
1272
1273                if (!q->last_merge)
1274                        q->last_merge = rq;
1275        }
1276}
1277
1278static int cfq_queue_empty(request_queue_t *q)
1279{
1280        struct cfq_data *cfqd = q->elevator->elevator_data;
1281
1282        return list_empty(&q->queue_head) && list_empty(&cfqd->rr_list);
1283}
1284
1285static void cfq_completed_request(request_queue_t *q, struct request *rq)
1286{
1287        struct cfq_rq *crq = RQ_DATA(rq);
1288        struct cfq_queue *cfqq;
1289
1290        if (unlikely(!blk_fs_request(rq)))
1291                return;
1292
1293        cfqq = crq->cfq_queue;
1294
1295        if (crq->in_flight) {
1296                WARN_ON(!cfqq->in_flight);
1297                cfqq->in_flight--;
1298        }
1299
1300        cfq_account_completion(cfqq, crq);
1301}
1302
1303static struct request *
1304cfq_former_request(request_queue_t *q, struct request *rq)
1305{
1306        struct cfq_rq *crq = RQ_DATA(rq);
1307        struct rb_node *rbprev = rb_prev(&crq->rb_node);
1308
1309        if (rbprev)
1310                return rb_entry_crq(rbprev)->request;
1311
1312        return NULL;
1313}
1314
1315static struct request *
1316cfq_latter_request(request_queue_t *q, struct request *rq)
1317{
1318        struct cfq_rq *crq = RQ_DATA(rq);
1319        struct rb_node *rbnext = rb_next(&crq->rb_node);
1320
1321        if (rbnext)
1322                return rb_entry_crq(rbnext)->request;
1323
1324        return NULL;
1325}
1326
1327static int cfq_may_queue(request_queue_t *q, int rw)
1328{
1329        struct cfq_data *cfqd = q->elevator->elevator_data;
1330        struct cfq_queue *cfqq;
1331        int ret = ELV_MQUEUE_MAY;
1332
1333        if (current->flags & PF_MEMALLOC)
1334                return ELV_MQUEUE_MAY;
1335
1336        cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(cfqd, current));
1337        if (cfqq) {
1338                int limit = cfqd->max_queued;
1339
1340                if (cfqq->allocated[rw] < cfqd->cfq_queued)
1341                        return ELV_MQUEUE_MUST;
1342
1343                if (cfqd->busy_queues)
1344                        limit = q->nr_requests / cfqd->busy_queues;
1345
1346                if (limit < cfqd->cfq_queued)
1347                        limit = cfqd->cfq_queued;
1348                else if (limit > cfqd->max_queued)
1349                        limit = cfqd->max_queued;
1350
1351                if (cfqq->allocated[rw] >= limit) {
1352                        if (limit > cfqq->alloc_limit[rw])
1353                                cfqq->alloc_limit[rw] = limit;
1354
1355                        ret = ELV_MQUEUE_NO;
1356                }
1357        }
1358
1359        return ret;
1360}
1361
1362static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq)
1363{
1364        struct request_list *rl = &q->rq;
1365        const int write = waitqueue_active(&rl->wait[WRITE]);
1366        const int read = waitqueue_active(&rl->wait[READ]);
1367
1368        if (read && cfqq->allocated[READ] < cfqq->alloc_limit[READ])
1369                wake_up(&rl->wait[READ]);
1370        if (write && cfqq->allocated[WRITE] < cfqq->alloc_limit[WRITE])
1371                wake_up(&rl->wait[WRITE]);
1372}
1373
1374/*
1375 * queue lock held here
1376 */
1377static void cfq_put_request(request_queue_t *q, struct request *rq)
1378{
1379        struct cfq_data *cfqd = q->elevator->elevator_data;
1380        struct cfq_rq *crq = RQ_DATA(rq);
1381
1382        if (crq) {
1383                struct cfq_queue *cfqq = crq->cfq_queue;
1384
1385                BUG_ON(q->last_merge == rq);
1386                BUG_ON(!hlist_unhashed(&crq->hash));
1387
1388                if (crq->io_context)
1389                        put_io_context(crq->io_context->ioc);
1390
1391                BUG_ON(!cfqq->allocated[crq->is_write]);
1392                cfqq->allocated[crq->is_write]--;
1393
1394                mempool_free(crq, cfqd->crq_pool);
1395                rq->elevator_private = NULL;
1396
1397                smp_mb();
1398                cfq_check_waiters(q, cfqq);
1399                cfq_put_queue(cfqq);
1400        }
1401}
1402
1403/*
1404 * Allocate cfq data structures associated with this request. A queue and
1405 */
1406static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
1407{
1408        struct cfq_data *cfqd = q->elevator->elevator_data;
1409        struct cfq_io_context *cic;
1410        const int rw = rq_data_dir(rq);
1411        struct cfq_queue *cfqq, *saved_cfqq;
1412        struct cfq_rq *crq;
1413        unsigned long flags;
1414
1415        might_sleep_if(gfp_mask & __GFP_WAIT);
1416
1417        spin_lock_irqsave(q->queue_lock, flags);
1418
1419        cfqq = __cfq_get_queue(cfqd, cfq_hash_key(cfqd, current), gfp_mask);
1420        if (!cfqq)
1421                goto out_lock;
1422
1423repeat:
1424        if (cfqq->allocated[rw] >= cfqd->max_queued)
1425                goto out_lock;
1426
1427        cfqq->allocated[rw]++;
1428        spin_unlock_irqrestore(q->queue_lock, flags);
1429
1430        /*
1431         * if hashing type has changed, the cfq_queue might change here.
1432         */
1433        saved_cfqq = cfqq;
1434        cic = cfq_get_io_context(&cfqq, gfp_mask);
1435        if (!cic)
1436                goto err;
1437
1438        /*
1439         * repeat allocation checks on queue change
1440         */
1441        if (unlikely(saved_cfqq != cfqq)) {
1442                spin_lock_irqsave(q->queue_lock, flags);
1443                saved_cfqq->allocated[rw]--;
1444                goto repeat;
1445        }
1446
1447        crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
1448        if (crq) {
1449                RB_CLEAR(&crq->rb_node);
1450                crq->rb_key = 0;
1451                crq->request = rq;
1452                INIT_HLIST_NODE(&crq->hash);
1453                crq->cfq_queue = cfqq;
1454                crq->io_context = cic;
1455                crq->service_start = crq->queue_start = 0;
1456                crq->in_flight = crq->accounted = crq->is_sync = 0;
1457                crq->is_write = rw;
1458                rq->elevator_private = crq;
1459                cfqq->alloc_limit[rw] = 0;
1460                return 0;
1461        }
1462
1463        put_io_context(cic->ioc);
1464err:
1465        spin_lock_irqsave(q->queue_lock, flags);
1466        cfqq->allocated[rw]--;
1467        cfq_put_queue(cfqq);
1468out_lock:
1469        spin_unlock_irqrestore(q->queue_lock, flags);
1470        return 1;
1471}
1472
1473static void cfq_put_cfqd(struct cfq_data *cfqd)
1474{
1475        request_queue_t *q = cfqd->queue;
1476
1477        if (!atomic_dec_and_test(&cfqd->ref))
1478                return;
1479
1480        blk_put_queue(q);
1481
1482        mempool_destroy(cfqd->crq_pool);
1483        kfree(cfqd->crq_hash);
1484        kfree(cfqd->cfq_hash);
1485        kfree(cfqd);
1486}
1487
1488static void cfq_exit_queue(elevator_t *e)
1489{
1490        cfq_put_cfqd(e->elevator_data);
1491}
1492
1493static int cfq_init_queue(request_queue_t *q, elevator_t *e)
1494{
1495        struct cfq_data *cfqd;
1496        int i;
1497
1498        cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
1499        if (!cfqd)
1500                return -ENOMEM;
1501
1502        memset(cfqd, 0, sizeof(*cfqd));
1503        INIT_LIST_HEAD(&cfqd->rr_list);
1504        INIT_LIST_HEAD(&cfqd->empty_list);
1505
1506        cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
1507        if (!cfqd->crq_hash)
1508                goto out_crqhash;
1509
1510        cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
1511        if (!cfqd->cfq_hash)
1512                goto out_cfqhash;
1513
1514        cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool);
1515        if (!cfqd->crq_pool)
1516                goto out_crqpool;
1517
1518        for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
1519                INIT_HLIST_HEAD(&cfqd->crq_hash[i]);
1520        for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
1521                INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
1522
1523        e->elevator_data = cfqd;
1524
1525        cfqd->queue = q;
1526        atomic_inc(&q->refcnt);
1527
1528        /*
1529         * just set it to some high value, we want anyone to be able to queue
1530         * some requests. fairness is handled differently
1531         */
1532        q->nr_requests = 1024;
1533        cfqd->max_queued = q->nr_requests / 16;
1534        q->nr_batching = cfq_queued;
1535        cfqd->key_type = CFQ_KEY_TGID;
1536        cfqd->find_best_crq = 1;
1537        atomic_set(&cfqd->ref, 1);
1538
1539        cfqd->cfq_queued = cfq_queued;
1540        cfqd->cfq_quantum = cfq_quantum;
1541        cfqd->cfq_fifo_expire_r = cfq_fifo_expire_r;
1542        cfqd->cfq_fifo_expire_w = cfq_fifo_expire_w;
1543        cfqd->cfq_fifo_batch_expire = cfq_fifo_rate;
1544        cfqd->cfq_back_max = cfq_back_max;
1545        cfqd->cfq_back_penalty = cfq_back_penalty;
1546
1547        return 0;
1548out_crqpool:
1549        kfree(cfqd->cfq_hash);
1550out_cfqhash:
1551        kfree(cfqd->crq_hash);
1552out_crqhash:
1553        kfree(cfqd);
1554        return -ENOMEM;
1555}
1556
1557static void cfq_slab_kill(void)
1558{
1559        if (crq_pool)
1560                kmem_cache_destroy(crq_pool);
1561        if (cfq_pool)
1562                kmem_cache_destroy(cfq_pool);
1563        if (cfq_ioc_pool)
1564                kmem_cache_destroy(cfq_ioc_pool);
1565}
1566
1567static int __init cfq_slab_setup(void)
1568{
1569        crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
1570                                        NULL, NULL);
1571        if (!crq_pool)
1572                goto fail;
1573
1574        cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
1575                                        NULL, NULL);
1576        if (!cfq_pool)
1577                goto fail;
1578
1579        cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool",
1580                        sizeof(struct cfq_io_context), 0, 0, NULL, NULL);
1581        if (!cfq_ioc_pool)
1582                goto fail;
1583
1584        return 0;
1585fail:
1586        cfq_slab_kill();
1587        return -ENOMEM;
1588}
1589
1590
1591/*
1592 * sysfs parts below -->
1593 */
1594struct cfq_fs_entry {
1595        struct attribute attr;
1596        ssize_t (*show)(struct cfq_data *, char *);
1597        ssize_t (*store)(struct cfq_data *, const char *, size_t);
1598};
1599
1600static ssize_t
1601cfq_var_show(unsigned int var, char *page)
1602{
1603        return sprintf(page, "%d\n", var);
1604}
1605
1606static ssize_t
1607cfq_var_store(unsigned int *var, const char *page, size_t count)
1608{
1609        char *p = (char *) page;
1610
1611        *var = simple_strtoul(p, &p, 10);
1612        return count;
1613}
1614
1615static ssize_t
1616cfq_clear_elapsed(struct cfq_data *cfqd, const char *page, size_t count)
1617{
1618        max_elapsed_dispatch = max_elapsed_crq = 0;
1619        return count;
1620}
1621
1622static ssize_t
1623cfq_set_key_type(struct cfq_data *cfqd, const char *page, size_t count)
1624{
1625        spin_lock_irq(cfqd->queue->queue_lock);
1626        if (!strncmp(page, "pgid", 4))
1627                cfqd->key_type = CFQ_KEY_PGID;
1628        else if (!strncmp(page, "tgid", 4))
1629                cfqd->key_type = CFQ_KEY_TGID;
1630        else if (!strncmp(page, "uid", 3))
1631                cfqd->key_type = CFQ_KEY_UID;
1632        else if (!strncmp(page, "gid", 3))
1633                cfqd->key_type = CFQ_KEY_GID;
1634        spin_unlock_irq(cfqd->queue->queue_lock);
1635        return count;
1636}
1637
1638static ssize_t
1639cfq_read_key_type(struct cfq_data *cfqd, char *page)
1640{
1641        ssize_t len = 0;
1642        int i;
1643
1644        for (i = CFQ_KEY_PGID; i < CFQ_KEY_LAST; i++) {
1645                if (cfqd->key_type == i)
1646                        len += sprintf(page+len, "[%s] ", cfq_key_types[i]);
1647                else
1648                        len += sprintf(page+len, "%s ", cfq_key_types[i]);
1649        }
1650        len += sprintf(page+len, "\n");
1651        return len;
1652}
1653
1654#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
1655static ssize_t __FUNC(struct cfq_data *cfqd, char *page)                \
1656{                                                                       \
1657        unsigned int __data = __VAR;                                    \
1658        if (__CONV)                                                     \
1659                __data = jiffies_to_msecs(__data);                      \
1660        return cfq_var_show(__data, (page));                            \
1661}
1662SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
1663SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
1664SHOW_FUNCTION(cfq_fifo_expire_r_show, cfqd->cfq_fifo_expire_r, 1);
1665SHOW_FUNCTION(cfq_fifo_expire_w_show, cfqd->cfq_fifo_expire_w, 1);
1666SHOW_FUNCTION(cfq_fifo_batch_expire_show, cfqd->cfq_fifo_batch_expire, 1);
1667SHOW_FUNCTION(cfq_find_best_show, cfqd->find_best_crq, 0);
1668SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0);
1669SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0);
1670#undef SHOW_FUNCTION
1671
1672#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
1673static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count)    \
1674{                                                                       \
1675        unsigned int __data;                                            \
1676        int ret = cfq_var_store(&__data, (page), count);                \
1677        if (__data < (MIN))                                             \
1678                __data = (MIN);                                         \
1679        else if (__data > (MAX))                                        \
1680                __data = (MAX);                                         \
1681        if (__CONV)                                                     \
1682                *(__PTR) = msecs_to_jiffies(__data);                    \
1683        else                                                            \
1684                *(__PTR) = __data;                                      \
1685        return ret;                                                     \
1686}
1687STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
1688STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
1689STORE_FUNCTION(cfq_fifo_expire_r_store, &cfqd->cfq_fifo_expire_r, 1, UINT_MAX, 1);
1690STORE_FUNCTION(cfq_fifo_expire_w_store, &cfqd->cfq_fifo_expire_w, 1, UINT_MAX, 1);
1691STORE_FUNCTION(cfq_fifo_batch_expire_store, &cfqd->cfq_fifo_batch_expire, 0, UINT_MAX, 1);
1692STORE_FUNCTION(cfq_find_best_store, &cfqd->find_best_crq, 0, 1, 0);
1693STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
1694STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
1695#undef STORE_FUNCTION
1696
1697static struct cfq_fs_entry cfq_quantum_entry = {
1698        .attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR },
1699        .show = cfq_quantum_show,
1700        .store = cfq_quantum_store,
1701};
1702static struct cfq_fs_entry cfq_queued_entry = {
1703        .attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR },
1704        .show = cfq_queued_show,
1705        .store = cfq_queued_store,
1706};
1707static struct cfq_fs_entry cfq_fifo_expire_r_entry = {
1708        .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR },
1709        .show = cfq_fifo_expire_r_show,
1710        .store = cfq_fifo_expire_r_store,
1711};
1712static struct cfq_fs_entry cfq_fifo_expire_w_entry = {
1713        .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR },
1714        .show = cfq_fifo_expire_w_show,
1715        .store = cfq_fifo_expire_w_store,
1716};
1717static struct cfq_fs_entry cfq_fifo_batch_expire_entry = {
1718        .attr = {.name = "fifo_batch_expire", .mode = S_IRUGO | S_IWUSR },
1719        .show = cfq_fifo_batch_expire_show,
1720        .store = cfq_fifo_batch_expire_store,
1721};
1722static struct cfq_fs_entry cfq_find_best_entry = {
1723        .attr = {.name = "find_best_crq", .mode = S_IRUGO | S_IWUSR },
1724        .show = cfq_find_best_show,
1725        .store = cfq_find_best_store,
1726};
1727static struct cfq_fs_entry cfq_back_max_entry = {
1728        .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR },
1729        .show = cfq_back_max_show,
1730        .store = cfq_back_max_store,
1731};
1732static struct cfq_fs_entry cfq_back_penalty_entry = {
1733        .attr = {.name = "back_seek_penalty", .mode = S_IRUGO | S_IWUSR },
1734        .show = cfq_back_penalty_show,
1735        .store = cfq_back_penalty_store,
1736};
1737static struct cfq_fs_entry cfq_clear_elapsed_entry = {
1738        .attr = {.name = "clear_elapsed", .mode = S_IWUSR },
1739        .store = cfq_clear_elapsed,
1740};
1741static struct cfq_fs_entry cfq_key_type_entry = {
1742        .attr = {.name = "key_type", .mode = S_IRUGO | S_IWUSR },
1743        .show = cfq_read_key_type,
1744        .store = cfq_set_key_type,
1745};
1746
1747static struct attribute *default_attrs[] = {
1748        &cfq_quantum_entry.attr,
1749        &cfq_queued_entry.attr,
1750        &cfq_fifo_expire_r_entry.attr,
1751        &cfq_fifo_expire_w_entry.attr,
1752        &cfq_fifo_batch_expire_entry.attr,
1753        &cfq_key_type_entry.attr,
1754        &cfq_find_best_entry.attr,
1755        &cfq_back_max_entry.attr,
1756        &cfq_back_penalty_entry.attr,
1757        &cfq_clear_elapsed_entry.attr,
1758        NULL,
1759};
1760
1761#define to_cfq(atr) container_of((atr), struct cfq_fs_entry, attr)
1762
1763static ssize_t
1764cfq_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
1765{
1766        elevator_t *e = container_of(kobj, elevator_t, kobj);
1767        struct cfq_fs_entry *entry = to_cfq(attr);
1768
1769        if (!entry->show)
1770                return 0;
1771
1772        return entry->show(e->elevator_data, page);
1773}
1774
1775static ssize_t
1776cfq_attr_store(struct kobject *kobj, struct attribute *attr,
1777               const char *page, size_t length)
1778{
1779        elevator_t *e = container_of(kobj, elevator_t, kobj);
1780        struct cfq_fs_entry *entry = to_cfq(attr);
1781
1782        if (!entry->store)
1783                return -EINVAL;
1784
1785        return entry->store(e->elevator_data, page, length);
1786}
1787
1788static struct sysfs_ops cfq_sysfs_ops = {
1789        .show   = cfq_attr_show,
1790        .store  = cfq_attr_store,
1791};
1792
1793struct kobj_type cfq_ktype = {
1794        .sysfs_ops      = &cfq_sysfs_ops,
1795        .default_attrs  = default_attrs,
1796};
1797
1798static struct elevator_type iosched_cfq = {
1799        .ops = {
1800                .elevator_merge_fn =            cfq_merge,
1801                .elevator_merged_fn =           cfq_merged_request,
1802                .elevator_merge_req_fn =        cfq_merged_requests,
1803                .elevator_next_req_fn =         cfq_next_request,
1804                .elevator_add_req_fn =          cfq_insert_request,
1805                .elevator_remove_req_fn =       cfq_remove_request,
1806                .elevator_requeue_req_fn =      cfq_requeue_request,
1807                .elevator_queue_empty_fn =      cfq_queue_empty,
1808                .elevator_completed_req_fn =    cfq_completed_request,
1809                .elevator_former_req_fn =       cfq_former_request,
1810                .elevator_latter_req_fn =       cfq_latter_request,
1811                .elevator_set_req_fn =          cfq_set_request,
1812                .elevator_put_req_fn =          cfq_put_request,
1813                .elevator_may_queue_fn =        cfq_may_queue,
1814                .elevator_init_fn =             cfq_init_queue,
1815                .elevator_exit_fn =             cfq_exit_queue,
1816        },
1817        .elevator_ktype =       &cfq_ktype,
1818        .elevator_name =        "cfq",
1819        .elevator_owner =       THIS_MODULE,
1820};
1821
1822static int __init cfq_init(void)
1823{
1824        int ret;
1825
1826        if (cfq_slab_setup())
1827                return -ENOMEM;
1828
1829        ret = elv_register(&iosched_cfq);
1830        if (!ret) {
1831                __module_get(THIS_MODULE);
1832                return 0;
1833        }
1834
1835        cfq_slab_kill();
1836        return ret;
1837}
1838
1839static void __exit cfq_exit(void)
1840{
1841        cfq_slab_kill();
1842        elv_unregister(&iosched_cfq);
1843}
1844
1845module_init(cfq_init);
1846module_exit(cfq_exit);
1847
1848MODULE_AUTHOR("Jens Axboe");
1849MODULE_LICENSE("GPL");
1850MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
1851
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.