linux/block/elevator.c
   1/*
   2 *  Block device elevator/IO-scheduler.
   3 *
   4 *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
   5 *
   6 * 30042000 Jens Axboe <axboe@kernel.dk> :
   7 *
   8 * Split the elevator a bit so that it is possible to choose a different
   9 * one or even write a new "plug in". There are three pieces:
  10 * - elevator_fn, inserts a new request in the queue list
  11 * - elevator_merge_fn, decides whether a new buffer can be merged with
  12 *   an existing request
  13 * - elevator_dequeue_fn, called when a request is taken off the active list
  14 *
  15 * 20082000 Dave Jones <davej@suse.de> :
  16 * Removed tests for max-bomb-segments, which was breaking elvtune
  17 *  when run without -bN
  18 *
  19 * Jens:
  20 * - Rework again to work with bio instead of buffer_heads
   21 * - lose bi_dev comparisons, partition handling is right now
  22 * - completely modularize elevator setup and teardown
  23 *
  24 */
  25#include <linux/kernel.h>
  26#include <linux/fs.h>
  27#include <linux/blkdev.h>
  28#include <linux/elevator.h>
  29#include <linux/bio.h>
  30#include <linux/module.h>
  31#include <linux/slab.h>
  32#include <linux/init.h>
  33#include <linux/compiler.h>
  34#include <linux/delay.h>
  35#include <linux/blktrace_api.h>
  36#include <linux/hash.h>
  37
  38#include <asm/uaccess.h>
  39
  40static DEFINE_SPINLOCK(elv_list_lock);
  41static LIST_HEAD(elv_list);
  42
  43/*
  44 * Merge hash stuff.
  45 */
  46static const int elv_hash_shift = 6;
  47#define ELV_HASH_BLOCK(sec)     ((sec) >> 3)
  48#define ELV_HASH_FN(sec)        \
  49                (hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift))
  50#define ELV_HASH_ENTRIES        (1 << elv_hash_shift)
  51#define rq_hash_key(rq)         ((rq)->sector + (rq)->nr_sectors)
  52#define ELV_ON_HASH(rq)         (!hlist_unhashed(&(rq)->hash))
  53
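/*
 * Editor's note -- an illustrative sketch, not part of the original
 * source: rq_hash_key() is the *end* sector of a request, so a bio that
 * could back-merge with rq (bio->bi_sector == rq->sector + rq->nr_sectors)
 * hashes to the same bucket rq was added to.  Assuming a request at
 * sector 1000 spanning 8 sectors:
 *
 *	rq_hash_key(rq)      = 1000 + 8 = 1008
 *	ELV_HASH_BLOCK(1008) = 1008 >> 3 = 126
 *	bucket               = hash_long(126, 6), somewhere in [0, 63]
 *
 * A bio starting at sector 1008 is looked up with the same ELV_HASH_FN()
 * value and finds rq as a back-merge candidate (see elv_rqhash_find()
 * and elv_merge() below).
 */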
  54/*
  55 * Query io scheduler to see if the current process issuing bio may be
  56 * merged with rq.
  57 */
  58static int elv_iosched_allow_merge(struct request *rq, struct bio *bio)
  59{
  60        struct request_queue *q = rq->q;
  61        elevator_t *e = q->elevator;
  62
  63        if (e->ops->elevator_allow_merge_fn)
  64                return e->ops->elevator_allow_merge_fn(q, rq, bio);
  65
  66        return 1;
  67}
  68
  69/*
  70 * can we safely merge with this request?
  71 */
  72int elv_rq_merge_ok(struct request *rq, struct bio *bio)
  73{
  74        if (!rq_mergeable(rq))
  75                return 0;
  76
  77        /*
  78         * different data direction or already started, don't merge
  79         */
  80        if (bio_data_dir(bio) != rq_data_dir(rq))
  81                return 0;
  82
  83        /*
  84         * must be same device and not a special request
  85         */
  86        if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
  87                return 0;
  88
  89        /*
  90         * only merge integrity protected bio into ditto rq
  91         */
  92        if (bio_integrity(bio) != blk_integrity_rq(rq))
  93                return 0;
  94
  95        if (!elv_iosched_allow_merge(rq, bio))
  96                return 0;
  97
  98        return 1;
  99}
 100EXPORT_SYMBOL(elv_rq_merge_ok);
 101
 102static inline int elv_try_merge(struct request *__rq, struct bio *bio)
 103{
 104        int ret = ELEVATOR_NO_MERGE;
 105
 106        /*
 107         * we can merge and sequence is ok, check if it's possible
 108         */
 109        if (elv_rq_merge_ok(__rq, bio)) {
 110                if (__rq->sector + __rq->nr_sectors == bio->bi_sector)
 111                        ret = ELEVATOR_BACK_MERGE;
 112                else if (__rq->sector - bio_sectors(bio) == bio->bi_sector)
 113                        ret = ELEVATOR_FRONT_MERGE;
 114        }
 115
 116        return ret;
 117}
 118
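/*
 * Editor's note -- an illustrative sketch, not part of the original
 * source: the two merge directions tested above, in sector terms.
 * Assuming __rq covers sectors [100, 108) (sector = 100, nr_sectors = 8)
 * and the bio spans 4 sectors:
 *
 *	bio->bi_sector == 108  ->  ELEVATOR_BACK_MERGE
 *	                           (bio is appended behind __rq)
 *	bio->bi_sector == 96   ->  ELEVATOR_FRONT_MERGE
 *	                           (100 - bio_sectors(bio) == 96, bio is
 *	                            prepended in front of __rq)
 *	anything else          ->  ELEVATOR_NO_MERGE
 */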
 119static struct elevator_type *elevator_find(const char *name)
 120{
 121        struct elevator_type *e;
 122
 123        list_for_each_entry(e, &elv_list, list) {
 124                if (!strcmp(e->elevator_name, name))
 125                        return e;
 126        }
 127
 128        return NULL;
 129}
 130
 131static void elevator_put(struct elevator_type *e)
 132{
 133        module_put(e->elevator_owner);
 134}
 135
 136static struct elevator_type *elevator_get(const char *name)
 137{
 138        struct elevator_type *e;
 139
 140        spin_lock(&elv_list_lock);
 141
 142        e = elevator_find(name);
 143        if (!e) {
 144                char elv[ELV_NAME_MAX + strlen("-iosched")];
 145
 146                spin_unlock(&elv_list_lock);
 147
 148                if (!strcmp(name, "anticipatory"))
 149                        sprintf(elv, "as-iosched");
 150                else
 151                        sprintf(elv, "%s-iosched", name);
 152
 153                request_module("%s", elv);
 154                spin_lock(&elv_list_lock);
 155                e = elevator_find(name);
 156        }
 157
 158        if (e && !try_module_get(e->elevator_owner))
 159                e = NULL;
 160
 161        spin_unlock(&elv_list_lock);
 162
 163        return e;
 164}
 165
 166static void *elevator_init_queue(struct request_queue *q,
 167                                 struct elevator_queue *eq)
 168{
 169        return eq->ops->elevator_init_fn(q);
 170}
 171
 172static void elevator_attach(struct request_queue *q, struct elevator_queue *eq,
 173                           void *data)
 174{
 175        q->elevator = eq;
 176        eq->elevator_data = data;
 177}
 178
 179static char chosen_elevator[16];
 180
 181static int __init elevator_setup(char *str)
 182{
 183        /*
 184         * Be backwards-compatible with previous kernels, so users
 185         * won't get the wrong elevator.
 186         */
 187        if (!strcmp(str, "as"))
 188                strcpy(chosen_elevator, "anticipatory");
 189        else
 190                strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
 191        return 1;
 192}
 193
 194__setup("elevator=", elevator_setup);
 195
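/*
 * Editor's note -- usage sketch, not part of the original source: the
 * default io scheduler can be overridden on the kernel command line,
 * e.g. "elevator=deadline" or the legacy "elevator=as" (rewritten above
 * to "anticipatory").  The chosen name is consulted by elevator_init()
 * below and by elv_register() when printing the "(default)" tag.
 */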
 196static struct kobj_type elv_ktype;
 197
 198static elevator_t *elevator_alloc(struct request_queue *q,
 199                                  struct elevator_type *e)
 200{
 201        elevator_t *eq;
 202        int i;
 203
 204        eq = kmalloc_node(sizeof(elevator_t), GFP_KERNEL | __GFP_ZERO, q->node);
 205        if (unlikely(!eq))
 206                goto err;
 207
 208        eq->ops = &e->ops;
 209        eq->elevator_type = e;
 210        kobject_init(&eq->kobj, &elv_ktype);
 211        mutex_init(&eq->sysfs_lock);
 212
 213        eq->hash = kmalloc_node(sizeof(struct hlist_head) * ELV_HASH_ENTRIES,
 214                                        GFP_KERNEL, q->node);
 215        if (!eq->hash)
 216                goto err;
 217
 218        for (i = 0; i < ELV_HASH_ENTRIES; i++)
 219                INIT_HLIST_HEAD(&eq->hash[i]);
 220
 221        return eq;
 222err:
 223        kfree(eq);
 224        elevator_put(e);
 225        return NULL;
 226}
 227
 228static void elevator_release(struct kobject *kobj)
 229{
 230        elevator_t *e = container_of(kobj, elevator_t, kobj);
 231
 232        elevator_put(e->elevator_type);
 233        kfree(e->hash);
 234        kfree(e);
 235}
 236
 237int elevator_init(struct request_queue *q, char *name)
 238{
 239        struct elevator_type *e = NULL;
 240        struct elevator_queue *eq;
 241        int ret = 0;
 242        void *data;
 243
 244        INIT_LIST_HEAD(&q->queue_head);
 245        q->last_merge = NULL;
 246        q->end_sector = 0;
 247        q->boundary_rq = NULL;
 248
 249        if (name) {
 250                e = elevator_get(name);
 251                if (!e)
 252                        return -EINVAL;
 253        }
 254
 255        if (!e && *chosen_elevator) {
 256                e = elevator_get(chosen_elevator);
 257                if (!e)
 258                        printk(KERN_ERR "I/O scheduler %s not found\n",
 259                                                        chosen_elevator);
 260        }
 261
 262        if (!e) {
 263                e = elevator_get(CONFIG_DEFAULT_IOSCHED);
 264                if (!e) {
 265                        printk(KERN_ERR
 266                                "Default I/O scheduler not found. " \
 267                                "Using noop.\n");
 268                        e = elevator_get("noop");
 269                }
 270        }
 271
 272        eq = elevator_alloc(q, e);
 273        if (!eq)
 274                return -ENOMEM;
 275
 276        data = elevator_init_queue(q, eq);
 277        if (!data) {
 278                kobject_put(&eq->kobj);
 279                return -ENOMEM;
 280        }
 281
 282        elevator_attach(q, eq, data);
 283        return ret;
 284}
 285EXPORT_SYMBOL(elevator_init);
 286
 287void elevator_exit(elevator_t *e)
 288{
 289        mutex_lock(&e->sysfs_lock);
 290        if (e->ops->elevator_exit_fn)
 291                e->ops->elevator_exit_fn(e);
 292        e->ops = NULL;
 293        mutex_unlock(&e->sysfs_lock);
 294
 295        kobject_put(&e->kobj);
 296}
 297EXPORT_SYMBOL(elevator_exit);
 298
 299static void elv_activate_rq(struct request_queue *q, struct request *rq)
 300{
 301        elevator_t *e = q->elevator;
 302
 303        if (e->ops->elevator_activate_req_fn)
 304                e->ops->elevator_activate_req_fn(q, rq);
 305}
 306
 307static void elv_deactivate_rq(struct request_queue *q, struct request *rq)
 308{
 309        elevator_t *e = q->elevator;
 310
 311        if (e->ops->elevator_deactivate_req_fn)
 312                e->ops->elevator_deactivate_req_fn(q, rq);
 313}
 314
 315static inline void __elv_rqhash_del(struct request *rq)
 316{
 317        hlist_del_init(&rq->hash);
 318}
 319
 320static void elv_rqhash_del(struct request_queue *q, struct request *rq)
 321{
 322        if (ELV_ON_HASH(rq))
 323                __elv_rqhash_del(rq);
 324}
 325
 326static void elv_rqhash_add(struct request_queue *q, struct request *rq)
 327{
 328        elevator_t *e = q->elevator;
 329
 330        BUG_ON(ELV_ON_HASH(rq));
 331        hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]);
 332}
 333
 334static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
 335{
 336        __elv_rqhash_del(rq);
 337        elv_rqhash_add(q, rq);
 338}
 339
 340static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
 341{
 342        elevator_t *e = q->elevator;
 343        struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)];
 344        struct hlist_node *entry, *next;
 345        struct request *rq;
 346
 347        hlist_for_each_entry_safe(rq, entry, next, hash_list, hash) {
 348                BUG_ON(!ELV_ON_HASH(rq));
 349
 350                if (unlikely(!rq_mergeable(rq))) {
 351                        __elv_rqhash_del(rq);
 352                        continue;
 353                }
 354
 355                if (rq_hash_key(rq) == offset)
 356                        return rq;
 357        }
 358
 359        return NULL;
 360}
 361
 362/*
 363 * RB-tree support functions for inserting/lookup/removal of requests
 364 * in a sorted RB tree.
 365 */
 366struct request *elv_rb_add(struct rb_root *root, struct request *rq)
 367{
 368        struct rb_node **p = &root->rb_node;
 369        struct rb_node *parent = NULL;
 370        struct request *__rq;
 371
 372        while (*p) {
 373                parent = *p;
 374                __rq = rb_entry(parent, struct request, rb_node);
 375
 376                if (rq->sector < __rq->sector)
 377                        p = &(*p)->rb_left;
 378                else if (rq->sector > __rq->sector)
 379                        p = &(*p)->rb_right;
 380                else
 381                        return __rq;
 382        }
 383
 384        rb_link_node(&rq->rb_node, parent, p);
 385        rb_insert_color(&rq->rb_node, root);
 386        return NULL;
 387}
 388EXPORT_SYMBOL(elv_rb_add);
 389
 390void elv_rb_del(struct rb_root *root, struct request *rq)
 391{
 392        BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
 393        rb_erase(&rq->rb_node, root);
 394        RB_CLEAR_NODE(&rq->rb_node);
 395}
 396EXPORT_SYMBOL(elv_rb_del);
 397
 398struct request *elv_rb_find(struct rb_root *root, sector_t sector)
 399{
 400        struct rb_node *n = root->rb_node;
 401        struct request *rq;
 402
 403        while (n) {
 404                rq = rb_entry(n, struct request, rb_node);
 405
 406                if (sector < rq->sector)
 407                        n = n->rb_left;
 408                else if (sector > rq->sector)
 409                        n = n->rb_right;
 410                else
 411                        return rq;
 412        }
 413
 414        return NULL;
 415}
 416EXPORT_SYMBOL(elv_rb_find);
 417
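/*
 * Editor's note -- a minimal sketch of how an io scheduler might use the
 * rb helpers above; hypothetical code (example_dispatch() is made up),
 * not part of the original source.  A scheduler typically keeps one
 * struct rb_root sorted by rq->sector and wires it up from its
 * elevator_ops hooks, roughly:
 *
 *	static void example_add_rq(struct rb_root *root, struct request *rq)
 *	{
 *		struct request *alias;
 *
 *		while ((alias = elv_rb_add(root, rq)) != NULL) {
 *			// sector collision: move the alias out of the
 *			// tree (e.g. dispatch it), then retry the insert
 *			example_dispatch(alias);
 *		}
 *	}
 *
 * elv_rb_find() then serves ELEVATOR_FRONT_MERGE lookups keyed by the
 * sector a merged request would start at, and elv_rb_del() removes a
 * request before it is dispatched or merged away.
 */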
 418/*
 419 * Insert rq into dispatch queue of q.  Queue lock must be held on
  420 * entry.  rq is sorted into the dispatch queue. To be used by
 421 * specific elevators.
 422 */
 423void elv_dispatch_sort(struct request_queue *q, struct request *rq)
 424{
 425        sector_t boundary;
 426        struct list_head *entry;
 427        int stop_flags;
 428
 429        if (q->last_merge == rq)
 430                q->last_merge = NULL;
 431
 432        elv_rqhash_del(q, rq);
 433
 434        q->nr_sorted--;
 435
 436        boundary = q->end_sector;
 437        stop_flags = REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_STARTED;
 438        list_for_each_prev(entry, &q->queue_head) {
 439                struct request *pos = list_entry_rq(entry);
 440
 441                if (rq_data_dir(rq) != rq_data_dir(pos))
 442                        break;
 443                if (pos->cmd_flags & stop_flags)
 444                        break;
 445                if (rq->sector >= boundary) {
 446                        if (pos->sector < boundary)
 447                                continue;
 448                } else {
 449                        if (pos->sector >= boundary)
 450                                break;
 451                }
 452                if (rq->sector >= pos->sector)
 453                        break;
 454        }
 455
 456        list_add(&rq->queuelist, entry);
 457}
 458EXPORT_SYMBOL(elv_dispatch_sort);
 459
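/*
 * Editor's note -- an illustrative walk-through, not part of the
 * original source.  elv_dispatch_sort() keeps the dispatch list in
 * one-way elevator order around q->end_sector.  With a boundary of 500
 * and a dispatch list holding sectors 520, 600, 40, 100 (head to tail),
 * a new request:
 *
 *	at sector 560 is placed between 520 and 600
 *	at sector  70 is placed between 40 and 100
 *	at sector 480 goes to the tail, after 100: requests below the
 *	              boundary sort among themselves, behind everything
 *	              at or above the boundary
 *
 * A direction change or a barrier/started request stops the backward
 * scan, so such requests are never passed.
 */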
 460/*
 461 * Insert rq into dispatch queue of q.  Queue lock must be held on
 462 * entry.  rq is added to the back of the dispatch queue. To be used by
 463 * specific elevators.
 464 */
 465void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
 466{
 467        if (q->last_merge == rq)
 468                q->last_merge = NULL;
 469
 470        elv_rqhash_del(q, rq);
 471
 472        q->nr_sorted--;
 473
 474        q->end_sector = rq_end_sector(rq);
 475        q->boundary_rq = rq;
 476        list_add_tail(&rq->queuelist, &q->queue_head);
 477}
 478EXPORT_SYMBOL(elv_dispatch_add_tail);
 479
 480int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 481{
 482        elevator_t *e = q->elevator;
 483        struct request *__rq;
 484        int ret;
 485
 486        /*
 487         * First try one-hit cache.
 488         */
 489        if (q->last_merge) {
 490                ret = elv_try_merge(q->last_merge, bio);
 491                if (ret != ELEVATOR_NO_MERGE) {
 492                        *req = q->last_merge;
 493                        return ret;
 494                }
 495        }
 496
 497        if (blk_queue_nomerges(q))
 498                return ELEVATOR_NO_MERGE;
 499
 500        /*
 501         * See if our hash lookup can find a potential backmerge.
 502         */
 503        __rq = elv_rqhash_find(q, bio->bi_sector);
 504        if (__rq && elv_rq_merge_ok(__rq, bio)) {
 505                *req = __rq;
 506                return ELEVATOR_BACK_MERGE;
 507        }
 508
 509        if (e->ops->elevator_merge_fn)
 510                return e->ops->elevator_merge_fn(q, req, bio);
 511
 512        return ELEVATOR_NO_MERGE;
 513}
 514
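/*
 * Editor's note -- a simplified sketch of the caller side (roughly what
 * the core block layer's __make_request() does with the result); not
 * part of the original source, details elided:
 *
 *	el_ret = elv_merge(q, &req, bio);
 *	switch (el_ret) {
 *	case ELEVATOR_BACK_MERGE:
 *		// append bio after req->biotail, grow nr_sectors ...
 *		elv_merged_request(q, req, el_ret);
 *		break;
 *	case ELEVATOR_FRONT_MERGE:
 *		// link bio in front, pull req->sector back ...
 *		elv_merged_request(q, req, el_ret);
 *		break;
 *	default:
 *		// no merge: allocate a new request for the bio and
 *		// __elv_add_request() it
 *		break;
 *	}
 */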
 515void elv_merged_request(struct request_queue *q, struct request *rq, int type)
 516{
 517        elevator_t *e = q->elevator;
 518
 519        if (e->ops->elevator_merged_fn)
 520                e->ops->elevator_merged_fn(q, rq, type);
 521
 522        if (type == ELEVATOR_BACK_MERGE)
 523                elv_rqhash_reposition(q, rq);
 524
 525        q->last_merge = rq;
 526}
 527
 528void elv_merge_requests(struct request_queue *q, struct request *rq,
 529                             struct request *next)
 530{
 531        elevator_t *e = q->elevator;
 532
 533        if (e->ops->elevator_merge_req_fn)
 534                e->ops->elevator_merge_req_fn(q, rq, next);
 535
 536        elv_rqhash_reposition(q, rq);
 537        elv_rqhash_del(q, next);
 538
 539        q->nr_sorted--;
 540        q->last_merge = rq;
 541}
 542
 543void elv_requeue_request(struct request_queue *q, struct request *rq)
 544{
 545        /*
 546         * it already went through dequeue, we need to decrement the
 547         * in_flight count again
 548         */
 549        if (blk_account_rq(rq)) {
 550                q->in_flight--;
 551                if (blk_sorted_rq(rq))
 552                        elv_deactivate_rq(q, rq);
 553        }
 554
 555        rq->cmd_flags &= ~REQ_STARTED;
 556
 557        elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE);
 558}
 559
 560static void elv_drain_elevator(struct request_queue *q)
 561{
 562        static int printed;
 563        while (q->elevator->ops->elevator_dispatch_fn(q, 1))
 564                ;
 565        if (q->nr_sorted == 0)
 566                return;
 567        if (printed++ < 10) {
 568                printk(KERN_ERR "%s: forced dispatching is broken "
 569                       "(nr_sorted=%u), please report this\n",
 570                       q->elevator->elevator_type->elevator_name, q->nr_sorted);
 571        }
 572}
 573
 574void elv_insert(struct request_queue *q, struct request *rq, int where)
 575{
 576        struct list_head *pos;
 577        unsigned ordseq;
 578        int unplug_it = 1;
 579
 580        blk_add_trace_rq(q, rq, BLK_TA_INSERT);
 581
 582        rq->q = q;
 583
 584        switch (where) {
 585        case ELEVATOR_INSERT_FRONT:
 586                rq->cmd_flags |= REQ_SOFTBARRIER;
 587
 588                list_add(&rq->queuelist, &q->queue_head);
 589                break;
 590
 591        case ELEVATOR_INSERT_BACK:
 592                rq->cmd_flags |= REQ_SOFTBARRIER;
 593                elv_drain_elevator(q);
 594                list_add_tail(&rq->queuelist, &q->queue_head);
 595                /*
 596                 * We kick the queue here for the following reasons.
 597                 * - The elevator might have returned NULL previously
  598                 *   to delay requests and is returning them now.  As the
  599                 *   queue wasn't empty before this request, ll_rw_blk
  600                 *   won't run the queue on return, resulting in a hang.
 601                 * - Usually, back inserted requests won't be merged
 602                 *   with anything.  There's no point in delaying queue
 603                 *   processing.
 604                 */
 605                blk_remove_plug(q);
 606                q->request_fn(q);
 607                break;
 608
 609        case ELEVATOR_INSERT_SORT:
 610                BUG_ON(!blk_fs_request(rq));
 611                rq->cmd_flags |= REQ_SORTED;
 612                q->nr_sorted++;
 613                if (rq_mergeable(rq)) {
 614                        elv_rqhash_add(q, rq);
 615                        if (!q->last_merge)
 616                                q->last_merge = rq;
 617                }
 618
 619                /*
 620                 * Some ioscheds (cfq) run q->request_fn directly, so
 621                 * rq cannot be accessed after calling
 622                 * elevator_add_req_fn.
 623                 */
 624                q->elevator->ops->elevator_add_req_fn(q, rq);
 625                break;
 626
 627        case ELEVATOR_INSERT_REQUEUE:
 628                /*
 629                 * If ordered flush isn't in progress, we do front
 630                 * insertion; otherwise, requests should be requeued
 631                 * in ordseq order.
 632                 */
 633                rq->cmd_flags |= REQ_SOFTBARRIER;
 634
 635                /*
  636                 * Most requeues happen because of a busy condition;
 637                 * don't force unplug of the queue for that case.
 638                 */
 639                unplug_it = 0;
 640
 641                if (q->ordseq == 0) {
 642                        list_add(&rq->queuelist, &q->queue_head);
 643                        break;
 644                }
 645
 646                ordseq = blk_ordered_req_seq(rq);
 647
 648                list_for_each(pos, &q->queue_head) {
 649                        struct request *pos_rq = list_entry_rq(pos);
 650                        if (ordseq <= blk_ordered_req_seq(pos_rq))
 651                                break;
 652                }
 653
 654                list_add_tail(&rq->queuelist, pos);
 655                break;
 656
 657        default:
 658                printk(KERN_ERR "%s: bad insertion point %d\n",
 659                       __func__, where);
 660                BUG();
 661        }
 662
 663        if (unplug_it && blk_queue_plugged(q)) {
 664                int nrq = q->rq.count[READ] + q->rq.count[WRITE]
 665                        - q->in_flight;
 666
 667                if (nrq >= q->unplug_thresh)
 668                        __generic_unplug_device(q);
 669        }
 670}
 671
 672void __elv_add_request(struct request_queue *q, struct request *rq, int where,
 673                       int plug)
 674{
 675        if (q->ordcolor)
 676                rq->cmd_flags |= REQ_ORDERED_COLOR;
 677
 678        if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
 679                /*
 680                 * toggle ordered color
 681                 */
 682                if (blk_barrier_rq(rq))
 683                        q->ordcolor ^= 1;
 684
 685                /*
 686                 * barriers implicitly indicate back insertion
 687                 */
 688                if (where == ELEVATOR_INSERT_SORT)
 689                        where = ELEVATOR_INSERT_BACK;
 690
 691                /*
 692                 * this request is scheduling boundary, update
 693                 * end_sector
 694                 */
 695                if (blk_fs_request(rq)) {
 696                        q->end_sector = rq_end_sector(rq);
 697                        q->boundary_rq = rq;
 698                }
 699        } else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
 700                    where == ELEVATOR_INSERT_SORT)
 701                where = ELEVATOR_INSERT_BACK;
 702
 703        if (plug)
 704                blk_plug_device(q);
 705
 706        elv_insert(q, rq, where);
 707}
 708EXPORT_SYMBOL(__elv_add_request);
 709
 710void elv_add_request(struct request_queue *q, struct request *rq, int where,
 711                     int plug)
 712{
 713        unsigned long flags;
 714
 715        spin_lock_irqsave(q->queue_lock, flags);
 716        __elv_add_request(q, rq, where, plug);
 717        spin_unlock_irqrestore(q->queue_lock, flags);
 718}
 719EXPORT_SYMBOL(elv_add_request);
 720
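/*
 * Editor's note -- usage sketch, not part of the original source:
 * __elv_add_request() expects q->queue_lock to be held by the caller,
 * while elv_add_request() is the self-locking wrapper for contexts that
 * run without the lock, e.g. (hypothetical call site):
 *
 *	// queue lock not held here
 *	elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
 */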
 721static inline struct request *__elv_next_request(struct request_queue *q)
 722{
 723        struct request *rq;
 724
 725        while (1) {
 726                while (!list_empty(&q->queue_head)) {
 727                        rq = list_entry_rq(q->queue_head.next);
 728                        if (blk_do_ordered(q, &rq))
 729                                return rq;
 730                }
 731
 732                if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) ||
 733                    !q->elevator->ops->elevator_dispatch_fn(q, 0))
 734                        return NULL;
 735        }
 736}
 737
 738struct request *elv_next_request(struct request_queue *q)
 739{
 740        struct request *rq;
 741        int ret;
 742
 743        while ((rq = __elv_next_request(q)) != NULL) {
 744                /*
  745                 * Kill the empty barrier placeholder; the driver must
  746                 * never see it.
 747                 */
 748                if (blk_empty_barrier(rq)) {
 749                        end_queued_request(rq, 1);
 750                        continue;
 751                }
 752                if (!(rq->cmd_flags & REQ_STARTED)) {
 753                        /*
 754                         * This is the first time the device driver
 755                         * sees this request (possibly after
 756                         * requeueing).  Notify IO scheduler.
 757                         */
 758                        if (blk_sorted_rq(rq))
 759                                elv_activate_rq(q, rq);
 760
 761                        /*
 762                         * just mark as started even if we don't start
 763                         * it, a request that has been delayed should
 764                         * not be passed by new incoming requests
 765                         */
 766                        rq->cmd_flags |= REQ_STARTED;
 767                        blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
 768                }
 769
 770                if (!q->boundary_rq || q->boundary_rq == rq) {
 771                        q->end_sector = rq_end_sector(rq);
 772                        q->boundary_rq = NULL;
 773                }
 774
 775                if (rq->cmd_flags & REQ_DONTPREP)
 776                        break;
 777
 778                if (q->dma_drain_size && rq->data_len) {
 779                        /*
  780                         * make sure space for the drain appears. we
 781                         * know we can do this because max_hw_segments
 782                         * has been adjusted to be one fewer than the
 783                         * device can handle
 784                         */
 785                        rq->nr_phys_segments++;
 786                        rq->nr_hw_segments++;
 787                }
 788
 789                if (!q->prep_rq_fn)
 790                        break;
 791
 792                ret = q->prep_rq_fn(q, rq);
 793                if (ret == BLKPREP_OK) {
 794                        break;
 795                } else if (ret == BLKPREP_DEFER) {
 796                        /*
 797                         * the request may have been (partially) prepped.
 798                         * we need to keep this request in the front to
 799                         * avoid resource deadlock.  REQ_STARTED will
 800                         * prevent other fs requests from passing this one.
 801                         */
 802                        if (q->dma_drain_size && rq->data_len &&
 803                            !(rq->cmd_flags & REQ_DONTPREP)) {
 804                                /*
 805                                 * remove the space for the drain we added
 806                                 * so that we don't add it again
 807                                 */
 808                                --rq->nr_phys_segments;
 809                                --rq->nr_hw_segments;
 810                        }
 811
 812                        rq = NULL;
 813                        break;
 814                } else if (ret == BLKPREP_KILL) {
 815                        rq->cmd_flags |= REQ_QUIET;
 816                        end_queued_request(rq, 0);
 817                } else {
 818                        printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
 819                        break;
 820                }
 821        }
 822
 823        return rq;
 824}
 825EXPORT_SYMBOL(elv_next_request);
 826
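/*
 * Editor's note -- a skeletal sketch of the classic consumer of
 * elv_next_request(); hypothetical driver code, not part of the
 * original source:
 *
 *	static void example_request_fn(struct request_queue *q)
 *	{
 *		struct request *rq;
 *
 *		// called by the block layer with q->queue_lock held
 *		while ((rq = elv_next_request(q)) != NULL) {
 *			blkdev_dequeue_request(rq); // -> elv_dequeue_request()
 *			// hand rq to the hardware and complete it later
 *		}
 *	}
 */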
 827void elv_dequeue_request(struct request_queue *q, struct request *rq)
 828{
 829        BUG_ON(list_empty(&rq->queuelist));
 830        BUG_ON(ELV_ON_HASH(rq));
 831
 832        list_del_init(&rq->queuelist);
 833
 834        /*
 835         * the time frame between a request being removed from the lists
  836         * and it being freed is accounted as io that is in progress at
 837         * the driver side.
 838         */
 839        if (blk_account_rq(rq))
 840                q->in_flight++;
 841}
 842EXPORT_SYMBOL(elv_dequeue_request);
 843
 844int elv_queue_empty(struct request_queue *q)
 845{
 846        elevator_t *e = q->elevator;
 847
 848        if (!list_empty(&q->queue_head))
 849                return 0;
 850
 851        if (e->ops->elevator_queue_empty_fn)
 852                return e->ops->elevator_queue_empty_fn(q);
 853
 854        return 1;
 855}
 856EXPORT_SYMBOL(elv_queue_empty);
 857
 858struct request *elv_latter_request(struct request_queue *q, struct request *rq)
 859{
 860        elevator_t *e = q->elevator;
 861
 862        if (e->ops->elevator_latter_req_fn)
 863                return e->ops->elevator_latter_req_fn(q, rq);
 864        return NULL;
 865}
 866
 867struct request *elv_former_request(struct request_queue *q, struct request *rq)
 868{
 869        elevator_t *e = q->elevator;
 870
 871        if (e->ops->elevator_former_req_fn)
 872                return e->ops->elevator_former_req_fn(q, rq);
 873        return NULL;
 874}
 875
 876int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 877{
 878        elevator_t *e = q->elevator;
 879
 880        if (e->ops->elevator_set_req_fn)
 881                return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
 882
 883        rq->elevator_private = NULL;
 884        return 0;
 885}
 886
 887void elv_put_request(struct request_queue *q, struct request *rq)
 888{
 889        elevator_t *e = q->elevator;
 890
 891        if (e->ops->elevator_put_req_fn)
 892                e->ops->elevator_put_req_fn(rq);
 893}
 894
 895int elv_may_queue(struct request_queue *q, int rw)
 896{
 897        elevator_t *e = q->elevator;
 898
 899        if (e->ops->elevator_may_queue_fn)
 900                return e->ops->elevator_may_queue_fn(q, rw);
 901
 902        return ELV_MQUEUE_MAY;
 903}
 904
 905void elv_completed_request(struct request_queue *q, struct request *rq)
 906{
 907        elevator_t *e = q->elevator;
 908
 909        /*
 910         * request is released from the driver, io must be done
 911         */
 912        if (blk_account_rq(rq)) {
 913                q->in_flight--;
 914                if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
 915                        e->ops->elevator_completed_req_fn(q, rq);
 916        }
 917
 918        /*
 919         * Check if the queue is waiting for fs requests to be
 920         * drained for flush sequence.
 921         */
 922        if (unlikely(q->ordseq)) {
 923                struct request *first_rq = list_entry_rq(q->queue_head.next);
 924                if (q->in_flight == 0 &&
 925                    blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
 926                    blk_ordered_req_seq(first_rq) > QUEUE_ORDSEQ_DRAIN) {
 927                        blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
 928                        q->request_fn(q);
 929                }
 930        }
 931}
 932
 933#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
 934
 935static ssize_t
 936elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 937{
 938        elevator_t *e = container_of(kobj, elevator_t, kobj);
 939        struct elv_fs_entry *entry = to_elv(attr);
 940        ssize_t error;
 941
 942        if (!entry->show)
 943                return -EIO;
 944
 945        mutex_lock(&e->sysfs_lock);
 946        error = e->ops ? entry->show(e, page) : -ENOENT;
 947        mutex_unlock(&e->sysfs_lock);
 948        return error;
 949}
 950
 951static ssize_t
 952elv_attr_store(struct kobject *kobj, struct attribute *attr,
 953               const char *page, size_t length)
 954{
 955        elevator_t *e = container_of(kobj, elevator_t, kobj);
 956        struct elv_fs_entry *entry = to_elv(attr);
 957        ssize_t error;
 958
 959        if (!entry->store)
 960                return -EIO;
 961
 962        mutex_lock(&e->sysfs_lock);
 963        error = e->ops ? entry->store(e, page, length) : -ENOENT;
 964        mutex_unlock(&e->sysfs_lock);
 965        return error;
 966}
 967
 968static struct sysfs_ops elv_sysfs_ops = {
 969        .show   = elv_attr_show,
 970        .store  = elv_attr_store,
 971};
 972
 973static struct kobj_type elv_ktype = {
 974        .sysfs_ops      = &elv_sysfs_ops,
 975        .release        = elevator_release,
 976};
 977
 978int elv_register_queue(struct request_queue *q)
 979{
 980        elevator_t *e = q->elevator;
 981        int error;
 982
 983        error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
 984        if (!error) {
 985                struct elv_fs_entry *attr = e->elevator_type->elevator_attrs;
 986                if (attr) {
 987                        while (attr->attr.name) {
 988                                if (sysfs_create_file(&e->kobj, &attr->attr))
 989                                        break;
 990                                attr++;
 991                        }
 992                }
 993                kobject_uevent(&e->kobj, KOBJ_ADD);
 994        }
 995        return error;
 996}
 997
 998static void __elv_unregister_queue(elevator_t *e)
 999{
1000        kobject_uevent(&e->kobj, KOBJ_REMOVE);
1001        kobject_del(&e->kobj);
1002}
1003
1004void elv_unregister_queue(struct request_queue *q)
1005{
1006        if (q)
1007                __elv_unregister_queue(q->elevator);
1008}
1009
1010void elv_register(struct elevator_type *e)
1011{
1012        char *def = "";
1013
1014        spin_lock(&elv_list_lock);
1015        BUG_ON(elevator_find(e->elevator_name));
1016        list_add_tail(&e->list, &elv_list);
1017        spin_unlock(&elv_list_lock);
1018
1019        if (!strcmp(e->elevator_name, chosen_elevator) ||
1020                        (!*chosen_elevator &&
1021                         !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED)))
1022                                def = " (default)";
1023
1024        printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
1025                                                                def);
1026}
1027EXPORT_SYMBOL_GPL(elv_register);
1028
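/*
 * Editor's note -- a skeletal sketch of a scheduler module using
 * elv_register()/elv_unregister(); hypothetical code modelled on the
 * in-tree noop scheduler, not part of the original source:
 *
 *	static struct elevator_type elevator_example = {
 *		.ops = {
 *			.elevator_dispatch_fn	= example_dispatch_fn,
 *			.elevator_add_req_fn	= example_add_req_fn,
 *			.elevator_init_fn	= example_init_queue,
 *			.elevator_exit_fn	= example_exit_queue,
 *		},
 *		.elevator_name	= "example",
 *		.elevator_owner	= THIS_MODULE,
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		elv_register(&elevator_example);
 *		return 0;
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		elv_unregister(&elevator_example);
 *	}
 *
 *	module_init(example_init);
 *	module_exit(example_exit);
 */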
1029void elv_unregister(struct elevator_type *e)
1030{
1031        struct task_struct *g, *p;
1032
1033        /*
1034         * Iterate every thread in the process to remove the io contexts.
1035         */
1036        if (e->ops.trim) {
1037                read_lock(&tasklist_lock);
1038                do_each_thread(g, p) {
1039                        task_lock(p);
1040                        if (p->io_context)
1041                                e->ops.trim(p->io_context);
1042                        task_unlock(p);
1043                } while_each_thread(g, p);
1044                read_unlock(&tasklist_lock);
1045        }
1046
1047        spin_lock(&elv_list_lock);
1048        list_del_init(&e->list);
1049        spin_unlock(&elv_list_lock);
1050}
1051EXPORT_SYMBOL_GPL(elv_unregister);
1052
1053/*
1054 * switch to new_e io scheduler. be careful not to introduce deadlocks -
 1055 * we don't free the old io scheduler before we have allocated what we
 1056 * need for the new one. this way we have a chance of going back to the old
 1057 * one if the new one fails init for some reason.
1058 */
1059static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
1060{
1061        elevator_t *old_elevator, *e;
1062        void *data;
1063
1064        /*
1065         * Allocate new elevator
1066         */
1067        e = elevator_alloc(q, new_e);
1068        if (!e)
1069                return 0;
1070
1071        data = elevator_init_queue(q, e);
1072        if (!data) {
1073                kobject_put(&e->kobj);
1074                return 0;
1075        }
1076
1077        /*
1078         * Turn on BYPASS and drain all requests w/ elevator private data
1079         */
1080        spin_lock_irq(q->queue_lock);
1081
1082        queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
1083
1084        elv_drain_elevator(q);
1085
1086        while (q->rq.elvpriv) {
1087                blk_remove_plug(q);
1088                q->request_fn(q);
1089                spin_unlock_irq(q->queue_lock);
1090                msleep(10);
1091                spin_lock_irq(q->queue_lock);
1092                elv_drain_elevator(q);
1093        }
1094
1095        /*
1096         * Remember old elevator.
1097         */
1098        old_elevator = q->elevator;
1099
1100        /*
1101         * attach and start new elevator
1102         */
1103        elevator_attach(q, e, data);
1104
1105        spin_unlock_irq(q->queue_lock);
1106
1107        __elv_unregister_queue(old_elevator);
1108
1109        if (elv_register_queue(q))
1110                goto fail_register;
1111
1112        /*
1113         * finally exit old elevator and turn off BYPASS.
1114         */
1115        elevator_exit(old_elevator);
1116        spin_lock_irq(q->queue_lock);
1117        queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
1118        spin_unlock_irq(q->queue_lock);
1119
1120        blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
1121
1122        return 1;
1123
1124fail_register:
1125        /*
1126         * switch failed, exit the new io scheduler and reattach the old
1127         * one again (along with re-adding the sysfs dir)
1128         */
1129        elevator_exit(e);
1130        q->elevator = old_elevator;
1131        elv_register_queue(q);
1132
1133        spin_lock_irq(q->queue_lock);
1134        queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
1135        spin_unlock_irq(q->queue_lock);
1136
1137        return 0;
1138}
1139
1140ssize_t elv_iosched_store(struct request_queue *q, const char *name,
1141                          size_t count)
1142{
1143        char elevator_name[ELV_NAME_MAX];
1144        size_t len;
1145        struct elevator_type *e;
1146
1147        elevator_name[sizeof(elevator_name) - 1] = '\0';
1148        strncpy(elevator_name, name, sizeof(elevator_name) - 1);
1149        len = strlen(elevator_name);
1150
1151        if (len && elevator_name[len - 1] == '\n')
1152                elevator_name[len - 1] = '\0';
1153
1154        e = elevator_get(elevator_name);
1155        if (!e) {
1156                printk(KERN_ERR "elevator: type %s not found\n", elevator_name);
1157                return -EINVAL;
1158        }
1159
1160        if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) {
1161                elevator_put(e);
1162                return count;
1163        }
1164
1165        if (!elevator_switch(q, e))
1166                printk(KERN_ERR "elevator: switch to %s failed\n",
1167                                                        elevator_name);
1168        return count;
1169}
1170
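/*
 * Editor's note -- usage sketch, not part of the original source:
 * elv_iosched_store() above and elv_iosched_show() below back the
 * per-queue sysfs "scheduler" attribute, so a runtime switch looks like:
 *
 *	# cat /sys/block/sda/queue/scheduler
 *	noop anticipatory deadline [cfq]
 *	# echo deadline > /sys/block/sda/queue/scheduler
 */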
1171ssize_t elv_iosched_show(struct request_queue *q, char *name)
1172{
1173        elevator_t *e = q->elevator;
1174        struct elevator_type *elv = e->elevator_type;
1175        struct elevator_type *__e;
1176        int len = 0;
1177
1178        spin_lock(&elv_list_lock);
1179        list_for_each_entry(__e, &elv_list, list) {
1180                if (!strcmp(elv->elevator_name, __e->elevator_name))
1181                        len += sprintf(name+len, "[%s] ", elv->elevator_name);
1182                else
1183                        len += sprintf(name+len, "%s ", __e->elevator_name);
1184        }
1185        spin_unlock(&elv_list_lock);
1186
1187        len += sprintf(len+name, "\n");
1188        return len;
1189}
1190
1191struct request *elv_rb_former_request(struct request_queue *q,
1192                                      struct request *rq)
1193{
1194        struct rb_node *rbprev = rb_prev(&rq->rb_node);
1195
1196        if (rbprev)
1197                return rb_entry_rq(rbprev);
1198
1199        return NULL;
1200}
1201EXPORT_SYMBOL(elv_rb_former_request);
1202
1203struct request *elv_rb_latter_request(struct request_queue *q,
1204                                      struct request *rq)
1205{
1206        struct rb_node *rbnext = rb_next(&rq->rb_node);
1207
1208        if (rbnext)
1209                return rb_entry_rq(rbnext);
1210
1211        return NULL;
1212}
1213EXPORT_SYMBOL(elv_rb_latter_request);
1214