linux/block/cfq-iosched.c
<<
>>
Prefs
   1/*
   2 *  CFQ, or complete fairness queueing, disk scheduler.
   3 *
   4 *  Based on ideas from a previously unfinished io
   5 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
   6 *
   7 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
   8 */
   9#include <linux/module.h>
  10#include <linux/blkdev.h>
  11#include <linux/elevator.h>
  12#include <linux/rbtree.h>
  13#include <linux/ioprio.h>
  14#include <linux/blktrace_api.h>
  15
  16/*
  17 * tunables
  18 */
  19/* max queue in one round of service */
  20static const int cfq_quantum = 4;
  21static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
  22/* maximum backwards seek, in KiB */
  23static const int cfq_back_max = 16 * 1024;
  24/* penalty of a backwards seek */
  25static const int cfq_back_penalty = 2;
  26static const int cfq_slice_sync = HZ / 10;
  27static int cfq_slice_async = HZ / 25;
  28static const int cfq_slice_async_rq = 2;
  29static int cfq_slice_idle = HZ / 125;
  30
  31/*
  32 * offset from end of service tree
  33 */
  34#define CFQ_IDLE_DELAY          (HZ / 5)
  35
  36/*
  37 * below this threshold, we consider thinktime immediate
  38 */
  39#define CFQ_MIN_TT              (2)
  40
  41#define CFQ_SLICE_SCALE         (5)
  42#define CFQ_HW_QUEUE_MIN        (5)
  43
  44#define RQ_CIC(rq)              \
  45        ((struct cfq_io_context *) (rq)->elevator_private)
  46#define RQ_CFQQ(rq)             (struct cfq_queue *) ((rq)->elevator_private2)
  47
  48static struct kmem_cache *cfq_pool;
  49static struct kmem_cache *cfq_ioc_pool;
  50
  51static DEFINE_PER_CPU(unsigned long, ioc_count);
  52static struct completion *ioc_gone;
  53static DEFINE_SPINLOCK(ioc_gone_lock);
  54
  55#define CFQ_PRIO_LISTS          IOPRIO_BE_NR
  56#define cfq_class_idle(cfqq)    ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
  57#define cfq_class_rt(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
  58
  59#define sample_valid(samples)   ((samples) > 80)
  60
  61/*
  62 * Most of our rbtree usage is for sorting with min extraction, so
  63 * if we cache the leftmost node we don't have to walk down the tree
  64 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
  65 * move this into the elevator for the rq sorting as well.
  66 */
  67struct cfq_rb_root {
  68        struct rb_root rb;
  69        struct rb_node *left;
  70};
  71#define CFQ_RB_ROOT     (struct cfq_rb_root) { RB_ROOT, NULL, }
  72
  73/*
  74 * Per process-grouping structure
  75 */
  76struct cfq_queue {
  77        /* reference count */
  78        atomic_t ref;
  79        /* various state flags, see below */
  80        unsigned int flags;
  81        /* parent cfq_data */
  82        struct cfq_data *cfqd;
  83        /* service_tree member */
  84        struct rb_node rb_node;
  85        /* service_tree key */
  86        unsigned long rb_key;
  87        /* prio tree member */
  88        struct rb_node p_node;
  89        /* prio tree root we belong to, if any */
  90        struct rb_root *p_root;
  91        /* sorted list of pending requests */
  92        struct rb_root sort_list;
  93        /* if fifo isn't expired, next request to serve */
  94        struct request *next_rq;
  95        /* requests queued in sort_list */
  96        int queued[2];
  97        /* currently allocated requests */
  98        int allocated[2];
  99        /* fifo list of requests in sort_list */
 100        struct list_head fifo;
 101
 102        unsigned long slice_end;
 103        long slice_resid;
 104        unsigned int slice_dispatch;
 105
 106        /* pending metadata requests */
 107        int meta_pending;
 108        /* number of requests that are on the dispatch list or inside driver */
 109        int dispatched;
 110
 111        /* io prio of this group */
 112        unsigned short ioprio, org_ioprio;
 113        unsigned short ioprio_class, org_ioprio_class;
 114
 115        pid_t pid;
 116};
 117
 118/*
 119 * Per block device queue structure
 120 */
 121struct cfq_data {
 122        struct request_queue *queue;
 123
 124        /*
 125         * rr list of queues with requests and the count of them
 126         */
 127        struct cfq_rb_root service_tree;
 128
 129        /*
 130         * Each priority tree is sorted by next_request position.  These
 131         * trees are used when determining if two or more queues are
 132         * interleaving requests (see cfq_close_cooperator).
 133         */
 134        struct rb_root prio_trees[CFQ_PRIO_LISTS];
 135
 136        unsigned int busy_queues;
 137        /*
 138         * Used to track any pending rt requests so we can pre-empt current
 139         * non-RT cfqq in service when this value is non-zero.
 140         */
 141        unsigned int busy_rt_queues;
 142
 143        int rq_in_driver;
 144        int sync_flight;
 145
 146        /*
 147         * queue-depth detection
 148         */
 149        int rq_queued;
 150        int hw_tag;
 151        int hw_tag_samples;
 152        int rq_in_driver_peak;
 153
 154        /*
 155         * idle window management
 156         */
 157        struct timer_list idle_slice_timer;
 158        struct work_struct unplug_work;
 159
 160        struct cfq_queue *active_queue;
 161        struct cfq_io_context *active_cic;
 162
 163        /*
 164         * async queue for each priority case
 165         */
 166        struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
 167        struct cfq_queue *async_idle_cfqq;
 168
 169        sector_t last_position;
 170
 171        /*
 172         * tunables, see top of file
 173         */
 174        unsigned int cfq_quantum;
 175        unsigned int cfq_fifo_expire[2];
 176        unsigned int cfq_back_penalty;
 177        unsigned int cfq_back_max;
 178        unsigned int cfq_slice[2];
 179        unsigned int cfq_slice_async_rq;
 180        unsigned int cfq_slice_idle;
 181
 182        struct list_head cic_list;
 183
 184        /*
 185         * Fallback dummy cfqq for extreme OOM conditions
 186         */
 187        struct cfq_queue oom_cfqq;
 188};
 189
 190enum cfqq_state_flags {
 191        CFQ_CFQQ_FLAG_on_rr = 0,        /* on round-robin busy list */
 192        CFQ_CFQQ_FLAG_wait_request,     /* waiting for a request */
 193        CFQ_CFQQ_FLAG_must_dispatch,    /* must be allowed a dispatch */
 194        CFQ_CFQQ_FLAG_must_alloc,       /* must be allowed rq alloc */
 195        CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
 196        CFQ_CFQQ_FLAG_fifo_expire,      /* FIFO checked in this slice */
 197        CFQ_CFQQ_FLAG_idle_window,      /* slice idling enabled */
 198        CFQ_CFQQ_FLAG_prio_changed,     /* task priority has changed */
 199        CFQ_CFQQ_FLAG_slice_new,        /* no requests dispatched in slice */
 200        CFQ_CFQQ_FLAG_sync,             /* synchronous queue */
 201        CFQ_CFQQ_FLAG_coop,             /* has done a coop jump of the queue */
 202};
 203
 204#define CFQ_CFQQ_FNS(name)                                              \
 205static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)         \
 206{                                                                       \
 207        (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);                   \
 208}                                                                       \
 209static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)        \
 210{                                                                       \
 211        (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);                  \
 212}                                                                       \
 213static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)         \
 214{                                                                       \
 215        return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;      \
 216}
 217
 218CFQ_CFQQ_FNS(on_rr);
 219CFQ_CFQQ_FNS(wait_request);
 220CFQ_CFQQ_FNS(must_dispatch);
 221CFQ_CFQQ_FNS(must_alloc);
 222CFQ_CFQQ_FNS(must_alloc_slice);
 223CFQ_CFQQ_FNS(fifo_expire);
 224CFQ_CFQQ_FNS(idle_window);
 225CFQ_CFQQ_FNS(prio_changed);
 226CFQ_CFQQ_FNS(slice_new);
 227CFQ_CFQQ_FNS(sync);
 228CFQ_CFQQ_FNS(coop);
 229#undef CFQ_CFQQ_FNS
 230
 231#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)  \
 232        blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
 233#define cfq_log(cfqd, fmt, args...)     \
 234        blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
 235
 236static void cfq_dispatch_insert(struct request_queue *, struct request *);
 237static struct cfq_queue *cfq_get_queue(struct cfq_data *, int,
 238                                       struct io_context *, gfp_t);
 239static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
 240                                                struct io_context *);
 241
 242static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
 243                                            int is_sync)
 244{
 245        return cic->cfqq[!!is_sync];
 246}
 247
 248static inline void cic_set_cfqq(struct cfq_io_context *cic,
 249                                struct cfq_queue *cfqq, int is_sync)
 250{
 251        cic->cfqq[!!is_sync] = cfqq;
 252}
 253
 254/*
 255 * We regard a request as SYNC, if it's either a read or has the SYNC bit
 256 * set (in which case it could also be direct WRITE).
 257 */
 258static inline int cfq_bio_sync(struct bio *bio)
 259{
 260        if (bio_data_dir(bio) == READ || bio_sync(bio))
 261                return 1;
 262
 263        return 0;
 264}
 265
 266/*
 267 * scheduler run of queue, if there are requests pending and no one in the
 268 * driver that will restart queueing
 269 */
 270static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
 271{
 272        if (cfqd->busy_queues) {
 273                cfq_log(cfqd, "schedule dispatch");
 274                kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
 275        }
 276}
 277
 278static int cfq_queue_empty(struct request_queue *q)
 279{
 280        struct cfq_data *cfqd = q->elevator->elevator_data;
 281
 282        return !cfqd->busy_queues;
 283}
 284
 285/*
 286 * Scale schedule slice based on io priority. Use the sync time slice only
 287 * if a queue is marked sync and has sync io queued. A sync queue with async
 288 * io only, should not get full sync slice length.
 289 */
 290static inline int cfq_prio_slice(struct cfq_data *cfqd, int sync,
 291                                 unsigned short prio)
 292{
 293        const int base_slice = cfqd->cfq_slice[sync];
 294
 295        WARN_ON(prio >= IOPRIO_BE_NR);
 296
 297        return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
 298}
 299
 300static inline int
 301cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 302{
 303        return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 304}
 305
 306static inline void
 307cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 308{
 309        cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
 310        cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
 311}
 312
 313/*
 314 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
 315 * isn't valid until the first request from the dispatch is activated
 316 * and the slice time set.
 317 */
 318static inline int cfq_slice_used(struct cfq_queue *cfqq)
 319{
 320        if (cfq_cfqq_slice_new(cfqq))
 321                return 0;
 322        if (time_before(jiffies, cfqq->slice_end))
 323                return 0;
 324
 325        return 1;
 326}
 327
 328/*
 329 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
 330 * We choose the request that is closest to the head right now. Distance
 331 * behind the head is penalized and only allowed to a certain extent.
 332 */
 333static struct request *
 334cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
 335{
 336        sector_t last, s1, s2, d1 = 0, d2 = 0;
 337        unsigned long back_max;
 338#define CFQ_RQ1_WRAP    0x01 /* request 1 wraps */
 339#define CFQ_RQ2_WRAP    0x02 /* request 2 wraps */
 340        unsigned wrap = 0; /* bit mask: requests behind the disk head? */
 341
 342        if (rq1 == NULL || rq1 == rq2)
 343                return rq2;
 344        if (rq2 == NULL)
 345                return rq1;
 346
 347        if (rq_is_sync(rq1) && !rq_is_sync(rq2))
 348                return rq1;
 349        else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
 350                return rq2;
 351        if (rq_is_meta(rq1) && !rq_is_meta(rq2))
 352                return rq1;
 353        else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
 354                return rq2;
 355
 356        s1 = blk_rq_pos(rq1);
 357        s2 = blk_rq_pos(rq2);
 358
 359        last = cfqd->last_position;
 360
 361        /*
 362         * by definition, 1KiB is 2 sectors
 363         */
 364        back_max = cfqd->cfq_back_max * 2;
 365
 366        /*
 367         * Strict one way elevator _except_ in the case where we allow
 368         * short backward seeks which are biased as twice the cost of a
 369         * similar forward seek.
 370         */
 371        if (s1 >= last)
 372                d1 = s1 - last;
 373        else if (s1 + back_max >= last)
 374                d1 = (last - s1) * cfqd->cfq_back_penalty;
 375        else
 376                wrap |= CFQ_RQ1_WRAP;
 377
 378        if (s2 >= last)
 379                d2 = s2 - last;
 380        else if (s2 + back_max >= last)
 381                d2 = (last - s2) * cfqd->cfq_back_penalty;
 382        else
 383                wrap |= CFQ_RQ2_WRAP;
 384
 385        /* Found required data */
 386
 387        /*
 388         * By doing switch() on the bit mask "wrap" we avoid having to
 389         * check two variables for all permutations: --> faster!
 390         */
 391        switch (wrap) {
 392        case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
 393                if (d1 < d2)
 394                        return rq1;
 395                else if (d2 < d1)
 396                        return rq2;
 397                else {
 398                        if (s1 >= s2)
 399                                return rq1;
 400                        else
 401                                return rq2;
 402                }
 403
 404        case CFQ_RQ2_WRAP:
 405                return rq1;
 406        case CFQ_RQ1_WRAP:
 407                return rq2;
 408        case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
 409        default:
 410                /*
 411                 * Since both rqs are wrapped,
 412                 * start with the one that's further behind head
 413                 * (--> only *one* back seek required),
 414                 * since back seek takes more time than forward.
 415                 */
 416                if (s1 <= s2)
 417                        return rq1;
 418                else
 419                        return rq2;
 420        }
 421}
 422
 423/*
 424 * The below is leftmost cache rbtree addon
 425 */
 426static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
 427{
 428        if (!root->left)
 429                root->left = rb_first(&root->rb);
 430
 431        if (root->left)
 432                return rb_entry(root->left, struct cfq_queue, rb_node);
 433
 434        return NULL;
 435}
 436
 437static void rb_erase_init(struct rb_node *n, struct rb_root *root)
 438{
 439        rb_erase(n, root);
 440        RB_CLEAR_NODE(n);
 441}
 442
 443static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
 444{
 445        if (root->left == n)
 446                root->left = NULL;
 447        rb_erase_init(n, &root->rb);
 448}
 449
 450/*
 451 * would be nice to take fifo expire time into account as well
 452 */
 453static struct request *
 454cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 455                  struct request *last)
 456{
 457        struct rb_node *rbnext = rb_next(&last->rb_node);
 458        struct rb_node *rbprev = rb_prev(&last->rb_node);
 459        struct request *next = NULL, *prev = NULL;
 460
 461        BUG_ON(RB_EMPTY_NODE(&last->rb_node));
 462
 463        if (rbprev)
 464                prev = rb_entry_rq(rbprev);
 465
 466        if (rbnext)
 467                next = rb_entry_rq(rbnext);
 468        else {
 469                rbnext = rb_first(&cfqq->sort_list);
 470                if (rbnext && rbnext != &last->rb_node)
 471                        next = rb_entry_rq(rbnext);
 472        }
 473
 474        return cfq_choose_req(cfqd, next, prev);
 475}
 476
 477static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
 478                                      struct cfq_queue *cfqq)
 479{
 480        /*
 481         * just an approximation, should be ok.
 482         */
 483        return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) -
 484                       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
 485}
 486
 487/*
 488 * The cfqd->service_tree holds all pending cfq_queue's that have
 489 * requests waiting to be processed. It is sorted in the order that
 490 * we will service the queues.
 491 */
 492static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 493                                 int add_front)
 494{
 495        struct rb_node **p, *parent;
 496        struct cfq_queue *__cfqq;
 497        unsigned long rb_key;
 498        int left;
 499
 500        if (cfq_class_idle(cfqq)) {
 501                rb_key = CFQ_IDLE_DELAY;
 502                parent = rb_last(&cfqd->service_tree.rb);
 503                if (parent && parent != &cfqq->rb_node) {
 504                        __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
 505                        rb_key += __cfqq->rb_key;
 506                } else
 507                        rb_key += jiffies;
 508        } else if (!add_front) {
 509                rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
 510                rb_key += cfqq->slice_resid;
 511                cfqq->slice_resid = 0;
 512        } else
 513                rb_key = 0;
 514
 515        if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
 516                /*
 517                 * same position, nothing more to do
 518                 */
 519                if (rb_key == cfqq->rb_key)
 520                        return;
 521
 522                cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
 523        }
 524
 525        left = 1;
 526        parent = NULL;
 527        p = &cfqd->service_tree.rb.rb_node;
 528        while (*p) {
 529                struct rb_node **n;
 530
 531                parent = *p;
 532                __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
 533
 534                /*
 535                 * sort RT queues first, we always want to give
 536                 * preference to them. IDLE queues goes to the back.
 537                 * after that, sort on the next service time.
 538                 */
 539                if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq))
 540                        n = &(*p)->rb_left;
 541                else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
 542                        n = &(*p)->rb_right;
 543                else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
 544                        n = &(*p)->rb_left;
 545                else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
 546                        n = &(*p)->rb_right;
 547                else if (rb_key < __cfqq->rb_key)
 548                        n = &(*p)->rb_left;
 549                else
 550                        n = &(*p)->rb_right;
 551
 552                if (n == &(*p)->rb_right)
 553                        left = 0;
 554
 555                p = n;
 556        }
 557
 558        if (left)
 559                cfqd->service_tree.left = &cfqq->rb_node;
 560
 561        cfqq->rb_key = rb_key;
 562        rb_link_node(&cfqq->rb_node, parent, p);
 563        rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
 564}
 565
 566static struct cfq_queue *
 567cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
 568                     sector_t sector, struct rb_node **ret_parent,
 569                     struct rb_node ***rb_link)
 570{
 571        struct rb_node **p, *parent;
 572        struct cfq_queue *cfqq = NULL;
 573
 574        parent = NULL;
 575        p = &root->rb_node;
 576        while (*p) {
 577                struct rb_node **n;
 578
 579                parent = *p;
 580                cfqq = rb_entry(parent, struct cfq_queue, p_node);
 581
 582                /*
 583                 * Sort strictly based on sector.  Smallest to the left,
 584                 * largest to the right.
 585                 */
 586                if (sector > blk_rq_pos(cfqq->next_rq))
 587                        n = &(*p)->rb_right;
 588                else if (sector < blk_rq_pos(cfqq->next_rq))
 589                        n = &(*p)->rb_left;
 590                else
 591                        break;
 592                p = n;
 593                cfqq = NULL;
 594        }
 595
 596        *ret_parent = parent;
 597        if (rb_link)
 598                *rb_link = p;
 599        return cfqq;
 600}
 601
 602static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 603{
 604        struct rb_node **p, *parent;
 605        struct cfq_queue *__cfqq;
 606
 607        if (cfqq->p_root) {
 608                rb_erase(&cfqq->p_node, cfqq->p_root);
 609                cfqq->p_root = NULL;
 610        }
 611
 612        if (cfq_class_idle(cfqq))
 613                return;
 614        if (!cfqq->next_rq)
 615                return;
 616
 617        cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
 618        __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
 619                                      blk_rq_pos(cfqq->next_rq), &parent, &p);
 620        if (!__cfqq) {
 621                rb_link_node(&cfqq->p_node, parent, p);
 622                rb_insert_color(&cfqq->p_node, cfqq->p_root);
 623        } else
 624                cfqq->p_root = NULL;
 625}
 626
 627/*
 628 * Update cfqq's position in the service tree.
 629 */
 630static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 631{
 632        /*
 633         * Resorting requires the cfqq to be on the RR list already.
 634         */
 635        if (cfq_cfqq_on_rr(cfqq)) {
 636                cfq_service_tree_add(cfqd, cfqq, 0);
 637                cfq_prio_tree_add(cfqd, cfqq);
 638        }
 639}
 640
 641/*
 642 * add to busy list of queues for service, trying to be fair in ordering
 643 * the pending list according to last request service
 644 */
 645static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 646{
 647        cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
 648        BUG_ON(cfq_cfqq_on_rr(cfqq));
 649        cfq_mark_cfqq_on_rr(cfqq);
 650        cfqd->busy_queues++;
 651        if (cfq_class_rt(cfqq))
 652                cfqd->busy_rt_queues++;
 653
 654        cfq_resort_rr_list(cfqd, cfqq);
 655}
 656
 657/*
 658 * Called when the cfqq no longer has requests pending, remove it from
 659 * the service tree.
 660 */
 661static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 662{
 663        cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
 664        BUG_ON(!cfq_cfqq_on_rr(cfqq));
 665        cfq_clear_cfqq_on_rr(cfqq);
 666
 667        if (!RB_EMPTY_NODE(&cfqq->rb_node))
 668                cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
 669        if (cfqq->p_root) {
 670                rb_erase(&cfqq->p_node, cfqq->p_root);
 671                cfqq->p_root = NULL;
 672        }
 673
 674        BUG_ON(!cfqd->busy_queues);
 675        cfqd->busy_queues--;
 676        if (cfq_class_rt(cfqq))
 677                cfqd->busy_rt_queues--;
 678}
 679
 680/*
 681 * rb tree support functions
 682 */
 683static void cfq_del_rq_rb(struct request *rq)
 684{
 685        struct cfq_queue *cfqq = RQ_CFQQ(rq);
 686        struct cfq_data *cfqd = cfqq->cfqd;
 687        const int sync = rq_is_sync(rq);
 688
 689        BUG_ON(!cfqq->queued[sync]);
 690        cfqq->queued[sync]--;
 691
 692        elv_rb_del(&cfqq->sort_list, rq);
 693
 694        if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
 695                cfq_del_cfqq_rr(cfqd, cfqq);
 696}
 697
 698static void cfq_add_rq_rb(struct request *rq)
 699{
 700        struct cfq_queue *cfqq = RQ_CFQQ(rq);
 701        struct cfq_data *cfqd = cfqq->cfqd;
 702        struct request *__alias, *prev;
 703
 704        cfqq->queued[rq_is_sync(rq)]++;
 705
 706        /*
 707         * looks a little odd, but the first insert might return an alias.
 708         * if that happens, put the alias on the dispatch list
 709         */
 710        while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
 711                cfq_dispatch_insert(cfqd->queue, __alias);
 712
 713        if (!cfq_cfqq_on_rr(cfqq))
 714                cfq_add_cfqq_rr(cfqd, cfqq);
 715
 716        /*
 717         * check if this request is a better next-serve candidate
 718         */
 719        prev = cfqq->next_rq;
 720        cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
 721
 722        /*
 723         * adjust priority tree position, if ->next_rq changes
 724         */
 725        if (prev != cfqq->next_rq)
 726                cfq_prio_tree_add(cfqd, cfqq);
 727
 728        BUG_ON(!cfqq->next_rq);
 729}
 730
 731static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
 732{
 733        elv_rb_del(&cfqq->sort_list, rq);
 734        cfqq->queued[rq_is_sync(rq)]--;
 735        cfq_add_rq_rb(rq);
 736}
 737
 738static struct request *
 739cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
 740{
 741        struct task_struct *tsk = current;
 742        struct cfq_io_context *cic;
 743        struct cfq_queue *cfqq;
 744
 745        cic = cfq_cic_lookup(cfqd, tsk->io_context);
 746        if (!cic)
 747                return NULL;
 748
 749        cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
 750        if (cfqq) {
 751                sector_t sector = bio->bi_sector + bio_sectors(bio);
 752
 753                return elv_rb_find(&cfqq->sort_list, sector);
 754        }
 755
 756        return NULL;
 757}
 758
 759static void cfq_activate_request(struct request_queue *q, struct request *rq)
 760{
 761        struct cfq_data *cfqd = q->elevator->elevator_data;
 762
 763        cfqd->rq_in_driver++;
 764        cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
 765                                                cfqd->rq_in_driver);
 766
 767        cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
 768}
 769
 770static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
 771{
 772        struct cfq_data *cfqd = q->elevator->elevator_data;
 773
 774        WARN_ON(!cfqd->rq_in_driver);
 775        cfqd->rq_in_driver--;
 776        cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
 777                                                cfqd->rq_in_driver);
 778}
 779
 780static void cfq_remove_request(struct request *rq)
 781{
 782        struct cfq_queue *cfqq = RQ_CFQQ(rq);
 783
 784        if (cfqq->next_rq == rq)
 785                cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
 786
 787        list_del_init(&rq->queuelist);
 788        cfq_del_rq_rb(rq);
 789
 790        cfqq->cfqd->rq_queued--;
 791        if (rq_is_meta(rq)) {
 792                WARN_ON(!cfqq->meta_pending);
 793                cfqq->meta_pending--;
 794        }
 795}
 796
 797static int cfq_merge(struct request_queue *q, struct request **req,
 798                     struct bio *bio)
 799{
 800        struct cfq_data *cfqd = q->elevator->elevator_data;
 801        struct request *__rq;
 802
 803        __rq = cfq_find_rq_fmerge(cfqd, bio);
 804        if (__rq && elv_rq_merge_ok(__rq, bio)) {
 805                *req = __rq;
 806                return ELEVATOR_FRONT_MERGE;
 807        }
 808
 809        return ELEVATOR_NO_MERGE;
 810}
 811
 812static void cfq_merged_request(struct request_queue *q, struct request *req,
 813                               int type)
 814{
 815        if (type == ELEVATOR_FRONT_MERGE) {
 816                struct cfq_queue *cfqq = RQ_CFQQ(req);
 817
 818                cfq_reposition_rq_rb(cfqq, req);
 819        }
 820}
 821
 822static void
 823cfq_merged_requests(struct request_queue *q, struct request *rq,
 824                    struct request *next)
 825{
 826        /*
 827         * reposition in fifo if next is older than rq
 828         */
 829        if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
 830            time_before(next->start_time, rq->start_time))
 831                list_move(&rq->queuelist, &next->queuelist);
 832
 833        cfq_remove_request(next);
 834}
 835
 836static int cfq_allow_merge(struct request_queue *q, struct request *rq,
 837                           struct bio *bio)
 838{
 839        struct cfq_data *cfqd = q->elevator->elevator_data;
 840        struct cfq_io_context *cic;
 841        struct cfq_queue *cfqq;
 842
 843        /*
 844         * Disallow merge of a sync bio into an async request.
 845         */
 846        if (cfq_bio_sync(bio) && !rq_is_sync(rq))
 847                return 0;
 848
 849        /*
 850         * Lookup the cfqq that this bio will be queued with. Allow
 851         * merge only if rq is queued there.
 852         */
 853        cic = cfq_cic_lookup(cfqd, current->io_context);
 854        if (!cic)
 855                return 0;
 856
 857        cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
 858        if (cfqq == RQ_CFQQ(rq))
 859                return 1;
 860
 861        return 0;
 862}
 863
 864static void __cfq_set_active_queue(struct cfq_data *cfqd,
 865                                   struct cfq_queue *cfqq)
 866{
 867        if (cfqq) {
 868                cfq_log_cfqq(cfqd, cfqq, "set_active");
 869                cfqq->slice_end = 0;
 870                cfqq->slice_dispatch = 0;
 871
 872                cfq_clear_cfqq_wait_request(cfqq);
 873                cfq_clear_cfqq_must_dispatch(cfqq);
 874                cfq_clear_cfqq_must_alloc_slice(cfqq);
 875                cfq_clear_cfqq_fifo_expire(cfqq);
 876                cfq_mark_cfqq_slice_new(cfqq);
 877
 878                del_timer(&cfqd->idle_slice_timer);
 879        }
 880
 881        cfqd->active_queue = cfqq;
 882}
 883
 884/*
 885 * current cfqq expired its slice (or was too idle), select new one
 886 */
 887static void
 888__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 889                    int timed_out)
 890{
 891        cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
 892
 893        if (cfq_cfqq_wait_request(cfqq))
 894                del_timer(&cfqd->idle_slice_timer);
 895
 896        cfq_clear_cfqq_wait_request(cfqq);
 897
 898        /*
 899         * store what was left of this slice, if the queue idled/timed out
 900         */
 901        if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
 902                cfqq->slice_resid = cfqq->slice_end - jiffies;
 903                cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
 904        }
 905
 906        cfq_resort_rr_list(cfqd, cfqq);
 907
 908        if (cfqq == cfqd->active_queue)
 909                cfqd->active_queue = NULL;
 910
 911        if (cfqd->active_cic) {
 912                put_io_context(cfqd->active_cic->ioc);
 913                cfqd->active_cic = NULL;
 914        }
 915}
 916
 917static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out)
 918{
 919        struct cfq_queue *cfqq = cfqd->active_queue;
 920
 921        if (cfqq)
 922                __cfq_slice_expired(cfqd, cfqq, timed_out);
 923}
 924
 925/*
 926 * Get next queue for service. Unless we have a queue preemption,
 927 * we'll simply select the first cfqq in the service tree.
 928 */
 929static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 930{
 931        if (RB_EMPTY_ROOT(&cfqd->service_tree.rb))
 932                return NULL;
 933
 934        return cfq_rb_first(&cfqd->service_tree);
 935}
 936
 937/*
 938 * Get and set a new active queue for service.
 939 */
 940static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
 941                                              struct cfq_queue *cfqq)
 942{
 943        if (!cfqq) {
 944                cfqq = cfq_get_next_queue(cfqd);
 945                if (cfqq)
 946                        cfq_clear_cfqq_coop(cfqq);
 947        }
 948
 949        __cfq_set_active_queue(cfqd, cfqq);
 950        return cfqq;
 951}
 952
 953static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
 954                                          struct request *rq)
 955{
 956        if (blk_rq_pos(rq) >= cfqd->last_position)
 957                return blk_rq_pos(rq) - cfqd->last_position;
 958        else
 959                return cfqd->last_position - blk_rq_pos(rq);
 960}
 961
 962#define CIC_SEEK_THR    8 * 1024
 963#define CIC_SEEKY(cic)  ((cic)->seek_mean > CIC_SEEK_THR)
 964
 965static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
 966{
 967        struct cfq_io_context *cic = cfqd->active_cic;
 968        sector_t sdist = cic->seek_mean;
 969
 970        if (!sample_valid(cic->seek_samples))
 971                sdist = CIC_SEEK_THR;
 972
 973        return cfq_dist_from_last(cfqd, rq) <= sdist;
 974}
 975
 976static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
 977                                    struct cfq_queue *cur_cfqq)
 978{
 979        struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
 980        struct rb_node *parent, *node;
 981        struct cfq_queue *__cfqq;
 982        sector_t sector = cfqd->last_position;
 983
 984        if (RB_EMPTY_ROOT(root))
 985                return NULL;
 986
 987        /*
 988         * First, if we find a request starting at the end of the last
 989         * request, choose it.
 990         */
 991        __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
 992        if (__cfqq)
 993                return __cfqq;
 994
 995        /*
 996         * If the exact sector wasn't found, the parent of the NULL leaf
 997         * will contain the closest sector.
 998         */
 999        __cfqq = rb_entry(parent, struct cfq_queue, p_node);
1000        if (cfq_rq_close(cfqd, __cfqq->next_rq))
1001                return __cfqq;
1002
1003        if (blk_rq_pos(__cfqq->next_rq) < sector)
1004                node = rb_next(&__cfqq->p_node);
1005        else
1006                node = rb_prev(&__cfqq->p_node);
1007        if (!node)
1008                return NULL;
1009
1010        __cfqq = rb_entry(node, struct cfq_queue, p_node);
1011        if (cfq_rq_close(cfqd, __cfqq->next_rq))
1012                return __cfqq;
1013
1014        return NULL;
1015}
1016
1017/*
1018 * cfqd - obvious
1019 * cur_cfqq - passed in so that we don't decide that the current queue is
1020 *            closely cooperating with itself.
1021 *
1022 * So, basically we're assuming that that cur_cfqq has dispatched at least
1023 * one request, and that cfqd->last_position reflects a position on the disk
1024 * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
1025 * assumption.
1026 */
1027static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1028                                              struct cfq_queue *cur_cfqq,
1029                                              int probe)
1030{
1031        struct cfq_queue *cfqq;
1032
1033        /*
1034         * A valid cfq_io_context is necessary to compare requests against
1035         * the seek_mean of the current cfqq.
1036         */
1037        if (!cfqd->active_cic)
1038                return NULL;
1039
1040        /*
1041         * We should notice if some of the queues are cooperating, eg
1042         * working closely on the same area of the disk. In that case,
1043         * we can group them together and don't waste time idling.
1044         */
1045        cfqq = cfqq_close(cfqd, cur_cfqq);
1046        if (!cfqq)
1047                return NULL;
1048
1049        if (cfq_cfqq_coop(cfqq))
1050                return NULL;
1051
1052        if (!probe)
1053                cfq_mark_cfqq_coop(cfqq);
1054        return cfqq;
1055}
1056
1057static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1058{
1059        struct cfq_queue *cfqq = cfqd->active_queue;
1060        struct cfq_io_context *cic;
1061        unsigned long sl;
1062
1063        /*
1064         * SSD device without seek penalty, disable idling. But only do so
1065         * for devices that support queuing, otherwise we still have a problem
1066         * with sync vs async workloads.
1067         */
1068        if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
1069                return;
1070
1071        WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
1072        WARN_ON(cfq_cfqq_slice_new(cfqq));
1073
1074        /*
1075         * idle is disabled, either manually or by past process history
1076         */
1077        if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq))
1078                return;
1079
1080        /*
1081         * still requests with the driver, don't idle
1082         */
1083        if (cfqd->rq_in_driver)
1084                return;
1085
1086        /*
1087         * task has exited, don't wait
1088         */
1089        cic = cfqd->active_cic;
1090        if (!cic || !atomic_read(&cic->ioc->nr_tasks))
1091                return;
1092
1093        cfq_mark_cfqq_wait_request(cfqq);
1094
1095        /*
1096         * we don't want to idle for seeks, but we do want to allow
1097         * fair distribution of slice time for a process doing back-to-back
1098         * seeks. so allow a little bit of time for him to submit a new rq
1099         */
1100        sl = cfqd->cfq_slice_idle;
1101        if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
1102                sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
1103
1104        mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
1105        cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
1106}
1107
1108/*
1109 * Move request from internal lists to the request queue dispatch list.
1110 */
1111static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
1112{
1113        struct cfq_data *cfqd = q->elevator->elevator_data;
1114        struct cfq_queue *cfqq = RQ_CFQQ(rq);
1115
1116        cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
1117
1118        cfq_remove_request(rq);
1119        cfqq->dispatched++;
1120        elv_dispatch_sort(q, rq);
1121
1122        if (cfq_cfqq_sync(cfqq))
1123                cfqd->sync_flight++;
1124}
1125
1126/*
1127 * return expired entry, or NULL to just start from scratch in rbtree
1128 */
1129static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
1130{
1131        struct cfq_data *cfqd = cfqq->cfqd;
1132        struct request *rq;
1133        int fifo;
1134
1135        if (cfq_cfqq_fifo_expire(cfqq))
1136                return NULL;
1137
1138        cfq_mark_cfqq_fifo_expire(cfqq);
1139
1140        if (list_empty(&cfqq->fifo))
1141                return NULL;
1142
1143        fifo = cfq_cfqq_sync(cfqq);
1144        rq = rq_entry_fifo(cfqq->fifo.next);
1145
1146        if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo]))
1147                rq = NULL;
1148
1149        cfq_log_cfqq(cfqd, cfqq, "fifo=%p", rq);
1150        return rq;
1151}
1152
1153static inline int
1154cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1155{
1156        const int base_rq = cfqd->cfq_slice_async_rq;
1157
1158        WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
1159
1160        return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
1161}
1162
1163/*
1164 * Select a queue for service. If we have a current active queue,
1165 * check whether to continue servicing it, or retrieve and set a new one.
1166 */
1167static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1168{
1169        struct cfq_queue *cfqq, *new_cfqq = NULL;
1170
1171        cfqq = cfqd->active_queue;
1172        if (!cfqq)
1173                goto new_queue;
1174
1175        /*
1176         * The active queue has run out of time, expire it and select new.
1177         */
1178        if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
1179                goto expire;
1180
1181        /*
1182         * If we have a RT cfqq waiting, then we pre-empt the current non-rt
1183         * cfqq.
1184         */
1185        if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) {
1186                /*
1187                 * We simulate this as cfqq timed out so that it gets to bank
1188                 * the remaining of its time slice.
1189                 */
1190                cfq_log_cfqq(cfqd, cfqq, "preempt");
1191                cfq_slice_expired(cfqd, 1);
1192                goto new_queue;
1193        }
1194
1195        /*
1196         * The active queue has requests and isn't expired, allow it to
1197         * dispatch.
1198         */
1199        if (!RB_EMPTY_ROOT(&cfqq->sort_list))
1200                goto keep_queue;
1201
1202        /*
1203         * If another queue has a request waiting within our mean seek
1204         * distance, let it run.  The expire code will check for close
1205         * cooperators and put the close queue at the front of the service
1206         * tree.
1207         */
1208        new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0);
1209        if (new_cfqq)
1210                goto expire;
1211
1212        /*
1213         * No requests pending. If the active queue still has requests in
1214         * flight or is idling for a new request, allow either of these
1215         * conditions to happen (or time out) before selecting a new queue.
1216         */
1217        if (timer_pending(&cfqd->idle_slice_timer) ||
1218            (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) {
1219                cfqq = NULL;
1220                goto keep_queue;
1221        }
1222
1223expire:
1224        cfq_slice_expired(cfqd, 0);
1225new_queue:
1226        cfqq = cfq_set_active_queue(cfqd, new_cfqq);
1227keep_queue:
1228        return cfqq;
1229}
1230
1231static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
1232{
1233        int dispatched = 0;
1234
1235        while (cfqq->next_rq) {
1236                cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
1237                dispatched++;
1238        }
1239
1240        BUG_ON(!list_empty(&cfqq->fifo));
1241        return dispatched;
1242}
1243
1244/*
1245 * Drain our current requests. Used for barriers and when switching
1246 * io schedulers on-the-fly.
1247 */
1248static int cfq_forced_dispatch(struct cfq_data *cfqd)
1249{
1250        struct cfq_queue *cfqq;
1251        int dispatched = 0;
1252
1253        while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL)
1254                dispatched += __cfq_forced_dispatch_cfqq(cfqq);
1255
1256        cfq_slice_expired(cfqd, 0);
1257
1258        BUG_ON(cfqd->busy_queues);
1259
1260        cfq_log(cfqd, "forced_dispatch=%d", dispatched);
1261        return dispatched;
1262}
1263
1264/*
1265 * Dispatch a request from cfqq, moving them to the request queue
1266 * dispatch list.
1267 */
1268static void cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1269{
1270        struct request *rq;
1271
1272        BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
1273
1274        /*
1275         * follow expired path, else get first next available
1276         */
1277        rq = cfq_check_fifo(cfqq);
1278        if (!rq)
1279                rq = cfqq->next_rq;
1280
1281        /*
1282         * insert request into driver dispatch list
1283         */
1284        cfq_dispatch_insert(cfqd->queue, rq);
1285
1286        if (!cfqd->active_cic) {
1287                struct cfq_io_context *cic = RQ_CIC(rq);
1288
1289                atomic_long_inc(&cic->ioc->refcount);
1290                cfqd->active_cic = cic;
1291        }
1292}
1293
1294/*
1295 * Find the cfqq that we need to service and move a request from that to the
1296 * dispatch list
1297 */
1298static int cfq_dispatch_requests(struct request_queue *q, int force)
1299{
1300        struct cfq_data *cfqd = q->elevator->elevator_data;
1301        struct cfq_queue *cfqq;
1302        unsigned int max_dispatch;
1303
1304        if (!cfqd->busy_queues)
1305                return 0;
1306
1307        if (unlikely(force))
1308                return cfq_forced_dispatch(cfqd);
1309
1310        cfqq = cfq_select_queue(cfqd);
1311        if (!cfqq)
1312                return 0;
1313
1314        /*
1315         * If this is an async queue and we have sync IO in flight, let it wait
1316         */
1317        if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
1318                return 0;
1319
1320        max_dispatch = cfqd->cfq_quantum;
1321        if (cfq_class_idle(cfqq))
1322                max_dispatch = 1;
1323
1324        /*
1325         * Does this cfqq already have too much IO in flight?
1326         */
1327        if (cfqq->dispatched >= max_dispatch) {
1328                /*
1329                 * idle queue must always only have a single IO in flight
1330                 */
1331                if (cfq_class_idle(cfqq))
1332                        return 0;
1333
1334                /*
1335                 * We have other queues, don't allow more IO from this one
1336                 */
1337                if (cfqd->busy_queues > 1)
1338                        return 0;
1339
1340                /*
1341                 * we are the only queue, allow up to 4 times of 'quantum'
1342                 */
1343                if (cfqq->dispatched >= 4 * max_dispatch)
1344                        return 0;
1345        }
1346
1347        /*
1348         * Dispatch a request from this cfqq
1349         */
1350        cfq_dispatch_request(cfqd, cfqq);
1351        cfqq->slice_dispatch++;
1352        cfq_clear_cfqq_must_dispatch(cfqq);
1353
1354        /*
1355         * expire an async queue immediately if it has used up its slice. idle
1356         * queue always expire after 1 dispatch round.
1357         */
1358        if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
1359            cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
1360            cfq_class_idle(cfqq))) {
1361                cfqq->slice_end = jiffies + 1;
1362                cfq_slice_expired(cfqd, 0);
1363        }
1364
1365        cfq_log(cfqd, "dispatched a request");
1366        return 1;
1367}
1368
1369/*
1370 * task holds one reference to the queue, dropped when task exits. each rq
1371 * in-flight on this queue also holds a reference, dropped when rq is freed.
1372 *
1373 * queue lock must be held here.
1374 */
1375static void cfq_put_queue(struct cfq_queue *cfqq)
1376{
1377        struct cfq_data *cfqd = cfqq->cfqd;
1378
1379        BUG_ON(atomic_read(&cfqq->ref) <= 0);
1380
1381        if (!atomic_dec_and_test(&cfqq->ref))
1382                return;
1383
1384        cfq_log_cfqq(cfqd, cfqq, "put_queue");
1385        BUG_ON(rb_first(&cfqq->sort_list));
1386        BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
1387        BUG_ON(cfq_cfqq_on_rr(cfqq));
1388
1389        if (unlikely(cfqd->active_queue == cfqq)) {
1390                __cfq_slice_expired(cfqd, cfqq, 0);
1391                cfq_schedule_dispatch(cfqd);
1392        }
1393
1394        kmem_cache_free(cfq_pool, cfqq);
1395}
1396
1397/*
1398 * Must always be called with the rcu_read_lock() held
1399 */
1400static void
1401__call_for_each_cic(struct io_context *ioc,
1402                    void (*func)(struct io_context *, struct cfq_io_context *))
1403{
1404        struct cfq_io_context *cic;
1405        struct hlist_node *n;
1406
1407        hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
1408                func(ioc, cic);
1409}
1410
1411/*
1412 * Call func for each cic attached to this ioc.
1413 */
1414static void
1415call_for_each_cic(struct io_context *ioc,
1416                  void (*func)(struct io_context *, struct cfq_io_context *))
1417{
1418        rcu_read_lock();
1419        __call_for_each_cic(ioc, func);
1420        rcu_read_unlock();
1421}
1422
1423static void cfq_cic_free_rcu(struct rcu_head *head)
1424{
1425        struct cfq_io_context *cic;
1426
1427        cic = container_of(head, struct cfq_io_context, rcu_head);
1428
1429        kmem_cache_free(cfq_ioc_pool, cic);
1430        elv_ioc_count_dec(ioc_count);
1431
1432        if (ioc_gone) {
1433                /*
1434                 * CFQ scheduler is exiting, grab exit lock and check
1435                 * the pending io context count. If it hits zero,
1436                 * complete ioc_gone and set it back to NULL
1437                 */
1438                spin_lock(&ioc_gone_lock);
1439                if (ioc_gone && !elv_ioc_count_read(ioc_count)) {
1440                        complete(ioc_gone);
1441                        ioc_gone = NULL;
1442                }
1443                spin_unlock(&ioc_gone_lock);
1444        }
1445}
1446
1447static void cfq_cic_free(struct cfq_io_context *cic)
1448{
1449        call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
1450}
1451
1452static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
1453{
1454        unsigned long flags;
1455
1456        BUG_ON(!cic->dead_key);
1457
1458        spin_lock_irqsave(&ioc->lock, flags);
1459        radix_tree_delete(&ioc->radix_root, cic->dead_key);
1460        hlist_del_rcu(&cic->cic_list);
1461        spin_unlock_irqrestore(&ioc->lock, flags);
1462
1463        cfq_cic_free(cic);
1464}
1465
1466/*
1467 * Must be called with rcu_read_lock() held or preemption otherwise disabled.
1468 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
1469 * and ->trim() which is called with the task lock held
1470 */
1471static void cfq_free_io_context(struct io_context *ioc)
1472{
1473        /*
1474         * ioc->refcount is zero here, or we are called from elv_unregister(),
1475         * so no more cic's are allowed to be linked into this ioc.  So it
1476         * should be ok to iterate over the known list, we will see all cic's
1477         * since no new ones are added.
1478         */
1479        __call_for_each_cic(ioc, cic_free_func);
1480}
1481
1482static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1483{
1484        if (unlikely(cfqq == cfqd->active_queue)) {
1485                __cfq_slice_expired(cfqd, cfqq, 0);
1486                cfq_schedule_dispatch(cfqd);
1487        }
1488
1489        cfq_put_queue(cfqq);
1490}
1491
1492static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
1493                                         struct cfq_io_context *cic)
1494{
1495        struct io_context *ioc = cic->ioc;
1496
1497        list_del_init(&cic->queue_list);
1498
1499        /*
1500         * Make sure key == NULL is seen for dead queues
1501         */
1502        smp_wmb();
1503        cic->dead_key = (unsigned long) cic->key;
1504        cic->key = NULL;
1505
1506        if (ioc->ioc_data == cic)
1507                rcu_assign_pointer(ioc->ioc_data, NULL);
1508
1509        if (cic->cfqq[BLK_RW_ASYNC]) {
1510                cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
1511                cic->cfqq[BLK_RW_ASYNC] = NULL;
1512        }
1513
1514        if (cic->cfqq[BLK_RW_SYNC]) {
1515                cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
1516                cic->cfqq[BLK_RW_SYNC] = NULL;
1517        }
1518}
1519
1520static void cfq_exit_single_io_context(struct io_context *ioc,
1521                                       struct cfq_io_context *cic)
1522{
1523        struct cfq_data *cfqd = cic->key;
1524
1525        if (cfqd) {
1526                struct request_queue *q = cfqd->queue;
1527                unsigned long flags;
1528
1529                spin_lock_irqsave(q->queue_lock, flags);
1530
1531                /*
1532                 * Ensure we get a fresh copy of the ->key to prevent
1533                 * race between exiting task and queue
1534                 */
1535                smp_read_barrier_depends();
1536                if (cic->key)
1537                        __cfq_exit_single_io_context(cfqd, cic);
1538
1539                spin_unlock_irqrestore(q->queue_lock, flags);
1540        }
1541}
1542
1543/*
1544 * The process that ioc belongs to has exited, we need to clean up
1545 * and put the internal structures we have that belongs to that process.
1546 */
1547static void cfq_exit_io_context(struct io_context *ioc)
1548{
1549        call_for_each_cic(ioc, cfq_exit_single_io_context);
1550}
1551
1552static struct cfq_io_context *
1553cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1554{
1555        struct cfq_io_context *cic;
1556
1557        cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
1558                                                        cfqd->queue->node);
1559        if (cic) {
1560                cic->last_end_request = jiffies;
1561                INIT_LIST_HEAD(&cic->queue_list);
1562                INIT_HLIST_NODE(&cic->cic_list);
1563                cic->dtor = cfq_free_io_context;
1564                cic->exit = cfq_exit_io_context;
1565                elv_ioc_count_inc(ioc_count);
1566        }
1567
1568        return cic;
1569}
1570
1571static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
1572{
1573        struct task_struct *tsk = current;
1574        int ioprio_class;
1575
1576        if (!cfq_cfqq_prio_changed(cfqq))
1577                return;
1578
1579        ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
1580        switch (ioprio_class) {
1581        default:
1582                printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
1583        case IOPRIO_CLASS_NONE:
1584                /*
1585                 * no prio set, inherit CPU scheduling settings
1586                 */
1587                cfqq->ioprio = task_nice_ioprio(tsk);
1588                cfqq->ioprio_class = task_nice_ioclass(tsk);
1589                break;
1590        case IOPRIO_CLASS_RT:
1591                cfqq->ioprio = task_ioprio(ioc);
1592                cfqq->ioprio_class = IOPRIO_CLASS_RT;
1593                break;
1594        case IOPRIO_CLASS_BE:
1595                cfqq->ioprio = task_ioprio(ioc);
1596                cfqq->ioprio_class = IOPRIO_CLASS_BE;
1597                break;
1598        case IOPRIO_CLASS_IDLE:
1599                cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
1600                cfqq->ioprio = 7;
1601                cfq_clear_cfqq_idle_window(cfqq);
1602                break;
1603        }
1604
1605        /*
1606         * keep track of original prio settings in case we have to temporarily
1607         * elevate the priority of this queue
1608         */
1609        cfqq->org_ioprio = cfqq->ioprio;
1610        cfqq->org_ioprio_class = cfqq->ioprio_class;
1611        cfq_clear_cfqq_prio_changed(cfqq);
1612}
1613
1614static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
1615{
1616        struct cfq_data *cfqd = cic->key;
1617        struct cfq_queue *cfqq;
1618        unsigned long flags;
1619
1620        if (unlikely(!cfqd))
1621                return;
1622
1623        spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1624
1625        cfqq = cic->cfqq[BLK_RW_ASYNC];
1626        if (cfqq) {
1627                struct cfq_queue *new_cfqq;
1628                new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
1629                                                GFP_ATOMIC);
1630                if (new_cfqq) {
1631                        cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
1632                        cfq_put_queue(cfqq);
1633                }
1634        }
1635
1636        cfqq = cic->cfqq[BLK_RW_SYNC];
1637        if (cfqq)
1638                cfq_mark_cfqq_prio_changed(cfqq);
1639
1640        spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1641}
1642
1643static void cfq_ioc_set_ioprio(struct io_context *ioc)
1644{
1645        call_for_each_cic(ioc, changed_ioprio);
1646        ioc->ioprio_changed = 0;
1647}
1648
1649static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1650                          pid_t pid, int is_sync)
1651{
1652        RB_CLEAR_NODE(&cfqq->rb_node);
1653        RB_CLEAR_NODE(&cfqq->p_node);
1654        INIT_LIST_HEAD(&cfqq->fifo);
1655
1656        atomic_set(&cfqq->ref, 0);
1657        cfqq->cfqd = cfqd;
1658
1659        cfq_mark_cfqq_prio_changed(cfqq);
1660
1661        if (is_sync) {
1662                if (!cfq_class_idle(cfqq))
1663                        cfq_mark_cfqq_idle_window(cfqq);
1664                cfq_mark_cfqq_sync(cfqq);
1665        }
1666        cfqq->pid = pid;
1667}
1668
1669static struct cfq_queue *
1670cfq_find_alloc_queue(struct cfq_data *cfqd, int is_sync,
1671                     struct io_context *ioc, gfp_t gfp_mask)
1672{
1673        struct cfq_queue *cfqq, *new_cfqq = NULL;
1674        struct cfq_io_context *cic;
1675
1676retry:
1677        cic = cfq_cic_lookup(cfqd, ioc);
1678        /* cic always exists here */
1679        cfqq = cic_to_cfqq(cic, is_sync);
1680
1681        /*
1682         * Always try a new alloc if we fell back to the OOM cfqq
1683         * originally, since it should just be a temporary situation.
1684         */
1685        if (!cfqq || cfqq == &cfqd->oom_cfqq) {
1686                cfqq = NULL;
1687                if (new_cfqq) {
1688                        cfqq = new_cfqq;
1689                        new_cfqq = NULL;
1690                } else if (gfp_mask & __GFP_WAIT) {
1691                        spin_unlock_irq(cfqd->queue->queue_lock);
1692                        new_cfqq = kmem_cache_alloc_node(cfq_pool,
1693                                        gfp_mask | __GFP_ZERO,
1694                                        cfqd->queue->node);
1695                        spin_lock_irq(cfqd->queue->queue_lock);
1696                        if (new_cfqq)
1697                                goto retry;
1698                } else {
1699                        cfqq = kmem_cache_alloc_node(cfq_pool,
1700                                        gfp_mask | __GFP_ZERO,
1701                                        cfqd->queue->node);
1702                }
1703
1704                if (cfqq) {
1705                        cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
1706                        cfq_init_prio_data(cfqq, ioc);
1707                        cfq_log_cfqq(cfqd, cfqq, "alloced");
1708                } else
1709                        cfqq = &cfqd->oom_cfqq;
1710        }
1711
1712        if (new_cfqq)
1713                kmem_cache_free(cfq_pool, new_cfqq);
1714
1715        return cfqq;
1716}
1717
1718static struct cfq_queue **
1719cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
1720{
1721        switch (ioprio_class) {
1722        case IOPRIO_CLASS_RT:
1723                return &cfqd->async_cfqq[0][ioprio];
1724        case IOPRIO_CLASS_BE:
1725                return &cfqd->async_cfqq[1][ioprio];
1726        case IOPRIO_CLASS_IDLE:
1727                return &cfqd->async_idle_cfqq;
1728        default:
1729                BUG();
1730        }
1731}
1732
1733static struct cfq_queue *
1734cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct io_context *ioc,
1735              gfp_t gfp_mask)
1736{
1737        const int ioprio = task_ioprio(ioc);
1738        const int ioprio_class = task_ioprio_class(ioc);
1739        struct cfq_queue **async_cfqq = NULL;
1740        struct cfq_queue *cfqq = NULL;
1741
1742        if (!is_sync) {
1743                async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
1744                cfqq = *async_cfqq;
1745        }
1746
1747        if (!cfqq)
1748                cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
1749
1750        /*
1751         * pin the queue now that it's allocated, scheduler exit will prune it
1752         */
1753        if (!is_sync && !(*async_cfqq)) {
1754                atomic_inc(&cfqq->ref);
1755                *async_cfqq = cfqq;
1756        }
1757
1758        atomic_inc(&cfqq->ref);
1759        return cfqq;
1760}
1761
1762/*
1763 * We drop cfq io contexts lazily, so we may find a dead one.
1764 */
1765static void
1766cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
1767                  struct cfq_io_context *cic)
1768{
1769        unsigned long flags;
1770
1771        WARN_ON(!list_empty(&cic->queue_list));
1772
1773        spin_lock_irqsave(&ioc->lock, flags);
1774
1775        BUG_ON(ioc->ioc_data == cic);
1776
1777        radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd);
1778        hlist_del_rcu(&cic->cic_list);
1779        spin_unlock_irqrestore(&ioc->lock, flags);
1780
1781        cfq_cic_free(cic);
1782}
1783
1784static struct cfq_io_context *
1785cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
1786{
1787        struct cfq_io_context *cic;
1788        unsigned long flags;
1789        void *k;
1790
1791        if (unlikely(!ioc))
1792                return NULL;
1793
1794        rcu_read_lock();
1795
1796        /*
1797         * we maintain a last-hit cache, to avoid browsing over the tree
1798         */
1799        cic = rcu_dereference(ioc->ioc_data);
1800        if (cic && cic->key == cfqd) {
1801                rcu_read_unlock();
1802                return cic;
1803        }
1804
1805        do {
1806                cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
1807                rcu_read_unlock();
1808                if (!cic)
1809                        break;
1810                /* ->key must be copied to avoid race with cfq_exit_queue() */
1811                k = cic->key;
1812                if (unlikely(!k)) {
1813                        cfq_drop_dead_cic(cfqd, ioc, cic);
1814                        rcu_read_lock();
1815                        continue;
1816                }
1817
1818                spin_lock_irqsave(&ioc->lock, flags);
1819                rcu_assign_pointer(ioc->ioc_data, cic);
1820                spin_unlock_irqrestore(&ioc->lock, flags);
1821                break;
1822        } while (1);
1823
1824        return cic;
1825}
1826
1827/*
1828 * Add cic into ioc, using cfqd as the search key. This enables us to lookup
1829 * the process specific cfq io context when entered from the block layer.
1830 * Also adds the cic to a per-cfqd list, used when this queue is removed.
1831 */
1832static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
1833                        struct cfq_io_context *cic, gfp_t gfp_mask)
1834{
1835        unsigned long flags;
1836        int ret;
1837
1838        ret = radix_tree_preload(gfp_mask);
1839        if (!ret) {
1840                cic->ioc = ioc;
1841                cic->key = cfqd;
1842
1843                spin_lock_irqsave(&ioc->lock, flags);
1844                ret = radix_tree_insert(&ioc->radix_root,
1845                                                (unsigned long) cfqd, cic);
1846                if (!ret)
1847                        hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
1848                spin_unlock_irqrestore(&ioc->lock, flags);
1849
1850                radix_tree_preload_end();
1851
1852                if (!ret) {
1853                        spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1854                        list_add(&cic->queue_list, &cfqd->cic_list);
1855                        spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1856                }
1857        }
1858
1859        if (ret)
1860                printk(KERN_ERR "cfq: cic link failed!\n");
1861
1862        return ret;
1863}
1864
1865/*
1866 * Setup general io context and cfq io context. There can be several cfq
1867 * io contexts per general io context, if this process is doing io to more
1868 * than one device managed by cfq.
1869 */
1870static struct cfq_io_context *
1871cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1872{
1873        struct io_context *ioc = NULL;
1874        struct cfq_io_context *cic;
1875
1876        might_sleep_if(gfp_mask & __GFP_WAIT);
1877
1878        ioc = get_io_context(gfp_mask, cfqd->queue->node);
1879        if (!ioc)
1880                return NULL;
1881
1882        cic = cfq_cic_lookup(cfqd, ioc);
1883        if (cic)
1884                goto out;
1885
1886        cic = cfq_alloc_io_context(cfqd, gfp_mask);
1887        if (cic == NULL)
1888                goto err;
1889
1890        if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
1891                goto err_free;
1892
1893out:
1894        smp_read_barrier_depends();
1895        if (unlikely(ioc->ioprio_changed))
1896                cfq_ioc_set_ioprio(ioc);
1897
1898        return cic;
1899err_free:
1900        cfq_cic_free(cic);
1901err:
1902        put_io_context(ioc);
1903        return NULL;
1904}
1905
1906static void
1907cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1908{
1909        unsigned long elapsed = jiffies - cic->last_end_request;
1910        unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
1911
1912        cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
1913        cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
1914        cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
1915}
1916
1917static void
1918cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
1919                       struct request *rq)
1920{
1921        sector_t sdist;
1922        u64 total;
1923
1924        if (!cic->last_request_pos)
1925                sdist = 0;
1926        else if (cic->last_request_pos < blk_rq_pos(rq))
1927                sdist = blk_rq_pos(rq) - cic->last_request_pos;
1928        else
1929                sdist = cic->last_request_pos - blk_rq_pos(rq);
1930
1931        /*
1932         * Don't allow the seek distance to get too large from the
1933         * odd fragment, pagein, etc
1934         */
1935        if (cic->seek_samples <= 60) /* second&third seek */
1936                sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
1937        else
1938                sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64);
1939
1940        cic->seek_samples = (7*cic->seek_samples + 256) / 8;
1941        cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
1942        total = cic->seek_total + (cic->seek_samples/2);
1943        do_div(total, cic->seek_samples);
1944        cic->seek_mean = (sector_t)total;
1945}
1946
1947/*
1948 * Disable idle window if the process thinks too long or seeks so much that
1949 * it doesn't matter
1950 */
1951static void
1952cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1953                       struct cfq_io_context *cic)
1954{
1955        int old_idle, enable_idle;
1956
1957        /*
1958         * Don't idle for async or idle io prio class
1959         */
1960        if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
1961                return;
1962
1963        enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
1964
1965        if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
1966            (cfqd->hw_tag && CIC_SEEKY(cic)))
1967                enable_idle = 0;
1968        else if (sample_valid(cic->ttime_samples)) {
1969                if (cic->ttime_mean > cfqd->cfq_slice_idle)
1970                        enable_idle = 0;
1971                else
1972                        enable_idle = 1;
1973        }
1974
1975        if (old_idle != enable_idle) {
1976                cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
1977                if (enable_idle)
1978                        cfq_mark_cfqq_idle_window(cfqq);
1979                else
1980                        cfq_clear_cfqq_idle_window(cfqq);
1981        }
1982}
1983
1984/*
1985 * Check if new_cfqq should preempt the currently active queue. Return 0 for
1986 * no or if we aren't sure, a 1 will cause a preempt.
1987 */
1988static int
1989cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
1990                   struct request *rq)
1991{
1992        struct cfq_queue *cfqq;
1993
1994        cfqq = cfqd->active_queue;
1995        if (!cfqq)
1996                return 0;
1997
1998        if (cfq_slice_used(cfqq))
1999                return 1;
2000
2001        if (cfq_class_idle(new_cfqq))
2002                return 0;
2003
2004        if (cfq_class_idle(cfqq))
2005                return 1;
2006
2007        /*
2008         * if the new request is sync, but the currently running queue is
2009         * not, let the sync request have priority.
2010         */
2011        if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
2012                return 1;
2013
2014        /*
2015         * So both queues are sync. Let the new request get disk time if
2016         * it's a metadata request and the current queue is doing regular IO.
2017         */
2018        if (rq_is_meta(rq) && !cfqq->meta_pending)
2019                return 1;
2020
2021        /*
2022         * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
2023         */
2024        if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
2025                return 1;
2026
2027        if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
2028                return 0;
2029
2030        /*
2031         * if this request is as-good as one we would expect from the
2032         * current cfqq, let it preempt
2033         */
2034        if (cfq_rq_close(cfqd, rq))
2035                return 1;
2036
2037        return 0;
2038}
2039
2040/*
2041 * cfqq preempts the active queue. if we allowed preempt with no slice left,
2042 * let it have half of its nominal slice.
2043 */
2044static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2045{
2046        cfq_log_cfqq(cfqd, cfqq, "preempt");
2047        cfq_slice_expired(cfqd, 1);
2048
2049        /*
2050         * Put the new queue at the front of the of the current list,
2051         * so we know that it will be selected next.
2052         */
2053        BUG_ON(!cfq_cfqq_on_rr(cfqq));
2054
2055        cfq_service_tree_add(cfqd, cfqq, 1);
2056
2057        cfqq->slice_end = 0;
2058        cfq_mark_cfqq_slice_new(cfqq);
2059}
2060
2061/*
2062 * Called when a new fs request (rq) is added (to cfqq). Check if there's
2063 * something we should do about it
2064 */
2065static void
2066cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2067                struct request *rq)
2068{
2069        struct cfq_io_context *cic = RQ_CIC(rq);
2070
2071        cfqd->rq_queued++;
2072        if (rq_is_meta(rq))
2073                cfqq->meta_pending++;
2074
2075        cfq_update_io_thinktime(cfqd, cic);
2076        cfq_update_io_seektime(cfqd, cic, rq);
2077        cfq_update_idle_window(cfqd, cfqq, cic);
2078
2079        cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
2080
2081        if (cfqq == cfqd->active_queue) {
2082                /*
2083                 * Remember that we saw a request from this process, but
2084                 * don't start queuing just yet. Otherwise we risk seeing lots
2085                 * of tiny requests, because we disrupt the normal plugging
2086                 * and merging. If the request is already larger than a single
2087                 * page, let it rip immediately. For that case we assume that
2088                 * merging is already done. Ditto for a busy system that
2089                 * has other work pending, don't risk delaying until the
2090                 * idle timer unplug to continue working.
2091                 */
2092                if (cfq_cfqq_wait_request(cfqq)) {
2093                        if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
2094                            cfqd->busy_queues > 1) {
2095                                del_timer(&cfqd->idle_slice_timer);
2096                        __blk_run_queue(cfqd->queue);
2097                        }
2098                        cfq_mark_cfqq_must_dispatch(cfqq);
2099                }
2100        } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
2101                /*
2102                 * not the active queue - expire current slice if it is
2103                 * idle and has expired it's mean thinktime or this new queue
2104                 * has some old slice time left and is of higher priority or
2105                 * this new queue is RT and the current one is BE
2106                 */
2107                cfq_preempt_queue(cfqd, cfqq);
2108                __blk_run_queue(cfqd->queue);
2109        }
2110}
2111
2112static void cfq_insert_request(struct request_queue *q, struct request *rq)
2113{
2114        struct cfq_data *cfqd = q->elevator->elevator_data;
2115        struct cfq_queue *cfqq = RQ_CFQQ(rq);
2116
2117        cfq_log_cfqq(cfqd, cfqq, "insert_request");
2118        cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
2119
2120        cfq_add_rq_rb(rq);
2121
2122        list_add_tail(&rq->queuelist, &cfqq->fifo);
2123
2124        cfq_rq_enqueued(cfqd, cfqq, rq);
2125}
2126
2127/*
2128 * Update hw_tag based on peak queue depth over 50 samples under
2129 * sufficient load.
2130 */
2131static void cfq_update_hw_tag(struct cfq_data *cfqd)
2132{
2133        if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak)
2134                cfqd->rq_in_driver_peak = cfqd->rq_in_driver;
2135
2136        if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
2137            cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
2138                return;
2139
2140        if (cfqd->hw_tag_samples++ < 50)
2141                return;
2142
2143        if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN)
2144                cfqd->hw_tag = 1;
2145        else
2146                cfqd->hw_tag = 0;
2147
2148        cfqd->hw_tag_samples = 0;
2149        cfqd->rq_in_driver_peak = 0;
2150}
2151
2152static void cfq_completed_request(struct request_queue *q, struct request *rq)
2153{
2154        struct cfq_queue *cfqq = RQ_CFQQ(rq);
2155        struct cfq_data *cfqd = cfqq->cfqd;
2156        const int sync = rq_is_sync(rq);
2157        unsigned long now;
2158
2159        now = jiffies;
2160        cfq_log_cfqq(cfqd, cfqq, "complete");
2161
2162        cfq_update_hw_tag(cfqd);
2163
2164        WARN_ON(!cfqd->rq_in_driver);
2165        WARN_ON(!cfqq->dispatched);
2166        cfqd->rq_in_driver--;
2167        cfqq->dispatched--;
2168
2169        if (cfq_cfqq_sync(cfqq))
2170                cfqd->sync_flight--;
2171
2172        if (sync)
2173                RQ_CIC(rq)->last_end_request = now;
2174
2175        /*
2176         * If this is the active queue, check if it needs to be expired,
2177         * or if we want to idle in case it has no pending requests.
2178         */
2179        if (cfqd->active_queue == cfqq) {
2180                const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
2181
2182                if (cfq_cfqq_slice_new(cfqq)) {
2183                        cfq_set_prio_slice(cfqd, cfqq);
2184                        cfq_clear_cfqq_slice_new(cfqq);
2185                }
2186                /*
2187                 * If there are no requests waiting in this queue, and
2188                 * there are other queues ready to issue requests, AND
2189                 * those other queues are issuing requests within our
2190                 * mean seek distance, give them a chance to run instead
2191                 * of idling.
2192                 */
2193                if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
2194                        cfq_slice_expired(cfqd, 1);
2195                else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) &&
2196                         sync && !rq_noidle(rq))
2197                        cfq_arm_slice_timer(cfqd);
2198        }
2199
2200        if (!cfqd->rq_in_driver)
2201                cfq_schedule_dispatch(cfqd);
2202}
2203
2204/*
2205 * we temporarily boost lower priority queues if they are holding fs exclusive
2206 * resources. they are boosted to normal prio (CLASS_BE/4)
2207 */
2208static void cfq_prio_boost(struct cfq_queue *cfqq)
2209{
2210        if (has_fs_excl()) {
2211                /*
2212                 * boost idle prio on transactions that would lock out other
2213                 * users of the filesystem
2214                 */
2215                if (cfq_class_idle(cfqq))
2216                        cfqq->ioprio_class = IOPRIO_CLASS_BE;
2217                if (cfqq->ioprio > IOPRIO_NORM)
2218                        cfqq->ioprio = IOPRIO_NORM;
2219        } else {
2220                /*
2221                 * check if we need to unboost the queue
2222                 */
2223                if (cfqq->ioprio_class != cfqq->org_ioprio_class)
2224                        cfqq->ioprio_class = cfqq->org_ioprio_class;
2225                if (cfqq->ioprio != cfqq->org_ioprio)
2226                        cfqq->ioprio = cfqq->org_ioprio;
2227        }
2228}
2229
2230static inline int __cfq_may_queue(struct cfq_queue *cfqq)
2231{
2232        if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
2233            !cfq_cfqq_must_alloc_slice(cfqq)) {
2234                cfq_mark_cfqq_must_alloc_slice(cfqq);
2235                return ELV_MQUEUE_MUST;
2236        }
2237
2238        return ELV_MQUEUE_MAY;
2239}
2240
2241static int cfq_may_queue(struct request_queue *q, int rw)
2242{
2243        struct cfq_data *cfqd = q->elevator->elevator_data;
2244        struct task_struct *tsk = current;
2245        struct cfq_io_context *cic;
2246        struct cfq_queue *cfqq;
2247
2248        /*
2249         * don't force setup of a queue from here, as a call to may_queue
2250         * does not necessarily imply that a request actually will be queued.
2251         * so just lookup a possibly existing queue, or return 'may queue'
2252         * if that fails
2253         */
2254        cic = cfq_cic_lookup(cfqd, tsk->io_context);
2255        if (!cic)
2256                return ELV_MQUEUE_MAY;
2257
2258        cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
2259        if (cfqq) {
2260                cfq_init_prio_data(cfqq, cic->ioc);
2261                cfq_prio_boost(cfqq);
2262
2263                return __cfq_may_queue(cfqq);
2264        }
2265
2266        return ELV_MQUEUE_MAY;
2267}
2268
2269/*
2270 * queue lock held here
2271 */
2272static void cfq_put_request(struct request *rq)
2273{
2274        struct cfq_queue *cfqq = RQ_CFQQ(rq);
2275
2276        if (cfqq) {
2277                const int rw = rq_data_dir(rq);
2278
2279                BUG_ON(!cfqq->allocated[rw]);
2280                cfqq->allocated[rw]--;
2281
2282                put_io_context(RQ_CIC(rq)->ioc);
2283
2284                rq->elevator_private = NULL;
2285                rq->elevator_private2 = NULL;
2286
2287                cfq_put_queue(cfqq);
2288        }
2289}
2290
2291/*
2292 * Allocate cfq data structures associated with this request.
2293 */
2294static int
2295cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
2296{
2297        struct cfq_data *cfqd = q->elevator->elevator_data;
2298        struct cfq_io_context *cic;
2299        const int rw = rq_data_dir(rq);
2300        const int is_sync = rq_is_sync(rq);
2301        struct cfq_queue *cfqq;
2302        unsigned long flags;
2303
2304        might_sleep_if(gfp_mask & __GFP_WAIT);
2305
2306        cic = cfq_get_io_context(cfqd, gfp_mask);
2307
2308        spin_lock_irqsave(q->queue_lock, flags);
2309
2310        if (!cic)
2311                goto queue_fail;
2312
2313        cfqq = cic_to_cfqq(cic, is_sync);
2314        if (!cfqq || cfqq == &cfqd->oom_cfqq) {
2315                cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
2316                cic_set_cfqq(cic, cfqq, is_sync);
2317        }
2318
2319        cfqq->allocated[rw]++;
2320        cfq_clear_cfqq_must_alloc(cfqq);
2321        atomic_inc(&cfqq->ref);
2322
2323        spin_unlock_irqrestore(q->queue_lock, flags);
2324
2325        rq->elevator_private = cic;
2326        rq->elevator_private2 = cfqq;
2327        return 0;
2328
2329queue_fail:
2330        if (cic)
2331                put_io_context(cic->ioc);
2332
2333        cfq_schedule_dispatch(cfqd);
2334        spin_unlock_irqrestore(q->queue_lock, flags);
2335        cfq_log(cfqd, "set_request fail");
2336        return 1;
2337}
2338
2339static void cfq_kick_queue(struct work_struct *work)
2340{
2341        struct cfq_data *cfqd =
2342                container_of(work, struct cfq_data, unplug_work);
2343        struct request_queue *q = cfqd->queue;
2344
2345        spin_lock_irq(q->queue_lock);
2346        __blk_run_queue(cfqd->queue);
2347        spin_unlock_irq(q->queue_lock);
2348}
2349
2350/*
2351 * Timer running if the active_queue is currently idling inside its time slice
2352 */
2353static void cfq_idle_slice_timer(unsigned long data)
2354{
2355        struct cfq_data *cfqd = (struct cfq_data *) data;
2356        struct cfq_queue *cfqq;
2357        unsigned long flags;
2358        int timed_out = 1;
2359
2360        cfq_log(cfqd, "idle timer fired");
2361
2362        spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2363
2364        cfqq = cfqd->active_queue;
2365        if (cfqq) {
2366                timed_out = 0;
2367
2368                /*
2369                 * We saw a request before the queue expired, let it through
2370                 */
2371                if (cfq_cfqq_must_dispatch(cfqq))
2372                        goto out_kick;
2373
2374                /*
2375                 * expired
2376                 */
2377                if (cfq_slice_used(cfqq))
2378                        goto expire;
2379
2380                /*
2381                 * only expire and reinvoke request handler, if there are
2382                 * other queues with pending requests
2383                 */
2384                if (!cfqd->busy_queues)
2385                        goto out_cont;
2386
2387                /*
2388                 * not expired and it has a request pending, let it dispatch
2389                 */
2390                if (!RB_EMPTY_ROOT(&cfqq->sort_list))
2391                        goto out_kick;
2392        }
2393expire:
2394        cfq_slice_expired(cfqd, timed_out);
2395out_kick:
2396        cfq_schedule_dispatch(cfqd);
2397out_cont:
2398        spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2399}
2400
2401static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
2402{
2403        del_timer_sync(&cfqd->idle_slice_timer);
2404        cancel_work_sync(&cfqd->unplug_work);
2405}
2406
2407static void cfq_put_async_queues(struct cfq_data *cfqd)
2408{
2409        int i;
2410
2411        for (i = 0; i < IOPRIO_BE_NR; i++) {
2412                if (cfqd->async_cfqq[0][i])
2413                        cfq_put_queue(cfqd->async_cfqq[0][i]);
2414                if (cfqd->async_cfqq[1][i])
2415                        cfq_put_queue(cfqd->async_cfqq[1][i]);
2416        }
2417
2418        if (cfqd->async_idle_cfqq)
2419                cfq_put_queue(cfqd->async_idle_cfqq);
2420}
2421
2422static void cfq_exit_queue(struct elevator_queue *e)
2423{
2424        struct cfq_data *cfqd = e->elevator_data;
2425        struct request_queue *q = cfqd->queue;
2426
2427        cfq_shutdown_timer_wq(cfqd);
2428
2429        spin_lock_irq(q->queue_lock);
2430
2431        if (cfqd->active_queue)
2432                __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
2433
2434        while (!list_empty(&cfqd->cic_list)) {
2435                struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
2436                                                        struct cfq_io_context,
2437                                                        queue_list);
2438
2439                __cfq_exit_single_io_context(cfqd, cic);
2440        }
2441
2442        cfq_put_async_queues(cfqd);
2443
2444        spin_unlock_irq(q->queue_lock);
2445
2446        cfq_shutdown_timer_wq(cfqd);
2447
2448        kfree(cfqd);
2449}
2450
2451static void *cfq_init_queue(struct request_queue *q)
2452{
2453        struct cfq_data *cfqd;
2454        int i;
2455
2456        cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
2457        if (!cfqd)
2458                return NULL;
2459
2460        cfqd->service_tree = CFQ_RB_ROOT;
2461
2462        /*
2463         * Not strictly needed (since RB_ROOT just clears the node and we
2464         * zeroed cfqd on alloc), but better be safe in case someone decides
2465         * to add magic to the rb code
2466         */
2467        for (i = 0; i < CFQ_PRIO_LISTS; i++)
2468                cfqd->prio_trees[i] = RB_ROOT;
2469
2470        /*
2471         * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
2472         * Grab a permanent reference to it, so that the normal code flow
2473         * will not attempt to free it.
2474         */
2475        cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
2476        atomic_inc(&cfqd->oom_cfqq.ref);
2477
2478        INIT_LIST_HEAD(&cfqd->cic_list);
2479
2480        cfqd->queue = q;
2481
2482        init_timer(&cfqd->idle_slice_timer);
2483        cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
2484        cfqd->idle_slice_timer.data = (unsigned long) cfqd;
2485
2486        INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
2487
2488        cfqd->cfq_quantum = cfq_quantum;
2489        cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
2490        cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
2491        cfqd->cfq_back_max = cfq_back_max;
2492        cfqd->cfq_back_penalty = cfq_back_penalty;
2493        cfqd->cfq_slice[0] = cfq_slice_async;
2494        cfqd->cfq_slice[1] = cfq_slice_sync;
2495        cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2496        cfqd->cfq_slice_idle = cfq_slice_idle;
2497        cfqd->hw_tag = 1;
2498
2499        return cfqd;
2500}
2501
2502static void cfq_slab_kill(void)
2503{
2504        /*
2505         * Caller already ensured that pending RCU callbacks are completed,
2506         * so we should have no busy allocations at this point.
2507         */
2508        if (cfq_pool)
2509                kmem_cache_destroy(cfq_pool);
2510        if (cfq_ioc_pool)
2511                kmem_cache_destroy(cfq_ioc_pool);
2512}
2513
2514static int __init cfq_slab_setup(void)
2515{
2516        cfq_pool = KMEM_CACHE(cfq_queue, 0);
2517        if (!cfq_pool)
2518                goto fail;
2519
2520        cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
2521        if (!cfq_ioc_pool)
2522                goto fail;
2523
2524        return 0;
2525fail:
2526        cfq_slab_kill();
2527        return -ENOMEM;
2528}
2529
2530/*
2531 * sysfs parts below -->
2532 */
2533static ssize_t
2534cfq_var_show(unsigned int var, char *page)
2535{
2536        return sprintf(page, "%d\n", var);
2537}
2538
2539static ssize_t
2540cfq_var_store(unsigned int *var, const char *page, size_t count)
2541{
2542        char *p = (char *) page;
2543
2544        *var = simple_strtoul(p, &p, 10);
2545        return count;
2546}
2547
2548#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
2549static ssize_t __FUNC(struct elevator_queue *e, char *page)             \
2550{                                                                       \
2551        struct cfq_data *cfqd = e->elevator_data;                       \
2552        unsigned int __data = __VAR;                                    \
2553        if (__CONV)                                                     \
2554                __data = jiffies_to_msecs(__data);                      \
2555        return cfq_var_show(__data, (page));                            \
2556}
2557SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
2558SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
2559SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
2560SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
2561SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
2562SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
2563SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
2564SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
2565SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
2566#undef SHOW_FUNCTION
2567
2568#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
2569static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
2570{                                                                       \
2571        struct cfq_data *cfqd = e->elevator_data;                       \
2572        unsigned int __data;                                            \
2573        int ret = cfq_var_store(&__data, (page), count);                \
2574        if (__data < (MIN))                                             \
2575                __data = (MIN);                                         \
2576        else if (__data > (MAX))                                        \
2577                __data = (MAX);                                         \
2578        if (__CONV)                                                     \
2579                *(__PTR) = msecs_to_jiffies(__data);                    \
2580        else                                                            \
2581                *(__PTR) = __data;                                      \
2582        return ret;                                                     \
2583}
2584STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
2585STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
2586                UINT_MAX, 1);
2587STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
2588                UINT_MAX, 1);
2589STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
2590STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
2591                UINT_MAX, 0);
2592STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
2593STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
2594STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
2595STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
2596                UINT_MAX, 0);
2597#undef STORE_FUNCTION
2598
2599#define CFQ_ATTR(name) \
2600        __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
2601
2602static struct elv_fs_entry cfq_attrs[] = {
2603        CFQ_ATTR(quantum),
2604        CFQ_ATTR(fifo_expire_sync),
2605        CFQ_ATTR(fifo_expire_async),
2606        CFQ_ATTR(back_seek_max),
2607        CFQ_ATTR(back_seek_penalty),
2608        CFQ_ATTR(slice_sync),
2609        CFQ_ATTR(slice_async),
2610        CFQ_ATTR(slice_async_rq),
2611        CFQ_ATTR(slice_idle),
2612        __ATTR_NULL
2613};
2614
2615static struct elevator_type iosched_cfq = {
2616        .ops = {
2617                .elevator_merge_fn =            cfq_merge,
2618                .elevator_merged_fn =           cfq_merged_request,
2619                .elevator_merge_req_fn =        cfq_merged_requests,
2620                .elevator_allow_merge_fn =      cfq_allow_merge,
2621                .elevator_dispatch_fn =         cfq_dispatch_requests,
2622                .elevator_add_req_fn =          cfq_insert_request,
2623                .elevator_activate_req_fn =     cfq_activate_request,
2624                .elevator_deactivate_req_fn =   cfq_deactivate_request,
2625                .elevator_queue_empty_fn =      cfq_queue_empty,
2626                .elevator_completed_req_fn =    cfq_completed_request,
2627                .elevator_former_req_fn =       elv_rb_former_request,
2628                .elevator_latter_req_fn =       elv_rb_latter_request,
2629                .elevator_set_req_fn =          cfq_set_request,
2630                .elevator_put_req_fn =          cfq_put_request,
2631                .elevator_may_queue_fn =        cfq_may_queue,
2632                .elevator_init_fn =             cfq_init_queue,
2633                .elevator_exit_fn =             cfq_exit_queue,
2634                .trim =                         cfq_free_io_context,
2635        },
2636        .elevator_attrs =       cfq_attrs,
2637        .elevator_name =        "cfq",
2638        .elevator_owner =       THIS_MODULE,
2639};
2640
2641static int __init cfq_init(void)
2642{
2643        /*
2644         * could be 0 on HZ < 1000 setups
2645         */
2646        if (!cfq_slice_async)
2647                cfq_slice_async = 1;
2648        if (!cfq_slice_idle)
2649                cfq_slice_idle = 1;
2650
2651        if (cfq_slab_setup())
2652                return -ENOMEM;
2653
2654        elv_register(&iosched_cfq);
2655
2656        return 0;
2657}
2658
2659static void __exit cfq_exit(void)
2660{
2661        DECLARE_COMPLETION_ONSTACK(all_gone);
2662        elv_unregister(&iosched_cfq);
2663        ioc_gone = &all_gone;
2664        /* ioc_gone's update must be visible before reading ioc_count */
2665        smp_wmb();
2666
2667        /*
2668         * this also protects us from entering cfq_slab_kill() with
2669         * pending RCU callbacks
2670         */
2671        if (elv_ioc_count_read(ioc_count))
2672                wait_for_completion(&all_gone);
2673        cfq_slab_kill();
2674}
2675
2676module_init(cfq_init);
2677module_exit(cfq_exit);
2678
2679MODULE_AUTHOR("Jens Axboe");
2680MODULE_LICENSE("GPL");
2681MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
2682
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.