linux/block/cfq-iosched.c
<<
>>
Prefs
   1/*
   2 *  CFQ, or complete fairness queueing, disk scheduler.
   3 *
   4 *  Based on ideas from a previously unfinished io
   5 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
   6 *
   7 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
   8 */
   9#include <linux/module.h>
  10#include <linux/slab.h>
  11#include <linux/blkdev.h>
  12#include <linux/elevator.h>
  13#include <linux/ktime.h>
  14#include <linux/rbtree.h>
  15#include <linux/ioprio.h>
  16#include <linux/blktrace_api.h>
  17#include <linux/blk-cgroup.h>
  18#include "blk.h"
  19#include "blk-wbt.h"
  20
  21/*
  22 * tunables
  23 */
  24/* max queue in one round of service */
  25static const int cfq_quantum = 8;
  26static const u64 cfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
  27/* maximum backwards seek, in KiB */
  28static const int cfq_back_max = 16 * 1024;
  29/* penalty of a backwards seek */
  30static const int cfq_back_penalty = 2;
  31static const u64 cfq_slice_sync = NSEC_PER_SEC / 10;
  32static u64 cfq_slice_async = NSEC_PER_SEC / 25;
  33static const int cfq_slice_async_rq = 2;
  34static u64 cfq_slice_idle = NSEC_PER_SEC / 125;
  35static u64 cfq_group_idle = NSEC_PER_SEC / 125;
  36static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */
  37static const int cfq_hist_divisor = 4;
  38
  39/*
  40 * offset from end of service tree
  41 */
  42#define CFQ_IDLE_DELAY          (NSEC_PER_SEC / 5)
  43
  44/*
  45 * below this threshold, we consider thinktime immediate
  46 */
  47#define CFQ_MIN_TT              (2 * NSEC_PER_SEC / HZ)
  48
  49#define CFQ_SLICE_SCALE         (5)
  50#define CFQ_HW_QUEUE_MIN        (5)
  51#define CFQ_SERVICE_SHIFT       12
  52
  53#define CFQQ_SEEK_THR           (sector_t)(8 * 100)
  54#define CFQQ_CLOSE_THR          (sector_t)(8 * 1024)
  55#define CFQQ_SECT_THR_NONROT    (sector_t)(2 * 32)
  56#define CFQQ_SEEKY(cfqq)        (hweight32(cfqq->seek_history) > 32/8)
  57
  58#define RQ_CIC(rq)              icq_to_cic((rq)->elv.icq)
  59#define RQ_CFQQ(rq)             (struct cfq_queue *) ((rq)->elv.priv[0])
  60#define RQ_CFQG(rq)             (struct cfq_group *) ((rq)->elv.priv[1])
  61
  62static struct kmem_cache *cfq_pool;
  63
  64#define CFQ_PRIO_LISTS          IOPRIO_BE_NR
  65#define cfq_class_idle(cfqq)    ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
  66#define cfq_class_rt(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
  67
  68#define sample_valid(samples)   ((samples) > 80)
  69#define rb_entry_cfqg(node)     rb_entry((node), struct cfq_group, rb_node)
  70
  71/* blkio-related constants */
  72#define CFQ_WEIGHT_LEGACY_MIN   10
  73#define CFQ_WEIGHT_LEGACY_DFL   500
  74#define CFQ_WEIGHT_LEGACY_MAX   1000
  75
  76struct cfq_ttime {
  77        u64 last_end_request;
  78
  79        u64 ttime_total;
  80        u64 ttime_mean;
  81        unsigned long ttime_samples;
  82};
  83
  84/*
  85 * Most of our rbtree usage is for sorting with min extraction, so
  86 * if we cache the leftmost node we don't have to walk down the tree
  87 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
  88 * move this into the elevator for the rq sorting as well.
  89 */
  90struct cfq_rb_root {
  91        struct rb_root rb;
  92        struct rb_node *left;
  93        unsigned count;
  94        u64 min_vdisktime;
  95        struct cfq_ttime ttime;
  96};
  97#define CFQ_RB_ROOT     (struct cfq_rb_root) { .rb = RB_ROOT, \
  98                        .ttime = {.last_end_request = ktime_get_ns(),},}
  99
 100/*
 101 * Per process-grouping structure
 102 */
 103struct cfq_queue {
 104        /* reference count */
 105        int ref;
 106        /* various state flags, see below */
 107        unsigned int flags;
 108        /* parent cfq_data */
 109        struct cfq_data *cfqd;
 110        /* service_tree member */
 111        struct rb_node rb_node;
 112        /* service_tree key */
 113        u64 rb_key;
 114        /* prio tree member */
 115        struct rb_node p_node;
 116        /* prio tree root we belong to, if any */
 117        struct rb_root *p_root;
 118        /* sorted list of pending requests */
 119        struct rb_root sort_list;
 120        /* if fifo isn't expired, next request to serve */
 121        struct request *next_rq;
 122        /* requests queued in sort_list */
 123        int queued[2];
 124        /* currently allocated requests */
 125        int allocated[2];
 126        /* fifo list of requests in sort_list */
 127        struct list_head fifo;
 128
 129        /* time when queue got scheduled in to dispatch first request. */
 130        u64 dispatch_start;
 131        u64 allocated_slice;
 132        u64 slice_dispatch;
 133        /* time when first request from queue completed and slice started. */
 134        u64 slice_start;
 135        u64 slice_end;
 136        s64 slice_resid;
 137
 138        /* pending priority requests */
 139        int prio_pending;
 140        /* number of requests that are on the dispatch list or inside driver */
 141        int dispatched;
 142
 143        /* io prio of this group */
 144        unsigned short ioprio, org_ioprio;
 145        unsigned short ioprio_class, org_ioprio_class;
 146
 147        pid_t pid;
 148
 149        u32 seek_history;
 150        sector_t last_request_pos;
 151
 152        struct cfq_rb_root *service_tree;
 153        struct cfq_queue *new_cfqq;
 154        struct cfq_group *cfqg;
 155        /* Number of sectors dispatched from queue in single dispatch round */
 156        unsigned long nr_sectors;
 157};
 158
 159/*
 160 * First index in the service_trees.
 161 * IDLE is handled separately, so it has negative index
 162 */
 163enum wl_class_t {
 164        BE_WORKLOAD = 0,
 165        RT_WORKLOAD = 1,
 166        IDLE_WORKLOAD = 2,
 167        CFQ_PRIO_NR,
 168};
 169
 170/*
 171 * Second index in the service_trees.
 172 */
 173enum wl_type_t {
 174        ASYNC_WORKLOAD = 0,
 175        SYNC_NOIDLE_WORKLOAD = 1,
 176        SYNC_WORKLOAD = 2
 177};
 178
 179struct cfqg_stats {
 180#ifdef CONFIG_CFQ_GROUP_IOSCHED
 181        /* number of ios merged */
 182        struct blkg_rwstat              merged;
 183        /* total time spent on device in ns, may not be accurate w/ queueing */
 184        struct blkg_rwstat              service_time;
 185        /* total time spent waiting in scheduler queue in ns */
 186        struct blkg_rwstat              wait_time;
 187        /* number of IOs queued up */
 188        struct blkg_rwstat              queued;
 189        /* total disk time and nr sectors dispatched by this group */
 190        struct blkg_stat                time;
 191#ifdef CONFIG_DEBUG_BLK_CGROUP
 192        /* time not charged to this cgroup */
 193        struct blkg_stat                unaccounted_time;
 194        /* sum of number of ios queued across all samples */
 195        struct blkg_stat                avg_queue_size_sum;
 196        /* count of samples taken for average */
 197        struct blkg_stat                avg_queue_size_samples;
 198        /* how many times this group has been removed from service tree */
 199        struct blkg_stat                dequeue;
 200        /* total time spent waiting for it to be assigned a timeslice. */
 201        struct blkg_stat                group_wait_time;
 202        /* time spent idling for this blkcg_gq */
 203        struct blkg_stat                idle_time;
 204        /* total time with empty current active q with other requests queued */
 205        struct blkg_stat                empty_time;
 206        /* fields after this shouldn't be cleared on stat reset */
 207        uint64_t                        start_group_wait_time;
 208        uint64_t                        start_idle_time;
 209        uint64_t                        start_empty_time;
 210        uint16_t                        flags;
 211#endif  /* CONFIG_DEBUG_BLK_CGROUP */
 212#endif  /* CONFIG_CFQ_GROUP_IOSCHED */
 213};
 214
 215/* Per-cgroup data */
 216struct cfq_group_data {
 217        /* must be the first member */
 218        struct blkcg_policy_data cpd;
 219
 220        unsigned int weight;
 221        unsigned int leaf_weight;
 222};
 223
 224/* This is per cgroup per device grouping structure */
 225struct cfq_group {
 226        /* must be the first member */
 227        struct blkg_policy_data pd;
 228
 229        /* group service_tree member */
 230        struct rb_node rb_node;
 231
 232        /* group service_tree key */
 233        u64 vdisktime;
 234
 235        /*
 236         * The number of active cfqgs and sum of their weights under this
 237         * cfqg.  This covers this cfqg's leaf_weight and all children's
 238         * weights, but does not cover weights of further descendants.
 239         *
 240         * If a cfqg is on the service tree, it's active.  An active cfqg
 241         * also activates its parent and contributes to the children_weight
 242         * of the parent.
 243         */
 244        int nr_active;
 245        unsigned int children_weight;
 246
 247        /*
 248         * vfraction is the fraction of vdisktime that the tasks in this
 249         * cfqg are entitled to.  This is determined by compounding the
 250         * ratios walking up from this cfqg to the root.
 251         *
 252         * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
 253         * vfractions on a service tree is approximately 1.  The sum may
 254         * deviate a bit due to rounding errors and fluctuations caused by
 255         * cfqgs entering and leaving the service tree.
 256         */
 257        unsigned int vfraction;
 258
 259        /*
 260         * There are two weights - (internal) weight is the weight of this
 261         * cfqg against the sibling cfqgs.  leaf_weight is the wight of
 262         * this cfqg against the child cfqgs.  For the root cfqg, both
 263         * weights are kept in sync for backward compatibility.
 264         */
 265        unsigned int weight;
 266        unsigned int new_weight;
 267        unsigned int dev_weight;
 268
 269        unsigned int leaf_weight;
 270        unsigned int new_leaf_weight;
 271        unsigned int dev_leaf_weight;
 272
 273        /* number of cfqq currently on this group */
 274        int nr_cfqq;
 275
 276        /*
 277         * Per group busy queues average. Useful for workload slice calc. We
 278         * create the array for each prio class but at run time it is used
 279         * only for RT and BE class and slot for IDLE class remains unused.
 280         * This is primarily done to avoid confusion and a gcc warning.
 281         */
 282        unsigned int busy_queues_avg[CFQ_PRIO_NR];
 283        /*
 284         * rr lists of queues with requests. We maintain service trees for
 285         * RT and BE classes. These trees are subdivided in subclasses
 286         * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
 287         * class there is no subclassification and all the cfq queues go on
 288         * a single tree service_tree_idle.
 289         * Counts are embedded in the cfq_rb_root
 290         */
 291        struct cfq_rb_root service_trees[2][3];
 292        struct cfq_rb_root service_tree_idle;
 293
 294        u64 saved_wl_slice;
 295        enum wl_type_t saved_wl_type;
 296        enum wl_class_t saved_wl_class;
 297
 298        /* number of requests that are on the dispatch list or inside driver */
 299        int dispatched;
 300        struct cfq_ttime ttime;
 301        struct cfqg_stats stats;        /* stats for this cfqg */
 302
 303        /* async queue for each priority case */
 304        struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
 305        struct cfq_queue *async_idle_cfqq;
 306
 307};
 308
 309struct cfq_io_cq {
 310        struct io_cq            icq;            /* must be the first member */
 311        struct cfq_queue        *cfqq[2];
 312        struct cfq_ttime        ttime;
 313        int                     ioprio;         /* the current ioprio */
 314#ifdef CONFIG_CFQ_GROUP_IOSCHED
 315        uint64_t                blkcg_serial_nr; /* the current blkcg serial */
 316#endif
 317};
 318
 319/*
 320 * Per block device queue structure
 321 */
 322struct cfq_data {
 323        struct request_queue *queue;
 324        /* Root service tree for cfq_groups */
 325        struct cfq_rb_root grp_service_tree;
 326        struct cfq_group *root_group;
 327
 328        /*
 329         * The priority currently being served
 330         */
 331        enum wl_class_t serving_wl_class;
 332        enum wl_type_t serving_wl_type;
 333        u64 workload_expires;
 334        struct cfq_group *serving_group;
 335
 336        /*
 337         * Each priority tree is sorted by next_request position.  These
 338         * trees are used when determining if two or more queues are
 339         * interleaving requests (see cfq_close_cooperator).
 340         */
 341        struct rb_root prio_trees[CFQ_PRIO_LISTS];
 342
 343        unsigned int busy_queues;
 344        unsigned int busy_sync_queues;
 345
 346        int rq_in_driver;
 347        int rq_in_flight[2];
 348
 349        /*
 350         * queue-depth detection
 351         */
 352        int rq_queued;
 353        int hw_tag;
 354        /*
 355         * hw_tag can be
 356         * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
 357         *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
 358         *  0 => no NCQ
 359         */
 360        int hw_tag_est_depth;
 361        unsigned int hw_tag_samples;
 362
 363        /*
 364         * idle window management
 365         */
 366        struct hrtimer idle_slice_timer;
 367        struct work_struct unplug_work;
 368
 369        struct cfq_queue *active_queue;
 370        struct cfq_io_cq *active_cic;
 371
 372        sector_t last_position;
 373
 374        /*
 375         * tunables, see top of file
 376         */
 377        unsigned int cfq_quantum;
 378        unsigned int cfq_back_penalty;
 379        unsigned int cfq_back_max;
 380        unsigned int cfq_slice_async_rq;
 381        unsigned int cfq_latency;
 382        u64 cfq_fifo_expire[2];
 383        u64 cfq_slice[2];
 384        u64 cfq_slice_idle;
 385        u64 cfq_group_idle;
 386        u64 cfq_target_latency;
 387
 388        /*
 389         * Fallback dummy cfqq for extreme OOM conditions
 390         */
 391        struct cfq_queue oom_cfqq;
 392
 393        u64 last_delayed_sync;
 394};
 395
 396static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
 397static void cfq_put_queue(struct cfq_queue *cfqq);
 398
 399static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
 400                                            enum wl_class_t class,
 401                                            enum wl_type_t type)
 402{
 403        if (!cfqg)
 404                return NULL;
 405
 406        if (class == IDLE_WORKLOAD)
 407                return &cfqg->service_tree_idle;
 408
 409        return &cfqg->service_trees[class][type];
 410}
 411
 412enum cfqq_state_flags {
 413        CFQ_CFQQ_FLAG_on_rr = 0,        /* on round-robin busy list */
 414        CFQ_CFQQ_FLAG_wait_request,     /* waiting for a request */
 415        CFQ_CFQQ_FLAG_must_dispatch,    /* must be allowed a dispatch */
 416        CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
 417        CFQ_CFQQ_FLAG_fifo_expire,      /* FIFO checked in this slice */
 418        CFQ_CFQQ_FLAG_idle_window,      /* slice idling enabled */
 419        CFQ_CFQQ_FLAG_prio_changed,     /* task priority has changed */
 420        CFQ_CFQQ_FLAG_slice_new,        /* no requests dispatched in slice */
 421        CFQ_CFQQ_FLAG_sync,             /* synchronous queue */
 422        CFQ_CFQQ_FLAG_coop,             /* cfqq is shared */
 423        CFQ_CFQQ_FLAG_split_coop,       /* shared cfqq will be splitted */
 424        CFQ_CFQQ_FLAG_deep,             /* sync cfqq experienced large depth */
 425        CFQ_CFQQ_FLAG_wait_busy,        /* Waiting for next request */
 426};
 427
 428#define CFQ_CFQQ_FNS(name)                                              \
 429static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)         \
 430{                                                                       \
 431        (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);                   \
 432}                                                                       \
 433static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)        \
 434{                                                                       \
 435        (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);                  \
 436}                                                                       \
 437static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)         \
 438{                                                                       \
 439        return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;      \
 440}
 441
 442CFQ_CFQQ_FNS(on_rr);
 443CFQ_CFQQ_FNS(wait_request);
 444CFQ_CFQQ_FNS(must_dispatch);
 445CFQ_CFQQ_FNS(must_alloc_slice);
 446CFQ_CFQQ_FNS(fifo_expire);
 447CFQ_CFQQ_FNS(idle_window);
 448CFQ_CFQQ_FNS(prio_changed);
 449CFQ_CFQQ_FNS(slice_new);
 450CFQ_CFQQ_FNS(sync);
 451CFQ_CFQQ_FNS(coop);
 452CFQ_CFQQ_FNS(split_coop);
 453CFQ_CFQQ_FNS(deep);
 454CFQ_CFQQ_FNS(wait_busy);
 455#undef CFQ_CFQQ_FNS
 456
 457#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
 458
 459/* cfqg stats flags */
 460enum cfqg_stats_flags {
 461        CFQG_stats_waiting = 0,
 462        CFQG_stats_idling,
 463        CFQG_stats_empty,
 464};
 465
 466#define CFQG_FLAG_FNS(name)                                             \
 467static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats)     \
 468{                                                                       \
 469        stats->flags |= (1 << CFQG_stats_##name);                       \
 470}                                                                       \
 471static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats)    \
 472{                                                                       \
 473        stats->flags &= ~(1 << CFQG_stats_##name);                      \
 474}                                                                       \
 475static inline int cfqg_stats_##name(struct cfqg_stats *stats)           \
 476{                                                                       \
 477        return (stats->flags & (1 << CFQG_stats_##name)) != 0;          \
 478}                                                                       \
 479
 480CFQG_FLAG_FNS(waiting)
 481CFQG_FLAG_FNS(idling)
 482CFQG_FLAG_FNS(empty)
 483#undef CFQG_FLAG_FNS
 484
 485/* This should be called with the queue_lock held. */
 486static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
 487{
 488        unsigned long long now;
 489
 490        if (!cfqg_stats_waiting(stats))
 491                return;
 492
 493        now = sched_clock();
 494        if (time_after64(now, stats->start_group_wait_time))
 495                blkg_stat_add(&stats->group_wait_time,
 496                              now - stats->start_group_wait_time);
 497        cfqg_stats_clear_waiting(stats);
 498}
 499
 500/* This should be called with the queue_lock held. */
 501static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
 502                                                 struct cfq_group *curr_cfqg)
 503{
 504        struct cfqg_stats *stats = &cfqg->stats;
 505
 506        if (cfqg_stats_waiting(stats))
 507                return;
 508        if (cfqg == curr_cfqg)
 509                return;
 510        stats->start_group_wait_time = sched_clock();
 511        cfqg_stats_mark_waiting(stats);
 512}
 513
 514/* This should be called with the queue_lock held. */
 515static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
 516{
 517        unsigned long long now;
 518
 519        if (!cfqg_stats_empty(stats))
 520                return;
 521
 522        now = sched_clock();
 523        if (time_after64(now, stats->start_empty_time))
 524                blkg_stat_add(&stats->empty_time,
 525                              now - stats->start_empty_time);
 526        cfqg_stats_clear_empty(stats);
 527}
 528
 529static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
 530{
 531        blkg_stat_add(&cfqg->stats.dequeue, 1);
 532}
 533
 534static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
 535{
 536        struct cfqg_stats *stats = &cfqg->stats;
 537
 538        if (blkg_rwstat_total(&stats->queued))
 539                return;
 540
 541        /*
 542         * group is already marked empty. This can happen if cfqq got new
 543         * request in parent group and moved to this group while being added
 544         * to service tree. Just ignore the event and move on.
 545         */
 546        if (cfqg_stats_empty(stats))
 547                return;
 548
 549        stats->start_empty_time = sched_clock();
 550        cfqg_stats_mark_empty(stats);
 551}
 552
 553static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
 554{
 555        struct cfqg_stats *stats = &cfqg->stats;
 556
 557        if (cfqg_stats_idling(stats)) {
 558                unsigned long long now = sched_clock();
 559
 560                if (time_after64(now, stats->start_idle_time))
 561                        blkg_stat_add(&stats->idle_time,
 562                                      now - stats->start_idle_time);
 563                cfqg_stats_clear_idling(stats);
 564        }
 565}
 566
 567static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
 568{
 569        struct cfqg_stats *stats = &cfqg->stats;
 570
 571        BUG_ON(cfqg_stats_idling(stats));
 572
 573        stats->start_idle_time = sched_clock();
 574        cfqg_stats_mark_idling(stats);
 575}
 576
 577static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
 578{
 579        struct cfqg_stats *stats = &cfqg->stats;
 580
 581        blkg_stat_add(&stats->avg_queue_size_sum,
 582                      blkg_rwstat_total(&stats->queued));
 583        blkg_stat_add(&stats->avg_queue_size_samples, 1);
 584        cfqg_stats_update_group_wait_time(stats);
 585}
 586
 587#else   /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
 588
 589static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
 590static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
 591static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
 592static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
 593static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
 594static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
 595static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
 596
 597#endif  /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
 598
 599#ifdef CONFIG_CFQ_GROUP_IOSCHED
 600
 601static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
 602{
 603        return pd ? container_of(pd, struct cfq_group, pd) : NULL;
 604}
 605
 606static struct cfq_group_data
 607*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
 608{
 609        return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
 610}
 611
 612static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
 613{
 614        return pd_to_blkg(&cfqg->pd);
 615}
 616
 617static struct blkcg_policy blkcg_policy_cfq;
 618
 619static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
 620{
 621        return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
 622}
 623
 624static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
 625{
 626        return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
 627}
 628
 629static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
 630{
 631        struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
 632
 633        return pblkg ? blkg_to_cfqg(pblkg) : NULL;
 634}
 635
 636static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
 637                                      struct cfq_group *ancestor)
 638{
 639        return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup,
 640                                    cfqg_to_blkg(ancestor)->blkcg->css.cgroup);
 641}
 642
 643static inline void cfqg_get(struct cfq_group *cfqg)
 644{
 645        return blkg_get(cfqg_to_blkg(cfqg));
 646}
 647
 648static inline void cfqg_put(struct cfq_group *cfqg)
 649{
 650        return blkg_put(cfqg_to_blkg(cfqg));
 651}
 652
 653#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)  do {                    \
 654        char __pbuf[128];                                               \
 655                                                                        \
 656        blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf));  \
 657        blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \
 658                        cfq_cfqq_sync((cfqq)) ? 'S' : 'A',              \
 659                        cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
 660                          __pbuf, ##args);                              \
 661} while (0)
 662
 663#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)  do {                    \
 664        char __pbuf[128];                                               \
 665                                                                        \
 666        blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf));          \
 667        blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args);    \
 668} while (0)
 669
 670static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
 671                                            struct cfq_group *curr_cfqg,
 672                                            unsigned int op)
 673{
 674        blkg_rwstat_add(&cfqg->stats.queued, op, 1);
 675        cfqg_stats_end_empty_time(&cfqg->stats);
 676        cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
 677}
 678
 679static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
 680                        uint64_t time, unsigned long unaccounted_time)
 681{
 682        blkg_stat_add(&cfqg->stats.time, time);
 683#ifdef CONFIG_DEBUG_BLK_CGROUP
 684        blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
 685#endif
 686}
 687
 688static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
 689                                               unsigned int op)
 690{
 691        blkg_rwstat_add(&cfqg->stats.queued, op, -1);
 692}
 693
 694static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
 695                                               unsigned int op)
 696{
 697        blkg_rwstat_add(&cfqg->stats.merged, op, 1);
 698}
 699
 700static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
 701                        uint64_t start_time, uint64_t io_start_time,
 702                        unsigned int op)
 703{
 704        struct cfqg_stats *stats = &cfqg->stats;
 705        unsigned long long now = sched_clock();
 706
 707        if (time_after64(now, io_start_time))
 708                blkg_rwstat_add(&stats->service_time, op, now - io_start_time);
 709        if (time_after64(io_start_time, start_time))
 710                blkg_rwstat_add(&stats->wait_time, op,
 711                                io_start_time - start_time);
 712}
 713
 714/* @stats = 0 */
 715static void cfqg_stats_reset(struct cfqg_stats *stats)
 716{
 717        /* queued stats shouldn't be cleared */
 718        blkg_rwstat_reset(&stats->merged);
 719        blkg_rwstat_reset(&stats->service_time);
 720        blkg_rwstat_reset(&stats->wait_time);
 721        blkg_stat_reset(&stats->time);
 722#ifdef CONFIG_DEBUG_BLK_CGROUP
 723        blkg_stat_reset(&stats->unaccounted_time);
 724        blkg_stat_reset(&stats->avg_queue_size_sum);
 725        blkg_stat_reset(&stats->avg_queue_size_samples);
 726        blkg_stat_reset(&stats->dequeue);
 727        blkg_stat_reset(&stats->group_wait_time);
 728        blkg_stat_reset(&stats->idle_time);
 729        blkg_stat_reset(&stats->empty_time);
 730#endif
 731}
 732
 733/* @to += @from */
 734static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
 735{
 736        /* queued stats shouldn't be cleared */
 737        blkg_rwstat_add_aux(&to->merged, &from->merged);
 738        blkg_rwstat_add_aux(&to->service_time, &from->service_time);
 739        blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
 740        blkg_stat_add_aux(&from->time, &from->time);
 741#ifdef CONFIG_DEBUG_BLK_CGROUP
 742        blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
 743        blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
 744        blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
 745        blkg_stat_add_aux(&to->dequeue, &from->dequeue);
 746        blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
 747        blkg_stat_add_aux(&to->idle_time, &from->idle_time);
 748        blkg_stat_add_aux(&to->empty_time, &from->empty_time);
 749#endif
 750}
 751
 752/*
 753 * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
 754 * recursive stats can still account for the amount used by this cfqg after
 755 * it's gone.
 756 */
 757static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
 758{
 759        struct cfq_group *parent = cfqg_parent(cfqg);
 760
 761        lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
 762
 763        if (unlikely(!parent))
 764                return;
 765
 766        cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
 767        cfqg_stats_reset(&cfqg->stats);
 768}
 769
 770#else   /* CONFIG_CFQ_GROUP_IOSCHED */
 771
 772static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
 773static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
 774                                      struct cfq_group *ancestor)
 775{
 776        return true;
 777}
 778static inline void cfqg_get(struct cfq_group *cfqg) { }
 779static inline void cfqg_put(struct cfq_group *cfqg) { }
 780
 781#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)  \
 782        blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \
 783                        cfq_cfqq_sync((cfqq)) ? 'S' : 'A',              \
 784                        cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
 785                                ##args)
 786#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)          do {} while (0)
 787
 788static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
 789                        struct cfq_group *curr_cfqg, unsigned int op) { }
 790static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
 791                        uint64_t time, unsigned long unaccounted_time) { }
 792static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
 793                        unsigned int op) { }
 794static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
 795                        unsigned int op) { }
 796static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
 797                        uint64_t start_time, uint64_t io_start_time,
 798                        unsigned int op) { }
 799
 800#endif  /* CONFIG_CFQ_GROUP_IOSCHED */
 801
 802#define cfq_log(cfqd, fmt, args...)     \
 803        blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
 804
 805/* Traverses through cfq group service trees */
 806#define for_each_cfqg_st(cfqg, i, j, st) \
 807        for (i = 0; i <= IDLE_WORKLOAD; i++) \
 808                for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
 809                        : &cfqg->service_tree_idle; \
 810                        (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
 811                        (i == IDLE_WORKLOAD && j == 0); \
 812                        j++, st = i < IDLE_WORKLOAD ? \
 813                        &cfqg->service_trees[i][j]: NULL) \
 814
 815static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
 816        struct cfq_ttime *ttime, bool group_idle)
 817{
 818        u64 slice;
 819        if (!sample_valid(ttime->ttime_samples))
 820                return false;
 821        if (group_idle)
 822                slice = cfqd->cfq_group_idle;
 823        else
 824                slice = cfqd->cfq_slice_idle;
 825        return ttime->ttime_mean > slice;
 826}
 827
 828static inline bool iops_mode(struct cfq_data *cfqd)
 829{
 830        /*
 831         * If we are not idling on queues and it is a NCQ drive, parallel
 832         * execution of requests is on and measuring time is not possible
 833         * in most of the cases until and unless we drive shallower queue
 834         * depths and that becomes a performance bottleneck. In such cases
 835         * switch to start providing fairness in terms of number of IOs.
 836         */
 837        if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
 838                return true;
 839        else
 840                return false;
 841}
 842
 843static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
 844{
 845        if (cfq_class_idle(cfqq))
 846                return IDLE_WORKLOAD;
 847        if (cfq_class_rt(cfqq))
 848                return RT_WORKLOAD;
 849        return BE_WORKLOAD;
 850}
 851
 852
 853static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
 854{
 855        if (!cfq_cfqq_sync(cfqq))
 856                return ASYNC_WORKLOAD;
 857        if (!cfq_cfqq_idle_window(cfqq))
 858                return SYNC_NOIDLE_WORKLOAD;
 859        return SYNC_WORKLOAD;
 860}
 861
 862static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
 863                                        struct cfq_data *cfqd,
 864                                        struct cfq_group *cfqg)
 865{
 866        if (wl_class == IDLE_WORKLOAD)
 867                return cfqg->service_tree_idle.count;
 868
 869        return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
 870                cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
 871                cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
 872}
 873
 874static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
 875                                        struct cfq_group *cfqg)
 876{
 877        return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
 878                cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
 879}
 880
 881static void cfq_dispatch_insert(struct request_queue *, struct request *);
 882static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
 883                                       struct cfq_io_cq *cic, struct bio *bio);
 884
 885static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
 886{
 887        /* cic->icq is the first member, %NULL will convert to %NULL */
 888        return container_of(icq, struct cfq_io_cq, icq);
 889}
 890
 891static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
 892                                               struct io_context *ioc)
 893{
 894        if (ioc)
 895                return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
 896        return NULL;
 897}
 898
 899static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
 900{
 901        return cic->cfqq[is_sync];
 902}
 903
 904static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
 905                                bool is_sync)
 906{
 907        cic->cfqq[is_sync] = cfqq;
 908}
 909
 910static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
 911{
 912        return cic->icq.q->elevator->elevator_data;
 913}
 914
 915/*
 916 * scheduler run of queue, if there are requests pending and no one in the
 917 * driver that will restart queueing
 918 */
 919static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
 920{
 921        if (cfqd->busy_queues) {
 922                cfq_log(cfqd, "schedule dispatch");
 923                kblockd_schedule_work(&cfqd->unplug_work);
 924        }
 925}
 926
 927/*
 928 * Scale schedule slice based on io priority. Use the sync time slice only
 929 * if a queue is marked sync and has sync io queued. A sync queue with async
 930 * io only, should not get full sync slice length.
 931 */
 932static inline u64 cfq_prio_slice(struct cfq_data *cfqd, bool sync,
 933                                 unsigned short prio)
 934{
 935        u64 base_slice = cfqd->cfq_slice[sync];
 936        u64 slice = div_u64(base_slice, CFQ_SLICE_SCALE);
 937
 938        WARN_ON(prio >= IOPRIO_BE_NR);
 939
 940        return base_slice + (slice * (4 - prio));
 941}
 942
 943static inline u64
 944cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 945{
 946        return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 947}
 948
 949/**
 950 * cfqg_scale_charge - scale disk time charge according to cfqg weight
 951 * @charge: disk time being charged
 952 * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
 953 *
 954 * Scale @charge according to @vfraction, which is in range (0, 1].  The
 955 * scaling is inversely proportional.
 956 *
 957 * scaled = charge / vfraction
 958 *
 959 * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
 960 */
 961static inline u64 cfqg_scale_charge(u64 charge,
 962                                    unsigned int vfraction)
 963{
 964        u64 c = charge << CFQ_SERVICE_SHIFT;    /* make it fixed point */
 965
 966        /* charge / vfraction */
 967        c <<= CFQ_SERVICE_SHIFT;
 968        return div_u64(c, vfraction);
 969}
 970
 971static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
 972{
 973        s64 delta = (s64)(vdisktime - min_vdisktime);
 974        if (delta > 0)
 975                min_vdisktime = vdisktime;
 976
 977        return min_vdisktime;
 978}
 979
 980static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
 981{
 982        s64 delta = (s64)(vdisktime - min_vdisktime);
 983        if (delta < 0)
 984                min_vdisktime = vdisktime;
 985
 986        return min_vdisktime;
 987}
 988
 989static void update_min_vdisktime(struct cfq_rb_root *st)
 990{
 991        struct cfq_group *cfqg;
 992
 993        if (st->left) {
 994                cfqg = rb_entry_cfqg(st->left);
 995                st->min_vdisktime = max_vdisktime(st->min_vdisktime,
 996                                                  cfqg->vdisktime);
 997        }
 998}
 999
1000/*
1001 * get averaged number of queues of RT/BE priority.
1002 * average is updated, with a formula that gives more weight to higher numbers,
1003 * to quickly follows sudden increases and decrease slowly
1004 */
1005
1006static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
1007                                        struct cfq_group *cfqg, bool rt)
1008{
1009        unsigned min_q, max_q;
1010        unsigned mult  = cfq_hist_divisor - 1;
1011        unsigned round = cfq_hist_divisor / 2;
1012        unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
1013
1014        min_q = min(cfqg->busy_queues_avg[rt], busy);
1015        max_q = max(cfqg->busy_queues_avg[rt], busy);
1016        cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
1017                cfq_hist_divisor;
1018        return cfqg->busy_queues_avg[rt];
1019}
1020
1021static inline u64
1022cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
1023{
1024        return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
1025}
1026
1027static inline u64
1028cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1029{
1030        u64 slice = cfq_prio_to_slice(cfqd, cfqq);
1031        if (cfqd->cfq_latency) {
1032                /*
1033                 * interested queues (we consider only the ones with the same
1034                 * priority class in the cfq group)
1035                 */
1036                unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
1037                                                cfq_class_rt(cfqq));
1038                u64 sync_slice = cfqd->cfq_slice[1];
1039                u64 expect_latency = sync_slice * iq;
1040                u64 group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
1041
1042                if (expect_latency > group_slice) {
1043                        u64 base_low_slice = 2 * cfqd->cfq_slice_idle;
1044                        u64 low_slice;
1045
1046                        /* scale low_slice according to IO priority
1047                         * and sync vs async */
1048                        low_slice = div64_u64(base_low_slice*slice, sync_slice);
1049                        low_slice = min(slice, low_slice);
1050                        /* the adapted slice value is scaled to fit all iqs
1051                         * into the target latency */
1052                        slice = div64_u64(slice*group_slice, expect_latency);
1053                        slice = max(slice, low_slice);
1054                }
1055        }
1056        return slice;
1057}
1058
1059static inline void
1060cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1061{
1062        u64 slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
1063        u64 now = ktime_get_ns();
1064
1065        cfqq->slice_start = now;
1066        cfqq->slice_end = now + slice;
1067        cfqq->allocated_slice = slice;
1068        cfq_log_cfqq(cfqd, cfqq, "set_slice=%llu", cfqq->slice_end - now);
1069}
1070
1071/*
1072 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
1073 * isn't valid until the first request from the dispatch is activated
1074 * and the slice time set.
1075 */
1076static inline bool cfq_slice_used(struct cfq_queue *cfqq)
1077{
1078        if (cfq_cfqq_slice_new(cfqq))
1079                return false;
1080        if (ktime_get_ns() < cfqq->slice_end)
1081                return false;
1082
1083        return true;
1084}
1085
1086/*
1087 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
1088 * We choose the request that is closest to the head right now. Distance
1089 * behind the head is penalized and only allowed to a certain extent.
1090 */
1091static struct request *
1092cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
1093{
1094        sector_t s1, s2, d1 = 0, d2 = 0;
1095        unsigned long back_max;
1096#define CFQ_RQ1_WRAP    0x01 /* request 1 wraps */
1097#define CFQ_RQ2_WRAP    0x02 /* request 2 wraps */
1098        unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1099
1100        if (rq1 == NULL || rq1 == rq2)
1101                return rq2;
1102        if (rq2 == NULL)
1103                return rq1;
1104
1105        if (rq_is_sync(rq1) != rq_is_sync(rq2))
1106                return rq_is_sync(rq1) ? rq1 : rq2;
1107
1108        if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
1109                return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
1110
1111        s1 = blk_rq_pos(rq1);
1112        s2 = blk_rq_pos(rq2);
1113
1114        /*
1115         * by definition, 1KiB is 2 sectors
1116         */
1117        back_max = cfqd->cfq_back_max * 2;
1118
1119        /*
1120         * Strict one way elevator _except_ in the case where we allow
1121         * short backward seeks which are biased as twice the cost of a
1122         * similar forward seek.
1123         */
1124        if (s1 >= last)
1125                d1 = s1 - last;
1126        else if (s1 + back_max >= last)
1127                d1 = (last - s1) * cfqd->cfq_back_penalty;
1128        else
1129                wrap |= CFQ_RQ1_WRAP;
1130
1131        if (s2 >= last)
1132                d2 = s2 - last;
1133        else if (s2 + back_max >= last)
1134                d2 = (last - s2) * cfqd->cfq_back_penalty;
1135        else
1136                wrap |= CFQ_RQ2_WRAP;
1137
1138        /* Found required data */
1139
1140        /*
1141         * By doing switch() on the bit mask "wrap" we avoid having to
1142         * check two variables for all permutations: --> faster!
1143         */
1144        switch (wrap) {
1145        case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1146                if (d1 < d2)
1147                        return rq1;
1148                else if (d2 < d1)
1149                        return rq2;
1150                else {
1151                        if (s1 >= s2)
1152                                return rq1;
1153                        else
1154                                return rq2;
1155                }
1156
1157        case CFQ_RQ2_WRAP:
1158                return rq1;
1159        case CFQ_RQ1_WRAP:
1160                return rq2;
1161        case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
1162        default:
1163                /*
1164                 * Since both rqs are wrapped,
1165                 * start with the one that's further behind head
1166                 * (--> only *one* back seek required),
1167                 * since back seek takes more time than forward.
1168                 */
1169                if (s1 <= s2)
1170                        return rq1;
1171                else
1172                        return rq2;
1173        }
1174}
1175
1176/*
1177 * The below is leftmost cache rbtree addon
1178 */
1179static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
1180{
1181        /* Service tree is empty */
1182        if (!root->count)
1183                return NULL;
1184
1185        if (!root->left)
1186                root->left = rb_first(&root->rb);
1187
1188        if (root->left)
1189                return rb_entry(root->left, struct cfq_queue, rb_node);
1190
1191        return NULL;
1192}
1193
1194static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
1195{
1196        if (!root->left)
1197                root->left = rb_first(&root->rb);
1198
1199        if (root->left)
1200                return rb_entry_cfqg(root->left);
1201
1202        return NULL;
1203}
1204
1205static void rb_erase_init(struct rb_node *n, struct rb_root *root)
1206{
1207        rb_erase(n, root);
1208        RB_CLEAR_NODE(n);
1209}
1210
1211static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
1212{
1213        if (root->left == n)
1214                root->left = NULL;
1215        rb_erase_init(n, &root->rb);
1216        --root->count;
1217}
1218
1219/*
1220 * would be nice to take fifo expire time into account as well
1221 */
1222static struct request *
1223cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1224                  struct request *last)
1225{
1226        struct rb_node *rbnext = rb_next(&last->rb_node);
1227        struct rb_node *rbprev = rb_prev(&last->rb_node);
1228        struct request *next = NULL, *prev = NULL;
1229
1230        BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1231
1232        if (rbprev)
1233                prev = rb_entry_rq(rbprev);
1234
1235        if (rbnext)
1236                next = rb_entry_rq(rbnext);
1237        else {
1238                rbnext = rb_first(&cfqq->sort_list);
1239                if (rbnext && rbnext != &last->rb_node)
1240                        next = rb_entry_rq(rbnext);
1241        }
1242
1243        return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
1244}
1245
1246static u64 cfq_slice_offset(struct cfq_data *cfqd,
1247                            struct cfq_queue *cfqq)
1248{
1249        /*
1250         * just an approximation, should be ok.
1251         */
1252        return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
1253                       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
1254}
1255
1256static inline s64
1257cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
1258{
1259        return cfqg->vdisktime - st->min_vdisktime;
1260}
1261
1262static void
1263__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1264{
1265        struct rb_node **node = &st->rb.rb_node;
1266        struct rb_node *parent = NULL;
1267        struct cfq_group *__cfqg;
1268        s64 key = cfqg_key(st, cfqg);
1269        int left = 1;
1270
1271        while (*node != NULL) {
1272                parent = *node;
1273                __cfqg = rb_entry_cfqg(parent);
1274
1275                if (key < cfqg_key(st, __cfqg))
1276                        node = &parent->rb_left;
1277                else {
1278                        node = &parent->rb_right;
1279                        left = 0;
1280                }
1281        }
1282
1283        if (left)
1284                st->left = &cfqg->rb_node;
1285
1286        rb_link_node(&cfqg->rb_node, parent, node);
1287        rb_insert_color(&cfqg->rb_node, &st->rb);
1288}
1289
1290/*
1291 * This has to be called only on activation of cfqg
1292 */
1293static void
1294cfq_update_group_weight(struct cfq_group *cfqg)
1295{
1296        if (cfqg->new_weight) {
1297                cfqg->weight = cfqg->new_weight;
1298                cfqg->new_weight = 0;
1299        }
1300}
1301
1302static void
1303cfq_update_group_leaf_weight(struct cfq_group *cfqg)
1304{
1305        BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1306
1307        if (cfqg->new_leaf_weight) {
1308                cfqg->leaf_weight = cfqg->new_leaf_weight;
1309                cfqg->new_leaf_weight = 0;
1310        }
1311}
1312
1313static void
1314cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1315{
1316        unsigned int vfr = 1 << CFQ_SERVICE_SHIFT;      /* start with 1 */
1317        struct cfq_group *pos = cfqg;
1318        struct cfq_group *parent;
1319        bool propagate;
1320
1321        /* add to the service tree */
1322        BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1323
1324        /*
1325         * Update leaf_weight.  We cannot update weight at this point
1326         * because cfqg might already have been activated and is
1327         * contributing its current weight to the parent's child_weight.
1328         */
1329        cfq_update_group_leaf_weight(cfqg);
1330        __cfq_group_service_tree_add(st, cfqg);
1331
1332        /*
1333         * Activate @cfqg and calculate the portion of vfraction @cfqg is
1334         * entitled to.  vfraction is calculated by walking the tree
1335         * towards the root calculating the fraction it has at each level.
1336         * The compounded ratio is how much vfraction @cfqg owns.
1337         *
1338         * Start with the proportion tasks in this cfqg has against active
1339         * children cfqgs - its leaf_weight against children_weight.
1340         */
1341        propagate = !pos->nr_active++;
1342        pos->children_weight += pos->leaf_weight;
1343        vfr = vfr * pos->leaf_weight / pos->children_weight;
1344
1345        /*
1346         * Compound ->weight walking up the tree.  Both activation and
1347         * vfraction calculation are done in the same loop.  Propagation
1348         * stops once an already activated node is met.  vfraction
1349         * calculation should always continue to the root.
1350         */
1351        while ((parent = cfqg_parent(pos))) {
1352                if (propagate) {
1353                        cfq_update_group_weight(pos);
1354                        propagate = !parent->nr_active++;
1355                        parent->children_weight += pos->weight;
1356                }
1357                vfr = vfr * pos->weight / parent->children_weight;
1358                pos = parent;
1359        }
1360
1361        cfqg->vfraction = max_t(unsigned, vfr, 1);
1362}
1363
1364static void
1365cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
1366{
1367        struct cfq_rb_root *st = &cfqd->grp_service_tree;
1368        struct cfq_group *__cfqg;
1369        struct rb_node *n;
1370
1371        cfqg->nr_cfqq++;
1372        if (!RB_EMPTY_NODE(&cfqg->rb_node))
1373                return;
1374
1375        /*
1376         * Currently put the group at the end. Later implement something
1377         * so that groups get lesser vtime based on their weights, so that
1378         * if group does not loose all if it was not continuously backlogged.
1379         */
1380        n = rb_last(&st->rb);
1381        if (n) {
1382                __cfqg = rb_entry_cfqg(n);
1383                cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
1384        } else
1385                cfqg->vdisktime = st->min_vdisktime;
1386        cfq_group_service_tree_add(st, cfqg);
1387}
1388
1389static void
1390cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
1391{
1392        struct cfq_group *pos = cfqg;
1393        bool propagate;
1394
1395        /*
1396         * Undo activation from cfq_group_service_tree_add().  Deactivate
1397         * @cfqg and propagate deactivation upwards.
1398         */
1399        propagate = !--pos->nr_active;
1400        pos->children_weight -= pos->leaf_weight;
1401
1402        while (propagate) {
1403                struct cfq_group *parent = cfqg_parent(pos);
1404
1405                /* @pos has 0 nr_active at this point */
1406                WARN_ON_ONCE(pos->children_weight);
1407                pos->vfraction = 0;
1408
1409                if (!parent)
1410                        break;
1411
1412                propagate = !--parent->nr_active;
1413                parent->children_weight -= pos->weight;
1414                pos = parent;
1415        }
1416
1417        /* remove from the service tree */
1418        if (!RB_EMPTY_NODE(&cfqg->rb_node))
1419                cfq_rb_erase(&cfqg->rb_node, st);
1420}
1421
1422static void
1423cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
1424{
1425        struct cfq_rb_root *st = &cfqd->grp_service_tree;
1426
1427        BUG_ON(cfqg->nr_cfqq < 1);
1428        cfqg->nr_cfqq--;
1429
1430        /* If there are other cfq queues under this group, don't delete it */
1431        if (cfqg->nr_cfqq)
1432                return;
1433
1434        cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
1435        cfq_group_service_tree_del(st, cfqg);
1436        cfqg->saved_wl_slice = 0;
1437        cfqg_stats_update_dequeue(cfqg);
1438}
1439
1440static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
1441                                       u64 *unaccounted_time)
1442{
1443        u64 slice_used;
1444        u64 now = ktime_get_ns();
1445
1446        /*
1447         * Queue got expired before even a single request completed or
1448         * got expired immediately after first request completion.
1449         */
1450        if (!cfqq->slice_start || cfqq->slice_start == now) {
1451                /*
1452                 * Also charge the seek time incurred to the group, otherwise
1453                 * if there are mutiple queues in the group, each can dispatch
1454                 * a single request on seeky media and cause lots of seek time
1455                 * and group will never know it.
1456                 */
1457                slice_used = max_t(u64, (now - cfqq->dispatch_start),
1458                                        jiffies_to_nsecs(1));
1459        } else {
1460                slice_used = now - cfqq->slice_start;
1461                if (slice_used > cfqq->allocated_slice) {
1462                        *unaccounted_time = slice_used - cfqq->allocated_slice;
1463                        slice_used = cfqq->allocated_slice;
1464                }
1465                if (cfqq->slice_start > cfqq->dispatch_start)
1466                        *unaccounted_time += cfqq->slice_start -
1467                                        cfqq->dispatch_start;
1468        }
1469
1470        return slice_used;
1471}
1472
1473static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1474                                struct cfq_queue *cfqq)
1475{
1476        struct cfq_rb_root *st = &cfqd->grp_service_tree;
1477        u64 used_sl, charge, unaccounted_sl = 0;
1478        int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
1479                        - cfqg->service_tree_idle.count;
1480        unsigned int vfr;
1481        u64 now = ktime_get_ns();
1482
1483        BUG_ON(nr_sync < 0);
1484        used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
1485
1486        if (iops_mode(cfqd))
1487                charge = cfqq->slice_dispatch;
1488        else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
1489                charge = cfqq->allocated_slice;
1490
1491        /*
1492         * Can't update vdisktime while on service tree and cfqg->vfraction
1493         * is valid only while on it.  Cache vfr, leave the service tree,
1494         * update vdisktime and go back on.  The re-addition to the tree
1495         * will also update the weights as necessary.
1496         */
1497        vfr = cfqg->vfraction;
1498        cfq_group_service_tree_del(st, cfqg);
1499        cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
1500        cfq_group_service_tree_add(st, cfqg);
1501
1502        /* This group is being expired. Save the context */
1503        if (cfqd->workload_expires > now) {
1504                cfqg->saved_wl_slice = cfqd->workload_expires - now;
1505                cfqg->saved_wl_type = cfqd->serving_wl_type;
1506                cfqg->saved_wl_class = cfqd->serving_wl_class;
1507        } else
1508                cfqg->saved_wl_slice = 0;
1509
1510        cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
1511                                        st->min_vdisktime);
1512        cfq_log_cfqq(cfqq->cfqd, cfqq,
1513                     "sl_used=%llu disp=%llu charge=%llu iops=%u sect=%lu",
1514                     used_sl, cfqq->slice_dispatch, charge,
1515                     iops_mode(cfqd), cfqq->nr_sectors);
1516        cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
1517        cfqg_stats_set_start_empty_time(cfqg);
1518}
1519
1520/**
1521 * cfq_init_cfqg_base - initialize base part of a cfq_group
1522 * @cfqg: cfq_group to initialize
1523 *
1524 * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
1525 * is enabled or not.
1526 */
1527static void cfq_init_cfqg_base(struct cfq_group *cfqg)
1528{
1529        struct cfq_rb_root *st;
1530        int i, j;
1531
1532        for_each_cfqg_st(cfqg, i, j, st)
1533                *st = CFQ_RB_ROOT;
1534        RB_CLEAR_NODE(&cfqg->rb_node);
1535
1536        cfqg->ttime.last_end_request = ktime_get_ns();
1537}
1538
1539#ifdef CONFIG_CFQ_GROUP_IOSCHED
1540static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
1541                            bool on_dfl, bool reset_dev, bool is_leaf_weight);
1542
1543static void cfqg_stats_exit(struct cfqg_stats *stats)
1544{
1545        blkg_rwstat_exit(&stats->merged);
1546        blkg_rwstat_exit(&stats->service_time);
1547        blkg_rwstat_exit(&stats->wait_time);
1548        blkg_rwstat_exit(&stats->queued);
1549        blkg_stat_exit(&stats->time);
1550#ifdef CONFIG_DEBUG_BLK_CGROUP
1551        blkg_stat_exit(&stats->unaccounted_time);
1552        blkg_stat_exit(&stats->avg_queue_size_sum);
1553        blkg_stat_exit(&stats->avg_queue_size_samples);
1554        blkg_stat_exit(&stats->dequeue);
1555        blkg_stat_exit(&stats->group_wait_time);
1556        blkg_stat_exit(&stats->idle_time);
1557        blkg_stat_exit(&stats->empty_time);
1558#endif
1559}
1560
1561static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
1562{
1563        if (blkg_rwstat_init(&stats->merged, gfp) ||
1564            blkg_rwstat_init(&stats->service_time, gfp) ||
1565            blkg_rwstat_init(&stats->wait_time, gfp) ||
1566            blkg_rwstat_init(&stats->queued, gfp) ||
1567            blkg_stat_init(&stats->time, gfp))
1568                goto err;
1569
1570#ifdef CONFIG_DEBUG_BLK_CGROUP
1571        if (blkg_stat_init(&stats->unaccounted_time, gfp) ||
1572            blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
1573            blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
1574            blkg_stat_init(&stats->dequeue, gfp) ||
1575            blkg_stat_init(&stats->group_wait_time, gfp) ||
1576            blkg_stat_init(&stats->idle_time, gfp) ||
1577            blkg_stat_init(&stats->empty_time, gfp))
1578                goto err;
1579#endif
1580        return 0;
1581err:
1582        cfqg_stats_exit(stats);
1583        return -ENOMEM;
1584}
1585
1586static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
1587{
1588        struct cfq_group_data *cgd;
1589
1590        cgd = kzalloc(sizeof(*cgd), gfp);
1591        if (!cgd)
1592                return NULL;
1593        return &cgd->cpd;
1594}
1595
1596static void cfq_cpd_init(struct blkcg_policy_data *cpd)
1597{
1598        struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
1599        unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
1600                              CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1601
1602        if (cpd_to_blkcg(cpd) == &blkcg_root)
1603                weight *= 2;
1604
1605        cgd->weight = weight;
1606        cgd->leaf_weight = weight;
1607}
1608
1609static void cfq_cpd_free(struct blkcg_policy_data *cpd)
1610{
1611        kfree(cpd_to_cfqgd(cpd));
1612}
1613
1614static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
1615{
1616        struct blkcg *blkcg = cpd_to_blkcg(cpd);
1617        bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys);
1618        unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1619
1620        if (blkcg == &blkcg_root)
1621                weight *= 2;
1622
1623        WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false));
1624        WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true));
1625}
1626
1627static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
1628{
1629        struct cfq_group *cfqg;
1630
1631        cfqg = kzalloc_node(sizeof(*cfqg), gfp, node);
1632        if (!cfqg)
1633                return NULL;
1634
1635        cfq_init_cfqg_base(cfqg);
1636        if (cfqg_stats_init(&cfqg->stats, gfp)) {
1637                kfree(cfqg);
1638                return NULL;
1639        }
1640
1641        return &cfqg->pd;
1642}
1643
1644static void cfq_pd_init(struct blkg_policy_data *pd)
1645{
1646        struct cfq_group *cfqg = pd_to_cfqg(pd);
1647        struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg);
1648
1649        cfqg->weight = cgd->weight;
1650        cfqg->leaf_weight = cgd->leaf_weight;
1651}
1652
1653static void cfq_pd_offline(struct blkg_policy_data *pd)
1654{
1655        struct cfq_group *cfqg = pd_to_cfqg(pd);
1656        int i;
1657
1658        for (i = 0; i < IOPRIO_BE_NR; i++) {
1659                if (cfqg->async_cfqq[0][i])
1660                        cfq_put_queue(cfqg->async_cfqq[0][i]);
1661                if (cfqg->async_cfqq[1][i])
1662                        cfq_put_queue(cfqg->async_cfqq[1][i]);
1663        }
1664
1665        if (cfqg->async_idle_cfqq)
1666                cfq_put_queue(cfqg->async_idle_cfqq);
1667
1668        /*
1669         * @blkg is going offline and will be ignored by
1670         * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
1671         * that they don't get lost.  If IOs complete after this point, the
1672         * stats for them will be lost.  Oh well...
1673         */
1674        cfqg_stats_xfer_dead(cfqg);
1675}
1676
1677static void cfq_pd_free(struct blkg_policy_data *pd)
1678{
1679        struct cfq_group *cfqg = pd_to_cfqg(pd);
1680
1681        cfqg_stats_exit(&cfqg->stats);
1682        return kfree(cfqg);
1683}
1684
1685static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
1686{
1687        struct cfq_group *cfqg = pd_to_cfqg(pd);
1688
1689        cfqg_stats_reset(&cfqg->stats);
1690}
1691
1692static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
1693                                         struct blkcg *blkcg)
1694{
1695        struct blkcg_gq *blkg;
1696
1697        blkg = blkg_lookup(blkcg, cfqd->queue);
1698        if (likely(blkg))
1699                return blkg_to_cfqg(blkg);
1700        return NULL;
1701}
1702
1703static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1704{
1705        cfqq->cfqg = cfqg;
1706        /* cfqq reference on cfqg */
1707        cfqg_get(cfqg);
1708}
1709
1710static u64 cfqg_prfill_weight_device(struct seq_file *sf,
1711                                     struct blkg_policy_data *pd, int off)
1712{
1713        struct cfq_group *cfqg = pd_to_cfqg(pd);
1714
1715        if (!cfqg->dev_weight)
1716                return 0;
1717        return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
1718}
1719
1720static int cfqg_print_weight_device(struct seq_file *sf, void *v)
1721{
1722        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1723                          cfqg_prfill_weight_device, &blkcg_policy_cfq,
1724                          0, false);
1725        return 0;
1726}
1727
1728static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
1729                                          struct blkg_policy_data *pd, int off)
1730{
1731        struct cfq_group *cfqg = pd_to_cfqg(pd);
1732
1733        if (!cfqg->dev_leaf_weight)
1734                return 0;
1735        return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
1736}
1737
1738static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
1739{
1740        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1741                          cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
1742                          0, false);
1743        return 0;
1744}
1745
1746static int cfq_print_weight(struct seq_file *sf, void *v)
1747{
1748        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1749        struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1750        unsigned int val = 0;
1751
1752        if (cgd)
1753                val = cgd->weight;
1754
1755        seq_printf(sf, "%u\n", val);
1756        return 0;
1757}
1758
1759static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
1760{
1761        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1762        struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1763        unsigned int val = 0;
1764
1765        if (cgd)
1766                val = cgd->leaf_weight;
1767
1768        seq_printf(sf, "%u\n", val);
1769        return 0;
1770}
1771
1772static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
1773                                        char *buf, size_t nbytes, loff_t off,
1774                                        bool on_dfl, bool is_leaf_weight)
1775{
1776        unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
1777        unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
1778        struct blkcg *blkcg = css_to_blkcg(of_css(of));
1779        struct blkg_conf_ctx ctx;
1780        struct cfq_group *cfqg;
1781        struct cfq_group_data *cfqgd;
1782        int ret;
1783        u64 v;
1784
1785        ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
1786        if (ret)
1787                return ret;
1788
1789        if (sscanf(ctx.body, "%llu", &v) == 1) {
1790                /* require "default" on dfl */
1791                ret = -ERANGE;
1792                if (!v && on_dfl)
1793                        goto out_finish;
1794        } else if (!strcmp(strim(ctx.body), "default")) {
1795                v = 0;
1796        } else {
1797                ret = -EINVAL;
1798                goto out_finish;
1799        }
1800
1801        cfqg = blkg_to_cfqg(ctx.blkg);
1802        cfqgd = blkcg_to_cfqgd(blkcg);
1803
1804        ret = -ERANGE;
1805        if (!v || (v >= min && v <= max)) {
1806                if (!is_leaf_weight) {
1807                        cfqg->dev_weight = v;
1808                        cfqg->new_weight = v ?: cfqgd->weight;
1809                } else {
1810                        cfqg->dev_leaf_weight = v;
1811                        cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight;
1812                }
1813                ret = 0;
1814        }
1815out_finish:
1816        blkg_conf_finish(&ctx);
1817        return ret ?: nbytes;
1818}
1819
1820static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
1821                                      char *buf, size_t nbytes, loff_t off)
1822{
1823        return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
1824}
1825
1826static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
1827                                           char *buf, size_t nbytes, loff_t off)
1828{
1829        return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
1830}
1831
1832static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
1833                            bool on_dfl, bool reset_dev, bool is_leaf_weight)
1834{
1835        unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
1836        unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
1837        struct blkcg *blkcg = css_to_blkcg(css);
1838        struct blkcg_gq *blkg;
1839        struct cfq_group_data *cfqgd;
1840        int ret = 0;
1841
1842        if (val < min || val > max)
1843                return -ERANGE;
1844
1845        spin_lock_irq(&blkcg->lock);
1846        cfqgd = blkcg_to_cfqgd(blkcg);
1847        if (!cfqgd) {
1848                ret = -EINVAL;
1849                goto out;
1850        }
1851
1852        if (!is_leaf_weight)
1853                cfqgd->weight = val;
1854        else
1855                cfqgd->leaf_weight = val;
1856
1857        hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
1858                struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1859
1860                if (!cfqg)
1861                        continue;
1862
1863                if (!is_leaf_weight) {
1864                        if (reset_dev)
1865                                cfqg->dev_weight = 0;
1866                        if (!cfqg->dev_weight)
1867                                cfqg->new_weight = cfqgd->weight;
1868                } else {
1869                        if (reset_dev)
1870                                cfqg->dev_leaf_weight = 0;
1871                        if (!cfqg->dev_leaf_weight)
1872                                cfqg->new_leaf_weight = cfqgd->leaf_weight;
1873                }
1874        }
1875
1876out:
1877        spin_unlock_irq(&blkcg->lock);
1878        return ret;
1879}
1880
1881static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
1882                          u64 val)
1883{
1884        return __cfq_set_weight(css, val, false, false, false);
1885}
1886
1887static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
1888                               struct cftype *cft, u64 val)
1889{
1890        return __cfq_set_weight(css, val, false, false, true);
1891}
1892
1893static int cfqg_print_stat(struct seq_file *sf, void *v)
1894{
1895        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
1896                          &blkcg_policy_cfq, seq_cft(sf)->private, false);
1897        return 0;
1898}
1899
1900static int cfqg_print_rwstat(struct seq_file *sf, void *v)
1901{
1902        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
1903                          &blkcg_policy_cfq, seq_cft(sf)->private, true);
1904        return 0;
1905}
1906
1907static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
1908                                      struct blkg_policy_data *pd, int off)
1909{
1910        u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
1911                                          &blkcg_policy_cfq, off);
1912        return __blkg_prfill_u64(sf, pd, sum);
1913}
1914
1915static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
1916                                        struct blkg_policy_data *pd, int off)
1917{
1918        struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
1919                                                        &blkcg_policy_cfq, off);
1920        return __blkg_prfill_rwstat(sf, pd, &sum);
1921}
1922
1923static int cfqg_print_stat_recursive(struct seq_file *sf, void *v)
1924{
1925        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1926                          cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
1927                          seq_cft(sf)->private, false);
1928        return 0;
1929}
1930
1931static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
1932{
1933        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1934                          cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
1935                          seq_cft(sf)->private, true);
1936        return 0;
1937}
1938
1939static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
1940                               int off)
1941{
1942        u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
1943
1944        return __blkg_prfill_u64(sf, pd, sum >> 9);
1945}
1946
1947static int cfqg_print_stat_sectors(struct seq_file *sf, void *v)
1948{
1949        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1950                          cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false);
1951        return 0;
1952}
1953
1954static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf,
1955                                         struct blkg_policy_data *pd, int off)
1956{
1957        struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
1958                                        offsetof(struct blkcg_gq, stat_bytes));
1959        u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
1960                atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
1961
1962        return __blkg_prfill_u64(sf, pd, sum >> 9);
1963}
1964
1965static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
1966{
1967        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1968                          cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0,
1969                          false);
1970        return 0;
1971}
1972
1973#ifdef CONFIG_DEBUG_BLK_CGROUP
1974static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1975                                      struct blkg_policy_data *pd, int off)
1976{
1977        struct cfq_group *cfqg = pd_to_cfqg(pd);
1978        u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
1979        u64 v = 0;
1980
1981        if (samples) {
1982                v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
1983                v = div64_u64(v, samples);
1984        }
1985        __blkg_prfill_u64(sf, pd, v);
1986        return 0;
1987}
1988
1989/* print avg_queue_size */
1990static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
1991{
1992        blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1993                          cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
1994                          0, false);
1995        return 0;
1996}
1997#endif  /* CONFIG_DEBUG_BLK_CGROUP */
1998
1999static struct cftype cfq_blkcg_legacy_files[] = {
2000        /* on root, weight is mapped to leaf_weight */
2001        {
2002                .name = "weight_device",
2003                .flags = CFTYPE_ONLY_ON_ROOT,
2004                .seq_show = cfqg_print_leaf_weight_device,
2005                .write = cfqg_set_leaf_weight_device,
2006        },
2007        {
2008                .name = "weight",
2009                .flags = CFTYPE_ONLY_ON_ROOT,
2010                .seq_show = cfq_print_leaf_weight,
2011                .write_u64 = cfq_set_leaf_weight,
2012        },
2013
2014        /* no such mapping necessary for !roots */
2015        {
2016                .name = "weight_device",
2017                .flags = CFTYPE_NOT_ON_ROOT,
2018                .seq_show = cfqg_print_weight_device,
2019                .write = cfqg_set_weight_device,
2020        },
2021        {
2022                .name = "weight",
2023                .flags = CFTYPE_NOT_ON_ROOT,
2024                .seq_show = cfq_print_weight,
2025                .write_u64 = cfq_set_weight,
2026        },
2027
2028        {
2029                .name = "leaf_weight_device",
2030                .seq_show = cfqg_print_leaf_weight_device,
2031                .write = cfqg_set_leaf_weight_device,
2032        },
2033        {
2034                .name = "leaf_weight",
2035                .seq_show = cfq_print_leaf_weight,
2036                .write_u64 = cfq_set_leaf_weight,
2037        },
2038
2039        /* statistics, covers only the tasks in the cfqg */
2040        {
2041                .name = "time",
2042                .private = offsetof(struct cfq_group, stats.time),
2043                .seq_show = cfqg_print_stat,
2044        },
2045        {
2046                .name = "sectors",
2047                .seq_show = cfqg_print_stat_sectors,
2048        },
2049        {
2050                .name = "io_service_bytes",
2051                .private = (unsigned long)&blkcg_policy_cfq,
2052                .seq_show = blkg_print_stat_bytes,
2053        },
2054        {
2055                .name = "io_serviced",
2056                .private = (unsigned long)&blkcg_policy_cfq,
2057                .seq_show = blkg_print_stat_ios,
2058        },
2059        {
2060                .name = "io_service_time",
2061                .private = offsetof(struct cfq_group, stats.service_time),
2062                .seq_show = cfqg_print_rwstat,
2063        },
2064        {
2065                .name = "io_wait_time",
2066                .private = offsetof(struct cfq_group, stats.wait_time),
2067                .seq_show = cfqg_print_rwstat,
2068        },
2069        {
2070                .name = "io_merged",
2071                .private = offsetof(struct cfq_group, stats.merged),
2072                .seq_show = cfqg_print_rwstat,
2073        },
2074        {
2075                .name = "io_queued",
2076                .private = offsetof(struct cfq_group, stats.queued),
2077                .seq_show = cfqg_print_rwstat,
2078        },
2079
2080        /* the same statictics which cover the cfqg and its descendants */
2081        {
2082                .name = "time_recursive",
2083                .private = offsetof(struct cfq_group, stats.time),
2084                .seq_show = cfqg_print_stat_recursive,
2085        },
2086        {
2087                .name = "sectors_recursive",
2088                .seq_show = cfqg_print_stat_sectors_recursive,
2089        },
2090        {
2091                .name = "io_service_bytes_recursive",
2092                .private = (unsigned long)&blkcg_policy_cfq,
2093                .seq_show = blkg_print_stat_bytes_recursive,
2094        },
2095        {
2096                .name = "io_serviced_recursive",
2097                .private = (unsigned long)&blkcg_policy_cfq,
2098                .seq_show = blkg_print_stat_ios_recursive,
2099        },
2100        {
2101                .name = "io_service_time_recursive",
2102                .private = offsetof(struct cfq_group, stats.service_time),
2103                .seq_show = cfqg_print_rwstat_recursive,
2104        },
2105        {
2106                .name = "io_wait_time_recursive",
2107                .private = offsetof(struct cfq_group, stats.wait_time),
2108                .seq_show = cfqg_print_rwstat_recursive,
2109        },
2110        {
2111                .name = "io_merged_recursive",
2112                .private = offsetof(struct cfq_group, stats.merged),
2113                .seq_show = cfqg_print_rwstat_recursive,
2114        },
2115        {
2116                .name = "io_queued_recursive",
2117                .private = offsetof(struct cfq_group, stats.queued),
2118                .seq_show = cfqg_print_rwstat_recursive,
2119        },
2120#ifdef CONFIG_DEBUG_BLK_CGROUP
2121        {
2122                .name = "avg_queue_size",
2123                .seq_show = cfqg_print_avg_queue_size,
2124        },
2125        {
2126                .name = "group_wait_time",
2127                .private = offsetof(struct cfq_group, stats.group_wait_time),
2128                .seq_show = cfqg_print_stat,
2129        },
2130        {
2131                .name = "idle_time",
2132                .private = offsetof(struct cfq_group, stats.idle_time),
2133                .seq_show = cfqg_print_stat,
2134        },
2135        {
2136                .name = "empty_time",
2137                .private = offsetof(struct cfq_group, stats.empty_time),
2138                .seq_show = cfqg_print_stat,
2139        },
2140        {
2141                .name = "dequeue",
2142                .private = offsetof(struct cfq_group, stats.dequeue),
2143                .seq_show = cfqg_print_stat,
2144        },
2145        {
2146                .name = "unaccounted_time",
2147                .private = offsetof(struct cfq_group, stats.unaccounted_time),
2148                .seq_show = cfqg_print_stat,
2149        },
2150#endif  /* CONFIG_DEBUG_BLK_CGROUP */
2151        { }     /* terminate */
2152};
2153
2154static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v)
2155{
2156        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2157        struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
2158
2159        seq_printf(sf, "default %u\n", cgd->weight);
2160        blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
2161                          &blkcg_policy_cfq, 0, false);
2162        return 0;
2163}
2164
2165static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
2166                                     char *buf, size_t nbytes, loff_t off)
2167{
2168        char *endp;
2169        int ret;
2170        u64 v;
2171
2172        buf = strim(buf);
2173
2174        /* "WEIGHT" or "default WEIGHT" sets the default weight */
2175        v = simple_strtoull(buf, &endp, 0);
2176        if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
2177                ret = __cfq_set_weight(of_css(of), v, true, false, false);
2178                return ret ?: nbytes;
2179        }
2180
2181        /* "MAJ:MIN WEIGHT" */
2182        return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
2183}
2184
2185static struct cftype cfq_blkcg_files[] = {
2186        {
2187                .name = "weight",
2188                .flags = CFTYPE_NOT_ON_ROOT,
2189                .seq_show = cfq_print_weight_on_dfl,
2190                .write = cfq_set_weight_on_dfl,
2191        },
2192        { }     /* terminate */
2193};
2194
2195#else /* GROUP_IOSCHED */
2196static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
2197                                         struct blkcg *blkcg)
2198{
2199        return cfqd->root_group;
2200}
2201
2202static inline void
2203cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
2204        cfqq->cfqg = cfqg;
2205}
2206
2207#endif /* GROUP_IOSCHED */
2208
2209/*
2210 * The cfqd->service_trees holds all pending cfq_queue's that have
2211 * requests waiting to be processed. It is sorted in the order that
2212 * we will service the queues.
2213 */
2214static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2215                                 bool add_front)
2216{
2217        struct rb_node **p, *parent;
2218        struct cfq_queue *__cfqq;
2219        u64 rb_key;
2220        struct cfq_rb_root *st;
2221        int left;
2222        int new_cfqq = 1;
2223        u64 now = ktime_get_ns();
2224
2225        st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
2226        if (cfq_class_idle(cfqq)) {
2227                rb_key = CFQ_IDLE_DELAY;
2228                parent = rb_last(&st->rb);
2229                if (parent && parent != &cfqq->rb_node) {
2230                        __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
2231                        rb_key += __cfqq->rb_key;
2232                } else
2233                        rb_key += now;
2234        } else if (!add_front) {
2235                /*
2236                 * Get our rb key offset. Subtract any residual slice
2237                 * value carried from last service. A negative resid
2238                 * count indicates slice overrun, and this should position
2239                 * the next service time further away in the tree.
2240                 */
2241                rb_key = cfq_slice_offset(cfqd, cfqq) + now;
2242                rb_key -= cfqq->slice_resid;
2243                cfqq->slice_resid = 0;
2244        } else {
2245                rb_key = -NSEC_PER_SEC;
2246                __cfqq = cfq_rb_first(st);
2247                rb_key += __cfqq ? __cfqq->rb_key : now;
2248        }
2249
2250        if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2251                new_cfqq = 0;
2252                /*
2253                 * same position, nothing more to do
2254                 */
2255                if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
2256                        return;
2257
2258                cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2259                cfqq->service_tree = NULL;
2260        }
2261
2262        left = 1;
2263        parent = NULL;
2264        cfqq->service_tree = st;
2265        p = &st->rb.rb_node;
2266        while (*p) {
2267                parent = *p;
2268                __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
2269
2270                /*
2271                 * sort by key, that represents service time.
2272                 */
2273                if (rb_key < __cfqq->rb_key)
2274                        p = &parent->rb_left;
2275                else {
2276                        p = &parent->rb_right;
2277                        left = 0;
2278                }
2279        }
2280
2281        if (left)
2282                st->left = &cfqq->rb_node;
2283
2284        cfqq->rb_key = rb_key;
2285        rb_link_node(&cfqq->rb_node, parent, p);
2286        rb_insert_color(&cfqq->rb_node, &st->rb);
2287        st->count++;
2288        if (add_front || !new_cfqq)
2289                return;
2290        cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
2291}
2292
2293static struct cfq_queue *
2294cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
2295                     sector_t sector, struct rb_node **ret_parent,
2296                     struct rb_node ***rb_link)
2297{
2298        struct rb_node **p, *parent;
2299        struct cfq_queue *cfqq = NULL;
2300
2301        parent = NULL;
2302        p = &root->rb_node;
2303        while (*p) {
2304                struct rb_node **n;
2305
2306                parent = *p;
2307                cfqq = rb_entry(parent, struct cfq_queue, p_node);
2308
2309                /*
2310                 * Sort strictly based on sector.  Smallest to the left,
2311                 * largest to the right.
2312                 */
2313                if (sector > blk_rq_pos(cfqq->next_rq))
2314                        n = &(*p)->rb_right;
2315                else if (sector < blk_rq_pos(cfqq->next_rq))
2316                        n = &(*p)->rb_left;
2317                else
2318                        break;
2319                p = n;
2320                cfqq = NULL;
2321        }
2322
2323        *ret_parent = parent;
2324        if (rb_link)
2325                *rb_link = p;
2326        return cfqq;
2327}
2328
2329static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2330{
2331        struct rb_node **p, *parent;
2332        struct cfq_queue *__cfqq;
2333
2334        if (cfqq->p_root) {
2335                rb_erase(&cfqq->p_node, cfqq->p_root);
2336                cfqq->p_root = NULL;
2337        }
2338
2339        if (cfq_class_idle(cfqq))
2340                return;
2341        if (!cfqq->next_rq)
2342                return;
2343
2344        cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
2345        __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
2346                                      blk_rq_pos(cfqq->next_rq), &parent, &p);
2347        if (!__cfqq) {
2348                rb_link_node(&cfqq->p_node, parent, p);
2349                rb_insert_color(&cfqq->p_node, cfqq->p_root);
2350        } else
2351                cfqq->p_root = NULL;
2352}
2353
2354/*
2355 * Update cfqq's position in the service tree.
2356 */
2357static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2358{
2359        /*
2360         * Resorting requires the cfqq to be on the RR list already.
2361         */
2362        if (cfq_cfqq_on_rr(cfqq)) {
2363                cfq_service_tree_add(cfqd, cfqq, 0);
2364                cfq_prio_tree_add(cfqd, cfqq);
2365        }
2366}
2367
2368/*
2369 * add to busy list of queues for service, trying to be fair in ordering
2370 * the pending list according to last request service
2371 */
2372static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2373{
2374        cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
2375        BUG_ON(cfq_cfqq_on_rr(cfqq));
2376        cfq_mark_cfqq_on_rr(cfqq);
2377        cfqd->busy_queues++;
2378        if (cfq_cfqq_sync(cfqq))
2379                cfqd->busy_sync_queues++;
2380
2381        cfq_resort_rr_list(cfqd, cfqq);
2382}
2383
2384/*
2385 * Called when the cfqq no longer has requests pending, remove it from
2386 * the service tree.
2387 */
2388static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2389{
2390        cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
2391        BUG_ON(!cfq_cfqq_on_rr(cfqq));
2392        cfq_clear_cfqq_on_rr(cfqq);
2393
2394        if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2395                cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2396                cfqq->service_tree = NULL;
2397        }
2398        if (cfqq->p_root) {
2399                rb_erase(&cfqq->p_node, cfqq->p_root);
2400                cfqq->p_root = NULL;
2401        }
2402
2403        cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
2404        BUG_ON(!cfqd->busy_queues);
2405        cfqd->busy_queues--;
2406        if (cfq_cfqq_sync(cfqq))
2407                cfqd->busy_sync_queues--;
2408}
2409
2410/*
2411 * rb tree support functions
2412 */
2413static void cfq_del_rq_rb(struct request *rq)
2414{
2415        struct cfq_queue *cfqq = RQ_CFQQ(rq);
2416        const int sync = rq_is_sync(rq);
2417
2418        BUG_ON(!cfqq->queued[sync]);
2419        cfqq->queued[sync]--;
2420
2421        elv_rb_del(&cfqq->sort_list, rq);
2422
2423        if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
2424                /*
2425                 * Queue will be deleted from service tree when we actually
2426                 * expire it later. Right now just remove it from prio tree
2427                 * as it is empty.
2428                 */
2429                if (cfqq->p_root) {
2430                        rb_erase(&cfqq->p_node, cfqq->p_root);
2431                        cfqq->p_root = NULL;
2432                }
2433        }
2434}
2435
2436static void cfq_add_rq_rb(struct request *rq)
2437{
2438        struct cfq_queue *cfqq = RQ_CFQQ(rq);
2439        struct cfq_data *cfqd = cfqq->cfqd;
2440        struct request *prev;
2441
2442        cfqq->queued[rq_is_sync(rq)]++;
2443
2444        elv_rb_add(&cfqq->sort_list, rq);
2445
2446        if (!cfq_cfqq_on_rr(cfqq))
2447                cfq_add_cfqq_rr(cfqd, cfqq);
2448
2449        /*
2450         * check if this request is a better next-serve candidate
2451         */
2452        prev = cfqq->next_rq;
2453        cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
2454
2455        /*
2456         * adjust priority tree position, if ->next_rq changes
2457         */
2458        if (prev != cfqq->next_rq)
2459                cfq_prio_tree_add(cfqd, cfqq);
2460
2461        BUG_ON(!cfqq->next_rq);
2462}
2463
2464static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
2465{
2466        elv_rb_del(&cfqq->sort_list, rq);
2467        cfqq->queued[rq_is_sync(rq)]--;
2468        cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2469        cfq_add_rq_rb(rq);
2470        cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
2471                                 rq->cmd_flags);
2472}
2473
2474static struct request *
2475cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
2476{
2477        struct task_struct *tsk = current;
2478        struct cfq_io_cq *cic;
2479        struct cfq_queue *cfqq;
2480
2481        cic = cfq_cic_lookup(cfqd, tsk->io_context);
2482        if (!cic)
2483                return NULL;
2484
2485        cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf));
2486        if (cfqq)
2487                return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
2488
2489        return NULL;
2490}
2491
2492static void cfq_activate_request(struct request_queue *q, struct request *rq)
2493{
2494        struct cfq_data *cfqd = q->elevator->elevator_data;
2495
2496        cfqd->rq_in_driver++;
2497        cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
2498                                                cfqd->rq_in_driver);
2499
2500        cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
2501}
2502
2503static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
2504{
2505        struct cfq_data *cfqd = q->elevator->elevator_data;
2506
2507        WARN_ON(!cfqd->rq_in_driver);
2508        cfqd->rq_in_driver--;
2509        cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
2510                                                cfqd->rq_in_driver);
2511}
2512
2513static void cfq_remove_request(struct request *rq)
2514{
2515        struct cfq_queue *cfqq = RQ_CFQQ(rq);
2516
2517        if (cfqq->next_rq == rq)
2518                cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
2519
2520        list_del_init(&rq->queuelist);
2521        cfq_del_rq_rb(rq);
2522
2523        cfqq->cfqd->rq_queued--;
2524        cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2525        if (rq->cmd_flags & REQ_PRIO) {
2526                WARN_ON(!cfqq->prio_pending);
2527                cfqq->prio_pending--;
2528        }
2529}
2530
2531static int cfq_merge(struct request_queue *q, struct request **req,
2532                     struct bio *bio)
2533{
2534        struct cfq_data *cfqd = q->elevator->elevator_data;
2535        struct request *__rq;
2536
2537        __rq = cfq_find_rq_fmerge(cfqd, bio);
2538        if (__rq && elv_bio_merge_ok(__rq, bio)) {
2539                *req = __rq;
2540                return ELEVATOR_FRONT_MERGE;
2541        }
2542
2543        return ELEVATOR_NO_MERGE;
2544}
2545
2546static void cfq_merged_request(struct request_queue *q, struct request *req,
2547                               int type)
2548{
2549        if (type == ELEVATOR_FRONT_MERGE) {
2550                struct cfq_queue *cfqq = RQ_CFQQ(req);
2551
2552                cfq_reposition_rq_rb(cfqq, req);
2553        }
2554}
2555
2556static void cfq_bio_merged(struct request_queue *q, struct request *req,
2557                                struct bio *bio)
2558{
2559        cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf);
2560}
2561
2562static void
2563cfq_merged_requests(struct request_queue *q, struct request *rq,
2564                    struct request *next)
2565{
2566        struct cfq_queue *cfqq = RQ_CFQQ(rq);
2567        struct cfq_data *cfqd = q->elevator->elevator_data;
2568
2569        /*
2570         * reposition in fifo if next is older than rq
2571         */
2572        if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2573            next->fifo_time < rq->fifo_time &&
2574            cfqq == RQ_CFQQ(next)) {
2575                list_move(&rq->queuelist, &next->queuelist);
2576                rq->fifo_time = next->fifo_time;
2577        }
2578
2579        if (cfqq->next_rq == next)
2580                cfqq->next_rq = rq;
2581        cfq_remove_request(next);
2582        cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
2583
2584        cfqq = RQ_CFQQ(next);
2585        /*
2586         * all requests of this queue are merged to other queues, delete it
2587         * from the service tree. If it's the active_queue,
2588         * cfq_dispatch_requests() will choose to expire it or do idle
2589         */
2590        if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
2591            cfqq != cfqd->active_queue)
2592                cfq_del_cfqq_rr(cfqd, cfqq);
2593}
2594
2595static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq,
2596                               struct bio *bio)
2597{
2598        struct cfq_data *cfqd = q->elevator->elevator_data;
2599        bool is_sync = op_is_sync(bio->bi_opf);
2600        struct cfq_io_cq *cic;
2601        struct cfq_queue *cfqq;
2602
2603        /*
2604         * Disallow merge of a sync bio into an async request.
2605         */
2606        if (is_sync && !rq_is_sync(rq))
2607                return false;
2608
2609        /*
2610         * Lookup the cfqq that this bio will be queued with and allow
2611         * merge only if rq is queued there.
2612         */
2613        cic = cfq_cic_lookup(cfqd, current->io_context);
2614        if (!cic)
2615                return false;
2616
2617        cfqq = cic_to_cfqq(cic, is_sync);
2618        return cfqq == RQ_CFQQ(rq);
2619}
2620
2621static int cfq_allow_rq_merge(struct request_queue *q, struct request *rq,
2622                              struct request *next)
2623{
2624        return RQ_CFQQ(rq) == RQ_CFQQ(next);
2625}
2626
2627static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2628{
2629        hrtimer_try_to_cancel(&cfqd->idle_slice_timer);
2630        cfqg_stats_update_idle_time(cfqq->cfqg);
2631}
2632
2633static void __cfq_set_active_queue(struct cfq_data *cfqd,
2634                                   struct cfq_queue *cfqq)
2635{
2636        if (cfqq) {
2637                cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
2638                                cfqd->serving_wl_class, cfqd->serving_wl_type);
2639                cfqg_stats_update_avg_queue_size(cfqq->cfqg);
2640                cfqq->slice_start = 0;
2641                cfqq->dispatch_start = ktime_get_ns();
2642                cfqq->allocated_slice = 0;
2643                cfqq->slice_end = 0;
2644                cfqq->slice_dispatch = 0;
2645                cfqq->nr_sectors = 0;
2646
2647                cfq_clear_cfqq_wait_request(cfqq);
2648                cfq_clear_cfqq_must_dispatch(cfqq);
2649                cfq_clear_cfqq_must_alloc_slice(cfqq);
2650                cfq_clear_cfqq_fifo_expire(cfqq);
2651                cfq_mark_cfqq_slice_new(cfqq);
2652
2653                cfq_del_timer(cfqd, cfqq);
2654        }
2655
2656        cfqd->active_queue = cfqq;
2657}
2658
2659/*
2660 * current cfqq expired its slice (or was too idle), select new one
2661 */
2662static void
2663__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2664                    bool timed_out)
2665{
2666        cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
2667
2668        if (cfq_cfqq_wait_request(cfqq))
2669                cfq_del_timer(cfqd, cfqq);
2670
2671        cfq_clear_cfqq_wait_request(cfqq);
2672        cfq_clear_cfqq_wait_busy(cfqq);
2673
2674        /*
2675         * If this cfqq is shared between multiple processes, check to
2676         * make sure that those processes are still issuing I/Os within
2677         * the mean seek distance.  If not, it may be time to break the
2678         * queues apart again.
2679         */
2680        if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
2681                cfq_mark_cfqq_split_coop(cfqq);
2682
2683        /*
2684         * store what was left of this slice, if the queue idled/timed out
2685         */
2686        if (timed_out) {
2687                if (cfq_cfqq_slice_new(cfqq))
2688                        cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
2689                else
2690                        cfqq->slice_resid = cfqq->slice_end - ktime_get_ns();
2691                cfq_log_cfqq(cfqd, cfqq, "resid=%lld", cfqq->slice_resid);
2692        }
2693
2694        cfq_group_served(cfqd, cfqq->cfqg, cfqq);
2695
2696        if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
2697                cfq_del_cfqq_rr(cfqd, cfqq);
2698
2699        cfq_resort_rr_list(cfqd, cfqq);
2700
2701        if (cfqq == cfqd->active_queue)
2702                cfqd->active_queue = NULL;
2703
2704        if (cfqd->active_cic) {
2705                put_io_context(cfqd->active_cic->icq.ioc);
2706                cfqd->active_cic = NULL;
2707        }
2708}
2709
2710static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
2711{
2712        struct cfq_queue *cfqq = cfqd->active_queue;
2713
2714        if (cfqq)
2715                __cfq_slice_expired(cfqd, cfqq, timed_out);
2716}
2717
2718/*
2719 * Get next queue for service. Unless we have a queue preemption,
2720 * we'll simply select the first cfqq in the service tree.
2721 */
2722static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
2723{
2724        struct cfq_rb_root *st = st_for(cfqd->serving_group,
2725                        cfqd->serving_wl_class, cfqd->serving_wl_type);
2726
2727        if (!cfqd->rq_queued)
2728                return NULL;
2729
2730        /* There is nothing to dispatch */
2731        if (!st)
2732                return NULL;
2733        if (RB_EMPTY_ROOT(&st->rb))
2734                return NULL;
2735        return cfq_rb_first(st);
2736}
2737
2738static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
2739{
2740        struct cfq_group *cfqg;
2741        struct cfq_queue *cfqq;
2742        int i, j;
2743        struct cfq_rb_root *st;
2744
2745        if (!cfqd->rq_queued)
2746                return NULL;
2747
2748        cfqg = cfq_get_next_cfqg(cfqd);
2749        if (!cfqg)
2750                return NULL;
2751
2752        for_each_cfqg_st(cfqg, i, j, st)
2753                if ((cfqq = cfq_rb_first(st)) != NULL)
2754                        return cfqq;
2755        return NULL;
2756}
2757
2758/*
2759 * Get and set a new active queue for service.
2760 */
2761static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
2762                                              struct cfq_queue *cfqq)
2763{
2764        if (!cfqq)
2765                cfqq = cfq_get_next_queue(cfqd);
2766
2767        __cfq_set_active_queue(cfqd, cfqq);
2768        return cfqq;
2769}
2770
2771static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
2772                                          struct request *rq)
2773{
2774        if (blk_rq_pos(rq) >= cfqd->last_position)
2775                return blk_rq_pos(rq) - cfqd->last_position;
2776        else
2777                return cfqd->last_position - blk_rq_pos(rq);
2778}
2779
2780static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2781                               struct request *rq)
2782{
2783        return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
2784}
2785
2786static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
2787                                    struct cfq_queue *cur_cfqq)
2788{
2789        struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
2790        struct rb_node *parent, *node;
2791        struct cfq_queue *__cfqq;
2792        sector_t sector = cfqd->last_position;
2793
2794        if (RB_EMPTY_ROOT(root))
2795                return NULL;
2796
2797        /*
2798         * First, if we find a request starting at the end of the last
2799         * request, choose it.
2800         */
2801        __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
2802        if (__cfqq)
2803                return __cfqq;
2804
2805        /*
2806         * If the exact sector wasn't found, the parent of the NULL leaf
2807         * will contain the closest sector.
2808         */
2809        __cfqq = rb_entry(parent, struct cfq_queue, p_node);
2810        if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2811                return __cfqq;
2812
2813        if (blk_rq_pos(__cfqq->next_rq) < sector)
2814                node = rb_next(&__cfqq->p_node);
2815        else
2816                node = rb_prev(&__cfqq->p_node);
2817        if (!node)
2818                return NULL;
2819
2820        __cfqq = rb_entry(node, struct cfq_queue, p_node);
2821        if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2822                return __cfqq;
2823
2824        return NULL;
2825}
2826
2827/*
2828 * cfqd - obvious
2829 * cur_cfqq - passed in so that we don't decide that the current queue is
2830 *            closely cooperating with itself.
2831 *
2832 * So, basically we're assuming that that cur_cfqq has dispatched at least
2833 * one request, and that cfqd->last_position reflects a position on the disk
2834 * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
2835 * assumption.
2836 */
2837static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
2838                                              struct cfq_queue *cur_cfqq)
2839{
2840        struct cfq_queue *cfqq;
2841
2842        if (cfq_class_idle(cur_cfqq))
2843                return NULL;
2844        if (!cfq_cfqq_sync(cur_cfqq))
2845                return NULL;
2846        if (CFQQ_SEEKY(cur_cfqq))
2847                return NULL;
2848
2849        /*
2850         * Don't search priority tree if it's the only queue in the group.
2851         */
2852        if (cur_cfqq->cfqg->nr_cfqq == 1)
2853                return NULL;
2854
2855        /*
2856         * We should notice if some of the queues are cooperating, eg
2857         * working closely on the same area of the disk. In that case,
2858         * we can group them together and don't waste time idling.
2859         */
2860        cfqq = cfqq_close(cfqd, cur_cfqq);
2861        if (!cfqq)
2862                return NULL;
2863
2864        /* If new queue belongs to different cfq_group, don't choose it */
2865        if (cur_cfqq->cfqg != cfqq->cfqg)
2866                return NULL;
2867
2868        /*
2869         * It only makes sense to merge sync queues.
2870         */
2871        if (!cfq_cfqq_sync(cfqq))
2872                return NULL;
2873        if (CFQQ_SEEKY(cfqq))
2874                return NULL;
2875
2876        /*
2877         * Do not merge queues of different priority classes
2878         */
2879        if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
2880                return NULL;
2881
2882        return cfqq;
2883}
2884
2885/*
2886 * Determine whether we should enforce idle window for this queue.
2887 */
2888
2889static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2890{
2891        enum wl_class_t wl_class = cfqq_class(cfqq);
2892        struct cfq_rb_root *st = cfqq->service_tree;
2893
2894        BUG_ON(!st);
2895        BUG_ON(!st->count);
2896
2897        if (!cfqd->cfq_slice_idle)
2898                return false;
2899
2900        /* We never do for idle class queues. */
2901        if (wl_class == IDLE_WORKLOAD)
2902                return false;
2903
2904        /* We do for queues that were marked with idle window flag. */
2905        if (cfq_cfqq_idle_window(cfqq) &&
2906           !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
2907                return true;
2908
2909        /*
2910         * Otherwise, we do only if they are the last ones
2911         * in their service tree.
2912         */
2913        if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
2914           !cfq_io_thinktime_big(cfqd, &st->ttime, false))
2915                return true;
2916        cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
2917        return false;
2918}
2919
2920static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2921{
2922        struct cfq_queue *cfqq = cfqd->active_queue;
2923        struct cfq_rb_root *st = cfqq->service_tree;
2924        struct cfq_io_cq *cic;
2925        u64 sl, group_idle = 0;
2926        u64 now = ktime_get_ns();
2927
2928        /*
2929         * SSD device without seek penalty, disable idling. But only do so
2930         * for devices that support queuing, otherwise we still have a problem
2931         * with sync vs async workloads.
2932         */
2933        if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
2934                return;
2935
2936        WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
2937        WARN_ON(cfq_cfqq_slice_new(cfqq));
2938
2939        /*
2940         * idle is disabled, either manually or by past process history
2941         */
2942        if (!cfq_should_idle(cfqd, cfqq)) {
2943                /* no queue idling. Check for group idling */
2944                if (cfqd->cfq_group_idle)
2945                        group_idle = cfqd->cfq_group_idle;
2946                else
2947                        return;
2948        }
2949
2950        /*
2951         * still active requests from this queue, don't idle
2952         */
2953        if (cfqq->dispatched)
2954                return;
2955
2956        /*
2957         * task has exited, don't wait
2958         */
2959        cic = cfqd->active_cic;
2960        if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
2961                return;
2962
2963        /*
2964         * If our average think time is larger than the remaining time
2965         * slice, then don't idle. This avoids overrunning the allotted
2966         * time slice.
2967         */
2968        if (sample_valid(cic->ttime.ttime_samples) &&
2969            (cfqq->slice_end - now < cic->ttime.ttime_mean)) {
2970                cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%llu",
2971                             cic->ttime.ttime_mean);
2972                return;
2973        }
2974
2975        /*
2976         * There are other queues in the group or this is the only group and
2977         * it has too big thinktime, don't do group idle.
2978         */
2979        if (group_idle &&
2980            (cfqq->cfqg->nr_cfqq > 1 ||
2981             cfq_io_thinktime_big(cfqd, &st->ttime, true)))
2982                return;
2983
2984        cfq_mark_cfqq_wait_request(cfqq);
2985
2986        if (group_idle)
2987                sl = cfqd->cfq_group_idle;
2988        else
2989                sl = cfqd->cfq_slice_idle;
2990
2991        hrtimer_start(&cfqd->idle_slice_timer, ns_to_ktime(sl),
2992                      HRTIMER_MODE_REL);
2993        cfqg_stats_set_start_idle_time(cfqq->cfqg);
2994        cfq_log_cfqq(cfqd, cfqq, "arm_idle: %llu group_idle: %d", sl,
2995                        group_idle ? 1 : 0);
2996}
2997
2998/*
2999 * Move request from internal lists to the request queue dispatch list.
3000 */
3001static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
3002{
3003        struct cfq_data *cfqd = q->elevator->elevator_data;
3004        struct cfq_queue *cfqq = RQ_CFQQ(rq);
3005
3006        cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
3007
3008        cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
3009        cfq_remove_request(rq);
3010        cfqq->dispatched++;
3011        (RQ_CFQG(rq))->dispatched++;
3012        elv_dispatch_sort(q, rq);
3013
3014        cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
3015        cfqq->nr_sectors += blk_rq_sectors(rq);
3016}
3017
3018/*
3019 * return expired entry, or NULL to just start from scratch in rbtree
3020 */
3021static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
3022{
3023        struct request *rq = NULL;
3024
3025        if (cfq_cfqq_fifo_expire(cfqq))
3026                return NULL;
3027
3028        cfq_mark_cfqq_fifo_expire(cfqq);
3029
3030        if (list_empty(&cfqq->fifo))
3031                return NULL;
3032
3033        rq = rq_entry_fifo(cfqq->fifo.next);
3034        if (ktime_get_ns() < rq->fifo_time)
3035                rq = NULL;
3036
3037        return rq;
3038}
3039
3040static inline int
3041cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3042{
3043        const int base_rq = cfqd->cfq_slice_async_rq;
3044
3045        WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
3046
3047        return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
3048}
3049
3050/*
3051 * Must be called with the queue_lock held.
3052 */
3053static int cfqq_process_refs(struct cfq_queue *cfqq)
3054{
3055        int process_refs, io_refs;
3056
3057        io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
3058        process_refs = cfqq->ref - io_refs;
3059        BUG_ON(process_refs < 0);
3060        return process_refs;
3061}
3062
3063static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
3064{
3065        int process_refs, new_process_refs;
3066        struct cfq_queue *__cfqq;
3067
3068        /*
3069         * If there are no process references on the new_cfqq, then it is
3070         * unsafe to follow the ->new_cfqq chain as other cfqq's in the
3071         * chain may have dropped their last reference (not just their
3072         * last process reference).
3073         */
3074        if (!cfqq_process_refs(new_cfqq))
3075                return;
3076
3077        /* Avoid a circular list and skip interim queue merges */
3078        while ((__cfqq = new_cfqq->new_cfqq)) {
3079                if (__cfqq == cfqq)
3080                        return;
3081                new_cfqq = __cfqq;
3082        }
3083
3084        process_refs = cfqq_process_refs(cfqq);
3085        new_process_refs = cfqq_process_refs(new_cfqq);
3086        /*
3087         * If the process for the cfqq has gone away, there is no
3088         * sense in merging the queues.
3089         */
3090        if (process_refs == 0 || new_process_refs == 0)
3091                return;
3092
3093        /*
3094         * Merge in the direction of the lesser amount of work.
3095         */
3096        if (new_process_refs >= process_refs) {
3097                cfqq->new_cfqq = new_cfqq;
3098                new_cfqq->ref += process_refs;
3099        } else {
3100                new_cfqq->new_cfqq = cfqq;
3101                cfqq->ref += new_process_refs;
3102        }
3103}
3104
3105static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
3106                        struct cfq_group *cfqg, enum wl_class_t wl_class)
3107{
3108        struct cfq_queue *queue;
3109        int i;
3110        bool key_valid = false;
3111        u64 lowest_key = 0;
3112        enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
3113
3114        for (i = 0; i <= SYNC_WORKLOAD; ++i) {
3115                /* select the one with lowest rb_key */
3116                queue = cfq_rb_first(st_for(cfqg, wl_class, i));
3117                if (queue &&
3118                    (!key_valid || queue->rb_key < lowest_key)) {
3119                        lowest_key = queue->rb_key;
3120                        cur_best = i;
3121                        key_valid = true;
3122                }
3123        }
3124
3125        return cur_best;
3126}
3127
3128static void
3129choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
3130{
3131        u64 slice;
3132        unsigned count;
3133        struct cfq_rb_root *st;
3134        u64 group_slice;
3135        enum wl_class_t original_class = cfqd->serving_wl_class;
3136        u64 now = ktime_get_ns();
3137
3138        /* Choose next priority. RT > BE > IDLE */
3139        if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
3140                cfqd->serving_wl_class = RT_WORKLOAD;
3141        else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
3142                cfqd->serving_wl_class = BE_WORKLOAD;
3143        else {
3144                cfqd->serving_wl_class = IDLE_WORKLOAD;
3145                cfqd->workload_expires = now + jiffies_to_nsecs(1);
3146                return;
3147        }
3148
3149        if (original_class != cfqd->serving_wl_class)
3150                goto new_workload;
3151
3152        /*
3153         * For RT and BE, we have to choose also the type
3154         * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
3155         * expiration time
3156         */
3157        st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
3158        count = st->count;
3159
3160        /*
3161         * check workload expiration, and that we still have other queues ready
3162         */
3163        if (count && !(now > cfqd->workload_expires))
3164                return;
3165
3166new_workload:
3167        /* otherwise select new workload type */
3168        cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
3169                                        cfqd->serving_wl_class);
3170        st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
3171        count = st->count;
3172
3173        /*
3174         * the workload slice is computed as a fraction of target latency
3175         * proportional to the number of queues in that workload, over
3176         * all the queues in the same priority class
3177         */
3178        group_slice = cfq_group_slice(cfqd, cfqg);
3179
3180        slice = div_u64(group_slice * count,
3181                max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
3182                      cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
3183                                        cfqg)));
3184
3185        if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
3186                u64 tmp;
3187
3188                /*
3189                 * Async queues are currently system wide. Just taking
3190                 * proportion of queues with-in same group will lead to higher
3191                 * async ratio system wide as generally root group is going
3192                 * to have higher weight. A more accurate thing would be to
3193                 * calculate system wide asnc/sync ratio.
3194                 */
3195                tmp = cfqd->cfq_target_latency *
3196                        cfqg_busy_async_queues(cfqd, cfqg);
3197                tmp = div_u64(tmp, cfqd->busy_queues);
3198                slice = min_t(u64, slice, tmp);
3199
3200                /* async workload slice is scaled down according to
3201                 * the sync/async slice ratio. */
3202                slice = div64_u64(slice*cfqd->cfq_slice[0], cfqd->cfq_slice[1]);
3203        } else
3204                /* sync workload slice is at least 2 * cfq_slice_idle */
3205                slice = max(slice, 2 * cfqd->cfq_slice_idle);
3206
3207        slice = max_t(u64, slice, CFQ_MIN_TT);
3208        cfq_log(cfqd, "workload slice:%llu", slice);
3209        cfqd->workload_expires = now + slice;
3210}
3211
3212static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
3213{
3214        struct cfq_rb_root *st = &cfqd->grp_service_tree;
3215        struct cfq_group *cfqg;
3216
3217        if (RB_EMPTY_ROOT(&st->rb))
3218                return NULL;
3219        cfqg = cfq_rb_first_group(st);
3220        update_min_vdisktime(st);
3221        return cfqg;
3222}
3223
3224static void cfq_choose_cfqg(struct cfq_data *cfqd)
3225{
3226        struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
3227        u64 now = ktime_get_ns();
3228
3229        cfqd->serving_group = cfqg;
3230
3231        /* Restore the workload type data */
3232        if (cfqg->saved_wl_slice) {
3233                cfqd->workload_expires = now + cfqg->saved_wl_slice;
3234                cfqd->serving_wl_type = cfqg->saved_wl_type;
3235                cfqd->serving_wl_class = cfqg->saved_wl_class;
3236        } else
3237                cfqd->workload_expires = now - 1;
3238
3239        choose_wl_class_and_type(cfqd, cfqg);
3240}
3241
3242/*
3243 * Select a queue for service. If we have a current active queue,
3244 * check whether to continue servicing it, or retrieve and set a new one.
3245 */
3246static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
3247{
3248        struct cfq_queue *cfqq, *new_cfqq = NULL;
3249        u64 now = ktime_get_ns();
3250
3251        cfqq = cfqd->active_queue;
3252        if (!cfqq)
3253                goto new_queue;
3254
3255        if (!cfqd->rq_queued)
3256                return NULL;
3257
3258        /*
3259         * We were waiting for group to get backlogged. Expire the queue
3260         */
3261        if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
3262                goto expire;
3263
3264        /*
3265         * The active queue has run out of time, expire it and select new.
3266         */
3267        if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
3268                /*
3269                 * If slice had not expired at the completion of last request
3270                 * we might not have turned on wait_busy flag. Don't expire
3271                 * the queue yet. Allow the group to get backlogged.
3272                 *
3273                 * The very fact that we have used the slice, that means we
3274                 * have been idling all along on this queue and it should be
3275                 * ok to wait for this request to complete.
3276                 */
3277                if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
3278                    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
3279                        cfqq = NULL;
3280                        goto keep_queue;
3281                } else
3282                        goto check_group_idle;
3283        }
3284
3285        /*
3286         * The active queue has requests and isn't expired, allow it to
3287         * dispatch.
3288         */
3289        if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3290                goto keep_queue;
3291
3292        /*
3293         * If another queue has a request waiting within our mean seek
3294         * distance, let it run.  The expire code will check for close
3295         * cooperators and put the close queue at the front of the service
3296         * tree.  If possible, merge the expiring queue with the new cfqq.
3297         */
3298        new_cfqq = cfq_close_cooperator(cfqd, cfqq);
3299        if (new_cfqq) {
3300                if (!cfqq->new_cfqq)
3301                        cfq_setup_merge(cfqq, new_cfqq);
3302                goto expire;
3303        }
3304
3305        /*
3306         * No requests pending. If the active queue still has requests in
3307         * flight or is idling for a new request, allow either of these
3308         * conditions to happen (or time out) before selecting a new queue.
3309         */
3310        if (hrtimer_active(&cfqd->idle_slice_timer)) {
3311                cfqq = NULL;
3312                goto keep_queue;
3313        }
3314
3315        /*
3316         * This is a deep seek queue, but the device is much faster than
3317         * the queue can deliver, don't idle
3318         **/
3319        if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
3320            (cfq_cfqq_slice_new(cfqq) ||
3321            (cfqq->slice_end - now > now - cfqq->slice_start))) {
3322                cfq_clear_cfqq_deep(cfqq);
3323                cfq_clear_cfqq_idle_window(cfqq);
3324        }
3325
3326        if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
3327                cfqq = NULL;
3328                goto keep_queue;
3329        }
3330
3331        /*
3332         * If group idle is enabled and there are requests dispatched from
3333         * this group, wait for requests to complete.
3334         */
3335check_group_idle:
3336        if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
3337            cfqq->cfqg->dispatched &&
3338            !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
3339                cfqq = NULL;
3340                goto keep_queue;
3341        }
3342
3343expire:
3344        cfq_slice_expired(cfqd, 0);
3345new_queue:
3346        /*
3347         * Current queue expired. Check if we have to switch to a new
3348         * service tree
3349         */
3350        if (!new_cfqq)
3351                cfq_choose_cfqg(cfqd);
3352
3353        cfqq = cfq_set_active_queue(cfqd, new_cfqq);
3354keep_queue:
3355        return cfqq;
3356}
3357
3358static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
3359{
3360        int dispatched = 0;
3361
3362        while (cfqq->next_rq) {
3363                cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
3364                dispatched++;
3365        }
3366
3367        BUG_ON(!list_empty(&cfqq->fifo));
3368
3369        /* By default cfqq is not expired if it is empty. Do it explicitly */
3370        __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
3371        return dispatched;
3372}
3373
3374/*
3375 * Drain our current requests. Used for barriers and when switching
3376 * io schedulers on-the-fly.
3377 */
3378static int cfq_forced_dispatch(struct cfq_data *cfqd)
3379{
3380        struct cfq_queue *cfqq;
3381        int dispatched = 0;
3382
3383        /* Expire the timeslice of the current active queue first */
3384        cfq_slice_expired(cfqd, 0);
3385        while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
3386                __cfq_set_active_queue(cfqd, cfqq);
3387                dispatched += __cfq_forced_dispatch_cfqq(cfqq);
3388        }
3389
3390        BUG_ON(cfqd->busy_queues);
3391
3392        cfq_log(cfqd, "forced_dispatch=%d", dispatched);
3393        return dispatched;
3394}
3395
3396static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
3397        struct cfq_queue *cfqq)
3398{
3399        u64 now = ktime_get_ns();
3400
3401        /* the queue hasn't finished any request, can't estimate */
3402        if (cfq_cfqq_slice_new(cfqq))
3403                return true;
3404        if (now + cfqd->cfq_slice_idle * cfqq->dispatched > cfqq->slice_end)
3405                return true;
3406
3407        return false;
3408}
3409
3410static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3411{
3412        unsigned int max_dispatch;
3413
3414        if (cfq_cfqq_must_dispatch(cfqq))
3415                return true;
3416
3417        /*
3418         * Drain async requests before we start sync IO
3419         */
3420        if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
3421                return false;
3422
3423        /*
3424         * If this is an async queue and we have sync IO in flight, let it wait
3425         */
3426        if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
3427                return false;
3428
3429        max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
3430        if (cfq_class_idle(cfqq))
3431                max_dispatch = 1;
3432
3433        /*
3434         * Does this cfqq already have too much IO in flight?
3435         */
3436        if (cfqq->dispatched >= max_dispatch) {
3437                bool promote_sync = false;
3438                /*
3439                 * idle queue must always only have a single IO in flight
3440                 */
3441                if (cfq_class_idle(cfqq))
3442                        return false;
3443
3444                /*
3445                 * If there is only one sync queue
3446                 * we can ignore async queue here and give the sync
3447                 * queue no dispatch limit. The reason is a sync queue can
3448                 * preempt async queue, limiting the sync queue doesn't make
3449                 * sense. This is useful for aiostress test.
3450                 */
3451                if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
3452                        promote_sync = true;
3453
3454                /*
3455                 * We have other queues, don't allow more IO from this one
3456                 */
3457                if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
3458                                !promote_sync)
3459                        return false;
3460
3461                /*
3462                 * Sole queue user, no limit
3463                 */
3464                if (cfqd->busy_queues == 1 || promote_sync)
3465                        max_dispatch = -1;
3466                else
3467                        /*
3468                         * Normally we start throttling cfqq when cfq_quantum/2
3469                         * requests have been dispatched. But we can drive
3470                         * deeper queue depths at the beginning of slice
3471                         * subjected to upper limit of cfq_quantum.
3472                         * */
3473                        max_dispatch = cfqd->cfq_quantum;
3474        }
3475
3476        /*
3477         * Async queues must wait a bit before being allowed dispatch.
3478         * We also ramp up the dispatch depth gradually for async IO,
3479         * based on the last sync IO we serviced
3480         */
3481        if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
3482                u64 last_sync = ktime_get_ns() - cfqd->last_delayed_sync;
3483                unsigned int depth;
3484
3485                depth = div64_u64(last_sync, cfqd->cfq_slice[1]);
3486                if (!depth && !cfqq->dispatched)
3487                        depth = 1;
3488                if (depth < max_dispatch)
3489                        max_dispatch = depth;
3490        }
3491
3492        /*
3493         * If we're below the current max, allow a dispatch
3494         */
3495        return cfqq->dispatched < max_dispatch;
3496}
3497
3498/*
3499 * Dispatch a request from cfqq, moving them to the request queue
3500 * dispatch list.
3501 */
3502static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3503{
3504        struct request *rq;
3505
3506        BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
3507
3508        rq = cfq_check_fifo(cfqq);
3509        if (rq)
3510                cfq_mark_cfqq_must_dispatch(cfqq);
3511
3512        if (!cfq_may_dispatch(cfqd, cfqq))
3513                return false;
3514
3515        /*
3516         * follow expired path, else get first next available
3517         */
3518        if (!rq)
3519                rq = cfqq->next_rq;
3520        else
3521                cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
3522
3523        /*
3524         * insert request into driver dispatch list
3525         */
3526        cfq_dispatch_insert(cfqd->queue, rq);
3527
3528        if (!cfqd->active_cic) {
3529                struct cfq_io_cq *cic = RQ_CIC(rq);
3530
3531                atomic_long_inc(&cic->icq.ioc->refcount);
3532                cfqd->active_cic = cic;
3533        }
3534
3535        return true;
3536}
3537
3538/*
3539 * Find the cfqq that we need to service and move a request from that to the
3540 * dispatch list
3541 */
3542static int cfq_dispatch_requests(struct request_queue *q, int force)
3543{
3544        struct cfq_data *cfqd = q->elevator->elevator_data;
3545        struct cfq_queue *cfqq;
3546
3547        if (!cfqd->busy_queues)
3548                return 0;
3549
3550        if (unlikely(force))
3551                return cfq_forced_dispatch(cfqd);
3552
3553        cfqq = cfq_select_queue(cfqd);
3554        if (!cfqq)
3555                return 0;
3556
3557        /*
3558         * Dispatch a request from this cfqq, if it is allowed
3559         */
3560        if (!cfq_dispatch_request(cfqd, cfqq))
3561                return 0;
3562
3563        cfqq->slice_dispatch++;
3564        cfq_clear_cfqq_must_dispatch(cfqq);
3565
3566        /*
3567         * expire an async queue immediately if it has used up its slice. idle
3568         * queue always expire after 1 dispatch round.
3569         */
3570        if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
3571            cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
3572            cfq_class_idle(cfqq))) {
3573                cfqq->slice_end = ktime_get_ns() + 1;
3574                cfq_slice_expired(cfqd, 0);
3575        }
3576
3577        cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
3578        return 1;
3579}
3580
3581/*
3582 * task holds one reference to the queue, dropped when task exits. each rq
3583 * in-flight on this queue also holds a reference, dropped when rq is freed.
3584 *
3585 * Each cfq queue took a reference on the parent group. Drop it now.
3586 * queue lock must be held here.
3587 */
3588static void cfq_put_queue(struct cfq_queue *cfqq)
3589{
3590        struct cfq_data *cfqd = cfqq->cfqd;
3591        struct cfq_group *cfqg;
3592
3593        BUG_ON(cfqq->ref <= 0);
3594
3595        cfqq->ref--;
3596        if (cfqq->ref)
3597                return;
3598
3599        cfq_log_cfqq(cfqd, cfqq, "put_queue");
3600        BUG_ON(rb_first(&cfqq->sort_list));
3601        BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
3602        cfqg = cfqq->cfqg;
3603
3604        if (unlikely(cfqd->active_queue == cfqq)) {
3605                __cfq_slice_expired(cfqd, cfqq, 0);
3606                cfq_schedule_dispatch(cfqd);
3607        }
3608
3609        BUG_ON(cfq_cfqq_on_rr(cfqq));
3610        kmem_cache_free(cfq_pool, cfqq);
3611        cfqg_put(cfqg);
3612}
3613
3614static void cfq_put_cooperator(struct cfq_queue *cfqq)
3615{
3616        struct cfq_queue *__cfqq, *next;
3617
3618        /*
3619         * If this queue was scheduled to merge with another queue, be
3620         * sure to drop the reference taken on that queue (and others in
3621         * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
3622         */
3623        __cfqq = cfqq->new_cfqq;
3624        while (__cfqq) {
3625                if (__cfqq == cfqq) {
3626                        WARN(1, "cfqq->new_cfqq loop detected\n");
3627                        break;
3628                }
3629                next = __cfqq->new_cfqq;
3630                cfq_put_queue(__cfqq);
3631                __cfqq = next;
3632        }
3633}
3634
3635static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3636{
3637        if (unlikely(cfqq == cfqd->active_queue)) {
3638                __cfq_slice_expired(cfqd, cfqq, 0);
3639                cfq_schedule_dispatch(cfqd);
3640        }
3641
3642        cfq_put_cooperator(cfqq);
3643
3644        cfq_put_queue(cfqq);
3645}
3646
3647static void cfq_init_icq(struct io_cq *icq)
3648{
3649        struct cfq_io_cq *cic = icq_to_cic(icq);
3650
3651        cic->ttime.last_end_request = ktime_get_ns();
3652}
3653
3654static void cfq_exit_icq(struct io_cq *icq)
3655{
3656        struct cfq_io_cq *cic = icq_to_cic(icq);
3657        struct cfq_data *cfqd = cic_to_cfqd(cic);
3658
3659        if (cic_to_cfqq(cic, false)) {
3660                cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false));
3661                cic_set_cfqq(cic, NULL, false);
3662        }
3663
3664        if (cic_to_cfqq(cic, true)) {
3665                cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true));
3666                cic_set_cfqq(cic, NULL, true);
3667        }
3668}
3669
3670static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3671{
3672        struct task_struct *tsk = current;
3673        int ioprio_class;
3674
3675        if (!cfq_cfqq_prio_changed(cfqq))
3676                return;
3677
3678        ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3679        switch (ioprio_class) {
3680        default:
3681                printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
3682        case IOPRIO_CLASS_NONE:
3683                /*
3684                 * no prio set, inherit CPU scheduling settings
3685                 */
3686                cfqq->ioprio = task_nice_ioprio(tsk);
3687                cfqq->ioprio_class = task_nice_ioclass(tsk);
3688                break;
3689        case IOPRIO_CLASS_RT:
3690                cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3691                cfqq->ioprio_class = IOPRIO_CLASS_RT;
3692                break;
3693        case IOPRIO_CLASS_BE:
3694                cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3695                cfqq->ioprio_class = IOPRIO_CLASS_BE;
3696                break;
3697        case IOPRIO_CLASS_IDLE:
3698                cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
3699                cfqq->ioprio = 7;
3700                cfq_clear_cfqq_idle_window(cfqq);
3701                break;
3702        }
3703
3704        /*
3705         * keep track of original prio settings in case we have to temporarily
3706         * elevate the priority of this queue
3707         */
3708        cfqq->org_ioprio = cfqq->ioprio;
3709        cfqq->org_ioprio_class = cfqq->ioprio_class;
3710        cfq_clear_cfqq_prio_changed(cfqq);
3711}
3712
3713static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
3714{
3715        int ioprio = cic->icq.ioc->ioprio;
3716        struct cfq_data *cfqd = cic_to_cfqd(cic);
3717        struct cfq_queue *cfqq;
3718
3719        /*
3720         * Check whether ioprio has changed.  The condition may trigger
3721         * spuriously on a newly created cic but there's no harm.
3722         */
3723        if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
3724                return;
3725
3726        cfqq = cic_to_cfqq(cic, false);
3727        if (cfqq) {
3728                cfq_put_queue(cfqq);
3729                cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
3730                cic_set_cfqq(cic, cfqq, false);
3731        }
3732
3733        cfqq = cic_to_cfqq(cic, true);
3734        if (cfqq)
3735                cfq_mark_cfqq_prio_changed(cfqq);
3736
3737        cic->ioprio = ioprio;
3738}
3739
3740static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3741                          pid_t pid, bool is_sync)
3742{
3743        RB_CLEAR_NODE(&cfqq->rb_node);
3744        RB_CLEAR_NODE(&cfqq->p_node);
3745        INIT_LIST_HEAD(&cfqq->fifo);
3746
3747        cfqq->ref = 0;
3748        cfqq->cfqd = cfqd;
3749
3750        cfq_mark_cfqq_prio_changed(cfqq);
3751
3752        if (is_sync) {
3753                if (!cfq_class_idle(cfqq))
3754                        cfq_mark_cfqq_idle_window(cfqq);
3755                cfq_mark_cfqq_sync(cfqq);
3756        }
3757        cfqq->pid = pid;
3758}
3759
3760#ifdef CONFIG_CFQ_GROUP_IOSCHED
3761static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3762{
3763        struct cfq_data *cfqd = cic_to_cfqd(cic);
3764        struct cfq_queue *cfqq;
3765        uint64_t serial_nr;
3766        bool nonroot_cg;
3767
3768        rcu_read_lock();
3769        serial_nr = bio_blkcg(bio)->css.serial_nr;
3770        nonroot_cg = bio_blkcg(bio) != &blkcg_root;
3771        rcu_read_unlock();
3772
3773        /*
3774         * Check whether blkcg has changed.  The condition may trigger
3775         * spuriously on a newly created cic but there's no harm.
3776         */
3777        if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
3778                return nonroot_cg;
3779
3780        /*
3781         * Drop reference to queues.  New queues will be assigned in new
3782         * group upon arrival of fresh requests.
3783         */
3784        cfqq = cic_to_cfqq(cic, false);
3785        if (cfqq) {
3786                cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
3787                cic_set_cfqq(cic, NULL, false);
3788                cfq_put_queue(cfqq);
3789        }
3790
3791        cfqq = cic_to_cfqq(cic, true);
3792        if (cfqq) {
3793                cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
3794                cic_set_cfqq(cic, NULL, true);
3795                cfq_put_queue(cfqq);
3796        }
3797
3798        cic->blkcg_serial_nr = serial_nr;
3799        return nonroot_cg;
3800}
3801#else
3802static inline bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3803{
3804        return false;
3805}
3806#endif  /* CONFIG_CFQ_GROUP_IOSCHED */
3807
3808static struct cfq_queue **
3809cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio)
3810{
3811        switch (ioprio_class) {
3812        case IOPRIO_CLASS_RT:
3813                return &cfqg->async_cfqq[0][ioprio];
3814        case IOPRIO_CLASS_NONE:
3815                ioprio = IOPRIO_NORM;
3816                /* fall through */
3817        case IOPRIO_CLASS_BE:
3818                return &cfqg->async_cfqq[1][ioprio];
3819        case IOPRIO_CLASS_IDLE:
3820                return &cfqg->async_idle_cfqq;
3821        default:
3822                BUG();
3823        }
3824}
3825
3826static struct cfq_queue *
3827cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3828              struct bio *bio)
3829{
3830        int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3831        int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3832        struct cfq_queue **async_cfqq = NULL;
3833        struct cfq_queue *cfqq;
3834        struct cfq_group *cfqg;
3835
3836        rcu_read_lock();
3837        cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
3838        if (!cfqg) {
3839                cfqq = &cfqd->oom_cfqq;
3840                goto out;
3841        }
3842
3843        if (!is_sync) {
3844                if (!ioprio_valid(cic->ioprio)) {
3845                        struct task_struct *tsk = current;
3846                        ioprio = task_nice_ioprio(tsk);
3847                        ioprio_class = task_nice_ioclass(tsk);
3848                }
3849                async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio);
3850                cfqq = *async_cfqq;
3851                if (cfqq)
3852                        goto out;
3853        }
3854
3855        cfqq = kmem_cache_alloc_node(cfq_pool,
3856                                     GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
3857                                     cfqd->queue->node);
3858        if (!cfqq) {
3859                cfqq = &cfqd->oom_cfqq;
3860                goto out;
3861        }
3862
3863        cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
3864        cfq_init_prio_data(cfqq, cic);
3865        cfq_link_cfqq_cfqg(cfqq, cfqg);
3866        cfq_log_cfqq(cfqd, cfqq, "alloced");
3867
3868        if (async_cfqq) {
3869                /* a new async queue is created, pin and remember */
3870                cfqq->ref++;
3871                *async_cfqq = cfqq;
3872        }
3873out:
3874        cfqq->ref++;
3875        rcu_read_unlock();
3876        return cfqq;
3877}
3878
3879static void
3880__cfq_update_io_thinktime(struct cfq_ttime *ttime, u64 slice_idle)
3881{
3882        u64 elapsed = ktime_get_ns() - ttime->last_end_request;
3883        elapsed = min(elapsed, 2UL * slice_idle);
3884
3885        ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
3886        ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed,  8);
3887        ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
3888                                     ttime->ttime_samples);
3889}
3890
3891static void
3892cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3893                        struct cfq_io_cq *cic)
3894{
3895        if (cfq_cfqq_sync(cfqq)) {
3896                __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
3897                __cfq_update_io_thinktime(&cfqq->service_tree->ttime,
3898                        cfqd->cfq_slice_idle);
3899        }
3900#ifdef CONFIG_CFQ_GROUP_IOSCHED
3901        __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
3902#endif
3903}
3904
3905static void
3906cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3907                       struct request *rq)
3908{
3909        sector_t sdist = 0;
3910        sector_t n_sec = blk_rq_sectors(rq);
3911        if (cfqq->last_request_pos) {
3912                if (cfqq->last_request_pos < blk_rq_pos(rq))
3913                        sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
3914                else
3915                        sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3916        }
3917
3918        cfqq->seek_history <<= 1;
3919        if (blk_queue_nonrot(cfqd->queue))
3920                cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3921        else
3922                cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3923}
3924
3925static inline bool req_noidle(struct request *req)
3926{
3927        return req_op(req) == REQ_OP_WRITE &&
3928                (req->cmd_flags & (REQ_SYNC | REQ_IDLE)) == REQ_SYNC;
3929}
3930
3931/*
3932 * Disable idle window if the process thinks too long or seeks so much that
3933 * it doesn't matter
3934 */
3935static void
3936cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3937                       struct cfq_io_cq *cic)
3938{
3939        int old_idle, enable_idle;
3940
3941        /*
3942         * Don't idle for async or idle io prio class
3943         */
3944        if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3945                return;
3946
3947        enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3948
3949        if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3950                cfq_mark_cfqq_deep(cfqq);
3951
3952        if (cfqq->next_rq && req_noidle(cfqq->next_rq))
3953                enable_idle = 0;
3954        else if (!atomic_read(&cic->icq.ioc->active_ref) ||
3955                 !cfqd->cfq_slice_idle ||
3956                 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3957                enable_idle = 0;
3958        else if (sample_valid(cic->ttime.ttime_samples)) {
3959                if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
3960                        enable_idle = 0;
3961                else
3962                        enable_idle = 1;
3963        }
3964
3965        if (old_idle != enable_idle) {
3966                cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3967                if (enable_idle)
3968                        cfq_mark_cfqq_idle_window(cfqq);
3969                else
3970                        cfq_clear_cfqq_idle_window(cfqq);
3971        }
3972}
3973
3974/*
3975 * Check if new_cfqq should preempt the currently active queue. Return 0 for
3976 * no or if we aren't sure, a 1 will cause a preempt.
3977 */
3978static bool
3979cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3980                   struct request *rq)
3981{
3982        struct cfq_queue *cfqq;
3983
3984        cfqq = cfqd->active_queue;
3985        if (!cfqq)
3986                return false;
3987
3988        if (cfq_class_idle(new_cfqq))
3989                return false;
3990
3991        if (cfq_class_idle(cfqq))
3992                return true;
3993
3994        /*
3995         * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3996         */
3997        if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3998                return false;
3999
4000        /*
4001         * if the new request is sync, but the currently running queue is
4002         * not, let the sync request have priority.
4003         */
4004        if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
4005                return true;
4006
4007        /*
4008         * Treat ancestors of current cgroup the same way as current cgroup.
4009         * For anybody else we disallow preemption to guarantee service
4010         * fairness among cgroups.
4011         */
4012        if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg))
4013                return false;
4014
4015        if (cfq_slice_used(cfqq))
4016                return true;
4017
4018        /*
4019         * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
4020         */
4021        if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
4022                return true;
4023
4024        WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class);
4025        /* Allow preemption only if we are idling on sync-noidle tree */
4026        if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
4027            cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
4028            RB_EMPTY_ROOT(&cfqq->sort_list))
4029                return true;
4030
4031        /*
4032         * So both queues are sync. Let the new request get disk time if
4033         * it's a metadata request and the current queue is doing regular IO.
4034         */
4035        if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
4036                return true;
4037
4038        /* An idle queue should not be idle now for some reason */
4039        if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
4040                return true;
4041
4042        if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
4043                return false;
4044
4045        /*
4046         * if this request is as-good as one we would expect from the
4047         * current cfqq, let it preempt
4048         */
4049        if (cfq_rq_close(cfqd, cfqq, rq))
4050                return true;
4051
4052        return false;
4053}
4054
4055/*
4056 * cfqq preempts the active queue. if we allowed preempt with no slice left,
4057 * let it have half of its nominal slice.
4058 */
4059static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
4060{
4061        enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
4062
4063        cfq_log_cfqq(cfqd, cfqq, "preempt");
4064        cfq_slice_expired(cfqd, 1);
4065
4066        /*
4067         * workload type is changed, don't save slice, otherwise preempt
4068         * doesn't happen
4069         */
4070        if (old_type != cfqq_type(cfqq))
4071                cfqq->cfqg->saved_wl_slice = 0;
4072
4073        /*
4074         * Put the new queue at the front of the of the current list,
4075         * so we know that it will be selected next.
4076         */
4077        BUG_ON(!cfq_cfqq_on_rr(cfqq));
4078
4079        cfq_service_tree_add(cfqd, cfqq, 1);
4080
4081        cfqq->slice_end = 0;
4082        cfq_mark_cfqq_slice_new(cfqq);
4083}
4084
4085/*
4086 * Called when a new fs request (rq) is added (to cfqq). Check if there's
4087 * something we should do about it
4088 */
4089static void
4090cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
4091                struct request *rq)
4092{
4093        struct cfq_io_cq *cic = RQ_CIC(rq);
4094
4095        cfqd->rq_queued++;
4096        if (rq->cmd_flags & REQ_PRIO)
4097                cfqq->prio_pending++;
4098
4099        cfq_update_io_thinktime(cfqd, cfqq, cic);
4100        cfq_update_io_seektime(cfqd, cfqq, rq);
4101        cfq_update_idle_window(cfqd, cfqq, cic);
4102
4103        cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
4104
4105        if (cfqq == cfqd->active_queue) {
4106                /*
4107                 * Remember that we saw a request from this process, but
4108                 * don't start queuing just yet. Otherwise we risk seeing lots
4109                 * of tiny requests, because we disrupt the normal plugging
4110                 * and merging. If the request is already larger than a single
4111                 * page, let it rip immediately. For that case we assume that
4112                 * merging is already done. Ditto for a busy system that
4113                 * has other work pending, don't risk delaying until the
4114                 * idle timer unplug to continue working.
4115                 */
4116                if (cfq_cfqq_wait_request(cfqq)) {
4117                        if (blk_rq_bytes(rq) > PAGE_SIZE ||
4118                            cfqd->busy_queues > 1) {
4119                                cfq_del_timer(cfqd, cfqq);
4120                                cfq_clear_cfqq_wait_request(cfqq);
4121                                __blk_run_queue(cfqd->queue);
4122                        } else {
4123                                cfqg_stats_update_idle_time(cfqq->cfqg);
4124                                cfq_mark_cfqq_must_dispatch(cfqq);
4125                        }
4126                }
4127        } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
4128                /*
4129                 * not the active queue - expire current slice if it is
4130                 * idle and has expired it's mean thinktime or this new queue
4131                 * has some old slice time left and is of higher priority or
4132                 * this new queue is RT and the current one is BE
4133                 */
4134                cfq_preempt_queue(cfqd, cfqq);
4135                __blk_run_queue(cfqd->queue);
4136        }
4137}
4138
4139static void cfq_insert_request(struct request_queue *q, struct request *rq)
4140{
4141        struct cfq_data *cfqd = q->elevator->elevator_data;
4142        struct cfq_queue *cfqq = RQ_CFQQ(rq);
4143
4144        cfq_log_cfqq(cfqd, cfqq, "insert_request");
4145        cfq_init_prio_data(cfqq, RQ_CIC(rq));
4146
4147        rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)];
4148        list_add_tail(&rq->queuelist, &cfqq->fifo);
4149        cfq_add_rq_rb(rq);
4150        cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
4151                                 rq->cmd_flags);
4152        cfq_rq_enqueued(cfqd, cfqq, rq);
4153}
4154
4155/*
4156 * Update hw_tag based on peak queue depth over 50 samples under
4157 * sufficient load.
4158 */
4159static void cfq_update_hw_tag(struct cfq_data *cfqd)
4160{
4161        struct cfq_queue *cfqq = cfqd->active_queue;
4162
4163        if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
4164                cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
4165
4166        if (cfqd->hw_tag == 1)
4167                return;
4168
4169        if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
4170            cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
4171                return;
4172
4173        /*
4174         * If active queue hasn't enough requests and can idle, cfq might not
4175         * dispatch sufficient requests to hardware. Don't zero hw_tag in this
4176         * case
4177         */
4178        if (cfqq && cfq_cfqq_idle_window(cfqq) &&
4179            cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
4180            CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
4181                return;
4182
4183        if (cfqd->hw_tag_samples++ < 50)
4184                return;
4185
4186        if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
4187                cfqd->hw_tag = 1;
4188        else
4189                cfqd->hw_tag = 0;
4190}
4191
4192static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
4193{
4194        struct cfq_io_cq *cic = cfqd->active_cic;
4195        u64 now = ktime_get_ns();
4196
4197        /* If the queue already has requests, don't wait */
4198        if (!RB_EMPTY_ROOT(&cfqq->sort_list))
4199                return false;
4200
4201        /* If there are other queues in the group, don't wait */
4202        if (cfqq->cfqg->nr_cfqq > 1)
4203                return false;
4204
4205        /* the only queue in the group, but think time is big */
4206        if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
4207                return false;
4208
4209        if (cfq_slice_used(cfqq))
4210                return true;
4211
4212        /* if slice left is less than think time, wait busy */
4213        if (cic && sample_valid(cic->ttime.ttime_samples)
4214            && (cfqq->slice_end - now < cic->ttime.ttime_mean))
4215                return true;
4216
4217        /*
4218         * If think times is less than a jiffy than ttime_mean=0 and above
4219         * will not be true. It might happen that slice has not expired yet
4220         * but will expire soon (4-5 ns) during select_queue(). To cover the
4221         * case where think time is less than a jiffy, mark the queue wait
4222         * busy if only 1 jiffy is left in the slice.
4223         */
4224        if (cfqq->slice_end - now <= jiffies_to_nsecs(1))
4225                return true;
4226
4227        return false;
4228}
4229
4230static void cfq_completed_request(struct request_queue *q, struct request *rq)
4231{
4232        struct cfq_queue *cfqq = RQ_CFQQ(rq);
4233        struct cfq_data *cfqd = cfqq->cfqd;
4234        const int sync = rq_is_sync(rq);
4235        u64 now = ktime_get_ns();
4236
4237        cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", req_noidle(rq));
4238
4239        cfq_update_hw_tag(cfqd);
4240
4241        WARN_ON(!cfqd->rq_in_driver);
4242        WARN_ON(!cfqq->dispatched);
4243        cfqd->rq_in_driver--;
4244        cfqq->dispatched--;
4245        (RQ_CFQG(rq))->dispatched--;
4246        cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
4247                                     rq_io_start_time_ns(rq), rq->cmd_flags);
4248
4249        cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
4250
4251        if (sync) {
4252                struct cfq_rb_root *st;
4253
4254                RQ_CIC(rq)->ttime.last_end_request = now;
4255
4256                if (cfq_cfqq_on_rr(cfqq))
4257                        st = cfqq->service_tree;
4258                else
4259                        st = st_for(cfqq->cfqg, cfqq_class(cfqq),
4260                                        cfqq_type(cfqq));
4261
4262                st->ttime.last_end_request = now;
4263                /*
4264                 * We have to do this check in jiffies since start_time is in
4265                 * jiffies and it is not trivial to convert to ns. If
4266                 * cfq_fifo_expire[1] ever comes close to 1 jiffie, this test
4267                 * will become problematic but so far we are fine (the default
4268                 * is 128 ms).
4269                 */
4270                if (!time_after(rq->start_time +
4271                                  nsecs_to_jiffies(cfqd->cfq_fifo_expire[1]),
4272                                jiffies))
4273                        cfqd->last_delayed_sync = now;
4274        }
4275
4276#ifdef CONFIG_CFQ_GROUP_IOSCHED
4277        cfqq->cfqg->ttime.last_end_request = now;
4278#endif
4279
4280        /*
4281         * If this is the active queue, check if it needs to be expired,
4282         * or if we want to idle in case it has no pending requests.
4283         */
4284        if (cfqd->active_queue == cfqq) {
4285                const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
4286
4287                if (cfq_cfqq_slice_new(cfqq)) {
4288                        cfq_set_prio_slice(cfqd, cfqq);
4289                        cfq_clear_cfqq_slice_new(cfqq);
4290                }
4291
4292                /*
4293                 * Should we wait for next request to come in before we expire
4294                 * the queue.
4295                 */
4296                if (cfq_should_wait_busy(cfqd, cfqq)) {
4297                        u64 extend_sl = cfqd->cfq_slice_idle;
4298                        if (!cfqd->cfq_slice_idle)
4299                                extend_sl = cfqd->cfq_group_idle;
4300                        cfqq->slice_end = now + extend_sl;
4301                        cfq_mark_cfqq_wait_busy(cfqq);
4302                        cfq_log_cfqq(cfqd, cfqq, "will busy wait");
4303                }
4304
4305                /*
4306                 * Idling is not enabled on:
4307                 * - expired queues
4308                 * - idle-priority queues
4309                 * - async queues
4310                 * - queues with still some requests queued
4311                 * - when there is a close cooperator
4312                 */
4313                if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
4314                        cfq_slice_expired(cfqd, 1);
4315                else if (sync && cfqq_empty &&
4316                         !cfq_close_cooperator(cfqd, cfqq)) {
4317                        cfq_arm_slice_timer(cfqd);
4318                }
4319        }
4320
4321        if (!cfqd->rq_in_driver)
4322                cfq_schedule_dispatch(cfqd);
4323}
4324
4325static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op)
4326{
4327        /*
4328         * If REQ_PRIO is set, boost class and prio level, if it's below
4329         * BE/NORM. If prio is not set, restore the potentially boosted
4330         * class/prio level.
4331         */
4332        if (!(op & REQ_PRIO)) {
4333                cfqq->ioprio_class = cfqq->org_ioprio_class;
4334                cfqq->ioprio = cfqq->org_ioprio;
4335        } else {
4336                if (cfq_class_idle(cfqq))
4337                        cfqq->ioprio_class = IOPRIO_CLASS_BE;
4338                if (cfqq->ioprio > IOPRIO_NORM)
4339                        cfqq->ioprio = IOPRIO_NORM;
4340        }
4341}
4342
4343static inline int __cfq_may_queue(struct cfq_queue *cfqq)
4344{
4345        if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
4346                cfq_mark_cfqq_must_alloc_slice(cfqq);
4347                return ELV_MQUEUE_MUST;
4348        }
4349
4350        return ELV_MQUEUE_MAY;
4351}
4352
4353static int cfq_may_queue(struct request_queue *q, unsigned int op)
4354{
4355        struct cfq_data *cfqd = q->elevator->elevator_data;
4356        struct task_struct *tsk = current;
4357        struct cfq_io_cq *cic;
4358        struct cfq_queue *cfqq;
4359
4360        /*
4361         * don't force setup of a queue from here, as a call to may_queue
4362         * does not necessarily imply that a request actually will be queued.
4363         * so just lookup a possibly existing queue, or return 'may queue'
4364         * if that fails
4365         */
4366        cic = cfq_cic_lookup(cfqd, tsk->io_context);
4367        if (!cic)
4368                return ELV_MQUEUE_MAY;
4369
4370        cfqq = cic_to_cfqq(cic, op_is_sync(op));
4371        if (cfqq) {
4372                cfq_init_prio_data(cfqq, cic);
4373                cfqq_boost_on_prio(cfqq, op);
4374
4375                return __cfq_may_queue(cfqq);
4376        }
4377
4378        return ELV_MQUEUE_MAY;
4379}
4380
4381/*
4382 * queue lock held here
4383 */
4384static void cfq_put_request(struct request *rq)
4385{
4386        struct cfq_queue *cfqq = RQ_CFQQ(rq);
4387
4388        if (cfqq) {
4389                const int rw = rq_data_dir(rq);
4390
4391                BUG_ON(!cfqq->allocated[rw]);
4392                cfqq->allocated[rw]--;
4393
4394                /* Put down rq reference on cfqg */
4395                cfqg_put(RQ_CFQG(rq));
4396                rq->elv.priv[0] = NULL;
4397                rq->elv.priv[1] = NULL;
4398
4399                cfq_put_queue(cfqq);
4400        }
4401}
4402
4403static struct cfq_queue *
4404cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
4405                struct cfq_queue *cfqq)
4406{
4407        cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
4408        cic_set_cfqq(cic, cfqq->new_cfqq, 1);
4409        cfq_mark_cfqq_coop(cfqq->new_cfqq);
4410        cfq_put_queue(cfqq);
4411        return cic_to_cfqq(cic, 1);
4412}
4413
4414/*
4415 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
4416 * was the last process referring to said cfqq.
4417 */
4418static struct cfq_queue *
4419split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
4420{
4421        if (cfqq_process_refs(cfqq) == 1) {
4422                cfqq->pid = current->pid;
4423                cfq_clear_cfqq_coop(cfqq);
4424                cfq_clear_cfqq_split_coop(cfqq);
4425                return cfqq;
4426        }
4427
4428        cic_set_cfqq(cic, NULL, 1);
4429
4430        cfq_put_cooperator(cfqq);
4431
4432        cfq_put_queue(cfqq);
4433        return NULL;
4434}
4435/*
4436 * Allocate cfq data structures associated with this request.
4437 */
4438static int
4439cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
4440                gfp_t gfp_mask)
4441{
4442        struct cfq_data *cfqd = q->elevator->elevator_data;
4443        struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
4444        const int rw = rq_data_dir(rq);
4445        const bool is_sync = rq_is_sync(rq);
4446        struct cfq_queue *cfqq;
4447        bool disable_wbt;
4448
4449        spin_lock_irq(q->queue_lock);
4450
4451        check_ioprio_changed(cic, bio);
4452        disable_wbt = check_blkcg_changed(cic, bio);
4453new_queue:
4454        cfqq = cic_to_cfqq(cic, is_sync);
4455        if (!cfqq || cfqq == &cfqd->oom_cfqq) {
4456                if (cfqq)
4457                        cfq_put_queue(cfqq);
4458                cfqq = cfq_get_queue(cfqd, is_sync, cic, bio);
4459                cic_set_cfqq(cic, cfqq, is_sync);
4460        } else {
4461                /*
4462                 * If the queue was seeky for too long, break it apart.
4463                 */
4464                if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
4465                        cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
4466                        cfqq = split_cfqq(cic, cfqq);
4467                        if (!cfqq)
4468                                goto new_queue;
4469                }
4470
4471                /*
4472                 * Check to see if this queue is scheduled to merge with
4473                 * another, closely cooperating queue.  The merging of
4474                 * queues happens here as it must be done in process context.
4475                 * The reference on new_cfqq was taken in merge_cfqqs.
4476                 */
4477                if (cfqq->new_cfqq)
4478                        cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
4479        }
4480
4481        cfqq->allocated[rw]++;
4482
4483        cfqq->ref++;
4484        cfqg_get(cfqq->cfqg);
4485        rq->elv.priv[0] = cfqq;
4486        rq->elv.priv[1] = cfqq->cfqg;
4487        spin_unlock_irq(q->queue_lock);
4488
4489        if (disable_wbt)
4490                wbt_disable_default(q);
4491
4492        return 0;
4493}
4494
4495static void cfq_kick_queue(struct work_struct *work)
4496{
4497        struct cfq_data *cfqd =
4498                container_of(work, struct cfq_data, unplug_work);
4499        struct request_queue *q = cfqd->queue;
4500
4501        spin_lock_irq(q->queue_lock);
4502        __blk_run_queue(cfqd->queue);
4503        spin_unlock_irq(q->queue_lock);
4504}
4505
4506/*
4507 * Timer running if the active_queue is currently idling inside its time slice
4508 */
4509static enum hrtimer_restart cfq_idle_slice_timer(struct hrtimer *timer)
4510{
4511        struct cfq_data *cfqd = container_of(timer, struct cfq_data,
4512                                             idle_slice_timer);
4513        struct cfq_queue *cfqq;
4514        unsigned long flags;
4515        int timed_out = 1;
4516
4517        cfq_log(cfqd, "idle timer fired");
4518
4519        spin_lock_irqsave(cfqd->queue->queue_lock, flags);
4520
4521        cfqq = cfqd->active_queue;
4522        if (cfqq) {
4523                timed_out = 0;
4524
4525                /*
4526                 * We saw a request before the queue expired, let it through
4527                 */
4528                if (cfq_cfqq_must_dispatch(cfqq))
4529                        goto out_kick;
4530
4531                /*
4532                 * expired
4533                 */
4534                if (cfq_slice_used(cfqq))
4535                        goto expire;
4536
4537                /*
4538                 * only expire and reinvoke request handler, if there are
4539                 * other queues with pending requests
4540                 */
4541                if (!cfqd->busy_queues)
4542                        goto out_cont;
4543
4544                /*
4545                 * not expired and it has a request pending, let it dispatch
4546                 */
4547                if (!RB_EMPTY_ROOT(&cfqq->sort_list))
4548                        goto out_kick;
4549
4550                /*
4551                 * Queue depth flag is reset only when the idle didn't succeed
4552                 */
4553                cfq_clear_cfqq_deep(cfqq);
4554        }
4555expire:
4556        cfq_slice_expired(cfqd, timed_out);
4557out_kick:
4558        cfq_schedule_dispatch(cfqd);
4559out_cont:
4560        spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
4561        return HRTIMER_NORESTART;
4562}
4563
4564static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
4565{
4566        hrtimer_cancel(&cfqd->idle_slice_timer);
4567        cancel_work_sync(&cfqd->unplug_work);
4568}
4569
4570static void cfq_exit_queue(struct elevator_queue *e)
4571{
4572        struct cfq_data *cfqd = e->elevator_data;
4573        struct request_queue *q = cfqd->queue;
4574
4575        cfq_shutdown_timer_wq(cfqd);
4576
4577        spin_lock_irq(q->queue_lock);
4578
4579        if (cfqd->active_queue)
4580                __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
4581
4582        spin_unlock_irq(q->queue_lock);
4583
4584        cfq_shutdown_timer_wq(cfqd);
4585
4586#ifdef CONFIG_CFQ_GROUP_IOSCHED
4587        blkcg_deactivate_policy(q, &blkcg_policy_cfq);
4588#else
4589        kfree(cfqd->root_group);
4590#endif
4591        kfree(cfqd);
4592}
4593
4594static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
4595{
4596        struct cfq_data *cfqd;
4597        struct blkcg_gq *blkg __maybe_unused;
4598        int i, ret;
4599        struct elevator_queue *eq;
4600
4601        eq = elevator_alloc(q, e);
4602        if (!eq)
4603                return -ENOMEM;
4604
4605        cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
4606        if (!cfqd) {
4607                kobject_put(&eq->kobj);
4608                return -ENOMEM;
4609        }
4610        eq->elevator_data = cfqd;
4611
4612        cfqd->queue = q;
4613        spin_lock_irq(q->queue_lock);
4614        q->elevator = eq;
4615        spin_unlock_irq(q->queue_lock);
4616
4617        /* Init root service tree */
4618        cfqd->grp_service_tree = CFQ_RB_ROOT;
4619
4620        /* Init root group and prefer root group over other groups by default */
4621#ifdef CONFIG_CFQ_GROUP_IOSCHED
4622        ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
4623        if (ret)
4624                goto out_free;
4625
4626        cfqd->root_group = blkg_to_cfqg(q->root_blkg);
4627#else
4628        ret = -ENOMEM;
4629        cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
4630                                        GFP_KERNEL, cfqd->queue->node);
4631        if (!cfqd->root_group)
4632                goto out_free;
4633
4634        cfq_init_cfqg_base(cfqd->root_group);
4635        cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
4636        cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
4637#endif
4638
4639        /*
4640         * Not strictly needed (since RB_ROOT just clears the node and we
4641         * zeroed cfqd on alloc), but better be safe in case someone decides
4642         * to add magic to the rb code
4643         */
4644        for (i = 0; i < CFQ_PRIO_LISTS; i++)
4645                cfqd->prio_trees[i] = RB_ROOT;
4646
4647        /*
4648         * Our fallback cfqq if cfq_get_queue() runs into OOM issues.
4649         * Grab a permanent reference to it, so that the normal code flow
4650         * will not attempt to free it.  oom_cfqq is linked to root_group
4651         * but shouldn't hold a reference as it'll never be unlinked.  Lose
4652         * the reference from linking right away.
4653         */
4654        cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
4655        cfqd->oom_cfqq.ref++;
4656
4657        spin_lock_irq(q->queue_lock);
4658        cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
4659        cfqg_put(cfqd->root_group);
4660        spin_unlock_irq(q->queue_lock);
4661
4662        hrtimer_init(&cfqd->idle_slice_timer, CLOCK_MONOTONIC,
4663                     HRTIMER_MODE_REL);
4664        cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
4665
4666        INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
4667
4668        cfqd->cfq_quantum = cfq_quantum;
4669        cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
4670        cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
4671        cfqd->cfq_back_max = cfq_back_max;
4672        cfqd->cfq_back_penalty = cfq_back_penalty;
4673        cfqd->cfq_slice[0] = cfq_slice_async;
4674        cfqd->cfq_slice[1] = cfq_slice_sync;
4675        cfqd->cfq_target_latency = cfq_target_latency;
4676        cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
4677        cfqd->cfq_slice_idle = cfq_slice_idle;
4678        cfqd->cfq_group_idle = cfq_group_idle;
4679        cfqd->cfq_latency = 1;
4680        cfqd->hw_tag = -1;
4681        /*
4682         * we optimistically start assuming sync ops weren't delayed in last
4683         * second, in order to have larger depth for async operations.
4684         */
4685        cfqd->last_delayed_sync = ktime_get_ns() - NSEC_PER_SEC;
4686        return 0;
4687
4688out_free:
4689        kfree(cfqd);
4690        kobject_put(&eq->kobj);
4691        return ret;
4692}
4693
4694static void cfq_registered_queue(struct request_queue *q)
4695{
4696        struct elevator_queue *e = q->elevator;
4697        struct cfq_data *cfqd = e->elevator_data;
4698
4699        /*
4700         * Default to IOPS mode with no idling for SSDs
4701         */
4702        if (blk_queue_nonrot(q))
4703                cfqd->cfq_slice_idle = 0;
4704}
4705
4706/*
4707 * sysfs parts below -->
4708 */
4709static ssize_t
4710cfq_var_show(unsigned int var, char *page)
4711{
4712        return sprintf(page, "%u\n", var);
4713}
4714
4715static ssize_t
4716cfq_var_store(unsigned int *var, const char *page, size_t count)
4717{
4718        char *p = (char *) page;
4719
4720        *var = simple_strtoul(p, &p, 10);
4721        return count;
4722}
4723
4724#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
4725static ssize_t __FUNC(struct elevator_queue *e, char *page)             \
4726{                                                                       \
4727        struct cfq_data *cfqd = e->elevator_data;                       \
4728        u64 __data = __VAR;                                             \
4729        if (__CONV)                                                     \
4730                __data = div_u64(__data, NSEC_PER_MSEC);                        \
4731        return cfq_var_show(__data, (page));                            \
4732}
4733SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
4734SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
4735SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
4736SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
4737SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
4738SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
4739SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
4740SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4741SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4742SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4743SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4744SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
4745#undef SHOW_FUNCTION
4746
4747#define USEC_SHOW_FUNCTION(__FUNC, __VAR)                               \
4748static ssize_t __FUNC(struct elevator_queue *e, char *page)             \
4749{                                                                       \
4750        struct cfq_data *cfqd = e->elevator_data;                       \
4751        u64 __data = __VAR;                                             \
4752        __data = div_u64(__data, NSEC_PER_USEC);                        \
4753        return cfq_var_show(__data, (page));                            \
4754}
4755USEC_SHOW_FUNCTION(cfq_slice_idle_us_show, cfqd->cfq_slice_idle);
4756USEC_SHOW_FUNCTION(cfq_group_idle_us_show, cfqd->cfq_group_idle);
4757USEC_SHOW_FUNCTION(cfq_slice_sync_us_show, cfqd->cfq_slice[1]);
4758USEC_SHOW_FUNCTION(cfq_slice_async_us_show, cfqd->cfq_slice[0]);
4759USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency);
4760#undef USEC_SHOW_FUNCTION
4761
4762#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
4763static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
4764{                                                                       \
4765        struct cfq_data *cfqd = e->elevator_data;                       \
4766        unsigned int __data;                                            \
4767        int ret = cfq_var_store(&__data, (page), count);                \
4768        if (__data < (MIN))                                             \
4769                __data = (MIN);                                         \
4770        else if (__data > (MAX))                                        \
4771                __data = (MAX);                                         \
4772        if (__CONV)                                                     \
4773                *(__PTR) = (u64)__data * NSEC_PER_MSEC;                 \
4774        else                                                            \
4775                *(__PTR) = __data;                                      \
4776        return ret;                                                     \
4777}
4778STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
4779STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
4780                UINT_MAX, 1);
4781STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
4782                UINT_MAX, 1);
4783STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
4784STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
4785                UINT_MAX, 0);
4786STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
4787STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
4788STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
4789STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4790STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4791                UINT_MAX, 0);
4792STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4793STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
4794#undef STORE_FUNCTION
4795
4796#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)                    \
4797static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
4798{                                                                       \
4799        struct cfq_data *cfqd = e->elevator_data;                       \
4800        unsigned int __data;                                            \
4801        int ret = cfq_var_store(&__data, (page), count);                \
4802        if (__data < (MIN))                                             \
4803                __data = (MIN);                                         \
4804        else if (__data > (MAX))                                        \
4805                __data = (MAX);                                         \
4806        *(__PTR) = (u64)__data * NSEC_PER_USEC;                         \
4807        return ret;                                                     \
4808}
4809USEC_STORE_FUNCTION(cfq_slice_idle_us_store, &cfqd->cfq_slice_idle, 0, UINT_MAX);
4810USEC_STORE_FUNCTION(cfq_group_idle_us_store, &cfqd->cfq_group_idle, 0, UINT_MAX);
4811USEC_STORE_FUNCTION(cfq_slice_sync_us_store, &cfqd->cfq_slice[1], 1, UINT_MAX);
4812USEC_STORE_FUNCTION(cfq_slice_async_us_store, &cfqd->cfq_slice[0], 1, UINT_MAX);
4813USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, UINT_MAX);
4814#undef USEC_STORE_FUNCTION
4815
4816#define CFQ_ATTR(name) \
4817        __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
4818
4819static struct elv_fs_entry cfq_attrs[] = {
4820        CFQ_ATTR(quantum),
4821        CFQ_ATTR(fifo_expire_sync),
4822        CFQ_ATTR(fifo_expire_async),
4823        CFQ_ATTR(back_seek_max),
4824        CFQ_ATTR(back_seek_penalty),
4825        CFQ_ATTR(slice_sync),
4826        CFQ_ATTR(slice_sync_us),
4827        CFQ_ATTR(slice_async),
4828        CFQ_ATTR(slice_async_us),
4829        CFQ_ATTR(slice_async_rq),
4830        CFQ_ATTR(slice_idle),
4831        CFQ_ATTR(slice_idle_us),
4832        CFQ_ATTR(group_idle),
4833        CFQ_ATTR(group_idle_us),
4834        CFQ_ATTR(low_latency),
4835        CFQ_ATTR(target_latency),
4836        CFQ_ATTR(target_latency_us),
4837        __ATTR_NULL
4838};
4839
4840static struct elevator_type iosched_cfq = {
4841        .ops = {
4842                .elevator_merge_fn =            cfq_merge,
4843                .elevator_merged_fn =           cfq_merged_request,
4844                .elevator_merge_req_fn =        cfq_merged_requests,
4845                .elevator_allow_bio_merge_fn =  cfq_allow_bio_merge,
4846                .elevator_allow_rq_merge_fn =   cfq_allow_rq_merge,
4847                .elevator_bio_merged_fn =       cfq_bio_merged,
4848                .elevator_dispatch_fn =         cfq_dispatch_requests,
4849                .elevator_add_req_fn =          cfq_insert_request,
4850                .elevator_activate_req_fn =     cfq_activate_request,
4851                .elevator_deactivate_req_fn =   cfq_deactivate_request,
4852                .elevator_completed_req_fn =    cfq_completed_request,
4853                .elevator_former_req_fn =       elv_rb_former_request,
4854                .elevator_latter_req_fn =       elv_rb_latter_request,
4855                .elevator_init_icq_fn =         cfq_init_icq,
4856                .elevator_exit_icq_fn =         cfq_exit_icq,
4857                .elevator_set_req_fn =          cfq_set_request,
4858                .elevator_put_req_fn =          cfq_put_request,
4859                .elevator_may_queue_fn =        cfq_may_queue,
4860                .elevator_init_fn =             cfq_init_queue,
4861                .elevator_exit_fn =             cfq_exit_queue,
4862                .elevator_registered_fn =       cfq_registered_queue,
4863        },
4864        .icq_size       =       sizeof(struct cfq_io_cq),
4865        .icq_align      =       __alignof__(struct cfq_io_cq),
4866        .elevator_attrs =       cfq_attrs,
4867        .elevator_name  =       "cfq",
4868        .elevator_owner =       THIS_MODULE,
4869};
4870
4871#ifdef CONFIG_CFQ_GROUP_IOSCHED
4872static struct blkcg_policy blkcg_policy_cfq = {
4873        .dfl_cftypes            = cfq_blkcg_files,
4874        .legacy_cftypes         = cfq_blkcg_legacy_files,
4875
4876        .cpd_alloc_fn           = cfq_cpd_alloc,
4877        .cpd_init_fn            = cfq_cpd_init,
4878        .cpd_free_fn            = cfq_cpd_free,
4879        .cpd_bind_fn            = cfq_cpd_bind,
4880
4881        .pd_alloc_fn            = cfq_pd_alloc,
4882        .pd_init_fn             = cfq_pd_init,
4883        .pd_offline_fn          = cfq_pd_offline,
4884        .pd_free_fn             = cfq_pd_free,
4885        .pd_reset_stats_fn      = cfq_pd_reset_stats,
4886};
4887#endif
4888
4889static int __init cfq_init(void)
4890{
4891        int ret;
4892
4893#ifdef CONFIG_CFQ_GROUP_IOSCHED
4894        ret = blkcg_policy_register(&blkcg_policy_cfq);
4895        if (ret)
4896                return ret;
4897#else
4898        cfq_group_idle = 0;
4899#endif
4900
4901        ret = -ENOMEM;
4902        cfq_pool = KMEM_CACHE(cfq_queue, 0);
4903        if (!cfq_pool)
4904                goto err_pol_unreg;
4905
4906        ret = elv_register(&iosched_cfq);
4907        if (ret)
4908                goto err_free_pool;
4909
4910        return 0;
4911
4912err_free_pool:
4913        kmem_cache_destroy(cfq_pool);
4914err_pol_unreg:
4915#ifdef CONFIG_CFQ_GROUP_IOSCHED
4916        blkcg_policy_unregister(&blkcg_policy_cfq);
4917#endif
4918        return ret;
4919}
4920
4921static void __exit cfq_exit(void)
4922{
4923#ifdef CONFIG_CFQ_GROUP_IOSCHED
4924        blkcg_policy_unregister(&blkcg_policy_cfq);
4925#endif
4926        elv_unregister(&iosched_cfq);
4927        kmem_cache_destroy(cfq_pool);
4928}
4929
4930module_init(cfq_init);
4931module_exit(cfq_exit);
4932
4933MODULE_AUTHOR("Jens Axboe");
4934MODULE_LICENSE("GPL");
4935MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
4936
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.