linux/block/blk-core.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 1991, 1992 Linus Torvalds
   3 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
   4 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
   5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
   6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
   7 *      -  July2000
   8 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
   9 */
  10
  11/*
  12 * This handles all read/write requests to block devices
  13 */
  14#include <linux/kernel.h>
  15#include <linux/module.h>
  16#include <linux/backing-dev.h>
  17#include <linux/bio.h>
  18#include <linux/blkdev.h>
  19#include <linux/highmem.h>
  20#include <linux/mm.h>
  21#include <linux/kernel_stat.h>
  22#include <linux/string.h>
  23#include <linux/init.h>
  24#include <linux/completion.h>
  25#include <linux/slab.h>
  26#include <linux/swap.h>
  27#include <linux/writeback.h>
  28#include <linux/task_io_accounting_ops.h>
  29#include <linux/fault-inject.h>
  30#include <linux/list_sort.h>
  31#include <linux/delay.h>
  32#include <linux/ratelimit.h>
  33#include <linux/pm_runtime.h>
  34
  35#define CREATE_TRACE_POINTS
  36#include <trace/events/block.h>
  37
  38#include "blk.h"
  39#include "blk-cgroup.h"
  40
  41EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
  42EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
  43EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
  44EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
  45
  46DEFINE_IDA(blk_queue_ida);
  47
  48/*
  49 * For the allocated request tables
  50 */
  51static struct kmem_cache *request_cachep;
  52
  53/*
  54 * For queue allocation
  55 */
  56struct kmem_cache *blk_requestq_cachep;
  57
  58/*
  59 * Controlling structure to kblockd
  60 */
  61static struct workqueue_struct *kblockd_workqueue;
  62
  63static void drive_stat_acct(struct request *rq, int new_io)
  64{
  65        struct hd_struct *part;
  66        int rw = rq_data_dir(rq);
  67        int cpu;
  68
  69        if (!blk_do_io_stat(rq))
  70                return;
  71
  72        cpu = part_stat_lock();
  73
  74        if (!new_io) {
  75                part = rq->part;
  76                part_stat_inc(cpu, part, merges[rw]);
  77        } else {
  78                part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
  79                if (!hd_struct_try_get(part)) {
  80                        /*
  81                         * The partition is already being removed,
  82                         * the request will be accounted on the disk only
  83                         *
  84                         * We take a reference on disk->part0 although that
  85                         * partition will never be deleted, so we can treat
  86                         * it as any other partition.
  87                         */
  88                        part = &rq->rq_disk->part0;
  89                        hd_struct_get(part);
  90                }
  91                part_round_stats(cpu, part);
  92                part_inc_in_flight(part, rw);
  93                rq->part = part;
  94        }
  95
  96        part_stat_unlock();
  97}
  98
  99void blk_queue_congestion_threshold(struct request_queue *q)
 100{
 101        int nr;
 102
 103        nr = q->nr_requests - (q->nr_requests / 8) + 1;
 104        if (nr > q->nr_requests)
 105                nr = q->nr_requests;
 106        q->nr_congestion_on = nr;
 107
 108        nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
 109        if (nr < 1)
 110                nr = 1;
 111        q->nr_congestion_off = nr;
 112}
 113
 114/**
 115 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
 116 * @bdev:       device
 117 *
 118 * Locates the passed device's request queue and returns the address of its
 119 * backing_dev_info
 120 *
 121 * Will return NULL if the request queue cannot be located.
 122 */
 123struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
 124{
 125        struct backing_dev_info *ret = NULL;
 126        struct request_queue *q = bdev_get_queue(bdev);
 127
 128        if (q)
 129                ret = &q->backing_dev_info;
 130        return ret;
 131}
 132EXPORT_SYMBOL(blk_get_backing_dev_info);
 133
 134void blk_rq_init(struct request_queue *q, struct request *rq)
 135{
 136        memset(rq, 0, sizeof(*rq));
 137
 138        INIT_LIST_HEAD(&rq->queuelist);
 139        INIT_LIST_HEAD(&rq->timeout_list);
 140        rq->cpu = -1;
 141        rq->q = q;
 142        rq->__sector = (sector_t) -1;
 143        INIT_HLIST_NODE(&rq->hash);
 144        RB_CLEAR_NODE(&rq->rb_node);
 145        rq->cmd = rq->__cmd;
 146        rq->cmd_len = BLK_MAX_CDB;
 147        rq->tag = -1;
 148        rq->ref_count = 1;
 149        rq->start_time = jiffies;
 150        set_start_time_ns(rq);
 151        rq->part = NULL;
 152}
 153EXPORT_SYMBOL(blk_rq_init);
 154
 155static void req_bio_endio(struct request *rq, struct bio *bio,
 156                          unsigned int nbytes, int error)
 157{
 158        if (error)
 159                clear_bit(BIO_UPTODATE, &bio->bi_flags);
 160        else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 161                error = -EIO;
 162
 163        if (unlikely(rq->cmd_flags & REQ_QUIET))
 164                set_bit(BIO_QUIET, &bio->bi_flags);
 165
 166        bio_advance(bio, nbytes);
 167
 168        /* don't actually finish bio if it's part of flush sequence */
 169        if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
 170                bio_endio(bio, error);
 171}
 172
 173void blk_dump_rq_flags(struct request *rq, char *msg)
 174{
 175        int bit;
 176
 177        printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg,
 178                rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
 179                rq->cmd_flags);
 180
 181        printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
 182               (unsigned long long)blk_rq_pos(rq),
 183               blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
 184        printk(KERN_INFO "  bio %p, biotail %p, buffer %p, len %u\n",
 185               rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq));
 186
 187        if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 188                printk(KERN_INFO "  cdb: ");
 189                for (bit = 0; bit < BLK_MAX_CDB; bit++)
 190                        printk("%02x ", rq->cmd[bit]);
 191                printk("\n");
 192        }
 193}
 194EXPORT_SYMBOL(blk_dump_rq_flags);
 195
 196static void blk_delay_work(struct work_struct *work)
 197{
 198        struct request_queue *q;
 199
 200        q = container_of(work, struct request_queue, delay_work.work);
 201        spin_lock_irq(q->queue_lock);
 202        __blk_run_queue(q);
 203        spin_unlock_irq(q->queue_lock);
 204}
 205
 206/**
 207 * blk_delay_queue - restart queueing after defined interval
 208 * @q:          The &struct request_queue in question
 209 * @msecs:      Delay in msecs
 210 *
 211 * Description:
 212 *   Sometimes queueing needs to be postponed for a little while, to allow
 213 *   resources to come back. This function will make sure that queueing is
 214 *   restarted around the specified time. Queue lock must be held.
 215 */
 216void blk_delay_queue(struct request_queue *q, unsigned long msecs)
 217{
 218        if (likely(!blk_queue_dead(q)))
 219                queue_delayed_work(kblockd_workqueue, &q->delay_work,
 220                                   msecs_to_jiffies(msecs));
 221}
 222EXPORT_SYMBOL(blk_delay_queue);
 223
 224/**
 225 * blk_start_queue - restart a previously stopped queue
 226 * @q:    The &struct request_queue in question
 227 *
 228 * Description:
 229 *   blk_start_queue() will clear the stop flag on the queue, and call
 230 *   the request_fn for the queue if it was in a stopped state when
 231 *   entered. Also see blk_stop_queue(). Queue lock must be held.
 232 **/
 233void blk_start_queue(struct request_queue *q)
 234{
 235        WARN_ON(!irqs_disabled());
 236
 237        queue_flag_clear(QUEUE_FLAG_STOPPED, q);
 238        __blk_run_queue(q);
 239}
 240EXPORT_SYMBOL(blk_start_queue);
 241
 242/**
 243 * blk_stop_queue - stop a queue
 244 * @q:    The &struct request_queue in question
 245 *
 246 * Description:
 247 *   The Linux block layer assumes that a block driver will consume all
 248 *   entries on the request queue when the request_fn strategy is called.
 249 *   Often this will not happen, because of hardware limitations (queue
 250 *   depth settings). If a device driver gets a 'queue full' response,
 251 *   or if it simply chooses not to queue more I/O at one point, it can
 252 *   call this function to prevent the request_fn from being called until
 253 *   the driver has signalled it's ready to go again. This happens by calling
 254 *   blk_start_queue() to restart queue operations. Queue lock must be held.
 255 **/
 256void blk_stop_queue(struct request_queue *q)
 257{
 258        cancel_delayed_work(&q->delay_work);
 259        queue_flag_set(QUEUE_FLAG_STOPPED, q);
 260}
 261EXPORT_SYMBOL(blk_stop_queue);
 262
 263/**
 264 * blk_sync_queue - cancel any pending callbacks on a queue
 265 * @q: the queue
 266 *
 267 * Description:
 268 *     The block layer may perform asynchronous callback activity
 269 *     on a queue, such as calling the unplug function after a timeout.
 270 *     A block device may call blk_sync_queue to ensure that any
 271 *     such activity is cancelled, thus allowing it to release resources
 272 *     that the callbacks might use. The caller must already have made sure
 273 *     that its ->make_request_fn will not re-add plugging prior to calling
 274 *     this function.
 275 *
 276 *     This function does not cancel any asynchronous activity arising
 277 *     out of elevator or throttling code. That would require elevaotor_exit()
 278 *     and blkcg_exit_queue() to be called with queue lock initialized.
 279 *
 280 */
 281void blk_sync_queue(struct request_queue *q)
 282{
 283        del_timer_sync(&q->timeout);
 284        cancel_delayed_work_sync(&q->delay_work);
 285}
 286EXPORT_SYMBOL(blk_sync_queue);
 287
 288/**
 289 * __blk_run_queue_uncond - run a queue whether or not it has been stopped
 290 * @q:  The queue to run
 291 *
 292 * Description:
 293 *    Invoke request handling on a queue if there are any pending requests.
 294 *    May be used to restart request handling after a request has completed.
 295 *    This variant runs the queue whether or not the queue has been
 296 *    stopped. Must be called with the queue lock held and interrupts
 297 *    disabled. See also @blk_run_queue.
 298 */
 299inline void __blk_run_queue_uncond(struct request_queue *q)
 300{
 301        if (unlikely(blk_queue_dead(q)))
 302                return;
 303
 304        /*
 305         * Some request_fn implementations, e.g. scsi_request_fn(), unlock
 306         * the queue lock internally. As a result multiple threads may be
 307         * running such a request function concurrently. Keep track of the
 308         * number of active request_fn invocations such that blk_drain_queue()
 309         * can wait until all these request_fn calls have finished.
 310         */
 311        q->request_fn_active++;
 312        q->request_fn(q);
 313        q->request_fn_active--;
 314}
 315
 316/**
 317 * __blk_run_queue - run a single device queue
 318 * @q:  The queue to run
 319 *
 320 * Description:
 321 *    See @blk_run_queue. This variant must be called with the queue lock
 322 *    held and interrupts disabled.
 323 */
 324void __blk_run_queue(struct request_queue *q)
 325{
 326        if (unlikely(blk_queue_stopped(q)))
 327                return;
 328
 329        __blk_run_queue_uncond(q);
 330}
 331EXPORT_SYMBOL(__blk_run_queue);
 332
 333/**
 334 * blk_run_queue_async - run a single device queue in workqueue context
 335 * @q:  The queue to run
 336 *
 337 * Description:
 338 *    Tells kblockd to perform the equivalent of @blk_run_queue on behalf
 339 *    of us. The caller must hold the queue lock.
 340 */
 341void blk_run_queue_async(struct request_queue *q)
 342{
 343        if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q)))
 344                mod_delayed_work(kblockd_workqueue, &q->delay_work, 0);
 345}
 346EXPORT_SYMBOL(blk_run_queue_async);
 347
 348/**
 349 * blk_run_queue - run a single device queue
 350 * @q: The queue to run
 351 *
 352 * Description:
 353 *    Invoke request handling on this queue, if it has pending work to do.
 354 *    May be used to restart queueing when a request has completed.
 355 */
 356void blk_run_queue(struct request_queue *q)
 357{
 358        unsigned long flags;
 359
 360        spin_lock_irqsave(q->queue_lock, flags);
 361        __blk_run_queue(q);
 362        spin_unlock_irqrestore(q->queue_lock, flags);
 363}
 364EXPORT_SYMBOL(blk_run_queue);
 365
 366void blk_put_queue(struct request_queue *q)
 367{
 368        kobject_put(&q->kobj);
 369}
 370EXPORT_SYMBOL(blk_put_queue);
 371
 372/**
 373 * __blk_drain_queue - drain requests from request_queue
 374 * @q: queue to drain
 375 * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV
 376 *
 377 * Drain requests from @q.  If @drain_all is set, all requests are drained.
 378 * If not, only ELVPRIV requests are drained.  The caller is responsible
 379 * for ensuring that no new requests which need to be drained are queued.
 380 */
 381static void __blk_drain_queue(struct request_queue *q, bool drain_all)
 382        __releases(q->queue_lock)
 383        __acquires(q->queue_lock)
 384{
 385        int i;
 386
 387        lockdep_assert_held(q->queue_lock);
 388
 389        while (true) {
 390                bool drain = false;
 391
 392                /*
 393                 * The caller might be trying to drain @q before its
 394                 * elevator is initialized.
 395                 */
 396                if (q->elevator)
 397                        elv_drain_elevator(q);
 398
 399                blkcg_drain_queue(q);
 400
 401                /*
 402                 * This function might be called on a queue which failed
 403                 * driver init after queue creation or is not yet fully
 404                 * active yet.  Some drivers (e.g. fd and loop) get unhappy
 405                 * in such cases.  Kick queue iff dispatch queue has
 406                 * something on it and @q has request_fn set.
 407                 */
 408                if (!list_empty(&q->queue_head) && q->request_fn)
 409                        __blk_run_queue(q);
 410
 411                drain |= q->nr_rqs_elvpriv;
 412                drain |= q->request_fn_active;
 413
 414                /*
 415                 * Unfortunately, requests are queued at and tracked from
 416                 * multiple places and there's no single counter which can
 417                 * be drained.  Check all the queues and counters.
 418                 */
 419                if (drain_all) {
 420                        drain |= !list_empty(&q->queue_head);
 421                        for (i = 0; i < 2; i++) {
 422                                drain |= q->nr_rqs[i];
 423                                drain |= q->in_flight[i];
 424                                drain |= !list_empty(&q->flush_queue[i]);
 425                        }
 426                }
 427
 428                if (!drain)
 429                        break;
 430
 431                spin_unlock_irq(q->queue_lock);
 432
 433                msleep(10);
 434
 435                spin_lock_irq(q->queue_lock);
 436        }
 437
 438        /*
 439         * With queue marked dead, any woken up waiter will fail the
 440         * allocation path, so the wakeup chaining is lost and we're
 441         * left with hung waiters. We need to wake up those waiters.
 442         */
 443        if (q->request_fn) {
 444                struct request_list *rl;
 445
 446                blk_queue_for_each_rl(rl, q)
 447                        for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
 448                                wake_up_all(&rl->wait[i]);
 449        }
 450}
 451
 452/**
 453 * blk_queue_bypass_start - enter queue bypass mode
 454 * @q: queue of interest
 455 *
 456 * In bypass mode, only the dispatch FIFO queue of @q is used.  This
 457 * function makes @q enter bypass mode and drains all requests which were
 458 * throttled or issued before.  On return, it's guaranteed that no request
 459 * is being throttled or has ELVPRIV set and blk_queue_bypass() %true
 460 * inside queue or RCU read lock.
 461 */
 462void blk_queue_bypass_start(struct request_queue *q)
 463{
 464        bool drain;
 465
 466        spin_lock_irq(q->queue_lock);
 467        drain = !q->bypass_depth++;
 468        queue_flag_set(QUEUE_FLAG_BYPASS, q);
 469        spin_unlock_irq(q->queue_lock);
 470
 471        if (drain) {
 472                spin_lock_irq(q->queue_lock);
 473                __blk_drain_queue(q, false);
 474                spin_unlock_irq(q->queue_lock);
 475
 476                /* ensure blk_queue_bypass() is %true inside RCU read lock */
 477                synchronize_rcu();
 478        }
 479}
 480EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
 481
 482/**
 483 * blk_queue_bypass_end - leave queue bypass mode
 484 * @q: queue of interest
 485 *
 486 * Leave bypass mode and restore the normal queueing behavior.
 487 */
 488void blk_queue_bypass_end(struct request_queue *q)
 489{
 490        spin_lock_irq(q->queue_lock);
 491        if (!--q->bypass_depth)
 492                queue_flag_clear(QUEUE_FLAG_BYPASS, q);
 493        WARN_ON_ONCE(q->bypass_depth < 0);
 494        spin_unlock_irq(q->queue_lock);
 495}
 496EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
 497
 498/**
 499 * blk_cleanup_queue - shutdown a request queue
 500 * @q: request queue to shutdown
 501 *
 502 * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
 503 * put it.  All future requests will be failed immediately with -ENODEV.
 504 */
 505void blk_cleanup_queue(struct request_queue *q)
 506{
 507        spinlock_t *lock = q->queue_lock;
 508
 509        /* mark @q DYING, no new request or merges will be allowed afterwards */
 510        mutex_lock(&q->sysfs_lock);
 511        queue_flag_set_unlocked(QUEUE_FLAG_DYING, q);
 512        spin_lock_irq(lock);
 513
 514        /*
 515         * A dying queue is permanently in bypass mode till released.  Note
 516         * that, unlike blk_queue_bypass_start(), we aren't performing
 517         * synchronize_rcu() after entering bypass mode to avoid the delay
 518         * as some drivers create and destroy a lot of queues while
 519         * probing.  This is still safe because blk_release_queue() will be
 520         * called only after the queue refcnt drops to zero and nothing,
 521         * RCU or not, would be traversing the queue by then.
 522         */
 523        q->bypass_depth++;
 524        queue_flag_set(QUEUE_FLAG_BYPASS, q);
 525
 526        queue_flag_set(QUEUE_FLAG_NOMERGES, q);
 527        queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
 528        queue_flag_set(QUEUE_FLAG_DYING, q);
 529        spin_unlock_irq(lock);
 530        mutex_unlock(&q->sysfs_lock);
 531
 532        /*
 533         * Drain all requests queued before DYING marking. Set DEAD flag to
 534         * prevent that q->request_fn() gets invoked after draining finished.
 535         */
 536        spin_lock_irq(lock);
 537        __blk_drain_queue(q, true);
 538        queue_flag_set(QUEUE_FLAG_DEAD, q);
 539        spin_unlock_irq(lock);
 540
 541        /* @q won't process any more request, flush async actions */
 542        del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
 543        blk_sync_queue(q);
 544
 545        spin_lock_irq(lock);
 546        if (q->queue_lock != &q->__queue_lock)
 547                q->queue_lock = &q->__queue_lock;
 548        spin_unlock_irq(lock);
 549
 550        /* @q is and will stay empty, shutdown and put */
 551        blk_put_queue(q);
 552}
 553EXPORT_SYMBOL(blk_cleanup_queue);
 554
 555int blk_init_rl(struct request_list *rl, struct request_queue *q,
 556                gfp_t gfp_mask)
 557{
 558        if (unlikely(rl->rq_pool))
 559                return 0;
 560
 561        rl->q = q;
 562        rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
 563        rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
 564        init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
 565        init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
 566
 567        rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
 568                                          mempool_free_slab, request_cachep,
 569                                          gfp_mask, q->node);
 570        if (!rl->rq_pool)
 571                return -ENOMEM;
 572
 573        return 0;
 574}
 575
 576void blk_exit_rl(struct request_list *rl)
 577{
 578        if (rl->rq_pool)
 579                mempool_destroy(rl->rq_pool);
 580}
 581
 582struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
 583{
 584        return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE);
 585}
 586EXPORT_SYMBOL(blk_alloc_queue);
 587
 588struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 589{
 590        struct request_queue *q;
 591        int err;
 592
 593        q = kmem_cache_alloc_node(blk_requestq_cachep,
 594                                gfp_mask | __GFP_ZERO, node_id);
 595        if (!q)
 596                return NULL;
 597
 598        q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
 599        if (q->id < 0)
 600                goto fail_q;
 601
 602        q->backing_dev_info.ra_pages =
 603                        (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 604        q->backing_dev_info.state = 0;
 605        q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
 606        q->backing_dev_info.name = "block";
 607        q->node = node_id;
 608
 609        err = bdi_init(&q->backing_dev_info);
 610        if (err)
 611                goto fail_id;
 612
 613        setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
 614                    laptop_mode_timer_fn, (unsigned long) q);
 615        setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
 616        INIT_LIST_HEAD(&q->queue_head);
 617        INIT_LIST_HEAD(&q->timeout_list);
 618        INIT_LIST_HEAD(&q->icq_list);
 619#ifdef CONFIG_BLK_CGROUP
 620        INIT_LIST_HEAD(&q->blkg_list);
 621#endif
 622        INIT_LIST_HEAD(&q->flush_queue[0]);
 623        INIT_LIST_HEAD(&q->flush_queue[1]);
 624        INIT_LIST_HEAD(&q->flush_data_in_flight);
 625        INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
 626
 627        kobject_init(&q->kobj, &blk_queue_ktype);
 628
 629        mutex_init(&q->sysfs_lock);
 630        spin_lock_init(&q->__queue_lock);
 631
 632        /*
 633         * By default initialize queue_lock to internal lock and driver can
 634         * override it later if need be.
 635         */
 636        q->queue_lock = &q->__queue_lock;
 637
 638        /*
 639         * A queue starts its life with bypass turned on to avoid
 640         * unnecessary bypass on/off overhead and nasty surprises during
 641         * init.  The initial bypass will be finished when the queue is
 642         * registered by blk_register_queue().
 643         */
 644        q->bypass_depth = 1;
 645        __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
 646
 647        if (blkcg_init_queue(q))
 648                goto fail_bdi;
 649
 650        return q;
 651
 652fail_bdi:
 653        bdi_destroy(&q->backing_dev_info);
 654fail_id:
 655        ida_simple_remove(&blk_queue_ida, q->id);
 656fail_q:
 657        kmem_cache_free(blk_requestq_cachep, q);
 658        return NULL;
 659}
 660EXPORT_SYMBOL(blk_alloc_queue_node);
 661
 662/**
 663 * blk_init_queue  - prepare a request queue for use with a block device
 664 * @rfn:  The function to be called to process requests that have been
 665 *        placed on the queue.
 666 * @lock: Request queue spin lock
 667 *
 668 * Description:
 669 *    If a block device wishes to use the standard request handling procedures,
 670 *    which sorts requests and coalesces adjacent requests, then it must
 671 *    call blk_init_queue().  The function @rfn will be called when there
 672 *    are requests on the queue that need to be processed.  If the device
 673 *    supports plugging, then @rfn may not be called immediately when requests
 674 *    are available on the queue, but may be called at some time later instead.
 675 *    Plugged queues are generally unplugged when a buffer belonging to one
 676 *    of the requests on the queue is needed, or due to memory pressure.
 677 *
 678 *    @rfn is not required, or even expected, to remove all requests off the
 679 *    queue, but only as many as it can handle at a time.  If it does leave
 680 *    requests on the queue, it is responsible for arranging that the requests
 681 *    get dealt with eventually.
 682 *
 683 *    The queue spin lock must be held while manipulating the requests on the
 684 *    request queue; this lock will be taken also from interrupt context, so irq
 685 *    disabling is needed for it.
 686 *
 687 *    Function returns a pointer to the initialized request queue, or %NULL if
 688 *    it didn't succeed.
 689 *
 690 * Note:
 691 *    blk_init_queue() must be paired with a blk_cleanup_queue() call
 692 *    when the block device is deactivated (such as at module unload).
 693 **/
 694
 695struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
 696{
 697        return blk_init_queue_node(rfn, lock, NUMA_NO_NODE);
 698}
 699EXPORT_SYMBOL(blk_init_queue);
 700
 701struct request_queue *
 702blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 703{
 704        struct request_queue *uninit_q, *q;
 705
 706        uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
 707        if (!uninit_q)
 708                return NULL;
 709
 710        q = blk_init_allocated_queue(uninit_q, rfn, lock);
 711        if (!q)
 712                blk_cleanup_queue(uninit_q);
 713
 714        return q;
 715}
 716EXPORT_SYMBOL(blk_init_queue_node);
 717
 718struct request_queue *
 719blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
 720                         spinlock_t *lock)
 721{
 722        if (!q)
 723                return NULL;
 724
 725        if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
 726                return NULL;
 727
 728        q->request_fn           = rfn;
 729        q->prep_rq_fn           = NULL;
 730        q->unprep_rq_fn         = NULL;
 731        q->queue_flags          |= QUEUE_FLAG_DEFAULT;
 732
 733        /* Override internal queue lock with supplied lock pointer */
 734        if (lock)
 735                q->queue_lock           = lock;
 736
 737        /*
 738         * This also sets hw/phys segments, boundary and size
 739         */
 740        blk_queue_make_request(q, blk_queue_bio);
 741
 742        q->sg_reserved_size = INT_MAX;
 743
 744        /* Protect q->elevator from elevator_change */
 745        mutex_lock(&q->sysfs_lock);
 746
 747        /* init elevator */
 748        if (elevator_init(q, NULL)) {
 749                mutex_unlock(&q->sysfs_lock);
 750                return NULL;
 751        }
 752
 753        mutex_unlock(&q->sysfs_lock);
 754
 755        return q;
 756}
 757EXPORT_SYMBOL(blk_init_allocated_queue);
 758
 759bool blk_get_queue(struct request_queue *q)
 760{
 761        if (likely(!blk_queue_dying(q))) {
 762                __blk_get_queue(q);
 763                return true;
 764        }
 765
 766        return false;
 767}
 768EXPORT_SYMBOL(blk_get_queue);
 769
 770static inline void blk_free_request(struct request_list *rl, struct request *rq)
 771{
 772        if (rq->cmd_flags & REQ_ELVPRIV) {
 773                elv_put_request(rl->q, rq);
 774                if (rq->elv.icq)
 775                        put_io_context(rq->elv.icq->ioc);
 776        }
 777
 778        mempool_free(rq, rl->rq_pool);
 779}
 780
 781/*
 782 * ioc_batching returns true if the ioc is a valid batching request and
 783 * should be given priority access to a request.
 784 */
 785static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
 786{
 787        if (!ioc)
 788                return 0;
 789
 790        /*
 791         * Make sure the process is able to allocate at least 1 request
 792         * even if the batch times out, otherwise we could theoretically
 793         * lose wakeups.
 794         */
 795        return ioc->nr_batch_requests == q->nr_batching ||
 796                (ioc->nr_batch_requests > 0
 797                && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
 798}
 799
 800/*
 801 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
 802 * will cause the process to be a "batcher" on all queues in the system. This
 803 * is the behaviour we want though - once it gets a wakeup it should be given
 804 * a nice run.
 805 */
 806static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
 807{
 808        if (!ioc || ioc_batching(q, ioc))
 809                return;
 810
 811        ioc->nr_batch_requests = q->nr_batching;
 812        ioc->last_waited = jiffies;
 813}
 814
 815static void __freed_request(struct request_list *rl, int sync)
 816{
 817        struct request_queue *q = rl->q;
 818
 819        /*
 820         * bdi isn't aware of blkcg yet.  As all async IOs end up root
 821         * blkcg anyway, just use root blkcg state.
 822         */
 823        if (rl == &q->root_rl &&
 824            rl->count[sync] < queue_congestion_off_threshold(q))
 825                blk_clear_queue_congested(q, sync);
 826
 827        if (rl->count[sync] + 1 <= q->nr_requests) {
 828                if (waitqueue_active(&rl->wait[sync]))
 829                        wake_up(&rl->wait[sync]);
 830
 831                blk_clear_rl_full(rl, sync);
 832        }
 833}
 834
 835/*
 836 * A request has just been released.  Account for it, update the full and
 837 * congestion status, wake up any waiters.   Called under q->queue_lock.
 838 */
 839static void freed_request(struct request_list *rl, unsigned int flags)
 840{
 841        struct request_queue *q = rl->q;
 842        int sync = rw_is_sync(flags);
 843
 844        q->nr_rqs[sync]--;
 845        rl->count[sync]--;
 846        if (flags & REQ_ELVPRIV)
 847                q->nr_rqs_elvpriv--;
 848
 849        __freed_request(rl, sync);
 850
 851        if (unlikely(rl->starved[sync ^ 1]))
 852                __freed_request(rl, sync ^ 1);
 853}
 854
 855/*
 856 * Determine if elevator data should be initialized when allocating the
 857 * request associated with @bio.
 858 */
 859static bool blk_rq_should_init_elevator(struct bio *bio)
 860{
 861        if (!bio)
 862                return true;
 863
 864        /*
 865         * Flush requests do not use the elevator so skip initialization.
 866         * This allows a request to share the flush and elevator data.
 867         */
 868        if (bio->bi_rw & (REQ_FLUSH | REQ_FUA))
 869                return false;
 870
 871        return true;
 872}
 873
 874/**
 875 * rq_ioc - determine io_context for request allocation
 876 * @bio: request being allocated is for this bio (can be %NULL)
 877 *
 878 * Determine io_context to use for request allocation for @bio.  May return
 879 * %NULL if %current->io_context doesn't exist.
 880 */
 881static struct io_context *rq_ioc(struct bio *bio)
 882{
 883#ifdef CONFIG_BLK_CGROUP
 884        if (bio && bio->bi_ioc)
 885                return bio->bi_ioc;
 886#endif
 887        return current->io_context;
 888}
 889
 890/**
 891 * __get_request - get a free request
 892 * @rl: request list to allocate from
 893 * @rw_flags: RW and SYNC flags
 894 * @bio: bio to allocate request for (can be %NULL)
 895 * @gfp_mask: allocation mask
 896 *
 897 * Get a free request from @q.  This function may fail under memory
 898 * pressure or if @q is dead.
 899 *
 900 * Must be callled with @q->queue_lock held and,
 901 * Returns %NULL on failure, with @q->queue_lock held.
 902 * Returns !%NULL on success, with @q->queue_lock *not held*.
 903 */
 904static struct request *__get_request(struct request_list *rl, int rw_flags,
 905                                     struct bio *bio, gfp_t gfp_mask)
 906{
 907        struct request_queue *q = rl->q;
 908        struct request *rq;
 909        struct elevator_type *et = q->elevator->type;
 910        struct io_context *ioc = rq_ioc(bio);
 911        struct io_cq *icq = NULL;
 912        const bool is_sync = rw_is_sync(rw_flags) != 0;
 913        int may_queue;
 914
 915        if (unlikely(blk_queue_dying(q)))
 916                return NULL;
 917
 918        may_queue = elv_may_queue(q, rw_flags);
 919        if (may_queue == ELV_MQUEUE_NO)
 920                goto rq_starved;
 921
 922        if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
 923                if (rl->count[is_sync]+1 >= q->nr_requests) {
 924                        /*
 925                         * The queue will fill after this allocation, so set
 926                         * it as full, and mark this process as "batching".
 927                         * This process will be allowed to complete a batch of
 928                         * requests, others will be blocked.
 929                         */
 930                        if (!blk_rl_full(rl, is_sync)) {
 931                                ioc_set_batching(q, ioc);
 932                                blk_set_rl_full(rl, is_sync);
 933                        } else {
 934                                if (may_queue != ELV_MQUEUE_MUST
 935                                                && !ioc_batching(q, ioc)) {
 936                                        /*
 937                                         * The queue is full and the allocating
 938                                         * process is not a "batcher", and not
 939                                         * exempted by the IO scheduler
 940                                         */
 941                                        return NULL;
 942                                }
 943                        }
 944                }
 945                /*
 946                 * bdi isn't aware of blkcg yet.  As all async IOs end up
 947                 * root blkcg anyway, just use root blkcg state.
 948                 */
 949                if (rl == &q->root_rl)
 950                        blk_set_queue_congested(q, is_sync);
 951        }
 952
 953        /*
 954         * Only allow batching queuers to allocate up to 50% over the defined
 955         * limit of requests, otherwise we could have thousands of requests
 956         * allocated with any setting of ->nr_requests
 957         */
 958        if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
 959                return NULL;
 960
 961        q->nr_rqs[is_sync]++;
 962        rl->count[is_sync]++;
 963        rl->starved[is_sync] = 0;
 964
 965        /*
 966         * Decide whether the new request will be managed by elevator.  If
 967         * so, mark @rw_flags and increment elvpriv.  Non-zero elvpriv will
 968         * prevent the current elevator from being destroyed until the new
 969         * request is freed.  This guarantees icq's won't be destroyed and
 970         * makes creating new ones safe.
 971         *
 972         * Also, lookup icq while holding queue_lock.  If it doesn't exist,
 973         * it will be created after releasing queue_lock.
 974         */
 975        if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) {
 976                rw_flags |= REQ_ELVPRIV;
 977                q->nr_rqs_elvpriv++;
 978                if (et->icq_cache && ioc)
 979                        icq = ioc_lookup_icq(ioc, q);
 980        }
 981
 982        if (blk_queue_io_stat(q))
 983                rw_flags |= REQ_IO_STAT;
 984        spin_unlock_irq(q->queue_lock);
 985
 986        /* allocate and init request */
 987        rq = mempool_alloc(rl->rq_pool, gfp_mask);
 988        if (!rq)
 989                goto fail_alloc;
 990
 991        blk_rq_init(q, rq);
 992        blk_rq_set_rl(rq, rl);
 993        rq->cmd_flags = rw_flags | REQ_ALLOCED;
 994
 995        /* init elvpriv */
 996        if (rw_flags & REQ_ELVPRIV) {
 997                if (unlikely(et->icq_cache && !icq)) {
 998                        if (ioc)
 999                                icq = ioc_create_icq(ioc, q, gfp_mask);
1000                        if (!icq)
1001                                goto fail_elvpriv;
1002                }
1003
1004                rq->elv.icq = icq;
1005                if (unlikely(elv_set_request(q, rq, bio, gfp_mask)))
1006                        goto fail_elvpriv;
1007
1008                /* @rq->elv.icq holds io_context until @rq is freed */
1009                if (icq)
1010                        get_io_context(icq->ioc);
1011        }
1012out:
1013        /*
1014         * ioc may be NULL here, and ioc_batching will be false. That's
1015         * OK, if the queue is under the request limit then requests need
1016         * not count toward the nr_batch_requests limit. There will always
1017         * be some limit enforced by BLK_BATCH_TIME.
1018         */
1019        if (ioc_batching(q, ioc))
1020                ioc->nr_batch_requests--;
1021
1022        trace_block_getrq(q, bio, rw_flags & 1);
1023        return rq;
1024
1025fail_elvpriv:
1026        /*
1027         * elvpriv init failed.  ioc, icq and elvpriv aren't mempool backed
1028         * and may fail indefinitely under memory pressure and thus
1029         * shouldn't stall IO.  Treat this request as !elvpriv.  This will
1030         * disturb iosched and blkcg but weird is bettern than dead.
1031         */
1032        printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n",
1033                           dev_name(q->backing_dev_info.dev));
1034
1035        rq->cmd_flags &= ~REQ_ELVPRIV;
1036        rq->elv.icq = NULL;
1037
1038        spin_lock_irq(q->queue_lock);
1039        q->nr_rqs_elvpriv--;
1040        spin_unlock_irq(q->queue_lock);
1041        goto out;
1042
1043fail_alloc:
1044        /*
1045         * Allocation failed presumably due to memory. Undo anything we
1046         * might have messed up.
1047         *
1048         * Allocating task should really be put onto the front of the wait
1049         * queue, but this is pretty rare.
1050         */
1051        spin_lock_irq(q->queue_lock);
1052        freed_request(rl, rw_flags);
1053
1054        /*
1055         * in the very unlikely event that allocation failed and no
1056         * requests for this direction was pending, mark us starved so that
1057         * freeing of a request in the other direction will notice
1058         * us. another possible fix would be to split the rq mempool into
1059         * READ and WRITE
1060         */
1061rq_starved:
1062        if (unlikely(rl->count[is_sync] == 0))
1063                rl->starved[is_sync] = 1;
1064        return NULL;
1065}
1066
1067/**
1068 * get_request - get a free request
1069 * @q: request_queue to allocate request from
1070 * @rw_flags: RW and SYNC flags
1071 * @bio: bio to allocate request for (can be %NULL)
1072 * @gfp_mask: allocation mask
1073 *
1074 * Get a free request from @q.  If %__GFP_WAIT is set in @gfp_mask, this
1075 * function keeps retrying under memory pressure and fails iff @q is dead.
1076 *
1077 * Must be callled with @q->queue_lock held and,
1078 * Returns %NULL on failure, with @q->queue_lock held.
1079 * Returns !%NULL on success, with @q->queue_lock *not held*.
1080 */
1081static struct request *get_request(struct request_queue *q, int rw_flags,
1082                                   struct bio *bio, gfp_t gfp_mask)
1083{
1084        const bool is_sync = rw_is_sync(rw_flags) != 0;
1085        DEFINE_WAIT(wait);
1086        struct request_list *rl;
1087        struct request *rq;
1088
1089        rl = blk_get_rl(q, bio);        /* transferred to @rq on success */
1090retry:
1091        rq = __get_request(rl, rw_flags, bio, gfp_mask);
1092        if (rq)
1093                return rq;
1094
1095        if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) {
1096                blk_put_rl(rl);
1097                return NULL;
1098        }
1099
1100        /* wait on @rl and retry */
1101        prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
1102                                  TASK_UNINTERRUPTIBLE);
1103
1104        trace_block_sleeprq(q, bio, rw_flags & 1);
1105
1106        spin_unlock_irq(q->queue_lock);
1107        io_schedule();
1108
1109        /*
1110         * After sleeping, we become a "batching" process and will be able
1111         * to allocate at least one request, and up to a big batch of them
1112         * for a small period time.  See ioc_batching, ioc_set_batching
1113         */
1114        ioc_set_batching(q, current->io_context);
1115
1116        spin_lock_irq(q->queue_lock);
1117        finish_wait(&rl->wait[is_sync], &wait);
1118
1119        goto retry;
1120}
1121
1122struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
1123{
1124        struct request *rq;
1125
1126        BUG_ON(rw != READ && rw != WRITE);
1127
1128        /* create ioc upfront */
1129        create_io_context(gfp_mask, q->node);
1130
1131        spin_lock_irq(q->queue_lock);
1132        rq = get_request(q, rw, NULL, gfp_mask);
1133        if (!rq)
1134                spin_unlock_irq(q->queue_lock);
1135        /* q->queue_lock is unlocked at this point */
1136
1137        return rq;
1138}
1139EXPORT_SYMBOL(blk_get_request);
1140
1141/**
1142 * blk_make_request - given a bio, allocate a corresponding struct request.
1143 * @q: target request queue
1144 * @bio:  The bio describing the memory mappings that will be submitted for IO.
1145 *        It may be a chained-bio properly constructed by block/bio layer.
1146 * @gfp_mask: gfp flags to be used for memory allocation
1147 *
1148 * blk_make_request is the parallel of generic_make_request for BLOCK_PC
1149 * type commands. Where the struct request needs to be farther initialized by
1150 * the caller. It is passed a &struct bio, which describes the memory info of
1151 * the I/O transfer.
1152 *
1153 * The caller of blk_make_request must make sure that bi_io_vec
1154 * are set to describe the memory buffers. That bio_data_dir() will return
1155 * the needed direction of the request. (And all bio's in the passed bio-chain
1156 * are properly set accordingly)
1157 *
1158 * If called under none-sleepable conditions, mapped bio buffers must not
1159 * need bouncing, by calling the appropriate masked or flagged allocator,
1160 * suitable for the target device. Otherwise the call to blk_queue_bounce will
1161 * BUG.
1162 *
1163 * WARNING: When allocating/cloning a bio-chain, careful consideration should be
1164 * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for
1165 * anything but the first bio in the chain. Otherwise you risk waiting for IO
1166 * completion of a bio that hasn't been submitted yet, thus resulting in a
1167 * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead
1168 * of bio_alloc(), as that avoids the mempool deadlock.
1169 * If possible a big IO should be split into smaller parts when allocation
1170 * fails. Partial allocation should not be an error, or you risk a live-lock.
1171 */
1172struct request *blk_make_request(struct request_queue *q, struct bio *bio,
1173                                 gfp_t gfp_mask)
1174{
1175        struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask);
1176
1177        if (unlikely(!rq))
1178                return ERR_PTR(-ENOMEM);
1179
1180        for_each_bio(bio) {
1181                struct bio *bounce_bio = bio;
1182                int ret;
1183
1184                blk_queue_bounce(q, &bounce_bio);
1185                ret = blk_rq_append_bio(q, rq, bounce_bio);
1186                if (unlikely(ret)) {
1187                        blk_put_request(rq);
1188                        return ERR_PTR(ret);
1189                }
1190        }
1191
1192        return rq;
1193}
1194EXPORT_SYMBOL(blk_make_request);
1195
1196/**
1197 * blk_requeue_request - put a request back on queue
1198 * @q:          request queue where request should be inserted
1199 * @rq:         request to be inserted
1200 *
1201 * Description:
1202 *    Drivers often keep queueing requests until the hardware cannot accept
1203 *    more, when that condition happens we need to put the request back
1204 *    on the queue. Must be called with queue lock held.
1205 */
1206void blk_requeue_request(struct request_queue *q, struct request *rq)
1207{
1208        blk_delete_timer(rq);
1209        blk_clear_rq_complete(rq);
1210        trace_block_rq_requeue(q, rq);
1211
1212        if (blk_rq_tagged(rq))
1213                blk_queue_end_tag(q, rq);
1214
1215        BUG_ON(blk_queued_rq(rq));
1216
1217        elv_requeue_request(q, rq);
1218}
1219EXPORT_SYMBOL(blk_requeue_request);
1220
1221static void add_acct_request(struct request_queue *q, struct request *rq,
1222                             int where)
1223{
1224        drive_stat_acct(rq, 1);
1225        __elv_add_request(q, rq, where);
1226}
1227
1228static void part_round_stats_single(int cpu, struct hd_struct *part,
1229                                    unsigned long now)
1230{
1231        if (now == part->stamp)
1232                return;
1233
1234        if (part_in_flight(part)) {
1235                __part_stat_add(cpu, part, time_in_queue,
1236                                part_in_flight(part) * (now - part->stamp));
1237                __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
1238        }
1239        part->stamp = now;
1240}
1241
1242/**
1243 * part_round_stats() - Round off the performance stats on a struct disk_stats.
1244 * @cpu: cpu number for stats access
1245 * @part: target partition
1246 *
1247 * The average IO queue length and utilisation statistics are maintained
1248 * by observing the current state of the queue length and the amount of
1249 * time it has been in this state for.
1250 *
1251 * Normally, that accounting is done on IO completion, but that can result
1252 * in more than a second's worth of IO being accounted for within any one
1253 * second, leading to >100% utilisation.  To deal with that, we call this
1254 * function to do a round-off before returning the results when reading
1255 * /proc/diskstats.  This accounts immediately for all queue usage up to
1256 * the current jiffies and restarts the counters again.
1257 */
1258void part_round_stats(int cpu, struct hd_struct *part)
1259{
1260        unsigned long now = jiffies;
1261
1262        if (part->partno)
1263                part_round_stats_single(cpu, &part_to_disk(part)->part0, now);
1264        part_round_stats_single(cpu, part, now);
1265}
1266EXPORT_SYMBOL_GPL(part_round_stats);
1267
1268#ifdef CONFIG_PM_RUNTIME
1269static void blk_pm_put_request(struct request *rq)
1270{
1271        if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending)
1272                pm_runtime_mark_last_busy(rq->q->dev);
1273}
1274#else
1275static inline void blk_pm_put_request(struct request *rq) {}
1276#endif
1277
1278/*
1279 * queue lock must be held
1280 */
1281void __blk_put_request(struct request_queue *q, struct request *req)
1282{
1283        if (unlikely(!q))
1284                return;
1285        if (unlikely(--req->ref_count))
1286                return;
1287
1288        blk_pm_put_request(req);
1289
1290        elv_completed_request(q, req);
1291
1292        /* this is a bio leak */
1293        WARN_ON(req->bio != NULL);
1294
1295        /*
1296         * Request may not have originated from ll_rw_blk. if not,
1297         * it didn't come out of our reserved rq pools
1298         */
1299        if (req->cmd_flags & REQ_ALLOCED) {
1300                unsigned int flags = req->cmd_flags;
1301                struct request_list *rl = blk_rq_rl(req);
1302
1303                BUG_ON(!list_empty(&req->queuelist));
1304                BUG_ON(!hlist_unhashed(&req->hash));
1305
1306                blk_free_request(rl, req);
1307                freed_request(rl, flags);
1308                blk_put_rl(rl);
1309        }
1310}
1311EXPORT_SYMBOL_GPL(__blk_put_request);
1312
1313void blk_put_request(struct request *req)
1314{
1315        unsigned long flags;
1316        struct request_queue *q = req->q;
1317
1318        spin_lock_irqsave(q->queue_lock, flags);
1319        __blk_put_request(q, req);
1320        spin_unlock_irqrestore(q->queue_lock, flags);
1321}
1322EXPORT_SYMBOL(blk_put_request);
1323
1324/**
1325 * blk_add_request_payload - add a payload to a request
1326 * @rq: request to update
1327 * @page: page backing the payload
1328 * @len: length of the payload.
1329 *
1330 * This allows to later add a payload to an already submitted request by
1331 * a block driver.  The driver needs to take care of freeing the payload
1332 * itself.
1333 *
1334 * Note that this is a quite horrible hack and nothing but handling of
1335 * discard requests should ever use it.
1336 */
1337void blk_add_request_payload(struct request *rq, struct page *page,
1338                unsigned int len)
1339{
1340        struct bio *bio = rq->bio;
1341
1342        bio->bi_io_vec->bv_page = page;
1343        bio->bi_io_vec->bv_offset = 0;
1344        bio->bi_io_vec->bv_len = len;
1345
1346        bio->bi_size = len;
1347        bio->bi_vcnt = 1;
1348        bio->bi_phys_segments = 1;
1349
1350        rq->__data_len = rq->resid_len = len;
1351        rq->nr_phys_segments = 1;
1352        rq->buffer = bio_data(bio);
1353}
1354EXPORT_SYMBOL_GPL(blk_add_request_payload);
1355
1356static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
1357                                   struct bio *bio)
1358{
1359        const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1360
1361        if (!ll_back_merge_fn(q, req, bio))
1362                return false;
1363
1364        trace_block_bio_backmerge(q, req, bio);
1365
1366        if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1367                blk_rq_set_mixed_merge(req);
1368
1369        req->biotail->bi_next = bio;
1370        req->biotail = bio;
1371        req->__data_len += bio->bi_size;
1372        req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1373
1374        drive_stat_acct(req, 0);
1375        return true;
1376}
1377
1378static bool bio_attempt_front_merge(struct request_queue *q,
1379                                    struct request *req, struct bio *bio)
1380{
1381        const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1382
1383        if (!ll_front_merge_fn(q, req, bio))
1384                return false;
1385
1386        trace_block_bio_frontmerge(q, req, bio);
1387
1388        if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1389                blk_rq_set_mixed_merge(req);
1390
1391        bio->bi_next = req->bio;
1392        req->bio = bio;
1393
1394        /*
1395         * may not be valid. if the low level driver said
1396         * it didn't need a bounce buffer then it better
1397         * not touch req->buffer either...
1398         */
1399        req->buffer = bio_data(bio);
1400        req->__sector = bio->bi_sector;
1401        req->__data_len += bio->bi_size;
1402        req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1403
1404        drive_stat_acct(req, 0);
1405        return true;
1406}
1407
1408/**
1409 * attempt_plug_merge - try to merge with %current's plugged list
1410 * @q: request_queue new bio is being queued at
1411 * @bio: new bio being queued
1412 * @request_count: out parameter for number of traversed plugged requests
1413 *
1414 * Determine whether @bio being queued on @q can be merged with a request
1415 * on %current's plugged list.  Returns %true if merge was successful,
1416 * otherwise %false.
1417 *
1418 * Plugging coalesces IOs from the same issuer for the same purpose without
1419 * going through @q->queue_lock.  As such it's more of an issuing mechanism
1420 * than scheduling, and the request, while may have elvpriv data, is not
1421 * added on the elevator at this point.  In addition, we don't have
1422 * reliable access to the elevator outside queue lock.  Only check basic
1423 * merging parameters without querying the elevator.
1424 */
1425static bool attempt_plug_merge(struct request_queue *q, struct bio *bio,
1426                               unsigned int *request_count)
1427{
1428        struct blk_plug *plug;
1429        struct request *rq;
1430        bool ret = false;
1431
1432        plug = current->plug;
1433        if (!plug)
1434                goto out;
1435        *request_count = 0;
1436
1437        list_for_each_entry_reverse(rq, &plug->list, queuelist) {
1438                int el_ret;
1439
1440                if (rq->q == q)
1441                        (*request_count)++;
1442
1443                if (rq->q != q || !blk_rq_merge_ok(rq, bio))
1444                        continue;
1445
1446                el_ret = blk_try_merge(rq, bio);
1447                if (el_ret == ELEVATOR_BACK_MERGE) {
1448                        ret = bio_attempt_back_merge(q, rq, bio);
1449                        if (ret)
1450                                break;
1451                } else if (el_ret == ELEVATOR_FRONT_MERGE) {
1452                        ret = bio_attempt_front_merge(q, rq, bio);
1453                        if (ret)
1454                                break;
1455                }
1456        }
1457out:
1458        return ret;
1459}
1460
1461void init_request_from_bio(struct request *req, struct bio *bio)
1462{
1463        req->cmd_type = REQ_TYPE_FS;
1464
1465        req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK;
1466        if (bio->bi_rw & REQ_RAHEAD)
1467                req->cmd_flags |= REQ_FAILFAST_MASK;
1468
1469        req->errors = 0;
1470        req->__sector = bio->bi_sector;
1471        req->ioprio = bio_prio(bio);
1472        blk_rq_bio_prep(req->q, req, bio);
1473}
1474
1475void blk_queue_bio(struct request_queue *q, struct bio *bio)
1476{
1477        const bool sync = !!(bio->bi_rw & REQ_SYNC);
1478        struct blk_plug *plug;
1479        int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
1480        struct request *req;
1481        unsigned int request_count = 0;
1482
1483        /*
1484         * low level driver can indicate that it wants pages above a
1485         * certain limit bounced to low memory (ie for highmem, or even
1486         * ISA dma in theory)
1487         */
1488        blk_queue_bounce(q, &bio);
1489
1490        if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1491                bio_endio(bio, -EIO);
1492                return;
1493        }
1494
1495        if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1496                spin_lock_irq(q->queue_lock);
1497                where = ELEVATOR_INSERT_FLUSH;
1498                goto get_rq;
1499        }
1500
1501        /*
1502         * Check if we can merge with the plugged list before grabbing
1503         * any locks.
1504         */
1505        if (attempt_plug_merge(q, bio, &request_count))
1506                return;
1507
1508        spin_lock_irq(q->queue_lock);
1509
1510        el_ret = elv_merge(q, &req, bio);
1511        if (el_ret == ELEVATOR_BACK_MERGE) {
1512                if (bio_attempt_back_merge(q, req, bio)) {
1513                        elv_bio_merged(q, req, bio);
1514                        if (!attempt_back_merge(q, req))
1515                                elv_merged_request(q, req, el_ret);
1516                        goto out_unlock;
1517                }
1518        } else if (el_ret == ELEVATOR_FRONT_MERGE) {
1519                if (bio_attempt_front_merge(q, req, bio)) {
1520                        elv_bio_merged(q, req, bio);
1521                        if (!attempt_front_merge(q, req))
1522                                elv_merged_request(q, req, el_ret);
1523                        goto out_unlock;
1524                }
1525        }
1526
1527get_rq:
1528        /*
1529         * This sync check and mask will be re-done in init_request_from_bio(),
1530         * but we need to set it earlier to expose the sync flag to the
1531         * rq allocator and io schedulers.
1532         */
1533        rw_flags = bio_data_dir(bio);
1534        if (sync)
1535                rw_flags |= REQ_SYNC;
1536
1537        /*
1538         * Grab a free request. This is might sleep but can not fail.
1539         * Returns with the queue unlocked.
1540         */
1541        req = get_request(q, rw_flags, bio, GFP_NOIO);
1542        if (unlikely(!req)) {
1543                bio_endio(bio, -ENODEV);        /* @q is dead */
1544                goto out_unlock;
1545        }
1546
1547        /*
1548         * After dropping the lock and possibly sleeping here, our request
1549         * may now be mergeable after it had proven unmergeable (above).
1550         * We don't worry about that case for efficiency. It won't happen
1551         * often, and the elevators are able to handle it.
1552         */
1553        init_request_from_bio(req, bio);
1554
1555        if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
1556                req->cpu = raw_smp_processor_id();
1557
1558        plug = current->plug;
1559        if (plug) {
1560                /*
1561                 * If this is the first request added after a plug, fire
1562                 * of a plug trace. If others have been added before, check
1563                 * if we have multiple devices in this plug. If so, make a
1564                 * note to sort the list before dispatch.
1565                 */
1566                if (list_empty(&plug->list))
1567                        trace_block_plug(q);
1568                else {
1569                        if (request_count >= BLK_MAX_REQUEST_COUNT) {
1570                                blk_flush_plug_list(plug, false);
1571                                trace_block_plug(q);
1572                        }
1573                }
1574                list_add_tail(&req->queuelist, &plug->list);
1575                drive_stat_acct(req, 1);
1576        } else {
1577                spin_lock_irq(q->queue_lock);
1578                add_acct_request(q, req, where);
1579                __blk_run_queue(q);
1580out_unlock:
1581                spin_unlock_irq(q->queue_lock);
1582        }
1583}
1584EXPORT_SYMBOL_GPL(blk_queue_bio);       /* for device mapper only */
1585
1586/*
1587 * If bio->bi_dev is a partition, remap the location
1588 */
1589static inline void blk_partition_remap(struct bio *bio)
1590{
1591        struct block_device *bdev = bio->bi_bdev;
1592
1593        if (bio_sectors(bio) && bdev != bdev->bd_contains) {
1594                struct hd_struct *p = bdev->bd_part;
1595
1596                bio->bi_sector += p->start_sect;
1597                bio->bi_bdev = bdev->bd_contains;
1598
1599                trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio,
1600                                      bdev->bd_dev,
1601                                      bio->bi_sector - p->start_sect);
1602        }
1603}
1604
1605static void handle_bad_sector(struct bio *bio)
1606{
1607        char b[BDEVNAME_SIZE];
1608
1609        printk(KERN_INFO "attempt to access beyond end of device\n");
1610        printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
1611                        bdevname(bio->bi_bdev, b),
1612                        bio->bi_rw,
1613                        (unsigned long long)bio_end_sector(bio),
1614                        (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
1615
1616        set_bit(BIO_EOF, &bio->bi_flags);
1617}
1618
1619#ifdef CONFIG_FAIL_MAKE_REQUEST
1620
1621static DECLARE_FAULT_ATTR(fail_make_request);
1622
1623static int __init setup_fail_make_request(char *str)
1624{
1625        return setup_fault_attr(&fail_make_request, str);
1626}
1627__setup("fail_make_request=", setup_fail_make_request);
1628
1629static bool should_fail_request(struct hd_struct *part, unsigned int bytes)
1630{
1631        return part->make_it_fail && should_fail(&fail_make_request, bytes);
1632}
1633
1634static int __init fail_make_request_debugfs(void)
1635{
1636        struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
1637                                                NULL, &fail_make_request);
1638
1639        return IS_ERR(dir) ? PTR_ERR(dir) : 0;
1640}
1641
1642late_initcall(fail_make_request_debugfs);
1643
1644#else /* CONFIG_FAIL_MAKE_REQUEST */
1645
1646static inline bool should_fail_request(struct hd_struct *part,
1647                                        unsigned int bytes)
1648{
1649        return false;
1650}
1651
1652#endif /* CONFIG_FAIL_MAKE_REQUEST */
1653
1654/*
1655 * Check whether this bio extends beyond the end of the device.
1656 */
1657static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
1658{
1659        sector_t maxsector;
1660
1661        if (!nr_sectors)
1662                return 0;
1663
1664        /* Test device or partition size, when known. */
1665        maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
1666        if (maxsector) {
1667                sector_t sector = bio->bi_sector;
1668
1669                if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
1670                        /*
1671                         * This may well happen - the kernel calls bread()
1672                         * without checking the size of the device, e.g., when
1673                         * mounting a device.
1674                         */
1675                        handle_bad_sector(bio);
1676                        return 1;
1677                }
1678        }
1679
1680        return 0;
1681}
1682
1683static noinline_for_stack bool
1684generic_make_request_checks(struct bio *bio)
1685{
1686        struct request_queue *q;
1687        int nr_sectors = bio_sectors(bio);
1688        int err = -EIO;
1689        char b[BDEVNAME_SIZE];
1690        struct hd_struct *part;
1691
1692        might_sleep();
1693
1694        if (bio_check_eod(bio, nr_sectors))
1695                goto end_io;
1696
1697        q = bdev_get_queue(bio->bi_bdev);
1698        if (unlikely(!q)) {
1699                printk(KERN_ERR
1700                       "generic_make_request: Trying to access "
1701                        "nonexistent block-device %s (%Lu)\n",
1702                        bdevname(bio->bi_bdev, b),
1703                        (long long) bio->bi_sector);
1704                goto end_io;
1705        }
1706
1707        if (likely(bio_is_rw(bio) &&
1708                   nr_sectors > queue_max_hw_sectors(q))) {
1709                printk(KERN_ERR "bio too big device %s (%u > %u)\n",
1710                       bdevname(bio->bi_bdev, b),
1711                       bio_sectors(bio),
1712                       queue_max_hw_sectors(q));
1713                goto end_io;
1714        }
1715
1716        part = bio->bi_bdev->bd_part;
1717        if (should_fail_request(part, bio->bi_size) ||
1718            should_fail_request(&part_to_disk(part)->part0,
1719                                bio->bi_size))
1720                goto end_io;
1721
1722        /*
1723         * If this device has partitions, remap block n
1724         * of partition p to block n+start(p) of the disk.
1725         */
1726        blk_partition_remap(bio);
1727
1728        if (bio_check_eod(bio, nr_sectors))
1729                goto end_io;
1730
1731        /*
1732         * Filter flush bio's early so that make_request based
1733         * drivers without flush support don't have to worry
1734         * about them.
1735         */
1736        if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
1737                bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
1738                if (!nr_sectors) {
1739                        err = 0;
1740                        goto end_io;
1741                }
1742        }
1743
1744        if ((bio->bi_rw & REQ_DISCARD) &&
1745            (!blk_queue_discard(q) ||
1746             ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
1747                err = -EOPNOTSUPP;
1748                goto end_io;
1749        }
1750
1751        if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
1752                err = -EOPNOTSUPP;
1753                goto end_io;
1754        }
1755
1756        /*
1757         * Various block parts want %current->io_context and lazy ioc
1758         * allocation ends up trading a lot of pain for a small amount of
1759         * memory.  Just allocate it upfront.  This may fail and block
1760         * layer knows how to live with it.
1761         */
1762        create_io_context(GFP_ATOMIC, q->node);
1763
1764        if (blk_throtl_bio(q, bio))
1765                return false;   /* throttled, will be resubmitted later */
1766
1767        trace_block_bio_queue(q, bio);
1768        return true;
1769
1770end_io:
1771        bio_endio(bio, err);
1772        return false;
1773}
1774
1775/**
1776 * generic_make_request - hand a buffer to its device driver for I/O
1777 * @bio:  The bio describing the location in memory and on the device.
1778 *
1779 * generic_make_request() is used to make I/O requests of block
1780 * devices. It is passed a &struct bio, which describes the I/O that needs
1781 * to be done.
1782 *
1783 * generic_make_request() does not return any status.  The
1784 * success/failure status of the request, along with notification of
1785 * completion, is delivered asynchronously through the bio->bi_end_io
1786 * function described (one day) else where.
1787 *
1788 * The caller of generic_make_request must make sure that bi_io_vec
1789 * are set to describe the memory buffer, and that bi_dev and bi_sector are
1790 * set to describe the device address, and the
1791 * bi_end_io and optionally bi_private are set to describe how
1792 * completion notification should be signaled.
1793 *
1794 * generic_make_request and the drivers it calls may use bi_next if this
1795 * bio happens to be merged with someone else, and may resubmit the bio to
1796 * a lower device by calling into generic_make_request recursively, which
1797 * means the bio should NOT be touched after the call to ->make_request_fn.
1798 */
1799void generic_make_request(struct bio *bio)
1800{
1801        struct bio_list bio_list_on_stack;
1802
1803        if (!generic_make_request_checks(bio))
1804                return;
1805
1806        /*
1807         * We only want one ->make_request_fn to be active at a time, else
1808         * stack usage with stacked devices could be a problem.  So use
1809         * current->bio_list to keep a list of requests submited by a
1810         * make_request_fn function.  current->bio_list is also used as a
1811         * flag to say if generic_make_request is currently active in this
1812         * task or not.  If it is NULL, then no make_request is active.  If
1813         * it is non-NULL, then a make_request is active, and new requests
1814         * should be added at the tail
1815         */
1816        if (current->bio_list) {
1817                bio_list_add(current->bio_list, bio);
1818                return;
1819        }
1820
1821        /* following loop may be a bit non-obvious, and so deserves some
1822         * explanation.
1823         * Before entering the loop, bio->bi_next is NULL (as all callers
1824         * ensure that) so we have a list with a single bio.
1825         * We pretend that we have just taken it off a longer list, so
1826         * we assign bio_list to a pointer to the bio_list_on_stack,
1827         * thus initialising the bio_list of new bios to be
1828         * added.  ->make_request() may indeed add some more bios
1829         * through a recursive call to generic_make_request.  If it
1830         * did, we find a non-NULL value in bio_list and re-enter the loop
1831         * from the top.  In this case we really did just take the bio
1832         * of the top of the list (no pretending) and so remove it from
1833         * bio_list, and call into ->make_request() again.
1834         */
1835        BUG_ON(bio->bi_next);
1836        bio_list_init(&bio_list_on_stack);
1837        current->bio_list = &bio_list_on_stack;
1838        do {
1839                struct request_queue *q = bdev_get_queue(bio->bi_bdev);
1840
1841                q->make_request_fn(q, bio);
1842
1843                bio = bio_list_pop(current->bio_list);
1844        } while (bio);
1845        current->bio_list = NULL; /* deactivate */
1846}
1847EXPORT_SYMBOL(generic_make_request);
1848
1849/**
1850 * submit_bio - submit a bio to the block device layer for I/O
1851 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
1852 * @bio: The &struct bio which describes the I/O
1853 *
1854 * submit_bio() is very similar in purpose to generic_make_request(), and
1855 * uses that function to do most of the work. Both are fairly rough
1856 * interfaces; @bio must be presetup and ready for I/O.
1857 *
1858 */
1859void submit_bio(int rw, struct bio *bio)
1860{
1861        bio->bi_rw |= rw;
1862
1863        /*
1864         * If it's a regular read/write or a barrier with data attached,
1865         * go through the normal accounting stuff before submission.
1866         */
1867        if (bio_has_data(bio)) {
1868                unsigned int count;
1869
1870                if (unlikely(rw & REQ_WRITE_SAME))
1871                        count = bdev_logical_block_size(bio->bi_bdev) >> 9;
1872                else
1873                        count = bio_sectors(bio);
1874
1875                if (rw & WRITE) {
1876                        count_vm_events(PGPGOUT, count);
1877                } else {
1878                        task_io_account_read(bio->bi_size);
1879                        count_vm_events(PGPGIN, count);
1880                }
1881
1882                if (unlikely(block_dump)) {
1883                        char b[BDEVNAME_SIZE];
1884                        printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
1885                        current->comm, task_pid_nr(current),
1886                                (rw & WRITE) ? "WRITE" : "READ",
1887                                (unsigned long long)bio->bi_sector,
1888                                bdevname(bio->bi_bdev, b),
1889                                count);
1890                }
1891        }
1892
1893        generic_make_request(bio);
1894}
1895EXPORT_SYMBOL(submit_bio);
1896
1897/**
1898 * blk_rq_check_limits - Helper function to check a request for the queue limit
1899 * @q:  the queue
1900 * @rq: the request being checked
1901 *
1902 * Description:
1903 *    @rq may have been made based on weaker limitations of upper-level queues
1904 *    in request stacking drivers, and it may violate the limitation of @q.
1905 *    Since the block layer and the underlying device driver trust @rq
1906 *    after it is inserted to @q, it should be checked against @q before
1907 *    the insertion using this generic function.
1908 *
1909 *    This function should also be useful for request stacking drivers
1910 *    in some cases below, so export this function.
1911 *    Request stacking drivers like request-based dm may change the queue
1912 *    limits while requests are in the queue (e.g. dm's table swapping).
1913 *    Such request stacking drivers should check those requests agaist
1914 *    the new queue limits again when they dispatch those requests,
1915 *    although such checkings are also done against the old queue limits
1916 *    when submitting requests.
1917 */
1918int blk_rq_check_limits(struct request_queue *q, struct request *rq)
1919{
1920        if (!rq_mergeable(rq))
1921                return 0;
1922
1923        if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) {
1924                printk(KERN_ERR "%s: over max size limit.\n", __func__);
1925                return -EIO;
1926        }
1927
1928        /*
1929         * queue's settings related to segment counting like q->bounce_pfn
1930         * may differ from that of other stacking queues.
1931         * Recalculate it to check the request correctly on this queue's
1932         * limitation.
1933         */
1934        blk_recalc_rq_segments(rq);
1935        if (rq->nr_phys_segments > queue_max_segments(q)) {
1936                printk(KERN_ERR "%s: over max segments limit.\n", __func__);
1937                return -EIO;
1938        }
1939
1940        return 0;
1941}
1942EXPORT_SYMBOL_GPL(blk_rq_check_limits);
1943
1944/**
1945 * blk_insert_cloned_request - Helper for stacking drivers to submit a request
1946 * @q:  the queue to submit the request
1947 * @rq: the request being queued
1948 */
1949int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
1950{
1951        unsigned long flags;
1952        int where = ELEVATOR_INSERT_BACK;
1953
1954        if (blk_rq_check_limits(q, rq))
1955                return -EIO;
1956
1957        if (rq->rq_disk &&
1958            should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
1959                return -EIO;
1960
1961        spin_lock_irqsave(q->queue_lock, flags);
1962        if (unlikely(blk_queue_dying(q))) {
1963                spin_unlock_irqrestore(q->queue_lock, flags);
1964                return -ENODEV;
1965        }
1966
1967        /*
1968         * Submitting request must be dequeued before calling this function
1969         * because it will be linked to another request_queue
1970         */
1971        BUG_ON(blk_queued_rq(rq));
1972
1973        if (rq->cmd_flags & (REQ_FLUSH|REQ_FUA))
1974                where = ELEVATOR_INSERT_FLUSH;
1975
1976        add_acct_request(q, rq, where);
1977        if (where == ELEVATOR_INSERT_FLUSH)
1978                __blk_run_queue(q);
1979        spin_unlock_irqrestore(q->queue_lock, flags);
1980
1981        return 0;
1982}
1983EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
1984
1985/**
1986 * blk_rq_err_bytes - determine number of bytes till the next failure boundary
1987 * @rq: request to examine
1988 *
1989 * Description:
1990 *     A request could be merge of IOs which require different failure
1991 *     handling.  This function determines the number of bytes which
1992 *     can be failed from the beginning of the request without
1993 *     crossing into area which need to be retried further.
1994 *
1995 * Return:
1996 *     The number of bytes to fail.
1997 *
1998 * Context:
1999 *     queue_lock must be held.
2000 */
2001unsigned int blk_rq_err_bytes(const struct request *rq)
2002{
2003        unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
2004        unsigned int bytes = 0;
2005        struct bio *bio;
2006
2007        if (!(rq->cmd_flags & REQ_MIXED_MERGE))
2008                return blk_rq_bytes(rq);
2009
2010        /*
2011         * Currently the only 'mixing' which can happen is between
2012         * different fastfail types.  We can safely fail portions
2013         * which have all the failfast bits that the first one has -
2014         * the ones which are at least as eager to fail as the first
2015         * one.
2016         */
2017        for (bio = rq->bio; bio; bio = bio->bi_next) {
2018                if ((bio->bi_rw & ff) != ff)
2019                        break;
2020                bytes += bio->bi_size;
2021        }
2022
2023        /* this could lead to infinite loop */
2024        BUG_ON(blk_rq_bytes(rq) && !bytes);
2025        return bytes;
2026}
2027EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
2028
2029static void blk_account_io_completion(struct request *req, unsigned int bytes)
2030{
2031        if (blk_do_io_stat(req)) {
2032                const int rw = rq_data_dir(req);
2033                struct hd_struct *part;
2034                int cpu;
2035
2036                cpu = part_stat_lock();
2037                part = req->part;
2038                part_stat_add(cpu, part, sectors[rw], bytes >> 9);
2039                part_stat_unlock();
2040        }
2041}
2042
2043static void blk_account_io_done(struct request *req)
2044{
2045        /*
2046         * Account IO completion.  flush_rq isn't accounted as a
2047         * normal IO on queueing nor completion.  Accounting the
2048         * containing request is enough.
2049         */
2050        if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) {
2051                unsigned long duration = jiffies - req->start_time;
2052                const int rw = rq_data_dir(req);
2053                struct hd_struct *part;
2054                int cpu;
2055
2056                cpu = part_stat_lock();
2057                part = req->part;
2058
2059                part_stat_inc(cpu, part, ios[rw]);
2060                part_stat_add(cpu, part, ticks[rw], duration);
2061                part_round_stats(cpu, part);
2062                part_dec_in_flight(part, rw);
2063
2064                hd_struct_put(part);
2065                part_stat_unlock();
2066        }
2067}
2068
2069#ifdef CONFIG_PM_RUNTIME
2070/*
2071 * Don't process normal requests when queue is suspended
2072 * or in the process of suspending/resuming
2073 */
2074static struct request *blk_pm_peek_request(struct request_queue *q,
2075                                           struct request *rq)
2076{
2077        if (q->dev && (q->rpm_status == RPM_SUSPENDED ||
2078            (q->rpm_status != RPM_ACTIVE && !(rq->cmd_flags & REQ_PM))))
2079                return NULL;
2080        else
2081                return rq;
2082}
2083#else
2084static inline struct request *blk_pm_peek_request(struct request_queue *q,
2085                                                  struct request *rq)
2086{
2087        return rq;
2088}
2089#endif
2090
2091/**
2092 * blk_peek_request - peek at the top of a request queue
2093 * @q: request queue to peek at
2094 *
2095 * Description:
2096 *     Return the request at the top of @q.  The returned request
2097 *     should be started using blk_start_request() before LLD starts
2098 *     processing it.
2099 *
2100 * Return:
2101 *     Pointer to the request at the top of @q if available.  Null
2102 *     otherwise.
2103 *
2104 * Context:
2105 *     queue_lock must be held.
2106 */
2107struct request *blk_peek_request(struct request_queue *q)
2108{
2109        struct request *rq;
2110        int ret;
2111
2112        while ((rq = __elv_next_request(q)) != NULL) {
2113
2114                rq = blk_pm_peek_request(q, rq);
2115                if (!rq)
2116                        break;
2117
2118                if (!(rq->cmd_flags & REQ_STARTED)) {
2119                        /*
2120                         * This is the first time the device driver
2121                         * sees this request (possibly after
2122                         * requeueing).  Notify IO scheduler.
2123                         */
2124                        if (rq->cmd_flags & REQ_SORTED)
2125                                elv_activate_rq(q, rq);
2126
2127                        /*
2128                         * just mark as started even if we don't start
2129                         * it, a request that has been delayed should
2130                         * not be passed by new incoming requests
2131                         */
2132                        rq->cmd_flags |= REQ_STARTED;
2133                        trace_block_rq_issue(q, rq);
2134                }
2135
2136                if (!q->boundary_rq || q->boundary_rq == rq) {
2137                        q->end_sector = rq_end_sector(rq);
2138                        q->boundary_rq = NULL;
2139                }
2140
2141                if (rq->cmd_flags & REQ_DONTPREP)
2142                        break;
2143
2144                if (q->dma_drain_size && blk_rq_bytes(rq)) {
2145                        /*
2146                         * make sure space for the drain appears we
2147                         * know we can do this because max_hw_segments
2148                         * has been adjusted to be one fewer than the
2149                         * device can handle
2150                         */
2151                        rq->nr_phys_segments++;
2152                }
2153
2154                if (!q->prep_rq_fn)
2155                        break;
2156
2157                ret = q->prep_rq_fn(q, rq);
2158                if (ret == BLKPREP_OK) {
2159                        break;
2160                } else if (ret == BLKPREP_DEFER) {
2161                        /*
2162                         * the request may have been (partially) prepped.
2163                         * we need to keep this request in the front to
2164                         * avoid resource deadlock.  REQ_STARTED will
2165                         * prevent other fs requests from passing this one.
2166                         */
2167                        if (q->dma_drain_size && blk_rq_bytes(rq) &&
2168                            !(rq->cmd_flags & REQ_DONTPREP)) {
2169                                /*
2170                                 * remove the space for the drain we added
2171                                 * so that we don't add it again
2172                                 */
2173                                --rq->nr_phys_segments;
2174                        }
2175
2176                        rq = NULL;
2177                        break;
2178                } else if (ret == BLKPREP_KILL) {
2179                        rq->cmd_flags |= REQ_QUIET;
2180                        /*
2181                         * Mark this request as started so we don't trigger
2182                         * any debug logic in the end I/O path.
2183                         */
2184                        blk_start_request(rq);
2185                        __blk_end_request_all(rq, -EIO);
2186                } else {
2187                        printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
2188                        break;
2189                }
2190        }
2191
2192        return rq;
2193}
2194EXPORT_SYMBOL(blk_peek_request);
2195
2196void blk_dequeue_request(struct request *rq)
2197{
2198        struct request_queue *q = rq->q;
2199
2200        BUG_ON(list_empty(&rq->queuelist));
2201        BUG_ON(ELV_ON_HASH(rq));
2202
2203        list_del_init(&rq->queuelist);
2204
2205        /*
2206         * the time frame between a request being removed from the lists
2207         * and to it is freed is accounted as io that is in progress at
2208         * the driver side.
2209         */
2210        if (blk_account_rq(rq)) {
2211                q->in_flight[rq_is_sync(rq)]++;
2212                set_io_start_time_ns(rq);
2213        }
2214}
2215
2216/**
2217 * blk_start_request - start request processing on the driver
2218 * @req: request to dequeue
2219 *
2220 * Description:
2221 *     Dequeue @req and start timeout timer on it.  This hands off the
2222 *     request to the driver.
2223 *
2224 *     Block internal functions which don't want to start timer should
2225 *     call blk_dequeue_request().
2226 *
2227 * Context:
2228 *     queue_lock must be held.
2229 */
2230void blk_start_request(struct request *req)
2231{
2232        blk_dequeue_request(req);
2233
2234        /*
2235         * We are now handing the request to the hardware, initialize
2236         * resid_len to full count and add the timeout handler.
2237         */
2238        req->resid_len = blk_rq_bytes(req);
2239        if (unlikely(blk_bidi_rq(req)))
2240                req->next_rq->resid_len = blk_rq_bytes(req->next_rq);
2241
2242        BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
2243        blk_add_timer(req);
2244}
2245EXPORT_SYMBOL(blk_start_request);
2246
2247/**
2248 * blk_fetch_request - fetch a request from a request queue
2249 * @q: request queue to fetch a request from
2250 *
2251 * Description:
2252 *     Return the request at the top of @q.  The request is started on
2253 *     return and LLD can start processing it immediately.
2254 *
2255 * Return:
2256 *     Pointer to the request at the top of @q if available.  Null
2257 *     otherwise.
2258 *
2259 * Context:
2260 *     queue_lock must be held.
2261 */
2262struct request *blk_fetch_request(struct request_queue *q)
2263{
2264        struct request *rq;
2265
2266        rq = blk_peek_request(q);
2267        if (rq)
2268                blk_start_request(rq);
2269        return rq;
2270}
2271EXPORT_SYMBOL(blk_fetch_request);
2272
2273/**
2274 * blk_update_request - Special helper function for request stacking drivers
2275 * @req:      the request being processed
2276 * @error:    %0 for success, < %0 for error
2277 * @nr_bytes: number of bytes to complete @req
2278 *
2279 * Description:
2280 *     Ends I/O on a number of bytes attached to @req, but doesn't complete
2281 *     the request structure even if @req doesn't have leftover.
2282 *     If @req has leftover, sets it up for the next range of segments.
2283 *
2284 *     This special helper function is only for request stacking drivers
2285 *     (e.g. request-based dm) so that they can handle partial completion.
2286 *     Actual device drivers should use blk_end_request instead.
2287 *
2288 *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
2289 *     %false return from this function.
2290 *
2291 * Return:
2292 *     %false - this request doesn't have any more data
2293 *     %true  - this request has more data
2294 **/
2295bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2296{
2297        int total_bytes;
2298
2299        if (!req->bio)
2300                return false;
2301
2302        trace_block_rq_complete(req->q, req, nr_bytes);
2303
2304        /*
2305         * For fs requests, rq is just carrier of independent bio's
2306         * and each partial completion should be handled separately.
2307         * Reset per-request error on each partial completion.
2308         *
2309         * TODO: tj: This is too subtle.  It would be better to let
2310         * low level drivers do what they see fit.
2311         */
2312        if (req->cmd_type == REQ_TYPE_FS)
2313                req->errors = 0;
2314
2315        if (error && req->cmd_type == REQ_TYPE_FS &&
2316            !(req->cmd_flags & REQ_QUIET)) {
2317                char *error_type;
2318
2319                switch (error) {
2320                case -ENOLINK:
2321                        error_type = "recoverable transport";
2322                        break;
2323                case -EREMOTEIO:
2324                        error_type = "critical target";
2325                        break;
2326                case -EBADE:
2327                        error_type = "critical nexus";
2328                        break;
2329                case -EIO:
2330                default:
2331                        error_type = "I/O";
2332                        break;
2333                }
2334                printk_ratelimited(KERN_ERR "end_request: %s error, dev %s, sector %llu\n",
2335                                   error_type, req->rq_disk ?
2336                                   req->rq_disk->disk_name : "?",
2337                                   (unsigned long long)blk_rq_pos(req));
2338
2339        }
2340
2341        blk_account_io_completion(req, nr_bytes);
2342
2343        total_bytes = 0;
2344        while (req->bio) {
2345                struct bio *bio = req->bio;
2346                unsigned bio_bytes = min(bio->bi_size, nr_bytes);
2347
2348                if (bio_bytes == bio->bi_size)
2349                        req->bio = bio->bi_next;
2350
2351                req_bio_endio(req, bio, bio_bytes, error);
2352
2353                total_bytes += bio_bytes;
2354                nr_bytes -= bio_bytes;
2355
2356                if (!nr_bytes)
2357                        break;
2358        }
2359
2360        /*
2361         * completely done
2362         */
2363        if (!req->bio) {
2364                /*
2365                 * Reset counters so that the request stacking driver
2366                 * can find how many bytes remain in the request
2367                 * later.
2368                 */
2369                req->__data_len = 0;
2370                return false;
2371        }
2372
2373        req->__data_len -= total_bytes;
2374        req->buffer = bio_data(req->bio);
2375
2376        /* update sector only for requests with clear definition of sector */
2377        if (req->cmd_type == REQ_TYPE_FS)
2378                req->__sector += total_bytes >> 9;
2379
2380        /* mixed attributes always follow the first bio */
2381        if (req->cmd_flags & REQ_MIXED_MERGE) {
2382                req->cmd_flags &= ~REQ_FAILFAST_MASK;
2383                req->cmd_flags |= req->bio->bi_rw & REQ_FAILFAST_MASK;
2384        }
2385
2386        /*
2387         * If total number of sectors is less than the first segment
2388         * size, something has gone terribly wrong.
2389         */
2390        if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
2391                blk_dump_rq_flags(req, "request botched");
2392                req->__data_len = blk_rq_cur_bytes(req);
2393        }
2394
2395        /* recalculate the number of segments */
2396        blk_recalc_rq_segments(req);
2397
2398        return true;
2399}
2400EXPORT_SYMBOL_GPL(blk_update_request);
2401
2402static bool blk_update_bidi_request(struct request *rq, int error,
2403                                    unsigned int nr_bytes,
2404                                    unsigned int bidi_bytes)
2405{
2406        if (blk_update_request(rq, error, nr_bytes))
2407                return true;
2408
2409        /* Bidi request must be completed as a whole */
2410        if (unlikely(blk_bidi_rq(rq)) &&
2411            blk_update_request(rq->next_rq, error, bidi_bytes))
2412                return true;
2413
2414        if (blk_queue_add_random(rq->q))
2415                add_disk_randomness(rq->rq_disk);
2416
2417        return false;
2418}
2419
2420/**
2421 * blk_unprep_request - unprepare a request
2422 * @req:        the request
2423 *
2424 * This function makes a request ready for complete resubmission (or
2425 * completion).  It happens only after all error handling is complete,
2426 * so represents the appropriate moment to deallocate any resources
2427 * that were allocated to the request in the prep_rq_fn.  The queue
2428 * lock is held when calling this.
2429 */
2430void blk_unprep_request(struct request *req)
2431{
2432        struct request_queue *q = req->q;
2433
2434        req->cmd_flags &= ~REQ_DONTPREP;
2435        if (q->unprep_rq_fn)
2436                q->unprep_rq_fn(q, req);
2437}
2438EXPORT_SYMBOL_GPL(blk_unprep_request);
2439
2440/*
2441 * queue lock must be held
2442 */
2443static void blk_finish_request(struct request *req, int error)
2444{
2445        if (blk_rq_tagged(req))
2446                blk_queue_end_tag(req->q, req);
2447
2448        BUG_ON(blk_queued_rq(req));
2449
2450        if (unlikely(laptop_mode) && req->cmd_type == REQ_TYPE_FS)
2451                laptop_io_completion(&req->q->backing_dev_info);
2452
2453        blk_delete_timer(req);
2454
2455        if (req->cmd_flags & REQ_DONTPREP)
2456                blk_unprep_request(req);
2457
2458
2459        blk_account_io_done(req);
2460
2461        if (req->end_io)
2462                req->end_io(req, error);
2463        else {
2464                if (blk_bidi_rq(req))
2465                        __blk_put_request(req->next_rq->q, req->next_rq);
2466
2467                __blk_put_request(req->q, req);
2468        }
2469}
2470
2471/**
2472 * blk_end_bidi_request - Complete a bidi request
2473 * @rq:         the request to complete
2474 * @error:      %0 for success, < %0 for error
2475 * @nr_bytes:   number of bytes to complete @rq
2476 * @bidi_bytes: number of bytes to complete @rq->next_rq
2477 *
2478 * Description:
2479 *     Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
2480 *     Drivers that supports bidi can safely call this member for any
2481 *     type of request, bidi or uni.  In the later case @bidi_bytes is
2482 *     just ignored.
2483 *
2484 * Return:
2485 *     %false - we are done with this request
2486 *     %true  - still buffers pending for this request
2487 **/
2488static bool blk_end_bidi_request(struct request *rq, int error,
2489                                 unsigned int nr_bytes, unsigned int bidi_bytes)
2490{
2491        struct request_queue *q = rq->q;
2492        unsigned long flags;
2493
2494        if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
2495                return true;
2496
2497        spin_lock_irqsave(q->queue_lock, flags);
2498        blk_finish_request(rq, error);
2499        spin_unlock_irqrestore(q->queue_lock, flags);
2500
2501        return false;
2502}
2503
2504/**
2505 * __blk_end_bidi_request - Complete a bidi request with queue lock held
2506 * @rq:         the request to complete
2507 * @error:      %0 for success, < %0 for error
2508 * @nr_bytes:   number of bytes to complete @rq
2509 * @bidi_bytes: number of bytes to complete @rq->next_rq
2510 *
2511 * Description:
2512 *     Identical to blk_end_bidi_request() except that queue lock is
2513 *     assumed to be locked on entry and remains so on return.
2514 *
2515 * Return:
2516 *     %false - we are done with this request
2517 *     %true  - still buffers pending for this request
2518 **/
2519bool __blk_end_bidi_request(struct request *rq, int error,
2520                                   unsigned int nr_bytes, unsigned int bidi_bytes)
2521{
2522        if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
2523                return true;
2524
2525        blk_finish_request(rq, error);
2526
2527        return false;
2528}
2529
2530/**
2531 * blk_end_request - Helper function for drivers to complete the request.
2532 * @rq:       the request being processed
2533 * @error:    %0 for success, < %0 for error
2534 * @nr_bytes: number of bytes to complete
2535 *
2536 * Description:
2537 *     Ends I/O on a number of bytes attached to @rq.
2538 *     If @rq has leftover, sets it up for the next range of segments.
2539 *
2540 * Return:
2541 *     %false - we are done with this request
2542 *     %true  - still buffers pending for this request
2543 **/
2544bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
2545{
2546        return blk_end_bidi_request(rq, error, nr_bytes, 0);
2547}
2548EXPORT_SYMBOL(blk_end_request);
2549
2550/**
2551 * blk_end_request_all - Helper function for drives to finish the request.
2552 * @rq: the request to finish
2553 * @error: %0 for success, < %0 for error
2554 *
2555 * Description:
2556 *     Completely finish @rq.
2557 */
2558void blk_end_request_all(struct request *rq, int error)
2559{
2560        bool pending;
2561        unsigned int bidi_bytes = 0;
2562
2563        if (unlikely(blk_bidi_rq(rq)))
2564                bidi_bytes = blk_rq_bytes(rq->next_rq);
2565
2566        pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
2567        BUG_ON(pending);
2568}
2569EXPORT_SYMBOL(blk_end_request_all);
2570
2571/**
2572 * blk_end_request_cur - Helper function to finish the current request chunk.
2573 * @rq: the request to finish the current chunk for
2574 * @error: %0 for success, < %0 for error
2575 *
2576 * Description:
2577 *     Complete the current consecutively mapped chunk from @rq.
2578 *
2579 * Return:
2580 *     %false - we are done with this request
2581 *     %true  - still buffers pending for this request
2582 */
2583bool blk_end_request_cur(struct request *rq, int error)
2584{
2585        return blk_end_request(rq, error, blk_rq_cur_bytes(rq));
2586}
2587EXPORT_SYMBOL(blk_end_request_cur);
2588
2589/**
2590 * blk_end_request_err - Finish a request till the next failure boundary.
2591 * @rq: the request to finish till the next failure boundary for
2592 * @error: must be negative errno
2593 *
2594 * Description:
2595 *     Complete @rq till the next failure boundary.
2596 *
2597 * Return:
2598 *     %false - we are done with this request
2599 *     %true  - still buffers pending for this request
2600 */
2601bool blk_end_request_err(struct request *rq, int error)
2602{
2603        WARN_ON(error >= 0);
2604        return blk_end_request(rq, error, blk_rq_err_bytes(rq));
2605}
2606EXPORT_SYMBOL_GPL(blk_end_request_err);
2607
2608/**
2609 * __blk_end_request - Helper function for drivers to complete the request.
2610 * @rq:       the request being processed
2611 * @error:    %0 for success, < %0 for error
2612 * @nr_bytes: number of bytes to complete
2613 *
2614 * Description:
2615 *     Must be called with queue lock held unlike blk_end_request().
2616 *
2617 * Return:
2618 *     %false - we are done with this request
2619 *     %true  - still buffers pending for this request
2620 **/
2621bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
2622{
2623        return __blk_end_bidi_request(rq, error, nr_bytes, 0);
2624}
2625EXPORT_SYMBOL(__blk_end_request);
2626
2627/**
2628 * __blk_end_request_all - Helper function for drives to finish the request.
2629 * @rq: the request to finish
2630 * @error: %0 for success, < %0 for error
2631 *
2632 * Description:
2633 *     Completely finish @rq.  Must be called with queue lock held.
2634 */
2635void __blk_end_request_all(struct request *rq, int error)
2636{
2637        bool pending;
2638        unsigned int bidi_bytes = 0;
2639
2640        if (unlikely(blk_bidi_rq(rq)))
2641                bidi_bytes = blk_rq_bytes(rq->next_rq);
2642
2643        pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
2644        BUG_ON(pending);
2645}
2646EXPORT_SYMBOL(__blk_end_request_all);
2647
2648/**
2649 * __blk_end_request_cur - Helper function to finish the current request chunk.
2650 * @rq: the request to finish the current chunk for
2651 * @error: %0 for success, < %0 for error
2652 *
2653 * Description:
2654 *     Complete the current consecutively mapped chunk from @rq.  Must
2655 *     be called with queue lock held.
2656 *
2657 * Return:
2658 *     %false - we are done with this request
2659 *     %true  - still buffers pending for this request
2660 */
2661bool __blk_end_request_cur(struct request *rq, int error)
2662{
2663        return __blk_end_request(rq, error, blk_rq_cur_bytes(rq));
2664}
2665EXPORT_SYMBOL(__blk_end_request_cur);
2666
2667/**
2668 * __blk_end_request_err - Finish a request till the next failure boundary.
2669 * @rq: the request to finish till the next failure boundary for
2670 * @error: must be negative errno
2671 *
2672 * Description:
2673 *     Complete @rq till the next failure boundary.  Must be called
2674 *     with queue lock held.
2675 *
2676 * Return:
2677 *     %false - we are done with this request
2678 *     %true  - still buffers pending for this request
2679 */
2680bool __blk_end_request_err(struct request *rq, int error)
2681{
2682        WARN_ON(error >= 0);
2683        return __blk_end_request(rq, error, blk_rq_err_bytes(rq));
2684}
2685EXPORT_SYMBOL_GPL(__blk_end_request_err);
2686
2687void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
2688                     struct bio *bio)
2689{
2690        /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */
2691        rq->cmd_flags |= bio->bi_rw & REQ_WRITE;
2692
2693        if (bio_has_data(bio)) {
2694                rq->nr_phys_segments = bio_phys_segments(q, bio);
2695                rq->buffer = bio_data(bio);
2696        }
2697        rq->__data_len = bio->bi_size;
2698        rq->bio = rq->biotail = bio;
2699
2700        if (bio->bi_bdev)
2701                rq->rq_disk = bio->bi_bdev->bd_disk;
2702}
2703
2704#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
2705/**
2706 * rq_flush_dcache_pages - Helper function to flush all pages in a request
2707 * @rq: the request to be flushed
2708 *
2709 * Description:
2710 *     Flush all pages in @rq.
2711 */
2712void rq_flush_dcache_pages(struct request *rq)
2713{
2714        struct req_iterator iter;
2715        struct bio_vec *bvec;
2716
2717        rq_for_each_segment(bvec, rq, iter)
2718                flush_dcache_page(bvec->bv_page);
2719}
2720EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
2721#endif
2722
2723/**
2724 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
2725 * @q : the queue of the device being checked
2726 *
2727 * Description:
2728 *    Check if underlying low-level drivers of a device are busy.
2729 *    If the drivers want to export their busy state, they must set own
2730 *    exporting function using blk_queue_lld_busy() first.
2731 *
2732 *    Basically, this function is used only by request stacking drivers
2733 *    to stop dispatching requests to underlying devices when underlying
2734 *    devices are busy.  This behavior helps more I/O merging on the queue
2735 *    of the request stacking driver and prevents I/O throughput regression
2736 *    on burst I/O load.
2737 *
2738 * Return:
2739 *    0 - Not busy (The request stacking driver should dispatch request)
2740 *    1 - Busy (The request stacking driver should stop dispatching request)
2741 */
2742int blk_lld_busy(struct request_queue *q)
2743{
2744        if (q->lld_busy_fn)
2745                return q->lld_busy_fn(q);
2746
2747        return 0;
2748}
2749EXPORT_SYMBOL_GPL(blk_lld_busy);
2750
2751/**
2752 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
2753 * @rq: the clone request to be cleaned up
2754 *
2755 * Description:
2756 *     Free all bios in @rq for a cloned request.
2757 */
2758void blk_rq_unprep_clone(struct request *rq)
2759{
2760        struct bio *bio;
2761
2762        while ((bio = rq->bio) != NULL) {
2763                rq->bio = bio->bi_next;
2764
2765                bio_put(bio);
2766        }
2767}
2768EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
2769
2770/*
2771 * Copy attributes of the original request to the clone request.
2772 * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied.
2773 */
2774static void __blk_rq_prep_clone(struct request *dst, struct request *src)
2775{
2776        dst->cpu = src->cpu;
2777        dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
2778        dst->cmd_type = src->cmd_type;
2779        dst->__sector = blk_rq_pos(src);
2780        dst->__data_len = blk_rq_bytes(src);
2781        dst->nr_phys_segments = src->nr_phys_segments;
2782        dst->ioprio = src->ioprio;
2783        dst->extra_len = src->extra_len;
2784}
2785
2786/**
2787 * blk_rq_prep_clone - Helper function to setup clone request
2788 * @rq: the request to be setup
2789 * @rq_src: original request to be cloned
2790 * @bs: bio_set that bios for clone are allocated from
2791 * @gfp_mask: memory allocation mask for bio
2792 * @bio_ctr: setup function to be called for each clone bio.
2793 *           Returns %0 for success, non %0 for failure.
2794 * @data: private data to be passed to @bio_ctr
2795 *
2796 * Description:
2797 *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
2798 *     The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense)
2799 *     are not copied, and copying such parts is the caller's responsibility.
2800 *     Also, pages which the original bios are pointing to are not copied
2801 *     and the cloned bios just point same pages.
2802 *     So cloned bios must be completed before original bios, which means
2803 *     the caller must complete @rq before @rq_src.
2804 */
2805int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
2806                      struct bio_set *bs, gfp_t gfp_mask,
2807                      int (*bio_ctr)(struct bio *, struct bio *, void *),
2808                      void *data)
2809{
2810        struct bio *bio, *bio_src;
2811
2812        if (!bs)
2813                bs = fs_bio_set;
2814
2815        blk_rq_init(NULL, rq);
2816
2817        __rq_for_each_bio(bio_src, rq_src) {
2818                bio = bio_clone_bioset(bio_src, gfp_mask, bs);
2819                if (!bio)
2820                        goto free_and_out;
2821
2822                if (bio_ctr && bio_ctr(bio, bio_src, data))
2823                        goto free_and_out;
2824
2825                if (rq->bio) {
2826                        rq->biotail->bi_next = bio;
2827                        rq->biotail = bio;
2828                } else
2829                        rq->bio = rq->biotail = bio;
2830        }
2831
2832        __blk_rq_prep_clone(rq, rq_src);
2833
2834        return 0;
2835
2836free_and_out:
2837        if (bio)
2838                bio_put(bio);
2839        blk_rq_unprep_clone(rq);
2840
2841        return -ENOMEM;
2842}
2843EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
2844
2845int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
2846{
2847        return queue_work(kblockd_workqueue, work);
2848}
2849EXPORT_SYMBOL(kblockd_schedule_work);
2850
2851int kblockd_schedule_delayed_work(struct request_queue *q,
2852                        struct delayed_work *dwork, unsigned long delay)
2853{
2854        return queue_delayed_work(kblockd_workqueue, dwork, delay);
2855}
2856EXPORT_SYMBOL(kblockd_schedule_delayed_work);
2857
2858#define PLUG_MAGIC      0x91827364
2859
2860/**
2861 * blk_start_plug - initialize blk_plug and track it inside the task_struct
2862 * @plug:       The &struct blk_plug that needs to be initialized
2863 *
2864 * Description:
2865 *   Tracking blk_plug inside the task_struct will help with auto-flushing the
2866 *   pending I/O should the task end up blocking between blk_start_plug() and
2867 *   blk_finish_plug(). This is important from a performance perspective, but
2868 *   also ensures that we don't deadlock. For instance, if the task is blocking
2869 *   for a memory allocation, memory reclaim could end up wanting to free a
2870 *   page belonging to that request that is currently residing in our private
2871 *   plug. By flushing the pending I/O when the process goes to sleep, we avoid
2872 *   this kind of deadlock.
2873 */
2874void blk_start_plug(struct blk_plug *plug)
2875{
2876        struct task_struct *tsk = current;
2877
2878        plug->magic = PLUG_MAGIC;
2879        INIT_LIST_HEAD(&plug->list);
2880        INIT_LIST_HEAD(&plug->cb_list);
2881
2882        /*
2883         * If this is a nested plug, don't actually assign it. It will be
2884         * flushed on its own.
2885         */
2886        if (!tsk->plug) {
2887                /*
2888                 * Store ordering should not be needed here, since a potential
2889                 * preempt will imply a full memory barrier
2890                 */
2891                tsk->plug = plug;
2892        }
2893}
2894EXPORT_SYMBOL(blk_start_plug);
2895
2896static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
2897{
2898        struct request *rqa = container_of(a, struct request, queuelist);
2899        struct request *rqb = container_of(b, struct request, queuelist);
2900
2901        return !(rqa->q < rqb->q ||
2902                (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb)));
2903}
2904
2905/*
2906 * If 'from_schedule' is true, then postpone the dispatch of requests
2907 * until a safe kblockd context. We due this to avoid accidental big
2908 * additional stack usage in driver dispatch, in places where the originally
2909 * plugger did not intend it.
2910 */
2911static void queue_unplugged(struct request_queue *q, unsigned int depth,
2912                            bool from_schedule)
2913        __releases(q->queue_lock)
2914{
2915        trace_block_unplug(q, depth, !from_schedule);
2916
2917        if (from_schedule)
2918                blk_run_queue_async(q);
2919        else
2920                __blk_run_queue(q);
2921        spin_unlock(q->queue_lock);
2922}
2923
2924static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
2925{
2926        LIST_HEAD(callbacks);
2927
2928        while (!list_empty(&plug->cb_list)) {
2929                list_splice_init(&plug->cb_list, &callbacks);
2930
2931                while (!list_empty(&callbacks)) {
2932                        struct blk_plug_cb *cb = list_first_entry(&callbacks,
2933                                                          struct blk_plug_cb,
2934                                                          list);
2935                        list_del(&cb->list);
2936                        cb->callback(cb, from_schedule);
2937                }
2938        }
2939}
2940
2941struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
2942                                      int size)
2943{
2944        struct blk_plug *plug = current->plug;
2945        struct blk_plug_cb *cb;
2946
2947        if (!plug)
2948                return NULL;
2949
2950        list_for_each_entry(cb, &plug->cb_list, list)
2951                if (cb->callback == unplug && cb->data == data)
2952                        return cb;
2953
2954        /* Not currently on the callback list */
2955        BUG_ON(size < sizeof(*cb));
2956        cb = kzalloc(size, GFP_ATOMIC);
2957        if (cb) {
2958                cb->data = data;
2959                cb->callback = unplug;
2960                list_add(&cb->list, &plug->cb_list);
2961        }
2962        return cb;
2963}
2964EXPORT_SYMBOL(blk_check_plugged);
2965
2966void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2967{
2968        struct request_queue *q;
2969        unsigned long flags;
2970        struct request *rq;
2971        LIST_HEAD(list);
2972        unsigned int depth;
2973
2974        BUG_ON(plug->magic != PLUG_MAGIC);
2975
2976        flush_plug_callbacks(plug, from_schedule);
2977        if (list_empty(&plug->list))
2978                return;
2979
2980        list_splice_init(&plug->list, &list);
2981
2982        list_sort(NULL, &list, plug_rq_cmp);
2983
2984        q = NULL;
2985        depth = 0;
2986
2987        /*
2988         * Save and disable interrupts here, to avoid doing it for every
2989         * queue lock we have to take.
2990         */
2991        local_irq_save(flags);
2992        while (!list_empty(&list)) {
2993                rq = list_entry_rq(list.next);
2994                list_del_init(&rq->queuelist);
2995                BUG_ON(!rq->q);
2996                if (rq->q != q) {
2997                        /*
2998                         * This drops the queue lock
2999                         */
3000                        if (q)
3001                                queue_unplugged(q, depth, from_schedule);
3002                        q = rq->q;
3003                        depth = 0;
3004                        spin_lock(q->queue_lock);
3005                }
3006
3007                /*
3008                 * Short-circuit if @q is dead
3009                 */
3010                if (unlikely(blk_queue_dying(q))) {
3011                        __blk_end_request_all(rq, -ENODEV);
3012                        continue;
3013                }
3014
3015                /*
3016                 * rq is already accounted, so use raw insert
3017                 */
3018                if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA))
3019                        __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
3020                else
3021                        __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
3022
3023                depth++;
3024        }
3025
3026        /*
3027         * This drops the queue lock
3028         */
3029        if (q)
3030                queue_unplugged(q, depth, from_schedule);
3031
3032        local_irq_restore(flags);
3033}
3034
3035void blk_finish_plug(struct blk_plug *plug)
3036{
3037        blk_flush_plug_list(plug, false);
3038
3039        if (plug == current->plug)
3040                current->plug = NULL;
3041}
3042EXPORT_SYMBOL(blk_finish_plug);
3043
3044#ifdef CONFIG_PM_RUNTIME
3045/**
3046 * blk_pm_runtime_init - Block layer runtime PM initialization routine
3047 * @q: the queue of the device
3048 * @dev: the device the queue belongs to
3049 *
3050 * Description:
3051 *    Initialize runtime-PM-related fields for @q and start auto suspend for
3052 *    @dev. Drivers that want to take advantage of request-based runtime PM
3053 *    should call this function after @dev has been initialized, and its
3054 *    request queue @q has been allocated, and runtime PM for it can not happen
3055 *    yet(either due to disabled/forbidden or its usage_count > 0). In most
3056 *    cases, driver should call this function before any I/O has taken place.
3057 *
3058 *    This function takes care of setting up using auto suspend for the device,
3059 *    the autosuspend delay is set to -1 to make runtime suspend impossible
3060 *    until an updated value is either set by user or by driver. Drivers do
3061 *    not need to touch other autosuspend settings.
3062 *
3063 *    The block layer runtime PM is request based, so only works for drivers
3064 *    that use request as their IO unit instead of those directly use bio's.
3065 */
3066void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
3067{
3068        q->dev = dev;
3069        q->rpm_status = RPM_ACTIVE;
3070        pm_runtime_set_autosuspend_delay(q->dev, -1);
3071        pm_runtime_use_autosuspend(q->dev);
3072}
3073EXPORT_SYMBOL(blk_pm_runtime_init);
3074
3075/**
3076 * blk_pre_runtime_suspend - Pre runtime suspend check
3077 * @q: the queue of the device
3078 *
3079 * Description:
3080 *    This function will check if runtime suspend is allowed for the device
3081 *    by examining if there are any requests pending in the queue. If there
3082 *    are requests pending, the device can not be runtime suspended; otherwise,
3083 *    the queue's status will be updated to SUSPENDING and the driver can
3084 *    proceed to suspend the device.
3085 *
3086 *    For the not allowed case, we mark last busy for the device so that
3087 *    runtime PM core will try to autosuspend it some time later.
3088 *
3089 *    This function should be called near the start of the device's
3090 *    runtime_suspend callback.
3091 *
3092 * Return:
3093 *    0         - OK to runtime suspend the device
3094 *    -EBUSY    - Device should not be runtime suspended
3095 */
3096int blk_pre_runtime_suspend(struct request_queue *q)
3097{
3098        int ret = 0;
3099
3100        if (!q->dev)
3101                return ret;
3102
3103        spin_lock_irq(q->queue_lock);
3104        if (q->nr_pending) {
3105                ret = -EBUSY;
3106                pm_runtime_mark_last_busy(q->dev);
3107        } else {
3108                q->rpm_status = RPM_SUSPENDING;
3109        }
3110        spin_unlock_irq(q->queue_lock);
3111        return ret;
3112}
3113EXPORT_SYMBOL(blk_pre_runtime_suspend);
3114
3115/**
3116 * blk_post_runtime_suspend - Post runtime suspend processing
3117 * @q: the queue of the device
3118 * @err: return value of the device's runtime_suspend function
3119 *
3120 * Description:
3121 *    Update the queue's runtime status according to the return value of the
3122 *    device's runtime suspend function and mark last busy for the device so
3123 *    that PM core will try to auto suspend the device at a later time.
3124 *
3125 *    This function should be called near the end of the device's
3126 *    runtime_suspend callback.
3127 */
3128void blk_post_runtime_suspend(struct request_queue *q, int err)
3129{
3130        if (!q->dev)
3131                return;
3132
3133        spin_lock_irq(q->queue_lock);
3134        if (!err) {
3135                q->rpm_status = RPM_SUSPENDED;
3136        } else {
3137                q->rpm_status = RPM_ACTIVE;
3138                pm_runtime_mark_last_busy(q->dev);
3139        }
3140        spin_unlock_irq(q->queue_lock);
3141}
3142EXPORT_SYMBOL(blk_post_runtime_suspend);
3143
3144/**
3145 * blk_pre_runtime_resume - Pre runtime resume processing
3146 * @q: the queue of the device
3147 *
3148 * Description:
3149 *    Update the queue's runtime status to RESUMING in preparation for the
3150 *    runtime resume of the device.
3151 *
3152 *    This function should be called near the start of the device's
3153 *    runtime_resume callback.
3154 */
3155void blk_pre_runtime_resume(struct request_queue *q)
3156{
3157        if (!q->dev)
3158                return;
3159
3160        spin_lock_irq(q->queue_lock);
3161        q->rpm_status = RPM_RESUMING;
3162        spin_unlock_irq(q->queue_lock);
3163}
3164EXPORT_SYMBOL(blk_pre_runtime_resume);
3165
3166/**
3167 * blk_post_runtime_resume - Post runtime resume processing
3168 * @q: the queue of the device
3169 * @err: return value of the device's runtime_resume function
3170 *
3171 * Description:
3172 *    Update the queue's runtime status according to the return value of the
3173 *    device's runtime_resume function. If it is successfully resumed, process
3174 *    the requests that are queued into the device's queue when it is resuming
3175 *    and then mark last busy and initiate autosuspend for it.
3176 *
3177 *    This function should be called near the end of the device's
3178 *    runtime_resume callback.
3179 */
3180void blk_post_runtime_resume(struct request_queue *q, int err)
3181{
3182        if (!q->dev)
3183                return;
3184
3185        spin_lock_irq(q->queue_lock);
3186        if (!err) {
3187                q->rpm_status = RPM_ACTIVE;
3188                __blk_run_queue(q);
3189                pm_runtime_mark_last_busy(q->dev);
3190                pm_request_autosuspend(q->dev);
3191        } else {
3192                q->rpm_status = RPM_SUSPENDED;
3193        }
3194        spin_unlock_irq(q->queue_lock);
3195}
3196EXPORT_SYMBOL(blk_post_runtime_resume);
3197#endif
3198
3199int __init blk_dev_init(void)
3200{
3201        BUILD_BUG_ON(__REQ_NR_BITS > 8 *
3202                        sizeof(((struct request *)0)->cmd_flags));
3203
3204        /* used for unplugging and affects IO latency/throughput - HIGHPRI */
3205        kblockd_workqueue = alloc_workqueue("kblockd",
3206                                            WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
3207        if (!kblockd_workqueue)
3208                panic("Failed to create kblockd\n");
3209
3210        request_cachep = kmem_cache_create("blkdev_requests",
3211                        sizeof(struct request), 0, SLAB_PANIC, NULL);
3212
3213        blk_requestq_cachep = kmem_cache_create("blkdev_queue",
3214                        sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
3215
3216        return 0;
3217}
3218