linux-bk/fs/direct-io.c
<<
>>
Prefs
   1/*
   2 * fs/direct-io.c
   3 *
   4 * Copyright (C) 2002, Linus Torvalds.
   5 *
   6 * O_DIRECT
   7 *
   8 * 04Jul2002    akpm@zip.com.au
   9 *              Initial version
  10 */
  11
  12#include <linux/kernel.h>
  13#include <linux/types.h>
  14#include <linux/fs.h>
  15#include <linux/mm.h>
  16#include <linux/highmem.h>
  17#include <linux/pagemap.h>
  18#include <linux/bio.h>
  19#include <linux/wait.h>
  20#include <linux/err.h>
  21#include <linux/buffer_head.h>
  22#include <linux/rwsem.h>
  23#include <asm/atomic.h>
  24
  25/*
  26 * The largest-sized BIO which this code will assemble, in bytes.  Set this
  27 * to PAGE_SIZE if your drivers are broken.
  28 */
  29#define DIO_BIO_MAX_SIZE BIO_MAX_SIZE
  30
  31/*
  32 * How many user pages to map in one call to get_user_pages().  This determines
  33 * the size of a structure on the stack.
  34 */
  35#define DIO_PAGES       64
  36
  37struct dio {
  38        /* BIO submission state */
  39        struct bio *bio;                /* bio under assembly */
  40        struct bio_vec *bvec;           /* current bvec in that bio */
  41        struct inode *inode;
  42        int rw;
  43        unsigned blkbits;               /* doesn't change */
  44        sector_t block_in_file;         /* changes */
  45        unsigned blocks_available;      /* At block_in_file.  changes */
  46        sector_t final_block_in_request;/* doesn't change */
  47        unsigned first_block_in_page;   /* doesn't change, Used only once */
  48        int boundary;                   /* prev block is at a boundary */
  49        int reap_counter;               /* rate limit reaping */
  50        get_blocks_t *get_blocks;       /* block mapping function */
  51        sector_t last_block_in_bio;     /* current final block in bio */
  52        sector_t next_block_in_bio;     /* next block to be added to bio */
  53        struct buffer_head map_bh;      /* last get_blocks() result */
  54
  55        /* Page fetching state */
  56        int curr_page;                  /* changes */
  57        int total_pages;                /* doesn't change */
  58        unsigned long curr_user_address;/* changes */
  59
  60        /* Page queue */
  61        struct page *pages[DIO_PAGES];  /* page buffer */
  62        unsigned head;                  /* next page to process */
  63        unsigned tail;                  /* last valid page + 1 */
  64        int page_errors;                /* errno from get_user_pages() */
  65
  66        /* BIO completion state */
  67        atomic_t bio_count;             /* nr bios in flight */
  68        spinlock_t bio_list_lock;       /* protects bio_list */
  69        struct bio *bio_list;           /* singly linked via bi_private */
  70        struct task_struct *waiter;     /* waiting task (NULL if none) */
  71};
  72
  73/*
  74 * How many pages are in the queue?
  75 */
  76static inline unsigned dio_pages_present(struct dio *dio)
  77{
  78        return dio->tail - dio->head;
  79}
  80
  81/*
  82 * Go grab and pin some userspace pages.   Typically we'll get 64 at a time.
  83 */
  84static int dio_refill_pages(struct dio *dio)
  85{
  86        int ret;
  87        int nr_pages;
  88
  89        nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
  90        down_read(&current->mm->mmap_sem);
  91        ret = get_user_pages(
  92                current,                        /* Task for fault acounting */
  93                current->mm,                    /* whose pages? */
  94                dio->curr_user_address,         /* Where from? */
  95                nr_pages,                       /* How many pages? */
  96                dio->rw == READ,                /* Write to memory? */
  97                0,                              /* force (?) */
  98                &dio->pages[0],
  99                NULL);                          /* vmas */
 100        up_read(&current->mm->mmap_sem);
 101
 102        if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
 103                /*
 104                 * A memory fault, but the filesystem has some outstanding
 105                 * mapped blocks.  We need to use those blocks up to avoid
 106                 * leaking stale data in the file.
 107                 */
 108                if (dio->page_errors == 0)
 109                        dio->page_errors = ret;
 110                dio->pages[0] = ZERO_PAGE(dio->cur_user_address);
 111                dio->head = 0;
 112                dio->tail = 1;
 113                ret = 0;
 114                goto out;
 115        }
 116
 117        if (ret >= 0) {
 118                dio->curr_user_address += ret * PAGE_SIZE;
 119                dio->curr_page += ret;
 120                dio->head = 0;
 121                dio->tail = ret;
 122                ret = 0;
 123        }
 124out:
 125        return ret;     
 126}
 127
 128/*
 129 * Get another userspace page.  Returns an ERR_PTR on error.  Pages are
 130 * buffered inside the dio so that we can call get_user_pages() against a
 131 * decent number of pages, less frequently.  To provide nicer use of the
 132 * L1 cache.
 133 */
 134static struct page *dio_get_page(struct dio *dio)
 135{
 136        if (dio_pages_present(dio) == 0) {
 137                int ret;
 138
 139                ret = dio_refill_pages(dio);
 140                if (ret)
 141                        return ERR_PTR(ret);
 142                BUG_ON(dio_pages_present(dio) == 0);
 143        }
 144        return dio->pages[dio->head++];
 145}
 146
 147/*
 148 * The BIO completion handler simply queues the BIO up for the process-context
 149 * handler.
 150 *
 151 * During I/O bi_private points at the dio.  After I/O, bi_private is used to
 152 * implement a singly-linked list of completed BIOs, at dio->bio_list.
 153 */
 154static int dio_bio_end_io(struct bio *bio, unsigned int bytes_done, int error)
 155{
 156        struct dio *dio = bio->bi_private;
 157        unsigned long flags;
 158
 159        if (bio->bi_size)
 160                return 1;
 161
 162        spin_lock_irqsave(&dio->bio_list_lock, flags);
 163        bio->bi_private = dio->bio_list;
 164        dio->bio_list = bio;
 165        if (dio->waiter)
 166                wake_up_process(dio->waiter);
 167        spin_unlock_irqrestore(&dio->bio_list_lock, flags);
 168        return 0;
 169}
 170
 171static int
 172dio_bio_alloc(struct dio *dio, struct block_device *bdev,
 173                sector_t first_sector, int nr_vecs)
 174{
 175        struct bio *bio;
 176
 177        bio = bio_alloc(GFP_KERNEL, nr_vecs);
 178        if (bio == NULL)
 179                return -ENOMEM;
 180
 181        bio->bi_bdev = bdev;
 182        bio->bi_vcnt = nr_vecs;
 183        bio->bi_idx = 0;
 184        bio->bi_size = 0;
 185        bio->bi_sector = first_sector;
 186        bio->bi_io_vec[0].bv_page = NULL;
 187        bio->bi_end_io = dio_bio_end_io;
 188
 189        dio->bio = bio;
 190        dio->bvec = NULL;               /* debug */
 191        return 0;
 192}
 193
 194static void dio_bio_submit(struct dio *dio)
 195{
 196        struct bio *bio = dio->bio;
 197
 198        bio->bi_vcnt = bio->bi_idx;
 199        bio->bi_idx = 0;
 200        bio->bi_private = dio;
 201        atomic_inc(&dio->bio_count);
 202        submit_bio(dio->rw, bio);
 203
 204        dio->bio = NULL;
 205        dio->bvec = NULL;
 206        dio->boundary = 0;
 207}
 208
 209/*
 210 * Release any resources in case of a failure
 211 */
 212static void dio_cleanup(struct dio *dio)
 213{
 214        while (dio_pages_present(dio))
 215                page_cache_release(dio_get_page(dio));
 216}
 217
 218/*
 219 * Wait for the next BIO to complete.  Remove it and return it.
 220 */
 221static struct bio *dio_await_one(struct dio *dio)
 222{
 223        unsigned long flags;
 224        struct bio *bio;
 225
 226        spin_lock_irqsave(&dio->bio_list_lock, flags);
 227        while (dio->bio_list == NULL) {
 228                set_current_state(TASK_UNINTERRUPTIBLE);
 229                if (dio->bio_list == NULL) {
 230                        dio->waiter = current;
 231                        spin_unlock_irqrestore(&dio->bio_list_lock, flags);
 232                        blk_run_queues();
 233                        schedule();
 234                        spin_lock_irqsave(&dio->bio_list_lock, flags);
 235                        dio->waiter = NULL;
 236                }
 237                set_current_state(TASK_RUNNING);
 238        }
 239        bio = dio->bio_list;
 240        dio->bio_list = bio->bi_private;
 241        spin_unlock_irqrestore(&dio->bio_list_lock, flags);
 242        return bio;
 243}
 244
 245/*
 246 * Process one completed BIO.  No locks are held.
 247 */
 248static int dio_bio_complete(struct dio *dio, struct bio *bio)
 249{
 250        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 251        struct bio_vec *bvec = bio->bi_io_vec;
 252        int page_no;
 253
 254        for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
 255                struct page *page = bvec[page_no].bv_page;
 256
 257                if (dio->rw == READ)
 258                        set_page_dirty(page);
 259                page_cache_release(page);
 260        }
 261        atomic_dec(&dio->bio_count);
 262        bio_put(bio);
 263        return uptodate ? 0 : -EIO;
 264}
 265
 266/*
 267 * Wait on and process all in-flight BIOs.
 268 */
 269static int dio_await_completion(struct dio *dio)
 270{
 271        int ret = 0;
 272
 273        if (dio->bio)
 274                dio_bio_submit(dio);
 275
 276        while (atomic_read(&dio->bio_count)) {
 277                struct bio *bio = dio_await_one(dio);
 278                int ret2;
 279
 280                ret2 = dio_bio_complete(dio, bio);
 281                if (ret == 0)
 282                        ret = ret2;
 283        }
 284        return ret;
 285}
 286
 287/*
 288 * A really large O_DIRECT read or write can generate a lot of BIOs.  So
 289 * to keep the memory consumption sane we periodically reap any completed BIOs
 290 * during the BIO generation phase.
 291 *
 292 * This also helps to limit the peak amount of pinned userspace memory.
 293 */
 294static int dio_bio_reap(struct dio *dio)
 295{
 296        int ret = 0;
 297
 298        if (dio->reap_counter++ >= 64) {
 299                while (dio->bio_list) {
 300                        unsigned long flags;
 301                        struct bio *bio;
 302
 303                        spin_lock_irqsave(&dio->bio_list_lock, flags);
 304                        bio = dio->bio_list;
 305                        dio->bio_list = bio->bi_private;
 306                        spin_unlock_irqrestore(&dio->bio_list_lock, flags);
 307                        ret = dio_bio_complete(dio, bio);
 308                }
 309                dio->reap_counter = 0;
 310        }
 311        return ret;
 312}
 313
 314/*
 315 * Call into the fs to map some more disk blocks.  We record the current number
 316 * of available blocks at dio->blocks_available.  These are in units of the
 317 * fs blocksize, (1 << inode->i_blkbits).
 318 *
 319 * The fs is allowed to map lots of blocks at once.  If it wants to do that,
 320 * it uses the passed inode-relative block number as the file offset, as usual.
 321 *
 322 * get_blocks() is passed the number of i_blkbits-sized blocks which direct_io
 323 * has remaining to do.  The fs should not map more than this number of blocks.
 324 *
 325 * If the fs has mapped a lot of blocks, it should populate bh->b_size to
 326 * indicate how much contiguous disk space has been made available at
 327 * bh->b_blocknr.
 328 *
 329 * If *any* of the mapped blocks are new, then the fs must set buffer_new().
 330 * This isn't very efficient...
 331 *
 332 * In the case of filesystem holes: the fs may return an arbitrarily-large
 333 * hole by returning an appropriate value in b_size and by clearing
 334 * buffer_mapped().  This code _should_ handle that case correctly, but it has
 335 * only been tested against single-block holes (b_size == blocksize).
 336 */
 337static int get_more_blocks(struct dio *dio)
 338{
 339        int ret;
 340        struct buffer_head *map_bh = &dio->map_bh;
 341
 342        if (dio->blocks_available)
 343                return 0;
 344
 345        /*
 346         * If there was a memory error and we've overwritten all the
 347         * mapped blocks then we can now return that memory error
 348         */
 349        if (dio->page_errors) {
 350                ret = dio->page_errors;
 351                goto out;
 352        }
 353
 354        map_bh->b_state = 0;
 355        map_bh->b_size = 0;
 356        BUG_ON(dio->block_in_file >= dio->final_block_in_request);
 357        ret = (*dio->get_blocks)(dio->inode, dio->block_in_file,
 358                        dio->final_block_in_request - dio->block_in_file,
 359                        map_bh, dio->rw == WRITE);
 360        if (ret)
 361                goto out;
 362
 363        if (buffer_mapped(map_bh)) {
 364                BUG_ON(map_bh->b_size == 0);
 365                BUG_ON((map_bh->b_size & ((1 << dio->blkbits) - 1)) != 0);
 366
 367                dio->blocks_available = map_bh->b_size >> dio->blkbits;
 368
 369                /* blockdevs do not set buffer_new */
 370                if (buffer_new(map_bh)) {
 371                        sector_t block = map_bh->b_blocknr;
 372                        unsigned i;
 373
 374                        for (i = 0; i < dio->blocks_available; i++)
 375                                unmap_underlying_metadata(map_bh->b_bdev,
 376                                                        block++);
 377                }
 378        } else {
 379                BUG_ON(dio->rw != READ);
 380                if (dio->bio)
 381                        dio_bio_submit(dio);
 382        }
 383        dio->next_block_in_bio = map_bh->b_blocknr;
 384out:
 385        return ret;
 386}
 387
 388/*
 389 * Check to see if we can continue to grow the BIO. If not, then send it.
 390 */
 391static void dio_prep_bio(struct dio *dio)
 392{
 393        if (dio->bio == NULL)
 394                return;
 395
 396        if (dio->bio->bi_idx == dio->bio->bi_vcnt ||
 397                        dio->boundary ||
 398                        dio->last_block_in_bio != dio->next_block_in_bio - 1)
 399                dio_bio_submit(dio);
 400}
 401
 402/*
 403 * There is no bio.  Make one now.
 404 */
 405static int dio_new_bio(struct dio *dio)
 406{
 407        sector_t sector;
 408        int ret;
 409
 410        ret = dio_bio_reap(dio);
 411        if (ret)
 412                goto out;
 413        sector = dio->next_block_in_bio << (dio->blkbits - 9);
 414        ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector,
 415                                DIO_BIO_MAX_SIZE / PAGE_SIZE);
 416        dio->boundary = 0;
 417out:
 418        return ret;
 419}
 420
 421/*
 422 * Walk the user pages, and the file, mapping blocks to disk and emitting BIOs.
 423 *
 424 * Direct IO against a blockdev is different from a file.  Because we can
 425 * happily perform page-sized but 512-byte aligned IOs.  It is important that
 426 * blockdev IO be able to have fine alignment and large sizes.
 427 *
 428 * So what we do is to permit the ->get_blocks function to populate bh.b_size
 429 * with the size of IO which is permitted at this offset and this i_blkbits.
 430 *
 431 * For best results, the blockdev should be set up with 512-byte i_blkbits and
 432 * it should set b_size to PAGE_SIZE or more inside get_blocks().  This gives
 433 * fine alignment but still allows this function to work in PAGE_SIZE units.
 434 */
 435int do_direct_IO(struct dio *dio)
 436{
 437        const unsigned blkbits = dio->blkbits;
 438        const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
 439        struct page *page;
 440        unsigned block_in_page;
 441        int ret;
 442
 443        /* The I/O can start at any block offset within the first page */
 444        block_in_page = dio->first_block_in_page;
 445
 446        while (dio->block_in_file < dio->final_block_in_request) {
 447                int new_page;   /* Need to insert this page into the BIO? */
 448
 449                page = dio_get_page(dio);
 450                if (IS_ERR(page)) {
 451                        ret = PTR_ERR(page);
 452                        goto out;
 453                }
 454
 455                new_page = 1;
 456                while (block_in_page < blocks_per_page) {
 457                        struct bio *bio;
 458                        unsigned this_chunk_bytes;      /* # of bytes mapped */
 459                        unsigned this_chunk_blocks;     /* # of blocks */
 460                        unsigned u;
 461
 462                        ret = get_more_blocks(dio);
 463                        if (ret)
 464                                goto fail_release;
 465
 466                        /* Handle holes */
 467                        if (!buffer_mapped(&dio->map_bh)) {
 468                                char *kaddr = kmap_atomic(page, KM_USER0);
 469                                memset(kaddr + (block_in_page << blkbits),
 470                                                0, 1 << blkbits);
 471                                flush_dcache_page(page);
 472                                kunmap_atomic(kaddr, KM_USER0);
 473                                dio->block_in_file++;
 474                                dio->next_block_in_bio++;
 475                                block_in_page++;
 476                                goto next_block;
 477                        }
 478
 479                        dio_prep_bio(dio);
 480                        if (dio->bio == NULL) {
 481                                ret = dio_new_bio(dio);
 482                                if (ret)
 483                                        goto fail_release;
 484                                new_page = 1;
 485                        }
 486
 487                        bio = dio->bio;
 488                        if (new_page) {
 489                                dio->bvec = &bio->bi_io_vec[bio->bi_idx];
 490                                page_cache_get(page);
 491                                dio->bvec->bv_page = page;
 492                                dio->bvec->bv_len = 0;
 493                                dio->bvec->bv_offset = block_in_page << blkbits;
 494                                bio->bi_idx++;
 495                                new_page = 0;
 496                        }
 497
 498                        /* Work out how much disk we can add to this page */
 499                        this_chunk_blocks = dio->blocks_available;
 500                        u = (PAGE_SIZE - (dio->bvec->bv_offset + dio->bvec->bv_len)) >> blkbits;
 501                        if (this_chunk_blocks > u)
 502                                this_chunk_blocks = u;
 503                        u = dio->final_block_in_request - dio->block_in_file;
 504                        if (this_chunk_blocks > u)
 505                                this_chunk_blocks = u;
 506                        this_chunk_bytes = this_chunk_blocks << blkbits;
 507                        BUG_ON(this_chunk_bytes == 0);
 508
 509                        dio->bvec->bv_len += this_chunk_bytes;
 510                        bio->bi_size += this_chunk_bytes;
 511                        dio->next_block_in_bio += this_chunk_blocks;
 512                        dio->last_block_in_bio = dio->next_block_in_bio - 1;
 513                        dio->boundary = buffer_boundary(&dio->map_bh);
 514                        dio->block_in_file += this_chunk_blocks;
 515                        block_in_page += this_chunk_blocks;
 516                        dio->blocks_available -= this_chunk_blocks;
 517next_block:
 518                        if (dio->block_in_file > dio->final_block_in_request)
 519                                BUG();
 520                        if (dio->block_in_file == dio->final_block_in_request)
 521                                break;
 522                }
 523                block_in_page = 0;
 524                page_cache_release(page);
 525        }
 526        ret = 0;
 527        goto out;
 528fail_release:
 529        page_cache_release(page);
 530out:
 531        return ret;
 532}
 533
 534int
 535direct_io_worker(int rw, struct inode *inode, const struct iovec *iov, 
 536        loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
 537{
 538        const unsigned blkbits = inode->i_blkbits;
 539        unsigned long user_addr; 
 540        int seg, ret2, ret = 0;
 541        struct dio dio;
 542        size_t bytes, tot_bytes = 0;
 543
 544        dio.bio = NULL;
 545        dio.bvec = NULL;
 546        dio.inode = inode;
 547        dio.rw = rw;
 548        dio.blkbits = blkbits;
 549        dio.block_in_file = offset >> blkbits;
 550        dio.blocks_available = 0;
 551
 552        dio.boundary = 0;
 553        dio.reap_counter = 0;
 554        dio.get_blocks = get_blocks;
 555        dio.last_block_in_bio = -1;
 556        dio.next_block_in_bio = -1;
 557
 558        dio.page_errors = 0;
 559
 560        /* BIO completion state */
 561        atomic_set(&dio.bio_count, 0);
 562        spin_lock_init(&dio.bio_list_lock);
 563        dio.bio_list = NULL;
 564        dio.waiter = NULL;
 565
 566        for (seg = 0; seg < nr_segs; seg++) {
 567                user_addr = (unsigned long)iov[seg].iov_base;
 568                bytes = iov[seg].iov_len;
 569
 570                /* Index into the first page of the first block */
 571                dio.first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
 572                dio.final_block_in_request = dio.block_in_file + (bytes >> blkbits);
 573                /* Page fetching state */
 574                dio.head = 0;
 575                dio.tail = 0;
 576                dio.curr_page = 0;
 577
 578                dio.total_pages = 0;
 579                if (user_addr & (PAGE_SIZE-1)) {
 580                        dio.total_pages++;
 581                        bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
 582                }
 583                dio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
 584                dio.curr_user_address = user_addr;
 585        
 586                ret = do_direct_IO(&dio);
 587
 588                if (ret) {
 589                        dio_cleanup(&dio);
 590                        break;
 591                }
 592
 593                tot_bytes += iov[seg].iov_len - ((dio.final_block_in_request -
 594                                        dio.block_in_file) << blkbits);
 595
 596        } /* end iovec loop */
 597
 598        ret2 = dio_await_completion(&dio);
 599        if (ret == 0)
 600                ret = ret2;
 601        if (ret == 0)
 602                ret = dio.page_errors;
 603        if (ret == 0)
 604                ret = tot_bytes; 
 605
 606        return ret;
 607}
 608
 609/*
 610 * This is a library function for use by filesystem drivers.
 611 */
 612int
 613generic_direct_IO(int rw, struct inode *inode, const struct iovec *iov, 
 614        loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
 615{
 616        int seg;
 617        size_t size;
 618        unsigned long addr;
 619        struct address_space *mapping = inode->i_mapping;
 620        unsigned blocksize_mask = (1 << inode->i_blkbits) - 1;
 621        ssize_t retval = -EINVAL;
 622
 623        if (offset & blocksize_mask) {
 624                goto out;
 625        }
 626
 627        /* Check the memory alignment.  Blocks cannot straddle pages */
 628        for (seg = 0; seg < nr_segs; seg++) {
 629                addr = (unsigned long)iov[seg].iov_base;
 630                size = iov[seg].iov_len;
 631                if ((addr & blocksize_mask) || (size & blocksize_mask)) 
 632                        goto out;       
 633        }
 634
 635        if (mapping->nrpages) {
 636                retval = filemap_fdatawrite(mapping);
 637                if (retval == 0)
 638                        retval = filemap_fdatawait(mapping);
 639                if (retval)
 640                        goto out;
 641        }
 642
 643        retval = direct_io_worker(rw, inode, iov, offset, nr_segs, get_blocks);
 644out:
 645        return retval;
 646}
 647
 648ssize_t
 649generic_file_direct_IO(int rw, struct inode *inode, const struct iovec *iov, 
 650        loff_t offset, unsigned long nr_segs)
 651{
 652        struct address_space *mapping = inode->i_mapping;
 653        ssize_t retval;
 654
 655        retval = mapping->a_ops->direct_IO(rw, inode, iov, offset, nr_segs);
 656        if (inode->i_mapping->nrpages)
 657                invalidate_inode_pages2(inode->i_mapping);
 658        return retval;
 659}
 660
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.