linux-bk/fs/bio.c
<<
>>
Prefs
/*
 * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */
  19#include <linux/mm.h>
  20#include <linux/bio.h>
  21#include <linux/blk.h>
  22#include <linux/slab.h>
  23#include <linux/iobuf.h>
  24#include <linux/kernel.h>
  25#include <linux/module.h>
  26#include <linux/mempool.h>
  27
  28#define BIO_POOL_SIZE 256
  29
  30static mempool_t *bio_pool;
  31static kmem_cache_t *bio_slab;
  32
  33#define BIOVEC_NR_POOLS 6
  34
  35struct biovec_pool {
  36        int nr_vecs;
  37        char *name; 
  38        kmem_cache_t *slab;
  39        mempool_t *pool;
  40};
  41
  42/*
  43 * if you change this list, also change bvec_alloc or things will
  44 * break badly! cannot be bigger than what you can fit into an
  45 * unsigned short
  46 */
  47
  48#define BV(x) { x, "biovec-" #x }
  49static struct biovec_pool bvec_array[BIOVEC_NR_POOLS] = {
  50        BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
  51};
  52#undef BV
  53
  54static void *slab_pool_alloc(int gfp_mask, void *data)
  55{
  56        return kmem_cache_alloc(data, gfp_mask);
  57}
  58
  59static void slab_pool_free(void *ptr, void *data)
  60{
  61        kmem_cache_free(data, ptr);
  62}
  63
  64static inline struct bio_vec *bvec_alloc(int gfp_mask, int nr, int *idx)
  65{
  66        struct biovec_pool *bp;
  67        struct bio_vec *bvl;
  68
  69        /*
  70         * see comment near bvec_array define!
  71         */
  72        switch (nr) {
  73                case   1        : *idx = 0; break;
  74                case   2 ...   4: *idx = 1; break;
  75                case   5 ...  16: *idx = 2; break;
  76                case  17 ...  64: *idx = 3; break;
  77                case  65 ... 128: *idx = 4; break;
  78                case 129 ... BIO_MAX_PAGES: *idx = 5; break;
  79                default:
  80                        return NULL;
  81        }
  82        /*
  83         * idx now points to the pool we want to allocate from
  84         */
  85        bp = bvec_array + *idx;
  86
  87        bvl = mempool_alloc(bp->pool, gfp_mask);
  88        if (bvl)
  89                memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec));
  90        return bvl;
  91}
  92
  93/*
  94 * default destructor for a bio allocated with bio_alloc()
  95 */
  96void bio_destructor(struct bio *bio)
  97{
  98        struct biovec_pool *bp = bvec_array + bio->bi_max;
  99
 100        BIO_BUG_ON(bio->bi_max >= BIOVEC_NR_POOLS);
 101        /*
 102         * cloned bio doesn't own the veclist
 103         */
 104        if (!bio_flagged(bio, BIO_CLONED))
 105                mempool_free(bio->bi_io_vec, bp->pool);
 106
 107        mempool_free(bio, bio_pool);
 108}
 109
 110inline void bio_init(struct bio *bio)
 111{
 112        bio->bi_next = NULL;
 113        bio->bi_flags = 1 << BIO_UPTODATE;
 114        bio->bi_rw = 0;
 115        bio->bi_vcnt = 0;
 116        bio->bi_idx = 0;
 117        bio->bi_phys_segments = 0;
 118        bio->bi_hw_segments = 0;
 119        bio->bi_size = 0;
 120        bio->bi_end_io = NULL;
 121        atomic_set(&bio->bi_cnt, 1);
 122}
 123
 124/**
 125 * bio_alloc - allocate a bio for I/O
 126 * @gfp_mask:   the GFP_ mask given to the slab allocator
 127 * @nr_iovecs:  number of iovecs to pre-allocate
 128 *
 129 * Description:
 130 *   bio_alloc will first try it's on mempool to satisfy the allocation.
 131 *   If %__GFP_WAIT is set then we will block on the internal pool waiting
 132 *   for a &struct bio to become free.
 133 **/
 134struct bio *bio_alloc(int gfp_mask, int nr_iovecs)
 135{
 136        struct bio *bio;
 137        struct bio_vec *bvl = NULL;
 138        int pf_flags = current->flags;
 139
 140        current->flags |= PF_NOWARN;
 141        bio = mempool_alloc(bio_pool, gfp_mask);
 142        if (unlikely(!bio))
 143                goto out;
 144
 145        if (!nr_iovecs || (bvl = bvec_alloc(gfp_mask,nr_iovecs,&bio->bi_max))) {
 146                bio_init(bio);
 147                bio->bi_destructor = bio_destructor;
 148                bio->bi_io_vec = bvl;
 149                goto out;
 150        }
 151
 152        mempool_free(bio, bio_pool);
 153        bio = NULL;
 154out:
 155        current->flags = pf_flags;
 156        return bio;
 157}
 158
 159/**
 160 * bio_put - release a reference to a bio
 161 * @bio:   bio to release reference to
 162 *
 163 * Description:
 164 *   Put a reference to a &struct bio, either one you have gotten with
 165 *   bio_alloc or bio_get. The last put of a bio will free it.
 166 **/
 167void bio_put(struct bio *bio)
 168{
 169        BIO_BUG_ON(!atomic_read(&bio->bi_cnt));
 170
 171        /*
 172         * last put frees it
 173         */
 174        if (atomic_dec_and_test(&bio->bi_cnt)) {
 175                bio->bi_next = NULL;
 176                bio->bi_destructor(bio);
 177        }
 178}
 179
 180inline int bio_phys_segments(request_queue_t *q, struct bio *bio)
 181{
 182        if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
 183                blk_recount_segments(q, bio);
 184
 185        return bio->bi_phys_segments;
 186}
 187
 188inline int bio_hw_segments(request_queue_t *q, struct bio *bio)
 189{
 190        if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
 191                blk_recount_segments(q, bio);
 192
 193        return bio->bi_hw_segments;
 194}
 195
 196/**
 197 *      __bio_clone     -       clone a bio
 198 *      @bio: destination bio
 199 *      @bio_src: bio to clone
 200 *
 201 *      Clone a &bio. Caller will own the returned bio, but not
 202 *      the actual data it points to. Reference count of returned
 203 *      bio will be one.
 204 */
 205inline void __bio_clone(struct bio *bio, struct bio *bio_src)
 206{
 207        bio->bi_io_vec = bio_src->bi_io_vec;
 208
 209        bio->bi_sector = bio_src->bi_sector;
 210        bio->bi_bdev = bio_src->bi_bdev;
 211        bio->bi_flags |= 1 << BIO_CLONED;
 212        bio->bi_rw = bio_src->bi_rw;
 213
 214        /*
 215         * notes -- maybe just leave bi_idx alone. bi_max has no use
 216         * on a cloned bio. assume identical mapping for the clone
 217         */
 218        bio->bi_vcnt = bio_src->bi_vcnt;
 219        bio->bi_idx = bio_src->bi_idx;
 220        if (bio_flagged(bio, BIO_SEG_VALID)) {
 221                bio->bi_phys_segments = bio_src->bi_phys_segments;
 222                bio->bi_hw_segments = bio_src->bi_hw_segments;
 223                bio->bi_flags |= (1 << BIO_SEG_VALID);
 224        }
 225        bio->bi_size = bio_src->bi_size;
 226        bio->bi_max = bio_src->bi_max;
 227}
 228
 229/**
 230 *      bio_clone       -       clone a bio
 231 *      @bio: bio to clone
 232 *      @gfp_mask: allocation priority
 233 *
 234 *      Like __bio_clone, only also allocates the returned bio
 235 */
 236struct bio *bio_clone(struct bio *bio, int gfp_mask)
 237{
 238        struct bio *b = bio_alloc(gfp_mask, 0);
 239
 240        if (b)
 241                __bio_clone(b, bio);
 242
 243        return b;
 244}
 245
 246/**
 247 *      bio_copy        -       create copy of a bio
 248 *      @bio: bio to copy
 249 *      @gfp_mask: allocation priority
 250 *      @copy: copy data to allocated bio
 251 *
 252 *      Create a copy of a &bio. Caller will own the returned bio and
 253 *      the actual data it points to. Reference count of returned
 254 *      bio will be one.
 255 */
 256struct bio *bio_copy(struct bio *bio, int gfp_mask, int copy)
 257{
 258        struct bio *b = bio_alloc(gfp_mask, bio->bi_vcnt);
 259        unsigned long flags = 0; /* gcc silly */
 260        struct bio_vec *bv;
 261        int i;
 262
 263        if (unlikely(!b))
 264                return NULL;
 265
 266        /*
 267         * iterate iovec list and alloc pages + copy data
 268         */
 269        __bio_for_each_segment(bv, bio, i, 0) {
 270                struct bio_vec *bbv = &b->bi_io_vec[i];
 271                char *vfrom, *vto;
 272
 273                bbv->bv_page = alloc_page(gfp_mask);
 274                if (bbv->bv_page == NULL)
 275                        goto oom;
 276
 277                bbv->bv_len = bv->bv_len;
 278                bbv->bv_offset = bv->bv_offset;
 279
 280                /*
 281                 * if doing a copy for a READ request, no need
 282                 * to memcpy page data
 283                 */
 284                if (!copy)
 285                        continue;
 286
 287                if (gfp_mask & __GFP_WAIT) {
 288                        vfrom = kmap(bv->bv_page);
 289                        vto = kmap(bbv->bv_page);
 290                } else {
 291                        local_irq_save(flags);
 292                        vfrom = kmap_atomic(bv->bv_page, KM_BIO_SRC_IRQ);
 293                        vto = kmap_atomic(bbv->bv_page, KM_BIO_DST_IRQ);
 294                }
 295
 296                memcpy(vto + bbv->bv_offset, vfrom + bv->bv_offset, bv->bv_len);
 297                if (gfp_mask & __GFP_WAIT) {
 298                        kunmap(bbv->bv_page);
 299                        kunmap(bv->bv_page);
 300                } else {
 301                        kunmap_atomic(vto, KM_BIO_DST_IRQ);
 302                        kunmap_atomic(vfrom, KM_BIO_SRC_IRQ);
 303                        local_irq_restore(flags);
 304                }
 305        }
 306
 307        b->bi_sector = bio->bi_sector;
 308        b->bi_bdev = bio->bi_bdev;
 309        b->bi_rw = bio->bi_rw;
 310
 311        b->bi_vcnt = bio->bi_vcnt;
 312        b->bi_size = bio->bi_size;
 313
 314        return b;
 315
 316oom:
 317        while (--i >= 0)
 318                __free_page(b->bi_io_vec[i].bv_page);
 319
 320        mempool_free(b, bio_pool);
 321        return NULL;
 322}
 323
 324/**
 325 *      bio_add_page    -       attempt to add page to bio
 326 *      @bio: destination bio
 327 *      @page: page to add
 328 *      @len: vec entry length
 329 *      @offset: vec entry offset
 330 *
 331 *      Attempt to add a page to the bio_vec maplist. This can fail for a
 332 *      number of reasons, such as the bio being full or target block
 333 *      device limitations.
 334 */
 335int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 336                 unsigned int offset)
 337{
 338        request_queue_t *q = bdev_get_queue(bio->bi_bdev);
 339        int fail_segments = 0, retried_segments = 0;
 340        struct bio_vec *bvec;
 341
 342        /*
 343         * cloned bio must not modify vec list
 344         */
 345        if (unlikely(bio_flagged(bio, BIO_CLONED)))
 346                return 1;
 347
 348        /*
 349         * FIXME: change bi_max?
 350         */
 351        BUG_ON(bio->bi_max > BIOVEC_NR_POOLS);
 352
 353        if (bio->bi_vcnt >= bvec_array[bio->bi_max].nr_vecs)
 354                return 1;
 355
 356        if (((bio->bi_size + len) >> 9) > q->max_sectors)
 357                return 1;
 358
 359        /*
 360         * we might loose a segment or two here, but rather that than
 361         * make this too complex.
 362         */
 363retry_segments:
 364        if (bio_phys_segments(q, bio) >= q->max_phys_segments
 365            || bio_hw_segments(q, bio) >= q->max_hw_segments)
 366                fail_segments = 1;
 367
 368        if (fail_segments) {
 369                if (retried_segments)
 370                        return 1;
 371
 372                bio->bi_flags &= ~(1 << BIO_SEG_VALID);
 373                retried_segments = 1;
 374                goto retry_segments;
 375        }
 376
 377        /*
 378         * setup the new entry, we might clear it again later if we
 379         * cannot add the page
 380         */
 381        bvec = &bio->bi_io_vec[bio->bi_vcnt];
 382        bvec->bv_page = page;
 383        bvec->bv_len = len;
 384        bvec->bv_offset = offset;
 385
 386        /*
 387         * if queue has other restrictions (eg varying max sector size
 388         * depending on offset), it can specify a merge_bvec_fn in the
 389         * queue to get further control
 390         */
 391        if (q->merge_bvec_fn && q->merge_bvec_fn(q, bio, bvec)) {
 392                bvec->bv_page = NULL;
 393                bvec->bv_len = 0;
 394                bvec->bv_offset = 0;
 395                return 1;
 396        }
 397
 398        bio->bi_vcnt++;
 399        bio->bi_phys_segments++;
 400        bio->bi_hw_segments++;
 401        bio->bi_size += len;
 402        return 0;
 403}
 404
 405static int bio_end_io_kio(struct bio *bio, unsigned int bytes_done, int error)
 406{
 407        struct kiobuf *kio = (struct kiobuf *) bio->bi_private;
 408
 409        if (bio->bi_size)
 410                return 1;
 411
 412        end_kio_request(kio, error);
 413        bio_put(bio);
 414        return 0;
 415}
 416
 417/**
 418 * ll_rw_kio - submit a &struct kiobuf for I/O
 419 * @rw:   %READ or %WRITE
 420 * @kio:   the kiobuf to do I/O on
 421 * @bdev:   target device
 422 * @sector:   start location on disk
 423 *
 424 * Description:
 425 *   ll_rw_kio will map the page list inside the &struct kiobuf to
 426 *   &struct bio and queue them for I/O. The kiobuf given must describe
 427 *   a continous range of data, and must be fully prepared for I/O.
 428 **/
 429void ll_rw_kio(int rw, struct kiobuf *kio, struct block_device *bdev, sector_t sector)
 430{
 431        int i, offset, size, err, map_i, total_nr_pages, nr_pages;
 432        struct bio *bio;
 433
 434        err = 0;
 435        if ((rw & WRITE) && bdev_read_only(bdev)) {
 436                printk("ll_rw_bio: WRITE to ro device %s\n", bdevname(bdev));
 437                err = -EPERM;
 438                goto out;
 439        }
 440
 441        if (!kio->nr_pages) {
 442                err = -EINVAL;
 443                goto out;
 444        }
 445
 446        /*
 447         * maybe kio is bigger than the max we can easily map into a bio.
 448         * if so, split it up in appropriately sized chunks.
 449         */
 450        total_nr_pages = kio->nr_pages;
 451        offset = kio->offset & ~PAGE_MASK;
 452        size = kio->length;
 453
 454        atomic_set(&kio->io_count, 1);
 455
 456        map_i = 0;
 457
 458next_chunk:
 459        nr_pages = BIO_MAX_PAGES;
 460        if (nr_pages > total_nr_pages)
 461                nr_pages = total_nr_pages;
 462
 463        atomic_inc(&kio->io_count);
 464
 465        /*
 466         * allocate bio and do initial setup
 467         */
 468        if ((bio = bio_alloc(GFP_NOIO, nr_pages)) == NULL) {
 469                err = -ENOMEM;
 470                goto out;
 471        }
 472
 473        bio->bi_sector = sector;
 474        bio->bi_bdev = bdev;
 475        bio->bi_idx = 0;
 476        bio->bi_end_io = bio_end_io_kio;
 477        bio->bi_private = kio;
 478
 479        for (i = 0; i < nr_pages; i++, map_i++) {
 480                int nbytes = PAGE_SIZE - offset;
 481
 482                if (nbytes > size)
 483                        nbytes = size;
 484
 485                BUG_ON(kio->maplist[map_i] == NULL);
 486
 487                /*
 488                 * if we can't add this page to the bio, submit for i/o
 489                 * and alloc a new one if needed
 490                 */
 491                if (bio_add_page(bio, kio->maplist[map_i], nbytes, offset))
 492                        break;
 493
 494                /*
 495                 * kiobuf only has an offset into the first page
 496                 */
 497                offset = 0;
 498
 499                sector += nbytes >> 9;
 500                size -= nbytes;
 501                total_nr_pages--;
 502                kio->offset += nbytes;
 503        }
 504
 505        submit_bio(rw, bio);
 506
 507        if (total_nr_pages)
 508                goto next_chunk;
 509
 510        if (size) {
 511                printk("ll_rw_kio: size %d left (kio %d)\n", size, kio->length);
 512                BUG();
 513        }
 514
 515out:
 516        if (err)
 517                kio->errno = err;
 518
 519        /*
 520         * final atomic_dec of io_count to match our initial setting of 1.
 521         * I/O may or may not have completed at this point, final completion
 522         * handler is only run on last decrement.
 523         */
 524        end_kio_request(kio, !err);
 525}
 526
 527/**
 528 * bio_endio - end I/O on a bio
 529 * @bio:        bio
 530 * @bytes_done: number of bytes completed
 531 * @error:      error, if any
 532 *
 533 * Description:
 534 *   bio_endio() will end I/O @bytes_done number of bytes. This may be just
 535 *   a partial part of the bio, or it may be the whole bio. bio_endio() is
 536 *   the preferred way to end I/O on a bio, it takes care of decrementing
 537 *   bi_size and clearing BIO_UPTODATE on error. @error is 0 on success, and
 538 *   and one of the established -Exxxx (-EIO, for instance) error values in
 539 *   case something went wrong.
 540 **/
 541int bio_endio(struct bio *bio, unsigned int bytes_done, int error)
 542{
 543        if (error)
 544                clear_bit(BIO_UPTODATE, &bio->bi_flags);
 545
 546        if (unlikely(bytes_done > bio->bi_size)) {
 547                printk("%s: want %u bytes done, only %u left\n", __FUNCTION__,
 548                                                bytes_done, bio->bi_size);
 549                bytes_done = bio->bi_size;
 550        }
 551
 552        bio->bi_size -= bytes_done;
 553        return bio->bi_end_io(bio, bytes_done, error);
 554}
 555
 556static void __init biovec_init_pools(void)
 557{
 558        int i, size, megabytes, pool_entries = BIO_POOL_SIZE;
 559        int scale = BIOVEC_NR_POOLS;
 560
 561        megabytes = nr_free_pages() >> (20 - PAGE_SHIFT);
 562
 563        /*
 564         * find out where to start scaling
 565         */
 566        if (megabytes <= 16)
 567                scale = 0;
 568        else if (megabytes <= 32)
 569                scale = 1;
 570        else if (megabytes <= 64)
 571                scale = 2;
 572        else if (megabytes <= 96)
 573                scale = 3;
 574        else if (megabytes <= 128)
 575                scale = 4;
 576
 577        /*
 578         * scale number of entries
 579         */
 580        pool_entries = megabytes * 2;
 581        if (pool_entries > 256)
 582                pool_entries = 256;
 583
 584        for (i = 0; i < BIOVEC_NR_POOLS; i++) {
 585                struct biovec_pool *bp = bvec_array + i;
 586
 587                size = bp->nr_vecs * sizeof(struct bio_vec);
 588
 589                bp->slab = kmem_cache_create(bp->name, size, 0,
 590                                                SLAB_HWCACHE_ALIGN, NULL, NULL);
 591                if (!bp->slab)
 592                        panic("biovec: can't init slab cache\n");
 593
 594                if (i >= scale)
 595                        pool_entries >>= 1;
 596
 597                bp->pool = mempool_create(pool_entries, slab_pool_alloc,
 598                                        slab_pool_free, bp->slab);
 599                if (!bp->pool)
 600                        panic("biovec: can't init mempool\n");
 601
 602                printk("biovec pool[%d]: %3d bvecs: %3d entries (%d bytes)\n",
 603                                                i, bp->nr_vecs, pool_entries,
 604                                                size);
 605        }
 606}
 607
 608static int __init init_bio(void)
 609{
 610        bio_slab = kmem_cache_create("bio", sizeof(struct bio), 0,
 611                                        SLAB_HWCACHE_ALIGN, NULL, NULL);
 612        if (!bio_slab)
 613                panic("bio: can't create slab cache\n");
 614        bio_pool = mempool_create(BIO_POOL_SIZE, slab_pool_alloc, slab_pool_free, bio_slab);
 615        if (!bio_pool)
 616                panic("bio: can't create mempool\n");
 617
 618        printk("BIO: pool of %d setup, %ZuKb (%Zd bytes/bio)\n", BIO_POOL_SIZE, BIO_POOL_SIZE * sizeof(struct bio) >> 10, sizeof(struct bio));
 619
 620        biovec_init_pools();
 621
 622        return 0;
 623}
 624
 625module_init(init_bio);
 626
 627EXPORT_SYMBOL(bio_alloc);
 628EXPORT_SYMBOL(bio_put);
 629EXPORT_SYMBOL(ll_rw_kio);
 630EXPORT_SYMBOL(bio_endio);
 631EXPORT_SYMBOL(bio_init);
 632EXPORT_SYMBOL(bio_copy);
 633EXPORT_SYMBOL(__bio_clone);
 634EXPORT_SYMBOL(bio_clone);
 635EXPORT_SYMBOL(bio_phys_segments);
 636EXPORT_SYMBOL(bio_hw_segments);
 637EXPORT_SYMBOL(bio_add_page);
 638
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.