linux/drivers/md/dm-bufio.c
   1/*
   2 * Copyright (C) 2009-2011 Red Hat, Inc.
   3 *
   4 * Author: Mikulas Patocka <mpatocka@redhat.com>
   5 *
   6 * This file is released under the GPL.
   7 */
   8
   9#include "dm-bufio.h"
  10
  11#include <linux/device-mapper.h>
  12#include <linux/dm-io.h>
  13#include <linux/slab.h>
  14#include <linux/vmalloc.h>
  15#include <linux/shrinker.h>
  16#include <linux/module.h>
  17
  18#define DM_MSG_PREFIX "bufio"
  19
  20/*
  21 * Memory management policy:
  22 *      Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
  23 *      or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
  24 *      Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
   25 *      Start background writeback when the number of dirty buffers
   26 *      exceeds DM_BUFIO_WRITEBACK_PERCENT of the limit (example below).
  27 */
  28#define DM_BUFIO_MIN_BUFFERS            8
  29
  30#define DM_BUFIO_MEMORY_PERCENT         2
  31#define DM_BUFIO_VMALLOC_PERCENT        25
  32#define DM_BUFIO_WRITEBACK_PERCENT      75
  33
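/*
 * Worked example (illustrative figures only): with 8GiB of directly mapped
 * low memory, the default cache is DM_BUFIO_MEMORY_PERCENT (2%) of that,
 * about 164MiB.  On a 32-bit machine with only 128MiB of vmalloc space, the
 * DM_BUFIO_VMALLOC_PERCENT cap (25%, i.e. 32MiB) is lower and wins.  The
 * chosen total is split evenly between clients in __cache_size_refresh(),
 * and __get_memory_limit() never lets a client drop below
 * DM_BUFIO_MIN_BUFFERS buffers.
 */
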
  34/*
  35 * Check buffer ages in this interval (seconds)
  36 */
  37#define DM_BUFIO_WORK_TIMER_SECS        10
  38
  39/*
  40 * Free buffers when they are older than this (seconds)
  41 */
  42#define DM_BUFIO_DEFAULT_AGE_SECS       60
  43
  44/*
  45 * The number of bvec entries that are embedded directly in the buffer.
   46 * If the block size is larger, dm-io is used to do the I/O.
  47 */
  48#define DM_BUFIO_INLINE_VECS            16
  49
  50/*
  51 * Buffer hash
  52 */
  53#define DM_BUFIO_HASH_BITS      20
  54#define DM_BUFIO_HASH(block) \
  55        ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
  56         ((1 << DM_BUFIO_HASH_BITS) - 1))
  57
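/*
 * Example: for block 0x12345678 the bits above DM_BUFIO_HASH_BITS are folded
 * back in before masking, so the bucket is
 * (0x123 ^ 0x12345678) & 0xfffff = 0x4575b.
 */
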
  58/*
  59 * Don't try to use kmem_cache_alloc for blocks larger than this.
  60 * For explanation, see alloc_buffer_data below.
  61 */
  62#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT  (PAGE_SIZE >> 1)
  63#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT   (PAGE_SIZE << (MAX_ORDER - 1))
  64
  65/*
  66 * dm_buffer->list_mode
  67 */
  68#define LIST_CLEAN      0
  69#define LIST_DIRTY      1
  70#define LIST_SIZE       2
  71
  72/*
  73 * Linking of buffers:
  74 *      All buffers are linked to cache_hash with their hash_list field.
  75 *
  76 *      Clean buffers that are not being written (B_WRITING not set)
  77 *      are linked to lru[LIST_CLEAN] with their lru_list field.
  78 *
  79 *      Dirty and clean buffers that are being written are linked to
  80 *      lru[LIST_DIRTY] with their lru_list field. When the write
  81 *      finishes, the buffer cannot be relinked immediately (because we
  82 *      are in an interrupt context and relinking requires process
  83 *      context), so some clean-not-writing buffers can be held on
   84 *      dirty_lru too.  They are moved to the clean lru later, in the
   85 *      process context.
  86 */
  87struct dm_bufio_client {
  88        struct mutex lock;
  89
  90        struct list_head lru[LIST_SIZE];
  91        unsigned long n_buffers[LIST_SIZE];
  92
  93        struct block_device *bdev;
  94        unsigned block_size;
  95        unsigned char sectors_per_block_bits;
  96        unsigned char pages_per_block_bits;
  97        unsigned char blocks_per_page_bits;
  98        unsigned aux_size;
  99        void (*alloc_callback)(struct dm_buffer *);
 100        void (*write_callback)(struct dm_buffer *);
 101
 102        struct dm_io_client *dm_io;
 103
 104        struct list_head reserved_buffers;
 105        unsigned need_reserved_buffers;
 106
 107        struct hlist_head *cache_hash;
 108        wait_queue_head_t free_buffer_wait;
 109
 110        int async_write_error;
 111
 112        struct list_head client_list;
 113        struct shrinker shrinker;
 114};
 115
 116/*
 117 * Buffer state bits.
 118 */
 119#define B_READING       0
 120#define B_WRITING       1
 121#define B_DIRTY         2
 122
 123/*
 124 * Describes how the block was allocated:
 125 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 126 * See the comment at alloc_buffer_data.
 127 */
 128enum data_mode {
 129        DATA_MODE_SLAB = 0,
 130        DATA_MODE_GET_FREE_PAGES = 1,
 131        DATA_MODE_VMALLOC = 2,
 132        DATA_MODE_LIMIT = 3
 133};
 134
 135struct dm_buffer {
 136        struct hlist_node hash_list;
 137        struct list_head lru_list;
 138        sector_t block;
 139        void *data;
 140        enum data_mode data_mode;
 141        unsigned char list_mode;                /* LIST_* */
 142        unsigned hold_count;
 143        int read_error;
 144        int write_error;
 145        unsigned long state;
 146        unsigned long last_accessed;
 147        struct dm_bufio_client *c;
 148        struct list_head write_list;
 149        struct bio bio;
 150        struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
 151};
 152
 153/*----------------------------------------------------------------*/
 154
 155static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
 156static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];
 157
 158static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
 159{
 160        unsigned ret = c->blocks_per_page_bits - 1;
 161
 162        BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));
 163
 164        return ret;
 165}
 166
 167#define DM_BUFIO_CACHE(c)       (dm_bufio_caches[dm_bufio_cache_index(c)])
 168#define DM_BUFIO_CACHE_NAME(c)  (dm_bufio_cache_names[dm_bufio_cache_index(c)])
 169
 170#define dm_bufio_in_request()   (!!current->bio_list)
 171
 172static void dm_bufio_lock(struct dm_bufio_client *c)
 173{
 174        mutex_lock_nested(&c->lock, dm_bufio_in_request());
 175}
 176
 177static int dm_bufio_trylock(struct dm_bufio_client *c)
 178{
 179        return mutex_trylock(&c->lock);
 180}
 181
 182static void dm_bufio_unlock(struct dm_bufio_client *c)
 183{
 184        mutex_unlock(&c->lock);
 185}
 186
 187/*
 188 * FIXME Move to sched.h?
 189 */
 190#ifdef CONFIG_PREEMPT_VOLUNTARY
 191#  define dm_bufio_cond_resched()               \
 192do {                                            \
 193        if (unlikely(need_resched()))           \
 194                _cond_resched();                \
 195} while (0)
 196#else
 197#  define dm_bufio_cond_resched()                do { } while (0)
 198#endif
 199
 200/*----------------------------------------------------------------*/
 201
 202/*
 203 * Default cache size: available memory divided by the ratio.
 204 */
 205static unsigned long dm_bufio_default_cache_size;
 206
 207/*
 208 * Total cache size set by the user.
 209 */
 210static unsigned long dm_bufio_cache_size;
 211
 212/*
 213 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 214 * at any time.  If it disagrees, the user has changed cache size.
 215 */
 216static unsigned long dm_bufio_cache_size_latch;
 217
 218static DEFINE_SPINLOCK(param_spinlock);
 219
 220/*
 221 * Buffers are freed after this timeout
 222 */
 223static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
 224
 225static unsigned long dm_bufio_peak_allocated;
 226static unsigned long dm_bufio_allocated_kmem_cache;
 227static unsigned long dm_bufio_allocated_get_free_pages;
 228static unsigned long dm_bufio_allocated_vmalloc;
 229static unsigned long dm_bufio_current_allocated;
 230
 231/*----------------------------------------------------------------*/
 232
 233/*
 234 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
 235 */
 236static unsigned long dm_bufio_cache_size_per_client;
 237
 238/*
 239 * The current number of clients.
 240 */
 241static int dm_bufio_client_count;
 242
 243/*
 244 * The list of all clients.
 245 */
 246static LIST_HEAD(dm_bufio_all_clients);
 247
 248/*
 249 * This mutex protects dm_bufio_cache_size_latch,
 250 * dm_bufio_cache_size_per_client and dm_bufio_client_count
 251 */
 252static DEFINE_MUTEX(dm_bufio_clients_lock);
 253
 254/*----------------------------------------------------------------*/
 255
 256static void adjust_total_allocated(enum data_mode data_mode, long diff)
 257{
 258        static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
 259                &dm_bufio_allocated_kmem_cache,
 260                &dm_bufio_allocated_get_free_pages,
 261                &dm_bufio_allocated_vmalloc,
 262        };
 263
 264        spin_lock(&param_spinlock);
 265
 266        *class_ptr[data_mode] += diff;
 267
 268        dm_bufio_current_allocated += diff;
 269
 270        if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
 271                dm_bufio_peak_allocated = dm_bufio_current_allocated;
 272
 273        spin_unlock(&param_spinlock);
 274}
 275
 276/*
 277 * Change the number of clients and recalculate per-client limit.
 278 */
 279static void __cache_size_refresh(void)
 280{
 281        BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
 282        BUG_ON(dm_bufio_client_count < 0);
 283
 284        dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size);
 285
 286        /*
 287         * Use default if set to 0 and report the actual cache size used.
 288         */
 289        if (!dm_bufio_cache_size_latch) {
 290                (void)cmpxchg(&dm_bufio_cache_size, 0,
 291                              dm_bufio_default_cache_size);
 292                dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
 293        }
 294
 295        dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
 296                                         (dm_bufio_client_count ? : 1);
 297}
 298
 299/*
 300 * Allocating buffer data.
 301 *
 302 * Small buffers are allocated with kmem_cache, to use space optimally.
 303 *
 304 * For large buffers, we choose between get_free_pages and vmalloc.
 305 * Each has advantages and disadvantages.
 306 *
 307 * __get_free_pages can randomly fail if the memory is fragmented.
 308 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 309 * as low as 128M) so using it for caching is not appropriate.
 310 *
 311 * If the allocation may fail we use __get_free_pages. Memory fragmentation
 312 * won't have a fatal effect here, but it just causes flushes of some other
 313 * buffers and more I/O will be performed. Don't use __get_free_pages if it
 314 * always fails (i.e. order >= MAX_ORDER).
 315 *
 316 * If the allocation shouldn't fail we use __vmalloc. This is only for the
 317 * initial reserve allocation, so there's no risk of wasting all vmalloc
 318 * space.
 319 */
 320static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
 321                               enum data_mode *data_mode)
 322{
 323        unsigned noio_flag;
 324        void *ptr;
 325
 326        if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
 327                *data_mode = DATA_MODE_SLAB;
 328                return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
 329        }
 330
 331        if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
 332            gfp_mask & __GFP_NORETRY) {
 333                *data_mode = DATA_MODE_GET_FREE_PAGES;
 334                return (void *)__get_free_pages(gfp_mask,
 335                                                c->pages_per_block_bits);
 336        }
 337
 338        *data_mode = DATA_MODE_VMALLOC;
 339
 340        /*
 341         * __vmalloc allocates the data pages and auxiliary structures with
 342         * gfp_flags that were specified, but pagetables are always allocated
 343         * with GFP_KERNEL, no matter what was specified as gfp_mask.
 344         *
 345         * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
 346         * all allocations done by this process (including pagetables) are done
 347         * as if GFP_NOIO was specified.
 348         */
 349
 350        if (gfp_mask & __GFP_NORETRY)
 351                noio_flag = memalloc_noio_save();
 352
 353        ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL);
 354
 355        if (gfp_mask & __GFP_NORETRY)
 356                memalloc_noio_restore(noio_flag);
 357
 358        return ptr;
 359}
 360
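/*
 * Some concrete cases of the policy above (illustrative, assuming PAGE_SIZE
 * is 4096 and MAX_ORDER is 11):
 *      - a 512-byte or 1KiB block is under DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT
 *        (2KiB) and comes from the per-block-size kmem_cache;
 *      - a 64KiB block requested with __GFP_NORETRY is under the 4MiB
 *        DM_BUFIO_BLOCK_SIZE_GFP_LIMIT and uses __get_free_pages(order 4);
 *      - the same 64KiB block allocated for the initial reserve (GFP_KERNEL,
 *        without __GFP_NORETRY) falls through to __vmalloc.
 */
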
 361/*
 362 * Free buffer's data.
 363 */
 364static void free_buffer_data(struct dm_bufio_client *c,
 365                             void *data, enum data_mode data_mode)
 366{
 367        switch (data_mode) {
 368        case DATA_MODE_SLAB:
 369                kmem_cache_free(DM_BUFIO_CACHE(c), data);
 370                break;
 371
 372        case DATA_MODE_GET_FREE_PAGES:
 373                free_pages((unsigned long)data, c->pages_per_block_bits);
 374                break;
 375
 376        case DATA_MODE_VMALLOC:
 377                vfree(data);
 378                break;
 379
 380        default:
 381                DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
 382                       data_mode);
 383                BUG();
 384        }
 385}
 386
 387/*
 388 * Allocate buffer and its data.
 389 */
 390static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
 391{
 392        struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
 393                                      gfp_mask);
 394
 395        if (!b)
 396                return NULL;
 397
 398        b->c = c;
 399
 400        b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
 401        if (!b->data) {
 402                kfree(b);
 403                return NULL;
 404        }
 405
 406        adjust_total_allocated(b->data_mode, (long)c->block_size);
 407
 408        return b;
 409}
 410
 411/*
 412 * Free buffer and its data.
 413 */
 414static void free_buffer(struct dm_buffer *b)
 415{
 416        struct dm_bufio_client *c = b->c;
 417
 418        adjust_total_allocated(b->data_mode, -(long)c->block_size);
 419
 420        free_buffer_data(c, b->data, b->data_mode);
 421        kfree(b);
 422}
 423
 424/*
 425 * Link buffer to the hash list and clean or dirty queue.
 426 */
 427static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
 428{
 429        struct dm_bufio_client *c = b->c;
 430
 431        c->n_buffers[dirty]++;
 432        b->block = block;
 433        b->list_mode = dirty;
 434        list_add(&b->lru_list, &c->lru[dirty]);
 435        hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
 436        b->last_accessed = jiffies;
 437}
 438
 439/*
 440 * Unlink buffer from the hash list and dirty or clean queue.
 441 */
 442static void __unlink_buffer(struct dm_buffer *b)
 443{
 444        struct dm_bufio_client *c = b->c;
 445
 446        BUG_ON(!c->n_buffers[b->list_mode]);
 447
 448        c->n_buffers[b->list_mode]--;
 449        hlist_del(&b->hash_list);
 450        list_del(&b->lru_list);
 451}
 452
 453/*
 454 * Place the buffer to the head of dirty or clean LRU queue.
 455 */
 456static void __relink_lru(struct dm_buffer *b, int dirty)
 457{
 458        struct dm_bufio_client *c = b->c;
 459
 460        BUG_ON(!c->n_buffers[b->list_mode]);
 461
 462        c->n_buffers[b->list_mode]--;
 463        c->n_buffers[dirty]++;
 464        b->list_mode = dirty;
 465        list_move(&b->lru_list, &c->lru[dirty]);
 466}
 467
 468/*----------------------------------------------------------------
 469 * Submit I/O on the buffer.
 470 *
 471 * Bio interface is faster but it has some problems:
 472 *      the vector list is limited (increasing this limit increases
 473 *      memory-consumption per buffer, so it is not viable);
 474 *
 475 *      the memory must be direct-mapped, not vmalloced;
 476 *
 477 *      the I/O driver can reject requests spuriously if it thinks that
 478 *      the requests are too big for the device or if they cross a
 479 *      controller-defined memory boundary.
 480 *
 481 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 482 * it is not vmalloced, try using the bio interface.
 483 *
 484 * If the buffer is big, if it is vmalloced or if the underlying device
  485 * rejects the bio because it is too large, use the dm-io layer to do the I/O.
 486 * The dm-io layer splits the I/O into multiple requests, avoiding the above
 487 * shortcomings.
 488 *--------------------------------------------------------------*/
 489
 490/*
 491 * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
  492 * that the request was handled directly with the bio interface.
 493 */
 494static void dmio_complete(unsigned long error, void *context)
 495{
 496        struct dm_buffer *b = context;
 497
 498        b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
 499}
 500
 501static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
 502                     bio_end_io_t *end_io)
 503{
 504        int r;
 505        struct dm_io_request io_req = {
 506                .bi_rw = rw,
 507                .notify.fn = dmio_complete,
 508                .notify.context = b,
 509                .client = b->c->dm_io,
 510        };
 511        struct dm_io_region region = {
 512                .bdev = b->c->bdev,
 513                .sector = block << b->c->sectors_per_block_bits,
 514                .count = b->c->block_size >> SECTOR_SHIFT,
 515        };
 516
 517        if (b->data_mode != DATA_MODE_VMALLOC) {
 518                io_req.mem.type = DM_IO_KMEM;
 519                io_req.mem.ptr.addr = b->data;
 520        } else {
 521                io_req.mem.type = DM_IO_VMA;
 522                io_req.mem.ptr.vma = b->data;
 523        }
 524
 525        b->bio.bi_end_io = end_io;
 526
 527        r = dm_io(&io_req, 1, &region, NULL);
 528        if (r)
 529                end_io(&b->bio, r);
 530}
 531
 532static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
 533                           bio_end_io_t *end_io)
 534{
 535        char *ptr;
 536        int len;
 537
 538        bio_init(&b->bio);
 539        b->bio.bi_io_vec = b->bio_vec;
 540        b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
 541        b->bio.bi_sector = block << b->c->sectors_per_block_bits;
 542        b->bio.bi_bdev = b->c->bdev;
 543        b->bio.bi_end_io = end_io;
 544
 545        /*
 546         * We assume that if len >= PAGE_SIZE ptr is page-aligned.
 547         * If len < PAGE_SIZE the buffer doesn't cross page boundary.
 548         */
 549        ptr = b->data;
 550        len = b->c->block_size;
 551
 552        if (len >= PAGE_SIZE)
 553                BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
 554        else
 555                BUG_ON((unsigned long)ptr & (len - 1));
 556
 557        do {
 558                if (!bio_add_page(&b->bio, virt_to_page(ptr),
 559                                  len < PAGE_SIZE ? len : PAGE_SIZE,
 560                                  virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
 561                        BUG_ON(b->c->block_size <= PAGE_SIZE);
 562                        use_dmio(b, rw, block, end_io);
 563                        return;
 564                }
 565
 566                len -= PAGE_SIZE;
 567                ptr += PAGE_SIZE;
 568        } while (len > 0);
 569
 570        submit_bio(rw, &b->bio);
 571}
 572
 573static void submit_io(struct dm_buffer *b, int rw, sector_t block,
 574                      bio_end_io_t *end_io)
 575{
 576        if (rw == WRITE && b->c->write_callback)
 577                b->c->write_callback(b);
 578
 579        if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
 580            b->data_mode != DATA_MODE_VMALLOC)
 581                use_inline_bio(b, rw, block, end_io);
 582        else
 583                use_dmio(b, rw, block, end_io);
 584}
 585
 586/*----------------------------------------------------------------
 587 * Writing dirty buffers
 588 *--------------------------------------------------------------*/
 589
 590/*
 591 * The endio routine for write.
 592 *
 593 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
 594 * it.
 595 */
 596static void write_endio(struct bio *bio, int error)
 597{
 598        struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
 599
 600        b->write_error = error;
 601        if (unlikely(error)) {
 602                struct dm_bufio_client *c = b->c;
 603                (void)cmpxchg(&c->async_write_error, 0, error);
 604        }
 605
 606        BUG_ON(!test_bit(B_WRITING, &b->state));
 607
 608        smp_mb__before_clear_bit();
 609        clear_bit(B_WRITING, &b->state);
 610        smp_mb__after_clear_bit();
 611
 612        wake_up_bit(&b->state, B_WRITING);
 613}
 614
 615/*
 616 * This function is called when wait_on_bit is actually waiting.
 617 */
 618static int do_io_schedule(void *word)
 619{
 620        io_schedule();
 621
 622        return 0;
 623}
 624
 625/*
 626 * Initiate a write on a dirty buffer, but don't wait for it.
 627 *
 628 * - If the buffer is not dirty, exit.
  629 * - If there is a previous write going on, wait for it to finish (we can't
 630 *   have two writes on the same buffer simultaneously).
 631 * - Submit our write and don't wait on it. We set B_WRITING indicating
 632 *   that there is a write in progress.
 633 */
 634static void __write_dirty_buffer(struct dm_buffer *b,
 635                                 struct list_head *write_list)
 636{
 637        if (!test_bit(B_DIRTY, &b->state))
 638                return;
 639
 640        clear_bit(B_DIRTY, &b->state);
 641        wait_on_bit_lock(&b->state, B_WRITING,
 642                         do_io_schedule, TASK_UNINTERRUPTIBLE);
 643
 644        if (!write_list)
 645                submit_io(b, WRITE, b->block, write_endio);
 646        else
 647                list_add_tail(&b->write_list, write_list);
 648}
 649
 650static void __flush_write_list(struct list_head *write_list)
 651{
 652        struct blk_plug plug;
 653        blk_start_plug(&plug);
 654        while (!list_empty(write_list)) {
 655                struct dm_buffer *b =
 656                        list_entry(write_list->next, struct dm_buffer, write_list);
 657                list_del(&b->write_list);
 658                submit_io(b, WRITE, b->block, write_endio);
 659                dm_bufio_cond_resched();
 660        }
 661        blk_finish_plug(&plug);
 662}
 663
 664/*
 665 * Wait until any activity on the buffer finishes.  Possibly write the
 666 * buffer if it is dirty.  When this function finishes, there is no I/O
 667 * running on the buffer and the buffer is not dirty.
 668 */
 669static void __make_buffer_clean(struct dm_buffer *b)
 670{
 671        BUG_ON(b->hold_count);
 672
 673        if (!b->state)  /* fast case */
 674                return;
 675
 676        wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
 677        __write_dirty_buffer(b, NULL);
 678        wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
 679}
 680
 681/*
 682 * Find some buffer that is not held by anybody, clean it, unlink it and
 683 * return it.
 684 */
 685static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
 686{
 687        struct dm_buffer *b;
 688
 689        list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
 690                BUG_ON(test_bit(B_WRITING, &b->state));
 691                BUG_ON(test_bit(B_DIRTY, &b->state));
 692
 693                if (!b->hold_count) {
 694                        __make_buffer_clean(b);
 695                        __unlink_buffer(b);
 696                        return b;
 697                }
 698                dm_bufio_cond_resched();
 699        }
 700
 701        list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
 702                BUG_ON(test_bit(B_READING, &b->state));
 703
 704                if (!b->hold_count) {
 705                        __make_buffer_clean(b);
 706                        __unlink_buffer(b);
 707                        return b;
 708                }
 709                dm_bufio_cond_resched();
 710        }
 711
 712        return NULL;
 713}
 714
 715/*
  716 * Wait until some other thread frees a buffer or releases its hold count
  717 * on some buffer.
 718 *
 719 * This function is entered with c->lock held, drops it and regains it
 720 * before exiting.
 721 */
 722static void __wait_for_free_buffer(struct dm_bufio_client *c)
 723{
 724        DECLARE_WAITQUEUE(wait, current);
 725
 726        add_wait_queue(&c->free_buffer_wait, &wait);
 727        set_task_state(current, TASK_UNINTERRUPTIBLE);
 728        dm_bufio_unlock(c);
 729
 730        io_schedule();
 731
 732        set_task_state(current, TASK_RUNNING);
 733        remove_wait_queue(&c->free_buffer_wait, &wait);
 734
 735        dm_bufio_lock(c);
 736}
 737
 738enum new_flag {
 739        NF_FRESH = 0,
 740        NF_READ = 1,
 741        NF_GET = 2,
 742        NF_PREFETCH = 3
 743};
 744
 745/*
 746 * Allocate a new buffer. If the allocation is not possible, wait until
 747 * some other thread frees a buffer.
 748 *
 749 * May drop the lock and regain it.
 750 */
 751static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
 752{
 753        struct dm_buffer *b;
 754
 755        /*
 756         * dm-bufio is resistant to allocation failures (it just keeps
  757         * one buffer reserved in case all the allocations fail).
 758         * So set flags to not try too hard:
 759         *      GFP_NOIO: don't recurse into the I/O layer
 760         *      __GFP_NORETRY: don't retry and rather return failure
 761         *      __GFP_NOMEMALLOC: don't use emergency reserves
 762         *      __GFP_NOWARN: don't print a warning in case of failure
 763         *
 764         * For debugging, if we set the cache size to 1, no new buffers will
 765         * be allocated.
 766         */
 767        while (1) {
 768                if (dm_bufio_cache_size_latch != 1) {
 769                        b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
 770                        if (b)
 771                                return b;
 772                }
 773
 774                if (nf == NF_PREFETCH)
 775                        return NULL;
 776
 777                if (!list_empty(&c->reserved_buffers)) {
 778                        b = list_entry(c->reserved_buffers.next,
 779                                       struct dm_buffer, lru_list);
 780                        list_del(&b->lru_list);
 781                        c->need_reserved_buffers++;
 782
 783                        return b;
 784                }
 785
 786                b = __get_unclaimed_buffer(c);
 787                if (b)
 788                        return b;
 789
 790                __wait_for_free_buffer(c);
 791        }
 792}
 793
 794static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
 795{
 796        struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
 797
 798        if (!b)
 799                return NULL;
 800
 801        if (c->alloc_callback)
 802                c->alloc_callback(b);
 803
 804        return b;
 805}
 806
 807/*
 808 * Free a buffer and wake other threads waiting for free buffers.
 809 */
 810static void __free_buffer_wake(struct dm_buffer *b)
 811{
 812        struct dm_bufio_client *c = b->c;
 813
 814        if (!c->need_reserved_buffers)
 815                free_buffer(b);
 816        else {
 817                list_add(&b->lru_list, &c->reserved_buffers);
 818                c->need_reserved_buffers--;
 819        }
 820
 821        wake_up(&c->free_buffer_wait);
 822}
 823
 824static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
 825                                        struct list_head *write_list)
 826{
 827        struct dm_buffer *b, *tmp;
 828
 829        list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
 830                BUG_ON(test_bit(B_READING, &b->state));
 831
 832                if (!test_bit(B_DIRTY, &b->state) &&
 833                    !test_bit(B_WRITING, &b->state)) {
 834                        __relink_lru(b, LIST_CLEAN);
 835                        continue;
 836                }
 837
 838                if (no_wait && test_bit(B_WRITING, &b->state))
 839                        return;
 840
 841                __write_dirty_buffer(b, write_list);
 842                dm_bufio_cond_resched();
 843        }
 844}
 845
 846/*
 847 * Get writeback threshold and buffer limit for a given client.
 848 */
 849static void __get_memory_limit(struct dm_bufio_client *c,
 850                               unsigned long *threshold_buffers,
 851                               unsigned long *limit_buffers)
 852{
 853        unsigned long buffers;
 854
 855        if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
 856                mutex_lock(&dm_bufio_clients_lock);
 857                __cache_size_refresh();
 858                mutex_unlock(&dm_bufio_clients_lock);
 859        }
 860
 861        buffers = dm_bufio_cache_size_per_client >>
 862                  (c->sectors_per_block_bits + SECTOR_SHIFT);
 863
 864        if (buffers < DM_BUFIO_MIN_BUFFERS)
 865                buffers = DM_BUFIO_MIN_BUFFERS;
 866
 867        *limit_buffers = buffers;
 868        *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
 869}
 870
 871/*
  872 * Check if we're over the watermark.
  873 * If we are over limit_buffers, free unclaimed buffers to get under the limit.
  874 * If the number of dirty buffers exceeds threshold_buffers, start writeback.
 875 */
 876static void __check_watermark(struct dm_bufio_client *c,
 877                              struct list_head *write_list)
 878{
 879        unsigned long threshold_buffers, limit_buffers;
 880
 881        __get_memory_limit(c, &threshold_buffers, &limit_buffers);
 882
 883        while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
 884               limit_buffers) {
 885
 886                struct dm_buffer *b = __get_unclaimed_buffer(c);
 887
 888                if (!b)
 889                        return;
 890
 891                __free_buffer_wake(b);
 892                dm_bufio_cond_resched();
 893        }
 894
 895        if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
 896                __write_dirty_buffers_async(c, 1, write_list);
 897}
 898
 899/*
 900 * Find a buffer in the hash.
 901 */
 902static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
 903{
 904        struct dm_buffer *b;
 905
 906        hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)],
 907                             hash_list) {
 908                dm_bufio_cond_resched();
 909                if (b->block == block)
 910                        return b;
 911        }
 912
 913        return NULL;
 914}
 915
 916/*----------------------------------------------------------------
 917 * Getting a buffer
 918 *--------------------------------------------------------------*/
 919
 920static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
 921                                     enum new_flag nf, int *need_submit,
 922                                     struct list_head *write_list)
 923{
 924        struct dm_buffer *b, *new_b = NULL;
 925
 926        *need_submit = 0;
 927
 928        b = __find(c, block);
 929        if (b)
 930                goto found_buffer;
 931
 932        if (nf == NF_GET)
 933                return NULL;
 934
 935        new_b = __alloc_buffer_wait(c, nf);
 936        if (!new_b)
 937                return NULL;
 938
 939        /*
 940         * We've had a period where the mutex was unlocked, so need to
 941         * recheck the hash table.
 942         */
 943        b = __find(c, block);
 944        if (b) {
 945                __free_buffer_wake(new_b);
 946                goto found_buffer;
 947        }
 948
 949        __check_watermark(c, write_list);
 950
 951        b = new_b;
 952        b->hold_count = 1;
 953        b->read_error = 0;
 954        b->write_error = 0;
 955        __link_buffer(b, block, LIST_CLEAN);
 956
 957        if (nf == NF_FRESH) {
 958                b->state = 0;
 959                return b;
 960        }
 961
 962        b->state = 1 << B_READING;
 963        *need_submit = 1;
 964
 965        return b;
 966
 967found_buffer:
 968        if (nf == NF_PREFETCH)
 969                return NULL;
 970        /*
 971         * Note: it is essential that we don't wait for the buffer to be
 972         * read if dm_bufio_get function is used. Both dm_bufio_get and
 973         * dm_bufio_prefetch can be used in the driver request routine.
 974         * If the user called both dm_bufio_prefetch and dm_bufio_get on
 975         * the same buffer, it would deadlock if we waited.
 976         */
 977        if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
 978                return NULL;
 979
 980        b->hold_count++;
 981        __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
 982                     test_bit(B_WRITING, &b->state));
 983        return b;
 984}
 985
 986/*
 987 * The endio routine for reading: set the error, clear the bit and wake up
 988 * anyone waiting on the buffer.
 989 */
 990static void read_endio(struct bio *bio, int error)
 991{
 992        struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
 993
 994        b->read_error = error;
 995
 996        BUG_ON(!test_bit(B_READING, &b->state));
 997
 998        smp_mb__before_clear_bit();
 999        clear_bit(B_READING, &b->state);
1000        smp_mb__after_clear_bit();
1001
1002        wake_up_bit(&b->state, B_READING);
1003}
1004
1005/*
1006 * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
1007 * functions is similar except that dm_bufio_new doesn't read the
1008 * buffer from the disk (assuming that the caller overwrites all the data
1009 * and uses dm_bufio_mark_buffer_dirty to write new data back).
1010 */
1011static void *new_read(struct dm_bufio_client *c, sector_t block,
1012                      enum new_flag nf, struct dm_buffer **bp)
1013{
1014        int need_submit;
1015        struct dm_buffer *b;
1016
1017        LIST_HEAD(write_list);
1018
1019        dm_bufio_lock(c);
1020        b = __bufio_new(c, block, nf, &need_submit, &write_list);
1021        dm_bufio_unlock(c);
1022
1023        __flush_write_list(&write_list);
1024
1025        if (!b)
1026                return b;
1027
1028        if (need_submit)
1029                submit_io(b, READ, b->block, read_endio);
1030
1031        wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
1032
1033        if (b->read_error) {
1034                int error = b->read_error;
1035
1036                dm_bufio_release(b);
1037
1038                return ERR_PTR(error);
1039        }
1040
1041        *bp = b;
1042
1043        return b->data;
1044}
1045
1046void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
1047                   struct dm_buffer **bp)
1048{
1049        return new_read(c, block, NF_GET, bp);
1050}
1051EXPORT_SYMBOL_GPL(dm_bufio_get);
1052
1053void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1054                    struct dm_buffer **bp)
1055{
1056        BUG_ON(dm_bufio_in_request());
1057
1058        return new_read(c, block, NF_READ, bp);
1059}
1060EXPORT_SYMBOL_GPL(dm_bufio_read);
1061
1062void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1063                   struct dm_buffer **bp)
1064{
1065        BUG_ON(dm_bufio_in_request());
1066
1067        return new_read(c, block, NF_FRESH, bp);
1068}
1069EXPORT_SYMBOL_GPL(dm_bufio_new);
1070
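/*
 * Illustrative sketch, not part of this driver: a hypothetical caller doing
 * a read-modify-write cycle with the functions exported above.  The block
 * number and the single-byte modification are assumptions for the example;
 * like dm_bufio_read() itself, this must not be called from a device's
 * request routine.
 */
static int __maybe_unused dm_bufio_example_rmw(struct dm_bufio_client *c,
                                               sector_t block)
{
        struct dm_buffer *b;
        void *data;

        data = dm_bufio_read(c, block, &b);     /* read the block, take a hold */
        if (IS_ERR(data))
                return PTR_ERR(data);

        ((char *)data)[0] = 0;                  /* modify the cached data */
        dm_bufio_mark_buffer_dirty(b);          /* queue it for writeback */
        dm_bufio_release(b);                    /* drop our hold count */

        /* write all dirty buffers of this client and flush the device cache */
        return dm_bufio_write_dirty_buffers(c);
}
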
1071void dm_bufio_prefetch(struct dm_bufio_client *c,
1072                       sector_t block, unsigned n_blocks)
1073{
1074        struct blk_plug plug;
1075
1076        LIST_HEAD(write_list);
1077
1078        BUG_ON(dm_bufio_in_request());
1079
1080        blk_start_plug(&plug);
1081        dm_bufio_lock(c);
1082
1083        for (; n_blocks--; block++) {
1084                int need_submit;
1085                struct dm_buffer *b;
1086                b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
1087                                &write_list);
1088                if (unlikely(!list_empty(&write_list))) {
1089                        dm_bufio_unlock(c);
1090                        blk_finish_plug(&plug);
1091                        __flush_write_list(&write_list);
1092                        blk_start_plug(&plug);
1093                        dm_bufio_lock(c);
1094                }
1095                if (unlikely(b != NULL)) {
1096                        dm_bufio_unlock(c);
1097
1098                        if (need_submit)
1099                                submit_io(b, READ, b->block, read_endio);
1100                        dm_bufio_release(b);
1101
1102                        dm_bufio_cond_resched();
1103
1104                        if (!n_blocks)
1105                                goto flush_plug;
1106                        dm_bufio_lock(c);
1107                }
1108        }
1109
1110        dm_bufio_unlock(c);
1111
1112flush_plug:
1113        blk_finish_plug(&plug);
1114}
1115EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
1116
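/*
 * Example of intended use (illustrative, not part of this driver): a caller
 * about to walk a range of metadata blocks can issue
 * dm_bufio_prefetch(c, first_block, n_blocks) first and then read the blocks
 * one by one with dm_bufio_read(); the prefetch submits the reads as one
 * plugged batch, so the subsequent reads mostly hit the cache.
 */
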
1117void dm_bufio_release(struct dm_buffer *b)
1118{
1119        struct dm_bufio_client *c = b->c;
1120
1121        dm_bufio_lock(c);
1122
1123        BUG_ON(!b->hold_count);
1124
1125        b->hold_count--;
1126        if (!b->hold_count) {
1127                wake_up(&c->free_buffer_wait);
1128
1129                /*
1130                 * If there were errors on the buffer, and the buffer is not
1131                 * to be written, free the buffer. There is no point in caching
 1132                 * an invalid buffer.
1133                 */
1134                if ((b->read_error || b->write_error) &&
1135                    !test_bit(B_READING, &b->state) &&
1136                    !test_bit(B_WRITING, &b->state) &&
1137                    !test_bit(B_DIRTY, &b->state)) {
1138                        __unlink_buffer(b);
1139                        __free_buffer_wake(b);
1140                }
1141        }
1142
1143        dm_bufio_unlock(c);
1144}
1145EXPORT_SYMBOL_GPL(dm_bufio_release);
1146
1147void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
1148{
1149        struct dm_bufio_client *c = b->c;
1150
1151        dm_bufio_lock(c);
1152
1153        BUG_ON(test_bit(B_READING, &b->state));
1154
1155        if (!test_and_set_bit(B_DIRTY, &b->state))
1156                __relink_lru(b, LIST_DIRTY);
1157
1158        dm_bufio_unlock(c);
1159}
1160EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
1161
1162void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
1163{
1164        LIST_HEAD(write_list);
1165
1166        BUG_ON(dm_bufio_in_request());
1167
1168        dm_bufio_lock(c);
1169        __write_dirty_buffers_async(c, 0, &write_list);
1170        dm_bufio_unlock(c);
1171        __flush_write_list(&write_list);
1172}
1173EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
1174
1175/*
1176 * For performance, it is essential that the buffers are written asynchronously
1177 * and simultaneously (so that the block layer can merge the writes) and then
1178 * waited upon.
1179 *
1180 * Finally, we flush hardware disk cache.
1181 */
1182int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
1183{
1184        int a, f;
1185        unsigned long buffers_processed = 0;
1186        struct dm_buffer *b, *tmp;
1187
1188        LIST_HEAD(write_list);
1189
1190        dm_bufio_lock(c);
1191        __write_dirty_buffers_async(c, 0, &write_list);
1192        dm_bufio_unlock(c);
1193        __flush_write_list(&write_list);
1194        dm_bufio_lock(c);
1195
1196again:
1197        list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
1198                int dropped_lock = 0;
1199
1200                if (buffers_processed < c->n_buffers[LIST_DIRTY])
1201                        buffers_processed++;
1202
1203                BUG_ON(test_bit(B_READING, &b->state));
1204
1205                if (test_bit(B_WRITING, &b->state)) {
1206                        if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
1207                                dropped_lock = 1;
1208                                b->hold_count++;
1209                                dm_bufio_unlock(c);
1210                                wait_on_bit(&b->state, B_WRITING,
1211                                            do_io_schedule,
1212                                            TASK_UNINTERRUPTIBLE);
1213                                dm_bufio_lock(c);
1214                                b->hold_count--;
1215                        } else
1216                                wait_on_bit(&b->state, B_WRITING,
1217                                            do_io_schedule,
1218                                            TASK_UNINTERRUPTIBLE);
1219                }
1220
1221                if (!test_bit(B_DIRTY, &b->state) &&
1222                    !test_bit(B_WRITING, &b->state))
1223                        __relink_lru(b, LIST_CLEAN);
1224
1225                dm_bufio_cond_resched();
1226
1227                /*
1228                 * If we dropped the lock, the list is no longer consistent,
1229                 * so we must restart the search.
1230                 *
1231                 * In the most common case, the buffer just processed is
1232                 * relinked to the clean list, so we won't loop scanning the
1233                 * same buffer again and again.
1234                 *
1235                 * This may livelock if there is another thread simultaneously
1236                 * dirtying buffers, so we count the number of buffers walked
1237                 * and if it exceeds the total number of buffers, it means that
1238                 * someone is doing some writes simultaneously with us.  In
1239                 * this case, stop, dropping the lock.
1240                 */
1241                if (dropped_lock)
1242                        goto again;
1243        }
1244        wake_up(&c->free_buffer_wait);
1245        dm_bufio_unlock(c);
1246
1247        a = xchg(&c->async_write_error, 0);
1248        f = dm_bufio_issue_flush(c);
1249        if (a)
1250                return a;
1251
1252        return f;
1253}
1254EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
1255
1256/*
 1257 * Use dm-io to send an empty flush request (barrier) to the device.
1258 */
1259int dm_bufio_issue_flush(struct dm_bufio_client *c)
1260{
1261        struct dm_io_request io_req = {
1262                .bi_rw = WRITE_FLUSH,
1263                .mem.type = DM_IO_KMEM,
1264                .mem.ptr.addr = NULL,
1265                .client = c->dm_io,
1266        };
1267        struct dm_io_region io_reg = {
1268                .bdev = c->bdev,
1269                .sector = 0,
1270                .count = 0,
1271        };
1272
1273        BUG_ON(dm_bufio_in_request());
1274
1275        return dm_io(&io_req, 1, &io_reg, NULL);
1276}
1277EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
1278
1279/*
1280 * We first delete any other buffer that may be at that new location.
1281 *
1282 * Then, we write the buffer to the original location if it was dirty.
1283 *
1284 * Then, if we are the only one who is holding the buffer, relink the buffer
1285 * in the hash queue for the new location.
1286 *
1287 * If there was someone else holding the buffer, we write it to the new
1288 * location but not relink it, because that other user needs to have the buffer
1289 * at the same place.
1290 */
1291void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
1292{
1293        struct dm_bufio_client *c = b->c;
1294        struct dm_buffer *new;
1295
1296        BUG_ON(dm_bufio_in_request());
1297
1298        dm_bufio_lock(c);
1299
1300retry:
1301        new = __find(c, new_block);
1302        if (new) {
1303                if (new->hold_count) {
1304                        __wait_for_free_buffer(c);
1305                        goto retry;
1306                }
1307
1308                /*
1309                 * FIXME: Is there any point waiting for a write that's going
1310                 * to be overwritten in a bit?
1311                 */
1312                __make_buffer_clean(new);
1313                __unlink_buffer(new);
1314                __free_buffer_wake(new);
1315        }
1316
1317        BUG_ON(!b->hold_count);
1318        BUG_ON(test_bit(B_READING, &b->state));
1319
1320        __write_dirty_buffer(b, NULL);
1321        if (b->hold_count == 1) {
1322                wait_on_bit(&b->state, B_WRITING,
1323                            do_io_schedule, TASK_UNINTERRUPTIBLE);
1324                set_bit(B_DIRTY, &b->state);
1325                __unlink_buffer(b);
1326                __link_buffer(b, new_block, LIST_DIRTY);
1327        } else {
1328                sector_t old_block;
1329                wait_on_bit_lock(&b->state, B_WRITING,
1330                                 do_io_schedule, TASK_UNINTERRUPTIBLE);
1331                /*
1332                 * Relink buffer to "new_block" so that write_callback
1333                 * sees "new_block" as a block number.
1334                 * After the write, link the buffer back to old_block.
1335                 * All this must be done in bufio lock, so that block number
1336                 * change isn't visible to other threads.
1337                 */
1338                old_block = b->block;
1339                __unlink_buffer(b);
1340                __link_buffer(b, new_block, b->list_mode);
1341                submit_io(b, WRITE, new_block, write_endio);
1342                wait_on_bit(&b->state, B_WRITING,
1343                            do_io_schedule, TASK_UNINTERRUPTIBLE);
1344                __unlink_buffer(b);
1345                __link_buffer(b, old_block, b->list_mode);
1346        }
1347
1348        dm_bufio_unlock(c);
1349        dm_bufio_release(b);
1350}
1351EXPORT_SYMBOL_GPL(dm_bufio_release_move);
1352
1353unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
1354{
1355        return c->block_size;
1356}
1357EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
1358
1359sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
1360{
1361        return i_size_read(c->bdev->bd_inode) >>
1362                           (SECTOR_SHIFT + c->sectors_per_block_bits);
1363}
1364EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
1365
1366sector_t dm_bufio_get_block_number(struct dm_buffer *b)
1367{
1368        return b->block;
1369}
1370EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
1371
1372void *dm_bufio_get_block_data(struct dm_buffer *b)
1373{
1374        return b->data;
1375}
1376EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
1377
1378void *dm_bufio_get_aux_data(struct dm_buffer *b)
1379{
1380        return b + 1;
1381}
1382EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
1383
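/*
 * Illustrative sketch, not part of this driver: "struct example_aux" is a
 * hypothetical per-buffer structure.  It is only valid if the client was
 * created with aux_size == sizeof(struct example_aux), so that the space
 * reserved after struct dm_buffer is large enough.
 */
struct example_aux {
        unsigned validated;
};

static void __maybe_unused dm_bufio_example_set_aux(struct dm_buffer *b)
{
        struct example_aux *aux = dm_bufio_get_aux_data(b);

        aux->validated = 1;     /* hypothetical per-buffer flag */
}
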
1384struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
1385{
1386        return b->c;
1387}
1388EXPORT_SYMBOL_GPL(dm_bufio_get_client);
1389
1390static void drop_buffers(struct dm_bufio_client *c)
1391{
1392        struct dm_buffer *b;
1393        int i;
1394
1395        BUG_ON(dm_bufio_in_request());
1396
1397        /*
1398         * An optimization so that the buffers are not written one-by-one.
1399         */
1400        dm_bufio_write_dirty_buffers_async(c);
1401
1402        dm_bufio_lock(c);
1403
1404        while ((b = __get_unclaimed_buffer(c)))
1405                __free_buffer_wake(b);
1406
1407        for (i = 0; i < LIST_SIZE; i++)
1408                list_for_each_entry(b, &c->lru[i], lru_list)
1409                        DMERR("leaked buffer %llx, hold count %u, list %d",
1410                              (unsigned long long)b->block, b->hold_count, i);
1411
1412        for (i = 0; i < LIST_SIZE; i++)
1413                BUG_ON(!list_empty(&c->lru[i]));
1414
1415        dm_bufio_unlock(c);
1416}
1417
1418/*
 1419 * Test if the buffer is unused and too old, and if so clean and free it.
 1420 * If __GFP_IO is not set, we must not do any I/O because we hold
 1421 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted
 1422 * to a different bufio client.
1423 */
1424static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
1425                                unsigned long max_jiffies)
1426{
1427        if (jiffies - b->last_accessed < max_jiffies)
1428                return 0;
1429
1430        if (!(gfp & __GFP_IO)) {
1431                if (test_bit(B_READING, &b->state) ||
1432                    test_bit(B_WRITING, &b->state) ||
1433                    test_bit(B_DIRTY, &b->state))
1434                        return 0;
1435        }
1436
1437        if (b->hold_count)
1438                return 0;
1439
1440        __make_buffer_clean(b);
1441        __unlink_buffer(b);
1442        __free_buffer_wake(b);
1443
1444        return 1;
1445}
1446
1447static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
1448                   gfp_t gfp_mask)
1449{
1450        int l;
1451        struct dm_buffer *b, *tmp;
1452        long freed = 0;
1453
1454        for (l = 0; l < LIST_SIZE; l++) {
1455                list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
1456                        freed += __cleanup_old_buffer(b, gfp_mask, 0);
1457                        if (!--nr_to_scan)
1458                                break;
1459                }
1460                dm_bufio_cond_resched();
1461        }
1462        return freed;
1463}
1464
1465static unsigned long
1466dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
1467{
1468        struct dm_bufio_client *c;
1469        unsigned long freed;
1470
1471        c = container_of(shrink, struct dm_bufio_client, shrinker);
1472        if (sc->gfp_mask & __GFP_IO)
1473                dm_bufio_lock(c);
1474        else if (!dm_bufio_trylock(c))
1475                return SHRINK_STOP;
1476
1477        freed  = __scan(c, sc->nr_to_scan, sc->gfp_mask);
1478        dm_bufio_unlock(c);
1479        return freed;
1480}
1481
1482static unsigned long
1483dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
1484{
1485        struct dm_bufio_client *c;
1486        unsigned long count;
1487
1488        c = container_of(shrink, struct dm_bufio_client, shrinker);
1489        if (sc->gfp_mask & __GFP_IO)
1490                dm_bufio_lock(c);
1491        else if (!dm_bufio_trylock(c))
1492                return 0;
1493
1494        count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
1495        dm_bufio_unlock(c);
1496        return count;
1497}
1498
1499/*
1500 * Create the buffering interface
1501 */
1502struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
1503                                               unsigned reserved_buffers, unsigned aux_size,
1504                                               void (*alloc_callback)(struct dm_buffer *),
1505                                               void (*write_callback)(struct dm_buffer *))
1506{
1507        int r;
1508        struct dm_bufio_client *c;
1509        unsigned i;
1510
1511        BUG_ON(block_size < 1 << SECTOR_SHIFT ||
1512               (block_size & (block_size - 1)));
1513
1514        c = kmalloc(sizeof(*c), GFP_KERNEL);
1515        if (!c) {
1516                r = -ENOMEM;
1517                goto bad_client;
1518        }
1519        c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
1520        if (!c->cache_hash) {
1521                r = -ENOMEM;
1522                goto bad_hash;
1523        }
1524
1525        c->bdev = bdev;
1526        c->block_size = block_size;
1527        c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
1528        c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
1529                                  ffs(block_size) - 1 - PAGE_SHIFT : 0;
1530        c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
1531                                  PAGE_SHIFT - (ffs(block_size) - 1) : 0);
1532
1533        c->aux_size = aux_size;
1534        c->alloc_callback = alloc_callback;
1535        c->write_callback = write_callback;
1536
1537        for (i = 0; i < LIST_SIZE; i++) {
1538                INIT_LIST_HEAD(&c->lru[i]);
1539                c->n_buffers[i] = 0;
1540        }
1541
1542        for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
1543                INIT_HLIST_HEAD(&c->cache_hash[i]);
1544
1545        mutex_init(&c->lock);
1546        INIT_LIST_HEAD(&c->reserved_buffers);
1547        c->need_reserved_buffers = reserved_buffers;
1548
1549        init_waitqueue_head(&c->free_buffer_wait);
1550        c->async_write_error = 0;
1551
1552        c->dm_io = dm_io_client_create();
1553        if (IS_ERR(c->dm_io)) {
1554                r = PTR_ERR(c->dm_io);
1555                goto bad_dm_io;
1556        }
1557
1558        mutex_lock(&dm_bufio_clients_lock);
1559        if (c->blocks_per_page_bits) {
1560                if (!DM_BUFIO_CACHE_NAME(c)) {
1561                        DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
1562                        if (!DM_BUFIO_CACHE_NAME(c)) {
1563                                r = -ENOMEM;
1564                                mutex_unlock(&dm_bufio_clients_lock);
1565                                goto bad_cache;
1566                        }
1567                }
1568
1569                if (!DM_BUFIO_CACHE(c)) {
1570                        DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
1571                                                              c->block_size,
1572                                                              c->block_size, 0, NULL);
1573                        if (!DM_BUFIO_CACHE(c)) {
1574                                r = -ENOMEM;
1575                                mutex_unlock(&dm_bufio_clients_lock);
1576                                goto bad_cache;
1577                        }
1578                }
1579        }
1580        mutex_unlock(&dm_bufio_clients_lock);
1581
1582        while (c->need_reserved_buffers) {
1583                struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
1584
1585                if (!b) {
1586                        r = -ENOMEM;
1587                        goto bad_buffer;
1588                }
1589                __free_buffer_wake(b);
1590        }
1591
1592        mutex_lock(&dm_bufio_clients_lock);
1593        dm_bufio_client_count++;
1594        list_add(&c->client_list, &dm_bufio_all_clients);
1595        __cache_size_refresh();
1596        mutex_unlock(&dm_bufio_clients_lock);
1597
1598        c->shrinker.count_objects = dm_bufio_shrink_count;
1599        c->shrinker.scan_objects = dm_bufio_shrink_scan;
1600        c->shrinker.seeks = 1;
1601        c->shrinker.batch = 0;
1602        register_shrinker(&c->shrinker);
1603
1604        return c;
1605
1606bad_buffer:
1607bad_cache:
1608        while (!list_empty(&c->reserved_buffers)) {
1609                struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1610                                                 struct dm_buffer, lru_list);
1611                list_del(&b->lru_list);
1612                free_buffer(b);
1613        }
1614        dm_io_client_destroy(c->dm_io);
1615bad_dm_io:
1616        vfree(c->cache_hash);
1617bad_hash:
1618        kfree(c);
1619bad_client:
1620        return ERR_PTR(r);
1621}
1622EXPORT_SYMBOL_GPL(dm_bufio_client_create);
1623
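/*
 * Illustrative sketch, not part of this driver: creating a client for a
 * hypothetical target.  The 4096-byte block size, the single reserved buffer
 * and the absent callbacks are assumptions for the example; the matching
 * teardown is dm_bufio_client_destroy().
 */
static int __maybe_unused dm_bufio_example_open(struct block_device *bdev,
                                                struct dm_bufio_client **result)
{
        struct dm_bufio_client *c;

        /* one reserved buffer, no per-buffer aux data, no callbacks */
        c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL);
        if (IS_ERR(c))
                return PTR_ERR(c);

        *result = c;
        return 0;
}
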
1624/*
1625 * Free the buffering interface.
1626 * It is required that there are no references on any buffers.
1627 */
1628void dm_bufio_client_destroy(struct dm_bufio_client *c)
1629{
1630        unsigned i;
1631
1632        drop_buffers(c);
1633
1634        unregister_shrinker(&c->shrinker);
1635
1636        mutex_lock(&dm_bufio_clients_lock);
1637
1638        list_del(&c->client_list);
1639        dm_bufio_client_count--;
1640        __cache_size_refresh();
1641
1642        mutex_unlock(&dm_bufio_clients_lock);
1643
1644        for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
1645                BUG_ON(!hlist_empty(&c->cache_hash[i]));
1646
1647        BUG_ON(c->need_reserved_buffers);
1648
1649        while (!list_empty(&c->reserved_buffers)) {
1650                struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1651                                                 struct dm_buffer, lru_list);
1652                list_del(&b->lru_list);
1653                free_buffer(b);
1654        }
1655
1656        for (i = 0; i < LIST_SIZE; i++)
1657                if (c->n_buffers[i])
1658                        DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);
1659
1660        for (i = 0; i < LIST_SIZE; i++)
1661                BUG_ON(c->n_buffers[i]);
1662
1663        dm_io_client_destroy(c->dm_io);
1664        vfree(c->cache_hash);
1665        kfree(c);
1666}
1667EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
1668
1669static void cleanup_old_buffers(void)
1670{
1671        unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age);
1672        struct dm_bufio_client *c;
1673
1674        if (max_age > ULONG_MAX / HZ)
1675                max_age = ULONG_MAX / HZ;
1676
1677        mutex_lock(&dm_bufio_clients_lock);
1678        list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
1679                if (!dm_bufio_trylock(c))
1680                        continue;
1681
1682                while (!list_empty(&c->lru[LIST_CLEAN])) {
1683                        struct dm_buffer *b;
1684                        b = list_entry(c->lru[LIST_CLEAN].prev,
1685                                       struct dm_buffer, lru_list);
1686                        if (!__cleanup_old_buffer(b, 0, max_age * HZ))
1687                                break;
1688                        dm_bufio_cond_resched();
1689                }
1690
1691                dm_bufio_unlock(c);
1692                dm_bufio_cond_resched();
1693        }
1694        mutex_unlock(&dm_bufio_clients_lock);
1695}
1696
1697static struct workqueue_struct *dm_bufio_wq;
1698static struct delayed_work dm_bufio_work;
1699
1700static void work_fn(struct work_struct *w)
1701{
1702        cleanup_old_buffers();
1703
1704        queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
1705                           DM_BUFIO_WORK_TIMER_SECS * HZ);
1706}
1707
1708/*----------------------------------------------------------------
1709 * Module setup
1710 *--------------------------------------------------------------*/
1711
1712/*
1713 * This is called only once for the whole dm_bufio module.
 1714 * It initializes the memory limit.
1715 */
1716static int __init dm_bufio_init(void)
1717{
1718        __u64 mem;
1719
1720        memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
1721        memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
1722
1723        mem = (__u64)((totalram_pages - totalhigh_pages) *
1724                      DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;
1725
1726        if (mem > ULONG_MAX)
1727                mem = ULONG_MAX;
1728
1729#ifdef CONFIG_MMU
1730        /*
1731         * Get the size of vmalloc space the same way as VMALLOC_TOTAL
1732         * in fs/proc/internal.h
1733         */
1734        if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
1735                mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
1736#endif
1737
1738        dm_bufio_default_cache_size = mem;
1739
1740        mutex_lock(&dm_bufio_clients_lock);
1741        __cache_size_refresh();
1742        mutex_unlock(&dm_bufio_clients_lock);
1743
1744        dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
1745        if (!dm_bufio_wq)
1746                return -ENOMEM;
1747
1748        INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
1749        queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
1750                           DM_BUFIO_WORK_TIMER_SECS * HZ);
1751
1752        return 0;
1753}
1754
1755/*
1756 * This is called once when unloading the dm_bufio module.
1757 */
1758static void __exit dm_bufio_exit(void)
1759{
1760        int bug = 0;
1761        int i;
1762
1763        cancel_delayed_work_sync(&dm_bufio_work);
1764        destroy_workqueue(dm_bufio_wq);
1765
1766        for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
1767                struct kmem_cache *kc = dm_bufio_caches[i];
1768
1769                if (kc)
1770                        kmem_cache_destroy(kc);
1771        }
1772
1773        for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
1774                kfree(dm_bufio_cache_names[i]);
1775
1776        if (dm_bufio_client_count) {
1777                DMCRIT("%s: dm_bufio_client_count leaked: %d",
1778                        __func__, dm_bufio_client_count);
1779                bug = 1;
1780        }
1781
1782        if (dm_bufio_current_allocated) {
1783                DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
1784                        __func__, dm_bufio_current_allocated);
1785                bug = 1;
1786        }
1787
1788        if (dm_bufio_allocated_get_free_pages) {
1789                DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
1790                       __func__, dm_bufio_allocated_get_free_pages);
1791                bug = 1;
1792        }
1793
1794        if (dm_bufio_allocated_vmalloc) {
1795                DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
1796                       __func__, dm_bufio_allocated_vmalloc);
1797                bug = 1;
1798        }
1799
1800        if (bug)
1801                BUG();
1802}
1803
1804module_init(dm_bufio_init)
1805module_exit(dm_bufio_exit)
1806
1807module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
1808MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
1809
1810module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
1811MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
1812
1813module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
1814MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
1815
1816module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
1817MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
1818
1819module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
1820MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
1821
1822module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
1823MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
1824
1825module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
1826MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
1827
1828MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
1829MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
1830MODULE_LICENSE("GPL");
1831