linux/drivers/md/dm-bufio.c
   1/*
   2 * Copyright (C) 2009-2011 Red Hat, Inc.
   3 *
   4 * Author: Mikulas Patocka <mpatocka@redhat.com>
   5 *
   6 * This file is released under the GPL.
   7 */
   8
   9#include "dm-bufio.h"
  10
  11#include <linux/device-mapper.h>
  12#include <linux/dm-io.h>
  13#include <linux/slab.h>
  14#include <linux/vmalloc.h>
  15#include <linux/shrinker.h>
  16#include <linux/module.h>
  17
  18#define DM_MSG_PREFIX "bufio"
  19
  20/*
  21 * Memory management policy:
  22 *      Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
  23 *      or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
  24 *      Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
  25 *      Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
  26 *      dirty buffers.
  27 */
  28#define DM_BUFIO_MIN_BUFFERS            8
  29
  30#define DM_BUFIO_MEMORY_PERCENT         2
  31#define DM_BUFIO_VMALLOC_PERCENT        25
  32#define DM_BUFIO_WRITEBACK_PERCENT      75
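
/*
 * A worked example of the policy above (illustrative figures only): on a
 * machine with 8 GiB of low memory, the default cache size is 2% of that,
 * roughly 164 MiB (further capped to 25% of vmalloc space, which mainly
 * matters on 32-bit kernels).  Shared between two bufio clients that is
 * about 82 MiB each; with 4 KiB blocks a client may therefore cache about
 * 21,000 buffers, and background writeback starts once its dirty buffers
 * exceed ~75% of that limit (about 15,700 here).  DM_BUFIO_MIN_BUFFERS
 * puts a floor of 8 buffers under the per-client limit.
 */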
  33
  34/*
  35 * Check buffer ages in this interval (seconds)
  36 */
  37#define DM_BUFIO_WORK_TIMER_SECS        10
  38
  39/*
  40 * Free buffers when they are older than this (seconds)
  41 */
  42#define DM_BUFIO_DEFAULT_AGE_SECS       60
  43
  44/*
  45 * The number of bvec entries that are embedded directly in the buffer.
  46 * If the chunk size is larger, dm-io is used to do the io.
  47 */
  48#define DM_BUFIO_INLINE_VECS            16
  49
  50/*
  51 * Buffer hash
  52 */
  53#define DM_BUFIO_HASH_BITS      20
  54#define DM_BUFIO_HASH(block) \
  55        ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
  56         ((1 << DM_BUFIO_HASH_BITS) - 1))
  57
  58/*
  59 * Don't try to use kmem_cache_alloc for blocks larger than this.
  60 * For explanation, see alloc_buffer_data below.
  61 */
  62#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT  (PAGE_SIZE >> 1)
  63#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT   (PAGE_SIZE << (MAX_ORDER - 1))
  64
  65/*
  66 * dm_buffer->list_mode
  67 */
  68#define LIST_CLEAN      0
  69#define LIST_DIRTY      1
  70#define LIST_SIZE       2
  71
  72/*
  73 * Linking of buffers:
  74 *      All buffers are linked to cache_hash with their hash_list field.
  75 *
  76 *      Clean buffers that are not being written (B_WRITING not set)
  77 *      are linked to lru[LIST_CLEAN] with their lru_list field.
  78 *
  79 *      Dirty and clean buffers that are being written are linked to
  80 *      lru[LIST_DIRTY] with their lru_list field. When the write
  81 *      finishes, the buffer cannot be relinked immediately (because we
  82 *      are in an interrupt context and relinking requires process
  83 *      context), so some clean-not-writing buffers can be held on
   84 *      dirty_lru too.  They are later moved to the clean lru in process
   85 *      context.
  86 */
  87struct dm_bufio_client {
  88        struct mutex lock;
  89
  90        struct list_head lru[LIST_SIZE];
  91        unsigned long n_buffers[LIST_SIZE];
  92
  93        struct block_device *bdev;
  94        unsigned block_size;
  95        unsigned char sectors_per_block_bits;
  96        unsigned char pages_per_block_bits;
  97        unsigned char blocks_per_page_bits;
  98        unsigned aux_size;
  99        void (*alloc_callback)(struct dm_buffer *);
 100        void (*write_callback)(struct dm_buffer *);
 101
 102        struct dm_io_client *dm_io;
 103
 104        struct list_head reserved_buffers;
 105        unsigned need_reserved_buffers;
 106
 107        struct hlist_head *cache_hash;
 108        wait_queue_head_t free_buffer_wait;
 109
 110        int async_write_error;
 111
 112        struct list_head client_list;
 113        struct shrinker shrinker;
 114};
 115
 116/*
 117 * Buffer state bits.
 118 */
 119#define B_READING       0
 120#define B_WRITING       1
 121#define B_DIRTY         2
 122
 123/*
 124 * Describes how the block was allocated:
 125 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 126 * See the comment at alloc_buffer_data.
 127 */
 128enum data_mode {
 129        DATA_MODE_SLAB = 0,
 130        DATA_MODE_GET_FREE_PAGES = 1,
 131        DATA_MODE_VMALLOC = 2,
 132        DATA_MODE_LIMIT = 3
 133};
 134
 135struct dm_buffer {
 136        struct hlist_node hash_list;
 137        struct list_head lru_list;
 138        sector_t block;
 139        void *data;
 140        enum data_mode data_mode;
 141        unsigned char list_mode;                /* LIST_* */
 142        unsigned hold_count;
 143        int read_error;
 144        int write_error;
 145        unsigned long state;
 146        unsigned long last_accessed;
 147        struct dm_bufio_client *c;
 148        struct list_head write_list;
 149        struct bio bio;
 150        struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
 151};
 152
 153/*----------------------------------------------------------------*/
 154
 155static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
 156static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];
 157
 158static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
 159{
 160        unsigned ret = c->blocks_per_page_bits - 1;
 161
 162        BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));
 163
 164        return ret;
 165}
 166
 167#define DM_BUFIO_CACHE(c)       (dm_bufio_caches[dm_bufio_cache_index(c)])
 168#define DM_BUFIO_CACHE_NAME(c)  (dm_bufio_cache_names[dm_bufio_cache_index(c)])
 169
 170#define dm_bufio_in_request()   (!!current->bio_list)
 171
 172static void dm_bufio_lock(struct dm_bufio_client *c)
 173{
 174        mutex_lock_nested(&c->lock, dm_bufio_in_request());
 175}
 176
 177static int dm_bufio_trylock(struct dm_bufio_client *c)
 178{
 179        return mutex_trylock(&c->lock);
 180}
 181
 182static void dm_bufio_unlock(struct dm_bufio_client *c)
 183{
 184        mutex_unlock(&c->lock);
 185}
 186
 187/*
 188 * FIXME Move to sched.h?
 189 */
 190#ifdef CONFIG_PREEMPT_VOLUNTARY
 191#  define dm_bufio_cond_resched()               \
 192do {                                            \
 193        if (unlikely(need_resched()))           \
 194                _cond_resched();                \
 195} while (0)
 196#else
 197#  define dm_bufio_cond_resched()                do { } while (0)
 198#endif
 199
 200/*----------------------------------------------------------------*/
 201
 202/*
 203 * Default cache size: available memory divided by the ratio.
 204 */
 205static unsigned long dm_bufio_default_cache_size;
 206
 207/*
 208 * Total cache size set by the user.
 209 */
 210static unsigned long dm_bufio_cache_size;
 211
 212/*
 213 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 214 * at any time.  If it disagrees, the user has changed cache size.
 215 */
 216static unsigned long dm_bufio_cache_size_latch;
 217
 218static DEFINE_SPINLOCK(param_spinlock);
 219
 220/*
 221 * Buffers are freed after this timeout
 222 */
 223static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
 224
 225static unsigned long dm_bufio_peak_allocated;
 226static unsigned long dm_bufio_allocated_kmem_cache;
 227static unsigned long dm_bufio_allocated_get_free_pages;
 228static unsigned long dm_bufio_allocated_vmalloc;
 229static unsigned long dm_bufio_current_allocated;
 230
 231/*----------------------------------------------------------------*/
 232
 233/*
 234 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
 235 */
 236static unsigned long dm_bufio_cache_size_per_client;
 237
 238/*
 239 * The current number of clients.
 240 */
 241static int dm_bufio_client_count;
 242
 243/*
 244 * The list of all clients.
 245 */
 246static LIST_HEAD(dm_bufio_all_clients);
 247
 248/*
 249 * This mutex protects dm_bufio_cache_size_latch,
 250 * dm_bufio_cache_size_per_client and dm_bufio_client_count
 251 */
 252static DEFINE_MUTEX(dm_bufio_clients_lock);
 253
 254/*----------------------------------------------------------------*/
 255
 256static void adjust_total_allocated(enum data_mode data_mode, long diff)
 257{
 258        static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
 259                &dm_bufio_allocated_kmem_cache,
 260                &dm_bufio_allocated_get_free_pages,
 261                &dm_bufio_allocated_vmalloc,
 262        };
 263
 264        spin_lock(&param_spinlock);
 265
 266        *class_ptr[data_mode] += diff;
 267
 268        dm_bufio_current_allocated += diff;
 269
 270        if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
 271                dm_bufio_peak_allocated = dm_bufio_current_allocated;
 272
 273        spin_unlock(&param_spinlock);
 274}
 275
 276/*
 277 * Change the number of clients and recalculate per-client limit.
 278 */
 279static void __cache_size_refresh(void)
 280{
 281        BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
 282        BUG_ON(dm_bufio_client_count < 0);
 283
 284        dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size);
 285
 286        /*
 287         * Use default if set to 0 and report the actual cache size used.
 288         */
 289        if (!dm_bufio_cache_size_latch) {
 290                (void)cmpxchg(&dm_bufio_cache_size, 0,
 291                              dm_bufio_default_cache_size);
 292                dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
 293        }
 294
 295        dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
 296                                         (dm_bufio_client_count ? : 1);
 297}
 298
 299/*
 300 * Allocating buffer data.
 301 *
 302 * Small buffers are allocated with kmem_cache, to use space optimally.
 303 *
 304 * For large buffers, we choose between get_free_pages and vmalloc.
 305 * Each has advantages and disadvantages.
 306 *
 307 * __get_free_pages can randomly fail if the memory is fragmented.
 308 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 309 * as low as 128M) so using it for caching is not appropriate.
 310 *
 311 * If the allocation may fail we use __get_free_pages. Memory fragmentation
 312 * won't have a fatal effect here, but it just causes flushes of some other
 313 * buffers and more I/O will be performed. Don't use __get_free_pages if it
 314 * always fails (i.e. order >= MAX_ORDER).
 315 *
 316 * If the allocation shouldn't fail we use __vmalloc. This is only for the
 317 * initial reserve allocation, so there's no risk of wasting all vmalloc
 318 * space.
 319 */
 320static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
 321                               enum data_mode *data_mode)
 322{
 323        unsigned noio_flag;
 324        void *ptr;
 325
 326        if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
 327                *data_mode = DATA_MODE_SLAB;
 328                return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
 329        }
 330
 331        if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
 332            gfp_mask & __GFP_NORETRY) {
 333                *data_mode = DATA_MODE_GET_FREE_PAGES;
 334                return (void *)__get_free_pages(gfp_mask,
 335                                                c->pages_per_block_bits);
 336        }
 337
 338        *data_mode = DATA_MODE_VMALLOC;
 339
 340        /*
 341         * __vmalloc allocates the data pages and auxiliary structures with
 342         * gfp_flags that were specified, but pagetables are always allocated
 343         * with GFP_KERNEL, no matter what was specified as gfp_mask.
 344         *
 345         * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
 346         * all allocations done by this process (including pagetables) are done
 347         * as if GFP_NOIO was specified.
 348         */
 349
 350        if (gfp_mask & __GFP_NORETRY)
 351                noio_flag = memalloc_noio_save();
 352
 353        ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL);
 354
 355        if (gfp_mask & __GFP_NORETRY)
 356                memalloc_noio_restore(noio_flag);
 357
 358        return ptr;
 359}
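
/*
 * Concretely (illustrative, assuming 4 KiB pages and MAX_ORDER == 11):
 * blocks up to 2 KiB (PAGE_SIZE >> 1) come from a per-block-size
 * kmem_cache; blocks up to 4 MiB (PAGE_SIZE << (MAX_ORDER - 1)) use
 * __get_free_pages, but only for allocations that are allowed to fail
 * (__GFP_NORETRY); everything larger, and the must-not-fail reserve
 * allocations, fall back to __vmalloc.
 */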
 360
 361/*
 362 * Free buffer's data.
 363 */
 364static void free_buffer_data(struct dm_bufio_client *c,
 365                             void *data, enum data_mode data_mode)
 366{
 367        switch (data_mode) {
 368        case DATA_MODE_SLAB:
 369                kmem_cache_free(DM_BUFIO_CACHE(c), data);
 370                break;
 371
 372        case DATA_MODE_GET_FREE_PAGES:
 373                free_pages((unsigned long)data, c->pages_per_block_bits);
 374                break;
 375
 376        case DATA_MODE_VMALLOC:
 377                vfree(data);
 378                break;
 379
 380        default:
 381                DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
 382                       data_mode);
 383                BUG();
 384        }
 385}
 386
 387/*
 388 * Allocate buffer and its data.
 389 */
 390static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
 391{
 392        struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
 393                                      gfp_mask);
 394
 395        if (!b)
 396                return NULL;
 397
 398        b->c = c;
 399
 400        b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
 401        if (!b->data) {
 402                kfree(b);
 403                return NULL;
 404        }
 405
 406        adjust_total_allocated(b->data_mode, (long)c->block_size);
 407
 408        return b;
 409}
 410
 411/*
 412 * Free buffer and its data.
 413 */
 414static void free_buffer(struct dm_buffer *b)
 415{
 416        struct dm_bufio_client *c = b->c;
 417
 418        adjust_total_allocated(b->data_mode, -(long)c->block_size);
 419
 420        free_buffer_data(c, b->data, b->data_mode);
 421        kfree(b);
 422}
 423
 424/*
 425 * Link buffer to the hash list and clean or dirty queue.
 426 */
 427static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
 428{
 429        struct dm_bufio_client *c = b->c;
 430
 431        c->n_buffers[dirty]++;
 432        b->block = block;
 433        b->list_mode = dirty;
 434        list_add(&b->lru_list, &c->lru[dirty]);
 435        hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
 436        b->last_accessed = jiffies;
 437}
 438
 439/*
 440 * Unlink buffer from the hash list and dirty or clean queue.
 441 */
 442static void __unlink_buffer(struct dm_buffer *b)
 443{
 444        struct dm_bufio_client *c = b->c;
 445
 446        BUG_ON(!c->n_buffers[b->list_mode]);
 447
 448        c->n_buffers[b->list_mode]--;
 449        hlist_del(&b->hash_list);
 450        list_del(&b->lru_list);
 451}
 452
 453/*
 454 * Place the buffer to the head of dirty or clean LRU queue.
 455 */
 456static void __relink_lru(struct dm_buffer *b, int dirty)
 457{
 458        struct dm_bufio_client *c = b->c;
 459
 460        BUG_ON(!c->n_buffers[b->list_mode]);
 461
 462        c->n_buffers[b->list_mode]--;
 463        c->n_buffers[dirty]++;
 464        b->list_mode = dirty;
 465        list_move(&b->lru_list, &c->lru[dirty]);
 466}
 467
 468/*----------------------------------------------------------------
 469 * Submit I/O on the buffer.
 470 *
 471 * The bio interface is faster, but it has some problems:
 472 *      the vector list is limited (increasing this limit increases
 473 *      memory-consumption per buffer, so it is not viable);
 474 *
 475 *      the memory must be direct-mapped, not vmalloced;
 476 *
 477 *      the I/O driver can reject requests spuriously if it thinks that
 478 *      the requests are too big for the device or if they cross a
 479 *      controller-defined memory boundary.
 480 *
 481 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 482 * it is not vmalloced, try using the bio interface.
 483 *
 484 * If the buffer is big, if it is vmalloced or if the underlying device
 485 * rejects the bio because it is too large, use dm-io layer to do the I/O.
 486 * The dm-io layer splits the I/O into multiple requests, avoiding the above
 487 * shortcomings.
 488 *--------------------------------------------------------------*/
 489
 490/*
 491 * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
 492 * that the request was handled directly with bio interface.
 493 */
 494static void dmio_complete(unsigned long error, void *context)
 495{
 496        struct dm_buffer *b = context;
 497
 498        b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
 499}
 500
 501static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
 502                     bio_end_io_t *end_io)
 503{
 504        int r;
 505        struct dm_io_request io_req = {
 506                .bi_rw = rw,
 507                .notify.fn = dmio_complete,
 508                .notify.context = b,
 509                .client = b->c->dm_io,
 510        };
 511        struct dm_io_region region = {
 512                .bdev = b->c->bdev,
 513                .sector = block << b->c->sectors_per_block_bits,
 514                .count = b->c->block_size >> SECTOR_SHIFT,
 515        };
 516
 517        if (b->data_mode != DATA_MODE_VMALLOC) {
 518                io_req.mem.type = DM_IO_KMEM;
 519                io_req.mem.ptr.addr = b->data;
 520        } else {
 521                io_req.mem.type = DM_IO_VMA;
 522                io_req.mem.ptr.vma = b->data;
 523        }
 524
 525        b->bio.bi_end_io = end_io;
 526
 527        r = dm_io(&io_req, 1, &region, NULL);
 528        if (r)
 529                end_io(&b->bio, r);
 530}
 531
 532static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
 533                           bio_end_io_t *end_io)
 534{
 535        char *ptr;
 536        int len;
 537
 538        bio_init(&b->bio);
 539        b->bio.bi_io_vec = b->bio_vec;
 540        b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
 541        b->bio.bi_sector = block << b->c->sectors_per_block_bits;
 542        b->bio.bi_bdev = b->c->bdev;
 543        b->bio.bi_end_io = end_io;
 544
 545        /*
 546         * We assume that if len >= PAGE_SIZE, ptr is page-aligned.
 547         * If len < PAGE_SIZE, the buffer doesn't cross a page boundary.
 548         */
 549        ptr = b->data;
 550        len = b->c->block_size;
 551
 552        if (len >= PAGE_SIZE)
 553                BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
 554        else
 555                BUG_ON((unsigned long)ptr & (len - 1));
 556
 557        do {
 558                if (!bio_add_page(&b->bio, virt_to_page(ptr),
 559                                  len < PAGE_SIZE ? len : PAGE_SIZE,
 560                                  virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
 561                        BUG_ON(b->c->block_size <= PAGE_SIZE);
 562                        use_dmio(b, rw, block, end_io);
 563                        return;
 564                }
 565
 566                len -= PAGE_SIZE;
 567                ptr += PAGE_SIZE;
 568        } while (len > 0);
 569
 570        submit_bio(rw, &b->bio);
 571}
 572
 573static void submit_io(struct dm_buffer *b, int rw, sector_t block,
 574                      bio_end_io_t *end_io)
 575{
 576        if (rw == WRITE && b->c->write_callback)
 577                b->c->write_callback(b);
 578
 579        if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
 580            b->data_mode != DATA_MODE_VMALLOC)
 581                use_inline_bio(b, rw, block, end_io);
 582        else
 583                use_dmio(b, rw, block, end_io);
 584}
 585
 586/*----------------------------------------------------------------
 587 * Writing dirty buffers
 588 *--------------------------------------------------------------*/
 589
 590/*
 591 * The endio routine for write.
 592 *
 593 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
 594 * it.
 595 */
 596static void write_endio(struct bio *bio, int error)
 597{
 598        struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
 599
 600        b->write_error = error;
 601        if (unlikely(error)) {
 602                struct dm_bufio_client *c = b->c;
 603                (void)cmpxchg(&c->async_write_error, 0, error);
 604        }
 605
 606        BUG_ON(!test_bit(B_WRITING, &b->state));
 607
 608        smp_mb__before_clear_bit();
 609        clear_bit(B_WRITING, &b->state);
 610        smp_mb__after_clear_bit();
 611
 612        wake_up_bit(&b->state, B_WRITING);
 613}
 614
 615/*
 616 * This function is called when wait_on_bit is actually waiting.
 617 */
 618static int do_io_schedule(void *word)
 619{
 620        io_schedule();
 621
 622        return 0;
 623}
 624
 625/*
 626 * Initiate a write on a dirty buffer, but don't wait for it.
 627 *
 628 * - If the buffer is not dirty, exit.
 629 * - If there is a previous write going on, wait for it to finish (we can't
 630 *   have two writes on the same buffer simultaneously).
 631 * - Submit our write and don't wait on it. We set B_WRITING indicating
 632 *   that there is a write in progress.
 633 */
 634static void __write_dirty_buffer(struct dm_buffer *b,
 635                                 struct list_head *write_list)
 636{
 637        if (!test_bit(B_DIRTY, &b->state))
 638                return;
 639
 640        clear_bit(B_DIRTY, &b->state);
 641        wait_on_bit_lock(&b->state, B_WRITING,
 642                         do_io_schedule, TASK_UNINTERRUPTIBLE);
 643
 644        if (!write_list)
 645                submit_io(b, WRITE, b->block, write_endio);
 646        else
 647                list_add_tail(&b->write_list, write_list);
 648}
 649
 650static void __flush_write_list(struct list_head *write_list)
 651{
 652        struct blk_plug plug;
 653        blk_start_plug(&plug);
 654        while (!list_empty(write_list)) {
 655                struct dm_buffer *b =
 656                        list_entry(write_list->next, struct dm_buffer, write_list);
 657                list_del(&b->write_list);
 658                submit_io(b, WRITE, b->block, write_endio);
 659                dm_bufio_cond_resched();
 660        }
 661        blk_finish_plug(&plug);
 662}
 663
 664/*
 665 * Wait until any activity on the buffer finishes.  Possibly write the
 666 * buffer if it is dirty.  When this function finishes, there is no I/O
 667 * running on the buffer and the buffer is not dirty.
 668 */
 669static void __make_buffer_clean(struct dm_buffer *b)
 670{
 671        BUG_ON(b->hold_count);
 672
 673        if (!b->state)  /* fast case */
 674                return;
 675
 676        wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
 677        __write_dirty_buffer(b, NULL);
 678        wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
 679}
 680
 681/*
 682 * Find some buffer that is not held by anybody, clean it, unlink it and
 683 * return it.
 684 */
 685static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
 686{
 687        struct dm_buffer *b;
 688
 689        list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
 690                BUG_ON(test_bit(B_WRITING, &b->state));
 691                BUG_ON(test_bit(B_DIRTY, &b->state));
 692
 693                if (!b->hold_count) {
 694                        __make_buffer_clean(b);
 695                        __unlink_buffer(b);
 696                        return b;
 697                }
 698                dm_bufio_cond_resched();
 699        }
 700
 701        list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
 702                BUG_ON(test_bit(B_READING, &b->state));
 703
 704                if (!b->hold_count) {
 705                        __make_buffer_clean(b);
 706                        __unlink_buffer(b);
 707                        return b;
 708                }
 709                dm_bufio_cond_resched();
 710        }
 711
 712        return NULL;
 713}
 714
 715/*
 716 * Wait until some other thread frees a buffer or releases the hold count on
 717 * some buffer.
 718 *
 719 * This function is entered with c->lock held, drops it and regains it
 720 * before exiting.
 721 */
 722static void __wait_for_free_buffer(struct dm_bufio_client *c)
 723{
 724        DECLARE_WAITQUEUE(wait, current);
 725
 726        add_wait_queue(&c->free_buffer_wait, &wait);
 727        set_task_state(current, TASK_UNINTERRUPTIBLE);
 728        dm_bufio_unlock(c);
 729
 730        io_schedule();
 731
 732        set_task_state(current, TASK_RUNNING);
 733        remove_wait_queue(&c->free_buffer_wait, &wait);
 734
 735        dm_bufio_lock(c);
 736}
 737
 738enum new_flag {
 739        NF_FRESH = 0,
 740        NF_READ = 1,
 741        NF_GET = 2,
 742        NF_PREFETCH = 3
 743};
 744
 745/*
 746 * Allocate a new buffer. If the allocation is not possible, wait until
 747 * some other thread frees a buffer.
 748 *
 749 * May drop the lock and regain it.
 750 */
 751static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
 752{
 753        struct dm_buffer *b;
 754
 755        /*
 756         * dm-bufio is resistant to allocation failures (it just keeps
 757         * one buffer reserved in case all the allocations fail).
 758         * So set flags to not try too hard:
 759         *      GFP_NOIO: don't recurse into the I/O layer
 760         *      __GFP_NORETRY: don't retry and rather return failure
 761         *      __GFP_NOMEMALLOC: don't use emergency reserves
 762         *      __GFP_NOWARN: don't print a warning in case of failure
 763         *
 764         * For debugging, if we set the cache size to 1, no new buffers will
 765         * be allocated.
 766         */
 767        while (1) {
 768                if (dm_bufio_cache_size_latch != 1) {
 769                        b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
 770                        if (b)
 771                                return b;
 772                }
 773
 774                if (nf == NF_PREFETCH)
 775                        return NULL;
 776
 777                if (!list_empty(&c->reserved_buffers)) {
 778                        b = list_entry(c->reserved_buffers.next,
 779                                       struct dm_buffer, lru_list);
 780                        list_del(&b->lru_list);
 781                        c->need_reserved_buffers++;
 782
 783                        return b;
 784                }
 785
 786                b = __get_unclaimed_buffer(c);
 787                if (b)
 788                        return b;
 789
 790                __wait_for_free_buffer(c);
 791        }
 792}
 793
 794static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
 795{
 796        struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
 797
 798        if (!b)
 799                return NULL;
 800
 801        if (c->alloc_callback)
 802                c->alloc_callback(b);
 803
 804        return b;
 805}
 806
 807/*
 808 * Free a buffer and wake other threads waiting for free buffers.
 809 */
 810static void __free_buffer_wake(struct dm_buffer *b)
 811{
 812        struct dm_bufio_client *c = b->c;
 813
 814        if (!c->need_reserved_buffers)
 815                free_buffer(b);
 816        else {
 817                list_add(&b->lru_list, &c->reserved_buffers);
 818                c->need_reserved_buffers--;
 819        }
 820
 821        wake_up(&c->free_buffer_wait);
 822}
 823
 824static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
 825                                        struct list_head *write_list)
 826{
 827        struct dm_buffer *b, *tmp;
 828
 829        list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
 830                BUG_ON(test_bit(B_READING, &b->state));
 831
 832                if (!test_bit(B_DIRTY, &b->state) &&
 833                    !test_bit(B_WRITING, &b->state)) {
 834                        __relink_lru(b, LIST_CLEAN);
 835                        continue;
 836                }
 837
 838                if (no_wait && test_bit(B_WRITING, &b->state))
 839                        return;
 840
 841                __write_dirty_buffer(b, write_list);
 842                dm_bufio_cond_resched();
 843        }
 844}
 845
 846/*
 847 * Get writeback threshold and buffer limit for a given client.
 848 */
 849static void __get_memory_limit(struct dm_bufio_client *c,
 850                               unsigned long *threshold_buffers,
 851                               unsigned long *limit_buffers)
 852{
 853        unsigned long buffers;
 854
 855        if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
 856                mutex_lock(&dm_bufio_clients_lock);
 857                __cache_size_refresh();
 858                mutex_unlock(&dm_bufio_clients_lock);
 859        }
 860
 861        buffers = dm_bufio_cache_size_per_client >>
 862                  (c->sectors_per_block_bits + SECTOR_SHIFT);
 863
 864        if (buffers < DM_BUFIO_MIN_BUFFERS)
 865                buffers = DM_BUFIO_MIN_BUFFERS;
 866
 867        *limit_buffers = buffers;
 868        *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
 869}
 870
 871/*
 872 * Check if we're over the watermark.
 873 * If we're over "limit_buffers", free unclaimed buffers until we're under it.
 874 * If the number of dirty buffers exceeds "threshold_buffers", start writeback.
 875 */
 876static void __check_watermark(struct dm_bufio_client *c,
 877                              struct list_head *write_list)
 878{
 879        unsigned long threshold_buffers, limit_buffers;
 880
 881        __get_memory_limit(c, &threshold_buffers, &limit_buffers);
 882
 883        while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
 884               limit_buffers) {
 885
 886                struct dm_buffer *b = __get_unclaimed_buffer(c);
 887
 888                if (!b)
 889                        return;
 890
 891                __free_buffer_wake(b);
 892                dm_bufio_cond_resched();
 893        }
 894
 895        if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
 896                __write_dirty_buffers_async(c, 1, write_list);
 897}
 898
 899/*
 900 * Find a buffer in the hash.
 901 */
 902static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
 903{
 904        struct dm_buffer *b;
 905
 906        hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)],
 907                             hash_list) {
 908                dm_bufio_cond_resched();
 909                if (b->block == block)
 910                        return b;
 911        }
 912
 913        return NULL;
 914}
 915
 916/*----------------------------------------------------------------
 917 * Getting a buffer
 918 *--------------------------------------------------------------*/
 919
 920static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
 921                                     enum new_flag nf, int *need_submit,
 922                                     struct list_head *write_list)
 923{
 924        struct dm_buffer *b, *new_b = NULL;
 925
 926        *need_submit = 0;
 927
 928        b = __find(c, block);
 929        if (b)
 930                goto found_buffer;
 931
 932        if (nf == NF_GET)
 933                return NULL;
 934
 935        new_b = __alloc_buffer_wait(c, nf);
 936        if (!new_b)
 937                return NULL;
 938
 939        /*
 940         * We've had a period where the mutex was unlocked, so need to
 941         * recheck the hash table.
 942         */
 943        b = __find(c, block);
 944        if (b) {
 945                __free_buffer_wake(new_b);
 946                goto found_buffer;
 947        }
 948
 949        __check_watermark(c, write_list);
 950
 951        b = new_b;
 952        b->hold_count = 1;
 953        b->read_error = 0;
 954        b->write_error = 0;
 955        __link_buffer(b, block, LIST_CLEAN);
 956
 957        if (nf == NF_FRESH) {
 958                b->state = 0;
 959                return b;
 960        }
 961
 962        b->state = 1 << B_READING;
 963        *need_submit = 1;
 964
 965        return b;
 966
 967found_buffer:
 968        if (nf == NF_PREFETCH)
 969                return NULL;
 970        /*
 971         * Note: it is essential that we don't wait for the buffer to be
 972         * read if dm_bufio_get function is used. Both dm_bufio_get and
 973         * dm_bufio_prefetch can be used in the driver request routine.
 974         * If the user called both dm_bufio_prefetch and dm_bufio_get on
 975         * the same buffer, it would deadlock if we waited.
 976         */
 977        if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
 978                return NULL;
 979
 980        b->hold_count++;
 981        __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
 982                     test_bit(B_WRITING, &b->state));
 983        return b;
 984}
 985
 986/*
 987 * The endio routine for reading: set the error, clear the bit and wake up
 988 * anyone waiting on the buffer.
 989 */
 990static void read_endio(struct bio *bio, int error)
 991{
 992        struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
 993
 994        b->read_error = error;
 995
 996        BUG_ON(!test_bit(B_READING, &b->state));
 997
 998        smp_mb__before_clear_bit();
 999        clear_bit(B_READING, &b->state);
1000        smp_mb__after_clear_bit();
1001
1002        wake_up_bit(&b->state, B_READING);
1003}
1004
1005/*
1006 * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
1007 * functions is similar except that dm_bufio_new doesn't read the
1008 * buffer from the disk (assuming that the caller overwrites all the data
1009 * and uses dm_bufio_mark_buffer_dirty to write new data back).
1010 */
1011static void *new_read(struct dm_bufio_client *c, sector_t block,
1012                      enum new_flag nf, struct dm_buffer **bp)
1013{
1014        int need_submit;
1015        struct dm_buffer *b;
1016
1017        LIST_HEAD(write_list);
1018
1019        dm_bufio_lock(c);
1020        b = __bufio_new(c, block, nf, &need_submit, &write_list);
1021        dm_bufio_unlock(c);
1022
1023        __flush_write_list(&write_list);
1024
1025        if (!b)
1026                return b;
1027
1028        if (need_submit)
1029                submit_io(b, READ, b->block, read_endio);
1030
1031        wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
1032
1033        if (b->read_error) {
1034                int error = b->read_error;
1035
1036                dm_bufio_release(b);
1037
1038                return ERR_PTR(error);
1039        }
1040
1041        *bp = b;
1042
1043        return b->data;
1044}
1045
1046void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
1047                   struct dm_buffer **bp)
1048{
1049        return new_read(c, block, NF_GET, bp);
1050}
1051EXPORT_SYMBOL_GPL(dm_bufio_get);
1052
1053void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1054                    struct dm_buffer **bp)
1055{
1056        BUG_ON(dm_bufio_in_request());
1057
1058        return new_read(c, block, NF_READ, bp);
1059}
1060EXPORT_SYMBOL_GPL(dm_bufio_read);
1061
1062void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1063                   struct dm_buffer **bp)
1064{
1065        BUG_ON(dm_bufio_in_request());
1066
1067        return new_read(c, block, NF_FRESH, bp);
1068}
1069EXPORT_SYMBOL_GPL(dm_bufio_new);
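
/*
 * Illustrative usage sketch (not part of this driver): assuming a caller
 * that already created a client "c" with dm_bufio_client_create() below,
 * a read-modify-write of one block might look like this ("block" and the
 * zero fill are made up for the example, error handling abbreviated):
 *
 *	struct dm_buffer *b;
 *	void *data = dm_bufio_read(c, block, &b);
 *
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	memset(data, 0, dm_bufio_get_block_size(c));
 *	dm_bufio_mark_buffer_dirty(b);
 *	dm_bufio_release(b);
 *	return dm_bufio_write_dirty_buffers(c);
 *
 * dm_bufio_get() is the same lookup without I/O (it may return NULL if the
 * block is not already cached), and dm_bufio_new() skips the read because
 * the caller promises to overwrite the whole block.
 */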
1070
1071void dm_bufio_prefetch(struct dm_bufio_client *c,
1072                       sector_t block, unsigned n_blocks)
1073{
1074        struct blk_plug plug;
1075
1076        LIST_HEAD(write_list);
1077
1078        BUG_ON(dm_bufio_in_request());
1079
1080        blk_start_plug(&plug);
1081        dm_bufio_lock(c);
1082
1083        for (; n_blocks--; block++) {
1084                int need_submit;
1085                struct dm_buffer *b;
1086                b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
1087                                &write_list);
1088                if (unlikely(!list_empty(&write_list))) {
1089                        dm_bufio_unlock(c);
1090                        blk_finish_plug(&plug);
1091                        __flush_write_list(&write_list);
1092                        blk_start_plug(&plug);
1093                        dm_bufio_lock(c);
1094                }
1095                if (unlikely(b != NULL)) {
1096                        dm_bufio_unlock(c);
1097
1098                        if (need_submit)
1099                                submit_io(b, READ, b->block, read_endio);
1100                        dm_bufio_release(b);
1101
1102                        dm_bufio_cond_resched();
1103
1104                        if (!n_blocks)
1105                                goto flush_plug;
1106                        dm_bufio_lock(c);
1107                }
1108        }
1109
1110        dm_bufio_unlock(c);
1111
1112flush_plug:
1113        blk_finish_plug(&plug);
1114}
1115EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
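
/*
 * Illustrative only: a caller that knows it will shortly need a run of
 * blocks can overlap the reads with other work (numbers are made up):
 *
 *	dm_bufio_prefetch(c, first_block, 16);
 *	... do something else ...
 *	data = dm_bufio_read(c, first_block, &b);
 *
 * The later dm_bufio_read() then typically finds the buffer cached, or
 * finds the read already in flight and only waits for it to complete.
 */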
1116
1117void dm_bufio_release(struct dm_buffer *b)
1118{
1119        struct dm_bufio_client *c = b->c;
1120
1121        dm_bufio_lock(c);
1122
1123        BUG_ON(!b->hold_count);
1124
1125        b->hold_count--;
1126        if (!b->hold_count) {
1127                wake_up(&c->free_buffer_wait);
1128
1129                /*
1130                 * If there were errors on the buffer, and the buffer is not
1131                 * to be written, free the buffer. There is no point in caching
1132                 * an invalid buffer.
1133                 */
1134                if ((b->read_error || b->write_error) &&
1135                    !test_bit(B_READING, &b->state) &&
1136                    !test_bit(B_WRITING, &b->state) &&
1137                    !test_bit(B_DIRTY, &b->state)) {
1138                        __unlink_buffer(b);
1139                        __free_buffer_wake(b);
1140                }
1141        }
1142
1143        dm_bufio_unlock(c);
1144}
1145EXPORT_SYMBOL_GPL(dm_bufio_release);
1146
1147void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
1148{
1149        struct dm_bufio_client *c = b->c;
1150
1151        dm_bufio_lock(c);
1152
1153        BUG_ON(test_bit(B_READING, &b->state));
1154
1155        if (!test_and_set_bit(B_DIRTY, &b->state))
1156                __relink_lru(b, LIST_DIRTY);
1157
1158        dm_bufio_unlock(c);
1159}
1160EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
1161
1162void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
1163{
1164        LIST_HEAD(write_list);
1165
1166        BUG_ON(dm_bufio_in_request());
1167
1168        dm_bufio_lock(c);
1169        __write_dirty_buffers_async(c, 0, &write_list);
1170        dm_bufio_unlock(c);
1171        __flush_write_list(&write_list);
1172}
1173EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
1174
1175/*
1176 * For performance, it is essential that the buffers are written asynchronously
1177 * and simultaneously (so that the block layer can merge the writes) and then
1178 * waited upon.
1179 *
1180 * Finally, we flush hardware disk cache.
1181 */
1182int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
1183{
1184        int a, f;
1185        unsigned long buffers_processed = 0;
1186        struct dm_buffer *b, *tmp;
1187
1188        LIST_HEAD(write_list);
1189
1190        dm_bufio_lock(c);
1191        __write_dirty_buffers_async(c, 0, &write_list);
1192        dm_bufio_unlock(c);
1193        __flush_write_list(&write_list);
1194        dm_bufio_lock(c);
1195
1196again:
1197        list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
1198                int dropped_lock = 0;
1199
1200                if (buffers_processed < c->n_buffers[LIST_DIRTY])
1201                        buffers_processed++;
1202
1203                BUG_ON(test_bit(B_READING, &b->state));
1204
1205                if (test_bit(B_WRITING, &b->state)) {
1206                        if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
1207                                dropped_lock = 1;
1208                                b->hold_count++;
1209                                dm_bufio_unlock(c);
1210                                wait_on_bit(&b->state, B_WRITING,
1211                                            do_io_schedule,
1212                                            TASK_UNINTERRUPTIBLE);
1213                                dm_bufio_lock(c);
1214                                b->hold_count--;
1215                        } else
1216                                wait_on_bit(&b->state, B_WRITING,
1217                                            do_io_schedule,
1218                                            TASK_UNINTERRUPTIBLE);
1219                }
1220
1221                if (!test_bit(B_DIRTY, &b->state) &&
1222                    !test_bit(B_WRITING, &b->state))
1223                        __relink_lru(b, LIST_CLEAN);
1224
1225                dm_bufio_cond_resched();
1226
1227                /*
1228                 * If we dropped the lock, the list is no longer consistent,
1229                 * so we must restart the search.
1230                 *
1231                 * In the most common case, the buffer just processed is
1232                 * relinked to the clean list, so we won't loop scanning the
1233                 * same buffer again and again.
1234                 *
1235                 * This may livelock if there is another thread simultaneously
1236                 * dirtying buffers, so we count the number of buffers walked
1237                 * and if it exceeds the total number of buffers, it means that
1238                 * someone is doing some writes simultaneously with us.  In
1239                 * that case, stop dropping the lock and just wait for the write.
1240                 */
1241                if (dropped_lock)
1242                        goto again;
1243        }
1244        wake_up(&c->free_buffer_wait);
1245        dm_bufio_unlock(c);
1246
1247        a = xchg(&c->async_write_error, 0);
1248        f = dm_bufio_issue_flush(c);
1249        if (a)
1250                return a;
1251
1252        return f;
1253}
1254EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
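
/*
 * Illustrative commit sequence for a hypothetical caller: after dirtying a
 * set of metadata buffers, make them durable before writing a superblock
 * that references them:
 *
 *	int r = dm_bufio_write_dirty_buffers(c);
 *
 *	if (r)
 *		return r;
 *	... now it is safe to write the superblock ...
 *
 * Note that dm_bufio_write_dirty_buffers() already ends with
 * dm_bufio_issue_flush(), so a separate flush is only needed when the
 * caller wants a barrier without writing out dirty buffers.
 */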
1255
1256/*
1257 * Use dm-io to send an empty barrier and flush the device.
1258 */
1259int dm_bufio_issue_flush(struct dm_bufio_client *c)
1260{
1261        struct dm_io_request io_req = {
1262                .bi_rw = WRITE_FLUSH,
1263                .mem.type = DM_IO_KMEM,
1264                .mem.ptr.addr = NULL,
1265                .client = c->dm_io,
1266        };
1267        struct dm_io_region io_reg = {
1268                .bdev = c->bdev,
1269                .sector = 0,
1270                .count = 0,
1271        };
1272
1273        BUG_ON(dm_bufio_in_request());
1274
1275        return dm_io(&io_req, 1, &io_reg, NULL);
1276}
1277EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
1278
1279/*
1280 * We first delete any other buffer that may be at that new location.
1281 *
1282 * Then, we write the buffer to the original location if it was dirty.
1283 *
1284 * Then, if we are the only one who is holding the buffer, relink the buffer
1285 * in the hash queue for the new location.
1286 *
1287 * If there was someone else holding the buffer, we write it to the new
1288 * location but not relink it, because that other user needs to have the buffer
1289 * at the same place.
1290 */
1291void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
1292{
1293        struct dm_bufio_client *c = b->c;
1294        struct dm_buffer *new;
1295
1296        BUG_ON(dm_bufio_in_request());
1297
1298        dm_bufio_lock(c);
1299
1300retry:
1301        new = __find(c, new_block);
1302        if (new) {
1303                if (new->hold_count) {
1304                        __wait_for_free_buffer(c);
1305                        goto retry;
1306                }
1307
1308                /*
1309                 * FIXME: Is there any point waiting for a write that's going
1310                 * to be overwritten in a bit?
1311                 */
1312                __make_buffer_clean(new);
1313                __unlink_buffer(new);
1314                __free_buffer_wake(new);
1315        }
1316
1317        BUG_ON(!b->hold_count);
1318        BUG_ON(test_bit(B_READING, &b->state));
1319
1320        __write_dirty_buffer(b, NULL);
1321        if (b->hold_count == 1) {
1322                wait_on_bit(&b->state, B_WRITING,
1323                            do_io_schedule, TASK_UNINTERRUPTIBLE);
1324                set_bit(B_DIRTY, &b->state);
1325                __unlink_buffer(b);
1326                __link_buffer(b, new_block, LIST_DIRTY);
1327        } else {
1328                sector_t old_block;
1329                wait_on_bit_lock(&b->state, B_WRITING,
1330                                 do_io_schedule, TASK_UNINTERRUPTIBLE);
1331                /*
1332                 * Relink buffer to "new_block" so that write_callback
1333                 * sees "new_block" as a block number.
1334                 * After the write, link the buffer back to old_block.
1335                 * All this must be done in bufio lock, so that block number
1336                 * change isn't visible to other threads.
1337                 */
1338                old_block = b->block;
1339                __unlink_buffer(b);
1340                __link_buffer(b, new_block, b->list_mode);
1341                submit_io(b, WRITE, new_block, write_endio);
1342                wait_on_bit(&b->state, B_WRITING,
1343                            do_io_schedule, TASK_UNINTERRUPTIBLE);
1344                __unlink_buffer(b);
1345                __link_buffer(b, old_block, b->list_mode);
1346        }
1347
1348        dm_bufio_unlock(c);
1349        dm_bufio_release(b);
1350}
1351EXPORT_SYMBOL_GPL(dm_bufio_release_move);
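
/*
 * Illustrative only: a copy-on-write metadata layout can use this to
 * shadow a block it has just modified ("old_block" and "new_block" are
 * made up for the example):
 *
 *	data = dm_bufio_read(c, old_block, &b);
 *	... rewrite the contents for the new location ...
 *	dm_bufio_release_move(b, new_block);
 *
 * The contents end up at new_block and the caller's reference is dropped,
 * so no separate dm_bufio_release() is needed.
 */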
1352
1353unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
1354{
1355        return c->block_size;
1356}
1357EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
1358
1359sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
1360{
1361        return i_size_read(c->bdev->bd_inode) >>
1362                           (SECTOR_SHIFT + c->sectors_per_block_bits);
1363}
1364EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
1365
1366sector_t dm_bufio_get_block_number(struct dm_buffer *b)
1367{
1368        return b->block;
1369}
1370EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
1371
1372void *dm_bufio_get_block_data(struct dm_buffer *b)
1373{
1374        return b->data;
1375}
1376EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
1377
1378void *dm_bufio_get_aux_data(struct dm_buffer *b)
1379{
1380        return b + 1;
1381}
1382EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
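
/*
 * Illustrative only: the aux area is "aux_size" bytes of per-buffer
 * storage placed directly behind struct dm_buffer; dm-bufio itself never
 * touches it.  A client created with aux_size == sizeof(struct my_aux)
 * (a made-up structure) could keep, say, a checksum there:
 *
 *	struct my_aux { __le32 csum; };
 *
 *	struct my_aux *aux = dm_bufio_get_aux_data(b);
 *	aux->csum = ...;
 *
 * The alloc_callback and write_callback hooks passed to
 * dm_bufio_client_create() are the natural places to initialize and
 * refresh such data.
 */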
1383
1384struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
1385{
1386        return b->c;
1387}
1388EXPORT_SYMBOL_GPL(dm_bufio_get_client);
1389
1390static void drop_buffers(struct dm_bufio_client *c)
1391{
1392        struct dm_buffer *b;
1393        int i;
1394
1395        BUG_ON(dm_bufio_in_request());
1396
1397        /*
1398         * An optimization so that the buffers are not written one-by-one.
1399         */
1400        dm_bufio_write_dirty_buffers_async(c);
1401
1402        dm_bufio_lock(c);
1403
1404        while ((b = __get_unclaimed_buffer(c)))
1405                __free_buffer_wake(b);
1406
1407        for (i = 0; i < LIST_SIZE; i++)
1408                list_for_each_entry(b, &c->lru[i], lru_list)
1409                        DMERR("leaked buffer %llx, hold count %u, list %d",
1410                              (unsigned long long)b->block, b->hold_count, i);
1411
1412        for (i = 0; i < LIST_SIZE; i++)
1413                BUG_ON(!list_empty(&c->lru[i]));
1414
1415        dm_bufio_unlock(c);
1416}
1417
1418/*
1419 * Test if the buffer is unused and too old, and reclaim it.
1420 * If __GFP_IO is not set, we must not do any I/O because we hold
1421 * dm_bufio_clients_lock and we would risk deadlock if the I/O got rerouted to
1422 * a different bufio client.
1423 */
1424static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
1425                                unsigned long max_jiffies)
1426{
1427        if (jiffies - b->last_accessed < max_jiffies)
1428                return 1;
1429
1430        if (!(gfp & __GFP_IO)) {
1431                if (test_bit(B_READING, &b->state) ||
1432                    test_bit(B_WRITING, &b->state) ||
1433                    test_bit(B_DIRTY, &b->state))
1434                        return 1;
1435        }
1436
1437        if (b->hold_count)
1438                return 1;
1439
1440        __make_buffer_clean(b);
1441        __unlink_buffer(b);
1442        __free_buffer_wake(b);
1443
1444        return 0;
1445}
1446
1447static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
1448                   struct shrink_control *sc)
1449{
1450        int l;
1451        struct dm_buffer *b, *tmp;
1452
1453        for (l = 0; l < LIST_SIZE; l++) {
1454                list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list)
1455                        if (!__cleanup_old_buffer(b, sc->gfp_mask, 0) &&
1456                            !--nr_to_scan)
1457                                return;
1458                dm_bufio_cond_resched();
1459        }
1460}
1461
1462static int shrink(struct shrinker *shrinker, struct shrink_control *sc)
1463{
1464        struct dm_bufio_client *c =
1465            container_of(shrinker, struct dm_bufio_client, shrinker);
1466        unsigned long r;
1467        unsigned long nr_to_scan = sc->nr_to_scan;
1468
1469        if (sc->gfp_mask & __GFP_IO)
1470                dm_bufio_lock(c);
1471        else if (!dm_bufio_trylock(c))
1472                return !nr_to_scan ? 0 : -1;
1473
1474        if (nr_to_scan)
1475                __scan(c, nr_to_scan, sc);
1476
1477        r = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
1478        if (r > INT_MAX)
1479                r = INT_MAX;
1480
1481        dm_bufio_unlock(c);
1482
1483        return r;
1484}
1485
1486/*
1487 * Create the buffering interface
1488 */
1489struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
1490                                               unsigned reserved_buffers, unsigned aux_size,
1491                                               void (*alloc_callback)(struct dm_buffer *),
1492                                               void (*write_callback)(struct dm_buffer *))
1493{
1494        int r;
1495        struct dm_bufio_client *c;
1496        unsigned i;
1497
1498        BUG_ON(block_size < 1 << SECTOR_SHIFT ||
1499               (block_size & (block_size - 1)));
1500
1501        c = kmalloc(sizeof(*c), GFP_KERNEL);
1502        if (!c) {
1503                r = -ENOMEM;
1504                goto bad_client;
1505        }
1506        c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
1507        if (!c->cache_hash) {
1508                r = -ENOMEM;
1509                goto bad_hash;
1510        }
1511
1512        c->bdev = bdev;
1513        c->block_size = block_size;
1514        c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
1515        c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
1516                                  ffs(block_size) - 1 - PAGE_SHIFT : 0;
1517        c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
1518                                  PAGE_SHIFT - (ffs(block_size) - 1) : 0);
1519
1520        c->aux_size = aux_size;
1521        c->alloc_callback = alloc_callback;
1522        c->write_callback = write_callback;
1523
1524        for (i = 0; i < LIST_SIZE; i++) {
1525                INIT_LIST_HEAD(&c->lru[i]);
1526                c->n_buffers[i] = 0;
1527        }
1528
1529        for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
1530                INIT_HLIST_HEAD(&c->cache_hash[i]);
1531
1532        mutex_init(&c->lock);
1533        INIT_LIST_HEAD(&c->reserved_buffers);
1534        c->need_reserved_buffers = reserved_buffers;
1535
1536        init_waitqueue_head(&c->free_buffer_wait);
1537        c->async_write_error = 0;
1538
1539        c->dm_io = dm_io_client_create();
1540        if (IS_ERR(c->dm_io)) {
1541                r = PTR_ERR(c->dm_io);
1542                goto bad_dm_io;
1543        }
1544
1545        mutex_lock(&dm_bufio_clients_lock);
1546        if (c->blocks_per_page_bits) {
1547                if (!DM_BUFIO_CACHE_NAME(c)) {
1548                        DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
1549                        if (!DM_BUFIO_CACHE_NAME(c)) {
1550                                r = -ENOMEM;
1551                                mutex_unlock(&dm_bufio_clients_lock);
1552                                goto bad_cache;
1553                        }
1554                }
1555
1556                if (!DM_BUFIO_CACHE(c)) {
1557                        DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
1558                                                              c->block_size,
1559                                                              c->block_size, 0, NULL);
1560                        if (!DM_BUFIO_CACHE(c)) {
1561                                r = -ENOMEM;
1562                                mutex_unlock(&dm_bufio_clients_lock);
1563                                goto bad_cache;
1564                        }
1565                }
1566        }
1567        mutex_unlock(&dm_bufio_clients_lock);
1568
1569        while (c->need_reserved_buffers) {
1570                struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
1571
1572                if (!b) {
1573                        r = -ENOMEM;
1574                        goto bad_buffer;
1575                }
1576                __free_buffer_wake(b);
1577        }
1578
1579        mutex_lock(&dm_bufio_clients_lock);
1580        dm_bufio_client_count++;
1581        list_add(&c->client_list, &dm_bufio_all_clients);
1582        __cache_size_refresh();
1583        mutex_unlock(&dm_bufio_clients_lock);
1584
1585        c->shrinker.shrink = shrink;
1586        c->shrinker.seeks = 1;
1587        c->shrinker.batch = 0;
1588        register_shrinker(&c->shrinker);
1589
1590        return c;
1591
1592bad_buffer:
1593bad_cache:
1594        while (!list_empty(&c->reserved_buffers)) {
1595                struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1596                                                 struct dm_buffer, lru_list);
1597                list_del(&b->lru_list);
1598                free_buffer(b);
1599        }
1600        dm_io_client_destroy(c->dm_io);
1601bad_dm_io:
1602        vfree(c->cache_hash);
1603bad_hash:
1604        kfree(c);
1605bad_client:
1606        return ERR_PTR(r);
1607}
1608EXPORT_SYMBOL_GPL(dm_bufio_client_create);
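
/*
 * Illustrative only: a target constructor might set up a client for 4 KiB
 * metadata blocks with one reserved buffer and no aux data ("md_bdev" is a
 * made-up block device pointer):
 *
 *	struct dm_bufio_client *c;
 *
 *	c = dm_bufio_client_create(md_bdev, 4096, 1, 0, NULL, NULL);
 *	if (IS_ERR(c))
 *		return PTR_ERR(c);
 *	...
 *	dm_bufio_client_destroy(c);
 *
 * The block size must be a power of two and at least 512 bytes; the
 * reserved buffers are what guarantee forward progress when normal
 * allocations fail under memory pressure.
 */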
1609
1610/*
1611 * Free the buffering interface.
1612 * It is required that there are no references on any buffers.
1613 */
1614void dm_bufio_client_destroy(struct dm_bufio_client *c)
1615{
1616        unsigned i;
1617
1618        drop_buffers(c);
1619
1620        unregister_shrinker(&c->shrinker);
1621
1622        mutex_lock(&dm_bufio_clients_lock);
1623
1624        list_del(&c->client_list);
1625        dm_bufio_client_count--;
1626        __cache_size_refresh();
1627
1628        mutex_unlock(&dm_bufio_clients_lock);
1629
1630        for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
1631                BUG_ON(!hlist_empty(&c->cache_hash[i]));
1632
1633        BUG_ON(c->need_reserved_buffers);
1634
1635        while (!list_empty(&c->reserved_buffers)) {
1636                struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1637                                                 struct dm_buffer, lru_list);
1638                list_del(&b->lru_list);
1639                free_buffer(b);
1640        }
1641
1642        for (i = 0; i < LIST_SIZE; i++)
1643                if (c->n_buffers[i])
1644                        DMERR("leaked buffer count %d: %lu", i, c->n_buffers[i]);
1645
1646        for (i = 0; i < LIST_SIZE; i++)
1647                BUG_ON(c->n_buffers[i]);
1648
1649        dm_io_client_destroy(c->dm_io);
1650        vfree(c->cache_hash);
1651        kfree(c);
1652}
1653EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
1654
1655static void cleanup_old_buffers(void)
1656{
1657        unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age);
1658        struct dm_bufio_client *c;
1659
1660        if (max_age > ULONG_MAX / HZ)
1661                max_age = ULONG_MAX / HZ;
1662
1663        mutex_lock(&dm_bufio_clients_lock);
1664        list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
1665                if (!dm_bufio_trylock(c))
1666                        continue;
1667
1668                while (!list_empty(&c->lru[LIST_CLEAN])) {
1669                        struct dm_buffer *b;
1670                        b = list_entry(c->lru[LIST_CLEAN].prev,
1671                                       struct dm_buffer, lru_list);
1672                        if (__cleanup_old_buffer(b, 0, max_age * HZ))
1673                                break;
1674                        dm_bufio_cond_resched();
1675                }
1676
1677                dm_bufio_unlock(c);
1678                dm_bufio_cond_resched();
1679        }
1680        mutex_unlock(&dm_bufio_clients_lock);
1681}
1682
1683static struct workqueue_struct *dm_bufio_wq;
1684static struct delayed_work dm_bufio_work;
1685
1686static void work_fn(struct work_struct *w)
1687{
1688        cleanup_old_buffers();
1689
1690        queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
1691                           DM_BUFIO_WORK_TIMER_SECS * HZ);
1692}
1693
1694/*----------------------------------------------------------------
1695 * Module setup
1696 *--------------------------------------------------------------*/
1697
1698/*
1699 * This is called only once for the whole dm_bufio module.
1700 * It initializes the memory limit.
1701 */
1702static int __init dm_bufio_init(void)
1703{
1704        __u64 mem;
1705
1706        memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
1707        memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
1708
1709        mem = (__u64)((totalram_pages - totalhigh_pages) *
1710                      DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;
1711
1712        if (mem > ULONG_MAX)
1713                mem = ULONG_MAX;
1714
1715#ifdef CONFIG_MMU
1716        /*
1717         * Get the size of vmalloc space the same way as VMALLOC_TOTAL
1718         * in fs/proc/internal.h
1719         */
1720        if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
1721                mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
1722#endif
1723
1724        dm_bufio_default_cache_size = mem;
1725
1726        mutex_lock(&dm_bufio_clients_lock);
1727        __cache_size_refresh();
1728        mutex_unlock(&dm_bufio_clients_lock);
1729
1730        dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
1731        if (!dm_bufio_wq)
1732                return -ENOMEM;
1733
1734        INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
1735        queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
1736                           DM_BUFIO_WORK_TIMER_SECS * HZ);
1737
1738        return 0;
1739}
1740
1741/*
1742 * This is called once when unloading the dm_bufio module.
1743 */
1744static void __exit dm_bufio_exit(void)
1745{
1746        int bug = 0;
1747        int i;
1748
1749        cancel_delayed_work_sync(&dm_bufio_work);
1750        destroy_workqueue(dm_bufio_wq);
1751
1752        for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
1753                struct kmem_cache *kc = dm_bufio_caches[i];
1754
1755                if (kc)
1756                        kmem_cache_destroy(kc);
1757        }
1758
1759        for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
1760                kfree(dm_bufio_cache_names[i]);
1761
1762        if (dm_bufio_client_count) {
1763                DMCRIT("%s: dm_bufio_client_count leaked: %d",
1764                        __func__, dm_bufio_client_count);
1765                bug = 1;
1766        }
1767
1768        if (dm_bufio_current_allocated) {
1769                DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
1770                        __func__, dm_bufio_current_allocated);
1771                bug = 1;
1772        }
1773
1774        if (dm_bufio_allocated_get_free_pages) {
1775                DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
1776                       __func__, dm_bufio_allocated_get_free_pages);
1777                bug = 1;
1778        }
1779
1780        if (dm_bufio_allocated_vmalloc) {
1781                DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
1782                       __func__, dm_bufio_allocated_vmalloc);
1783                bug = 1;
1784        }
1785
1786        if (bug)
1787                BUG();
1788}
1789
1790module_init(dm_bufio_init)
1791module_exit(dm_bufio_exit)
1792
1793module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
1794MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
1795
1796module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
1797MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
1798
1799module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
1800MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
1801
1802module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
1803MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
1804
1805module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
1806MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
1807
1808module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
1809MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
1810
1811module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
1812MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
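
/*
 * Illustrative only: the writable parameters can be tuned at runtime
 * through sysfs, e.g.
 *
 *	echo $((64 * 1024 * 1024)) > /sys/module/dm_bufio/parameters/max_cache_size_bytes
 *	echo 30 > /sys/module/dm_bufio/parameters/max_age_seconds
 *
 * The read-only parameters are counters for observing how much memory the
 * cache is using and through which allocator it was obtained.
 */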
1813
1814MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
1815MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
1816MODULE_LICENSE("GPL");
1817