linux/drivers/md/dm-thin.c
   1/*
   2 * Copyright (C) 2011-2012 Red Hat UK.
   3 *
   4 * This file is released under the GPL.
   5 */
   6
   7#include "dm-thin-metadata.h"
   8#include "dm.h"
   9
  10#include <linux/device-mapper.h>
  11#include <linux/dm-io.h>
  12#include <linux/dm-kcopyd.h>
  13#include <linux/list.h>
  14#include <linux/init.h>
  15#include <linux/module.h>
  16#include <linux/slab.h>
  17
  18#define DM_MSG_PREFIX   "thin"
  19
  20/*
  21 * Tunable constants
  22 */
  23#define ENDIO_HOOK_POOL_SIZE 1024
  24#define DEFERRED_SET_SIZE 64
  25#define MAPPING_POOL_SIZE 1024
  26#define PRISON_CELLS 1024
  27#define COMMIT_PERIOD HZ
  28
  29/*
  30 * The block size of the device holding pool data must be
  31 * between 64KB and 1GB.
  32 */
  33#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
  34#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
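/*
 * For reference (illustrative arithmetic, assuming the usual 512-byte
 * sector, SECTOR_SHIFT == 9): 64KB >> 9 == 128 sectors and
 * 1GB >> 9 == 2097152 sectors.
 */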
  35
  36/*
  37 * Device id is restricted to 24 bits.
  38 */
  39#define MAX_DEV_ID ((1 << 24) - 1)
  40
  41/*
  42 * How do we handle breaking sharing of data blocks?
  43 * =================================================
  44 *
  45 * We use a standard copy-on-write btree to store the mappings for the
  46 * devices (note I'm talking about copy-on-write of the metadata here, not
  47 * the data).  When you take an internal snapshot you clone the root node
  48 * of the origin btree.  After this there is no concept of an origin or a
  49 * snapshot.  They are just two device trees that happen to point to the
  50 * same data blocks.
  51 *
  52 * When we get a write in we decide if it's to a shared data block using
  53 * some timestamp magic.  If it is, we have to break sharing.
  54 *
  55 * Let's say we write to a shared block in what was the origin.  The
  56 * steps are:
  57 *
  58 * i) plug io further to this physical block. (see bio_prison code).
  59 *
  60 * ii) quiesce any read io to that shared data block.  Obviously
  61 * including all devices that share this block.  (see deferred_set code)
  62 *
   63 * iii) copy the data block to a newly allocated block.  This step can be
   64 * skipped if the io covers the whole block. (schedule_copy).
  65 *
  66 * iv) insert the new mapping into the origin's btree
  67 * (process_prepared_mapping).  This act of inserting breaks some
  68 * sharing of btree nodes between the two devices.  Breaking sharing only
   69 * affects the btree of that specific device.  Btrees for the other
  70 * devices that share the block never change.  The btree for the origin
   71 * device as it was after the last commit is untouched, i.e. we're using
  72 * persistent data structures in the functional programming sense.
  73 *
  74 * v) unplug io to this physical block, including the io that triggered
  75 * the breaking of sharing.
  76 *
  77 * Steps (ii) and (iii) occur in parallel.
  78 *
  79 * The metadata _doesn't_ need to be committed before the io continues.  We
  80 * get away with this because the io is always written to a _new_ block.
  81 * If there's a crash, then:
  82 *
  83 * - The origin mapping will point to the old origin block (the shared
  84 * one).  This will contain the data as it was before the io that triggered
  85 * the breaking of sharing came in.
  86 *
  87 * - The snap mapping still points to the old block.  As it would after
  88 * the commit.
  89 *
   90 * The downside of this scheme is that the timestamp magic isn't perfect, and
   91 * will continue to think the data block in the snapshot device is shared
  92 * even after the write to the origin has broken sharing.  I suspect data
  93 * blocks will typically be shared by many different devices, so we're
  94 * breaking sharing n + 1 times, rather than n, where n is the number of
  95 * devices that reference this data block.  At the moment I think the
  96 * benefits far, far outweigh the disadvantages.
  97 */
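/*
 * Illustrative sketch only, not part of the driver logic: roughly how the
 * steps above map onto the functions defined later in this file.
 *
 *	process_shared_bio()       step (i):  bio_detain() the data block
 *	break_sharing()            allocates a fresh data block
 *	schedule_internal_copy()   steps (ii)+(iii): ds_add_work() to quiesce
 *	                           readers, dm_kcopyd_copy() to copy the block
 *	                           (skipped when the bio overwrites it all)
 *	process_prepared_mapping() step (iv): dm_thin_insert_block()
 *	cell_defer()               step (v):  release the detained bios
 */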
  98
  99/*----------------------------------------------------------------*/
 100
 101/*
  102 * Sometimes we can't deal with a bio straight away, so we put it in prison
  103 * where it can't cause any mischief.  Bios are put in a cell identified
  104 * by a key; multiple bios can be in the same cell.  When the cell is
 105 * subsequently unlocked the bios become available.
 106 */
 107struct bio_prison;
 108
 109struct cell_key {
 110        int virtual;
 111        dm_thin_id dev;
 112        dm_block_t block;
 113};
 114
 115struct dm_bio_prison_cell {
 116        struct hlist_node list;
 117        struct bio_prison *prison;
 118        struct cell_key key;
 119        struct bio *holder;
 120        struct bio_list bios;
 121};
 122
 123struct bio_prison {
 124        spinlock_t lock;
 125        mempool_t *cell_pool;
 126
 127        unsigned nr_buckets;
 128        unsigned hash_mask;
 129        struct hlist_head *cells;
 130};
 131
 132static uint32_t calc_nr_buckets(unsigned nr_cells)
 133{
 134        uint32_t n = 128;
 135
 136        nr_cells /= 4;
 137        nr_cells = min(nr_cells, 8192u);
 138
 139        while (n < nr_cells)
 140                n <<= 1;
 141
 142        return n;
 143}
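/*
 * Worked example (illustrative): with the default PRISON_CELLS of 1024,
 * nr_cells / 4 == 256, which is already a power of two, so 256 buckets
 * are used; the minimum this function returns is 128.
 */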
 144
 145static struct kmem_cache *_cell_cache;
 146
 147/*
 148 * @nr_cells should be the number of cells you want in use _concurrently_.
 149 * Don't confuse it with the number of distinct keys.
 150 */
 151static struct bio_prison *prison_create(unsigned nr_cells)
 152{
 153        unsigned i;
 154        uint32_t nr_buckets = calc_nr_buckets(nr_cells);
 155        size_t len = sizeof(struct bio_prison) +
 156                (sizeof(struct hlist_head) * nr_buckets);
 157        struct bio_prison *prison = kmalloc(len, GFP_KERNEL);
 158
 159        if (!prison)
 160                return NULL;
 161
 162        spin_lock_init(&prison->lock);
 163        prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
 164        if (!prison->cell_pool) {
 165                kfree(prison);
 166                return NULL;
 167        }
 168
 169        prison->nr_buckets = nr_buckets;
 170        prison->hash_mask = nr_buckets - 1;
 171        prison->cells = (struct hlist_head *) (prison + 1);
 172        for (i = 0; i < nr_buckets; i++)
 173                INIT_HLIST_HEAD(prison->cells + i);
 174
 175        return prison;
 176}
 177
 178static void prison_destroy(struct bio_prison *prison)
 179{
 180        mempool_destroy(prison->cell_pool);
 181        kfree(prison);
 182}
 183
 184static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
 185{
 186        const unsigned long BIG_PRIME = 4294967291UL;
 187        uint64_t hash = key->block * BIG_PRIME;
 188
 189        return (uint32_t) (hash & prison->hash_mask);
 190}
 191
 192static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
 193{
  194        return (lhs->virtual == rhs->virtual) &&
  195               (lhs->dev == rhs->dev) &&
  196               (lhs->block == rhs->block);
 197}
 198
 199static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
 200                                                  struct cell_key *key)
 201{
 202        struct dm_bio_prison_cell *cell;
 203        struct hlist_node *tmp;
 204
 205        hlist_for_each_entry(cell, tmp, bucket, list)
 206                if (keys_equal(&cell->key, key))
 207                        return cell;
 208
 209        return NULL;
 210}
 211
 212/*
 213 * This may block if a new cell needs allocating.  You must ensure that
 214 * cells will be unlocked even if the calling thread is blocked.
 215 *
 216 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
 217 */
 218static int bio_detain(struct bio_prison *prison, struct cell_key *key,
 219                      struct bio *inmate, struct dm_bio_prison_cell **ref)
 220{
 221        int r = 1;
 222        unsigned long flags;
 223        uint32_t hash = hash_key(prison, key);
 224        struct dm_bio_prison_cell *cell, *cell2;
 225
  226        BUG_ON(hash >= prison->nr_buckets);
 227
 228        spin_lock_irqsave(&prison->lock, flags);
 229
 230        cell = __search_bucket(prison->cells + hash, key);
 231        if (cell) {
 232                bio_list_add(&cell->bios, inmate);
 233                goto out;
 234        }
 235
 236        /*
 237         * Allocate a new cell
 238         */
 239        spin_unlock_irqrestore(&prison->lock, flags);
 240        cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
 241        spin_lock_irqsave(&prison->lock, flags);
 242
 243        /*
 244         * We've been unlocked, so we have to double check that
 245         * nobody else has inserted this cell in the meantime.
 246         */
 247        cell = __search_bucket(prison->cells + hash, key);
 248        if (cell) {
 249                mempool_free(cell2, prison->cell_pool);
 250                bio_list_add(&cell->bios, inmate);
 251                goto out;
 252        }
 253
 254        /*
 255         * Use new cell.
 256         */
 257        cell = cell2;
 258
 259        cell->prison = prison;
 260        memcpy(&cell->key, key, sizeof(cell->key));
 261        cell->holder = inmate;
 262        bio_list_init(&cell->bios);
 263        hlist_add_head(&cell->list, prison->cells + hash);
 264
 265        r = 0;
 266
 267out:
 268        spin_unlock_irqrestore(&prison->lock, flags);
 269
 270        *ref = cell;
 271
 272        return r;
 273}
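/*
 * Usage sketch (illustrative only; it mirrors callers such as process_bio()
 * further down the file): detain a bio against a key and, once the blocking
 * condition clears, release the cell so the queued bios are reprocessed.
 *
 *	build_virtual_key(tc->td, block, &key);
 *	if (bio_detain(pool->prison, &key, bio, &cell))
 *		return;               (someone else already holds the cell)
 *	... do the work as the holder of the cell ...
 *	cell_release(cell, &pool->deferred_bios);
 */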
 274
 275/*
 276 * @inmates must have been initialised prior to this call
 277 */
 278static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
 279{
 280        struct bio_prison *prison = cell->prison;
 281
 282        hlist_del(&cell->list);
 283
 284        if (inmates) {
 285                bio_list_add(inmates, cell->holder);
 286                bio_list_merge(inmates, &cell->bios);
 287        }
 288
 289        mempool_free(cell, prison->cell_pool);
 290}
 291
 292static void cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios)
 293{
 294        unsigned long flags;
 295        struct bio_prison *prison = cell->prison;
 296
 297        spin_lock_irqsave(&prison->lock, flags);
 298        __cell_release(cell, bios);
 299        spin_unlock_irqrestore(&prison->lock, flags);
 300}
 301
 302/*
 303 * There are a couple of places where we put a bio into a cell briefly
 304 * before taking it out again.  In these situations we know that no other
 305 * bio may be in the cell.  This function releases the cell, and also does
 306 * a sanity check.
 307 */
 308static void __cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
 309{
 310        BUG_ON(cell->holder != bio);
 311        BUG_ON(!bio_list_empty(&cell->bios));
 312
 313        __cell_release(cell, NULL);
 314}
 315
 316static void cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
 317{
 318        unsigned long flags;
 319        struct bio_prison *prison = cell->prison;
 320
 321        spin_lock_irqsave(&prison->lock, flags);
 322        __cell_release_singleton(cell, bio);
 323        spin_unlock_irqrestore(&prison->lock, flags);
 324}
 325
 326/*
 327 * Sometimes we don't want the holder, just the additional bios.
 328 */
 329static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
 330                                     struct bio_list *inmates)
 331{
 332        struct bio_prison *prison = cell->prison;
 333
 334        hlist_del(&cell->list);
 335        bio_list_merge(inmates, &cell->bios);
 336
 337        mempool_free(cell, prison->cell_pool);
 338}
 339
 340static void cell_release_no_holder(struct dm_bio_prison_cell *cell,
 341                                   struct bio_list *inmates)
 342{
 343        unsigned long flags;
 344        struct bio_prison *prison = cell->prison;
 345
 346        spin_lock_irqsave(&prison->lock, flags);
 347        __cell_release_no_holder(cell, inmates);
 348        spin_unlock_irqrestore(&prison->lock, flags);
 349}
 350
 351static void cell_error(struct dm_bio_prison_cell *cell)
 352{
 353        struct bio_prison *prison = cell->prison;
 354        struct bio_list bios;
 355        struct bio *bio;
 356        unsigned long flags;
 357
 358        bio_list_init(&bios);
 359
 360        spin_lock_irqsave(&prison->lock, flags);
 361        __cell_release(cell, &bios);
 362        spin_unlock_irqrestore(&prison->lock, flags);
 363
 364        while ((bio = bio_list_pop(&bios)))
 365                bio_io_error(bio);
 366}
 367
 368/*----------------------------------------------------------------*/
 369
 370/*
 371 * We use the deferred set to keep track of pending reads to shared blocks.
 372 * We do this to ensure the new mapping caused by a write isn't performed
 373 * until these prior reads have completed.  Otherwise the insertion of the
 374 * new mapping could free the old block that the read bios are mapped to.
 375 */
 376
 377struct deferred_set;
 378struct deferred_entry {
 379        struct deferred_set *ds;
 380        unsigned count;
 381        struct list_head work_items;
 382};
 383
 384struct deferred_set {
 385        spinlock_t lock;
 386        unsigned current_entry;
 387        unsigned sweeper;
 388        struct deferred_entry entries[DEFERRED_SET_SIZE];
 389};
 390
 391static void ds_init(struct deferred_set *ds)
 392{
 393        int i;
 394
 395        spin_lock_init(&ds->lock);
 396        ds->current_entry = 0;
 397        ds->sweeper = 0;
 398        for (i = 0; i < DEFERRED_SET_SIZE; i++) {
 399                ds->entries[i].ds = ds;
 400                ds->entries[i].count = 0;
 401                INIT_LIST_HEAD(&ds->entries[i].work_items);
 402        }
 403}
 404
 405static struct deferred_entry *ds_inc(struct deferred_set *ds)
 406{
 407        unsigned long flags;
 408        struct deferred_entry *entry;
 409
 410        spin_lock_irqsave(&ds->lock, flags);
 411        entry = ds->entries + ds->current_entry;
 412        entry->count++;
 413        spin_unlock_irqrestore(&ds->lock, flags);
 414
 415        return entry;
 416}
 417
 418static unsigned ds_next(unsigned index)
 419{
 420        return (index + 1) % DEFERRED_SET_SIZE;
 421}
 422
 423static void __sweep(struct deferred_set *ds, struct list_head *head)
 424{
 425        while ((ds->sweeper != ds->current_entry) &&
 426               !ds->entries[ds->sweeper].count) {
 427                list_splice_init(&ds->entries[ds->sweeper].work_items, head);
 428                ds->sweeper = ds_next(ds->sweeper);
 429        }
 430
 431        if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
 432                list_splice_init(&ds->entries[ds->sweeper].work_items, head);
 433}
 434
 435static void ds_dec(struct deferred_entry *entry, struct list_head *head)
 436{
 437        unsigned long flags;
 438
 439        spin_lock_irqsave(&entry->ds->lock, flags);
 440        BUG_ON(!entry->count);
 441        --entry->count;
 442        __sweep(entry->ds, head);
 443        spin_unlock_irqrestore(&entry->ds->lock, flags);
 444}
 445
 446/*
  447 * Returns 1 if the work was deferred, or 0 if there were no pending entries
  448 * and the caller should do the work immediately.
 448 */
 449static int ds_add_work(struct deferred_set *ds, struct list_head *work)
 450{
 451        int r = 1;
 452        unsigned long flags;
 453        unsigned next_entry;
 454
 455        spin_lock_irqsave(&ds->lock, flags);
 456        if ((ds->sweeper == ds->current_entry) &&
 457            !ds->entries[ds->current_entry].count)
 458                r = 0;
 459        else {
 460                list_add(work, &ds->entries[ds->current_entry].work_items);
 461                next_entry = ds_next(ds->current_entry);
 462                if (!ds->entries[next_entry].count)
 463                        ds->current_entry = next_entry;
 464        }
 465        spin_unlock_irqrestore(&ds->lock, flags);
 466
 467        return r;
 468}
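/*
 * Lifecycle sketch (illustrative only, based on the callers in this driver):
 *
 *	ds_inc()	taken per bio, e.g. for reads of a shared block
 *			(process_shared_bio) and for all_io_ds on non-discard
 *			bios (thin_hook_bio).
 *	ds_dec()	dropped from the endio hook when those bios complete,
 *			sweeping any work items whose entries have drained.
 *	ds_add_work()	called when preparing a mapping or discard; a return
 *			of 0 means nothing is outstanding and the work may
 *			proceed immediately, otherwise it runs after a later
 *			ds_dec() sweeps it.
 */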
 469
 470/*----------------------------------------------------------------*/
 471
 472/*
 473 * Key building.
 474 */
 475static void build_data_key(struct dm_thin_device *td,
 476                           dm_block_t b, struct cell_key *key)
 477{
 478        key->virtual = 0;
 479        key->dev = dm_thin_dev_id(td);
 480        key->block = b;
 481}
 482
 483static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 484                              struct cell_key *key)
 485{
 486        key->virtual = 1;
 487        key->dev = dm_thin_dev_id(td);
 488        key->block = b;
 489}
 490
 491/*----------------------------------------------------------------*/
 492
 493/*
 494 * A pool device ties together a metadata device and a data device.  It
 495 * also provides the interface for creating and destroying internal
 496 * devices.
 497 */
 498struct dm_thin_new_mapping;
 499
 500/*
  501 * The pool runs in 3 modes, ordered by increasing degradation so that modes can be compared.
 502 */
 503enum pool_mode {
 504        PM_WRITE,               /* metadata may be changed */
 505        PM_READ_ONLY,           /* metadata may not be changed */
 506        PM_FAIL,                /* all I/O fails */
 507};
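/*
 * Because of this ordering a plain integer comparison says whether one mode
 * is at least as degraded as another (PM_WRITE < PM_READ_ONLY < PM_FAIL).
 * The callers shown in this file only test for equality, e.g.
 * get_pool_mode(pool) != PM_WRITE, but the ordering keeps such range checks
 * available.
 */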
 508
 509struct pool_features {
 510        enum pool_mode mode;
 511
 512        bool zero_new_blocks:1;
 513        bool discard_enabled:1;
 514        bool discard_passdown:1;
 515};
 516
 517struct thin_c;
 518typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
 519typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
 520
 521struct pool {
 522        struct list_head list;
 523        struct dm_target *ti;   /* Only set if a pool target is bound */
 524
 525        struct mapped_device *pool_md;
 526        struct block_device *md_dev;
 527        struct dm_pool_metadata *pmd;
 528
 529        dm_block_t low_water_blocks;
 530        uint32_t sectors_per_block;
 531        int sectors_per_block_shift;
 532
 533        struct pool_features pf;
 534        unsigned low_water_triggered:1; /* A dm event has been sent */
 535        unsigned no_free_space:1;       /* A -ENOSPC warning has been issued */
 536
 537        struct bio_prison *prison;
 538        struct dm_kcopyd_client *copier;
 539
 540        struct workqueue_struct *wq;
 541        struct work_struct worker;
 542        struct delayed_work waker;
 543
 544        unsigned long last_commit_jiffies;
 545        unsigned ref_count;
 546
 547        spinlock_t lock;
 548        struct bio_list deferred_bios;
 549        struct bio_list deferred_flush_bios;
 550        struct list_head prepared_mappings;
 551        struct list_head prepared_discards;
 552
 553        struct bio_list retry_on_resume_list;
 554
 555        struct deferred_set shared_read_ds;
 556        struct deferred_set all_io_ds;
 557
 558        struct dm_thin_new_mapping *next_mapping;
 559        mempool_t *mapping_pool;
 560        mempool_t *endio_hook_pool;
 561
 562        process_bio_fn process_bio;
 563        process_bio_fn process_discard;
 564
 565        process_mapping_fn process_prepared_mapping;
 566        process_mapping_fn process_prepared_discard;
 567};
 568
 569static enum pool_mode get_pool_mode(struct pool *pool);
 570static void set_pool_mode(struct pool *pool, enum pool_mode mode);
 571
 572/*
 573 * Target context for a pool.
 574 */
 575struct pool_c {
 576        struct dm_target *ti;
 577        struct pool *pool;
 578        struct dm_dev *data_dev;
 579        struct dm_dev *metadata_dev;
 580        struct dm_target_callbacks callbacks;
 581
 582        dm_block_t low_water_blocks;
 583        struct pool_features requested_pf; /* Features requested during table load */
 584        struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
 585};
 586
 587/*
 588 * Target context for a thin.
 589 */
 590struct thin_c {
 591        struct dm_dev *pool_dev;
 592        struct dm_dev *origin_dev;
 593        dm_thin_id dev_id;
 594
 595        struct pool *pool;
 596        struct dm_thin_device *td;
 597};
 598
 599/*----------------------------------------------------------------*/
 600
 601/*
 602 * A global list of pools that uses a struct mapped_device as a key.
 603 */
 604static struct dm_thin_pool_table {
 605        struct mutex mutex;
 606        struct list_head pools;
 607} dm_thin_pool_table;
 608
 609static void pool_table_init(void)
 610{
 611        mutex_init(&dm_thin_pool_table.mutex);
 612        INIT_LIST_HEAD(&dm_thin_pool_table.pools);
 613}
 614
 615static void __pool_table_insert(struct pool *pool)
 616{
 617        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 618        list_add(&pool->list, &dm_thin_pool_table.pools);
 619}
 620
 621static void __pool_table_remove(struct pool *pool)
 622{
 623        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 624        list_del(&pool->list);
 625}
 626
 627static struct pool *__pool_table_lookup(struct mapped_device *md)
 628{
 629        struct pool *pool = NULL, *tmp;
 630
 631        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 632
 633        list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
 634                if (tmp->pool_md == md) {
 635                        pool = tmp;
 636                        break;
 637                }
 638        }
 639
 640        return pool;
 641}
 642
 643static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
 644{
 645        struct pool *pool = NULL, *tmp;
 646
 647        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 648
 649        list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
 650                if (tmp->md_dev == md_dev) {
 651                        pool = tmp;
 652                        break;
 653                }
 654        }
 655
 656        return pool;
 657}
 658
 659/*----------------------------------------------------------------*/
 660
 661struct dm_thin_endio_hook {
 662        struct thin_c *tc;
 663        struct deferred_entry *shared_read_entry;
 664        struct deferred_entry *all_io_entry;
 665        struct dm_thin_new_mapping *overwrite_mapping;
 666};
 667
 668static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 669{
 670        struct bio *bio;
 671        struct bio_list bios;
 672
 673        bio_list_init(&bios);
 674        bio_list_merge(&bios, master);
 675        bio_list_init(master);
 676
 677        while ((bio = bio_list_pop(&bios))) {
 678                struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
 679
 680                if (h->tc == tc)
 681                        bio_endio(bio, DM_ENDIO_REQUEUE);
 682                else
 683                        bio_list_add(master, bio);
 684        }
 685}
 686
 687static void requeue_io(struct thin_c *tc)
 688{
 689        struct pool *pool = tc->pool;
 690        unsigned long flags;
 691
 692        spin_lock_irqsave(&pool->lock, flags);
 693        __requeue_bio_list(tc, &pool->deferred_bios);
 694        __requeue_bio_list(tc, &pool->retry_on_resume_list);
 695        spin_unlock_irqrestore(&pool->lock, flags);
 696}
 697
 698/*
 699 * This section of code contains the logic for processing a thin device's IO.
 700 * Much of the code depends on pool object resources (lists, workqueues, etc)
 701 * but most is exclusively called from the thin target rather than the thin-pool
 702 * target.
 703 */
 704
 705static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 706{
 707        sector_t block_nr = bio->bi_sector;
 708
 709        if (tc->pool->sectors_per_block_shift < 0)
 710                (void) sector_div(block_nr, tc->pool->sectors_per_block);
 711        else
 712                block_nr >>= tc->pool->sectors_per_block_shift;
 713
 714        return block_nr;
 715}
 716
 717static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
 718{
 719        struct pool *pool = tc->pool;
 720        sector_t bi_sector = bio->bi_sector;
 721
 722        bio->bi_bdev = tc->pool_dev->bdev;
 723        if (tc->pool->sectors_per_block_shift < 0)
 724                bio->bi_sector = (block * pool->sectors_per_block) +
 725                                 sector_div(bi_sector, pool->sectors_per_block);
 726        else
 727                bio->bi_sector = (block << pool->sectors_per_block_shift) |
 728                                (bi_sector & (pool->sectors_per_block - 1));
 729}
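/*
 * Worked example (illustrative, with assumed numbers): 64KB blocks give
 * sectors_per_block == 128 and sectors_per_block_shift == 7.  A bio at
 * sector 1000 is in virtual block 1000 >> 7 == 7; if that block maps to
 * data block 42, remap() sends the bio to sector
 * (42 << 7) | (1000 & 127) == 5376 + 104 == 5480 on the pool data device.
 */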
 730
 731static void remap_to_origin(struct thin_c *tc, struct bio *bio)
 732{
 733        bio->bi_bdev = tc->origin_dev->bdev;
 734}
 735
 736static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
 737{
 738        return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
 739                dm_thin_changed_this_transaction(tc->td);
 740}
 741
 742static void issue(struct thin_c *tc, struct bio *bio)
 743{
 744        struct pool *pool = tc->pool;
 745        unsigned long flags;
 746
 747        if (!bio_triggers_commit(tc, bio)) {
 748                generic_make_request(bio);
 749                return;
 750        }
 751
 752        /*
 753         * Complete bio with an error if earlier I/O caused changes to
  754         * the metadata that can't be committed, e.g. due to I/O errors
 755         * on the metadata device.
 756         */
 757        if (dm_thin_aborted_changes(tc->td)) {
 758                bio_io_error(bio);
 759                return;
 760        }
 761
 762        /*
 763         * Batch together any bios that trigger commits and then issue a
 764         * single commit for them in process_deferred_bios().
 765         */
 766        spin_lock_irqsave(&pool->lock, flags);
 767        bio_list_add(&pool->deferred_flush_bios, bio);
 768        spin_unlock_irqrestore(&pool->lock, flags);
 769}
 770
 771static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
 772{
 773        remap_to_origin(tc, bio);
 774        issue(tc, bio);
 775}
 776
 777static void remap_and_issue(struct thin_c *tc, struct bio *bio,
 778                            dm_block_t block)
 779{
 780        remap(tc, bio, block);
 781        issue(tc, bio);
 782}
 783
 784/*
 785 * wake_worker() is used when new work is queued and when pool_resume is
 786 * ready to continue deferred IO processing.
 787 */
 788static void wake_worker(struct pool *pool)
 789{
 790        queue_work(pool->wq, &pool->worker);
 791}
 792
 793/*----------------------------------------------------------------*/
 794
 795/*
 796 * Bio endio functions.
 797 */
 798struct dm_thin_new_mapping {
 799        struct list_head list;
 800
 801        unsigned quiesced:1;
 802        unsigned prepared:1;
 803        unsigned pass_discard:1;
 804
 805        struct thin_c *tc;
 806        dm_block_t virt_block;
 807        dm_block_t data_block;
 808        struct dm_bio_prison_cell *cell, *cell2;
 809        int err;
 810
 811        /*
 812         * If the bio covers the whole area of a block then we can avoid
 813         * zeroing or copying.  Instead this bio is hooked.  The bio will
 814         * still be in the cell, so care has to be taken to avoid issuing
 815         * the bio twice.
 816         */
 817        struct bio *bio;
 818        bio_end_io_t *saved_bi_end_io;
 819};
 820
 821static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
 822{
 823        struct pool *pool = m->tc->pool;
 824
 825        if (m->quiesced && m->prepared) {
 826                list_add(&m->list, &pool->prepared_mappings);
 827                wake_worker(pool);
 828        }
 829}
 830
 831static void copy_complete(int read_err, unsigned long write_err, void *context)
 832{
 833        unsigned long flags;
 834        struct dm_thin_new_mapping *m = context;
 835        struct pool *pool = m->tc->pool;
 836
 837        m->err = read_err || write_err ? -EIO : 0;
 838
 839        spin_lock_irqsave(&pool->lock, flags);
 840        m->prepared = 1;
 841        __maybe_add_mapping(m);
 842        spin_unlock_irqrestore(&pool->lock, flags);
 843}
 844
 845static void overwrite_endio(struct bio *bio, int err)
 846{
 847        unsigned long flags;
 848        struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
 849        struct dm_thin_new_mapping *m = h->overwrite_mapping;
 850        struct pool *pool = m->tc->pool;
 851
 852        m->err = err;
 853
 854        spin_lock_irqsave(&pool->lock, flags);
 855        m->prepared = 1;
 856        __maybe_add_mapping(m);
 857        spin_unlock_irqrestore(&pool->lock, flags);
 858}
 859
 860/*----------------------------------------------------------------*/
 861
 862/*
 863 * Workqueue.
 864 */
 865
 866/*
 867 * Prepared mapping jobs.
 868 */
 869
 870/*
 871 * This sends the bios in the cell back to the deferred_bios list.
 872 */
 873static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell,
 874                       dm_block_t data_block)
 875{
 876        struct pool *pool = tc->pool;
 877        unsigned long flags;
 878
 879        spin_lock_irqsave(&pool->lock, flags);
 880        cell_release(cell, &pool->deferred_bios);
  881        spin_unlock_irqrestore(&pool->lock, flags);
 882
 883        wake_worker(pool);
 884}
 885
 886/*
 887 * Same as cell_defer above, except it omits one particular detainee,
 888 * a write bio that covers the block and has already been processed.
 889 */
 890static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell)
 891{
 892        struct bio_list bios;
 893        struct pool *pool = tc->pool;
 894        unsigned long flags;
 895
 896        bio_list_init(&bios);
 897
 898        spin_lock_irqsave(&pool->lock, flags);
 899        cell_release_no_holder(cell, &pool->deferred_bios);
 900        spin_unlock_irqrestore(&pool->lock, flags);
 901
 902        wake_worker(pool);
 903}
 904
 905static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 906{
 907        if (m->bio)
 908                m->bio->bi_end_io = m->saved_bi_end_io;
 909        cell_error(m->cell);
 910        list_del(&m->list);
 911        mempool_free(m, m->tc->pool->mapping_pool);
  912}

  913static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 914{
 915        struct thin_c *tc = m->tc;
 916        struct bio *bio;
 917        int r;
 918
 919        bio = m->bio;
 920        if (bio)
 921                bio->bi_end_io = m->saved_bi_end_io;
 922
 923        if (m->err) {
 924                cell_error(m->cell);
 925                goto out;
 926        }
 927
 928        /*
 929         * Commit the prepared block into the mapping btree.
 930         * Any I/O for this block arriving after this point will get
 931         * remapped to it directly.
 932         */
 933        r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
 934        if (r) {
 935                DMERR("dm_thin_insert_block() failed");
 936                cell_error(m->cell);
 937                goto out;
 938        }
 939
 940        /*
 941         * Release any bios held while the block was being provisioned.
 942         * If we are processing a write bio that completely covers the block,
  943         * we already processed it, so we can ignore it now when processing
 944         * the bios in the cell.
 945         */
 946        if (bio) {
 947                cell_defer_except(tc, m->cell);
 948                bio_endio(bio, 0);
 949        } else
 950                cell_defer(tc, m->cell, m->data_block);
 951
 952out:
 953        list_del(&m->list);
 954        mempool_free(m, tc->pool->mapping_pool);
 955}
 956
 957static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
 958{
 959        struct thin_c *tc = m->tc;
 960
 961        bio_io_error(m->bio);
 962        cell_defer_except(tc, m->cell);
 963        cell_defer_except(tc, m->cell2);
 964        mempool_free(m, tc->pool->mapping_pool);
 965}
 966
 967static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
 968{
 969        struct thin_c *tc = m->tc;
 970
 971        if (m->pass_discard)
 972                remap_and_issue(tc, m->bio, m->data_block);
 973        else
 974                bio_endio(m->bio, 0);
 975
 976        cell_defer_except(tc, m->cell);
 977        cell_defer_except(tc, m->cell2);
 978        mempool_free(m, tc->pool->mapping_pool);
 979}
 980
 981static void process_prepared_discard(struct dm_thin_new_mapping *m)
 982{
 983        int r;
 984        struct thin_c *tc = m->tc;
 985
 986        r = dm_thin_remove_block(tc->td, m->virt_block);
 987        if (r)
 988                DMERR("dm_thin_remove_block() failed");
 989
 990        process_prepared_discard_passdown(m);
 991}
 992
 993static void process_prepared(struct pool *pool, struct list_head *head,
 994                             process_mapping_fn *fn)
 995{
 996        unsigned long flags;
 997        struct list_head maps;
 998        struct dm_thin_new_mapping *m, *tmp;
 999
1000        INIT_LIST_HEAD(&maps);
1001        spin_lock_irqsave(&pool->lock, flags);
1002        list_splice_init(head, &maps);
1003        spin_unlock_irqrestore(&pool->lock, flags);
1004
1005        list_for_each_entry_safe(m, tmp, &maps, list)
1006                (*fn)(m);
1007}
1008
1009/*
1010 * Deferred bio jobs.
1011 */
1012static int io_overlaps_block(struct pool *pool, struct bio *bio)
1013{
1014        return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
1015}
1016
1017static int io_overwrites_block(struct pool *pool, struct bio *bio)
1018{
1019        return (bio_data_dir(bio) == WRITE) &&
1020                io_overlaps_block(pool, bio);
1021}
1022
1023static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
1024                               bio_end_io_t *fn)
1025{
1026        *save = bio->bi_end_io;
1027        bio->bi_end_io = fn;
1028}
1029
1030static int ensure_next_mapping(struct pool *pool)
1031{
1032        if (pool->next_mapping)
1033                return 0;
1034
1035        pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
1036
1037        return pool->next_mapping ? 0 : -ENOMEM;
1038}
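/*
 * Note: the GFP_ATOMIC allocation above may fail under memory pressure.
 * process_deferred_bios() copes with this by pushing the remaining bios
 * back onto the deferred list and retrying once prepared mappings have
 * been processed and returned to mapping_pool.
 */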
1039
1040static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
1041{
1042        struct dm_thin_new_mapping *r = pool->next_mapping;
1043
1044        BUG_ON(!pool->next_mapping);
1045
1046        pool->next_mapping = NULL;
1047
1048        return r;
1049}
1050
1051static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
1052                          struct dm_dev *origin, dm_block_t data_origin,
1053                          dm_block_t data_dest,
1054                          struct dm_bio_prison_cell *cell, struct bio *bio)
1055{
1056        int r;
1057        struct pool *pool = tc->pool;
1058        struct dm_thin_new_mapping *m = get_next_mapping(pool);
1059
1060        INIT_LIST_HEAD(&m->list);
1061        m->quiesced = 0;
1062        m->prepared = 0;
1063        m->tc = tc;
1064        m->virt_block = virt_block;
1065        m->data_block = data_dest;
1066        m->cell = cell;
1067        m->err = 0;
1068        m->bio = NULL;
1069
1070        if (!ds_add_work(&pool->shared_read_ds, &m->list))
1071                m->quiesced = 1;
1072
1073        /*
1074         * IO to pool_dev remaps to the pool target's data_dev.
1075         *
1076         * If the whole block of data is being overwritten, we can issue the
1077         * bio immediately. Otherwise we use kcopyd to clone the data first.
1078         */
1079        if (io_overwrites_block(pool, bio)) {
1080                struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
1081
1082                h->overwrite_mapping = m;
1083                m->bio = bio;
1084                save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
1085                remap_and_issue(tc, bio, data_dest);
1086        } else {
1087                struct dm_io_region from, to;
1088
1089                from.bdev = origin->bdev;
1090                from.sector = data_origin * pool->sectors_per_block;
1091                from.count = pool->sectors_per_block;
1092
1093                to.bdev = tc->pool_dev->bdev;
1094                to.sector = data_dest * pool->sectors_per_block;
1095                to.count = pool->sectors_per_block;
1096
1097                r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
1098                                   0, copy_complete, m);
1099                if (r < 0) {
1100                        mempool_free(m, pool->mapping_pool);
1101                        DMERR("dm_kcopyd_copy() failed");
1102                        cell_error(cell);
1103                }
1104        }
1105}
1106
1107static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
1108                                   dm_block_t data_origin, dm_block_t data_dest,
1109                                   struct dm_bio_prison_cell *cell, struct bio *bio)
1110{
1111        schedule_copy(tc, virt_block, tc->pool_dev,
1112                      data_origin, data_dest, cell, bio);
1113}
1114
1115static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
1116                                   dm_block_t data_dest,
1117                                   struct dm_bio_prison_cell *cell, struct bio *bio)
1118{
1119        schedule_copy(tc, virt_block, tc->origin_dev,
1120                      virt_block, data_dest, cell, bio);
1121}
1122
1123static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
1124                          dm_block_t data_block, struct dm_bio_prison_cell *cell,
1125                          struct bio *bio)
1126{
1127        struct pool *pool = tc->pool;
1128        struct dm_thin_new_mapping *m = get_next_mapping(pool);
1129
1130        INIT_LIST_HEAD(&m->list);
1131        m->quiesced = 1;
1132        m->prepared = 0;
1133        m->tc = tc;
1134        m->virt_block = virt_block;
1135        m->data_block = data_block;
1136        m->cell = cell;
1137        m->err = 0;
1138        m->bio = NULL;
1139
1140        /*
1141         * If the whole block of data is being overwritten or we are not
1142         * zeroing pre-existing data, we can issue the bio immediately.
1143         * Otherwise we use kcopyd to zero the data first.
1144         */
1145        if (!pool->pf.zero_new_blocks)
1146                process_prepared_mapping(m);
1147
1148        else if (io_overwrites_block(pool, bio)) {
1149                struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
1150
1151                h->overwrite_mapping = m;
1152                m->bio = bio;
1153                save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
1154                remap_and_issue(tc, bio, data_block);
1155        } else {
1156                int r;
1157                struct dm_io_region to;
1158
1159                to.bdev = tc->pool_dev->bdev;
1160                to.sector = data_block * pool->sectors_per_block;
1161                to.count = pool->sectors_per_block;
1162
1163                r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
1164                if (r < 0) {
1165                        mempool_free(m, pool->mapping_pool);
1166                        DMERR("dm_kcopyd_zero() failed");
1167                        cell_error(cell);
1168                }
1169        }
1170}
1171
1172static int commit(struct pool *pool)
1173{
1174        int r;
1175
1176        r = dm_pool_commit_metadata(pool->pmd);
1177        if (r)
1178                DMERR("commit failed, error = %d", r);
1179
1180        return r;
1181}
1182
1183/*
1184 * A non-zero return indicates read_only or fail_io mode.
1185 * Many callers don't care about the return value.
1186 */
1187static int commit_or_fallback(struct pool *pool)
1188{
1189        int r;
1190
1191        if (get_pool_mode(pool) != PM_WRITE)
1192                return -EINVAL;
1193
1194        r = commit(pool);
1195        if (r)
1196                set_pool_mode(pool, PM_READ_ONLY);
1197
1198        return r;
1199}
1200
1201static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1202{
1203        int r;
1204        dm_block_t free_blocks;
1205        unsigned long flags;
1206        struct pool *pool = tc->pool;
1207
1208        r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1209        if (r)
1210                return r;
1211
1212        if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
1213                DMWARN("%s: reached low water mark, sending event.",
1214                       dm_device_name(pool->pool_md));
1215                spin_lock_irqsave(&pool->lock, flags);
1216                pool->low_water_triggered = 1;
1217                spin_unlock_irqrestore(&pool->lock, flags);
1218                dm_table_event(pool->ti->table);
1219        }
1220
1221        if (!free_blocks) {
1222                if (pool->no_free_space)
1223                        return -ENOSPC;
1224                else {
1225                        /*
1226                         * Try to commit to see if that will free up some
1227                         * more space.
1228                         */
1229                        (void) commit_or_fallback(pool);
1230
1231                        r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1232                        if (r)
1233                                return r;
1234
1235                        /*
1236                         * If we still have no space we set a flag to avoid
1237                         * doing all this checking and return -ENOSPC.
1238                         */
1239                        if (!free_blocks) {
1240                                DMWARN("%s: no free space available.",
1241                                       dm_device_name(pool->pool_md));
1242                                spin_lock_irqsave(&pool->lock, flags);
1243                                pool->no_free_space = 1;
1244                                spin_unlock_irqrestore(&pool->lock, flags);
1245                                return -ENOSPC;
1246                        }
1247                }
1248        }
1249
1250        r = dm_pool_alloc_data_block(pool->pmd, result);
1251        if (r)
1252                return r;
1253
1254        return 0;
1255}
1256
1257/*
1258 * If we have run out of space, queue bios until the device is
1259 * resumed, presumably after having been reloaded with more space.
1260 */
1261static void retry_on_resume(struct bio *bio)
1262{
1263        struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
1264        struct thin_c *tc = h->tc;
1265        struct pool *pool = tc->pool;
1266        unsigned long flags;
1267
1268        spin_lock_irqsave(&pool->lock, flags);
1269        bio_list_add(&pool->retry_on_resume_list, bio);
1270        spin_unlock_irqrestore(&pool->lock, flags);
1271}
1272
1273static void no_space(struct dm_bio_prison_cell *cell)
1274{
1275        struct bio *bio;
1276        struct bio_list bios;
1277
1278        bio_list_init(&bios);
1279        cell_release(cell, &bios);
1280
1281        while ((bio = bio_list_pop(&bios)))
1282                retry_on_resume(bio);
1283}
1284
1285static void process_discard(struct thin_c *tc, struct bio *bio)
1286{
1287        int r;
1288        unsigned long flags;
1289        struct pool *pool = tc->pool;
1290        struct dm_bio_prison_cell *cell, *cell2;
1291        struct cell_key key, key2;
1292        dm_block_t block = get_bio_block(tc, bio);
1293        struct dm_thin_lookup_result lookup_result;
1294        struct dm_thin_new_mapping *m;
1295
1296        build_virtual_key(tc->td, block, &key);
1297        if (bio_detain(tc->pool->prison, &key, bio, &cell))
1298                return;
1299
1300        r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1301        switch (r) {
1302        case 0:
1303                /*
1304                 * Check nobody is fiddling with this pool block.  This can
1305                 * happen if someone's in the process of breaking sharing
1306                 * on this block.
1307                 */
1308                build_data_key(tc->td, lookup_result.block, &key2);
1309                if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
1310                        cell_release_singleton(cell, bio);
1311                        break;
1312                }
1313
1314                if (io_overlaps_block(pool, bio)) {
1315                        /*
1316                         * IO may still be going to the destination block.  We must
1317                         * quiesce before we can do the removal.
1318                         */
1319                        m = get_next_mapping(pool);
1320                        m->tc = tc;
1321                        m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
1322                        m->virt_block = block;
1323                        m->data_block = lookup_result.block;
1324                        m->cell = cell;
1325                        m->cell2 = cell2;
1326                        m->err = 0;
1327                        m->bio = bio;
1328
1329                        if (!ds_add_work(&pool->all_io_ds, &m->list)) {
1330                                spin_lock_irqsave(&pool->lock, flags);
1331                                list_add(&m->list, &pool->prepared_discards);
1332                                spin_unlock_irqrestore(&pool->lock, flags);
1333                                wake_worker(pool);
1334                        }
1335                } else {
1336                        /*
1337                         * The DM core makes sure that the discard doesn't span
1338                         * a block boundary.  So we submit the discard of a
1339                         * partial block appropriately.
1340                         */
1341                        cell_release_singleton(cell, bio);
1342                        cell_release_singleton(cell2, bio);
1343                        if ((!lookup_result.shared) && pool->pf.discard_passdown)
1344                                remap_and_issue(tc, bio, lookup_result.block);
1345                        else
1346                                bio_endio(bio, 0);
1347                }
1348                break;
1349
1350        case -ENODATA:
1351                /*
1352                 * It isn't provisioned, just forget it.
1353                 */
1354                cell_release_singleton(cell, bio);
1355                bio_endio(bio, 0);
1356                break;
1357
1358        default:
1359                DMERR("discard: find block unexpectedly returned %d", r);
1360                cell_release_singleton(cell, bio);
1361                bio_io_error(bio);
1362                break;
1363        }
1364}
1365
1366static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1367                          struct cell_key *key,
1368                          struct dm_thin_lookup_result *lookup_result,
1369                          struct dm_bio_prison_cell *cell)
1370{
1371        int r;
1372        dm_block_t data_block;
1373
1374        r = alloc_data_block(tc, &data_block);
1375        switch (r) {
1376        case 0:
1377                schedule_internal_copy(tc, block, lookup_result->block,
1378                                       data_block, cell, bio);
1379                break;
1380
1381        case -ENOSPC:
1382                no_space(cell);
1383                break;
1384
1385        default:
1386                DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1387                cell_error(cell);
1388                break;
1389        }
1390}
1391
1392static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1393                               dm_block_t block,
1394                               struct dm_thin_lookup_result *lookup_result)
1395{
1396        struct dm_bio_prison_cell *cell;
1397        struct pool *pool = tc->pool;
1398        struct cell_key key;
1399
1400        /*
1401         * If cell is already occupied, then sharing is already in the process
1402         * of being broken so we have nothing further to do here.
1403         */
1404        build_data_key(tc->td, lookup_result->block, &key);
1405        if (bio_detain(pool->prison, &key, bio, &cell))
1406                return;
1407
1408        if (bio_data_dir(bio) == WRITE && bio->bi_size)
1409                break_sharing(tc, bio, block, &key, lookup_result, cell);
1410        else {
1411                struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
1412
1413                h->shared_read_entry = ds_inc(&pool->shared_read_ds);
1414
1415                cell_release_singleton(cell, bio);
1416                remap_and_issue(tc, bio, lookup_result->block);
1417        }
1418}
1419
1420static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1421                            struct dm_bio_prison_cell *cell)
1422{
1423        int r;
1424        dm_block_t data_block;
1425
1426        /*
1427         * Remap empty bios (flushes) immediately, without provisioning.
1428         */
1429        if (!bio->bi_size) {
1430                cell_release_singleton(cell, bio);
1431                remap_and_issue(tc, bio, 0);
1432                return;
1433        }
1434
1435        /*
1436         * Fill read bios with zeroes and complete them immediately.
1437         */
1438        if (bio_data_dir(bio) == READ) {
1439                zero_fill_bio(bio);
1440                cell_release_singleton(cell, bio);
1441                bio_endio(bio, 0);
1442                return;
1443        }
1444
1445        r = alloc_data_block(tc, &data_block);
1446        switch (r) {
1447        case 0:
1448                if (tc->origin_dev)
1449                        schedule_external_copy(tc, block, data_block, cell, bio);
1450                else
1451                        schedule_zero(tc, block, data_block, cell, bio);
1452                break;
1453
1454        case -ENOSPC:
1455                no_space(cell);
1456                break;
1457
1458        default:
1459                DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1460                set_pool_mode(tc->pool, PM_READ_ONLY);
1461                cell_error(cell);
1462                break;
1463        }
1464}
1465
1466static void process_bio(struct thin_c *tc, struct bio *bio)
1467{
1468        int r;
1469        dm_block_t block = get_bio_block(tc, bio);
1470        struct dm_bio_prison_cell *cell;
1471        struct cell_key key;
1472        struct dm_thin_lookup_result lookup_result;
1473
1474        /*
1475         * If cell is already occupied, then the block is already
1476         * being provisioned so we have nothing further to do here.
1477         */
1478        build_virtual_key(tc->td, block, &key);
1479        if (bio_detain(tc->pool->prison, &key, bio, &cell))
1480                return;
1481
1482        r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1483        switch (r) {
1484        case 0:
1485                /*
1486                 * We can release this cell now.  This thread is the only
1487                 * one that puts bios into a cell, and we know there were
1488                 * no preceding bios.
1489                 */
1490                /*
1491                 * TODO: this will probably have to change when discard goes
1492                 * back in.
1493                 */
1494                cell_release_singleton(cell, bio);
1495
1496                if (lookup_result.shared)
1497                        process_shared_bio(tc, bio, block, &lookup_result);
1498                else
1499                        remap_and_issue(tc, bio, lookup_result.block);
1500                break;
1501
1502        case -ENODATA:
1503                if (bio_data_dir(bio) == READ && tc->origin_dev) {
1504                        cell_release_singleton(cell, bio);
1505                        remap_to_origin_and_issue(tc, bio);
1506                } else
1507                        provision_block(tc, bio, block, cell);
1508                break;
1509
1510        default:
1511                DMERR("dm_thin_find_block() failed, error = %d", r);
1512                cell_release_singleton(cell, bio);
1513                bio_io_error(bio);
1514                break;
1515        }
1516}
1517
1518static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1519{
1520        int r;
1521        int rw = bio_data_dir(bio);
1522        dm_block_t block = get_bio_block(tc, bio);
1523        struct dm_thin_lookup_result lookup_result;
1524
1525        r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1526        switch (r) {
1527        case 0:
1528                if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
1529                        bio_io_error(bio);
1530                else
1531                        remap_and_issue(tc, bio, lookup_result.block);
1532                break;
1533
1534        case -ENODATA:
1535                if (rw != READ) {
1536                        bio_io_error(bio);
1537                        break;
1538                }
1539
1540                if (tc->origin_dev) {
1541                        remap_to_origin_and_issue(tc, bio);
1542                        break;
1543                }
1544
1545                zero_fill_bio(bio);
1546                bio_endio(bio, 0);
1547                break;
1548
1549        default:
1550                DMERR("dm_thin_find_block() failed, error = %d", r);
1551                bio_io_error(bio);
1552                break;
1553        }
1554}
1555
1556static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1557{
1558        bio_io_error(bio);
1559}
1560
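/*
 * Returns non-zero if COMMIT_PERIOD has elapsed since the last commit;
 * the first clause also forces a commit if jiffies has wrapped.
 */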
1561static int need_commit_due_to_time(struct pool *pool)
1562{
1563        return jiffies < pool->last_commit_jiffies ||
1564               jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1565}
1566
1567static void process_deferred_bios(struct pool *pool)
1568{
1569        unsigned long flags;
1570        struct bio *bio;
1571        struct bio_list bios;
1572
1573        bio_list_init(&bios);
1574
1575        spin_lock_irqsave(&pool->lock, flags);
1576        bio_list_merge(&bios, &pool->deferred_bios);
1577        bio_list_init(&pool->deferred_bios);
1578        spin_unlock_irqrestore(&pool->lock, flags);
1579
1580        while ((bio = bio_list_pop(&bios))) {
1581                struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
1582                struct thin_c *tc = h->tc;
1583
1584                /*
1585                 * If we've got no free new_mapping structs, and processing
1586                 * this bio might require one, we pause until there are some
1587                 * prepared mappings to process.
1588                 */
1589                if (ensure_next_mapping(pool)) {
1590                        spin_lock_irqsave(&pool->lock, flags);
1591                        bio_list_merge(&pool->deferred_bios, &bios);
1592                        spin_unlock_irqrestore(&pool->lock, flags);
1593
1594                        break;
1595                }
1596
1597                if (bio->bi_rw & REQ_DISCARD)
1598                        pool->process_discard(tc, bio);
1599                else
1600                        pool->process_bio(tc, bio);
1601        }
1602
1603        /*
1604         * If there are any deferred flush bios, we must commit
1605         * the metadata before issuing them.
1606         */
1607        bio_list_init(&bios);
1608        spin_lock_irqsave(&pool->lock, flags);
1609        bio_list_merge(&bios, &pool->deferred_flush_bios);
1610        bio_list_init(&pool->deferred_flush_bios);
1611        spin_unlock_irqrestore(&pool->lock, flags);
1612
1613        if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
1614                return;
1615
1616        if (commit_or_fallback(pool)) {
1617                while ((bio = bio_list_pop(&bios)))
1618                        bio_io_error(bio);
1619                return;
1620        }
1621        pool->last_commit_jiffies = jiffies;
1622
1623        while ((bio = bio_list_pop(&bios)))
1624                generic_make_request(bio);
1625}
1626
1627static void do_worker(struct work_struct *ws)
1628{
1629        struct pool *pool = container_of(ws, struct pool, worker);
1630
1631        process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1632        process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
1633        process_deferred_bios(pool);
1634}
1635
1636/*
1637 * We want to commit periodically so that not too much
1638 * unwritten data builds up.
1639 */
1640static void do_waker(struct work_struct *ws)
1641{
1642        struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1643        wake_worker(pool);
1644        queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1645}
1646
1647/*----------------------------------------------------------------*/
1648
1649static enum pool_mode get_pool_mode(struct pool *pool)
1650{
1651        return pool->pf.mode;
1652}
1653
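    /*
     * Pool modes are ordered by increasing severity: PM_WRITE (normal
     * operation), PM_READ_ONLY (metadata is read-only, no new blocks are
     * provisioned) and PM_FAIL (all io is errored).  bind_control_target()
     * relies on this ordering so a degraded pool is never upgraded.
     */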
1654static void set_pool_mode(struct pool *pool, enum pool_mode mode)
1655{
1656        int r;
1657
1658        pool->pf.mode = mode;
1659
1660        switch (mode) {
1661        case PM_FAIL:
1662                DMERR("switching pool to failure mode");
1663                pool->process_bio = process_bio_fail;
1664                pool->process_discard = process_bio_fail;
1665                pool->process_prepared_mapping = process_prepared_mapping_fail;
1666                pool->process_prepared_discard = process_prepared_discard_fail;
1667                break;
1668
1669        case PM_READ_ONLY:
1670                DMERR("switching pool to read-only mode");
1671                r = dm_pool_abort_metadata(pool->pmd);
1672                if (r) {
1673                        DMERR("aborting transaction failed");
1674                        set_pool_mode(pool, PM_FAIL);
1675                } else {
1676                        dm_pool_metadata_read_only(pool->pmd);
1677                        pool->process_bio = process_bio_read_only;
1678                        pool->process_discard = process_discard;
1679                        pool->process_prepared_mapping = process_prepared_mapping_fail;
1680                        pool->process_prepared_discard = process_prepared_discard_passdown;
1681                }
1682                break;
1683
1684        case PM_WRITE:
1685                pool->process_bio = process_bio;
1686                pool->process_discard = process_discard;
1687                pool->process_prepared_mapping = process_prepared_mapping;
1688                pool->process_prepared_discard = process_prepared_discard;
1689                break;
1690        }
1691}
1692
1693/*----------------------------------------------------------------*/
1694
1695/*
1696 * Mapping functions.
1697 */
1698
1699/*
1700 * Called only while mapping a thin bio to hand it over to the workqueue.
1701 */
1702static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1703{
1704        unsigned long flags;
1705        struct pool *pool = tc->pool;
1706
1707        spin_lock_irqsave(&pool->lock, flags);
1708        bio_list_add(&pool->deferred_bios, bio);
1709        spin_unlock_irqrestore(&pool->lock, flags);
1710
1711        wake_worker(pool);
1712}
1713
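    /*
     * Allocate the per-bio hook used to track this bio's deferred set
     * entries; it is released in thin_endio().
     */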
1714static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
1715{
1716        struct pool *pool = tc->pool;
1717        struct dm_thin_endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1718
1719        h->tc = tc;
1720        h->shared_read_entry = NULL;
1721        h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
1722        h->overwrite_mapping = NULL;
1723
1724        return h;
1725}
1726
1727/*
1728 * Non-blocking function called from the thin target's map function.
1729 */
1730static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1731                        union map_info *map_context)
1732{
1733        int r;
1734        struct thin_c *tc = ti->private;
1735        dm_block_t block = get_bio_block(tc, bio);
1736        struct dm_thin_device *td = tc->td;
1737        struct dm_thin_lookup_result result;
1738
1739        map_context->ptr = thin_hook_bio(tc, bio);
1740
1741        if (get_pool_mode(tc->pool) == PM_FAIL) {
1742                bio_io_error(bio);
1743                return DM_MAPIO_SUBMITTED;
1744        }
1745
1746        if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1747                thin_defer_bio(tc, bio);
1748                return DM_MAPIO_SUBMITTED;
1749        }
1750
1751        r = dm_thin_find_block(td, block, 0, &result);
1752
1753        /*
1754         * Note that we defer readahead too.
1755         */
1756        switch (r) {
1757        case 0:
1758                if (unlikely(result.shared)) {
1759                        /*
1760                         * We have a race condition here between the
1761                         * result.shared value returned by the lookup and
1762                         * snapshot creation, which may cause new
1763                         * sharing.
1764                         *
1765                         * To avoid this always quiesce the origin before
1766                         * taking the snap.  You want to do this anyway to
1767                         * ensure a consistent application view
1768                         * (i.e. lockfs).
1769                         *
1770                         * More distant ancestors are irrelevant. The
1771                         * shared flag will be set in their case.
1772                         */
1773                        thin_defer_bio(tc, bio);
1774                        r = DM_MAPIO_SUBMITTED;
1775                } else {
1776                        remap(tc, bio, result.block);
1777                        r = DM_MAPIO_REMAPPED;
1778                }
1779                break;
1780
1781        case -ENODATA:
1782                if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
1783                        /*
1784                         * This block isn't provisioned, and we have no way
1785                         * of doing so.  Just error it.
1786                         */
1787                        bio_io_error(bio);
1788                        r = DM_MAPIO_SUBMITTED;
1789                        break;
1790                }
1791                /* fall through */
1792
1793        case -EWOULDBLOCK:
1794                /*
1795                 * In future, the failed dm_thin_find_block above could
1796                 * provide the hint to load the metadata into cache.
1797                 */
1798                thin_defer_bio(tc, bio);
1799                r = DM_MAPIO_SUBMITTED;
1800                break;
1801
1802        default:
1803                /*
1804                 * Must always call bio_io_error on failure.
1805                 * dm_thin_find_block can fail with -EINVAL if the
1806                 * pool is switched to fail-io mode.
1807                 */
1808                bio_io_error(bio);
1809                r = DM_MAPIO_SUBMITTED;
1810                break;
1811        }
1812
1813        return r;
1814}
1815
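    /*
     * The pool reports itself congested while bios are parked on the
     * retry_on_resume_list; otherwise it defers to the data device's
     * backing_dev_info.
     */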
1816static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1817{
1818        int r;
1819        unsigned long flags;
1820        struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1821
1822        spin_lock_irqsave(&pt->pool->lock, flags);
1823        r = !bio_list_empty(&pt->pool->retry_on_resume_list);
1824        spin_unlock_irqrestore(&pt->pool->lock, flags);
1825
1826        if (!r) {
1827                struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1828                r = bdi_congested(&q->backing_dev_info, bdi_bits);
1829        }
1830
1831        return r;
1832}
1833
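    /*
     * Moves the retry_on_resume_list back onto deferred_bios.  Called
     * with pool->lock held (see pool_resume()).
     */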
1834static void __requeue_bios(struct pool *pool)
1835{
1836        bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
1837        bio_list_init(&pool->retry_on_resume_list);
1838}
1839
1840/*----------------------------------------------------------------
1841 * Binding of control targets to a pool object
1842 *--------------------------------------------------------------*/
1843static bool data_dev_supports_discard(struct pool_c *pt)
1844{
1845        struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1846
1847        return q && blk_queue_discard(q);
1848}
1849
1850/*
1851 * If discard_passdown was enabled, verify that the data device
1852 * supports discards.  Disable discard_passdown if not.
1853 */
1854static void disable_passdown_if_not_supported(struct pool_c *pt)
1855{
1856        struct pool *pool = pt->pool;
1857        struct block_device *data_bdev = pt->data_dev->bdev;
1858        struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
1859        sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
1860        const char *reason = NULL;
1861        char buf[BDEVNAME_SIZE];
1862
1863        if (!pt->adjusted_pf.discard_passdown)
1864                return;
1865
1866        if (!data_dev_supports_discard(pt))
1867                reason = "discard unsupported";
1868
1869        else if (data_limits->max_discard_sectors < pool->sectors_per_block)
1870                reason = "max discard sectors smaller than a block";
1871
1872        else if (data_limits->discard_granularity > block_size)
1873                reason = "discard granularity larger than a block";
1874
1875        else if (block_size & (data_limits->discard_granularity - 1))
1876                reason = "discard granularity not a factor of block size";
1877
1878        if (reason) {
1879                DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
1880                pt->adjusted_pf.discard_passdown = false;
1881        }
1882}
1883
1884static int bind_control_target(struct pool *pool, struct dm_target *ti)
1885{
1886        struct pool_c *pt = ti->private;
1887
1888        /*
1889         * We want to make sure that degraded pools are never upgraded.
1890         */
1891        enum pool_mode old_mode = pool->pf.mode;
1892        enum pool_mode new_mode = pt->adjusted_pf.mode;
1893
1894        if (old_mode > new_mode)
1895                new_mode = old_mode;
1896
1897        pool->ti = ti;
1898        pool->low_water_blocks = pt->low_water_blocks;
1899        pool->pf = pt->adjusted_pf;
1900
1901        set_pool_mode(pool, new_mode);
1902
1903        return 0;
1904}
1905
1906static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1907{
1908        if (pool->ti == ti)
1909                pool->ti = NULL;
1910}
1911
1912/*----------------------------------------------------------------
1913 * Pool creation
1914 *--------------------------------------------------------------*/
1915/* Initialize pool features. */
1916static void pool_features_init(struct pool_features *pf)
1917{
1918        pf->mode = PM_WRITE;
1919        pf->zero_new_blocks = true;
1920        pf->discard_enabled = true;
1921        pf->discard_passdown = true;
1922}
1923
1924static void __pool_destroy(struct pool *pool)
1925{
1926        __pool_table_remove(pool);
1927
1928        if (dm_pool_metadata_close(pool->pmd) < 0)
1929                DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1930
1931        prison_destroy(pool->prison);
1932        dm_kcopyd_client_destroy(pool->copier);
1933
1934        if (pool->wq)
1935                destroy_workqueue(pool->wq);
1936
1937        if (pool->next_mapping)
1938                mempool_free(pool->next_mapping, pool->mapping_pool);
1939        mempool_destroy(pool->mapping_pool);
1940        mempool_destroy(pool->endio_hook_pool);
1941        kfree(pool);
1942}
1943
1944static struct kmem_cache *_new_mapping_cache;
1945static struct kmem_cache *_endio_hook_cache;
1946
1947static struct pool *pool_create(struct mapped_device *pool_md,
1948                                struct block_device *metadata_dev,
1949                                unsigned long block_size,
1950                                int read_only, char **error)
1951{
1952        int r;
1953        void *err_p;
1954        struct pool *pool;
1955        struct dm_pool_metadata *pmd;
1956        bool format_device = read_only ? false : true;
1957
1958        pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
1959        if (IS_ERR(pmd)) {
1960                *error = "Error creating metadata object";
1961                return (struct pool *)pmd;
1962        }
1963
1964        pool = kmalloc(sizeof(*pool), GFP_KERNEL);
1965        if (!pool) {
1966                *error = "Error allocating memory for pool";
1967                err_p = ERR_PTR(-ENOMEM);
1968                goto bad_pool;
1969        }
1970
1971        pool->pmd = pmd;
1972        pool->sectors_per_block = block_size;
1973        if (block_size & (block_size - 1))
1974                pool->sectors_per_block_shift = -1;
1975        else
1976                pool->sectors_per_block_shift = __ffs(block_size);
1977        pool->low_water_blocks = 0;
1978        pool_features_init(&pool->pf);
1979        pool->prison = prison_create(PRISON_CELLS);
1980        if (!pool->prison) {
1981                *error = "Error creating pool's bio prison";
1982                err_p = ERR_PTR(-ENOMEM);
1983                goto bad_prison;
1984        }
1985
1986        pool->copier = dm_kcopyd_client_create();
1987        if (IS_ERR(pool->copier)) {
1988                r = PTR_ERR(pool->copier);
1989                *error = "Error creating pool's kcopyd client";
1990                err_p = ERR_PTR(r);
1991                goto bad_kcopyd_client;
1992        }
1993
1994        /*
1995         * Create singlethreaded workqueue that will service all devices
1996         * that use this metadata.
1997         */
1998        pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1999        if (!pool->wq) {
2000                *error = "Error creating pool's workqueue";
2001                err_p = ERR_PTR(-ENOMEM);
2002                goto bad_wq;
2003        }
2004
2005        INIT_WORK(&pool->worker, do_worker);
2006        INIT_DELAYED_WORK(&pool->waker, do_waker);
2007        spin_lock_init(&pool->lock);
2008        bio_list_init(&pool->deferred_bios);
2009        bio_list_init(&pool->deferred_flush_bios);
2010        INIT_LIST_HEAD(&pool->prepared_mappings);
2011        INIT_LIST_HEAD(&pool->prepared_discards);
2012        pool->low_water_triggered = 0;
2013        pool->no_free_space = 0;
2014        bio_list_init(&pool->retry_on_resume_list);
2015        ds_init(&pool->shared_read_ds);
2016        ds_init(&pool->all_io_ds);
2017
2018        pool->next_mapping = NULL;
2019        pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
2020                                                      _new_mapping_cache);
2021        if (!pool->mapping_pool) {
2022                *error = "Error creating pool's mapping mempool";
2023                err_p = ERR_PTR(-ENOMEM);
2024                goto bad_mapping_pool;
2025        }
2026
2027        pool->endio_hook_pool = mempool_create_slab_pool(ENDIO_HOOK_POOL_SIZE,
2028                                                         _endio_hook_cache);
2029        if (!pool->endio_hook_pool) {
2030                *error = "Error creating pool's endio_hook mempool";
2031                err_p = ERR_PTR(-ENOMEM);
2032                goto bad_endio_hook_pool;
2033        }
2034        pool->ref_count = 1;
2035        pool->last_commit_jiffies = jiffies;
2036        pool->pool_md = pool_md;
2037        pool->md_dev = metadata_dev;
2038        __pool_table_insert(pool);
2039
2040        return pool;
2041
2042bad_endio_hook_pool:
2043        mempool_destroy(pool->mapping_pool);
2044bad_mapping_pool:
2045        destroy_workqueue(pool->wq);
2046bad_wq:
2047        dm_kcopyd_client_destroy(pool->copier);
2048bad_kcopyd_client:
2049        prison_destroy(pool->prison);
2050bad_prison:
2051        kfree(pool);
2052bad_pool:
2053        if (dm_pool_metadata_close(pmd))
2054                DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2055
2056        return err_p;
2057}
2058
2059static void __pool_inc(struct pool *pool)
2060{
2061        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2062        pool->ref_count++;
2063}
2064
2065static void __pool_dec(struct pool *pool)
2066{
2067        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2068        BUG_ON(!pool->ref_count);
2069        if (!--pool->ref_count)
2070                __pool_destroy(pool);
2071}
2072
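    /*
     * Looks the pool up by metadata device first, then by pool mapped
     * device, creating a new pool if neither is found.  Called with
     * dm_thin_pool_table.mutex held.
     */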
2073static struct pool *__pool_find(struct mapped_device *pool_md,
2074                                struct block_device *metadata_dev,
2075                                unsigned long block_size, int read_only,
2076                                char **error, int *created)
2077{
2078        struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
2079
2080        if (pool) {
2081                if (pool->pool_md != pool_md) {
2082                        *error = "metadata device already in use by a pool";
2083                        return ERR_PTR(-EBUSY);
2084                }
2085                __pool_inc(pool);
2086
2087        } else {
2088                pool = __pool_table_lookup(pool_md);
2089                if (pool) {
2090                        if (pool->md_dev != metadata_dev) {
2091                                *error = "different pool cannot replace a pool";
2092                                return ERR_PTR(-EINVAL);
2093                        }
2094                        __pool_inc(pool);
2095
2096                } else {
2097                        pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
2098                        *created = 1;
2099                }
2100        }
2101
2102        return pool;
2103}
2104
2105/*----------------------------------------------------------------
2106 * Pool target methods
2107 *--------------------------------------------------------------*/
2108static void pool_dtr(struct dm_target *ti)
2109{
2110        struct pool_c *pt = ti->private;
2111
2112        mutex_lock(&dm_thin_pool_table.mutex);
2113
2114        unbind_control_target(pt->pool, ti);
2115        __pool_dec(pt->pool);
2116        dm_put_device(ti, pt->metadata_dev);
2117        dm_put_device(ti, pt->data_dev);
2118        kfree(pt);
2119
2120        mutex_unlock(&dm_thin_pool_table.mutex);
2121}
2122
2123static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
2124                               struct dm_target *ti)
2125{
2126        int r;
2127        unsigned argc;
2128        const char *arg_name;
2129
2130        static struct dm_arg _args[] = {
2131                {0, 4, "Invalid number of pool feature arguments"},
2132        };
2133
2134        /*
2135         * No feature arguments supplied.
2136         */
2137        if (!as->argc)
2138                return 0;
2139
2140        r = dm_read_arg_group(_args, as, &argc, &ti->error);
2141        if (r)
2142                return -EINVAL;
2143
2144        while (argc && !r) {
2145                arg_name = dm_shift_arg(as);
2146                argc--;
2147
2148                if (!strcasecmp(arg_name, "skip_block_zeroing"))
2149                        pf->zero_new_blocks = false;
2150
2151                else if (!strcasecmp(arg_name, "ignore_discard"))
2152                        pf->discard_enabled = false;
2153
2154                else if (!strcasecmp(arg_name, "no_discard_passdown"))
2155                        pf->discard_passdown = false;
2156
2157                else if (!strcasecmp(arg_name, "read_only"))
2158                        pf->mode = PM_READ_ONLY;
2159
2160                else {
2161                        ti->error = "Unrecognised pool feature requested";
2162                        r = -EINVAL;
2163                        break;
2164                }
2165        }
2166
2167        return r;
2168}
2169
2170/*
2171 * thin-pool <metadata dev> <data dev>
2172 *           <data block size (sectors)>
2173 *           <low water mark (blocks)>
2174 *           [<#feature args> [<arg>]*]
2175 *
2176 * Optional feature arguments are:
2177 *           skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
2178 *           ignore_discard: disable discard
2179 *           no_discard_passdown: don't pass discards down to the data device
     *           read_only: don't allow any changes to be made to the pool metadata
2180 */
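    /*
     * For example (an illustrative table line only; the device names and
     * sizes are made up), a 10GiB pool using 64KiB (128 sector) blocks and
     * a low water mark of 1024 blocks could be loaded with:
     *
     *    0 20971520 thin-pool /dev/mapper/meta /dev/mapper/data 128 1024 1 skip_block_zeroing
     */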
2181static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2182{
2183        int r, pool_created = 0;
2184        struct pool_c *pt;
2185        struct pool *pool;
2186        struct pool_features pf;
2187        struct dm_arg_set as;
2188        struct dm_dev *data_dev;
2189        unsigned long block_size;
2190        dm_block_t low_water_blocks;
2191        struct dm_dev *metadata_dev;
2192        sector_t metadata_dev_size;
2193        char b[BDEVNAME_SIZE];
2194
2195        /*
2196         * FIXME Remove validation from scope of lock.
2197         */
2198        mutex_lock(&dm_thin_pool_table.mutex);
2199
2200        if (argc < 4) {
2201                ti->error = "Invalid argument count";
2202                r = -EINVAL;
2203                goto out_unlock;
2204        }
2205        as.argc = argc;
2206        as.argv = argv;
2207
2208        r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
2209        if (r) {
2210                ti->error = "Error opening metadata block device";
2211                goto out_unlock;
2212        }
2213
2214        metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
2215        if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
2216                DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2217                       bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
2218
2219        r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
2220        if (r) {
2221                ti->error = "Error getting data device";
2222                goto out_metadata;
2223        }
2224
2225        if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
2226            block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2227            block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2228            block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2229                ti->error = "Invalid block size";
2230                r = -EINVAL;
2231                goto out;
2232        }
2233
2234        if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
2235                ti->error = "Invalid low water mark";
2236                r = -EINVAL;
2237                goto out;
2238        }
2239
2240        /*
2241         * Set default pool features.
2242         */
2243        pool_features_init(&pf);
2244
2245        dm_consume_args(&as, 4);
2246        r = parse_pool_features(&as, &pf, ti);
2247        if (r)
2248                goto out;
2249
2250        pt = kzalloc(sizeof(*pt), GFP_KERNEL);
2251        if (!pt) {
2252                r = -ENOMEM;
2253                goto out;
2254        }
2255
2256        pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
2257                           block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
2258        if (IS_ERR(pool)) {
2259                r = PTR_ERR(pool);
2260                goto out_free_pt;
2261        }
2262
2263        /*
2264         * 'pool_created' reflects whether this is the first table load.
2265         * Top level discard support is not allowed to be changed after
2266         * initial load.  This would require a pool reload to trigger thin
2267         * device changes.
2268         */
2269        if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
2270                ti->error = "Discard support cannot be disabled once enabled";
2271                r = -EINVAL;
2272                goto out_flags_changed;
2273        }
2274
2275        /*
2276         * The block layer requires discard_granularity to be a power of 2.
2277         */
2278        if (pf.discard_enabled && !is_power_of_2(block_size)) {
2279                ti->error = "Discard support must be disabled when the block size is not a power of 2";
2280                r = -EINVAL;
2281                goto out_flags_changed;
2282        }
2283
2284        pt->pool = pool;
2285        pt->ti = ti;
2286        pt->metadata_dev = metadata_dev;
2287        pt->data_dev = data_dev;
2288        pt->low_water_blocks = low_water_blocks;
2289        pt->adjusted_pf = pt->requested_pf = pf;
2290        ti->num_flush_requests = 1;
2291
2292        /*
2293         * Only need to enable discards if the pool should pass
2294         * them down to the data device.  The thin device's discard
2295         * processing will cause mappings to be removed from the btree.
2296         */
2297        if (pf.discard_enabled && pf.discard_passdown) {
2298                ti->num_discard_requests = 1;
2299
2300                /*
2301                 * Setting 'discards_supported' circumvents the normal
2302                 * stacking of discard limits (this keeps the pool and
2303                 * thin devices' discard limits consistent).
2304                 */
2305                ti->discards_supported = true;
2306                ti->discard_zeroes_data_unsupported = true;
2307        }
2308        ti->private = pt;
2309
2310        pt->callbacks.congested_fn = pool_is_congested;
2311        dm_table_add_target_callbacks(ti->table, &pt->callbacks);
2312
2313        mutex_unlock(&dm_thin_pool_table.mutex);
2314
2315        return 0;
2316
2317out_flags_changed:
2318        __pool_dec(pool);
2319out_free_pt:
2320        kfree(pt);
2321out:
2322        dm_put_device(ti, data_dev);
2323out_metadata:
2324        dm_put_device(ti, metadata_dev);
2325out_unlock:
2326        mutex_unlock(&dm_thin_pool_table.mutex);
2327
2328        return r;
2329}
2330
2331static int pool_map(struct dm_target *ti, struct bio *bio,
2332                    union map_info *map_context)
2333{
2334        int r;
2335        struct pool_c *pt = ti->private;
2336        struct pool *pool = pt->pool;
2337        unsigned long flags;
2338
2339        /*
2340         * As this is a singleton target, ti->begin is always zero.
2341         */
2342        spin_lock_irqsave(&pool->lock, flags);
2343        bio->bi_bdev = pt->data_dev->bdev;
2344        r = DM_MAPIO_REMAPPED;
2345        spin_unlock_irqrestore(&pool->lock, flags);
2346
2347        return r;
2348}
2349
2350/*
2351 * Retrieves the number of blocks of the data device from
2352 * the superblock and compares it to the actual device size,
2353 * thus resizing the data device in case it has grown.
2354 *
2355 * This copes both with a preallocated data device being opened in the
2356 * ctr and then followed by a resume,
2357 * -and-
2358 * with the resume method being called on its own after userspace has
2359 * grown the data device in reaction to a table event.
2360 */
2361static int pool_preresume(struct dm_target *ti)
2362{
2363        int r;
2364        struct pool_c *pt = ti->private;
2365        struct pool *pool = pt->pool;
2366        sector_t data_size = ti->len;
2367        dm_block_t sb_data_size;
2368
2369        /*
2370         * Take control of the pool object.
2371         */
2372        r = bind_control_target(pool, ti);
2373        if (r)
2374                return r;
2375
2376        (void) sector_div(data_size, pool->sectors_per_block);
2377
2378        r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2379        if (r) {
2380                DMERR("failed to retrieve data device size");
2381                return r;
2382        }
2383
2384        if (data_size < sb_data_size) {
2385                DMERR("pool target too small, is %llu blocks (expected %llu)",
2386                      (unsigned long long)data_size, (unsigned long long)sb_data_size);
2387                return -EINVAL;
2388
2389        } else if (data_size > sb_data_size) {
2390                r = dm_pool_resize_data_dev(pool->pmd, data_size);
2391                if (r) {
2392                        DMERR("failed to resize data device");
2393                        /* FIXME Stricter than necessary: Rollback transaction instead here */
2394                        set_pool_mode(pool, PM_READ_ONLY);
2395                        return r;
2396                }
2397
2398                (void) commit_or_fallback(pool);
2399        }
2400
2401        return 0;
2402}
2403
2404static void pool_resume(struct dm_target *ti)
2405{
2406        struct pool_c *pt = ti->private;
2407        struct pool *pool = pt->pool;
2408        unsigned long flags;
2409
2410        spin_lock_irqsave(&pool->lock, flags);
2411        pool->low_water_triggered = 0;
2412        pool->no_free_space = 0;
2413        __requeue_bios(pool);
2414        spin_unlock_irqrestore(&pool->lock, flags);
2415
2416        do_waker(&pool->waker.work);
2417}
2418
2419static void pool_postsuspend(struct dm_target *ti)
2420{
2421        struct pool_c *pt = ti->private;
2422        struct pool *pool = pt->pool;
2423
2424        cancel_delayed_work(&pool->waker);
2425        flush_workqueue(pool->wq);
2426        (void) commit_or_fallback(pool);
2427}
2428
2429static int check_arg_count(unsigned argc, unsigned args_required)
2430{
2431        if (argc != args_required) {
2432                DMWARN("Message received with %u arguments instead of %u.",
2433                       argc, args_required);
2434                return -EINVAL;
2435        }
2436
2437        return 0;
2438}
2439
2440static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
2441{
2442        if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
2443            *dev_id <= MAX_DEV_ID)
2444                return 0;
2445
2446        if (warning)
2447                DMWARN("Message received with invalid device id: %s", arg);
2448
2449        return -EINVAL;
2450}
2451
2452static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
2453{
2454        dm_thin_id dev_id;
2455        int r;
2456
2457        r = check_arg_count(argc, 2);
2458        if (r)
2459                return r;
2460
2461        r = read_dev_id(argv[1], &dev_id, 1);
2462        if (r)
2463                return r;
2464
2465        r = dm_pool_create_thin(pool->pmd, dev_id);
2466        if (r) {
2467                DMWARN("Creation of new thinly-provisioned device with id %s failed.",
2468                       argv[1]);
2469                return r;
2470        }
2471
2472        return 0;
2473}
2474
2475static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2476{
2477        dm_thin_id dev_id;
2478        dm_thin_id origin_dev_id;
2479        int r;
2480
2481        r = check_arg_count(argc, 3);
2482        if (r)
2483                return r;
2484
2485        r = read_dev_id(argv[1], &dev_id, 1);
2486        if (r)
2487                return r;
2488
2489        r = read_dev_id(argv[2], &origin_dev_id, 1);
2490        if (r)
2491                return r;
2492
2493        r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
2494        if (r) {
2495                DMWARN("Creation of new snapshot %s of device %s failed.",
2496                       argv[1], argv[2]);
2497                return r;
2498        }
2499
2500        return 0;
2501}
2502
2503static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
2504{
2505        dm_thin_id dev_id;
2506        int r;
2507
2508        r = check_arg_count(argc, 2);
2509        if (r)
2510                return r;
2511
2512        r = read_dev_id(argv[1], &dev_id, 1);
2513        if (r)
2514                return r;
2515
2516        r = dm_pool_delete_thin_device(pool->pmd, dev_id);
2517        if (r)
2518                DMWARN("Deletion of thin device %s failed.", argv[1]);
2519
2520        return r;
2521}
2522
2523static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
2524{
2525        dm_thin_id old_id, new_id;
2526        int r;
2527
2528        r = check_arg_count(argc, 3);
2529        if (r)
2530                return r;
2531
2532        if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2533                DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2534                return -EINVAL;
2535        }
2536
2537        if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2538                DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2539                return -EINVAL;
2540        }
2541
2542        r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2543        if (r) {
2544                DMWARN("Failed to change transaction id from %s to %s.",
2545                       argv[1], argv[2]);
2546                return r;
2547        }
2548
2549        return 0;
2550}
2551
2552static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2553{
2554        int r;
2555
2556        r = check_arg_count(argc, 1);
2557        if (r)
2558                return r;
2559
2560        (void) commit_or_fallback(pool);
2561
2562        r = dm_pool_reserve_metadata_snap(pool->pmd);
2563        if (r)
2564                DMWARN("reserve_metadata_snap message failed.");
2565
2566        return r;
2567}
2568
2569static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2570{
2571        int r;
2572
2573        r = check_arg_count(argc, 1);
2574        if (r)
2575                return r;
2576
2577        r = dm_pool_release_metadata_snap(pool->pmd);
2578        if (r)
2579                DMWARN("release_metadata_snap message failed.");
2580
2581        return r;
2582}
2583
2584/*
2585 * Messages supported:
2586 *   create_thin        <dev_id>
2587 *   create_snap        <dev_id> <origin_id>
2588 *   delete             <dev_id>
2590 *   set_transaction_id <current_trans_id> <new_trans_id>
2591 *   reserve_metadata_snap
2592 *   release_metadata_snap
2593 */
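    /*
     * For example (illustrative only, assuming the pool is active as
     * /dev/mapper/pool), a thin device with id 0 and a snapshot of it
     * with id 1 could be created with:
     *
     *    dmsetup message /dev/mapper/pool 0 "create_thin 0"
     *    dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
     */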
2594static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2595{
2596        int r = -EINVAL;
2597        struct pool_c *pt = ti->private;
2598        struct pool *pool = pt->pool;
2599
2600        if (!strcasecmp(argv[0], "create_thin"))
2601                r = process_create_thin_mesg(argc, argv, pool);
2602
2603        else if (!strcasecmp(argv[0], "create_snap"))
2604                r = process_create_snap_mesg(argc, argv, pool);
2605
2606        else if (!strcasecmp(argv[0], "delete"))
2607                r = process_delete_mesg(argc, argv, pool);
2608
2609        else if (!strcasecmp(argv[0], "set_transaction_id"))
2610                r = process_set_transaction_id_mesg(argc, argv, pool);
2611
2612        else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
2613                r = process_reserve_metadata_snap_mesg(argc, argv, pool);
2614
2615        else if (!strcasecmp(argv[0], "release_metadata_snap"))
2616                r = process_release_metadata_snap_mesg(argc, argv, pool);
2617
2618        else
2619                DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2620
2621        if (!r)
2622                (void) commit_or_fallback(pool);
2623
2624        return r;
2625}
2626
2627static void emit_flags(struct pool_features *pf, char *result,
2628                       unsigned sz, unsigned maxlen)
2629{
2630        unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
2631                !pf->discard_passdown + (pf->mode == PM_READ_ONLY);
2632        DMEMIT("%u ", count);
2633
2634        if (!pf->zero_new_blocks)
2635                DMEMIT("skip_block_zeroing ");
2636
2637        if (!pf->discard_enabled)
2638                DMEMIT("ignore_discard ");
2639
2640        if (!pf->discard_passdown)
2641                DMEMIT("no_discard_passdown ");
2642
2643        if (pf->mode == PM_READ_ONLY)
2644                DMEMIT("read_only ");
2645}
2646
2647/*
2648 * Status line is:
2649 *    <transaction id> <used metadata blocks>/<total metadata blocks>
2650 *    <used data blocks>/<total data blocks> <held metadata root>
2651 */
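    /*
     * The mode (ro|rw) and the discard passdown setting are appended.
     * For example (illustrative values only), a writable pool with
     * discard passdown enabled and no held metadata root might report:
     *
     *    1 137/4161600 10240/2621440 - rw discard_passdown
     */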
2652static int pool_status(struct dm_target *ti, status_type_t type,
2653                       unsigned status_flags, char *result, unsigned maxlen)
2654{
2655        int r;
2656        unsigned sz = 0;
2657        uint64_t transaction_id;
2658        dm_block_t nr_free_blocks_data;
2659        dm_block_t nr_free_blocks_metadata;
2660        dm_block_t nr_blocks_data;
2661        dm_block_t nr_blocks_metadata;
2662        dm_block_t held_root;
2663        char buf[BDEVNAME_SIZE];
2664        char buf2[BDEVNAME_SIZE];
2665        struct pool_c *pt = ti->private;
2666        struct pool *pool = pt->pool;
2667
2668        switch (type) {
2669        case STATUSTYPE_INFO:
2670                if (get_pool_mode(pool) == PM_FAIL) {
2671                        DMEMIT("Fail");
2672                        break;
2673                }
2674
2675                /* Commit to ensure statistics aren't out-of-date */
2676                if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
2677                        (void) commit_or_fallback(pool);
2678
2679                r = dm_pool_get_metadata_transaction_id(pool->pmd,
2680                                                        &transaction_id);
2681                if (r)
2682                        return r;
2683
2684                r = dm_pool_get_free_metadata_block_count(pool->pmd,
2685                                                          &nr_free_blocks_metadata);
2686                if (r)
2687                        return r;
2688
2689                r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2690                if (r)
2691                        return r;
2692
2693                r = dm_pool_get_free_block_count(pool->pmd,
2694                                                 &nr_free_blocks_data);
2695                if (r)
2696                        return r;
2697
2698                r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
2699                if (r)
2700                        return r;
2701
2702                r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
2703                if (r)
2704                        return r;
2705
2706                DMEMIT("%llu %llu/%llu %llu/%llu ",
2707                       (unsigned long long)transaction_id,
2708                       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2709                       (unsigned long long)nr_blocks_metadata,
2710                       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
2711                       (unsigned long long)nr_blocks_data);
2712
2713                if (held_root)
2714                        DMEMIT("%llu ", (unsigned long long)held_root);
2715                else
2716                        DMEMIT("- ");
2717
2718                if (pool->pf.mode == PM_READ_ONLY)
2719                        DMEMIT("ro ");
2720                else
2721                        DMEMIT("rw ");
2722
2723                if (pool->pf.discard_enabled && pool->pf.discard_passdown)
2724                        DMEMIT("discard_passdown");
2725                else
2726                        DMEMIT("no_discard_passdown");
2727
2728                break;
2729
2730        case STATUSTYPE_TABLE:
2731                DMEMIT("%s %s %lu %llu ",
2732                       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
2733                       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
2734                       (unsigned long)pool->sectors_per_block,
2735                       (unsigned long long)pt->low_water_blocks);
2736                emit_flags(&pt->requested_pf, result, sz, maxlen);
2737                break;
2738        }
2739
2740        return 0;
2741}
2742
2743static int pool_iterate_devices(struct dm_target *ti,
2744                                iterate_devices_callout_fn fn, void *data)
2745{
2746        struct pool_c *pt = ti->private;
2747
2748        return fn(ti, pt->data_dev, 0, ti->len, data);
2749}
2750
2751static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2752                      struct bio_vec *biovec, int max_size)
2753{
2754        struct pool_c *pt = ti->private;
2755        struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2756
2757        if (!q->merge_bvec_fn)
2758                return max_size;
2759
2760        bvm->bi_bdev = pt->data_dev->bdev;
2761
2762        return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2763}
2764
2765static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
2766{
2767        struct pool *pool = pt->pool;
2768        struct queue_limits *data_limits;
2769
2770        limits->max_discard_sectors = pool->sectors_per_block;
2771
2772        /*
2773         * discard_granularity is just a hint, and not enforced.
2774         */
2775        if (pt->adjusted_pf.discard_passdown) {
2776                data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
2777                limits->discard_granularity = data_limits->discard_granularity;
2778        } else
2779                limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2780}
2781
2782static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2783{
2784        struct pool_c *pt = ti->private;
2785        struct pool *pool = pt->pool;
2786
2787        blk_limits_io_min(limits, 0);
2788        blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2789
2790        /*
2791         * pt->adjusted_pf is a staging area for the actual features to use.
2792         * They get transferred to the live pool in bind_control_target()
2793         * called from pool_preresume().
2794         */
2795        if (!pt->adjusted_pf.discard_enabled)
2796                return;
2797
2798        disable_passdown_if_not_supported(pt);
2799
2800        set_discard_limits(pt, limits);
2801}
2802
2803static struct target_type pool_target = {
2804        .name = "thin-pool",
2805        .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2806                    DM_TARGET_IMMUTABLE,
2807        .version = {1, 4, 0},
2808        .module = THIS_MODULE,
2809        .ctr = pool_ctr,
2810        .dtr = pool_dtr,
2811        .map = pool_map,
2812        .postsuspend = pool_postsuspend,
2813        .preresume = pool_preresume,
2814        .resume = pool_resume,
2815        .message = pool_message,
2816        .status = pool_status,
2817        .merge = pool_merge,
2818        .iterate_devices = pool_iterate_devices,
2819        .io_hints = pool_io_hints,
2820};
2821
2822/*----------------------------------------------------------------
2823 * Thin target methods
2824 *--------------------------------------------------------------*/
2825static void thin_dtr(struct dm_target *ti)
2826{
2827        struct thin_c *tc = ti->private;
2828
2829        mutex_lock(&dm_thin_pool_table.mutex);
2830
2831        __pool_dec(tc->pool);
2832        dm_pool_close_thin_device(tc->td);
2833        dm_put_device(ti, tc->pool_dev);
2834        if (tc->origin_dev)
2835                dm_put_device(ti, tc->origin_dev);
2836        kfree(tc);
2837
2838        mutex_unlock(&dm_thin_pool_table.mutex);
2839}
2840
2841/*
2842 * Thin target parameters:
2843 *
2844 * <pool_dev> <dev_id> [origin_dev]
2845 *
2846 * pool_dev: the path to the pool (e.g. /dev/mapper/my_pool)
2847 * dev_id: the internal device identifier
2848 * origin_dev: a device external to the pool that should act as the origin
2849 *
2850 * If the pool device has discards disabled, they get disabled for the thin
2851 * device as well.
2852 */
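    /*
     * For example (illustrative only), a 1GiB thin volume using device
     * id 0 from a pool active as /dev/mapper/pool could be loaded with:
     *
     *    0 2097152 thin /dev/mapper/pool 0
     */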
2853static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2854{
2855        int r;
2856        struct thin_c *tc;
2857        struct dm_dev *pool_dev, *origin_dev;
2858        struct mapped_device *pool_md;
2859
2860        mutex_lock(&dm_thin_pool_table.mutex);
2861
2862        if (argc != 2 && argc != 3) {
2863                ti->error = "Invalid argument count";
2864                r = -EINVAL;
2865                goto out_unlock;
2866        }
2867
2868        tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
2869        if (!tc) {
2870                ti->error = "Out of memory";
2871                r = -ENOMEM;
2872                goto out_unlock;
2873        }
2874
2875        if (argc == 3) {
2876                r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
2877                if (r) {
2878                        ti->error = "Error opening origin device";
2879                        goto bad_origin_dev;
2880                }
2881                tc->origin_dev = origin_dev;
2882        }
2883
2884        r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2885        if (r) {
2886                ti->error = "Error opening pool device";
2887                goto bad_pool_dev;
2888        }
2889        tc->pool_dev = pool_dev;
2890
2891        if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
2892                ti->error = "Invalid device id";
2893                r = -EINVAL;
2894                goto bad_common;
2895        }
2896
2897        pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
2898        if (!pool_md) {
2899                ti->error = "Couldn't get pool mapped device";
2900                r = -EINVAL;
2901                goto bad_common;
2902        }
2903
2904        tc->pool = __pool_table_lookup(pool_md);
2905        if (!tc->pool) {
2906                ti->error = "Couldn't find pool object";
2907                r = -EINVAL;
2908                goto bad_pool_lookup;
2909        }
2910        __pool_inc(tc->pool);
2911
2912        if (get_pool_mode(tc->pool) == PM_FAIL) {
2913                ti->error = "Couldn't open thin device, pool is in fail mode";
                    r = -EINVAL;
2914                goto bad_thin_open;
2915        }
2916
2917        r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2918        if (r) {
2919                ti->error = "Couldn't open thin internal device";
2920                goto bad_thin_open;
2921        }
2922
2923        r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
2924        if (r)
2925                goto bad_thin_open;
2926
2927        ti->num_flush_requests = 1;
2928        ti->flush_supported = true;
2929
2930        /* In case the pool supports discards, pass them on. */
2931        if (tc->pool->pf.discard_enabled) {
2932                ti->discards_supported = true;
2933                ti->num_discard_requests = 1;
2934                ti->discard_zeroes_data_unsupported = true;
2935                /* Discard requests must be split on a block boundary */
2936                ti->split_discard_requests = true;
2937        }
2938
2939        dm_put(pool_md);
2940
2941        mutex_unlock(&dm_thin_pool_table.mutex);
2942
2943        return 0;
2944
2945bad_thin_open:
2946        __pool_dec(tc->pool);
2947bad_pool_lookup:
2948        dm_put(pool_md);
2949bad_common:
2950        dm_put_device(ti, tc->pool_dev);
2951bad_pool_dev:
2952        if (tc->origin_dev)
2953                dm_put_device(ti, tc->origin_dev);
2954bad_origin_dev:
2955        kfree(tc);
2956out_unlock:
2957        mutex_unlock(&dm_thin_pool_table.mutex);
2958
2959        return r;
2960}
2961
2962static int thin_map(struct dm_target *ti, struct bio *bio,
2963                    union map_info *map_context)
2964{
2965        bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
2966
2967        return thin_bio_map(ti, bio, map_context);
2968}
2969
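    /*
     * Releases the deferred set entries taken in thin_hook_bio(),
     * marking dependent mappings quiesced and queueing dependent
     * discards for the worker.
     */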
2970static int thin_endio(struct dm_target *ti,
2971                      struct bio *bio, int err,
2972                      union map_info *map_context)
2973{
2974        unsigned long flags;
2975        struct dm_thin_endio_hook *h = map_context->ptr;
2976        struct list_head work;
2977        struct dm_thin_new_mapping *m, *tmp;
2978        struct pool *pool = h->tc->pool;
2979
2980        if (h->shared_read_entry) {
2981                INIT_LIST_HEAD(&work);
2982                ds_dec(h->shared_read_entry, &work);
2983
2984                spin_lock_irqsave(&pool->lock, flags);
2985                list_for_each_entry_safe(m, tmp, &work, list) {
2986                        list_del(&m->list);
2987                        m->quiesced = 1;
2988                        __maybe_add_mapping(m);
2989                }
2990                spin_unlock_irqrestore(&pool->lock, flags);
2991        }
2992
2993        if (h->all_io_entry) {
2994                INIT_LIST_HEAD(&work);
2995                ds_dec(h->all_io_entry, &work);
2996                spin_lock_irqsave(&pool->lock, flags);
2997                list_for_each_entry_safe(m, tmp, &work, list)
2998                        list_add(&m->list, &pool->prepared_discards);
2999                spin_unlock_irqrestore(&pool->lock, flags);
3000        }
3001
3002        mempool_free(h, pool->endio_hook_pool);
3003
3004        return 0;
3005}
3006
3007static void thin_postsuspend(struct dm_target *ti)
3008{
3009        if (dm_noflush_suspending(ti))
3010                requeue_io((struct thin_c *)ti->private);
3011}
3012
3013/*
3014 * <nr mapped sectors> <highest mapped sector>
3015 */
3016static int thin_status(struct dm_target *ti, status_type_t type,
3017                       unsigned status_flags, char *result, unsigned maxlen)
3018{
3019        int r;
3020        ssize_t sz = 0;
3021        dm_block_t mapped, highest;
3022        char buf[BDEVNAME_SIZE];
3023        struct thin_c *tc = ti->private;
3024
3025        if (get_pool_mode(tc->pool) == PM_FAIL) {
3026                DMEMIT("Fail");
3027                return 0;
3028        }
3029
3030        if (!tc->td)
3031                DMEMIT("-");
3032        else {
3033                switch (type) {
3034                case STATUSTYPE_INFO:
3035                        r = dm_thin_get_mapped_count(tc->td, &mapped);
3036                        if (r)
3037                                return r;
3038
3039                        r = dm_thin_get_highest_mapped_block(tc->td, &highest);
3040                        if (r < 0)
3041                                return r;
3042
3043                        DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
3044                        if (r)
3045                                DMEMIT("%llu", ((highest + 1) *
3046                                                tc->pool->sectors_per_block) - 1);
3047                        else
3048                                DMEMIT("-");
3049                        break;
3050
3051                case STATUSTYPE_TABLE:
3052                        DMEMIT("%s %lu",
3053                               format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
3054                               (unsigned long) tc->dev_id);
3055                        if (tc->origin_dev)
3056                                DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
3057                        break;
3058                }
3059        }
3060
3061        return 0;
3062}
3063
3064static int thin_iterate_devices(struct dm_target *ti,
3065                                iterate_devices_callout_fn fn, void *data)
3066{
3067        sector_t blocks;
3068        struct thin_c *tc = ti->private;
3069        struct pool *pool = tc->pool;
3070
3071        /*
3072         * We can't call dm_pool_get_data_dev_size() since that blocks.  So
3073         * we follow a more convoluted path through to the pool's target.
3074         */
3075        if (!pool->ti)
3076                return 0;       /* nothing is bound */
3077
3078        blocks = pool->ti->len;
3079        (void) sector_div(blocks, pool->sectors_per_block);
3080        if (blocks)
3081                return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
3082
3083        return 0;
3084}
3085
3086/*
3087 * A thin device always inherits its queue limits from its pool.
3088 */
3089static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
3090{
3091        struct thin_c *tc = ti->private;
3092
3093        *limits = bdev_get_queue(tc->pool_dev->bdev)->limits;
3094}
3095
3096static struct target_type thin_target = {
3097        .name = "thin",
3098        .version = {1, 4, 0},
3099        .module = THIS_MODULE,
3100        .ctr = thin_ctr,
3101        .dtr = thin_dtr,
3102        .map = thin_map,
3103        .end_io = thin_endio,
3104        .postsuspend = thin_postsuspend,
3105        .status = thin_status,
3106        .iterate_devices = thin_iterate_devices,
3107        .io_hints = thin_io_hints,
3108};
3109
3110/*----------------------------------------------------------------*/
3111
3112static int __init dm_thin_init(void)
3113{
3114        int r;
3115
3116        pool_table_init();
3117
3118        r = dm_register_target(&thin_target);
3119        if (r)
3120                return r;
3121
3122        r = dm_register_target(&pool_target);
3123        if (r)
3124                goto bad_pool_target;
3125
3126        r = -ENOMEM;
3127
3128        _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0);
3129        if (!_cell_cache)
3130                goto bad_cell_cache;
3131
3132        _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
3133        if (!_new_mapping_cache)
3134                goto bad_new_mapping_cache;
3135
3136        _endio_hook_cache = KMEM_CACHE(dm_thin_endio_hook, 0);
3137        if (!_endio_hook_cache)
3138                goto bad_endio_hook_cache;
3139
3140        return 0;
3141
3142bad_endio_hook_cache:
3143        kmem_cache_destroy(_new_mapping_cache);
3144bad_new_mapping_cache:
3145        kmem_cache_destroy(_cell_cache);
3146bad_cell_cache:
3147        dm_unregister_target(&pool_target);
3148bad_pool_target:
3149        dm_unregister_target(&thin_target);
3150
3151        return r;
3152}
3153
3154static void dm_thin_exit(void)
3155{
3156        dm_unregister_target(&thin_target);
3157        dm_unregister_target(&pool_target);
3158
3159        kmem_cache_destroy(_cell_cache);
3160        kmem_cache_destroy(_new_mapping_cache);
3161        kmem_cache_destroy(_endio_hook_cache);
3162}
3163
3164module_init(dm_thin_init);
3165module_exit(dm_thin_exit);
3166
3167MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
3168MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3169MODULE_LICENSE("GPL");
3170