linux/drivers/md/dm-exception-store.c
/*
 * dm-exception-store.c
 *
 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
 * Copyright (C) 2006 Red Hat GmbH
 *
 * This file is released under the GPL.
 */

#include "dm-snap.h"

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>

#define DM_MSG_PREFIX "snapshots"
#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32        /* 16KB */

/*-----------------------------------------------------------------
 * Persistent snapshots: by "persistent" we mean that the snapshot
 * will survive a reboot.
 *---------------------------------------------------------------*/

/*
 * We need to store a record of which parts of the origin have
 * been copied to the snapshot device.  The snapshot code
 * requires that we copy exception chunks to chunk-aligned areas
 * of the COW store.  It makes sense, therefore, to store the
 * metadata in chunk-sized blocks.
 *
 * There is no backward or forward compatibility implemented:
 * snapshots with a disk version different from the kernel's will
 * not be usable.  It is expected that "lvcreate" will blank out
 * the start of a fresh COW device before calling the snapshot
 * constructor.
 *
 * The first chunk of the COW device just contains the header.
 * After this there is a chunk filled with exception metadata,
 * followed by as many exception chunks as that metadata area
 * can describe.
 *
 * All on-disk structures are in little-endian format.  The end
 * of the exceptions info is indicated by an exception with a
 * new_chunk of 0, which is invalid since it would point to the
 * header chunk.
 */
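
/*
 * For example, if N exceptions fit in one metadata chunk, the
 * layout described above works out as:
 *
 *      chunk 0:                header
 *      chunk 1:                metadata area 0
 *      chunks 2 .. N+1:        data for exceptions in area 0
 *      chunk N+2:              metadata area 1
 *      chunks N+3 .. 2N+2:     data for exceptions in area 1
 *      ...
 */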

/*
 * Magic for persistent snapshots: "SnAp" - feeble, isn't it?
 * (0x70416e53 stored little-endian gives the bytes 'S' 'n' 'A' 'p'.)
 */
#define SNAP_MAGIC 0x70416e53

/*
 * The on-disk version of the metadata.
 */
#define SNAPSHOT_DISK_VERSION 1

struct disk_header {
        uint32_t magic;

        /*
         * Is this snapshot valid?  There is no way of recovering
         * an invalid snapshot.
         */
        uint32_t valid;

        /*
         * Simple, incrementing version; no backward
         * compatibility.
         */
        uint32_t version;

        /* In sectors */
        uint32_t chunk_size;
};

struct disk_exception {
        uint64_t old_chunk;
        uint64_t new_chunk;
};

struct commit_callback {
        void (*callback)(void *, int success);
        void *context;
};

/*
 * The top level structure for a persistent exception store.
 */
struct pstore {
        struct dm_snapshot *snap;       /* up pointer to my snapshot */
        int version;
        int valid;
        uint32_t exceptions_per_area;

        /*
         * Now that we have an asynchronous kcopyd there is no
         * need for large chunk sizes, so it won't hurt to have a
         * whole chunk's worth of metadata in memory at once.
         */
        void *area;

        /*
         * An area of zeros used to clear the next area.
         */
        void *zero_area;

        /*
         * Used to keep track of which metadata area the data in
         * 'area' refers to.
         */
        chunk_t current_area;

        /*
         * The next free chunk for an exception.
         */
        chunk_t next_free;

        /*
         * The index of the next free exception in the current
         * metadata area.
         */
        uint32_t current_committed;

        atomic_t pending_count;
        uint32_t callback_count;
        struct commit_callback *callbacks;
        struct dm_io_client *io_client;

        struct workqueue_struct *metadata_wq;
};

static unsigned sectors_to_pages(unsigned sectors)
{
        return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
}
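
/*
 * sectors_to_pages() example: with 4KiB pages there are 8 sectors
 * per page, so the default 32-sector (16KB) chunk needs
 * DIV_ROUND_UP(32, 8) = 4 pages.
 */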

static int alloc_area(struct pstore *ps)
{
        int r = -ENOMEM;
        size_t len;

        len = ps->snap->chunk_size << SECTOR_SHIFT;

        /*
         * Allocate the chunk_size block of memory that will hold
         * a single metadata area.
         */
        ps->area = vmalloc(len);
        if (!ps->area)
                return r;

        ps->zero_area = vmalloc(len);
        if (!ps->zero_area) {
                vfree(ps->area);
                return r;
        }
        memset(ps->zero_area, 0, len);

        return 0;
}

static void free_area(struct pstore *ps)
{
        vfree(ps->area);
        ps->area = NULL;
        vfree(ps->zero_area);
        ps->zero_area = NULL;
}

struct mdata_req {
        struct dm_io_region *where;
        struct dm_io_request *io_req;
        struct work_struct work;
        int result;
};

static void do_metadata(struct work_struct *work)
{
        struct mdata_req *req = container_of(work, struct mdata_req, work);

        req->result = dm_io(req->io_req, 1, req->where, NULL);
}

/*
 * Read or write a chunk-aligned and chunk-sized block of data from a device.
 */
static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
{
        struct dm_io_region where = {
                .bdev = ps->snap->cow->bdev,
                .sector = ps->snap->chunk_size * chunk,
                .count = ps->snap->chunk_size,
        };
        struct dm_io_request io_req = {
                .bi_rw = rw,
                .mem.type = DM_IO_VMA,
                .mem.ptr.vma = ps->area,
                .client = ps->io_client,
                .notify.fn = NULL,
        };
        struct mdata_req req;

        if (!metadata)
                return dm_io(&io_req, 1, &where, NULL);

        req.where = &where;
        req.io_req = &io_req;

        /*
         * Issue the synchronous I/O from a different thread
         * to avoid generic_make_request recursion.
         */
        INIT_WORK(&req.work, do_metadata);
        queue_work(ps->metadata_wq, &req.work);
        flush_workqueue(ps->metadata_wq);

        return req.result;
}

/*
 * Convert a metadata area index to a chunk index.
 */
static chunk_t area_location(struct pstore *ps, chunk_t area)
{
        return 1 + ((ps->exceptions_per_area + 1) * area);
}
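
/*
 * area_location() example: with the default 16KB chunks,
 * exceptions_per_area = 16384 / sizeof(struct disk_exception)
 * = 1024, so area 0 lives in chunk 1, area 1 in chunk 1026,
 * area 2 in chunk 2051, and so on.
 */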

/*
 * Read or write a metadata area, remembering to skip the first
 * chunk, which holds the header.
 */
static int area_io(struct pstore *ps, int rw)
{
        chunk_t chunk = area_location(ps, ps->current_area);

        return chunk_io(ps, chunk, rw, 0);
}

static void zero_memory_area(struct pstore *ps)
{
        memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
}

static int zero_disk_area(struct pstore *ps, chunk_t area)
{
        struct dm_io_region where = {
                .bdev = ps->snap->cow->bdev,
                .sector = ps->snap->chunk_size * area_location(ps, area),
                .count = ps->snap->chunk_size,
        };
        struct dm_io_request io_req = {
                .bi_rw = WRITE,
                .mem.type = DM_IO_VMA,
                .mem.ptr.vma = ps->zero_area,
                .client = ps->io_client,
                .notify.fn = NULL,
        };

        return dm_io(&io_req, 1, &where, NULL);
}

static int read_header(struct pstore *ps, int *new_snapshot)
{
        int r;
        struct disk_header *dh;
        chunk_t chunk_size;
        int chunk_size_supplied = 1;

        /*
         * Use the default chunk size (or hardsect_size, if larger)
         * if none was supplied.
         */
        if (!ps->snap->chunk_size) {
                ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
                    bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
                ps->snap->chunk_mask = ps->snap->chunk_size - 1;
                ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
                chunk_size_supplied = 0;
        }

        ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
                                                             chunk_size));
        if (IS_ERR(ps->io_client))
                return PTR_ERR(ps->io_client);

        r = alloc_area(ps);
        if (r)
                return r;

        r = chunk_io(ps, 0, READ, 1);
        if (r)
                goto bad;

        dh = (struct disk_header *) ps->area;

        if (le32_to_cpu(dh->magic) == 0) {
                *new_snapshot = 1;
                return 0;
        }

        if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
                DMWARN("Invalid or corrupt snapshot");
                r = -ENXIO;
                goto bad;
        }

        *new_snapshot = 0;
        ps->valid = le32_to_cpu(dh->valid);
        ps->version = le32_to_cpu(dh->version);
        chunk_size = le32_to_cpu(dh->chunk_size);

        if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
                return 0;

        DMWARN("chunk size %llu in device metadata overrides "
               "table chunk size of %llu.",
               (unsigned long long)chunk_size,
               (unsigned long long)ps->snap->chunk_size);

        /* We had a bogus chunk_size; fix things up. */
        free_area(ps);

        ps->snap->chunk_size = chunk_size;
        ps->snap->chunk_mask = chunk_size - 1;
        ps->snap->chunk_shift = ffs(chunk_size) - 1;

        r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
                                ps->io_client);
        if (r)
                return r;

        r = alloc_area(ps);
        return r;

bad:
        free_area(ps);
        return r;
}

static int write_header(struct pstore *ps)
{
        struct disk_header *dh;

        memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);

        dh = (struct disk_header *) ps->area;
        dh->magic = cpu_to_le32(SNAP_MAGIC);
        dh->valid = cpu_to_le32(ps->valid);
        dh->version = cpu_to_le32(ps->version);
        dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);

        return chunk_io(ps, 0, WRITE, 1);
}

/*
 * Access functions for the disk exceptions; these do the endian conversions.
 */
static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
{
        BUG_ON(index >= ps->exceptions_per_area);

        return ((struct disk_exception *) ps->area) + index;
}

static void read_exception(struct pstore *ps,
                           uint32_t index, struct disk_exception *result)
{
        struct disk_exception *e = get_exception(ps, index);

        /* copy it */
        result->old_chunk = le64_to_cpu(e->old_chunk);
        result->new_chunk = le64_to_cpu(e->new_chunk);
}

static void write_exception(struct pstore *ps,
                            uint32_t index, struct disk_exception *de)
{
        struct disk_exception *e = get_exception(ps, index);

        /* copy it */
        e->old_chunk = cpu_to_le64(de->old_chunk);
        e->new_chunk = cpu_to_le64(de->new_chunk);
}

/*
 * Registers the exceptions that are present in the current area.
 * 'full' is filled in to indicate whether the area has been
 * filled.
 */
static int insert_exceptions(struct pstore *ps, int *full)
{
        int r;
        unsigned int i;
        struct disk_exception de;

        /* presume the area is full */
        *full = 1;

        for (i = 0; i < ps->exceptions_per_area; i++) {
                read_exception(ps, i, &de);

                /*
                 * If the new_chunk is pointing at the start of
                 * the COW device, where the first metadata area
                 * is, we know that we've hit the end of the
                 * exceptions.  Therefore the area is not full.
                 */
                if (de.new_chunk == 0LL) {
                        ps->current_committed = i;
                        *full = 0;
                        break;
                }

                /*
                 * Keep track of the start of the free chunks.
                 */
                if (ps->next_free <= de.new_chunk)
                        ps->next_free = de.new_chunk + 1;

                /*
                 * Add the exception to the snapshot.
                 */
                r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
                if (r)
                        return r;
        }

        return 0;
}

static int read_exceptions(struct pstore *ps)
{
        int r, full = 1;

        /*
         * Keep reading chunks and inserting exceptions until
         * we find a partially full area.
         */
        for (ps->current_area = 0; full; ps->current_area++) {
                r = area_io(ps, READ);
                if (r)
                        return r;

                r = insert_exceptions(ps, &full);
                if (r)
                        return r;
        }

        ps->current_area--;

        return 0;
}

static struct pstore *get_info(struct exception_store *store)
{
        return (struct pstore *) store->context;
}

static void persistent_fraction_full(struct exception_store *store,
                                     sector_t *numerator, sector_t *denominator)
{
        *numerator = get_info(store)->next_free * store->snap->chunk_size;
        *denominator = get_dev_size(store->snap->cow->bdev);
}

static void persistent_destroy(struct exception_store *store)
{
        struct pstore *ps = get_info(store);

        destroy_workqueue(ps->metadata_wq);
        dm_io_client_destroy(ps->io_client);
        vfree(ps->callbacks);
        free_area(ps);
        kfree(ps);
}

static int persistent_read_metadata(struct exception_store *store)
{
        int r, uninitialized_var(new_snapshot);
        struct pstore *ps = get_info(store);

        /*
         * Read the snapshot header.
         */
        r = read_header(ps, &new_snapshot);
        if (r)
                return r;

        /*
         * Now that we know the correct chunk_size, complete the
         * initialisation.
         */
        ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
                                  sizeof(struct disk_exception);
        ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
                        sizeof(*ps->callbacks));
        if (!ps->callbacks)
                return -ENOMEM;

        /*
         * Do we need to set up a new snapshot?
         */
        if (new_snapshot) {
                r = write_header(ps);
                if (r) {
                        DMWARN("write_header failed");
                        return r;
                }

                ps->current_area = 0;
                zero_memory_area(ps);
                r = zero_disk_area(ps, 0);
                if (r) {
                        DMWARN("zero_disk_area(0) failed");
                        return r;
                }
        } else {
                /*
                 * Sanity checks.
                 */
                if (ps->version != SNAPSHOT_DISK_VERSION) {
                        DMWARN("unable to handle snapshot disk version %d",
                               ps->version);
                        return -EINVAL;
                }

                /*
                 * The metadata is valid, but the snapshot itself
                 * has been invalidated.
                 */
                if (!ps->valid)
                        return 1;

                /*
                 * Read the metadata.
                 */
                r = read_exceptions(ps);
                if (r)
                        return r;
        }

        return 0;
}

static int persistent_prepare(struct exception_store *store,
                              struct dm_snap_exception *e)
{
        struct pstore *ps = get_info(store);
        uint32_t stride;
        chunk_t next_free;
        sector_t size = get_dev_size(store->snap->cow->bdev);

        /* Is there enough room? */
        if (size < ((ps->next_free + 1) * store->snap->chunk_size))
                return -ENOSPC;

        e->new_chunk = ps->next_free;

        /*
         * Move on to the next free chunk, making sure to take
         * into account the location of the metadata chunks.
         */
        stride = (ps->exceptions_per_area + 1);
        next_free = ++ps->next_free;
        if (sector_div(next_free, stride) == 1)
                ps->next_free++;

        atomic_inc(&ps->pending_count);
        return 0;
}
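
/*
 * persistent_prepare() skips over metadata chunks: a chunk whose
 * index is congruent to 1 modulo (exceptions_per_area + 1) holds
 * metadata (see area_location()).  E.g. with 1024 exceptions per
 * area, after chunk 1025 is handed out next_free becomes 1026,
 * which is metadata area 1, so it is bumped again to 1027.
 */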

static void persistent_commit(struct exception_store *store,
                              struct dm_snap_exception *e,
                              void (*callback) (void *, int success),
                              void *callback_context)
{
        unsigned int i;
        struct pstore *ps = get_info(store);
        struct disk_exception de;
        struct commit_callback *cb;

        de.old_chunk = e->old_chunk;
        de.new_chunk = e->new_chunk;
        write_exception(ps, ps->current_committed++, &de);

        /*
         * Add the callback to the back of the array.  This code
         * is the only place where the callback array is
         * manipulated, and we know that it will never be called
         * multiple times concurrently.
         */
        cb = ps->callbacks + ps->callback_count++;
        cb->callback = callback;
        cb->context = callback_context;

        /*
         * If there are exceptions in flight and we have not yet
         * filled this metadata area, there's nothing more to do.
         */
        if (!atomic_dec_and_test(&ps->pending_count) &&
            (ps->current_committed != ps->exceptions_per_area))
                return;

        /*
         * If we completely filled the current area, then wipe the next one.
         */
        if ((ps->current_committed == ps->exceptions_per_area) &&
             zero_disk_area(ps, ps->current_area + 1))
                ps->valid = 0;

        /*
         * Commit exceptions to disk.
         */
        if (ps->valid && area_io(ps, WRITE))
                ps->valid = 0;

        /*
         * Advance to the next area if this one is full.
         */
        if (ps->current_committed == ps->exceptions_per_area) {
                ps->current_committed = 0;
                ps->current_area++;
                zero_memory_area(ps);
        }

        for (i = 0; i < ps->callback_count; i++) {
                cb = ps->callbacks + i;
                cb->callback(cb->context, ps->valid);
        }

        ps->callback_count = 0;
}
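
/*
 * Note that persistent_commit() batches callbacks: they only run
 * once the metadata area holding their exceptions has been written
 * out (or the store has been marked invalid), so a callback that
 * sees success == 1 can rely on its exception being on disk.
 */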

static void persistent_drop(struct exception_store *store)
{
        struct pstore *ps = get_info(store);

        ps->valid = 0;
        if (write_header(ps))
                DMWARN("write header failed");
}

int dm_create_persistent(struct exception_store *store)
{
        struct pstore *ps;

        /* allocate the pstore */
        ps = kmalloc(sizeof(*ps), GFP_KERNEL);
        if (!ps)
                return -ENOMEM;

        ps->snap = store->snap;
        ps->valid = 1;
        ps->version = SNAPSHOT_DISK_VERSION;
        ps->area = NULL;
        ps->next_free = 2;      /* skipping the header and first area */
        ps->current_committed = 0;

        ps->callback_count = 0;
        atomic_set(&ps->pending_count, 0);
        ps->callbacks = NULL;

        ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
        if (!ps->metadata_wq) {
                kfree(ps);
                DMERR("couldn't start header metadata update thread");
                return -ENOMEM;
        }

        store->destroy = persistent_destroy;
        store->read_metadata = persistent_read_metadata;
        store->prepare_exception = persistent_prepare;
        store->commit_exception = persistent_commit;
        store->drop_snapshot = persistent_drop;
        store->fraction_full = persistent_fraction_full;
        store->context = ps;

        return 0;
}
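
/*
 * Illustrative use of the persistent store (normally done from
 * dm-snap.c when a snapshot target is created with a persistent
 * ("P") exception store; sketch only, error handling elided):
 *
 *      r = dm_create_persistent(store);
 *      if (!r)
 *              r = store->read_metadata(store);
 */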

/*-----------------------------------------------------------------
 * Implementation of the store for non-persistent snapshots.
 *---------------------------------------------------------------*/
struct transient_c {
        sector_t next_free;
};

static void transient_destroy(struct exception_store *store)
{
        kfree(store->context);
}

static int transient_read_metadata(struct exception_store *store)
{
        return 0;
}

static int transient_prepare(struct exception_store *store,
                             struct dm_snap_exception *e)
{
        struct transient_c *tc = (struct transient_c *) store->context;
        sector_t size = get_dev_size(store->snap->cow->bdev);

        if (size < (tc->next_free + store->snap->chunk_size))
                return -1;

        e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
        tc->next_free += store->snap->chunk_size;

        return 0;
}

static void transient_commit(struct exception_store *store,
                             struct dm_snap_exception *e,
                             void (*callback) (void *, int success),
                             void *callback_context)
{
        /* Just succeed */
        callback(callback_context, 1);
}

static void transient_fraction_full(struct exception_store *store,
                                    sector_t *numerator, sector_t *denominator)
{
        *numerator = ((struct transient_c *) store->context)->next_free;
        *denominator = get_dev_size(store->snap->cow->bdev);
}

int dm_create_transient(struct exception_store *store)
{
        struct transient_c *tc;

        store->destroy = transient_destroy;
        store->read_metadata = transient_read_metadata;
        store->prepare_exception = transient_prepare;
        store->commit_exception = transient_commit;
        store->drop_snapshot = NULL;
        store->fraction_full = transient_fraction_full;

        tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
        if (!tc)
                return -ENOMEM;

        tc->next_free = 0;
        store->context = tc;

        return 0;
}