linux/drivers/md/dm-switch.c
/*
 * Copyright (C) 2010-2012 by Dell Inc.  All rights reserved.
 * Copyright (C) 2011-2013 Red Hat, Inc.
 *
 * This file is released under the GPL.
 *
 * dm-switch is a device-mapper target that maps IO to underlying block
 * devices efficiently when there are a large number of fixed-sized
 * address regions but there is no simple pattern to allow for a compact
 * mapping representation such as dm-stripe.
 */

#include <linux/device-mapper.h>

#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "switch"

/*
 * One region_table_slot_t holds <region_entries_per_slot> region table
 * entries, each of which is <region_table_entry_bits> bits in size.
 */
typedef unsigned long region_table_slot_t;

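/*
 * Worked example (illustrative, not part of the original source): on a
 * 64-bit host with 3 paths, region_table_entry_bits is 2 (the smallest
 * width that can represent path numbers 0..2), so each 64-bit slot
 * packs 64 / 2 = 32 region table entries.
 */
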
/*
 * A device with the offset to its start sector.
 */
struct switch_path {
        struct dm_dev *dmdev;
        sector_t start;
};

/*
 * Context block for a dm switch device.
 */
struct switch_ctx {
        struct dm_target *ti;

        unsigned nr_paths;              /* Number of paths in path_list. */

        unsigned region_size;           /* Region size in 512-byte sectors */
        unsigned long nr_regions;       /* Number of regions making up the device */
        signed char region_size_bits;   /* log2 of region_size or -1 */

        unsigned char region_table_entry_bits;  /* Number of bits in one region table entry */
        unsigned char region_entries_per_slot;  /* Number of entries in one region table slot */
        signed char region_entries_per_slot_bits;       /* log2 of region_entries_per_slot or -1 */

        region_table_slot_t *region_table;      /* Region table */

        /*
         * Array of dm devices to switch between.
         */
        struct switch_path path_list[0];
};

static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
                                           unsigned region_size)
{
        struct switch_ctx *sctx;

        sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path),
                       GFP_KERNEL);
        if (!sctx)
                return NULL;

        sctx->ti = ti;
        sctx->region_size = region_size;

        ti->private = sctx;

        return sctx;
}

static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
{
        struct switch_ctx *sctx = ti->private;
        sector_t nr_regions = ti->len;
        sector_t nr_slots;

        if (!(sctx->region_size & (sctx->region_size - 1)))
                sctx->region_size_bits = __ffs(sctx->region_size);
        else
                sctx->region_size_bits = -1;

        sctx->region_table_entry_bits = 1;
        while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
               (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
                sctx->region_table_entry_bits++;

        sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
        if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
                sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
        else
                sctx->region_entries_per_slot_bits = -1;

        if (sector_div(nr_regions, sctx->region_size))
                nr_regions++;

        sctx->nr_regions = nr_regions;
        if (sctx->nr_regions != nr_regions || sctx->nr_regions >= ULONG_MAX) {
                ti->error = "Region table too large";
                return -EINVAL;
        }

        nr_slots = nr_regions;
        if (sector_div(nr_slots, sctx->region_entries_per_slot))
                nr_slots++;

        if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
                ti->error = "Region table too large";
                return -EINVAL;
        }

        sctx->region_table = vmalloc(nr_slots * sizeof(region_table_slot_t));
        if (!sctx->region_table) {
                ti->error = "Cannot allocate region table";
                return -ENOMEM;
        }

        return 0;
}

static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
                                unsigned long *region_index, unsigned *bit)
{
        if (sctx->region_entries_per_slot_bits >= 0) {
                *region_index = region_nr >> sctx->region_entries_per_slot_bits;
                *bit = region_nr & (sctx->region_entries_per_slot - 1);
        } else {
                *region_index = region_nr / sctx->region_entries_per_slot;
                *bit = region_nr % sctx->region_entries_per_slot;
        }

        *bit *= sctx->region_table_entry_bits;
}

/*
 * Find which path to use at a given offset.
 */
static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
{
        unsigned long region_index;
        unsigned bit, path_nr;
        sector_t p;

        p = offset;
        if (sctx->region_size_bits >= 0)
                p >>= sctx->region_size_bits;
        else
                sector_div(p, sctx->region_size);

        switch_get_position(sctx, p, &region_index, &bit);
        path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
               ((1 << sctx->region_table_entry_bits) - 1);

        /* This can only happen if the processor uses non-atomic stores. */
        if (unlikely(path_nr >= sctx->nr_paths))
                path_nr = 0;

        return path_nr;
}

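/*
 * For illustration: with region_size = 128 sectors (a power of two, so
 * region_size_bits = 7), an I/O at sector offset 300 falls in region
 * 300 >> 7 = 2, and the table entry for region 2 selects the path.
 */
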
static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
                                      unsigned value)
{
        unsigned long region_index;
        unsigned bit;
        region_table_slot_t pte;

        switch_get_position(sctx, region_nr, &region_index, &bit);

        pte = sctx->region_table[region_index];
        pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
        pte |= (region_table_slot_t)value << bit;
        sctx->region_table[region_index] = pte;
}

/*
 * Fill the region table with an initial round robin pattern.
 */
static void initialise_region_table(struct switch_ctx *sctx)
{
        unsigned path_nr = 0;
        unsigned long region_nr;

        for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
                switch_region_table_write(sctx, region_nr, path_nr);
                if (++path_nr >= sctx->nr_paths)
                        path_nr = 0;
        }
}

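/*
 * E.g. (illustrative): with nr_paths = 3, regions 0, 1, 2, 3, 4, ...
 * are initially mapped to paths 0, 1, 2, 0, 1, ...
 */
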
static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
{
        struct switch_ctx *sctx = ti->private;
        unsigned long long start;
        int r;

        r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
                          &sctx->path_list[sctx->nr_paths].dmdev);
        if (r) {
                ti->error = "Device lookup failed";
                return r;
        }

        if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
                ti->error = "Invalid device starting offset";
                dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
                return -EINVAL;
        }

        sctx->path_list[sctx->nr_paths].start = start;

        sctx->nr_paths++;

        return 0;
}

/*
 * Destructor: Don't free the dm_target, just the ti->private data (if any).
 */
static void switch_dtr(struct dm_target *ti)
{
        struct switch_ctx *sctx = ti->private;

        while (sctx->nr_paths--)
                dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);

        vfree(sctx->region_table);
        kfree(sctx);
}

/*
 * Constructor arguments:
 *   <num_paths> <region_size> <num_optional_args> [<optional_args>...]
 *   [<dev_path> <offset>]+
 *
 * Optional args are to allow for future extension: currently this
 * parameter must be 0.
 */
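/*
 * Illustrative example (assumes standard dmsetup table syntax; the
 * device paths are placeholders): create a 1GiB (2097152-sector) switch
 * device over two paths with 64KiB (128-sector) regions:
 *
 *   dmsetup create sw --table "0 2097152 switch 2 128 0 /dev/sda 0 /dev/sdb 0"
 */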
static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
        static struct dm_arg _args[] = {
                {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
                {1, UINT_MAX, "Invalid region size"},
                {0, 0, "Invalid number of optional args"},
        };

        struct switch_ctx *sctx;
        struct dm_arg_set as;
        unsigned nr_paths, region_size, nr_optional_args;
        int r;

        as.argc = argc;
        as.argv = argv;

        r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
        if (r)
                return -EINVAL;

        r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
        if (r)
                return r;

        r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
        if (r)
                return r;
        /* parse optional arguments here, if we add any */

        if (as.argc != nr_paths * 2) {
                ti->error = "Incorrect number of path arguments";
                return -EINVAL;
        }

        sctx = alloc_switch_ctx(ti, nr_paths, region_size);
        if (!sctx) {
                ti->error = "Cannot allocate redirection context";
                return -ENOMEM;
        }

        r = dm_set_target_max_io_len(ti, region_size);
        if (r)
                goto error;

        while (as.argc) {
                r = parse_path(&as, ti);
                if (r)
                        goto error;
        }

        r = alloc_region_table(ti, nr_paths);
        if (r)
                goto error;

        initialise_region_table(sctx);

        /* For UNMAP, sending the request down any path is sufficient */
        ti->num_discard_bios = 1;

        return 0;

error:
        switch_dtr(ti);

        return r;
}

static int switch_map(struct dm_target *ti, struct bio *bio)
{
        struct switch_ctx *sctx = ti->private;
        sector_t offset = dm_target_offset(ti, bio->bi_sector);
        unsigned path_nr = switch_get_path_nr(sctx, offset);

        bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev;
        bio->bi_sector = sctx->path_list[path_nr].start + offset;

        return DM_MAPIO_REMAPPED;
}

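/*
 * Remap arithmetic, for illustration: if the chosen path was configured
 * with start = 1024 and the bio's offset within the target is 300, the
 * bio is redirected to sector 1024 + 300 = 1324 of that path's
 * underlying device.
 */
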
/*
 * We need to parse hex numbers in the message as quickly as possible.
 *
 * This table-based hex parser improves performance: it reduces the time
 * taken to load 1000000 entries compared to a condition-based parser.
 *              table-based parser      condition-based parser
 * PA-RISC      0.29s                   0.31s
 * Opteron      0.0495s                 0.0498s
 */
static const unsigned char hex_table[256] = {
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
};

static __always_inline unsigned long parse_hex(const char **string)
{
        unsigned char d;
        unsigned long r = 0;

        while ((d = hex_table[(unsigned char)**string]) < 16) {
                r = (r << 4) | d;
                (*string)++;
        }

        return r;
}

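/*
 * For illustration: given the string "1a:0", parse_hex() consumes "1a",
 * returns 0x1a (26), and leaves *string pointing at the ':' terminator.
 */
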
static int process_set_region_mappings(struct switch_ctx *sctx,
                             unsigned argc, char **argv)
{
        unsigned i;
        unsigned long region_index = 0;

        for (i = 1; i < argc; i++) {
                unsigned long path_nr;
                const char *string = argv[i];

                if (*string == ':')
                        region_index++;
                else {
                        region_index = parse_hex(&string);
                        if (unlikely(*string != ':')) {
                                DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
                                return -EINVAL;
                        }
                }

                string++;
                if (unlikely(!*string)) {
                        DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
                        return -EINVAL;
                }

                path_nr = parse_hex(&string);
                if (unlikely(*string)) {
                        DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
                        return -EINVAL;
                }
                if (unlikely(region_index >= sctx->nr_regions)) {
                        DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
                        return -EINVAL;
                }
                if (unlikely(path_nr >= sctx->nr_paths)) {
                        DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
                        return -EINVAL;
                }

                switch_region_table_write(sctx, region_index, path_nr);
        }

        return 0;
}

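/*
 * Illustrative message (region indices and path numbers are hex, per
 * parse_hex above; an argument starting with ':' means "previous region
 * index + 1"):
 *
 *   dmsetup message sw 0 set_region_mappings 100:1 :2 200:3
 *
 * maps region 0x100 to path 1, region 0x101 to path 2 and region 0x200
 * to path 3.
 */
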
/*
 * Messages are processed one-at-a-time.
 *
 * Only set_region_mappings is supported.
 */
static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
{
        static DEFINE_MUTEX(message_mutex);

        struct switch_ctx *sctx = ti->private;
        int r = -EINVAL;

        mutex_lock(&message_mutex);

        if (!strcasecmp(argv[0], "set_region_mappings"))
                r = process_set_region_mappings(sctx, argc, argv);
        else
                DMWARN("Unrecognised message received.");

        mutex_unlock(&message_mutex);

        return r;
}

static void switch_status(struct dm_target *ti, status_type_t type,
                          unsigned status_flags, char *result, unsigned maxlen)
{
        struct switch_ctx *sctx = ti->private;
        unsigned sz = 0;
        int path_nr;

        switch (type) {
        case STATUSTYPE_INFO:
                result[0] = '\0';
                break;

        case STATUSTYPE_TABLE:
                DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
                for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
                        DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
                               (unsigned long long)sctx->path_list[path_nr].start);
                break;
        }
}

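/*
 * For illustration, a table status line for a two-path device might
 * read "2 128 0 8:0 0 8:16 0" (dm_dev names are typically reported in
 * major:minor form).
 */
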
/*
 * Switch ioctl:
 *
 * Pass through all ioctls to the path for sector 0.
 */
static int switch_ioctl(struct dm_target *ti, unsigned cmd,
                        unsigned long arg)
{
        struct switch_ctx *sctx = ti->private;
        struct block_device *bdev;
        fmode_t mode;
        unsigned path_nr;
        int r = 0;

        path_nr = switch_get_path_nr(sctx, 0);

        bdev = sctx->path_list[path_nr].dmdev->bdev;
        mode = sctx->path_list[path_nr].dmdev->mode;

        /*
         * Only pass ioctls through if the device sizes match exactly.
         */
        if (ti->len + sctx->path_list[path_nr].start != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
                r = scsi_verify_blk_ioctl(NULL, cmd);

        return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}

static int switch_iterate_devices(struct dm_target *ti,
                                  iterate_devices_callout_fn fn, void *data)
{
        struct switch_ctx *sctx = ti->private;
        int path_nr;
        int r;

        for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
                r = fn(ti, sctx->path_list[path_nr].dmdev,
                         sctx->path_list[path_nr].start, ti->len, data);
                if (r)
                        return r;
        }

        return 0;
}

static struct target_type switch_target = {
        .name = "switch",
        .version = {1, 0, 0},
        .module = THIS_MODULE,
        .ctr = switch_ctr,
        .dtr = switch_dtr,
        .map = switch_map,
        .message = switch_message,
        .status = switch_status,
        .ioctl = switch_ioctl,
        .iterate_devices = switch_iterate_devices,
};

static int __init dm_switch_init(void)
{
        int r;

        r = dm_register_target(&switch_target);
        if (r < 0)
                DMERR("dm_register_target() failed %d", r);

        return r;
}

static void __exit dm_switch_exit(void)
{
        dm_unregister_target(&switch_target);
}

module_init(dm_switch_init);
module_exit(dm_switch_exit);

MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
MODULE_LICENSE("GPL");