linux/drivers/block/brd.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Ram backed block device driver.
 *
 * Copyright (C) 2007 Nick Piggin
 * Copyright (C) 2007 Novell Inc.
 *
 * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
 * of their respective owners.
 */

#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/pagemap.h>
#include <linux/radix-tree.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/debugfs.h>

#include <linux/uaccess.h>

#define PAGE_SECTORS_SHIFT      (PAGE_SHIFT - SECTOR_SHIFT)
#define PAGE_SECTORS            (1 << PAGE_SECTORS_SHIFT)
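/*
 * On the common configuration of 4 KiB pages and 512-byte sectors this
 * gives a shift of 3 and 8 sectors per page; e.g. sector 24 lands in the
 * page at index 3 (24 >> 3), at byte offset 0 within that page.
 */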

/*
 * Each block ramdisk device has a radix_tree brd_pages of pages that stores
 * the pages containing the block device's contents. A brd page's ->index is
 * its offset in PAGE_SIZE units. This is similar to, but in no way connected
 * with, the kernel's pagecache or buffer cache (which sit above our block
 * device).
 */
struct brd_device {
        int                     brd_number;
        struct gendisk          *brd_disk;
        struct list_head        brd_list;

        /*
         * Backing store of pages and lock to protect it. This is the contents
         * of the block device.
         */
        spinlock_t              brd_lock;
        struct radix_tree_root  brd_pages;
        u64                     brd_nr_pages;
};

/*
 * Look up and return a brd's page for a given sector.
 */
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
        pgoff_t idx;
        struct page *page;

        /*
         * The page lifetime is protected by the fact that we have opened the
         * device node -- brd pages will never be deleted under us, so we
         * don't need any further locking or refcounting.
         *
         * This is strictly true for the radix-tree nodes as well (ie. we
         * don't actually need the rcu_read_lock()), however that is not a
         * documented feature of the radix-tree API so it is better to be
         * safe here (we don't have total exclusion from radix tree updates
         * here, only deletes).
         */
        rcu_read_lock();
        idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
        page = radix_tree_lookup(&brd->brd_pages, idx);
        rcu_read_unlock();

        BUG_ON(page && page->index != idx);

        return page;
}

/*
 * Look up and return a brd's page for a given sector.
 * If one does not exist, allocate an empty page, and insert that. Then
 * return it.
 */
static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
{
        pgoff_t idx;
        struct page *page;
        gfp_t gfp_flags;

        page = brd_lookup_page(brd, sector);
        if (page)
                return page;

        /*
         * Must use NOIO because we don't want to recurse back into the
         * block or filesystem layers from page reclaim.
         */
        gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM;
        page = alloc_page(gfp_flags);
        if (!page)
                return NULL;

        if (radix_tree_preload(GFP_NOIO)) {
                __free_page(page);
                return NULL;
        }

        spin_lock(&brd->brd_lock);
        idx = sector >> PAGE_SECTORS_SHIFT;
        page->index = idx;
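        /*
         * If another task raced us and already inserted a page at this
         * index, radix_tree_insert() returns -EEXIST; free our page and
         * use the one that won the race instead.
         */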
        if (radix_tree_insert(&brd->brd_pages, idx, page)) {
                __free_page(page);
                page = radix_tree_lookup(&brd->brd_pages, idx);
                BUG_ON(!page);
                BUG_ON(page->index != idx);
        } else {
                brd->brd_nr_pages++;
        }
        spin_unlock(&brd->brd_lock);

        radix_tree_preload_end();

        return page;
}

/*
 * Free all backing store pages and radix tree. This must only be called when
 * there are no other users of the device.
 */
#define FREE_BATCH 16
static void brd_free_pages(struct brd_device *brd)
{
        unsigned long pos = 0;
        struct page *pages[FREE_BATCH];
        int nr_pages;

        do {
                int i;

                nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
                                (void **)pages, pos, FREE_BATCH);

                for (i = 0; i < nr_pages; i++) {
                        void *ret;

                        BUG_ON(pages[i]->index < pos);
                        pos = pages[i]->index;
                        ret = radix_tree_delete(&brd->brd_pages, pos);
                        BUG_ON(!ret || ret != pages[i]);
                        __free_page(pages[i]);
                }

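                /*
                 * Gang lookup returns pages in ascending index order, so
                 * resume the next batch one past the highest index freed
                 * in this one.
                 */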
                pos++;

                /*
                 * It can take 3.4 seconds to remove an 80 GiB ramdisk,
                 * so call cond_resched() to avoid stalling the CPU.
                 */
                cond_resched();

                /*
                 * This assumes radix_tree_gang_lookup always returns as
                 * many pages as possible. If the radix-tree code changes,
                 * so will this have to.
                 */
        } while (nr_pages == FREE_BATCH);
}

/*
 * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
 */
static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
{
        unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
        size_t copy;

        copy = min_t(size_t, n, PAGE_SIZE - offset);
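        /*
         * n never exceeds PAGE_SIZE here (bio segments and rw_page requests
         * are at most one page), so the copy can straddle at most one page
         * boundary: pre-allocate one backing page, or two if it spills over.
         */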
        if (!brd_insert_page(brd, sector))
                return -ENOSPC;
        if (copy < n) {
                sector += copy >> SECTOR_SHIFT;
                if (!brd_insert_page(brd, sector))
                        return -ENOSPC;
        }
        return 0;
}

/*
 * Copy n bytes from src to the brd starting at sector. Does not sleep.
 */
static void copy_to_brd(struct brd_device *brd, const void *src,
                        sector_t sector, size_t n)
{
        struct page *page;
        void *dst;
        unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
        size_t copy;

        copy = min_t(size_t, n, PAGE_SIZE - offset);
        page = brd_lookup_page(brd, sector);
        BUG_ON(!page);

        dst = kmap_atomic(page);
        memcpy(dst + offset, src, copy);
        kunmap_atomic(dst);

        if (copy < n) {
                src += copy;
                sector += copy >> SECTOR_SHIFT;
                copy = n - copy;
                page = brd_lookup_page(brd, sector);
                BUG_ON(!page);

                dst = kmap_atomic(page);
                memcpy(dst, src, copy);
                kunmap_atomic(dst);
        }
}

/*
 * Copy n bytes to dst from the brd starting at sector. Does not sleep.
 */
static void copy_from_brd(void *dst, struct brd_device *brd,
                        sector_t sector, size_t n)
{
        struct page *page;
        void *src;
        unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
        size_t copy;

        copy = min_t(size_t, n, PAGE_SIZE - offset);
        page = brd_lookup_page(brd, sector);
        if (page) {
                src = kmap_atomic(page);
                memcpy(dst, src + offset, copy);
                kunmap_atomic(src);
        } else
                memset(dst, 0, copy);

        if (copy < n) {
                dst += copy;
                sector += copy >> SECTOR_SHIFT;
                copy = n - copy;
                page = brd_lookup_page(brd, sector);
                if (page) {
                        src = kmap_atomic(page);
                        memcpy(dst, src, copy);
                        kunmap_atomic(src);
                } else
                        memset(dst, 0, copy);
        }
}

/*
 * Process a single bvec of a bio.
 */
static int brd_do_bvec(struct brd_device *brd, struct page *page,
                        unsigned int len, unsigned int off, unsigned int op,
                        sector_t sector)
{
        void *mem;
        int err = 0;

        if (op_is_write(op)) {
                err = copy_to_brd_setup(brd, sector, len);
                if (err)
                        goto out;
        }

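        /*
         * flush_dcache_page() ordering: on a read, flush after filling the
         * page so the new contents are visible past the CPU cache; on a
         * write, flush first so we copy the caller's latest data.
         */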
        mem = kmap_atomic(page);
        if (!op_is_write(op)) {
                copy_from_brd(mem + off, brd, sector, len);
                flush_dcache_page(page);
        } else {
                flush_dcache_page(page);
                copy_to_brd(brd, mem + off, sector, len);
        }
        kunmap_atomic(mem);

out:
        return err;
}

static blk_qc_t brd_submit_bio(struct bio *bio)
{
        struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
        sector_t sector = bio->bi_iter.bi_sector;
        struct bio_vec bvec;
        struct bvec_iter iter;

        bio_for_each_segment(bvec, bio, iter) {
                unsigned int len = bvec.bv_len;
                int err;

                /* Don't support un-aligned buffer */
                WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
                                (len & (SECTOR_SIZE - 1)));

                err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
                                  bio_op(bio), sector);
                if (err)
                        goto io_error;
                sector += len >> SECTOR_SHIFT;
        }

        bio_endio(bio);
        return BLK_QC_T_NONE;
io_error:
        bio_io_error(bio);
        return BLK_QC_T_NONE;
}

static int brd_rw_page(struct block_device *bdev, sector_t sector,
                       struct page *page, unsigned int op)
{
        struct brd_device *brd = bdev->bd_disk->private_data;
        int err;

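        /* brd only tracks order-0 pages, so huge pages are not supported. */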
        if (PageTransHuge(page))
                return -ENOTSUPP;
        err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector);
        page_endio(page, op_is_write(op), err);
        return err;
}

static const struct block_device_operations brd_fops = {
        .owner =                THIS_MODULE,
        .submit_bio =           brd_submit_bio,
        .rw_page =              brd_rw_page,
};

/*
 * And now the module code and kernel interface.
 */
static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
module_param(rd_nr, int, 0444);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");

unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE;
module_param(rd_size, ulong, 0444);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");

static int max_part = 1;
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
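
/*
 * Example: "modprobe brd rd_nr=4 rd_size=65536" creates four ramdisks,
 * /dev/ram0../dev/ram3, each 64 MiB (rd_size is in KiB).
 */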

MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
MODULE_ALIAS("rd");

#ifndef MODULE
/* Legacy boot options - nonmodular */
static int __init ramdisk_size(char *str)
{
        rd_size = simple_strtol(str, NULL, 0);
        return 1;
}
__setup("ramdisk_size=", ramdisk_size);
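/* e.g. booting with "ramdisk_size=65536" sets each ramdisk to 64 MiB. */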
#endif

/*
 * The device scheme is derived from loop.c. Keep them in sync where possible
 * (should share code eventually).
 */
static LIST_HEAD(brd_devices);
static DEFINE_MUTEX(brd_devices_mutex);
static struct dentry *brd_debugfs_dir;

static int brd_alloc(int i)
{
        struct brd_device *brd;
        struct gendisk *disk;
        char buf[DISK_NAME_LEN];

        brd = kzalloc(sizeof(*brd), GFP_KERNEL);
        if (!brd)
                return -ENOMEM;
        brd->brd_number         = i;
        spin_lock_init(&brd->brd_lock);
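        /*
         * Insertions happen under the brd_lock spinlock, so radix-tree
         * nodes must be allocated atomically; radix_tree_preload() covers
         * the common case beforehand.
         */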
        INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);

        snprintf(buf, DISK_NAME_LEN, "ram%d", i);
        if (!IS_ERR_OR_NULL(brd_debugfs_dir))
                debugfs_create_u64(buf, 0444, brd_debugfs_dir,
                                &brd->brd_nr_pages);

        disk = brd->brd_disk = blk_alloc_disk(NUMA_NO_NODE);
        if (!disk)
                goto out_free_dev;

        disk->major             = RAMDISK_MAJOR;
        disk->first_minor       = i * max_part;
        disk->minors            = max_part;
        disk->fops              = &brd_fops;
        disk->private_data      = brd;
        disk->flags             = GENHD_FL_EXT_DEVT;
        strlcpy(disk->disk_name, buf, DISK_NAME_LEN);
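        /* rd_size is in KiB; set_capacity() takes 512-byte sectors, hence * 2. */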
        set_capacity(disk, rd_size * 2);

        /*
         * Set the physical block size to PAGE_SIZE so that fdisk aligns
         * partitions on 4k: the direct_access API needs 4k alignment
         * because it returns a PFN. This only matters on very small
         * devices (<= 4M); otherwise fdisk aligns on 1M. Either way the
         * call is harmless.
         */
        blk_queue_physical_block_size(disk->queue, PAGE_SIZE);

        /* Tell the block layer that this is not a rotational device */
        blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
        blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
        add_disk(disk);
        list_add_tail(&brd->brd_list, &brd_devices);

        return 0;

out_free_dev:
        kfree(brd);
        return -ENOMEM;
}

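/*
 * Probe callback passed to __register_blkdev(): invoked on first access to
 * an unclaimed RAMDISK_MAJOR minor so the device can be created on demand.
 */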
static void brd_probe(dev_t dev)
{
        int i = MINOR(dev) / max_part;
        struct brd_device *brd;

        mutex_lock(&brd_devices_mutex);
        list_for_each_entry(brd, &brd_devices, brd_list) {
                if (brd->brd_number == i)
                        goto out_unlock;
        }

        brd_alloc(i);
out_unlock:
        mutex_unlock(&brd_devices_mutex);
}

static void brd_del_one(struct brd_device *brd)
{
        list_del(&brd->brd_list);
        del_gendisk(brd->brd_disk);
        blk_cleanup_disk(brd->brd_disk);
        brd_free_pages(brd);
        kfree(brd);
}

static inline void brd_check_and_reset_par(void)
{
        if (unlikely(!max_part))
                max_part = 1;

        /*
         * Make sure 'max_part' divides (1U << MINORBITS) exactly;
         * otherwise it is possible to get the same dev_t when adding
         * partitions. Any other value is rounded up to the next power
         * of two, which always divides evenly.
         */
        if ((1U << MINORBITS) % max_part != 0)
                max_part = 1UL << fls(max_part);

        if (max_part > DISK_MAX_PARTS) {
                pr_info("brd: max_part can't be larger than %d, reset max_part = %d.\n",
                        DISK_MAX_PARTS, DISK_MAX_PARTS);
                max_part = DISK_MAX_PARTS;
        }
}

static int __init brd_init(void)
{
        struct brd_device *brd, *next;
        int err, i;

        /*
         * The brd module can instantiate the underlying device structure
         * on demand, provided that a device node exists for it.
         *
         * (1) If rd_nr is specified, create that many devices upfront;
         *     otherwise it defaults to CONFIG_BLK_DEV_RAM_COUNT.
         * (2) Users can further extend brd devices by creating device
         *     nodes themselves and having the kernel instantiate the
         *     actual device on demand. Example:
         *              mknod /path/devnod_name b 1 X   # 1 is the rd major
         *              fdisk -l /path/devnod_name
         *     If (X / max_part) was not already created, it will be
         *     created dynamically.
         */

        if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe))
                return -EIO;

        brd_check_and_reset_par();

        brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);

        mutex_lock(&brd_devices_mutex);
        for (i = 0; i < rd_nr; i++) {
                err = brd_alloc(i);
                if (err)
                        goto out_free;
        }

        mutex_unlock(&brd_devices_mutex);

        pr_info("brd: module loaded\n");
        return 0;

out_free:
        debugfs_remove_recursive(brd_debugfs_dir);

        list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
                brd_del_one(brd);
        mutex_unlock(&brd_devices_mutex);
        unregister_blkdev(RAMDISK_MAJOR, "ramdisk");

        pr_info("brd: module NOT loaded !!!\n");
        return err;
}

static void __exit brd_exit(void)
{
        struct brd_device *brd, *next;

        debugfs_remove_recursive(brd_debugfs_dir);

        list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
                brd_del_one(brd);

        unregister_blkdev(RAMDISK_MAJOR, "ramdisk");

        pr_info("brd: module unloaded\n");
}

module_init(brd_init);
module_exit(brd_exit);