linux/block/genhd.c
<<
>>
Prefs
   1/*
   2 *  gendisk handling
   3 */
   4
   5#include <linux/module.h>
   6#include <linux/fs.h>
   7#include <linux/genhd.h>
   8#include <linux/kdev_t.h>
   9#include <linux/kernel.h>
  10#include <linux/blkdev.h>
  11#include <linux/init.h>
  12#include <linux/spinlock.h>
  13#include <linux/proc_fs.h>
  14#include <linux/seq_file.h>
  15#include <linux/slab.h>
  16#include <linux/kmod.h>
  17#include <linux/kobj_map.h>
  18#include <linux/mutex.h>
  19#include <linux/idr.h>
  20#include <linux/log2.h>
  21
  22#include "blk.h"
  23
  24static DEFINE_MUTEX(block_class_lock);
  25struct kobject *block_depr;
  26
  27/* for extended dynamic devt allocation, currently only one major is used */
  28#define NR_EXT_DEVT             (1 << MINORBITS)
  29
  30/* For extended devt allocation.  ext_devt_mutex prevents look up
  31 * results from going away underneath its user.
  32 */
  33static DEFINE_MUTEX(ext_devt_mutex);
  34static DEFINE_IDR(ext_devt_idr);
  35
  36static struct device_type disk_type;
  37
  38static void disk_check_events(struct disk_events *ev,
  39                              unsigned int *clearing_ptr);
  40static void disk_alloc_events(struct gendisk *disk);
  41static void disk_add_events(struct gendisk *disk);
  42static void disk_del_events(struct gendisk *disk);
  43static void disk_release_events(struct gendisk *disk);
  44
  45/**
  46 * disk_get_part - get partition
  47 * @disk: disk to look partition from
  48 * @partno: partition number
  49 *
  50 * Look for partition @partno from @disk.  If found, increment
  51 * reference count and return it.
  52 *
  53 * CONTEXT:
  54 * Don't care.
  55 *
  56 * RETURNS:
  57 * Pointer to the found partition on success, NULL if not found.
  58 */
  59struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
  60{
  61        struct hd_struct *part = NULL;
  62        struct disk_part_tbl *ptbl;
  63
  64        if (unlikely(partno < 0))
  65                return NULL;
  66
  67        rcu_read_lock();
  68
  69        ptbl = rcu_dereference(disk->part_tbl);
  70        if (likely(partno < ptbl->len)) {
  71                part = rcu_dereference(ptbl->part[partno]);
  72                if (part)
  73                        get_device(part_to_dev(part));
  74        }
  75
  76        rcu_read_unlock();
  77
  78        return part;
  79}
  80EXPORT_SYMBOL_GPL(disk_get_part);
  81
  82/**
  83 * disk_part_iter_init - initialize partition iterator
  84 * @piter: iterator to initialize
  85 * @disk: disk to iterate over
  86 * @flags: DISK_PITER_* flags
  87 *
  88 * Initialize @piter so that it iterates over partitions of @disk.
  89 *
  90 * CONTEXT:
  91 * Don't care.
  92 */
  93void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
  94                          unsigned int flags)
  95{
  96        struct disk_part_tbl *ptbl;
  97
  98        rcu_read_lock();
  99        ptbl = rcu_dereference(disk->part_tbl);
 100
 101        piter->disk = disk;
 102        piter->part = NULL;
 103
 104        if (flags & DISK_PITER_REVERSE)
 105                piter->idx = ptbl->len - 1;
 106        else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0))
 107                piter->idx = 0;
 108        else
 109                piter->idx = 1;
 110
 111        piter->flags = flags;
 112
 113        rcu_read_unlock();
 114}
 115EXPORT_SYMBOL_GPL(disk_part_iter_init);
 116
 117/**
 118 * disk_part_iter_next - proceed iterator to the next partition and return it
 119 * @piter: iterator of interest
 120 *
 121 * Proceed @piter to the next partition and return it.
 122 *
 123 * CONTEXT:
 124 * Don't care.
 125 */
 126struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
 127{
 128        struct disk_part_tbl *ptbl;
 129        int inc, end;
 130
 131        /* put the last partition */
 132        disk_put_part(piter->part);
 133        piter->part = NULL;
 134
 135        /* get part_tbl */
 136        rcu_read_lock();
 137        ptbl = rcu_dereference(piter->disk->part_tbl);
 138
 139        /* determine iteration parameters */
 140        if (piter->flags & DISK_PITER_REVERSE) {
 141                inc = -1;
 142                if (piter->flags & (DISK_PITER_INCL_PART0 |
 143                                    DISK_PITER_INCL_EMPTY_PART0))
 144                        end = -1;
 145                else
 146                        end = 0;
 147        } else {
 148                inc = 1;
 149                end = ptbl->len;
 150        }
 151
 152        /* iterate to the next partition */
 153        for (; piter->idx != end; piter->idx += inc) {
 154                struct hd_struct *part;
 155
 156                part = rcu_dereference(ptbl->part[piter->idx]);
 157                if (!part)
 158                        continue;
 159                if (!part_nr_sects_read(part) &&
 160                    !(piter->flags & DISK_PITER_INCL_EMPTY) &&
 161                    !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
 162                      piter->idx == 0))
 163                        continue;
 164
 165                get_device(part_to_dev(part));
 166                piter->part = part;
 167                piter->idx += inc;
 168                break;
 169        }
 170
 171        rcu_read_unlock();
 172
 173        return piter->part;
 174}
 175EXPORT_SYMBOL_GPL(disk_part_iter_next);
 176
 177/**
 178 * disk_part_iter_exit - finish up partition iteration
 179 * @piter: iter of interest
 180 *
 181 * Called when iteration is over.  Cleans up @piter.
 182 *
 183 * CONTEXT:
 184 * Don't care.
 185 */
 186void disk_part_iter_exit(struct disk_part_iter *piter)
 187{
 188        disk_put_part(piter->part);
 189        piter->part = NULL;
 190}
 191EXPORT_SYMBOL_GPL(disk_part_iter_exit);
 192
 193static inline int sector_in_part(struct hd_struct *part, sector_t sector)
 194{
 195        return part->start_sect <= sector &&
 196                sector < part->start_sect + part_nr_sects_read(part);
 197}
 198
 199/**
 200 * disk_map_sector_rcu - map sector to partition
 201 * @disk: gendisk of interest
 202 * @sector: sector to map
 203 *
 204 * Find out which partition @sector maps to on @disk.  This is
 205 * primarily used for stats accounting.
 206 *
 207 * CONTEXT:
 208 * RCU read locked.  The returned partition pointer is valid only
 209 * while preemption is disabled.
 210 *
 211 * RETURNS:
 212 * Found partition on success, part0 is returned if no partition matches
 213 */
 214struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
 215{
 216        struct disk_part_tbl *ptbl;
 217        struct hd_struct *part;
 218        int i;
 219
 220        ptbl = rcu_dereference(disk->part_tbl);
 221
 222        part = rcu_dereference(ptbl->last_lookup);
 223        if (part && sector_in_part(part, sector))
 224                return part;
 225
 226        for (i = 1; i < ptbl->len; i++) {
 227                part = rcu_dereference(ptbl->part[i]);
 228
 229                if (part && sector_in_part(part, sector)) {
 230                        rcu_assign_pointer(ptbl->last_lookup, part);
 231                        return part;
 232                }
 233        }
 234        return &disk->part0;
 235}
 236EXPORT_SYMBOL_GPL(disk_map_sector_rcu);
 237
 238/*
 239 * Can be deleted altogether. Later.
 240 *
 241 */
 242static struct blk_major_name {
 243        struct blk_major_name *next;
 244        int major;
 245        char name[16];
 246} *major_names[BLKDEV_MAJOR_HASH_SIZE];
 247
 248/* index in the above - for now: assume no multimajor ranges */
 249static inline int major_to_index(unsigned major)
 250{
 251        return major % BLKDEV_MAJOR_HASH_SIZE;
 252}
 253
 254#ifdef CONFIG_PROC_FS
 255void blkdev_show(struct seq_file *seqf, off_t offset)
 256{
 257        struct blk_major_name *dp;
 258
 259        if (offset < BLKDEV_MAJOR_HASH_SIZE) {
 260                mutex_lock(&block_class_lock);
 261                for (dp = major_names[offset]; dp; dp = dp->next)
 262                        seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
 263                mutex_unlock(&block_class_lock);
 264        }
 265}
 266#endif /* CONFIG_PROC_FS */
 267
 268/**
 269 * register_blkdev - register a new block device
 270 *
 271 * @major: the requested major device number [1..255]. If @major=0, try to
 272 *         allocate any unused major number.
 273 * @name: the name of the new block device as a zero terminated string
 274 *
 275 * The @name must be unique within the system.
 276 *
 277 * The return value depends on the @major input parameter.
 278 *  - if a major device number was requested in range [1..255] then the
 279 *    function returns zero on success, or a negative error code
 280 *  - if any unused major number was requested with @major=0 parameter
 281 *    then the return value is the allocated major number in range
 282 *    [1..255] or a negative error code otherwise
 283 */
 284int register_blkdev(unsigned int major, const char *name)
 285{
 286        struct blk_major_name **n, *p;
 287        int index, ret = 0;
 288
 289        mutex_lock(&block_class_lock);
 290
 291        /* temporary */
 292        if (major == 0) {
 293                for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
 294                        if (major_names[index] == NULL)
 295                                break;
 296                }
 297
 298                if (index == 0) {
 299                        printk("register_blkdev: failed to get major for %s\n",
 300                               name);
 301                        ret = -EBUSY;
 302                        goto out;
 303                }
 304                major = index;
 305                ret = major;
 306        }
 307
 308        p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
 309        if (p == NULL) {
 310                ret = -ENOMEM;
 311                goto out;
 312        }
 313
 314        p->major = major;
 315        strlcpy(p->name, name, sizeof(p->name));
 316        p->next = NULL;
 317        index = major_to_index(major);
 318
 319        for (n = &major_names[index]; *n; n = &(*n)->next) {
 320                if ((*n)->major == major)
 321                        break;
 322        }
 323        if (!*n)
 324                *n = p;
 325        else
 326                ret = -EBUSY;
 327
 328        if (ret < 0) {
 329                printk("register_blkdev: cannot get major %d for %s\n",
 330                       major, name);
 331                kfree(p);
 332        }
 333out:
 334        mutex_unlock(&block_class_lock);
 335        return ret;
 336}
 337
 338EXPORT_SYMBOL(register_blkdev);
 339
 340void unregister_blkdev(unsigned int major, const char *name)
 341{
 342        struct blk_major_name **n;
 343        struct blk_major_name *p = NULL;
 344        int index = major_to_index(major);
 345
 346        mutex_lock(&block_class_lock);
 347        for (n = &major_names[index]; *n; n = &(*n)->next)
 348                if ((*n)->major == major)
 349                        break;
 350        if (!*n || strcmp((*n)->name, name)) {
 351                WARN_ON(1);
 352        } else {
 353                p = *n;
 354                *n = p->next;
 355        }
 356        mutex_unlock(&block_class_lock);
 357        kfree(p);
 358}
 359
 360EXPORT_SYMBOL(unregister_blkdev);
 361
 362static struct kobj_map *bdev_map;
 363
 364/**
 365 * blk_mangle_minor - scatter minor numbers apart
 366 * @minor: minor number to mangle
 367 *
 368 * Scatter consecutively allocated @minor number apart if MANGLE_DEVT
 369 * is enabled.  Mangling twice gives the original value.
 370 *
 371 * RETURNS:
 372 * Mangled value.
 373 *
 374 * CONTEXT:
 375 * Don't care.
 376 */
 377static int blk_mangle_minor(int minor)
 378{
 379#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
 380        int i;
 381
 382        for (i = 0; i < MINORBITS / 2; i++) {
 383                int low = minor & (1 << i);
 384                int high = minor & (1 << (MINORBITS - 1 - i));
 385                int distance = MINORBITS - 1 - 2 * i;
 386
 387                minor ^= low | high;    /* clear both bits */
 388                low <<= distance;       /* swap the positions */
 389                high >>= distance;
 390                minor |= low | high;    /* and set */
 391        }
 392#endif
 393        return minor;
 394}
 395
 396/**
 397 * blk_alloc_devt - allocate a dev_t for a partition
 398 * @part: partition to allocate dev_t for
 399 * @devt: out parameter for resulting dev_t
 400 *
 401 * Allocate a dev_t for block device.
 402 *
 403 * RETURNS:
 404 * 0 on success, allocated dev_t is returned in *@devt.  -errno on
 405 * failure.
 406 *
 407 * CONTEXT:
 408 * Might sleep.
 409 */
 410int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
 411{
 412        struct gendisk *disk = part_to_disk(part);
 413        int idx, rc;
 414
 415        /* in consecutive minor range? */
 416        if (part->partno < disk->minors) {
 417                *devt = MKDEV(disk->major, disk->first_minor + part->partno);
 418                return 0;
 419        }
 420
 421        /* allocate ext devt */
 422        do {
 423                if (!idr_pre_get(&ext_devt_idr, GFP_KERNEL))
 424                        return -ENOMEM;
 425                mutex_lock(&ext_devt_mutex);
 426                rc = idr_get_new(&ext_devt_idr, part, &idx);
 427                if (!rc && idx >= NR_EXT_DEVT) {
 428                        idr_remove(&ext_devt_idr, idx);
 429                        rc = -EBUSY;
 430                }
 431                mutex_unlock(&ext_devt_mutex);
 432        } while (rc == -EAGAIN);
 433
 434        if (rc)
 435                return rc;
 436
 437        *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx));
 438        return 0;
 439}
 440
 441/**
 442 * blk_free_devt - free a dev_t
 443 * @devt: dev_t to free
 444 *
 445 * Free @devt which was allocated using blk_alloc_devt().
 446 *
 447 * CONTEXT:
 448 * Might sleep.
 449 */
 450void blk_free_devt(dev_t devt)
 451{
 452        might_sleep();
 453
 454        if (devt == MKDEV(0, 0))
 455                return;
 456
 457        if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
 458                mutex_lock(&ext_devt_mutex);
 459                idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
 460                mutex_unlock(&ext_devt_mutex);
 461        }
 462}
 463
 464static char *bdevt_str(dev_t devt, char *buf)
 465{
 466        if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) {
 467                char tbuf[BDEVT_SIZE];
 468                snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt));
 469                snprintf(buf, BDEVT_SIZE, "%-9s", tbuf);
 470        } else
 471                snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt));
 472
 473        return buf;
 474}
 475
 476/*
 477 * Register device numbers dev..(dev+range-1)
 478 * range must be nonzero
 479 * The hash chain is sorted on range, so that subranges can override.
 480 */
 481void blk_register_region(dev_t devt, unsigned long range, struct module *module,
 482                         struct kobject *(*probe)(dev_t, int *, void *),
 483                         int (*lock)(dev_t, void *), void *data)
 484{
 485        kobj_map(bdev_map, devt, range, module, probe, lock, data);
 486}
 487
 488EXPORT_SYMBOL(blk_register_region);
 489
 490void blk_unregister_region(dev_t devt, unsigned long range)
 491{
 492        kobj_unmap(bdev_map, devt, range);
 493}
 494
 495EXPORT_SYMBOL(blk_unregister_region);
 496
 497static struct kobject *exact_match(dev_t devt, int *partno, void *data)
 498{
 499        struct gendisk *p = data;
 500
 501        return &disk_to_dev(p)->kobj;
 502}
 503
 504static int exact_lock(dev_t devt, void *data)
 505{
 506        struct gendisk *p = data;
 507
 508        if (!get_disk(p))
 509                return -1;
 510        return 0;
 511}
 512
 513static void register_disk(struct gendisk *disk)
 514{
 515        struct device *ddev = disk_to_dev(disk);
 516        struct block_device *bdev;
 517        struct disk_part_iter piter;
 518        struct hd_struct *part;
 519        int err;
 520
 521        ddev->parent = disk->driverfs_dev;
 522
 523        dev_set_name(ddev, disk->disk_name);
 524
 525        /* delay uevents, until we scanned partition table */
 526        dev_set_uevent_suppress(ddev, 1);
 527
 528        if (device_add(ddev))
 529                return;
 530        if (!sysfs_deprecated) {
 531                err = sysfs_create_link(block_depr, &ddev->kobj,
 532                                        kobject_name(&ddev->kobj));
 533                if (err) {
 534                        device_del(ddev);
 535                        return;
 536                }
 537        }
 538        disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
 539        disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
 540
 541        /* No minors to use for partitions */
 542        if (!disk_part_scan_enabled(disk))
 543                goto exit;
 544
 545        /* No such device (e.g., media were just removed) */
 546        if (!get_capacity(disk))
 547                goto exit;
 548
 549        bdev = bdget_disk(disk, 0);
 550        if (!bdev)
 551                goto exit;
 552
 553        bdev->bd_invalidated = 1;
 554        err = blkdev_get(bdev, FMODE_READ, NULL);
 555        if (err < 0)
 556                goto exit;
 557        blkdev_put(bdev, FMODE_READ);
 558
 559exit:
 560        /* announce disk after possible partitions are created */
 561        dev_set_uevent_suppress(ddev, 0);
 562        kobject_uevent(&ddev->kobj, KOBJ_ADD);
 563
 564        /* announce possible partitions */
 565        disk_part_iter_init(&piter, disk, 0);
 566        while ((part = disk_part_iter_next(&piter)))
 567                kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
 568        disk_part_iter_exit(&piter);
 569}
 570
 571/**
 572 * add_disk - add partitioning information to kernel list
 573 * @disk: per-device partitioning information
 574 *
 575 * This function registers the partitioning information in @disk
 576 * with the kernel.
 577 *
 578 * FIXME: error handling
 579 */
 580void add_disk(struct gendisk *disk)
 581{
 582        struct backing_dev_info *bdi;
 583        dev_t devt;
 584        int retval;
 585
 586        /* minors == 0 indicates to use ext devt from part0 and should
 587         * be accompanied with EXT_DEVT flag.  Make sure all
 588         * parameters make sense.
 589         */
 590        WARN_ON(disk->minors && !(disk->major || disk->first_minor));
 591        WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT));
 592
 593        disk->flags |= GENHD_FL_UP;
 594
 595        retval = blk_alloc_devt(&disk->part0, &devt);
 596        if (retval) {
 597                WARN_ON(1);
 598                return;
 599        }
 600        disk_to_dev(disk)->devt = devt;
 601
 602        /* ->major and ->first_minor aren't supposed to be
 603         * dereferenced from here on, but set them just in case.
 604         */
 605        disk->major = MAJOR(devt);
 606        disk->first_minor = MINOR(devt);
 607
 608        disk_alloc_events(disk);
 609
 610        /* Register BDI before referencing it from bdev */
 611        bdi = &disk->queue->backing_dev_info;
 612        bdi_register_dev(bdi, disk_devt(disk));
 613
 614        blk_register_region(disk_devt(disk), disk->minors, NULL,
 615                            exact_match, exact_lock, disk);
 616        register_disk(disk);
 617        blk_register_queue(disk);
 618
 619        /*
 620         * Take an extra ref on queue which will be put on disk_release()
 621         * so that it sticks around as long as @disk is there.
 622         */
 623        WARN_ON_ONCE(!blk_get_queue(disk->queue));
 624
 625        retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
 626                                   "bdi");
 627        WARN_ON(retval);
 628
 629        disk_add_events(disk);
 630}
 631EXPORT_SYMBOL(add_disk);
 632
 633void del_gendisk(struct gendisk *disk)
 634{
 635        struct disk_part_iter piter;
 636        struct hd_struct *part;
 637
 638        disk_del_events(disk);
 639
 640        /* invalidate stuff */
 641        disk_part_iter_init(&piter, disk,
 642                             DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
 643        while ((part = disk_part_iter_next(&piter))) {
 644                invalidate_partition(disk, part->partno);
 645                delete_partition(disk, part->partno);
 646        }
 647        disk_part_iter_exit(&piter);
 648
 649        invalidate_partition(disk, 0);
 650        set_capacity(disk, 0);
 651        disk->flags &= ~GENHD_FL_UP;
 652
 653        sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
 654        bdi_unregister(&disk->queue->backing_dev_info);
 655        blk_unregister_queue(disk);
 656        blk_unregister_region(disk_devt(disk), disk->minors);
 657
 658        part_stat_set_all(&disk->part0, 0);
 659        disk->part0.stamp = 0;
 660
 661        kobject_put(disk->part0.holder_dir);
 662        kobject_put(disk->slave_dir);
 663        disk->driverfs_dev = NULL;
 664        if (!sysfs_deprecated)
 665                sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
 666        device_del(disk_to_dev(disk));
 667        blk_free_devt(disk_to_dev(disk)->devt);
 668}
 669EXPORT_SYMBOL(del_gendisk);
 670
 671/**
 672 * get_gendisk - get partitioning information for a given device
 673 * @devt: device to get partitioning information for
 674 * @partno: returned partition index
 675 *
 676 * This function gets the structure containing partitioning
 677 * information for the given device @devt.
 678 */
 679struct gendisk *get_gendisk(dev_t devt, int *partno)
 680{
 681        struct gendisk *disk = NULL;
 682
 683        if (MAJOR(devt) != BLOCK_EXT_MAJOR) {
 684                struct kobject *kobj;
 685
 686                kobj = kobj_lookup(bdev_map, devt, partno);
 687                if (kobj)
 688                        disk = dev_to_disk(kobj_to_dev(kobj));
 689        } else {
 690                struct hd_struct *part;
 691
 692                mutex_lock(&ext_devt_mutex);
 693                part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
 694                if (part && get_disk(part_to_disk(part))) {
 695                        *partno = part->partno;
 696                        disk = part_to_disk(part);
 697                }
 698                mutex_unlock(&ext_devt_mutex);
 699        }
 700
 701        return disk;
 702}
 703EXPORT_SYMBOL(get_gendisk);
 704
 705/**
 706 * bdget_disk - do bdget() by gendisk and partition number
 707 * @disk: gendisk of interest
 708 * @partno: partition number
 709 *
 710 * Find partition @partno from @disk, do bdget() on it.
 711 *
 712 * CONTEXT:
 713 * Don't care.
 714 *
 715 * RETURNS:
 716 * Resulting block_device on success, NULL on failure.
 717 */
 718struct block_device *bdget_disk(struct gendisk *disk, int partno)
 719{
 720        struct hd_struct *part;
 721        struct block_device *bdev = NULL;
 722
 723        part = disk_get_part(disk, partno);
 724        if (part)
 725                bdev = bdget(part_devt(part));
 726        disk_put_part(part);
 727
 728        return bdev;
 729}
 730EXPORT_SYMBOL(bdget_disk);
 731
 732/*
 733 * print a full list of all partitions - intended for places where the root
 734 * filesystem can't be mounted and thus to give the victim some idea of what
 735 * went wrong
 736 */
 737void __init printk_all_partitions(void)
 738{
 739        struct class_dev_iter iter;
 740        struct device *dev;
 741
 742        class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
 743        while ((dev = class_dev_iter_next(&iter))) {
 744                struct gendisk *disk = dev_to_disk(dev);
 745                struct disk_part_iter piter;
 746                struct hd_struct *part;
 747                char name_buf[BDEVNAME_SIZE];
 748                char devt_buf[BDEVT_SIZE];
 749
 750                /*
 751                 * Don't show empty devices or things that have been
 752                 * suppressed
 753                 */
 754                if (get_capacity(disk) == 0 ||
 755                    (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
 756                        continue;
 757
 758                /*
 759                 * Note, unlike /proc/partitions, I am showing the
 760                 * numbers in hex - the same format as the root=
 761                 * option takes.
 762                 */
 763                disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
 764                while ((part = disk_part_iter_next(&piter))) {
 765                        bool is_part0 = part == &disk->part0;
 766
 767                        printk("%s%s %10llu %s %s", is_part0 ? "" : "  ",
 768                               bdevt_str(part_devt(part), devt_buf),
 769                               (unsigned long long)part_nr_sects_read(part) >> 1
 770                               , disk_name(disk, part->partno, name_buf),
 771                               part->info ? part->info->uuid : "");
 772                        if (is_part0) {
 773                                if (disk->driverfs_dev != NULL &&
 774                                    disk->driverfs_dev->driver != NULL)
 775                                        printk(" driver: %s\n",
 776                                              disk->driverfs_dev->driver->name);
 777                                else
 778                                        printk(" (driver?)\n");
 779                        } else
 780                                printk("\n");
 781                }
 782                disk_part_iter_exit(&piter);
 783        }
 784        class_dev_iter_exit(&iter);
 785}
 786
 787#ifdef CONFIG_PROC_FS
 788/* iterator */
 789static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
 790{
 791        loff_t skip = *pos;
 792        struct class_dev_iter *iter;
 793        struct device *dev;
 794
 795        iter = kmalloc(sizeof(*iter), GFP_KERNEL);
 796        if (!iter)
 797                return ERR_PTR(-ENOMEM);
 798
 799        seqf->private = iter;
 800        class_dev_iter_init(iter, &block_class, NULL, &disk_type);
 801        do {
 802                dev = class_dev_iter_next(iter);
 803                if (!dev)
 804                        return NULL;
 805        } while (skip--);
 806
 807        return dev_to_disk(dev);
 808}
 809
 810static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos)
 811{
 812        struct device *dev;
 813
 814        (*pos)++;
 815        dev = class_dev_iter_next(seqf->private);
 816        if (dev)
 817                return dev_to_disk(dev);
 818
 819        return NULL;
 820}
 821
 822static void disk_seqf_stop(struct seq_file *seqf, void *v)
 823{
 824        struct class_dev_iter *iter = seqf->private;
 825
 826        /* stop is called even after start failed :-( */
 827        if (iter) {
 828                class_dev_iter_exit(iter);
 829                kfree(iter);
 830        }
 831}
 832
 833static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
 834{
 835        void *p;
 836
 837        p = disk_seqf_start(seqf, pos);
 838        if (!IS_ERR_OR_NULL(p) && !*pos)
 839                seq_puts(seqf, "major minor  #blocks  name\n\n");
 840        return p;
 841}
 842
 843static int show_partition(struct seq_file *seqf, void *v)
 844{
 845        struct gendisk *sgp = v;
 846        struct disk_part_iter piter;
 847        struct hd_struct *part;
 848        char buf[BDEVNAME_SIZE];
 849
 850        /* Don't show non-partitionable removeable devices or empty devices */
 851        if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
 852                                   (sgp->flags & GENHD_FL_REMOVABLE)))
 853                return 0;
 854        if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
 855                return 0;
 856
 857        /* show the full disk and all non-0 size partitions of it */
 858        disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0);
 859        while ((part = disk_part_iter_next(&piter)))
 860                seq_printf(seqf, "%4d  %7d %10llu %s\n",
 861                           MAJOR(part_devt(part)), MINOR(part_devt(part)),
 862                           (unsigned long long)part_nr_sects_read(part) >> 1,
 863                           disk_name(sgp, part->partno, buf));
 864        disk_part_iter_exit(&piter);
 865
 866        return 0;
 867}
 868
 869static const struct seq_operations partitions_op = {
 870        .start  = show_partition_start,
 871        .next   = disk_seqf_next,
 872        .stop   = disk_seqf_stop,
 873        .show   = show_partition
 874};
 875
 876static int partitions_open(struct inode *inode, struct file *file)
 877{
 878        return seq_open(file, &partitions_op);
 879}
 880
 881static const struct file_operations proc_partitions_operations = {
 882        .open           = partitions_open,
 883        .read           = seq_read,
 884        .llseek         = seq_lseek,
 885        .release        = seq_release,
 886};
 887#endif
 888
 889
 890static struct kobject *base_probe(dev_t devt, int *partno, void *data)
 891{
 892        if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
 893                /* Make old-style 2.4 aliases work */
 894                request_module("block-major-%d", MAJOR(devt));
 895        return NULL;
 896}
 897
 898static int __init genhd_device_init(void)
 899{
 900        int error;
 901
 902        block_class.dev_kobj = sysfs_dev_block_kobj;
 903        error = class_register(&block_class);
 904        if (unlikely(error))
 905                return error;
 906        bdev_map = kobj_map_init(base_probe, &block_class_lock);
 907        blk_dev_init();
 908
 909        register_blkdev(BLOCK_EXT_MAJOR, "blkext");
 910
 911        /* create top-level block dir */
 912        if (!sysfs_deprecated)
 913                block_depr = kobject_create_and_add("block", NULL);
 914        return 0;
 915}
 916
 917subsys_initcall(genhd_device_init);
 918
 919static ssize_t disk_range_show(struct device *dev,
 920                               struct device_attribute *attr, char *buf)
 921{
 922        struct gendisk *disk = dev_to_disk(dev);
 923
 924        return sprintf(buf, "%d\n", disk->minors);
 925}
 926
 927static ssize_t disk_ext_range_show(struct device *dev,
 928                                   struct device_attribute *attr, char *buf)
 929{
 930        struct gendisk *disk = dev_to_disk(dev);
 931
 932        return sprintf(buf, "%d\n", disk_max_parts(disk));
 933}
 934
 935static ssize_t disk_removable_show(struct device *dev,
 936                                   struct device_attribute *attr, char *buf)
 937{
 938        struct gendisk *disk = dev_to_disk(dev);
 939
 940        return sprintf(buf, "%d\n",
 941                       (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
 942}
 943
 944static ssize_t disk_ro_show(struct device *dev,
 945                                   struct device_attribute *attr, char *buf)
 946{
 947        struct gendisk *disk = dev_to_disk(dev);
 948
 949        return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0);
 950}
 951
 952static ssize_t disk_capability_show(struct device *dev,
 953                                    struct device_attribute *attr, char *buf)
 954{
 955        struct gendisk *disk = dev_to_disk(dev);
 956
 957        return sprintf(buf, "%x\n", disk->flags);
 958}
 959
 960static ssize_t disk_alignment_offset_show(struct device *dev,
 961                                          struct device_attribute *attr,
 962                                          char *buf)
 963{
 964        struct gendisk *disk = dev_to_disk(dev);
 965
 966        return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
 967}
 968
 969static ssize_t disk_discard_alignment_show(struct device *dev,
 970                                           struct device_attribute *attr,
 971                                           char *buf)
 972{
 973        struct gendisk *disk = dev_to_disk(dev);
 974
 975        return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
 976}
 977
 978static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 979static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 980static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
 981static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
 982static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 983static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
 984static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
 985                   NULL);
 986static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 987static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 988static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 989#ifdef CONFIG_FAIL_MAKE_REQUEST
 990static struct device_attribute dev_attr_fail =
 991        __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
 992#endif
 993#ifdef CONFIG_FAIL_IO_TIMEOUT
 994static struct device_attribute dev_attr_fail_timeout =
 995        __ATTR(io-timeout-fail,  S_IRUGO|S_IWUSR, part_timeout_show,
 996                part_timeout_store);
 997#endif
 998
 999static struct attribute *disk_attrs[] = {
1000        &dev_attr_range.attr,
1001        &dev_attr_ext_range.attr,
1002        &dev_attr_removable.attr,
1003        &dev_attr_ro.attr,
1004        &dev_attr_size.attr,
1005        &dev_attr_alignment_offset.attr,
1006        &dev_attr_discard_alignment.attr,
1007        &dev_attr_capability.attr,
1008        &dev_attr_stat.attr,
1009        &dev_attr_inflight.attr,
1010#ifdef CONFIG_FAIL_MAKE_REQUEST
1011        &dev_attr_fail.attr,
1012#endif
1013#ifdef CONFIG_FAIL_IO_TIMEOUT
1014        &dev_attr_fail_timeout.attr,
1015#endif
1016        NULL
1017};
1018
1019static struct attribute_group disk_attr_group = {
1020        .attrs = disk_attrs,
1021};
1022
1023static const struct attribute_group *disk_attr_groups[] = {
1024        &disk_attr_group,
1025        NULL
1026};
1027
1028/**
1029 * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way
1030 * @disk: disk to replace part_tbl for
1031 * @new_ptbl: new part_tbl to install
1032 *
1033 * Replace disk->part_tbl with @new_ptbl in RCU-safe way.  The
1034 * original ptbl is freed using RCU callback.
1035 *
1036 * LOCKING:
1037 * Matching bd_mutx locked.
1038 */
1039static void disk_replace_part_tbl(struct gendisk *disk,
1040                                  struct disk_part_tbl *new_ptbl)
1041{
1042        struct disk_part_tbl *old_ptbl = disk->part_tbl;
1043
1044        rcu_assign_pointer(disk->part_tbl, new_ptbl);
1045
1046        if (old_ptbl) {
1047                rcu_assign_pointer(old_ptbl->last_lookup, NULL);
1048                kfree_rcu(old_ptbl, rcu_head);
1049        }
1050}
1051
1052/**
1053 * disk_expand_part_tbl - expand disk->part_tbl
1054 * @disk: disk to expand part_tbl for
1055 * @partno: expand such that this partno can fit in
1056 *
1057 * Expand disk->part_tbl such that @partno can fit in.  disk->part_tbl
1058 * uses RCU to allow unlocked dereferencing for stats and other stuff.
1059 *
1060 * LOCKING:
1061 * Matching bd_mutex locked, might sleep.
1062 *
1063 * RETURNS:
1064 * 0 on success, -errno on failure.
1065 */
1066int disk_expand_part_tbl(struct gendisk *disk, int partno)
1067{
1068        struct disk_part_tbl *old_ptbl = disk->part_tbl;
1069        struct disk_part_tbl *new_ptbl;
1070        int len = old_ptbl ? old_ptbl->len : 0;
1071        int target = partno + 1;
1072        size_t size;
1073        int i;
1074
1075        /* disk_max_parts() is zero during initialization, ignore if so */
1076        if (disk_max_parts(disk) && target > disk_max_parts(disk))
1077                return -EINVAL;
1078
1079        if (target <= len)
1080                return 0;
1081
1082        size = sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]);
1083        new_ptbl = kzalloc_node(size, GFP_KERNEL, disk->node_id);
1084        if (!new_ptbl)
1085                return -ENOMEM;
1086
1087        new_ptbl->len = target;
1088
1089        for (i = 0; i < len; i++)
1090                rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
1091
1092        disk_replace_part_tbl(disk, new_ptbl);
1093        return 0;
1094}
1095
1096static void disk_release(struct device *dev)
1097{
1098        struct gendisk *disk = dev_to_disk(dev);
1099
1100        disk_release_events(disk);
1101        kfree(disk->random);
1102        disk_replace_part_tbl(disk, NULL);
1103        free_part_stats(&disk->part0);
1104        free_part_info(&disk->part0);
1105        if (disk->queue)
1106                blk_put_queue(disk->queue);
1107        kfree(disk);
1108}
1109struct class block_class = {
1110        .name           = "block",
1111};
1112
1113static char *block_devnode(struct device *dev, umode_t *mode)
1114{
1115        struct gendisk *disk = dev_to_disk(dev);
1116
1117        if (disk->devnode)
1118                return disk->devnode(disk, mode);
1119        return NULL;
1120}
1121
1122static struct device_type disk_type = {
1123        .name           = "disk",
1124        .groups         = disk_attr_groups,
1125        .release        = disk_release,
1126        .devnode        = block_devnode,
1127};
1128
1129#ifdef CONFIG_PROC_FS
1130/*
1131 * aggregate disk stat collector.  Uses the same stats that the sysfs
1132 * entries do, above, but makes them available through one seq_file.
1133 *
1134 * The output looks suspiciously like /proc/partitions with a bunch of
1135 * extra fields.
1136 */
1137static int diskstats_show(struct seq_file *seqf, void *v)
1138{
1139        struct gendisk *gp = v;
1140        struct disk_part_iter piter;
1141        struct hd_struct *hd;
1142        char buf[BDEVNAME_SIZE];
1143        int cpu;
1144
1145        /*
1146        if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
1147                seq_puts(seqf,  "major minor name"
1148                                "     rio rmerge rsect ruse wio wmerge "
1149                                "wsect wuse running use aveq"
1150                                "\n\n");
1151        */
1152
1153        disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
1154        while ((hd = disk_part_iter_next(&piter))) {
1155                cpu = part_stat_lock();
1156                part_round_stats(cpu, hd);
1157                part_stat_unlock();
1158                seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
1159                           "%u %lu %lu %lu %u %u %u %u\n",
1160                           MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
1161                           disk_name(gp, hd->partno, buf),
1162                           part_stat_read(hd, ios[READ]),
1163                           part_stat_read(hd, merges[READ]),
1164                           part_stat_read(hd, sectors[READ]),
1165                           jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
1166                           part_stat_read(hd, ios[WRITE]),
1167                           part_stat_read(hd, merges[WRITE]),
1168                           part_stat_read(hd, sectors[WRITE]),
1169                           jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
1170                           part_in_flight(hd),
1171                           jiffies_to_msecs(part_stat_read(hd, io_ticks)),
1172                           jiffies_to_msecs(part_stat_read(hd, time_in_queue))
1173                        );
1174        }
1175        disk_part_iter_exit(&piter);
1176
1177        return 0;
1178}
1179
1180static const struct seq_operations diskstats_op = {
1181        .start  = disk_seqf_start,
1182        .next   = disk_seqf_next,
1183        .stop   = disk_seqf_stop,
1184        .show   = diskstats_show
1185};
1186
1187static int diskstats_open(struct inode *inode, struct file *file)
1188{
1189        return seq_open(file, &diskstats_op);
1190}
1191
1192static const struct file_operations proc_diskstats_operations = {
1193        .open           = diskstats_open,
1194        .read           = seq_read,
1195        .llseek         = seq_lseek,
1196        .release        = seq_release,
1197};
1198
1199static int __init proc_genhd_init(void)
1200{
1201        proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
1202        proc_create("partitions", 0, NULL, &proc_partitions_operations);
1203        return 0;
1204}
1205module_init(proc_genhd_init);
1206#endif /* CONFIG_PROC_FS */
1207
1208dev_t blk_lookup_devt(const char *name, int partno)
1209{
1210        dev_t devt = MKDEV(0, 0);
1211        struct class_dev_iter iter;
1212        struct device *dev;
1213
1214        class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
1215        while ((dev = class_dev_iter_next(&iter))) {
1216                struct gendisk *disk = dev_to_disk(dev);
1217                struct hd_struct *part;
1218
1219                if (strcmp(dev_name(dev), name))
1220                        continue;
1221
1222                if (partno < disk->minors) {
1223                        /* We need to return the right devno, even
1224                         * if the partition doesn't exist yet.
1225                         */
1226                        devt = MKDEV(MAJOR(dev->devt),
1227                                     MINOR(dev->devt) + partno);
1228                        break;
1229                }
1230                part = disk_get_part(disk, partno);
1231                if (part) {
1232                        devt = part_devt(part);
1233                        disk_put_part(part);
1234                        break;
1235                }
1236                disk_put_part(part);
1237        }
1238        class_dev_iter_exit(&iter);
1239        return devt;
1240}
1241EXPORT_SYMBOL(blk_lookup_devt);
1242
1243struct gendisk *alloc_disk(int minors)
1244{
1245        return alloc_disk_node(minors, NUMA_NO_NODE);
1246}
1247EXPORT_SYMBOL(alloc_disk);
1248
1249struct gendisk *alloc_disk_node(int minors, int node_id)
1250{
1251        struct gendisk *disk;
1252
1253        disk = kmalloc_node(sizeof(struct gendisk),
1254                                GFP_KERNEL | __GFP_ZERO, node_id);
1255        if (disk) {
1256                if (!init_part_stats(&disk->part0)) {
1257                        kfree(disk);
1258                        return NULL;
1259                }
1260                disk->node_id = node_id;
1261                if (disk_expand_part_tbl(disk, 0)) {
1262                        free_part_stats(&disk->part0);
1263                        kfree(disk);
1264                        return NULL;
1265                }
1266                disk->part_tbl->part[0] = &disk->part0;
1267
1268                /*
1269                 * set_capacity() and get_capacity() currently don't use
1270                 * seqcounter to read/update the part0->nr_sects. Still init
1271                 * the counter as we can read the sectors in IO submission
1272                 * patch using seqence counters.
1273                 *
1274                 * TODO: Ideally set_capacity() and get_capacity() should be
1275                 * converted to make use of bd_mutex and sequence counters.
1276                 */
1277                seqcount_init(&disk->part0.nr_sects_seq);
1278                hd_ref_init(&disk->part0);
1279
1280                disk->minors = minors;
1281                rand_initialize_disk(disk);
1282                disk_to_dev(disk)->class = &block_class;
1283                disk_to_dev(disk)->type = &disk_type;
1284                device_initialize(disk_to_dev(disk));
1285        }
1286        return disk;
1287}
1288EXPORT_SYMBOL(alloc_disk_node);
1289
1290struct kobject *get_disk(struct gendisk *disk)
1291{
1292        struct module *owner;
1293        struct kobject *kobj;
1294
1295        if (!disk->fops)
1296                return NULL;
1297        owner = disk->fops->owner;
1298        if (owner && !try_module_get(owner))
1299                return NULL;
1300        kobj = kobject_get(&disk_to_dev(disk)->kobj);
1301        if (kobj == NULL) {
1302                module_put(owner);
1303                return NULL;
1304        }
1305        return kobj;
1306
1307}
1308
1309EXPORT_SYMBOL(get_disk);
1310
1311void put_disk(struct gendisk *disk)
1312{
1313        if (disk)
1314                kobject_put(&disk_to_dev(disk)->kobj);
1315}
1316
1317EXPORT_SYMBOL(put_disk);
1318
1319static void set_disk_ro_uevent(struct gendisk *gd, int ro)
1320{
1321        char event[] = "DISK_RO=1";
1322        char *envp[] = { event, NULL };
1323
1324        if (!ro)
1325                event[8] = '0';
1326        kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
1327}
1328
1329void set_device_ro(struct block_device *bdev, int flag)
1330{
1331        bdev->bd_part->policy = flag;
1332}
1333
1334EXPORT_SYMBOL(set_device_ro);
1335
1336void set_disk_ro(struct gendisk *disk, int flag)
1337{
1338        struct disk_part_iter piter;
1339        struct hd_struct *part;
1340
1341        if (disk->part0.policy != flag) {
1342                set_disk_ro_uevent(disk, flag);
1343                disk->part0.policy = flag;
1344        }
1345
1346        disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
1347        while ((part = disk_part_iter_next(&piter)))
1348                part->policy = flag;
1349        disk_part_iter_exit(&piter);
1350}
1351
1352EXPORT_SYMBOL(set_disk_ro);
1353
1354int bdev_read_only(struct block_device *bdev)
1355{
1356        if (!bdev)
1357                return 0;
1358        return bdev->bd_part->policy;
1359}
1360
1361EXPORT_SYMBOL(bdev_read_only);
1362
1363int invalidate_partition(struct gendisk *disk, int partno)
1364{
1365        int res = 0;
1366        struct block_device *bdev = bdget_disk(disk, partno);
1367        if (bdev) {
1368                fsync_bdev(bdev);
1369                res = __invalidate_device(bdev, true);
1370                bdput(bdev);
1371        }
1372        return res;
1373}
1374
1375EXPORT_SYMBOL(invalidate_partition);
1376
1377/*
1378 * Disk events - monitor disk events like media change and eject request.
1379 */
1380struct disk_events {
1381        struct list_head        node;           /* all disk_event's */
1382        struct gendisk          *disk;          /* the associated disk */
1383        spinlock_t              lock;
1384
1385        struct mutex            block_mutex;    /* protects blocking */
1386        int                     block;          /* event blocking depth */
1387        unsigned int            pending;        /* events already sent out */
1388        unsigned int            clearing;       /* events being cleared */
1389
1390        long                    poll_msecs;     /* interval, -1 for default */
1391        struct delayed_work     dwork;
1392};
1393
1394static const char *disk_events_strs[] = {
1395        [ilog2(DISK_EVENT_MEDIA_CHANGE)]        = "media_change",
1396        [ilog2(DISK_EVENT_EJECT_REQUEST)]       = "eject_request",
1397};
1398
1399static char *disk_uevents[] = {
1400        [ilog2(DISK_EVENT_MEDIA_CHANGE)]        = "DISK_MEDIA_CHANGE=1",
1401        [ilog2(DISK_EVENT_EJECT_REQUEST)]       = "DISK_EJECT_REQUEST=1",
1402};
1403
/* list of all disk_events, guarded by disk_events_mutex */
static DEFINE_MUTEX(disk_events_mutex);
static LIST_HEAD(disk_events);

/* default poll interval in msecs; zero, i.e. in-kernel polling disabled */
static unsigned long disk_events_dfl_poll_msecs;
1410
1411static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
1412{
1413        struct disk_events *ev = disk->ev;
1414        long intv_msecs = 0;
1415
1416        /*
1417         * If device-specific poll interval is set, always use it.  If
1418         * the default is being used, poll iff there are events which
1419         * can't be monitored asynchronously.
1420         */
1421        if (ev->poll_msecs >= 0)
1422                intv_msecs = ev->poll_msecs;
1423        else if (disk->events & ~disk->async_events)
1424                intv_msecs = disk_events_dfl_poll_msecs;
1425
1426        return msecs_to_jiffies(intv_msecs);
1427}
1428
1429/**
1430 * disk_block_events - block and flush disk event checking
1431 * @disk: disk to block events for
1432 *
1433 * On return from this function, it is guaranteed that event checking
1434 * isn't in progress and won't happen until unblocked by
1435 * disk_unblock_events().  Events blocking is counted and the actual
1436 * unblocking happens after the matching number of unblocks are done.
1437 *
1438 * Note that this intentionally does not block event checking from
1439 * disk_clear_events().
1440 *
1441 * CONTEXT:
1442 * Might sleep.
1443 */
1444void disk_block_events(struct gendisk *disk)
1445{
1446        struct disk_events *ev = disk->ev;
1447        unsigned long flags;
1448        bool cancel;
1449
1450        if (!ev)
1451                return;
1452
1453        /*
1454         * Outer mutex ensures that the first blocker completes canceling
1455         * the event work before further blockers are allowed to finish.
1456         */
1457        mutex_lock(&ev->block_mutex);
1458
1459        spin_lock_irqsave(&ev->lock, flags);
1460        cancel = !ev->block++;
1461        spin_unlock_irqrestore(&ev->lock, flags);
1462
1463        if (cancel)
1464                cancel_delayed_work_sync(&disk->ev->dwork);
1465
1466        mutex_unlock(&ev->block_mutex);
1467}
1468
1469static void __disk_unblock_events(struct gendisk *disk, bool check_now)
1470{
1471        struct disk_events *ev = disk->ev;
1472        unsigned long intv;
1473        unsigned long flags;
1474
1475        spin_lock_irqsave(&ev->lock, flags);
1476
1477        if (WARN_ON_ONCE(ev->block <= 0))
1478                goto out_unlock;
1479
1480        if (--ev->block)
1481                goto out_unlock;
1482
1483        /*
1484         * Not exactly a latency critical operation, set poll timer
1485         * slack to 25% and kick event check.
1486         */
1487        intv = disk_events_poll_jiffies(disk);
1488        set_timer_slack(&ev->dwork.timer, intv / 4);
1489        if (check_now)
1490                queue_delayed_work(system_freezable_wq, &ev->dwork, 0);
1491        else if (intv)
1492                queue_delayed_work(system_freezable_wq, &ev->dwork, intv);
1493out_unlock:
1494        spin_unlock_irqrestore(&ev->lock, flags);
1495}
1496
1497/**
1498 * disk_unblock_events - unblock disk event checking
1499 * @disk: disk to unblock events for
1500 *
1501 * Undo disk_block_events().  When the block count reaches zero, it
1502 * starts events polling if configured.
1503 *
1504 * CONTEXT:
1505 * Don't care.  Safe to call from irq context.
1506 */
1507void disk_unblock_events(struct gendisk *disk)
1508{
1509        if (disk->ev)
1510                __disk_unblock_events(disk, false);
1511}
1512
1513/**
1514 * disk_flush_events - schedule immediate event checking and flushing
1515 * @disk: disk to check and flush events for
1516 * @mask: events to flush
1517 *
1518 * Schedule immediate event checking on @disk if not blocked.  Events in
1519 * @mask are scheduled to be cleared from the driver.  Note that this
1520 * doesn't clear the events from @disk->ev.
1521 *
1522 * CONTEXT:
1523 * If @mask is non-zero must be called with bdev->bd_mutex held.
1524 */
1525void disk_flush_events(struct gendisk *disk, unsigned int mask)
1526{
1527        struct disk_events *ev = disk->ev;
1528
1529        if (!ev)
1530                return;
1531
1532        spin_lock_irq(&ev->lock);
1533        ev->clearing |= mask;
1534        if (!ev->block)
1535                mod_delayed_work(system_freezable_wq, &ev->dwork, 0);
1536        spin_unlock_irq(&ev->lock);
1537}
1538
1539/**
1540 * disk_clear_events - synchronously check, clear and return pending events
1541 * @disk: disk to fetch and clear events from
1542 * @mask: mask of events to be fetched and clearted
1543 *
1544 * Disk events are synchronously checked and pending events in @mask
1545 * are cleared and returned.  This ignores the block count.
1546 *
1547 * CONTEXT:
1548 * Might sleep.
1549 */
1550unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
1551{
1552        const struct block_device_operations *bdops = disk->fops;
1553        struct disk_events *ev = disk->ev;
1554        unsigned int pending;
1555        unsigned int clearing = mask;
1556
1557        if (!ev) {
1558                /* for drivers still using the old ->media_changed method */
1559                if ((mask & DISK_EVENT_MEDIA_CHANGE) &&
1560                    bdops->media_changed && bdops->media_changed(disk))
1561                        return DISK_EVENT_MEDIA_CHANGE;
1562                return 0;
1563        }
1564
1565        disk_block_events(disk);
1566
1567        /*
1568         * store the union of mask and ev->clearing on the stack so that the
1569         * race with disk_flush_events does not cause ambiguity (ev->clearing
1570         * can still be modified even if events are blocked).
1571         */
1572        spin_lock_irq(&ev->lock);
1573        clearing |= ev->clearing;
1574        ev->clearing = 0;
1575        spin_unlock_irq(&ev->lock);
1576
1577        disk_check_events(ev, &clearing);
1578        /*
1579         * if ev->clearing is not 0, the disk_flush_events got called in the
1580         * middle of this function, so we want to run the workfn without delay.
1581         */
1582        __disk_unblock_events(disk, ev->clearing ? true : false);
1583
1584        /* then, fetch and clear pending events */
1585        spin_lock_irq(&ev->lock);
1586        pending = ev->pending & mask;
1587        ev->pending &= ~mask;
1588        spin_unlock_irq(&ev->lock);
1589        WARN_ON_ONCE(clearing & mask);
1590
1591        return pending;
1592}
1593
1594/*
1595 * Separate this part out so that a different pointer for clearing_ptr can be
1596 * passed in for disk_clear_events.
1597 */
1598static void disk_events_workfn(struct work_struct *work)
1599{
1600        struct delayed_work *dwork = to_delayed_work(work);
1601        struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
1602
1603        disk_check_events(ev, &ev->clearing);
1604}
1605
1606static void disk_check_events(struct disk_events *ev,
1607                              unsigned int *clearing_ptr)
1608{
1609        struct gendisk *disk = ev->disk;
1610        char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
1611        unsigned int clearing = *clearing_ptr;
1612        unsigned int events;
1613        unsigned long intv;
1614        int nr_events = 0, i;
1615
1616        /* check events */
1617        events = disk->fops->check_events(disk, clearing);
1618
1619        /* accumulate pending events and schedule next poll if necessary */
1620        spin_lock_irq(&ev->lock);
1621
1622        events &= ~ev->pending;
1623        ev->pending |= events;
1624        *clearing_ptr &= ~clearing;
1625
1626        intv = disk_events_poll_jiffies(disk);
1627        if (!ev->block && intv)
1628                queue_delayed_work(system_freezable_wq, &ev->dwork, intv);
1629
1630        spin_unlock_irq(&ev->lock);
1631
1632        /*
1633         * Tell userland about new events.  Only the events listed in
1634         * @disk->events are reported.  Unlisted events are processed the
1635         * same internally but never get reported to userland.
1636         */
1637        for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
1638                if (events & disk->events & (1 << i))
1639                        envp[nr_events++] = disk_uevents[i];
1640
1641        if (nr_events)
1642                kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
1643}
1644
1645/*
1646 * A disk events enabled device has the following sysfs nodes under
1647 * its /sys/block/X/ directory.
1648 *
1649 * events               : list of all supported events
1650 * events_async         : list of events which can be detected w/o polling
1651 * events_poll_msecs    : polling interval, 0: disable, -1: system default
1652 */
1653static ssize_t __disk_events_show(unsigned int events, char *buf)
1654{
1655        const char *delim = "";
1656        ssize_t pos = 0;
1657        int i;
1658
1659        for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
1660                if (events & (1 << i)) {
1661                        pos += sprintf(buf + pos, "%s%s",
1662                                       delim, disk_events_strs[i]);
1663                        delim = " ";
1664                }
1665        if (pos)
1666                pos += sprintf(buf + pos, "\n");
1667        return pos;
1668}
1669
1670static ssize_t disk_events_show(struct device *dev,
1671                                struct device_attribute *attr, char *buf)
1672{
1673        struct gendisk *disk = dev_to_disk(dev);
1674
1675        return __disk_events_show(disk->events, buf);
1676}
1677
1678static ssize_t disk_events_async_show(struct device *dev,
1679                                      struct device_attribute *attr, char *buf)
1680{
1681        struct gendisk *disk = dev_to_disk(dev);
1682
1683        return __disk_events_show(disk->async_events, buf);
1684}
1685
1686static ssize_t disk_events_poll_msecs_show(struct device *dev,
1687                                           struct device_attribute *attr,
1688                                           char *buf)
1689{
1690        struct gendisk *disk = dev_to_disk(dev);
1691
1692        return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
1693}
1694
1695static ssize_t disk_events_poll_msecs_store(struct device *dev,
1696                                            struct device_attribute *attr,
1697                                            const char *buf, size_t count)
1698{
1699        struct gendisk *disk = dev_to_disk(dev);
1700        long intv;
1701
1702        if (!count || !sscanf(buf, "%ld", &intv))
1703                return -EINVAL;
1704
1705        if (intv < 0 && intv != -1)
1706                return -EINVAL;
1707
1708        disk_block_events(disk);
1709        disk->ev->poll_msecs = intv;
1710        __disk_unblock_events(disk, true);
1711
1712        return count;
1713}
1714
1715static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL);
1716static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL);
1717static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR,
1718                         disk_events_poll_msecs_show,
1719                         disk_events_poll_msecs_store);
1720
1721static const struct attribute *disk_events_attrs[] = {
1722        &dev_attr_events.attr,
1723        &dev_attr_events_async.attr,
1724        &dev_attr_events_poll_msecs.attr,
1725        NULL,
1726};
1727
1728/*
1729 * The default polling interval can be specified by the kernel
1730 * parameter block.events_dfl_poll_msecs which defaults to 0
1731 * (disable).  This can also be modified runtime by writing to
1732 * /sys/module/block/events_dfl_poll_msecs.
1733 */
1734static int disk_events_set_dfl_poll_msecs(const char *val,
1735                                          const struct kernel_param *kp)
1736{
1737        struct disk_events *ev;
1738        int ret;
1739
1740        ret = param_set_ulong(val, kp);
1741        if (ret < 0)
1742                return ret;
1743
1744        mutex_lock(&disk_events_mutex);
1745
1746        list_for_each_entry(ev, &disk_events, node)
1747                disk_flush_events(ev->disk, 0);
1748
1749        mutex_unlock(&disk_events_mutex);
1750
1751        return 0;
1752}
1753
1754static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
1755        .set    = disk_events_set_dfl_poll_msecs,
1756        .get    = param_get_ulong,
1757};
1758
1759#undef MODULE_PARAM_PREFIX
1760#define MODULE_PARAM_PREFIX     "block."
1761
1762module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
1763                &disk_events_dfl_poll_msecs, 0644);
1764
1765/*
1766 * disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
1767 */
1768static void disk_alloc_events(struct gendisk *disk)
1769{
1770        struct disk_events *ev;
1771
1772        if (!disk->fops->check_events)
1773                return;
1774
1775        ev = kzalloc(sizeof(*ev), GFP_KERNEL);
1776        if (!ev) {
1777                pr_warn("%s: failed to initialize events\n", disk->disk_name);
1778                return;
1779        }
1780
1781        INIT_LIST_HEAD(&ev->node);
1782        ev->disk = disk;
1783        spin_lock_init(&ev->lock);
1784        mutex_init(&ev->block_mutex);
1785        ev->block = 1;
1786        ev->poll_msecs = -1;
1787        INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
1788
1789        disk->ev = ev;
1790}
1791
1792static void disk_add_events(struct gendisk *disk)
1793{
1794        if (!disk->ev)
1795                return;
1796
1797        /* FIXME: error handling */
1798        if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
1799                pr_warn("%s: failed to create sysfs files for events\n",
1800                        disk->disk_name);
1801
1802        mutex_lock(&disk_events_mutex);
1803        list_add_tail(&disk->ev->node, &disk_events);
1804        mutex_unlock(&disk_events_mutex);
1805
1806        /*
1807         * Block count is initialized to 1 and the following initial
1808         * unblock kicks it into action.
1809         */
1810        __disk_unblock_events(disk, true);
1811}
1812
1813static void disk_del_events(struct gendisk *disk)
1814{
1815        if (!disk->ev)
1816                return;
1817
1818        disk_block_events(disk);
1819
1820        mutex_lock(&disk_events_mutex);
1821        list_del_init(&disk->ev->node);
1822        mutex_unlock(&disk_events_mutex);
1823
1824        sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
1825}
1826
1827static void disk_release_events(struct gendisk *disk)
1828{
1829        /* the block count should be 1 from disk_del_events() */
1830        WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
1831        kfree(disk->ev);
1832}
1833