linux/block/genhd.c
<<
>>
Prefs
   1/*
   2 *  gendisk handling
   3 */
   4
   5#include <linux/module.h>
   6#include <linux/fs.h>
   7#include <linux/genhd.h>
   8#include <linux/kdev_t.h>
   9#include <linux/kernel.h>
  10#include <linux/blkdev.h>
  11#include <linux/init.h>
  12#include <linux/spinlock.h>
  13#include <linux/proc_fs.h>
  14#include <linux/seq_file.h>
  15#include <linux/slab.h>
  16#include <linux/kmod.h>
  17#include <linux/kobj_map.h>
  18#include <linux/mutex.h>
  19#include <linux/idr.h>
  20#include <linux/log2.h>
  21
  22#include "blk.h"
  23
/*
 * Held around every walk or update of the major_names[] hash table and
 * also passed to kobj_map_init() when bdev_map is created.
 */
static DEFINE_MUTEX(block_class_lock);
/* "block" kobject created at init time, only when !sysfs_deprecated. */
struct kobject *block_depr;

/* for extended dynamic devt allocation, currently only one major is used */
#define MAX_EXT_DEVT            (1 << MINORBITS)

/* For extended devt allocation.  ext_devt_mutex prevents look up
 * results from going away underneath its user.
 */
static DEFINE_MUTEX(ext_devt_mutex);
static DEFINE_IDR(ext_devt_idr);

static struct device_type disk_type;

/* Forward declarations of the static disk event helpers used below. */
static void disk_check_events(struct disk_events *ev,
                              unsigned int *clearing_ptr);
static void disk_alloc_events(struct gendisk *disk);
static void disk_add_events(struct gendisk *disk);
static void disk_del_events(struct gendisk *disk);
static void disk_release_events(struct gendisk *disk);
  44
  45/**
  46 * disk_get_part - get partition
  47 * @disk: disk to look partition from
  48 * @partno: partition number
  49 *
  50 * Look for partition @partno from @disk.  If found, increment
  51 * reference count and return it.
  52 *
  53 * CONTEXT:
  54 * Don't care.
  55 *
  56 * RETURNS:
  57 * Pointer to the found partition on success, NULL if not found.
  58 */
  59struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
  60{
  61        struct hd_struct *part = NULL;
  62        struct disk_part_tbl *ptbl;
  63
  64        if (unlikely(partno < 0))
  65                return NULL;
  66
  67        rcu_read_lock();
  68
  69        ptbl = rcu_dereference(disk->part_tbl);
  70        if (likely(partno < ptbl->len)) {
  71                part = rcu_dereference(ptbl->part[partno]);
  72                if (part)
  73                        get_device(part_to_dev(part));
  74        }
  75
  76        rcu_read_unlock();
  77
  78        return part;
  79}
  80EXPORT_SYMBOL_GPL(disk_get_part);
  81
  82/**
  83 * disk_part_iter_init - initialize partition iterator
  84 * @piter: iterator to initialize
  85 * @disk: disk to iterate over
  86 * @flags: DISK_PITER_* flags
  87 *
  88 * Initialize @piter so that it iterates over partitions of @disk.
  89 *
  90 * CONTEXT:
  91 * Don't care.
  92 */
  93void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
  94                          unsigned int flags)
  95{
  96        struct disk_part_tbl *ptbl;
  97
  98        rcu_read_lock();
  99        ptbl = rcu_dereference(disk->part_tbl);
 100
 101        piter->disk = disk;
 102        piter->part = NULL;
 103
 104        if (flags & DISK_PITER_REVERSE)
 105                piter->idx = ptbl->len - 1;
 106        else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0))
 107                piter->idx = 0;
 108        else
 109                piter->idx = 1;
 110
 111        piter->flags = flags;
 112
 113        rcu_read_unlock();
 114}
 115EXPORT_SYMBOL_GPL(disk_part_iter_init);
 116
 117/**
 118 * disk_part_iter_next - proceed iterator to the next partition and return it
 119 * @piter: iterator of interest
 120 *
 121 * Proceed @piter to the next partition and return it.
 122 *
 123 * CONTEXT:
 124 * Don't care.
 125 */
 126struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
 127{
 128        struct disk_part_tbl *ptbl;
 129        int inc, end;
 130
 131        /* put the last partition */
 132        disk_put_part(piter->part);
 133        piter->part = NULL;
 134
 135        /* get part_tbl */
 136        rcu_read_lock();
 137        ptbl = rcu_dereference(piter->disk->part_tbl);
 138
 139        /* determine iteration parameters */
 140        if (piter->flags & DISK_PITER_REVERSE) {
 141                inc = -1;
 142                if (piter->flags & (DISK_PITER_INCL_PART0 |
 143                                    DISK_PITER_INCL_EMPTY_PART0))
 144                        end = -1;
 145                else
 146                        end = 0;
 147        } else {
 148                inc = 1;
 149                end = ptbl->len;
 150        }
 151
 152        /* iterate to the next partition */
 153        for (; piter->idx != end; piter->idx += inc) {
 154                struct hd_struct *part;
 155
 156                part = rcu_dereference(ptbl->part[piter->idx]);
 157                if (!part)
 158                        continue;
 159                if (!part_nr_sects_read(part) &&
 160                    !(piter->flags & DISK_PITER_INCL_EMPTY) &&
 161                    !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
 162                      piter->idx == 0))
 163                        continue;
 164
 165                get_device(part_to_dev(part));
 166                piter->part = part;
 167                piter->idx += inc;
 168                break;
 169        }
 170
 171        rcu_read_unlock();
 172
 173        return piter->part;
 174}
 175EXPORT_SYMBOL_GPL(disk_part_iter_next);
 176
 177/**
 178 * disk_part_iter_exit - finish up partition iteration
 179 * @piter: iter of interest
 180 *
 181 * Called when iteration is over.  Cleans up @piter.
 182 *
 183 * CONTEXT:
 184 * Don't care.
 185 */
 186void disk_part_iter_exit(struct disk_part_iter *piter)
 187{
 188        disk_put_part(piter->part);
 189        piter->part = NULL;
 190}
 191EXPORT_SYMBOL_GPL(disk_part_iter_exit);
 192
 193static inline int sector_in_part(struct hd_struct *part, sector_t sector)
 194{
 195        return part->start_sect <= sector &&
 196                sector < part->start_sect + part_nr_sects_read(part);
 197}
 198
 199/**
 200 * disk_map_sector_rcu - map sector to partition
 201 * @disk: gendisk of interest
 202 * @sector: sector to map
 203 *
 204 * Find out which partition @sector maps to on @disk.  This is
 205 * primarily used for stats accounting.
 206 *
 207 * CONTEXT:
 208 * RCU read locked.  The returned partition pointer is valid only
 209 * while preemption is disabled.
 210 *
 211 * RETURNS:
 212 * Found partition on success, part0 is returned if no partition matches
 213 */
 214struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
 215{
 216        struct disk_part_tbl *ptbl;
 217        struct hd_struct *part;
 218        int i;
 219
 220        ptbl = rcu_dereference(disk->part_tbl);
 221
 222        part = rcu_dereference(ptbl->last_lookup);
 223        if (part && sector_in_part(part, sector))
 224                return part;
 225
 226        for (i = 1; i < ptbl->len; i++) {
 227                part = rcu_dereference(ptbl->part[i]);
 228
 229                if (part && sector_in_part(part, sector)) {
 230                        rcu_assign_pointer(ptbl->last_lookup, part);
 231                        return part;
 232                }
 233        }
 234        return &disk->part0;
 235}
 236EXPORT_SYMBOL_GPL(disk_map_sector_rcu);
 237
 238/*
 239 * Can be deleted altogether. Later.
 240 *
 241 */
/*
 * One registered block major.  Entries that hash to the same bucket are
 * chained through ->next; buckets are selected with major_to_index().
 */
static struct blk_major_name {
        struct blk_major_name *next;    /* next entry in the same bucket */
        int major;                      /* registered major number */
        char name[16];                  /* name given at registration, copied with strlcpy() */
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
 247
 248/* index in the above - for now: assume no multimajor ranges */
 249static inline int major_to_index(unsigned major)
 250{
 251        return major % BLKDEV_MAJOR_HASH_SIZE;
 252}
 253
 254#ifdef CONFIG_PROC_FS
 255void blkdev_show(struct seq_file *seqf, off_t offset)
 256{
 257        struct blk_major_name *dp;
 258
 259        if (offset < BLKDEV_MAJOR_HASH_SIZE) {
 260                mutex_lock(&block_class_lock);
 261                for (dp = major_names[offset]; dp; dp = dp->next)
 262                        seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
 263                mutex_unlock(&block_class_lock);
 264        }
 265}
 266#endif /* CONFIG_PROC_FS */
 267
 268/**
 269 * register_blkdev - register a new block device
 270 *
 271 * @major: the requested major device number [1..255]. If @major=0, try to
 272 *         allocate any unused major number.
 273 * @name: the name of the new block device as a zero terminated string
 274 *
 275 * The @name must be unique within the system.
 276 *
 277 * The return value depends on the @major input parameter.
 278 *  - if a major device number was requested in range [1..255] then the
 279 *    function returns zero on success, or a negative error code
 280 *  - if any unused major number was requested with @major=0 parameter
 281 *    then the return value is the allocated major number in range
 282 *    [1..255] or a negative error code otherwise
 283 */
 284int register_blkdev(unsigned int major, const char *name)
 285{
 286        struct blk_major_name **n, *p;
 287        int index, ret = 0;
 288
 289        mutex_lock(&block_class_lock);
 290
 291        /* temporary */
 292        if (major == 0) {
 293                for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
 294                        if (major_names[index] == NULL)
 295                                break;
 296                }
 297
 298                if (index == 0) {
 299                        printk("register_blkdev: failed to get major for %s\n",
 300                               name);
 301                        ret = -EBUSY;
 302                        goto out;
 303                }
 304                major = index;
 305                ret = major;
 306        }
 307
 308        p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
 309        if (p == NULL) {
 310                ret = -ENOMEM;
 311                goto out;
 312        }
 313
 314        p->major = major;
 315        strlcpy(p->name, name, sizeof(p->name));
 316        p->next = NULL;
 317        index = major_to_index(major);
 318
 319        for (n = &major_names[index]; *n; n = &(*n)->next) {
 320                if ((*n)->major == major)
 321                        break;
 322        }
 323        if (!*n)
 324                *n = p;
 325        else
 326                ret = -EBUSY;
 327
 328        if (ret < 0) {
 329                printk("register_blkdev: cannot get major %d for %s\n",
 330                       major, name);
 331                kfree(p);
 332        }
 333out:
 334        mutex_unlock(&block_class_lock);
 335        return ret;
 336}
 337
 338EXPORT_SYMBOL(register_blkdev);
 339
 340void unregister_blkdev(unsigned int major, const char *name)
 341{
 342        struct blk_major_name **n;
 343        struct blk_major_name *p = NULL;
 344        int index = major_to_index(major);
 345
 346        mutex_lock(&block_class_lock);
 347        for (n = &major_names[index]; *n; n = &(*n)->next)
 348                if ((*n)->major == major)
 349                        break;
 350        if (!*n || strcmp((*n)->name, name)) {
 351                WARN_ON(1);
 352        } else {
 353                p = *n;
 354                *n = p->next;
 355        }
 356        mutex_unlock(&block_class_lock);
 357        kfree(p);
 358}
 359
 360EXPORT_SYMBOL(unregister_blkdev);
 361
/*
 * dev_t range map: created in genhd_device_init(), filled through
 * blk_register_region() and consulted by get_gendisk().
 */
static struct kobj_map *bdev_map;
 363
 364/**
 365 * blk_mangle_minor - scatter minor numbers apart
 366 * @minor: minor number to mangle
 367 *
 368 * Scatter consecutively allocated @minor number apart if MANGLE_DEVT
 369 * is enabled.  Mangling twice gives the original value.
 370 *
 371 * RETURNS:
 372 * Mangled value.
 373 *
 374 * CONTEXT:
 375 * Don't care.
 376 */
 377static int blk_mangle_minor(int minor)
 378{
 379#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
 380        int i;
 381
 382        for (i = 0; i < MINORBITS / 2; i++) {
 383                int low = minor & (1 << i);
 384                int high = minor & (1 << (MINORBITS - 1 - i));
 385                int distance = MINORBITS - 1 - 2 * i;
 386
 387                minor ^= low | high;    /* clear both bits */
 388                low <<= distance;       /* swap the positions */
 389                high >>= distance;
 390                minor |= low | high;    /* and set */
 391        }
 392#endif
 393        return minor;
 394}
 395
 396/**
 397 * blk_alloc_devt - allocate a dev_t for a partition
 398 * @part: partition to allocate dev_t for
 399 * @devt: out parameter for resulting dev_t
 400 *
 401 * Allocate a dev_t for block device.
 402 *
 403 * RETURNS:
 404 * 0 on success, allocated dev_t is returned in *@devt.  -errno on
 405 * failure.
 406 *
 407 * CONTEXT:
 408 * Might sleep.
 409 */
 410int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
 411{
 412        struct gendisk *disk = part_to_disk(part);
 413        int idx, rc;
 414
 415        /* in consecutive minor range? */
 416        if (part->partno < disk->minors) {
 417                *devt = MKDEV(disk->major, disk->first_minor + part->partno);
 418                return 0;
 419        }
 420
 421        /* allocate ext devt */
 422        do {
 423                if (!idr_pre_get(&ext_devt_idr, GFP_KERNEL))
 424                        return -ENOMEM;
 425                rc = idr_get_new(&ext_devt_idr, part, &idx);
 426        } while (rc == -EAGAIN);
 427
 428        if (rc)
 429                return rc;
 430
 431        if (idx > MAX_EXT_DEVT) {
 432                idr_remove(&ext_devt_idr, idx);
 433                return -EBUSY;
 434        }
 435
 436        *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx));
 437        return 0;
 438}
 439
 440/**
 441 * blk_free_devt - free a dev_t
 442 * @devt: dev_t to free
 443 *
 444 * Free @devt which was allocated using blk_alloc_devt().
 445 *
 446 * CONTEXT:
 447 * Might sleep.
 448 */
 449void blk_free_devt(dev_t devt)
 450{
 451        might_sleep();
 452
 453        if (devt == MKDEV(0, 0))
 454                return;
 455
 456        if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
 457                mutex_lock(&ext_devt_mutex);
 458                idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
 459                mutex_unlock(&ext_devt_mutex);
 460        }
 461}
 462
 463static char *bdevt_str(dev_t devt, char *buf)
 464{
 465        if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) {
 466                char tbuf[BDEVT_SIZE];
 467                snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt));
 468                snprintf(buf, BDEVT_SIZE, "%-9s", tbuf);
 469        } else
 470                snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt));
 471
 472        return buf;
 473}
 474
/*
 * Register device numbers devt..(devt+range-1)
 * range must be nonzero
 * The hash chain is sorted on range, so that subranges can override.
 *
 * All arguments are handed unchanged to kobj_map() on bdev_map; the
 * value kobj_map() returns is not propagated to the caller.
 */
void blk_register_region(dev_t devt, unsigned long range, struct module *module,
                         struct kobject *(*probe)(dev_t, int *, void *),
                         int (*lock)(dev_t, void *), void *data)
{
        kobj_map(bdev_map, devt, range, module, probe, lock, data);
}
 486
 487EXPORT_SYMBOL(blk_register_region);
 488
/* Remove device numbers devt..(devt+range-1) from bdev_map via kobj_unmap(). */
void blk_unregister_region(dev_t devt, unsigned long range)
{
        kobj_unmap(bdev_map, devt, range);
}
 493
 494EXPORT_SYMBOL(blk_unregister_region);
 495
 496static struct kobject *exact_match(dev_t devt, int *partno, void *data)
 497{
 498        struct gendisk *p = data;
 499
 500        return &disk_to_dev(p)->kobj;
 501}
 502
 503static int exact_lock(dev_t devt, void *data)
 504{
 505        struct gendisk *p = data;
 506
 507        if (!get_disk(p))
 508                return -1;
 509        return 0;
 510}
 511
 512static void register_disk(struct gendisk *disk)
 513{
 514        struct device *ddev = disk_to_dev(disk);
 515        struct block_device *bdev;
 516        struct disk_part_iter piter;
 517        struct hd_struct *part;
 518        int err;
 519
 520        ddev->parent = disk->driverfs_dev;
 521
 522        dev_set_name(ddev, disk->disk_name);
 523
 524        /* delay uevents, until we scanned partition table */
 525        dev_set_uevent_suppress(ddev, 1);
 526
 527        if (device_add(ddev))
 528                return;
 529        if (!sysfs_deprecated) {
 530                err = sysfs_create_link(block_depr, &ddev->kobj,
 531                                        kobject_name(&ddev->kobj));
 532                if (err) {
 533                        device_del(ddev);
 534                        return;
 535                }
 536        }
 537        disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
 538        disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
 539
 540        /* No minors to use for partitions */
 541        if (!disk_part_scan_enabled(disk))
 542                goto exit;
 543
 544        /* No such device (e.g., media were just removed) */
 545        if (!get_capacity(disk))
 546                goto exit;
 547
 548        bdev = bdget_disk(disk, 0);
 549        if (!bdev)
 550                goto exit;
 551
 552        bdev->bd_invalidated = 1;
 553        err = blkdev_get(bdev, FMODE_READ, NULL);
 554        if (err < 0)
 555                goto exit;
 556        blkdev_put(bdev, FMODE_READ);
 557
 558exit:
 559        /* announce disk after possible partitions are created */
 560        dev_set_uevent_suppress(ddev, 0);
 561        kobject_uevent(&ddev->kobj, KOBJ_ADD);
 562
 563        /* announce possible partitions */
 564        disk_part_iter_init(&piter, disk, 0);
 565        while ((part = disk_part_iter_next(&piter)))
 566                kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
 567        disk_part_iter_exit(&piter);
 568}
 569
 570/**
 571 * add_disk - add partitioning information to kernel list
 572 * @disk: per-device partitioning information
 573 *
 574 * This function registers the partitioning information in @disk
 575 * with the kernel.
 576 *
 577 * FIXME: error handling
 578 */
void add_disk(struct gendisk *disk)
{
        struct backing_dev_info *bdi;
        dev_t devt;
        int retval;

        /* minors == 0 indicates to use ext devt from part0 and should
         * be accompanied with EXT_DEVT flag.  Make sure all
         * parameters make sense.
         */
        WARN_ON(disk->minors && !(disk->major || disk->first_minor));
        WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT));

        disk->flags |= GENHD_FL_UP;

        /*
         * Allocate the dev_t for part0 (the whole disk).  On failure the
         * function warns and returns; note that GENHD_FL_UP stays set.
         */
        retval = blk_alloc_devt(&disk->part0, &devt);
        if (retval) {
                WARN_ON(1);
                return;
        }
        disk_to_dev(disk)->devt = devt;

        /* ->major and ->first_minor aren't supposed to be
         * dereferenced from here on, but set them just in case.
         */
        disk->major = MAJOR(devt);
        disk->first_minor = MINOR(devt);

        disk_alloc_events(disk);

        /* Register BDI before referencing it from bdev */
        bdi = &disk->queue->backing_dev_info;
        bdi_register_dev(bdi, disk_devt(disk));

        /* map the disk's dev_t range to this gendisk, then register it */
        blk_register_region(disk_devt(disk), disk->minors, NULL,
                            exact_match, exact_lock, disk);
        register_disk(disk);
        blk_register_queue(disk);

        /*
         * Take an extra ref on queue which will be put on disk_release()
         * so that it sticks around as long as @disk is there.
         */
        WARN_ON_ONCE(!blk_get_queue(disk->queue));

        /* "bdi" link in the disk's sysfs directory; failure only warns */
        retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
                                   "bdi");
        WARN_ON(retval);

        disk_add_events(disk);
}
 630EXPORT_SYMBOL(add_disk);
 631
/*
 * Tear down what add_disk() set up for @disk: stop event handling,
 * delete all partitions, release the dev_t, unregister the BDI, queue
 * and dev_t region, and finally remove the device.
 */
void del_gendisk(struct gendisk *disk)
{
        struct disk_part_iter piter;
        struct hd_struct *part;

        disk_del_events(disk);

        /* invalidate stuff */
        /* walk partitions from the highest number down, empty ones included */
        disk_part_iter_init(&piter, disk,
                             DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
        while ((part = disk_part_iter_next(&piter))) {
                invalidate_partition(disk, part->partno);
                delete_partition(disk, part->partno);
        }
        disk_part_iter_exit(&piter);

        /* now the whole-disk entry itself */
        invalidate_partition(disk, 0);
        blk_free_devt(disk_to_dev(disk)->devt);
        set_capacity(disk, 0);
        disk->flags &= ~GENHD_FL_UP;

        sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
        bdi_unregister(&disk->queue->backing_dev_info);
        blk_unregister_queue(disk);
        blk_unregister_region(disk_devt(disk), disk->minors);

        /* reset part0 statistics */
        part_stat_set_all(&disk->part0, 0);
        disk->part0.stamp = 0;

        kobject_put(disk->part0.holder_dir);
        kobject_put(disk->slave_dir);
        disk->driverfs_dev = NULL;
        if (!sysfs_deprecated)
                sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
        device_del(disk_to_dev(disk));
}
 668EXPORT_SYMBOL(del_gendisk);
 669
 670/**
 671 * get_gendisk - get partitioning information for a given device
 672 * @devt: device to get partitioning information for
 673 * @partno: returned partition index
 674 *
 675 * This function gets the structure containing partitioning
 676 * information for the given device @devt.
 677 */
 678struct gendisk *get_gendisk(dev_t devt, int *partno)
 679{
 680        struct gendisk *disk = NULL;
 681
 682        if (MAJOR(devt) != BLOCK_EXT_MAJOR) {
 683                struct kobject *kobj;
 684
 685                kobj = kobj_lookup(bdev_map, devt, partno);
 686                if (kobj)
 687                        disk = dev_to_disk(kobj_to_dev(kobj));
 688        } else {
 689                struct hd_struct *part;
 690
 691                mutex_lock(&ext_devt_mutex);
 692                part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
 693                if (part && get_disk(part_to_disk(part))) {
 694                        *partno = part->partno;
 695                        disk = part_to_disk(part);
 696                }
 697                mutex_unlock(&ext_devt_mutex);
 698        }
 699
 700        return disk;
 701}
 702EXPORT_SYMBOL(get_gendisk);
 703
 704/**
 705 * bdget_disk - do bdget() by gendisk and partition number
 706 * @disk: gendisk of interest
 707 * @partno: partition number
 708 *
 709 * Find partition @partno from @disk, do bdget() on it.
 710 *
 711 * CONTEXT:
 712 * Don't care.
 713 *
 714 * RETURNS:
 715 * Resulting block_device on success, NULL on failure.
 716 */
 717struct block_device *bdget_disk(struct gendisk *disk, int partno)
 718{
 719        struct hd_struct *part;
 720        struct block_device *bdev = NULL;
 721
 722        part = disk_get_part(disk, partno);
 723        if (part)
 724                bdev = bdget(part_devt(part));
 725        disk_put_part(part);
 726
 727        return bdev;
 728}
 729EXPORT_SYMBOL(bdget_disk);
 730
 731/*
 732 * print a full list of all partitions - intended for places where the root
 733 * filesystem can't be mounted and thus to give the victim some idea of what
 734 * went wrong
 735 */
 736void __init printk_all_partitions(void)
 737{
 738        struct class_dev_iter iter;
 739        struct device *dev;
 740
 741        class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
 742        while ((dev = class_dev_iter_next(&iter))) {
 743                struct gendisk *disk = dev_to_disk(dev);
 744                struct disk_part_iter piter;
 745                struct hd_struct *part;
 746                char name_buf[BDEVNAME_SIZE];
 747                char devt_buf[BDEVT_SIZE];
 748
 749                /*
 750                 * Don't show empty devices or things that have been
 751                 * suppressed
 752                 */
 753                if (get_capacity(disk) == 0 ||
 754                    (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
 755                        continue;
 756
 757                /*
 758                 * Note, unlike /proc/partitions, I am showing the
 759                 * numbers in hex - the same format as the root=
 760                 * option takes.
 761                 */
 762                disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
 763                while ((part = disk_part_iter_next(&piter))) {
 764                        bool is_part0 = part == &disk->part0;
 765
 766                        printk("%s%s %10llu %s %s", is_part0 ? "" : "  ",
 767                               bdevt_str(part_devt(part), devt_buf),
 768                               (unsigned long long)part_nr_sects_read(part) >> 1
 769                               , disk_name(disk, part->partno, name_buf),
 770                               part->info ? part->info->uuid : "");
 771                        if (is_part0) {
 772                                if (disk->driverfs_dev != NULL &&
 773                                    disk->driverfs_dev->driver != NULL)
 774                                        printk(" driver: %s\n",
 775                                              disk->driverfs_dev->driver->name);
 776                                else
 777                                        printk(" (driver?)\n");
 778                        } else
 779                                printk("\n");
 780                }
 781                disk_part_iter_exit(&piter);
 782        }
 783        class_dev_iter_exit(&iter);
 784}
 785
 786#ifdef CONFIG_PROC_FS
 787/* iterator */
 788static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
 789{
 790        loff_t skip = *pos;
 791        struct class_dev_iter *iter;
 792        struct device *dev;
 793
 794        iter = kmalloc(sizeof(*iter), GFP_KERNEL);
 795        if (!iter)
 796                return ERR_PTR(-ENOMEM);
 797
 798        seqf->private = iter;
 799        class_dev_iter_init(iter, &block_class, NULL, &disk_type);
 800        do {
 801                dev = class_dev_iter_next(iter);
 802                if (!dev)
 803                        return NULL;
 804        } while (skip--);
 805
 806        return dev_to_disk(dev);
 807}
 808
 809static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos)
 810{
 811        struct device *dev;
 812
 813        (*pos)++;
 814        dev = class_dev_iter_next(seqf->private);
 815        if (dev)
 816                return dev_to_disk(dev);
 817
 818        return NULL;
 819}
 820
 821static void disk_seqf_stop(struct seq_file *seqf, void *v)
 822{
 823        struct class_dev_iter *iter = seqf->private;
 824
 825        /* stop is called even after start failed :-( */
 826        if (iter) {
 827                class_dev_iter_exit(iter);
 828                kfree(iter);
 829        }
 830}
 831
 832static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
 833{
 834        void *p;
 835
 836        p = disk_seqf_start(seqf, pos);
 837        if (!IS_ERR_OR_NULL(p) && !*pos)
 838                seq_puts(seqf, "major minor  #blocks  name\n\n");
 839        return p;
 840}
 841
 842static int show_partition(struct seq_file *seqf, void *v)
 843{
 844        struct gendisk *sgp = v;
 845        struct disk_part_iter piter;
 846        struct hd_struct *part;
 847        char buf[BDEVNAME_SIZE];
 848
 849        /* Don't show non-partitionable removeable devices or empty devices */
 850        if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
 851                                   (sgp->flags & GENHD_FL_REMOVABLE)))
 852                return 0;
 853        if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
 854                return 0;
 855
 856        /* show the full disk and all non-0 size partitions of it */
 857        disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0);
 858        while ((part = disk_part_iter_next(&piter)))
 859                seq_printf(seqf, "%4d  %7d %10llu %s\n",
 860                           MAJOR(part_devt(part)), MINOR(part_devt(part)),
 861                           (unsigned long long)part_nr_sects_read(part) >> 1,
 862                           disk_name(sgp, part->partno, buf));
 863        disk_part_iter_exit(&piter);
 864
 865        return 0;
 866}
 867
/* seq_file callbacks used by partitions_open(). */
static const struct seq_operations partitions_op = {
        .start  = show_partition_start,
        .next   = disk_seqf_next,
        .stop   = disk_seqf_stop,
        .show   = show_partition
};
 874
/* ->open handler: attach the partitions_op seq_file iterator to @file. */
static int partitions_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &partitions_op);
}
 879
/*
 * File operations built on the generic seq_file helpers; only ->open is
 * specific to this file.
 */
static const struct file_operations proc_partitions_operations = {
        .open           = partitions_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release,
};
 886#endif
 887
 888
 889static struct kobject *base_probe(dev_t devt, int *partno, void *data)
 890{
 891        if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
 892                /* Make old-style 2.4 aliases work */
 893                request_module("block-major-%d", MAJOR(devt));
 894        return NULL;
 895}
 896
/*
 * Subsystem init: register the block class, create the dev_t range map,
 * run blk_dev_init(), reserve the extended major and, unless
 * sysfs_deprecated is set, create the top-level "block" kobject.
 *
 * Returns the class_register() error if that fails, otherwise 0.  The
 * results of kobj_map_init(), register_blkdev() and
 * kobject_create_and_add() are not checked here.
 */
static int __init genhd_device_init(void)
{
        int error;

        block_class.dev_kobj = sysfs_dev_block_kobj;
        error = class_register(&block_class);
        if (unlikely(error))
                return error;
        /* base_probe is the map's default probe; block_class_lock is its lock */
        bdev_map = kobj_map_init(base_probe, &block_class_lock);
        blk_dev_init();

        register_blkdev(BLOCK_EXT_MAJOR, "blkext");

        /* create top-level block dir */
        if (!sysfs_deprecated)
                block_depr = kobject_create_and_add("block", NULL);
        return 0;
}
 915
 916subsys_initcall(genhd_device_init);
 917
 918static ssize_t disk_range_show(struct device *dev,
 919                               struct device_attribute *attr, char *buf)
 920{
 921        struct gendisk *disk = dev_to_disk(dev);
 922
 923        return sprintf(buf, "%d\n", disk->minors);
 924}
 925
 926static ssize_t disk_ext_range_show(struct device *dev,
 927                                   struct device_attribute *attr, char *buf)
 928{
 929        struct gendisk *disk = dev_to_disk(dev);
 930
 931        return sprintf(buf, "%d\n", disk_max_parts(disk));
 932}
 933
 934static ssize_t disk_removable_show(struct device *dev,
 935                                   struct device_attribute *attr, char *buf)
 936{
 937        struct gendisk *disk = dev_to_disk(dev);
 938
 939        return sprintf(buf, "%d\n",
 940                       (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
 941}
 942
 943static ssize_t disk_ro_show(struct device *dev,
 944                                   struct device_attribute *attr, char *buf)
 945{
 946        struct gendisk *disk = dev_to_disk(dev);
 947
 948        return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0);
 949}
 950
 951static ssize_t disk_capability_show(struct device *dev,
 952                                    struct device_attribute *attr, char *buf)
 953{
 954        struct gendisk *disk = dev_to_disk(dev);
 955
 956        return sprintf(buf, "%x\n", disk->flags);
 957}
 958
 959static ssize_t disk_alignment_offset_show(struct device *dev,
 960                                          struct device_attribute *attr,
 961                                          char *buf)
 962{
 963        struct gendisk *disk = dev_to_disk(dev);
 964
 965        return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
 966}
 967
 968static ssize_t disk_discard_alignment_show(struct device *dev,
 969                                           struct device_attribute *attr,
 970                                           char *buf)
 971{
 972        struct gendisk *disk = dev_to_disk(dev);
 973
 974        return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
 975}
 976
 977static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 978static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 979static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
 980static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
 981static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 982static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
 983static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
 984                   NULL);
 985static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 986static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 987static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 988#ifdef CONFIG_FAIL_MAKE_REQUEST
 989static struct device_attribute dev_attr_fail =
 990        __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
 991#endif
 992#ifdef CONFIG_FAIL_IO_TIMEOUT
 993static struct device_attribute dev_attr_fail_timeout =
 994        __ATTR(io-timeout-fail,  S_IRUGO|S_IWUSR, part_timeout_show,
 995                part_timeout_store);
 996#endif
 997
 998static struct attribute *disk_attrs[] = {
 999        &dev_attr_range.attr,
1000        &dev_attr_ext_range.attr,
1001        &dev_attr_removable.attr,
1002        &dev_attr_ro.attr,
1003        &dev_attr_size.attr,
1004        &dev_attr_alignment_offset.attr,
1005        &dev_attr_discard_alignment.attr,
1006        &dev_attr_capability.attr,
1007        &dev_attr_stat.attr,
1008        &dev_attr_inflight.attr,
1009#ifdef CONFIG_FAIL_MAKE_REQUEST
1010        &dev_attr_fail.attr,
1011#endif
1012#ifdef CONFIG_FAIL_IO_TIMEOUT
1013        &dev_attr_fail_timeout.attr,
1014#endif
1015        NULL
1016};
1017
1018static struct attribute_group disk_attr_group = {
1019        .attrs = disk_attrs,
1020};
1021
1022static const struct attribute_group *disk_attr_groups[] = {
1023        &disk_attr_group,
1024        NULL
1025};
1026
1027/**
1028 * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way
1029 * @disk: disk to replace part_tbl for
1030 * @new_ptbl: new part_tbl to install
1031 *
1032 * Replace disk->part_tbl with @new_ptbl in RCU-safe way.  The
1033 * original ptbl is freed using RCU callback.
1034 *
1035 * LOCKING:
1036 * Matching bd_mutx locked.
1037 */
1038static void disk_replace_part_tbl(struct gendisk *disk,
1039                                  struct disk_part_tbl *new_ptbl)
1040{
1041        struct disk_part_tbl *old_ptbl = disk->part_tbl;
1042
1043        rcu_assign_pointer(disk->part_tbl, new_ptbl);
1044
1045        if (old_ptbl) {
1046                rcu_assign_pointer(old_ptbl->last_lookup, NULL);
1047                kfree_rcu(old_ptbl, rcu_head);
1048        }
1049}
1050
1051/**
1052 * disk_expand_part_tbl - expand disk->part_tbl
1053 * @disk: disk to expand part_tbl for
1054 * @partno: expand such that this partno can fit in
1055 *
1056 * Expand disk->part_tbl such that @partno can fit in.  disk->part_tbl
1057 * uses RCU to allow unlocked dereferencing for stats and other stuff.
1058 *
1059 * LOCKING:
1060 * Matching bd_mutex locked, might sleep.
1061 *
1062 * RETURNS:
1063 * 0 on success, -errno on failure.
1064 */
1065int disk_expand_part_tbl(struct gendisk *disk, int partno)
1066{
1067        struct disk_part_tbl *old_ptbl = disk->part_tbl;
1068        struct disk_part_tbl *new_ptbl;
1069        int len = old_ptbl ? old_ptbl->len : 0;
1070        int target = partno + 1;
1071        size_t size;
1072        int i;
1073
1074        /* disk_max_parts() is zero during initialization, ignore if so */
1075        if (disk_max_parts(disk) && target > disk_max_parts(disk))
1076                return -EINVAL;
1077
1078        if (target <= len)
1079                return 0;
1080
1081        size = sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]);
1082        new_ptbl = kzalloc_node(size, GFP_KERNEL, disk->node_id);
1083        if (!new_ptbl)
1084                return -ENOMEM;
1085
1086        new_ptbl->len = target;
1087
1088        for (i = 0; i < len; i++)
1089                rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
1090
1091        disk_replace_part_tbl(disk, new_ptbl);
1092        return 0;
1093}
1094
1095static void disk_release(struct device *dev)
1096{
1097        struct gendisk *disk = dev_to_disk(dev);
1098
1099        disk_release_events(disk);
1100        kfree(disk->random);
1101        disk_replace_part_tbl(disk, NULL);
1102        free_part_stats(&disk->part0);
1103        free_part_info(&disk->part0);
1104        if (disk->queue)
1105                blk_put_queue(disk->queue);
1106        kfree(disk);
1107}
1108struct class block_class = {
1109        .name           = "block",
1110};
1111
1112static char *block_devnode(struct device *dev, umode_t *mode)
1113{
1114        struct gendisk *disk = dev_to_disk(dev);
1115
1116        if (disk->devnode)
1117                return disk->devnode(disk, mode);
1118        return NULL;
1119}
1120
1121static struct device_type disk_type = {
1122        .name           = "disk",
1123        .groups         = disk_attr_groups,
1124        .release        = disk_release,
1125        .devnode        = block_devnode,
1126};
1127
1128#ifdef CONFIG_PROC_FS
1129/*
1130 * aggregate disk stat collector.  Uses the same stats that the sysfs
1131 * entries do, above, but makes them available through one seq_file.
1132 *
1133 * The output looks suspiciously like /proc/partitions with a bunch of
1134 * extra fields.
1135 */
1136static int diskstats_show(struct seq_file *seqf, void *v)
1137{
1138        struct gendisk *gp = v;
1139        struct disk_part_iter piter;
1140        struct hd_struct *hd;
1141        char buf[BDEVNAME_SIZE];
1142        int cpu;
1143
1144        /*
1145        if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
1146                seq_puts(seqf,  "major minor name"
1147                                "     rio rmerge rsect ruse wio wmerge "
1148                                "wsect wuse running use aveq"
1149                                "\n\n");
1150        */
1151
1152        disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
1153        while ((hd = disk_part_iter_next(&piter))) {
1154                cpu = part_stat_lock();
1155                part_round_stats(cpu, hd);
1156                part_stat_unlock();
1157                seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
1158                           "%u %lu %lu %lu %u %u %u %u\n",
1159                           MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
1160                           disk_name(gp, hd->partno, buf),
1161                           part_stat_read(hd, ios[READ]),
1162                           part_stat_read(hd, merges[READ]),
1163                           part_stat_read(hd, sectors[READ]),
1164                           jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
1165                           part_stat_read(hd, ios[WRITE]),
1166                           part_stat_read(hd, merges[WRITE]),
1167                           part_stat_read(hd, sectors[WRITE]),
1168                           jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
1169                           part_in_flight(hd),
1170                           jiffies_to_msecs(part_stat_read(hd, io_ticks)),
1171                           jiffies_to_msecs(part_stat_read(hd, time_in_queue))
1172                        );
1173        }
1174        disk_part_iter_exit(&piter);
1175
1176        return 0;
1177}
1178
1179static const struct seq_operations diskstats_op = {
1180        .start  = disk_seqf_start,
1181        .next   = disk_seqf_next,
1182        .stop   = disk_seqf_stop,
1183        .show   = diskstats_show
1184};
1185
1186static int diskstats_open(struct inode *inode, struct file *file)
1187{
1188        return seq_open(file, &diskstats_op);
1189}
1190
1191static const struct file_operations proc_diskstats_operations = {
1192        .open           = diskstats_open,
1193        .read           = seq_read,
1194        .llseek         = seq_lseek,
1195        .release        = seq_release,
1196};
1197
1198static int __init proc_genhd_init(void)
1199{
1200        proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
1201        proc_create("partitions", 0, NULL, &proc_partitions_operations);
1202        return 0;
1203}
1204module_init(proc_genhd_init);
1205#endif /* CONFIG_PROC_FS */
1206
1207dev_t blk_lookup_devt(const char *name, int partno)
1208{
1209        dev_t devt = MKDEV(0, 0);
1210        struct class_dev_iter iter;
1211        struct device *dev;
1212
1213        class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
1214        while ((dev = class_dev_iter_next(&iter))) {
1215                struct gendisk *disk = dev_to_disk(dev);
1216                struct hd_struct *part;
1217
1218                if (strcmp(dev_name(dev), name))
1219                        continue;
1220
1221                if (partno < disk->minors) {
1222                        /* We need to return the right devno, even
1223                         * if the partition doesn't exist yet.
1224                         */
1225                        devt = MKDEV(MAJOR(dev->devt),
1226                                     MINOR(dev->devt) + partno);
1227                        break;
1228                }
1229                part = disk_get_part(disk, partno);
1230                if (part) {
1231                        devt = part_devt(part);
1232                        disk_put_part(part);
1233                        break;
1234                }
1235                disk_put_part(part);
1236        }
1237        class_dev_iter_exit(&iter);
1238        return devt;
1239}
1240EXPORT_SYMBOL(blk_lookup_devt);
1241
1242struct gendisk *alloc_disk(int minors)
1243{
1244        return alloc_disk_node(minors, NUMA_NO_NODE);
1245}
1246EXPORT_SYMBOL(alloc_disk);
1247
1248struct gendisk *alloc_disk_node(int minors, int node_id)
1249{
1250        struct gendisk *disk;
1251
1252        disk = kmalloc_node(sizeof(struct gendisk),
1253                                GFP_KERNEL | __GFP_ZERO, node_id);
1254        if (disk) {
1255                if (!init_part_stats(&disk->part0)) {
1256                        kfree(disk);
1257                        return NULL;
1258                }
1259                disk->node_id = node_id;
1260                if (disk_expand_part_tbl(disk, 0)) {
1261                        free_part_stats(&disk->part0);
1262                        kfree(disk);
1263                        return NULL;
1264                }
1265                disk->part_tbl->part[0] = &disk->part0;
1266
1267                /*
1268                 * set_capacity() and get_capacity() currently don't use
1269                 * seqcounter to read/update the part0->nr_sects. Still init
1270                 * the counter as we can read the sectors in IO submission
1271                 * patch using seqence counters.
1272                 *
1273                 * TODO: Ideally set_capacity() and get_capacity() should be
1274                 * converted to make use of bd_mutex and sequence counters.
1275                 */
1276                seqcount_init(&disk->part0.nr_sects_seq);
1277                hd_ref_init(&disk->part0);
1278
1279                disk->minors = minors;
1280                rand_initialize_disk(disk);
1281                disk_to_dev(disk)->class = &block_class;
1282                disk_to_dev(disk)->type = &disk_type;
1283                device_initialize(disk_to_dev(disk));
1284        }
1285        return disk;
1286}
1287EXPORT_SYMBOL(alloc_disk_node);
1288
1289struct kobject *get_disk(struct gendisk *disk)
1290{
1291        struct module *owner;
1292        struct kobject *kobj;
1293
1294        if (!disk->fops)
1295                return NULL;
1296        owner = disk->fops->owner;
1297        if (owner && !try_module_get(owner))
1298                return NULL;
1299        kobj = kobject_get(&disk_to_dev(disk)->kobj);
1300        if (kobj == NULL) {
1301                module_put(owner);
1302                return NULL;
1303        }
1304        return kobj;
1305
1306}
1307
1308EXPORT_SYMBOL(get_disk);
1309
1310void put_disk(struct gendisk *disk)
1311{
1312        if (disk)
1313                kobject_put(&disk_to_dev(disk)->kobj);
1314}
1315
1316EXPORT_SYMBOL(put_disk);
1317
1318static void set_disk_ro_uevent(struct gendisk *gd, int ro)
1319{
1320        char event[] = "DISK_RO=1";
1321        char *envp[] = { event, NULL };
1322
1323        if (!ro)
1324                event[8] = '0';
1325        kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
1326}
1327
1328void set_device_ro(struct block_device *bdev, int flag)
1329{
1330        bdev->bd_part->policy = flag;
1331}
1332
1333EXPORT_SYMBOL(set_device_ro);
1334
1335void set_disk_ro(struct gendisk *disk, int flag)
1336{
1337        struct disk_part_iter piter;
1338        struct hd_struct *part;
1339
1340        if (disk->part0.policy != flag) {
1341                set_disk_ro_uevent(disk, flag);
1342                disk->part0.policy = flag;
1343        }
1344
1345        disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
1346        while ((part = disk_part_iter_next(&piter)))
1347                part->policy = flag;
1348        disk_part_iter_exit(&piter);
1349}
1350
1351EXPORT_SYMBOL(set_disk_ro);
1352
1353int bdev_read_only(struct block_device *bdev)
1354{
1355        if (!bdev)
1356                return 0;
1357        return bdev->bd_part->policy;
1358}
1359
1360EXPORT_SYMBOL(bdev_read_only);
1361
1362int invalidate_partition(struct gendisk *disk, int partno)
1363{
1364        int res = 0;
1365        struct block_device *bdev = bdget_disk(disk, partno);
1366        if (bdev) {
1367                fsync_bdev(bdev);
1368                res = __invalidate_device(bdev, true);
1369                bdput(bdev);
1370        }
1371        return res;
1372}
1373
1374EXPORT_SYMBOL(invalidate_partition);
1375
1376/*
1377 * Disk events - monitor disk events like media change and eject request.
1378 */
1379struct disk_events {
1380        struct list_head        node;           /* all disk_event's */
1381        struct gendisk          *disk;          /* the associated disk */
1382        spinlock_t              lock;
1383
1384        struct mutex            block_mutex;    /* protects blocking */
1385        int                     block;          /* event blocking depth */
1386        unsigned int            pending;        /* events already sent out */
1387        unsigned int            clearing;       /* events being cleared */
1388
1389        long                    poll_msecs;     /* interval, -1 for default */
1390        struct delayed_work     dwork;
1391};
1392
1393static const char *disk_events_strs[] = {
1394        [ilog2(DISK_EVENT_MEDIA_CHANGE)]        = "media_change",
1395        [ilog2(DISK_EVENT_EJECT_REQUEST)]       = "eject_request",
1396};
1397
1398static char *disk_uevents[] = {
1399        [ilog2(DISK_EVENT_MEDIA_CHANGE)]        = "DISK_MEDIA_CHANGE=1",
1400        [ilog2(DISK_EVENT_EJECT_REQUEST)]       = "DISK_EJECT_REQUEST=1",
1401};
1402
/* list of all disk_events; additions and removals take disk_events_mutex */
static DEFINE_MUTEX(disk_events_mutex);
static LIST_HEAD(disk_events);

/* default poll interval in msecs; 0 disables in-kernel polling by default */
static unsigned long disk_events_dfl_poll_msecs;
1409
1410static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
1411{
1412        struct disk_events *ev = disk->ev;
1413        long intv_msecs = 0;
1414
1415        /*
1416         * If device-specific poll interval is set, always use it.  If
1417         * the default is being used, poll iff there are events which
1418         * can't be monitored asynchronously.
1419         */
1420        if (ev->poll_msecs >= 0)
1421                intv_msecs = ev->poll_msecs;
1422        else if (disk->events & ~disk->async_events)
1423                intv_msecs = disk_events_dfl_poll_msecs;
1424
1425        return msecs_to_jiffies(intv_msecs);
1426}
1427
1428/**
1429 * disk_block_events - block and flush disk event checking
1430 * @disk: disk to block events for
1431 *
1432 * On return from this function, it is guaranteed that event checking
1433 * isn't in progress and won't happen until unblocked by
1434 * disk_unblock_events().  Events blocking is counted and the actual
1435 * unblocking happens after the matching number of unblocks are done.
1436 *
1437 * Note that this intentionally does not block event checking from
1438 * disk_clear_events().
1439 *
1440 * CONTEXT:
1441 * Might sleep.
1442 */
1443void disk_block_events(struct gendisk *disk)
1444{
1445        struct disk_events *ev = disk->ev;
1446        unsigned long flags;
1447        bool cancel;
1448
1449        if (!ev)
1450                return;
1451
1452        /*
1453         * Outer mutex ensures that the first blocker completes canceling
1454         * the event work before further blockers are allowed to finish.
1455         */
1456        mutex_lock(&ev->block_mutex);
1457
1458        spin_lock_irqsave(&ev->lock, flags);
1459        cancel = !ev->block++;
1460        spin_unlock_irqrestore(&ev->lock, flags);
1461
1462        if (cancel)
1463                cancel_delayed_work_sync(&disk->ev->dwork);
1464
1465        mutex_unlock(&ev->block_mutex);
1466}
1467
1468static void __disk_unblock_events(struct gendisk *disk, bool check_now)
1469{
1470        struct disk_events *ev = disk->ev;
1471        unsigned long intv;
1472        unsigned long flags;
1473
1474        spin_lock_irqsave(&ev->lock, flags);
1475
1476        if (WARN_ON_ONCE(ev->block <= 0))
1477                goto out_unlock;
1478
1479        if (--ev->block)
1480                goto out_unlock;
1481
1482        /*
1483         * Not exactly a latency critical operation, set poll timer
1484         * slack to 25% and kick event check.
1485         */
1486        intv = disk_events_poll_jiffies(disk);
1487        set_timer_slack(&ev->dwork.timer, intv / 4);
1488        if (check_now)
1489                queue_delayed_work(system_freezable_wq, &ev->dwork, 0);
1490        else if (intv)
1491                queue_delayed_work(system_freezable_wq, &ev->dwork, intv);
1492out_unlock:
1493        spin_unlock_irqrestore(&ev->lock, flags);
1494}
1495
1496/**
1497 * disk_unblock_events - unblock disk event checking
1498 * @disk: disk to unblock events for
1499 *
1500 * Undo disk_block_events().  When the block count reaches zero, it
1501 * starts events polling if configured.
1502 *
1503 * CONTEXT:
1504 * Don't care.  Safe to call from irq context.
1505 */
1506void disk_unblock_events(struct gendisk *disk)
1507{
1508        if (disk->ev)
1509                __disk_unblock_events(disk, false);
1510}
1511
1512/**
1513 * disk_flush_events - schedule immediate event checking and flushing
1514 * @disk: disk to check and flush events for
1515 * @mask: events to flush
1516 *
1517 * Schedule immediate event checking on @disk if not blocked.  Events in
1518 * @mask are scheduled to be cleared from the driver.  Note that this
1519 * doesn't clear the events from @disk->ev.
1520 *
1521 * CONTEXT:
1522 * If @mask is non-zero must be called with bdev->bd_mutex held.
1523 */
1524void disk_flush_events(struct gendisk *disk, unsigned int mask)
1525{
1526        struct disk_events *ev = disk->ev;
1527
1528        if (!ev)
1529                return;
1530
1531        spin_lock_irq(&ev->lock);
1532        ev->clearing |= mask;
1533        if (!ev->block)
1534                mod_delayed_work(system_freezable_wq, &ev->dwork, 0);
1535        spin_unlock_irq(&ev->lock);
1536}
1537
1538/**
1539 * disk_clear_events - synchronously check, clear and return pending events
1540 * @disk: disk to fetch and clear events from
1541 * @mask: mask of events to be fetched and clearted
1542 *
1543 * Disk events are synchronously checked and pending events in @mask
1544 * are cleared and returned.  This ignores the block count.
1545 *
1546 * CONTEXT:
1547 * Might sleep.
1548 */
1549unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
1550{
1551        const struct block_device_operations *bdops = disk->fops;
1552        struct disk_events *ev = disk->ev;
1553        unsigned int pending;
1554        unsigned int clearing = mask;
1555
1556        if (!ev) {
1557                /* for drivers still using the old ->media_changed method */
1558                if ((mask & DISK_EVENT_MEDIA_CHANGE) &&
1559                    bdops->media_changed && bdops->media_changed(disk))
1560                        return DISK_EVENT_MEDIA_CHANGE;
1561                return 0;
1562        }
1563
1564        disk_block_events(disk);
1565
1566        /*
1567         * store the union of mask and ev->clearing on the stack so that the
1568         * race with disk_flush_events does not cause ambiguity (ev->clearing
1569         * can still be modified even if events are blocked).
1570         */
1571        spin_lock_irq(&ev->lock);
1572        clearing |= ev->clearing;
1573        ev->clearing = 0;
1574        spin_unlock_irq(&ev->lock);
1575
1576        disk_check_events(ev, &clearing);
1577        /*
1578         * if ev->clearing is not 0, the disk_flush_events got called in the
1579         * middle of this function, so we want to run the workfn without delay.
1580         */
1581        __disk_unblock_events(disk, ev->clearing ? true : false);
1582
1583        /* then, fetch and clear pending events */
1584        spin_lock_irq(&ev->lock);
1585        pending = ev->pending & mask;
1586        ev->pending &= ~mask;
1587        spin_unlock_irq(&ev->lock);
1588        WARN_ON_ONCE(clearing & mask);
1589
1590        return pending;
1591}
1592
1593/*
1594 * Separate this part out so that a different pointer for clearing_ptr can be
1595 * passed in for disk_clear_events.
1596 */
1597static void disk_events_workfn(struct work_struct *work)
1598{
1599        struct delayed_work *dwork = to_delayed_work(work);
1600        struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
1601
1602        disk_check_events(ev, &ev->clearing);
1603}
1604
1605static void disk_check_events(struct disk_events *ev,
1606                              unsigned int *clearing_ptr)
1607{
1608        struct gendisk *disk = ev->disk;
1609        char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
1610        unsigned int clearing = *clearing_ptr;
1611        unsigned int events;
1612        unsigned long intv;
1613        int nr_events = 0, i;
1614
1615        /* check events */
1616        events = disk->fops->check_events(disk, clearing);
1617
1618        /* accumulate pending events and schedule next poll if necessary */
1619        spin_lock_irq(&ev->lock);
1620
1621        events &= ~ev->pending;
1622        ev->pending |= events;
1623        *clearing_ptr &= ~clearing;
1624
1625        intv = disk_events_poll_jiffies(disk);
1626        if (!ev->block && intv)
1627                queue_delayed_work(system_freezable_wq, &ev->dwork, intv);
1628
1629        spin_unlock_irq(&ev->lock);
1630
1631        /*
1632         * Tell userland about new events.  Only the events listed in
1633         * @disk->events are reported.  Unlisted events are processed the
1634         * same internally but never get reported to userland.
1635         */
1636        for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
1637                if (events & disk->events & (1 << i))
1638                        envp[nr_events++] = disk_uevents[i];
1639
1640        if (nr_events)
1641                kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
1642}
1643
1644/*
1645 * A disk events enabled device has the following sysfs nodes under
1646 * its /sys/block/X/ directory.
1647 *
1648 * events               : list of all supported events
1649 * events_async         : list of events which can be detected w/o polling
1650 * events_poll_msecs    : polling interval, 0: disable, -1: system default
1651 */
1652static ssize_t __disk_events_show(unsigned int events, char *buf)
1653{
1654        const char *delim = "";
1655        ssize_t pos = 0;
1656        int i;
1657
1658        for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
1659                if (events & (1 << i)) {
1660                        pos += sprintf(buf + pos, "%s%s",
1661                                       delim, disk_events_strs[i]);
1662                        delim = " ";
1663                }
1664        if (pos)
1665                pos += sprintf(buf + pos, "\n");
1666        return pos;
1667}
1668
1669static ssize_t disk_events_show(struct device *dev,
1670                                struct device_attribute *attr, char *buf)
1671{
1672        struct gendisk *disk = dev_to_disk(dev);
1673
1674        return __disk_events_show(disk->events, buf);
1675}
1676
1677static ssize_t disk_events_async_show(struct device *dev,
1678                                      struct device_attribute *attr, char *buf)
1679{
1680        struct gendisk *disk = dev_to_disk(dev);
1681
1682        return __disk_events_show(disk->async_events, buf);
1683}
1684
1685static ssize_t disk_events_poll_msecs_show(struct device *dev,
1686                                           struct device_attribute *attr,
1687                                           char *buf)
1688{
1689        struct gendisk *disk = dev_to_disk(dev);
1690
1691        return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
1692}
1693
1694static ssize_t disk_events_poll_msecs_store(struct device *dev,
1695                                            struct device_attribute *attr,
1696                                            const char *buf, size_t count)
1697{
1698        struct gendisk *disk = dev_to_disk(dev);
1699        long intv;
1700
1701        if (!count || !sscanf(buf, "%ld", &intv))
1702                return -EINVAL;
1703
1704        if (intv < 0 && intv != -1)
1705                return -EINVAL;
1706
1707        disk_block_events(disk);
1708        disk->ev->poll_msecs = intv;
1709        __disk_unblock_events(disk, true);
1710
1711        return count;
1712}
1713
1714static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL);
1715static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL);
1716static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR,
1717                         disk_events_poll_msecs_show,
1718                         disk_events_poll_msecs_store);
1719
1720static const struct attribute *disk_events_attrs[] = {
1721        &dev_attr_events.attr,
1722        &dev_attr_events_async.attr,
1723        &dev_attr_events_poll_msecs.attr,
1724        NULL,
1725};
1726
1727/*
1728 * The default polling interval can be specified by the kernel
1729 * parameter block.events_dfl_poll_msecs which defaults to 0
1730 * (disable).  This can also be modified runtime by writing to
1731 * /sys/module/block/events_dfl_poll_msecs.
1732 */
1733static int disk_events_set_dfl_poll_msecs(const char *val,
1734                                          const struct kernel_param *kp)
1735{
1736        struct disk_events *ev;
1737        int ret;
1738
1739        ret = param_set_ulong(val, kp);
1740        if (ret < 0)
1741                return ret;
1742
1743        mutex_lock(&disk_events_mutex);
1744
1745        list_for_each_entry(ev, &disk_events, node)
1746                disk_flush_events(ev->disk, 0);
1747
1748        mutex_unlock(&disk_events_mutex);
1749
1750        return 0;
1751}
1752
1753static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
1754        .set    = disk_events_set_dfl_poll_msecs,
1755        .get    = param_get_ulong,
1756};
1757
1758#undef MODULE_PARAM_PREFIX
1759#define MODULE_PARAM_PREFIX     "block."
1760
1761module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
1762                &disk_events_dfl_poll_msecs, 0644);
1763
1764/*
1765 * disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
1766 */
1767static void disk_alloc_events(struct gendisk *disk)
1768{
1769        struct disk_events *ev;
1770
1771        if (!disk->fops->check_events)
1772                return;
1773
1774        ev = kzalloc(sizeof(*ev), GFP_KERNEL);
1775        if (!ev) {
1776                pr_warn("%s: failed to initialize events\n", disk->disk_name);
1777                return;
1778        }
1779
1780        INIT_LIST_HEAD(&ev->node);
1781        ev->disk = disk;
1782        spin_lock_init(&ev->lock);
1783        mutex_init(&ev->block_mutex);
1784        ev->block = 1;
1785        ev->poll_msecs = -1;
1786        INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
1787
1788        disk->ev = ev;
1789}
1790
1791static void disk_add_events(struct gendisk *disk)
1792{
1793        if (!disk->ev)
1794                return;
1795
1796        /* FIXME: error handling */
1797        if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
1798                pr_warn("%s: failed to create sysfs files for events\n",
1799                        disk->disk_name);
1800
1801        mutex_lock(&disk_events_mutex);
1802        list_add_tail(&disk->ev->node, &disk_events);
1803        mutex_unlock(&disk_events_mutex);
1804
1805        /*
1806         * Block count is initialized to 1 and the following initial
1807         * unblock kicks it into action.
1808         */
1809        __disk_unblock_events(disk, true);
1810}
1811
1812static void disk_del_events(struct gendisk *disk)
1813{
1814        if (!disk->ev)
1815                return;
1816
1817        disk_block_events(disk);
1818
1819        mutex_lock(&disk_events_mutex);
1820        list_del_init(&disk->ev->node);
1821        mutex_unlock(&disk_events_mutex);
1822
1823        sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
1824}
1825
1826static void disk_release_events(struct gendisk *disk)
1827{
1828        /* the block count should be 1 from disk_del_events() */
1829        WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
1830        kfree(disk->ev);
1831}
1832
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.