linux/drivers/md/md.c
<<
>>
Prefs
   1/*
   2   md.c : Multiple Devices driver for Linux
   3          Copyright (C) 1998, 1999, 2000 Ingo Molnar
   4
   5     completely rewritten, based on the MD driver code from Marc Zyngier
   6
   7   Changes:
   8
   9   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
  10   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
  11   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
  12   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
  13   - kmod support by: Cyrus Durgin
  14   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
  15   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
  16
  17   - lots of fixes and improvements to the RAID1/RAID5 and generic
  18     RAID code (such as request based resynchronization):
  19
  20     Neil Brown <neilb@cse.unsw.edu.au>.
  21
  22   - persistent bitmap code
  23     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
  24
  25   This program is free software; you can redistribute it and/or modify
  26   it under the terms of the GNU General Public License as published by
  27   the Free Software Foundation; either version 2, or (at your option)
  28   any later version.
  29
  30   You should have received a copy of the GNU General Public License
  31   (for example /usr/src/linux/COPYING); if not, write to the Free
  32   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  33*/
  34
  35#include <linux/module.h>
  36#include <linux/kernel.h>
  37#include <linux/kthread.h>
  38#include <linux/linkage.h>
  39#include <linux/raid/md.h>
  40#include <linux/raid/bitmap.h>
  41#include <linux/sysctl.h>
  42#include <linux/buffer_head.h> /* for invalidate_bdev */
  43#include <linux/poll.h>
  44#include <linux/mutex.h>
  45#include <linux/ctype.h>
  46#include <linux/freezer.h>
  47
  48#include <linux/init.h>
  49
  50#include <linux/file.h>
  51
  52#ifdef CONFIG_KMOD
  53#include <linux/kmod.h>
  54#endif
  55
  56#include <asm/unaligned.h>
  57
/* MAJOR_NR is simply another name for MD_MAJOR. */
  58#define MAJOR_NR MD_MAJOR
/* Defined empty. NOTE(review): presumably tested by an included header -- confirm. */
  59#define MD_DRIVER
  60
  61/* 63 partitions with the alternate major number (mdp) */
/* mddev_find() shifts the minor right by this amount for units whose major is not MD_MAJOR. */
  62#define MdpMinorShift 6
  63
/* With DEBUG set to 0 the && short-circuits, so dprintk() never calls printk(). */
  64#define DEBUG 0
  65#define dprintk(x...) ((void)(DEBUG && printk(x)))
  66
  67
  68#ifndef MODULE
  69static void autostart_arrays (int part);
  70#endif
  71
/* List walked by find_pers(); entries are struct mdk_personality linked via their 'list' member. */
  72static LIST_HEAD(pers_list);
/* NOTE(review): presumably protects pers_list; no user is visible in this part of the file -- confirm. */
  73static DEFINE_SPINLOCK(pers_lock);
  74
  75static void md_print_devices(void);
  76
  77#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
  78
  79/*
  80 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
  81 * is 1000 KB/sec, so the extra system load does not show up that much.
  82 * Increase it if you want to have more _guaranteed_ speed. Note that
  83 * the RAID driver will use the maximum available bandwidth if the IO
  84 * subsystem is idle. There is also an 'absolute maximum' reconstruction
  85 * speed limit - in case reconstruction slows down your system despite
  86 * idle IO detection.
  87 *
  88 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
  89 * or /sys/block/mdX/md/sync_speed_{min,max}
  90 */
  91
/*
 * System-wide defaults (KB/sec, see the comment above).  speed_min() and
 * speed_max() fall back to these when the array's own setting is 0; they
 * are exported through raid_table as speed_limit_min / speed_limit_max.
 */
  92static int sysctl_speed_limit_min = 1000;
  93static int sysctl_speed_limit_max = 200000;
  94static inline int speed_min(mddev_t *mddev)
  95{
  96        return mddev->sync_speed_min ?
  97                mddev->sync_speed_min : sysctl_speed_limit_min;
  98}
  99
 100static inline int speed_max(mddev_t *mddev)
 101{
 102        return mddev->sync_speed_max ?
 103                mddev->sync_speed_max : sysctl_speed_limit_max;
 104}
 105
/* NOTE(review): presumably holds the handle returned when the tables below are registered -- registration is not visible here, confirm. */
 106static struct ctl_table_header *raid_table_header;
 107
/*
 * Leaf sysctl entries: two integer knobs backed by sysctl_speed_limit_min
 * and sysctl_speed_limit_max, readable by everyone and writable by the
 * owner, handled by proc_dointvec.
 */
 108static ctl_table raid_table[] = {
 109        {
 110                .ctl_name       = DEV_RAID_SPEED_LIMIT_MIN,
 111                .procname       = "speed_limit_min",
 112                .data           = &sysctl_speed_limit_min,
 113                .maxlen         = sizeof(int),
 114                .mode           = S_IRUGO|S_IWUSR,
 115                .proc_handler   = &proc_dointvec,
 116        },
 117        {
 118                .ctl_name       = DEV_RAID_SPEED_LIMIT_MAX,
 119                .procname       = "speed_limit_max",
 120                .data           = &sysctl_speed_limit_max,
 121                .maxlen         = sizeof(int),
 122                .mode           = S_IRUGO|S_IWUSR,
 123                .proc_handler   = &proc_dointvec,
 124        },
        /* terminating entry */
 125        { .ctl_name = 0 }
 126};
 127
/* The "raid" directory entry; its children are the knobs in raid_table. */
 128static ctl_table raid_dir_table[] = {
 129        {
 130                .ctl_name       = DEV_RAID,
 131                .procname       = "raid",
 132                .maxlen         = 0,
 133                .mode           = S_IRUGO|S_IXUGO,
 134                .child          = raid_table,
 135        },
        /* terminating entry */
 136        { .ctl_name = 0 }
 137};
 138
/*
 * Top of the chain: the "dev" directory whose child is raid_dir_table,
 * giving the dev/raid/speed_limit_{min,max} hierarchy.
 */
 139static ctl_table raid_root_table[] = {
 140        {
 141                .ctl_name       = CTL_DEV,
 142                .procname       = "dev",
 143                .maxlen         = 0,
 144                .mode           = 0555,
 145                .child          = raid_dir_table,
 146        },
        /* terminating entry */
 147        { .ctl_name = 0 }
 148};
 149
 150static struct block_device_operations md_fops;
 151
/* NOTE(review): not referenced in the visible part of the file; the name suggests arrays are started read-only when non-zero -- confirm. */
 152static int start_readonly;
 153
 154/*
 155 * We have a system wide 'event count' that is incremented
 156 * on any 'interesting' event, and readers of /proc/mdstat
 157 * can use 'poll' or 'select' to find out when the event
 158 * count increases.
 159 *
 160 * Events are:
 161 *  start array, stop array, error, add device, remove device,
 162 *  start build, activate spare
 163 */
/* Wait queue woken by md_new_event() and md_new_event_inintr() on every event. */
 164static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
/* The system-wide event count; only ever incremented in the code visible here. */
 165static atomic_t md_event_count;
/*
 * Record an event for @mddev: bump the global event count, wake everyone
 * sleeping on md_event_waiters, and notify sysfs that the array's
 * "sync_action" attribute may have changed.
 */
 166void md_new_event(mddev_t *mddev)
 167{
 168        atomic_inc(&md_event_count);
 169        wake_up(&md_event_waiters);
 170        sysfs_notify(&mddev->kobj, NULL, "sync_action");
 171}
 172EXPORT_SYMBOL_GPL(md_new_event);
 173
 174/* Alternate version that can be called from interrupts
 175 * when calling sysfs_notify isn't needed.
 *
 * Same as md_new_event() minus the sysfs_notify() call; @mddev is
 * accepted but not used.
 176 */
 177static void md_new_event_inintr(mddev_t *mddev)
 178{
 179        atomic_inc(&md_event_count);
 180        wake_up(&md_event_waiters);
 181}
 182
 183/*
 184 * List of all existing md arrays, so that they can be iterated over.
 185 * all_mddevs_lock protects this list.
 186 */
 187static LIST_HEAD(all_mddevs);
 188static DEFINE_SPINLOCK(all_mddevs_lock);
 189
 190
 191/*
 192 * iterates through all used mddevs in the system.
 193 * We take care to grab the all_mddevs_lock whenever navigating
 194 * the list, and to always hold a refcount when unlocked.
 195 * Any code which breaks out of this loop will own
 196 * a reference to the current mddev and must mddev_put it.
 *
 * How the three clauses of the 'for' cooperate:
 *  - init: take all_mddevs_lock, point tmp at the first entry and
 *    clear mddev.
 *  - condition: if tmp is a real entry take a reference on it, drop
 *    the lock, release the reference held on the previous mddev (if
 *    any), make mddev the entry at tmp, and continue while tmp is not
 *    the list head.  The loop body therefore runs unlocked but with a
 *    reference held.
 *  - step: re-take the lock and advance tmp.
 * When the loop terminates normally mddev has been computed from the
 * list head itself and must not be used.
 197 */
 198#define for_each_mddev(mddev,tmp)                                       \
 199                                                                        \
 200        for (({ spin_lock(&all_mddevs_lock);                            \
 201                tmp = all_mddevs.next;                                  \
 202                mddev = NULL;});                                        \
 203             ({ if (tmp != &all_mddevs)                                 \
 204                        mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
 205                spin_unlock(&all_mddevs_lock);                          \
 206                if (mddev) mddev_put(mddev);                            \
 207                mddev = list_entry(tmp, mddev_t, all_mddevs);           \
 208                tmp != &all_mddevs;});                                  \
 209             ({ spin_lock(&all_mddevs_lock);                            \
 210                tmp = tmp->next;})                                      \
 211                )
 212
 213
/*
 * Request handler that fails every bio it is given with an I/O error and
 * returns 0.  mddev_find() installs it on a newly allocated queue.  The
 * queue argument @q is not used.
 */
 214static int md_fail_request (struct request_queue *q, struct bio *bio)
 215{
 216        bio_io_error(bio);
 217        return 0;
 218}
 219
/*
 * Take an additional reference on @mddev by incrementing its 'active'
 * count.  Returns @mddev so the call can be used inline in an expression.
 * The reference is dropped with mddev_put().
 */
 220static inline mddev_t *mddev_get(mddev_t *mddev)
 221{
 222        atomic_inc(&mddev->active);
 223        return mddev;
 224}
 225
 226static void mddev_put(mddev_t *mddev)
 227{
 228        if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
 229                return;
 230        if (!mddev->raid_disks && list_empty(&mddev->disks)) {
 231                list_del(&mddev->all_mddevs);
 232                spin_unlock(&all_mddevs_lock);
 233                blk_cleanup_queue(mddev->queue);
 234                kobject_put(&mddev->kobj);
 235        } else
 236                spin_unlock(&all_mddevs_lock);
 237}
 238
 239static mddev_t * mddev_find(dev_t unit)
 240{
 241        mddev_t *mddev, *new = NULL;
 242
 243 retry:
 244        spin_lock(&all_mddevs_lock);
 245        list_for_each_entry(mddev, &all_mddevs, all_mddevs)
 246                if (mddev->unit == unit) {
 247                        mddev_get(mddev);
 248                        spin_unlock(&all_mddevs_lock);
 249                        kfree(new);
 250                        return mddev;
 251                }
 252
 253        if (new) {
 254                list_add(&new->all_mddevs, &all_mddevs);
 255                spin_unlock(&all_mddevs_lock);
 256                return new;
 257        }
 258        spin_unlock(&all_mddevs_lock);
 259
 260        new = kzalloc(sizeof(*new), GFP_KERNEL);
 261        if (!new)
 262                return NULL;
 263
 264        new->unit = unit;
 265        if (MAJOR(unit) == MD_MAJOR)
 266                new->md_minor = MINOR(unit);
 267        else
 268                new->md_minor = MINOR(unit) >> MdpMinorShift;
 269
 270        mutex_init(&new->reconfig_mutex);
 271        INIT_LIST_HEAD(&new->disks);
 272        INIT_LIST_HEAD(&new->all_mddevs);
 273        init_timer(&new->safemode_timer);
 274        atomic_set(&new->active, 1);
 275        spin_lock_init(&new->write_lock);
 276        init_waitqueue_head(&new->sb_wait);
 277        new->reshape_position = MaxSector;
 278        new->resync_max = MaxSector;
 279
 280        new->queue = blk_alloc_queue(GFP_KERNEL);
 281        if (!new->queue) {
 282                kfree(new);
 283                return NULL;
 284        }
 285        set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);
 286
 287        blk_queue_make_request(new->queue, md_fail_request);
 288
 289        goto retry;
 290}
 291
/*
 * Take @mddev's reconfig_mutex, sleeping interruptibly.  Returns whatever
 * mutex_lock_interruptible() returns, so callers must check the result
 * before assuming the lock is held.
 */
 292static inline int mddev_lock(mddev_t * mddev)
 293{
 294        return mutex_lock_interruptible(&mddev->reconfig_mutex);
 295}
 296
/*
 * Try to take @mddev's reconfig_mutex without sleeping.  Returns the
 * result of mutex_trylock() unchanged.
 */
 297static inline int mddev_trylock(mddev_t * mddev)
 298{
 299        return mutex_trylock(&mddev->reconfig_mutex);
 300}
 301
/*
 * Release @mddev's reconfig_mutex and then wake the array's thread
 * (mddev->thread) through md_wakeup_thread().
 */
 302static inline void mddev_unlock(mddev_t * mddev)
 303{
 304        mutex_unlock(&mddev->reconfig_mutex);
 305
 306        md_wakeup_thread(mddev->thread);
 307}
 308
 309static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
 310{
 311        mdk_rdev_t * rdev;
 312        struct list_head *tmp;
 313
 314        rdev_for_each(rdev, tmp, mddev) {
 315                if (rdev->desc_nr == nr)
 316                        return rdev;
 317        }
 318        return NULL;
 319}
 320
 321static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
 322{
 323        struct list_head *tmp;
 324        mdk_rdev_t *rdev;
 325
 326        rdev_for_each(rdev, tmp, mddev) {
 327                if (rdev->bdev->bd_dev == dev)
 328                        return rdev;
 329        }
 330        return NULL;
 331}
 332
 333static struct mdk_personality *find_pers(int level, char *clevel)
 334{
 335        struct mdk_personality *pers;
 336        list_for_each_entry(pers, &pers_list, list) {
 337                if (level != LEVEL_NONE && pers->level == level)
 338                        return pers;
 339                if (strcmp(pers->name, clevel)==0)
 340                        return pers;
 341        }
 342        return NULL;
 343}
 344
 345static inline sector_t calc_dev_sboffset(struct block_device *bdev)
 346{
 347        sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
 348        return MD_NEW_SIZE_BLOCKS(size);
 349}
 350
 351static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
 352{
 353        sector_t size;
 354
 355        size = rdev->sb_offset;
 356
 357        if (chunk_size)
 358                size &= ~((sector_t)chunk_size/1024 - 1);
 359        return size;
 360}
 361
 362static int alloc_disk_sb(mdk_rdev_t * rdev)
 363{
 364        if (rdev->sb_page)
 365                MD_BUG();
 366
 367        rdev->sb_page = alloc_page(GFP_KERNEL);
 368        if (!rdev->sb_page) {
 369                printk(KERN_ALERT "md: out of memory.\n");
 370                return -EINVAL;
 371        }
 372
 373        return 0;
 374}
 375
 376static void free_disk_sb(mdk_rdev_t * rdev)
 377{
 378        if (rdev->sb_page) {
 379                put_page(rdev->sb_page);
 380                rdev->sb_loaded = 0;
 381                rdev->sb_page = NULL;
 382                rdev->sb_offset = 0;
 383                rdev->size = 0;
 384        }
 385}
 386
 387
 388static void super_written(struct bio *bio, int error)
 389{
 390        mdk_rdev_t *rdev = bio->bi_private;
 391        mddev_t *mddev = rdev->mddev;
 392
 393        if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
 394                printk("md: super_written gets error=%d, uptodate=%d\n",
 395                       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
 396                WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
 397                md_error(mddev, rdev);
 398        }
 399
 400        if (atomic_dec_and_test(&mddev->pending_writes))
 401                wake_up(&mddev->sb_wait);
 402        bio_put(bio);
 403}
 404
 405static void super_written_barrier(struct bio *bio, int error)
 406{
 407        struct bio *bio2 = bio->bi_private;
 408        mdk_rdev_t *rdev = bio2->bi_private;
 409        mddev_t *mddev = rdev->mddev;
 410
 411        if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
 412            error == -EOPNOTSUPP) {
 413                unsigned long flags;
 414                /* barriers don't appear to be supported :-( */
 415                set_bit(BarriersNotsupp, &rdev->flags);
 416                mddev->barriers_work = 0;
 417                spin_lock_irqsave(&mddev->write_lock, flags);
 418                bio2->bi_next = mddev->biolist;
 419                mddev->biolist = bio2;
 420                spin_unlock_irqrestore(&mddev->write_lock, flags);
 421                wake_up(&mddev->sb_wait);
 422                bio_put(bio);
 423        } else {
 424                bio_put(bio2);
 425                bio->bi_private = rdev;
 426                super_written(bio, error);
 427        }
 428}
 429
 430void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 431                   sector_t sector, int size, struct page *page)
 432{
 433        /* write first size bytes of page to sector of rdev
 434         * Increment mddev->pending_writes before returning
 435         * and decrement it on completion, waking up sb_wait
 436         * if zero is reached.
 437         * If an error occurred, call md_error
 438         *
 439         * As we might need to resubmit the request if BIO_RW_BARRIER
 440         * causes EOPNOTSUPP, we allocate a spare bio...
 441         */
 442        struct bio *bio = bio_alloc(GFP_NOIO, 1);
 443        int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
 444
 445        bio->bi_bdev = rdev->bdev;
 446        bio->bi_sector = sector;
 447        bio_add_page(bio, page, size, 0);
 448        bio->bi_private = rdev;
 449        bio->bi_end_io = super_written;
        /* Recorded before the barrier flag is added below, so a later
         * resubmission using bio->bi_rw goes out without a barrier.
         */
 450        bio->bi_rw = rw;
 451
 452        atomic_inc(&mddev->pending_writes);
 453        if (!test_bit(BarriersNotsupp, &rdev->flags)) {
                /* Submit a clone as a barrier write and keep 'bio' as the
                 * spare; super_written_barrier() finds it via bi_private.
                 */
 454                struct bio *rbio;
 455                rw |= (1<<BIO_RW_BARRIER);
 456                rbio = bio_clone(bio, GFP_NOIO);
 457                rbio->bi_private = bio;
 458                rbio->bi_end_io = super_written_barrier;
 459                submit_bio(rw, rbio);
 460        } else
                /* Device already known not to support barriers: plain write. */
 461                submit_bio(rw, bio);
 462}
 463
 464void md_super_wait(mddev_t *mddev)
 465{
 466        /* wait for all superblock writes that were scheduled to complete.
 467         * if any had to be retried (due to BARRIER problems), retry them
 468         */
 469        DEFINE_WAIT(wq);
 470        for(;;) {
                /* Get on the wait queue before testing the condition. */
 471                prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
 472                if (atomic_read(&mddev->pending_writes)==0)
 473                        break;
                /* Resubmit every bio that super_written_barrier() queued on
                 * biolist; each one is unlinked under write_lock and then
                 * submitted with the flags stored in its own bi_rw.
                 */
 474                while (mddev->biolist) {
 475                        struct bio *bio;
 476                        spin_lock_irq(&mddev->write_lock);
 477                        bio = mddev->biolist;
 478                        mddev->biolist = bio->bi_next ;
 479                        bio->bi_next = NULL;
 480                        spin_unlock_irq(&mddev->write_lock);
 481                        submit_bio(bio->bi_rw, bio);
 482                }
 483                schedule();
 484        }
 485        finish_wait(&mddev->sb_wait, &wq);
 486}
 487
/*
 * bio completion callback used by sync_page_io(): signals the completion
 * object stored in bio->bi_private.  @error is ignored here.
 */
 488static void bi_complete(struct bio *bio, int error)
 489{
 490        complete((struct completion*)bio->bi_private);
 491}
 492
 493int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 494                   struct page *page, int rw)
 495{
 496        struct bio *bio = bio_alloc(GFP_NOIO, 1);
 497        struct completion event;
 498        int ret;
 499
 500        rw |= (1 << BIO_RW_SYNC);
 501
 502        bio->bi_bdev = bdev;
 503        bio->bi_sector = sector;
 504        bio_add_page(bio, page, size, 0);
 505        init_completion(&event);
 506        bio->bi_private = &event;
 507        bio->bi_end_io = bi_complete;
 508        submit_bio(rw, bio);
 509        wait_for_completion(&event);
 510
 511        ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
 512        bio_put(bio);
 513        return ret;
 514}
 515EXPORT_SYMBOL_GPL(sync_page_io);
 516
 517static int read_disk_sb(mdk_rdev_t * rdev, int size)
 518{
 519        char b[BDEVNAME_SIZE];
 520        if (!rdev->sb_page) {
 521                MD_BUG();
 522                return -EINVAL;
 523        }
 524        if (rdev->sb_loaded)
 525                return 0;
 526
 527
 528        if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
 529                goto fail;
 530        rdev->sb_loaded = 1;
 531        return 0;
 532
 533fail:
 534        printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
 535                bdevname(rdev->bdev,b));
 536        return -EINVAL;
 537}
 538
 539static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 540{
 541        if (    (sb1->set_uuid0 == sb2->set_uuid0) &&
 542                (sb1->set_uuid1 == sb2->set_uuid1) &&
 543                (sb1->set_uuid2 == sb2->set_uuid2) &&
 544                (sb1->set_uuid3 == sb2->set_uuid3))
 545
 546                return 1;
 547
 548        return 0;
 549}
 550
 551
 552static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 553{
 554        int ret;
 555        mdp_super_t *tmp1, *tmp2;
 556
 557        tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
 558        tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
 559
 560        if (!tmp1 || !tmp2) {
 561                ret = 0;
 562                printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
 563                goto abort;
 564        }
 565
 566        *tmp1 = *sb1;
 567        *tmp2 = *sb2;
 568
 569        /*
 570         * nr_disks is not constant
 571         */
 572        tmp1->nr_disks = 0;
 573        tmp2->nr_disks = 0;
 574
 575        if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
 576                ret = 0;
 577        else
 578                ret = 1;
 579
 580abort:
 581        kfree(tmp1);
 582        kfree(tmp2);
 583        return ret;
 584}
 585
 586
 587static u32 md_csum_fold(u32 csum)
 588{
 589        csum = (csum & 0xffff) + (csum >> 16);
 590        return (csum & 0xffff) + (csum >> 16);
 591}
 592
 593static unsigned int calc_sb_csum(mdp_super_t * sb)
 594{
 595        u64 newcsum = 0;
 596        u32 *sb32 = (u32*)sb;
 597        int i;
 598        unsigned int disk_csum, csum;
 599
 600        disk_csum = sb->sb_csum;
 601        sb->sb_csum = 0;
 602
 603        for (i = 0; i < MD_SB_BYTES/4 ; i++)
 604                newcsum += sb32[i];
 605        csum = (newcsum & 0xffffffff) + (newcsum>>32);
 606
 607
 608#ifdef CONFIG_ALPHA
 609        /* This used to use csum_partial, which was wrong for several
 610         * reasons including that different results are returned on
 611         * different architectures.  It isn't critical that we get exactly
 612         * the same return value as before (we always csum_fold before
 613         * testing, and that removes any differences).  However as we
 614         * know that csum_partial always returned a 16bit value on
 615         * alphas, do a fold to maximise conformity to previous behaviour.
 616         */
 617        sb->sb_csum = md_csum_fold(disk_csum);
 618#else
 619        sb->sb_csum = disk_csum;
 620#endif
 621        return csum;
 622}
 623
 624
 625/*
 626 * Handle superblock details.
 627 * We want to be able to handle multiple superblock formats
 628 * so we have a common interface to them all, and an array of
 629 * different handlers.
 630 * We rely on user-space to write the initial superblock, and support
 631 * reading and updating of superblocks.
 632 * Interface methods are:
 633 *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
 634 *      loads and validates a superblock on dev.
 635 *      if refdev != NULL, compare superblocks on both devices
 636 *    Return:
 637 *      0 - dev has a superblock that is compatible with refdev
 638 *      1 - dev has a superblock that is compatible and newer than refdev
 639 *          so dev should be used as the refdev in future
 640 *     -EINVAL superblock incompatible or invalid
 641 *     -othererror e.g. -EIO
 642 *
 643 *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
 644 *      Verify that dev is acceptable into mddev.
 645 *       The first time, mddev->raid_disks will be 0, and data from
 646 *       dev should be merged in.  Subsequent calls check that dev
 647 *       is new enough.  Return 0 or -EINVAL
 648 *
 649 *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
 650 *     Update the superblock for rdev with data in mddev
 651 *     This does not write to disc.
 652 *
 653 */
 654
/* One handler set per supported superblock format. */
 655struct super_type  {
        /* NOTE(review): presumably a printable name for the format -- not used in the visible code, confirm. */
 656        char            *name;
        /* NOTE(review): presumably the module providing the handlers -- confirm. */
 657        struct module   *owner;
        /* Load and validate the superblock on rdev, optionally comparing it
         * with refdev; returns 0, 1 or a negative error code.
         */
 658        int             (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
        /* Check that rdev is acceptable for mddev; returns 0 or -EINVAL. */
 659        int             (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
        /* Update rdev's in-memory superblock from mddev; nothing is written to disk. */
 660        void            (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
 661};
 662
 663/*
 664 * load_super for 0.90.0 
 665 */
 666static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 667{
 668        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
 669        mdp_super_t *sb;
 670        int ret;
 671        sector_t sb_offset;
 672
 673        /*
 674         * Calculate the position of the superblock,
 675         * it's at the end of the disk.
 676         *
 677         * It also happens to be a multiple of 4Kb.
 678         */
 679        sb_offset = calc_dev_sboffset(rdev->bdev);
 680        rdev->sb_offset = sb_offset;
 681
 682        ret = read_disk_sb(rdev, MD_SB_BYTES);
 683        if (ret) return ret;
 684
 685        ret = -EINVAL;
 686
 687        bdevname(rdev->bdev, b);
 688        sb = (mdp_super_t*)page_address(rdev->sb_page);
 689
 690        if (sb->md_magic != MD_SB_MAGIC) {
 691                printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
 692                       b);
 693                goto abort;
 694        }
 695
 696        if (sb->major_version != 0 ||
 697            sb->minor_version < 90 ||
 698            sb->minor_version > 91) {
 699                printk(KERN_WARNING "Bad version number %d.%d on %s\n",
 700                        sb->major_version, sb->minor_version,
 701                        b);
 702                goto abort;
 703        }
 704
 705        if (sb->raid_disks <= 0)
 706                goto abort;
 707
 708        if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
 709                printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
 710                        b);
 711                goto abort;
 712        }
 713
 714        rdev->preferred_minor = sb->md_minor;
 715        rdev->data_offset = 0;
 716        rdev->sb_size = MD_SB_BYTES;
 717
 718        if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) {
 719                if (sb->level != 1 && sb->level != 4
 720                    && sb->level != 5 && sb->level != 6
 721                    && sb->level != 10) {
 722                        /* FIXME use a better test */
 723                        printk(KERN_WARNING
 724                               "md: bitmaps not supported for this level.\n");
 725                        goto abort;
 726                }
 727        }
 728
 729        if (sb->level == LEVEL_MULTIPATH)
 730                rdev->desc_nr = -1;
 731        else
 732                rdev->desc_nr = sb->this_disk.number;
 733
 734        if (refdev == 0)
 735                ret = 1;
 736        else {
 737                __u64 ev1, ev2;
 738                mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
 739                if (!uuid_equal(refsb, sb)) {
 740                        printk(KERN_WARNING "md: %s has different UUID to %s\n",
 741                                b, bdevname(refdev->bdev,b2));
 742                        goto abort;
 743                }
 744                if (!sb_equal(refsb, sb)) {
 745                        printk(KERN_WARNING "md: %s has same UUID"
 746                               " but different superblock to %s\n",
 747                               b, bdevname(refdev->bdev, b2));
 748                        goto abort;
 749                }
 750                ev1 = md_event(sb);
 751                ev2 = md_event(refsb);
 752                if (ev1 > ev2)
 753                        ret = 1;
 754                else 
 755                        ret = 0;
 756        }
 757        rdev->size = calc_dev_size(rdev, sb->chunk_size);
 758
 759        if (rdev->size < sb->size && sb->level > 1)
 760                /* "this cannot possibly happen" ... */
 761                ret = -EINVAL;
 762
 763 abort:
 764        return ret;
 765}
 766
 767/*
 768 * validate_super for 0.90.0
 769 */
 770static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 771{
 772        mdp_disk_t *desc;
 773        mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
 774        __u64 ev1 = md_event(sb);
 775
 776        rdev->raid_disk = -1;
 777        clear_bit(Faulty, &rdev->flags);
 778        clear_bit(In_sync, &rdev->flags);
 779        clear_bit(WriteMostly, &rdev->flags);
 780        clear_bit(BarriersNotsupp, &rdev->flags);
 781
 782        if (mddev->raid_disks == 0) {
 783                mddev->major_version = 0;
 784                mddev->minor_version = sb->minor_version;
 785                mddev->patch_version = sb->patch_version;
 786                mddev->external = 0;
 787                mddev->chunk_size = sb->chunk_size;
 788                mddev->ctime = sb->ctime;
 789                mddev->utime = sb->utime;
 790                mddev->level = sb->level;
 791                mddev->clevel[0] = 0;
 792                mddev->layout = sb->layout;
 793                mddev->raid_disks = sb->raid_disks;
 794                mddev->size = sb->size;
 795                mddev->events = ev1;
 796                mddev->bitmap_offset = 0;
 797                mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
 798
 799                if (mddev->minor_version >= 91) {
 800                        mddev->reshape_position = sb->reshape_position;
 801                        mddev->delta_disks = sb->delta_disks;
 802                        mddev->new_level = sb->new_level;
 803                        mddev->new_layout = sb->new_layout;
 804                        mddev->new_chunk = sb->new_chunk;
 805                } else {
 806                        mddev->reshape_position = MaxSector;
 807                        mddev->delta_disks = 0;
 808                        mddev->new_level = mddev->level;
 809                        mddev->new_layout = mddev->layout;
 810                        mddev->new_chunk = mddev->chunk_size;
 811                }
 812
 813                if (sb->state & (1<<MD_SB_CLEAN))
 814                        mddev->recovery_cp = MaxSector;
 815                else {
 816                        if (sb->events_hi == sb->cp_events_hi && 
 817                                sb->events_lo == sb->cp_events_lo) {
 818                                mddev->recovery_cp = sb->recovery_cp;
 819                        } else
 820                                mddev->recovery_cp = 0;
 821                }
 822
 823                memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
 824                memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
 825                memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
 826                memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
 827
 828                mddev->max_disks = MD_SB_DISKS;
 829
 830                if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
 831                    mddev->bitmap_file == NULL)
 832                        mddev->bitmap_offset = mddev->default_bitmap_offset;
 833
 834        } else if (mddev->pers == NULL) {
 835                /* Insist on good event counter while assembling */
 836                ++ev1;
 837                if (ev1 < mddev->events) 
 838                        return -EINVAL;
 839        } else if (mddev->bitmap) {
 840                /* if adding to array with a bitmap, then we can accept an
 841                 * older device ... but not too old.
 842                 */
 843                if (ev1 < mddev->bitmap->events_cleared)
 844                        return 0;
 845        } else {
 846                if (ev1 < mddev->events)
 847                        /* just a hot-add of a new device, leave raid_disk at -1 */
 848                        return 0;
 849        }
 850
 851        if (mddev->level != LEVEL_MULTIPATH) {
 852                desc = sb->disks + rdev->desc_nr;
 853
 854                if (desc->state & (1<<MD_DISK_FAULTY))
 855                        set_bit(Faulty, &rdev->flags);
 856                else if (desc->state & (1<<MD_DISK_SYNC) /* &&
 857                            desc->raid_disk < mddev->raid_disks */) {
 858                        set_bit(In_sync, &rdev->flags);
 859                        rdev->raid_disk = desc->raid_disk;
 860                }
 861                if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
 862                        set_bit(WriteMostly, &rdev->flags);
 863        } else /* MULTIPATH are always insync */
 864                set_bit(In_sync, &rdev->flags);
 865        return 0;
 866}
 867
 868/*
 869 * sync_super for 0.90.0
 870 */
 871static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 872{
 873        mdp_super_t *sb;
 874        struct list_head *tmp;
 875        mdk_rdev_t *rdev2;
 876        int next_spare = mddev->raid_disks;
 877
 878
 879        /* make rdev->sb match mddev data..
 880         *
 881         * 1/ zero out disks
 882         * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
 883         * 3/ any empty disks < next_spare become removed
 884         *
 885         * disks[0] gets initialised to REMOVED because
 886         * we cannot be sure from other fields if it has
 887         * been initialised or not.
 888         */
 889        int i;
 890        int active=0, working=0,failed=0,spare=0,nr_disks=0;
 891
 892        rdev->sb_size = MD_SB_BYTES;
 893
 894        sb = (mdp_super_t*)page_address(rdev->sb_page);
 895
 896        memset(sb, 0, sizeof(*sb));
 897
 898        sb->md_magic = MD_SB_MAGIC;
 899        sb->major_version = mddev->major_version;
 900        sb->patch_version = mddev->patch_version;
 901        sb->gvalid_words  = 0; /* ignored */
 902        memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
 903        memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
 904        memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
 905        memcpy(&sb->set_uuid3, mddev->uuid+12,4);
 906
 907        sb->ctime = mddev->ctime;
 908        sb->level = mddev->level;
 909        sb->size  = mddev->size;
 910        sb->raid_disks = mddev->raid_disks;
 911        sb->md_minor = mddev->md_minor;
 912        sb->not_persistent = 0;
 913        sb->utime = mddev->utime;
 914        sb->state = 0;
 915        sb->events_hi = (mddev->events>>32);
 916        sb->events_lo = (u32)mddev->events;
 917
 918        if (mddev->reshape_position == MaxSector)
 919                sb->minor_version = 90;
 920        else {
 921                sb->minor_version = 91;
 922                sb->reshape_position = mddev->reshape_position;
 923                sb->new_level = mddev->new_level;
 924                sb->delta_disks = mddev->delta_disks;
 925                sb->new_layout = mddev->new_layout;
 926                sb->new_chunk = mddev->new_chunk;
 927        }
 928        mddev->minor_version = sb->minor_version;
 929        if (mddev->in_sync)
 930        {
 931                sb->recovery_cp = mddev->recovery_cp;
 932                sb->cp_events_hi = (mddev->events>>32);
 933                sb->cp_events_lo = (u32)mddev->events;
 934                if (mddev->recovery_cp == MaxSector)
 935                        sb->state = (1<< MD_SB_CLEAN);
 936        } else
 937                sb->recovery_cp = 0;
 938
 939        sb->layout = mddev->layout;
 940        sb->chunk_size = mddev->chunk_size;
 941
 942        if (mddev->bitmap && mddev->bitmap_file == NULL)
 943                sb->state |= (1<<MD_SB_BITMAP_PRESENT);
 944
 945        sb->disks[0].state = (1<<MD_DISK_REMOVED);
 946        rdev_for_each(rdev2, tmp, mddev) {
 947                mdp_disk_t *d;
 948                int desc_nr;
 949                if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
 950                    && !test_bit(Faulty, &rdev2->flags))
 951                        desc_nr = rdev2->raid_disk;
 952                else
 953                        desc_nr = next_spare++;
 954                rdev2->desc_nr = desc_nr;
 955                d = &sb->disks[rdev2->desc_nr];
 956                nr_disks++;
 957                d->number = rdev2->desc_nr;
 958                d->major = MAJOR(rdev2->bdev->bd_dev);
 959                d->minor = MINOR(rdev2->bdev->bd_dev);
 960                if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
 961                    && !test_bit(Faulty, &rdev2->flags))
 962                        d->raid_disk = rdev2->raid_disk;
 963                else
 964                        d->raid_disk = rdev2->desc_nr; /* compatibility */
 965                if (test_bit(Faulty, &rdev2->flags))
 966                        d->state = (1<<MD_DISK_FAULTY);
 967                else if (test_bit(In_sync, &rdev2->flags)) {
 968                        d->state = (1<<MD_DISK_ACTIVE);
 969                        d->state |= (1<<MD_DISK_SYNC);
 970                        active++;
 971                        working++;
 972                } else {
 973                        d->state = 0;
 974                        spare++;
 975                        working++;
 976                }
 977                if (test_bit(WriteMostly, &rdev2->flags))
 978                        d->state |= (1<<MD_DISK_WRITEMOSTLY);
 979        }
 980        /* now set the "removed" and "faulty" bits on any missing devices */
 981        for (i=0 ; i < mddev->raid_disks ; i++) {
 982                mdp_disk_t *d = &sb->disks[i];
 983                if (d->state == 0 && d->number == 0) {
 984                        d->number = i;
 985                        d->raid_disk = i;
 986                        d->state = (1<<MD_DISK_REMOVED);
 987                        d->state |= (1<<MD_DISK_FAULTY);
 988                        failed++;
 989                }
 990        }
 991        sb->nr_disks = nr_disks;
 992        sb->active_disks = active;
 993        sb->working_disks = working;
 994        sb->failed_disks = failed;
 995        sb->spare_disks = spare;
 996
 997        sb->this_disk = sb->disks[rdev->desc_nr];
 998        sb->sb_csum = calc_sb_csum(sb);
 999}
1000
1001/*
1002 * version 1 superblock
1003 */
1004
1005static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1006{
1007        __le32 disk_csum;
1008        u32 csum;
1009        unsigned long long newcsum;
1010        int size = 256 + le32_to_cpu(sb->max_dev)*2;
1011        __le32 *isuper = (__le32*)sb;
1012        int i;
1013
1014        disk_csum = sb->sb_csum;
1015        sb->sb_csum = 0;
1016        newcsum = 0;
1017        for (i=0; size>=4; size -= 4 )
1018                newcsum += le32_to_cpu(*isuper++);
1019
1020        if (size == 2)
1021                newcsum += le16_to_cpu(*(__le16*) isuper);
1022
1023        csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1024        sb->sb_csum = disk_csum;
1025        return cpu_to_le32(csum);
1026}
1027
1028static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1029{
1030        struct mdp_superblock_1 *sb;
1031        int ret;
1032        sector_t sb_offset;
1033        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1034        int bmask;
1035
1036        /*
1037         * Calculate the position of the superblock.
1038         * It is always aligned to a 4K boundary and
1039         * depeding on minor_version, it can be:
1040         * 0: At least 8K, but less than 12K, from end of device
1041         * 1: At start of device
1042         * 2: 4K from start of device.
1043         */
1044        switch(minor_version) {
1045        case 0:
1046                sb_offset = rdev->bdev->bd_inode->i_size >> 9;
1047                sb_offset -= 8*2;
1048                sb_offset &= ~(sector_t)(4*2-1);
1049                /* convert from sectors to K */
1050                sb_offset /= 2;
1051                break;
1052        case 1:
1053                sb_offset = 0;
1054                break;
1055        case 2:
1056                sb_offset = 4;
1057                break;
1058        default:
1059                return -EINVAL;
1060        }
1061        rdev->sb_offset = sb_offset;
1062
1063        /* superblock is rarely larger than 1K, but it can be larger,
1064         * and it is safe to read 4k, so we do that
1065         */
1066        ret = read_disk_sb(rdev, 4096);
1067        if (ret) return ret;
1068
1069
1070        sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1071
1072        if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1073            sb->major_version != cpu_to_le32(1) ||
1074            le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1075            le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
1076            (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1077                return -EINVAL;
1078
1079        if (calc_sb_1_csum(sb) != sb->sb_csum) {
1080                printk("md: invalid superblock checksum on %s\n",
1081                        bdevname(rdev->bdev,b));
1082                return -EINVAL;
1083        }
1084        if (le64_to_cpu(sb->data_size) < 10) {
1085                printk("md: data_size too small on %s\n",
1086                       bdevname(rdev->bdev,b));
1087                return -EINVAL;
1088        }
1089        if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) {
1090                if (sb->level != cpu_to_le32(1) &&
1091                    sb->level != cpu_to_le32(4) &&
1092                    sb->level != cpu_to_le32(5) &&
1093                    sb->level != cpu_to_le32(6) &&
1094                    sb->level != cpu_to_le32(10)) {
1095                        printk(KERN_WARNING
1096                               "md: bitmaps not supported for this level.\n");
1097                        return -EINVAL;
1098                }
1099        }
1100
1101        rdev->preferred_minor = 0xffff;
1102        rdev->data_offset = le64_to_cpu(sb->data_offset);
1103        atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1104
1105        rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1106        bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
1107        if (rdev->sb_size & bmask)
1108                rdev->sb_size = (rdev->sb_size | bmask) + 1;
1109
1110        if (minor_version
1111            && rdev->data_offset < sb_offset + (rdev->sb_size/512))
1112                return -EINVAL;
1113
1114        if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1115                rdev->desc_nr = -1;
1116        else
1117                rdev->desc_nr = le32_to_cpu(sb->dev_number);
1118
1119        if (refdev == 0)
1120                ret = 1;
1121        else {
1122                __u64 ev1, ev2;
1123                struct mdp_superblock_1 *refsb = 
1124                        (struct mdp_superblock_1*)page_address(refdev->sb_page);
1125
1126                if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1127                    sb->level != refsb->level ||
1128                    sb->layout != refsb->layout ||
1129                    sb->chunksize != refsb->chunksize) {
1130                        printk(KERN_WARNING "md: %s has strangely different"
1131                                " superblock to %s\n",
1132                                bdevname(rdev->bdev,b),
1133                                bdevname(refdev->bdev,b2));
1134                        return -EINVAL;
1135                }
1136                ev1 = le64_to_cpu(sb->events);
1137                ev2 = le64_to_cpu(refsb->events);
1138
1139                if (ev1 > ev2)
1140                        ret = 1;
1141                else
1142                        ret = 0;
1143        }
1144        if (minor_version)
1145                rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
1146        else
1147                rdev->size = rdev->sb_offset;
1148        if (rdev->size < le64_to_cpu(sb->data_size)/2)
1149                return -EINVAL;
1150        rdev->size = le64_to_cpu(sb->data_size)/2;
1151        if (le32_to_cpu(sb->chunksize))
1152                rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
1153
1154        if (le64_to_cpu(sb->size) > rdev->size*2)
1155                return -EINVAL;
1156        return ret;
1157}
1158
1159static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1160{
1161        struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1162        __u64 ev1 = le64_to_cpu(sb->events);
1163
1164        rdev->raid_disk = -1;
1165        clear_bit(Faulty, &rdev->flags);
1166        clear_bit(In_sync, &rdev->flags);
1167        clear_bit(WriteMostly, &rdev->flags);
1168        clear_bit(BarriersNotsupp, &rdev->flags);
1169
1170        if (mddev->raid_disks == 0) {
1171                mddev->major_version = 1;
1172                mddev->patch_version = 0;
1173                mddev->external = 0;
1174                mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
1175                mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1176                mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1177                mddev->level = le32_to_cpu(sb->level);
1178                mddev->clevel[0] = 0;
1179                mddev->layout = le32_to_cpu(sb->layout);
1180                mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1181                mddev->size = le64_to_cpu(sb->size)/2;
1182                mddev->events = ev1;
1183                mddev->bitmap_offset = 0;
1184                mddev->default_bitmap_offset = 1024 >> 9;
1185                
1186                mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1187                memcpy(mddev->uuid, sb->set_uuid, 16);
1188
1189                mddev->max_disks =  (4096-256)/2;
1190
1191                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1192                    mddev->bitmap_file == NULL )
1193                        mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
1194
1195                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1196                        mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1197                        mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1198                        mddev->new_level = le32_to_cpu(sb->new_level);
1199                        mddev->new_layout = le32_to_cpu(sb->new_layout);
1200                        mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
1201                } else {
1202                        mddev->reshape_position = MaxSector;
1203                        mddev->delta_disks = 0;
1204                        mddev->new_level = mddev->level;
1205                        mddev->new_layout = mddev->layout;
1206                        mddev->new_chunk = mddev->chunk_size;
1207                }
1208
1209        } else if (mddev->pers == NULL) {
1210                /* Insist of good event counter while assembling */
1211                ++ev1;
1212                if (ev1 < mddev->events)
1213                        return -EINVAL;
1214        } else if (mddev->bitmap) {
1215                /* If adding to array with a bitmap, then we can accept an
1216                 * older device, but not too old.
1217                 */
1218                if (ev1 < mddev->bitmap->events_cleared)
1219                        return 0;
1220        } else {
1221                if (ev1 < mddev->events)
1222                        /* just a hot-add of a new device, leave raid_disk at -1 */
1223                        return 0;
1224        }
1225        if (mddev->level != LEVEL_MULTIPATH) {
1226                int role;
1227                role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1228                switch(role) {
1229                case 0xffff: /* spare */
1230                        break;
1231                case 0xfffe: /* faulty */
1232                        set_bit(Faulty, &rdev->flags);
1233                        break;
1234                default:
1235                        if ((le32_to_cpu(sb->feature_map) &
1236                             MD_FEATURE_RECOVERY_OFFSET))
1237                                rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1238                        else
1239                                set_bit(In_sync, &rdev->flags);
1240                        rdev->raid_disk = role;
1241                        break;
1242                }
1243                if (sb->devflags & WriteMostly1)
1244                        set_bit(WriteMostly, &rdev->flags);
1245        } else /* MULTIPATH are always insync */
1246                set_bit(In_sync, &rdev->flags);
1247
1248        return 0;
1249}
1250
1251static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1252{
1253        struct mdp_superblock_1 *sb;
1254        struct list_head *tmp;
1255        mdk_rdev_t *rdev2;
1256        int max_dev, i;
1257        /* make rdev->sb match mddev and rdev data. */
1258
1259        sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1260
1261        sb->feature_map = 0;
1262        sb->pad0 = 0;
1263        sb->recovery_offset = cpu_to_le64(0);
1264        memset(sb->pad1, 0, sizeof(sb->pad1));
1265        memset(sb->pad2, 0, sizeof(sb->pad2));
1266        memset(sb->pad3, 0, sizeof(sb->pad3));
1267
1268        sb->utime = cpu_to_le64((__u64)mddev->utime);
1269        sb->events = cpu_to_le64(mddev->events);
1270        if (mddev->in_sync)
1271                sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1272        else
1273                sb->resync_offset = cpu_to_le64(0);
1274
1275        sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1276
1277        sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1278        sb->size = cpu_to_le64(mddev->size<<1);
1279
1280        if (mddev->bitmap && mddev->bitmap_file == NULL) {
1281                sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1282                sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1283        }
1284
1285        if (rdev->raid_disk >= 0 &&
1286            !test_bit(In_sync, &rdev->flags) &&
1287            rdev->recovery_offset > 0) {
1288                sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1289                sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
1290        }
1291
1292        if (mddev->reshape_position != MaxSector) {
1293                sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1294                sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1295                sb->new_layout = cpu_to_le32(mddev->new_layout);
1296                sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1297                sb->new_level = cpu_to_le32(mddev->new_level);
1298                sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
1299        }
1300
1301        max_dev = 0;
1302        rdev_for_each(rdev2, tmp, mddev)
1303                if (rdev2->desc_nr+1 > max_dev)
1304                        max_dev = rdev2->desc_nr+1;
1305
1306        if (max_dev > le32_to_cpu(sb->max_dev))
1307                sb->max_dev = cpu_to_le32(max_dev);
1308        for (i=0; i<max_dev;i++)
1309                sb->dev_roles[i] = cpu_to_le16(0xfffe);
1310        
1311        rdev_for_each(rdev2, tmp, mddev) {
1312                i = rdev2->desc_nr;
1313                if (test_bit(Faulty, &rdev2->flags))
1314                        sb->dev_roles[i] = cpu_to_le16(0xfffe);
1315                else if (test_bit(In_sync, &rdev2->flags))
1316                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1317                else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
1318                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1319                else
1320                        sb->dev_roles[i] = cpu_to_le16(0xffff);
1321        }
1322
1323        sb->sb_csum = calc_sb_1_csum(sb);
1324}
1325
1326
1327static struct super_type super_types[] = {
1328        [0] = {
1329                .name   = "0.90.0",
1330                .owner  = THIS_MODULE,
1331                .load_super     = super_90_load,
1332                .validate_super = super_90_validate,
1333                .sync_super     = super_90_sync,
1334        },
1335        [1] = {
1336                .name   = "md-1",
1337                .owner  = THIS_MODULE,
1338                .load_super     = super_1_load,
1339                .validate_super = super_1_validate,
1340                .sync_super     = super_1_sync,
1341        },
1342};
1343
1344static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1345{
1346        struct list_head *tmp, *tmp2;
1347        mdk_rdev_t *rdev, *rdev2;
1348
1349        rdev_for_each(rdev, tmp, mddev1)
1350                rdev_for_each(rdev2, tmp2, mddev2)
1351                        if (rdev->bdev->bd_contains ==
1352                            rdev2->bdev->bd_contains)
1353                                return 1;
1354
1355        return 0;
1356}
1357
/* File-local list head, initially empty; its users are outside this part of the file. */
static LIST_HEAD(pending_raid_disks);
1359
1360static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1361{
1362        char b[BDEVNAME_SIZE];
1363        struct kobject *ko;
1364        char *s;
1365        int err;
1366
1367        if (rdev->mddev) {
1368                MD_BUG();
1369                return -EINVAL;
1370        }
1371        /* make sure rdev->size exceeds mddev->size */
1372        if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
1373                if (mddev->pers) {
1374                        /* Cannot change size, so fail
1375                         * If mddev->level <= 0, then we don't care
1376                         * about aligning sizes (e.g. linear)
1377                         */
1378                        if (mddev->level > 0)
1379                                return -ENOSPC;
1380                } else
1381                        mddev->size = rdev->size;
1382        }
1383
1384        /* Verify rdev->desc_nr is unique.
1385         * If it is -1, assign a free number, else
1386         * check number is not in use
1387         */
1388        if (rdev->desc_nr < 0) {
1389                int choice = 0;
1390                if (mddev->pers) choice = mddev->raid_disks;
1391                while (find_rdev_nr(mddev, choice))
1392                        choice++;
1393                rdev->desc_nr = choice;
1394        } else {
1395                if (find_rdev_nr(mddev, rdev->desc_nr))
1396                        return -EBUSY;
1397        }
1398        bdevname(rdev->bdev,b);
1399        while ( (s=strchr(b, '/')) != NULL)
1400                *s = '!';
1401
1402        rdev->mddev = mddev;
1403        printk(KERN_INFO "md: bind<%s>\n", b);
1404
1405        if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
1406                goto fail;
1407
1408        if (rdev->bdev->bd_part)
1409                ko = &rdev->bdev->bd_part->dev.kobj;
1410        else
1411                ko = &rdev->bdev->bd_disk->dev.kobj;
1412        if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
1413                kobject_del(&rdev->kobj);
1414                goto fail;
1415        }
1416        list_add(&rdev->same_set, &mddev->disks);
1417        bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
1418        return 0;
1419
1420 fail:
1421        printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
1422               b, mdname(mddev));
1423        return err;
1424}
1425
1426static void md_delayed_delete(struct work_struct *ws)
1427{
1428        mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
1429        kobject_del(&rdev->kobj);
1430        kobject_put(&rdev->kobj);
1431}
1432
1433static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1434{
1435        char b[BDEVNAME_SIZE];
1436        if (!rdev->mddev) {
1437                MD_BUG();
1438                return;
1439        }
1440        bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1441        list_del_init(&rdev->same_set);
1442        printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1443        rdev->mddev = NULL;
1444        sysfs_remove_link(&rdev->kobj, "block");
1445
1446        /* We need to delay this, otherwise we can deadlock when
1447         * writing to 'remove' to "dev/state"
1448         */
1449        INIT_WORK(&rdev->del_work, md_delayed_delete);
1450        kobject_get(&rdev->kobj);
1451        schedule_work(&rdev->del_work);
1452}
1453
1454/*
1455 * prevent the device from being mounted, repartitioned or
1456 * otherwise reused by a RAID array (or any other kernel
1457 * subsystem), by bd_claiming the device.
1458 */
1459static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
1460{
1461        int err = 0;
1462        struct block_device *bdev;
1463        char b[BDEVNAME_SIZE];
1464
1465        bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1466        if (IS_ERR(bdev)) {
1467                printk(KERN_ERR "md: could not open %s.\n",
1468                        __bdevname(dev, b));
1469                return PTR_ERR(bdev);
1470        }
1471        err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
1472        if (err) {
1473                printk(KERN_ERR "md: could not bd_claim %s.\n",
1474                        bdevname(bdev, b));
1475                blkdev_put(bdev);
1476                return err;
1477        }
1478        if (!shared)
1479                set_bit(AllReserved, &rdev->flags);
1480        rdev->bdev = bdev;
1481        return err;
1482}
1483
1484static void unlock_rdev(mdk_rdev_t *rdev)
1485{
1486        struct block_device *bdev = rdev->bdev;
1487        rdev->bdev = NULL;
1488        if (!bdev)
1489                MD_BUG();
1490        bd_release(bdev);
1491        blkdev_put(bdev);
1492}
1493
1494void md_autodetect_dev(dev_t dev);
1495
1496static void export_rdev(mdk_rdev_t * rdev)
1497{
1498        char b[BDEVNAME_SIZE];
1499        printk(KERN_INFO "md: export_rdev(%s)\n",
1500                bdevname(rdev->bdev,b));
1501        if (rdev->mddev)
1502                MD_BUG();
1503        free_disk_sb(rdev);
1504        list_del_init(&rdev->same_set);
1505#ifndef MODULE
1506        if (test_bit(AutoDetected, &rdev->flags))
1507                md_autodetect_dev(rdev->bdev->bd_dev);
1508#endif
1509        unlock_rdev(rdev);
1510        kobject_put(&rdev->kobj);
1511}
1512
1513static void kick_rdev_from_array(mdk_rdev_t * rdev)
1514{
1515        unbind_rdev_from_array(rdev);
1516        export_rdev(rdev);
1517}
1518
1519static void export_array(mddev_t *mddev)
1520{
1521        struct list_head *tmp;
1522        mdk_rdev_t *rdev;
1523
1524        rdev_for_each(rdev, tmp, mddev) {
1525                if (!rdev->mddev) {
1526                        MD_BUG();
1527                        continue;
1528                }
1529                kick_rdev_from_array(rdev);
1530        }
1531        if (!list_empty(&mddev->disks))
1532                MD_BUG();
1533        mddev->raid_disks = 0;
1534        mddev->major_version = 0;
1535}
1536
1537static void print_desc(mdp_disk_t *desc)
1538{
1539        printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
1540                desc->major,desc->minor,desc->raid_disk,desc->state);
1541}
1542
1543static void print_sb(mdp_super_t *sb)
1544{
1545        int i;
1546
1547        printk(KERN_INFO 
1548                "md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1549                sb->major_version, sb->minor_version, sb->patch_version,
1550                sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1551                sb->ctime);
1552        printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1553                sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1554                sb->md_minor, sb->layout, sb->chunk_size);
1555        printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
1556                " FD:%d SD:%d CSUM:%08x E:%08lx\n",
1557                sb->utime, sb->state, sb->active_disks, sb->working_disks,
1558                sb->failed_disks, sb->spare_disks,
1559                sb->sb_csum, (unsigned long)sb->events_lo);
1560
1561        printk(KERN_INFO);
1562        for (i = 0; i < MD_SB_DISKS; i++) {
1563                mdp_disk_t *desc;
1564
1565                desc = sb->disks + i;
1566                if (desc->number || desc->major || desc->minor ||
1567                    desc->raid_disk || (desc->state && (desc->state != 4))) {
1568                        printk("     D %2d: ", i);
1569                        print_desc(desc);
1570                }
1571        }
1572        printk(KERN_INFO "md:     THIS: ");
1573        print_desc(&sb->this_disk);
1574
1575}
1576
1577static void print_rdev(mdk_rdev_t *rdev)
1578{
1579        char b[BDEVNAME_SIZE];
1580        printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
1581                bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
1582                test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1583                rdev->desc_nr);
1584        if (rdev->sb_loaded) {
1585                printk(KERN_INFO "md: rdev superblock:\n");
1586                print_sb((mdp_super_t*)page_address(rdev->sb_page));
1587        } else
1588                printk(KERN_INFO "md: no rdev superblock!\n");
1589}
1590
1591static void md_print_devices(void)
1592{
1593        struct list_head *tmp, *tmp2;
1594        mdk_rdev_t *rdev;
1595        mddev_t *mddev;
1596        char b[BDEVNAME_SIZE];
1597
1598        printk("\n");
1599        printk("md:     **********************************\n");
1600        printk("md:     * <COMPLETE RAID STATE PRINTOUT> *\n");
1601        printk("md:     **********************************\n");
1602        for_each_mddev(mddev, tmp) {
1603
1604                if (mddev->bitmap)
1605                        bitmap_print_sb(mddev->bitmap);
1606                else
1607                        printk("%s: ", mdname(mddev));
1608                rdev_for_each(rdev, tmp2, mddev)
1609                        printk("<%s>", bdevname(rdev->bdev,b));
1610                printk("\n");
1611
1612                rdev_for_each(rdev, tmp2, mddev)
1613                        print_rdev(rdev);
1614        }
1615        printk("md:     **********************************\n");
1616        printk("\n");
1617}
1618
1619
1620static void sync_sbs(mddev_t * mddev, int nospares)
1621{
1622        /* Update each superblock (in-memory image), but
1623         * if we are allowed to, skip spares which already
1624         * have the right event counter, or have one earlier
1625         * (which would mean they aren't being marked as dirty
1626         * with the rest of the array)
1627         */
1628        mdk_rdev_t *rdev;
1629        struct list_head *tmp;
1630
1631        rdev_for_each(rdev, tmp, mddev) {
1632                if (rdev->sb_events == mddev->events ||
1633                    (nospares &&
1634                     rdev->raid_disk < 0 &&
1635                     (rdev->sb_events&1)==0 &&
1636                     rdev->sb_events+1 == mddev->events)) {
1637                        /* Don't update this superblock */
1638                        rdev->sb_loaded = 2;
1639                } else {
1640                        super_types[mddev->major_version].
1641                                sync_super(mddev, rdev);
1642                        rdev->sb_loaded = 1;
1643                }
1644        }
1645}
1646
1647static void md_update_sb(mddev_t * mddev, int force_change)
1648{
1649        struct list_head *tmp;
1650        mdk_rdev_t *rdev;
1651        int sync_req;
1652        int nospares = 0;
1653
1654repeat:
1655        spin_lock_irq(&mddev->write_lock);
1656
1657        set_bit(MD_CHANGE_PENDING, &mddev->flags);
1658        if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
1659                force_change = 1;
1660        if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
1661                /* just a clean<-> dirty transition, possibly leave spares alone,
1662                 * though if events isn't the right even/odd, we will have to do
1663                 * spares after all
1664                 */
1665                nospares = 1;
1666        if (force_change)
1667                nospares = 0;
1668        if (mddev->degraded)
1669                /* If the array is degraded, then skipping spares is both
1670                 * dangerous and fairly pointless.
1671                 * Dangerous because a device that was removed from the array
1672                 * might have a event_count that still looks up-to-date,
1673                 * so it can be re-added without a resync.
1674                 * Pointless because if there are any spares to skip,
1675                 * then a recovery will happen and soon that array won't
1676                 * be degraded any more and the spare can go back to sleep then.
1677                 */
1678                nospares = 0;
1679
1680        sync_req = mddev->in_sync;
1681        mddev->utime = get_seconds();
1682
1683        /* If this is just a dirty<->clean transition, and the array is clean
1684         * and 'events' is odd, we can roll back to the previous clean state */
1685        if (nospares
1686            && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1687            && (mddev->events & 1)
1688            && mddev->events != 1)
1689                mddev->events--;
1690        else {
1691                /* otherwise we have to go forward and ... */
1692                mddev->events ++;
1693                if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1694                        /* .. if the array isn't clean, insist on an odd 'events' */
1695                        if ((mddev->events&1)==0) {
1696                                mddev->events++;
1697                                nospares = 0;
1698                        }
1699                } else {
1700                        /* otherwise insist on an even 'events' (for clean states) */
1701                        if ((mddev->events&1)) {
1702                                mddev->events++;
1703                                nospares = 0;
1704                        }
1705                }
1706        }
1707
1708        if (!mddev->events) {
1709                /*
1710                 * oops, this 64-bit counter should never wrap.
1711                 * Either we are in around ~1 trillion A.C., assuming
1712                 * 1 reboot per second, or we have a bug:
1713                 */
1714                MD_BUG();
1715                mddev->events --;
1716        }
1717
1718        /*
1719         * do not write anything to disk if using
1720         * nonpersistent superblocks
1721         */
1722        if (!mddev->persistent) {
1723                if (!mddev->external)
1724                        clear_bit(MD_CHANGE_PENDING, &mddev->flags);
1725
1726                spin_unlock_irq(&mddev->write_lock);
1727                wake_up(&mddev->sb_wait);
1728                return;
1729        }
1730        sync_sbs(mddev, nospares);
1731        spin_unlock_irq(&mddev->write_lock);
1732
1733        dprintk(KERN_INFO 
1734                "md: updating %s RAID superblock on device (in sync %d)\n",
1735                mdname(mddev),mddev->in_sync);
1736
1737        bitmap_update_sb(mddev->bitmap);
1738        rdev_for_each(rdev, tmp, mddev) {
1739                char b[BDEVNAME_SIZE];
1740                dprintk(KERN_INFO "md: ");
1741                if (rdev->sb_loaded != 1)
1742                        continue; /* no noise on spare devices */
1743                if (test_bit(Faulty, &rdev->flags))
1744                        dprintk("(skipping faulty ");
1745
1746                dprintk("%s ", bdevname(rdev->bdev,b));
1747                if (!test_bit(Faulty, &rdev->flags)) {
1748                        md_super_write(mddev,rdev,
1749                                       rdev->sb_offset<<1, rdev->sb_size,
1750                                       rdev->sb_page);
1751                        dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1752                                bdevname(rdev->bdev,b),
1753                                (unsigned long long)rdev->sb_offset);
1754                        rdev->sb_events = mddev->events;
1755
1756                } else
1757                        dprintk(")\n");
1758                if (mddev->level == LEVEL_MULTIPATH)
1759                        /* only need to write one superblock... */
1760                        break;
1761        }
1762        md_super_wait(mddev);
1763        /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
1764
1765        spin_lock_irq(&mddev->write_lock);
1766        if (mddev->in_sync != sync_req ||
1767            test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
1768                /* have to write it out again */
1769                spin_unlock_irq(&mddev->write_lock);
1770                goto repeat;
1771        }
1772        clear_bit(MD_CHANGE_PENDING, &mddev->flags);
1773        spin_unlock_irq(&mddev->write_lock);
1774        wake_up(&mddev->sb_wait);
1775
1776}
1777
/* Words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case.  For this we use cmd_match.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* Returns 1 when cmd (as written into a sysfs file) equals str,
	 * optionally followed by exactly one trailing newline; 0 otherwise.
	 */
	for (; *cmd != '\0' && *str != '\0'; cmd++, str++)
		if (*cmd != *str)
			break;

	/* tolerate a single newline at the end of the written word */
	if (*cmd == '\n')
		cmd++;

	return (*cmd == '\0' && *str == '\0') ? 1 : 0;
}
1797
1798struct rdev_sysfs_entry {
1799        struct attribute attr;
1800        ssize_t (*show)(mdk_rdev_t *, char *);
1801        ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
1802};
1803
1804static ssize_t
1805state_show(mdk_rdev_t *rdev, char *page)
1806{
1807        char *sep = "";
1808        size_t len = 0;
1809
1810        if (test_bit(Faulty, &rdev->flags)) {
1811                len+= sprintf(page+len, "%sfaulty",sep);
1812                sep = ",";
1813        }
1814        if (test_bit(In_sync, &rdev->flags)) {
1815                len += sprintf(page+len, "%sin_sync",sep);
1816                sep = ",";
1817        }
1818        if (test_bit(WriteMostly, &rdev->flags)) {
1819                len += sprintf(page+len, "%swrite_mostly",sep);
1820                sep = ",";
1821        }
1822        if (!test_bit(Faulty, &rdev->flags) &&
1823            !test_bit(In_sync, &rdev->flags)) {
1824                len += sprintf(page+len, "%sspare", sep);
1825                sep = ",";
1826        }
1827        return len+sprintf(page+len, "\n");
1828}
1829
1830static ssize_t
1831state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1832{
1833        /* can write
1834         *  faulty  - simulates and error
1835         *  remove  - disconnects the device
1836         *  writemostly - sets write_mostly
1837         *  -writemostly - clears write_mostly
1838         */
1839        int err = -EINVAL;
1840        if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
1841                md_error(rdev->mddev, rdev);
1842                err = 0;
1843        } else if (cmd_match(buf, "remove")) {
1844                if (rdev->raid_disk >= 0)
1845                        err = -EBUSY;
1846                else {
1847                        mddev_t *mddev = rdev->mddev;
1848                        kick_rdev_from_array(rdev);
1849                        if (mddev->pers)
1850                                md_update_sb(mddev, 1);
1851                        md_new_event(mddev);
1852                        err = 0;
1853                }
1854        } else if (cmd_match(buf, "writemostly")) {
1855                set_bit(WriteMostly, &rdev->flags);
1856                err = 0;
1857        } else if (cmd_match(buf, "-writemostly")) {
1858                clear_bit(WriteMostly, &rdev->flags);
1859                err = 0;
1860        }
1861        return err ? err : len;
1862}
1863static struct rdev_sysfs_entry rdev_state =
1864__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
1865
1866static ssize_t
1867errors_show(mdk_rdev_t *rdev, char *page)
1868{
1869        return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
1870}
1871
1872static ssize_t
1873errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1874{
1875        char *e;
1876        unsigned long n = simple_strtoul(buf, &e, 10);
1877        if (*buf && (*e == 0 || *e == '\n')) {
1878                atomic_set(&rdev->corrected_errors, n);
1879                return len;
1880        }
1881        return -EINVAL;
1882}
1883static struct rdev_sysfs_entry rdev_errors =
1884__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
1885
1886static ssize_t
1887slot_show(mdk_rdev_t *rdev, char *page)
1888{
1889        if (rdev->raid_disk < 0)
1890                return sprintf(page, "none\n");
1891        else
1892                return sprintf(page, "%d\n", rdev->raid_disk);
1893}
1894
1895static ssize_t
1896slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1897{
1898        char *e;
1899        int err;
1900        char nm[20];
1901        int slot = simple_strtoul(buf, &e, 10);
1902        if (strncmp(buf, "none", 4)==0)
1903                slot = -1;
1904        else if (e==buf || (*e && *e!= '\n'))
1905                return -EINVAL;
1906        if (rdev->mddev->pers) {
1907                /* Setting 'slot' on an active array requires also
1908                 * updating the 'rd%d' link, and communicating
1909                 * with the personality with ->hot_*_disk.
1910                 * For now we only support removing
1911                 * failed/spare devices.  This normally happens automatically,
1912                 * but not when the metadata is externally managed.
1913                 */
1914                if (slot != -1)
1915                        return -EBUSY;
1916                if (rdev->raid_disk == -1)
1917                        return -EEXIST;
1918                /* personality does all needed checks */
1919                if (rdev->mddev->pers->hot_add_disk == NULL)
1920                        return -EINVAL;
1921                err = rdev->mddev->pers->
1922                        hot_remove_disk(rdev->mddev, rdev->raid_disk);
1923                if (err)
1924                        return err;
1925                sprintf(nm, "rd%d", rdev->raid_disk);
1926                sysfs_remove_link(&rdev->mddev->kobj, nm);
1927                set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
1928                md_wakeup_thread(rdev->mddev->thread);
1929        } else {
1930                if (slot >= rdev->mddev->raid_disks)
1931                        return -ENOSPC;
1932                rdev->raid_disk = slot;
1933                /* assume it is working */
1934                clear_bit(Faulty, &rdev->flags);
1935                clear_bit(WriteMostly, &rdev->flags);
1936                set_bit(In_sync, &rdev->flags);
1937        }
1938        return len;
1939}
1940
1941
1942static struct rdev_sysfs_entry rdev_slot =
1943__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
1944
1945static ssize_t
1946offset_show(mdk_rdev_t *rdev, char *page)
1947{
1948        return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
1949}
1950
1951static ssize_t
1952offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1953{
1954        char *e;
1955        unsigned long long offset = simple_strtoull(buf, &e, 10);
1956        if (e==buf || (*e && *e != '\n'))
1957                return -EINVAL;
1958        if (rdev->mddev->pers)
1959                return -EBUSY;
1960        if (rdev->size && rdev->mddev->external)
1961                /* Must set offset before size, so overlap checks
1962                 * can be sane */
1963                return -EBUSY;
1964        rdev->data_offset = offset;
1965        return len;
1966}
1967
1968static struct rdev_sysfs_entry rdev_offset =
1969__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
1970
1971static ssize_t
1972rdev_size_show(mdk_rdev_t *rdev, char *page)
1973{
1974        return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
1975}
1976
1977static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
1978{
1979        /* check if two start/length pairs overlap */
1980        if (s1+l1 <= s2)
1981                return 0;
1982        if (s2+l2 <= s1)
1983                return 0;
1984        return 1;
1985}
1986
1987static ssize_t
1988rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1989{
1990        char *e;
1991        unsigned long long size = simple_strtoull(buf, &e, 10);
1992        unsigned long long oldsize = rdev->size;
1993        mddev_t *my_mddev = rdev->mddev;
1994
1995        if (e==buf || (*e && *e != '\n'))
1996                return -EINVAL;
1997        if (my_mddev->pers)
1998                return -EBUSY;
1999        rdev->size = size;
2000        if (size > oldsize && rdev->mddev->external) {
2001                /* need to check that all other rdevs with the same ->bdev
2002                 * do not overlap.  We need to unlock the mddev to avoid
2003                 * a deadlock.  We have already changed rdev->size, and if
2004                 * we have to change it back, we will have the lock again.
2005                 */
2006                mddev_t *mddev;
2007                int overlap = 0;
2008                struct list_head *tmp, *tmp2;
2009
2010                mddev_unlock(my_mddev);
2011                for_each_mddev(mddev, tmp) {
2012                        mdk_rdev_t *rdev2;
2013
2014                        mddev_lock(mddev);
2015                        rdev_for_each(rdev2, tmp2, mddev)
2016                                if (test_bit(AllReserved, &rdev2->flags) ||
2017                                    (rdev->bdev == rdev2->bdev &&
2018                                     rdev != rdev2 &&
2019                                     overlaps(rdev->data_offset, rdev->size,
2020                                            rdev2->data_offset, rdev2->size))) {
2021                                        overlap = 1;
2022                                        break;
2023                                }
2024                        mddev_unlock(mddev);
2025                        if (overlap) {
2026                                mddev_put(mddev);
2027                                break;
2028                        }
2029                }
2030                mddev_lock(my_mddev);
2031                if (overlap) {
2032                        /* Someone else could have slipped in a size
2033                         * change here, but doing so is just silly.
2034                         * We put oldsize back because we *know* it is
2035                         * safe, and trust userspace not to race with
2036                         * itself
2037                         */
2038                        rdev->size = oldsize;
2039                        return -EBUSY;
2040                }
2041        }
2042        if (size < my_mddev->size || my_mddev->size == 0)
2043                my_mddev->size = size;
2044        return len;
2045}
2046
2047static struct rdev_sysfs_entry rdev_size =
2048__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
2049
2050static struct attribute *rdev_default_attrs[] = {
2051        &rdev_state.attr,
2052        &rdev_errors.attr,
2053        &rdev_slot.attr,
2054        &rdev_offset.attr,
2055        &rdev_size.attr,
2056        NULL,
2057};
2058static ssize_t
2059rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2060{
2061        struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2062        mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2063        mddev_t *mddev = rdev->mddev;
2064        ssize_t rv;
2065
2066        if (!entry->show)
2067                return -EIO;
2068
2069        rv = mddev ? mddev_lock(mddev) : -EBUSY;
2070        if (!rv) {
2071                if (rdev->mddev == NULL)
2072                        rv = -EBUSY;
2073                else
2074                        rv = entry->show(rdev, page);
2075                mddev_unlock(mddev);
2076        }
2077        return rv;
2078}
2079
2080static ssize_t
2081rdev_attr_store(struct kobject *kobj, struct attribute *attr,
2082              const char *page, size_t length)
2083{
2084        struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2085        mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2086        ssize_t rv;
2087        mddev_t *mddev = rdev->mddev;
2088
2089        if (!entry->store)
2090                return -EIO;
2091        if (!capable(CAP_SYS_ADMIN))
2092                return -EACCES;
2093        rv = mddev ? mddev_lock(mddev): -EBUSY;
2094        if (!rv) {
2095                if (rdev->mddev == NULL)
2096                        rv = -EBUSY;
2097                else
2098                        rv = entry->store(rdev, page, length);
2099                mddev_unlock(rdev->mddev);
2100        }
2101        return rv;
2102}
2103
2104static void rdev_free(struct kobject *ko)
2105{
2106        mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
2107        kfree(rdev);
2108}
2109static struct sysfs_ops rdev_sysfs_ops = {
2110        .show           = rdev_attr_show,
2111        .store          = rdev_attr_store,
2112};
2113static struct kobj_type rdev_ktype = {
2114        .release        = rdev_free,
2115        .sysfs_ops      = &rdev_sysfs_ops,
2116        .default_attrs  = rdev_default_attrs,
2117};
2118
2119/*
2120 * Import a device. If 'super_format' >= 0, then sanity check the superblock
2121 *
2122 * mark the device faulty if:
2123 *
2124 *   - the device is nonexistent (zero size)
2125 *   - the device has no valid superblock
2126 *
2127 * a faulty rdev _never_ has rdev->sb set.
2128 */
2129static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
2130{
2131        char b[BDEVNAME_SIZE];
2132        int err;
2133        mdk_rdev_t *rdev;
2134        sector_t size;
2135
2136        rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
2137        if (!rdev) {
2138                printk(KERN_ERR "md: could not alloc mem for new device!\n");
2139                return ERR_PTR(-ENOMEM);
2140        }
2141
2142        if ((err = alloc_disk_sb(rdev)))
2143                goto abort_free;
2144
2145        err = lock_rdev(rdev, newdev, super_format == -2);
2146        if (err)
2147                goto abort_free;
2148
2149        kobject_init(&rdev->kobj, &rdev_ktype);
2150
2151        rdev->desc_nr = -1;
2152        rdev->saved_raid_disk = -1;
2153        rdev->raid_disk = -1;
2154        rdev->flags = 0;
2155        rdev->data_offset = 0;
2156        rdev->sb_events = 0;
2157        atomic_set(&rdev->nr_pending, 0);
2158        atomic_set(&rdev->read_errors, 0);
2159        atomic_set(&rdev->corrected_errors, 0);
2160
2161        size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2162        if (!size) {
2163                printk(KERN_WARNING 
2164                        "md: %s has zero or unknown size, marking faulty!\n",
2165                        bdevname(rdev->bdev,b));
2166                err = -EINVAL;
2167                goto abort_free;
2168        }
2169
2170        if (super_format >= 0) {
2171                err = super_types[super_format].
2172                        load_super(rdev, NULL, super_minor);
2173                if (err == -EINVAL) {
2174                        printk(KERN_WARNING
2175                                "md: %s does not have a valid v%d.%d "
2176                               "superblock, not importing!\n",
2177                                bdevname(rdev->bdev,b),
2178                               super_format, super_minor);
2179                        goto abort_free;
2180                }
2181                if (err < 0) {
2182                        printk(KERN_WARNING 
2183                                "md: could not read %s's sb, not importing!\n",
2184                                bdevname(rdev->bdev,b));
2185                        goto abort_free;
2186                }
2187        }
2188        INIT_LIST_HEAD(&rdev->same_set);
2189
2190        return rdev;
2191
2192abort_free:
2193        if (rdev->sb_page) {
2194                if (rdev->bdev)
2195                        unlock_rdev(rdev);
2196                free_disk_sb(rdev);
2197        }
2198        kfree(rdev);
2199        return ERR_PTR(err);
2200}
2201
2202/*
2203 * Check a full RAID array for plausibility
2204 */
2205
2206
2207static void analyze_sbs(mddev_t * mddev)
2208{
2209        int i;
2210        struct list_head *tmp;
2211        mdk_rdev_t *rdev, *freshest;
2212        char b[BDEVNAME_SIZE];
2213
2214        freshest = NULL;
2215        rdev_for_each(rdev, tmp, mddev)
2216                switch (super_types[mddev->major_version].
2217                        load_super(rdev, freshest, mddev->minor_version)) {
2218                case 1:
2219                        freshest = rdev;
2220                        break;
2221                case 0:
2222                        break;
2223                default:
2224                        printk( KERN_ERR \
2225                                "md: fatal superblock inconsistency in %s"
2226                                " -- removing from array\n", 
2227                                bdevname(rdev->bdev,b));
2228                        kick_rdev_from_array(rdev);
2229                }
2230
2231
2232        super_types[mddev->major_version].
2233                validate_super(mddev, freshest);
2234
2235        i = 0;
2236        rdev_for_each(rdev, tmp, mddev) {
2237                if (rdev != freshest)
2238                        if (super_types[mddev->major_version].
2239                            validate_super(mddev, rdev)) {
2240                                printk(KERN_WARNING "md: kicking non-fresh %s"
2241                                        " from array!\n",
2242                                        bdevname(rdev->bdev,b));
2243                                kick_rdev_from_array(rdev);
2244                                continue;
2245                        }
2246                if (mddev->level == LEVEL_MULTIPATH) {
2247                        rdev->desc_nr = i++;
2248                        rdev->raid_disk = rdev->desc_nr;
2249                        set_bit(In_sync, &rdev->flags);
2250                } else if (rdev->raid_disk >= mddev->raid_disks) {
2251                        rdev->raid_disk = -1;
2252                        clear_bit(In_sync, &rdev->flags);
2253                }
2254        }
2255
2256
2257
2258        if (mddev->recovery_cp != MaxSector &&
2259            mddev->level >= 1)
2260                printk(KERN_ERR "md: %s: raid array is not clean"
2261                       " -- starting background reconstruction\n",
2262                       mdname(mddev));
2263
2264}
2265
2266static ssize_t
2267safe_delay_show(mddev_t *mddev, char *page)
2268{
2269        int msec = (mddev->safemode_delay*1000)/HZ;
2270        return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2271}
2272static ssize_t
2273safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2274{
2275        int scale=1;
2276        int dot=0;
2277        int i;
2278        unsigned long msec;
2279        char buf[30];
2280        char *e;
2281        /* remove a period, and count digits after it */
2282        if (len >= sizeof(buf))
2283                return -EINVAL;
2284        strlcpy(buf, cbuf, len);
2285        buf[len] = 0;
2286        for (i=0; i<len; i++) {
2287                if (dot) {
2288                        if (isdigit(buf[i])) {
2289                                buf[i-1] = buf[i];
2290                                scale *= 10;
2291                        }
2292                        buf[i] = 0;
2293                } else if (buf[i] == '.') {
2294                        dot=1;
2295                        buf[i] = 0;
2296                }
2297        }
2298        msec = simple_strtoul(buf, &e, 10);
2299        if (e == buf || (*e && *e != '\n'))
2300                return -EINVAL;
2301        msec = (msec * 1000) / scale;
2302        if (msec == 0)
2303                mddev->safemode_delay = 0;
2304        else {
2305                mddev->safemode_delay = (msec*HZ)/1000;
2306                if (mddev->safemode_delay == 0)
2307                        mddev->safemode_delay = 1;
2308        }
2309        return len;
2310}
2311static struct md_sysfs_entry md_safe_delay =
2312__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
2313
2314static ssize_t
2315level_show(mddev_t *mddev, char *page)
2316{
2317        struct mdk_personality *p = mddev->pers;
2318        if (p)
2319                return sprintf(page, "%s\n", p->name);
2320        else if (mddev->clevel[0])
2321                return sprintf(page, "%s\n", mddev->clevel);
2322        else if (mddev->level != LEVEL_NONE)
2323                return sprintf(page, "%d\n", mddev->level);
2324        else
2325                return 0;
2326}
2327
2328static ssize_t
2329level_store(mddev_t *mddev, const char *buf, size_t len)
2330{
2331        ssize_t rv = len;
2332        if (mddev->pers)
2333                return -EBUSY;
2334        if (len == 0)
2335                return 0;
2336        if (len >= sizeof(mddev->clevel))
2337                return -ENOSPC;
2338        strncpy(mddev->clevel, buf, len);
2339        if (mddev->clevel[len-1] == '\n')
2340                len--;
2341        mddev->clevel[len] = 0;
2342        mddev->level = LEVEL_NONE;
2343        return rv;
2344}
2345
2346static struct md_sysfs_entry md_level =
2347__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
2348
2349
2350static ssize_t
2351layout_show(mddev_t *mddev, char *page)
2352{
2353        /* just a number, not meaningful for all levels */
2354        if (mddev->reshape_position != MaxSector &&
2355            mddev->layout != mddev->new_layout)
2356                return sprintf(page, "%d (%d)\n",
2357                               mddev->new_layout, mddev->layout);
2358        return sprintf(page, "%d\n", mddev->layout);
2359}
2360
2361static ssize_t
2362layout_store(mddev_t *mddev, const char *buf, size_t len)
2363{
2364        char *e;
2365        unsigned long n = simple_strtoul(buf, &e, 10);
2366
2367        if (!*buf || (*e && *e != '\n'))
2368                return -EINVAL;
2369
2370        if (mddev->pers)
2371                return -EBUSY;
2372        if (mddev->reshape_position != MaxSector)
2373                mddev->new_layout = n;
2374        else
2375                mddev->layout = n;
2376        return len;
2377}
2378static struct md_sysfs_entry md_layout =
2379__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
2380
2381
2382static ssize_t
2383raid_disks_show(mddev_t *mddev, char *page)
2384{
2385        if (mddev->raid_disks == 0)
2386                return 0;
2387        if (mddev->reshape_position != MaxSector &&
2388            mddev->delta_disks != 0)
2389                return sprintf(page, "%d (%d)\n", mddev->raid_disks,
2390                               mddev->raid_disks - mddev->delta_disks);
2391        return sprintf(page, "%d\n", mddev->raid_disks);
2392}
2393
2394static int update_raid_disks(mddev_t *mddev, int raid_disks);
2395
2396static ssize_t
2397raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2398{
2399        char *e;
2400        int rv = 0;
2401        unsigned long n = simple_strtoul(buf, &e, 10);
2402
2403        if (!*buf || (*e && *e != '\n'))
2404                return -EINVAL;
2405
2406        if (mddev->pers)
2407                rv = update_raid_disks(mddev, n);
2408        else if (mddev->reshape_position != MaxSector) {
2409                int olddisks = mddev->raid_disks - mddev->delta_disks;
2410                mddev->delta_disks = n - olddisks;
2411                mddev->raid_disks = n;
2412        } else
2413                mddev->raid_disks = n;
2414        return rv ? rv : len;
2415}
2416static struct md_sysfs_entry md_raid_disks =
2417__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
2418
2419static ssize_t
2420chunk_size_show(mddev_t *mddev, char *page)
2421{
2422        if (mddev->reshape_position != MaxSector &&
2423            mddev->chunk_size != mddev->new_chunk)
2424                return sprintf(page, "%d (%d)\n", mddev->new_chunk,
2425                               mddev->chunk_size);
2426        return sprintf(page, "%d\n", mddev->chunk_size);
2427}
2428
2429static ssize_t
2430chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2431{
2432        /* can only set chunk_size if array is not yet active */
2433        char *e;
2434        unsigned long n = simple_strtoul(buf, &e, 10);
2435
2436        if (!*buf || (*e && *e != '\n'))
2437                return -EINVAL;
2438
2439        if (mddev->pers)
2440                return -EBUSY;
2441        else if (mddev->reshape_position != MaxSector)
2442                mddev->new_chunk = n;
2443        else
2444                mddev->chunk_size = n;
2445        return len;
2446}
2447static struct md_sysfs_entry md_chunk_size =
2448__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
2449
2450static ssize_t
2451resync_start_show(mddev_t *mddev, char *page)
2452{
2453        return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2454}
2455
2456static ssize_t
2457resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2458{
2459        /* can only set chunk_size if array is not yet active */
2460        char *e;
2461        unsigned long long n = simple_strtoull(buf, &e, 10);
2462
2463        if (mddev->pers)
2464                return -EBUSY;
2465        if (!*buf || (*e && *e != '\n'))
2466                return -EINVAL;
2467
2468        mddev->recovery_cp = n;
2469        return len;
2470}
2471static struct md_sysfs_entry md_resync_start =
2472__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2473
2474/*
2475 * The array state can be:
2476 *
2477 * clear
2478 *     No devices, no size, no level
2479 *     Equivalent to STOP_ARRAY ioctl
2480 * inactive
2481 *     May have some settings, but array is not active
2482 *        all IO results in error
2483 *     When written, doesn't tear down array, but just stops it
2484 * suspended (not supported yet)
2485 *     All IO requests will block. The array can be reconfigured.
 *     Writing this, if accepted, will block until array is quiescent
2487 * readonly
2488 *     no resync can happen.  no superblocks get written.
2489 *     write requests fail
2490 * read-auto
2491 *     like readonly, but behaves like 'clean' on a write request.
2492 *
2493 * clean - no pending writes, but otherwise active.
2494 *     When written to inactive array, starts without resync
2495 *     If a write request arrives then
2496 *       if metadata is known, mark 'dirty' and switch to 'active'.
2497 *       if not known, block and switch to write-pending
2498 *     If written to an active array that has pending writes, then fails.
2499 * active
2500 *     fully active: IO and resync can be happening.
2501 *     When written to inactive array, starts with resync
2502 *
2503 * write-pending
2504 *     clean, but writes are blocked waiting for 'active' to be written.
2505 *
2506 * active-idle
2507 *     like active, but no writes have been seen for a while (100msec).
2508 *
2509 */
2510enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
2511                   write_pending, active_idle, bad_word};
2512static char *array_states[] = {
2513        "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
2514        "write-pending", "active-idle", NULL };
2515
/*
 * Return the index of the first entry in the NULL-terminated 'list'
 * that cmd_match()es 'word', or the index of the terminating NULL if
 * none does.
 */
static int match_word(const char *word, char **list)
{
	int idx = 0;

	while (list[idx] && !cmd_match(word, list[idx]))
		idx++;

	return idx;
}
2524
2525static ssize_t
2526array_state_show(mddev_t *mddev, char *page)
2527{
2528        enum array_state st = inactive;
2529
2530        if (mddev->pers)
2531                switch(mddev->ro) {
2532                case 1:
2533                        st = readonly;
2534                        break;
2535                case 2:
2536                        st = read_auto;
2537                        break;
2538                case 0:
2539                        if (mddev->in_sync)
2540                                st = clean;
2541                        else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
2542                                st = write_pending;
2543                        else if (mddev->safemode)
2544                                st = active_idle;
2545                        else
2546                                st = active;
2547                }
2548        else {
2549                if (list_empty(&mddev->disks) &&
2550                    mddev->raid_disks == 0 &&
2551                    mddev->size == 0)
2552                        st = clear;
2553                else
2554                        st = inactive;
2555        }
2556        return sprintf(page, "%s\n", array_states[st]);
2557}
2558
2559static int do_md_stop(mddev_t * mddev, int ro);
2560static int do_md_run(mddev_t * mddev);
2561static int restart_array(mddev_t *mddev);
2562
2563static ssize_t
2564array_state_store(mddev_t *mddev, const char *buf, size_t len)
2565{
2566        int err = -EINVAL;
2567        enum array_state st = match_word(buf, array_states);
2568        switch(st) {
2569        case bad_word:
2570                break;
2571        case clear:
2572                /* stopping an active array */
2573                if (atomic_read(&mddev->active) > 1)
2574                        return -EBUSY;
2575                err = do_md_stop(mddev, 0);
2576                break;
2577        case inactive:
2578                /* stopping an active array */
2579                if (mddev->pers) {
2580                        if (atomic_read(&mddev->active) > 1)
2581                                return -EBUSY;
2582                        err = do_md_stop(mddev, 2);
2583                } else
2584                        err = 0; /* already inactive */
2585                break;
2586        case suspended:
2587                break; /* not supported yet */
2588        case readonly:
2589                if (mddev->pers)
2590                        err = do_md_stop(mddev, 1);
2591                else {
2592                        mddev->ro = 1;
2593                        err = do_md_run(mddev);
2594                }
2595                break;
2596        case read_auto:
2597                /* stopping an active array */
2598                if (mddev->pers) {
2599                        err = do_md_stop(mddev, 1);
2600                        if (err == 0)
2601                                mddev->ro = 2; /* FIXME mark devices writable */
2602                } else {
2603                        mddev->ro = 2;
2604                        err = do_md_run(mddev);
2605                }
2606                break;
2607        case clean:
2608                if (mddev->pers) {
2609                        restart_array(mddev);
2610                        spin_lock_irq(&mddev->write_lock);
2611                        if (atomic_read(&mddev->writes_pending) == 0) {
2612                                if (mddev->in_sync == 0) {
2613                                        mddev->in_sync = 1;
2614                                        if (mddev->persistent)
2615                                                set_bit(MD_CHANGE_CLEAN,
2616                                                        &mddev->flags);
2617                                }
2618                                err = 0;
2619                        } else
2620                                err = -EBUSY;
2621                        spin_unlock_irq(&mddev->write_lock);
2622                } else {
2623                        mddev->ro = 0;
2624                        mddev->recovery_cp = MaxSector;
2625                        err = do_md_run(mddev);
2626                }
2627                break;
2628        case active:
2629                if (mddev->pers) {
2630                        restart_array(mddev);
2631                        if (mddev->external)
2632                                clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2633                        wake_up(&mddev->sb_wait);
2634                        err = 0;
2635                } else {
2636                        mddev->ro = 0;
2637                        err = do_md_run(mddev);
2638                }
2639                break;
2640        case write_pending:
2641        case active_idle:
2642                /* these cannot be set */
2643                break;
2644        }
2645        if (err)
2646                return err;
2647        else
2648                return len;
2649}
2650static struct md_sysfs_entry md_array_state =
2651__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
2652
2653static ssize_t
2654null_show(mddev_t *mddev, char *page)
2655{
2656        return -EINVAL;
2657}
2658
2659static ssize_t
2660new_dev_store(mddev_t *mddev, const char *buf, size_t len)
2661{
2662        /* buf must be %d:%d\n? giving major and minor numbers */
2663        /* The new device is added to the array.
2664         * If the array has a persistent superblock, we read the
2665         * superblock to initialise info and check validity.
2666         * Otherwise, only checking done is that in bind_rdev_to_array,
2667         * which mainly checks size.
2668         */
2669        char *e;
2670        int major = simple_strtoul(buf, &e, 10);
2671        int minor;
2672        dev_t dev;
2673        mdk_rdev_t *rdev;
2674        int err;
2675
2676        if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
2677                return -EINVAL;
2678        minor = simple_strtoul(e+1, &e, 10);
2679        if (*e && *e != '\n')
2680                return -EINVAL;
2681        dev = MKDEV(major, minor);
2682        if (major != MAJOR(dev) ||
2683            minor != MINOR(dev))
2684                return -EOVERFLOW;
2685
2686
2687        if (mddev->persistent) {
2688                rdev = md_import_device(dev, mddev->major_version,
2689                                        mddev->minor_version);
2690                if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
2691                        mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
2692                                                       mdk_rdev_t, same_set);
2693                        err = super_types[mddev->major_version]
2694                                .load_super(rdev, rdev0, mddev->minor_version);
2695                        if (err < 0)
2696                                goto out;
2697                }
2698        } else if (mddev->external)
2699                rdev = md_import_device(dev, -2, -1);
2700        else
2701                rdev = md_import_device(dev, -1, -1);
2702
2703        if (IS_ERR(rdev))
2704                return PTR_ERR(rdev);
2705        err = bind_rdev_to_array(rdev, mddev);
2706 out:
2707        if (err)
2708                export_rdev(rdev);
2709        return err ? err : len;
2710}
2711
2712static struct md_sysfs_entry md_new_device =
2713__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
2714
2715static ssize_t
2716bitmap_store(mddev_t *mddev, const char *buf, size_t len)
2717{
2718        char *end;
2719        unsigned long chunk, end_chunk;
2720
2721        if (!mddev->bitmap)
2722                goto out;
2723        /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
2724        while (*buf) {
2725                chunk = end_chunk = simple_strtoul(buf, &end, 0);
2726                if (buf == end) break;
2727                if (*end == '-') { /* range */
2728                        buf = end + 1;
2729                        end_chunk = simple_strtoul(buf, &end, 0);
2730                        if (buf == end) break;
2731                }
2732                if (*end && !isspace(*end)) break;
2733                bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
2734                buf = end;
2735                while (isspace(*buf)) buf++;
2736        }
2737        bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
2738out:
2739        return len;
2740}
2741
2742static struct md_sysfs_entry md_bitmap =
2743__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
2744
2745static ssize_t
2746size_show(mddev_t *mddev, char *page)
2747{
2748        return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
2749}
2750
2751static int update_size(mddev_t *mddev, unsigned long size);
2752
2753static ssize_t
2754size_store(mddev_t *mddev, const char *buf, size_t len)
2755{
2756        /* If array is inactive, we can reduce the component size, but
2757         * not increase it (except from 0).
2758         * If array is active, we can try an on-line resize
2759         */
2760        char *e;
2761        int err = 0;
2762        unsigned long long size = simple_strtoull(buf, &e, 10);
2763        if (!*buf || *buf == '\n' ||
2764            (*e && *e != '\n'))
2765                return -EINVAL;
2766
2767        if (mddev->pers) {
2768                err = update_size(mddev, size);
2769                md_update_sb(mddev, 1);
2770        } else {
2771                if (mddev->size == 0 ||
2772                    mddev->size > size)
2773                        mddev->size = size;
2774                else
2775                        err = -ENOSPC;
2776        }
2777        return err ? err : len;
2778}
2779
2780static struct md_sysfs_entry md_size =
2781__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
2782
2783
2784/* Metdata version.
2785 * This is one of
2786 *   'none' for arrays with no metadata (good luck...)
2787 *   'external' for arrays with externally managed metadata,
2788 * or N.M for internally known formats
2789 */
2790static ssize_t
2791metadata_show(mddev_t *mddev, char *page)
2792{
2793        if (mddev->persistent)
2794                return sprintf(page, "%d.%d\n",
2795                               mddev->major_version, mddev->minor_version);
2796        else if (mddev->external)
2797                return sprintf(page, "external:%s\n", mddev->metadata_type);
2798        else
2799                return sprintf(page, "none\n");
2800}
2801
2802static ssize_t
2803metadata_store(mddev_t *mddev, const char *buf, size_t len)
2804{
2805        int major, minor;
2806        char *e;
2807        if (!list_empty(&mddev->disks))
2808                return -EBUSY;
2809
2810        if (cmd_match(buf, "none")) {
2811                mddev->persistent = 0;
2812                mddev->external = 0;
2813                mddev->major_version = 0;
2814                mddev->minor_version = 90;
2815                return len;
2816        }
2817        if (strncmp(buf, "external:", 9) == 0) {
2818                size_t namelen = len-9;
2819                if (namelen >= sizeof(mddev->metadata_type))
2820                        namelen = sizeof(mddev->metadata_type)-1;
2821                strncpy(mddev->metadata_type, buf+9, namelen);
2822                mddev->metadata_type[namelen] = 0;
2823                if (namelen && mddev->metadata_type[namelen-1] == '\n')
2824                        mddev->metadata_type[--namelen] = 0;
2825                mddev->persistent = 0;
2826                mddev->external = 1;
2827                mddev->major_version = 0;
2828                mddev->minor_version = 90;
2829                return len;
2830        }
2831        major = simple_strtoul(buf, &e, 10);
2832        if (e==buf || *e != '.')
2833                return -EINVAL;
2834        buf = e+1;
2835        minor = simple_strtoul(buf, &e, 10);
2836        if (e==buf || (*e && *e != '\n') )
2837                return -EINVAL;
2838        if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
2839                return -ENOENT;
2840        mddev->major_version = major;
2841        mddev->minor_version = minor;
2842        mddev->persistent = 1;
2843        mddev->external = 0;
2844        return len;
2845}
2846
2847static struct md_sysfs_entry md_metadata =
2848__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
2849
2850static ssize_t
2851action_show(mddev_t *mddev, char *page)
2852{
2853        char *type = "idle";
2854        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2855            (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
2856                if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2857                        type = "reshape";
2858                else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2859                        if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
2860                                type = "resync";
2861                        else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2862                                type = "check";
2863                        else
2864                                type = "repair";
2865                } else
2866                        type = "recover";
2867        }
2868        return sprintf(page, "%s\n", type);
2869}
2870
2871static ssize_t
2872action_store(mddev_t *mddev, const char *page, size_t len)
2873{
2874        if (!mddev->pers || !mddev->pers->sync_request)
2875                return -EINVAL;
2876
2877        if (cmd_match(page, "idle")) {
2878                if (mddev->sync_thread) {
2879                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2880                        md_unregister_thread(mddev->sync_thread);
2881                        mddev->sync_thread = NULL;
2882                        mddev->recovery = 0;
2883                }
2884        } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2885                   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
2886                return -EBUSY;
2887        else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
2888                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2889        else if (cmd_match(page, "reshape")) {
2890                int err;
2891                if (mddev->pers->start_reshape == NULL)
2892                        return -EINVAL;
2893                err = mddev->pers->start_reshape(mddev);
2894                if (err)
2895                        return err;
2896        } else {
2897                if (cmd_match(page, "check"))
2898                        set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
2899                else if (!cmd_match(page, "repair"))
2900                        return -EINVAL;
2901                set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
2902                set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
2903        }
2904        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2905        md_wakeup_thread(mddev->thread);
2906        return len;
2907}
2908
2909static ssize_t
2910mismatch_cnt_show(mddev_t *mddev, char *page)
2911{
2912        return sprintf(page, "%llu\n",
2913                       (unsigned long long) mddev->resync_mismatches);
2914}
2915
2916static struct md_sysfs_entry md_scan_mode =
2917__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
2918
2919
2920static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
2921
2922static ssize_t
2923sync_min_show(mddev_t *mddev, char *page)
2924{
2925        return sprintf(page, "%d (%s)\n", speed_min(mddev),
2926                       mddev->sync_speed_min ? "local": "system");
2927}
2928
2929static ssize_t
2930sync_min_store(mddev_t *mddev, const char *buf, size_t len)
2931{
2932        int min;
2933        char *e;
2934        if (strncmp(buf, "system", 6)==0) {
2935                mddev->sync_speed_min = 0;
2936                return len;
2937        }
2938        min = simple_strtoul(buf, &e, 10);
2939        if (buf == e || (*e && *e != '\n') || min <= 0)
2940                return -EINVAL;
2941        mddev->sync_speed_min = min;
2942        return len;
2943}
2944
2945static struct md_sysfs_entry md_sync_min =
2946__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
2947
2948static ssize_t
2949sync_max_show(mddev_t *mddev, char *page)
2950{
2951        return sprintf(page, "%d (%s)\n", speed_max(mddev),
2952                       mddev->sync_speed_max ? "local": "system");
2953}
2954
2955static ssize_t
2956sync_max_store(mddev_t *mddev, const char *buf, size_t len)
2957{
2958        int max;
2959        char *e;
2960        if (strncmp(buf, "system", 6)==0) {
2961                mddev->sync_speed_max = 0;
2962                return len;
2963        }
2964        max = simple_strtoul(buf, &e, 10);
2965        if (buf == e || (*e && *e != '\n') || max <= 0)
2966                return -EINVAL;
2967        mddev->sync_speed_max = max;
2968        return len;
2969}
2970
2971static struct md_sysfs_entry md_sync_max =
2972__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
2973
2974static ssize_t
2975degraded_show(mddev_t *mddev, char *page)
2976{
2977        return sprintf(page, "%d\n", mddev->degraded);
2978}
2979static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
2980
2981static ssize_t
2982sync_speed_show(mddev_t *mddev, char *page)
2983{
2984        unsigned long resync, dt, db;
2985        resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
2986        dt = ((jiffies - mddev->resync_mark) / HZ);
2987        if (!dt) dt++;
2988        db = resync - (mddev->resync_mark_cnt);
2989        return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
2990}
2991
2992static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
2993
2994static ssize_t
2995sync_completed_show(mddev_t *mddev, char *page)
2996{
2997        unsigned long max_blocks, resync;
2998
2999        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3000                max_blocks = mddev->resync_max_sectors;
3001        else
3002                max_blocks = mddev->size << 1;
3003
3004        resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
3005        return sprintf(page, "%lu / %lu\n", resync, max_blocks);
3006}
3007
3008static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
3009
3010static ssize_t
3011max_sync_show(mddev_t *mddev, char *page)
3012{
3013        if (mddev->resync_max == MaxSector)
3014                return sprintf(page, "max\n");
3015        else
3016                return sprintf(page, "%llu\n",
3017                               (unsigned long long)mddev->resync_max);
3018}
3019static ssize_t
3020max_sync_store(mddev_t *mddev, const char *buf, size_t len)
3021{
3022        if (strncmp(buf, "max", 3) == 0)
3023                mddev->resync_max = MaxSector;
3024        else {
3025                char *ep;
3026                unsigned long long max = simple_strtoull(buf, &ep, 10);
3027                if (ep == buf || (*ep != 0 && *ep != '\n'))
3028                        return -EINVAL;
3029                if (max < mddev->resync_max &&
3030                    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3031                        return -EBUSY;
3032
3033                /* Must be a multiple of chunk_size */
3034                if (mddev->chunk_size) {
3035                        if (max & (sector_t)((mddev->chunk_size>>9)-1))
3036                                return -EINVAL;
3037                }
3038                mddev->resync_max = max;
3039        }
3040        wake_up(&mddev->recovery_wait);
3041        return len;
3042}
3043
3044static struct md_sysfs_entry md_max_sync =
3045__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
3046
3047static ssize_t
3048suspend_lo_show(mddev_t *mddev, char *page)
3049{
3050        return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
3051}
3052
3053static ssize_t
3054suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
3055{
3056        char *e;
3057        unsigned long long new = simple_strtoull(buf, &e, 10);
3058
3059        if (mddev->pers->quiesce == NULL)
3060                return -EINVAL;
3061        if (buf == e || (*e && *e != '\n'))
3062                return -EINVAL;
3063        if (new >= mddev->suspend_hi ||
3064            (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
3065                mddev->suspend_lo = new;
3066                mddev->pers->quiesce(mddev, 2);
3067                return len;
3068        } else
3069                return -EINVAL;
3070}
3071static struct md_sysfs_entry md_suspend_lo =
3072__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
3073
3074
3075static ssize_t
3076suspend_hi_show(mddev_t *mddev, char *page)
3077{
3078        return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
3079}
3080
3081static ssize_t
3082suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
3083{
3084        char *e;
3085        unsigned long long new = simple_strtoull(buf, &e, 10);
3086
3087        if (mddev->pers->quiesce == NULL)
3088                return -EINVAL;
3089        if (buf == e || (*e && *e != '\n'))
3090                return -EINVAL;
3091        if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
3092            (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
3093                mddev->suspend_hi = new;
3094                mddev->pers->quiesce(mddev, 1);
3095                mddev->pers->quiesce(mddev, 0);
3096                return len;
3097        } else
3098                return -EINVAL;
3099}
3100static struct md_sysfs_entry md_suspend_hi =
3101__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
3102
3103static ssize_t
3104reshape_position_show(mddev_t *mddev, char *page)
3105{
3106        if (mddev->reshape_position != MaxSector)
3107                return sprintf(page, "%llu\n",
3108                               (unsigned long long)mddev->reshape_position);
3109        strcpy(page, "none\n");
3110        return 5;
3111}
3112
3113static ssize_t
3114reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
3115{
3116        char *e;
3117        unsigned long long new = simple_strtoull(buf, &e, 10);
3118        if (mddev->pers)
3119                return -EBUSY;
3120        if (buf == e || (*e && *e != '\n'))
3121                return -EINVAL;
3122        mddev->reshape_position = new;
3123        mddev->delta_disks = 0;
3124        mddev->new_level = mddev->level;
3125        mddev->new_layout = mddev->layout;
3126        mddev->new_chunk = mddev->chunk_size;
3127        return len;
3128}
3129
3130static struct md_sysfs_entry md_reshape_position =
3131__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
3132       reshape_position_store);
3133
3134
3135static struct attribute *md_default_attrs[] = {
3136        &md_level.attr,
3137        &md_layout.attr,
3138        &md_raid_disks.attr,
3139        &md_chunk_size.attr,
3140        &md_size.attr,
3141        &md_resync_start.attr,
3142        &md_metadata.attr,
3143        &md_new_device.attr,
3144        &md_safe_delay.attr,
3145        &md_array_state.attr,
3146        &md_reshape_position.attr,
3147        NULL,
3148};
3149
3150static struct attribute *md_redundancy_attrs[] = {
3151        &md_scan_mode.attr,
3152        &md_mismatches.attr,
3153        &md_sync_min.attr,
3154        &md_sync_max.attr,
3155        &md_sync_speed.attr,
3156        &md_sync_completed.attr,
3157        &md_max_sync.attr,
3158        &md_suspend_lo.attr,
3159        &md_suspend_hi.attr,
3160        &md_bitmap.attr,
3161        &md_degraded.attr,
3162        NULL,
3163};
3164static struct attribute_group md_redundancy_group = {
3165        .name = NULL,
3166        .attrs = md_redundancy_attrs,
3167};
3168
3169
3170static ssize_t
3171md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3172{
3173        struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3174        mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3175        ssize_t rv;
3176
3177        if (!entry->show)
3178                return -EIO;
3179        rv = mddev_lock(mddev);
3180        if (!rv) {
3181                rv = entry->show(mddev, page);
3182                mddev_unlock(mddev);
3183        }
3184        return rv;
3185}
3186
3187static ssize_t
3188md_attr_store(struct kobject *kobj, struct attribute *attr,
3189              const char *page, size_t length)
3190{
3191        struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3192        mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3193        ssize_t rv;
3194
3195        if (!entry->store)
3196                return -EIO;
3197        if (!capable(CAP_SYS_ADMIN))
3198                return -EACCES;
3199        rv = mddev_lock(mddev);
3200        if (!rv) {
3201                rv = entry->store(mddev, page, length);
3202                mddev_unlock(mddev);
3203        }
3204        return rv;
3205}
3206
3207static void md_free(struct kobject *ko)
3208{
3209        mddev_t *mddev = container_of(ko, mddev_t, kobj);
3210        kfree(mddev);
3211}
3212
3213static struct sysfs_ops md_sysfs_ops = {
3214        .show   = md_attr_show,
3215        .store  = md_attr_store,
3216};
3217static struct kobj_type md_ktype = {
3218        .release        = md_free,
3219        .sysfs_ops      = &md_sysfs_ops,
3220        .default_attrs  = md_default_attrs,
3221};
3222
3223int mdp_major = 0;
3224
3225static struct kobject *md_probe(dev_t dev, int *part, void *data)
3226{
3227        static DEFINE_MUTEX(disks_mutex);
3228        mddev_t *mddev = mddev_find(dev);
3229        struct gendisk *disk;
3230        int partitioned = (MAJOR(dev) != MD_MAJOR);
3231        int shift = partitioned ? MdpMinorShift : 0;
3232        int unit = MINOR(dev) >> shift;
3233        int error;
3234
3235        if (!mddev)
3236                return NULL;
3237
3238        mutex_lock(&disks_mutex);
3239        if (mddev->gendisk) {
3240                mutex_unlock(&disks_mutex);
3241                mddev_put(mddev);
3242                return NULL;
3243        }
3244        disk = alloc_disk(1 << shift);
3245        if (!disk) {
3246                mutex_unlock(&disks_mutex);
3247                mddev_put(mddev);
3248                return NULL;
3249        }
3250        disk->major = MAJOR(dev);
3251        disk->first_minor = unit << shift;
3252        if (partitioned)
3253                sprintf(disk->disk_name, "md_d%d", unit);
3254        else
3255                sprintf(disk->disk_name, "md%d", unit);
3256        disk->fops = &md_fops;
3257        disk->private_data = mddev;
3258        disk->queue = mddev->queue;
3259        add_disk(disk);
3260        mddev->gendisk = disk;
3261        mutex_unlock(&disks_mutex);
3262        error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj,
3263                                     "%s", "md");
3264        if (error)
3265                printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
3266                       disk->disk_name);
3267        else
3268                kobject_uevent(&mddev->kobj, KOBJ_ADD);
3269        return NULL;
3270}
3271
3272static void md_safemode_timeout(unsigned long data)
3273{
3274        mddev_t *mddev = (mddev_t *) data;
3275
3276        mddev->safemode = 1;
3277        md_wakeup_thread(mddev->thread);
3278}
3279
3280static int start_dirty_degraded;
3281
3282static int do_md_run(mddev_t * mddev)
3283{
3284        int err;
3285        int chunk_size;
3286        struct list_head *tmp;
3287        mdk_rdev_t *rdev;
3288        struct gendisk *disk;
3289        struct mdk_personality *pers;
3290        char b[BDEVNAME_SIZE];
3291
3292        if (list_empty(&mddev->disks))
3293                /* cannot run an array with no devices.. */
3294                return -EINVAL;
3295
3296        if (mddev->pers)
3297                return -EBUSY;
3298
3299        /*
3300         * Analyze all RAID superblock(s)
3301         */
3302        if (!mddev->raid_disks) {
3303                if (!mddev->persistent)
3304                        return -EINVAL;
3305                analyze_sbs(mddev);
3306        }
3307
3308        chunk_size = mddev->chunk_size;
3309
3310        if (chunk_size) {
3311                if (chunk_size > MAX_CHUNK_SIZE) {
3312                        printk(KERN_ERR "too big chunk_size: %d > %d\n",
3313                                chunk_size, MAX_CHUNK_SIZE);
3314                        return -EINVAL;
3315                }
3316                /*
3317                 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
3318                 */
3319                if ( (1 << ffz(~chunk_size)) != chunk_size) {
3320                        printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
3321                        return -EINVAL;
3322                }
3323                if (chunk_size < PAGE_SIZE) {
3324                        printk(KERN_ERR "too small chunk_size: %d < %ld\n",
3325                                chunk_size, PAGE_SIZE);
3326                        return -EINVAL;
3327                }
3328
3329                /* devices must have minimum size of one chunk */
3330                rdev_for_each(rdev, tmp, mddev) {
3331                        if (test_bit(Faulty, &rdev->flags))
3332                                continue;
3333                        if (rdev->size < chunk_size / 1024) {
3334                                printk(KERN_WARNING
3335                                        "md: Dev %s smaller than chunk_size:"
3336                                        " %lluk < %dk\n",
3337                                        bdevname(rdev->bdev,b),
3338                                        (unsigned long long)rdev->size,
3339                                        chunk_size / 1024);
3340                                return -EINVAL;
3341                        }
3342                }
3343        }
3344
3345#ifdef CONFIG_KMOD
3346        if (mddev->level != LEVEL_NONE)
3347                request_module("md-level-%d", mddev->level);
3348        else if (mddev->clevel[0])
3349                request_module("md-%s", mddev->clevel);
3350#endif
3351
3352        /*
3353         * Drop all container device buffers, from now on
3354         * the only valid external interface is through the md
3355         * device.
3356         */
3357        rdev_for_each(rdev, tmp, mddev) {
3358                if (test_bit(Faulty, &rdev->flags))
3359                        continue;
3360                sync_blockdev(rdev->bdev);
3361                invalidate_bdev(rdev->bdev);
3362
3363                /* perform some consistency tests on the device.
3364                 * We don't want the data to overlap the metadata,
3365                 * Internal Bitmap issues has handled elsewhere.
3366                 */
3367                if (rdev->data_offset < rdev->sb_offset) {
3368                        if (mddev->size &&
3369                            rdev->data_offset + mddev->size*2
3370                            > rdev->sb_offset*2) {
3371                                printk("md: %s: data overlaps metadata\n",
3372                                       mdname(mddev));
3373                                return -EINVAL;
3374                        }
3375                } else {
3376                        if (rdev->sb_offset*2 + rdev->sb_size/512
3377                            > rdev->data_offset) {
3378                                printk("md: %s: metadata overlaps data\n",
3379                                       mdname(mddev));
3380                                return -EINVAL;
3381                        }
3382                }
3383        }
3384
3385        md_probe(mddev->unit, NULL, NULL);
3386        disk = mddev->gendisk;
3387        if (!disk)
3388                return -ENOMEM;
3389
3390        spin_lock(&pers_lock);
3391        pers = find_pers(mddev->level, mddev->clevel);
3392        if (!pers || !try_module_get(pers->owner)) {
3393                spin_unlock(&pers_lock);
3394                if (mddev->level != LEVEL_NONE)
3395                        printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
3396                               mddev->level);
3397                else
3398                        printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
3399                               mddev->clevel);
3400                return -EINVAL;
3401        }
3402        mddev->pers = pers;
3403        spin_unlock(&pers_lock);
3404        mddev->level = pers->level;
3405        strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3406
3407        if (mddev->reshape_position != MaxSector &&
3408            pers->start_reshape == NULL) {
3409                /* This personality cannot handle reshaping... */
3410                mddev->pers = NULL;
3411                module_put(pers->owner);
3412                return -EINVAL;
3413        }
3414
3415        if (pers->sync_request) {
3416                /* Warn if this is a potentially silly
3417                 * configuration.
3418                 */
3419                char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3420                mdk_rdev_t *rdev2;
3421                struct list_head *tmp2;
3422                int warned = 0;
3423                rdev_for_each(rdev, tmp, mddev) {
3424                        rdev_for_each(rdev2, tmp2, mddev) {
3425                                if (rdev < rdev2 &&
3426                                    rdev->bdev->bd_contains ==
3427                                    rdev2->bdev->bd_contains) {
3428                                        printk(KERN_WARNING
3429                                               "%s: WARNING: %s appears to be"
3430                                               " on the same physical disk as"
3431                                               " %s.\n",
3432                                               mdname(mddev),
3433                                               bdevname(rdev->bdev,b),
3434                                               bdevname(rdev2->bdev,b2));
3435                                        warned = 1;
3436                                }
3437                        }
3438                }
3439                if (warned)
3440                        printk(KERN_WARNING
3441                               "True protection against single-disk"
3442                               " failure might be compromised.\n");
3443        }
3444
3445        mddev->recovery = 0;
3446        mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
3447        mddev->barriers_work = 1;
3448        mddev->ok_start_degraded = start_dirty_degraded;
3449
3450        if (start_readonly)
3451                mddev->ro = 2; /* read-only, but switch on first write */
3452
3453        err = mddev->pers->run(mddev);
3454        if (!err && mddev->pers->sync_request) {
3455                err = bitmap_create(mddev);
3456                if (err) {
3457                        printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
3458                               mdname(mddev), err);
3459                        mddev->pers->stop(mddev);
3460                }
3461        }
3462        if (err) {
3463                printk(KERN_ERR "md: pers->run() failed ...\n");
3464                module_put(mddev->pers->owner);
3465                mddev->pers = NULL;
3466                bitmap_destroy(mddev);
3467                return err;
3468        }
3469        if (mddev->pers->sync_request) {
3470                if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3471                        printk(KERN_WARNING
3472                               "md: cannot register extra attributes for %s\n",
3473                               mdname(mddev));
3474        } else if (mddev->ro == 2) /* auto-readonly not meaningful */
3475                mddev->ro = 0;
3476
3477        atomic_set(&mddev->writes_pending,0);
3478        mddev->safemode = 0;
3479        mddev->safemode_timer.function = md_safemode_timeout;
3480        mddev->safemode_timer.data = (unsigned long) mddev;
3481        mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
3482        mddev->in_sync = 1;
3483
3484        rdev_for_each(rdev, tmp, mddev)
3485                if (rdev->raid_disk >= 0) {
3486                        char nm[20];
3487                        sprintf(nm, "rd%d", rdev->raid_disk);
3488                        if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
3489                                printk("md: cannot register %s for %s\n",
3490                                       nm, mdname(mddev));
3491                }
3492        
3493        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3494        
3495        if (mddev->flags)
3496                md_update_sb(mddev, 0);
3497
3498        set_capacity(disk, mddev->array_size<<1);
3499
3500        /* If we call blk_queue_make_request here, it will
3501         * re-initialise max_sectors etc which may have been
3502         * refined inside -> run.  So just set the bits we need to set.
3503         * Most initialisation happended when we called
3504         * blk_queue_make_request(..., md_fail_request)
3505         * earlier.
3506         */
3507        mddev->queue->queuedata = mddev;
3508        mddev->queue->make_request_fn = mddev->pers->make_request;
3509
3510        /* If there is a partially-recovered drive we need to
3511         * start recovery here.  If we leave it to md_check_recovery,
3512         * it will remove the drives and not do the right thing
3513         */
3514        if (mddev->degraded && !mddev->sync_thread) {
3515                struct list_head *rtmp;
3516                int spares = 0;
3517                rdev_for_each(rdev, rtmp, mddev)
3518                        if (rdev->raid_disk >= 0 &&
3519                            !test_bit(In_sync, &rdev->flags) &&
3520                            !test_bit(Faulty, &rdev->flags))
3521                                /* complete an interrupted recovery */
3522                                spares++;
3523                if (spares && mddev->pers->sync_request) {
3524                        mddev->recovery = 0;
3525                        set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3526                        mddev->sync_thread = md_register_thread(md_do_sync,
3527                                                                mddev,
3528                                                                "%s_resync");
3529                        if (!mddev->sync_thread) {
3530                                printk(KERN_ERR "%s: could not start resync"
3531                                       " thread...\n",
3532                                       mdname(mddev));
3533                                /* leave the spares where they are, it shouldn't hurt */
3534                                mddev->recovery = 0;
3535                        }
3536                }
3537        }
3538        md_wakeup_thread(mddev->thread);
3539        md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
3540
3541        mddev->changed = 1;
3542        md_new_event(mddev);
3543        kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE);
3544        return 0;
3545}
3546
3547static int restart_array(mddev_t *mddev)
3548{
3549        struct gendisk *disk = mddev->gendisk;
3550        int err;
3551
3552        /*
3553         * Complain if it has no devices
3554         */
3555        err = -ENXIO;
3556        if (list_empty(&mddev->disks))
3557                goto out;
3558
3559        if (mddev->pers) {
3560                err = -EBUSY;
3561                if (!mddev->ro)
3562                        goto out;
3563
3564                mddev->safemode = 0;
3565                mddev->ro = 0;
3566                set_disk_ro(disk, 0);
3567
3568                printk(KERN_INFO "md: %s switched to read-write mode.\n",
3569                        mdname(mddev));
3570                /*
3571                 * Kick recovery or resync if necessary
3572                 */
3573                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3574                md_wakeup_thread(mddev->thread);
3575                md_wakeup_thread(mddev->sync_thread);
3576                err = 0;
3577        } else
3578                err = -EINVAL;
3579
3580out:
3581        return err;
3582}
3583
3584/* similar to deny_write_access, but accounts for our holding a reference
3585 * to the file ourselves */
3586static int deny_bitmap_write_access(struct file * file)
3587{
3588        struct inode *inode = file->f_mapping->host;
3589
3590        spin_lock(&inode->i_lock);
3591        if (atomic_read(&inode->i_writecount) > 1) {
3592                spin_unlock(&inode->i_lock);
3593                return -ETXTBSY;
3594        }
3595        atomic_set(&inode->i_writecount, -1);
3596        spin_unlock(&inode->i_lock);
3597
3598        return 0;
3599}
3600
3601static void restore_bitmap_write_access(struct file *file)
3602{
3603        struct inode *inode = file->f_mapping->host;
3604
3605        spin_lock(&inode->i_lock);
3606        atomic_set(&inode->i_writecount, 1);
3607        spin_unlock(&inode->i_lock);
3608}
3609
3610/* mode:
3611 *   0 - completely stop and dis-assemble array
3612 *   1 - switch to readonly
3613 *   2 - stop but do not disassemble array
3614 */
3615static int do_md_stop(mddev_t * mddev, int mode)
3616{
3617        int err = 0;
3618        struct gendisk *disk = mddev->gendisk;
3619
3620        if (mddev->pers) {
3621                if (atomic_read(&mddev->active)>2) {
3622                        printk("md: %s still in use.\n",mdname(mddev));
3623                        return -EBUSY;
3624                }
3625
3626                if (mddev->sync_thread) {
3627                        set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3628                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3629                        md_unregister_thread(mddev->sync_thread);
3630                        mddev->sync_thread = NULL;
3631                }
3632
3633                del_timer_sync(&mddev->safemode_timer);
3634
3635                invalidate_partition(disk, 0);
3636
3637                switch(mode) {
3638                case 1: /* readonly */
3639                        err  = -ENXIO;
3640                        if (mddev->ro==1)
3641                                goto out;
3642                        mddev->ro = 1;
3643                        break;
3644                case 0: /* disassemble */
3645                case 2: /* stop */
3646                        bitmap_flush(mddev);
3647                        md_super_wait(mddev);
3648                        if (mddev->ro)
3649                                set_disk_ro(disk, 0);
3650                        blk_queue_make_request(mddev->queue, md_fail_request);
3651                        mddev->pers->stop(mddev);
3652                        mddev->queue->merge_bvec_fn = NULL;
3653                        mddev->queue->unplug_fn = NULL;
3654                        mddev->queue->backing_dev_info.congested_fn = NULL;
3655                        if (mddev->pers->sync_request)
3656                                sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
3657
3658                        module_put(mddev->pers->owner);
3659                        mddev->pers = NULL;
3660
3661                        set_capacity(disk, 0);
3662                        mddev->changed = 1;
3663
3664                        if (mddev->ro)
3665                                mddev->ro = 0;
3666                }
3667                if (!mddev->in_sync || mddev->flags) {
3668                        /* mark array as shutdown cleanly */
3669                        mddev->in_sync = 1;
3670                        md_update_sb(mddev, 1);
3671                }
3672                if (mode == 1)
3673                        set_disk_ro(disk, 1);
3674                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3675        }
3676
3677        /*
3678         * Free resources if final stop
3679         */
3680        if (mode == 0) {
3681                mdk_rdev_t *rdev;
3682                struct list_head *tmp;
3683
3684                printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
3685
3686                bitmap_destroy(mddev);
3687                if (mddev->bitmap_file) {
3688                        restore_bitmap_write_access(mddev->bitmap_file);
3689                        fput(mddev->bitmap_file);
3690                        mddev->bitmap_file = NULL;
3691                }
3692                mddev->bitmap_offset = 0;
3693
3694                rdev_for_each(rdev, tmp, mddev)
3695                        if (rdev->raid_disk >= 0) {
3696                                char nm[20];
3697                                sprintf(nm, "rd%d", rdev->raid_disk);
3698                                sysfs_remove_link(&mddev->kobj, nm);
3699                        }
3700
3701                /* make sure all md_delayed_delete calls have finished */
3702                flush_scheduled_work();
3703
3704                export_array(mddev);
3705
3706                mddev->array_size = 0;
3707                mddev->size = 0;
3708                mddev->raid_disks = 0;
3709                mddev->recovery_cp = 0;
3710                mddev->resync_max = MaxSector;
3711                mddev->reshape_position = MaxSector;
3712                mddev->external = 0;
3713                mddev->persistent = 0;
3714
3715        } else if (mddev->pers)
3716                printk(KERN_INFO "md: %s switched to read-only mode.\n",
3717                        mdname(mddev));
3718        err = 0;
3719        md_new_event(mddev);
3720out:
3721        return err;
3722}
3723
3724#ifndef MODULE
3725static void autorun_array(mddev_t *mddev)
3726{
3727        mdk_rdev_t *rdev;
3728        struct list_head *tmp;
3729        int err;
3730
3731        if (list_empty(&mddev->disks))
3732                return;
3733
3734        printk(KERN_INFO "md: running: ");
3735
3736        rdev_for_each(rdev, tmp, mddev) {
3737                char b[BDEVNAME_SIZE];
3738                printk("<%s>", bdevname(rdev->bdev,b));
3739        }
3740        printk("\n");
3741
3742        err = do_md_run (mddev);
3743        if (err) {
3744                printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
3745                do_md_stop (mddev, 0);
3746        }
3747}
3748
3749/*
3750 * lets try to run arrays based on all disks that have arrived
3751 * until now. (those are in pending_raid_disks)
3752 *
3753 * the method: pick the first pending disk, collect all disks with
3754 * the same UUID, remove all from the pending list and put them into
3755 * the 'same_array' list. Then order this list based on superblock
3756 * update time (freshest comes first), kick out 'old' disks and
3757 * compare superblocks. If everything's fine then run it.
3758 *
3759 * If "unit" is allocated, then bump its reference count
3760 */
3761static void autorun_devices(int part)
3762{
3763        struct list_head *tmp;
3764        mdk_rdev_t *rdev0, *rdev;
3765        mddev_t *mddev;
3766        char b[BDEVNAME_SIZE];
3767
3768        printk(KERN_INFO "md: autorun ...\n");
3769        while (!list_empty(&pending_raid_disks)) {
3770                int unit;
3771                dev_t dev;
3772                LIST_HEAD(candidates);
3773                rdev0 = list_entry(pending_raid_disks.next,
3774                                         mdk_rdev_t, same_set);
3775
3776                printk(KERN_INFO "md: considering %s ...\n",
3777                        bdevname(rdev0->bdev,b));
3778                INIT_LIST_HEAD(&candidates);
3779                rdev_for_each_list(rdev, tmp, pending_raid_disks)
3780                        if (super_90_load(rdev, rdev0, 0) >= 0) {
3781                                printk(KERN_INFO "md:  adding %s ...\n",
3782                                        bdevname(rdev->bdev,b));
3783                                list_move(&rdev->same_set, &candidates);
3784                        }
3785                /*
3786                 * now we have a set of devices, with all of them having
3787                 * mostly sane superblocks. It's time to allocate the
3788                 * mddev.
3789                 */
3790                if (part) {
3791                        dev = MKDEV(mdp_major,
3792                                    rdev0->preferred_minor << MdpMinorShift);
3793                        unit = MINOR(dev) >> MdpMinorShift;
3794                } else {
3795                        dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
3796                        unit = MINOR(dev);
3797                }
3798                if (rdev0->preferred_minor != unit) {
3799                        printk(KERN_INFO "md: unit number in %s is bad: %d\n",
3800                               bdevname(rdev0->bdev, b), rdev0->preferred_minor);
3801                        break;
3802                }
3803
3804                md_probe(dev, NULL, NULL);
3805                mddev = mddev_find(dev);
3806                if (!mddev) {
3807                        printk(KERN_ERR 
3808                                "md: cannot allocate memory for md drive.\n");
3809                        break;
3810                }
3811                if (mddev_lock(mddev)) 
3812                        printk(KERN_WARNING "md: %s locked, cannot run\n",
3813                               mdname(mddev));
3814                else if (mddev->raid_disks || mddev->major_version
3815                         || !list_empty(&mddev->disks)) {
3816                        printk(KERN_WARNING 
3817                                "md: %s already running, cannot run %s\n",
3818                                mdname(mddev), bdevname(rdev0->bdev,b));
3819                        mddev_unlock(mddev);
3820                } else {
3821                        printk(KERN_INFO "md: created %s\n", mdname(mddev));
3822                        mddev->persistent = 1;
3823                        rdev_for_each_list(rdev, tmp, candidates) {
3824                                list_del_init(&rdev->same_set);
3825                                if (bind_rdev_to_array(rdev, mddev))
3826                                        export_rdev(rdev);
3827                        }
3828                        autorun_array(mddev);
3829                        mddev_unlock(mddev);
3830                }
3831                /* on success, candidates will be empty, on error
3832                 * it won't...
3833                 */
3834                rdev_for_each_list(rdev, tmp, candidates)
3835                        export_rdev(rdev);
3836                mddev_put(mddev);
3837        }
3838        printk(KERN_INFO "md: ... autorun DONE.\n");
3839}
3840#endif /* !MODULE */
3841
3842static int get_version(void __user * arg)
3843{
3844        mdu_version_t ver;
3845
3846        ver.major = MD_MAJOR_VERSION;
3847        ver.minor = MD_MINOR_VERSION;
3848        ver.patchlevel = MD_PATCHLEVEL_VERSION;
3849
3850        if (copy_to_user(arg, &ver, sizeof(ver)))
3851                return -EFAULT;
3852
3853        return 0;
3854}
3855
3856static int get_array_info(mddev_t * mddev, void __user * arg)
3857{
3858        mdu_array_info_t info;
3859        int nr,working,active,failed,spare;
3860        mdk_rdev_t *rdev;
3861        struct list_head *tmp;
3862
3863        nr=working=active=failed=spare=0;
3864        rdev_for_each(rdev, tmp, mddev) {
3865                nr++;
3866                if (test_bit(Faulty, &rdev->flags))
3867                        failed++;
3868                else {
3869                        working++;
3870                        if (test_bit(In_sync, &rdev->flags))
3871                                active++;       
3872                        else
3873                                spare++;
3874                }
3875        }
3876
3877        info.major_version = mddev->major_version;
3878        info.minor_version = mddev->minor_version;
3879        info.patch_version = MD_PATCHLEVEL_VERSION;
3880        info.ctime         = mddev->ctime;
3881        info.level         = mddev->level;
3882        info.size          = mddev->size;
3883        if (info.size != mddev->size) /* overflow */
3884                info.size = -1;
3885        info.nr_disks      = nr;
3886        info.raid_disks    = mddev->raid_disks;
3887        info.md_minor      = mddev->md_minor;
3888        info.not_persistent= !mddev->persistent;
3889
3890        info.utime         = mddev->utime;
3891        info.state         = 0;
3892        if (mddev->in_sync)
3893                info.state = (1<<MD_SB_CLEAN);
3894        if (mddev->bitmap && mddev->bitmap_offset)
3895                info.state = (1<<MD_SB_BITMAP_PRESENT);
3896        info.active_disks  = active;
3897        info.working_disks = working;
3898        info.failed_disks  = failed;
3899        info.spare_disks   = spare;
3900
3901        info.layout        = mddev->layout;
3902        info.chunk_size    = mddev->chunk_size;
3903
3904        if (copy_to_user(arg, &info, sizeof(info)))
3905                return -EFAULT;
3906
3907        return 0;
3908}
3909
3910static int get_bitmap_file(mddev_t * mddev, void __user * arg)
3911{
3912        mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
3913        char *ptr, *buf = NULL;
3914        int err = -ENOMEM;
3915
3916        md_allow_write(mddev);
3917
3918        file = kmalloc(sizeof(*file), GFP_KERNEL);
3919        if (!file)
3920                goto out;
3921
3922        /* bitmap disabled, zero the first byte and copy out */
3923        if (!mddev->bitmap || !mddev->bitmap->file) {
3924                file->pathname[0] = '\0';
3925                goto copy_out;
3926        }
3927
3928        buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
3929        if (!buf)
3930                goto out;
3931
3932        ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
3933        if (!ptr)
3934                goto out;
3935
3936        strcpy(file->pathname, ptr);
3937
3938copy_out:
3939        err = 0;
3940        if (copy_to_user(arg, file, sizeof(*file)))
3941                err = -EFAULT;
3942out:
3943        kfree(buf);
3944        kfree(file);
3945        return err;
3946}
3947
3948static int get_disk_info(mddev_t * mddev, void __user * arg)
3949{
3950        mdu_disk_info_t info;
3951        unsigned int nr;
3952        mdk_rdev_t *rdev;
3953
3954        if (copy_from_user(&info, arg, sizeof(info)))
3955                return -EFAULT;
3956
3957        nr = info.number;
3958
3959        rdev = find_rdev_nr(mddev, nr);
3960        if (rdev) {
3961                info.major = MAJOR(rdev->bdev->bd_dev);
3962                info.minor = MINOR(rdev->bdev->bd_dev);
3963                info.raid_disk = rdev->raid_disk;
3964                info.state = 0;
3965                if (test_bit(Faulty, &rdev->flags))
3966                        info.state |= (1<<MD_DISK_FAULTY);
3967                else if (test_bit(In_sync, &rdev->flags)) {
3968                        info.state |= (1<<MD_DISK_ACTIVE);
3969                        info.state |= (1<<MD_DISK_SYNC);
3970                }
3971                if (test_bit(WriteMostly, &rdev->flags))
3972                        info.state |= (1<<MD_DISK_WRITEMOSTLY);
3973        } else {
3974                info.major = info.minor = 0;
3975                info.raid_disk = -1;
3976                info.state = (1<<MD_DISK_REMOVED);
3977        }
3978
3979        if (copy_to_user(arg, &info, sizeof(info)))
3980                return -EFAULT;
3981
3982        return 0;
3983}
3984
3985static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
3986{
3987        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3988        mdk_rdev_t *rdev;
3989        dev_t dev = MKDEV(info->major,info->minor);
3990
3991        if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
3992                return -EOVERFLOW;
3993
3994        if (!mddev->raid_disks) {
3995                int err;
3996                /* expecting a device which has a superblock */
3997                rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
3998                if (IS_ERR(rdev)) {
3999                        printk(KERN_WARNING 
4000                                "md: md_import_device returned %ld\n",
4001                                PTR_ERR(rdev));
4002                        return PTR_ERR(rdev);
4003                }
4004                if (!list_empty(&mddev->disks)) {
4005                        mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
4006                                                        mdk_rdev_t, same_set);
4007                        int err = super_types[mddev->major_version]
4008                                .load_super(rdev, rdev0, mddev->minor_version);
4009                        if (err < 0) {
4010                                printk(KERN_WARNING 
4011                                        "md: %s has different UUID to %s\n",
4012                                        bdevname(rdev->bdev,b), 
4013                                        bdevname(rdev0->bdev,b2));
4014                                export_rdev(rdev);
4015                                return -EINVAL;
4016                        }
4017                }
4018                err = bind_rdev_to_array(rdev, mddev);
4019                if (err)
4020                        export_rdev(rdev);
4021                return err;
4022        }
4023
4024        /*
4025         * add_new_disk can be used once the array is assembled
4026         * to add "hot spares".  They must already have a superblock
4027         * written
4028         */
4029        if (mddev->pers) {
4030                int err;
4031                if (!mddev->pers->hot_add_disk) {
4032                        printk(KERN_WARNING 
4033                                "%s: personality does not support diskops!\n",
4034                               mdname(mddev));
4035                        return -EINVAL;
4036                }
4037                if (mddev->persistent)
4038                        rdev = md_import_device(dev, mddev->major_version,
4039                                                mddev->minor_version);
4040                else
4041                        rdev = md_import_device(dev, -1, -1);
4042                if (IS_ERR(rdev)) {
4043                        printk(KERN_WARNING 
4044                                "md: md_import_device returned %ld\n",
4045                                PTR_ERR(rdev));
4046                        return PTR_ERR(rdev);
4047                }
4048                /* set save_raid_disk if appropriate */
4049                if (!mddev->persistent) {
4050                        if (info->state & (1<<MD_DISK_SYNC)  &&
4051                            info->raid_disk < mddev->raid_disks)
4052                                rdev->raid_disk = info->raid_disk;
4053                        else
4054                                rdev->raid_disk = -1;
4055                } else
4056                        super_types[mddev->major_version].
4057                                validate_super(mddev, rdev);
4058                rdev->saved_raid_disk = rdev->raid_disk;
4059
4060                clear_bit(In_sync, &rdev->flags); /* just to be sure */
4061                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4062                        set_bit(WriteMostly, &rdev->flags);
4063
4064                rdev->raid_disk = -1;
4065                err = bind_rdev_to_array(rdev, mddev);
4066                if (!err && !mddev->pers->hot_remove_disk) {
4067                        /* If there is hot_add_disk but no hot_remove_disk
4068                         * then added disks for geometry changes,
4069                         * and should be added immediately.
4070                         */
4071                        super_types[mddev->major_version].
4072                                validate_super(mddev, rdev);
4073                        err = mddev->pers->hot_add_disk(mddev, rdev);
4074                        if (err)
4075                                unbind_rdev_from_array(rdev);
4076                }
4077                if (err)
4078                        export_rdev(rdev);
4079
4080                md_update_sb(mddev, 1);
4081                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4082                md_wakeup_thread(mddev->thread);
4083                return err;
4084        }
4085
4086        /* otherwise, add_new_disk is only allowed
4087         * for major_version==0 superblocks
4088         */
4089        if (mddev->major_version != 0) {
4090                printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
4091                       mdname(mddev));
4092                return -EINVAL;
4093        }
4094
4095        if (!(info->state & (1<<MD_DISK_FAULTY))) {
4096                int err;
4097                rdev = md_import_device (dev, -1, 0);
4098                if (IS_ERR(rdev)) {
4099                        printk(KERN_WARNING 
4100                                "md: error, md_import_device() returned %ld\n",
4101                                PTR_ERR(rdev));
4102                        return PTR_ERR(rdev);
4103                }
4104                rdev->desc_nr = info->number;
4105                if (info->raid_disk < mddev->raid_disks)
4106                        rdev->raid_disk = info->raid_disk;
4107                else
4108                        rdev->raid_disk = -1;
4109
4110                if (rdev->raid_disk < mddev->raid_disks)
4111                        if (info->state & (1<<MD_DISK_SYNC))
4112                                set_bit(In_sync, &rdev->flags);
4113
4114                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4115                        set_bit(WriteMostly, &rdev->flags);
4116
4117                if (!mddev->persistent) {
4118                        printk(KERN_INFO "md: nonpersistent superblock ...\n");
4119                        rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
4120                } else 
4121                        rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
4122                rdev->size = calc_dev_size(rdev, mddev->chunk_size);
4123
4124                err = bind_rdev_to_array(rdev, mddev);
4125                if (err) {
4126                        export_rdev(rdev);
4127                        return err;
4128                }
4129        }
4130
4131        return 0;
4132}
4133
4134static int hot_remove_disk(mddev_t * mddev, dev_t dev)
4135{
4136        char b[BDEVNAME_SIZE];
4137        mdk_rdev_t *rdev;
4138
4139        if (!mddev->pers)
4140                return -ENODEV;
4141
4142        rdev = find_rdev(mddev, dev);
4143        if (!rdev)
4144                return -ENXIO;
4145
4146        if (rdev->raid_disk >= 0)
4147                goto busy;
4148
4149        kick_rdev_from_array(rdev);
4150        md_update_sb(mddev, 1);
4151        md_new_event(mddev);
4152
4153        return 0;
4154busy:
4155        printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n",
4156                bdevname(rdev->bdev,b), mdname(mddev));
4157        return -EBUSY;
4158}
4159
4160static int hot_add_disk(mddev_t * mddev, dev_t dev)
4161{
4162        char b[BDEVNAME_SIZE];
4163        int err;
4164        unsigned int size;
4165        mdk_rdev_t *rdev;
4166
4167        if (!mddev->pers)
4168                return -ENODEV;
4169
4170        if (mddev->major_version != 0) {
4171                printk(KERN_WARNING "%s: HOT_ADD may only be used with"
4172                        " version-0 superblocks.\n",
4173                        mdname(mddev));
4174                return -EINVAL;
4175        }
4176        if (!mddev->pers->hot_add_disk) {
4177                printk(KERN_WARNING 
4178                        "%s: personality does not support diskops!\n",
4179                        mdname(mddev));
4180                return -EINVAL;
4181        }
4182
4183        rdev = md_import_device (dev, -1, 0);
4184        if (IS_ERR(rdev)) {
4185                printk(KERN_WARNING 
4186                        "md: error, md_import_device() returned %ld\n",
4187                        PTR_ERR(rdev));
4188                return -EINVAL;
4189        }
4190
4191        if (mddev->persistent)
4192                rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
4193        else
4194                rdev->sb_offset =
4195                        rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
4196
4197        size = calc_dev_size(rdev, mddev->chunk_size);
4198        rdev->size = size;
4199
4200        if (test_bit(Faulty, &rdev->flags)) {
4201                printk(KERN_WARNING 
4202                        "md: can not hot-add faulty %s disk to %s!\n",
4203                        bdevname(rdev->bdev,b), mdname(mddev));
4204                err = -EINVAL;
4205                goto abort_export;
4206        }
4207        clear_bit(In_sync, &rdev->flags);
4208        rdev->desc_nr = -1;
4209        rdev->saved_raid_disk = -1;
4210        err = bind_rdev_to_array(rdev, mddev);
4211        if (err)
4212                goto abort_export;
4213
4214        /*
4215         * The rest should better be atomic, we can have disk failures
4216         * noticed in interrupt contexts ...
4217         */
4218
4219        if (rdev->desc_nr == mddev->max_disks) {
4220                printk(KERN_WARNING "%s: can not hot-add to full array!\n",
4221                        mdname(mddev));
4222                err = -EBUSY;
4223                goto abort_unbind_export;
4224        }
4225
4226        rdev->raid_disk = -1;
4227
4228        md_update_sb(mddev, 1);
4229
4230        /*
4231         * Kick recovery, maybe this spare has to be added to the
4232         * array immediately.
4233         */
4234        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4235        md_wakeup_thread(mddev->thread);
4236        md_new_event(mddev);
4237        return 0;
4238
4239abort_unbind_export:
4240        unbind_rdev_from_array(rdev);
4241
4242abort_export:
4243        export_rdev(rdev);
4244        return err;
4245}
4246
/*
 * set_bitmap_file - attach (fd >= 0) or detach (fd < 0) a bitmap file.
 *
 * Returns 0 on success or a negative errno:
 *   -EBUSY   the array is running and either the personality has no
 *            quiesce method or mddev->recovery / mddev->sync_thread
 *            is non-zero
 *   -EEXIST  fd >= 0 but a bitmap is already present
 *   -EBADF   fget(fd) returned NULL
 *   -ENOENT  fd < 0 but there is no bitmap to remove
 * Errors from deny_bitmap_write_access() and bitmap_create() are
 * returned unchanged.
 */
static int set_bitmap_file(mddev_t *mddev, int fd)
{
        int err;

        if (mddev->pers) {
                if (!mddev->pers->quiesce)
                        return -EBUSY;
                if (mddev->recovery || mddev->sync_thread)
                        return -EBUSY;
                /* we should be able to change the bitmap.. */
        }


        if (fd >= 0) {
                if (mddev->bitmap)
                        return -EEXIST; /* cannot add when bitmap is present */
                /* take a reference on the file; dropped with fput() on failure or removal */
                mddev->bitmap_file = fget(fd);

                if (mddev->bitmap_file == NULL) {
                        printk(KERN_ERR "%s: error: failed to get bitmap file\n",
                               mdname(mddev));
                        return -EBADF;
                }

                err = deny_bitmap_write_access(mddev->bitmap_file);
                if (err) {
                        printk(KERN_ERR "%s: error: bitmap file is already in use\n",
                               mdname(mddev));
                        fput(mddev->bitmap_file);
                        mddev->bitmap_file = NULL;
                        return err;
                }
                mddev->bitmap_offset = 0; /* file overrides offset */
        } else if (mddev->bitmap == NULL)
                return -ENOENT; /* cannot remove what isn't there */
        err = 0;
        if (mddev->pers) {
                /* create/destroy the bitmap between quiesce(1) and quiesce(0) */
                mddev->pers->quiesce(mddev, 1);
                if (fd >= 0)
                        err = bitmap_create(mddev);
                if (fd < 0 || err) {
                        bitmap_destroy(mddev);
                        fd = -1; /* make sure to put the file */
                }
                mddev->pers->quiesce(mddev, 0);
        }
        /* fd < 0 here means either a removal request or a failed create */
        if (fd < 0) {
                if (mddev->bitmap_file) {
                        restore_bitmap_write_access(mddev->bitmap_file);
                        fput(mddev->bitmap_file);
                }
                mddev->bitmap_file = NULL;
        }

        return err;
}
4303
4304/*
4305 * set_array_info is used two different ways
4306 * The original usage is when creating a new array.
4307 * In this usage, raid_disks is > 0 and it together with
4308 *  level, size, not_persistent,layout,chunksize determine the
4309 *  shape of the array.
4310 *  This will always create an array with a type-0.90.0 superblock.
4311 * The newer usage is when assembling an array.
4312 *  In this case raid_disks will be 0, and the major_version field is
4313 *  use to determine which style super-blocks are to be found on the devices.
4314 *  The minor and patch _version numbers are also kept incase the
4315 *  super_block handler wishes to interpret them.
4316 */
4317static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
4318{
4319
4320        if (info->raid_disks == 0) {
4321                /* just setting version number for superblock loading */
4322                if (info->major_version < 0 ||
4323                    info->major_version >= ARRAY_SIZE(super_types) ||
4324                    super_types[info->major_version].name == NULL) {
4325                        /* maybe try to auto-load a module? */
4326                        printk(KERN_INFO 
4327                                "md: superblock version %d not known\n",
4328                                info->major_version);
4329                        return -EINVAL;
4330                }
4331                mddev->major_version = info->major_version;
4332                mddev->minor_version = info->minor_version;
4333                mddev->patch_version = info->patch_version;
4334                mddev->persistent = !info->not_persistent;
4335                return 0;
4336        }
4337        mddev->major_version = MD_MAJOR_VERSION;
4338        mddev->minor_version = MD_MINOR_VERSION;
4339        mddev->patch_version = MD_PATCHLEVEL_VERSION;
4340        mddev->ctime         = get_seconds();
4341
4342        mddev->level         = info->level;
4343        mddev->clevel[0]     = 0;
4344        mddev->size          = info->size;
4345        mddev->raid_disks    = info->raid_disks;
4346        /* don't set md_minor, it is determined by which /dev/md* was
4347         * openned
4348         */
4349        if (info->state & (1<<MD_SB_CLEAN))
4350                mddev->recovery_cp = MaxSector;
4351        else
4352                mddev->recovery_cp = 0;
4353        mddev->persistent    = ! info->not_persistent;
4354        mddev->external      = 0;
4355
4356        mddev->layout        = info->layout;
4357        mddev->chunk_size    = info->chunk_size;
4358
4359        mddev->max_disks     = MD_SB_DISKS;
4360
4361        if (mddev->persistent)
4362                mddev->flags         = 0;
4363        set_bit(MD_CHANGE_DEVS, &mddev->flags);
4364
4365        mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
4366        mddev->bitmap_offset = 0;
4367
4368        mddev->reshape_position = MaxSector;
4369
4370        /*
4371         * Generate a 128 bit UUID
4372         */
4373        get_random_bytes(mddev->uuid, 16);
4374
4375        mddev->new_level = mddev->level;
4376        mddev->new_chunk = mddev->chunk_size;
4377        mddev->new_layout = mddev->layout;
4378        mddev->delta_disks = 0;
4379
4380        return 0;
4381}
4382
4383static int update_size(mddev_t *mddev, unsigned long size)
4384{
4385        mdk_rdev_t * rdev;
4386        int rv;
4387        struct list_head *tmp;
4388        int fit = (size == 0);
4389
4390        if (mddev->pers->resize == NULL)
4391                return -EINVAL;
4392        /* The "size" is the amount of each device that is used.
4393         * This can only make sense for arrays with redundancy.
4394         * linear and raid0 always use whatever space is available
4395         * We can only consider changing the size if no resync
4396         * or reconstruction is happening, and if the new size
4397         * is acceptable. It must fit before the sb_offset or,
4398         * if that is <data_offset, it must fit before the
4399         * size of each device.
4400         * If size is zero, we find the largest size that fits.
4401         */
4402        if (mddev->sync_thread)
4403                return -EBUSY;
4404        rdev_for_each(rdev, tmp, mddev) {
4405                sector_t avail;
4406                avail = rdev->size * 2;
4407
4408                if (fit && (size == 0 || size > avail/2))
4409                        size = avail/2;
4410                if (avail < ((sector_t)size << 1))
4411                        return -ENOSPC;
4412        }
4413        rv = mddev->pers->resize(mddev, (sector_t)size *2);
4414        if (!rv) {
4415                struct block_device *bdev;
4416
4417                bdev = bdget_disk(mddev->gendisk, 0);
4418                if (bdev) {
4419                        mutex_lock(&bdev->bd_inode->i_mutex);
4420                        i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
4421                        mutex_unlock(&bdev->bd_inode->i_mutex);
4422                        bdput(bdev);
4423                }
4424        }
4425        return rv;
4426}
4427
4428static int update_raid_disks(mddev_t *mddev, int raid_disks)
4429{
4430        int rv;
4431        /* change the number of raid disks */
4432        if (mddev->pers->check_reshape == NULL)
4433                return -EINVAL;
4434        if (raid_disks <= 0 ||
4435            raid_disks >= mddev->max_disks)
4436                return -EINVAL;
4437        if (mddev->sync_thread || mddev->reshape_position != MaxSector)
4438                return -EBUSY;
4439        mddev->delta_disks = raid_disks - mddev->raid_disks;
4440
4441        rv = mddev->pers->check_reshape(mddev);
4442        return rv;
4443}
4444
4445
/*
 * update_array_info is used to change the configuration of an
 * on-line array.
 * The version, ctime, level, size, raid_disks, not_persistent, layout and
 * chunk_size fields in the info are checked against the array.
 * Any differences that cannot be handled will cause an error.
 * Normally, only one change can be managed at a time.
 *
 * Returns 0 when nothing changed or the change succeeded, -EINVAL for
 * an unsupported or multiple change, and otherwise the error produced
 * by the individual update step.
 */
static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
{
        int rv = 0;
        int cnt = 0;    /* number of distinct changes requested */
        int state = 0;  /* state bits the array is expected to report */

        /* calculate expected state,ignoring low bits */
        if (mddev->bitmap && mddev->bitmap_offset)
                state |= (1 << MD_SB_BITMAP_PRESENT);

        /* these fields must match exactly; patch_version and layout are exempt */
        if (mddev->major_version != info->major_version ||
            mddev->minor_version != info->minor_version ||
/*          mddev->patch_version != info->patch_version || */
            mddev->ctime         != info->ctime         ||
            mddev->level         != info->level         ||
/*          mddev->layout        != info->layout        || */
            !mddev->persistent   != info->not_persistent||
            mddev->chunk_size    != info->chunk_size    ||
            /*
             * The 0xfffffe00 mask ignores the low nine bits of state:
             * per the original note, the bottom 8 bits, plus
             * SB_BITMAP_PRESENT which is allowed to change.
             */
            ((state^info->state) & 0xfffffe00)
                )
                return -EINVAL;
        /* Check there is only one change */
        if (info->size >= 0 && mddev->size != info->size) cnt++;
        if (mddev->raid_disks != info->raid_disks) cnt++;
        if (mddev->layout != info->layout) cnt++;
        if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
        if (cnt == 0) return 0;
        if (cnt > 1) return -EINVAL;

        if (mddev->layout != info->layout) {
                /* Change layout
                 * we don't need to do anything at the md level, the
                 * personality will take care of it all.
                 */
                if (mddev->pers->reconfig == NULL)
                        return -EINVAL;
                else
                        return mddev->pers->reconfig(mddev, info->layout, -1);
        }
        /* a negative info->size means "leave the size alone" */
        if (info->size >= 0 && mddev->size != info->size)
                rv = update_size(mddev, info->size);

        if (mddev->raid_disks    != info->raid_disks)
                rv = update_raid_disks(mddev, info->raid_disks);

        if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
                if (mddev->pers->quiesce == NULL)
                        return -EINVAL;
                if (mddev->recovery || mddev->sync_thread)
                        return -EBUSY;
                if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
                        /* add the bitmap */
                        if (mddev->bitmap)
                                return -EEXIST;
                        if (mddev->default_bitmap_offset == 0)
                                return -EINVAL;
                        mddev->bitmap_offset = mddev->default_bitmap_offset;
                        /* create the bitmap between quiesce(1) and quiesce(0) */
                        mddev->pers->quiesce(mddev, 1);
                        rv = bitmap_create(mddev);
                        if (rv)
                                bitmap_destroy(mddev);
                        mddev->pers->quiesce(mddev, 0);
                } else {
                        /* remove the bitmap */
                        if (!mddev->bitmap)
                                return -ENOENT;
                        /* a bitmap that has a file attached is refused here */
                        if (mddev->bitmap->file)
                                return -EINVAL;
                        mddev->pers->quiesce(mddev, 1);
                        bitmap_destroy(mddev);
                        mddev->pers->quiesce(mddev, 0);
                        mddev->bitmap_offset = 0;
                }
        }
        /* reached for size, raid_disks and bitmap changes, even when rv != 0 */
        md_update_sb(mddev, 1);
        return rv;
}
4532
4533static int set_disk_faulty(mddev_t *mddev, dev_t dev)
4534{
4535        mdk_rdev_t *rdev;
4536
4537        if (mddev->pers == NULL)
4538                return -ENODEV;
4539
4540        rdev = find_rdev(mddev, dev);
4541        if (!rdev)
4542                return -ENODEV;
4543
4544        md_error(mddev, rdev);
4545        return 0;
4546}
4547
4548static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
4549{
4550        mddev_t *mddev = bdev->bd_disk->private_data;
4551
4552        geo->heads = 2;
4553        geo->sectors = 4;
4554        geo->cylinders = get_capacity(mddev->gendisk) / 8;
4555        return 0;
4556}
4557
/*
 * md_ioctl - ioctl entry point for md block devices.
 *
 * Requires CAP_SYS_ADMIN (-EACCES otherwise).  Commands are handled in
 * stages:
 *   1. driver-wide commands that need no array and take no lock;
 *   2. SET_ARRAY_INFO, handled under the mddev lock;
 *   3. commands permitted on a read-only array;
 *   4. the remaining commands, refused with -EROFS on a read-only array
 *      (an array with ro == 2 is switched to read-write first).
 *
 * Returns 0 or a negative errno from the selected handler; an unknown
 * command in the last stage yields -EINVAL.
 */
static int md_ioctl(struct inode *inode, struct file *file,
                        unsigned int cmd, unsigned long arg)
{
        int err = 0;
        void __user *argp = (void __user *)arg;
        mddev_t *mddev = NULL;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        /*
         * Commands dealing with the RAID driver but not any
         * particular array:
         */
        switch (cmd)
        {
                case RAID_VERSION:
                        err = get_version(argp);
                        goto done;

                case PRINT_RAID_DEBUG:
                        err = 0;
                        md_print_devices();
                        goto done;

#ifndef MODULE
                case RAID_AUTORUN:
                        err = 0;
                        autostart_arrays(arg);
                        goto done;
#endif
                default:;
        }

        /*
         * Commands creating/starting a new array:
         */

        mddev = inode->i_bdev->bd_disk->private_data;

        if (!mddev) {
                BUG();
                goto abort;
        }

        /* everything from here to the unlock labels runs with the mddev locked */
        err = mddev_lock(mddev);
        if (err) {
                printk(KERN_INFO 
                        "md: ioctl lock interrupted, reason %d, cmd %d\n",
                        err, cmd);
                goto abort;
        }

        switch (cmd)
        {
                case SET_ARRAY_INFO:
                        {
                                mdu_array_info_t info;
                                /* a zero arg is treated as an all-zero info block */
                                if (!arg)
                                        memset(&info, 0, sizeof(info));
                                else if (copy_from_user(&info, argp, sizeof(info))) {
                                        err = -EFAULT;
                                        goto abort_unlock;
                                }
                                /* running array: this is a reconfiguration request */
                                if (mddev->pers) {
                                        err = update_array_info(mddev, &info);
                                        if (err) {
                                                printk(KERN_WARNING "md: couldn't update"
                                                       " array info. %d\n", err);
                                                goto abort_unlock;
                                        }
                                        goto done_unlock;
                                }
                                if (!list_empty(&mddev->disks)) {
                                        printk(KERN_WARNING
                                               "md: array %s already has disks!\n",
                                               mdname(mddev));
                                        err = -EBUSY;
                                        goto abort_unlock;
                                }
                                if (mddev->raid_disks) {
                                        printk(KERN_WARNING
                                               "md: array %s already initialised!\n",
                                               mdname(mddev));
                                        err = -EBUSY;
                                        goto abort_unlock;
                                }
                                err = set_array_info(mddev, &info);
                                if (err) {
                                        printk(KERN_WARNING "md: couldn't set"
                                               " array info. %d\n", err);
                                        goto abort_unlock;
                                }
                        }
                        goto done_unlock;

                default:;
        }

        /*
         * Commands querying/configuring an existing array:
         */
        /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
         * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
        if ((!mddev->raid_disks && !mddev->external)
            && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
            && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
            && cmd != GET_BITMAP_FILE) {
                err = -ENODEV;
                goto abort_unlock;
        }

        /*
         * Commands even a read-only array can execute:
         */
        switch (cmd)
        {
                case GET_ARRAY_INFO:
                        err = get_array_info(mddev, argp);
                        goto done_unlock;

                case GET_BITMAP_FILE:
                        err = get_bitmap_file(mddev, argp);
                        goto done_unlock;

                case GET_DISK_INFO:
                        err = get_disk_info(mddev, argp);
                        goto done_unlock;

                case RESTART_ARRAY_RW:
                        err = restart_array(mddev);
                        goto done_unlock;

                case STOP_ARRAY:
                        err = do_md_stop (mddev, 0);
                        goto done_unlock;

                case STOP_ARRAY_RO:
                        err = do_md_stop (mddev, 1);
                        goto done_unlock;

        /*
         * We have a problem here : there is no easy way to give a CHS
         * virtual geometry. We currently pretend that we have a 2 heads
         * 4 sectors (with a BIG number of cylinders...). This drives
         * dosfs just mad... ;-)
         * (No geometry command is handled in this switch; the values
         * described are the ones md_getgeo() fills in.)
         */
        }

        /*
         * The remaining ioctls are changing the state of the
         * superblock, so we do not allow them on read-only arrays.
         * However non-MD ioctls (e.g. get-size) will still come through
         * here and hit the 'default' below, so only disallow
         * 'md' ioctls, and switch to rw mode if started auto-readonly.
         */
        if (_IOC_TYPE(cmd) == MD_MAJOR &&
            mddev->ro && mddev->pers) {
                if (mddev->ro == 2) {
                        /* ro == 2: go read-write and kick the md thread */
                        mddev->ro = 0;
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                md_wakeup_thread(mddev->thread);

                } else {
                        err = -EROFS;
                        goto abort_unlock;
                }
        }

        switch (cmd)
        {
                case ADD_NEW_DISK:
                {
                        mdu_disk_info_t info;
                        if (copy_from_user(&info, argp, sizeof(info)))
                                err = -EFAULT;
                        else
                                err = add_new_disk(mddev, &info);
                        goto done_unlock;
                }

                case HOT_REMOVE_DISK:
                        err = hot_remove_disk(mddev, new_decode_dev(arg));
                        goto done_unlock;

                case HOT_ADD_DISK:
                        err = hot_add_disk(mddev, new_decode_dev(arg));
                        goto done_unlock;

                case SET_DISK_FAULTY:
                        err = set_disk_faulty(mddev, new_decode_dev(arg));
                        goto done_unlock;

                case RUN_ARRAY:
                        err = do_md_run (mddev);
                        goto done_unlock;

                case SET_BITMAP_FILE:
                        err = set_bitmap_file(mddev, (int)arg);
                        goto done_unlock;

                default:
                        err = -EINVAL;
                        goto abort_unlock;
        }

        /* both labels share one exit: drop the lock and return err */
done_unlock:
abort_unlock:
        mddev_unlock(mddev);

        return err;
        /* exit for the lock-free driver-wide commands; an error there trips MD_BUG() */
done:
        if (err)
                MD_BUG();
abort:
        return err;
}
4775
4776static int md_open(struct inode *inode, struct file *file)
4777{
4778        /*
4779         * Succeed if we can lock the mddev, which confirms that
4780         * it isn't being stopped right now.
4781         */
4782        mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
4783        int err;
4784
4785        if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
4786                goto out;
4787
4788        err = 0;
4789        mddev_get(mddev);
4790        mddev_unlock(mddev);
4791
4792        check_disk_change(inode->i_bdev);
4793 out:
4794        return err;
4795}
4796
4797static int md_release(struct inode *inode, struct file * file)
4798{
4799        mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
4800
4801        BUG_ON(!mddev);
4802        mddev_put(mddev);
4803
4804        return 0;
4805}
4806
4807static int md_media_changed(struct gendisk *disk)
4808{
4809        mddev_t *mddev = disk->private_data;
4810
4811        return mddev->changed;
4812}
4813
4814static int md_revalidate(struct gendisk *disk)
4815{
4816        mddev_t *mddev = disk->private_data;
4817
4818        mddev->changed = 0;
4819        return 0;
4820}
4821static struct block_device_operations md_fops =
4822{
4823        .owner          = THIS_MODULE,
4824        .open           = md_open,
4825        .release        = md_release,
4826        .ioctl          = md_ioctl,
4827        .getgeo         = md_getgeo,
4828        .media_changed  = md_media_changed,
4829        .revalidate_disk= md_revalidate,
4830};
4831
/*
 * md_thread - main loop of an md kernel thread.
 *
 * @arg is the mdk_thread_t describing this thread.  Until
 * kthread_should_stop() is true, the loop sleeps on thread->wqueue
 * (bounded by thread->timeout) and then calls thread->run(thread->mddev).
 * Always returns 0.
 */
static int md_thread(void * arg)
{
        mdk_thread_t *thread = arg;

        /*
         * md_thread is a 'system-thread', it's priority should be very
         * high. We avoid resource deadlocks individually in each
         * raid personality. (RAID5 does preallocation) We also use RR and
         * the very same RT priority as kswapd, thus we will never get
         * into a priority inversion deadlock.
         *
         * we definitely have to have equal or higher priority than
         * bdflush, otherwise bdflush will deadlock if there are too
         * many dirty RAID5 blocks.
         */

        allow_signal(SIGKILL);
        while (!kthread_should_stop()) {

                /* We need to wait INTERRUPTIBLE so that
                 * we don't add to the load-average.
                 * That means we need to be sure no signals are
                 * pending
                 */
                if (signal_pending(current))
                        flush_signals(current);

                /* the return value is ignored: wakeup, stop and timeout all fall through to run() */
                wait_event_interruptible_timeout
                        (thread->wqueue,
                         test_bit(THREAD_WAKEUP, &thread->flags)
                         || kthread_should_stop(),
                         thread->timeout);

                /* clear the flag before running so a wakeup set during run() is kept */
                clear_bit(THREAD_WAKEUP, &thread->flags);

                thread->run(thread->mddev);
        }

        return 0;
}
4872
4873void md_wakeup_thread(mdk_thread_t *thread)
4874{
4875        if (thread) {
4876                dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
4877                set_bit(THREAD_WAKEUP, &thread->flags);
4878                wake_up(&thread->wqueue);
4879        }
4880}
4881
4882mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
4883                                 const char *name)
4884{
4885        mdk_thread_t *thread;
4886
4887        thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
4888        if (!thread)
4889                return NULL;
4890
4891        init_waitqueue_head(&thread->wqueue);
4892
4893        thread->run = run;
4894        thread->mddev = mddev;
4895        thread->timeout = MAX_SCHEDULE_TIMEOUT;
4896        thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
4897        if (IS_ERR(thread->tsk)) {
4898                kfree(thread);
4899                return NULL;
4900        }
4901        return thread;
4902}
4903
4904void md_unregister_thread(mdk_thread_t *thread)
4905{
4906        dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
4907
4908        kthread_stop(thread->tsk);
4909        kfree(thread);
4910}
4911
4912void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
4913{
4914        if (!mddev) {
4915                MD_BUG();
4916                return;
4917        }
4918
4919        if (!rdev || test_bit(Faulty, &rdev->flags))
4920                return;
4921/*
4922        dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
4923                mdname(mddev),
4924                MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
4925                __builtin_return_address(0),__builtin_return_address(1),
4926                __builtin_return_address(2),__builtin_return_address(3));
4927*/
4928        if (!mddev->pers)
4929                return;
4930        if (!mddev->pers->error_handler)
4931                return;
4932        mddev->pers->error_handler(mddev,rdev);
4933        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4934        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4935        md_wakeup_thread(mddev->thread);
4936        md_new_event_inintr(mddev);
4937}
4938
/* seq_file implementation for /proc/mdstat */
4940
4941static void status_unused(struct seq_file *seq)
4942{
4943        int i = 0;
4944        mdk_rdev_t *rdev;
4945        struct list_head *tmp;
4946
4947        seq_printf(seq, "unused devices: ");
4948
4949        rdev_for_each_list(rdev, tmp, pending_raid_disks) {
4950                char b[BDEVNAME_SIZE];
4951                i++;
4952                seq_printf(seq, "%s ",
4953                              bdevname(rdev->bdev,b));
4954        }
4955        if (!i)
4956                seq_printf(seq, "<none>");
4957
4958        seq_printf(seq, "\n");
4959}
4960
4961
/*
 * status_resync - print the progress bar, percentage, estimated finish
 * time and speed of the operation currently recorded in mddev->recovery.
 *
 * Prints nothing (after MD_BUG()) if the total to process is zero.
 */
static void status_resync(struct seq_file *seq, mddev_t * mddev)
{
        sector_t max_blocks, resync, res;
        unsigned long dt, db, rt;
        int scale;
        unsigned int per_milli;

        /* progress so far, excluding what is still counted in recovery_active; halved */
        resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;

        /* total to process: halved resync_max_sectors for a SYNC, else mddev->size */
        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
                max_blocks = mddev->resync_max_sectors >> 1;
        else
                max_blocks = mddev->size;

        /*
         * Should not happen.
         */
        if (!max_blocks) {
                MD_BUG();
                return;
        }
        /* Pick 'scale' such that (resync>>scale)*1000 will fit
         * in a sector_t, and (max_blocks>>scale) will fit in a
         * u32, as those are the requirements for sector_div.
         * Thus 'scale' must be at least 10
         */
        scale = 10;
        if (sizeof(sector_t) > sizeof(unsigned long)) {
                while ( max_blocks/2 > (1ULL<<(scale+32)))
                        scale++;
        }
        /* res becomes the progress in thousandths; the +1 keeps the divisor non-zero */
        res = (resync>>scale)*1000;
        sector_div(res, (u32)((max_blocks>>scale)+1));

        per_milli = res;
        {
                /* 20-column bar: one '=' per 50 thousandths, '>', then '.' padding */
                int i, x = per_milli/50, y = 20-x;
                seq_printf(seq, "[");
                for (i = 0; i < x; i++)
                        seq_printf(seq, "=");
                seq_printf(seq, ">");
                for (i = 0; i < y; i++)
                        seq_printf(seq, ".");
                seq_printf(seq, "] ");
        }
        /* label priority: reshape, then check, then resync, else recovery */
        seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
                   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
                    "reshape" :
                    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
                     "check" :
                     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
                      "resync" : "recovery"))),
                   per_milli/10, per_milli % 10,
                   (unsigned long long) resync,
                   (unsigned long long) max_blocks);

        /*
         * We do not want to overflow, so the order of operands and
         * the * 100 / 100 trick are important. We do a +1 to be
         * safe against division by zero. We only estimate anyway.
         *
         * dt: time from mark until now
         * db: blocks written from mark until now
         * rt: remaining time
         */
        dt = ((jiffies - mddev->resync_mark) / HZ);
        if (!dt) dt++;
        db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
                - mddev->resync_mark_cnt;
        rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;

        /* rt is in seconds: whole minutes, then tenths of a minute */
        seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);

        seq_printf(seq, " speed=%ldK/sec", db/2/dt);
}
5037
5038static void *md_seq_start(struct seq_file *seq, loff_t *pos)
5039{
5040        struct list_head *tmp;
5041        loff_t l = *pos;
5042        mddev_t *mddev;
5043
5044        if (l >= 0x10000)
5045                return NULL;
5046        if (!l--)
5047                /* header */
5048                return (void*)1;
5049
5050        spin_lock(&all_mddevs_lock);
5051        list_for_each(tmp,&all_mddevs)
5052                if (!l--) {
5053                        mddev = list_entry(tmp, mddev_t, all_mddevs);
5054                        mddev_get(mddev);
5055                        spin_unlock(&all_mddevs_lock);
5056                        return mddev;
5057                }
5058        spin_unlock(&all_mddevs_lock);
5059        if (!l--)
5060                return (void*)2;/* tail */
5061        return NULL;
5062}
5063
5064static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
5065{
5066        struct list_head *tmp;
5067        mddev_t *next_mddev, *mddev = v;
5068        
5069        ++*pos;
5070        if (v == (void*)2)
5071                return NULL;
5072
5073        spin_lock(&all_mddevs_lock);
5074        if (v == (void*)1)
5075                tmp = all_mddevs.next;
5076        else
5077                tmp = mddev->all_mddevs.next;
5078        if (tmp != &all_mddevs)
5079                next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
5080        else {
5081                next_mddev = (void*)2;
5082                *pos = 0x10000;
5083        }               
5084        spin_unlock(&all_mddevs_lock);
5085
5086        if (v != (void*)1)
5087                mddev_put(mddev);
5088        return next_mddev;
5089
5090}
5091
5092static void md_seq_stop(struct seq_file *seq, void *v)
5093{
5094        mddev_t *mddev = v;
5095
5096        if (mddev && v != (void*)1 && v != (void*)2)
5097                mddev_put(mddev);
5098}
5099
/* Per-open state of the md status file, stored in seq_file->private. */
struct mdstat_info {
        /* value of md_event_count last sampled for this open file */
        int event;
};
5103
5104static int md_seq_show(struct seq_file *seq, void *v)
5105{
5106        mddev_t *mddev = v;
5107        sector_t size;
5108        struct list_head *tmp2;
5109        mdk_rdev_t *rdev;
5110        struct mdstat_info *mi = seq->private;
5111        struct bitmap *bitmap;
5112
5113        if (v == (void*)1) {
5114                struct mdk_personality *pers;
5115                seq_printf(seq, "Personalities : ");
5116                spin_lock(&pers_lock);
5117                list_for_each_entry(pers, &pers_list, list)
5118                        seq_printf(seq, "[%s] ", pers->name);
5119
5120                spin_unlock(&pers_lock);
5121                seq_printf(seq, "\n");
5122                mi->event = atomic_read(&md_event_count);
5123                return 0;
5124        }
5125        if (v == (void*)2) {
5126                status_unused(seq);
5127                return 0;
5128        }
5129
5130        if (mddev_lock(mddev) < 0)
5131                return -EINTR;
5132
5133        if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
5134                seq_printf(seq, "%s : %sactive", mdname(mddev),
5135                                                mddev->pers ? "" : "in");
5136                if (mddev->pers) {
5137                        if (mddev->ro==1)
5138                                seq_printf(seq, " (read-only)");
5139                        if (mddev->ro==2)
5140                                seq_printf(seq, " (auto-read-only)");
5141                        seq_printf(seq, " %s", mddev->pers->name);
5142                }
5143
5144                size = 0;
5145                rdev_for_each(rdev, tmp2, mddev) {
5146                        char b[BDEVNAME_SIZE];
5147                        seq_printf(seq, " %s[%d]",
5148                                bdevname(rdev->bdev,b), rdev->desc_nr);
5149                        if (test_bit(WriteMostly, &rdev->flags))
5150                                seq_printf(seq, "(W)");
5151                        if (test_bit(Faulty, &rdev->flags)) {
5152                                seq_printf(seq, "(F)");
5153                                continue;
5154                        } else if (rdev->raid_disk < 0)
5155                                seq_printf(seq, "(S)"); /* spare */
5156                        size += rdev->size;
5157                }
5158
5159                if (!list_empty(&mddev->disks)) {
5160                        if (mddev->pers)
5161                                seq_printf(seq, "\n      %llu blocks",
5162                                        (unsigned long long)mddev->array_size);
5163                        else
5164                                seq_printf(seq, "\n      %llu blocks",
5165                                        (unsigned long long)size);
5166                }
5167                if (mddev->persistent) {
5168                        if (mddev->major_version != 0 ||
5169                            mddev->minor_version != 90) {
5170                                seq_printf(seq," super %d.%d",
5171                                           mddev->major_version,
5172                                           mddev->minor_version);
5173                        }
5174                } else if (mddev->external)
5175                        seq_printf(seq, " super external:%s",
5176                                   mddev->metadata_type);
5177                else
5178                        seq_printf(seq, " super non-persistent");
5179
5180                if (mddev->pers) {
5181                        mddev->pers->status (seq, mddev);
5182                        seq_printf(seq, "\n      ");
5183                        if (mddev->pers->sync_request) {
5184                                if (mddev->curr_resync > 2) {
5185                                        status_resync (seq, mddev);
5186                                        seq_printf(seq, "\n      ");
5187                                } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
5188                                        seq_printf(seq, "\tresync=DELAYED\n      ");
5189                                else if (mddev->recovery_cp < MaxSector)
5190                                        seq_printf(seq, "\tresync=PENDING\n      ");
5191                        }
5192                } else
5193                        seq_printf(seq, "\n       ");
5194
5195                if ((bitmap = mddev->bitmap)) {
5196                        unsigned long chunk_kb;
5197                        unsigned long flags;
5198                        spin_lock_irqsave(&bitmap->lock, flags);
5199                        chunk_kb = bitmap->chunksize >> 10;
5200                        seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
5201                                "%lu%s chunk",
5202                                bitmap->pages - bitmap->missing_pages,
5203                                bitmap->pages,
5204                                (bitmap->pages - bitmap->missing_pages)
5205                                        << (PAGE_SHIFT - 10),
5206                                chunk_kb ? chunk_kb : bitmap->chunksize,
5207                                chunk_kb ? "KB" : "B");
5208                        if (bitmap->file) {
5209                                seq_printf(seq, ", file: ");
5210                                seq_path(seq, &bitmap->file->f_path, " \t\n");
5211                        }
5212
5213                        seq_printf(seq, "\n");
5214                        spin_unlock_irqrestore(&bitmap->lock, flags);
5215                }
5216
5217                seq_printf(seq, "\n");
5218        }
5219        mddev_unlock(mddev);
5220        
5221        return 0;
5222}
5223
5224static struct seq_operations md_seq_ops = {
5225        .start  = md_seq_start,
5226        .next   = md_seq_next,
5227        .stop   = md_seq_stop,
5228        .show   = md_seq_show,
5229};
5230
5231static int md_seq_open(struct inode *inode, struct file *file)
5232{
5233        int error;
5234        struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
5235        if (mi == NULL)
5236                return -ENOMEM;
5237
5238        error = seq_open(file, &md_seq_ops);
5239        if (error)
5240                kfree(mi);
5241        else {
5242                struct seq_file *p = file->private_data;
5243                p->private = mi;
5244                mi->event = atomic_read(&md_event_count);
5245        }
5246        return error;
5247}
5248
5249static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
5250{
5251        struct seq_file *m = filp->private_data;
5252        struct mdstat_info *mi = m->private;
5253        int mask;
5254
5255        poll_wait(filp, &md_event_waiters, wait);
5256
5257        /* always allow read */
5258        mask = POLLIN | POLLRDNORM;
5259
5260        if (mi->event != atomic_read(&md_event_count))
5261                mask |= POLLERR | POLLPRI;
5262        return mask;
5263}
5264
5265static const struct file_operations md_seq_fops = {
5266        .owner          = THIS_MODULE,
5267        .open           = md_seq_open,
5268        .read           = seq_read,
5269        .llseek         = seq_lseek,
5270        .release        = seq_release_private,
5271        .poll           = mdstat_poll,
5272};
5273
5274int register_md_personality(struct mdk_personality *p)
5275{
5276        spin_lock(&pers_lock);
5277        list_add_tail(&p->list, &pers_list);
5278        printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
5279        spin_unlock(&pers_lock);
5280        return 0;
5281}
5282
5283int unregister_md_personality(struct mdk_personality *p)
5284{
5285        printk(KERN_INFO "md: %s personality unregistered\n", p->name);
5286        spin_lock(&pers_lock);
5287        list_del_init(&p->list);
5288        spin_unlock(&pers_lock);
5289        return 0;
5290}
5291
5292static int is_mddev_idle(mddev_t *mddev)
5293{
5294        mdk_rdev_t * rdev;
5295        struct list_head *tmp;
5296        int idle;
5297        long curr_events;
5298
5299        idle = 1;
5300        rdev_for_each(rdev, tmp, mddev) {
5301                struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
5302                curr_events = disk_stat_read(disk, sectors[0]) + 
5303                                disk_stat_read(disk, sectors[1]) - 
5304                                atomic_read(&disk->sync_io);
5305                /* sync IO will cause sync_io to increase before the disk_stats
5306                 * as sync_io is counted when a request starts, and
5307                 * disk_stats is counted when it completes.
5308                 * So resync activity will cause curr_events to be smaller than
5309                 * when there was no such activity.
5310                 * non-sync IO will cause disk_stat to increase without
5311                 * increasing sync_io so curr_events will (eventually)
5312                 * be larger than it was before.  Once it becomes
5313                 * substantially larger, the test below will cause
5314                 * the array to appear non-idle, and resync will slow
5315                 * down.
5316                 * If there is a lot of outstanding resync activity when
5317                 * we set last_event to curr_events, then all that activity
5318                 * completing might cause the array to appear non-idle
5319                 * and resync will be slowed down even though there might
5320                 * not have been non-resync activity.  This will only
5321                 * happen once though.  'last_events' will soon reflect
5322                 * the state where there is little or no outstanding
5323                 * resync requests, and further resync activity will
5324                 * always make curr_events less than last_events.
5325                 *
5326                 */
5327                if (curr_events - rdev->last_events > 4096) {
5328                        rdev->last_events = curr_events;
5329                        idle = 0;
5330                }
5331        }
5332        return idle;
5333}
5334
5335void md_done_sync(mddev_t *mddev, int blocks, int ok)
5336{
5337        /* another "blocks" (512byte) blocks have been synced */
5338        atomic_sub(blocks, &mddev->recovery_active);
5339        wake_up(&mddev->recovery_wait);
5340        if (!ok) {
5341                set_bit(MD_RECOVERY_ERR, &mddev->recovery);
5342                md_wakeup_thread(mddev->thread);
5343                // stop recovery, signal do_sync ....
5344        }
5345}
5346
5347
5348/* md_write_start(mddev, bi)
5349 * If we need to update some array metadata (e.g. 'active' flag
5350 * in superblock) before writing, schedule a superblock update
5351 * and wait for it to complete.
5352 */
5353void md_write_start(mddev_t *mddev, struct bio *bi)
5354{
5355        if (bio_data_dir(bi) != WRITE)
5356                return;
5357
5358        BUG_ON(mddev->ro == 1);
5359        if (mddev->ro == 2) {
5360                /* need to switch to read/write */
5361                mddev->ro = 0;
5362                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5363                md_wakeup_thread(mddev->thread);
5364                md_wakeup_thread(mddev->sync_thread);
5365        }
5366        atomic_inc(&mddev->writes_pending);
5367        if (mddev->in_sync) {
5368                spin_lock_irq(&mddev->write_lock);
5369                if (mddev->in_sync) {
5370                        mddev->in_sync = 0;
5371                        set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5372                        md_wakeup_thread(mddev->thread);
5373                }
5374                spin_unlock_irq(&mddev->write_lock);
5375        }
5376        wait_event(mddev->sb_wait, mddev->flags==0);
5377}
5378
5379void md_write_end(mddev_t *mddev)
5380{
5381        if (atomic_dec_and_test(&mddev->writes_pending)) {
5382                if (mddev->safemode == 2)
5383                        md_wakeup_thread(mddev->thread);
5384                else if (mddev->safemode_delay)
5385                        mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
5386        }
5387}
5388
5389/* md_allow_write(mddev)
5390 * Calling this ensures that the array is marked 'active' so that writes
5391 * may proceed without blocking.  It is important to call this before
5392 * attempting a GFP_KERNEL allocation while holding the mddev lock.
5393 * Must be called with mddev_lock held.
5394 */
5395void md_allow_write(mddev_t *mddev)
5396{
5397        if (!mddev->pers)
5398                return;
5399        if (mddev->ro)
5400                return;
5401
5402        spin_lock_irq(&mddev->write_lock);
5403        if (mddev->in_sync) {
5404                mddev->in_sync = 0;
5405                set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5406                if (mddev->safemode_delay &&
5407                    mddev->safemode == 0)
5408                        mddev->safemode = 1;
5409                spin_unlock_irq(&mddev->write_lock);
5410                md_update_sb(mddev, 0);
5411        } else
5412                spin_unlock_irq(&mddev->write_lock);
5413}
5414EXPORT_SYMBOL_GPL(md_allow_write);
5415
5416static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
5417
5418#define SYNC_MARKS      10
5419#define SYNC_MARK_STEP  (3*HZ)
5420void md_do_sync(mddev_t *mddev)
5421{
5422        mddev_t *mddev2;
5423        unsigned int currspeed = 0,
5424                 window;
5425        sector_t max_sectors,j, io_sectors;
5426        unsigned long mark[SYNC_MARKS];
5427        sector_t mark_cnt[SYNC_MARKS];
5428        int last_mark,m;
5429        struct list_head *tmp;
5430        sector_t last_check;
5431        int skipped = 0;
5432        struct list_head *rtmp;
5433        mdk_rdev_t *rdev;
5434        char *desc;
5435
5436        /* just incase thread restarts... */
5437        if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
5438                return;
5439        if (mddev->ro) /* never try to sync a read-only array */
5440                return;
5441
5442        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5443                if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
5444                        desc = "data-check";
5445                else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5446                        desc = "requested-resync";
5447                else
5448                        desc = "resync";
5449        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5450                desc = "reshape";
5451        else
5452                desc = "recovery";
5453
5454        /* we overload curr_resync somewhat here.
5455         * 0 == not engaged in resync at all
5456         * 2 == checking that there is no conflict with another sync
5457         * 1 == like 2, but have yielded to allow conflicting resync to
5458         *              commense
5459         * other == active in resync - this many blocks
5460         *
5461         * Before starting a resync we must have set curr_resync to
5462         * 2, and then checked that every "conflicting" array has curr_resync
5463         * less than ours.  When we find one that is the same or higher
5464         * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
5465         * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
5466         * This will mean we have to start checking from the beginning again.
5467         *
5468         */
5469
5470        do {
5471                mddev->curr_resync = 2;
5472
5473        try_again:
5474                if (kthread_should_stop()) {
5475                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5476                        goto skip;
5477                }
5478                for_each_mddev(mddev2, tmp) {
5479                        if (mddev2 == mddev)
5480                                continue;
5481                        if (mddev2->curr_resync && 
5482                            match_mddev_units(mddev,mddev2)) {
5483                                DEFINE_WAIT(wq);
5484                                if (mddev < mddev2 && mddev->curr_resync == 2) {
5485                                        /* arbitrarily yield */
5486                                        mddev->curr_resync = 1;
5487                                        wake_up(&resync_wait);
5488                                }
5489                                if (mddev > mddev2 && mddev->curr_resync == 1)
5490                                        /* no need to wait here, we can wait the next
5491                                         * time 'round when curr_resync == 2
5492                                         */
5493                                        continue;
5494                                prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
5495                                if (!kthread_should_stop() &&
5496                                    mddev2->curr_resync >= mddev->curr_resync) {
5497                                        printk(KERN_INFO "md: delaying %s of %s"
5498                                               " until %s has finished (they"
5499                                               " share one or more physical units)\n",
5500                                               desc, mdname(mddev), mdname(mddev2));
5501                                        mddev_put(mddev2);
5502                                        schedule();
5503                                        finish_wait(&resync_wait, &wq);
5504                                        goto try_again;
5505                                }
5506                                finish_wait(&resync_wait, &wq);
5507                        }
5508                }
5509        } while (mddev->curr_resync < 2);
5510
5511        j = 0;
5512        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5513                /* resync follows the size requested by the personality,
5514                 * which defaults to physical size, but can be virtual size
5515                 */
5516                max_sectors = mddev->resync_max_sectors;
5517                mddev->resync_mismatches = 0;
5518                /* we don't use the checkpoint if there's a bitmap */
5519                if (!mddev->bitmap &&
5520                    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5521                        j = mddev->recovery_cp;
5522        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5523                max_sectors = mddev->size << 1;
5524        else {
5525                /* recovery follows the physical size of devices */
5526                max_sectors = mddev->size << 1;
5527                j = MaxSector;
5528                rdev_for_each(rdev, rtmp, mddev)
5529                        if (rdev->raid_disk >= 0 &&
5530                            !test_bit(Faulty, &rdev->flags) &&
5531                            !test_bit(In_sync, &rdev->flags) &&
5532                            rdev->recovery_offset < j)
5533                                j = rdev->recovery_offset;
5534        }
5535
5536        printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
5537        printk(KERN_INFO "md: minimum _guaranteed_  speed:"
5538                " %d KB/sec/disk.\n", speed_min(mddev));
5539        printk(KERN_INFO "md: using maximum available idle IO bandwidth "
5540               "(but not more than %d KB/sec) for %s.\n",
5541               speed_max(mddev), desc);
5542
5543        is_mddev_idle(mddev); /* this also initializes IO event counters */
5544
5545        io_sectors = 0;
5546        for (m = 0; m < SYNC_MARKS; m++) {
5547                mark[m] = jiffies;
5548                mark_cnt[m] = io_sectors;
5549        }
5550        last_mark = 0;
5551        mddev->resync_mark = mark[last_mark];
5552        mddev->resync_mark_cnt = mark_cnt[last_mark];
5553
5554        /*
5555         * Tune reconstruction:
5556         */
5557        window = 32*(PAGE_SIZE/512);
5558        printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
5559                window/2,(unsigned long long) max_sectors/2);
5560
5561        atomic_set(&mddev->recovery_active, 0);
5562        init_waitqueue_head(&mddev->recovery_wait);
5563        last_check = 0;
5564
5565        if (j>2) {
5566                printk(KERN_INFO 
5567                       "md: resuming %s of %s from checkpoint.\n",
5568                       desc, mdname(mddev));
5569                mddev->curr_resync = j;
5570        }
5571
5572        while (j < max_sectors) {
5573                sector_t sectors;
5574
5575                skipped = 0;
5576                if (j >= mddev->resync_max) {
5577                        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5578                        wait_event(mddev->recovery_wait,
5579                                   mddev->resync_max > j
5580                                   || kthread_should_stop());
5581                }
5582                if (kthread_should_stop())
5583                        goto interrupted;
5584                sectors = mddev->pers->sync_request(mddev, j, &skipped,
5585                                                  currspeed < speed_min(mddev));
5586                if (sectors == 0) {
5587                        set_bit(MD_RECOVERY_ERR, &mddev->recovery);
5588                        goto out;
5589                }
5590
5591                if (!skipped) { /* actual IO requested */
5592                        io_sectors += sectors;
5593                        atomic_add(sectors, &mddev->recovery_active);
5594                }
5595
5596                j += sectors;
5597                if (j>1) mddev->curr_resync = j;
5598                mddev->curr_mark_cnt = io_sectors;
5599                if (last_check == 0)
5600                        /* this is the earliers that rebuilt will be
5601                         * visible in /proc/mdstat
5602                         */
5603                        md_new_event(mddev);
5604
5605                if (last_check + window > io_sectors || j == max_sectors)
5606                        continue;
5607
5608                last_check = io_sectors;
5609
5610                if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
5611                    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
5612                        break;
5613
5614        repeat:
5615                if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
5616                        /* step marks */
5617                        int next = (last_mark+1) % SYNC_MARKS;
5618
5619                        mddev->resync_mark = mark[next];
5620                        mddev->resync_mark_cnt = mark_cnt[next];
5621                        mark[next] = jiffies;
5622                        mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
5623                        last_mark = next;
5624                }
5625
5626
5627                if (kthread_should_stop())
5628                        goto interrupted;
5629
5630
5631                /*
5632                 * this loop exits only if either when we are slower than
5633                 * the 'hard' speed limit, or the system was IO-idle for
5634                 * a jiffy.
5635                 * the system might be non-idle CPU-wise, but we only care
5636                 * about not overloading the IO subsystem. (things like an
5637                 * e2fsck being done on the RAID array should execute fast)
5638                 */
5639                blk_unplug(mddev->queue);
5640                cond_resched();
5641
5642                currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
5643                        /((jiffies-mddev->resync_mark)/HZ +1) +1;
5644
5645                if (currspeed > speed_min(mddev)) {
5646                        if ((currspeed > speed_max(mddev)) ||
5647                                        !is_mddev_idle(mddev)) {
5648                                msleep(500);
5649                                goto repeat;
5650                        }
5651                }
5652        }
5653        printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
5654        /*
5655         * this also signals 'finished resyncing' to md_stop
5656         */
5657 out:
5658        blk_unplug(mddev->queue);
5659
5660        wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
5661
5662        /* tell personality that we are finished */
5663        mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
5664
5665        if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
5666            !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
5667            mddev->curr_resync > 2) {
5668                if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5669                        if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5670                                if (mddev->curr_resync >= mddev->recovery_cp) {
5671                                        printk(KERN_INFO
5672                                               "md: checkpointing %s of %s.\n",
5673                                               desc, mdname(mddev));
5674                                        mddev->recovery_cp = mddev->curr_resync;
5675                                }
5676                        } else
5677                                mddev->recovery_cp = MaxSector;
5678                } else {
5679                        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5680                                mddev->curr_resync = MaxSector;
5681                        rdev_for_each(rdev, rtmp, mddev)
5682                                if (rdev->raid_disk >= 0 &&
5683                                    !test_bit(Faulty, &rdev->flags) &&
5684                                    !test_bit(In_sync, &rdev->flags) &&
5685                                    rdev->recovery_offset < mddev->curr_resync)
5686                                        rdev->recovery_offset = mddev->curr_resync;
5687                }
5688        }
5689        set_bit(MD_CHANGE_DEVS, &mddev->flags);
5690
5691 skip:
5692        mddev->curr_resync = 0;
5693        mddev->resync_max = MaxSector;
5694        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5695        wake_up(&resync_wait);
5696        set_bit(MD_RECOVERY_DONE, &mddev->recovery);
5697        md_wakeup_thread(mddev->thread);
5698        return;
5699
5700 interrupted:
5701        /*
5702         * got a signal, exit.
5703         */
5704        printk(KERN_INFO
5705               "md: md_do_sync() got signal ... exiting\n");
5706        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5707        goto out;
5708
5709}
5710EXPORT_SYMBOL_GPL(md_do_sync);
5711
5712
5713static int remove_and_add_spares(mddev_t *mddev)
5714{
5715        mdk_rdev_t *rdev;
5716        struct list_head *rtmp;
5717        int spares = 0;
5718
5719        rdev_for_each(rdev, rtmp, mddev)
5720                if (rdev->raid_disk >= 0 &&
5721                    !mddev->external &&
5722                    (test_bit(Faulty, &rdev->flags) ||
5723                     ! test_bit(In_sync, &rdev->flags)) &&
5724                    atomic_read(&rdev->nr_pending)==0) {
5725                        if (mddev->pers->hot_remove_disk(
5726                                    mddev, rdev->raid_disk)==0) {
5727                                char nm[20];
5728                                sprintf(nm,"rd%d", rdev->raid_disk);
5729                                sysfs_remove_link(&mddev->kobj, nm);
5730                                rdev->raid_disk = -1;
5731                        }
5732                }
5733
5734        if (mddev->degraded) {
5735                rdev_for_each(rdev, rtmp, mddev)
5736                        if (rdev->raid_disk < 0
5737                            && !test_bit(Faulty, &rdev->flags)) {
5738                                rdev->recovery_offset = 0;
5739                                if (mddev->pers->hot_add_disk(mddev,rdev)) {
5740                                        char nm[20];
5741                                        sprintf(nm, "rd%d", rdev->raid_disk);
5742                                        if (sysfs_create_link(&mddev->kobj,
5743                                                              &rdev->kobj, nm))
5744                                                printk(KERN_WARNING
5745                                                       "md: cannot register "
5746                                                       "%s for %s\n",
5747                                                       nm, mdname(mddev));
5748                                        spares++;
5749                                        md_new_event(mddev);
5750                                } else
5751                                        break;
5752                        }
5753        }
5754        return spares;
5755}
5756/*
5757 * This routine is regularly called by all per-raid-array threads to
5758 * deal with generic issues like resync and super-block update.
5759 * Raid personalities that don't have a thread (linear/raid0) do not
5760 * need this as they never do any recovery or update the superblock.
5761 *
5762 * It does not do any resync itself, but rather "forks" off other threads
5763 * to do that as needed.
5764 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
5765 * "->recovery" and create a thread at ->sync_thread.
5766 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
5767 * and wakeups up this thread which will reap the thread and finish up.
5768 * This thread also removes any faulty devices (with nr_pending == 0).
5769 *
5770 * The overall approach is:
5771 *  1/ if the superblock needs updating, update it.
5772 *  2/ If a recovery thread is running, don't do anything else.
5773 *  3/ If recovery has finished, clean up, possibly marking spares active.
5774 *  4/ If there are any faulty devices, remove them.
5775 *  5/ If array is degraded, try to add spares devices
5776 *  6/ If array has spares or is not in-sync, start a resync thread.
5777 */
5778void md_check_recovery(mddev_t *mddev)
5779{
5780        mdk_rdev_t *rdev;
5781        struct list_head *rtmp;
5782
5783
5784        if (mddev->bitmap)
5785                bitmap_daemon_work(mddev->bitmap);
5786
5787        if (mddev->ro)
5788                return;
5789
5790        if (signal_pending(current)) {
5791                if (mddev->pers->sync_request) {
5792                        printk(KERN_INFO "md: %s in immediate safe mode\n",
5793                               mdname(mddev));
5794                        mddev->safemode = 2;
5795                }
5796                flush_signals(current);
5797        }
5798
5799        if ( ! (
5800                (mddev->flags && !mddev->external) ||
5801                test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
5802                test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
5803                (mddev->safemode == 1) ||
5804                (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
5805                 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
5806                ))
5807                return;
5808
5809        if (mddev_trylock(mddev)) {
5810                int spares = 0;
5811
5812                spin_lock_irq(&mddev->write_lock);
5813                if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
5814                    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
5815                        mddev->in_sync = 1;
5816                        if (mddev->persistent)
5817                                set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5818                }
5819                if (mddev->safemode == 1)
5820                        mddev->safemode = 0;
5821                spin_unlock_irq(&mddev->write_lock);
5822
5823                if (mddev->flags)
5824                        md_update_sb(mddev, 0);
5825
5826
5827                if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
5828                    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
5829                        /* resync/recovery still happening */
5830                        clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5831                        goto unlock;
5832                }
5833                if (mddev->sync_thread) {
5834                        /* resync has finished, collect result */
5835                        md_unregister_thread(mddev->sync_thread);
5836                        mddev->sync_thread = NULL;
5837                        if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
5838                            !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5839                                /* success...*/
5840                                /* activate any spares */
5841                                mddev->pers->spare_active(mddev);
5842                        }
5843                        md_update_sb(mddev, 1);
5844
5845                        /* if array is no-longer degraded, then any saved_raid_disk
5846                         * information must be scrapped
5847                         */
5848                        if (!mddev->degraded)
5849                                rdev_for_each(rdev, rtmp, mddev)
5850                                        rdev->saved_raid_disk = -1;
5851
5852                        mddev->recovery = 0;
5853                        /* flag recovery needed just to double check */
5854                        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5855                        md_new_event(mddev);
5856                        goto unlock;
5857                }
5858                /* Clear some bits that don't mean anything, but
5859                 * might be left set
5860                 */
5861                clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5862                clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
5863                clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
5864                clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
5865
5866                if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
5867                        goto unlock;
5868                /* no recovery is running.
5869                 * remove any failed drives, then
5870                 * add spares if possible.
5871                 * Spare are also removed and re-added, to allow
5872                 * the personality to fail the re-add.
5873                 */
5874
5875                if (mddev->reshape_position != MaxSector) {
5876                        if (mddev->pers->check_reshape(mddev) != 0)
5877                                /* Cannot proceed */
5878                                goto unlock;
5879                        set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
5880                } else if ((spares = remove_and_add_spares(mddev))) {
5881                        clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5882                        clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5883                } else if (mddev->recovery_cp < MaxSector) {
5884                        set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5885                } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5886                        /* nothing to be done ... */
5887                        goto unlock;
5888
5889                if (mddev->pers->sync_request) {
5890                        set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
5891                        if (spares && mddev->bitmap && ! mddev->bitmap->file) {
5892                                /* We are adding a device or devices to an array
5893                                 * which has the bitmap stored on all devices.
5894                                 * So make sure all bitmap pages get written
5895                                 */
5896                                bitmap_write_all(mddev->bitmap);
5897                        }
5898                        mddev->sync_thread = md_register_thread(md_do_sync,
5899                                                                mddev,
5900                                                                "%s_resync");
5901                        if (!mddev->sync_thread) {
5902                                printk(KERN_ERR "%s: could not start resync"
5903                                        " thread...\n", 
5904                                        mdname(mddev));
5905                                /* leave the spares where they are, it shouldn't hurt */
5906                                mddev->recovery = 0;
5907                        } else
5908                                md_wakeup_thread(mddev->sync_thread);
5909                        md_new_event(mddev);
5910                }
5911        unlock:
5912                mddev_unlock(mddev);
5913        }
5914}
5915
5916static int md_notify_reboot(struct notifier_block *this,
5917                            unsigned long code, void *x)
5918{
5919        struct list_head *tmp;
5920        mddev_t *mddev;
5921
5922        if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
5923
5924                printk(KERN_INFO "md: stopping all md devices.\n");
5925
5926                for_each_mddev(mddev, tmp)
5927                        if (mddev_trylock(mddev)) {
5928                                do_md_stop (mddev, 1);
5929                                mddev_unlock(mddev);
5930                        }
5931                /*
5932                 * certain more exotic SCSI devices are known to be
5933                 * volatile wrt too early system reboots. While the
5934                 * right place to handle this issue is the given
5935                 * driver, we do want to have a safe RAID driver ...
5936                 */
5937                mdelay(1000*1);
5938        }
5939        return NOTIFY_DONE;
5940}
5941
5942static struct notifier_block md_notifier = {
5943        .notifier_call  = md_notify_reboot,
5944        .next           = NULL,
5945        .priority       = INT_MAX, /* before any real devices */
5946};
5947
5948static void md_geninit(void)
5949{
5950        struct proc_dir_entry *p;
5951
5952        dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
5953
5954        p = create_proc_entry("mdstat", S_IRUGO, NULL);
5955        if (p)
5956                p->proc_fops = &md_seq_fops;
5957}
5958
5959static int __init md_init(void)
5960{
5961        if (register_blkdev(MAJOR_NR, "md"))
5962                return -1;
5963        if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
5964                unregister_blkdev(MAJOR_NR, "md");
5965                return -1;
5966        }
5967        blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE,
5968                            md_probe, NULL, NULL);
5969        blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
5970                            md_probe, NULL, NULL);
5971
5972        register_reboot_notifier(&md_notifier);
5973        raid_table_header = register_sysctl_table(raid_root_table);
5974
5975        md_geninit();
5976        return (0);
5977}
5978
5979
5980#ifndef MODULE
5981
5982/*
5983 * Searches all registered partitions for autorun RAID arrays
5984 * at boot time.
5985 */
5986
/* Devices queued by md_autodetect_dev() and drained by autostart_arrays(). */
static LIST_HEAD(all_detected_devices);

/* One queued device: its list linkage plus the device number. */
struct detected_devices_node {
        struct list_head list;  /* link in all_detected_devices */
        dev_t dev;              /* device number recorded at detection time */
};
5992
5993void md_autodetect_dev(dev_t dev)
5994{
5995        struct detected_devices_node *node_detected_dev;
5996
5997        node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
5998        if (node_detected_dev) {
5999                node_detected_dev->dev = dev;
6000                list_add_tail(&node_detected_dev->list, &all_detected_devices);
6001        } else {
6002                printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
6003                        ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
6004        }
6005}
6006
6007
6008static void autostart_arrays(int part)
6009{
6010        mdk_rdev_t *rdev;
6011        struct detected_devices_node *node_detected_dev;
6012        dev_t dev;
6013        int i_scanned, i_passed;
6014
6015        i_scanned = 0;
6016        i_passed = 0;
6017
6018        printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
6019
6020        while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
6021                i_scanned++;
6022                node_detected_dev = list_entry(all_detected_devices.next,
6023                                        struct detected_devices_node, list);
6024                list_del(&node_detected_dev->list);
6025                dev = node_detected_dev->dev;
6026                kfree(node_detected_dev);
6027                rdev = md_import_device(dev,0, 90);
6028                if (IS_ERR(rdev))
6029                        continue;
6030
6031                if (test_bit(Faulty, &rdev->flags)) {
6032                        MD_BUG();
6033                        continue;
6034                }
6035                set_bit(AutoDetected, &rdev->flags);
6036                list_add(&rdev->same_set, &pending_raid_disks);
6037                i_passed++;
6038        }
6039
6040        printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
6041                                                i_scanned, i_passed);
6042
6043        autorun_devices(part);
6044}
6045
6046#endif /* !MODULE */
6047
6048static __exit void md_exit(void)
6049{
6050        mddev_t *mddev;
6051        struct list_head *tmp;
6052
6053        blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS);
6054        blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
6055
6056        unregister_blkdev(MAJOR_NR,"md");
6057        unregister_blkdev(mdp_major, "mdp");
6058        unregister_reboot_notifier(&md_notifier);
6059        unregister_sysctl_table(raid_table_header);
6060        remove_proc_entry("mdstat", NULL);
6061        for_each_mddev(mddev, tmp) {
6062                struct gendisk *disk = mddev->gendisk;
6063                if (!disk)
6064                        continue;
6065                export_array(mddev);
6066                del_gendisk(disk);
6067                put_disk(disk);
6068                mddev->gendisk = NULL;
6069                mddev_put(mddev);
6070        }
6071}
6072
/* Register md_init as a subsys initcall and md_exit as the exit handler. */
subsys_initcall(md_init);
module_exit(md_exit)
6075
6076static int get_ro(char *buffer, struct kernel_param *kp)
6077{
6078        return sprintf(buffer, "%d", start_readonly);
6079}
6080static int set_ro(const char *val, struct kernel_param *kp)
6081{
6082        char *e;
6083        int num = simple_strtoul(val, &e, 10);
6084        if (*val && (*e == '\0' || *e == '\n')) {
6085                start_readonly = num;
6086                return 0;
6087        }
6088        return -EINVAL;
6089}
6090
/* start_ro is read and written through get_ro()/set_ro(); owner access only. */
module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
/* start_dirty_degraded is a plain int: world-readable, owner-writable. */
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);


/* Symbols made available to other kernel code. */
EXPORT_SYMBOL(register_md_personality);
EXPORT_SYMBOL(unregister_md_personality);
EXPORT_SYMBOL(md_error);
EXPORT_SYMBOL(md_done_sync);
EXPORT_SYMBOL(md_write_start);
EXPORT_SYMBOL(md_write_end);
EXPORT_SYMBOL(md_register_thread);
EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_check_recovery);
/* Module metadata: licence and the aliases this module answers to. */
MODULE_LICENSE("GPL");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
6108
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.