linux/drivers/md/md.c
<<
>>
Prefs
   1/*
   2   md.c : Multiple Devices driver for Linux
   3          Copyright (C) 1998, 1999, 2000 Ingo Molnar
   4
   5     completely rewritten, based on the MD driver code from Marc Zyngier
   6
   7   Changes:
   8
   9   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
  10   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
  11   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
  12   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
  13   - kmod support by: Cyrus Durgin
  14   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
  15   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
  16
  17   - lots of fixes and improvements to the RAID1/RAID5 and generic
  18     RAID code (such as request based resynchronization):
  19
  20     Neil Brown <neilb@cse.unsw.edu.au>.
  21
  22   - persistent bitmap code
  23     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
  24
  25   This program is free software; you can redistribute it and/or modify
  26   it under the terms of the GNU General Public License as published by
  27   the Free Software Foundation; either version 2, or (at your option)
  28   any later version.
  29
  30   You should have received a copy of the GNU General Public License
  31   (for example /usr/src/linux/COPYING); if not, write to the Free
  32   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  33*/
  34
  35#include <linux/kthread.h>
  36#include <linux/raid/md.h>
  37#include <linux/raid/bitmap.h>
  38#include <linux/sysctl.h>
  39#include <linux/buffer_head.h> /* for invalidate_bdev */
  40#include <linux/poll.h>
  41#include <linux/ctype.h>
  42#include <linux/hdreg.h>
  43#include <linux/proc_fs.h>
  44#include <linux/random.h>
  45#include <linux/reboot.h>
  46#include <linux/file.h>
  47#include <linux/delay.h>
  48
/* NOTE(review): MAJOR_NR is not referenced in the visible part of this
 * file; presumably consumed by an included header - confirm.
 */
  49#define MAJOR_NR MD_MAJOR
  50
  51/* 63 partitions with the alternate major number (mdp).
 * mddev_find() shifts the minor number right by this amount to obtain
 * md_minor for units whose major is not MD_MAJOR.
 */
  52#define MdpMinorShift 6
  53
/*
 * Debug tracing switch.  With DEBUG defined as 0 the && in dprintk()
 * short-circuits, so printk() is never called, while the arguments are
 * still seen by the compiler.
 */
  54#define DEBUG 0
  55#define dprintk(x...) ((void)(DEBUG && printk(x)))
  56
  57
/* Forward declaration; only present when md is built into the kernel. */
  58#ifndef MODULE
  59static void autostart_arrays(int part);
  60#endif
  61
/* List of personalities searched by find_pers().
 * NOTE(review): pers_lock presumably protects pers_list - no user of the
 * lock is visible in this part of the file, confirm.
 */
  62static LIST_HEAD(pers_list);
  63static DEFINE_SPINLOCK(pers_lock);
  64
/* Forward declaration, needed by MD_BUG() below. */
  65static void md_print_devices(void);
  66
/* NOTE(review): presumably waited on by resync code; not used in the
 * visible part of the file.
 */
  67static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
  68
/*
 * Report an internal inconsistency: print the source file and line, then
 * call md_print_devices().  The arguments are ignored.  The expansion is
 * a bare brace block, so "if (x) MD_BUG(); else ..." does not compile.
 */
  69#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
  70
  71/*
  72 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
  73 * is 1000 KB/sec, so the extra system load does not show up that much.
  74 * Increase it if you want to have more _guaranteed_ speed. Note that
  75 * the RAID driver will use the maximum available bandwidth if the IO
  76 * subsystem is idle. There is also an 'absolute maximum' reconstruction
  77 * speed limit - in case reconstruction slows down your system despite
  78 * idle IO detection.
  79 *
  80 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
  81 * or /sys/block/mdX/md/sync_speed_{min,max}
  82 */
  83
  84static int sysctl_speed_limit_min = 1000;
  85static int sysctl_speed_limit_max = 200000;
  86static inline int speed_min(mddev_t *mddev)
  87{
  88        return mddev->sync_speed_min ?
  89                mddev->sync_speed_min : sysctl_speed_limit_min;
  90}
  91
  92static inline int speed_max(mddev_t *mddev)
  93{
  94        return mddev->sync_speed_max ?
  95                mddev->sync_speed_max : sysctl_speed_limit_max;
  96}
  97
  98static struct ctl_table_header *raid_table_header;
  99
 100static ctl_table raid_table[] = {
 101        {
 102                .ctl_name       = DEV_RAID_SPEED_LIMIT_MIN,
 103                .procname       = "speed_limit_min",
 104                .data           = &sysctl_speed_limit_min,
 105                .maxlen         = sizeof(int),
 106                .mode           = S_IRUGO|S_IWUSR,
 107                .proc_handler   = &proc_dointvec,
 108        },
 109        {
 110                .ctl_name       = DEV_RAID_SPEED_LIMIT_MAX,
 111                .procname       = "speed_limit_max",
 112                .data           = &sysctl_speed_limit_max,
 113                .maxlen         = sizeof(int),
 114                .mode           = S_IRUGO|S_IWUSR,
 115                .proc_handler   = &proc_dointvec,
 116        },
 117        { .ctl_name = 0 }
 118};
 119
 120static ctl_table raid_dir_table[] = {
 121        {
 122                .ctl_name       = DEV_RAID,
 123                .procname       = "raid",
 124                .maxlen         = 0,
 125                .mode           = S_IRUGO|S_IXUGO,
 126                .child          = raid_table,
 127        },
 128        { .ctl_name = 0 }
 129};
 130
 131static ctl_table raid_root_table[] = {
 132        {
 133                .ctl_name       = CTL_DEV,
 134                .procname       = "dev",
 135                .maxlen         = 0,
 136                .mode           = 0555,
 137                .child          = raid_dir_table,
 138        },
 139        { .ctl_name = 0 }
 140};
 141
/* Forward declaration; the initializer is not in this part of the file. */
 142static struct block_device_operations md_fops;
 143
/* NOTE(review): not referenced in the visible part of the file; the name
 * suggests arrays are started read-only when non-zero - confirm.
 */
 144static int start_readonly;
 145
 146/*
 147 * We have a system wide 'event count' that is incremented
 148 * on any 'interesting' event, and readers of /proc/mdstat
 149 * can use 'poll' or 'select' to find out when the event
 150 * count increases.
 151 *
 152 * Events are:
 153 *  start array, stop array, error, add device, remove device,
 154 *  start build, activate spare
 155 */
 156static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
 157static atomic_t md_event_count;
 158void md_new_event(mddev_t *mddev)
 159{
 160        atomic_inc(&md_event_count);
 161        wake_up(&md_event_waiters);
 162}
 163EXPORT_SYMBOL_GPL(md_new_event);
 164
 165/* Alternate version that can be called from interrupts
 166 * when calling sysfs_notify isn't needed.
 167 */
 168static void md_new_event_inintr(mddev_t *mddev)
 169{
 170        atomic_inc(&md_event_count);
 171        wake_up(&md_event_waiters);
 172}
 173
 174/*
 175 * List of all existing md arrays, so that they can be iterated over;
 176 * all_mddevs_lock protects this list.
 177 */
 178static LIST_HEAD(all_mddevs);
 179static DEFINE_SPINLOCK(all_mddevs_lock);
 180
 181
 182/*
 183 * iterates through all used mddevs in the system.
 184 * We take care to grab the all_mddevs_lock whenever navigating
 185 * the list, and to always hold a refcount when unlocked.
 186 * Any code which breaks out of this loop will own
 187 * a reference to the current mddev and must mddev_put it.
 188 */
/*
 * Loop shape: the init clause takes the lock and points tmp at the first
 * list entry.  The condition clause pins the entry at tmp (if it is a
 * real entry) with mddev_get(), drops the lock, releases the previous
 * mddev, and then tests whether tmp is a real entry.  The step clause
 * retakes the lock and advances tmp.  The loop body therefore runs with
 * the lock released and a reference held on mddev.
 */
 189#define for_each_mddev(mddev,tmp)                                       \
 190                                                                        \
 191        for (({ spin_lock(&all_mddevs_lock);                            \
 192                tmp = all_mddevs.next;                                  \
 193                mddev = NULL;});                                        \
 194             ({ if (tmp != &all_mddevs)                                 \
 195                        mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
 196                spin_unlock(&all_mddevs_lock);                          \
 197                if (mddev) mddev_put(mddev);                            \
 198                mddev = list_entry(tmp, mddev_t, all_mddevs);           \
 199                tmp != &all_mddevs;});                                  \
 200             ({ spin_lock(&all_mddevs_lock);                            \
 201                tmp = tmp->next;})                                      \
 202                )
 203
 204
 205static int md_fail_request(struct request_queue *q, struct bio *bio)
 206{
 207        bio_io_error(bio);
 208        return 0;
 209}
 210
 211static inline mddev_t *mddev_get(mddev_t *mddev)
 212{
 213        atomic_inc(&mddev->active);
 214        return mddev;
 215}
 216
 217static void mddev_put(mddev_t *mddev)
 218{
 219        if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
 220                return;
 221        if (!mddev->raid_disks && list_empty(&mddev->disks)) {
 222                list_del(&mddev->all_mddevs);
 223                spin_unlock(&all_mddevs_lock);
 224                blk_cleanup_queue(mddev->queue);
 225                if (mddev->sysfs_state)
 226                        sysfs_put(mddev->sysfs_state);
 227                mddev->sysfs_state = NULL;
 228                kobject_put(&mddev->kobj);
 229        } else
 230                spin_unlock(&all_mddevs_lock);
 231}
 232
 233static mddev_t * mddev_find(dev_t unit)
 234{
 235        mddev_t *mddev, *new = NULL;
 236
 237 retry:
 238        spin_lock(&all_mddevs_lock);
 239        list_for_each_entry(mddev, &all_mddevs, all_mddevs)
 240                if (mddev->unit == unit) {
 241                        mddev_get(mddev);
 242                        spin_unlock(&all_mddevs_lock);
 243                        kfree(new);
 244                        return mddev;
 245                }
 246
 247        if (new) {
 248                list_add(&new->all_mddevs, &all_mddevs);
 249                spin_unlock(&all_mddevs_lock);
 250                return new;
 251        }
 252        spin_unlock(&all_mddevs_lock);
 253
 254        new = kzalloc(sizeof(*new), GFP_KERNEL);
 255        if (!new)
 256                return NULL;
 257
 258        new->unit = unit;
 259        if (MAJOR(unit) == MD_MAJOR)
 260                new->md_minor = MINOR(unit);
 261        else
 262                new->md_minor = MINOR(unit) >> MdpMinorShift;
 263
 264        mutex_init(&new->reconfig_mutex);
 265        INIT_LIST_HEAD(&new->disks);
 266        INIT_LIST_HEAD(&new->all_mddevs);
 267        init_timer(&new->safemode_timer);
 268        atomic_set(&new->active, 1);
 269        atomic_set(&new->openers, 0);
 270        spin_lock_init(&new->write_lock);
 271        init_waitqueue_head(&new->sb_wait);
 272        init_waitqueue_head(&new->recovery_wait);
 273        new->reshape_position = MaxSector;
 274        new->resync_min = 0;
 275        new->resync_max = MaxSector;
 276        new->level = LEVEL_NONE;
 277
 278        new->queue = blk_alloc_queue(GFP_KERNEL);
 279        if (!new->queue) {
 280                kfree(new);
 281                return NULL;
 282        }
 283        /* Can be unlocked because the queue is new: no concurrency */
 284        queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, new->queue);
 285
 286        blk_queue_make_request(new->queue, md_fail_request);
 287
 288        goto retry;
 289}
 290
 291static inline int mddev_lock(mddev_t * mddev)
 292{
 293        return mutex_lock_interruptible(&mddev->reconfig_mutex);
 294}
 295
 296static inline int mddev_trylock(mddev_t * mddev)
 297{
 298        return mutex_trylock(&mddev->reconfig_mutex);
 299}
 300
 301static inline void mddev_unlock(mddev_t * mddev)
 302{
 303        mutex_unlock(&mddev->reconfig_mutex);
 304
 305        md_wakeup_thread(mddev->thread);
 306}
 307
 308static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
 309{
 310        mdk_rdev_t * rdev;
 311        struct list_head *tmp;
 312
 313        rdev_for_each(rdev, tmp, mddev) {
 314                if (rdev->desc_nr == nr)
 315                        return rdev;
 316        }
 317        return NULL;
 318}
 319
 320static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
 321{
 322        struct list_head *tmp;
 323        mdk_rdev_t *rdev;
 324
 325        rdev_for_each(rdev, tmp, mddev) {
 326                if (rdev->bdev->bd_dev == dev)
 327                        return rdev;
 328        }
 329        return NULL;
 330}
 331
 332static struct mdk_personality *find_pers(int level, char *clevel)
 333{
 334        struct mdk_personality *pers;
 335        list_for_each_entry(pers, &pers_list, list) {
 336                if (level != LEVEL_NONE && pers->level == level)
 337                        return pers;
 338                if (strcmp(pers->name, clevel)==0)
 339                        return pers;
 340        }
 341        return NULL;
 342}
 343
 344/* return the offset of the super block in 512byte sectors */
 345static inline sector_t calc_dev_sboffset(struct block_device *bdev)
 346{
 347        sector_t num_sectors = bdev->bd_inode->i_size / 512;
 348        return MD_NEW_SIZE_SECTORS(num_sectors);
 349}
 350
 351static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size)
 352{
 353        sector_t num_sectors = rdev->sb_start;
 354
 355        if (chunk_size)
 356                num_sectors &= ~((sector_t)chunk_size/512 - 1);
 357        return num_sectors;
 358}
 359
 360static int alloc_disk_sb(mdk_rdev_t * rdev)
 361{
 362        if (rdev->sb_page)
 363                MD_BUG();
 364
 365        rdev->sb_page = alloc_page(GFP_KERNEL);
 366        if (!rdev->sb_page) {
 367                printk(KERN_ALERT "md: out of memory.\n");
 368                return -ENOMEM;
 369        }
 370
 371        return 0;
 372}
 373
 374static void free_disk_sb(mdk_rdev_t * rdev)
 375{
 376        if (rdev->sb_page) {
 377                put_page(rdev->sb_page);
 378                rdev->sb_loaded = 0;
 379                rdev->sb_page = NULL;
 380                rdev->sb_start = 0;
 381                rdev->size = 0;
 382        }
 383}
 384
 385
 386static void super_written(struct bio *bio, int error)
 387{
 388        mdk_rdev_t *rdev = bio->bi_private;
 389        mddev_t *mddev = rdev->mddev;
 390
 391        if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
 392                printk("md: super_written gets error=%d, uptodate=%d\n",
 393                       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
 394                WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
 395                md_error(mddev, rdev);
 396        }
 397
 398        if (atomic_dec_and_test(&mddev->pending_writes))
 399                wake_up(&mddev->sb_wait);
 400        bio_put(bio);
 401}
 402
 403static void super_written_barrier(struct bio *bio, int error)
 404{
 405        struct bio *bio2 = bio->bi_private;
 406        mdk_rdev_t *rdev = bio2->bi_private;
 407        mddev_t *mddev = rdev->mddev;
 408
 409        if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
 410            error == -EOPNOTSUPP) {
 411                unsigned long flags;
 412                /* barriers don't appear to be supported :-( */
 413                set_bit(BarriersNotsupp, &rdev->flags);
 414                mddev->barriers_work = 0;
 415                spin_lock_irqsave(&mddev->write_lock, flags);
 416                bio2->bi_next = mddev->biolist;
 417                mddev->biolist = bio2;
 418                spin_unlock_irqrestore(&mddev->write_lock, flags);
 419                wake_up(&mddev->sb_wait);
 420                bio_put(bio);
 421        } else {
 422                bio_put(bio2);
 423                bio->bi_private = rdev;
 424                super_written(bio, error);
 425        }
 426}
 427
 428void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 429                   sector_t sector, int size, struct page *page)
 430{
 431        /* write first size bytes of page to sector of rdev
 432         * Increment mddev->pending_writes before returning
 433         * and decrement it on completion, waking up sb_wait
 434         * if zero is reached.
 435         * If an error occurred, call md_error
 436         *
 437         * As we might need to resubmit the request if BIO_RW_BARRIER
 438         * causes ENOTSUPP, we allocate a spare bio...
 439         */
 440        struct bio *bio = bio_alloc(GFP_NOIO, 1);
 441        int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
 442
 443        bio->bi_bdev = rdev->bdev;
 444        bio->bi_sector = sector;
 445        bio_add_page(bio, page, size, 0);
 446        bio->bi_private = rdev;
 447        bio->bi_end_io = super_written;
 448        bio->bi_rw = rw;
 449
 450        atomic_inc(&mddev->pending_writes);
 451        if (!test_bit(BarriersNotsupp, &rdev->flags)) {
 452                struct bio *rbio;
 453                rw |= (1<<BIO_RW_BARRIER);
 454                rbio = bio_clone(bio, GFP_NOIO);
 455                rbio->bi_private = bio;
 456                rbio->bi_end_io = super_written_barrier;
 457                submit_bio(rw, rbio);
 458        } else
 459                submit_bio(rw, bio);
 460}
 461
 462void md_super_wait(mddev_t *mddev)
 463{
 464        /* wait for all superblock writes that were scheduled to complete.
 465         * if any had to be retried (due to BARRIER problems), retry them
 466         */
 467        DEFINE_WAIT(wq);
 468        for(;;) {
 469                prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
 470                if (atomic_read(&mddev->pending_writes)==0)
 471                        break;
 472                while (mddev->biolist) {
 473                        struct bio *bio;
 474                        spin_lock_irq(&mddev->write_lock);
 475                        bio = mddev->biolist;
 476                        mddev->biolist = bio->bi_next ;
 477                        bio->bi_next = NULL;
 478                        spin_unlock_irq(&mddev->write_lock);
 479                        submit_bio(bio->bi_rw, bio);
 480                }
 481                schedule();
 482        }
 483        finish_wait(&mddev->sb_wait, &wq);
 484}
 485
 486static void bi_complete(struct bio *bio, int error)
 487{
 488        complete((struct completion*)bio->bi_private);
 489}
 490
 491int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 492                   struct page *page, int rw)
 493{
 494        struct bio *bio = bio_alloc(GFP_NOIO, 1);
 495        struct completion event;
 496        int ret;
 497
 498        rw |= (1 << BIO_RW_SYNC);
 499
 500        bio->bi_bdev = bdev;
 501        bio->bi_sector = sector;
 502        bio_add_page(bio, page, size, 0);
 503        init_completion(&event);
 504        bio->bi_private = &event;
 505        bio->bi_end_io = bi_complete;
 506        submit_bio(rw, bio);
 507        wait_for_completion(&event);
 508
 509        ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
 510        bio_put(bio);
 511        return ret;
 512}
 513EXPORT_SYMBOL_GPL(sync_page_io);
 514
 515static int read_disk_sb(mdk_rdev_t * rdev, int size)
 516{
 517        char b[BDEVNAME_SIZE];
 518        if (!rdev->sb_page) {
 519                MD_BUG();
 520                return -EINVAL;
 521        }
 522        if (rdev->sb_loaded)
 523                return 0;
 524
 525
 526        if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
 527                goto fail;
 528        rdev->sb_loaded = 1;
 529        return 0;
 530
 531fail:
 532        printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
 533                bdevname(rdev->bdev,b));
 534        return -EINVAL;
 535}
 536
 537static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 538{
 539        return  sb1->set_uuid0 == sb2->set_uuid0 &&
 540                sb1->set_uuid1 == sb2->set_uuid1 &&
 541                sb1->set_uuid2 == sb2->set_uuid2 &&
 542                sb1->set_uuid3 == sb2->set_uuid3;
 543}
 544
 545static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 546{
 547        int ret;
 548        mdp_super_t *tmp1, *tmp2;
 549
 550        tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
 551        tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
 552
 553        if (!tmp1 || !tmp2) {
 554                ret = 0;
 555                printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
 556                goto abort;
 557        }
 558
 559        *tmp1 = *sb1;
 560        *tmp2 = *sb2;
 561
 562        /*
 563         * nr_disks is not constant
 564         */
 565        tmp1->nr_disks = 0;
 566        tmp2->nr_disks = 0;
 567
 568        ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
 569abort:
 570        kfree(tmp1);
 571        kfree(tmp2);
 572        return ret;
 573}
 574
 575
 576static u32 md_csum_fold(u32 csum)
 577{
 578        csum = (csum & 0xffff) + (csum >> 16);
 579        return (csum & 0xffff) + (csum >> 16);
 580}
 581
 582static unsigned int calc_sb_csum(mdp_super_t * sb)
 583{
 584        u64 newcsum = 0;
 585        u32 *sb32 = (u32*)sb;
 586        int i;
 587        unsigned int disk_csum, csum;
 588
 589        disk_csum = sb->sb_csum;
 590        sb->sb_csum = 0;
 591
 592        for (i = 0; i < MD_SB_BYTES/4 ; i++)
 593                newcsum += sb32[i];
 594        csum = (newcsum & 0xffffffff) + (newcsum>>32);
 595
 596
 597#ifdef CONFIG_ALPHA
 598        /* This used to use csum_partial, which was wrong for several
 599         * reasons including that different results are returned on
 600         * different architectures.  It isn't critical that we get exactly
 601         * the same return value as before (we always csum_fold before
 602         * testing, and that removes any differences).  However as we
 603         * know that csum_partial always returned a 16bit value on
 604         * alphas, do a fold to maximise conformity to previous behaviour.
 605         */
 606        sb->sb_csum = md_csum_fold(disk_csum);
 607#else
 608        sb->sb_csum = disk_csum;
 609#endif
 610        return csum;
 611}
 612
 613
 614/*
 615 * Handle superblock details.
 616 * We want to be able to handle multiple superblock formats
 617 * so we have a common interface to them all, and an array of
 618 * different handlers.
 619 * We rely on user-space to write the initial superblock, and support
 620 * reading and updating of superblocks.
 621 * Interface methods are:
 622 *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
 623 *      loads and validates a superblock on dev.
 624 *      if refdev != NULL, compare superblocks on both devices
 625 *    Return:
 626 *      0 - dev has a superblock that is compatible with refdev
 627 *      1 - dev has a superblock that is compatible and newer than refdev
 628 *          so dev should be used as the refdev in future
 629 *     -EINVAL superblock incompatible or invalid
 630 *     -othererror e.g. -EIO
 631 *
 632 *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
 633 *      Verify that dev is acceptable into mddev.
 634 *       The first time, mddev->raid_disks will be 0, and data from
 635 *       dev should be merged in.  Subsequent calls check that dev
 636 *       is new enough.  Return 0 or -EINVAL
 637 *
 638 *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
 639 *     Update the superblock for rdev with data in mddev
 640 *     This does not write to disc.
 641 *
 642 */
 643
 644struct super_type  {
 645        char                *name;
 646        struct module       *owner;
 647        int                 (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
 648                                          int minor_version);
 649        int                 (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
 650        void                (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
 651        unsigned long long  (*rdev_size_change)(mdk_rdev_t *rdev,
 652                                                sector_t num_sectors);
 653};
 654
 655/*
 656 * load_super for 0.90.0 
 657 */
 658static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 659{
 660        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
 661        mdp_super_t *sb;
 662        int ret;
 663
 664        /*
 665         * Calculate the position of the superblock (512byte sectors),
 666         * it's at the end of the disk.
 667         *
 668         * It also happens to be a multiple of 4Kb.
 669         */
 670        rdev->sb_start = calc_dev_sboffset(rdev->bdev);
 671
 672        ret = read_disk_sb(rdev, MD_SB_BYTES);
 673        if (ret) return ret;
 674
 675        ret = -EINVAL;
 676
 677        bdevname(rdev->bdev, b);
 678        sb = (mdp_super_t*)page_address(rdev->sb_page);
 679
 680        if (sb->md_magic != MD_SB_MAGIC) {
 681                printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
 682                       b);
 683                goto abort;
 684        }
 685
 686        if (sb->major_version != 0 ||
 687            sb->minor_version < 90 ||
 688            sb->minor_version > 91) {
 689                printk(KERN_WARNING "Bad version number %d.%d on %s\n",
 690                        sb->major_version, sb->minor_version,
 691                        b);
 692                goto abort;
 693        }
 694
 695        if (sb->raid_disks <= 0)
 696                goto abort;
 697
 698        if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
 699                printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
 700                        b);
 701                goto abort;
 702        }
 703
 704        rdev->preferred_minor = sb->md_minor;
 705        rdev->data_offset = 0;
 706        rdev->sb_size = MD_SB_BYTES;
 707
 708        if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) {
 709                if (sb->level != 1 && sb->level != 4
 710                    && sb->level != 5 && sb->level != 6
 711                    && sb->level != 10) {
 712                        /* FIXME use a better test */
 713                        printk(KERN_WARNING
 714                               "md: bitmaps not supported for this level.\n");
 715                        goto abort;
 716                }
 717        }
 718
 719        if (sb->level == LEVEL_MULTIPATH)
 720                rdev->desc_nr = -1;
 721        else
 722                rdev->desc_nr = sb->this_disk.number;
 723
 724        if (!refdev) {
 725                ret = 1;
 726        } else {
 727                __u64 ev1, ev2;
 728                mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
 729                if (!uuid_equal(refsb, sb)) {
 730                        printk(KERN_WARNING "md: %s has different UUID to %s\n",
 731                                b, bdevname(refdev->bdev,b2));
 732                        goto abort;
 733                }
 734                if (!sb_equal(refsb, sb)) {
 735                        printk(KERN_WARNING "md: %s has same UUID"
 736                               " but different superblock to %s\n",
 737                               b, bdevname(refdev->bdev, b2));
 738                        goto abort;
 739                }
 740                ev1 = md_event(sb);
 741                ev2 = md_event(refsb);
 742                if (ev1 > ev2)
 743                        ret = 1;
 744                else 
 745                        ret = 0;
 746        }
 747        rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2;
 748
 749        if (rdev->size < sb->size && sb->level > 1)
 750                /* "this cannot possibly happen" ... */
 751                ret = -EINVAL;
 752
 753 abort:
 754        return ret;
 755}
 756
 757/*
 758 * validate_super for 0.90.0
 759 */
 760static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 761{
 762        mdp_disk_t *desc;
 763        mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
 764        __u64 ev1 = md_event(sb);
 765
 766        rdev->raid_disk = -1;
 767        clear_bit(Faulty, &rdev->flags);
 768        clear_bit(In_sync, &rdev->flags);
 769        clear_bit(WriteMostly, &rdev->flags);
 770        clear_bit(BarriersNotsupp, &rdev->flags);
 771
 772        if (mddev->raid_disks == 0) {
 773                mddev->major_version = 0;
 774                mddev->minor_version = sb->minor_version;
 775                mddev->patch_version = sb->patch_version;
 776                mddev->external = 0;
 777                mddev->chunk_size = sb->chunk_size;
 778                mddev->ctime = sb->ctime;
 779                mddev->utime = sb->utime;
 780                mddev->level = sb->level;
 781                mddev->clevel[0] = 0;
 782                mddev->layout = sb->layout;
 783                mddev->raid_disks = sb->raid_disks;
 784                mddev->size = sb->size;
 785                mddev->events = ev1;
 786                mddev->bitmap_offset = 0;
 787                mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
 788
 789                if (mddev->minor_version >= 91) {
 790                        mddev->reshape_position = sb->reshape_position;
 791                        mddev->delta_disks = sb->delta_disks;
 792                        mddev->new_level = sb->new_level;
 793                        mddev->new_layout = sb->new_layout;
 794                        mddev->new_chunk = sb->new_chunk;
 795                } else {
 796                        mddev->reshape_position = MaxSector;
 797                        mddev->delta_disks = 0;
 798                        mddev->new_level = mddev->level;
 799                        mddev->new_layout = mddev->layout;
 800                        mddev->new_chunk = mddev->chunk_size;
 801                }
 802
 803                if (sb->state & (1<<MD_SB_CLEAN))
 804                        mddev->recovery_cp = MaxSector;
 805                else {
 806                        if (sb->events_hi == sb->cp_events_hi && 
 807                                sb->events_lo == sb->cp_events_lo) {
 808                                mddev->recovery_cp = sb->recovery_cp;
 809                        } else
 810                                mddev->recovery_cp = 0;
 811                }
 812
 813                memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
 814                memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
 815                memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
 816                memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
 817
 818                mddev->max_disks = MD_SB_DISKS;
 819
 820                if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
 821                    mddev->bitmap_file == NULL)
 822                        mddev->bitmap_offset = mddev->default_bitmap_offset;
 823
 824        } else if (mddev->pers == NULL) {
 825                /* Insist on good event counter while assembling */
 826                ++ev1;
 827                if (ev1 < mddev->events) 
 828                        return -EINVAL;
 829        } else if (mddev->bitmap) {
 830                /* if adding to array with a bitmap, then we can accept an
 831                 * older device ... but not too old.
 832                 */
 833                if (ev1 < mddev->bitmap->events_cleared)
 834                        return 0;
 835        } else {
 836                if (ev1 < mddev->events)
 837                        /* just a hot-add of a new device, leave raid_disk at -1 */
 838                        return 0;
 839        }
 840
 841        if (mddev->level != LEVEL_MULTIPATH) {
 842                desc = sb->disks + rdev->desc_nr;
 843
 844                if (desc->state & (1<<MD_DISK_FAULTY))
 845                        set_bit(Faulty, &rdev->flags);
 846                else if (desc->state & (1<<MD_DISK_SYNC) /* &&
 847                            desc->raid_disk < mddev->raid_disks */) {
 848                        set_bit(In_sync, &rdev->flags);
 849                        rdev->raid_disk = desc->raid_disk;
 850                }
 851                if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
 852                        set_bit(WriteMostly, &rdev->flags);
 853        } else /* MULTIPATH are always insync */
 854                set_bit(In_sync, &rdev->flags);
 855        return 0;
 856}
 857
 858/*
 859 * sync_super for 0.90.0
 860 */
 861static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 862{
 863        mdp_super_t *sb;
 864        struct list_head *tmp;
 865        mdk_rdev_t *rdev2;
 866        int next_spare = mddev->raid_disks;
 867
 868
 869        /* make rdev->sb match mddev data..
 870         *
 871         * 1/ zero out disks
 872         * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
 873         * 3/ any empty disks < next_spare become removed
 874         *
 875         * disks[0] gets initialised to REMOVED because
 876         * we cannot be sure from other fields if it has
 877         * been initialised or not.
 878         */
 879        int i;
 880        int active=0, working=0,failed=0,spare=0,nr_disks=0;
 881
 882        rdev->sb_size = MD_SB_BYTES;
 883
 884        sb = (mdp_super_t*)page_address(rdev->sb_page);
 885
 886        memset(sb, 0, sizeof(*sb));
 887
 888        sb->md_magic = MD_SB_MAGIC;
 889        sb->major_version = mddev->major_version;
 890        sb->patch_version = mddev->patch_version;
 891        sb->gvalid_words  = 0; /* ignored */
 892        memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
 893        memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
 894        memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
 895        memcpy(&sb->set_uuid3, mddev->uuid+12,4);
 896
 897        sb->ctime = mddev->ctime;
 898        sb->level = mddev->level;
 899        sb->size  = mddev->size;
 900        sb->raid_disks = mddev->raid_disks;
 901        sb->md_minor = mddev->md_minor;
 902        sb->not_persistent = 0;
 903        sb->utime = mddev->utime;
 904        sb->state = 0;
 905        sb->events_hi = (mddev->events>>32);
 906        sb->events_lo = (u32)mddev->events;
 907
 908        if (mddev->reshape_position == MaxSector)
 909                sb->minor_version = 90;
 910        else {
 911                sb->minor_version = 91;
 912                sb->reshape_position = mddev->reshape_position;
 913                sb->new_level = mddev->new_level;
 914                sb->delta_disks = mddev->delta_disks;
 915                sb->new_layout = mddev->new_layout;
 916                sb->new_chunk = mddev->new_chunk;
 917        }
 918        mddev->minor_version = sb->minor_version;
 919        if (mddev->in_sync)
 920        {
 921                sb->recovery_cp = mddev->recovery_cp;
 922                sb->cp_events_hi = (mddev->events>>32);
 923                sb->cp_events_lo = (u32)mddev->events;
 924                if (mddev->recovery_cp == MaxSector)
 925                        sb->state = (1<< MD_SB_CLEAN);
 926        } else
 927                sb->recovery_cp = 0;
 928
 929        sb->layout = mddev->layout;
 930        sb->chunk_size = mddev->chunk_size;
 931
 932        if (mddev->bitmap && mddev->bitmap_file == NULL)
 933                sb->state |= (1<<MD_SB_BITMAP_PRESENT);
 934
 935        sb->disks[0].state = (1<<MD_DISK_REMOVED);
 936        rdev_for_each(rdev2, tmp, mddev) {
 937                mdp_disk_t *d;
 938                int desc_nr;
 939                if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
 940                    && !test_bit(Faulty, &rdev2->flags))
 941                        desc_nr = rdev2->raid_disk;
 942                else
 943                        desc_nr = next_spare++;
 944                rdev2->desc_nr = desc_nr;
 945                d = &sb->disks[rdev2->desc_nr];
 946                nr_disks++;
 947                d->number = rdev2->desc_nr;
 948                d->major = MAJOR(rdev2->bdev->bd_dev);
 949                d->minor = MINOR(rdev2->bdev->bd_dev);
 950                if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
 951                    && !test_bit(Faulty, &rdev2->flags))
 952                        d->raid_disk = rdev2->raid_disk;
 953                else
 954                        d->raid_disk = rdev2->desc_nr; /* compatibility */
 955                if (test_bit(Faulty, &rdev2->flags))
 956                        d->state = (1<<MD_DISK_FAULTY);
 957                else if (test_bit(In_sync, &rdev2->flags)) {
 958                        d->state = (1<<MD_DISK_ACTIVE);
 959                        d->state |= (1<<MD_DISK_SYNC);
 960                        active++;
 961                        working++;
 962                } else {
 963                        d->state = 0;
 964                        spare++;
 965                        working++;
 966                }
 967                if (test_bit(WriteMostly, &rdev2->flags))
 968                        d->state |= (1<<MD_DISK_WRITEMOSTLY);
 969        }
 970        /* now set the "removed" and "faulty" bits on any missing devices */
 971        for (i=0 ; i < mddev->raid_disks ; i++) {
 972                mdp_disk_t *d = &sb->disks[i];
 973                if (d->state == 0 && d->number == 0) {
 974                        d->number = i;
 975                        d->raid_disk = i;
 976                        d->state = (1<<MD_DISK_REMOVED);
 977                        d->state |= (1<<MD_DISK_FAULTY);
 978                        failed++;
 979                }
 980        }
 981        sb->nr_disks = nr_disks;
 982        sb->active_disks = active;
 983        sb->working_disks = working;
 984        sb->failed_disks = failed;
 985        sb->spare_disks = spare;
 986
 987        sb->this_disk = sb->disks[rdev->desc_nr];
 988        sb->sb_csum = calc_sb_csum(sb);
 989}
 990
 991/*
 992 * rdev_size_change for 0.90.0
 993 */
 994static unsigned long long
 995super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 996{
 997        if (num_sectors && num_sectors < rdev->mddev->size * 2)
 998                return 0; /* component must fit device */
 999        if (rdev->mddev->bitmap_offset)
1000                return 0; /* can't move bitmap */
1001        rdev->sb_start = calc_dev_sboffset(rdev->bdev);
1002        if (!num_sectors || num_sectors > rdev->sb_start)
1003                num_sectors = rdev->sb_start;
1004        md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1005                       rdev->sb_page);
1006        md_super_wait(rdev->mddev);
1007        return num_sectors / 2; /* kB for sysfs */
1008}
1009
1010
1011/*
1012 * version 1 superblock
1013 */
1014
1015static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1016{
1017        __le32 disk_csum;
1018        u32 csum;
1019        unsigned long long newcsum;
1020        int size = 256 + le32_to_cpu(sb->max_dev)*2;
1021        __le32 *isuper = (__le32*)sb;
1022        int i;
1023
1024        disk_csum = sb->sb_csum;
1025        sb->sb_csum = 0;
1026        newcsum = 0;
1027        for (i=0; size>=4; size -= 4 )
1028                newcsum += le32_to_cpu(*isuper++);
1029
1030        if (size == 2)
1031                newcsum += le16_to_cpu(*(__le16*) isuper);
1032
1033        csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1034        sb->sb_csum = disk_csum;
1035        return cpu_to_le32(csum);
1036}
1037
1038static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1039{
1040        struct mdp_superblock_1 *sb;
1041        int ret;
1042        sector_t sb_start;
1043        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1044        int bmask;
1045
1046        /*
1047         * Calculate the position of the superblock in 512byte sectors.
1048         * It is always aligned to a 4K boundary and
1049         * depeding on minor_version, it can be:
1050         * 0: At least 8K, but less than 12K, from end of device
1051         * 1: At start of device
1052         * 2: 4K from start of device.
1053         */
1054        switch(minor_version) {
1055        case 0:
1056                sb_start = rdev->bdev->bd_inode->i_size >> 9;
1057                sb_start -= 8*2;
1058                sb_start &= ~(sector_t)(4*2-1);
1059                break;
1060        case 1:
1061                sb_start = 0;
1062                break;
1063        case 2:
1064                sb_start = 8;
1065                break;
1066        default:
1067                return -EINVAL;
1068        }
1069        rdev->sb_start = sb_start;
1070
1071        /* superblock is rarely larger than 1K, but it can be larger,
1072         * and it is safe to read 4k, so we do that
1073         */
1074        ret = read_disk_sb(rdev, 4096);
1075        if (ret) return ret;
1076
1077
1078        sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1079
1080        if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1081            sb->major_version != cpu_to_le32(1) ||
1082            le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1083            le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1084            (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1085                return -EINVAL;
1086
1087        if (calc_sb_1_csum(sb) != sb->sb_csum) {
1088                printk("md: invalid superblock checksum on %s\n",
1089                        bdevname(rdev->bdev,b));
1090                return -EINVAL;
1091        }
1092        if (le64_to_cpu(sb->data_size) < 10) {
1093                printk("md: data_size too small on %s\n",
1094                       bdevname(rdev->bdev,b));
1095                return -EINVAL;
1096        }
1097        if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) {
1098                if (sb->level != cpu_to_le32(1) &&
1099                    sb->level != cpu_to_le32(4) &&
1100                    sb->level != cpu_to_le32(5) &&
1101                    sb->level != cpu_to_le32(6) &&
1102                    sb->level != cpu_to_le32(10)) {
1103                        printk(KERN_WARNING
1104                               "md: bitmaps not supported for this level.\n");
1105                        return -EINVAL;
1106                }
1107        }
1108
1109        rdev->preferred_minor = 0xffff;
1110        rdev->data_offset = le64_to_cpu(sb->data_offset);
1111        atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1112
1113        rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1114        bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
1115        if (rdev->sb_size & bmask)
1116                rdev->sb_size = (rdev->sb_size | bmask) + 1;
1117
1118        if (minor_version
1119            && rdev->data_offset < sb_start + (rdev->sb_size/512))
1120                return -EINVAL;
1121
1122        if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1123                rdev->desc_nr = -1;
1124        else
1125                rdev->desc_nr = le32_to_cpu(sb->dev_number);
1126
1127        if (!refdev) {
1128                ret = 1;
1129        } else {
1130                __u64 ev1, ev2;
1131                struct mdp_superblock_1 *refsb = 
1132                        (struct mdp_superblock_1*)page_address(refdev->sb_page);
1133
1134                if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1135                    sb->level != refsb->level ||
1136                    sb->layout != refsb->layout ||
1137                    sb->chunksize != refsb->chunksize) {
1138                        printk(KERN_WARNING "md: %s has strangely different"
1139                                " superblock to %s\n",
1140                                bdevname(rdev->bdev,b),
1141                                bdevname(refdev->bdev,b2));
1142                        return -EINVAL;
1143                }
1144                ev1 = le64_to_cpu(sb->events);
1145                ev2 = le64_to_cpu(refsb->events);
1146
1147                if (ev1 > ev2)
1148                        ret = 1;
1149                else
1150                        ret = 0;
1151        }
1152        if (minor_version)
1153                rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
1154        else
1155                rdev->size = rdev->sb_start / 2;
1156        if (rdev->size < le64_to_cpu(sb->data_size)/2)
1157                return -EINVAL;
1158        rdev->size = le64_to_cpu(sb->data_size)/2;
1159        if (le32_to_cpu(sb->chunksize))
1160                rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
1161
1162        if (le64_to_cpu(sb->size) > rdev->size*2)
1163                return -EINVAL;
1164        return ret;
1165}
1166
1167static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1168{
1169        struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1170        __u64 ev1 = le64_to_cpu(sb->events);
1171
1172        rdev->raid_disk = -1;
1173        clear_bit(Faulty, &rdev->flags);
1174        clear_bit(In_sync, &rdev->flags);
1175        clear_bit(WriteMostly, &rdev->flags);
1176        clear_bit(BarriersNotsupp, &rdev->flags);
1177
1178        if (mddev->raid_disks == 0) {
1179                mddev->major_version = 1;
1180                mddev->patch_version = 0;
1181                mddev->external = 0;
1182                mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
1183                mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1184                mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1185                mddev->level = le32_to_cpu(sb->level);
1186                mddev->clevel[0] = 0;
1187                mddev->layout = le32_to_cpu(sb->layout);
1188                mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1189                mddev->size = le64_to_cpu(sb->size)/2;
1190                mddev->events = ev1;
1191                mddev->bitmap_offset = 0;
1192                mddev->default_bitmap_offset = 1024 >> 9;
1193                
1194                mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1195                memcpy(mddev->uuid, sb->set_uuid, 16);
1196
1197                mddev->max_disks =  (4096-256)/2;
1198
1199                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1200                    mddev->bitmap_file == NULL )
1201                        mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
1202
1203                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1204                        mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1205                        mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1206                        mddev->new_level = le32_to_cpu(sb->new_level);
1207                        mddev->new_layout = le32_to_cpu(sb->new_layout);
1208                        mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
1209                } else {
1210                        mddev->reshape_position = MaxSector;
1211                        mddev->delta_disks = 0;
1212                        mddev->new_level = mddev->level;
1213                        mddev->new_layout = mddev->layout;
1214                        mddev->new_chunk = mddev->chunk_size;
1215                }
1216
1217        } else if (mddev->pers == NULL) {
1218                /* Insist of good event counter while assembling */
1219                ++ev1;
1220                if (ev1 < mddev->events)
1221                        return -EINVAL;
1222        } else if (mddev->bitmap) {
1223                /* If adding to array with a bitmap, then we can accept an
1224                 * older device, but not too old.
1225                 */
1226                if (ev1 < mddev->bitmap->events_cleared)
1227                        return 0;
1228        } else {
1229                if (ev1 < mddev->events)
1230                        /* just a hot-add of a new device, leave raid_disk at -1 */
1231                        return 0;
1232        }
1233        if (mddev->level != LEVEL_MULTIPATH) {
1234                int role;
1235                role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1236                switch(role) {
1237                case 0xffff: /* spare */
1238                        break;
1239                case 0xfffe: /* faulty */
1240                        set_bit(Faulty, &rdev->flags);
1241                        break;
1242                default:
1243                        if ((le32_to_cpu(sb->feature_map) &
1244                             MD_FEATURE_RECOVERY_OFFSET))
1245                                rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1246                        else
1247                                set_bit(In_sync, &rdev->flags);
1248                        rdev->raid_disk = role;
1249                        break;
1250                }
1251                if (sb->devflags & WriteMostly1)
1252                        set_bit(WriteMostly, &rdev->flags);
1253        } else /* MULTIPATH are always insync */
1254                set_bit(In_sync, &rdev->flags);
1255
1256        return 0;
1257}
1258
1259static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1260{
1261        struct mdp_superblock_1 *sb;
1262        struct list_head *tmp;
1263        mdk_rdev_t *rdev2;
1264        int max_dev, i;
1265        /* make rdev->sb match mddev and rdev data. */
1266
1267        sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1268
1269        sb->feature_map = 0;
1270        sb->pad0 = 0;
1271        sb->recovery_offset = cpu_to_le64(0);
1272        memset(sb->pad1, 0, sizeof(sb->pad1));
1273        memset(sb->pad2, 0, sizeof(sb->pad2));
1274        memset(sb->pad3, 0, sizeof(sb->pad3));
1275
1276        sb->utime = cpu_to_le64((__u64)mddev->utime);
1277        sb->events = cpu_to_le64(mddev->events);
1278        if (mddev->in_sync)
1279                sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1280        else
1281                sb->resync_offset = cpu_to_le64(0);
1282
1283        sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1284
1285        sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1286        sb->size = cpu_to_le64(mddev->size<<1);
1287
1288        if (mddev->bitmap && mddev->bitmap_file == NULL) {
1289                sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1290                sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1291        }
1292
1293        if (rdev->raid_disk >= 0 &&
1294            !test_bit(In_sync, &rdev->flags) &&
1295            rdev->recovery_offset > 0) {
1296                sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1297                sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
1298        }
1299
1300        if (mddev->reshape_position != MaxSector) {
1301                sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1302                sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1303                sb->new_layout = cpu_to_le32(mddev->new_layout);
1304                sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1305                sb->new_level = cpu_to_le32(mddev->new_level);
1306                sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
1307        }
1308
1309        max_dev = 0;
1310        rdev_for_each(rdev2, tmp, mddev)
1311                if (rdev2->desc_nr+1 > max_dev)
1312                        max_dev = rdev2->desc_nr+1;
1313
1314        if (max_dev > le32_to_cpu(sb->max_dev))
1315                sb->max_dev = cpu_to_le32(max_dev);
1316        for (i=0; i<max_dev;i++)
1317                sb->dev_roles[i] = cpu_to_le16(0xfffe);
1318        
1319        rdev_for_each(rdev2, tmp, mddev) {
1320                i = rdev2->desc_nr;
1321                if (test_bit(Faulty, &rdev2->flags))
1322                        sb->dev_roles[i] = cpu_to_le16(0xfffe);
1323                else if (test_bit(In_sync, &rdev2->flags))
1324                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1325                else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
1326                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1327                else
1328                        sb->dev_roles[i] = cpu_to_le16(0xffff);
1329        }
1330
1331        sb->sb_csum = calc_sb_1_csum(sb);
1332}
1333
1334static unsigned long long
1335super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1336{
1337        struct mdp_superblock_1 *sb;
1338        sector_t max_sectors;
1339        if (num_sectors && num_sectors < rdev->mddev->size * 2)
1340                return 0; /* component must fit device */
1341        if (rdev->sb_start < rdev->data_offset) {
1342                /* minor versions 1 and 2; superblock before data */
1343                max_sectors = rdev->bdev->bd_inode->i_size >> 9;
1344                max_sectors -= rdev->data_offset;
1345                if (!num_sectors || num_sectors > max_sectors)
1346                        num_sectors = max_sectors;
1347        } else if (rdev->mddev->bitmap_offset) {
1348                /* minor version 0 with bitmap we can't move */
1349                return 0;
1350        } else {
1351                /* minor version 0; superblock after data */
1352                sector_t sb_start;
1353                sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
1354                sb_start &= ~(sector_t)(4*2 - 1);
1355                max_sectors = rdev->size * 2 + sb_start - rdev->sb_start;
1356                if (!num_sectors || num_sectors > max_sectors)
1357                        num_sectors = max_sectors;
1358                rdev->sb_start = sb_start;
1359        }
1360        sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
1361        sb->data_size = cpu_to_le64(num_sectors);
1362        sb->super_offset = rdev->sb_start;
1363        sb->sb_csum = calc_sb_1_csum(sb);
1364        md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1365                       rdev->sb_page);
1366        md_super_wait(rdev->mddev);
1367        return num_sectors / 2; /* kB for sysfs */
1368}
1369
1370static struct super_type super_types[] = {
1371        [0] = {
1372                .name   = "0.90.0",
1373                .owner  = THIS_MODULE,
1374                .load_super         = super_90_load,
1375                .validate_super     = super_90_validate,
1376                .sync_super         = super_90_sync,
1377                .rdev_size_change   = super_90_rdev_size_change,
1378        },
1379        [1] = {
1380                .name   = "md-1",
1381                .owner  = THIS_MODULE,
1382                .load_super         = super_1_load,
1383                .validate_super     = super_1_validate,
1384                .sync_super         = super_1_sync,
1385                .rdev_size_change   = super_1_rdev_size_change,
1386        },
1387};
1388
1389static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1390{
1391        mdk_rdev_t *rdev, *rdev2;
1392
1393        rcu_read_lock();
1394        rdev_for_each_rcu(rdev, mddev1)
1395                rdev_for_each_rcu(rdev2, mddev2)
1396                        if (rdev->bdev->bd_contains ==
1397                            rdev2->bdev->bd_contains) {
1398                                rcu_read_unlock();
1399                                return 1;
1400                        }
1401        rcu_read_unlock();
1402        return 0;
1403}
1404
/* NOTE(review): presumably the list of devices waiting to be assembled
 * into arrays; its users are outside this part of the file -- confirm.
 */
static LIST_HEAD(pending_raid_disks);
1406
1407static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1408{
1409        char b[BDEVNAME_SIZE];
1410        struct kobject *ko;
1411        char *s;
1412        int err;
1413
1414        if (rdev->mddev) {
1415                MD_BUG();
1416                return -EINVAL;
1417        }
1418
1419        /* prevent duplicates */
1420        if (find_rdev(mddev, rdev->bdev->bd_dev))
1421                return -EEXIST;
1422
1423        /* make sure rdev->size exceeds mddev->size */
1424        if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
1425                if (mddev->pers) {
1426                        /* Cannot change size, so fail
1427                         * If mddev->level <= 0, then we don't care
1428                         * about aligning sizes (e.g. linear)
1429                         */
1430                        if (mddev->level > 0)
1431                                return -ENOSPC;
1432                } else
1433                        mddev->size = rdev->size;
1434        }
1435
1436        /* Verify rdev->desc_nr is unique.
1437         * If it is -1, assign a free number, else
1438         * check number is not in use
1439         */
1440        if (rdev->desc_nr < 0) {
1441                int choice = 0;
1442                if (mddev->pers) choice = mddev->raid_disks;
1443                while (find_rdev_nr(mddev, choice))
1444                        choice++;
1445                rdev->desc_nr = choice;
1446        } else {
1447                if (find_rdev_nr(mddev, rdev->desc_nr))
1448                        return -EBUSY;
1449        }
1450        bdevname(rdev->bdev,b);
1451        while ( (s=strchr(b, '/')) != NULL)
1452                *s = '!';
1453
1454        rdev->mddev = mddev;
1455        printk(KERN_INFO "md: bind<%s>\n", b);
1456
1457        if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
1458                goto fail;
1459
1460        ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
1461        if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
1462                kobject_del(&rdev->kobj);
1463                goto fail;
1464        }
1465        rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state");
1466
1467        list_add_rcu(&rdev->same_set, &mddev->disks);
1468        bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
1469        return 0;
1470
1471 fail:
1472        printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
1473               b, mdname(mddev));
1474        return err;
1475}
1476
1477static void md_delayed_delete(struct work_struct *ws)
1478{
1479        mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
1480        kobject_del(&rdev->kobj);
1481        kobject_put(&rdev->kobj);
1482}
1483
1484static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1485{
1486        char b[BDEVNAME_SIZE];
1487        if (!rdev->mddev) {
1488                MD_BUG();
1489                return;
1490        }
1491        bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1492        list_del_rcu(&rdev->same_set);
1493        printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1494        rdev->mddev = NULL;
1495        sysfs_remove_link(&rdev->kobj, "block");
1496        sysfs_put(rdev->sysfs_state);
1497        rdev->sysfs_state = NULL;
1498        /* We need to delay this, otherwise we can deadlock when
1499         * writing to 'remove' to "dev/state".  We also need
1500         * to delay it due to rcu usage.
1501         */
1502        synchronize_rcu();
1503        INIT_WORK(&rdev->del_work, md_delayed_delete);
1504        kobject_get(&rdev->kobj);
1505        schedule_work(&rdev->del_work);
1506}
1507
1508/*
1509 * prevent the device from being mounted, repartitioned or
1510 * otherwise reused by a RAID array (or any other kernel
1511 * subsystem), by bd_claiming the device.
1512 */
1513static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
1514{
1515        int err = 0;
1516        struct block_device *bdev;
1517        char b[BDEVNAME_SIZE];
1518
1519        bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1520        if (IS_ERR(bdev)) {
1521                printk(KERN_ERR "md: could not open %s.\n",
1522                        __bdevname(dev, b));
1523                return PTR_ERR(bdev);
1524        }
1525        err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
1526        if (err) {
1527                printk(KERN_ERR "md: could not bd_claim %s.\n",
1528                        bdevname(bdev, b));
1529                blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1530                return err;
1531        }
1532        if (!shared)
1533                set_bit(AllReserved, &rdev->flags);
1534        rdev->bdev = bdev;
1535        return err;
1536}
1537
1538static void unlock_rdev(mdk_rdev_t *rdev)
1539{
1540        struct block_device *bdev = rdev->bdev;
1541        rdev->bdev = NULL;
1542        if (!bdev)
1543                MD_BUG();
1544        bd_release(bdev);
1545        blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1546}
1547
/* Forward declaration: export_rdev() below calls this, in non-modular
 * builds, for devices that carry the AutoDetected flag.
 */
void md_autodetect_dev(dev_t dev);
1549
1550static void export_rdev(mdk_rdev_t * rdev)
1551{
1552        char b[BDEVNAME_SIZE];
1553        printk(KERN_INFO "md: export_rdev(%s)\n",
1554                bdevname(rdev->bdev,b));
1555        if (rdev->mddev)
1556                MD_BUG();
1557        free_disk_sb(rdev);
1558#ifndef MODULE
1559        if (test_bit(AutoDetected, &rdev->flags))
1560                md_autodetect_dev(rdev->bdev->bd_dev);
1561#endif
1562        unlock_rdev(rdev);
1563        kobject_put(&rdev->kobj);
1564}
1565
1566static void kick_rdev_from_array(mdk_rdev_t * rdev)
1567{
1568        unbind_rdev_from_array(rdev);
1569        export_rdev(rdev);
1570}
1571
1572static void export_array(mddev_t *mddev)
1573{
1574        struct list_head *tmp;
1575        mdk_rdev_t *rdev;
1576
1577        rdev_for_each(rdev, tmp, mddev) {
1578                if (!rdev->mddev) {
1579                        MD_BUG();
1580                        continue;
1581                }
1582                kick_rdev_from_array(rdev);
1583        }
1584        if (!list_empty(&mddev->disks))
1585                MD_BUG();
1586        mddev->raid_disks = 0;
1587        mddev->major_version = 0;
1588}
1589
1590static void print_desc(mdp_disk_t *desc)
1591{
1592        printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
1593                desc->major,desc->minor,desc->raid_disk,desc->state);
1594}
1595
1596static void print_sb(mdp_super_t *sb)
1597{
1598        int i;
1599
1600        printk(KERN_INFO 
1601                "md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1602                sb->major_version, sb->minor_version, sb->patch_version,
1603                sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1604                sb->ctime);
1605        printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1606                sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1607                sb->md_minor, sb->layout, sb->chunk_size);
1608        printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
1609                " FD:%d SD:%d CSUM:%08x E:%08lx\n",
1610                sb->utime, sb->state, sb->active_disks, sb->working_disks,
1611                sb->failed_disks, sb->spare_disks,
1612                sb->sb_csum, (unsigned long)sb->events_lo);
1613
1614        printk(KERN_INFO);
1615        for (i = 0; i < MD_SB_DISKS; i++) {
1616                mdp_disk_t *desc;
1617
1618                desc = sb->disks + i;
1619                if (desc->number || desc->major || desc->minor ||
1620                    desc->raid_disk || (desc->state && (desc->state != 4))) {
1621                        printk("     D %2d: ", i);
1622                        print_desc(desc);
1623                }
1624        }
1625        printk(KERN_INFO "md:     THIS: ");
1626        print_desc(&sb->this_disk);
1627
1628}
1629
1630static void print_rdev(mdk_rdev_t *rdev)
1631{
1632        char b[BDEVNAME_SIZE];
1633        printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
1634                bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
1635                test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1636                rdev->desc_nr);
1637        if (rdev->sb_loaded) {
1638                printk(KERN_INFO "md: rdev superblock:\n");
1639                print_sb((mdp_super_t*)page_address(rdev->sb_page));
1640        } else
1641                printk(KERN_INFO "md: no rdev superblock!\n");
1642}
1643
1644static void md_print_devices(void)
1645{
1646        struct list_head *tmp, *tmp2;
1647        mdk_rdev_t *rdev;
1648        mddev_t *mddev;
1649        char b[BDEVNAME_SIZE];
1650
1651        printk("\n");
1652        printk("md:     **********************************\n");
1653        printk("md:     * <COMPLETE RAID STATE PRINTOUT> *\n");
1654        printk("md:     **********************************\n");
1655        for_each_mddev(mddev, tmp) {
1656
1657                if (mddev->bitmap)
1658                        bitmap_print_sb(mddev->bitmap);
1659                else
1660                        printk("%s: ", mdname(mddev));
1661                rdev_for_each(rdev, tmp2, mddev)
1662                        printk("<%s>", bdevname(rdev->bdev,b));
1663                printk("\n");
1664
1665                rdev_for_each(rdev, tmp2, mddev)
1666                        print_rdev(rdev);
1667        }
1668        printk("md:     **********************************\n");
1669        printk("\n");
1670}
1671
1672
1673static void sync_sbs(mddev_t * mddev, int nospares)
1674{
1675        /* Update each superblock (in-memory image), but
1676         * if we are allowed to, skip spares which already
1677         * have the right event counter, or have one earlier
1678         * (which would mean they aren't being marked as dirty
1679         * with the rest of the array)
1680         */
1681        mdk_rdev_t *rdev;
1682        struct list_head *tmp;
1683
1684        rdev_for_each(rdev, tmp, mddev) {
1685                if (rdev->sb_events == mddev->events ||
1686                    (nospares &&
1687                     rdev->raid_disk < 0 &&
1688                     (rdev->sb_events&1)==0 &&
1689                     rdev->sb_events+1 == mddev->events)) {
1690                        /* Don't update this superblock */
1691                        rdev->sb_loaded = 2;
1692                } else {
1693                        super_types[mddev->major_version].
1694                                sync_super(mddev, rdev);
1695                        rdev->sb_loaded = 1;
1696                }
1697        }
1698}
1699
1700static void md_update_sb(mddev_t * mddev, int force_change)
1701{
1702        struct list_head *tmp;
1703        mdk_rdev_t *rdev;
1704        int sync_req;
1705        int nospares = 0;
1706
1707        if (mddev->external)
1708                return;
1709repeat:
1710        spin_lock_irq(&mddev->write_lock);
1711
1712        set_bit(MD_CHANGE_PENDING, &mddev->flags);
1713        if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
1714                force_change = 1;
1715        if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
1716                /* just a clean<-> dirty transition, possibly leave spares alone,
1717                 * though if events isn't the right even/odd, we will have to do
1718                 * spares after all
1719                 */
1720                nospares = 1;
1721        if (force_change)
1722                nospares = 0;
1723        if (mddev->degraded)
1724                /* If the array is degraded, then skipping spares is both
1725                 * dangerous and fairly pointless.
1726                 * Dangerous because a device that was removed from the array
1727                 * might have a event_count that still looks up-to-date,
1728                 * so it can be re-added without a resync.
1729                 * Pointless because if there are any spares to skip,
1730                 * then a recovery will happen and soon that array won't
1731                 * be degraded any more and the spare can go back to sleep then.
1732                 */
1733                nospares = 0;
1734
1735        sync_req = mddev->in_sync;
1736        mddev->utime = get_seconds();
1737
1738        /* If this is just a dirty<->clean transition, and the array is clean
1739         * and 'events' is odd, we can roll back to the previous clean state */
1740        if (nospares
1741            && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1742            && (mddev->events & 1)
1743            && mddev->events != 1)
1744                mddev->events--;
1745        else {
1746                /* otherwise we have to go forward and ... */
1747                mddev->events ++;
1748                if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1749                        /* .. if the array isn't clean, insist on an odd 'events' */
1750                        if ((mddev->events&1)==0) {
1751                                mddev->events++;
1752                                nospares = 0;
1753                        }
1754                } else {
1755                        /* otherwise insist on an even 'events' (for clean states) */
1756                        if ((mddev->events&1)) {
1757                                mddev->events++;
1758                                nospares = 0;
1759                        }
1760                }
1761        }
1762
1763        if (!mddev->events) {
1764                /*
1765                 * oops, this 64-bit counter should never wrap.
1766                 * Either we are in around ~1 trillion A.C., assuming
1767                 * 1 reboot per second, or we have a bug:
1768                 */
1769                MD_BUG();
1770                mddev->events --;
1771        }
1772
1773        /*
1774         * do not write anything to disk if using
1775         * nonpersistent superblocks
1776         */
1777        if (!mddev->persistent) {
1778                if (!mddev->external)
1779                        clear_bit(MD_CHANGE_PENDING, &mddev->flags);
1780
1781                spin_unlock_irq(&mddev->write_lock);
1782                wake_up(&mddev->sb_wait);
1783                return;
1784        }
1785        sync_sbs(mddev, nospares);
1786        spin_unlock_irq(&mddev->write_lock);
1787
1788        dprintk(KERN_INFO 
1789                "md: updating %s RAID superblock on device (in sync %d)\n",
1790                mdname(mddev),mddev->in_sync);
1791
1792        bitmap_update_sb(mddev->bitmap);
1793        rdev_for_each(rdev, tmp, mddev) {
1794                char b[BDEVNAME_SIZE];
1795                dprintk(KERN_INFO "md: ");
1796                if (rdev->sb_loaded != 1)
1797                        continue; /* no noise on spare devices */
1798                if (test_bit(Faulty, &rdev->flags))
1799                        dprintk("(skipping faulty ");
1800
1801                dprintk("%s ", bdevname(rdev->bdev,b));
1802                if (!test_bit(Faulty, &rdev->flags)) {
1803                        md_super_write(mddev,rdev,
1804                                       rdev->sb_start, rdev->sb_size,
1805                                       rdev->sb_page);
1806                        dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1807                                bdevname(rdev->bdev,b),
1808                                (unsigned long long)rdev->sb_start);
1809                        rdev->sb_events = mddev->events;
1810
1811                } else
1812                        dprintk(")\n");
1813                if (mddev->level == LEVEL_MULTIPATH)
1814                        /* only need to write one superblock... */
1815                        break;
1816        }
1817        md_super_wait(mddev);
1818        /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
1819
1820        spin_lock_irq(&mddev->write_lock);
1821        if (mddev->in_sync != sync_req ||
1822            test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
1823                /* have to write it out again */
1824                spin_unlock_irq(&mddev->write_lock);
1825                goto repeat;
1826        }
1827        clear_bit(MD_CHANGE_PENDING, &mddev->flags);
1828        spin_unlock_irq(&mddev->write_lock);
1829        wake_up(&mddev->sb_wait);
1830
1831}
1832
/* words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case. For this we use cmd_match.
 */
/*
 * Compare @cmd, as written into a sysfs file, against the keyword @str.
 *
 * Returns 1 when the two strings are identical, or when @cmd is @str
 * followed by exactly one trailing newline; returns 0 otherwise.
 */
static int cmd_match(const char *cmd, const char *str)
{
        /* skip over the common prefix */
        for (; *cmd && *str && *cmd == *str; cmd++, str++)
                ;

        /* a single newline at this point is tolerated */
        if (*cmd == '\n')
                cmd++;

        /* both strings must be fully consumed for a match */
        return !*str && !*cmd;
}
1852
1853struct rdev_sysfs_entry {
1854        struct attribute attr;
1855        ssize_t (*show)(mdk_rdev_t *, char *);
1856        ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
1857};
1858
1859static ssize_t
1860state_show(mdk_rdev_t *rdev, char *page)
1861{
1862        char *sep = "";
1863        size_t len = 0;
1864
1865        if (test_bit(Faulty, &rdev->flags)) {
1866                len+= sprintf(page+len, "%sfaulty",sep);
1867                sep = ",";
1868        }
1869        if (test_bit(In_sync, &rdev->flags)) {
1870                len += sprintf(page+len, "%sin_sync",sep);
1871                sep = ",";
1872        }
1873        if (test_bit(WriteMostly, &rdev->flags)) {
1874                len += sprintf(page+len, "%swrite_mostly",sep);
1875                sep = ",";
1876        }
1877        if (test_bit(Blocked, &rdev->flags)) {
1878                len += sprintf(page+len, "%sblocked", sep);
1879                sep = ",";
1880        }
1881        if (!test_bit(Faulty, &rdev->flags) &&
1882            !test_bit(In_sync, &rdev->flags)) {
1883                len += sprintf(page+len, "%sspare", sep);
1884                sep = ",";
1885        }
1886        return len+sprintf(page+len, "\n");
1887}
1888
1889static ssize_t
1890state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1891{
1892        /* can write
1893         *  faulty  - simulates and error
1894         *  remove  - disconnects the device
1895         *  writemostly - sets write_mostly
1896         *  -writemostly - clears write_mostly
1897         *  blocked - sets the Blocked flag
1898         *  -blocked - clears the Blocked flag
1899         */
1900        int err = -EINVAL;
1901        if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
1902                md_error(rdev->mddev, rdev);
1903                err = 0;
1904        } else if (cmd_match(buf, "remove")) {
1905                if (rdev->raid_disk >= 0)
1906                        err = -EBUSY;
1907                else {
1908                        mddev_t *mddev = rdev->mddev;
1909                        kick_rdev_from_array(rdev);
1910                        if (mddev->pers)
1911                                md_update_sb(mddev, 1);
1912                        md_new_event(mddev);
1913                        err = 0;
1914                }
1915        } else if (cmd_match(buf, "writemostly")) {
1916                set_bit(WriteMostly, &rdev->flags);
1917                err = 0;
1918        } else if (cmd_match(buf, "-writemostly")) {
1919                clear_bit(WriteMostly, &rdev->flags);
1920                err = 0;
1921        } else if (cmd_match(buf, "blocked")) {
1922                set_bit(Blocked, &rdev->flags);
1923                err = 0;
1924        } else if (cmd_match(buf, "-blocked")) {
1925                clear_bit(Blocked, &rdev->flags);
1926                wake_up(&rdev->blocked_wait);
1927                set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
1928                md_wakeup_thread(rdev->mddev->thread);
1929
1930                err = 0;
1931        }
1932        if (!err && rdev->sysfs_state)
1933                sysfs_notify_dirent(rdev->sysfs_state);
1934        return err ? err : len;
1935}
1936static struct rdev_sysfs_entry rdev_state =
1937__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
1938
1939static ssize_t
1940errors_show(mdk_rdev_t *rdev, char *page)
1941{
1942        return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
1943}
1944
1945static ssize_t
1946errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1947{
1948        char *e;
1949        unsigned long n = simple_strtoul(buf, &e, 10);
1950        if (*buf && (*e == 0 || *e == '\n')) {
1951                atomic_set(&rdev->corrected_errors, n);
1952                return len;
1953        }
1954        return -EINVAL;
1955}
1956static struct rdev_sysfs_entry rdev_errors =
1957__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
1958
1959static ssize_t
1960slot_show(mdk_rdev_t *rdev, char *page)
1961{
1962        if (rdev->raid_disk < 0)
1963                return sprintf(page, "none\n");
1964        else
1965                return sprintf(page, "%d\n", rdev->raid_disk);
1966}
1967
1968static ssize_t
1969slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1970{
1971        char *e;
1972        int err;
1973        char nm[20];
1974        int slot = simple_strtoul(buf, &e, 10);
1975        if (strncmp(buf, "none", 4)==0)
1976                slot = -1;
1977        else if (e==buf || (*e && *e!= '\n'))
1978                return -EINVAL;
1979        if (rdev->mddev->pers && slot == -1) {
1980                /* Setting 'slot' on an active array requires also
1981                 * updating the 'rd%d' link, and communicating
1982                 * with the personality with ->hot_*_disk.
1983                 * For now we only support removing
1984                 * failed/spare devices.  This normally happens automatically,
1985                 * but not when the metadata is externally managed.
1986                 */
1987                if (rdev->raid_disk == -1)
1988                        return -EEXIST;
1989                /* personality does all needed checks */
1990                if (rdev->mddev->pers->hot_add_disk == NULL)
1991                        return -EINVAL;
1992                err = rdev->mddev->pers->
1993                        hot_remove_disk(rdev->mddev, rdev->raid_disk);
1994                if (err)
1995                        return err;
1996                sprintf(nm, "rd%d", rdev->raid_disk);
1997                sysfs_remove_link(&rdev->mddev->kobj, nm);
1998                set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
1999                md_wakeup_thread(rdev->mddev->thread);
2000        } else if (rdev->mddev->pers) {
2001                mdk_rdev_t *rdev2;
2002                struct list_head *tmp;
2003                /* Activating a spare .. or possibly reactivating
2004                 * if we every get bitmaps working here.
2005                 */
2006
2007                if (rdev->raid_disk != -1)
2008                        return -EBUSY;
2009
2010                if (rdev->mddev->pers->hot_add_disk == NULL)
2011                        return -EINVAL;
2012
2013                rdev_for_each(rdev2, tmp, rdev->mddev)
2014                        if (rdev2->raid_disk == slot)
2015                                return -EEXIST;
2016
2017                rdev->raid_disk = slot;
2018                if (test_bit(In_sync, &rdev->flags))
2019                        rdev->saved_raid_disk = slot;
2020                else
2021                        rdev->saved_raid_disk = -1;
2022                err = rdev->mddev->pers->
2023                        hot_add_disk(rdev->mddev, rdev);
2024                if (err) {
2025                        rdev->raid_disk = -1;
2026                        return err;
2027                } else
2028                        sysfs_notify_dirent(rdev->sysfs_state);
2029                sprintf(nm, "rd%d", rdev->raid_disk);
2030                if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
2031                        printk(KERN_WARNING
2032                               "md: cannot register "
2033                               "%s for %s\n",
2034                               nm, mdname(rdev->mddev));
2035
2036                /* don't wakeup anyone, leave that to userspace. */
2037        } else {
2038                if (slot >= rdev->mddev->raid_disks)
2039                        return -ENOSPC;
2040                rdev->raid_disk = slot;
2041                /* assume it is working */
2042                clear_bit(Faulty, &rdev->flags);
2043                clear_bit(WriteMostly, &rdev->flags);
2044                set_bit(In_sync, &rdev->flags);
2045                sysfs_notify_dirent(rdev->sysfs_state);
2046        }
2047        return len;
2048}
2049
2050
2051static struct rdev_sysfs_entry rdev_slot =
2052__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
2053
2054static ssize_t
2055offset_show(mdk_rdev_t *rdev, char *page)
2056{
2057        return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2058}
2059
2060static ssize_t
2061offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2062{
2063        char *e;
2064        unsigned long long offset = simple_strtoull(buf, &e, 10);
2065        if (e==buf || (*e && *e != '\n'))
2066                return -EINVAL;
2067        if (rdev->mddev->pers && rdev->raid_disk >= 0)
2068                return -EBUSY;
2069        if (rdev->size && rdev->mddev->external)
2070                /* Must set offset before size, so overlap checks
2071                 * can be sane */
2072                return -EBUSY;
2073        rdev->data_offset = offset;
2074        return len;
2075}
2076
2077static struct rdev_sysfs_entry rdev_offset =
2078__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2079
2080static ssize_t
2081rdev_size_show(mdk_rdev_t *rdev, char *page)
2082{
2083        return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
2084}
2085
2086static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2087{
2088        /* check if two start/length pairs overlap */
2089        if (s1+l1 <= s2)
2090                return 0;
2091        if (s2+l2 <= s1)
2092                return 0;
2093        return 1;
2094}
2095
2096static ssize_t
2097rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2098{
2099        unsigned long long size;
2100        unsigned long long oldsize = rdev->size;
2101        mddev_t *my_mddev = rdev->mddev;
2102
2103        if (strict_strtoull(buf, 10, &size) < 0)
2104                return -EINVAL;
2105        if (my_mddev->pers && rdev->raid_disk >= 0) {
2106                if (my_mddev->persistent) {
2107                        size = super_types[my_mddev->major_version].
2108                                rdev_size_change(rdev, size * 2);
2109                        if (!size)
2110                                return -EBUSY;
2111                } else if (!size) {
2112                        size = (rdev->bdev->bd_inode->i_size >> 10);
2113                        size -= rdev->data_offset/2;
2114                }
2115        }
2116        if (size < my_mddev->size)
2117                return -EINVAL; /* component must fit device */
2118
2119        rdev->size = size;
2120        if (size > oldsize && my_mddev->external) {
2121                /* need to check that all other rdevs with the same ->bdev
2122                 * do not overlap.  We need to unlock the mddev to avoid
2123                 * a deadlock.  We have already changed rdev->size, and if
2124                 * we have to change it back, we will have the lock again.
2125                 */
2126                mddev_t *mddev;
2127                int overlap = 0;
2128                struct list_head *tmp, *tmp2;
2129
2130                mddev_unlock(my_mddev);
2131                for_each_mddev(mddev, tmp) {
2132                        mdk_rdev_t *rdev2;
2133
2134                        mddev_lock(mddev);
2135                        rdev_for_each(rdev2, tmp2, mddev)
2136                                if (test_bit(AllReserved, &rdev2->flags) ||
2137                                    (rdev->bdev == rdev2->bdev &&
2138                                     rdev != rdev2 &&
2139                                     overlaps(rdev->data_offset, rdev->size * 2,
2140                                              rdev2->data_offset,
2141                                              rdev2->size * 2))) {
2142                                        overlap = 1;
2143                                        break;
2144                                }
2145                        mddev_unlock(mddev);
2146                        if (overlap) {
2147                                mddev_put(mddev);
2148                                break;
2149                        }
2150                }
2151                mddev_lock(my_mddev);
2152                if (overlap) {
2153                        /* Someone else could have slipped in a size
2154                         * change here, but doing so is just silly.
2155                         * We put oldsize back because we *know* it is
2156                         * safe, and trust userspace not to race with
2157                         * itself
2158                         */
2159                        rdev->size = oldsize;
2160                        return -EBUSY;
2161                }
2162        }
2163        return len;
2164}
2165
2166static struct rdev_sysfs_entry rdev_size =
2167__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
2168
2169static struct attribute *rdev_default_attrs[] = {
2170        &rdev_state.attr,
2171        &rdev_errors.attr,
2172        &rdev_slot.attr,
2173        &rdev_offset.attr,
2174        &rdev_size.attr,
2175        NULL,
2176};
2177static ssize_t
2178rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2179{
2180        struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2181        mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2182        mddev_t *mddev = rdev->mddev;
2183        ssize_t rv;
2184
2185        if (!entry->show)
2186                return -EIO;
2187
2188        rv = mddev ? mddev_lock(mddev) : -EBUSY;
2189        if (!rv) {
2190                if (rdev->mddev == NULL)
2191                        rv = -EBUSY;
2192                else
2193                        rv = entry->show(rdev, page);
2194                mddev_unlock(mddev);
2195        }
2196        return rv;
2197}
2198
2199static ssize_t
2200rdev_attr_store(struct kobject *kobj, struct attribute *attr,
2201              const char *page, size_t length)
2202{
2203        struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2204        mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2205        ssize_t rv;
2206        mddev_t *mddev = rdev->mddev;
2207
2208        if (!entry->store)
2209                return -EIO;
2210        if (!capable(CAP_SYS_ADMIN))
2211                return -EACCES;
2212        rv = mddev ? mddev_lock(mddev): -EBUSY;
2213        if (!rv) {
2214                if (rdev->mddev == NULL)
2215                        rv = -EBUSY;
2216                else
2217                        rv = entry->store(rdev, page, length);
2218                mddev_unlock(mddev);
2219        }
2220        return rv;
2221}
2222
2223static void rdev_free(struct kobject *ko)
2224{
2225        mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
2226        kfree(rdev);
2227}
2228static struct sysfs_ops rdev_sysfs_ops = {
2229        .show           = rdev_attr_show,
2230        .store          = rdev_attr_store,
2231};
2232static struct kobj_type rdev_ktype = {
2233        .release        = rdev_free,
2234        .sysfs_ops      = &rdev_sysfs_ops,
2235        .default_attrs  = rdev_default_attrs,
2236};
2237
2238/*
2239 * Import a device. If 'super_format' >= 0, then sanity check the superblock
2240 *
2241 * mark the device faulty if:
2242 *
2243 *   - the device is nonexistent (zero size)
2244 *   - the device has no valid superblock
2245 *
2246 * a faulty rdev _never_ has rdev->sb set.
2247 */
2248static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
2249{
2250        char b[BDEVNAME_SIZE];
2251        int err;
2252        mdk_rdev_t *rdev;
2253        sector_t size;
2254
2255        rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
2256        if (!rdev) {
2257                printk(KERN_ERR "md: could not alloc mem for new device!\n");
2258                return ERR_PTR(-ENOMEM);
2259        }
2260
2261        if ((err = alloc_disk_sb(rdev)))
2262                goto abort_free;
2263
2264        err = lock_rdev(rdev, newdev, super_format == -2);
2265        if (err)
2266                goto abort_free;
2267
2268        kobject_init(&rdev->kobj, &rdev_ktype);
2269
2270        rdev->desc_nr = -1;
2271        rdev->saved_raid_disk = -1;
2272        rdev->raid_disk = -1;
2273        rdev->flags = 0;
2274        rdev->data_offset = 0;
2275        rdev->sb_events = 0;
2276        atomic_set(&rdev->nr_pending, 0);
2277        atomic_set(&rdev->read_errors, 0);
2278        atomic_set(&rdev->corrected_errors, 0);
2279
2280        size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2281        if (!size) {
2282                printk(KERN_WARNING 
2283                        "md: %s has zero or unknown size, marking faulty!\n",
2284                        bdevname(rdev->bdev,b));
2285                err = -EINVAL;
2286                goto abort_free;
2287        }
2288
2289        if (super_format >= 0) {
2290                err = super_types[super_format].
2291                        load_super(rdev, NULL, super_minor);
2292                if (err == -EINVAL) {
2293                        printk(KERN_WARNING
2294                                "md: %s does not have a valid v%d.%d "
2295                               "superblock, not importing!\n",
2296                                bdevname(rdev->bdev,b),
2297                               super_format, super_minor);
2298                        goto abort_free;
2299                }
2300                if (err < 0) {
2301                        printk(KERN_WARNING 
2302                                "md: could not read %s's sb, not importing!\n",
2303                                bdevname(rdev->bdev,b));
2304                        goto abort_free;
2305                }
2306        }
2307
2308        INIT_LIST_HEAD(&rdev->same_set);
2309        init_waitqueue_head(&rdev->blocked_wait);
2310
2311        return rdev;
2312
2313abort_free:
2314        if (rdev->sb_page) {
2315                if (rdev->bdev)
2316                        unlock_rdev(rdev);
2317                free_disk_sb(rdev);
2318        }
2319        kfree(rdev);
2320        return ERR_PTR(err);
2321}
2322
2323/*
2324 * Check a full RAID array for plausibility
2325 */
2326
2327
2328static void analyze_sbs(mddev_t * mddev)
2329{
2330        int i;
2331        struct list_head *tmp;
2332        mdk_rdev_t *rdev, *freshest;
2333        char b[BDEVNAME_SIZE];
2334
2335        freshest = NULL;
2336        rdev_for_each(rdev, tmp, mddev)
2337                switch (super_types[mddev->major_version].
2338                        load_super(rdev, freshest, mddev->minor_version)) {
2339                case 1:
2340                        freshest = rdev;
2341                        break;
2342                case 0:
2343                        break;
2344                default:
2345                        printk( KERN_ERR \
2346                                "md: fatal superblock inconsistency in %s"
2347                                " -- removing from array\n", 
2348                                bdevname(rdev->bdev,b));
2349                        kick_rdev_from_array(rdev);
2350                }
2351
2352
2353        super_types[mddev->major_version].
2354                validate_super(mddev, freshest);
2355
2356        i = 0;
2357        rdev_for_each(rdev, tmp, mddev) {
2358                if (rdev != freshest)
2359                        if (super_types[mddev->major_version].
2360                            validate_super(mddev, rdev)) {
2361                                printk(KERN_WARNING "md: kicking non-fresh %s"
2362                                        " from array!\n",
2363                                        bdevname(rdev->bdev,b));
2364                                kick_rdev_from_array(rdev);
2365                                continue;
2366                        }
2367                if (mddev->level == LEVEL_MULTIPATH) {
2368                        rdev->desc_nr = i++;
2369                        rdev->raid_disk = rdev->desc_nr;
2370                        set_bit(In_sync, &rdev->flags);
2371                } else if (rdev->raid_disk >= mddev->raid_disks) {
2372                        rdev->raid_disk = -1;
2373                        clear_bit(In_sync, &rdev->flags);
2374                }
2375        }
2376
2377
2378
2379        if (mddev->recovery_cp != MaxSector &&
2380            mddev->level >= 1)
2381                printk(KERN_ERR "md: %s: raid array is not clean"
2382                       " -- starting background reconstruction\n",
2383                       mdname(mddev));
2384
2385}
2386
2387static void md_safemode_timeout(unsigned long data);
2388
2389static ssize_t
2390safe_delay_show(mddev_t *mddev, char *page)
2391{
2392        int msec = (mddev->safemode_delay*1000)/HZ;
2393        return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2394}
2395static ssize_t
2396safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2397{
2398        int scale=1;
2399        int dot=0;
2400        int i;
2401        unsigned long msec;
2402        char buf[30];
2403
2404        /* remove a period, and count digits after it */
2405        if (len >= sizeof(buf))
2406                return -EINVAL;
2407        strlcpy(buf, cbuf, sizeof(buf));
2408        for (i=0; i<len; i++) {
2409                if (dot) {
2410                        if (isdigit(buf[i])) {
2411                                buf[i-1] = buf[i];
2412                                scale *= 10;
2413                        }
2414                        buf[i] = 0;
2415                } else if (buf[i] == '.') {
2416                        dot=1;
2417                        buf[i] = 0;
2418                }
2419        }
2420        if (strict_strtoul(buf, 10, &msec) < 0)
2421                return -EINVAL;
2422        msec = (msec * 1000) / scale;
2423        if (msec == 0)
2424                mddev->safemode_delay = 0;
2425        else {
2426                unsigned long old_delay = mddev->safemode_delay;
2427                mddev->safemode_delay = (msec*HZ)/1000;
2428                if (mddev->safemode_delay == 0)
2429                        mddev->safemode_delay = 1;
2430                if (mddev->safemode_delay < old_delay)
2431                        md_safemode_timeout((unsigned long)mddev);
2432        }
2433        return len;
2434}
2435static struct md_sysfs_entry md_safe_delay =
2436__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
2437
2438static ssize_t
2439level_show(mddev_t *mddev, char *page)
2440{
2441        struct mdk_personality *p = mddev->pers;
2442        if (p)
2443                return sprintf(page, "%s\n", p->name);
2444        else if (mddev->clevel[0])
2445                return sprintf(page, "%s\n", mddev->clevel);
2446        else if (mddev->level != LEVEL_NONE)
2447                return sprintf(page, "%d\n", mddev->level);
2448        else
2449                return 0;
2450}
2451
2452static ssize_t
2453level_store(mddev_t *mddev, const char *buf, size_t len)
2454{
2455        ssize_t rv = len;
2456        if (mddev->pers)
2457                return -EBUSY;
2458        if (len == 0)
2459                return 0;
2460        if (len >= sizeof(mddev->clevel))
2461                return -ENOSPC;
2462        strncpy(mddev->clevel, buf, len);
2463        if (mddev->clevel[len-1] == '\n')
2464                len--;
2465        mddev->clevel[len] = 0;
2466        mddev->level = LEVEL_NONE;
2467        return rv;
2468}
2469
2470static struct md_sysfs_entry md_level =
2471__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
2472
2473
2474static ssize_t
2475layout_show(mddev_t *mddev, char *page)
2476{
2477        /* just a number, not meaningful for all levels */
2478        if (mddev->reshape_position != MaxSector &&
2479            mddev->layout != mddev->new_layout)
2480                return sprintf(page, "%d (%d)\n",
2481                               mddev->new_layout, mddev->layout);
2482        return sprintf(page, "%d\n", mddev->layout);
2483}
2484
2485static ssize_t
2486layout_store(mddev_t *mddev, const char *buf, size_t len)
2487{
2488        char *e;
2489        unsigned long n = simple_strtoul(buf, &e, 10);
2490
2491        if (!*buf || (*e && *e != '\n'))
2492                return -EINVAL;
2493
2494        if (mddev->pers)
2495                return -EBUSY;
2496        if (mddev->reshape_position != MaxSector)
2497                mddev->new_layout = n;
2498        else
2499                mddev->layout = n;
2500        return len;
2501}
2502static struct md_sysfs_entry md_layout =
2503__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
2504
2505
2506static ssize_t
2507raid_disks_show(mddev_t *mddev, char *page)
2508{
2509        if (mddev->raid_disks == 0)
2510                return 0;
2511        if (mddev->reshape_position != MaxSector &&
2512            mddev->delta_disks != 0)
2513                return sprintf(page, "%d (%d)\n", mddev->raid_disks,
2514                               mddev->raid_disks - mddev->delta_disks);
2515        return sprintf(page, "%d\n", mddev->raid_disks);
2516}
2517
2518static int update_raid_disks(mddev_t *mddev, int raid_disks);
2519
2520static ssize_t
2521raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2522{
2523        char *e;
2524        int rv = 0;
2525        unsigned long n = simple_strtoul(buf, &e, 10);
2526
2527        if (!*buf || (*e && *e != '\n'))
2528                return -EINVAL;
2529
2530        if (mddev->pers)
2531                rv = update_raid_disks(mddev, n);
2532        else if (mddev->reshape_position != MaxSector) {
2533                int olddisks = mddev->raid_disks - mddev->delta_disks;
2534                mddev->delta_disks = n - olddisks;
2535                mddev->raid_disks = n;
2536        } else
2537                mddev->raid_disks = n;
2538        return rv ? rv : len;
2539}
2540static struct md_sysfs_entry md_raid_disks =
2541__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
2542
2543static ssize_t
2544chunk_size_show(mddev_t *mddev, char *page)
2545{
2546        if (mddev->reshape_position != MaxSector &&
2547            mddev->chunk_size != mddev->new_chunk)
2548                return sprintf(page, "%d (%d)\n", mddev->new_chunk,
2549                               mddev->chunk_size);
2550        return sprintf(page, "%d\n", mddev->chunk_size);
2551}
2552
2553static ssize_t
2554chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2555{
2556        /* can only set chunk_size if array is not yet active */
2557        char *e;
2558        unsigned long n = simple_strtoul(buf, &e, 10);
2559
2560        if (!*buf || (*e && *e != '\n'))
2561                return -EINVAL;
2562
2563        if (mddev->pers)
2564                return -EBUSY;
2565        else if (mddev->reshape_position != MaxSector)
2566                mddev->new_chunk = n;
2567        else
2568                mddev->chunk_size = n;
2569        return len;
2570}
2571static struct md_sysfs_entry md_chunk_size =
2572__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
2573
2574static ssize_t
2575resync_start_show(mddev_t *mddev, char *page)
2576{
2577        return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2578}
2579
2580static ssize_t
2581resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2582{
2583        char *e;
2584        unsigned long long n = simple_strtoull(buf, &e, 10);
2585
2586        if (mddev->pers)
2587                return -EBUSY;
2588        if (!*buf || (*e && *e != '\n'))
2589                return -EINVAL;
2590
2591        mddev->recovery_cp = n;
2592        return len;
2593}
2594static struct md_sysfs_entry md_resync_start =
2595__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2596
2597/*
2598 * The array state can be:
2599 *
2600 * clear
2601 *     No devices, no size, no level
2602 *     Equivalent to STOP_ARRAY ioctl
2603 * inactive
2604 *     May have some settings, but array is not active
2605 *        all IO results in error
2606 *     When written, doesn't tear down array, but just stops it
2607 * suspended (not supported yet)
2608 *     All IO requests will block. The array can be reconfigured.
2609 *     Writing this, if accepted, will block until array is quiescent
2610 * readonly
2611 *     no resync can happen.  no superblocks get written.
2612 *     write requests fail
2613 * read-auto
2614 *     like readonly, but behaves like 'clean' on a write request.
2615 *
2616 * clean - no pending writes, but otherwise active.
2617 *     When written to inactive array, starts without resync
2618 *     If a write request arrives then
2619 *       if metadata is known, mark 'dirty' and switch to 'active'.
2620 *       if not known, block and switch to write-pending
2621 *     If written to an active array that has pending writes, then fails.
2622 * active
2623 *     fully active: IO and resync can be happening.
2624 *     When written to inactive array, starts with resync
2625 *
2626 * write-pending
2627 *     clean, but writes are blocked waiting for 'active' to be written.
2628 *
2629 * active-idle
2630 *     like active, but no writes have been seen for a while (100msec).
2631 *
2632 */
2633enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
2634                   write_pending, active_idle, bad_word};
2635static char *array_states[] = {
2636        "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
2637        "write-pending", "active-idle", NULL };
2638
/*
 * Look up @word in the NULL-terminated string array @list using
 * cmd_match().  Returns the index of the first matching entry, or the
 * index of the terminating NULL when nothing matches.
 */
static int match_word(const char *word, char **list)
{
	int idx = 0;

	while (list[idx] && !cmd_match(word, list[idx]))
		idx++;
	return idx;
}
2647
2648static ssize_t
2649array_state_show(mddev_t *mddev, char *page)
2650{
2651        enum array_state st = inactive;
2652
2653        if (mddev->pers)
2654                switch(mddev->ro) {
2655                case 1:
2656                        st = readonly;
2657                        break;
2658                case 2:
2659                        st = read_auto;
2660                        break;
2661                case 0:
2662                        if (mddev->in_sync)
2663                                st = clean;
2664                        else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
2665                                st = write_pending;
2666                        else if (mddev->safemode)
2667                                st = active_idle;
2668                        else
2669                                st = active;
2670                }
2671        else {
2672                if (list_empty(&mddev->disks) &&
2673                    mddev->raid_disks == 0 &&
2674                    mddev->size == 0)
2675                        st = clear;
2676                else
2677                        st = inactive;
2678        }
2679        return sprintf(page, "%s\n", array_states[st]);
2680}
2681
2682static int do_md_stop(mddev_t * mddev, int ro, int is_open);
2683static int do_md_run(mddev_t * mddev);
2684static int restart_array(mddev_t *mddev);
2685
2686static ssize_t
2687array_state_store(mddev_t *mddev, const char *buf, size_t len)
2688{
2689        int err = -EINVAL;
2690        enum array_state st = match_word(buf, array_states);
2691        switch(st) {
2692        case bad_word:
2693                break;
2694        case clear:
2695                /* stopping an active array */
2696                if (atomic_read(&mddev->openers) > 0)
2697                        return -EBUSY;
2698                err = do_md_stop(mddev, 0, 0);
2699                break;
2700        case inactive:
2701                /* stopping an active array */
2702                if (mddev->pers) {
2703                        if (atomic_read(&mddev->openers) > 0)
2704                                return -EBUSY;
2705                        err = do_md_stop(mddev, 2, 0);
2706                } else
2707                        err = 0; /* already inactive */
2708                break;
2709        case suspended:
2710                break; /* not supported yet */
2711        case readonly:
2712                if (mddev->pers)
2713                        err = do_md_stop(mddev, 1, 0);
2714                else {
2715                        mddev->ro = 1;
2716                        set_disk_ro(mddev->gendisk, 1);
2717                        err = do_md_run(mddev);
2718                }
2719                break;
2720        case read_auto:
2721                if (mddev->pers) {
2722                        if (mddev->ro == 0)
2723                                err = do_md_stop(mddev, 1, 0);
2724                        else if (mddev->ro == 1)
2725                                err = restart_array(mddev);
2726                        if (err == 0) {
2727                                mddev->ro = 2;
2728                                set_disk_ro(mddev->gendisk, 0);
2729                        }
2730                } else {
2731                        mddev->ro = 2;
2732                        err = do_md_run(mddev);
2733                }
2734                break;
2735        case clean:
2736                if (mddev->pers) {
2737                        restart_array(mddev);
2738                        spin_lock_irq(&mddev->write_lock);
2739                        if (atomic_read(&mddev->writes_pending) == 0) {
2740                                if (mddev->in_sync == 0) {
2741                                        mddev->in_sync = 1;
2742                                        if (mddev->safemode == 1)
2743                                                mddev->safemode = 0;
2744                                        if (mddev->persistent)
2745                                                set_bit(MD_CHANGE_CLEAN,
2746                                                        &mddev->flags);
2747                                }
2748                                err = 0;
2749                        } else
2750                                err = -EBUSY;
2751                        spin_unlock_irq(&mddev->write_lock);
2752                } else {
2753                        mddev->ro = 0;
2754                        mddev->recovery_cp = MaxSector;
2755                        err = do_md_run(mddev);
2756                }
2757                break;
2758        case active:
2759                if (mddev->pers) {
2760                        restart_array(mddev);
2761                        if (mddev->external)
2762                                clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2763                        wake_up(&mddev->sb_wait);
2764                        err = 0;
2765                } else {
2766                        mddev->ro = 0;
2767                        set_disk_ro(mddev->gendisk, 0);
2768                        err = do_md_run(mddev);
2769                }
2770                break;
2771        case write_pending:
2772        case active_idle:
2773                /* these cannot be set */
2774                break;
2775        }
2776        if (err)
2777                return err;
2778        else {
2779                sysfs_notify_dirent(mddev->sysfs_state);
2780                return len;
2781        }
2782}
2783static struct md_sysfs_entry md_array_state =
2784__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
2785
2786static ssize_t
2787null_show(mddev_t *mddev, char *page)
2788{
2789        return -EINVAL;
2790}
2791
2792static ssize_t
2793new_dev_store(mddev_t *mddev, const char *buf, size_t len)
2794{
2795        /* buf must be %d:%d\n? giving major and minor numbers */
2796        /* The new device is added to the array.
2797         * If the array has a persistent superblock, we read the
2798         * superblock to initialise info and check validity.
2799         * Otherwise, only checking done is that in bind_rdev_to_array,
2800         * which mainly checks size.
2801         */
2802        char *e;
2803        int major = simple_strtoul(buf, &e, 10);
2804        int minor;
2805        dev_t dev;
2806        mdk_rdev_t *rdev;
2807        int err;
2808
2809        if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
2810                return -EINVAL;
2811        minor = simple_strtoul(e+1, &e, 10);
2812        if (*e && *e != '\n')
2813                return -EINVAL;
2814        dev = MKDEV(major, minor);
2815        if (major != MAJOR(dev) ||
2816            minor != MINOR(dev))
2817                return -EOVERFLOW;
2818
2819
2820        if (mddev->persistent) {
2821                rdev = md_import_device(dev, mddev->major_version,
2822                                        mddev->minor_version);
2823                if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
2824                        mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
2825                                                       mdk_rdev_t, same_set);
2826                        err = super_types[mddev->major_version]
2827                                .load_super(rdev, rdev0, mddev->minor_version);
2828                        if (err < 0)
2829                                goto out;
2830                }
2831        } else if (mddev->external)
2832                rdev = md_import_device(dev, -2, -1);
2833        else
2834                rdev = md_import_device(dev, -1, -1);
2835
2836        if (IS_ERR(rdev))
2837                return PTR_ERR(rdev);
2838        err = bind_rdev_to_array(rdev, mddev);
2839 out:
2840        if (err)
2841                export_rdev(rdev);
2842        return err ? err : len;
2843}
2844
2845static struct md_sysfs_entry md_new_device =
2846__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
2847
2848static ssize_t
2849bitmap_store(mddev_t *mddev, const char *buf, size_t len)
2850{
2851        char *end;
2852        unsigned long chunk, end_chunk;
2853
2854        if (!mddev->bitmap)
2855                goto out;
2856        /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
2857        while (*buf) {
2858                chunk = end_chunk = simple_strtoul(buf, &end, 0);
2859                if (buf == end) break;
2860                if (*end == '-') { /* range */
2861                        buf = end + 1;
2862                        end_chunk = simple_strtoul(buf, &end, 0);
2863                        if (buf == end) break;
2864                }
2865                if (*end && !isspace(*end)) break;
2866                bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
2867                buf = end;
2868                while (isspace(*buf)) buf++;
2869        }
2870        bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
2871out:
2872        return len;
2873}
2874
2875static struct md_sysfs_entry md_bitmap =
2876__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
2877
2878static ssize_t
2879size_show(mddev_t *mddev, char *page)
2880{
2881        return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
2882}
2883
2884static int update_size(mddev_t *mddev, sector_t num_sectors);
2885
2886static ssize_t
2887size_store(mddev_t *mddev, const char *buf, size_t len)
2888{
2889        /* If array is inactive, we can reduce the component size, but
2890         * not increase it (except from 0).
2891         * If array is active, we can try an on-line resize
2892         */
2893        char *e;
2894        int err = 0;
2895        unsigned long long size = simple_strtoull(buf, &e, 10);
2896        if (!*buf || *buf == '\n' ||
2897            (*e && *e != '\n'))
2898                return -EINVAL;
2899
2900        if (mddev->pers) {
2901                err = update_size(mddev, size * 2);
2902                md_update_sb(mddev, 1);
2903        } else {
2904                if (mddev->size == 0 ||
2905                    mddev->size > size)
2906                        mddev->size = size;
2907                else
2908                        err = -ENOSPC;
2909        }
2910        return err ? err : len;
2911}
2912
2913static struct md_sysfs_entry md_size =
2914__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
2915
2916
2917/* Metdata version.
2918 * This is one of
2919 *   'none' for arrays with no metadata (good luck...)
2920 *   'external' for arrays with externally managed metadata,
2921 * or N.M for internally known formats
2922 */
2923static ssize_t
2924metadata_show(mddev_t *mddev, char *page)
2925{
2926        if (mddev->persistent)
2927                return sprintf(page, "%d.%d\n",
2928                               mddev->major_version, mddev->minor_version);
2929        else if (mddev->external)
2930                return sprintf(page, "external:%s\n", mddev->metadata_type);
2931        else
2932                return sprintf(page, "none\n");
2933}
2934
2935static ssize_t
2936metadata_store(mddev_t *mddev, const char *buf, size_t len)
2937{
2938        int major, minor;
2939        char *e;
2940        /* Changing the details of 'external' metadata is
2941         * always permitted.  Otherwise there must be
2942         * no devices attached to the array.
2943         */
2944        if (mddev->external && strncmp(buf, "external:", 9) == 0)
2945                ;
2946        else if (!list_empty(&mddev->disks))
2947                return -EBUSY;
2948
2949        if (cmd_match(buf, "none")) {
2950                mddev->persistent = 0;
2951                mddev->external = 0;
2952                mddev->major_version = 0;
2953                mddev->minor_version = 90;
2954                return len;
2955        }
2956        if (strncmp(buf, "external:", 9) == 0) {
2957                size_t namelen = len-9;
2958                if (namelen >= sizeof(mddev->metadata_type))
2959                        namelen = sizeof(mddev->metadata_type)-1;
2960                strncpy(mddev->metadata_type, buf+9, namelen);
2961                mddev->metadata_type[namelen] = 0;
2962                if (namelen && mddev->metadata_type[namelen-1] == '\n')
2963                        mddev->metadata_type[--namelen] = 0;
2964                mddev->persistent = 0;
2965                mddev->external = 1;
2966                mddev->major_version = 0;
2967                mddev->minor_version = 90;
2968                return len;
2969        }
2970        major = simple_strtoul(buf, &e, 10);
2971        if (e==buf || *e != '.')
2972                return -EINVAL;
2973        buf = e+1;
2974        minor = simple_strtoul(buf, &e, 10);
2975        if (e==buf || (*e && *e != '\n') )
2976                return -EINVAL;
2977        if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
2978                return -ENOENT;
2979        mddev->major_version = major;
2980        mddev->minor_version = minor;
2981        mddev->persistent = 1;
2982        mddev->external = 0;
2983        return len;
2984}
2985
2986static struct md_sysfs_entry md_metadata =
2987__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
2988
2989static ssize_t
2990action_show(mddev_t *mddev, char *page)
2991{
2992        char *type = "idle";
2993        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2994            (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
2995                if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2996                        type = "reshape";
2997                else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2998                        if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
2999                                type = "resync";
3000                        else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
3001                                type = "check";
3002                        else
3003                                type = "repair";
3004                } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
3005                        type = "recover";
3006        }
3007        return sprintf(page, "%s\n", type);
3008}
3009
3010static ssize_t
3011action_store(mddev_t *mddev, const char *page, size_t len)
3012{
3013        if (!mddev->pers || !mddev->pers->sync_request)
3014                return -EINVAL;
3015
3016        if (cmd_match(page, "idle")) {
3017                if (mddev->sync_thread) {
3018                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3019                        md_unregister_thread(mddev->sync_thread);
3020                        mddev->sync_thread = NULL;
3021                        mddev->recovery = 0;
3022                }
3023        } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3024                   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
3025                return -EBUSY;
3026        else if (cmd_match(page, "resync"))
3027                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3028        else if (cmd_match(page, "recover")) {
3029                set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
3030                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3031        } else if (cmd_match(page, "reshape")) {
3032                int err;
3033                if (mddev->pers->start_reshape == NULL)
3034                        return -EINVAL;
3035                err = mddev->pers->start_reshape(mddev);
3036                if (err)
3037                        return err;
3038                sysfs_notify(&mddev->kobj, NULL, "degraded");
3039        } else {
3040                if (cmd_match(page, "check"))
3041                        set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3042                else if (!cmd_match(page, "repair"))
3043                        return -EINVAL;
3044                set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
3045                set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3046        }
3047        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3048        md_wakeup_thread(mddev->thread);
3049        sysfs_notify(&mddev->kobj, NULL, "sync_action");
3050        return len;
3051}
3052
3053static ssize_t
3054mismatch_cnt_show(mddev_t *mddev, char *page)
3055{
3056        return sprintf(page, "%llu\n",
3057                       (unsigned long long) mddev->resync_mismatches);
3058}
3059
3060static struct md_sysfs_entry md_scan_mode =
3061__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
3062
3063
3064static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
3065
3066static ssize_t
3067sync_min_show(mddev_t *mddev, char *page)
3068{
3069        return sprintf(page, "%d (%s)\n", speed_min(mddev),
3070                       mddev->sync_speed_min ? "local": "system");
3071}
3072
3073static ssize_t
3074sync_min_store(mddev_t *mddev, const char *buf, size_t len)
3075{
3076        int min;
3077        char *e;
3078        if (strncmp(buf, "system", 6)==0) {
3079                mddev->sync_speed_min = 0;
3080                return len;
3081        }
3082        min = simple_strtoul(buf, &e, 10);
3083        if (buf == e || (*e && *e != '\n') || min <= 0)
3084                return -EINVAL;
3085        mddev->sync_speed_min = min;
3086        return len;
3087}
3088
3089static struct md_sysfs_entry md_sync_min =
3090__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
3091
3092static ssize_t
3093sync_max_show(mddev_t *mddev, char *page)
3094{
3095        return sprintf(page, "%d (%s)\n", speed_max(mddev),
3096                       mddev->sync_speed_max ? "local": "system");
3097}
3098
3099static ssize_t
3100sync_max_store(mddev_t *mddev, const char *buf, size_t len)
3101{
3102        int max;
3103        char *e;
3104        if (strncmp(buf, "system", 6)==0) {
3105                mddev->sync_speed_max = 0;
3106                return len;
3107        }
3108        max = simple_strtoul(buf, &e, 10);
3109        if (buf == e || (*e && *e != '\n') || max <= 0)
3110                return -EINVAL;
3111        mddev->sync_speed_max = max;
3112        return len;
3113}
3114
3115static struct md_sysfs_entry md_sync_max =
3116__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
3117
3118static ssize_t
3119degraded_show(mddev_t *mddev, char *page)
3120{
3121        return sprintf(page, "%d\n", mddev->degraded);
3122}
3123static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
3124
3125static ssize_t
3126sync_force_parallel_show(mddev_t *mddev, char *page)
3127{
3128        return sprintf(page, "%d\n", mddev->parallel_resync);
3129}
3130
3131static ssize_t
3132sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len)
3133{
3134        long n;
3135
3136        if (strict_strtol(buf, 10, &n))
3137                return -EINVAL;
3138
3139        if (n != 0 && n != 1)
3140                return -EINVAL;
3141
3142        mddev->parallel_resync = n;
3143
3144        if (mddev->sync_thread)
3145                wake_up(&resync_wait);
3146
3147        return len;
3148}
3149
3150/* force parallel resync, even with shared block devices */
3151static struct md_sysfs_entry md_sync_force_parallel =
3152__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
3153       sync_force_parallel_show, sync_force_parallel_store);
3154
3155static ssize_t
3156sync_speed_show(mddev_t *mddev, char *page)
3157{
3158        unsigned long resync, dt, db;
3159        resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
3160        dt = (jiffies - mddev->resync_mark) / HZ;
3161        if (!dt) dt++;
3162        db = resync - mddev->resync_mark_cnt;
3163        return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
3164}
3165
3166static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
3167
3168static ssize_t
3169sync_completed_show(mddev_t *mddev, char *page)
3170{
3171        unsigned long max_blocks, resync;
3172
3173        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3174                max_blocks = mddev->resync_max_sectors;
3175        else
3176                max_blocks = mddev->size << 1;
3177
3178        resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
3179        return sprintf(page, "%lu / %lu\n", resync, max_blocks);
3180}
3181
3182static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
3183
3184static ssize_t
3185min_sync_show(mddev_t *mddev, char *page)
3186{
3187        return sprintf(page, "%llu\n",
3188                       (unsigned long long)mddev->resync_min);
3189}
3190static ssize_t
3191min_sync_store(mddev_t *mddev, const char *buf, size_t len)
3192{
3193        unsigned long long min;
3194        if (strict_strtoull(buf, 10, &min))
3195                return -EINVAL;
3196        if (min > mddev->resync_max)
3197                return -EINVAL;
3198        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3199                return -EBUSY;
3200
3201        /* Must be a multiple of chunk_size */
3202        if (mddev->chunk_size) {
3203                if (min & (sector_t)((mddev->chunk_size>>9)-1))
3204                        return -EINVAL;
3205        }
3206        mddev->resync_min = min;
3207
3208        return len;
3209}
3210
3211static struct md_sysfs_entry md_min_sync =
3212__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
3213
3214static ssize_t
3215max_sync_show(mddev_t *mddev, char *page)
3216{
3217        if (mddev->resync_max == MaxSector)
3218                return sprintf(page, "max\n");
3219        else
3220                return sprintf(page, "%llu\n",
3221                               (unsigned long long)mddev->resync_max);
3222}
3223static ssize_t
3224max_sync_store(mddev_t *mddev, const char *buf, size_t len)
3225{
3226        if (strncmp(buf, "max", 3) == 0)
3227                mddev->resync_max = MaxSector;
3228        else {
3229                unsigned long long max;
3230                if (strict_strtoull(buf, 10, &max))
3231                        return -EINVAL;
3232                if (max < mddev->resync_min)
3233                        return -EINVAL;
3234                if (max < mddev->resync_max &&
3235                    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3236                        return -EBUSY;
3237
3238                /* Must be a multiple of chunk_size */
3239                if (mddev->chunk_size) {
3240                        if (max & (sector_t)((mddev->chunk_size>>9)-1))
3241                                return -EINVAL;
3242                }
3243                mddev->resync_max = max;
3244        }
3245        wake_up(&mddev->recovery_wait);
3246        return len;
3247}
3248
3249static struct md_sysfs_entry md_max_sync =
3250__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
3251
3252static ssize_t
3253suspend_lo_show(mddev_t *mddev, char *page)
3254{
3255        return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
3256}
3257
3258static ssize_t
3259suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
3260{
3261        char *e;
3262        unsigned long long new = simple_strtoull(buf, &e, 10);
3263
3264        if (mddev->pers->quiesce == NULL)
3265                return -EINVAL;
3266        if (buf == e || (*e && *e != '\n'))
3267                return -EINVAL;
3268        if (new >= mddev->suspend_hi ||
3269            (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
3270                mddev->suspend_lo = new;
3271                mddev->pers->quiesce(mddev, 2);
3272                return len;
3273        } else
3274                return -EINVAL;
3275}
3276static struct md_sysfs_entry md_suspend_lo =
3277__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
3278
3279
3280static ssize_t
3281suspend_hi_show(mddev_t *mddev, char *page)
3282{
3283        return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
3284}
3285
3286static ssize_t
3287suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
3288{
3289        char *e;
3290        unsigned long long new = simple_strtoull(buf, &e, 10);
3291
3292        if (mddev->pers->quiesce == NULL)
3293                return -EINVAL;
3294        if (buf == e || (*e && *e != '\n'))
3295                return -EINVAL;
3296        if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
3297            (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
3298                mddev->suspend_hi = new;
3299                mddev->pers->quiesce(mddev, 1);
3300                mddev->pers->quiesce(mddev, 0);
3301                return len;
3302        } else
3303                return -EINVAL;
3304}
3305static struct md_sysfs_entry md_suspend_hi =
3306__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
3307
3308static ssize_t
3309reshape_position_show(mddev_t *mddev, char *page)
3310{
3311        if (mddev->reshape_position != MaxSector)
3312                return sprintf(page, "%llu\n",
3313                               (unsigned long long)mddev->reshape_position);
3314        strcpy(page, "none\n");
3315        return 5;
3316}
3317
3318static ssize_t
3319reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
3320{
3321        char *e;
3322        unsigned long long new = simple_strtoull(buf, &e, 10);
3323        if (mddev->pers)
3324                return -EBUSY;
3325        if (buf == e || (*e && *e != '\n'))
3326                return -EINVAL;
3327        mddev->reshape_position = new;
3328        mddev->delta_disks = 0;
3329        mddev->new_level = mddev->level;
3330        mddev->new_layout = mddev->layout;
3331        mddev->new_chunk = mddev->chunk_size;
3332        return len;
3333}
3334
3335static struct md_sysfs_entry md_reshape_position =
3336__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
3337       reshape_position_store);
3338
3339
/*
 * Attributes installed as md_ktype.default_attrs, i.e. present on every
 * md kobject.  NULL-terminated.
 */
3340static struct attribute *md_default_attrs[] = {
3341        &md_level.attr,
3342        &md_layout.attr,
3343        &md_raid_disks.attr,
3344        &md_chunk_size.attr,
3345        &md_size.attr,
3346        &md_resync_start.attr,
3347        &md_metadata.attr,
3348        &md_new_device.attr,
3349        &md_safe_delay.attr,
3350        &md_array_state.attr,
3351        &md_reshape_position.attr,
3352        NULL,
3353};
3354
/*
 * Extra attributes that only exist while a personality with a
 * ->sync_request method is active: do_md_run() registers the group with
 * sysfs_create_group() and do_md_stop() removes it again.  NULL-terminated.
 */
3355static struct attribute *md_redundancy_attrs[] = {
3356        &md_scan_mode.attr,
3357        &md_mismatches.attr,
3358        &md_sync_min.attr,
3359        &md_sync_max.attr,
3360        &md_sync_speed.attr,
3361        &md_sync_force_parallel.attr,
3362        &md_sync_completed.attr,
3363        &md_min_sync.attr,
3364        &md_max_sync.attr,
3365        &md_suspend_lo.attr,
3366        &md_suspend_hi.attr,
3367        &md_bitmap.attr,
3368        &md_degraded.attr,
3369        NULL,
3370};
/* .name is NULL, so the group does not name a subdirectory of its own */
3371static struct attribute_group md_redundancy_group = {
3372        .name = NULL,
3373        .attrs = md_redundancy_attrs,
3374};
3375
3376
3377static ssize_t
3378md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3379{
3380        struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3381        mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3382        ssize_t rv;
3383
3384        if (!entry->show)
3385                return -EIO;
3386        rv = mddev_lock(mddev);
3387        if (!rv) {
3388                rv = entry->show(mddev, page);
3389                mddev_unlock(mddev);
3390        }
3391        return rv;
3392}
3393
3394static ssize_t
3395md_attr_store(struct kobject *kobj, struct attribute *attr,
3396              const char *page, size_t length)
3397{
3398        struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3399        mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3400        ssize_t rv;
3401
3402        if (!entry->store)
3403                return -EIO;
3404        if (!capable(CAP_SYS_ADMIN))
3405                return -EACCES;
3406        rv = mddev_lock(mddev);
3407        if (!rv) {
3408                rv = entry->store(mddev, page, length);
3409                mddev_unlock(mddev);
3410        }
3411        return rv;
3412}
3413
3414static void md_free(struct kobject *ko)
3415{
3416        mddev_t *mddev = container_of(ko, mddev_t, kobj);
3417        kfree(mddev);
3418}
3419
/* sysfs read/write entry points shared by every md attribute */
3420static struct sysfs_ops md_sysfs_ops = {
3421        .show   = md_attr_show,
3422        .store  = md_attr_store,
3423};
/*
 * kobject type of mddev->kobj (see kobject_init_and_add() in md_probe()):
 * md_free() releases the mddev, and md_default_attrs are created with it.
 */
3424static struct kobj_type md_ktype = {
3425        .release        = md_free,
3426        .sysfs_ops      = &md_sysfs_ops,
3427        .default_attrs  = md_default_attrs,
3428};
3429
3430int mdp_major = 0;
3431
3432static struct kobject *md_probe(dev_t dev, int *part, void *data)
3433{
3434        static DEFINE_MUTEX(disks_mutex);
3435        mddev_t *mddev = mddev_find(dev);
3436        struct gendisk *disk;
3437        int partitioned = (MAJOR(dev) != MD_MAJOR);
3438        int shift = partitioned ? MdpMinorShift : 0;
3439        int unit = MINOR(dev) >> shift;
3440        int error;
3441
3442        if (!mddev)
3443                return NULL;
3444
3445        mutex_lock(&disks_mutex);
3446        if (mddev->gendisk) {
3447                mutex_unlock(&disks_mutex);
3448                mddev_put(mddev);
3449                return NULL;
3450        }
3451        disk = alloc_disk(1 << shift);
3452        if (!disk) {
3453                mutex_unlock(&disks_mutex);
3454                mddev_put(mddev);
3455                return NULL;
3456        }
3457        disk->major = MAJOR(dev);
3458        disk->first_minor = unit << shift;
3459        if (partitioned)
3460                sprintf(disk->disk_name, "md_d%d", unit);
3461        else
3462                sprintf(disk->disk_name, "md%d", unit);
3463        disk->fops = &md_fops;
3464        disk->private_data = mddev;
3465        disk->queue = mddev->queue;
3466        /* Allow extended partitions.  This makes the
3467         * 'mdp' device redundant, but we can really
3468         * remove it now.
3469         */
3470        disk->flags |= GENHD_FL_EXT_DEVT;
3471        add_disk(disk);
3472        mddev->gendisk = disk;
3473        error = kobject_init_and_add(&mddev->kobj, &md_ktype,
3474                                     &disk_to_dev(disk)->kobj, "%s", "md");
3475        mutex_unlock(&disks_mutex);
3476        if (error)
3477                printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
3478                       disk->disk_name);
3479        else {
3480                kobject_uevent(&mddev->kobj, KOBJ_ADD);
3481                mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
3482        }
3483        return NULL;
3484}
3485
3486static void md_safemode_timeout(unsigned long data)
3487{
3488        mddev_t *mddev = (mddev_t *) data;
3489
3490        if (!atomic_read(&mddev->writes_pending)) {
3491                mddev->safemode = 1;
3492                if (mddev->external)
3493                        sysfs_notify_dirent(mddev->sysfs_state);
3494        }
3495        md_wakeup_thread(mddev->thread);
3496}
3497
/*
 * Copied into mddev->ok_start_degraded by do_md_run().
 * NOTE(review): presumably set from a module/boot parameter elsewhere in
 * the file - confirm.
 */
3498static int start_dirty_degraded;
3499
/*
 * do_md_run - validate an assembled array and start its personality.
 *
 * Steps, in order:
 *   - refuse arrays without member devices (-EINVAL) or that already have
 *     a personality attached (-EBUSY);
 *   - if raid_disks is still 0, analyse the superblocks (only possible
 *     for persistent arrays, otherwise -EINVAL);
 *   - sanity-check chunk_size and the size of every non-faulty member;
 *   - request the personality module, flush and invalidate the members'
 *     block-device caches and check that data and metadata do not overlap;
 *   - make sure the gendisk exists (md_probe), -ENOMEM if it does not;
 *   - look up the personality, take a module reference and call its
 *     ->run(); create the bitmap for personalities with ->sync_request;
 *   - publish sysfs attributes and rd%d links, initialise safemode
 *     handling, set the capacity and install the request function;
 *   - resume an interrupted recovery if needed and send notifications.
 *
 * Returns 0 on success or a negative errno.  On failure after the
 * personality was attached, mddev->pers is reset to NULL and the module
 * reference is dropped.
 */
3500static int do_md_run(mddev_t * mddev)
3501{
3502        int err;
3503        int chunk_size;
3504        struct list_head *tmp;
3505        mdk_rdev_t *rdev;
3506        struct gendisk *disk;
3507        struct mdk_personality *pers;
3508        char b[BDEVNAME_SIZE];
3509
3510        if (list_empty(&mddev->disks))
3511                /* cannot run an array with no devices.. */
3512                return -EINVAL;
3513
3514        if (mddev->pers)
3515                return -EBUSY;
3516
3517        /*
3518         * Analyze all RAID superblock(s)
3519         */
3520        if (!mddev->raid_disks) {
3521                if (!mddev->persistent)
3522                        return -EINVAL;
3523                analyze_sbs(mddev);
3524        }
3525
3526        chunk_size = mddev->chunk_size;
3527
3528        if (chunk_size) {
3529                if (chunk_size > MAX_CHUNK_SIZE) {
3530                        printk(KERN_ERR "too big chunk_size: %d > %d\n",
3531                                chunk_size, MAX_CHUNK_SIZE);
3532                        return -EINVAL;
3533                }
3534                /*
3535                 * chunk-size has to be a power of 2
3536                 */
                /* ffz(~x) is the index of the lowest set bit of x */
3537                if ( (1 << ffz(~chunk_size)) != chunk_size) {
3538                        printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
3539                        return -EINVAL;
3540                }
3541
3542                /* devices must have minimum size of one chunk */
                /* the warning below prints both values in "k", hence the /1024 */
3543                rdev_for_each(rdev, tmp, mddev) {
3544                        if (test_bit(Faulty, &rdev->flags))
3545                                continue;
3546                        if (rdev->size < chunk_size / 1024) {
3547                                printk(KERN_WARNING
3548                                        "md: Dev %s smaller than chunk_size:"
3549                                        " %lluk < %dk\n",
3550                                        bdevname(rdev->bdev,b),
3551                                        (unsigned long long)rdev->size,
3552                                        chunk_size / 1024);
3553                                return -EINVAL;
3554                        }
3555                }
3556        }
3557
        /* try to load the personality, by number if known, else by name */
3558        if (mddev->level != LEVEL_NONE)
3559                request_module("md-level-%d", mddev->level);
3560        else if (mddev->clevel[0])
3561                request_module("md-%s", mddev->clevel);
3562
3563        /*
3564         * Drop all container device buffers, from now on
3565         * the only valid external interface is through the md
3566         * device.
3567         */
3568        rdev_for_each(rdev, tmp, mddev) {
3569                if (test_bit(Faulty, &rdev->flags))
3570                        continue;
3571                sync_blockdev(rdev->bdev);
3572                invalidate_bdev(rdev->bdev);
3573
3574                /* perform some consistency tests on the device.
3575                 * We don't want the data to overlap the metadata,
3576                 * Internal Bitmap issues have been handled elsewhere.
3577                 */
3578                if (rdev->data_offset < rdev->sb_start) {
3579                        if (mddev->size &&
3580                            rdev->data_offset + mddev->size*2
3581                            > rdev->sb_start) {
3582                                printk("md: %s: data overlaps metadata\n",
3583                                       mdname(mddev));
3584                                return -EINVAL;
3585                        }
3586                } else {
3587                        if (rdev->sb_start + rdev->sb_size/512
3588                            > rdev->data_offset) {
3589                                printk("md: %s: metadata overlaps data\n",
3590                                       mdname(mddev));
3591                                return -EINVAL;
3592                        }
3593                }
3594                sysfs_notify_dirent(rdev->sysfs_state);
3595        }
3596
        /* md_probe() always returns NULL; success shows up as mddev->gendisk */
3597        md_probe(mddev->unit, NULL, NULL);
3598        disk = mddev->gendisk;
3599        if (!disk)
3600                return -ENOMEM;
3601
3602        spin_lock(&pers_lock);
3603        pers = find_pers(mddev->level, mddev->clevel);
3604        if (!pers || !try_module_get(pers->owner)) {
3605                spin_unlock(&pers_lock);
3606                if (mddev->level != LEVEL_NONE)
3607                        printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
3608                               mddev->level);
3609                else
3610                        printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
3611                               mddev->clevel);
3612                return -EINVAL;
3613        }
3614        mddev->pers = pers;
3615        spin_unlock(&pers_lock);
        /* record the personality's own idea of level and name */
3616        mddev->level = pers->level;
3617        strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3618
3619        if (mddev->reshape_position != MaxSector &&
3620            pers->start_reshape == NULL) {
3621                /* This personality cannot handle reshaping... */
3622                mddev->pers = NULL;
3623                module_put(pers->owner);
3624                return -EINVAL;
3625        }
3626
3627        if (pers->sync_request) {
3628                /* Warn if this is a potentially silly
3629                 * configuration.
3630                 */
                /* note: this 'b' shadows the function-level buffer */
3631                char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3632                mdk_rdev_t *rdev2;
3633                struct list_head *tmp2;
3634                int warned = 0;
3635                rdev_for_each(rdev, tmp, mddev) {
3636                        rdev_for_each(rdev2, tmp2, mddev) {
                                /* rdev < rdev2 reports each pair only once */
3637                                if (rdev < rdev2 &&
3638                                    rdev->bdev->bd_contains ==
3639                                    rdev2->bdev->bd_contains) {
3640                                        printk(KERN_WARNING
3641                                               "%s: WARNING: %s appears to be"
3642                                               " on the same physical disk as"
3643                                               " %s.\n",
3644                                               mdname(mddev),
3645                                               bdevname(rdev->bdev,b),
3646                                               bdevname(rdev2->bdev,b2));
3647                                        warned = 1;
3648                                }
3649                        }
3650                }
3651                if (warned)
3652                        printk(KERN_WARNING
3653                               "True protection against single-disk"
3654                               " failure might be compromised.\n");
3655        }
3656
3657        mddev->recovery = 0;
3658        mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
3659        mddev->barriers_work = 1;
3660        mddev->ok_start_degraded = start_dirty_degraded;
3661
3662        if (start_readonly)
3663                mddev->ro = 2; /* read-only, but switch on first write */
3664
3665        err = mddev->pers->run(mddev);
3666        if (err)
3667                printk(KERN_ERR "md: pers->run() failed ...\n");
3668        else if (mddev->pers->sync_request) {
3669                err = bitmap_create(mddev);
3670                if (err) {
3671                        printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
3672                               mdname(mddev), err);
3673                        mddev->pers->stop(mddev);
3674                }
3675        }
        /* common failure exit for ->run() and bitmap_create() */
3676        if (err) {
3677                module_put(mddev->pers->owner);
3678                mddev->pers = NULL;
3679                bitmap_destroy(mddev);
3680                return err;
3681        }
3682        if (mddev->pers->sync_request) {
3683                if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3684                        printk(KERN_WARNING
3685                               "md: cannot register extra attributes for %s\n",
3686                               mdname(mddev));
3687        } else if (mddev->ro == 2) /* auto-readonly not meaningful */
3688                mddev->ro = 0;
3689
3690        atomic_set(&mddev->writes_pending,0);
3691        mddev->safemode = 0;
3692        mddev->safemode_timer.function = md_safemode_timeout;
3693        mddev->safemode_timer.data = (unsigned long) mddev;
3694        mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
3695        mddev->in_sync = 1;
3696
        /* one "rd<N>" symlink per member that occupies a raid slot */
3697        rdev_for_each(rdev, tmp, mddev)
3698                if (rdev->raid_disk >= 0) {
3699                        char nm[20];
3700                        sprintf(nm, "rd%d", rdev->raid_disk);
3701                        if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
3702                                printk("md: cannot register %s for %s\n",
3703                                       nm, mdname(mddev));
3704                }
3705        
3706        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3707        
3708        if (mddev->flags)
3709                md_update_sb(mddev, 0);
3710
3711        set_capacity(disk, mddev->array_sectors);
3712
3713        /* If we call blk_queue_make_request here, it will
3714         * re-initialise max_sectors etc which may have been
3715         * refined inside -> run.  So just set the bits we need to set.
3716         * Most initialisation happened when we called
3717         * blk_queue_make_request(..., md_fail_request)
3718         * earlier.
3719         */
3720        mddev->queue->queuedata = mddev;
3721        mddev->queue->make_request_fn = mddev->pers->make_request;
3722
3723        /* If there is a partially-recovered drive we need to
3724         * start recovery here.  If we leave it to md_check_recovery,
3725         * it will remove the drives and not do the right thing
3726         */
3727        if (mddev->degraded && !mddev->sync_thread) {
3728                struct list_head *rtmp;
3729                int spares = 0;
3730                rdev_for_each(rdev, rtmp, mddev)
3731                        if (rdev->raid_disk >= 0 &&
3732                            !test_bit(In_sync, &rdev->flags) &&
3733                            !test_bit(Faulty, &rdev->flags))
3734                                /* complete an interrupted recovery */
3735                                spares++;
3736                if (spares && mddev->pers->sync_request) {
3737                        mddev->recovery = 0;
3738                        set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3739                        mddev->sync_thread = md_register_thread(md_do_sync,
3740                                                                mddev,
3741                                                                "%s_resync");
3742                        if (!mddev->sync_thread) {
3743                                printk(KERN_ERR "%s: could not start resync"
3744                                       " thread...\n",
3745                                       mdname(mddev));
3746                                /* leave the spares where they are, it shouldn't hurt */
3747                                mddev->recovery = 0;
3748                        }
3749                }
3750        }
3751        md_wakeup_thread(mddev->thread);
3752        md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
3753
3754        mddev->changed = 1;
3755        md_new_event(mddev);
3756        sysfs_notify_dirent(mddev->sysfs_state);
3757        sysfs_notify(&mddev->kobj, NULL, "sync_action");
3758        sysfs_notify(&mddev->kobj, NULL, "degraded");
3759        kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
3760        return 0;
3761}
3762
3763static int restart_array(mddev_t *mddev)
3764{
3765        struct gendisk *disk = mddev->gendisk;
3766
3767        /* Complain if it has no devices */
3768        if (list_empty(&mddev->disks))
3769                return -ENXIO;
3770        if (!mddev->pers)
3771                return -EINVAL;
3772        if (!mddev->ro)
3773                return -EBUSY;
3774        mddev->safemode = 0;
3775        mddev->ro = 0;
3776        set_disk_ro(disk, 0);
3777        printk(KERN_INFO "md: %s switched to read-write mode.\n",
3778                mdname(mddev));
3779        /* Kick recovery or resync if necessary */
3780        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3781        md_wakeup_thread(mddev->thread);
3782        md_wakeup_thread(mddev->sync_thread);
3783        sysfs_notify_dirent(mddev->sysfs_state);
3784        return 0;
3785}
3786
3787/* similar to deny_write_access, but accounts for our holding a reference
3788 * to the file ourselves */
3789static int deny_bitmap_write_access(struct file * file)
3790{
3791        struct inode *inode = file->f_mapping->host;
3792
3793        spin_lock(&inode->i_lock);
3794        if (atomic_read(&inode->i_writecount) > 1) {
3795                spin_unlock(&inode->i_lock);
3796                return -ETXTBSY;
3797        }
3798        atomic_set(&inode->i_writecount, -1);
3799        spin_unlock(&inode->i_lock);
3800
3801        return 0;
3802}
3803
3804static void restore_bitmap_write_access(struct file *file)
3805{
3806        struct inode *inode = file->f_mapping->host;
3807
3808        spin_lock(&inode->i_lock);
3809        atomic_set(&inode->i_writecount, 1);
3810        spin_unlock(&inode->i_lock);
3811}
3812
/*
 * do_md_stop - stop an array, make it read-only, or tear it down fully.
 *
 * @mddev:   the array.
 * @mode:    what to do, see the table below.
 * @is_open: number of openers the caller accounts for itself; the request
 *           is refused with -EBUSY when more openers than that exist.
 *
 * Returns 0 on success, -EBUSY if the array is still in use, or -ENXIO
 * when asked to go read-only while the array already is (ro == 1).
 */
3813/* mode:
3814 *   0 - completely stop and dis-assemble array
3815 *   1 - switch to readonly
3816 *   2 - stop but do not disassemble array
3817 */
3818static int do_md_stop(mddev_t * mddev, int mode, int is_open)
3819{
3820        int err = 0;
3821        struct gendisk *disk = mddev->gendisk;
3822
3823        if (atomic_read(&mddev->openers) > is_open) {
3824                printk("md: %s still in use.\n",mdname(mddev));
3825                return -EBUSY;
3826        }
3827
3828        if (mddev->pers) {
3829
                /* interrupt and reap a running resync/recovery thread first */
3830                if (mddev->sync_thread) {
3831                        set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3832                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3833                        md_unregister_thread(mddev->sync_thread);
3834                        mddev->sync_thread = NULL;
3835                }
3836
3837                del_timer_sync(&mddev->safemode_timer);
3838
3839                switch(mode) {
3840                case 1: /* readonly */
3841                        err  = -ENXIO;
3842                        if (mddev->ro==1)
3843                                goto out;
3844                        mddev->ro = 1;
3845                        break;
                /* modes 0 and 2 share the personality teardown below */
3846                case 0: /* disassemble */
3847                case 2: /* stop */
3848                        bitmap_flush(mddev);
3849                        md_super_wait(mddev);
3850                        if (mddev->ro)
3851                                set_disk_ro(disk, 0);
                        /* fail new requests before the personality goes away */
3852                        blk_queue_make_request(mddev->queue, md_fail_request);
3853                        mddev->pers->stop(mddev);
3854                        mddev->queue->merge_bvec_fn = NULL;
3855                        mddev->queue->unplug_fn = NULL;
3856                        mddev->queue->backing_dev_info.congested_fn = NULL;
3857                        if (mddev->pers->sync_request)
3858                                sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
3859
3860                        module_put(mddev->pers->owner);
3861                        mddev->pers = NULL;
3862                        /* tell userspace to handle 'inactive' */
3863                        sysfs_notify_dirent(mddev->sysfs_state);
3864
3865                        set_capacity(disk, 0);
3866                        mddev->changed = 1;
3867
3868                        if (mddev->ro)
3869                                mddev->ro = 0;
3870                }
3871                if (!mddev->in_sync || mddev->flags) {
3872                        /* mark array as shutdown cleanly */
3873                        mddev->in_sync = 1;
3874                        md_update_sb(mddev, 1);
3875                }
3876                if (mode == 1)
3877                        set_disk_ro(disk, 1);
3878                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3879        }
3880
3881        /*
3882         * Free resources if final stop
3883         */
3884        if (mode == 0) {
3885                mdk_rdev_t *rdev;
3886                struct list_head *tmp;
3887
3888                printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
3889
3890                bitmap_destroy(mddev);
3891                if (mddev->bitmap_file) {
3892                        restore_bitmap_write_access(mddev->bitmap_file);
3893                        fput(mddev->bitmap_file);
3894                        mddev->bitmap_file = NULL;
3895                }
3896                mddev->bitmap_offset = 0;
3897
                /* remove the "rd<N>" links that do_md_run() created */
3898                rdev_for_each(rdev, tmp, mddev)
3899                        if (rdev->raid_disk >= 0) {
3900                                char nm[20];
3901                                sprintf(nm, "rd%d", rdev->raid_disk);
3902                                sysfs_remove_link(&mddev->kobj, nm);
3903                        }
3904
3905                /* make sure all md_delayed_delete calls have finished */
3906                flush_scheduled_work();
3907
3908                export_array(mddev);
3909
                /* reset the mddev's fields for the stopped, empty array */
3910                mddev->array_sectors = 0;
3911                mddev->size = 0;
3912                mddev->raid_disks = 0;
3913                mddev->recovery_cp = 0;
3914                mddev->resync_min = 0;
3915                mddev->resync_max = MaxSector;
3916                mddev->reshape_position = MaxSector;
3917                mddev->external = 0;
3918                mddev->persistent = 0;
3919                mddev->level = LEVEL_NONE;
3920                mddev->clevel[0] = 0;
3921                mddev->flags = 0;
3922                mddev->ro = 0;
3923                mddev->metadata_type[0] = 0;
3924                mddev->chunk_size = 0;
3925                mddev->ctime = mddev->utime = 0;
3926                mddev->layout = 0;
3927                mddev->max_disks = 0;
3928                mddev->events = 0;
3929                mddev->delta_disks = 0;
3930                mddev->new_level = LEVEL_NONE;
3931                mddev->new_layout = 0;
3932                mddev->new_chunk = 0;
3933                mddev->curr_resync = 0;
3934                mddev->resync_mismatches = 0;
3935                mddev->suspend_lo = mddev->suspend_hi = 0;
3936                mddev->sync_speed_min = mddev->sync_speed_max = 0;
3937                mddev->recovery = 0;
3938                mddev->in_sync = 0;
3939                mddev->changed = 0;
3940                mddev->degraded = 0;
3941                mddev->barriers_work = 0;
3942                mddev->safemode = 0;
3943                kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
3944
3945        } else if (mddev->pers)
3946                printk(KERN_INFO "md: %s switched to read-only mode.\n",
3947                        mdname(mddev));
        /* every path that reaches this point reports success */
3948        err = 0;
3949        md_new_event(mddev);
3950        sysfs_notify_dirent(mddev->sysfs_state);
3951out:
3952        return err;
3953}
3954
3955#ifndef MODULE
3956static void autorun_array(mddev_t *mddev)
3957{
3958        mdk_rdev_t *rdev;
3959        struct list_head *tmp;
3960        int err;
3961
3962        if (list_empty(&mddev->disks))
3963                return;
3964
3965        printk(KERN_INFO "md: running: ");
3966
3967        rdev_for_each(rdev, tmp, mddev) {
3968                char b[BDEVNAME_SIZE];
3969                printk("<%s>", bdevname(rdev->bdev,b));
3970        }
3971        printk("\n");
3972
3973        err = do_md_run(mddev);
3974        if (err) {
3975                printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
3976                do_md_stop(mddev, 0, 0);
3977        }
3978}
3979
3980/*
3981 * lets try to run arrays based on all disks that have arrived
3982 * until now. (those are in pending_raid_disks)
3983 *
3984 * the method: pick the first pending disk, collect all disks with
3985 * the same UUID, remove all from the pending list and put them into
3986 * the 'same_array' list. Then order this list based on superblock
3987 * update time (freshest comes first), kick out 'old' disks and
3988 * compare superblocks. If everything's fine then run it.
3989 *
3990 * If "unit" is allocated, then bump its reference count
3991 */
3992static void autorun_devices(int part)
3993{
3994        struct list_head *tmp;
3995        mdk_rdev_t *rdev0, *rdev;
3996        mddev_t *mddev;
3997        char b[BDEVNAME_SIZE];
3998
3999        printk(KERN_INFO "md: autorun ...\n");
4000        while (!list_empty(&pending_raid_disks)) {
4001                int unit;
4002                dev_t dev;
4003                LIST_HEAD(candidates);
4004                rdev0 = list_entry(pending_raid_disks.next,
4005                                         mdk_rdev_t, same_set);
4006
4007                printk(KERN_INFO "md: considering %s ...\n",
4008                        bdevname(rdev0->bdev,b));
4009                INIT_LIST_HEAD(&candidates);
4010                rdev_for_each_list(rdev, tmp, pending_raid_disks)
4011                        if (super_90_load(rdev, rdev0, 0) >= 0) {
4012                                printk(KERN_INFO "md:  adding %s ...\n",
4013                                        bdevname(rdev->bdev,b));
4014                                list_move(&rdev->same_set, &candidates);
4015                        }
4016                /*
4017                 * now we have a set of devices, with all of them having
4018                 * mostly sane superblocks. It's time to allocate the
4019                 * mddev.
4020                 */
4021                if (part) {
4022                        dev = MKDEV(mdp_major,
4023                                    rdev0->preferred_minor << MdpMinorShift);
4024                        unit = MINOR(dev) >> MdpMinorShift;
4025                } else {
4026                        dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
4027                        unit = MINOR(dev);
4028                }
4029                if (rdev0->preferred_minor != unit) {
4030                        printk(KERN_INFO "md: unit number in %s is bad: %d\n",
4031                               bdevname(rdev0->bdev, b), rdev0->preferred_minor);
4032                        break;
4033                }
4034
4035                md_probe(dev, NULL, NULL);
4036                mddev = mddev_find(dev);
4037                if (!mddev || !mddev->gendisk) {
4038                        if (mddev)
4039                                mddev_put(mddev);
4040                        printk(KERN_ERR
4041                                "md: cannot allocate memory for md drive.\n");
4042                        break;
4043                }
4044                if (mddev_lock(mddev)) 
4045                        printk(KERN_WARNING "md: %s locked, cannot run\n",
4046                               mdname(mddev));
4047                else if (mddev->raid_disks || mddev->major_version
4048                         || !list_empty(&mddev->disks)) {
4049                        printk(KERN_WARNING 
4050                                "md: %s already running, cannot run %s\n",
4051                                mdname(mddev), bdevname(rdev0->bdev,b));
4052                        mddev_unlock(mddev);
4053                } else {
4054                        printk(KERN_INFO "md: created %s\n", mdname(mddev));
4055                        mddev->persistent = 1;
4056                        rdev_for_each_list(rdev, tmp, candidates) {
4057                                list_del_init(&rdev->same_set);
4058                                if (bind_rdev_to_array(rdev, mddev))
4059                                        export_rdev(rdev);
4060                        }
4061                        autorun_array(mddev);
4062                        mddev_unlock(mddev);
4063                }
4064                /* on success, candidates will be empty, on error
4065                 * it won't...
4066                 */
4067                rdev_for_each_list(rdev, tmp, candidates) {
4068                        list_del_init(&rdev->same_set);
4069                        export_rdev(rdev);
4070                }
4071                mddev_put(mddev);
4072        }
4073        printk(KERN_INFO "md: ... autorun DONE.\n");
4074}
4075#endif /* !MODULE */
4076
4077static int get_version(void __user * arg)
4078{
4079        mdu_version_t ver;
4080
4081        ver.major = MD_MAJOR_VERSION;
4082        ver.minor = MD_MINOR_VERSION;
4083        ver.patchlevel = MD_PATCHLEVEL_VERSION;
4084
4085        if (copy_to_user(arg, &ver, sizeof(ver)))
4086                return -EFAULT;
4087
4088        return 0;
4089}
4090
4091static int get_array_info(mddev_t * mddev, void __user * arg)
4092{
4093        mdu_array_info_t info;
4094        int nr,working,active,failed,spare;
4095        mdk_rdev_t *rdev;
4096        struct list_head *tmp;
4097
4098        nr=working=active=failed=spare=0;
4099        rdev_for_each(rdev, tmp, mddev) {
4100                nr++;
4101                if (test_bit(Faulty, &rdev->flags))
4102                        failed++;
4103                else {
4104                        working++;
4105                        if (test_bit(In_sync, &rdev->flags))
4106                                active++;       
4107                        else
4108                                spare++;
4109                }
4110        }
4111
4112        info.major_version = mddev->major_version;
4113        info.minor_version = mddev->minor_version;
4114        info.patch_version = MD_PATCHLEVEL_VERSION;
4115        info.ctime         = mddev->ctime;
4116        info.level         = mddev->level;
4117        info.size          = mddev->size;
4118        if (info.size != mddev->size) /* overflow */
4119                info.size = -1;
4120        info.nr_disks      = nr;
4121        info.raid_disks    = mddev->raid_disks;
4122        info.md_minor      = mddev->md_minor;
4123        info.not_persistent= !mddev->persistent;
4124
4125        info.utime         = mddev->utime;
4126        info.state         = 0;
4127        if (mddev->in_sync)
4128                info.state = (1<<MD_SB_CLEAN);
4129        if (mddev->bitmap && mddev->bitmap_offset)
4130                info.state = (1<<MD_SB_BITMAP_PRESENT);
4131        info.active_disks  = active;
4132        info.working_disks = working;
4133        info.failed_disks  = failed;
4134        info.spare_disks   = spare;
4135
4136        info.layout        = mddev->layout;
4137        info.chunk_size    = mddev->chunk_size;
4138
4139        if (copy_to_user(arg, &info, sizeof(info)))
4140                return -EFAULT;
4141
4142        return 0;
4143}
4144
4145static int get_bitmap_file(mddev_t * mddev, void __user * arg)
4146{
4147        mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
4148        char *ptr, *buf = NULL;
4149        int err = -ENOMEM;
4150
4151        if (md_allow_write(mddev))
4152                file = kmalloc(sizeof(*file), GFP_NOIO);
4153        else
4154                file = kmalloc(sizeof(*file), GFP_KERNEL);
4155
4156        if (!file)
4157                goto out;
4158
4159        /* bitmap disabled, zero the first byte and copy out */
4160        if (!mddev->bitmap || !mddev->bitmap->file) {
4161                file->pathname[0] = '\0';
4162                goto copy_out;
4163        }
4164
4165        buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
4166        if (!buf)
4167                goto out;
4168
4169        ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname));
4170        if (IS_ERR(ptr))
4171                goto out;
4172
4173        strcpy(file->pathname, ptr);
4174
4175copy_out:
4176        err = 0;
4177        if (copy_to_user(arg, file, sizeof(*file)))
4178                err = -EFAULT;
4179out:
4180        kfree(buf);
4181        kfree(file);
4182        return err;
4183}
4184
4185static int get_disk_info(mddev_t * mddev, void __user * arg)
4186{
4187        mdu_disk_info_t info;
4188        mdk_rdev_t *rdev;
4189
4190        if (copy_from_user(&info, arg, sizeof(info)))
4191                return -EFAULT;
4192
4193        rdev = find_rdev_nr(mddev, info.number);
4194        if (rdev) {
4195                info.major = MAJOR(rdev->bdev->bd_dev);
4196                info.minor = MINOR(rdev->bdev->bd_dev);
4197                info.raid_disk = rdev->raid_disk;
4198                info.state = 0;
4199                if (test_bit(Faulty, &rdev->flags))
4200                        info.state |= (1<<MD_DISK_FAULTY);
4201                else if (test_bit(In_sync, &rdev->flags)) {
4202                        info.state |= (1<<MD_DISK_ACTIVE);
4203                        info.state |= (1<<MD_DISK_SYNC);
4204                }
4205                if (test_bit(WriteMostly, &rdev->flags))
4206                        info.state |= (1<<MD_DISK_WRITEMOSTLY);
4207        } else {
4208                info.major = info.minor = 0;
4209                info.raid_disk = -1;
4210                info.state = (1<<MD_DISK_REMOVED);
4211        }
4212
4213        if (copy_to_user(arg, &info, sizeof(info)))
4214                return -EFAULT;
4215
4216        return 0;
4217}
4218
4219static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4220{
4221        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
4222        mdk_rdev_t *rdev;
4223        dev_t dev = MKDEV(info->major,info->minor);
4224
4225        if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
4226                return -EOVERFLOW;
4227
4228        if (!mddev->raid_disks) {
4229                int err;
4230                /* expecting a device which has a superblock */
4231                rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
4232                if (IS_ERR(rdev)) {
4233                        printk(KERN_WARNING 
4234                                "md: md_import_device returned %ld\n",
4235                                PTR_ERR(rdev));
4236                        return PTR_ERR(rdev);
4237                }
4238                if (!list_empty(&mddev->disks)) {
4239                        mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
4240                                                        mdk_rdev_t, same_set);
4241                        int err = super_types[mddev->major_version]
4242                                .load_super(rdev, rdev0, mddev->minor_version);
4243                        if (err < 0) {
4244                                printk(KERN_WARNING 
4245                                        "md: %s has different UUID to %s\n",
4246                                        bdevname(rdev->bdev,b), 
4247                                        bdevname(rdev0->bdev,b2));
4248                                export_rdev(rdev);
4249                                return -EINVAL;
4250                        }
4251                }
4252                err = bind_rdev_to_array(rdev, mddev);
4253                if (err)
4254                        export_rdev(rdev);
4255                return err;
4256        }
4257
4258        /*
4259         * add_new_disk can be used once the array is assembled
4260         * to add "hot spares".  They must already have a superblock
4261         * written
4262         */
4263        if (mddev->pers) {
4264                int err;
4265                if (!mddev->pers->hot_add_disk) {
4266                        printk(KERN_WARNING 
4267                                "%s: personality does not support diskops!\n",
4268                               mdname(mddev));
4269                        return -EINVAL;
4270                }
4271                if (mddev->persistent)
4272                        rdev = md_import_device(dev, mddev->major_version,
4273                                                mddev->minor_version);
4274                else
4275                        rdev = md_import_device(dev, -1, -1);
4276                if (IS_ERR(rdev)) {
4277                        printk(KERN_WARNING 
4278                                "md: md_import_device returned %ld\n",
4279                                PTR_ERR(rdev));
4280                        return PTR_ERR(rdev);
4281                }
4282                /* set save_raid_disk if appropriate */
4283                if (!mddev->persistent) {
4284                        if (info->state & (1<<MD_DISK_SYNC)  &&
4285                            info->raid_disk < mddev->raid_disks)
4286                                rdev->raid_disk = info->raid_disk;
4287                        else
4288                                rdev->raid_disk = -1;
4289                } else
4290                        super_types[mddev->major_version].
4291                                validate_super(mddev, rdev);
4292                rdev->saved_raid_disk = rdev->raid_disk;
4293
4294                clear_bit(In_sync, &rdev->flags); /* just to be sure */
4295                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4296                        set_bit(WriteMostly, &rdev->flags);
4297
4298                rdev->raid_disk = -1;
4299                err = bind_rdev_to_array(rdev, mddev);
4300                if (!err && !mddev->pers->hot_remove_disk) {
4301                        /* If there is hot_add_disk but no hot_remove_disk
4302                         * then added disks for geometry changes,
4303                         * and should be added immediately.
4304                         */
4305                        super_types[mddev->major_version].
4306                                validate_super(mddev, rdev);
4307                        err = mddev->pers->hot_add_disk(mddev, rdev);
4308                        if (err)
4309                                unbind_rdev_from_array(rdev);
4310                }
4311                if (err)
4312                        export_rdev(rdev);
4313                else
4314                        sysfs_notify_dirent(rdev->sysfs_state);
4315
4316                md_update_sb(mddev, 1);
4317                if (mddev->degraded)
4318                        set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4319                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4320                md_wakeup_thread(mddev->thread);
4321                return err;
4322        }
4323
4324        /* otherwise, add_new_disk is only allowed
4325         * for major_version==0 superblocks
4326         */
4327        if (mddev->major_version != 0) {
4328                printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
4329                       mdname(mddev));
4330                return -EINVAL;
4331        }
4332
4333        if (!(info->state & (1<<MD_DISK_FAULTY))) {
4334                int err;
4335                rdev = md_import_device(dev, -1, 0);
4336                if (IS_ERR(rdev)) {
4337                        printk(KERN_WARNING 
4338                                "md: error, md_import_device() returned %ld\n",
4339                                PTR_ERR(rdev));
4340                        return PTR_ERR(rdev);
4341                }
4342                rdev->desc_nr = info->number;
4343                if (info->raid_disk < mddev->raid_disks)
4344                        rdev->raid_disk = info->raid_disk;
4345                else
4346                        rdev->raid_disk = -1;
4347
4348                if (rdev->raid_disk < mddev->raid_disks)
4349                        if (info->state & (1<<MD_DISK_SYNC))
4350                                set_bit(In_sync, &rdev->flags);
4351
4352                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4353                        set_bit(WriteMostly, &rdev->flags);
4354
4355                if (!mddev->persistent) {
4356                        printk(KERN_INFO "md: nonpersistent superblock ...\n");
4357                        rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4358                } else 
4359                        rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4360                rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
4361
4362                err = bind_rdev_to_array(rdev, mddev);
4363                if (err) {
4364                        export_rdev(rdev);
4365                        return err;
4366                }
4367        }
4368
4369        return 0;
4370}
4371
4372static int hot_remove_disk(mddev_t * mddev, dev_t dev)
4373{
4374        char b[BDEVNAME_SIZE];
4375        mdk_rdev_t *rdev;
4376
4377        rdev = find_rdev(mddev, dev);
4378        if (!rdev)
4379                return -ENXIO;
4380
4381        if (rdev->raid_disk >= 0)
4382                goto busy;
4383
4384        kick_rdev_from_array(rdev);
4385        md_update_sb(mddev, 1);
4386        md_new_event(mddev);
4387
4388        return 0;
4389busy:
4390        printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
4391                bdevname(rdev->bdev,b), mdname(mddev));
4392        return -EBUSY;
4393}
4394
4395static int hot_add_disk(mddev_t * mddev, dev_t dev)
4396{
4397        char b[BDEVNAME_SIZE];
4398        int err;
4399        mdk_rdev_t *rdev;
4400
4401        if (!mddev->pers)
4402                return -ENODEV;
4403
4404        if (mddev->major_version != 0) {
4405                printk(KERN_WARNING "%s: HOT_ADD may only be used with"
4406                        " version-0 superblocks.\n",
4407                        mdname(mddev));
4408                return -EINVAL;
4409        }
4410        if (!mddev->pers->hot_add_disk) {
4411                printk(KERN_WARNING 
4412                        "%s: personality does not support diskops!\n",
4413                        mdname(mddev));
4414                return -EINVAL;
4415        }
4416
4417        rdev = md_import_device(dev, -1, 0);
4418        if (IS_ERR(rdev)) {
4419                printk(KERN_WARNING 
4420                        "md: error, md_import_device() returned %ld\n",
4421                        PTR_ERR(rdev));
4422                return -EINVAL;
4423        }
4424
4425        if (mddev->persistent)
4426                rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4427        else
4428                rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4429
4430        rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
4431
4432        if (test_bit(Faulty, &rdev->flags)) {
4433                printk(KERN_WARNING 
4434                        "md: can not hot-add faulty %s disk to %s!\n",
4435                        bdevname(rdev->bdev,b), mdname(mddev));
4436                err = -EINVAL;
4437                goto abort_export;
4438        }
4439        clear_bit(In_sync, &rdev->flags);
4440        rdev->desc_nr = -1;
4441        rdev->saved_raid_disk = -1;
4442        err = bind_rdev_to_array(rdev, mddev);
4443        if (err)
4444                goto abort_export;
4445
4446        /*
4447         * The rest should better be atomic, we can have disk failures
4448         * noticed in interrupt contexts ...
4449         */
4450
4451        if (rdev->desc_nr == mddev->max_disks) {
4452                printk(KERN_WARNING "%s: can not hot-add to full array!\n",
4453                        mdname(mddev));
4454                err = -EBUSY;
4455                goto abort_unbind_export;
4456        }
4457
4458        rdev->raid_disk = -1;
4459
4460        md_update_sb(mddev, 1);
4461
4462        /*
4463         * Kick recovery, maybe this spare has to be added to the
4464         * array immediately.
4465         */
4466        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4467        md_wakeup_thread(mddev->thread);
4468        md_new_event(mddev);
4469        return 0;
4470
4471abort_unbind_export:
4472        unbind_rdev_from_array(rdev);
4473
4474abort_export:
4475        export_rdev(rdev);
4476        return err;
4477}
4478
4479static int set_bitmap_file(mddev_t *mddev, int fd)
4480{
4481        int err;
4482
4483        if (mddev->pers) {
4484                if (!mddev->pers->quiesce)
4485                        return -EBUSY;
4486                if (mddev->recovery || mddev->sync_thread)
4487                        return -EBUSY;
4488                /* we should be able to change the bitmap.. */
4489        }
4490
4491
4492        if (fd >= 0) {
4493                if (mddev->bitmap)
4494                        return -EEXIST; /* cannot add when bitmap is present */
4495                mddev->bitmap_file = fget(fd);
4496
4497                if (mddev->bitmap_file == NULL) {
4498                        printk(KERN_ERR "%s: error: failed to get bitmap file\n",
4499                               mdname(mddev));
4500                        return -EBADF;
4501                }
4502
4503                err = deny_bitmap_write_access(mddev->bitmap_file);
4504                if (err) {
4505                        printk(KERN_ERR "%s: error: bitmap file is already in use\n",
4506                               mdname(mddev));
4507                        fput(mddev->bitmap_file);
4508                        mddev->bitmap_file = NULL;
4509                        return err;
4510                }
4511                mddev->bitmap_offset = 0; /* file overrides offset */
4512        } else if (mddev->bitmap == NULL)
4513                return -ENOENT; /* cannot remove what isn't there */
4514        err = 0;
4515        if (mddev->pers) {
4516                mddev->pers->quiesce(mddev, 1);
4517                if (fd >= 0)
4518                        err = bitmap_create(mddev);
4519                if (fd < 0 || err) {
4520                        bitmap_destroy(mddev);
4521                        fd = -1; /* make sure to put the file */
4522                }
4523                mddev->pers->quiesce(mddev, 0);
4524        }
4525        if (fd < 0) {
4526                if (mddev->bitmap_file) {
4527                        restore_bitmap_write_access(mddev->bitmap_file);
4528                        fput(mddev->bitmap_file);
4529                }
4530                mddev->bitmap_file = NULL;
4531        }
4532
4533        return err;
4534}
4535
4536/*
4537 * set_array_info is used two different ways
4538 * The original usage is when creating a new array.
4539 * In this usage, raid_disks is > 0 and it together with
4540 *  level, size, not_persistent,layout,chunksize determine the
4541 *  shape of the array.
4542 *  This will always create an array with a type-0.90.0 superblock.
4543 * The newer usage is when assembling an array.
4544 *  In this case raid_disks will be 0, and the major_version field is
4545 *  use to determine which style super-blocks are to be found on the devices.
4546 *  The minor and patch _version numbers are also kept incase the
4547 *  super_block handler wishes to interpret them.
4548 */
4549static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
4550{
4551
4552        if (info->raid_disks == 0) {
4553                /* just setting version number for superblock loading */
4554                if (info->major_version < 0 ||
4555                    info->major_version >= ARRAY_SIZE(super_types) ||
4556                    super_types[info->major_version].name == NULL) {
4557                        /* maybe try to auto-load a module? */
4558                        printk(KERN_INFO 
4559                                "md: superblock version %d not known\n",
4560                                info->major_version);
4561                        return -EINVAL;
4562                }
4563                mddev->major_version = info->major_version;
4564                mddev->minor_version = info->minor_version;
4565                mddev->patch_version = info->patch_version;
4566                mddev->persistent = !info->not_persistent;
4567                return 0;
4568        }
4569        mddev->major_version = MD_MAJOR_VERSION;
4570        mddev->minor_version = MD_MINOR_VERSION;
4571        mddev->patch_version = MD_PATCHLEVEL_VERSION;
4572        mddev->ctime         = get_seconds();
4573
4574        mddev->level         = info->level;
4575        mddev->clevel[0]     = 0;
4576        mddev->size          = info->size;
4577        mddev->raid_disks    = info->raid_disks;
4578        /* don't set md_minor, it is determined by which /dev/md* was
4579         * openned
4580         */
4581        if (info->state & (1<<MD_SB_CLEAN))
4582                mddev->recovery_cp = MaxSector;
4583        else
4584                mddev->recovery_cp = 0;
4585        mddev->persistent    = ! info->not_persistent;
4586        mddev->external      = 0;
4587
4588        mddev->layout        = info->layout;
4589        mddev->chunk_size    = info->chunk_size;
4590
4591        mddev->max_disks     = MD_SB_DISKS;
4592
4593        if (mddev->persistent)
4594                mddev->flags         = 0;
4595        set_bit(MD_CHANGE_DEVS, &mddev->flags);
4596
4597        mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
4598        mddev->bitmap_offset = 0;
4599
4600        mddev->reshape_position = MaxSector;
4601
4602        /*
4603         * Generate a 128 bit UUID
4604         */
4605        get_random_bytes(mddev->uuid, 16);
4606
4607        mddev->new_level = mddev->level;
4608        mddev->new_chunk = mddev->chunk_size;
4609        mddev->new_layout = mddev->layout;
4610        mddev->delta_disks = 0;
4611
4612        return 0;
4613}
4614
4615static int update_size(mddev_t *mddev, sector_t num_sectors)
4616{
4617        mdk_rdev_t * rdev;
4618        int rv;
4619        struct list_head *tmp;
4620        int fit = (num_sectors == 0);
4621
4622        if (mddev->pers->resize == NULL)
4623                return -EINVAL;
4624        /* The "num_sectors" is the number of sectors of each device that
4625         * is used.  This can only make sense for arrays with redundancy.
4626         * linear and raid0 always use whatever space is available. We can only
4627         * consider changing this number if no resync or reconstruction is
4628         * happening, and if the new size is acceptable. It must fit before the
4629         * sb_start or, if that is <data_offset, it must fit before the size
4630         * of each device.  If num_sectors is zero, we find the largest size
4631         * that fits.
4632
4633         */
4634        if (mddev->sync_thread)
4635                return -EBUSY;
4636        if (mddev->bitmap)
4637                /* Sorry, cannot grow a bitmap yet, just remove it,
4638                 * grow, and re-add.
4639                 */
4640                return -EBUSY;
4641        rdev_for_each(rdev, tmp, mddev) {
4642                sector_t avail;
4643                avail = rdev->size * 2;
4644
4645                if (fit && (num_sectors == 0 || num_sectors > avail))
4646                        num_sectors = avail;
4647                if (avail < num_sectors)
4648                        return -ENOSPC;
4649        }
4650        rv = mddev->pers->resize(mddev, num_sectors);
4651        if (!rv) {
4652                struct block_device *bdev;
4653
4654                bdev = bdget_disk(mddev->gendisk, 0);
4655                if (bdev) {
4656                        mutex_lock(&bdev->bd_inode->i_mutex);
4657                        i_size_write(bdev->bd_inode,
4658                                     (loff_t)mddev->array_sectors << 9);
4659                        mutex_unlock(&bdev->bd_inode->i_mutex);
4660                        bdput(bdev);
4661                }
4662        }
4663        return rv;
4664}
4665
4666static int update_raid_disks(mddev_t *mddev, int raid_disks)
4667{
4668        int rv;
4669        /* change the number of raid disks */
4670        if (mddev->pers->check_reshape == NULL)
4671                return -EINVAL;
4672        if (raid_disks <= 0 ||
4673            raid_disks >= mddev->max_disks)
4674                return -EINVAL;
4675        if (mddev->sync_thread || mddev->reshape_position != MaxSector)
4676                return -EBUSY;
4677        mddev->delta_disks = raid_disks - mddev->raid_disks;
4678
4679        rv = mddev->pers->check_reshape(mddev);
4680        return rv;
4681}
4682
4683
4684/*
4685 * update_array_info is used to change the configuration of an
4686 * on-line array.
4687 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
4688 * fields in the info are checked against the array.
4689 * Any differences that cannot be handled will cause an error.
4690 * Normally, only one change can be managed at a time.
4691 */
4692static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
4693{
4694        int rv = 0;
4695        int cnt = 0;
4696        int state = 0;
4697
4698        /* calculate expected state,ignoring low bits */
4699        if (mddev->bitmap && mddev->bitmap_offset)
4700                state |= (1 << MD_SB_BITMAP_PRESENT);
4701
4702        if (mddev->major_version != info->major_version ||
4703            mddev->minor_version != info->minor_version ||
4704/*          mddev->patch_version != info->patch_version || */
4705            mddev->ctime         != info->ctime         ||
4706            mddev->level         != info->level         ||
4707/*          mddev->layout        != info->layout        || */
4708            !mddev->persistent   != info->not_persistent||
4709            mddev->chunk_size    != info->chunk_size    ||
4710            /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
4711            ((state^info->state) & 0xfffffe00)
4712                )
4713                return -EINVAL;
4714        /* Check there is only one change */
4715        if (info->size >= 0 && mddev->size != info->size) cnt++;
4716        if (mddev->raid_disks != info->raid_disks) cnt++;
4717        if (mddev->layout != info->layout) cnt++;
4718        if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
4719        if (cnt == 0) return 0;
4720        if (cnt > 1) return -EINVAL;
4721
4722        if (mddev->layout != info->layout) {
4723                /* Change layout
4724                 * we don't need to do anything at the md level, the
4725                 * personality will take care of it all.
4726                 */
4727                if (mddev->pers->reconfig == NULL)
4728                        return -EINVAL;
4729                else
4730                        return mddev->pers->reconfig(mddev, info->layout, -1);
4731        }
4732        if (info->size >= 0 && mddev->size != info->size)
4733                rv = update_size(mddev, (sector_t)info->size * 2);
4734
4735        if (mddev->raid_disks    != info->raid_disks)
4736                rv = update_raid_disks(mddev, info->raid_disks);
4737
4738        if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
4739                if (mddev->pers->quiesce == NULL)
4740                        return -EINVAL;
4741                if (mddev->recovery || mddev->sync_thread)
4742                        return -EBUSY;
4743                if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
4744                        /* add the bitmap */
4745                        if (mddev->bitmap)
4746                                return -EEXIST;
4747                        if (mddev->default_bitmap_offset == 0)
4748                                return -EINVAL;
4749                        mddev->bitmap_offset = mddev->default_bitmap_offset;
4750                        mddev->pers->quiesce(mddev, 1);
4751                        rv = bitmap_create(mddev);
4752                        if (rv)
4753                                bitmap_destroy(mddev);
4754                        mddev->pers->quiesce(mddev, 0);
4755                } else {
4756                        /* remove the bitmap */
4757                        if (!mddev->bitmap)
4758                                return -ENOENT;
4759                        if (mddev->bitmap->file)
4760                                return -EINVAL;
4761                        mddev->pers->quiesce(mddev, 1);
4762                        bitmap_destroy(mddev);
4763                        mddev->pers->quiesce(mddev, 0);
4764                        mddev->bitmap_offset = 0;
4765                }
4766        }
4767        md_update_sb(mddev, 1);
4768        return rv;
4769}
4770
4771static int set_disk_faulty(mddev_t *mddev, dev_t dev)
4772{
4773        mdk_rdev_t *rdev;
4774
4775        if (mddev->pers == NULL)
4776                return -ENODEV;
4777
4778        rdev = find_rdev(mddev, dev);
4779        if (!rdev)
4780                return -ENODEV;
4781
4782        md_error(mddev, rdev);
4783        return 0;
4784}
4785
4786/*
4787 * We have a problem here : there is no easy way to give a CHS
4788 * virtual geometry. We currently pretend that we have a 2 heads
4789 * 4 sectors (with a BIG number of cylinders...). This drives
4790 * dosfs just mad... ;-)
4791 */
4792static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
4793{
4794        mddev_t *mddev = bdev->bd_disk->private_data;
4795
4796        geo->heads = 2;
4797        geo->sectors = 4;
4798        geo->cylinders = get_capacity(mddev->gendisk) / 8;
4799        return 0;
4800}
4801
4802static int md_ioctl(struct block_device *bdev, fmode_t mode,
4803                        unsigned int cmd, unsigned long arg)
4804{
4805        int err = 0;
4806        void __user *argp = (void __user *)arg;
4807        mddev_t *mddev = NULL;
4808
4809        if (!capable(CAP_SYS_ADMIN))
4810                return -EACCES;
4811
4812        /*
4813         * Commands dealing with the RAID driver but not any
4814         * particular array:
4815         */
4816        switch (cmd)
4817        {
4818                case RAID_VERSION:
4819                        err = get_version(argp);
4820                        goto done;
4821
4822                case PRINT_RAID_DEBUG:
4823                        err = 0;
4824                        md_print_devices();
4825                        goto done;
4826
4827#ifndef MODULE
4828                case RAID_AUTORUN:
4829                        err = 0;
4830                        autostart_arrays(arg);
4831                        goto done;
4832#endif
4833                default:;
4834        }
4835
4836        /*
4837         * Commands creating/starting a new array:
4838         */
4839
4840        mddev = bdev->bd_disk->private_data;
4841
4842        if (!mddev) {
4843                BUG();
4844                goto abort;
4845        }
4846
4847        err = mddev_lock(mddev);
4848        if (err) {
4849                printk(KERN_INFO 
4850                        "md: ioctl lock interrupted, reason %d, cmd %d\n",
4851                        err, cmd);
4852                goto abort;
4853        }
4854
4855        switch (cmd)
4856        {
4857                case SET_ARRAY_INFO:
4858                        {
4859                                mdu_array_info_t info;
4860                                if (!arg)
4861                                        memset(&info, 0, sizeof(info));
4862                                else if (copy_from_user(&info, argp, sizeof(info))) {
4863                                        err = -EFAULT;
4864                                        goto abort_unlock;
4865                                }
4866                                if (mddev->pers) {
4867                                        err = update_array_info(mddev, &info);
4868                                        if (err) {
4869                                                printk(KERN_WARNING "md: couldn't update"
4870                                                       " array info. %d\n", err);
4871                                                goto abort_unlock;
4872                                        }
4873                                        goto done_unlock;
4874                                }
4875                                if (!list_empty(&mddev->disks)) {
4876                                        printk(KERN_WARNING
4877                                               "md: array %s already has disks!\n",
4878                                               mdname(mddev));
4879                                        err = -EBUSY;
4880                                        goto abort_unlock;
4881                                }
4882                                if (mddev->raid_disks) {
4883                                        printk(KERN_WARNING
4884                                               "md: array %s already initialised!\n",
4885                                               mdname(mddev));
4886                                        err = -EBUSY;
4887                                        goto abort_unlock;
4888                                }
4889                                err = set_array_info(mddev, &info);
4890                                if (err) {
4891                                        printk(KERN_WARNING "md: couldn't set"
4892                                               " array info. %d\n", err);
4893                                        goto abort_unlock;
4894                                }
4895                        }
4896                        goto done_unlock;
4897
4898                default:;
4899        }
4900
4901        /*
4902         * Commands querying/configuring an existing array:
4903         */
4904        /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
4905         * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
4906        if ((!mddev->raid_disks && !mddev->external)
4907            && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
4908            && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
4909            && cmd != GET_BITMAP_FILE) {
4910                err = -ENODEV;
4911                goto abort_unlock;
4912        }
4913
4914        /*
4915         * Commands even a read-only array can execute:
4916         */
4917        switch (cmd)
4918        {
4919                case GET_ARRAY_INFO:
4920                        err = get_array_info(mddev, argp);
4921                        goto done_unlock;
4922
4923                case GET_BITMAP_FILE:
4924                        err = get_bitmap_file(mddev, argp);
4925                        goto done_unlock;
4926
4927                case GET_DISK_INFO:
4928                        err = get_disk_info(mddev, argp);
4929                        goto done_unlock;
4930
4931                case RESTART_ARRAY_RW:
4932                        err = restart_array(mddev);
4933                        goto done_unlock;
4934
4935                case STOP_ARRAY:
4936                        err = do_md_stop(mddev, 0, 1);
4937                        goto done_unlock;
4938
4939                case STOP_ARRAY_RO:
4940                        err = do_md_stop(mddev, 1, 1);
4941                        goto done_unlock;
4942
4943        }
4944
4945        /*
4946         * The remaining ioctls are changing the state of the
4947         * superblock, so we do not allow them on read-only arrays.
4948         * However non-MD ioctls (e.g. get-size) will still come through
4949         * here and hit the 'default' below, so only disallow
4950         * 'md' ioctls, and switch to rw mode if started auto-readonly.
4951         */
4952        if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
4953                if (mddev->ro == 2) {
4954                        mddev->ro = 0;
4955                        sysfs_notify_dirent(mddev->sysfs_state);
4956                        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4957                        md_wakeup_thread(mddev->thread);
4958                } else {
4959                        err = -EROFS;
4960                        goto abort_unlock;
4961                }
4962        }
4963
4964        switch (cmd)
4965        {
4966                case ADD_NEW_DISK:
4967                {
4968                        mdu_disk_info_t info;
4969                        if (copy_from_user(&info, argp, sizeof(info)))
4970                                err = -EFAULT;
4971                        else
4972                                err = add_new_disk(mddev, &info);
4973                        goto done_unlock;
4974                }
4975
4976                case HOT_REMOVE_DISK:
4977                        err = hot_remove_disk(mddev, new_decode_dev(arg));
4978                        goto done_unlock;
4979
4980                case HOT_ADD_DISK:
4981                        err = hot_add_disk(mddev, new_decode_dev(arg));
4982                        goto done_unlock;
4983
4984                case SET_DISK_FAULTY:
4985                        err = set_disk_faulty(mddev, new_decode_dev(arg));
4986                        goto done_unlock;
4987
4988                case RUN_ARRAY:
4989                        err = do_md_run(mddev);
4990                        goto done_unlock;
4991
4992                case SET_BITMAP_FILE:
4993                        err = set_bitmap_file(mddev, (int)arg);
4994                        goto done_unlock;
4995
4996                default:
4997                        err = -EINVAL;
4998                        goto abort_unlock;
4999        }
5000
5001done_unlock:
5002abort_unlock:
5003        mddev_unlock(mddev);
5004
5005        return err;
5006done:
5007        if (err)
5008                MD_BUG();
5009abort:
5010        return err;
5011}
5012
5013static int md_open(struct block_device *bdev, fmode_t mode)
5014{
5015        /*
5016         * Succeed if we can lock the mddev, which confirms that
5017         * it isn't being stopped right now.
5018         */
5019        mddev_t *mddev = bdev->bd_disk->private_data;
5020        int err;
5021
5022        if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
5023                goto out;
5024
5025        err = 0;
5026        mddev_get(mddev);
5027        atomic_inc(&mddev->openers);
5028        mddev_unlock(mddev);
5029
5030        check_disk_change(bdev);
5031 out:
5032        return err;
5033}
5034
5035static int md_release(struct gendisk *disk, fmode_t mode)
5036{
5037        mddev_t *mddev = disk->private_data;
5038
5039        BUG_ON(!mddev);
5040        atomic_dec(&mddev->openers);
5041        mddev_put(mddev);
5042
5043        return 0;
5044}
5045
5046static int md_media_changed(struct gendisk *disk)
5047{
5048        mddev_t *mddev = disk->private_data;
5049
5050        return mddev->changed;
5051}
5052
5053static int md_revalidate(struct gendisk *disk)
5054{
5055        mddev_t *mddev = disk->private_data;
5056
5057        mddev->changed = 0;
5058        return 0;
5059}
5060static struct block_device_operations md_fops =
5061{
5062        .owner          = THIS_MODULE,
5063        .open           = md_open,
5064        .release        = md_release,
5065        .locked_ioctl   = md_ioctl,
5066        .getgeo         = md_getgeo,
5067        .media_changed  = md_media_changed,
5068        .revalidate_disk= md_revalidate,
5069};
5070
5071static int md_thread(void * arg)
5072{
5073        mdk_thread_t *thread = arg;
5074
5075        /*
5076         * md_thread is a 'system-thread', it's priority should be very
5077         * high. We avoid resource deadlocks individually in each
5078         * raid personality. (RAID5 does preallocation) We also use RR and
5079         * the very same RT priority as kswapd, thus we will never get
5080         * into a priority inversion deadlock.
5081         *
5082         * we definitely have to have equal or higher priority than
5083         * bdflush, otherwise bdflush will deadlock if there are too
5084         * many dirty RAID5 blocks.
5085         */
5086
5087        allow_signal(SIGKILL);
5088        while (!kthread_should_stop()) {
5089
5090                /* We need to wait INTERRUPTIBLE so that
5091                 * we don't add to the load-average.
5092                 * That means we need to be sure no signals are
5093                 * pending
5094                 */
5095                if (signal_pending(current))
5096                        flush_signals(current);
5097
5098                wait_event_interruptible_timeout
5099                        (thread->wqueue,
5100                         test_bit(THREAD_WAKEUP, &thread->flags)
5101                         || kthread_should_stop(),
5102                         thread->timeout);
5103
5104                clear_bit(THREAD_WAKEUP, &thread->flags);
5105
5106                thread->run(thread->mddev);
5107        }
5108
5109        return 0;
5110}
5111
5112void md_wakeup_thread(mdk_thread_t *thread)
5113{
5114        if (thread) {
5115                dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
5116                set_bit(THREAD_WAKEUP, &thread->flags);
5117                wake_up(&thread->wqueue);
5118        }
5119}
5120
5121mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
5122                                 const char *name)
5123{
5124        mdk_thread_t *thread;
5125
5126        thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
5127        if (!thread)
5128                return NULL;
5129
5130        init_waitqueue_head(&thread->wqueue);
5131
5132        thread->run = run;
5133        thread->mddev = mddev;
5134        thread->timeout = MAX_SCHEDULE_TIMEOUT;
5135        thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
5136        if (IS_ERR(thread->tsk)) {
5137                kfree(thread);
5138                return NULL;
5139        }
5140        return thread;
5141}
5142
5143void md_unregister_thread(mdk_thread_t *thread)
5144{
5145        dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
5146
5147        kthread_stop(thread->tsk);
5148        kfree(thread);
5149}
5150
5151void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
5152{
5153        if (!mddev) {
5154                MD_BUG();
5155                return;
5156        }
5157
5158        if (!rdev || test_bit(Faulty, &rdev->flags))
5159                return;
5160
5161        if (mddev->external)
5162                set_bit(Blocked, &rdev->flags);
5163/*
5164        dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
5165                mdname(mddev),
5166                MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
5167                __builtin_return_address(0),__builtin_return_address(1),
5168                __builtin_return_address(2),__builtin_return_address(3));
5169*/
5170        if (!mddev->pers)
5171                return;
5172        if (!mddev->pers->error_handler)
5173                return;
5174        mddev->pers->error_handler(mddev,rdev);
5175        if (mddev->degraded)
5176                set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5177        set_bit(StateChanged, &rdev->flags);
5178        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5179        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5180        md_wakeup_thread(mddev->thread);
5181        md_new_event_inintr(mddev);
5182}
5183
/* seq_file implementation of /proc/mdstat */
5185
5186static void status_unused(struct seq_file *seq)
5187{
5188        int i = 0;
5189        mdk_rdev_t *rdev;
5190        struct list_head *tmp;
5191
5192        seq_printf(seq, "unused devices: ");
5193
5194        rdev_for_each_list(rdev, tmp, pending_raid_disks) {
5195                char b[BDEVNAME_SIZE];
5196                i++;
5197                seq_printf(seq, "%s ",
5198                              bdevname(rdev->bdev,b));
5199        }
5200        if (!i)
5201                seq_printf(seq, "<none>");
5202
5203        seq_printf(seq, "\n");
5204}
5205
5206
5207static void status_resync(struct seq_file *seq, mddev_t * mddev)
5208{
5209        sector_t max_blocks, resync, res;
5210        unsigned long dt, db, rt;
5211        int scale;
5212        unsigned int per_milli;
5213
5214        resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
5215
5216        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5217                max_blocks = mddev->resync_max_sectors >> 1;
5218        else
5219                max_blocks = mddev->size;
5220
5221        /*
5222         * Should not happen.
5223         */
5224        if (!max_blocks) {
5225                MD_BUG();
5226                return;
5227        }
5228        /* Pick 'scale' such that (resync>>scale)*1000 will fit
5229         * in a sector_t, and (max_blocks>>scale) will fit in a
5230         * u32, as those are the requirements for sector_div.
5231         * Thus 'scale' must be at least 10
5232         */
5233        scale = 10;
5234        if (sizeof(sector_t) > sizeof(unsigned long)) {
5235                while ( max_blocks/2 > (1ULL<<(scale+32)))
5236                        scale++;
5237        }
5238        res = (resync>>scale)*1000;
5239        sector_div(res, (u32)((max_blocks>>scale)+1));
5240
5241        per_milli = res;
5242        {
5243                int i, x = per_milli/50, y = 20-x;
5244                seq_printf(seq, "[");
5245                for (i = 0; i < x; i++)
5246                        seq_printf(seq, "=");
5247                seq_printf(seq, ">");
5248                for (i = 0; i < y; i++)
5249                        seq_printf(seq, ".");
5250                seq_printf(seq, "] ");
5251        }
5252        seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
5253                   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
5254                    "reshape" :
5255                    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
5256                     "check" :
5257                     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
5258                      "resync" : "recovery"))),
5259                   per_milli/10, per_milli % 10,
5260                   (unsigned long long) resync,
5261                   (unsigned long long) max_blocks);
5262
5263        /*
5264         * We do not want to overflow, so the order of operands and
5265         * the * 100 / 100 trick are important. We do a +1 to be
5266         * safe against division by zero. We only estimate anyway.
5267         *
5268         * dt: time from mark until now
5269         * db: blocks written from mark until now
5270         * rt: remaining time
5271         */
5272        dt = ((jiffies - mddev->resync_mark) / HZ);
5273        if (!dt) dt++;
5274        db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
5275                - mddev->resync_mark_cnt;
5276        rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;
5277
5278        seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
5279
5280        seq_printf(seq, " speed=%ldK/sec", db/2/dt);
5281}
5282
5283static void *md_seq_start(struct seq_file *seq, loff_t *pos)
5284{
5285        struct list_head *tmp;
5286        loff_t l = *pos;
5287        mddev_t *mddev;
5288
5289        if (l >= 0x10000)
5290                return NULL;
5291        if (!l--)
5292                /* header */
5293                return (void*)1;
5294
5295        spin_lock(&all_mddevs_lock);
5296        list_for_each(tmp,&all_mddevs)
5297                if (!l--) {
5298                        mddev = list_entry(tmp, mddev_t, all_mddevs);
5299                        mddev_get(mddev);
5300                        spin_unlock(&all_mddevs_lock);
5301                        return mddev;
5302                }
5303        spin_unlock(&all_mddevs_lock);
5304        if (!l--)
5305                return (void*)2;/* tail */
5306        return NULL;
5307}
5308
5309static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
5310{
5311        struct list_head *tmp;
5312        mddev_t *next_mddev, *mddev = v;
5313        
5314        ++*pos;
5315        if (v == (void*)2)
5316                return NULL;
5317
5318        spin_lock(&all_mddevs_lock);
5319        if (v == (void*)1)
5320                tmp = all_mddevs.next;
5321        else
5322                tmp = mddev->all_mddevs.next;
5323        if (tmp != &all_mddevs)
5324                next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
5325        else {
5326                next_mddev = (void*)2;
5327                *pos = 0x10000;
5328        }               
5329        spin_unlock(&all_mddevs_lock);
5330
5331        if (