linux/drivers/md/md.c
<<
>>
Prefs
   1/*
   2   md.c : Multiple Devices driver for Linux
   3          Copyright (C) 1998, 1999, 2000 Ingo Molnar
   4
   5     completely rewritten, based on the MD driver code from Marc Zyngier
   6
   7   Changes:
   8
   9   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
  10   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
  11   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
  12   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
  13   - kmod support by: Cyrus Durgin
  14   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
  15   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
  16
  17   - lots of fixes and improvements to the RAID1/RAID5 and generic
  18     RAID code (such as request based resynchronization):
  19
  20     Neil Brown <neilb@cse.unsw.edu.au>.
  21
  22   - persistent bitmap code
  23     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
  24
  25   This program is free software; you can redistribute it and/or modify
  26   it under the terms of the GNU General Public License as published by
  27   the Free Software Foundation; either version 2, or (at your option)
  28   any later version.
  29
  30   You should have received a copy of the GNU General Public License
  31   (for example /usr/src/linux/COPYING); if not, write to the Free
  32   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  33*/
  34
  35#include <linux/kthread.h>
  36#include <linux/blkdev.h>
  37#include <linux/sysctl.h>
  38#include <linux/seq_file.h>
  39#include <linux/buffer_head.h> /* for invalidate_bdev */
  40#include <linux/poll.h>
  41#include <linux/ctype.h>
  42#include <linux/hdreg.h>
  43#include <linux/proc_fs.h>
  44#include <linux/random.h>
  45#include <linux/reboot.h>
  46#include <linux/file.h>
  47#include <linux/delay.h>
  48#include <linux/raid/md_p.h>
  49#include <linux/raid/md_u.h>
  50#include "md.h"
  51#include "bitmap.h"
  52
/*
 * Compile-time switch for md debug output.  With DEBUG at 0 the && in
 * dprintk() short-circuits, so printk() is never called, while the
 * arguments are still seen (and type-checked) by the compiler.
 */
#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))
  55
  56
/* Forward declaration; only present when md is not built as a module. */
#ifndef MODULE
static void autostart_arrays(int part);
#endif

/* List of personalities; find_pers() walks it. */
static LIST_HEAD(pers_list);
/* presumably guards pers_list -- confirm at the register/unregister sites */
static DEFINE_SPINLOCK(pers_lock);

/* Forward declaration; called from the MD_BUG() macro. */
static void md_print_devices(void);

/* Wait queue for resync; its users are outside this section. */
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
  67
  68#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
  69
  70/*
  71 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
  72 * is 1000 KB/sec, so the extra system load does not show up that much.
  73 * Increase it if you want to have more _guaranteed_ speed. Note that
  74 * the RAID driver will use the maximum available bandwidth if the IO
  75 * subsystem is idle. There is also an 'absolute maximum' reconstruction
  76 * speed limit - in case reconstruction slows down your system despite
  77 * idle IO detection.
  78 *
  79 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
  80 * or /sys/block/mdX/md/sync_speed_{min,max}
  81 */
  82
/*
 * System-wide default resync speed limits.  speed_min()/speed_max() fall
 * back to these when the array's own sync_speed_min/sync_speed_max is zero;
 * raid_table[] exposes them as "speed_limit_min" and "speed_limit_max".
 */
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
  85static inline int speed_min(mddev_t *mddev)
  86{
  87        return mddev->sync_speed_min ?
  88                mddev->sync_speed_min : sysctl_speed_limit_min;
  89}
  90
  91static inline int speed_max(mddev_t *mddev)
  92{
  93        return mddev->sync_speed_max ?
  94                mddev->sync_speed_max : sysctl_speed_limit_max;
  95}
  96
/*
 * presumably holds the handle obtained when the sysctl tree is registered;
 * the registration itself is outside this section -- confirm there.
 */
static struct ctl_table_header *raid_table_header;

/*
 * Leaf sysctl entries: the two integer speed limits.  Both are readable by
 * everyone and writable by the owner, and are handled by proc_dointvec.
 * The all-zero entry terminates the table.
 */
static ctl_table raid_table[] = {
        {
                .ctl_name       = DEV_RAID_SPEED_LIMIT_MIN,
                .procname       = "speed_limit_min",
                .data           = &sysctl_speed_limit_min,
                .maxlen         = sizeof(int),
                .mode           = S_IRUGO|S_IWUSR,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = DEV_RAID_SPEED_LIMIT_MAX,
                .procname       = "speed_limit_max",
                .data           = &sysctl_speed_limit_max,
                .maxlen         = sizeof(int),
                .mode           = S_IRUGO|S_IWUSR,
                .proc_handler   = &proc_dointvec,
        },
        { .ctl_name = 0 }
};
 118
/* The "raid" directory node; its children are the entries of raid_table. */
static ctl_table raid_dir_table[] = {
        {
                .ctl_name       = DEV_RAID,
                .procname       = "raid",
                .maxlen         = 0,
                .mode           = S_IRUGO|S_IXUGO,
                .child          = raid_table,
        },
        { .ctl_name = 0 }
};
 129
/*
 * The "dev" directory node that parents raid_dir_table; together the three
 * tables describe dev/raid/speed_limit_{min,max}.
 */
static ctl_table raid_root_table[] = {
        {
                .ctl_name       = CTL_DEV,
                .procname       = "dev",
                .maxlen         = 0,
                .mode           = 0555,
                .child          = raid_dir_table,
        },
        { .ctl_name = 0 }
};
 140
/* Tentative definition; the initialised md_fops is outside this section. */
static struct block_device_operations md_fops;

/* NOTE(review): name suggests arrays start read-only when set -- confirm at its users. */
static int start_readonly;
 144
 145/*
 146 * We have a system wide 'event count' that is incremented
 147 * on any 'interesting' event, and readers of /proc/mdstat
 148 * can use 'poll' or 'select' to find out when the event
 149 * count increases.
 150 *
 151 * Events are:
 152 *  start array, stop array, error, add device, remove device,
 153 *  start build, activate spare
 154 */
/* Tasks waiting for the event count to change sleep on this queue. */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
/* The system-wide event counter described above. */
static atomic_t md_event_count;

/*
 * Record one event: increment md_event_count, then wake the waiters on
 * md_event_waiters.  @mddev is not used by the body.
 */
void md_new_event(mddev_t *mddev)
{
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);
 163
/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 *
 * In this version of the file the body is the same as md_new_event():
 * bump md_event_count and wake md_event_waiters.  @mddev is not used.
 */
static void md_new_event_inintr(mddev_t *mddev)
{
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
}
 172
/*
 * List of all existing md arrays, so that they can be iterated over.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);
 179
 180
/*
 * Iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop still owns
 * a reference to the current mddev and must mddev_put it.
 *
 * @mddev is the mddev_t * cursor, @tmp a struct list_head * scratch pointer.
 * On every step the condition clause takes a reference on the entry @tmp
 * points at (unless it is the list head), drops the lock, and only then
 * puts the reference on the previous @mddev.  The loop body therefore runs
 * unlocked with a reference held; the increment clause retakes the lock
 * before advancing @tmp.
 */
#define for_each_mddev(mddev,tmp)                                       \
                                                                        \
        for (({ spin_lock(&all_mddevs_lock);                            \
                tmp = all_mddevs.next;                                  \
                mddev = NULL;});                                        \
             ({ if (tmp != &all_mddevs)                                 \
                        mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
                spin_unlock(&all_mddevs_lock);                          \
                if (mddev) mddev_put(mddev);                            \
                mddev = list_entry(tmp, mddev_t, all_mddevs);           \
                tmp != &all_mddevs;});                                  \
             ({ spin_lock(&all_mddevs_lock);                            \
                tmp = tmp->next;})                                      \
                )
 202
 203
 204/* Rather than calling directly into the personality make_request function,
 205 * IO requests come here first so that we can check if the device is
 206 * being suspended pending a reconfiguration.
 207 * We hold a refcount over the call to ->make_request.  By the time that
 208 * call has finished, the bio has been linked into some internal structure
 209 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 210 */
 211static int md_make_request(struct request_queue *q, struct bio *bio)
 212{
 213        mddev_t *mddev = q->queuedata;
 214        int rv;
 215        if (mddev == NULL || mddev->pers == NULL) {
 216                bio_io_error(bio);
 217                return 0;
 218        }
 219        rcu_read_lock();
 220        if (mddev->suspended) {
 221                DEFINE_WAIT(__wait);
 222                for (;;) {
 223                        prepare_to_wait(&mddev->sb_wait, &__wait,
 224                                        TASK_UNINTERRUPTIBLE);
 225                        if (!mddev->suspended)
 226                                break;
 227                        rcu_read_unlock();
 228                        schedule();
 229                        rcu_read_lock();
 230                }
 231                finish_wait(&mddev->sb_wait, &__wait);
 232        }
 233        atomic_inc(&mddev->active_io);
 234        rcu_read_unlock();
 235        rv = mddev->pers->make_request(q, bio);
 236        if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
 237                wake_up(&mddev->sb_wait);
 238
 239        return rv;
 240}
 241
/*
 * Suspend IO to @mddev.  Sets ->suspended so md_make_request() holds back
 * new requests, waits for an RCU grace period and for active_io to reach
 * zero, then quiesces the personality and unregisters the array's thread.
 * It is a BUG to call this on an array that is already suspended.
 */
static void mddev_suspend(mddev_t *mddev)
{
        BUG_ON(mddev->suspended);
        mddev->suspended = 1;
        /* md_make_request() tests ->suspended inside rcu_read_lock() */
        synchronize_rcu();
        wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
        mddev->pers->quiesce(mddev, 1);
        md_unregister_thread(mddev->thread);
        mddev->thread = NULL;
        /* we now know that no code is executing in the personality module,
         * except possibly the tail end of a ->bi_end_io function, but that
         * is certain to complete before the module has a chance to get
         * unloaded
         */
}
 257
/*
 * Undo the IO hold of mddev_suspend(): clear ->suspended, wake the
 * requests sleeping on sb_wait in md_make_request(), and call the
 * personality's ->quiesce() with 0.
 */
static void mddev_resume(mddev_t *mddev)
{
        mddev->suspended = 0;
        wake_up(&mddev->sb_wait);
        mddev->pers->quiesce(mddev, 0);
}
 264
 265
/*
 * Take a reference on @mddev by incrementing ->active, and return @mddev
 * so the call can be used in an expression.  Dropped with mddev_put().
 */
static inline mddev_t *mddev_get(mddev_t *mddev)
{
        atomic_inc(&mddev->active);
        return mddev;
}
 271
 272static void mddev_delayed_delete(struct work_struct *ws);
 273
 274static void mddev_put(mddev_t *mddev)
 275{
 276        if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
 277                return;
 278        if (!mddev->raid_disks && list_empty(&mddev->disks) &&
 279            !mddev->hold_active) {
 280                list_del(&mddev->all_mddevs);
 281                if (mddev->gendisk) {
 282                        /* we did a probe so need to clean up.
 283                         * Call schedule_work inside the spinlock
 284                         * so that flush_scheduled_work() after
 285                         * mddev_find will succeed in waiting for the
 286                         * work to be done.
 287                         */
 288                        INIT_WORK(&mddev->del_work, mddev_delayed_delete);
 289                        schedule_work(&mddev->del_work);
 290                } else
 291                        kfree(mddev);
 292        }
 293        spin_unlock(&all_mddevs_lock);
 294}
 295
/*
 * Find the mddev for device number @unit, creating it if it does not exist.
 * With @unit == 0 a new mddev is always created, on an unused minor of
 * MD_MAJOR.
 *
 * Returns the mddev with a reference held (mddev_get() on an existing one,
 * ->active initialised to 1 on a new one), or NULL if the allocation fails
 * or the search for an unused minor gives up.
 *
 * The function makes up to two passes.  The first pass searches under
 * all_mddevs_lock; on a miss it drops the lock, allocates and initialises
 * "new", and jumps back to retry.  The second pass searches again, since
 * the list may have changed while unlocked, and either returns the entry
 * someone else added (freeing "new") or links "new" into the list.
 */
static mddev_t * mddev_find(dev_t unit)
{
        mddev_t *mddev, *new = NULL;

 retry:
        spin_lock(&all_mddevs_lock);

        if (unit) {
                list_for_each_entry(mddev, &all_mddevs, all_mddevs)
                        if (mddev->unit == unit) {
                                mddev_get(mddev);
                                spin_unlock(&all_mddevs_lock);
                                /* kfree(NULL) is a no-op on the first pass */
                                kfree(new);
                                return mddev;
                        }

                if (new) {
                        list_add(&new->all_mddevs, &all_mddevs);
                        spin_unlock(&all_mddevs_lock);
                        new->hold_active = UNTIL_IOCTL;
                        return new;
                }
        } else if (new) {
                /* find an unused unit number */
                /* next_minor is static: the search resumes where the
                 * previous call left off
                 */
                static int next_minor = 512;
                int start = next_minor;
                int is_free = 0;
                int dev = 0;
                while (!is_free) {
                        dev = MKDEV(MD_MAJOR, next_minor);
                        next_minor++;
                        if (next_minor > MINORMASK)
                                next_minor = 0;
                        /* NOTE(review): this give-up test runs before the
                         * current candidate is compared with the list, so
                         * the minor just before "start" is never tried --
                         * confirm whether that is intended.
                         */
                        if (next_minor == start) {
                                /* Oh dear, all in use. */
                                spin_unlock(&all_mddevs_lock);
                                kfree(new);
                                return NULL;
                        }

                        is_free = 1;
                        list_for_each_entry(mddev, &all_mddevs, all_mddevs)
                                if (mddev->unit == dev) {
                                        is_free = 0;
                                        break;
                                }
                }
                new->unit = dev;
                new->md_minor = MINOR(dev);
                new->hold_active = UNTIL_STOP;
                list_add(&new->all_mddevs, &all_mddevs);
                spin_unlock(&all_mddevs_lock);
                return new;
        }
        spin_unlock(&all_mddevs_lock);

        /* first pass found nothing: allocate with the spinlock released */
        new = kzalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return NULL;

        new->unit = unit;
        if (MAJOR(unit) == MD_MAJOR)
                new->md_minor = MINOR(unit);
        else
                new->md_minor = MINOR(unit) >> MdpMinorShift;

        mutex_init(&new->open_mutex);
        mutex_init(&new->reconfig_mutex);
        INIT_LIST_HEAD(&new->disks);
        INIT_LIST_HEAD(&new->all_mddevs);
        init_timer(&new->safemode_timer);
        /* the reference that is handed to the caller */
        atomic_set(&new->active, 1);
        atomic_set(&new->openers, 0);
        atomic_set(&new->active_io, 0);
        spin_lock_init(&new->write_lock);
        init_waitqueue_head(&new->sb_wait);
        init_waitqueue_head(&new->recovery_wait);
        new->reshape_position = MaxSector;
        new->resync_min = 0;
        new->resync_max = MaxSector;
        new->level = LEVEL_NONE;

        goto retry;
}
 380
 381static inline int mddev_lock(mddev_t * mddev)
 382{
 383        return mutex_lock_interruptible(&mddev->reconfig_mutex);
 384}
 385
 386static inline int mddev_is_locked(mddev_t *mddev)
 387{
 388        return mutex_is_locked(&mddev->reconfig_mutex);
 389}
 390
 391static inline int mddev_trylock(mddev_t * mddev)
 392{
 393        return mutex_trylock(&mddev->reconfig_mutex);
 394}
 395
/*
 * Release the array's reconfig_mutex, then wake the array's thread.
 * mddev->thread is read only after the mutex has been dropped.
 */
static inline void mddev_unlock(mddev_t * mddev)
{
        mutex_unlock(&mddev->reconfig_mutex);

        md_wakeup_thread(mddev->thread);
}
 402
 403static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
 404{
 405        mdk_rdev_t *rdev;
 406
 407        list_for_each_entry(rdev, &mddev->disks, same_set)
 408                if (rdev->desc_nr == nr)
 409                        return rdev;
 410
 411        return NULL;
 412}
 413
 414static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
 415{
 416        mdk_rdev_t *rdev;
 417
 418        list_for_each_entry(rdev, &mddev->disks, same_set)
 419                if (rdev->bdev->bd_dev == dev)
 420                        return rdev;
 421
 422        return NULL;
 423}
 424
 425static struct mdk_personality *find_pers(int level, char *clevel)
 426{
 427        struct mdk_personality *pers;
 428        list_for_each_entry(pers, &pers_list, list) {
 429                if (level != LEVEL_NONE && pers->level == level)
 430                        return pers;
 431                if (strcmp(pers->name, clevel)==0)
 432                        return pers;
 433        }
 434        return NULL;
 435}
 436
 437/* return the offset of the super block in 512byte sectors */
 438static inline sector_t calc_dev_sboffset(struct block_device *bdev)
 439{
 440        sector_t num_sectors = bdev->bd_inode->i_size / 512;
 441        return MD_NEW_SIZE_SECTORS(num_sectors);
 442}
 443
 444static int alloc_disk_sb(mdk_rdev_t * rdev)
 445{
 446        if (rdev->sb_page)
 447                MD_BUG();
 448
 449        rdev->sb_page = alloc_page(GFP_KERNEL);
 450        if (!rdev->sb_page) {
 451                printk(KERN_ALERT "md: out of memory.\n");
 452                return -ENOMEM;
 453        }
 454
 455        return 0;
 456}
 457
 458static void free_disk_sb(mdk_rdev_t * rdev)
 459{
 460        if (rdev->sb_page) {
 461                put_page(rdev->sb_page);
 462                rdev->sb_loaded = 0;
 463                rdev->sb_page = NULL;
 464                rdev->sb_start = 0;
 465                rdev->sectors = 0;
 466        }
 467}
 468
 469
 470static void super_written(struct bio *bio, int error)
 471{
 472        mdk_rdev_t *rdev = bio->bi_private;
 473        mddev_t *mddev = rdev->mddev;
 474
 475        if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
 476                printk("md: super_written gets error=%d, uptodate=%d\n",
 477                       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
 478                WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
 479                md_error(mddev, rdev);
 480        }
 481
 482        if (atomic_dec_and_test(&mddev->pending_writes))
 483                wake_up(&mddev->sb_wait);
 484        bio_put(bio);
 485}
 486
 487static void super_written_barrier(struct bio *bio, int error)
 488{
 489        struct bio *bio2 = bio->bi_private;
 490        mdk_rdev_t *rdev = bio2->bi_private;
 491        mddev_t *mddev = rdev->mddev;
 492
 493        if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
 494            error == -EOPNOTSUPP) {
 495                unsigned long flags;
 496                /* barriers don't appear to be supported :-( */
 497                set_bit(BarriersNotsupp, &rdev->flags);
 498                mddev->barriers_work = 0;
 499                spin_lock_irqsave(&mddev->write_lock, flags);
 500                bio2->bi_next = mddev->biolist;
 501                mddev->biolist = bio2;
 502                spin_unlock_irqrestore(&mddev->write_lock, flags);
 503                wake_up(&mddev->sb_wait);
 504                bio_put(bio);
 505        } else {
 506                bio_put(bio2);
 507                bio->bi_private = rdev;
 508                super_written(bio, error);
 509        }
 510}
 511
 512void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 513                   sector_t sector, int size, struct page *page)
 514{
 515        /* write first size bytes of page to sector of rdev
 516         * Increment mddev->pending_writes before returning
 517         * and decrement it on completion, waking up sb_wait
 518         * if zero is reached.
 519         * If an error occurred, call md_error
 520         *
 521         * As we might need to resubmit the request if BIO_RW_BARRIER
 522         * causes ENOTSUPP, we allocate a spare bio...
 523         */
 524        struct bio *bio = bio_alloc(GFP_NOIO, 1);
 525        int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
 526
 527        bio->bi_bdev = rdev->bdev;
 528        bio->bi_sector = sector;
 529        bio_add_page(bio, page, size, 0);
 530        bio->bi_private = rdev;
 531        bio->bi_end_io = super_written;
 532        bio->bi_rw = rw;
 533
 534        atomic_inc(&mddev->pending_writes);
 535        if (!test_bit(BarriersNotsupp, &rdev->flags)) {
 536                struct bio *rbio;
 537                rw |= (1<<BIO_RW_BARRIER);
 538                rbio = bio_clone(bio, GFP_NOIO);
 539                rbio->bi_private = bio;
 540                rbio->bi_end_io = super_written_barrier;
 541                submit_bio(rw, rbio);
 542        } else
 543                submit_bio(rw, bio);
 544}
 545
 546void md_super_wait(mddev_t *mddev)
 547{
 548        /* wait for all superblock writes that were scheduled to complete.
 549         * if any had to be retried (due to BARRIER problems), retry them
 550         */
 551        DEFINE_WAIT(wq);
 552        for(;;) {
 553                prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
 554                if (atomic_read(&mddev->pending_writes)==0)
 555                        break;
 556                while (mddev->biolist) {
 557                        struct bio *bio;
 558                        spin_lock_irq(&mddev->write_lock);
 559                        bio = mddev->biolist;
 560                        mddev->biolist = bio->bi_next ;
 561                        bio->bi_next = NULL;
 562                        spin_unlock_irq(&mddev->write_lock);
 563                        submit_bio(bio->bi_rw, bio);
 564                }
 565                schedule();
 566        }
 567        finish_wait(&mddev->sb_wait, &wq);
 568}
 569
 570static void bi_complete(struct bio *bio, int error)
 571{
 572        complete((struct completion*)bio->bi_private);
 573}
 574
 575int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 576                   struct page *page, int rw)
 577{
 578        struct bio *bio = bio_alloc(GFP_NOIO, 1);
 579        struct completion event;
 580        int ret;
 581
 582        rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
 583
 584        bio->bi_bdev = bdev;
 585        bio->bi_sector = sector;
 586        bio_add_page(bio, page, size, 0);
 587        init_completion(&event);
 588        bio->bi_private = &event;
 589        bio->bi_end_io = bi_complete;
 590        submit_bio(rw, bio);
 591        wait_for_completion(&event);
 592
 593        ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
 594        bio_put(bio);
 595        return ret;
 596}
 597EXPORT_SYMBOL_GPL(sync_page_io);
 598
 599static int read_disk_sb(mdk_rdev_t * rdev, int size)
 600{
 601        char b[BDEVNAME_SIZE];
 602        if (!rdev->sb_page) {
 603                MD_BUG();
 604                return -EINVAL;
 605        }
 606        if (rdev->sb_loaded)
 607                return 0;
 608
 609
 610        if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
 611                goto fail;
 612        rdev->sb_loaded = 1;
 613        return 0;
 614
 615fail:
 616        printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
 617                bdevname(rdev->bdev,b));
 618        return -EINVAL;
 619}
 620
 621static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 622{
 623        return  sb1->set_uuid0 == sb2->set_uuid0 &&
 624                sb1->set_uuid1 == sb2->set_uuid1 &&
 625                sb1->set_uuid2 == sb2->set_uuid2 &&
 626                sb1->set_uuid3 == sb2->set_uuid3;
 627}
 628
 629static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 630{
 631        int ret;
 632        mdp_super_t *tmp1, *tmp2;
 633
 634        tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
 635        tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
 636
 637        if (!tmp1 || !tmp2) {
 638                ret = 0;
 639                printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
 640                goto abort;
 641        }
 642
 643        *tmp1 = *sb1;
 644        *tmp2 = *sb2;
 645
 646        /*
 647         * nr_disks is not constant
 648         */
 649        tmp1->nr_disks = 0;
 650        tmp2->nr_disks = 0;
 651
 652        ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
 653abort:
 654        kfree(tmp1);
 655        kfree(tmp2);
 656        return ret;
 657}
 658
 659
 660static u32 md_csum_fold(u32 csum)
 661{
 662        csum = (csum & 0xffff) + (csum >> 16);
 663        return (csum & 0xffff) + (csum >> 16);
 664}
 665
 666static unsigned int calc_sb_csum(mdp_super_t * sb)
 667{
 668        u64 newcsum = 0;
 669        u32 *sb32 = (u32*)sb;
 670        int i;
 671        unsigned int disk_csum, csum;
 672
 673        disk_csum = sb->sb_csum;
 674        sb->sb_csum = 0;
 675
 676        for (i = 0; i < MD_SB_BYTES/4 ; i++)
 677                newcsum += sb32[i];
 678        csum = (newcsum & 0xffffffff) + (newcsum>>32);
 679
 680
 681#ifdef CONFIG_ALPHA
 682        /* This used to use csum_partial, which was wrong for several
 683         * reasons including that different results are returned on
 684         * different architectures.  It isn't critical that we get exactly
 685         * the same return value as before (we always csum_fold before
 686         * testing, and that removes any differences).  However as we
 687         * know that csum_partial always returned a 16bit value on
 688         * alphas, do a fold to maximise conformity to previous behaviour.
 689         */
 690        sb->sb_csum = md_csum_fold(disk_csum);
 691#else
 692        sb->sb_csum = disk_csum;
 693#endif
 694        return csum;
 695}
 696
 697
 698/*
 699 * Handle superblock details.
 700 * We want to be able to handle multiple superblock formats
 701 * so we have a common interface to them all, and an array of
 702 * different handlers.
 703 * We rely on user-space to write the initial superblock, and support
 704 * reading and updating of superblocks.
 705 * Interface methods are:
 706 *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
 707 *      loads and validates a superblock on dev.
 708 *      if refdev != NULL, compare superblocks on both devices
 709 *    Return:
 710 *      0 - dev has a superblock that is compatible with refdev
 711 *      1 - dev has a superblock that is compatible and newer than refdev
 712 *          so dev should be used as the refdev in future
 713 *     -EINVAL superblock incompatible or invalid
 714 *     -othererror e.g. -EIO
 715 *
 716 *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
 717 *      Verify that dev is acceptable into mddev.
 718 *       The first time, mddev->raid_disks will be 0, and data from
 719 *       dev should be merged in.  Subsequent calls check that dev
 720 *       is new enough.  Return 0 or -EINVAL
 721 *
 722 *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
 723 *     Update the superblock for rdev with data in mddev
 724 *     This does not write to disc.
 725 *
 726 */
 727
/* One set of handlers per supported superblock format. */
struct super_type  {
        /* presumably a printable name for the format -- confirm at the table */
        char                *name;
        /* presumably the module providing the handlers -- confirm at the table */
        struct module       *owner;
        /* load and validate the superblock on rdev, optionally comparing
         * it with refdev; returns 0, 1 or a negative errno
         */
        int                 (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
                                          int minor_version);
        /* verify that rdev is acceptable into mddev; 0 or -EINVAL */
        int                 (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
        /* update rdev's in-memory superblock from mddev; no disk write */
        void                (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
        /* NOTE(review): not covered by the interface notes above; name and
         * signature suggest it changes the usable size of rdev -- confirm
         * at the implementations
         */
        unsigned long long  (*rdev_size_change)(mdk_rdev_t *rdev,
                                                sector_t num_sectors);
};
 738
 739/*
 740 * Check that the given mddev has no bitmap.
 741 *
 742 * This function is called from the run method of all personalities that do not
 743 * support bitmaps. It prints an error message and returns non-zero if mddev
 744 * has a bitmap. Otherwise, it returns 0.
 745 *
 746 */
 747int md_check_no_bitmap(mddev_t *mddev)
 748{
 749        if (!mddev->bitmap_file && !mddev->bitmap_offset)
 750                return 0;
 751        printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
 752                mdname(mddev), mddev->pers->name);
 753        return 1;
 754}
 755EXPORT_SYMBOL(md_check_no_bitmap);
 756
 757/*
 758 * load_super for 0.90.0 
 759 */
 760static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 761{
 762        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
 763        mdp_super_t *sb;
 764        int ret;
 765
 766        /*
 767         * Calculate the position of the superblock (512byte sectors),
 768         * it's at the end of the disk.
 769         *
 770         * It also happens to be a multiple of 4Kb.
 771         */
 772        rdev->sb_start = calc_dev_sboffset(rdev->bdev);
 773
 774        ret = read_disk_sb(rdev, MD_SB_BYTES);
 775        if (ret) return ret;
 776
 777        ret = -EINVAL;
 778
 779        bdevname(rdev->bdev, b);
 780        sb = (mdp_super_t*)page_address(rdev->sb_page);
 781
 782        if (sb->md_magic != MD_SB_MAGIC) {
 783                printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
 784                       b);
 785                goto abort;
 786        }
 787
 788        if (sb->major_version != 0 ||
 789            sb->minor_version < 90 ||
 790            sb->minor_version > 91) {
 791                printk(KERN_WARNING "Bad version number %d.%d on %s\n",
 792                        sb->major_version, sb->minor_version,
 793                        b);
 794                goto abort;
 795        }
 796
 797        if (sb->raid_disks <= 0)
 798                goto abort;
 799
 800        if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
 801                printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
 802                        b);
 803                goto abort;
 804        }
 805
 806        rdev->preferred_minor = sb->md_minor;
 807        rdev->data_offset = 0;
 808        rdev->sb_size = MD_SB_BYTES;
 809
 810        if (sb->level == LEVEL_MULTIPATH)
 811                rdev->desc_nr = -1;
 812        else
 813                rdev->desc_nr = sb->this_disk.number;
 814
 815        if (!refdev) {
 816                ret = 1;
 817        } else {
 818                __u64 ev1, ev2;
 819                mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
 820                if (!uuid_equal(refsb, sb)) {
 821                        printk(KERN_WARNING "md: %s has different UUID to %s\n",
 822                                b, bdevname(refdev->bdev,b2));
 823                        goto abort;
 824                }
 825                if (!sb_equal(refsb, sb)) {
 826                        printk(KERN_WARNING "md: %s has same UUID"
 827                               " but different superblock to %s\n",
 828                               b, bdevname(refdev->bdev, b2));
 829                        goto abort;
 830                }
 831                ev1 = md_event(sb);
 832                ev2 = md_event(refsb);
 833                if (ev1 > ev2)
 834                        ret = 1;
 835                else 
 836                        ret = 0;
 837        }
 838        rdev->sectors = rdev->sb_start;
 839
 840        if (rdev->sectors < sb->size * 2 && sb->level > 1)
 841                /* "this cannot possibly happen" ... */
 842                ret = -EINVAL;
 843
 844 abort:
 845        return ret;
 846}
 847
 848/*
 849 * validate_super for 0.90.0
 850 */
 851static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 852{
 853        mdp_disk_t *desc;
 854        mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
 855        __u64 ev1 = md_event(sb);
 856
 857        rdev->raid_disk = -1;
 858        clear_bit(Faulty, &rdev->flags);
 859        clear_bit(In_sync, &rdev->flags);
 860        clear_bit(WriteMostly, &rdev->flags);
 861        clear_bit(BarriersNotsupp, &rdev->flags);
 862
 863        if (mddev->raid_disks == 0) {
 864                mddev->major_version = 0;
 865                mddev->minor_version = sb->minor_version;
 866                mddev->patch_version = sb->patch_version;
 867                mddev->external = 0;
 868                mddev->chunk_sectors = sb->chunk_size >> 9;
 869                mddev->ctime = sb->ctime;
 870                mddev->utime = sb->utime;
 871                mddev->level = sb->level;
 872                mddev->clevel[0] = 0;
 873                mddev->layout = sb->layout;
 874                mddev->raid_disks = sb->raid_disks;
 875                mddev->dev_sectors = sb->size * 2;
 876                mddev->events = ev1;
 877                mddev->bitmap_offset = 0;
 878                mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
 879
 880                if (mddev->minor_version >= 91) {
 881                        mddev->reshape_position = sb->reshape_position;
 882                        mddev->delta_disks = sb->delta_disks;
 883                        mddev->new_level = sb->new_level;
 884                        mddev->new_layout = sb->new_layout;
 885                        mddev->new_chunk_sectors = sb->new_chunk >> 9;
 886                } else {
 887                        mddev->reshape_position = MaxSector;
 888                        mddev->delta_disks = 0;
 889                        mddev->new_level = mddev->level;
 890                        mddev->new_layout = mddev->layout;
 891                        mddev->new_chunk_sectors = mddev->chunk_sectors;
 892                }
 893
 894                if (sb->state & (1<<MD_SB_CLEAN))
 895                        mddev->recovery_cp = MaxSector;
 896                else {
 897                        if (sb->events_hi == sb->cp_events_hi && 
 898                                sb->events_lo == sb->cp_events_lo) {
 899                                mddev->recovery_cp = sb->recovery_cp;
 900                        } else
 901                                mddev->recovery_cp = 0;
 902                }
 903
 904                memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
 905                memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
 906                memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
 907                memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
 908
 909                mddev->max_disks = MD_SB_DISKS;
 910
 911                if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
 912                    mddev->bitmap_file == NULL)
 913                        mddev->bitmap_offset = mddev->default_bitmap_offset;
 914
 915        } else if (mddev->pers == NULL) {
 916                /* Insist on good event counter while assembling */
 917                ++ev1;
 918                if (ev1 < mddev->events) 
 919                        return -EINVAL;
 920        } else if (mddev->bitmap) {
 921                /* if adding to array with a bitmap, then we can accept an
 922                 * older device ... but not too old.
 923                 */
 924                if (ev1 < mddev->bitmap->events_cleared)
 925                        return 0;
 926        } else {
 927                if (ev1 < mddev->events)
 928                        /* just a hot-add of a new device, leave raid_disk at -1 */
 929                        return 0;
 930        }
 931
 932        if (mddev->level != LEVEL_MULTIPATH) {
 933                desc = sb->disks + rdev->desc_nr;
 934
 935                if (desc->state & (1<<MD_DISK_FAULTY))
 936                        set_bit(Faulty, &rdev->flags);
 937                else if (desc->state & (1<<MD_DISK_SYNC) /* &&
 938                            desc->raid_disk < mddev->raid_disks */) {
 939                        set_bit(In_sync, &rdev->flags);
 940                        rdev->raid_disk = desc->raid_disk;
 941                }
 942                if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
 943                        set_bit(WriteMostly, &rdev->flags);
 944        } else /* MULTIPATH are always insync */
 945                set_bit(In_sync, &rdev->flags);
 946        return 0;
 947}
 948
 949/*
 950 * sync_super for 0.90.0
 951 */
 952static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 953{
 954        mdp_super_t *sb;
 955        mdk_rdev_t *rdev2;
 956        int next_spare = mddev->raid_disks;
 957
 958
 959        /* make rdev->sb match mddev data..
 960         *
 961         * 1/ zero out disks
 962         * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
 963         * 3/ any empty disks < next_spare become removed
 964         *
 965         * disks[0] gets initialised to REMOVED because
 966         * we cannot be sure from other fields if it has
 967         * been initialised or not.
 968         */
 969        int i;
 970        int active=0, working=0,failed=0,spare=0,nr_disks=0;
 971
 972        rdev->sb_size = MD_SB_BYTES;
 973
 974        sb = (mdp_super_t*)page_address(rdev->sb_page);
 975
 976        memset(sb, 0, sizeof(*sb));
 977
 978        sb->md_magic = MD_SB_MAGIC;
 979        sb->major_version = mddev->major_version;
 980        sb->patch_version = mddev->patch_version;
 981        sb->gvalid_words  = 0; /* ignored */
 982        memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
 983        memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
 984        memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
 985        memcpy(&sb->set_uuid3, mddev->uuid+12,4);
 986
 987        sb->ctime = mddev->ctime;
 988        sb->level = mddev->level;
 989        sb->size = mddev->dev_sectors / 2;
 990        sb->raid_disks = mddev->raid_disks;
 991        sb->md_minor = mddev->md_minor;
 992        sb->not_persistent = 0;
 993        sb->utime = mddev->utime;
 994        sb->state = 0;
 995        sb->events_hi = (mddev->events>>32);
 996        sb->events_lo = (u32)mddev->events;
 997
 998        if (mddev->reshape_position == MaxSector)
 999                sb->minor_version = 90;
1000        else {
1001                sb->minor_version = 91;
1002                sb->reshape_position = mddev->reshape_position;
1003                sb->new_level = mddev->new_level;
1004                sb->delta_disks = mddev->delta_disks;
1005                sb->new_layout = mddev->new_layout;
1006                sb->new_chunk = mddev->new_chunk_sectors << 9;
1007        }
1008        mddev->minor_version = sb->minor_version;
1009        if (mddev->in_sync)
1010        {
1011                sb->recovery_cp = mddev->recovery_cp;
1012                sb->cp_events_hi = (mddev->events>>32);
1013                sb->cp_events_lo = (u32)mddev->events;
1014                if (mddev->recovery_cp == MaxSector)
1015                        sb->state = (1<< MD_SB_CLEAN);
1016        } else
1017                sb->recovery_cp = 0;
1018
1019        sb->layout = mddev->layout;
1020        sb->chunk_size = mddev->chunk_sectors << 9;
1021
1022        if (mddev->bitmap && mddev->bitmap_file == NULL)
1023                sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1024
1025        sb->disks[0].state = (1<<MD_DISK_REMOVED);
1026        list_for_each_entry(rdev2, &mddev->disks, same_set) {
1027                mdp_disk_t *d;
1028                int desc_nr;
1029                if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
1030                    && !test_bit(Faulty, &rdev2->flags))
1031                        desc_nr = rdev2->raid_disk;
1032                else
1033                        desc_nr = next_spare++;
1034                rdev2->desc_nr = desc_nr;
1035                d = &sb->disks[rdev2->desc_nr];
1036                nr_disks++;
1037                d->number = rdev2->desc_nr;
1038                d->major = MAJOR(rdev2->bdev->bd_dev);
1039                d->minor = MINOR(rdev2->bdev->bd_dev);
1040                if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
1041                    && !test_bit(Faulty, &rdev2->flags))
1042                        d->raid_disk = rdev2->raid_disk;
1043                else
1044                        d->raid_disk = rdev2->desc_nr; /* compatibility */
1045                if (test_bit(Faulty, &rdev2->flags))
1046                        d->state = (1<<MD_DISK_FAULTY);
1047                else if (test_bit(In_sync, &rdev2->flags)) {
1048                        d->state = (1<<MD_DISK_ACTIVE);
1049                        d->state |= (1<<MD_DISK_SYNC);
1050                        active++;
1051                        working++;
1052                } else {
1053                        d->state = 0;
1054                        spare++;
1055                        working++;
1056                }
1057                if (test_bit(WriteMostly, &rdev2->flags))
1058                        d->state |= (1<<MD_DISK_WRITEMOSTLY);
1059        }
1060        /* now set the "removed" and "faulty" bits on any missing devices */
1061        for (i=0 ; i < mddev->raid_disks ; i++) {
1062                mdp_disk_t *d = &sb->disks[i];
1063                if (d->state == 0 && d->number == 0) {
1064                        d->number = i;
1065                        d->raid_disk = i;
1066                        d->state = (1<<MD_DISK_REMOVED);
1067                        d->state |= (1<<MD_DISK_FAULTY);
1068                        failed++;
1069                }
1070        }
1071        sb->nr_disks = nr_disks;
1072        sb->active_disks = active;
1073        sb->working_disks = working;
1074        sb->failed_disks = failed;
1075        sb->spare_disks = spare;
1076
1077        sb->this_disk = sb->disks[rdev->desc_nr];
1078        sb->sb_csum = calc_sb_csum(sb);
1079}
1080
1081/*
1082 * rdev_size_change for 0.90.0
1083 */
1084static unsigned long long
1085super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1086{
1087        if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1088                return 0; /* component must fit device */
1089        if (rdev->mddev->bitmap_offset)
1090                return 0; /* can't move bitmap */
1091        rdev->sb_start = calc_dev_sboffset(rdev->bdev);
1092        if (!num_sectors || num_sectors > rdev->sb_start)
1093                num_sectors = rdev->sb_start;
1094        md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1095                       rdev->sb_page);
1096        md_super_wait(rdev->mddev);
1097        return num_sectors / 2; /* kB for sysfs */
1098}
1099
1100
1101/*
1102 * version 1 superblock
1103 */
1104
1105static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1106{
1107        __le32 disk_csum;
1108        u32 csum;
1109        unsigned long long newcsum;
1110        int size = 256 + le32_to_cpu(sb->max_dev)*2;
1111        __le32 *isuper = (__le32*)sb;
1112        int i;
1113
1114        disk_csum = sb->sb_csum;
1115        sb->sb_csum = 0;
1116        newcsum = 0;
1117        for (i=0; size>=4; size -= 4 )
1118                newcsum += le32_to_cpu(*isuper++);
1119
1120        if (size == 2)
1121                newcsum += le16_to_cpu(*(__le16*) isuper);
1122
1123        csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1124        sb->sb_csum = disk_csum;
1125        return cpu_to_le32(csum);
1126}
1127
1128static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1129{
1130        struct mdp_superblock_1 *sb;
1131        int ret;
1132        sector_t sb_start;
1133        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1134        int bmask;
1135
1136        /*
1137         * Calculate the position of the superblock in 512byte sectors.
1138         * It is always aligned to a 4K boundary and
1139         * depeding on minor_version, it can be:
1140         * 0: At least 8K, but less than 12K, from end of device
1141         * 1: At start of device
1142         * 2: 4K from start of device.
1143         */
1144        switch(minor_version) {
1145        case 0:
1146                sb_start = rdev->bdev->bd_inode->i_size >> 9;
1147                sb_start -= 8*2;
1148                sb_start &= ~(sector_t)(4*2-1);
1149                break;
1150        case 1:
1151                sb_start = 0;
1152                break;
1153        case 2:
1154                sb_start = 8;
1155                break;
1156        default:
1157                return -EINVAL;
1158        }
1159        rdev->sb_start = sb_start;
1160
1161        /* superblock is rarely larger than 1K, but it can be larger,
1162         * and it is safe to read 4k, so we do that
1163         */
1164        ret = read_disk_sb(rdev, 4096);
1165        if (ret) return ret;
1166
1167
1168        sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1169
1170        if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1171            sb->major_version != cpu_to_le32(1) ||
1172            le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1173            le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1174            (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1175                return -EINVAL;
1176
1177        if (calc_sb_1_csum(sb) != sb->sb_csum) {
1178                printk("md: invalid superblock checksum on %s\n",
1179                        bdevname(rdev->bdev,b));
1180                return -EINVAL;
1181        }
1182        if (le64_to_cpu(sb->data_size) < 10) {
1183                printk("md: data_size too small on %s\n",
1184                       bdevname(rdev->bdev,b));
1185                return -EINVAL;
1186        }
1187
1188        rdev->preferred_minor = 0xffff;
1189        rdev->data_offset = le64_to_cpu(sb->data_offset);
1190        atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1191
1192        rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1193        bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1194        if (rdev->sb_size & bmask)
1195                rdev->sb_size = (rdev->sb_size | bmask) + 1;
1196
1197        if (minor_version
1198            && rdev->data_offset < sb_start + (rdev->sb_size/512))
1199                return -EINVAL;
1200
1201        if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1202                rdev->desc_nr = -1;
1203        else
1204                rdev->desc_nr = le32_to_cpu(sb->dev_number);
1205
1206        if (!refdev) {
1207                ret = 1;
1208        } else {
1209                __u64 ev1, ev2;
1210                struct mdp_superblock_1 *refsb = 
1211                        (struct mdp_superblock_1*)page_address(refdev->sb_page);
1212
1213                if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1214                    sb->level != refsb->level ||
1215                    sb->layout != refsb->layout ||
1216                    sb->chunksize != refsb->chunksize) {
1217                        printk(KERN_WARNING "md: %s has strangely different"
1218                                " superblock to %s\n",
1219                                bdevname(rdev->bdev,b),
1220                                bdevname(refdev->bdev,b2));
1221                        return -EINVAL;
1222                }
1223                ev1 = le64_to_cpu(sb->events);
1224                ev2 = le64_to_cpu(refsb->events);
1225
1226                if (ev1 > ev2)
1227                        ret = 1;
1228                else
1229                        ret = 0;
1230        }
1231        if (minor_version)
1232                rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
1233                        le64_to_cpu(sb->data_offset);
1234        else
1235                rdev->sectors = rdev->sb_start;
1236        if (rdev->sectors < le64_to_cpu(sb->data_size))
1237                return -EINVAL;
1238        rdev->sectors = le64_to_cpu(sb->data_size);
1239        if (le64_to_cpu(sb->size) > rdev->sectors)
1240                return -EINVAL;
1241        return ret;
1242}
1243
1244static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1245{
1246        struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1247        __u64 ev1 = le64_to_cpu(sb->events);
1248
1249        rdev->raid_disk = -1;
1250        clear_bit(Faulty, &rdev->flags);
1251        clear_bit(In_sync, &rdev->flags);
1252        clear_bit(WriteMostly, &rdev->flags);
1253        clear_bit(BarriersNotsupp, &rdev->flags);
1254
1255        if (mddev->raid_disks == 0) {
1256                mddev->major_version = 1;
1257                mddev->patch_version = 0;
1258                mddev->external = 0;
1259                mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1260                mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1261                mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1262                mddev->level = le32_to_cpu(sb->level);
1263                mddev->clevel[0] = 0;
1264                mddev->layout = le32_to_cpu(sb->layout);
1265                mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1266                mddev->dev_sectors = le64_to_cpu(sb->size);
1267                mddev->events = ev1;
1268                mddev->bitmap_offset = 0;
1269                mddev->default_bitmap_offset = 1024 >> 9;
1270                
1271                mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1272                memcpy(mddev->uuid, sb->set_uuid, 16);
1273
1274                mddev->max_disks =  (4096-256)/2;
1275
1276                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1277                    mddev->bitmap_file == NULL )
1278                        mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
1279
1280                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1281                        mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1282                        mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1283                        mddev->new_level = le32_to_cpu(sb->new_level);
1284                        mddev->new_layout = le32_to_cpu(sb->new_layout);
1285                        mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1286                } else {
1287                        mddev->reshape_position = MaxSector;
1288                        mddev->delta_disks = 0;
1289                        mddev->new_level = mddev->level;
1290                        mddev->new_layout = mddev->layout;
1291                        mddev->new_chunk_sectors = mddev->chunk_sectors;
1292                }
1293
1294        } else if (mddev->pers == NULL) {
1295                /* Insist of good event counter while assembling */
1296                ++ev1;
1297                if (ev1 < mddev->events)
1298                        return -EINVAL;
1299        } else if (mddev->bitmap) {
1300                /* If adding to array with a bitmap, then we can accept an
1301                 * older device, but not too old.
1302                 */
1303                if (ev1 < mddev->bitmap->events_cleared)
1304                        return 0;
1305        } else {
1306                if (ev1 < mddev->events)
1307                        /* just a hot-add of a new device, leave raid_disk at -1 */
1308                        return 0;
1309        }
1310        if (mddev->level != LEVEL_MULTIPATH) {
1311                int role;
1312                if (rdev->desc_nr < 0 ||
1313                    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1314                        role = 0xffff;
1315                        rdev->desc_nr = -1;
1316                } else
1317                        role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1318                switch(role) {
1319                case 0xffff: /* spare */
1320                        break;
1321                case 0xfffe: /* faulty */
1322                        set_bit(Faulty, &rdev->flags);
1323                        break;
1324                default:
1325                        if ((le32_to_cpu(sb->feature_map) &
1326                             MD_FEATURE_RECOVERY_OFFSET))
1327                                rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1328                        else
1329                                set_bit(In_sync, &rdev->flags);
1330                        rdev->raid_disk = role;
1331                        break;
1332                }
1333                if (sb->devflags & WriteMostly1)
1334                        set_bit(WriteMostly, &rdev->flags);
1335        } else /* MULTIPATH are always insync */
1336                set_bit(In_sync, &rdev->flags);
1337
1338        return 0;
1339}
1340
1341static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1342{
1343        struct mdp_superblock_1 *sb;
1344        mdk_rdev_t *rdev2;
1345        int max_dev, i;
1346        /* make rdev->sb match mddev and rdev data. */
1347
1348        sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1349
1350        sb->feature_map = 0;
1351        sb->pad0 = 0;
1352        sb->recovery_offset = cpu_to_le64(0);
1353        memset(sb->pad1, 0, sizeof(sb->pad1));
1354        memset(sb->pad2, 0, sizeof(sb->pad2));
1355        memset(sb->pad3, 0, sizeof(sb->pad3));
1356
1357        sb->utime = cpu_to_le64((__u64)mddev->utime);
1358        sb->events = cpu_to_le64(mddev->events);
1359        if (mddev->in_sync)
1360                sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1361        else
1362                sb->resync_offset = cpu_to_le64(0);
1363
1364        sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1365
1366        sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1367        sb->size = cpu_to_le64(mddev->dev_sectors);
1368        sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1369        sb->level = cpu_to_le32(mddev->level);
1370        sb->layout = cpu_to_le32(mddev->layout);
1371
1372        if (mddev->bitmap && mddev->bitmap_file == NULL) {
1373                sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1374                sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1375        }
1376
1377        if (rdev->raid_disk >= 0 &&
1378            !test_bit(In_sync, &rdev->flags)) {
1379                if (mddev->curr_resync_completed > rdev->recovery_offset)
1380                        rdev->recovery_offset = mddev->curr_resync_completed;
1381                if (rdev->recovery_offset > 0) {
1382                        sb->feature_map |=
1383                                cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1384                        sb->recovery_offset =
1385                                cpu_to_le64(rdev->recovery_offset);
1386                }
1387        }
1388
1389        if (mddev->reshape_position != MaxSector) {
1390                sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1391                sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1392                sb->new_layout = cpu_to_le32(mddev->new_layout);
1393                sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1394                sb->new_level = cpu_to_le32(mddev->new_level);
1395                sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1396        }
1397
1398        max_dev = 0;
1399        list_for_each_entry(rdev2, &mddev->disks, same_set)
1400                if (rdev2->desc_nr+1 > max_dev)
1401                        max_dev = rdev2->desc_nr+1;
1402
1403        if (max_dev > le32_to_cpu(sb->max_dev)) {
1404                int bmask;
1405                sb->max_dev = cpu_to_le32(max_dev);
1406                rdev->sb_size = max_dev * 2 + 256;
1407                bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1408                if (rdev->sb_size & bmask)
1409                        rdev->sb_size = (rdev->sb_size | bmask) + 1;
1410        }
1411        for (i=0; i<max_dev;i++)
1412                sb->dev_roles[i] = cpu_to_le16(0xfffe);
1413        
1414        list_for_each_entry(rdev2, &mddev->disks, same_set) {
1415                i = rdev2->desc_nr;
1416                if (test_bit(Faulty, &rdev2->flags))
1417                        sb->dev_roles[i] = cpu_to_le16(0xfffe);
1418                else if (test_bit(In_sync, &rdev2->flags))
1419                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1420                else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
1421                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1422                else
1423                        sb->dev_roles[i] = cpu_to_le16(0xffff);
1424        }
1425
1426        sb->sb_csum = calc_sb_1_csum(sb);
1427}
1428
1429static unsigned long long
1430super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1431{
1432        struct mdp_superblock_1 *sb;
1433        sector_t max_sectors;
1434        if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1435                return 0; /* component must fit device */
1436        if (rdev->sb_start < rdev->data_offset) {
1437                /* minor versions 1 and 2; superblock before data */
1438                max_sectors = rdev->bdev->bd_inode->i_size >> 9;
1439                max_sectors -= rdev->data_offset;
1440                if (!num_sectors || num_sectors > max_sectors)
1441                        num_sectors = max_sectors;
1442        } else if (rdev->mddev->bitmap_offset) {
1443                /* minor version 0 with bitmap we can't move */
1444                return 0;
1445        } else {
1446                /* minor version 0; superblock after data */
1447                sector_t sb_start;
1448                sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
1449                sb_start &= ~(sector_t)(4*2 - 1);
1450                max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1451                if (!num_sectors || num_sectors > max_sectors)
1452                        num_sectors = max_sectors;
1453                rdev->sb_start = sb_start;
1454        }
1455        sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
1456        sb->data_size = cpu_to_le64(num_sectors);
1457        sb->super_offset = rdev->sb_start;
1458        sb->sb_csum = calc_sb_1_csum(sb);
1459        md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1460                       rdev->sb_page);
1461        md_super_wait(rdev->mddev);
1462        return num_sectors / 2; /* kB for sysfs */
1463}
1464
1465static struct super_type super_types[] = {
1466        [0] = {
1467                .name   = "0.90.0",
1468                .owner  = THIS_MODULE,
1469                .load_super         = super_90_load,
1470                .validate_super     = super_90_validate,
1471                .sync_super         = super_90_sync,
1472                .rdev_size_change   = super_90_rdev_size_change,
1473        },
1474        [1] = {
1475                .name   = "md-1",
1476                .owner  = THIS_MODULE,
1477                .load_super         = super_1_load,
1478                .validate_super     = super_1_validate,
1479                .sync_super         = super_1_sync,
1480                .rdev_size_change   = super_1_rdev_size_change,
1481        },
1482};
1483
1484static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1485{
1486        mdk_rdev_t *rdev, *rdev2;
1487
1488        rcu_read_lock();
1489        rdev_for_each_rcu(rdev, mddev1)
1490                rdev_for_each_rcu(rdev2, mddev2)
1491                        if (rdev->bdev->bd_contains ==
1492                            rdev2->bdev->bd_contains) {
1493                                rcu_read_unlock();
1494                                return 1;
1495                        }
1496        rcu_read_unlock();
1497        return 0;
1498}
1499
1500static LIST_HEAD(pending_raid_disks);
1501
1502/*
1503 * Try to register data integrity profile for an mddev
1504 *
1505 * This is called when an array is started and after a disk has been kicked
1506 * from the array. It only succeeds if all working and active component devices
1507 * are integrity capable with matching profiles.
1508 */
1509int md_integrity_register(mddev_t *mddev)
1510{
1511        mdk_rdev_t *rdev, *reference = NULL;
1512
1513        if (list_empty(&mddev->disks))
1514                return 0; /* nothing to do */
1515        if (blk_get_integrity(mddev->gendisk))
1516                return 0; /* already registered */
1517        list_for_each_entry(rdev, &mddev->disks, same_set) {
1518                /* skip spares and non-functional disks */
1519                if (test_bit(Faulty, &rdev->flags))
1520                        continue;
1521                if (rdev->raid_disk < 0)
1522                        continue;
1523                /*
1524                 * If at least one rdev is not integrity capable, we can not
1525                 * enable data integrity for the md device.
1526                 */
1527                if (!bdev_get_integrity(rdev->bdev))
1528                        return -EINVAL;
1529                if (!reference) {
1530                        /* Use the first rdev as the reference */
1531                        reference = rdev;
1532                        continue;
1533                }
1534                /* does this rdev's profile match the reference profile? */
1535                if (blk_integrity_compare(reference->bdev->bd_disk,
1536                                rdev->bdev->bd_disk) < 0)
1537                        return -EINVAL;
1538        }
1539        /*
1540         * All component devices are integrity capable and have matching
1541         * profiles, register the common profile for the md device.
1542         */
1543        if (blk_integrity_register(mddev->gendisk,
1544                        bdev_get_integrity(reference->bdev)) != 0) {
1545                printk(KERN_ERR "md: failed to register integrity for %s\n",
1546                        mdname(mddev));
1547                return -EINVAL;
1548        }
1549        printk(KERN_NOTICE "md: data integrity on %s enabled\n",
1550                mdname(mddev));
1551        return 0;
1552}
1553EXPORT_SYMBOL(md_integrity_register);
1554
1555/* Disable data integrity if non-capable/non-matching disk is being added */
1556void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
1557{
1558        struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
1559        struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
1560
1561        if (!bi_mddev) /* nothing to do */
1562                return;
1563        if (rdev->raid_disk < 0) /* skip spares */
1564                return;
1565        if (bi_rdev && blk_integrity_compare(mddev->gendisk,
1566                                             rdev->bdev->bd_disk) >= 0)
1567                return;
1568        printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
1569        blk_integrity_unregister(mddev->gendisk);
1570}
1571EXPORT_SYMBOL(md_integrity_add_rdev);
1572
1573static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1574{
1575        char b[BDEVNAME_SIZE];
1576        struct kobject *ko;
1577        char *s;
1578        int err;
1579
1580        if (rdev->mddev) {
1581                MD_BUG();
1582                return -EINVAL;
1583        }
1584
1585        /* prevent duplicates */
1586        if (find_rdev(mddev, rdev->bdev->bd_dev))
1587                return -EEXIST;
1588
1589        /* make sure rdev->sectors exceeds mddev->dev_sectors */
1590        if (rdev->sectors && (mddev->dev_sectors == 0 ||
1591                        rdev->sectors < mddev->dev_sectors)) {
1592                if (mddev->pers) {
1593                        /* Cannot change size, so fail
1594                         * If mddev->level <= 0, then we don't care
1595                         * about aligning sizes (e.g. linear)
1596                         */
1597                        if (mddev->level > 0)
1598                                return -ENOSPC;
1599                } else
1600                        mddev->dev_sectors = rdev->sectors;
1601        }
1602
1603        /* Verify rdev->desc_nr is unique.
1604         * If it is -1, assign a free number, else
1605         * check number is not in use
1606         */
1607        if (rdev->desc_nr < 0) {
1608                int choice = 0;
1609                if (mddev->pers) choice = mddev->raid_disks;
1610                while (find_rdev_nr(mddev, choice))
1611                        choice++;
1612                rdev->desc_nr = choice;
1613        } else {
1614                if (find_rdev_nr(mddev, rdev->desc_nr))
1615                        return -EBUSY;
1616        }
1617        if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
1618                printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
1619                       mdname(mddev), mddev->max_disks);
1620                return -EBUSY;
1621        }
1622        bdevname(rdev->bdev,b);
1623        while ( (s=strchr(b, '/')) != NULL)
1624                *s = '!';
1625
1626        rdev->mddev = mddev;
1627        printk(KERN_INFO "md: bind<%s>\n", b);
1628
1629        if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
1630                goto fail;
1631
1632        ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
1633        if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
1634                kobject_del(&rdev->kobj);
1635                goto fail;
1636        }
1637        rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state");
1638
1639        list_add_rcu(&rdev->same_set, &mddev->disks);
1640        bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
1641
1642        /* May as well allow recovery to be retried once */
1643        mddev->recovery_disabled = 0;
1644
1645        return 0;
1646
1647 fail:
1648        printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
1649               b, mdname(mddev));
1650        return err;
1651}
1652
1653static void md_delayed_delete(struct work_struct *ws)
1654{
1655        mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
1656        kobject_del(&rdev->kobj);
1657        kobject_put(&rdev->kobj);
1658}
1659
1660static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1661{
1662        char b[BDEVNAME_SIZE];
1663        if (!rdev->mddev) {
1664                MD_BUG();
1665                return;
1666        }
1667        bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1668        list_del_rcu(&rdev->same_set);
1669        printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1670        rdev->mddev = NULL;
1671        sysfs_remove_link(&rdev->kobj, "block");
1672        sysfs_put(rdev->sysfs_state);
1673        rdev->sysfs_state = NULL;
1674        /* We need to delay this, otherwise we can deadlock when
1675         * writing to 'remove' to "dev/state".  We also need
1676         * to delay it due to rcu usage.
1677         */
1678        synchronize_rcu();
1679        INIT_WORK(&rdev->del_work, md_delayed_delete);
1680        kobject_get(&rdev->kobj);
1681        schedule_work(&rdev->del_work);
1682}
1683
1684/*
1685 * prevent the device from being mounted, repartitioned or
1686 * otherwise reused by a RAID array (or any other kernel
1687 * subsystem), by bd_claiming the device.
1688 */
1689static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
1690{
1691        int err = 0;
1692        struct block_device *bdev;
1693        char b[BDEVNAME_SIZE];
1694
1695        bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1696        if (IS_ERR(bdev)) {
1697                printk(KERN_ERR "md: could not open %s.\n",
1698                        __bdevname(dev, b));
1699                return PTR_ERR(bdev);
1700        }
1701        err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
1702        if (err) {
1703                printk(KERN_ERR "md: could not bd_claim %s.\n",
1704                        bdevname(bdev, b));
1705                blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1706                return err;
1707        }
1708        if (!shared)
1709                set_bit(AllReserved, &rdev->flags);
1710        rdev->bdev = bdev;
1711        return err;
1712}
1713
1714static void unlock_rdev(mdk_rdev_t *rdev)
1715{
1716        struct block_device *bdev = rdev->bdev;
1717        rdev->bdev = NULL;
1718        if (!bdev)
1719                MD_BUG();
1720        bd_release(bdev);
1721        blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1722}
1723
1724void md_autodetect_dev(dev_t dev);
1725
1726static void export_rdev(mdk_rdev_t * rdev)
1727{
1728        char b[BDEVNAME_SIZE];
1729        printk(KERN_INFO "md: export_rdev(%s)\n",
1730                bdevname(rdev->bdev,b));
1731        if (rdev->mddev)
1732                MD_BUG();
1733        free_disk_sb(rdev);
1734#ifndef MODULE
1735        if (test_bit(AutoDetected, &rdev->flags))
1736                md_autodetect_dev(rdev->bdev->bd_dev);
1737#endif
1738        unlock_rdev(rdev);
1739        kobject_put(&rdev->kobj);
1740}
1741
1742static void kick_rdev_from_array(mdk_rdev_t * rdev)
1743{
1744        unbind_rdev_from_array(rdev);
1745        export_rdev(rdev);
1746}
1747
1748static void export_array(mddev_t *mddev)
1749{
1750        mdk_rdev_t *rdev, *tmp;
1751
1752        rdev_for_each(rdev, tmp, mddev) {
1753                if (!rdev->mddev) {
1754                        MD_BUG();
1755                        continue;
1756                }
1757                kick_rdev_from_array(rdev);
1758        }
1759        if (!list_empty(&mddev->disks))
1760                MD_BUG();
1761        mddev->raid_disks = 0;
1762        mddev->major_version = 0;
1763}
1764
1765static void print_desc(mdp_disk_t *desc)
1766{
1767        printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
1768                desc->major,desc->minor,desc->raid_disk,desc->state);
1769}
1770
1771static void print_sb_90(mdp_super_t *sb)
1772{
1773        int i;
1774
1775        printk(KERN_INFO 
1776                "md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1777                sb->major_version, sb->minor_version, sb->patch_version,
1778                sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1779                sb->ctime);
1780        printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1781                sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1782                sb->md_minor, sb->layout, sb->chunk_size);
1783        printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
1784                " FD:%d SD:%d CSUM:%08x E:%08lx\n",
1785                sb->utime, sb->state, sb->active_disks, sb->working_disks,
1786                sb->failed_disks, sb->spare_disks,
1787                sb->sb_csum, (unsigned long)sb->events_lo);
1788
1789        printk(KERN_INFO);
1790        for (i = 0; i < MD_SB_DISKS; i++) {
1791                mdp_disk_t *desc;
1792
1793                desc = sb->disks + i;
1794                if (desc->number || desc->major || desc->minor ||
1795                    desc->raid_disk || (desc->state && (desc->state != 4))) {
1796                        printk("     D %2d: ", i);
1797                        print_desc(desc);
1798                }
1799        }
1800        printk(KERN_INFO "md:     THIS: ");
1801        print_desc(&sb->this_disk);
1802}
1803
1804static void print_sb_1(struct mdp_superblock_1 *sb)
1805{
1806        __u8 *uuid;
1807
1808        uuid = sb->set_uuid;
1809        printk(KERN_INFO
1810               "md:  SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x"
1811               ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n"
1812               "md:    Name: \"%s\" CT:%llu\n",
1813                le32_to_cpu(sb->major_version),
1814                le32_to_cpu(sb->feature_map),
1815                uuid[0], uuid[1], uuid[2], uuid[3],
1816                uuid[4], uuid[5], uuid[6], uuid[7],
1817                uuid[8], uuid[9], uuid[10], uuid[11],
1818                uuid[12], uuid[13], uuid[14], uuid[15],
1819                sb->set_name,
1820                (unsigned long long)le64_to_cpu(sb->ctime)
1821                       & MD_SUPERBLOCK_1_TIME_SEC_MASK);
1822
1823        uuid = sb->device_uuid;
1824        printk(KERN_INFO
1825               "md:       L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
1826                        " RO:%llu\n"
1827               "md:     Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x"
1828                        ":%02x%02x%02x%02x%02x%02x\n"
1829               "md:       (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
1830               "md:         (MaxDev:%u) \n",
1831                le32_to_cpu(sb->level),
1832                (unsigned long long)le64_to_cpu(sb->size),
1833                le32_to_cpu(sb->raid_disks),
1834                le32_to_cpu(sb->layout),
1835                le32_to_cpu(sb->chunksize),
1836                (unsigned long long)le64_to_cpu(sb->data_offset),
1837                (unsigned long long)le64_to_cpu(sb->data_size),
1838                (unsigned long long)le64_to_cpu(sb->super_offset),
1839                (unsigned long long)le64_to_cpu(sb->recovery_offset),
1840                le32_to_cpu(sb->dev_number),
1841                uuid[0], uuid[1], uuid[2], uuid[3],
1842                uuid[4], uuid[5], uuid[6], uuid[7],
1843                uuid[8], uuid[9], uuid[10], uuid[11],
1844                uuid[12], uuid[13], uuid[14], uuid[15],
1845                sb->devflags,
1846                (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
1847                (unsigned long long)le64_to_cpu(sb->events),
1848                (unsigned long long)le64_to_cpu(sb->resync_offset),
1849                le32_to_cpu(sb->sb_csum),
1850                le32_to_cpu(sb->max_dev)
1851                );
1852}
1853
1854static void print_rdev(mdk_rdev_t *rdev, int major_version)
1855{
1856        char b[BDEVNAME_SIZE];
1857        printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
1858                bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
1859                test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1860                rdev->desc_nr);
1861        if (rdev->sb_loaded) {
1862                printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
1863                switch (major_version) {
1864                case 0:
1865                        print_sb_90((mdp_super_t*)page_address(rdev->sb_page));
1866                        break;
1867                case 1:
1868                        print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page));
1869                        break;
1870                }
1871        } else
1872                printk(KERN_INFO "md: no rdev superblock!\n");
1873}
1874
1875static void md_print_devices(void)
1876{
1877        struct list_head *tmp;
1878        mdk_rdev_t *rdev;
1879        mddev_t *mddev;
1880        char b[BDEVNAME_SIZE];
1881
1882        printk("\n");
1883        printk("md:     **********************************\n");
1884        printk("md:     * <COMPLETE RAID STATE PRINTOUT> *\n");
1885        printk("md:     **********************************\n");
1886        for_each_mddev(mddev, tmp) {
1887
1888                if (mddev->bitmap)
1889                        bitmap_print_sb(mddev->bitmap);
1890                else
1891                        printk("%s: ", mdname(mddev));
1892                list_for_each_entry(rdev, &mddev->disks, same_set)
1893                        printk("<%s>", bdevname(rdev->bdev,b));
1894                printk("\n");
1895
1896                list_for_each_entry(rdev, &mddev->disks, same_set)
1897                        print_rdev(rdev, mddev->major_version);
1898        }
1899        printk("md:     **********************************\n");
1900        printk("\n");
1901}
1902
1903
1904static void sync_sbs(mddev_t * mddev, int nospares)
1905{
1906        /* Update each superblock (in-memory image), but
1907         * if we are allowed to, skip spares which already
1908         * have the right event counter, or have one earlier
1909         * (which would mean they aren't being marked as dirty
1910         * with the rest of the array)
1911         */
1912        mdk_rdev_t *rdev;
1913
1914        list_for_each_entry(rdev, &mddev->disks, same_set) {
1915                if (rdev->sb_events == mddev->events ||
1916                    (nospares &&
1917                     rdev->raid_disk < 0 &&
1918                     (rdev->sb_events&1)==0 &&
1919                     rdev->sb_events+1 == mddev->events)) {
1920                        /* Don't update this superblock */
1921                        rdev->sb_loaded = 2;
1922                } else {
1923                        super_types[mddev->major_version].
1924                                sync_super(mddev, rdev);
1925                        rdev->sb_loaded = 1;
1926                }
1927        }
1928}
1929
1930static void md_update_sb(mddev_t * mddev, int force_change)
1931{
1932        mdk_rdev_t *rdev;
1933        int sync_req;
1934        int nospares = 0;
1935
1936        mddev->utime = get_seconds();
1937        if (mddev->external)
1938                return;
1939repeat:
1940        spin_lock_irq(&mddev->write_lock);
1941
1942        set_bit(MD_CHANGE_PENDING, &mddev->flags);
1943        if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
1944                force_change = 1;
1945        if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
1946                /* just a clean<-> dirty transition, possibly leave spares alone,
1947                 * though if events isn't the right even/odd, we will have to do
1948                 * spares after all
1949                 */
1950                nospares = 1;
1951        if (force_change)
1952                nospares = 0;
1953        if (mddev->degraded)
1954                /* If the array is degraded, then skipping spares is both
1955                 * dangerous and fairly pointless.
1956                 * Dangerous because a device that was removed from the array
1957                 * might have a event_count that still looks up-to-date,
1958                 * so it can be re-added without a resync.
1959                 * Pointless because if there are any spares to skip,
1960                 * then a recovery will happen and soon that array won't
1961                 * be degraded any more and the spare can go back to sleep then.
1962                 */
1963                nospares = 0;
1964
1965        sync_req = mddev->in_sync;
1966
1967        /* If this is just a dirty<->clean transition, and the array is clean
1968         * and 'events' is odd, we can roll back to the previous clean state */
1969        if (nospares
1970            && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1971            && (mddev->events & 1)
1972            && mddev->events != 1)
1973                mddev->events--;
1974        else {
1975                /* otherwise we have to go forward and ... */
1976                mddev->events ++;
1977                if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1978                        /* .. if the array isn't clean, an 'even' event must also go
1979                         * to spares. */
1980                        if ((mddev->events&1)==0)
1981                                nospares = 0;
1982                } else {
1983                        /* otherwise an 'odd' event must go to spares */
1984                        if ((mddev->events&1))
1985                                nospares = 0;
1986                }
1987        }
1988
1989        if (!mddev->events) {
1990                /*
1991                 * oops, this 64-bit counter should never wrap.
1992                 * Either we are in around ~1 trillion A.C., assuming
1993                 * 1 reboot per second, or we have a bug:
1994                 */
1995                MD_BUG();
1996                mddev->events --;
1997        }
1998
1999        /*
2000         * do not write anything to disk if using
2001         * nonpersistent superblocks
2002         */
2003        if (!mddev->persistent) {
2004                if (!mddev->external)
2005                        clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2006
2007                spin_unlock_irq(&mddev->write_lock);
2008                wake_up(&mddev->sb_wait);
2009                return;
2010        }
2011        sync_sbs(mddev, nospares);
2012        spin_unlock_irq(&mddev->write_lock);
2013
2014        dprintk(KERN_INFO 
2015                "md: updating %s RAID superblock on device (in sync %d)\n",
2016                mdname(mddev),mddev->in_sync);
2017
2018        bitmap_update_sb(mddev->bitmap);
2019        list_for_each_entry(rdev, &mddev->disks, same_set) {
2020                char b[BDEVNAME_SIZE];
2021                dprintk(KERN_INFO "md: ");
2022                if (rdev->sb_loaded != 1)
2023                        continue; /* no noise on spare devices */
2024                if (test_bit(Faulty, &rdev->flags))
2025                        dprintk("(skipping faulty ");
2026
2027                dprintk("%s ", bdevname(rdev->bdev,b));
2028                if (!test_bit(Faulty, &rdev->flags)) {
2029                        md_super_write(mddev,rdev,
2030                                       rdev->sb_start, rdev->sb_size,
2031                                       rdev->sb_page);
2032                        dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
2033                                bdevname(rdev->bdev,b),
2034                                (unsigned long long)rdev->sb_start);
2035                        rdev->sb_events = mddev->events;
2036
2037                } else
2038                        dprintk(")\n");
2039                if (mddev->level == LEVEL_MULTIPATH)
2040                        /* only need to write one superblock... */
2041                        break;
2042        }
2043        md_super_wait(mddev);
2044        /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
2045
2046        spin_lock_irq(&mddev->write_lock);
2047        if (mddev->in_sync != sync_req ||
2048            test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2049                /* have to write it out again */
2050                spin_unlock_irq(&mddev->write_lock);
2051                goto repeat;
2052        }
2053        clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2054        spin_unlock_irq(&mddev->write_lock);
2055        wake_up(&mddev->sb_wait);
2056        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2057                sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2058
2059}
2060
/* Words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case.  For this we use cmd_match.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* Decide whether cmd, as written into a sysfs file, is the
	 * keyword str.  It is when both strings are identical, or when
	 * cmd is str followed by a single newline.
	 * Returns 1 on a match and 0 otherwise.
	 */

	/* step over the prefix the two strings have in common */
	for (; *cmd != '\0' && *str != '\0' && *cmd == *str; cmd++, str++)
		;

	/* forgive one newline at this point in cmd */
	if (*cmd == '\n')
		cmd++;

	/* a match requires both strings to be used up */
	return *str == '\0' && *cmd == '\0';
}
2080
2081struct rdev_sysfs_entry {
2082        struct attribute attr;
2083        ssize_t (*show)(mdk_rdev_t *, char *);
2084        ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
2085};
2086
2087static ssize_t
2088state_show(mdk_rdev_t *rdev, char *page)
2089{
2090        char *sep = "";
2091        size_t len = 0;
2092
2093        if (test_bit(Faulty, &rdev->flags)) {
2094                len+= sprintf(page+len, "%sfaulty",sep);
2095                sep = ",";
2096        }
2097        if (test_bit(In_sync, &rdev->flags)) {
2098                len += sprintf(page+len, "%sin_sync",sep);
2099                sep = ",";
2100        }
2101        if (test_bit(WriteMostly, &rdev->flags)) {
2102                len += sprintf(page+len, "%swrite_mostly",sep);
2103                sep = ",";
2104        }
2105        if (test_bit(Blocked, &rdev->flags)) {
2106                len += sprintf(page+len, "%sblocked", sep);
2107                sep = ",";
2108        }
2109        if (!test_bit(Faulty, &rdev->flags) &&
2110            !test_bit(In_sync, &rdev->flags)) {
2111                len += sprintf(page+len, "%sspare", sep);
2112                sep = ",";
2113        }
2114        return len+sprintf(page+len, "\n");
2115}
2116
2117static ssize_t
2118state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2119{
2120        /* can write
2121         *  faulty  - simulates and error
2122         *  remove  - disconnects the device
2123         *  writemostly - sets write_mostly
2124         *  -writemostly - clears write_mostly
2125         *  blocked - sets the Blocked flag
2126         *  -blocked - clears the Blocked flag
2127         *  insync - sets Insync providing device isn't active
2128         */
2129        int err = -EINVAL;
2130        if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2131                md_error(rdev->mddev, rdev);
2132                err = 0;
2133        } else if (cmd_match(buf, "remove")) {
2134                if (rdev->raid_disk >= 0)
2135                        err = -EBUSY;
2136                else {
2137                        mddev_t *mddev = rdev->mddev;
2138                        kick_rdev_from_array(rdev);
2139                        if (mddev->pers)
2140                                md_update_sb(mddev, 1);
2141                        md_new_event(mddev);
2142                        err = 0;
2143                }
2144        } else if (cmd_match(buf, "writemostly")) {
2145                set_bit(WriteMostly, &rdev->flags);
2146                err = 0;
2147        } else if (cmd_match(buf, "-writemostly")) {
2148                clear_bit(WriteMostly, &rdev->flags);
2149                err = 0;
2150        } else if (cmd_match(buf, "blocked")) {
2151                set_bit(Blocked, &rdev->flags);
2152                err = 0;
2153        } else if (cmd_match(buf, "-blocked")) {
2154                clear_bit(Blocked, &rdev->flags);
2155                wake_up(&rdev->blocked_wait);
2156                set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2157                md_wakeup_thread(rdev->mddev->thread);
2158
2159                err = 0;
2160        } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2161                set_bit(In_sync, &rdev->flags);
2162                err = 0;
2163        }
2164        if (!err && rdev->sysfs_state)
2165                sysfs_notify_dirent(rdev->sysfs_state);
2166        return err ? err : len;
2167}
2168static struct rdev_sysfs_entry rdev_state =
2169__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
2170
2171static ssize_t
2172errors_show(mdk_rdev_t *rdev, char *page)
2173{
2174        return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2175}
2176
2177static ssize_t
2178errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2179{
2180        char *e;
2181        unsigned long n = simple_strtoul(buf, &e, 10);
2182        if (*buf && (*e == 0 || *e == '\n')) {
2183                atomic_set(&rdev->corrected_errors, n);
2184                return len;
2185        }
2186        return -EINVAL;
2187}
2188static struct rdev_sysfs_entry rdev_errors =
2189__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2190
2191static ssize_t
2192slot_show(mdk_rdev_t *rdev, char *page)
2193{
2194        if (rdev->raid_disk < 0)
2195                return sprintf(page, "none\n");
2196        else
2197                return sprintf(page, "%d\n", rdev->raid_disk);
2198}
2199
2200static ssize_t
2201slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2202{
2203        char *e;
2204        int err;
2205        char nm[20];
2206        int slot = simple_strtoul(buf, &e, 10);
2207        if (strncmp(buf, "none", 4)==0)
2208                slot = -1;
2209        else if (e==buf || (*e && *e!= '\n'))
2210                return -EINVAL;
2211        if (rdev->mddev->pers && slot == -1) {
2212                /* Setting 'slot' on an active array requires also
2213                 * updating the 'rd%d' link, and communicating
2214                 * with the personality with ->hot_*_disk.
2215                 * For now we only support removing
2216                 * failed/spare devices.  This normally happens automatically,
2217                 * but not when the metadata is externally managed.
2218                 */
2219                if (rdev->raid_disk == -1)
2220                        return -EEXIST;
2221                /* personality does all needed checks */
2222                if (rdev->mddev->pers->hot_add_disk == NULL)
2223                        return -EINVAL;
2224                err = rdev->mddev->pers->
2225                        hot_remove_disk(rdev->mddev, rdev->raid_disk);
2226                if (err)
2227                        return err;
2228                sprintf(nm, "rd%d", rdev->raid_disk);
2229                sysfs_remove_link(&rdev->mddev->kobj, nm);
2230                set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2231                md_wakeup_thread(rdev->mddev->thread);
2232        } else if (rdev->mddev->pers) {
2233                mdk_rdev_t *rdev2;
2234                /* Activating a spare .. or possibly reactivating
2235                 * if we ever get bitmaps working here.
2236                 */
2237
2238                if (rdev->raid_disk != -1)
2239                        return -EBUSY;
2240
2241                if (rdev->mddev->pers->hot_add_disk == NULL)
2242                        return -EINVAL;
2243
2244                list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
2245                        if (rdev2->raid_disk == slot)
2246                                return -EEXIST;
2247
2248                rdev->raid_disk = slot;
2249                if (test_bit(In_sync, &rdev->flags))
2250                        rdev->saved_raid_disk = slot;
2251                else
2252                        rdev->saved_raid_disk = -1;
2253                err = rdev->mddev->pers->
2254                        hot_add_disk(rdev->mddev, rdev);
2255                if (err) {
2256                        rdev->raid_disk = -1;
2257                        return err;
2258                } else
2259                        sysfs_notify_dirent(rdev->sysfs_state);
2260                sprintf(nm, "rd%d", rdev->raid_disk);
2261                if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
2262                        printk(KERN_WARNING
2263                               "md: cannot register "
2264                               "%s for %s\n",
2265                               nm, mdname(rdev->mddev));
2266
2267                /* don't wakeup anyone, leave that to userspace. */
2268        } else {
2269                if (slot >= rdev->mddev->raid_disks)
2270                        return -ENOSPC;
2271                rdev->raid_disk = slot;
2272                /* assume it is working */
2273                clear_bit(Faulty, &rdev->flags);
2274                clear_bit(WriteMostly, &rdev->flags);
2275                set_bit(In_sync, &rdev->flags);
2276                sysfs_notify_dirent(rdev->sysfs_state);
2277        }
2278        return len;
2279}
2280
2281
2282static struct rdev_sysfs_entry rdev_slot =
2283__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
2284
2285static ssize_t
2286offset_show(mdk_rdev_t *rdev, char *page)
2287{
2288        return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2289}
2290
2291static ssize_t
2292offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2293{
2294        char *e;
2295        unsigned long long offset = simple_strtoull(buf, &e, 10);
2296        if (e==buf || (*e && *e != '\n'))
2297                return -EINVAL;
2298        if (rdev->mddev->pers && rdev->raid_disk >= 0)
2299                return -EBUSY;
2300        if (rdev->sectors && rdev->mddev->external)
2301                /* Must set offset before size, so overlap checks
2302                 * can be sane */
2303                return -EBUSY;
2304        rdev->data_offset = offset;
2305        return len;
2306}
2307
2308static struct rdev_sysfs_entry rdev_offset =
2309__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2310
2311static ssize_t
2312rdev_size_show(mdk_rdev_t *rdev, char *page)
2313{
2314        return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2315}
2316
2317static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2318{
2319        /* check if two start/length pairs overlap */
2320        if (s1+l1 <= s2)
2321                return 0;
2322        if (s2+l2 <= s1)
2323                return 0;
2324        return 1;
2325}
2326
2327static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2328{
2329        unsigned long long blocks;
2330        sector_t new;
2331
2332        if (strict_strtoull(buf, 10, &blocks) < 0)
2333                return -EINVAL;
2334
2335        if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2336                return -EINVAL; /* sector conversion overflow */
2337
2338        new = blocks * 2;
2339        if (new != blocks * 2)
2340                return -EINVAL; /* unsigned long long to sector_t overflow */
2341
2342        *sectors = new;
2343        return 0;
2344}
2345
2346static ssize_t
2347rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2348{
2349        mddev_t *my_mddev = rdev->mddev;
2350        sector_t oldsectors = rdev->sectors;
2351        sector_t sectors;
2352
2353        if (strict_blocks_to_sectors(buf, &sectors) < 0)
2354                return -EINVAL;
2355        if (my_mddev->pers && rdev->raid_disk >= 0) {
2356                if (my_mddev->persistent) {
2357                        sectors = super_types[my_mddev->major_version].
2358                                rdev_size_change(rdev, sectors);
2359                        if (!sectors)
2360                                return -EBUSY;
2361                } else if (!sectors)
2362                        sectors = (rdev->bdev->bd_inode->i_size >> 9) -
2363                                rdev->data_offset;
2364        }
2365        if (sectors < my_mddev->dev_sectors)
2366                return -EINVAL; /* component must fit device */
2367
2368        rdev->sectors = sectors;
2369        if (sectors > oldsectors && my_mddev->external) {
2370                /* need to check that all other rdevs with the same ->bdev
2371                 * do not overlap.  We need to unlock the mddev to avoid
2372                 * a deadlock.  We have already changed rdev->sectors, and if
2373                 * we have to change it back, we will have the lock again.
2374                 */
2375                mddev_t *mddev;
2376                int overlap = 0;
2377                struct list_head *tmp;
2378
2379                mddev_unlock(my_mddev);
2380                for_each_mddev(mddev, tmp) {
2381                        mdk_rdev_t *rdev2;
2382
2383                        mddev_lock(mddev);
2384                        list_for_each_entry(rdev2, &mddev->disks, same_set)
2385                                if (test_bit(AllReserved, &rdev2->flags) ||
2386                                    (rdev->bdev == rdev2->bdev &&
2387                                     rdev != rdev2 &&
2388                                     overlaps(rdev->data_offset, rdev->sectors,
2389                                              rdev2->data_offset,
2390                                              rdev2->sectors))) {
2391                                        overlap = 1;
2392                                        break;
2393                                }
2394                        mddev_unlock(mddev);
2395                        if (overlap) {
2396                                mddev_put(mddev);
2397                                break;
2398                        }
2399                }
2400                mddev_lock(my_mddev);
2401                if (overlap) {
2402                        /* Someone else could have slipped in a size
2403                         * change here, but doing so is just silly.
2404                         * We put oldsectors back because we *know* it is
2405                         * safe, and trust userspace not to race with
2406                         * itself
2407                         */
2408                        rdev->sectors = oldsectors;
2409                        return -EBUSY;
2410                }
2411        }
2412        return len;
2413}
2414
/* sysfs attribute "size" of a component device: readable by all, writable by owner */
static struct rdev_sysfs_entry rdev_size =
__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
2417
/* NULL-terminated attribute list installed as rdev_ktype.default_attrs */
static struct attribute *rdev_default_attrs[] = {
        &rdev_state.attr,
        &rdev_errors.attr,
        &rdev_slot.attr,
        &rdev_offset.attr,
        &rdev_size.attr,
        NULL,
};
2426static ssize_t
2427rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2428{
2429        struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2430        mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2431        mddev_t *mddev = rdev->mddev;
2432        ssize_t rv;
2433
2434        if (!entry->show)
2435                return -EIO;
2436
2437        rv = mddev ? mddev_lock(mddev) : -EBUSY;
2438        if (!rv) {
2439                if (rdev->mddev == NULL)
2440                        rv = -EBUSY;
2441                else
2442                        rv = entry->show(rdev, page);
2443                mddev_unlock(mddev);
2444        }
2445        return rv;
2446}
2447
2448static ssize_t
2449rdev_attr_store(struct kobject *kobj, struct attribute *attr,
2450              const char *page, size_t length)
2451{
2452        struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2453        mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2454        ssize_t rv;
2455        mddev_t *mddev = rdev->mddev;
2456
2457        if (!entry->store)
2458                return -EIO;
2459        if (!capable(CAP_SYS_ADMIN))
2460                return -EACCES;
2461        rv = mddev ? mddev_lock(mddev): -EBUSY;
2462        if (!rv) {
2463                if (rdev->mddev == NULL)
2464                        rv = -EBUSY;
2465                else
2466                        rv = entry->store(rdev, page, length);
2467                mddev_unlock(mddev);
2468        }
2469        return rv;
2470}
2471
2472static void rdev_free(struct kobject *ko)
2473{
2474        mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
2475        kfree(rdev);
2476}
/* show/store entry points shared by every rdev attribute */
static struct sysfs_ops rdev_sysfs_ops = {
        .show           = rdev_attr_show,
        .store          = rdev_attr_store,
};
/*
 * kobject type of an rdev: ties together the release callback, the
 * sysfs dispatchers and the default attribute list.
 */
static struct kobj_type rdev_ktype = {
        .release        = rdev_free,
        .sysfs_ops      = &rdev_sysfs_ops,
        .default_attrs  = rdev_default_attrs,
};
2486
2487/*
2488 * Import a device. If 'super_format' >= 0, then sanity check the superblock
2489 *
2490 * mark the device faulty if:
2491 *
2492 *   - the device is nonexistent (zero size)
2493 *   - the device has no valid superblock
2494 *
2495 * a faulty rdev _never_ has rdev->sb set.
2496 */
2497static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
2498{
2499        char b[BDEVNAME_SIZE];
2500        int err;
2501        mdk_rdev_t *rdev;
2502        sector_t size;
2503
2504        rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
2505        if (!rdev) {
2506                printk(KERN_ERR "md: could not alloc mem for new device!\n");
2507                return ERR_PTR(-ENOMEM);
2508        }
2509
2510        if ((err = alloc_disk_sb(rdev)))
2511                goto abort_free;
2512
2513        err = lock_rdev(rdev, newdev, super_format == -2);
2514        if (err)
2515                goto abort_free;
2516
2517        kobject_init(&rdev->kobj, &rdev_ktype);
2518
2519        rdev->desc_nr = -1;
2520        rdev->saved_raid_disk = -1;
2521        rdev->raid_disk = -1;
2522        rdev->flags = 0;
2523        rdev->data_offset = 0;
2524        rdev->sb_events = 0;
2525        atomic_set(&rdev->nr_pending, 0);
2526        atomic_set(&rdev->read_errors, 0);
2527        atomic_set(&rdev->corrected_errors, 0);
2528
2529        size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2530        if (!size) {
2531                printk(KERN_WARNING 
2532                        "md: %s has zero or unknown size, marking faulty!\n",
2533                        bdevname(rdev->bdev,b));
2534                err = -EINVAL;
2535                goto abort_free;
2536        }
2537
2538        if (super_format >= 0) {
2539                err = super_types[super_format].
2540                        load_super(rdev, NULL, super_minor);
2541                if (err == -EINVAL) {
2542                        printk(KERN_WARNING
2543                                "md: %s does not have a valid v%d.%d "
2544                               "superblock, not importing!\n",
2545                                bdevname(rdev->bdev,b),
2546                               super_format, super_minor);
2547                        goto abort_free;
2548                }
2549                if (err < 0) {
2550                        printk(KERN_WARNING 
2551                                "md: could not read %s's sb, not importing!\n",
2552                                bdevname(rdev->bdev,b));
2553                        goto abort_free;
2554                }
2555        }
2556
2557        INIT_LIST_HEAD(&rdev->same_set);
2558        init_waitqueue_head(&rdev->blocked_wait);
2559
2560        return rdev;
2561
2562abort_free:
2563        if (rdev->sb_page) {
2564                if (rdev->bdev)
2565                        unlock_rdev(rdev);
2566                free_disk_sb(rdev);
2567        }
2568        kfree(rdev);
2569        return ERR_PTR(err);
2570}
2571
2572/*
2573 * Check a full RAID array for plausibility
2574 */
2575
2576
2577static void analyze_sbs(mddev_t * mddev)
2578{
2579        int i;
2580        mdk_rdev_t *rdev, *freshest, *tmp;
2581        char b[BDEVNAME_SIZE];
2582
2583        freshest = NULL;
2584        rdev_for_each(rdev, tmp, mddev)
2585                switch (super_types[mddev->major_version].
2586                        load_super(rdev, freshest, mddev->minor_version)) {
2587                case 1:
2588                        freshest = rdev;
2589                        break;
2590                case 0:
2591                        break;
2592                default:
2593                        printk( KERN_ERR \
2594                                "md: fatal superblock inconsistency in %s"
2595                                " -- removing from array\n", 
2596                                bdevname(rdev->bdev,b));
2597                        kick_rdev_from_array(rdev);
2598                }
2599
2600
2601        super_types[mddev->major_version].
2602                validate_super(mddev, freshest);
2603
2604        i = 0;
2605        rdev_for_each(rdev, tmp, mddev) {
2606                if (rdev->desc_nr >= mddev->max_disks ||
2607                    i > mddev->max_disks) {
2608                        printk(KERN_WARNING
2609                               "md: %s: %s: only %d devices permitted\n",
2610                               mdname(mddev), bdevname(rdev->bdev, b),
2611                               mddev->max_disks);
2612                        kick_rdev_from_array(rdev);
2613                        continue;
2614                }
2615                if (rdev != freshest)
2616                        if (super_types[mddev->major_version].
2617                            validate_super(mddev, rdev)) {
2618                                printk(KERN_WARNING "md: kicking non-fresh %s"
2619                                        " from array!\n",
2620                                        bdevname(rdev->bdev,b));
2621                                kick_rdev_from_array(rdev);
2622                                continue;
2623                        }
2624                if (mddev->level == LEVEL_MULTIPATH) {
2625                        rdev->desc_nr = i++;
2626                        rdev->raid_disk = rdev->desc_nr;
2627                        set_bit(In_sync, &rdev->flags);
2628                } else if (rdev->raid_disk >= mddev->raid_disks) {
2629                        rdev->raid_disk = -1;
2630                        clear_bit(In_sync, &rdev->flags);
2631                }
2632        }
2633}
2634
/* Forward declaration: safe_delay_store() calls this when the delay is shortened */
static void md_safemode_timeout(unsigned long data);
2636
2637static ssize_t
2638safe_delay_show(mddev_t *mddev, char *page)
2639{
2640        int msec = (mddev->safemode_delay*1000)/HZ;
2641        return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2642}
2643static ssize_t
2644safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2645{
2646        int scale=1;
2647        int dot=0;
2648        int i;
2649        unsigned long msec;
2650        char buf[30];
2651
2652        /* remove a period, and count digits after it */
2653        if (len >= sizeof(buf))
2654                return -EINVAL;
2655        strlcpy(buf, cbuf, sizeof(buf));
2656        for (i=0; i<len; i++) {
2657                if (dot) {
2658                        if (isdigit(buf[i])) {
2659                                buf[i-1] = buf[i];
2660                                scale *= 10;
2661                        }
2662                        buf[i] = 0;
2663                } else if (buf[i] == '.') {
2664                        dot=1;
2665                        buf[i] = 0;
2666                }
2667        }
2668        if (strict_strtoul(buf, 10, &msec) < 0)
2669                return -EINVAL;
2670        msec = (msec * 1000) / scale;
2671        if (msec == 0)
2672                mddev->safemode_delay = 0;
2673        else {
2674                unsigned long old_delay = mddev->safemode_delay;
2675                mddev->safemode_delay = (msec*HZ)/1000;
2676                if (mddev->safemode_delay == 0)
2677                        mddev->safemode_delay = 1;
2678                if (mddev->safemode_delay < old_delay)
2679                        md_safemode_timeout((unsigned long)mddev);
2680        }
2681        return len;
2682}
/* sysfs attribute "safe_mode_delay": readable by all, writable by owner */
static struct md_sysfs_entry md_safe_delay =
__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
2685
2686static ssize_t
2687level_show(mddev_t *mddev, char *page)
2688{
2689        struct mdk_personality *p = mddev->pers;
2690        if (p)
2691                return sprintf(page, "%s\n", p->name);
2692        else if (mddev->clevel[0])
2693                return sprintf(page, "%s\n", mddev->clevel);
2694        else if (mddev->level != LEVEL_NONE)
2695                return sprintf(page, "%d\n", mddev->level);
2696        else
2697                return 0;
2698}
2699
2700static ssize_t
2701level_store(mddev_t *mddev, const char *buf, size_t len)
2702{
2703        char level[16];
2704        ssize_t rv = len;
2705        struct mdk_personality *pers;
2706        void *priv;
2707        mdk_rdev_t *rdev;
2708
2709        if (mddev->pers == NULL) {
2710                if (len == 0)
2711                        return 0;
2712                if (len >= sizeof(mddev->clevel))
2713                        return -ENOSPC;
2714                strncpy(mddev->clevel, buf, len);
2715                if (mddev->clevel[len-1] == '\n')
2716                        len--;
2717                mddev->clevel[len] = 0;
2718                mddev->level = LEVEL_NONE;
2719                return rv;
2720        }
2721
2722        /* request to change the personality.  Need to ensure:
2723         *  - array is not engaged in resync/recovery/reshape
2724         *  - old personality can be suspended
2725         *  - new personality will access other array.
2726         */
2727
2728        if (mddev->sync_thread || mddev->reshape_position != MaxSector)
2729                return -EBUSY;
2730
2731        if (!mddev->pers->quiesce) {
2732                printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
2733                       mdname(mddev), mddev->pers->name);
2734                return -EINVAL;
2735        }
2736
2737        /* Now find the new personality */
2738        if (len == 0 || len >= sizeof(level))
2739                return -EINVAL;
2740        strncpy(level, buf, len);
2741        if (level[len-1] == '\n')
2742                len--;
2743        level[len] = 0;
2744
2745        request_module("md-%s", level);
2746        spin_lock(&pers_lock);
2747        pers = find_pers(LEVEL_NONE, level);
2748        if (!pers || !try_module_get(pers->owner)) {
2749                spin_unlock(&pers_lock);
2750                printk(KERN_WARNING "md: personality %s not loaded\n", level);
2751                return -EINVAL;
2752        }
2753        spin_unlock(&pers_lock);
2754
2755        if (pers == mddev->pers) {
2756                /* Nothing to do! */
2757                module_put(pers->owner);
2758                return rv;
2759        }
2760        if (!pers->takeover) {
2761                module_put(pers->owner);
2762                printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
2763                       mdname(mddev), level);
2764                return -EINVAL;
2765        }
2766
2767        /* ->takeover must set new_* and/or delta_disks
2768         * if it succeeds, and may set them when it fails.
2769         */
2770        priv = pers->takeover(mddev);
2771        if (IS_ERR(priv)) {
2772                mddev->new_level = mddev->level;
2773                mddev->new_layout = mddev->layout;
2774                mddev->new_chunk_sectors = mddev->chunk_sectors;
2775                mddev->raid_disks -= mddev->delta_disks;
2776                mddev->delta_disks = 0;
2777                module_put(pers->owner);
2778                printk(KERN_WARNING "md: %s: %s would not accept array\n",
2779                       mdname(mddev), level);
2780                return PTR_ERR(priv);
2781        }
2782
2783        /* Looks like we have a winner */
2784        mddev_suspend(mddev);
2785        mddev->pers->stop(mddev);
2786        module_put(mddev->pers->owner);
2787        /* Invalidate devices that are now superfluous */
2788        list_for_each_entry(rdev, &mddev->disks, same_set)
2789                if (rdev->raid_disk >= mddev->raid_disks) {
2790                        rdev->raid_disk = -1;
2791                        clear_bit(In_sync, &rdev->flags);
2792                }
2793        mddev->pers = pers;
2794        mddev->private = priv;
2795        strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
2796        mddev->level = mddev->new_level;
2797        mddev->layout = mddev->new_layout;
2798        mddev->chunk_sectors = mddev->new_chunk_sectors;
2799        mddev->delta_disks = 0;
2800        pers->run(mddev);
2801        mddev_resume(mddev);
2802        set_bit(MD_CHANGE_DEVS, &mddev->flags);
2803        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2804        md_wakeup_thread(mddev->thread);
2805        return rv;
2806}
2807
/* sysfs attribute "level": readable by all, writable by owner */
static struct md_sysfs_entry md_level =
__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
2810
2811
2812static ssize_t
2813layout_show(mddev_t *mddev, char *page)
2814{
2815        /* just a number, not meaningful for all levels */
2816        if (mddev->reshape_position != MaxSector &&
2817            mddev->layout != mddev->new_layout)
2818                return sprintf(page, "%d (%d)\n",
2819                               mddev->new_layout, mddev->layout);
2820        return sprintf(page, "%d\n", mddev->layout);
2821}
2822
2823static ssize_t
2824layout_store(mddev_t *mddev, const char *buf, size_t len)
2825{
2826        char *e;
2827        unsigned long n = simple_strtoul(buf, &e, 10);
2828
2829        if (!*buf || (*e && *e != '\n'))
2830                return -EINVAL;
2831
2832        if (mddev->pers) {
2833                int err;
2834                if (mddev->pers->check_reshape == NULL)
2835                        return -EBUSY;
2836                mddev->new_layout = n;
2837                err = mddev->pers->check_reshape(mddev);
2838                if (err) {
2839                        mddev->new_layout = mddev->layout;
2840                        return err;
2841                }
2842        } else {
2843                mddev->new_layout = n;
2844                if (mddev->reshape_position == MaxSector)
2845                        mddev->layout = n;
2846        }
2847        return len;
2848}
/* sysfs attribute "layout": readable by all, writable by owner */
static struct md_sysfs_entry md_layout =
__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
2851
2852
2853static ssize_t
2854raid_disks_show(mddev_t *mddev, char *page)
2855{
2856        if (mddev->raid_disks == 0)
2857                return 0;
2858        if (mddev->reshape_position != MaxSector &&
2859            mddev->delta_disks != 0)
2860                return sprintf(page, "%d (%d)\n", mddev->raid_disks,
2861                               mddev->raid_disks - mddev->delta_disks);
2862        return sprintf(page, "%d\n", mddev->raid_disks);
2863}
2864
/* Forward declaration: raid_disks_store() calls this for active arrays */
static int update_raid_disks(mddev_t *mddev, int raid_disks);
2866
2867static ssize_t
2868raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2869{
2870        char *e;
2871        int rv = 0;
2872        unsigned long n = simple_strtoul(buf, &e, 10);
2873
2874        if (!*buf || (*e && *e != '\n'))
2875                return -EINVAL;
2876
2877        if (mddev->pers)
2878                rv = update_raid_disks(mddev, n);
2879        else if (mddev->reshape_position != MaxSector) {
2880                int olddisks = mddev->raid_disks - mddev->delta_disks;
2881                mddev->delta_disks = n - olddisks;
2882                mddev->raid_disks = n;
2883        } else
2884                mddev->raid_disks = n;
2885        return rv ? rv : len;
2886}
/* sysfs attribute "raid_disks": readable by all, writable by owner */
static struct md_sysfs_entry md_raid_disks =
__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
2889
2890static ssize_t
2891chunk_size_show(mddev_t *mddev, char *page)
2892{
2893        if (mddev->reshape_position != MaxSector &&
2894            mddev->chunk_sectors != mddev->new_chunk_sectors)
2895                return sprintf(page, "%d (%d)\n",
2896                               mddev->new_chunk_sectors << 9,
2897                               mddev->chunk_sectors << 9);
2898        return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
2899}
2900
2901static ssize_t
2902chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2903{
2904        char *e;
2905        unsigned long n = simple_strtoul(buf, &e, 10);
2906
2907        if (!*buf || (*e && *e != '\n'))
2908                return -EINVAL;
2909
2910        if (mddev->pers) {
2911                int err;
2912                if (mddev->pers->check_reshape == NULL)
2913                        return -EBUSY;
2914                mddev->new_chunk_sectors = n >> 9;
2915                err = mddev->pers->check_reshape(mddev);
2916                if (err) {
2917                        mddev->new_chunk_sectors = mddev->chunk_sectors;
2918                        return err;
2919                }
2920        } else {
2921                mddev->new_chunk_sectors = n >> 9;
2922                if (mddev->reshape_position == MaxSector)
2923                        mddev->chunk_sectors = n >> 9;
2924        }
2925        return len;
2926}
/* sysfs attribute "chunk_size": readable by all, writable by owner */
static struct md_sysfs_entry md_chunk_size =
__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
2929
2930static ssize_t
2931resync_start_show(mddev_t *mddev, char *page)
2932{
2933        if (mddev->recovery_cp == MaxSector)
2934                return sprintf(page, "none\n");
2935        return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2936}
2937
2938static ssize_t
2939resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2940{
2941        char *e;
2942        unsigned long long n = simple_strtoull(buf, &e, 10);
2943
2944        if (mddev->pers)
2945                return -EBUSY;
2946        if (!*buf || (*e && *e != '\n'))
2947                return -EINVAL;
2948
2949        mddev->recovery_cp = n;
2950        return len;
2951}
/* sysfs attribute "resync_start": readable by all, writable by owner */
static struct md_sysfs_entry md_resync_start =
__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2954
2955/*
2956 * The array state can be:
2957 *
2958 * clear
2959 *     No devices, no size, no level
2960 *     Equivalent to STOP_ARRAY ioctl
2961 * inactive
2962 *     May have some settings, but array is not active
2963 *        all IO results in error
2964 *     When written, doesn't tear down array, but just stops it
2965 * suspended (not supported yet)
2966 *     All IO requests will block. The array can be reconfigured.
2967 *     Writing this, if accepted, will block until array is quiescent
2968 * readonly
2969 *     no resync can happen.  no superblocks get written.
2970 *     write requests fail
2971 * read-auto
2972 *     like readonly, but behaves like 'clean' on a write request.
2973 *
2974 * clean - no pending writes, but otherwise active.
2975 *     When written to inactive array, starts without resync
2976 *     If a write request arrives then
2977 *       if metadata is known, mark 'dirty' and switch to 'active'.
2978 *       if not known, block and switch to write-pending
2979 *     If written to an active array that has pending writes, then fails.
2980 * active
2981 *     fully active: IO and resync can be happening.
2982 *     When written to inactive array, starts with resync
2983 *
2984 * write-pending
2985 *     clean, but writes are blocked waiting for 'active' to be written.
2986 *
2987 * active-idle
2988 *     like active, but no writes have been seen for a while (100msec).
2989 *
2990 */
/*
 * Enumerators double as indices into array_states[].  bad_word lines up
 * with the terminating NULL, which is the index match_word() returns
 * when no name matches.
 */
enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
                   write_pending, active_idle, bad_word};
/* State names as read from / written to sysfs, in enum order, NULL-terminated */
static char *array_states[] = {
        "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
        "write-pending", "active-idle", NULL };
2996
/*
 * Return the index of the first entry of the NULL-terminated 'list'
 * that cmd_match() accepts for 'word', or the index of the terminating
 * NULL if there is none.
 */
static int match_word(const char *word, char **list)
{
	int idx = 0;

	while (list[idx] && !cmd_match(word, list[idx]))
		idx++;
	return idx;
}
3005
3006static ssize_t
3007array_state_show(mddev_t *mddev, char *page)
3008{
3009        enum array_state st = inactive;
3010
3011        if (mddev->pers)
3012                switch(mddev->ro) {
3013                case 1:
3014                        st = readonly;
3015                        break;
3016                case 2:
3017                        st = read_auto;
3018                        break;
3019                case 0:
3020                        if (mddev->in_sync)
3021                                st = clean;
3022                        else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
3023                                st = write_pending;
3024                        else if (mddev->safemode)
3025                                st = active_idle;
3026                        else
3027                                st = active;
3028                }
3029        else {
3030                if (list_empty(&mddev->disks) &&
3031                    mddev->raid_disks == 0 &&
3032                    mddev->dev_sectors == 0)
3033                        st = clear;
3034                else
3035                        st = inactive;
3036        }
3037        return sprintf(page, "%s\n", array_states[st]);
3038}
3039
/* Forward declarations of the helpers array_state_store() uses to stop, start and restart an array */
static int do_md_stop(mddev_t * mddev, int ro, int is_open);
static int do_md_run(mddev_t * mddev);
static int restart_array(mddev_t *mddev);
3043
3044static ssize_t
3045array_state_store(mddev_t *mddev, const char *buf, size_t len)
3046{
3047        int err = -EINVAL;
3048        enum array_state st = match_word(buf, array_states);
3049        switch(st) {
3050        case bad_word:
3051                break;
3052        case clear:
3053                /* stopping an active array */
3054                if (atomic_read(&mddev->openers) > 0)
3055                        return -EBUSY;
3056                err = do_md_stop(mddev, 0, 0);
3057                break;
3058        case inactive:
3059                /* stopping an active array */
3060                if (mddev->pers) {
3061                        if (atomic_read(&mddev->openers) > 0)
3062                                return -EBUSY;
3063                        err = do_md_stop(mddev, 2, 0);
3064                } else
3065                        err = 0; /* already inactive */
3066                break;
3067        case suspended:
3068                break; /* not supported yet */
3069        case readonly:
3070                if (mddev->pers)
3071                        err = do_md_stop(mddev, 1, 0);
3072                else {
3073                        mddev->ro = 1;
3074                        set_disk_ro(mddev->gendisk, 1);
3075                        err = do_md_run(mddev);
3076                }
3077                break;
3078        case read_auto:
3079                if (mddev->pers) {
3080                        if (mddev->ro == 0)
3081                                err = do_md_stop(mddev, 1, 0);
3082                        else if (mddev->ro == 1)
3083                                err = restart_array(mddev);
3084                        if (err == 0) {
3085                                mddev->ro = 2;
3086                                set_disk_ro(mddev->gendisk, 0);
3087                        }
3088                } else {
3089                        mddev->ro = 2;
3090                        err = do_md_run(mddev);
3091                }
3092                break;
3093        case clean:
3094                if (mddev->pers) {
3095                        restart_array(mddev);
3096                        spin_lock_irq(&mddev->write_lock);
3097                        if (atomic_read(&mddev->writes_pending) == 0) {
3098                                if (mddev->in_sync == 0) {
3099                                        mddev->in_sync = 1;
3100                                        if (mddev->safemode == 1)
3101                                                mddev->safemode = 0;
3102                                        if (mddev->persistent)
3103                                                set_bit(MD_CHANGE_CLEAN,
3104                                                        &mddev->flags);
3105                                }
3106                                err = 0;
3107                        } else
3108                                err = -EBUSY;
3109                        spin_unlock_irq(&mddev->write_lock);
3110                } else
3111                        err = -EINVAL;
3112                break;
3113        case active:
3114                if (mddev->pers) {
3115                        restart_array(mddev);
3116                        if (mddev->external)
3117                                clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
3118                        wake_up(&mddev->sb_wait);
3119                        err = 0;
3120                } else {
3121                        mddev->ro = 0;
3122                        set_disk_ro(mddev->gendisk, 0);
3123                        err = do_md_run(mddev);
3124                }
3125                break;
3126        case write_pending:
3127        case active_idle:
3128                /* these cannot be set */
3129                break;
3130        }
3131        if (err)
3132                return err;
3133        else {
3134                sysfs_notify_dirent(mddev->sysfs_state);
3135                return len;
3136        }
3137}
/* sysfs attribute "array_state": readable by all, writable by owner */
static struct md_sysfs_entry md_array_state =
__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3140
/* ->show handler for write-only attributes: reading always fails with -EINVAL */
static ssize_t
null_show(mddev_t *mddev, char *page)
{
        return -EINVAL;
}
3146
3147static ssize_t
3148new_dev_store(mddev_t *mddev, const char *buf, size_t len)
3149{
3150        /* buf must be %d:%d\n? giving major and minor numbers */
3151        /* The new device is added to the array.
3152         * If the array has a persistent superblock, we read the
3153         * superblock to initialise info and check validity.
3154         * Otherwise, only checking done is that in bind_rdev_to_array,
3155         * which mainly checks size.
3156         */
3157        char *e;
3158        int major = simple_strtoul(buf, &e, 10);
3159        int minor;
3160        dev_t dev;
3161        mdk_rdev_t *rdev;
3162        int err;
3163
3164        if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
3165                return -EINVAL;
3166        minor = simple_strtoul(e+1, &e, 10);
3167        if (*e && *e != '\n')
3168                return -EINVAL;
3169        dev = MKDEV(major, minor);
3170        if (major != MAJOR(dev) ||
3171            minor != MINOR(dev))
3172                return -EOVERFLOW;
3173
3174
3175        if (mddev->persistent) {
3176                rdev = md_import_device(dev, mddev->major_version,
3177                                        mddev->minor_version);
3178                if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
3179                        mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
3180                                                       mdk_rdev_t, same_set);
3181                        err = super_types[mddev->major_version]
3182                                .load_super(rdev, rdev0, mddev->minor_version);
3183                        if (err < 0)
3184                                goto out;
3185                }
3186        } else if (mddev->external)
3187                rdev = md_import_device(dev, -2, -1);
3188        else
3189                rdev = md_import_device(dev, -1, -1);
3190
3191        if (IS_ERR(rdev))
3192                return PTR_ERR(rdev);
3193        err = bind_rdev_to_array(rdev, mddev);
3194 out:
3195        if (err)
3196                export_rdev(rdev);
3197        return err ? err : len;
3198}
3199
/* sysfs attribute "new_dev": write-only (owner), reads go to null_show */
static struct md_sysfs_entry md_new_device =
__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
3202
3203static ssize_t
3204bitmap_store(mddev_t *mddev, const char *buf, size_t len)
3205{
3206        char *end;
3207        unsigned long chunk, end_chunk;
3208
3209        if (!mddev->bitmap)
3210                goto out;
3211        /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
3212        while (*buf) {
3213                chunk = end_chunk = simple_strtoul(buf, &end, 0);
3214                if (buf == end) break;
3215                if (*end == '-') { /* range */
3216                        buf = end + 1;
3217                        end_chunk = simple_strtoul(buf, &end, 0);
3218                        if (buf == end) break;
3219                }
3220                if (*end && !isspace(*end)) break;
3221                bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
3222                buf = end;
3223                while (isspace(*buf)) buf++;
3224        }
3225        bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
3226out:
3227        return len;
3228}
3229
/* sysfs attribute "bitmap_set_bits": write-only (owner), reads go to null_show */
static struct md_sysfs_entry md_bitmap =
__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
3232
3233static ssize_t
3234size_show(mddev_t *mddev, char *page)
3235{
3236        return sprintf(page, "%llu\n",
3237                (unsigned long long)mddev->dev_sectors / 2);
3238}
3239
/* Forward declaration: size_store() calls this for active arrays */
static int update_size(mddev_t *mddev, sector_t num_sectors);
3241
3242static ssize_t
3243size_store(mddev_t *mddev, const char *buf, size_t len)
3244{
3245        /* If array is inactive, we can reduce the component size, but
3246         * not increase it (except from 0).
3247         * If array is active, we can try an on-line resize
3248         */
3249        sector_t sectors;
3250        int err = strict_blocks_to_sectors(buf, &sectors);
3251
3252        if (err < 0)
3253                return err;
3254        if (mddev->pers) {
3255                err = update_size(mddev, sectors);
3256                md_update_sb(mddev, 1);
3257        } else {
3258                if (mddev->dev_sectors == 0 ||
3259                    mddev->dev_sectors > sectors)
3260                        mddev->dev_sectors = sectors;
3261                else
3262                        err = -ENOSPC;
3263        }
3264        return err ? err : len;
3265}
3266
3267static struct md_sysfs_entry md_size =
3268__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
3269
3270
/* Metadata version.
 * This is one of
 *   'none' for arrays with no metadata (good luck...)
 *   'external' for arrays with externally managed metadata,
 * or N.M for internally known formats
 */
3277static ssize_t
3278metadata_show(mddev_t *mddev, char *page)
3279{
3280        if (mddev->persistent)
3281                return sprintf(page, "%d.%d\n",
3282                               mddev->major_version, mddev->minor_version);
3283        else if (mddev->external)
3284                return sprintf(page, "external:%s\n", mddev->metadata_type);
3285        else
3286                return sprintf(page, "none\n");
3287}
3288
3289static ssize_t
3290metadata_store(mddev_t *mddev, const char *buf, size_t len)
3291{
3292        int major, minor;
3293        char *e;
3294        /* Changing the details of 'external' metadata is
3295         * always permitted.  Otherwise there must be
3296         * no devices attached to the array.
3297         */
3298        if (mddev->external && strncmp(buf, "external:", 9) == 0)
3299                ;
3300        else if (!list_empty(&mddev->disks))
3301                return -EBUSY;
3302
3303        if (cmd_match(buf, "none")) {
3304                mddev->persistent = 0;
3305                mddev->external = 0;
3306                mddev->major_version = 0;
3307                mddev->minor_version = 90;
3308                return len;
3309        }
3310        if (strncmp(buf, "external:", 9) == 0) {
3311                size_t namelen = len-9;
3312                if (namelen >= sizeof(mddev->metadata_type))
3313                        namelen = sizeof(mddev->metadata_type)-1;
3314                strncpy(mddev->metadata_type, buf+9, namelen);
3315                mddev->metadata_type[namelen] = 0;
3316                if (namelen && mddev->metadata_type[namelen-1] == '\n')
3317                        mddev->metadata_type[--namelen] = 0;
3318                mddev->persistent = 0;
3319                mddev->external = 1;
3320                mddev->major_version = 0;
3321                mddev->minor_version = 90;
3322                return len;
3323        }
3324        major = simple_strtoul(buf, &e, 10);
3325        if (e==buf || *e != '.')
3326                return -EINVAL;
3327        buf = e+1;
3328        minor = simple_strtoul(buf, &e, 10);
3329        if (e==buf || (*e && *e != '\n') )
3330                return -EINVAL;
3331        if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
3332                return -ENOENT;
3333        mddev->major_version = major;
3334        mddev->minor_version = minor;
3335        mddev->persistent = 1;
3336        mddev->external = 0;
3337        return len;
3338}
3339
3340static struct md_sysfs_entry md_metadata =
3341__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
3342
3343static ssize_t
3344action_show(mddev_t *mddev, char *page)
3345{
3346        char *type = "idle";
3347        if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3348                type = "frozen";
3349        else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3350            (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
3351                if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3352                        type = "reshape";
3353                else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3354                        if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
3355                                type = "resync";
3356                        else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
3357                                type = "check";
3358                        else
3359                                type = "repair";
3360                } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
3361                        type = "recover";
3362        }
3363        return sprintf(page, "%s\n", type);
3364}
3365
3366static ssize_t
3367action_store(mddev_t *mddev, const char *page, size_t len)
3368{
3369        if (!mddev->pers || !mddev->pers->sync_request)
3370                return -EINVAL;
3371
3372        if (cmd_match(page, "frozen"))
3373                set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3374        else
3375                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3376
3377        if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
3378                if (mddev->sync_thread) {
3379                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3380                        md_unregister_thread(mddev->sync_thread);
3381                        mddev->sync_thread = NULL;
3382                        mddev->recovery = 0;
3383                }
3384        } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3385                   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
3386                return -EBUSY;
3387        else if (cmd_match(page, "resync"))
3388                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3389        else if (cmd_match(page, "recover")) {
3390                set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
3391                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3392        } else if (cmd_match(page, "reshape")) {
3393                int err;
3394                if (mddev->pers->start_reshape == NULL)
3395                        return -EINVAL;
3396                err = mddev->pers->start_reshape(mddev);
3397                if (err)
3398                        return err;
3399                sysfs_notify(&mddev->kobj, NULL, "degraded");
3400        } else {
3401                if (cmd_match(page, "check"))
3402                        set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3403                else if (!cmd_match(page, "repair"))
3404                        return -EINVAL;
3405                set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
3406                set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3407        }
3408        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3409        md_wakeup_thread(mddev->thread);
3410        sysfs_notify_dirent(mddev->sysfs_action);
3411        return len;
3412}
3413
3414static ssize_t
3415mismatch_cnt_show(mddev_t *mddev, char *page)
3416{
3417        return sprintf(page, "%llu\n",
3418                       (unsigned long long) mddev->resync_mismatches);
3419}
3420
3421static struct md_sysfs_entry md_scan_mode =
3422__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
3423
3424
3425static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
3426
3427static ssize_t
3428sync_min_show(mddev_t *mddev, char *page)
3429{
3430        return sprintf(page, "%d (%s)\n", speed_min(mddev),
3431                       mddev->sync_speed_min ? "local": "system");
3432}
3433
3434static ssize_t
3435sync_min_store(mddev_t *mddev, const char *buf, size_t len)
3436{
3437        int min;
3438        char *e;
3439        if (strncmp(buf, "system", 6)==0) {
3440                mddev->sync_speed_min = 0;
3441                return len;
3442        }
3443        min = simple_strtoul(buf, &e, 10);
3444        if (buf == e || (*e && *e != '\n') || min <= 0)
3445                return -EINVAL;
3446        mddev->sync_speed_min = min;
3447        return len;
3448}
3449
3450static struct md_sysfs_entry md_sync_min =
3451__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
3452
3453static ssize_t
3454sync_max_show(mddev_t *mddev, char *page)
3455{
3456        return sprintf(page, "%d (%s)\n", speed_max(mddev),
3457                       mddev->sync_speed_max ? "local": "system");
3458}
3459
3460static ssize_t
3461sync_max_store(mddev_t *mddev, const char *buf, size_t len)
3462{
3463        int max;
3464        char *e;
3465        if (strncmp(buf, "system", 6)==0) {
3466                mddev->sync_speed_max = 0;
3467                return len;
3468        }
3469        max = simple_strtoul(buf, &e, 10);
3470        if (buf == e || (*e && *e != '\n') || max <= 0)
3471                return -EINVAL;
3472        mddev->sync_speed_max = max;
3473        return len;
3474}
3475
3476static struct md_sysfs_entry md_sync_max =
3477__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
3478
3479static ssize_t
3480degraded_show(mddev_t *mddev, char *page)
3481{
3482        return sprintf(page, "%d\n", mddev->degraded);
3483}
3484static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
3485
3486static ssize_t
3487sync_force_parallel_show(mddev_t *mddev, char *page)
3488{
3489        return sprintf(page, "%d\n", mddev->parallel_resync);
3490}
3491
3492static ssize_t
3493sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len)
3494{
3495        long n;
3496
3497        if (strict_strtol(buf, 10, &n))
3498                return -EINVAL;
3499
3500        if (n != 0 && n != 1)
3501                return -EINVAL;
3502
3503        mddev->parallel_resync = n;
3504
3505        if (mddev->sync_thread)
3506                wake_up(&resync_wait);
3507
3508        return len;
3509}
3510
3511/* force parallel resync, even with shared block devices */
3512static struct md_sysfs_entry md_sync_force_parallel =
3513__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
3514       sync_force_parallel_show, sync_force_parallel_store);
3515
3516static ssize_t
3517sync_speed_show(mddev_t *mddev, char *page)
3518{
3519        unsigned long resync, dt, db;
3520        if (mddev->curr_resync == 0)
3521                return sprintf(page, "none\n");
3522        resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
3523        dt = (jiffies - mddev->resync_mark) / HZ;
3524        if (!dt) dt++;
3525        db = resync - mddev->resync_mark_cnt;
3526        return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
3527}
3528
3529static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
3530
3531static ssize_t
3532sync_completed_show(mddev_t *mddev, char *page)
3533{
3534        unsigned long max_sectors, resync;
3535
3536        if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3537                return sprintf(page, "none\n");
3538
3539        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3540                max_sectors = mddev->resync_max_sectors;
3541        else
3542                max_sectors = mddev->dev_sectors;
3543
3544        resync = mddev->curr_resync_completed;
3545        return sprintf(page, "%lu / %lu\n", resync, max_sectors);
3546}
3547
3548static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
3549
3550static ssize_t
3551min_sync_show(mddev_t *mddev, char *page)
3552{
3553        return sprintf(page, "%llu\n",
3554                       (unsigned long long)mddev->resync_min);
3555}
3556static ssize_t
3557min_sync_store(mddev_t *mddev, const char *buf, size_t len)
3558{
3559        unsigned long long min;
3560        if (strict_strtoull(buf, 10, &min))
3561                return -EINVAL;
3562        if (min > mddev->resync_max)
3563                return -EINVAL;
3564        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3565                return -EBUSY;
3566
3567        /* Must be a multiple of chunk_size */
3568        if (mddev->chunk_sectors) {
3569                sector_t temp = min;
3570                if (sector_div(temp, mddev->chunk_sectors))
3571                        return -EINVAL;
3572        }
3573        mddev->resync_min = min;
3574
3575        return len;
3576}
3577
3578static struct md_sysfs_entry md_min_sync =
3579__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
3580
3581static ssize_t
3582max_sync_show(mddev_t *mddev, char *page)
3583{
3584        if (mddev->resync_max == MaxSector)
3585                return sprintf(page, "max\n");
3586        else
3587                return sprintf(page, "%llu\n",
3588                               (unsigned long long)mddev->resync_max);
3589}
3590static ssize_t
3591max_sync_store(mddev_t *mddev, const char *buf, size_t len)
3592{
3593        if (strncmp(buf, "max", 3) == 0)
3594                mddev->resync_max = MaxSector;
3595        else {
3596                unsigned long long max;
3597                if (strict_strtoull(buf, 10, &max))
3598                        return -EINVAL;
3599                if (max < mddev->resync_min)
3600                        return -EINVAL;
3601                if (max < mddev->resync_max &&
3602                    mddev->ro == 0 &&
3603                    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3604                        return -EBUSY;
3605
3606                /* Must be a multiple of chunk_size */
3607                if (mddev->chunk_sectors) {
3608                        sector_t temp = max;
3609                        if (sector_div(temp, mddev->chunk_sectors))
3610                                return -EINVAL;
3611                }
3612                mddev->resync_max = max;
3613        }
3614        wake_up(&mddev->recovery_wait);
3615        return len;
3616}
3617
3618static struct md_sysfs_entry md_max_sync =
3619__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
3620
3621static ssize_t
3622suspend_lo_show(mddev_t *mddev, char *page)
3623{
3624        return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
3625}
3626
3627static ssize_t
3628suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
3629{
3630        char *e;
3631        unsigned long long new = simple_strtoull(buf, &e, 10);
3632
3633        if (mddev->pers == NULL || 
3634            mddev->pers->quiesce == NULL)
3635                return -EINVAL;
3636        if (buf == e || (*e && *e != '\n'))
3637                return -EINVAL;
3638        if (new >= mddev->suspend_hi ||
3639            (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
3640                mddev->suspend_lo = new;
3641                mddev->pers->quiesce(mddev, 2);
3642                return len;
3643        } else
3644                return -EINVAL;
3645}
3646static struct md_sysfs_entry md_suspend_lo =
3647__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
3648
3649
3650static ssize_t
3651suspend_hi_show(mddev_t *mddev, char *page)
3652{
3653        return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
3654}
3655
3656static ssize_t
3657suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
3658{
3659        char *e;
3660        unsigned long long new = simple_strtoull(buf, &e, 10);
3661
3662        if (mddev->pers == NULL ||
3663            mddev->pers->quiesce == NULL)
3664                return -EINVAL;
3665        if (buf == e || (*e && *e != '\n'))
3666                return -EINVAL;
3667        if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
3668            (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
3669                mddev->suspend_hi = new;
3670                mddev->pers->quiesce(mddev, 1);
3671                mddev->pers->quiesce(mddev, 0);
3672                return len;
3673        } else
3674                return -EINVAL;
3675}
3676static struct md_sysfs_entry md_suspend_hi =
3677__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
3678
3679static ssize_t
3680reshape_position_show(mddev_t *mddev, char *page)
3681{
3682        if (mddev->reshape_position != MaxSector)
3683                return sprintf(page, "%llu\n",
3684                               (unsigned long long)mddev->reshape_position);
3685        strcpy(page, "none\n");
3686        return 5;
3687}
3688
3689static ssize_t
3690reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
3691{
3692        char *e;
3693        unsigned long long new = simple_strtoull(buf, &e, 10);
3694        if (mddev->pers)
3695                return -EBUSY;
3696        if (buf == e || (*e && *e != '\n'))
3697                return -EINVAL;
3698        mddev->reshape_position = new;
3699        mddev->delta_disks = 0;
3700        mddev->new_level = mddev->level;
3701        mddev->new_layout = mddev->layout;
3702        mddev->new_chunk_sectors = mddev->chunk_sectors;
3703        return len;
3704}
3705
3706static struct md_sysfs_entry md_reshape_position =
3707__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
3708       reshape_position_store);
3709
3710static ssize_t
3711array_size_show(mddev_t *mddev, char *page)
3712{
3713        if (mddev->external_size)
3714                return sprintf(page, "%llu\n",
3715                               (unsigned long long)mddev->array_sectors/2);
3716        else
3717                return sprintf(page, "default\n");
3718}
3719
3720static ssize_t
3721array_size_store(mddev_t *mddev, const char *buf, size_t len)
3722{
3723        sector_t sectors;
3724
3725        if (strncmp(buf, "default", 7) == 0) {
3726                if (mddev->pers)
3727                        sectors = mddev->pers->size(mddev, 0, 0);
3728                else
3729                        sectors = mddev->array_sectors;
3730
3731                mddev->external_size = 0;
3732        } else {
3733                if (strict_blocks_to_sectors(buf, &sectors) < 0)
3734                        return -EINVAL;
3735                if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
3736                        return -E2BIG;
3737
3738                mddev->external_size = 1;
3739        }
3740
3741        mddev->array_sectors = sectors;
3742        set_capacity(mddev->gendisk, mddev->array_sectors);
3743        if (mddev->pers)
3744                revalidate_disk(mddev->gendisk);
3745
3746        return len;
3747}
3748
3749static struct md_sysfs_entry md_array_size =
3750__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
3751       array_size_store);
3752
3753static struct attribute *md_default_attrs[] = {
3754        &md_level.attr,
3755        &md_layout.attr,
3756        &md_raid_disks.attr,
3757        &md_chunk_size.attr,
3758        &md_size.attr,
3759        &md_resync_start.attr,
3760        &md_metadata.attr,
3761        &md_new_device.attr,
3762        &md_safe_delay.attr,
3763        &md_array_state.attr,
3764        &md_reshape_position.attr,
3765        &md_array_size.attr,
3766        NULL,
3767};
3768
3769static struct attribute *md_redundancy_attrs[] = {
3770        &md_scan_mode.attr,
3771        &md_mismatches.attr,
3772        &md_sync_min.attr,
3773        &md_sync_max.attr,
3774        &md_sync_speed.attr,
3775        &md_sync_force_parallel.attr,
3776        &md_sync_completed.attr,
3777        &md_min_sync.attr,
3778        &md_max_sync.attr,
3779        &md_suspend_lo.attr,
3780        &md_suspend_hi.attr,
3781        &md_bitmap.attr,
3782        &md_degraded.attr,
3783        NULL,
3784};
3785static struct attribute_group md_redundancy_group = {
3786        .name = NULL,
3787        .attrs = md_redundancy_attrs,
3788};
3789
3790
3791static ssize_t
3792md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3793{
3794        struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3795        mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3796        ssize_t rv;
3797
3798        if (!entry->show)
3799                return -EIO;
3800        rv = mddev_lock(mddev);
3801        if (!rv) {
3802                rv = entry->show(mddev, page);
3803                mddev_unlock(mddev);
3804        }
3805        return rv;
3806}
3807
3808static ssize_t
3809md_attr_store(struct kobject *kobj, struct attribute *attr,
3810              const char *page, size_t length)
3811{
3812        struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3813        mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3814        ssize_t rv;
3815
3816        if (!entry->store)
3817                return -EIO;
3818        if (!capable(CAP_SYS_ADMIN))
3819                return -EACCES;
3820        rv = mddev_lock(mddev);
3821        if (mddev->hold_active == UNTIL_IOCTL)
3822                mddev->hold_active = 0;
3823        if (!rv) {
3824                rv = entry->store(mddev, page, length);
3825                mddev_unlock(mddev);
3826        }
3827        return rv;
3828}
3829
3830static void md_free(struct kobject *ko)
3831{
3832        mddev_t *mddev = container_of(ko, mddev_t, kobj);
3833
3834        if (mddev->sysfs_state)
3835                sysfs_put(mddev->sysfs_state);
3836
3837        if (mddev->gendisk) {
3838                del_gendisk(mddev->gendisk);
3839                put_disk(mddev->gendisk);
3840        }
3841        if (mddev->queue)
3842                blk_cleanup_queue(mddev->queue);
3843
3844        kfree(mddev);
3845}
3846
3847static struct sysfs_ops md_sysfs_ops = {
3848        .show   = md_attr_show,
3849        .store  = md_attr_store,
3850};
3851static struct kobj_type md_ktype = {
3852        .release        = md_free,
3853        .sysfs_ops      = &md_sysfs_ops,
3854        .default_attrs  = md_default_attrs,
3855};
3856
/* NOTE(review): presumably the major number of partitionable md devices,
 * assigned elsewhere at registration time - confirm against the users. */
int mdp_major = 0;
3858
3859static void mddev_delayed_delete(struct work_struct *ws)
3860{
3861        mddev_t *mddev = container_of(ws, mddev_t, del_work);
3862
3863        if (mddev->private == &md_redundancy_group) {
3864                sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
3865                if (mddev->sysfs_action)
3866                        sysfs_put(mddev->sysfs_action);
3867                mddev->sysfs_action = NULL;
3868                mddev->private = NULL;
3869        }
3870        kobject_del(&mddev->kobj);
3871        kobject_put(&mddev->kobj);
3872}
3873
3874static int md_alloc(dev_t dev, char *name)
3875{
3876        static DEFINE_MUTEX(disks_mutex);
3877        mddev_t *mddev = mddev_find(dev);
3878        struct gendisk *disk;
3879        int partitioned;
3880        int shift;
3881        int unit;
3882        int error;
3883
3884        if (!mddev)
3885                return -ENODEV;
3886
3887        partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
3888        shift = partitioned ? MdpMinorShift : 0;
3889        unit = MINOR(mddev->unit) >> shift;
3890
3891        /* wait for any previous instance if this device
3892         * to be completed removed (mddev_delayed_delete).
3893         */
3894        flush_scheduled_work();
3895
3896        mutex_lock(&disks_mutex);
3897        error = -EEXIST;
3898        if (mddev->gendisk)
3899                goto abort;
3900
3901        if (name) {
3902                /* Need to ensure that 'name' is not a duplicate.
3903                 */
3904                mddev_t *mddev2;
3905                spin_lock(&all_mddevs_lock);
3906
3907                list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
3908                        if (mddev2->gendisk &&
3909                            strcmp(mddev2->gendisk->disk_name, name) == 0) {
3910                                spin_unlock(&all_mddevs_lock);
3911                                goto abort;
3912                        }
3913                spin_unlock(&all_mddevs_lock);
3914        }
3915
3916        error = -ENOMEM;
3917        mddev->queue = blk_alloc_queue(GFP_KERNEL);
3918        if (!mddev->queue)
3919                goto abort;
3920        mddev->queue->queuedata = mddev;
3921
3922        /* Can be unlocked because the queue is new: no concurrency */
3923        queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
3924
3925        blk_queue_make_request(mddev->queue, md_make_request);
3926
3927        disk = alloc_disk(1 << shift);
3928        if (!disk) {
3929                blk_cleanup_queue(mddev->queue);
3930                mddev->queue = NULL;
3931                goto abort;
3932        }
3933        disk->major = MAJOR(mddev->unit);
3934        disk->first_minor = unit << shift;
3935        if (name)
3936                strcpy(disk->disk_name, name);
3937        else if (partitioned)
3938                sprintf(disk->disk_name, "md_d%d", unit);
3939        else
3940                sprintf(disk->disk_name, "md%d", unit);
3941        disk->fops = &md_fops;
3942        disk->private_data = mddev;
3943        disk->queue = mddev->queue;
3944        /* Allow extended partitions.  This makes the
3945         * 'mdp' device redundant, but we can't really
3946         * remove it now.
3947         */
3948        disk->flags |= GENHD_FL_EXT_DEVT;
3949        add_disk(disk);
3950        mddev->gendisk = disk;
3951        error = kobject_init_and_add(&mddev->kobj, &md_ktype,
3952                                     &disk_to_dev(disk)->kobj, "%s", "md");
3953        if (error) {
3954                /* This isn't possible, but as kobject_init_and_add is marked
3955                 * __must_check, we must do something with the result
3956                 */
3957                printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
3958                       disk->disk_name);
3959                error = 0;
3960        }
3961 abort:
3962        mutex_unlock(&disks_mutex);
3963        if (!error) {
3964                kobject_uevent(&mddev->kobj, KOBJ_ADD);
3965                mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
3966        }
3967        mddev_put(mddev);
3968        return error;
3969}
3970
3971static struct kobject *md_probe(dev_t dev, int *part, void *data)
3972{
3973        md_alloc(dev, NULL);
3974        return NULL;
3975}
3976
3977static int add_named_array(const char *val, struct kernel_param *kp)
3978{
3979        /* val must be "md_*" where * is not all digits.
3980         * We allocate an array with a large free minor number, and
3981         * set the name to val.  val must not already be an active name.
3982         */
3983        int len = strlen(val);
3984        char buf[DISK_NAME_LEN];
3985
3986        while (len && val[len-1] == '\n')
3987                len--;
3988        if (len >= DISK_NAME_LEN)
3989                return -E2BIG;
3990        strlcpy(buf, val, len+1);
3991        if (strncmp(buf, "md_", 3) != 0)
3992                return -EINVAL;
3993        return md_alloc(0, buf);
3994}
3995
3996static void md_safemode_timeout(unsigned long data)
3997{
3998        mddev_t *mddev = (mddev_t *) data;
3999
4000        if (!atomic_read(&mddev->writes_pending)) {
4001                mddev->safemode = 1;
4002                if (mddev->external)
4003                        sysfs_notify_dirent(mddev->sysfs_state);
4004        }
4005        md_wakeup_thread(mddev->thread);
4006}
4007
/* Copied into mddev->ok_start_degraded by do_md_run(). */
static int start_dirty_degraded;
4009
4010static int do_md_run(mddev_t * mddev)
4011{
4012        int err;
4013        mdk_rdev_t *rdev;
4014        struct gendisk *disk;
4015        struct mdk_personality *pers;
4016
4017        if (list_empty(&mddev->disks))
4018                /* cannot run an array with no devices.. */
4019                return -EINVAL;
4020
4021        if (mddev->pers)
4022                return -EBUSY;
4023
4024        /*
4025         * Analyze all RAID superblock(s)
4026         */
4027        if (!mddev->raid_disks) {
4028                if (!mddev->persistent)
4029                        return -EINVAL;
4030                analyze_sbs(mddev);
4031        }
4032
4033        if (mddev->level != LEVEL_NONE)
4034                request_module("md-level-%d", mddev->level);
4035        else if (mddev->clevel[0])
4036                request_module("md-%s", mddev->clevel);
4037
4038        /*
4039         * Drop all container device buffers, from now on
4040         * the only valid external interface is through the md
4041         * device.
4042         */
4043        list_for_each_entry(rdev, &mddev->disks, same_set) {
4044                if (test_bit(Faulty, &rdev->flags))
4045                        continue;
4046                sync_blockdev(rdev->bdev);
4047                invalidate_bdev(rdev->bdev);
4048
4049                /* perform some consistency tests on the device.
4050                 * We don't want the data to overlap the metadata,
4051                 * Internal Bitmap issues have been handled elsewhere.
4052                 */
4053                if (rdev->data_offset < rdev->sb_start) {
4054                        if (mddev->dev_sectors &&
4055                            rdev->data_offset + mddev->dev_sectors
4056                            > rdev->sb_start) {
4057                                printk("md: %s: data overlaps metadata\n",
4058                                       mdname(mddev));
4059                                return -EINVAL;
4060                        }
4061                } else {
4062                        if (rdev->sb_start + rdev->sb_size/512
4063                            > rdev->data_offset) {
4064                                printk("md: %s: metadata overlaps data\n",
4065                                       mdname(mddev));
4066                                return -EINVAL;
4067                        }
4068                }
4069                sysfs_notify_dirent(rdev->sysfs_state);
4070        }
4071
4072        md_probe(mddev->unit, NULL, NULL);
4073        disk = mddev->gendisk;
4074        if (!disk)
4075                return -ENOMEM;
4076
4077        spin_lock(&pers_lock);
4078        pers = find_pers(mddev->level, mddev->clevel);
4079        if (!pers || !try_module_get(pers->owner)) {
4080                spin_unlock(&pers_lock);
4081                if (mddev->level != LEVEL_NONE)
4082                        printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
4083                               mddev->level);
4084                else
4085                        printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
4086                               mddev->clevel);
4087                return -EINVAL;
4088        }
4089        mddev->pers = pers;
4090        spin_unlock(&pers_lock);
4091        if (mddev->level != pers->level) {
4092                mddev->level = pers->level;
4093                mddev->new_level = pers->level;
4094        }
4095        strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
4096
4097        if (mddev->reshape_position != MaxSector &&
4098            pers->start_reshape == NULL) {
4099                /* This personality cannot handle reshaping... */
4100                mddev->pers = NULL;
4101                module_put(pers->owner);
4102                return -EINVAL;
4103        }
4104
4105        if (pers->sync_request) {
4106                /* Warn if this is a potentially silly
4107                 * configuration.
4108                 */
4109                char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
4110                mdk_rdev_t *rdev2;
4111                int warned = 0;
4112
4113                list_for_each_entry(rdev, &mddev->disks, same_set)
4114                        list_for_each_entry(rdev2, &mddev->disks, same_set) {
4115                                if (rdev < rdev2 &&
4116                                    rdev->bdev->bd_contains ==
4117                                    rdev2->bdev->bd_contains) {
4118                                        printk(KERN_WARNING
4119                                               "%s: WARNING: %s appears to be"
4120                                               " on the same physical disk as"
4121                                               " %s.\n",
4122                                               mdname(mddev),
4123                                               bdevname(rdev->bdev,b),
4124                                               bdevname(rdev2->bdev,b2));
4125                                        warned = 1;
4126                                }
4127                        }
4128
4129                if (warned)
4130                        printk(KERN_WARNING
4131                               "True protection against single-disk"
4132                               " failure might be compromised.\n");
4133        }
4134
4135        mddev->recovery = 0;
4136        /* may be over-ridden by personality */
4137        mddev->resync_max_sectors = mddev->dev_sectors;
4138
4139        mddev->barriers_work = 1;
4140        mddev->ok_start_degraded = start_dirty_degraded;
4141
4142        if (start_readonly)
4143                mddev->ro = 2; /* read-only, but switch on first write */
4144
4145        err = mddev->pers->run(mddev);
4146        if (err)
4147                printk(KERN_ERR "md: pers->run() failed ...\n");
4148        else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
4149                WARN_ONCE(!mddev->external_size, "%s: default size too small,"
4150                          " but 'external_size' not in effect?\n", __func__);
4151                printk(KERN_ERR
4152                       "md: invalid array_size %llu > default size %llu\n",
4153                       (unsigned long long)mddev->array_sectors / 2,
4154                       (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
4155                err = -EINVAL;
4156                mddev->pers->stop(mddev);
4157        }
4158        if (err == 0 && mddev->pers->sync_request) {
4159                err = bitmap_create(mddev);
4160                if (err) {
4161                        printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
4162                               mdname(mddev), err);
4163                        mddev->pers->stop(mddev);
4164                }
4165        }
4166        if (err) {
4167                module_put(mddev->pers->owner);
4168                mddev->pers = NULL;
4169                bitmap_destroy(mddev);
4170                return err;
4171        }
4172        if (mddev->pers->sync_request) {
4173                if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4174                        printk(KERN_WARNING
4175                               "md: cannot register extra attributes for %s\n",
4176                               mdname(mddev));
4177                mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
4178        } else if (mddev->ro == 2) /* auto-readonly not meaningful */
4179                mddev->ro = 0;
4180
4181        atomic_set(&mddev->writes_pending,0);
4182        mddev->safemode = 0;
4183        mddev->safemode_timer.function = md_safemode_timeout;
4184        mddev->safemode_timer.data = (unsigned long) mddev;
4185        mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
4186        mddev->in_sync = 1;
4187
4188        list_for_each_entry(rdev, &mddev->disks, same_set)
4189                if (rdev->raid_disk >= 0) {
4190                        char nm[20];
4191                        sprintf(nm, "rd%d", rdev->raid_disk);
4192                        if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
4193                                printk("md: cannot register %s for %s\n",
4194                                       nm, mdname(mddev));
4195                }
4196        
4197        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4198        
4199        if (mddev->flags)
4200                md_update_sb(mddev, 0);
4201
4202        set_capacity(disk, mddev->array_sectors);
4203
4204        /* If there is a partially-recovered drive we need to
4205         * start recovery here.  If we leave it to md_check_recovery,
4206         * it will remove the drives and not do the right thing
4207         */
4208        if (mddev->degraded && !mddev->sync_thread) {
4209                int spares = 0;
4210                list_for_each_entry(rdev, &mddev->disks, same_set)
4211                        if (rdev->raid_disk >= 0 &&
4212                            !test_bit(In_sync, &rdev->flags) &&
4213                            !test_bit(Faulty, &rdev->flags))
4214                                /* complete an interrupted recovery */
4215                                spares++;
4216                if (spares && mddev->pers->sync_request) {
4217                        mddev->recovery = 0;
4218                        set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4219                        mddev->sync_thread = md_register_thread(md_do_sync,
4220                                                                mddev,
4221                                                                "%s_resync");
4222                        if (!mddev->sync_thread) {
4223                                printk(KERN_ERR "%s: could not start resync"
4224                                       " thread...\n",
4225                                       mdname(mddev));
4226                                /* leave the spares where they are, it shouldn't hurt */
4227                                mddev->recovery = 0;
4228                        }
4229                }
4230        }
4231        md_wakeup_thread(mddev->thread);
4232        md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
4233
4234        revalidate_disk(mddev->gendisk);
4235        mddev->changed = 1;
4236        md_new_event(mddev);
4237        sysfs_notify_dirent(mddev->sysfs_state);
4238        if (mddev->sysfs_action)
4239                sysfs_notify_dirent(mddev->sysfs_action);
4240        sysfs_notify(&mddev->kobj, NULL, "degraded");
4241        kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4242        return 0;
4243}
4244
4245static int restart_array(mddev_t *mddev)
4246{
4247        struct gendisk *disk = mddev->gendisk;
4248
4249        /* Complain if it has no devices */
4250        if (list_empty(&mddev->disks))
4251                return -ENXIO;
4252        if (!mddev->pers)
4253                return -EINVAL;
4254        if (!mddev->ro)
4255                return -EBUSY;
4256        mddev->safemode = 0;
4257        mddev->ro = 0;
4258        set_disk_ro(disk, 0);
4259        printk(KERN_INFO "md: %s switched to read-write mode.\n",
4260                mdname(mddev));
4261        /* Kick recovery or resync if necessary */
4262        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4263        md_wakeup_thread(mddev->thread);
4264        md_wakeup_thread(mddev->sync_thread);
4265        sysfs_notify_dirent(mddev->sysfs_state);
4266        return 0;
4267}
4268
4269/* similar to deny_write_access, but accounts for our holding a reference
4270 * to the file ourselves */
4271static int deny_bitmap_write_access(struct file * file)
4272{
4273        struct inode *inode = file->f_mapping->host;
4274
4275        spin_lock(&inode->i_lock);
4276        if (atomic_read(&inode->i_writecount) > 1) {
4277                spin_unlock(&inode->i_lock);
4278                return -ETXTBSY;
4279        }
4280        atomic_set(&inode->i_writecount, -1);
4281        spin_unlock(&inode->i_lock);
4282
4283        return 0;
4284}
4285
4286static void restore_bitmap_write_access(struct file *file)
4287{
4288        struct inode *inode = file->f_mapping->host;
4289
4290        spin_lock(&inode->i_lock);
4291        atomic_set(&inode->i_writecount, 1);
4292        spin_unlock(&inode->i_lock);
4293}
4294
4295/* mode:
4296 *   0 - completely stop and dis-assemble array
4297 *   1 - switch to readonly
4298 *   2 - stop but do not disassemble array
4299 */
4300static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4301{
4302        int err = 0;
4303        struct gendisk *disk = mddev->gendisk;
4304        mdk_rdev_t *rdev;
4305
4306        mutex_lock(&mddev->open_mutex);
4307        if (atomic_read(&mddev->openers) > is_open) {
4308                printk("md: %s still in use.\n",mdname(mddev));
4309                err = -EBUSY;
4310        } else if (mddev->pers) {
4311
4312                if (mddev->sync_thread) {
4313                        set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4314                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4315                        md_unregister_thread(mddev->sync_thread);
4316                        mddev->sync_thread = NULL;
4317                }
4318
4319                del_timer_sync(&mddev->safemode_timer);
4320
4321                switch(mode) {
4322                case 1: /* readonly */
4323                        err  = -ENXIO;
4324                        if (mddev->ro==1)
4325                                goto out;
4326                        mddev->ro = 1;
4327                        break;
4328                case 0: /* disassemble */
4329                case 2: /* stop */
4330                        bitmap_flush(mddev);
4331                        md_super_wait(mddev);
4332                        if (mddev->ro)
4333                                set_disk_ro(disk, 0);
4334
4335                        mddev->pers->stop(mddev);
4336                        mddev->queue->merge_bvec_fn = NULL;
4337                        mddev->queue->unplug_fn = NULL;
4338                        mddev->queue->backing_dev_info.congested_fn = NULL;
4339                        module_put(mddev->pers->owner);
4340                        if (mddev->pers->sync_request)
4341                                mddev->private = &md_redundancy_group;
4342                        mddev->pers = NULL;
4343                        /* tell userspace to handle 'inactive' */
4344                        sysfs_notify_dirent(mddev->sysfs_state);
4345
4346                        list_for_each_entry(rdev, &mddev->disks, same_set)
4347                                if (rdev->raid_disk >= 0) {
4348                                        char nm[20];
4349                                        sprintf(nm, "rd%d", rdev->raid_disk);
4350                                        sysfs_remove_link(&mddev->kobj, nm);
4351                                }
4352
4353                        set_capacity(disk, 0);
4354                        mddev->changed = 1;
4355
4356                        if (mddev->ro)
4357                                mddev->ro = 0;
4358                }
4359                if (!mddev->in_sync || mddev->flags) {
4360                        /* mark array as shutdown cleanly */
4361                        mddev->in_sync = 1;
4362                        md_update_sb(mddev, 1);
4363                }
4364                if (mode == 1)
4365                        set_disk_ro(disk, 1);
4366                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4367                err = 0;
4368        }
4369out:
4370        mutex_unlock(&mddev->open_mutex);
4371        if (err)
4372                return err;
4373        /*
4374         * Free resources if final stop
4375         */
4376        if (mode == 0) {
4377
4378                printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
4379
4380                bitmap_destroy(mddev);
4381                if (mddev->bitmap_file) {
4382                        restore_bitmap_write_access(mddev->bitmap_file);
4383                        fput(mddev->bitmap_file);
4384                        mddev->bitmap_file = NULL;
4385                }
4386                mddev->bitmap_offset = 0;
4387
4388                /* make sure all md_delayed_delete calls have finished */
4389                flush_scheduled_work();
4390
4391                export_array(mddev);
4392
4393                mddev->array_sectors = 0;
4394                mddev->external_size = 0;
4395                mddev->dev_sectors = 0;
4396                mddev->raid_disks = 0;
4397                mddev->recovery_cp = 0;
4398                mddev->resync_min = 0;
4399                mddev->resync_max = MaxSector;
4400                mddev->reshape_position = MaxSector;
4401                mddev->external = 0;
4402                mddev->persistent = 0;
4403                mddev->level = LEVEL_NONE;
4404                mddev->clevel[0] = 0;
4405                mddev->flags = 0;
4406                mddev->ro = 0;
4407                mddev->metadata_type[0] = 0;
4408                mddev->chunk_sectors = 0;
4409                mddev->ctime = mddev->utime = 0;
4410                mddev->layout = 0;
4411                mddev->max_disks = 0;
4412                mddev->events = 0;
4413                mddev->delta_disks = 0;
4414                mddev->new_level = LEVEL_NONE;
4415                mddev->new_layout = 0;
4416                mddev->new_chunk_sectors = 0;
4417                mddev->curr_resync = 0;
4418                mddev->resync_mismatches = 0;
4419                mddev->suspend_lo = mddev->suspend_hi = 0;
4420                mddev->sync_speed_min = mddev->sync_speed_max = 0;
4421                mddev->recovery = 0;
4422                mddev->in_sync = 0;
4423                mddev->changed = 0;
4424                mddev->degraded = 0;
4425                mddev->barriers_work = 0;
4426                mddev->safemode = 0;
4427                kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4428                if (mddev->hold_active == UNTIL_STOP)
4429                        mddev->hold_active = 0;
4430
4431        } else if (mddev->pers)
4432                printk(KERN_INFO "md: %s switched to read-only mode.\n",
4433                        mdname(mddev));
4434        err = 0;
4435        blk_integrity_unregister(disk);
4436        md_new_event(mddev);
4437        sysfs_notify_dirent(mddev->sysfs_state);
4438        return err;
4439}
4440
4441#ifndef MODULE
4442static void autorun_array(mddev_t *mddev)
4443{
4444        mdk_rdev_t *rdev;
4445        int err;
4446
4447        if (list_empty(&mddev->disks))
4448                return;
4449
4450        printk(KERN_INFO "md: running: ");
4451
4452        list_for_each_entry(rdev, &mddev->disks, same_set) {
4453                char b[BDEVNAME_SIZE];
4454                printk("<%s>", bdevname(rdev->bdev,b));
4455        }
4456        printk("\n");
4457
4458        err = do_md_run(mddev);
4459        if (err) {
4460                printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
4461                do_md_stop(mddev, 0, 0);
4462        }
4463}
4464
4465/*
4466 * lets try to run arrays based on all disks that have arrived
4467 * until now. (those are in pending_raid_disks)
4468 *
4469 * the method: pick the first pending disk, collect all disks with
4470 * the same UUID, remove all from the pending list and put them into
4471 * the 'same_array' list. Then order this list based on superblock
4472 * update time (freshest comes first), kick out 'old' disks and
4473 * compare superblocks. If everything's fine then run it.
4474 *
4475 * If "unit" is allocated, then bump its reference count
4476 */
4477static void autorun_devices(int part)
4478{
4479        mdk_rdev_t *rdev0, *rdev, *tmp;
4480        mddev_t *mddev;
4481        char b[BDEVNAME_SIZE];
4482
4483        printk(KERN_INFO "md: autorun ...\n");
4484        while (!list_empty(&pending_raid_disks)) {
4485                int unit;
4486                dev_t dev;
4487                LIST_HEAD(candidates);
4488                rdev0 = list_entry(pending_raid_disks.next,
4489                                         mdk_rdev_t, same_set);
4490
4491                printk(KERN_INFO "md: considering %s ...\n",
4492                        bdevname(rdev0->bdev,b));
4493                INIT_LIST_HEAD(&candidates);
4494                rdev_for_each_list(rdev, tmp, &pending_raid_disks)
4495                        if (super_90_load(rdev, rdev0, 0) >= 0) {
4496                                printk(KERN_INFO "md:  adding %s ...\n",
4497                                        bdevname(rdev->bdev,b));
4498                                list_move(&rdev->same_set, &candidates);
4499                        }
4500                /*
4501                 * now we have a set of devices, with all of them having
4502                 * mostly sane superblocks. It's time to allocate the
4503                 * mddev.
4504                 */
4505                if (part) {
4506                        dev = MKDEV(mdp_major,
4507                                    rdev0->preferred_minor << MdpMinorShift);
4508                        unit = MINOR(dev) >> MdpMinorShift;
4509                } else {
4510                        dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
4511                        unit = MINOR(dev);
4512                }
4513                if (rdev0->preferred_minor != unit) {
4514                        printk(KERN_INFO "md: unit number in %s is bad: %d\n",
4515                               bdevname(rdev0->bdev, b), rdev0->preferred_minor);
4516                        break;
4517                }
4518
4519                md_probe(dev, NULL, NULL);
4520                mddev = mddev_find(dev);
4521                if (!mddev || !mddev->gendisk) {
4522                        if (mddev)
4523                                mddev_put(mddev);
4524                        printk(KERN_ERR
4525                                "md: cannot allocate memory for md drive.\n");
4526                        break;
4527                }
4528                if (mddev_lock(mddev)) 
4529                        printk(KERN_WARNING "md: %s locked, cannot run\n",
4530                               mdname(mddev));
4531                else if (mddev->raid_disks || mddev->major_version
4532                         || !list_empty(&mddev->disks)) {
4533                        printk(KERN_WARNING 
4534                                "md: %s already running, cannot run %s\n",
4535                                mdname(mddev), bdevname(rdev0->bdev,b));
4536                        mddev_unlock(mddev);
4537                } else {
4538                        printk(KERN_INFO "md: created %s\n", mdname(mddev));
4539                        mddev->persistent = 1;
4540                        rdev_for_each_list(rdev, tmp, &candidates) {
4541                                list_del_init(&rdev->same_set);
4542                                if (bind_rdev_to_array(rdev, mddev))
4543                                        export_rdev(rdev);
4544                        }
4545                        autorun_array(mddev);
4546                        mddev_unlock(mddev);
4547                }
4548                /* on success, candidates will be empty, on error
4549                 * it won't...
4550                 */
4551                rdev_for_each_list(rdev, tmp, &candidates) {
4552                        list_del_init(&rdev->same_set);
4553                        export_rdev(rdev);
4554                }
4555                mddev_put(mddev);
4556        }
4557        printk(KERN_INFO "md: ... autorun DONE.\n");
4558}
4559#endif /* !MODULE */
4560
4561static int get_version(void __user * arg)
4562{
4563        mdu_version_t ver;
4564
4565        ver.major = MD_MAJOR_VERSION;
4566        ver.minor = MD_MINOR_VERSION;
4567        ver.patchlevel = MD_PATCHLEVEL_VERSION;
4568
4569        if (copy_to_user(arg, &ver, sizeof(ver)))
4570                return -EFAULT;
4571
4572        return 0;
4573}
4574
4575static int get_array_info(mddev_t * mddev, void __user * arg)
4576{
4577        mdu_array_info_t info;
4578        int nr,working,active,failed,spare;
4579        mdk_rdev_t *rdev;
4580
4581        nr=working=active=failed=spare=0;
4582        list_for_each_entry(rdev, &mddev->disks, same_set) {
4583                nr++;
4584                if (test_bit(Faulty, &rdev->flags))
4585                        failed++;
4586                else {
4587                        working++;
4588                        if (test_bit(In_sync, &rdev->flags))
4589                                active++;       
4590                        else
4591                                spare++;
4592                }
4593        }
4594
4595        info.major_version = mddev->major_version;
4596        info.minor_version = mddev->minor_version;
4597        info.patch_version = MD_PATCHLEVEL_VERSION;
4598        info.ctime         = mddev->ctime;
4599        info.level         = mddev->level;
4600        info.size          = mddev->dev_sectors / 2;
4601        if (info.size != mddev->dev_sectors / 2) /* overflow */
4602                info.size = -1;
4603        info.nr_disks      = nr;
4604        info.raid_disks    = mddev->raid_disks;
4605        info.md_minor      = mddev->md_minor;
4606        info.not_persistent= !mddev->persistent;
4607
4608        info.utime         = mddev->utime;
4609        info.state         = 0;
4610        if (mddev->in_sync)
4611                info.state = (1<<MD_SB_CLEAN);
4612        if (mddev->bitmap && mddev->bitmap_offset)
4613                info.state = (1<<MD_SB_BITMAP_PRESENT);
4614        info.active_disks  = active;
4615        info.working_disks = working;
4616        info.failed_disks  = failed;
4617        info.spare_disks   = spare;
4618
4619        info.layout        = mddev->layout;
4620        info.chunk_size    = mddev->chunk_sectors << 9;
4621
4622        if (copy_to_user(arg, &info, sizeof(info)))
4623                return -EFAULT;
4624
4625        return 0;
4626}
4627
4628static int get_bitmap_file(mddev_t * mddev, void __user * arg)
4629{
4630        mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
4631        char *ptr, *buf = NULL;
4632        int err = -ENOMEM;
4633
4634        if (md_allow_write(mddev))
4635                file = kmalloc(sizeof(*file), GFP_NOIO);
4636        else
4637                file = kmalloc(sizeof(*file), GFP_KERNEL);
4638
4639        if (!file)
4640                goto out;
4641
4642        /* bitmap disabled, zero the first byte and copy out */
4643        if (!mddev->bitmap || !mddev->bitmap->file) {
4644                file->pathname[0] = '\0';
4645                goto copy_out;
4646        }
4647
4648        buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
4649        if (!buf)
4650                goto out;
4651
4652        ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname));
4653        if (IS_ERR(ptr))
4654                goto out;
4655
4656        strcpy(file->pathname, ptr);
4657
4658copy_out:
4659        err = 0;
4660        if (copy_to_user(arg, file, sizeof(*file)))
4661                err = -EFAULT;
4662out:
4663        kfree(buf);
4664        kfree(file);
4665        return err;
4666}
4667
4668static int get_disk_info(mddev_t * mddev, void __user * arg)
4669{
4670        mdu_disk_info_t info;
4671        mdk_rdev_t *rdev;
4672
4673        if (copy_from_user(&info, arg, sizeof(info)))
4674                return -EFAULT;
4675
4676        rdev = find_rdev_nr(mddev, info.number);
4677        if (rdev) {
4678                info.major = MAJOR(rdev->bdev->bd_dev);
4679                info.minor = MINOR(rdev->bdev->bd_dev);
4680                info.raid_disk = rdev->raid_disk;
4681                info.state = 0;
4682                if (test_bit(Faulty, &rdev->flags))
4683                        info.state |= (1<<MD_DISK_FAULTY);
4684                else if (test_bit(In_sync, &rdev->flags)) {
4685                        info.state |= (1<<MD_DISK_ACTIVE);
4686                        info.state |= (1<<MD_DISK_SYNC);
4687                }
4688                if (test_bit(WriteMostly, &rdev->flags))
4689                        info.state |= (1<<MD_DISK_WRITEMOSTLY);
4690        } else {
4691                info.major = info.minor = 0;
4692                info.raid_disk = -1;
4693                info.state = (1<<MD_DISK_REMOVED);
4694        }
4695
4696        if (copy_to_user(arg, &info, sizeof(info)))
4697                return -EFAULT;
4698
4699        return 0;
4700}
4701
4702static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4703{
4704        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
4705        mdk_rdev_t *rdev;
4706        dev_t dev = MKDEV(info->major,info->minor);
4707
4708        if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
4709                return -EOVERFLOW;
4710
4711        if (!mddev->raid_disks) {
4712                int err;
4713                /* expecting a device which has a superblock */
4714                rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
4715                if (IS_ERR(rdev)) {
4716                        printk(KERN_WARNING 
4717                                "md: md_import_device returned %ld\n",
4718                                PTR_ERR(rdev));
4719                        return PTR_ERR(rdev);
4720                }
4721                if (!list_empty(&mddev->disks)) {
4722                        mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
4723                                                        mdk_rdev_t, same_set);
4724                        int err = super_types[mddev->major_version]
4725                                .load_super(rdev, rdev0, mddev->minor_version);
4726                        if (err < 0) {
4727                                printk(KERN_WARNING 
4728                                        "md: %s has different UUID to %s\n",
4729                                        bdevname(rdev->bdev,b), 
4730                                        bdevname(rdev0->bdev,b2));
4731                                export_rdev(rdev);
4732                                return -EINVAL;
4733                        }
4734                }
4735                err = bind_rdev_to_array(rdev, mddev);
4736                if (err)
4737                        export_rdev(rdev);
4738                return err;
4739        }
4740
4741        /*
4742         * add_new_disk can be used once the array is assembled
4743         * to add "hot spares".  They must already have a superblock
4744         * written
4745         */
4746        if (mddev->pers) {
4747                int err;
4748                if (!mddev->pers->hot_add_disk) {
4749                        printk(KERN_WARNING 
4750                                "%s: personality does not support diskops!\n",
4751                               mdname(mddev));
4752                        return -EINVAL;
4753                }
4754                if (mddev->persistent)
4755                        rdev = md_import_device(dev, mddev->major_version,
4756                                                mddev->minor_version);
4757                else
4758                        rdev = md_import_device(dev, -1, -1);
4759                if (IS_ERR(rdev)) {
4760                        printk(KERN_WARNING 
4761                                "md: md_import_device returned %ld\n",
4762                                PTR_ERR(rdev));
4763                        return PTR_ERR(rdev);
4764                }
4765                /* set save_raid_disk if appropriate */
4766                if (!mddev->persistent) {
4767                        if (info->state & (1<<MD_DISK_SYNC)  &&
4768                            info->raid_disk < mddev->raid_disks)
4769                                rdev->raid_disk = info->raid_disk;
4770                        else
4771                                rdev->raid_disk = -1;
4772                } else
4773                        super_types[mddev->major_version].
4774                                validate_super(mddev, rdev);
4775                rdev->saved_raid_disk = rdev->raid_disk;
4776
4777                clear_bit(In_sync, &rdev->flags); /* just to be sure */
4778                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4779                        set_bit(WriteMostly, &rdev->flags);
4780                else
4781                        clear_bit(WriteMostly, &rdev->flags);
4782
4783                rdev->raid_disk = -1;
4784                err = bind_rdev_to_array(rdev, mddev);
4785                if (!err && !mddev->pers->hot_remove_disk) {
4786                        /* If there is hot_add_disk but no hot_remove_disk
4787                         * then added disks for geometry changes,
4788                         * and should be added immediately.
4789                         */
4790                        super_types[mddev->major_version].
4791                                validate_super(mddev, rdev);
4792                        err = mddev->pers->hot_add_disk(mddev, rdev);
4793                        if (err)
4794                                unbind_rdev_from_array(rdev);
4795                }
4796                if (err)
4797                        export_rdev(rdev);
4798                else
4799                        sysfs_notify_dirent(rdev->sysfs_state);
4800
4801                md_update_sb(mddev, 1);
4802                if (mddev->degraded)
4803                        set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4804                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4805                md_wakeup_thread(mddev->thread);
4806                return err;
4807        }
4808
4809        /* otherwise, add_new_disk is only allowed
4810         * for major_version==0 superblocks
4811         */
4812        if (mddev->major_version != 0) {
4813                printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
4814                       mdname(mddev));
4815                return -EINVAL;
4816        }
4817
4818        if (!(info->state & (1<<MD_DISK_FAULTY))) {
4819                int err;
4820                rdev = md_import_device(dev, -1, 0);
4821                if (IS_ERR(rdev)) {
4822                        printk(KERN_WARNING 
4823                                "md: error, md_import_device() returned %ld\n",
4824                                PTR_ERR(rdev));
4825                        return PTR_ERR(rdev);
4826                }
4827                rdev->desc_nr = info->number;
4828                if (info->raid_disk < mddev->raid_disks)
4829                        rdev->raid_disk = info->raid_disk;
4830                else
4831                        rdev->raid_disk = -1;
4832
4833                if (rdev->raid_disk < mddev->raid_disks)
4834                        if (info->state & (1<<MD_DISK_SYNC))
4835                                set_bit(In_sync, &rdev->flags);
4836
4837                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4838                        set_bit(WriteMostly, &rdev->flags);
4839
4840                if (!mddev->persistent) {
4841                        printk(KERN_INFO "md: nonpersistent superblock ...\n");
4842                        rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4843                } else 
4844                        rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4845                rdev->sectors = rdev->sb_start;
4846
4847                err = bind_rdev_to_array(rdev, mddev);
4848                if (err) {
4849                        export_rdev(rdev);
4850                        return err;
4851                }
4852        }
4853
4854        return 0;
4855}
4856
4857static int hot_remove_disk(mddev_t * mddev, dev_t dev)
4858{
4859        char b[BDEVNAME_SIZE];
4860        mdk_rdev_t *rdev;
4861
4862        rdev = find_rdev(mddev, dev);
4863        if (!rdev)
4864                return -ENXIO;
4865
4866        if (rdev->raid_disk >= 0)
4867                goto busy;
4868
4869        kick_rdev_from_array(rdev);
4870        md_update_sb(mddev, 1);
4871        md_new_event(mddev);
4872
4873        return 0;
4874busy:
4875        printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
4876                bdevname(rdev->bdev,b), mdname(mddev));
4877        return -EBUSY;
4878}
4879
4880static int hot_add_disk(mddev_t * mddev, dev_t dev)
4881{
4882        char b[BDEVNAME_SIZE];
4883        int err;
4884        mdk_rdev_t *rdev;
4885
4886        if (!mddev->pers)
4887                return -ENODEV;
4888
4889        if (mddev->major_version != 0) {
4890                printk(KERN_WARNING "%s: HOT_ADD may only be used with"
4891                        " version-0 superblocks.\n",
4892                        mdname(mddev));
4893                return -EINVAL;
4894        }
4895        if (!mddev->pers->hot_add_disk) {
4896                printk(KERN_WARNING 
4897                        "%s: personality does not support diskops!\n",
4898                        mdname(mddev));
4899                return -EINVAL;
4900        }
4901
4902        rdev = md_import_device(dev, -1, 0);
4903        if (IS_ERR(rdev)) {
4904                printk(KERN_WARNING 
4905                        "md: error, md_import_device() returned %ld\n",
4906                        PTR_ERR(rdev));
4907                return -EINVAL;
4908        }
4909
4910        if (mddev->persistent)
4911                rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4912        else
4913                rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4914
4915        rdev->sectors = rdev->sb_start;
4916
4917        if (test_bit(Faulty, &rdev->flags)) {
4918                printk(KERN_WARNING 
4919                        "md: can not hot-add faulty %s disk to %s!\n",
4920                        bdevname(rdev->bdev,b), mdname(mddev));
4921                err = -EINVAL;
4922                goto abort_export;
4923        }
4924        clear_bit(In_sync, &rdev->flags);
4925        rdev->desc_nr = -1;
4926        rdev->saved_raid_disk = -1;
4927        err = bind_rdev_to_array(rdev, mddev);
4928        if (err)
4929                goto abort_export;
4930
4931        /*
4932         * The rest should better be atomic, we can have disk failures
4933         * noticed in interrupt contexts ...
4934         */
4935
4936        rdev->raid_disk = -1;
4937
4938        md_update_sb(mddev, 1);
4939
4940        /*
4941         * Kick recovery, maybe this spare has to be added to the
4942         * array immediately.
4943         */
4944        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4945        md_wakeup_thread(mddev->thread);
4946        md_new_event(mddev);
4947        return 0;
4948
4949abort_export:
4950        export_rdev(rdev);
4951        return err;
4952}
4953
4954static int set_bitmap_file(mddev_t *mddev, int fd)
4955{
4956        int err;
4957
4958        if (mddev->pers) {
4959                if (!mddev->pers->quiesce)
4960                        return -EBUSY;
4961                if (mddev->recovery || mddev->sync_thread)
4962                        return -EBUSY;
4963                /* we should be able to change the bitmap.. */
4964        }
4965
4966
4967        if (fd >= 0) {
4968                if (mddev->bitmap)
4969                        return -EEXIST; /* cannot add when bitmap is present */
4970                mddev->bitmap_file = fget(fd);
4971
4972                if (mddev->bitmap_file == NULL) {
4973                        printk(KERN_ERR "%s: error: failed to get bitmap file\n",
4974                               mdname(mddev));
4975                        return -EBADF;
4976                }
4977
4978                err = deny_bitmap_write_access(mddev->bitmap_file);
4979                if (err) {
4980                        printk(KERN_ERR "%s: error: bitmap file is already in use\n",
4981                               mdname(mddev));
4982                        fput(mddev->bitmap_file);
4983                        mddev->bitmap_file = NULL;
4984                        return err;
4985                }
4986                mddev->bitmap_offset = 0; /* file overrides offset */
4987        } else if (mddev->bitmap == NULL)
4988                return -ENOENT; /* cannot remove what isn't there */
4989        err = 0;
4990        if (mddev->pers) {
4991                mddev->pers->quiesce(mddev, 1);
4992                if (fd >= 0)
4993                        err = bitmap_create(mddev);
4994                if (fd < 0 || err) {
4995                        bitmap_destroy(mddev);
4996                        fd = -1; /* make sure to put the file */
4997                }
4998                mddev->pers->quiesce(mddev, 0);
4999        }
5000        if (fd < 0) {
5001                if (mddev->bitmap_file) {
5002                        restore_bitmap_write_access(mddev->bitmap_file);
5003                        fput(mddev->bitmap_file);
5004                }
5005                mddev->bitmap_file = NULL;
5006        }
5007
5008        return err;
5009}
5010
5011/*
5012 * set_array_info is used two different ways
5013 * The original usage is when creating a new array.
5014 * In this usage, raid_disks is > 0 and it together with
5015 *  level, size, not_persistent,layout,chunksize determine the
5016 *  shape of the array.
5017 *  This will always create an array with a type-0.90.0 superblock.
5018 * The newer usage is when assembling an array.
5019 *  In this case raid_disks will be 0, and the major_version field is
5020 *  use to determine which style super-blocks are to be found on the devices.
5021 *  The minor and patch _version numbers are also kept incase the
5022 *  super_block handler wishes to interpret them.
5023 */
5024static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
5025{
5026
5027        if (info->raid_disks == 0) {
5028                /* just setting version number for superblock loading */
5029                if (info->major_version < 0 ||
5030                    info->major_version >= ARRAY_SIZE(super_types) ||
5031                    super_types[info->major_version].name == NULL) {
5032                        /* maybe try to auto-load a module? */
5033                        printk(KERN_INFO 
5034                                "md: superblock version %d not known\n",
5035                                info->major_version);
5036                        return -EINVAL;
5037                }
5038                mddev->major_version = info->major_version;
5039                mddev->minor_version = info->minor_version;
5040                mddev->patch_version = info->patch_version;
5041                mddev->persistent = !info->not_persistent;
5042                return 0;
5043        }
5044        mddev->major_version = MD_MAJOR_VERSION;
5045        mddev->minor_version = MD_MINOR_VERSION;
5046        mddev->patch_version = MD_PATCHLEVEL_VERSION;
5047        mddev->ctime         = get_seconds();
5048
5049        mddev->level         = info->level;
5050        mddev->clevel[0]     = 0;
5051        mddev->dev_sectors   = 2 * (sector_t)info->size;
5052        mddev->raid_disks    = info->raid_disks;
5053        /* don't set md_minor, it is determined by which /dev/md* was
5054         * openned
5055         */
5056        if (info->state & (1<<MD_SB_CLEAN))
5057                mddev->recovery_cp = MaxSector;
5058        else
5059                mddev->recovery_cp = 0;
5060        mddev->persistent    = ! info->not_persistent;
5061        mddev->external      = 0;
5062
5063        mddev->layout        = info->layout;
5064        mddev->chunk_sectors = info->chunk_size >> 9;
5065
5066        mddev->max_disks     = MD_SB_DISKS;
5067
5068        if (mddev->persistent)
5069                mddev->flags         = 0;
5070        set_bit(MD_CHANGE_DEVS, &mddev->flags);
5071
5072        mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
5073        mddev->bitmap_offset = 0;
5074
5075        mddev->reshape_position = MaxSector;
5076
5077        /*
5078         * Generate a 128 bit UUID
5079         */
5080        get_random_bytes(mddev->uuid, 16);
5081
5082        mddev->new_level = mddev->level;
5083        mddev->new_chunk_sectors = mddev->chunk_sectors;
5084        mddev->new_layout = mddev->layout;
5085        mddev->delta_disks = 0;
5086
5087        return 0;
5088}
5089
5090void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
5091{
5092        WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
5093
5094        if (mddev->external_size)
5095                return;
5096
5097        mddev->array_sectors = array_sectors;
5098}
5099EXPORT_SYMBOL(md_set_array_sectors);
5100
5101static int update_size(mddev_t *mddev, sector_t num_sectors)
5102{
5103        mdk_rdev_t *rdev;
5104        int rv;
5105        int fit = (num_sectors == 0);
5106
5107        if (mddev->pers->resize == NULL)
5108                return -EINVAL;
5109        /* The "num_sectors" is the number of sectors of each device that
5110         * is used.  This can only make sense for arrays with redundancy.
5111         * linear and raid0 always use whatever space is available. We can only
5112         * consider changing this number if no resync or reconstruction is
5113         * happening, and if the new size is acceptable. It must fit before the
5114         * sb_start or, if that is <data_offset, it must fit before the size
5115         * of each device.  If num_sectors is zero, we find the largest size
5116         * that fits.
5117
5118         */
5119        if (mddev->sync_thread)
5120                return -EBUSY;
5121        if (mddev->bitmap)
5122                /* Sorry, cannot grow a bitmap yet, just remove it,
5123                 * grow, and re-add.
5124                 */
5125                return -EBUSY;
5126        list_for_each_entry(rdev, &mddev->disks, same_set) {
5127                sector_t avail = rdev->sectors;
5128
5129                if (fit && (num_sectors == 0 || num_sectors > avail))
5130                        num_sectors = avail;
5131                if (avail < num_sectors)
5132                        return -ENOSPC;
5133        }
5134        rv = mddev->pers->resize(mddev, num_sectors);
5135        if (!rv)
5136                revalidate_disk(mddev->gendisk);
5137        return rv;
5138}
5139
5140static int update_raid_disks(mddev_t *mddev, int raid_disks)
5141{
5142        int rv;
5143        /* change the number of raid disks */
5144        if (mddev->pers->check_reshape == NULL)
5145                return -EINVAL;
5146        if (raid_disks <= 0 ||
5147            raid_disks >= mddev->max_disks)
5148                return -EINVAL;
5149        if (mddev->sync_thread || mddev->reshape_position != MaxSector)
5150                return -EBUSY;
5151        mddev->delta_disks = raid_disks - mddev->raid_disks;
5152
5153        rv = mddev->pers->check_reshape(mddev);
5154        return rv;
5155}
5156
5157
5158/*
5159 * update_array_info is used to change the configuration of an
5160 * on-line array.
5161 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
5162 * fields in the info are checked against the array.
5163 * Any differences that cannot be handled will cause an error.
5164 * Normally, only one change can be managed at a time.
5165 */
5166static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5167{
5168        int rv = 0;
5169        int cnt = 0;
5170        int state = 0;
5171
5172        /* calculate expected state,ignoring low bits */
5173        if (mddev->bitmap && mddev->bitmap_offset)
5174                state |= (1 << MD_SB_BITMAP_PRESENT);
5175
5176        if (mddev->major_version != info->major_version ||
5177            mddev->minor_version != info->minor_version ||
5178/*          mddev->patch_version != info->patch_version || */
5179            mddev->ctime         != info->ctime         ||
5180            mddev->level         != info->level         ||
5181/*          mddev->layout        != info->layout        || */
5182            !mddev->persistent   != info->not_persistent||
5183            mddev->chunk_sectors != info->chunk_size >> 9 ||
5184            /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
5185            ((state^info->state) & 0xfffffe00)
5186                )
5187                return -EINVAL;
5188        /* Check there is only one change */
5189        if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
5190                cnt++;
5191        if (mddev->raid_disks != info->raid_disks)
5192                cnt++;
5193        if (mddev->layout != info->layout)
5194                cnt++;
5195        if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
5196                cnt++;
5197        if (cnt == 0)
5198                return 0;
5199        if (cnt > 1)
5200                return -EINVAL;
5201
5202        if (mddev->layout != info->layout) {
5203                /* Change layout
5204                 * we don't need to do anything at the md level, the
5205                 * personality will take care of it all.
5206                 */
5207                if (mddev->pers->check_reshape == NULL)
5208                        return -EINVAL;
5209                else {
5210                        mddev->new_layout = info->layout;
5211                        rv = mddev->pers->check_reshape(mddev);
5212                        if (rv)
5213                                mddev->new_layout = mddev->layout;
5214                        return rv;
5215                }
5216        }
5217        if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
5218                rv = update_size(mddev, (sector_t)info->size * 2);
5219
5220        if (mddev->raid_disks    != info->raid_disks)
5221                rv = update_raid_disks(mddev, info->raid_disks);
5222
5223        if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
5224                if (mddev->pers->quiesce == NULL)
5225                        return -EINVAL;
5226                if (mddev->recovery || mddev->sync_thread)
5227                        return -EBUSY;
5228                if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
5229                        /* add the bitmap */
5230                        if (mddev->bitmap)
5231                                return -EEXIST;
5232                        if (mddev->default_bitmap_offset == 0)
5233                                return -EINVAL;
5234                        mddev->bitmap_offset = mddev->default_bitmap_offset;
5235                        mddev->pers->quiesce(mddev, 1);
5236                        rv = bitmap_create(mddev);
5237                        if (rv)
5238                                bitmap_destroy(mddev);
5239                        mddev->pers->quiesce(mddev, 0);
5240                } else {
5241                        /* remove the bitmap */
5242                        if (!mddev->bitmap)
5243                                return -ENOENT;
5244                        if (mddev->bitmap->file)
5245                                return -EINVAL;
5246                        mddev->pers->quiesce(mddev, 1);
5247                        bitmap_destroy(mddev);
5248                        mddev->pers->quiesce(mddev, 0);
5249                        mddev->bitmap_offset = 0;
5250                }
5251        }
5252        md_update_sb(mddev, 1);
5253        return rv;
5254}
5255
5256static int set_disk_faulty(mddev_t *mddev, dev_t dev)
5257{
5258        mdk_rdev_t *rdev;
5259
5260        if (mddev->pers == NULL)
5261                return -ENODEV;
5262
5263        rdev = find_rdev(mddev, dev);
5264        if (!rdev)
5265                return -ENODEV;
5266
5267        md_error(mddev, rdev);
5268        return 0;
5269}
5270
5271/*
5272 * We have a problem here : there is no easy way to give a CHS
5273 * virtual geometry. We currently pretend that we have a 2 heads
5274 * 4 sectors (with a BIG number of cylinders...). This drives
5275 * dosfs just mad... ;-)
5276 */
5277static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
5278{
5279        mddev_t *mddev = bdev->bd_disk->private_data;
5280
5281        geo->heads = 2;
5282        geo->sectors = 4;
5283        geo->cylinders = get_capacity(mddev->gendisk) / 8;
5284        return 0;
5285}
5286
5287static int md_ioctl(struct block_device *bdev, fmode_t mode,
5288                        unsigned int cmd, unsigned long arg)
5289{
5290        int err = 0;
5291        void __user *argp = (void __user *)arg;
5292        mddev_t *mddev = NULL;
5293
5294        if (!capable(CAP_SYS_ADMIN))
5295                return -EACCES;
5296
5297        /*
5298         * Commands dealing with the RAID driver but not any
5299         * particular array:
5300         */
5301        switch (cmd)
5302        {
5303                case RAID_VERSION:
5304                        err = get_version(argp);
5305                        goto done;
5306
5307                case PRINT_RAID_DEBUG:
5308                        err = 0;
5309                        md_print_devices();
5310                        goto done;
5311
5312#ifndef MODULE
5313                case RAID_AUTORUN:
5314                        err = 0;
5315                        autostart_arrays(arg);
5316                        goto done;
5317#endif
5318                default:;
5319        }
5320
5321        /*
5322         * Commands creating/starting a new array:
5323         */
5324
5325        mddev = bdev->bd_disk->private_data;
5326
5327        if (!mddev) {
5328                BUG();
5329                goto abort;
5330        }
5331
5332        err = mddev_lock(mddev);
5333        if (err) {
5334                printk(KERN_INFO 
5335                        "md: ioctl lock interrupted, reason %d, cmd %d\n",
5336                        err, cmd);
5337                goto abort;
5338        }
5339
5340        switch (cmd)
5341        {
5342                case SET_ARRAY_INFO:
5343                        {
5344                                mdu_array_info_t info;
5345                                if (!arg)
5346                                        memset(&info, 0, sizeof(info));
5347                                else if (copy_from_user(&info, argp, sizeof(info))) {
5348                                        err = -EFAULT;
5349                                        goto abort_unlock;
5350                                }
5351                                if (mddev->pers) {
5352                                        err = update_array_info(mddev, &info);
5353                                        if (err) {
5354                                                printk(KERN_WARNING "md: couldn't update"
5355                                                       " array info. %d\n", err);
5356                                                goto abort_unlock;
5357                                        }
5358                                        goto done_unlock;
5359                                }
5360                                if (!list_empty(&mddev->disks)) {
5361                                        printk(KERN_WARNING
5362                                               "md: array %s already has disks!\n",
5363                                               mdname(mddev));
5364                                        err = -EBUSY;
5365                                        goto abort_unlock;
5366                                }
5367                                if (mddev->raid_disks) {
5368                                        printk(KERN_WARNING
5369                                               "md: array %s already initialised!\n",
5370                                               mdname(mddev));
5371                                        err = -EBUSY;
5372                                        goto abort_unlock;
5373                                }
5374                                err = set_array_info(mddev, &info);
5375                                if (err) {
5376                                        printk(KERN_WARNING "md: couldn't set"
5377                                               " array info. %d\n", err);
5378                                        goto abort_unlock;
5379                                }
5380                        }
5381                        goto done_unlock;
5382
5383                default:;
5384        }
5385
5386        /*
5387         * Commands querying/configuring an existing array:
5388         */
5389        /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
5390         * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
5391        if ((!mddev->raid_disks && !mddev->external)
5392            && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
5393            && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
5394            && cmd != GET_BITMAP_FILE) {
5395                err = -ENODEV;
5396                goto abort_unlock;
5397        }
5398
5399        /*
5400         * Commands even a read-only array can execute:
5401         */
5402        switch (cmd)
5403        {
5404                case GET_ARRAY_INFO:
5405                        err = get_array_info(mddev, argp);
5406                        goto done_unlock;
5407
5408                case GET_BITMAP_FILE:
5409                        err = get_bitmap_file(mddev, argp);
5410                        goto done_unlock;
5411
5412                case GET_DISK_INFO:
5413                        err = get_disk_info(mddev, argp);
5414                        goto done_unlock;
5415
5416                case RESTART_ARRAY_RW:
5417                        err = restart_array(mddev);
5418                        goto done_unlock;
5419
5420                case STOP_ARRAY:
5421                        err = do_md_stop(mddev, 0, 1);
5422                        goto done_unlock;
5423
5424                case STOP_ARRAY_RO:
5425                        err = do_md_stop(mddev, 1, 1);
5426                        goto done_unlock;
5427
5428        }
5429
5430        /*
5431         * The remaining ioctls are changing the state of the
5432         * superblock, so we do not allow them on read-only arrays.
5433         * However non-MD ioctls (e.g. get-size) will still come through
5434         * here and hit the 'default' below, so only disallow
5435         * 'md' ioctls, and switch to rw mode if started auto-readonly.
5436         */
5437        if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
5438                if (mddev->ro == 2) {
5439                        mddev->ro = 0;
5440                        sysfs_notify_dirent(mddev->sysfs_state);
5441                        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5442                        md_wakeup_thread(mddev->thread);
5443                } else {
5444                        err = -EROFS;
5445                        goto abort_unlock;
5446                }
5447        }
5448
5449        switch (cmd)
5450        {
5451                case ADD_NEW_DISK:
5452                {
5453                        mdu_disk_info_t info;
5454                        if (copy_from_user(&info, argp, sizeof(info)))
5455                                err = -EFAULT;
5456                        else
5457                                err = add_new_disk(mddev, &info);
5458                        goto done_unlock;
5459                }
5460
5461                case HOT_REMOVE_DISK:
5462                        err = hot_remove_disk(mddev, new_decode_dev(arg));
5463                        goto done_unlock;
5464
5465                case HOT_ADD_DISK:
5466                        err = hot_add_disk(mddev, new_decode_dev(arg));
5467                        goto done_unlock;
5468
5469                case SET_DISK_FAULTY:
5470                        err = set_disk_faulty(mddev, new_decode_dev(arg));
5471                        goto done_unlock;
5472
5473                case RUN_ARRAY:
5474                        err = do_md_run(mddev);
5475                        goto done_unlock;
5476
5477                case SET_BITMAP_FILE:
5478                        err = set_bitmap_file(mddev, (int)arg);
5479                        goto done_unlock;
5480
5481                default:
5482                        err = -EINVAL;
5483                        goto abort_unlock;
5484        }
5485
5486done_unlock:
5487abort_unlock:
5488        if (mddev->hold_active == UNTIL_IOCTL &&
5489            err != -EINVAL)
5490                mddev->hold_active = 0;
5491        mddev_unlock(mddev);
5492
5493        return err;
5494done:
5495        if (err)
5496                MD_BUG();
5497abort:
5498        return err;
5499}
5500
5501static int md_open(struct block_device *bdev, fmode_t mode)
5502{
5503        /*
5504         * Succeed if we can lock the mddev, which confirms that
5505         * it isn't being stopped right now.
5506         */
5507        mddev_t *mddev = mddev_find(bdev->bd_dev);
5508        int err;
5509
5510        if (mddev->gendisk != bdev->bd_disk) {
5511                /* we are racing with mddev_put which is discarding this
5512                 * bd_disk.
5513                 */
5514                mddev_put(mddev);
5515                /* Wait until bdev->bd_disk is definitely gone */
5516                flush_scheduled_work();
5517                /* Then retry the open from the top */
5518                return -ERESTARTSYS;
5519        }
5520        BUG_ON(mddev != bdev->bd_disk->private_data);
5521
5522        if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
5523                goto out;
5524
5525        err = 0;
5526        atomic_inc(&mddev->openers);
5527        mutex_unlock(&mddev->open_mutex);
5528
5529        check_disk_change(bdev);
5530 out:
5531        return err;
5532}
5533
5534static int md_release(struct gendisk *disk, fmode_t mode)
5535{
5536        mddev_t *mddev = disk->private_data;
5537
5538        BUG_ON(!mddev);
5539        atomic_dec(&mddev->openers);
5540        mddev_put(mddev);
5541
5542        return 0;
5543}
5544
5545static int md_media_changed(struct gendisk *disk)
5546{
5547        mddev_t *mddev = disk->private_data;
5548
5549        return mddev->changed;
5550}
5551
5552static int md_revalidate(struct gendisk *disk)
5553{
5554        mddev_t *mddev = disk->private_data;
5555
5556        mddev->changed = 0;
5557        return 0;
5558}
5559static struct block_device_operations md_fops =
5560{
5561        .owner          = THIS_MODULE,
5562        .open           = md_open,
5563        .release        = md_release,
5564        .ioctl          = md_ioctl,
5565        .getgeo         = md_getgeo,
5566        .media_changed  = md_media_changed,
5567        .revalidate_disk= md_revalidate,
5568};
5569
5570static int md_thread(void * arg)
5571{
5572        mdk_thread_t *thread = arg;
5573
5574        /*
5575         * md_thread is a 'system-thread', it's priority should be very
5576         * high. We avoid resource deadlocks individually in each
5577         * raid personality. (RAID5 does preallocation) We also use RR and
5578         * the very same RT priority as kswapd, thus we will never get
5579         * into a priority inversion deadlock.
5580         *
5581         * we definitely have to have equal or higher priority than
5582         * bdflush, otherwise bdflush will deadlock if there are too
5583         * many dirty RAID5 blocks.
5584         */
5585
5586        allow_signal(SIGKILL);
5587        while (!kthread_should_stop()) {
5588
5589                /* We need to wait INTERRUPTIBLE so that
5590                 * we don't add to the load-average.
5591                 * That means we need to be sure no signals are
5592                 * pending
5593                 */
5594                if (signal_pending(current))
5595                        flush_signals(current);
5596
5597                wait_event_interruptible_timeout
5598                        (thread->wqueue,
5599                         test_bit(THREAD_WAKEUP, &thread->flags)
5600                         || kthread_should_stop(),
5601                         thread->timeout);
5602
5603                clear_bit(THREAD_WAKEUP, &thread->flags);
5604
5605                thread->run(thread->mddev);
5606        }
5607
5608        return 0;
5609}
5610
5611void md_wakeup_thread(mdk_thread_t *thread)
5612{
5613        if (thread) {
5614                dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
5615                set_bit(THREAD_WAKEUP, &thread->flags);
5616                wake_up(&thread->wqueue);
5617        }
5618}
5619
5620mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
5621                                 const char *name)
5622{
5623        mdk_thread_t *thread;
5624
5625        thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
5626        if (!thread)
5627                return NULL;
5628
5629        init_waitqueue_head(&thread->wqueue);
5630
5631        thread->run = run;
5632        thread->mddev = mddev;
5633        thread->timeout = MAX_SCHEDULE_TIMEOUT;
5634        thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
5635        if (IS_ERR(thread->tsk)) {
5636                kfree(thread);
5637                return NULL;
5638        }
5639        return thread;
5640}
5641
5642void md_unregister_thread(mdk_thread_t *thread)
5643{
5644        if (!thread)
5645                return;
5646        dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
5647
5648        kthread_stop(thread->tsk);
5649        kfree(thread);
5650}
5651
5652void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
5653{
5654        if (!mddev) {
5655                MD_BUG();
5656                return;
5657        }
5658
5659        if (!rdev || test_bit(Faulty, &rdev->flags))
5660                return;
5661
5662        if (mddev->external)
5663                set_bit(Blocked, &rdev->flags);
5664/*
5665        dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
5666                mdname(mddev),
5667                MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
5668                __builtin_return_address(0),__builtin_return_address(1),
5669                __builtin_return_address(2),__builtin_return_address(3));
5670*/
5671        if (!mddev->pers)
5672                return;
5673        if (!mddev->pers->error_handler)
5674                return;
5675        mddev->pers->error_handler(mddev,rdev);
5676        if (mddev->degraded)
5677                set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5678        set_bit(StateChanged, &rdev->flags);
5679        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5680        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5681        md_wakeup_thread(mddev->thread);
5682        md_new_event_inintr(mddev);
5683}
5684
5685/* seq_file implementation /proc/mdstat */
5686
5687static void status_unused(struct seq_file *seq)
5688{
5689        int i = 0;
5690        mdk_rdev_t *rdev;
5691
5692        seq_printf(seq, "unused devices: ");
5693
5694        list_for_each_entry(rdev, &pending_raid_disks, same_set) {
5695                char b[BDEVNAME_SIZE];
5696                i++;
5697                seq_printf(seq, "%s ",
5698                              bdevname(rdev->bdev,b));
5699        }
5700        if (!i)
5701                seq_printf(seq, "<none>");
5702
5703        seq_printf(seq, "\n");
5704}
5705
5706
5707static void status_resync(struct seq_file *seq, mddev_t * mddev)
5708{
5709        sector_t max_sectors, resync, res;
5710        unsigned long dt, db;
5711        sector_t rt;
5712        int scale;
5713        unsigned int per_milli;
5714
5715        resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
5716
5717        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5718                max_sectors = mddev->resync_max_sectors;
5719        else
5720                max_sectors = mddev->dev_sectors;
5721
5722        /*
5723         * Should not happen.
5724         */
5725        if (!max_sectors) {
5726                MD_BUG();
5727                return;
5728        }
5729        /* Pick 'scale' such that (resync>>scale)*1000 will fit
5730         * in a sector_t, and (max_sectors>>scale) will fit in a
5731         * u32, as those are the requirements for sector_div.
5732         * Thus 'scale' must be at least 10
5733         */
5734        scale = 10;
5735        if (sizeof(sector_t) > sizeof(unsigned long)) {
5736                while ( max_sectors/2 > (1ULL<<(scale+32)))
5737                        scale++;
5738        }
5739        res = (resync>>scale)*1000;
5740        sector_div(res, (u32)((max_sectors>>scale)+1));
5741
5742        per_milli = res;
5743        {
5744                int i, x = per_milli/50, y = 20-x;
5745                seq_printf(seq, "[");
5746                for (i = 0; i < x; i++)
5747                        seq_printf(seq, "=");
5748                seq_printf(seq, ">");
5749                for (i = 0; i < y; i++)
5750                        seq_printf(seq, ".");
5751                seq_printf(seq, "] ");
5752        }
5753        seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
5754                   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
5755                    "reshape" :
5756                    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
5757                     "check" :
5758                     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
5759                      "resync" : "recovery"))),
5760                   per_milli/10, per_milli % 10,
5761                   (unsigned long long) resync/2,
5762                   (unsigned long long) max_sectors/2);
5763
5764        /*
5765         * dt: time from mark until now
5766         * db: blocks written from mark until now
5767         * rt: remaining time
5768         *
5769         * rt is a sector_t, so could be 32bit or 64bit.
5770         * So we divide before multiply in case it is 32bit and close
5771         * to the limit.
5772         * We scale the divisor (db) by 32 to avoid loosing precision
5773         * near the end of resync when the number of remaining sectors
5774         * is close to 'db'.
5775         * We then divide rt by 32 after multiplying by db to compensate.
5776         * The '+1' avoids division by zero if db is very small.
5777         */
5778        dt = ((jiffies - mddev->resync_mark) / HZ);
5779        if (!dt) dt++;
5780        db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
5781                - mddev->resync_mark_cnt;
5782
5783        rt = max_sectors - resync;    /* number of remaining sectors */
5784        sector_div(rt, db/32+1);
5785        rt *= dt;
5786        rt >>= 5;
5787
5788        seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
5789                   ((unsigned long)rt % 60)/6);
5790
5791        seq_printf(seq, " speed=%ldK/sec", db/2/dt);
5792}
5793
5794static void *md_seq_start(struct seq_file *seq, loff_t *pos)
5795{
5796        struct list_head *tmp;
5797        loff_t l = *pos;
5798        mddev_t *mddev;
5799
5800        if (l >= 0x10000)
5801                return NULL;
5802        if (!l--)
5803                /* header */
5804                return (void*)1;
5805
5806        spin_lock(&all_mddevs_lock);
5807        list_for_each(tmp,&all_mddevs)
5808                if (!l--) {
5809                        mddev = list_entry(tmp, mddev_t, all_mddevs);
5810                        mddev_get(mddev);
5811                        spin_unlock(&all_mddevs_lock);
5812                        return mddev;
5813                }
5814        spin_unlock(&all_mddevs_lock);
5815        if (!l--)
5816                return (void*)2;/* tail */
5817        return NULL;
5818}
5819
5820static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
5821{
5822        struct list_head *tmp;
5823        mddev_t *next_mddev, *mddev = v;
5824        
5825        ++*pos;
5826        if (v == (void*)2)
5827                return NULL;
5828
5829        spin_lock(&all_mddevs_lock);
5830        if (v == (void*)1)
5831                tmp = all_mddevs.next;
5832        else
5833                tmp = mddev->all_mddevs.next;
5834        if (tmp != &all_mddevs)
5835                next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
5836        else {
5837                next_mddev = (void*)2;
5838                *pos = 0x10000;
5839        }               
5840        spin_unlock(&all_mddevs_lock);
5841
5842        if (v != (void*)1)
5843                mddev_put(mddev);
5844        return next_mddev;
5845
5846}
5847
5848static void md_seq_stop(struct seq_file *seq, void *v)
5849{
5850        mddev_t *mddev = v;
5851
5852        if (mddev && v != (void*)1 && v != (void*)2)
5853                mddev_put(mddev);
5854}
5855
/*
 * Per-open state of /proc/mdstat, stored in seq_file->private.
 */
struct mdstat_info {
	/*
	 * Snapshot of md_event_count, taken at open time and again each
	 * time the header is shown; mdstat_poll() compares it against the
	 * live counter.
	 */
	int event;
};
5859
5860static int md_seq_show(struct seq_file *seq, void *v)
5861{
5862        mddev_t *mddev = v;
5863        sector_t sectors;
5864        mdk_rdev_t *rdev;
5865        struct mdstat_info *mi = seq->private;
5866        struct bitmap *bitmap;
5867
5868        if (v == (void*)1) {
5869                struct mdk_personality *pers;
5870                seq_printf(seq, "Personalities : ");
5871                spin_lock(&pers_lock);
5872                list_for_each_entry(pers, &pers_list, list)
5873                        seq_printf(seq, "[%s] ", pers->name);
5874
5875                spin_unlock(&pers_lock);
5876                seq_printf(seq, "\n");
5877                mi->event = atomic_read(&md_event_count);
5878                return 0;
5879        }
5880        if (v == (void*)2) {
5881                status_unused(seq);
5882                return 0;
5883        }
5884
5885        if (mddev_lock(mddev) < 0)
5886                return -EINTR;
5887
5888        if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
5889                seq_printf(seq, "%s : %sactive", mdname(mddev),
5890                                                mddev->pers ? "" : "in");
5891                if (mddev->pers) {
5892                        if (mddev->ro==1)
5893                                seq_printf(seq, " (read-only)");
5894                        if (mddev->ro==2)
5895                                seq_printf(seq, " (auto-read-only)");
5896                        seq_printf(seq, " %s", mddev->pers->name);
5897                }
5898
5899                sectors = 0;
5900                list_for_each_entry(rdev, &mddev->disks, same_set) {
5901                        char b[BDEVNAME_SIZE];
5902                        seq_printf(seq, " %s[%d]",
5903                                bdevname(rdev->bdev,b), rdev->desc_nr);
5904                        if (test_bit(WriteMostly, &rdev->flags))
5905                                seq_printf(seq, "(W)");
5906                        if (test_bit(Faulty, &rdev->flags)) {
5907                                seq_printf(seq, "(F)");
5908                                continue;
5909                        } else if (rdev->raid_disk < 0)
5910                                seq_printf(seq, "(S)"); /* spare */
5911                        sectors += rdev->sectors;
5912                }
5913
5914                if (!list_empty(&mddev->disks)) {
5915                        if (mddev->pers)
5916                                seq_printf(seq, "\n      %llu blocks",
5917                                           (unsigned long long)
5918                                           mddev->array_sectors / 2);
5919                        else
5920                                seq_printf(seq, "\n      %llu blocks",
5921                                           (unsigned long long)sectors / 2);
5922                }
5923                if (mddev->persistent) {
5924                        if (mddev->major_version != 0 ||
5925                            mddev->minor_version != 90) {
5926                                seq_printf(seq," super %d.%d",
5927                                           mddev->major_version,
5928                                           mddev->minor_version);
5929                        }
5930                } else if (mddev->external)
5931                        seq_printf(seq, " super external:%s",
5932                                   mddev->metadata_type);
5933                else
5934                        seq_printf(seq, " super non-persistent");
5935
5936                if (mddev->pers) {
5937                        mddev->pers->status(seq, mddev);
5938                        seq_printf(seq, "\n      ");
5939                        if (mddev->pers->sync_request) {
5940                                if (mddev->curr_resync > 2) {
5941                                        status_resync(seq, mddev);
5942                                        seq_printf(seq, "\n      ");
5943                                } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
5944                                        seq_printf(seq, "\tresync=DELAYED\n      ");
5945                                else if (mddev->recovery_cp < MaxSector)
5946                                        seq_printf(seq, "\tresync=PENDING\n      ");
5947                        }
5948                } else
5949                        seq_printf(seq, "\n       ");
5950
5951                if ((bitmap = mddev->bitmap)) {
5952                        unsigned long chunk_kb;
5953                        unsigned long flags;
5954                        spin_lock_irqsave(&bitmap->lock, flags);
5955                        chunk_kb = bitmap->chunksize >> 10;
5956                        seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
5957                                "%lu%s chunk",
5958                                bitmap->pages - bitmap->missing_pages,
5959                                bitmap->pages,
5960                                (bitmap->pages - bitmap->missing_pages)
5961                                        << (PAGE_SHIFT - 10),
5962                                chunk_kb ? chunk_kb : bitmap->chunksize,
5963                                chunk_kb ? "KB" : "B");
5964                        if (bitmap->file) {
5965                                seq_printf(seq, ", file: ");
5966                                seq_path(seq, &bitmap->file->f_path, " \t\n");
5967                        }
5968
5969                        seq_printf(seq, "\n");
5970                        spin_unlock_irqrestore(&bitmap->lock, flags);
5971                }
5972
5973                seq_printf(seq, "\n");
5974        }
5975        mddev_unlock(mddev);
5976        
5977        return 0;
5978}
5979
5980static const struct seq_operations md_seq_ops = {
5981        .start  = md_seq_start,
5982        .next   = md_seq_next,
5983        .stop   = md_seq_stop,
5984        .show   = md_seq_show,
5985};
5986
5987static int md_seq_open(struct inode *inode, struct file *file)
5988{
5989        int error;
5990        struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
5991        if (mi == NULL)
5992                return -ENOMEM;
5993
5994        error = seq_open(file, &md_seq_ops);
5995        if (error)
5996                kfree(mi);
5997        else {
5998                struct seq_file *p = file->private_data;
5999                p->private = mi;
6000                mi->event = atomic_read(&md_event_count);
6001        }
6002        return error;
6003}
6004
6005static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
6006{
6007        struct seq_file *m = filp->private_data;
6008        struct mdstat_info *mi = m->private;
6009        int mask;
6010
6011        poll_wait(filp, &md_event_waiters, wait);
6012
6013        /* always allow read */
6014        mask = POLLIN | POLLRDNORM;
6015
6016        if (mi->event != atomic_read(&md_event_count))
6017                mask |= POLLERR | POLLPRI;
6018        return mask;
6019}
6020
6021static const struct file_operations md_seq_fops = {
6022        .owner          = THIS_MODULE,
6023        .open           = md_seq_open,
6024        .read           = seq_read,
6025        .llseek         = seq_lseek,
6026        .release        = seq_release_private,
6027        .poll           = mdstat_poll,
6028};
6029
6030int register_md_personality(struct mdk_personality *p)
6031{
6032        spin_lock(&pers_lock);
6033        list_add_tail(&p->list, &pers_list);
6034        printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
6035        spin_unlock(&pers_lock);
6036        return 0;
6037}
6038
6039int unregister_md_personality(struct mdk_personality *p)
6040{
6041        printk(KERN_INFO "md: %s personality unregistered\n", p->name);
6042        spin_lock(&pers_lock);
6043        list_del_init(&p->list);
6044        spin_unlock(&pers_lock);
6045        return 0;
6046}
6047
6048static int is_mddev_idle(mddev_t *mddev, int init)
6049{
6050        mdk_rdev_t * rdev;
6051        int idle;
6052        int curr_events;
6053
6054        idle = 1;
6055        rcu_read_lock();
6056        rdev_for_each_rcu(rdev, mddev) {
6057                struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
6058                curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
6059                              (int)part_stat_read(&disk->part0, sectors[1]) -
6060                              atomic_read(&disk->sync_io);
6061                /* sync IO will cause sync_io to increase before the disk_stats
6062                 * as sync_io is counted when a request starts, and
6063                 * disk_stats is counted when it completes.
6064                 * So resync activity will cause curr_events to be smaller than
6065                 * when there was no such activity.
6066                 * non-sync IO will cause disk_stat to increase without
6067                 * increasing sync_io so curr_events will (eventually)
6068                 * be larger than it was before.  Once it becomes
6069                 * substantially larger, the test below will cause
6070                 * the array to appear non-idle, and resync will slow
6071                 * down.
6072                 * If there is a lot of outstanding resync activity when
6073                 * we set last_event to curr_events, then all that activity
6074                 * completing might cause the array to appear non-idle
6075                 * and resync will be slowed down even though there might
6076                 * not have been non-resync activity.  This will only
6077                 * happen once though.  'last_events' will soon reflect
6078                 * the state where there is little or no outstanding
6079                 * resync requests, and further resync activity will
6080                 * always make curr_events less than last_events.
6081                 *
6082                 */
6083                if (init || curr_events - rdev->last_events > 64) {
6084                        rdev->last_events = curr_events;
6085                        idle = 0;
6086                }
6087        }
6088        rcu_read_unlock();
6089        return idle;
6090}
6091
6092void md_done_sync(mddev_t *mddev, int blocks, int ok)
6093{
6094        /* another "blocks" (512byte) blocks have been synced */
6095        atomic_sub(blocks, &mddev->recovery_active);
6096        wake_up(&mddev->recovery_wait);
6097        if (!ok) {
6098                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6099                md_wakeup_thread(mddev->thread);
6100                // stop recovery, signal do_sync ....
6101        }
6102}
6103
6104
6105/* md_write_start(mddev, bi)
6106 * If we need to update some array metadata (e.g. 'active' flag
6107 * in superblock) before writing, schedule a superblock update
6108 * and wait for it to complete.
6109 */
6110void md_write_start(mddev_t *mddev, struct bio *bi)
6111{
6112        int did_change = 0;
6113        if (bio_data_dir(bi) != WRITE)
6114                return;
6115
6116        BUG_ON(mddev->ro == 1);
6117        if (mddev->ro == 2) {
6118                /* need to switch to read/write */
6119                mddev->ro = 0;
6120                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6121                md_wakeup_thread(mddev->thread);
6122                md_wakeup_thread(mddev->sync_thread);
6123                did_change = 1;
6124        }
6125        atomic_inc(&mddev->writes_pending);
6126        if (mddev->safemode == 1)
6127                mddev->safemode = 0;
6128        if (mddev->in_sync) {
6129                spin_lock_irq(&mddev->write_lock);
6130                if (mddev->in_sync) {
6131                        mddev->in_sync = 0;
6132                        set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6133                        md_wakeup_thread(mddev->thread);
6134                        did_change = 1;
6135                }
6136                spin_unlock_irq(&mddev->write_lock);
6137        }
6138        if (did_change)
6139                sysfs_notify_dirent(mddev->sysfs_state);
6140        wait_event(mddev->sb_wait,
6141                   !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
6142                   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
6143}
6144
6145void md_write_end(mddev_t *mddev)
6146{
6147        if (atomic_dec_and_test(&mddev->writes_pending)) {
6148                if (mddev->safemode == 2)
6149                        md_wakeup_thread(mddev->thread);
6150                else if (mddev->safemode_delay)
6151                        mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
6152        }
6153}
6154
6155/* md_allow_write(mddev)
6156 * Calling this ensures that the array is marked 'active' so that writes
6157 * may proceed without blocking.  It is important to call this before
6158 * attempting a GFP_KERNEL allocation while holding the mddev lock.
6159 * Must be called with mddev_lock held.
6160 *
6161 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
6162 * is dropped, so return -EAGAIN after notifying userspace.
6163 */
6164int md_allow_write(mddev_t *mddev)
6165{
6166        if (!mddev->pers)
6167                return 0;
6168        if (mddev->ro)
6169                return 0;
6170        if (!mddev->pers->sync_request)
6171                return 0;
6172
6173        spin_lock_irq(&mddev->write_lock);
6174        if (mddev->in_sync) {
6175                mddev->in_sync = 0;
6176                set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6177                if (mddev->safemode_delay &&
6178                    mddev->safemode == 0)
6179                        mddev->safemode = 1;
6180                spin_unlock_irq(&mddev->write_lock);
6181                md_update_sb(mddev, 0);
6182                sysfs_notify_dirent(mddev->sysfs_state);
6183        } else
6184                spin_unlock_irq(&mddev->write_lock);
6185
6186        if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
6187                return -EAGAIN;
6188        else
6189                return 0;
6190}
6191EXPORT_SYMBOL_GPL(md_allow_write);
6192
6193#define SYNC_MARKS      10
6194#define SYNC_MARK_STEP  (3*HZ)
6195void md_do_sync(mddev_t *mddev)
6196{
6197        mddev_t *mddev2;
6198        unsigned int currspeed = 0,
6199                 window;
6200        sector_t max_sectors,j, io_sectors;
6201        unsigned long mark[SYNC_MARKS];
6202        sector_t mark_cnt[SYNC_MARKS];
6203        int last_mark,m;
6204        struct list_head *tmp;
6205        sector_t last_check;
6206        int skipped = 0;
6207        mdk_rdev_t *rdev;
6208        char *desc;
6209
6210        /* just incase thread restarts... */
6211        if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
6212                return;
6213        if (mddev->ro) /* never try to sync a read-only array */
6214                return;
6215
6216        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6217                if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
6218                        desc = "data-check";
6219                else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6220                        desc = "requested-resync";
6221                else
6222                        desc = "resync";
6223        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6224                desc = "reshape";
6225        else
6226                desc = "recovery";
6227
6228        /* we overload curr_resync somewhat here.
6229         * 0 == not engaged in resync at all
6230         * 2 == checking that there is no conflict with another sync
6231         * 1 == like 2, but have yielded to allow conflicting resync to
6232         *              commense
6233         * other == active in resync - this many blocks
6234         *
6235         * Before starting a resync we must have set curr_resync to
6236         * 2, and then checked that every "conflicting" array has curr_resync
6237         * less than ours.  When we find one that is the same or higher
6238         * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
6239         * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
6240         * This will mean we have to start checking from the beginning again.
6241         *
6242         */
6243
6244        do {
6245                mddev->curr_resync = 2;
6246
6247        try_again:
6248                if (kthread_should_stop()) {
6249                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6250                        goto skip;
6251                }
6252                for_each_mddev(mddev2, tmp) {
6253                        if (mddev2 == mddev)
6254                                continue;
6255                        if (!mddev->parallel_resync
6256                        &&  mddev2->curr_resync
6257                        &&  match_mddev_units(mddev, mddev2)) {
6258                                DEFINE_WAIT(wq);
6259                                if (mddev < mddev2 && mddev->curr_resync == 2) {
6260                                        /* arbitrarily yield */
6261                                        mddev->curr_resync = 1;
6262                                        wake_up(&resync_wait);
6263                                }
6264                                if (mddev > mddev2 && mddev->curr_resync == 1)
6265                                        /* no need to wait here, we can wait the next
6266                                         * time 'round when curr_resync == 2
6267                                         */
6268                                        continue;
6269                                /* We need to wait 'interruptible' so as not to
6270                                 * contribute to the load average, and not to
6271                                 * be caught by 'softlockup'
6272                                 */
6273                                prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
6274                                if (!kthread_should_stop() &&
6275                                    mddev2->curr_resync >= mddev->curr_resync) {
6276                                        printk(KERN_INFO "md: delaying %s of %s"
6277                                               " until %s has finished (they"
6278                                               " share one or more physical units)\n",
6279                                               desc, mdname(mddev), mdname(mddev2));
6280                                        mddev_put(mddev2);
6281                                        if (signal_pending(current))
6282                                                flush_signals(current);
6283                                        schedule();
6284                                        finish_wait(&resync_wait, &wq);
6285                                        goto try_again;
6286                                }
6287                                finish_wait(&resync_wait, &wq);
6288                        }
6289                }
6290        } while (mddev->curr_resync < 2);
6291
6292        j = 0;
6293        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6294                /* resync follows the size requested by the personality,
6295                 * which defaults to physical size, but can be virtual size
6296                 */
6297                max_sectors = mddev->resync_max_sectors;
6298                mddev->resync_mismatches = 0;
6299                /* we don't use the checkpoint if there's a bitmap */
6300                if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6301                        j = mddev->resync_min;
6302                else if (!mddev->bitmap)
6303                        j = mddev->recovery_cp;
6304
6305        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6306                max_sectors = mddev->dev_sectors;
6307        else {
6308                /* recovery follows the physical size of devices */
6309                max_sectors = mddev->dev_sectors;
6310                j = MaxSector;
6311                list_for_each_entry(rdev, &mddev->disks, same_set)
6312                        if (rdev->raid_disk >= 0 &&
6313                            !test_bit(Faulty, &rdev->flags) &&
6314                            !test_bit(In_sync, &rdev->flags) &&
6315                            rdev->recovery_offset < j)
6316                                j = rdev->recovery_offset;
6317        }
6318
6319        printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
6320        printk(KERN_INFO "md: minimum _guaranteed_  speed:"
6321                " %d KB/sec/disk.\n", speed_min(mddev));
6322        printk(KERN_INFO "md: using maximum available idle IO bandwidth "
6323               "(but not more than %d KB/sec) for %s.\n",
6324               speed_max(mddev), desc);
6325
6326        is_mddev_idle(mddev, 1); /* this initializes IO event counters */
6327
6328        io_sectors = 0;
6329        for (m = 0; m < SYNC_MARKS; m++) {
6330                mark[m] = jiffies;
6331                mark_cnt[m] = io_sectors;
6332        }
6333        last_mark = 0;
6334        mddev->resync_mark = mark[last_mark];
6335        mddev->resync_mark_cnt = mark_cnt[last_mark];
6336
6337        /*
6338         * Tune reconstruction:
6339         */
6340        window = 32*(PAGE_SIZE/512);
6341        printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
6342                window/2,(unsigned long long) max_sectors/2);
6343
6344        atomic_set(&mddev->recovery_active, 0);
6345        last_check = 0;
6346
6347        if (j>2) {
6348                printk(KERN_INFO 
6349                       "md: resuming %s of %s from checkpoint.\n",
6350                       desc, mdname(mddev));
6351                mddev->curr_resync = j;
6352        }
6353
6354        while (j < max_sectors) {
6355                sector_t sectors;
6356
6357                skipped = 0;
6358
6359                if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
6360                    ((mddev->curr_resync > mddev->curr_resync_completed &&
6361                      (mddev->curr_resync - mddev->curr_resync_completed)
6362                      > (max_sectors >> 4)) ||
6363                     (j - mddev->curr_resync_completed)*2
6364                     >= mddev->resync_max - mddev->curr_resync_completed
6365                            )) {
6366                        /* time to update curr_resync_completed */
6367                        blk_unplug(mddev->queue);
6368                        wait_event(mddev->recovery_wait,
6369                                   atomic_read(&mddev->recovery_active) == 0);
6370                        mddev->curr_resync_completed =
6371                                mddev->curr_resync;
6372                        set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6373                        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6374                }
6375
6376                while (j >= mddev->resync_max && !kthread_should_stop()) {
6377                        /* As this condition is controlled by user-space,
6378                         * we can block indefinitely, so use '_interruptible'
6379                         * to avoid triggering warnings.
6380                         */
6381                        flush_signals(current); /* just in case */
6382                        wait_event_interruptible(mddev->recovery_wait,
6383                                                 mddev->resync_max > j
6384                                                 || kthread_should_stop());
6385                }
6386
6387                if (kthread_should_stop())
6388                        goto interrupted;
6389
6390                sectors = mddev->pers->sync_request(mddev, j, &skipped,
6391                                                  currspeed < speed_min(mddev));
6392                if (sectors == 0) {
6393                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6394                        goto out;
6395                }
6396
6397                if (!skipped) { /* actual IO requested */
6398                        io_sectors += sectors;
6399                        atomic_add(sectors, &mddev->recovery_active);
6400                }
6401
6402                j += sectors;
6403                if (j>1) mddev->curr_resync = j;
6404                mddev->curr_mark_cnt = io_sectors;
6405                if (last_check == 0)
6406                        /* this is the earliers that rebuilt will be
6407                         * visible in /proc/mdstat
6408                         */
6409                        md_new_event(mddev);
6410
6411                if (last_check + window > io_sectors || j == max_sectors)
6412                        continue;
6413
6414                last_check = io_sectors;
6415
6416                if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6417                        break;
6418
6419        repeat:
6420                if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
6421                        /* step marks */
6422                        int next = (last_mark+1) % SYNC_MARKS;
6423
6424                        mddev->resync_mark = mark[next];
6425                        mddev->resync_mark_cnt = mark_cnt[next];
6426                        mark[next] = jiffies;
6427                        mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
6428                        last_mark = next;
6429                }
6430
6431
6432                if (kthread_should_stop())
6433                        goto interrupted;
6434
6435
6436                /*
6437                 * this loop exits only if either when we are slower than
6438                 * the 'hard' speed limit, or the system was IO-idle for
6439                 * a jiffy.
6440                 * the system might be non-idle CPU-wise, but we only care
6441                 * about not overloading the IO subsystem. (things like an
6442                 * e2fsck being done on the RAID array should execute fast)
6443                 */
6444                blk_unplug(mddev->queue);
6445                cond_resched();
6446
6447                currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
6448                        /((jiffies-mddev->resync_mark)/HZ +1) +1;
6449
6450                if (currspeed > speed_min(mddev)) {
6451                        if ((currspeed > speed_max(mddev)) ||
6452                                        !is_mddev_idle(mddev, 0)) {
6453                                msleep(500);
6454                                goto repeat;
6455                        }
6456                }
6457        }
6458        printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
6459        /*
6460         * this also signals 'finished resyncing' to md_stop
6461         */
6462 out:
6463        blk_unplug(mddev->queue);
6464
6465        wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
6466
6467        /* tell personality that we are finished */
6468        mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
6469
6470        if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
6471            mddev->curr_resync > 2) {
6472                if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6473                        if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
6474                                if (mddev->curr_resync >= mddev->recovery_cp) {
6475                                        printk(KERN_INFO
6476                                               "md: checkpointing %s of %s.\n",
6477                                               desc, mdname(mddev));
6478                                        mddev->recovery_cp = mddev->curr_resync;
6479                                }
6480                        } else
6481                                mddev->recovery_cp = MaxSector;
6482                } else {
6483                        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6484                                mddev->curr_resync = MaxSector;
6485                        list_for_each_entry(rdev, &mddev->disks, same_set)
6486                                if (rdev->raid_disk >= 0 &&
6487                                    !test_bit(Faulty, &rdev->flags) &&
6488                                    !test_bit(In_sync, &rdev->flags) &&
6489                                    rdev->recovery_offset < mddev->curr_resync)
6490                                        rdev->recovery_offset = mddev->curr_resync;
6491                }
6492        }
6493        set_bit(MD_CHANGE_DEVS, &mddev->flags);
6494
6495 skip:
6496        mddev->curr_resync = 0;
6497        mddev->curr_resync_completed = 0;
6498        mddev->resync_min = 0;
6499        mddev->resync_max = MaxSector;
6500        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6501        wake_up(&resync_wait);
6502        set_bit(MD_RECOVERY_DONE, &mddev->recovery);
6503        md_wakeup_thread(mddev->thread);
6504        return;
6505
6506 interrupted:
6507        /*
6508         * got a signal, exit.
6509         */
6510        printk(KERN_INFO
6511               "md: md_do_sync() got signal ... exiting\n");
6512        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6513        goto out;
6514
6515}
6516EXPORT_SYMBOL_GPL(md_do_sync);
6517
6518
6519static int remove_and_add_spares(mddev_t *mddev)
6520{
6521        mdk_rdev_t *rdev;
6522        int spares = 0;
6523
6524        mddev->curr_resync_completed = 0;
6525
6526        list_for_each_entry(rdev, &mddev->disks, same_set)
6527                if (rdev->raid_disk >= 0 &&
6528                    !test_bit(Blocked, &rdev->flags) &&
6529                    (test_bit(Faulty, &rdev->flags) ||
6530                     ! test_bit(In_sync, &rdev->flags)) &&
6531                    atomic_read(&rdev->nr_pending)==0) {
6532                        if (mddev->pers->hot_remove_disk(
6533                                    mddev, rdev->raid_disk)==0) {
6534                                char nm[20];
6535                                sprintf(nm,"rd%d", rdev->raid_disk);
6536                                sysfs_remove_link(&mddev->kobj, nm);
6537                                rdev->raid_disk = -1;
6538                        }
6539                }
6540
6541        if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) {
6542                list_for_each_entry(rdev, &mddev->disks, same_set) {
6543                        if (rdev->raid_disk >= 0 &&
6544                            !test_bit(In_sync, &rdev->flags) &&
6545                            !test_bit(Blocked, &rdev->flags))
6546                                spares++;
6547                        if (rdev->raid_disk < 0
6548                            && !test_bit(Faulty, &rdev->flags)) {
6549                                rdev->recovery_offset = 0;
6550                                if (mddev->pers->
6551                                    hot_add_disk(mddev, rdev) == 0) {
6552                                        char nm[20];
6553                                        sprintf(nm, "rd%d", rdev->raid_disk);
6554                                        if (sysfs_create_link(&mddev->kobj,
6555                                                              &rdev->kobj, nm))
6556                                                printk(KERN_WARNING
6557                                                       "md: cannot register "
6558                                                       "%s for %s\n",
6559                                                       nm, mdname(mddev));
6560                                        spares++;
6561                                        md_new_event(mddev);
6562                                } else
6563                                        break;
6564                        }
6565                }
6566        }
6567        return spares;
6568}
6569/*
6570 * This routine is regularly called by all per-raid-array threads to
6571 * deal with generic issues like resync and super-block update.
6572 * Raid personalities that don't have a thread (linear/raid0) do not
6573 * need this as they never do any recovery or update the superblock.
6574 *
6575 * It does not do any resync itself, but rather "forks" off other threads
6576 * to do that as needed.
6577 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
6578 * "->recovery" and create a thread at ->sync_thread.
6579 * When the thread finishes it sets MD_RECOVERY_DONE
6580 * and wakeups up this thread which will reap the thread and finish up.
6581 * This thread also removes any faulty devices (with nr_pending == 0).
6582 *
6583 * The overall approach is:
6584 *  1/ if the superblock needs updating, update it.
6585 *  2/ If a recovery thread is running, don't do anything else.
6586 *  3/ If recovery has finished, clean up, possibly marking spares active.
6587 *  4/ If there are any faulty devices, remove them.
6588 *  5/ If array is degraded, try to add spares devices
6589 *  6/ If array has spares or is not in-sync, start a resync thread.
6590 */
6591void md_check_recovery(mddev_t *mddev)
6592{
6593        mdk_rdev_t *rdev;
6594
6595
6596        if (mddev->bitmap)
6597                bitmap_daemon_work(mddev->bitmap);
6598
6599        if (mddev->ro)
6600                return;
6601
6602        if (signal_pending(current)) {
6603                if (mddev->pers->sync_request && !mddev->external) {
6604                        printk(KERN_INFO "md: %s in immediate safe mode\n",
6605                               mdname(mddev));
6606                        mddev->safemode = 2;
6607                }
6608                flush_signals(current);
6609        }
6610
6611        if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
6612                return;
6613        if ( ! (
6614                (mddev->flags && !mddev->external) ||
6615                test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
6616                test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
6617                (mddev->external == 0 && mddev->safemode == 1) ||
6618                (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
6619                 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
6620                ))
6621                return;
6622
6623        if (mddev_trylock(mddev)) {
6624                int spares = 0;
6625
6626                if (mddev->ro) {
6627                        /* Only thing we do on a ro array is remove
6628                         * failed devices.
6629                         */
6630                        remove_and_add_spares(mddev);
6631                        clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6632                        goto unlock;
6633                }
6634
6635                if (!mddev->external) {
6636                        int did_change = 0;
6637                        spin_lock_irq(&mddev->write_lock);
6638                        if (mddev->safemode &&
6639                            !atomic_read(&mddev->writes_pending) &&
6640                            !mddev->in_sync &&
6641                            mddev->recovery_cp == MaxSector) {
6642                                mddev->in_sync = 1;
6643                                did_change = 1;
6644                                if (mddev->persistent)
6645                                        set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6646                        }
6647                        if (mddev->safemode == 1)
6648                                mddev->safemode = 0;
6649                        spin_unlock_irq(&mddev->write_lock);
6650                        if (did_change)
6651                                sysfs_notify_dirent(mddev->sysfs_state);
6652                }
6653
6654                if (mddev->flags)
6655                        md_update_sb(mddev, 0);
6656
6657                list_for_each_entry(rdev, &mddev->disks, same_set)
6658                        if (test_and_clear_bit(StateChanged, &rdev->flags))
6659                                sysfs_notify_dirent(rdev->sysfs_state);
6660
6661
6662                if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
6663                    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
6664                        /* resync/recovery still happening */
6665                        clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6666                        goto unlock;
6667                }
6668                if (mddev->sync_thread) {
6669                        /* resync has finished, collect result */
6670                        md_unregister_thread(mddev->sync_thread);
6671                        mddev->sync_thread = NULL;
6672                        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
6673                            !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
6674                                /* success...*/
6675                                /* activate any spares */
6676                                if (mddev->pers->spare_active(mddev))
6677                                        sysfs_notify(&mddev->kobj, NULL,
6678                                                     "degraded");
6679                        }
6680                        if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
6681                            mddev->pers->finish_reshape)
6682                                mddev->pers->finish_reshape(mddev);
6683                        md_update_sb(mddev, 1);
6684
6685                        /* if array is no-longer degraded, then any saved_raid_disk
6686                         * information must be scrapped
6687                         */
6688                        if (!mddev->degraded)
6689                                list_for_each_entry(rdev, &mddev->disks, same_set)
6690                                        rdev->saved_raid_disk = -1;
6691
6692                        mddev->recovery = 0;
6693                        /* flag recovery needed just to double check */
6694                        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6695                        sysfs_notify_dirent(mddev->sysfs_action);
6696                        md_new_event(mddev);
6697                        goto unlock;
6698                }
6699                /* Set RUNNING before clearing NEEDED to avoid
6700                 * any transients in the value of "sync_action".
6701                 */
6702                set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6703                clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6704                /* Clear some bits that don't mean anything, but
6705                 * might be left set
6706                 */
6707                clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
6708                clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
6709
6710                if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
6711                        goto unlock;
6712                /* no recovery is running.
6713                 * remove any failed drives, then
6714                 * add spares if possible.
6715                 * Spare are also removed and re-added, to allow
6716                 * the personality to fail the re-add.
6717                 */
6718
6719                if (mddev->reshape_position != MaxSector) {
6720                        if (mddev->pers->check_reshape == NULL ||
6721                            mddev->pers->check_reshape(mddev) != 0)
6722                                /* Cannot proceed */
6723                                goto unlock;
6724                        set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
6725                        clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6726                } else if ((spares = remove_and_add_spares(mddev))) {
6727                        clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6728                        clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
6729                        clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
6730                        set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6731                } else if (mddev->recovery_cp < MaxSector) {
6732                        set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6733                        clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6734                } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
6735                        /* nothing to be done ... */
6736                        goto unlock;
6737
6738                if (mddev->pers->sync_request) {
6739                        if (spares && mddev->bitmap && ! mddev->bitmap->file) {
6740                                /* We are adding a device or devices to an array
6741                                 * which has the bitmap stored on all devices.
6742                                 * So make sure all bitmap pages get written
6743                                 */
6744                                bitmap_write_all(mddev->bitmap);
6745                        }
6746                        mddev->sync_thread = md_register_thread(md_do_sync,
6747                                                                mddev,
6748                                                                "%s_resync");
6749                        if (!mddev->sync_thread) {
6750                                printk(KERN_ERR "%s: could not start resync"
6751                                        " thread...\n", 
6752                                        mdname(mddev));
6753                                /* leave the spares where they are, it shouldn't hurt */
6754                                mddev->recovery = 0;
6755                        } else
6756                                md_wakeup_thread(mddev->sync_thread);
6757                        sysfs_notify_dirent(mddev->sysfs_action);
6758                        md_new_event(mddev);
6759                }
6760        unlock:
6761                if (!mddev->sync_thread) {
6762                        clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6763                        if (test_and_clear_bit(MD_RECOVERY_RECOVER,
6764                                               &mddev->recovery))
6765                                if (mddev->sysfs_action)
6766                                        sysfs_notify_dirent(mddev->sysfs_action);
6767                }
6768                mddev_unlock(mddev);
6769        }
6770}
6771
6772void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
6773{
6774        sysfs_notify_dirent(rdev->sysfs_state);
6775        wait_event_timeout(rdev->blocked_wait,
6776                           !test_bit(Blocked, &rdev->flags),
6777                           msecs_to_jiffies(5000));
6778        rdev_dec_pending(rdev, mddev);
6779}
6780EXPORT_SYMBOL(md_wait_for_blocked_rdev);
6781
6782static int md_notify_reboot(struct notifier_block *this,
6783                            unsigned long code, void *x)
6784{
6785        struct list_head *tmp;
6786        mddev_t *mddev;
6787
6788        if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
6789
6790                printk(KERN_INFO "md: stopping all md devices.\n");
6791
6792                for_each_mddev(mddev, tmp)
6793                        if (mddev_trylock(mddev)) {
6794                                /* Force a switch to readonly even array
6795                                 * appears to still be in use.  Hence
6796                                 * the '100'.
6797                                 */
6798                                do_md_stop(mddev, 1, 100);
6799                                mddev_unlock(mddev);
6800                        }
6801                /*
6802                 * certain more exotic SCSI devices are known to be
6803                 * volatile wrt too early system reboots. While the
6804                 * right place to handle this issue is the given
6805                 * driver, we do want to have a safe RAID driver ...
6806                 */
6807                mdelay(1000*1);
6808        }
6809        return NOTIFY_DONE;
6810}
6811
6812static struct notifier_block md_notifier = {
6813        .notifier_call  = md_notify_reboot,
6814        .next           = NULL,
6815        .priority       = INT_MAX, /* before any real devices */
6816};
6817
6818static void md_geninit(void)
6819{
6820        dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
6821
6822        proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
6823}
6824
6825static int __init md_init(void)
6826{
6827        if (register_blkdev(MD_MAJOR, "md"))
6828                return -1;
6829        if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
6830                unregister_blkdev(MD_MAJOR, "md");
6831                return -1;
6832        }
6833        blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
6834                            md_probe, NULL, NULL);
6835        blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
6836                            md_probe, NULL, NULL);
6837
6838        register_reboot_notifier(&md_notifier);
6839        raid_table_header = register_sysctl_table(raid_root_table);
6840
6841        md_geninit();
6842        return 0;
6843}
6844
6845
6846#ifndef MODULE
6847
6848/*
6849 * Searches all registered partitions for autorun RAID arrays
6850 * at boot time.
6851 */
6852
6853static LIST_HEAD(all_detected_devices);
6854struct detected_devices_node {
6855        struct list_head list;
6856        dev_t dev;
6857};
6858
6859void md_autodetect_dev(dev_t dev)
6860{
6861        struct detected_devices_node *node_detected_dev;
6862
6863        node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
6864        if (node_detected_dev) {
6865                node_detected_dev->dev = dev;
6866                list_add_tail(&node_detected_dev->list, &all_detected_devices);
6867        } else {
6868                printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
6869                        ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
6870        }
6871}
6872
6873
6874static void autostart_arrays(int part)
6875{
6876        mdk_rdev_t *rdev;
6877        struct detected_devices_node *node_detected_dev;
6878        dev_t dev;
6879        int i_scanned, i_passed;
6880
6881        i_scanned = 0;
6882        i_passed = 0;
6883
6884        printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
6885
6886        while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
6887                i_scanned++;
6888                node_detected_dev = list_entry(all_detected_devices.next,
6889                                        struct detected_devices_node, list);
6890                list_del(&node_detected_dev->list);
6891                dev = node_detected_dev->dev;
6892                kfree(node_detected_dev);
6893                rdev = md_import_device(dev,0, 90);
6894                if (IS_ERR(rdev))
6895                        continue;
6896
6897                if (test_bit(Faulty, &rdev->flags)) {
6898                        MD_BUG();
6899                        continue;
6900                }
6901                set_bit(AutoDetected, &rdev->flags);
6902                list_add(&rdev->same_set, &pending_raid_disks);
6903                i_passed++;
6904        }
6905
6906        printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
6907                                                i_scanned, i_passed);
6908
6909        autorun_devices(part);
6910}
6911
6912#endif /* !MODULE */
6913
6914static __exit void md_exit(void)
6915{
6916        mddev_t *mddev;
6917        struct list_head *tmp;
6918
6919        blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
6920        blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
6921
6922        unregister_blkdev(MD_MAJOR,"md");
6923        unregister_blkdev(mdp_major, "mdp");
6924        unregister_reboot_notifier(&md_notifier);
6925        unregister_sysctl_table(raid_table_header);
6926        remove_proc_entry("mdstat", NULL);
6927        for_each_mddev(mddev, tmp) {
6928                export_array(mddev);
6929                mddev->hold_active = 0;
6930        }
6931}
6932
6933subsys_initcall(md_init);
6934module_exit(md_exit)
6935
6936static int get_ro(char *buffer, struct kernel_param *kp)
6937{
6938        return sprintf(buffer, "%d", start_readonly);
6939}
6940static int set_ro(const char *val, struct kernel_param *kp)
6941{
6942        char *e;
6943        int num = simple_strtoul(val, &e, 10);
6944        if (*val && (*e == '\0' || *e == '\n')) {
6945                start_readonly = num;
6946                return 0;
6947        }
6948        return -EINVAL;
6949}
6950
6951module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
6952module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
6953
6954module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
6955
6956EXPORT_SYMBOL(register_md_personality);
6957EXPORT_SYMBOL(unregister_md_personality);
6958EXPORT_SYMBOL(md_error);
6959EXPORT_SYMBOL(md_done_sync);
6960EXPORT_SYMBOL(md_write_start);
6961EXPORT_SYMBOL(md_write_end);
6962EXPORT_SYMBOL(md_register_thread);
6963EXPORT_SYMBOL(md_unregister_thread);
6964EXPORT_SYMBOL(md_wakeup_thread);
6965EXPORT_SYMBOL(md_check_recovery);
6966MODULE_LICENSE("GPL");
6967MODULE_ALIAS("md");
6968MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
6969
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.