linux-old/drivers/md/md.c
<<
>>
Prefs
   1/*
   2   md.c : Multiple Devices driver for Linux
   3          Copyright (C) 1998, 1999, 2000 Ingo Molnar
   4
   5     completely rewritten, based on the MD driver code from Marc Zyngier
   6
   7   Changes:
   8
   9   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
  10   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
  11   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
  12   - kmod support by: Cyrus Durgin
  13   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
  14   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
  15
  16   - lots of fixes and improvements to the RAID1/RAID5 and generic
  17     RAID code (such as request based resynchronization):
  18
  19     Neil Brown <neilb@cse.unsw.edu.au>.
  20
  21   This program is free software; you can redistribute it and/or modify
  22   it under the terms of the GNU General Public License as published by
  23   the Free Software Foundation; either version 2, or (at your option)
  24   any later version.
  25
  26   You should have received a copy of the GNU General Public License
  27   (for example /usr/src/linux/COPYING); if not, write to the Free
  28   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  29*/
  30
  31#include <linux/module.h>
  32#include <linux/config.h>
  33#include <linux/raid/md.h>
  34#include <linux/sysctl.h>
  35#include <linux/raid/xor.h>
  36#include <linux/devfs_fs_kernel.h>
  37
  38#include <linux/init.h>
  39
  40#ifdef CONFIG_KMOD
  41#include <linux/kmod.h>
  42#endif
  43
  44#define __KERNEL_SYSCALLS__
  45#include <linux/unistd.h>
  46
  47#include <asm/unaligned.h>
  48
  49#define MAJOR_NR MD_MAJOR
  50#define MD_DRIVER
  51
  52#include <linux/blk.h>
  53
  54#define DEBUG 0
  55#if DEBUG
  56# define dprintk(x...) printk(x)
  57#else
  58# define dprintk(x...) do { } while(0)
  59#endif
  60
  61#ifndef MODULE
  62static void autostart_arrays (void);
  63#endif
  64
  65static mdk_personality_t *pers[MAX_PERSONALITY];
  66
  67/*
  68 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
  69 * is 100 KB/sec, so the extra system load does not show up that much.
  70 * Increase it if you want to have more _guaranteed_ speed. Note that
  71 * the RAID driver will use the maximum available bandwith if the IO
  72 * subsystem is idle. There is also an 'absolute maximum' reconstruction
  73 * speed limit - in case reconstruction slows down your system despite
  74 * idle IO detection.
  75 *
  76 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
  77 */
  78
  79static int sysctl_speed_limit_min = 100;
  80static int sysctl_speed_limit_max = 100000;
  81
  82static struct ctl_table_header *raid_table_header;
  83
  84static ctl_table raid_table[] = {
  85        {DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min",
  86         &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
  87        {DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max",
  88         &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
  89        {0}
  90};
  91
  92static ctl_table raid_dir_table[] = {
  93        {DEV_RAID, "raid", NULL, 0, 0555, raid_table},
  94        {0}
  95};
  96
  97static ctl_table raid_root_table[] = {
  98        {CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table},
  99        {0}
 100};
 101
 102/*
 103 * these have to be allocated separately because external
 104 * subsystems want to have a pre-defined structure
 105 */
 106struct hd_struct md_hd_struct[MAX_MD_DEVS];
 107static int md_blocksizes[MAX_MD_DEVS];
 108static int md_hardsect_sizes[MAX_MD_DEVS];
 109static int md_maxreadahead[MAX_MD_DEVS];
 110static mdk_thread_t *md_recovery_thread;
 111
 112int md_size[MAX_MD_DEVS];
 113
 114static struct block_device_operations md_fops;
 115static devfs_handle_t devfs_handle;
 116
 117static struct gendisk md_gendisk=
 118{
 119        major: MD_MAJOR,
 120        major_name: "md",
 121        minor_shift: 0,
 122        max_p: 1,
 123        part: md_hd_struct,
 124        sizes: md_size,
 125        nr_real: MAX_MD_DEVS,
 126        real_devices: NULL,
 127        next: NULL,
 128        fops: &md_fops,
 129};
 130
 131/*
 132 * Enables to iterate over all existing md arrays
 133 */
 134static MD_LIST_HEAD(all_mddevs);
 135
 136/*
 137 * The mapping between kdev and mddev is not necessary a simple
 138 * one! Eg. HSM uses several sub-devices to implement Logical
 139 * Volumes. All these sub-devices map to the same mddev.
 140 */
 141dev_mapping_t mddev_map[MAX_MD_DEVS];
 142
 143void add_mddev_mapping(mddev_t * mddev, kdev_t dev, void *data)
 144{
 145        unsigned int minor = MINOR(dev);
 146
 147        if (MAJOR(dev) != MD_MAJOR) {
 148                MD_BUG();
 149                return;
 150        }
 151        if (mddev_map[minor].mddev) {
 152                MD_BUG();
 153                return;
 154        }
 155        mddev_map[minor].mddev = mddev;
 156        mddev_map[minor].data = data;
 157}
 158
 159void del_mddev_mapping(mddev_t * mddev, kdev_t dev)
 160{
 161        unsigned int minor = MINOR(dev);
 162
 163        if (MAJOR(dev) != MD_MAJOR) {
 164                MD_BUG();
 165                return;
 166        }
 167        if (mddev_map[minor].mddev != mddev) {
 168                MD_BUG();
 169                return;
 170        }
 171        mddev_map[minor].mddev = NULL;
 172        mddev_map[minor].data = NULL;
 173}
 174
 175static int md_make_request(request_queue_t *q, int rw, struct buffer_head * bh)
 176{
 177        mddev_t *mddev = kdev_to_mddev(bh->b_rdev);
 178
 179        if (mddev && mddev->pers)
 180                return mddev->pers->make_request(mddev, rw, bh);
 181        else {
 182                buffer_IO_error(bh);
 183                return 0;
 184        }
 185}
 186
 187static mddev_t * alloc_mddev(kdev_t dev)
 188{
 189        mddev_t *mddev;
 190
 191        if (MAJOR(dev) != MD_MAJOR) {
 192                MD_BUG();
 193                return 0;
 194        }
 195        mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
 196        if (!mddev)
 197                return NULL;
 198
 199        memset(mddev, 0, sizeof(*mddev));
 200
 201        mddev->__minor = MINOR(dev);
 202        init_MUTEX(&mddev->reconfig_sem);
 203        init_MUTEX(&mddev->recovery_sem);
 204        init_MUTEX(&mddev->resync_sem);
 205        MD_INIT_LIST_HEAD(&mddev->disks);
 206        MD_INIT_LIST_HEAD(&mddev->all_mddevs);
 207        atomic_set(&mddev->active, 0);
 208
 209        /*
 210         * The 'base' mddev is the one with data NULL.
 211         * personalities can create additional mddevs
 212         * if necessary.
 213         */
 214        add_mddev_mapping(mddev, dev, 0);
 215        md_list_add(&mddev->all_mddevs, &all_mddevs);
 216
 217        MOD_INC_USE_COUNT;
 218
 219        return mddev;
 220}
 221
 222mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
 223{
 224        mdk_rdev_t * rdev;
 225        struct md_list_head *tmp;
 226
 227        ITERATE_RDEV(mddev,rdev,tmp) {
 228                if (rdev->desc_nr == nr)
 229                        return rdev;
 230        }
 231        return NULL;
 232}
 233
 234mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
 235{
 236        struct md_list_head *tmp;
 237        mdk_rdev_t *rdev;
 238
 239        ITERATE_RDEV(mddev,rdev,tmp) {
 240                if (rdev->dev == dev)
 241                        return rdev;
 242        }
 243        return NULL;
 244}
 245
 246static MD_LIST_HEAD(device_names);
 247
 248char * partition_name(kdev_t dev)
 249{
 250        struct gendisk *hd;
 251        static char nomem [] = "<nomem>";
 252        dev_name_t *dname;
 253        struct md_list_head *tmp = device_names.next;
 254
 255        while (tmp != &device_names) {
 256                dname = md_list_entry(tmp, dev_name_t, list);
 257                if (dname->dev == dev)
 258                        return dname->name;
 259                tmp = tmp->next;
 260        }
 261
 262        dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
 263
 264        if (!dname)
 265                return nomem;
 266        /*
 267         * ok, add this new device name to the list
 268         */
 269        hd = get_gendisk (dev);
 270        dname->name = NULL;
 271        if (hd)
 272                dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
 273        if (!dname->name) {
 274                sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
 275                dname->name = dname->namebuf;
 276        }
 277
 278        dname->dev = dev;
 279        MD_INIT_LIST_HEAD(&dname->list);
 280        md_list_add(&dname->list, &device_names);
 281
 282        return dname->name;
 283}
 284
 285static unsigned int calc_dev_sboffset(kdev_t dev, mddev_t *mddev,
 286                                                int persistent)
 287{
 288        unsigned int size = 0;
 289
 290        if (blk_size[MAJOR(dev)])
 291                size = blk_size[MAJOR(dev)][MINOR(dev)];
 292        if (persistent)
 293                size = MD_NEW_SIZE_BLOCKS(size);
 294        return size;
 295}
 296
 297static unsigned int calc_dev_size(kdev_t dev, mddev_t *mddev, int persistent)
 298{
 299        unsigned int size;
 300
 301        size = calc_dev_sboffset(dev, mddev, persistent);
 302        if (!mddev->sb) {
 303                MD_BUG();
 304                return size;
 305        }
 306        if (mddev->sb->chunk_size)
 307                size &= ~(mddev->sb->chunk_size/1024 - 1);
 308        return size;
 309}
 310
 311static unsigned int zoned_raid_size(mddev_t *mddev)
 312{
 313        unsigned int mask;
 314        mdk_rdev_t * rdev;
 315        struct md_list_head *tmp;
 316
 317        if (!mddev->sb) {
 318                MD_BUG();
 319                return -EINVAL;
 320        }
 321        /*
 322         * do size and offset calculations.
 323         */
 324        mask = ~(mddev->sb->chunk_size/1024 - 1);
 325
 326        ITERATE_RDEV(mddev,rdev,tmp) {
 327                rdev->size &= mask;
 328                md_size[mdidx(mddev)] += rdev->size;
 329        }
 330        return 0;
 331}
 332
 333/*
 334 * We check wether all devices are numbered from 0 to nb_dev-1. The
 335 * order is guaranteed even after device name changes.
 336 *
 337 * Some personalities (raid0, linear) use this. Personalities that
 338 * provide data have to be able to deal with loss of individual
 339 * disks, so they do their checking themselves.
 340 */
 341int md_check_ordering(mddev_t *mddev)
 342{
 343        int i, c;
 344        mdk_rdev_t *rdev;
 345        struct md_list_head *tmp;
 346
 347        /*
 348         * First, all devices must be fully functional
 349         */
 350        ITERATE_RDEV(mddev,rdev,tmp) {
 351                if (rdev->faulty) {
 352                        printk(KERN_ERR "md: md%d's device %s faulty, aborting.\n",
 353                               mdidx(mddev), partition_name(rdev->dev));
 354                        goto abort;
 355                }
 356        }
 357
 358        c = 0;
 359        ITERATE_RDEV(mddev,rdev,tmp) {
 360                c++;
 361        }
 362        if (c != mddev->nb_dev) {
 363                MD_BUG();
 364                goto abort;
 365        }
 366        if (mddev->nb_dev != mddev->sb->raid_disks) {
 367                printk(KERN_ERR "md: md%d, array needs %d disks, has %d, aborting.\n",
 368                        mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
 369                goto abort;
 370        }
 371        /*
 372         * Now the numbering check
 373         */
 374        for (i = 0; i < mddev->nb_dev; i++) {
 375                c = 0;
 376                ITERATE_RDEV(mddev,rdev,tmp) {
 377                        if (rdev->desc_nr == i)
 378                                c++;
 379                }
 380                if (!c) {
 381                        printk(KERN_ERR "md: md%d, missing disk #%d, aborting.\n",
 382                               mdidx(mddev), i);
 383                        goto abort;
 384                }
 385                if (c > 1) {
 386                        printk(KERN_ERR "md: md%d, too many disks #%d, aborting.\n",
 387                               mdidx(mddev), i);
 388                        goto abort;
 389                }
 390        }
 391        return 0;
 392abort:
 393        return 1;
 394}
 395
 396static void remove_descriptor(mdp_disk_t *disk, mdp_super_t *sb)
 397{
 398        if (disk_active(disk)) {
 399                sb->working_disks--;
 400        } else {
 401                if (disk_spare(disk)) {
 402                        sb->spare_disks--;
 403                        sb->working_disks--;
 404                } else  {
 405                        sb->failed_disks--;
 406                }
 407        }
 408        sb->nr_disks--;
 409        disk->major = 0;
 410        disk->minor = 0;
 411        mark_disk_removed(disk);
 412}
 413
 414#define BAD_MAGIC KERN_ERR \
 415"md: invalid raid superblock magic on %s\n"
 416
 417#define BAD_MINOR KERN_ERR \
 418"md: %s: invalid raid minor (%x)\n"
 419
 420#define OUT_OF_MEM KERN_ALERT \
 421"md: out of memory.\n"
 422
 423#define NO_SB KERN_ERR \
 424"md: disabled device %s, could not read superblock.\n"
 425
 426#define BAD_CSUM KERN_WARNING \
 427"md: invalid superblock checksum on %s\n"
 428
 429static int alloc_array_sb(mddev_t * mddev)
 430{
 431        if (mddev->sb) {
 432                MD_BUG();
 433                return 0;
 434        }
 435
 436        mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
 437        if (!mddev->sb)
 438                return -ENOMEM;
 439        md_clear_page(mddev->sb);
 440        return 0;
 441}
 442
 443static int alloc_disk_sb(mdk_rdev_t * rdev)
 444{
 445        if (rdev->sb)
 446                MD_BUG();
 447
 448        rdev->sb_page = alloc_page(GFP_KERNEL);
 449        if (!rdev->sb_page) {
 450                printk(OUT_OF_MEM);
 451                return -EINVAL;
 452        }
 453        rdev->sb = (mdp_super_t *) page_address(rdev->sb_page);
 454
 455        return 0;
 456}
 457
 458static void free_disk_sb(mdk_rdev_t * rdev)
 459{
 460        if (rdev->sb_page) {
 461                page_cache_release(rdev->sb_page);
 462                rdev->sb = NULL;
 463                rdev->sb_page = NULL;
 464                rdev->sb_offset = 0;
 465                rdev->size = 0;
 466        } else {
 467                if (!rdev->faulty)
 468                        MD_BUG();
 469        }
 470}
 471
 472
 473static void bh_complete(struct buffer_head *bh, int uptodate)
 474{
 475
 476        if (uptodate)
 477                set_bit(BH_Uptodate, &bh->b_state);
 478
 479        complete((struct completion*)bh->b_private);
 480}
 481
 482static int sync_page_io(kdev_t dev, unsigned long sector, int size,
 483                        struct page *page, int rw)
 484{
 485        struct buffer_head bh;
 486        struct completion event;
 487
 488        init_completion(&event);
 489        init_buffer(&bh, bh_complete, &event);
 490        bh.b_rdev = dev;
 491        bh.b_rsector = sector;
 492        bh.b_state      = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock);
 493        bh.b_size = size;
 494        bh.b_page = page;
 495        bh.b_reqnext = NULL;
 496        bh.b_data = page_address(page);
 497        generic_make_request(rw, &bh);
 498
 499        run_task_queue(&tq_disk);
 500        wait_for_completion(&event);
 501
 502        return test_bit(BH_Uptodate, &bh.b_state);
 503}
 504
 505static int read_disk_sb(mdk_rdev_t * rdev)
 506{
 507        int ret = -EINVAL;
 508        kdev_t dev = rdev->dev;
 509        unsigned long sb_offset;
 510
 511        if (!rdev->sb) {
 512                MD_BUG();
 513                goto abort;
 514        }
 515
 516        /*
 517         * Calculate the position of the superblock,
 518         * it's at the end of the disk
 519         */
 520        sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
 521        rdev->sb_offset = sb_offset;
 522
 523        if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) {
 524                printk(NO_SB,partition_name(dev));
 525                return -EINVAL;
 526        }
 527        printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo);
 528        ret = 0;
 529abort:
 530        return ret;
 531}
 532
 533static unsigned int calc_sb_csum(mdp_super_t * sb)
 534{
 535        unsigned int disk_csum, csum;
 536
 537        disk_csum = sb->sb_csum;
 538        sb->sb_csum = 0;
 539        csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
 540        sb->sb_csum = disk_csum;
 541        return csum;
 542}
 543
 544/*
 545 * Check one RAID superblock for generic plausibility
 546 */
 547
 548static int check_disk_sb(mdk_rdev_t * rdev)
 549{
 550        mdp_super_t *sb;
 551        int ret = -EINVAL;
 552
 553        sb = rdev->sb;
 554        if (!sb) {
 555                MD_BUG();
 556                goto abort;
 557        }
 558
 559        if (sb->md_magic != MD_SB_MAGIC) {
 560                printk(BAD_MAGIC, partition_name(rdev->dev));
 561                goto abort;
 562        }
 563
 564        if (sb->md_minor >= MAX_MD_DEVS) {
 565                printk(BAD_MINOR, partition_name(rdev->dev), sb->md_minor);
 566                goto abort;
 567        }
 568
 569        if (calc_sb_csum(sb) != sb->sb_csum) {
 570                printk(BAD_CSUM, partition_name(rdev->dev));
 571                goto abort;
 572        }
 573        ret = 0;
 574abort:
 575        return ret;
 576}
 577
 578static kdev_t dev_unit(kdev_t dev)
 579{
 580        unsigned int mask;
 581        struct gendisk *hd = get_gendisk(dev);
 582
 583        if (!hd)
 584                return 0;
 585        mask = ~((1 << hd->minor_shift) - 1);
 586
 587        return MKDEV(MAJOR(dev), MINOR(dev) & mask);
 588}
 589
 590static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
 591{
 592        struct md_list_head *tmp;
 593        mdk_rdev_t *rdev;
 594
 595        ITERATE_RDEV(mddev,rdev,tmp)
 596                if (dev_unit(rdev->dev) == dev_unit(dev))
 597                        return rdev;
 598
 599        return NULL;
 600}
 601
 602static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
 603{
 604        struct md_list_head *tmp;
 605        mdk_rdev_t *rdev;
 606
 607        ITERATE_RDEV(mddev1,rdev,tmp)
 608                if (match_dev_unit(mddev2, rdev->dev))
 609                        return 1;
 610
 611        return 0;
 612}
 613
 614static MD_LIST_HEAD(all_raid_disks);
 615static MD_LIST_HEAD(pending_raid_disks);
 616
 617static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 618{
 619        mdk_rdev_t *same_pdev;
 620
 621        if (rdev->mddev) {
 622                MD_BUG();
 623                return;
 624        }
 625        same_pdev = match_dev_unit(mddev, rdev->dev);
 626        if (same_pdev)
 627                printk( KERN_WARNING
 628"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
 629"     protection against single-disk failure might be compromised.\n",
 630                        mdidx(mddev), partition_name(rdev->dev),
 631                                partition_name(same_pdev->dev));
 632
 633        md_list_add(&rdev->same_set, &mddev->disks);
 634        rdev->mddev = mddev;
 635        mddev->nb_dev++;
 636        printk(KERN_INFO "md: bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
 637}
 638
 639static void unbind_rdev_from_array(mdk_rdev_t * rdev)
 640{
 641        if (!rdev->mddev) {
 642                MD_BUG();
 643                return;
 644        }
 645        md_list_del(&rdev->same_set);
 646        MD_INIT_LIST_HEAD(&rdev->same_set);
 647        rdev->mddev->nb_dev--;
 648        printk(KERN_INFO "md: unbind<%s,%d>\n", partition_name(rdev->dev),
 649                                                 rdev->mddev->nb_dev);
 650        rdev->mddev = NULL;
 651}
 652
 653/*
 654 * prevent the device from being mounted, repartitioned or
 655 * otherwise reused by a RAID array (or any other kernel
 656 * subsystem), by opening the device. [simply getting an
 657 * inode is not enough, the SCSI module usage code needs
 658 * an explicit open() on the device]
 659 */
 660static int lock_rdev(mdk_rdev_t *rdev)
 661{
 662        int err = 0;
 663        struct block_device *bdev;
 664
 665        bdev = bdget(rdev->dev);
 666        if (!bdev)
 667                return -ENOMEM;
 668        err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
 669        if (!err)
 670                rdev->bdev = bdev;
 671        return err;
 672}
 673
 674static void unlock_rdev(mdk_rdev_t *rdev)
 675{
 676        struct block_device *bdev = rdev->bdev;
 677        rdev->bdev = NULL;
 678        if (!bdev)
 679                MD_BUG();
 680        blkdev_put(bdev, BDEV_RAW);
 681}
 682
 683void md_autodetect_dev(kdev_t dev);
 684
 685static void export_rdev(mdk_rdev_t * rdev)
 686{
 687        printk(KERN_INFO "md: export_rdev(%s)\n",partition_name(rdev->dev));
 688        if (rdev->mddev)
 689                MD_BUG();
 690        unlock_rdev(rdev);
 691        free_disk_sb(rdev);
 692        md_list_del(&rdev->all);
 693        MD_INIT_LIST_HEAD(&rdev->all);
 694        if (rdev->pending.next != &rdev->pending) {
 695                printk(KERN_INFO "md: (%s was pending)\n",
 696                        partition_name(rdev->dev));
 697                md_list_del(&rdev->pending);
 698                MD_INIT_LIST_HEAD(&rdev->pending);
 699        }
 700#ifndef MODULE
 701        md_autodetect_dev(rdev->dev);
 702#endif
 703        rdev->dev = 0;
 704        rdev->faulty = 0;
 705        kfree(rdev);
 706}
 707
 708static void kick_rdev_from_array(mdk_rdev_t * rdev)
 709{
 710        unbind_rdev_from_array(rdev);
 711        export_rdev(rdev);
 712}
 713
 714static void export_array(mddev_t *mddev)
 715{
 716        struct md_list_head *tmp;
 717        mdk_rdev_t *rdev;
 718        mdp_super_t *sb = mddev->sb;
 719
 720        if (mddev->sb) {
 721                mddev->sb = NULL;
 722                free_page((unsigned long) sb);
 723        }
 724
 725        ITERATE_RDEV(mddev,rdev,tmp) {
 726                if (!rdev->mddev) {
 727                        MD_BUG();
 728                        continue;
 729                }
 730                kick_rdev_from_array(rdev);
 731        }
 732        if (mddev->nb_dev)
 733                MD_BUG();
 734}
 735
 736static void free_mddev(mddev_t *mddev)
 737{
 738        if (!mddev) {
 739                MD_BUG();
 740                return;
 741        }
 742
 743        export_array(mddev);
 744        md_size[mdidx(mddev)] = 0;
 745        md_hd_struct[mdidx(mddev)].nr_sects = 0;
 746
 747        /*
 748         * Make sure nobody else is using this mddev
 749         * (careful, we rely on the global kernel lock here)
 750         */
 751        while (sem_getcount(&mddev->resync_sem) != 1)
 752                schedule();
 753        while (sem_getcount(&mddev->recovery_sem) != 1)
 754                schedule();
 755
 756        del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
 757        md_list_del(&mddev->all_mddevs);
 758        MD_INIT_LIST_HEAD(&mddev->all_mddevs);
 759        kfree(mddev);
 760        MOD_DEC_USE_COUNT;
 761}
 762
 763#undef BAD_CSUM
 764#undef BAD_MAGIC
 765#undef OUT_OF_MEM
 766#undef NO_SB
 767
 768static void print_desc(mdp_disk_t *desc)
 769{
 770        printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
 771                partition_name(MKDEV(desc->major,desc->minor)),
 772                desc->major,desc->minor,desc->raid_disk,desc->state);
 773}
 774
 775static void print_sb(mdp_super_t *sb)
 776{
 777        int i;
 778
 779        printk(KERN_INFO "md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
 780                sb->major_version, sb->minor_version, sb->patch_version,
 781                sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
 782                sb->ctime);
 783        printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
 784                sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
 785                sb->layout, sb->chunk_size);
 786        printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
 787                sb->utime, sb->state, sb->active_disks, sb->working_disks,
 788                sb->failed_disks, sb->spare_disks,
 789                sb->sb_csum, (unsigned long)sb->events_lo);
 790
 791        printk(KERN_INFO);
 792        for (i = 0; i < MD_SB_DISKS; i++) {
 793                mdp_disk_t *desc;
 794
 795                desc = sb->disks + i;
 796                if (desc->number || desc->major || desc->minor ||
 797                    desc->raid_disk || (desc->state && (desc->state != 4))) {
 798                        printk("     D %2d: ", i);
 799                        print_desc(desc);
 800                }
 801        }
 802        printk(KERN_INFO "md:     THIS: ");
 803        print_desc(&sb->this_disk);
 804
 805}
 806
 807static void print_rdev(mdk_rdev_t *rdev)
 808{
 809        printk(KERN_INFO "md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
 810                partition_name(rdev->dev), partition_name(rdev->old_dev),
 811                rdev->size, rdev->faulty, rdev->desc_nr);
 812        if (rdev->sb) {
 813                printk(KERN_INFO "md: rdev superblock:\n");
 814                print_sb(rdev->sb);
 815        } else
 816                printk(KERN_INFO "md: no rdev superblock!\n");
 817}
 818
 819void md_print_devices(void)
 820{
 821        struct md_list_head *tmp, *tmp2;
 822        mdk_rdev_t *rdev;
 823        mddev_t *mddev;
 824
 825        printk("\n");
 826        printk("md:     **********************************\n");
 827        printk("md:     * <COMPLETE RAID STATE PRINTOUT> *\n");
 828        printk("md:     **********************************\n");
 829        ITERATE_MDDEV(mddev,tmp) {
 830                printk("md%d: ", mdidx(mddev));
 831
 832                ITERATE_RDEV(mddev,rdev,tmp2)
 833                        printk("<%s>", partition_name(rdev->dev));
 834
 835                if (mddev->sb) {
 836                        printk(" array superblock:\n");
 837                        print_sb(mddev->sb);
 838                } else
 839                        printk(" no array superblock.\n");
 840
 841                ITERATE_RDEV(mddev,rdev,tmp2)
 842                        print_rdev(rdev);
 843        }
 844        printk("md:     **********************************\n");
 845        printk("\n");
 846}
 847
 848static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 849{
 850        int ret;
 851        mdp_super_t *tmp1, *tmp2;
 852
 853        tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
 854        tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
 855
 856        if (!tmp1 || !tmp2) {
 857                ret = 0;
 858                printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
 859                goto abort;
 860        }
 861
 862        *tmp1 = *sb1;
 863        *tmp2 = *sb2;
 864
 865        /*
 866         * nr_disks is not constant
 867         */
 868        tmp1->nr_disks = 0;
 869        tmp2->nr_disks = 0;
 870
 871        if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
 872                ret = 0;
 873        else
 874                ret = 1;
 875
 876abort:
 877        if (tmp1)
 878                kfree(tmp1);
 879        if (tmp2)
 880                kfree(tmp2);
 881
 882        return ret;
 883}
 884
 885static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
 886{
 887        if (    (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
 888                (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
 889                (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
 890                (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
 891
 892                return 1;
 893
 894        return 0;
 895}
 896
 897static mdk_rdev_t * find_rdev_all(kdev_t dev)
 898{
 899        struct md_list_head *tmp;
 900        mdk_rdev_t *rdev;
 901
 902        tmp = all_raid_disks.next;
 903        while (tmp != &all_raid_disks) {
 904                rdev = md_list_entry(tmp, mdk_rdev_t, all);
 905                if (rdev->dev == dev)
 906                        return rdev;
 907                tmp = tmp->next;
 908        }
 909        return NULL;
 910}
 911
 912#define GETBLK_FAILED KERN_ERR \
 913"md: getblk failed for device %s\n"
 914
 915static int write_disk_sb(mdk_rdev_t * rdev)
 916{
 917        kdev_t dev;
 918        unsigned long sb_offset, size;
 919
 920        if (!rdev->sb) {
 921                MD_BUG();
 922                return 1;
 923        }
 924        if (rdev->faulty) {
 925                MD_BUG();
 926                return 1;
 927        }
 928        if (rdev->sb->md_magic != MD_SB_MAGIC) {
 929                MD_BUG();
 930                return 1;
 931        }
 932
 933        dev = rdev->dev;
 934        sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
 935        if (rdev->sb_offset != sb_offset) {
 936                printk(KERN_INFO "%s's sb offset has changed from %ld to %ld, skipping\n",
 937                       partition_name(dev), rdev->sb_offset, sb_offset);
 938                goto skip;
 939        }
 940        /*
 941         * If the disk went offline meanwhile and it's just a spare, then
 942         * its size has changed to zero silently, and the MD code does
 943         * not yet know that it's faulty.
 944         */
 945        size = calc_dev_size(dev, rdev->mddev, 1);
 946        if (size != rdev->size) {
 947                printk(KERN_INFO "%s's size has changed from %ld to %ld since import, skipping\n",
 948                       partition_name(dev), rdev->size, size);
 949                goto skip;
 950        }
 951
 952        printk(KERN_INFO "(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset);
 953
 954        if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) {
 955                printk("md: write_disk_sb failed for device %s\n", partition_name(dev));
 956                return 1;
 957        }
 958skip:
 959        return 0;
 960}
 961#undef GETBLK_FAILED
 962
 963static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 964{
 965        int i, ok = 0;
 966        mdp_disk_t *desc;
 967
 968        for (i = 0; i < MD_SB_DISKS; i++) {
 969                desc = mddev->sb->disks + i;
 970#if 0
 971                if (disk_faulty(desc)) {
 972                        if (MKDEV(desc->major,desc->minor) == rdev->dev)
 973                                ok = 1;
 974                        continue;
 975                }
 976#endif
 977                if (MKDEV(desc->major,desc->minor) == rdev->dev) {
 978                        rdev->sb->this_disk = *desc;
 979                        rdev->desc_nr = desc->number;
 980                        ok = 1;
 981                        break;
 982                }
 983        }
 984
 985        if (!ok) {
 986                MD_BUG();
 987        }
 988}
 989
 990static int sync_sbs(mddev_t * mddev)
 991{
 992        mdk_rdev_t *rdev;
 993        mdp_super_t *sb;
 994        struct md_list_head *tmp;
 995
 996        ITERATE_RDEV(mddev,rdev,tmp) {
 997                if (rdev->faulty || rdev->alias_device)
 998                        continue;
 999                sb = rdev->sb;
1000                *sb = *mddev->sb;
1001                set_this_disk(mddev, rdev);
1002                sb->sb_csum = calc_sb_csum(sb);
1003        }
1004        return 0;
1005}
1006
1007int md_update_sb(mddev_t * mddev)
1008{
1009        int err, count = 100;
1010        struct md_list_head *tmp;
1011        mdk_rdev_t *rdev;
1012
1013        if (!mddev->sb_dirty) {
1014                printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0));
1015                return 0;
1016        }
1017        mddev->sb_dirty = 0;
1018repeat:
1019        mddev->sb->utime = CURRENT_TIME;
1020        if ((++mddev->sb->events_lo)==0)
1021                ++mddev->sb->events_hi;
1022
1023        if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
1024                /*
1025                 * oops, this 64-bit counter should never wrap.
1026                 * Either we are in around ~1 trillion A.C., assuming
1027                 * 1 reboot per second, or we have a bug:
1028                 */
1029                MD_BUG();
1030                mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
1031        }
1032        sync_sbs(mddev);
1033
1034        /*
1035         * do not write anything to disk if using
1036         * nonpersistent superblocks
1037         */
1038        if (mddev->sb->not_persistent)
1039                return 0;
1040
1041        printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
1042                                        mdidx(mddev));
1043
1044        err = 0;
1045        ITERATE_RDEV(mddev,rdev,tmp) {
1046                printk(KERN_INFO "md: ");
1047                if (rdev->faulty)
1048                        printk("(skipping faulty ");
1049                if (rdev->alias_device)
1050                        printk("(skipping alias ");
1051                if (!rdev->faulty && disk_faulty(&rdev->sb->this_disk)) {
1052                        printk("(skipping new-faulty %s )\n",
1053                               partition_name(rdev->dev));
1054                        continue;
1055                }
1056                printk("%s ", partition_name(rdev->dev));
1057                if (!rdev->faulty && !rdev->alias_device) {
1058                        printk("[events: %08lx]",
1059                                (unsigned long)rdev->sb->events_lo);
1060                        err += write_disk_sb(rdev);
1061                } else
1062                        printk(")\n");
1063        }
1064        if (err) {
1065                if (--count) {
1066                        printk(KERN_ERR "md: errors occurred during superblock update, repeating\n");
1067                        goto repeat;
1068                }
1069                printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n");
1070        }
1071        return 0;
1072}
1073
1074/*
1075 * Import a device. If 'on_disk', then sanity check the superblock
1076 *
1077 * mark the device faulty if:
1078 *
1079 *   - the device is nonexistent (zero size)
1080 *   - the device has no valid superblock
1081 *
1082 */
1083static int md_import_device(kdev_t newdev, int on_disk)
1084{
1085        int err;
1086        mdk_rdev_t *rdev;
1087        unsigned int size;
1088
1089        if (find_rdev_all(newdev))
1090                return -EEXIST;
1091
1092        rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
1093        if (!rdev) {
1094                printk(KERN_ERR "md: could not alloc mem for %s!\n", partition_name(newdev));
1095                return -ENOMEM;
1096        }
1097        memset(rdev, 0, sizeof(*rdev));
1098
1099        if (is_mounted(newdev)) {
1100                printk(KERN_WARNING "md: can not import %s, has active inodes!\n",
1101                        partition_name(newdev));
1102                err = -EBUSY;
1103                goto abort_free;
1104        }
1105
1106        if ((err = alloc_disk_sb(rdev)))
1107                goto abort_free;
1108
1109        rdev->dev = newdev;
1110        if (lock_rdev(rdev)) {
1111                printk(KERN_ERR "md: could not lock %s, zero-size? Marking faulty.\n",
1112                        partition_name(newdev));
1113                err = -EINVAL;
1114                goto abort_free;
1115        }
1116        rdev->desc_nr = -1;
1117        rdev->faulty = 0;
1118
1119        size = 0;
1120        if (blk_size[MAJOR(newdev)])
1121                size = blk_size[MAJOR(newdev)][MINOR(newdev)];
1122        if (!size) {
1123                printk(KERN_WARNING "md: %s has zero size, marking faulty!\n",
1124                                partition_name(newdev));
1125                err = -EINVAL;
1126                goto abort_free;
1127        }
1128
1129        if (on_disk) {
1130                if ((err = read_disk_sb(rdev))) {
1131                        printk(KERN_WARNING "md: could not read %s's sb, not importing!\n",
1132                               partition_name(newdev));
1133                        goto abort_free;
1134                }
1135                if ((err = check_disk_sb(rdev))) {
1136                        printk(KERN_WARNING "md: %s has invalid sb, not importing!\n",
1137                               partition_name(newdev));
1138                        goto abort_free;
1139                }
1140
1141                if (rdev->sb->level != -4) {
1142                        rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
1143                                                rdev->sb->this_disk.minor);
1144                        rdev->desc_nr = rdev->sb->this_disk.number;
1145                } else {
1146                        rdev->old_dev = MKDEV(0, 0);
1147                        rdev->desc_nr = -1;
1148                }
1149        }
1150        md_list_add(&rdev->all, &all_raid_disks);
1151        MD_INIT_LIST_HEAD(&rdev->pending);
1152
1153        return 0;
1154
1155abort_free:
1156        if (rdev->sb) {
1157                if (rdev->bdev)
1158                        unlock_rdev(rdev);
1159                free_disk_sb(rdev);
1160        }
1161        kfree(rdev);
1162        return err;
1163}
1164
1165/*
1166 * Check a full RAID array for plausibility
1167 */
1168
1169#define INCONSISTENT KERN_ERR \
1170"md: fatal superblock inconsistency in %s -- removing from array\n"
1171
1172#define OUT_OF_DATE KERN_ERR \
1173"md: superblock update time inconsistency -- using the most recent one\n"
1174
1175#define OLD_VERSION KERN_ALERT \
1176"md: md%d: unsupported raid array version %d.%d.%d\n"
1177
1178#define NOT_CLEAN_IGNORE KERN_ERR \
1179"md: md%d: raid array is not clean -- starting background reconstruction\n"
1180
1181#define UNKNOWN_LEVEL KERN_ERR \
1182"md: md%d: unsupported raid level %d\n"
1183
1184static int analyze_sbs(mddev_t * mddev)
1185{
1186        int out_of_date = 0, i, first;
1187        struct md_list_head *tmp, *tmp2;
1188        mdk_rdev_t *rdev, *rdev2, *freshest;
1189        mdp_super_t *sb;
1190
1191        /*
1192         * Verify the RAID superblock on each real device
1193         */
1194        ITERATE_RDEV(mddev,rdev,tmp) {
1195                if (rdev->faulty) {
1196                        MD_BUG();
1197                        goto abort;
1198                }
1199                if (!rdev->sb) {
1200                        MD_BUG();
1201                        goto abort;
1202                }
1203                if (check_disk_sb(rdev))
1204                        goto abort;
1205        }
1206
1207        /*
1208         * The superblock constant part has to be the same
1209         * for all disks in the array.
1210         */
1211        sb = NULL;
1212
1213        ITERATE_RDEV(mddev,rdev,tmp) {
1214                if (!sb) {
1215                        sb = rdev->sb;
1216                        continue;
1217                }
1218                if (!sb_equal(sb, rdev->sb)) {
1219                        printk(INCONSISTENT, partition_name(rdev->dev));
1220                        kick_rdev_from_array(rdev);
1221                        continue;
1222                }
1223        }
1224
1225        /*
1226         * OK, we have all disks and the array is ready to run. Let's
1227         * find the freshest superblock, that one will be the superblock
1228         * that represents the whole array.
1229         */
1230        if (!mddev->sb)
1231                if (alloc_array_sb(mddev))
1232                        goto abort;
1233        sb = mddev->sb;
1234        freshest = NULL;
1235
1236        ITERATE_RDEV(mddev,rdev,tmp) {
1237                __u64 ev1, ev2;
1238                /*
1239                 * if the checksum is invalid, use the superblock
1240                 * only as a last resort. (decrease it's age by
1241                 * one event)
1242                 */
1243                if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
1244                        if (rdev->sb->events_lo || rdev->sb->events_hi)
1245                                if ((rdev->sb->events_lo--)==0)
1246                                        rdev->sb->events_hi--;
1247                }
1248
1249                printk(KERN_INFO "md: %s's event counter: %08lx\n",
1250                       partition_name(rdev->dev),
1251                        (unsigned long)rdev->sb->events_lo);
1252                if (!freshest) {
1253                        freshest = rdev;
1254                        continue;
1255                }
1256                /*
1257                 * Find the newest superblock version
1258                 */
1259                ev1 = md_event(rdev->sb);
1260                ev2 = md_event(freshest->sb);
1261                if (ev1 != ev2) {
1262                        out_of_date = 1;
1263                        if (ev1 > ev2)
1264                                freshest = rdev;
1265                }
1266        }
1267        if (out_of_date) {
1268                printk(OUT_OF_DATE);
1269                printk(KERN_INFO "md: freshest: %s\n", partition_name(freshest->dev));
1270        }
1271        memcpy (sb, freshest->sb, sizeof(*sb));
1272
1273        /*
1274         * at this point we have picked the 'best' superblock
1275         * from all available superblocks.
1276         * now we validate this superblock and kick out possibly
1277         * failed disks.
1278         */
1279        ITERATE_RDEV(mddev,rdev,tmp) {
1280                /*
1281                 * Kick all non-fresh devices
1282                 */
1283                __u64 ev1, ev2;
1284                ev1 = md_event(rdev->sb);
1285                ev2 = md_event(sb);
1286                ++ev1;
1287                if (ev1 < ev2) {
1288                        printk(KERN_WARNING "md: kicking non-fresh %s from array!\n",
1289                                                partition_name(rdev->dev));
1290                        kick_rdev_from_array(rdev);
1291                        continue;
1292                }
1293        }
1294
1295        /*
1296         * Fix up changed device names ... but only if this disk has a
1297         * recent update time. Use faulty checksum ones too.
1298         */
1299        if (mddev->sb->level != -4)
1300        ITERATE_RDEV(mddev,rdev,tmp) {
1301                __u64 ev1, ev2, ev3;
1302                if (rdev->faulty || rdev->alias_device) {
1303                        MD_BUG();
1304                        goto abort;
1305                }
1306                ev1 = md_event(rdev->sb);
1307                ev2 = md_event(sb);
1308                ev3 = ev2;
1309                --ev3;
1310                if ((rdev->dev != rdev->old_dev) &&
1311                        ((ev1 == ev2) || (ev1 == ev3))) {
1312                        mdp_disk_t *desc;
1313
1314                        printk(KERN_WARNING "md: device name has changed from %s to %s since last import!\n",
1315                               partition_name(rdev->old_dev), partition_name(rdev->dev));
1316                        if (rdev->desc_nr == -1) {
1317                                MD_BUG();
1318                                goto abort;
1319                        }
1320                        desc = &sb->disks[rdev->desc_nr];
1321                        if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
1322                                MD_BUG();
1323                                goto abort;
1324                        }
1325                        desc->major = MAJOR(rdev->dev);
1326                        desc->minor = MINOR(rdev->dev);
1327                        desc = &rdev->sb->this_disk;
1328                        desc->major = MAJOR(rdev->dev);
1329                        desc->minor = MINOR(rdev->dev);
1330                }
1331        }
1332
1333        /*
1334         * Remove unavailable and faulty devices ...
1335         *
1336         * note that if an array becomes completely unrunnable due to
1337         * missing devices, we do not write the superblock back, so the
1338         * administrator has a chance to fix things up. The removal thus
1339         * only happens if it's nonfatal to the contents of the array.
1340         */
1341        for (i = 0; i < MD_SB_DISKS; i++) {
1342                int found;
1343                mdp_disk_t *desc;
1344                kdev_t dev;
1345
1346                desc = sb->disks + i;
1347                dev = MKDEV(desc->major, desc->minor);
1348
1349                /*
1350                 * We kick faulty devices/descriptors immediately.
1351                 *
1352                 * Note: multipath devices are a special case.  Since we
1353                 * were able to read the superblock on the path, we don't
1354                 * care if it was previously marked as faulty, it's up now
1355                 * so enable it.
1356                 */
1357                if (disk_faulty(desc) && mddev->sb->level != -4) {
1358                        found = 0;
1359                        ITERATE_RDEV(mddev,rdev,tmp) {
1360                                if (rdev->desc_nr != desc->number)
1361                                        continue;
1362                                printk(KERN_WARNING "md%d: kicking faulty %s!\n",
1363                                        mdidx(mddev),partition_name(rdev->dev));
1364                                kick_rdev_from_array(rdev);
1365                                found = 1;
1366                                break;
1367                        }
1368                        if (!found) {
1369                                if (dev == MKDEV(0,0))
1370                                        continue;
1371                                printk(KERN_WARNING "md%d: removing former faulty %s!\n",
1372                                        mdidx(mddev), partition_name(dev));
1373                        }
1374                        remove_descriptor(desc, sb);
1375                        continue;
1376                } else if (disk_faulty(desc)) {
1377                        /*
1378                         * multipath entry marked as faulty, unfaulty it
1379                         */
1380                        rdev = find_rdev(mddev, dev);
1381                        if(rdev)
1382                                mark_disk_spare(desc);
1383                        else
1384                                remove_descriptor(desc, sb);
1385                }
1386
1387                if (dev == MKDEV(0,0))
1388                        continue;
1389                /*
1390                 * Is this device present in the rdev ring?
1391                 */
1392                found = 0;
1393                ITERATE_RDEV(mddev,rdev,tmp) {
1394                        /*
1395                         * Multi-path IO special-case: since we have no
1396                         * this_disk descriptor at auto-detect time,
1397                         * we cannot check rdev->number.
1398                         * We can check the device though.
1399                         */
1400                        if ((sb->level == -4) && (rdev->dev ==
1401                                        MKDEV(desc->major,desc->minor))) {
1402                                found = 1;
1403                                break;
1404                        }
1405                        if (rdev->desc_nr == desc->number) {
1406                                found = 1;
1407                                break;
1408                        }
1409                }
1410                if (found)
1411                        continue;
1412
1413                printk(KERN_WARNING "md%d: former device %s is unavailable, removing from array!\n",
1414                       mdidx(mddev), partition_name(dev));
1415                remove_descriptor(desc, sb);
1416        }
1417
1418        /*
1419         * Double check wether all devices mentioned in the
1420         * superblock are in the rdev ring.
1421         */
1422        first = 1;
1423        for (i = 0; i < MD_SB_DISKS; i++) {
1424                mdp_disk_t *desc;
1425                kdev_t dev;
1426
1427                desc = sb->disks + i;
1428                dev = MKDEV(desc->major, desc->minor);
1429
1430                if (dev == MKDEV(0,0))
1431                        continue;
1432
1433                if (disk_faulty(desc)) {
1434                        MD_BUG();
1435                        goto abort;
1436                }
1437
1438                rdev = find_rdev(mddev, dev);
1439                if (!rdev) {
1440                        MD_BUG();
1441                        goto abort;
1442                }
1443                /*
1444                 * In the case of Multipath-IO, we have no
1445                 * other information source to find out which
1446                 * disk is which, only the position of the device
1447                 * in the superblock:
1448                 */
1449                if (mddev->sb->level == -4) {
1450                        if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) {
1451                                MD_BUG();
1452                                goto abort;
1453                        }
1454                        rdev->desc_nr = i;
1455                        if (!first)
1456                                rdev->alias_device = 1;
1457                        else
1458                                first = 0;
1459                }
1460        }
1461
1462        /*
1463         * Kick all rdevs that are not in the
1464         * descriptor array:
1465         */
1466        ITERATE_RDEV(mddev,rdev,tmp) {
1467                if (rdev->desc_nr == -1)
1468                        kick_rdev_from_array(rdev);
1469        }
1470
1471        /*
1472         * Do a final reality check.
1473         */
1474        if (mddev->sb->level != -4) {
1475                ITERATE_RDEV(mddev,rdev,tmp) {
1476                        if (rdev->desc_nr == -1) {
1477                                MD_BUG();
1478                                goto abort;
1479                        }
1480                        /*
1481                         * is the desc_nr unique?
1482                         */
1483                        ITERATE_RDEV(mddev,rdev2,tmp2) {
1484                                if ((rdev2 != rdev) &&
1485                                                (rdev2->desc_nr == rdev->desc_nr)) {
1486                                        MD_BUG();
1487                                        goto abort;
1488                                }
1489                        }
1490                        /*
1491                         * is the device unique?
1492                         */
1493                        ITERATE_RDEV(mddev,rdev2,tmp2) {
1494                                if ((rdev2 != rdev) &&
1495                                                (rdev2->dev == rdev->dev)) {
1496                                        MD_BUG();
1497                                        goto abort;
1498                                }
1499                        }
1500                }
1501        }
1502
1503        /*
1504         * Check if we can support this RAID array
1505         */
1506        if (sb->major_version != MD_MAJOR_VERSION ||
1507                        sb->minor_version > MD_MINOR_VERSION) {
1508
1509                printk(OLD_VERSION, mdidx(mddev), sb->major_version,
1510                                sb->minor_version, sb->patch_version);
1511                goto abort;
1512        }
1513
1514        if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
1515                        (sb->level == 4) || (sb->level == 5)))
1516                printk(NOT_CLEAN_IGNORE, mdidx(mddev));
1517
1518        return 0;
1519abort:
1520        return 1;
1521}
1522
1523#undef INCONSISTENT
1524#undef OUT_OF_DATE
1525#undef OLD_VERSION
1526#undef OLD_LEVEL
1527
1528static int device_size_calculation(mddev_t * mddev)
1529{
1530        int data_disks = 0, persistent;
1531        unsigned int readahead;
1532        mdp_super_t *sb = mddev->sb;
1533        struct md_list_head *tmp;
1534        mdk_rdev_t *rdev;
1535
1536        /*
1537         * Do device size calculation. Bail out if too small.
1538         * (we have to do this after having validated chunk_size,
1539         * because device size has to be modulo chunk_size)
1540         */
1541        persistent = !mddev->sb->not_persistent;
1542        ITERATE_RDEV(mddev,rdev,tmp) {
1543                if (rdev->faulty)
1544                        continue;
1545                if (rdev->size) {
1546                        MD_BUG();
1547                        continue;
1548                }
1549                rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
1550                if (rdev->size < sb->chunk_size / 1024) {
1551                        printk(KERN_WARNING
1552                                "md: Dev %s smaller than chunk_size: %ldk < %dk\n",
1553                                partition_name(rdev->dev),
1554                                rdev->size, sb->chunk_size / 1024);
1555                        return -EINVAL;
1556                }
1557        }
1558
1559        switch (sb->level) {
1560                case -4:
1561                        data_disks = 1;
1562                        break;
1563                case -3:
1564                        data_disks = 1;
1565                        break;
1566                case -2:
1567                        data_disks = 1;
1568                        break;
1569                case -1:
1570                        zoned_raid_size(mddev);
1571                        data_disks = 1;
1572                        break;
1573                case 0:
1574                        zoned_raid_size(mddev);
1575                        data_disks = sb->raid_disks;
1576                        break;
1577                case 1:
1578                        data_disks = 1;
1579                        break;
1580                case 4:
1581                case 5:
1582                        data_disks = sb->raid_disks-1;
1583                        break;
1584                default:
1585                        printk(UNKNOWN_LEVEL, mdidx(mddev), sb->level);
1586                        goto abort;
1587        }
1588        if (!md_size[mdidx(mddev)])
1589                md_size[mdidx(mddev)] = sb->size * data_disks;
1590
1591        readahead = MD_READAHEAD;
1592        if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) {
1593                readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
1594                if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
1595                        readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
1596        } else {
1597                // (no multipath branch - it uses the default setting)
1598                if (sb->level == -3)
1599                        readahead = 0;
1600        }
1601        md_maxreadahead[mdidx(mddev)] = readahead;
1602
1603        printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
1604                mdidx(mddev), readahead*(PAGE_SIZE/1024));
1605
1606        printk(KERN_INFO
1607                "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
1608                        mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
1609        return 0;
1610abort:
1611        return 1;
1612}
1613
1614
1615#define TOO_BIG_CHUNKSIZE KERN_ERR \
1616"too big chunk_size: %d > %d\n"
1617
1618#define TOO_SMALL_CHUNKSIZE KERN_ERR \
1619"too small chunk_size: %d < %ld\n"
1620
1621#define BAD_CHUNKSIZE KERN_ERR \
1622"no chunksize specified, see 'man raidtab'\n"
1623
1624static int do_md_run(mddev_t * mddev)
1625{
1626        int pnum, err;
1627        int chunk_size;
1628        struct md_list_head *tmp;
1629        mdk_rdev_t *rdev;
1630
1631
1632        if (!mddev->nb_dev) {
1633                MD_BUG();
1634                return -EINVAL;
1635        }
1636
1637        if (mddev->pers)
1638                return -EBUSY;
1639
1640        /*
1641         * Resize disks to align partitions size on a given
1642         * chunk size.
1643         */
1644        md_size[mdidx(mddev)] = 0;
1645
1646        /*
1647         * Analyze all RAID superblock(s)
1648         */
1649        if (analyze_sbs(mddev)) {
1650                MD_BUG();
1651                return -EINVAL;
1652        }
1653
1654        chunk_size = mddev->sb->chunk_size;
1655        pnum = level_to_pers(mddev->sb->level);
1656
1657        mddev->param.chunk_size = chunk_size;
1658        mddev->param.personality = pnum;
1659
1660        if ((pnum != MULTIPATH) && (pnum != RAID1)) {
1661                if (!chunk_size) {
1662                        /*
1663                         * 'default chunksize' in the old md code used to
1664                         * be PAGE_SIZE, baaad.
1665                         * we abort here to be on the safe side. We dont
1666                         * want to continue the bad practice.
1667                         */
1668                        printk(BAD_CHUNKSIZE);
1669                        return -EINVAL;
1670                }
1671                if (chunk_size > MAX_CHUNK_SIZE) {
1672                        printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
1673                        return -EINVAL;
1674                }
1675                /*
1676                 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
1677                 */
1678                if ( (1 << ffz(~chunk_size)) != chunk_size) {
1679                        MD_BUG();
1680                        return -EINVAL;
1681                }
1682                if (chunk_size < PAGE_SIZE) {
1683                        printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
1684                        return -EINVAL;
1685                }
1686        } else
1687                if (chunk_size)
1688                        printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n",
1689                               mddev->sb->level);
1690
1691        if (pnum >= MAX_PERSONALITY) {
1692                MD_BUG();
1693                return -EINVAL;
1694        }
1695
1696        if (!pers[pnum])
1697        {
1698#ifdef CONFIG_KMOD
1699                char module_name[80];
1700                sprintf (module_name, "md-personality-%d", pnum);
1701                request_module (module_name);
1702                if (!pers[pnum])
1703#endif
1704                {
1705                        printk(KERN_ERR "md: personality %d is not loaded!\n",
1706                                pnum);
1707                        return -EINVAL;
1708                }
1709        }
1710
1711        if (device_size_calculation(mddev))
1712                return -EINVAL;
1713
1714        /*
1715         * Drop all container device buffers, from now on
1716         * the only valid external interface is through the md
1717         * device.
1718         * Also find largest hardsector size
1719         */
1720        md_hardsect_sizes[mdidx(mddev)] = 512;
1721        ITERATE_RDEV(mddev,rdev,tmp) {
1722                if (rdev->faulty)
1723                        continue;
1724                invalidate_device(rdev->dev, 1);
1725                if (get_hardsect_size(rdev->dev)
1726                        > md_hardsect_sizes[mdidx(mddev)])
1727                        md_hardsect_sizes[mdidx(mddev)] =
1728                                get_hardsect_size(rdev->dev);
1729        }
1730        md_blocksizes[mdidx(mddev)] = 1024;
1731        if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
1732                md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
1733        mddev->pers = pers[pnum];
1734
1735        err = mddev->pers->run(mddev);
1736        if (err) {
1737                printk(KERN_ERR "md: pers->run() failed ...\n");
1738                mddev->pers = NULL;
1739                return -EINVAL;
1740        }
1741
1742        mddev->sb->state &= ~(1 << MD_SB_CLEAN);
1743        mddev->sb_dirty = 1;
1744        md_update_sb(mddev);
1745
1746        /*
1747         * md_size has units of 1K blocks, which are
1748         * twice as large as sectors.
1749         */
1750        md_hd_struct[mdidx(mddev)].start_sect = 0;
1751        register_disk(&md_gendisk, MKDEV(MAJOR_NR,mdidx(mddev)),
1752                        1, &md_fops, md_size[mdidx(mddev)]<<1);
1753
1754        read_ahead[MD_MAJOR] = 1024;
1755        return (0);
1756}
1757
1758#undef TOO_BIG_CHUNKSIZE
1759#undef BAD_CHUNKSIZE
1760
1761#define OUT(x) do { err = (x); goto out; } while (0)
1762
1763static int restart_array(mddev_t *mddev)
1764{
1765        int err = 0;
1766
1767        /*
1768         * Complain if it has no devices
1769         */
1770        if (!mddev->nb_dev)
1771                OUT(-ENXIO);
1772
1773        if (mddev->pers) {
1774                if (!mddev->ro)
1775                        OUT(-EBUSY);
1776
1777                mddev->ro = 0;
1778                set_device_ro(mddev_to_kdev(mddev), 0);
1779
1780                printk(KERN_INFO
1781                        "md: md%d switched to read-write mode.\n", mdidx(mddev));
1782                /*
1783                 * Kick recovery or resync if necessary
1784                 */
1785                md_recover_arrays();
1786                if (mddev->pers->restart_resync)
1787                        mddev->pers->restart_resync(mddev);
1788        } else {
1789                printk(KERN_ERR "md: md%d has no personality assigned.\n",
1790                        mdidx(mddev));
1791                err = -EINVAL;
1792        }
1793
1794out:
1795        return err;
1796}
1797
#define STILL_MOUNTED KERN_WARNING \
"md: md%d still mounted.\n"
#define STILL_IN_USE \
"md: md%d still in use.\n"

/*
 * do_md_stop - stop an array completely, or switch it to read-only mode.
 *
 * @mddev: the array to act on
 * @ro:    non-zero: keep the array but make it read-only;
 *         zero: final stop, the mddev is handed to free_mddev()
 *
 * Returns 0 on success, -ENXIO when read-only mode is requested for an
 * array whose ->ro flag is already set, and -EBUSY when the personality's
 * ->stop() method reports failure.
 *
 * After a successful final stop (ro == 0) the mddev has been passed to
 * free_mddev(), so the caller must not touch it again.
 *
 * NOTE(review): OUT() is defined earlier in the file and is not visible
 * here; presumably it stores its argument in 'err' and jumps to 'out' --
 * confirm against the definition.
 */
static int do_md_stop(mddev_t * mddev, int ro)
{
        int err = 0, resync_interrupted = 0;
        kdev_t dev = mddev_to_kdev(mddev);

#if 0 /* ->active is not currently reliable */
        if (atomic_read(&mddev->active)>1) {
                printk(STILL_IN_USE, mdidx(mddev));
                OUT(-EBUSY);
        }
#endif

        /* Everything below only applies when a personality is attached. */
        if (mddev->pers) {
                /*
                 * It is safe to call stop here, it only frees private
                 * data. Also, it tells us if a device is unstoppable
                 * (eg. resyncing is in progress)
                 */
                if (mddev->pers->stop_resync)
                        if (mddev->pers->stop_resync(mddev))
                                resync_interrupted = 1;

                if (mddev->recovery_running)
                        md_interrupt_thread(md_recovery_thread);

                /*
                 * This synchronizes with signal delivery to the
                 * resync or reconstruction thread. It also nicely
                 * hangs the process if some reconstruction has not
                 * finished.
                 */
                down(&mddev->recovery_sem);
                up(&mddev->recovery_sem);

                invalidate_device(dev, 1);

                if (ro) {
                        /* already read-only: nothing to switch */
                        if (mddev->ro)
                                OUT(-ENXIO);
                        mddev->ro = 1;
                } else {
                        /*
                         * Final stop: lift the device-level read-only flag
                         * around ->stop(), and put it back if ->stop()
                         * fails so the array is left as it was found.
                         */
                        if (mddev->ro)
                                set_device_ro(dev, 0);
                        if (mddev->pers->stop(mddev)) {
                                if (mddev->ro)
                                        set_device_ro(dev, 1);
                                OUT(-EBUSY);
                        }
                        if (mddev->ro)
                                mddev->ro = 0;
                }
                if (mddev->sb) {
                        /*
                         * mark it clean only if there was no resync
                         * interrupted.
                         */
                        if (!mddev->recovery_running && !resync_interrupted) {
                                printk(KERN_INFO "md: marking sb clean...\n");
                                mddev->sb->state |= 1 << MD_SB_CLEAN;
                        }
                        /* force the superblock to be written out */
                        mddev->sb_dirty = 1;
                        md_update_sb(mddev);
                }
                /* set the device-level flag only after the superblock update */
                if (ro)
                        set_device_ro(dev, 1);
        }

        /*
         * Free resources if final stop
         */
        if (!ro) {
                printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
                free_mddev(mddev);

        } else
                printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev));
out:
        return err;
}

#undef OUT
1884
1885/*
1886 * We have to safely support old arrays too.
1887 */
1888int detect_old_array(mdp_super_t *sb)
1889{
1890        if (sb->major_version > 0)
1891                return 0;
1892        if (sb->minor_version >= 90)
1893                return 0;
1894
1895        return -EINVAL;
1896}
1897
1898
1899static void autorun_array(mddev_t *mddev)
1900{
1901        mdk_rdev_t *rdev;
1902        struct md_list_head *tmp;
1903        int err;
1904
1905        if (mddev->disks.prev == &mddev->disks) {
1906                MD_BUG();
1907                return;
1908        }
1909
1910        printk(KERN_INFO "md: running: ");
1911
1912        ITERATE_RDEV(mddev,rdev,tmp) {
1913                printk("<%s>", partition_name(rdev->dev));
1914        }
1915        printk("\n");
1916
1917        err = do_md_run (mddev);
1918        if (err) {
1919                printk(KERN_WARNING "md :do_md_run() returned %d\n", err);
1920                /*
1921                 * prevent the writeback of an unrunnable array
1922                 */
1923                mddev->sb_dirty = 0;
1924                do_md_stop (mddev, 0);
1925        }
1926}
1927
1928/*
1929 * lets try to run arrays based on all disks that have arrived
1930 * until now. (those are in the ->pending list)
1931 *
1932 * the method: pick the first pending disk, collect all disks with
1933 * the same UUID, remove all from the pending list and put them into
1934 * the 'same_array' list. Then order this list based on superblock
1935 * update time (freshest comes first), kick out 'old' disks and
1936 * compare superblocks. If everything's fine then run it.
1937 *
1938 * If "unit" is allocated, then bump its reference count
1939 */
1940static void autorun_devices(kdev_t countdev)
1941{
1942        struct md_list_head candidates;
1943        struct md_list_head *tmp;
1944        mdk_rdev_t *rdev0, *rdev;
1945        mddev_t *mddev;
1946        kdev_t md_kdev;
1947
1948
1949        printk(KERN_INFO "md: autorun ...\n");
1950        while (pending_raid_disks.next != &pending_raid_disks) {
1951                rdev0 = md_list_entry(pending_raid_disks.next,
1952                                         mdk_rdev_t, pending);
1953
1954                printk(KERN_INFO "md: considering %s ...\n", partition_name(rdev0->dev));
1955                MD_INIT_LIST_HEAD(&candidates);
1956                ITERATE_RDEV_PENDING(rdev,tmp) {
1957                        if (uuid_equal(rdev0, rdev)) {
1958                                if (!sb_equal(rdev0->sb, rdev->sb)) {
1959                                        printk(KERN_WARNING
1960                                               "md: %s has same UUID as %s, but superblocks differ ...\n",
1961                                               partition_name(rdev->dev), partition_name(rdev0->dev));
1962                                        continue;
1963                                }
1964                                printk(KERN_INFO "md:  adding %s ...\n", partition_name(rdev->dev));
1965                                md_list_del(&rdev->pending);
1966                                md_list_add(&rdev->pending, &candidates);
1967                        }
1968                }
1969                /*
1970                 * now we have a set of devices, with all of them having
1971                 * mostly sane superblocks. It's time to allocate the
1972                 * mddev.
1973                 */
1974                md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
1975                mddev = kdev_to_mddev(md_kdev);
1976                if (mddev) {
1977                        printk(KERN_WARNING "md: md%d already running, cannot run %s\n",
1978                               mdidx(mddev), partition_name(rdev0->dev));
1979                        ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
1980                                export_rdev(rdev);
1981                        continue;
1982                }
1983                mddev = alloc_mddev(md_kdev);
1984                if (!mddev) {
1985                        printk(KERN_ERR "md: cannot allocate memory for md drive.\n");
1986                        break;
1987                }
1988                if (md_kdev == countdev)
1989                        atomic_inc(&mddev->active);
1990                printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
1991                ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
1992                        bind_rdev_to_array(rdev, mddev);
1993                        md_list_del(&rdev->pending);
1994                        MD_INIT_LIST_HEAD(&rdev->pending);
1995                }
1996                autorun_array(mddev);
1997        }
1998        printk(KERN_INFO "md: ... autorun DONE.\n");
1999}
2000
2001/*
2002 * import RAID devices based on one partition
2003 * if possible, the array gets run as well.
2004 */
2005
2006#define BAD_VERSION KERN_ERR \
2007"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
2008
2009#define OUT_OF_MEM KERN_ALERT \
2010"md: out of memory.\n"
2011
2012#define NO_DEVICE KERN_ERR \
2013"md: disabled device %s\n"
2014
2015#define AUTOADD_FAILED KERN_ERR \
2016"md: auto-adding devices to md%d FAILED (error %d).\n"
2017
2018#define AUTOADD_FAILED_USED KERN_ERR \
2019"md: cannot auto-add device %s to md%d, already used.\n"
2020
2021#define AUTORUN_FAILED KERN_ERR \
2022"md: auto-running md%d FAILED (error %d).\n"
2023
2024#define MDDEV_BUSY KERN_ERR \
2025"md: cannot auto-add to md%d, already running.\n"
2026
2027#define AUTOADDING KERN_INFO \
2028"md: auto-adding devices to md%d, based on %s's superblock.\n"
2029
2030#define AUTORUNNING KERN_INFO \
2031"md: auto-running md%d.\n"
2032
2033static int autostart_array(kdev_t startdev, kdev_t countdev)
2034{
2035        int err = -EINVAL, i;
2036        mdp_super_t *sb = NULL;
2037        mdk_rdev_t *start_rdev = NULL, *rdev;
2038
2039        if (md_import_device(startdev, 1)) {
2040                printk(KERN_WARNING "md: could not import %s!\n", partition_name(startdev));
2041                goto abort;
2042        }
2043
2044        start_rdev = find_rdev_all(startdev);
2045        if (!start_rdev) {
2046                MD_BUG();
2047                goto abort;
2048        }
2049        if (start_rdev->faulty) {
2050                printk(KERN_WARNING "md: can not autostart based on faulty %s!\n",
2051                                                partition_name(startdev));
2052                goto abort;
2053        }
2054        md_list_add(&start_rdev->pending, &pending_raid_disks);
2055
2056        sb = start_rdev->sb;
2057
2058        err = detect_old_array(sb);
2059        if (err) {
2060                printk(KERN_WARNING "md: array version is too old to be autostarted ,"
2061                       "use raidtools 0.90 mkraid --upgrade to upgrade the array "
2062                       "without data loss!\n");
2063                goto abort;
2064        }
2065
2066        for (i = 0; i < MD_SB_DISKS; i++) {
2067                mdp_disk_t *desc;
2068                kdev_t dev;
2069
2070                desc = sb->disks + i;
2071                dev = MKDEV(desc->major, desc->minor);
2072
2073                if (dev == MKDEV(0,0))
2074                        continue;
2075                if (dev == startdev)
2076                        continue;
2077                if (md_import_device(dev, 1)) {
2078                        printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n",
2079                               partition_name(dev));
2080                        continue;
2081                }
2082                rdev = find_rdev_all(dev);
2083                if (!rdev) {
2084                        MD_BUG();
2085                        goto abort;
2086                }
2087                md_list_add(&rdev->pending, &pending_raid_disks);
2088        }
2089
2090        /*
2091         * possibly return codes
2092         */
2093        autorun_devices(countdev);
2094        return 0;
2095
2096abort:
2097        if (start_rdev)
2098                export_rdev(start_rdev);
2099        return err;
2100}
2101
2102#undef BAD_VERSION
2103#undef OUT_OF_MEM
2104#undef NO_DEVICE
2105#undef AUTOADD_FAILED_USED
2106#undef AUTOADD_FAILED
2107#undef AUTORUN_FAILED
2108#undef AUTOADDING
2109#undef AUTORUNNING
2110
2111
2112static int get_version(void * arg)
2113{
2114        mdu_version_t ver;
2115
2116        ver.major = MD_MAJOR_VERSION;
2117        ver.minor = MD_MINOR_VERSION;
2118        ver.patchlevel = MD_PATCHLEVEL_VERSION;
2119
2120        if (md_copy_to_user(arg, &ver, sizeof(ver)))
2121                return -EFAULT;
2122
2123        return 0;
2124}
2125
2126#define SET_FROM_SB(x) info.x = mddev->sb->x
2127static int get_array_info(mddev_t * mddev, void * arg)
2128{
2129        mdu_array_info_t info;
2130
2131        if (!mddev->sb) {
2132                MD_BUG();
2133                return -EINVAL;
2134        }
2135
2136        SET_FROM_SB(major_version);
2137        SET_FROM_SB(minor_version);
2138        SET_FROM_SB(patch_version);
2139        SET_FROM_SB(ctime);
2140        SET_FROM_SB(level);
2141        SET_FROM_SB(size);
2142        SET_FROM_SB(nr_disks);
2143        SET_FROM_SB(raid_disks);
2144        SET_FROM_SB(md_minor);
2145        SET_FROM_SB(not_persistent);
2146
2147        SET_FROM_SB(utime);
2148        SET_FROM_SB(state);
2149        SET_FROM_SB(active_disks);
2150        SET_FROM_SB(working_disks);
2151        SET_FROM_SB(failed_disks);
2152        SET_FROM_SB(spare_disks);
2153
2154        SET_FROM_SB(layout);
2155        SET_FROM_SB(chunk_size);
2156
2157        if (md_copy_to_user(arg, &info, sizeof(info)))
2158                return -EFAULT;
2159
2160        return 0;
2161}
2162#undef SET_FROM_SB
2163
2164#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
2165static int get_disk_info(mddev_t * mddev, void * arg)
2166{
2167        mdu_disk_info_t info;
2168        unsigned int nr;
2169
2170        if (!mddev->sb)
2171                return -EINVAL;
2172
2173        if (md_copy_from_user(&info, arg, sizeof(info)))
2174                return -EFAULT;
2175
2176        nr = info.number;
2177        if (nr >= MD_SB_DISKS)
2178                return -EINVAL;
2179
2180        SET_FROM_SB(major);
2181        SET_FROM_SB(minor);
2182        SET_FROM_SB(raid_disk);
2183        SET_FROM_SB(state);
2184
2185        if (md_copy_to_user(arg, &info, sizeof(info)))
2186                return -EFAULT;
2187
2188        return 0;
2189}
2190#undef SET_FROM_SB
2191
2192#define SET_SB(x) mddev->sb->disks[nr].x = info->x
2193
2194static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
2195{
2196        int err, size, persistent;
2197        mdk_rdev_t *rdev;
2198        unsigned int nr;
2199        kdev_t dev;
2200        dev = MKDEV(info->major,info->minor);
2201
2202        if (find_rdev_all(dev)) {
2203                printk(KERN_WARNING "md: device %s already used in a RAID array!\n",
2204                       partition_name(dev));
2205                return -EBUSY;
2206        }
2207        if (!mddev->sb) {
2208                /* expecting a device which has a superblock */
2209                err = md_import_device(dev, 1);
2210                if (err) {
2211                        printk(KERN_WARNING "md: md_import_device returned %d\n", err);
2212                        return -EINVAL;
2213                }
2214                rdev = find_rdev_all(dev);
2215                if (!rdev) {
2216                        MD_BUG();
2217                        return -EINVAL;
2218                }
2219                if (mddev->nb_dev) {
2220                        mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
2221                                                        mdk_rdev_t, same_set);
2222                        if (!uuid_equal(rdev0, rdev)) {
2223                                printk(KERN_WARNING "md: %s has different UUID to %s\n",
2224                                       partition_name(rdev->dev), partition_name(rdev0->dev));
2225                                export_rdev(rdev);
2226                                return -EINVAL;
2227                        }
2228                        if (!sb_equal(rdev0->sb, rdev->sb)) {
2229                                printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n",
2230                                       partition_name(rdev->dev), partition_name(rdev0->dev));
2231                                export_rdev(rdev);
2232                                return -EINVAL;
2233                        }
2234                }
2235                bind_rdev_to_array(rdev, mddev);
2236                return 0;
2237        }
2238
2239        nr = info->number;
2240        if (nr >= mddev->sb->nr_disks) {
2241                MD_BUG();
2242                return -EINVAL;
2243        }
2244
2245
2246        SET_SB(number);
2247        SET_SB(major);
2248        SET_SB(minor);
2249        SET_SB(raid_disk);
2250        SET_SB(state);
2251
2252        if ((info->state & (1<<MD_DISK_FAULTY))==0) {
2253                err = md_import_device (dev, 0);
2254                if (err) {
2255                        printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
2256                        return -EINVAL;
2257                }
2258                rdev = find_rdev_all(dev);
2259                if (!rdev) {
2260                        MD_BUG();
2261                        return -EINVAL;
2262                }
2263
2264                rdev->old_dev = dev;
2265                rdev->desc_nr = info->number;
2266
2267                bind_rdev_to_array(rdev, mddev);
2268
2269                persistent = !mddev->sb->not_persistent;
2270                if (!persistent)
2271                        printk(KERN_INFO "md: nonpersistent superblock ...\n");
2272
2273                size = calc_dev_size(dev, mddev, persistent);
2274                rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
2275
2276                if (!mddev->sb->size || (mddev->sb->size > size))
2277                        mddev->sb->size = size;
2278        }
2279
2280        /*
2281         * sync all other superblocks with the main superblock
2282         */
2283        sync_sbs(mddev);
2284
2285        return 0;
2286}
2287#undef SET_SB
2288
2289static int hot_generate_error(mddev_t * mddev, kdev_t dev)
2290{
2291        struct request_queue *q;
2292        mdk_rdev_t *rdev;
2293        mdp_disk_t *disk;
2294
2295        if (!mddev->pers)
2296                return -ENODEV;
2297
2298        printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
2299                partition_name(dev), mdidx(mddev));
2300
2301        rdev = find_rdev(mddev, dev);
2302        if (!rdev) {
2303                MD_BUG();
2304                return -ENXIO;
2305        }
2306
2307        if (rdev->desc_nr == -1) {
2308                MD_BUG();
2309                return -EINVAL;
2310        }
2311        disk = &mddev->sb->disks[rdev->desc_nr];
2312        if (!disk_active(disk))
2313                return -ENODEV;
2314
2315        q = blk_get_queue(rdev->dev);
2316        if (!q) {
2317                MD_BUG();
2318                return -ENODEV;
2319        }
2320        printk(KERN_INFO "md: okay, generating error!\n");
2321//      q->oneshot_error = 1; // disabled for now
2322
2323        return 0;
2324}
2325
2326static int hot_remove_disk(mddev_t * mddev, kdev_t dev)
2327{
2328        int err;
2329        mdk_rdev_t *rdev;
2330        mdp_disk_t *disk;
2331
2332        if (!mddev->pers)
2333                return -ENODEV;
2334
2335        printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
2336                partition_name(dev), mdidx(mddev));
2337
2338        if (!mddev->pers->diskop) {
2339                printk(KERN_WARNING "md%d: personality does not support diskops!\n",
2340                       mdidx(mddev));
2341                return -EINVAL;
2342        }
2343
2344        rdev = find_rdev(mddev, dev);
2345        if (!rdev)
2346                return -ENXIO;
2347
2348        if (rdev->desc_nr == -1) {
2349                MD_BUG();
2350                return -EINVAL;
2351        }
2352        disk = &mddev->sb->disks[rdev->desc_nr];
2353        if (disk_active(disk))
2354                goto busy;
2355
2356        if (disk_removed(disk))
2357                return -EINVAL;
2358
2359        err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
2360        if (err == -EBUSY)
2361                goto busy;
2362
2363        if (err) {
2364                MD_BUG();
2365                return -EINVAL;
2366        }
2367
2368        remove_descriptor(disk, mddev->sb);
2369        kick_rdev_from_array(rdev);
2370        mddev->sb_dirty = 1;
2371        md_update_sb(mddev);
2372
2373        return 0;
2374busy:
2375        printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n",
2376                partition_name(dev), mdidx(mddev));
2377        return -EBUSY;
2378}
2379
2380static int hot_add_disk(mddev_t * mddev, kdev_t dev)
2381{
2382        int i, err, persistent;
2383        unsigned int size;
2384        mdk_rdev_t *rdev;
2385        mdp_disk_t *disk;
2386
2387        if (!mddev->pers)
2388                return -ENODEV;
2389
2390        printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n",
2391                partition_name(dev), mdidx(mddev));
2392
2393        if (!mddev->pers->diskop) {
2394                printk(KERN_WARNING "md%d: personality does not support diskops!\n",
2395                       mdidx(mddev));
2396                return -EINVAL;
2397        }
2398
2399        persistent = !mddev->sb->not_persistent;
2400
2401        rdev = find_rdev(mddev, dev);
2402        if (rdev)
2403                return -EBUSY;
2404
2405        err = md_import_device (dev, 0);
2406        if (err) {
2407                printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
2408                return -EINVAL;
2409        }
2410        rdev = find_rdev_all(dev);
2411        if (!rdev) {
2412                MD_BUG();
2413                return -EINVAL;
2414        }
2415        if (rdev->faulty) {
2416                printk(KERN_WARNING "md: can not hot-add faulty %s disk to md%d!\n",
2417                                partition_name(dev), mdidx(mddev));
2418                err = -EINVAL;
2419                goto abort_export;
2420        }
2421        size = calc_dev_size(dev, mddev, persistent);
2422
2423        if (size < mddev->sb->size) {
2424                printk(KERN_WARNING "md%d: disk size %d blocks < array size %d\n",
2425                                mdidx(mddev), size, mddev->sb->size);
2426                err = -ENOSPC;
2427                goto abort_export;
2428        }
2429        bind_rdev_to_array(rdev, mddev);
2430
2431        /*
2432         * The rest should better be atomic, we can have disk failures
2433         * noticed in interrupt contexts ...
2434         */
2435        rdev->old_dev = dev;
2436        rdev->size = size;
2437        rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
2438
2439        disk = mddev->sb->disks + mddev->sb->raid_disks;
2440        for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
2441                disk = mddev->sb->disks + i;
2442
2443                if (!disk->major && !disk->minor)
2444                        break;
2445                if (disk_removed(disk))
2446                        break;
2447        }
2448        if (i == MD_SB_DISKS) {
2449                printk(KERN_WARNING "md%d: can not hot-add to full array!\n",
2450                       mdidx(mddev));
2451                err = -EBUSY;
2452                goto abort_unbind_export;
2453        }
2454
2455        if (disk_removed(disk)) {
2456                /*
2457                 * reuse slot
2458                 */
2459                if (disk->number != i) {
2460                        MD_BUG();
2461                        err = -EINVAL;
2462                        goto abort_unbind_export;
2463                }
2464        } else {
2465                disk->number = i;
2466        }
2467
2468        disk->raid_disk = disk->number;
2469        disk->major = MAJOR(dev);
2470        disk->minor = MINOR(dev);
2471
2472        if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
2473                MD_BUG();
2474                err = -EINVAL;
2475                goto abort_unbind_export;
2476        }
2477
2478        mark_disk_spare(disk);
2479        mddev->sb->nr_disks++;
2480        mddev->sb->spare_disks++;
2481        mddev->sb->working_disks++;
2482
2483        mddev->sb_dirty = 1;
2484        md_update_sb(mddev);
2485
2486        /*
2487         * Kick recovery, maybe this spare has to be added to the
2488         * array immediately.
2489         */
2490        md_recover_arrays();
2491
2492        return 0;
2493
2494abort_unbind_export:
2495        unbind_rdev_from_array(rdev);
2496
2497abort_export:
2498        export_rdev(rdev);
2499        return err;
2500}
2501
2502#define SET_SB(x) mddev->sb->x = info->x
2503static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
2504{
2505
2506        if (alloc_array_sb(mddev))
2507                return -ENOMEM;
2508
2509        mddev->sb->major_version = MD_MAJOR_VERSION;
2510        mddev->sb->minor_version = MD_MINOR_VERSION;
2511        mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
2512        mddev->sb->ctime = CURRENT_TIME;
2513
2514        SET_SB(level);
2515        SET_SB(size);
2516        SET_SB(nr_disks);
2517        SET_SB(raid_disks);
2518        SET_SB(md_minor);
2519        SET_SB(not_persistent);
2520
2521        SET_SB(state);
2522        SET_SB(active_disks);
2523        SET_SB(working_disks);
2524        SET_SB(failed_disks);
2525        SET_SB(spare_disks);
2526
2527        SET_SB(layout);
2528        SET_SB(chunk_size);
2529
2530        mddev->sb->md_magic = MD_SB_MAGIC;
2531
2532        /*
2533         * Generate a 128 bit UUID
2534         */
2535        get_random_bytes(&mddev->sb->set_uuid0, 4);
2536        get_random_bytes(&mddev->sb->set_uuid1, 4);
2537        get_random_bytes(&mddev->sb->set_uuid2, 4);
2538        get_random_bytes(&mddev->sb->set_uuid3, 4);
2539
2540        return 0;
2541}
2542#undef SET_SB
2543
2544static int set_disk_info(mddev_t * mddev, void * arg)
2545{
2546        printk(KERN_INFO "md: not yet");
2547        return -EINVAL;
2548}
2549
2550static int clear_array(mddev_t * mddev)
2551{
2552        printk(KERN_INFO "md: not yet");
2553        return -EINVAL;
2554}
2555
2556static int write_raid_info(mddev_t * mddev)
2557{
2558        printk(KERN_INFO "md: not yet");
2559        return -EINVAL;
2560}
2561
2562static int protect_array(mddev_t * mddev)
2563{
2564        printk(KERN_INFO "md: not yet");
2565        return -EINVAL;
2566}
2567
2568static int unprotect_array(mddev_t * mddev)
2569{
2570        printk(KERN_INFO "md: not yet");
2571        return -EINVAL;
2572}
2573
2574static int set_disk_faulty(mddev_t *mddev, kdev_t dev)
2575{
2576        int ret;
2577
2578        ret = md_error(mddev, dev);
2579        return ret;
2580}
2581
2582static int md_ioctl(struct inode *inode, struct file *file,
2583                        unsigned int cmd, unsigned long arg)
2584{
2585        unsigned int minor;
2586        int err = 0;
2587        struct hd_geometry *loc = (struct hd_geometry *) arg;
2588        mddev_t *mddev = NULL;
2589        kdev_t dev;
2590
2591        if (!md_capable_admin())
2592                return -EACCES;
2593
2594        dev = inode->i_rdev;
2595        minor = MINOR(dev);
2596        if (minor >= MAX_MD_DEVS) {
2597                MD_BUG();
2598                return -EINVAL;
2599        }
2600
2601        /*
2602         * Commands dealing with the RAID driver but not any
2603         * particular array:
2604         */
2605        switch (cmd)
2606        {
2607                case RAID_VERSION:
2608                        err = get_version((void *)arg);
2609                        goto done;
2610
2611                case PRINT_RAID_DEBUG:
2612                        err = 0;
2613                        md_print_devices();
2614                        goto done_unlock;
2615
2616#ifndef MODULE
2617                case RAID_AUTORUN:
2618                        err = 0;
2619                        autostart_arrays();
2620                        goto done;
2621#endif
2622
2623                case BLKGETSIZE:
2624                case BLKGETSIZE64:
2625                case BLKRAGET:
2626                case BLKRASET:
2627                case BLKFLSBUF:
2628                case BLKSSZGET:
2629                case BLKBSZGET:
2630                case BLKBSZSET:
2631                        err = blk_ioctl (dev, cmd, arg);
2632                        goto abort;
2633
2634                default:;
2635        }
2636
2637        /*
2638         * Commands creating/starting a new array:
2639         */
2640
2641        mddev = kdev_to_mddev(dev);
2642
2643        switch (cmd)
2644        {
2645                case SET_ARRAY_INFO:
2646                case START_ARRAY:
2647                        if (mddev) {
2648                                printk(KERN_WARNING "md: array md%d already exists!\n",
2649                                                                mdidx(mddev));
2650                                err = -EEXIST;
2651                                goto abort;
2652                        }
2653                default:;
2654        }
2655        switch (cmd)
2656        {
2657                case SET_ARRAY_INFO:
2658                        mddev = alloc_mddev(dev);
2659                        if (!mddev) {
2660                                err = -ENOMEM;
2661                                goto abort;
2662                        }
2663                        atomic_inc(&mddev->active);
2664
2665                        /*
2666                         * alloc_mddev() should possibly self-lock.
2667                         */
2668                        err = lock_mddev(mddev);
2669                        if (err) {
2670                                printk(KERN_WARNING "md: ioctl, reason %d, cmd %d\n",
2671                                       err, cmd);
2672                                goto abort;
2673                        }
2674
2675                        if (mddev->sb) {
2676                                printk(KERN_WARNING "md: array md%d already has a superblock!\n",
2677                                        mdidx(mddev));
2678                                err = -EBUSY;
2679                                goto abort_unlock;
2680                        }
2681                        if (arg) {
2682                                mdu_array_info_t info;
2683                                if (md_copy_from_user(&info, (void*)arg, sizeof(info))) {
2684                                        err = -EFAULT;
2685                                        goto abort_unlock;
2686                                }
2687                                err = set_array_info(mddev, &info);
2688                                if (err) {
2689                                        printk(KERN_WARNING "md: couldnt set array info. %d\n", err);
2690                                        goto abort_unlock;
2691                                }
2692                        }
2693                        goto done_unlock;
2694
2695                case START_ARRAY:
2696                        /*
2697                         * possibly make it lock the array ...
2698                         */
2699                        err = autostart_array((kdev_t)arg, dev);
2700                        if (err) {
2701                                printk(KERN_WARNING "md: autostart %s failed!\n",
2702                                        partition_name((kdev_t)arg));
2703                                goto abort;
2704                        }
2705                        goto done;
2706
2707                default:;
2708        }
2709
2710        /*
2711         * Commands querying/configuring an existing array:
2712         */
2713
2714        if (!mddev) {
2715                err = -ENODEV;
2716                goto abort;
2717        }
2718        err = lock_mddev(mddev);
2719        if (err) {
2720                printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
2721                goto abort;
2722        }
2723        /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
2724        if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
2725                err = -ENODEV;
2726                goto abort_unlock;
2727        }
2728
2729        /*
2730         * Commands even a read-only array can execute:
2731         */
2732        switch (cmd)
2733        {
2734                case GET_ARRAY_INFO:
2735                        err = get_array_info(mddev, (void *)arg);
2736                        goto done_unlock;
2737
2738                case GET_DISK_INFO:
2739                        err = get_disk_info(mddev, (void *)arg);
2740                        goto done_unlock;
2741
2742                case RESTART_ARRAY_RW:
2743                        err = restart_array(mddev);
2744                        goto done_unlock;
2745
2746                case STOP_ARRAY:
2747                        if (inode->i_bdev->bd_openers > 1)
2748                                err = -EBUSY;
2749                        else if (!(err = do_md_stop (mddev, 0)))
2750                                mddev = NULL;
2751                        goto done_unlock;
2752
2753                case STOP_ARRAY_RO:
2754                        if (inode->i_bdev->bd_openers > 1)
2755                                err = -EBUSY;
2756                        else 
2757                                err = do_md_stop (mddev, 1);
2758                        goto done_unlock;
2759
2760        /*
2761         * We have a problem here : there is no easy way to give a CHS
2762         * virtual geometry. We currently pretend that we have a 2 heads
2763         * 4 sectors (with a BIG number of cylinders...). This drives
2764         * dosfs just mad... ;-)
2765         */
2766                case HDIO_GETGEO:
2767                        if (!loc) {
2768                                err = -EINVAL;
2769                                goto abort_unlock;
2770                        }
2771                        err = md_put_user (2, (char *) &loc->heads);
2772                        if (err)
2773                                goto abort_unlock;
2774                        err = md_put_user (4, (char *) &loc->sectors);
2775                        if (err)
2776                                goto abort_unlock;
2777                        err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
2778                                                (short *) &loc->cylinders);
2779                        if (err)
2780                                goto abort_unlock;
2781                        err = md_put_user (md_hd_struct[minor].start_sect,
2782                                                (long *) &loc->start);
2783                        goto done_unlock;
2784        }
2785
2786        /*
2787         * The remaining ioctls are changing the state of the
2788         * superblock, so we do not allow read-only arrays
2789         * here:
2790         */
2791        if (mddev->ro) {
2792                err = -EROFS;
2793                goto abort_unlock;
2794        }
2795
2796        switch (cmd)
2797        {
2798                case CLEAR_ARRAY:
2799                        err = clear_array(mddev);
2800                        goto done_unlock;
2801
2802                case ADD_NEW_DISK:
2803                {
2804                        mdu_disk_info_t info;
2805                        if (md_copy_from_user(&info, (void*)arg, sizeof(info)))
2806                                err = -EFAULT;
2807                        else
2808                                err = add_new_disk(mddev, &info);
2809                        goto done_unlock;
2810                }
2811                case HOT_GENERATE_ERROR:
2812                        err = hot_generate_error(mddev, (kdev_t)arg);
2813                        goto done_unlock;
2814                case HOT_REMOVE_DISK:
2815                        err = hot_remove_disk(mddev, (kdev_t)arg);
2816                        goto done_unlock;
2817
2818                case HOT_ADD_DISK:
2819                        err = hot_add_disk(mddev, (kdev_t)arg);
2820                        goto done_unlock;
2821
2822                case SET_DISK_INFO:
2823                        err = set_disk_info(mddev, (void *)arg);
2824                        goto done_unlock;
2825
2826                case WRITE_RAID_INFO:
2827                        err = write_raid_info(mddev);
2828                        goto done_unlock;
2829
2830                case UNPROTECT_ARRAY:
2831                        err = unprotect_array(mddev);
2832                        goto done_unlock;
2833
2834                case PROTECT_ARRAY:
2835                        err = protect_array(mddev);
2836                        goto done_unlock;
2837
2838                case SET_DISK_FAULTY:
2839                        err = set_disk_faulty(mddev, (kdev_t)arg);
2840                        goto done_unlock;
2841
2842                case RUN_ARRAY:
2843                {
2844/* The data is never used....
2845                        mdu_param_t param;
2846                        err = md_copy_from_user(&param, (mdu_param_t *)arg,
2847                                                         sizeof(param));
2848                        if (err)
2849                                goto abort_unlock;
2850*/
2851                        err = do_md_run (mddev);
2852                        /*
2853                         * we have to clean up the mess if
2854                         * the array cannot be run for some
2855                         * reason ...
2856                         */
2857                        if (err) {
2858                                mddev->sb_dirty = 0;
2859                                if (!do_md_stop (mddev, 0))
2860                                        mddev = NULL;
2861                        }
2862                        goto done_unlock;
2863                }
2864
2865                default:
2866                        printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, "
2867                               "upgrade your software to use new ictls.\n",
2868                               current->comm, current->pid);
2869                        err = -EINVAL;
2870                        goto abort_unlock;
2871        }
2872
2873done_unlock:
2874abort_unlock:
2875        if (mddev)
2876                unlock_mddev(mddev);
2877
2878        return err;
2879done:
2880        if (err)
2881                MD_BUG();
2882abort:
2883        return err;
2884}
2885
2886static int md_open(struct inode *inode, struct file *file)
2887{
2888        /*
2889         * Always succeed, but increment the usage count
2890         */
2891        mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
2892        if (mddev)
2893                atomic_inc(&mddev->active);
2894        return (0);
2895}
2896
2897static int md_release(struct inode *inode, struct file * file)
2898{
2899        mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
2900        if (mddev)
2901                atomic_dec(&mddev->active);
2902        return 0;
2903}
2904
2905static struct block_device_operations md_fops=
2906{
2907        owner:          THIS_MODULE,
2908        open:           md_open,
2909        release:        md_release,
2910        ioctl:          md_ioctl,
2911};
2912
2913
2914int md_thread(void * arg)
2915{
2916        mdk_thread_t *thread = arg;
2917
2918        md_lock_kernel();
2919
2920        /*
2921         * Detach thread
2922         */
2923
2924        daemonize();
2925
2926        sprintf(current->comm, thread->name);
2927        md_init_signals();
2928        md_flush_signals();
2929        thread->tsk = current;
2930
2931        /*
2932         * md_thread is a 'system-thread', it's priority should be very
2933         * high. We avoid resource deadlocks individually in each
2934         * raid personality. (RAID5 does preallocation) We also use RR and
2935         * the very same RT priority as kswapd, thus we will never get
2936         * into a priority inversion deadlock.
2937         *
2938         * we definitely have to have equal or higher priority than
2939         * bdflush, otherwise bdflush will deadlock if there are too
2940         * many dirty RAID5 blocks.
2941         */
2942        current->policy = SCHED_OTHER;
2943        current->nice = -20;
2944        md_unlock_kernel();
2945
2946        complete(thread->event);
2947        while (thread->run) {
2948                void (*run)(void *data);
2949                DECLARE_WAITQUEUE(wait, current);
2950
2951                add_wait_queue(&thread->wqueue, &wait);
2952                set_task_state(current, TASK_INTERRUPTIBLE);
2953                if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
2954                        dprintk("md: thread %p went to sleep.\n", thread);
2955                        schedule();
2956                        dprintk("md: thread %p woke up.\n", thread);
2957                }
2958                current->state = TASK_RUNNING;
2959                remove_wait_queue(&thread->wqueue, &wait);
2960                clear_bit(THREAD_WAKEUP, &thread->flags);
2961
2962                run = thread->run;
2963                if (run) {
2964                        run(thread->data);
2965                        run_task_queue(&tq_disk);
2966                }
2967                if (md_signal_pending(current))
2968                        md_flush_signals();
2969        }
2970        complete(thread->event);
2971        return 0;
2972}
2973
2974void md_wakeup_thread(mdk_thread_t *thread)
2975{
2976        dprintk("md: waking up MD thread %p.\n", thread);
2977        set_bit(THREAD_WAKEUP, &thread->flags);
2978        wake_up(&thread->wqueue);
2979}
2980
2981mdk_thread_t *md_register_thread(void (*run) (void *),
2982                                                void *data, const char *name)
2983{
2984        mdk_thread_t *thread;
2985        int ret;
2986        struct completion event;
2987
2988        thread = (mdk_thread_t *) kmalloc
2989                                (sizeof(mdk_thread_t), GFP_KERNEL);
2990        if (!thread)
2991                return NULL;
2992
2993        memset(thread, 0, sizeof(mdk_thread_t));
2994        md_init_waitqueue_head(&thread->wqueue);
2995
2996        init_completion(&event);
2997        thread->event = &event;
2998        thread->run = run;
2999        thread->data = data;
3000        thread->name = name;
3001        ret = kernel_thread(md_thread, thread, 0);
3002        if (ret < 0) {
3003                kfree(thread);
3004                return NULL;
3005        }
3006        wait_for_completion(&event);
3007        return thread;
3008}
3009
3010void md_interrupt_thread(mdk_thread_t *thread)
3011{
3012        if (!thread->tsk) {
3013                MD_BUG();
3014                return;
3015        }
3016        dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
3017        send_sig(SIGKILL, thread->tsk, 1);
3018}
3019
3020void md_unregister_thread(mdk_thread_t *thread)
3021{
3022        struct completion event;
3023
3024        init_completion(&event);
3025
3026        thread->event = &event;
3027        thread->run = NULL;
3028        thread->name = NULL;
3029        md_interrupt_thread(thread);
3030        wait_for_completion(&event);
3031        kfree(thread);
3032}
3033
3034void md_recover_arrays(void)
3035{
3036        if (!md_recovery_thread) {
3037                MD_BUG();
3038                return;
3039        }
3040        md_wakeup_thread(md_recovery_thread);
3041}
3042
3043
3044int md_error(mddev_t *mddev, kdev_t rdev)
3045{
3046        mdk_rdev_t * rrdev;
3047
3048        dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
3049                MAJOR(dev),MINOR(dev),MAJOR(rdev),MINOR(rdev),
3050                __builtin_return_address(0),__builtin_return_address(1),
3051                __builtin_return_address(2),__builtin_return_address(3));
3052
3053        if (!mddev) {
3054                MD_BUG();
3055                return 0;
3056        }
3057        rrdev = find_rdev(mddev, rdev);
3058        if (!rrdev || rrdev->faulty)
3059                return 0;
3060        if (!mddev->pers->error_handler
3061                        || mddev->pers->error_handler(mddev,rdev) <= 0) {
3062                rrdev->faulty = 1;
3063        } else
3064                return 1;
3065        /*
3066         * if recovery was running, stop it now.
3067         */
3068        if (mddev->pers->stop_resync)
3069                mddev->pers->stop_resync(mddev);
3070        if (mddev->recovery_running)
3071                md_interrupt_thread(md_recovery_thread);
3072        md_recover_arrays();
3073
3074        return 0;
3075}
3076
3077static void status_unused(struct seq_file *seq)
3078{
3079        int i = 0;
3080        mdk_rdev_t *rdev;
3081        struct md_list_head *tmp;
3082
3083        seq_printf(seq, "unused devices: ");
3084
3085        ITERATE_RDEV_ALL(rdev,tmp) {
3086                if (!rdev->same_set.next && !rdev->same_set.prev) {
3087                        /*
3088                         * The device is not yet used by any array.
3089                         */
3090                        i++;
3091                        seq_printf(seq, "%s ",
3092                                partition_name(rdev->dev));
3093                }
3094        }
3095        if (!i)
3096                seq_printf(seq, "<none>");
3097
3098        seq_printf(seq, "\n");
3099}
3100
3101
3102static void status_resync(struct seq_file *seq, mddev_t * mddev)
3103{
3104        unsigned long max_blocks, resync, res, dt, db, rt;
3105
3106        resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
3107        max_blocks = mddev->sb->size;
3108
3109        /*
3110         * Should not happen.
3111         */
3112        if (!max_blocks)
3113                MD_BUG();
3114
3115        res = (resync/1024)*1000/(max_blocks/1024 + 1);
3116        {
3117                int i, x = res/50, y = 20-x;
3118                seq_printf(seq, "[");
3119                for (i = 0; i < x; i++)
3120                        seq_printf(seq, "=");
3121                seq_printf(seq, ">");
3122                for (i = 0; i < y; i++)
3123                        seq_printf(seq, ".");
3124                seq_printf(seq, "] ");
3125        }
3126        if (!mddev->recovery_running)
3127                /*
3128                 * true resync
3129                 */
3130                seq_printf(seq, " resync =%3lu.%lu%% (%lu/%lu)",
3131                                res/10, res % 10, resync, max_blocks);
3132        else
3133                /*
3134                 * recovery ...
3135                 */
3136                seq_printf(seq, " recovery =%3lu.%lu%% (%lu/%lu)",
3137                                res/10, res % 10, resync, max_blocks);
3138
3139        /*
3140         * We do not want to overflow, so the order of operands and
3141         * the * 100 / 100 trick are important. We do a +1 to be
3142         * safe against division by zero. We only estimate anyway.
3143         *
3144         * dt: time from mark until now
3145         * db: blocks written from mark until now
3146         * rt: remaining time
3147         */
3148        dt = ((jiffies - mddev->resync_mark) / HZ);
3149        if (!dt) dt++;
3150        db = resync - (mddev->resync_mark_cnt/2);
3151        rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
3152
3153        seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
3154
3155        seq_printf(seq, " speed=%ldK/sec", db/dt);
3156
3157}
3158
3159
3160static void *md_seq_start(struct seq_file *seq, loff_t *pos)
3161{
3162        struct list_head *tmp;
3163        loff_t l = *pos;
3164        mddev_t *mddev;
3165
3166        if (l >= 0x10000)
3167                return NULL;
3168        if (!l--)
3169                /* header */
3170                return (void*)1;
3171
3172        list_for_each(tmp,&all_mddevs)
3173                if (!l--) {
3174                        mddev = list_entry(tmp, mddev_t, all_mddevs);
3175                        return mddev;
3176                }
3177        if (!l--)       
3178                return (void*)2;/* tail */
3179        return NULL;
3180}
3181
3182static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3183{
3184        struct list_head *tmp;
3185        mddev_t *next_mddev, *mddev = v;
3186        
3187        ++*pos;
3188        if (v == (void*)2)
3189                return NULL;
3190
3191        if (v == (void*)1)
3192                tmp = all_mddevs.next;
3193        else
3194                tmp = mddev->all_mddevs.next;
3195        if (tmp != &all_mddevs)
3196                next_mddev = list_entry(tmp,mddev_t,all_mddevs);
3197        else {
3198                next_mddev = (void*)2;
3199                *pos = 0x10000;
3200        }               
3201
3202        return next_mddev;
3203
3204}
3205
/*
 * seq_file stop callback: intentionally empty, this function performs
 * no cleanup.
 */
static void md_seq_stop(struct seq_file *seq, void *v)
{
}
3210
3211static int md_seq_show(struct seq_file *seq, void *v)
3212{
3213        int j, size;
3214        struct md_list_head *tmp2;
3215        mdk_rdev_t *rdev;
3216        mddev_t *mddev = v;
3217
3218        if (v == (void*)1) {
3219                seq_printf(seq, "Personalities : ");
3220                for (j = 0; j < MAX_PERSONALITY; j++)
3221                        if (pers[j])
3222                                seq_printf(seq, "[%s] ", pers[j]->name);
3223
3224                seq_printf(seq, "\n");
3225                seq_printf(seq, "read_ahead ");
3226                if (read_ahead[MD_MAJOR] == INT_MAX)
3227                        seq_printf(seq, "not set\n");
3228                else
3229                        seq_printf(seq, "%d sectors\n", read_ahead[MD_MAJOR]);
3230                return 0;
3231        }
3232        if (v == (void*)2) {
3233                status_unused(seq);
3234                return 0;
3235        }
3236
3237        seq_printf(seq, "md%d : %sactive", mdidx(mddev),
3238                   mddev->pers ? "" : "in");
3239        if (mddev->pers) {
3240                if (mddev->ro)
3241                        seq_printf(seq, " (read-only)");
3242                seq_printf(seq, " %s", mddev->pers->name);
3243        }
3244        
3245        size = 0;
3246        ITERATE_RDEV(mddev,rdev,tmp2) {
3247                seq_printf(seq, " %s[%d]",
3248                           partition_name(rdev->dev), rdev->desc_nr);
3249                if (rdev->faulty) {
3250                        seq_printf(seq, "(F)");
3251                        continue;
3252                }
3253                size += rdev->size;
3254        }
3255
3256        if (mddev->nb_dev) {
3257                if (mddev->pers)
3258                        seq_printf(seq, "\n      %d blocks",
3259                                   md_size[mdidx(mddev)]);
3260                else
3261                        seq_printf(seq, "\n      %d blocks", size);
3262        }
3263
3264        if (mddev->pers) {
3265
3266                mddev->pers->status (seq, mddev);
3267
3268                seq_printf(seq, "\n      ");
3269                if (mddev->curr_resync) {
3270                        status_resync (seq, mddev);
3271                } else {
3272                        if (sem_getcount(&mddev->resync_sem) != 1)
3273                                seq_printf(seq, "       resync=DELAYED");
3274                }
3275        }
3276        seq_printf(seq, "\n");
3277
3278        return 0;
3279}
3280
3281  
3282static struct seq_operations md_seq_ops = {
3283        .start  = md_seq_start,
3284        .next   = md_seq_next,
3285        .stop   = md_seq_stop,
3286        .show   = md_seq_show,
3287};
3288
3289static int md_seq_open(struct inode *inode, struct file *file)
3290{
3291        int error;
3292
3293        error = seq_open(file, &md_seq_ops);
3294        return error;
3295}
3296
3297static struct file_operations md_seq_fops = {
3298        .open           = md_seq_open,
3299        .read           = seq_read,
3300        .llseek         = seq_lseek,
3301        .release        = seq_release,
3302};
3303
3304
3305int register_md_personality(int pnum, mdk_personality_t *p)
3306{
3307        if (pnum >= MAX_PERSONALITY) {
3308                MD_BUG();
3309                return -EINVAL;
3310        }
3311
3312        if (pers[pnum]) {
3313                MD_BUG();
3314                return -EBUSY;
3315        }
3316
3317        pers[pnum] = p;
3318        printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
3319        return 0;
3320}
3321
3322int unregister_md_personality(int pnum)
3323{
3324        if (pnum >= MAX_PERSONALITY) {
3325                MD_BUG();
3326                return -EINVAL;
3327        }
3328
3329        printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
3330        pers[pnum] = NULL;
3331        return 0;
3332}
3333
3334mdp_disk_t *get_spare(mddev_t *mddev)
3335{
3336        mdp_super_t *sb = mddev->sb;
3337        mdp_disk_t *disk;
3338        mdk_rdev_t *rdev;
3339        struct md_list_head *tmp;
3340
3341        ITERATE_RDEV(mddev,rdev,tmp) {
3342                if (rdev->faulty)
3343                        continue;
3344                if (!rdev->sb) {
3345                        MD_BUG();
3346                        continue;
3347                }
3348                disk = &sb->disks[rdev->desc_nr];
3349                if (disk_faulty(disk)) {
3350                        MD_BUG();
3351                        continue;
3352                }
3353                if (disk_active(disk))
3354                        continue;
3355                return disk;
3356        }
3357        return NULL;
3358}
3359
3360static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
3361void md_sync_acct(kdev_t dev, unsigned long nr_sectors)
3362{
3363        unsigned int major = MAJOR(dev);
3364        unsigned int index;
3365
3366        index = disk_index(dev);
3367        if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
3368                return;
3369
3370        sync_io[major][index] += nr_sectors;
3371}
3372
3373static int is_mddev_idle(mddev_t *mddev)
3374{
3375        mdk_rdev_t * rdev;
3376        struct md_list_head *tmp;
3377        int idle;
3378        unsigned long curr_events;
3379
3380        idle = 1;
3381        ITERATE_RDEV(mddev,rdev,tmp) {
3382                int major = MAJOR(rdev->dev);
3383                int idx = disk_index(rdev->dev);
3384
3385                if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
3386                        continue;
3387
3388                curr_events = kstat.dk_drive_rblk[major][idx] +
3389                                                kstat.dk_drive_wblk[major][idx] ;
3390                curr_events -= sync_io[major][idx];
3391                if ((curr_events - rdev->last_events) > 32) {
3392                        rdev->last_events = curr_events;
3393                        idle = 0;
3394                }
3395        }
3396        return idle;
3397}
3398
3399MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait);
3400
3401void md_done_sync(mddev_t *mddev, int blocks, int ok)
3402{
3403        /* another "blocks" (512byte) blocks have been synced */
3404        atomic_sub(blocks, &mddev->recovery_active);
3405        wake_up(&mddev->recovery_wait);
3406        if (!ok) {
3407                // stop recovery, signal do_sync ....
3408                if (mddev->pers->stop_resync)
3409                        mddev->pers->stop_resync(mddev);
3410                if (mddev->recovery_running)
3411                        md_interrupt_thread(md_recovery_thread);
3412        }
3413}
3414
/* number of (time, position) samples kept for the speed estimate */
#define SYNC_MARKS      10
/* minimum distance in jiffies between two samples */
#define SYNC_MARK_STEP  (3*HZ)

/*
 * md_do_sync - resynchronise an array from the calling thread
 * @mddev: the array to resync
 * @spare: not referenced anywhere in this function body
 *
 * Takes mddev->resync_sem, waits until no other array that shares a
 * physical unit with this one is resyncing, then walks the array in
 * chunks handed out by the personality's sync_request(), throttling
 * itself between sysctl_speed_limit_min and sysctl_speed_limit_max.
 *
 * Returns 0 when the whole array has been processed, -EINTR when a
 * signal arrived, or the non-zero value returned by
 * down_interruptible() or the negative value returned by
 * sync_request().
 *
 * NOTE(review): 'err' is declared unsigned int but carries negative
 * error codes; this relies on the conversion back to int at return.
 */
int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
{
        mddev_t *mddev2;
        unsigned int max_sectors, currspeed,
                j, window, err, serialize;
        unsigned long mark[SYNC_MARKS];
        unsigned long mark_cnt[SYNC_MARKS];
        int last_mark,m;
        struct md_list_head *tmp;
        unsigned long last_check;


        err = down_interruptible(&mddev->resync_sem);
        if (err)
                goto out_nolock;

recheck:
        /*
         * Only one of several arrays that share a physical unit may
         * resync at a time: look for such an array that is currently
         * resyncing (curr_resync != 0).
         */
        serialize = 0;
        ITERATE_MDDEV(mddev2,tmp) {
                if (mddev2 == mddev)
                        continue;
                if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
                        printk(KERN_INFO "md: delaying resync of md%d until md%d "
                               "has finished resync (they share one or more physical units)\n",
                               mdidx(mddev), mdidx(mddev2));
                        serialize = 1;
                        break;
                }
        }
        if (serialize) {
                /* woken by the wake_up(&resync_wait) at the end of this function */
                interruptible_sleep_on(&resync_wait);
                if (md_signal_pending(current)) {
                        md_flush_signals();
                        err = -EINTR;
                        /*
                         * NOTE(review): this jumps to the wait on
                         * recovery_wait below before this call has run
                         * init_waitqueue_head()/atomic_set() on it --
                         * confirm both are initialised elsewhere.
                         */
                        goto out;
                }
                goto recheck;
        }

        /* non-zero curr_resync marks this array as resyncing from here on */
        mddev->curr_resync = 1;

        /* presumably sb->size is in 1K blocks, hence twice as many sectors -- TODO confirm */
        max_sectors = mddev->sb->size<<1;

        printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
        printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
                                                sysctl_speed_limit_min);
        printk(KERN_INFO "md: using maximum available idle IO bandwith "
               "(but not more than %d KB/sec) for reconstruction.\n",
               sysctl_speed_limit_max);

        /*
         * Resync has low priority.
         */
        current->nice = 19;

        is_mddev_idle(mddev); /* this also initializes IO event counters */
        /* start every sample at "now, nothing synced yet" */
        for (m = 0; m < SYNC_MARKS; m++) {
                mark[m] = jiffies;
                mark_cnt[m] = 0;
        }
        last_mark = 0;
        mddev->resync_mark = mark[last_mark];
        mddev->resync_mark_cnt = mark_cnt[last_mark];

        /*
         * Tune reconstruction:
         */
        /* the throttling checks below run once per 'window' sectors */
        window = vm_max_readahead*(PAGE_SIZE/512);
        printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
               window/2,max_sectors/2);

        atomic_set(&mddev->recovery_active, 0);
        init_waitqueue_head(&mddev->recovery_wait);
        last_check = 0;
        for (j = 0; j < max_sectors;) {
                int sectors;

                /* the personality reports how many sectors it queued, or a negative error */
                sectors = mddev->pers->sync_request(mddev, j);

                if (sectors < 0) {
                        err = sectors;
                        goto out;
                }
                /* md_done_sync() subtracts these again as the I/O completes */
                atomic_add(sectors, &mddev->recovery_active);
                j += sectors;
                mddev->curr_resync = j;

                /* skip the bookkeeping until another window has been queued */
                if (last_check + window > j)
                        continue;

                last_check = j;

                run_task_queue(&tq_disk);

        repeat:
                /*
                 * NOTE(review): plain >= on jiffies is not safe across a
                 * jiffies wrap-around.
                 */
                if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
                        /* step marks */
                        int next = (last_mark+1) % SYNC_MARKS;

                        /*
                         * Publish the oldest sample as the reference for
                         * speed estimates, then overwrite it with the
                         * current time and completed position.
                         */
                        mddev->resync_mark = mark[next];
                        mddev->resync_mark_cnt = mark_cnt[next];
                        mark[next] = jiffies;
                        mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
                        last_mark = next;
                }


                if (md_signal_pending(current)) {
                        /*
                         * got a signal, exit.
                         */
                        mddev->curr_resync = 0;
                        printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n");
                        md_flush_signals();
                        err = -EINTR;
                        goto out;
                }

                /*
                 * We only leave this throttling section when we are no
                 * faster than the guaranteed minimum speed, or when we
                 * are within the maximum speed and the member disks are
                 * otherwise idle.
                 * the system might be non-idle CPU-wise, but we only care
                 * about not overloading the IO subsystem. (things like an
                 * e2fsck being done on the RAID array should execute fast)
                 */
                if (md_need_resched(current))
                        schedule();

                /* sectors since the reference mark, halved, per elapsed second; +1 avoids 0 */
                currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;

                if (currspeed > sysctl_speed_limit_min) {
                        current->nice = 19;

                        if ((currspeed > sysctl_speed_limit_max) ||
                                        !is_mddev_idle(mddev)) {
                                /* back off for a quarter of a second, then re-evaluate */
                                current->state = TASK_INTERRUPTIBLE;
                                md_schedule_timeout(HZ/4);
                                goto repeat;
                        }
                } else
                        /* below the guaranteed minimum: run at top priority */
                        current->nice = -20;
        }
        printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
        err = 0;
        /*
         * this also signals 'finished resyncing' to md_stop
         */
out:
        /* wait until all queued resync I/O has been reported to md_done_sync() */
        wait_disk_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
        up(&mddev->resync_sem);
out_nolock:
        mddev->curr_resync = 0;
        /* let arrays that were delayed at 'recheck' try again */
        wake_up(&resync_wait);
        return err;
}
3573
3574
/*
 * This is the handler of the kernel thread which syncs a spare disk
 * with the active array.
 *
 * It scans all arrays and, for every array that has a superblock, is
 * not already recovering and has fewer active disks than raid disks,
 * picks a spare with get_spare(), asks the personality to prepare it
 * (DISKOP_SPARE_WRITE) and resyncs it with md_do_sync(). After a spare
 * has been processed the superblock is written out and the scan starts
 * over from the first array.
 *
 * 'data' is not used.
 *
 * the amount of foolproofing might seem to be a tad excessive, but an
 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
 * of my root partition with the first 0.5 gigs of my /home partition ... so
 * i'm a bit nervous ;)
 */
void md_do_recovery(void *data)
{
        int err;
        mddev_t *mddev;
        mdp_super_t *sb;
        mdp_disk_t *spare;
        struct md_list_head *tmp;

        printk(KERN_INFO "md: recovery thread got woken up ...\n");
restart:
        ITERATE_MDDEV(mddev,tmp) {
                sb = mddev->sb;
                /* nothing to do without a superblock */
                if (!sb)
                        continue;
                /* this array is already being recovered */
                if (mddev->recovery_running)
                        continue;
                /* all raid disks are active: the array is not degraded */
                if (sb->active_disks == sb->raid_disks)
                        continue;
                if (mddev->sb_dirty)
                        md_update_sb(mddev);
                if (!sb->spare_disks) {
                        printk(KERN_ERR "md%d: no spare disk to reconstruct array! "
                               "-- continuing in degraded mode\n", mdidx(mddev));
                        continue;
                }
                /*
                 * now here we get the spare and resync it.
                 */
                spare = get_spare(mddev);
                if (!spare)
                        continue;
                printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n",
                       mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
                /* personalities without a diskop hook cannot take a spare */
                if (!mddev->pers->diskop)
                        continue;
                /* a non-zero result means the personality refused the spare */
                if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
                        continue;
                /* recovery_sem is held for the whole duration of the resync */
                down(&mddev->recovery_sem);
                mddev->recovery_running = 1;
                err = md_do_sync(mddev, spare);
                if (err == -EIO) {
                        printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n",
                               mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
                        if (!disk_faulty(spare)) {
                                /* retire the spare and move it to the failed count */
                                mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
                                mark_disk_faulty(spare);
                                mark_disk_nonsync(spare);
                                mark_disk_inactive(spare);
                                sb->spare_disks--;
                                sb->working_disks--;
                                sb->failed_disks++;
                        }
                } else
                        /* the spare was marked faulty while it was being synced */
                        if (disk_faulty(spare))
                                mddev->pers->diskop(mddev, &spare,
                                                DISKOP_SPARE_INACTIVE);
                if (err == -EINTR || err == -ENOMEM) {
                        /*
                         * Recovery got interrupted, or ran out of mem ...
                         * signal back that we have finished using the array.
                         */
                        mddev->pers->diskop(mddev, &spare,
                                                         DISKOP_SPARE_INACTIVE);
                        up(&mddev->recovery_sem);
                        mddev->recovery_running = 0;
                        /* move on to the next array without touching the superblock */
                        continue;
                } else {
                        mddev->recovery_running = 0;
                        up(&mddev->recovery_sem);
                }
                if (!disk_faulty(spare)) {
                        /*
                         * the SPARE_ACTIVE diskop possibly changes the
                         * pointer too
                         */
                        mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
                        mark_disk_sync(spare);
                        mark_disk_active(spare);
                        sb->active_disks++;
                        sb->spare_disks--;
                }
                /* record the new disk states on disk, then rescan from the first array */
                mddev->sb_dirty = 1;
                md_update_sb(mddev);
                goto restart;
        }
        printk(KERN_INFO "md: recovery thread finished ...\n");

}
3671
3672int md_notify_reboot(struct notifier_block *this,
3673                                        unsigned long code, void *x)
3674{
3675        struct md_list_head *tmp;
3676        mddev_t *mddev;
3677
3678        if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
3679                                        || (code == MD_SYS_POWER_OFF)) {
3680
3681                printk(KERN_INFO "md: stopping all md devices.\n");
3682
3683                ITERATE_MDDEV(mddev,tmp)
3684                        do_md_stop (mddev, 1);
3685                /*
3686                 * certain more exotic SCSI devices are known to be
3687                 * volatile wrt too early system reboots. While the
3688                 * right place to handle this issue is the given
3689                 * driver, we do want to have a safe RAID driver ...
3690                 */
3691                md_mdelay(1000*1);
3692        }
3693        return NOTIFY_DONE;
3694}
3695
/*
 * Notifier block that routes reboot notifications to md_notify_reboot().
 * It is handed to md_register_reboot_notifier() in md_init().
 */
struct notifier_block md_notifier = {
        notifier_call:  md_notify_reboot,
        next:           NULL,
        priority:       INT_MAX, /* before any real devices */
};
3701
3702static void md_geninit(void)
3703{
3704        struct proc_dir_entry *p;
3705        int i;
3706
3707        for(i = 0; i < MAX_MD_DEVS; i++) {
3708                md_blocksizes[i] = 1024;
3709                md_size[i] = 0;
3710                md_hardsect_sizes[i] = 512;
3711                md_maxreadahead[i] = MD_READAHEAD;
3712        }
3713        blksize_size[MAJOR_NR] = md_blocksizes;
3714        blk_size[MAJOR_NR] = md_size;
3715        max_readahead[MAJOR_NR] = md_maxreadahead;
3716        hardsect_size[MAJOR_NR] = md_hardsect_sizes;
3717
3718        dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
3719
3720#ifdef CONFIG_PROC_FS
3721        p = create_proc_entry("mdstat", S_IRUGO, NULL);
3722        if (p)
3723                p->proc_fops = &md_seq_fops;
3724#endif
3725}
3726
3727int md__init md_init(void)
3728{
3729        static char * name = "mdrecoveryd";
3730        int minor;
3731
3732        printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n",
3733                        MD_MAJOR_VERSION, MD_MINOR_VERSION,
3734                        MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
3735
3736        if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops))
3737        {
3738                printk(KERN_ALERT "md: Unable to get major %d for md\n", MAJOR_NR);
3739                return (-1);
3740        }
3741        devfs_handle = devfs_mk_dir (NULL, "md", NULL);
3742        /* we don't use devfs_register_series because we want to fill md_hd_struct */
3743        for (minor=0; minor < MAX_MD_DEVS; ++minor) {
3744                char devname[128];
3745                sprintf (devname, "%u", minor);
3746                md_hd_struct[minor].de = devfs_register (devfs_handle,
3747                        devname, DEVFS_FL_DEFAULT, MAJOR_NR, minor,
3748                        S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
3749        }
3750
3751        /* forward all md request to md_make_request */
3752        blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_make_request);
3753
3754
3755        read_ahead[MAJOR_NR] = INT_MAX;
3756
3757        add_gendisk(&md_gendisk);
3758
3759        md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
3760        if (!md_recovery_thread)
3761                printk(KERN_ALERT "md: bug: couldn't allocate md_recovery_thread\n");
3762
3763        md_register_reboot_notifier(&md_notifier);
3764        raid_table_header = register_sysctl_table(raid_root_table, 1);
3765
3766        md_geninit();
3767        return (0);
3768}
3769
3770
3771#ifndef MODULE
3772
/*
 * When md (and any required personalities) is compiled into the kernel
 * (not as a module), arrays can be assembled at boot time using AUTODETECT,
 * where specially marked partitions are registered with md_autodetect_dev(),
 * and with MD_BOOT, where the devices to be collected are given on the boot
 * line with md=.....
 * The code for that is here.
 */
3781
/*
 * State collected from the "raid=" boot parameter by raid_setup().
 * NOTE(review): this object has external linkage while md_setup_args is
 * static -- confirm whether anything outside this file needs it.
 */
struct {
        int set;                /* set to 1 once raid_setup() has run */
        int noautodetect;       /* non-zero: md_run_setup() skips autostart_arrays() */
} raid_setup_args md__initdata;
3786
3787/*
3788 * Searches all registered partitions for autorun RAID arrays
3789 * at boot time.
3790 */
3791static kdev_t detected_devices[128];
3792static int dev_cnt;
3793
3794void md_autodetect_dev(kdev_t dev)
3795{
3796        if (dev_cnt >= 0 && dev_cnt < 127)
3797                detected_devices[dev_cnt++] = dev;
3798}
3799
3800
3801static void autostart_arrays(void)
3802{
3803        mdk_rdev_t *rdev;
3804        int i;
3805
3806        printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
3807
3808        for (i = 0; i < dev_cnt; i++) {
3809                kdev_t dev = detected_devices[i];
3810
3811                if (md_import_device(dev,1)) {
3812                        printk(KERN_ALERT "md: could not import %s!\n",
3813                                partition_name(dev));
3814                        continue;
3815                }
3816                /*
3817                 * Sanity checks:
3818                 */
3819                rdev = find_rdev_all(dev);
3820                if (!rdev) {
3821                        MD_BUG();
3822                        continue;
3823                }
3824                if (rdev->faulty) {
3825                        MD_BUG();
3826                        continue;
3827                }
3828                md_list_add(&rdev->pending, &pending_raid_disks);
3829        }
3830        dev_cnt = 0;
3831
3832        autorun_devices(-1);
3833}
3834
/*
 * Per-minor settings recorded from "md=" boot parameters by md_setup()
 * and consumed by md_setup_drive().
 */
static struct {
        char device_set [MAX_MD_DEVS];  /* set to 1 once a device of the list has been resolved */
        int pers[MAX_MD_DEVS];          /* personality; 0 means "read it from the superblock" */
        int chunk[MAX_MD_DEVS];         /* chunk size, used for the non-persistent case */
        char *device_names[MAX_MD_DEVS];/* comma separated device list, NULL if unset */
} md_setup_args md__initdata;
3841
3842/*
3843 * Parse the command-line parameters given our kernel, but do not
3844 * actually try to invoke the MD device now; that is handled by
3845 * md_setup_drive after the low-level disk drivers have initialised.
3846 *
3847 * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
3848 *             assigns the task of parsing integer arguments to the
3849 *             invoked program now).  Added ability to initialise all
3850 *             the MD devices (by specifying multiple "md=" lines)
3851 *             instead of just one.  -- KTK
3852 * 18May2000: Added support for persistant-superblock arrays:
3853 *             md=n,0,factor,fault,device-list   uses RAID0 for device n
3854 *             md=n,-1,factor,fault,device-list  uses LINEAR for device n
3855 *             md=n,device-list      reads a RAID superblock from the devices
3856 *             elements in device-list are read by name_to_kdev_t so can be
3857 *             a hex number or something like /dev/hda1 /dev/sdb
3858 * 2001-06-03: Dave Cinege <dcinege@psychosis.com>
3859 *              Shifted name_to_kdev_t() and related operations to md_set_drive()
3860 *              for later execution. Rewrote section to make devfs compatible.
3861 */
3862static int md__init md_setup(char *str)
3863{
3864        int minor, level, factor, fault;
3865        char *pername = "";
3866        char *str1 = str;
3867
3868        if (get_option(&str, &minor) != 2) {    /* MD Number */
3869                printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
3870                return 0;
3871        }
3872        if (minor >= MAX_MD_DEVS) {
3873                printk(KERN_WARNING "md: md=%d, Minor device number too high.\n", minor);
3874                return 0;
3875        } else if (md_setup_args.device_names[minor]) {
3876                printk(KERN_WARNING "md: md=%d, Specified more then once. "
3877                       "Replacing previous definition.\n", minor);
3878        }
3879        switch (get_option(&str, &level)) {     /* RAID Personality */
3880        case 2: /* could be 0 or -1.. */
3881                if (level == 0 || level == -1) {
3882                        if (get_option(&str, &factor) != 2 ||   /* Chunk Size */
3883                                        get_option(&str, &fault) != 2) {
3884                                printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
3885                                return 0;
3886                        }
3887                        md_setup_args.pers[minor] = level;
3888                        md_setup_args.chunk[minor] = 1 << (factor+12);
3889                        switch(level) {
3890                        case -1:
3891                                level = LINEAR;
3892                                pername = "linear";
3893                                break;
3894                        case 0:
3895                                level = RAID0;
3896                                pername = "raid0";
3897                                break;
3898                        default:
3899                                printk(KERN_WARNING
3900                                       "md: The kernel has not been configured for raid%d support!\n",
3901                                       level);
3902                                return 0;
3903                        }
3904                        md_setup_args.pers[minor] = level;
3905                        break;
3906                }
3907                /* FALL THROUGH */
3908        case 1: /* the first device is numeric */
3909                str = str1;
3910                /* FALL THROUGH */
3911        case 0:
3912                md_setup_args.pers[minor] = 0;
3913                pername="super-block";
3914        }
3915
3916        printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n",
3917                minor, pername, str);
3918        md_setup_args.device_names[minor] = str;
3919
3920        return 1;
3921}
3922
3923extern kdev_t name_to_kdev_t(char *line) md__init;
3924void md__init md_setup_drive(void)
3925{
3926        int minor, i;
3927        kdev_t dev;
3928        mddev_t*mddev;
3929        kdev_t devices[MD_SB_DISKS+1];
3930
3931        for (minor = 0; minor < MAX_MD_DEVS; minor++) {
3932                int err = 0;
3933                char *devname;
3934                mdu_disk_info_t dinfo;
3935
3936                if ((devname = md_setup_args.device_names[minor]) == 0) continue;
3937
3938                for (i = 0; i < MD_SB_DISKS && devname != 0; i++) {
3939
3940                        char *p;
3941                        void *handle;
3942
3943                        p = strchr(devname, ',');
3944                        if (p)
3945                                *p++ = 0;
3946
3947                        dev = name_to_kdev_t(devname);
3948                        handle = devfs_find_handle(NULL, devname, MAJOR (dev), MINOR (dev),
3949                                                        DEVFS_SPECIAL_BLK, 1);
3950                        if (handle != 0) {
3951                                unsigned major, minor;
3952                                devfs_get_maj_min(handle, &major, &minor);
3953                                dev = MKDEV(major, minor);
3954                        }
3955                        if (dev == 0) {
3956                                printk(KERN_WARNING "md: Unknown device name: %s\n", devname);
3957                                break;
3958                        }
3959
3960                        devices[i] = dev;
3961                        md_setup_args.device_set[minor] = 1;
3962
3963                        devname = p;
3964                }
3965                devices[i] = 0;
3966
3967                if (md_setup_args.device_set[minor] == 0)
3968                        continue;
3969
3970                if (mddev_map[minor].mddev) {
3971                        printk(KERN_WARNING
3972                               "md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n",
3973                               minor);
3974                        continue;
3975                }
3976                printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]);
3977
3978                mddev = alloc_mddev(MKDEV(MD_MAJOR,minor));
3979                if (!mddev) {
3980                        printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor);
3981                        continue;
3982                }
3983                if (md_setup_args.pers[minor]) {
3984                        /* non-persistent */
3985                        mdu_array_info_t ainfo;
3986                        ainfo.level = pers_to_level(md_setup_args.pers[minor]);
3987                        ainfo.size = 0;
3988                        ainfo.nr_disks =0;
3989                        ainfo.raid_disks =0;
3990                        ainfo.md_minor =minor;
3991                        ainfo.not_persistent = 1;
3992
3993                        ainfo.state = (1 << MD_SB_CLEAN);
3994                        ainfo.active_disks = 0;
3995                        ainfo.working_disks = 0;
3996                        ainfo.failed_disks = 0;
3997                        ainfo.spare_disks = 0;
3998                        ainfo.layout = 0;
3999                        ainfo.chunk_size = md_setup_args.chunk[minor];
4000                        err = set_array_info(mddev, &ainfo);
4001                        for (i = 0; !err && (dev = devices[i]); i++) {
4002                                dinfo.number = i;
4003                                dinfo.raid_disk = i;
4004                                dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
4005                                dinfo.major = MAJOR(dev);
4006                                dinfo.minor = MINOR(dev);
4007                                mddev->sb->nr_disks++;
4008                                mddev->sb->raid_disks++;
4009                                mddev->sb->active_disks++;
4010                                mddev->sb->working_disks++;
4011                                err = add_new_disk (mddev, &dinfo);
4012                        }
4013                } else {
4014                        /* persistent */
4015                        for (i = 0; (dev = devices[i]); i++) {
4016                                dinfo.major = MAJOR(dev);
4017                                dinfo.minor = MINOR(dev);
4018                                add_new_disk (mddev, &dinfo);
4019                        }
4020                }
4021                if (!err)
4022                        err = do_md_run(mddev);
4023                if (err) {
4024                        mddev->sb_dirty = 0;
4025                        do_md_stop(mddev, 0);
4026                        printk(KERN_WARNING "md: starting md%d failed\n", minor);
4027                }
4028        }
4029}
4030
4031static int md__init raid_setup(char *str)
4032{
4033        int len, pos;
4034
4035        len = strlen(str) + 1;
4036        pos = 0;
4037
4038        while (pos < len) {
4039                char *comma = strchr(str+pos, ',');
4040                int wlen;
4041                if (comma)
4042                        wlen = (comma-str)-pos;
4043                else    wlen = (len-1)-pos;
4044
4045                if (strncmp(str, "noautodetect", wlen) == 0)
4046                        raid_setup_args.noautodetect = 1;
4047                pos += wlen+1;
4048        }
4049        raid_setup_args.set = 1;
4050        return 1;
4051}
4052
4053int md__init md_run_setup(void)
4054{
4055        if (raid_setup_args.noautodetect)
4056                printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=noautodetect)\n");
4057        else
4058                autostart_arrays();
4059        md_setup_drive();
4060        return 0;
4061}
4062
/* boot parameter handlers: "raid=..." and "md=..." */
__setup("raid=", raid_setup);
__setup("md=", md_setup);

/* built-in case: driver initialisation, then boot-time array assembly */
__initcall(md_init);
__initcall(md_run_setup);
4068
4069#else /* It is a MODULE */
4070
/*
 * Module entry point: performs the driver initialisation and hands
 * back whatever md_init() returned.
 */
int init_module(void)
{
        int status = md_init();

        return status;
}
4075
4076static void free_device_names(void)
4077{
4078        while (device_names.next != &device_names) {
4079                struct list_head *tmp = device_names.next;
4080                list_del(tmp);
4081                kfree(tmp);
4082        }
4083}
4084
4085
4086void cleanup_module(void)
4087{
4088        md_unregister_thread(md_recovery_thread);
4089        devfs_unregister(devfs_handle);
4090
4091        devfs_unregister_blkdev(MAJOR_NR,"md");
4092        unregister_reboot_notifier(&md_notifier);
4093        unregister_sysctl_table(raid_table_header);
4094#ifdef CONFIG_PROC_FS
4095        remove_proc_entry("mdstat", NULL);
4096#endif
4097
4098        del_gendisk(&md_gendisk);
4099
4100        blk_dev[MAJOR_NR].queue = NULL;
4101        blksize_size[MAJOR_NR] = NULL;
4102        blk_size[MAJOR_NR] = NULL;
4103        max_readahead[MAJOR_NR] = NULL;
4104        hardsect_size[MAJOR_NR] = NULL;
4105
4106        free_device_names();
4107
4108}
4109#endif
4110
/*
 * Symbols made available through MD_EXPORT_SYMBOL (presumably for the
 * RAID personality modules -- confirm against the macro definition),
 * followed by the module licence tag.
 */
MD_EXPORT_SYMBOL(md_size);
MD_EXPORT_SYMBOL(register_md_personality);
MD_EXPORT_SYMBOL(unregister_md_personality);
MD_EXPORT_SYMBOL(partition_name);
MD_EXPORT_SYMBOL(md_error);
MD_EXPORT_SYMBOL(md_do_sync);
MD_EXPORT_SYMBOL(md_sync_acct);
MD_EXPORT_SYMBOL(md_done_sync);
MD_EXPORT_SYMBOL(md_recover_arrays);
MD_EXPORT_SYMBOL(md_register_thread);
MD_EXPORT_SYMBOL(md_unregister_thread);
MD_EXPORT_SYMBOL(md_update_sb);
MD_EXPORT_SYMBOL(md_wakeup_thread);
MD_EXPORT_SYMBOL(md_print_devices);
MD_EXPORT_SYMBOL(find_rdev_nr);
MD_EXPORT_SYMBOL(md_interrupt_thread);
MD_EXPORT_SYMBOL(mddev_map);
MD_EXPORT_SYMBOL(md_check_ordering);
MD_EXPORT_SYMBOL(get_spare);
MODULE_LICENSE("GPL");
4131
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.