linux-old/drivers/md/raid5.c
<<
>>
Prefs
   1/*
   2 * raid5.c : Multiple Devices driver for Linux
   3 *         Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
   4 *         Copyright (C) 1999, 2000 Ingo Molnar
   5 *
   6 * RAID-5 management functions.
   7 *
   8 * This program is free software; you can redistribute it and/or modify
   9 * it under the terms of the GNU General Public License as published by
  10 * the Free Software Foundation; either version 2, or (at your option)
  11 * any later version.
  12 *
  13 * You should have received a copy of the GNU General Public License
  14 * (for example /usr/src/linux/COPYING); if not, write to the Free
  15 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  16 */
  17
  18
  19#include <linux/config.h>
  20#include <linux/module.h>
  21#include <linux/locks.h>
  22#include <linux/slab.h>
  23#include <linux/raid/raid5.h>
  24#include <asm/bitops.h>
  25#include <asm/atomic.h>
  26
  27static mdk_personality_t raid5_personality;
  28
  29/*
  30 * Stripe cache
  31 */
  32
  33#define NR_STRIPES              256
  34#define IO_THRESHOLD            1
  35#define HASH_PAGES              1
  36#define HASH_PAGES_ORDER        0
  37#define NR_HASH                 (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
  38#define HASH_MASK               (NR_HASH - 1)
  39#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) / ((conf)->buffer_size >> 9)) & HASH_MASK])
  40
  41/*
  42 * The following can be used to debug the driver
  43 */
  44#define RAID5_DEBUG     0
  45#define RAID5_PARANOIA  1
  46#if RAID5_PARANOIA && CONFIG_SMP
  47# define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
  48#else
  49# define CHECK_DEVLOCK()
  50#endif
  51
  52#if RAID5_DEBUG
  53#define PRINTK(x...) printk(x)
  54#define inline
  55#define __inline__
  56#else
  57#define PRINTK(x...) do { } while (0)
  58#endif
  59
  60static void print_raid5_conf (raid5_conf_t *conf);
  61
  62static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
  63{
  64        if (atomic_dec_and_test(&sh->count)) {
  65                if (!list_empty(&sh->lru))
  66                        BUG();
  67                if (atomic_read(&conf->active_stripes)==0)
  68                        BUG();
  69                if (test_bit(STRIPE_HANDLE, &sh->state)) {
  70                        if (test_bit(STRIPE_DELAYED, &sh->state))
  71                                list_add_tail(&sh->lru, &conf->delayed_list);
  72                        else
  73                                list_add_tail(&sh->lru, &conf->handle_list);
  74                        md_wakeup_thread(conf->thread);
  75                } else {
  76                        if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
  77                                atomic_dec(&conf->preread_active_stripes);
  78                                if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
  79                                        md_wakeup_thread(conf->thread);
  80                        }
  81                        list_add_tail(&sh->lru, &conf->inactive_list);
  82                        atomic_dec(&conf->active_stripes);
  83                        if (!conf->inactive_blocked ||
  84                            atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
  85                                wake_up(&conf->wait_for_stripe);
  86                }
  87        }
  88}
  89static void release_stripe(struct stripe_head *sh)
  90{
  91        raid5_conf_t *conf = sh->raid_conf;
  92        unsigned long flags;
  93        
  94        spin_lock_irqsave(&conf->device_lock, flags);
  95        __release_stripe(conf, sh);
  96        spin_unlock_irqrestore(&conf->device_lock, flags);
  97}
  98
  99static void remove_hash(struct stripe_head *sh)
 100{
 101        PRINTK("remove_hash(), stripe %lu\n", sh->sector);
 102
 103        if (sh->hash_pprev) {
 104                if (sh->hash_next)
 105                        sh->hash_next->hash_pprev = sh->hash_pprev;
 106                *sh->hash_pprev = sh->hash_next;
 107                sh->hash_pprev = NULL;
 108        }
 109}
 110
 111static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
 112{
 113        struct stripe_head **shp = &stripe_hash(conf, sh->sector);
 114
 115        PRINTK("insert_hash(), stripe %lu\n",sh->sector);
 116
 117        CHECK_DEVLOCK();
 118        if ((sh->hash_next = *shp) != NULL)
 119                (*shp)->hash_pprev = &sh->hash_next;
 120        *shp = sh;
 121        sh->hash_pprev = shp;
 122}
 123
 124
 125/* find an idle stripe, make sure it is unhashed, and return it. */
 126static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
 127{
 128        struct stripe_head *sh = NULL;
 129        struct list_head *first;
 130
 131        CHECK_DEVLOCK();
 132        if (list_empty(&conf->inactive_list))
 133                goto out;
 134        first = conf->inactive_list.next;
 135        sh = list_entry(first, struct stripe_head, lru);
 136        list_del_init(first);
 137        remove_hash(sh);
 138        atomic_inc(&conf->active_stripes);
 139out:
 140        return sh;
 141}
 142
 143static void shrink_buffers(struct stripe_head *sh, int num)
 144{
 145        struct buffer_head *bh;
 146        int i;
 147
 148        for (i=0; i<num ; i++) {
 149                bh = sh->bh_cache[i];
 150                if (!bh)
 151                        return;
 152                sh->bh_cache[i] = NULL;
 153                free_page((unsigned long) bh->b_data);
 154                kfree(bh);
 155        }
 156}
 157
 158static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority)
 159{
 160        struct buffer_head *bh;
 161        int i;
 162
 163        for (i=0; i<num; i++) {
 164                struct page *page;
 165                bh = kmalloc(sizeof(struct buffer_head), priority);
 166                if (!bh)
 167                        return 1;
 168                memset(bh, 0, sizeof (struct buffer_head));
 169                init_waitqueue_head(&bh->b_wait);
 170                if ((page = alloc_page(priority)))
 171                        bh->b_data = page_address(page);
 172                else {
 173                        kfree(bh);
 174                        return 1;
 175                }
 176                atomic_set(&bh->b_count, 0);
 177                bh->b_page = page;
 178                sh->bh_cache[i] = bh;
 179
 180        }
 181        return 0;
 182}
 183
 184static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i);
 185
 186static inline void init_stripe(struct stripe_head *sh, unsigned long sector)
 187{
 188        raid5_conf_t *conf = sh->raid_conf;
 189        int disks = conf->raid_disks, i;
 190
 191        if (atomic_read(&sh->count) != 0)
 192                BUG();
 193        if (test_bit(STRIPE_HANDLE, &sh->state))
 194                BUG();
 195        
 196        CHECK_DEVLOCK();
 197        PRINTK("init_stripe called, stripe %lu\n", sh->sector);
 198
 199        remove_hash(sh);
 200        
 201        sh->sector = sector;
 202        sh->size = conf->buffer_size;
 203        sh->state = 0;
 204
 205        for (i=disks; i--; ) {
 206                if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] ||
 207                    buffer_locked(sh->bh_cache[i])) {
 208                        printk("sector=%lx i=%d %p %p %p %d\n",
 209                               sh->sector, i, sh->bh_read[i],
 210                               sh->bh_write[i], sh->bh_written[i],
 211                               buffer_locked(sh->bh_cache[i]));
 212                        BUG();
 213                }
 214                clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state);
 215                raid5_build_block(sh, i);
 216        }
 217        insert_hash(conf, sh);
 218}
 219
 220/* the buffer size has changed, so unhash all stripes
 221 * as active stripes complete, they will go onto inactive list
 222 */
 223static void shrink_stripe_cache(raid5_conf_t *conf)
 224{
 225        int i;
 226        CHECK_DEVLOCK();
 227        if (atomic_read(&conf->active_stripes))
 228                BUG();
 229        for (i=0; i < NR_HASH; i++) {
 230                struct stripe_head *sh;
 231                while ((sh = conf->stripe_hashtbl[i])) 
 232                        remove_hash(sh);
 233        }
 234}
 235
 236static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
 237{
 238        struct stripe_head *sh;
 239
 240        CHECK_DEVLOCK();
 241        PRINTK("__find_stripe, sector %lu\n", sector);
 242        for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
 243                if (sh->sector == sector)
 244                        return sh;
 245        PRINTK("__stripe %lu not in cache\n", sector);
 246        return NULL;
 247}
 248
  /*
   * Return the stripe_head for 'sector' with its reference count raised,
   * taking it from the hash table or re-initialising an inactive stripe.
   * conf->device_lock is taken on entry and released before returning; it
   * is handed to wait_event_lock_irq() for every wait (presumably dropped
   * while sleeping -- confirm against that macro's definition).
   *
   * size:    when non-zero and different from conf->buffer_size, the cache
   *          buffer size is switched to 'size' first (waiting for
   *          active_stripes to reach 0 unless someone else sets the size).
   *          When zero, the current buffer_size is used and 'sector' is
   *          rounded down with a (buffer_size>>9)-1 mask (assumes that
   *          value is a power of two -- confirm).
   * noblock: when non-zero, return NULL instead of sleeping if the stripe
   *          is not cached and no inactive stripe was obtained.  The
   *          buffer-size waits above happen regardless of this flag.
   */
  249static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector, int size, int noblock) 
  250{
  251        struct stripe_head *sh;
  252
  253        PRINTK("get_stripe, sector %lu\n", sector);
  254
  255        md_spin_lock_irq(&conf->device_lock);
  256
  257        do {
  258                if (conf->buffer_size == 0 ||
  259                    (size && size != conf->buffer_size)) {
  260                        /* either the size is being changed (buffer_size==0) or
  261                         * we need to change it.
  262                         * If size==0, we can proceed as soon as buffer_size gets set.
  263                         * If size>0, we can proceed when active_stripes reaches 0, or
  264                         * when someone else sets the buffer_size to size.
  265                         * If someone sets the buffer size to something else, we will need to
  266                         * assert that we want to change it again
  267                         */
  268                        int oldsize = conf->buffer_size;
  269                        PRINTK("get_stripe %ld/%d buffer_size is %d, %d active\n", sector, size, conf->buffer_size, atomic_read(&conf->active_stripes));
  270                        if (size==0)
  271                                wait_event_lock_irq(conf->wait_for_stripe,
  272                                                    conf->buffer_size,
  273                                                    conf->device_lock);
  274                        else {
  275                                while (conf->buffer_size != size && atomic_read(&conf->active_stripes)) {
                                        /* buffer_size == 0 marks "size change in progress" (see test at the top of the loop) */
  276                                        conf->buffer_size = 0;
  277                                        wait_event_lock_irq(conf->wait_for_stripe,
  278                                                            atomic_read(&conf->active_stripes)==0 || conf->buffer_size,
  279                                                            conf->device_lock);
  280                                        PRINTK("waited and now  %ld/%d buffer_size is %d - %d active\n", sector, size,
  281                                               conf->buffer_size, atomic_read(&conf->active_stripes));
  282                                }
  283
  284                                if (conf->buffer_size != size) {
  285                                        printk("raid5: switching cache buffer size, %d --> %d\n", oldsize, size);
  286                                        shrink_stripe_cache(conf);
  287                                        if (size==0) BUG();
  288                                        conf->buffer_size = size;
  289                                        PRINTK("size now %d\n", conf->buffer_size);
  290                                }
  291                        }
  292                }
  293                if (size == 0)
  294                        sector -= sector & ((conf->buffer_size>>9)-1);
  295
  296                sh = __find_stripe(conf, sector);
  297                if (!sh) {
                        /* not cached: try to claim an inactive stripe, unless allocation is currently throttled */
  298                        if (!conf->inactive_blocked)
  299                                sh = get_free_stripe(conf);
  300                        if (noblock && sh == NULL)
  301                                break;
  302                        if (!sh) {
                                /* sleep until a stripe is inactive and either the active count
                                 * drops below 3/4 of NR_STRIPES or the blocked flag is cleared;
                                 * sh stays NULL, so the outer loop retries the lookup */
  303                                conf->inactive_blocked = 1;
  304                                wait_event_lock_irq(conf->wait_for_stripe,
  305                                                    !list_empty(&conf->inactive_list) &&
  306                                                    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
  307                                                     || !conf->inactive_blocked),
  308                                                    conf->device_lock);
  309                                conf->inactive_blocked = 0;
  310                        } else
  311                                init_stripe(sh, sector);
  312                } else {
  313                        if (atomic_read(&sh->count)) {
                                /* already referenced: must not be on any list */
  314                                if (!list_empty(&sh->lru))
  315                                        BUG();
  316                        } else {
                                /* unreferenced, so it sits on a list.  Only stripes parked on the
                                 * inactive list had active_stripes decremented in __release_stripe()
                                 * (those without STRIPE_HANDLE), so only those are re-counted here */
  317                                if (!test_bit(STRIPE_HANDLE, &sh->state))
  318                                        atomic_inc(&conf->active_stripes);
  319                                if (list_empty(&sh->lru))
  320                                        BUG();
  321                                list_del_init(&sh->lru);
  322                        }
  323                }
  324        } while (sh == NULL);
  325
  326        if (sh)
  327                atomic_inc(&sh->count);
  328
  329        md_spin_unlock_irq(&conf->device_lock);
  330        return sh;
  331}
 332
 333static int grow_stripes(raid5_conf_t *conf, int num, int priority)
 334{
 335        struct stripe_head *sh;
 336
 337        while (num--) {
 338                sh = kmalloc(sizeof(struct stripe_head), priority);
 339                if (!sh)
 340                        return 1;
 341                memset(sh, 0, sizeof(*sh));
 342                sh->raid_conf = conf;
 343                sh->lock = SPIN_LOCK_UNLOCKED;
 344
 345                if (grow_buffers(sh, conf->raid_disks, PAGE_SIZE, priority)) {
 346                        shrink_buffers(sh, conf->raid_disks);
 347                        kfree(sh);
 348                        return 1;
 349                }
 350                /* we just created an active stripe so... */
 351                atomic_set(&sh->count, 1);
 352                atomic_inc(&conf->active_stripes);
 353                INIT_LIST_HEAD(&sh->lru);
 354                release_stripe(sh);
 355        }
 356        return 0;
 357}
 358
 359static void shrink_stripes(raid5_conf_t *conf, int num)
 360{
 361        struct stripe_head *sh;
 362
 363        while (num--) {
 364                spin_lock_irq(&conf->device_lock);
 365                sh = get_free_stripe(conf);
 366                spin_unlock_irq(&conf->device_lock);
 367                if (!sh)
 368                        break;
 369                if (atomic_read(&sh->count))
 370                        BUG();
 371                shrink_buffers(sh, conf->raid_disks);
 372                kfree(sh);
 373                atomic_dec(&conf->active_stripes);
 374        }
 375}
 376
 377
  /*
   * Completion handler for reads issued on a stripe's cache buffers
   * (installed on them by raid5_build_block() via init_buffer()).
   *
   * bh:       the cache buffer whose read finished; bh->b_private is the
   *           owning stripe_head.
   * uptodate: non-zero if the read succeeded.
   *
   * On success the first pending read request for this slot may be
   * completed directly here; on failure the device is reported through
   * md_error().  In both cases the buffer is unlocked, the stripe is
   * flagged STRIPE_HANDLE and one stripe reference is dropped.
   */
  378static void raid5_end_read_request (struct buffer_head * bh, int uptodate)
  379{
  380        struct stripe_head *sh = bh->b_private;
  381        raid5_conf_t *conf = sh->raid_conf;
  382        int disks = conf->raid_disks, i;
  383        unsigned long flags;
  384
             /* locate the slot this cache buffer belongs to */
  385        for (i=0 ; i<disks; i++)
  386                if (bh == sh->bh_cache[i])
  387                        break;
  388
  389        PRINTK("end_read_request %lu/%d, count: %d, uptodate %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
  390        if (i == disks) {
  391                BUG();
  392                return;
  393        }
  394
  395        if (uptodate) {
  396                struct buffer_head *buffer;
  397                spin_lock_irqsave(&conf->device_lock, flags);
  398                /* we can return a buffer if we bypassed the cache or
  399                 * if the top buffer is not in highmem.  If there are
  400                 * multiple buffers, leave the extra work to
  401                 * handle_stripe
  402                 */
  403                buffer = sh->bh_read[i];
  404                if (buffer &&
  405                    (!PageHighMem(buffer->b_page)
  406                     || buffer->b_page == bh->b_page )
  407                        ) {
  408                        sh->bh_read[i] = buffer->b_reqnext;
  409                        buffer->b_reqnext = NULL;
  410                } else
  411                        buffer = NULL;
  412                spin_unlock_irqrestore(&conf->device_lock, flags);
                     /* the cache page only holds the data when no page was substituted (bh_page[i] unset) */
  413                if (sh->bh_page[i]==NULL)
  414                        set_bit(BH_Uptodate, &bh->b_state);
  415                if (buffer) {
                             /* copy only when the request did not read straight into its own page */
  416                        if (buffer->b_page != bh->b_page)
  417                                memcpy(buffer->b_data, bh->b_data, bh->b_size);
  418                        buffer->b_end_io(buffer, 1);
  419                }
  420        } else {
  421                md_error(conf->mddev, bh->b_dev);
  422                clear_bit(BH_Uptodate, &bh->b_state);
  423        }
  424        /* must restore b_page before unlocking buffer... */
  425        if (sh->bh_page[i]) {
  426                bh->b_page = sh->bh_page[i];
  427                bh->b_data = page_address(bh->b_page);
  428                sh->bh_page[i] = NULL;
  429                clear_bit(BH_Uptodate, &bh->b_state);
  430        }
  431        clear_bit(BH_Lock, &bh->b_state);
  432        set_bit(STRIPE_HANDLE, &sh->state);
  433        release_stripe(sh);
  434}
 435
 436static void raid5_end_write_request (struct buffer_head *bh, int uptodate)
 437{
 438        struct stripe_head *sh = bh->b_private;
 439        raid5_conf_t *conf = sh->raid_conf;
 440        int disks = conf->raid_disks, i;
 441        unsigned long flags;
 442
 443        for (i=0 ; i<disks; i++)
 444                if (bh == sh->bh_cache[i])
 445                        break;
 446
 447        PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
 448        if (i == disks) {
 449                BUG();
 450                return;
 451        }
 452
 453        md_spin_lock_irqsave(&conf->device_lock, flags);
 454        if (!uptodate)
 455                md_error(conf->mddev, bh->b_dev);
 456        clear_bit(BH_Lock, &bh->b_state);
 457        set_bit(STRIPE_HANDLE, &sh->state);
 458        __release_stripe(conf, sh);
 459        md_spin_unlock_irqrestore(&conf->device_lock, flags);
 460}
 461        
 462
 463
 464static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i)
 465{
 466        raid5_conf_t *conf = sh->raid_conf;
 467        struct buffer_head *bh = sh->bh_cache[i];
 468        unsigned long block = sh->sector / (sh->size >> 9);
 469
 470        init_buffer(bh, raid5_end_read_request, sh);
 471        bh->b_dev       = conf->disks[i].dev;
 472        bh->b_blocknr   = block;
 473
 474        bh->b_state     = (1 << BH_Req) | (1 << BH_Mapped);
 475        bh->b_size      = sh->size;
 476        bh->b_list      = BUF_LOCKED;
 477        return bh;
 478}
 479
 480static int raid5_error (mddev_t *mddev, kdev_t dev)
 481{
 482        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 483        mdp_super_t *sb = mddev->sb;
 484        struct disk_info *disk;
 485        int i;
 486
 487        PRINTK("raid5_error called\n");
 488
 489        for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
 490                if (disk->dev == dev) {
 491                        if (disk->operational) {
 492                                disk->operational = 0;
 493                                mark_disk_faulty(sb->disks+disk->number);
 494                                mark_disk_nonsync(sb->disks+disk->number);
 495                                mark_disk_inactive(sb->disks+disk->number);
 496                                sb->active_disks--;
 497                                sb->working_disks--;
 498                                sb->failed_disks++;
 499                                mddev->sb_dirty = 1;
 500                                conf->working_disks--;
 501                                conf->failed_disks++;
 502                                md_wakeup_thread(conf->thread);
 503                                printk (KERN_ALERT
 504                                        "raid5: Disk failure on %s, disabling device."
 505                                        " Operation continuing on %d devices\n",
 506                                        partition_name (dev), conf->working_disks);
 507                        }
 508                        return 0;
 509                }
 510        }
 511        /*
 512         * handle errors in spares (during reconstruction)
 513         */
 514        if (conf->spare) {
 515                disk = conf->spare;
 516                if (disk->dev == dev) {
 517                        printk (KERN_ALERT
 518                                "raid5: Disk failure on spare %s\n",
 519                                partition_name (dev));
 520                        if (!conf->spare->operational) {
 521                                /* probably a SET_DISK_FAULTY ioctl */
 522                                return -EIO;
 523                        }
 524                        disk->operational = 0;
 525                        disk->write_only = 0;
 526                        conf->spare = NULL;
 527                        mark_disk_faulty(sb->disks+disk->number);
 528                        mark_disk_nonsync(sb->disks+disk->number);
 529                        mark_disk_inactive(sb->disks+disk->number);
 530                        sb->spare_disks--;
 531                        sb->working_disks--;
 532                        sb->failed_disks++;
 533
 534                        mddev->sb_dirty = 1;
 535                        md_wakeup_thread(conf->thread);
 536
 537                        return 0;
 538                }
 539        }
 540        MD_BUG();
 541        return -EIO;
 542}       
 543
 544/*
 545 * Input: a 'big' sector number,
 546 * Output: index of the data and parity disk, and the sector # in them.
 547 */
 548static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks,
 549                        unsigned int data_disks, unsigned int * dd_idx,
 550                        unsigned int * pd_idx, raid5_conf_t *conf)
 551{
 552        unsigned long stripe;
 553        unsigned long chunk_number;
 554        unsigned int chunk_offset;
 555        unsigned long new_sector;
 556        int sectors_per_chunk = conf->chunk_size >> 9;
 557
 558        /* First compute the information on this sector */
 559
 560        /*
 561         * Compute the chunk number and the sector offset inside the chunk
 562         */
 563        chunk_number = r_sector / sectors_per_chunk;
 564        chunk_offset = r_sector % sectors_per_chunk;
 565
 566        /*
 567         * Compute the stripe number
 568         */
 569        stripe = chunk_number / data_disks;
 570
 571        /*
 572         * Compute the data disk and parity disk indexes inside the stripe
 573         */
 574        *dd_idx = chunk_number % data_disks;
 575
 576        /*
 577         * Select the parity disk based on the user selected algorithm.
 578         */
 579        if (conf->level == 4)
 580                *pd_idx = data_disks;
 581        else switch (conf->algorithm) {
 582                case ALGORITHM_LEFT_ASYMMETRIC:
 583                        *pd_idx = data_disks - stripe % raid_disks;
 584                        if (*dd_idx >= *pd_idx)
 585                                (*dd_idx)++;
 586                        break;
 587                case ALGORITHM_RIGHT_ASYMMETRIC:
 588                        *pd_idx = stripe % raid_disks;
 589                        if (*dd_idx >= *pd_idx)
 590                                (*dd_idx)++;
 591                        break;
 592                case ALGORITHM_LEFT_SYMMETRIC:
 593                        *pd_idx = data_disks - stripe % raid_disks;
 594                        *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
 595                        break;
 596                case ALGORITHM_RIGHT_SYMMETRIC:
 597                        *pd_idx = stripe % raid_disks;
 598                        *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
 599                        break;
 600                default:
 601                        printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
 602        }
 603
 604        /*
 605         * Finally, compute the new sector number
 606         */
 607        new_sector = stripe * sectors_per_chunk + chunk_offset;
 608        return new_sector;
 609}
 610
  /*
   * compute_blocknr() is compiled out by the surrounding #if 0.
   *
   * It performs the reverse mapping of raid5_compute_sector(): given a
   * stripe and a disk index i within it, it derives the array-wide sector
   * (r_sector) and returns it in units of the stripe's buffer size.  The
   * result is cross-checked by feeding r_sector back through
   * raid5_compute_sector(); on a mismatch a message is printed and 0 is
   * returned.
   */
  611#if 0
  612static unsigned long compute_blocknr(struct stripe_head *sh, int i)
  613{
  614        raid5_conf_t *conf = sh->raid_conf;
  615        int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
  616        unsigned long new_sector = sh->sector, check;
  617        int sectors_per_chunk = conf->chunk_size >> 9;
  618        unsigned long stripe = new_sector / sectors_per_chunk;
  619        int chunk_offset = new_sector % sectors_per_chunk;
  620        int chunk_number, dummy1, dummy2, dd_idx = i;
  621        unsigned long r_sector, blocknr;
  622
             /* turn the physical disk index back into a data index, undoing the
              * adjustment each layout applies in raid5_compute_sector() */
  623        switch (conf->algorithm) {
  624                case ALGORITHM_LEFT_ASYMMETRIC:
  625                case ALGORITHM_RIGHT_ASYMMETRIC:
  626                        if (i > sh->pd_idx)
  627                                i--;
  628                        break;
  629                case ALGORITHM_LEFT_SYMMETRIC:
  630                case ALGORITHM_RIGHT_SYMMETRIC:
  631                        if (i < sh->pd_idx)
  632                                i += raid_disks;
  633                        i -= (sh->pd_idx + 1);
  634                        break;
  635                default:
  636                        printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
  637        }
  638
  639        chunk_number = stripe * data_disks + i;
  640        r_sector = chunk_number * sectors_per_chunk + chunk_offset;
  641        blocknr = r_sector / (sh->size >> 9);
  642
             /* sanity check: the forward mapping must lead back to this stripe and disk */
  643        check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
  644        if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
  645                printk("compute_blocknr: map not correct\n");
  646                return 0;
  647        }
  648        return blocknr;
  649}
  650#endif
 651
  /*
   * check_xor() - flush the pending XOR batch once it is full.
   *
   * Relies on two locals of the expanding function: 'bh_ptr', an array of
   * up to MAX_XOR_BLOCKS buffers, and 'count', the number of entries in
   * use.  When count reaches MAX_XOR_BLOCKS the batch is handed to
   * xor_block() and count is reset to 1, so bh_ptr[0] stays in place for
   * the next batch (bh_ptr[0] is presumably the XOR destination --
   * confirm against xor_block()).
   */
  652#define check_xor()     do {                                    \
  653                           if (count == MAX_XOR_BLOCKS) {       \
  654                                xor_block(count, bh_ptr);       \
  655                                count = 1;                      \
  656                           }                                    \
  657                        } while(0)
 658
 659
 660static void compute_block(struct stripe_head *sh, int dd_idx)
 661{
 662        raid5_conf_t *conf = sh->raid_conf;
 663        int i, count, disks = conf->raid_disks;
 664        struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh;
 665
 666        PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx);
 667
 668
 669        memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size);
 670        bh_ptr[0] = sh->bh_cache[dd_idx];
 671        count = 1;
 672        for (i = disks ; i--; ) {
 673                if (i == dd_idx)
 674                        continue;
 675                bh = sh->bh_cache[i];
 676                if (buffer_uptodate(bh))
 677                        bh_ptr[count++] = bh;
 678                else
 679                        printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
 680
 681                check_xor();
 682        }
 683        if (count != 1)
 684                xor_block(count, bh_ptr);
 685        set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state);
 686}
 687
  /*
   * Recompute (or check) the parity buffer of a stripe.
   *
   * method selects the strategy:
   *   READ_MODIFY_WRITE  - parity must already be up to date (BUG
   *                        otherwise).  For each data slot with a pending
   *                        write and an up-to-date cache buffer, the old
   *                        cached data is XORed into the parity, the new
   *                        data is copied into the cache, and the new data
   *                        is XORed into the parity.
   *   RECONSTRUCT_WRITE  - parity is zeroed, the first pending write of
   *                        each data slot is copied into the cache, then
   *                        every data buffer is XORed into the parity.
   *   CHECK_PARITY       - every data buffer is XORed into the existing
   *                        parity buffer, which is then marked not up to
   *                        date (presumably so the caller can inspect the
   *                        buffer for all-zero -- confirm with callers).
   *
   * For both write methods the chosen request is moved from bh_write[i]
   * to the head of bh_written[i], the refreshed cache buffers get BH_Lock
   * and are marked up to date, and finally so is the parity buffer.
   */
  688static void compute_parity(struct stripe_head *sh, int method)
  689{
  690        raid5_conf_t *conf = sh->raid_conf;
  691        int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
  692        struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
  693        struct buffer_head *chosen[MD_SB_DISKS];
  694
  695        PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method);
  696        memset(chosen, 0, sizeof(chosen));
  697
             /* bh_ptr[0] is the parity buffer for every xor_block() call below */
  698        count = 1;
  699        bh_ptr[0] = sh->bh_cache[pd_idx];
  700        switch(method) {
  701        case READ_MODIFY_WRITE:
  702                if (!buffer_uptodate(sh->bh_cache[pd_idx]))
  703                        BUG();
  704                for (i=disks ; i-- ;) {
  705                        if (i==pd_idx)
  706                                continue;
  707                        if (sh->bh_write[i] &&
  708                            buffer_uptodate(sh->bh_cache[i])) {
                                     /* queue the OLD cached data for XOR, and move the
                                      * request from bh_write[i] to the head of bh_written[i] */
  709                                bh_ptr[count++] = sh->bh_cache[i];
  710                                chosen[i] = sh->bh_write[i];
  711                                sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
  712                                chosen[i]->b_reqnext = sh->bh_written[i];
  713                                sh->bh_written[i] = chosen[i];
  714                                check_xor();
  715                        }
  716                }
  717                break;
  718        case RECONSTRUCT_WRITE:
  719                memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size);
  720                for (i= disks; i-- ;)
  721                        if (i!=pd_idx && sh->bh_write[i]) {
  722                                chosen[i] = sh->bh_write[i];
  723                                sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
  724                                chosen[i]->b_reqnext = sh->bh_written[i];
  725                                sh->bh_written[i] = chosen[i];
  726                        }
  727                break;
  728        case CHECK_PARITY:
  729                break;
  730        }
             /* flush the old-data XOR before the cache buffers are overwritten below */
  731        if (count>1) {
  732                xor_block(count, bh_ptr);
  733                count = 1;
  734        }
  735        
             /* copy the new data of every chosen request into the stripe cache */
  736        for (i = disks; i--;)
  737                if (chosen[i]) {
  738                        struct buffer_head *bh = sh->bh_cache[i];
  739                        char *bdata;
  740                        bdata = bh_kmap(chosen[i]);
  741                        memcpy(bh->b_data,
  742                               bdata,sh->size);
  743                        bh_kunmap(chosen[i]);
  744                        set_bit(BH_Lock, &bh->b_state);
  745                        mark_buffer_uptodate(bh, 1);
  746                }
  747
             /* second XOR pass: all data buffers, or only the rewritten ones for RMW */
  748        switch(method) {
  749        case RECONSTRUCT_WRITE:
  750        case CHECK_PARITY:
  751                for (i=disks; i--;)
  752                        if (i != pd_idx) {
  753                                bh_ptr[count++] = sh->bh_cache[i];
  754                                check_xor();
  755                        }
  756                break;
  757        case READ_MODIFY_WRITE:
  758                for (i = disks; i--;)
  759                        if (chosen[i]) {
  760                                bh_ptr[count++] = sh->bh_cache[i];
  761                                check_xor();
  762                        }
  763        }
  764        if (count != 1)
  765                xor_block(count, bh_ptr);
  766        
  767        if (method != CHECK_PARITY) {
  768                mark_buffer_uptodate(sh->bh_cache[pd_idx], 1);
  769                set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state);
  770        } else
  771                mark_buffer_uptodate(sh->bh_cache[pd_idx], 0);
  772}
 773
 774static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
 775{
 776        struct buffer_head **bhp;
 777        raid5_conf_t *conf = sh->raid_conf;
 778
 779        PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector);
 780
 781
 782        spin_lock(&sh->lock);
 783        spin_lock_irq(&conf->device_lock);
 784        bh->b_reqnext = NULL;
 785        if (rw == READ)
 786                bhp = &sh->bh_read[dd_idx];
 787        else
 788                bhp = &sh->bh_write[dd_idx];
 789        while (*bhp) {
 790                printk(KERN_NOTICE "raid5: multiple %d requests for sector %ld\n", rw, sh->sector);
 791                bhp = & (*bhp)->b_reqnext;
 792        }
 793        *bhp = bh;
 794        spin_unlock_irq(&conf->device_lock);
 795        spin_unlock(&sh->lock);
 796
 797        PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx);
 798}
 799
 800
 801
 802
 803
 804/*
 805 * handle_stripe - do things to a stripe.
 806 *
 807 * We lock the stripe and then examine the state of various bits
 808 * to see what needs to be done.
 809 * Possible results:
 810 *    return some read requests which now have data
 811 *    return some write requests which are safely on disc
 812 *    schedule a read on some buffers
 813 *    schedule a write of some buffers
 814 *    return confirmation of parity correctness
 815 *
 816 * Parity calculations are done inside the stripe lock
 817 * buffers are taken off read_list or write_list, and bh_cache buffers
 818 * get BH_Lock set before the stripe lock is released.
 819 *
 820 */
 821 
 822static void handle_stripe(struct stripe_head *sh)
 823{
            /*
             * Bookkeeping used throughout this function:
             *   return_ok / return_fail - singly linked lists (chained through
             *       b_reqnext) of caller buffers.  They are completed with
             *       b_end_io(bh, 1) resp. b_end_io(bh, 0) only after sh->lock
             *       has been released.
             *   action[i] - I/O to issue on cache buffer i after unlocking,
             *       stored as READ+1 or WRITE+1 so that 0 means "nothing to do".
             *   locked, uptodate, to_read, to_write, written, failed - counters
             *       filled in by the first loop over the disks.
             *   failed_num - index of the non-operational disk seen most
             *       recently by that loop (it walks from high index to low).
             */
 824        raid5_conf_t *conf = sh->raid_conf;
 825        int disks = conf->raid_disks;
 826        struct buffer_head *return_ok= NULL, *return_fail = NULL;
 827        int action[MD_SB_DISKS];
 828        int i;
 829        int syncing;
 830        int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
 831        int failed_num=0;
 832        struct buffer_head *bh;
 833
 834        PRINTK("handling stripe %ld, cnt=%d, pd_idx=%d\n", sh->sector, atomic_read(&sh->count), sh->pd_idx);
 835        memset(action, 0, sizeof(action));
 836
 837        spin_lock(&sh->lock);
            /* we are handling the stripe now, so drop both "please handle" hints;
             * later sections set them again when more work remains */
 838        clear_bit(STRIPE_HANDLE, &sh->state);
 839        clear_bit(STRIPE_DELAYED, &sh->state);
 840
 841        syncing = test_bit(STRIPE_SYNCING, &sh->state);
 842        /* Now to look around and see what can be done */
 843
 844        for (i=disks; i--; ) {
 845                bh = sh->bh_cache[i];
 846                PRINTK("check %d: state 0x%lx read %p write %p written %p\n", i, bh->b_state, sh->bh_read[i], sh->bh_write[i], sh->bh_written[i]);
 847                /* maybe we can reply to a read */
 848                if (buffer_uptodate(bh) && sh->bh_read[i]) {
 849                        struct buffer_head *rbh, *rbh2;
 850                        PRINTK("Return read for disc %d\n", i);
                            /* detach the whole read chain under device_lock, then
                             * copy the cached data into each request outside it */
 851                        spin_lock_irq(&conf->device_lock);
 852                        rbh = sh->bh_read[i];
 853                        sh->bh_read[i] = NULL;
 854                        spin_unlock_irq(&conf->device_lock);
 855                        while (rbh) {
 856                                char *bdata;
 857                                bdata = bh_kmap(rbh);
 858                                memcpy(bdata, bh->b_data, bh->b_size);
 859                                bh_kunmap(rbh);
 860                                rbh2 = rbh->b_reqnext;
 861                                rbh->b_reqnext = return_ok;
 862                                return_ok = rbh;
 863                                rbh = rbh2;
 864                        }
 865                }
 866
 867                /* now count some things */
 868                if (buffer_locked(bh)) locked++;
 869                if (buffer_uptodate(bh)) uptodate++;
 870
 871                
                    /* bh_read[i] was emptied above when the read could be answered,
                     * so to_read only counts reads that are still outstanding */
 872                if (sh->bh_read[i]) to_read++;
 873                if (sh->bh_write[i]) to_write++;
 874                if (sh->bh_written[i]) written++;
 875                if (!conf->disks[i].operational) {
 876                        failed++;
 877                        failed_num = i;
 878                }
 879        }
 880        PRINTK("locked=%d uptodate=%d to_read=%d to_write=%d failed=%d failed_num=%d\n",
 881               locked, uptodate, to_read, to_write, failed, failed_num);
 882        /* check if the array has lost two devices and, if so, some requests might
 883         * need to be failed
 884         */
 885        if (failed > 1 && to_read+to_write+written) {
 886                for (i=disks; i--; ) {
                            /* to_write / written / to_read count disks with a
                             * non-empty chain, hence a single decrement per disk
                             * before the chain is drained onto return_fail */
 887                        /* fail all writes first */
 888                        if (sh->bh_write[i]) to_write--;
 889                        while ((bh = sh->bh_write[i])) {
 890                                sh->bh_write[i] = bh->b_reqnext;
 891                                bh->b_reqnext = return_fail;
 892                                return_fail = bh;
 893                        }
 894                        /* and fail all 'written' */
 895                        if (sh->bh_written[i]) written--;
 896                        while ((bh = sh->bh_written[i])) {
 897                                sh->bh_written[i] = bh->b_reqnext;
 898                                bh->b_reqnext = return_fail;
 899                                return_fail = bh;
 900                        }
 901
 902                        /* fail any reads if this device is non-operational */
 903                        if (!conf->disks[i].operational) {
 904                                spin_lock_irq(&conf->device_lock);
 905                                if (sh->bh_read[i]) to_read--;
 906                                while ((bh = sh->bh_read[i])) {
 907                                        sh->bh_read[i] = bh->b_reqnext;
 908                                        bh->b_reqnext = return_fail;
 909                                        return_fail = bh;
 910                                }
 911                                spin_unlock_irq(&conf->device_lock);
 912                        }
 913                }
 914        }
            /* with more than one failed device the sync of this stripe is given
             * up: the remaining sectors are reported via md_done_sync() with a
             * last argument of 0 (the success path at the end of the locked
             * section passes 1 -- presumably an "ok" flag, confirm in md.c) */
 915        if (failed > 1 && syncing) {
 916                md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,0);
 917                clear_bit(STRIPE_SYNCING, &sh->state);
 918                syncing = 0;
 919        }
 920
 921        /* might be able to return some write requests if the parity block
 922         * is safe, or on a failed drive
 923         */
 924        bh = sh->bh_cache[sh->pd_idx];
 925        if ( written &&
 926             ( (conf->disks[sh->pd_idx].operational && !buffer_locked(bh) && buffer_uptodate(bh))
 927               || (failed == 1 && failed_num == sh->pd_idx))
 928            ) {
 929            /* any written block on a uptodate or failed drive can be returned */
 930            for (i=disks; i--; )
 931                if (sh->bh_written[i]) {
 932                    bh = sh->bh_cache[i];
 933                    if (!conf->disks[sh->pd_idx].operational ||
 934                        (!buffer_locked(bh) && buffer_uptodate(bh)) ) {
 935                        /* maybe we can return some write requests */
 936                        struct buffer_head *wbh, *wbh2;
 937                        PRINTK("Return write for disc %d\n", i);
 938                        wbh = sh->bh_written[i];
 939                        sh->bh_written[i] = NULL;
 940                        while (wbh) {
 941                            wbh2 = wbh->b_reqnext;
 942                            wbh->b_reqnext = return_ok;
 943                            return_ok = wbh;
 944                            wbh = wbh2;
 945                        }
 946                    }
 947                }
 948        }
 949                
 950        /* Now we might consider reading some blocks, either to check/generate
 951         * parity, or to satisfy requests
 952         */
 953        if (to_read || (syncing && (uptodate < disks))) {
 954                for (i=disks; i--;) {
 955                        bh = sh->bh_cache[i];
                            /* a block is wanted if it is neither locked nor uptodate
                             * and it is requested directly, or we are syncing, or a
                             * read is pending on the failed disk */
 956                        if (!buffer_locked(bh) && !buffer_uptodate(bh) &&
 957                            (sh->bh_read[i] || syncing || (failed && sh->bh_read[failed_num]))) {
 958                                /* we would like to get this block, possibly
 959                                 * by computing it, but we might not be able to
 960                                 */
                                    /* exactly one block is not uptodate (this one), so
                                     * compute_block() is used instead of disk I/O;
                                     * presumably it rebuilds block i from the others --
                                     * compute_block() is defined outside this view */
 961                                if (uptodate == disks-1) {
 962                                        PRINTK("Computing block %d\n", i);
 963                                        compute_block(sh, i);
 964                                        uptodate++;
 965                                } else if (conf->disks[i].operational) {
 966                                        set_bit(BH_Lock, &bh->b_state);
 967                                        action[i] = READ+1;
 968                                        /* if I am just reading this block and we don't have
 969                                           a failed drive, or any pending writes then sidestep the cache */
 970                                        if (sh->bh_page[i]) BUG();
                                            /* sidestep: save the cache page in bh_page[i]
                                             * and point the cache buffer at the single
                                             * requester's page/data, so the read lands in
                                             * the caller's buffer.  NOTE(review): the page
                                             * is presumably restored in the read completion
                                             * handler -- not visible here, confirm */
 971                                        if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
 972                                            ! syncing && !failed && !to_write) {
 973                                                sh->bh_page[i] = sh->bh_cache[i]->b_page;
 974                                                sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page;
 975                                                sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data;
 976                                        }
 977                                        locked++;
 978                                        PRINTK("Reading block %d (sync=%d)\n", i, syncing);
 979                                        if (syncing)
 980                                                md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
 981                                }
 982                        }
 983                }
 984                set_bit(STRIPE_HANDLE, &sh->state);
 985        }
 986
 987        /* now to consider writing and what else, if anything should be read */
 988        if (to_write) {
                    /* rmw / rcw: number of blocks that would have to be read first
                     * for read-modify-write resp. reconstruct-write.  A block on a
                     * non-operational disk adds 2*disks, which is larger than any
                     * count of readable blocks, so that method loses the comparison */
 989                int rmw=0, rcw=0;
 990                for (i=disks ; i--;) {
 991                        /* would I have to read this buffer for read_modify_write */
 992                        bh = sh->bh_cache[i];
 993                        if ((sh->bh_write[i] || i == sh->pd_idx) &&
 994                            (!buffer_locked(bh) || sh->bh_page[i]) &&
 995                            !buffer_uptodate(bh)) {
 996                                if (conf->disks[i].operational 
 997/*                                  && !(conf->resync_parity && i == sh->pd_idx) */
 998                                        )
 999                                        rmw++;
1000                                else rmw += 2*disks;  /* cannot read it */
1001                        }
1002                        /* Would I have to read this buffer for reconstruct_write */
1003                        if (!sh->bh_write[i] && i != sh->pd_idx &&
1004                            (!buffer_locked(bh) || sh->bh_page[i]) &&
1005                            !buffer_uptodate(bh)) {
1006                                if (conf->disks[i].operational) rcw++;
1007                                else rcw += 2*disks;
1008                        }
1009                }
1010                PRINTK("for sector %ld, rmw=%d rcw=%d\n", sh->sector, rmw, rcw);
1011                set_bit(STRIPE_HANDLE, &sh->state);
                    /* In both branches below the pre-reads are only issued when the
                     * stripe already has STRIPE_PREREAD_ACTIVE set; otherwise the
                     * stripe is just flagged STRIPE_DELAYED and left for later */
1012                if (rmw < rcw && rmw > 0)
1013                        /* prefer read-modify-write, but need to get some data */
1014                        for (i=disks; i--;) {
1015                                bh = sh->bh_cache[i];
1016                                if ((sh->bh_write[i] || i == sh->pd_idx) &&
1017                                    !buffer_locked(bh) && !buffer_uptodate(bh) &&
1018                                    conf->disks[i].operational) {
1019                                        if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1020                                        {
1021                                                PRINTK("Read_old block %d for r-m-w\n", i);
1022                                                set_bit(BH_Lock, &bh->b_state);
1023                                                action[i] = READ+1;
1024                                                locked++;
1025                                        } else {
1026                                                set_bit(STRIPE_DELAYED, &sh->state);
1027                                                set_bit(STRIPE_HANDLE, &sh->state);
1028                                        }
1029                                }
1030                        }
1031                if (rcw <= rmw && rcw > 0)
1032                        /* want reconstruct write, but need to get some data */
1033                        for (i=disks; i--;) {
1034                                bh = sh->bh_cache[i];
1035                                if (!sh->bh_write[i]  && i != sh->pd_idx &&
1036                                    !buffer_locked(bh) && !buffer_uptodate(bh) &&
1037                                    conf->disks[i].operational) {
1038                                        if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1039                                        {
1040                                                PRINTK("Read_old block %d for Reconstruct\n", i);
1041                                                set_bit(BH_Lock, &bh->b_state);
1042                                                action[i] = READ+1;
1043                                                locked++;
1044                                        } else {
1045                                                set_bit(STRIPE_DELAYED, &sh->state);
1046                                                set_bit(STRIPE_HANDLE, &sh->state);
1047                                        }
1048                                }
1049                        }
1050                /* now if nothing is locked, and if we have enough data, we can start a write request */
1051                if (locked == 0 && (rcw == 0 ||rmw == 0)) {
1052                        PRINTK("Computing parity...\n");
                            /* reconstruct-write is chosen whenever it needs no reads */
1053                        compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1054                        /* now every locked buffer is ready to be written */
1055                        for (i=disks; i--;)
1056                                if (buffer_locked(sh->bh_cache[i])) {
1057                                        PRINTK("Writing block %d\n", i);
1058                                        locked++;
1059                                        action[i] = WRITE+1;
                                            /* the stripe is flagged INSYNC when this write
                                             * targets a non-operational disk, or targets
                                             * parity while no disk has failed */
1060                                        if (!conf->disks[i].operational
1061                                            || (i==sh->pd_idx && failed == 0))
1062                                                set_bit(STRIPE_INSYNC, &sh->state);
1063                                }
                            /* pre-reading is over for this stripe: give back its slot in
                             * preread_active_stripes and wake the raid5 thread once the
                             * counter has dropped below IO_THRESHOLD */
1064                        if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1065                                atomic_dec(&conf->preread_active_stripes);
1066                                if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
1067                                        md_wakeup_thread(conf->thread);
1068                        }
1069                }
1070        }
1071
1072        /* maybe we need to check and possibly fix the parity for this stripe
1073         * Any reads will already have been scheduled, so we just see if enough data
1074         * is available
1075         */
1076        if (syncing && locked == 0 &&
1077            !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
1078                set_bit(STRIPE_HANDLE, &sh->state);
1079                if (failed == 0) {
1080                        if (uptodate != disks)
1081                                BUG();
1082                        compute_parity(sh, CHECK_PARITY);
                            /* the parity buffer no longer counts as uptodate after the
                             * check, hence the decrement */
1083                        uptodate--;
1084                        bh = sh->bh_cache[sh->pd_idx];
                            /* all-zero test: the first 32-bit word is 0 and every byte
                             * equals the byte four positions before it */
1085                        if ((*(u32*)bh->b_data) == 0 &&
1086                            !memcmp(bh->b_data, bh->b_data+4, bh->b_size-4)) {
1087                                /* parity is correct (on disc, not in buffer any more) */
1088                                set_bit(STRIPE_INSYNC, &sh->state);
1089                        }
1090                }
1091                if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1092                        struct disk_info *spare;
                            /* with no failed disk the block to rewrite is the parity
                             * block itself; otherwise it is the failed disk's block */
1093                        if (failed==0)
1094                                failed_num = sh->pd_idx;
1095                        /* should be able to compute the missing block and write it to spare */
1096                        if (!buffer_uptodate(sh->bh_cache[failed_num])) {
1097                                if (uptodate+1 != disks)
1098                                        BUG();
1099                                compute_block(sh, failed_num);
1100                                uptodate++;
1101                        }
1102                        if (uptodate != disks)
1103                                BUG();
1104                        bh = sh->bh_cache[failed_num];
1105                        set_bit(BH_Lock, &bh->b_state);
1106                        action[failed_num] = WRITE+1;
1107                        locked++;
1108                        set_bit(STRIPE_INSYNC, &sh->state);
1109                        if (conf->disks[failed_num].operational)
1110                                md_sync_acct(conf->disks[failed_num].dev, bh->b_size>>9);
1111                        else if ((spare=conf->spare))
1112                                md_sync_acct(spare->dev, bh->b_size>>9);
1113
1114                }
1115        }
            /* sync of this stripe is finished: nothing is in flight and the
             * stripe is marked in sync */
1116        if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1117                md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,1);
1118                clear_bit(STRIPE_SYNCING, &sh->state);
1119        }
1120        
1121        
1122        spin_unlock(&sh->lock);
1123
            /* From here on sh->lock is no longer held: complete the collected
             * caller buffers, then issue the I/O recorded in action[] */
1124        while ((bh=return_ok)) {
1125                return_ok = bh->b_reqnext;
1126                bh->b_reqnext = NULL;
1127                bh->b_end_io(bh, 1);
1128        }
1129        while ((bh=return_fail)) {
1130                return_fail = bh->b_reqnext;
1131                bh->b_reqnext = NULL;
1132                bh->b_end_io(bh, 0);
1133        }
1134        for (i=disks; i-- ;) 
1135                if (action[i]) {
                            /* note: this inner 'bh' shadows the function-level one */
1136                        struct buffer_head *bh = sh->bh_cache[i];
1137                        struct disk_info *spare = conf->spare;
1138                        int skip = 0;
1139                        if (action[i] == READ+1)
1140                                bh->b_end_io = raid5_end_read_request;
1141                        else
1142                                bh->b_end_io = raid5_end_write_request;
                            /* target selection: the member disk if it is operational;
                             * for writes only, the spare if there is one; otherwise
                             * the operation is skipped */
1143                        if (conf->disks[i].operational)
1144                                bh->b_dev = conf->disks[i].dev;
1145                        else if (spare && action[i] == WRITE+1)
1146                                bh->b_dev = spare->dev;
1147                        else skip=1;
1148                        if (!skip) {
1149                                PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i);
                                    /* one stripe reference per submitted request;
                                     * presumably dropped by the end_io handlers set
                                     * above -- they are outside this view, confirm */
1150                                atomic_inc(&sh->count);
1151                                bh->b_rdev = bh->b_dev;
1152                                bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
1153                                generic_make_request(action[i]-1, bh);
1154                        } else {
1155                                PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector);
                                    /* nothing was submitted: release the buffer lock
                                     * and ask for the stripe to be handled again */
1156                                clear_bit(BH_Lock, &bh->b_state);
1157                                set_bit(STRIPE_HANDLE, &sh->state);
1158                        }
1159                }
1160}
1161
1162static inline void raid5_activate_delayed(raid5_conf_t *conf)
1163{
1164        if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
1165                while (!list_empty(&conf->delayed_list)) {
1166                        struct list_head *l = conf->delayed_list.next;
1167                        struct stripe_head *sh;
1168                        sh = list_entry(l, struct stripe_head, lru);
1169                        list_del_init(l);
1170                        clear_bit(STRIPE_DELAYED, &sh->state);
1171                        if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1172                                atomic_inc(&conf->preread_active_stripes);
1173                        list_add_tail(&sh->lru, &conf->handle_list);
1174                }
1175        }
1176}
1177static void raid5_unplug_device(void *data)
1178{
1179        raid5_conf_t *conf = (raid5_conf_t *)data;
1180        unsigned long flags;
1181
1182        spin_lock_irqsave(&conf->device_lock, flags);
1183
1184        raid5_activate_delayed(conf);
1185        
1186        conf->plugged = 0;
1187        md_wakeup_thread(conf->thread);
1188
1189        spin_unlock_irqrestore(&conf->device_lock, flags);
1190}
1191
1192static inline void raid5_plug_device(raid5_conf_t *conf)
1193{
1194        spin_lock_irq(&conf->device_lock);
1195        if (list_empty(&conf->delayed_list))
1196                if (!conf->plugged) {
1197                        conf->plugged = 1;
1198                        queue_task(&conf->plug_tq, &tq_disk);
1199                }
1200        spin_unlock_irq(&conf->device_lock);
1201}
1202
1203static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
1204{
1205        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1206        const unsigned int raid_disks = conf->raid_disks;
1207        const unsigned int data_disks = raid_disks - 1;
1208        unsigned int dd_idx, pd_idx;
1209        unsigned long new_sector;
1210        int read_ahead = 0;
1211
1212        struct stripe_head *sh;
1213
1214        if (rw == READA) {
1215                rw = READ;
1216                read_ahead=1;
1217        }
1218
1219        new_sector = raid5_compute_sector(bh->b_rsector,
1220                        raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1221
1222        PRINTK("raid5_make_request, sector %lu\n", new_sector);
1223        sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead);
1224        if (sh) {
1225                sh->pd_idx = pd_idx;
1226
1227                add_stripe_bh(sh, bh, dd_idx, rw);
1228
1229                raid5_plug_device(conf);
1230                handle_stripe(sh);
1231                release_stripe(sh);
1232        } else
1233                bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
1234        return 0;
1235}
1236
1237/*
1238 * Determine correct block size for this device.
1239 */
1240unsigned int device_bsize (kdev_t dev)
1241{
1242        unsigned int i, correct_size;
1243
1244        correct_size = BLOCK_SIZE;
1245        if (blksize_size[MAJOR(dev)]) {
1246                i = blksize_size[MAJOR(dev)][MINOR(dev)];
1247                if (i)
1248                        correct_size = i;
1249        }
1250
1251        return correct_size;
1252}
1253
1254static int raid5_sync_request (mddev_t *mddev, unsigned long sector_nr)
1255{
1256        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1257        struct stripe_head *sh;
1258        int sectors_per_chunk = conf->chunk_size >> 9;
1259        unsigned long stripe = sector_nr/sectors_per_chunk;
1260        int chunk_offset = sector_nr % sectors_per_chunk;
1261        int dd_idx, pd_idx;
1262        unsigned long first_sector;
1263        int raid_disks = conf->raid_disks;
1264        int data_disks = raid_disks-1;
1265        int redone = 0;
1266        int bufsize;
1267
1268        sh = get_active_stripe(conf, sector_nr, 0, 0);
1269        bufsize = sh->size;
1270        redone = sector_nr - sh->sector;
1271        first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
1272                + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1273        sh->pd_idx = pd_idx;
1274        spin_lock(&sh->lock);   
1275        set_bit(STRIPE_SYNCING, &sh->state);
1276        clear_bit(STRIPE_INSYNC, &sh->state);
1277        sh->sync_redone = redone;
1278        spin_unlock(&sh->lock);
1279
1280        handle_stripe(sh);
1281        release_stripe(sh);
1282
1283        return (bufsize>>9)-redone;
1284}
1285
1286/*
1287 * This is our raid5 kernel thread.
1288 *
1289 * We scan the hash table for stripes which can be handled now.
1290 * During the scan, completed stripes are saved for us by the interrupt
1291 * handler, so that they will not have to wait for our next wakeup.
1292 */
1293static void raid5d (void *data)
1294{
1295        struct stripe_head *sh;
1296        raid5_conf_t *conf = data;
1297        mddev_t *mddev = conf->mddev;
1298        int handled;
1299
1300        PRINTK("+++ raid5d active\n");
1301
1302        handled = 0;
1303
1304        if (mddev->sb_dirty)
1305                md_update_sb(mddev);
1306        md_spin_lock_irq(&conf->device_lock);
1307        while (1) {
1308                struct list_head *first;
1309
1310                if (list_empty(&conf->handle_list) &&
1311                    atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
1312                    !conf->plugged &&
1313                    !list_empty(&conf->delayed_list))
1314                        raid5_activate_delayed(conf);
1315
1316                if (list_empty(&conf->handle_list))
1317                        break;
1318
1319                first = conf->handle_list.next;
1320                sh = list_entry(first, struct stripe_head, lru);
1321
1322                list_del_init(first);
1323                atomic_inc(&sh->count);
1324                if (atomic_read(&sh->count)!= 1)
1325                        BUG();
1326                md_spin_unlock_irq(&conf->device_lock);
1327                
1328                handled++;
1329                handle_stripe(sh);
1330                release_stripe(sh);
1331
1332                md_spin_lock_irq(&conf->device_lock);
1333        }
1334        PRINTK("%d stripes handled\n", handled);
1335
1336        md_spin_unlock_irq(&conf->device_lock);
1337
1338        PRINTK("--- raid5d inactive\n");
1339}
1340
1341/*
1342 * Private kernel thread for parity reconstruction after an unclean
1343 * shutdown. Reconstruction on spare drives in case of a failed drive
1344 * is done by the generic mdsyncd.
1345 */
1346static void raid5syncd (void *data)
1347{
1348        raid5_conf_t *conf = data;
1349        mddev_t *mddev = conf->mddev;
1350
1351        if (!conf->resync_parity)
1352                return;
1353        if (conf->resync_parity == 2)
1354                return;
1355        down(&mddev->recovery_sem);
1356        if (md_do_sync(mddev,NULL)) {
1357                up(&mddev->recovery_sem);
1358                printk("raid5: resync aborted!\n");
1359                return;
1360        }
1361        conf->resync_parity = 0;
1362        up(&mddev->recovery_sem);
1363        printk("raid5: resync finished.\n");
1364}
1365
1366static int raid5_run (mddev_t *mddev)
1367{
1368        raid5_conf_t *conf;
1369        int i, j, raid_disk, memory;
1370        mdp_super_t *sb = mddev->sb;
1371        mdp_disk_t *desc;
1372        mdk_rdev_t *rdev;
1373        struct disk_info *disk;
1374        struct md_list_head *tmp;
1375        int start_recovery = 0;
1376
1377        MOD_INC_USE_COUNT;
1378
1379        if (sb->level != 5 && sb->level != 4) {
1380                printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
1381                MOD_DEC_USE_COUNT;
1382                return -EIO;
1383        }
1384
1385        mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
1386        if ((conf = mddev->private) == NULL)
1387                goto abort;
1388        memset (conf, 0, sizeof (*conf));
1389        conf->mddev = mddev;
1390
1391        if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
1392                goto abort;
1393        memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
1394
1395        conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1396        md_init_waitqueue_head(&conf->wait_for_stripe);
1397        INIT_LIST_HEAD(&conf->handle_list);
1398        INIT_LIST_HEAD(&conf->delayed_list);
1399        INIT_LIST_HEAD(&conf->inactive_list);
1400        atomic_set(&conf->active_stripes, 0);
1401        atomic_set(&conf->preread_active_stripes, 0);
1402        conf->buffer_size = PAGE_SIZE; /* good default for rebuild */
1403
1404        conf->plugged = 0;
1405        conf->plug_tq.sync = 0;
1406        conf->plug_tq.routine = &raid5_unplug_device;
1407        conf->plug_tq.data = conf;
1408
1409        PRINTK("raid5_run(md%d) called.\n", mdidx(mddev));
1410
1411        ITERATE_RDEV(mddev,rdev,tmp) {
1412                /*
1413                 * This is important -- we are using the descriptor on
1414                 * the disk only to get a pointer to the descriptor on
1415                 * the main superblock, which might be more recent.
1416                 */
1417                desc = sb->disks + rdev->desc_nr;
1418                raid_disk = desc->raid_disk;
1419                disk = conf->disks + raid_disk;
1420
1421                if (disk_faulty(desc)) {
1422                        printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
1423                        if (!rdev->faulty) {
1424                                MD_BUG();
1425                                goto abort;
1426                        }
1427                        disk->number = desc->number;
1428                        disk->raid_disk = raid_disk;
1429                        disk->dev = rdev->dev;
1430
1431                        disk->operational = 0;
1432                        disk->write_only = 0;
1433                        disk->spare = 0;
1434                        disk->used_slot = 1;
1435                        continue;
1436                }
1437                if (disk_active(desc)) {
1438                        if (!disk_sync(desc)) {
1439                                printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
1440                                MD_BUG();
1441                                goto abort;
1442                        }
1443                        if (raid_disk > sb->raid_disks) {
1444                                printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
1445                                continue;
1446                        }
1447                        if (disk->operational) {
1448                                printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
1449                                continue;
1450                        }
1451                        printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);
1452        
1453                        disk->number = desc->number;
1454                        disk->raid_disk = raid_disk;
1455                        disk->dev = rdev->dev;
1456                        disk->operational = 1;
1457                        disk->used_slot = 1;
1458
1459                        conf->working_disks++;
1460                } else {
1461                        /*
1462                         * Must be a spare disk ..
1463                         */
1464                        printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
1465                        disk->number = desc->number;
1466                        disk->raid_disk = raid_disk;
1467                        disk->dev = rdev->dev;
1468
1469                        disk->operational = 0;
1470                        disk->write_only = 0;
1471                        disk->spare = 1;
1472                        disk->used_slot = 1;
1473                }
1474        }
1475
1476        for (i = 0; i < MD_SB_DISKS; i++) {
1477                desc = sb->disks + i;
1478                raid_disk = desc->raid_disk;
1479                disk = conf->disks + raid_disk;
1480
1481                if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
1482                        !conf->disks[raid_disk].used_slot) {
1483
1484                        disk->number = desc->number;
1485                        disk->raid_disk = raid_disk;
1486                        disk->dev = MKDEV(0,0);
1487
1488                        disk->operational = 0;
1489                        disk->write_only = 0;
1490                        disk->spare = 0;
1491                        disk->used_slot = 1;
1492                }
1493        }
1494
1495        conf->raid_disks = sb->raid_disks;
1496        /*
1497         * 0 for a fully functional array, 1 for a degraded array.
1498         */
1499        conf->failed_disks = conf->raid_disks - conf->working_disks;
1500        conf->mddev = mddev;
1501        conf->chunk_size = sb->chunk_size;
1502        conf->level = sb->level;
1503        conf->algorithm = sb->layout;
1504        conf->max_nr_stripes = NR_STRIPES;
1505
1506#if 0
1507        for (i = 0; i < conf->raid_disks; i++) {
1508                if (!conf->disks[i].used_slot) {
1509                        MD_BUG();
1510                        goto abort;
1511                }
1512        }
1513#endif
1514        if (!conf->chunk_size || conf->chunk_size % 4) {
1515                printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
1516                goto abort;
1517        }
1518        if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
1519                printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
1520                goto abort;
1521        }
1522        if (conf->failed_disks > 1) {
1523                printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
1524                goto abort;
1525        }
1526
1527        if (conf->working_disks != sb->raid_disks) {
1528                printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1529                start_recovery = 1;
1530        }
1531
1532        {
1533                const char * name = "raid5d";
1534
1535                conf->thread = md_register_thread(raid5d, conf, name);
1536                if (!conf->thread) {
1537                        printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
1538                        goto abort;
1539                }
1540        }
1541
1542        memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
1543                 conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
1544        if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
1545                printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
1546                shrink_stripes(conf, conf->max_nr_stripes);
1547                goto abort;
1548        } else
1549                printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
1550
1551        /*
1552         * Regenerate the "device is in sync with the raid set" bit for
1553         * each device.
1554         */
1555        for (i = 0; i < MD_SB_DISKS ; i++) {
1556                mark_disk_nonsync(sb->disks + i);
1557                for (j = 0; j < sb->raid_disks; j++) {
1558                        if (!conf->disks[j].operational)
1559                                continue;
1560                        if (sb->disks[i].number == conf->disks[j].number)
1561                                mark_disk_sync(sb->disks + i);
1562                }
1563        }
1564        sb->active_disks = conf->working_disks;
1565
1566        if (sb->active_disks == sb->raid_disks)
1567                printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
1568        else
1569                printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
1570
1571        if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
1572                const char * name = "raid5syncd";
1573
1574                conf->resync_thread = md_register_thread(raid5syncd, conf,name);
1575                if (!conf->resync_thread) {
1576                        printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
1577                        goto abort;
1578                }
1579
1580                printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
1581                conf->resync_parity = 1;
1582                md_wakeup_thread(conf->resync_thread);
1583        }
1584
1585        print_raid5_conf(conf);
1586        if (start_recovery)
1587                md_recover_arrays();
1588        print_raid5_conf(conf);
1589
1590        /* Ok, everything is just fine now */
1591        return (0);
1592abort:
1593        if (conf) {
1594                print_raid5_conf(conf);
1595                if (conf->stripe_hashtbl)
1596                        free_pages((unsigned long) conf->stripe_hashtbl,
1597                                                        HASH_PAGES_ORDER);
1598                kfree(conf);
1599        }
1600        mddev->private = NULL;
1601        printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
1602        MOD_DEC_USE_COUNT;
1603        return -EIO;
1604}
1605
1606static int raid5_stop_resync (mddev_t *mddev)
1607{
1608        raid5_conf_t *conf = mddev_to_conf(mddev);
1609        mdk_thread_t *thread = conf->resync_thread;
1610
1611        if (thread) {
1612                if (conf->resync_parity) {
1613                        conf->resync_parity = 2;
1614                        md_interrupt_thread(thread);
1615                        printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
1616                        return 1;
1617                }
1618                return 0;
1619        }
1620        return 0;
1621}
1622
1623static int raid5_restart_resync (mddev_t *mddev)
1624{
1625        raid5_conf_t *conf = mddev_to_conf(mddev);
1626
1627        if (conf->resync_parity) {
1628                if (!conf->resync_thread) {
1629                        MD_BUG();
1630                        return 0;
1631                }
1632                printk("raid5: waking up raid5resync.\n");
1633                conf->resync_parity = 1;
1634                md_wakeup_thread(conf->resync_thread);
1635                return 1;
1636        } else
1637                printk("raid5: no restart-resync needed.\n");
1638        return 0;
1639}
1640
1641
1642static int raid5_stop (mddev_t *mddev)
1643{
1644        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1645
1646        if (conf->resync_thread)
1647                md_unregister_thread(conf->resync_thread);
1648        md_unregister_thread(conf->thread);
1649        shrink_stripes(conf, conf->max_nr_stripes);
1650        free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
1651        kfree(conf);
1652        mddev->private = NULL;
1653        MOD_DEC_USE_COUNT;
1654        return 0;
1655}
1656
1657#if RAID5_DEBUG
1658static void print_sh (struct stripe_head *sh)
1659{
1660        int i;
1661
1662        printk("sh %lu, size %d, pd_idx %d, state %ld.\n", sh->sector, sh->size, sh->pd_idx, sh->state);
1663        printk("sh %lu,  count %d.\n", sh->sector, atomic_read(&sh->count));
1664        printk("sh %lu, ", sh->sector);
1665        for (i = 0; i < MD_SB_DISKS; i++) {
1666                if (sh->bh_cache[i])
1667                        printk("(cache%d: %p %ld) ", i, sh->bh_cache[i], sh->bh_cache[i]->b_state);
1668        }
1669        printk("\n");
1670}
1671
1672static void printall (raid5_conf_t *conf)
1673{
1674        struct stripe_head *sh;
1675        int i;
1676
1677        md_spin_lock_irq(&conf->device_lock);
1678        for (i = 0; i < NR_HASH; i++) {
1679                sh = conf->stripe_hashtbl[i];
1680                for (; sh; sh = sh->hash_next) {
1681                        if (sh->raid_conf != conf)
1682                                continue;
1683                        print_sh(sh);
1684                }
1685        }
1686        md_spin_unlock_irq(&conf->device_lock);
1687
1688        PRINTK("--- raid5d inactive\n");
1689}
1690#endif
1691
1692static void raid5_status (struct seq_file *seq, mddev_t *mddev)
1693{
1694        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1695        mdp_super_t *sb = mddev->sb;
1696        int i;
1697
1698        seq_printf (seq, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
1699        seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
1700        for (i = 0; i < conf->raid_disks; i++)
1701                seq_printf (seq, "%s", conf->disks[i].operational ? "U" : "_");
1702        seq_printf (seq, "]");
1703#if RAID5_DEBUG
1704#define D(x) \
1705        seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x))
1706        printall(conf);
1707#endif
1708
1709}
1710
1711static void print_raid5_conf (raid5_conf_t *conf)
1712{
1713        int i;
1714        struct disk_info *tmp;
1715
1716        printk("RAID5 conf printout:\n");
1717        if (!conf) {
1718                printk("(conf==NULL)\n");
1719                return;
1720        }
1721        printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
1722                 conf->working_disks, conf->failed_disks);
1723
1724#if RAID5_DEBUG
1725        for (i = 0; i < MD_SB_DISKS; i++) {
1726#else
1727        for (i = 0; i < conf->working_disks+conf->failed_disks; i++) {
1728#endif
1729                tmp = conf->disks + i;
1730                printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
1731                        i, tmp->spare,tmp->operational,
1732                        tmp->number,tmp->raid_disk,tmp->used_slot,
1733                        partition_name(tmp->dev));
1734        }
1735}
1736
1737static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
1738{
1739        int err = 0;
1740        int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
1741        raid5_conf_t *conf = mddev->private;
1742        struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
1743        mdp_super_t *sb = mddev->sb;
1744        mdp_disk_t *failed_desc, *spare_desc, *added_desc;
1745        mdk_rdev_t *spare_rdev, *failed_rdev;
1746
1747        print_raid5_conf(conf);
1748        md_spin_lock_irq(&conf->device_lock);
1749        /*
1750         * find the disk ...
1751         */
1752        switch (state) {
1753
1754        case DISKOP_SPARE_ACTIVE:
1755
1756                /*
1757                 * Find the failed disk within the RAID5 configuration ...
1758                 * (this can only be in the first conf->raid_disks part)
1759                 */
1760                for (i = 0; i < conf->raid_disks; i++) {
1761                        tmp = conf->disks + i;
1762                        if ((!tmp->operational && !tmp->spare) ||
1763                                        !tmp->used_slot) {
1764                                failed_disk = i;
1765                                break;
1766                        }
1767                }
1768                /*
1769                 * When we activate a spare disk we _must_ have a disk in
1770                 * the lower (active) part of the array to replace.
1771                 */
1772                if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
1773                        MD_BUG();
1774                        err = 1;
1775                        goto abort;
1776                }
1777                /* fall through */
1778
1779        case DISKOP_SPARE_WRITE:
1780        case DISKOP_SPARE_INACTIVE:
1781
1782                /*
1783                 * Find the spare disk ... (can only be in the 'high'
1784                 * area of the array)
1785                 */
1786                for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
1787                        tmp = conf->disks + i;
1788                        if (tmp->spare && tmp->number == (*d)->number) {
1789                                spare_disk = i;
1790                                break;
1791                        }
1792                }
1793                if (spare_disk == -1) {
1794                        MD_BUG();
1795                        err = 1;
1796                        goto abort;
1797                }
1798                break;
1799
1800        case DISKOP_HOT_REMOVE_DISK:
1801
1802                for (i = 0; i < MD_SB_DISKS; i++) {
1803                        tmp = conf->disks + i;
1804                        if (tmp->used_slot && (tmp->number == (*d)->number)) {
1805                                if (tmp->operational) {
1806                                        err = -EBUSY;
1807                                        goto abort;
1808                                }
1809                                removed_disk = i;
1810                                break;
1811                        }
1812                }
1813                if (removed_disk == -1) {
1814                        MD_BUG();
1815                        err = 1;
1816                        goto abort;
1817                }
1818                break;
1819
1820        case DISKOP_HOT_ADD_DISK:
1821
1822                for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
1823                        tmp = conf->disks + i;
1824                        if (!tmp->used_slot) {
1825                                added_disk = i;
1826                                break;
1827                        }
1828                }
1829                if (added_disk == -1) {
1830                        MD_BUG();
1831                        err = 1;
1832                        goto abort;
1833                }
1834                break;
1835        }
1836
1837        switch (state) {
1838        /*
1839         * Switch the spare disk to write-only mode:
1840         */
1841        case DISKOP_SPARE_WRITE:
1842                if (conf->spare) {
1843                        MD_BUG();
1844                        err = 1;
1845                        goto abort;
1846                }
1847                sdisk = conf->disks + spare_disk;
1848                sdisk->operational = 1;
1849                sdisk->write_only = 1;
1850                conf->spare = sdisk;
1851                break;
1852        /*
1853         * Deactivate a spare disk:
1854         */
1855        case DISKOP_SPARE_INACTIVE:
1856                sdisk = conf->disks + spare_disk;
1857                sdisk->operational = 0;
1858                sdisk->write_only = 0;
1859                /*
1860                 * Was the spare being resynced?
1861                 */
1862                if (conf->spare == sdisk)
1863                        conf->spare = NULL;
1864                break;
1865        /*
1866         * Activate (mark read-write) the (now sync) spare disk,
1867         * which means we switch it's 'raid position' (->raid_disk)
1868         * with the failed disk. (only the first 'conf->raid_disks'
1869         * slots are used for 'real' disks and we must preserve this
1870         * property)
1871         */
1872        case DISKOP_SPARE_ACTIVE:
1873                if (!conf->spare) {
1874                        MD_BUG();
1875                        err = 1;
1876                        goto abort;
1877                }
1878                sdisk = conf->disks + spare_disk;
1879                fdisk = conf->disks + failed_disk;
1880
1881                spare_desc = &sb->disks[sdisk->number];
1882                failed_desc = &sb->disks[fdisk->number];
1883
1884                if (spare_desc != *d) {
1885                        MD_BUG();
1886                        err = 1;
1887                        goto abort;
1888                }
1889
1890                if (spare_desc->raid_disk != sdisk->raid_disk) {
1891                        MD_BUG();
1892                        err = 1;
1893                        goto abort;
1894                }
1895                        
1896                if (sdisk->raid_disk != spare_disk) {
1897                        MD_BUG();
1898                        err = 1;
1899                        goto abort;
1900                }
1901
1902                if (failed_desc->raid_disk != fdisk->raid_disk) {
1903                        MD_BUG();
1904                        err = 1;
1905                        goto abort;
1906                }
1907
1908                if (fdisk->raid_disk != failed_disk) {
1909                        MD_BUG();
1910                        err = 1;
1911                        goto abort;
1912                }
1913
1914                /*
1915                 * do the switch finally
1916                 */
1917                spare_rdev = find_rdev_nr(mddev, spare_desc->number);
1918                failed_rdev = find_rdev_nr(mddev, failed_desc->number);
1919
1920                /* There must be a spare_rdev, but there may not be a
1921                 * failed_rdev.  That slot might be empty...
1922                 */
1923                spare_rdev->desc_nr = failed_desc->number;
1924                if (failed_rdev)
1925                        failed_rdev->desc_nr = spare_desc->number;
1926                
1927                xchg_values(*spare_desc, *failed_desc);
1928                xchg_values(*fdisk, *sdisk);
1929
1930                /*
1931                 * (careful, 'failed' and 'spare' are switched from now on)
1932                 *
1933                 * we want to preserve linear numbering and we want to
1934                 * give the proper raid_disk number to the now activated
1935                 * disk. (this means we switch back these values)
1936                 */
1937        
1938                xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1939                xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1940                xchg_values(spare_desc->number, failed_desc->number);
1941                xchg_values(sdisk->number, fdisk->number);
1942
1943                *d = failed_desc;
1944
1945                if (sdisk->dev == MKDEV(0,0))
1946                        sdisk->used_slot = 0;
1947
1948                /*
1949                 * this really activates the spare.
1950                 */
1951                fdisk->spare = 0;
1952                fdisk->write_only = 0;
1953
1954                /*
1955                 * if we activate a spare, we definitely replace a
1956                 * non-operational disk slot in the 'low' area of
1957                 * the disk array.
1958                 */
1959                conf->failed_disks--;
1960                conf->working_disks++;
1961                conf->spare = NULL;
1962
1963                break;
1964
1965        case DISKOP_HOT_REMOVE_DISK:
1966                rdisk = conf->disks + removed_disk;
1967
1968                if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1969                        MD_BUG();       
1970                        err = 1;
1971                        goto abort;
1972                }
1973                rdisk->dev = MKDEV(0,0);
1974                rdisk->used_slot = 0;
1975
1976                break;
1977
1978        case DISKOP_HOT_ADD_DISK:
1979                adisk = conf->disks + added_disk;
1980                added_desc = *d;
1981
1982                if (added_disk != added_desc->number) {
1983                        MD_BUG();       
1984                        err = 1;
1985                        goto abort;
1986                }
1987
1988                adisk->number = added_desc->number;
1989                adisk->raid_disk = added_desc->raid_disk;
1990                adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1991
1992                adisk->operational = 0;
1993                adisk->write_only = 0;
1994                adisk->spare = 1;
1995                adisk->used_slot = 1;
1996
1997
1998                break;
1999
2000        default:
2001                MD_BUG();       
2002                err = 1;
2003                goto abort;
2004        }
2005abort:
2006        md_spin_unlock_irq(&conf->device_lock);
2007        print_raid5_conf(conf);
2008        return err;
2009}
2010
2011static mdk_personality_t raid5_personality=
2012{
2013        name:           "raid5",
2014        make_request:   raid5_make_request,
2015        run:            raid5_run,
2016        stop:           raid5_stop,
2017        status:         raid5_status,
2018        error_handler:  raid5_error,
2019        diskop:         raid5_diskop,
2020        stop_resync:    raid5_stop_resync,
2021        restart_resync: raid5_restart_resync,
2022        sync_request:   raid5_sync_request
2023};
2024
2025static int md__init raid5_init (void)
2026{
2027        return register_md_personality (RAID5, &raid5_personality);
2028}
2029
2030static void raid5_exit (void)
2031{
2032        unregister_md_personality (RAID5);
2033}
2034
/*
 * Name raid5_init()/raid5_exit() as this module's init and exit
 * routines and declare the module's license as GPL.
 */
2035module_init(raid5_init);
2036module_exit(raid5_exit);
2037MODULE_LICENSE("GPL");
2038
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.