linux-bk/drivers/md/raid5.c
<<
>>
Prefs
   1/*
   2 * raid5.c : Multiple Devices driver for Linux
   3 *         Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
   4 *         Copyright (C) 1999, 2000 Ingo Molnar
   5 *
   6 * RAID-5 management functions.
   7 *
   8 * This program is free software; you can redistribute it and/or modify
   9 * it under the terms of the GNU General Public License as published by
  10 * the Free Software Foundation; either version 2, or (at your option)
  11 * any later version.
  12 *
  13 * You should have received a copy of the GNU General Public License
  14 * (for example /usr/src/linux/COPYING); if not, write to the Free
  15 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  16 */
  17
  18
  19#include <linux/config.h>
  20#include <linux/module.h>
  21#include <linux/slab.h>
  22#include <linux/raid/raid5.h>
  23#include <linux/bio.h>
  24#include <asm/bitops.h>
  25#include <asm/atomic.h>
  26
  27/*
  28 * Stripe cache
  29 */
  30
  31#define NR_STRIPES              256
  32#define STRIPE_SIZE             PAGE_SIZE
  33#define STRIPE_SECTORS          (STRIPE_SIZE>>9)
  34#define IO_THRESHOLD            1
  35#define HASH_PAGES              1
  36#define HASH_PAGES_ORDER        0
  37#define NR_HASH                 (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
  38#define HASH_MASK               (NR_HASH - 1)
  39#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) / STRIPE_SECTORS) & HASH_MASK])
  40
  41/*
  42 * The following can be used to debug the driver
  43 */
  44#define RAID5_DEBUG     0
  45#define RAID5_PARANOIA  1
  46#if RAID5_PARANOIA && CONFIG_SMP
  47# define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
  48#else
  49# define CHECK_DEVLOCK()
  50#endif
  51
  52#if RAID5_DEBUG
  53#define PRINTK(x...) printk(x)
  54#define inline
  55#define __inline__
  56#else
  57#define PRINTK(x...) do { } while (0)
  58#endif
  59
  60static void print_raid5_conf (raid5_conf_t *conf);
  61
  62static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
  63{
  64        if (atomic_dec_and_test(&sh->count)) {
  65                if (!list_empty(&sh->lru))
  66                        BUG();
  67                if (atomic_read(&conf->active_stripes)==0)
  68                        BUG();
  69                if (test_bit(STRIPE_HANDLE, &sh->state)) {
  70                        if (test_bit(STRIPE_DELAYED, &sh->state))
  71                                list_add_tail(&sh->lru, &conf->delayed_list);
  72                        else
  73                                list_add_tail(&sh->lru, &conf->handle_list);
  74                        md_wakeup_thread(conf->thread);
  75                } else {
  76                        if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
  77                                atomic_dec(&conf->preread_active_stripes);
  78                                if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
  79                                        md_wakeup_thread(conf->thread);
  80                        }
  81                        list_add_tail(&sh->lru, &conf->inactive_list);
  82                        atomic_dec(&conf->active_stripes);
  83                        if (!conf->inactive_blocked ||
  84                            atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
  85                                wake_up(&conf->wait_for_stripe);
  86                }
  87        }
  88}
  89static void release_stripe(struct stripe_head *sh)
  90{
  91        raid5_conf_t *conf = sh->raid_conf;
  92        unsigned long flags;
  93        
  94        spin_lock_irqsave(&conf->device_lock, flags);
  95        __release_stripe(conf, sh);
  96        spin_unlock_irqrestore(&conf->device_lock, flags);
  97}
  98
  99static void remove_hash(struct stripe_head *sh)
 100{
 101        PRINTK("remove_hash(), stripe %lu\n", sh->sector);
 102
 103        if (sh->hash_pprev) {
 104                if (sh->hash_next)
 105                        sh->hash_next->hash_pprev = sh->hash_pprev;
 106                *sh->hash_pprev = sh->hash_next;
 107                sh->hash_pprev = NULL;
 108        }
 109}
 110
 111static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
 112{
 113        struct stripe_head **shp = &stripe_hash(conf, sh->sector);
 114
 115        PRINTK("insert_hash(), stripe %lu\n",sh->sector);
 116
 117        CHECK_DEVLOCK();
 118        if ((sh->hash_next = *shp) != NULL)
 119                (*shp)->hash_pprev = &sh->hash_next;
 120        *shp = sh;
 121        sh->hash_pprev = shp;
 122}
 123
 124
 125/* find an idle stripe, make sure it is unhashed, and return it. */
 126static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
 127{
 128        struct stripe_head *sh = NULL;
 129        struct list_head *first;
 130
 131        CHECK_DEVLOCK();
 132        if (list_empty(&conf->inactive_list))
 133                goto out;
 134        first = conf->inactive_list.next;
 135        sh = list_entry(first, struct stripe_head, lru);
 136        list_del_init(first);
 137        remove_hash(sh);
 138        atomic_inc(&conf->active_stripes);
 139out:
 140        return sh;
 141}
 142
 143static void shrink_buffers(struct stripe_head *sh, int num)
 144{
 145        struct page *p;
 146        int i;
 147
 148        for (i=0; i<num ; i++) {
 149                p = sh->dev[i].page;
 150                if (!p)
 151                        continue;
 152                sh->dev[i].page = NULL;
 153                page_cache_release(p);
 154        }
 155}
 156
 157static int grow_buffers(struct stripe_head *sh, int num)
 158{
 159        int i;
 160
 161        for (i=0; i<num; i++) {
 162                struct page *page;
 163
 164                if (!(page = alloc_page(GFP_KERNEL))) {
 165                        return 1;
 166                }
 167                sh->dev[i].page = page;
 168        }
 169        return 0;
 170}
 171
 172static void raid5_build_block (struct stripe_head *sh, int i);
 173
 174static inline void init_stripe(struct stripe_head *sh, unsigned long sector, int pd_idx)
 175{
 176        raid5_conf_t *conf = sh->raid_conf;
 177        int disks = conf->raid_disks, i;
 178
 179        if (atomic_read(&sh->count) != 0)
 180                BUG();
 181        if (test_bit(STRIPE_HANDLE, &sh->state))
 182                BUG();
 183        
 184        CHECK_DEVLOCK();
 185        PRINTK("init_stripe called, stripe %lu\n", sh->sector);
 186
 187        remove_hash(sh);
 188        
 189        sh->sector = sector;
 190        sh->pd_idx = pd_idx;
 191        sh->state = 0;
 192
 193        for (i=disks; i--; ) {
 194                struct r5dev *dev = &sh->dev[i];
 195
 196                if (dev->toread || dev->towrite || dev->written ||
 197                    test_bit(R5_LOCKED, &dev->flags)) {
 198                        printk("sector=%lx i=%d %p %p %p %d\n",
 199                               sh->sector, i, dev->toread,
 200                               dev->towrite, dev->written,
 201                               test_bit(R5_LOCKED, &dev->flags));
 202                        BUG();
 203                }
 204                dev->flags = 0;
 205                raid5_build_block(sh, i);
 206        }
 207        insert_hash(conf, sh);
 208}
 209
 210static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
 211{
 212        struct stripe_head *sh;
 213
 214        CHECK_DEVLOCK();
 215        PRINTK("__find_stripe, sector %lu\n", sector);
 216        for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
 217                if (sh->sector == sector)
 218                        return sh;
 219        PRINTK("__stripe %lu not in cache\n", sector);
 220        return NULL;
 221}
 222
 223static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector, 
 224                                             int pd_idx, int noblock) 
 225{
 226        struct stripe_head *sh;
 227
 228        PRINTK("get_stripe, sector %lu\n", sector);
 229
 230        spin_lock_irq(&conf->device_lock);
 231
 232        do {
 233                sh = __find_stripe(conf, sector);
 234                if (!sh) {
 235                        if (!conf->inactive_blocked)
 236                                sh = get_free_stripe(conf);
 237                        if (noblock && sh == NULL)
 238                                break;
 239                        if (!sh) {
 240                                conf->inactive_blocked = 1;
 241                                wait_event_lock_irq(conf->wait_for_stripe,
 242                                                    !list_empty(&conf->inactive_list) &&
 243                                                    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
 244                                                     || !conf->inactive_blocked),
 245                                                    conf->device_lock);
 246                                conf->inactive_blocked = 0;
 247                        } else
 248                                init_stripe(sh, sector, pd_idx);
 249                } else {
 250                        if (atomic_read(&sh->count)) {
 251                                if (!list_empty(&sh->lru))
 252                                        BUG();
 253                        } else {
 254                                if (!test_bit(STRIPE_HANDLE, &sh->state))
 255                                        atomic_inc(&conf->active_stripes);
 256                                if (list_empty(&sh->lru))
 257                                        BUG();
 258                                list_del_init(&sh->lru);
 259                        }
 260                }
 261        } while (sh == NULL);
 262
 263        if (sh)
 264                atomic_inc(&sh->count);
 265
 266        spin_unlock_irq(&conf->device_lock);
 267        return sh;
 268}
 269
 270static int grow_stripes(raid5_conf_t *conf, int num)
 271{
 272        struct stripe_head *sh;
 273        kmem_cache_t *sc;
 274        int devs = conf->raid_disks;
 275
 276        sprintf(conf->cache_name, "md/raid5-%d", conf->mddev->__minor);
 277
 278        sc = kmem_cache_create(conf->cache_name, 
 279                               sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
 280                               0, 0, NULL, NULL);
 281        if (!sc)
 282                return 1;
 283        conf->slab_cache = sc;
 284        while (num--) {
 285                sh = kmem_cache_alloc(sc, GFP_KERNEL);
 286                if (!sh)
 287                        return 1;
 288                memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev));
 289                sh->raid_conf = conf;
 290                sh->lock = SPIN_LOCK_UNLOCKED;
 291
 292                if (grow_buffers(sh, conf->raid_disks)) {
 293                        shrink_buffers(sh, conf->raid_disks);
 294                        kmem_cache_free(sc, sh);
 295                        return 1;
 296                }
 297                /* we just created an active stripe so... */
 298                atomic_set(&sh->count, 1);
 299                atomic_inc(&conf->active_stripes);
 300                INIT_LIST_HEAD(&sh->lru);
 301                release_stripe(sh);
 302        }
 303        return 0;
 304}
 305
 306static void shrink_stripes(raid5_conf_t *conf)
 307{
 308        struct stripe_head *sh;
 309
 310        while (1) {
 311                spin_lock_irq(&conf->device_lock);
 312                sh = get_free_stripe(conf);
 313                spin_unlock_irq(&conf->device_lock);
 314                if (!sh)
 315                        break;
 316                if (atomic_read(&sh->count))
 317                        BUG();
 318                shrink_buffers(sh, conf->raid_disks);
 319                kmem_cache_free(conf->slab_cache, sh);
 320                atomic_dec(&conf->active_stripes);
 321        }
 322        kmem_cache_destroy(conf->slab_cache);
 323        conf->slab_cache = NULL;
 324}
 325
 326static void raid5_end_read_request (struct bio * bi)
 327{
 328        struct stripe_head *sh = bi->bi_private;
 329        raid5_conf_t *conf = sh->raid_conf;
 330        int disks = conf->raid_disks, i;
 331        int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
 332
 333        for (i=0 ; i<disks; i++)
 334                if (bi == &sh->dev[i].req)
 335                        break;
 336
 337        PRINTK("end_read_request %lu/%d, count: %d, uptodate %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
 338        if (i == disks) {
 339                BUG();
 340                return;
 341        }
 342
 343        if (uptodate) {
 344#if 0
 345                struct bio *bio;
 346                unsigned long flags;
 347                spin_lock_irqsave(&conf->device_lock, flags);
 348                /* we can return a buffer if we bypassed the cache or
 349                 * if the top buffer is not in highmem.  If there are
 350                 * multiple buffers, leave the extra work to
 351                 * handle_stripe
 352                 */
 353                buffer = sh->bh_read[i];
 354                if (buffer &&
 355                    (!PageHighMem(buffer->b_page)
 356                     || buffer->b_page == bh->b_page )
 357                        ) {
 358                        sh->bh_read[i] = buffer->b_reqnext;
 359                        buffer->b_reqnext = NULL;
 360                } else
 361                        buffer = NULL;
 362                spin_unlock_irqrestore(&conf->device_lock, flags);
 363                if (sh->bh_page[i]==bh->b_page)
 364                        set_buffer_uptodate(bh);
 365                if (buffer) {
 366                        if (buffer->b_page != bh->b_page)
 367                                memcpy(buffer->b_data, bh->b_data, bh->b_size);
 368                        buffer->b_end_io(buffer, 1);
 369                }
 370#else
 371                set_bit(R5_UPTODATE, &sh->dev[i].flags);
 372#endif          
 373        } else {
 374                md_error(conf->mddev, conf->disks[i].bdev);
 375                clear_bit(R5_UPTODATE, &sh->dev[i].flags);
 376        }
 377#if 0
 378        /* must restore b_page before unlocking buffer... */
 379        if (sh->bh_page[i] != bh->b_page) {
 380                bh->b_page = sh->bh_page[i];
 381                bh->b_data = page_address(bh->b_page);
 382                clear_buffer_uptodate(bh);
 383        }
 384#endif
 385        clear_bit(R5_LOCKED, &sh->dev[i].flags);
 386        set_bit(STRIPE_HANDLE, &sh->state);
 387        release_stripe(sh);
 388}
 389
 390static void raid5_end_write_request (struct bio *bi)
 391{
 392        struct stripe_head *sh = bi->bi_private;
 393        raid5_conf_t *conf = sh->raid_conf;
 394        int disks = conf->raid_disks, i;
 395        unsigned long flags;
 396        int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
 397
 398        for (i=0 ; i<disks; i++)
 399                if (bi == &sh->dev[i].req)
 400                        break;
 401
 402        PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
 403        if (i == disks) {
 404                BUG();
 405                return;
 406        }
 407
 408        spin_lock_irqsave(&conf->device_lock, flags);
 409        if (!uptodate)
 410                md_error(conf->mddev, conf->disks[i].bdev);
 411        
 412        clear_bit(R5_LOCKED, &sh->dev[i].flags);
 413        set_bit(STRIPE_HANDLE, &sh->state);
 414        __release_stripe(conf, sh);
 415        spin_unlock_irqrestore(&conf->device_lock, flags);
 416}
 417
 418
 419static unsigned long compute_blocknr(struct stripe_head *sh, int i);
 420        
 421static void raid5_build_block (struct stripe_head *sh, int i)
 422{
 423        raid5_conf_t *conf = sh->raid_conf;
 424        struct r5dev *dev = &sh->dev[i];
 425
 426        bio_init(&dev->req);
 427        dev->req.bi_io_vec = &dev->vec;
 428        dev->req.bi_vcnt++;
 429        dev->vec.bv_page = dev->page;
 430        dev->vec.bv_len = STRIPE_SIZE;
 431        dev->vec.bv_offset = 0;
 432
 433        dev->req.bi_bdev = conf->disks[i].bdev;
 434        dev->req.bi_sector = sh->sector;
 435        dev->req.bi_private = sh;
 436
 437        dev->flags = 0;
 438        if (i != sh->pd_idx)
 439                dev->sector = compute_blocknr(sh, i);
 440}
 441
 442static int error(mddev_t *mddev, struct block_device *bdev)
 443{
 444        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 445        struct disk_info *disk;
 446        int i;
 447
 448        PRINTK("raid5: error called\n");
 449
 450        for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
 451                if (disk->bdev != bdev)
 452                        continue;
 453                if (disk->operational) {
 454                        disk->operational = 0;
 455                        mddev->sb_dirty = 1;
 456                        mddev->degraded++;
 457                        conf->working_disks--;
 458                        conf->failed_disks++;
 459                        printk (KERN_ALERT
 460                                "raid5: Disk failure on %s, disabling device."
 461                                " Operation continuing on %d devices\n",
 462                                bdev_partition_name(bdev), conf->working_disks);
 463                }
 464                return 0;
 465        }
 466        /*
 467         * handle errors in spares (during reconstruction)
 468         */
 469        if (conf->spare) {
 470                disk = conf->spare;
 471                if (disk->bdev == bdev) {
 472                        printk (KERN_ALERT
 473                                "raid5: Disk failure on spare %s\n",
 474                                bdev_partition_name (bdev));
 475                        if (!conf->spare->operational) {
 476                                /* probably a SET_DISK_FAULTY ioctl */
 477                                return -EIO;
 478                        }
 479                        disk->operational = 0;
 480                        disk->write_only = 0;
 481                        conf->spare = NULL;
 482
 483                        mddev->sb_dirty = 1;
 484
 485                        return 0;
 486                }
 487        }
 488        MD_BUG();
 489        return -EIO;
 490}       
 491
 492/*
 493 * Input: a 'big' sector number,
 494 * Output: index of the data and parity disk, and the sector # in them.
 495 */
 496static unsigned long raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
 497                        unsigned int data_disks, unsigned int * dd_idx,
 498                        unsigned int * pd_idx, raid5_conf_t *conf)
 499{
 500        sector_t stripe;
 501        unsigned long chunk_number;
 502        unsigned int chunk_offset;
 503        sector_t new_sector;
 504        int sectors_per_chunk = conf->chunk_size >> 9;
 505
 506        /* First compute the information on this sector */
 507
 508        /*
 509         * Compute the chunk number and the sector offset inside the chunk
 510         */
 511        chunk_number = r_sector / sectors_per_chunk;
 512        chunk_offset = r_sector % sectors_per_chunk;
 513
 514        /*
 515         * Compute the stripe number
 516         */
 517        stripe = chunk_number / data_disks;
 518
 519        /*
 520         * Compute the data disk and parity disk indexes inside the stripe
 521         */
 522        *dd_idx = chunk_number % data_disks;
 523
 524        /*
 525         * Select the parity disk based on the user selected algorithm.
 526         */
 527        if (conf->level == 4)
 528                *pd_idx = data_disks;
 529        else switch (conf->algorithm) {
 530                case ALGORITHM_LEFT_ASYMMETRIC:
 531                        *pd_idx = data_disks - stripe % raid_disks;
 532                        if (*dd_idx >= *pd_idx)
 533                                (*dd_idx)++;
 534                        break;
 535                case ALGORITHM_RIGHT_ASYMMETRIC:
 536                        *pd_idx = stripe % raid_disks;
 537                        if (*dd_idx >= *pd_idx)
 538                                (*dd_idx)++;
 539                        break;
 540                case ALGORITHM_LEFT_SYMMETRIC:
 541                        *pd_idx = data_disks - stripe % raid_disks;
 542                        *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
 543                        break;
 544                case ALGORITHM_RIGHT_SYMMETRIC:
 545                        *pd_idx = stripe % raid_disks;
 546                        *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
 547                        break;
 548                default:
 549                        printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
 550        }
 551
 552        /*
 553         * Finally, compute the new sector number
 554         */
 555        new_sector = stripe * sectors_per_chunk + chunk_offset;
 556        return new_sector;
 557}
 558
 559
 560static sector_t compute_blocknr(struct stripe_head *sh, int i)
 561{
 562        raid5_conf_t *conf = sh->raid_conf;
 563        int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
 564        sector_t new_sector = sh->sector, check;
 565        int sectors_per_chunk = conf->chunk_size >> 9;
 566        sector_t stripe = new_sector / sectors_per_chunk;
 567        int chunk_offset = new_sector % sectors_per_chunk;
 568        int chunk_number, dummy1, dummy2, dd_idx = i;
 569        sector_t r_sector;
 570
 571        switch (conf->algorithm) {
 572                case ALGORITHM_LEFT_ASYMMETRIC:
 573                case ALGORITHM_RIGHT_ASYMMETRIC:
 574                        if (i > sh->pd_idx)
 575                                i--;
 576                        break;
 577                case ALGORITHM_LEFT_SYMMETRIC:
 578                case ALGORITHM_RIGHT_SYMMETRIC:
 579                        if (i < sh->pd_idx)
 580                                i += raid_disks;
 581                        i -= (sh->pd_idx + 1);
 582                        break;
 583                default:
 584                        printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
 585        }
 586
 587        chunk_number = stripe * data_disks + i;
 588        r_sector = chunk_number * sectors_per_chunk + chunk_offset;
 589
 590        check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
 591        if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
 592                printk("compute_blocknr: map not correct\n");
 593                return 0;
 594        }
 595        return r_sector;
 596}
 597
 598
 599
 600/*
 601 * Copy data between a page in the stripe cache, and one or more bion
 602 * The page could align with the middle of the bio, or there could be 
 603 * several bion, each with several bio_vecs, which cover part of the page
 604 * Multiple bion are linked together on bi_next.  There may be extras
 605 * at the end of this list.  We ignore them.
 606 */
 607static void copy_data(int frombio, struct bio *bio,
 608                     struct page *page,
 609                     sector_t sector)
 610{
 611        char *pa = page_address(page);
 612        struct bio_vec *bvl;
 613        int i;
 614
 615        for (;bio && bio->bi_sector < sector+STRIPE_SECTORS;
 616                bio = bio->bi_next) {
 617                int page_offset;
 618                if (bio->bi_sector >= sector)
 619                        page_offset = (signed)(bio->bi_sector - sector) * 512;
 620                else 
 621                        page_offset = (signed)(sector - bio->bi_sector) * -512;
 622                bio_for_each_segment(bvl, bio, i) {
 623                        int len = bio_iovec_idx(bio,i)->bv_len;
 624                        int clen;
 625                        int b_offset = 0;                       
 626
 627                        if (page_offset < 0) {
 628                                b_offset = -page_offset;
 629                                page_offset += b_offset;
 630                                len -= b_offset;
 631                        }
 632
 633                        if (len > 0 && page_offset + len > STRIPE_SIZE)
 634                                clen = STRIPE_SIZE - page_offset;       
 635                        else clen = len;
 636                        
 637                        if (clen > 0) {
 638                                char *ba = __bio_kmap(bio, i);
 639                                if (frombio)
 640                                        memcpy(pa+page_offset, ba+b_offset, clen);
 641                                else
 642                                        memcpy(ba+b_offset, pa+page_offset, clen);
 643                                __bio_kunmap(bio, i);
 644                        }       
 645                        if (clen < len) /* hit end of page */
 646                                break;
 647                        page_offset +=  len;
 648                }
 649        }
 650}
 651
 652#define check_xor()     do {                                            \
 653                           if (count == MAX_XOR_BLOCKS) {               \
 654                                xor_block(count, STRIPE_SIZE, ptr);     \
 655                                count = 1;                              \
 656                           }                                            \
 657                        } while(0)
 658
 659
 660static void compute_block(struct stripe_head *sh, int dd_idx)
 661{
 662        raid5_conf_t *conf = sh->raid_conf;
 663        int i, count, disks = conf->raid_disks;
 664        void *ptr[MAX_XOR_BLOCKS], *p;
 665
 666        PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx);
 667
 668        ptr[0] = page_address(sh->dev[dd_idx].page);
 669        memset(ptr[0], 0, STRIPE_SIZE);
 670        count = 1;
 671        for (i = disks ; i--; ) {
 672                if (i == dd_idx)
 673                        continue;
 674                p = page_address(sh->dev[i].page);
 675                if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
 676                        ptr[count++] = p;
 677                else
 678                        printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
 679
 680                check_xor();
 681        }
 682        if (count != 1)
 683                xor_block(count, STRIPE_SIZE, ptr);
 684        set_bit(R5_UPTODATE, &sh->dev[i].flags);
 685}
 686
 687static void compute_parity(struct stripe_head *sh, int method)
 688{
 689        raid5_conf_t *conf = sh->raid_conf;
 690        int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
 691        void *ptr[MAX_XOR_BLOCKS];
 692        struct bio *chosen[MD_SB_DISKS];
 693
 694        PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method);
 695        memset(chosen, 0, sizeof(chosen));
 696
 697        count = 1;
 698        ptr[0] = page_address(sh->dev[pd_idx].page);
 699        switch(method) {
 700        case READ_MODIFY_WRITE:
 701                if (!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags))
 702                        BUG();
 703                for (i=disks ; i-- ;) {
 704                        if (i==pd_idx)
 705                                continue;
 706                        if (sh->dev[i].towrite &&
 707                            test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
 708                                ptr[count++] = page_address(sh->dev[i].page);
 709                                chosen[i] = sh->dev[i].towrite;
 710                                sh->dev[i].towrite = NULL;
 711                                if (sh->dev[i].written) BUG();
 712                                sh->dev[i].written = chosen[i];
 713                                check_xor();
 714                        }
 715                }
 716                break;
 717        case RECONSTRUCT_WRITE:
 718                memset(ptr[0], 0, STRIPE_SIZE);
 719                for (i= disks; i-- ;)
 720                        if (i!=pd_idx && sh->dev[i].towrite) {
 721                                chosen[i] = sh->dev[i].towrite;
 722                                sh->dev[i].towrite = NULL;
 723                                if (sh->dev[i].written) BUG();
 724                                sh->dev[i].written = chosen[i];
 725                        }
 726                break;
 727        case CHECK_PARITY:
 728                break;
 729        }
 730        if (count>1) {
 731                xor_block(count, STRIPE_SIZE, ptr);
 732                count = 1;
 733        }
 734        
 735        for (i = disks; i--;)
 736                if (chosen[i]) {
 737                        sector_t sector = sh->dev[i].sector;
 738                        copy_data(1, chosen[i], sh->dev[i].page, sector);
 739
 740                        set_bit(R5_LOCKED, &sh->dev[i].flags);
 741                        set_bit(R5_UPTODATE, &sh->dev[i].flags);
 742                }
 743
 744        switch(method) {
 745        case RECONSTRUCT_WRITE:
 746        case CHECK_PARITY:
 747                for (i=disks; i--;)
 748                        if (i != pd_idx) {
 749                                ptr[count++] = page_address(sh->dev[i].page);
 750                                check_xor();
 751                        }
 752                break;
 753        case READ_MODIFY_WRITE:
 754                for (i = disks; i--;)
 755                        if (chosen[i]) {
 756                                ptr[count++] = page_address(sh->dev[i].page);
 757                                check_xor();
 758                        }
 759        }
 760        if (count != 1)
 761                xor_block(count, STRIPE_SIZE, ptr);
 762        
 763        if (method != CHECK_PARITY) {
 764                set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
 765                set_bit(R5_LOCKED,   &sh->dev[pd_idx].flags);
 766        } else
 767                clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
 768}
 769
 770/*
 771 * Each stripe/dev can have one or more bion attached.
 772 * toread/towrite point to the first in a chain. 
 773 * The bi_next chain must be in order.
 774 */
 775static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
 776{
 777        struct bio **bip;
 778        raid5_conf_t *conf = sh->raid_conf;
 779
 780        PRINTK("adding bh b#%lu to stripe s#%lu\n", bi->bi_sector, sh->sector);
 781
 782
 783        spin_lock(&sh->lock);
 784        spin_lock_irq(&conf->device_lock);
 785        if (forwrite)
 786                bip = &sh->dev[dd_idx].towrite;
 787        else
 788                bip = &sh->dev[dd_idx].toread;
 789        while (*bip && (*bip)->bi_sector < bi->bi_sector)
 790                bip = & (*bip)->bi_next;
 791/* FIXME do I need to worry about overlapping bion */
 792        if (*bip && bi->bi_next && (*bip) != bi->bi_next)
 793                BUG();
 794        if (*bip)
 795                bi->bi_next = *bip;
 796        *bip = bi;
 797        bi->bi_phys_segments ++;
 798        spin_unlock_irq(&conf->device_lock);
 799        spin_unlock(&sh->lock);
 800
 801        PRINTK("added bi b#%lu to stripe s#%lu, disk %d.\n", bi->bi_sector, sh->sector, dd_idx);
 802
 803        if (forwrite) {
 804                /* check if page is coverred */
 805                sector_t sector = sh->dev[dd_idx].sector;
 806                for (bi=sh->dev[dd_idx].towrite;
 807                     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
 808                             bi && bi->bi_sector <= sector;
 809                     bi = bi->bi_next) {
 810                        if (bi->bi_sector + (bi->bi_size>>9) >= sector)
 811                                sector = bi->bi_sector + (bi->bi_size>>9);
 812                }
 813                if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
 814                        set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
 815        }
 816}
 817
 818
 819/*
 820 * handle_stripe - do things to a stripe.
 821 *
 822 * We lock the stripe and then examine the state of various bits
 823 * to see what needs to be done.
 824 * Possible results:
 825 *    return some read request which now have data
 826 *    return some write requests which are safely on disc
 827 *    schedule a read on some buffers
 828 *    schedule a write of some buffers
 829 *    return confirmation of parity correctness
 830 *
 831 * Parity calculations are done inside the stripe lock
 832 * buffers are taken off read_list or write_list, and bh_cache buffers
 833 * get BH_Lock set before the stripe lock is released.
 834 *
 835 */
 836 
 837static void handle_stripe(struct stripe_head *sh)
 838{
 839        raid5_conf_t *conf = sh->raid_conf;
 840        int disks = conf->raid_disks;
 841        struct bio *return_bi= NULL;
 842        struct bio *bi;
 843        int action[MD_SB_DISKS];
 844        int i;
 845        int syncing;
 846        int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
 847        int failed_num=0;
 848        struct r5dev *dev;
 849
 850        PRINTK("handling stripe %ld, cnt=%d, pd_idx=%d\n", sh->sector, atomic_read(&sh->count), sh->pd_idx);
 851        memset(action, 0, sizeof(action));
 852
 853        spin_lock(&sh->lock);
 854        clear_bit(STRIPE_HANDLE, &sh->state);
 855        clear_bit(STRIPE_DELAYED, &sh->state);
 856
 857        syncing = test_bit(STRIPE_SYNCING, &sh->state);
 858        /* Now to look around and see what can be done */
 859
 860        for (i=disks; i--; ) {
 861                dev = &sh->dev[i];
 862                PRINTK("check %d: state 0x%lx read %p write %p written %p\n", i, 
 863                       dev->flags, dev->toread, dev->towrite, dev->written);
 864                /* maybe we can reply to a read */
 865                if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
 866                        struct bio *rbi, *rbi2;
 867                        PRINTK("Return read for disc %d\n", i);
 868                        spin_lock_irq(&conf->device_lock);
 869                        rbi = dev->toread;
 870                        dev->toread = NULL;
 871                        spin_unlock_irq(&conf->device_lock);
 872                        while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
 873                                copy_data(0, rbi, dev->page, dev->sector);
 874                                rbi2 = rbi->bi_next;
 875                                spin_lock_irq(&conf->device_lock);
 876                                if (--rbi->bi_phys_segments == 0) {
 877                                        rbi->bi_next = return_bi;
 878                                        return_bi = rbi;
 879                                }
 880                                spin_unlock_irq(&conf->device_lock);
 881                                rbi = rbi2;
 882                        }
 883                }
 884
 885                /* now count some things */
 886                if (test_bit(R5_LOCKED, &dev->flags)) locked++;
 887                if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
 888
 889                
 890                if (dev->toread) to_read++;
 891                if (dev->towrite) to_write++;
 892                if (dev->written) written++;
 893                if (!conf->disks[i].operational) {
 894                        failed++;
 895                        failed_num = i;
 896                }
 897        }
 898        PRINTK("locked=%d uptodate=%d to_read=%d to_write=%d failed=%d failed_num=%d\n",
 899               locked, uptodate, to_read, to_write, failed, failed_num);
 900        /* check if the array has lost two devices and, if so, some requests might
 901         * need to be failed
 902         */
 903        if (failed > 1 && to_read+to_write) {
 904                spin_lock_irq(&conf->device_lock);
 905                for (i=disks; i--; ) {
 906                        /* fail all writes first */
 907                        bi = sh->dev[i].towrite;
 908                        sh->dev[i].towrite = NULL;
 909                        if (bi) to_write--;
 910
 911                        while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
 912                                struct bio *nextbi = bi->bi_next;
 913                                clear_bit(BIO_UPTODATE, &bi->bi_flags);
 914                                if (--bi->bi_phys_segments == 0) {
 915                                        bi->bi_next = return_bi;
 916                                        return_bi = bi;
 917                                }
 918                                bi = nextbi;
 919                        }
 920                        /* fail any reads if this device is non-operational */
 921                        if (!conf->disks[i].operational) {
 922                                bi = sh->dev[i].toread;
 923                                sh->dev[i].toread = NULL;
 924                                if (bi) to_read--;
 925                                while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
 926                                        struct bio *nextbi = bi->bi_next;
 927                                        clear_bit(BIO_UPTODATE, &bi->bi_flags);
 928                                        if (--bi->bi_phys_segments == 0) {
 929                                                bi->bi_next = return_bi;
 930                                                return_bi = bi;
 931                                        }
 932                                        bi = nextbi;
 933                                }
 934                        }
 935                }
 936                spin_unlock_irq(&conf->device_lock);
 937        }
 938        if (failed > 1 && syncing) {
 939                md_done_sync(conf->mddev, STRIPE_SECTORS,0);
 940                clear_bit(STRIPE_SYNCING, &sh->state);
 941                syncing = 0;
 942        }
 943
 944        /* might be able to return some write requests if the parity block
 945         * is safe, or on a failed drive
 946         */
 947        dev = &sh->dev[sh->pd_idx];
 948        if ( written &&
 949             ( (conf->disks[sh->pd_idx].operational && !test_bit(R5_LOCKED, &dev->flags) &&
 950                test_bit(R5_UPTODATE, &dev->flags))
 951               || (failed == 1 && failed_num == sh->pd_idx))
 952            ) {
 953            /* any written block on an uptodate or failed drive can be returned */
 954            for (i=disks; i--; )
 955                if (sh->dev[i].written) {
 956                    dev = &sh->dev[i];
 957                    if (!conf->disks[sh->pd_idx].operational ||
 958                        (!test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags)) ) {
 959                        /* maybe we can return some write requests */
 960                            struct bio *wbi, *wbi2;
 961                            PRINTK("Return write for disc %d\n", i);
 962                            wbi = dev->written;
 963                            dev->written = NULL;
 964                            while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
 965                                    wbi2 = wbi->bi_next;
 966                                    if (--wbi->bi_phys_segments == 0) {
 967                                            wbi->bi_next = return_bi;
 968                                            return_bi = wbi;
 969                                    }
 970                                    wbi = wbi2;
 971                            }
 972                    }
 973                }
 974        }
 975
 976        /* Now we might consider reading some blocks, either to check/generate
 977         * parity, or to satisfy requests
 978         */
 979        if (to_read || (syncing && (uptodate+failed < disks))) {
 980                for (i=disks; i--;) {
 981                        dev = &sh->dev[i];
 982                        if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
 983                            (dev->toread || syncing || (failed && sh->dev[failed_num].toread))) {
 984                                /* we would like to get this block, possibly
 985                                 * by computing it, but we might not be able to
 986                                 */
 987                                if (uptodate == disks-1) {
 988                                        PRINTK("Computing block %d\n", i);
 989                                        compute_block(sh, i);
 990                                        uptodate++;
 991                                } else if (conf->disks[i].operational) {
 992                                        set_bit(R5_LOCKED, &dev->flags);
 993                                        action[i] = READ+1;
 994#if 0
 995                                        /* if I am just reading this block and we don't have
 996                                           a failed drive, or any pending writes then sidestep the cache */
 997                                        if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
 998                                            ! syncing && !failed && !to_write) {
 999                                                sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page;
1000                                                sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data;
1001                                        }
1002#endif
1003                                        locked++;
1004                                        PRINTK("Reading block %d (sync=%d)\n", i, syncing);
1005                                        if (syncing)
1006                                                md_sync_acct(conf->disks[i].bdev, STRIPE_SECTORS);
1007                                }
1008                        }
1009                }
1010                set_bit(STRIPE_HANDLE, &sh->state);
1011        }
1012
1013        /* now to consider writing and what else, if anything should be read */
1014        if (to_write) {
1015                int rmw=0, rcw=0;
1016                for (i=disks ; i--;) {
1017                        /* would I have to read this buffer for read_modify_write */
1018                        dev = &sh->dev[i];
1019                        if ((dev->towrite || i == sh->pd_idx) &&
1020                            (!test_bit(R5_LOCKED, &dev->flags) 
1021#if 0
1022|| sh->bh_page[i]!=bh->b_page
1023#endif
1024                                    ) &&
1025                            !test_bit(R5_UPTODATE, &dev->flags)) {
1026                                if (conf->disks[i].operational 
1027/*                                  && !(!mddev->insync && i == sh->pd_idx) */
1028                                        )
1029                                        rmw++;
1030                                else rmw += 2*disks;  /* cannot read it */
1031                        }
1032                        /* Would I have to read this buffer for reconstruct_write */
1033                        if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
1034                            (!test_bit(R5_LOCKED, &dev->flags) 
1035#if 0
1036|| sh->bh_page[i] != bh->b_page
1037#endif
1038                                    ) &&
1039                            !test_bit(R5_UPTODATE, &dev->flags)) {
1040                                if (conf->disks[i].operational) rcw++;
1041                                else rcw += 2*disks;
1042                        }
1043                }
1044                PRINTK("for sector %ld, rmw=%d rcw=%d\n", sh->sector, rmw, rcw);
1045                set_bit(STRIPE_HANDLE, &sh->state);
1046                if (rmw < rcw && rmw > 0)
1047                        /* prefer read-modify-write, but need to get some data */
1048                        for (i=disks; i--;) {
1049                                dev = &sh->dev[i];
1050                                if ((dev->towrite || i == sh->pd_idx) &&
1051                                    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1052                                    conf->disks[i].operational) {
1053                                        if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1054                                        {
1055                                                PRINTK("Read_old block %d for r-m-w\n", i);
1056                                                set_bit(R5_LOCKED, &dev->flags);
1057                                                action[i] = READ+1;
1058                                                locked++;
1059                                        } else {
1060                                                set_bit(STRIPE_DELAYED, &sh->state);
1061                                                set_bit(STRIPE_HANDLE, &sh->state);
1062                                        }
1063                                }
1064                        }
1065                if (rcw <= rmw && rcw > 0)
1066                        /* want reconstruct write, but need to get some data */
1067                        for (i=disks; i--;) {
1068                                dev = &sh->dev[i];
1069                                if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
1070                                    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1071                                    conf->disks[i].operational) {
1072                                        if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1073                                        {
1074                                                PRINTK("Read_old block %d for Reconstruct\n", i);
1075                                                set_bit(R5_LOCKED, &dev->flags);
1076                                                action[i] = READ+1;
1077                                                locked++;
1078                                        } else {
1079                                                set_bit(STRIPE_DELAYED, &sh->state);
1080                                                set_bit(STRIPE_HANDLE, &sh->state);
1081                                        }
1082                                }
1083                        }
1084                /* now if nothing is locked, and if we have enough data, we can start a write request */
1085                if (locked == 0 && (rcw == 0 ||rmw == 0)) {
1086                        PRINTK("Computing parity...\n");
1087                        compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1088                        /* now every locked buffer is ready to be written */
1089                        for (i=disks; i--;)
1090                                if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
1091                                        PRINTK("Writing block %d\n", i);
1092                                        locked++;
1093                                        action[i] = WRITE+1;
1094                                        if (!conf->disks[i].operational
1095                                            || (i==sh->pd_idx && failed == 0))
1096                                                set_bit(STRIPE_INSYNC, &sh->state);
1097                                }
1098                        if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1099                                atomic_dec(&conf->preread_active_stripes);
1100                                if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
1101                                        md_wakeup_thread(conf->thread);
1102                        }
1103                }
1104        }
1105
1106        /* maybe we need to check and possibly fix the parity for this stripe
1107         * Any reads will already have been scheduled, so we just see if enough data
1108         * is available
1109         */
1110        if (syncing && locked == 0 &&
1111            !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
1112                set_bit(STRIPE_HANDLE, &sh->state);
1113                if (failed == 0) {
1114                        char *pagea;
1115                        if (uptodate != disks)
1116                                BUG();
1117                        compute_parity(sh, CHECK_PARITY);
1118                        uptodate--;
1119                        pagea = page_address(sh->dev[sh->pd_idx].page);
1120                        if ((*(u32*)pagea) == 0 &&
1121                            !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
1122                                /* parity is correct (on disc, not in buffer any more) */
1123                                set_bit(STRIPE_INSYNC, &sh->state);
1124                        }
1125                }
1126                if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1127                        struct disk_info *spare;
1128                        if (failed==0)
1129                                failed_num = sh->pd_idx;
1130                        /* should be able to compute the missing block and write it to spare */
1131                        if (!test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)) {
1132                                if (uptodate+1 != disks)
1133                                        BUG();
1134                                compute_block(sh, failed_num);
1135                                uptodate++;
1136                        }
1137                        if (uptodate != disks)
1138                                BUG();
1139                        dev = &sh->dev[failed_num];
1140                        set_bit(R5_LOCKED, &dev->flags);
1141                        action[failed_num] = WRITE+1;
1142                        locked++;
1143                        set_bit(STRIPE_INSYNC, &sh->state);
1144                        if (conf->disks[failed_num].operational)
1145                                md_sync_acct(conf->disks[failed_num].bdev, STRIPE_SECTORS);
1146                        else if ((spare=conf->spare))
1147                                md_sync_acct(spare->bdev, STRIPE_SECTORS);
1148
1149                }
1150        }
1151        if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1152                md_done_sync(conf->mddev, STRIPE_SECTORS,1);
1153                clear_bit(STRIPE_SYNCING, &sh->state);
1154        }
1155        
1156        spin_unlock(&sh->lock);
1157
1158        while ((bi=return_bi)) {
1159                return_bi = bi->bi_next;
1160                bi->bi_next = NULL;
1161                bi->bi_end_io(bi);
1162        }
1163        for (i=disks; i-- ;) 
1164                if (action[i]) {
1165                        struct bio *bi = &sh->dev[i].req;
1166                        struct disk_info *spare = conf->spare;
1167                        int skip = 0;
1168                        if (action[i] == READ+1)
1169                                bi->bi_end_io = raid5_end_read_request;
1170                        else
1171                                bi->bi_end_io = raid5_end_write_request;
1172                        if (conf->disks[i].operational)
1173                                bi->bi_bdev = conf->disks[i].bdev;
1174                        else if (spare && action[i] == WRITE+1)
1175                                bi->bi_bdev = spare->bdev;
1176                        else skip=1;
1177                        if (!skip) {
1178                                PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i);
1179                                atomic_inc(&sh->count);
1180                                bi->bi_sector = sh->sector;
1181                                if (action[i] == READ+1) 
1182                                        bi->bi_rw = 0;
1183                                else
1184                                        bi->bi_rw = 1;
1185                                bi->bi_flags = 0;
1186                                bi->bi_vcnt = 1;        
1187                                bi->bi_idx = 0;
1188                                bi->bi_io_vec = &sh->dev[i].vec;
1189                                bi->bi_size = STRIPE_SIZE;
1190                                bi->bi_next = NULL;
1191                                generic_make_request(bi);
1192                        } else {
1193                                PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector);
1194                                clear_bit(R5_LOCKED, &dev->flags);
1195                                set_bit(STRIPE_HANDLE, &sh->state);
1196                        }
1197                }
1198}
1199
1200static inline void raid5_activate_delayed(raid5_conf_t *conf)
1201{
1202        if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
1203                while (!list_empty(&conf->delayed_list)) {
1204                        struct list_head *l = conf->delayed_list.next;
1205                        struct stripe_head *sh;
1206                        sh = list_entry(l, struct stripe_head, lru);
1207                        list_del_init(l);
1208                        clear_bit(STRIPE_DELAYED, &sh->state);
1209                        if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1210                                atomic_inc(&conf->preread_active_stripes);
1211                        list_add_tail(&sh->lru, &conf->handle_list);
1212                }
1213        }
1214}
1215static void raid5_unplug_device(void *data)
1216{
1217        request_queue_t *q = data;
1218        mddev_t *mddev = q->queuedata;
1219        raid5_conf_t *conf = mddev_to_conf(mddev);
1220        unsigned long flags;
1221
1222        spin_lock_irqsave(&conf->device_lock, flags);
1223
1224        if (blk_remove_plug(q))
1225                raid5_activate_delayed(conf);
1226        md_wakeup_thread(conf->thread);
1227
1228        spin_unlock_irqrestore(&conf->device_lock, flags);
1229}
1230
1231static inline void raid5_plug_device(raid5_conf_t *conf)
1232{
1233        spin_lock_irq(&conf->device_lock);
1234        blk_plug_device(&conf->mddev->queue);
1235        spin_unlock_irq(&conf->device_lock);
1236}
1237
1238static int make_request (request_queue_t *q, struct bio * bi)
1239{
1240        mddev_t *mddev = q->queuedata;
1241        raid5_conf_t *conf = mddev_to_conf(mddev);
1242        const unsigned int raid_disks = conf->raid_disks;
1243        const unsigned int data_disks = raid_disks - 1;
1244        unsigned int dd_idx, pd_idx;
1245        sector_t new_sector;
1246        sector_t logical_sector, last_sector;
1247        struct stripe_head *sh;
1248
1249        logical_sector = bi->bi_sector & ~(STRIPE_SECTORS-1);
1250        last_sector = bi->bi_sector + (bi->bi_size>>9);
1251
1252        bi->bi_next = NULL;
1253        set_bit(BIO_UPTODATE, &bi->bi_flags); /* will be cleared if error detected */
1254        bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
1255        for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
1256                
1257                new_sector = raid5_compute_sector(logical_sector,
1258                                                  raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1259
1260                PRINTK("raid5: make_request, sector %ul logical %ul\n", 
1261                       new_sector, logical_sector);
1262
1263                sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
1264                if (sh) {
1265
1266                        add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
1267
1268                        raid5_plug_device(conf);
1269                        handle_stripe(sh);
1270                        release_stripe(sh);
1271                }
1272        }
1273        spin_lock_irq(&conf->device_lock);
1274        if (--bi->bi_phys_segments == 0) 
1275                bi->bi_end_io(bi);
1276        spin_unlock_irq(&conf->device_lock);
1277        return 0;
1278}
1279
1280/* FIXME go_faster isn't used */
1281static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
1282{
1283        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1284        struct stripe_head *sh;
1285        int sectors_per_chunk = conf->chunk_size >> 9;
1286        unsigned long stripe = sector_nr/sectors_per_chunk;
1287        int chunk_offset = sector_nr % sectors_per_chunk;
1288        int dd_idx, pd_idx;
1289        unsigned long first_sector;
1290        int raid_disks = conf->raid_disks;
1291        int data_disks = raid_disks-1;
1292
1293        if (sector_nr >= mddev->size <<1)
1294                /* just being told to finish up .. nothing to do */
1295                return 0;
1296
1297        first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
1298                + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1299        sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
1300        spin_lock(&sh->lock);   
1301        set_bit(STRIPE_SYNCING, &sh->state);
1302        clear_bit(STRIPE_INSYNC, &sh->state);
1303        spin_unlock(&sh->lock);
1304
1305        handle_stripe(sh);
1306        release_stripe(sh);
1307
1308        return STRIPE_SECTORS;
1309}
1310
1311/*
1312 * This is our raid5 kernel thread.
1313 *
1314 * We scan the hash table for stripes which can be handled now.
1315 * During the scan, completed stripes are saved for us by the interrupt
1316 * handler, so that they will not have to wait for our next wakeup.
1317 */
1318static void raid5d (void *data)
1319{
1320        struct stripe_head *sh;
1321        raid5_conf_t *conf = data;
1322        mddev_t *mddev = conf->mddev;
1323        int handled;
1324
1325        PRINTK("+++ raid5d active\n");
1326
1327        handled = 0;
1328        spin_lock_irq(&conf->device_lock);
1329        while (1) {
1330                struct list_head *first;
1331
1332                if (list_empty(&conf->handle_list) &&
1333                    atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
1334                    !blk_queue_plugged(&mddev->queue) &&
1335                    !list_empty(&conf->delayed_list))
1336                        raid5_activate_delayed(conf);
1337
1338                if (list_empty(&conf->handle_list))
1339                        break;
1340
1341                first = conf->handle_list.next;
1342                sh = list_entry(first, struct stripe_head, lru);
1343
1344                list_del_init(first);
1345                atomic_inc(&sh->count);
1346                if (atomic_read(&sh->count)!= 1)
1347                        BUG();
1348                spin_unlock_irq(&conf->device_lock);
1349                
1350                handled++;
1351                handle_stripe(sh);
1352                release_stripe(sh);
1353
1354                spin_lock_irq(&conf->device_lock);
1355        }
1356        PRINTK("%d stripes handled\n", handled);
1357
1358        spin_unlock_irq(&conf->device_lock);
1359
1360        PRINTK("--- raid5d inactive\n");
1361}
1362
1363static int run (mddev_t *mddev)
1364{
1365        raid5_conf_t *conf;
1366        int i, raid_disk, memory;
1367        mdk_rdev_t *rdev;
1368        struct disk_info *disk;
1369        struct list_head *tmp;
1370
1371        MOD_INC_USE_COUNT;
1372
1373        if (mddev->level != 5 && mddev->level != 4) {
1374                printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), mddev->level);
1375                MOD_DEC_USE_COUNT;
1376                return -EIO;
1377        }
1378
1379        mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
1380        if ((conf = mddev->private) == NULL)
1381                goto abort;
1382        memset (conf, 0, sizeof (*conf));
1383        conf->mddev = mddev;
1384
1385        if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
1386                goto abort;
1387        memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
1388
1389        conf->device_lock = SPIN_LOCK_UNLOCKED;
1390        init_waitqueue_head(&conf->wait_for_stripe);
1391        INIT_LIST_HEAD(&conf->handle_list);
1392        INIT_LIST_HEAD(&conf->delayed_list);
1393        INIT_LIST_HEAD(&conf->inactive_list);
1394        atomic_set(&conf->active_stripes, 0);
1395        atomic_set(&conf->preread_active_stripes, 0);
1396
1397        mddev->queue.unplug_fn = raid5_unplug_device;
1398
1399        PRINTK("raid5: run(md%d) called.\n", mdidx(mddev));
1400
1401        ITERATE_RDEV(mddev,rdev,tmp) {
1402                /*
1403                 * This is important -- we are using the descriptor on
1404                 * the disk only to get a pointer to the descriptor on
1405                 * the main superblock, which might be more recent.
1406                 */
1407                raid_disk = rdev->raid_disk;
1408                disk = conf->disks + raid_disk;
1409
1410                if (rdev->faulty) {
1411                        printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", bdev_partition_name(rdev->bdev));
1412                        disk->bdev = rdev->bdev;
1413
1414                        disk->operational = 0;
1415                        disk->write_only = 0;
1416                        disk->spare = 0;
1417                        disk->used_slot = 1;
1418                        continue;
1419                }
1420                if (rdev->in_sync) {
1421                        if (disk->operational) {
1422                                printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", bdev_partition_name(rdev->bdev), raid_disk);
1423                                continue;
1424                        }
1425                        printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", bdev_partition_name(rdev->bdev), raid_disk);
1426        
1427                        disk->bdev = rdev->bdev;
1428                        disk->operational = 1;
1429                        disk->used_slot = 1;
1430
1431                        conf->working_disks++;
1432                } else {
1433                        /*
1434                         * Must be a spare disk ..
1435                         */
1436                        printk(KERN_INFO "raid5: spare disk %s\n", bdev_partition_name(rdev->bdev));
1437                        disk->bdev = rdev->bdev;
1438
1439                        disk->operational = 0;
1440                        disk->write_only = 0;
1441                        disk->spare = 1;
1442                        disk->used_slot = 1;
1443                }
1444        }
1445
1446        for (i = 0; i < conf->raid_disks; i++) {
1447                disk = conf->disks + i;
1448
1449                if (!disk->used_slot) {
1450                        disk->bdev = NULL;
1451
1452                        disk->operational = 0;
1453                        disk->write_only = 0;
1454                        disk->spare = 0;
1455                        disk->used_slot = 1;
1456                }
1457        }
1458
1459        conf->raid_disks = mddev->raid_disks;
1460        /*
1461         * 0 for a fully functional array, 1 for a degraded array.
1462         */
1463        mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
1464        conf->mddev = mddev;
1465        conf->chunk_size = mddev->chunk_size;
1466        conf->level = mddev->level;
1467        conf->algorithm = mddev->layout;
1468        conf->max_nr_stripes = NR_STRIPES;
1469
1470#if 0
1471        for (i = 0; i < conf->raid_disks; i++) {
1472                if (!conf->disks[i].used_slot) {
1473                        MD_BUG();
1474                        goto abort;
1475                }
1476        }
1477#endif
1478        if (!conf->chunk_size || conf->chunk_size % 4) {
1479                printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
1480                goto abort;
1481        }
1482        if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
1483                printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
1484                goto abort;
1485        }
1486        if (mddev->degraded > 1) {
1487                printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
1488                goto abort;
1489        }
1490
1491        if (mddev->degraded == 1 &&
1492            !(mddev->state & (1<<MD_SB_CLEAN))) {
1493                printk(KERN_ERR "raid5: cannot start dirty degraded array for md%d\n", mdidx(mddev));
1494                goto abort;
1495        }
1496
1497        {
1498                const char * name = "raid5d";
1499
1500                conf->thread = md_register_thread(raid5d, conf, name);
1501                if (!conf->thread) {
1502                        printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
1503                        goto abort;
1504                }
1505        }
1506
1507        memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
1508                 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
1509        if (grow_stripes(conf, conf->max_nr_stripes)) {
1510                printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
1511                shrink_stripes(conf);
1512                md_unregister_thread(conf->thread);
1513                goto abort;
1514        } else
1515                printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
1516
1517        if (mddev->degraded == 0)
1518                printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), 
1519                       mddev->raid_disks-mddev->degraded, mddev->raid_disks, conf->algorithm);
1520        else
1521                printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev),
1522                       mddev->raid_disks = mddev->degraded, mddev->raid_disks, conf->algorithm);
1523
1524        print_raid5_conf(conf);
1525
1526        /* Ok, everything is just fine now */
1527        return (0);
1528abort:
1529        if (conf) {
1530                print_raid5_conf(conf);
1531                if (conf->stripe_hashtbl)
1532                        free_pages((unsigned long) conf->stripe_hashtbl,
1533                                                        HASH_PAGES_ORDER);
1534                kfree(conf);
1535        }
1536        mddev->private = NULL;
1537        printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
1538        MOD_DEC_USE_COUNT;
1539        return -EIO;
1540}
1541
1542
1543
1544static int stop (mddev_t *mddev)
1545{
1546        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1547
1548        md_unregister_thread(conf->thread);
1549        shrink_stripes(conf);
1550        free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
1551        kfree(conf);
1552        mddev->private = NULL;
1553        MOD_DEC_USE_COUNT;
1554        return 0;
1555}
1556
1557#if RAID5_DEBUG
1558static void print_sh (struct stripe_head *sh)
1559{
1560        int i;
1561
1562        printk("sh %lu, pd_idx %d, state %ld.\n", sh->sector, sh->pd_idx, sh->state);
1563        printk("sh %lu,  count %d.\n", sh->sector, atomic_read(&sh->count));
1564        printk("sh %lu, ", sh->sector);
1565        for (i = 0; i < sh->raid_conf->raid_disks; i++) {
1566                printk("(cache%d: %p %ld) ", i, sh->dev[i].page, sh->dev[i].flags);
1567        }
1568        printk("\n");
1569}
1570
1571static void printall (raid5_conf_t *conf)
1572{
1573        struct stripe_head *sh;
1574        int i;
1575
1576        spin_lock_irq(&conf->device_lock);
1577        for (i = 0; i < NR_HASH; i++) {
1578                sh = conf->stripe_hashtbl[i];
1579                for (; sh; sh = sh->hash_next) {
1580                        if (sh->raid_conf != conf)
1581                                continue;
1582                        print_sh(sh);
1583                }
1584        }
1585        spin_unlock_irq(&conf->device_lock);
1586
1587        PRINTK("--- raid5d inactive\n");
1588}
1589#endif
1590
1591static int status (char *page, mddev_t *mddev)
1592{
1593        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1594        int sz = 0, i;
1595
1596        sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
1597        sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
1598        for (i = 0; i < conf->raid_disks; i++)
1599                sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_");
1600        sz += sprintf (page+sz, "]");
1601#if RAID5_DEBUG
1602#define D(x) \
1603        sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x))
1604        printall(conf);
1605#endif
1606        return sz;
1607}
1608
1609static void print_raid5_conf (raid5_conf_t *conf)
1610{
1611        int i;
1612        struct disk_info *tmp;
1613
1614        printk("RAID5 conf printout:\n");
1615        if (!conf) {
1616                printk("(conf==NULL)\n");
1617                return;
1618        }
1619        printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
1620                 conf->working_disks, conf->failed_disks);
1621
1622#if RAID5_DEBUG
1623        for (i = 0; i < MD_SB_DISKS; i++) {
1624#else
1625        for (i = 0; i < conf->working_disks+conf->failed_disks; i++) {
1626#endif
1627                tmp = conf->disks + i;
1628                printk(" disk %d, s:%d, o:%d, us:%d dev:%s\n",
1629                        i, tmp->spare,tmp->operational,
1630                        tmp->used_slot,
1631                        bdev_partition_name(tmp->bdev));
1632        }
1633}
1634
1635static int raid5_spare_active(mddev_t *mddev)
1636{
1637        int err = 0;
1638        int i, failed_disk=-1, spare_disk=-1;
1639        raid5_conf_t *conf = mddev->private;
1640        struct disk_info *tmp, *sdisk, *fdisk;
1641        mdk_rdev_t *spare_rdev, *failed_rdev;
1642
1643        print_raid5_conf(conf);
1644        spin_lock_irq(&conf->device_lock);
1645        for (i = 0; i < conf->raid_disks; i++) {
1646                tmp = conf->disks + i;
1647                if ((!tmp->operational && !tmp->spare) ||
1648                                !tmp->used_slot) {
1649                        failed_disk = i;
1650                        break;
1651                }
1652        }
1653        if (failed_disk == -1) {
1654                MD_BUG();
1655                err = 1;
1656                goto abort;
1657        }
1658        /*
1659         * Find the spare disk ... (can only be in the 'high'
1660         * area of the array)
1661         */
1662        spare_disk = mddev->spare->raid_disk;
1663
1664        if (!conf->spare) {
1665                MD_BUG();
1666                err = 1;
1667                goto abort;
1668        }
1669        sdisk = conf->disks + spare_disk;
1670        fdisk = conf->disks + failed_disk;
1671
1672        /*
1673         * do the switch finally
1674         */
1675        spare_rdev = find_rdev_nr(mddev, spare_disk);
1676        failed_rdev = find_rdev_nr(mddev, failed_disk);
1677
1678        /* There must be a spare_rdev, but there may not be a
1679         * failed_rdev.  That slot might be empty...
1680         */
1681        spare_rdev->desc_nr = failed_disk;
1682        spare_rdev->raid_disk = failed_disk;
1683        if (failed_rdev) {
1684                failed_rdev->desc_nr = spare_disk;
1685                failed_rdev->raid_disk = spare_disk;
1686        }
1687        
1688        xchg_values(*fdisk, *sdisk);
1689
1690        /*
1691         * (careful, 'failed' and 'spare' are switched from now on)
1692         *
1693         * we want to preserve linear numbering and we want to
1694         * give the proper raid_disk number to the now activated
1695         * disk. (this means we switch back these values)
1696         */
1697
1698        if (!sdisk->bdev)
1699                sdisk->used_slot = 0;
1700
1701        /*
1702         * this really activates the spare.
1703         */
1704        fdisk->spare = 0;
1705        fdisk->write_only = 0;
1706
1707        /*
1708         * if we activate a spare, we definitely replace a
1709         * non-operational disk slot in the 'low' area of
1710         * the disk array.
1711         */
1712        mddev->degraded--;
1713        conf->failed_disks--;
1714        conf->working_disks++;
1715        conf->spare = NULL;
1716abort:
1717        spin_unlock_irq(&conf->device_lock);
1718        print_raid5_conf(conf);
1719        return err;
1720}
1721
1722static int raid5_spare_inactive(mddev_t *mddev)
1723{
1724        raid5_conf_t *conf = mddev->private;
1725        struct disk_info *p;
1726        int err = 0;
1727
1728        print_raid5_conf(conf);
1729        spin_lock_irq(&conf->device_lock);
1730        p = conf->disks + mddev->spare->raid_disk;
1731        if (p) {
1732                p->operational = 0;
1733                p->write_only = 0;
1734                if (conf->spare == p)
1735                        conf->spare = NULL;
1736        } else {
1737                MD_BUG();
1738                err = 1;
1739        }
1740        spin_unlock_irq(&conf->device_lock);
1741        print_raid5_conf(conf);
1742        return err;
1743}
1744
1745static int raid5_spare_write(mddev_t *mddev)
1746{
1747        raid5_conf_t *conf = mddev->private;
1748        struct disk_info *p;
1749        int err = 0;
1750
1751        print_raid5_conf(conf);
1752        spin_lock_irq(&conf->device_lock);
1753        p = conf->disks + mddev->spare->raid_disk;
1754        if (p && !conf->spare) {
1755                p->operational = 1;
1756                p->write_only = 1;
1757                conf->spare = p;
1758        } else {
1759                MD_BUG();
1760                err = 1;
1761        }
1762        spin_unlock_irq(&conf->device_lock);
1763        print_raid5_conf(conf);
1764        return err;
1765}
1766
1767static int raid5_remove_disk(mddev_t *mddev, int number)
1768{
1769        raid5_conf_t *conf = mddev->private;
1770        int err = 1;
1771        struct disk_info *p = conf->disks + number;
1772
1773        print_raid5_conf(conf);
1774        spin_lock_irq(&conf->device_lock);
1775
1776        if (p->used_slot) {
1777                if (p->operational) {
1778                        err = -EBUSY;
1779                        goto abort;
1780                }
1781                p->bdev = NULL;
1782                p->used_slot = 0;
1783                err = 0;
1784        }
1785        if (err)
1786                MD_BUG();
1787abort:
1788        spin_unlock_irq(&conf->device_lock);
1789        print_raid5_conf(conf);
1790        return err;
1791}
1792
1793static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1794{
1795        raid5_conf_t *conf = mddev->private;
1796        int err = 1;
1797        struct disk_info *p = conf->disks + rdev->raid_disk;
1798
1799        print_raid5_conf(conf);
1800        spin_lock_irq(&conf->device_lock);
1801        /*
1802         * find the disk ...
1803         */
1804
1805        if (!p->used_slot) {
1806                /* it will be held open by rdev */
1807                p->bdev = rdev->bdev;
1808                p->operational = 0;
1809                p->write_only = 0;
1810                p->spare = 1;
1811                p->used_slot = 1;
1812                err = 0;
1813        }
1814        if (err)
1815                MD_BUG();
1816        spin_unlock_irq(&conf->device_lock);
1817        print_raid5_conf(conf);
1818        return err;
1819}
1820
1821static mdk_personality_t raid5_personality=
1822{
1823        .name           = "raid5",
1824        .make_request   = make_request,
1825        .run            = run,
1826        .stop           = stop,
1827        .status         = status,
1828        .error_handler  = error,
1829        .hot_add_disk   = raid5_add_disk,
1830        .hot_remove_disk= raid5_remove_disk,
1831        .spare_write    = raid5_spare_write,
1832        .spare_inactive = raid5_spare_inactive,
1833        .spare_active   = raid5_spare_active,
1834        .sync_request   = sync_request,
1835};
1836
1837static int __init raid5_init (void)
1838{
1839        return register_md_personality (RAID5, &raid5_personality);
1840}
1841
1842static void raid5_exit (void)
1843{
1844        unregister_md_personality (RAID5);
1845}
1846
1847module_init(raid5_init);
1848module_exit(raid5_exit);
1849MODULE_LICENSE("GPL");
1850
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.