linux/drivers/md/raid5.c
<<
>>
Prefs
   1/*
   2 * raid5.c : Multiple Devices driver for Linux
   3 *         Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
   4 *         Copyright (C) 1999, 2000 Ingo Molnar
   5 *         Copyright (C) 2002, 2003 H. Peter Anvin
   6 *
   7 * RAID-4/5/6 management functions.
   8 * Thanks to Penguin Computing for making the RAID-6 development possible
   9 * by donating a test server!
  10 *
  11 * This program is free software; you can redistribute it and/or modify
  12 * it under the terms of the GNU General Public License as published by
  13 * the Free Software Foundation; either version 2, or (at your option)
  14 * any later version.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * (for example /usr/src/linux/COPYING); if not, write to the Free
  18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19 */
  20
  21/*
  22 * BITMAP UNPLUGGING:
  23 *
  24 * The sequencing for updating the bitmap reliably is a little
  25 * subtle (and I got it wrong the first time) so it deserves some
  26 * explanation.
  27 *
  28 * We group bitmap updates into batches.  Each batch has a number.
  29 * We may write out several batches at once, but that isn't very important.
  30 * conf->seq_write is the number of the last batch successfully written.
  31 * conf->seq_flush is the number of the last batch that was closed to
  32 *    new additions.
  33 * When we discover that we will need to write to any block in a stripe
  34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
  35 * the number of the batch it will be in. This is seq_flush+1.
  36 * When we are ready to do a write, if that batch hasn't been written yet,
  37 *   we plug the array and queue the stripe for later.
  38 * When an unplug happens, we increment seq_flush, thus closing the current
  39 *   batch.
  40 * When we notice that seq_flush > seq_write, we write out all pending updates
  41 * to the bitmap, and advance seq_write to where seq_flush was.
  42 * This may occasionally write a bit out twice, but is sure never to
  43 * miss any bits.
  44 */
  45
  46#include <linux/kthread.h>
  47#include "raid6.h"
  48
  49#include <linux/raid/bitmap.h>
  50#include <linux/async_tx.h>
  51
  52/*
  53 * Stripe cache
  54 */
  55
  56#define NR_STRIPES              256
  57#define STRIPE_SIZE             PAGE_SIZE
  58#define STRIPE_SHIFT            (PAGE_SHIFT - 9)
  59#define STRIPE_SECTORS          (STRIPE_SIZE>>9)
  60#define IO_THRESHOLD            1
  61#define BYPASS_THRESHOLD        1
  62#define NR_HASH                 (PAGE_SIZE / sizeof(struct hlist_head))
  63#define HASH_MASK               (NR_HASH - 1)
  64
  65#define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
  66
  67/* bio's attached to a stripe+device for I/O are linked together in bi_sector
  68 * order without overlap.  There may be several bio's per stripe+device, and
  69 * a bio could span several devices.
  70 * When walking this list for a particular stripe+device, we must never proceed
  71 * beyond a bio that extends past this device, as the next bio might no longer
  72 * be valid.
  73 * This macro is used to determine the 'next' bio in the list, given the sector
  74 * of the current stripe+device
  75 */
  76#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
  77/*
  78 * The following can be used to debug the driver
  79 */
  80#define RAID5_PARANOIA  1
  81#if RAID5_PARANOIA && defined(CONFIG_SMP)
  82# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
  83#else
  84# define CHECK_DEVLOCK()
  85#endif
  86
  87#ifdef DEBUG
  88#define inline
  89#define __inline__
  90#endif
  91
  92#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
  93
  94#if !RAID6_USE_EMPTY_ZERO_PAGE
  95/* In .bss so it's zeroed */
  96const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
  97#endif
  98
  99/*
 100 * We maintain a biased count of active stripes in the bottom 16 bits of
 101 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 102 */
 103static inline int raid5_bi_phys_segments(struct bio *bio)
 104{
 105        return bio->bi_phys_segments & 0xffff;
 106}
 107
 108static inline int raid5_bi_hw_segments(struct bio *bio)
 109{
 110        return (bio->bi_phys_segments >> 16) & 0xffff;
 111}
 112
 113static inline int raid5_dec_bi_phys_segments(struct bio *bio)
 114{
 115        --bio->bi_phys_segments;
 116        return raid5_bi_phys_segments(bio);
 117}
 118
 119static inline int raid5_dec_bi_hw_segments(struct bio *bio)
 120{
 121        unsigned short val = raid5_bi_hw_segments(bio);
 122
 123        --val;
 124        bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
 125        return val;
 126}
 127
 128static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
 129{
 130        bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16);
 131}
 132
 133static inline int raid6_next_disk(int disk, int raid_disks)
 134{
 135        disk++;
 136        return (disk < raid_disks) ? disk : 0;
 137}
 138
 139static void return_io(struct bio *return_bi)
 140{
 141        struct bio *bi = return_bi;
 142        while (bi) {
 143
 144                return_bi = bi->bi_next;
 145                bi->bi_next = NULL;
 146                bi->bi_size = 0;
 147                bio_endio(bi, 0);
 148                bi = return_bi;
 149        }
 150}
 151
 152static void print_raid5_conf (raid5_conf_t *conf);
 153
 154static int stripe_operations_active(struct stripe_head *sh)
 155{
 156        return sh->check_state || sh->reconstruct_state ||
 157               test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
 158               test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 159}
 160
 161static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 162{
 163        if (atomic_dec_and_test(&sh->count)) {
 164                BUG_ON(!list_empty(&sh->lru));
 165                BUG_ON(atomic_read(&conf->active_stripes)==0);
 166                if (test_bit(STRIPE_HANDLE, &sh->state)) {
 167                        if (test_bit(STRIPE_DELAYED, &sh->state)) {
 168                                list_add_tail(&sh->lru, &conf->delayed_list);
 169                                blk_plug_device(conf->mddev->queue);
 170                        } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
 171                                   sh->bm_seq - conf->seq_write > 0) {
 172                                list_add_tail(&sh->lru, &conf->bitmap_list);
 173                                blk_plug_device(conf->mddev->queue);
 174                        } else {
 175                                clear_bit(STRIPE_BIT_DELAY, &sh->state);
 176                                list_add_tail(&sh->lru, &conf->handle_list);
 177                        }
 178                        md_wakeup_thread(conf->mddev->thread);
 179                } else {
 180                        BUG_ON(stripe_operations_active(sh));
 181                        if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
 182                                atomic_dec(&conf->preread_active_stripes);
 183                                if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
 184                                        md_wakeup_thread(conf->mddev->thread);
 185                        }
 186                        atomic_dec(&conf->active_stripes);
 187                        if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
 188                                list_add_tail(&sh->lru, &conf->inactive_list);
 189                                wake_up(&conf->wait_for_stripe);
 190                                if (conf->retry_read_aligned)
 191                                        md_wakeup_thread(conf->mddev->thread);
 192                        }
 193                }
 194        }
 195}
 196static void release_stripe(struct stripe_head *sh)
 197{
 198        raid5_conf_t *conf = sh->raid_conf;
 199        unsigned long flags;
 200
 201        spin_lock_irqsave(&conf->device_lock, flags);
 202        __release_stripe(conf, sh);
 203        spin_unlock_irqrestore(&conf->device_lock, flags);
 204}
 205
 206static inline void remove_hash(struct stripe_head *sh)
 207{
 208        pr_debug("remove_hash(), stripe %llu\n",
 209                (unsigned long long)sh->sector);
 210
 211        hlist_del_init(&sh->hash);
 212}
 213
 214static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
 215{
 216        struct hlist_head *hp = stripe_hash(conf, sh->sector);
 217
 218        pr_debug("insert_hash(), stripe %llu\n",
 219                (unsigned long long)sh->sector);
 220
 221        CHECK_DEVLOCK();
 222        hlist_add_head(&sh->hash, hp);
 223}
 224
 225
 226/* find an idle stripe, make sure it is unhashed, and return it. */
 227static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
 228{
 229        struct stripe_head *sh = NULL;
 230        struct list_head *first;
 231
 232        CHECK_DEVLOCK();
 233        if (list_empty(&conf->inactive_list))
 234                goto out;
 235        first = conf->inactive_list.next;
 236        sh = list_entry(first, struct stripe_head, lru);
 237        list_del_init(first);
 238        remove_hash(sh);
 239        atomic_inc(&conf->active_stripes);
 240out:
 241        return sh;
 242}
 243
 244static void shrink_buffers(struct stripe_head *sh, int num)
 245{
 246        struct page *p;
 247        int i;
 248
 249        for (i=0; i<num ; i++) {
 250                p = sh->dev[i].page;
 251                if (!p)
 252                        continue;
 253                sh->dev[i].page = NULL;
 254                put_page(p);
 255        }
 256}
 257
 258static int grow_buffers(struct stripe_head *sh, int num)
 259{
 260        int i;
 261
 262        for (i=0; i<num; i++) {
 263                struct page *page;
 264
 265                if (!(page = alloc_page(GFP_KERNEL))) {
 266                        return 1;
 267                }
 268                sh->dev[i].page = page;
 269        }
 270        return 0;
 271}
 272
 273static void raid5_build_block(struct stripe_head *sh, int i);
 274
 275static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks)
 276{
 277        raid5_conf_t *conf = sh->raid_conf;
 278        int i;
 279
 280        BUG_ON(atomic_read(&sh->count) != 0);
 281        BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
 282        BUG_ON(stripe_operations_active(sh));
 283
 284        CHECK_DEVLOCK();
 285        pr_debug("init_stripe called, stripe %llu\n",
 286                (unsigned long long)sh->sector);
 287
 288        remove_hash(sh);
 289
 290        sh->sector = sector;
 291        sh->pd_idx = pd_idx;
 292        sh->state = 0;
 293
 294        sh->disks = disks;
 295
 296        for (i = sh->disks; i--; ) {
 297                struct r5dev *dev = &sh->dev[i];
 298
 299                if (dev->toread || dev->read || dev->towrite || dev->written ||
 300                    test_bit(R5_LOCKED, &dev->flags)) {
 301                        printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
 302                               (unsigned long long)sh->sector, i, dev->toread,
 303                               dev->read, dev->towrite, dev->written,
 304                               test_bit(R5_LOCKED, &dev->flags));
 305                        BUG();
 306                }
 307                dev->flags = 0;
 308                raid5_build_block(sh, i);
 309        }
 310        insert_hash(conf, sh);
 311}
 312
 313static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks)
 314{
 315        struct stripe_head *sh;
 316        struct hlist_node *hn;
 317
 318        CHECK_DEVLOCK();
 319        pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
 320        hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
 321                if (sh->sector == sector && sh->disks == disks)
 322                        return sh;
 323        pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
 324        return NULL;
 325}
 326
 327static void unplug_slaves(mddev_t *mddev);
 328static void raid5_unplug_device(struct request_queue *q);
 329
 330static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks,
 331                                             int pd_idx, int noblock)
 332{
 333        struct stripe_head *sh;
 334
 335        pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
 336
 337        spin_lock_irq(&conf->device_lock);
 338
 339        do {
 340                wait_event_lock_irq(conf->wait_for_stripe,
 341                                    conf->quiesce == 0,
 342                                    conf->device_lock, /* nothing */);
 343                sh = __find_stripe(conf, sector, disks);
 344                if (!sh) {
 345                        if (!conf->inactive_blocked)
 346                                sh = get_free_stripe(conf);
 347                        if (noblock && sh == NULL)
 348                                break;
 349                        if (!sh) {
 350                                conf->inactive_blocked = 1;
 351                                wait_event_lock_irq(conf->wait_for_stripe,
 352                                                    !list_empty(&conf->inactive_list) &&
 353                                                    (atomic_read(&conf->active_stripes)
 354                                                     < (conf->max_nr_stripes *3/4)
 355                                                     || !conf->inactive_blocked),
 356                                                    conf->device_lock,
 357                                                    raid5_unplug_device(conf->mddev->queue)
 358                                        );
 359                                conf->inactive_blocked = 0;
 360                        } else
 361                                init_stripe(sh, sector, pd_idx, disks);
 362                } else {
 363                        if (atomic_read(&sh->count)) {
 364                          BUG_ON(!list_empty(&sh->lru));
 365                        } else {
 366                                if (!test_bit(STRIPE_HANDLE, &sh->state))
 367                                        atomic_inc(&conf->active_stripes);
 368                                if (list_empty(&sh->lru) &&
 369                                    !test_bit(STRIPE_EXPANDING, &sh->state))
 370                                        BUG();
 371                                list_del_init(&sh->lru);
 372                        }
 373                }
 374        } while (sh == NULL);
 375
 376        if (sh)
 377                atomic_inc(&sh->count);
 378
 379        spin_unlock_irq(&conf->device_lock);
 380        return sh;
 381}
 382
 383static void
 384raid5_end_read_request(struct bio *bi, int error);
 385static void
 386raid5_end_write_request(struct bio *bi, int error);
 387
 388static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 389{
 390        raid5_conf_t *conf = sh->raid_conf;
 391        int i, disks = sh->disks;
 392
 393        might_sleep();
 394
 395        for (i = disks; i--; ) {
 396                int rw;
 397                struct bio *bi;
 398                mdk_rdev_t *rdev;
 399                if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
 400                        rw = WRITE;
 401                else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
 402                        rw = READ;
 403                else
 404                        continue;
 405
 406                bi = &sh->dev[i].req;
 407
 408                bi->bi_rw = rw;
 409                if (rw == WRITE)
 410                        bi->bi_end_io = raid5_end_write_request;
 411                else
 412                        bi->bi_end_io = raid5_end_read_request;
 413
 414                rcu_read_lock();
 415                rdev = rcu_dereference(conf->disks[i].rdev);
 416                if (rdev && test_bit(Faulty, &rdev->flags))
 417                        rdev = NULL;
 418                if (rdev)
 419                        atomic_inc(&rdev->nr_pending);
 420                rcu_read_unlock();
 421
 422                if (rdev) {
 423                        if (s->syncing || s->expanding || s->expanded)
 424                                md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 425
 426                        set_bit(STRIPE_IO_STARTED, &sh->state);
 427
 428                        bi->bi_bdev = rdev->bdev;
 429                        pr_debug("%s: for %llu schedule op %ld on disc %d\n",
 430                                __func__, (unsigned long long)sh->sector,
 431                                bi->bi_rw, i);
 432                        atomic_inc(&sh->count);
 433                        bi->bi_sector = sh->sector + rdev->data_offset;
 434                        bi->bi_flags = 1 << BIO_UPTODATE;
 435                        bi->bi_vcnt = 1;
 436                        bi->bi_max_vecs = 1;
 437                        bi->bi_idx = 0;
 438                        bi->bi_io_vec = &sh->dev[i].vec;
 439                        bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 440                        bi->bi_io_vec[0].bv_offset = 0;
 441                        bi->bi_size = STRIPE_SIZE;
 442                        bi->bi_next = NULL;
 443                        if (rw == WRITE &&
 444                            test_bit(R5_ReWrite, &sh->dev[i].flags))
 445                                atomic_add(STRIPE_SECTORS,
 446                                        &rdev->corrected_errors);
 447                        generic_make_request(bi);
 448                } else {
 449                        if (rw == WRITE)
 450                                set_bit(STRIPE_DEGRADED, &sh->state);
 451                        pr_debug("skip op %ld on disc %d for sector %llu\n",
 452                                bi->bi_rw, i, (unsigned long long)sh->sector);
 453                        clear_bit(R5_LOCKED, &sh->dev[i].flags);
 454                        set_bit(STRIPE_HANDLE, &sh->state);
 455                }
 456        }
 457}
 458
 459static struct dma_async_tx_descriptor *
 460async_copy_data(int frombio, struct bio *bio, struct page *page,
 461        sector_t sector, struct dma_async_tx_descriptor *tx)
 462{
 463        struct bio_vec *bvl;
 464        struct page *bio_page;
 465        int i;
 466        int page_offset;
 467
 468        if (bio->bi_sector >= sector)
 469                page_offset = (signed)(bio->bi_sector - sector) * 512;
 470        else
 471                page_offset = (signed)(sector - bio->bi_sector) * -512;
 472        bio_for_each_segment(bvl, bio, i) {
 473                int len = bio_iovec_idx(bio, i)->bv_len;
 474                int clen;
 475                int b_offset = 0;
 476
 477                if (page_offset < 0) {
 478                        b_offset = -page_offset;
 479                        page_offset += b_offset;
 480                        len -= b_offset;
 481                }
 482
 483                if (len > 0 && page_offset + len > STRIPE_SIZE)
 484                        clen = STRIPE_SIZE - page_offset;
 485                else
 486                        clen = len;
 487
 488                if (clen > 0) {
 489                        b_offset += bio_iovec_idx(bio, i)->bv_offset;
 490                        bio_page = bio_iovec_idx(bio, i)->bv_page;
 491                        if (frombio)
 492                                tx = async_memcpy(page, bio_page, page_offset,
 493                                        b_offset, clen,
 494                                        ASYNC_TX_DEP_ACK,
 495                                        tx, NULL, NULL);
 496                        else
 497                                tx = async_memcpy(bio_page, page, b_offset,
 498                                        page_offset, clen,
 499                                        ASYNC_TX_DEP_ACK,
 500                                        tx, NULL, NULL);
 501                }
 502                if (clen < len) /* hit end of page */
 503                        break;
 504                page_offset +=  len;
 505        }
 506
 507        return tx;
 508}
 509
 510static void ops_complete_biofill(void *stripe_head_ref)
 511{
 512        struct stripe_head *sh = stripe_head_ref;
 513        struct bio *return_bi = NULL;
 514        raid5_conf_t *conf = sh->raid_conf;
 515        int i;
 516
 517        pr_debug("%s: stripe %llu\n", __func__,
 518                (unsigned long long)sh->sector);
 519
 520        /* clear completed biofills */
 521        spin_lock_irq(&conf->device_lock);
 522        for (i = sh->disks; i--; ) {
 523                struct r5dev *dev = &sh->dev[i];
 524
 525                /* acknowledge completion of a biofill operation */
 526                /* and check if we need to reply to a read request,
 527                 * new R5_Wantfill requests are held off until
 528                 * !STRIPE_BIOFILL_RUN
 529                 */
 530                if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
 531                        struct bio *rbi, *rbi2;
 532
 533                        BUG_ON(!dev->read);
 534                        rbi = dev->read;
 535                        dev->read = NULL;
 536                        while (rbi && rbi->bi_sector <
 537                                dev->sector + STRIPE_SECTORS) {
 538                                rbi2 = r5_next_bio(rbi, dev->sector);
 539                                if (!raid5_dec_bi_phys_segments(rbi)) {
 540                                        rbi->bi_next = return_bi;
 541                                        return_bi = rbi;
 542                                }
 543                                rbi = rbi2;
 544                        }
 545                }
 546        }
 547        spin_unlock_irq(&conf->device_lock);
 548        clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
 549
 550        return_io(return_bi);
 551
 552        set_bit(STRIPE_HANDLE, &sh->state);
 553        release_stripe(sh);
 554}
 555
 556static void ops_run_biofill(struct stripe_head *sh)
 557{
 558        struct dma_async_tx_descriptor *tx = NULL;
 559        raid5_conf_t *conf = sh->raid_conf;
 560        int i;
 561
 562        pr_debug("%s: stripe %llu\n", __func__,
 563                (unsigned long long)sh->sector);
 564
 565        for (i = sh->disks; i--; ) {
 566                struct r5dev *dev = &sh->dev[i];
 567                if (test_bit(R5_Wantfill, &dev->flags)) {
 568                        struct bio *rbi;
 569                        spin_lock_irq(&conf->device_lock);
 570                        dev->read = rbi = dev->toread;
 571                        dev->toread = NULL;
 572                        spin_unlock_irq(&conf->device_lock);
 573                        while (rbi && rbi->bi_sector <
 574                                dev->sector + STRIPE_SECTORS) {
 575                                tx = async_copy_data(0, rbi, dev->page,
 576                                        dev->sector, tx);
 577                                rbi = r5_next_bio(rbi, dev->sector);
 578                        }
 579                }
 580        }
 581
 582        atomic_inc(&sh->count);
 583        async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
 584                ops_complete_biofill, sh);
 585}
 586
 587static void ops_complete_compute5(void *stripe_head_ref)
 588{
 589        struct stripe_head *sh = stripe_head_ref;
 590        int target = sh->ops.target;
 591        struct r5dev *tgt = &sh->dev[target];
 592
 593        pr_debug("%s: stripe %llu\n", __func__,
 594                (unsigned long long)sh->sector);
 595
 596        set_bit(R5_UPTODATE, &tgt->flags);
 597        BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 598        clear_bit(R5_Wantcompute, &tgt->flags);
 599        clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
 600        if (sh->check_state == check_state_compute_run)
 601                sh->check_state = check_state_compute_result;
 602        set_bit(STRIPE_HANDLE, &sh->state);
 603        release_stripe(sh);
 604}
 605
 606static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
 607{
 608        /* kernel stack size limits the total number of disks */
 609        int disks = sh->disks;
 610        struct page *xor_srcs[disks];
 611        int target = sh->ops.target;
 612        struct r5dev *tgt = &sh->dev[target];
 613        struct page *xor_dest = tgt->page;
 614        int count = 0;
 615        struct dma_async_tx_descriptor *tx;
 616        int i;
 617
 618        pr_debug("%s: stripe %llu block: %d\n",
 619                __func__, (unsigned long long)sh->sector, target);
 620        BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 621
 622        for (i = disks; i--; )
 623                if (i != target)
 624                        xor_srcs[count++] = sh->dev[i].page;
 625
 626        atomic_inc(&sh->count);
 627
 628        if (unlikely(count == 1))
 629                tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
 630                        0, NULL, ops_complete_compute5, sh);
 631        else
 632                tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
 633                        ASYNC_TX_XOR_ZERO_DST, NULL,
 634                        ops_complete_compute5, sh);
 635
 636        return tx;
 637}
 638
 639static void ops_complete_prexor(void *stripe_head_ref)
 640{
 641        struct stripe_head *sh = stripe_head_ref;
 642
 643        pr_debug("%s: stripe %llu\n", __func__,
 644                (unsigned long long)sh->sector);
 645}
 646
 647static struct dma_async_tx_descriptor *
 648ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 649{
 650        /* kernel stack size limits the total number of disks */
 651        int disks = sh->disks;
 652        struct page *xor_srcs[disks];
 653        int count = 0, pd_idx = sh->pd_idx, i;
 654
 655        /* existing parity data subtracted */
 656        struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 657
 658        pr_debug("%s: stripe %llu\n", __func__,
 659                (unsigned long long)sh->sector);
 660
 661        for (i = disks; i--; ) {
 662                struct r5dev *dev = &sh->dev[i];
 663                /* Only process blocks that are known to be uptodate */
 664                if (test_bit(R5_Wantdrain, &dev->flags))
 665                        xor_srcs[count++] = dev->page;
 666        }
 667
 668        tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
 669                ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx,
 670                ops_complete_prexor, sh);
 671
 672        return tx;
 673}
 674
 675static struct dma_async_tx_descriptor *
 676ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 677{
 678        int disks = sh->disks;
 679        int i;
 680
 681        pr_debug("%s: stripe %llu\n", __func__,
 682                (unsigned long long)sh->sector);
 683
 684        for (i = disks; i--; ) {
 685                struct r5dev *dev = &sh->dev[i];
 686                struct bio *chosen;
 687
 688                if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
 689                        struct bio *wbi;
 690
 691                        spin_lock(&sh->lock);
 692                        chosen = dev->towrite;
 693                        dev->towrite = NULL;
 694                        BUG_ON(dev->written);
 695                        wbi = dev->written = chosen;
 696                        spin_unlock(&sh->lock);
 697
 698                        while (wbi && wbi->bi_sector <
 699                                dev->sector + STRIPE_SECTORS) {
 700                                tx = async_copy_data(1, wbi, dev->page,
 701                                        dev->sector, tx);
 702                                wbi = r5_next_bio(wbi, dev->sector);
 703                        }
 704                }
 705        }
 706
 707        return tx;
 708}
 709
 710static void ops_complete_postxor(void *stripe_head_ref)
 711{
 712        struct stripe_head *sh = stripe_head_ref;
 713        int disks = sh->disks, i, pd_idx = sh->pd_idx;
 714
 715        pr_debug("%s: stripe %llu\n", __func__,
 716                (unsigned long long)sh->sector);
 717
 718        for (i = disks; i--; ) {
 719                struct r5dev *dev = &sh->dev[i];
 720                if (dev->written || i == pd_idx)
 721                        set_bit(R5_UPTODATE, &dev->flags);
 722        }
 723
 724        if (sh->reconstruct_state == reconstruct_state_drain_run)
 725                sh->reconstruct_state = reconstruct_state_drain_result;
 726        else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
 727                sh->reconstruct_state = reconstruct_state_prexor_drain_result;
 728        else {
 729                BUG_ON(sh->reconstruct_state != reconstruct_state_run);
 730                sh->reconstruct_state = reconstruct_state_result;
 731        }
 732
 733        set_bit(STRIPE_HANDLE, &sh->state);
 734        release_stripe(sh);
 735}
 736
 737static void
 738ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 739{
 740        /* kernel stack size limits the total number of disks */
 741        int disks = sh->disks;
 742        struct page *xor_srcs[disks];
 743
 744        int count = 0, pd_idx = sh->pd_idx, i;
 745        struct page *xor_dest;
 746        int prexor = 0;
 747        unsigned long flags;
 748
 749        pr_debug("%s: stripe %llu\n", __func__,
 750                (unsigned long long)sh->sector);
 751
 752        /* check if prexor is active which means only process blocks
 753         * that are part of a read-modify-write (written)
 754         */
 755        if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
 756                prexor = 1;
 757                xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 758                for (i = disks; i--; ) {
 759                        struct r5dev *dev = &sh->dev[i];
 760                        if (dev->written)
 761                                xor_srcs[count++] = dev->page;
 762                }
 763        } else {
 764                xor_dest = sh->dev[pd_idx].page;
 765                for (i = disks; i--; ) {
 766                        struct r5dev *dev = &sh->dev[i];
 767                        if (i != pd_idx)
 768                                xor_srcs[count++] = dev->page;
 769                }
 770        }
 771
 772        /* 1/ if we prexor'd then the dest is reused as a source
 773         * 2/ if we did not prexor then we are redoing the parity
 774         * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
 775         * for the synchronous xor case
 776         */
 777        flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK |
 778                (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
 779
 780        atomic_inc(&sh->count);
 781
 782        if (unlikely(count == 1)) {
 783                flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
 784                tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
 785                        flags, tx, ops_complete_postxor, sh);
 786        } else
 787                tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
 788                        flags, tx, ops_complete_postxor, sh);
 789}
 790
 791static void ops_complete_check(void *stripe_head_ref)
 792{
 793        struct stripe_head *sh = stripe_head_ref;
 794
 795        pr_debug("%s: stripe %llu\n", __func__,
 796                (unsigned long long)sh->sector);
 797
 798        sh->check_state = check_state_check_result;
 799        set_bit(STRIPE_HANDLE, &sh->state);
 800        release_stripe(sh);
 801}
 802
 803static void ops_run_check(struct stripe_head *sh)
 804{
 805        /* kernel stack size limits the total number of disks */
 806        int disks = sh->disks;
 807        struct page *xor_srcs[disks];
 808        struct dma_async_tx_descriptor *tx;
 809
 810        int count = 0, pd_idx = sh->pd_idx, i;
 811        struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 812
 813        pr_debug("%s: stripe %llu\n", __func__,
 814                (unsigned long long)sh->sector);
 815
 816        for (i = disks; i--; ) {
 817                struct r5dev *dev = &sh->dev[i];
 818                if (i != pd_idx)
 819                        xor_srcs[count++] = dev->page;
 820        }
 821
 822        tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
 823                &sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
 824
 825        atomic_inc(&sh->count);
 826        tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
 827                ops_complete_check, sh);
 828}
 829
 830static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
 831{
 832        int overlap_clear = 0, i, disks = sh->disks;
 833        struct dma_async_tx_descriptor *tx = NULL;
 834
 835        if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
 836                ops_run_biofill(sh);
 837                overlap_clear++;
 838        }
 839
 840        if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
 841                tx = ops_run_compute5(sh);
 842                /* terminate the chain if postxor is not set to be run */
 843                if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
 844                        async_tx_ack(tx);
 845        }
 846
 847        if (test_bit(STRIPE_OP_PREXOR, &ops_request))
 848                tx = ops_run_prexor(sh, tx);
 849
 850        if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
 851                tx = ops_run_biodrain(sh, tx);
 852                overlap_clear++;
 853        }
 854
 855        if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
 856                ops_run_postxor(sh, tx);
 857
 858        if (test_bit(STRIPE_OP_CHECK, &ops_request))
 859                ops_run_check(sh);
 860
 861        if (overlap_clear)
 862                for (i = disks; i--; ) {
 863                        struct r5dev *dev = &sh->dev[i];
 864                        if (test_and_clear_bit(R5_Overlap, &dev->flags))
 865                                wake_up(&sh->raid_conf->wait_for_overlap);
 866                }
 867}
 868
 869static int grow_one_stripe(raid5_conf_t *conf)
 870{
 871        struct stripe_head *sh;
 872        sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
 873        if (!sh)
 874                return 0;
 875        memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
 876        sh->raid_conf = conf;
 877        spin_lock_init(&sh->lock);
 878
 879        if (grow_buffers(sh, conf->raid_disks)) {
 880                shrink_buffers(sh, conf->raid_disks);
 881                kmem_cache_free(conf->slab_cache, sh);
 882                return 0;
 883        }
 884        sh->disks = conf->raid_disks;
 885        /* we just created an active stripe so... */
 886        atomic_set(&sh->count, 1);
 887        atomic_inc(&conf->active_stripes);
 888        INIT_LIST_HEAD(&sh->lru);
 889        release_stripe(sh);
 890        return 1;
 891}
 892
 893static int grow_stripes(raid5_conf_t *conf, int num)
 894{
 895        struct kmem_cache *sc;
 896        int devs = conf->raid_disks;
 897
 898        sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev));
 899        sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev));
 900        conf->active_name = 0;
 901        sc = kmem_cache_create(conf->cache_name[conf->active_name],
 902                               sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
 903                               0, 0, NULL);
 904        if (!sc)
 905                return 1;
 906        conf->slab_cache = sc;
 907        conf->pool_size = devs;
 908        while (num--)
 909                if (!grow_one_stripe(conf))
 910                        return 1;
 911        return 0;
 912}
 913
 914#ifdef CONFIG_MD_RAID5_RESHAPE
 915static int resize_stripes(raid5_conf_t *conf, int newsize)
 916{
 917        /* Make all the stripes able to hold 'newsize' devices.
 918         * New slots in each stripe get 'page' set to a new page.
 919         *
 920         * This happens in stages:
 921         * 1/ create a new kmem_cache and allocate the required number of
 922         *    stripe_heads.
 923         * 2/ gather all the old stripe_heads and tranfer the pages across
 924         *    to the new stripe_heads.  This will have the side effect of
 925         *    freezing the array as once all stripe_heads have been collected,
 926         *    no IO will be possible.  Old stripe heads are freed once their
 927         *    pages have been transferred over, and the old kmem_cache is
 928         *    freed when all stripes are done.
 929         * 3/ reallocate conf->disks to be suitable bigger.  If this fails,
 930         *    we simple return a failre status - no need to clean anything up.
 931         * 4/ allocate new pages for the new slots in the new stripe_heads.
 932         *    If this fails, we don't bother trying the shrink the
 933         *    stripe_heads down again, we just leave them as they are.
 934         *    As each stripe_head is processed the new one is released into
 935         *    active service.
 936         *
 937         * Once step2 is started, we cannot afford to wait for a write,
 938         * so we use GFP_NOIO allocations.
 939         */
 940        struct stripe_head *osh, *nsh;
 941        LIST_HEAD(newstripes);
 942        struct disk_info *ndisks;
 943        int err;
 944        struct kmem_cache *sc;
 945        int i;
 946
 947        if (newsize <= conf->pool_size)
 948                return 0; /* never bother to shrink */
 949
 950        err = md_allow_write(conf->mddev);
 951        if (err)
 952                return err;
 953
 954        /* Step 1 */
 955        sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
 956                               sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
 957                               0, 0, NULL);
 958        if (!sc)
 959                return -ENOMEM;
 960
 961        for (i = conf->max_nr_stripes; i; i--) {
 962                nsh = kmem_cache_alloc(sc, GFP_KERNEL);
 963                if (!nsh)
 964                        break;
 965
 966                memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
 967
 968                nsh->raid_conf = conf;
 969                spin_lock_init(&nsh->lock);
 970
 971                list_add(&nsh->lru, &newstripes);
 972        }
 973        if (i) {
 974                /* didn't get enough, give up */
 975                while (!list_empty(&newstripes)) {
 976                        nsh = list_entry(newstripes.next, struct stripe_head, lru);
 977                        list_del(&nsh->lru);
 978                        kmem_cache_free(sc, nsh);
 979                }
 980                kmem_cache_destroy(sc);
 981                return -ENOMEM;
 982        }
 983        /* Step 2 - Must use GFP_NOIO now.
 984         * OK, we have enough stripes, start collecting inactive
 985         * stripes and copying them over
 986         */
 987        list_for_each_entry(nsh, &newstripes, lru) {
 988                spin_lock_irq(&conf->device_lock);
 989                wait_event_lock_irq(conf->wait_for_stripe,
 990                                    !list_empty(&conf->inactive_list),
 991                                    conf->device_lock,
 992                                    unplug_slaves(conf->mddev)
 993                        );
 994                osh = get_free_stripe(conf);
 995                spin_unlock_irq(&conf->device_lock);
 996                atomic_set(&nsh->count, 1);
 997                for(i=0; i<conf->pool_size; i++)
 998                        nsh->dev[i].page = osh->dev[i].page;
 999                for( ; i<newsize; i++)
1000                        nsh->dev[i].page = NULL;
1001                kmem_cache_free(conf->slab_cache, osh);
1002        }
1003        kmem_cache_destroy(conf->slab_cache);
1004
1005        /* Step 3.
1006         * At this point, we are holding all the stripes so the array
1007         * is completely stalled, so now is a good time to resize
1008         * conf->disks.
1009         */
1010        ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1011        if (ndisks) {
1012                for (i=0; i<conf->raid_disks; i++)
1013                        ndisks[i] = conf->disks[i];
1014                kfree(conf->disks);
1015                conf->disks = ndisks;
1016        } else
1017                err = -ENOMEM;
1018
1019        /* Step 4, return new stripes to service */
1020        while(!list_empty(&newstripes)) {
1021                nsh = list_entry(newstripes.next, struct stripe_head, lru);
1022                list_del_init(&nsh->lru);
1023                for (i=conf->raid_disks; i < newsize; i++)
1024                        if (nsh->dev[i].page == NULL) {
1025                                struct page *p = alloc_page(GFP_NOIO);
1026                                nsh->dev[i].page = p;
1027                                if (!p)
1028                                        err = -ENOMEM;
1029                        }
1030                release_stripe(nsh);
1031        }
1032        /* critical section pass, GFP_NOIO no longer needed */
1033
1034        conf->slab_cache = sc;
1035        conf->active_name = 1-conf->active_name;
1036        conf->pool_size = newsize;
1037        return err;
1038}
1039#endif
1040
1041static int drop_one_stripe(raid5_conf_t *conf)
1042{
1043        struct stripe_head *sh;
1044
1045        spin_lock_irq(&conf->device_lock);
1046        sh = get_free_stripe(conf);
1047        spin_unlock_irq(&conf->device_lock);
1048        if (!sh)
1049                return 0;
1050        BUG_ON(atomic_read(&sh->count));
1051        shrink_buffers(sh, conf->pool_size);
1052        kmem_cache_free(conf->slab_cache, sh);
1053        atomic_dec(&conf->active_stripes);
1054        return 1;
1055}
1056
1057static void shrink_stripes(raid5_conf_t *conf)
1058{
1059        while (drop_one_stripe(conf))
1060                ;
1061
1062        if (conf->slab_cache)
1063                kmem_cache_destroy(conf->slab_cache);
1064        conf->slab_cache = NULL;
1065}
1066
1067static void raid5_end_read_request(struct bio * bi, int error)
1068{
1069        struct stripe_head *sh = bi->bi_private;
1070        raid5_conf_t *conf = sh->raid_conf;
1071        int disks = sh->disks, i;
1072        int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1073        char b[BDEVNAME_SIZE];
1074        mdk_rdev_t *rdev;
1075
1076
1077        for (i=0 ; i<disks; i++)
1078                if (bi == &sh->dev[i].req)
1079                        break;
1080
1081        pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
1082                (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1083                uptodate);
1084        if (i == disks) {
1085                BUG();
1086                return;
1087        }
1088
1089        if (uptodate) {
1090                set_bit(R5_UPTODATE, &sh->dev[i].flags);
1091                if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1092                        rdev = conf->disks[i].rdev;
1093                        printk_rl(KERN_INFO "raid5:%s: read error corrected"
1094                                  " (%lu sectors at %llu on %s)\n",
1095                                  mdname(conf->mddev), STRIPE_SECTORS,
1096                                  (unsigned long long)(sh->sector
1097                                                       + rdev->data_offset),
1098                                  bdevname(rdev->bdev, b));
1099                        clear_bit(R5_ReadError, &sh->dev[i].flags);
1100                        clear_bit(R5_ReWrite, &sh->dev[i].flags);
1101                }
1102                if (atomic_read(&conf->disks[i].rdev->read_errors))
1103                        atomic_set(&conf->disks[i].rdev->read_errors, 0);
1104        } else {
1105                const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
1106                int retry = 0;
1107                rdev = conf->disks[i].rdev;
1108
1109                clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1110                atomic_inc(&rdev->read_errors);
1111                if (conf->mddev->degraded)
1112                        printk_rl(KERN_WARNING
1113                                  "raid5:%s: read error not correctable "
1114                                  "(sector %llu on %s).\n",
1115                                  mdname(conf->mddev),
1116                                  (unsigned long long)(sh->sector
1117                                                       + rdev->data_offset),
1118                                  bdn);
1119                else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1120                        /* Oh, no!!! */
1121                        printk_rl(KERN_WARNING
1122                                  "raid5:%s: read error NOT corrected!! "
1123                                  "(sector %llu on %s).\n",
1124                                  mdname(conf->mddev),
1125                                  (unsigned long long)(sh->sector
1126                                                       + rdev->data_offset),
1127                                  bdn);
1128                else if (atomic_read(&rdev->read_errors)
1129                         > conf->max_nr_stripes)
1130                        printk(KERN_WARNING
1131                               "raid5:%s: Too many read errors, failing device %s.\n",
1132                               mdname(conf->mddev), bdn);
1133                else
1134                        retry = 1;
1135                if (retry)
1136                        set_bit(R5_ReadError, &sh->dev[i].flags);
1137                else {
1138                        clear_bit(R5_ReadError, &sh->dev[i].flags);
1139                        clear_bit(R5_ReWrite, &sh->dev[i].flags);
1140                        md_error(conf->mddev, rdev);
1141                }
1142        }
1143        rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1144        clear_bit(R5_LOCKED, &sh->dev[i].flags);
1145        set_bit(STRIPE_HANDLE, &sh->state);
1146        release_stripe(sh);
1147}
1148
1149static void raid5_end_write_request(struct bio *bi, int error)
1150{
1151        struct stripe_head *sh = bi->bi_private;
1152        raid5_conf_t *conf = sh->raid_conf;
1153        int disks = sh->disks, i;
1154        int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1155
1156        for (i=0 ; i<disks; i++)
1157                if (bi == &sh->dev[i].req)
1158                        break;
1159
1160        pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
1161                (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1162                uptodate);
1163        if (i == disks) {
1164                BUG();
1165                return;
1166        }
1167
1168        if (!uptodate)
1169                md_error(conf->mddev, conf->disks[i].rdev);
1170
1171        rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1172        
1173        clear_bit(R5_LOCKED, &sh->dev[i].flags);
1174        set_bit(STRIPE_HANDLE, &sh->state);
1175        release_stripe(sh);
1176}
1177
1178
1179static sector_t compute_blocknr(struct stripe_head *sh, int i);
1180        
1181static void raid5_build_block(struct stripe_head *sh, int i)
1182{
1183        struct r5dev *dev = &sh->dev[i];
1184
1185        bio_init(&dev->req);
1186        dev->req.bi_io_vec = &dev->vec;
1187        dev->req.bi_vcnt++;
1188        dev->req.bi_max_vecs++;
1189        dev->vec.bv_page = dev->page;
1190        dev->vec.bv_len = STRIPE_SIZE;
1191        dev->vec.bv_offset = 0;
1192
1193        dev->req.bi_sector = sh->sector;
1194        dev->req.bi_private = sh;
1195
1196        dev->flags = 0;
1197        dev->sector = compute_blocknr(sh, i);
1198}
1199
1200static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1201{
1202        char b[BDEVNAME_SIZE];
1203        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1204        pr_debug("raid5: error called\n");
1205
1206        if (!test_bit(Faulty, &rdev->flags)) {
1207                set_bit(MD_CHANGE_DEVS, &mddev->flags);
1208                if (test_and_clear_bit(In_sync, &rdev->flags)) {
1209                        unsigned long flags;
1210                        spin_lock_irqsave(&conf->device_lock, flags);
1211                        mddev->degraded++;
1212                        spin_unlock_irqrestore(&conf->device_lock, flags);
1213                        /*
1214                         * if recovery was running, make sure it aborts.
1215                         */
1216                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1217                }
1218                set_bit(Faulty, &rdev->flags);
1219                printk(KERN_ALERT
1220                       "raid5: Disk failure on %s, disabling device.\n"
1221                       "raid5: Operation continuing on %d devices.\n",
1222                       bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
1223        }
1224}
1225
1226/*
1227 * Input: a 'big' sector number,
1228 * Output: index of the data and parity disk, and the sector # in them.
1229 */
1230static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
1231                        unsigned int data_disks, unsigned int * dd_idx,
1232                        unsigned int * pd_idx, raid5_conf_t *conf)
1233{
1234        long stripe;
1235        unsigned long chunk_number;
1236        unsigned int chunk_offset;
1237        sector_t new_sector;
1238        int sectors_per_chunk = conf->chunk_size >> 9;
1239
1240        /* First compute the information on this sector */
1241
1242        /*
1243         * Compute the chunk number and the sector offset inside the chunk
1244         */
1245        chunk_offset = sector_div(r_sector, sectors_per_chunk);
1246        chunk_number = r_sector;
1247        BUG_ON(r_sector != chunk_number);
1248
1249        /*
1250         * Compute the stripe number
1251         */
1252        stripe = chunk_number / data_disks;
1253
1254        /*
1255         * Compute the data disk and parity disk indexes inside the stripe
1256         */
1257        *dd_idx = chunk_number % data_disks;
1258
1259        /*
1260         * Select the parity disk based on the user selected algorithm.
1261         */
1262        switch(conf->level) {
1263        case 4:
1264                *pd_idx = data_disks;
1265                break;
1266        case 5:
1267                switch (conf->algorithm) {
1268                case ALGORITHM_LEFT_ASYMMETRIC:
1269                        *pd_idx = data_disks - stripe % raid_disks;
1270                        if (*dd_idx >= *pd_idx)
1271                                (*dd_idx)++;
1272                        break;
1273                case ALGORITHM_RIGHT_ASYMMETRIC:
1274                        *pd_idx = stripe % raid_disks;
1275                        if (*dd_idx >= *pd_idx)
1276                                (*dd_idx)++;
1277                        break;
1278                case ALGORITHM_LEFT_SYMMETRIC:
1279                        *pd_idx = data_disks - stripe % raid_disks;
1280                        *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
1281                        break;
1282                case ALGORITHM_RIGHT_SYMMETRIC:
1283                        *pd_idx = stripe % raid_disks;
1284                        *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
1285                        break;
1286                default:
1287                        printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1288                                conf->algorithm);
1289                }
1290                break;
1291        case 6:
1292
1293                /**** FIX THIS ****/
1294                switch (conf->algorithm) {
1295                case ALGORITHM_LEFT_ASYMMETRIC:
1296                        *pd_idx = raid_disks - 1 - (stripe % raid_disks);
1297                        if (*pd_idx == raid_disks-1)
1298                                (*dd_idx)++;    /* Q D D D P */
1299                        else if (*dd_idx >= *pd_idx)
1300                                (*dd_idx) += 2; /* D D P Q D */
1301                        break;
1302                case ALGORITHM_RIGHT_ASYMMETRIC:
1303                        *pd_idx = stripe % raid_disks;
1304                        if (*pd_idx == raid_disks-1)
1305                                (*dd_idx)++;    /* Q D D D P */
1306                        else if (*dd_idx >= *pd_idx)
1307                                (*dd_idx) += 2; /* D D P Q D */
1308                        break;
1309                case ALGORITHM_LEFT_SYMMETRIC:
1310                        *pd_idx = raid_disks - 1 - (stripe % raid_disks);
1311                        *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
1312                        break;
1313                case ALGORITHM_RIGHT_SYMMETRIC:
1314                        *pd_idx = stripe % raid_disks;
1315                        *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
1316                        break;
1317                default:
1318                        printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
1319                               conf->algorithm);
1320                }
1321                break;
1322        }
1323
1324        /*
1325         * Finally, compute the new sector number
1326         */
1327        new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
1328        return new_sector;
1329}
1330
1331
1332static sector_t compute_blocknr(struct stripe_head *sh, int i)
1333{
1334        raid5_conf_t *conf = sh->raid_conf;
1335        int raid_disks = sh->disks;
1336        int data_disks = raid_disks - conf->max_degraded;
1337        sector_t new_sector = sh->sector, check;
1338        int sectors_per_chunk = conf->chunk_size >> 9;
1339        sector_t stripe;
1340        int chunk_offset;
1341        int chunk_number, dummy1, dummy2, dd_idx = i;
1342        sector_t r_sector;
1343
1344
1345        chunk_offset = sector_div(new_sector, sectors_per_chunk);
1346        stripe = new_sector;
1347        BUG_ON(new_sector != stripe);
1348
1349        if (i == sh->pd_idx)
1350                return 0;
1351        switch(conf->level) {
1352        case 4: break;
1353        case 5:
1354                switch (conf->algorithm) {
1355                case ALGORITHM_LEFT_ASYMMETRIC:
1356                case ALGORITHM_RIGHT_ASYMMETRIC:
1357                        if (i > sh->pd_idx)
1358                                i--;
1359                        break;
1360                case ALGORITHM_LEFT_SYMMETRIC:
1361                case ALGORITHM_RIGHT_SYMMETRIC:
1362                        if (i < sh->pd_idx)
1363                                i += raid_disks;
1364                        i -= (sh->pd_idx + 1);
1365                        break;
1366                default:
1367                        printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1368                               conf->algorithm);
1369                }
1370                break;
1371        case 6:
1372                if (i == raid6_next_disk(sh->pd_idx, raid_disks))
1373                        return 0; /* It is the Q disk */
1374                switch (conf->algorithm) {
1375                case ALGORITHM_LEFT_ASYMMETRIC:
1376                case ALGORITHM_RIGHT_ASYMMETRIC:
1377                        if (sh->pd_idx == raid_disks-1)
1378                                i--;    /* Q D D D P */
1379                        else if (i > sh->pd_idx)
1380                                i -= 2; /* D D P Q D */
1381                        break;
1382                case ALGORITHM_LEFT_SYMMETRIC:
1383                case ALGORITHM_RIGHT_SYMMETRIC:
1384                        if (sh->pd_idx == raid_disks-1)
1385                                i--; /* Q D D D P */
1386                        else {
1387                                /* D D P Q D */
1388                                if (i < sh->pd_idx)
1389                                        i += raid_disks;
1390                                i -= (sh->pd_idx + 2);
1391                        }
1392                        break;
1393                default:
1394                        printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
1395                               conf->algorithm);
1396                }
1397                break;
1398        }
1399
1400        chunk_number = stripe * data_disks + i;
1401        r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
1402
1403        check = raid5_compute_sector(r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
1404        if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
1405                printk(KERN_ERR "compute_blocknr: map not correct\n");
1406                return 0;
1407        }
1408        return r_sector;
1409}
1410
1411
1412
1413/*
1414 * Copy data between a page in the stripe cache, and one or more bion
1415 * The page could align with the middle of the bio, or there could be
1416 * several bion, each with several bio_vecs, which cover part of the page
1417 * Multiple bion are linked together on bi_next.  There may be extras
1418 * at the end of this list.  We ignore them.
1419 */
1420static void copy_data(int frombio, struct bio *bio,
1421                     struct page *page,
1422                     sector_t sector)
1423{
1424        char *pa = page_address(page);
1425        struct bio_vec *bvl;
1426        int i;
1427        int page_offset;
1428
1429        if (bio->bi_sector >= sector)
1430                page_offset = (signed)(bio->bi_sector - sector) * 512;
1431        else
1432                page_offset = (signed)(sector - bio->bi_sector) * -512;
1433        bio_for_each_segment(bvl, bio, i) {
1434                int len = bio_iovec_idx(bio,i)->bv_len;
1435                int clen;
1436                int b_offset = 0;
1437
1438                if (page_offset < 0) {
1439                        b_offset = -page_offset;
1440                        page_offset += b_offset;
1441                        len -= b_offset;
1442                }
1443
1444                if (len > 0 && page_offset + len > STRIPE_SIZE)
1445                        clen = STRIPE_SIZE - page_offset;
1446                else clen = len;
1447
1448                if (clen > 0) {
1449                        char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
1450                        if (frombio)
1451                                memcpy(pa+page_offset, ba+b_offset, clen);
1452                        else
1453                                memcpy(ba+b_offset, pa+page_offset, clen);
1454                        __bio_kunmap_atomic(ba, KM_USER0);
1455                }
1456                if (clen < len) /* hit end of page */
1457                        break;
1458                page_offset +=  len;
1459        }
1460}
1461
/*
 * Once MAX_XOR_BLOCKS source pointers have been collected, xor them
 * into 'dest' and start collecting again.  Uses the variables 'count',
 * 'dest' and 'ptr' of the function it is expanded in.
 */
#define check_xor()							\
	do {								\
		if (count == MAX_XOR_BLOCKS) {				\
			xor_blocks(count, STRIPE_SIZE, dest, ptr);	\
			count = 0;					\
		}							\
	} while (0)
1468
1469static void compute_parity6(struct stripe_head *sh, int method)
1470{
1471        raid6_conf_t *conf = sh->raid_conf;
1472        int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
1473        struct bio *chosen;
1474        /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1475        void *ptrs[disks];
1476
1477        qd_idx = raid6_next_disk(pd_idx, disks);
1478        d0_idx = raid6_next_disk(qd_idx, disks);
1479
1480        pr_debug("compute_parity, stripe %llu, method %d\n",
1481                (unsigned long long)sh->sector, method);
1482
1483        switch(method) {
1484        case READ_MODIFY_WRITE:
1485                BUG();          /* READ_MODIFY_WRITE N/A for RAID-6 */
1486        case RECONSTRUCT_WRITE:
1487                for (i= disks; i-- ;)
1488                        if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
1489                                chosen = sh->dev[i].towrite;
1490                                sh->dev[i].towrite = NULL;
1491
1492                                if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1493                                        wake_up(&conf->wait_for_overlap);
1494
1495                                BUG_ON(sh->dev[i].written);
1496                                sh->dev[i].written = chosen;
1497                        }
1498                break;
1499        case CHECK_PARITY:
1500                BUG();          /* Not implemented yet */
1501        }
1502
1503        for (i = disks; i--;)
1504                if (sh->dev[i].written) {
1505                        sector_t sector = sh->dev[i].sector;
1506                        struct bio *wbi = sh->dev[i].written;
1507                        while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
1508                                copy_data(1, wbi, sh->dev[i].page, sector);
1509                                wbi = r5_next_bio(wbi, sector);
1510                        }
1511
1512                        set_bit(R5_LOCKED, &sh->dev[i].flags);
1513                        set_bit(R5_UPTODATE, &sh->dev[i].flags);
1514                }
1515
1516//      switch(method) {
1517//      case RECONSTRUCT_WRITE:
1518//      case CHECK_PARITY:
1519//      case UPDATE_PARITY:
1520                /* Note that unlike RAID-5, the ordering of the disks matters greatly. */
1521                /* FIX: Is this ordering of drives even remotely optimal? */
1522                count = 0;
1523                i = d0_idx;
1524                do {
1525                        ptrs[count++] = page_address(sh->dev[i].page);
1526                        if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
1527                                printk("block %d/%d not uptodate on parity calc\n", i,count);
1528                        i = raid6_next_disk(i, disks);
1529                } while ( i != d0_idx );
1530//              break;
1531//      }
1532
1533        raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
1534
1535        switch(method) {
1536        case RECONSTRUCT_WRITE:
1537                set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1538                set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1539                set_bit(R5_LOCKED,   &sh->dev[pd_idx].flags);
1540                set_bit(R5_LOCKED,   &sh->dev[qd_idx].flags);
1541                break;
1542        case UPDATE_PARITY:
1543                set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1544                set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1545                break;
1546        }
1547}
1548
1549
1550/* Compute one missing block */
1551static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1552{
1553        int i, count, disks = sh->disks;
1554        void *ptr[MAX_XOR_BLOCKS], *dest, *p;
1555        int pd_idx = sh->pd_idx;
1556        int qd_idx = raid6_next_disk(pd_idx, disks);
1557
1558        pr_debug("compute_block_1, stripe %llu, idx %d\n",
1559                (unsigned long long)sh->sector, dd_idx);
1560
1561        if ( dd_idx == qd_idx ) {
1562                /* We're actually computing the Q drive */
1563                compute_parity6(sh, UPDATE_PARITY);
1564        } else {
1565                dest = page_address(sh->dev[dd_idx].page);
1566                if (!nozero) memset(dest, 0, STRIPE_SIZE);
1567                count = 0;
1568                for (i = disks ; i--; ) {
1569                        if (i == dd_idx || i == qd_idx)
1570                                continue;
1571                        p = page_address(sh->dev[i].page);
1572                        if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
1573                                ptr[count++] = p;
1574                        else
1575                                printk("compute_block() %d, stripe %llu, %d"
1576                                       " not present\n", dd_idx,
1577                                       (unsigned long long)sh->sector, i);
1578
1579                        check_xor();
1580                }
1581                if (count)
1582                        xor_blocks(count, STRIPE_SIZE, dest, ptr);
1583                if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1584                else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1585        }
1586}
1587
1588/* Compute two missing blocks */
1589static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1590{
1591        int i, count, disks = sh->disks;
1592        int pd_idx = sh->pd_idx;
1593        int qd_idx = raid6_next_disk(pd_idx, disks);
1594        int d0_idx = raid6_next_disk(qd_idx, disks);
1595        int faila, failb;
1596
1597        /* faila and failb are disk numbers relative to d0_idx */
1598        /* pd_idx become disks-2 and qd_idx become disks-1 */
1599        faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
1600        failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
1601
1602        BUG_ON(faila == failb);
1603        if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1604
1605        pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1606               (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
1607
1608        if ( failb == disks-1 ) {
1609                /* Q disk is one of the missing disks */
1610                if ( faila == disks-2 ) {
1611                        /* Missing P+Q, just recompute */
1612                        compute_parity6(sh, UPDATE_PARITY);
1613                        return;
1614                } else {
1615                        /* We're missing D+Q; recompute D from P */
1616                        compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
1617                        compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
1618                        return;
1619                }
1620        }
1621
1622        /* We're missing D+P or D+D; build pointer table */
1623        {
1624                /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1625                void *ptrs[disks];
1626
1627                count = 0;
1628                i = d0_idx;
1629                do {
1630                        ptrs[count++] = page_address(sh->dev[i].page);
1631                        i = raid6_next_disk(i, disks);
1632                        if (i != dd_idx1 && i != dd_idx2 &&
1633                            !test_bit(R5_UPTODATE, &sh->dev[i].flags))
1634                                printk("compute_2 with missing block %d/%d\n", count, i);
1635                } while ( i != d0_idx );
1636
1637                if ( failb == disks-2 ) {
1638                        /* We're missing D+P. */
1639                        raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
1640                } else {
1641                        /* We're missing D+D. */
1642                        raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
1643                }
1644
1645                /* Both the above update both missing blocks */
1646                set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1647                set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1648        }
1649}
1650
1651static void
1652schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1653                         int rcw, int expand)
1654{
1655        int i, pd_idx = sh->pd_idx, disks = sh->disks;
1656
1657        if (rcw) {
1658                /* if we are not expanding this is a proper write request, and
1659                 * there will be bios with new data to be drained into the
1660                 * stripe cache
1661                 */
1662                if (!expand) {
1663                        sh->reconstruct_state = reconstruct_state_drain_run;
1664                        set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1665                } else
1666                        sh->reconstruct_state = reconstruct_state_run;
1667
1668                set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
1669
1670                for (i = disks; i--; ) {
1671                        struct r5dev *dev = &sh->dev[i];
1672
1673                        if (dev->towrite) {
1674                                set_bit(R5_LOCKED, &dev->flags);
1675                                set_bit(R5_Wantdrain, &dev->flags);
1676                                if (!expand)
1677                                        clear_bit(R5_UPTODATE, &dev->flags);
1678                                s->locked++;
1679                        }
1680                }
1681                if (s->locked + 1 == disks)
1682                        if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
1683                                atomic_inc(&sh->raid_conf->pending_full_writes);
1684        } else {
1685                BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1686                        test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1687
1688                sh->reconstruct_state = reconstruct_state_prexor_drain_run;
1689                set_bit(STRIPE_OP_PREXOR, &s->ops_request);
1690                set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1691                set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
1692
1693                for (i = disks; i--; ) {
1694                        struct r5dev *dev = &sh->dev[i];
1695                        if (i == pd_idx)
1696                                continue;
1697
1698                        if (dev->towrite &&
1699                            (test_bit(R5_UPTODATE, &dev->flags) ||
1700                             test_bit(R5_Wantcompute, &dev->flags))) {
1701                                set_bit(R5_Wantdrain, &dev->flags);
1702                                set_bit(R5_LOCKED, &dev->flags);
1703                                clear_bit(R5_UPTODATE, &dev->flags);
1704                                s->locked++;
1705                        }
1706                }
1707        }
1708
1709        /* keep the parity disk locked while asynchronous operations
1710         * are in flight
1711         */
1712        set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1713        clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1714        s->locked++;
1715
1716        pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
1717                __func__, (unsigned long long)sh->sector,
1718                s->locked, s->ops_request);
1719}
1720
1721/*
1722 * Each stripe/dev can have one or more bion attached.
1723 * toread/towrite point to the first in a chain.
1724 * The bi_next chain must be in order.
1725 */
1726static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
1727{
1728        struct bio **bip;
1729        raid5_conf_t *conf = sh->raid_conf;
1730        int firstwrite=0;
1731
1732        pr_debug("adding bh b#%llu to stripe s#%llu\n",
1733                (unsigned long long)bi->bi_sector,
1734                (unsigned long long)sh->sector);
1735
1736
1737        spin_lock(&sh->lock);
1738        spin_lock_irq(&conf->device_lock);
1739        if (forwrite) {
1740                bip = &sh->dev[dd_idx].towrite;
1741                if (*bip == NULL && sh->dev[dd_idx].written == NULL)
1742                        firstwrite = 1;
1743        } else
1744                bip = &sh->dev[dd_idx].toread;
1745        while (*bip && (*bip)->bi_sector < bi->bi_sector) {
1746                if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
1747                        goto overlap;
1748                bip = & (*bip)->bi_next;
1749        }
1750        if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
1751                goto overlap;
1752
1753        BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
1754        if (*bip)
1755                bi->bi_next = *bip;
1756        *bip = bi;
1757        bi->bi_phys_segments++;
1758        spin_unlock_irq(&conf->device_lock);
1759        spin_unlock(&sh->lock);
1760
1761        pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
1762                (unsigned long long)bi->bi_sector,
1763                (unsigned long long)sh->sector, dd_idx);
1764
1765        if (conf->mddev->bitmap && firstwrite) {
1766                bitmap_startwrite(conf->mddev->bitmap, sh->sector,
1767                                  STRIPE_SECTORS, 0);
1768                sh->bm_seq = conf->seq_flush+1;
1769                set_bit(STRIPE_BIT_DELAY, &sh->state);
1770        }
1771
1772        if (forwrite) {
1773                /* check if page is covered */
1774                sector_t sector = sh->dev[dd_idx].sector;
1775                for (bi=sh->dev[dd_idx].towrite;
1776                     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
1777                             bi && bi->bi_sector <= sector;
1778                     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
1779                        if (bi->bi_sector + (bi->bi_size>>9) >= sector)
1780                                sector = bi->bi_sector + (bi->bi_size>>9);
1781                }
1782                if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
1783                        set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
1784        }
1785        return 1;
1786
1787 overlap:
1788        set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
1789        spin_unlock_irq(&conf->device_lock);
1790        spin_unlock(&sh->lock);
1791        return 0;
1792}
1793
1794static void end_reshape(raid5_conf_t *conf);
1795
1796static int page_is_zero(struct page *p)
1797{
1798        char *a = page_address(p);
1799        return ((*(u32*)a) == 0 &&
1800                memcmp(a, a+4, STRIPE_SIZE-4)==0);
1801}
1802
1803static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1804{
1805        int sectors_per_chunk = conf->chunk_size >> 9;
1806        int pd_idx, dd_idx;
1807        int chunk_offset = sector_div(stripe, sectors_per_chunk);
1808
1809        raid5_compute_sector(stripe * (disks - conf->max_degraded)
1810                             *sectors_per_chunk + chunk_offset,
1811                             disks, disks - conf->max_degraded,
1812                             &dd_idx, &pd_idx, conf);
1813        return pd_idx;
1814}
1815
1816static void
1817handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
1818                                struct stripe_head_state *s, int disks,
1819                                struct bio **return_bi)
1820{
1821        int i;
1822        for (i = disks; i--; ) {
1823                struct bio *bi;
1824                int bitmap_end = 0;
1825
1826                if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1827                        mdk_rdev_t *rdev;
1828                        rcu_read_lock();
1829                        rdev = rcu_dereference(conf->disks[i].rdev);
1830                        if (rdev && test_bit(In_sync, &rdev->flags))
1831                                /* multiple read failures in one stripe */
1832                                md_error(conf->mddev, rdev);
1833                        rcu_read_unlock();
1834                }
1835                spin_lock_irq(&conf->device_lock);
1836                /* fail all writes first */
1837                bi = sh->dev[i].towrite;
1838                sh->dev[i].towrite = NULL;
1839                if (bi) {
1840                        s->to_write--;
1841                        bitmap_end = 1;
1842                }
1843
1844                if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1845                        wake_up(&conf->wait_for_overlap);
1846
1847                while (bi && bi->bi_sector <
1848                        sh->dev[i].sector + STRIPE_SECTORS) {
1849                        struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1850                        clear_bit(BIO_UPTODATE, &bi->bi_flags);
1851                        if (!raid5_dec_bi_phys_segments(bi)) {
1852                                md_write_end(conf->mddev);
1853                                bi->bi_next = *return_bi;
1854                                *return_bi = bi;
1855                        }
1856                        bi = nextbi;
1857                }
1858                /* and fail all 'written' */
1859                bi = sh->dev[i].written;
1860                sh->dev[i].written = NULL;
1861                if (bi) bitmap_end = 1;
1862                while (bi && bi->bi_sector <
1863                       sh->dev[i].sector + STRIPE_SECTORS) {
1864                        struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
1865                        clear_bit(BIO_UPTODATE, &bi->bi_flags);
1866                        if (!raid5_dec_bi_phys_segments(bi)) {
1867                                md_write_end(conf->mddev);
1868                                bi->bi_next = *return_bi;
1869                                *return_bi = bi;
1870                        }
1871                        bi = bi2;
1872                }
1873
1874                /* fail any reads if this device is non-operational and
1875                 * the data has not reached the cache yet.
1876                 */
1877                if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
1878                    (!test_bit(R5_Insync, &sh->dev[i].flags) ||
1879                      test_bit(R5_ReadError, &sh->dev[i].flags))) {
1880                        bi = sh->dev[i].toread;
1881                        sh->dev[i].toread = NULL;
1882                        if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1883                                wake_up(&conf->wait_for_overlap);
1884                        if (bi) s->to_read--;
1885                        while (bi && bi->bi_sector <
1886                               sh->dev[i].sector + STRIPE_SECTORS) {
1887                                struct bio *nextbi =
1888                                        r5_next_bio(bi, sh->dev[i].sector);
1889                                clear_bit(BIO_UPTODATE, &bi->bi_flags);
1890                                if (!raid5_dec_bi_phys_segments(bi)) {
1891                                        bi->bi_next = *return_bi;
1892                                        *return_bi = bi;
1893                                }
1894                                bi = nextbi;
1895                        }
1896                }
1897                spin_unlock_irq(&conf->device_lock);
1898                if (bitmap_end)
1899                        bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1900                                        STRIPE_SECTORS, 0, 0);
1901        }
1902
1903        if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
1904                if (atomic_dec_and_test(&conf->pending_full_writes))
1905                        md_wakeup_thread(conf->mddev->thread);
1906}
1907
1908/* fetch_block5 - checks the given member device to see if its data needs
1909 * to be read or computed to satisfy a request.
1910 *
1911 * Returns 1 when no more member devices need to be checked, otherwise returns
1912 * 0 to tell the loop in handle_stripe_fill5 to continue
1913 */
1914static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
1915                        int disk_idx, int disks)
1916{
1917        struct r5dev *dev = &sh->dev[disk_idx];
1918        struct r5dev *failed_dev = &sh->dev[s->failed_num];
1919
1920        /* is the data in this block needed, and can we get it? */
1921        if (!test_bit(R5_LOCKED, &dev->flags) &&
1922            !test_bit(R5_UPTODATE, &dev->flags) &&
1923            (dev->toread ||
1924             (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1925             s->syncing || s->expanding ||
1926             (s->failed &&
1927              (failed_dev->toread ||
1928               (failed_dev->towrite &&
1929                !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
1930                /* We would like to get this block, possibly by computing it,
1931                 * otherwise read it if the backing disk is insync
1932                 */
1933                if ((s->uptodate == disks - 1) &&
1934                    (s->failed && disk_idx == s->failed_num)) {
1935                        set_bit(STRIPE_COMPUTE_RUN, &sh->state);
1936                        set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
1937                        set_bit(R5_Wantcompute, &dev->flags);
1938                        sh->ops.target = disk_idx;
1939                        s->req_compute = 1;
1940                        /* Careful: from this point on 'uptodate' is in the eye
1941                         * of raid5_run_ops which services 'compute' operations
1942                         * before writes. R5_Wantcompute flags a block that will
1943                         * be R5_UPTODATE by the time it is needed for a
1944                         * subsequent operation.
1945                         */
1946                        s->uptodate++;
1947                        return 1; /* uptodate + compute == disks */
1948                } else if (test_bit(R5_Insync, &dev->flags)) {
1949                        set_bit(R5_LOCKED, &dev->flags);
1950                        set_bit(R5_Wantread, &dev->flags);
1951                        s->locked++;
1952                        pr_debug("Reading block %d (sync=%d)\n", disk_idx,
1953                                s->syncing);
1954                }
1955        }
1956
1957        return 0;
1958}
1959
1960/**
1961 * handle_stripe_fill5 - read or compute data to satisfy pending requests.
1962 */
1963static void handle_stripe_fill5(struct stripe_head *sh,
1964                        struct stripe_head_state *s, int disks)
1965{
1966        int i;
1967
1968        /* look for blocks to read/compute, skip this if a compute
1969         * is already in flight, or if the stripe contents are in the
1970         * midst of changing due to a write
1971         */
1972        if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
1973            !sh->reconstruct_state)
1974                for (i = disks; i--; )
1975                        if (fetch_block5(sh, s, i, disks))
1976                                break;
1977        set_bit(STRIPE_HANDLE, &sh->state);
1978}
1979
1980static void handle_stripe_fill6(struct stripe_head *sh,
1981                        struct stripe_head_state *s, struct r6_state *r6s,
1982                        int disks)
1983{
1984        int i;
1985        for (i = disks; i--; ) {
1986                struct r5dev *dev = &sh->dev[i];
1987                if (!test_bit(R5_LOCKED, &dev->flags) &&
1988                    !test_bit(R5_UPTODATE, &dev->flags) &&
1989                    (dev->toread || (dev->towrite &&
1990                     !test_bit(R5_OVERWRITE, &dev->flags)) ||
1991                     s->syncing || s->expanding ||
1992                     (s->failed >= 1 &&
1993                      (sh->dev[r6s->failed_num[0]].toread ||
1994                       s->to_write)) ||
1995                     (s->failed >= 2 &&
1996                      (sh->dev[r6s->failed_num[1]].toread ||
1997                       s->to_write)))) {
1998                        /* we would like to get this block, possibly
1999                         * by computing it, but we might not be able to
2000                         */
2001                        if ((s->uptodate == disks - 1) &&
2002                            (s->failed && (i == r6s->failed_num[0] ||
2003                                           i == r6s->failed_num[1]))) {
2004                                pr_debug("Computing stripe %llu block %d\n",
2005                                       (unsigned long long)sh->sector, i);
2006                                compute_block_1(sh, i, 0);
2007                                s->uptodate++;
2008                        } else if ( s->uptodate == disks-2 && s->failed >= 2 ) {
2009                                /* Computing 2-failure is *very* expensive; only
2010                                 * do it if failed >= 2
2011                                 */
2012                                int other;
2013                                for (other = disks; other--; ) {
2014                                        if (other == i)
2015                                                continue;
2016                                        if (!test_bit(R5_UPTODATE,
2017                                              &sh->dev[other].flags))
2018                                                break;
2019                                }
2020                                BUG_ON(other < 0);
2021                                pr_debug("Computing stripe %llu blocks %d,%d\n",
2022                                       (unsigned long long)sh->sector,
2023                                       i, other);
2024                                compute_block_2(sh, i, other);
2025                                s->uptodate += 2;
2026                        } else if (test_bit(R5_Insync, &dev->flags)) {
2027                                set_bit(R5_LOCKED, &dev->flags);
2028                                set_bit(R5_Wantread, &dev->flags);
2029                                s->locked++;
2030                                pr_debug("Reading block %d (sync=%d)\n",
2031                                        i, s->syncing);
2032                        }
2033                }
2034        }
2035        set_bit(STRIPE_HANDLE, &sh->state);
2036}
2037
2038
2039/* handle_stripe_clean_event
2040 * any written block on an uptodate or failed drive can be returned.
2041 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2042 * never LOCKED, so we don't need to test 'failed' directly.
2043 */
2044static void handle_stripe_clean_event(raid5_conf_t *conf,
2045        struct stripe_head *sh, int disks, struct bio **return_bi)
2046{
2047        int i;
2048        struct r5dev *dev;
2049
2050        for (i = disks; i--; )
2051                if (sh->dev[i].written) {
2052                        dev = &sh->dev[i];
2053                        if (!test_bit(R5_LOCKED, &dev->flags) &&
2054                                test_bit(R5_UPTODATE, &dev->flags)) {
2055                                /* We can return any write requests */
2056                                struct bio *wbi, *wbi2;
2057                                int bitmap_end = 0;
2058                                pr_debug("Return write for disc %d\n", i);
2059                                spin_lock_irq(&conf->device_lock);
2060                                wbi = dev->written;
2061                                dev->written = NULL;
2062                                while (wbi && wbi->bi_sector <
2063                                        dev->sector + STRIPE_SECTORS) {
2064                                        wbi2 = r5_next_bio(wbi, dev->sector);
2065                                        if (!raid5_dec_bi_phys_segments(wbi)) {
2066                                                md_write_end(conf->mddev);
2067                                                wbi->bi_next = *return_bi;
2068                                                *return_bi = wbi;
2069                                        }
2070                                        wbi = wbi2;
2071                                }
2072                                if (dev->towrite == NULL)
2073                                        bitmap_end = 1;
2074                                spin_unlock_irq(&conf->device_lock);
2075                                if (bitmap_end)
2076                                        bitmap_endwrite(conf->mddev->bitmap,
2077                                                        sh->sector,
2078                                                        STRIPE_SECTORS,
2079                                         !test_bit(STRIPE_DEGRADED, &sh->state),
2080                                                        0);
2081                        }
2082                }
2083
2084        if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2085                if (atomic_dec_and_test(&conf->pending_full_writes))
2086                        md_wakeup_thread(conf->mddev->thread);
2087}
2088
2089static void handle_stripe_dirtying5(raid5_conf_t *conf,
2090                struct stripe_head *sh, struct stripe_head_state *s, int disks)
2091{
2092        int rmw = 0, rcw = 0, i;
2093        for (i = disks; i--; ) {
2094                /* would I have to read this buffer for read_modify_write */
2095                struct r5dev *dev = &sh->dev[i];
2096                if ((dev->towrite || i == sh->pd_idx) &&
2097                    !test_bit(R5_LOCKED, &dev->flags) &&
2098                    !(test_bit(R5_UPTODATE, &dev->flags) ||
2099                      test_bit(R5_Wantcompute, &dev->flags))) {
2100                        if (test_bit(R5_Insync, &dev->flags))
2101                                rmw++;
2102                        else
2103                                rmw += 2*disks;  /* cannot read it */
2104                }
2105                /* Would I have to read this buffer for reconstruct_write */
2106                if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
2107                    !test_bit(R5_LOCKED, &dev->flags) &&
2108                    !(test_bit(R5_UPTODATE, &dev->flags) ||
2109                    test_bit(R5_Wantcompute, &dev->flags))) {
2110                        if (test_bit(R5_Insync, &dev->flags)) rcw++;
2111                        else
2112                                rcw += 2*disks;
2113                }
2114        }
2115        pr_debug("for sector %llu, rmw=%d rcw=%d\n",
2116                (unsigned long long)sh->sector, rmw, rcw);
2117        set_bit(STRIPE_HANDLE, &sh->state);
2118        if (rmw < rcw && rmw > 0)
2119                /* prefer read-modify-write, but need to get some data */
2120                for (i = disks; i--; ) {
2121                        struct r5dev *dev = &sh->dev[i];
2122                        if ((dev->towrite || i == sh->pd_idx) &&
2123                            !test_bit(R5_LOCKED, &dev->flags) &&
2124                            !(test_bit(R5_UPTODATE, &dev->flags) ||
2125                            test_bit(R5_Wantcompute, &dev->flags)) &&
2126                            test_bit(R5_Insync, &dev->flags)) {
2127                                if (
2128                                  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2129                                        pr_debug("Read_old block "
2130                                                "%d for r-m-w\n", i);
2131                                        set_bit(R5_LOCKED, &dev->flags);
2132                                        set_bit(R5_Wantread, &dev->flags);
2133                                        s->locked++;
2134                                } else {
2135                                        set_bit(STRIPE_DELAYED, &sh->state);
2136                                        set_bit(STRIPE_HANDLE, &sh->state);
2137                                }
2138                        }
2139                }
2140        if (rcw <= rmw && rcw > 0)
2141                /* want reconstruct write, but need to get some data */
2142                for (i = disks; i--; ) {
2143                        struct r5dev *dev = &sh->dev[i];
2144                        if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2145                            i != sh->pd_idx &&
2146                            !test_bit(R5_LOCKED, &dev->flags) &&
2147                            !(test_bit(R5_UPTODATE, &dev->flags) ||
2148                            test_bit(R5_Wantcompute, &dev->flags)) &&
2149                            test_bit(R5_Insync, &dev->flags)) {
2150                                if (
2151                                  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2152                                        pr_debug("Read_old block "
2153                                                "%d for Reconstruct\n", i);
2154                                        set_bit(R5_LOCKED, &dev->flags);
2155                                        set_bit(R5_Wantread, &dev->flags);
2156                                        s->locked++;
2157                                } else {
2158                                        set_bit(STRIPE_DELAYED, &sh->state);
2159                                        set_bit(STRIPE_HANDLE, &sh->state);
2160                                }
2161                        }
2162                }
2163        /* now if nothing is locked, and if we have enough data,
2164         * we can start a write request
2165         */
2166        /* since handle_stripe can be called at any time we need to handle the
2167         * case where a compute block operation has been submitted and then a
2168         * subsequent call wants to start a write request.  raid5_run_ops only
2169         * handles the case where compute block and postxor are requested
2170         * simultaneously.  If this is not the case then new writes need to be
2171         * held off until the compute completes.
2172         */
2173        if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2174            (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2175            !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2176                schedule_reconstruction5(sh, s, rcw == 0, 0);
2177}
2178
2179static void handle_stripe_dirtying6(raid5_conf_t *conf,
2180                struct stripe_head *sh, struct stripe_head_state *s,
2181                struct r6_state *r6s, int disks)
2182{
2183        int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
2184        int qd_idx = r6s->qd_idx;
2185        for (i = disks; i--; ) {
2186                struct r5dev *dev = &sh->dev[i];
2187                /* Would I have to read this buffer for reconstruct_write */
2188                if (!test_bit(R5_OVERWRITE, &dev->flags)
2189                    && i != pd_idx && i != qd_idx
2190                    && (!test_bit(R5_LOCKED, &dev->flags)
2191                            ) &&
2192                    !test_bit(R5_UPTODATE, &dev->flags)) {
2193                        if (test_bit(R5_Insync, &dev->flags)) rcw++;
2194                        else {
2195                                pr_debug("raid6: must_compute: "
2196                                        "disk %d flags=%#lx\n", i, dev->flags);
2197                                must_compute++;
2198                        }
2199                }
2200        }
2201        pr_debug("for sector %llu, rcw=%d, must_compute=%d\n",
2202               (unsigned long long)sh->sector, rcw, must_compute);
2203        set_bit(STRIPE_HANDLE, &sh->state);
2204
2205        if (rcw > 0)
2206                /* want reconstruct write, but need to get some data */
2207                for (i = disks; i--; ) {
2208                        struct r5dev *dev = &sh->dev[i];
2209                        if (!test_bit(R5_OVERWRITE, &dev->flags)
2210                            && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
2211                            && !test_bit(R5_LOCKED, &dev->flags) &&
2212                            !test_bit(R5_UPTODATE, &dev->flags) &&
2213                            test_bit(R5_Insync, &dev->flags)) {
2214                                if (
2215                                  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2216                                        pr_debug("Read_old stripe %llu "
2217                                                "block %d for Reconstruct\n",
2218                                             (unsigned long long)sh->sector, i);
2219                                        set_bit(R5_LOCKED, &dev->flags);
2220                                        set_bit(R5_Wantread, &dev->flags);
2221                                        s->locked++;
2222                                } else {
2223                                        pr_debug("Request delayed stripe %llu "
2224                                                "block %d for Reconstruct\n",
2225                                             (unsigned long long)sh->sector, i);
2226                                        set_bit(STRIPE_DELAYED, &sh->state);
2227                                        set_bit(STRIPE_HANDLE, &sh->state);
2228                                }
2229                        }
2230                }
2231        /* now if nothing is locked, and if we have enough data, we can start a
2232         * write request
2233         */
2234        if (s->locked == 0 && rcw == 0 &&
2235            !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2236                if (must_compute > 0) {
2237                        /* We have failed blocks and need to compute them */
2238                        switch (s->failed) {
2239                        case 0:
2240                                BUG();
2241                        case 1:
2242                                compute_block_1(sh, r6s->failed_num[0], 0);
2243                                break;
2244                        case 2:
2245                                compute_block_2(sh, r6s->failed_num[0],
2246                                                r6s->failed_num[1]);
2247                                break;
2248                        default: /* This request should have been failed? */
2249                                BUG();
2250                        }
2251                }
2252
2253                pr_debug("Computing parity for stripe %llu\n",
2254                        (unsigned long long)sh->sector);
2255                compute_parity6(sh, RECONSTRUCT_WRITE);
2256                /* now every locked buffer is ready to be written */
2257                for (i = disks; i--; )
2258                        if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2259                                pr_debug("Writing stripe %llu block %d\n",
2260                                       (unsigned long long)sh->sector, i);
2261                                s->locked++;
2262                                set_bit(R5_Wantwrite, &sh->dev[i].flags);
2263                        }
2264                if (s->locked == disks)
2265                        if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2266                                atomic_inc(&conf->pending_full_writes);
2267                /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2268                set_bit(STRIPE_INSYNC, &sh->state);
2269
2270                if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2271                        atomic_dec(&conf->preread_active_stripes);
2272                        if (atomic_read(&conf->preread_active_stripes) <
2273                            IO_THRESHOLD)
2274                                md_wakeup_thread(conf->mddev->thread);
2275                }
2276        }
2277}
2278
2279static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2280                                struct stripe_head_state *s, int disks)
2281{
2282        struct r5dev *dev = NULL;
2283
2284        set_bit(STRIPE_HANDLE, &sh->state);
2285
2286        switch (sh->check_state) {
2287        case check_state_idle:
2288                /* start a new check operation if there are no failures */
2289                if (s->failed == 0) {
2290                        BUG_ON(s->uptodate != disks);
2291                        sh->check_state = check_state_run;
2292                        set_bit(STRIPE_OP_CHECK, &s->ops_request);
2293                        clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2294                        s->uptodate--;
2295                        break;
2296                }
2297                dev = &sh->dev[s->failed_num];
2298                /* fall through */
2299        case check_state_compute_result:
2300                sh->check_state = check_state_idle;
2301                if (!dev)
2302                        dev = &sh->dev[sh->pd_idx];
2303
2304                /* check that a write has not made the stripe insync */
2305                if (test_bit(STRIPE_INSYNC, &sh->state))
2306                        break;
2307
2308                /* either failed parity check, or recovery is happening */
2309                BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
2310                BUG_ON(s->uptodate != disks);
2311
2312                set_bit(R5_LOCKED, &dev->flags);
2313                s->locked++;
2314                set_bit(R5_Wantwrite, &dev->flags);
2315
2316                clear_bit(STRIPE_DEGRADED, &sh->state);
2317                set_bit(STRIPE_INSYNC, &sh->state);
2318                break;
2319        case check_state_run:
2320                break; /* we will be called again upon completion */
2321        case check_state_check_result:
2322                sh->check_state = check_state_idle;
2323
2324                /* if a failure occurred during the check operation, leave
2325                 * STRIPE_INSYNC not set and let the stripe be handled again
2326                 */
2327                if (s->failed)
2328                        break;
2329
2330                /* handle a successful check operation, if parity is correct
2331                 * we are done.  Otherwise update the mismatch count and repair
2332                 * parity if !MD_RECOVERY_CHECK
2333                 */
2334                if (sh->ops.zero_sum_result == 0)
2335                        /* parity is correct (on disc,
2336                         * not in buffer any more)
2337                         */
2338                        set_bit(STRIPE_INSYNC, &sh->state);
2339                else {
2340                        conf->mddev->resync_mismatches += STRIPE_SECTORS;
2341                        if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2342                                /* don't try to repair!! */
2343                                set_bit(STRIPE_INSYNC, &sh->state);
2344                        else {
2345                                sh->check_state = check_state_compute_run;
2346                                set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2347                                set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2348                                set_bit(R5_Wantcompute,
2349                                        &sh->dev[sh->pd_idx].flags);
2350                                sh->ops.target = sh->pd_idx;
2351                                s->uptodate++;
2352                        }
2353                }
2354                break;
2355        case check_state_compute_run:
2356                break;
2357        default:
2358                printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2359                       __func__, sh->check_state,
2360                       (unsigned long long) sh->sector);
2361                BUG();
2362        }
2363}
2364
2365
2366static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2367                                struct stripe_head_state *s,
2368                                struct r6_state *r6s, struct page *tmp_page,
2369                                int disks)
2370{
2371        int update_p = 0, update_q = 0;
2372        struct r5dev *dev;
2373        int pd_idx = sh->pd_idx;
2374        int qd_idx = r6s->qd_idx;
2375
2376        set_bit(STRIPE_HANDLE, &sh->state);
2377
2378        BUG_ON(s->failed > 2);
2379        BUG_ON(s->uptodate < disks);
2380        /* Want to check and possibly repair P and Q.
2381         * However there could be one 'failed' device, in which
2382         * case we can only check one of them, possibly using the
2383         * other to generate missing data
2384         */
2385
2386        /* If !tmp_page, we cannot do the calculations,
2387         * but as we have set STRIPE_HANDLE, we will soon be called
2388         * by stripe_handle with a tmp_page - just wait until then.
2389         */
2390        if (tmp_page) {
2391                if (s->failed == r6s->q_failed) {
2392                        /* The only possible failed device holds 'Q', so it
2393                         * makes sense to check P (If anything else were failed,
2394                         * we would have used P to recreate it).
2395                         */
2396                        compute_block_1(sh, pd_idx, 1);
2397                        if (!page_is_zero(sh->dev[pd_idx].page)) {
2398                                compute_block_1(sh, pd_idx, 0);
2399                                update_p = 1;
2400                        }
2401                }
2402                if (!r6s->q_failed && s->failed < 2) {
2403                        /* q is not failed, and we didn't use it to generate
2404                         * anything, so it makes sense to check it
2405                         */
2406                        memcpy(page_address(tmp_page),
2407                               page_address(sh->dev[qd_idx].page),
2408                               STRIPE_SIZE);
2409                        compute_parity6(sh, UPDATE_PARITY);
2410                        if (memcmp(page_address(tmp_page),
2411                                   page_address(sh->dev[qd_idx].page),
2412                                   STRIPE_SIZE) != 0) {
2413                                clear_bit(STRIPE_INSYNC, &sh->state);
2414                                update_q = 1;
2415                        }
2416                }
2417                if (update_p || update_q) {
2418                        conf->mddev->resync_mismatches += STRIPE_SECTORS;
2419                        if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2420                                /* don't try to repair!! */
2421                                update_p = update_q = 0;
2422                }
2423
2424                /* now write out any block on a failed drive,
2425                 * or P or Q if they need it
2426                 */
2427
2428                if (s->failed == 2) {
2429                        dev = &sh->dev[r6s->failed_num[1]];
2430                        s->locked++;
2431                        set_bit(R5_LOCKED, &dev->flags);
2432                        set_bit(R5_Wantwrite, &dev->flags);
2433                }
2434                if (s->failed >= 1) {
2435                        dev = &sh->dev[r6s->failed_num[0]];
2436                        s->locked++;
2437                        set_bit(R5_LOCKED, &dev->flags);
2438                        set_bit(R5_Wantwrite, &dev->flags);
2439                }
2440
2441                if (update_p) {
2442                        dev = &sh->dev[pd_idx];
2443                        s->locked++;
2444                        set_bit(R5_LOCKED, &dev->flags);
2445                        set_bit(R5_Wantwrite, &dev->flags);
2446                }
2447                if (update_q) {
2448                        dev = &sh->dev[qd_idx];
2449                        s->locked++;
2450                        set_bit(R5_LOCKED, &dev->flags);
2451                        set_bit(R5_Wantwrite, &dev->flags);
2452                }
2453                clear_bit(STRIPE_DEGRADED, &sh->state);
2454
2455                set_bit(STRIPE_INSYNC, &sh->state);
2456        }
2457}
2458
2459static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2460                                struct r6_state *r6s)
2461{
2462        int i;
2463
2464        /* We have read all the blocks in this stripe and now we need to
2465         * copy some of them into a target stripe for expand.
2466         */
2467        struct dma_async_tx_descriptor *tx = NULL;
2468        clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2469        for (i = 0; i < sh->disks; i++)
2470                if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) {
2471                        int dd_idx, pd_idx, j;
2472                        struct stripe_head *sh2;
2473
2474                        sector_t bn = compute_blocknr(sh, i);
2475                        sector_t s = raid5_compute_sector(bn, conf->raid_disks,
2476                                                conf->raid_disks -
2477                                                conf->max_degraded, &dd_idx,
2478                                                &pd_idx, conf);
2479                        sh2 = get_active_stripe(conf, s, conf->raid_disks,
2480                                                pd_idx, 1);
2481                        if (sh2 == NULL)
2482                                /* so far only the early blocks of this stripe
2483                                 * have been requested.  When later blocks
2484                                 * get requested, we will try again
2485                                 */
2486                                continue;
2487                        if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2488                           test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
2489                                /* must have already done this block */
2490                                release_stripe(sh2);
2491                                continue;
2492                        }
2493
2494                        /* place all the copies on one channel */
2495                        tx = async_memcpy(sh2->dev[dd_idx].page,
2496                                sh->dev[i].page, 0, 0, STRIPE_SIZE,
2497                                ASYNC_TX_DEP_ACK, tx, NULL, NULL);
2498
2499                        set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2500                        set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2501                        for (j = 0; j < conf->raid_disks; j++)
2502                                if (j != sh2->pd_idx &&
2503                                    (!r6s || j != raid6_next_disk(sh2->pd_idx,
2504                                                                 sh2->disks)) &&
2505                                    !test_bit(R5_Expanded, &sh2->dev[j].flags))
2506                                        break;
2507                        if (j == conf->raid_disks) {
2508                                set_bit(STRIPE_EXPAND_READY, &sh2->state);
2509                                set_bit(STRIPE_HANDLE, &sh2->state);
2510                        }
2511                        release_stripe(sh2);
2512
2513                }
2514        /* done submitting copies, wait for them to complete */
2515        if (tx) {
2516                async_tx_ack(tx);
2517                dma_wait_for_async_tx(tx);
2518        }
2519}
2520
2521
2522/*
2523 * handle_stripe - do things to a stripe.
2524 *
2525 * We lock the stripe and then examine the state of various bits
2526 * to see what needs to be done.
2527 * Possible results:
2528 *    return some read request which now have data
2529 *    return some write requests which are safely on disc
2530 *    schedule a read on some buffers
2531 *    schedule a write of some buffers
2532 *    return confirmation of parity correctness
2533 *
2534 * buffers are taken off read_list or write_list, and bh_cache buffers
2535 * get BH_Lock set before the stripe lock is released.
2536 *
2537 */
2538
2539static bool handle_stripe5(struct stripe_head *sh)
2540{
2541        raid5_conf_t *conf = sh->raid_conf;
2542        int disks = sh->disks, i;
2543        struct bio *return_bi = NULL;
2544        struct stripe_head_state s;
2545        struct r5dev *dev;
2546        mdk_rdev_t *blocked_rdev = NULL;
2547        int prexor;
2548
2549        memset(&s, 0, sizeof(s));
2550        pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
2551                 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
2552                 atomic_read(&sh->count), sh->pd_idx, sh->check_state,
2553                 sh->reconstruct_state);
2554
2555        spin_lock(&sh->lock);
2556        clear_bit(STRIPE_HANDLE, &sh->state);
2557        clear_bit(STRIPE_DELAYED, &sh->state);
2558
2559        s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
2560        s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2561        s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
2562
2563        /* Now to look around and see what can be done */
2564        rcu_read_lock();
2565        for (i=disks; i--; ) {
2566                mdk_rdev_t *rdev;
2567                struct r5dev *dev = &sh->dev[i];
2568                clear_bit(R5_Insync, &dev->flags);
2569
2570                pr_debug("check %d: state 0x%lx toread %p read %p write %p "
2571                        "written %p\n", i, dev->flags, dev->toread, dev->read,
2572                        dev->towrite, dev->written);
2573
2574                /* maybe we can request a biofill operation
2575                 *
2576                 * new wantfill requests are only permitted while
2577                 * ops_complete_biofill is guaranteed to be inactive
2578                 */
2579                if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
2580                    !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
2581                        set_bit(R5_Wantfill, &dev->flags);
2582
2583                /* now count some things */
2584                if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
2585                if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
2586                if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++;
2587
2588                if (test_bit(R5_Wantfill, &dev->flags))
2589                        s.to_fill++;
2590                else if (dev->toread)
2591                        s.to_read++;
2592                if (dev->towrite) {
2593                        s.to_write++;
2594                        if (!test_bit(R5_OVERWRITE, &dev->flags))
2595                                s.non_overwrite++;
2596                }
2597                if (dev->written)
2598                        s.written++;
2599                rdev = rcu_dereference(conf->disks[i].rdev);
2600                if (blocked_rdev == NULL &&
2601                    rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
2602                        blocked_rdev = rdev;
2603                        atomic_inc(&rdev->nr_pending);
2604                }
2605                if (!rdev || !test_bit(In_sync, &rdev->flags)) {
2606                        /* The ReadError flag will just be confusing now */
2607                        clear_bit(R5_ReadError, &dev->flags);
2608                        clear_bit(R5_ReWrite, &dev->flags);
2609                }
2610                if (!rdev || !test_bit(In_sync, &rdev->flags)
2611                    || test_bit(R5_ReadError, &dev->flags)) {
2612                        s.failed++;
2613                        s.failed_num = i;
2614                } else
2615                        set_bit(R5_Insync, &dev->flags);
2616        }
2617        rcu_read_unlock();
2618
2619        if (unlikely(blocked_rdev)) {
2620                if (s.syncing || s.expanding || s.expanded ||
2621                    s.to_write || s.written) {
2622                        set_bit(STRIPE_HANDLE, &sh->state);
2623                        goto unlock;
2624                }
2625                /* There is nothing for the blocked_rdev to block */
2626                rdev_dec_pending(blocked_rdev, conf->mddev);
2627                blocked_rdev = NULL;
2628        }
2629
2630        if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
2631                set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
2632                set_bit(STRIPE_BIOFILL_RUN, &sh->state);
2633        }
2634
2635        pr_debug("locked=%d uptodate=%d to_read=%d"
2636                " to_write=%d failed=%d failed_num=%d\n",
2637                s.locked, s.uptodate, s.to_read, s.to_write,
2638                s.failed, s.failed_num);
2639        /* check if the array has lost two devices and, if so, some requests might
2640         * need to be failed
2641         */
2642        if (s.failed > 1 && s.to_read+s.to_write+s.written)
2643                handle_failed_stripe(conf, sh, &s, disks, &return_bi);
2644        if (s.failed > 1 && s.syncing) {
2645                md_done_sync(conf->mddev, STRIPE_SECTORS,0);
2646                clear_bit(STRIPE_SYNCING, &sh->state);
2647                s.syncing = 0;
2648        }
2649
2650        /* might be able to return some write requests if the parity block
2651         * is safe, or on a failed drive
2652         */
2653        dev = &sh->dev[sh->pd_idx];
2654        if ( s.written &&
2655             ((test_bit(R5_Insync, &dev->flags) &&
2656               !test_bit(R5_LOCKED, &dev->flags) &&
2657               test_bit(R5_UPTODATE, &dev->flags)) ||
2658               (s.failed == 1 && s.failed_num == sh->pd_idx)))
2659                handle_stripe_clean_event(conf, sh, disks, &return_bi);
2660
2661        /* Now we might consider reading some blocks, either to check/generate
2662         * parity, or to satisfy requests
2663         * or to load a block that is being partially written.
2664         */
2665        if (s.to_read || s.non_overwrite ||
2666            (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
2667                handle_stripe_fill5(sh, &s, disks);
2668
2669        /* Now we check to see if any write operations have recently
2670         * completed
2671         */
2672        prexor = 0;
2673        if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
2674                prexor = 1;
2675        if (sh->reconstruct_state == reconstruct_state_drain_result ||
2676            sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
2677                sh->reconstruct_state = reconstruct_state_idle;
2678
2679                /* All the 'written' buffers and the parity block are ready to
2680                 * be written back to disk
2681                 */
2682                BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
2683                for (i = disks; i--; ) {
2684                        dev = &sh->dev[i];
2685                        if (test_bit(R5_LOCKED, &dev->flags) &&
2686                                (i == sh->pd_idx || dev->written)) {
2687                                pr_debug("Writing block %d\n", i);
2688                                set_bit(R5_Wantwrite, &dev->flags);
2689                                if (prexor)
2690                                        continue;
2691                                if (!test_bit(R5_Insync, &dev->flags) ||
2692                                    (i == sh->pd_idx && s.failed == 0))
2693                                        set_bit(STRIPE_INSYNC, &sh->state);
2694                        }
2695                }
2696                if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2697                        atomic_dec(&conf->preread_active_stripes);
2698                        if (atomic_read(&conf->preread_active_stripes) <
2699                                IO_THRESHOLD)
2700                                md_wakeup_thread(conf->mddev->thread);
2701                }
2702        }
2703
2704        /* Now to consider new write requests and what else, if anything
2705         * should be read.  We do not handle new writes when:
2706         * 1/ A 'write' operation (copy+xor) is already in flight.
2707         * 2/ A 'check' operation is in flight, as it may clobber the parity
2708         *    block.
2709         */
2710        if (s.to_write && !sh->reconstruct_state && !sh->check_state)
2711                handle_stripe_dirtying5(conf, sh, &s, disks);
2712
2713        /* maybe we need to check and possibly fix the parity for this stripe
2714         * Any reads will already have been scheduled, so we just see if enough
2715         * data is available.  The parity check is held off while parity
2716         * dependent operations are in flight.
2717         */
2718        if (sh->check_state ||
2719            (s.syncing && s.locked == 0 &&
2720             !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
2721             !test_bit(STRIPE_INSYNC, &sh->state)))
2722                handle_parity_checks5(conf, sh, &s, disks);
2723
2724        if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
2725                md_done_sync(conf->mddev, STRIPE_SECTORS,1);
2726                clear_bit(STRIPE_SYNCING, &sh->state);
2727        }
2728
2729        /* If the failed drive is just a ReadError, then we might need to progress
2730         * the repair/check process
2731         */
2732        if (s.failed == 1 && !conf->mddev->ro &&
2733            test_bit(R5_ReadError, &sh->dev[s.failed_num].flags)
2734            && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags)
2735            && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags)
2736                ) {
2737                dev = &sh->dev[s.failed_num];
2738                if (!test_bit(R5_ReWrite, &dev->flags)) {
2739                        set_bit(R5_Wantwrite, &dev->flags);
2740                        set_bit(R5_ReWrite, &dev->flags);
2741                        set_bit(R5_LOCKED, &dev->flags);
2742                        s.locked++;
2743                } else {
2744                        /* let's read it back */
2745                        set_bit(R5_Wantread, &dev->flags);
2746                        set_bit(R5_LOCKED, &dev->flags);
2747                        s.locked++;
2748                }
2749        }
2750
2751        /* Finish reconstruct operations initiated by the expansion process */
2752        if (sh->reconstruct_state == reconstruct_state_result) {
2753                sh->reconstruct_state = reconstruct_state_idle;
2754                clear_bit(STRIPE_EXPANDING, &sh->state);
2755                for (i = conf->raid_disks; i--; ) {
2756                        set_bit(R5_Wantwrite, &sh->dev[i].flags);
2757                        set_bit(R5_LOCKED, &sh->dev[i].flags);
2758                        s.locked++;
2759                }
2760        }
2761
2762        if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
2763            !sh->reconstruct_state) {
2764                /* Need to write out all blocks after computing parity */
2765                sh->disks = conf->raid_disks;
2766                sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
2767                        conf->raid_disks);
2768                schedule_reconstruction5(sh, &s, 1, 1);
2769        } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
2770                clear_bit(STRIPE_EXPAND_READY, &sh->state);
2771                atomic_dec(&conf->reshape_stripes);
2772                wake_up(&conf->wait_for_overlap);
2773                md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
2774        }
2775
2776        if (s.expanding && s.locked == 0 &&
2777            !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
2778                handle_stripe_expansion(conf, sh, NULL);
2779
2780 unlock:
2781        spin_unlock(&sh->lock);
2782
2783        /* wait for this device to become unblocked */
2784        if (unlikely(blocked_rdev))
2785                md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
2786
2787        if (s.ops_request)
2788                raid5_run_ops(sh, s.ops_request);
2789
2790        ops_run_io(sh, &s);
2791
2792        return_io(return_bi);
2793
2794        return blocked_rdev == NULL;
2795}
2796
2797static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2798{
2799        raid6_conf_t *conf = sh->raid_conf;
2800        int disks = sh->disks;
2801        struct bio *return_bi = NULL;
2802        int i, pd_idx = sh->pd_idx;
2803        struct stripe_head_state s;
2804        struct r6_state r6s;
2805        struct r5dev *dev, *pdev, *qdev;
2806        mdk_rdev_t *blocked_rdev = NULL;
2807
2808        r6s.qd_idx = raid6_next_disk(pd_idx, disks);
2809        pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
2810                "pd_idx=%d, qd_idx=%d\n",
2811               (unsigned long long)sh->sector, sh->state,
2812               atomic_read(&sh->count), pd_idx, r6s.qd_idx);
2813        memset(&s, 0, sizeof(s));
2814
2815        spin_lock(&sh->lock);
2816        clear_bit(STRIPE_HANDLE, &sh->state);
2817        clear_bit(STRIPE_DELAYED, &sh->state);
2818
2819        s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
2820        s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2821        s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
2822        /* Now to look around and see what can be done */
2823
2824        rcu_read_lock();
2825        for (i=disks; i--; ) {
2826                mdk_rdev_t *rdev;
2827                dev = &sh->dev[i];
2828                clear_bit(R5_Insync, &dev->flags);
2829
2830                pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
2831                        i, dev->flags, dev->toread, dev->towrite, dev->written);
2832                /* maybe we can reply to a read */
2833                if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
2834                        struct bio *rbi, *rbi2;
2835                        pr_debug("Return read for disc %d\n", i);
2836                        spin_lock_irq(&conf->device_lock);
2837                        rbi = dev->toread;
2838                        dev->toread = NULL;
2839                        if (test_and_clear_bit(R5_Overlap, &dev->flags))
2840                                wake_up(&conf->wait_for_overlap);
2841                        spin_unlock_irq(&conf->device_lock);
2842                        while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
2843                                copy_data(0, rbi, dev->page, dev->sector);
2844                                rbi2 = r5_next_bio(rbi, dev->sector);
2845                                spin_lock_irq(&conf->device_lock);
2846                                if (!raid5_dec_bi_phys_segments(rbi)) {
2847                                        rbi->bi_next = return_bi;
2848                                        return_bi = rbi;
2849                                }
2850                                spin_unlock_irq(&conf->device_lock);
2851                                rbi = rbi2;
2852                        }
2853                }
2854
2855                /* now count some things */
2856                if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
2857                if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
2858
2859
2860                if (dev->toread)
2861                        s.to_read++;
2862                if (dev->towrite) {
2863                        s.to_write++;
2864                        if (!test_bit(R5_OVERWRITE, &dev->flags))
2865                                s.non_overwrite++;
2866                }
2867                if (dev->written)
2868                        s.written++;
2869                rdev = rcu_dereference(conf->disks[i].rdev);
2870                if (blocked_rdev == NULL &&
2871                    rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
2872                        blocked_rdev = rdev;
2873                        atomic_inc(&rdev->nr_pending);
2874                }
2875                if (!rdev || !test_bit(In_sync, &rdev->flags)) {
2876                        /* The ReadError flag will just be confusing now */
2877                        clear_bit(R5_ReadError, &dev->flags);
2878                        clear_bit(R5_ReWrite, &dev->flags);
2879                }
2880                if (!rdev || !test_bit(In_sync, &rdev->flags)
2881                    || test_bit(R5_ReadError, &dev->flags)) {
2882                        if (s.failed < 2)
2883                                r6s.failed_num[s.failed] = i;
2884                        s.failed++;
2885                } else
2886                        set_bit(R5_Insync, &dev->flags);
2887        }
2888        rcu_read_unlock();
2889
2890        if (unlikely(blocked_rdev)) {
2891                if (s.syncing || s.expanding || s.expanded ||
2892                    s.to_write || s.written) {
2893                        set_bit(STRIPE_HANDLE, &sh->state);
2894                        goto unlock;
2895                }
2896                /* There is nothing for the blocked_rdev to block */
2897                rdev_dec_pending(blocked_rdev, conf->mddev);
2898                blocked_rdev = NULL;
2899        }
2900
2901        pr_debug("locked=%d uptodate=%d to_read=%d"
2902               " to_write=%d failed=%d failed_num=%d,%d\n",
2903               s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
2904               r6s.failed_num[0], r6s.failed_num[1]);
2905        /* check if the array has lost >2 devices and, if so, some requests
2906         * might need to be failed
2907         */
2908        if (s.failed > 2 && s.to_read+s.to_write+s.written)
2909                handle_failed_stripe(conf, sh, &s, disks, &return_bi);
2910        if (s.failed > 2 && s.syncing) {
2911                md_done_sync(conf->mddev, STRIPE_SECTORS,0);
2912                clear_bit(STRIPE_SYNCING, &sh->state);
2913                s.syncing = 0;
2914        }
2915
2916        /*
2917         * might be able to return some write requests if the parity blocks
2918         * are safe, or on a failed drive
2919         */
2920        pdev = &sh->dev[pd_idx];
2921        r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
2922                || (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
2923        qdev = &sh->dev[r6s.qd_idx];
2924        r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx)
2925                || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx);
2926
2927        if ( s.written &&
2928             ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
2929                             && !test_bit(R5_LOCKED, &pdev->flags)
2930                             && test_bit(R5_UPTODATE, &pdev->flags)))) &&
2931             ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
2932                             && !test_bit(R5_LOCKED, &qdev->flags)
2933                             && test_bit(R5_UPTODATE, &qdev->flags)))))
2934                handle_stripe_clean_event(conf, sh, disks, &return_bi);
2935
2936        /* Now we might consider reading some blocks, either to check/generate
2937         * parity, or to satisfy requests
2938         * or to load a block that is being partially written.
2939         */
2940        if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
2941            (s.syncing && (s.uptodate < disks)) || s.expanding)
2942                handle_stripe_fill6(sh, &s, &r6s, disks);
2943
2944        /* now to consider writing and what else, if anything should be read */
2945        if (s.to_write)
2946                handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
2947
2948        /* maybe we need to check and possibly fix the parity for this stripe
2949         * Any reads will already have been scheduled, so we just see if enough
2950         * data is available
2951         */
2952        if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state))
2953                handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks);
2954
2955        if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
2956                md_done_sync(conf->mddev, STRIPE_SECTORS,1);
2957                clear_bit(STRIPE_SYNCING, &sh->state);
2958        }
2959
2960        /* If the failed drives are just a ReadError, then we might need
2961         * to progress the repair/check process
2962         */
2963        if (s.failed <= 2 && !conf->mddev->ro)
2964                for (i = 0; i < s.failed; i++) {
2965                        dev = &sh->dev[r6s.failed_num[i]];
2966                        if (test_bit(R5_ReadError, &dev->flags)
2967                            && !test_bit(R5_LOCKED, &dev->flags)
2968                            && test_bit(R5_UPTODATE, &dev->flags)
2969                                ) {
2970                                if (!test_bit(R5_ReWrite, &dev->flags)) {
2971                                        set_bit(R5_Wantwrite, &dev->flags);
2972                                        set_bit(R5_ReWrite, &dev->flags);
2973                                        set_bit(R5_LOCKED, &dev->flags);
2974                                } else {
2975                                        /* let's read it back */
2976                                        set_bit(R5_Wantread, &dev->flags);
2977                                        set_bit(R5_LOCKED, &dev->flags);
2978                                }
2979                        }
2980                }
2981
2982        if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
2983                /* Need to write out all blocks after computing P&Q */
2984                sh->disks = conf->raid_disks;
2985                sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
2986                                             conf->raid_disks);
2987                compute_parity6(sh, RECONSTRUCT_WRITE);
2988                for (i = conf->raid_disks ; i-- ;  ) {
2989                        set_bit(R5_LOCKED, &sh->dev[i].flags);
2990                        s.locked++;
2991                        set_bit(R5_Wantwrite, &sh->dev[i].flags);
2992                }
2993                clear_bit(STRIPE_EXPANDING, &sh->state);
2994        } else if (s.expanded) {
2995                clear_bit(STRIPE_EXPAND_READY, &sh->state);
2996                atomic_dec(&conf->reshape_stripes);
2997                wake_up(&conf->wait_for_overlap);
2998                md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
2999        }
3000
3001        if (s.expanding && s.locked == 0 &&
3002            !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3003                handle_stripe_expansion(conf, sh, &r6s);
3004
3005 unlock:
3006        spin_unlock(&sh->lock);
3007
3008        /* wait for this device to become unblocked */
3009        if (unlikely(blocked_rdev))
3010                md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3011
3012        ops_run_io(sh, &s);
3013
3014        return_io(return_bi);
3015
3016        return blocked_rdev == NULL;
3017}
3018
3019/* returns true if the stripe was handled */
3020static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page)
3021{
3022        if (sh->raid_conf->level == 6)
3023                return handle_stripe6(sh, tmp_page);
3024        else
3025                return handle_stripe5(sh);
3026}
3027
3028
3029
3030static void raid5_activate_delayed(raid5_conf_t *conf)
3031{
3032        if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
3033                while (!list_empty(&conf->delayed_list)) {
3034                        struct list_head *l = conf->delayed_list.next;
3035                        struct stripe_head *sh;
3036                        sh = list_entry(l, struct stripe_head, lru);
3037                        list_del_init(l);
3038                        clear_bit(STRIPE_DELAYED, &sh->state);
3039                        if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3040                                atomic_inc(&conf->preread_active_stripes);
3041                        list_add_tail(&sh->lru, &conf->hold_list);
3042                }
3043        } else
3044                blk_plug_device(conf->mddev->queue);
3045}
3046
3047static void activate_bit_delay(raid5_conf_t *conf)
3048{
3049        /* device_lock is held */
3050        struct list_head head;
3051        list_add(&head, &conf->bitmap_list);
3052        list_del_init(&conf->bitmap_list);
3053        while (!list_empty(&head)) {
3054                struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
3055                list_del_init(&sh->lru);
3056                atomic_inc(&sh->count);
3057                __release_stripe(conf, sh);
3058        }
3059}
3060
3061static void unplug_slaves(mddev_t *mddev)
3062{
3063        raid5_conf_t *conf = mddev_to_conf(mddev);
3064        int i;
3065
3066        rcu_read_lock();
3067        for (i=0; i<mddev->raid_disks; i++) {
3068                mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
3069                if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
3070                        struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
3071
3072                        atomic_inc(&rdev->nr_pending);
3073                        rcu_read_unlock();
3074
3075                        blk_unplug(r_queue);
3076
3077                        rdev_dec_pending(rdev, mddev);
3078                        rcu_read_lock();
3079                }
3080        }
3081        rcu_read_unlock();
3082}
3083
3084static void raid5_unplug_device(struct request_queue *q)
3085{
3086        mddev_t *mddev = q->queuedata;
3087        raid5_conf_t *conf = mddev_to_conf(mddev);
3088        unsigned long flags;
3089
3090        spin_lock_irqsave(&conf->device_lock, flags);
3091
3092        if (blk_remove_plug(q)) {
3093                conf->seq_flush++;
3094                raid5_activate_delayed(conf);
3095        }
3096        md_wakeup_thread(mddev->thread);
3097
3098        spin_unlock_irqrestore(&conf->device_lock, flags);
3099
3100        unplug_slaves(mddev);
3101}
3102
3103static int raid5_congested(void *data, int bits)
3104{
3105        mddev_t *mddev = data;
3106        raid5_conf_t *conf = mddev_to_conf(mddev);
3107
3108        /* No difference between reads and writes.  Just check
3109         * how busy the stripe_cache is
3110         */
3111        if (conf->inactive_blocked)
3112                return 1;
3113        if (conf->quiesce)
3114                return 1;
3115        if (list_empty_careful(&conf->inactive_list))
3116                return 1;
3117
3118        return 0;
3119}
3120
3121/* We want read requests to align with chunks where possible,
3122 * but write requests don't need to.
3123 */
3124static int raid5_mergeable_bvec(struct request_queue *q,
3125                                struct bvec_merge_data *bvm,
3126                                struct bio_vec *biovec)
3127{
3128        mddev_t *mddev = q->queuedata;
3129        sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
3130        int max;
3131        unsigned int chunk_sectors = mddev->chunk_size >> 9;
3132        unsigned int bio_sectors = bvm->bi_size >> 9;
3133
3134        if ((bvm->bi_rw & 1) == WRITE)
3135                return biovec->bv_len; /* always allow writes to be mergeable */
3136
3137        max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
3138        if (max < 0) max = 0;
3139        if (max <= biovec->bv_len && bio_sectors == 0)
3140                return biovec->bv_len;
3141        else
3142                return max;
3143}
3144
3145
3146static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
3147{
3148        sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
3149        unsigned int chunk_sectors = mddev->chunk_size >> 9;
3150        unsigned int bio_sectors = bio->bi_size >> 9;
3151
3152        return  chunk_sectors >=
3153                ((sector & (chunk_sectors - 1)) + bio_sectors);
3154}
3155
3156/*
3157 *  add bio to the retry LIFO  ( in O(1) ... we are in interrupt )
3158 *  later sampled by raid5d.
3159 */
3160static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf)
3161{
3162        unsigned long flags;
3163
3164        spin_lock_irqsave(&conf->device_lock, flags);
3165
3166        bi->bi_next = conf->retry_read_aligned_list;
3167        conf->retry_read_aligned_list = bi;
3168
3169        spin_unlock_irqrestore(&conf->device_lock, flags);
3170        md_wakeup_thread(conf->mddev->thread);
3171}
3172
3173
3174static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
3175{
3176        struct bio *bi;
3177
3178        bi = conf->retry_read_aligned;
3179        if (bi) {
3180                conf->retry_read_aligned = NULL;
3181                return bi;
3182        }
3183        bi = conf->retry_read_aligned_list;
3184        if(bi) {
3185                conf->retry_read_aligned_list = bi->bi_next;
3186                bi->bi_next = NULL;
3187                /*
3188                 * this sets the active strip count to 1 and the processed
3189                 * strip count to zero (upper 8 bits)
3190                 */
3191                bi->bi_phys_segments = 1; /* biased count of active stripes */
3192        }
3193
3194        return bi;
3195}
3196
3197
3198/*
3199 *  The "raid5_align_endio" should check if the read succeeded and if it
3200 *  did, call bio_endio on the original bio (having bio_put the new bio
3201 *  first).
3202 *  If the read failed..
3203 */
3204static void raid5_align_endio(struct bio *bi, int error)
3205{
3206        struct bio* raid_bi  = bi->bi_private;
3207        mddev_t *mddev;
3208        raid5_conf_t *conf;
3209        int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
3210        mdk_rdev_t *rdev;
3211
3212        bio_put(bi);
3213
3214        mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
3215        conf = mddev_to_conf(mddev);
3216        rdev = (void*)raid_bi->bi_next;
3217        raid_bi->bi_next = NULL;
3218
3219        rdev_dec_pending(rdev, conf->mddev);
3220
3221        if (!error && uptodate) {
3222                bio_endio(raid_bi, 0);
3223                if (atomic_dec_and_test(&conf->active_aligned_reads))
3224                        wake_up(&conf->wait_for_stripe);
3225                return;
3226        }
3227
3228
3229        pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
3230
3231        add_bio_to_retry(raid_bi, conf);
3232}
3233
3234static int bio_fits_rdev(struct bio *bi)
3235{
3236        struct request_queue *q = bdev_get_queue(bi->bi_bdev);
3237
3238        if ((bi->bi_size>>9) > q->max_sectors)
3239                return 0;
3240        blk_recount_segments(q, bi);
3241        if (bi->bi_phys_segments > q->max_phys_segments)
3242                return 0;
3243
3244        if (q->merge_bvec_fn)
3245                /* it's too hard to apply the merge_bvec_fn at this stage,
3246                 * just just give up
3247                 */
3248                return 0;
3249
3250        return 1;
3251}
3252
3253
3254static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
3255{
3256        mddev_t *mddev = q->queuedata;
3257        raid5_conf_t *conf = mddev_to_conf(mddev);
3258        const unsigned int raid_disks = conf->raid_disks;
3259        const unsigned int data_disks = raid_disks - conf->max_degraded;
3260        unsigned int dd_idx, pd_idx;
3261        struct bio* align_bi;
3262        mdk_rdev_t *rdev;
3263
3264        if (!in_chunk_boundary(mddev, raid_bio)) {
3265                pr_debug("chunk_aligned_read : non aligned\n");
3266                return 0;
3267        }
3268        /*
3269         * use bio_clone to make a copy of the bio
3270         */
3271        align_bi = bio_clone(raid_bio, GFP_NOIO);
3272        if (!align_bi)
3273                return 0;
3274        /*
3275         *   set bi_end_io to a new function, and set bi_private to the
3276         *     original bio.
3277         */
3278        align_bi->bi_end_io  = raid5_align_endio;
3279        align_bi->bi_private = raid_bio;
3280        /*
3281         *      compute position
3282         */
3283        align_bi->bi_sector =  raid5_compute_sector(raid_bio->bi_sector,
3284                                        raid_disks,
3285                                        data_disks,
3286                                        &dd_idx,
3287                                        &pd_idx,
3288                                        conf);
3289
3290        rcu_read_lock();
3291        rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3292        if (rdev && test_bit(In_sync, &rdev->flags)) {
3293                atomic_inc(&rdev->nr_pending);
3294                rcu_read_unlock();
3295                raid_bio->bi_next = (void*)rdev;
3296                align_bi->bi_bdev =  rdev->bdev;
3297                align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
3298                align_bi->bi_sector += rdev->data_offset;
3299
3300                if (!bio_fits_rdev(align_bi)) {
3301                        /* too big in some way */
3302                        bio_put(align_bi);
3303                        rdev_dec_pending(rdev, mddev);
3304                        return 0;
3305                }
3306
3307                spin_lock_irq(&conf->device_lock);
3308                wait_event_lock_irq(conf->wait_for_stripe,
3309                                    conf->quiesce == 0,
3310                                    conf->device_lock, /* nothing */);
3311                atomic_inc(&conf->active_aligned_reads);
3312                spin_unlock_irq(&conf->device_lock);
3313
3314                generic_make_request(align_bi);
3315                return 1;
3316        } else {
3317                rcu_read_unlock();
3318                bio_put(align_bi);
3319                return 0;
3320        }
3321}
3322
3323/* __get_priority_stripe - get the next stripe to process
3324 *
3325 * Full stripe writes are allowed to pass preread active stripes up until
3326 * the bypass_threshold is exceeded.  In general the bypass_count
3327 * increments when the handle_list is handled before the hold_list; however, it
3328 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a
3329 * stripe with in flight i/o.  The bypass_count will be reset when the
3330 * head of the hold_list has changed, i.e. the head was promoted to the
3331 * handle_list.
3332 */
3333static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
3334{
3335        struct stripe_head *sh;
3336
3337        pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
3338                  __func__,
3339                  list_empty(&conf->handle_list) ? "empty" : "busy",
3340                  list_empty(&conf->hold_list) ? "empty" : "busy",
3341                  atomic_read(&conf->pending_full_writes), conf->bypass_count);
3342
3343        if (!list_empty(&conf->handle_list)) {
3344                sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
3345
3346                if (list_empty(&conf->hold_list))
3347                        conf->bypass_count = 0;
3348                else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
3349                        if (conf->hold_list.next == conf->last_hold)
3350                                conf->bypass_count++;
3351                        else {
3352                                conf->last_hold = conf->hold_list.next;
3353                                conf->bypass_count -= conf->bypass_threshold;
3354                                if (conf->bypass_count < 0)
3355                                        conf->bypass_count = 0;
3356                        }
3357                }
3358        } else if (!list_empty(&conf->hold_list) &&
3359                   ((conf->bypass_threshold &&
3360                     conf->bypass_count > conf->bypass_threshold) ||
3361                    atomic_read(&conf->pending_full_writes) == 0)) {
3362                sh = list_entry(conf->hold_list.next,
3363                                typeof(*sh), lru);
3364                conf->bypass_count -= conf->bypass_threshold;
3365                if (conf->bypass_count < 0)
3366                        conf->bypass_count = 0;
3367        } else
3368                return NULL;
3369
3370        list_del_init(&sh->lru);
3371        atomic_inc(&sh->count);
3372        BUG_ON(atomic_read(&sh->count) != 1);
3373        return sh;
3374}
3375
3376static int make_request(struct request_queue *q, struct bio * bi)
3377{
3378        mddev_t *mddev = q->queuedata;
3379        raid5_conf_t *conf = mddev_to_conf(mddev);
3380        unsigned int dd_idx, pd_idx;
3381        sector_t new_sector;
3382        sector_t logical_sector, last_sector;
3383        struct stripe_head *sh;
3384        const int rw = bio_data_dir(bi);
3385        int cpu, remaining;
3386
3387        if (unlikely(bio_barrier(bi))) {
3388                bio_endio(bi, -EOPNOTSUPP);
3389                return 0;
3390        }
3391
3392        md_write_start(mddev, bi);
3393
3394        cpu = part_stat_lock();
3395        part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
3396        part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
3397                      bio_sectors(bi));
3398        part_stat_unlock();
3399
3400        if (rw == READ &&
3401             mddev->reshape_position == MaxSector &&
3402             chunk_aligned_read(q,bi))
3403                return 0;
3404
3405        logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
3406        last_sector = bi->bi_sector + (bi->bi_size>>9);
3407        bi->bi_next = NULL;
3408        bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
3409
3410        for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
3411                DEFINE_WAIT(w);
3412                int disks, data_disks;
3413
3414        retry:
3415                prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
3416                if (likely(conf->expand_progress == MaxSector))
3417                        disks = conf->raid_disks;
3418                else {
3419                        /* spinlock is needed as expand_progress may be
3420                         * 64bit on a 32bit platform, and so it might be
3421                         * possible to see a half-updated value
3422                         * Ofcourse expand_progress could change after
3423                         * the lock is dropped, so once we get a reference
3424                         * to the stripe that we think it is, we will have
3425                         * to check again.
3426                         */
3427                        spin_lock_irq(&conf->device_lock);
3428                        disks = conf->raid_disks;
3429                        if (logical_sector >= conf->expand_progress)
3430                                disks = conf->previous_raid_disks;
3431                        else {
3432                                if (logical_sector >= conf->expand_lo) {
3433                                        spin_unlock_irq(&conf->device_lock);
3434                                        schedule();
3435                                        goto retry;
3436                                }
3437                        }
3438                        spin_unlock_irq(&conf->device_lock);
3439                }
3440                data_disks = disks - conf->max_degraded;
3441
3442                new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
3443                                                  &dd_idx, &pd_idx, conf);
3444                pr_debug("raid5: make_request, sector %llu logical %llu\n",
3445                        (unsigned long long)new_sector, 
3446                        (unsigned long long)logical_sector);
3447
3448                sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
3449                if (sh) {
3450                        if (unlikely(conf->expand_progress != MaxSector)) {
3451                                /* expansion might have moved on while waiting for a
3452                                 * stripe, so we must do the range check again.
3453                                 * Expansion could still move past after this
3454                                 * test, but as we are holding a reference to
3455                                 * 'sh', we know that if that happens,
3456                                 *  STRIPE_EXPANDING will get set and the expansion
3457                                 * won't proceed until we finish with the stripe.
3458                                 */
3459                                int must_retry = 0;
3460                                spin_lock_irq(&conf->device_lock);
3461                                if (logical_sector <  conf->expand_progress &&
3462                                    disks == conf->previous_raid_disks)
3463                                        /* mismatch, need to try again */
3464                                        must_retry = 1;
3465                                spin_unlock_irq(&conf->device_lock);
3466                                if (must_retry) {
3467                                        release_stripe(sh);
3468                                        goto retry;
3469                                }
3470                        }
3471                        /* FIXME what if we get a false positive because these
3472                         * are being updated.
3473                         */
3474                        if (logical_sector >= mddev->suspend_lo &&
3475                            logical_sector < mddev->suspend_hi) {
3476                                release_stripe(sh);
3477                                schedule();
3478                                goto retry;
3479                        }
3480
3481                        if (test_bit(STRIPE_EXPANDING, &sh->state) ||
3482                            !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
3483                                /* Stripe is busy expanding or
3484                                 * add failed due to overlap.  Flush everything
3485                                 * and wait a while
3486                                 */
3487                                raid5_unplug_device(mddev->queue);
3488                                release_stripe(sh);
3489                                schedule();
3490                                goto retry;
3491                        }
3492                        finish_wait(&conf->wait_for_overlap, &w);
3493                        set_bit(STRIPE_HANDLE, &sh->state);
3494                        clear_bit(STRIPE_DELAYED, &sh->state);
3495                        release_stripe(sh);
3496                } else {
3497                        /* cannot get stripe for read-ahead, just give-up */
3498                        clear_bit(BIO_UPTODATE, &bi->bi_flags);
3499                        finish_wait(&conf->wait_for_overlap, &w);
3500                        break;
3501                }
3502                        
3503        }
3504        spin_lock_irq(&conf->device_lock);
3505        remaining = raid5_dec_bi_phys_segments(bi);
3506        spin_unlock_irq(&conf->device_lock);
3507        if (remaining == 0) {
3508
3509                if ( rw == WRITE )
3510                        md_write_end(mddev);
3511
3512                bio_endio(bi, 0);
3513        }
3514        return 0;
3515}
3516
3517static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
3518{
3519        /* reshaping is quite different to recovery/resync so it is
3520         * handled quite separately ... here.
3521         *
3522         * On each call to sync_request, we gather one chunk worth of
3523         * destination stripes and flag them as expanding.
3524         * Then we find all the source stripes and request reads.
3525         * As the reads complete, handle_stripe will copy the data
3526         * into the destination stripe and release that stripe.
3527         */
3528        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
3529        struct stripe_head *sh;
3530        int pd_idx;
3531        sector_t first_sector, last_sector;
3532        int raid_disks = conf->previous_raid_disks;
3533        int data_disks = raid_disks - conf->max_degraded;
3534        int new_data_disks = conf->raid_disks - conf->max_degraded;
3535        int i;
3536        int dd_idx;
3537        sector_t writepos, safepos, gap;
3538
3539        if (sector_nr == 0 &&
3540            conf->expand_progress != 0) {
3541                /* restarting in the middle, skip the initial sectors */
3542                sector_nr = conf->expand_progress;
3543                sector_div(sector_nr, new_data_disks);
3544                *skipped = 1;
3545                return sector_nr;
3546        }
3547
3548        /* we update the metadata when there is more than 3Meg
3549         * in the block range (that is rather arbitrary, should
3550         * probably be time based) or when the data about to be
3551         * copied would over-write the source of the data at
3552         * the front of the range.
3553         * i.e. one new_stripe forward from expand_progress new_maps
3554         * to after where expand_lo old_maps to
3555         */
3556        writepos = conf->expand_progress +
3557                conf->chunk_size/512*(new_data_disks);
3558        sector_div(writepos, new_data_disks);
3559        safepos = conf->expand_lo;
3560        sector_div(safepos, data_disks);
3561        gap = conf->expand_progress - conf->expand_lo;
3562
3563        if (writepos >= safepos ||
3564            gap > (new_data_disks)*3000*2 /*3Meg*/) {
3565                /* Cannot proceed until we've updated the superblock... */
3566                wait_event(conf->wait_for_overlap,
3567                           atomic_read(&conf->reshape_stripes)==0);
3568                mddev->reshape_position = conf->expand_progress;
3569                set_bit(MD_CHANGE_DEVS, &mddev->flags);
3570                md_wakeup_thread(mddev->thread);
3571                wait_event(mddev->sb_wait, mddev->flags == 0 ||
3572                           kthread_should_stop());
3573                spin_lock_irq(&conf->device_lock);
3574                conf->expand_lo = mddev->reshape_position;
3575                spin_unlock_irq(&conf->device_lock);
3576                wake_up(&conf->wait_for_overlap);
3577        }
3578
3579        for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
3580                int j;
3581                int skipped = 0;
3582                pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
3583                sh = get_active_stripe(conf, sector_nr+i,
3584                                       conf->raid_disks, pd_idx, 0);
3585                set_bit(STRIPE_EXPANDING, &sh->state);
3586                atomic_inc(&conf->reshape_stripes);
3587                /* If any of this stripe is beyond the end of the old
3588                 * array, then we need to zero those blocks
3589                 */
3590                for (j=sh->disks; j--;) {
3591                        sector_t s;
3592                        if (j == sh->pd_idx)
3593                                continue;
3594                        if (conf->level == 6 &&
3595                            j == raid6_next_disk(sh->pd_idx, sh->disks))
3596                                continue;
3597                        s = compute_blocknr(sh, j);
3598                        if (s < mddev->array_sectors) {
3599                                skipped = 1;
3600                                continue;
3601                        }
3602                        memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
3603                        set_bit(R5_Expanded, &sh->dev[j].flags);
3604                        set_bit(R5_UPTODATE, &sh->dev[j].flags);
3605                }
3606                if (!skipped) {
3607                        set_bit(STRIPE_EXPAND_READY, &sh->state);
3608                        set_bit(STRIPE_HANDLE, &sh->state);
3609                }
3610                release_stripe(sh);
3611        }
3612        spin_lock_irq(&conf->device_lock);
3613        conf->expand_progress = (sector_nr + i) * new_data_disks;
3614        spin_unlock_irq(&conf->device_lock);
3615        /* Ok, those stripe are ready. We can start scheduling
3616         * reads on the source stripes.
3617         * The source stripes are determined by mapping the first and last
3618         * block on the destination stripes.
3619         */
3620        first_sector =
3621                raid5_compute_sector(sector_nr*(new_data_disks),
3622                                     raid_disks, data_disks,
3623                                     &dd_idx, &pd_idx, conf);
3624        last_sector =
3625                raid5_compute_sector((sector_nr+conf->chunk_size/512)
3626                                     *(new_data_disks) -1,
3627                                     raid_disks, data_disks,
3628                                     &dd_idx, &pd_idx, conf);
3629        if (last_sector >= (mddev->size<<1))
3630                last_sector = (mddev->size<<1)-1;
3631        while (first_sector <= last_sector) {
3632                pd_idx = stripe_to_pdidx(first_sector, conf,
3633                                         conf->previous_raid_disks);
3634                sh = get_active_stripe(conf, first_sector,
3635                                       conf->previous_raid_disks, pd_idx, 0);
3636                set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3637                set_bit(STRIPE_HANDLE, &sh->state);
3638                release_stripe(sh);
3639                first_sector += STRIPE_SECTORS;
3640        }
3641        /* If this takes us to the resync_max point where we have to pause,
3642         * then we need to write out the superblock.
3643         */
3644        sector_nr += conf->chunk_size>>9;
3645        if (sector_nr >= mddev->resync_max) {
3646                /* Cannot proceed until we've updated the superblock... */
3647                wait_event(conf->wait_for_overlap,
3648                           atomic_read(&conf->reshape_stripes) == 0);
3649                mddev->reshape_position = conf->expand_progress;
3650                set_bit(MD_CHANGE_DEVS, &mddev->flags);
3651                md_wakeup_thread(mddev->thread);
3652                wait_event(mddev->sb_wait,
3653                           !test_bit(MD_CHANGE_DEVS, &mddev->flags)
3654                           || kthread_should_stop());
3655                spin_lock_irq(&conf->device_lock);
3656                conf->expand_lo = mddev->reshape_position;
3657                spin_unlock_irq(&conf->device_lock);
3658                wake_up(&conf->wait_for_overlap);
3659        }
3660        return conf->chunk_size>>9;
3661}
3662
3663/* FIXME go_faster isn't used */
3664static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
3665{
3666        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
3667        struct stripe_head *sh;
3668        int pd_idx;
3669        int raid_disks = conf->raid_disks;
3670        sector_t max_sector = mddev->size << 1;
3671        int sync_blocks;
3672        int still_degraded = 0;
3673        int i;
3674
3675        if (sector_nr >= max_sector) {
3676                /* just being told to finish up .. nothing much to do */
3677                unplug_slaves(mddev);
3678                if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
3679                        end_reshape(conf);
3680                        return 0;
3681                }
3682
3683                if (mddev->curr_resync < max_sector) /* aborted */
3684                        bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
3685                                        &sync_blocks, 1);
3686                else /* completed sync */
3687                        conf->fullsync = 0;
3688                bitmap_close_sync(mddev->bitmap);
3689
3690                return 0;
3691        }
3692
3693        if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3694                return reshape_request(mddev, sector_nr, skipped);
3695
3696        /* No need to check resync_max as we never do more than one
3697         * stripe, and as resync_max will always be on a chunk boundary,
3698         * if the check in md_do_sync didn't fire, there is no chance
3699         * of overstepping resync_max here
3700         */
3701
3702        /* if there is too many failed drives and we are trying
3703         * to resync, then assert that we are finished, because there is
3704         * nothing we can do.
3705         */
3706        if (mddev->degraded >= conf->max_degraded &&
3707            test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3708                sector_t rv = (mddev->size << 1) - sector_nr;
3709                *skipped = 1;
3710                return rv;
3711        }
3712        if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
3713            !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
3714            !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
3715                /* we can skip this block, and probably more */
3716                sync_blocks /= STRIPE_SECTORS;
3717                *skipped = 1;
3718                return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
3719        }
3720
3721
3722        bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3723
3724        pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
3725        sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
3726        if (sh == NULL) {
3727                sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
3728                /* make sure we don't swamp the stripe cache if someone else
3729                 * is trying to get access
3730                 */
3731                schedule_timeout_uninterruptible(1);
3732        }
3733        /* Need to check if array will still be degraded after recovery/resync
3734         * We don't need to check the 'failed' flag as when that gets set,
3735         * recovery aborts.
3736         */
3737        for (i=0; i<mddev->raid_disks; i++)
3738                if (conf->disks[i].rdev == NULL)
3739                        still_degraded = 1;
3740
3741        bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
3742
3743        spin_lock(&sh->lock);
3744        set_bit(STRIPE_SYNCING, &sh->state);
3745        clear_bit(STRIPE_INSYNC, &sh->state);
3746        spin_unlock(&sh->lock);
3747
3748        /* wait for any blocked device to be handled */
3749        while(unlikely(!handle_stripe(sh, NULL)))
3750                ;
3751        release_stripe(sh);
3752
3753        return STRIPE_SECTORS;
3754}
3755
3756static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
3757{
3758        /* We may not be able to submit a whole bio at once as there
3759         * may not be enough stripe_heads available.
3760         * We cannot pre-allocate enough stripe_heads as we may need
3761         * more than exist in the cache (if we allow ever large chunks).
3762         * So we do one stripe head at a time and record in
3763         * ->bi_hw_segments how many have been done.
3764         *
3765         * We *know* that this entire raid_bio is in one chunk, so
3766         * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
3767         */
3768        struct stripe_head *sh;
3769        int dd_idx, pd_idx;
3770        sector_t sector, logical_sector, last_sector;
3771        int scnt = 0;
3772        int remaining;
3773        int handled = 0;
3774
3775        logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
3776        sector = raid5_compute_sector(  logical_sector,
3777                                        conf->raid_disks,
3778                                        conf->raid_disks - conf->max_degraded,
3779                                        &dd_idx,
3780                                        &pd_idx,
3781                                        conf);
3782        last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
3783
3784        for (; logical_sector < last_sector;
3785             logical_sector += STRIPE_SECTORS,
3786                     sector += STRIPE_SECTORS,
3787                     scnt++) {
3788
3789                if (scnt < raid5_bi_hw_segments(raid_bio))
3790                        /* already done this stripe */
3791                        continue;
3792
3793                sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1);
3794
3795                if (!sh) {
3796                        /* failed to get a stripe - must wait */
3797                        raid5_set_bi_hw_segments(raid_bio, scnt);
3798                        conf->retry_read_aligned = raid_bio;
3799                        return handled;
3800                }
3801
3802                set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
3803                if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
3804                        release_stripe(sh);
3805                        raid5_set_bi_hw_segments(raid_bio, scnt);
3806                        conf->retry_read_aligned = raid_bio;
3807                        return handled;
3808                }
3809
3810                handle_stripe(sh, NULL);
3811                release_stripe(sh);
3812                handled++;
3813        }
3814        spin_lock_irq(&conf->device_lock);
3815        remaining = raid5_dec_bi_phys_segments(raid_bio);
3816        spin_unlock_irq(&conf->device_lock);
3817        if (remaining == 0)
3818                bio_endio(raid_bio, 0);
3819        if (atomic_dec_and_test(&conf->active_aligned_reads))
3820                wake_up(&conf->wait_for_stripe);
3821        return handled;
3822}
3823
3824
3825
3826/*
3827 * This is our raid5 kernel thread.
3828 *
3829 * We scan the hash table for stripes which can be handled now.
3830 * During the scan, completed stripes are saved for us by the interrupt
3831 * handler, so that they will not have to wait for our next wakeup.
3832 */
3833static void raid5d(mddev_t *mddev)
3834{
3835        struct stripe_head *sh;
3836        raid5_conf_t *conf = mddev_to_conf(mddev);
3837        int handled;
3838
3839        pr_debug("+++ raid5d active\n");
3840
3841        md_check_recovery(mddev);
3842
3843        handled = 0;
3844        spin_lock_irq(&conf->device_lock);
3845        while (1) {
3846                struct bio *bio;
3847
3848                if (conf->seq_flush != conf->seq_write) {
3849                        int seq = conf->seq_flush;
3850                        spin_unlock_irq(&conf->device_lock);
3851                        bitmap_unplug(mddev->bitmap);
3852                        spin_lock_irq(&conf->device_lock);
3853                        conf->seq_write = seq;
3854                        activate_bit_delay(conf);
3855                }
3856
3857                while ((bio = remove_bio_from_retry(conf))) {
3858                        int ok;
3859                        spin_unlock_irq(&conf->device_lock);
3860                        ok = retry_aligned_read(conf, bio);
3861                        spin_lock_irq(&conf->device_lock);
3862                        if (!ok)
3863                                break;
3864                        handled++;
3865                }
3866
3867                sh = __get_priority_stripe(conf);
3868
3869                if (!sh)
3870                        break;
3871                spin_unlock_irq(&conf->device_lock);
3872                
3873                handled++;
3874                handle_stripe(sh, conf->spare_page);
3875                release_stripe(sh);
3876
3877                spin_lock_irq(&conf->device_lock);
3878        }
3879        pr_debug("%d stripes handled\n", handled);
3880
3881        spin_unlock_irq(&conf->device_lock);
3882
3883        async_tx_issue_pending_all();
3884        unplug_slaves(mddev);
3885
3886        pr_debug("--- raid5d inactive\n");
3887}
3888
3889static ssize_t
3890raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
3891{
3892        raid5_conf_t *conf = mddev_to_conf(mddev);
3893        if (conf)
3894                return sprintf(page, "%d\n", conf->max_nr_stripes);
3895        else
3896                return 0;
3897}
3898
3899static ssize_t
3900raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
3901{
3902        raid5_conf_t *conf = mddev_to_conf(mddev);
3903        unsigned long new;
3904        int err;
3905
3906        if (len >= PAGE_SIZE)
3907                return -EINVAL;
3908        if (!conf)
3909                return -ENODEV;
3910
3911        if (strict_strtoul(page, 10, &new))
3912                return -EINVAL;
3913        if (new <= 16 || new > 32768)
3914                return -EINVAL;
3915        while (new < conf->max_nr_stripes) {
3916                if (drop_one_stripe(conf))
3917                        conf->max_nr_stripes--;
3918                else
3919                        break;
3920        }
3921        err = md_allow_write(mddev);
3922        if (err)
3923                return err;
3924        while (new > conf->max_nr_stripes) {
3925                if (grow_one_stripe(conf))
3926                        conf->max_nr_stripes++;
3927                else break;
3928        }
3929        return len;
3930}
3931
3932static struct md_sysfs_entry
3933raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
3934                                raid5_show_stripe_cache_size,
3935                                raid5_store_stripe_cache_size);
3936
3937static ssize_t
3938raid5_show_preread_threshold(mddev_t *mddev, char *page)
3939{
3940        raid5_conf_t *conf = mddev_to_conf(mddev);
3941        if (conf)
3942                return sprintf(page, "%d\n", conf->bypass_threshold);
3943        else
3944                return 0;
3945}
3946
3947static ssize_t
3948raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len)
3949{
3950        raid5_conf_t *conf = mddev_to_conf(mddev);
3951        unsigned long new;
3952        if (len >= PAGE_SIZE)
3953                return -EINVAL;
3954        if (!conf)
3955                return -ENODEV;
3956
3957        if (strict_strtoul(page, 10, &new))
3958                return -EINVAL;
3959        if (new > conf->max_nr_stripes)
3960                return -EINVAL;
3961        conf->bypass_threshold = new;
3962        return len;
3963}
3964
3965static struct md_sysfs_entry
3966raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
3967                                        S_IRUGO | S_IWUSR,
3968                                        raid5_show_preread_threshold,
3969                                        raid5_store_preread_threshold);
3970
3971static ssize_t
3972stripe_cache_active_show(mddev_t *mddev, char *page)
3973{
3974        raid5_conf_t *conf = mddev_to_conf(mddev);
3975        if (conf)
3976                return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
3977        else
3978                return 0;
3979}
3980
3981static struct md_sysfs_entry
3982raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
3983
3984static struct attribute *raid5_attrs[] =  {
3985        &raid5_stripecache_size.attr,
3986        &raid5_stripecache_active.attr,
3987        &raid5_preread_bypass_threshold.attr,
3988        NULL,
3989};
3990static struct attribute_group raid5_attrs_group = {
3991        .name = NULL,
3992        .attrs = raid5_attrs,
3993};
3994
3995static int run(mddev_t *mddev)
3996{
3997        raid5_conf_t *conf;
3998        int raid_disk, memory;
3999        mdk_rdev_t *rdev;
4000        struct disk_info *disk;
4001        struct list_head *tmp;
4002        int working_disks = 0;
4003
4004        if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
4005                printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
4006                       mdname(mddev), mddev->level);
4007                return -EIO;
4008        }
4009
4010        if (mddev->chunk_size < PAGE_SIZE) {
4011                printk(KERN_ERR "md/raid5: chunk_size must be at least "
4012                       "PAGE_SIZE but %d < %ld\n",
4013                       mddev->chunk_size, PAGE_SIZE);
4014                return -EINVAL;
4015        }
4016
4017        if (mddev->reshape_position != MaxSector) {
4018                /* Check that we can continue the reshape.
4019                 * Currently only disks can change, it must
4020                 * increase, and we must be past the point where
4021                 * a stripe over-writes itself
4022                 */
4023                sector_t here_new, here_old;
4024                int old_disks;
4025                int max_degraded = (mddev->level == 5 ? 1 : 2);
4026
4027                if (mddev->new_level != mddev->level ||
4028                    mddev->new_layout != mddev->layout ||
4029                    mddev->new_chunk != mddev->chunk_size) {
4030                        printk(KERN_ERR "raid5: %s: unsupported reshape "
4031                               "required - aborting.\n",
4032                               mdname(mddev));
4033                        return -EINVAL;
4034                }
4035                if (mddev->delta_disks <= 0) {
4036                        printk(KERN_ERR "raid5: %s: unsupported reshape "
4037                               "(reduce disks) required - aborting.\n",
4038                               mdname(mddev));
4039                        return -EINVAL;
4040                }
4041                old_disks = mddev->raid_disks - mddev->delta_disks;
4042                /* reshape_position must be on a new-stripe boundary, and one
4043                 * further up in new geometry must map after here in old
4044                 * geometry.
4045                 */
4046                here_new = mddev->reshape_position;
4047                if (sector_div(here_new, (mddev->chunk_size>>9)*
4048                               (mddev->raid_disks - max_degraded))) {
4049                        printk(KERN_ERR "raid5: reshape_position not "
4050                               "on a stripe boundary\n");
4051                        return -EINVAL;
4052                }
4053                /* here_new is the stripe we will write to */
4054                here_old = mddev->reshape_position;
4055                sector_div(here_old, (mddev->chunk_size>>9)*
4056                           (old_disks-max_degraded));
4057                /* here_old is the first stripe that we might need to read
4058                 * from */
4059                if (here_new >= here_old) {
4060                        /* Reading from the same stripe as writing to - bad */
4061                        printk(KERN_ERR "raid5: reshape_position too early for "
4062                               "auto-recovery - aborting.\n");
4063                        return -EINVAL;
4064                }
4065                printk(KERN_INFO "raid5: reshape will continue\n");
4066                /* OK, we should be able to continue; */
4067        }
4068
4069
4070        mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL);
4071        if ((conf = mddev->private) == NULL)
4072                goto abort;
4073        if (mddev->reshape_position == MaxSector) {
4074                conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks;
4075        } else {
4076                conf->raid_disks = mddev->raid_disks;
4077                conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
4078        }
4079
4080        conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
4081                              GFP_KERNEL);
4082        if (!conf->disks)
4083                goto abort;
4084
4085        conf->mddev = mddev;
4086
4087        if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4088                goto abort;
4089
4090        if (mddev->level == 6) {
4091                conf->spare_page = alloc_page(GFP_KERNEL);
4092                if (!conf->spare_page)
4093                        goto abort;
4094        }
4095        spin_lock_init(&conf->device_lock);
4096        mddev->queue->queue_lock = &conf->device_lock;
4097        init_waitqueue_head(&conf->wait_for_stripe);
4098        init_waitqueue_head(&conf->wait_for_overlap);
4099        INIT_LIST_HEAD(&conf->handle_list);
4100        INIT_LIST_HEAD(&conf->hold_list);
4101        INIT_LIST_HEAD(&conf->delayed_list);
4102        INIT_LIST_HEAD(&conf->bitmap_list);
4103        INIT_LIST_HEAD(&conf->inactive_list);
4104        atomic_set(&conf->active_stripes, 0);
4105        atomic_set(&conf->preread_active_stripes, 0);
4106        atomic_set(&conf->active_aligned_reads, 0);
4107        conf->bypass_threshold = BYPASS_THRESHOLD;
4108
4109        pr_debug("raid5: run(%s) called.\n", mdname(mddev));
4110
4111        rdev_for_each(rdev, tmp, mddev) {
4112                raid_disk = rdev->raid_disk;
4113                if (raid_disk >= conf->raid_disks
4114                    || raid_disk < 0)
4115                        continue;
4116                disk = conf->disks + raid_disk;
4117
4118                disk->rdev = rdev;
4119
4120                if (test_bit(In_sync, &rdev->flags)) {
4121                        char b[BDEVNAME_SIZE];
4122                        printk(KERN_INFO "raid5: device %s operational as raid"
4123                                " disk %d\n", bdevname(rdev->bdev,b),
4124                                raid_disk);
4125                        working_disks++;
4126                } else
4127                        /* Cannot rely on bitmap to complete recovery */
4128                        conf->fullsync = 1;
4129        }
4130
4131        /*
4132         * 0 for a fully functional array, 1 or 2 for a degraded array.
4133         */
4134        mddev->degraded = conf->raid_disks - working_disks;
4135        conf->mddev = mddev;
4136        conf->chunk_size = mddev->chunk_size;
4137        conf->level = mddev->level;
4138        if (conf->level == 6)
4139                conf->max_degraded = 2;
4140        else
4141                conf->max_degraded = 1;
4142        conf->algorithm = mddev->layout;
4143        conf->max_nr_stripes = NR_STRIPES;
4144        conf->expand_progress = mddev->reshape_position;
4145
4146        /* device size must be a multiple of chunk size */
4147        mddev->size &= ~(mddev->chunk_size/1024 -1);
4148        mddev->resync_max_sectors = mddev->size << 1;
4149
4150        if (conf->level == 6 && conf->raid_disks < 4) {
4151                printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
4152                       mdname(mddev), conf->raid_disks);
4153                goto abort;
4154        }
4155        if (!conf->chunk_size || conf->chunk_size % 4) {
4156                printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
4157                        conf->chunk_size, mdname(mddev));
4158                goto abort;
4159        }
4160        if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
4161                printk(KERN_ERR 
4162                        "raid5: unsupported parity algorithm %d for %s\n",
4163                        conf->algorithm, mdname(mddev));
4164                goto abort;
4165        }
4166        if (mddev->degraded > conf->max_degraded) {
4167                printk(KERN_ERR "raid5: not enough operational devices for %s"
4168                        " (%d/%d failed)\n",
4169                        mdname(mddev), mddev->degraded, conf->raid_disks);
4170                goto abort;
4171        }
4172
4173        if (mddev->degraded > 0 &&
4174            mddev->recovery_cp != MaxSector) {
4175                if (mddev->ok_start_degraded)
4176                        printk(KERN_WARNING
4177                               "raid5: starting dirty degraded array: %s"
4178                               "- data corruption possible.\n",
4179                               mdname(mddev));
4180                else {
4181                        printk(KERN_ERR
4182                               "raid5: cannot start dirty degraded array for %s\n",
4183                               mdname(mddev));
4184                        goto abort;
4185                }
4186        }
4187
4188        {
4189                mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5");
4190                if (!mddev->thread) {
4191                        printk(KERN_ERR 
4192                                "raid5: couldn't allocate thread for %s\n",
4193                                mdname(mddev));
4194                        goto abort;
4195                }
4196        }
4197        memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
4198                 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
4199        if (grow_stripes(conf, conf->max_nr_stripes)) {
4200                printk(KERN_ERR 
4201                        "raid5: couldn't allocate %dkB for buffers\n", memory);
4202                shrink_stripes(conf);
4203                md_unregister_thread(mddev->thread);
4204                goto abort;
4205        } else
4206                printk(KERN_INFO "raid5: allocated %dkB for %s\n",
4207                        memory, mdname(mddev));
4208
4209        if (mddev->degraded == 0)
4210                printk("raid5: raid level %d set %s active with %d out of %d"
4211                        " devices, algorithm %d\n", conf->level, mdname(mddev), 
4212                        mddev->raid_disks-mddev->degraded, mddev->raid_disks,
4213                        conf->algorithm);
4214        else
4215                printk(KERN_ALERT "raid5: raid level %d set %s active with %d"
4216                        " out of %d devices, algorithm %d\n", conf->level,
4217                        mdname(mddev), mddev->raid_disks - mddev->degraded,
4218                        mddev->raid_disks, conf->algorithm);
4219
4220        print_raid5_conf(conf);
4221
4222        if (conf->expand_progress != MaxSector) {
4223                printk("...ok start reshape thread\n");
4224                conf->expand_lo = conf->expand_progress;
4225                atomic_set(&conf->reshape_stripes, 0);
4226                clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4227                clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4228                set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4229                set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4230                mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4231                                                        "%s_reshape");
4232        }
4233
4234        /* read-ahead size must cover two whole stripes, which is
4235         * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
4236         */
4237        {
4238                int data_disks = conf->previous_raid_disks - conf->max_degraded;
4239                int stripe = data_disks *
4240                        (mddev->chunk_size / PAGE_SIZE);
4241                if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4242                        mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4243        }
4244
4245        /* Ok, everything is just fine now */
4246        if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
4247                printk(KERN_WARNING
4248                       "raid5: failed to create sysfs attributes for %s\n",
4249                       mdname(mddev));
4250
4251        mddev->queue->unplug_fn = raid5_unplug_device;
4252        mddev->queue->backing_dev_info.congested_data = mddev;
4253        mddev->queue->backing_dev_info.congested_fn = raid5_congested;
4254
4255        mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks -
4256                                            conf->max_degraded);
4257
4258        blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
4259
4260        return 0;
4261abort:
4262        if (conf) {
4263                print_raid5_conf(conf);
4264                safe_put_page(conf->spare_page);
4265                kfree(conf->disks);
4266                kfree(conf->stripe_hashtbl);
4267                kfree(conf);
4268        }
4269        mddev->private = NULL;
4270        printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
4271        return -EIO;
4272}
4273
4274
4275
4276static int stop(mddev_t *mddev)
4277{
4278        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
4279
4280        md_unregister_thread(mddev->thread);
4281        mddev->thread = NULL;
4282        shrink_stripes(conf);
4283        kfree(conf->stripe_hashtbl);
4284        mddev->queue->backing_dev_info.congested_fn = NULL;
4285        blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
4286        sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
4287        kfree(conf->disks);
4288        kfree(conf);
4289        mddev->private = NULL;
4290        return 0;
4291}
4292
4293#ifdef DEBUG
4294static void print_sh(struct seq_file *seq, struct stripe_head *sh)
4295{
4296        int i;
4297
4298        seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
4299                   (unsigned long long)sh->sector, sh->pd_idx, sh->state);
4300        seq_printf(seq, "sh %llu,  count %d.\n",
4301                   (unsigned long long)sh->sector, atomic_read(&sh->count));
4302        seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
4303        for (i = 0; i < sh->disks; i++) {
4304                seq_printf(seq, "(cache%d: %p %ld) ",
4305                           i, sh->dev[i].page, sh->dev[i].flags);
4306        }
4307        seq_printf(seq, "\n");
4308}
4309
4310static void printall(struct seq_file *seq, raid5_conf_t *conf)
4311{
4312        struct stripe_head *sh;
4313        struct hlist_node *hn;
4314        int i;
4315
4316        spin_lock_irq(&conf->device_lock);
4317        for (i = 0; i < NR_HASH; i++) {
4318                hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
4319                        if (sh->raid_conf != conf)
4320                                continue;
4321                        print_sh(seq, sh);
4322                }
4323        }
4324        spin_unlock_irq(&conf->device_lock);
4325}
4326#endif
4327
4328static void status(struct seq_file *seq, mddev_t *mddev)
4329{
4330        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
4331        int i;
4332
4333        seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
4334        seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
4335        for (i = 0; i < conf->raid_disks; i++)
4336                seq_printf (seq, "%s",
4337                               conf->disks[i].rdev &&
4338                               test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
4339        seq_printf (seq, "]");
4340#ifdef DEBUG
4341        seq_printf (seq, "\n");
4342        printall(seq, conf);
4343#endif
4344}
4345
4346static void print_raid5_conf (raid5_conf_t *conf)
4347{
4348        int i;
4349        struct disk_info *tmp;
4350
4351        printk("RAID5 conf printout:\n");
4352        if (!conf) {
4353                printk("(conf==NULL)\n");
4354                return;
4355        }
4356        printk(" --- rd:%d wd:%d\n", conf->raid_disks,
4357                 conf->raid_disks - conf->mddev->degraded);
4358
4359        for (i = 0; i < conf->raid_disks; i++) {
4360                char b[BDEVNAME_SIZE];
4361                tmp = conf->disks + i;
4362                if (tmp->rdev)
4363                printk(" disk %d, o:%d, dev:%s\n",
4364                        i, !test_bit(Faulty, &tmp->rdev->flags),
4365                        bdevname(tmp->rdev->bdev,b));
4366        }
4367}
4368
4369static int raid5_spare_active(mddev_t *mddev)
4370{
4371        int i;
4372        raid5_conf_t *conf = mddev->private;
4373        struct disk_info *tmp;
4374
4375        for (i = 0; i < conf->raid_disks; i++) {
4376                tmp = conf->disks + i;
4377                if (tmp->rdev
4378                    && !test_bit(Faulty, &tmp->rdev->flags)
4379                    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
4380                        unsigned long flags;
4381                        spin_lock_irqsave(&conf->device_lock, flags);
4382                        mddev->degraded--;
4383                        spin_unlock_irqrestore(&conf->device_lock, flags);
4384                }
4385        }
4386        print_raid5_conf(conf);
4387        return 0;
4388}
4389
4390static int raid5_remove_disk(mddev_t *mddev, int number)
4391{
4392        raid5_conf_t *conf = mddev->private;
4393        int err = 0;
4394        mdk_rdev_t *rdev;
4395        struct disk_info *p = conf->disks + number;
4396
4397        print_raid5_conf(conf);
4398        rdev = p->rdev;
4399        if (rdev) {
4400                if (test_bit(In_sync, &rdev->flags) ||
4401                    atomic_read(&rdev->nr_pending)) {
4402                        err = -EBUSY;
4403                        goto abort;
4404                }
4405                /* Only remove non-faulty devices if recovery
4406                 * isn't possible.
4407                 */
4408                if (!test_bit(Faulty, &rdev->flags) &&
4409                    mddev->degraded <= conf->max_degraded) {
4410                        err = -EBUSY;
4411                        goto abort;
4412                }
4413                p->rdev = NULL;
4414                synchronize_rcu();
4415                if (atomic_read(&rdev->nr_pending)) {
4416                        /* lost the race, try later */
4417                        err = -EBUSY;
4418                        p->rdev = rdev;
4419                }
4420        }
4421abort:
4422
4423        print_raid5_conf(conf);
4424        return err;
4425}
4426
4427static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
4428{
4429        raid5_conf_t *conf = mddev->private;
4430        int err = -EEXIST;
4431        int disk;
4432        struct disk_info *p;
4433        int first = 0;
4434        int last = conf->raid_disks - 1;
4435
4436        if (mddev->degraded > conf->max_degraded)
4437                /* no point adding a device */
4438                return -EINVAL;
4439
4440        if (rdev->raid_disk >= 0)
4441                first = last = rdev->raid_disk;
4442
4443        /*
4444         * find the disk ... but prefer rdev->saved_raid_disk
4445         * if possible.
4446         */
4447        if (rdev->saved_raid_disk >= 0 &&
4448            rdev->saved_raid_disk >= first &&
4449            conf->disks[rdev->saved_raid_disk].rdev == NULL)
4450                disk = rdev->saved_raid_disk;
4451        else
4452                disk = first;
4453        for ( ; disk <= last ; disk++)
4454                if ((p=conf->disks + disk)->rdev == NULL) {
4455                        clear_bit(In_sync, &rdev->flags);
4456                        rdev->raid_disk = disk;
4457                        err = 0;
4458                        if (rdev->saved_raid_disk != disk)
4459                                conf->fullsync = 1;
4460                        rcu_assign_pointer(p->rdev, rdev);
4461                        break;
4462                }
4463        print_raid5_conf(conf);
4464        return err;
4465}
4466
4467static int raid5_resize(mddev_t *mddev, sector_t sectors)
4468{
4469        /* no resync is happening, and there is enough space
4470         * on all devices, so we can resize.
4471         * We need to make sure resync covers any new space.
4472         * If the array is shrinking we should possibly wait until
4473         * any io in the removed space completes, but it hardly seems
4474         * worth it.
4475         */
4476        raid5_conf_t *conf = mddev_to_conf(mddev);
4477
4478        sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
4479        mddev->array_sectors = sectors * (mddev->raid_disks
4480                                          - conf->max_degraded);
4481        set_capacity(mddev->gendisk, mddev->array_sectors);
4482        mddev->changed = 1;
4483        if (sectors/2  > mddev->size && mddev->recovery_cp == MaxSector) {
4484                mddev->recovery_cp = mddev->size << 1;
4485                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4486        }
4487        mddev->size = sectors /2;
4488        mddev->resync_max_sectors = sectors;
4489        return 0;
4490}
4491
4492#ifdef CONFIG_MD_RAID5_RESHAPE
4493static int raid5_check_reshape(mddev_t *mddev)
4494{
4495        raid5_conf_t *conf = mddev_to_conf(mddev);
4496        int err;
4497
4498        if (mddev->delta_disks < 0 ||
4499            mddev->new_level != mddev->level)
4500                return -EINVAL; /* Cannot shrink array or change level yet */
4501        if (mddev->delta_disks == 0)
4502                return 0; /* nothing to do */
4503        if (mddev->bitmap)
4504                /* Cannot grow a bitmap yet */
4505                return -EBUSY;
4506
4507        /* Can only proceed if there are plenty of stripe_heads.
4508         * We need a minimum of one full stripe,, and for sensible progress
4509         * it is best to have about 4 times that.
4510         * If we require 4 times, then the default 256 4K stripe_heads will
4511         * allow for chunk sizes up to 256K, which is probably OK.
4512         * If the chunk size is greater, user-space should request more
4513         * stripe_heads first.
4514         */
4515        if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
4516            (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
4517                printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
4518                       (mddev->chunk_size / STRIPE_SIZE)*4);
4519                return -ENOSPC;
4520        }
4521
4522        err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
4523        if (err)
4524                return err;
4525
4526        if (mddev->degraded > conf->max_degraded)
4527                return -EINVAL;
4528        /* looks like we might be able to manage this */
4529        return 0;
4530}
4531
4532static int raid5_start_reshape(mddev_t *mddev)
4533{
4534        raid5_conf_t *conf = mddev_to_conf(mddev);
4535        mdk_rdev_t *rdev;
4536        struct list_head *rtmp;
4537        int spares = 0;
4538        int added_devices = 0;
4539        unsigned long flags;
4540
4541        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4542                return -EBUSY;
4543
4544        rdev_for_each(rdev, rtmp, mddev)
4545                if (rdev->raid_disk < 0 &&
4546                    !test_bit(Faulty, &rdev->flags))
4547                        spares++;
4548
4549        if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
4550                /* Not enough devices even to make a degraded array
4551                 * of that size
4552                 */
4553                return -EINVAL;
4554
4555        atomic_set(&conf->reshape_stripes, 0);
4556        spin_lock_irq(&conf->device_lock);
4557        conf->previous_raid_disks = conf->raid_disks;
4558        conf->raid_disks += mddev->delta_disks;
4559        conf->expand_progress = 0;
4560        conf->expand_lo = 0;
4561        spin_unlock_irq(&conf->device_lock);
4562
4563        /* Add some new drives, as many as will fit.
4564         * We know there are enough to make the newly sized array work.
4565         */
4566        rdev_for_each(rdev, rtmp, mddev)
4567                if (rdev->raid_disk < 0 &&
4568                    !test_bit(Faulty, &rdev->flags)) {
4569                        if (raid5_add_disk(mddev, rdev) == 0) {
4570                                char nm[20];
4571                                set_bit(In_sync, &rdev->flags);
4572                                added_devices++;
4573                                rdev->recovery_offset = 0;
4574                                sprintf(nm, "rd%d", rdev->raid_disk);
4575                                if (sysfs_create_link(&mddev->kobj,
4576                                                      &rdev->kobj, nm))
4577                                        printk(KERN_WARNING
4578                                               "raid5: failed to create "
4579                                               " link %s for %s\n",
4580                                               nm, mdname(mddev));
4581                        } else
4582                                break;
4583                }
4584
4585        spin_lock_irqsave(&conf->device_lock, flags);
4586        mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices;
4587        spin_unlock_irqrestore(&conf->device_lock, flags);
4588        mddev->raid_disks = conf->raid_disks;
4589        mddev->reshape_position = 0;
4590        set_bit(MD_CHANGE_DEVS, &mddev->flags);
4591
4592        clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4593        clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4594        set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4595        set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4596        mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4597                                                "%s_reshape");
4598        if (!mddev->sync_thread) {
4599                mddev->recovery = 0;
4600                spin_lock_irq(&conf->device_lock);
4601                mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
4602                conf->expand_progress = MaxSector;
4603                spin_unlock_irq(&conf->device_lock);
4604                return -EAGAIN;
4605        }
4606        md_wakeup_thread(mddev->sync_thread);
4607        md_new_event(mddev);
4608        return 0;
4609}
4610#endif
4611
4612static void end_reshape(raid5_conf_t *conf)
4613{
4614        struct block_device *bdev;
4615
4616        if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
4617                conf->mddev->array_sectors = 2 * conf->mddev->size *
4618                        (conf->raid_disks - conf->max_degraded);
4619                set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
4620                conf->mddev->changed = 1;
4621
4622                bdev = bdget_disk(conf->mddev->gendisk, 0);
4623                if (bdev) {
4624                        mutex_lock(&bdev->bd_inode->i_mutex);
4625                        i_size_write(bdev->bd_inode,
4626                                     (loff_t)conf->mddev->array_sectors << 9);
4627                        mutex_unlock(&bdev->bd_inode->i_mutex);
4628                        bdput(bdev);
4629                }
4630                spin_lock_irq(&conf->device_lock);
4631                conf->expand_progress = MaxSector;
4632                spin_unlock_irq(&conf->device_lock);
4633                conf->mddev->reshape_position = MaxSector;
4634
4635                /* read-ahead size must cover two whole stripes, which is
4636                 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
4637                 */
4638                {
4639                        int data_disks = conf->previous_raid_disks - conf->max_degraded;
4640                        int stripe = data_disks *
4641                                (conf->mddev->chunk_size / PAGE_SIZE);
4642                        if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4643                                conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4644                }
4645        }
4646}
4647
4648static void raid5_quiesce(mddev_t *mddev, int state)
4649{
4650        raid5_conf_t *conf = mddev_to_conf(mddev);
4651
4652        switch(state) {
4653        case 2: /* resume for a suspend */
4654                wake_up(&conf->wait_for_overlap);
4655                break;
4656
4657        case 1: /* stop all writes */
4658                spin_lock_irq(&conf->device_lock);
4659                conf->quiesce = 1;
4660                wait_event_lock_irq(conf->wait_for_stripe,
4661                                    atomic_read(&conf->active_stripes) == 0 &&
4662                                    atomic_read(&conf->active_aligned_reads) == 0,
4663                                    conf->device_lock, /* nothing */);
4664                spin_unlock_irq(&conf->device_lock);
4665                break;
4666
4667        case 0: /* re-enable writes */
4668                spin_lock_irq(&conf->device_lock);
4669                conf->quiesce = 0;
4670                wake_up(&conf->wait_for_stripe);
4671                wake_up(&conf->wait_for_overlap);
4672                spin_unlock_irq(&conf->device_lock);
4673                break;
4674        }
4675}
4676
4677static struct mdk_personality raid6_personality =
4678{
4679        .name           = "raid6",
4680        .level          = 6,
4681        .owner          = THIS_MODULE,
4682        .make_request   = make_request,
4683        .run            = run,
4684        .stop           = stop,
4685        .status         = status,
4686        .error_handler  = error,
4687        .hot_add_disk   = raid5_add_disk,
4688        .hot_remove_disk= raid5_remove_disk,
4689        .spare_active   = raid5_spare_active,
4690        .sync_request   = sync_request,
4691        .resize         = raid5_resize,
4692#ifdef CONFIG_MD_RAID5_RESHAPE
4693        .check_reshape  = raid5_check_reshape,
4694        .start_reshape  = raid5_start_reshape,
4695#endif
4696        .quiesce        = raid5_quiesce,
4697};
4698static struct mdk_personality raid5_personality =
4699{
4700        .name           = "raid5",
4701        .level          = 5,
4702        .owner          = THIS_MODULE,
4703        .make_request   = make_request,
4704        .run            = run,
4705        .stop           = stop,
4706        .status         = status,
4707        .error_handler  = error,
4708        .hot_add_disk   = raid5_add_disk,
4709        .hot_remove_disk= raid5_remove_disk,
4710        .spare_active   = raid5_spare_active,
4711        .sync_request   = sync_request,
4712        .resize         = raid5_resize,
4713#ifdef CONFIG_MD_RAID5_RESHAPE
4714        .check_reshape  = raid5_check_reshape,
4715        .start_reshape  = raid5_start_reshape,
4716#endif
4717        .quiesce        = raid5_quiesce,
4718};
4719
4720static struct mdk_personality raid4_personality =
4721{
4722        .name           = "raid4",
4723        .level          = 4,
4724        .owner          = THIS_MODULE,
4725        .make_request   = make_request,
4726        .run            = run,
4727        .stop           = stop,
4728        .status         = status,
4729        .error_handler  = error,
4730        .hot_add_disk   = raid5_add_disk,
4731        .hot_remove_disk= raid5_remove_disk,
4732        .spare_active   = raid5_spare_active,
4733        .sync_request   = sync_request,
4734        .resize         = raid5_resize,
4735#ifdef CONFIG_MD_RAID5_RESHAPE
4736        .check_reshape  = raid5_check_reshape,
4737        .start_reshape  = raid5_start_reshape,
4738#endif
4739        .quiesce        = raid5_quiesce,
4740};
4741
4742static int __init raid5_init(void)
4743{
4744        int e;
4745
4746        e = raid6_select_algo();
4747        if ( e )
4748                return e;
4749        register_md_personality(&raid6_personality);
4750        register_md_personality(&raid5_personality);
4751        register_md_personality(&raid4_personality);
4752        return 0;
4753}
4754
4755static void raid5_exit(void)
4756{
4757        unregister_md_personality(&raid6_personality);
4758        unregister_md_personality(&raid5_personality);
4759        unregister_md_personality(&raid4_personality);
4760}
4761
/* Module entry and exit points, and licence declaration. */
module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
/* Aliases covering the level 4 and level 5 personalities. */
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
MODULE_ALIAS("md-level-5");
MODULE_ALIAS("md-level-4");
/* Aliases covering the level 6 personality. */
MODULE_ALIAS("md-personality-8"); /* RAID6 */
MODULE_ALIAS("md-raid6");
MODULE_ALIAS("md-level-6");

/* This used to be two separate modules, they were: */
MODULE_ALIAS("raid5");
MODULE_ALIAS("raid6");
4777
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.