linux/drivers/md/raid5.c
   1/*
   2 * raid5.c : Multiple Devices driver for Linux
   3 *         Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
   4 *         Copyright (C) 1999, 2000 Ingo Molnar
   5 *         Copyright (C) 2002, 2003 H. Peter Anvin
   6 *
   7 * RAID-4/5/6 management functions.
   8 * Thanks to Penguin Computing for making the RAID-6 development possible
   9 * by donating a test server!
  10 *
  11 * This program is free software; you can redistribute it and/or modify
  12 * it under the terms of the GNU General Public License as published by
  13 * the Free Software Foundation; either version 2, or (at your option)
  14 * any later version.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * (for example /usr/src/linux/COPYING); if not, write to the Free
  18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19 */
  20
  21/*
  22 * BITMAP UNPLUGGING:
  23 *
  24 * The sequencing for updating the bitmap reliably is a little
  25 * subtle (and I got it wrong the first time) so it deserves some
  26 * explanation.
  27 *
  28 * We group bitmap updates into batches.  Each batch has a number.
  29 * We may write out several batches at once, but that isn't very important.
  30 * conf->seq_write is the number of the last batch successfully written.
  31 * conf->seq_flush is the number of the last batch that was closed to
  32 *    new additions.
  33 * When we discover that we will need to write to any block in a stripe
  34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
  35 * the number of the batch it will be in. This is seq_flush+1.
  36 * When we are ready to do a write, if that batch hasn't been written yet,
  37 *   we plug the array and queue the stripe for later.
   38 * When an unplug happens, we increment seq_flush, thus closing the current
   39 *   batch.
   40 * When we notice that seq_flush > seq_write, we write out all pending updates
   41 * to the bitmap, and advance seq_write to where seq_flush was.
  42 * This may occasionally write a bit out twice, but is sure never to
  43 * miss any bits.
  44 */
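     /*
      * Illustrative walk-through (hypothetical numbers): suppose seq_write == 5
      * and seq_flush == 5.  A new write to a stripe records sh->bm_seq = 6
      * (seq_flush + 1) after updating the in-memory bitmap, and the write is
      * held back because batch 6 has not been written yet.  An unplug bumps
      * seq_flush to 6, closing the batch; the pending bitmap updates are then
      * written out, seq_write advances to 6, and the queued stripe may proceed.
      */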
  45
  46#include <linux/blkdev.h>
  47#include <linux/kthread.h>
  48#include <linux/raid/pq.h>
  49#include <linux/async_tx.h>
  50#include <linux/module.h>
  51#include <linux/async.h>
  52#include <linux/seq_file.h>
  53#include <linux/cpu.h>
  54#include <linux/slab.h>
  55#include <linux/ratelimit.h>
  56#include "md.h"
  57#include "raid5.h"
  58#include "raid0.h"
  59#include "bitmap.h"
  60
  61/*
  62 * Stripe cache
  63 */
  64
  65#define NR_STRIPES              256
  66#define STRIPE_SIZE             PAGE_SIZE
  67#define STRIPE_SHIFT            (PAGE_SHIFT - 9)
  68#define STRIPE_SECTORS          (STRIPE_SIZE>>9)
  69#define IO_THRESHOLD            1
  70#define BYPASS_THRESHOLD        1
  71#define NR_HASH                 (PAGE_SIZE / sizeof(struct hlist_head))
  72#define HASH_MASK               (NR_HASH - 1)
  73
  74static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
  75{
  76        int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
  77        return &conf->stripe_hashtbl[hash];
  78}
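     /*
      * For illustration, assuming 4 KiB pages and 8-byte list heads (typical
      * on x86-64): STRIPE_SIZE = 4096, STRIPE_SHIFT = 3, STRIPE_SECTORS = 8,
      * NR_HASH = 512 and HASH_MASK = 511, so stripe_hash() indexes the table
      * with bits 3..11 of the stripe's starting sector.
      */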
  79
   80/* bios attached to a stripe+device for I/O are linked together in bi_sector
   81 * order without overlap.  There may be several bios per stripe+device, and
   82 * a bio could span several devices.
   83 * When walking this list for a particular stripe+device, we must never proceed
   84 * beyond a bio that extends past this device, as the next bio might no longer
   85 * be valid.
   86 * This function is used to determine the 'next' bio in the list, given the
   87 * sector of the current stripe+device.
   88 */
  89static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
  90{
  91        int sectors = bio->bi_size >> 9;
  92        if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
  93                return bio->bi_next;
  94        else
  95                return NULL;
  96}
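     /*
      * Example, assuming 4 KiB pages (STRIPE_SECTORS == 8): for the chunk of a
      * stripe+device starting at sector 16, a bio covering sectors 16-19 ends
      * inside the chunk, so r5_next_bio() returns bio->bi_next; a bio covering
      * sectors 16-31 extends past the chunk, so NULL is returned and the walk
      * stops there.
      */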
  97
  98/*
  99 * We maintain a biased count of active stripes in the bottom 16 bits of
 100 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 101 */
 102static inline int raid5_bi_processed_stripes(struct bio *bio)
 103{
 104        atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
 105        return (atomic_read(segments) >> 16) & 0xffff;
 106}
 107
 108static inline int raid5_dec_bi_active_stripes(struct bio *bio)
 109{
 110        atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
 111        return atomic_sub_return(1, segments) & 0xffff;
 112}
 113
 114static inline void raid5_inc_bi_active_stripes(struct bio *bio)
 115{
 116        atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
 117        atomic_inc(segments);
 118}
 119
 120static inline void raid5_set_bi_processed_stripes(struct bio *bio,
 121        unsigned int cnt)
 122{
 123        atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
 124        int old, new;
 125
 126        do {
 127                old = atomic_read(segments);
 128                new = (old & 0xffff) | (cnt << 16);
 129        } while (atomic_cmpxchg(segments, old, new) != old);
 130}
 131
 132static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
 133{
 134        atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
 135        atomic_set(segments, cnt);
 136}
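     /*
      * For illustration with hypothetical values: raid5_set_bi_stripes(bio, 3)
      * stores 3 in the low 16 bits; raid5_set_bi_processed_stripes(bio, 2)
      * stores 2 in the high 16 bits without touching the active count; a
      * subsequent raid5_dec_bi_active_stripes(bio) returns 2, so the bio is
      * still referenced and must not be completed yet.
      */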
 137
 138/* Find first data disk in a raid6 stripe */
 139static inline int raid6_d0(struct stripe_head *sh)
 140{
 141        if (sh->ddf_layout)
  142                /* ddf always starts from the first device */
 143                return 0;
 144        /* md starts just after Q block */
 145        if (sh->qd_idx == sh->disks - 1)
 146                return 0;
 147        else
 148                return sh->qd_idx + 1;
 149}
 150static inline int raid6_next_disk(int disk, int raid_disks)
 151{
 152        disk++;
 153        return (disk < raid_disks) ? disk : 0;
 154}
 155
  156/* When walking through the disks in a raid6 stripe, starting at raid6_d0,
  157 * we need to map each disk to a 'slot', where the data disks are slots
  158 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
  159 * is raid_disks-1.  This helper does that mapping.
  160 */
 161static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
 162                             int *count, int syndrome_disks)
 163{
 164        int slot = *count;
 165
 166        if (sh->ddf_layout)
 167                (*count)++;
 168        if (idx == sh->pd_idx)
 169                return syndrome_disks;
 170        if (idx == sh->qd_idx)
 171                return syndrome_disks + 1;
 172        if (!sh->ddf_layout)
 173                (*count)++;
 174        return slot;
 175}
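     /*
      * Worked example (md layout, non-DDF, hypothetical geometry): with
      * disks = 6, pd_idx = 2 and qd_idx = 3, syndrome_disks = 4 and
      * raid6_d0() returns 4, the device just after Q.  Walking 4, 5, 0, 1, 2, 3
      * with raid6_next_disk(), raid6_idx_to_slot() maps the data devices to
      * slots 0, 1, 2, 3, the P device to slot 4 and the Q device to slot 5.
      */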
 176
 177static void return_io(struct bio *return_bi)
 178{
 179        struct bio *bi = return_bi;
 180        while (bi) {
 181
 182                return_bi = bi->bi_next;
 183                bi->bi_next = NULL;
 184                bi->bi_size = 0;
 185                bio_endio(bi, 0);
 186                bi = return_bi;
 187        }
 188}
 189
 190static void print_raid5_conf (struct r5conf *conf);
 191
 192static int stripe_operations_active(struct stripe_head *sh)
 193{
 194        return sh->check_state || sh->reconstruct_state ||
 195               test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
 196               test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 197}
 198
 199static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
 200{
 201        BUG_ON(!list_empty(&sh->lru));
 202        BUG_ON(atomic_read(&conf->active_stripes)==0);
 203        if (test_bit(STRIPE_HANDLE, &sh->state)) {
 204                if (test_bit(STRIPE_DELAYED, &sh->state) &&
 205                    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 206                        list_add_tail(&sh->lru, &conf->delayed_list);
 207                else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
 208                           sh->bm_seq - conf->seq_write > 0)
 209                        list_add_tail(&sh->lru, &conf->bitmap_list);
 210                else {
 211                        clear_bit(STRIPE_DELAYED, &sh->state);
 212                        clear_bit(STRIPE_BIT_DELAY, &sh->state);
 213                        list_add_tail(&sh->lru, &conf->handle_list);
 214                }
 215                md_wakeup_thread(conf->mddev->thread);
 216        } else {
 217                BUG_ON(stripe_operations_active(sh));
 218                if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 219                        if (atomic_dec_return(&conf->preread_active_stripes)
 220                            < IO_THRESHOLD)
 221                                md_wakeup_thread(conf->mddev->thread);
 222                atomic_dec(&conf->active_stripes);
 223                if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
 224                        list_add_tail(&sh->lru, &conf->inactive_list);
 225                        wake_up(&conf->wait_for_stripe);
 226                        if (conf->retry_read_aligned)
 227                                md_wakeup_thread(conf->mddev->thread);
 228                }
 229        }
 230}
 231
 232static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
 233{
 234        if (atomic_dec_and_test(&sh->count))
 235                do_release_stripe(conf, sh);
 236}
 237
 238static void release_stripe(struct stripe_head *sh)
 239{
 240        struct r5conf *conf = sh->raid_conf;
 241        unsigned long flags;
 242
 243        local_irq_save(flags);
 244        if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
 245                do_release_stripe(conf, sh);
 246                spin_unlock(&conf->device_lock);
 247        }
 248        local_irq_restore(flags);
 249}
 250
 251static inline void remove_hash(struct stripe_head *sh)
 252{
 253        pr_debug("remove_hash(), stripe %llu\n",
 254                (unsigned long long)sh->sector);
 255
 256        hlist_del_init(&sh->hash);
 257}
 258
 259static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
 260{
 261        struct hlist_head *hp = stripe_hash(conf, sh->sector);
 262
 263        pr_debug("insert_hash(), stripe %llu\n",
 264                (unsigned long long)sh->sector);
 265
 266        hlist_add_head(&sh->hash, hp);
 267}
 268
 269
 270/* find an idle stripe, make sure it is unhashed, and return it. */
 271static struct stripe_head *get_free_stripe(struct r5conf *conf)
 272{
 273        struct stripe_head *sh = NULL;
 274        struct list_head *first;
 275
 276        if (list_empty(&conf->inactive_list))
 277                goto out;
 278        first = conf->inactive_list.next;
 279        sh = list_entry(first, struct stripe_head, lru);
 280        list_del_init(first);
 281        remove_hash(sh);
 282        atomic_inc(&conf->active_stripes);
 283out:
 284        return sh;
 285}
 286
 287static void shrink_buffers(struct stripe_head *sh)
 288{
 289        struct page *p;
 290        int i;
 291        int num = sh->raid_conf->pool_size;
 292
 293        for (i = 0; i < num ; i++) {
 294                p = sh->dev[i].page;
 295                if (!p)
 296                        continue;
 297                sh->dev[i].page = NULL;
 298                put_page(p);
 299        }
 300}
 301
 302static int grow_buffers(struct stripe_head *sh)
 303{
 304        int i;
 305        int num = sh->raid_conf->pool_size;
 306
 307        for (i = 0; i < num; i++) {
 308                struct page *page;
 309
 310                if (!(page = alloc_page(GFP_KERNEL))) {
 311                        return 1;
 312                }
 313                sh->dev[i].page = page;
 314        }
 315        return 0;
 316}
 317
 318static void raid5_build_block(struct stripe_head *sh, int i, int previous);
 319static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 320                            struct stripe_head *sh);
 321
 322static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 323{
 324        struct r5conf *conf = sh->raid_conf;
 325        int i;
 326
 327        BUG_ON(atomic_read(&sh->count) != 0);
 328        BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
 329        BUG_ON(stripe_operations_active(sh));
 330
 331        pr_debug("init_stripe called, stripe %llu\n",
 332                (unsigned long long)sh->sector);
 333
 334        remove_hash(sh);
 335
 336        sh->generation = conf->generation - previous;
 337        sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
 338        sh->sector = sector;
 339        stripe_set_idx(sector, conf, previous, sh);
 340        sh->state = 0;
 341
 342
 343        for (i = sh->disks; i--; ) {
 344                struct r5dev *dev = &sh->dev[i];
 345
 346                if (dev->toread || dev->read || dev->towrite || dev->written ||
 347                    test_bit(R5_LOCKED, &dev->flags)) {
 348                        printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
 349                               (unsigned long long)sh->sector, i, dev->toread,
 350                               dev->read, dev->towrite, dev->written,
 351                               test_bit(R5_LOCKED, &dev->flags));
 352                        WARN_ON(1);
 353                }
 354                dev->flags = 0;
 355                raid5_build_block(sh, i, previous);
 356        }
 357        insert_hash(conf, sh);
 358}
 359
 360static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
 361                                         short generation)
 362{
 363        struct stripe_head *sh;
 364        struct hlist_node *hn;
 365
 366        pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
 367        hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
 368                if (sh->sector == sector && sh->generation == generation)
 369                        return sh;
 370        pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
 371        return NULL;
 372}
 373
 374/*
 375 * Need to check if array has failed when deciding whether to:
 376 *  - start an array
 377 *  - remove non-faulty devices
 378 *  - add a spare
 379 *  - allow a reshape
 380 * This determination is simple when no reshape is happening.
 381 * However if there is a reshape, we need to carefully check
 382 * both the before and after sections.
 383 * This is because some failed devices may only affect one
 384 * of the two sections, and some non-in_sync devices may
  385 * be in_sync in the section most affected by failed devices.
 386 */
 387static int calc_degraded(struct r5conf *conf)
 388{
 389        int degraded, degraded2;
 390        int i;
 391
 392        rcu_read_lock();
 393        degraded = 0;
 394        for (i = 0; i < conf->previous_raid_disks; i++) {
 395                struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
 396                if (rdev && test_bit(Faulty, &rdev->flags))
 397                        rdev = rcu_dereference(conf->disks[i].replacement);
 398                if (!rdev || test_bit(Faulty, &rdev->flags))
 399                        degraded++;
 400                else if (test_bit(In_sync, &rdev->flags))
 401                        ;
 402                else
 403                        /* not in-sync or faulty.
 404                         * If the reshape increases the number of devices,
 405                         * this is being recovered by the reshape, so
 406                         * this 'previous' section is not in_sync.
 407                         * If the number of devices is being reduced however,
 408                         * the device can only be part of the array if
 409                         * we are reverting a reshape, so this section will
 410                         * be in-sync.
 411                         */
 412                        if (conf->raid_disks >= conf->previous_raid_disks)
 413                                degraded++;
 414        }
 415        rcu_read_unlock();
 416        if (conf->raid_disks == conf->previous_raid_disks)
 417                return degraded;
 418        rcu_read_lock();
 419        degraded2 = 0;
 420        for (i = 0; i < conf->raid_disks; i++) {
 421                struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
 422                if (rdev && test_bit(Faulty, &rdev->flags))
 423                        rdev = rcu_dereference(conf->disks[i].replacement);
 424                if (!rdev || test_bit(Faulty, &rdev->flags))
 425                        degraded2++;
 426                else if (test_bit(In_sync, &rdev->flags))
 427                        ;
 428                else
 429                        /* not in-sync or faulty.
 430                         * If reshape increases the number of devices, this
 431                         * section has already been recovered, else it
 432                         * almost certainly hasn't.
 433                         */
 434                        if (conf->raid_disks <= conf->previous_raid_disks)
 435                                degraded2++;
 436        }
 437        rcu_read_unlock();
 438        if (degraded2 > degraded)
 439                return degraded2;
 440        return degraded;
 441}
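     /*
      * Illustration (hypothetical): while growing a 4-device RAID-5 to 5
      * devices, a member that is still being rebuilt (neither Faulty nor
      * In_sync) counts as degraded in the 'previous' 4-device geometry, since
      * raid_disks >= previous_raid_disks, but not in the new geometry;
      * calc_degraded() returns the larger of the two counts.
      */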
 442
 443static int has_failed(struct r5conf *conf)
 444{
 445        int degraded;
 446
 447        if (conf->mddev->reshape_position == MaxSector)
 448                return conf->mddev->degraded > conf->max_degraded;
 449
 450        degraded = calc_degraded(conf);
 451        if (degraded > conf->max_degraded)
 452                return 1;
 453        return 0;
 454}
 455
 456static struct stripe_head *
 457get_active_stripe(struct r5conf *conf, sector_t sector,
 458                  int previous, int noblock, int noquiesce)
 459{
 460        struct stripe_head *sh;
 461
 462        pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
 463
 464        spin_lock_irq(&conf->device_lock);
 465
 466        do {
 467                wait_event_lock_irq(conf->wait_for_stripe,
 468                                    conf->quiesce == 0 || noquiesce,
 469                                    conf->device_lock, /* nothing */);
 470                sh = __find_stripe(conf, sector, conf->generation - previous);
 471                if (!sh) {
 472                        if (!conf->inactive_blocked)
 473                                sh = get_free_stripe(conf);
 474                        if (noblock && sh == NULL)
 475                                break;
 476                        if (!sh) {
 477                                conf->inactive_blocked = 1;
 478                                wait_event_lock_irq(conf->wait_for_stripe,
 479                                                    !list_empty(&conf->inactive_list) &&
 480                                                    (atomic_read(&conf->active_stripes)
 481                                                     < (conf->max_nr_stripes *3/4)
 482                                                     || !conf->inactive_blocked),
 483                                                    conf->device_lock,
 484                                                    );
 485                                conf->inactive_blocked = 0;
 486                        } else
 487                                init_stripe(sh, sector, previous);
 488                } else {
 489                        if (atomic_read(&sh->count)) {
 490                                BUG_ON(!list_empty(&sh->lru)
 491                                    && !test_bit(STRIPE_EXPANDING, &sh->state)
 492                                    && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state));
 493                        } else {
 494                                if (!test_bit(STRIPE_HANDLE, &sh->state))
 495                                        atomic_inc(&conf->active_stripes);
 496                                if (list_empty(&sh->lru) &&
 497                                    !test_bit(STRIPE_EXPANDING, &sh->state))
 498                                        BUG();
 499                                list_del_init(&sh->lru);
 500                        }
 501                }
 502        } while (sh == NULL);
 503
 504        if (sh)
 505                atomic_inc(&sh->count);
 506
 507        spin_unlock_irq(&conf->device_lock);
 508        return sh;
 509}
 510
 511/* Determine if 'data_offset' or 'new_data_offset' should be used
 512 * in this stripe_head.
 513 */
 514static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
 515{
 516        sector_t progress = conf->reshape_progress;
 517        /* Need a memory barrier to make sure we see the value
 518         * of conf->generation, or ->data_offset that was set before
 519         * reshape_progress was updated.
 520         */
 521        smp_rmb();
 522        if (progress == MaxSector)
 523                return 0;
 524        if (sh->generation == conf->generation - 1)
 525                return 0;
 526        /* We are in a reshape, and this is a new-generation stripe,
 527         * so use new_data_offset.
 528         */
 529        return 1;
 530}
 531
 532static void
 533raid5_end_read_request(struct bio *bi, int error);
 534static void
 535raid5_end_write_request(struct bio *bi, int error);
 536
 537static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 538{
 539        struct r5conf *conf = sh->raid_conf;
 540        int i, disks = sh->disks;
 541
 542        might_sleep();
 543
 544        for (i = disks; i--; ) {
 545                int rw;
 546                int replace_only = 0;
 547                struct bio *bi, *rbi;
 548                struct md_rdev *rdev, *rrdev = NULL;
 549                if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
 550                        if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
 551                                rw = WRITE_FUA;
 552                        else
 553                                rw = WRITE;
 554                        if (test_bit(R5_Discard, &sh->dev[i].flags))
 555                                rw |= REQ_DISCARD;
 556                } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
 557                        rw = READ;
 558                else if (test_and_clear_bit(R5_WantReplace,
 559                                            &sh->dev[i].flags)) {
 560                        rw = WRITE;
 561                        replace_only = 1;
 562                } else
 563                        continue;
 564                if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
 565                        rw |= REQ_SYNC;
 566
 567                bi = &sh->dev[i].req;
 568                rbi = &sh->dev[i].rreq; /* For writing to replacement */
 569
 570                bi->bi_rw = rw;
 571                rbi->bi_rw = rw;
 572                if (rw & WRITE) {
 573                        bi->bi_end_io = raid5_end_write_request;
 574                        rbi->bi_end_io = raid5_end_write_request;
 575                } else
 576                        bi->bi_end_io = raid5_end_read_request;
 577
 578                rcu_read_lock();
 579                rrdev = rcu_dereference(conf->disks[i].replacement);
 580                smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
 581                rdev = rcu_dereference(conf->disks[i].rdev);
 582                if (!rdev) {
 583                        rdev = rrdev;
 584                        rrdev = NULL;
 585                }
 586                if (rw & WRITE) {
 587                        if (replace_only)
 588                                rdev = NULL;
 589                        if (rdev == rrdev)
 590                                /* We raced and saw duplicates */
 591                                rrdev = NULL;
 592                } else {
 593                        if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
 594                                rdev = rrdev;
 595                        rrdev = NULL;
 596                }
 597
 598                if (rdev && test_bit(Faulty, &rdev->flags))
 599                        rdev = NULL;
 600                if (rdev)
 601                        atomic_inc(&rdev->nr_pending);
 602                if (rrdev && test_bit(Faulty, &rrdev->flags))
 603                        rrdev = NULL;
 604                if (rrdev)
 605                        atomic_inc(&rrdev->nr_pending);
 606                rcu_read_unlock();
 607
  608                /* We have already checked bad blocks for reads.  Now we
  609                 * need to check for writes.  We never accept write errors
  610                 * on the replacement, so we don't need to check rrdev.
  611                 */
 612                while ((rw & WRITE) && rdev &&
 613                       test_bit(WriteErrorSeen, &rdev->flags)) {
 614                        sector_t first_bad;
 615                        int bad_sectors;
 616                        int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
 617                                              &first_bad, &bad_sectors);
 618                        if (!bad)
 619                                break;
 620
 621                        if (bad < 0) {
 622                                set_bit(BlockedBadBlocks, &rdev->flags);
 623                                if (!conf->mddev->external &&
 624                                    conf->mddev->flags) {
 625                                        /* It is very unlikely, but we might
 626                                         * still need to write out the
 627                                         * bad block log - better give it
  628                                         * a chance. */
 629                                        md_check_recovery(conf->mddev);
 630                                }
 631                                /*
 632                                 * Because md_wait_for_blocked_rdev
 633                                 * will dec nr_pending, we must
 634                                 * increment it first.
 635                                 */
 636                                atomic_inc(&rdev->nr_pending);
 637                                md_wait_for_blocked_rdev(rdev, conf->mddev);
 638                        } else {
 639                                /* Acknowledged bad block - skip the write */
 640                                rdev_dec_pending(rdev, conf->mddev);
 641                                rdev = NULL;
 642                        }
 643                }
 644
 645                if (rdev) {
 646                        if (s->syncing || s->expanding || s->expanded
 647                            || s->replacing)
 648                                md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 649
 650                        set_bit(STRIPE_IO_STARTED, &sh->state);
 651
 652                        bi->bi_bdev = rdev->bdev;
 653                        pr_debug("%s: for %llu schedule op %ld on disc %d\n",
 654                                __func__, (unsigned long long)sh->sector,
 655                                bi->bi_rw, i);
 656                        atomic_inc(&sh->count);
 657                        if (use_new_offset(conf, sh))
 658                                bi->bi_sector = (sh->sector
 659                                                 + rdev->new_data_offset);
 660                        else
 661                                bi->bi_sector = (sh->sector
 662                                                 + rdev->data_offset);
 663                        if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
 664                                bi->bi_rw |= REQ_FLUSH;
 665
 666                        bi->bi_flags = 1 << BIO_UPTODATE;
 667                        bi->bi_idx = 0;
 668                        bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 669                        bi->bi_io_vec[0].bv_offset = 0;
 670                        bi->bi_size = STRIPE_SIZE;
 671                        bi->bi_next = NULL;
 672                        if (rrdev)
 673                                set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
 674                        generic_make_request(bi);
 675                }
 676                if (rrdev) {
 677                        if (s->syncing || s->expanding || s->expanded
 678                            || s->replacing)
 679                                md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
 680
 681                        set_bit(STRIPE_IO_STARTED, &sh->state);
 682
 683                        rbi->bi_bdev = rrdev->bdev;
 684                        pr_debug("%s: for %llu schedule op %ld on "
 685                                 "replacement disc %d\n",
 686                                __func__, (unsigned long long)sh->sector,
 687                                rbi->bi_rw, i);
 688                        atomic_inc(&sh->count);
 689                        if (use_new_offset(conf, sh))
 690                                rbi->bi_sector = (sh->sector
 691                                                  + rrdev->new_data_offset);
 692                        else
 693                                rbi->bi_sector = (sh->sector
 694                                                  + rrdev->data_offset);
 695                        rbi->bi_flags = 1 << BIO_UPTODATE;
 696                        rbi->bi_idx = 0;
 697                        rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 698                        rbi->bi_io_vec[0].bv_offset = 0;
 699                        rbi->bi_size = STRIPE_SIZE;
 700                        rbi->bi_next = NULL;
 701                        generic_make_request(rbi);
 702                }
 703                if (!rdev && !rrdev) {
 704                        if (rw & WRITE)
 705                                set_bit(STRIPE_DEGRADED, &sh->state);
 706                        pr_debug("skip op %ld on disc %d for sector %llu\n",
 707                                bi->bi_rw, i, (unsigned long long)sh->sector);
 708                        clear_bit(R5_LOCKED, &sh->dev[i].flags);
 709                        set_bit(STRIPE_HANDLE, &sh->state);
 710                }
 711        }
 712}
 713
 714static struct dma_async_tx_descriptor *
 715async_copy_data(int frombio, struct bio *bio, struct page *page,
 716        sector_t sector, struct dma_async_tx_descriptor *tx)
 717{
 718        struct bio_vec *bvl;
 719        struct page *bio_page;
 720        int i;
 721        int page_offset;
 722        struct async_submit_ctl submit;
 723        enum async_tx_flags flags = 0;
 724
 725        if (bio->bi_sector >= sector)
 726                page_offset = (signed)(bio->bi_sector - sector) * 512;
 727        else
 728                page_offset = (signed)(sector - bio->bi_sector) * -512;
 729
 730        if (frombio)
 731                flags |= ASYNC_TX_FENCE;
 732        init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
 733
 734        bio_for_each_segment(bvl, bio, i) {
 735                int len = bvl->bv_len;
 736                int clen;
 737                int b_offset = 0;
 738
 739                if (page_offset < 0) {
 740                        b_offset = -page_offset;
 741                        page_offset += b_offset;
 742                        len -= b_offset;
 743                }
 744
 745                if (len > 0 && page_offset + len > STRIPE_SIZE)
 746                        clen = STRIPE_SIZE - page_offset;
 747                else
 748                        clen = len;
 749
 750                if (clen > 0) {
 751                        b_offset += bvl->bv_offset;
 752                        bio_page = bvl->bv_page;
 753                        if (frombio)
 754                                tx = async_memcpy(page, bio_page, page_offset,
 755                                                  b_offset, clen, &submit);
 756                        else
 757                                tx = async_memcpy(bio_page, page, b_offset,
 758                                                  page_offset, clen, &submit);
 759                }
 760                /* chain the operations */
 761                submit.depend_tx = tx;
 762
 763                if (clen < len) /* hit end of page */
 764                        break;
 765                page_offset +=  len;
 766        }
 767
 768        return tx;
 769}
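     /*
      * Example of the offset arithmetic (hypothetical bio): if bio->bi_sector
      * is two sectors below 'sector', page_offset starts at -1024.  For a
      * first 4 KiB bio_vec, b_offset becomes 1024, page_offset advances to 0
      * and len drops to 3072, so 3072 bytes are copied between offset
      * bv_offset + 1024 of the bio page and offset 0 of the stripe page.
      */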
 770
 771static void ops_complete_biofill(void *stripe_head_ref)
 772{
 773        struct stripe_head *sh = stripe_head_ref;
 774        struct bio *return_bi = NULL;
 775        int i;
 776
 777        pr_debug("%s: stripe %llu\n", __func__,
 778                (unsigned long long)sh->sector);
 779
 780        /* clear completed biofills */
 781        for (i = sh->disks; i--; ) {
 782                struct r5dev *dev = &sh->dev[i];
 783
  784                /* acknowledge completion of a biofill operation and
  785                 * check if we need to reply to a read request;
  786                 * new R5_Wantfill requests are held off until
  787                 * !STRIPE_BIOFILL_RUN
  788                 */
 789                if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
 790                        struct bio *rbi, *rbi2;
 791
 792                        BUG_ON(!dev->read);
 793                        rbi = dev->read;
 794                        dev->read = NULL;
 795                        while (rbi && rbi->bi_sector <
 796                                dev->sector + STRIPE_SECTORS) {
 797                                rbi2 = r5_next_bio(rbi, dev->sector);
 798                                if (!raid5_dec_bi_active_stripes(rbi)) {
 799                                        rbi->bi_next = return_bi;
 800                                        return_bi = rbi;
 801                                }
 802                                rbi = rbi2;
 803                        }
 804                }
 805        }
 806        clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
 807
 808        return_io(return_bi);
 809
 810        set_bit(STRIPE_HANDLE, &sh->state);
 811        release_stripe(sh);
 812}
 813
 814static void ops_run_biofill(struct stripe_head *sh)
 815{
 816        struct dma_async_tx_descriptor *tx = NULL;
 817        struct async_submit_ctl submit;
 818        int i;
 819
 820        pr_debug("%s: stripe %llu\n", __func__,
 821                (unsigned long long)sh->sector);
 822
 823        for (i = sh->disks; i--; ) {
 824                struct r5dev *dev = &sh->dev[i];
 825                if (test_bit(R5_Wantfill, &dev->flags)) {
 826                        struct bio *rbi;
 827                        spin_lock_irq(&sh->stripe_lock);
 828                        dev->read = rbi = dev->toread;
 829                        dev->toread = NULL;
 830                        spin_unlock_irq(&sh->stripe_lock);
 831                        while (rbi && rbi->bi_sector <
 832                                dev->sector + STRIPE_SECTORS) {
 833                                tx = async_copy_data(0, rbi, dev->page,
 834                                        dev->sector, tx);
 835                                rbi = r5_next_bio(rbi, dev->sector);
 836                        }
 837                }
 838        }
 839
 840        atomic_inc(&sh->count);
 841        init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
 842        async_trigger_callback(&submit);
 843}
 844
 845static void mark_target_uptodate(struct stripe_head *sh, int target)
 846{
 847        struct r5dev *tgt;
 848
 849        if (target < 0)
 850                return;
 851
 852        tgt = &sh->dev[target];
 853        set_bit(R5_UPTODATE, &tgt->flags);
 854        BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 855        clear_bit(R5_Wantcompute, &tgt->flags);
 856}
 857
 858static void ops_complete_compute(void *stripe_head_ref)
 859{
 860        struct stripe_head *sh = stripe_head_ref;
 861
 862        pr_debug("%s: stripe %llu\n", __func__,
 863                (unsigned long long)sh->sector);
 864
 865        /* mark the computed target(s) as uptodate */
 866        mark_target_uptodate(sh, sh->ops.target);
 867        mark_target_uptodate(sh, sh->ops.target2);
 868
 869        clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
 870        if (sh->check_state == check_state_compute_run)
 871                sh->check_state = check_state_compute_result;
 872        set_bit(STRIPE_HANDLE, &sh->state);
 873        release_stripe(sh);
 874}
 875
 876/* return a pointer to the address conversion region of the scribble buffer */
 877static addr_conv_t *to_addr_conv(struct stripe_head *sh,
 878                                 struct raid5_percpu *percpu)
 879{
 880        return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
 881}
 882
 883static struct dma_async_tx_descriptor *
 884ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 885{
 886        int disks = sh->disks;
 887        struct page **xor_srcs = percpu->scribble;
 888        int target = sh->ops.target;
 889        struct r5dev *tgt = &sh->dev[target];
 890        struct page *xor_dest = tgt->page;
 891        int count = 0;
 892        struct dma_async_tx_descriptor *tx;
 893        struct async_submit_ctl submit;
 894        int i;
 895
 896        pr_debug("%s: stripe %llu block: %d\n",
 897                __func__, (unsigned long long)sh->sector, target);
 898        BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 899
 900        for (i = disks; i--; )
 901                if (i != target)
 902                        xor_srcs[count++] = sh->dev[i].page;
 903
 904        atomic_inc(&sh->count);
 905
 906        init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
 907                          ops_complete_compute, sh, to_addr_conv(sh, percpu));
 908        if (unlikely(count == 1))
 909                tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
 910        else
 911                tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
 912
 913        return tx;
 914}
 915
 916/* set_syndrome_sources - populate source buffers for gen_syndrome
 917 * @srcs - (struct page *) array of size sh->disks
 918 * @sh - stripe_head to parse
 919 *
 920 * Populates srcs in proper layout order for the stripe and returns the
 921 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
 922 * destination buffer is recorded in srcs[count] and the Q destination
  923 * is recorded in srcs[count+1].
 924 */
 925static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
 926{
 927        int disks = sh->disks;
 928        int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
 929        int d0_idx = raid6_d0(sh);
 930        int count;
 931        int i;
 932
 933        for (i = 0; i < disks; i++)
 934                srcs[i] = NULL;
 935
 936        count = 0;
 937        i = d0_idx;
 938        do {
 939                int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
 940
 941                srcs[slot] = sh->dev[i].page;
 942                i = raid6_next_disk(i, disks);
 943        } while (i != d0_idx);
 944
 945        return syndrome_disks;
 946}
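     /*
      * Continuing the 6-device md-layout example above: on return srcs[0..3]
      * point at the data pages in d0 order, srcs[4] at the P page and srcs[5]
      * at the Q page, and the function returns syndrome_disks == 4, so callers
      * pass count+2 == 6 blocks to async_gen_syndrome().
      */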
 947
 948static struct dma_async_tx_descriptor *
 949ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 950{
 951        int disks = sh->disks;
 952        struct page **blocks = percpu->scribble;
 953        int target;
 954        int qd_idx = sh->qd_idx;
 955        struct dma_async_tx_descriptor *tx;
 956        struct async_submit_ctl submit;
 957        struct r5dev *tgt;
 958        struct page *dest;
 959        int i;
 960        int count;
 961
 962        if (sh->ops.target < 0)
 963                target = sh->ops.target2;
 964        else if (sh->ops.target2 < 0)
 965                target = sh->ops.target;
 966        else
 967                /* we should only have one valid target */
 968                BUG();
 969        BUG_ON(target < 0);
 970        pr_debug("%s: stripe %llu block: %d\n",
 971                __func__, (unsigned long long)sh->sector, target);
 972
 973        tgt = &sh->dev[target];
 974        BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 975        dest = tgt->page;
 976
 977        atomic_inc(&sh->count);
 978
 979        if (target == qd_idx) {
 980                count = set_syndrome_sources(blocks, sh);
 981                blocks[count] = NULL; /* regenerating p is not necessary */
 982                BUG_ON(blocks[count+1] != dest); /* q should already be set */
 983                init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
 984                                  ops_complete_compute, sh,
 985                                  to_addr_conv(sh, percpu));
 986                tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
 987        } else {
 988                /* Compute any data- or p-drive using XOR */
 989                count = 0;
 990                for (i = disks; i-- ; ) {
 991                        if (i == target || i == qd_idx)
 992                                continue;
 993                        blocks[count++] = sh->dev[i].page;
 994                }
 995
 996                init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
 997                                  NULL, ops_complete_compute, sh,
 998                                  to_addr_conv(sh, percpu));
 999                tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
1000        }
1001
1002        return tx;
1003}
1004
1005static struct dma_async_tx_descriptor *
1006ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1007{
1008        int i, count, disks = sh->disks;
1009        int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1010        int d0_idx = raid6_d0(sh);
1011        int faila = -1, failb = -1;
1012        int target = sh->ops.target;
1013        int target2 = sh->ops.target2;
1014        struct r5dev *tgt = &sh->dev[target];
1015        struct r5dev *tgt2 = &sh->dev[target2];
1016        struct dma_async_tx_descriptor *tx;
1017        struct page **blocks = percpu->scribble;
1018        struct async_submit_ctl submit;
1019
1020        pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1021                 __func__, (unsigned long long)sh->sector, target, target2);
1022        BUG_ON(target < 0 || target2 < 0);
1023        BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1024        BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1025
1026        /* we need to open-code set_syndrome_sources to handle the
1027         * slot number conversion for 'faila' and 'failb'
1028         */
1029        for (i = 0; i < disks ; i++)
1030                blocks[i] = NULL;
1031        count = 0;
1032        i = d0_idx;
1033        do {
1034                int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1035
1036                blocks[slot] = sh->dev[i].page;
1037
1038                if (i == target)
1039                        faila = slot;
1040                if (i == target2)
1041                        failb = slot;
1042                i = raid6_next_disk(i, disks);
1043        } while (i != d0_idx);
1044
1045        BUG_ON(faila == failb);
1046        if (failb < faila)
1047                swap(faila, failb);
1048        pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1049                 __func__, (unsigned long long)sh->sector, faila, failb);
1050
1051        atomic_inc(&sh->count);
1052
1053        if (failb == syndrome_disks+1) {
1054                /* Q disk is one of the missing disks */
1055                if (faila == syndrome_disks) {
1056                        /* Missing P+Q, just recompute */
1057                        init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1058                                          ops_complete_compute, sh,
1059                                          to_addr_conv(sh, percpu));
1060                        return async_gen_syndrome(blocks, 0, syndrome_disks+2,
1061                                                  STRIPE_SIZE, &submit);
1062                } else {
1063                        struct page *dest;
1064                        int data_target;
1065                        int qd_idx = sh->qd_idx;
1066
1067                        /* Missing D+Q: recompute D from P, then recompute Q */
1068                        if (target == qd_idx)
1069                                data_target = target2;
1070                        else
1071                                data_target = target;
1072
1073                        count = 0;
1074                        for (i = disks; i-- ; ) {
1075                                if (i == data_target || i == qd_idx)
1076                                        continue;
1077                                blocks[count++] = sh->dev[i].page;
1078                        }
1079                        dest = sh->dev[data_target].page;
1080                        init_async_submit(&submit,
1081                                          ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1082                                          NULL, NULL, NULL,
1083                                          to_addr_conv(sh, percpu));
1084                        tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
1085                                       &submit);
1086
1087                        count = set_syndrome_sources(blocks, sh);
1088                        init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1089                                          ops_complete_compute, sh,
1090                                          to_addr_conv(sh, percpu));
1091                        return async_gen_syndrome(blocks, 0, count+2,
1092                                                  STRIPE_SIZE, &submit);
1093                }
1094        } else {
1095                init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1096                                  ops_complete_compute, sh,
1097                                  to_addr_conv(sh, percpu));
1098                if (failb == syndrome_disks) {
1099                        /* We're missing D+P. */
1100                        return async_raid6_datap_recov(syndrome_disks+2,
1101                                                       STRIPE_SIZE, faila,
1102                                                       blocks, &submit);
1103                } else {
1104                        /* We're missing D+D. */
1105                        return async_raid6_2data_recov(syndrome_disks+2,
1106                                                       STRIPE_SIZE, faila, failb,
1107                                                       blocks, &submit);
1108                }
1109        }
1110}
1111
1112
1113static void ops_complete_prexor(void *stripe_head_ref)
1114{
1115        struct stripe_head *sh = stripe_head_ref;
1116
1117        pr_debug("%s: stripe %llu\n", __func__,
1118                (unsigned long long)sh->sector);
1119}
1120
1121static struct dma_async_tx_descriptor *
1122ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
1123               struct dma_async_tx_descriptor *tx)
1124{
1125        int disks = sh->disks;
1126        struct page **xor_srcs = percpu->scribble;
1127        int count = 0, pd_idx = sh->pd_idx, i;
1128        struct async_submit_ctl submit;
1129
1130        /* existing parity data subtracted */
1131        struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1132
1133        pr_debug("%s: stripe %llu\n", __func__,
1134                (unsigned long long)sh->sector);
1135
1136        for (i = disks; i--; ) {
1137                struct r5dev *dev = &sh->dev[i];
1138                /* Only process blocks that are known to be uptodate */
1139                if (test_bit(R5_Wantdrain, &dev->flags))
1140                        xor_srcs[count++] = dev->page;
1141        }
1142
1143        init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1144                          ops_complete_prexor, sh, to_addr_conv(sh, percpu));
1145        tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1146
1147        return tx;
1148}
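     /*
      * In effect this is the first half of the read-modify-write identity
      * P_new = P_old ^ D_old ^ D_new: the old parity is xor'd with the old
      * contents of the blocks marked R5_Wantdrain; the biodrain and
      * reconstruct steps that follow xor in the new data to complete P_new.
      */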
1149
1150static struct dma_async_tx_descriptor *
1151ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1152{
1153        int disks = sh->disks;
1154        int i;
1155
1156        pr_debug("%s: stripe %llu\n", __func__,
1157                (unsigned long long)sh->sector);
1158
1159        for (i = disks; i--; ) {
1160                struct r5dev *dev = &sh->dev[i];
1161                struct bio *chosen;
1162
1163                if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
1164                        struct bio *wbi;
1165
1166                        spin_lock_irq(&sh->stripe_lock);
1167                        chosen = dev->towrite;
1168                        dev->towrite = NULL;
1169                        BUG_ON(dev->written);
1170                        wbi = dev->written = chosen;
1171                        spin_unlock_irq(&sh->stripe_lock);
1172
1173                        while (wbi && wbi->bi_sector <
1174                                dev->sector + STRIPE_SECTORS) {
1175                                if (wbi->bi_rw & REQ_FUA)
1176                                        set_bit(R5_WantFUA, &dev->flags);
1177                                if (wbi->bi_rw & REQ_SYNC)
1178                                        set_bit(R5_SyncIO, &dev->flags);
1179                                if (wbi->bi_rw & REQ_DISCARD)
1180                                        set_bit(R5_Discard, &dev->flags);
1181                                else
1182                                        tx = async_copy_data(1, wbi, dev->page,
1183                                                dev->sector, tx);
1184                                wbi = r5_next_bio(wbi, dev->sector);
1185                        }
1186                }
1187        }
1188
1189        return tx;
1190}
1191
1192static void ops_complete_reconstruct(void *stripe_head_ref)
1193{
1194        struct stripe_head *sh = stripe_head_ref;
1195        int disks = sh->disks;
1196        int pd_idx = sh->pd_idx;
1197        int qd_idx = sh->qd_idx;
1198        int i;
1199        bool fua = false, sync = false, discard = false;
1200
1201        pr_debug("%s: stripe %llu\n", __func__,
1202                (unsigned long long)sh->sector);
1203
1204        for (i = disks; i--; ) {
1205                fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1206                sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
1207                discard |= test_bit(R5_Discard, &sh->dev[i].flags);
1208        }
1209
1210        for (i = disks; i--; ) {
1211                struct r5dev *dev = &sh->dev[i];
1212
1213                if (dev->written || i == pd_idx || i == qd_idx) {
1214                        if (!discard)
1215                                set_bit(R5_UPTODATE, &dev->flags);
1216                        if (fua)
1217                                set_bit(R5_WantFUA, &dev->flags);
1218                        if (sync)
1219                                set_bit(R5_SyncIO, &dev->flags);
1220                }
1221        }
1222
1223        if (sh->reconstruct_state == reconstruct_state_drain_run)
1224                sh->reconstruct_state = reconstruct_state_drain_result;
1225        else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
1226                sh->reconstruct_state = reconstruct_state_prexor_drain_result;
1227        else {
1228                BUG_ON(sh->reconstruct_state != reconstruct_state_run);
1229                sh->reconstruct_state = reconstruct_state_result;
1230        }
1231
1232        set_bit(STRIPE_HANDLE, &sh->state);
1233        release_stripe(sh);
1234}
1235
1236static void
1237ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1238                     struct dma_async_tx_descriptor *tx)
1239{
1240        int disks = sh->disks;
1241        struct page **xor_srcs = percpu->scribble;
1242        struct async_submit_ctl submit;
1243        int count = 0, pd_idx = sh->pd_idx, i;
1244        struct page *xor_dest;
1245        int prexor = 0;
1246        unsigned long flags;
1247
1248        pr_debug("%s: stripe %llu\n", __func__,
1249                (unsigned long long)sh->sector);
1250
1251        for (i = 0; i < sh->disks; i++) {
1252                if (pd_idx == i)
1253                        continue;
1254                if (!test_bit(R5_Discard, &sh->dev[i].flags))
1255                        break;
1256        }
1257        if (i >= sh->disks) {
1258                atomic_inc(&sh->count);
1259                set_bit(R5_Discard, &sh->dev[pd_idx].flags);
1260                ops_complete_reconstruct(sh);
1261                return;
1262        }
 1263        /* check if prexor is active, which means we only process blocks
 1264         * that are part of a read-modify-write (i.e. blocks with ->written set)
 1265         */
1266        if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1267                prexor = 1;
1268                xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1269                for (i = disks; i--; ) {
1270                        struct r5dev *dev = &sh->dev[i];
1271                        if (dev->written)
1272                                xor_srcs[count++] = dev->page;
1273                }
1274        } else {
1275                xor_dest = sh->dev[pd_idx].page;
1276                for (i = disks; i--; ) {
1277                        struct r5dev *dev = &sh->dev[i];
1278                        if (i != pd_idx)
1279                                xor_srcs[count++] = dev->page;
1280                }
1281        }
1282
 1283        /* 1/ if we prexor'd then the dest is reused as a source.
 1284         * 2/ if we did not prexor then we are redoing the parity.
 1285         * Set ASYNC_TX_XOR_DROP_DST or ASYNC_TX_XOR_ZERO_DST respectively
 1286         * for the synchronous xor case.
 1287         */
1288        flags = ASYNC_TX_ACK |
1289                (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1290
1291        atomic_inc(&sh->count);
1292
1293        init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
1294                          to_addr_conv(sh, percpu));
1295        if (unlikely(count == 1))
1296                tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1297        else
1298                tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1299}
1300
1301static void
1302ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1303                     struct dma_async_tx_descriptor *tx)
1304{
1305        struct async_submit_ctl submit;
1306        struct page **blocks = percpu->scribble;
1307        int count, i;
1308
1309        pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1310
1311        for (i = 0; i < sh->disks; i++) {
1312                if (sh->pd_idx == i || sh->qd_idx == i)
1313                        continue;
1314                if (!test_bit(R5_Discard, &sh->dev[i].flags))
1315                        break;
1316        }
1317        if (i >= sh->disks) {
1318                atomic_inc(&sh->count);
1319                set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
1320                set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
1321                ops_complete_reconstruct(sh);
1322                return;
1323        }
1324
1325        count = set_syndrome_sources(blocks, sh);
1326
1327        atomic_inc(&sh->count);
1328
1329        init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
1330                          sh, to_addr_conv(sh, percpu));
1331        async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
1332}
1333
1334static void ops_complete_check(void *stripe_head_ref)
1335{
1336        struct stripe_head *sh = stripe_head_ref;
1337
1338        pr_debug("%s: stripe %llu\n", __func__,
1339                (unsigned long long)sh->sector);
1340
1341        sh->check_state = check_state_check_result;
1342        set_bit(STRIPE_HANDLE, &sh->state);
1343        release_stripe(sh);
1344}
1345
1346static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1347{
1348        int disks = sh->disks;
1349        int pd_idx = sh->pd_idx;
1350        int qd_idx = sh->qd_idx;
1351        struct page *xor_dest;
1352        struct page **xor_srcs = percpu->scribble;
1353        struct dma_async_tx_descriptor *tx;
1354        struct async_submit_ctl submit;
1355        int count;
1356        int i;
1357
1358        pr_debug("%s: stripe %llu\n", __func__,
1359                (unsigned long long)sh->sector);
1360
1361        count = 0;
1362        xor_dest = sh->dev[pd_idx].page;
1363        xor_srcs[count++] = xor_dest;
1364        for (i = disks; i--; ) {
1365                if (i == pd_idx || i == qd_idx)
1366                        continue;
1367                xor_srcs[count++] = sh->dev[i].page;
1368        }
1369
1370        init_async_submit(&submit, 0, NULL, NULL, NULL,
1371                          to_addr_conv(sh, percpu));
1372        tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1373                           &sh->ops.zero_sum_result, &submit);
1374
1375        atomic_inc(&sh->count);
1376        init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1377        tx = async_trigger_callback(&submit);
1378}
1379
1380static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1381{
1382        struct page **srcs = percpu->scribble;
1383        struct async_submit_ctl submit;
1384        int count;
1385
1386        pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1387                (unsigned long long)sh->sector, checkp);
1388
1389        count = set_syndrome_sources(srcs, sh);
1390        if (!checkp)
1391                srcs[count] = NULL;
1392
1393        atomic_inc(&sh->count);
1394        init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
1395                          sh, to_addr_conv(sh, percpu));
1396        async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1397                           &sh->ops.zero_sum_result, percpu->spare_page, &submit);
1398}
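
/*
 * Worked example (illustrative): a P-parity check xors the parity page with
 * every data page; if parity is consistent the result is all zeroes and
 * ops.zero_sum_result reports no mismatch.  With two data pages d0, d1 and
 * p = d0 ^ d1:
 *
 *	p  ^ d0 ^ d1 == 0	(clean)
 *	p' ^ d0 ^ d1 != 0	(p' stale/corrupt -> zero_sum_result flags P)
 *
 * The P+Q variant feeds the same source list to async_syndrome_val(), which
 * validates both the xor parity and the RAID6 Q syndrome in one pass.
 */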
1399
1400static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1401{
1402        int overlap_clear = 0, i, disks = sh->disks;
1403        struct dma_async_tx_descriptor *tx = NULL;
1404        struct r5conf *conf = sh->raid_conf;
1405        int level = conf->level;
1406        struct raid5_percpu *percpu;
1407        unsigned long cpu;
1408
1409        cpu = get_cpu();
1410        percpu = per_cpu_ptr(conf->percpu, cpu);
1411        if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
1412                ops_run_biofill(sh);
1413                overlap_clear++;
1414        }
1415
1416        if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
1417                if (level < 6)
1418                        tx = ops_run_compute5(sh, percpu);
1419                else {
1420                        if (sh->ops.target2 < 0 || sh->ops.target < 0)
1421                                tx = ops_run_compute6_1(sh, percpu);
1422                        else
1423                                tx = ops_run_compute6_2(sh, percpu);
1424                }
1425                /* terminate the chain if reconstruct is not set to be run */
1426                if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
1427                        async_tx_ack(tx);
1428        }
1429
1430        if (test_bit(STRIPE_OP_PREXOR, &ops_request))
1431                tx = ops_run_prexor(sh, percpu, tx);
1432
1433        if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
1434                tx = ops_run_biodrain(sh, tx);
1435                overlap_clear++;
1436        }
1437
1438        if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
1439                if (level < 6)
1440                        ops_run_reconstruct5(sh, percpu, tx);
1441                else
1442                        ops_run_reconstruct6(sh, percpu, tx);
1443        }
1444
1445        if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
1446                if (sh->check_state == check_state_run)
1447                        ops_run_check_p(sh, percpu);
1448                else if (sh->check_state == check_state_run_q)
1449                        ops_run_check_pq(sh, percpu, 0);
1450                else if (sh->check_state == check_state_run_pq)
1451                        ops_run_check_pq(sh, percpu, 1);
1452                else
1453                        BUG();
1454        }
1455
1456        if (overlap_clear)
1457                for (i = disks; i--; ) {
1458                        struct r5dev *dev = &sh->dev[i];
1459                        if (test_and_clear_bit(R5_Overlap, &dev->flags))
1460                                wake_up(&sh->raid_conf->wait_for_overlap);
1461                }
1462        put_cpu();
1463}
1464
1465#ifdef CONFIG_MULTICORE_RAID456
1466static void async_run_ops(void *param, async_cookie_t cookie)
1467{
1468        struct stripe_head *sh = param;
1469        unsigned long ops_request = sh->ops.request;
1470
1471        clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
1472        wake_up(&sh->ops.wait_for_ops);
1473
1474        __raid_run_ops(sh, ops_request);
1475        release_stripe(sh);
1476}
1477
1478static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1479{
1480        /* since handle_stripe can be called outside of raid5d context
1481         * we need to ensure sh->ops.request is de-staged before another
1482         * request arrives
1483         */
1484        wait_event(sh->ops.wait_for_ops,
1485                   !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
1486        sh->ops.request = ops_request;
1487
1488        atomic_inc(&sh->count);
1489        async_schedule(async_run_ops, sh);
1490}
1491#else
1492#define raid_run_ops __raid_run_ops
1493#endif
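
/*
 * Ordering sketch (illustrative): __raid_run_ops() issues the requested
 * per-stripe operations in a fixed order, each feeding its tx descriptor to
 * the next:
 *
 *	BIOFILL -> COMPUTE_BLK -> PREXOR -> BIODRAIN -> RECONSTRUCT -> CHECK
 *
 * so, for example, a read-modify-write sets PREXOR, BIODRAIN and RECONSTRUCT
 * together: the old data is xor'd out of the parity, new bio data is copied
 * into the stripe cache, and the parity is recomputed as one async chain.
 */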
1494
1495static int grow_one_stripe(struct r5conf *conf)
1496{
1497        struct stripe_head *sh;
1498        sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
1499        if (!sh)
1500                return 0;
1501
1502        sh->raid_conf = conf;
1503        #ifdef CONFIG_MULTICORE_RAID456
1504        init_waitqueue_head(&sh->ops.wait_for_ops);
1505        #endif
1506
1507        spin_lock_init(&sh->stripe_lock);
1508
1509        if (grow_buffers(sh)) {
1510                shrink_buffers(sh);
1511                kmem_cache_free(conf->slab_cache, sh);
1512                return 0;
1513        }
1514        /* we just created an active stripe so... */
1515        atomic_set(&sh->count, 1);
1516        atomic_inc(&conf->active_stripes);
1517        INIT_LIST_HEAD(&sh->lru);
1518        release_stripe(sh);
1519        return 1;
1520}
1521
1522static int grow_stripes(struct r5conf *conf, int num)
1523{
1524        struct kmem_cache *sc;
1525        int devs = max(conf->raid_disks, conf->previous_raid_disks);
1526
1527        if (conf->mddev->gendisk)
1528                sprintf(conf->cache_name[0],
1529                        "raid%d-%s", conf->level, mdname(conf->mddev));
1530        else
1531                sprintf(conf->cache_name[0],
1532                        "raid%d-%p", conf->level, conf->mddev);
1533        sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
1534
1535        conf->active_name = 0;
1536        sc = kmem_cache_create(conf->cache_name[conf->active_name],
1537                               sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
1538                               0, 0, NULL);
1539        if (!sc)
1540                return 1;
1541        conf->slab_cache = sc;
1542        conf->pool_size = devs;
1543        while (num--)
1544                if (!grow_one_stripe(conf))
1545                        return 1;
1546        return 0;
1547}
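
/*
 * Naming sketch (illustrative): for a level-5 array on md0 the two cache
 * names come out as "raid5-md0" and "raid5-md0-alt".  resize_stripes()
 * below builds the larger stripe_heads in the currently unused cache and
 * then flips conf->active_name, so a reshape never has to reuse the cache
 * it is draining.
 */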
1548
1549/**
1550 * scribble_len - return the required size of the scribble region
1551 * @num: total number of disks in the array
1552 *
1553 * The size must be enough to contain:
1554 * 1/ a struct page pointer for each device in the array +2
1555 * 2/ room to convert each entry in (1) to its corresponding dma
1556 *    (dma_map_page()) or page (page_address()) address.
1557 *
1558 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
1559 * calculate over all devices (not just the data blocks), using zeros in place
1560 * of the P and Q blocks.
1561 */
1562static size_t scribble_len(int num)
1563{
1564        size_t len;
1565
1566        len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
1567
1568        return len;
1569}
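
/*
 * Worked example (illustrative, assuming 8-byte page pointers and an 8-byte
 * addr_conv_t): a 6-device array needs 6+2 page pointers plus 6+2 address
 * conversion slots, so
 *
 *	scribble_len(6) = 8 * (6 + 2) + 8 * (6 + 2) = 128 bytes
 *
 * per CPU; resize_stripes() below reallocates this region when a reshape
 * adds devices.
 */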
1570
1571static int resize_stripes(struct r5conf *conf, int newsize)
1572{
1573        /* Make all the stripes able to hold 'newsize' devices.
1574         * New slots in each stripe get 'page' set to a new page.
1575         *
1576         * This happens in stages:
1577         * 1/ create a new kmem_cache and allocate the required number of
1578         *    stripe_heads.
1579         * 2/ gather all the old stripe_heads and transfer the pages across
1580         *    to the new stripe_heads.  This will have the side effect of
1581         *    freezing the array as once all stripe_heads have been collected,
1582         *    no IO will be possible.  Old stripe heads are freed once their
1583         *    pages have been transferred over, and the old kmem_cache is
1584         *    freed when all stripes are done.
1585         * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
1586         *    we simply return a failure status - no need to clean anything up.
1587         * 4/ allocate new pages for the new slots in the new stripe_heads.
1588         *    If this fails, we don't bother trying to shrink the
1589         *    stripe_heads down again, we just leave them as they are.
1590         *    As each stripe_head is processed the new one is released into
1591         *    active service.
1592         *
1593         * Once step2 is started, we cannot afford to wait for a write,
1594         * so we use GFP_NOIO allocations.
1595         */
1596        struct stripe_head *osh, *nsh;
1597        LIST_HEAD(newstripes);
1598        struct disk_info *ndisks;
1599        unsigned long cpu;
1600        int err;
1601        struct kmem_cache *sc;
1602        int i;
1603
1604        if (newsize <= conf->pool_size)
1605                return 0; /* never bother to shrink */
1606
1607        err = md_allow_write(conf->mddev);
1608        if (err)
1609                return err;
1610
1611        /* Step 1 */
1612        sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
1613                               sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
1614                               0, 0, NULL);
1615        if (!sc)
1616                return -ENOMEM;
1617
1618        for (i = conf->max_nr_stripes; i; i--) {
1619                nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
1620                if (!nsh)
1621                        break;
1622
1623                nsh->raid_conf = conf;
1624                #ifdef CONFIG_MULTICORE_RAID456
1625                init_waitqueue_head(&nsh->ops.wait_for_ops);
1626                #endif
1627                spin_lock_init(&nsh->stripe_lock);
1628
1629                list_add(&nsh->lru, &newstripes);
1630        }
1631        if (i) {
1632                /* didn't get enough, give up */
1633                while (!list_empty(&newstripes)) {
1634                        nsh = list_entry(newstripes.next, struct stripe_head, lru);
1635                        list_del(&nsh->lru);
1636                        kmem_cache_free(sc, nsh);
1637                }
1638                kmem_cache_destroy(sc);
1639                return -ENOMEM;
1640        }
1641        /* Step 2 - Must use GFP_NOIO now.
1642         * OK, we have enough stripes, start collecting inactive
1643         * stripes and copying them over
1644         */
1645        list_for_each_entry(nsh, &newstripes, lru) {
1646                spin_lock_irq(&conf->device_lock);
1647                wait_event_lock_irq(conf->wait_for_stripe,
1648                                    !list_empty(&conf->inactive_list),
1649                                    conf->device_lock,
1650                                    );
1651                osh = get_free_stripe(conf);
1652                spin_unlock_irq(&conf->device_lock);
1653                atomic_set(&nsh->count, 1);
1654                for(i=0; i<conf->pool_size; i++)
1655                        nsh->dev[i].page = osh->dev[i].page;
1656                for( ; i<newsize; i++)
1657                        nsh->dev[i].page = NULL;
1658                kmem_cache_free(conf->slab_cache, osh);
1659        }
1660        kmem_cache_destroy(conf->slab_cache);
1661
1662        /* Step 3.
1663         * At this point, we are holding all the stripes so the array
1664         * is completely stalled, so now is a good time to resize
1665         * conf->disks and the scribble region
1666         */
1667        ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1668        if (ndisks) {
1669                for (i=0; i<conf->raid_disks; i++)
1670                        ndisks[i] = conf->disks[i];
1671                kfree(conf->disks);
1672                conf->disks = ndisks;
1673        } else
1674                err = -ENOMEM;
1675
1676        get_online_cpus();
1677        conf->scribble_len = scribble_len(newsize);
1678        for_each_present_cpu(cpu) {
1679                struct raid5_percpu *percpu;
1680                void *scribble;
1681
1682                percpu = per_cpu_ptr(conf->percpu, cpu);
1683                scribble = kmalloc(conf->scribble_len, GFP_NOIO);
1684
1685                if (scribble) {
1686                        kfree(percpu->scribble);
1687                        percpu->scribble = scribble;
1688                } else {
1689                        err = -ENOMEM;
1690                        break;
1691                }
1692        }
1693        put_online_cpus();
1694
1695        /* Step 4, return new stripes to service */
1696        while(!list_empty(&newstripes)) {
1697                nsh = list_entry(newstripes.next, struct stripe_head, lru);
1698                list_del_init(&nsh->lru);
1699
1700                for (i=conf->raid_disks; i < newsize; i++)
1701                        if (nsh->dev[i].page == NULL) {
1702                                struct page *p = alloc_page(GFP_NOIO);
1703                                nsh->dev[i].page = p;
1704                                if (!p)
1705                                        err = -ENOMEM;
1706                        }
1707                release_stripe(nsh);
1708        }
1709        /* critical section passed, GFP_NOIO no longer needed */
1710
1711        conf->slab_cache = sc;
1712        conf->active_name = 1-conf->active_name;
1713        conf->pool_size = newsize;
1714        return err;
1715}
1716
1717static int drop_one_stripe(struct r5conf *conf)
1718{
1719        struct stripe_head *sh;
1720
1721        spin_lock_irq(&conf->device_lock);
1722        sh = get_free_stripe(conf);
1723        spin_unlock_irq(&conf->device_lock);
1724        if (!sh)
1725                return 0;
1726        BUG_ON(atomic_read(&sh->count));
1727        shrink_buffers(sh);
1728        kmem_cache_free(conf->slab_cache, sh);
1729        atomic_dec(&conf->active_stripes);
1730        return 1;
1731}
1732
1733static void shrink_stripes(struct r5conf *conf)
1734{
1735        while (drop_one_stripe(conf))
1736                ;
1737
1738        if (conf->slab_cache)
1739                kmem_cache_destroy(conf->slab_cache);
1740        conf->slab_cache = NULL;
1741}
1742
1743static void raid5_end_read_request(struct bio * bi, int error)
1744{
1745        struct stripe_head *sh = bi->bi_private;
1746        struct r5conf *conf = sh->raid_conf;
1747        int disks = sh->disks, i;
1748        int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1749        char b[BDEVNAME_SIZE];
1750        struct md_rdev *rdev = NULL;
1751        sector_t s;
1752
1753        for (i=0 ; i<disks; i++)
1754                if (bi == &sh->dev[i].req)
1755                        break;
1756
1757        pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
1758                (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1759                uptodate);
1760        if (i == disks) {
1761                BUG();
1762                return;
1763        }
1764        if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
1765                /* If replacement finished while this request was outstanding,
1766                 * 'replacement' might be NULL already.
1767                 * In that case it moved down to 'rdev'.
1768                 * rdev is not removed until all requests are finished.
1769                 */
1770                rdev = conf->disks[i].replacement;
1771        if (!rdev)
1772                rdev = conf->disks[i].rdev;
1773
1774        if (use_new_offset(conf, sh))
1775                s = sh->sector + rdev->new_data_offset;
1776        else
1777                s = sh->sector + rdev->data_offset;
1778        if (uptodate) {
1779                set_bit(R5_UPTODATE, &sh->dev[i].flags);
1780                if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1781                        /* Note that this cannot happen on a
1782                         * replacement device.  We just fail those on
1783                         * any error
1784                         */
1785                        printk_ratelimited(
1786                                KERN_INFO
1787                                "md/raid:%s: read error corrected"
1788                                " (%lu sectors at %llu on %s)\n",
1789                                mdname(conf->mddev), STRIPE_SECTORS,
1790                                (unsigned long long)s,
1791                                bdevname(rdev->bdev, b));
1792                        atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1793                        clear_bit(R5_ReadError, &sh->dev[i].flags);
1794                        clear_bit(R5_ReWrite, &sh->dev[i].flags);
1795                } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
1796                        clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1797
1798                if (atomic_read(&rdev->read_errors))
1799                        atomic_set(&rdev->read_errors, 0);
1800        } else {
1801                const char *bdn = bdevname(rdev->bdev, b);
1802                int retry = 0;
1803                int set_bad = 0;
1804
1805                clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1806                atomic_inc(&rdev->read_errors);
1807                if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
1808                        printk_ratelimited(
1809                                KERN_WARNING
1810                                "md/raid:%s: read error on replacement device "
1811                                "(sector %llu on %s).\n",
1812                                mdname(conf->mddev),
1813                                (unsigned long long)s,
1814                                bdn);
1815                else if (conf->mddev->degraded >= conf->max_degraded) {
1816                        set_bad = 1;
1817                        printk_ratelimited(
1818                                KERN_WARNING
1819                                "md/raid:%s: read error not correctable "
1820                                "(sector %llu on %s).\n",
1821                                mdname(conf->mddev),
1822                                (unsigned long long)s,
1823                                bdn);
1824                } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
1825                        /* Oh, no!!! */
1826                        set_bad = 1;
1827                        printk_ratelimited(
1828                                KERN_WARNING
1829                                "md/raid:%s: read error NOT corrected!! "
1830                                "(sector %llu on %s).\n",
1831                                mdname(conf->mddev),
1832                                (unsigned long long)s,
1833                                bdn);
1834                } else if (atomic_read(&rdev->read_errors)
1835                         > conf->max_nr_stripes)
1836                        printk(KERN_WARNING
1837                               "md/raid:%s: Too many read errors, failing device %s.\n",
1838                               mdname(conf->mddev), bdn);
1839                else
1840                        retry = 1;
1841                if (retry)
1842                        if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
1843                                set_bit(R5_ReadError, &sh->dev[i].flags);
1844                                clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1845                        } else
1846                                set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1847                else {
1848                        clear_bit(R5_ReadError, &sh->dev[i].flags);
1849                        clear_bit(R5_ReWrite, &sh->dev[i].flags);
1850                        if (!(set_bad
1851                              && test_bit(In_sync, &rdev->flags)
1852                              && rdev_set_badblocks(
1853                                      rdev, sh->sector, STRIPE_SECTORS, 0)))
1854                                md_error(conf->mddev, rdev);
1855                }
1856        }
1857        rdev_dec_pending(rdev, conf->mddev);
1858        clear_bit(R5_LOCKED, &sh->dev[i].flags);
1859        set_bit(STRIPE_HANDLE, &sh->state);
1860        release_stripe(sh);
1861}
1862
1863static void raid5_end_write_request(struct bio *bi, int error)
1864{
1865        struct stripe_head *sh = bi->bi_private;
1866        struct r5conf *conf = sh->raid_conf;
1867        int disks = sh->disks, i;
1868        struct md_rdev *uninitialized_var(rdev);
1869        int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1870        sector_t first_bad;
1871        int bad_sectors;
1872        int replacement = 0;
1873
1874        for (i = 0 ; i < disks; i++) {
1875                if (bi == &sh->dev[i].req) {
1876                        rdev = conf->disks[i].rdev;
1877                        break;
1878                }
1879                if (bi == &sh->dev[i].rreq) {
1880                        rdev = conf->disks[i].replacement;
1881                        if (rdev)
1882                                replacement = 1;
1883                        else
1884                                /* rdev was removed and 'replacement'
1885                                 * replaced it.  rdev is not removed
1886                                 * until all requests are finished.
1887                                 */
1888                                rdev = conf->disks[i].rdev;
1889                        break;
1890                }
1891        }
1892        pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
1893                (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1894                uptodate);
1895        if (i == disks) {
1896                BUG();
1897                return;
1898        }
1899
1900        if (replacement) {
1901                if (!uptodate)
1902                        md_error(conf->mddev, rdev);
1903                else if (is_badblock(rdev, sh->sector,
1904                                     STRIPE_SECTORS,
1905                                     &first_bad, &bad_sectors))
1906                        set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
1907        } else {
1908                if (!uptodate) {
1909                        set_bit(WriteErrorSeen, &rdev->flags);
1910                        set_bit(R5_WriteError, &sh->dev[i].flags);
1911                        if (!test_and_set_bit(WantReplacement, &rdev->flags))
1912                                set_bit(MD_RECOVERY_NEEDED,
1913                                        &rdev->mddev->recovery);
1914                } else if (is_badblock(rdev, sh->sector,
1915                                       STRIPE_SECTORS,
1916                                       &first_bad, &bad_sectors))
1917                        set_bit(R5_MadeGood, &sh->dev[i].flags);
1918        }
1919        rdev_dec_pending(rdev, conf->mddev);
1920
1921        if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
1922                clear_bit(R5_LOCKED, &sh->dev[i].flags);
1923        set_bit(STRIPE_HANDLE, &sh->state);
1924        release_stripe(sh);
1925}
1926
1927static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
1928        
1929static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1930{
1931        struct r5dev *dev = &sh->dev[i];
1932
1933        bio_init(&dev->req);
1934        dev->req.bi_io_vec = &dev->vec;
1935        dev->req.bi_vcnt++;
1936        dev->req.bi_max_vecs++;
1937        dev->req.bi_private = sh;
1938        dev->vec.bv_page = dev->page;
1939
1940        bio_init(&dev->rreq);
1941        dev->rreq.bi_io_vec = &dev->rvec;
1942        dev->rreq.bi_vcnt++;
1943        dev->rreq.bi_max_vecs++;
1944        dev->rreq.bi_private = sh;
1945        dev->rvec.bv_page = dev->page;
1946
1947        dev->flags = 0;
1948        dev->sector = compute_blocknr(sh, i, previous);
1949}
1950
1951static void error(struct mddev *mddev, struct md_rdev *rdev)
1952{
1953        char b[BDEVNAME_SIZE];
1954        struct r5conf *conf = mddev->private;
1955        unsigned long flags;
1956        pr_debug("raid456: error called\n");
1957
1958        spin_lock_irqsave(&conf->device_lock, flags);
1959        clear_bit(In_sync, &rdev->flags);
1960        mddev->degraded = calc_degraded(conf);
1961        spin_unlock_irqrestore(&conf->device_lock, flags);
1962        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1963
1964        set_bit(Blocked, &rdev->flags);
1965        set_bit(Faulty, &rdev->flags);
1966        set_bit(MD_CHANGE_DEVS, &mddev->flags);
1967        printk(KERN_ALERT
1968               "md/raid:%s: Disk failure on %s, disabling device.\n"
1969               "md/raid:%s: Operation continuing on %d devices.\n",
1970               mdname(mddev),
1971               bdevname(rdev->bdev, b),
1972               mdname(mddev),
1973               conf->raid_disks - mddev->degraded);
1974}
1975
1976/*
1977 * Input: a 'big' sector number,
1978 * Output: index of the data and parity disk, and the sector # in them.
1979 */
1980static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
1981                                     int previous, int *dd_idx,
1982                                     struct stripe_head *sh)
1983{
1984        sector_t stripe, stripe2;
1985        sector_t chunk_number;
1986        unsigned int chunk_offset;
1987        int pd_idx, qd_idx;
1988        int ddf_layout = 0;
1989        sector_t new_sector;
1990        int algorithm = previous ? conf->prev_algo
1991                                 : conf->algorithm;
1992        int sectors_per_chunk = previous ? conf->prev_chunk_sectors
1993                                         : conf->chunk_sectors;
1994        int raid_disks = previous ? conf->previous_raid_disks
1995                                  : conf->raid_disks;
1996        int data_disks = raid_disks - conf->max_degraded;
1997
1998        /* First compute the information on this sector */
1999
2000        /*
2001         * Compute the chunk number and the sector offset inside the chunk
2002         */
2003        chunk_offset = sector_div(r_sector, sectors_per_chunk);
2004        chunk_number = r_sector;
2005
2006        /*
2007         * Compute the stripe number
2008         */
2009        stripe = chunk_number;
2010        *dd_idx = sector_div(stripe, data_disks);
2011        stripe2 = stripe;
2012        /*
2013         * Select the parity disk based on the user selected algorithm.
2014         */
2015        pd_idx = qd_idx = -1;
2016        switch(conf->level) {
2017        case 4:
2018                pd_idx = data_disks;
2019                break;
2020        case 5:
2021                switch (algorithm) {
2022                case ALGORITHM_LEFT_ASYMMETRIC:
2023                        pd_idx = data_disks - sector_div(stripe2, raid_disks);
2024                        if (*dd_idx >= pd_idx)
2025                                (*dd_idx)++;
2026                        break;
2027                case ALGORITHM_RIGHT_ASYMMETRIC:
2028                        pd_idx = sector_div(stripe2, raid_disks);
2029                        if (*dd_idx >= pd_idx)
2030                                (*dd_idx)++;
2031                        break;
2032                case ALGORITHM_LEFT_SYMMETRIC:
2033                        pd_idx = data_disks - sector_div(stripe2, raid_disks);
2034                        *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2035                        break;
2036                case ALGORITHM_RIGHT_SYMMETRIC:
2037                        pd_idx = sector_div(stripe2, raid_disks);
2038                        *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2039                        break;
2040                case ALGORITHM_PARITY_0:
2041                        pd_idx = 0;
2042                        (*dd_idx)++;
2043                        break;
2044                case ALGORITHM_PARITY_N:
2045                        pd_idx = data_disks;
2046                        break;
2047                default:
2048                        BUG();
2049                }
2050                break;
2051        case 6:
2052
2053                switch (algorithm) {
2054                case ALGORITHM_LEFT_ASYMMETRIC:
2055                        pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2056                        qd_idx = pd_idx + 1;
2057                        if (pd_idx == raid_disks-1) {
2058                                (*dd_idx)++;    /* Q D D D P */
2059                                qd_idx = 0;
2060                        } else if (*dd_idx >= pd_idx)
2061                                (*dd_idx) += 2; /* D D P Q D */
2062                        break;
2063                case ALGORITHM_RIGHT_ASYMMETRIC:
2064                        pd_idx = sector_div(stripe2, raid_disks);
2065                        qd_idx = pd_idx + 1;
2066                        if (pd_idx == raid_disks-1) {
2067                                (*dd_idx)++;    /* Q D D D P */
2068                                qd_idx = 0;
2069                        } else if (*dd_idx >= pd_idx)
2070                                (*dd_idx) += 2; /* D D P Q D */
2071                        break;
2072                case ALGORITHM_LEFT_SYMMETRIC:
2073                        pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2074                        qd_idx = (pd_idx + 1) % raid_disks;
2075                        *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2076                        break;
2077                case ALGORITHM_RIGHT_SYMMETRIC:
2078                        pd_idx = sector_div(stripe2, raid_disks);
2079                        qd_idx = (pd_idx + 1) % raid_disks;
2080                        *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2081                        break;
2082
2083                case ALGORITHM_PARITY_0:
2084                        pd_idx = 0;
2085                        qd_idx = 1;
2086                        (*dd_idx) += 2;
2087                        break;
2088                case ALGORITHM_PARITY_N:
2089                        pd_idx = data_disks;
2090                        qd_idx = data_disks + 1;
2091                        break;
2092
2093                case ALGORITHM_ROTATING_ZERO_RESTART:
2094                        /* Exactly the same as RIGHT_ASYMMETRIC, but the order
2095                         * of blocks for computing Q is different.
2096                         */
2097                        pd_idx = sector_div(stripe2, raid_disks);
2098                        qd_idx = pd_idx + 1;
2099                        if (pd_idx == raid_disks-1) {
2100                                (*dd_idx)++;    /* Q D D D P */
2101                                qd_idx = 0;
2102                        } else if (*dd_idx >= pd_idx)
2103                                (*dd_idx) += 2; /* D D P Q D */
2104                        ddf_layout = 1;
2105                        break;
2106
2107                case ALGORITHM_ROTATING_N_RESTART:
2108                        /* Same as left_asymmetric, but the first stripe is
2109                         * D D D P Q  rather than
2110                         * Q D D D P
2111                         */
2112                        stripe2 += 1;
2113                        pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2114                        qd_idx = pd_idx + 1;
2115                        if (pd_idx == raid_disks-1) {
2116                                (*dd_idx)++;    /* Q D D D P */
2117                                qd_idx = 0;
2118                        } else if (*dd_idx >= pd_idx)
2119                                (*dd_idx) += 2; /* D D P Q D */
2120                        ddf_layout = 1;
2121                        break;
2122
2123                case ALGORITHM_ROTATING_N_CONTINUE:
2124                        /* Same as left_symmetric but Q is before P */
2125                        pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2126                        qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
2127                        *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2128                        ddf_layout = 1;
2129                        break;
2130
2131                case ALGORITHM_LEFT_ASYMMETRIC_6:
2132                        /* RAID5 left_asymmetric, with Q on last device */
2133                        pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2134                        if (*dd_idx >= pd_idx)
2135                                (*dd_idx)++;
2136                        qd_idx = raid_disks - 1;
2137                        break;
2138
2139                case ALGORITHM_RIGHT_ASYMMETRIC_6:
2140                        pd_idx = sector_div(stripe2, raid_disks-1);
2141                        if (*dd_idx >= pd_idx)
2142                                (*dd_idx)++;
2143                        qd_idx = raid_disks - 1;
2144                        break;
2145
2146                case ALGORITHM_LEFT_SYMMETRIC_6:
2147                        pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2148                        *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2149                        qd_idx = raid_disks - 1;
2150                        break;
2151
2152                case ALGORITHM_RIGHT_SYMMETRIC_6:
2153                        pd_idx = sector_div(stripe2, raid_disks-1);
2154                        *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2155                        qd_idx = raid_disks - 1;
2156                        break;
2157
2158                case ALGORITHM_PARITY_0_6:
2159                        pd_idx = 0;
2160                        (*dd_idx)++;
2161                        qd_idx = raid_disks - 1;
2162                        break;
2163
2164                default:
2165                        BUG();
2166                }
2167                break;
2168        }
2169
2170        if (sh) {
2171                sh->pd_idx = pd_idx;
2172                sh->qd_idx = qd_idx;
2173                sh->ddf_layout = ddf_layout;
2174        }
2175        /*
2176         * Finally, compute the new sector number
2177         */
2178        new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
2179        return new_sector;
2180}
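
/*
 * Worked example (illustrative): RAID5, 4 devices, 64KiB chunks
 * (sectors_per_chunk = 128), ALGORITHM_LEFT_SYMMETRIC, r_sector = 1000:
 *
 *	chunk_offset = 1000 % 128 = 104,   chunk_number = 7
 *	dd_idx       = 7 % 3 = 1,          stripe = 7 / 3 = 2
 *	pd_idx       = 3 - (2 % 4) = 1
 *	dd_idx       = (1 + 1 + 1) % 4 = 3
 *	new_sector   = 2 * 128 + 104 = 360
 *
 * so array sector 1000 lives at sector 360 of member disk 3, and the parity
 * block for that stripe sits on member disk 1.
 */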
2181
2182
2183static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
2184{
2185        struct r5conf *conf = sh->raid_conf;
2186        int raid_disks = sh->disks;
2187        int data_disks = raid_disks - conf->max_degraded;
2188        sector_t new_sector = sh->sector, check;
2189        int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2190                                         : conf->chunk_sectors;
2191        int algorithm = previous ? conf->prev_algo
2192                                 : conf->algorithm;
2193        sector_t stripe;
2194        int chunk_offset;
2195        sector_t chunk_number;
2196        int dummy1, dd_idx = i;
2197        sector_t r_sector;
2198        struct stripe_head sh2;
2199
2200
2201        chunk_offset = sector_div(new_sector, sectors_per_chunk);
2202        stripe = new_sector;
2203
2204        if (i == sh->pd_idx)
2205                return 0;
2206        switch(conf->level) {
2207        case 4: break;
2208        case 5:
2209                switch (algorithm) {
2210                case ALGORITHM_LEFT_ASYMMETRIC:
2211                case ALGORITHM_RIGHT_ASYMMETRIC:
2212                        if (i > sh->pd_idx)
2213                                i--;
2214                        break;
2215                case ALGORITHM_LEFT_SYMMETRIC:
2216                case ALGORITHM_RIGHT_SYMMETRIC:
2217                        if (i < sh->pd_idx)
2218                                i += raid_disks;
2219                        i -= (sh->pd_idx + 1);
2220                        break;
2221                case ALGORITHM_PARITY_0:
2222                        i -= 1;
2223                        break;
2224                case ALGORITHM_PARITY_N:
2225                        break;
2226                default:
2227                        BUG();
2228                }
2229                break;
2230        case 6:
2231                if (i == sh->qd_idx)
2232                        return 0; /* It is the Q disk */
2233                switch (algorithm) {
2234                case ALGORITHM_LEFT_ASYMMETRIC:
2235                case ALGORITHM_RIGHT_ASYMMETRIC:
2236                case ALGORITHM_ROTATING_ZERO_RESTART:
2237                case ALGORITHM_ROTATING_N_RESTART:
2238                        if (sh->pd_idx == raid_disks-1)
2239                                i--;    /* Q D D D P */
2240                        else if (i > sh->pd_idx)
2241                                i -= 2; /* D D P Q D */
2242                        break;
2243                case ALGORITHM_LEFT_SYMMETRIC:
2244                case ALGORITHM_RIGHT_SYMMETRIC:
2245                        if (sh->pd_idx == raid_disks-1)
2246                                i--; /* Q D D D P */
2247                        else {
2248                                /* D D P Q D */
2249                                if (i < sh->pd_idx)
2250                                        i += raid_disks;
2251                                i -= (sh->pd_idx + 2);
2252                        }
2253                        break;
2254                case ALGORITHM_PARITY_0:
2255                        i -= 2;
2256                        break;
2257                case ALGORITHM_PARITY_N:
2258                        break;
2259                case ALGORITHM_ROTATING_N_CONTINUE:
2260                        /* Like left_symmetric, but P is before Q */
2261                        if (sh->pd_idx == 0)
2262                                i--;    /* P D D D Q */
2263                        else {
2264                                /* D D Q P D */
2265                                if (i < sh->pd_idx)
2266                                        i += raid_disks;
2267                                i -= (sh->pd_idx + 1);
2268                        }
2269                        break;
2270                case ALGORITHM_LEFT_ASYMMETRIC_6:
2271                case ALGORITHM_RIGHT_ASYMMETRIC_6:
2272                        if (i > sh->pd_idx)
2273                                i--;
2274                        break;
2275                case ALGORITHM_LEFT_SYMMETRIC_6:
2276                case ALGORITHM_RIGHT_SYMMETRIC_6:
2277                        if (i < sh->pd_idx)
2278                                i += data_disks + 1;
2279                        i -= (sh->pd_idx + 1);
2280                        break;
2281                case ALGORITHM_PARITY_0_6:
2282                        i -= 1;
2283                        break;
2284                default:
2285                        BUG();
2286                }
2287                break;
2288        }
2289
2290        chunk_number = stripe * data_disks + i;
2291        r_sector = chunk_number * sectors_per_chunk + chunk_offset;
2292
2293        check = raid5_compute_sector(conf, r_sector,
2294                                     previous, &dummy1, &sh2);
2295        if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
2296                || sh2.qd_idx != sh->qd_idx) {
2297                printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
2298                       mdname(conf->mddev));
2299                return 0;
2300        }
2301        return r_sector;
2302}
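
/*
 * Worked example (illustrative), inverting the mapping above: with the same
 * geometry, sh->sector = 360, pd_idx = 1 and i = 3,
 *
 *	chunk_offset = 360 % 128 = 104,    stripe = 2
 *	i -= (pd_idx + 1)              ->  i = 1
 *	chunk_number = 2 * 3 + 1 = 7
 *	r_sector     = 7 * 128 + 104 = 1000
 *
 * which matches the forward mapping - exactly the round trip that the
 * raid5_compute_sector() self-check at the end of compute_blocknr() verifies.
 */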
2303
2304
2305static void
2306schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2307                         int rcw, int expand)
2308{
2309        int i, pd_idx = sh->pd_idx, disks = sh->disks;
2310        struct r5conf *conf = sh->raid_conf;
2311        int level = conf->level;
2312
2313        if (rcw) {
2314                /* if we are not expanding this is a proper write request, and
2315                 * there will be bios with new data to be drained into the
2316                 * stripe cache
2317                 */
2318                if (!expand) {
2319                        sh->reconstruct_state = reconstruct_state_drain_run;
2320                        set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2321                } else
2322                        sh->reconstruct_state = reconstruct_state_run;
2323
2324                set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2325
2326                for (i = disks; i--; ) {
2327                        struct r5dev *dev = &sh->dev[i];
2328
2329                        if (dev->towrite) {
2330                                set_bit(R5_LOCKED, &dev->flags);
2331                                set_bit(R5_Wantdrain, &dev->flags);
2332                                if (!expand)
2333                                        clear_bit(R5_UPTODATE, &dev->flags);
2334                                s->locked++;
2335                        }
2336                }
2337                if (s->locked + conf->max_degraded == disks)
2338                        if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2339                                atomic_inc(&conf->pending_full_writes);
2340        } else {
2341                BUG_ON(level == 6);
2342                BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
2343                        test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
2344
2345                sh->reconstruct_state = reconstruct_state_prexor_drain_run;
2346                set_bit(STRIPE_OP_PREXOR, &s->ops_request);
2347                set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2348                set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2349
2350                for (i = disks; i--; ) {
2351                        struct r5dev *dev = &sh->dev[i];
2352                        if (i == pd_idx)
2353                                continue;
2354
2355                        if (dev->towrite &&
2356                            (test_bit(R5_UPTODATE, &dev->flags) ||
2357                             test_bit(R5_Wantcompute, &dev->flags))) {
2358                                set_bit(R5_Wantdrain, &dev->flags);
2359                                set_bit(R5_LOCKED, &dev->flags);
2360                                clear_bit(R5_UPTODATE, &dev->flags);
2361                                s->locked++;
2362                        }
2363                }
2364        }
2365
2366        /* keep the parity disk(s) locked while asynchronous operations
2367         * are in flight
2368         */
2369        set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
2370        clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2371        s->locked++;
2372
2373        if (level == 6) {
2374                int qd_idx = sh->qd_idx;
2375                struct r5dev *dev = &sh->dev[qd_idx];
2376
2377                set_bit(R5_LOCKED, &dev->flags);
2378                clear_bit(R5_UPTODATE, &dev->flags);
2379                s->locked++;
2380        }
2381
2382        pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
2383                __func__, (unsigned long long)sh->sector,
2384                s->locked, s->ops_request);
2385}
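
/*
 * Worked example (illustrative): on a 4-device RAID5 a full-stripe write
 * takes the rcw branch and locks all three data blocks, so
 * s->locked (3) + conf->max_degraded (1) == disks (4) and the stripe is
 * accounted in conf->pending_full_writes.  A small update typically takes
 * the prexor branch instead, locking only the data block being rewritten
 * plus the parity block.
 */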
2386
2387/*
2388 * Each stripe/dev can have one or more bios attached.
2389 * toread/towrite point to the first in a chain.
2390 * The bi_next chain must be in order.
2391 */
2392static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
2393{
2394        struct bio **bip;
2395        struct r5conf *conf = sh->raid_conf;
2396        int firstwrite=0;
2397
2398        pr_debug("adding bi b#%llu to stripe s#%llu\n",
2399                (unsigned long long)bi->bi_sector,
2400                (unsigned long long)sh->sector);
2401
2402        /*
2403         * Several bios may share a stripe.  The bio bi_phys_segments field
2404         * acts as a reference count to avoid races.  The reference count
2405         * should already have been increased before this function is called
2406         * (for example, in make_request()), so other bios sharing this
2407         * stripe will not free it.  If a stripe is used by only one bio,
2408         * the stripe lock is enough to protect it.
2409         */
2410        spin_lock_irq(&sh->stripe_lock);
2411        if (forwrite) {
2412                bip = &sh->dev[dd_idx].towrite;
2413                if (*bip == NULL)
2414                        firstwrite = 1;
2415        } else
2416                bip = &sh->dev[dd_idx].toread;
2417        while (*bip && (*bip)->bi_sector < bi->bi_sector) {
2418                if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
2419                        goto overlap;
2420                bip = & (*bip)->bi_next;
2421        }
2422        if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
2423                goto overlap;
2424
2425        BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
2426        if (*bip)
2427                bi->bi_next = *bip;
2428        *bip = bi;
2429        raid5_inc_bi_active_stripes(bi);
2430
2431        if (forwrite) {
2432                /* check if page is covered */
2433                sector_t sector = sh->dev[dd_idx].sector;
2434                for (bi=sh->dev[dd_idx].towrite;
2435                     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
2436                             bi && bi->bi_sector <= sector;
2437                     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
2438                        if (bi->bi_sector + (bi->bi_size>>9) >= sector)
2439                                sector = bi->bi_sector + (bi->bi_size>>9);
2440                }
2441                if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2442                        set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
2443        }
2444
2445        pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2446                (unsigned long long)(*bip)->bi_sector,
2447                (unsigned long long)sh->sector, dd_idx);
2448        spin_unlock_irq(&sh->stripe_lock);
2449
2450        if (conf->mddev->bitmap && firstwrite) {
2451                bitmap_startwrite(conf->mddev->bitmap, sh->sector,
2452                                  STRIPE_SECTORS, 0);
2453                sh->bm_seq = conf->seq_flush+1;
2454                set_bit(STRIPE_BIT_DELAY, &sh->state);
2455        }
2456        return 1;
2457
2458 overlap:
2459        set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2460        spin_unlock_irq(&sh->stripe_lock);
2461        return 0;
2462}
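
/*
 * Worked example (illustrative, 4KiB pages so STRIPE_SECTORS = 8): for a
 * block starting at dev->sector = s, two 2KiB write bios at sectors s and
 * s+4 are linked on towrite in bi_sector order; the coverage walk advances
 * 'sector' from s to s+4 to s+8 == dev->sector + STRIPE_SECTORS, so
 * R5_OVERWRITE is set and the block can be written without first reading
 * old data.  A bio overlapping an already queued range fails the add
 * instead: R5_Overlap is set and the caller waits on wait_for_overlap and
 * retries once the conflicting I/O completes.
 */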
2463
2464static void end_reshape(struct r5conf *conf);
2465
2466static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
2467                            struct stripe_head *sh)
2468{
2469        int sectors_per_chunk =
2470                previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
2471        int dd_idx;
2472        int chunk_offset = sector_div(stripe, sectors_per_chunk);
2473        int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
2474
2475        raid5_compute_sector(conf,
2476                             stripe * (disks - conf->max_degraded)
2477                             *sectors_per_chunk + chunk_offset,
2478                             previous,
2479                             &dd_idx, sh);
2480}
2481
2482static void
2483handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2484                                struct stripe_head_state *s, int disks,
2485                                struct bio **return_bi)
2486{
2487        int i;
2488        for (i = disks; i--; ) {
2489                struct bio *bi;
2490                int bitmap_end = 0;
2491
2492                if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2493                        struct md_rdev *rdev;
2494                        rcu_read_lock();
2495                        rdev = rcu_dereference(conf->disks[i].rdev);
2496                        if (rdev && test_bit(In_sync, &rdev->flags))
2497                                atomic_inc(&rdev->nr_pending);
2498                        else
2499                                rdev = NULL;
2500                        rcu_read_unlock();
2501                        if (rdev) {
2502                                if (!rdev_set_badblocks(
2503                                            rdev,
2504                                            sh->sector,
2505                                            STRIPE_SECTORS, 0))
2506                                        md_error(conf->mddev, rdev);
2507                                rdev_dec_pending(rdev, conf->mddev);
2508                        }
2509                }
2510                spin_lock_irq(&sh->stripe_lock);
2511                /* fail all writes first */
2512                bi = sh->dev[i].towrite;
2513                sh->dev[i].towrite = NULL;
2514                spin_unlock_irq(&sh->stripe_lock);
2515                if (bi)
2516                        bitmap_end = 1;
2517
2518                if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2519                        wake_up(&conf->wait_for_overlap);
2520
2521                while (bi && bi->bi_sector <
2522                        sh->dev[i].sector + STRIPE_SECTORS) {
2523                        struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2524                        clear_bit(BIO_UPTODATE, &bi->bi_flags);
2525                        if (!raid5_dec_bi_active_stripes(bi)) {
2526                                md_write_end(conf->mddev);
2527                                bi->bi_next = *return_bi;
2528                                *return_bi = bi;
2529                        }
2530                        bi = nextbi;
2531                }
2532                if (bitmap_end)
2533                        bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2534                                STRIPE_SECTORS, 0, 0);
2535                bitmap_end = 0;
2536                /* and fail all 'written' */
2537                bi = sh->dev[i].written;
2538                sh->dev[i].written = NULL;
2539                if (bi) bitmap_end = 1;
2540                while (bi && bi->bi_sector <
2541                       sh->dev[i].sector + STRIPE_SECTORS) {
2542                        struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2543                        clear_bit(BIO_UPTODATE, &bi->bi_flags);
2544                        if (!raid5_dec_bi_active_stripes(bi)) {
2545                                md_write_end(conf->mddev);
2546                                bi->bi_next = *return_bi;
2547                                *return_bi = bi;
2548                        }
2549                        bi = bi2;
2550                }
2551
2552                /* fail any reads if this device is non-operational and
2553                 * the data has not reached the cache yet.
2554                 */
2555                if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
2556                    (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2557                      test_bit(R5_ReadError, &sh->dev[i].flags))) {
2558                        spin_lock_irq(&sh->stripe_lock);
2559                        bi = sh->dev[i].toread;
2560                        sh->dev[i].toread = NULL;
2561                        spin_unlock_irq(&sh->stripe_lock);
2562                        if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2563                                wake_up(&conf->wait_for_overlap);
2564                        while (bi && bi->bi_sector <
2565                               sh->dev[i].sector + STRIPE_SECTORS) {
2566                                struct bio *nextbi =
2567                                        r5_next_bio(bi, sh->dev[i].sector);
2568                                clear_bit(BIO_UPTODATE, &bi->bi_flags);
2569                                if (!raid5_dec_bi_active_stripes(bi)) {
2570                                        bi->bi_next = *return_bi;
2571                                        *return_bi = bi;
2572                                }
2573                                bi = nextbi;
2574                        }
2575                }
2576                if (bitmap_end)
2577                        bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2578                                        STRIPE_SECTORS, 0, 0);
2579                /* If we were in the middle of a write the parity block might
2580                 * still be locked - so just clear all R5_LOCKED flags
2581                 */
2582                clear_bit(R5_LOCKED, &sh->dev[i].flags);
2583        }
2584
2585        if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2586                if (atomic_dec_and_test(&conf->pending_full_writes))
2587                        md_wakeup_thread(conf->mddev->thread);
2588}
2589
2590static void
2591handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2592                   struct stripe_head_state *s)
2593{
2594        int abort = 0;
2595        int i;
2596
2597        clear_bit(STRIPE_SYNCING, &sh->state);
2598        s->syncing = 0;
2599        s->replacing = 0;
2600        /* There is nothing more to do for sync/check/repair.
2601         * Don't even need to abort as that is handled elsewhere
2602         * if needed, and not always wanted e.g. if there is a known
2603         * bad block here.
2604         * For recover/replace we need to record a bad block on all
2605         * non-sync devices, or abort the recovery
2606         */
2607        if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
2608                /* During recovery devices cannot be removed, so
2609                 * locking and refcounting of rdevs is not needed
2610                 */
2611                for (i = 0; i < conf->raid_disks; i++) {
2612                        struct md_rdev *rdev = conf->disks[i].rdev;
2613                        if (rdev
2614                            && !test_bit(Faulty, &rdev->flags)
2615                            && !test_bit(In_sync, &rdev->flags)
2616                            && !rdev_set_badblocks(rdev, sh->sector,
2617                                                   STRIPE_SECTORS, 0))
2618                                abort = 1;
2619                        rdev = conf->disks[i].replacement;
2620                        if (rdev
2621                            && !test_bit(Faulty, &rdev->flags)
2622                            && !test_bit(In_sync, &rdev->flags)
2623                            && !rdev_set_badblocks(rdev, sh->sector,
2624                                                   STRIPE_SECTORS, 0))
2625                                abort = 1;
2626                }
2627                if (abort)
2628                        conf->recovery_disabled =
2629                                conf->mddev->recovery_disabled;
2630        }
2631        md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
2632}
2633
2634static int want_replace(struct stripe_head *sh, int disk_idx)
2635{
2636        struct md_rdev *rdev;
2637        int rv = 0;
2638        /* Doing recovery so rcu locking not required */
2639        rdev = sh->raid_conf->disks[disk_idx].replacement;
2640        if (rdev
2641            && !test_bit(Faulty, &rdev->flags)
2642            && !test_bit(In_sync, &rdev->flags)
2643            && (rdev->recovery_offset <= sh->sector
2644                || rdev->mddev->recovery_cp <= sh->sector))
2645                rv = 1;
2646
2647        return rv;
2648}
2649
2650/* fetch_block - checks the given member device to see if its data needs
2651 * to be read or computed to satisfy a request.
2652 *
2653 * Returns 1 when no more member devices need to be checked, otherwise returns
2654 * 0 to tell the loop in handle_stripe_fill to continue
2655 */
2656static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
2657                       int disk_idx, int disks)
2658{
2659        struct r5dev *dev = &sh->dev[disk_idx];
2660        struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
2661                                  &sh->dev[s->failed_num[1]] };
2662
2663        /* is the data in this block needed, and can we get it? */
2664        if (!test_bit(R5_LOCKED, &dev->flags) &&
2665            !test_bit(R5_UPTODATE, &dev->flags) &&
2666            (dev->toread ||
2667             (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2668             s->syncing || s->expanding ||
2669             (s->replacing && want_replace(sh, disk_idx)) ||
2670             (s->failed >= 1 && fdev[0]->toread) ||
2671             (s->failed >= 2 && fdev[1]->toread) ||
2672             (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
2673              !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
2674             (sh->raid_conf->level == 6 && s->failed && s->to_write))) {
2675                /* we would like to get this block, possibly by computing it,
2676                 * otherwise read it if the backing disk is insync
2677                 */
2678                BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2679                BUG_ON(test_bit(R5_Wantread, &dev->flags));
2680                if ((s->uptodate == disks - 1) &&
2681                    (s->failed && (disk_idx == s->failed_num[0] ||
2682                                   disk_idx == s->failed_num[1]))) {
2683                        /* the disk has failed and we have been asked to
2684                         * fetch this block, so compute it
2685                         */
2686                        pr_debug("Computing stripe %llu block %d\n",
2687                               (unsigned long long)sh->sector, disk_idx);
2688                        set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2689                        set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2690                        set_bit(R5_Wantcompute, &dev->flags);
2691                        sh->ops.target = disk_idx;
2692                        sh->ops.target2 = -1; /* no 2nd target */
2693                        s->req_compute = 1;
2694                        /* Careful: from this point on 'uptodate' is in the eye
2695                         * of raid_run_ops which services 'compute' operations
2696                         * before writes. R5_Wantcompute flags a block that will
2697                         * be R5_UPTODATE by the time it is needed for a
2698                         * subsequent operation.
2699                         */
2700                        s->uptodate++;
2701                        return 1;
2702                } else if (s->uptodate == disks-2 && s->failed >= 2) {
2703                        /* Computing 2-failure is *very* expensive; only
2704                         * do it if failed >= 2
2705                         */
2706                        int other;
2707                        for (other = disks; other--; ) {
2708                                if (other == disk_idx)
2709                                        continue;
2710                                if (!test_bit(R5_UPTODATE,
2711                                      &sh->dev[other].flags))
2712                                        break;
2713                        }
2714                        BUG_ON(other < 0);
2715                        pr_debug("Computing stripe %llu blocks %d,%d\n",
2716                               (unsigned long long)sh->sector,
2717                               disk_idx, other);
2718                        set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2719                        set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2720                        set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
2721                        set_bit(R5_Wantcompute, &sh->dev[other].flags);
2722                        sh->ops.target = disk_idx;
2723                        sh->ops.target2 = other;
2724                        s->uptodate += 2;
2725                        s->req_compute = 1;
2726                        return 1;
2727                } else if (test_bit(R5_Insync, &dev->flags)) {
2728                        set_bit(R5_LOCKED, &dev->flags);
2729                        set_bit(R5_Wantread, &dev->flags);
2730                        s->locked++;
2731                        pr_debug("Reading block %d (sync=%d)\n",
2732                                disk_idx, s->syncing);
2733                }
2734        }
2735
2736        return 0;
2737}
2738
2739/**
2740 * handle_stripe_fill - read or compute data to satisfy pending requests.
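     * @sh: stripe to operate on
     * @s: stripe state as gathered by analyse_stripe()
     * @disks: number of devices in the stripe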
2741 */
2742static void handle_stripe_fill(struct stripe_head *sh,
2743                               struct stripe_head_state *s,
2744                               int disks)
2745{
2746        int i;
2747
2748        /* look for blocks to read/compute, skip this if a compute
2749         * is already in flight, or if the stripe contents are in the
2750         * midst of changing due to a write
2751         */
2752        if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2753            !sh->reconstruct_state)
2754                for (i = disks; i--; )
2755                        if (fetch_block(sh, s, i, disks))
2756                                break;
2757        set_bit(STRIPE_HANDLE, &sh->state);
2758}
2759
2760
2761/* handle_stripe_clean_event
2762 * any written block on an uptodate or failed drive can be returned.
2763 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2764 * never LOCKED, so we don't need to test 'failed' directly.
2765 */
2766static void handle_stripe_clean_event(struct r5conf *conf,
2767        struct stripe_head *sh, int disks, struct bio **return_bi)
2768{
2769        int i;
2770        struct r5dev *dev;
2771
2772        for (i = disks; i--; )
2773                if (sh->dev[i].written) {
2774                        dev = &sh->dev[i];
2775                        if (!test_bit(R5_LOCKED, &dev->flags) &&
2776                            (test_bit(R5_UPTODATE, &dev->flags) ||
2777                             test_bit(R5_Discard, &dev->flags))) {
2778                                /* We can return any write requests */
2779                                struct bio *wbi, *wbi2;
2780                                pr_debug("Return write for disc %d\n", i);
2781                                if (test_and_clear_bit(R5_Discard, &dev->flags))
2782                                        clear_bit(R5_UPTODATE, &dev->flags);
2783                                wbi = dev->written;
2784                                dev->written = NULL;
2785                                while (wbi && wbi->bi_sector <
2786                                        dev->sector + STRIPE_SECTORS) {
2787                                        wbi2 = r5_next_bio(wbi, dev->sector);
2788                                        if (!raid5_dec_bi_active_stripes(wbi)) {
2789                                                md_write_end(conf->mddev);
2790                                                wbi->bi_next = *return_bi;
2791                                                *return_bi = wbi;
2792                                        }
2793                                        wbi = wbi2;
2794                                }
2795                                bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2796                                                STRIPE_SECTORS,
2797                                         !test_bit(STRIPE_DEGRADED, &sh->state),
2798                                                0);
2799                        }
2800                } else if (test_bit(R5_Discard, &sh->dev[i].flags))
2801                        clear_bit(R5_Discard, &sh->dev[i].flags);
2802
2803        if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2804                if (atomic_dec_and_test(&conf->pending_full_writes))
2805                        md_wakeup_thread(conf->mddev->thread);
2806}
2807
2808static void handle_stripe_dirtying(struct r5conf *conf,
2809                                   struct stripe_head *sh,
2810                                   struct stripe_head_state *s,
2811                                   int disks)
2812{
2813        int rmw = 0, rcw = 0, i;
2814        sector_t recovery_cp = conf->mddev->recovery_cp;
2815
2816        /* RAID6 requires 'rcw' in the current implementation.
2817         * Otherwise, check whether a resync is now happening or should start.
2818         * If so, then the array is dirty (after an unclean shutdown or
2819         * initial creation), so parity in some stripes might be inconsistent.
2820         * In this case, we need to always do reconstruct-write, to ensure
2821         * that in case of drive failure or read-error correction, we
2822         * generate correct data from the parity.
2823         */
2824        if (conf->max_degraded == 2 ||
2825            (recovery_cp < MaxSector && sh->sector >= recovery_cp)) {
2826                /* Calculate the real rcw later - for now make it
2827                 * look like rcw is cheaper
2828                 */
2829                rcw = 1; rmw = 2;
2830                pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
2831                         conf->max_degraded, (unsigned long long)recovery_cp,
2832                         (unsigned long long)sh->sector);
2833        } else for (i = disks; i--; ) {
2834                /* would I have to read this buffer for read_modify_write */
2835                struct r5dev *dev = &sh->dev[i];
2836                if ((dev->towrite || i == sh->pd_idx) &&
2837                    !test_bit(R5_LOCKED, &dev->flags) &&
2838                    !(test_bit(R5_UPTODATE, &dev->flags) ||
2839                      test_bit(R5_Wantcompute, &dev->flags))) {
2840                        if (test_bit(R5_Insync, &dev->flags))
2841                                rmw++;
2842                        else
2843                                rmw += 2*disks;  /* cannot read it */
2844                }
2845                /* Would I have to read this buffer for reconstruct_write */
2846                if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
2847                    !test_bit(R5_LOCKED, &dev->flags) &&
2848                    !(test_bit(R5_UPTODATE, &dev->flags) ||
2849                    test_bit(R5_Wantcompute, &dev->flags))) {
2850                        if (test_bit(R5_Insync, &dev->flags)) rcw++;
2851                        else
2852                                rcw += 2*disks;
2853                }
2854        }
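            /* For example, on a 5-device RAID5 (all in-sync, nothing
             * cached) with one data block fully overwritten, rmw needs the
             * old data block plus parity (rmw == 2) while rcw needs the
             * three untouched data blocks (rcw == 3), so read-modify-write
             * is preferred; with three of the four data blocks overwritten
             * the costs become rmw == 4 vs rcw == 1 and reconstruct-write
             * wins.
             */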
2855        pr_debug("for sector %llu, rmw=%d rcw=%d\n",
2856                (unsigned long long)sh->sector, rmw, rcw);
2857        set_bit(STRIPE_HANDLE, &sh->state);
2858        if (rmw < rcw && rmw > 0)
2859                /* prefer read-modify-write, but need to get some data */
2860                for (i = disks; i--; ) {
2861                        struct r5dev *dev = &sh->dev[i];
2862                        if ((dev->towrite || i == sh->pd_idx) &&
2863                            !test_bit(R5_LOCKED, &dev->flags) &&
2864                            !(test_bit(R5_UPTODATE, &dev->flags) ||
2865                            test_bit(R5_Wantcompute, &dev->flags)) &&
2866                            test_bit(R5_Insync, &dev->flags)) {
2867                                if (
2868                                  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2869                                        pr_debug("Read_old block "
2870                                                "%d for r-m-w\n", i);
2871                                        set_bit(R5_LOCKED, &dev->flags);
2872                                        set_bit(R5_Wantread, &dev->flags);
2873                                        s->locked++;
2874                                } else {
2875                                        set_bit(STRIPE_DELAYED, &sh->state);
2876                                        set_bit(STRIPE_HANDLE, &sh->state);
2877                                }
2878                        }
2879                }
2880        if (rcw <= rmw && rcw > 0) {
2881                /* want reconstruct write, but need to get some data */
2882                rcw = 0;
2883                for (i = disks; i--; ) {
2884                        struct r5dev *dev = &sh->dev[i];
2885                        if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2886                            i != sh->pd_idx && i != sh->qd_idx &&
2887                            !test_bit(R5_LOCKED, &dev->flags) &&
2888                            !(test_bit(R5_UPTODATE, &dev->flags) ||
2889                              test_bit(R5_Wantcompute, &dev->flags))) {
2890                                rcw++;
2891                                if (!test_bit(R5_Insync, &dev->flags))
2892                                        continue; /* it's a failed drive */
2893                                if (
2894                                  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2895                                        pr_debug("Read_old block "
2896                                                "%d for Reconstruct\n", i);
2897                                        set_bit(R5_LOCKED, &dev->flags);
2898                                        set_bit(R5_Wantread, &dev->flags);
2899                                        s->locked++;
2900                                } else {
2901                                        set_bit(STRIPE_DELAYED, &sh->state);
2902                                        set_bit(STRIPE_HANDLE, &sh->state);
2903                                }
2904                        }
2905                }
2906        }
2907        /* now if nothing is locked, and if we have enough data,
2908         * we can start a write request
2909         */
2910        /* since handle_stripe can be called at any time we need to handle the
2911         * case where a compute block operation has been submitted and then a
2912         * subsequent call wants to start a write request.  raid_run_ops only
2913         * handles the case where compute block and reconstruct are requested
2914         * simultaneously.  If this is not the case then new writes need to be
2915         * held off until the compute completes.
2916         */
2917        if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2918            (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2919            !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2920                schedule_reconstruction(sh, s, rcw == 0, 0);
2921}
2922
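    /* handle_parity_checks5 - check/repair state machine for RAID4/5.
     * With no failed devices we start an xor zero-sum check of the parity
     * block; once the result is in we either mark the stripe in-sync or,
     * unless this is a read-only 'check' pass, count the mismatch,
     * recompute the parity and write it back.  With a failed device the
     * freshly computed replacement block is simply written out.
     */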
2923static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
2924                                struct stripe_head_state *s, int disks)
2925{
2926        struct r5dev *dev = NULL;
2927
2928        set_bit(STRIPE_HANDLE, &sh->state);
2929
2930        switch (sh->check_state) {
2931        case check_state_idle:
2932                /* start a new check operation if there are no failures */
2933                if (s->failed == 0) {
2934                        BUG_ON(s->uptodate != disks);
2935                        sh->check_state = check_state_run;
2936                        set_bit(STRIPE_OP_CHECK, &s->ops_request);
2937                        clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2938                        s->uptodate--;
2939                        break;
2940                }
2941                dev = &sh->dev[s->failed_num[0]];
2942                /* fall through */
2943        case check_state_compute_result:
2944                sh->check_state = check_state_idle;
2945                if (!dev)
2946                        dev = &sh->dev[sh->pd_idx];
2947
2948                /* check that a write has not made the stripe insync */
2949                if (test_bit(STRIPE_INSYNC, &sh->state))
2950                        break;
2951
2952                /* either failed parity check, or recovery is happening */
2953                BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
2954                BUG_ON(s->uptodate != disks);
2955
2956                set_bit(R5_LOCKED, &dev->flags);
2957                s->locked++;
2958                set_bit(R5_Wantwrite, &dev->flags);
2959
2960                clear_bit(STRIPE_DEGRADED, &sh->state);
2961                set_bit(STRIPE_INSYNC, &sh->state);
2962                break;
2963        case check_state_run:
2964                break; /* we will be called again upon completion */
2965        case check_state_check_result:
2966                sh->check_state = check_state_idle;
2967
2968                /* if a failure occurred during the check operation, leave
2969                 * STRIPE_INSYNC not set and let the stripe be handled again
2970                 */
2971                if (s->failed)
2972                        break;
2973
2974                /* handle a successful check operation, if parity is correct
2975                 * we are done.  Otherwise update the mismatch count and repair
2976                 * parity if !MD_RECOVERY_CHECK
2977                 */
2978                if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
2979                        /* parity is correct (on disc,
2980                         * not in buffer any more)
2981                         */
2982                        set_bit(STRIPE_INSYNC, &sh->state);
2983                else {
2984                        atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
2985                        if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2986                                /* don't try to repair!! */
2987                                set_bit(STRIPE_INSYNC, &sh->state);
2988                        else {
2989                                sh->check_state = check_state_compute_run;
2990                                set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2991                                set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2992                                set_bit(R5_Wantcompute,
2993                                        &sh->dev[sh->pd_idx].flags);
2994                                sh->ops.target = sh->pd_idx;
2995                                sh->ops.target2 = -1;
2996                                s->uptodate++;
2997                        }
2998                }
2999                break;
3000        case check_state_compute_run:
3001                break;
3002        default:
3003                printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
3004                       __func__, sh->check_state,
3005                       (unsigned long long) sh->sector);
3006                BUG();
3007        }
3008}
3009
3010
3011static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
3012                                  struct stripe_head_state *s,
3013                                  int disks)
3014{
3015        int pd_idx = sh->pd_idx;
3016        int qd_idx = sh->qd_idx;
3017        struct r5dev *dev;
3018
3019        set_bit(STRIPE_HANDLE, &sh->state);
3020
3021        BUG_ON(s->failed > 2);
3022
3023        /* Want to check and possibly repair P and Q.
3024         * However there could be one 'failed' device, in which
3025         * case we can only check one of them, possibly using the
3026         * other to generate missing data
3027         */
3028
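            /* This runs as a small state machine.  From idle we start a
             * zero-sum check of P, Q or both (check_state_run/_run_q/
             * _run_pq) and are called back in check_state_check_result;
             * with two failed devices we fall straight through to
             * check_state_compute_result instead.  If a parity block turns
             * out to be bad and repair is allowed, it is recomputed via
             * check_state_compute_run and written out from
             * check_state_compute_result.
             */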
3029        switch (sh->check_state) {
3030        case check_state_idle:
3031                /* start a new check operation if there are < 2 failures */
3032                if (s->failed == s->q_failed) {
3033                        /* The only possible failed device holds Q, so it
3034                         * makes sense to check P (if anything else had failed,
3035                         * we would have used P to recreate it).
3036                         */
3037                        sh->check_state = check_state_run;
3038                }
3039                if (!s->q_failed && s->failed < 2) {
3040                        /* Q is not failed, and we didn't use it to generate
3041                         * anything, so it makes sense to check it
3042                         */
3043                        if (sh->check_state == check_state_run)
3044                                sh->check_state = check_state_run_pq;
3045                        else
3046                                sh->check_state = check_state_run_q;
3047                }
3048
3049                /* discard potentially stale zero_sum_result */
3050                sh->ops.zero_sum_result = 0;
3051
3052                if (sh->check_state == check_state_run) {
3053                        /* async_xor_zero_sum destroys the contents of P */
3054                        clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3055                        s->uptodate--;
3056                }
3057                if (sh->check_state >= check_state_run &&
3058                    sh->check_state <= check_state_run_pq) {
3059                        /* async_syndrome_zero_sum preserves P and Q, so
3060                         * no need to mark them !uptodate here
3061                         */
3062                        set_bit(STRIPE_OP_CHECK, &s->ops_request);
3063                        break;
3064                }
3065
3066                /* we have 2-disk failure */
3067                BUG_ON(s->failed != 2);
3068                /* fall through */
3069        case check_state_compute_result:
3070                sh->check_state = check_state_idle;
3071
3072                /* check that a write has not made the stripe insync */
3073                if (test_bit(STRIPE_INSYNC, &sh->state))
3074                        break;
3075
3076                /* now write out any block on a failed drive,
3077                 * or P or Q if they were recomputed
3078                 */
3079                BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
3080                if (s->failed == 2) {
3081                        dev = &sh->dev[s->failed_num[1]];
3082                        s->locked++;
3083                        set_bit(R5_LOCKED, &dev->flags);
3084                        set_bit(R5_Wantwrite, &dev->flags);
3085                }
3086                if (s->failed >= 1) {
3087                        dev = &sh->dev[s->failed_num[0]];
3088                        s->locked++;
3089                        set_bit(R5_LOCKED, &dev->flags);
3090                        set_bit(R5_Wantwrite, &dev->flags);
3091                }
3092                if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
3093                        dev = &sh->dev[pd_idx];
3094                        s->locked++;
3095                        set_bit(R5_LOCKED, &dev->flags);
3096                        set_bit(R5_Wantwrite, &dev->flags);
3097                }
3098                if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
3099                        dev = &sh->dev[qd_idx];
3100                        s->locked++;
3101                        set_bit(R5_LOCKED, &dev->flags);
3102                        set_bit(R5_Wantwrite, &dev->flags);
3103                }
3104                clear_bit(STRIPE_DEGRADED, &sh->state);
3105
3106                set_bit(STRIPE_INSYNC, &sh->state);
3107                break;
3108        case check_state_run:
3109        case check_state_run_q:
3110        case check_state_run_pq:
3111                break; /* we will be called again upon completion */
3112        case check_state_check_result:
3113                sh->check_state = check_state_idle;
3114
3115                /* handle a successful check operation, if parity is correct
3116                 * we are done.  Otherwise update the mismatch count and repair
3117                 * parity if !MD_RECOVERY_CHECK
3118                 */
3119                if (sh->ops.zero_sum_result == 0) {
3120                        /* both parities are correct */
3121                        if (!s->failed)
3122                                set_bit(STRIPE_INSYNC, &sh->state);
3123                        else {
3124                                /* in contrast to the raid5 case we can validate
3125                                 * parity, but still have a failure to write
3126                                 * back
3127                                 */
3128                                sh->check_state = check_state_compute_result;
3129                                /* Returning at this point means that we may go
3130                                 * off and bring p and/or q uptodate again, so
3131                                 * we make sure to check zero_sum_result again
3132                                 * to verify if p or q need writeback
3133                                 */
3134                        }
3135                } else {
3136                        atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
3137                        if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
3138                                /* don't try to repair!! */
3139                                set_bit(STRIPE_INSYNC, &sh->state);
3140                        else {
3141                                int *target = &sh->ops.target;
3142
3143                                sh->ops.target = -1;
3144                                sh->ops.target2 = -1;
3145                                sh->check_state = check_state_compute_run;
3146                                set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3147                                set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3148                                if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
3149                                        set_bit(R5_Wantcompute,
3150                                                &sh->dev[pd_idx].flags);
3151                                        *target = pd_idx;
3152                                        target = &sh->ops.target2;
3153                                        s->uptodate++;
3154                                }
3155                                if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
3156                                        set_bit(R5_Wantcompute,
3157                                                &sh->dev[qd_idx].flags);
3158                                        *target = qd_idx;
3159                                        s->uptodate++;
3160                                }
3161                        }
3162                }
3163                break;
3164        case check_state_compute_run:
3165                break;
3166        default:
3167                printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
3168                       __func__, sh->check_state,
3169                       (unsigned long long) sh->sector);
3170                BUG();
3171        }
3172}
3173
3174static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
3175{
3176        int i;
3177
3178        /* We have read all the blocks in this stripe and now we need to
3179         * copy some of them into a target stripe for expand.
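             * The array address of each source block comes from
             * compute_blocknr(), and raid5_compute_sector() re-maps it
             * under the new geometry to find the target stripe and the
             * device slot (dd_idx) within it.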
3180         */
3181        struct dma_async_tx_descriptor *tx = NULL;
3182        clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3183        for (i = 0; i < sh->disks; i++)
3184                if (i != sh->pd_idx && i != sh->qd_idx) {
3185                        int dd_idx, j;
3186                        struct stripe_head *sh2;
3187                        struct async_submit_ctl submit;
3188
3189                        sector_t bn = compute_blocknr(sh, i, 1);
3190                        sector_t s = raid5_compute_sector(conf, bn, 0,
3191                                                          &dd_idx, NULL);
3192                        sh2 = get_active_stripe(conf, s, 0, 1, 1);
3193                        if (sh2 == NULL)
3194                                /* so far only the early blocks of this stripe
3195                                 * have been requested.  When later blocks
3196                                 * get requested, we will try again
3197                                 */
3198                                continue;
3199                        if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
3200                           test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
3201                                /* must have already done this block */
3202                                release_stripe(sh2);
3203                                continue;
3204                        }
3205
3206                        /* place all the copies on one channel */
3207                        init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
3208                        tx = async_memcpy(sh2->dev[dd_idx].page,
3209                                          sh->dev[i].page, 0, 0, STRIPE_SIZE,
3210                                          &submit);
3211
3212                        set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
3213                        set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
3214                        for (j = 0; j < conf->raid_disks; j++)
3215                                if (j != sh2->pd_idx &&
3216                                    j != sh2->qd_idx &&
3217                                    !test_bit(R5_Expanded, &sh2->dev[j].flags))
3218                                        break;
3219                        if (j == conf->raid_disks) {
3220                                set_bit(STRIPE_EXPAND_READY, &sh2->state);
3221                                set_bit(STRIPE_HANDLE, &sh2->state);
3222                        }
3223                        release_stripe(sh2);
3224
3225                }
3226        /* done submitting copies, wait for them to complete */
3227        if (tx) {
3228                async_tx_ack(tx);
3229                dma_wait_for_async_tx(tx);
3230        }
3231}
3232
3233/*
3234 * handle_stripe - examine a stripe and do whatever work it currently needs.
3235 *
3236 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
3237 * state of various bits to see what needs to be done.
3238 * Possible results:
3239 *    return some read requests which now have data
3240 *    return some write requests which are safely on storage
3241 *    schedule a read on some buffers
3242 *    schedule a write of some buffers
3243 *    return confirmation of parity correctness
3244 *
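     * analyse_stripe() first gathers the per-device state into a
     * stripe_head_state; the body of handle_stripe() then acts on that
     * state, and any queued work is finally submitted through
     * raid_run_ops() and ops_run_io().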
3245 */
3246
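    /* analyse_stripe - walk each device in the stripe and fill in *s:
     * counts of locked/uptodate/failed devices, pending reads and writes,
     * any rdev that is currently blocking progress, and whether the
     * stripe is being synced or having a device replaced.
     */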
3247static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3248{
3249        struct r5conf *conf = sh->raid_conf;
3250        int disks = sh->disks;
3251        struct r5dev *dev;
3252        int i;
3253        int do_recovery = 0;
3254
3255        memset(s, 0, sizeof(*s));
3256
3257        s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3258        s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3259        s->failed_num[0] = -1;
3260        s->failed_num[1] = -1;
3261
3262        /* Now to look around and see what can be done */
3263        rcu_read_lock();
3264        for (i=disks; i--; ) {
3265                struct md_rdev *rdev;
3266                sector_t first_bad;
3267                int bad_sectors;
3268                int is_bad = 0;
3269
3270                dev = &sh->dev[i];
3271
3272                pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3273                         i, dev->flags,
3274                         dev->toread, dev->towrite, dev->written);
3275                /* maybe we can reply to a read
3276                 *
3277                 * new wantfill requests are only permitted while
3278                 * ops_complete_biofill is guaranteed to be inactive
3279                 */
3280                if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
3281                    !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
3282                        set_bit(R5_Wantfill, &dev->flags);
3283
3284                /* now count some things */
3285                if (test_bit(R5_LOCKED, &dev->flags))
3286                        s->locked++;
3287                if (test_bit(R5_UPTODATE, &dev->flags))
3288                        s->uptodate++;
3289                if (test_bit(R5_Wantcompute, &dev->flags)) {
3290                        s->compute++;
3291                        BUG_ON(s->compute > 2);
3292                }
3293
3294                if (test_bit(R5_Wantfill, &dev->flags))
3295                        s->to_fill++;
3296                else if (dev->toread)
3297                        s->to_read++;
3298                if (dev->towrite) {
3299                        s->to_write++;
3300                        if (!test_bit(R5_OVERWRITE, &dev->flags))
3301                                s->non_overwrite++;
3302                }
3303                if (dev->written)
3304                        s->written++;
3305                /* Prefer to use the replacement for reads, but only
3306                 * if it is recovered enough and has no bad blocks.
3307                 */
3308                rdev = rcu_dereference(conf->disks[i].replacement);
3309                if (rdev && !test_bit(Faulty, &rdev->flags) &&
3310                    rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
3311                    !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3312                                 &first_bad, &bad_sectors))
3313                        set_bit(R5_ReadRepl, &dev->flags);
3314                else {
3315                        if (rdev)
3316                                set_bit(R5_NeedReplace, &dev->flags);
3317                        rdev = rcu_dereference(conf->disks[i].rdev);
3318                        clear_bit(R5_ReadRepl, &dev->flags);
3319                }
3320                if (rdev && test_bit(Faulty, &rdev->flags))
3321                        rdev = NULL;
3322                if (rdev) {
3323                        is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3324                                             &first_bad, &bad_sectors);
3325                        if (s->blocked_rdev == NULL
3326                            && (test_bit(Blocked, &rdev->flags)
3327                                || is_bad < 0)) {
3328                                if (is_bad < 0)
3329                                        set_bit(BlockedBadBlocks,
3330                                                &rdev->flags);
3331                                s->blocked_rdev = rdev;
3332                                atomic_inc(&rdev->nr_pending);
3333                        }
3334                }
3335                clear_bit(R5_Insync, &dev->flags);
3336                if (!rdev)
3337                        /* Not in-sync */;
3338                else if (is_bad) {
3339                        /* also not in-sync */
3340                        if (!test_bit(WriteErrorSeen, &rdev->flags) &&
3341                            test_bit(R5_UPTODATE, &dev->flags)) {
3342                                /* treat as in-sync, but with a read error
3343                                 * which we can now try to correct
3344                                 */
3345                                set_bit(R5_Insync, &dev->flags);
3346                                set_bit(R5_ReadError, &dev->flags);
3347                        }
3348                } else if (test_bit(In_sync, &rdev->flags))
3349                        set_bit(R5_Insync, &dev->flags);
3350                else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
3351                        /* in sync if before recovery_offset */
3352                        set_bit(R5_Insync, &dev->flags);
3353                else if (test_bit(R5_UPTODATE, &dev->flags) &&
3354                         test_bit(R5_Expanded, &dev->flags))
3355                        /* If we've reshaped into here, we assume it is Insync.
3356                         * We will shortly update recovery_offset to make
3357                         * it official.
3358                         */
3359                        set_bit(R5_Insync, &dev->flags);
3360
3361                if (rdev && test_bit(R5_WriteError, &dev->flags)) {
3362                        /* This flag does not apply to '.replacement',
3363                         * only to '.rdev', so make sure to check that */
3364                        struct md_rdev *rdev2 = rcu_dereference(
3365                                conf->disks[i].rdev);
3366                        if (rdev2 == rdev)
3367                                clear_bit(R5_Insync, &dev->flags);
3368                        if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3369                                s->handle_bad_blocks = 1;
3370                                atomic_inc(&rdev2->nr_pending);
3371                        } else
3372                                clear_bit(R5_WriteError, &dev->flags);
3373                }
3374                if (rdev && test_bit(R5_MadeGood, &dev->flags)) {
3375                        /* This flag does not apply to '.replacement',
3376                         * only to '.rdev', so make sure to check that */
3377                        struct md_rdev *rdev2 = rcu_dereference(
3378                                conf->disks[i].rdev);
3379                        if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3380                                s->handle_bad_blocks = 1;
3381                                atomic_inc(&rdev2->nr_pending);
3382                        } else
3383                                clear_bit(R5_MadeGood, &dev->flags);
3384                }
3385                if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
3386                        struct md_rdev *rdev2 = rcu_dereference(
3387                                conf->disks[i].replacement);
3388                        if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3389                                s->handle_bad_blocks = 1;
3390                                atomic_inc(&rdev2->nr_pending);
3391                        } else
3392                                clear_bit(R5_MadeGoodRepl, &dev->flags);
3393                }
3394                if (!test_bit(R5_Insync, &dev->flags)) {
3395                        /* The ReadError flag will just be confusing now */
3396                        clear_bit(R5_ReadError, &dev->flags);
3397                        clear_bit(R5_ReWrite, &dev->flags);
3398                }
3399                if (test_bit(R5_ReadError, &dev->flags))
3400                        clear_bit(R5_Insync, &dev->flags);
3401                if (!test_bit(R5_Insync, &dev->flags)) {
3402                        if (s->failed < 2)
3403                                s->failed_num[s->failed] = i;
3404                        s->failed++;
3405                        if (rdev && !test_bit(Faulty, &rdev->flags))
3406                                do_recovery = 1;
3407                }
3408        }
3409        if (test_bit(STRIPE_SYNCING, &sh->state)) {
3410                /* If there is a failed device being replaced,
3411                 *     we must be recovering.
3412                 * else if we are after recovery_cp, we must be syncing
3413                 * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
3414                 * else we can only be replacing
3415                 * sync and recovery both need to read all devices, and so
3416                 * use the same flag.
3417                 */
3418                if (do_recovery ||
3419                    sh->sector >= conf->mddev->recovery_cp ||
3420                    test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
3421                        s->syncing = 1;
3422                else
3423                        s->replacing = 1;
3424        }
3425        rcu_read_unlock();
3426}
3427
3428static void handle_stripe(struct stripe_head *sh)
3429{
3430        struct stripe_head_state s;
3431        struct r5conf *conf = sh->raid_conf;
3432        int i;
3433        int prexor;
3434        int disks = sh->disks;
3435        struct r5dev *pdev, *qdev;
3436
3437        clear_bit(STRIPE_HANDLE, &sh->state);
3438        if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
3439                /* already being handled, ensure it gets handled
3440                 * again when current action finishes */
3441                set_bit(STRIPE_HANDLE, &sh->state);
3442                return;
3443        }
3444
3445        if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
3446                set_bit(STRIPE_SYNCING, &sh->state);
3447                clear_bit(STRIPE_INSYNC, &sh->state);
3448        }
3449        clear_bit(STRIPE_DELAYED, &sh->state);
3450
3451        pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3452                "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
3453               (unsigned long long)sh->sector, sh->state,
3454               atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
3455               sh->check_state, sh->reconstruct_state);
3456
3457        analyse_stripe(sh, &s);
3458
3459        if (s.handle_bad_blocks) {
3460                set_bit(STRIPE_HANDLE, &sh->state);
3461                goto finish;
3462        }
3463
3464        if (unlikely(s.blocked_rdev)) {
3465                if (s.syncing || s.expanding || s.expanded ||
3466                    s.replacing || s.to_write || s.written) {
3467                        set_bit(STRIPE_HANDLE, &sh->state);
3468                        goto finish;
3469                }
3470                /* There is nothing for the blocked_rdev to block */
3471                rdev_dec_pending(s.blocked_rdev, conf->mddev);
3472                s.blocked_rdev = NULL;
3473        }
3474
3475        if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
3476                set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
3477                set_bit(STRIPE_BIOFILL_RUN, &sh->state);
3478        }
3479
3480        pr_debug("locked=%d uptodate=%d to_read=%d"
3481               " to_write=%d failed=%d failed_num=%d,%d\n",
3482               s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
3483               s.failed_num[0], s.failed_num[1]);
3484        /* check if the array has lost more than max_degraded devices and,
3485         * if so, some requests might need to be failed.
3486         */
3487        if (s.failed > conf->max_degraded) {
3488                sh->check_state = 0;
3489                sh->reconstruct_state = 0;
3490                if (s.to_read+s.to_write+s.written)
3491                        handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
3492                if (s.syncing + s.replacing)
3493                        handle_failed_sync(conf, sh, &s);
3494        }
3495
3496        /* Now we check to see if any write operations have recently
3497         * completed
3498         */
3499        prexor = 0;
3500        if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
3501                prexor = 1;
3502        if (sh->reconstruct_state == reconstruct_state_drain_result ||
3503            sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
3504                sh->reconstruct_state = reconstruct_state_idle;
3505
3506                /* All the 'written' buffers and the parity block are ready to
3507                 * be written back to disk
3508                 */
3509                BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
3510                       !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
3511                BUG_ON(sh->qd_idx >= 0 &&
3512                       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
3513                       !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
3514                for (i = disks; i--; ) {
3515                        struct r5dev *dev = &sh->dev[i];
3516                        if (test_bit(R5_LOCKED, &dev->flags) &&
3517                                (i == sh->pd_idx || i == sh->qd_idx ||
3518                                 dev->written)) {
3519                                pr_debug("Writing block %d\n", i);
3520                                set_bit(R5_Wantwrite, &dev->flags);
3521                                if (prexor)
3522                                        continue;
3523                                if (!test_bit(R5_Insync, &dev->flags) ||
3524                                    ((i == sh->pd_idx || i == sh->qd_idx)  &&
3525                                     s.failed == 0))
3526                                        set_bit(STRIPE_INSYNC, &sh->state);
3527                        }
3528                }
3529                if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3530                        s.dec_preread_active = 1;
3531        }
3532
3533        /*
3534         * might be able to return some write requests if the parity blocks
3535         * are safe, or on a failed drive
3536         */
3537        pdev = &sh->dev[sh->pd_idx];
3538        s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
3539                || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
3540        qdev = &sh->dev[sh->qd_idx];
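            /* RAID4/5 has no Q block, so treat Q as 'failed' and let the
             * P-block checks alone decide whether writes can be returned.
             */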
3541        s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
3542                || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
3543                || conf->level < 6;
3544
3545        if (s.written &&
3546            (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
3547                             && !test_bit(R5_LOCKED, &pdev->flags)
3548                             && (test_bit(R5_UPTODATE, &pdev->flags) ||
3549                                 test_bit(R5_Discard, &pdev->flags))))) &&
3550            (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3551                             && !test_bit(R5_LOCKED, &qdev->flags)
3552                             && (test_bit(R5_UPTODATE, &qdev->flags) ||
3553                                 test_bit(R5_Discard, &qdev->flags))))))
3554                handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
3555
3556        /* Now we might consider reading some blocks, either to check/generate
3557         * parity, or to satisfy requests
3558         * or to load a block that is being partially written.
3559         */
3560        if (s.to_read || s.non_overwrite
3561            || (conf->level == 6 && s.to_write && s.failed)
3562            || (s.syncing && (s.uptodate + s.compute < disks))
3563            || s.replacing
3564            || s.expanding)
3565                handle_stripe_fill(sh, &s, disks);
3566
3567        /* Now to consider new write requests and what else, if anything
3568         * should be read.  We do not handle new writes when:
3569         * 1/ A 'write' operation (copy+xor) is already in flight.
3570         * 2/ A 'check' operation is in flight, as it may clobber the parity
3571         *    block.
3572         */
3573        if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3574                handle_stripe_dirtying(conf, sh, &s, disks);
3575
3576        /* maybe we need to check and possibly fix the parity for this stripe
3577         * Any reads will already have been scheduled, so we just see if enough
3578         * data is available.  The parity check is held off while parity
3579         * dependent operations are in flight.
3580         */
3581        if (sh->check_state ||
3582            (s.syncing && s.locked == 0 &&
3583             !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3584             !test_bit(STRIPE_INSYNC, &sh->state))) {
3585                if (conf->level == 6)
3586                        handle_parity_checks6(conf, sh, &s, disks);
3587                else
3588                        handle_parity_checks5(conf, sh, &s, disks);
3589        }
3590
3591        if (s.replacing && s.locked == 0
3592            && !test_bit(STRIPE_INSYNC, &sh->state)) {
3593                /* Write out to replacement devices where possible */
3594                for (i = 0; i < conf->raid_disks; i++)
3595                        if (test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
3596                            test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
3597                                set_bit(R5_WantReplace, &sh->dev[i].flags);
3598                                set_bit(R5_LOCKED, &sh->dev[i].flags);
3599                                s.locked++;
3600                        }
3601                set_bit(STRIPE_INSYNC, &sh->state);
3602        }
3603        if ((s.syncing || s.replacing) && s.locked == 0 &&
3604            test_bit(STRIPE_INSYNC, &sh->state)) {
3605                md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3606                clear_bit(STRIPE_SYNCING, &sh->state);
3607        }
3608
3609        /* If the failed drives have only suffered a ReadError, we might need
3610         * to make progress on the repair/check process
3611         */
3612        if (s.failed <= conf->max_degraded && !conf->mddev->ro)
3613                for (i = 0; i < s.failed; i++) {
3614                        struct r5dev *dev = &sh->dev[s.failed_num[i]];
3615                        if (test_bit(R5_ReadError, &dev->flags)
3616                            && !test_bit(R5_LOCKED, &dev->flags)
3617                            && test_bit(R5_UPTODATE, &dev->flags)
3618                                ) {
3619                                if (!test_bit(R5_ReWrite, &dev->flags)) {
3620                                        set_bit(R5_Wantwrite, &dev->flags);
3621                                        set_bit(R5_ReWrite, &dev->flags);
3622                                        set_bit(R5_LOCKED, &dev->flags);
3623                                        s.locked++;
3624                                } else {
3625                                        /* let's read it back */
3626                                        set_bit(R5_Wantread, &dev->flags);
3627                                        set_bit(R5_LOCKED, &dev->flags);
3628                                        s.locked++;
3629                                }
3630                        }
3631                }
3632
3633
3634        /* Finish reconstruct operations initiated by the expansion process */
3635        if (sh->reconstruct_state == reconstruct_state_result) {
3636                struct stripe_head *sh_src
3637                        = get_active_stripe(conf, sh->sector, 1, 1, 1);
3638                if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
3639                        /* sh cannot be written until sh_src has been read,
3640                         * so arrange for sh to be delayed a little
3641                         */
3642                        set_bit(STRIPE_DELAYED, &sh->state);
3643                        set_bit(STRIPE_HANDLE, &sh->state);
3644                        if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3645                                              &sh_src->state))
3646                                atomic_inc(&conf->preread_active_stripes);
3647                        release_stripe(sh_src);
3648                        goto finish;
3649                }
3650                if (sh_src)
3651                        release_stripe(sh_src);
3652
3653                sh->reconstruct_state = reconstruct_state_idle;
3654                clear_bit(STRIPE_EXPANDING, &sh->state);
3655                for (i = conf->raid_disks; i--; ) {
3656                        set_bit(R5_Wantwrite, &sh->dev[i].flags);
3657                        set_bit(R5_LOCKED, &sh->dev[i].flags);
3658                        s.locked++;
3659                }
3660        }
3661
3662        if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3663            !sh->reconstruct_state) {
3664                /* Need to write out all blocks after computing parity */
3665                sh->disks = conf->raid_disks;
3666                stripe_set_idx(sh->sector, conf, 0, sh);
3667                schedule_reconstruction(sh, &s, 1, 1);
3668        } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3669                clear_bit(STRIPE_EXPAND_READY, &sh->state);
3670                atomic_dec(&conf->reshape_stripes);
3671                wake_up(&conf->wait_for_overlap);
3672                md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3673        }
3674
3675        if (s.expanding && s.locked == 0 &&
3676            !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3677                handle_stripe_expansion(conf, sh);
3678
3679finish:
3680        /* wait for this device to become unblocked */
3681        if (unlikely(s.blocked_rdev)) {
3682                if (conf->mddev->external)
3683                        md_wait_for_blocked_rdev(s.blocked_rdev,
3684                                                 conf->mddev);
3685                else
3686                        /* Internal metadata will immediately
3687                         * be written by raid5d, so we don't
3688                         * need to wait here.
3689                         */
3690                        rdev_dec_pending(s.blocked_rdev,
3691                                         conf->mddev);
3692        }
3693
3694        if (s.handle_bad_blocks)
3695                for (i = disks; i--; ) {
3696                        struct md_rdev *rdev;
3697                        struct r5dev *dev = &sh->dev[i];
3698                        if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
3699                                /* We own a safe reference to the rdev */
3700                                rdev = conf->disks[i].rdev;
3701                                if (!rdev_set_badblocks(rdev, sh->sector,
3702                                                        STRIPE_SECTORS, 0))
3703                                        md_error(conf->mddev, rdev);
3704                                rdev_dec_pending(rdev, conf->mddev);
3705                        }
3706                        if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
3707                                rdev = conf->disks[i].rdev;
3708                                rdev_clear_badblocks(rdev, sh->sector,
3709                                                     STRIPE_SECTORS, 0);
3710                                rdev_dec_pending(rdev, conf->mddev);
3711                        }
3712                        if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
3713                                rdev = conf->disks[i].replacement;
3714                                if (!rdev)
3715                                        /* rdev has been moved down */
3716                                        rdev = conf->disks[i].rdev;
3717                                rdev_clear_badblocks(rdev, sh->sector,
3718                                                     STRIPE_SECTORS, 0);
3719                                rdev_dec_pending(rdev, conf->mddev);
3720                        }
3721                }
3722
3723        if (s.ops_request)
3724                raid_run_ops(sh, s.ops_request);
3725
3726        ops_run_io(sh, &s);
3727
3728        if (s.dec_preread_active) {
3729                /* We delay this until after ops_run_io so that if make_request
3730                 * is waiting on a flush, it won't continue until the writes
3731                 * have actually been submitted.
3732                 */
3733                atomic_dec(&conf->preread_active_stripes);
3734                if (atomic_read(&conf->preread_active_stripes) <
3735                    IO_THRESHOLD)
3736                        md_wakeup_thread(conf->mddev->thread);
3737        }
3738
3739        return_io(s.return_bi);
3740
3741        clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
3742}
3743
3744static void raid5_activate_delayed(struct r5conf *conf)
3745{
3746        if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
3747                while (!list_empty(&conf->delayed_list)) {
3748                        struct list_head *l = conf->delayed_list.next;
3749                        struct stripe_head *sh;
3750                        sh = list_entry(l, struct stripe_head, lru);
3751                        list_del_init(l);
3752                        clear_bit(STRIPE_DELAYED, &sh->state);
3753                        if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3754                                atomic_inc(&conf->preread_active_stripes);
3755                        list_add_tail(&sh->lru, &conf->hold_list);
3756                }
3757        }
3758}
3759
3760static void activate_bit_delay(struct r5conf *conf)
3761{
3762        /* device_lock is held */
3763        struct list_head head;
3764        list_add(&head, &conf->bitmap_list);
3765        list_del_init(&conf->bitmap_list);
3766        while (!list_empty(&head)) {
3767                struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
3768                list_del_init(&sh->lru);
3769                atomic_inc(&sh->count);
3770                __release_stripe(conf, sh);
3771        }
3772}
3773
3774int md_raid5_congested(struct mddev *mddev, int bits)
3775{
3776        struct r5conf *conf = mddev->private;
3777
3778        /* No difference between reads and writes.  Just check
3779         * how busy the stripe_cache is
3780         */
3781
3782        if (conf->inactive_blocked)
3783                return 1;
3784        if (conf->quiesce)
3785                return 1;
3786        if (list_empty_careful(&conf->inactive_list))
3787                return 1;
3788
3789        return 0;
3790}
3791EXPORT_SYMBOL_GPL(md_raid5_congested);
3792
3793static int raid5_congested(void *data, int bits)
3794{
3795        struct mddev *mddev = data;
3796
3797        return mddev_congested(mddev, bits) ||
3798                md_raid5_congested(mddev, bits);
3799}
3800
3801/* We want read requests to align with chunks where possible,
3802 * but write requests don't need to.
3803 */
3804static int raid5_mergeable_bvec(struct request_queue *q,
3805                                struct bvec_merge_data *bvm,
3806                                struct bio_vec *biovec)
3807{
3808        struct mddev *mddev = q->queuedata;
3809        sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
3810        int max;
3811        unsigned int chunk_sectors = mddev->chunk_sectors;
3812        unsigned int bio_sectors = bvm->bi_size >> 9;
3813
3814        if ((bvm->bi_rw & 1) == WRITE)
3815                return biovec->bv_len; /* always allow writes to be mergeable */
3816
3817        if (mddev->new_chunk_sectors < mddev->chunk_sectors)
3818                chunk_sectors = mddev->new_chunk_sectors;
3819        max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
3820        if (max < 0) max = 0;
3821        if (max <= biovec->bv_len && bio_sectors == 0)
3822                return biovec->bv_len;
3823        else
3824                return max;
3825}
3826
3827
3828static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
3829{
3830        sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
3831        unsigned int chunk_sectors = mddev->chunk_sectors;
3832        unsigned int bio_sectors = bio->bi_size >> 9;
3833
3834        if (mddev->new_chunk_sectors < mddev->chunk_sectors)
3835                chunk_sectors = mddev->new_chunk_sectors;
3836        return  chunk_sectors >=
3837                ((sector & (chunk_sectors - 1)) + bio_sectors);
3838}
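
/*
 * A standalone sketch of the chunk-boundary arithmetic used in
 * in_chunk_boundary() above, with the bio/mddev types reduced to plain
 * integers; the function name and the worked values are illustrative
 * only.  chunk_sectors is a power of two, so "sector & (chunk_sectors
 * - 1)" is the offset of the request within its chunk, and the request
 * stays inside one chunk iff that offset plus its length fits.  For
 * example, with chunk_sectors = 128 and sector = 1000 the offset is
 * 104, so a 16-sector read fits (104 + 16 <= 128) but a 32-sector read
 * does not.
 */
static inline int sketch_in_chunk_boundary(unsigned long long sector,
                                           unsigned int bio_sectors,
                                           unsigned int chunk_sectors)
{
        unsigned int offset = sector & (chunk_sectors - 1);

        return chunk_sectors >= offset + bio_sectors;
}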
3839
3840/*
3841 *  add bio to the retry LIFO (in O(1) ... we are in interrupt context),
3842 *  later sampled by raid5d.
3843 */
3844static void add_bio_to_retry(struct bio *bi, struct r5conf *conf)
3845{
3846        unsigned long flags;
3847
3848        spin_lock_irqsave(&conf->device_lock, flags);
3849
3850        bi->bi_next = conf->retry_read_aligned_list;
3851        conf->retry_read_aligned_list = bi;
3852
3853        spin_unlock_irqrestore(&conf->device_lock, flags);
3854        md_wakeup_thread(conf->mddev->thread);
3855}
3856
3857
3858static struct bio *remove_bio_from_retry(struct r5conf *conf)
3859{
3860        struct bio *bi;
3861
3862        bi = conf->retry_read_aligned;
3863        if (bi) {
3864                conf->retry_read_aligned = NULL;
3865                return bi;
3866        }
3867        bi = conf->retry_read_aligned_list;
3868        if (bi) {
3869                conf->retry_read_aligned_list = bi->bi_next;
3870                bi->bi_next = NULL;
3871                /*
3872                 * this sets the active stripe count to 1 and the processed
3873                 * stripe count to zero (upper 16 bits)
3874                 */
3875                raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
3876        }
3877
3878        return bi;
3879}
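
/*
 * A minimal sketch of the counter packing that raid5_set_bi_stripes()
 * and friends rely on, assuming the 16/16 bit split used by those
 * helpers: one 32-bit word carries the count of stripes still actively
 * using the bio in the low 16 bits and the number of stripes already
 * processed in the high 16 bits.  The names below are illustrative,
 * not the driver's helpers.
 */
static inline unsigned int sketch_bi_active_stripes(unsigned int counter)
{
        return counter & 0xffff;                /* low half: active references */
}

static inline unsigned int sketch_bi_processed_stripes(unsigned int counter)
{
        return (counter >> 16) & 0xffff;        /* high half: stripes done so far */
}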
3880
3881
3882/*
3883 *  The "raid5_align_endio" should check if the read succeeded and if it
3884 *  did, call bio_endio on the original bio (having bio_put the new bio
3885 *  first).
3886 *  If the read failed, queue the original bio for retry via the stripe cache.
3887 */
3888static void raid5_align_endio(struct bio *bi, int error)
3889{
3890        struct bio* raid_bi  = bi->bi_private;
3891        struct mddev *mddev;
3892        struct r5conf *conf;
3893        int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
3894        struct md_rdev *rdev;
3895
3896        bio_put(bi);
3897
3898        rdev = (void*)raid_bi->bi_next;
3899        raid_bi->bi_next = NULL;
3900        mddev = rdev->mddev;
3901        conf = mddev->private;
3902
3903        rdev_dec_pending(rdev, conf->mddev);
3904
3905        if (!error && uptodate) {
3906                bio_endio(raid_bi, 0);
3907                if (atomic_dec_and_test(&conf->active_aligned_reads))
3908                        wake_up(&conf->wait_for_stripe);
3909                return;
3910        }
3911
3912
3913        pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
3914
3915        add_bio_to_retry(raid_bi, conf);
3916}
3917
3918static int bio_fits_rdev(struct bio *bi)
3919{
3920        struct request_queue *q = bdev_get_queue(bi->bi_bdev);
3921
3922        if ((bi->bi_size>>9) > queue_max_sectors(q))
3923                return 0;
3924        blk_recount_segments(q, bi);
3925        if (bi->bi_phys_segments > queue_max_segments(q))
3926                return 0;
3927
3928        if (q->merge_bvec_fn)
3929                /* it's too hard to apply the merge_bvec_fn at this stage,
3930                 * so just give up
3931                 */
3932                return 0;
3933
3934        return 1;
3935}
3936
3937
3938static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
3939{
3940        struct r5conf *conf = mddev->private;
3941        int dd_idx;
3942        struct bio* align_bi;
3943        struct md_rdev *rdev;
3944        sector_t end_sector;
3945
3946        if (!in_chunk_boundary(mddev, raid_bio)) {
3947                pr_debug("chunk_aligned_read : non aligned\n");
3948                return 0;
3949        }
3950        /*
3951         * use bio_clone_mddev to make a copy of the bio
3952         */
3953        align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
3954        if (!align_bi)
3955                return 0;
3956        /*
3957         *   set bi_end_io to a new function, and set bi_private to the
3958         *     original bio.
3959         */
3960        align_bi->bi_end_io  = raid5_align_endio;
3961        align_bi->bi_private = raid_bio;
3962        /*
3963         *      compute position
3964         */
3965        align_bi->bi_sector =  raid5_compute_sector(conf, raid_bio->bi_sector,
3966                                                    0,
3967                                                    &dd_idx, NULL);
3968
3969        end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
3970        rcu_read_lock();
3971        rdev = rcu_dereference(conf->disks[dd_idx].replacement);
3972        if (!rdev || test_bit(Faulty, &rdev->flags) ||
3973            rdev->recovery_offset < end_sector) {
3974                rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3975                if (rdev &&
3976                    (test_bit(Faulty, &rdev->flags) ||
3977                    !(test_bit(In_sync, &rdev->flags) ||
3978                      rdev->recovery_offset >= end_sector)))
3979                        rdev = NULL;
3980        }
3981        if (rdev) {
3982                sector_t first_bad;
3983                int bad_sectors;
3984
3985                atomic_inc(&rdev->nr_pending);
3986                rcu_read_unlock();
3987                raid_bio->bi_next = (void*)rdev;
3988                align_bi->bi_bdev =  rdev->bdev;
3989                align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
3990
3991                if (!bio_fits_rdev(align_bi) ||
3992                    is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
3993                                &first_bad, &bad_sectors)) {
3994                        /* too big in some way, or has a known bad block */
3995                        bio_put(align_bi);
3996                        rdev_dec_pending(rdev, mddev);
3997                        return 0;
3998                }
3999
4000                /* No reshape active, so we can trust rdev->data_offset */
4001                align_bi->bi_sector += rdev->data_offset;
4002
4003                spin_lock_irq(&conf->device_lock);
4004                wait_event_lock_irq(conf->wait_for_stripe,
4005                                    conf->quiesce == 0,
4006                                    conf->device_lock, /* nothing */);
4007                atomic_inc(&conf->active_aligned_reads);
4008                spin_unlock_irq(&conf->device_lock);
4009
4010                generic_make_request(align_bi);
4011                return 1;
4012        } else {
4013                rcu_read_unlock();
4014                bio_put(align_bi);
4015                return 0;
4016        }
4017}
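
/*
 * The rdev selection in chunk_aligned_read() is easier to follow with
 * the negations unfolded.  The sketch below restates the policy with
 * the md flag tests reduced to booleans: prefer a replacement that is
 * not faulty and has been recovered past the end of the read,
 * otherwise fall back to the main device if it is not faulty and
 * either in sync or recovered far enough.  sketch_pick_source() and
 * its parameters are illustrative only.
 */
static inline int sketch_pick_source(int repl_present, int repl_faulty,
                                     int repl_covers_read,
                                     int main_present, int main_faulty,
                                     int main_in_sync, int main_covers_read)
{
        if (repl_present && !repl_faulty && repl_covers_read)
                return 1;       /* read from the replacement */
        if (main_present && !main_faulty &&
            (main_in_sync || main_covers_read))
                return 0;       /* read from the main rdev */
        return -1;              /* no bypass possible: use the stripe cache path */
}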
4018
4019/* __get_priority_stripe - get the next stripe to process
4020 *
4021 * Full stripe writes are allowed to pass preread active stripes up until
4022 * the bypass_threshold is exceeded.  In general the bypass_count
4023 * increments when the handle_list is handled before the hold_list; however, it
4024 * will not be incremented when STRIPE_IO_STARTED is observed to be set,
4025 * signifying a stripe with in-flight i/o.  The bypass_count will be reset when the
4026 * head of the hold_list has changed, i.e. the head was promoted to the
4027 * handle_list.
4028 */
4029static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
4030{
4031        struct stripe_head *sh;
4032
4033        pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
4034                  __func__,
4035                  list_empty(&conf->handle_list) ? "empty" : "busy",
4036                  list_empty(&conf->hold_list) ? "empty" : "busy",
4037                  atomic_read(&conf->pending_full_writes), conf->bypass_count);
4038
4039        if (!list_empty(&conf->handle_list)) {
4040                sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
4041
4042                if (list_empty(&conf->hold_list))
4043                        conf->bypass_count = 0;
4044                else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
4045                        if (conf->hold_list.next == conf->last_hold)
4046                                conf->bypass_count++;
4047                        else {
4048                                conf->last_hold = conf->hold_list.next;
4049                                conf->bypass_count -= conf->bypass_threshold;
4050                                if (conf->bypass_count < 0)
4051                                        conf->bypass_count = 0;
4052                        }
4053                }
4054        } else if (!list_empty(&conf->hold_list) &&
4055                   ((conf->bypass_threshold &&
4056                     conf->bypass_count > conf->bypass_threshold) ||
4057                    atomic_read(&conf->pending_full_writes) == 0)) {
4058                sh = list_entry(conf->hold_list.next,
4059                                typeof(*sh), lru);
4060                conf->bypass_count -= conf->bypass_threshold;
4061                if (conf->bypass_count < 0)
4062                        conf->bypass_count = 0;
4063        } else
4064                return NULL;
4065
4066        list_del_init(&sh->lru);
4067        atomic_inc(&sh->count);
4068        BUG_ON(atomic_read(&sh->count) != 1);
4069        return sh;
4070}
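
/*
 * A condensed sketch of when the hold_list (full stripe writes) may be
 * serviced instead of the handle_list, matching the policy described
 * above: only when the handle_list is empty, and then only if
 * bypass_count has exceeded bypass_threshold or no full-stripe writes
 * are pending at all.  sketch_may_bypass() is illustrative only.
 */
static inline int sketch_may_bypass(int handle_list_empty, int hold_list_empty,
                                    int bypass_count, int bypass_threshold,
                                    int pending_full_writes)
{
        if (!handle_list_empty || hold_list_empty)
                return 0;
        return (bypass_threshold && bypass_count > bypass_threshold) ||
                pending_full_writes == 0;
}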
4071
4072struct raid5_plug_cb {
4073        struct blk_plug_cb      cb;
4074        struct list_head        list;
4075};
4076
4077static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4078{
4079        struct raid5_plug_cb *cb = container_of(
4080                blk_cb, struct raid5_plug_cb, cb);
4081        struct stripe_head *sh;
4082        struct mddev *mddev = cb->cb.data;
4083        struct r5conf *conf = mddev->private;
4084
4085        if (cb->list.next && !list_empty(&cb->list)) {
4086                spin_lock_irq(&conf->device_lock);
4087                while (!list_empty(&cb->list)) {
4088                        sh = list_first_entry(&cb->list, struct stripe_head, lru);
4089                        list_del_init(&sh->lru);
4090                        /*
4091                         * avoid a race where release_stripe_plug() sees
4092                         * STRIPE_ON_UNPLUG_LIST clear but the stripe
4093                         * is still in our list
4094                         */
4095                        smp_mb__before_clear_bit();
4096                        clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
4097                        __release_stripe(conf, sh);
4098                }
4099                spin_unlock_irq(&conf->device_lock);
4100        }
4101        kfree(cb);
4102}
4103
4104static void release_stripe_plug(struct mddev *mddev,
4105                                struct stripe_head *sh)
4106{
4107        struct blk_plug_cb *blk_cb = blk_check_plugged(
4108                raid5_unplug, mddev,
4109                sizeof(struct raid5_plug_cb));
4110        struct raid5_plug_cb *cb;
4111
4112        if (!blk_cb) {
4113                release_stripe(sh);
4114                return;
4115        }
4116
4117        cb = container_of(blk_cb, struct raid5_plug_cb, cb);
4118
4119        if (cb->list.next == NULL)
4120                INIT_LIST_HEAD(&cb->list);
4121
4122        if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
4123                list_add_tail(&sh->lru, &cb->list);
4124        else
4125                release_stripe(sh);
4126}
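
/*
 * The two functions above implement a simple deferred-release scheme on
 * top of the block layer's per-task plugging: while the caller is
 * plugged, stripes are parked on a private list, and the unplug
 * callback later pushes the whole batch back under a single
 * device_lock acquisition.  The sketch below shows the pattern with
 * the block-layer types replaced by a bare singly linked list; all
 * names are illustrative only.
 */
struct sketch_node {
        struct sketch_node *next;
};

struct sketch_plug {
        struct sketch_node *batch;      /* stripes parked until unplug */
};

static void sketch_park(struct sketch_plug *plug, struct sketch_node *n)
{
        n->next = plug->batch;          /* O(1); release is deferred */
        plug->batch = n;
}

static void sketch_unplug(struct sketch_plug *plug,
                          void (*release)(struct sketch_node *))
{
        while (plug->batch) {           /* flush the whole batch at once */
                struct sketch_node *n = plug->batch;

                plug->batch = n->next;
                release(n);             /* __release_stripe() in the real code */
        }
}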
4127
4128static void make_discard_request(struct mddev *mddev, struct bio *bi)
4129{
4130        struct r5conf *conf = mddev->private;
4131        sector_t logical_sector, last_sector;
4132        struct stripe_head *sh;
4133        int remaining;
4134        int stripe_sectors;
4135
4136        if (mddev->reshape_position != MaxSector)
4137                /* Skip discard while reshape is happening */
4138                return;
4139
4140        logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4141        last_sector = bi->bi_sector + (bi->bi_size>>9);
4142
4143        bi->bi_next = NULL;
4144        bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
4145
4146        stripe_sectors = conf->chunk_sectors *
4147                (conf->raid_disks - conf->max_degraded);
4148        logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
4149                                               stripe_sectors);
4150        sector_div(last_sector, stripe_sectors);
4151
4152        logical_sector *= conf->chunk_sectors;
4153        last_sector *= conf->chunk_sectors;
4154
4155        for (; logical_sector < last_sector;
4156             logical_sector += STRIPE_SECTORS) {
4157                DEFINE_WAIT(w);
4158                int d;
4159        again:
4160                sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
4161                prepare_to_wait(&conf->wait_for_overlap, &w,
4162                                TASK_UNINTERRUPTIBLE);
4163                spin_lock_irq(&sh->stripe_lock);
4164                for (d = 0; d < conf->raid_disks; d++) {
4165                        if (d == sh->pd_idx || d == sh->qd_idx)
4166                                continue;
4167                        if (sh->dev[d].towrite || sh->dev[d].toread) {
4168                                set_bit(R5_Overlap, &sh->dev[d].flags);
4169                                spin_unlock_irq(&sh->stripe_lock);
4170                                release_stripe(sh);
4171                                schedule();
4172                                goto again;
4173                        }
4174                }
4175                finish_wait(&conf->wait_for_overlap, &w);
4176                for (d = 0; d < conf->raid_disks; d++) {
4177                        if (d == sh->pd_idx || d == sh->qd_idx)
4178                                continue;
4179                        sh->dev[d].towrite = bi;
4180                        set_bit(R5_OVERWRITE, &sh->dev[d].flags);
4181                        raid5_inc_bi_active_stripes(bi);
4182                }
4183                spin_unlock_irq(&sh->stripe_lock);
4184                if (conf->mddev->bitmap) {
4185                        for (d = 0;
4186                             d < conf->raid_disks - conf->max_degraded;
4187                             d++)
4188                                bitmap_startwrite(mddev->bitmap,
4189                                                  sh->sector,
4190                                                  STRIPE_SECTORS,
4191                                                  0);
4192                        sh->bm_seq = conf->seq_flush + 1;
4193                        set_bit(STRIPE_BIT_DELAY, &sh->state);
4194                }
4195
4196                set_bit(STRIPE_HANDLE, &sh->state);
4197                clear_bit(STRIPE_DELAYED, &sh->state);
4198                if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4199                        atomic_inc(&conf->preread_active_stripes);
4200                release_stripe_plug(mddev, sh);
4201        }
4202
4203        remaining = raid5_dec_bi_active_stripes(bi);
4204        if (remaining == 0) {
4205                md_write_end(mddev);
4206                bio_endio(bi, 0);
4207        }
4208}
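
/*
 * Worked example for the rounding in make_discard_request(), under
 * illustrative values: chunk_sectors = 128, 5 devices and max_degraded
 * = 1, so there are 4 data disks and stripe_sectors = 512.  A discard
 * of array sectors [1000, 5000) is trimmed to whole stripes only: the
 * start rounds up to stripe 2 and the end rounds down to stripe 9,
 * i.e. array sectors [1024, 4608), which the loop walks as per-device
 * offsets 256 .. 1151 in STRIPE_SECTORS steps.  The helper below
 * restates just that rounding; its name is illustrative only.
 */
static inline void sketch_discard_bounds(unsigned long long start,
                                         unsigned long long end,
                                         unsigned int stripe_sectors,
                                         unsigned long long *first_stripe,
                                         unsigned long long *last_stripe)
{
        *first_stripe = (start + stripe_sectors - 1) / stripe_sectors;  /* round up */
        *last_stripe = end / stripe_sectors;                            /* round down */
}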
4209
4210static void make_request(struct mddev *mddev, struct bio * bi)
4211{
4212        struct r5conf *conf = mddev->private;
4213        int dd_idx;
4214        sector_t new_sector;
4215        sector_t logical_sector, last_sector;
4216        struct stripe_head *sh;
4217        const int rw = bio_data_dir(bi);
4218        int remaining;
4219
4220        if (unlikely(bi->bi_rw & REQ_FLUSH)) {
4221                md_flush_request(mddev, bi);
4222                return;
4223        }
4224
4225        md_write_start(mddev, bi);
4226
4227        if (rw == READ &&
4228             mddev->reshape_position == MaxSector &&
4229             chunk_aligned_read(mddev,bi))
4230                return;
4231
4232        if (unlikely(bi->bi_rw & REQ_DISCARD)) {
4233                make_discard_request(mddev, bi);
4234                return;
4235        }
4236
4237        logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4238        last_sector = bi->bi_sector + (bi->bi_size>>9);
4239        bi->bi_next = NULL;
4240        bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
4241
4242        for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
4243                DEFINE_WAIT(w);
4244                int previous;
4245
4246        retry:
4247                previous = 0;
4248                prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
4249                if (unlikely(conf->reshape_progress != MaxSector)) {
4250                        /* spinlock is needed as reshape_progress may be
4251                         * 64bit on a 32bit platform, and so it might be
4252                         * possible to see a half-updated value.
4253                         * Of course reshape_progress could change after
4254                         * the lock is dropped, so once we get a reference
4255                         * to the stripe that we think it is, we will have
4256                         * to check again.
4257                         */
4258                        spin_lock_irq(&conf->device_lock);
4259                        if (mddev->reshape_backwards
4260                            ? logical_sector < conf->reshape_progress
4261                            : logical_sector >= conf->reshape_progress) {
4262                                previous = 1;
4263                        } else {
4264                                if (mddev->reshape_backwards
4265                                    ? logical_sector < conf->reshape_safe
4266                                    : logical_sector >= conf->reshape_safe) {
4267                                        spin_unlock_irq(&conf->device_lock);
4268                                        schedule();
4269                                        goto retry;
4270                                }
4271                        }
4272                        spin_unlock_irq(&conf->device_lock);
4273                }
4274
4275                new_sector = raid5_compute_sector(conf, logical_sector,
4276                                                  previous,
4277                                                  &dd_idx, NULL);
4278                pr_debug("raid456: make_request, sector %llu logical %llu\n",
4279                        (unsigned long long)new_sector, 
4280                        (unsigned long long)logical_sector);
4281
4282                sh = get_active_stripe(conf, new_sector, previous,
4283                                       (bi->bi_rw&RWA_MASK), 0);
4284                if (sh) {
4285                        if (unlikely(previous)) {
4286                                /* expansion might have moved on while waiting for a
4287                                 * stripe, so we must do the range check again.
4288                                 * Expansion could still move past after this
4289                                 * test, but as we are holding a reference to
4290                                 * 'sh', we know that if that happens,
4291                                 *  STRIPE_EXPANDING will get set and the expansion
4292                                 * won't proceed until we finish with the stripe.
4293                                 */
4294                                int must_retry = 0;
4295                                spin_lock_irq(&conf->device_lock);
4296                                if (mddev->reshape_backwards
4297                                    ? logical_sector >= conf->reshape_progress
4298                                    : logical_sector < conf->reshape_progress)
4299                                        /* mismatch, need to try again */
4300                                        must_retry = 1;
4301                                spin_unlock_irq(&conf->device_lock);
4302                                if (must_retry) {
4303                                        release_stripe(sh);
4304                                        schedule();
4305                                        goto retry;
4306                                }
4307                        }
4308
4309                        if (rw == WRITE &&
4310                            logical_sector >= mddev->suspend_lo &&
4311                            logical_sector < mddev->suspend_hi) {
4312                                release_stripe(sh);
4313                                /* As the suspend_* range is controlled by
4314                                 * userspace, we want an interruptible
4315                                 * wait.
4316                                 */
4317                                flush_signals(current);
4318                                prepare_to_wait(&conf->wait_for_overlap,
4319                                                &w, TASK_INTERRUPTIBLE);
4320                                if (logical_sector >= mddev->suspend_lo &&
4321                                    logical_sector < mddev->suspend_hi)
4322                                        schedule();
4323                                goto retry;
4324                        }
4325
4326                        if (test_bit(STRIPE_EXPANDING, &sh->state) ||
4327                            !add_stripe_bio(sh, bi, dd_idx, rw)) {
4328                                /* Stripe is busy expanding or
4329                                 * add failed due to overlap.  Flush everything
4330                                 * and wait a while
4331                                 */
4332                                md_wakeup_thread(mddev->thread);
4333                                release_stripe(sh);
4334                                schedule();
4335                                goto retry;
4336                        }
4337                        finish_wait(&conf->wait_for_overlap, &w);
4338                        set_bit(STRIPE_HANDLE, &sh->state);
4339                        clear_bit(STRIPE_DELAYED, &sh->state);
4340                        if ((bi->bi_rw & REQ_SYNC) &&
4341                            !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4342                                atomic_inc(&conf->preread_active_stripes);
4343                        release_stripe_plug(mddev, sh);
4344                } else {
4345                        /* cannot get stripe for read-ahead, just give up */
4346                        clear_bit(BIO_UPTODATE, &bi->bi_flags);
4347                        finish_wait(&conf->wait_for_overlap, &w);
4348                        break;
4349                }
4350        }
4351
4352        remaining = raid5_dec_bi_active_stripes(bi);
4353        if (remaining == 0) {
4354
4355                if (rw == WRITE)
4356                        md_write_end(mddev);
4357
4358                bio_endio(bi, 0);
4359        }
4360}
4361
4362static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
4363
4364static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
4365{
4366        /* reshaping is quite different from recovery/resync so it is
4367         * handled quite separately ... here.
4368         *
4369         * On each call to sync_request, we gather one chunk worth of
4370         * destination stripes and flag them as expanding.
4371         * Then we find all the source stripes and request reads.
4372         * As the reads complete, handle_stripe will copy the data
4373         * into the destination stripe and release that stripe.
4374         */
4375        struct r5conf *conf = mddev->private;
4376        struct stripe_head *sh;
4377        sector_t first_sector, last_sector;
4378        int raid_disks = conf->previous_raid_disks;
4379        int data_disks = raid_disks - conf->max_degraded;
4380        int new_data_disks = conf->raid_disks - conf->max_degraded;
4381        int i;
4382        int dd_idx;
4383        sector_t writepos, readpos, safepos;
4384        sector_t stripe_addr;
4385        int reshape_sectors;
4386        struct list_head stripes;
4387
4388        if (sector_nr == 0) {
4389                /* If restarting in the middle, skip the initial sectors */
4390                if (mddev->reshape_backwards &&
4391                    conf->reshape_progress < raid5_size(mddev, 0, 0)) {
4392                        sector_nr = raid5_size(mddev, 0, 0)
4393                                - conf->reshape_progress;
4394                } else if (!mddev->reshape_backwards &&
4395                           conf->reshape_progress > 0)
4396                        sector_nr = conf->reshape_progress;
4397                sector_div(sector_nr, new_data_disks);
4398                if (sector_nr) {
4399                        mddev->curr_resync_completed = sector_nr;
4400                        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4401                        *skipped = 1;
4402                        return sector_nr;
4403                }
4404        }
4405
4406        /* We need to process a full chunk at a time.
4407         * If old and new chunk sizes differ, we need to process the
4408         * largest of these
4409         */
4410        if (mddev->new_chunk_sectors > mddev->chunk_sectors)
4411                reshape_sectors = mddev->new_chunk_sectors;
4412        else
4413                reshape_sectors = mddev->chunk_sectors;
4414
4415        /* We update the metadata at least every 10 seconds, or when
4416         * the data about to be copied would over-write the source of
4417         * the data at the front of the range, i.e. when the position one
4418         * new stripe beyond reshape_progress, mapped through the new layout,
4419         * lands after the point that reshape_safe maps to in the old layout.
4420         */
4421        writepos = conf->reshape_progress;
4422        sector_div(writepos, new_data_disks);
4423        readpos = conf->reshape_progress;
4424        sector_div(readpos, data_disks);
4425        safepos = conf->reshape_safe;
4426        sector_div(safepos, data_disks);
4427        if (mddev->reshape_backwards) {
4428                writepos -= min_t(sector_t, reshape_sectors, writepos);
4429                readpos += reshape_sectors;
4430                safepos += reshape_sectors;
4431        } else {
4432                writepos += reshape_sectors;
4433                readpos -= min_t(sector_t, reshape_sectors, readpos);
4434                safepos -= min_t(sector_t, reshape_sectors, safepos);
4435        }
4436
4437        /* Having calculated the 'writepos' possibly use it
4438         * to set 'stripe_addr' which is where we will write to.
4439         */
4440        if (mddev->reshape_backwards) {
4441                BUG_ON(conf->reshape_progress == 0);
4442                stripe_addr = writepos;
4443                BUG_ON((mddev->dev_sectors &
4444                        ~((sector_t)reshape_sectors - 1))
4445                       - reshape_sectors - stripe_addr
4446                       != sector_nr);
4447        } else {
4448                BUG_ON(writepos != sector_nr + reshape_sectors);
4449                stripe_addr = sector_nr;
4450        }
4451
4452        /* 'writepos' is the most advanced device address we might write.
4453         * 'readpos' is the least advanced device address we might read.
4454         * 'safepos' is the least address recorded in the metadata as having
4455         *     been reshaped.
4456         * If there is a min_offset_diff, these are adjusted either by
4457         * increasing the safepos/readpos if diff is negative, or
4458         * increasing writepos if diff is positive.
4459         * If 'readpos' is then behind 'writepos', there is no way that we can
4460         * ensure safety in the face of a crash - that must be done by userspace
4461         * making a backup of the data.  So in that case there is no particular
4462         * rush to update metadata.
4463         * Otherwise if 'safepos' is behind 'writepos', then we really need to
4464         * update the metadata to advance 'safepos' to match 'readpos' so that
4465         * we can be safe in the event of a crash.
4466         * So we insist on updating metadata if safepos is behind writepos and
4467         * readpos is beyond writepos.
4468         * In any case, update the metadata every 10 seconds.
4469         * Maybe that number should be configurable, but I'm not sure it is
4470         * worth it.... maybe it could be a multiple of safemode_delay???
4471         */
4472        if (conf->min_offset_diff < 0) {
4473                safepos += -conf->min_offset_diff;
4474                readpos += -conf->min_offset_diff;
4475        } else
4476                writepos += conf->min_offset_diff;
4477
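        /*
         * Worked example with illustrative numbers (min_offset_diff == 0,
         * forward reshape growing from 4 to 5 data disks, reshape_sectors
         * == 128): if reshape_progress == 25600 and reshape_safe == 20480,
         * then writepos = 25600/5 + 128 = 5248, readpos = 25600/4 - 128 =
         * 6272 and safepos = 20480/4 - 128 = 4992.  safepos is behind
         * writepos while readpos is beyond it, so the test below forces a
         * superblock update before any further stripes are scheduled.
         */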
4478        if ((mddev->reshape_backwards
4479             ? (safepos > writepos && readpos < writepos)
4480             : (safepos < writepos && readpos > writepos)) ||
4481            time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4482                /* Cannot proceed until we've updated the superblock... */
4483                wait_event(conf->wait_for_overlap,
4484                           atomic_read(&conf->reshape_stripes)==0);
4485                mddev->reshape_position = conf->reshape_progress;
4486                mddev->curr_resync_completed = sector_nr;
4487                conf->reshape_checkpoint = jiffies;
4488                set_bit(MD_CHANGE_DEVS, &mddev->flags);
4489                md_wakeup_thread(mddev->thread);
4490                wait_event(mddev->sb_wait, mddev->flags == 0 ||
4491                           kthread_should_stop());
4492                spin_lock_irq(&conf->device_lock);
4493                conf->reshape_safe = mddev->reshape_position;
4494                spin_unlock_irq(&conf->device_lock);
4495                wake_up(&conf->wait_for_overlap);
4496                sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4497        }
4498
4499        INIT_LIST_HEAD(&stripes);
4500        for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
4501                int j;
4502                int skipped_disk = 0;
4503                sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
4504                set_bit(STRIPE_EXPANDING, &sh->state);
4505                atomic_inc(&conf->reshape_stripes);
4506                /* If any of this stripe is beyond the end of the old
4507                 * array, then we need to zero those blocks
4508                 */
4509                for (j=sh->disks; j--;) {
4510                        sector_t s;
4511                        if (j == sh->pd_idx)
4512                                continue;
4513                        if (conf->level == 6 &&
4514                            j == sh->qd_idx)
4515                                continue;
4516                        s = compute_blocknr(sh, j, 0);
4517                        if (s < raid5_size(mddev, 0, 0)) {
4518                                skipped_disk = 1;
4519                                continue;
4520                        }
4521                        memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
4522                        set_bit(R5_Expanded, &sh->dev[j].flags);
4523                        set_bit(R5_UPTODATE, &sh->dev[j].flags);
4524                }
4525                if (!skipped_disk) {
4526                        set_bit(STRIPE_EXPAND_READY, &sh->state);
4527                        set_bit(STRIPE_HANDLE, &sh->state);
4528                }
4529                list_add(&sh->lru, &stripes);
4530        }
4531        spin_lock_irq(&conf->device_lock);
4532        if (mddev->reshape_backwards)
4533                conf->reshape_progress -= reshape_sectors * new_data_disks;
4534        else
4535                conf->reshape_progress += reshape_sectors * new_data_disks;
4536        spin_unlock_irq(&conf->device_lock);
4537        /* Ok, those stripes are ready. We can start scheduling
4538         * reads on the source stripes.
4539         * The source stripes are determined by mapping the first and last
4540         * block on the destination stripes.
4541         */
4542        first_sector =
4543                raid5_compute_sector(conf, stripe_addr*(new_data_disks),
4544                                     1, &dd_idx, NULL);
4545        last_sector =
4546                raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
4547                                            * new_data_disks - 1),
4548                                     1, &dd_idx, NULL);
4549        if (last_sector >= mddev->dev_sectors)
4550                last_sector = mddev->dev_sectors - 1;
4551        while (first_sector <= last_sector) {
4552                sh = get_active_stripe(conf, first_sector, 1, 0, 1);
4553                set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4554                set_bit(STRIPE_HANDLE, &sh->state);
4555                release_stripe(sh);
4556                first_sector += STRIPE_SECTORS;
4557        }
4558        /* Now that the sources are clearly marked, we can release
4559         * the destination stripes
4560         */
4561        while (!list_empty(&stripes)) {
4562                sh = list_entry(stripes.next, struct stripe_head, lru);
4563                list_del_init(&sh->lru);
4564                release_stripe(sh);
4565        }
4566        /* If this takes us to the resync_max point where we have to pause,
4567         * then we need to write out the superblock.
4568         */
4569        sector_nr += reshape_sectors;
4570        if ((sector_nr - mddev->curr_resync_completed) * 2
4571            >= mddev->resync_max - mddev->curr_resync_completed) {
4572                /* Cannot proceed until we've updated the superblock... */
4573                wait_event(conf->wait_for_overlap,
4574                           atomic_read(&conf->reshape_stripes) == 0);
4575                mddev->reshape_position = conf->reshape_progress;
4576                mddev->curr_resync_completed = sector_nr;
4577                conf->reshape_checkpoint = jiffies;
4578                set_bit(MD_CHANGE_DEVS, &mddev->flags);
4579                md_wakeup_thread(mddev->thread);
4580                wait_event(mddev->sb_wait,
4581                           !test_bit(MD_CHANGE_DEVS, &mddev->flags)
4582                           || kthread_should_stop());
4583                spin_lock_irq(&conf->device_lock);
4584                conf->reshape_safe = mddev->reshape_position;
4585                spin_unlock_irq(&conf->device_lock);
4586                wake_up(&conf->wait_for_overlap);
4587                sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4588        }
4589        return reshape_sectors;
4590}
4591
4592/* FIXME go_faster isn't used */
4593static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
4594{
4595        struct r5conf *conf = mddev->private;
4596        struct stripe_head *sh;
4597        sector_t max_sector = mddev->dev_sectors;
4598        sector_t sync_blocks;
4599        int still_degraded = 0;
4600        int i;
4601
4602        if (sector_nr >= max_sector) {
4603                /* just being told to finish up .. nothing much to do */
4604
4605                if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
4606                        end_reshape(conf);
4607                        return 0;
4608                }
4609
4610                if (mddev->curr_resync < max_sector) /* aborted */
4611                        bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
4612                                        &sync_blocks, 1);
4613                else /* completed sync */
4614                        conf->fullsync = 0;
4615                bitmap_close_sync(mddev->bitmap);
4616
4617                return 0;
4618        }
4619
4620        /* Allow raid5_quiesce to complete */
4621        wait_event(conf->wait_for_overlap, conf->quiesce != 2);
4622
4623        if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4624                return reshape_request(mddev, sector_nr, skipped);
4625
4626        /* No need to check resync_max as we never do more than one
4627         * stripe, and as resync_max will always be on a chunk boundary,
4628         * if the check in md_do_sync didn't fire, there is no chance
4629         * of overstepping resync_max here
4630         */
4631
4632        /* if there are too many failed drives and we are trying
4633         * to resync, then assert that we are finished, because there is
4634         * nothing we can do.
4635         */
4636        if (mddev->degraded >= conf->max_degraded &&
4637            test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4638                sector_t rv = mddev->dev_sectors - sector_nr;
4639                *skipped = 1;
4640                return rv;
4641        }
4642        if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
4643            !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
4644            !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
4645                /* we can skip this block, and probably more */
4646                sync_blocks /= STRIPE_SECTORS;
4647                *skipped = 1;
4648                return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
4649        }
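        /*
         * For example, with 4K pages STRIPE_SECTORS is 8; if the bitmap
         * reports that 1003 sectors can be skipped, the division and
         * multiplication above round that down to 125 whole stripes,
         * i.e. 1000 sectors, so the resync position stays stripe-aligned.
         */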
4650
4651        bitmap_cond_end_sync(mddev->bitmap, sector_nr);
4652
4653        sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
4654        if (sh == NULL) {
4655                sh = get_active_stripe(conf, sector_nr, 0, 0, 0);
4656                /* make sure we don't swamp the stripe cache if someone else
4657                 * is trying to get access
4658                 */
4659                schedule_timeout_uninterruptible(1);
4660        }
4661        /* Need to check if array will still be degraded after recovery/resync
4662         * We don't need to check the 'failed' flag as when that gets set,
4663         * recovery aborts.
4664         */
4665        for (i = 0; i < conf->raid_disks; i++)
4666                if (conf->disks[i].rdev == NULL)
4667                        still_degraded = 1;
4668
4669        bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
4670
4671        set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
4672
4673        handle_stripe(sh);
4674        release_stripe(sh);
4675
4676        return STRIPE_SECTORS;
4677}
4678
4679static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4680{
4681        /* We may not be able to submit a whole bio at once as there
4682         * may not be enough stripe_heads available.
4683         * We cannot pre-allocate enough stripe_heads as we may need
4684         * more than exist in the cache (if we allow ever larger chunks).
4685         * So we do one stripe head at a time and record, via
4686         * raid5_set_bi_processed_stripes(), how many have been done.
4687         *
4688         * We *know* that this entire raid_bio is in one chunk, so
4689         * there is only one 'dd_idx' and only one call to raid5_compute_sector is needed.
4690         */
4691        struct stripe_head *sh;
4692        int dd_idx;
4693        sector_t sector, logical_sector, last_sector;
4694        int scnt = 0;
4695        int remaining;
4696        int handled = 0;
4697
4698        logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4699        sector = raid5_compute_sector(conf, logical_sector,
4700                                      0, &dd_idx, NULL);
4701        last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
4702
4703        for (; logical_sector < last_sector;
4704             logical_sector += STRIPE_SECTORS,
4705                     sector += STRIPE_SECTORS,
4706                     scnt++) {
4707
4708                if (scnt < raid5_bi_processed_stripes(raid_bio))
4709                        /* already done this stripe */
4710                        continue;
4711
4712                sh = get_active_stripe(conf, sector, 0, 1, 0);
4713
4714                if (!sh) {
4715                        /* failed to get a stripe - must wait */
4716                        raid5_set_bi_processed_stripes(raid_bio, scnt);
4717                        conf->retry_read_aligned = raid_bio;
4718                        return handled;
4719                }
4720
4721                if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4722                        release_stripe(sh);
4723                        raid5_set_bi_processed_stripes(raid_bio, scnt);
4724                        conf->retry_read_aligned = raid_bio;
4725                        return handled;
4726                }
4727
4728                set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
4729                handle_stripe(sh);
4730                release_stripe(sh);
4731                handled++;
4732        }
4733        remaining = raid5_dec_bi_active_stripes(raid_bio);
4734        if (remaining == 0)
4735                bio_endio(raid_bio, 0);
4736        if (atomic_dec_and_test(&conf->active_aligned_reads))
4737                wake_up(&conf->wait_for_stripe);
4738        return handled;
4739}
4740
4741#define MAX_STRIPE_BATCH 8
4742static int handle_active_stripes(struct r5conf *conf)
4743{
4744        struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
4745        int i, batch_size = 0;
4746
4747        while (batch_size < MAX_STRIPE_BATCH &&
4748                        (sh = __get_priority_stripe(conf)) != NULL)
4749                batch[batch_size++] = sh;
4750
4751        if (batch_size == 0)
4752                return batch_size;
4753        spin_unlock_irq(&conf->device_lock);
4754
4755        for (i = 0; i < batch_size; i++)
4756                handle_stripe(batch[i]);
4757
4758        cond_resched();
4759
4760        spin_lock_irq(&conf->device_lock);
4761        for (i = 0; i < batch_size; i++)
4762                __release_stripe(conf, batch[i]);
4763        return batch_size;
4764}
4765
4766/*
4767 * This is our raid5 kernel thread.
4768 *
4769 * We scan the hash table for stripes which can be handled now.
4770 * During the scan, completed stripes are saved for us by the interrupt
4771 * handler, so that they will not have to wait for our next wakeup.
4772 */
4773static void raid5d(struct md_thread *thread)
4774{
4775        struct mddev *mddev = thread->mddev;
4776        struct r5conf *conf = mddev->private;
4777        int handled;
4778        struct blk_plug plug;
4779
4780        pr_debug("+++ raid5d active\n");
4781
4782        md_check_recovery(mddev);
4783
4784        blk_start_plug(&plug);
4785        handled = 0;
4786        spin_lock_irq(&conf->device_lock);
4787        while (1) {
4788                struct bio *bio;
4789                int batch_size;
4790
4791                if (!list_empty(&conf->bitmap_list)) {
4793                        /* Now is a good time to flush some bitmap updates */
4794                        conf->seq_flush++;
4795                        spin_unlock_irq(&conf->device_lock);
4796                        bitmap_unplug(mddev->bitmap);
4797                        spin_lock_irq(&conf->device_lock);
4798                        conf->seq_write = conf->seq_flush;
4799                        activate_bit_delay(conf);
4800                }
4801                raid5_activate_delayed(conf);
4802
4803                while ((bio = remove_bio_from_retry(conf))) {
4804                        int ok;
4805                        spin_unlock_irq(&conf->device_lock);
4806                        ok = retry_aligned_read(conf, bio);
4807                        spin_lock_irq(&conf->device_lock);
4808                        if (!ok)
4809                                break;
4810                        handled++;
4811                }
4812
4813                batch_size = handle_active_stripes(conf);
4814                if (!batch_size)
4815                        break;
4816                handled += batch_size;
4817
4818                if (