1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21#include <linux/slab.h>
22#include <linux/delay.h>
23#include <linux/blkdev.h>
24#include <linux/module.h>
25#include <linux/seq_file.h>
26#include <linux/ratelimit.h>
27#include <linux/kthread.h>
28#include "md.h"
29#include "raid10.h"
30#include "raid0.h"
31#include "bitmap.h"
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/*
 * NOTE(review): NR_RAID10_BIOS is not referenced in this part of the file;
 * presumably it sizes the r10bio mempool reserve - confirm at its point
 * of use.
 */
#define NR_RAID10_BIOS 256

/*
 * Sentinel values stored in r10bio->devs[].bio / .repl_bio in place of a
 * real bio pointer.
 *
 * IO_BLOCKED:   read_balance() skips any slot whose bio holds this value.
 * IO_MADE_GOOD: stored by raid10_end_write_request() when a successful
 *               write covered a range that is recorded as bad on the
 *               device.
 */
#define IO_BLOCKED ((struct bio *)1)
#define IO_MADE_GOOD ((struct bio *)2)

/*
 * True for NULL and for the two sentinels above, i.e. for every value that
 * must not be passed to bio_put().  The argument is parenthesised so that
 * a compound expression (for example a conditional) is cast as a whole
 * instead of only its first operand.
 */
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)
76
77
78
79
80
/*
 * Threshold on conf->pending_count.  Once this many writes are queued,
 * md_raid10_congested() reports async congestion and make_request()
 * waits before queueing further writes.
 */
static int max_queued_requests = 1024;
82
83static void allow_barrier(struct r10conf *conf);
84static void lower_barrier(struct r10conf *conf);
85static int enough(struct r10conf *conf, int ignore);
86static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
87 int *skipped);
88static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
89static void end_reshape_write(struct bio *bio, int error);
90static void end_reshape(struct r10conf *conf);
91
92static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
93{
94 struct r10conf *conf = data;
95 int size = offsetof(struct r10bio, devs[conf->copies]);
96
97
98
99 return kzalloc(size, gfp_flags);
100}
101
/*
 * Counterpart of r10bio_pool_alloc(): release an r10bio with kfree().
 * @data (the r10conf) is accepted for signature symmetry and is unused.
 */
static void r10bio_pool_free(void *r10_bio, void *data)
{
	kfree(r10_bio);
}
106
107
/* Bytes covered by one resync buffer, and the pages needed to back it. */
#define RESYNC_BLOCK_SIZE (64 * 1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE - 1) / PAGE_SIZE)

/* NOTE(review): RESYNC_WINDOW (1 MiB) is not referenced in this chunk. */
#define RESYNC_WINDOW (1024 * 1024)

/*
 * Upper bound on conf->barrier: raise_barrier() keeps waiting while the
 * barrier count has reached this value (32 MiB worth of resync buffers).
 */
#define RESYNC_DEPTH (32 * 1024 * 1024 / RESYNC_BLOCK_SIZE)
114
115
116
117
118
119
120
121
122static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
123{
124 struct r10conf *conf = data;
125 struct page *page;
126 struct r10bio *r10_bio;
127 struct bio *bio;
128 int i, j;
129 int nalloc;
130
131 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
132 if (!r10_bio)
133 return NULL;
134
135 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
136 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
137 nalloc = conf->copies;
138 else
139 nalloc = 2;
140
141
142
143
144 for (j = nalloc ; j-- ; ) {
145 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
146 if (!bio)
147 goto out_free_bio;
148 r10_bio->devs[j].bio = bio;
149 if (!conf->have_replacement)
150 continue;
151 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
152 if (!bio)
153 goto out_free_bio;
154 r10_bio->devs[j].repl_bio = bio;
155 }
156
157
158
159
160 for (j = 0 ; j < nalloc; j++) {
161 struct bio *rbio = r10_bio->devs[j].repl_bio;
162 bio = r10_bio->devs[j].bio;
163 for (i = 0; i < RESYNC_PAGES; i++) {
164 if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
165 &conf->mddev->recovery)) {
166
167
168 struct bio *rbio = r10_bio->devs[0].bio;
169 page = rbio->bi_io_vec[i].bv_page;
170 get_page(page);
171 } else
172 page = alloc_page(gfp_flags);
173 if (unlikely(!page))
174 goto out_free_pages;
175
176 bio->bi_io_vec[i].bv_page = page;
177 if (rbio)
178 rbio->bi_io_vec[i].bv_page = page;
179 }
180 }
181
182 return r10_bio;
183
184out_free_pages:
185 for ( ; i > 0 ; i--)
186 safe_put_page(bio->bi_io_vec[i-1].bv_page);
187 while (j--)
188 for (i = 0; i < RESYNC_PAGES ; i++)
189 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
190 j = 0;
191out_free_bio:
192 for ( ; j < nalloc; j++) {
193 if (r10_bio->devs[j].bio)
194 bio_put(r10_bio->devs[j].bio);
195 if (r10_bio->devs[j].repl_bio)
196 bio_put(r10_bio->devs[j].repl_bio);
197 }
198 r10bio_pool_free(r10_bio, conf);
199 return NULL;
200}
201
202static void r10buf_pool_free(void *__r10_bio, void *data)
203{
204 int i;
205 struct r10conf *conf = data;
206 struct r10bio *r10bio = __r10_bio;
207 int j;
208
209 for (j=0; j < conf->copies; j++) {
210 struct bio *bio = r10bio->devs[j].bio;
211 if (bio) {
212 for (i = 0; i < RESYNC_PAGES; i++) {
213 safe_put_page(bio->bi_io_vec[i].bv_page);
214 bio->bi_io_vec[i].bv_page = NULL;
215 }
216 bio_put(bio);
217 }
218 bio = r10bio->devs[j].repl_bio;
219 if (bio)
220 bio_put(bio);
221 }
222 r10bio_pool_free(r10bio, conf);
223}
224
225static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
226{
227 int i;
228
229 for (i = 0; i < conf->copies; i++) {
230 struct bio **bio = & r10_bio->devs[i].bio;
231 if (!BIO_SPECIAL(*bio))
232 bio_put(*bio);
233 *bio = NULL;
234 bio = &r10_bio->devs[i].repl_bio;
235 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
236 bio_put(*bio);
237 *bio = NULL;
238 }
239}
240
241static void free_r10bio(struct r10bio *r10_bio)
242{
243 struct r10conf *conf = r10_bio->mddev->private;
244
245 put_all_bios(conf, r10_bio);
246 mempool_free(r10_bio, conf->r10bio_pool);
247}
248
249static void put_buf(struct r10bio *r10_bio)
250{
251 struct r10conf *conf = r10_bio->mddev->private;
252
253 mempool_free(r10_bio, conf->r10buf_pool);
254
255 lower_barrier(conf);
256}
257
258static void reschedule_retry(struct r10bio *r10_bio)
259{
260 unsigned long flags;
261 struct mddev *mddev = r10_bio->mddev;
262 struct r10conf *conf = mddev->private;
263
264 spin_lock_irqsave(&conf->device_lock, flags);
265 list_add(&r10_bio->retry_list, &conf->retry_list);
266 conf->nr_queued ++;
267 spin_unlock_irqrestore(&conf->device_lock, flags);
268
269
270 wake_up(&conf->wait_barrier);
271
272 md_wakeup_thread(mddev->thread);
273}
274
275
276
277
278
279
280static void raid_end_bio_io(struct r10bio *r10_bio)
281{
282 struct bio *bio = r10_bio->master_bio;
283 int done;
284 struct r10conf *conf = r10_bio->mddev->private;
285
286 if (bio->bi_phys_segments) {
287 unsigned long flags;
288 spin_lock_irqsave(&conf->device_lock, flags);
289 bio->bi_phys_segments--;
290 done = (bio->bi_phys_segments == 0);
291 spin_unlock_irqrestore(&conf->device_lock, flags);
292 } else
293 done = 1;
294 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
295 clear_bit(BIO_UPTODATE, &bio->bi_flags);
296 if (done) {
297 bio_endio(bio, 0);
298
299
300
301
302 allow_barrier(conf);
303 }
304 free_r10bio(r10_bio);
305}
306
307
308
309
310static inline void update_head_pos(int slot, struct r10bio *r10_bio)
311{
312 struct r10conf *conf = r10_bio->mddev->private;
313
314 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
315 r10_bio->devs[slot].addr + (r10_bio->sectors);
316}
317
318
319
320
321static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
322 struct bio *bio, int *slotp, int *replp)
323{
324 int slot;
325 int repl = 0;
326
327 for (slot = 0; slot < conf->copies; slot++) {
328 if (r10_bio->devs[slot].bio == bio)
329 break;
330 if (r10_bio->devs[slot].repl_bio == bio) {
331 repl = 1;
332 break;
333 }
334 }
335
336 BUG_ON(slot == conf->copies);
337 update_head_pos(slot, r10_bio);
338
339 if (slotp)
340 *slotp = slot;
341 if (replp)
342 *replp = repl;
343 return r10_bio->devs[slot].devnum;
344}
345
346static void raid10_end_read_request(struct bio *bio, int error)
347{
348 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
349 struct r10bio *r10_bio = bio->bi_private;
350 int slot, dev;
351 struct md_rdev *rdev;
352 struct r10conf *conf = r10_bio->mddev->private;
353
354
355 slot = r10_bio->read_slot;
356 dev = r10_bio->devs[slot].devnum;
357 rdev = r10_bio->devs[slot].rdev;
358
359
360
361 update_head_pos(slot, r10_bio);
362
363 if (uptodate) {
364
365
366
367
368
369
370
371
372
373 set_bit(R10BIO_Uptodate, &r10_bio->state);
374 } else {
375
376
377
378
379
380 unsigned long flags;
381 spin_lock_irqsave(&conf->device_lock, flags);
382 if (!enough(conf, rdev->raid_disk))
383 uptodate = 1;
384 spin_unlock_irqrestore(&conf->device_lock, flags);
385 }
386 if (uptodate) {
387 raid_end_bio_io(r10_bio);
388 rdev_dec_pending(rdev, conf->mddev);
389 } else {
390
391
392
393 char b[BDEVNAME_SIZE];
394 printk_ratelimited(KERN_ERR
395 "md/raid10:%s: %s: rescheduling sector %llu\n",
396 mdname(conf->mddev),
397 bdevname(rdev->bdev, b),
398 (unsigned long long)r10_bio->sector);
399 set_bit(R10BIO_ReadError, &r10_bio->state);
400 reschedule_retry(r10_bio);
401 }
402}
403
404static void close_write(struct r10bio *r10_bio)
405{
406
407 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
408 r10_bio->sectors,
409 !test_bit(R10BIO_Degraded, &r10_bio->state),
410 0);
411 md_write_end(r10_bio->mddev);
412}
413
414static void one_write_done(struct r10bio *r10_bio)
415{
416 if (atomic_dec_and_test(&r10_bio->remaining)) {
417 if (test_bit(R10BIO_WriteError, &r10_bio->state))
418 reschedule_retry(r10_bio);
419 else {
420 close_write(r10_bio);
421 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
422 reschedule_retry(r10_bio);
423 else
424 raid_end_bio_io(r10_bio);
425 }
426 }
427}
428
429static void raid10_end_write_request(struct bio *bio, int error)
430{
431 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
432 struct r10bio *r10_bio = bio->bi_private;
433 int dev;
434 int dec_rdev = 1;
435 struct r10conf *conf = r10_bio->mddev->private;
436 int slot, repl;
437 struct md_rdev *rdev = NULL;
438
439 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
440
441 if (repl)
442 rdev = conf->mirrors[dev].replacement;
443 if (!rdev) {
444 smp_rmb();
445 repl = 0;
446 rdev = conf->mirrors[dev].rdev;
447 }
448
449
450
451 if (!uptodate) {
452 if (repl)
453
454
455
456 md_error(rdev->mddev, rdev);
457 else {
458 set_bit(WriteErrorSeen, &rdev->flags);
459 if (!test_and_set_bit(WantReplacement, &rdev->flags))
460 set_bit(MD_RECOVERY_NEEDED,
461 &rdev->mddev->recovery);
462 set_bit(R10BIO_WriteError, &r10_bio->state);
463 dec_rdev = 0;
464 }
465 } else {
466
467
468
469
470
471
472
473
474
475 sector_t first_bad;
476 int bad_sectors;
477
478 set_bit(R10BIO_Uptodate, &r10_bio->state);
479
480
481 if (is_badblock(rdev,
482 r10_bio->devs[slot].addr,
483 r10_bio->sectors,
484 &first_bad, &bad_sectors)) {
485 bio_put(bio);
486 if (repl)
487 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
488 else
489 r10_bio->devs[slot].bio = IO_MADE_GOOD;
490 dec_rdev = 0;
491 set_bit(R10BIO_MadeGood, &r10_bio->state);
492 }
493 }
494
495
496
497
498
499
500 one_write_done(r10_bio);
501 if (dec_rdev)
502 rdev_dec_pending(rdev, conf->mddev);
503}
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
531{
532 int n,f;
533 sector_t sector;
534 sector_t chunk;
535 sector_t stripe;
536 int dev;
537 int slot = 0;
538
539
540 chunk = r10bio->sector >> geo->chunk_shift;
541 sector = r10bio->sector & geo->chunk_mask;
542
543 chunk *= geo->near_copies;
544 stripe = chunk;
545 dev = sector_div(stripe, geo->raid_disks);
546 if (geo->far_offset)
547 stripe *= geo->far_copies;
548
549 sector += stripe << geo->chunk_shift;
550
551
552 for (n = 0; n < geo->near_copies; n++) {
553 int d = dev;
554 sector_t s = sector;
555 r10bio->devs[slot].addr = sector;
556 r10bio->devs[slot].devnum = d;
557 slot++;
558
559 for (f = 1; f < geo->far_copies; f++) {
560 d += geo->near_copies;
561 if (d >= geo->raid_disks)
562 d -= geo->raid_disks;
563 s += geo->stride;
564 r10bio->devs[slot].devnum = d;
565 r10bio->devs[slot].addr = s;
566 slot++;
567 }
568 dev++;
569 if (dev >= geo->raid_disks) {
570 dev = 0;
571 sector += (geo->chunk_mask + 1);
572 }
573 }
574}
575
576static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
577{
578 struct geom *geo = &conf->geo;
579
580 if (conf->reshape_progress != MaxSector &&
581 ((r10bio->sector >= conf->reshape_progress) !=
582 conf->mddev->reshape_backwards)) {
583 set_bit(R10BIO_Previous, &r10bio->state);
584 geo = &conf->prev;
585 } else
586 clear_bit(R10BIO_Previous, &r10bio->state);
587
588 __raid10_find_phys(geo, r10bio);
589}
590
591static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
592{
593 sector_t offset, chunk, vchunk;
594
595
596
597 struct geom *geo = &conf->geo;
598
599 offset = sector & geo->chunk_mask;
600 if (geo->far_offset) {
601 int fc;
602 chunk = sector >> geo->chunk_shift;
603 fc = sector_div(chunk, geo->far_copies);
604 dev -= fc * geo->near_copies;
605 if (dev < 0)
606 dev += geo->raid_disks;
607 } else {
608 while (sector >= geo->stride) {
609 sector -= geo->stride;
610 if (dev < geo->near_copies)
611 dev += geo->raid_disks - geo->near_copies;
612 else
613 dev -= geo->near_copies;
614 }
615 chunk = sector >> geo->chunk_shift;
616 }
617 vchunk = chunk * geo->raid_disks + dev;
618 sector_div(vchunk, geo->near_copies);
619 return (vchunk << geo->chunk_shift) + offset;
620}
621
622
623
624
625
626
627
628
629
630
631
632static int raid10_mergeable_bvec(struct request_queue *q,
633 struct bvec_merge_data *bvm,
634 struct bio_vec *biovec)
635{
636 struct mddev *mddev = q->queuedata;
637 struct r10conf *conf = mddev->private;
638 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
639 int max;
640 unsigned int chunk_sectors;
641 unsigned int bio_sectors = bvm->bi_size >> 9;
642 struct geom *geo = &conf->geo;
643
644 chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
645 if (conf->reshape_progress != MaxSector &&
646 ((sector >= conf->reshape_progress) !=
647 conf->mddev->reshape_backwards))
648 geo = &conf->prev;
649
650 if (geo->near_copies < geo->raid_disks) {
651 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
652 + bio_sectors)) << 9;
653 if (max < 0)
654
655 max = 0;
656 if (max <= biovec->bv_len && bio_sectors == 0)
657 return biovec->bv_len;
658 } else
659 max = biovec->bv_len;
660
661 if (mddev->merge_check_needed) {
662 struct {
663 struct r10bio r10_bio;
664 struct r10dev devs[conf->copies];
665 } on_stack;
666 struct r10bio *r10_bio = &on_stack.r10_bio;
667 int s;
668 if (conf->reshape_progress != MaxSector) {
669
670 if (max <= biovec->bv_len && bio_sectors == 0)
671 return biovec->bv_len;
672 return 0;
673 }
674 r10_bio->sector = sector;
675 raid10_find_phys(conf, r10_bio);
676 rcu_read_lock();
677 for (s = 0; s < conf->copies; s++) {
678 int disk = r10_bio->devs[s].devnum;
679 struct md_rdev *rdev = rcu_dereference(
680 conf->mirrors[disk].rdev);
681 if (rdev && !test_bit(Faulty, &rdev->flags)) {
682 struct request_queue *q =
683 bdev_get_queue(rdev->bdev);
684 if (q->merge_bvec_fn) {
685 bvm->bi_sector = r10_bio->devs[s].addr
686 + rdev->data_offset;
687 bvm->bi_bdev = rdev->bdev;
688 max = min(max, q->merge_bvec_fn(
689 q, bvm, biovec));
690 }
691 }
692 rdev = rcu_dereference(conf->mirrors[disk].replacement);
693 if (rdev && !test_bit(Faulty, &rdev->flags)) {
694 struct request_queue *q =
695 bdev_get_queue(rdev->bdev);
696 if (q->merge_bvec_fn) {
697 bvm->bi_sector = r10_bio->devs[s].addr
698 + rdev->data_offset;
699 bvm->bi_bdev = rdev->bdev;
700 max = min(max, q->merge_bvec_fn(
701 q, bvm, biovec));
702 }
703 }
704 }
705 rcu_read_unlock();
706 }
707 return max;
708}
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729static struct md_rdev *read_balance(struct r10conf *conf,
730 struct r10bio *r10_bio,
731 int *max_sectors)
732{
733 const sector_t this_sector = r10_bio->sector;
734 int disk, slot;
735 int sectors = r10_bio->sectors;
736 int best_good_sectors;
737 sector_t new_distance, best_dist;
738 struct md_rdev *best_rdev, *rdev = NULL;
739 int do_balance;
740 int best_slot;
741 struct geom *geo = &conf->geo;
742
743 raid10_find_phys(conf, r10_bio);
744 rcu_read_lock();
745retry:
746 sectors = r10_bio->sectors;
747 best_slot = -1;
748 best_rdev = NULL;
749 best_dist = MaxSector;
750 best_good_sectors = 0;
751 do_balance = 1;
752
753
754
755
756
757
758 if (conf->mddev->recovery_cp < MaxSector
759 && (this_sector + sectors >= conf->next_resync))
760 do_balance = 0;
761
762 for (slot = 0; slot < conf->copies ; slot++) {
763 sector_t first_bad;
764 int bad_sectors;
765 sector_t dev_sector;
766
767 if (r10_bio->devs[slot].bio == IO_BLOCKED)
768 continue;
769 disk = r10_bio->devs[slot].devnum;
770 rdev = rcu_dereference(conf->mirrors[disk].replacement);
771 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
772 test_bit(Unmerged, &rdev->flags) ||
773 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
774 rdev = rcu_dereference(conf->mirrors[disk].rdev);
775 if (rdev == NULL ||
776 test_bit(Faulty, &rdev->flags) ||
777 test_bit(Unmerged, &rdev->flags))
778 continue;
779 if (!test_bit(In_sync, &rdev->flags) &&
780 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
781 continue;
782
783 dev_sector = r10_bio->devs[slot].addr;
784 if (is_badblock(rdev, dev_sector, sectors,
785 &first_bad, &bad_sectors)) {
786 if (best_dist < MaxSector)
787
788 continue;
789 if (first_bad <= dev_sector) {
790
791
792
793
794 bad_sectors -= (dev_sector - first_bad);
795 if (!do_balance && sectors > bad_sectors)
796 sectors = bad_sectors;
797 if (best_good_sectors > sectors)
798 best_good_sectors = sectors;
799 } else {
800 sector_t good_sectors =
801 first_bad - dev_sector;
802 if (good_sectors > best_good_sectors) {
803 best_good_sectors = good_sectors;
804 best_slot = slot;
805 best_rdev = rdev;
806 }
807 if (!do_balance)
808
809 break;
810 }
811 continue;
812 } else
813 best_good_sectors = sectors;
814
815 if (!do_balance)
816 break;
817
818
819
820
821
822 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
823 break;
824
825
826 if (geo->far_copies > 1)
827 new_distance = r10_bio->devs[slot].addr;
828 else
829 new_distance = abs(r10_bio->devs[slot].addr -
830 conf->mirrors[disk].head_position);
831 if (new_distance < best_dist) {
832 best_dist = new_distance;
833 best_slot = slot;
834 best_rdev = rdev;
835 }
836 }
837 if (slot >= conf->copies) {
838 slot = best_slot;
839 rdev = best_rdev;
840 }
841
842 if (slot >= 0) {
843 atomic_inc(&rdev->nr_pending);
844 if (test_bit(Faulty, &rdev->flags)) {
845
846
847
848 rdev_dec_pending(rdev, conf->mddev);
849 goto retry;
850 }
851 r10_bio->read_slot = slot;
852 } else
853 rdev = NULL;
854 rcu_read_unlock();
855 *max_sectors = best_good_sectors;
856
857 return rdev;
858}
859
860int md_raid10_congested(struct mddev *mddev, int bits)
861{
862 struct r10conf *conf = mddev->private;
863 int i, ret = 0;
864
865 if ((bits & (1 << BDI_async_congested)) &&
866 conf->pending_count >= max_queued_requests)
867 return 1;
868
869 rcu_read_lock();
870 for (i = 0;
871 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
872 && ret == 0;
873 i++) {
874 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
875 if (rdev && !test_bit(Faulty, &rdev->flags)) {
876 struct request_queue *q = bdev_get_queue(rdev->bdev);
877
878 ret |= bdi_congested(&q->backing_dev_info, bits);
879 }
880 }
881 rcu_read_unlock();
882 return ret;
883}
884EXPORT_SYMBOL_GPL(md_raid10_congested);
885
/*
 * Congestion callback taking the mddev as opaque @data: congested when
 * either the md core or the raid10 personality says so.
 */
static int raid10_congested(void *data, int bits)
{
	struct mddev *mddev = data;

	if (mddev_congested(mddev, bits))
		return 1;
	return md_raid10_congested(mddev, bits) != 0;
}
893
894static void flush_pending_writes(struct r10conf *conf)
895{
896
897
898
899 spin_lock_irq(&conf->device_lock);
900
901 if (conf->pending_bio_list.head) {
902 struct bio *bio;
903 bio = bio_list_get(&conf->pending_bio_list);
904 conf->pending_count = 0;
905 spin_unlock_irq(&conf->device_lock);
906
907
908 bitmap_unplug(conf->mddev->bitmap);
909 wake_up(&conf->wait_barrier);
910
911 while (bio) {
912 struct bio *next = bio->bi_next;
913 bio->bi_next = NULL;
914 if (unlikely((bio->bi_rw & REQ_DISCARD) &&
915 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
916
917 bio_endio(bio, 0);
918 else
919 generic_make_request(bio);
920 bio = next;
921 }
922 } else
923 spin_unlock_irq(&conf->device_lock);
924}
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948static void raise_barrier(struct r10conf *conf, int force)
949{
950 BUG_ON(force && !conf->barrier);
951 spin_lock_irq(&conf->resync_lock);
952
953
954 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
955 conf->resync_lock);
956
957
958 conf->barrier++;
959
960
961 wait_event_lock_irq(conf->wait_barrier,
962 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
963 conf->resync_lock);
964
965 spin_unlock_irq(&conf->resync_lock);
966}
967
968static void lower_barrier(struct r10conf *conf)
969{
970 unsigned long flags;
971 spin_lock_irqsave(&conf->resync_lock, flags);
972 conf->barrier--;
973 spin_unlock_irqrestore(&conf->resync_lock, flags);
974 wake_up(&conf->wait_barrier);
975}
976
977static void wait_barrier(struct r10conf *conf)
978{
979 spin_lock_irq(&conf->resync_lock);
980 if (conf->barrier) {
981 conf->nr_waiting++;
982
983
984
985
986
987
988
989
990
991 wait_event_lock_irq(conf->wait_barrier,
992 !conf->barrier ||
993 (conf->nr_pending &&
994 current->bio_list &&
995 !bio_list_empty(current->bio_list)),
996 conf->resync_lock);
997 conf->nr_waiting--;
998 }
999 conf->nr_pending++;
1000 spin_unlock_irq(&conf->resync_lock);
1001}
1002
1003static void allow_barrier(struct r10conf *conf)
1004{
1005 unsigned long flags;
1006 spin_lock_irqsave(&conf->resync_lock, flags);
1007 conf->nr_pending--;
1008 spin_unlock_irqrestore(&conf->resync_lock, flags);
1009 wake_up(&conf->wait_barrier);
1010}
1011
1012static void freeze_array(struct r10conf *conf)
1013{
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026 spin_lock_irq(&conf->resync_lock);
1027 conf->barrier++;
1028 conf->nr_waiting++;
1029 wait_event_lock_irq_cmd(conf->wait_barrier,
1030 conf->nr_pending == conf->nr_queued+1,
1031 conf->resync_lock,
1032 flush_pending_writes(conf));
1033
1034 spin_unlock_irq(&conf->resync_lock);
1035}
1036
1037static void unfreeze_array(struct r10conf *conf)
1038{
1039
1040 spin_lock_irq(&conf->resync_lock);
1041 conf->barrier--;
1042 conf->nr_waiting--;
1043 wake_up(&conf->wait_barrier);
1044 spin_unlock_irq(&conf->resync_lock);
1045}
1046
1047static sector_t choose_data_offset(struct r10bio *r10_bio,
1048 struct md_rdev *rdev)
1049{
1050 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1051 test_bit(R10BIO_Previous, &r10_bio->state))
1052 return rdev->data_offset;
1053 else
1054 return rdev->new_data_offset;
1055}
1056
1057struct raid10_plug_cb {
1058 struct blk_plug_cb cb;
1059 struct bio_list pending;
1060 int pending_cnt;
1061};
1062
1063static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1064{
1065 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1066 cb);
1067 struct mddev *mddev = plug->cb.data;
1068 struct r10conf *conf = mddev->private;
1069 struct bio *bio;
1070
1071 if (from_schedule || current->bio_list) {
1072 spin_lock_irq(&conf->device_lock);
1073 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1074 conf->pending_count += plug->pending_cnt;
1075 spin_unlock_irq(&conf->device_lock);
1076 md_wakeup_thread(mddev->thread);
1077 kfree(plug);
1078 return;
1079 }
1080
1081
1082 bio = bio_list_get(&plug->pending);
1083 bitmap_unplug(mddev->bitmap);
1084 wake_up(&conf->wait_barrier);
1085
1086 while (bio) {
1087 struct bio *next = bio->bi_next;
1088 bio->bi_next = NULL;
1089 generic_make_request(bio);
1090 bio = next;
1091 }
1092 kfree(plug);
1093}
1094
1095static void make_request(struct mddev *mddev, struct bio * bio)
1096{
1097 struct r10conf *conf = mddev->private;
1098 struct r10bio *r10_bio;
1099 struct bio *read_bio;
1100 int i;
1101 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1102 int chunk_sects = chunk_mask + 1;
1103 const int rw = bio_data_dir(bio);
1104 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1105 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
1106 const unsigned long do_discard = (bio->bi_rw
1107 & (REQ_DISCARD | REQ_SECURE));
1108 unsigned long flags;
1109 struct md_rdev *blocked_rdev;
1110 struct blk_plug_cb *cb;
1111 struct raid10_plug_cb *plug = NULL;
1112 int sectors_handled;
1113 int max_sectors;
1114 int sectors;
1115
1116 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1117 md_flush_request(mddev, bio);
1118 return;
1119 }
1120
1121
1122
1123
1124 if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9)
1125 > chunk_sects
1126 && (conf->geo.near_copies < conf->geo.raid_disks
1127 || conf->prev.near_copies < conf->prev.raid_disks))) {
1128 struct bio_pair *bp;
1129
1130 if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
1131 bio->bi_idx != 0)
1132 goto bad_map;
1133
1134
1135
1136 bp = bio_split(bio,
1137 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147 spin_lock_irq(&conf->resync_lock);
1148 conf->nr_waiting++;
1149 spin_unlock_irq(&conf->resync_lock);
1150
1151 make_request(mddev, &bp->bio1);
1152 make_request(mddev, &bp->bio2);
1153
1154 spin_lock_irq(&conf->resync_lock);
1155 conf->nr_waiting--;
1156 wake_up(&conf->wait_barrier);
1157 spin_unlock_irq(&conf->resync_lock);
1158
1159 bio_pair_release(bp);
1160 return;
1161 bad_map:
1162 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
1163 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
1164 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
1165
1166 bio_io_error(bio);
1167 return;
1168 }
1169
1170 md_write_start(mddev, bio);
1171
1172
1173
1174
1175
1176
1177 wait_barrier(conf);
1178
1179 sectors = bio->bi_size >> 9;
1180 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1181 bio->bi_sector < conf->reshape_progress &&
1182 bio->bi_sector + sectors > conf->reshape_progress) {
1183
1184
1185
1186 allow_barrier(conf);
1187 wait_event(conf->wait_barrier,
1188 conf->reshape_progress <= bio->bi_sector ||
1189 conf->reshape_progress >= bio->bi_sector + sectors);
1190 wait_barrier(conf);
1191 }
1192 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1193 bio_data_dir(bio) == WRITE &&
1194 (mddev->reshape_backwards
1195 ? (bio->bi_sector < conf->reshape_safe &&
1196 bio->bi_sector + sectors > conf->reshape_progress)
1197 : (bio->bi_sector + sectors > conf->reshape_safe &&
1198 bio->bi_sector < conf->reshape_progress))) {
1199
1200 mddev->reshape_position = conf->reshape_progress;
1201 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1202 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1203 md_wakeup_thread(mddev->thread);
1204 wait_event(mddev->sb_wait,
1205 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
1206
1207 conf->reshape_safe = mddev->reshape_position;
1208 }
1209
1210 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1211
1212 r10_bio->master_bio = bio;
1213 r10_bio->sectors = sectors;
1214
1215 r10_bio->mddev = mddev;
1216 r10_bio->sector = bio->bi_sector;
1217 r10_bio->state = 0;
1218
1219
1220
1221
1222
1223
1224
1225
1226 bio->bi_phys_segments = 0;
1227 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1228
1229 if (rw == READ) {
1230
1231
1232
1233 struct md_rdev *rdev;
1234 int slot;
1235
1236read_again:
1237 rdev = read_balance(conf, r10_bio, &max_sectors);
1238 if (!rdev) {
1239 raid_end_bio_io(r10_bio);
1240 return;
1241 }
1242 slot = r10_bio->read_slot;
1243
1244 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1245 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
1246 max_sectors);
1247
1248 r10_bio->devs[slot].bio = read_bio;
1249 r10_bio->devs[slot].rdev = rdev;
1250
1251 read_bio->bi_sector = r10_bio->devs[slot].addr +
1252 choose_data_offset(r10_bio, rdev);
1253 read_bio->bi_bdev = rdev->bdev;
1254 read_bio->bi_end_io = raid10_end_read_request;
1255 read_bio->bi_rw = READ | do_sync;
1256 read_bio->bi_private = r10_bio;
1257
1258 if (max_sectors < r10_bio->sectors) {
1259
1260
1261
1262 sectors_handled = (r10_bio->sectors + max_sectors
1263 - bio->bi_sector);
1264 r10_bio->sectors = max_sectors;
1265 spin_lock_irq(&conf->device_lock);
1266 if (bio->bi_phys_segments == 0)
1267 bio->bi_phys_segments = 2;
1268 else
1269 bio->bi_phys_segments++;
1270 spin_unlock(&conf->device_lock);
1271
1272
1273
1274
1275
1276 reschedule_retry(r10_bio);
1277
1278 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1279
1280 r10_bio->master_bio = bio;
1281 r10_bio->sectors = ((bio->bi_size >> 9)
1282 - sectors_handled);
1283 r10_bio->state = 0;
1284 r10_bio->mddev = mddev;
1285 r10_bio->sector = bio->bi_sector + sectors_handled;
1286 goto read_again;
1287 } else
1288 generic_make_request(read_bio);
1289 return;
1290 }
1291
1292
1293
1294
1295 if (conf->pending_count >= max_queued_requests) {
1296 md_wakeup_thread(mddev->thread);
1297 wait_event(conf->wait_barrier,
1298 conf->pending_count < max_queued_requests);
1299 }
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312 r10_bio->read_slot = -1;
1313 raid10_find_phys(conf, r10_bio);
1314retry_write:
1315 blocked_rdev = NULL;
1316 rcu_read_lock();
1317 max_sectors = r10_bio->sectors;
1318
1319 for (i = 0; i < conf->copies; i++) {
1320 int d = r10_bio->devs[i].devnum;
1321 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1322 struct md_rdev *rrdev = rcu_dereference(
1323 conf->mirrors[d].replacement);
1324 if (rdev == rrdev)
1325 rrdev = NULL;
1326 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1327 atomic_inc(&rdev->nr_pending);
1328 blocked_rdev = rdev;
1329 break;
1330 }
1331 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1332 atomic_inc(&rrdev->nr_pending);
1333 blocked_rdev = rrdev;
1334 break;
1335 }
1336 if (rdev && (test_bit(Faulty, &rdev->flags)
1337 || test_bit(Unmerged, &rdev->flags)))
1338 rdev = NULL;
1339 if (rrdev && (test_bit(Faulty, &rrdev->flags)
1340 || test_bit(Unmerged, &rrdev->flags)))
1341 rrdev = NULL;
1342
1343 r10_bio->devs[i].bio = NULL;
1344 r10_bio->devs[i].repl_bio = NULL;
1345
1346 if (!rdev && !rrdev) {
1347 set_bit(R10BIO_Degraded, &r10_bio->state);
1348 continue;
1349 }
1350 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1351 sector_t first_bad;
1352 sector_t dev_sector = r10_bio->devs[i].addr;
1353 int bad_sectors;
1354 int is_bad;
1355
1356 is_bad = is_badblock(rdev, dev_sector,
1357 max_sectors,
1358 &first_bad, &bad_sectors);
1359 if (is_bad < 0) {
1360
1361
1362
1363 atomic_inc(&rdev->nr_pending);
1364 set_bit(BlockedBadBlocks, &rdev->flags);
1365 blocked_rdev = rdev;
1366 break;
1367 }
1368 if (is_bad && first_bad <= dev_sector) {
1369
1370 bad_sectors -= (dev_sector - first_bad);
1371 if (bad_sectors < max_sectors)
1372
1373
1374
1375 max_sectors = bad_sectors;
1376
1377
1378
1379
1380
1381
1382
1383
1384 continue;
1385 }
1386 if (is_bad) {
1387 int good_sectors = first_bad - dev_sector;
1388 if (good_sectors < max_sectors)
1389 max_sectors = good_sectors;
1390 }
1391 }
1392 if (rdev) {
1393 r10_bio->devs[i].bio = bio;
1394 atomic_inc(&rdev->nr_pending);
1395 }
1396 if (rrdev) {
1397 r10_bio->devs[i].repl_bio = bio;
1398 atomic_inc(&rrdev->nr_pending);
1399 }
1400 }
1401 rcu_read_unlock();
1402
1403 if (unlikely(blocked_rdev)) {
1404
1405 int j;
1406 int d;
1407
1408 for (j = 0; j < i; j++) {
1409 if (r10_bio->devs[j].bio) {
1410 d = r10_bio->devs[j].devnum;
1411 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1412 }
1413 if (r10_bio->devs[j].repl_bio) {
1414 struct md_rdev *rdev;
1415 d = r10_bio->devs[j].devnum;
1416 rdev = conf->mirrors[d].replacement;
1417 if (!rdev) {
1418
1419 smp_mb();
1420 rdev = conf->mirrors[d].rdev;
1421 }
1422 rdev_dec_pending(rdev, mddev);
1423 }
1424 }
1425 allow_barrier(conf);
1426 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1427 wait_barrier(conf);
1428 goto retry_write;
1429 }
1430
1431 if (max_sectors < r10_bio->sectors) {
1432
1433
1434
1435 r10_bio->sectors = max_sectors;
1436 spin_lock_irq(&conf->device_lock);
1437 if (bio->bi_phys_segments == 0)
1438 bio->bi_phys_segments = 2;
1439 else
1440 bio->bi_phys_segments++;
1441 spin_unlock_irq(&conf->device_lock);
1442 }
1443 sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
1444
1445 atomic_set(&r10_bio->remaining, 1);
1446 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1447
1448 for (i = 0; i < conf->copies; i++) {
1449 struct bio *mbio;
1450 int d = r10_bio->devs[i].devnum;
1451 if (r10_bio->devs[i].bio) {
1452 struct md_rdev *rdev = conf->mirrors[d].rdev;
1453 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1454 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1455 max_sectors);
1456 r10_bio->devs[i].bio = mbio;
1457
1458 mbio->bi_sector = (r10_bio->devs[i].addr+
1459 choose_data_offset(r10_bio,
1460 rdev));
1461 mbio->bi_bdev = rdev->bdev;
1462 mbio->bi_end_io = raid10_end_write_request;
1463 mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
1464 mbio->bi_private = r10_bio;
1465
1466 atomic_inc(&r10_bio->remaining);
1467
1468 cb = blk_check_plugged(raid10_unplug, mddev,
1469 sizeof(*plug));
1470 if (cb)
1471 plug = container_of(cb, struct raid10_plug_cb,
1472 cb);
1473 else
1474 plug = NULL;
1475 spin_lock_irqsave(&conf->device_lock, flags);
1476 if (plug) {
1477 bio_list_add(&plug->pending, mbio);
1478 plug->pending_cnt++;
1479 } else {
1480 bio_list_add(&conf->pending_bio_list, mbio);
1481 conf->pending_count++;
1482 }
1483 spin_unlock_irqrestore(&conf->device_lock, flags);
1484 if (!plug)
1485 md_wakeup_thread(mddev->thread);
1486 }
1487
1488 if (r10_bio->devs[i].repl_bio) {
1489 struct md_rdev *rdev = conf->mirrors[d].replacement;
1490 if (rdev == NULL) {
1491
1492 smp_mb();
1493 rdev = conf->mirrors[d].rdev;
1494 }
1495 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1496 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1497 max_sectors);
1498 r10_bio->devs[i].repl_bio = mbio;
1499
1500 mbio->bi_sector = (r10_bio->devs[i].addr +
1501 choose_data_offset(
1502 r10_bio, rdev));
1503 mbio->bi_bdev = rdev->bdev;
1504 mbio->bi_end_io = raid10_end_write_request;
1505 mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
1506 mbio->bi_private = r10_bio;
1507
1508 atomic_inc(&r10_bio->remaining);
1509 spin_lock_irqsave(&conf->device_lock, flags);
1510 bio_list_add(&conf->pending_bio_list, mbio);
1511 conf->pending_count++;
1512 spin_unlock_irqrestore(&conf->device_lock, flags);
1513 if (!mddev_check_plugged(mddev))
1514 md_wakeup_thread(mddev->thread);
1515 }
1516 }
1517
1518
1519
1520
1521
1522 if (sectors_handled < (bio->bi_size >> 9)) {
1523 one_write_done(r10_bio);
1524
1525
1526
1527 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1528
1529 r10_bio->master_bio = bio;
1530 r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
1531
1532 r10_bio->mddev = mddev;
1533 r10_bio->sector = bio->bi_sector + sectors_handled;
1534 r10_bio->state = 0;
1535 goto retry_write;
1536 }
1537 one_write_done(r10_bio);
1538
1539
1540 wake_up(&conf->wait_barrier);
1541}
1542
1543static void status(struct seq_file *seq, struct mddev *mddev)
1544{
1545 struct r10conf *conf = mddev->private;
1546 int i;
1547
1548 if (conf->geo.near_copies < conf->geo.raid_disks)
1549 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1550 if (conf->geo.near_copies > 1)
1551 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1552 if (conf->geo.far_copies > 1) {
1553 if (conf->geo.far_offset)
1554 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1555 else
1556 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1557 }
1558 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1559 conf->geo.raid_disks - mddev->degraded);
1560 for (i = 0; i < conf->geo.raid_disks; i++)
1561 seq_printf(seq, "%s",
1562 conf->mirrors[i].rdev &&
1563 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
1564 seq_printf(seq, "]");
1565}
1566
1567
1568
1569
1570
1571
1572static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
1573{
1574 int first = 0;
1575
1576 do {
1577 int n = conf->copies;
1578 int cnt = 0;
1579 int this = first;
1580 while (n--) {
1581 if (conf->mirrors[this].rdev &&
1582 this != ignore)
1583 cnt++;
1584 this = (this+1) % geo->raid_disks;
1585 }
1586 if (cnt == 0)
1587 return 0;
1588 first = (first + geo->near_copies) % geo->raid_disks;
1589 } while (first != 0);
1590 return 1;
1591}
1592
1593static int enough(struct r10conf *conf, int ignore)
1594{
1595 return _enough(conf, &conf->geo, ignore) &&
1596 _enough(conf, &conf->prev, ignore);
1597}
1598
1599static void error(struct mddev *mddev, struct md_rdev *rdev)
1600{
1601 char b[BDEVNAME_SIZE];
1602 struct r10conf *conf = mddev->private;
1603
1604
1605
1606
1607
1608
1609
1610 if (test_bit(In_sync, &rdev->flags)
1611 && !enough(conf, rdev->raid_disk))
1612
1613
1614
1615 return;
1616 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1617 unsigned long flags;
1618 spin_lock_irqsave(&conf->device_lock, flags);
1619 mddev->degraded++;
1620 spin_unlock_irqrestore(&conf->device_lock, flags);
1621
1622
1623
1624 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1625 }
1626 set_bit(Blocked, &rdev->flags);
1627 set_bit(Faulty, &rdev->flags);
1628 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1629 printk(KERN_ALERT
1630 "md/raid10:%s: Disk failure on %s, disabling device.\n"
1631 "md/raid10:%s: Operation continuing on %d devices.\n",
1632 mdname(mddev), bdevname(rdev->bdev, b),
1633 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1634}
1635
1636static void print_conf(struct r10conf *conf)
1637{
1638 int i;
1639 struct raid10_info *tmp;
1640
1641 printk(KERN_DEBUG "RAID10 conf printout:\n");
1642 if (!conf) {
1643 printk(KERN_DEBUG "(!conf)\n");
1644 return;
1645 }
1646 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1647 conf->geo.raid_disks);
1648
1649 for (i = 0; i < conf->geo.raid_disks; i++) {
1650 char b[BDEVNAME_SIZE];
1651 tmp = conf->mirrors + i;
1652 if (tmp->rdev)
1653 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1654 i, !test_bit(In_sync, &tmp->rdev->flags),
1655 !test_bit(Faulty, &tmp->rdev->flags),
1656 bdevname(tmp->rdev->bdev,b));
1657 }
1658}
1659
/*
 * Tear down the resync buffer pool.
 *
 * Passes through the barrier once (wait_barrier() immediately followed by
 * allow_barrier()) before destroying conf->r10buf_pool.
 * NOTE(review): presumably this waits for any outstanding resync barrier
 * to drop first; wait_barrier()/allow_barrier() are defined outside this
 * chunk, so confirm there.
 *
 * The pointer is reset to NULL afterwards; init_resync() has a
 * BUG_ON(conf->r10buf_pool) that relies on this.
 */
static void close_sync(struct r10conf *conf)
{
	wait_barrier(conf);
	allow_barrier(conf);

	mempool_destroy(conf->r10buf_pool);
	conf->r10buf_pool = NULL;
}
1668
/*
 * Promote devices that are ready to be used to In_sync and account for
 * the resulting change in mddev->degraded.
 *
 * For every raid disk slot:
 *  - if a replacement exists whose recovery_offset is MaxSector, is not
 *    Faulty and was not already In_sync, it is marked In_sync and the
 *    original rdev (if any) is marked Faulty;
 *  - otherwise, if the main rdev is not Faulty and was not already
 *    In_sync, it is marked In_sync.
 *
 * Returns the number of slots that went from "no In_sync device" to
 * "has an In_sync device"; the same number is subtracted from
 * mddev->degraded under conf->device_lock.
 */
static int raid10_spare_active(struct mddev *mddev)
{
	int i;
	struct r10conf *conf = mddev->private;
	struct raid10_info *tmp;
	int count = 0;
	unsigned long flags;

	for (i = 0; i < conf->geo.raid_disks; i++) {
		tmp = conf->mirrors + i;
		/*
		 * Note the test_and_set_bit() at the end of each chain: it
		 * only runs (and only sets In_sync) when all earlier tests
		 * in the same chain passed.
		 */
		if (tmp->replacement
		    && tmp->replacement->recovery_offset == MaxSector
		    && !test_bit(Faulty, &tmp->replacement->flags)
		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
			/* Replacement has just become active. */
			/*
			 * Only count it as a newly working slot if there was
			 * no original, or the original was not In_sync
			 * (test_and_clear_bit also drops its In_sync flag).
			 */
			if (!tmp->rdev
			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
				count++;
			if (tmp->rdev) {
				/*
				 * The replaced device is flagged Faulty here;
				 * NOTE(review): presumably so it gets removed
				 * later - the removal path is not in this
				 * function.
				 */
				set_bit(Faulty, &tmp->rdev->flags);
				sysfs_notify_dirent_safe(
					tmp->rdev->sysfs_state);
			}
			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
		} else if (tmp->rdev
			   && !test_bit(Faulty, &tmp->rdev->flags)
			   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
			count++;
			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
		}
	}
	/* degraded is updated under the same lock error() uses for it */
	spin_lock_irqsave(&conf->device_lock, flags);
	mddev->degraded -= count;
	spin_unlock_irqrestore(&conf->device_lock, flags);

	print_conf(conf);
	return count;
}
1715
1716
1717static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1718{
1719 struct r10conf *conf = mddev->private;
1720 int err = -EEXIST;
1721 int mirror;
1722 int first = 0;
1723 int last = conf->geo.raid_disks - 1;
1724 struct request_queue *q = bdev_get_queue(rdev->bdev);
1725
1726 if (mddev->recovery_cp < MaxSector)
1727
1728
1729
1730 return -EBUSY;
1731 if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1))
1732 return -EINVAL;
1733
1734 if (rdev->raid_disk >= 0)
1735 first = last = rdev->raid_disk;
1736
1737 if (q->merge_bvec_fn) {
1738 set_bit(Unmerged, &rdev->flags);
1739 mddev->merge_check_needed = 1;
1740 }
1741
1742 if (rdev->saved_raid_disk >= first &&
1743 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1744 mirror = rdev->saved_raid_disk;
1745 else
1746 mirror = first;
1747 for ( ; mirror <= last ; mirror++) {
1748 struct raid10_info *p = &conf->mirrors[mirror];
1749 if (p->recovery_disabled == mddev->recovery_disabled)
1750 continue;
1751 if (p->rdev) {
1752 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1753 p->replacement != NULL)
1754 continue;
1755 clear_bit(In_sync, &rdev->flags);
1756 set_bit(Replacement, &rdev->flags);
1757 rdev->raid_disk = mirror;
1758 err = 0;
1759 disk_stack_limits(mddev->gendisk, rdev->bdev,
1760 rdev->data_offset << 9);
1761 conf->fullsync = 1;
1762 rcu_assign_pointer(p->replacement, rdev);
1763 break;
1764 }
1765
1766 disk_stack_limits(mddev->gendisk, rdev->bdev,
1767 rdev->data_offset << 9);
1768
1769 p->head_position = 0;
1770 p->recovery_disabled = mddev->recovery_disabled - 1;
1771 rdev->raid_disk = mirror;
1772 err = 0;
1773 if (rdev->saved_raid_disk != mirror)
1774 conf->fullsync = 1;
1775 rcu_assign_pointer(p->rdev, rdev);
1776 break;
1777 }
1778 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1779
1780
1781
1782
1783
1784
1785
1786 synchronize_sched();
1787 raise_barrier(conf, 0);
1788 lower_barrier(conf);
1789 clear_bit(Unmerged, &rdev->flags);
1790 }
1791 md_integrity_add_rdev(rdev, mddev);
1792 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1793 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1794
1795 print_conf(conf);
1796 return err;
1797}
1798
/*
 * Detach @rdev from the mirror slot given by rdev->raid_disk.
 *
 * Returns 0 when @rdev is not attached to that slot (nothing to do), the
 * result of md_integrity_register() after a successful removal, or
 * -EBUSY when the device cannot be removed (it is In_sync, has pending
 * I/O, or is a healthy device that is still usable for recovery).
 */
static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r10conf *conf = mddev->private;
	int err = 0;
	int number = rdev->raid_disk;
	struct md_rdev **rdevp;
	struct raid10_info *p = conf->mirrors + number;

	print_conf(conf);
	/* Work out which of the slot's two pointers refers to @rdev. */
	if (rdev == p->rdev)
		rdevp = &p->rdev;
	else if (rdev == p->replacement)
		rdevp = &p->replacement;
	else
		return 0;

	if (test_bit(In_sync, &rdev->flags) ||
	    atomic_read(&rdev->nr_pending)) {
		err = -EBUSY;
		goto abort;
	}
	/*
	 * A non-Faulty device is kept when recovery is not disabled for
	 * this slot, there is no other replacement, the slot is within the
	 * current geometry and the array passes enough().
	 */
	if (!test_bit(Faulty, &rdev->flags) &&
	    mddev->recovery_disabled != p->recovery_disabled &&
	    (!p->replacement || p->replacement == rdev) &&
	    number < conf->geo.raid_disks &&
	    enough(conf, -1)) {
		err = -EBUSY;
		goto abort;
	}
	/*
	 * Unpublish the pointer, then wait for RCU readers before
	 * re-checking nr_pending: a reader may have taken a reference in
	 * the meantime.
	 */
	*rdevp = NULL;
	synchronize_rcu();
	if (atomic_read(&rdev->nr_pending)) {
		/* lost the race, try later */
		err = -EBUSY;
		*rdevp = rdev;
		goto abort;
	} else if (p->replacement) {
		/* We must have just cleared 'rdev': promote the replacement */
		p->rdev = p->replacement;
		clear_bit(Replacement, &p->replacement->flags);
		/*
		 * Barrier between installing ->rdev and clearing
		 * ->replacement; the write path visible earlier in this file
		 * issues a matching smp_mb() before falling back from
		 * ->replacement to ->rdev.
		 */
		smp_mb();
		p->replacement = NULL;
		clear_bit(WantReplacement, &rdev->flags);
	} else
		/*
		 * No replacement to promote; just make sure a stale
		 * WantReplacement flag does not survive on the removed device.
		 */
		clear_bit(WantReplacement, &rdev->flags);

	err = md_integrity_register(mddev);

abort:

	print_conf(conf);
	return err;
}
1860
1861
1862static void end_sync_read(struct bio *bio, int error)
1863{
1864 struct r10bio *r10_bio = bio->bi_private;
1865 struct r10conf *conf = r10_bio->mddev->private;
1866 int d;
1867
1868 if (bio == r10_bio->master_bio) {
1869
1870 d = r10_bio->read_slot;
1871 } else
1872 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1873
1874 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1875 set_bit(R10BIO_Uptodate, &r10_bio->state);
1876 else
1877
1878
1879
1880 atomic_add(r10_bio->sectors,
1881 &conf->mirrors[d].rdev->corrected_errors);
1882
1883
1884
1885
1886 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1887 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1888 atomic_dec_and_test(&r10_bio->remaining)) {
1889
1890
1891
1892 reschedule_retry(r10_bio);
1893 }
1894}
1895
1896static void end_sync_request(struct r10bio *r10_bio)
1897{
1898 struct mddev *mddev = r10_bio->mddev;
1899
1900 while (atomic_dec_and_test(&r10_bio->remaining)) {
1901 if (r10_bio->master_bio == NULL) {
1902
1903 sector_t s = r10_bio->sectors;
1904 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1905 test_bit(R10BIO_WriteError, &r10_bio->state))
1906 reschedule_retry(r10_bio);
1907 else
1908 put_buf(r10_bio);
1909 md_done_sync(mddev, s, 1);
1910 break;
1911 } else {
1912 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1913 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1914 test_bit(R10BIO_WriteError, &r10_bio->state))
1915 reschedule_retry(r10_bio);
1916 else
1917 put_buf(r10_bio);
1918 r10_bio = r10_bio2;
1919 }
1920 }
1921}
1922
1923static void end_sync_write(struct bio *bio, int error)
1924{
1925 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1926 struct r10bio *r10_bio = bio->bi_private;
1927 struct mddev *mddev = r10_bio->mddev;
1928 struct r10conf *conf = mddev->private;
1929 int d;
1930 sector_t first_bad;
1931 int bad_sectors;
1932 int slot;
1933 int repl;
1934 struct md_rdev *rdev = NULL;
1935
1936 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1937 if (repl)
1938 rdev = conf->mirrors[d].replacement;
1939 else
1940 rdev = conf->mirrors[d].rdev;
1941
1942 if (!uptodate) {
1943 if (repl)
1944 md_error(mddev, rdev);
1945 else {
1946 set_bit(WriteErrorSeen, &rdev->flags);
1947 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1948 set_bit(MD_RECOVERY_NEEDED,
1949 &rdev->mddev->recovery);
1950 set_bit(R10BIO_WriteError, &r10_bio->state);
1951 }
1952 } else if (is_badblock(rdev,
1953 r10_bio->devs[slot].addr,
1954 r10_bio->sectors,
1955 &first_bad, &bad_sectors))
1956 set_bit(R10BIO_MadeGood, &r10_bio->state);
1957
1958 rdev_dec_pending(rdev, mddev);
1959
1960 end_sync_request(r10_bio);
1961}
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
/*
 * Second half of a resync request: all reads have completed, now compare
 * the copies and write out whatever needs rewriting.
 *
 * The first slot whose bio completed with BIO_UPTODATE is taken as the
 * reference copy (fbio).  Every other slot that was read is compared
 * page by page with it; slots that match are left alone, and in
 * MD_RECOVERY_CHECK mode mismatches are only counted.  Otherwise the
 * slot's bio is re-initialised as a write carrying the reference data and
 * submitted.  Any replacement bios that have an end_io set are then
 * submitted as well.
 *
 * ->remaining starts at 1 so the r10bio cannot complete while writes are
 * still being issued; that bias is dropped at 'done'.
 */
static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
	struct r10conf *conf = mddev->private;
	int i, first;
	struct bio *tbio, *fbio;
	int vcnt;

	atomic_set(&r10_bio->remaining, 1);

	/* find the first device with a block */
	for (i=0; i<conf->copies; i++)
		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
			break;

	/* no copy could be read: nothing to compare or write */
	if (i == conf->copies)
		goto done;

	first = i;
	fbio = r10_bio->devs[i].bio;

	/* number of pages covering r10_bio->sectors, rounded up */
	vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
	/* now find blocks with errors */
	for (i=0 ; i < conf->copies ; i++) {
		int  j, d;

		tbio = r10_bio->devs[i].bio;

		/* only slots that were actually read are candidates */
		if (tbio->bi_end_io != end_sync_read)
			continue;
		if (i == first)
			continue;
		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
			/*
			 * This copy was read successfully: compare it with
			 * the reference, using the reference bio's per-page
			 * lengths.
			 */
			for (j = 0; j < vcnt; j++)
				if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
					   page_address(tbio->bi_io_vec[j].bv_page),
					   fbio->bi_io_vec[j].bv_len))
					break;
			if (j == vcnt)
				continue;
			atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
			if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
				/* Don't fix anything. */
				continue;
		}
		/*
		 * Either the read failed or the data differs and we are
		 * allowed to repair: reset the bio's fields and turn it into
		 * a write of the reference data.
		 */
		tbio->bi_vcnt = vcnt;
		tbio->bi_size = r10_bio->sectors << 9;
		tbio->bi_idx = 0;
		tbio->bi_phys_segments = 0;
		tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
		tbio->bi_flags |= 1 << BIO_UPTODATE;
		tbio->bi_next = NULL;
		tbio->bi_rw = WRITE;
		tbio->bi_private = r10_bio;
		tbio->bi_sector = r10_bio->devs[i].addr;

		for (j=0; j < vcnt ; j++) {
			tbio->bi_io_vec[j].bv_offset = 0;
			tbio->bi_io_vec[j].bv_len = PAGE_SIZE;

			memcpy(page_address(tbio->bi_io_vec[j].bv_page),
			       page_address(fbio->bi_io_vec[j].bv_page),
			       PAGE_SIZE);
		}
		tbio->bi_end_io = end_sync_write;

		d = r10_bio->devs[i].devnum;
		/* both counts are dropped again in end_sync_write() */
		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
		atomic_inc(&r10_bio->remaining);
		md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);

		tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
		tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
		generic_make_request(tbio);
	}

	/*
	 * Now write out to any replacement devices that have a bio with an
	 * end_io handler set.
	 */
	for (i = 0; i < conf->copies; i++) {
		int j, d;

		tbio = r10_bio->devs[i].repl_bio;
		if (!tbio || !tbio->bi_end_io)
			continue;
		/*
		 * Copy the reference data into the replacement bio's pages
		 * when the slot's primary bio was neither rewritten above
		 * nor is the reference itself.
		 */
		if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
		    && r10_bio->devs[i].bio != fbio)
			for (j = 0; j < vcnt; j++)
				memcpy(page_address(tbio->bi_io_vec[j].bv_page),
				       page_address(fbio->bi_io_vec[j].bv_page),
				       PAGE_SIZE);
		d = r10_bio->devs[i].devnum;
		atomic_inc(&r10_bio->remaining);
		md_sync_acct(conf->mirrors[d].replacement->bdev,
			     tbio->bi_size >> 9);
		generic_make_request(tbio);
	}

done:
	/* drop the initial bias; if no write is in flight we finish here */
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		md_done_sync(mddev, r10_bio->sectors, 1);
		put_buf(r10_bio);
	}
}
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102static void fix_recovery_read_error(struct r10bio *r10_bio)
2103{
2104
2105
2106
2107
2108
2109
2110
2111 struct mddev *mddev = r10_bio->mddev;
2112 struct r10conf *conf = mddev->private;
2113 struct bio *bio = r10_bio->devs[0].bio;
2114 sector_t sect = 0;
2115 int sectors = r10_bio->sectors;
2116 int idx = 0;
2117 int dr = r10_bio->devs[0].devnum;
2118 int dw = r10_bio->devs[1].devnum;
2119
2120 while (sectors) {
2121 int s = sectors;
2122 struct md_rdev *rdev;
2123 sector_t addr;
2124 int ok;
2125
2126 if (s > (PAGE_SIZE>>9))
2127 s = PAGE_SIZE >> 9;
2128
2129 rdev = conf->mirrors[dr].rdev;
2130 addr = r10_bio->devs[0].addr + sect,
2131 ok = sync_page_io(rdev,
2132 addr,
2133 s << 9,
2134 bio->bi_io_vec[idx].bv_page,
2135 READ, false);
2136 if (ok) {
2137 rdev = conf->mirrors[dw].rdev;
2138 addr = r10_bio->devs[1].addr + sect;
2139 ok = sync_page_io(rdev,
2140 addr,
2141 s << 9,
2142 bio->bi_io_vec[idx].bv_page,
2143 WRITE, false);
2144 if (!ok) {
2145 set_bit(WriteErrorSeen, &rdev->flags);
2146 if (!test_and_set_bit(WantReplacement,
2147 &rdev->flags))
2148 set_bit(MD_RECOVERY_NEEDED,
2149 &rdev->mddev->recovery);
2150 }
2151 }
2152 if (!ok) {
2153
2154
2155
2156
2157 rdev_set_badblocks(rdev, addr, s, 0);
2158
2159 if (rdev != conf->mirrors[dw].rdev) {
2160
2161 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2162 addr = r10_bio->devs[1].addr + sect;
2163 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2164 if (!ok) {
2165
2166 printk(KERN_NOTICE
2167 "md/raid10:%s: recovery aborted"
2168 " due to read error\n",
2169 mdname(mddev));
2170
2171 conf->mirrors[dw].recovery_disabled
2172 = mddev->recovery_disabled;
2173 set_bit(MD_RECOVERY_INTR,
2174 &mddev->recovery);
2175 break;
2176 }
2177 }
2178 }
2179
2180 sectors -= s;
2181 sect += s;
2182 idx++;
2183 }
2184}
2185
2186static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2187{
2188 struct r10conf *conf = mddev->private;
2189 int d;
2190 struct bio *wbio, *wbio2;
2191
2192 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2193 fix_recovery_read_error(r10_bio);
2194 end_sync_request(r10_bio);
2195 return;
2196 }
2197
2198
2199
2200
2201
2202 d = r10_bio->devs[1].devnum;
2203 wbio = r10_bio->devs[1].bio;
2204 wbio2 = r10_bio->devs[1].repl_bio;
2205 if (wbio->bi_end_io) {
2206 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2207 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
2208 generic_make_request(wbio);
2209 }
2210 if (wbio2 && wbio2->bi_end_io) {
2211 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2212 md_sync_acct(conf->mirrors[d].replacement->bdev,
2213 wbio2->bi_size >> 9);
2214 generic_make_request(wbio2);
2215 }
2216}
2217
2218
2219
2220
2221
2222
2223
2224
2225static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2226{
2227 struct timespec cur_time_mon;
2228 unsigned long hours_since_last;
2229 unsigned int read_errors = atomic_read(&rdev->read_errors);
2230
2231 ktime_get_ts(&cur_time_mon);
2232
2233 if (rdev->last_read_error.tv_sec == 0 &&
2234 rdev->last_read_error.tv_nsec == 0) {
2235
2236 rdev->last_read_error = cur_time_mon;
2237 return;
2238 }
2239
2240 hours_since_last = (cur_time_mon.tv_sec -
2241 rdev->last_read_error.tv_sec) / 3600;
2242
2243 rdev->last_read_error = cur_time_mon;
2244
2245
2246
2247
2248
2249
2250 if (hours_since_last >= 8 * sizeof(read_errors))
2251 atomic_set(&rdev->read_errors, 0);
2252 else
2253 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2254}
2255
2256static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2257 int sectors, struct page *page, int rw)
2258{
2259 sector_t first_bad;
2260 int bad_sectors;
2261
2262 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2263 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2264 return -1;
2265 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
2266
2267 return 1;
2268 if (rw == WRITE) {
2269 set_bit(WriteErrorSeen, &rdev->flags);
2270 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2271 set_bit(MD_RECOVERY_NEEDED,
2272 &rdev->mddev->recovery);
2273 }
2274
2275 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2276 md_error(rdev->mddev, rdev);
2277 return 0;
2278}
2279
2280
2281
2282
2283
2284
2285
2286
2287
/*
 * Try to repair a read error on the device at r10_bio->read_slot.
 *
 * After ageing and bumping the device's read_errors counter (failing the
 * device outright when it exceeds mddev->max_corr_read_errors), the
 * failed range is processed one page at a time:
 *
 *  1. find any In_sync copy that can be read into conf->tmppage,
 *     starting at read_slot and walking forward through the slots;
 *  2. walking backwards from that slot to read_slot, write the good data
 *     to each In_sync device visited;
 *  3. walking the same slots again, read the data back and log whether
 *     the correction worked.
 *
 * If no copy can be read, the range is recorded as bad on the original
 * device (or the device is failed when that cannot be recorded) and the
 * repair stops.
 *
 * conf->mirrors[] is read under rcu_read_lock(); a reference is taken via
 * nr_pending and the RCU lock is dropped around each synchronous I/O.
 */
static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
{
	int sect = 0; /* Offset from r10_bio->sector */
	int sectors = r10_bio->sectors;
	struct md_rdev*rdev;
	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
	int d = r10_bio->devs[r10_bio->read_slot].devnum;

	/*
	 * NOTE(review): this read of ->rdev is not under rcu_read_lock();
	 * the caller visible in this file (handle_read_error) still holds a
	 * pending reference on the device at this point.
	 */
	rdev = conf->mirrors[d].rdev;

	if (test_bit(Faulty, &rdev->flags))
		/* drive has already been failed, just ignore any
		   more fix_read_error() attempts */
		return;

	check_decay_read_errors(mddev, rdev);
	atomic_inc(&rdev->read_errors);
	if (atomic_read(&rdev->read_errors) > max_read_errors) {
		char b[BDEVNAME_SIZE];
		bdevname(rdev->bdev, b);

		printk(KERN_NOTICE
		       "md/raid10:%s: %s: Raid device exceeded "
		       "read_error threshold [cur %d:max %d]\n",
		       mdname(mddev), b,
		       atomic_read(&rdev->read_errors), max_read_errors);
		printk(KERN_NOTICE
		       "md/raid10:%s: %s: Failing raid device\n",
		       mdname(mddev), b);
		md_error(mddev, conf->mirrors[d].rdev);
		/* mark the slot so this device is not chosen for the retry */
		r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
		return;
	}

	while(sectors) {
		int s = sectors;
		int sl = r10_bio->read_slot;
		int success = 0;
		int start;

		/* at most one page per iteration: tmppage is a single page */
		if (s > (PAGE_SIZE>>9))
			s = PAGE_SIZE >> 9;

		/* Step 1: find a copy that can be read. */
		rcu_read_lock();
		do {
			sector_t first_bad;
			int bad_sectors;

			d = r10_bio->devs[sl].devnum;
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (rdev &&
			    !test_bit(Unmerged, &rdev->flags) &&
			    test_bit(In_sync, &rdev->flags) &&
			    is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
					&first_bad, &bad_sectors) == 0) {
				/* pin the device, then drop RCU for the I/O */
				atomic_inc(&rdev->nr_pending);
				rcu_read_unlock();
				success = sync_page_io(rdev,
						       r10_bio->devs[sl].addr +
						       sect,
						       s<<9,
						       conf->tmppage, READ, false);
				rdev_dec_pending(rdev, mddev);
				rcu_read_lock();
				if (success)
					break;
			}
			sl++;
			if (sl == conf->copies)
				sl = 0;
		} while (!success && sl != r10_bio->read_slot);
		rcu_read_unlock();

		if (!success) {
			/* Cannot read from anywhere, just mark the block
			 * as bad on the first device to discourage future
			 * reads.
			 */
			int dn = r10_bio->devs[r10_bio->read_slot].devnum;
			rdev = conf->mirrors[dn].rdev;

			if (!rdev_set_badblocks(
				    rdev,
				    r10_bio->devs[r10_bio->read_slot].addr
				    + sect,
				    s, 0)) {
				md_error(mddev, rdev);
				r10_bio->devs[r10_bio->read_slot].bio
					= IO_BLOCKED;
			}
			break;
		}

		/* sl is the slot that was read successfully */
		start = sl;
		/* Step 2: write it back and re-read */
		rcu_read_lock();
		while (sl != r10_bio->read_slot) {
			char b[BDEVNAME_SIZE];

			/* step backwards, wrapping at slot 0 */
			if (sl==0)
				sl = conf->copies;
			sl--;
			d = r10_bio->devs[sl].devnum;
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (!rdev ||
			    test_bit(Unmerged, &rdev->flags) ||
			    !test_bit(In_sync, &rdev->flags))
				continue;

			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			if (r10_sync_page_io(rdev,
					     r10_bio->devs[sl].addr +
					     sect,
					     s, conf->tmppage, WRITE)
			    == 0) {
				/* Well, this device is dead */
				printk(KERN_NOTICE
				       "md/raid10:%s: read correction "
				       "write failed"
				       " (%d sectors at %llu on %s)\n",
				       mdname(mddev), s,
				       (unsigned long long)(
					       sect +
					       choose_data_offset(r10_bio,
								  rdev)),
				       bdevname(rdev->bdev, b));
				printk(KERN_NOTICE "md/raid10:%s: %s: failing "
				       "drive\n",
				       mdname(mddev),
				       bdevname(rdev->bdev, b));
			}
			rdev_dec_pending(rdev, mddev);
			rcu_read_lock();
		}
		/* Step 3: read back from the same set of slots to verify. */
		sl = start;
		while (sl != r10_bio->read_slot) {
			char b[BDEVNAME_SIZE];

			if (sl==0)
				sl = conf->copies;
			sl--;
			d = r10_bio->devs[sl].devnum;
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (!rdev ||
			    !test_bit(In_sync, &rdev->flags))
				continue;

			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			/* a return of -1 (known bad block) is silently skipped */
			switch (r10_sync_page_io(rdev,
					     r10_bio->devs[sl].addr +
					     sect,
					     s, conf->tmppage,
						 READ)) {
			case 0:
				/* Well, this device is dead */
				printk(KERN_NOTICE
				       "md/raid10:%s: unable to read back "
				       "corrected sectors"
				       " (%d sectors at %llu on %s)\n",
				       mdname(mddev), s,
				       (unsigned long long)(
					       sect +
					       choose_data_offset(r10_bio, rdev)),
				       bdevname(rdev->bdev, b));
				printk(KERN_NOTICE "md/raid10:%s: %s: failing "
				       "drive\n",
				       mdname(mddev),
				       bdevname(rdev->bdev, b));
				break;
			case 1:
				printk(KERN_INFO
				       "md/raid10:%s: read error corrected"
				       " (%d sectors at %llu on %s)\n",
				       mdname(mddev), s,
				       (unsigned long long)(
					       sect +
					       choose_data_offset(r10_bio, rdev)),
				       bdevname(rdev->bdev, b));
				atomic_add(s, &rdev->corrected_errors);
			}

			rdev_dec_pending(rdev, mddev);
			rcu_read_lock();
		}
		rcu_read_unlock();

		sectors -= s;
		sect += s;
	}
}
2483
2484static void bi_complete(struct bio *bio, int error)
2485{
2486 complete((struct completion *)bio->bi_private);
2487}
2488
2489static int submit_bio_wait(int rw, struct bio *bio)
2490{
2491 struct completion event;
2492 rw |= REQ_SYNC;
2493
2494 init_completion(&event);
2495 bio->bi_private = &event;
2496 bio->bi_end_io = bi_complete;
2497 submit_bio(rw, bio);
2498 wait_for_completion(&event);
2499
2500 return test_bit(BIO_UPTODATE, &bio->bi_flags);
2501}
2502
2503static int narrow_write_error(struct r10bio *r10_bio, int i)
2504{
2505 struct bio *bio = r10_bio->master_bio;
2506 struct mddev *mddev = r10_bio->mddev;
2507 struct r10conf *conf = mddev->private;
2508 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520 int block_sectors;
2521 sector_t sector;
2522 int sectors;
2523 int sect_to_write = r10_bio->sectors;
2524 int ok = 1;
2525
2526 if (rdev->badblocks.shift < 0)
2527 return 0;
2528
2529 block_sectors = 1 << rdev->badblocks.shift;
2530 sector = r10_bio->sector;
2531 sectors = ((r10_bio->sector + block_sectors)
2532 & ~(sector_t)(block_sectors - 1))
2533 - sector;
2534
2535 while (sect_to_write) {
2536 struct bio *wbio;
2537 if (sectors > sect_to_write)
2538 sectors = sect_to_write;
2539
2540 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2541 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2542 wbio->bi_sector = (r10_bio->devs[i].addr+
2543 choose_data_offset(r10_bio, rdev) +
2544 (sector - r10_bio->sector));
2545 wbio->bi_bdev = rdev->bdev;
2546 if (submit_bio_wait(WRITE, wbio) == 0)
2547
2548 ok = rdev_set_badblocks(rdev, sector,
2549 sectors, 0)
2550 && ok;
2551
2552 bio_put(wbio);
2553 sect_to_write -= sectors;
2554 sector += sectors;
2555 sectors = block_sectors;
2556 }
2557 return ok;
2558}
2559
/*
 * Recover from a failed read described by @r10_bio.
 *
 * The failed bio is released; on a read-write array the array is frozen
 * while fix_read_error() attempts a repair, on a read-only array the
 * slot is just marked IO_BLOCKED.  The read is then re-issued to
 * whichever device read_balance() picks.  If that device can only serve
 * part of the request, the remainder is carried in a freshly allocated
 * r10bio and the selection is repeated ('read_more').
 *
 * If read_balance() finds no device, the error is logged and the request
 * is finished with raid_end_bio_io().
 */
static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
{
	int slot = r10_bio->read_slot;
	struct bio *bio;
	struct r10conf *conf = mddev->private;
	struct md_rdev *rdev = r10_bio->devs[slot].rdev;
	char b[BDEVNAME_SIZE];
	unsigned long do_sync;
	int max_sectors;

	/*
	 * Capture the device name before the bio is dropped; 'b' is used
	 * in the "unrecoverable" message below.
	 */
	bio = r10_bio->devs[slot].bio;
	bdevname(bio->bi_bdev, b);
	bio_put(bio);
	r10_bio->devs[slot].bio = NULL;

	if (mddev->ro == 0) {
		freeze_array(conf);
		fix_read_error(conf, mddev, r10_bio);
		unfreeze_array(conf);
	} else
		r10_bio->devs[slot].bio = IO_BLOCKED;

	/* the reference held for the failed read is only released now */
	rdev_dec_pending(rdev, mddev);

read_more:
	rdev = read_balance(conf, r10_bio, &max_sectors);
	if (rdev == NULL) {
		printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
		       " read error for block %llu\n",
		       mdname(mddev), b,
		       (unsigned long long)r10_bio->sector);
		raid_end_bio_io(r10_bio);
		return;
	}

	do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
	/* the slot index is re-read here after read_balance() has run */
	slot = r10_bio->read_slot;
	printk_ratelimited(
		KERN_ERR
		"md/raid10:%s: %s: redirecting "
		"sector %llu to another mirror\n",
		mdname(mddev),
		bdevname(rdev->bdev, b),
		(unsigned long long)r10_bio->sector);
	bio = bio_clone_mddev(r10_bio->master_bio,
			      GFP_NOIO, mddev);
	md_trim_bio(bio,
		    r10_bio->sector - bio->bi_sector,
		    max_sectors);
	r10_bio->devs[slot].bio = bio;
	r10_bio->devs[slot].rdev = rdev;
	bio->bi_sector = r10_bio->devs[slot].addr
		+ choose_data_offset(r10_bio, rdev);
	bio->bi_bdev = rdev->bdev;
	bio->bi_rw = READ | do_sync;
	bio->bi_private = r10_bio;
	bio->bi_end_io = raid10_end_read_request;
	if (max_sectors < r10_bio->sectors) {
		/* Drat - have to split this up more */
		struct bio *mbio = r10_bio->master_bio;
		int sectors_handled =
			r10_bio->sector + max_sectors
			- mbio->bi_sector;
		r10_bio->sectors = max_sectors;
		/*
		 * bi_phys_segments of the master bio is used as a counter
		 * here: 0 becomes 2 on the first split, then it is
		 * incremented for each further one.
		 */
		spin_lock_irq(&conf->device_lock);
		if (mbio->bi_phys_segments == 0)
			mbio->bi_phys_segments = 2;
		else
			mbio->bi_phys_segments++;
		spin_unlock_irq(&conf->device_lock);
		generic_make_request(bio);

		/* new r10bio describing the part not yet handled */
		r10_bio = mempool_alloc(conf->r10bio_pool,
					GFP_NOIO);
		r10_bio->master_bio = mbio;
		r10_bio->sectors = (mbio->bi_size >> 9)
			- sectors_handled;
		r10_bio->state = 0;
		set_bit(R10BIO_ReadError,
			&r10_bio->state);
		r10_bio->mddev = mddev;
		r10_bio->sector = mbio->bi_sector
			+ sectors_handled;

		goto read_more;
	} else
		generic_make_request(bio);
}
2656
/*
 * Post-process an r10bio whose writes have finished and which was
 * flagged R10BIO_MadeGood or R10BIO_WriteError (see the dispatch in
 * raid10d()).
 *
 * Resync/recovery requests: for every slot with a bio, a successful
 * write clears the bad-block range on that device and a failed write
 * records it (failing the device if it cannot be recorded); the same is
 * then done for the replacement bio.  The r10bio is released with
 * put_buf().
 *
 * Normal writes: slots marked IO_MADE_GOOD have their bad-block range
 * cleared; slots whose bio is not up to date are retried in small pieces
 * by narrow_write_error(), and the device is failed if that returns 0.
 * The request is then completed with raid_end_bio_io().
 */
static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
{
	int m;
	struct md_rdev *rdev;

	if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
	    test_bit(R10BIO_IsRecover, &r10_bio->state)) {
		for (m = 0; m < conf->copies; m++) {
			int dev = r10_bio->devs[m].devnum;
			rdev = conf->mirrors[dev].rdev;
			/* note: a NULL primary bio also skips the replacement */
			if (r10_bio->devs[m].bio == NULL)
				continue;
			if (test_bit(BIO_UPTODATE,
				     &r10_bio->devs[m].bio->bi_flags)) {
				rdev_clear_badblocks(
					rdev,
					r10_bio->devs[m].addr,
					r10_bio->sectors, 0);
			} else {
				if (!rdev_set_badblocks(
					    rdev,
					    r10_bio->devs[m].addr,
					    r10_bio->sectors, 0))
					md_error(conf->mddev, rdev);
			}
			/* same treatment for the replacement device */
			rdev = conf->mirrors[dev].replacement;
			if (r10_bio->devs[m].repl_bio == NULL)
				continue;
			if (test_bit(BIO_UPTODATE,
				     &r10_bio->devs[m].repl_bio->bi_flags)) {
				rdev_clear_badblocks(
					rdev,
					r10_bio->devs[m].addr,
					r10_bio->sectors, 0);
			} else {
				if (!rdev_set_badblocks(
					    rdev,
					    r10_bio->devs[m].addr,
					    r10_bio->sectors, 0))
					md_error(conf->mddev, rdev);
			}
		}
		put_buf(r10_bio);
	} else {
		for (m = 0; m < conf->copies; m++) {
			int dev = r10_bio->devs[m].devnum;
			struct bio *bio = r10_bio->devs[m].bio;
			rdev = conf->mirrors[dev].rdev;
			if (bio == IO_MADE_GOOD) {
				rdev_clear_badblocks(
					rdev,
					r10_bio->devs[m].addr,
					r10_bio->sectors, 0);
				/*
				 * NOTE(review): presumably the write
				 * completion path left this reference held
				 * for us - confirm in
				 * raid10_end_write_request().
				 */
				rdev_dec_pending(rdev, conf->mddev);
			} else if (bio != NULL &&
				   !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
				if (!narrow_write_error(r10_bio, m)) {
					md_error(conf->mddev, rdev);
					set_bit(R10BIO_Degraded,
						&r10_bio->state);
				}
				rdev_dec_pending(rdev, conf->mddev);
			}
			bio = r10_bio->devs[m].repl_bio;
			rdev = conf->mirrors[dev].replacement;
			/* the replacement may be gone: check rdev first */
			if (rdev && bio == IO_MADE_GOOD) {
				rdev_clear_badblocks(
					rdev,
					r10_bio->devs[m].addr,
					r10_bio->sectors, 0);
				rdev_dec_pending(rdev, conf->mddev);
			}
		}
		if (test_bit(R10BIO_WriteError,
			     &r10_bio->state))
			close_write(r10_bio);
		raid_end_bio_io(r10_bio);
	}
}
2742
/*
 * Body of the raid10 management thread.
 *
 * After an initial md_check_recovery(), it repeatedly flushes pending
 * writes and takes the entry at the tail (head->prev) of
 * conf->retry_list, dispatching it according to the state bits of the
 * r10bio, until the list is empty.  All I/O submitted from the loop is
 * issued under a single block plug.
 */
static void raid10d(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct r10bio *r10_bio;
	unsigned long flags;
	struct r10conf *conf = mddev->private;
	struct list_head *head = &conf->retry_list;
	struct blk_plug plug;

	md_check_recovery(mddev);

	blk_start_plug(&plug);
	for (;;) {

		flush_pending_writes(conf);

		/* retry_list and nr_queued are protected by device_lock */
		spin_lock_irqsave(&conf->device_lock, flags);
		if (list_empty(head)) {
			spin_unlock_irqrestore(&conf->device_lock, flags);
			break;
		}
		r10_bio = list_entry(head->prev, struct r10bio, retry_list);
		list_del(head->prev);
		conf->nr_queued--;
		spin_unlock_irqrestore(&conf->device_lock, flags);

		mddev = r10_bio->mddev;
		conf = mddev->private;
		/*
		 * The order of these tests matters: write post-processing
		 * takes priority over the sync/recover/reshape handlers,
		 * which in turn take priority over read-error handling.
		 */
		if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
		    test_bit(R10BIO_WriteError, &r10_bio->state))
			handle_write_completed(conf, r10_bio);
		else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
			reshape_request_write(mddev, r10_bio);
		else if (test_bit(R10BIO_IsSync, &r10_bio->state))
			sync_request_write(mddev, r10_bio);
		else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
			recovery_request_write(mddev, r10_bio);
		else if (test_bit(R10BIO_ReadError, &r10_bio->state))
			handle_read_error(mddev, r10_bio);
		else {
			/*
			 * None of the special states: simply (re)submit the
			 * bio stored at the r10bio's read slot.
			 */
			int slot = r10_bio->read_slot;
			generic_make_request(r10_bio->devs[slot].bio);
		}

		cond_resched();
		/* any flag other than MD_CHANGE_PENDING needs attention */
		if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
			md_check_recovery(mddev);
	}
	blk_finish_plug(&plug);
}
2796
2797
2798static int init_resync(struct r10conf *conf)
2799{
2800 int buffs;
2801 int i;
2802
2803 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2804 BUG_ON(conf->r10buf_pool);
2805 conf->have_replacement = 0;
2806 for (i = 0; i < conf->geo.raid_disks; i++)
2807 if (conf->mirrors[i].replacement)
2808 conf->have_replacement = 1;
2809 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2810 if (!conf->r10buf_pool)
2811 return -ENOMEM;
2812 conf->next_resync = 0;
2813 return 0;
2814}
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2849 int *skipped, int go_faster)
2850{
2851 struct r10conf *conf = mddev->private;
2852 struct r10bio *r10_bio;
2853 struct bio *biolist = NULL, *bio;
2854 sector_t max_sector, nr_sectors;
2855 int i;
2856 int max_sync;
2857 sector_t sync_blocks;
2858 sector_t sectors_skipped = 0;
2859 int chunks_skipped = 0;
2860 sector_t chunk_mask = conf->geo.chunk_mask;
2861
2862 if (!conf->r10buf_pool)
2863 if (init_resync(conf))
2864 return 0;
2865
2866 skipped:
2867 max_sector = mddev->dev_sectors;
2868 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2869 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2870 max_sector = mddev->resync_max_sectors;
2871 if (sector_nr >= max_sector) {
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2882 end_reshape(conf);
2883 return 0;
2884 }
2885
2886 if (mddev->curr_resync < max_sector) {
2887 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2888 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2889 &sync_blocks, 1);
2890 else for (i = 0; i < conf->geo.raid_disks; i++) {
2891 sector_t sect =
2892 raid10_find_virt(conf, mddev->curr_resync, i);
2893 bitmap_end_sync(mddev->bitmap, sect,
2894 &sync_blocks, 1);
2895 }
2896 } else {
2897
2898 if ((!mddev->bitmap || conf->fullsync)
2899 && conf->have_replacement
2900 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2901
2902
2903
2904 for (i = 0; i < conf->geo.raid_disks; i++)
2905 if (conf->mirrors[i].replacement)
2906 conf->mirrors[i].replacement
2907 ->recovery_offset
2908 = MaxSector;
2909 }
2910 conf->fullsync = 0;
2911 }
2912 bitmap_close_sync(mddev->bitmap);
2913 close_sync(conf);
2914 *skipped = 1;
2915 return sectors_skipped;
2916 }
2917
2918 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2919 return reshape_request(mddev, sector_nr, skipped);
2920
2921 if (chunks_skipped >= conf->geo.raid_disks) {
2922
2923
2924
2925 *skipped = 1;
2926 return (max_sector - sector_nr) + sectors_skipped;
2927 }
2928
2929 if (max_sector > mddev->resync_max)
2930 max_sector = mddev->resync_max;
2931
2932
2933
2934
2935 if (conf->geo.near_copies < conf->geo.raid_disks &&
2936 max_sector > (sector_nr | chunk_mask))
2937 max_sector = (sector_nr | chunk_mask) + 1;
2938
2939
2940
2941
2942 if (!go_faster && conf->nr_waiting)
2943 msleep_interruptible(1000);
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
2961 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2962
2963 int j;
2964 r10_bio = NULL;
2965
2966 for (i = 0 ; i < conf->geo.raid_disks; i++) {
2967 int still_degraded;
2968 struct r10bio *rb2;
2969 sector_t sect;
2970 int must_sync;
2971 int any_working;
2972 struct raid10_info *mirror = &conf->mirrors[i];
2973
2974 if ((mirror->rdev == NULL ||
2975 test_bit(In_sync, &mirror->rdev->flags))
2976 &&
2977 (mirror->replacement == NULL ||
2978 test_bit(Faulty,
2979 &mirror->replacement->flags)))
2980 continue;
2981
2982 still_degraded = 0;
2983
2984 rb2 = r10_bio;
2985 sect = raid10_find_virt(conf, sector_nr, i);
2986 if (sect >= mddev->resync_max_sectors) {
2987
2988
2989
2990 continue;
2991 }
2992
2993
2994
2995
2996 must_sync = bitmap_start_sync(mddev->bitmap, sect,
2997 &sync_blocks, 1);
2998 if (sync_blocks < max_sync)
2999 max_sync = sync_blocks;
3000 if (!must_sync &&
3001 mirror->replacement == NULL &&
3002 !conf->fullsync) {
3003
3004
3005
3006 chunks_skipped = -1;
3007 continue;
3008 }
3009
3010 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3011 raise_barrier(conf, rb2 != NULL);
3012 atomic_set(&r10_bio->remaining, 0);
3013
3014 r10_bio->master_bio = (struct bio*)rb2;
3015 if (rb2)
3016 atomic_inc(&rb2->remaining);
3017 r10_bio->mddev = mddev;
3018 set_bit(R10BIO_IsRecover, &r10_bio->state);
3019 r10_bio->sector = sect;
3020
3021 raid10_find_phys(conf, r10_bio);
3022
3023
3024
3025
3026 for (j = 0; j < conf->geo.raid_disks; j++)
3027 if (conf->mirrors[j].rdev == NULL ||
3028 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
3029 still_degraded = 1;
3030 break;
3031 }
3032
3033 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3034 &sync_blocks, still_degraded);
3035
3036 any_working = 0;
3037 for (j=0; j<conf->copies;j++) {
3038 int k;
3039 int d = r10_bio->devs[j].devnum;
3040 sector_t from_addr, to_addr;
3041 struct md_rdev *rdev;
3042 sector_t sector, first_bad;
3043 int bad_sectors;
3044 if (!conf->mirrors[d].rdev ||
3045 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
3046 continue;
3047
3048 any_working = 1;
3049 rdev = conf->mirrors[d].rdev;
3050 sector = r10_bio->devs[j].addr;
3051
3052 if (is_badblock(rdev, sector, max_sync,
3053 &first_bad, &bad_sectors)) {
3054 if (first_bad > sector)
3055 max_sync = first_bad - sector;
3056 else {
3057 bad_sectors -= (sector
3058 - first_bad);
3059 if (max_sync > bad_sectors)
3060 max_sync = bad_sectors;
3061 continue;
3062 }
3063 }
3064 bio = r10_bio->devs[0].bio;
3065 bio->bi_next = biolist;
3066 biolist = bio;
3067 bio->bi_private = r10_bio;
3068 bio->bi_end_io = end_sync_read;
3069 bio->bi_rw = READ;
3070 from_addr = r10_bio->devs[j].addr;
3071 bio->bi_sector = from_addr + rdev->data_offset;
3072 bio->bi_bdev = rdev->bdev;
3073 atomic_inc(&rdev->nr_pending);
3074
3075
3076 for (k=0; k<conf->copies; k++)
3077 if (r10_bio->devs[k].devnum == i)
3078 break;
3079 BUG_ON(k == conf->copies);
3080 to_addr = r10_bio->devs[k].addr;
3081 r10_bio->devs[0].devnum = d;
3082 r10_bio->devs[0].addr = from_addr;
3083 r10_bio->devs[1].devnum = i;
3084 r10_bio->devs[1].addr = to_addr;
3085
3086 rdev = mirror->rdev;
3087 if (!test_bit(In_sync, &rdev->flags)) {
3088 bio = r10_bio->devs[1].bio;
3089 bio->bi_next = biolist;
3090 biolist = bio;
3091 bio->bi_private = r10_bio;
3092 bio->bi_end_io = end_sync_write;
3093 bio->bi_rw = WRITE;
3094 bio->bi_sector = to_addr
3095 + rdev->data_offset;
3096 bio->bi_bdev = rdev->bdev;
3097 atomic_inc(&r10_bio->remaining);
3098 } else
3099 r10_bio->devs[1].bio->bi_end_io = NULL;
3100
3101
3102 bio = r10_bio->devs[1].repl_bio;
3103 if (bio)
3104 bio->bi_end_io = NULL;
3105 rdev = mirror->replacement;
3106
3107
3108
3109
3110
3111
3112
3113
3114 if (rdev == NULL || bio == NULL ||
3115 test_bit(Faulty, &rdev->flags))
3116 break;
3117 bio->bi_next = biolist;
3118 biolist = bio;
3119 bio->bi_private = r10_bio;
3120 bio->bi_end_io = end_sync_write;
3121 bio->bi_rw = WRITE;
3122 bio->bi_sector = to_addr + rdev->data_offset;
3123 bio->bi_bdev = rdev->bdev;
3124 atomic_inc(&r10_bio->remaining);
3125 break;
3126 }
3127 if (j == conf->copies) {
3128
3129
3130 put_buf(r10_bio);
3131 if (rb2)
3132 atomic_dec(&rb2->remaining);
3133 r10_bio = rb2;
3134 if (any_working) {
3135
3136
3137
3138 int k;
3139 for (k = 0; k < conf->copies; k++)
3140 if (r10_bio->devs[k].devnum == i)
3141 break;
3142 if (!test_bit(In_sync,
3143 &mirror->rdev->flags)
3144 && !rdev_set_badblocks(
3145 mirror->rdev,
3146 r10_bio->devs[k].addr,
3147 max_sync, 0))
3148 any_working = 0;
3149 if (mirror->replacement &&
3150 !rdev_set_badblocks(
3151 mirror->replacement,
3152 r10_bio->devs[k].addr,
3153 max_sync, 0))
3154 any_working = 0;
3155 }
3156 if (!any_working) {
3157 if (!test_and_set_bit(MD_RECOVERY_INTR,
3158 &mddev->recovery))
3159 printk(KERN_INFO "md/raid10:%s: insufficient "
3160 "working devices for recovery.\n",
3161 mdname(mddev));
3162 mirror->recovery_disabled
3163 = mddev->recovery_disabled;
3164 }
3165 break;
3166 }
3167 }
3168 if (biolist == NULL) {
3169 while (r10_bio) {
3170 struct r10bio *rb2 = r10_bio;
3171 r10_bio = (struct r10bio*) rb2->master_bio;
3172 rb2->master_bio = NULL;
3173 put_buf(rb2);
3174 }
3175 goto giveup;
3176 }
3177 } else {
3178
3179 int count = 0;
3180
3181 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3182
3183 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3184 &sync_blocks, mddev->degraded) &&
3185 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3186 &mddev->recovery)) {
3187
3188 *skipped = 1;
3189 return sync_blocks + sectors_skipped;
3190 }
3191 if (sync_blocks < max_sync)
3192 max_sync = sync_blocks;
3193 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3194
3195 r10_bio->mddev = mddev;
3196 atomic_set(&r10_bio->remaining, 0);
3197 raise_barrier(conf, 0);
3198 conf->next_resync = sector_nr;
3199
3200 r10_bio->master_bio = NULL;
3201 r10_bio->sector = sector_nr;
3202 set_bit(R10BIO_IsSync, &r10_bio->state);
3203 raid10_find_phys(conf, r10_bio);
3204 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3205
3206 for (i = 0; i < conf->copies; i++) {
3207 int d = r10_bio->devs[i].devnum;
3208 sector_t first_bad, sector;
3209 int bad_sectors;
3210
3211 if (r10_bio->devs[i].repl_bio)
3212 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3213
3214 bio = r10_bio->devs[i].bio;
3215 bio->bi_end_io = NULL;
3216 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3217 if (conf->mirrors[d].rdev == NULL ||
3218 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
3219 continue;
3220 sector = r10_bio->devs[i].addr;
3221 if (is_badblock(conf->mirrors[d].rdev,
3222 sector, max_sync,
3223 &first_bad, &bad_sectors)) {
3224 if (first_bad > sector)
3225 max_sync = first_bad - sector;
3226 else {
3227 bad_sectors -= (sector - first_bad);
3228 if (max_sync > bad_sectors)
3229 max_sync = bad_sectors;
3230 continue;
3231 }
3232 }
3233 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3234 atomic_inc(&r10_bio->remaining);
3235 bio->bi_next = biolist;
3236 biolist = bio;
3237 bio->bi_private = r10_bio;
3238 bio->bi_end_io = end_sync_read;
3239 bio->bi_rw = READ;
3240 bio->bi_sector = sector +
3241 conf->mirrors[d].rdev->data_offset;
3242 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3243 count++;
3244
3245 if (conf->mirrors[d].replacement == NULL ||
3246 test_bit(Faulty,
3247 &conf->mirrors[d].replacement->flags))
3248 continue;
3249
3250
3251 bio = r10_bio->devs[i].repl_bio;
3252 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3253
3254 sector = r10_bio->devs[i].addr;
3255 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3256 bio->bi_next = biolist;
3257 biolist = bio;
3258 bio->bi_private = r10_bio;
3259 bio->bi_end_io = end_sync_write;
3260 bio->bi_rw = WRITE;
3261 bio->bi_sector = sector +
3262 conf->mirrors[d].replacement->data_offset;
3263 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
3264 count++;
3265 }
3266
3267 if (count < 2) {
3268 for (i=0; i<conf->copies; i++) {
3269 int d = r10_bio->devs[i].devnum;
3270 if (r10_bio->devs[i].bio->bi_end_io)
3271 rdev_dec_pending(conf->mirrors[d].rdev,
3272 mddev);
3273 if (r10_bio->devs[i].repl_bio &&
3274 r10_bio->devs[i].repl_bio->bi_end_io)
3275 rdev_dec_pending(
3276 conf->mirrors[d].replacement,
3277 mddev);
3278 }
3279 put_buf(r10_bio);
3280 biolist = NULL;
3281 goto giveup;
3282 }
3283 }
3284
3285 for (bio = biolist; bio ; bio=bio->bi_next) {
3286
3287 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
3288 if (bio->bi_end_io)
3289 bio->bi_flags |= 1 << BIO_UPTODATE;
3290 bio->bi_vcnt = 0;
3291 bio->bi_idx = 0;
3292 bio->bi_phys_segments = 0;
3293 bio->bi_size = 0;
3294 }
3295
3296 nr_sectors = 0;
3297 if (sector_nr + max_sync < max_sector)
3298 max_sector = sector_nr + max_sync;
3299 do {
3300 struct page *page;
3301 int len = PAGE_SIZE;
3302 if (sector_nr + (len>>9) > max_sector)
3303 len = (max_sector - sector_nr) << 9;
3304 if (len == 0)
3305 break;
3306 for (bio= biolist ; bio ; bio=bio->bi_next) {
3307 struct bio *bio2;
3308 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
3309 if (bio_add_page(bio, page, len, 0))
3310 continue;
3311
3312
3313 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
3314 for (bio2 = biolist;
3315 bio2 && bio2 != bio;
3316 bio2 = bio2->bi_next) {
3317
3318 bio2->bi_vcnt--;
3319 bio2->bi_size -= len;
3320 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
3321 }
3322 goto bio_full;
3323 }
3324 nr_sectors += len>>9;
3325 sector_nr += len>>9;
3326 } while (biolist->bi_vcnt < RESYNC_PAGES);
3327 bio_full:
3328 r10_bio->sectors = nr_sectors;
3329
3330 while (biolist) {
3331 bio = biolist;
3332 biolist = biolist->bi_next;
3333
3334 bio->bi_next = NULL;
3335 r10_bio = bio->bi_private;
3336 r10_bio->sectors = nr_sectors;
3337
3338 if (bio->bi_end_io == end_sync_read) {
3339 md_sync_acct(bio->bi_bdev, nr_sectors);
3340 generic_make_request(bio);
3341 }
3342 }
3343
3344 if (sectors_skipped)
3345
3346
3347
3348 md_done_sync(mddev, sectors_skipped, 1);
3349
3350 return sectors_skipped + nr_sectors;
3351 giveup:
3352
3353
3354
3355
3356 if (sector_nr + max_sync < max_sector)
3357 max_sector = sector_nr + max_sync;
3358
3359 sectors_skipped += (max_sector - sector_nr);
3360 chunks_skipped ++;
3361 sector_nr = max_sector;
3362 goto skipped;
3363}
3364
3365static sector_t
3366raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3367{
3368 sector_t size;
3369 struct r10conf *conf = mddev->private;
3370
3371 if (!raid_disks)
3372 raid_disks = min(conf->geo.raid_disks,
3373 conf->prev.raid_disks);
3374 if (!sectors)
3375 sectors = conf->dev_sectors;
3376
3377 size = sectors >> conf->geo.chunk_shift;
3378 sector_div(size, conf->geo.far_copies);
3379 size = size * raid_disks;
3380 sector_div(size, conf->geo.near_copies);
3381
3382 return size << conf->geo.chunk_shift;
3383}
3384
3385static void calc_sectors(struct r10conf *conf, sector_t size)
3386{
3387
3388
3389
3390
3391
3392 size = size >> conf->geo.chunk_shift;
3393 sector_div(size, conf->geo.far_copies);
3394 size = size * conf->geo.raid_disks;
3395 sector_div(size, conf->geo.near_copies);
3396
3397
3398 size = size * conf->copies;
3399
3400
3401
3402
3403 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3404
3405 conf->dev_sectors = size << conf->geo.chunk_shift;
3406
3407 if (conf->geo.far_offset)
3408 conf->geo.stride = 1 << conf->geo.chunk_shift;
3409 else {
3410 sector_div(size, conf->geo.far_copies);
3411 conf->geo.stride = size << conf->geo.chunk_shift;
3412 }
3413}
3414
3415enum geo_type {geo_new, geo_old, geo_start};
3416static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3417{
3418 int nc, fc, fo;
3419 int layout, chunk, disks;
3420 switch (new) {
3421 case geo_old:
3422 layout = mddev->layout;
3423 chunk = mddev->chunk_sectors;
3424 disks = mddev->raid_disks - mddev->delta_disks;
3425 break;
3426 case geo_new:
3427 layout = mddev->new_layout;
3428 chunk = mddev->new_chunk_sectors;
3429 disks = mddev->raid_disks;
3430 break;
3431 default:
3432 case geo_start:
3433
3434 layout = mddev->new_layout;
3435 chunk = mddev->new_chunk_sectors;
3436 disks = mddev->raid_disks + mddev->delta_disks;
3437 break;
3438 }
3439 if (layout >> 17)
3440 return -1;
3441 if (chunk < (PAGE_SIZE >> 9) ||
3442 !is_power_of_2(chunk))
3443 return -2;
3444 nc = layout & 255;
3445 fc = (layout >> 8) & 255;
3446 fo = layout & (1<<16);
3447 geo->raid_disks = disks;
3448 geo->near_copies = nc;
3449 geo->far_copies = fc;
3450 geo->far_offset = fo;
3451 geo->chunk_mask = chunk - 1;
3452 geo->chunk_shift = ffz(~chunk);
3453 return nc*fc;
3454}
3455
3456static struct r10conf *setup_conf(struct mddev *mddev)
3457{
3458 struct r10conf *conf = NULL;
3459 int err = -EINVAL;
3460 struct geom geo;
3461 int copies;
3462
3463 copies = setup_geo(&geo, mddev, geo_new);
3464
3465 if (copies == -2) {
3466 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3467 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3468 mdname(mddev), PAGE_SIZE);
3469 goto out;
3470 }
3471
3472 if (copies < 2 || copies > mddev->raid_disks) {
3473 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3474 mdname(mddev), mddev->new_layout);
3475 goto out;
3476 }
3477
3478 err = -ENOMEM;
3479 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3480 if (!conf)
3481 goto out;
3482
3483
3484 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3485 max(0,mddev->delta_disks)),
3486 GFP_KERNEL);
3487 if (!conf->mirrors)
3488 goto out;
3489
3490 conf->tmppage = alloc_page(GFP_KERNEL);
3491 if (!conf->tmppage)
3492 goto out;
3493
3494 conf->geo = geo;
3495 conf->copies = copies;
3496 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3497 r10bio_pool_free, conf);
3498 if (!conf->r10bio_pool)
3499 goto out;
3500
3501 calc_sectors(conf, mddev->dev_sectors);
3502 if (mddev->reshape_position == MaxSector) {
3503 conf->prev = conf->geo;
3504 conf->reshape_progress = MaxSector;
3505 } else {
3506 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3507 err = -EINVAL;
3508 goto out;
3509 }
3510 conf->reshape_progress = mddev->reshape_position;
3511 if (conf->prev.far_offset)
3512 conf->prev.stride = 1 << conf->prev.chunk_shift;
3513 else
3514
3515 conf->prev.stride = conf->dev_sectors;
3516 }
3517 spin_lock_init(&conf->device_lock);
3518 INIT_LIST_HEAD(&conf->retry_list);
3519
3520 spin_lock_init(&conf->resync_lock);
3521 init_waitqueue_head(&conf->wait_barrier);
3522
3523 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3524 if (!conf->thread)
3525 goto out;
3526
3527 conf->mddev = mddev;
3528 return conf;
3529
3530 out:
3531 if (err == -ENOMEM)
3532 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3533 mdname(mddev));
3534 if (conf) {
3535 if (conf->r10bio_pool)
3536 mempool_destroy(conf->r10bio_pool);
3537 kfree(conf->mirrors);
3538 safe_put_page(conf->tmppage);
3539 kfree(conf);
3540 }
3541 return ERR_PTR(err);
3542}
3543
3544static int run(struct mddev *mddev)
3545{
3546 struct r10conf *conf;
3547 int i, disk_idx, chunk_size;
3548 struct raid10_info *disk;
3549 struct md_rdev *rdev;
3550 sector_t size;
3551 sector_t min_offset_diff = 0;
3552 int first = 1;
3553 bool discard_supported = false;
3554
3555 if (mddev->private == NULL) {
3556 conf = setup_conf(mddev);
3557 if (IS_ERR(conf))
3558 return PTR_ERR(conf);
3559 mddev->private = conf;
3560 }
3561 conf = mddev->private;
3562 if (!conf)
3563 goto out;
3564
3565 mddev->thread = conf->thread;
3566 conf->thread = NULL;
3567
3568 chunk_size = mddev->chunk_sectors << 9;
3569 if (mddev->queue) {
3570 blk_queue_max_discard_sectors(mddev->queue,
3571 mddev->chunk_sectors);
3572 blk_queue_io_min(mddev->queue, chunk_size);
3573 if (conf->geo.raid_disks % conf->geo.near_copies)
3574 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3575 else
3576 blk_queue_io_opt(mddev->queue, chunk_size *
3577 (conf->geo.raid_disks / conf->geo.near_copies));
3578 }
3579
3580 rdev_for_each(rdev, mddev) {
3581 long long diff;
3582 struct request_queue *q;
3583
3584 disk_idx = rdev->raid_disk;
3585 if (disk_idx < 0)
3586 continue;
3587 if (disk_idx >= conf->geo.raid_disks &&
3588 disk_idx >= conf->prev.raid_disks)
3589 continue;
3590 disk = conf->mirrors + disk_idx;
3591
3592 if (test_bit(Replacement, &rdev->flags)) {
3593 if (disk->replacement)
3594 goto out_free_conf;
3595 disk->replacement = rdev;
3596 } else {
3597 if (disk->rdev)
3598 goto out_free_conf;
3599 disk->rdev = rdev;
3600 }
3601 q = bdev_get_queue(rdev->bdev);
3602 if (q->merge_bvec_fn)
3603 mddev->merge_check_needed = 1;
3604 diff = (rdev->new_data_offset - rdev->data_offset);
3605 if (!mddev->reshape_backwards)
3606 diff = -diff;
3607 if (diff < 0)
3608 diff = 0;
3609 if (first || diff < min_offset_diff)
3610 min_offset_diff = diff;
3611
3612 if (mddev->gendisk)
3613 disk_stack_limits(mddev->gendisk, rdev->bdev,
3614 rdev->data_offset << 9);
3615
3616 disk->head_position = 0;
3617
3618 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3619 discard_supported = true;
3620 }
3621
3622 if (mddev->queue) {
3623 if (discard_supported)
3624 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3625 mddev->queue);
3626 else
3627 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3628 mddev->queue);
3629 }
3630
3631 if (!enough(conf, -1)) {
3632 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
3633 mdname(mddev));
3634 goto out_free_conf;
3635 }
3636
3637 if (conf->reshape_progress != MaxSector) {
3638
3639 if (conf->geo.far_copies != 1 &&
3640 conf->geo.far_offset == 0)
3641 goto out_free_conf;
3642 if (conf->prev.far_copies != 1 &&
3643 conf->geo.far_offset == 0)
3644 goto out_free_conf;
3645 }
3646
3647 mddev->degraded = 0;
3648 for (i = 0;
3649 i < conf->geo.raid_disks
3650 || i < conf->prev.raid_disks;
3651 i++) {
3652
3653 disk = conf->mirrors + i;
3654
3655 if (!disk->rdev && disk->replacement) {
3656
3657 disk->rdev = disk->replacement;
3658 disk->replacement = NULL;
3659 clear_bit(Replacement, &disk->rdev->flags);
3660 }
3661
3662 if (!disk->rdev ||
3663 !test_bit(In_sync, &disk->rdev->flags)) {
3664 disk->head_position = 0;
3665 mddev->degraded++;
3666 if (disk->rdev)
3667 conf->fullsync = 1;
3668 }
3669 disk->recovery_disabled = mddev->recovery_disabled - 1;
3670 }
3671
3672 if (mddev->recovery_cp != MaxSector)
3673 printk(KERN_NOTICE "md/raid10:%s: not clean"
3674 " -- starting background reconstruction\n",
3675 mdname(mddev));
3676 printk(KERN_INFO
3677 "md/raid10:%s: active with %d out of %d devices\n",
3678 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3679 conf->geo.raid_disks);
3680
3681
3682
3683 mddev->dev_sectors = conf->dev_sectors;
3684 size = raid10_size(mddev, 0, 0);
3685 md_set_array_sectors(mddev, size);
3686 mddev->resync_max_sectors = size;
3687
3688 if (mddev->queue) {
3689 int stripe = conf->geo.raid_disks *
3690 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3691 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3692 mddev->queue->backing_dev_info.congested_data = mddev;
3693
3694
3695
3696
3697
3698 stripe /= conf->geo.near_copies;
3699 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3700 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3701 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3702 }
3703
3704
3705 if (md_integrity_register(mddev))
3706 goto out_free_conf;
3707
3708 if (conf->reshape_progress != MaxSector) {
3709 unsigned long before_length, after_length;
3710
3711 before_length = ((1 << conf->prev.chunk_shift) *
3712 conf->prev.far_copies);
3713 after_length = ((1 << conf->geo.chunk_shift) *
3714 conf->geo.far_copies);
3715
3716 if (max(before_length, after_length) > min_offset_diff) {
3717
3718 printk("md/raid10: offset difference not enough to continue reshape\n");
3719 goto out_free_conf;
3720 }
3721 conf->offset_diff = min_offset_diff;
3722
3723 conf->reshape_safe = conf->reshape_progress;
3724 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3725 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3726 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3727 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3728 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3729 "reshape");
3730 }
3731
3732 return 0;
3733
3734out_free_conf:
3735 md_unregister_thread(&mddev->thread);
3736 if (conf->r10bio_pool)
3737 mempool_destroy(conf->r10bio_pool);
3738 safe_put_page(conf->tmppage);
3739 kfree(conf->mirrors);
3740 kfree(conf);
3741 mddev->private = NULL;
3742out:
3743 return -EIO;
3744}
3745
3746static int stop(struct mddev *mddev)
3747{
3748 struct r10conf *conf = mddev->private;
3749
3750 raise_barrier(conf, 0);
3751 lower_barrier(conf);
3752
3753 md_unregister_thread(&mddev->thread);
3754 if (mddev->queue)
3755
3756 blk_sync_queue(mddev->queue);
3757
3758 if (conf->r10bio_pool)
3759 mempool_destroy(conf->r10bio_pool);
3760 kfree(conf->mirrors);
3761 kfree(conf);
3762 mddev->private = NULL;
3763 return 0;
3764}
3765
3766static void raid10_quiesce(struct mddev *mddev, int state)
3767{
3768 struct r10conf *conf = mddev->private;
3769
3770 switch(state) {
3771 case 1:
3772 raise_barrier(conf, 0);
3773 break;
3774 case 0:
3775 lower_barrier(conf);
3776 break;
3777 }
3778}
3779
3780static int raid10_resize(struct mddev *mddev, sector_t sectors)
3781{
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794 struct r10conf *conf = mddev->private;
3795 sector_t oldsize, size;
3796
3797 if (mddev->reshape_position != MaxSector)
3798 return -EBUSY;
3799
3800 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3801 return -EINVAL;
3802
3803 oldsize = raid10_size(mddev, 0, 0);
3804 size = raid10_size(mddev, sectors, 0);
3805 if (mddev->external_size &&
3806 mddev->array_sectors > size)
3807 return -EINVAL;
3808 if (mddev->bitmap) {
3809 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3810 if (ret)
3811 return ret;
3812 }
3813 md_set_array_sectors(mddev, size);
3814 set_capacity(mddev->gendisk, mddev->array_sectors);
3815 revalidate_disk(mddev->gendisk);
3816 if (sectors > mddev->dev_sectors &&
3817 mddev->recovery_cp > oldsize) {
3818 mddev->recovery_cp = oldsize;
3819 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3820 }
3821 calc_sectors(conf, sectors);
3822 mddev->dev_sectors = conf->dev_sectors;
3823 mddev->resync_max_sectors = size;
3824 return 0;
3825}
3826
3827static void *raid10_takeover_raid0(struct mddev *mddev)
3828{
3829 struct md_rdev *rdev;
3830 struct r10conf *conf;
3831
3832 if (mddev->degraded > 0) {
3833 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
3834 mdname(mddev));
3835 return ERR_PTR(-EINVAL);
3836 }
3837
3838
3839 mddev->new_level = 10;
3840
3841 mddev->new_layout = (1<<8) + 2;
3842 mddev->new_chunk_sectors = mddev->chunk_sectors;
3843 mddev->delta_disks = mddev->raid_disks;
3844 mddev->raid_disks *= 2;
3845
3846 mddev->recovery_cp = MaxSector;
3847
3848 conf = setup_conf(mddev);
3849 if (!IS_ERR(conf)) {
3850 rdev_for_each(rdev, mddev)
3851 if (rdev->raid_disk >= 0)
3852 rdev->new_raid_disk = rdev->raid_disk * 2;
3853 conf->barrier = 1;
3854 }
3855
3856 return conf;
3857}
3858
3859static void *raid10_takeover(struct mddev *mddev)
3860{
3861 struct r0conf *raid0_conf;
3862
3863
3864
3865
3866 if (mddev->level == 0) {
3867
3868 raid0_conf = mddev->private;
3869 if (raid0_conf->nr_strip_zones > 1) {
3870 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
3871 " with more than one zone.\n",
3872 mdname(mddev));
3873 return ERR_PTR(-EINVAL);
3874 }
3875 return raid10_takeover_raid0(mddev);
3876 }
3877 return ERR_PTR(-EINVAL);
3878}
3879
3880static int raid10_check_reshape(struct mddev *mddev)
3881{
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896 struct r10conf *conf = mddev->private;
3897 struct geom geo;
3898
3899 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3900 return -EINVAL;
3901
3902 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
3903
3904 return -EINVAL;
3905 if (geo.far_copies > 1 && !geo.far_offset)
3906
3907 return -EINVAL;
3908
3909 if (mddev->array_sectors & geo.chunk_mask)
3910
3911 return -EINVAL;
3912
3913 if (!enough(conf, -1))
3914 return -EINVAL;
3915
3916 kfree(conf->mirrors_new);
3917 conf->mirrors_new = NULL;
3918 if (mddev->delta_disks > 0) {
3919
3920 conf->mirrors_new = kzalloc(
3921 sizeof(struct raid10_info)
3922 *(mddev->raid_disks +
3923 mddev->delta_disks),
3924 GFP_KERNEL);
3925 if (!conf->mirrors_new)
3926 return -ENOMEM;
3927 }
3928 return 0;
3929}
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944static int calc_degraded(struct r10conf *conf)
3945{
3946 int degraded, degraded2;
3947 int i;
3948
3949 rcu_read_lock();
3950 degraded = 0;
3951
3952 for (i = 0; i < conf->prev.raid_disks; i++) {
3953 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3954 if (!rdev || test_bit(Faulty, &rdev->flags))
3955 degraded++;
3956 else if (!test_bit(In_sync, &rdev->flags))
3957
3958
3959
3960
3961 degraded++;
3962 }
3963 rcu_read_unlock();
3964 if (conf->geo.raid_disks == conf->prev.raid_disks)
3965 return degraded;
3966 rcu_read_lock();
3967 degraded2 = 0;
3968 for (i = 0; i < conf->geo.raid_disks; i++) {
3969 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3970 if (!rdev || test_bit(Faulty, &rdev->flags))
3971 degraded2++;
3972 else if (!test_bit(In_sync, &rdev->flags)) {
3973
3974
3975
3976
3977
3978 if (conf->geo.raid_disks <= conf->prev.raid_disks)
3979 degraded2++;
3980 }
3981 }
3982 rcu_read_unlock();
3983 if (degraded2 > degraded)
3984 return degraded2;
3985 return degraded;
3986}
3987
3988static int raid10_start_reshape(struct mddev *mddev)
3989{
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000 unsigned long before_length, after_length;
4001 sector_t min_offset_diff = 0;
4002 int first = 1;
4003 struct geom new;
4004 struct r10conf *conf = mddev->private;
4005 struct md_rdev *rdev;
4006 int spares = 0;
4007 int ret;
4008
4009 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4010 return -EBUSY;
4011
4012 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4013 return -EINVAL;
4014
4015 before_length = ((1 << conf->prev.chunk_shift) *
4016 conf->prev.far_copies);
4017 after_length = ((1 << conf->geo.chunk_shift) *
4018 conf->geo.far_copies);
4019
4020 rdev_for_each(rdev, mddev) {
4021 if (!test_bit(In_sync, &rdev->flags)
4022 && !test_bit(Faulty, &rdev->flags))
4023 spares++;
4024 if (rdev->raid_disk >= 0) {
4025 long long diff = (rdev->new_data_offset
4026 - rdev->data_offset);
4027 if (!mddev->reshape_backwards)
4028 diff = -diff;
4029 if (diff < 0)
4030 diff = 0;
4031 if (first || diff < min_offset_diff)
4032 min_offset_diff = diff;
4033 }
4034 }
4035
4036 if (max(before_length, after_length) > min_offset_diff)
4037 return -EINVAL;
4038
4039 if (spares < mddev->delta_disks)
4040 return -EINVAL;
4041
4042 conf->offset_diff = min_offset_diff;
4043 spin_lock_irq(&conf->device_lock);
4044 if (conf->mirrors_new) {
4045 memcpy(conf->mirrors_new, conf->mirrors,
4046 sizeof(struct raid10_info)*conf->prev.raid_disks);
4047 smp_mb();
4048 kfree(conf->mirrors_old);
4049 conf->mirrors_old = conf->mirrors;
4050 conf->mirrors = conf->mirrors_new;
4051 conf->mirrors_new = NULL;
4052 }
4053 setup_geo(&conf->geo, mddev, geo_start);
4054 smp_mb();
4055 if (mddev->reshape_backwards) {
4056 sector_t size = raid10_size(mddev, 0, 0);
4057 if (size < mddev->array_sectors) {
4058 spin_unlock_irq(&conf->device_lock);
4059 printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n",
4060 mdname(mddev));
4061 return -EINVAL;
4062 }
4063 mddev->resync_max_sectors = size;
4064 conf->reshape_progress = size;
4065 } else
4066 conf->reshape_progress = 0;
4067 spin_unlock_irq(&conf->device_lock);
4068
4069 if (mddev->delta_disks && mddev->bitmap) {
4070 ret = bitmap_resize(mddev->bitmap,
4071 raid10_size(mddev, 0,
4072 conf->geo.raid_disks),
4073 0, 0);
4074 if (ret)
4075 goto abort;
4076 }
4077 if (mddev->delta_disks > 0) {
4078 rdev_for_each(rdev, mddev)
4079 if (rdev->raid_disk < 0 &&
4080 !test_bit(Faulty, &rdev->flags)) {
4081 if (raid10_add_disk(mddev, rdev) == 0) {
4082 if (rdev->raid_disk >=
4083 conf->prev.raid_disks)
4084 set_bit(In_sync, &rdev->flags);
4085 else
4086 rdev->recovery_offset = 0;
4087
4088 if (sysfs_link_rdev(mddev, rdev))
4089 ;
4090 }
4091 } else if (rdev->raid_disk >= conf->prev.raid_disks
4092 && !test_bit(Faulty, &rdev->flags)) {
4093
4094 set_bit(In_sync, &rdev->flags);
4095 }
4096 }
4097
4098
4099
4100
4101 spin_lock_irq(&conf->device_lock);
4102 mddev->degraded = calc_degraded(conf);
4103 spin_unlock_irq(&conf->device_lock);
4104 mddev->raid_disks = conf->geo.raid_disks;
4105 mddev->reshape_position = conf->reshape_progress;
4106 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4107
4108 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4109 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4110 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4111 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4112
4113 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4114 "reshape");
4115 if (!mddev->sync_thread) {
4116 ret = -EAGAIN;
4117 goto abort;
4118 }
4119 conf->reshape_checkpoint = jiffies;
4120 md_wakeup_thread(mddev->sync_thread);
4121 md_new_event(mddev);
4122 return 0;
4123
4124abort:
4125 mddev->recovery = 0;
4126 spin_lock_irq(&conf->device_lock);
4127 conf->geo = conf->prev;
4128 mddev->raid_disks = conf->geo.raid_disks;
4129 rdev_for_each(rdev, mddev)
4130 rdev->new_data_offset = rdev->data_offset;
4131 smp_wmb();
4132 conf->reshape_progress = MaxSector;
4133 mddev->reshape_position = MaxSector;
4134 spin_unlock_irq(&conf->device_lock);
4135 return ret;
4136}
4137
4138
4139
4140
4141
4142
4143
4144static sector_t last_dev_address(sector_t s, struct geom *geo)
4145{
4146 s = (s | geo->chunk_mask) + 1;
4147 s >>= geo->chunk_shift;
4148 s *= geo->near_copies;
4149 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4150 s *= geo->far_copies;
4151 s <<= geo->chunk_shift;
4152 return s;
4153}
4154
4155
4156
4157
4158
4159static sector_t first_dev_address(sector_t s, struct geom *geo)
4160{
4161 s >>= geo->chunk_shift;
4162 s *= geo->near_copies;
4163 sector_div(s, geo->raid_disks);
4164 s *= geo->far_copies;
4165 s <<= geo->chunk_shift;
4166 return s;
4167}
4168
4169static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4170 int *skipped)
4171{
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209 struct r10conf *conf = mddev->private;
4210 struct r10bio *r10_bio;
4211 sector_t next, safe, last;
4212 int max_sectors;
4213 int nr_sectors;
4214 int s;
4215 struct md_rdev *rdev;
4216 int need_flush = 0;
4217 struct bio *blist;
4218 struct bio *bio, *read_bio;
4219 int sectors_done = 0;
4220
4221 if (sector_nr == 0) {
4222
4223 if (mddev->reshape_backwards &&
4224 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4225 sector_nr = (raid10_size(mddev, 0, 0)
4226 - conf->reshape_progress);
4227 } else if (!mddev->reshape_backwards &&
4228 conf->reshape_progress > 0)
4229 sector_nr = conf->reshape_progress;
4230 if (sector_nr) {
4231 mddev->curr_resync_completed = sector_nr;
4232 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4233 *skipped = 1;
4234 return sector_nr;
4235 }
4236 }
4237
4238
4239
4240
4241
4242 if (mddev->reshape_backwards) {
4243
4244
4245
4246 next = first_dev_address(conf->reshape_progress - 1,
4247 &conf->geo);
4248
4249
4250
4251
4252 safe = last_dev_address(conf->reshape_safe - 1,
4253 &conf->prev);
4254
4255 if (next + conf->offset_diff < safe)
4256 need_flush = 1;
4257
4258 last = conf->reshape_progress - 1;
4259 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4260 & conf->prev.chunk_mask);
4261 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4262 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4263 } else {
4264
4265
4266
4267 next = last_dev_address(conf->reshape_progress, &conf->geo);
4268
4269
4270
4271
4272 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4273
4274
4275
4276
4277 if (next > safe + conf->offset_diff)
4278 need_flush = 1;
4279
4280 sector_nr = conf->reshape_progress;
4281 last = sector_nr | (conf->geo.chunk_mask
4282 & conf->prev.chunk_mask);
4283
4284 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4285 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4286 }
4287
4288 if (need_flush ||
4289 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4290
4291 wait_barrier(conf);
4292 mddev->reshape_position = conf->reshape_progress;
4293 if (mddev->reshape_backwards)
4294 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4295 - conf->reshape_progress;
4296 else
4297 mddev->curr_resync_completed = conf->reshape_progress;
4298 conf->reshape_checkpoint = jiffies;
4299 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4300 md_wakeup_thread(mddev->thread);
4301 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4302 kthread_should_stop());
4303 conf->reshape_safe = mddev->reshape_position;
4304 allow_barrier(conf);
4305 }
4306
4307read_more:
4308
4309 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4310 raise_barrier(conf, sectors_done != 0);
4311 atomic_set(&r10_bio->remaining, 0);
4312 r10_bio->mddev = mddev;
4313 r10_bio->sector = sector_nr;
4314 set_bit(R10BIO_IsReshape, &r10_bio->state);
4315 r10_bio->sectors = last - sector_nr + 1;
4316 rdev = read_balance(conf, r10_bio, &max_sectors);
4317 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4318
4319 if (!rdev) {
4320
4321
4322
4323
4324 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4325 return sectors_done;
4326 }
4327
4328 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4329
4330 read_bio->bi_bdev = rdev->bdev;
4331 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4332 + rdev->data_offset);
4333 read_bio->bi_private = r10_bio;
4334 read_bio->bi_end_io = end_sync_read;
4335 read_bio->bi_rw = READ;
4336 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4337 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4338 read_bio->bi_vcnt = 0;
4339 read_bio->bi_idx = 0;
4340 read_bio->bi_size = 0;
4341 r10_bio->master_bio = read_bio;
4342 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4343
4344
4345 __raid10_find_phys(&conf->geo, r10_bio);
4346
4347 blist = read_bio;
4348 read_bio->bi_next = NULL;
4349
4350 for (s = 0; s < conf->copies*2; s++) {
4351 struct bio *b;
4352 int d = r10_bio->devs[s/2].devnum;
4353 struct md_rdev *rdev2;
4354 if (s&1) {
4355 rdev2 = conf->mirrors[d].replacement;
4356 b = r10_bio->devs[s/2].repl_bio;
4357 } else {
4358 rdev2 = conf->mirrors[d].rdev;
4359 b = r10_bio->devs[s/2].bio;
4360 }
4361 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4362 continue;
4363 b->bi_bdev = rdev2->bdev;
4364 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
4365 b->bi_private = r10_bio;
4366 b->bi_end_io = end_reshape_write;
4367 b->bi_rw = WRITE;
4368 b->bi_flags &= ~(BIO_POOL_MASK - 1);
4369 b->bi_flags |= 1 << BIO_UPTODATE;
4370 b->bi_next = blist;
4371 b->bi_vcnt = 0;
4372 b->bi_idx = 0;
4373 b->bi_size = 0;
4374 blist = b;
4375 }
4376
4377
4378
4379 nr_sectors = 0;
4380 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4381 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4382 int len = (max_sectors - s) << 9;
4383 if (len > PAGE_SIZE)
4384 len = PAGE_SIZE;
4385 for (bio = blist; bio ; bio = bio->bi_next) {
4386 struct bio *bio2;
4387 if (bio_add_page(bio, page, len, 0))
4388 continue;
4389
4390
4391 for (bio2 = blist;
4392 bio2 && bio2 != bio;
4393 bio2 = bio2->bi_next) {
4394
4395 bio2->bi_vcnt--;
4396 bio2->bi_size -= len;
4397 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4398 }
4399 goto bio_full;
4400 }
4401 sector_nr += len >> 9;
4402 nr_sectors += len >> 9;
4403 }
4404bio_full:
4405 r10_bio->sectors = nr_sectors;
4406
4407
4408 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4409 atomic_inc(&r10_bio->remaining);
4410 read_bio->bi_next = NULL;
4411 generic_make_request(read_bio);
4412 sector_nr += nr_sectors;
4413 sectors_done += nr_sectors;
4414 if (sector_nr <= last)
4415 goto read_more;
4416
4417
4418
4419
4420 if (mddev->reshape_backwards)
4421 conf->reshape_progress -= sectors_done;
4422 else
4423 conf->reshape_progress += sectors_done;
4424
4425 return sectors_done;
4426}
4427
4428static void end_reshape_request(struct r10bio *r10_bio);
4429static int handle_reshape_read_error(struct mddev *mddev,
4430 struct r10bio *r10_bio);
4431static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4432{
4433
4434
4435
4436
4437
4438 struct r10conf *conf = mddev->private;
4439 int s;
4440
4441 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4442 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4443
4444 md_done_sync(mddev, r10_bio->sectors, 0);
4445 return;
4446 }
4447
4448
4449
4450
4451 atomic_set(&r10_bio->remaining, 1);
4452 for (s = 0; s < conf->copies*2; s++) {
4453 struct bio *b;
4454 int d = r10_bio->devs[s/2].devnum;
4455 struct md_rdev *rdev;
4456 if (s&1) {
4457 rdev = conf->mirrors[d].replacement;
4458 b = r10_bio->devs[s/2].repl_bio;
4459 } else {
4460 rdev = conf->mirrors[d].rdev;
4461 b = r10_bio->devs[s/2].bio;
4462 }
4463 if (!rdev || test_bit(Faulty, &rdev->flags))
4464 continue;
4465 atomic_inc(&rdev->nr_pending);
4466 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4467 atomic_inc(&r10_bio->remaining);
4468 b->bi_next = NULL;
4469 generic_make_request(b);
4470 }
4471 end_reshape_request(r10_bio);
4472}
4473
4474static void end_reshape(struct r10conf *conf)
4475{
4476 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4477 return;
4478
4479 spin_lock_irq(&conf->device_lock);
4480 conf->prev = conf->geo;
4481 md_finish_reshape(conf->mddev);
4482 smp_wmb();
4483 conf->reshape_progress = MaxSector;
4484 spin_unlock_irq(&conf->device_lock);
4485
4486
4487
4488
4489 if (conf->mddev->queue) {
4490 int stripe = conf->geo.raid_disks *
4491 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4492 stripe /= conf->geo.near_copies;
4493 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4494 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4495 }
4496 conf->fullsync = 0;
4497}
4498
4499
4500static int handle_reshape_read_error(struct mddev *mddev,
4501 struct r10bio *r10_bio)
4502{
4503
4504 int sectors = r10_bio->sectors;
4505 struct r10conf *conf = mddev->private;
4506 struct {
4507 struct r10bio r10_bio;
4508 struct r10dev devs[conf->copies];
4509 } on_stack;
4510 struct r10bio *r10b = &on_stack.r10_bio;
4511 int slot = 0;
4512 int idx = 0;
4513 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4514
4515 r10b->sector = r10_bio->sector;
4516 __raid10_find_phys(&conf->prev, r10b);
4517
4518 while (sectors) {
4519 int s = sectors;
4520 int success = 0;
4521 int first_slot = slot;
4522
4523 if (s > (PAGE_SIZE >> 9))
4524 s = PAGE_SIZE >> 9;
4525
4526 while (!success) {
4527 int d = r10b->devs[slot].devnum;
4528 struct md_rdev *rdev = conf->mirrors[d].rdev;
4529 sector_t addr;
4530 if (rdev == NULL ||
4531 test_bit(Faulty, &rdev->flags) ||
4532 !test_bit(In_sync, &rdev->flags))
4533 goto failed;
4534
4535 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4536 success = sync_page_io(rdev,
4537 addr,
4538 s << 9,
4539 bvec[idx].bv_page,
4540 READ, false);
4541 if (success)
4542 break;
4543 failed:
4544 slot++;
4545 if (slot >= conf->copies)
4546 slot = 0;
4547 if (slot == first_slot)
4548 break;
4549 }
4550 if (!success) {
4551
4552 set_bit(MD_RECOVERY_INTR,
4553 &mddev->recovery);
4554 return -EIO;
4555 }
4556 sectors -= s;
4557 idx++;
4558 }
4559 return 0;
4560}
4561
4562static void end_reshape_write(struct bio *bio, int error)
4563{
4564 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
4565 struct r10bio *r10_bio = bio->bi_private;
4566 struct mddev *mddev = r10_bio->mddev;
4567 struct r10conf *conf = mddev->private;
4568 int d;
4569 int slot;
4570 int repl;
4571 struct md_rdev *rdev = NULL;
4572
4573 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4574 if (repl)
4575 rdev = conf->mirrors[d].replacement;
4576 if (!rdev) {
4577 smp_mb();
4578 rdev = conf->mirrors[d].rdev;
4579 }
4580
4581 if (!uptodate) {
4582
4583 md_error(mddev, rdev);
4584 }
4585
4586 rdev_dec_pending(rdev, mddev);
4587 end_reshape_request(r10_bio);
4588}
4589
4590static void end_reshape_request(struct r10bio *r10_bio)
4591{
4592 if (!atomic_dec_and_test(&r10_bio->remaining))
4593 return;
4594 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4595 bio_put(r10_bio->master_bio);
4596 put_buf(r10_bio);
4597}
4598
4599static void raid10_finish_reshape(struct mddev *mddev)
4600{
4601 struct r10conf *conf = mddev->private;
4602
4603 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4604 return;
4605
4606 if (mddev->delta_disks > 0) {
4607 sector_t size = raid10_size(mddev, 0, 0);
4608 md_set_array_sectors(mddev, size);
4609 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4610 mddev->recovery_cp = mddev->resync_max_sectors;
4611 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4612 }
4613 mddev->resync_max_sectors = size;
4614 set_capacity(mddev->gendisk, mddev->array_sectors);
4615 revalidate_disk(mddev->gendisk);
4616 } else {
4617 int d;
4618 for (d = conf->geo.raid_disks ;
4619 d < conf->geo.raid_disks - mddev->delta_disks;
4620 d++) {
4621 struct md_rdev *rdev = conf->mirrors[d].rdev;
4622 if (rdev)
4623 clear_bit(In_sync, &rdev->flags);
4624 rdev = conf->mirrors[d].replacement;
4625 if (rdev)
4626 clear_bit(In_sync, &rdev->flags);
4627 }
4628 }
4629 mddev->layout = mddev->new_layout;
4630 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4631 mddev->reshape_position = MaxSector;
4632 mddev->delta_disks = 0;
4633 mddev->reshape_backwards = 0;
4634}
4635
4636static struct md_personality raid10_personality =
4637{
4638 .name = "raid10",
4639 .level = 10,
4640 .owner = THIS_MODULE,
4641 .make_request = make_request,
4642 .run = run,
4643 .stop = stop,
4644 .status = status,
4645 .error_handler = error,
4646 .hot_add_disk = raid10_add_disk,
4647 .hot_remove_disk= raid10_remove_disk,
4648 .spare_active = raid10_spare_active,
4649 .sync_request = sync_request,
4650 .quiesce = raid10_quiesce,
4651 .size = raid10_size,
4652 .resize = raid10_resize,
4653 .takeover = raid10_takeover,
4654 .check_reshape = raid10_check_reshape,
4655 .start_reshape = raid10_start_reshape,
4656 .finish_reshape = raid10_finish_reshape,
4657};
4658
4659static int __init raid_init(void)
4660{
4661 return register_md_personality(&raid10_personality);
4662}
4663
4664static void raid_exit(void)
4665{
4666 unregister_md_personality(&raid10_personality);
4667}
4668
/* Hook raid_init()/raid_exit() up as the module's entry and exit points. */
module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
/* Alias names declared for this module. */
MODULE_ALIAS("md-personality-9");
MODULE_ALIAS("md-raid10");
MODULE_ALIAS("md-level-10");

/*
 * Expose max_queued_requests as an int module parameter, readable by
 * everyone and writable by the owner (S_IRUGO|S_IWUSR).
 */
module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
4678