1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21#include <linux/slab.h>
22#include <linux/delay.h>
23#include <linux/blkdev.h>
24#include <linux/module.h>
25#include <linux/seq_file.h>
26#include <linux/ratelimit.h>
27#include <linux/kthread.h>
28#include "md.h"
29#include "raid10.h"
30#include "raid0.h"
31#include "bitmap.h"
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/*
 * Pool-size constant for r10bio objects.
 * NOTE(review): not referenced in the code visible here; presumably the
 * minimum number of r10bios reserved in the mempool - confirm at the
 * mempool_create() call site.
 */
#define	NR_RAID10_BIOS 256

/*
 * Sentinel stored in a devs[].bio slot in place of a real bio pointer.
 * read_balance() skips every slot whose bio equals IO_BLOCKED.
 */
#define IO_BLOCKED ((struct bio *)1)

/*
 * Sentinel stored in devs[].bio or devs[].repl_bio by
 * raid10_end_write_request() after a write completed successfully over a
 * range for which is_badblock() returned non-zero.
 */
#define IO_MADE_GOOD ((struct bio *)2)

/*
 * True for NULL and for both sentinels above, i.e. for every value that
 * must not be handed to bio_put().  The argument is parenthesised so that
 * compound expressions (e.g. a conditional) are cast as a whole.
 */
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)
76
77
78
79
80
/*
 * Threshold on conf->pending_count: md_raid10_congested() reports the
 * array as congested for async writeback, and make_request() waits, once
 * at least this many writes are queued.
 */
static int max_queued_requests = 1024;

/* Forward declarations of helpers that are used before they are defined. */
static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int enough(struct r10conf *conf, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio, int error);
static void end_reshape(struct r10conf *conf);
91
92static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
93{
94 struct r10conf *conf = data;
95 int size = offsetof(struct r10bio, devs[conf->copies]);
96
97
98
99 return kzalloc(size, gfp_flags);
100}
101
/*
 * Release an r10bio that was obtained from r10bio_pool_alloc().
 * @r10_bio: object to free
 * @data:    unused
 */
static void r10bio_pool_free(void *r10_bio, void *data)
{
	kfree(r10_bio);
}
106
107
/* Bytes of page buffer attached to each bio built by r10buf_pool_alloc(). */
#define RESYNC_BLOCK_SIZE (64*1024)
/* Number of pages needed to hold RESYNC_BLOCK_SIZE bytes, rounded up. */
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)

/* NOTE(review): unused in the code visible here; presumably a byte count - confirm. */
#define RESYNC_WINDOW (1024*1024)

/*
 * raise_barrier() sleeps while conf->barrier has reached this value.
 * Expressed as 32MiB worth of RESYNC_BLOCK_SIZE buffers.
 */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
114
115
116
117
118
119
120
121
122static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
123{
124 struct r10conf *conf = data;
125 struct page *page;
126 struct r10bio *r10_bio;
127 struct bio *bio;
128 int i, j;
129 int nalloc;
130
131 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
132 if (!r10_bio)
133 return NULL;
134
135 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
136 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
137 nalloc = conf->copies;
138 else
139 nalloc = 2;
140
141
142
143
144 for (j = nalloc ; j-- ; ) {
145 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
146 if (!bio)
147 goto out_free_bio;
148 r10_bio->devs[j].bio = bio;
149 if (!conf->have_replacement)
150 continue;
151 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
152 if (!bio)
153 goto out_free_bio;
154 r10_bio->devs[j].repl_bio = bio;
155 }
156
157
158
159
160 for (j = 0 ; j < nalloc; j++) {
161 struct bio *rbio = r10_bio->devs[j].repl_bio;
162 bio = r10_bio->devs[j].bio;
163 for (i = 0; i < RESYNC_PAGES; i++) {
164 if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
165 &conf->mddev->recovery)) {
166
167
168 struct bio *rbio = r10_bio->devs[0].bio;
169 page = rbio->bi_io_vec[i].bv_page;
170 get_page(page);
171 } else
172 page = alloc_page(gfp_flags);
173 if (unlikely(!page))
174 goto out_free_pages;
175
176 bio->bi_io_vec[i].bv_page = page;
177 if (rbio)
178 rbio->bi_io_vec[i].bv_page = page;
179 }
180 }
181
182 return r10_bio;
183
184out_free_pages:
185 for ( ; i > 0 ; i--)
186 safe_put_page(bio->bi_io_vec[i-1].bv_page);
187 while (j--)
188 for (i = 0; i < RESYNC_PAGES ; i++)
189 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
190 j = 0;
191out_free_bio:
192 for ( ; j < nalloc; j++) {
193 if (r10_bio->devs[j].bio)
194 bio_put(r10_bio->devs[j].bio);
195 if (r10_bio->devs[j].repl_bio)
196 bio_put(r10_bio->devs[j].repl_bio);
197 }
198 r10bio_pool_free(r10_bio, conf);
199 return NULL;
200}
201
202static void r10buf_pool_free(void *__r10_bio, void *data)
203{
204 int i;
205 struct r10conf *conf = data;
206 struct r10bio *r10bio = __r10_bio;
207 int j;
208
209 for (j=0; j < conf->copies; j++) {
210 struct bio *bio = r10bio->devs[j].bio;
211 if (bio) {
212 for (i = 0; i < RESYNC_PAGES; i++) {
213 safe_put_page(bio->bi_io_vec[i].bv_page);
214 bio->bi_io_vec[i].bv_page = NULL;
215 }
216 bio_put(bio);
217 }
218 bio = r10bio->devs[j].repl_bio;
219 if (bio)
220 bio_put(bio);
221 }
222 r10bio_pool_free(r10bio, conf);
223}
224
225static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
226{
227 int i;
228
229 for (i = 0; i < conf->copies; i++) {
230 struct bio **bio = & r10_bio->devs[i].bio;
231 if (!BIO_SPECIAL(*bio))
232 bio_put(*bio);
233 *bio = NULL;
234 bio = &r10_bio->devs[i].repl_bio;
235 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
236 bio_put(*bio);
237 *bio = NULL;
238 }
239}
240
241static void free_r10bio(struct r10bio *r10_bio)
242{
243 struct r10conf *conf = r10_bio->mddev->private;
244
245 put_all_bios(conf, r10_bio);
246 mempool_free(r10_bio, conf->r10bio_pool);
247}
248
249static void put_buf(struct r10bio *r10_bio)
250{
251 struct r10conf *conf = r10_bio->mddev->private;
252
253 mempool_free(r10_bio, conf->r10buf_pool);
254
255 lower_barrier(conf);
256}
257
258static void reschedule_retry(struct r10bio *r10_bio)
259{
260 unsigned long flags;
261 struct mddev *mddev = r10_bio->mddev;
262 struct r10conf *conf = mddev->private;
263
264 spin_lock_irqsave(&conf->device_lock, flags);
265 list_add(&r10_bio->retry_list, &conf->retry_list);
266 conf->nr_queued ++;
267 spin_unlock_irqrestore(&conf->device_lock, flags);
268
269
270 wake_up(&conf->wait_barrier);
271
272 md_wakeup_thread(mddev->thread);
273}
274
275
276
277
278
279
280static void raid_end_bio_io(struct r10bio *r10_bio)
281{
282 struct bio *bio = r10_bio->master_bio;
283 int done;
284 struct r10conf *conf = r10_bio->mddev->private;
285
286 if (bio->bi_phys_segments) {
287 unsigned long flags;
288 spin_lock_irqsave(&conf->device_lock, flags);
289 bio->bi_phys_segments--;
290 done = (bio->bi_phys_segments == 0);
291 spin_unlock_irqrestore(&conf->device_lock, flags);
292 } else
293 done = 1;
294 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
295 clear_bit(BIO_UPTODATE, &bio->bi_flags);
296 if (done) {
297 bio_endio(bio, 0);
298
299
300
301
302 allow_barrier(conf);
303 }
304 free_r10bio(r10_bio);
305}
306
307
308
309
310static inline void update_head_pos(int slot, struct r10bio *r10_bio)
311{
312 struct r10conf *conf = r10_bio->mddev->private;
313
314 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
315 r10_bio->devs[slot].addr + (r10_bio->sectors);
316}
317
318
319
320
321static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
322 struct bio *bio, int *slotp, int *replp)
323{
324 int slot;
325 int repl = 0;
326
327 for (slot = 0; slot < conf->copies; slot++) {
328 if (r10_bio->devs[slot].bio == bio)
329 break;
330 if (r10_bio->devs[slot].repl_bio == bio) {
331 repl = 1;
332 break;
333 }
334 }
335
336 BUG_ON(slot == conf->copies);
337 update_head_pos(slot, r10_bio);
338
339 if (slotp)
340 *slotp = slot;
341 if (replp)
342 *replp = repl;
343 return r10_bio->devs[slot].devnum;
344}
345
346static void raid10_end_read_request(struct bio *bio, int error)
347{
348 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
349 struct r10bio *r10_bio = bio->bi_private;
350 int slot, dev;
351 struct md_rdev *rdev;
352 struct r10conf *conf = r10_bio->mddev->private;
353
354
355 slot = r10_bio->read_slot;
356 dev = r10_bio->devs[slot].devnum;
357 rdev = r10_bio->devs[slot].rdev;
358
359
360
361 update_head_pos(slot, r10_bio);
362
363 if (uptodate) {
364
365
366
367
368
369
370
371
372
373 set_bit(R10BIO_Uptodate, &r10_bio->state);
374 } else {
375
376
377
378
379
380 unsigned long flags;
381 spin_lock_irqsave(&conf->device_lock, flags);
382 if (!enough(conf, rdev->raid_disk))
383 uptodate = 1;
384 spin_unlock_irqrestore(&conf->device_lock, flags);
385 }
386 if (uptodate) {
387 raid_end_bio_io(r10_bio);
388 rdev_dec_pending(rdev, conf->mddev);
389 } else {
390
391
392
393 char b[BDEVNAME_SIZE];
394 printk_ratelimited(KERN_ERR
395 "md/raid10:%s: %s: rescheduling sector %llu\n",
396 mdname(conf->mddev),
397 bdevname(rdev->bdev, b),
398 (unsigned long long)r10_bio->sector);
399 set_bit(R10BIO_ReadError, &r10_bio->state);
400 reschedule_retry(r10_bio);
401 }
402}
403
404static void close_write(struct r10bio *r10_bio)
405{
406
407 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
408 r10_bio->sectors,
409 !test_bit(R10BIO_Degraded, &r10_bio->state),
410 0);
411 md_write_end(r10_bio->mddev);
412}
413
414static void one_write_done(struct r10bio *r10_bio)
415{
416 if (atomic_dec_and_test(&r10_bio->remaining)) {
417 if (test_bit(R10BIO_WriteError, &r10_bio->state))
418 reschedule_retry(r10_bio);
419 else {
420 close_write(r10_bio);
421 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
422 reschedule_retry(r10_bio);
423 else
424 raid_end_bio_io(r10_bio);
425 }
426 }
427}
428
429static void raid10_end_write_request(struct bio *bio, int error)
430{
431 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
432 struct r10bio *r10_bio = bio->bi_private;
433 int dev;
434 int dec_rdev = 1;
435 struct r10conf *conf = r10_bio->mddev->private;
436 int slot, repl;
437 struct md_rdev *rdev = NULL;
438
439 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
440
441 if (repl)
442 rdev = conf->mirrors[dev].replacement;
443 if (!rdev) {
444 smp_rmb();
445 repl = 0;
446 rdev = conf->mirrors[dev].rdev;
447 }
448
449
450
451 if (!uptodate) {
452 if (repl)
453
454
455
456 md_error(rdev->mddev, rdev);
457 else {
458 set_bit(WriteErrorSeen, &rdev->flags);
459 if (!test_and_set_bit(WantReplacement, &rdev->flags))
460 set_bit(MD_RECOVERY_NEEDED,
461 &rdev->mddev->recovery);
462 set_bit(R10BIO_WriteError, &r10_bio->state);
463 dec_rdev = 0;
464 }
465 } else {
466
467
468
469
470
471
472
473
474
475 sector_t first_bad;
476 int bad_sectors;
477
478 set_bit(R10BIO_Uptodate, &r10_bio->state);
479
480
481 if (is_badblock(rdev,
482 r10_bio->devs[slot].addr,
483 r10_bio->sectors,
484 &first_bad, &bad_sectors)) {
485 bio_put(bio);
486 if (repl)
487 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
488 else
489 r10_bio->devs[slot].bio = IO_MADE_GOOD;
490 dec_rdev = 0;
491 set_bit(R10BIO_MadeGood, &r10_bio->state);
492 }
493 }
494
495
496
497
498
499
500 one_write_done(r10_bio);
501 if (dec_rdev)
502 rdev_dec_pending(rdev, conf->mddev);
503}
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
531{
532 int n,f;
533 sector_t sector;
534 sector_t chunk;
535 sector_t stripe;
536 int dev;
537 int slot = 0;
538
539
540 chunk = r10bio->sector >> geo->chunk_shift;
541 sector = r10bio->sector & geo->chunk_mask;
542
543 chunk *= geo->near_copies;
544 stripe = chunk;
545 dev = sector_div(stripe, geo->raid_disks);
546 if (geo->far_offset)
547 stripe *= geo->far_copies;
548
549 sector += stripe << geo->chunk_shift;
550
551
552 for (n = 0; n < geo->near_copies; n++) {
553 int d = dev;
554 sector_t s = sector;
555 r10bio->devs[slot].addr = sector;
556 r10bio->devs[slot].devnum = d;
557 slot++;
558
559 for (f = 1; f < geo->far_copies; f++) {
560 d += geo->near_copies;
561 if (d >= geo->raid_disks)
562 d -= geo->raid_disks;
563 s += geo->stride;
564 r10bio->devs[slot].devnum = d;
565 r10bio->devs[slot].addr = s;
566 slot++;
567 }
568 dev++;
569 if (dev >= geo->raid_disks) {
570 dev = 0;
571 sector += (geo->chunk_mask + 1);
572 }
573 }
574}
575
576static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
577{
578 struct geom *geo = &conf->geo;
579
580 if (conf->reshape_progress != MaxSector &&
581 ((r10bio->sector >= conf->reshape_progress) !=
582 conf->mddev->reshape_backwards)) {
583 set_bit(R10BIO_Previous, &r10bio->state);
584 geo = &conf->prev;
585 } else
586 clear_bit(R10BIO_Previous, &r10bio->state);
587
588 __raid10_find_phys(geo, r10bio);
589}
590
591static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
592{
593 sector_t offset, chunk, vchunk;
594
595
596
597 struct geom *geo = &conf->geo;
598
599 offset = sector & geo->chunk_mask;
600 if (geo->far_offset) {
601 int fc;
602 chunk = sector >> geo->chunk_shift;
603 fc = sector_div(chunk, geo->far_copies);
604 dev -= fc * geo->near_copies;
605 if (dev < 0)
606 dev += geo->raid_disks;
607 } else {
608 while (sector >= geo->stride) {
609 sector -= geo->stride;
610 if (dev < geo->near_copies)
611 dev += geo->raid_disks - geo->near_copies;
612 else
613 dev -= geo->near_copies;
614 }
615 chunk = sector >> geo->chunk_shift;
616 }
617 vchunk = chunk * geo->raid_disks + dev;
618 sector_div(vchunk, geo->near_copies);
619 return (vchunk << geo->chunk_shift) + offset;
620}
621
622
623
624
625
626
627
628
629
630
631
/*
 * raid10_mergeable_bvec - limit how much of @biovec may be added to a bio
 * @q:      the array's request queue; q->queuedata is the mddev
 * @bvm:    description of the bio being built (bdev, start sector, size)
 * @biovec: the segment the caller would like to add
 *
 * Returns the number of bytes that may be added.  A bio that is still
 * empty (bvm->bi_size == 0) is allowed to take the whole segment when the
 * chunk-boundary limit would otherwise be no larger than the segment.
 *
 * Note that bvm->bi_sector and bvm->bi_bdev are overwritten while the
 * member devices' own merge_bvec_fn callbacks are consulted.
 */
static int raid10_mergeable_bvec(struct request_queue *q,
				 struct bvec_merge_data *bvm,
				 struct bio_vec *biovec)
{
	struct mddev *mddev = q->queuedata;
	struct r10conf *conf = mddev->private;
	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
	int max;
	unsigned int chunk_sectors;
	unsigned int bio_sectors = bvm->bi_size >> 9;
	struct geom *geo = &conf->geo;

	/*
	 * AND of the current and previous chunk masks, plus one.
	 * NOTE(review): assuming both masks have the form 2^k - 1 this is the
	 * smaller of the two chunk sizes in sectors - confirm.
	 */
	chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
	/* Same geometry selection rule as raid10_find_phys(). */
	if (conf->reshape_progress != MaxSector &&
	    ((sector >= conf->reshape_progress) !=
	     conf->mddev->reshape_backwards))
		geo = &conf->prev;

	if (geo->near_copies < geo->raid_disks) {
		/* Bytes left before the bio would cross a chunk boundary. */
		max = (chunk_sectors - ((sector & (chunk_sectors - 1))
					+ bio_sectors)) << 9;
		if (max < 0)
			/* Never hand a negative byte count back to the caller. */
			max = 0;
		if (max <= biovec->bv_len && bio_sectors == 0)
			return biovec->bv_len;
	} else
		/* near_copies >= raid_disks: no chunk-boundary limit is applied. */
		max = biovec->bv_len;

	if (mddev->merge_check_needed) {
		/* Scratch r10bio with room for conf->copies devs[] entries. */
		struct {
			struct r10bio r10_bio;
			struct r10dev devs[conf->copies];
		} on_stack;
		struct r10bio *r10_bio = &on_stack.r10_bio;
		int s;
		if (conf->reshape_progress != MaxSector) {
			/*
			 * A reshape position exists: member devices are not
			 * consulted; only the first segment of an empty bio
			 * is accepted.
			 */
			if (max <= biovec->bv_len && bio_sectors == 0)
				return biovec->bv_len;
			return 0;
		}
		r10_bio->sector = sector;
		raid10_find_phys(conf, r10_bio);
		rcu_read_lock();
		/*
		 * Take the minimum over every non-Faulty device and
		 * replacement that provides its own merge_bvec_fn.
		 */
		for (s = 0; s < conf->copies; s++) {
			int disk = r10_bio->devs[s].devnum;
			struct md_rdev *rdev = rcu_dereference(
				conf->mirrors[disk].rdev);
			if (rdev && !test_bit(Faulty, &rdev->flags)) {
				struct request_queue *q =
					bdev_get_queue(rdev->bdev);
				if (q->merge_bvec_fn) {
					bvm->bi_sector = r10_bio->devs[s].addr
						+ rdev->data_offset;
					bvm->bi_bdev = rdev->bdev;
					max = min(max, q->merge_bvec_fn(
							  q, bvm, biovec));
				}
			}
			rdev = rcu_dereference(conf->mirrors[disk].replacement);
			if (rdev && !test_bit(Faulty, &rdev->flags)) {
				struct request_queue *q =
					bdev_get_queue(rdev->bdev);
				if (q->merge_bvec_fn) {
					bvm->bi_sector = r10_bio->devs[s].addr
						+ rdev->data_offset;
					bvm->bi_bdev = rdev->bdev;
					max = min(max, q->merge_bvec_fn(
							  q, bvm, biovec));
				}
			}
		}
		rcu_read_unlock();
	}
	return max;
}
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
/*
 * read_balance - choose the device to read @r10_bio from
 * @conf:        array configuration
 * @r10_bio:     request; devs[] is (re)computed here via raid10_find_phys()
 * @max_sectors: out - number of sectors that may be read from the chosen
 *               device (0 when none was found)
 *
 * Returns the chosen rdev with its nr_pending count raised and
 * r10_bio->read_slot set, or NULL if no slot is usable.
 *
 * The slot scan runs under rcu_read_lock() and is restarted if the chosen
 * device turns out to be Faulty after the reference was taken.
 */
static struct md_rdev *read_balance(struct r10conf *conf,
				    struct r10bio *r10_bio,
				    int *max_sectors)
{
	const sector_t this_sector = r10_bio->sector;
	int disk, slot;
	int sectors = r10_bio->sectors;
	int best_good_sectors;
	sector_t new_distance, best_dist;
	struct md_rdev *best_rdev, *rdev = NULL;
	int do_balance;
	int best_slot;
	struct geom *geo = &conf->geo;

	raid10_find_phys(conf, r10_bio);
	rcu_read_lock();
retry:
	/* Reset all selection state; this is also the restart point. */
	sectors = r10_bio->sectors;
	best_slot = -1;
	best_rdev = NULL;
	best_dist = MaxSector;
	best_good_sectors = 0;
	do_balance = 1;

	/*
	 * While recovery_cp is below MaxSector and the request reaches
	 * next_resync, balancing is switched off: the first usable slot in
	 * slot order is taken.
	 */
	if (conf->mddev->recovery_cp < MaxSector
	    && (this_sector + sectors >= conf->next_resync))
		do_balance = 0;

	for (slot = 0; slot < conf->copies ; slot++) {
		sector_t first_bad;
		int bad_sectors;
		sector_t dev_sector;

		if (r10_bio->devs[slot].bio == IO_BLOCKED)
			continue;
		disk = r10_bio->devs[slot].devnum;
		/*
		 * Prefer the replacement; fall back to the ordinary rdev when
		 * the replacement is missing, Faulty, Unmerged, or the request
		 * extends past its recovery_offset.
		 */
		rdev = rcu_dereference(conf->mirrors[disk].replacement);
		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
		    test_bit(Unmerged, &rdev->flags) ||
		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
			rdev = rcu_dereference(conf->mirrors[disk].rdev);
		if (rdev == NULL ||
		    test_bit(Faulty, &rdev->flags) ||
		    test_bit(Unmerged, &rdev->flags))
			continue;
		/* A device that is not In_sync is only usable below recovery_offset. */
		if (!test_bit(In_sync, &rdev->flags) &&
		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
			continue;

		dev_sector = r10_bio->devs[slot].addr;
		if (is_badblock(rdev, dev_sector, sectors,
				&first_bad, &bad_sectors)) {
			if (best_dist < MaxSector)
				/* A slot without bad blocks was already recorded. */
				continue;
			if (first_bad <= dev_sector) {
				/*
				 * The bad range covers the start of the
				 * request, so nothing can be read here.  With
				 * balancing off, the request is clamped to
				 * the length of the bad range for the
				 * remaining slots.
				 */
				bad_sectors -= (dev_sector - first_bad);
				if (!do_balance && sectors > bad_sectors)
					sectors = bad_sectors;
				if (best_good_sectors > sectors)
					best_good_sectors = sectors;
			} else {
				/* Only the sectors before first_bad are readable. */
				sector_t good_sectors =
					first_bad - dev_sector;
				if (good_sectors > best_good_sectors) {
					best_good_sectors = good_sectors;
					best_slot = slot;
					best_rdev = rdev;
				}
				if (!do_balance)
					/* Balancing is off: take this slot. */
					break;
			}
			continue;
		} else
			best_good_sectors = sectors;

		if (!do_balance)
			break;

		/*
		 * With more than one near copy, an idle device (no pending
		 * IO) is taken immediately.
		 */
		if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
			break;

		/*
		 * With far copies the lowest device address wins; otherwise
		 * the distance from the device's last head position does.
		 */
		if (geo->far_copies > 1)
			new_distance = r10_bio->devs[slot].addr;
		else
			new_distance = abs(r10_bio->devs[slot].addr -
					   conf->mirrors[disk].head_position);
		if (new_distance < best_dist) {
			best_dist = new_distance;
			best_slot = slot;
			best_rdev = rdev;
		}
	}
	/* Loop ran to completion: fall back to the best candidate seen. */
	if (slot >= conf->copies) {
		slot = best_slot;
		rdev = best_rdev;
	}

	if (slot >= 0) {
		atomic_inc(&rdev->nr_pending);
		if (test_bit(Faulty, &rdev->flags)) {
			/*
			 * The device is Faulty now that the reference is
			 * held: drop the reference and choose again.
			 */
			rdev_dec_pending(rdev, conf->mddev);
			goto retry;
		}
		r10_bio->read_slot = slot;
	} else
		rdev = NULL;
	rcu_read_unlock();
	*max_sectors = best_good_sectors;

	return rdev;
}
859
860int md_raid10_congested(struct mddev *mddev, int bits)
861{
862 struct r10conf *conf = mddev->private;
863 int i, ret = 0;
864
865 if ((bits & (1 << BDI_async_congested)) &&
866 conf->pending_count >= max_queued_requests)
867 return 1;
868
869 rcu_read_lock();
870 for (i = 0;
871 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
872 && ret == 0;
873 i++) {
874 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
875 if (rdev && !test_bit(Faulty, &rdev->flags)) {
876 struct request_queue *q = bdev_get_queue(rdev->bdev);
877
878 ret |= bdi_congested(&q->backing_dev_info, bits);
879 }
880 }
881 rcu_read_unlock();
882 return ret;
883}
884EXPORT_SYMBOL_GPL(md_raid10_congested);
885
/*
 * Congestion callback: @data is the mddev.  Returns 1 if either the md
 * core or the raid10 personality reports congestion for @bits, else 0.
 */
static int raid10_congested(void *data, int bits)
{
	struct mddev *mddev = data;

	if (mddev_congested(mddev, bits))
		return 1;
	return md_raid10_congested(mddev, bits) != 0;
}
893
894static void flush_pending_writes(struct r10conf *conf)
895{
896
897
898
899 spin_lock_irq(&conf->device_lock);
900
901 if (conf->pending_bio_list.head) {
902 struct bio *bio;
903 bio = bio_list_get(&conf->pending_bio_list);
904 conf->pending_count = 0;
905 spin_unlock_irq(&conf->device_lock);
906
907
908 bitmap_unplug(conf->mddev->bitmap);
909 wake_up(&conf->wait_barrier);
910
911 while (bio) {
912 struct bio *next = bio->bi_next;
913 bio->bi_next = NULL;
914 if (unlikely((bio->bi_rw & REQ_DISCARD) &&
915 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
916
917 bio_endio(bio, 0);
918 else
919 generic_make_request(bio);
920 bio = next;
921 }
922 } else
923 spin_unlock_irq(&conf->device_lock);
924}
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
/*
 * raise_barrier - block new normal IO and wait for pending IO to drain
 * @conf:  array configuration
 * @force: when non-zero, do not wait for nr_waiting to reach zero; only
 *         legal while a barrier is already raised (BUG otherwise)
 *
 * All waiting is done on conf->wait_barrier with conf->resync_lock held
 * around the condition checks.
 */
static void raise_barrier(struct r10conf *conf, int force)
{
	BUG_ON(force && !conf->barrier);
	spin_lock_irq(&conf->resync_lock);

	/* Wait until nobody is waiting in wait_barrier(), unless forced. */
	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
			    conf->resync_lock, );

	/* From now on wait_barrier() callers block. */
	conf->barrier++;

	/*
	 * Wait for all pending IO to finish and for the barrier depth to
	 * be below RESYNC_DEPTH.
	 */
	wait_event_lock_irq(conf->wait_barrier,
			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
			    conf->resync_lock, );

	spin_unlock_irq(&conf->resync_lock);
}
967
968static void lower_barrier(struct r10conf *conf)
969{
970 unsigned long flags;
971 spin_lock_irqsave(&conf->resync_lock, flags);
972 conf->barrier--;
973 spin_unlock_irqrestore(&conf->resync_lock, flags);
974 wake_up(&conf->wait_barrier);
975}
976
/*
 * wait_barrier - register a new piece of normal IO
 *
 * If a barrier is raised the caller is counted in nr_waiting and sleeps
 * on conf->wait_barrier.  In every case nr_pending is incremented before
 * returning; the matching decrement is allow_barrier().
 */
static void wait_barrier(struct r10conf *conf)
{
	spin_lock_irq(&conf->resync_lock);
	if (conf->barrier) {
		conf->nr_waiting++;
		/*
		 * Sleep until the barrier is gone, or stop waiting early
		 * when IO is already pending and this task still has bios
		 * queued on current->bio_list.  raise_barrier() waits for
		 * nr_pending to reach zero, so blocking here while holding
		 * back queued bios could keep it from ever getting there.
		 */
		wait_event_lock_irq(conf->wait_barrier,
				    !conf->barrier ||
				    (conf->nr_pending &&
				     current->bio_list &&
				     !bio_list_empty(current->bio_list)),
				    conf->resync_lock,
			);
		conf->nr_waiting--;
	}
	conf->nr_pending++;
	spin_unlock_irq(&conf->resync_lock);
}
1003
1004static void allow_barrier(struct r10conf *conf)
1005{
1006 unsigned long flags;
1007 spin_lock_irqsave(&conf->resync_lock, flags);
1008 conf->nr_pending--;
1009 spin_unlock_irqrestore(&conf->resync_lock, flags);
1010 wake_up(&conf->wait_barrier);
1011}
1012
/*
 * freeze_array - stop new IO and wait for in-flight IO to settle
 *
 * Raises the barrier and counts the caller in nr_waiting, then sleeps
 * until nr_pending equals nr_queued + 1.  flush_pending_writes() is
 * passed as the command argument of wait_event_lock_irq() so queued
 * writes keep being submitted while waiting.
 * NOTE(review): the "+ 1" presumably accounts for the request the caller
 * itself still holds - confirm against the callers.
 *
 * Undone by unfreeze_array().
 */
static void freeze_array(struct r10conf *conf)
{
	spin_lock_irq(&conf->resync_lock);
	conf->barrier++;
	conf->nr_waiting++;
	wait_event_lock_irq(conf->wait_barrier,
			    conf->nr_pending == conf->nr_queued+1,
			    conf->resync_lock,
			    flush_pending_writes(conf));

	spin_unlock_irq(&conf->resync_lock);
}
1037
1038static void unfreeze_array(struct r10conf *conf)
1039{
1040
1041 spin_lock_irq(&conf->resync_lock);
1042 conf->barrier--;
1043 conf->nr_waiting--;
1044 wake_up(&conf->wait_barrier);
1045 spin_unlock_irq(&conf->resync_lock);
1046}
1047
1048static sector_t choose_data_offset(struct r10bio *r10_bio,
1049 struct md_rdev *rdev)
1050{
1051 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1052 test_bit(R10BIO_Previous, &r10_bio->state))
1053 return rdev->data_offset;
1054 else
1055 return rdev->new_data_offset;
1056}
1057
/*
 * Per-plug state for writes queued by make_request() while the submitting
 * task is plugged; consumed and freed by raid10_unplug().
 */
struct raid10_plug_cb {
	struct blk_plug_cb	cb;		/* embedded callback; cb.data is the mddev */
	struct bio_list		pending;	/* write bios queued under this plug */
	int			pending_cnt;	/* number of bios on 'pending' */
};
1063
1064static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1065{
1066 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1067 cb);
1068 struct mddev *mddev = plug->cb.data;
1069 struct r10conf *conf = mddev->private;
1070 struct bio *bio;
1071
1072 if (from_schedule || current->bio_list) {
1073 spin_lock_irq(&conf->device_lock);
1074 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1075 conf->pending_count += plug->pending_cnt;
1076 spin_unlock_irq(&conf->device_lock);
1077 md_wakeup_thread(mddev->thread);
1078 kfree(plug);
1079 return;
1080 }
1081
1082
1083 bio = bio_list_get(&plug->pending);
1084 bitmap_unplug(mddev->bitmap);
1085 wake_up(&conf->wait_barrier);
1086
1087 while (bio) {
1088 struct bio *next = bio->bi_next;
1089 bio->bi_next = NULL;
1090 generic_make_request(bio);
1091 bio = next;
1092 }
1093 kfree(plug);
1094}
1095
1096static void make_request(struct mddev *mddev, struct bio * bio)
1097{
1098 struct r10conf *conf = mddev->private;
1099 struct r10bio *r10_bio;
1100 struct bio *read_bio;
1101 int i;
1102 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1103 int chunk_sects = chunk_mask + 1;
1104 const int rw = bio_data_dir(bio);
1105 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1106 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
1107 const unsigned long do_discard = (bio->bi_rw
1108 & (REQ_DISCARD | REQ_SECURE));
1109 unsigned long flags;
1110 struct md_rdev *blocked_rdev;
1111 struct blk_plug_cb *cb;
1112 struct raid10_plug_cb *plug = NULL;
1113 int sectors_handled;
1114 int max_sectors;
1115 int sectors;
1116
1117 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1118 md_flush_request(mddev, bio);
1119 return;
1120 }
1121
1122
1123
1124
1125 if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9)
1126 > chunk_sects
1127 && (conf->geo.near_copies < conf->geo.raid_disks
1128 || conf->prev.near_copies < conf->prev.raid_disks))) {
1129 struct bio_pair *bp;
1130
1131 if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
1132 bio->bi_idx != 0)
1133 goto bad_map;
1134
1135
1136
1137 bp = bio_split(bio,
1138 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148 spin_lock_irq(&conf->resync_lock);
1149 conf->nr_waiting++;
1150 spin_unlock_irq(&conf->resync_lock);
1151
1152 make_request(mddev, &bp->bio1);
1153 make_request(mddev, &bp->bio2);
1154
1155 spin_lock_irq(&conf->resync_lock);
1156 conf->nr_waiting--;
1157 wake_up(&conf->wait_barrier);
1158 spin_unlock_irq(&conf->resync_lock);
1159
1160 bio_pair_release(bp);
1161 return;
1162 bad_map:
1163 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
1164 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
1165 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
1166
1167 bio_io_error(bio);
1168 return;
1169 }
1170
1171 md_write_start(mddev, bio);
1172
1173
1174
1175
1176
1177
1178 wait_barrier(conf);
1179
1180 sectors = bio->bi_size >> 9;
1181 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1182 bio->bi_sector < conf->reshape_progress &&
1183 bio->bi_sector + sectors > conf->reshape_progress) {
1184
1185
1186
1187 allow_barrier(conf);
1188 wait_event(conf->wait_barrier,
1189 conf->reshape_progress <= bio->bi_sector ||
1190 conf->reshape_progress >= bio->bi_sector + sectors);
1191 wait_barrier(conf);
1192 }
1193 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1194 bio_data_dir(bio) == WRITE &&
1195 (mddev->reshape_backwards
1196 ? (bio->bi_sector < conf->reshape_safe &&
1197 bio->bi_sector + sectors > conf->reshape_progress)
1198 : (bio->bi_sector + sectors > conf->reshape_safe &&
1199 bio->bi_sector < conf->reshape_progress))) {
1200
1201 mddev->reshape_position = conf->reshape_progress;
1202 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1203 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1204 md_wakeup_thread(mddev->thread);
1205 wait_event(mddev->sb_wait,
1206 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
1207
1208 conf->reshape_safe = mddev->reshape_position;
1209 }
1210
1211 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1212
1213 r10_bio->master_bio = bio;
1214 r10_bio->sectors = sectors;
1215
1216 r10_bio->mddev = mddev;
1217 r10_bio->sector = bio->bi_sector;
1218 r10_bio->state = 0;
1219
1220
1221
1222
1223
1224
1225
1226
1227 bio->bi_phys_segments = 0;
1228 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1229
1230 if (rw == READ) {
1231
1232
1233
1234 struct md_rdev *rdev;
1235 int slot;
1236
1237read_again:
1238 rdev = read_balance(conf, r10_bio, &max_sectors);
1239 if (!rdev) {
1240 raid_end_bio_io(r10_bio);
1241 return;
1242 }
1243 slot = r10_bio->read_slot;
1244
1245 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1246 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
1247 max_sectors);
1248
1249 r10_bio->devs[slot].bio = read_bio;
1250 r10_bio->devs[slot].rdev = rdev;
1251
1252 read_bio->bi_sector = r10_bio->devs[slot].addr +
1253 choose_data_offset(r10_bio, rdev);
1254 read_bio->bi_bdev = rdev->bdev;
1255 read_bio->bi_end_io = raid10_end_read_request;
1256 read_bio->bi_rw = READ | do_sync;
1257 read_bio->bi_private = r10_bio;
1258
1259 if (max_sectors < r10_bio->sectors) {
1260
1261
1262
1263 sectors_handled = (r10_bio->sectors + max_sectors
1264 - bio->bi_sector);
1265 r10_bio->sectors = max_sectors;
1266 spin_lock_irq(&conf->device_lock);
1267 if (bio->bi_phys_segments == 0)
1268 bio->bi_phys_segments = 2;
1269 else
1270 bio->bi_phys_segments++;
1271 spin_unlock(&conf->device_lock);
1272
1273
1274
1275
1276
1277 reschedule_retry(r10_bio);
1278
1279 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1280
1281 r10_bio->master_bio = bio;
1282 r10_bio->sectors = ((bio->bi_size >> 9)
1283 - sectors_handled);
1284 r10_bio->state = 0;
1285 r10_bio->mddev = mddev;
1286 r10_bio->sector = bio->bi_sector + sectors_handled;
1287 goto read_again;
1288 } else
1289 generic_make_request(read_bio);
1290 return;
1291 }
1292
1293
1294
1295
1296 if (conf->pending_count >= max_queued_requests) {
1297 md_wakeup_thread(mddev->thread);
1298 wait_event(conf->wait_barrier,
1299 conf->pending_count < max_queued_requests);
1300 }
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313 r10_bio->read_slot = -1;
1314 raid10_find_phys(conf, r10_bio);
1315retry_write:
1316 blocked_rdev = NULL;
1317 rcu_read_lock();
1318 max_sectors = r10_bio->sectors;
1319
1320 for (i = 0; i < conf->copies; i++) {
1321 int d = r10_bio->devs[i].devnum;
1322 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1323 struct md_rdev *rrdev = rcu_dereference(
1324 conf->mirrors[d].replacement);
1325 if (rdev == rrdev)
1326 rrdev = NULL;
1327 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1328 atomic_inc(&rdev->nr_pending);
1329 blocked_rdev = rdev;
1330 break;
1331 }
1332 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1333 atomic_inc(&rrdev->nr_pending);
1334 blocked_rdev = rrdev;
1335 break;
1336 }
1337 if (rdev && (test_bit(Faulty, &rdev->flags)
1338 || test_bit(Unmerged, &rdev->flags)))
1339 rdev = NULL;
1340 if (rrdev && (test_bit(Faulty, &rrdev->flags)
1341 || test_bit(Unmerged, &rrdev->flags)))
1342 rrdev = NULL;
1343
1344 r10_bio->devs[i].bio = NULL;
1345 r10_bio->devs[i].repl_bio = NULL;
1346
1347 if (!rdev && !rrdev) {
1348 set_bit(R10BIO_Degraded, &r10_bio->state);
1349 continue;
1350 }
1351 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1352 sector_t first_bad;
1353 sector_t dev_sector = r10_bio->devs[i].addr;
1354 int bad_sectors;
1355 int is_bad;
1356
1357 is_bad = is_badblock(rdev, dev_sector,
1358 max_sectors,
1359 &first_bad, &bad_sectors);
1360 if (is_bad < 0) {
1361
1362
1363
1364 atomic_inc(&rdev->nr_pending);
1365 set_bit(BlockedBadBlocks, &rdev->flags);
1366 blocked_rdev = rdev;
1367 break;
1368 }
1369 if (is_bad && first_bad <= dev_sector) {
1370
1371 bad_sectors -= (dev_sector - first_bad);
1372 if (bad_sectors < max_sectors)
1373
1374
1375
1376 max_sectors = bad_sectors;
1377
1378
1379
1380
1381
1382
1383
1384
1385 continue;
1386 }
1387 if (is_bad) {
1388 int good_sectors = first_bad - dev_sector;
1389 if (good_sectors < max_sectors)
1390 max_sectors = good_sectors;
1391 }
1392 }
1393 if (rdev) {
1394 r10_bio->devs[i].bio = bio;
1395 atomic_inc(&rdev->nr_pending);
1396 }
1397 if (rrdev) {
1398 r10_bio->devs[i].repl_bio = bio;
1399 atomic_inc(&rrdev->nr_pending);
1400 }
1401 }
1402 rcu_read_unlock();
1403
1404 if (unlikely(blocked_rdev)) {
1405
1406 int j;
1407 int d;
1408
1409 for (j = 0; j < i; j++) {
1410 if (r10_bio->devs[j].bio) {
1411 d = r10_bio->devs[j].devnum;
1412 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1413 }
1414 if (r10_bio->devs[j].repl_bio) {
1415 struct md_rdev *rdev;
1416 d = r10_bio->devs[j].devnum;
1417 rdev = conf->mirrors[d].replacement;
1418 if (!rdev) {
1419
1420 smp_mb();
1421 rdev = conf->mirrors[d].rdev;
1422 }
1423 rdev_dec_pending(rdev, mddev);
1424 }
1425 }
1426 allow_barrier(conf);
1427 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1428 wait_barrier(conf);
1429 goto retry_write;
1430 }
1431
1432 if (max_sectors < r10_bio->sectors) {
1433
1434
1435
1436 r10_bio->sectors = max_sectors;
1437 spin_lock_irq(&conf->device_lock);
1438 if (bio->bi_phys_segments == 0)
1439 bio->bi_phys_segments = 2;
1440 else
1441 bio->bi_phys_segments++;
1442 spin_unlock_irq(&conf->device_lock);
1443 }
1444 sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
1445
1446 atomic_set(&r10_bio->remaining, 1);
1447 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1448
1449 for (i = 0; i < conf->copies; i++) {
1450 struct bio *mbio;
1451 int d = r10_bio->devs[i].devnum;
1452 if (r10_bio->devs[i].bio) {
1453 struct md_rdev *rdev = conf->mirrors[d].rdev;
1454 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1455 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1456 max_sectors);
1457 r10_bio->devs[i].bio = mbio;
1458
1459 mbio->bi_sector = (r10_bio->devs[i].addr+
1460 choose_data_offset(r10_bio,
1461 rdev));
1462 mbio->bi_bdev = rdev->bdev;
1463 mbio->bi_end_io = raid10_end_write_request;
1464 mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
1465 mbio->bi_private = r10_bio;
1466
1467 atomic_inc(&r10_bio->remaining);
1468
1469 cb = blk_check_plugged(raid10_unplug, mddev,
1470 sizeof(*plug));
1471 if (cb)
1472 plug = container_of(cb, struct raid10_plug_cb,
1473 cb);
1474 else
1475 plug = NULL;
1476 spin_lock_irqsave(&conf->device_lock, flags);
1477 if (plug) {
1478 bio_list_add(&plug->pending, mbio);
1479 plug->pending_cnt++;
1480 } else {
1481 bio_list_add(&conf->pending_bio_list, mbio);
1482 conf->pending_count++;
1483 }
1484 spin_unlock_irqrestore(&conf->device_lock, flags);
1485 if (!plug)
1486 md_wakeup_thread(mddev->thread);
1487 }
1488
1489 if (r10_bio->devs[i].repl_bio) {
1490 struct md_rdev *rdev = conf->mirrors[d].replacement;
1491 if (rdev == NULL) {
1492
1493 smp_mb();
1494 rdev = conf->mirrors[d].rdev;
1495 }
1496 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1497 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1498 max_sectors);
1499 r10_bio->devs[i].repl_bio = mbio;
1500
1501 mbio->bi_sector = (r10_bio->devs[i].addr +
1502 choose_data_offset(
1503 r10_bio, rdev));
1504 mbio->bi_bdev = rdev->bdev;
1505 mbio->bi_end_io = raid10_end_write_request;
1506 mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
1507 mbio->bi_private = r10_bio;
1508
1509 atomic_inc(&r10_bio->remaining);
1510 spin_lock_irqsave(&conf->device_lock, flags);
1511 bio_list_add(&conf->pending_bio_list, mbio);
1512 conf->pending_count++;
1513 spin_unlock_irqrestore(&conf->device_lock, flags);
1514 if (!mddev_check_plugged(mddev))
1515 md_wakeup_thread(mddev->thread);
1516 }
1517 }
1518
1519
1520
1521
1522
1523 if (sectors_handled < (bio->bi_size >> 9)) {
1524 one_write_done(r10_bio);
1525
1526
1527
1528 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1529
1530 r10_bio->master_bio = bio;
1531 r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
1532
1533 r10_bio->mddev = mddev;
1534 r10_bio->sector = bio->bi_sector + sectors_handled;
1535 r10_bio->state = 0;
1536 goto retry_write;
1537 }
1538 one_write_done(r10_bio);
1539
1540
1541 wake_up(&conf->wait_barrier);
1542}
1543
1544static void status(struct seq_file *seq, struct mddev *mddev)
1545{
1546 struct r10conf *conf = mddev->private;
1547 int i;
1548
1549 if (conf->geo.near_copies < conf->geo.raid_disks)
1550 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1551 if (conf->geo.near_copies > 1)
1552 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1553 if (conf->geo.far_copies > 1) {
1554 if (conf->geo.far_offset)
1555 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1556 else
1557 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1558 }
1559 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1560 conf->geo.raid_disks - mddev->degraded);
1561 for (i = 0; i < conf->geo.raid_disks; i++)
1562 seq_printf(seq, "%s",
1563 conf->mirrors[i].rdev &&
1564 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
1565 seq_printf(seq, "]");
1566}
1567
1568
1569
1570
1571
1572
1573static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
1574{
1575 int first = 0;
1576
1577 do {
1578 int n = conf->copies;
1579 int cnt = 0;
1580 int this = first;
1581 while (n--) {
1582 if (conf->mirrors[this].rdev &&
1583 this != ignore)
1584 cnt++;
1585 this = (this+1) % geo->raid_disks;
1586 }
1587 if (cnt == 0)
1588 return 0;
1589 first = (first + geo->near_copies) % geo->raid_disks;
1590 } while (first != 0);
1591 return 1;
1592}
1593
1594static int enough(struct r10conf *conf, int ignore)
1595{
1596 return _enough(conf, &conf->geo, ignore) &&
1597 _enough(conf, &conf->prev, ignore);
1598}
1599
1600static void error(struct mddev *mddev, struct md_rdev *rdev)
1601{
1602 char b[BDEVNAME_SIZE];
1603 struct r10conf *conf = mddev->private;
1604
1605
1606
1607
1608
1609
1610
1611 if (test_bit(In_sync, &rdev->flags)
1612 && !enough(conf, rdev->raid_disk))
1613
1614
1615
1616 return;
1617 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1618 unsigned long flags;
1619 spin_lock_irqsave(&conf->device_lock, flags);
1620 mddev->degraded++;
1621 spin_unlock_irqrestore(&conf->device_lock, flags);
1622
1623
1624
1625 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1626 }
1627 set_bit(Blocked, &rdev->flags);
1628 set_bit(Faulty, &rdev->flags);
1629 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1630 printk(KERN_ALERT
1631 "md/raid10:%s: Disk failure on %s, disabling device.\n"
1632 "md/raid10:%s: Operation continuing on %d devices.\n",
1633 mdname(mddev), bdevname(rdev->bdev, b),
1634 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1635}
1636
1637static void print_conf(struct r10conf *conf)
1638{
1639 int i;
1640 struct raid10_info *tmp;
1641
1642 printk(KERN_DEBUG "RAID10 conf printout:\n");
1643 if (!conf) {
1644 printk(KERN_DEBUG "(!conf)\n");
1645 return;
1646 }
1647 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1648 conf->geo.raid_disks);
1649
1650 for (i = 0; i < conf->geo.raid_disks; i++) {
1651 char b[BDEVNAME_SIZE];
1652 tmp = conf->mirrors + i;
1653 if (tmp->rdev)
1654 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1655 i, !test_bit(In_sync, &tmp->rdev->flags),
1656 !test_bit(Faulty, &tmp->rdev->flags),
1657 bdevname(tmp->rdev->bdev,b));
1658 }
1659}
1660
/*
 * Tear down the resync buffer pool.
 *
 * The wait_barrier()/allow_barrier() pair is called before the pool is
 * destroyed; presumably this waits until any raised resync barrier has
 * been dropped - confirm against the barrier helpers.  conf->r10buf_pool
 * is reset to NULL afterwards (init_resync() has a BUG_ON on a non-NULL
 * pool).
 */
static void close_sync(struct r10conf *conf)
{
	wait_barrier(conf);
	allow_barrier(conf);

	mempool_destroy(conf->r10buf_pool);
	conf->r10buf_pool = NULL;
}
1669
/*
 * Walk every slot of the current geometry and mark as In_sync any device
 * that is not yet In_sync and not Faulty.
 *
 * Returns the number of slots that went from "no In_sync device" to
 * "In_sync device"; mddev->degraded is reduced by the same amount.
 */
static int raid10_spare_active(struct mddev *mddev)
{
	int i;
	struct r10conf *conf = mddev->private;
	struct raid10_info *tmp;
	int count = 0;
	unsigned long flags;

	for (i = 0; i < conf->geo.raid_disks; i++) {
		tmp = conf->mirrors + i;
		/*
		 * NOTE: the test_and_set_bit() at the end of each condition
		 * has a side effect; it is only reached when all the
		 * preceding tests passed.
		 */
		if (tmp->replacement
		    && tmp->replacement->recovery_offset == MaxSector
		    && !test_bit(Faulty, &tmp->replacement->flags)
		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
			/*
			 * A fully recovered replacement has just become
			 * In_sync.  It only reduces the degraded count if
			 * the original device was missing or not In_sync.
			 */
			if (!tmp->rdev
			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
				count++;
			if (tmp->rdev) {
				/*
				 * The replaced device is flagged Faulty even
				 * though no error was seen on it here;
				 * presumably so that it gets removed -
				 * confirm with the md core.
				 */
				set_bit(Faulty, &tmp->rdev->flags);
				sysfs_notify_dirent_safe(
					tmp->rdev->sysfs_state);
			}
			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
		} else if (tmp->rdev
			   && !test_bit(Faulty, &tmp->rdev->flags)
			   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
			count++;
			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
		}
	}
	/* degraded is updated under device_lock, as in error() */
	spin_lock_irqsave(&conf->device_lock, flags);
	mddev->degraded -= count;
	spin_unlock_irqrestore(&conf->device_lock, flags);

	print_conf(conf);
	return count;
}
1716
1717
1718static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1719{
1720 struct r10conf *conf = mddev->private;
1721 int err = -EEXIST;
1722 int mirror;
1723 int first = 0;
1724 int last = conf->geo.raid_disks - 1;
1725 struct request_queue *q = bdev_get_queue(rdev->bdev);
1726
1727 if (mddev->recovery_cp < MaxSector)
1728
1729
1730
1731 return -EBUSY;
1732 if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1))
1733 return -EINVAL;
1734
1735 if (rdev->raid_disk >= 0)
1736 first = last = rdev->raid_disk;
1737
1738 if (q->merge_bvec_fn) {
1739 set_bit(Unmerged, &rdev->flags);
1740 mddev->merge_check_needed = 1;
1741 }
1742
1743 if (rdev->saved_raid_disk >= first &&
1744 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1745 mirror = rdev->saved_raid_disk;
1746 else
1747 mirror = first;
1748 for ( ; mirror <= last ; mirror++) {
1749 struct raid10_info *p = &conf->mirrors[mirror];
1750 if (p->recovery_disabled == mddev->recovery_disabled)
1751 continue;
1752 if (p->rdev) {
1753 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1754 p->replacement != NULL)
1755 continue;
1756 clear_bit(In_sync, &rdev->flags);
1757 set_bit(Replacement, &rdev->flags);
1758 rdev->raid_disk = mirror;
1759 err = 0;
1760 disk_stack_limits(mddev->gendisk, rdev->bdev,
1761 rdev->data_offset << 9);
1762 conf->fullsync = 1;
1763 rcu_assign_pointer(p->replacement, rdev);
1764 break;
1765 }
1766
1767 disk_stack_limits(mddev->gendisk, rdev->bdev,
1768 rdev->data_offset << 9);
1769
1770 p->head_position = 0;
1771 p->recovery_disabled = mddev->recovery_disabled - 1;
1772 rdev->raid_disk = mirror;
1773 err = 0;
1774 if (rdev->saved_raid_disk != mirror)
1775 conf->fullsync = 1;
1776 rcu_assign_pointer(p->rdev, rdev);
1777 break;
1778 }
1779 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1780
1781
1782
1783
1784
1785
1786
1787 synchronize_sched();
1788 raise_barrier(conf, 0);
1789 lower_barrier(conf);
1790 clear_bit(Unmerged, &rdev->flags);
1791 }
1792 md_integrity_add_rdev(rdev, mddev);
1793 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1794 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1795
1796 print_conf(conf);
1797 return err;
1798}
1799
/*
 * Detach @rdev from the slot given by rdev->raid_disk.
 *
 * @rdev may be either the slot's primary device or its replacement; if it
 * is neither, nothing is done and 0 is returned.
 *
 * Returns 0 on success (or the result of md_integrity_register()), and
 * -EBUSY when the device is In_sync, still has pending I/O, or is a
 * non-faulty device that the checks below decide to keep.
 */
static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r10conf *conf = mddev->private;
	int err = 0;
	int number = rdev->raid_disk;
	struct md_rdev **rdevp;
	struct raid10_info *p = conf->mirrors + number;

	print_conf(conf);
	/* Work out which of the slot's two pointers refers to @rdev. */
	if (rdev == p->rdev)
		rdevp = &p->rdev;
	else if (rdev == p->replacement)
		rdevp = &p->replacement;
	else
		return 0;

	if (test_bit(In_sync, &rdev->flags) ||
	    atomic_read(&rdev->nr_pending)) {
		err = -EBUSY;
		goto abort;
	}
	/*
	 * A device that is not Faulty is kept (-EBUSY) when recovery is not
	 * disabled for this slot, no other replacement is present, the slot
	 * belongs to the current geometry and enough() says the array is
	 * still complete.
	 */
	if (!test_bit(Faulty, &rdev->flags) &&
	    mddev->recovery_disabled != p->recovery_disabled &&
	    (!p->replacement || p->replacement == rdev) &&
	    number < conf->geo.raid_disks &&
	    enough(conf, -1)) {
		err = -EBUSY;
		goto abort;
	}
	/* Unpublish the pointer, then wait for RCU readers to finish. */
	*rdevp = NULL;
	synchronize_rcu();
	if (atomic_read(&rdev->nr_pending)) {
		/* Someone took a reference in the meantime: put it back. */
		err = -EBUSY;
		*rdevp = rdev;
		goto abort;
	} else if (p->replacement) {
		/*
		 * The primary was removed while a replacement exists:
		 * promote the replacement into the primary position.
		 */
		p->rdev = p->replacement;
		clear_bit(Replacement, &p->replacement->flags);
		/*
		 * Barrier between setting p->rdev and clearing
		 * p->replacement; it pairs with the smp_mb() in paths that
		 * fall back to ->rdev when ->replacement reads as NULL.
		 */
		smp_mb();
		p->replacement = NULL;
		clear_bit(WantReplacement, &rdev->flags);
	} else
		/* No replacement left in this slot: drop any stale request. */
		clear_bit(WantReplacement, &rdev->flags);

	err = md_integrity_register(mddev);

abort:

	print_conf(conf);
	return err;
}
1861
1862
1863static void end_sync_read(struct bio *bio, int error)
1864{
1865 struct r10bio *r10_bio = bio->bi_private;
1866 struct r10conf *conf = r10_bio->mddev->private;
1867 int d;
1868
1869 if (bio == r10_bio->master_bio) {
1870
1871 d = r10_bio->read_slot;
1872 } else
1873 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1874
1875 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1876 set_bit(R10BIO_Uptodate, &r10_bio->state);
1877 else
1878
1879
1880
1881 atomic_add(r10_bio->sectors,
1882 &conf->mirrors[d].rdev->corrected_errors);
1883
1884
1885
1886
1887 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1888 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1889 atomic_dec_and_test(&r10_bio->remaining)) {
1890
1891
1892
1893 reschedule_retry(r10_bio);
1894 }
1895}
1896
1897static void end_sync_request(struct r10bio *r10_bio)
1898{
1899 struct mddev *mddev = r10_bio->mddev;
1900
1901 while (atomic_dec_and_test(&r10_bio->remaining)) {
1902 if (r10_bio->master_bio == NULL) {
1903
1904 sector_t s = r10_bio->sectors;
1905 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1906 test_bit(R10BIO_WriteError, &r10_bio->state))
1907 reschedule_retry(r10_bio);
1908 else
1909 put_buf(r10_bio);
1910 md_done_sync(mddev, s, 1);
1911 break;
1912 } else {
1913 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1914 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1915 test_bit(R10BIO_WriteError, &r10_bio->state))
1916 reschedule_retry(r10_bio);
1917 else
1918 put_buf(r10_bio);
1919 r10_bio = r10_bio2;
1920 }
1921 }
1922}
1923
1924static void end_sync_write(struct bio *bio, int error)
1925{
1926 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1927 struct r10bio *r10_bio = bio->bi_private;
1928 struct mddev *mddev = r10_bio->mddev;
1929 struct r10conf *conf = mddev->private;
1930 int d;
1931 sector_t first_bad;
1932 int bad_sectors;
1933 int slot;
1934 int repl;
1935 struct md_rdev *rdev = NULL;
1936
1937 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1938 if (repl)
1939 rdev = conf->mirrors[d].replacement;
1940 else
1941 rdev = conf->mirrors[d].rdev;
1942
1943 if (!uptodate) {
1944 if (repl)
1945 md_error(mddev, rdev);
1946 else {
1947 set_bit(WriteErrorSeen, &rdev->flags);
1948 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1949 set_bit(MD_RECOVERY_NEEDED,
1950 &rdev->mddev->recovery);
1951 set_bit(R10BIO_WriteError, &r10_bio->state);
1952 }
1953 } else if (is_badblock(rdev,
1954 r10_bio->devs[slot].addr,
1955 r10_bio->sectors,
1956 &first_bad, &bad_sectors))
1957 set_bit(R10BIO_MadeGood, &r10_bio->state);
1958
1959 rdev_dec_pending(rdev, mddev);
1960
1961 end_sync_request(r10_bio);
1962}
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
/*
 * Second half of a resync: the reads for @r10_bio have completed, now
 * compare the copies and rewrite those that differ or could not be read.
 *
 * The first copy whose read bio is BIO_UPTODATE becomes the reference
 * ("fbio").  Every other copy that was read is compared page by page with
 * it; mismatching or unreadable copies are overwritten with the reference
 * data, except that in MD_RECOVERY_CHECK mode mismatches are only counted.
 * Active replacement bios are then submitted as well.
 *
 * r10_bio->remaining starts at 1 and is bumped for every bio submitted
 * here; the final decrement at "done" completes the request if no write is
 * outstanding.
 */
static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
	struct r10conf *conf = mddev->private;
	int i, first;
	struct bio *tbio, *fbio;
	int vcnt;

	atomic_set(&r10_bio->remaining, 1);

	/* find the first copy that was read successfully */
	for (i=0; i<conf->copies; i++)
		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
			break;

	/* nothing readable at all: nothing to compare or write */
	if (i == conf->copies)
		goto done;

	first = i;
	fbio = r10_bio->devs[i].bio;

	/* number of pages needed to cover r10_bio->sectors, rounded up */
	vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);

	/* now look at every other copy that was read */
	for (i=0 ; i < conf->copies ; i++) {
		int j, d;

		tbio = r10_bio->devs[i].bio;

		/* only bios that were issued as sync reads are considered */
		if (tbio->bi_end_io != end_sync_read)
			continue;
		if (i == first)
			continue;
		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
			/*
			 * This copy was read fine: compare it with the
			 * reference, vec entry by vec entry.  This relies on
			 * both bios having the same bi_io_vec layout -
			 * presumably guaranteed by the buffer allocator.
			 */
			for (j = 0; j < vcnt; j++)
				if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
					   page_address(tbio->bi_io_vec[j].bv_page),
					   fbio->bi_io_vec[j].bv_len))
					break;
			if (j == vcnt)
				continue;
			atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
			if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
				/* check-only mode: count it, don't fix it */
				continue;
		}
		/*
		 * This copy has to be rewritten, either because it differs
		 * from the reference or because it could not be read.
		 * Reinitialise the bio fields before reusing the read bio as
		 * a write, then copy the reference data into its pages.
		 */
		tbio->bi_vcnt = vcnt;
		tbio->bi_size = r10_bio->sectors << 9;
		tbio->bi_idx = 0;
		tbio->bi_phys_segments = 0;
		tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
		tbio->bi_flags |= 1 << BIO_UPTODATE;
		tbio->bi_next = NULL;
		tbio->bi_rw = WRITE;
		tbio->bi_private = r10_bio;
		tbio->bi_sector = r10_bio->devs[i].addr;

		for (j=0; j < vcnt ; j++) {
			tbio->bi_io_vec[j].bv_offset = 0;
			tbio->bi_io_vec[j].bv_len = PAGE_SIZE;

			memcpy(page_address(tbio->bi_io_vec[j].bv_page),
			       page_address(fbio->bi_io_vec[j].bv_page),
			       PAGE_SIZE);
		}
		tbio->bi_end_io = end_sync_write;

		d = r10_bio->devs[i].devnum;
		/* references dropped again in end_sync_write() */
		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
		atomic_inc(&r10_bio->remaining);
		md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);

		tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
		tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
		generic_make_request(tbio);
	}

	/* now write to any replacement devices that have an active bio */
	for (i = 0; i < conf->copies; i++) {
		int j, d;

		tbio = r10_bio->devs[i].repl_bio;
		if (!tbio || !tbio->bi_end_io)
			continue;
		/*
		 * Copy the reference data into the replacement bio's pages
		 * unless the primary bio of this slot was just rewritten
		 * above or is the reference itself.
		 */
		if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
		    && r10_bio->devs[i].bio != fbio)
			for (j = 0; j < vcnt; j++)
				memcpy(page_address(tbio->bi_io_vec[j].bv_page),
				       page_address(fbio->bi_io_vec[j].bv_page),
				       PAGE_SIZE);
		d = r10_bio->devs[i].devnum;
		atomic_inc(&r10_bio->remaining);
		md_sync_acct(conf->mirrors[d].replacement->bdev,
			     tbio->bi_size >> 9);
		generic_make_request(tbio);
	}

done:
	/* drop the initial reference taken by atomic_set() above */
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		md_done_sync(mddev, r10_bio->sectors, 1);
		put_buf(r10_bio);
	}
}
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103static void fix_recovery_read_error(struct r10bio *r10_bio)
2104{
2105
2106
2107
2108
2109
2110
2111
2112 struct mddev *mddev = r10_bio->mddev;
2113 struct r10conf *conf = mddev->private;
2114 struct bio *bio = r10_bio->devs[0].bio;
2115 sector_t sect = 0;
2116 int sectors = r10_bio->sectors;
2117 int idx = 0;
2118 int dr = r10_bio->devs[0].devnum;
2119 int dw = r10_bio->devs[1].devnum;
2120
2121 while (sectors) {
2122 int s = sectors;
2123 struct md_rdev *rdev;
2124 sector_t addr;
2125 int ok;
2126
2127 if (s > (PAGE_SIZE>>9))
2128 s = PAGE_SIZE >> 9;
2129
2130 rdev = conf->mirrors[dr].rdev;
2131 addr = r10_bio->devs[0].addr + sect,
2132 ok = sync_page_io(rdev,
2133 addr,
2134 s << 9,
2135 bio->bi_io_vec[idx].bv_page,
2136 READ, false);
2137 if (ok) {
2138 rdev = conf->mirrors[dw].rdev;
2139 addr = r10_bio->devs[1].addr + sect;
2140 ok = sync_page_io(rdev,
2141 addr,
2142 s << 9,
2143 bio->bi_io_vec[idx].bv_page,
2144 WRITE, false);
2145 if (!ok) {
2146 set_bit(WriteErrorSeen, &rdev->flags);
2147 if (!test_and_set_bit(WantReplacement,
2148 &rdev->flags))
2149 set_bit(MD_RECOVERY_NEEDED,
2150 &rdev->mddev->recovery);
2151 }
2152 }
2153 if (!ok) {
2154
2155
2156
2157
2158 rdev_set_badblocks(rdev, addr, s, 0);
2159
2160 if (rdev != conf->mirrors[dw].rdev) {
2161
2162 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2163 addr = r10_bio->devs[1].addr + sect;
2164 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2165 if (!ok) {
2166
2167 printk(KERN_NOTICE
2168 "md/raid10:%s: recovery aborted"
2169 " due to read error\n",
2170 mdname(mddev));
2171
2172 conf->mirrors[dw].recovery_disabled
2173 = mddev->recovery_disabled;
2174 set_bit(MD_RECOVERY_INTR,
2175 &mddev->recovery);
2176 break;
2177 }
2178 }
2179 }
2180
2181 sectors -= s;
2182 sect += s;
2183 idx++;
2184 }
2185}
2186
2187static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2188{
2189 struct r10conf *conf = mddev->private;
2190 int d;
2191 struct bio *wbio, *wbio2;
2192
2193 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2194 fix_recovery_read_error(r10_bio);
2195 end_sync_request(r10_bio);
2196 return;
2197 }
2198
2199
2200
2201
2202
2203 d = r10_bio->devs[1].devnum;
2204 wbio = r10_bio->devs[1].bio;
2205 wbio2 = r10_bio->devs[1].repl_bio;
2206 if (wbio->bi_end_io) {
2207 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2208 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
2209 generic_make_request(wbio);
2210 }
2211 if (wbio2 && wbio2->bi_end_io) {
2212 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2213 md_sync_acct(conf->mirrors[d].replacement->bdev,
2214 wbio2->bi_size >> 9);
2215 generic_make_request(wbio2);
2216 }
2217}
2218
2219
2220
2221
2222
2223
2224
2225
2226static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2227{
2228 struct timespec cur_time_mon;
2229 unsigned long hours_since_last;
2230 unsigned int read_errors = atomic_read(&rdev->read_errors);
2231
2232 ktime_get_ts(&cur_time_mon);
2233
2234 if (rdev->last_read_error.tv_sec == 0 &&
2235 rdev->last_read_error.tv_nsec == 0) {
2236
2237 rdev->last_read_error = cur_time_mon;
2238 return;
2239 }
2240
2241 hours_since_last = (cur_time_mon.tv_sec -
2242 rdev->last_read_error.tv_sec) / 3600;
2243
2244 rdev->last_read_error = cur_time_mon;
2245
2246
2247
2248
2249
2250
2251 if (hours_since_last >= 8 * sizeof(read_errors))
2252 atomic_set(&rdev->read_errors, 0);
2253 else
2254 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2255}
2256
2257static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2258 int sectors, struct page *page, int rw)
2259{
2260 sector_t first_bad;
2261 int bad_sectors;
2262
2263 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2264 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2265 return -1;
2266 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
2267
2268 return 1;
2269 if (rw == WRITE) {
2270 set_bit(WriteErrorSeen, &rdev->flags);
2271 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2272 set_bit(MD_RECOVERY_NEEDED,
2273 &rdev->mddev->recovery);
2274 }
2275
2276 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2277 md_error(rdev->mddev, rdev);
2278 return 0;
2279}
2280
2281
2282
2283
2284
2285
2286
2287
2288
/*
 * Try to repair a read error on the device that served @r10_bio.
 *
 * The failed range is processed in chunks of at most one page.  For each
 * chunk:
 *  1. find a copy that can be read (starting at the failed slot and
 *     cycling through all copies) and read it into conf->tmppage;
 *  2. write that data back to the copies between the one that was readable
 *     and the failed slot, walking backwards;
 *  3. read the same copies back to check that the rewrite worked.
 * If no copy is readable, the chunk is recorded as a bad block on the
 * failing device and the repair stops.
 *
 * Before any of that, the device's read error count is decayed and
 * incremented; exceeding mddev->max_corr_read_errors fails the device.
 */
static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
{
	int sect = 0;
	int sectors = r10_bio->sectors;
	struct md_rdev*rdev;
	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
	int d = r10_bio->devs[r10_bio->read_slot].devnum;

	/*
	 * Read without rcu_dereference(); presumably safe because the
	 * caller still holds a reference on this rdev - confirm with
	 * handle_read_error().
	 */
	rdev = conf->mirrors[d].rdev;

	if (test_bit(Faulty, &rdev->flags))
		/* the device has already been failed: nothing to repair */
		return;

	check_decay_read_errors(mddev, rdev);
	atomic_inc(&rdev->read_errors);
	if (atomic_read(&rdev->read_errors) > max_read_errors) {
		char b[BDEVNAME_SIZE];
		bdevname(rdev->bdev, b);

		printk(KERN_NOTICE
		       "md/raid10:%s: %s: Raid device exceeded "
		       "read_error threshold [cur %d:max %d]\n",
		       mdname(mddev), b,
		       atomic_read(&rdev->read_errors), max_read_errors);
		printk(KERN_NOTICE
		       "md/raid10:%s: %s: Failing raid device\n",
		       mdname(mddev), b);
		md_error(mddev, conf->mirrors[d].rdev);
		/* keep later read attempts away from this slot */
		r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
		return;
	}

	while(sectors) {
		int s = sectors;
		int sl = r10_bio->read_slot;
		int success = 0;
		int start;

		/* one page at a time */
		if (s > (PAGE_SIZE>>9))
			s = PAGE_SIZE >> 9;

		/* Step 1: find a copy of this chunk that can be read. */
		rcu_read_lock();
		do {
			sector_t first_bad;
			int bad_sectors;

			d = r10_bio->devs[sl].devnum;
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (rdev &&
			    !test_bit(Unmerged, &rdev->flags) &&
			    test_bit(In_sync, &rdev->flags) &&
			    is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
					&first_bad, &bad_sectors) == 0) {
				/*
				 * Pin the device with nr_pending so the RCU
				 * read lock can be dropped around the
				 * synchronous I/O.
				 */
				atomic_inc(&rdev->nr_pending);
				rcu_read_unlock();
				success = sync_page_io(rdev,
						       r10_bio->devs[sl].addr +
						       sect,
						       s<<9,
						       conf->tmppage, READ, false);
				rdev_dec_pending(rdev, mddev);
				rcu_read_lock();
				if (success)
					break;
			}
			sl++;
			if (sl == conf->copies)
				sl = 0;
		} while (!success && sl != r10_bio->read_slot);
		rcu_read_unlock();

		if (!success) {
			/*
			 * No copy could be read.  Record the chunk as bad on
			 * the device that originally failed; if even that is
			 * not possible, fail the device and block the slot.
			 */
			int dn = r10_bio->devs[r10_bio->read_slot].devnum;
			rdev = conf->mirrors[dn].rdev;

			if (!rdev_set_badblocks(
				    rdev,
				    r10_bio->devs[r10_bio->read_slot].addr
				    + sect,
				    s, 0)) {
				md_error(mddev, rdev);
				r10_bio->devs[r10_bio->read_slot].bio
					= IO_BLOCKED;
			}
			break;
		}

		/* remember which slot supplied the good data */
		start = sl;

		/*
		 * Step 2: write the good data back, walking backwards from
		 * the slot that was readable down to the failed slot.
		 */
		rcu_read_lock();
		while (sl != r10_bio->read_slot) {
			char b[BDEVNAME_SIZE];

			if (sl==0)
				sl = conf->copies;
			sl--;
			d = r10_bio->devs[sl].devnum;
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (!rdev ||
			    test_bit(Unmerged, &rdev->flags) ||
			    !test_bit(In_sync, &rdev->flags))
				continue;

			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			if (r10_sync_page_io(rdev,
					     r10_bio->devs[sl].addr +
					     sect,
					     s, conf->tmppage, WRITE)
			    == 0) {
				/*
				 * The write failed; r10_sync_page_io() has
				 * already recorded the bad block or failed
				 * the device.  Only log it here.
				 */
				printk(KERN_NOTICE
				       "md/raid10:%s: read correction "
				       "write failed"
				       " (%d sectors at %llu on %s)\n",
				       mdname(mddev), s,
				       (unsigned long long)(
					       sect +
					       choose_data_offset(r10_bio,
								  rdev)),
				       bdevname(rdev->bdev, b));
				printk(KERN_NOTICE "md/raid10:%s: %s: failing "
				       "drive\n",
				       mdname(mddev),
				       bdevname(rdev->bdev, b));
			}
			rdev_dec_pending(rdev, mddev);
			rcu_read_lock();
		}
		/* Step 3: read the rewritten copies back to verify them. */
		sl = start;
		while (sl != r10_bio->read_slot) {
			char b[BDEVNAME_SIZE];

			if (sl==0)
				sl = conf->copies;
			sl--;
			d = r10_bio->devs[sl].devnum;
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (!rdev ||
			    !test_bit(In_sync, &rdev->flags))
				continue;

			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			/* a return of -1 (known bad block) is silently skipped */
			switch (r10_sync_page_io(rdev,
						 r10_bio->devs[sl].addr +
						 sect,
						 s, conf->tmppage,
						 READ)) {
			case 0:
				/* the read-back failed */
				printk(KERN_NOTICE
				       "md/raid10:%s: unable to read back "
				       "corrected sectors"
				       " (%d sectors at %llu on %s)\n",
				       mdname(mddev), s,
				       (unsigned long long)(
					       sect +
					       choose_data_offset(r10_bio, rdev)),
				       bdevname(rdev->bdev, b));
				printk(KERN_NOTICE "md/raid10:%s: %s: failing "
				       "drive\n",
				       mdname(mddev),
				       bdevname(rdev->bdev, b));
				break;
			case 1:
				/* the read-back succeeded: error corrected */
				printk(KERN_INFO
				       "md/raid10:%s: read error corrected"
				       " (%d sectors at %llu on %s)\n",
				       mdname(mddev), s,
				       (unsigned long long)(
					       sect +
					       choose_data_offset(r10_bio, rdev)),
				       bdevname(rdev->bdev, b));
				atomic_add(s, &rdev->corrected_errors);
			}

			rdev_dec_pending(rdev, mddev);
			rcu_read_lock();
		}
		rcu_read_unlock();

		sectors -= s;
		sect += s;
	}
}
2484
2485static void bi_complete(struct bio *bio, int error)
2486{
2487 complete((struct completion *)bio->bi_private);
2488}
2489
2490static int submit_bio_wait(int rw, struct bio *bio)
2491{
2492 struct completion event;
2493 rw |= REQ_SYNC;
2494
2495 init_completion(&event);
2496 bio->bi_private = &event;
2497 bio->bi_end_io = bi_complete;
2498 submit_bio(rw, bio);
2499 wait_for_completion(&event);
2500
2501 return test_bit(BIO_UPTODATE, &bio->bi_flags);
2502}
2503
2504static int narrow_write_error(struct r10bio *r10_bio, int i)
2505{
2506 struct bio *bio = r10_bio->master_bio;
2507 struct mddev *mddev = r10_bio->mddev;
2508 struct r10conf *conf = mddev->private;
2509 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521 int block_sectors;
2522 sector_t sector;
2523 int sectors;
2524 int sect_to_write = r10_bio->sectors;
2525 int ok = 1;
2526
2527 if (rdev->badblocks.shift < 0)
2528 return 0;
2529
2530 block_sectors = 1 << rdev->badblocks.shift;
2531 sector = r10_bio->sector;
2532 sectors = ((r10_bio->sector + block_sectors)
2533 & ~(sector_t)(block_sectors - 1))
2534 - sector;
2535
2536 while (sect_to_write) {
2537 struct bio *wbio;
2538 if (sectors > sect_to_write)
2539 sectors = sect_to_write;
2540
2541 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2542 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2543 wbio->bi_sector = (r10_bio->devs[i].addr+
2544 choose_data_offset(r10_bio, rdev) +
2545 (sector - r10_bio->sector));
2546 wbio->bi_bdev = rdev->bdev;
2547 if (submit_bio_wait(WRITE, wbio) == 0)
2548
2549 ok = rdev_set_badblocks(rdev, sector,
2550 sectors, 0)
2551 && ok;
2552
2553 bio_put(wbio);
2554 sect_to_write -= sectors;
2555 sector += sectors;
2556 sectors = block_sectors;
2557 }
2558 return ok;
2559}
2560
/*
 * Handle a failed read for @r10_bio (called from raid10d).
 *
 * If the array is writable, all other I/O is frozen while
 * fix_read_error() tries to repair the failing range from another copy.
 * On a read-only array the failed slot is simply marked IO_BLOCKED.
 * The read is then redirected: read_balance() picks another device and a
 * fresh clone of the master bio is submitted to it.  If the chosen device
 * can only serve part of the range, the request is split and the loop
 * repeats for the remainder.  If no device is left, the master bio is
 * completed through raid_end_bio_io().
 */
static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
{
	int slot = r10_bio->read_slot;
	struct bio *bio;
	struct r10conf *conf = mddev->private;
	struct md_rdev *rdev = r10_bio->devs[slot].rdev;
	char b[BDEVNAME_SIZE];
	unsigned long do_sync;
	int max_sectors;

	/*
	 * Capture the device name for later messages, then release the
	 * failed bio.
	 */
	bio = r10_bio->devs[slot].bio;
	bdevname(bio->bi_bdev, b);
	bio_put(bio);
	r10_bio->devs[slot].bio = NULL;

	if (mddev->ro == 0) {
		freeze_array(conf);
		fix_read_error(conf, mddev, r10_bio);
		unfreeze_array(conf);
	} else
		/* read-only array: no repair, just avoid this slot */
		r10_bio->devs[slot].bio = IO_BLOCKED;

	/* drop the reference that the failed read held on the device */
	rdev_dec_pending(rdev, mddev);

read_more:
	rdev = read_balance(conf, r10_bio, &max_sectors);
	if (rdev == NULL) {
		printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
		       " read error for block %llu\n",
		       mdname(mddev), b,
		       (unsigned long long)r10_bio->sector);
		raid_end_bio_io(r10_bio);
		return;
	}

	do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
	/* read_balance() has updated read_slot to the chosen copy */
	slot = r10_bio->read_slot;
	printk_ratelimited(
		KERN_ERR
		"md/raid10:%s: %s: redirecting "
		"sector %llu to another mirror\n",
		mdname(mddev),
		bdevname(rdev->bdev, b),
		(unsigned long long)r10_bio->sector);
	bio = bio_clone_mddev(r10_bio->master_bio,
			      GFP_NOIO, mddev);
	md_trim_bio(bio,
		    r10_bio->sector - bio->bi_sector,
		    max_sectors);
	r10_bio->devs[slot].bio = bio;
	r10_bio->devs[slot].rdev = rdev;
	bio->bi_sector = r10_bio->devs[slot].addr
		+ choose_data_offset(r10_bio, rdev);
	bio->bi_bdev = rdev->bdev;
	bio->bi_rw = READ | do_sync;
	bio->bi_private = r10_bio;
	bio->bi_end_io = raid10_end_read_request;
	if (max_sectors < r10_bio->sectors) {
		/*
		 * The chosen device can only serve the first max_sectors:
		 * split the request.  bi_phys_segments of the master bio is
		 * updated under device_lock; it appears to be reused as a
		 * count of outstanding pieces - confirm with
		 * raid_end_bio_io().
		 */
		struct bio *mbio = r10_bio->master_bio;
		int sectors_handled =
			r10_bio->sector + max_sectors
			- mbio->bi_sector;
		r10_bio->sectors = max_sectors;
		spin_lock_irq(&conf->device_lock);
		if (mbio->bi_phys_segments == 0)
			mbio->bi_phys_segments = 2;
		else
			mbio->bi_phys_segments++;
		spin_unlock_irq(&conf->device_lock);
		generic_make_request(bio);

		/* set up a new r10bio for the part not yet handled */
		r10_bio = mempool_alloc(conf->r10bio_pool,
					GFP_NOIO);
		r10_bio->master_bio = mbio;
		r10_bio->sectors = (mbio->bi_size >> 9)
			- sectors_handled;
		r10_bio->state = 0;
		set_bit(R10BIO_ReadError,
			&r10_bio->state);
		r10_bio->mddev = mddev;
		r10_bio->sector = mbio->bi_sector
			+ sectors_handled;

		goto read_more;
	} else
		generic_make_request(bio);
}
2657
/*
 * Finish an r10bio that raid10d picked up with R10BIO_MadeGood or
 * R10BIO_WriteError set.
 *
 * For resync/recovery requests every copy (and replacement copy) is
 * examined: successful writes clear any bad block over the range, failed
 * writes record one (failing the device if that is not possible); then the
 * buffers are released.
 *
 * For normal writes, slots marked IO_MADE_GOOD get their bad blocks
 * cleared, failed primary writes go through narrow_write_error(), and the
 * master bio is completed.
 */
static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
{
	int m;
	struct md_rdev *rdev;

	if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
	    test_bit(R10BIO_IsRecover, &r10_bio->state)) {
		for (m = 0; m < conf->copies; m++) {
			int dev = r10_bio->devs[m].devnum;
			/* primary device of this slot */
			rdev = conf->mirrors[dev].rdev;
			if (r10_bio->devs[m].bio == NULL)
				continue;
			if (test_bit(BIO_UPTODATE,
				     &r10_bio->devs[m].bio->bi_flags)) {
				rdev_clear_badblocks(
					rdev,
					r10_bio->devs[m].addr,
					r10_bio->sectors, 0);
			} else {
				if (!rdev_set_badblocks(
					    rdev,
					    r10_bio->devs[m].addr,
					    r10_bio->sectors, 0))
					md_error(conf->mddev, rdev);
			}
			/* same again for the replacement device, if any */
			rdev = conf->mirrors[dev].replacement;
			if (r10_bio->devs[m].repl_bio == NULL)
				continue;
			if (test_bit(BIO_UPTODATE,
				     &r10_bio->devs[m].repl_bio->bi_flags)) {
				rdev_clear_badblocks(
					rdev,
					r10_bio->devs[m].addr,
					r10_bio->sectors, 0);
			} else {
				if (!rdev_set_badblocks(
					    rdev,
					    r10_bio->devs[m].addr,
					    r10_bio->sectors, 0))
					md_error(conf->mddev, rdev);
			}
		}
		put_buf(r10_bio);
	} else {
		for (m = 0; m < conf->copies; m++) {
			int dev = r10_bio->devs[m].devnum;
			struct bio *bio = r10_bio->devs[m].bio;
			rdev = conf->mirrors[dev].rdev;
			if (bio == IO_MADE_GOOD) {
				/* a write succeeded over a known bad block */
				rdev_clear_badblocks(
					rdev,
					r10_bio->devs[m].addr,
					r10_bio->sectors, 0);
				rdev_dec_pending(rdev, conf->mddev);
			} else if (bio != NULL &&
				   !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
				/*
				 * The write failed: try to narrow it down to
				 * individual bad blocks; if that fails the
				 * whole device is failed.
				 */
				if (!narrow_write_error(r10_bio, m)) {
					md_error(conf->mddev, rdev);
					set_bit(R10BIO_Degraded,
						&r10_bio->state);
				}
				rdev_dec_pending(rdev, conf->mddev);
			}
			/* replacement: only the "made good" case is handled */
			bio = r10_bio->devs[m].repl_bio;
			rdev = conf->mirrors[dev].replacement;
			if (rdev && bio == IO_MADE_GOOD) {
				rdev_clear_badblocks(
					rdev,
					r10_bio->devs[m].addr,
					r10_bio->sectors, 0);
				rdev_dec_pending(rdev, conf->mddev);
			}
		}
		if (test_bit(R10BIO_WriteError,
			     &r10_bio->state))
			close_write(r10_bio);
		raid_end_bio_io(r10_bio);
	}
}
2743
/*
 * Main function of the raid10 management thread.
 *
 * Runs md_check_recovery(), then repeatedly flushes pending writes and
 * takes r10bios off the tail of conf->retry_list, dispatching each one by
 * its state bits, until the list is empty.  All of this runs inside one
 * block plug.
 */
static void raid10d(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct r10bio *r10_bio;
	unsigned long flags;
	struct r10conf *conf = mddev->private;
	struct list_head *head = &conf->retry_list;
	struct blk_plug plug;

	md_check_recovery(mddev);

	blk_start_plug(&plug);
	for (;;) {

		flush_pending_writes(conf);

		/* retry_list and nr_queued are protected by device_lock */
		spin_lock_irqsave(&conf->device_lock, flags);
		if (list_empty(head)) {
			spin_unlock_irqrestore(&conf->device_lock, flags);
			break;
		}
		r10_bio = list_entry(head->prev, struct r10bio, retry_list);
		list_del(head->prev);
		conf->nr_queued--;
		spin_unlock_irqrestore(&conf->device_lock, flags);

		mddev = r10_bio->mddev;
		conf = mddev->private;
		/*
		 * The order of these tests matters: MadeGood/WriteError is
		 * checked first because such r10bios may also carry the
		 * IsSync/IsRecover bits.
		 */
		if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
		    test_bit(R10BIO_WriteError, &r10_bio->state))
			handle_write_completed(conf, r10_bio);
		else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
			reshape_request_write(mddev, r10_bio);
		else if (test_bit(R10BIO_IsSync, &r10_bio->state))
			sync_request_write(mddev, r10_bio);
		else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
			recovery_request_write(mddev, r10_bio);
		else if (test_bit(R10BIO_ReadError, &r10_bio->state))
			handle_read_error(mddev, r10_bio);
		else {
			/*
			 * None of the state bits above is set: submit the
			 * bio already prepared in the read slot.
			 */
			int slot = r10_bio->read_slot;
			generic_make_request(r10_bio->devs[slot].bio);
		}

		cond_resched();
		/* any flag other than MD_CHANGE_PENDING needs attention */
		if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
			md_check_recovery(mddev);
	}
	blk_finish_plug(&plug);
}
2797
2798
2799static int init_resync(struct r10conf *conf)
2800{
2801 int buffs;
2802 int i;
2803
2804 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2805 BUG_ON(conf->r10buf_pool);
2806 conf->have_replacement = 0;
2807 for (i = 0; i < conf->geo.raid_disks; i++)
2808 if (conf->mirrors[i].replacement)
2809 conf->have_replacement = 1;
2810 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2811 if (!conf->r10buf_pool)
2812 return -ENOMEM;
2813 conf->next_resync = 0;
2814 return 0;
2815}
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2850 int *skipped, int go_faster)
2851{
2852 struct r10conf *conf = mddev->private;
2853 struct r10bio *r10_bio;
2854 struct bio *biolist = NULL, *bio;
2855 sector_t max_sector, nr_sectors;
2856 int i;
2857 int max_sync;
2858 sector_t sync_blocks;
2859 sector_t sectors_skipped = 0;
2860 int chunks_skipped = 0;
2861 sector_t chunk_mask = conf->geo.chunk_mask;
2862
2863 if (!conf->r10buf_pool)
2864 if (init_resync(conf))
2865 return 0;
2866
2867 skipped:
2868 max_sector = mddev->dev_sectors;
2869 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2870 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2871 max_sector = mddev->resync_max_sectors;
2872 if (sector_nr >= max_sector) {
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2883 end_reshape(conf);
2884 return 0;
2885 }
2886
2887 if (mddev->curr_resync < max_sector) {
2888 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2889 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2890 &sync_blocks, 1);
2891 else for (i = 0; i < conf->geo.raid_disks; i++) {
2892 sector_t sect =
2893 raid10_find_virt(conf, mddev->curr_resync, i);
2894 bitmap_end_sync(mddev->bitmap, sect,
2895 &sync_blocks, 1);
2896 }
2897 } else {
2898
2899 if ((!mddev->bitmap || conf->fullsync)
2900 && conf->have_replacement
2901 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2902
2903
2904
2905 for (i = 0; i < conf->geo.raid_disks; i++)
2906 if (conf->mirrors[i].replacement)
2907 conf->mirrors[i].replacement
2908 ->recovery_offset
2909 = MaxSector;
2910 }
2911 conf->fullsync = 0;
2912 }
2913 bitmap_close_sync(mddev->bitmap);
2914 close_sync(conf);
2915 *skipped = 1;
2916 return sectors_skipped;
2917 }
2918
2919 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2920 return reshape_request(mddev, sector_nr, skipped);
2921
2922 if (chunks_skipped >= conf->geo.raid_disks) {
2923
2924
2925
2926 *skipped = 1;
2927 return (max_sector - sector_nr) + sectors_skipped;
2928 }
2929
2930 if (max_sector > mddev->resync_max)
2931 max_sector = mddev->resync_max;
2932
2933
2934
2935
2936 if (conf->geo.near_copies < conf->geo.raid_disks &&
2937 max_sector > (sector_nr | chunk_mask))
2938 max_sector = (sector_nr | chunk_mask) + 1;
2939
2940
2941
2942
2943 if (!go_faster && conf->nr_waiting)
2944 msleep_interruptible(1000);
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
2962 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2963
2964 int j;
2965 r10_bio = NULL;
2966
2967 for (i = 0 ; i < conf->geo.raid_disks; i++) {
2968 int still_degraded;
2969 struct r10bio *rb2;
2970 sector_t sect;
2971 int must_sync;
2972 int any_working;
2973 struct raid10_info *mirror = &conf->mirrors[i];
2974
2975 if ((mirror->rdev == NULL ||
2976 test_bit(In_sync, &mirror->rdev->flags))
2977 &&
2978 (mirror->replacement == NULL ||
2979 test_bit(Faulty,
2980 &mirror->replacement->flags)))
2981 continue;
2982
2983 still_degraded = 0;
2984
2985 rb2 = r10_bio;
2986 sect = raid10_find_virt(conf, sector_nr, i);
2987 if (sect >= mddev->resync_max_sectors) {
2988
2989
2990
2991 continue;
2992 }
2993
2994
2995
2996
2997 must_sync = bitmap_start_sync(mddev->bitmap, sect,
2998 &sync_blocks, 1);
2999 if (sync_blocks < max_sync)
3000 max_sync = sync_blocks;
3001 if (!must_sync &&
3002 mirror->replacement == NULL &&
3003 !conf->fullsync) {
3004
3005
3006
3007 chunks_skipped = -1;
3008 continue;
3009 }
3010
3011 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3012 raise_barrier(conf, rb2 != NULL);
3013 atomic_set(&r10_bio->remaining, 0);
3014
3015 r10_bio->master_bio = (struct bio*)rb2;
3016 if (rb2)
3017 atomic_inc(&rb2->remaining);
3018 r10_bio->mddev = mddev;
3019 set_bit(R10BIO_IsRecover, &r10_bio->state);
3020 r10_bio->sector = sect;
3021
3022 raid10_find_phys(conf, r10_bio);
3023
3024
3025
3026
3027 for (j = 0; j < conf->geo.raid_disks; j++)
3028 if (conf->mirrors[j].rdev == NULL ||
3029 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
3030 still_degraded = 1;
3031 break;
3032 }
3033
3034 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3035 &sync_blocks, still_degraded);
3036
3037 any_working = 0;
3038 for (j=0; j<conf->copies;j++) {
3039 int k;
3040 int d = r10_bio->devs[j].devnum;
3041 sector_t from_addr, to_addr;
3042 struct md_rdev *rdev;
3043 sector_t sector, first_bad;
3044 int bad_sectors;
3045 if (!conf->mirrors[d].rdev ||
3046 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
3047 continue;
3048
3049 any_working = 1;
3050 rdev = conf->mirrors[d].rdev;
3051 sector = r10_bio->devs[j].addr;
3052
3053 if (is_badblock(rdev, sector, max_sync,
3054 &first_bad, &bad_sectors)) {
3055 if (first_bad > sector)
3056 max_sync = first_bad - sector;
3057 else {
3058 bad_sectors -= (sector
3059 - first_bad);
3060 if (max_sync > bad_sectors)
3061 max_sync = bad_sectors;
3062 continue;
3063 }
3064 }
3065 bio = r10_bio->devs[0].bio;
3066 bio->bi_next = biolist;
3067 biolist = bio;
3068 bio->bi_private = r10_bio;
3069 bio->bi_end_io = end_sync_read;
3070 bio->bi_rw = READ;
3071 from_addr = r10_bio->devs[j].addr;
3072 bio->bi_sector = from_addr + rdev->data_offset;
3073 bio->bi_bdev = rdev->bdev;
3074 atomic_inc(&rdev->nr_pending);
3075
3076
3077 for (k=0; k<conf->copies; k++)
3078 if (r10_bio->devs[k].devnum == i)
3079 break;
3080 BUG_ON(k == conf->copies);
3081 to_addr = r10_bio->devs[k].addr;
3082 r10_bio->devs[0].devnum = d;
3083 r10_bio->devs[0].addr = from_addr;
3084 r10_bio->devs[1].devnum = i;
3085 r10_bio->devs[1].addr = to_addr;
3086
3087 rdev = mirror->rdev;
3088 if (!test_bit(In_sync, &rdev->flags)) {
3089 bio = r10_bio->devs[1].bio;
3090 bio->bi_next = biolist;
3091 biolist = bio;
3092 bio->bi_private = r10_bio;
3093 bio->bi_end_io = end_sync_write;
3094 bio->bi_rw = WRITE;
3095 bio->bi_sector = to_addr
3096 + rdev->data_offset;
3097 bio->bi_bdev = rdev->bdev;
3098 atomic_inc(&r10_bio->remaining);
3099 } else
3100 r10_bio->devs[1].bio->bi_end_io = NULL;
3101
3102
3103 bio = r10_bio->devs[1].repl_bio;
3104 if (bio)
3105 bio->bi_end_io = NULL;
3106 rdev = mirror->replacement;
3107
3108
3109
3110
3111
3112
3113
3114
3115 if (rdev == NULL || bio == NULL ||
3116 test_bit(Faulty, &rdev->flags))
3117 break;
3118 bio->bi_next = biolist;
3119 biolist = bio;
3120 bio->bi_private = r10_bio;
3121 bio->bi_end_io = end_sync_write;
3122 bio->bi_rw = WRITE;
3123 bio->bi_sector = to_addr + rdev->data_offset;
3124 bio->bi_bdev = rdev->bdev;
3125 atomic_inc(&r10_bio->remaining);
3126 break;
3127 }
3128 if (j == conf->copies) {
3129
3130
3131 put_buf(r10_bio);
3132 if (rb2)
3133 atomic_dec(&rb2->remaining);
3134 r10_bio = rb2;
3135 if (any_working) {
3136
3137
3138
3139 int k;
3140 for (k = 0; k < conf->copies; k++)
3141 if (r10_bio->devs[k].devnum == i)
3142 break;
3143 if (!test_bit(In_sync,
3144 &mirror->rdev->flags)
3145 && !rdev_set_badblocks(
3146 mirror->rdev,
3147 r10_bio->devs[k].addr,
3148 max_sync, 0))
3149 any_working = 0;
3150 if (mirror->replacement &&
3151 !rdev_set_badblocks(
3152 mirror->replacement,
3153 r10_bio->devs[k].addr,
3154 max_sync, 0))
3155 any_working = 0;
3156 }
3157 if (!any_working) {
3158 if (!test_and_set_bit(MD_RECOVERY_INTR,
3159 &mddev->recovery))
3160 printk(KERN_INFO "md/raid10:%s: insufficient "
3161 "working devices for recovery.\n",
3162 mdname(mddev));
3163 mirror->recovery_disabled
3164 = mddev->recovery_disabled;
3165 }
3166 break;
3167 }
3168 }
3169 if (biolist == NULL) {
3170 while (r10_bio) {
3171 struct r10bio *rb2 = r10_bio;
3172 r10_bio = (struct r10bio*) rb2->master_bio;
3173 rb2->master_bio = NULL;
3174 put_buf(rb2);
3175 }
3176 goto giveup;
3177 }
3178 } else {
3179
3180 int count = 0;
3181
3182 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3183
3184 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3185 &sync_blocks, mddev->degraded) &&
3186 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3187 &mddev->recovery)) {
3188
3189 *skipped = 1;
3190 return sync_blocks + sectors_skipped;
3191 }
3192 if (sync_blocks < max_sync)
3193 max_sync = sync_blocks;
3194 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3195
3196 r10_bio->mddev = mddev;
3197 atomic_set(&r10_bio->remaining, 0);
3198 raise_barrier(conf, 0);
3199 conf->next_resync = sector_nr;
3200
3201 r10_bio->master_bio = NULL;
3202 r10_bio->sector = sector_nr;
3203 set_bit(R10BIO_IsSync, &r10_bio->state);
3204 raid10_find_phys(conf, r10_bio);
3205 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3206
3207 for (i = 0; i < conf->copies; i++) {
3208 int d = r10_bio->devs[i].devnum;
3209 sector_t first_bad, sector;
3210 int bad_sectors;
3211
3212 if (r10_bio->devs[i].repl_bio)
3213 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3214
3215 bio = r10_bio->devs[i].bio;
3216 bio->bi_end_io = NULL;
3217 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3218 if (conf->mirrors[d].rdev == NULL ||
3219 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
3220 continue;
3221 sector = r10_bio->devs[i].addr;
3222 if (is_badblock(conf->mirrors[d].rdev,
3223 sector, max_sync,
3224 &first_bad, &bad_sectors)) {
3225 if (first_bad > sector)
3226 max_sync = first_bad - sector;
3227 else {
3228 bad_sectors -= (sector - first_bad);
3229 if (max_sync > bad_sectors)
3230 max_sync = bad_sectors;
3231 continue;
3232 }
3233 }
3234 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3235 atomic_inc(&r10_bio->remaining);
3236 bio->bi_next = biolist;
3237 biolist = bio;
3238 bio->bi_private = r10_bio;
3239 bio->bi_end_io = end_sync_read;
3240 bio->bi_rw = READ;
3241 bio->bi_sector = sector +
3242 conf->mirrors[d].rdev->data_offset;
3243 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3244 count++;
3245
3246 if (conf->mirrors[d].replacement == NULL ||
3247 test_bit(Faulty,
3248 &conf->mirrors[d].replacement->flags))
3249 continue;
3250
3251
3252 bio = r10_bio->devs[i].repl_bio;
3253 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3254
3255 sector = r10_bio->devs[i].addr;
3256 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3257 bio->bi_next = biolist;
3258 biolist = bio;
3259 bio->bi_private = r10_bio;
3260 bio->bi_end_io = end_sync_write;
3261 bio->bi_rw = WRITE;
3262 bio->bi_sector = sector +
3263 conf->mirrors[d].replacement->data_offset;
3264 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
3265 count++;
3266 }
3267
3268 if (count < 2) {
3269 for (i=0; i<conf->copies; i++) {
3270 int d = r10_bio->devs[i].devnum;
3271 if (r10_bio->devs[i].bio->bi_end_io)
3272 rdev_dec_pending(conf->mirrors[d].rdev,
3273 mddev);
3274 if (r10_bio->devs[i].repl_bio &&
3275 r10_bio->devs[i].repl_bio->bi_end_io)
3276 rdev_dec_pending(
3277 conf->mirrors[d].replacement,
3278 mddev);
3279 }
3280 put_buf(r10_bio);
3281 biolist = NULL;
3282 goto giveup;
3283 }
3284 }
3285
3286 for (bio = biolist; bio ; bio=bio->bi_next) {
3287
3288 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
3289 if (bio->bi_end_io)
3290 bio->bi_flags |= 1 << BIO_UPTODATE;
3291 bio->bi_vcnt = 0;
3292 bio->bi_idx = 0;
3293 bio->bi_phys_segments = 0;
3294 bio->bi_size = 0;
3295 }
3296
3297 nr_sectors = 0;
3298 if (sector_nr + max_sync < max_sector)
3299 max_sector = sector_nr + max_sync;
3300 do {
3301 struct page *page;
3302 int len = PAGE_SIZE;
3303 if (sector_nr + (len>>9) > max_sector)
3304 len = (max_sector - sector_nr) << 9;
3305 if (len == 0)
3306 break;
3307 for (bio= biolist ; bio ; bio=bio->bi_next) {
3308 struct bio *bio2;
3309 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
3310 if (bio_add_page(bio, page, len, 0))
3311 continue;
3312
3313
3314 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
3315 for (bio2 = biolist;
3316 bio2 && bio2 != bio;
3317 bio2 = bio2->bi_next) {
3318
3319 bio2->bi_vcnt--;
3320 bio2->bi_size -= len;
3321 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
3322 }
3323 goto bio_full;
3324 }
3325 nr_sectors += len>>9;
3326 sector_nr += len>>9;
3327 } while (biolist->bi_vcnt < RESYNC_PAGES);
3328 bio_full:
3329 r10_bio->sectors = nr_sectors;
3330
3331 while (biolist) {
3332 bio = biolist;
3333 biolist = biolist->bi_next;
3334
3335 bio->bi_next = NULL;
3336 r10_bio = bio->bi_private;
3337 r10_bio->sectors = nr_sectors;
3338
3339 if (bio->bi_end_io == end_sync_read) {
3340 md_sync_acct(bio->bi_bdev, nr_sectors);
3341 generic_make_request(bio);
3342 }
3343 }
3344
3345 if (sectors_skipped)
3346
3347
3348
3349 md_done_sync(mddev, sectors_skipped, 1);
3350
3351 return sectors_skipped + nr_sectors;
3352 giveup:
3353
3354
3355
3356
3357 if (sector_nr + max_sync < max_sector)
3358 max_sector = sector_nr + max_sync;
3359
3360 sectors_skipped += (max_sector - sector_nr);
3361 chunks_skipped ++;
3362 sector_nr = max_sector;
3363 goto skipped;
3364}
3365
3366static sector_t
3367raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3368{
3369 sector_t size;
3370 struct r10conf *conf = mddev->private;
3371
3372 if (!raid_disks)
3373 raid_disks = min(conf->geo.raid_disks,
3374 conf->prev.raid_disks);
3375 if (!sectors)
3376 sectors = conf->dev_sectors;
3377
3378 size = sectors >> conf->geo.chunk_shift;
3379 sector_div(size, conf->geo.far_copies);
3380 size = size * raid_disks;
3381 sector_div(size, conf->geo.near_copies);
3382
3383 return size << conf->geo.chunk_shift;
3384}
3385
3386static void calc_sectors(struct r10conf *conf, sector_t size)
3387{
3388
3389
3390
3391
3392
3393 size = size >> conf->geo.chunk_shift;
3394 sector_div(size, conf->geo.far_copies);
3395 size = size * conf->geo.raid_disks;
3396 sector_div(size, conf->geo.near_copies);
3397
3398
3399 size = size * conf->copies;
3400
3401
3402
3403
3404 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3405
3406 conf->dev_sectors = size << conf->geo.chunk_shift;
3407
3408 if (conf->geo.far_offset)
3409 conf->geo.stride = 1 << conf->geo.chunk_shift;
3410 else {
3411 sector_div(size, conf->geo.far_copies);
3412 conf->geo.stride = size << conf->geo.chunk_shift;
3413 }
3414}
3415
3416enum geo_type {geo_new, geo_old, geo_start};
3417static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3418{
3419 int nc, fc, fo;
3420 int layout, chunk, disks;
3421 switch (new) {
3422 case geo_old:
3423 layout = mddev->layout;
3424 chunk = mddev->chunk_sectors;
3425 disks = mddev->raid_disks - mddev->delta_disks;
3426 break;
3427 case geo_new:
3428 layout = mddev->new_layout;
3429 chunk = mddev->new_chunk_sectors;
3430 disks = mddev->raid_disks;
3431 break;
3432 default:
3433 case geo_start:
3434
3435 layout = mddev->new_layout;
3436 chunk = mddev->new_chunk_sectors;
3437 disks = mddev->raid_disks + mddev->delta_disks;
3438 break;
3439 }
3440 if (layout >> 17)
3441 return -1;
3442 if (chunk < (PAGE_SIZE >> 9) ||
3443 !is_power_of_2(chunk))
3444 return -2;
3445 nc = layout & 255;
3446 fc = (layout >> 8) & 255;
3447 fo = layout & (1<<16);
3448 geo->raid_disks = disks;
3449 geo->near_copies = nc;
3450 geo->far_copies = fc;
3451 geo->far_offset = fo;
3452 geo->chunk_mask = chunk - 1;
3453 geo->chunk_shift = ffz(~chunk);
3454 return nc*fc;
3455}
3456
3457static struct r10conf *setup_conf(struct mddev *mddev)
3458{
3459 struct r10conf *conf = NULL;
3460 int err = -EINVAL;
3461 struct geom geo;
3462 int copies;
3463
3464 copies = setup_geo(&geo, mddev, geo_new);
3465
3466 if (copies == -2) {
3467 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3468 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3469 mdname(mddev), PAGE_SIZE);
3470 goto out;
3471 }
3472
3473 if (copies < 2 || copies > mddev->raid_disks) {
3474 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3475 mdname(mddev), mddev->new_layout);
3476 goto out;
3477 }
3478
3479 err = -ENOMEM;
3480 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3481 if (!conf)
3482 goto out;
3483
3484
3485 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3486 max(0,mddev->delta_disks)),
3487 GFP_KERNEL);
3488 if (!conf->mirrors)
3489 goto out;
3490
3491 conf->tmppage = alloc_page(GFP_KERNEL);
3492 if (!conf->tmppage)
3493 goto out;
3494
3495 conf->geo = geo;
3496 conf->copies = copies;
3497 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3498 r10bio_pool_free, conf);
3499 if (!conf->r10bio_pool)
3500 goto out;
3501
3502 calc_sectors(conf, mddev->dev_sectors);
3503 if (mddev->reshape_position == MaxSector) {
3504 conf->prev = conf->geo;
3505 conf->reshape_progress = MaxSector;
3506 } else {
3507 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3508 err = -EINVAL;
3509 goto out;
3510 }
3511 conf->reshape_progress = mddev->reshape_position;
3512 if (conf->prev.far_offset)
3513 conf->prev.stride = 1 << conf->prev.chunk_shift;
3514 else
3515
3516 conf->prev.stride = conf->dev_sectors;
3517 }
3518 spin_lock_init(&conf->device_lock);
3519 INIT_LIST_HEAD(&conf->retry_list);
3520
3521 spin_lock_init(&conf->resync_lock);
3522 init_waitqueue_head(&conf->wait_barrier);
3523
3524 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3525 if (!conf->thread)
3526 goto out;
3527
3528 conf->mddev = mddev;
3529 return conf;
3530
3531 out:
3532 if (err == -ENOMEM)
3533 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3534 mdname(mddev));
3535 if (conf) {
3536 if (conf->r10bio_pool)
3537 mempool_destroy(conf->r10bio_pool);
3538 kfree(conf->mirrors);
3539 safe_put_page(conf->tmppage);
3540 kfree(conf);
3541 }
3542 return ERR_PTR(err);
3543}
3544
3545static int run(struct mddev *mddev)
3546{
3547 struct r10conf *conf;
3548 int i, disk_idx, chunk_size;
3549 struct raid10_info *disk;
3550 struct md_rdev *rdev;
3551 sector_t size;
3552 sector_t min_offset_diff = 0;
3553 int first = 1;
3554 bool discard_supported = false;
3555
3556 if (mddev->private == NULL) {
3557 conf = setup_conf(mddev);
3558 if (IS_ERR(conf))
3559 return PTR_ERR(conf);
3560 mddev->private = conf;
3561 }
3562 conf = mddev->private;
3563 if (!conf)
3564 goto out;
3565
3566 mddev->thread = conf->thread;
3567 conf->thread = NULL;
3568
3569 chunk_size = mddev->chunk_sectors << 9;
3570 if (mddev->queue) {
3571 blk_queue_max_discard_sectors(mddev->queue,
3572 mddev->chunk_sectors);
3573 blk_queue_io_min(mddev->queue, chunk_size);
3574 if (conf->geo.raid_disks % conf->geo.near_copies)
3575 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3576 else
3577 blk_queue_io_opt(mddev->queue, chunk_size *
3578 (conf->geo.raid_disks / conf->geo.near_copies));
3579 }
3580
3581 rdev_for_each(rdev, mddev) {
3582 long long diff;
3583 struct request_queue *q;
3584
3585 disk_idx = rdev->raid_disk;
3586 if (disk_idx < 0)
3587 continue;
3588 if (disk_idx >= conf->geo.raid_disks &&
3589 disk_idx >= conf->prev.raid_disks)
3590 continue;
3591 disk = conf->mirrors + disk_idx;
3592
3593 if (test_bit(Replacement, &rdev->flags)) {
3594 if (disk->replacement)
3595 goto out_free_conf;
3596 disk->replacement = rdev;
3597 } else {
3598 if (disk->rdev)
3599 goto out_free_conf;
3600 disk->rdev = rdev;
3601 }
3602 q = bdev_get_queue(rdev->bdev);
3603 if (q->merge_bvec_fn)
3604 mddev->merge_check_needed = 1;
3605 diff = (rdev->new_data_offset - rdev->data_offset);
3606 if (!mddev->reshape_backwards)
3607 diff = -diff;
3608 if (diff < 0)
3609 diff = 0;
3610 if (first || diff < min_offset_diff)
3611 min_offset_diff = diff;
3612
3613 if (mddev->gendisk)
3614 disk_stack_limits(mddev->gendisk, rdev->bdev,
3615 rdev->data_offset << 9);
3616
3617 disk->head_position = 0;
3618
3619 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3620 discard_supported = true;
3621 }
3622
3623 if (mddev->queue) {
3624 if (discard_supported)
3625 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3626 mddev->queue);
3627 else
3628 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3629 mddev->queue);
3630 }
3631
3632 if (!enough(conf, -1)) {
3633 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
3634 mdname(mddev));
3635 goto out_free_conf;
3636 }
3637
3638 if (conf->reshape_progress != MaxSector) {
3639
3640 if (conf->geo.far_copies != 1 &&
3641 conf->geo.far_offset == 0)
3642 goto out_free_conf;
3643 if (conf->prev.far_copies != 1 &&
3644 conf->geo.far_offset == 0)
3645 goto out_free_conf;
3646 }
3647
3648 mddev->degraded = 0;
3649 for (i = 0;
3650 i < conf->geo.raid_disks
3651 || i < conf->prev.raid_disks;
3652 i++) {
3653
3654 disk = conf->mirrors + i;
3655
3656 if (!disk->rdev && disk->replacement) {
3657
3658 disk->rdev = disk->replacement;
3659 disk->replacement = NULL;
3660 clear_bit(Replacement, &disk->rdev->flags);
3661 }
3662
3663 if (!disk->rdev ||
3664 !test_bit(In_sync, &disk->rdev->flags)) {
3665 disk->head_position = 0;
3666 mddev->degraded++;
3667 if (disk->rdev)
3668 conf->fullsync = 1;
3669 }
3670 disk->recovery_disabled = mddev->recovery_disabled - 1;
3671 }
3672
3673 if (mddev->recovery_cp != MaxSector)
3674 printk(KERN_NOTICE "md/raid10:%s: not clean"
3675 " -- starting background reconstruction\n",
3676 mdname(mddev));
3677 printk(KERN_INFO
3678 "md/raid10:%s: active with %d out of %d devices\n",
3679 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3680 conf->geo.raid_disks);
3681
3682
3683
3684 mddev->dev_sectors = conf->dev_sectors;
3685 size = raid10_size(mddev, 0, 0);
3686 md_set_array_sectors(mddev, size);
3687 mddev->resync_max_sectors = size;
3688
3689 if (mddev->queue) {
3690 int stripe = conf->geo.raid_disks *
3691 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3692 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3693 mddev->queue->backing_dev_info.congested_data = mddev;
3694
3695
3696
3697
3698
3699 stripe /= conf->geo.near_copies;
3700 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3701 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3702 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3703 }
3704
3705
3706 if (md_integrity_register(mddev))
3707 goto out_free_conf;
3708
3709 if (conf->reshape_progress != MaxSector) {
3710 unsigned long before_length, after_length;
3711
3712 before_length = ((1 << conf->prev.chunk_shift) *
3713 conf->prev.far_copies);
3714 after_length = ((1 << conf->geo.chunk_shift) *
3715 conf->geo.far_copies);
3716
3717 if (max(before_length, after_length) > min_offset_diff) {
3718
3719 printk("md/raid10: offset difference not enough to continue reshape\n");
3720 goto out_free_conf;
3721 }
3722 conf->offset_diff = min_offset_diff;
3723
3724 conf->reshape_safe = conf->reshape_progress;
3725 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3726 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3727 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3728 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3729 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3730 "reshape");
3731 }
3732
3733 return 0;
3734
3735out_free_conf:
3736 md_unregister_thread(&mddev->thread);
3737 if (conf->r10bio_pool)
3738 mempool_destroy(conf->r10bio_pool);
3739 safe_put_page(conf->tmppage);
3740 kfree(conf->mirrors);
3741 kfree(conf);
3742 mddev->private = NULL;
3743out:
3744 return -EIO;
3745}
3746
3747static int stop(struct mddev *mddev)
3748{
3749 struct r10conf *conf = mddev->private;
3750
3751 raise_barrier(conf, 0);
3752 lower_barrier(conf);
3753
3754 md_unregister_thread(&mddev->thread);
3755 if (mddev->queue)
3756
3757 blk_sync_queue(mddev->queue);
3758
3759 if (conf->r10bio_pool)
3760 mempool_destroy(conf->r10bio_pool);
3761 kfree(conf->mirrors);
3762 kfree(conf);
3763 mddev->private = NULL;
3764 return 0;
3765}
3766
3767static void raid10_quiesce(struct mddev *mddev, int state)
3768{
3769 struct r10conf *conf = mddev->private;
3770
3771 switch(state) {
3772 case 1:
3773 raise_barrier(conf, 0);
3774 break;
3775 case 0:
3776 lower_barrier(conf);
3777 break;
3778 }
3779}
3780
3781static int raid10_resize(struct mddev *mddev, sector_t sectors)
3782{
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795 struct r10conf *conf = mddev->private;
3796 sector_t oldsize, size;
3797
3798 if (mddev->reshape_position != MaxSector)
3799 return -EBUSY;
3800
3801 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3802 return -EINVAL;
3803
3804 oldsize = raid10_size(mddev, 0, 0);
3805 size = raid10_size(mddev, sectors, 0);
3806 if (mddev->external_size &&
3807 mddev->array_sectors > size)
3808 return -EINVAL;
3809 if (mddev->bitmap) {
3810 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3811 if (ret)
3812 return ret;
3813 }
3814 md_set_array_sectors(mddev, size);
3815 set_capacity(mddev->gendisk, mddev->array_sectors);
3816 revalidate_disk(mddev->gendisk);
3817 if (sectors > mddev->dev_sectors &&
3818 mddev->recovery_cp > oldsize) {
3819 mddev->recovery_cp = oldsize;
3820 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3821 }
3822 calc_sectors(conf, sectors);
3823 mddev->dev_sectors = conf->dev_sectors;
3824 mddev->resync_max_sectors = size;
3825 return 0;
3826}
3827
3828static void *raid10_takeover_raid0(struct mddev *mddev)
3829{
3830 struct md_rdev *rdev;
3831 struct r10conf *conf;
3832
3833 if (mddev->degraded > 0) {
3834 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
3835 mdname(mddev));
3836 return ERR_PTR(-EINVAL);
3837 }
3838
3839
3840 mddev->new_level = 10;
3841
3842 mddev->new_layout = (1<<8) + 2;
3843 mddev->new_chunk_sectors = mddev->chunk_sectors;
3844 mddev->delta_disks = mddev->raid_disks;
3845 mddev->raid_disks *= 2;
3846
3847 mddev->recovery_cp = MaxSector;
3848
3849 conf = setup_conf(mddev);
3850 if (!IS_ERR(conf)) {
3851 rdev_for_each(rdev, mddev)
3852 if (rdev->raid_disk >= 0)
3853 rdev->new_raid_disk = rdev->raid_disk * 2;
3854 conf->barrier = 1;
3855 }
3856
3857 return conf;
3858}
3859
3860static void *raid10_takeover(struct mddev *mddev)
3861{
3862 struct r0conf *raid0_conf;
3863
3864
3865
3866
3867 if (mddev->level == 0) {
3868
3869 raid0_conf = mddev->private;
3870 if (raid0_conf->nr_strip_zones > 1) {
3871 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
3872 " with more than one zone.\n",
3873 mdname(mddev));
3874 return ERR_PTR(-EINVAL);
3875 }
3876 return raid10_takeover_raid0(mddev);
3877 }
3878 return ERR_PTR(-EINVAL);
3879}
3880
3881static int raid10_check_reshape(struct mddev *mddev)
3882{
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897 struct r10conf *conf = mddev->private;
3898 struct geom geo;
3899
3900 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3901 return -EINVAL;
3902
3903 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
3904
3905 return -EINVAL;
3906 if (geo.far_copies > 1 && !geo.far_offset)
3907
3908 return -EINVAL;
3909
3910 if (mddev->array_sectors & geo.chunk_mask)
3911
3912 return -EINVAL;
3913
3914 if (!enough(conf, -1))
3915 return -EINVAL;
3916
3917 kfree(conf->mirrors_new);
3918 conf->mirrors_new = NULL;
3919 if (mddev->delta_disks > 0) {
3920
3921 conf->mirrors_new = kzalloc(
3922 sizeof(struct raid10_info)
3923 *(mddev->raid_disks +
3924 mddev->delta_disks),
3925 GFP_KERNEL);
3926 if (!conf->mirrors_new)
3927 return -ENOMEM;
3928 }
3929 return 0;
3930}
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945static int calc_degraded(struct r10conf *conf)
3946{
3947 int degraded, degraded2;
3948 int i;
3949
3950 rcu_read_lock();
3951 degraded = 0;
3952
3953 for (i = 0; i < conf->prev.raid_disks; i++) {
3954 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3955 if (!rdev || test_bit(Faulty, &rdev->flags))
3956 degraded++;
3957 else if (!test_bit(In_sync, &rdev->flags))
3958
3959
3960
3961
3962 degraded++;
3963 }
3964 rcu_read_unlock();
3965 if (conf->geo.raid_disks == conf->prev.raid_disks)
3966 return degraded;
3967 rcu_read_lock();
3968 degraded2 = 0;
3969 for (i = 0; i < conf->geo.raid_disks; i++) {
3970 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3971 if (!rdev || test_bit(Faulty, &rdev->flags))
3972 degraded2++;
3973 else if (!test_bit(In_sync, &rdev->flags)) {
3974
3975
3976
3977
3978
3979 if (conf->geo.raid_disks <= conf->prev.raid_disks)
3980 degraded2++;
3981 }
3982 }
3983 rcu_read_unlock();
3984 if (degraded2 > degraded)
3985 return degraded2;
3986 return degraded;
3987}
3988
3989static int raid10_start_reshape(struct mddev *mddev)
3990{
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001 unsigned long before_length, after_length;
4002 sector_t min_offset_diff = 0;
4003 int first = 1;
4004 struct geom new;
4005 struct r10conf *conf = mddev->private;
4006 struct md_rdev *rdev;
4007 int spares = 0;
4008 int ret;
4009
4010 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4011 return -EBUSY;
4012
4013 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4014 return -EINVAL;
4015
4016 before_length = ((1 << conf->prev.chunk_shift) *
4017 conf->prev.far_copies);
4018 after_length = ((1 << conf->geo.chunk_shift) *
4019 conf->geo.far_copies);
4020
4021 rdev_for_each(rdev, mddev) {
4022 if (!test_bit(In_sync, &rdev->flags)
4023 && !test_bit(Faulty, &rdev->flags))
4024 spares++;
4025 if (rdev->raid_disk >= 0) {
4026 long long diff = (rdev->new_data_offset
4027 - rdev->data_offset);
4028 if (!mddev->reshape_backwards)
4029 diff = -diff;
4030 if (diff < 0)
4031 diff = 0;
4032 if (first || diff < min_offset_diff)
4033 min_offset_diff = diff;
4034 }
4035 }
4036
4037 if (max(before_length, after_length) > min_offset_diff)
4038 return -EINVAL;
4039
4040 if (spares < mddev->delta_disks)
4041 return -EINVAL;
4042
4043 conf->offset_diff = min_offset_diff;
4044 spin_lock_irq(&conf->device_lock);
4045 if (conf->mirrors_new) {
4046 memcpy(conf->mirrors_new, conf->mirrors,
4047 sizeof(struct raid10_info)*conf->prev.raid_disks);
4048 smp_mb();
4049 kfree(conf->mirrors_old);
4050 conf->mirrors_old = conf->mirrors;
4051 conf->mirrors = conf->mirrors_new;
4052 conf->mirrors_new = NULL;
4053 }
4054 setup_geo(&conf->geo, mddev, geo_start);
4055 smp_mb();
4056 if (mddev->reshape_backwards) {
4057 sector_t size = raid10_size(mddev, 0, 0);
4058 if (size < mddev->array_sectors) {
4059 spin_unlock_irq(&conf->device_lock);
4060 printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n",
4061 mdname(mddev));
4062 return -EINVAL;
4063 }
4064 mddev->resync_max_sectors = size;
4065 conf->reshape_progress = size;
4066 } else
4067 conf->reshape_progress = 0;
4068 spin_unlock_irq(&conf->device_lock);
4069
4070 if (mddev->delta_disks && mddev->bitmap) {
4071 ret = bitmap_resize(mddev->bitmap,
4072 raid10_size(mddev, 0,
4073 conf->geo.raid_disks),
4074 0, 0);
4075 if (ret)
4076 goto abort;
4077 }
4078 if (mddev->delta_disks > 0) {
4079 rdev_for_each(rdev, mddev)
4080 if (rdev->raid_disk < 0 &&
4081 !test_bit(Faulty, &rdev->flags)) {
4082 if (raid10_add_disk(mddev, rdev) == 0) {
4083 if (rdev->raid_disk >=
4084 conf->prev.raid_disks)
4085 set_bit(In_sync, &rdev->flags);
4086 else
4087 rdev->recovery_offset = 0;
4088
4089 if (sysfs_link_rdev(mddev, rdev))
4090 ;
4091 }
4092 } else if (rdev->raid_disk >= conf->prev.raid_disks
4093 && !test_bit(Faulty, &rdev->flags)) {
4094
4095 set_bit(In_sync, &rdev->flags);
4096 }
4097 }
4098
4099
4100
4101
4102 spin_lock_irq(&conf->device_lock);
4103 mddev->degraded = calc_degraded(conf);
4104 spin_unlock_irq(&conf->device_lock);
4105 mddev->raid_disks = conf->geo.raid_disks;
4106 mddev->reshape_position = conf->reshape_progress;
4107 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4108
4109 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4110 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4111 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4112 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4113
4114 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4115 "reshape");
4116 if (!mddev->sync_thread) {
4117 ret = -EAGAIN;
4118 goto abort;
4119 }
4120 conf->reshape_checkpoint = jiffies;
4121 md_wakeup_thread(mddev->sync_thread);
4122 md_new_event(mddev);
4123 return 0;
4124
4125abort:
4126 mddev->recovery = 0;
4127 spin_lock_irq(&conf->device_lock);
4128 conf->geo = conf->prev;
4129 mddev->raid_disks = conf->geo.raid_disks;
4130 rdev_for_each(rdev, mddev)
4131 rdev->new_data_offset = rdev->data_offset;
4132 smp_wmb();
4133 conf->reshape_progress = MaxSector;
4134 mddev->reshape_position = MaxSector;
4135 spin_unlock_irq(&conf->device_lock);
4136 return ret;
4137}
4138
4139
4140
4141
4142
4143
4144
4145static sector_t last_dev_address(sector_t s, struct geom *geo)
4146{
4147 s = (s | geo->chunk_mask) + 1;
4148 s >>= geo->chunk_shift;
4149 s *= geo->near_copies;
4150 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4151 s *= geo->far_copies;
4152 s <<= geo->chunk_shift;
4153 return s;
4154}
4155
4156
4157
4158
4159
4160static sector_t first_dev_address(sector_t s, struct geom *geo)
4161{
4162 s >>= geo->chunk_shift;
4163 s *= geo->near_copies;
4164 sector_div(s, geo->raid_disks);
4165 s *= geo->far_copies;
4166 s <<= geo->chunk_shift;
4167 return s;
4168}
4169
4170static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4171 int *skipped)
4172{
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210 struct r10conf *conf = mddev->private;
4211 struct r10bio *r10_bio;
4212 sector_t next, safe, last;
4213 int max_sectors;
4214 int nr_sectors;
4215 int s;
4216 struct md_rdev *rdev;
4217 int need_flush = 0;
4218 struct bio *blist;
4219 struct bio *bio, *read_bio;
4220 int sectors_done = 0;
4221
4222 if (sector_nr == 0) {
4223
4224 if (mddev->reshape_backwards &&
4225 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4226 sector_nr = (raid10_size(mddev, 0, 0)
4227 - conf->reshape_progress);
4228 } else if (!mddev->reshape_backwards &&
4229 conf->reshape_progress > 0)
4230 sector_nr = conf->reshape_progress;
4231 if (sector_nr) {
4232 mddev->curr_resync_completed = sector_nr;
4233 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4234 *skipped = 1;
4235 return sector_nr;
4236 }
4237 }
4238
4239
4240
4241
4242
4243 if (mddev->reshape_backwards) {
4244
4245
4246
4247 next = first_dev_address(conf->reshape_progress - 1,
4248 &conf->geo);
4249
4250
4251
4252
4253 safe = last_dev_address(conf->reshape_safe - 1,
4254 &conf->prev);
4255
4256 if (next + conf->offset_diff < safe)
4257 need_flush = 1;
4258
4259 last = conf->reshape_progress - 1;
4260 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4261 & conf->prev.chunk_mask);
4262 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4263 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4264 } else {
4265
4266
4267
4268 next = last_dev_address(conf->reshape_progress, &conf->geo);
4269
4270
4271
4272
4273 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4274
4275
4276
4277
4278 if (next > safe + conf->offset_diff)
4279 need_flush = 1;
4280
4281 sector_nr = conf->reshape_progress;
4282 last = sector_nr | (conf->geo.chunk_mask
4283 & conf->prev.chunk_mask);
4284
4285 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4286 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4287 }
4288
4289 if (need_flush ||
4290 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4291
4292 wait_barrier(conf);
4293 mddev->reshape_position = conf->reshape_progress;
4294 if (mddev->reshape_backwards)
4295 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4296 - conf->reshape_progress;
4297 else
4298 mddev->curr_resync_completed = conf->reshape_progress;
4299 conf->reshape_checkpoint = jiffies;
4300 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4301 md_wakeup_thread(mddev->thread);
4302 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4303 kthread_should_stop());
4304 conf->reshape_safe = mddev->reshape_position;
4305 allow_barrier(conf);
4306 }
4307
4308read_more:
4309
4310 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4311 raise_barrier(conf, sectors_done != 0);
4312 atomic_set(&r10_bio->remaining, 0);
4313 r10_bio->mddev = mddev;
4314 r10_bio->sector = sector_nr;
4315 set_bit(R10BIO_IsReshape, &r10_bio->state);
4316 r10_bio->sectors = last - sector_nr + 1;
4317 rdev = read_balance(conf, r10_bio, &max_sectors);
4318 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4319
4320 if (!rdev) {
4321
4322
4323
4324
4325 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4326 return sectors_done;
4327 }
4328
4329 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4330
4331 read_bio->bi_bdev = rdev->bdev;
4332 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4333 + rdev->data_offset);
4334 read_bio->bi_private = r10_bio;
4335 read_bio->bi_end_io = end_sync_read;
4336 read_bio->bi_rw = READ;
4337 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4338 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4339 read_bio->bi_vcnt = 0;
4340 read_bio->bi_idx = 0;
4341 read_bio->bi_size = 0;
4342 r10_bio->master_bio = read_bio;
4343 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4344
4345
4346 __raid10_find_phys(&conf->geo, r10_bio);
4347
4348 blist = read_bio;
4349 read_bio->bi_next = NULL;
4350
4351 for (s = 0; s < conf->copies*2; s++) {
4352 struct bio *b;
4353 int d = r10_bio->devs[s/2].devnum;
4354 struct md_rdev *rdev2;
4355 if (s&1) {
4356 rdev2 = conf->mirrors[d].replacement;
4357 b = r10_bio->devs[s/2].repl_bio;
4358 } else {
4359 rdev2 = conf->mirrors[d].rdev;
4360 b = r10_bio->devs[s/2].bio;
4361 }
4362 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4363 continue;
4364 b->bi_bdev = rdev2->bdev;
4365 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
4366 b->bi_private = r10_bio;
4367 b->bi_end_io = end_reshape_write;
4368 b->bi_rw = WRITE;
4369 b->bi_flags &= ~(BIO_POOL_MASK - 1);
4370 b->bi_flags |= 1 << BIO_UPTODATE;
4371 b->bi_next = blist;
4372 b->bi_vcnt = 0;
4373 b->bi_idx = 0;
4374 b->bi_size = 0;
4375 blist = b;
4376 }
4377
4378
4379
4380 nr_sectors = 0;
4381 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4382 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4383 int len = (max_sectors - s) << 9;
4384 if (len > PAGE_SIZE)
4385 len = PAGE_SIZE;
4386 for (bio = blist; bio ; bio = bio->bi_next) {
4387 struct bio *bio2;
4388 if (bio_add_page(bio, page, len, 0))
4389 continue;
4390
4391
4392 for (bio2 = blist;
4393 bio2 && bio2 != bio;
4394 bio2 = bio2->bi_next) {
4395
4396 bio2->bi_vcnt--;
4397 bio2->bi_size -= len;
4398 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4399 }
4400 goto bio_full;
4401 }
4402 sector_nr += len >> 9;
4403 nr_sectors += len >> 9;
4404 }
4405bio_full:
4406 r10_bio->sectors = nr_sectors;
4407
4408
4409 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4410 atomic_inc(&r10_bio->remaining);
4411 read_bio->bi_next = NULL;
4412 generic_make_request(read_bio);
4413 sector_nr += nr_sectors;
4414 sectors_done += nr_sectors;
4415 if (sector_nr <= last)
4416 goto read_more;
4417
4418
4419
4420
4421 if (mddev->reshape_backwards)
4422 conf->reshape_progress -= sectors_done;
4423 else
4424 conf->reshape_progress += sectors_done;
4425
4426 return sectors_done;
4427}
4428
4429static void end_reshape_request(struct r10bio *r10_bio);
4430static int handle_reshape_read_error(struct mddev *mddev,
4431 struct r10bio *r10_bio);
4432static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4433{
4434
4435
4436
4437
4438
4439 struct r10conf *conf = mddev->private;
4440 int s;
4441
4442 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4443 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4444
4445 md_done_sync(mddev, r10_bio->sectors, 0);
4446 return;
4447 }
4448
4449
4450
4451
4452 atomic_set(&r10_bio->remaining, 1);
4453 for (s = 0; s < conf->copies*2; s++) {
4454 struct bio *b;
4455 int d = r10_bio->devs[s/2].devnum;
4456 struct md_rdev *rdev;
4457 if (s&1) {
4458 rdev = conf->mirrors[d].replacement;
4459 b = r10_bio->devs[s/2].repl_bio;
4460 } else {
4461 rdev = conf->mirrors[d].rdev;
4462 b = r10_bio->devs[s/2].bio;
4463 }
4464 if (!rdev || test_bit(Faulty, &rdev->flags))
4465 continue;
4466 atomic_inc(&rdev->nr_pending);
4467 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4468 atomic_inc(&r10_bio->remaining);
4469 b->bi_next = NULL;
4470 generic_make_request(b);
4471 }
4472 end_reshape_request(r10_bio);
4473}
4474
4475static void end_reshape(struct r10conf *conf)
4476{
4477 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4478 return;
4479
4480 spin_lock_irq(&conf->device_lock);
4481 conf->prev = conf->geo;
4482 md_finish_reshape(conf->mddev);
4483 smp_wmb();
4484 conf->reshape_progress = MaxSector;
4485 spin_unlock_irq(&conf->device_lock);
4486
4487
4488
4489
4490 if (conf->mddev->queue) {
4491 int stripe = conf->geo.raid_disks *
4492 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4493 stripe /= conf->geo.near_copies;
4494 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4495 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4496 }
4497 conf->fullsync = 0;
4498}
4499
4500
4501static int handle_reshape_read_error(struct mddev *mddev,
4502 struct r10bio *r10_bio)
4503{
4504
4505 int sectors = r10_bio->sectors;
4506 struct r10conf *conf = mddev->private;
4507 struct {
4508 struct r10bio r10_bio;
4509 struct r10dev devs[conf->copies];
4510 } on_stack;
4511 struct r10bio *r10b = &on_stack.r10_bio;
4512 int slot = 0;
4513 int idx = 0;
4514 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4515
4516 r10b->sector = r10_bio->sector;
4517 __raid10_find_phys(&conf->prev, r10b);
4518
4519 while (sectors) {
4520 int s = sectors;
4521 int success = 0;
4522 int first_slot = slot;
4523
4524 if (s > (PAGE_SIZE >> 9))
4525 s = PAGE_SIZE >> 9;
4526
4527 while (!success) {
4528 int d = r10b->devs[slot].devnum;
4529 struct md_rdev *rdev = conf->mirrors[d].rdev;
4530 sector_t addr;
4531 if (rdev == NULL ||
4532 test_bit(Faulty, &rdev->flags) ||
4533 !test_bit(In_sync, &rdev->flags))
4534 goto failed;
4535
4536 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4537 success = sync_page_io(rdev,
4538 addr,
4539 s << 9,
4540 bvec[idx].bv_page,
4541 READ, false);
4542 if (success)
4543 break;
4544 failed:
4545 slot++;
4546 if (slot >= conf->copies)
4547 slot = 0;
4548 if (slot == first_slot)
4549 break;
4550 }
4551 if (!success) {
4552
4553 set_bit(MD_RECOVERY_INTR,
4554 &mddev->recovery);
4555 return -EIO;
4556 }
4557 sectors -= s;
4558 idx++;
4559 }
4560 return 0;
4561}
4562
4563static void end_reshape_write(struct bio *bio, int error)
4564{
4565 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
4566 struct r10bio *r10_bio = bio->bi_private;
4567 struct mddev *mddev = r10_bio->mddev;
4568 struct r10conf *conf = mddev->private;
4569 int d;
4570 int slot;
4571 int repl;
4572 struct md_rdev *rdev = NULL;
4573
4574 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4575 if (repl)
4576 rdev = conf->mirrors[d].replacement;
4577 if (!rdev) {
4578 smp_mb();
4579 rdev = conf->mirrors[d].rdev;
4580 }
4581
4582 if (!uptodate) {
4583
4584 md_error(mddev, rdev);
4585 }
4586
4587 rdev_dec_pending(rdev, mddev);
4588 end_reshape_request(r10_bio);
4589}
4590
4591static void end_reshape_request(struct r10bio *r10_bio)
4592{
4593 if (!atomic_dec_and_test(&r10_bio->remaining))
4594 return;
4595 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4596 bio_put(r10_bio->master_bio);
4597 put_buf(r10_bio);
4598}
4599
4600static void raid10_finish_reshape(struct mddev *mddev)
4601{
4602 struct r10conf *conf = mddev->private;
4603
4604 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4605 return;
4606
4607 if (mddev->delta_disks > 0) {
4608 sector_t size = raid10_size(mddev, 0, 0);
4609 md_set_array_sectors(mddev, size);
4610 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4611 mddev->recovery_cp = mddev->resync_max_sectors;
4612 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4613 }
4614 mddev->resync_max_sectors = size;
4615 set_capacity(mddev->gendisk, mddev->array_sectors);
4616 revalidate_disk(mddev->gendisk);
4617 } else {
4618 int d;
4619 for (d = conf->geo.raid_disks ;
4620 d < conf->geo.raid_disks - mddev->delta_disks;
4621 d++) {
4622 struct md_rdev *rdev = conf->mirrors[d].rdev;
4623 if (rdev)
4624 clear_bit(In_sync, &rdev->flags);
4625 rdev = conf->mirrors[d].replacement;
4626 if (rdev)
4627 clear_bit(In_sync, &rdev->flags);
4628 }
4629 }
4630 mddev->layout = mddev->new_layout;
4631 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4632 mddev->reshape_position = MaxSector;
4633 mddev->delta_disks = 0;
4634 mddev->reshape_backwards = 0;
4635}
4636
4637static struct md_personality raid10_personality =
4638{
4639 .name = "raid10",
4640 .level = 10,
4641 .owner = THIS_MODULE,
4642 .make_request = make_request,
4643 .run = run,
4644 .stop = stop,
4645 .status = status,
4646 .error_handler = error,
4647 .hot_add_disk = raid10_add_disk,
4648 .hot_remove_disk= raid10_remove_disk,
4649 .spare_active = raid10_spare_active,
4650 .sync_request = sync_request,
4651 .quiesce = raid10_quiesce,
4652 .size = raid10_size,
4653 .resize = raid10_resize,
4654 .takeover = raid10_takeover,
4655 .check_reshape = raid10_check_reshape,
4656 .start_reshape = raid10_start_reshape,
4657 .finish_reshape = raid10_finish_reshape,
4658};
4659
4660static int __init raid_init(void)
4661{
4662 return register_md_personality(&raid10_personality);
4663}
4664
/* Module exit: unregister the RAID10 personality from the md core. */
static void raid_exit(void)
{
	unregister_md_personality(&raid10_personality);
}
4669
/* Module entry and exit points */
module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
/* Aliases under which this personality module can be requested */
MODULE_ALIAS("md-personality-9"); /* NOTE(review): presumably the legacy personality number for raid10 - confirm */
MODULE_ALIAS("md-raid10");
MODULE_ALIAS("md-level-10");

/* max_queued_requests: world-readable, root-writable module parameter */
module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
4679