1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21#include <linux/slab.h>
22#include <linux/delay.h>
23#include <linux/blkdev.h>
24#include <linux/module.h>
25#include <linux/seq_file.h>
26#include <linux/ratelimit.h>
27#include <linux/kthread.h>
28#include "md.h"
29#include "raid10.h"
30#include "raid0.h"
31#include "bitmap.h"
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/*
 * NOTE(review): not referenced in this part of the file; presumably the
 * number of r10bio structures kept in reserve by the mempool -- confirm at
 * the place where the pool is created.
 */
#define NR_RAID10_BIOS 256

/*
 * Sentinel stored in r10bio->devs[].bio in place of a real bio pointer.
 * read_balance() skips every slot whose bio is IO_BLOCKED.
 */
#define IO_BLOCKED ((struct bio *)1)

/*
 * Sentinel stored in r10bio->devs[].bio / .repl_bio by
 * raid10_end_write_request() when a write succeeded over a range that
 * is_badblock() still reports as bad.
 */
#define IO_MADE_GOOD ((struct bio *)2)

/*
 * True for NULL and for the two sentinels above, i.e. for every value that
 * is not a real bio and must not be handed to bio_put().  The argument is
 * parenthesised so that any pointer expression can be passed safely.
 */
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)
76
77
78
79
80
/*
 * Cap on conf->pending_count: md_raid10_congested() reports congestion for
 * BDI_async_congested queries, and make_request() makes writers wait, once
 * this many writes are queued.
 */
81static int max_queued_requests = 1024;
82
/* Forward declarations of helpers that are used before they are defined. */
83static void allow_barrier(struct r10conf *conf);
84static void lower_barrier(struct r10conf *conf);
85static int enough(struct r10conf *conf, int ignore);
86static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
87 int *skipped);
88static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
89static void end_reshape_write(struct bio *bio, int error);
90static void end_reshape(struct r10conf *conf);
91
92static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
93{
94 struct r10conf *conf = data;
95 int size = offsetof(struct r10bio, devs[conf->copies]);
96
97
98
99 return kzalloc(size, gfp_flags);
100}
101
/*
 * r10bio_pool_free() - mempool release callback matching r10bio_pool_alloc().
 *
 * @r10_bio: the structure to release.
 * @data:    unused; present to satisfy the callback signature.
 */
static void r10bio_pool_free(void *r10_bio, void *data)
{
	kfree(r10_bio);
}
106
107
/*
 * Resync buffer geometry: every bio set up by r10buf_pool_alloc() gets
 * RESYNC_PAGES pages, i.e. RESYNC_BLOCK_SIZE bytes rounded up to whole pages.
 */
108#define RESYNC_BLOCK_SIZE (64*1024)
109#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
110
/* NOTE(review): not referenced in this part of the file -- confirm its use. */
111#define RESYNC_WINDOW (1024*1024)
112
/* raise_barrier() keeps waiting while conf->barrier has reached this value. */
113#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
114
115
116
117
118
119
120
121
/*
 * r10buf_pool_alloc() - mempool allocation callback for resync buffers.
 *
 * @gfp_flags: allocation flags used for the r10bio, the bios and the pages.
 * @data:      the owning struct r10conf.
 *
 * Builds an r10bio whose first 'nalloc' slots each carry a bio with
 * RESYNC_PAGES pages attached.  Returns the r10bio, or NULL after undoing
 * every partial allocation.
 */
122static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
123{
124 struct r10conf *conf = data;
125 struct page *page;
126 struct r10bio *r10_bio;
127 struct bio *bio;
128 int i, j;
129 int nalloc;
130
131 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
132 if (!r10_bio)
133 return NULL;
134
 /*
  * Resync and reshape need a buffer for every copy; otherwise only two
  * slots are populated.
  */
135 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
136 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
137 nalloc = conf->copies;
138 else
139 nalloc = 2;
140
141
142
143
 /*
  * Step 1: allocate the bios, highest slot first.  On failure at slot j
  * the slots j..nalloc-1 may hold bios, which is exactly the range that
  * out_free_bio walks (the r10bio is zero-filled, so unset entries are
  * NULL).
  */
144 for (j = nalloc ; j-- ; ) {
145 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
146 if (!bio)
147 goto out_free_bio;
148 r10_bio->devs[j].bio = bio;
 /* A second bio per slot is only needed when a replacement exists. */
149 if (!conf->have_replacement)
150 continue;
151 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
152 if (!bio)
153 goto out_free_bio;
154 r10_bio->devs[j].repl_bio = bio;
155 }
156
157
158
159
 /* Step 2: attach RESYNC_PAGES pages to every bio. */
160 for (j = 0 ; j < nalloc; j++) {
161 struct bio *rbio = r10_bio->devs[j].repl_bio;
162 bio = r10_bio->devs[j].bio;
163 for (i = 0; i < RESYNC_PAGES; i++) {
164 if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
165 &conf->mddev->recovery)) {
166
167
 /*
  * Outside a resync, slots after the first reuse
  * slot 0's page and take an extra reference on it.
  * This inner 'rbio' shadows the outer one only
  * inside this branch.
  */
168 struct bio *rbio = r10_bio->devs[0].bio;
169 page = rbio->bi_io_vec[i].bv_page;
170 get_page(page);
171 } else
172 page = alloc_page(gfp_flags);
173 if (unlikely(!page))
174 goto out_free_pages;
175
176 bio->bi_io_vec[i].bv_page = page;
 /*
  * The replacement bio points at the same page; no
  * additional reference is taken for it.
  */
177 if (rbio)
178 rbio->bi_io_vec[i].bv_page = page;
179 }
180 }
181
182 return r10_bio;
183
184out_free_pages:
 /* Drop the pages already attached to the bio that failed part-way... */
185 for ( ; i > 0 ; i--)
186 safe_put_page(bio->bi_io_vec[i-1].bv_page);
 /* ...then every page of the bios that were completed before it. */
187 while (j--)
188 for (i = 0; i < RESYNC_PAGES ; i++)
189 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
 /* All bios were allocated in this case, so release slots 0..nalloc-1. */
190 j = 0;
191out_free_bio:
192 for ( ; j < nalloc; j++) {
193 if (r10_bio->devs[j].bio)
194 bio_put(r10_bio->devs[j].bio);
195 if (r10_bio->devs[j].repl_bio)
196 bio_put(r10_bio->devs[j].repl_bio);
197 }
198 r10bio_pool_free(r10_bio, conf);
199 return NULL;
200}
201
202static void r10buf_pool_free(void *__r10_bio, void *data)
203{
204 int i;
205 struct r10conf *conf = data;
206 struct r10bio *r10bio = __r10_bio;
207 int j;
208
209 for (j=0; j < conf->copies; j++) {
210 struct bio *bio = r10bio->devs[j].bio;
211 if (bio) {
212 for (i = 0; i < RESYNC_PAGES; i++) {
213 safe_put_page(bio->bi_io_vec[i].bv_page);
214 bio->bi_io_vec[i].bv_page = NULL;
215 }
216 bio_put(bio);
217 }
218 bio = r10bio->devs[j].repl_bio;
219 if (bio)
220 bio_put(bio);
221 }
222 r10bio_pool_free(r10bio, conf);
223}
224
225static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
226{
227 int i;
228
229 for (i = 0; i < conf->copies; i++) {
230 struct bio **bio = & r10_bio->devs[i].bio;
231 if (!BIO_SPECIAL(*bio))
232 bio_put(*bio);
233 *bio = NULL;
234 bio = &r10_bio->devs[i].repl_bio;
235 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
236 bio_put(*bio);
237 *bio = NULL;
238 }
239}
240
241static void free_r10bio(struct r10bio *r10_bio)
242{
243 struct r10conf *conf = r10_bio->mddev->private;
244
245 put_all_bios(conf, r10_bio);
246 mempool_free(r10_bio, conf->r10bio_pool);
247}
248
249static void put_buf(struct r10bio *r10_bio)
250{
251 struct r10conf *conf = r10_bio->mddev->private;
252
253 mempool_free(r10_bio, conf->r10buf_pool);
254
255 lower_barrier(conf);
256}
257
258static void reschedule_retry(struct r10bio *r10_bio)
259{
260 unsigned long flags;
261 struct mddev *mddev = r10_bio->mddev;
262 struct r10conf *conf = mddev->private;
263
264 spin_lock_irqsave(&conf->device_lock, flags);
265 list_add(&r10_bio->retry_list, &conf->retry_list);
266 conf->nr_queued ++;
267 spin_unlock_irqrestore(&conf->device_lock, flags);
268
269
270 wake_up(&conf->wait_barrier);
271
272 md_wakeup_thread(mddev->thread);
273}
274
275
276
277
278
279
280static void raid_end_bio_io(struct r10bio *r10_bio)
281{
282 struct bio *bio = r10_bio->master_bio;
283 int done;
284 struct r10conf *conf = r10_bio->mddev->private;
285
286 if (bio->bi_phys_segments) {
287 unsigned long flags;
288 spin_lock_irqsave(&conf->device_lock, flags);
289 bio->bi_phys_segments--;
290 done = (bio->bi_phys_segments == 0);
291 spin_unlock_irqrestore(&conf->device_lock, flags);
292 } else
293 done = 1;
294 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
295 clear_bit(BIO_UPTODATE, &bio->bi_flags);
296 if (done) {
297 bio_endio(bio, 0);
298
299
300
301
302 allow_barrier(conf);
303 }
304 free_r10bio(r10_bio);
305}
306
307
308
309
310static inline void update_head_pos(int slot, struct r10bio *r10_bio)
311{
312 struct r10conf *conf = r10_bio->mddev->private;
313
314 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
315 r10_bio->devs[slot].addr + (r10_bio->sectors);
316}
317
318
319
320
321static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
322 struct bio *bio, int *slotp, int *replp)
323{
324 int slot;
325 int repl = 0;
326
327 for (slot = 0; slot < conf->copies; slot++) {
328 if (r10_bio->devs[slot].bio == bio)
329 break;
330 if (r10_bio->devs[slot].repl_bio == bio) {
331 repl = 1;
332 break;
333 }
334 }
335
336 BUG_ON(slot == conf->copies);
337 update_head_pos(slot, r10_bio);
338
339 if (slotp)
340 *slotp = slot;
341 if (replp)
342 *replp = repl;
343 return r10_bio->devs[slot].devnum;
344}
345
346static void raid10_end_read_request(struct bio *bio, int error)
347{
348 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
349 struct r10bio *r10_bio = bio->bi_private;
350 int slot, dev;
351 struct md_rdev *rdev;
352 struct r10conf *conf = r10_bio->mddev->private;
353
354
355 slot = r10_bio->read_slot;
356 dev = r10_bio->devs[slot].devnum;
357 rdev = r10_bio->devs[slot].rdev;
358
359
360
361 update_head_pos(slot, r10_bio);
362
363 if (uptodate) {
364
365
366
367
368
369
370
371
372
373 set_bit(R10BIO_Uptodate, &r10_bio->state);
374 } else {
375
376
377
378
379
380 unsigned long flags;
381 spin_lock_irqsave(&conf->device_lock, flags);
382 if (!enough(conf, rdev->raid_disk))
383 uptodate = 1;
384 spin_unlock_irqrestore(&conf->device_lock, flags);
385 }
386 if (uptodate) {
387 raid_end_bio_io(r10_bio);
388 rdev_dec_pending(rdev, conf->mddev);
389 } else {
390
391
392
393 char b[BDEVNAME_SIZE];
394 printk_ratelimited(KERN_ERR
395 "md/raid10:%s: %s: rescheduling sector %llu\n",
396 mdname(conf->mddev),
397 bdevname(rdev->bdev, b),
398 (unsigned long long)r10_bio->sector);
399 set_bit(R10BIO_ReadError, &r10_bio->state);
400 reschedule_retry(r10_bio);
401 }
402}
403
404static void close_write(struct r10bio *r10_bio)
405{
406
407 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
408 r10_bio->sectors,
409 !test_bit(R10BIO_Degraded, &r10_bio->state),
410 0);
411 md_write_end(r10_bio->mddev);
412}
413
414static void one_write_done(struct r10bio *r10_bio)
415{
416 if (atomic_dec_and_test(&r10_bio->remaining)) {
417 if (test_bit(R10BIO_WriteError, &r10_bio->state))
418 reschedule_retry(r10_bio);
419 else {
420 close_write(r10_bio);
421 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
422 reschedule_retry(r10_bio);
423 else
424 raid_end_bio_io(r10_bio);
425 }
426 }
427}
428
429static void raid10_end_write_request(struct bio *bio, int error)
430{
431 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
432 struct r10bio *r10_bio = bio->bi_private;
433 int dev;
434 int dec_rdev = 1;
435 struct r10conf *conf = r10_bio->mddev->private;
436 int slot, repl;
437 struct md_rdev *rdev = NULL;
438
439 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
440
441 if (repl)
442 rdev = conf->mirrors[dev].replacement;
443 if (!rdev) {
444 smp_rmb();
445 repl = 0;
446 rdev = conf->mirrors[dev].rdev;
447 }
448
449
450
451 if (!uptodate) {
452 if (repl)
453
454
455
456 md_error(rdev->mddev, rdev);
457 else {
458 set_bit(WriteErrorSeen, &rdev->flags);
459 if (!test_and_set_bit(WantReplacement, &rdev->flags))
460 set_bit(MD_RECOVERY_NEEDED,
461 &rdev->mddev->recovery);
462 set_bit(R10BIO_WriteError, &r10_bio->state);
463 dec_rdev = 0;
464 }
465 } else {
466
467
468
469
470
471
472
473
474
475 sector_t first_bad;
476 int bad_sectors;
477
478 set_bit(R10BIO_Uptodate, &r10_bio->state);
479
480
481 if (is_badblock(rdev,
482 r10_bio->devs[slot].addr,
483 r10_bio->sectors,
484 &first_bad, &bad_sectors)) {
485 bio_put(bio);
486 if (repl)
487 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
488 else
489 r10_bio->devs[slot].bio = IO_MADE_GOOD;
490 dec_rdev = 0;
491 set_bit(R10BIO_MadeGood, &r10_bio->state);
492 }
493 }
494
495
496
497
498
499
500 one_write_done(r10_bio);
501 if (dec_rdev)
502 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
503}
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
531{
532 int n,f;
533 sector_t sector;
534 sector_t chunk;
535 sector_t stripe;
536 int dev;
537 int slot = 0;
538
539
540 chunk = r10bio->sector >> geo->chunk_shift;
541 sector = r10bio->sector & geo->chunk_mask;
542
543 chunk *= geo->near_copies;
544 stripe = chunk;
545 dev = sector_div(stripe, geo->raid_disks);
546 if (geo->far_offset)
547 stripe *= geo->far_copies;
548
549 sector += stripe << geo->chunk_shift;
550
551
552 for (n = 0; n < geo->near_copies; n++) {
553 int d = dev;
554 sector_t s = sector;
555 r10bio->devs[slot].addr = sector;
556 r10bio->devs[slot].devnum = d;
557 slot++;
558
559 for (f = 1; f < geo->far_copies; f++) {
560 d += geo->near_copies;
561 if (d >= geo->raid_disks)
562 d -= geo->raid_disks;
563 s += geo->stride;
564 r10bio->devs[slot].devnum = d;
565 r10bio->devs[slot].addr = s;
566 slot++;
567 }
568 dev++;
569 if (dev >= geo->raid_disks) {
570 dev = 0;
571 sector += (geo->chunk_mask + 1);
572 }
573 }
574}
575
576static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
577{
578 struct geom *geo = &conf->geo;
579
580 if (conf->reshape_progress != MaxSector &&
581 ((r10bio->sector >= conf->reshape_progress) !=
582 conf->mddev->reshape_backwards)) {
583 set_bit(R10BIO_Previous, &r10bio->state);
584 geo = &conf->prev;
585 } else
586 clear_bit(R10BIO_Previous, &r10bio->state);
587
588 __raid10_find_phys(geo, r10bio);
589}
590
591static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
592{
593 sector_t offset, chunk, vchunk;
594
595
596
597 struct geom *geo = &conf->geo;
598
599 offset = sector & geo->chunk_mask;
600 if (geo->far_offset) {
601 int fc;
602 chunk = sector >> geo->chunk_shift;
603 fc = sector_div(chunk, geo->far_copies);
604 dev -= fc * geo->near_copies;
605 if (dev < 0)
606 dev += geo->raid_disks;
607 } else {
608 while (sector >= geo->stride) {
609 sector -= geo->stride;
610 if (dev < geo->near_copies)
611 dev += geo->raid_disks - geo->near_copies;
612 else
613 dev -= geo->near_copies;
614 }
615 chunk = sector >> geo->chunk_shift;
616 }
617 vchunk = chunk * geo->raid_disks + dev;
618 sector_div(vchunk, geo->near_copies);
619 return (vchunk << geo->chunk_shift) + offset;
620}
621
622
623
624
625
626
627
628
629
630
631
/*
 * raid10_mergeable_bvec() - tell the block layer how many bytes of @biovec
 * may be added to the bio described by @bvm.
 *
 * @q:      the array's request queue; q->queuedata is the mddev.
 * @bvm:    start sector, device and current size of the bio being built.
 *          NOTE: bi_sector and bi_bdev are overwritten when member queues
 *          are consulted below.
 * @biovec: the segment the caller wants to add.
 *
 * Returns the number of bytes that may be added.  A bio that is still empty
 * is allowed to take the whole segment even if it does not fit the chunk.
 */
632static int raid10_mergeable_bvec(struct request_queue *q,
633 struct bvec_merge_data *bvm,
634 struct bio_vec *biovec)
635{
636 struct mddev *mddev = q->queuedata;
637 struct r10conf *conf = mddev->private;
638 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
639 int max;
640 unsigned int chunk_sectors;
641 unsigned int bio_sectors = bvm->bi_size >> 9;
642 struct geom *geo = &conf->geo;
643
 /* Chunk size derived from both the current and the previous geometry. */
644 chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
 /* Same geometry selection rule as raid10_find_phys(). */
645 if (conf->reshape_progress != MaxSector &&
646 ((sector >= conf->reshape_progress) !=
647 conf->mddev->reshape_backwards))
648 geo = &conf->prev;
649
650 if (geo->near_copies < geo->raid_disks) {
 /* Bytes left in the current chunk after what the bio already holds. */
651 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
652 + bio_sectors)) << 9;
653 if (max < 0)
654
 /* never hand a negative byte count back to the caller */
655 max = 0;
656 if (max <= biovec->bv_len && bio_sectors == 0)
657 return biovec->bv_len;
658 } else
 /* Every device holds a copy of every chunk: no chunk limit applies. */
659 max = biovec->bv_len;
660
661 if (mddev->merge_check_needed) {
 /*
  * Scratch r10bio on the stack; 'devs' supplies storage for the
  * conf->copies trailing entries that raid10_find_phys() fills in.
  */
662 struct {
663 struct r10bio r10_bio;
664 struct r10dev devs[conf->copies];
665 } on_stack;
666 struct r10bio *r10_bio = &on_stack.r10_bio;
667 int s;
668 if (conf->reshape_progress != MaxSector) {
669
 /* With a reshape position set, only an empty bio may grow. */
670 if (max <= biovec->bv_len && bio_sectors == 0)
671 return biovec->bv_len;
672 return 0;
673 }
674 r10_bio->sector = sector;
675 raid10_find_phys(conf, r10_bio);
676 rcu_read_lock();
 /*
  * Ask every non-faulty member (main device and replacement) that
  * has its own merge_bvec_fn, and keep the smallest answer.
  */
677 for (s = 0; s < conf->copies; s++) {
678 int disk = r10_bio->devs[s].devnum;
679 struct md_rdev *rdev = rcu_dereference(
680 conf->mirrors[disk].rdev);
681 if (rdev && !test_bit(Faulty, &rdev->flags)) {
682 struct request_queue *q =
683 bdev_get_queue(rdev->bdev);
684 if (q->merge_bvec_fn) {
685 bvm->bi_sector = r10_bio->devs[s].addr
686 + rdev->data_offset;
687 bvm->bi_bdev = rdev->bdev;
688 max = min(max, q->merge_bvec_fn(
689 q, bvm, biovec));
690 }
691 }
692 rdev = rcu_dereference(conf->mirrors[disk].replacement);
693 if (rdev && !test_bit(Faulty, &rdev->flags)) {
694 struct request_queue *q =
695 bdev_get_queue(rdev->bdev);
696 if (q->merge_bvec_fn) {
697 bvm->bi_sector = r10_bio->devs[s].addr
698 + rdev->data_offset;
699 bvm->bi_bdev = rdev->bdev;
700 max = min(max, q->merge_bvec_fn(
701 q, bvm, biovec));
702 }
703 }
704 }
705 rcu_read_unlock();
706 }
707 return max;
708}
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
/*
 * read_balance() - choose the device to read r10_bio from.
 *
 * @conf:        array configuration.
 * @r10_bio:     the request; its devs[] mapping is (re)computed here and
 *               read_slot is set to the chosen slot on success.
 * @max_sectors: receives how many sectors, counted from the start of the
 *               request, can be read from the chosen device.
 *
 * Returns the chosen rdev with its nr_pending count raised, or NULL when no
 * slot is usable.
 */
729static struct md_rdev *read_balance(struct r10conf *conf,
730 struct r10bio *r10_bio,
731 int *max_sectors)
732{
733 const sector_t this_sector = r10_bio->sector;
734 int disk, slot;
735 int sectors = r10_bio->sectors;
736 int best_good_sectors;
737 sector_t new_distance, best_dist;
738 struct md_rdev *best_rdev, *rdev = NULL;
739 int do_balance;
740 int best_slot;
741 struct geom *geo = &conf->geo;
742
743 raid10_find_phys(conf, r10_bio);
744 rcu_read_lock();
745retry:
 /* (Re)start the search from scratch. */
746 sectors = r10_bio->sectors;
747 best_slot = -1;
748 best_rdev = NULL;
749 best_dist = MaxSector;
750 best_good_sectors = 0;
751 do_balance = 1;
752
753
754
755
756
757
 /*
  * While a resync checkpoint is set and the read reaches
  * conf->next_resync, do not balance: the first usable slot is taken.
  */
758 if (conf->mddev->recovery_cp < MaxSector
759 && (this_sector + sectors >= conf->next_resync))
760 do_balance = 0;
761
762 for (slot = 0; slot < conf->copies ; slot++) {
763 sector_t first_bad;
764 int bad_sectors;
765 sector_t dev_sector;
766
767 if (r10_bio->devs[slot].bio == IO_BLOCKED)
768 continue;
769 disk = r10_bio->devs[slot].devnum;
 /*
  * Try the replacement first; fall back to the main device when
  * the replacement is missing, Faulty, Unmerged or not yet
  * recovered up to the end of this range.
  */
770 rdev = rcu_dereference(conf->mirrors[disk].replacement);
771 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
772 test_bit(Unmerged, &rdev->flags) ||
773 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
774 rdev = rcu_dereference(conf->mirrors[disk].rdev);
775 if (rdev == NULL ||
776 test_bit(Faulty, &rdev->flags) ||
777 test_bit(Unmerged, &rdev->flags))
778 continue;
 /* A device that is not In_sync is only usable below recovery_offset. */
779 if (!test_bit(In_sync, &rdev->flags) &&
780 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
781 continue;
782
783 dev_sector = r10_bio->devs[slot].addr;
784 if (is_badblock(rdev, dev_sector, sectors,
785 &first_bad, &bad_sectors)) {
786 if (best_dist < MaxSector)
787
 /* a slot without bad blocks has already been found */
788 continue;
789 if (first_bad <= dev_sector) {
790
791
792
793
 /*
  * The range starts inside a bad block, so this
  * device cannot be read.  bad_sectors becomes the
  * bad length measured from dev_sector; when not
  * balancing, the request length considered for the
  * remaining slots is capped to it.
  */
794 bad_sectors -= (dev_sector - first_bad);
795 if (!do_balance && sectors > bad_sectors)
796 sectors = bad_sectors;
797 if (best_good_sectors > sectors)
798 best_good_sectors = sectors;
799 } else {
 /*
  * Only the leading part is readable; remember the
  * slot with the longest readable prefix.
  */
800 sector_t good_sectors =
801 first_bad - dev_sector;
802 if (good_sectors > best_good_sectors) {
803 best_good_sectors = good_sectors;
804 best_slot = slot;
805 best_rdev = rdev;
806 }
807 if (!do_balance)
808
 /* not balancing: settle for this slot */
809 break;
810 }
811 continue;
812 } else
813 best_good_sectors = sectors;
814
815 if (!do_balance)
816 break;
817
818
819
820
821
 /* With near copies, an idle device is taken immediately. */
822 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
823 break;
824
825
 /*
  * With far copies prefer the lowest device address; otherwise
  * prefer the device whose recorded head position is closest.
  */
826 if (geo->far_copies > 1)
827 new_distance = r10_bio->devs[slot].addr;
828 else
829 new_distance = abs(r10_bio->devs[slot].addr -
830 conf->mirrors[disk].head_position);
831 if (new_distance < best_dist) {
832 best_dist = new_distance;
833 best_slot = slot;
834 best_rdev = rdev;
835 }
836 }
 /*
  * The loop ran to completion without an early 'break': fall back to
  * the best candidate recorded on the way (best_slot is -1 if none).
  */
837 if (slot >= conf->copies) {
838 slot = best_slot;
839 rdev = best_rdev;
840 }
841
842 if (slot >= 0) {
843 atomic_inc(&rdev->nr_pending);
844 if (test_bit(Faulty, &rdev->flags)) {
845
846
847
 /*
  * The device turned Faulty before the reference was
  * taken: drop it and search again.
  */
848 rdev_dec_pending(rdev, conf->mddev);
849 goto retry;
850 }
851 r10_bio->read_slot = slot;
852 } else
853 rdev = NULL;
854 rcu_read_unlock();
855 *max_sectors = best_good_sectors;
856
857 return rdev;
858}
859
860int md_raid10_congested(struct mddev *mddev, int bits)
861{
862 struct r10conf *conf = mddev->private;
863 int i, ret = 0;
864
865 if ((bits & (1 << BDI_async_congested)) &&
866 conf->pending_count >= max_queued_requests)
867 return 1;
868
869 rcu_read_lock();
870 for (i = 0;
871 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
872 && ret == 0;
873 i++) {
874 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
875 if (rdev && !test_bit(Faulty, &rdev->flags)) {
876 struct request_queue *q = bdev_get_queue(rdev->bdev);
877
878 ret |= bdi_congested(&q->backing_dev_info, bits);
879 }
880 }
881 rcu_read_unlock();
882 return ret;
883}
884EXPORT_SYMBOL_GPL(md_raid10_congested);
885
/*
 * raid10_congested() - congestion callback taking the mddev as opaque @data.
 * Returns 1 if either the generic md check or the raid10 check reports
 * congestion, 0 otherwise.
 */
static int raid10_congested(void *data, int bits)
{
	struct mddev *mddev = data;

	if (mddev_congested(mddev, bits))
		return 1;
	return md_raid10_congested(mddev, bits) != 0;
}
893
894static void flush_pending_writes(struct r10conf *conf)
895{
896
897
898
899 spin_lock_irq(&conf->device_lock);
900
901 if (conf->pending_bio_list.head) {
902 struct bio *bio;
903 bio = bio_list_get(&conf->pending_bio_list);
904 conf->pending_count = 0;
905 spin_unlock_irq(&conf->device_lock);
906
907
908 bitmap_unplug(conf->mddev->bitmap);
909 wake_up(&conf->wait_barrier);
910
911 while (bio) {
912 struct bio *next = bio->bi_next;
913 bio->bi_next = NULL;
914 generic_make_request(bio);
915 bio = next;
916 }
917 } else
918 spin_unlock_irq(&conf->device_lock);
919}
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943static void raise_barrier(struct r10conf *conf, int force)
944{
945 BUG_ON(force && !conf->barrier);
946 spin_lock_irq(&conf->resync_lock);
947
948
949 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
950 conf->resync_lock, );
951
952
953 conf->barrier++;
954
955
956 wait_event_lock_irq(conf->wait_barrier,
957 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
958 conf->resync_lock, );
959
960 spin_unlock_irq(&conf->resync_lock);
961}
962
963static void lower_barrier(struct r10conf *conf)
964{
965 unsigned long flags;
966 spin_lock_irqsave(&conf->resync_lock, flags);
967 conf->barrier--;
968 spin_unlock_irqrestore(&conf->resync_lock, flags);
969 wake_up(&conf->wait_barrier);
970}
971
972static void wait_barrier(struct r10conf *conf)
973{
974 spin_lock_irq(&conf->resync_lock);
975 if (conf->barrier) {
976 conf->nr_waiting++;
977
978
979
980
981
982
983
984
985
986 wait_event_lock_irq(conf->wait_barrier,
987 !conf->barrier ||
988 (conf->nr_pending &&
989 current->bio_list &&
990 !bio_list_empty(current->bio_list)),
991 conf->resync_lock,
992 );
993 conf->nr_waiting--;
994 }
995 conf->nr_pending++;
996 spin_unlock_irq(&conf->resync_lock);
997}
998
999static void allow_barrier(struct r10conf *conf)
1000{
1001 unsigned long flags;
1002 spin_lock_irqsave(&conf->resync_lock, flags);
1003 conf->nr_pending--;
1004 spin_unlock_irqrestore(&conf->resync_lock, flags);
1005 wake_up(&conf->wait_barrier);
1006}
1007
1008static void freeze_array(struct r10conf *conf)
1009{
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022 spin_lock_irq(&conf->resync_lock);
1023 conf->barrier++;
1024 conf->nr_waiting++;
1025 wait_event_lock_irq(conf->wait_barrier,
1026 conf->nr_pending == conf->nr_queued+1,
1027 conf->resync_lock,
1028 flush_pending_writes(conf));
1029
1030 spin_unlock_irq(&conf->resync_lock);
1031}
1032
1033static void unfreeze_array(struct r10conf *conf)
1034{
1035
1036 spin_lock_irq(&conf->resync_lock);
1037 conf->barrier--;
1038 conf->nr_waiting--;
1039 wake_up(&conf->wait_barrier);
1040 spin_unlock_irq(&conf->resync_lock);
1041}
1042
1043static sector_t choose_data_offset(struct r10bio *r10_bio,
1044 struct md_rdev *rdev)
1045{
1046 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1047 test_bit(R10BIO_Previous, &r10_bio->state))
1048 return rdev->data_offset;
1049 else
1050 return rdev->new_data_offset;
1051}
1052
1053static void make_request(struct mddev *mddev, struct bio * bio)
1054{
1055 struct r10conf *conf = mddev->private;
1056 struct r10bio *r10_bio;
1057 struct bio *read_bio;
1058 int i;
1059 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1060 int chunk_sects = chunk_mask + 1;
1061 const int rw = bio_data_dir(bio);
1062 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1063 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
1064 unsigned long flags;
1065 struct md_rdev *blocked_rdev;
1066 int sectors_handled;
1067 int max_sectors;
1068 int sectors;
1069
1070 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1071 md_flush_request(mddev, bio);
1072 return;
1073 }
1074
1075
1076
1077
1078 if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9)
1079 > chunk_sects
1080 && (conf->geo.near_copies < conf->geo.raid_disks
1081 || conf->prev.near_copies < conf->prev.raid_disks))) {
1082 struct bio_pair *bp;
1083
1084 if (bio->bi_vcnt != 1 ||
1085 bio->bi_idx != 0)
1086 goto bad_map;
1087
1088
1089
1090 bp = bio_split(bio,
1091 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101 spin_lock_irq(&conf->resync_lock);
1102 conf->nr_waiting++;
1103 spin_unlock_irq(&conf->resync_lock);
1104
1105 make_request(mddev, &bp->bio1);
1106 make_request(mddev, &bp->bio2);
1107
1108 spin_lock_irq(&conf->resync_lock);
1109 conf->nr_waiting--;
1110 wake_up(&conf->wait_barrier);
1111 spin_unlock_irq(&conf->resync_lock);
1112
1113 bio_pair_release(bp);
1114 return;
1115 bad_map:
1116 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
1117 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
1118 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
1119
1120 bio_io_error(bio);
1121 return;
1122 }
1123
1124 md_write_start(mddev, bio);
1125
1126
1127
1128
1129
1130
1131 wait_barrier(conf);
1132
1133 sectors = bio->bi_size >> 9;
1134 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1135 bio->bi_sector < conf->reshape_progress &&
1136 bio->bi_sector + sectors > conf->reshape_progress) {
1137
1138
1139
1140 allow_barrier(conf);
1141 wait_event(conf->wait_barrier,
1142 conf->reshape_progress <= bio->bi_sector ||
1143 conf->reshape_progress >= bio->bi_sector + sectors);
1144 wait_barrier(conf);
1145 }
1146 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1147 bio_data_dir(bio) == WRITE &&
1148 (mddev->reshape_backwards
1149 ? (bio->bi_sector < conf->reshape_safe &&
1150 bio->bi_sector + sectors > conf->reshape_progress)
1151 : (bio->bi_sector + sectors > conf->reshape_safe &&
1152 bio->bi_sector < conf->reshape_progress))) {
1153
1154 mddev->reshape_position = conf->reshape_progress;
1155 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1156 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1157 md_wakeup_thread(mddev->thread);
1158 wait_event(mddev->sb_wait,
1159 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
1160
1161 conf->reshape_safe = mddev->reshape_position;
1162 }
1163
1164 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1165
1166 r10_bio->master_bio = bio;
1167 r10_bio->sectors = sectors;
1168
1169 r10_bio->mddev = mddev;
1170 r10_bio->sector = bio->bi_sector;
1171 r10_bio->state = 0;
1172
1173
1174
1175
1176
1177
1178
1179
1180 bio->bi_phys_segments = 0;
1181 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1182
1183 if (rw == READ) {
1184
1185
1186
1187 struct md_rdev *rdev;
1188 int slot;
1189
1190read_again:
1191 rdev = read_balance(conf, r10_bio, &max_sectors);
1192 if (!rdev) {
1193 raid_end_bio_io(r10_bio);
1194 return;
1195 }
1196 slot = r10_bio->read_slot;
1197
1198 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1199 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
1200 max_sectors);
1201
1202 r10_bio->devs[slot].bio = read_bio;
1203 r10_bio->devs[slot].rdev = rdev;
1204
1205 read_bio->bi_sector = r10_bio->devs[slot].addr +
1206 choose_data_offset(r10_bio, rdev);
1207 read_bio->bi_bdev = rdev->bdev;
1208 read_bio->bi_end_io = raid10_end_read_request;
1209 read_bio->bi_rw = READ | do_sync;
1210 read_bio->bi_private = r10_bio;
1211
1212 if (max_sectors < r10_bio->sectors) {
1213
1214
1215
1216 sectors_handled = (r10_bio->sectors + max_sectors
1217 - bio->bi_sector);
1218 r10_bio->sectors = max_sectors;
1219 spin_lock_irq(&conf->device_lock);
1220 if (bio->bi_phys_segments == 0)
1221 bio->bi_phys_segments = 2;
1222 else
1223 bio->bi_phys_segments++;
1224 spin_unlock(&conf->device_lock);
1225
1226
1227
1228
1229
1230 reschedule_retry(r10_bio);
1231
1232 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1233
1234 r10_bio->master_bio = bio;
1235 r10_bio->sectors = ((bio->bi_size >> 9)
1236 - sectors_handled);
1237 r10_bio->state = 0;
1238 r10_bio->mddev = mddev;
1239 r10_bio->sector = bio->bi_sector + sectors_handled;
1240 goto read_again;
1241 } else
1242 generic_make_request(read_bio);
1243 return;
1244 }
1245
1246
1247
1248
1249 if (conf->pending_count >= max_queued_requests) {
1250 md_wakeup_thread(mddev->thread);
1251 wait_event(conf->wait_barrier,
1252 conf->pending_count < max_queued_requests);
1253 }
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266 r10_bio->read_slot = -1;
1267 raid10_find_phys(conf, r10_bio);
1268retry_write:
1269 blocked_rdev = NULL;
1270 rcu_read_lock();
1271 max_sectors = r10_bio->sectors;
1272
1273 for (i = 0; i < conf->copies; i++) {
1274 int d = r10_bio->devs[i].devnum;
1275 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1276 struct md_rdev *rrdev = rcu_dereference(
1277 conf->mirrors[d].replacement);
1278 if (rdev == rrdev)
1279 rrdev = NULL;
1280 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1281 atomic_inc(&rdev->nr_pending);
1282 blocked_rdev = rdev;
1283 break;
1284 }
1285 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1286 atomic_inc(&rrdev->nr_pending);
1287 blocked_rdev = rrdev;
1288 break;
1289 }
1290 if (rrdev && (test_bit(Faulty, &rrdev->flags)
1291 || test_bit(Unmerged, &rrdev->flags)))
1292 rrdev = NULL;
1293
1294 r10_bio->devs[i].bio = NULL;
1295 r10_bio->devs[i].repl_bio = NULL;
1296 if (!rdev || test_bit(Faulty, &rdev->flags) ||
1297 test_bit(Unmerged, &rdev->flags)) {
1298 set_bit(R10BIO_Degraded, &r10_bio->state);
1299 continue;
1300 }
1301 if (test_bit(WriteErrorSeen, &rdev->flags)) {
1302 sector_t first_bad;
1303 sector_t dev_sector = r10_bio->devs[i].addr;
1304 int bad_sectors;
1305 int is_bad;
1306
1307 is_bad = is_badblock(rdev, dev_sector,
1308 max_sectors,
1309 &first_bad, &bad_sectors);
1310 if (is_bad < 0) {
1311
1312
1313
1314 atomic_inc(&rdev->nr_pending);
1315 set_bit(BlockedBadBlocks, &rdev->flags);
1316 blocked_rdev = rdev;
1317 break;
1318 }
1319 if (is_bad && first_bad <= dev_sector) {
1320
1321 bad_sectors -= (dev_sector - first_bad);
1322 if (bad_sectors < max_sectors)
1323
1324
1325
1326 max_sectors = bad_sectors;
1327
1328
1329
1330
1331
1332
1333
1334
1335 continue;
1336 }
1337 if (is_bad) {
1338 int good_sectors = first_bad - dev_sector;
1339 if (good_sectors < max_sectors)
1340 max_sectors = good_sectors;
1341 }
1342 }
1343 r10_bio->devs[i].bio = bio;
1344 atomic_inc(&rdev->nr_pending);
1345 if (rrdev) {
1346 r10_bio->devs[i].repl_bio = bio;
1347 atomic_inc(&rrdev->nr_pending);
1348 }
1349 }
1350 rcu_read_unlock();
1351
1352 if (unlikely(blocked_rdev)) {
1353
1354 int j;
1355 int d;
1356
1357 for (j = 0; j < i; j++) {
1358 if (r10_bio->devs[j].bio) {
1359 d = r10_bio->devs[j].devnum;
1360 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1361 }
1362 if (r10_bio->devs[j].repl_bio) {
1363 struct md_rdev *rdev;
1364 d = r10_bio->devs[j].devnum;
1365 rdev = conf->mirrors[d].replacement;
1366 if (!rdev) {
1367
1368 smp_mb();
1369 rdev = conf->mirrors[d].rdev;
1370 }
1371 rdev_dec_pending(rdev, mddev);
1372 }
1373 }
1374 allow_barrier(conf);
1375 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1376 wait_barrier(conf);
1377 goto retry_write;
1378 }
1379
1380 if (max_sectors < r10_bio->sectors) {
1381
1382
1383
1384 r10_bio->sectors = max_sectors;
1385 spin_lock_irq(&conf->device_lock);
1386 if (bio->bi_phys_segments == 0)
1387 bio->bi_phys_segments = 2;
1388 else
1389 bio->bi_phys_segments++;
1390 spin_unlock_irq(&conf->device_lock);
1391 }
1392 sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
1393
1394 atomic_set(&r10_bio->remaining, 1);
1395 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1396
1397 for (i = 0; i < conf->copies; i++) {
1398 struct bio *mbio;
1399 int d = r10_bio->devs[i].devnum;
1400 if (!r10_bio->devs[i].bio)
1401 continue;
1402
1403 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1404 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1405 max_sectors);
1406 r10_bio->devs[i].bio = mbio;
1407
1408 mbio->bi_sector = (r10_bio->devs[i].addr+
1409 choose_data_offset(r10_bio,
1410 conf->mirrors[d].rdev));
1411 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1412 mbio->bi_end_io = raid10_end_write_request;
1413 mbio->bi_rw = WRITE | do_sync | do_fua;
1414 mbio->bi_private = r10_bio;
1415
1416 atomic_inc(&r10_bio->remaining);
1417 spin_lock_irqsave(&conf->device_lock, flags);
1418 bio_list_add(&conf->pending_bio_list, mbio);
1419 conf->pending_count++;
1420 spin_unlock_irqrestore(&conf->device_lock, flags);
1421 if (!mddev_check_plugged(mddev))
1422 md_wakeup_thread(mddev->thread);
1423
1424 if (!r10_bio->devs[i].repl_bio)
1425 continue;
1426
1427 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1428 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1429 max_sectors);
1430 r10_bio->devs[i].repl_bio = mbio;
1431
1432
1433
1434
1435
1436 mbio->bi_sector = (r10_bio->devs[i].addr +
1437 choose_data_offset(
1438 r10_bio,
1439 conf->mirrors[d].replacement));
1440 mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
1441 mbio->bi_end_io = raid10_end_write_request;
1442 mbio->bi_rw = WRITE | do_sync | do_fua;
1443 mbio->bi_private = r10_bio;
1444
1445 atomic_inc(&r10_bio->remaining);
1446 spin_lock_irqsave(&conf->device_lock, flags);
1447 bio_list_add(&conf->pending_bio_list, mbio);
1448 conf->pending_count++;
1449 spin_unlock_irqrestore(&conf->device_lock, flags);
1450 if (!mddev_check_plugged(mddev))
1451 md_wakeup_thread(mddev->thread);
1452 }
1453
1454
1455
1456
1457
1458 if (sectors_handled < (bio->bi_size >> 9)) {
1459 one_write_done(r10_bio);
1460
1461
1462
1463 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1464
1465 r10_bio->master_bio = bio;
1466 r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
1467
1468 r10_bio->mddev = mddev;
1469 r10_bio->sector = bio->bi_sector + sectors_handled;
1470 r10_bio->state = 0;
1471 goto retry_write;
1472 }
1473 one_write_done(r10_bio);
1474
1475
1476 wake_up(&conf->wait_barrier);
1477}
1478
1479static void status(struct seq_file *seq, struct mddev *mddev)
1480{
1481 struct r10conf *conf = mddev->private;
1482 int i;
1483
1484 if (conf->geo.near_copies < conf->geo.raid_disks)
1485 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1486 if (conf->geo.near_copies > 1)
1487 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1488 if (conf->geo.far_copies > 1) {
1489 if (conf->geo.far_offset)
1490 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1491 else
1492 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1493 }
1494 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1495 conf->geo.raid_disks - mddev->degraded);
1496 for (i = 0; i < conf->geo.raid_disks; i++)
1497 seq_printf(seq, "%s",
1498 conf->mirrors[i].rdev &&
1499 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
1500 seq_printf(seq, "]");
1501}
1502
1503
1504
1505
1506
1507
1508static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
1509{
1510 int first = 0;
1511
1512 do {
1513 int n = conf->copies;
1514 int cnt = 0;
1515 int this = first;
1516 while (n--) {
1517 if (conf->mirrors[this].rdev &&
1518 this != ignore)
1519 cnt++;
1520 this = (this+1) % geo->raid_disks;
1521 }
1522 if (cnt == 0)
1523 return 0;
1524 first = (first + geo->near_copies) % geo->raid_disks;
1525 } while (first != 0);
1526 return 1;
1527}
1528
1529static int enough(struct r10conf *conf, int ignore)
1530{
1531 return _enough(conf, &conf->geo, ignore) &&
1532 _enough(conf, &conf->prev, ignore);
1533}
1534
/*
 * Mark @rdev as failed.
 *
 * If @rdev is currently In_sync and enough() reports that the array
 * would not have enough devices without it, the request is ignored and
 * the device is left untouched.  Otherwise the device is flagged
 * Blocked and Faulty, MD_CHANGE_DEVS is raised on the array, and a
 * message is logged.
 */
static void error(struct mddev *mddev, struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];
	struct r10conf *conf = mddev->private;

	/*
	 * Refuse to fail an in-sync device when enough() says the array
	 * cannot do without it.
	 */
	if (test_bit(In_sync, &rdev->flags)
	    && !enough(conf, rdev->raid_disk))
		return;
	if (test_and_clear_bit(In_sync, &rdev->flags)) {
		unsigned long flags;
		/* The degraded count is updated under device_lock. */
		spin_lock_irqsave(&conf->device_lock, flags);
		mddev->degraded++;
		spin_unlock_irqrestore(&conf->device_lock, flags);
		/*
		 * An in-sync device was lost: flag any running
		 * resync/recovery as interrupted.
		 */
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	}
	set_bit(Blocked, &rdev->flags);
	set_bit(Faulty, &rdev->flags);
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
	printk(KERN_ALERT
	       "md/raid10:%s: Disk failure on %s, disabling device.\n"
	       "md/raid10:%s: Operation continuing on %d devices.\n",
	       mdname(mddev), bdevname(rdev->bdev, b),
	       mdname(mddev), conf->geo.raid_disks - mddev->degraded);
}
1571
1572static void print_conf(struct r10conf *conf)
1573{
1574 int i;
1575 struct raid10_info *tmp;
1576
1577 printk(KERN_DEBUG "RAID10 conf printout:\n");
1578 if (!conf) {
1579 printk(KERN_DEBUG "(!conf)\n");
1580 return;
1581 }
1582 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1583 conf->geo.raid_disks);
1584
1585 for (i = 0; i < conf->geo.raid_disks; i++) {
1586 char b[BDEVNAME_SIZE];
1587 tmp = conf->mirrors + i;
1588 if (tmp->rdev)
1589 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1590 i, !test_bit(In_sync, &tmp->rdev->flags),
1591 !test_bit(Faulty, &tmp->rdev->flags),
1592 bdevname(tmp->rdev->bdev,b));
1593 }
1594}
1595
/*
 * Finish a resync: pass once through the barrier (wait_barrier()
 * immediately followed by allow_barrier()), then destroy the resync
 * buffer pool and clear the pointer so that a later sync_request()
 * sees no pool and sets one up again.
 */
static void close_sync(struct r10conf *conf)
{
	wait_barrier(conf);
	allow_barrier(conf);

	mempool_destroy(conf->r10buf_pool);
	conf->r10buf_pool = NULL;
}
1604
1605static int raid10_spare_active(struct mddev *mddev)
1606{
1607 int i;
1608 struct r10conf *conf = mddev->private;
1609 struct raid10_info *tmp;
1610 int count = 0;
1611 unsigned long flags;
1612
1613
1614
1615
1616
1617 for (i = 0; i < conf->geo.raid_disks; i++) {
1618 tmp = conf->mirrors + i;
1619 if (tmp->replacement
1620 && tmp->replacement->recovery_offset == MaxSector
1621 && !test_bit(Faulty, &tmp->replacement->flags)
1622 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1623
1624 if (!tmp->rdev
1625 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1626 count++;
1627 if (tmp->rdev) {
1628
1629
1630
1631
1632 set_bit(Faulty, &tmp->rdev->flags);
1633 sysfs_notify_dirent_safe(
1634 tmp->rdev->sysfs_state);
1635 }
1636 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1637 } else if (tmp->rdev
1638 && !test_bit(Faulty, &tmp->rdev->flags)
1639 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1640 count++;
1641 sysfs_notify_dirent(tmp->rdev->sysfs_state);
1642 }
1643 }
1644 spin_lock_irqsave(&conf->device_lock, flags);
1645 mddev->degraded -= count;
1646 spin_unlock_irqrestore(&conf->device_lock, flags);
1647
1648 print_conf(conf);
1649 return count;
1650}
1651
1652
1653static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1654{
1655 struct r10conf *conf = mddev->private;
1656 int err = -EEXIST;
1657 int mirror;
1658 int first = 0;
1659 int last = conf->geo.raid_disks - 1;
1660 struct request_queue *q = bdev_get_queue(rdev->bdev);
1661
1662 if (mddev->recovery_cp < MaxSector)
1663
1664
1665
1666 return -EBUSY;
1667 if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1))
1668 return -EINVAL;
1669
1670 if (rdev->raid_disk >= 0)
1671 first = last = rdev->raid_disk;
1672
1673 if (q->merge_bvec_fn) {
1674 set_bit(Unmerged, &rdev->flags);
1675 mddev->merge_check_needed = 1;
1676 }
1677
1678 if (rdev->saved_raid_disk >= first &&
1679 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1680 mirror = rdev->saved_raid_disk;
1681 else
1682 mirror = first;
1683 for ( ; mirror <= last ; mirror++) {
1684 struct raid10_info *p = &conf->mirrors[mirror];
1685 if (p->recovery_disabled == mddev->recovery_disabled)
1686 continue;
1687 if (p->rdev) {
1688 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1689 p->replacement != NULL)
1690 continue;
1691 clear_bit(In_sync, &rdev->flags);
1692 set_bit(Replacement, &rdev->flags);
1693 rdev->raid_disk = mirror;
1694 err = 0;
1695 disk_stack_limits(mddev->gendisk, rdev->bdev,
1696 rdev->data_offset << 9);
1697 conf->fullsync = 1;
1698 rcu_assign_pointer(p->replacement, rdev);
1699 break;
1700 }
1701
1702 disk_stack_limits(mddev->gendisk, rdev->bdev,
1703 rdev->data_offset << 9);
1704
1705 p->head_position = 0;
1706 p->recovery_disabled = mddev->recovery_disabled - 1;
1707 rdev->raid_disk = mirror;
1708 err = 0;
1709 if (rdev->saved_raid_disk != mirror)
1710 conf->fullsync = 1;
1711 rcu_assign_pointer(p->rdev, rdev);
1712 break;
1713 }
1714 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1715
1716
1717
1718
1719
1720
1721
1722 synchronize_sched();
1723 raise_barrier(conf, 0);
1724 lower_barrier(conf);
1725 clear_bit(Unmerged, &rdev->flags);
1726 }
1727 md_integrity_add_rdev(rdev, mddev);
1728 print_conf(conf);
1729 return err;
1730}
1731
/*
 * Detach @rdev from its slot (as primary device or as replacement).
 *
 * Returns 0 if the device was removed or is not attached in the slot
 * named by rdev->raid_disk, -EBUSY if it cannot be removed now, or the
 * result of md_integrity_register() after a successful removal.
 */
static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r10conf *conf = mddev->private;
	int err = 0;
	int number = rdev->raid_disk;
	struct md_rdev **rdevp;
	struct raid10_info *p = conf->mirrors + number;

	print_conf(conf);
	/* Work out which pointer in the slot refers to this device. */
	if (rdev == p->rdev)
		rdevp = &p->rdev;
	else if (rdev == p->replacement)
		rdevp = &p->replacement;
	else
		return 0;

	/* An in-sync device, or one with I/O pending, cannot be removed. */
	if (test_bit(In_sync, &rdev->flags) ||
	    atomic_read(&rdev->nr_pending)) {
		err = -EBUSY;
		goto abort;
	}
	/*
	 * Refuse to remove a healthy (non-Faulty) device when recovery
	 * onto this slot is not disabled, the slot has no other
	 * replacement, the slot is within the current geometry and the
	 * array has enough devices.
	 */
	if (!test_bit(Faulty, &rdev->flags) &&
	    mddev->recovery_disabled != p->recovery_disabled &&
	    (!p->replacement || p->replacement == rdev) &&
	    number < conf->geo.raid_disks &&
	    enough(conf, -1)) {
		err = -EBUSY;
		goto abort;
	}
	/* Unpublish the pointer and wait for RCU readers to finish. */
	*rdevp = NULL;
	synchronize_rcu();
	if (atomic_read(&rdev->nr_pending)) {
		/* A reference was taken in the meantime: put it back. */
		err = -EBUSY;
		*rdevp = rdev;
		goto abort;
	} else if (p->replacement) {
		/* The primary was removed: promote the replacement. */
		p->rdev = p->replacement;
		clear_bit(Replacement, &p->replacement->flags);
		/*
		 * Barrier between the stores above and clearing
		 * ->replacement below.
		 */
		smp_mb();
		p->replacement = NULL;
		clear_bit(WantReplacement, &rdev->flags);
	} else
		/*
		 * Clear WantReplacement on the removed device whether or
		 * not it was set.
		 */
		clear_bit(WantReplacement, &rdev->flags);

	err = md_integrity_register(mddev);

abort:

	print_conf(conf);
	return err;
}
1793
1794
1795static void end_sync_read(struct bio *bio, int error)
1796{
1797 struct r10bio *r10_bio = bio->bi_private;
1798 struct r10conf *conf = r10_bio->mddev->private;
1799 int d;
1800
1801 if (bio == r10_bio->master_bio) {
1802
1803 d = r10_bio->read_slot;
1804 } else
1805 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1806
1807 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1808 set_bit(R10BIO_Uptodate, &r10_bio->state);
1809 else
1810
1811
1812
1813 atomic_add(r10_bio->sectors,
1814 &conf->mirrors[d].rdev->corrected_errors);
1815
1816
1817
1818
1819 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1820 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1821 atomic_dec_and_test(&r10_bio->remaining)) {
1822
1823
1824
1825 reschedule_retry(r10_bio);
1826 }
1827}
1828
1829static void end_sync_request(struct r10bio *r10_bio)
1830{
1831 struct mddev *mddev = r10_bio->mddev;
1832
1833 while (atomic_dec_and_test(&r10_bio->remaining)) {
1834 if (r10_bio->master_bio == NULL) {
1835
1836 sector_t s = r10_bio->sectors;
1837 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1838 test_bit(R10BIO_WriteError, &r10_bio->state))
1839 reschedule_retry(r10_bio);
1840 else
1841 put_buf(r10_bio);
1842 md_done_sync(mddev, s, 1);
1843 break;
1844 } else {
1845 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1846 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1847 test_bit(R10BIO_WriteError, &r10_bio->state))
1848 reschedule_retry(r10_bio);
1849 else
1850 put_buf(r10_bio);
1851 r10_bio = r10_bio2;
1852 }
1853 }
1854}
1855
1856static void end_sync_write(struct bio *bio, int error)
1857{
1858 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1859 struct r10bio *r10_bio = bio->bi_private;
1860 struct mddev *mddev = r10_bio->mddev;
1861 struct r10conf *conf = mddev->private;
1862 int d;
1863 sector_t first_bad;
1864 int bad_sectors;
1865 int slot;
1866 int repl;
1867 struct md_rdev *rdev = NULL;
1868
1869 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1870 if (repl)
1871 rdev = conf->mirrors[d].replacement;
1872 else
1873 rdev = conf->mirrors[d].rdev;
1874
1875 if (!uptodate) {
1876 if (repl)
1877 md_error(mddev, rdev);
1878 else {
1879 set_bit(WriteErrorSeen, &rdev->flags);
1880 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1881 set_bit(MD_RECOVERY_NEEDED,
1882 &rdev->mddev->recovery);
1883 set_bit(R10BIO_WriteError, &r10_bio->state);
1884 }
1885 } else if (is_badblock(rdev,
1886 r10_bio->devs[slot].addr,
1887 r10_bio->sectors,
1888 &first_bad, &bad_sectors))
1889 set_bit(R10BIO_MadeGood, &r10_bio->state);
1890
1891 rdev_dec_pending(rdev, mddev);
1892
1893 end_sync_request(r10_bio);
1894}
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
/*
 * Second half of a resync request, run once all reads have completed.
 *
 * The first copy that was read successfully is taken as the reference.
 * Every other copy that was read is compared with it page by page;
 * copies that failed to read, or that differ (unless this is a
 * check-only pass), are overwritten with the reference data.  Any
 * replacement bios that were set up are then written as well.
 */
static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
	struct r10conf *conf = mddev->private;
	int i, first;
	struct bio *tbio, *fbio;
	int vcnt;

	/* Hold one reference ourselves until all writes are submitted. */
	atomic_set(&r10_bio->remaining, 1);

	/* Find the first copy whose read succeeded. */
	for (i=0; i<conf->copies; i++)
		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
			break;

	/* No readable copy at all: nothing can be written. */
	if (i == conf->copies)
		goto done;

	first = i;
	fbio = r10_bio->devs[i].bio;

	/* Number of pages covering r10_bio->sectors. */
	vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
	/* Now compare every other copy against the reference. */
	for (i=0 ; i < conf->copies ; i++) {
		int j, d;

		tbio = r10_bio->devs[i].bio;

		/* Only bios that were issued as sync reads are considered. */
		if (tbio->bi_end_io != end_sync_read)
			continue;
		if (i == first)
			continue;
		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
			/*
			 * This copy was read successfully: compare it
			 * with the reference and skip the write if the
			 * two are identical.
			 */
			for (j = 0; j < vcnt; j++)
				if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
					   page_address(tbio->bi_io_vec[j].bv_page),
					   fbio->bi_io_vec[j].bv_len))
					break;
			if (j == vcnt)
				continue;
			mddev->resync_mismatches += r10_bio->sectors;
			if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
				/* Check-only pass: count but do not repair. */
				continue;
		}
		/*
		 * Either the read failed or the data differs: re-initialise
		 * the bio as a write of the reference data to this device.
		 */
		tbio->bi_vcnt = vcnt;
		tbio->bi_size = r10_bio->sectors << 9;
		tbio->bi_idx = 0;
		tbio->bi_phys_segments = 0;
		tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
		tbio->bi_flags |= 1 << BIO_UPTODATE;
		tbio->bi_next = NULL;
		tbio->bi_rw = WRITE;
		tbio->bi_private = r10_bio;
		tbio->bi_sector = r10_bio->devs[i].addr;

		for (j=0; j < vcnt ; j++) {
			tbio->bi_io_vec[j].bv_offset = 0;
			tbio->bi_io_vec[j].bv_len = PAGE_SIZE;

			memcpy(page_address(tbio->bi_io_vec[j].bv_page),
			       page_address(fbio->bi_io_vec[j].bv_page),
			       PAGE_SIZE);
		}
		tbio->bi_end_io = end_sync_write;

		d = r10_bio->devs[i].devnum;
		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
		atomic_inc(&r10_bio->remaining);
		md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);

		tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
		tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
		generic_make_request(tbio);
	}

	/*
	 * Now write out to any replacement devices that have a bio with
	 * an end_io handler set.
	 */
	for (i = 0; i < conf->copies; i++) {
		int j, d;

		tbio = r10_bio->devs[i].repl_bio;
		if (!tbio || !tbio->bi_end_io)
			continue;
		/*
		 * If the primary bio for this slot was not turned into a
		 * write above and is not the reference itself, copy the
		 * reference data into the replacement's pages.
		 */
		if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
		    && r10_bio->devs[i].bio != fbio)
			for (j = 0; j < vcnt; j++)
				memcpy(page_address(tbio->bi_io_vec[j].bv_page),
				       page_address(fbio->bi_io_vec[j].bv_page),
				       PAGE_SIZE);
		d = r10_bio->devs[i].devnum;
		atomic_inc(&r10_bio->remaining);
		md_sync_acct(conf->mirrors[d].replacement->bdev,
			     tbio->bi_size >> 9);
		generic_make_request(tbio);
	}

done:
	/* Drop our own reference; finish here if no write is outstanding. */
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		md_done_sync(mddev, r10_bio->sectors, 1);
		put_buf(r10_bio);
	}
}
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035static void fix_recovery_read_error(struct r10bio *r10_bio)
2036{
2037
2038
2039
2040
2041
2042
2043
2044 struct mddev *mddev = r10_bio->mddev;
2045 struct r10conf *conf = mddev->private;
2046 struct bio *bio = r10_bio->devs[0].bio;
2047 sector_t sect = 0;
2048 int sectors = r10_bio->sectors;
2049 int idx = 0;
2050 int dr = r10_bio->devs[0].devnum;
2051 int dw = r10_bio->devs[1].devnum;
2052
2053 while (sectors) {
2054 int s = sectors;
2055 struct md_rdev *rdev;
2056 sector_t addr;
2057 int ok;
2058
2059 if (s > (PAGE_SIZE>>9))
2060 s = PAGE_SIZE >> 9;
2061
2062 rdev = conf->mirrors[dr].rdev;
2063 addr = r10_bio->devs[0].addr + sect,
2064 ok = sync_page_io(rdev,
2065 addr,
2066 s << 9,
2067 bio->bi_io_vec[idx].bv_page,
2068 READ, false);
2069 if (ok) {
2070 rdev = conf->mirrors[dw].rdev;
2071 addr = r10_bio->devs[1].addr + sect;
2072 ok = sync_page_io(rdev,
2073 addr,
2074 s << 9,
2075 bio->bi_io_vec[idx].bv_page,
2076 WRITE, false);
2077 if (!ok) {
2078 set_bit(WriteErrorSeen, &rdev->flags);
2079 if (!test_and_set_bit(WantReplacement,
2080 &rdev->flags))
2081 set_bit(MD_RECOVERY_NEEDED,
2082 &rdev->mddev->recovery);
2083 }
2084 }
2085 if (!ok) {
2086
2087
2088
2089
2090 rdev_set_badblocks(rdev, addr, s, 0);
2091
2092 if (rdev != conf->mirrors[dw].rdev) {
2093
2094 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2095 addr = r10_bio->devs[1].addr + sect;
2096 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2097 if (!ok) {
2098
2099 printk(KERN_NOTICE
2100 "md/raid10:%s: recovery aborted"
2101 " due to read error\n",
2102 mdname(mddev));
2103
2104 conf->mirrors[dw].recovery_disabled
2105 = mddev->recovery_disabled;
2106 set_bit(MD_RECOVERY_INTR,
2107 &mddev->recovery);
2108 break;
2109 }
2110 }
2111 }
2112
2113 sectors -= s;
2114 sect += s;
2115 idx++;
2116 }
2117}
2118
2119static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2120{
2121 struct r10conf *conf = mddev->private;
2122 int d;
2123 struct bio *wbio, *wbio2;
2124
2125 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2126 fix_recovery_read_error(r10_bio);
2127 end_sync_request(r10_bio);
2128 return;
2129 }
2130
2131
2132
2133
2134
2135 d = r10_bio->devs[1].devnum;
2136 wbio = r10_bio->devs[1].bio;
2137 wbio2 = r10_bio->devs[1].repl_bio;
2138 if (wbio->bi_end_io) {
2139 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2140 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
2141 generic_make_request(wbio);
2142 }
2143 if (wbio2 && wbio2->bi_end_io) {
2144 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2145 md_sync_acct(conf->mirrors[d].replacement->bdev,
2146 wbio2->bi_size >> 9);
2147 generic_make_request(wbio2);
2148 }
2149}
2150
2151
2152
2153
2154
2155
2156
2157
2158static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2159{
2160 struct timespec cur_time_mon;
2161 unsigned long hours_since_last;
2162 unsigned int read_errors = atomic_read(&rdev->read_errors);
2163
2164 ktime_get_ts(&cur_time_mon);
2165
2166 if (rdev->last_read_error.tv_sec == 0 &&
2167 rdev->last_read_error.tv_nsec == 0) {
2168
2169 rdev->last_read_error = cur_time_mon;
2170 return;
2171 }
2172
2173 hours_since_last = (cur_time_mon.tv_sec -
2174 rdev->last_read_error.tv_sec) / 3600;
2175
2176 rdev->last_read_error = cur_time_mon;
2177
2178
2179
2180
2181
2182
2183 if (hours_since_last >= 8 * sizeof(read_errors))
2184 atomic_set(&rdev->read_errors, 0);
2185 else
2186 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2187}
2188
2189static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2190 int sectors, struct page *page, int rw)
2191{
2192 sector_t first_bad;
2193 int bad_sectors;
2194
2195 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2196 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2197 return -1;
2198 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
2199
2200 return 1;
2201 if (rw == WRITE) {
2202 set_bit(WriteErrorSeen, &rdev->flags);
2203 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2204 set_bit(MD_RECOVERY_NEEDED,
2205 &rdev->mddev->recovery);
2206 }
2207
2208 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2209 md_error(rdev->mddev, rdev);
2210 return 0;
2211}
2212
2213
2214
2215
2216
2217
2218
2219
2220
/*
 * Try to repair a read error reported for @r10_bio.
 *
 * The failed range is processed one page at a time.  For each page a
 * readable copy is looked for on the devices holding the other copies
 * (starting with the slot that failed); once found, the good data is
 * written back to the other in-sync copies visited on the way, and
 * then read back from them to confirm the correction.  If no copy can
 * be read, the range is recorded as bad on the device that originally
 * failed.
 *
 * Before any of that the device's read-error count is aged and
 * incremented; a device that exceeds mddev->max_corr_read_errors is
 * failed outright.
 */
static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
{
	int sect = 0; /* Offset from r10_bio->sector */
	int sectors = r10_bio->sectors;
	struct md_rdev*rdev;
	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
	int d = r10_bio->devs[r10_bio->read_slot].devnum;

	/* The device the failed read was sent to. */
	rdev = conf->mirrors[d].rdev;

	if (test_bit(Faulty, &rdev->flags))
		/* Device already failed: nothing to repair or count. */
		return;

	check_decay_read_errors(mddev, rdev);
	atomic_inc(&rdev->read_errors);
	if (atomic_read(&rdev->read_errors) > max_read_errors) {
		char b[BDEVNAME_SIZE];
		bdevname(rdev->bdev, b);

		printk(KERN_NOTICE
		       "md/raid10:%s: %s: Raid device exceeded "
		       "read_error threshold [cur %d:max %d]\n",
		       mdname(mddev), b,
		       atomic_read(&rdev->read_errors), max_read_errors);
		printk(KERN_NOTICE
		       "md/raid10:%s: %s: Failing raid device\n",
		       mdname(mddev), b);
		md_error(mddev, conf->mirrors[d].rdev);
		/* Mark the slot so that it is not used for this request. */
		r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
		return;
	}

	while(sectors) {
		int s = sectors;
		int sl = r10_bio->read_slot;
		int success = 0;
		int start;

		/* Work in units of at most one page. */
		if (s > (PAGE_SIZE>>9))
			s = PAGE_SIZE >> 9;

		/*
		 * Pass 1: starting at the failed slot, walk the copies
		 * until one can be read into conf->tmppage.
		 */
		rcu_read_lock();
		do {
			sector_t first_bad;
			int bad_sectors;

			d = r10_bio->devs[sl].devnum;
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (rdev &&
			    !test_bit(Unmerged, &rdev->flags) &&
			    test_bit(In_sync, &rdev->flags) &&
			    is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
					&first_bad, &bad_sectors) == 0) {
				/*
				 * Pin the device, then leave the RCU
				 * section around the blocking I/O.
				 */
				atomic_inc(&rdev->nr_pending);
				rcu_read_unlock();
				success = sync_page_io(rdev,
						       r10_bio->devs[sl].addr +
						       sect,
						       s<<9,
						       conf->tmppage, READ, false);
				rdev_dec_pending(rdev, mddev);
				rcu_read_lock();
				if (success)
					break;
			}
			sl++;
			if (sl == conf->copies)
				sl = 0;
		} while (!success && sl != r10_bio->read_slot);
		rcu_read_unlock();

		if (!success) {
			/*
			 * No copy could be read.  Record the range as bad
			 * on the device that originally failed; if that
			 * cannot be done either, fail the device and
			 * block the slot.  Either way stop repairing.
			 */
			int dn = r10_bio->devs[r10_bio->read_slot].devnum;
			rdev = conf->mirrors[dn].rdev;

			if (!rdev_set_badblocks(
				    rdev,
				    r10_bio->devs[r10_bio->read_slot].addr
				    + sect,
				    s, 0)) {
				md_error(mddev, rdev);
				r10_bio->devs[r10_bio->read_slot].bio
					= IO_BLOCKED;
			}
			break;
		}

		/* sl is the slot that supplied good data. */
		start = sl;
		/*
		 * Pass 2: walk backwards from the good slot to the failed
		 * slot, writing the good data to each in-sync copy.
		 */
		rcu_read_lock();
		while (sl != r10_bio->read_slot) {
			char b[BDEVNAME_SIZE];

			if (sl==0)
				sl = conf->copies;
			sl--;
			d = r10_bio->devs[sl].devnum;
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (!rdev ||
			    test_bit(Unmerged, &rdev->flags) ||
			    !test_bit(In_sync, &rdev->flags))
				continue;

			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			if (r10_sync_page_io(rdev,
					     r10_bio->devs[sl].addr +
					     sect,
					     s, conf->tmppage, WRITE)
			    == 0) {
				/*
				 * The write failed; r10_sync_page_io()
				 * has already recorded the bad block or
				 * failed the device.
				 */
				printk(KERN_NOTICE
				       "md/raid10:%s: read correction "
				       "write failed"
				       " (%d sectors at %llu on %s)\n",
				       mdname(mddev), s,
				       (unsigned long long)(
					       sect +
					       choose_data_offset(r10_bio,
								  rdev)),
				       bdevname(rdev->bdev, b));
				printk(KERN_NOTICE "md/raid10:%s: %s: failing "
				       "drive\n",
				       mdname(mddev),
				       bdevname(rdev->bdev, b));
			}
			rdev_dec_pending(rdev, mddev);
			rcu_read_lock();
		}
		/*
		 * Pass 3: walk the same slots again and read each one back
		 * to verify that the correction took.
		 */
		sl = start;
		while (sl != r10_bio->read_slot) {
			char b[BDEVNAME_SIZE];

			if (sl==0)
				sl = conf->copies;
			sl--;
			d = r10_bio->devs[sl].devnum;
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (!rdev ||
			    !test_bit(In_sync, &rdev->flags))
				continue;

			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			switch (r10_sync_page_io(rdev,
					     r10_bio->devs[sl].addr +
					     sect,
					     s, conf->tmppage,
						 READ)) {
			case 0:
				/* The read-back failed. */
				printk(KERN_NOTICE
				       "md/raid10:%s: unable to read back "
				       "corrected sectors"
				       " (%d sectors at %llu on %s)\n",
				       mdname(mddev), s,
				       (unsigned long long)(
					       sect +
					       choose_data_offset(r10_bio, rdev)),
				       bdevname(rdev->bdev, b));
				printk(KERN_NOTICE "md/raid10:%s: %s: failing "
				       "drive\n",
				       mdname(mddev),
				       bdevname(rdev->bdev, b));
				break;
			case 1:
				/* The read-back succeeded: error corrected. */
				printk(KERN_INFO
				       "md/raid10:%s: read error corrected"
				       " (%d sectors at %llu on %s)\n",
				       mdname(mddev), s,
				       (unsigned long long)(
					       sect +
					       choose_data_offset(r10_bio, rdev)),
				       bdevname(rdev->bdev, b));
				atomic_add(s, &rdev->corrected_errors);
			}
			/* A return of -1 (I/O not attempted) is ignored. */

			rdev_dec_pending(rdev, mddev);
			rcu_read_lock();
		}
		rcu_read_unlock();

		sectors -= s;
		sect += s;
	}
}
2416
2417static void bi_complete(struct bio *bio, int error)
2418{
2419 complete((struct completion *)bio->bi_private);
2420}
2421
2422static int submit_bio_wait(int rw, struct bio *bio)
2423{
2424 struct completion event;
2425 rw |= REQ_SYNC;
2426
2427 init_completion(&event);
2428 bio->bi_private = &event;
2429 bio->bi_end_io = bi_complete;
2430 submit_bio(rw, bio);
2431 wait_for_completion(&event);
2432
2433 return test_bit(BIO_UPTODATE, &bio->bi_flags);
2434}
2435
2436static int narrow_write_error(struct r10bio *r10_bio, int i)
2437{
2438 struct bio *bio = r10_bio->master_bio;
2439 struct mddev *mddev = r10_bio->mddev;
2440 struct r10conf *conf = mddev->private;
2441 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453 int block_sectors;
2454 sector_t sector;
2455 int sectors;
2456 int sect_to_write = r10_bio->sectors;
2457 int ok = 1;
2458
2459 if (rdev->badblocks.shift < 0)
2460 return 0;
2461
2462 block_sectors = 1 << rdev->badblocks.shift;
2463 sector = r10_bio->sector;
2464 sectors = ((r10_bio->sector + block_sectors)
2465 & ~(sector_t)(block_sectors - 1))
2466 - sector;
2467
2468 while (sect_to_write) {
2469 struct bio *wbio;
2470 if (sectors > sect_to_write)
2471 sectors = sect_to_write;
2472
2473 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2474 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2475 wbio->bi_sector = (r10_bio->devs[i].addr+
2476 choose_data_offset(r10_bio, rdev) +
2477 (sector - r10_bio->sector));
2478 wbio->bi_bdev = rdev->bdev;
2479 if (submit_bio_wait(WRITE, wbio) == 0)
2480
2481 ok = rdev_set_badblocks(rdev, sector,
2482 sectors, 0)
2483 && ok;
2484
2485 bio_put(wbio);
2486 sect_to_write -= sectors;
2487 sector += sectors;
2488 sectors = block_sectors;
2489 }
2490 return ok;
2491}
2492
/*
 * Handle an r10_bio whose read failed (called from raid10d).
 *
 * On a writable array the array is frozen while fix_read_error()
 * tries to repair the data; on a read-only array the failed slot is
 * just blocked.  The read is then redirected to another device chosen
 * by read_balance().  If the chosen device can only serve part of the
 * range, the request is split and the remainder goes round the loop
 * again with a fresh r10_bio.
 */
static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
{
	int slot = r10_bio->read_slot;
	struct bio *bio;
	struct r10conf *conf = mddev->private;
	struct md_rdev *rdev = r10_bio->devs[slot].rdev;
	char b[BDEVNAME_SIZE];
	unsigned long do_sync;
	int max_sectors;

	/*
	 * Remember the name of the failed device for the error message,
	 * then release the failed bio and clear the slot.
	 */
	bio = r10_bio->devs[slot].bio;
	bdevname(bio->bi_bdev, b);
	bio_put(bio);
	r10_bio->devs[slot].bio = NULL;

	if (mddev->ro == 0) {
		freeze_array(conf);
		fix_read_error(conf, mddev, r10_bio);
		unfreeze_array(conf);
	} else
		/* Read-only array: no repair, just avoid this slot. */
		r10_bio->devs[slot].bio = IO_BLOCKED;

	/* Drop the reference held on the device that failed. */
	rdev_dec_pending(rdev, mddev);

read_more:
	rdev = read_balance(conf, r10_bio, &max_sectors);
	if (rdev == NULL) {
		/* No device can serve the read: fail the request. */
		printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
		       " read error for block %llu\n",
		       mdname(mddev), b,
		       (unsigned long long)r10_bio->sector);
		raid_end_bio_io(r10_bio);
		return;
	}

	do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
	slot = r10_bio->read_slot;
	printk_ratelimited(
		KERN_ERR
		"md/raid10:%s: %s: redirecting "
		"sector %llu to another mirror\n",
		mdname(mddev),
		bdevname(rdev->bdev, b),
		(unsigned long long)r10_bio->sector);
	/* Clone the master bio and trim it to the range being retried. */
	bio = bio_clone_mddev(r10_bio->master_bio,
			      GFP_NOIO, mddev);
	md_trim_bio(bio,
		    r10_bio->sector - bio->bi_sector,
		    max_sectors);
	r10_bio->devs[slot].bio = bio;
	r10_bio->devs[slot].rdev = rdev;
	bio->bi_sector = r10_bio->devs[slot].addr
		+ choose_data_offset(r10_bio, rdev);
	bio->bi_bdev = rdev->bdev;
	bio->bi_rw = READ | do_sync;
	bio->bi_private = r10_bio;
	bio->bi_end_io = raid10_end_read_request;
	if (max_sectors < r10_bio->sectors) {
		/* Only part of the range can be read from this device. */
		struct bio *mbio = r10_bio->master_bio;
		int sectors_handled =
			r10_bio->sector + max_sectors
			- mbio->bi_sector;
		r10_bio->sectors = max_sectors;
		/*
		 * bi_phys_segments of the master bio is used as a count
		 * of pieces, updated under device_lock: 0 becomes 2 on
		 * the first split, otherwise it is incremented.
		 */
		spin_lock_irq(&conf->device_lock);
		if (mbio->bi_phys_segments == 0)
			mbio->bi_phys_segments = 2;
		else
			mbio->bi_phys_segments++;
		spin_unlock_irq(&conf->device_lock);
		generic_make_request(bio);

		/* Set up a new r10_bio for the rest of the range. */
		r10_bio = mempool_alloc(conf->r10bio_pool,
					GFP_NOIO);
		r10_bio->master_bio = mbio;
		r10_bio->sectors = (mbio->bi_size >> 9)
			- sectors_handled;
		r10_bio->state = 0;
		set_bit(R10BIO_ReadError,
			&r10_bio->state);
		r10_bio->mddev = mddev;
		r10_bio->sector = mbio->bi_sector
			+ sectors_handled;

		goto read_more;
	} else
		generic_make_request(bio);
}
2589
/*
 * Post-process an r10_bio flagged R10BIO_MadeGood or R10BIO_WriteError
 * (called from raid10d).
 *
 * For resync/recovery requests: every bio that completed successfully
 * clears the matching bad-block range on its device, every bio that
 * failed records one, and the device is failed if recording is not
 * possible.  The buffer is then released.
 *
 * For normal writes: slots marked IO_MADE_GOOD clear their bad-block
 * range; slots whose bio failed are narrowed down with
 * narrow_write_error(), and the device is failed if that reports
 * failure.  The master bio is then completed.
 */
static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
{
	int m;
	struct md_rdev *rdev;

	if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
	    test_bit(R10BIO_IsRecover, &r10_bio->state)) {
		for (m = 0; m < conf->copies; m++) {
			int dev = r10_bio->devs[m].devnum;
			/* Primary device of this slot. */
			rdev = conf->mirrors[dev].rdev;
			if (r10_bio->devs[m].bio == NULL)
				continue;
			if (test_bit(BIO_UPTODATE,
				     &r10_bio->devs[m].bio->bi_flags)) {
				rdev_clear_badblocks(
					rdev,
					r10_bio->devs[m].addr,
					r10_bio->sectors, 0);
			} else {
				if (!rdev_set_badblocks(
					    rdev,
					    r10_bio->devs[m].addr,
					    r10_bio->sectors, 0))
					md_error(conf->mddev, rdev);
			}
			/* Same again for the replacement, if it has a bio. */
			rdev = conf->mirrors[dev].replacement;
			if (r10_bio->devs[m].repl_bio == NULL)
				continue;
			if (test_bit(BIO_UPTODATE,
				     &r10_bio->devs[m].repl_bio->bi_flags)) {
				rdev_clear_badblocks(
					rdev,
					r10_bio->devs[m].addr,
					r10_bio->sectors, 0);
			} else {
				if (!rdev_set_badblocks(
					    rdev,
					    r10_bio->devs[m].addr,
					    r10_bio->sectors, 0))
					md_error(conf->mddev, rdev);
			}
		}
		put_buf(r10_bio);
	} else {
		for (m = 0; m < conf->copies; m++) {
			int dev = r10_bio->devs[m].devnum;
			struct bio *bio = r10_bio->devs[m].bio;
			rdev = conf->mirrors[dev].rdev;
			if (bio == IO_MADE_GOOD) {
				/* The write succeeded over a known bad range. */
				rdev_clear_badblocks(
					rdev,
					r10_bio->devs[m].addr,
					r10_bio->sectors, 0);
				rdev_dec_pending(rdev, conf->mddev);
			} else if (bio != NULL &&
				   !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
				/* The write failed: try to narrow it down. */
				if (!narrow_write_error(r10_bio, m)) {
					md_error(conf->mddev, rdev);
					set_bit(R10BIO_Degraded,
						&r10_bio->state);
				}
				rdev_dec_pending(rdev, conf->mddev);
			}
			/* Replacement: only the made-good case is handled here. */
			bio = r10_bio->devs[m].repl_bio;
			rdev = conf->mirrors[dev].replacement;
			if (rdev && bio == IO_MADE_GOOD) {
				rdev_clear_badblocks(
					rdev,
					r10_bio->devs[m].addr,
					r10_bio->sectors, 0);
				rdev_dec_pending(rdev, conf->mddev);
			}
		}
		if (test_bit(R10BIO_WriteError,
			     &r10_bio->state))
			close_write(r10_bio);
		raid_end_bio_io(r10_bio);
	}
}
2675
/*
 * Body of the raid10 management thread.
 *
 * Flushes pending writes and then drains conf->retry_list, dispatching
 * each queued r10_bio to the handler that matches its state bits.
 * md_check_recovery() is called on entry, and again after an entry has
 * been handled if any mddev flag other than MD_CHANGE_PENDING is set.
 */
static void raid10d(struct mddev *mddev)
{
	struct r10bio *r10_bio;
	unsigned long flags;
	struct r10conf *conf = mddev->private;
	struct list_head *head = &conf->retry_list;
	struct blk_plug plug;

	md_check_recovery(mddev);

	blk_start_plug(&plug);
	for (;;) {

		flush_pending_writes(conf);

		/* Take the entry at the tail of the list under device_lock. */
		spin_lock_irqsave(&conf->device_lock, flags);
		if (list_empty(head)) {
			spin_unlock_irqrestore(&conf->device_lock, flags);
			break;
		}
		r10_bio = list_entry(head->prev, struct r10bio, retry_list);
		list_del(head->prev);
		conf->nr_queued--;
		spin_unlock_irqrestore(&conf->device_lock, flags);

		mddev = r10_bio->mddev;
		conf = mddev->private;
		/*
		 * The order of the tests matters: bad-block and
		 * write-error bookkeeping takes precedence over the
		 * request type.
		 */
		if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
		    test_bit(R10BIO_WriteError, &r10_bio->state))
			handle_write_completed(conf, r10_bio);
		else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
			reshape_request_write(mddev, r10_bio);
		else if (test_bit(R10BIO_IsSync, &r10_bio->state))
			sync_request_write(mddev, r10_bio);
		else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
			recovery_request_write(mddev, r10_bio);
		else if (test_bit(R10BIO_ReadError, &r10_bio->state))
			handle_read_error(mddev, r10_bio);
		else {
			/*
			 * None of the state bits above is set: just
			 * (re)submit the bio sitting in the read slot.
			 */
			int slot = r10_bio->read_slot;
			generic_make_request(r10_bio->devs[slot].bio);
		}

		cond_resched();
		if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
			md_check_recovery(mddev);
	}
	blk_finish_plug(&plug);
}
2728
2729
2730static int init_resync(struct r10conf *conf)
2731{
2732 int buffs;
2733 int i;
2734
2735 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2736 BUG_ON(conf->r10buf_pool);
2737 conf->have_replacement = 0;
2738 for (i = 0; i < conf->geo.raid_disks; i++)
2739 if (conf->mirrors[i].replacement)
2740 conf->have_replacement = 1;
2741 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2742 if (!conf->r10buf_pool)
2743 return -ENOMEM;
2744 conf->next_resync = 0;
2745 return 0;
2746}
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2781 int *skipped, int go_faster)
2782{
2783 struct r10conf *conf = mddev->private;
2784 struct r10bio *r10_bio;
2785 struct bio *biolist = NULL, *bio;
2786 sector_t max_sector, nr_sectors;
2787 int i;
2788 int max_sync;
2789 sector_t sync_blocks;
2790 sector_t sectors_skipped = 0;
2791 int chunks_skipped = 0;
2792 sector_t chunk_mask = conf->geo.chunk_mask;
2793
2794 if (!conf->r10buf_pool)
2795 if (init_resync(conf))
2796 return 0;
2797
2798 skipped:
2799 max_sector = mddev->dev_sectors;
2800 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2801 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2802 max_sector = mddev->resync_max_sectors;
2803 if (sector_nr >= max_sector) {
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2814 end_reshape(conf);
2815 return 0;
2816 }
2817
2818 if (mddev->curr_resync < max_sector) {
2819 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2820 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2821 &sync_blocks, 1);
2822 else for (i = 0; i < conf->geo.raid_disks; i++) {
2823 sector_t sect =
2824 raid10_find_virt(conf, mddev->curr_resync, i);
2825 bitmap_end_sync(mddev->bitmap, sect,
2826 &sync_blocks, 1);
2827 }
2828 } else {
2829
2830 if ((!mddev->bitmap || conf->fullsync)
2831 && conf->have_replacement
2832 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2833
2834
2835
2836 for (i = 0; i < conf->geo.raid_disks; i++)
2837 if (conf->mirrors[i].replacement)
2838 conf->mirrors[i].replacement
2839 ->recovery_offset
2840 = MaxSector;
2841 }
2842 conf->fullsync = 0;
2843 }
2844 bitmap_close_sync(mddev->bitmap);
2845 close_sync(conf);
2846 *skipped = 1;
2847 return sectors_skipped;
2848 }
2849
2850 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2851 return reshape_request(mddev, sector_nr, skipped);
2852
2853 if (chunks_skipped >= conf->geo.raid_disks) {
2854
2855
2856
2857 *skipped = 1;
2858 return (max_sector - sector_nr) + sectors_skipped;
2859 }
2860
2861 if (max_sector > mddev->resync_max)
2862 max_sector = mddev->resync_max;
2863
2864
2865
2866
2867 if (conf->geo.near_copies < conf->geo.raid_disks &&
2868 max_sector > (sector_nr | chunk_mask))
2869 max_sector = (sector_nr | chunk_mask) + 1;
2870
2871
2872
2873
2874 if (!go_faster && conf->nr_waiting)
2875 msleep_interruptible(1000);
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
2893 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2894
2895 int j;
2896 r10_bio = NULL;
2897
2898 for (i = 0 ; i < conf->geo.raid_disks; i++) {
2899 int still_degraded;
2900 struct r10bio *rb2;
2901 sector_t sect;
2902 int must_sync;
2903 int any_working;
2904 struct raid10_info *mirror = &conf->mirrors[i];
2905
2906 if ((mirror->rdev == NULL ||
2907 test_bit(In_sync, &mirror->rdev->flags))
2908 &&
2909 (mirror->replacement == NULL ||
2910 test_bit(Faulty,
2911 &mirror->replacement->flags)))
2912 continue;
2913
2914 still_degraded = 0;
2915
2916 rb2 = r10_bio;
2917 sect = raid10_find_virt(conf, sector_nr, i);
2918 if (sect >= mddev->resync_max_sectors) {
2919
2920
2921
2922 continue;
2923 }
2924
2925
2926
2927
2928 must_sync = bitmap_start_sync(mddev->bitmap, sect,
2929 &sync_blocks, 1);
2930 if (sync_blocks < max_sync)
2931 max_sync = sync_blocks;
2932 if (!must_sync &&
2933 mirror->replacement == NULL &&
2934 !conf->fullsync) {
2935
2936
2937
2938 chunks_skipped = -1;
2939 continue;
2940 }
2941
2942 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
2943 raise_barrier(conf, rb2 != NULL);
2944 atomic_set(&r10_bio->remaining, 0);
2945
2946 r10_bio->master_bio = (struct bio*)rb2;
2947 if (rb2)
2948 atomic_inc(&rb2->remaining);
2949 r10_bio->mddev = mddev;
2950 set_bit(R10BIO_IsRecover, &r10_bio->state);
2951 r10_bio->sector = sect;
2952
2953 raid10_find_phys(conf, r10_bio);
2954
2955
2956
2957
2958 for (j = 0; j < conf->geo.raid_disks; j++)
2959 if (conf->mirrors[j].rdev == NULL ||
2960 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
2961 still_degraded = 1;
2962 break;
2963 }
2964
2965 must_sync = bitmap_start_sync(mddev->bitmap, sect,
2966 &sync_blocks, still_degraded);
2967
2968 any_working = 0;
2969 for (j=0; j<conf->copies;j++) {
2970 int k;
2971 int d = r10_bio->devs[j].devnum;
2972 sector_t from_addr, to_addr;
2973 struct md_rdev *rdev;
2974 sector_t sector, first_bad;
2975 int bad_sectors;
2976 if (!conf->mirrors[d].rdev ||
2977 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
2978 continue;
2979
2980 any_working = 1;
2981 rdev = conf->mirrors[d].rdev;
2982 sector = r10_bio->devs[j].addr;
2983
2984 if (is_badblock(rdev, sector, max_sync,
2985 &first_bad, &bad_sectors)) {
2986 if (first_bad > sector)
2987 max_sync = first_bad - sector;
2988 else {
2989 bad_sectors -= (sector
2990 - first_bad);
2991 if (max_sync > bad_sectors)
2992 max_sync = bad_sectors;
2993 continue;
2994 }
2995 }
2996 bio = r10_bio->devs[0].bio;
2997 bio->bi_next = biolist;
2998 biolist = bio;
2999 bio->bi_private = r10_bio;
3000 bio->bi_end_io = end_sync_read;
3001 bio->bi_rw = READ;
3002 from_addr = r10_bio->devs[j].addr;
3003 bio->bi_sector = from_addr + rdev->data_offset;
3004 bio->bi_bdev = rdev->bdev;
3005 atomic_inc(&rdev->nr_pending);
3006
3007
3008 for (k=0; k<conf->copies; k++)
3009 if (r10_bio->devs[k].devnum == i)
3010 break;
3011 BUG_ON(k == conf->copies);
3012 to_addr = r10_bio->devs[k].addr;
3013 r10_bio->devs[0].devnum = d;
3014 r10_bio->devs[0].addr = from_addr;
3015 r10_bio->devs[1].devnum = i;
3016 r10_bio->devs[1].addr = to_addr;
3017
3018 rdev = mirror->rdev;
3019 if (!test_bit(In_sync, &rdev->flags)) {
3020 bio = r10_bio->devs[1].bio;
3021 bio->bi_next = biolist;
3022 biolist = bio;
3023 bio->bi_private = r10_bio;
3024 bio->bi_end_io = end_sync_write;
3025 bio->bi_rw = WRITE;
3026 bio->bi_sector = to_addr
3027 + rdev->data_offset;
3028 bio->bi_bdev = rdev->bdev;
3029 atomic_inc(&r10_bio->remaining);
3030 } else
3031 r10_bio->devs[1].bio->bi_end_io = NULL;
3032
3033
3034 bio = r10_bio->devs[1].repl_bio;
3035 if (bio)
3036 bio->bi_end_io = NULL;
3037 rdev = mirror->replacement;
3038
3039
3040
3041
3042
3043
3044
3045
3046 if (rdev == NULL || bio == NULL ||
3047 test_bit(Faulty, &rdev->flags))
3048 break;
3049 bio->bi_next = biolist;
3050 biolist = bio;
3051 bio->bi_private = r10_bio;
3052 bio->bi_end_io = end_sync_write;
3053 bio->bi_rw = WRITE;
3054 bio->bi_sector = to_addr + rdev->data_offset;
3055 bio->bi_bdev = rdev->bdev;
3056 atomic_inc(&r10_bio->remaining);
3057 break;
3058 }
3059 if (j == conf->copies) {
3060
3061
3062 put_buf(r10_bio);
3063 if (rb2)
3064 atomic_dec(&rb2->remaining);
3065 r10_bio = rb2;
3066 if (any_working) {
3067
3068
3069
3070 int k;
3071 for (k = 0; k < conf->copies; k++)
3072 if (r10_bio->devs[k].devnum == i)
3073 break;
3074 if (!test_bit(In_sync,
3075 &mirror->rdev->flags)
3076 && !rdev_set_badblocks(
3077 mirror->rdev,
3078 r10_bio->devs[k].addr,
3079 max_sync, 0))
3080 any_working = 0;
3081 if (mirror->replacement &&
3082 !rdev_set_badblocks(
3083 mirror->replacement,
3084 r10_bio->devs[k].addr,
3085 max_sync, 0))
3086 any_working = 0;
3087 }
3088 if (!any_working) {
3089 if (!test_and_set_bit(MD_RECOVERY_INTR,
3090 &mddev->recovery))
3091 printk(KERN_INFO "md/raid10:%s: insufficient "
3092 "working devices for recovery.\n",
3093 mdname(mddev));
3094 mirror->recovery_disabled
3095 = mddev->recovery_disabled;
3096 }
3097 break;
3098 }
3099 }
3100 if (biolist == NULL) {
3101 while (r10_bio) {
3102 struct r10bio *rb2 = r10_bio;
3103 r10_bio = (struct r10bio*) rb2->master_bio;
3104 rb2->master_bio = NULL;
3105 put_buf(rb2);
3106 }
3107 goto giveup;
3108 }
3109 } else {
3110
3111 int count = 0;
3112
3113 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3114
3115 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3116 &sync_blocks, mddev->degraded) &&
3117 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3118 &mddev->recovery)) {
3119
3120 *skipped = 1;
3121 return sync_blocks + sectors_skipped;
3122 }
3123 if (sync_blocks < max_sync)
3124 max_sync = sync_blocks;
3125 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3126
3127 r10_bio->mddev = mddev;
3128 atomic_set(&r10_bio->remaining, 0);
3129 raise_barrier(conf, 0);
3130 conf->next_resync = sector_nr;
3131
3132 r10_bio->master_bio = NULL;
3133 r10_bio->sector = sector_nr;
3134 set_bit(R10BIO_IsSync, &r10_bio->state);
3135 raid10_find_phys(conf, r10_bio);
3136 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3137
3138 for (i = 0; i < conf->copies; i++) {
3139 int d = r10_bio->devs[i].devnum;
3140 sector_t first_bad, sector;
3141 int bad_sectors;
3142
3143 if (r10_bio->devs[i].repl_bio)
3144 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3145
3146 bio = r10_bio->devs[i].bio;
3147 bio->bi_end_io = NULL;
3148 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3149 if (conf->mirrors[d].rdev == NULL ||
3150 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
3151 continue;
3152 sector = r10_bio->devs[i].addr;
3153 if (is_badblock(conf->mirrors[d].rdev,
3154 sector, max_sync,
3155 &first_bad, &bad_sectors)) {
3156 if (first_bad > sector)
3157 max_sync = first_bad - sector;
3158 else {
3159 bad_sectors -= (sector - first_bad);
3160 if (max_sync > bad_sectors)
3161 max_sync = bad_sectors;
3162 continue;
3163 }
3164 }
3165 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3166 atomic_inc(&r10_bio->remaining);
3167 bio->bi_next = biolist;
3168 biolist = bio;
3169 bio->bi_private = r10_bio;
3170 bio->bi_end_io = end_sync_read;
3171 bio->bi_rw = READ;
3172 bio->bi_sector = sector +
3173 conf->mirrors[d].rdev->data_offset;
3174 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3175 count++;
3176
3177 if (conf->mirrors[d].replacement == NULL ||
3178 test_bit(Faulty,
3179 &conf->mirrors[d].replacement->flags))
3180 continue;
3181
3182
3183 bio = r10_bio->devs[i].repl_bio;
3184 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3185
3186 sector = r10_bio->devs[i].addr;
3187 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3188 bio->bi_next = biolist;
3189 biolist = bio;
3190 bio->bi_private = r10_bio;
3191 bio->bi_end_io = end_sync_write;
3192 bio->bi_rw = WRITE;
3193 bio->bi_sector = sector +
3194 conf->mirrors[d].replacement->data_offset;
3195 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
3196 count++;
3197 }
3198
3199 if (count < 2) {
3200 for (i=0; i<conf->copies; i++) {
3201 int d = r10_bio->devs[i].devnum;
3202 if (r10_bio->devs[i].bio->bi_end_io)
3203 rdev_dec_pending(conf->mirrors[d].rdev,
3204 mddev);
3205 if (r10_bio->devs[i].repl_bio &&
3206 r10_bio->devs[i].repl_bio->bi_end_io)
3207 rdev_dec_pending(
3208 conf->mirrors[d].replacement,
3209 mddev);
3210 }
3211 put_buf(r10_bio);
3212 biolist = NULL;
3213 goto giveup;
3214 }
3215 }
3216
3217 for (bio = biolist; bio ; bio=bio->bi_next) {
3218
3219 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
3220 if (bio->bi_end_io)
3221 bio->bi_flags |= 1 << BIO_UPTODATE;
3222 bio->bi_vcnt = 0;
3223 bio->bi_idx = 0;
3224 bio->bi_phys_segments = 0;
3225 bio->bi_size = 0;
3226 }
3227
3228 nr_sectors = 0;
3229 if (sector_nr + max_sync < max_sector)
3230 max_sector = sector_nr + max_sync;
3231 do {
3232 struct page *page;
3233 int len = PAGE_SIZE;
3234 if (sector_nr + (len>>9) > max_sector)
3235 len = (max_sector - sector_nr) << 9;
3236 if (len == 0)
3237 break;
3238 for (bio= biolist ; bio ; bio=bio->bi_next) {
3239 struct bio *bio2;
3240 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
3241 if (bio_add_page(bio, page, len, 0))
3242 continue;
3243
3244
3245 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
3246 for (bio2 = biolist;
3247 bio2 && bio2 != bio;
3248 bio2 = bio2->bi_next) {
3249
3250 bio2->bi_vcnt--;
3251 bio2->bi_size -= len;
3252 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
3253 }
3254 goto bio_full;
3255 }
3256 nr_sectors += len>>9;
3257 sector_nr += len>>9;
3258 } while (biolist->bi_vcnt < RESYNC_PAGES);
3259 bio_full:
3260 r10_bio->sectors = nr_sectors;
3261
3262 while (biolist) {
3263 bio = biolist;
3264 biolist = biolist->bi_next;
3265
3266 bio->bi_next = NULL;
3267 r10_bio = bio->bi_private;
3268 r10_bio->sectors = nr_sectors;
3269
3270 if (bio->bi_end_io == end_sync_read) {
3271 md_sync_acct(bio->bi_bdev, nr_sectors);
3272 generic_make_request(bio);
3273 }
3274 }
3275
3276 if (sectors_skipped)
3277
3278
3279
3280 md_done_sync(mddev, sectors_skipped, 1);
3281
3282 return sectors_skipped + nr_sectors;
3283 giveup:
3284
3285
3286
3287
3288 if (sector_nr + max_sync < max_sector)
3289 max_sector = sector_nr + max_sync;
3290
3291 sectors_skipped += (max_sector - sector_nr);
3292 chunks_skipped ++;
3293 sector_nr = max_sector;
3294 goto skipped;
3295}
3296
3297static sector_t
3298raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3299{
3300 sector_t size;
3301 struct r10conf *conf = mddev->private;
3302
3303 if (!raid_disks)
3304 raid_disks = min(conf->geo.raid_disks,
3305 conf->prev.raid_disks);
3306 if (!sectors)
3307 sectors = conf->dev_sectors;
3308
3309 size = sectors >> conf->geo.chunk_shift;
3310 sector_div(size, conf->geo.far_copies);
3311 size = size * raid_disks;
3312 sector_div(size, conf->geo.near_copies);
3313
3314 return size << conf->geo.chunk_shift;
3315}
3316
3317static void calc_sectors(struct r10conf *conf, sector_t size)
3318{
3319
3320
3321
3322
3323
3324 size = size >> conf->geo.chunk_shift;
3325 sector_div(size, conf->geo.far_copies);
3326 size = size * conf->geo.raid_disks;
3327 sector_div(size, conf->geo.near_copies);
3328
3329
3330 size = size * conf->copies;
3331
3332
3333
3334
3335 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3336
3337 conf->dev_sectors = size << conf->geo.chunk_shift;
3338
3339 if (conf->geo.far_offset)
3340 conf->geo.stride = 1 << conf->geo.chunk_shift;
3341 else {
3342 sector_div(size, conf->geo.far_copies);
3343 conf->geo.stride = size << conf->geo.chunk_shift;
3344 }
3345}
3346
3347enum geo_type {geo_new, geo_old, geo_start};
3348static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3349{
3350 int nc, fc, fo;
3351 int layout, chunk, disks;
3352 switch (new) {
3353 case geo_old:
3354 layout = mddev->layout;
3355 chunk = mddev->chunk_sectors;
3356 disks = mddev->raid_disks - mddev->delta_disks;
3357 break;
3358 case geo_new:
3359 layout = mddev->new_layout;
3360 chunk = mddev->new_chunk_sectors;
3361 disks = mddev->raid_disks;
3362 break;
3363 default:
3364 case geo_start:
3365
3366 layout = mddev->new_layout;
3367 chunk = mddev->new_chunk_sectors;
3368 disks = mddev->raid_disks + mddev->delta_disks;
3369 break;
3370 }
3371 if (layout >> 17)
3372 return -1;
3373 if (chunk < (PAGE_SIZE >> 9) ||
3374 !is_power_of_2(chunk))
3375 return -2;
3376 nc = layout & 255;
3377 fc = (layout >> 8) & 255;
3378 fo = layout & (1<<16);
3379 geo->raid_disks = disks;
3380 geo->near_copies = nc;
3381 geo->far_copies = fc;
3382 geo->far_offset = fo;
3383 geo->chunk_mask = chunk - 1;
3384 geo->chunk_shift = ffz(~chunk);
3385 return nc*fc;
3386}
3387
3388static struct r10conf *setup_conf(struct mddev *mddev)
3389{
3390 struct r10conf *conf = NULL;
3391 int err = -EINVAL;
3392 struct geom geo;
3393 int copies;
3394
3395 copies = setup_geo(&geo, mddev, geo_new);
3396
3397 if (copies == -2) {
3398 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3399 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3400 mdname(mddev), PAGE_SIZE);
3401 goto out;
3402 }
3403
3404 if (copies < 2 || copies > mddev->raid_disks) {
3405 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3406 mdname(mddev), mddev->new_layout);
3407 goto out;
3408 }
3409
3410 err = -ENOMEM;
3411 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3412 if (!conf)
3413 goto out;
3414
3415
3416 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3417 max(0,mddev->delta_disks)),
3418 GFP_KERNEL);
3419 if (!conf->mirrors)
3420 goto out;
3421
3422 conf->tmppage = alloc_page(GFP_KERNEL);
3423 if (!conf->tmppage)
3424 goto out;
3425
3426 conf->geo = geo;
3427 conf->copies = copies;
3428 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3429 r10bio_pool_free, conf);
3430 if (!conf->r10bio_pool)
3431 goto out;
3432
3433 calc_sectors(conf, mddev->dev_sectors);
3434 if (mddev->reshape_position == MaxSector) {
3435 conf->prev = conf->geo;
3436 conf->reshape_progress = MaxSector;
3437 } else {
3438 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3439 err = -EINVAL;
3440 goto out;
3441 }
3442 conf->reshape_progress = mddev->reshape_position;
3443 if (conf->prev.far_offset)
3444 conf->prev.stride = 1 << conf->prev.chunk_shift;
3445 else
3446
3447 conf->prev.stride = conf->dev_sectors;
3448 }
3449 spin_lock_init(&conf->device_lock);
3450 INIT_LIST_HEAD(&conf->retry_list);
3451
3452 spin_lock_init(&conf->resync_lock);
3453 init_waitqueue_head(&conf->wait_barrier);
3454
3455 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3456 if (!conf->thread)
3457 goto out;
3458
3459 conf->mddev = mddev;
3460 return conf;
3461
3462 out:
3463 if (err == -ENOMEM)
3464 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3465 mdname(mddev));
3466 if (conf) {
3467 if (conf->r10bio_pool)
3468 mempool_destroy(conf->r10bio_pool);
3469 kfree(conf->mirrors);
3470 safe_put_page(conf->tmppage);
3471 kfree(conf);
3472 }
3473 return ERR_PTR(err);
3474}
3475
3476static int run(struct mddev *mddev)
3477{
3478 struct r10conf *conf;
3479 int i, disk_idx, chunk_size;
3480 struct raid10_info *disk;
3481 struct md_rdev *rdev;
3482 sector_t size;
3483 sector_t min_offset_diff = 0;
3484 int first = 1;
3485
3486 if (mddev->private == NULL) {
3487 conf = setup_conf(mddev);
3488 if (IS_ERR(conf))
3489 return PTR_ERR(conf);
3490 mddev->private = conf;
3491 }
3492 conf = mddev->private;
3493 if (!conf)
3494 goto out;
3495
3496 mddev->thread = conf->thread;
3497 conf->thread = NULL;
3498
3499 chunk_size = mddev->chunk_sectors << 9;
3500 if (mddev->queue) {
3501 blk_queue_io_min(mddev->queue, chunk_size);
3502 if (conf->geo.raid_disks % conf->geo.near_copies)
3503 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3504 else
3505 blk_queue_io_opt(mddev->queue, chunk_size *
3506 (conf->geo.raid_disks / conf->geo.near_copies));
3507 }
3508
3509 rdev_for_each(rdev, mddev) {
3510 long long diff;
3511 struct request_queue *q;
3512
3513 disk_idx = rdev->raid_disk;
3514 if (disk_idx < 0)
3515 continue;
3516 if (disk_idx >= conf->geo.raid_disks &&
3517 disk_idx >= conf->prev.raid_disks)
3518 continue;
3519 disk = conf->mirrors + disk_idx;
3520
3521 if (test_bit(Replacement, &rdev->flags)) {
3522 if (disk->replacement)
3523 goto out_free_conf;
3524 disk->replacement = rdev;
3525 } else {
3526 if (disk->rdev)
3527 goto out_free_conf;
3528 disk->rdev = rdev;
3529 }
3530 q = bdev_get_queue(rdev->bdev);
3531 if (q->merge_bvec_fn)
3532 mddev->merge_check_needed = 1;
3533 diff = (rdev->new_data_offset - rdev->data_offset);
3534 if (!mddev->reshape_backwards)
3535 diff = -diff;
3536 if (diff < 0)
3537 diff = 0;
3538 if (first || diff < min_offset_diff)
3539 min_offset_diff = diff;
3540
3541 if (mddev->gendisk)
3542 disk_stack_limits(mddev->gendisk, rdev->bdev,
3543 rdev->data_offset << 9);
3544
3545 disk->head_position = 0;
3546 }
3547
3548
3549 if (!enough(conf, -1)) {
3550 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
3551 mdname(mddev));
3552 goto out_free_conf;
3553 }
3554
3555 if (conf->reshape_progress != MaxSector) {
3556
3557 if (conf->geo.far_copies != 1 &&
3558 conf->geo.far_offset == 0)
3559 goto out_free_conf;
3560 if (conf->prev.far_copies != 1 &&
3561 conf->geo.far_offset == 0)
3562 goto out_free_conf;
3563 }
3564
3565 mddev->degraded = 0;
3566 for (i = 0;
3567 i < conf->geo.raid_disks
3568 || i < conf->prev.raid_disks;
3569 i++) {
3570
3571 disk = conf->mirrors + i;
3572
3573 if (!disk->rdev && disk->replacement) {
3574
3575 disk->rdev = disk->replacement;
3576 disk->replacement = NULL;
3577 clear_bit(Replacement, &disk->rdev->flags);
3578 }
3579
3580 if (!disk->rdev ||
3581 !test_bit(In_sync, &disk->rdev->flags)) {
3582 disk->head_position = 0;
3583 mddev->degraded++;
3584 if (disk->rdev)
3585 conf->fullsync = 1;
3586 }
3587 disk->recovery_disabled = mddev->recovery_disabled - 1;
3588 }
3589
3590 if (mddev->recovery_cp != MaxSector)
3591 printk(KERN_NOTICE "md/raid10:%s: not clean"
3592 " -- starting background reconstruction\n",
3593 mdname(mddev));
3594 printk(KERN_INFO
3595 "md/raid10:%s: active with %d out of %d devices\n",
3596 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3597 conf->geo.raid_disks);
3598
3599
3600
3601 mddev->dev_sectors = conf->dev_sectors;
3602 size = raid10_size(mddev, 0, 0);
3603 md_set_array_sectors(mddev, size);
3604 mddev->resync_max_sectors = size;
3605
3606 if (mddev->queue) {
3607 int stripe = conf->geo.raid_disks *
3608 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3609 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3610 mddev->queue->backing_dev_info.congested_data = mddev;
3611
3612
3613
3614
3615
3616 stripe /= conf->geo.near_copies;
3617 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3618 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3619 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3620 }
3621
3622
3623 if (md_integrity_register(mddev))
3624 goto out_free_conf;
3625
3626 if (conf->reshape_progress != MaxSector) {
3627 unsigned long before_length, after_length;
3628
3629 before_length = ((1 << conf->prev.chunk_shift) *
3630 conf->prev.far_copies);
3631 after_length = ((1 << conf->geo.chunk_shift) *
3632 conf->geo.far_copies);
3633
3634 if (max(before_length, after_length) > min_offset_diff) {
3635
3636 printk("md/raid10: offset difference not enough to continue reshape\n");
3637 goto out_free_conf;
3638 }
3639 conf->offset_diff = min_offset_diff;
3640
3641 conf->reshape_safe = conf->reshape_progress;
3642 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3643 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3644 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3645 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3646 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3647 "reshape");
3648 }
3649
3650 return 0;
3651
3652out_free_conf:
3653 md_unregister_thread(&mddev->thread);
3654 if (conf->r10bio_pool)
3655 mempool_destroy(conf->r10bio_pool);
3656 safe_put_page(conf->tmppage);
3657 kfree(conf->mirrors);
3658 kfree(conf);
3659 mddev->private = NULL;
3660out:
3661 return -EIO;
3662}
3663
3664static int stop(struct mddev *mddev)
3665{
3666 struct r10conf *conf = mddev->private;
3667
3668 raise_barrier(conf, 0);
3669 lower_barrier(conf);
3670
3671 md_unregister_thread(&mddev->thread);
3672 if (mddev->queue)
3673
3674 blk_sync_queue(mddev->queue);
3675
3676 if (conf->r10bio_pool)
3677 mempool_destroy(conf->r10bio_pool);
3678 kfree(conf->mirrors);
3679 kfree(conf);
3680 mddev->private = NULL;
3681 return 0;
3682}
3683
3684static void raid10_quiesce(struct mddev *mddev, int state)
3685{
3686 struct r10conf *conf = mddev->private;
3687
3688 switch(state) {
3689 case 1:
3690 raise_barrier(conf, 0);
3691 break;
3692 case 0:
3693 lower_barrier(conf);
3694 break;
3695 }
3696}
3697
3698static int raid10_resize(struct mddev *mddev, sector_t sectors)
3699{
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712 struct r10conf *conf = mddev->private;
3713 sector_t oldsize, size;
3714
3715 if (mddev->reshape_position != MaxSector)
3716 return -EBUSY;
3717
3718 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3719 return -EINVAL;
3720
3721 oldsize = raid10_size(mddev, 0, 0);
3722 size = raid10_size(mddev, sectors, 0);
3723 if (mddev->external_size &&
3724 mddev->array_sectors > size)
3725 return -EINVAL;
3726 if (mddev->bitmap) {
3727 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3728 if (ret)
3729 return ret;
3730 }
3731 md_set_array_sectors(mddev, size);
3732 set_capacity(mddev->gendisk, mddev->array_sectors);
3733 revalidate_disk(mddev->gendisk);
3734 if (sectors > mddev->dev_sectors &&
3735 mddev->recovery_cp > oldsize) {
3736 mddev->recovery_cp = oldsize;
3737 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3738 }
3739 calc_sectors(conf, sectors);
3740 mddev->dev_sectors = conf->dev_sectors;
3741 mddev->resync_max_sectors = size;
3742 return 0;
3743}
3744
3745static void *raid10_takeover_raid0(struct mddev *mddev)
3746{
3747 struct md_rdev *rdev;
3748 struct r10conf *conf;
3749
3750 if (mddev->degraded > 0) {
3751 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
3752 mdname(mddev));
3753 return ERR_PTR(-EINVAL);
3754 }
3755
3756
3757 mddev->new_level = 10;
3758
3759 mddev->new_layout = (1<<8) + 2;
3760 mddev->new_chunk_sectors = mddev->chunk_sectors;
3761 mddev->delta_disks = mddev->raid_disks;
3762 mddev->raid_disks *= 2;
3763
3764 mddev->recovery_cp = MaxSector;
3765
3766 conf = setup_conf(mddev);
3767 if (!IS_ERR(conf)) {
3768 rdev_for_each(rdev, mddev)
3769 if (rdev->raid_disk >= 0)
3770 rdev->new_raid_disk = rdev->raid_disk * 2;
3771 conf->barrier = 1;
3772 }
3773
3774 return conf;
3775}
3776
3777static void *raid10_takeover(struct mddev *mddev)
3778{
3779 struct r0conf *raid0_conf;
3780
3781
3782
3783
3784 if (mddev->level == 0) {
3785
3786 raid0_conf = mddev->private;
3787 if (raid0_conf->nr_strip_zones > 1) {
3788 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
3789 " with more than one zone.\n",
3790 mdname(mddev));
3791 return ERR_PTR(-EINVAL);
3792 }
3793 return raid10_takeover_raid0(mddev);
3794 }
3795 return ERR_PTR(-EINVAL);
3796}
3797
3798static int raid10_check_reshape(struct mddev *mddev)
3799{
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814 struct r10conf *conf = mddev->private;
3815 struct geom geo;
3816
3817 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3818 return -EINVAL;
3819
3820 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
3821
3822 return -EINVAL;
3823 if (geo.far_copies > 1 && !geo.far_offset)
3824
3825 return -EINVAL;
3826
3827 if (mddev->array_sectors & geo.chunk_mask)
3828
3829 return -EINVAL;
3830
3831 if (!enough(conf, -1))
3832 return -EINVAL;
3833
3834 kfree(conf->mirrors_new);
3835 conf->mirrors_new = NULL;
3836 if (mddev->delta_disks > 0) {
3837
3838 conf->mirrors_new = kzalloc(
3839 sizeof(struct raid10_info)
3840 *(mddev->raid_disks +
3841 mddev->delta_disks),
3842 GFP_KERNEL);
3843 if (!conf->mirrors_new)
3844 return -ENOMEM;
3845 }
3846 return 0;
3847}
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862static int calc_degraded(struct r10conf *conf)
3863{
3864 int degraded, degraded2;
3865 int i;
3866
3867 rcu_read_lock();
3868 degraded = 0;
3869
3870 for (i = 0; i < conf->prev.raid_disks; i++) {
3871 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3872 if (!rdev || test_bit(Faulty, &rdev->flags))
3873 degraded++;
3874 else if (!test_bit(In_sync, &rdev->flags))
3875
3876
3877
3878
3879 degraded++;
3880 }
3881 rcu_read_unlock();
3882 if (conf->geo.raid_disks == conf->prev.raid_disks)
3883 return degraded;
3884 rcu_read_lock();
3885 degraded2 = 0;
3886 for (i = 0; i < conf->geo.raid_disks; i++) {
3887 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3888 if (!rdev || test_bit(Faulty, &rdev->flags))
3889 degraded2++;
3890 else if (!test_bit(In_sync, &rdev->flags)) {
3891
3892
3893
3894
3895
3896 if (conf->geo.raid_disks <= conf->prev.raid_disks)
3897 degraded2++;
3898 }
3899 }
3900 rcu_read_unlock();
3901 if (degraded2 > degraded)
3902 return degraded2;
3903 return degraded;
3904}
3905
3906static int raid10_start_reshape(struct mddev *mddev)
3907{
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918 unsigned long before_length, after_length;
3919 sector_t min_offset_diff = 0;
3920 int first = 1;
3921 struct geom new;
3922 struct r10conf *conf = mddev->private;
3923 struct md_rdev *rdev;
3924 int spares = 0;
3925 int ret;
3926
3927 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3928 return -EBUSY;
3929
3930 if (setup_geo(&new, mddev, geo_start) != conf->copies)
3931 return -EINVAL;
3932
3933 before_length = ((1 << conf->prev.chunk_shift) *
3934 conf->prev.far_copies);
3935 after_length = ((1 << conf->geo.chunk_shift) *
3936 conf->geo.far_copies);
3937
3938 rdev_for_each(rdev, mddev) {
3939 if (!test_bit(In_sync, &rdev->flags)
3940 && !test_bit(Faulty, &rdev->flags))
3941 spares++;
3942 if (rdev->raid_disk >= 0) {
3943 long long diff = (rdev->new_data_offset
3944 - rdev->data_offset);
3945 if (!mddev->reshape_backwards)
3946 diff = -diff;
3947 if (diff < 0)
3948 diff = 0;
3949 if (first || diff < min_offset_diff)
3950 min_offset_diff = diff;
3951 }
3952 }
3953
3954 if (max(before_length, after_length) > min_offset_diff)
3955 return -EINVAL;
3956
3957 if (spares < mddev->delta_disks)
3958 return -EINVAL;
3959
3960 conf->offset_diff = min_offset_diff;
3961 spin_lock_irq(&conf->device_lock);
3962 if (conf->mirrors_new) {
3963 memcpy(conf->mirrors_new, conf->mirrors,
3964 sizeof(struct raid10_info)*conf->prev.raid_disks);
3965 smp_mb();
3966 kfree(conf->mirrors_old);
3967 conf->mirrors_old = conf->mirrors;
3968 conf->mirrors = conf->mirrors_new;
3969 conf->mirrors_new = NULL;
3970 }
3971 setup_geo(&conf->geo, mddev, geo_start);
3972 smp_mb();
3973 if (mddev->reshape_backwards) {
3974 sector_t size = raid10_size(mddev, 0, 0);
3975 if (size < mddev->array_sectors) {
3976 spin_unlock_irq(&conf->device_lock);
3977 printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n",
3978 mdname(mddev));
3979 return -EINVAL;
3980 }
3981 mddev->resync_max_sectors = size;
3982 conf->reshape_progress = size;
3983 } else
3984 conf->reshape_progress = 0;
3985 spin_unlock_irq(&conf->device_lock);
3986
3987 if (mddev->delta_disks && mddev->bitmap) {
3988 ret = bitmap_resize(mddev->bitmap,
3989 raid10_size(mddev, 0,
3990 conf->geo.raid_disks),
3991 0, 0);
3992 if (ret)
3993 goto abort;
3994 }
3995 if (mddev->delta_disks > 0) {
3996 rdev_for_each(rdev, mddev)
3997 if (rdev->raid_disk < 0 &&
3998 !test_bit(Faulty, &rdev->flags)) {
3999 if (raid10_add_disk(mddev, rdev) == 0) {
4000 if (rdev->raid_disk >=
4001 conf->prev.raid_disks)
4002 set_bit(In_sync, &rdev->flags);
4003 else
4004 rdev->recovery_offset = 0;
4005
4006 if (sysfs_link_rdev(mddev, rdev))
4007 ;
4008 }
4009 } else if (rdev->raid_disk >= conf->prev.raid_disks
4010 && !test_bit(Faulty, &rdev->flags)) {
4011
4012 set_bit(In_sync, &rdev->flags);
4013 }
4014 }
4015
4016
4017
4018
4019 spin_lock_irq(&conf->device_lock);
4020 mddev->degraded = calc_degraded(conf);
4021 spin_unlock_irq(&conf->device_lock);
4022 mddev->raid_disks = conf->geo.raid_disks;
4023 mddev->reshape_position = conf->reshape_progress;
4024 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4025
4026 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4027 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4028 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4029 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4030
4031 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4032 "reshape");
4033 if (!mddev->sync_thread) {
4034 ret = -EAGAIN;
4035 goto abort;
4036 }
4037 conf->reshape_checkpoint = jiffies;
4038 md_wakeup_thread(mddev->sync_thread);
4039 md_new_event(mddev);
4040 return 0;
4041
4042abort:
4043 mddev->recovery = 0;
4044 spin_lock_irq(&conf->device_lock);
4045 conf->geo = conf->prev;
4046 mddev->raid_disks = conf->geo.raid_disks;
4047 rdev_for_each(rdev, mddev)
4048 rdev->new_data_offset = rdev->data_offset;
4049 smp_wmb();
4050 conf->reshape_progress = MaxSector;
4051 mddev->reshape_position = MaxSector;
4052 spin_unlock_irq(&conf->device_lock);
4053 return ret;
4054}
4055
4056
4057
4058
4059
4060
4061
4062static sector_t last_dev_address(sector_t s, struct geom *geo)
4063{
4064 s = (s | geo->chunk_mask) + 1;
4065 s >>= geo->chunk_shift;
4066 s *= geo->near_copies;
4067 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4068 s *= geo->far_copies;
4069 s <<= geo->chunk_shift;
4070 return s;
4071}
4072
4073
4074
4075
4076
4077static sector_t first_dev_address(sector_t s, struct geom *geo)
4078{
4079 s >>= geo->chunk_shift;
4080 s *= geo->near_copies;
4081 sector_div(s, geo->raid_disks);
4082 s *= geo->far_copies;
4083 s <<= geo->chunk_shift;
4084 return s;
4085}
4086
4087static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4088 int *skipped)
4089{
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127 struct r10conf *conf = mddev->private;
4128 struct r10bio *r10_bio;
4129 sector_t next, safe, last;
4130 int max_sectors;
4131 int nr_sectors;
4132 int s;
4133 struct md_rdev *rdev;
4134 int need_flush = 0;
4135 struct bio *blist;
4136 struct bio *bio, *read_bio;
4137 int sectors_done = 0;
4138
4139 if (sector_nr == 0) {
4140
4141 if (mddev->reshape_backwards &&
4142 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4143 sector_nr = (raid10_size(mddev, 0, 0)
4144 - conf->reshape_progress);
4145 } else if (!mddev->reshape_backwards &&
4146 conf->reshape_progress > 0)
4147 sector_nr = conf->reshape_progress;
4148 if (sector_nr) {
4149 mddev->curr_resync_completed = sector_nr;
4150 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4151 *skipped = 1;
4152 return sector_nr;
4153 }
4154 }
4155
4156
4157
4158
4159
4160 if (mddev->reshape_backwards) {
4161
4162
4163
4164 next = first_dev_address(conf->reshape_progress - 1,
4165 &conf->geo);
4166
4167
4168
4169
4170 safe = last_dev_address(conf->reshape_safe - 1,
4171 &conf->prev);
4172
4173 if (next + conf->offset_diff < safe)
4174 need_flush = 1;
4175
4176 last = conf->reshape_progress - 1;
4177 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4178 & conf->prev.chunk_mask);
4179 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4180 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4181 } else {
4182
4183
4184
4185 next = last_dev_address(conf->reshape_progress, &conf->geo);
4186
4187
4188
4189
4190 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4191
4192
4193
4194
4195 if (next > safe + conf->offset_diff)
4196 need_flush = 1;
4197
4198 sector_nr = conf->reshape_progress;
4199 last = sector_nr | (conf->geo.chunk_mask
4200 & conf->prev.chunk_mask);
4201
4202 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4203 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4204 }
4205
4206 if (need_flush ||
4207 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4208
4209 wait_barrier(conf);
4210 mddev->reshape_position = conf->reshape_progress;
4211 if (mddev->reshape_backwards)
4212 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4213 - conf->reshape_progress;
4214 else
4215 mddev->curr_resync_completed = conf->reshape_progress;
4216 conf->reshape_checkpoint = jiffies;
4217 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4218 md_wakeup_thread(mddev->thread);
4219 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4220 kthread_should_stop());
4221 conf->reshape_safe = mddev->reshape_position;
4222 allow_barrier(conf);
4223 }
4224
4225read_more:
4226
4227 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4228 raise_barrier(conf, sectors_done != 0);
4229 atomic_set(&r10_bio->remaining, 0);
4230 r10_bio->mddev = mddev;
4231 r10_bio->sector = sector_nr;
4232 set_bit(R10BIO_IsReshape, &r10_bio->state);
4233 r10_bio->sectors = last - sector_nr + 1;
4234 rdev = read_balance(conf, r10_bio, &max_sectors);
4235 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4236
4237 if (!rdev) {
4238
4239
4240
4241
4242 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4243 return sectors_done;
4244 }
4245
4246 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4247
4248 read_bio->bi_bdev = rdev->bdev;
4249 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4250 + rdev->data_offset);
4251 read_bio->bi_private = r10_bio;
4252 read_bio->bi_end_io = end_sync_read;
4253 read_bio->bi_rw = READ;
4254 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4255 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4256 read_bio->bi_vcnt = 0;
4257 read_bio->bi_idx = 0;
4258 read_bio->bi_size = 0;
4259 r10_bio->master_bio = read_bio;
4260 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4261
4262
4263 __raid10_find_phys(&conf->geo, r10_bio);
4264
4265 blist = read_bio;
4266 read_bio->bi_next = NULL;
4267
4268 for (s = 0; s < conf->copies*2; s++) {
4269 struct bio *b;
4270 int d = r10_bio->devs[s/2].devnum;
4271 struct md_rdev *rdev2;
4272 if (s&1) {
4273 rdev2 = conf->mirrors[d].replacement;
4274 b = r10_bio->devs[s/2].repl_bio;
4275 } else {
4276 rdev2 = conf->mirrors[d].rdev;
4277 b = r10_bio->devs[s/2].bio;
4278 }
4279 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4280 continue;
4281 b->bi_bdev = rdev2->bdev;
4282 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
4283 b->bi_private = r10_bio;
4284 b->bi_end_io = end_reshape_write;
4285 b->bi_rw = WRITE;
4286 b->bi_flags &= ~(BIO_POOL_MASK - 1);
4287 b->bi_flags |= 1 << BIO_UPTODATE;
4288 b->bi_next = blist;
4289 b->bi_vcnt = 0;
4290 b->bi_idx = 0;
4291 b->bi_size = 0;
4292 blist = b;
4293 }
4294
4295
4296
4297 nr_sectors = 0;
4298 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4299 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4300 int len = (max_sectors - s) << 9;
4301 if (len > PAGE_SIZE)
4302 len = PAGE_SIZE;
4303 for (bio = blist; bio ; bio = bio->bi_next) {
4304 struct bio *bio2;
4305 if (bio_add_page(bio, page, len, 0))
4306 continue;
4307
4308
4309 for (bio2 = blist;
4310 bio2 && bio2 != bio;
4311 bio2 = bio2->bi_next) {
4312
4313 bio2->bi_vcnt--;
4314 bio2->bi_size -= len;
4315 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4316 }
4317 goto bio_full;
4318 }
4319 sector_nr += len >> 9;
4320 nr_sectors += len >> 9;
4321 }
4322bio_full:
4323 r10_bio->sectors = nr_sectors;
4324
4325
4326 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4327 atomic_inc(&r10_bio->remaining);
4328 read_bio->bi_next = NULL;
4329 generic_make_request(read_bio);
4330 sector_nr += nr_sectors;
4331 sectors_done += nr_sectors;
4332 if (sector_nr <= last)
4333 goto read_more;
4334
4335
4336
4337
4338 if (mddev->reshape_backwards)
4339 conf->reshape_progress -= sectors_done;
4340 else
4341 conf->reshape_progress += sectors_done;
4342
4343 return sectors_done;
4344}
4345
4346static void end_reshape_request(struct r10bio *r10_bio);
4347static int handle_reshape_read_error(struct mddev *mddev,
4348 struct r10bio *r10_bio);
4349static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4350{
4351
4352
4353
4354
4355
4356 struct r10conf *conf = mddev->private;
4357 int s;
4358
4359 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4360 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4361
4362 md_done_sync(mddev, r10_bio->sectors, 0);
4363 return;
4364 }
4365
4366
4367
4368
4369 atomic_set(&r10_bio->remaining, 1);
4370 for (s = 0; s < conf->copies*2; s++) {
4371 struct bio *b;
4372 int d = r10_bio->devs[s/2].devnum;
4373 struct md_rdev *rdev;
4374 if (s&1) {
4375 rdev = conf->mirrors[d].replacement;
4376 b = r10_bio->devs[s/2].repl_bio;
4377 } else {
4378 rdev = conf->mirrors[d].rdev;
4379 b = r10_bio->devs[s/2].bio;
4380 }
4381 if (!rdev || test_bit(Faulty, &rdev->flags))
4382 continue;
4383 atomic_inc(&rdev->nr_pending);
4384 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4385 atomic_inc(&r10_bio->remaining);
4386 b->bi_next = NULL;
4387 generic_make_request(b);
4388 }
4389 end_reshape_request(r10_bio);
4390}
4391
4392static void end_reshape(struct r10conf *conf)
4393{
4394 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4395 return;
4396
4397 spin_lock_irq(&conf->device_lock);
4398 conf->prev = conf->geo;
4399 md_finish_reshape(conf->mddev);
4400 smp_wmb();
4401 conf->reshape_progress = MaxSector;
4402 spin_unlock_irq(&conf->device_lock);
4403
4404
4405
4406
4407 if (conf->mddev->queue) {
4408 int stripe = conf->geo.raid_disks *
4409 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4410 stripe /= conf->geo.near_copies;
4411 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4412 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4413 }
4414 conf->fullsync = 0;
4415}
4416
4417
4418static int handle_reshape_read_error(struct mddev *mddev,
4419 struct r10bio *r10_bio)
4420{
4421
4422 int sectors = r10_bio->sectors;
4423 struct r10conf *conf = mddev->private;
4424 struct {
4425 struct r10bio r10_bio;
4426 struct r10dev devs[conf->copies];
4427 } on_stack;
4428 struct r10bio *r10b = &on_stack.r10_bio;
4429 int slot = 0;
4430 int idx = 0;
4431 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4432
4433 r10b->sector = r10_bio->sector;
4434 __raid10_find_phys(&conf->prev, r10b);
4435
4436 while (sectors) {
4437 int s = sectors;
4438 int success = 0;
4439 int first_slot = slot;
4440
4441 if (s > (PAGE_SIZE >> 9))
4442 s = PAGE_SIZE >> 9;
4443
4444 while (!success) {
4445 int d = r10b->devs[slot].devnum;
4446 struct md_rdev *rdev = conf->mirrors[d].rdev;
4447 sector_t addr;
4448 if (rdev == NULL ||
4449 test_bit(Faulty, &rdev->flags) ||
4450 !test_bit(In_sync, &rdev->flags))
4451 goto failed;
4452
4453 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4454 success = sync_page_io(rdev,
4455 addr,
4456 s << 9,
4457 bvec[idx].bv_page,
4458 READ, false);
4459 if (success)
4460 break;
4461 failed:
4462 slot++;
4463 if (slot >= conf->copies)
4464 slot = 0;
4465 if (slot == first_slot)
4466 break;
4467 }
4468 if (!success) {
4469
4470 set_bit(MD_RECOVERY_INTR,
4471 &mddev->recovery);
4472 return -EIO;
4473 }
4474 sectors -= s;
4475 idx++;
4476 }
4477 return 0;
4478}
4479
4480static void end_reshape_write(struct bio *bio, int error)
4481{
4482 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
4483 struct r10bio *r10_bio = bio->bi_private;
4484 struct mddev *mddev = r10_bio->mddev;
4485 struct r10conf *conf = mddev->private;
4486 int d;
4487 int slot;
4488 int repl;
4489 struct md_rdev *rdev = NULL;
4490
4491 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4492 if (repl)
4493 rdev = conf->mirrors[d].replacement;
4494 if (!rdev) {
4495 smp_mb();
4496 rdev = conf->mirrors[d].rdev;
4497 }
4498
4499 if (!uptodate) {
4500
4501 md_error(mddev, rdev);
4502 }
4503
4504 rdev_dec_pending(rdev, mddev);
4505 end_reshape_request(r10_bio);
4506}
4507
4508static void end_reshape_request(struct r10bio *r10_bio)
4509{
4510 if (!atomic_dec_and_test(&r10_bio->remaining))
4511 return;
4512 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4513 bio_put(r10_bio->master_bio);
4514 put_buf(r10_bio);
4515}
4516
4517static void raid10_finish_reshape(struct mddev *mddev)
4518{
4519 struct r10conf *conf = mddev->private;
4520
4521 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4522 return;
4523
4524 if (mddev->delta_disks > 0) {
4525 sector_t size = raid10_size(mddev, 0, 0);
4526 md_set_array_sectors(mddev, size);
4527 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4528 mddev->recovery_cp = mddev->resync_max_sectors;
4529 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4530 }
4531 mddev->resync_max_sectors = size;
4532 set_capacity(mddev->gendisk, mddev->array_sectors);
4533 revalidate_disk(mddev->gendisk);
4534 } else {
4535 int d;
4536 for (d = conf->geo.raid_disks ;
4537 d < conf->geo.raid_disks - mddev->delta_disks;
4538 d++) {
4539 struct md_rdev *rdev = conf->mirrors[d].rdev;
4540 if (rdev)
4541 clear_bit(In_sync, &rdev->flags);
4542 rdev = conf->mirrors[d].replacement;
4543 if (rdev)
4544 clear_bit(In_sync, &rdev->flags);
4545 }
4546 }
4547 mddev->layout = mddev->new_layout;
4548 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4549 mddev->reshape_position = MaxSector;
4550 mddev->delta_disks = 0;
4551 mddev->reshape_backwards = 0;
4552}
4553
4554static struct md_personality raid10_personality =
4555{
4556 .name = "raid10",
4557 .level = 10,
4558 .owner = THIS_MODULE,
4559 .make_request = make_request,
4560 .run = run,
4561 .stop = stop,
4562 .status = status,
4563 .error_handler = error,
4564 .hot_add_disk = raid10_add_disk,
4565 .hot_remove_disk= raid10_remove_disk,
4566 .spare_active = raid10_spare_active,
4567 .sync_request = sync_request,
4568 .quiesce = raid10_quiesce,
4569 .size = raid10_size,
4570 .resize = raid10_resize,
4571 .takeover = raid10_takeover,
4572 .check_reshape = raid10_check_reshape,
4573 .start_reshape = raid10_start_reshape,
4574 .finish_reshape = raid10_finish_reshape,
4575};
4576
4577static int __init raid_init(void)
4578{
4579 return register_md_personality(&raid10_personality);
4580}
4581
4582static void raid_exit(void)
4583{
4584 unregister_md_personality(&raid10_personality);
4585}
4586
4587module_init(raid_init);
4588module_exit(raid_exit);
4589MODULE_LICENSE("GPL");
4590MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
4591MODULE_ALIAS("md-personality-9");
4592MODULE_ALIAS("md-raid10");
4593MODULE_ALIAS("md-level-10");
4594
4595module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
4596