1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21#include <linux/delay.h>
22#include <linux/blkdev.h>
23#include <linux/seq_file.h>
24#include "md.h"
25#include "raid10.h"
26#include "bitmap.h"
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
/*
 * NOTE(review): not referenced in the visible part of this file; presumably
 * the number of r10bio structures the r10bio mempool keeps in reserve --
 * confirm at the place where conf->r10bio_pool is created.
 */
#define NR_RAID10_BIOS 256

/* Defined further down; needed early by the pool allocation callbacks. */
static void unplug_slaves(mddev_t *mddev);

/* Defined further down; needed early by free_r10bio() and put_buf(). */
static void allow_barrier(conf_t *conf);
static void lower_barrier(conf_t *conf);
62
63static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
64{
65 conf_t *conf = data;
66 r10bio_t *r10_bio;
67 int size = offsetof(struct r10bio_s, devs[conf->copies]);
68
69
70 r10_bio = kzalloc(size, gfp_flags);
71 if (!r10_bio && conf->mddev)
72 unplug_slaves(conf->mddev);
73
74 return r10_bio;
75}
76
/*
 * Counterpart of r10bio_pool_alloc(): release an r10bio with kfree().
 * @data is accepted for signature compatibility and is not used.
 */
static void r10bio_pool_free(void *r10_bio, void *data)
{
	kfree(r10_bio);
}
81
82
/* Size in bytes of one resync buffer, and the number of pages that takes. */
#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)

/* NOTE(review): not referenced in the visible code -- confirm its use. */
#define RESYNC_WINDOW (1024*1024)

/* raise_barrier() sleeps until conf->barrier is below this value. */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
89
90
91
92
93
94
95
96
97static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
98{
99 conf_t *conf = data;
100 struct page *page;
101 r10bio_t *r10_bio;
102 struct bio *bio;
103 int i, j;
104 int nalloc;
105
106 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
107 if (!r10_bio) {
108 unplug_slaves(conf->mddev);
109 return NULL;
110 }
111
112 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
113 nalloc = conf->copies;
114 else
115 nalloc = 2;
116
117
118
119
120 for (j = nalloc ; j-- ; ) {
121 bio = bio_alloc(gfp_flags, RESYNC_PAGES);
122 if (!bio)
123 goto out_free_bio;
124 r10_bio->devs[j].bio = bio;
125 }
126
127
128
129
130 for (j = 0 ; j < nalloc; j++) {
131 bio = r10_bio->devs[j].bio;
132 for (i = 0; i < RESYNC_PAGES; i++) {
133 page = alloc_page(gfp_flags);
134 if (unlikely(!page))
135 goto out_free_pages;
136
137 bio->bi_io_vec[i].bv_page = page;
138 }
139 }
140
141 return r10_bio;
142
143out_free_pages:
144 for ( ; i > 0 ; i--)
145 safe_put_page(bio->bi_io_vec[i-1].bv_page);
146 while (j--)
147 for (i = 0; i < RESYNC_PAGES ; i++)
148 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
149 j = -1;
150out_free_bio:
151 while ( ++j < nalloc )
152 bio_put(r10_bio->devs[j].bio);
153 r10bio_pool_free(r10_bio, conf);
154 return NULL;
155}
156
157static void r10buf_pool_free(void *__r10_bio, void *data)
158{
159 int i;
160 conf_t *conf = data;
161 r10bio_t *r10bio = __r10_bio;
162 int j;
163
164 for (j=0; j < conf->copies; j++) {
165 struct bio *bio = r10bio->devs[j].bio;
166 if (bio) {
167 for (i = 0; i < RESYNC_PAGES; i++) {
168 safe_put_page(bio->bi_io_vec[i].bv_page);
169 bio->bi_io_vec[i].bv_page = NULL;
170 }
171 bio_put(bio);
172 }
173 }
174 r10bio_pool_free(r10bio, conf);
175}
176
177static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
178{
179 int i;
180
181 for (i = 0; i < conf->copies; i++) {
182 struct bio **bio = & r10_bio->devs[i].bio;
183 if (*bio && *bio != IO_BLOCKED)
184 bio_put(*bio);
185 *bio = NULL;
186 }
187}
188
189static void free_r10bio(r10bio_t *r10_bio)
190{
191 conf_t *conf = r10_bio->mddev->private;
192
193
194
195
196
197 allow_barrier(conf);
198
199 put_all_bios(conf, r10_bio);
200 mempool_free(r10_bio, conf->r10bio_pool);
201}
202
203static void put_buf(r10bio_t *r10_bio)
204{
205 conf_t *conf = r10_bio->mddev->private;
206
207 mempool_free(r10_bio, conf->r10buf_pool);
208
209 lower_barrier(conf);
210}
211
212static void reschedule_retry(r10bio_t *r10_bio)
213{
214 unsigned long flags;
215 mddev_t *mddev = r10_bio->mddev;
216 conf_t *conf = mddev->private;
217
218 spin_lock_irqsave(&conf->device_lock, flags);
219 list_add(&r10_bio->retry_list, &conf->retry_list);
220 conf->nr_queued ++;
221 spin_unlock_irqrestore(&conf->device_lock, flags);
222
223
224 wake_up(&conf->wait_barrier);
225
226 md_wakeup_thread(mddev->thread);
227}
228
229
230
231
232
233
234static void raid_end_bio_io(r10bio_t *r10_bio)
235{
236 struct bio *bio = r10_bio->master_bio;
237
238 bio_endio(bio,
239 test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
240 free_r10bio(r10_bio);
241}
242
243
244
245
246static inline void update_head_pos(int slot, r10bio_t *r10_bio)
247{
248 conf_t *conf = r10_bio->mddev->private;
249
250 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
251 r10_bio->devs[slot].addr + (r10_bio->sectors);
252}
253
254static void raid10_end_read_request(struct bio *bio, int error)
255{
256 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
257 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
258 int slot, dev;
259 conf_t *conf = r10_bio->mddev->private;
260
261
262 slot = r10_bio->read_slot;
263 dev = r10_bio->devs[slot].devnum;
264
265
266
267 update_head_pos(slot, r10_bio);
268
269 if (uptodate) {
270
271
272
273
274
275
276
277
278
279 set_bit(R10BIO_Uptodate, &r10_bio->state);
280 raid_end_bio_io(r10_bio);
281 } else {
282
283
284
285 char b[BDEVNAME_SIZE];
286 if (printk_ratelimit())
287 printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n",
288 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
289 reschedule_retry(r10_bio);
290 }
291
292 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
293}
294
295static void raid10_end_write_request(struct bio *bio, int error)
296{
297 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
298 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
299 int slot, dev;
300 conf_t *conf = r10_bio->mddev->private;
301
302 for (slot = 0; slot < conf->copies; slot++)
303 if (r10_bio->devs[slot].bio == bio)
304 break;
305 dev = r10_bio->devs[slot].devnum;
306
307
308
309
310 if (!uptodate) {
311 md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
312
313 set_bit(R10BIO_Degraded, &r10_bio->state);
314 } else
315
316
317
318
319
320
321
322
323
324 set_bit(R10BIO_Uptodate, &r10_bio->state);
325
326 update_head_pos(slot, r10_bio);
327
328
329
330
331
332
333 if (atomic_dec_and_test(&r10_bio->remaining)) {
334
335 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
336 r10_bio->sectors,
337 !test_bit(R10BIO_Degraded, &r10_bio->state),
338 0);
339 md_write_end(r10_bio->mddev);
340 raid_end_bio_io(r10_bio);
341 }
342
343 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
344}
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
373{
374 int n,f;
375 sector_t sector;
376 sector_t chunk;
377 sector_t stripe;
378 int dev;
379
380 int slot = 0;
381
382
383 chunk = r10bio->sector >> conf->chunk_shift;
384 sector = r10bio->sector & conf->chunk_mask;
385
386 chunk *= conf->near_copies;
387 stripe = chunk;
388 dev = sector_div(stripe, conf->raid_disks);
389 if (conf->far_offset)
390 stripe *= conf->far_copies;
391
392 sector += stripe << conf->chunk_shift;
393
394
395 for (n=0; n < conf->near_copies; n++) {
396 int d = dev;
397 sector_t s = sector;
398 r10bio->devs[slot].addr = sector;
399 r10bio->devs[slot].devnum = d;
400 slot++;
401
402 for (f = 1; f < conf->far_copies; f++) {
403 d += conf->near_copies;
404 if (d >= conf->raid_disks)
405 d -= conf->raid_disks;
406 s += conf->stride;
407 r10bio->devs[slot].devnum = d;
408 r10bio->devs[slot].addr = s;
409 slot++;
410 }
411 dev++;
412 if (dev >= conf->raid_disks) {
413 dev = 0;
414 sector += (conf->chunk_mask + 1);
415 }
416 }
417 BUG_ON(slot != conf->copies);
418}
419
420static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
421{
422 sector_t offset, chunk, vchunk;
423
424 offset = sector & conf->chunk_mask;
425 if (conf->far_offset) {
426 int fc;
427 chunk = sector >> conf->chunk_shift;
428 fc = sector_div(chunk, conf->far_copies);
429 dev -= fc * conf->near_copies;
430 if (dev < 0)
431 dev += conf->raid_disks;
432 } else {
433 while (sector >= conf->stride) {
434 sector -= conf->stride;
435 if (dev < conf->near_copies)
436 dev += conf->raid_disks - conf->near_copies;
437 else
438 dev -= conf->near_copies;
439 }
440 chunk = sector >> conf->chunk_shift;
441 }
442 vchunk = chunk * conf->raid_disks + dev;
443 sector_div(vchunk, conf->near_copies);
444 return (vchunk << conf->chunk_shift) + offset;
445}
446
447
448
449
450
451
452
453
454
455
456
457static int raid10_mergeable_bvec(struct request_queue *q,
458 struct bvec_merge_data *bvm,
459 struct bio_vec *biovec)
460{
461 mddev_t *mddev = q->queuedata;
462 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
463 int max;
464 unsigned int chunk_sectors = mddev->chunk_sectors;
465 unsigned int bio_sectors = bvm->bi_size >> 9;
466
467 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
468 if (max < 0) max = 0;
469 if (max <= biovec->bv_len && bio_sectors == 0)
470 return biovec->bv_len;
471 else
472 return max;
473}
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494static int read_balance(conf_t *conf, r10bio_t *r10_bio)
495{
496 const unsigned long this_sector = r10_bio->sector;
497 int disk, slot, nslot;
498 const int sectors = r10_bio->sectors;
499 sector_t new_distance, current_distance;
500 mdk_rdev_t *rdev;
501
502 raid10_find_phys(conf, r10_bio);
503 rcu_read_lock();
504
505
506
507
508
509
510 if (conf->mddev->recovery_cp < MaxSector
511 && (this_sector + sectors >= conf->next_resync)) {
512
513 slot = 0;
514 disk = r10_bio->devs[slot].devnum;
515
516 while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
517 r10_bio->devs[slot].bio == IO_BLOCKED ||
518 !test_bit(In_sync, &rdev->flags)) {
519 slot++;
520 if (slot == conf->copies) {
521 slot = 0;
522 disk = -1;
523 break;
524 }
525 disk = r10_bio->devs[slot].devnum;
526 }
527 goto rb_out;
528 }
529
530
531
532 slot = 0;
533 disk = r10_bio->devs[slot].devnum;
534 while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
535 r10_bio->devs[slot].bio == IO_BLOCKED ||
536 !test_bit(In_sync, &rdev->flags)) {
537 slot ++;
538 if (slot == conf->copies) {
539 disk = -1;
540 goto rb_out;
541 }
542 disk = r10_bio->devs[slot].devnum;
543 }
544
545
546 current_distance = abs(r10_bio->devs[slot].addr -
547 conf->mirrors[disk].head_position);
548
549
550
551
552 for (nslot = slot; nslot < conf->copies; nslot++) {
553 int ndisk = r10_bio->devs[nslot].devnum;
554
555
556 if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
557 r10_bio->devs[nslot].bio == IO_BLOCKED ||
558 !test_bit(In_sync, &rdev->flags))
559 continue;
560
561
562
563
564
565 if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
566 disk = ndisk;
567 slot = nslot;
568 break;
569 }
570
571
572 if (conf->far_copies > 1)
573 new_distance = r10_bio->devs[nslot].addr;
574 else
575 new_distance = abs(r10_bio->devs[nslot].addr -
576 conf->mirrors[ndisk].head_position);
577 if (new_distance < current_distance) {
578 current_distance = new_distance;
579 disk = ndisk;
580 slot = nslot;
581 }
582 }
583
584rb_out:
585 r10_bio->read_slot = slot;
586
587
588 if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
589 atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
590 else
591 disk = -1;
592 rcu_read_unlock();
593
594 return disk;
595}
596
597static void unplug_slaves(mddev_t *mddev)
598{
599 conf_t *conf = mddev->private;
600 int i;
601
602 rcu_read_lock();
603 for (i=0; i<mddev->raid_disks; i++) {
604 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
605 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
606 struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
607
608 atomic_inc(&rdev->nr_pending);
609 rcu_read_unlock();
610
611 blk_unplug(r_queue);
612
613 rdev_dec_pending(rdev, mddev);
614 rcu_read_lock();
615 }
616 }
617 rcu_read_unlock();
618}
619
620static void raid10_unplug(struct request_queue *q)
621{
622 mddev_t *mddev = q->queuedata;
623
624 unplug_slaves(q->queuedata);
625 md_wakeup_thread(mddev->thread);
626}
627
628static int raid10_congested(void *data, int bits)
629{
630 mddev_t *mddev = data;
631 conf_t *conf = mddev->private;
632 int i, ret = 0;
633
634 if (mddev_congested(mddev, bits))
635 return 1;
636 rcu_read_lock();
637 for (i = 0; i < mddev->raid_disks && ret == 0; i++) {
638 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
639 if (rdev && !test_bit(Faulty, &rdev->flags)) {
640 struct request_queue *q = bdev_get_queue(rdev->bdev);
641
642 ret |= bdi_congested(&q->backing_dev_info, bits);
643 }
644 }
645 rcu_read_unlock();
646 return ret;
647}
648
649static int flush_pending_writes(conf_t *conf)
650{
651
652
653
654
655 int rv = 0;
656
657 spin_lock_irq(&conf->device_lock);
658
659 if (conf->pending_bio_list.head) {
660 struct bio *bio;
661 bio = bio_list_get(&conf->pending_bio_list);
662 blk_remove_plug(conf->mddev->queue);
663 spin_unlock_irq(&conf->device_lock);
664
665
666 bitmap_unplug(conf->mddev->bitmap);
667
668 while (bio) {
669 struct bio *next = bio->bi_next;
670 bio->bi_next = NULL;
671 generic_make_request(bio);
672 bio = next;
673 }
674 rv = 1;
675 } else
676 spin_unlock_irq(&conf->device_lock);
677 return rv;
678}
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701static void raise_barrier(conf_t *conf, int force)
702{
703 BUG_ON(force && !conf->barrier);
704 spin_lock_irq(&conf->resync_lock);
705
706
707 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
708 conf->resync_lock,
709 raid10_unplug(conf->mddev->queue));
710
711
712 conf->barrier++;
713
714
715 wait_event_lock_irq(conf->wait_barrier,
716 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
717 conf->resync_lock,
718 raid10_unplug(conf->mddev->queue));
719
720 spin_unlock_irq(&conf->resync_lock);
721}
722
723static void lower_barrier(conf_t *conf)
724{
725 unsigned long flags;
726 spin_lock_irqsave(&conf->resync_lock, flags);
727 conf->barrier--;
728 spin_unlock_irqrestore(&conf->resync_lock, flags);
729 wake_up(&conf->wait_barrier);
730}
731
732static void wait_barrier(conf_t *conf)
733{
734 spin_lock_irq(&conf->resync_lock);
735 if (conf->barrier) {
736 conf->nr_waiting++;
737 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
738 conf->resync_lock,
739 raid10_unplug(conf->mddev->queue));
740 conf->nr_waiting--;
741 }
742 conf->nr_pending++;
743 spin_unlock_irq(&conf->resync_lock);
744}
745
746static void allow_barrier(conf_t *conf)
747{
748 unsigned long flags;
749 spin_lock_irqsave(&conf->resync_lock, flags);
750 conf->nr_pending--;
751 spin_unlock_irqrestore(&conf->resync_lock, flags);
752 wake_up(&conf->wait_barrier);
753}
754
755static void freeze_array(conf_t *conf)
756{
757
758
759
760
761
762
763
764
765
766
767
768
769 spin_lock_irq(&conf->resync_lock);
770 conf->barrier++;
771 conf->nr_waiting++;
772 wait_event_lock_irq(conf->wait_barrier,
773 conf->nr_pending == conf->nr_queued+1,
774 conf->resync_lock,
775 ({ flush_pending_writes(conf);
776 raid10_unplug(conf->mddev->queue); }));
777 spin_unlock_irq(&conf->resync_lock);
778}
779
780static void unfreeze_array(conf_t *conf)
781{
782
783 spin_lock_irq(&conf->resync_lock);
784 conf->barrier--;
785 conf->nr_waiting--;
786 wake_up(&conf->wait_barrier);
787 spin_unlock_irq(&conf->resync_lock);
788}
789
790static int make_request(struct request_queue *q, struct bio * bio)
791{
792 mddev_t *mddev = q->queuedata;
793 conf_t *conf = mddev->private;
794 mirror_info_t *mirror;
795 r10bio_t *r10_bio;
796 struct bio *read_bio;
797 int cpu;
798 int i;
799 int chunk_sects = conf->chunk_mask + 1;
800 const int rw = bio_data_dir(bio);
801 const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
802 struct bio_list bl;
803 unsigned long flags;
804 mdk_rdev_t *blocked_rdev;
805
806 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
807 bio_endio(bio, -EOPNOTSUPP);
808 return 0;
809 }
810
811
812
813
814 if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
815 > chunk_sects &&
816 conf->near_copies < conf->raid_disks)) {
817 struct bio_pair *bp;
818
819 if (bio->bi_vcnt != 1 ||
820 bio->bi_idx != 0)
821 goto bad_map;
822
823
824
825 bp = bio_split(bio,
826 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
827 if (make_request(q, &bp->bio1))
828 generic_make_request(&bp->bio1);
829 if (make_request(q, &bp->bio2))
830 generic_make_request(&bp->bio2);
831
832 bio_pair_release(bp);
833 return 0;
834 bad_map:
835 printk("raid10_make_request bug: can't convert block across chunks"
836 " or bigger than %dk %llu %d\n", chunk_sects/2,
837 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
838
839 bio_io_error(bio);
840 return 0;
841 }
842
843 md_write_start(mddev, bio);
844
845
846
847
848
849
850 wait_barrier(conf);
851
852 cpu = part_stat_lock();
853 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
854 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
855 bio_sectors(bio));
856 part_stat_unlock();
857
858 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
859
860 r10_bio->master_bio = bio;
861 r10_bio->sectors = bio->bi_size >> 9;
862
863 r10_bio->mddev = mddev;
864 r10_bio->sector = bio->bi_sector;
865 r10_bio->state = 0;
866
867 if (rw == READ) {
868
869
870
871 int disk = read_balance(conf, r10_bio);
872 int slot = r10_bio->read_slot;
873 if (disk < 0) {
874 raid_end_bio_io(r10_bio);
875 return 0;
876 }
877 mirror = conf->mirrors + disk;
878
879 read_bio = bio_clone(bio, GFP_NOIO);
880
881 r10_bio->devs[slot].bio = read_bio;
882
883 read_bio->bi_sector = r10_bio->devs[slot].addr +
884 mirror->rdev->data_offset;
885 read_bio->bi_bdev = mirror->rdev->bdev;
886 read_bio->bi_end_io = raid10_end_read_request;
887 read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
888 read_bio->bi_private = r10_bio;
889
890 generic_make_request(read_bio);
891 return 0;
892 }
893
894
895
896
897
898
899
900
901 raid10_find_phys(conf, r10_bio);
902 retry_write:
903 blocked_rdev = NULL;
904 rcu_read_lock();
905 for (i = 0; i < conf->copies; i++) {
906 int d = r10_bio->devs[i].devnum;
907 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
908 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
909 atomic_inc(&rdev->nr_pending);
910 blocked_rdev = rdev;
911 break;
912 }
913 if (rdev && !test_bit(Faulty, &rdev->flags)) {
914 atomic_inc(&rdev->nr_pending);
915 r10_bio->devs[i].bio = bio;
916 } else {
917 r10_bio->devs[i].bio = NULL;
918 set_bit(R10BIO_Degraded, &r10_bio->state);
919 }
920 }
921 rcu_read_unlock();
922
923 if (unlikely(blocked_rdev)) {
924
925 int j;
926 int d;
927
928 for (j = 0; j < i; j++)
929 if (r10_bio->devs[j].bio) {
930 d = r10_bio->devs[j].devnum;
931 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
932 }
933 allow_barrier(conf);
934 md_wait_for_blocked_rdev(blocked_rdev, mddev);
935 wait_barrier(conf);
936 goto retry_write;
937 }
938
939 atomic_set(&r10_bio->remaining, 0);
940
941 bio_list_init(&bl);
942 for (i = 0; i < conf->copies; i++) {
943 struct bio *mbio;
944 int d = r10_bio->devs[i].devnum;
945 if (!r10_bio->devs[i].bio)
946 continue;
947
948 mbio = bio_clone(bio, GFP_NOIO);
949 r10_bio->devs[i].bio = mbio;
950
951 mbio->bi_sector = r10_bio->devs[i].addr+
952 conf->mirrors[d].rdev->data_offset;
953 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
954 mbio->bi_end_io = raid10_end_write_request;
955 mbio->bi_rw = WRITE | (do_sync << BIO_RW_SYNCIO);
956 mbio->bi_private = r10_bio;
957
958 atomic_inc(&r10_bio->remaining);
959 bio_list_add(&bl, mbio);
960 }
961
962 if (unlikely(!atomic_read(&r10_bio->remaining))) {
963
964 md_write_end(mddev);
965 raid_end_bio_io(r10_bio);
966 return 0;
967 }
968
969 bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
970 spin_lock_irqsave(&conf->device_lock, flags);
971 bio_list_merge(&conf->pending_bio_list, &bl);
972 blk_plug_device(mddev->queue);
973 spin_unlock_irqrestore(&conf->device_lock, flags);
974
975
976 wake_up(&conf->wait_barrier);
977
978 if (do_sync)
979 md_wakeup_thread(mddev->thread);
980
981 return 0;
982}
983
984static void status(struct seq_file *seq, mddev_t *mddev)
985{
986 conf_t *conf = mddev->private;
987 int i;
988
989 if (conf->near_copies < conf->raid_disks)
990 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
991 if (conf->near_copies > 1)
992 seq_printf(seq, " %d near-copies", conf->near_copies);
993 if (conf->far_copies > 1) {
994 if (conf->far_offset)
995 seq_printf(seq, " %d offset-copies", conf->far_copies);
996 else
997 seq_printf(seq, " %d far-copies", conf->far_copies);
998 }
999 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
1000 conf->raid_disks - mddev->degraded);
1001 for (i = 0; i < conf->raid_disks; i++)
1002 seq_printf(seq, "%s",
1003 conf->mirrors[i].rdev &&
1004 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
1005 seq_printf(seq, "]");
1006}
1007
1008static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1009{
1010 char b[BDEVNAME_SIZE];
1011 conf_t *conf = mddev->private;
1012
1013
1014
1015
1016
1017
1018
1019 if (test_bit(In_sync, &rdev->flags)
1020 && conf->raid_disks-mddev->degraded == 1)
1021
1022
1023
1024
1025
1026
1027
1028 return;
1029 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1030 unsigned long flags;
1031 spin_lock_irqsave(&conf->device_lock, flags);
1032 mddev->degraded++;
1033 spin_unlock_irqrestore(&conf->device_lock, flags);
1034
1035
1036
1037 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1038 }
1039 set_bit(Faulty, &rdev->flags);
1040 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1041 printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n"
1042 "raid10: Operation continuing on %d devices.\n",
1043 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
1044}
1045
1046static void print_conf(conf_t *conf)
1047{
1048 int i;
1049 mirror_info_t *tmp;
1050
1051 printk("RAID10 conf printout:\n");
1052 if (!conf) {
1053 printk("(!conf)\n");
1054 return;
1055 }
1056 printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1057 conf->raid_disks);
1058
1059 for (i = 0; i < conf->raid_disks; i++) {
1060 char b[BDEVNAME_SIZE];
1061 tmp = conf->mirrors + i;
1062 if (tmp->rdev)
1063 printk(" disk %d, wo:%d, o:%d, dev:%s\n",
1064 i, !test_bit(In_sync, &tmp->rdev->flags),
1065 !test_bit(Faulty, &tmp->rdev->flags),
1066 bdevname(tmp->rdev->bdev,b));
1067 }
1068}
1069
1070static void close_sync(conf_t *conf)
1071{
1072 wait_barrier(conf);
1073 allow_barrier(conf);
1074
1075 mempool_destroy(conf->r10buf_pool);
1076 conf->r10buf_pool = NULL;
1077}
1078
1079
1080
1081
1082static int enough(conf_t *conf)
1083{
1084 int first = 0;
1085
1086 do {
1087 int n = conf->copies;
1088 int cnt = 0;
1089 while (n--) {
1090 if (conf->mirrors[first].rdev)
1091 cnt++;
1092 first = (first+1) % conf->raid_disks;
1093 }
1094 if (cnt == 0)
1095 return 0;
1096 } while (first != 0);
1097 return 1;
1098}
1099
1100static int raid10_spare_active(mddev_t *mddev)
1101{
1102 int i;
1103 conf_t *conf = mddev->private;
1104 mirror_info_t *tmp;
1105
1106
1107
1108
1109
1110 for (i = 0; i < conf->raid_disks; i++) {
1111 tmp = conf->mirrors + i;
1112 if (tmp->rdev
1113 && !test_bit(Faulty, &tmp->rdev->flags)
1114 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1115 unsigned long flags;
1116 spin_lock_irqsave(&conf->device_lock, flags);
1117 mddev->degraded--;
1118 spin_unlock_irqrestore(&conf->device_lock, flags);
1119 }
1120 }
1121
1122 print_conf(conf);
1123 return 0;
1124}
1125
1126
1127static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1128{
1129 conf_t *conf = mddev->private;
1130 int err = -EEXIST;
1131 int mirror;
1132 mirror_info_t *p;
1133 int first = 0;
1134 int last = mddev->raid_disks - 1;
1135
1136 if (mddev->recovery_cp < MaxSector)
1137
1138
1139
1140 return -EBUSY;
1141 if (!enough(conf))
1142 return -EINVAL;
1143
1144 if (rdev->raid_disk >= 0)
1145 first = last = rdev->raid_disk;
1146
1147 if (rdev->saved_raid_disk >= 0 &&
1148 rdev->saved_raid_disk >= first &&
1149 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1150 mirror = rdev->saved_raid_disk;
1151 else
1152 mirror = first;
1153 for ( ; mirror <= last ; mirror++)
1154 if ( !(p=conf->mirrors+mirror)->rdev) {
1155
1156 disk_stack_limits(mddev->gendisk, rdev->bdev,
1157 rdev->data_offset << 9);
1158
1159
1160
1161
1162 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
1163 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
1164 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
1165
1166 p->head_position = 0;
1167 rdev->raid_disk = mirror;
1168 err = 0;
1169 if (rdev->saved_raid_disk != mirror)
1170 conf->fullsync = 1;
1171 rcu_assign_pointer(p->rdev, rdev);
1172 break;
1173 }
1174
1175 md_integrity_add_rdev(rdev, mddev);
1176 print_conf(conf);
1177 return err;
1178}
1179
1180static int raid10_remove_disk(mddev_t *mddev, int number)
1181{
1182 conf_t *conf = mddev->private;
1183 int err = 0;
1184 mdk_rdev_t *rdev;
1185 mirror_info_t *p = conf->mirrors+ number;
1186
1187 print_conf(conf);
1188 rdev = p->rdev;
1189 if (rdev) {
1190 if (test_bit(In_sync, &rdev->flags) ||
1191 atomic_read(&rdev->nr_pending)) {
1192 err = -EBUSY;
1193 goto abort;
1194 }
1195
1196
1197
1198 if (!test_bit(Faulty, &rdev->flags) &&
1199 enough(conf)) {
1200 err = -EBUSY;
1201 goto abort;
1202 }
1203 p->rdev = NULL;
1204 synchronize_rcu();
1205 if (atomic_read(&rdev->nr_pending)) {
1206
1207 err = -EBUSY;
1208 p->rdev = rdev;
1209 goto abort;
1210 }
1211 md_integrity_register(mddev);
1212 }
1213abort:
1214
1215 print_conf(conf);
1216 return err;
1217}
1218
1219
1220static void end_sync_read(struct bio *bio, int error)
1221{
1222 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
1223 conf_t *conf = r10_bio->mddev->private;
1224 int i,d;
1225
1226 for (i=0; i<conf->copies; i++)
1227 if (r10_bio->devs[i].bio == bio)
1228 break;
1229 BUG_ON(i == conf->copies);
1230 update_head_pos(i, r10_bio);
1231 d = r10_bio->devs[i].devnum;
1232
1233 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1234 set_bit(R10BIO_Uptodate, &r10_bio->state);
1235 else {
1236 atomic_add(r10_bio->sectors,
1237 &conf->mirrors[d].rdev->corrected_errors);
1238 if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
1239 md_error(r10_bio->mddev,
1240 conf->mirrors[d].rdev);
1241 }
1242
1243
1244
1245
1246 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1247 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1248 atomic_dec_and_test(&r10_bio->remaining)) {
1249
1250
1251
1252 reschedule_retry(r10_bio);
1253 }
1254}
1255
1256static void end_sync_write(struct bio *bio, int error)
1257{
1258 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1259 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
1260 mddev_t *mddev = r10_bio->mddev;
1261 conf_t *conf = mddev->private;
1262 int i,d;
1263
1264 for (i = 0; i < conf->copies; i++)
1265 if (r10_bio->devs[i].bio == bio)
1266 break;
1267 d = r10_bio->devs[i].devnum;
1268
1269 if (!uptodate)
1270 md_error(mddev, conf->mirrors[d].rdev);
1271
1272 update_head_pos(i, r10_bio);
1273
1274 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1275 while (atomic_dec_and_test(&r10_bio->remaining)) {
1276 if (r10_bio->master_bio == NULL) {
1277
1278 sector_t s = r10_bio->sectors;
1279 put_buf(r10_bio);
1280 md_done_sync(mddev, s, 1);
1281 break;
1282 } else {
1283 r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
1284 put_buf(r10_bio);
1285 r10_bio = r10_bio2;
1286 }
1287 }
1288}
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1307{
1308 conf_t *conf = mddev->private;
1309 int i, first;
1310 struct bio *tbio, *fbio;
1311
1312 atomic_set(&r10_bio->remaining, 1);
1313
1314
1315 for (i=0; i<conf->copies; i++)
1316 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
1317 break;
1318
1319 if (i == conf->copies)
1320 goto done;
1321
1322 first = i;
1323 fbio = r10_bio->devs[i].bio;
1324
1325
1326 for (i=0 ; i < conf->copies ; i++) {
1327 int j, d;
1328 int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
1329
1330 tbio = r10_bio->devs[i].bio;
1331
1332 if (tbio->bi_end_io != end_sync_read)
1333 continue;
1334 if (i == first)
1335 continue;
1336 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
1337
1338
1339
1340
1341 for (j = 0; j < vcnt; j++)
1342 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
1343 page_address(tbio->bi_io_vec[j].bv_page),
1344 PAGE_SIZE))
1345 break;
1346 if (j == vcnt)
1347 continue;
1348 mddev->resync_mismatches += r10_bio->sectors;
1349 }
1350 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1351
1352 continue;
1353
1354
1355
1356
1357 tbio->bi_vcnt = vcnt;
1358 tbio->bi_size = r10_bio->sectors << 9;
1359 tbio->bi_idx = 0;
1360 tbio->bi_phys_segments = 0;
1361 tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1362 tbio->bi_flags |= 1 << BIO_UPTODATE;
1363 tbio->bi_next = NULL;
1364 tbio->bi_rw = WRITE;
1365 tbio->bi_private = r10_bio;
1366 tbio->bi_sector = r10_bio->devs[i].addr;
1367
1368 for (j=0; j < vcnt ; j++) {
1369 tbio->bi_io_vec[j].bv_offset = 0;
1370 tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
1371
1372 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
1373 page_address(fbio->bi_io_vec[j].bv_page),
1374 PAGE_SIZE);
1375 }
1376 tbio->bi_end_io = end_sync_write;
1377
1378 d = r10_bio->devs[i].devnum;
1379 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1380 atomic_inc(&r10_bio->remaining);
1381 md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
1382
1383 tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
1384 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1385 generic_make_request(tbio);
1386 }
1387
1388done:
1389 if (atomic_dec_and_test(&r10_bio->remaining)) {
1390 md_done_sync(mddev, r10_bio->sectors, 1);
1391 put_buf(r10_bio);
1392 }
1393}
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1407{
1408 conf_t *conf = mddev->private;
1409 int i, d;
1410 struct bio *bio, *wbio;
1411
1412
1413
1414
1415
1416 bio = r10_bio->devs[0].bio;
1417 wbio = r10_bio->devs[1].bio;
1418 for (i=0; i < wbio->bi_vcnt; i++) {
1419 struct page *p = bio->bi_io_vec[i].bv_page;
1420 bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
1421 wbio->bi_io_vec[i].bv_page = p;
1422 }
1423 d = r10_bio->devs[1].devnum;
1424
1425 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1426 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
1427 if (test_bit(R10BIO_Uptodate, &r10_bio->state))
1428 generic_make_request(wbio);
1429 else
1430 bio_endio(wbio, -EIO);
1431}
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1443{
1444 int sect = 0;
1445 int sectors = r10_bio->sectors;
1446 mdk_rdev_t*rdev;
1447 while(sectors) {
1448 int s = sectors;
1449 int sl = r10_bio->read_slot;
1450 int success = 0;
1451 int start;
1452
1453 if (s > (PAGE_SIZE>>9))
1454 s = PAGE_SIZE >> 9;
1455
1456 rcu_read_lock();
1457 do {
1458 int d = r10_bio->devs[sl].devnum;
1459 rdev = rcu_dereference(conf->mirrors[d].rdev);
1460 if (rdev &&
1461 test_bit(In_sync, &rdev->flags)) {
1462 atomic_inc(&rdev->nr_pending);
1463 rcu_read_unlock();
1464 success = sync_page_io(rdev->bdev,
1465 r10_bio->devs[sl].addr +
1466 sect + rdev->data_offset,
1467 s<<9,
1468 conf->tmppage, READ);
1469 rdev_dec_pending(rdev, mddev);
1470 rcu_read_lock();
1471 if (success)
1472 break;
1473 }
1474 sl++;
1475 if (sl == conf->copies)
1476 sl = 0;
1477 } while (!success && sl != r10_bio->read_slot);
1478 rcu_read_unlock();
1479
1480 if (!success) {
1481
1482 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
1483 md_error(mddev, conf->mirrors[dn].rdev);
1484 break;
1485 }
1486
1487 start = sl;
1488
1489 rcu_read_lock();
1490 while (sl != r10_bio->read_slot) {
1491 int d;
1492 if (sl==0)
1493 sl = conf->copies;
1494 sl--;
1495 d = r10_bio->devs[sl].devnum;
1496 rdev = rcu_dereference(conf->mirrors[d].rdev);
1497 if (rdev &&
1498 test_bit(In_sync, &rdev->flags)) {
1499 atomic_inc(&rdev->nr_pending);
1500 rcu_read_unlock();
1501 atomic_add(s, &rdev->corrected_errors);
1502 if (sync_page_io(rdev->bdev,
1503 r10_bio->devs[sl].addr +
1504 sect + rdev->data_offset,
1505 s<<9, conf->tmppage, WRITE)
1506 == 0)
1507
1508 md_error(mddev, rdev);
1509 rdev_dec_pending(rdev, mddev);
1510 rcu_read_lock();
1511 }
1512 }
1513 sl = start;
1514 while (sl != r10_bio->read_slot) {
1515 int d;
1516 if (sl==0)
1517 sl = conf->copies;
1518 sl--;
1519 d = r10_bio->devs[sl].devnum;
1520 rdev = rcu_dereference(conf->mirrors[d].rdev);
1521 if (rdev &&
1522 test_bit(In_sync, &rdev->flags)) {
1523 char b[BDEVNAME_SIZE];
1524 atomic_inc(&rdev->nr_pending);
1525 rcu_read_unlock();
1526 if (sync_page_io(rdev->bdev,
1527 r10_bio->devs[sl].addr +
1528 sect + rdev->data_offset,
1529 s<<9, conf->tmppage, READ) == 0)
1530
1531 md_error(mddev, rdev);
1532 else
1533 printk(KERN_INFO
1534 "raid10:%s: read error corrected"
1535 " (%d sectors at %llu on %s)\n",
1536 mdname(mddev), s,
1537 (unsigned long long)(sect+
1538 rdev->data_offset),
1539 bdevname(rdev->bdev, b));
1540
1541 rdev_dec_pending(rdev, mddev);
1542 rcu_read_lock();
1543 }
1544 }
1545 rcu_read_unlock();
1546
1547 sectors -= s;
1548 sect += s;
1549 }
1550}
1551
1552static void raid10d(mddev_t *mddev)
1553{
1554 r10bio_t *r10_bio;
1555 struct bio *bio;
1556 unsigned long flags;
1557 conf_t *conf = mddev->private;
1558 struct list_head *head = &conf->retry_list;
1559 int unplug=0;
1560 mdk_rdev_t *rdev;
1561
1562 md_check_recovery(mddev);
1563
1564 for (;;) {
1565 char b[BDEVNAME_SIZE];
1566
1567 unplug += flush_pending_writes(conf);
1568
1569 spin_lock_irqsave(&conf->device_lock, flags);
1570 if (list_empty(head)) {
1571 spin_unlock_irqrestore(&conf->device_lock, flags);
1572 break;
1573 }
1574 r10_bio = list_entry(head->prev, r10bio_t, retry_list);
1575 list_del(head->prev);
1576 conf->nr_queued--;
1577 spin_unlock_irqrestore(&conf->device_lock, flags);
1578
1579 mddev = r10_bio->mddev;
1580 conf = mddev->private;
1581 if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
1582 sync_request_write(mddev, r10_bio);
1583 unplug = 1;
1584 } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
1585 recovery_request_write(mddev, r10_bio);
1586 unplug = 1;
1587 } else {
1588 int mirror;
1589
1590
1591
1592
1593
1594
1595
1596
1597 if (mddev->ro == 0) {
1598 freeze_array(conf);
1599 fix_read_error(conf, mddev, r10_bio);
1600 unfreeze_array(conf);
1601 }
1602
1603 bio = r10_bio->devs[r10_bio->read_slot].bio;
1604 r10_bio->devs[r10_bio->read_slot].bio =
1605 mddev->ro ? IO_BLOCKED : NULL;
1606 mirror = read_balance(conf, r10_bio);
1607 if (mirror == -1) {
1608 printk(KERN_ALERT "raid10: %s: unrecoverable I/O"
1609 " read error for block %llu\n",
1610 bdevname(bio->bi_bdev,b),
1611 (unsigned long long)r10_bio->sector);
1612 raid_end_bio_io(r10_bio);
1613 bio_put(bio);
1614 } else {
1615 const bool do_sync = bio_rw_flagged(r10_bio->master_bio, BIO_RW_SYNCIO);
1616 bio_put(bio);
1617 rdev = conf->mirrors[mirror].rdev;
1618 if (printk_ratelimit())
1619 printk(KERN_ERR "raid10: %s: redirecting sector %llu to"
1620 " another mirror\n",
1621 bdevname(rdev->bdev,b),
1622 (unsigned long long)r10_bio->sector);
1623 bio = bio_clone(r10_bio->master_bio, GFP_NOIO);
1624 r10_bio->devs[r10_bio->read_slot].bio = bio;
1625 bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
1626 + rdev->data_offset;
1627 bio->bi_bdev = rdev->bdev;
1628 bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
1629 bio->bi_private = r10_bio;
1630 bio->bi_end_io = raid10_end_read_request;
1631 unplug = 1;
1632 generic_make_request(bio);
1633 }
1634 }
1635 cond_resched();
1636 }
1637 if (unplug)
1638 unplug_slaves(mddev);
1639}
1640
1641
1642static int init_resync(conf_t *conf)
1643{
1644 int buffs;
1645
1646 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
1647 BUG_ON(conf->r10buf_pool);
1648 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
1649 if (!conf->r10buf_pool)
1650 return -ENOMEM;
1651 conf->next_resync = 0;
1652 return 0;
1653}
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1688{
1689 conf_t *conf = mddev->private;
1690 r10bio_t *r10_bio;
1691 struct bio *biolist = NULL, *bio;
1692 sector_t max_sector, nr_sectors;
1693 int disk;
1694 int i;
1695 int max_sync;
1696 int sync_blocks;
1697
1698 sector_t sectors_skipped = 0;
1699 int chunks_skipped = 0;
1700
1701 if (!conf->r10buf_pool)
1702 if (init_resync(conf))
1703 return 0;
1704
1705 skipped:
1706 max_sector = mddev->dev_sectors;
1707 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1708 max_sector = mddev->resync_max_sectors;
1709 if (sector_nr >= max_sector) {
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719 if (mddev->curr_resync < max_sector) {
1720 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1721 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1722 &sync_blocks, 1);
1723 else for (i=0; i<conf->raid_disks; i++) {
1724 sector_t sect =
1725 raid10_find_virt(conf, mddev->curr_resync, i);
1726 bitmap_end_sync(mddev->bitmap, sect,
1727 &sync_blocks, 1);
1728 }
1729 } else
1730 conf->fullsync = 0;
1731
1732 bitmap_close_sync(mddev->bitmap);
1733 close_sync(conf);
1734 *skipped = 1;
1735 return sectors_skipped;
1736 }
1737 if (chunks_skipped >= conf->raid_disks) {
1738
1739
1740
1741 *skipped = 1;
1742 return (max_sector - sector_nr) + sectors_skipped;
1743 }
1744
1745 if (max_sector > mddev->resync_max)
1746 max_sector = mddev->resync_max;
1747
1748
1749
1750
1751 if (conf->near_copies < conf->raid_disks &&
1752 max_sector > (sector_nr | conf->chunk_mask))
1753 max_sector = (sector_nr | conf->chunk_mask) + 1;
1754
1755
1756
1757
1758 if (!go_faster && conf->nr_waiting)
1759 msleep_interruptible(1000);
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
1777 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1778
1779 int j, k;
1780 r10_bio = NULL;
1781
1782 for (i=0 ; i<conf->raid_disks; i++)
1783 if (conf->mirrors[i].rdev &&
1784 !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
1785 int still_degraded = 0;
1786
1787 r10bio_t *rb2 = r10_bio;
1788 sector_t sect = raid10_find_virt(conf, sector_nr, i);
1789 int must_sync;
1790
1791
1792
1793 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1794 &sync_blocks, 1);
1795 if (sync_blocks < max_sync)
1796 max_sync = sync_blocks;
1797 if (!must_sync &&
1798 !conf->fullsync) {
1799
1800
1801
1802 chunks_skipped = -1;
1803 continue;
1804 }
1805
1806 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1807 raise_barrier(conf, rb2 != NULL);
1808 atomic_set(&r10_bio->remaining, 0);
1809
1810 r10_bio->master_bio = (struct bio*)rb2;
1811 if (rb2)
1812 atomic_inc(&rb2->remaining);
1813 r10_bio->mddev = mddev;
1814 set_bit(R10BIO_IsRecover, &r10_bio->state);
1815 r10_bio->sector = sect;
1816
1817 raid10_find_phys(conf, r10_bio);
1818
1819
1820
1821
1822 for (j=0; j<conf->raid_disks; j++)
1823 if (conf->mirrors[j].rdev == NULL ||
1824 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
1825 still_degraded = 1;
1826 break;
1827 }
1828
1829 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1830 &sync_blocks, still_degraded);
1831
1832 for (j=0; j<conf->copies;j++) {
1833 int d = r10_bio->devs[j].devnum;
1834 if (conf->mirrors[d].rdev &&
1835 test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
1836
1837 bio = r10_bio->devs[0].bio;
1838 bio->bi_next = biolist;
1839 biolist = bio;
1840 bio->bi_private = r10_bio;
1841 bio->bi_end_io = end_sync_read;
1842 bio->bi_rw = READ;
1843 bio->bi_sector = r10_bio->devs[j].addr +
1844 conf->mirrors[d].rdev->data_offset;
1845 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1846 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1847 atomic_inc(&r10_bio->remaining);
1848
1849
1850 for (k=0; k<conf->copies; k++)
1851 if (r10_bio->devs[k].devnum == i)
1852 break;
1853 BUG_ON(k == conf->copies);
1854 bio = r10_bio->devs[1].bio;
1855 bio->bi_next = biolist;
1856 biolist = bio;
1857 bio->bi_private = r10_bio;
1858 bio->bi_end_io = end_sync_write;
1859 bio->bi_rw = WRITE;
1860 bio->bi_sector = r10_bio->devs[k].addr +
1861 conf->mirrors[i].rdev->data_offset;
1862 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1863
1864 r10_bio->devs[0].devnum = d;
1865 r10_bio->devs[1].devnum = i;
1866
1867 break;
1868 }
1869 }
1870 if (j == conf->copies) {
1871
1872 put_buf(r10_bio);
1873 if (rb2)
1874 atomic_dec(&rb2->remaining);
1875 r10_bio = rb2;
1876 if (!test_and_set_bit(MD_RECOVERY_INTR,
1877 &mddev->recovery))
1878 printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n",
1879 mdname(mddev));
1880 break;
1881 }
1882 }
1883 if (biolist == NULL) {
1884 while (r10_bio) {
1885 r10bio_t *rb2 = r10_bio;
1886 r10_bio = (r10bio_t*) rb2->master_bio;
1887 rb2->master_bio = NULL;
1888 put_buf(rb2);
1889 }
1890 goto giveup;
1891 }
1892 } else {
1893
1894 int count = 0;
1895
1896 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
1897
1898 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
1899 &sync_blocks, mddev->degraded) &&
1900 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1901
1902 *skipped = 1;
1903 return sync_blocks + sectors_skipped;
1904 }
1905 if (sync_blocks < max_sync)
1906 max_sync = sync_blocks;
1907 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1908
1909 r10_bio->mddev = mddev;
1910 atomic_set(&r10_bio->remaining, 0);
1911 raise_barrier(conf, 0);
1912 conf->next_resync = sector_nr;
1913
1914 r10_bio->master_bio = NULL;
1915 r10_bio->sector = sector_nr;
1916 set_bit(R10BIO_IsSync, &r10_bio->state);
1917 raid10_find_phys(conf, r10_bio);
1918 r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1;
1919
1920 for (i=0; i<conf->copies; i++) {
1921 int d = r10_bio->devs[i].devnum;
1922 bio = r10_bio->devs[i].bio;
1923 bio->bi_end_io = NULL;
1924 clear_bit(BIO_UPTODATE, &bio->bi_flags);
1925 if (conf->mirrors[d].rdev == NULL ||
1926 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
1927 continue;
1928 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1929 atomic_inc(&r10_bio->remaining);
1930 bio->bi_next = biolist;
1931 biolist = bio;
1932 bio->bi_private = r10_bio;
1933 bio->bi_end_io = end_sync_read;
1934 bio->bi_rw = READ;
1935 bio->bi_sector = r10_bio->devs[i].addr +
1936 conf->mirrors[d].rdev->data_offset;
1937 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1938 count++;
1939 }
1940
1941 if (count < 2) {
1942 for (i=0; i<conf->copies; i++) {
1943 int d = r10_bio->devs[i].devnum;
1944 if (r10_bio->devs[i].bio->bi_end_io)
1945 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1946 }
1947 put_buf(r10_bio);
1948 biolist = NULL;
1949 goto giveup;
1950 }
1951 }
1952
1953 for (bio = biolist; bio ; bio=bio->bi_next) {
1954
1955 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
1956 if (bio->bi_end_io)
1957 bio->bi_flags |= 1 << BIO_UPTODATE;
1958 bio->bi_vcnt = 0;
1959 bio->bi_idx = 0;
1960 bio->bi_phys_segments = 0;
1961 bio->bi_size = 0;
1962 }
1963
1964 nr_sectors = 0;
1965 if (sector_nr + max_sync < max_sector)
1966 max_sector = sector_nr + max_sync;
1967 do {
1968 struct page *page;
1969 int len = PAGE_SIZE;
1970 disk = 0;
1971 if (sector_nr + (len>>9) > max_sector)
1972 len = (max_sector - sector_nr) << 9;
1973 if (len == 0)
1974 break;
1975 for (bio= biolist ; bio ; bio=bio->bi_next) {
1976 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
1977 if (bio_add_page(bio, page, len, 0) == 0) {
1978
1979 struct bio *bio2;
1980 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
1981 for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) {
1982
1983 bio2->bi_vcnt--;
1984 bio2->bi_size -= len;
1985 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
1986 }
1987 goto bio_full;
1988 }
1989 disk = i;
1990 }
1991 nr_sectors += len>>9;
1992 sector_nr += len>>9;
1993 } while (biolist->bi_vcnt < RESYNC_PAGES);
1994 bio_full:
1995 r10_bio->sectors = nr_sectors;
1996
1997 while (biolist) {
1998 bio = biolist;
1999 biolist = biolist->bi_next;
2000
2001 bio->bi_next = NULL;
2002 r10_bio = bio->bi_private;
2003 r10_bio->sectors = nr_sectors;
2004
2005 if (bio->bi_end_io == end_sync_read) {
2006 md_sync_acct(bio->bi_bdev, nr_sectors);
2007 generic_make_request(bio);
2008 }
2009 }
2010
2011 if (sectors_skipped)
2012
2013
2014
2015 md_done_sync(mddev, sectors_skipped, 1);
2016
2017 return sectors_skipped + nr_sectors;
2018 giveup:
2019
2020
2021
2022 if (sector_nr + max_sync < max_sector)
2023 max_sector = sector_nr + max_sync;
2024
2025 sectors_skipped += (max_sector - sector_nr);
2026 chunks_skipped ++;
2027 sector_nr = max_sector;
2028 goto skipped;
2029}
2030
2031static sector_t
2032raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
2033{
2034 sector_t size;
2035 conf_t *conf = mddev->private;
2036
2037 if (!raid_disks)
2038 raid_disks = mddev->raid_disks;
2039 if (!sectors)
2040 sectors = mddev->dev_sectors;
2041
2042 size = sectors >> conf->chunk_shift;
2043 sector_div(size, conf->far_copies);
2044 size = size * raid_disks;
2045 sector_div(size, conf->near_copies);
2046
2047 return size << conf->chunk_shift;
2048}
2049
2050static int run(mddev_t *mddev)
2051{
2052 conf_t *conf;
2053 int i, disk_idx, chunk_size;
2054 mirror_info_t *disk;
2055 mdk_rdev_t *rdev;
2056 int nc, fc, fo;
2057 sector_t stride, size;
2058
2059 if (mddev->chunk_sectors < (PAGE_SIZE >> 9) ||
2060 !is_power_of_2(mddev->chunk_sectors)) {
2061 printk(KERN_ERR "md/raid10: chunk size must be "
2062 "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE);
2063 return -EINVAL;
2064 }
2065
2066 nc = mddev->layout & 255;
2067 fc = (mddev->layout >> 8) & 255;
2068 fo = mddev->layout & (1<<16);
2069 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
2070 (mddev->layout >> 17)) {
2071 printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n",
2072 mdname(mddev), mddev->layout);
2073 goto out;
2074 }
2075
2076
2077
2078
2079
2080 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
2081 mddev->private = conf;
2082 if (!conf) {
2083 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
2084 mdname(mddev));
2085 goto out;
2086 }
2087 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
2088 GFP_KERNEL);
2089 if (!conf->mirrors) {
2090 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
2091 mdname(mddev));
2092 goto out_free_conf;
2093 }
2094
2095 conf->tmppage = alloc_page(GFP_KERNEL);
2096 if (!conf->tmppage)
2097 goto out_free_conf;
2098
2099 conf->raid_disks = mddev->raid_disks;
2100 conf->near_copies = nc;
2101 conf->far_copies = fc;
2102 conf->copies = nc*fc;
2103 conf->far_offset = fo;
2104 conf->chunk_mask = mddev->chunk_sectors - 1;
2105 conf->chunk_shift = ffz(~mddev->chunk_sectors);
2106 size = mddev->dev_sectors >> conf->chunk_shift;
2107 sector_div(size, fc);
2108 size = size * conf->raid_disks;
2109 sector_div(size, nc);
2110
2111
2112 stride = size * conf->copies;
2113
2114
2115
2116
2117 stride += conf->raid_disks - 1;
2118 sector_div(stride, conf->raid_disks);
2119 mddev->dev_sectors = stride << conf->chunk_shift;
2120
2121 if (fo)
2122 stride = 1;
2123 else
2124 sector_div(stride, fc);
2125 conf->stride = stride << conf->chunk_shift;
2126
2127 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
2128 r10bio_pool_free, conf);
2129 if (!conf->r10bio_pool) {
2130 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
2131 mdname(mddev));
2132 goto out_free_conf;
2133 }
2134
2135 conf->mddev = mddev;
2136 spin_lock_init(&conf->device_lock);
2137 mddev->queue->queue_lock = &conf->device_lock;
2138
2139 chunk_size = mddev->chunk_sectors << 9;
2140 blk_queue_io_min(mddev->queue, chunk_size);
2141 if (conf->raid_disks % conf->near_copies)
2142 blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks);
2143 else
2144 blk_queue_io_opt(mddev->queue, chunk_size *
2145 (conf->raid_disks / conf->near_copies));
2146
2147 list_for_each_entry(rdev, &mddev->disks, same_set) {
2148 disk_idx = rdev->raid_disk;
2149 if (disk_idx >= mddev->raid_disks
2150 || disk_idx < 0)
2151 continue;
2152 disk = conf->mirrors + disk_idx;
2153
2154 disk->rdev = rdev;
2155 disk_stack_limits(mddev->gendisk, rdev->bdev,
2156 rdev->data_offset << 9);
2157
2158
2159
2160
2161 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
2162 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
2163 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
2164
2165 disk->head_position = 0;
2166 }
2167 INIT_LIST_HEAD(&conf->retry_list);
2168
2169 spin_lock_init(&conf->resync_lock);
2170 init_waitqueue_head(&conf->wait_barrier);
2171
2172
2173 if (!enough(conf)) {
2174 printk(KERN_ERR "raid10: not enough operational mirrors for %s\n",
2175 mdname(mddev));
2176 goto out_free_conf;
2177 }
2178
2179 mddev->degraded = 0;
2180 for (i = 0; i < conf->raid_disks; i++) {
2181
2182 disk = conf->mirrors + i;
2183
2184 if (!disk->rdev ||
2185 !test_bit(In_sync, &disk->rdev->flags)) {
2186 disk->head_position = 0;
2187 mddev->degraded++;
2188 if (disk->rdev)
2189 conf->fullsync = 1;
2190 }
2191 }
2192
2193
2194 mddev->thread = md_register_thread(raid10d, mddev, NULL);
2195 if (!mddev->thread) {
2196 printk(KERN_ERR
2197 "raid10: couldn't allocate thread for %s\n",
2198 mdname(mddev));
2199 goto out_free_conf;
2200 }
2201
2202 if (mddev->recovery_cp != MaxSector)
2203 printk(KERN_NOTICE "raid10: %s is not clean"
2204 " -- starting background reconstruction\n",
2205 mdname(mddev));
2206 printk(KERN_INFO
2207 "raid10: raid set %s active with %d out of %d devices\n",
2208 mdname(mddev), mddev->raid_disks - mddev->degraded,
2209 mddev->raid_disks);
2210
2211
2212
2213 md_set_array_sectors(mddev, raid10_size(mddev, 0, 0));
2214 mddev->resync_max_sectors = raid10_size(mddev, 0, 0);
2215
2216 mddev->queue->unplug_fn = raid10_unplug;
2217 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
2218 mddev->queue->backing_dev_info.congested_data = mddev;
2219
2220
2221
2222
2223
2224 {
2225 int stripe = conf->raid_disks *
2226 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
2227 stripe /= conf->near_copies;
2228 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
2229 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
2230 }
2231
2232 if (conf->near_copies < mddev->raid_disks)
2233 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
2234 md_integrity_register(mddev);
2235 return 0;
2236
2237out_free_conf:
2238 if (conf->r10bio_pool)
2239 mempool_destroy(conf->r10bio_pool);
2240 safe_put_page(conf->tmppage);
2241 kfree(conf->mirrors);
2242 kfree(conf);
2243 mddev->private = NULL;
2244out:
2245 return -EIO;
2246}
2247
2248static int stop(mddev_t *mddev)
2249{
2250 conf_t *conf = mddev->private;
2251
2252 raise_barrier(conf, 0);
2253 lower_barrier(conf);
2254
2255 md_unregister_thread(mddev->thread);
2256 mddev->thread = NULL;
2257 blk_sync_queue(mddev->queue);
2258 if (conf->r10bio_pool)
2259 mempool_destroy(conf->r10bio_pool);
2260 kfree(conf->mirrors);
2261 kfree(conf);
2262 mddev->private = NULL;
2263 return 0;
2264}
2265
2266static void raid10_quiesce(mddev_t *mddev, int state)
2267{
2268 conf_t *conf = mddev->private;
2269
2270 switch(state) {
2271 case 1:
2272 raise_barrier(conf, 0);
2273 break;
2274 case 0:
2275 lower_barrier(conf);
2276 break;
2277 }
2278 if (mddev->thread) {
2279 if (mddev->bitmap)
2280 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
2281 else
2282 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
2283 md_wakeup_thread(mddev->thread);
2284 }
2285}
2286
2287static struct mdk_personality raid10_personality =
2288{
2289 .name = "raid10",
2290 .level = 10,
2291 .owner = THIS_MODULE,
2292 .make_request = make_request,
2293 .run = run,
2294 .stop = stop,
2295 .status = status,
2296 .error_handler = error,
2297 .hot_add_disk = raid10_add_disk,
2298 .hot_remove_disk= raid10_remove_disk,
2299 .spare_active = raid10_spare_active,
2300 .sync_request = sync_request,
2301 .quiesce = raid10_quiesce,
2302 .size = raid10_size,
2303};
2304
2305static int __init raid_init(void)
2306{
2307 return register_md_personality(&raid10_personality);
2308}
2309
2310static void raid_exit(void)
2311{
2312 unregister_md_personality(&raid10_personality);
2313}
2314
2315module_init(raid_init);
2316module_exit(raid_exit);
2317MODULE_LICENSE("GPL");
2318MODULE_ALIAS("md-personality-9");
2319MODULE_ALIAS("md-raid10");
2320MODULE_ALIAS("md-level-10");
2321