1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21#include "dm-bio-list.h"
22#include <linux/delay.h>
23#include <linux/raid/raid10.h>
24#include <linux/raid/bitmap.h>
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54#define NR_RAID10_BIOS 256
55
56static void unplug_slaves(mddev_t *mddev);
57
58static void allow_barrier(conf_t *conf);
59static void lower_barrier(conf_t *conf);
60
61static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
62{
63 conf_t *conf = data;
64 r10bio_t *r10_bio;
65 int size = offsetof(struct r10bio_s, devs[conf->copies]);
66
67
68 r10_bio = kzalloc(size, gfp_flags);
69 if (!r10_bio)
70 unplug_slaves(conf->mddev);
71
72 return r10_bio;
73}
74
/*
 * Release callback matching r10bio_pool_alloc(): frees the r10bio with
 * kfree().  @data is not used.
 */
static void r10bio_pool_free(void *r10_bio, void *data)
{
	kfree(r10_bio);
}
79
80
/* Size in bytes of one resync block (64 KiB). */
#define	RESYNC_BLOCK_SIZE	(64 * 1024)
/* Number of pages needed to hold one resync block, rounded up. */
#define	RESYNC_PAGES		((RESYNC_BLOCK_SIZE + PAGE_SIZE - 1) / PAGE_SIZE)

/* 1 MiB; not referenced in the visible part of the file. */
#define	RESYNC_WINDOW		(1024 * 1024)

/*
 * Upper bound on conf->barrier that raise_barrier() waits for:
 * 32 MiB expressed in resync blocks.
 */
#define	RESYNC_DEPTH		(32 * 1024 * 1024 / RESYNC_BLOCK_SIZE)
87
88
89
90
91
92
93
94
95static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
96{
97 conf_t *conf = data;
98 struct page *page;
99 r10bio_t *r10_bio;
100 struct bio *bio;
101 int i, j;
102 int nalloc;
103
104 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
105 if (!r10_bio) {
106 unplug_slaves(conf->mddev);
107 return NULL;
108 }
109
110 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
111 nalloc = conf->copies;
112 else
113 nalloc = 2;
114
115
116
117
118 for (j = nalloc ; j-- ; ) {
119 bio = bio_alloc(gfp_flags, RESYNC_PAGES);
120 if (!bio)
121 goto out_free_bio;
122 r10_bio->devs[j].bio = bio;
123 }
124
125
126
127
128 for (j = 0 ; j < nalloc; j++) {
129 bio = r10_bio->devs[j].bio;
130 for (i = 0; i < RESYNC_PAGES; i++) {
131 page = alloc_page(gfp_flags);
132 if (unlikely(!page))
133 goto out_free_pages;
134
135 bio->bi_io_vec[i].bv_page = page;
136 }
137 }
138
139 return r10_bio;
140
141out_free_pages:
142 for ( ; i > 0 ; i--)
143 safe_put_page(bio->bi_io_vec[i-1].bv_page);
144 while (j--)
145 for (i = 0; i < RESYNC_PAGES ; i++)
146 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
147 j = -1;
148out_free_bio:
149 while ( ++j < nalloc )
150 bio_put(r10_bio->devs[j].bio);
151 r10bio_pool_free(r10_bio, conf);
152 return NULL;
153}
154
155static void r10buf_pool_free(void *__r10_bio, void *data)
156{
157 int i;
158 conf_t *conf = data;
159 r10bio_t *r10bio = __r10_bio;
160 int j;
161
162 for (j=0; j < conf->copies; j++) {
163 struct bio *bio = r10bio->devs[j].bio;
164 if (bio) {
165 for (i = 0; i < RESYNC_PAGES; i++) {
166 safe_put_page(bio->bi_io_vec[i].bv_page);
167 bio->bi_io_vec[i].bv_page = NULL;
168 }
169 bio_put(bio);
170 }
171 }
172 r10bio_pool_free(r10bio, conf);
173}
174
175static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
176{
177 int i;
178
179 for (i = 0; i < conf->copies; i++) {
180 struct bio **bio = & r10_bio->devs[i].bio;
181 if (*bio && *bio != IO_BLOCKED)
182 bio_put(*bio);
183 *bio = NULL;
184 }
185}
186
187static void free_r10bio(r10bio_t *r10_bio)
188{
189 conf_t *conf = mddev_to_conf(r10_bio->mddev);
190
191
192
193
194
195 allow_barrier(conf);
196
197 put_all_bios(conf, r10_bio);
198 mempool_free(r10_bio, conf->r10bio_pool);
199}
200
201static void put_buf(r10bio_t *r10_bio)
202{
203 conf_t *conf = mddev_to_conf(r10_bio->mddev);
204
205 mempool_free(r10_bio, conf->r10buf_pool);
206
207 lower_barrier(conf);
208}
209
210static void reschedule_retry(r10bio_t *r10_bio)
211{
212 unsigned long flags;
213 mddev_t *mddev = r10_bio->mddev;
214 conf_t *conf = mddev_to_conf(mddev);
215
216 spin_lock_irqsave(&conf->device_lock, flags);
217 list_add(&r10_bio->retry_list, &conf->retry_list);
218 conf->nr_queued ++;
219 spin_unlock_irqrestore(&conf->device_lock, flags);
220
221
222 wake_up(&conf->wait_barrier);
223
224 md_wakeup_thread(mddev->thread);
225}
226
227
228
229
230
231
232static void raid_end_bio_io(r10bio_t *r10_bio)
233{
234 struct bio *bio = r10_bio->master_bio;
235
236 bio_endio(bio,
237 test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
238 free_r10bio(r10_bio);
239}
240
241
242
243
244static inline void update_head_pos(int slot, r10bio_t *r10_bio)
245{
246 conf_t *conf = mddev_to_conf(r10_bio->mddev);
247
248 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
249 r10_bio->devs[slot].addr + (r10_bio->sectors);
250}
251
252static void raid10_end_read_request(struct bio *bio, int error)
253{
254 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
255 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
256 int slot, dev;
257 conf_t *conf = mddev_to_conf(r10_bio->mddev);
258
259
260 slot = r10_bio->read_slot;
261 dev = r10_bio->devs[slot].devnum;
262
263
264
265 update_head_pos(slot, r10_bio);
266
267 if (uptodate) {
268
269
270
271
272
273
274
275
276
277 set_bit(R10BIO_Uptodate, &r10_bio->state);
278 raid_end_bio_io(r10_bio);
279 } else {
280
281
282
283 char b[BDEVNAME_SIZE];
284 if (printk_ratelimit())
285 printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n",
286 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
287 reschedule_retry(r10_bio);
288 }
289
290 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
291}
292
293static void raid10_end_write_request(struct bio *bio, int error)
294{
295 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
296 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
297 int slot, dev;
298 conf_t *conf = mddev_to_conf(r10_bio->mddev);
299
300 for (slot = 0; slot < conf->copies; slot++)
301 if (r10_bio->devs[slot].bio == bio)
302 break;
303 dev = r10_bio->devs[slot].devnum;
304
305
306
307
308 if (!uptodate) {
309 md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
310
311 set_bit(R10BIO_Degraded, &r10_bio->state);
312 } else
313
314
315
316
317
318
319
320
321
322 set_bit(R10BIO_Uptodate, &r10_bio->state);
323
324 update_head_pos(slot, r10_bio);
325
326
327
328
329
330
331 if (atomic_dec_and_test(&r10_bio->remaining)) {
332
333 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
334 r10_bio->sectors,
335 !test_bit(R10BIO_Degraded, &r10_bio->state),
336 0);
337 md_write_end(r10_bio->mddev);
338 raid_end_bio_io(r10_bio);
339 }
340
341 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
342}
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
371{
372 int n,f;
373 sector_t sector;
374 sector_t chunk;
375 sector_t stripe;
376 int dev;
377
378 int slot = 0;
379
380
381 chunk = r10bio->sector >> conf->chunk_shift;
382 sector = r10bio->sector & conf->chunk_mask;
383
384 chunk *= conf->near_copies;
385 stripe = chunk;
386 dev = sector_div(stripe, conf->raid_disks);
387 if (conf->far_offset)
388 stripe *= conf->far_copies;
389
390 sector += stripe << conf->chunk_shift;
391
392
393 for (n=0; n < conf->near_copies; n++) {
394 int d = dev;
395 sector_t s = sector;
396 r10bio->devs[slot].addr = sector;
397 r10bio->devs[slot].devnum = d;
398 slot++;
399
400 for (f = 1; f < conf->far_copies; f++) {
401 d += conf->near_copies;
402 if (d >= conf->raid_disks)
403 d -= conf->raid_disks;
404 s += conf->stride;
405 r10bio->devs[slot].devnum = d;
406 r10bio->devs[slot].addr = s;
407 slot++;
408 }
409 dev++;
410 if (dev >= conf->raid_disks) {
411 dev = 0;
412 sector += (conf->chunk_mask + 1);
413 }
414 }
415 BUG_ON(slot != conf->copies);
416}
417
418static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
419{
420 sector_t offset, chunk, vchunk;
421
422 offset = sector & conf->chunk_mask;
423 if (conf->far_offset) {
424 int fc;
425 chunk = sector >> conf->chunk_shift;
426 fc = sector_div(chunk, conf->far_copies);
427 dev -= fc * conf->near_copies;
428 if (dev < 0)
429 dev += conf->raid_disks;
430 } else {
431 while (sector >= conf->stride) {
432 sector -= conf->stride;
433 if (dev < conf->near_copies)
434 dev += conf->raid_disks - conf->near_copies;
435 else
436 dev -= conf->near_copies;
437 }
438 chunk = sector >> conf->chunk_shift;
439 }
440 vchunk = chunk * conf->raid_disks + dev;
441 sector_div(vchunk, conf->near_copies);
442 return (vchunk << conf->chunk_shift) + offset;
443}
444
445
446
447
448
449
450
451
452
453
454
455static int raid10_mergeable_bvec(struct request_queue *q,
456 struct bvec_merge_data *bvm,
457 struct bio_vec *biovec)
458{
459 mddev_t *mddev = q->queuedata;
460 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
461 int max;
462 unsigned int chunk_sectors = mddev->chunk_size >> 9;
463 unsigned int bio_sectors = bvm->bi_size >> 9;
464
465 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
466 if (max < 0) max = 0;
467 if (max <= biovec->bv_len && bio_sectors == 0)
468 return biovec->bv_len;
469 else
470 return max;
471}
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492static int read_balance(conf_t *conf, r10bio_t *r10_bio)
493{
494 const unsigned long this_sector = r10_bio->sector;
495 int disk, slot, nslot;
496 const int sectors = r10_bio->sectors;
497 sector_t new_distance, current_distance;
498 mdk_rdev_t *rdev;
499
500 raid10_find_phys(conf, r10_bio);
501 rcu_read_lock();
502
503
504
505
506
507
508 if (conf->mddev->recovery_cp < MaxSector
509 && (this_sector + sectors >= conf->next_resync)) {
510
511 slot = 0;
512 disk = r10_bio->devs[slot].devnum;
513
514 while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
515 r10_bio->devs[slot].bio == IO_BLOCKED ||
516 !test_bit(In_sync, &rdev->flags)) {
517 slot++;
518 if (slot == conf->copies) {
519 slot = 0;
520 disk = -1;
521 break;
522 }
523 disk = r10_bio->devs[slot].devnum;
524 }
525 goto rb_out;
526 }
527
528
529
530 slot = 0;
531 disk = r10_bio->devs[slot].devnum;
532 while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
533 r10_bio->devs[slot].bio == IO_BLOCKED ||
534 !test_bit(In_sync, &rdev->flags)) {
535 slot ++;
536 if (slot == conf->copies) {
537 disk = -1;
538 goto rb_out;
539 }
540 disk = r10_bio->devs[slot].devnum;
541 }
542
543
544 current_distance = abs(r10_bio->devs[slot].addr -
545 conf->mirrors[disk].head_position);
546
547
548
549
550 for (nslot = slot; nslot < conf->copies; nslot++) {
551 int ndisk = r10_bio->devs[nslot].devnum;
552
553
554 if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
555 r10_bio->devs[nslot].bio == IO_BLOCKED ||
556 !test_bit(In_sync, &rdev->flags))
557 continue;
558
559
560
561
562
563 if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
564 disk = ndisk;
565 slot = nslot;
566 break;
567 }
568
569
570 if (conf->far_copies > 1)
571 new_distance = r10_bio->devs[nslot].addr;
572 else
573 new_distance = abs(r10_bio->devs[nslot].addr -
574 conf->mirrors[ndisk].head_position);
575 if (new_distance < current_distance) {
576 current_distance = new_distance;
577 disk = ndisk;
578 slot = nslot;
579 }
580 }
581
582rb_out:
583 r10_bio->read_slot = slot;
584
585
586 if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
587 atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
588 else
589 disk = -1;
590 rcu_read_unlock();
591
592 return disk;
593}
594
595static void unplug_slaves(mddev_t *mddev)
596{
597 conf_t *conf = mddev_to_conf(mddev);
598 int i;
599
600 rcu_read_lock();
601 for (i=0; i<mddev->raid_disks; i++) {
602 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
603 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
604 struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
605
606 atomic_inc(&rdev->nr_pending);
607 rcu_read_unlock();
608
609 blk_unplug(r_queue);
610
611 rdev_dec_pending(rdev, mddev);
612 rcu_read_lock();
613 }
614 }
615 rcu_read_unlock();
616}
617
618static void raid10_unplug(struct request_queue *q)
619{
620 mddev_t *mddev = q->queuedata;
621
622 unplug_slaves(q->queuedata);
623 md_wakeup_thread(mddev->thread);
624}
625
626static int raid10_congested(void *data, int bits)
627{
628 mddev_t *mddev = data;
629 conf_t *conf = mddev_to_conf(mddev);
630 int i, ret = 0;
631
632 rcu_read_lock();
633 for (i = 0; i < mddev->raid_disks && ret == 0; i++) {
634 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
635 if (rdev && !test_bit(Faulty, &rdev->flags)) {
636 struct request_queue *q = bdev_get_queue(rdev->bdev);
637
638 ret |= bdi_congested(&q->backing_dev_info, bits);
639 }
640 }
641 rcu_read_unlock();
642 return ret;
643}
644
645static int flush_pending_writes(conf_t *conf)
646{
647
648
649
650
651 int rv = 0;
652
653 spin_lock_irq(&conf->device_lock);
654
655 if (conf->pending_bio_list.head) {
656 struct bio *bio;
657 bio = bio_list_get(&conf->pending_bio_list);
658 blk_remove_plug(conf->mddev->queue);
659 spin_unlock_irq(&conf->device_lock);
660
661
662 bitmap_unplug(conf->mddev->bitmap);
663
664 while (bio) {
665 struct bio *next = bio->bi_next;
666 bio->bi_next = NULL;
667 generic_make_request(bio);
668 bio = next;
669 }
670 rv = 1;
671 } else
672 spin_unlock_irq(&conf->device_lock);
673 return rv;
674}
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697static void raise_barrier(conf_t *conf, int force)
698{
699 BUG_ON(force && !conf->barrier);
700 spin_lock_irq(&conf->resync_lock);
701
702
703 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
704 conf->resync_lock,
705 raid10_unplug(conf->mddev->queue));
706
707
708 conf->barrier++;
709
710
711 wait_event_lock_irq(conf->wait_barrier,
712 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
713 conf->resync_lock,
714 raid10_unplug(conf->mddev->queue));
715
716 spin_unlock_irq(&conf->resync_lock);
717}
718
719static void lower_barrier(conf_t *conf)
720{
721 unsigned long flags;
722 spin_lock_irqsave(&conf->resync_lock, flags);
723 conf->barrier--;
724 spin_unlock_irqrestore(&conf->resync_lock, flags);
725 wake_up(&conf->wait_barrier);
726}
727
728static void wait_barrier(conf_t *conf)
729{
730 spin_lock_irq(&conf->resync_lock);
731 if (conf->barrier) {
732 conf->nr_waiting++;
733 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
734 conf->resync_lock,
735 raid10_unplug(conf->mddev->queue));
736 conf->nr_waiting--;
737 }
738 conf->nr_pending++;
739 spin_unlock_irq(&conf->resync_lock);
740}
741
742static void allow_barrier(conf_t *conf)
743{
744 unsigned long flags;
745 spin_lock_irqsave(&conf->resync_lock, flags);
746 conf->nr_pending--;
747 spin_unlock_irqrestore(&conf->resync_lock, flags);
748 wake_up(&conf->wait_barrier);
749}
750
751static void freeze_array(conf_t *conf)
752{
753
754
755
756
757
758
759
760
761
762
763
764
765 spin_lock_irq(&conf->resync_lock);
766 conf->barrier++;
767 conf->nr_waiting++;
768 wait_event_lock_irq(conf->wait_barrier,
769 conf->nr_pending == conf->nr_queued+1,
770 conf->resync_lock,
771 ({ flush_pending_writes(conf);
772 raid10_unplug(conf->mddev->queue); }));
773 spin_unlock_irq(&conf->resync_lock);
774}
775
776static void unfreeze_array(conf_t *conf)
777{
778
779 spin_lock_irq(&conf->resync_lock);
780 conf->barrier--;
781 conf->nr_waiting--;
782 wake_up(&conf->wait_barrier);
783 spin_unlock_irq(&conf->resync_lock);
784}
785
786static int make_request(struct request_queue *q, struct bio * bio)
787{
788 mddev_t *mddev = q->queuedata;
789 conf_t *conf = mddev_to_conf(mddev);
790 mirror_info_t *mirror;
791 r10bio_t *r10_bio;
792 struct bio *read_bio;
793 int cpu;
794 int i;
795 int chunk_sects = conf->chunk_mask + 1;
796 const int rw = bio_data_dir(bio);
797 const int do_sync = bio_sync(bio);
798 struct bio_list bl;
799 unsigned long flags;
800 mdk_rdev_t *blocked_rdev;
801
802 if (unlikely(bio_barrier(bio))) {
803 bio_endio(bio, -EOPNOTSUPP);
804 return 0;
805 }
806
807
808
809
810 if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
811 > chunk_sects &&
812 conf->near_copies < conf->raid_disks)) {
813 struct bio_pair *bp;
814
815 if (bio->bi_vcnt != 1 ||
816 bio->bi_idx != 0)
817 goto bad_map;
818
819
820
821 bp = bio_split(bio,
822 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
823 if (make_request(q, &bp->bio1))
824 generic_make_request(&bp->bio1);
825 if (make_request(q, &bp->bio2))
826 generic_make_request(&bp->bio2);
827
828 bio_pair_release(bp);
829 return 0;
830 bad_map:
831 printk("raid10_make_request bug: can't convert block across chunks"
832 " or bigger than %dk %llu %d\n", chunk_sects/2,
833 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
834
835 bio_io_error(bio);
836 return 0;
837 }
838
839 md_write_start(mddev, bio);
840
841
842
843
844
845
846 wait_barrier(conf);
847
848 cpu = part_stat_lock();
849 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
850 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
851 bio_sectors(bio));
852 part_stat_unlock();
853
854 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
855
856 r10_bio->master_bio = bio;
857 r10_bio->sectors = bio->bi_size >> 9;
858
859 r10_bio->mddev = mddev;
860 r10_bio->sector = bio->bi_sector;
861 r10_bio->state = 0;
862
863 if (rw == READ) {
864
865
866
867 int disk = read_balance(conf, r10_bio);
868 int slot = r10_bio->read_slot;
869 if (disk < 0) {
870 raid_end_bio_io(r10_bio);
871 return 0;
872 }
873 mirror = conf->mirrors + disk;
874
875 read_bio = bio_clone(bio, GFP_NOIO);
876
877 r10_bio->devs[slot].bio = read_bio;
878
879 read_bio->bi_sector = r10_bio->devs[slot].addr +
880 mirror->rdev->data_offset;
881 read_bio->bi_bdev = mirror->rdev->bdev;
882 read_bio->bi_end_io = raid10_end_read_request;
883 read_bio->bi_rw = READ | do_sync;
884 read_bio->bi_private = r10_bio;
885
886 generic_make_request(read_bio);
887 return 0;
888 }
889
890
891
892
893
894
895
896
897 raid10_find_phys(conf, r10_bio);
898 retry_write:
899 blocked_rdev = NULL;
900 rcu_read_lock();
901 for (i = 0; i < conf->copies; i++) {
902 int d = r10_bio->devs[i].devnum;
903 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
904 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
905 atomic_inc(&rdev->nr_pending);
906 blocked_rdev = rdev;
907 break;
908 }
909 if (rdev && !test_bit(Faulty, &rdev->flags)) {
910 atomic_inc(&rdev->nr_pending);
911 r10_bio->devs[i].bio = bio;
912 } else {
913 r10_bio->devs[i].bio = NULL;
914 set_bit(R10BIO_Degraded, &r10_bio->state);
915 }
916 }
917 rcu_read_unlock();
918
919 if (unlikely(blocked_rdev)) {
920
921 int j;
922 int d;
923
924 for (j = 0; j < i; j++)
925 if (r10_bio->devs[j].bio) {
926 d = r10_bio->devs[j].devnum;
927 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
928 }
929 allow_barrier(conf);
930 md_wait_for_blocked_rdev(blocked_rdev, mddev);
931 wait_barrier(conf);
932 goto retry_write;
933 }
934
935 atomic_set(&r10_bio->remaining, 0);
936
937 bio_list_init(&bl);
938 for (i = 0; i < conf->copies; i++) {
939 struct bio *mbio;
940 int d = r10_bio->devs[i].devnum;
941 if (!r10_bio->devs[i].bio)
942 continue;
943
944 mbio = bio_clone(bio, GFP_NOIO);
945 r10_bio->devs[i].bio = mbio;
946
947 mbio->bi_sector = r10_bio->devs[i].addr+
948 conf->mirrors[d].rdev->data_offset;
949 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
950 mbio->bi_end_io = raid10_end_write_request;
951 mbio->bi_rw = WRITE | do_sync;
952 mbio->bi_private = r10_bio;
953
954 atomic_inc(&r10_bio->remaining);
955 bio_list_add(&bl, mbio);
956 }
957
958 if (unlikely(!atomic_read(&r10_bio->remaining))) {
959
960 md_write_end(mddev);
961 raid_end_bio_io(r10_bio);
962 return 0;
963 }
964
965 bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
966 spin_lock_irqsave(&conf->device_lock, flags);
967 bio_list_merge(&conf->pending_bio_list, &bl);
968 blk_plug_device(mddev->queue);
969 spin_unlock_irqrestore(&conf->device_lock, flags);
970
971
972 wake_up(&conf->wait_barrier);
973
974 if (do_sync)
975 md_wakeup_thread(mddev->thread);
976
977 return 0;
978}
979
980static void status(struct seq_file *seq, mddev_t *mddev)
981{
982 conf_t *conf = mddev_to_conf(mddev);
983 int i;
984
985 if (conf->near_copies < conf->raid_disks)
986 seq_printf(seq, " %dK chunks", mddev->chunk_size/1024);
987 if (conf->near_copies > 1)
988 seq_printf(seq, " %d near-copies", conf->near_copies);
989 if (conf->far_copies > 1) {
990 if (conf->far_offset)
991 seq_printf(seq, " %d offset-copies", conf->far_copies);
992 else
993 seq_printf(seq, " %d far-copies", conf->far_copies);
994 }
995 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
996 conf->raid_disks - mddev->degraded);
997 for (i = 0; i < conf->raid_disks; i++)
998 seq_printf(seq, "%s",
999 conf->mirrors[i].rdev &&
1000 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
1001 seq_printf(seq, "]");
1002}
1003
1004static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1005{
1006 char b[BDEVNAME_SIZE];
1007 conf_t *conf = mddev_to_conf(mddev);
1008
1009
1010
1011
1012
1013
1014
1015 if (test_bit(In_sync, &rdev->flags)
1016 && conf->raid_disks-mddev->degraded == 1)
1017
1018
1019
1020
1021
1022
1023
1024 return;
1025 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1026 unsigned long flags;
1027 spin_lock_irqsave(&conf->device_lock, flags);
1028 mddev->degraded++;
1029 spin_unlock_irqrestore(&conf->device_lock, flags);
1030
1031
1032
1033 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1034 }
1035 set_bit(Faulty, &rdev->flags);
1036 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1037 printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n"
1038 "raid10: Operation continuing on %d devices.\n",
1039 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
1040}
1041
1042static void print_conf(conf_t *conf)
1043{
1044 int i;
1045 mirror_info_t *tmp;
1046
1047 printk("RAID10 conf printout:\n");
1048 if (!conf) {
1049 printk("(!conf)\n");
1050 return;
1051 }
1052 printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1053 conf->raid_disks);
1054
1055 for (i = 0; i < conf->raid_disks; i++) {
1056 char b[BDEVNAME_SIZE];
1057 tmp = conf->mirrors + i;
1058 if (tmp->rdev)
1059 printk(" disk %d, wo:%d, o:%d, dev:%s\n",
1060 i, !test_bit(In_sync, &tmp->rdev->flags),
1061 !test_bit(Faulty, &tmp->rdev->flags),
1062 bdevname(tmp->rdev->bdev,b));
1063 }
1064}
1065
1066static void close_sync(conf_t *conf)
1067{
1068 wait_barrier(conf);
1069 allow_barrier(conf);
1070
1071 mempool_destroy(conf->r10buf_pool);
1072 conf->r10buf_pool = NULL;
1073}
1074
1075
1076
1077
1078static int enough(conf_t *conf)
1079{
1080 int first = 0;
1081
1082 do {
1083 int n = conf->copies;
1084 int cnt = 0;
1085 while (n--) {
1086 if (conf->mirrors[first].rdev)
1087 cnt++;
1088 first = (first+1) % conf->raid_disks;
1089 }
1090 if (cnt == 0)
1091 return 0;
1092 } while (first != 0);
1093 return 1;
1094}
1095
1096static int raid10_spare_active(mddev_t *mddev)
1097{
1098 int i;
1099 conf_t *conf = mddev->private;
1100 mirror_info_t *tmp;
1101
1102
1103
1104
1105
1106 for (i = 0; i < conf->raid_disks; i++) {
1107 tmp = conf->mirrors + i;
1108 if (tmp->rdev
1109 && !test_bit(Faulty, &tmp->rdev->flags)
1110 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1111 unsigned long flags;
1112 spin_lock_irqsave(&conf->device_lock, flags);
1113 mddev->degraded--;
1114 spin_unlock_irqrestore(&conf->device_lock, flags);
1115 }
1116 }
1117
1118 print_conf(conf);
1119 return 0;
1120}
1121
1122
1123static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1124{
1125 conf_t *conf = mddev->private;
1126 int err = -EEXIST;
1127 int mirror;
1128 mirror_info_t *p;
1129 int first = 0;
1130 int last = mddev->raid_disks - 1;
1131
1132 if (mddev->recovery_cp < MaxSector)
1133
1134
1135
1136 return -EBUSY;
1137 if (!enough(conf))
1138 return -EINVAL;
1139
1140 if (rdev->raid_disk >= 0)
1141 first = last = rdev->raid_disk;
1142
1143 if (rdev->saved_raid_disk >= 0 &&
1144 rdev->saved_raid_disk >= first &&
1145 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1146 mirror = rdev->saved_raid_disk;
1147 else
1148 mirror = first;
1149 for ( ; mirror <= last ; mirror++)
1150 if ( !(p=conf->mirrors+mirror)->rdev) {
1151
1152 blk_queue_stack_limits(mddev->queue,
1153 rdev->bdev->bd_disk->queue);
1154
1155
1156
1157
1158 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
1159 mddev->queue->max_sectors > (PAGE_SIZE>>9))
1160 mddev->queue->max_sectors = (PAGE_SIZE>>9);
1161
1162 p->head_position = 0;
1163 rdev->raid_disk = mirror;
1164 err = 0;
1165 if (rdev->saved_raid_disk != mirror)
1166 conf->fullsync = 1;
1167 rcu_assign_pointer(p->rdev, rdev);
1168 break;
1169 }
1170
1171 print_conf(conf);
1172 return err;
1173}
1174
1175static int raid10_remove_disk(mddev_t *mddev, int number)
1176{
1177 conf_t *conf = mddev->private;
1178 int err = 0;
1179 mdk_rdev_t *rdev;
1180 mirror_info_t *p = conf->mirrors+ number;
1181
1182 print_conf(conf);
1183 rdev = p->rdev;
1184 if (rdev) {
1185 if (test_bit(In_sync, &rdev->flags) ||
1186 atomic_read(&rdev->nr_pending)) {
1187 err = -EBUSY;
1188 goto abort;
1189 }
1190
1191
1192
1193 if (!test_bit(Faulty, &rdev->flags) &&
1194 enough(conf)) {
1195 err = -EBUSY;
1196 goto abort;
1197 }
1198 p->rdev = NULL;
1199 synchronize_rcu();
1200 if (atomic_read(&rdev->nr_pending)) {
1201
1202 err = -EBUSY;
1203 p->rdev = rdev;
1204 }
1205 }
1206abort:
1207
1208 print_conf(conf);
1209 return err;
1210}
1211
1212
1213static void end_sync_read(struct bio *bio, int error)
1214{
1215 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
1216 conf_t *conf = mddev_to_conf(r10_bio->mddev);
1217 int i,d;
1218
1219 for (i=0; i<conf->copies; i++)
1220 if (r10_bio->devs[i].bio == bio)
1221 break;
1222 BUG_ON(i == conf->copies);
1223 update_head_pos(i, r10_bio);
1224 d = r10_bio->devs[i].devnum;
1225
1226 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1227 set_bit(R10BIO_Uptodate, &r10_bio->state);
1228 else {
1229 atomic_add(r10_bio->sectors,
1230 &conf->mirrors[d].rdev->corrected_errors);
1231 if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
1232 md_error(r10_bio->mddev,
1233 conf->mirrors[d].rdev);
1234 }
1235
1236
1237
1238
1239 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1240 atomic_dec_and_test(&r10_bio->remaining)) {
1241
1242
1243
1244 reschedule_retry(r10_bio);
1245 }
1246 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1247}
1248
1249static void end_sync_write(struct bio *bio, int error)
1250{
1251 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1252 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
1253 mddev_t *mddev = r10_bio->mddev;
1254 conf_t *conf = mddev_to_conf(mddev);
1255 int i,d;
1256
1257 for (i = 0; i < conf->copies; i++)
1258 if (r10_bio->devs[i].bio == bio)
1259 break;
1260 d = r10_bio->devs[i].devnum;
1261
1262 if (!uptodate)
1263 md_error(mddev, conf->mirrors[d].rdev);
1264
1265 update_head_pos(i, r10_bio);
1266
1267 while (atomic_dec_and_test(&r10_bio->remaining)) {
1268 if (r10_bio->master_bio == NULL) {
1269
1270 md_done_sync(mddev, r10_bio->sectors, 1);
1271 put_buf(r10_bio);
1272 break;
1273 } else {
1274 r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
1275 put_buf(r10_bio);
1276 r10_bio = r10_bio2;
1277 }
1278 }
1279 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1280}
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1299{
1300 conf_t *conf = mddev_to_conf(mddev);
1301 int i, first;
1302 struct bio *tbio, *fbio;
1303
1304 atomic_set(&r10_bio->remaining, 1);
1305
1306
1307 for (i=0; i<conf->copies; i++)
1308 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
1309 break;
1310
1311 if (i == conf->copies)
1312 goto done;
1313
1314 first = i;
1315 fbio = r10_bio->devs[i].bio;
1316
1317
1318 for (i=0 ; i < conf->copies ; i++) {
1319 int j, d;
1320 int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
1321
1322 tbio = r10_bio->devs[i].bio;
1323
1324 if (tbio->bi_end_io != end_sync_read)
1325 continue;
1326 if (i == first)
1327 continue;
1328 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
1329
1330
1331
1332
1333 for (j = 0; j < vcnt; j++)
1334 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
1335 page_address(tbio->bi_io_vec[j].bv_page),
1336 PAGE_SIZE))
1337 break;
1338 if (j == vcnt)
1339 continue;
1340 mddev->resync_mismatches += r10_bio->sectors;
1341 }
1342 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1343
1344 continue;
1345
1346
1347
1348
1349 tbio->bi_vcnt = vcnt;
1350 tbio->bi_size = r10_bio->sectors << 9;
1351 tbio->bi_idx = 0;
1352 tbio->bi_phys_segments = 0;
1353 tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1354 tbio->bi_flags |= 1 << BIO_UPTODATE;
1355 tbio->bi_next = NULL;
1356 tbio->bi_rw = WRITE;
1357 tbio->bi_private = r10_bio;
1358 tbio->bi_sector = r10_bio->devs[i].addr;
1359
1360 for (j=0; j < vcnt ; j++) {
1361 tbio->bi_io_vec[j].bv_offset = 0;
1362 tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
1363
1364 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
1365 page_address(fbio->bi_io_vec[j].bv_page),
1366 PAGE_SIZE);
1367 }
1368 tbio->bi_end_io = end_sync_write;
1369
1370 d = r10_bio->devs[i].devnum;
1371 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1372 atomic_inc(&r10_bio->remaining);
1373 md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
1374
1375 tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
1376 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1377 generic_make_request(tbio);
1378 }
1379
1380done:
1381 if (atomic_dec_and_test(&r10_bio->remaining)) {
1382 md_done_sync(mddev, r10_bio->sectors, 1);
1383 put_buf(r10_bio);
1384 }
1385}
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1399{
1400 conf_t *conf = mddev_to_conf(mddev);
1401 int i, d;
1402 struct bio *bio, *wbio;
1403
1404
1405
1406
1407
1408 bio = r10_bio->devs[0].bio;
1409 wbio = r10_bio->devs[1].bio;
1410 for (i=0; i < wbio->bi_vcnt; i++) {
1411 struct page *p = bio->bi_io_vec[i].bv_page;
1412 bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
1413 wbio->bi_io_vec[i].bv_page = p;
1414 }
1415 d = r10_bio->devs[1].devnum;
1416
1417 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1418 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
1419 if (test_bit(R10BIO_Uptodate, &r10_bio->state))
1420 generic_make_request(wbio);
1421 else
1422 bio_endio(wbio, -EIO);
1423}
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1435{
1436 int sect = 0;
1437 int sectors = r10_bio->sectors;
1438 mdk_rdev_t*rdev;
1439 while(sectors) {
1440 int s = sectors;
1441 int sl = r10_bio->read_slot;
1442 int success = 0;
1443 int start;
1444
1445 if (s > (PAGE_SIZE>>9))
1446 s = PAGE_SIZE >> 9;
1447
1448 rcu_read_lock();
1449 do {
1450 int d = r10_bio->devs[sl].devnum;
1451 rdev = rcu_dereference(conf->mirrors[d].rdev);
1452 if (rdev &&
1453 test_bit(In_sync, &rdev->flags)) {
1454 atomic_inc(&rdev->nr_pending);
1455 rcu_read_unlock();
1456 success = sync_page_io(rdev->bdev,
1457 r10_bio->devs[sl].addr +
1458 sect + rdev->data_offset,
1459 s<<9,
1460 conf->tmppage, READ);
1461 rdev_dec_pending(rdev, mddev);
1462 rcu_read_lock();
1463 if (success)
1464 break;
1465 }
1466 sl++;
1467 if (sl == conf->copies)
1468 sl = 0;
1469 } while (!success && sl != r10_bio->read_slot);
1470 rcu_read_unlock();
1471
1472 if (!success) {
1473
1474 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
1475 md_error(mddev, conf->mirrors[dn].rdev);
1476 break;
1477 }
1478
1479 start = sl;
1480
1481 rcu_read_lock();
1482 while (sl != r10_bio->read_slot) {
1483 int d;
1484 if (sl==0)
1485 sl = conf->copies;
1486 sl--;
1487 d = r10_bio->devs[sl].devnum;
1488 rdev = rcu_dereference(conf->mirrors[d].rdev);
1489 if (rdev &&
1490 test_bit(In_sync, &rdev->flags)) {
1491 atomic_inc(&rdev->nr_pending);
1492 rcu_read_unlock();
1493 atomic_add(s, &rdev->corrected_errors);
1494 if (sync_page_io(rdev->bdev,
1495 r10_bio->devs[sl].addr +
1496 sect + rdev->data_offset,
1497 s<<9, conf->tmppage, WRITE)
1498 == 0)
1499
1500 md_error(mddev, rdev);
1501 rdev_dec_pending(rdev, mddev);
1502 rcu_read_lock();
1503 }
1504 }
1505 sl = start;
1506 while (sl != r10_bio->read_slot) {
1507 int d;
1508 if (sl==0)
1509 sl = conf->copies;
1510 sl--;
1511 d = r10_bio->devs[sl].devnum;
1512 rdev = rcu_dereference(conf->mirrors[d].rdev);
1513 if (rdev &&
1514 test_bit(In_sync, &rdev->flags)) {
1515 char b[BDEVNAME_SIZE];
1516 atomic_inc(&rdev->nr_pending);
1517 rcu_read_unlock();
1518 if (sync_page_io(rdev->bdev,
1519 r10_bio->devs[sl].addr +
1520 sect + rdev->data_offset,
1521 s<<9, conf->tmppage, READ) == 0)
1522
1523 md_error(mddev, rdev);
1524 else
1525 printk(KERN_INFO
1526 "raid10:%s: read error corrected"
1527 " (%d sectors at %llu on %s)\n",
1528 mdname(mddev), s,
1529 (unsigned long long)(sect+
1530 rdev->data_offset),
1531 bdevname(rdev->bdev, b));
1532
1533 rdev_dec_pending(rdev, mddev);
1534 rcu_read_lock();
1535 }
1536 }
1537 rcu_read_unlock();
1538
1539 sectors -= s;
1540 sect += s;
1541 }
1542}
1543
1544static void raid10d(mddev_t *mddev)
1545{
1546 r10bio_t *r10_bio;
1547 struct bio *bio;
1548 unsigned long flags;
1549 conf_t *conf = mddev_to_conf(mddev);
1550 struct list_head *head = &conf->retry_list;
1551 int unplug=0;
1552 mdk_rdev_t *rdev;
1553
1554 md_check_recovery(mddev);
1555
1556 for (;;) {
1557 char b[BDEVNAME_SIZE];
1558
1559 unplug += flush_pending_writes(conf);
1560
1561 spin_lock_irqsave(&conf->device_lock, flags);
1562 if (list_empty(head)) {
1563 spin_unlock_irqrestore(&conf->device_lock, flags);
1564 break;
1565 }
1566 r10_bio = list_entry(head->prev, r10bio_t, retry_list);
1567 list_del(head->prev);
1568 conf->nr_queued--;
1569 spin_unlock_irqrestore(&conf->device_lock, flags);
1570
1571 mddev = r10_bio->mddev;
1572 conf = mddev_to_conf(mddev);
1573 if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
1574 sync_request_write(mddev, r10_bio);
1575 unplug = 1;
1576 } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
1577 recovery_request_write(mddev, r10_bio);
1578 unplug = 1;
1579 } else {
1580 int mirror;
1581
1582
1583
1584
1585
1586
1587
1588
1589 if (mddev->ro == 0) {
1590 freeze_array(conf);
1591 fix_read_error(conf, mddev, r10_bio);
1592 unfreeze_array(conf);
1593 }
1594
1595 bio = r10_bio->devs[r10_bio->read_slot].bio;
1596 r10_bio->devs[r10_bio->read_slot].bio =
1597 mddev->ro ? IO_BLOCKED : NULL;
1598 mirror = read_balance(conf, r10_bio);
1599 if (mirror == -1) {
1600 printk(KERN_ALERT "raid10: %s: unrecoverable I/O"
1601 " read error for block %llu\n",
1602 bdevname(bio->bi_bdev,b),
1603 (unsigned long long)r10_bio->sector);
1604 raid_end_bio_io(r10_bio);
1605 bio_put(bio);
1606 } else {
1607 const int do_sync = bio_sync(r10_bio->master_bio);
1608 bio_put(bio);
1609 rdev = conf->mirrors[mirror].rdev;
1610 if (printk_ratelimit())
1611 printk(KERN_ERR "raid10: %s: redirecting sector %llu to"
1612 " another mirror\n",
1613 bdevname(rdev->bdev,b),
1614 (unsigned long long)r10_bio->sector);
1615 bio = bio_clone(r10_bio->master_bio, GFP_NOIO);
1616 r10_bio->devs[r10_bio->read_slot].bio = bio;
1617 bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
1618 + rdev->data_offset;
1619 bio->bi_bdev = rdev->bdev;
1620 bio->bi_rw = READ | do_sync;
1621 bio->bi_private = r10_bio;
1622 bio->bi_end_io = raid10_end_read_request;
1623 unplug = 1;
1624 generic_make_request(bio);
1625 }
1626 }
1627 }
1628 if (unplug)
1629 unplug_slaves(mddev);
1630}
1631
1632
1633static int init_resync(conf_t *conf)
1634{
1635 int buffs;
1636
1637 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
1638 BUG_ON(conf->r10buf_pool);
1639 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
1640 if (!conf->r10buf_pool)
1641 return -ENOMEM;
1642 conf->next_resync = 0;
1643 return 0;
1644}
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1679{
1680 conf_t *conf = mddev_to_conf(mddev);
1681 r10bio_t *r10_bio;
1682 struct bio *biolist = NULL, *bio;
1683 sector_t max_sector, nr_sectors;
1684 int disk;
1685 int i;
1686 int max_sync;
1687 int sync_blocks;
1688
1689 sector_t sectors_skipped = 0;
1690 int chunks_skipped = 0;
1691
1692 if (!conf->r10buf_pool)
1693 if (init_resync(conf))
1694 return 0;
1695
1696 skipped:
1697 max_sector = mddev->size << 1;
1698 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1699 max_sector = mddev->resync_max_sectors;
1700 if (sector_nr >= max_sector) {
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710 if (mddev->curr_resync < max_sector) {
1711 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1712 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1713 &sync_blocks, 1);
1714 else for (i=0; i<conf->raid_disks; i++) {
1715 sector_t sect =
1716 raid10_find_virt(conf, mddev->curr_resync, i);
1717 bitmap_end_sync(mddev->bitmap, sect,
1718 &sync_blocks, 1);
1719 }
1720 } else
1721 conf->fullsync = 0;
1722
1723 bitmap_close_sync(mddev->bitmap);
1724 close_sync(conf);
1725 *skipped = 1;
1726 return sectors_skipped;
1727 }
1728 if (chunks_skipped >= conf->raid_disks) {
1729
1730
1731
1732 *skipped = 1;
1733 return (max_sector - sector_nr) + sectors_skipped;
1734 }
1735
1736 if (max_sector > mddev->resync_max)
1737 max_sector = mddev->resync_max;
1738
1739
1740
1741
1742 if (conf->near_copies < conf->raid_disks &&
1743 max_sector > (sector_nr | conf->chunk_mask))
1744 max_sector = (sector_nr | conf->chunk_mask) + 1;
1745
1746
1747
1748
1749 if (!go_faster && conf->nr_waiting)
1750 msleep_interruptible(1000);
1751
1752 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
1770 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1771
1772 int i, j, k;
1773 r10_bio = NULL;
1774
1775 for (i=0 ; i<conf->raid_disks; i++)
1776 if (conf->mirrors[i].rdev &&
1777 !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
1778 int still_degraded = 0;
1779
1780 r10bio_t *rb2 = r10_bio;
1781 sector_t sect = raid10_find_virt(conf, sector_nr, i);
1782 int must_sync;
1783
1784
1785
1786 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1787 &sync_blocks, 1);
1788 if (sync_blocks < max_sync)
1789 max_sync = sync_blocks;
1790 if (!must_sync &&
1791 !conf->fullsync) {
1792
1793
1794
1795 chunks_skipped = -1;
1796 continue;
1797 }
1798
1799 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1800 raise_barrier(conf, rb2 != NULL);
1801 atomic_set(&r10_bio->remaining, 0);
1802
1803 r10_bio->master_bio = (struct bio*)rb2;
1804 if (rb2)
1805 atomic_inc(&rb2->remaining);
1806 r10_bio->mddev = mddev;
1807 set_bit(R10BIO_IsRecover, &r10_bio->state);
1808 r10_bio->sector = sect;
1809
1810 raid10_find_phys(conf, r10_bio);
1811
1812
1813
1814 for (j=0; j<conf->copies;j++) {
1815 int d = r10_bio->devs[j].devnum;
1816 if (conf->mirrors[d].rdev == NULL ||
1817 test_bit(Faulty, &conf->mirrors[d].rdev->flags)) {
1818 still_degraded = 1;
1819 break;
1820 }
1821 }
1822 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1823 &sync_blocks, still_degraded);
1824
1825 for (j=0; j<conf->copies;j++) {
1826 int d = r10_bio->devs[j].devnum;
1827 if (conf->mirrors[d].rdev &&
1828 test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
1829
1830 bio = r10_bio->devs[0].bio;
1831 bio->bi_next = biolist;
1832 biolist = bio;
1833 bio->bi_private = r10_bio;
1834 bio->bi_end_io = end_sync_read;
1835 bio->bi_rw = READ;
1836 bio->bi_sector = r10_bio->devs[j].addr +
1837 conf->mirrors[d].rdev->data_offset;
1838 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1839 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1840 atomic_inc(&r10_bio->remaining);
1841
1842
1843 for (k=0; k<conf->copies; k++)
1844 if (r10_bio->devs[k].devnum == i)
1845 break;
1846 BUG_ON(k == conf->copies);
1847 bio = r10_bio->devs[1].bio;
1848 bio->bi_next = biolist;
1849 biolist = bio;
1850 bio->bi_private = r10_bio;
1851 bio->bi_end_io = end_sync_write;
1852 bio->bi_rw = WRITE;
1853 bio->bi_sector = r10_bio->devs[k].addr +
1854 conf->mirrors[i].rdev->data_offset;
1855 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1856
1857 r10_bio->devs[0].devnum = d;
1858 r10_bio->devs[1].devnum = i;
1859
1860 break;
1861 }
1862 }
1863 if (j == conf->copies) {
1864
1865 put_buf(r10_bio);
1866 if (rb2)
1867 atomic_dec(&rb2->remaining);
1868 r10_bio = rb2;
1869 if (!test_and_set_bit(MD_RECOVERY_INTR,
1870 &mddev->recovery))
1871 printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n",
1872 mdname(mddev));
1873 break;
1874 }
1875 }
1876 if (biolist == NULL) {
1877 while (r10_bio) {
1878 r10bio_t *rb2 = r10_bio;
1879 r10_bio = (r10bio_t*) rb2->master_bio;
1880 rb2->master_bio = NULL;
1881 put_buf(rb2);
1882 }
1883 goto giveup;
1884 }
1885 } else {
1886
1887 int count = 0;
1888
1889 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
1890 &sync_blocks, mddev->degraded) &&
1891 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1892
1893 *skipped = 1;
1894 return sync_blocks + sectors_skipped;
1895 }
1896 if (sync_blocks < max_sync)
1897 max_sync = sync_blocks;
1898 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1899
1900 r10_bio->mddev = mddev;
1901 atomic_set(&r10_bio->remaining, 0);
1902 raise_barrier(conf, 0);
1903 conf->next_resync = sector_nr;
1904
1905 r10_bio->master_bio = NULL;
1906 r10_bio->sector = sector_nr;
1907 set_bit(R10BIO_IsSync, &r10_bio->state);
1908 raid10_find_phys(conf, r10_bio);
1909 r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1;
1910
1911 for (i=0; i<conf->copies; i++) {
1912 int d = r10_bio->devs[i].devnum;
1913 bio = r10_bio->devs[i].bio;
1914 bio->bi_end_io = NULL;
1915 clear_bit(BIO_UPTODATE, &bio->bi_flags);
1916 if (conf->mirrors[d].rdev == NULL ||
1917 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
1918 continue;
1919 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1920 atomic_inc(&r10_bio->remaining);
1921 bio->bi_next = biolist;
1922 biolist = bio;
1923 bio->bi_private = r10_bio;
1924 bio->bi_end_io = end_sync_read;
1925 bio->bi_rw = READ;
1926 bio->bi_sector = r10_bio->devs[i].addr +
1927 conf->mirrors[d].rdev->data_offset;
1928 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1929 count++;
1930 }
1931
1932 if (count < 2) {
1933 for (i=0; i<conf->copies; i++) {
1934 int d = r10_bio->devs[i].devnum;
1935 if (r10_bio->devs[i].bio->bi_end_io)
1936 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1937 }
1938 put_buf(r10_bio);
1939 biolist = NULL;
1940 goto giveup;
1941 }
1942 }
1943
1944 for (bio = biolist; bio ; bio=bio->bi_next) {
1945
1946 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
1947 if (bio->bi_end_io)
1948 bio->bi_flags |= 1 << BIO_UPTODATE;
1949 bio->bi_vcnt = 0;
1950 bio->bi_idx = 0;
1951 bio->bi_phys_segments = 0;
1952 bio->bi_size = 0;
1953 }
1954
1955 nr_sectors = 0;
1956 if (sector_nr + max_sync < max_sector)
1957 max_sector = sector_nr + max_sync;
1958 do {
1959 struct page *page;
1960 int len = PAGE_SIZE;
1961 disk = 0;
1962 if (sector_nr + (len>>9) > max_sector)
1963 len = (max_sector - sector_nr) << 9;
1964 if (len == 0)
1965 break;
1966 for (bio= biolist ; bio ; bio=bio->bi_next) {
1967 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
1968 if (bio_add_page(bio, page, len, 0) == 0) {
1969
1970 struct bio *bio2;
1971 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
1972 for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) {
1973
1974 bio2->bi_vcnt--;
1975 bio2->bi_size -= len;
1976 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
1977 }
1978 goto bio_full;
1979 }
1980 disk = i;
1981 }
1982 nr_sectors += len>>9;
1983 sector_nr += len>>9;
1984 } while (biolist->bi_vcnt < RESYNC_PAGES);
1985 bio_full:
1986 r10_bio->sectors = nr_sectors;
1987
1988 while (biolist) {
1989 bio = biolist;
1990 biolist = biolist->bi_next;
1991
1992 bio->bi_next = NULL;
1993 r10_bio = bio->bi_private;
1994 r10_bio->sectors = nr_sectors;
1995
1996 if (bio->bi_end_io == end_sync_read) {
1997 md_sync_acct(bio->bi_bdev, nr_sectors);
1998 generic_make_request(bio);
1999 }
2000 }
2001
2002 if (sectors_skipped)
2003
2004
2005
2006 md_done_sync(mddev, sectors_skipped, 1);
2007
2008 return sectors_skipped + nr_sectors;
2009 giveup:
2010
2011
2012
2013 {
2014 sector_t sec = max_sector - sector_nr;
2015 sectors_skipped += sec;
2016 chunks_skipped ++;
2017 sector_nr = max_sector;
2018 goto skipped;
2019 }
2020}
2021
2022static int run(mddev_t *mddev)
2023{
2024 conf_t *conf;
2025 int i, disk_idx;
2026 mirror_info_t *disk;
2027 mdk_rdev_t *rdev;
2028 struct list_head *tmp;
2029 int nc, fc, fo;
2030 sector_t stride, size;
2031
2032 if (mddev->chunk_size < PAGE_SIZE) {
2033 printk(KERN_ERR "md/raid10: chunk size must be "
2034 "at least PAGE_SIZE(%ld).\n", PAGE_SIZE);
2035 return -EINVAL;
2036 }
2037
2038 nc = mddev->layout & 255;
2039 fc = (mddev->layout >> 8) & 255;
2040 fo = mddev->layout & (1<<16);
2041 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
2042 (mddev->layout >> 17)) {
2043 printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n",
2044 mdname(mddev), mddev->layout);
2045 goto out;
2046 }
2047
2048
2049
2050
2051
2052 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
2053 mddev->private = conf;
2054 if (!conf) {
2055 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
2056 mdname(mddev));
2057 goto out;
2058 }
2059 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
2060 GFP_KERNEL);
2061 if (!conf->mirrors) {
2062 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
2063 mdname(mddev));
2064 goto out_free_conf;
2065 }
2066
2067 conf->tmppage = alloc_page(GFP_KERNEL);
2068 if (!conf->tmppage)
2069 goto out_free_conf;
2070
2071 conf->mddev = mddev;
2072 conf->raid_disks = mddev->raid_disks;
2073 conf->near_copies = nc;
2074 conf->far_copies = fc;
2075 conf->copies = nc*fc;
2076 conf->far_offset = fo;
2077 conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
2078 conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
2079 size = mddev->size >> (conf->chunk_shift-1);
2080 sector_div(size, fc);
2081 size = size * conf->raid_disks;
2082 sector_div(size, nc);
2083
2084
2085 stride = size * conf->copies;
2086
2087
2088
2089
2090 stride += conf->raid_disks - 1;
2091 sector_div(stride, conf->raid_disks);
2092 mddev->size = stride << (conf->chunk_shift-1);
2093
2094 if (fo)
2095 stride = 1;
2096 else
2097 sector_div(stride, fc);
2098 conf->stride = stride << conf->chunk_shift;
2099
2100 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
2101 r10bio_pool_free, conf);
2102 if (!conf->r10bio_pool) {
2103 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
2104 mdname(mddev));
2105 goto out_free_conf;
2106 }
2107
2108 spin_lock_init(&conf->device_lock);
2109 mddev->queue->queue_lock = &conf->device_lock;
2110
2111 rdev_for_each(rdev, tmp, mddev) {
2112 disk_idx = rdev->raid_disk;
2113 if (disk_idx >= mddev->raid_disks
2114 || disk_idx < 0)
2115 continue;
2116 disk = conf->mirrors + disk_idx;
2117
2118 disk->rdev = rdev;
2119
2120 blk_queue_stack_limits(mddev->queue,
2121 rdev->bdev->bd_disk->queue);
2122
2123
2124
2125
2126 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
2127 mddev->queue->max_sectors > (PAGE_SIZE>>9))
2128 mddev->queue->max_sectors = (PAGE_SIZE>>9);
2129
2130 disk->head_position = 0;
2131 }
2132 INIT_LIST_HEAD(&conf->retry_list);
2133
2134 spin_lock_init(&conf->resync_lock);
2135 init_waitqueue_head(&conf->wait_barrier);
2136
2137
2138 if (!enough(conf)) {
2139 printk(KERN_ERR "raid10: not enough operational mirrors for %s\n",
2140 mdname(mddev));
2141 goto out_free_conf;
2142 }
2143
2144 mddev->degraded = 0;
2145 for (i = 0; i < conf->raid_disks; i++) {
2146
2147 disk = conf->mirrors + i;
2148
2149 if (!disk->rdev ||
2150 !test_bit(In_sync, &disk->rdev->flags)) {
2151 disk->head_position = 0;
2152 mddev->degraded++;
2153 if (disk->rdev)
2154 conf->fullsync = 1;
2155 }
2156 }
2157
2158
2159 mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10");
2160 if (!mddev->thread) {
2161 printk(KERN_ERR
2162 "raid10: couldn't allocate thread for %s\n",
2163 mdname(mddev));
2164 goto out_free_conf;
2165 }
2166
2167 printk(KERN_INFO
2168 "raid10: raid set %s active with %d out of %d devices\n",
2169 mdname(mddev), mddev->raid_disks - mddev->degraded,
2170 mddev->raid_disks);
2171
2172
2173
2174 mddev->array_sectors = size << conf->chunk_shift;
2175 mddev->resync_max_sectors = size << conf->chunk_shift;
2176
2177 mddev->queue->unplug_fn = raid10_unplug;
2178 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
2179 mddev->queue->backing_dev_info.congested_data = mddev;
2180
2181
2182
2183
2184
2185 {
2186 int stripe = conf->raid_disks * (mddev->chunk_size / PAGE_SIZE);
2187 stripe /= conf->near_copies;
2188 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
2189 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
2190 }
2191
2192 if (conf->near_copies < mddev->raid_disks)
2193 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
2194 return 0;
2195
2196out_free_conf:
2197 if (conf->r10bio_pool)
2198 mempool_destroy(conf->r10bio_pool);
2199 safe_put_page(conf->tmppage);
2200 kfree(conf->mirrors);
2201 kfree(conf);
2202 mddev->private = NULL;
2203out:
2204 return -EIO;
2205}
2206
2207static int stop(mddev_t *mddev)
2208{
2209 conf_t *conf = mddev_to_conf(mddev);
2210
2211 md_unregister_thread(mddev->thread);
2212 mddev->thread = NULL;
2213 blk_sync_queue(mddev->queue);
2214 if (conf->r10bio_pool)
2215 mempool_destroy(conf->r10bio_pool);
2216 kfree(conf->mirrors);
2217 kfree(conf);
2218 mddev->private = NULL;
2219 return 0;
2220}
2221
2222static void raid10_quiesce(mddev_t *mddev, int state)
2223{
2224 conf_t *conf = mddev_to_conf(mddev);
2225
2226 switch(state) {
2227 case 1:
2228 raise_barrier(conf, 0);
2229 break;
2230 case 0:
2231 lower_barrier(conf);
2232 break;
2233 }
2234 if (mddev->thread) {
2235 if (mddev->bitmap)
2236 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
2237 else
2238 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
2239 md_wakeup_thread(mddev->thread);
2240 }
2241}
2242
2243static struct mdk_personality raid10_personality =
2244{
2245 .name = "raid10",
2246 .level = 10,
2247 .owner = THIS_MODULE,
2248 .make_request = make_request,
2249 .run = run,
2250 .stop = stop,
2251 .status = status,
2252 .error_handler = error,
2253 .hot_add_disk = raid10_add_disk,
2254 .hot_remove_disk= raid10_remove_disk,
2255 .spare_active = raid10_spare_active,
2256 .sync_request = sync_request,
2257 .quiesce = raid10_quiesce,
2258};
2259
2260static int __init raid_init(void)
2261{
2262 return register_md_personality(&raid10_personality);
2263}
2264
2265static void raid_exit(void)
2266{
2267 unregister_md_personality(&raid10_personality);
2268}
2269
/* Hook the init/exit functions into module load and unload. */
module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
/*
 * Alternative names that resolve to this module.
 * NOTE(review): "md-personality-9" presumably encodes a legacy numeric
 * personality id - confirm against the md core.
 */
MODULE_ALIAS("md-personality-9");
MODULE_ALIAS("md-raid10");
MODULE_ALIAS("md-level-10");
2276