1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19#include <linux/config.h>
20#include <linux/module.h>
21#include <linux/slab.h>
22#include <linux/raid/raid5.h>
23#include <linux/bio.h>
24#include <asm/bitops.h>
25#include <asm/atomic.h>
26
27
28
29
30
/*
 * Stripe cache geometry.
 */
#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SECTORS		(STRIPE_SIZE >> 9)	/* STRIPE_SIZE in 512-byte units */
/* Compared against conf->preread_active_stripes to decide on thread wakeups. */
#define IO_THRESHOLD		1

/*
 * Stripe hash table: HASH_PAGES pages worth of bucket-head pointers.
 * NR_HASH is the bucket count and HASH_MASK the matching index mask.
 */
#define HASH_PAGES		1
#define HASH_PAGES_ORDER	0
#define NR_HASH			(HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
#define HASH_MASK		(NR_HASH - 1)

/* Bucket head for the stripe at 'sect'; expands to an lvalue so callers can take its address. */
#define stripe_hash(conf, sect)	((conf)->stripe_hashtbl[((sect) / STRIPE_SECTORS) & HASH_MASK])
40
41
42
43
/*
 * Debugging knobs.
 *
 * RAID5_PARANOIA: on SMP builds, CHECK_DEVLOCK() BUG()s unless the
 * device_lock of the 'conf' variable in the calling scope is held.
 * RAID5_DEBUG: turns PRINTK() into printk() and disables inlining.
 *
 * CHECK_DEVLOCK() is wrapped in do { } while (0) so that it always behaves
 * as a single statement and cannot capture an 'else' that follows it.
 */
#define RAID5_DEBUG	0
#define RAID5_PARANOIA	1
#if RAID5_PARANOIA && CONFIG_SMP
# define CHECK_DEVLOCK() do { if (!spin_is_locked(&conf->device_lock)) BUG(); } while (0)
#else
# define CHECK_DEVLOCK() do { } while (0)
#endif

#if RAID5_DEBUG
#define PRINTK(x...) printk(x)
#define inline
#define __inline__
#else
#define PRINTK(x...) do { } while (0)
#endif
59
60static void print_raid5_conf (raid5_conf_t *conf);
61
62static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
63{
64 if (atomic_dec_and_test(&sh->count)) {
65 if (!list_empty(&sh->lru))
66 BUG();
67 if (atomic_read(&conf->active_stripes)==0)
68 BUG();
69 if (test_bit(STRIPE_HANDLE, &sh->state)) {
70 if (test_bit(STRIPE_DELAYED, &sh->state))
71 list_add_tail(&sh->lru, &conf->delayed_list);
72 else
73 list_add_tail(&sh->lru, &conf->handle_list);
74 md_wakeup_thread(conf->thread);
75 } else {
76 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
77 atomic_dec(&conf->preread_active_stripes);
78 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
79 md_wakeup_thread(conf->thread);
80 }
81 list_add_tail(&sh->lru, &conf->inactive_list);
82 atomic_dec(&conf->active_stripes);
83 if (!conf->inactive_blocked ||
84 atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
85 wake_up(&conf->wait_for_stripe);
86 }
87 }
88}
89static void release_stripe(struct stripe_head *sh)
90{
91 raid5_conf_t *conf = sh->raid_conf;
92 unsigned long flags;
93
94 spin_lock_irqsave(&conf->device_lock, flags);
95 __release_stripe(conf, sh);
96 spin_unlock_irqrestore(&conf->device_lock, flags);
97}
98
99static void remove_hash(struct stripe_head *sh)
100{
101 PRINTK("remove_hash(), stripe %lu\n", sh->sector);
102
103 if (sh->hash_pprev) {
104 if (sh->hash_next)
105 sh->hash_next->hash_pprev = sh->hash_pprev;
106 *sh->hash_pprev = sh->hash_next;
107 sh->hash_pprev = NULL;
108 }
109}
110
111static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
112{
113 struct stripe_head **shp = &stripe_hash(conf, sh->sector);
114
115 PRINTK("insert_hash(), stripe %lu\n",sh->sector);
116
117 CHECK_DEVLOCK();
118 if ((sh->hash_next = *shp) != NULL)
119 (*shp)->hash_pprev = &sh->hash_next;
120 *shp = sh;
121 sh->hash_pprev = shp;
122}
123
124
125
126static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
127{
128 struct stripe_head *sh = NULL;
129 struct list_head *first;
130
131 CHECK_DEVLOCK();
132 if (list_empty(&conf->inactive_list))
133 goto out;
134 first = conf->inactive_list.next;
135 sh = list_entry(first, struct stripe_head, lru);
136 list_del_init(first);
137 remove_hash(sh);
138 atomic_inc(&conf->active_stripes);
139out:
140 return sh;
141}
142
143static void shrink_buffers(struct stripe_head *sh, int num)
144{
145 struct page *p;
146 int i;
147
148 for (i=0; i<num ; i++) {
149 p = sh->dev[i].page;
150 if (!p)
151 continue;
152 sh->dev[i].page = NULL;
153 page_cache_release(p);
154 }
155}
156
157static int grow_buffers(struct stripe_head *sh, int num)
158{
159 int i;
160
161 for (i=0; i<num; i++) {
162 struct page *page;
163
164 if (!(page = alloc_page(GFP_KERNEL))) {
165 return 1;
166 }
167 sh->dev[i].page = page;
168 }
169 return 0;
170}
171
172static void raid5_build_block (struct stripe_head *sh, int i);
173
174static inline void init_stripe(struct stripe_head *sh, unsigned long sector, int pd_idx)
175{
176 raid5_conf_t *conf = sh->raid_conf;
177 int disks = conf->raid_disks, i;
178
179 if (atomic_read(&sh->count) != 0)
180 BUG();
181 if (test_bit(STRIPE_HANDLE, &sh->state))
182 BUG();
183
184 CHECK_DEVLOCK();
185 PRINTK("init_stripe called, stripe %lu\n", sh->sector);
186
187 remove_hash(sh);
188
189 sh->sector = sector;
190 sh->pd_idx = pd_idx;
191 sh->state = 0;
192
193 for (i=disks; i--; ) {
194 struct r5dev *dev = &sh->dev[i];
195
196 if (dev->toread || dev->towrite || dev->written ||
197 test_bit(R5_LOCKED, &dev->flags)) {
198 printk("sector=%lx i=%d %p %p %p %d\n",
199 sh->sector, i, dev->toread,
200 dev->towrite, dev->written,
201 test_bit(R5_LOCKED, &dev->flags));
202 BUG();
203 }
204 dev->flags = 0;
205 raid5_build_block(sh, i);
206 }
207 insert_hash(conf, sh);
208}
209
210static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
211{
212 struct stripe_head *sh;
213
214 CHECK_DEVLOCK();
215 PRINTK("__find_stripe, sector %lu\n", sector);
216 for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
217 if (sh->sector == sector)
218 return sh;
219 PRINTK("__stripe %lu not in cache\n", sector);
220 return NULL;
221}
222
223static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector,
224 int pd_idx, int noblock)
225{
226 struct stripe_head *sh;
227
228 PRINTK("get_stripe, sector %lu\n", sector);
229
230 spin_lock_irq(&conf->device_lock);
231
232 do {
233 sh = __find_stripe(conf, sector);
234 if (!sh) {
235 if (!conf->inactive_blocked)
236 sh = get_free_stripe(conf);
237 if (noblock && sh == NULL)
238 break;
239 if (!sh) {
240 conf->inactive_blocked = 1;
241 wait_event_lock_irq(conf->wait_for_stripe,
242 !list_empty(&conf->inactive_list) &&
243 (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
244 || !conf->inactive_blocked),
245 conf->device_lock);
246 conf->inactive_blocked = 0;
247 } else
248 init_stripe(sh, sector, pd_idx);
249 } else {
250 if (atomic_read(&sh->count)) {
251 if (!list_empty(&sh->lru))
252 BUG();
253 } else {
254 if (!test_bit(STRIPE_HANDLE, &sh->state))
255 atomic_inc(&conf->active_stripes);
256 if (list_empty(&sh->lru))
257 BUG();
258 list_del_init(&sh->lru);
259 }
260 }
261 } while (sh == NULL);
262
263 if (sh)
264 atomic_inc(&sh->count);
265
266 spin_unlock_irq(&conf->device_lock);
267 return sh;
268}
269
270static int grow_stripes(raid5_conf_t *conf, int num)
271{
272 struct stripe_head *sh;
273 kmem_cache_t *sc;
274 int devs = conf->raid_disks;
275
276 sprintf(conf->cache_name, "md/raid5-%d", conf->mddev->__minor);
277
278 sc = kmem_cache_create(conf->cache_name,
279 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
280 0, 0, NULL, NULL);
281 if (!sc)
282 return 1;
283 conf->slab_cache = sc;
284 while (num--) {
285 sh = kmem_cache_alloc(sc, GFP_KERNEL);
286 if (!sh)
287 return 1;
288 memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev));
289 sh->raid_conf = conf;
290 sh->lock = SPIN_LOCK_UNLOCKED;
291
292 if (grow_buffers(sh, conf->raid_disks)) {
293 shrink_buffers(sh, conf->raid_disks);
294 kmem_cache_free(sc, sh);
295 return 1;
296 }
297
298 atomic_set(&sh->count, 1);
299 atomic_inc(&conf->active_stripes);
300 INIT_LIST_HEAD(&sh->lru);
301 release_stripe(sh);
302 }
303 return 0;
304}
305
306static void shrink_stripes(raid5_conf_t *conf)
307{
308 struct stripe_head *sh;
309
310 while (1) {
311 spin_lock_irq(&conf->device_lock);
312 sh = get_free_stripe(conf);
313 spin_unlock_irq(&conf->device_lock);
314 if (!sh)
315 break;
316 if (atomic_read(&sh->count))
317 BUG();
318 shrink_buffers(sh, conf->raid_disks);
319 kmem_cache_free(conf->slab_cache, sh);
320 atomic_dec(&conf->active_stripes);
321 }
322 kmem_cache_destroy(conf->slab_cache);
323 conf->slab_cache = NULL;
324}
325
326static void raid5_end_read_request (struct bio * bi)
327{
328 struct stripe_head *sh = bi->bi_private;
329 raid5_conf_t *conf = sh->raid_conf;
330 int disks = conf->raid_disks, i;
331 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
332
333 for (i=0 ; i<disks; i++)
334 if (bi == &sh->dev[i].req)
335 break;
336
337 PRINTK("end_read_request %lu/%d, count: %d, uptodate %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
338 if (i == disks) {
339 BUG();
340 return;
341 }
342
343 if (uptodate) {
344#if 0
345 struct bio *bio;
346 unsigned long flags;
347 spin_lock_irqsave(&conf->device_lock, flags);
348
349
350
351
352
353 buffer = sh->bh_read[i];
354 if (buffer &&
355 (!PageHighMem(buffer->b_page)
356 || buffer->b_page == bh->b_page )
357 ) {
358 sh->bh_read[i] = buffer->b_reqnext;
359 buffer->b_reqnext = NULL;
360 } else
361 buffer = NULL;
362 spin_unlock_irqrestore(&conf->device_lock, flags);
363 if (sh->bh_page[i]==bh->b_page)
364 set_buffer_uptodate(bh);
365 if (buffer) {
366 if (buffer->b_page != bh->b_page)
367 memcpy(buffer->b_data, bh->b_data, bh->b_size);
368 buffer->b_end_io(buffer, 1);
369 }
370#else
371 set_bit(R5_UPTODATE, &sh->dev[i].flags);
372#endif
373 } else {
374 md_error(conf->mddev, conf->disks[i].bdev);
375 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
376 }
377#if 0
378
379 if (sh->bh_page[i] != bh->b_page) {
380 bh->b_page = sh->bh_page[i];
381 bh->b_data = page_address(bh->b_page);
382 clear_buffer_uptodate(bh);
383 }
384#endif
385 clear_bit(R5_LOCKED, &sh->dev[i].flags);
386 set_bit(STRIPE_HANDLE, &sh->state);
387 release_stripe(sh);
388}
389
390static void raid5_end_write_request (struct bio *bi)
391{
392 struct stripe_head *sh = bi->bi_private;
393 raid5_conf_t *conf = sh->raid_conf;
394 int disks = conf->raid_disks, i;
395 unsigned long flags;
396 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
397
398 for (i=0 ; i<disks; i++)
399 if (bi == &sh->dev[i].req)
400 break;
401
402 PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
403 if (i == disks) {
404 BUG();
405 return;
406 }
407
408 spin_lock_irqsave(&conf->device_lock, flags);
409 if (!uptodate)
410 md_error(conf->mddev, conf->disks[i].bdev);
411
412 clear_bit(R5_LOCKED, &sh->dev[i].flags);
413 set_bit(STRIPE_HANDLE, &sh->state);
414 __release_stripe(conf, sh);
415 spin_unlock_irqrestore(&conf->device_lock, flags);
416}
417
418
419static unsigned long compute_blocknr(struct stripe_head *sh, int i);
420
421static void raid5_build_block (struct stripe_head *sh, int i)
422{
423 raid5_conf_t *conf = sh->raid_conf;
424 struct r5dev *dev = &sh->dev[i];
425
426 bio_init(&dev->req);
427 dev->req.bi_io_vec = &dev->vec;
428 dev->req.bi_vcnt++;
429 dev->vec.bv_page = dev->page;
430 dev->vec.bv_len = STRIPE_SIZE;
431 dev->vec.bv_offset = 0;
432
433 dev->req.bi_bdev = conf->disks[i].bdev;
434 dev->req.bi_sector = sh->sector;
435 dev->req.bi_private = sh;
436
437 dev->flags = 0;
438 if (i != sh->pd_idx)
439 dev->sector = compute_blocknr(sh, i);
440}
441
442static int error(mddev_t *mddev, struct block_device *bdev)
443{
444 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
445 struct disk_info *disk;
446 int i;
447
448 PRINTK("raid5: error called\n");
449
450 for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
451 if (disk->bdev != bdev)
452 continue;
453 if (disk->operational) {
454 disk->operational = 0;
455 mddev->sb_dirty = 1;
456 mddev->degraded++;
457 conf->working_disks--;
458 conf->failed_disks++;
459 printk (KERN_ALERT
460 "raid5: Disk failure on %s, disabling device."
461 " Operation continuing on %d devices\n",
462 bdev_partition_name(bdev), conf->working_disks);
463 }
464 return 0;
465 }
466
467
468
469 if (conf->spare) {
470 disk = conf->spare;
471 if (disk->bdev == bdev) {
472 printk (KERN_ALERT
473 "raid5: Disk failure on spare %s\n",
474 bdev_partition_name (bdev));
475 if (!conf->spare->operational) {
476
477 return -EIO;
478 }
479 disk->operational = 0;
480 disk->write_only = 0;
481 conf->spare = NULL;
482
483 mddev->sb_dirty = 1;
484
485 return 0;
486 }
487 }
488 MD_BUG();
489 return -EIO;
490}
491
492
493
494
495
496static unsigned long raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
497 unsigned int data_disks, unsigned int * dd_idx,
498 unsigned int * pd_idx, raid5_conf_t *conf)
499{
500 sector_t stripe;
501 unsigned long chunk_number;
502 unsigned int chunk_offset;
503 sector_t new_sector;
504 int sectors_per_chunk = conf->chunk_size >> 9;
505
506
507
508
509
510
511 chunk_number = r_sector / sectors_per_chunk;
512 chunk_offset = r_sector % sectors_per_chunk;
513
514
515
516
517 stripe = chunk_number / data_disks;
518
519
520
521
522 *dd_idx = chunk_number % data_disks;
523
524
525
526
527 if (conf->level == 4)
528 *pd_idx = data_disks;
529 else switch (conf->algorithm) {
530 case ALGORITHM_LEFT_ASYMMETRIC:
531 *pd_idx = data_disks - stripe % raid_disks;
532 if (*dd_idx >= *pd_idx)
533 (*dd_idx)++;
534 break;
535 case ALGORITHM_RIGHT_ASYMMETRIC:
536 *pd_idx = stripe % raid_disks;
537 if (*dd_idx >= *pd_idx)
538 (*dd_idx)++;
539 break;
540 case ALGORITHM_LEFT_SYMMETRIC:
541 *pd_idx = data_disks - stripe % raid_disks;
542 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
543 break;
544 case ALGORITHM_RIGHT_SYMMETRIC:
545 *pd_idx = stripe % raid_disks;
546 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
547 break;
548 default:
549 printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
550 }
551
552
553
554
555 new_sector = stripe * sectors_per_chunk + chunk_offset;
556 return new_sector;
557}
558
559
560static sector_t compute_blocknr(struct stripe_head *sh, int i)
561{
562 raid5_conf_t *conf = sh->raid_conf;
563 int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
564 sector_t new_sector = sh->sector, check;
565 int sectors_per_chunk = conf->chunk_size >> 9;
566 sector_t stripe = new_sector / sectors_per_chunk;
567 int chunk_offset = new_sector % sectors_per_chunk;
568 int chunk_number, dummy1, dummy2, dd_idx = i;
569 sector_t r_sector;
570
571 switch (conf->algorithm) {
572 case ALGORITHM_LEFT_ASYMMETRIC:
573 case ALGORITHM_RIGHT_ASYMMETRIC:
574 if (i > sh->pd_idx)
575 i--;
576 break;
577 case ALGORITHM_LEFT_SYMMETRIC:
578 case ALGORITHM_RIGHT_SYMMETRIC:
579 if (i < sh->pd_idx)
580 i += raid_disks;
581 i -= (sh->pd_idx + 1);
582 break;
583 default:
584 printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
585 }
586
587 chunk_number = stripe * data_disks + i;
588 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
589
590 check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
591 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
592 printk("compute_blocknr: map not correct\n");
593 return 0;
594 }
595 return r_sector;
596}
597
598
599
600
601
602
603
604
605
606
607static void copy_data(int frombio, struct bio *bio,
608 struct page *page,
609 sector_t sector)
610{
611 char *pa = page_address(page);
612 struct bio_vec *bvl;
613 int i;
614
615 for (;bio && bio->bi_sector < sector+STRIPE_SECTORS;
616 bio = bio->bi_next) {
617 int page_offset;
618 if (bio->bi_sector >= sector)
619 page_offset = (signed)(bio->bi_sector - sector) * 512;
620 else
621 page_offset = (signed)(sector - bio->bi_sector) * -512;
622 bio_for_each_segment(bvl, bio, i) {
623 int len = bio_iovec_idx(bio,i)->bv_len;
624 int clen;
625 int b_offset = 0;
626
627 if (page_offset < 0) {
628 b_offset = -page_offset;
629 page_offset += b_offset;
630 len -= b_offset;
631 }
632
633 if (len > 0 && page_offset + len > STRIPE_SIZE)
634 clen = STRIPE_SIZE - page_offset;
635 else clen = len;
636
637 if (clen > 0) {
638 char *ba = __bio_kmap(bio, i);
639 if (frombio)
640 memcpy(pa+page_offset, ba+b_offset, clen);
641 else
642 memcpy(ba+b_offset, pa+page_offset, clen);
643 __bio_kunmap(bio, i);
644 }
645 if (clen < len)
646 break;
647 page_offset += len;
648 }
649 }
650}
651
/*
 * Flush the pending xor batch once it is full.  Uses the caller's local
 * 'count' and 'ptr[]'; ptr[0] stays in place as the destination, so count
 * restarts at 1.
 */
#define check_xor()	do {						\
		if (count == MAX_XOR_BLOCKS) {			\
			xor_block(count, STRIPE_SIZE, ptr);	\
			count = 1;				\
		}						\
	} while(0)
658
659
660static void compute_block(struct stripe_head *sh, int dd_idx)
661{
662 raid5_conf_t *conf = sh->raid_conf;
663 int i, count, disks = conf->raid_disks;
664 void *ptr[MAX_XOR_BLOCKS], *p;
665
666 PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx);
667
668 ptr[0] = page_address(sh->dev[dd_idx].page);
669 memset(ptr[0], 0, STRIPE_SIZE);
670 count = 1;
671 for (i = disks ; i--; ) {
672 if (i == dd_idx)
673 continue;
674 p = page_address(sh->dev[i].page);
675 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
676 ptr[count++] = p;
677 else
678 printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
679
680 check_xor();
681 }
682 if (count != 1)
683 xor_block(count, STRIPE_SIZE, ptr);
684 set_bit(R5_UPTODATE, &sh->dev[i].flags);
685}
686
687static void compute_parity(struct stripe_head *sh, int method)
688{
689 raid5_conf_t *conf = sh->raid_conf;
690 int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
691 void *ptr[MAX_XOR_BLOCKS];
692 struct bio *chosen[MD_SB_DISKS];
693
694 PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method);
695 memset(chosen, 0, sizeof(chosen));
696
697 count = 1;
698 ptr[0] = page_address(sh->dev[pd_idx].page);
699 switch(method) {
700 case READ_MODIFY_WRITE:
701 if (!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags))
702 BUG();
703 for (i=disks ; i-- ;) {
704 if (i==pd_idx)
705 continue;
706 if (sh->dev[i].towrite &&
707 test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
708 ptr[count++] = page_address(sh->dev[i].page);
709 chosen[i] = sh->dev[i].towrite;
710 sh->dev[i].towrite = NULL;
711 if (sh->dev[i].written) BUG();
712 sh->dev[i].written = chosen[i];
713 check_xor();
714 }
715 }
716 break;
717 case RECONSTRUCT_WRITE:
718 memset(ptr[0], 0, STRIPE_SIZE);
719 for (i= disks; i-- ;)
720 if (i!=pd_idx && sh->dev[i].towrite) {
721 chosen[i] = sh->dev[i].towrite;
722 sh->dev[i].towrite = NULL;
723 if (sh->dev[i].written) BUG();
724 sh->dev[i].written = chosen[i];
725 }
726 break;
727 case CHECK_PARITY:
728 break;
729 }
730 if (count>1) {
731 xor_block(count, STRIPE_SIZE, ptr);
732 count = 1;
733 }
734
735 for (i = disks; i--;)
736 if (chosen[i]) {
737 sector_t sector = sh->dev[i].sector;
738 copy_data(1, chosen[i], sh->dev[i].page, sector);
739
740 set_bit(R5_LOCKED, &sh->dev[i].flags);
741 set_bit(R5_UPTODATE, &sh->dev[i].flags);
742 }
743
744 switch(method) {
745 case RECONSTRUCT_WRITE:
746 case CHECK_PARITY:
747 for (i=disks; i--;)
748 if (i != pd_idx) {
749 ptr[count++] = page_address(sh->dev[i].page);
750 check_xor();
751 }
752 break;
753 case READ_MODIFY_WRITE:
754 for (i = disks; i--;)
755 if (chosen[i]) {
756 ptr[count++] = page_address(sh->dev[i].page);
757 check_xor();
758 }
759 }
760 if (count != 1)
761 xor_block(count, STRIPE_SIZE, ptr);
762
763 if (method != CHECK_PARITY) {
764 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
765 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
766 } else
767 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
768}
769
770
771
772
773
774
775static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
776{
777 struct bio **bip;
778 raid5_conf_t *conf = sh->raid_conf;
779
780 PRINTK("adding bh b#%lu to stripe s#%lu\n", bi->bi_sector, sh->sector);
781
782
783 spin_lock(&sh->lock);
784 spin_lock_irq(&conf->device_lock);
785 if (forwrite)
786 bip = &sh->dev[dd_idx].towrite;
787 else
788 bip = &sh->dev[dd_idx].toread;
789 while (*bip && (*bip)->bi_sector < bi->bi_sector)
790 bip = & (*bip)->bi_next;
791
792 if (*bip && bi->bi_next && (*bip) != bi->bi_next)
793 BUG();
794 if (*bip)
795 bi->bi_next = *bip;
796 *bip = bi;
797 bi->bi_phys_segments ++;
798 spin_unlock_irq(&conf->device_lock);
799 spin_unlock(&sh->lock);
800
801 PRINTK("added bi b#%lu to stripe s#%lu, disk %d.\n", bi->bi_sector, sh->sector, dd_idx);
802
803 if (forwrite) {
804
805 sector_t sector = sh->dev[dd_idx].sector;
806 for (bi=sh->dev[dd_idx].towrite;
807 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
808 bi && bi->bi_sector <= sector;
809 bi = bi->bi_next) {
810 if (bi->bi_sector + (bi->bi_size>>9) >= sector)
811 sector = bi->bi_sector + (bi->bi_size>>9);
812 }
813 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
814 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
815 }
816}
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837static void handle_stripe(struct stripe_head *sh)
838{
839 raid5_conf_t *conf = sh->raid_conf;
840 int disks = conf->raid_disks;
841 struct bio *return_bi= NULL;
842 struct bio *bi;
843 int action[MD_SB_DISKS];
844 int i;
845 int syncing;
846 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
847 int failed_num=0;
848 struct r5dev *dev;
849
850 PRINTK("handling stripe %ld, cnt=%d, pd_idx=%d\n", sh->sector, atomic_read(&sh->count), sh->pd_idx);
851 memset(action, 0, sizeof(action));
852
853 spin_lock(&sh->lock);
854 clear_bit(STRIPE_HANDLE, &sh->state);
855 clear_bit(STRIPE_DELAYED, &sh->state);
856
857 syncing = test_bit(STRIPE_SYNCING, &sh->state);
858
859
860 for (i=disks; i--; ) {
861 dev = &sh->dev[i];
862 PRINTK("check %d: state 0x%lx read %p write %p written %p\n", i,
863 dev->flags, dev->toread, dev->towrite, dev->written);
864
865 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
866 struct bio *rbi, *rbi2;
867 PRINTK("Return read for disc %d\n", i);
868 spin_lock_irq(&conf->device_lock);
869 rbi = dev->toread;
870 dev->toread = NULL;
871 spin_unlock_irq(&conf->device_lock);
872 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
873 copy_data(0, rbi, dev->page, dev->sector);
874 rbi2 = rbi->bi_next;
875 spin_lock_irq(&conf->device_lock);
876 if (--rbi->bi_phys_segments == 0) {
877 rbi->bi_next = return_bi;
878 return_bi = rbi;
879 }
880 spin_unlock_irq(&conf->device_lock);
881 rbi = rbi2;
882 }
883 }
884
885
886 if (test_bit(R5_LOCKED, &dev->flags)) locked++;
887 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
888
889
890 if (dev->toread) to_read++;
891 if (dev->towrite) to_write++;
892 if (dev->written) written++;
893 if (!conf->disks[i].operational) {
894 failed++;
895 failed_num = i;
896 }
897 }
898 PRINTK("locked=%d uptodate=%d to_read=%d to_write=%d failed=%d failed_num=%d\n",
899 locked, uptodate, to_read, to_write, failed, failed_num);
900
901
902
903 if (failed > 1 && to_read+to_write) {
904 spin_lock_irq(&conf->device_lock);
905 for (i=disks; i--; ) {
906
907 bi = sh->dev[i].towrite;
908 sh->dev[i].towrite = NULL;
909 if (bi) to_write--;
910
911 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
912 struct bio *nextbi = bi->bi_next;
913 clear_bit(BIO_UPTODATE, &bi->bi_flags);
914 if (--bi->bi_phys_segments == 0) {
915 bi->bi_next = return_bi;
916 return_bi = bi;
917 }
918 bi = nextbi;
919 }
920
921 if (!conf->disks[i].operational) {
922 bi = sh->dev[i].toread;
923 sh->dev[i].toread = NULL;
924 if (bi) to_read--;
925 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
926 struct bio *nextbi = bi->bi_next;
927 clear_bit(BIO_UPTODATE, &bi->bi_flags);
928 if (--bi->bi_phys_segments == 0) {
929 bi->bi_next = return_bi;
930 return_bi = bi;
931 }
932 bi = nextbi;
933 }
934 }
935 }
936 spin_unlock_irq(&conf->device_lock);
937 }
938 if (failed > 1 && syncing) {
939 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
940 clear_bit(STRIPE_SYNCING, &sh->state);
941 syncing = 0;
942 }
943
944
945
946
947 dev = &sh->dev[sh->pd_idx];
948 if ( written &&
949 ( (conf->disks[sh->pd_idx].operational && !test_bit(R5_LOCKED, &dev->flags) &&
950 test_bit(R5_UPTODATE, &dev->flags))
951 || (failed == 1 && failed_num == sh->pd_idx))
952 ) {
953
954 for (i=disks; i--; )
955 if (sh->dev[i].written) {
956 dev = &sh->dev[i];
957 if (!conf->disks[sh->pd_idx].operational ||
958 (!test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags)) ) {
959
960 struct bio *wbi, *wbi2;
961 PRINTK("Return write for disc %d\n", i);
962 wbi = dev->written;
963 dev->written = NULL;
964 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
965 wbi2 = wbi->bi_next;
966 if (--wbi->bi_phys_segments == 0) {
967 wbi->bi_next = return_bi;
968 return_bi = wbi;
969 }
970 wbi = wbi2;
971 }
972 }
973 }
974 }
975
976
977
978
979 if (to_read || (syncing && (uptodate+failed < disks))) {
980 for (i=disks; i--;) {
981 dev = &sh->dev[i];
982 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
983 (dev->toread || syncing || (failed && sh->dev[failed_num].toread))) {
984
985
986
987 if (uptodate == disks-1) {
988 PRINTK("Computing block %d\n", i);
989 compute_block(sh, i);
990 uptodate++;
991 } else if (conf->disks[i].operational) {
992 set_bit(R5_LOCKED, &dev->flags);
993 action[i] = READ+1;
994#if 0
995
996
997 if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
998 ! syncing && !failed && !to_write) {
999 sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
1000 sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
1001 }
1002#endif
1003 locked++;
1004 PRINTK("Reading block %d (sync=%d)\n", i, syncing);
1005 if (syncing)
1006 md_sync_acct(conf->disks[i].bdev, STRIPE_SECTORS);
1007 }
1008 }
1009 }
1010 set_bit(STRIPE_HANDLE, &sh->state);
1011 }
1012
1013
1014 if (to_write) {
1015 int rmw=0, rcw=0;
1016 for (i=disks ; i--;) {
1017
1018 dev = &sh->dev[i];
1019 if ((dev->towrite || i == sh->pd_idx) &&
1020 (!test_bit(R5_LOCKED, &dev->flags)
1021#if 0
1022|| sh->bh_page[i]!=bh->b_page
1023#endif
1024 ) &&
1025 !test_bit(R5_UPTODATE, &dev->flags)) {
1026 if (conf->disks[i].operational
1027
1028 )
1029 rmw++;
1030 else rmw += 2*disks;
1031 }
1032
1033 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
1034 (!test_bit(R5_LOCKED, &dev->flags)
1035#if 0
1036|| sh->bh_page[i] != bh->b_page
1037#endif
1038 ) &&
1039 !test_bit(R5_UPTODATE, &dev->flags)) {
1040 if (conf->disks[i].operational) rcw++;
1041 else rcw += 2*disks;
1042 }
1043 }
1044 PRINTK("for sector %ld, rmw=%d rcw=%d\n", sh->sector, rmw, rcw);
1045 set_bit(STRIPE_HANDLE, &sh->state);
1046 if (rmw < rcw && rmw > 0)
1047
1048 for (i=disks; i--;) {
1049 dev = &sh->dev[i];
1050 if ((dev->towrite || i == sh->pd_idx) &&
1051 !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1052 conf->disks[i].operational) {
1053 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1054 {
1055 PRINTK("Read_old block %d for r-m-w\n", i);
1056 set_bit(R5_LOCKED, &dev->flags);
1057 action[i] = READ+1;
1058 locked++;
1059 } else {
1060 set_bit(STRIPE_DELAYED, &sh->state);
1061 set_bit(STRIPE_HANDLE, &sh->state);
1062 }
1063 }
1064 }
1065 if (rcw <= rmw && rcw > 0)
1066
1067 for (i=disks; i--;) {
1068 dev = &sh->dev[i];
1069 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
1070 !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1071 conf->disks[i].operational) {
1072 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1073 {
1074 PRINTK("Read_old block %d for Reconstruct\n", i);
1075 set_bit(R5_LOCKED, &dev->flags);
1076 action[i] = READ+1;
1077 locked++;
1078 } else {
1079 set_bit(STRIPE_DELAYED, &sh->state);
1080 set_bit(STRIPE_HANDLE, &sh->state);
1081 }
1082 }
1083 }
1084
1085 if (locked == 0 && (rcw == 0 ||rmw == 0)) {
1086 PRINTK("Computing parity...\n");
1087 compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1088
1089 for (i=disks; i--;)
1090 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
1091 PRINTK("Writing block %d\n", i);
1092 locked++;
1093 action[i] = WRITE+1;
1094 if (!conf->disks[i].operational
1095 || (i==sh->pd_idx && failed == 0))
1096 set_bit(STRIPE_INSYNC, &sh->state);
1097 }
1098 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1099 atomic_dec(&conf->preread_active_stripes);
1100 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
1101 md_wakeup_thread(conf->thread);
1102 }
1103 }
1104 }
1105
1106
1107
1108
1109
1110 if (syncing && locked == 0 &&
1111 !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
1112 set_bit(STRIPE_HANDLE, &sh->state);
1113 if (failed == 0) {
1114 char *pagea;
1115 if (uptodate != disks)
1116 BUG();
1117 compute_parity(sh, CHECK_PARITY);
1118 uptodate--;
1119 pagea = page_address(sh->dev[sh->pd_idx].page);
1120 if ((*(u32*)pagea) == 0 &&
1121 !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
1122
1123 set_bit(STRIPE_INSYNC, &sh->state);
1124 }
1125 }
1126 if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1127 struct disk_info *spare;
1128 if (failed==0)
1129 failed_num = sh->pd_idx;
1130
1131 if (!test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)) {
1132 if (uptodate+1 != disks)
1133 BUG();
1134 compute_block(sh, failed_num);
1135 uptodate++;
1136 }
1137 if (uptodate != disks)
1138 BUG();
1139 dev = &sh->dev[failed_num];
1140 set_bit(R5_LOCKED, &dev->flags);
1141 action[failed_num] = WRITE+1;
1142 locked++;
1143 set_bit(STRIPE_INSYNC, &sh->state);
1144 if (conf->disks[failed_num].operational)
1145 md_sync_acct(conf->disks[failed_num].bdev, STRIPE_SECTORS);
1146 else if ((spare=conf->spare))
1147 md_sync_acct(spare->bdev, STRIPE_SECTORS);
1148
1149 }
1150 }
1151 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1152 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
1153 clear_bit(STRIPE_SYNCING, &sh->state);
1154 }
1155
1156 spin_unlock(&sh->lock);
1157
1158 while ((bi=return_bi)) {
1159 return_bi = bi->bi_next;
1160 bi->bi_next = NULL;
1161 bi->bi_end_io(bi);
1162 }
1163 for (i=disks; i-- ;)
1164 if (action[i]) {
1165 struct bio *bi = &sh->dev[i].req;
1166 struct disk_info *spare = conf->spare;
1167 int skip = 0;
1168 if (action[i] == READ+1)
1169 bi->bi_end_io = raid5_end_read_request;
1170 else
1171 bi->bi_end_io = raid5_end_write_request;
1172 if (conf->disks[i].operational)
1173 bi->bi_bdev = conf->disks[i].bdev;
1174 else if (spare && action[i] == WRITE+1)
1175 bi->bi_bdev = spare->bdev;
1176 else skip=1;
1177 if (!skip) {
1178 PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i);
1179 atomic_inc(&sh->count);
1180 bi->bi_sector = sh->sector;
1181 if (action[i] == READ+1)
1182 bi->bi_rw = 0;
1183 else
1184 bi->bi_rw = 1;
1185 bi->bi_flags = 0;
1186 bi->bi_vcnt = 1;
1187 bi->bi_idx = 0;
1188 bi->bi_io_vec = &sh->dev[i].vec;
1189 bi->bi_size = STRIPE_SIZE;
1190 bi->bi_next = NULL;
1191 generic_make_request(bi);
1192 } else {
1193 PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector);
1194 clear_bit(R5_LOCKED, &dev->flags);
1195 set_bit(STRIPE_HANDLE, &sh->state);
1196 }
1197 }
1198}
1199
1200static inline void raid5_activate_delayed(raid5_conf_t *conf)
1201{
1202 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
1203 while (!list_empty(&conf->delayed_list)) {
1204 struct list_head *l = conf->delayed_list.next;
1205 struct stripe_head *sh;
1206 sh = list_entry(l, struct stripe_head, lru);
1207 list_del_init(l);
1208 clear_bit(STRIPE_DELAYED, &sh->state);
1209 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1210 atomic_inc(&conf->preread_active_stripes);
1211 list_add_tail(&sh->lru, &conf->handle_list);
1212 }
1213 }
1214}
1215static void raid5_unplug_device(void *data)
1216{
1217 request_queue_t *q = data;
1218 mddev_t *mddev = q->queuedata;
1219 raid5_conf_t *conf = mddev_to_conf(mddev);
1220 unsigned long flags;
1221
1222 spin_lock_irqsave(&conf->device_lock, flags);
1223
1224 if (blk_remove_plug(q))
1225 raid5_activate_delayed(conf);
1226 md_wakeup_thread(conf->thread);
1227
1228 spin_unlock_irqrestore(&conf->device_lock, flags);
1229}
1230
1231static inline void raid5_plug_device(raid5_conf_t *conf)
1232{
1233 spin_lock_irq(&conf->device_lock);
1234 blk_plug_device(&conf->mddev->queue);
1235 spin_unlock_irq(&conf->device_lock);
1236}
1237
1238static int make_request (request_queue_t *q, struct bio * bi)
1239{
1240 mddev_t *mddev = q->queuedata;
1241 raid5_conf_t *conf = mddev_to_conf(mddev);
1242 const unsigned int raid_disks = conf->raid_disks;
1243 const unsigned int data_disks = raid_disks - 1;
1244 unsigned int dd_idx, pd_idx;
1245 sector_t new_sector;
1246 sector_t logical_sector, last_sector;
1247 struct stripe_head *sh;
1248
1249 logical_sector = bi->bi_sector & ~(STRIPE_SECTORS-1);
1250 last_sector = bi->bi_sector + (bi->bi_size>>9);
1251
1252 bi->bi_next = NULL;
1253 set_bit(BIO_UPTODATE, &bi->bi_flags);
1254 bi->bi_phys_segments = 1;
1255 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
1256
1257 new_sector = raid5_compute_sector(logical_sector,
1258 raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1259
1260 PRINTK("raid5: make_request, sector %ul logical %ul\n",
1261 new_sector, logical_sector);
1262
1263 sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
1264 if (sh) {
1265
1266 add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
1267
1268 raid5_plug_device(conf);
1269 handle_stripe(sh);
1270 release_stripe(sh);
1271 }
1272 }
1273 spin_lock_irq(&conf->device_lock);
1274 if (--bi->bi_phys_segments == 0)
1275 bi->bi_end_io(bi);
1276 spin_unlock_irq(&conf->device_lock);
1277 return 0;
1278}
1279
1280
1281static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
1282{
1283 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1284 struct stripe_head *sh;
1285 int sectors_per_chunk = conf->chunk_size >> 9;
1286 unsigned long stripe = sector_nr/sectors_per_chunk;
1287 int chunk_offset = sector_nr % sectors_per_chunk;
1288 int dd_idx, pd_idx;
1289 unsigned long first_sector;
1290 int raid_disks = conf->raid_disks;
1291 int data_disks = raid_disks-1;
1292
1293 if (sector_nr >= mddev->size <<1)
1294
1295 return 0;
1296
1297 first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
1298 + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1299 sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
1300 spin_lock(&sh->lock);
1301 set_bit(STRIPE_SYNCING, &sh->state);
1302 clear_bit(STRIPE_INSYNC, &sh->state);
1303 spin_unlock(&sh->lock);
1304
1305 handle_stripe(sh);
1306 release_stripe(sh);
1307
1308 return STRIPE_SECTORS;
1309}
1310
1311
1312
1313
1314
1315
1316
1317
1318static void raid5d (void *data)
1319{
1320 struct stripe_head *sh;
1321 raid5_conf_t *conf = data;
1322 mddev_t *mddev = conf->mddev;
1323 int handled;
1324
1325 PRINTK("+++ raid5d active\n");
1326
1327 handled = 0;
1328 spin_lock_irq(&conf->device_lock);
1329 while (1) {
1330 struct list_head *first;
1331
1332 if (list_empty(&conf->handle_list) &&
1333 atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
1334 !blk_queue_plugged(&mddev->queue) &&
1335 !list_empty(&conf->delayed_list))
1336 raid5_activate_delayed(conf);
1337
1338 if (list_empty(&conf->handle_list))
1339 break;
1340
1341 first = conf->handle_list.next;
1342 sh = list_entry(first, struct stripe_head, lru);
1343
1344 list_del_init(first);
1345 atomic_inc(&sh->count);
1346 if (atomic_read(&sh->count)!= 1)
1347 BUG();
1348 spin_unlock_irq(&conf->device_lock);
1349
1350 handled++;
1351 handle_stripe(sh);
1352 release_stripe(sh);
1353
1354 spin_lock_irq(&conf->device_lock);
1355 }
1356 PRINTK("%d stripes handled\n", handled);
1357
1358 spin_unlock_irq(&conf->device_lock);
1359
1360 PRINTK("--- raid5d inactive\n");
1361}
1362
1363static int run (mddev_t *mddev)
1364{
1365 raid5_conf_t *conf;
1366 int i, raid_disk, memory;
1367 mdk_rdev_t *rdev;
1368 struct disk_info *disk;
1369 struct list_head *tmp;
1370
1371 MOD_INC_USE_COUNT;
1372
1373 if (mddev->level != 5 && mddev->level != 4) {
1374 printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), mddev->level);
1375 MOD_DEC_USE_COUNT;
1376 return -EIO;
1377 }
1378
1379 mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
1380 if ((conf = mddev->private) == NULL)
1381 goto abort;
1382 memset (conf, 0, sizeof (*conf));
1383 conf->mddev = mddev;
1384
1385 if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
1386 goto abort;
1387 memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
1388
1389 conf->device_lock = SPIN_LOCK_UNLOCKED;
1390 init_waitqueue_head(&conf->wait_for_stripe);
1391 INIT_LIST_HEAD(&conf->handle_list);
1392 INIT_LIST_HEAD(&conf->delayed_list);
1393 INIT_LIST_HEAD(&conf->inactive_list);
1394 atomic_set(&conf->active_stripes, 0);
1395 atomic_set(&conf->preread_active_stripes, 0);
1396
1397 mddev->queue.unplug_fn = raid5_unplug_device;
1398
1399 PRINTK("raid5: run(md%d) called.\n", mdidx(mddev));
1400
1401 ITERATE_RDEV(mddev,rdev,tmp) {
1402
1403
1404
1405
1406
1407 raid_disk = rdev->raid_disk;
1408 disk = conf->disks + raid_disk;
1409
1410 if (rdev->faulty) {
1411 printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", bdev_partition_name(rdev->bdev));
1412 disk->bdev = rdev->bdev;
1413
1414 disk->operational = 0;
1415 disk->write_only = 0;
1416 disk->spare = 0;
1417 disk->used_slot = 1;
1418 continue;
1419 }
1420 if (rdev->in_sync) {
1421 if (disk->operational) {
1422 printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", bdev_partition_name(rdev->bdev), raid_disk);
1423 continue;
1424 }
1425 printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", bdev_partition_name(rdev->bdev), raid_disk);
1426
1427 disk->bdev = rdev->bdev;
1428 disk->operational = 1;
1429 disk->used_slot = 1;
1430
1431 conf->working_disks++;
1432 } else {
1433
1434
1435
1436 printk(KERN_INFO "raid5: spare disk %s\n", bdev_partition_name(rdev->bdev));
1437 disk->bdev = rdev->bdev;
1438
1439 disk->operational = 0;
1440 disk->write_only = 0;
1441 disk->spare = 1;
1442 disk->used_slot = 1;
1443 }
1444 }
1445
1446 for (i = 0; i < conf->raid_disks; i++) {
1447 disk = conf->disks + i;
1448
1449 if (!disk->used_slot) {
1450 disk->bdev = NULL;
1451
1452 disk->operational = 0;
1453 disk->write_only = 0;
1454 disk->spare = 0;
1455 disk->used_slot = 1;
1456 }
1457 }
1458
1459 conf->raid_disks = mddev->raid_disks;
1460
1461
1462
1463 mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
1464 conf->mddev = mddev;
1465 conf->chunk_size = mddev->chunk_size;
1466 conf->level = mddev->level;
1467 conf->algorithm = mddev->layout;
1468 conf->max_nr_stripes = NR_STRIPES;
1469
1470#if 0
1471 for (i = 0; i < conf->raid_disks; i++) {
1472 if (!conf->disks[i].used_slot) {
1473 MD_BUG();
1474 goto abort;
1475 }
1476 }
1477#endif
1478 if (!conf->chunk_size || conf->chunk_size % 4) {
1479 printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
1480 goto abort;
1481 }
1482 if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
1483 printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
1484 goto abort;
1485 }
1486 if (mddev->degraded > 1) {
1487 printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
1488 goto abort;
1489 }
1490
1491 if (mddev->degraded == 1 &&
1492 !(mddev->state & (1<<MD_SB_CLEAN))) {
1493 printk(KERN_ERR "raid5: cannot start dirty degraded array for md%d\n", mdidx(mddev));
1494 goto abort;
1495 }
1496
1497 {
1498 const char * name = "raid5d";
1499
1500 conf->thread = md_register_thread(raid5d, conf, name);
1501 if (!conf->thread) {
1502 printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
1503 goto abort;
1504 }
1505 }
1506
1507 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
1508 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
1509 if (grow_stripes(conf, conf->max_nr_stripes)) {
1510 printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
1511 shrink_stripes(conf);
1512 md_unregister_thread(conf->thread);
1513 goto abort;
1514 } else
1515 printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
1516
1517 if (mddev->degraded == 0)
1518 printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev),
1519 mddev->raid_disks-mddev->degraded, mddev->raid_disks, conf->algorithm);
1520 else
1521 printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev),
1522 mddev->raid_disks = mddev->degraded, mddev->raid_disks, conf->algorithm);
1523
1524 print_raid5_conf(conf);
1525
1526
1527 return (0);
1528abort:
1529 if (conf) {
1530 print_raid5_conf(conf);
1531 if (conf->stripe_hashtbl)
1532 free_pages((unsigned long) conf->stripe_hashtbl,
1533 HASH_PAGES_ORDER);
1534 kfree(conf);
1535 }
1536 mddev->private = NULL;
1537 printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
1538 MOD_DEC_USE_COUNT;
1539 return -EIO;
1540}
1541
1542
1543
1544static int stop (mddev_t *mddev)
1545{
1546 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1547
1548 md_unregister_thread(conf->thread);
1549 shrink_stripes(conf);
1550 free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
1551 kfree(conf);
1552 mddev->private = NULL;
1553 MOD_DEC_USE_COUNT;
1554 return 0;
1555}
1556
1557#if RAID5_DEBUG
1558static void print_sh (struct stripe_head *sh)
1559{
1560 int i;
1561
1562 printk("sh %lu, pd_idx %d, state %ld.\n", sh->sector, sh->pd_idx, sh->state);
1563 printk("sh %lu, count %d.\n", sh->sector, atomic_read(&sh->count));
1564 printk("sh %lu, ", sh->sector);
1565 for (i = 0; i < sh->raid_conf->raid_disks; i++) {
1566 printk("(cache%d: %p %ld) ", i, sh->dev[i].page, sh->dev[i].flags);
1567 }
1568 printk("\n");
1569}
1570
1571static void printall (raid5_conf_t *conf)
1572{
1573 struct stripe_head *sh;
1574 int i;
1575
1576 spin_lock_irq(&conf->device_lock);
1577 for (i = 0; i < NR_HASH; i++) {
1578 sh = conf->stripe_hashtbl[i];
1579 for (; sh; sh = sh->hash_next) {
1580 if (sh->raid_conf != conf)
1581 continue;
1582 print_sh(sh);
1583 }
1584 }
1585 spin_unlock_irq(&conf->device_lock);
1586
1587 PRINTK("--- raid5d inactive\n");
1588}
1589#endif
1590
1591static int status (char *page, mddev_t *mddev)
1592{
1593 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1594 int sz = 0, i;
1595
1596 sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
1597 sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
1598 for (i = 0; i < conf->raid_disks; i++)
1599 sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_");
1600 sz += sprintf (page+sz, "]");
1601#if RAID5_DEBUG
1602#define D(x) \
1603 sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x))
1604 printall(conf);
1605#endif
1606 return sz;
1607}
1608
1609static void print_raid5_conf (raid5_conf_t *conf)
1610{
1611 int i;
1612 struct disk_info *tmp;
1613
1614 printk("RAID5 conf printout:\n");
1615 if (!conf) {
1616 printk("(conf==NULL)\n");
1617 return;
1618 }
1619 printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
1620 conf->working_disks, conf->failed_disks);
1621
1622#if RAID5_DEBUG
1623 for (i = 0; i < MD_SB_DISKS; i++) {
1624#else
1625 for (i = 0; i < conf->working_disks+conf->failed_disks; i++) {
1626#endif
1627 tmp = conf->disks + i;
1628 printk(" disk %d, s:%d, o:%d, us:%d dev:%s\n",
1629 i, tmp->spare,tmp->operational,
1630 tmp->used_slot,
1631 bdev_partition_name(tmp->bdev));
1632 }
1633}
1634
1635static int raid5_spare_active(mddev_t *mddev)
1636{
1637 int err = 0;
1638 int i, failed_disk=-1, spare_disk=-1;
1639 raid5_conf_t *conf = mddev->private;
1640 struct disk_info *tmp, *sdisk, *fdisk;
1641 mdk_rdev_t *spare_rdev, *failed_rdev;
1642
1643 print_raid5_conf(conf);
1644 spin_lock_irq(&conf->device_lock);
1645 for (i = 0; i < conf->raid_disks; i++) {
1646 tmp = conf->disks + i;
1647 if ((!tmp->operational && !tmp->spare) ||
1648 !tmp->used_slot) {
1649 failed_disk = i;
1650 break;
1651 }
1652 }
1653 if (failed_disk == -1) {
1654 MD_BUG();
1655 err = 1;
1656 goto abort;
1657 }
1658
1659
1660
1661
1662 spare_disk = mddev->spare->raid_disk;
1663
1664 if (!conf->spare) {
1665 MD_BUG();
1666 err = 1;
1667 goto abort;
1668 }
1669 sdisk = conf->disks + spare_disk;
1670 fdisk = conf->disks + failed_disk;
1671
1672
1673
1674
1675 spare_rdev = find_rdev_nr(mddev, spare_disk);
1676 failed_rdev = find_rdev_nr(mddev, failed_disk);
1677
1678
1679
1680
1681 spare_rdev->desc_nr = failed_disk;
1682 spare_rdev->raid_disk = failed_disk;
1683 if (failed_rdev) {
1684 failed_rdev->desc_nr = spare_disk;
1685 failed_rdev->raid_disk = spare_disk;
1686 }
1687
1688 xchg_values(*fdisk, *sdisk);
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698 if (!sdisk->bdev)
1699 sdisk->used_slot = 0;
1700
1701
1702
1703
1704 fdisk->spare = 0;
1705 fdisk->write_only = 0;
1706
1707
1708
1709
1710
1711
1712 mddev->degraded--;
1713 conf->failed_disks--;
1714 conf->working_disks++;
1715 conf->spare = NULL;
1716abort:
1717 spin_unlock_irq(&conf->device_lock);
1718 print_raid5_conf(conf);
1719 return err;
1720}
1721
1722static int raid5_spare_inactive(mddev_t *mddev)
1723{
1724 raid5_conf_t *conf = mddev->private;
1725 struct disk_info *p;
1726 int err = 0;
1727
1728 print_raid5_conf(conf);
1729 spin_lock_irq(&conf->device_lock);
1730 p = conf->disks + mddev->spare->raid_disk;
1731 if (p) {
1732 p->operational = 0;
1733 p->write_only = 0;
1734 if (conf->spare == p)
1735 conf->spare = NULL;
1736 } else {
1737 MD_BUG();
1738 err = 1;
1739 }
1740 spin_unlock_irq(&conf->device_lock);
1741 print_raid5_conf(conf);
1742 return err;
1743}
1744
1745static int raid5_spare_write(mddev_t *mddev)
1746{
1747 raid5_conf_t *conf = mddev->private;
1748 struct disk_info *p;
1749 int err = 0;
1750
1751 print_raid5_conf(conf);
1752 spin_lock_irq(&conf->device_lock);
1753 p = conf->disks + mddev->spare->raid_disk;
1754 if (p && !conf->spare) {
1755 p->operational = 1;
1756 p->write_only = 1;
1757 conf->spare = p;
1758 } else {
1759 MD_BUG();
1760 err = 1;
1761 }
1762 spin_unlock_irq(&conf->device_lock);
1763 print_raid5_conf(conf);
1764 return err;
1765}
1766
1767static int raid5_remove_disk(mddev_t *mddev, int number)
1768{
1769 raid5_conf_t *conf = mddev->private;
1770 int err = 1;
1771 struct disk_info *p = conf->disks + number;
1772
1773 print_raid5_conf(conf);
1774 spin_lock_irq(&conf->device_lock);
1775
1776 if (p->used_slot) {
1777 if (p->operational) {
1778 err = -EBUSY;
1779 goto abort;
1780 }
1781 p->bdev = NULL;
1782 p->used_slot = 0;
1783 err = 0;
1784 }
1785 if (err)
1786 MD_BUG();
1787abort:
1788 spin_unlock_irq(&conf->device_lock);
1789 print_raid5_conf(conf);
1790 return err;
1791}
1792
1793static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1794{
1795 raid5_conf_t *conf = mddev->private;
1796 int err = 1;
1797 struct disk_info *p = conf->disks + rdev->raid_disk;
1798
1799 print_raid5_conf(conf);
1800 spin_lock_irq(&conf->device_lock);
1801
1802
1803
1804
1805 if (!p->used_slot) {
1806
1807 p->bdev = rdev->bdev;
1808 p->operational = 0;
1809 p->write_only = 0;
1810 p->spare = 1;
1811 p->used_slot = 1;
1812 err = 0;
1813 }
1814 if (err)
1815 MD_BUG();
1816 spin_unlock_irq(&conf->device_lock);
1817 print_raid5_conf(conf);
1818 return err;
1819}
1820
1821static mdk_personality_t raid5_personality=
1822{
1823 .name = "raid5",
1824 .make_request = make_request,
1825 .run = run,
1826 .stop = stop,
1827 .status = status,
1828 .error_handler = error,
1829 .hot_add_disk = raid5_add_disk,
1830 .hot_remove_disk= raid5_remove_disk,
1831 .spare_write = raid5_spare_write,
1832 .spare_inactive = raid5_spare_inactive,
1833 .spare_active = raid5_spare_active,
1834 .sync_request = sync_request,
1835};
1836
1837static int __init raid5_init (void)
1838{
1839 return register_md_personality (RAID5, &raid5_personality);
1840}
1841
1842static void raid5_exit (void)
1843{
1844 unregister_md_personality (RAID5);
1845}
1846
1847module_init(raid5_init);
1848module_exit(raid5_exit);
1849MODULE_LICENSE("GPL");
1850