1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19#include <linux/config.h>
20#include <linux/module.h>
21#include <linux/locks.h>
22#include <linux/slab.h>
23#include <linux/raid/raid5.h>
24#include <asm/bitops.h>
25#include <asm/atomic.h>
26
27static mdk_personality_t raid5_personality;
28
29
30
31
32
/*
 * Stripe cache tuning constants.
 *
 * NR_STRIPES is the figure the wake-up thresholds in this file are derived
 * from: blocked allocators are woken once fewer than 3/4 of it are active.
 * IO_THRESHOLD bounds conf->preread_active_stripes: delayed stripes are only
 * activated while that counter is below it.
 */
#define NR_STRIPES 256
#define IO_THRESHOLD 1
/*
 * Size of the stripe hash table in pages; HASH_PAGES_ORDER is presumably the
 * matching page-allocation order (its use is outside this part of the file).
 */
#define HASH_PAGES 1
#define HASH_PAGES_ORDER 0
/* Number of bucket pointers that fit into the hash table pages. */
#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
/* Bucket index mask; only correct while NR_HASH is a power of two. */
#define HASH_MASK (NR_HASH - 1)
/*
 * Hash bucket (usable as an lvalue) for the stripe starting at 'sect'.
 * The sector is divided by the buffer size expressed in 512-byte sectors
 * (buffer_size >> 9) and the result is masked into the table.
 * Note: divides by zero if conf->buffer_size is below 512.
 */
#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) / ((conf)->buffer_size >> 9)) & HASH_MASK])
40
41
42
43
/*
 * Debugging knobs.
 *
 * RAID5_DEBUG    - when non-zero, PRINTK() forwards to printk() and the
 *                  inline keywords are defined away so every function is
 *                  emitted out of line.
 * RAID5_PARANOIA - when non-zero (and on SMP builds), CHECK_DEVLOCK()
 *                  BUG()s unless conf->device_lock is held.  The macro
 *                  expects a variable named 'conf' in the calling scope.
 *
 * Both macros expand to a single do { } while (0) statement so that they
 * can be used safely as the body of an unbraced if/else.
 */
#define RAID5_DEBUG 0
#define RAID5_PARANOIA 1
#if RAID5_PARANOIA && CONFIG_SMP
# define CHECK_DEVLOCK() do { if (!spin_is_locked(&conf->device_lock)) BUG(); } while (0)
#else
# define CHECK_DEVLOCK() do { } while (0)
#endif

#if RAID5_DEBUG
#define PRINTK(x...) printk(x)
#define inline
#define __inline__
#else
#define PRINTK(x...) do { } while (0)
#endif
59
60static void print_raid5_conf (raid5_conf_t *conf);
61
62static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
63{
64 if (atomic_dec_and_test(&sh->count)) {
65 if (!list_empty(&sh->lru))
66 BUG();
67 if (atomic_read(&conf->active_stripes)==0)
68 BUG();
69 if (test_bit(STRIPE_HANDLE, &sh->state)) {
70 if (test_bit(STRIPE_DELAYED, &sh->state))
71 list_add_tail(&sh->lru, &conf->delayed_list);
72 else
73 list_add_tail(&sh->lru, &conf->handle_list);
74 md_wakeup_thread(conf->thread);
75 } else {
76 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
77 atomic_dec(&conf->preread_active_stripes);
78 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
79 md_wakeup_thread(conf->thread);
80 }
81 list_add_tail(&sh->lru, &conf->inactive_list);
82 atomic_dec(&conf->active_stripes);
83 if (!conf->inactive_blocked ||
84 atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
85 wake_up(&conf->wait_for_stripe);
86 }
87 }
88}
89static void release_stripe(struct stripe_head *sh)
90{
91 raid5_conf_t *conf = sh->raid_conf;
92 unsigned long flags;
93
94 spin_lock_irqsave(&conf->device_lock, flags);
95 __release_stripe(conf, sh);
96 spin_unlock_irqrestore(&conf->device_lock, flags);
97}
98
99static void remove_hash(struct stripe_head *sh)
100{
101 PRINTK("remove_hash(), stripe %lu\n", sh->sector);
102
103 if (sh->hash_pprev) {
104 if (sh->hash_next)
105 sh->hash_next->hash_pprev = sh->hash_pprev;
106 *sh->hash_pprev = sh->hash_next;
107 sh->hash_pprev = NULL;
108 }
109}
110
111static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
112{
113 struct stripe_head **shp = &stripe_hash(conf, sh->sector);
114
115 PRINTK("insert_hash(), stripe %lu\n",sh->sector);
116
117 CHECK_DEVLOCK();
118 if ((sh->hash_next = *shp) != NULL)
119 (*shp)->hash_pprev = &sh->hash_next;
120 *shp = sh;
121 sh->hash_pprev = shp;
122}
123
124
125
126static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
127{
128 struct stripe_head *sh = NULL;
129 struct list_head *first;
130
131 CHECK_DEVLOCK();
132 if (list_empty(&conf->inactive_list))
133 goto out;
134 first = conf->inactive_list.next;
135 sh = list_entry(first, struct stripe_head, lru);
136 list_del_init(first);
137 remove_hash(sh);
138 atomic_inc(&conf->active_stripes);
139out:
140 return sh;
141}
142
143static void shrink_buffers(struct stripe_head *sh, int num)
144{
145 struct buffer_head *bh;
146 int i;
147
148 for (i=0; i<num ; i++) {
149 bh = sh->bh_cache[i];
150 if (!bh)
151 return;
152 sh->bh_cache[i] = NULL;
153 free_page((unsigned long) bh->b_data);
154 kfree(bh);
155 }
156}
157
158static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority)
159{
160 struct buffer_head *bh;
161 int i;
162
163 for (i=0; i<num; i++) {
164 struct page *page;
165 bh = kmalloc(sizeof(struct buffer_head), priority);
166 if (!bh)
167 return 1;
168 memset(bh, 0, sizeof (struct buffer_head));
169 init_waitqueue_head(&bh->b_wait);
170 if ((page = alloc_page(priority)))
171 bh->b_data = page_address(page);
172 else {
173 kfree(bh);
174 return 1;
175 }
176 atomic_set(&bh->b_count, 0);
177 bh->b_page = page;
178 sh->bh_cache[i] = bh;
179
180 }
181 return 0;
182}
183
184static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i);
185
186static inline void init_stripe(struct stripe_head *sh, unsigned long sector)
187{
188 raid5_conf_t *conf = sh->raid_conf;
189 int disks = conf->raid_disks, i;
190
191 if (atomic_read(&sh->count) != 0)
192 BUG();
193 if (test_bit(STRIPE_HANDLE, &sh->state))
194 BUG();
195
196 CHECK_DEVLOCK();
197 PRINTK("init_stripe called, stripe %lu\n", sh->sector);
198
199 remove_hash(sh);
200
201 sh->sector = sector;
202 sh->size = conf->buffer_size;
203 sh->state = 0;
204
205 for (i=disks; i--; ) {
206 if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] ||
207 buffer_locked(sh->bh_cache[i])) {
208 printk("sector=%lx i=%d %p %p %p %d\n",
209 sh->sector, i, sh->bh_read[i],
210 sh->bh_write[i], sh->bh_written[i],
211 buffer_locked(sh->bh_cache[i]));
212 BUG();
213 }
214 clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state);
215 raid5_build_block(sh, i);
216 }
217 insert_hash(conf, sh);
218}
219
220
221
222
223static void shrink_stripe_cache(raid5_conf_t *conf)
224{
225 int i;
226 CHECK_DEVLOCK();
227 if (atomic_read(&conf->active_stripes))
228 BUG();
229 for (i=0; i < NR_HASH; i++) {
230 struct stripe_head *sh;
231 while ((sh = conf->stripe_hashtbl[i]))
232 remove_hash(sh);
233 }
234}
235
236static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
237{
238 struct stripe_head *sh;
239
240 CHECK_DEVLOCK();
241 PRINTK("__find_stripe, sector %lu\n", sector);
242 for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
243 if (sh->sector == sector)
244 return sh;
245 PRINTK("__stripe %lu not in cache\n", sector);
246 return NULL;
247}
248
/*
 * Find or allocate the stripe covering 'sector' and take a reference on it.
 *
 * @conf:    array configuration
 * @sector:  device sector the stripe must start at (rounded down to a
 *           buffer boundary when @size is 0)
 * @size:    buffer size in bytes the caller works with, or 0 for "whatever
 *           the cache currently uses"
 * @noblock: if non-zero, return NULL instead of sleeping when no free
 *           stripe is available
 *
 * Takes and releases conf->device_lock itself.  Returns the stripe with
 * sh->count incremented, or NULL (only possible with @noblock).
 */
static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector, int size, int noblock)
{
	struct stripe_head *sh;

	PRINTK("get_stripe, sector %lu\n", sector);

	md_spin_lock_irq(&conf->device_lock);

	do {
		if (conf->buffer_size == 0 ||
		    (size && size != conf->buffer_size)) {
			/*
			 * Either a buffer-size change is in progress
			 * (buffer_size == 0) or the caller needs a different
			 * size than the cache currently holds.
			 *
			 * A size-0 caller just waits for a size to be set.
			 * Otherwise buffer_size is zeroed to mark the change
			 * as pending, we wait until no stripe is active (or
			 * somebody else set a size), then drop all hashed
			 * stripes and install the new size.
			 */
			int oldsize = conf->buffer_size;
			PRINTK("get_stripe %ld/%d buffer_size is %d, %d active\n", sector, size, conf->buffer_size, atomic_read(&conf->active_stripes));
			if (size==0)
				wait_event_lock_irq(conf->wait_for_stripe,
						    conf->buffer_size,
						    conf->device_lock);
			else {
				while (conf->buffer_size != size && atomic_read(&conf->active_stripes)) {
					conf->buffer_size = 0;
					wait_event_lock_irq(conf->wait_for_stripe,
							    atomic_read(&conf->active_stripes)==0 || conf->buffer_size,
							    conf->device_lock);
					PRINTK("waited and now %ld/%d buffer_size is %d - %d active\n", sector, size,
					       conf->buffer_size, atomic_read(&conf->active_stripes));
				}

				if (conf->buffer_size != size) {
					printk("raid5: switching cache buffer size, %d --> %d\n", oldsize, size);
					shrink_stripe_cache(conf);
					if (size==0) BUG();
					conf->buffer_size = size;
					PRINTK("size now %d\n", conf->buffer_size);
				}
			}
		}
		/* Size-agnostic callers get the sector aligned down to the buffer size. */
		if (size == 0)
			sector -= sector & ((conf->buffer_size>>9)-1);

		sh = __find_stripe(conf, sector);
		if (!sh) {
			/* Not cached: recycle an inactive stripe unless allocation is blocked. */
			if (!conf->inactive_blocked)
				sh = get_free_stripe(conf);
			if (noblock && sh == NULL)
				break;
			if (!sh) {
				/*
				 * Out of stripes: block further allocation and
				 * sleep until usage drops below 3/4 (or the
				 * block is lifted), then retry the whole loop.
				 */
				conf->inactive_blocked = 1;
				wait_event_lock_irq(conf->wait_for_stripe,
						    !list_empty(&conf->inactive_list) &&
						    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
						     || !conf->inactive_blocked),
						    conf->device_lock);
				conf->inactive_blocked = 0;
			} else
				init_stripe(sh, sector);
		} else {
			if (atomic_read(&sh->count)) {
				/* Already in use: must not be on any list. */
				if (!list_empty(&sh->lru))
					BUG();
			} else {
				/*
				 * Unreferenced stripe sitting on a list.  Only
				 * stripes without STRIPE_HANDLE were counted
				 * out of active_stripes on release, so only
				 * those are counted back in here.
				 */
				if (!test_bit(STRIPE_HANDLE, &sh->state))
					atomic_inc(&conf->active_stripes);
				if (list_empty(&sh->lru))
					BUG();
				list_del_init(&sh->lru);
			}
		}
	} while (sh == NULL);

	if (sh)
		atomic_inc(&sh->count);

	md_spin_unlock_irq(&conf->device_lock);
	return sh;
}
332
333static int grow_stripes(raid5_conf_t *conf, int num, int priority)
334{
335 struct stripe_head *sh;
336
337 while (num--) {
338 sh = kmalloc(sizeof(struct stripe_head), priority);
339 if (!sh)
340 return 1;
341 memset(sh, 0, sizeof(*sh));
342 sh->raid_conf = conf;
343 sh->lock = SPIN_LOCK_UNLOCKED;
344
345 if (grow_buffers(sh, conf->raid_disks, PAGE_SIZE, priority)) {
346 shrink_buffers(sh, conf->raid_disks);
347 kfree(sh);
348 return 1;
349 }
350
351 atomic_set(&sh->count, 1);
352 atomic_inc(&conf->active_stripes);
353 INIT_LIST_HEAD(&sh->lru);
354 release_stripe(sh);
355 }
356 return 0;
357}
358
359static void shrink_stripes(raid5_conf_t *conf, int num)
360{
361 struct stripe_head *sh;
362
363 while (num--) {
364 spin_lock_irq(&conf->device_lock);
365 sh = get_free_stripe(conf);
366 spin_unlock_irq(&conf->device_lock);
367 if (!sh)
368 break;
369 if (atomic_read(&sh->count))
370 BUG();
371 shrink_buffers(sh, conf->raid_disks);
372 kfree(sh);
373 atomic_dec(&conf->active_stripes);
374 }
375}
376
377
/*
 * Completion handler for reads issued on a stripe's cache buffers.
 *
 * @bh:       the stripe cache buffer whose read finished (b_private points
 *            at the owning stripe)
 * @uptodate: non-zero if the read succeeded
 *
 * On success the first queued read request may be completed directly from
 * here; on failure the device is reported through md_error().  In both
 * cases the buffer is unlocked, the stripe is flagged for handling and the
 * reference taken when the I/O was submitted is dropped.
 */
static void raid5_end_read_request (struct buffer_head * bh, int uptodate)
{
	struct stripe_head *sh = bh->b_private;
	raid5_conf_t *conf = sh->raid_conf;
	int disks = conf->raid_disks, i;
	unsigned long flags;

	/* Work out which device slot this buffer belongs to. */
	for (i=0 ; i<disks; i++)
		if (bh == sh->bh_cache[i])
			break;

	PRINTK("end_read_request %lu/%d, count: %d, uptodate %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
	if (i == disks) {
		BUG();
		return;
	}

	if (uptodate) {
		struct buffer_head *buffer;
		spin_lock_irqsave(&conf->device_lock, flags);
		/*
		 * Take the first pending read off the list, but only if its
		 * page is not in high memory or is the very page the data
		 * was read into (see the page swap in handle_stripe()).
		 * NOTE(review): presumably because a highmem page cannot be
		 * copied into from this context without mapping it - confirm.
		 */
		buffer = sh->bh_read[i];
		if (buffer &&
		    (!PageHighMem(buffer->b_page)
		     || buffer->b_page == bh->b_page )
			) {
			sh->bh_read[i] = buffer->b_reqnext;
			buffer->b_reqnext = NULL;
		} else
			buffer = NULL;
		spin_unlock_irqrestore(&conf->device_lock, flags);
		/* The cache only holds the data if it was read into the cache's own page. */
		if (sh->bh_page[i]==NULL)
			set_bit(BH_Uptodate, &bh->b_state);
		if (buffer) {
			if (buffer->b_page != bh->b_page)
				memcpy(buffer->b_data, bh->b_data, bh->b_size);
			buffer->b_end_io(buffer, 1);
		}
	} else {
		md_error(conf->mddev, bh->b_dev);
		clear_bit(BH_Uptodate, &bh->b_state);
	}

	/*
	 * If the read was redirected into a requester's page, give the cache
	 * buffer its own page back; its contents are then not valid.
	 */
	if (sh->bh_page[i]) {
		bh->b_page = sh->bh_page[i];
		bh->b_data = page_address(bh->b_page);
		sh->bh_page[i] = NULL;
		clear_bit(BH_Uptodate, &bh->b_state);
	}
	clear_bit(BH_Lock, &bh->b_state);
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}
435
436static void raid5_end_write_request (struct buffer_head *bh, int uptodate)
437{
438 struct stripe_head *sh = bh->b_private;
439 raid5_conf_t *conf = sh->raid_conf;
440 int disks = conf->raid_disks, i;
441 unsigned long flags;
442
443 for (i=0 ; i<disks; i++)
444 if (bh == sh->bh_cache[i])
445 break;
446
447 PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
448 if (i == disks) {
449 BUG();
450 return;
451 }
452
453 md_spin_lock_irqsave(&conf->device_lock, flags);
454 if (!uptodate)
455 md_error(conf->mddev, bh->b_dev);
456 clear_bit(BH_Lock, &bh->b_state);
457 set_bit(STRIPE_HANDLE, &sh->state);
458 __release_stripe(conf, sh);
459 md_spin_unlock_irqrestore(&conf->device_lock, flags);
460}
461
462
463
464static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i)
465{
466 raid5_conf_t *conf = sh->raid_conf;
467 struct buffer_head *bh = sh->bh_cache[i];
468 unsigned long block = sh->sector / (sh->size >> 9);
469
470 init_buffer(bh, raid5_end_read_request, sh);
471 bh->b_dev = conf->disks[i].dev;
472 bh->b_blocknr = block;
473
474 bh->b_state = (1 << BH_Req) | (1 << BH_Mapped);
475 bh->b_size = sh->size;
476 bh->b_list = BUF_LOCKED;
477 return bh;
478}
479
480static int raid5_error (mddev_t *mddev, kdev_t dev)
481{
482 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
483 mdp_super_t *sb = mddev->sb;
484 struct disk_info *disk;
485 int i;
486
487 PRINTK("raid5_error called\n");
488
489 for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
490 if (disk->dev == dev) {
491 if (disk->operational) {
492 disk->operational = 0;
493 mark_disk_faulty(sb->disks+disk->number);
494 mark_disk_nonsync(sb->disks+disk->number);
495 mark_disk_inactive(sb->disks+disk->number);
496 sb->active_disks--;
497 sb->working_disks--;
498 sb->failed_disks++;
499 mddev->sb_dirty = 1;
500 conf->working_disks--;
501 conf->failed_disks++;
502 md_wakeup_thread(conf->thread);
503 printk (KERN_ALERT
504 "raid5: Disk failure on %s, disabling device."
505 " Operation continuing on %d devices\n",
506 partition_name (dev), conf->working_disks);
507 }
508 return 0;
509 }
510 }
511
512
513
514 if (conf->spare) {
515 disk = conf->spare;
516 if (disk->dev == dev) {
517 printk (KERN_ALERT
518 "raid5: Disk failure on spare %s\n",
519 partition_name (dev));
520 if (!conf->spare->operational) {
521
522 return -EIO;
523 }
524 disk->operational = 0;
525 disk->write_only = 0;
526 conf->spare = NULL;
527 mark_disk_faulty(sb->disks+disk->number);
528 mark_disk_nonsync(sb->disks+disk->number);
529 mark_disk_inactive(sb->disks+disk->number);
530 sb->spare_disks--;
531 sb->working_disks--;
532 sb->failed_disks++;
533
534 mddev->sb_dirty = 1;
535 md_wakeup_thread(conf->thread);
536
537 return 0;
538 }
539 }
540 MD_BUG();
541 return -EIO;
542}
543
544
545
546
547
548static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks,
549 unsigned int data_disks, unsigned int * dd_idx,
550 unsigned int * pd_idx, raid5_conf_t *conf)
551{
552 unsigned long stripe;
553 unsigned long chunk_number;
554 unsigned int chunk_offset;
555 unsigned long new_sector;
556 int sectors_per_chunk = conf->chunk_size >> 9;
557
558
559
560
561
562
563 chunk_number = r_sector / sectors_per_chunk;
564 chunk_offset = r_sector % sectors_per_chunk;
565
566
567
568
569 stripe = chunk_number / data_disks;
570
571
572
573
574 *dd_idx = chunk_number % data_disks;
575
576
577
578
579 if (conf->level == 4)
580 *pd_idx = data_disks;
581 else switch (conf->algorithm) {
582 case ALGORITHM_LEFT_ASYMMETRIC:
583 *pd_idx = data_disks - stripe % raid_disks;
584 if (*dd_idx >= *pd_idx)
585 (*dd_idx)++;
586 break;
587 case ALGORITHM_RIGHT_ASYMMETRIC:
588 *pd_idx = stripe % raid_disks;
589 if (*dd_idx >= *pd_idx)
590 (*dd_idx)++;
591 break;
592 case ALGORITHM_LEFT_SYMMETRIC:
593 *pd_idx = data_disks - stripe % raid_disks;
594 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
595 break;
596 case ALGORITHM_RIGHT_SYMMETRIC:
597 *pd_idx = stripe % raid_disks;
598 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
599 break;
600 default:
601 printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
602 }
603
604
605
606
607 new_sector = stripe * sectors_per_chunk + chunk_offset;
608 return new_sector;
609}
610
/*
 * Compiled out (#if 0): inverse of raid5_compute_sector().
 *
 * Given a stripe and a device index 'i', reconstructs the array sector
 * held by that device and returns it as a block number in units of
 * sh->size.  The result is cross-checked by mapping it forward again;
 * 0 is returned if the round trip does not match.
 */
#if 0
static unsigned long compute_blocknr(struct stripe_head *sh, int i)
{
	raid5_conf_t *conf = sh->raid_conf;
	int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
	unsigned long new_sector = sh->sector, check;
	int sectors_per_chunk = conf->chunk_size >> 9;
	unsigned long stripe = new_sector / sectors_per_chunk;
	int chunk_offset = new_sector % sectors_per_chunk;
	int chunk_number, dummy1, dummy2, dd_idx = i;
	unsigned long r_sector, blocknr;

	/* Undo the parity placement to get the data index within the stripe. */
	switch (conf->algorithm) {
		case ALGORITHM_LEFT_ASYMMETRIC:
		case ALGORITHM_RIGHT_ASYMMETRIC:
			if (i > sh->pd_idx)
				i--;
			break;
		case ALGORITHM_LEFT_SYMMETRIC:
		case ALGORITHM_RIGHT_SYMMETRIC:
			if (i < sh->pd_idx)
				i += raid_disks;
			i -= (sh->pd_idx + 1);
			break;
		default:
			printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
	}

	chunk_number = stripe * data_disks + i;
	r_sector = chunk_number * sectors_per_chunk + chunk_offset;
	blocknr = r_sector / (sh->size >> 9);

	/* Sanity check: the forward mapping must lead back to this stripe/device. */
	check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
	if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
		printk("compute_blocknr: map not correct\n");
		return 0;
	}
	return blocknr;
}
#endif
651
/*
 * Flush the pending XOR batch once it is full.
 *
 * Relies on two variables in the calling scope: 'bh_ptr', the batch array,
 * and 'count', the number of entries in use.  After the flush, count is
 * reset to 1 so that bh_ptr[0] (the destination buffer in every caller in
 * this file) stays in place for the next batch.
 * NOTE(review): xor_block() presumably XORs bh_ptr[1..count-1] into
 * bh_ptr[0]; its definition is outside this file - confirm.
 */
#define check_xor() 	do { 						\
			   if (count == MAX_XOR_BLOCKS) {		\
				xor_block(count, bh_ptr);		\
				count = 1;				\
			   }						\
			} while(0)
658
659
660static void compute_block(struct stripe_head *sh, int dd_idx)
661{
662 raid5_conf_t *conf = sh->raid_conf;
663 int i, count, disks = conf->raid_disks;
664 struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh;
665
666 PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx);
667
668
669 memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size);
670 bh_ptr[0] = sh->bh_cache[dd_idx];
671 count = 1;
672 for (i = disks ; i--; ) {
673 if (i == dd_idx)
674 continue;
675 bh = sh->bh_cache[i];
676 if (buffer_uptodate(bh))
677 bh_ptr[count++] = bh;
678 else
679 printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
680
681 check_xor();
682 }
683 if (count != 1)
684 xor_block(count, bh_ptr);
685 set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state);
686}
687
/*
 * Update or verify the parity buffer of a stripe.
 *
 * @sh:     the stripe
 * @method: READ_MODIFY_WRITE  - parity is up to date; the old contents of
 *                               the blocks being written are XORed in
 *                               first, then the new contents
 *          RECONSTRUCT_WRITE  - parity is zeroed and rebuilt from all data
 *                               blocks after the new data has been copied
 *          CHECK_PARITY       - all data blocks are XORed into the existing
 *                               parity; handle_stripe() then tests the
 *                               result for all-zero
 *
 * For the two write methods, the first pending write of each affected
 * device is moved from bh_write[] to bh_written[], its data is copied into
 * the cache buffer, and the cache buffer is locked and marked up to date.
 * On return the parity buffer is locked and up to date for writes, and
 * marked not up to date after CHECK_PARITY.
 */
static void compute_parity(struct stripe_head *sh, int method)
{
	raid5_conf_t *conf = sh->raid_conf;
	int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
	struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
	struct buffer_head *chosen[MD_SB_DISKS];

	PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method);
	memset(chosen, 0, sizeof(chosen));

	/* bh_ptr[0] is the XOR destination: the parity buffer. */
	count = 1;
	bh_ptr[0] = sh->bh_cache[pd_idx];
	switch(method) {
	case READ_MODIFY_WRITE:
		if (!buffer_uptodate(sh->bh_cache[pd_idx]))
			BUG();
		/* Queue the OLD data of every block about to be overwritten. */
		for (i=disks ; i-- ;) {
			if (i==pd_idx)
				continue;
			if (sh->bh_write[i] &&
			    buffer_uptodate(sh->bh_cache[i])) {
				bh_ptr[count++] = sh->bh_cache[i];
				/* Move the first pending write to the written list. */
				chosen[i] = sh->bh_write[i];
				sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
				chosen[i]->b_reqnext = sh->bh_written[i];
				sh->bh_written[i] = chosen[i];
				check_xor();
			}
		}
		break;
	case RECONSTRUCT_WRITE:
		/* Start from zero; every data block is XORed in further down. */
		memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size);
		for (i= disks; i-- ;)
			if (i!=pd_idx && sh->bh_write[i]) {
				chosen[i] = sh->bh_write[i];
				sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
				chosen[i]->b_reqnext = sh->bh_written[i];
				sh->bh_written[i] = chosen[i];
			}
		break;
	case CHECK_PARITY:
		break;
	}
	/* Apply the old-data batch before the cache buffers are overwritten. */
	if (count>1) {
		xor_block(count, bh_ptr);
		count = 1;
	}

	/* Copy the new data into the cache and mark those buffers for writing. */
	for (i = disks; i--;)
		if (chosen[i]) {
			struct buffer_head *bh = sh->bh_cache[i];
			char *bdata;
			bdata = bh_kmap(chosen[i]);
			memcpy(bh->b_data,
			       bdata,sh->size);
			bh_kunmap(chosen[i]);
			set_bit(BH_Lock, &bh->b_state);
			mark_buffer_uptodate(bh, 1);
		}

	/* Second pass: XOR in the current contents of the relevant data blocks. */
	switch(method) {
	case RECONSTRUCT_WRITE:
	case CHECK_PARITY:
		for (i=disks; i--;)
			if (i != pd_idx) {
				bh_ptr[count++] = sh->bh_cache[i];
				check_xor();
			}
		break;
	case READ_MODIFY_WRITE:
		for (i = disks; i--;)
			if (chosen[i]) {
				bh_ptr[count++] = sh->bh_cache[i];
				check_xor();
			}
	}
	if (count != 1)
		xor_block(count, bh_ptr);

	if (method != CHECK_PARITY) {
		mark_buffer_uptodate(sh->bh_cache[pd_idx], 1);
		set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state);
	} else
		/* The buffer now holds the XOR of parity and data, not parity. */
		mark_buffer_uptodate(sh->bh_cache[pd_idx], 0);
}
773
774static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
775{
776 struct buffer_head **bhp;
777 raid5_conf_t *conf = sh->raid_conf;
778
779 PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector);
780
781
782 spin_lock(&sh->lock);
783 spin_lock_irq(&conf->device_lock);
784 bh->b_reqnext = NULL;
785 if (rw == READ)
786 bhp = &sh->bh_read[dd_idx];
787 else
788 bhp = &sh->bh_write[dd_idx];
789 while (*bhp) {
790 printk(KERN_NOTICE "raid5: multiple %d requests for sector %ld\n", rw, sh->sector);
791 bhp = & (*bhp)->b_reqnext;
792 }
793 *bhp = bh;
794 spin_unlock_irq(&conf->device_lock);
795 spin_unlock(&sh->lock);
796
797 PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx);
798}
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
/*
 * Advance the state of one stripe as far as currently possible.
 *
 * Under sh->lock the function
 *   1. completes reads whose data is already in the cache,
 *   2. fails requests that cannot be served because more than one device
 *      is down,
 *   3. completes writes whose data and parity have reached the devices,
 *   4. schedules reads (or computes a missing block) for pending reads
 *      and for resync,
 *   5. picks read-modify-write or reconstruct-write for pending writes,
 *      schedules the pre-reads they need, and once everything is present
 *      computes parity and schedules the writes,
 *   6. for a stripe under resync, checks the parity and schedules a
 *      corrective write if required.
 * Decisions are recorded in action[] (READ+1 / WRITE+1 per device); the
 * I/O is submitted and finished requests are called back only after
 * sh->lock has been dropped.
 *
 * The caller must hold a reference on the stripe.
 */
static void handle_stripe(struct stripe_head *sh)
{
	raid5_conf_t *conf = sh->raid_conf;
	int disks = conf->raid_disks;
	struct buffer_head *return_ok= NULL, *return_fail = NULL;
	int action[MD_SB_DISKS];
	int i;
	int syncing;
	int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
	int failed_num=0;
	struct buffer_head *bh;

	PRINTK("handling stripe %ld, cnt=%d, pd_idx=%d\n", sh->sector, atomic_read(&sh->count), sh->pd_idx);
	memset(action, 0, sizeof(action));

	spin_lock(&sh->lock);
	clear_bit(STRIPE_HANDLE, &sh->state);
	clear_bit(STRIPE_DELAYED, &sh->state);

	syncing = test_bit(STRIPE_SYNCING, &sh->state);

	/* Pass 1: serve reads from the cache and take stock of every device. */
	for (i=disks; i--; ) {
		bh = sh->bh_cache[i];
		PRINTK("check %d: state 0x%lx read %p write %p written %p\n", i, bh->b_state, sh->bh_read[i], sh->bh_write[i], sh->bh_written[i]);
		/* Cached data is valid: copy it to every queued reader. */
		if (buffer_uptodate(bh) && sh->bh_read[i])  {
			struct buffer_head *rbh, *rbh2;
			PRINTK("Return read for disc %d\n", i);
			spin_lock_irq(&conf->device_lock);
			rbh = sh->bh_read[i];
			sh->bh_read[i] = NULL;
			spin_unlock_irq(&conf->device_lock);
			while (rbh) {
				char *bdata;
				bdata = bh_kmap(rbh);
				memcpy(bdata, bh->b_data, bh->b_size);
				bh_kunmap(rbh);
				rbh2 = rbh->b_reqnext;
				rbh->b_reqnext = return_ok;
				return_ok = rbh;
				rbh = rbh2;
			}
		}

		/* Buffer state counters. */
		if (buffer_locked(bh)) locked++;
		if (buffer_uptodate(bh)) uptodate++;

		/* Outstanding request counters (per device, not per request). */
		if (sh->bh_read[i]) to_read++;
		if (sh->bh_write[i]) to_write++;
		if (sh->bh_written[i]) written++;
		if (!conf->disks[i].operational) {
			failed++;
			failed_num = i;
		}
	}
	PRINTK("locked=%d uptodate=%d to_read=%d to_write=%d failed=%d failed_num=%d\n",
	       locked, uptodate, to_read, to_write, failed, failed_num);

	/*
	 * More than one device down: the stripe cannot be reconstructed.
	 * Fail all writes, and all reads addressed to a failed device.
	 */
	if (failed > 1 && to_read+to_write+written) {
		for (i=disks; i--; ) {
			/* Writes not yet started. */
			if (sh->bh_write[i]) to_write--;
			while ((bh = sh->bh_write[i])) {
				sh->bh_write[i] = bh->b_reqnext;
				bh->b_reqnext = return_fail;
				return_fail = bh;
			}
			/* Writes already folded into the cache. */
			if (sh->bh_written[i]) written--;
			while ((bh = sh->bh_written[i])) {
				sh->bh_written[i] = bh->b_reqnext;
				bh->b_reqnext = return_fail;
				return_fail = bh;
			}

			/* Reads can only be failed on devices that are down. */
			if (!conf->disks[i].operational) {
				spin_lock_irq(&conf->device_lock);
				if (sh->bh_read[i]) to_read--;
				while ((bh = sh->bh_read[i])) {
					sh->bh_read[i] = bh->b_reqnext;
					bh->b_reqnext = return_fail;
					return_fail = bh;
				}
				spin_unlock_irq(&conf->device_lock);
			}
		}
	}
	/* Resync cannot proceed either: report the rest as done-with-error. */
	if (failed > 1 && syncing) {
		md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,0);
		clear_bit(STRIPE_SYNCING, &sh->state);
		syncing = 0;
	}

	/*
	 * Writes may be completed once the parity block is safely written
	 * (unlocked and up to date), or when the parity device is the one
	 * failed device.
	 */
	bh = sh->bh_cache[sh->pd_idx];
	if ( written &&
	     ( (conf->disks[sh->pd_idx].operational && !buffer_locked(bh) && buffer_uptodate(bh))
	       || (failed == 1 && failed_num == sh->pd_idx))
	    ) {
	    /* Return every write whose own data block has been written too. */
	    for (i=disks; i--; )
		if (sh->bh_written[i]) {
		    bh = sh->bh_cache[i];
		    if (!conf->disks[sh->pd_idx].operational ||
			(!buffer_locked(bh) && buffer_uptodate(bh)) ) {
			/* Move the whole written list onto return_ok. */
			struct buffer_head *wbh, *wbh2;
			PRINTK("Return write for disc %d\n", i);
			wbh = sh->bh_written[i];
			sh->bh_written[i] = NULL;
			while (wbh) {
			    wbh2 = wbh->b_reqnext;
			    wbh->b_reqnext = return_ok;
			    return_ok = wbh;
			    wbh = wbh2;
			}
		    }
		}
	}

	/*
	 * Reads: needed for pending read requests, and for resync until the
	 * whole stripe is in the cache.
	 */
	if (to_read || (syncing && (uptodate < disks))) {
		for (i=disks; i--;) {
			bh = sh->bh_cache[i];
			if (!buffer_locked(bh) && !buffer_uptodate(bh) &&
			    (sh->bh_read[i] || syncing || (failed && sh->bh_read[failed_num]))) {
				/*
				 * This block is wanted (directly, for resync,
				 * or to reconstruct the failed device's data).
				 */
				if (uptodate == disks-1) {
					/* Everything else is cached: compute it. */
					PRINTK("Computing block %d\n", i);
					compute_block(sh, i);
					uptodate++;
				} else if (conf->disks[i].operational) {
					set_bit(BH_Lock, &bh->b_state);
					action[i] = READ+1;
					/*
					 * A single plain read (no resync, no
					 * failure, no writes) is read straight
					 * into the requester's page; the cache
					 * page is parked in bh_page[] and put
					 * back by raid5_end_read_request().
					 */
					if (sh->bh_page[i]) BUG();
					if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
					    ! syncing && !failed && !to_write) {
						sh->bh_page[i] = sh->bh_cache[i]->b_page;
						sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page;
						sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data;
					}
					locked++;
					PRINTK("Reading block %d (sync=%d)\n", i, syncing);
					if (syncing)
						md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
				}
			}
		}
		set_bit(STRIPE_HANDLE, &sh->state);
	}

	/* Writes: decide between read-modify-write and reconstruct-write. */
	if (to_write) {
		int rmw=0, rcw=0;
		for (i=disks ; i--;) {
			/* rmw cost: blocks being written, plus parity, that are not cached. */
			bh = sh->bh_cache[i];
			if ((sh->bh_write[i] || i == sh->pd_idx) &&
			    (!buffer_locked(bh) || sh->bh_page[i]) &&
			    !buffer_uptodate(bh)) {
				if (conf->disks[i].operational
/*				    && !(conf->spare && i == conf->spare->raid_disk) */
					)
					rmw++;
				else rmw += 2*disks;  /* cannot read it */
			}
			/* rcw cost: blocks NOT being written (parity excluded) that are not cached. */
			if (!sh->bh_write[i] && i != sh->pd_idx &&
			    (!buffer_locked(bh) || sh->bh_page[i]) &&
			    !buffer_uptodate(bh)) {
				if (conf->disks[i].operational) rcw++;
				else rcw += 2*disks;
			}
		}
		PRINTK("for sector %ld, rmw=%d rcw=%d\n", sh->sector, rmw, rcw);
		set_bit(STRIPE_HANDLE, &sh->state);
		if (rmw < rcw && rmw > 0)
			/* Read-modify-write is cheaper: pre-read the old data and parity. */
			for (i=disks; i--;) {
				bh = sh->bh_cache[i];
				if ((sh->bh_write[i] || i == sh->pd_idx) &&
				    !buffer_locked(bh) && !buffer_uptodate(bh) &&
				    conf->disks[i].operational) {
					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
					{
						PRINTK("Read_old block %d for r-m-w\n", i);
						set_bit(BH_Lock, &bh->b_state);
						action[i] = READ+1;
						locked++;
					} else {
						/* Not allowed to pre-read yet: park the stripe. */
						set_bit(STRIPE_DELAYED, &sh->state);
						set_bit(STRIPE_HANDLE, &sh->state);
					}
				}
			}
		if (rcw <= rmw && rcw > 0)
			/* Reconstruct-write is cheaper: pre-read the untouched data blocks. */
			for (i=disks; i--;) {
				bh = sh->bh_cache[i];
				if (!sh->bh_write[i]  && i != sh->pd_idx &&
				    !buffer_locked(bh) && !buffer_uptodate(bh) &&
				    conf->disks[i].operational) {
					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
					{
						PRINTK("Read_old block %d for Reconstruct\n", i);
						set_bit(BH_Lock, &bh->b_state);
						action[i] = READ+1;
						locked++;
					} else {
						set_bit(STRIPE_DELAYED, &sh->state);
						set_bit(STRIPE_HANDLE, &sh->state);
					}
				}
			}
		/* Nothing in flight and one method needs no further reads: write now. */
		if (locked == 0 && (rcw == 0 ||rmw == 0)) {
			PRINTK("Computing parity...\n");
			compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
			/* compute_parity() locked every buffer that has to go to disk. */
			for (i=disks; i--;)
				if (buffer_locked(sh->bh_cache[i])) {
					PRINTK("Writing block %d\n", i);
					locked++;
					action[i] = WRITE+1;
					if (!conf->disks[i].operational
					    || (i==sh->pd_idx && failed == 0))
						set_bit(STRIPE_INSYNC, &sh->state);
				}
			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
				atomic_dec(&conf->preread_active_stripes);
				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
					md_wakeup_thread(conf->thread);
			}
		}
	}

	/*
	 * Resync: with the stripe quiescent and not yet known to be in sync,
	 * verify the parity (no failed device) or rebuild the missing block
	 * (one failed device) and schedule the corrective write.
	 */
	if (syncing && locked == 0 &&
	    !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
		set_bit(STRIPE_HANDLE, &sh->state);
		if (failed == 0) {
			if (uptodate != disks)
				BUG();
			compute_parity(sh, CHECK_PARITY);
			uptodate--;
			bh = sh->bh_cache[sh->pd_idx];
			/* All-zero test: first word is 0 and each byte equals the one 4 bytes later. */
			if ((*(u32*)bh->b_data) == 0 &&
			    !memcmp(bh->b_data, bh->b_data+4, bh->b_size-4)) {
				/* Parity matched the data. */
				set_bit(STRIPE_INSYNC, &sh->state);
			}
		}
		if (!test_bit(STRIPE_INSYNC, &sh->state)) {
			struct disk_info *spare;
			if (failed==0)
				failed_num = sh->pd_idx;
			/* Recompute the block to be rewritten unless it is already valid. */
			if (!buffer_uptodate(sh->bh_cache[failed_num])) {
				if (uptodate+1 != disks)
					BUG();
				compute_block(sh, failed_num);
				uptodate++;
			}
			if (uptodate != disks)
				BUG();
			bh = sh->bh_cache[failed_num];
			set_bit(BH_Lock, &bh->b_state);
			action[failed_num] = WRITE+1;
			locked++;
			set_bit(STRIPE_INSYNC, &sh->state);
			if (conf->disks[failed_num].operational)
				md_sync_acct(conf->disks[failed_num].dev, bh->b_size>>9);
			else if ((spare=conf->spare))
				md_sync_acct(spare->dev, bh->b_size>>9);

		}
	}
	/* Resync of this stripe is finished once nothing is in flight. */
	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
		md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,1);
		clear_bit(STRIPE_SYNCING, &sh->state);
	}


	spin_unlock(&sh->lock);

	/* Completion callbacks and I/O submission happen without sh->lock. */
	while ((bh=return_ok)) {
		return_ok = bh->b_reqnext;
		bh->b_reqnext = NULL;
		bh->b_end_io(bh, 1);
	}
	while ((bh=return_fail)) {
		return_fail = bh->b_reqnext;
		bh->b_reqnext = NULL;
		bh->b_end_io(bh, 0);
	}
	for (i=disks; i-- ;)
		if (action[i]) {
			struct buffer_head *bh = sh->bh_cache[i];
			struct disk_info *spare = conf->spare;
			int skip = 0;
			if (action[i] == READ+1)
				bh->b_end_io = raid5_end_read_request;
			else
				bh->b_end_io = raid5_end_write_request;
			/* Writes to a failed device are redirected to the spare, if any. */
			if (conf->disks[i].operational)
				bh->b_dev = conf->disks[i].dev;
			else if (spare && action[i] == WRITE+1)
				bh->b_dev = spare->dev;
			else skip=1;
			if (!skip) {
				PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i);
				/* Reference dropped by the end_io handler. */
				atomic_inc(&sh->count);
				bh->b_rdev = bh->b_dev;
				bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
				generic_make_request(action[i]-1, bh);
			} else {
				PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector);
				clear_bit(BH_Lock, &bh->b_state);
				set_bit(STRIPE_HANDLE, &sh->state);
			}
		}
}
1161
1162static inline void raid5_activate_delayed(raid5_conf_t *conf)
1163{
1164 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
1165 while (!list_empty(&conf->delayed_list)) {
1166 struct list_head *l = conf->delayed_list.next;
1167 struct stripe_head *sh;
1168 sh = list_entry(l, struct stripe_head, lru);
1169 list_del_init(l);
1170 clear_bit(STRIPE_DELAYED, &sh->state);
1171 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1172 atomic_inc(&conf->preread_active_stripes);
1173 list_add_tail(&sh->lru, &conf->handle_list);
1174 }
1175 }
1176}
1177static void raid5_unplug_device(void *data)
1178{
1179 raid5_conf_t *conf = (raid5_conf_t *)data;
1180 unsigned long flags;
1181
1182 spin_lock_irqsave(&conf->device_lock, flags);
1183
1184 raid5_activate_delayed(conf);
1185
1186 conf->plugged = 0;
1187 md_wakeup_thread(conf->thread);
1188
1189 spin_unlock_irqrestore(&conf->device_lock, flags);
1190}
1191
1192static inline void raid5_plug_device(raid5_conf_t *conf)
1193{
1194 spin_lock_irq(&conf->device_lock);
1195 if (list_empty(&conf->delayed_list))
1196 if (!conf->plugged) {
1197 conf->plugged = 1;
1198 queue_task(&conf->plug_tq, &tq_disk);
1199 }
1200 spin_unlock_irq(&conf->device_lock);
1201}
1202
1203static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
1204{
1205 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1206 const unsigned int raid_disks = conf->raid_disks;
1207 const unsigned int data_disks = raid_disks - 1;
1208 unsigned int dd_idx, pd_idx;
1209 unsigned long new_sector;
1210 int read_ahead = 0;
1211
1212 struct stripe_head *sh;
1213
1214 if (rw == READA) {
1215 rw = READ;
1216 read_ahead=1;
1217 }
1218
1219 new_sector = raid5_compute_sector(bh->b_rsector,
1220 raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1221
1222 PRINTK("raid5_make_request, sector %lu\n", new_sector);
1223 sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead);
1224 if (sh) {
1225 sh->pd_idx = pd_idx;
1226
1227 add_stripe_bh(sh, bh, dd_idx, rw);
1228
1229 raid5_plug_device(conf);
1230 handle_stripe(sh);
1231 release_stripe(sh);
1232 } else
1233 bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
1234 return 0;
1235}
1236
1237
1238
1239
1240unsigned int device_bsize (kdev_t dev)
1241{
1242 unsigned int i, correct_size;
1243
1244 correct_size = BLOCK_SIZE;
1245 if (blksize_size[MAJOR(dev)]) {
1246 i = blksize_size[MAJOR(dev)][MINOR(dev)];
1247 if (i)
1248 correct_size = i;
1249 }
1250
1251 return correct_size;
1252}
1253
1254static int raid5_sync_request (mddev_t *mddev, unsigned long sector_nr)
1255{
1256 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1257 struct stripe_head *sh;
1258 int sectors_per_chunk = conf->chunk_size >> 9;
1259 unsigned long stripe = sector_nr/sectors_per_chunk;
1260 int chunk_offset = sector_nr % sectors_per_chunk;
1261 int dd_idx, pd_idx;
1262 unsigned long first_sector;
1263 int raid_disks = conf->raid_disks;
1264 int data_disks = raid_disks-1;
1265 int redone = 0;
1266 int bufsize;
1267
1268 sh = get_active_stripe(conf, sector_nr, 0, 0);
1269 bufsize = sh->size;
1270 redone = sector_nr - sh->sector;
1271 first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
1272 + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1273 sh->pd_idx = pd_idx;
1274 spin_lock(&sh->lock);
1275 set_bit(STRIPE_SYNCING, &sh->state);
1276 clear_bit(STRIPE_INSYNC, &sh->state);
1277 sh->sync_redone = redone;
1278 spin_unlock(&sh->lock);
1279
1280 handle_stripe(sh);
1281 release_stripe(sh);
1282
1283 return (bufsize>>9)-redone;
1284}
1285
1286
1287
1288
1289
1290
1291
1292
1293static void raid5d (void *data)
1294{
1295 struct stripe_head *sh;
1296 raid5_conf_t *conf = data;
1297 mddev_t *mddev = conf->mddev;
1298 int handled;
1299
1300 PRINTK("+++ raid5d active\n");
1301
1302 handled = 0;
1303
1304 if (mddev->sb_dirty)
1305 md_update_sb(mddev);
1306 md_spin_lock_irq(&conf->device_lock);
1307 while (1) {
1308 struct list_head *first;
1309
1310 if (list_empty(&conf->handle_list) &&
1311 atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
1312 !conf->plugged &&
1313 !list_empty(&conf->delayed_list))
1314 raid5_activate_delayed(conf);
1315
1316 if (list_empty(&conf->handle_list))
1317 break;
1318
1319 first = conf->handle_list.next;
1320 sh = list_entry(first, struct stripe_head, lru);
1321
1322 list_del_init(first);
1323 atomic_inc(&sh->count);
1324 if (atomic_read(&sh->count)!= 1)
1325 BUG();
1326 md_spin_unlock_irq(&conf->device_lock);
1327
1328 handled++;
1329 handle_stripe(sh);
1330 release_stripe(sh);
1331
1332 md_spin_lock_irq(&conf->device_lock);
1333 }
1334 PRINTK("%d stripes handled\n", handled);
1335
1336 md_spin_unlock_irq(&conf->device_lock);
1337
1338 PRINTK("--- raid5d inactive\n");
1339}
1340
1341
1342
1343
1344
1345
1346static void raid5syncd (void *data)
1347{
1348 raid5_conf_t *conf = data;
1349 mddev_t *mddev = conf->mddev;
1350
1351 if (!conf->resync_parity)
1352 return;
1353 if (conf->resync_parity == 2)
1354 return;
1355 down(&mddev->recovery_sem);
1356 if (md_do_sync(mddev,NULL)) {
1357 up(&mddev->recovery_sem);
1358 printk("raid5: resync aborted!\n");
1359 return;
1360 }
1361 conf->resync_parity = 0;
1362 up(&mddev->recovery_sem);
1363 printk("raid5: resync finished.\n");
1364}
1365
1366static int raid5_run (mddev_t *mddev)
1367{
1368 raid5_conf_t *conf;
1369 int i, j, raid_disk, memory;
1370 mdp_super_t *sb = mddev->sb;
1371 mdp_disk_t *desc;
1372 mdk_rdev_t *rdev;
1373 struct disk_info *disk;
1374 struct md_list_head *tmp;
1375 int start_recovery = 0;
1376
1377 MOD_INC_USE_COUNT;
1378
1379 if (sb->level != 5 && sb->level != 4) {
1380 printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
1381 MOD_DEC_USE_COUNT;
1382 return -EIO;
1383 }
1384
1385 mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
1386 if ((conf = mddev->private) == NULL)
1387 goto abort;
1388 memset (conf, 0, sizeof (*conf));
1389 conf->mddev = mddev;
1390
1391 if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
1392 goto abort;
1393 memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
1394
1395 conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1396 md_init_waitqueue_head(&conf->wait_for_stripe);
1397 INIT_LIST_HEAD(&conf->handle_list);
1398 INIT_LIST_HEAD(&conf->delayed_list);
1399 INIT_LIST_HEAD(&conf->inactive_list);
1400 atomic_set(&conf->active_stripes, 0);
1401 atomic_set(&conf->preread_active_stripes, 0);
1402 conf->buffer_size = PAGE_SIZE;
1403
1404 conf->plugged = 0;
1405 conf->plug_tq.sync = 0;
1406 conf->plug_tq.routine = &raid5_unplug_device;
1407 conf->plug_tq.data = conf;
1408
1409 PRINTK("raid5_run(md%d) called.\n", mdidx(mddev));
1410
1411 ITERATE_RDEV(mddev,rdev,tmp) {
1412
1413
1414
1415
1416
1417 desc = sb->disks + rdev->desc_nr;
1418 raid_disk = desc->raid_disk;
1419 disk = conf->disks + raid_disk;
1420
1421 if (disk_faulty(desc)) {
1422 printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
1423 if (!rdev->faulty) {
1424 MD_BUG();
1425 goto abort;
1426 }
1427 disk->number = desc->number;
1428 disk->raid_disk = raid_disk;
1429 disk->dev = rdev->dev;
1430
1431 disk->operational = 0;
1432 disk->write_only = 0;
1433 disk->spare = 0;
1434 disk->used_slot = 1;
1435 continue;
1436 }
1437 if (disk_active(desc)) {
1438 if (!disk_sync(desc)) {
1439 printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
1440 MD_BUG();
1441 goto abort;
1442 }
1443 if (raid_disk > sb->raid_disks) {
1444 printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
1445 continue;
1446 }
1447 if (disk->operational) {
1448 printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
1449 continue;
1450 }
1451 printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);
1452
1453 disk->number = desc->number;
1454 disk->raid_disk = raid_disk;
1455 disk->dev = rdev->dev;
1456 disk->operational = 1;
1457 disk->used_slot = 1;
1458
1459 conf->working_disks++;
1460 } else {
1461
1462
1463
1464 printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
1465 disk->number = desc->number;
1466 disk->raid_disk = raid_disk;
1467 disk->dev = rdev->dev;
1468
1469 disk->operational = 0;
1470 disk->write_only = 0;
1471 disk->spare = 1;
1472 disk->used_slot = 1;
1473 }
1474 }
1475
1476 for (i = 0; i < MD_SB_DISKS; i++) {
1477 desc = sb->disks + i;
1478 raid_disk = desc->raid_disk;
1479 disk = conf->disks + raid_disk;
1480
1481 if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
1482 !conf->disks[raid_disk].used_slot) {
1483
1484 disk->number = desc->number;
1485 disk->raid_disk = raid_disk;
1486 disk->dev = MKDEV(0,0);
1487
1488 disk->operational = 0;
1489 disk->write_only = 0;
1490 disk->spare = 0;
1491 disk->used_slot = 1;
1492 }
1493 }
1494
1495 conf->raid_disks = sb->raid_disks;
1496
1497
1498
1499 conf->failed_disks = conf->raid_disks - conf->working_disks;
1500 conf->mddev = mddev;
1501 conf->chunk_size = sb->chunk_size;
1502 conf->level = sb->level;
1503 conf->algorithm = sb->layout;
1504 conf->max_nr_stripes = NR_STRIPES;
1505
1506#if 0
1507 for (i = 0; i < conf->raid_disks; i++) {
1508 if (!conf->disks[i].used_slot) {
1509 MD_BUG();
1510 goto abort;
1511 }
1512 }
1513#endif
1514 if (!conf->chunk_size || conf->chunk_size % 4) {
1515 printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
1516 goto abort;
1517 }
1518 if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
1519 printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
1520 goto abort;
1521 }
1522 if (conf->failed_disks > 1) {
1523 printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
1524 goto abort;
1525 }
1526
1527 if (conf->working_disks != sb->raid_disks) {
1528 printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1529 start_recovery = 1;
1530 }
1531
1532 {
1533 const char * name = "raid5d";
1534
1535 conf->thread = md_register_thread(raid5d, conf, name);
1536 if (!conf->thread) {
1537 printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
1538 goto abort;
1539 }
1540 }
1541
1542 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
1543 conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
1544 if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
1545 printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
1546 shrink_stripes(conf, conf->max_nr_stripes);
1547 goto abort;
1548 } else
1549 printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
1550
1551
1552
1553
1554
1555 for (i = 0; i < MD_SB_DISKS ; i++) {
1556 mark_disk_nonsync(sb->disks + i);
1557 for (j = 0; j < sb->raid_disks; j++) {
1558 if (!conf->disks[j].operational)
1559 continue;
1560 if (sb->disks[i].number == conf->disks[j].number)
1561 mark_disk_sync(sb->disks + i);
1562 }
1563 }
1564 sb->active_disks = conf->working_disks;
1565
1566 if (sb->active_disks == sb->raid_disks)
1567 printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
1568 else
1569 printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
1570
1571 if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
1572 const char * name = "raid5syncd";
1573
1574 conf->resync_thread = md_register_thread(raid5syncd, conf,name);
1575 if (!conf->resync_thread) {
1576 printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
1577 goto abort;
1578 }
1579
1580 printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
1581 conf->resync_parity = 1;
1582 md_wakeup_thread(conf->resync_thread);
1583 }
1584
1585 print_raid5_conf(conf);
1586 if (start_recovery)
1587 md_recover_arrays();
1588 print_raid5_conf(conf);
1589
1590
1591 return (0);
1592abort:
1593 if (conf) {
1594 print_raid5_conf(conf);
1595 if (conf->stripe_hashtbl)
1596 free_pages((unsigned long) conf->stripe_hashtbl,
1597 HASH_PAGES_ORDER);
1598 kfree(conf);
1599 }
1600 mddev->private = NULL;
1601 printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
1602 MOD_DEC_USE_COUNT;
1603 return -EIO;
1604}
1605
1606static int raid5_stop_resync (mddev_t *mddev)
1607{
1608 raid5_conf_t *conf = mddev_to_conf(mddev);
1609 mdk_thread_t *thread = conf->resync_thread;
1610
1611 if (thread) {
1612 if (conf->resync_parity) {
1613 conf->resync_parity = 2;
1614 md_interrupt_thread(thread);
1615 printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
1616 return 1;
1617 }
1618 return 0;
1619 }
1620 return 0;
1621}
1622
1623static int raid5_restart_resync (mddev_t *mddev)
1624{
1625 raid5_conf_t *conf = mddev_to_conf(mddev);
1626
1627 if (conf->resync_parity) {
1628 if (!conf->resync_thread) {
1629 MD_BUG();
1630 return 0;
1631 }
1632 printk("raid5: waking up raid5resync.\n");
1633 conf->resync_parity = 1;
1634 md_wakeup_thread(conf->resync_thread);
1635 return 1;
1636 } else
1637 printk("raid5: no restart-resync needed.\n");
1638 return 0;
1639}
1640
1641
1642static int raid5_stop (mddev_t *mddev)
1643{
1644 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1645
1646 if (conf->resync_thread)
1647 md_unregister_thread(conf->resync_thread);
1648 md_unregister_thread(conf->thread);
1649 shrink_stripes(conf, conf->max_nr_stripes);
1650 free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
1651 kfree(conf);
1652 mddev->private = NULL;
1653 MOD_DEC_USE_COUNT;
1654 return 0;
1655}
1656
1657#if RAID5_DEBUG
1658static void print_sh (struct stripe_head *sh)
1659{
1660 int i;
1661
1662 printk("sh %lu, size %d, pd_idx %d, state %ld.\n", sh->sector, sh->size, sh->pd_idx, sh->state);
1663 printk("sh %lu, count %d.\n", sh->sector, atomic_read(&sh->count));
1664 printk("sh %lu, ", sh->sector);
1665 for (i = 0; i < MD_SB_DISKS; i++) {
1666 if (sh->bh_cache[i])
1667 printk("(cache%d: %p %ld) ", i, sh->bh_cache[i], sh->bh_cache[i]->b_state);
1668 }
1669 printk("\n");
1670}
1671
1672static void printall (raid5_conf_t *conf)
1673{
1674 struct stripe_head *sh;
1675 int i;
1676
1677 md_spin_lock_irq(&conf->device_lock);
1678 for (i = 0; i < NR_HASH; i++) {
1679 sh = conf->stripe_hashtbl[i];
1680 for (; sh; sh = sh->hash_next) {
1681 if (sh->raid_conf != conf)
1682 continue;
1683 print_sh(sh);
1684 }
1685 }
1686 md_spin_unlock_irq(&conf->device_lock);
1687
1688 PRINTK("--- raid5d inactive\n");
1689}
1690#endif
1691
1692static void raid5_status (struct seq_file *seq, mddev_t *mddev)
1693{
1694 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1695 mdp_super_t *sb = mddev->sb;
1696 int i;
1697
1698 seq_printf (seq, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
1699 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
1700 for (i = 0; i < conf->raid_disks; i++)
1701 seq_printf (seq, "%s", conf->disks[i].operational ? "U" : "_");
1702 seq_printf (seq, "]");
1703#if RAID5_DEBUG
1704#define D(x) \
1705 seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x))
1706 printall(conf);
1707#endif
1708
1709}
1710
1711static void print_raid5_conf (raid5_conf_t *conf)
1712{
1713 int i;
1714 struct disk_info *tmp;
1715
1716 printk("RAID5 conf printout:\n");
1717 if (!conf) {
1718 printk("(conf==NULL)\n");
1719 return;
1720 }
1721 printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
1722 conf->working_disks, conf->failed_disks);
1723
1724#if RAID5_DEBUG
1725 for (i = 0; i < MD_SB_DISKS; i++) {
1726#else
1727 for (i = 0; i < conf->working_disks+conf->failed_disks; i++) {
1728#endif
1729 tmp = conf->disks + i;
1730 printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
1731 i, tmp->spare,tmp->operational,
1732 tmp->number,tmp->raid_disk,tmp->used_slot,
1733 partition_name(tmp->dev));
1734 }
1735}
1736
1737static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
1738{
1739 int err = 0;
1740 int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
1741 raid5_conf_t *conf = mddev->private;
1742 struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
1743 mdp_super_t *sb = mddev->sb;
1744 mdp_disk_t *failed_desc, *spare_desc, *added_desc;
1745 mdk_rdev_t *spare_rdev, *failed_rdev;
1746
1747 print_raid5_conf(conf);
1748 md_spin_lock_irq(&conf->device_lock);
1749
1750
1751
1752 switch (state) {
1753
1754 case DISKOP_SPARE_ACTIVE:
1755
1756
1757
1758
1759
1760 for (i = 0; i < conf->raid_disks; i++) {
1761 tmp = conf->disks + i;
1762 if ((!tmp->operational && !tmp->spare) ||
1763 !tmp->used_slot) {
1764 failed_disk = i;
1765 break;
1766 }
1767 }
1768
1769
1770
1771
1772 if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
1773 MD_BUG();
1774 err = 1;
1775 goto abort;
1776 }
1777
1778
1779 case DISKOP_SPARE_WRITE:
1780 case DISKOP_SPARE_INACTIVE:
1781
1782
1783
1784
1785
1786 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
1787 tmp = conf->disks + i;
1788 if (tmp->spare && tmp->number == (*d)->number) {
1789 spare_disk = i;
1790 break;
1791 }
1792 }
1793 if (spare_disk == -1) {
1794 MD_BUG();
1795 err = 1;
1796 goto abort;
1797 }
1798 break;
1799
1800 case DISKOP_HOT_REMOVE_DISK:
1801
1802 for (i = 0; i < MD_SB_DISKS; i++) {
1803 tmp = conf->disks + i;
1804 if (tmp->used_slot && (tmp->number == (*d)->number)) {
1805 if (tmp->operational) {
1806 err = -EBUSY;
1807 goto abort;
1808 }
1809 removed_disk = i;
1810 break;
1811 }
1812 }
1813 if (removed_disk == -1) {
1814 MD_BUG();
1815 err = 1;
1816 goto abort;
1817 }
1818 break;
1819
1820 case DISKOP_HOT_ADD_DISK:
1821
1822 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
1823 tmp = conf->disks + i;
1824 if (!tmp->used_slot) {
1825 added_disk = i;
1826 break;
1827 }
1828 }
1829 if (added_disk == -1) {
1830 MD_BUG();
1831 err = 1;
1832 goto abort;
1833 }
1834 break;
1835 }
1836
1837 switch (state) {
1838
1839
1840
1841 case DISKOP_SPARE_WRITE:
1842 if (conf->spare) {
1843 MD_BUG();
1844 err = 1;
1845 goto abort;
1846 }
1847 sdisk = conf->disks + spare_disk;
1848 sdisk->operational = 1;
1849 sdisk->write_only = 1;
1850 conf->spare = sdisk;
1851 break;
1852
1853
1854
1855 case DISKOP_SPARE_INACTIVE:
1856 sdisk = conf->disks + spare_disk;
1857 sdisk->operational = 0;
1858 sdisk->write_only = 0;
1859
1860
1861
1862 if (conf->spare == sdisk)
1863 conf->spare = NULL;
1864 break;
1865
1866
1867
1868
1869
1870
1871
1872 case DISKOP_SPARE_ACTIVE:
1873 if (!conf->spare) {
1874 MD_BUG();
1875 err = 1;
1876 goto abort;
1877 }
1878 sdisk = conf->disks + spare_disk;
1879 fdisk = conf->disks + failed_disk;
1880
1881 spare_desc = &sb->disks[sdisk->number];
1882 failed_desc = &sb->disks[fdisk->number];
1883
1884 if (spare_desc != *d) {
1885 MD_BUG();
1886 err = 1;
1887 goto abort;
1888 }
1889
1890 if (spare_desc->raid_disk != sdisk->raid_disk) {
1891 MD_BUG();
1892 err = 1;
1893 goto abort;
1894 }
1895
1896 if (sdisk->raid_disk != spare_disk) {
1897 MD_BUG();
1898 err = 1;
1899 goto abort;
1900 }
1901
1902 if (failed_desc->raid_disk != fdisk->raid_disk) {
1903 MD_BUG();
1904 err = 1;
1905 goto abort;
1906 }
1907
1908 if (fdisk->raid_disk != failed_disk) {
1909 MD_BUG();
1910 err = 1;
1911 goto abort;
1912 }
1913
1914
1915
1916
1917 spare_rdev = find_rdev_nr(mddev, spare_desc->number);
1918 failed_rdev = find_rdev_nr(mddev, failed_desc->number);
1919
1920
1921
1922
1923 spare_rdev->desc_nr = failed_desc->number;
1924 if (failed_rdev)
1925 failed_rdev->desc_nr = spare_desc->number;
1926
1927 xchg_values(*spare_desc, *failed_desc);
1928 xchg_values(*fdisk, *sdisk);
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938 xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1939 xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1940 xchg_values(spare_desc->number, failed_desc->number);
1941 xchg_values(sdisk->number, fdisk->number);
1942
1943 *d = failed_desc;
1944
1945 if (sdisk->dev == MKDEV(0,0))
1946 sdisk->used_slot = 0;
1947
1948
1949
1950
1951 fdisk->spare = 0;
1952 fdisk->write_only = 0;
1953
1954
1955
1956
1957
1958
1959 conf->failed_disks--;
1960 conf->working_disks++;
1961 conf->spare = NULL;
1962
1963 break;
1964
1965 case DISKOP_HOT_REMOVE_DISK:
1966 rdisk = conf->disks + removed_disk;
1967
1968 if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1969 MD_BUG();
1970 err = 1;
1971 goto abort;
1972 }
1973 rdisk->dev = MKDEV(0,0);
1974 rdisk->used_slot = 0;
1975
1976 break;
1977
1978 case DISKOP_HOT_ADD_DISK:
1979 adisk = conf->disks + added_disk;
1980 added_desc = *d;
1981
1982 if (added_disk != added_desc->number) {
1983 MD_BUG();
1984 err = 1;
1985 goto abort;
1986 }
1987
1988 adisk->number = added_desc->number;
1989 adisk->raid_disk = added_desc->raid_disk;
1990 adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1991
1992 adisk->operational = 0;
1993 adisk->write_only = 0;
1994 adisk->spare = 1;
1995 adisk->used_slot = 1;
1996
1997
1998 break;
1999
2000 default:
2001 MD_BUG();
2002 err = 1;
2003 goto abort;
2004 }
2005abort:
2006 md_spin_unlock_irq(&conf->device_lock);
2007 print_raid5_conf(conf);
2008 return err;
2009}
2010
2011static mdk_personality_t raid5_personality=
2012{
2013 name: "raid5",
2014 make_request: raid5_make_request,
2015 run: raid5_run,
2016 stop: raid5_stop,
2017 status: raid5_status,
2018 error_handler: raid5_error,
2019 diskop: raid5_diskop,
2020 stop_resync: raid5_stop_resync,
2021 restart_resync: raid5_restart_resync,
2022 sync_request: raid5_sync_request
2023};
2024
2025static int md__init raid5_init (void)
2026{
2027 return register_md_personality (RAID5, &raid5_personality);
2028}
2029
2030static void raid5_exit (void)
2031{
2032 unregister_md_personality (RAID5);
2033}
2034
/* Module glue: name the init/exit functions and declare the license. */
module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
2038