1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35#include <linux/module.h>
36#include <linux/kthread.h>
37#include <linux/linkage.h>
38#include <linux/raid/md.h>
39#include <linux/raid/bitmap.h>
40#include <linux/sysctl.h>
41#include <linux/buffer_head.h>
42#include <linux/suspend.h>
43#include <linux/poll.h>
44#include <linux/mutex.h>
45#include <linux/ctype.h>
46
47#include <linux/init.h>
48
49#include <linux/file.h>
50
51#ifdef CONFIG_KMOD
52#include <linux/kmod.h>
53#endif
54
55#include <asm/unaligned.h>
56
/* NOTE(review): presumably consumed by block-layer headers/macros -- confirm. */
#define MAJOR_NR MD_MAJOR
#define MD_DRIVER


/*
 * For units that are not on MD_MAJOR, mddev_find() derives md_minor by
 * shifting the device minor right by this many bits.
 */
#define MdpMinorShift 6

/* dprintk() only prints when DEBUG is non-zero. */
#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))


/* Only declared when md is built in (not as a module). */
#ifndef MODULE
static void autostart_arrays (int part);
#endif

/* List of registered personalities; searched by find_pers(). */
static LIST_HEAD(pers_list);
/* NOTE(review): presumably protects pers_list -- no user is visible here. */
static DEFINE_SPINLOCK(pers_lock);

static void md_print_devices(void);

/*
 * Report an internal inconsistency: logs the source file and line, then
 * calls md_print_devices().  Any arguments are ignored.
 *
 * Note that this expands to a bare { } block rather than do { } while (0),
 * so "if (x) MD_BUG(); else ..." does not compile.
 */
#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91static int sysctl_speed_limit_min = 1000;
92static int sysctl_speed_limit_max = 200000;
93static inline int speed_min(mddev_t *mddev)
94{
95 return mddev->sync_speed_min ?
96 mddev->sync_speed_min : sysctl_speed_limit_min;
97}
98
99static inline int speed_max(mddev_t *mddev)
100{
101 return mddev->sync_speed_max ?
102 mddev->sync_speed_max : sysctl_speed_limit_max;
103}
104
/* NOTE(review): presumably holds the sysctl registration handle -- set elsewhere. */
static struct ctl_table_header *raid_table_header;

/*
 * Leaf entries: "speed_limit_min" and "speed_limit_max", each an int that
 * is world-readable and owner-writable, handled by proc_dointvec and backed
 * directly by the sysctl_speed_limit_* variables.
 */
static ctl_table raid_table[] = {
	{
		.ctl_name = DEV_RAID_SPEED_LIMIT_MIN,
		.procname = "speed_limit_min",
		.data = &sysctl_speed_limit_min,
		.maxlen = sizeof(int),
		.mode = S_IRUGO|S_IWUSR,
		.proc_handler = &proc_dointvec,
	},
	{
		.ctl_name = DEV_RAID_SPEED_LIMIT_MAX,
		.procname = "speed_limit_max",
		.data = &sysctl_speed_limit_max,
		.maxlen = sizeof(int),
		.mode = S_IRUGO|S_IWUSR,
		.proc_handler = &proc_dointvec,
	},
	{ .ctl_name = 0 }	/* terminator */
};

/* "raid" directory whose children are the entries in raid_table. */
static ctl_table raid_dir_table[] = {
	{
		.ctl_name = DEV_RAID,
		.procname = "raid",
		.maxlen = 0,
		.mode = S_IRUGO|S_IXUGO,
		.child = raid_table,
	},
	{ .ctl_name = 0 }	/* terminator */
};

/* Top-level "dev" directory, giving the path dev/raid/speed_limit_{min,max}. */
static ctl_table raid_root_table[] = {
	{
		.ctl_name = CTL_DEV,
		.procname = "dev",
		.maxlen = 0,
		.mode = 0555,
		.child = raid_dir_table,
	},
	{ .ctl_name = 0 }	/* terminator */
};
148
/* Forward declaration; the definition is not in this part of the file. */
static struct block_device_operations md_fops;

/* NOTE(review): presumably makes arrays start read-only when set -- confirm at its users. */
static int start_readonly;
152
153
154
155
156
157
158
159
160
161
162
163static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
164static atomic_t md_event_count;
165void md_new_event(mddev_t *mddev)
166{
167 atomic_inc(&md_event_count);
168 wake_up(&md_event_waiters);
169 sysfs_notify(&mddev->kobj, NULL, "sync_action");
170}
171EXPORT_SYMBOL_GPL(md_new_event);
172
173
174
175
176static void md_new_event_inintr(mddev_t *mddev)
177{
178 atomic_inc(&md_event_count);
179 wake_up(&md_event_waiters);
180}
181
182
183
184
185
/* Every known mddev is linked on all_mddevs, under all_mddevs_lock. */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);


/*
 * Walk all_mddevs with 'mddev' as the current array and 'tmp' as the
 * list cursor (a struct list_head *).
 *
 * - init:      take all_mddevs_lock, point tmp at the first entry.
 * - condition: if tmp is a real entry, take a reference on it with
 *              mddev_get(); drop the lock; mddev_put() the previous
 *              array (if any); set mddev to the new entry.
 * - step:      retake the lock and advance tmp.
 *
 * So the loop body runs with the lock released and a reference held on
 * the current mddev.  When the loop ends normally the last reference has
 * been dropped and the lock is not held.  A caller that breaks out early
 * still holds the reference on 'mddev' and must mddev_put() it.
 */
#define ITERATE_MDDEV(mddev,tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		tmp = all_mddevs.next;					\
		mddev = NULL;});					\
	     ({ if (tmp != &all_mddevs)					\
			mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (mddev) mddev_put(mddev);				\
		mddev = list_entry(tmp, mddev_t, all_mddevs);		\
		tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		tmp = tmp->next;})					\
		)
211
212
213static int md_fail_request (request_queue_t *q, struct bio *bio)
214{
215 bio_io_error(bio, bio->bi_size);
216 return 0;
217}
218
219static inline mddev_t *mddev_get(mddev_t *mddev)
220{
221 atomic_inc(&mddev->active);
222 return mddev;
223}
224
225static void mddev_put(mddev_t *mddev)
226{
227 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
228 return;
229 if (!mddev->raid_disks && list_empty(&mddev->disks)) {
230 list_del(&mddev->all_mddevs);
231 spin_unlock(&all_mddevs_lock);
232 blk_cleanup_queue(mddev->queue);
233 kobject_unregister(&mddev->kobj);
234 } else
235 spin_unlock(&all_mddevs_lock);
236}
237
238static mddev_t * mddev_find(dev_t unit)
239{
240 mddev_t *mddev, *new = NULL;
241
242 retry:
243 spin_lock(&all_mddevs_lock);
244 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
245 if (mddev->unit == unit) {
246 mddev_get(mddev);
247 spin_unlock(&all_mddevs_lock);
248 kfree(new);
249 return mddev;
250 }
251
252 if (new) {
253 list_add(&new->all_mddevs, &all_mddevs);
254 spin_unlock(&all_mddevs_lock);
255 return new;
256 }
257 spin_unlock(&all_mddevs_lock);
258
259 new = kzalloc(sizeof(*new), GFP_KERNEL);
260 if (!new)
261 return NULL;
262
263 new->unit = unit;
264 if (MAJOR(unit) == MD_MAJOR)
265 new->md_minor = MINOR(unit);
266 else
267 new->md_minor = MINOR(unit) >> MdpMinorShift;
268
269 mutex_init(&new->reconfig_mutex);
270 INIT_LIST_HEAD(&new->disks);
271 INIT_LIST_HEAD(&new->all_mddevs);
272 init_timer(&new->safemode_timer);
273 atomic_set(&new->active, 1);
274 spin_lock_init(&new->write_lock);
275 init_waitqueue_head(&new->sb_wait);
276
277 new->queue = blk_alloc_queue(GFP_KERNEL);
278 if (!new->queue) {
279 kfree(new);
280 return NULL;
281 }
282 set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);
283
284 blk_queue_make_request(new->queue, md_fail_request);
285
286 goto retry;
287}
288
289static inline int mddev_lock(mddev_t * mddev)
290{
291 return mutex_lock_interruptible(&mddev->reconfig_mutex);
292}
293
294static inline int mddev_trylock(mddev_t * mddev)
295{
296 return mutex_trylock(&mddev->reconfig_mutex);
297}
298
299static inline void mddev_unlock(mddev_t * mddev)
300{
301 mutex_unlock(&mddev->reconfig_mutex);
302
303 md_wakeup_thread(mddev->thread);
304}
305
306static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
307{
308 mdk_rdev_t * rdev;
309 struct list_head *tmp;
310
311 ITERATE_RDEV(mddev,rdev,tmp) {
312 if (rdev->desc_nr == nr)
313 return rdev;
314 }
315 return NULL;
316}
317
318static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
319{
320 struct list_head *tmp;
321 mdk_rdev_t *rdev;
322
323 ITERATE_RDEV(mddev,rdev,tmp) {
324 if (rdev->bdev->bd_dev == dev)
325 return rdev;
326 }
327 return NULL;
328}
329
330static struct mdk_personality *find_pers(int level, char *clevel)
331{
332 struct mdk_personality *pers;
333 list_for_each_entry(pers, &pers_list, list) {
334 if (level != LEVEL_NONE && pers->level == level)
335 return pers;
336 if (strcmp(pers->name, clevel)==0)
337 return pers;
338 }
339 return NULL;
340}
341
342static inline sector_t calc_dev_sboffset(struct block_device *bdev)
343{
344 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
345 return MD_NEW_SIZE_BLOCKS(size);
346}
347
348static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
349{
350 sector_t size;
351
352 size = rdev->sb_offset;
353
354 if (chunk_size)
355 size &= ~((sector_t)chunk_size/1024 - 1);
356 return size;
357}
358
359static int alloc_disk_sb(mdk_rdev_t * rdev)
360{
361 if (rdev->sb_page)
362 MD_BUG();
363
364 rdev->sb_page = alloc_page(GFP_KERNEL);
365 if (!rdev->sb_page) {
366 printk(KERN_ALERT "md: out of memory.\n");
367 return -EINVAL;
368 }
369
370 return 0;
371}
372
373static void free_disk_sb(mdk_rdev_t * rdev)
374{
375 if (rdev->sb_page) {
376 put_page(rdev->sb_page);
377 rdev->sb_loaded = 0;
378 rdev->sb_page = NULL;
379 rdev->sb_offset = 0;
380 rdev->size = 0;
381 }
382}
383
384
385static int super_written(struct bio *bio, unsigned int bytes_done, int error)
386{
387 mdk_rdev_t *rdev = bio->bi_private;
388 mddev_t *mddev = rdev->mddev;
389 if (bio->bi_size)
390 return 1;
391
392 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
393 md_error(mddev, rdev);
394
395 if (atomic_dec_and_test(&mddev->pending_writes))
396 wake_up(&mddev->sb_wait);
397 bio_put(bio);
398 return 0;
399}
400
401static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error)
402{
403 struct bio *bio2 = bio->bi_private;
404 mdk_rdev_t *rdev = bio2->bi_private;
405 mddev_t *mddev = rdev->mddev;
406 if (bio->bi_size)
407 return 1;
408
409 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
410 error == -EOPNOTSUPP) {
411 unsigned long flags;
412
413 set_bit(BarriersNotsupp, &rdev->flags);
414 mddev->barriers_work = 0;
415 spin_lock_irqsave(&mddev->write_lock, flags);
416 bio2->bi_next = mddev->biolist;
417 mddev->biolist = bio2;
418 spin_unlock_irqrestore(&mddev->write_lock, flags);
419 wake_up(&mddev->sb_wait);
420 bio_put(bio);
421 return 0;
422 }
423 bio_put(bio2);
424 bio->bi_private = rdev;
425 return super_written(bio, bytes_done, error);
426}
427
428void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
429 sector_t sector, int size, struct page *page)
430{
431
432
433
434
435
436
437
438
439
440 struct bio *bio = bio_alloc(GFP_NOIO, 1);
441 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
442
443 bio->bi_bdev = rdev->bdev;
444 bio->bi_sector = sector;
445 bio_add_page(bio, page, size, 0);
446 bio->bi_private = rdev;
447 bio->bi_end_io = super_written;
448 bio->bi_rw = rw;
449
450 atomic_inc(&mddev->pending_writes);
451 if (!test_bit(BarriersNotsupp, &rdev->flags)) {
452 struct bio *rbio;
453 rw |= (1<<BIO_RW_BARRIER);
454 rbio = bio_clone(bio, GFP_NOIO);
455 rbio->bi_private = bio;
456 rbio->bi_end_io = super_written_barrier;
457 submit_bio(rw, rbio);
458 } else
459 submit_bio(rw, bio);
460}
461
/*
 * Sleep (uninterruptibly) until mddev->pending_writes drops to zero.
 *
 * While waiting, any bios that super_written_barrier() parked on
 * mddev->biolist are popped one at a time under write_lock and
 * resubmitted with their own bi_rw.
 */
void md_super_wait(mddev_t *mddev)
{
	DEFINE_WAIT(wq);
	for(;;) {
		/* Register on sb_wait before testing, so a wake-up is not missed. */
		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
		if (atomic_read(&mddev->pending_writes)==0)
			break;
		while (mddev->biolist) {
			struct bio *bio;
			/* Detach the head of the retry list under the lock... */
			spin_lock_irq(&mddev->write_lock);
			bio = mddev->biolist;
			mddev->biolist = bio->bi_next ;
			bio->bi_next = NULL;
			spin_unlock_irq(&mddev->write_lock);
			/* ...and resubmit it with the lock released. */
			submit_bio(bio->bi_rw, bio);
		}
		schedule();
	}
	finish_wait(&mddev->sb_wait, &wq);
}
485
486static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
487{
488 if (bio->bi_size)
489 return 1;
490
491 complete((struct completion*)bio->bi_private);
492 return 0;
493}
494
495int sync_page_io(struct block_device *bdev, sector_t sector, int size,
496 struct page *page, int rw)
497{
498 struct bio *bio = bio_alloc(GFP_NOIO, 1);
499 struct completion event;
500 int ret;
501
502 rw |= (1 << BIO_RW_SYNC);
503
504 bio->bi_bdev = bdev;
505 bio->bi_sector = sector;
506 bio_add_page(bio, page, size, 0);
507 init_completion(&event);
508 bio->bi_private = &event;
509 bio->bi_end_io = bi_complete;
510 submit_bio(rw, bio);
511 wait_for_completion(&event);
512
513 ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
514 bio_put(bio);
515 return ret;
516}
517EXPORT_SYMBOL_GPL(sync_page_io);
518
519static int read_disk_sb(mdk_rdev_t * rdev, int size)
520{
521 char b[BDEVNAME_SIZE];
522 if (!rdev->sb_page) {
523 MD_BUG();
524 return -EINVAL;
525 }
526 if (rdev->sb_loaded)
527 return 0;
528
529
530 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
531 goto fail;
532 rdev->sb_loaded = 1;
533 return 0;
534
535fail:
536 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
537 bdevname(rdev->bdev,b));
538 return -EINVAL;
539}
540
541static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
542{
543 if ( (sb1->set_uuid0 == sb2->set_uuid0) &&
544 (sb1->set_uuid1 == sb2->set_uuid1) &&
545 (sb1->set_uuid2 == sb2->set_uuid2) &&
546 (sb1->set_uuid3 == sb2->set_uuid3))
547
548 return 1;
549
550 return 0;
551}
552
553
554static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
555{
556 int ret;
557 mdp_super_t *tmp1, *tmp2;
558
559 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
560 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
561
562 if (!tmp1 || !tmp2) {
563 ret = 0;
564 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
565 goto abort;
566 }
567
568 *tmp1 = *sb1;
569 *tmp2 = *sb2;
570
571
572
573
574 tmp1->nr_disks = 0;
575 tmp2->nr_disks = 0;
576
577 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
578 ret = 0;
579 else
580 ret = 1;
581
582abort:
583 kfree(tmp1);
584 kfree(tmp2);
585 return ret;
586}
587
588static unsigned int calc_sb_csum(mdp_super_t * sb)
589{
590 unsigned int disk_csum, csum;
591
592 disk_csum = sb->sb_csum;
593 sb->sb_csum = 0;
594 csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
595 sb->sb_csum = disk_csum;
596 return csum;
597}
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
/*
 * Operations for one on-disk superblock format.
 *
 * load_super:     read and sanity-check the superblock of rdev.  As
 *                 implemented by super_90_load()/super_1_load(): returns 1
 *                 when refdev is NULL or rdev's event count is higher than
 *                 refdev's, 0 otherwise, and a negative errno on failure.
 * validate_super: apply rdev's superblock to mddev and set rdev's
 *                 role/flags; returns 0 or a negative errno.
 * sync_super:     rebuild rdev's in-memory superblock from mddev.
 */
struct super_type {
	char *name;
	struct module *owner;
	int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
	int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
	void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
};
637
638
639
640
/*
 * Load and check the version-0.90 superblock of @rdev.
 *
 * @refdev, when not NULL, is a device already accepted for the same
 * array; @rdev must then have the same UUID and the same constant
 * superblock section.  @minor_version is not used by this format.
 *
 * Returns 1 if @refdev is NULL or @rdev has a higher event count than
 * @refdev, 0 otherwise.  Returns -EINVAL on any validation failure, or
 * the error from read_disk_sb().
 */
static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	mdp_super_t *sb;
	int ret;
	sector_t sb_offset;

	/* Superblock location is derived from the device size. */
	sb_offset = calc_dev_sboffset(rdev->bdev);
	rdev->sb_offset = sb_offset;

	ret = read_disk_sb(rdev, MD_SB_BYTES);
	if (ret) return ret;

	/* Every "goto abort" below reports -EINVAL. */
	ret = -EINVAL;

	bdevname(rdev->bdev, b);
	sb = (mdp_super_t*)page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
			b);
		goto abort;
	}

	/* Only versions 0.90 and 0.91 are accepted. */
	if (sb->major_version != 0 ||
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
			sb->major_version, sb->minor_version,
			b);
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

	if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) {
		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
			b);
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;
	rdev->sb_size = MD_SB_BYTES;

	/* -1 makes bind_rdev_to_array() pick a free descriptor number. */
	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

	if (refdev == 0)
		ret = 1;
	else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
		if (!uuid_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has different UUID to %s\n",
				b, bdevname(refdev->bdev,b2));
			goto abort;
		}
		if (!sb_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has same UUID"
			       " but different superblock to %s\n",
			       b, bdevname(refdev->bdev, b2));
			goto abort;
		}
		/* Tell the caller whether this device is fresher than refdev. */
		ev1 = md_event(sb);
		ev2 = md_event(refsb);
		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	rdev->size = calc_dev_size(rdev, sb->chunk_size);

	/* For levels above 1, the device must be at least sb->size. */
	if (rdev->size < sb->size && sb->level > 1)

		ret = -EINVAL;

 abort:
	return ret;
}
730
731
732
733
/*
 * Apply the version-0.90 superblock of @rdev to @mddev and work out
 * @rdev's role in the array.
 *
 * @rdev always starts with raid_disk = -1 and no flags.  Returns 0 on
 * success (including when the device is too old to be given a role in a
 * running array), or -EINVAL when it must be rejected.
 */
static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
	__u64 ev1 = md_event(sb);

	rdev->raid_disk = -1;
	rdev->flags = 0;
	if (mddev->raid_disks == 0) {
		/* First device for this array: seed mddev from its superblock. */
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->persistent = ! sb->not_persistent;
		mddev->chunk_size = sb->chunk_size;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->clevel[0] = 0;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->size = sb->size;
		mddev->events = ev1;
		mddev->bitmap_offset = 0;
		mddev->default_bitmap_offset = MD_SB_BYTES >> 9;

		/* Reshape fields are only taken from a 0.91 superblock. */
		if (mddev->minor_version >= 91) {
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
			mddev->new_chunk = sb->new_chunk;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk = mddev->chunk_size;
		}

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			/*
			 * The stored checkpoint is only used when it was
			 * recorded at the current event count.
			 */
			if (sb->events_hi == sb->cp_events_hi &&
				sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		/* Superblock says there is a bitmap and no bitmap file is set. */
		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_file == NULL) {
			if (mddev->level != 1 && mddev->level != 4
			    && mddev->level != 5 && mddev->level != 6
			    && mddev->level != 10) {

				printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
				return -EINVAL;
			}
			mddev->bitmap_offset = mddev->default_bitmap_offset;
		}

	} else if (mddev->pers == NULL) {
		/*
		 * Array not running yet: accept a device that is at most
		 * one event behind, reject anything older.
		 */
		++ev1;
		if (ev1 < mddev->events)
			return -EINVAL;
	} else if (mddev->bitmap) {
		/*
		 * Running array with a bitmap: a device older than the
		 * bitmap's events_cleared keeps raid_disk = -1 and no flags.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
	} else {
		/* Running array, no bitmap: an out-of-date device gets no role. */
		if (ev1 < mddev->events)

			return 0;
	}

	if (mddev->level != LEVEL_MULTIPATH) {
		/*
		 * NOTE(review): desc_nr comes from the on-disk this_disk.number
		 * and is not range-checked against the disks[] array here.
		 */
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			set_bit(Faulty, &rdev->flags);
		else if (desc->state & (1<<MD_DISK_SYNC)
			 ) {
			set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = desc->raid_disk;
		}
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
	} else
		/* Multipath: every device is treated as in sync. */
		set_bit(In_sync, &rdev->flags);
	return 0;
}
835
836
837
838
/*
 * Rebuild the in-memory version-0.90 superblock of @rdev from the
 * current state of @mddev.
 *
 * The superblock page is cleared and regenerated from scratch.  As a
 * side effect, the desc_nr of every member device is reassigned and
 * mddev->minor_version is updated to 90 or 91.
 */
static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
	mdp_super_t *sb;
	struct list_head *tmp;
	mdk_rdev_t *rdev2;
	/* Descriptor slots for devices without an in-sync role start here. */
	int next_spare = mddev->raid_disks;

	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

	rdev->sb_size = MD_SB_BYTES;

	sb = (mdp_super_t*)page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words = 0;
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

	sb->ctime = mddev->ctime;
	sb->level = mddev->level;
	sb->size = mddev->size;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = !mddev->persistent;
	sb->utime = mddev->utime;
	sb->state = 0;
	/* The 64-bit event count is stored as two 32-bit halves. */
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	/* Minor version 91 marks a superblock that carries reshape state. */
	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
		sb->new_chunk = mddev->new_chunk;
	}
	mddev->minor_version = sb->minor_version;
	if (mddev->in_sync)
	{
		/* Record the checkpoint together with the event count it belongs to. */
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_size;

	if (mddev->bitmap && mddev->bitmap_file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	/*
	 * Pre-mark slot 0 as removed; it is overwritten below if a device
	 * ends up with descriptor 0.
	 */
	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	ITERATE_RDEV(mddev,rdev2,tmp) {
		mdp_disk_t *d;
		int desc_nr;
		/*
		 * In-sync, non-faulty devices use their raid_disk as the
		 * descriptor number; everything else takes the next spare slot.
		 */
		if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
		    && !test_bit(Faulty, &rdev2->flags))
			desc_nr = rdev2->raid_disk;
		else
			desc_nr = next_spare++;
		rdev2->desc_nr = desc_nr;
		/* NOTE(review): desc_nr is not bounded by the size of disks[] here. */
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
		    && !test_bit(Faulty, &rdev2->flags))
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr;
		if (test_bit(Faulty, &rdev2->flags))
			d->state = (1<<MD_DISK_FAULTY);
		else if (test_bit(In_sync, &rdev2->flags)) {
			d->state = (1<<MD_DISK_ACTIVE);
			d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
	}

	/* Raid slots that no device filled are recorded as removed and faulty. */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	/* The checksum must be computed last, over the finished superblock. */
	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}
968
969
970
971
972
973static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
974{
975 unsigned int disk_csum, csum;
976 unsigned long long newcsum;
977 int size = 256 + le32_to_cpu(sb->max_dev)*2;
978 unsigned int *isuper = (unsigned int*)sb;
979 int i;
980
981 disk_csum = sb->sb_csum;
982 sb->sb_csum = 0;
983 newcsum = 0;
984 for (i=0; size>=4; size -= 4 )
985 newcsum += le32_to_cpu(*isuper++);
986
987 if (size == 2)
988 newcsum += le16_to_cpu(*(unsigned short*) isuper);
989
990 csum = (newcsum & 0xffffffff) + (newcsum >> 32);
991 sb->sb_csum = disk_csum;
992 return cpu_to_le32(csum);
993}
994
995static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
996{
997 struct mdp_superblock_1 *sb;
998 int ret;
999 sector_t sb_offset;
1000 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1001 int bmask;
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011 switch(minor_version) {
1012 case 0:
1013 sb_offset = rdev->bdev->bd_inode->i_size >> 9;
1014 sb_offset -= 8*2;
1015 sb_offset &= ~(sector_t)(4*2-1);
1016
1017 sb_offset /= 2;
1018 break;
1019 case 1:
1020 sb_offset = 0;
1021 break;
1022 case 2:
1023 sb_offset = 4;
1024 break;
1025 default:
1026 return -EINVAL;
1027 }
1028 rdev->sb_offset = sb_offset;
1029
1030
1031
1032
1033 ret = read_disk_sb(rdev, 4096);
1034 if (ret) return ret;
1035
1036
1037 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1038
1039 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1040 sb->major_version != cpu_to_le32(1) ||
1041 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1042 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
1043 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1044 return -EINVAL;
1045
1046 if (calc_sb_1_csum(sb) != sb->sb_csum) {
1047 printk("md: invalid superblock checksum on %s\n",
1048 bdevname(rdev->bdev,b));
1049 return -EINVAL;
1050 }
1051 if (le64_to_cpu(sb->data_size) < 10) {
1052 printk("md: data_size too small on %s\n",
1053 bdevname(rdev->bdev,b));
1054 return -EINVAL;
1055 }
1056 rdev->preferred_minor = 0xffff;
1057 rdev->data_offset = le64_to_cpu(sb->data_offset);
1058 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1059
1060 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1061 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
1062 if (rdev->sb_size & bmask)
1063 rdev-> sb_size = (rdev->sb_size | bmask)+1;
1064
1065 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1066 rdev->desc_nr = -1;
1067 else
1068 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1069
1070 if (refdev == 0)
1071 ret = 1;
1072 else {
1073 __u64 ev1, ev2;
1074 struct mdp_superblock_1 *refsb =
1075 (struct mdp_superblock_1*)page_address(refdev->sb_page);
1076
1077 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1078 sb->level != refsb->level ||
1079 sb->layout != refsb->layout ||
1080 sb->chunksize != refsb->chunksize) {
1081 printk(KERN_WARNING "md: %s has strangely different"
1082 " superblock to %s\n",
1083 bdevname(rdev->bdev,b),
1084 bdevname(refdev->bdev,b2));
1085 return -EINVAL;
1086 }
1087 ev1 = le64_to_cpu(sb->events);
1088 ev2 = le64_to_cpu(refsb->events);
1089
1090 if (ev1 > ev2)
1091 ret = 1;
1092 else
1093 ret = 0;
1094 }
1095 if (minor_version)
1096 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
1097 else
1098 rdev->size = rdev->sb_offset;
1099 if (rdev->size < le64_to_cpu(sb->data_size)/2)
1100 return -EINVAL;
1101 rdev->size = le64_to_cpu(sb->data_size)/2;
1102 if (le32_to_cpu(sb->chunksize))
1103 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
1104
1105 if (le32_to_cpu(sb->size) > rdev->size*2)
1106 return -EINVAL;
1107 return ret;
1108}
1109
1110static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1111{
1112 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1113 __u64 ev1 = le64_to_cpu(sb->events);
1114
1115 rdev->raid_disk = -1;
1116 rdev->flags = 0;
1117 if (mddev->raid_disks == 0) {
1118 mddev->major_version = 1;
1119 mddev->patch_version = 0;
1120 mddev->persistent = 1;
1121 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
1122 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1123 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1124 mddev->level = le32_to_cpu(sb->level);
1125 mddev->clevel[0] = 0;
1126 mddev->layout = le32_to_cpu(sb->layout);
1127 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1128 mddev->size = le64_to_cpu(sb->size)/2;
1129 mddev->events = ev1;
1130 mddev->bitmap_offset = 0;
1131 mddev->default_bitmap_offset = 1024 >> 9;
1132
1133 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1134 memcpy(mddev->uuid, sb->set_uuid, 16);
1135
1136 mddev->max_disks = (4096-256)/2;
1137
1138 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1139 mddev->bitmap_file == NULL ) {
1140 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
1141 && mddev->level != 10) {
1142 printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
1143 return -EINVAL;
1144 }
1145 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
1146 }
1147 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1148 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1149 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1150 mddev->new_level = le32_to_cpu(sb->new_level);
1151 mddev->new_layout = le32_to_cpu(sb->new_layout);
1152 mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
1153 } else {
1154 mddev->reshape_position = MaxSector;
1155 mddev->delta_disks = 0;
1156 mddev->new_level = mddev->level;
1157 mddev->new_layout = mddev->layout;
1158 mddev->new_chunk = mddev->chunk_size;
1159 }
1160
1161 } else if (mddev->pers == NULL) {
1162
1163 ++ev1;
1164 if (ev1 < mddev->events)
1165 return -EINVAL;
1166 } else if (mddev->bitmap) {
1167
1168
1169
1170 if (ev1 < mddev->bitmap->events_cleared)
1171 return 0;
1172 } else {
1173 if (ev1 < mddev->events)
1174
1175 return 0;
1176 }
1177 if (mddev->level != LEVEL_MULTIPATH) {
1178 int role;
1179 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1180 switch(role) {
1181 case 0xffff:
1182 break;
1183 case 0xfffe:
1184 set_bit(Faulty, &rdev->flags);
1185 break;
1186 default:
1187 if ((le32_to_cpu(sb->feature_map) &
1188 MD_FEATURE_RECOVERY_OFFSET))
1189 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1190 else
1191 set_bit(In_sync, &rdev->flags);
1192 rdev->raid_disk = role;
1193 break;
1194 }
1195 if (sb->devflags & WriteMostly1)
1196 set_bit(WriteMostly, &rdev->flags);
1197 } else
1198 set_bit(In_sync, &rdev->flags);
1199
1200 return 0;
1201}
1202
1203static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1204{
1205 struct mdp_superblock_1 *sb;
1206 struct list_head *tmp;
1207 mdk_rdev_t *rdev2;
1208 int max_dev, i;
1209
1210
1211 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1212
1213 sb->feature_map = 0;
1214 sb->pad0 = 0;
1215 sb->recovery_offset = cpu_to_le64(0);
1216 memset(sb->pad1, 0, sizeof(sb->pad1));
1217 memset(sb->pad2, 0, sizeof(sb->pad2));
1218 memset(sb->pad3, 0, sizeof(sb->pad3));
1219
1220 sb->utime = cpu_to_le64((__u64)mddev->utime);
1221 sb->events = cpu_to_le64(mddev->events);
1222 if (mddev->in_sync)
1223 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1224 else
1225 sb->resync_offset = cpu_to_le64(0);
1226
1227 sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors);
1228
1229 sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1230 sb->size = cpu_to_le64(mddev->size<<1);
1231
1232 if (mddev->bitmap && mddev->bitmap_file == NULL) {
1233 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1234 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1235 }
1236
1237 if (rdev->raid_disk >= 0 &&
1238 !test_bit(In_sync, &rdev->flags) &&
1239 rdev->recovery_offset > 0) {
1240 sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1241 sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
1242 }
1243
1244 if (mddev->reshape_position != MaxSector) {
1245 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1246 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1247 sb->new_layout = cpu_to_le32(mddev->new_layout);
1248 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1249 sb->new_level = cpu_to_le32(mddev->new_level);
1250 sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
1251 }
1252
1253 max_dev = 0;
1254 ITERATE_RDEV(mddev,rdev2,tmp)
1255 if (rdev2->desc_nr+1 > max_dev)
1256 max_dev = rdev2->desc_nr+1;
1257
1258 sb->max_dev = cpu_to_le32(max_dev);
1259 for (i=0; i<max_dev;i++)
1260 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1261
1262 ITERATE_RDEV(mddev,rdev2,tmp) {
1263 i = rdev2->desc_nr;
1264 if (test_bit(Faulty, &rdev2->flags))
1265 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1266 else if (test_bit(In_sync, &rdev2->flags))
1267 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1268 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
1269 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1270 else
1271 sb->dev_roles[i] = cpu_to_le16(0xffff);
1272 }
1273
1274 sb->sb_csum = calc_sb_1_csum(sb);
1275}
1276
1277
/*
 * Superblock format handlers: entry [0] is the 0.90 format and entry
 * [1] is the version-1 format.
 */
static struct super_type super_types[] = {
	[0] = {
		.name = "0.90.0",
		.owner = THIS_MODULE,
		.load_super = super_90_load,
		.validate_super = super_90_validate,
		.sync_super = super_90_sync,
	},
	[1] = {
		.name = "md-1",
		.owner = THIS_MODULE,
		.load_super = super_1_load,
		.validate_super = super_1_validate,
		.sync_super = super_1_sync,
	},
};
1294
1295static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
1296{
1297 struct list_head *tmp;
1298 mdk_rdev_t *rdev;
1299
1300 ITERATE_RDEV(mddev,rdev,tmp)
1301 if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
1302 return rdev;
1303
1304 return NULL;
1305}
1306
1307static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1308{
1309 struct list_head *tmp;
1310 mdk_rdev_t *rdev;
1311
1312 ITERATE_RDEV(mddev1,rdev,tmp)
1313 if (match_dev_unit(mddev2, rdev))
1314 return 1;
1315
1316 return 0;
1317}
1318
/* NOTE(review): presumably devices awaiting auto-assembly -- no user is visible here. */
static LIST_HEAD(pending_raid_disks);

/*
 * Attach @rdev to @mddev: pick or check its descriptor number, add it to
 * the array's disk list, register its "dev-<name>" kobject under the
 * array, link it to the underlying block device in sysfs, and claim the
 * block device for the array's gendisk.
 *
 * Returns 0 on success, or:
 *   -EINVAL  @rdev already belongs to an array,
 *   -ENOSPC  the array is running and @rdev is smaller than mddev->size,
 *   -EBUSY   @rdev's descriptor number is already in use,
 *   -ENOMEM  the kobject name could not be set.
 */
static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{
	mdk_rdev_t *same_pdev;
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	struct kobject *ko;
	char *s;

	if (rdev->mddev) {
		MD_BUG();
		return -EINVAL;
	}

	/*
	 * A device smaller than the array shrinks mddev->size, unless the
	 * array is already running, in which case it is refused.
	 */
	if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
		if (mddev->pers)

			return -ENOSPC;
		else
			mddev->size = rdev->size;
	}
	/* Two members on one physical disk only earns a warning. */
	same_pdev = match_dev_unit(mddev, rdev);
	if (same_pdev)
		printk(KERN_WARNING
			"%s: WARNING: %s appears to be on the same physical"
	 		" disk as %s. True\n     protection against single-disk"
			" failure might be compromised.\n",
			mdname(mddev), bdevname(rdev->bdev,b),
			bdevname(same_pdev->bdev,b2));

	/*
	 * A negative desc_nr means "choose one": take the first free
	 * number, starting at raid_disks when the array is running and
	 * at 0 otherwise.  An explicit desc_nr must not be in use.
	 */
	if (rdev->desc_nr < 0) {
		int choice = 0;
		if (mddev->pers) choice = mddev->raid_disks;
		while (find_rdev_nr(mddev, choice))
			choice++;
		rdev->desc_nr = choice;
	} else {
		if (find_rdev_nr(mddev, rdev->desc_nr))
			return -EBUSY;
	}
	bdevname(rdev->bdev,b);
	if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0)
		return -ENOMEM;
	/* Replace every '/' in the kobject name with '!'. */
	while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL)
		*s = '!';

	list_add(&rdev->same_set, &mddev->disks);
	rdev->mddev = mddev;
	printk(KERN_INFO "md: bind<%s>\n", b);

	/* NOTE(review): the return value of kobject_add() is not checked. */
	rdev->kobj.parent = &mddev->kobj;
	kobject_add(&rdev->kobj);

	/* Link to the partition's kobject if there is one, else the disk's. */
	if (rdev->bdev->bd_part)
		ko = &rdev->bdev->bd_part->kobj;
	else
		ko = &rdev->bdev->bd_disk->kobj;
	/* NOTE(review): the results of these two calls are ignored as well. */
	sysfs_create_link(&rdev->kobj, ko, "block");
	bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk);
	return 0;
}
1384
1385static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1386{
1387 char b[BDEVNAME_SIZE];
1388 if (!rdev->mddev) {
1389 MD_BUG();
1390 return;
1391 }
1392 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1393 list_del_init(&rdev->same_set);
1394 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1395 rdev->mddev = NULL;
1396 sysfs_remove_link(&rdev->kobj, "block");
1397 kobject_del(&rdev->kobj);
1398}
1399
1400
1401
1402
1403
1404
1405static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
1406{
1407 int err = 0;
1408 struct block_device *bdev;
1409 char b[BDEVNAME_SIZE];
1410
1411 bdev = open_partition_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1412 if (IS_ERR(bdev)) {
1413 printk(KERN_ERR "md: could not open %s.\n",
1414 __bdevname(dev, b));
1415 return PTR_ERR(bdev);
1416 }
1417 err = bd_claim(bdev, rdev);
1418 if (err) {
1419 printk(KERN_ERR "md: could not bd_claim %s.\n",
1420 bdevname(bdev, b));
1421 blkdev_put_partition(bdev);
1422 return err;
1423 }
1424 rdev->bdev = bdev;
1425 return err;
1426}
1427
1428static void unlock_rdev(mdk_rdev_t *rdev)
1429{
1430 struct block_device *bdev = rdev->bdev;
1431 rdev->bdev = NULL;
1432 if (!bdev)
1433 MD_BUG();
1434 bd_release(bdev);
1435 blkdev_put_partition(bdev);
1436}
1437
1438void md_autodetect_dev(dev_t dev);
1439
1440static void export_rdev(mdk_rdev_t * rdev)
1441{
1442 char b[BDEVNAME_SIZE];
1443 printk(KERN_INFO "md: export_rdev(%s)\n",
1444 bdevname(rdev->bdev,b));
1445 if (rdev->mddev)
1446 MD_BUG();
1447 free_disk_sb(rdev);
1448 list_del_init(&rdev->same_set);
1449#ifndef MODULE
1450 md_autodetect_dev(rdev->bdev->bd_dev);
1451#endif
1452 unlock_rdev(rdev);
1453 kobject_put(&rdev->kobj);
1454}
1455
1456static void kick_rdev_from_array(mdk_rdev_t * rdev)
1457{
1458 unbind_rdev_from_array(rdev);
1459 export_rdev(rdev);
1460}
1461
1462static void export_array(mddev_t *mddev)
1463{
1464 struct list_head *tmp;
1465 mdk_rdev_t *rdev;
1466
1467 ITERATE_RDEV(mddev,rdev,tmp) {
1468 if (!rdev->mddev) {
1469 MD_BUG();
1470 continue;
1471 }
1472 kick_rdev_from_array(rdev);
1473 }
1474 if (!list_empty(&mddev->disks))
1475 MD_BUG();
1476 mddev->raid_disks = 0;
1477 mddev->major_version = 0;
1478}
1479
1480static void print_desc(mdp_disk_t *desc)
1481{
1482 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
1483 desc->major,desc->minor,desc->raid_disk,desc->state);
1484}
1485
1486static void print_sb(mdp_super_t *sb)
1487{
1488 int i;
1489
1490 printk(KERN_INFO
1491 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1492 sb->major_version, sb->minor_version, sb->patch_version,
1493 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1494 sb->ctime);
1495 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1496 sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1497 sb->md_minor, sb->layout, sb->chunk_size);
1498 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
1499 " FD:%d SD:%d CSUM:%08x E:%08lx\n",
1500 sb->utime, sb->state, sb->active_disks, sb->working_disks,
1501 sb->failed_disks, sb->spare_disks,
1502 sb->sb_csum, (unsigned long)sb->events_lo);
1503
1504 printk(KERN_INFO);
1505 for (i = 0; i < MD_SB_DISKS; i++) {
1506 mdp_disk_t *desc;
1507
1508 desc = sb->disks + i;
1509 if (desc->number || desc->major || desc->minor ||
1510 desc->raid_disk || (desc->state && (desc->state != 4))) {
1511 printk(" D %2d: ", i);
1512 print_desc(desc);
1513 }
1514 }
1515 printk(KERN_INFO "md: THIS: ");
1516 print_desc(&sb->this_disk);
1517
1518}
1519
1520static void print_rdev(mdk_rdev_t *rdev)
1521{
1522 char b[BDEVNAME_SIZE];
1523 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
1524 bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
1525 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1526 rdev->desc_nr);
1527 if (rdev->sb_loaded) {
1528 printk(KERN_INFO "md: rdev superblock:\n");
1529 print_sb((mdp_super_t*)page_address(rdev->sb_page));
1530 } else
1531 printk(KERN_INFO "md: no rdev superblock!\n");
1532}
1533
1534static void md_print_devices(void)
1535{
1536 struct list_head *tmp, *tmp2;
1537 mdk_rdev_t *rdev;
1538 mddev_t *mddev;
1539 char b[BDEVNAME_SIZE];
1540
1541 printk("\n");
1542 printk("md: **********************************\n");
1543 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
1544 printk("md: **********************************\n");
1545 ITERATE_MDDEV(mddev,tmp) {
1546
1547 if (mddev->bitmap)
1548 bitmap_print_sb(mddev->bitmap);
1549 else
1550 printk("%s: ", mdname(mddev));
1551 ITERATE_RDEV(mddev,rdev,tmp2)
1552 printk("<%s>", bdevname(rdev->bdev,b));
1553 printk("\n");
1554
1555 ITERATE_RDEV(mddev,rdev,tmp2)
1556 print_rdev(rdev);
1557 }
1558 printk("md: **********************************\n");
1559 printk("\n");
1560}
1561
1562
1563static void sync_sbs(mddev_t * mddev, int nospares)
1564{
1565
1566
1567
1568
1569
1570
1571 mdk_rdev_t *rdev;
1572 struct list_head *tmp;
1573
1574 ITERATE_RDEV(mddev,rdev,tmp) {
1575 if (rdev->sb_events == mddev->events ||
1576 (nospares &&
1577 rdev->raid_disk < 0 &&
1578 (rdev->sb_events&1)==0 &&
1579 rdev->sb_events+1 == mddev->events)) {
1580
1581 rdev->sb_loaded = 2;
1582 } else {
1583 super_types[mddev->major_version].
1584 sync_super(mddev, rdev);
1585 rdev->sb_loaded = 1;
1586 }
1587 }
1588}
1589
1590void md_update_sb(mddev_t * mddev)
1591{
1592 int err;
1593 struct list_head *tmp;
1594 mdk_rdev_t *rdev;
1595 int sync_req;
1596 int nospares = 0;
1597
1598repeat:
1599 spin_lock_irq(&mddev->write_lock);
1600
1601 if (mddev->degraded && mddev->sb_dirty == 3)
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611 mddev->sb_dirty = 1;
1612
1613 sync_req = mddev->in_sync;
1614 mddev->utime = get_seconds();
1615 if (mddev->sb_dirty == 3)
1616
1617
1618
1619
1620 nospares = 1;
1621
1622
1623
1624 if (mddev->sb_dirty == 3
1625 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1626 && (mddev->events & 1))
1627 mddev->events--;
1628 else {
1629
1630 mddev->events ++;
1631 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) {
1632
1633 if ((mddev->events&1)==0) {
1634 mddev->events++;
1635 nospares = 0;
1636 }
1637 } else {
1638
1639 if ((mddev->events&1)) {
1640 mddev->events++;
1641 nospares = 0;
1642 }
1643 }
1644 }
1645
1646 if (!mddev->events) {
1647
1648
1649
1650
1651
1652 MD_BUG();
1653 mddev->events --;
1654 }
1655 mddev->sb_dirty = 2;
1656 sync_sbs(mddev, nospares);
1657
1658
1659
1660
1661
1662 if (!mddev->persistent) {
1663 mddev->sb_dirty = 0;
1664 spin_unlock_irq(&mddev->write_lock);
1665 wake_up(&mddev->sb_wait);
1666 return;
1667 }
1668 spin_unlock_irq(&mddev->write_lock);
1669
1670 dprintk(KERN_INFO
1671 "md: updating %s RAID superblock on device (in sync %d)\n",
1672 mdname(mddev),mddev->in_sync);
1673
1674 err = bitmap_update_sb(mddev->bitmap);
1675 ITERATE_RDEV(mddev,rdev,tmp) {
1676 char b[BDEVNAME_SIZE];
1677 dprintk(KERN_INFO "md: ");
1678 if (rdev->sb_loaded != 1)
1679 continue;
1680 if (test_bit(Faulty, &rdev->flags))
1681 dprintk("(skipping faulty ");
1682
1683 dprintk("%s ", bdevname(rdev->bdev,b));
1684 if (!test_bit(Faulty, &rdev->flags)) {
1685 md_super_write(mddev,rdev,
1686 rdev->sb_offset<<1, rdev->sb_size,
1687 rdev->sb_page);
1688 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1689 bdevname(rdev->bdev,b),
1690 (unsigned long long)rdev->sb_offset);
1691 rdev->sb_events = mddev->events;
1692
1693 } else
1694 dprintk(")\n");
1695 if (mddev->level == LEVEL_MULTIPATH)
1696
1697 break;
1698 }
1699 md_super_wait(mddev);
1700
1701
1702 spin_lock_irq(&mddev->write_lock);
1703 if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) {
1704
1705 spin_unlock_irq(&mddev->write_lock);
1706 goto repeat;
1707 }
1708 mddev->sb_dirty = 0;
1709 spin_unlock_irq(&mddev->write_lock);
1710 wake_up(&mddev->sb_wait);
1711
1712}
1713EXPORT_SYMBOL_GPL(md_update_sb);
1714
1715
1716
1717
/*
 * Compare user input @cmd against the keyword @str.
 *
 * Returns 1 when @cmd equals @str, optionally followed by exactly one
 * trailing newline; returns 0 otherwise.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* advance over the common prefix */
	for (; *cmd && *str && *cmd == *str; cmd++, str++)
		;

	/* tolerate a single newline after the command */
	if (*cmd == '\n')
		cmd++;

	/* both strings must now be exhausted */
	return (*str == '\0' && *cmd == '\0') ? 1 : 0;
}
1734
1735struct rdev_sysfs_entry {
1736 struct attribute attr;
1737 ssize_t (*show)(mdk_rdev_t *, char *);
1738 ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
1739};
1740
1741static ssize_t
1742state_show(mdk_rdev_t *rdev, char *page)
1743{
1744 char *sep = "";
1745 int len=0;
1746
1747 if (test_bit(Faulty, &rdev->flags)) {
1748 len+= sprintf(page+len, "%sfaulty",sep);
1749 sep = ",";
1750 }
1751 if (test_bit(In_sync, &rdev->flags)) {
1752 len += sprintf(page+len, "%sin_sync",sep);
1753 sep = ",";
1754 }
1755 if (test_bit(WriteMostly, &rdev->flags)) {
1756 len += sprintf(page+len, "%swrite_mostly",sep);
1757 sep = ",";
1758 }
1759 if (!test_bit(Faulty, &rdev->flags) &&
1760 !test_bit(In_sync, &rdev->flags)) {
1761 len += sprintf(page+len, "%sspare", sep);
1762 sep = ",";
1763 }
1764 return len+sprintf(page+len, "\n");
1765}
1766
1767static ssize_t
1768state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1769{
1770
1771
1772
1773
1774
1775
1776 int err = -EINVAL;
1777 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
1778 md_error(rdev->mddev, rdev);
1779 err = 0;
1780 } else if (cmd_match(buf, "remove")) {
1781 if (rdev->raid_disk >= 0)
1782 err = -EBUSY;
1783 else {
1784 mddev_t *mddev = rdev->mddev;
1785 kick_rdev_from_array(rdev);
1786 md_update_sb(mddev);
1787 md_new_event(mddev);
1788 err = 0;
1789 }
1790 } else if (cmd_match(buf, "writemostly")) {
1791 set_bit(WriteMostly, &rdev->flags);
1792 err = 0;
1793 } else if (cmd_match(buf, "-writemostly")) {
1794 clear_bit(WriteMostly, &rdev->flags);
1795 err = 0;
1796 }
1797 return err ? err : len;
1798}
1799static struct rdev_sysfs_entry rdev_state =
1800__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
1801
1802static ssize_t
1803super_show(mdk_rdev_t *rdev, char *page)
1804{
1805 if (rdev->sb_loaded && rdev->sb_size) {
1806 memcpy(page, page_address(rdev->sb_page), rdev->sb_size);
1807 return rdev->sb_size;
1808 } else
1809 return 0;
1810}
1811static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
1812
1813static ssize_t
1814errors_show(mdk_rdev_t *rdev, char *page)
1815{
1816 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
1817}
1818
1819static ssize_t
1820errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1821{
1822 char *e;
1823 unsigned long n = simple_strtoul(buf, &e, 10);
1824 if (*buf && (*e == 0 || *e == '\n')) {
1825 atomic_set(&rdev->corrected_errors, n);
1826 return len;
1827 }
1828 return -EINVAL;
1829}
1830static struct rdev_sysfs_entry rdev_errors =
1831__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
1832
1833static ssize_t
1834slot_show(mdk_rdev_t *rdev, char *page)
1835{
1836 if (rdev->raid_disk < 0)
1837 return sprintf(page, "none\n");
1838 else
1839 return sprintf(page, "%d\n", rdev->raid_disk);
1840}
1841
1842static ssize_t
1843slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1844{
1845 char *e;
1846 int slot = simple_strtoul(buf, &e, 10);
1847 if (strncmp(buf, "none", 4)==0)
1848 slot = -1;
1849 else if (e==buf || (*e && *e!= '\n'))
1850 return -EINVAL;
1851 if (rdev->mddev->pers)
1852
1853 return -EBUSY;
1854 if (slot >= rdev->mddev->raid_disks)
1855 return -ENOSPC;
1856 rdev->raid_disk = slot;
1857
1858 rdev->flags = 0;
1859 set_bit(In_sync, &rdev->flags);
1860 return len;
1861}
1862
1863
1864static struct rdev_sysfs_entry rdev_slot =
1865__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
1866
1867static ssize_t
1868offset_show(mdk_rdev_t *rdev, char *page)
1869{
1870 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
1871}
1872
1873static ssize_t
1874offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1875{
1876 char *e;
1877 unsigned long long offset = simple_strtoull(buf, &e, 10);
1878 if (e==buf || (*e && *e != '\n'))
1879 return -EINVAL;
1880 if (rdev->mddev->pers)
1881 return -EBUSY;
1882 rdev->data_offset = offset;
1883 return len;
1884}
1885
1886static struct rdev_sysfs_entry rdev_offset =
1887__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
1888
1889static ssize_t
1890rdev_size_show(mdk_rdev_t *rdev, char *page)
1891{
1892 return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
1893}
1894
1895static ssize_t
1896rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1897{
1898 char *e;
1899 unsigned long long size = simple_strtoull(buf, &e, 10);
1900 if (e==buf || (*e && *e != '\n'))
1901 return -EINVAL;
1902 if (rdev->mddev->pers)
1903 return -EBUSY;
1904 rdev->size = size;
1905 if (size < rdev->mddev->size || rdev->mddev->size == 0)
1906 rdev->mddev->size = size;
1907 return len;
1908}
1909
1910static struct rdev_sysfs_entry rdev_size =
1911__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
1912
1913static struct attribute *rdev_default_attrs[] = {
1914 &rdev_state.attr,
1915 &rdev_super.attr,
1916 &rdev_errors.attr,
1917 &rdev_slot.attr,
1918 &rdev_offset.attr,
1919 &rdev_size.attr,
1920 NULL,
1921};
1922static ssize_t
1923rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
1924{
1925 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
1926 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
1927
1928 if (!entry->show)
1929 return -EIO;
1930 return entry->show(rdev, page);
1931}
1932
1933static ssize_t
1934rdev_attr_store(struct kobject *kobj, struct attribute *attr,
1935 const char *page, size_t length)
1936{
1937 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
1938 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
1939
1940 if (!entry->store)
1941 return -EIO;
1942 if (!capable(CAP_SYS_ADMIN))
1943 return -EACCES;
1944 return entry->store(rdev, page, length);
1945}
1946
1947static void rdev_free(struct kobject *ko)
1948{
1949 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
1950 kfree(rdev);
1951}
1952static struct sysfs_ops rdev_sysfs_ops = {
1953 .show = rdev_attr_show,
1954 .store = rdev_attr_store,
1955};
1956static struct kobj_type rdev_ktype = {
1957 .release = rdev_free,
1958 .sysfs_ops = &rdev_sysfs_ops,
1959 .default_attrs = rdev_default_attrs,
1960};
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
1973{
1974 char b[BDEVNAME_SIZE];
1975 int err;
1976 mdk_rdev_t *rdev;
1977 sector_t size;
1978
1979 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
1980 if (!rdev) {
1981 printk(KERN_ERR "md: could not alloc mem for new device!\n");
1982 return ERR_PTR(-ENOMEM);
1983 }
1984
1985 if ((err = alloc_disk_sb(rdev)))
1986 goto abort_free;
1987
1988 err = lock_rdev(rdev, newdev);
1989 if (err)
1990 goto abort_free;
1991
1992 rdev->kobj.parent = NULL;
1993 rdev->kobj.ktype = &rdev_ktype;
1994 kobject_init(&rdev->kobj);
1995
1996 rdev->desc_nr = -1;
1997 rdev->flags = 0;
1998 rdev->data_offset = 0;
1999 rdev->sb_events = 0;
2000 atomic_set(&rdev->nr_pending, 0);
2001 atomic_set(&rdev->read_errors, 0);
2002 atomic_set(&rdev->corrected_errors, 0);
2003
2004 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2005 if (!size) {
2006 printk(KERN_WARNING
2007 "md: %s has zero or unknown size, marking faulty!\n",
2008 bdevname(rdev->bdev,b));
2009 err = -EINVAL;
2010 goto abort_free;
2011 }
2012
2013 if (super_format >= 0) {
2014 err = super_types[super_format].
2015 load_super(rdev, NULL, super_minor);
2016 if (err == -EINVAL) {
2017 printk(KERN_WARNING
2018 "md: %s has invalid sb, not importing!\n",
2019 bdevname(rdev->bdev,b));
2020 goto abort_free;
2021 }
2022 if (err < 0) {
2023 printk(KERN_WARNING
2024 "md: could not read %s's sb, not importing!\n",
2025 bdevname(rdev->bdev,b));
2026 goto abort_free;
2027 }
2028 }
2029 INIT_LIST_HEAD(&rdev->same_set);
2030
2031 return rdev;
2032
2033abort_free:
2034 if (rdev->sb_page) {
2035 if (rdev->bdev)
2036 unlock_rdev(rdev);
2037 free_disk_sb(rdev);
2038 }
2039 kfree(rdev);
2040 return ERR_PTR(err);
2041}
2042
2043
2044
2045
2046
2047
2048static void analyze_sbs(mddev_t * mddev)
2049{
2050 int i;
2051 struct list_head *tmp;
2052 mdk_rdev_t *rdev, *freshest;
2053 char b[BDEVNAME_SIZE];
2054
2055 freshest = NULL;
2056 ITERATE_RDEV(mddev,rdev,tmp)
2057 switch (super_types[mddev->major_version].
2058 load_super(rdev, freshest, mddev->minor_version)) {
2059 case 1:
2060 freshest = rdev;
2061 break;
2062 case 0:
2063 break;
2064 default:
2065 printk( KERN_ERR \
2066 "md: fatal superblock inconsistency in %s"
2067 " -- removing from array\n",
2068 bdevname(rdev->bdev,b));
2069 kick_rdev_from_array(rdev);
2070 }
2071
2072
2073 super_types[mddev->major_version].
2074 validate_super(mddev, freshest);
2075
2076 i = 0;
2077 ITERATE_RDEV(mddev,rdev,tmp) {
2078 if (rdev != freshest)
2079 if (super_types[mddev->major_version].
2080 validate_super(mddev, rdev)) {
2081 printk(KERN_WARNING "md: kicking non-fresh %s"
2082 " from array!\n",
2083 bdevname(rdev->bdev,b));
2084 kick_rdev_from_array(rdev);
2085 continue;
2086 }
2087 if (mddev->level == LEVEL_MULTIPATH) {
2088 rdev->desc_nr = i++;
2089 rdev->raid_disk = rdev->desc_nr;
2090 set_bit(In_sync, &rdev->flags);
2091 }
2092 }
2093
2094
2095
2096 if (mddev->recovery_cp != MaxSector &&
2097 mddev->level >= 1)
2098 printk(KERN_ERR "md: %s: raid array is not clean"
2099 " -- starting background reconstruction\n",
2100 mdname(mddev));
2101
2102}
2103
2104static ssize_t
2105safe_delay_show(mddev_t *mddev, char *page)
2106{
2107 int msec = (mddev->safemode_delay*1000)/HZ;
2108 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2109}
2110static ssize_t
2111safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2112{
2113 int scale=1;
2114 int dot=0;
2115 int i;
2116 unsigned long msec;
2117 char buf[30];
2118 char *e;
2119
2120 if (len >= sizeof(buf))
2121 return -EINVAL;
2122 strlcpy(buf, cbuf, len);
2123 buf[len] = 0;
2124 for (i=0; i<len; i++) {
2125 if (dot) {
2126 if (isdigit(buf[i])) {
2127 buf[i-1] = buf[i];
2128 scale *= 10;
2129 }
2130 buf[i] = 0;
2131 } else if (buf[i] == '.') {
2132 dot=1;
2133 buf[i] = 0;
2134 }
2135 }
2136 msec = simple_strtoul(buf, &e, 10);
2137 if (e == buf || (*e && *e != '\n'))
2138 return -EINVAL;
2139 msec = (msec * 1000) / scale;
2140 if (msec == 0)
2141 mddev->safemode_delay = 0;
2142 else {
2143 mddev->safemode_delay = (msec*HZ)/1000;
2144 if (mddev->safemode_delay == 0)
2145 mddev->safemode_delay = 1;
2146 }
2147 return len;
2148}
2149static struct md_sysfs_entry md_safe_delay =
2150__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
2151
2152static ssize_t
2153level_show(mddev_t *mddev, char *page)
2154{
2155 struct mdk_personality *p = mddev->pers;
2156 if (p)
2157 return sprintf(page, "%s\n", p->name);
2158 else if (mddev->clevel[0])
2159 return sprintf(page, "%s\n", mddev->clevel);
2160 else if (mddev->level != LEVEL_NONE)
2161 return sprintf(page, "%d\n", mddev->level);
2162 else
2163 return 0;
2164}
2165
2166static ssize_t
2167level_store(mddev_t *mddev, const char *buf, size_t len)
2168{
2169 int rv = len;
2170 if (mddev->pers)
2171 return -EBUSY;
2172 if (len == 0)
2173 return 0;
2174 if (len >= sizeof(mddev->clevel))
2175 return -ENOSPC;
2176 strncpy(mddev->clevel, buf, len);
2177 if (mddev->clevel[len-1] == '\n')
2178 len--;
2179 mddev->clevel[len] = 0;
2180 mddev->level = LEVEL_NONE;
2181 return rv;
2182}
2183
2184static struct md_sysfs_entry md_level =
2185__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
2186
2187
2188static ssize_t
2189layout_show(mddev_t *mddev, char *page)
2190{
2191
2192 return sprintf(page, "%d\n", mddev->layout);
2193}
2194
2195static ssize_t
2196layout_store(mddev_t *mddev, const char *buf, size_t len)
2197{
2198 char *e;
2199 unsigned long n = simple_strtoul(buf, &e, 10);
2200 if (mddev->pers)
2201 return -EBUSY;
2202
2203 if (!*buf || (*e && *e != '\n'))
2204 return -EINVAL;
2205
2206 mddev->layout = n;
2207 return len;
2208}
2209static struct md_sysfs_entry md_layout =
2210__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
2211
2212
2213static ssize_t
2214raid_disks_show(mddev_t *mddev, char *page)
2215{
2216 if (mddev->raid_disks == 0)
2217 return 0;
2218 return sprintf(page, "%d\n", mddev->raid_disks);
2219}
2220
2221static int update_raid_disks(mddev_t *mddev, int raid_disks);
2222
2223static ssize_t
2224raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2225{
2226
2227 char *e;
2228 int rv = 0;
2229 unsigned long n = simple_strtoul(buf, &e, 10);
2230
2231 if (!*buf || (*e && *e != '\n'))
2232 return -EINVAL;
2233
2234 if (mddev->pers)
2235 rv = update_raid_disks(mddev, n);
2236 else
2237 mddev->raid_disks = n;
2238 return rv ? rv : len;
2239}
2240static struct md_sysfs_entry md_raid_disks =
2241__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
2242
2243static ssize_t
2244chunk_size_show(mddev_t *mddev, char *page)
2245{
2246 return sprintf(page, "%d\n", mddev->chunk_size);
2247}
2248
2249static ssize_t
2250chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2251{
2252
2253 char *e;
2254 unsigned long n = simple_strtoul(buf, &e, 10);
2255
2256 if (mddev->pers)
2257 return -EBUSY;
2258 if (!*buf || (*e && *e != '\n'))
2259 return -EINVAL;
2260
2261 mddev->chunk_size = n;
2262 return len;
2263}
2264static struct md_sysfs_entry md_chunk_size =
2265__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
2266
2267static ssize_t
2268resync_start_show(mddev_t *mddev, char *page)
2269{
2270 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2271}
2272
2273static ssize_t
2274resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2275{
2276
2277 char *e;
2278 unsigned long long n = simple_strtoull(buf, &e, 10);
2279
2280 if (mddev->pers)
2281 return -EBUSY;
2282 if (!*buf || (*e && *e != '\n'))
2283 return -EINVAL;
2284
2285 mddev->recovery_cp = n;
2286 return len;
2287}
2288static struct md_sysfs_entry md_resync_start =
2289__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
2328 write_pending, active_idle, bad_word};
2329static char *array_states[] = {
2330 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
2331 "write-pending", "active-idle", NULL };
2332
2333static int match_word(const char *word, char **list)
2334{
2335 int n;
2336 for (n=0; list[n]; n++)
2337 if (cmd_match(word, list[n]))
2338 break;
2339 return n;
2340}
2341
2342static ssize_t
2343array_state_show(mddev_t *mddev, char *page)
2344{
2345 enum array_state st = inactive;
2346
2347 if (mddev->pers)
2348 switch(mddev->ro) {
2349 case 1:
2350 st = readonly;
2351 break;
2352 case 2:
2353 st = read_auto;
2354 break;
2355 case 0:
2356 if (mddev->in_sync)
2357 st = clean;
2358 else if (mddev->safemode)
2359 st = active_idle;
2360 else
2361 st = active;
2362 }
2363 else {
2364 if (list_empty(&mddev->disks) &&
2365 mddev->raid_disks == 0 &&
2366 mddev->size == 0)
2367 st = clear;
2368 else
2369 st = inactive;
2370 }
2371 return sprintf(page, "%s\n", array_states[st]);
2372}
2373
2374static int do_md_stop(mddev_t * mddev, int ro);
2375static int do_md_run(mddev_t * mddev);
2376static int restart_array(mddev_t *mddev);
2377
2378static ssize_t
2379array_state_store(mddev_t *mddev, const char *buf, size_t len)
2380{
2381 int err = -EINVAL;
2382 enum array_state st = match_word(buf, array_states);
2383 switch(st) {
2384 case bad_word:
2385 break;
2386 case clear:
2387
2388 if (mddev->pers) {
2389 if (atomic_read(&mddev->active) > 1)
2390 return -EBUSY;
2391 err = do_md_stop(mddev, 0);
2392 }
2393 break;
2394 case inactive:
2395
2396 if (mddev->pers) {
2397 if (atomic_read(&mddev->active) > 1)
2398 return -EBUSY;
2399 err = do_md_stop(mddev, 2);
2400 }
2401 break;
2402 case suspended:
2403 break;
2404 case readonly:
2405 if (mddev->pers)
2406 err = do_md_stop(mddev, 1);
2407 else {
2408 mddev->ro = 1;
2409 err = do_md_run(mddev);
2410 }
2411 break;
2412 case read_auto:
2413
2414 if (mddev->pers) {
2415 err = do_md_stop(mddev, 1);
2416 if (err == 0)
2417 mddev->ro = 2;
2418 } else {
2419 mddev->ro = 2;
2420 err = do_md_run(mddev);
2421 }
2422 break;
2423 case clean:
2424 if (mddev->pers) {
2425 restart_array(mddev);
2426 spin_lock_irq(&mddev->write_lock);
2427 if (atomic_read(&mddev->writes_pending) == 0) {
2428 mddev->in_sync = 1;
2429 mddev->sb_dirty = 1;
2430 }
2431 spin_unlock_irq(&mddev->write_lock);
2432 } else {
2433 mddev->ro = 0;
2434 mddev->recovery_cp = MaxSector;
2435 err = do_md_run(mddev);
2436 }
2437 break;
2438 case active:
2439 if (mddev->pers) {
2440 restart_array(mddev);
2441 mddev->sb_dirty = 0;
2442 wake_up(&mddev->sb_wait);
2443 err = 0;
2444 } else {
2445 mddev->ro = 0;
2446 err = do_md_run(mddev);
2447 }
2448 break;
2449 case write_pending:
2450 case active_idle:
2451
2452 break;
2453 }
2454 if (err)
2455 return err;
2456 else
2457 return len;
2458}
2459static struct md_sysfs_entry md_array_state =
2460__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
2461
2462static ssize_t
2463null_show(mddev_t *mddev, char *page)
2464{
2465 return -EINVAL;
2466}
2467
2468static ssize_t
2469new_dev_store(mddev_t *mddev, const char *buf, size_t len)
2470{
2471
2472
2473
2474
2475
2476
2477
2478 char *e;
2479 int major = simple_strtoul(buf, &e, 10);
2480 int minor;
2481 dev_t dev;
2482 mdk_rdev_t *rdev;
2483 int err;
2484
2485 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
2486 return -EINVAL;
2487 minor = simple_strtoul(e+1, &e, 10);
2488 if (*e && *e != '\n')
2489 return -EINVAL;
2490 dev = MKDEV(major, minor);
2491 if (major != MAJOR(dev) ||
2492 minor != MINOR(dev))
2493 return -EOVERFLOW;
2494
2495
2496 if (mddev->persistent) {
2497 rdev = md_import_device(dev, mddev->major_version,
2498 mddev->minor_version);
2499 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
2500 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
2501 mdk_rdev_t, same_set);
2502 err = super_types[mddev->major_version]
2503 .load_super(rdev, rdev0, mddev->minor_version);
2504 if (err < 0)
2505 goto out;
2506 }
2507 } else
2508 rdev = md_import_device(dev, -1, -1);
2509
2510 if (IS_ERR(rdev))
2511 return PTR_ERR(rdev);
2512 err = bind_rdev_to_array(rdev, mddev);
2513 out:
2514 if (err)
2515 export_rdev(rdev);
2516 return err ? err : len;
2517}
2518
2519static struct md_sysfs_entry md_new_device =
2520__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
2521
2522static ssize_t
2523size_show(mddev_t *mddev, char *page)
2524{
2525 return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
2526}
2527
2528static int update_size(mddev_t *mddev, unsigned long size);
2529
2530static ssize_t
2531size_store(mddev_t *mddev, const char *buf, size_t len)
2532{
2533
2534
2535
2536
2537 char *e;
2538 int err = 0;
2539 unsigned long long size = simple_strtoull(buf, &e, 10);
2540 if (!*buf || *buf == '\n' ||
2541 (*e && *e != '\n'))
2542 return -EINVAL;
2543
2544 if (mddev->pers) {
2545 err = update_size(mddev, size);
2546 md_update_sb(mddev);
2547 } else {
2548 if (mddev->size == 0 ||
2549 mddev->size > size)
2550 mddev->size = size;
2551 else
2552 err = -ENOSPC;
2553 }
2554 return err ? err : len;
2555}
2556
2557static struct md_sysfs_entry md_size =
2558__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
2559
2560
2561
2562
2563
2564
2565static ssize_t
2566metadata_show(mddev_t *mddev, char *page)
2567{
2568 if (mddev->persistent)
2569 return sprintf(page, "%d.%d\n",
2570 mddev->major_version, mddev->minor_version);
2571 else
2572 return sprintf(page, "none\n");
2573}
2574
2575static ssize_t
2576metadata_store(mddev_t *mddev, const char *buf, size_t len)
2577{
2578 int major, minor;
2579 char *e;
2580 if (!list_empty(&mddev->disks))
2581 return -EBUSY;
2582
2583 if (cmd_match(buf, "none")) {
2584 mddev->persistent = 0;
2585 mddev->major_version = 0;
2586 mddev->minor_version = 90;
2587 return len;
2588 }
2589 major = simple_strtoul(buf, &e, 10);
2590 if (e==buf || *e != '.')
2591 return -EINVAL;
2592 buf = e+1;
2593 minor = simple_strtoul(buf, &e, 10);
2594 if (e==buf || *e != '\n')
2595 return -EINVAL;
2596 if (major >= sizeof(super_types)/sizeof(super_types[0]) ||
2597 super_types[major].name == NULL)
2598 return -ENOENT;
2599 mddev->major_version = major;
2600 mddev->minor_version = minor;
2601 mddev->persistent = 1;
2602 return len;
2603}
2604
2605static struct md_sysfs_entry md_metadata =
2606__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
2607
2608static ssize_t
2609action_show(mddev_t *mddev, char *page)
2610{
2611 char *type = "idle";
2612 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2613 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) {
2614 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2615 type = "reshape";
2616 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2617 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
2618 type = "resync";
2619 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2620 type = "check";
2621 else
2622 type = "repair";
2623 } else
2624 type = "recover";
2625 }
2626 return sprintf(page, "%s\n", type);
2627}
2628
2629static ssize_t
2630action_store(mddev_t *mddev, const char *page, size_t len)
2631{
2632 if (!mddev->pers || !mddev->pers->sync_request)
2633 return -EINVAL;
2634
2635 if (cmd_match(page, "idle")) {
2636 if (mddev->sync_thread) {
2637 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2638 md_unregister_thread(mddev->sync_thread);
2639 mddev->sync_thread = NULL;
2640 mddev->recovery = 0;
2641 }
2642 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2643 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
2644 return -EBUSY;
2645 else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
2646 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2647 else if (cmd_match(page, "reshape")) {
2648 int err;
2649 if (mddev->pers->start_reshape == NULL)
2650 return -EINVAL;
2651 err = mddev->pers->start_reshape(mddev);
2652 if (err)
2653 return err;
2654 } else {
2655 if (cmd_match(page, "check"))
2656 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
2657 else if (!cmd_match(page, "repair"))
2658 return -EINVAL;
2659 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
2660 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
2661 }
2662 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2663 md_wakeup_thread(mddev->thread);
2664 return len;
2665}
2666
2667static ssize_t
2668mismatch_cnt_show(mddev_t *mddev, char *page)
2669{
2670 return sprintf(page, "%llu\n",
2671 (unsigned long long) mddev->resync_mismatches);
2672}
2673
2674static struct md_sysfs_entry md_scan_mode =
2675__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
2676
2677
2678static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
2679
2680static ssize_t
2681sync_min_show(mddev_t *mddev, char *page)
2682{
2683 return sprintf(page, "%d (%s)\n", speed_min(mddev),
2684 mddev->sync_speed_min ? "local": "system");
2685}
2686
2687static ssize_t
2688sync_min_store(mddev_t *mddev, const char *buf, size_t len)
2689{
2690 int min;
2691 char *e;
2692 if (strncmp(buf, "system", 6)==0) {
2693 mddev->sync_speed_min = 0;
2694 return len;
2695 }
2696 min = simple_strtoul(buf, &e, 10);
2697 if (buf == e || (*e && *e != '\n') || min <= 0)
2698 return -EINVAL;
2699 mddev->sync_speed_min = min;
2700 return len;
2701}
2702
2703static struct md_sysfs_entry md_sync_min =
2704__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
2705
2706static ssize_t
2707sync_max_show(mddev_t *mddev, char *page)
2708{
2709 return sprintf(page, "%d (%s)\n", speed_max(mddev),
2710 mddev->sync_speed_max ? "local": "system");
2711}
2712
2713static ssize_t
2714sync_max_store(mddev_t *mddev, const char *buf, size_t len)
2715{
2716 int max;
2717 char *e;
2718 if (strncmp(buf, "system", 6)==0) {
2719 mddev->sync_speed_max = 0;
2720 return len;
2721 }
2722 max = simple_strtoul(buf, &e, 10);
2723 if (buf == e || (*e && *e != '\n') || max <= 0)
2724 return -EINVAL;
2725 mddev->sync_speed_max = max;
2726 return len;
2727}
2728
2729static struct md_sysfs_entry md_sync_max =
2730__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
2731
2732
2733static ssize_t
2734sync_speed_show(mddev_t *mddev, char *page)
2735{
2736 unsigned long resync, dt, db;
2737 resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
2738 dt = ((jiffies - mddev->resync_mark) / HZ);
2739 if (!dt) dt++;
2740 db = resync - (mddev->resync_mark_cnt);
2741 return sprintf(page, "%ld\n", db/dt/2);
2742}
2743
2744static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
2745
2746static ssize_t
2747sync_completed_show(mddev_t *mddev, char *page)
2748{
2749 unsigned long max_blocks, resync;
2750
2751 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2752 max_blocks = mddev->resync_max_sectors;
2753 else
2754 max_blocks = mddev->size << 1;
2755
2756 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
2757 return sprintf(page, "%lu / %lu\n", resync, max_blocks);
2758}
2759
2760static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
2761
2762static ssize_t
2763suspend_lo_show(mddev_t *mddev, char *page)
2764{
2765 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
2766}
2767
2768static ssize_t
2769suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
2770{
2771 char *e;
2772 unsigned long long new = simple_strtoull(buf, &e, 10);
2773
2774 if (mddev->pers->quiesce == NULL)
2775 return -EINVAL;
2776 if (buf == e || (*e && *e != '\n'))
2777 return -EINVAL;
2778 if (new >= mddev->suspend_hi ||
2779 (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
2780 mddev->suspend_lo = new;
2781 mddev->pers->quiesce(mddev, 2);
2782 return len;
2783 } else
2784 return -EINVAL;
2785}
2786static struct md_sysfs_entry md_suspend_lo =
2787__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
2788
2789
2790static ssize_t
2791suspend_hi_show(mddev_t *mddev, char *page)
2792{
2793 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
2794}
2795
2796static ssize_t
2797suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
2798{
2799 char *e;
2800 unsigned long long new = simple_strtoull(buf, &e, 10);
2801
2802 if (mddev->pers->quiesce == NULL)
2803 return -EINVAL;
2804 if (buf == e || (*e && *e != '\n'))
2805 return -EINVAL;
2806 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
2807 (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
2808 mddev->suspend_hi = new;
2809 mddev->pers->quiesce(mddev, 1);
2810 mddev->pers->quiesce(mddev, 0);
2811 return len;
2812 } else
2813 return -EINVAL;
2814}
2815static struct md_sysfs_entry md_suspend_hi =
2816__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
2817
2818
2819static struct attribute *md_default_attrs[] = {
2820 &md_level.attr,
2821 &md_layout.attr,
2822 &md_raid_disks.attr,
2823 &md_chunk_size.attr,
2824 &md_size.attr,
2825 &md_resync_start.attr,
2826 &md_metadata.attr,
2827 &md_new_device.attr,
2828 &md_safe_delay.attr,
2829 &md_array_state.attr,
2830 NULL,
2831};
2832
2833static struct attribute *md_redundancy_attrs[] = {
2834 &md_scan_mode.attr,
2835 &md_mismatches.attr,
2836 &md_sync_min.attr,
2837 &md_sync_max.attr,
2838 &md_sync_speed.attr,
2839 &md_sync_completed.attr,
2840 &md_suspend_lo.attr,
2841 &md_suspend_hi.attr,
2842 NULL,
2843};
2844static struct attribute_group md_redundancy_group = {
2845 .name = NULL,
2846 .attrs = md_redundancy_attrs,
2847};
2848
2849
2850static ssize_t
2851md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2852{
2853 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
2854 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
2855 ssize_t rv;
2856
2857 if (!entry->show)
2858 return -EIO;
2859 rv = mddev_lock(mddev);
2860 if (!rv) {
2861 rv = entry->show(mddev, page);
2862 mddev_unlock(mddev);
2863 }
2864 return rv;
2865}
2866
2867static ssize_t
2868md_attr_store(struct kobject *kobj, struct attribute *attr,
2869 const char *page, size_t length)
2870{
2871 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
2872 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
2873 ssize_t rv;
2874
2875 if (!entry->store)
2876 return -EIO;
2877 if (!capable(CAP_SYS_ADMIN))
2878 return -EACCES;
2879 rv = mddev_lock(mddev);
2880 if (!rv) {
2881 rv = entry->store(mddev, page, length);
2882 mddev_unlock(mddev);
2883 }
2884 return rv;
2885}
2886
2887static void md_free(struct kobject *ko)
2888{
2889 mddev_t *mddev = container_of(ko, mddev_t, kobj);
2890 kfree(mddev);
2891}
2892
2893static struct sysfs_ops md_sysfs_ops = {
2894 .show = md_attr_show,
2895 .store = md_attr_store,
2896};
2897static struct kobj_type md_ktype = {
2898 .release = md_free,
2899 .sysfs_ops = &md_sysfs_ops,
2900 .default_attrs = md_default_attrs,
2901};
2902
2903int mdp_major = 0;
2904
2905static struct kobject *md_probe(dev_t dev, int *part, void *data)
2906{
2907 static DEFINE_MUTEX(disks_mutex);
2908 mddev_t *mddev = mddev_find(dev);
2909 struct gendisk *disk;
2910 int partitioned = (MAJOR(dev) != MD_MAJOR);
2911 int shift = partitioned ? MdpMinorShift : 0;
2912 int unit = MINOR(dev) >> shift;
2913
2914 if (!mddev)
2915 return NULL;
2916
2917 mutex_lock(&disks_mutex);
2918 if (mddev->gendisk) {
2919 mutex_unlock(&disks_mutex);
2920 mddev_put(mddev);
2921 return NULL;
2922 }
2923 disk = alloc_disk(1 << shift);
2924 if (!disk) {
2925 mutex_unlock(&disks_mutex);
2926 mddev_put(mddev);
2927 return NULL;
2928 }
2929 disk->major = MAJOR(dev);
2930 disk->first_minor = unit << shift;
2931 if (partitioned)
2932 sprintf(disk->disk_name, "md_d%d", unit);
2933 else
2934 sprintf(disk->disk_name, "md%d", unit);
2935 disk->fops = &md_fops;
2936 disk->private_data = mddev;
2937 disk->queue = mddev->queue;
2938 add_disk(disk);
2939 mddev->gendisk = disk;
2940 mutex_unlock(&disks_mutex);
2941 mddev->kobj.parent = &disk->kobj;
2942 mddev->kobj.k_name = NULL;
2943 snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md");
2944 mddev->kobj.ktype = &md_ktype;
2945 kobject_register(&mddev->kobj);
2946 return NULL;
2947}
2948
2949static void md_safemode_timeout(unsigned long data)
2950{
2951 mddev_t *mddev = (mddev_t *) data;
2952
2953 mddev->safemode = 1;
2954 md_wakeup_thread(mddev->thread);
2955}
2956
2957static int start_dirty_degraded;
2958
2959static int do_md_run(mddev_t * mddev)
2960{
2961 int err;
2962 int chunk_size;
2963 struct list_head *tmp;
2964 mdk_rdev_t *rdev;
2965 struct gendisk *disk;
2966 struct mdk_personality *pers;
2967 char b[BDEVNAME_SIZE];
2968
2969 if (list_empty(&mddev->disks))
2970
2971 return -EINVAL;
2972
2973 if (mddev->pers)
2974 return -EBUSY;
2975
2976
2977
2978
2979 if (!mddev->raid_disks)
2980 analyze_sbs(mddev);
2981
2982 chunk_size = mddev->chunk_size;
2983
2984 if (chunk_size) {
2985 if (chunk_size > MAX_CHUNK_SIZE) {
2986 printk(KERN_ERR "too big chunk_size: %d > %d\n",
2987 chunk_size, MAX_CHUNK_SIZE);
2988 return -EINVAL;
2989 }
2990
2991
2992
2993 if ( (1 << ffz(~chunk_size)) != chunk_size) {
2994 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
2995 return -EINVAL;
2996 }
2997 if (chunk_size < PAGE_SIZE) {
2998 printk(KERN_ERR "too small chunk_size: %d < %ld\n",
2999 chunk_size, PAGE_SIZE);
3000 return -EINVAL;
3001 }
3002
3003
3004 ITERATE_RDEV(mddev,rdev,tmp) {
3005 if (test_bit(Faulty, &rdev->flags))
3006 continue;
3007 if (rdev->size < chunk_size / 1024) {
3008 printk(KERN_WARNING
3009 "md: Dev %s smaller than chunk_size:"
3010 " %lluk < %dk\n",
3011 bdevname(rdev->bdev,b),
3012 (unsigned long long)rdev->size,
3013 chunk_size / 1024);
3014 return -EINVAL;
3015 }
3016 }
3017 }
3018
3019#ifdef CONFIG_KMOD
3020 if (mddev->level != LEVEL_NONE)
3021 request_module("md-level-%d", mddev->level);
3022 else if (mddev->clevel[0])
3023 request_module("md-%s", mddev->clevel);
3024#endif
3025
3026
3027
3028
3029
3030
3031
3032 ITERATE_RDEV(mddev,rdev,tmp) {
3033 if (test_bit(Faulty, &rdev->flags))
3034 continue;
3035 sync_blockdev(rdev->bdev);
3036 invalidate_bdev(rdev->bdev, 0);
3037 }
3038
3039 md_probe(mddev->unit, NULL, NULL);
3040 disk = mddev->gendisk;
3041 if (!disk)
3042 return -ENOMEM;
3043
3044 spin_lock(&pers_lock);
3045 pers = find_pers(mddev->level, mddev->clevel);
3046 if (!pers || !try_module_get(pers->owner)) {
3047 spin_unlock(&pers_lock);
3048 if (mddev->level != LEVEL_NONE)
3049 printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
3050 mddev->level);
3051 else
3052 printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
3053 mddev->clevel);
3054 return -EINVAL;
3055 }
3056 mddev->pers = pers;
3057 spin_unlock(&pers_lock);
3058 mddev->level = pers->level;
3059 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3060
3061 if (mddev->reshape_position != MaxSector &&
3062 pers->start_reshape == NULL) {
3063
3064 mddev->pers = NULL;
3065 module_put(pers->owner);
3066 return -EINVAL;
3067 }
3068
3069 mddev->recovery = 0;
3070 mddev->resync_max_sectors = mddev->size << 1;
3071 mddev->barriers_work = 1;
3072 mddev->ok_start_degraded = start_dirty_degraded;
3073
3074 if (start_readonly)
3075 mddev->ro = 2;
3076
3077 err = mddev->pers->run(mddev);
3078 if (!err && mddev->pers->sync_request) {
3079 err = bitmap_create(mddev);
3080 if (err) {
3081 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
3082 mdname(mddev), err);
3083 mddev->pers->stop(mddev);
3084 }
3085 }
3086 if (err) {
3087 printk(KERN_ERR "md: pers->run() failed ...\n");
3088 module_put(mddev->pers->owner);
3089 mddev->pers = NULL;
3090 bitmap_destroy(mddev);
3091 return err;
3092 }
3093 if (mddev->pers->sync_request)
3094 sysfs_create_group(&mddev->kobj, &md_redundancy_group);
3095 else if (mddev->ro == 2)
3096 mddev->ro = 0;
3097
3098 atomic_set(&mddev->writes_pending,0);
3099 mddev->safemode = 0;
3100 mddev->safemode_timer.function = md_safemode_timeout;
3101 mddev->safemode_timer.data = (unsigned long) mddev;
3102 mddev->safemode_delay = (200 * HZ)/1000 +1;
3103 mddev->in_sync = 1;
3104
3105 ITERATE_RDEV(mddev,rdev,tmp)
3106 if (rdev->raid_disk >= 0) {
3107 char nm[20];
3108 sprintf(nm, "rd%d", rdev->raid_disk);
3109 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
3110 }
3111
3112 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3113
3114 if (mddev->sb_dirty)
3115 md_update_sb(mddev);
3116
3117 set_capacity(disk, mddev->array_size<<1);
3118
3119
3120
3121
3122
3123
3124
3125
3126 mddev->queue->queuedata = mddev;
3127 mddev->queue->make_request_fn = mddev->pers->make_request;
3128
3129
3130
3131
3132
3133 if (mddev->degraded && !mddev->sync_thread) {
3134 struct list_head *rtmp;
3135 int spares = 0;
3136 ITERATE_RDEV(mddev,rdev,rtmp)
3137 if (rdev->raid_disk >= 0 &&
3138 !test_bit(In_sync, &rdev->flags) &&
3139 !test_bit(Faulty, &rdev->flags))
3140
3141 spares++;
3142 if (spares && mddev->pers->sync_request) {
3143 mddev->recovery = 0;
3144 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3145 mddev->sync_thread = md_register_thread(md_do_sync,
3146 mddev,
3147 "%s_resync");
3148 if (!mddev->sync_thread) {
3149 printk(KERN_ERR "%s: could not start resync"
3150 " thread...\n",
3151 mdname(mddev));
3152
3153 mddev->recovery = 0;
3154 }
3155 }
3156 }
3157 md_wakeup_thread(mddev->thread);
3158 md_wakeup_thread(mddev->sync_thread);
3159
3160 mddev->changed = 1;
3161 md_new_event(mddev);
3162 return 0;
3163}
3164
3165static int restart_array(mddev_t *mddev)
3166{
3167 struct gendisk *disk = mddev->gendisk;
3168 int err;
3169
3170
3171
3172
3173 err = -ENXIO;
3174 if (list_empty(&mddev->disks))
3175 goto out;
3176
3177 if (mddev->pers) {
3178 err = -EBUSY;
3179 if (!mddev->ro)
3180 goto out;
3181
3182 mddev->safemode = 0;
3183 mddev->ro = 0;
3184 set_disk_ro(disk, 0);
3185
3186 printk(KERN_INFO "md: %s switched to read-write mode.\n",
3187 mdname(mddev));
3188
3189
3190
3191 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3192 md_wakeup_thread(mddev->thread);
3193 md_wakeup_thread(mddev->sync_thread);
3194 err = 0;
3195 } else
3196 err = -EINVAL;
3197
3198out:
3199 return err;
3200}
3201
3202
3203
3204static int deny_bitmap_write_access(struct file * file)
3205{
3206 struct inode *inode = file->f_mapping->host;
3207
3208 spin_lock(&inode->i_lock);
3209 if (atomic_read(&inode->i_writecount) > 1) {
3210 spin_unlock(&inode->i_lock);
3211 return -ETXTBSY;
3212 }
3213 atomic_set(&inode->i_writecount, -1);
3214 spin_unlock(&inode->i_lock);
3215
3216 return 0;
3217}
3218
3219static void restore_bitmap_write_access(struct file *file)
3220{
3221 struct inode *inode = file->f_mapping->host;
3222
3223 spin_lock(&inode->i_lock);
3224 atomic_set(&inode->i_writecount, 1);
3225 spin_unlock(&inode->i_lock);
3226}
3227
3228
3229
3230
3231
3232
3233static int do_md_stop(mddev_t * mddev, int mode)
3234{
3235 int err = 0;
3236 struct gendisk *disk = mddev->gendisk;
3237
3238 if (mddev->pers) {
3239 if (atomic_read(&mddev->active)>2) {
3240 printk("md: %s still in use.\n",mdname(mddev));
3241 return -EBUSY;
3242 }
3243
3244 if (mddev->sync_thread) {
3245 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3246 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3247 md_unregister_thread(mddev->sync_thread);
3248 mddev->sync_thread = NULL;
3249 }
3250
3251 del_timer_sync(&mddev->safemode_timer);
3252
3253 invalidate_partition(disk, 0);
3254
3255 switch(mode) {
3256 case 1:
3257 err = -ENXIO;
3258 if (mddev->ro==1)
3259 goto out;
3260 mddev->ro = 1;
3261 break;
3262 case 0:
3263 case 2:
3264 bitmap_flush(mddev);
3265 md_super_wait(mddev);
3266 if (mddev->ro)
3267 set_disk_ro(disk, 0);
3268 blk_queue_make_request(mddev->queue, md_fail_request);
3269 mddev->pers->stop(mddev);
3270 if (mddev->pers->sync_request)
3271 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
3272
3273 module_put(mddev->pers->owner);
3274 mddev->pers = NULL;
3275 if (mddev->ro)
3276 mddev->ro = 0;
3277 }
3278 if (!mddev->in_sync || mddev->sb_dirty) {
3279
3280 mddev->in_sync = 1;
3281 md_update_sb(mddev);
3282 }
3283 if (mode == 1)
3284 set_disk_ro(disk, 1);
3285 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3286 }
3287
3288
3289
3290
3291 if (mode == 0) {
3292 mdk_rdev_t *rdev;
3293 struct list_head *tmp;
3294 struct gendisk *disk;
3295 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
3296
3297 bitmap_destroy(mddev);
3298 if (mddev->bitmap_file) {
3299 restore_bitmap_write_access(mddev->bitmap_file);
3300 fput(mddev->bitmap_file);
3301 mddev->bitmap_file = NULL;
3302 }
3303 mddev->bitmap_offset = 0;
3304
3305 ITERATE_RDEV(mddev,rdev,tmp)
3306 if (rdev->raid_disk >= 0) {
3307 char nm[20];
3308 sprintf(nm, "rd%d", rdev->raid_disk);
3309 sysfs_remove_link(&mddev->kobj, nm);
3310 }
3311
3312 export_array(mddev);
3313
3314 mddev->array_size = 0;
3315 mddev->size = 0;
3316 mddev->raid_disks = 0;
3317 mddev->recovery_cp = 0;
3318
3319 disk = mddev->gendisk;
3320 if (disk)
3321 set_capacity(disk, 0);
3322 mddev->changed = 1;
3323 } else if (mddev->pers)
3324 printk(KERN_INFO "md: %s switched to read-only mode.\n",
3325 mdname(mddev));
3326 err = 0;
3327 md_new_event(mddev);
3328out:
3329 return err;
3330}
3331
3332static void autorun_array(mddev_t *mddev)
3333{
3334 mdk_rdev_t *rdev;
3335 struct list_head *tmp;
3336 int err;
3337
3338 if (list_empty(&mddev->disks))
3339 return;
3340
3341 printk(KERN_INFO "md: running: ");
3342
3343 ITERATE_RDEV(mddev,rdev,tmp) {
3344 char b[BDEVNAME_SIZE];
3345 printk("<%s>", bdevname(rdev->bdev,b));
3346 }
3347 printk("\n");
3348
3349 err = do_md_run (mddev);
3350 if (err) {
3351 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
3352 do_md_stop (mddev, 0);
3353 }
3354}
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368static void autorun_devices(int part)
3369{
3370 struct list_head *tmp;
3371 mdk_rdev_t *rdev0, *rdev;
3372 mddev_t *mddev;
3373 char b[BDEVNAME_SIZE];
3374
3375 printk(KERN_INFO "md: autorun ...\n");
3376 while (!list_empty(&pending_raid_disks)) {
3377 dev_t dev;
3378 LIST_HEAD(candidates);
3379 rdev0 = list_entry(pending_raid_disks.next,
3380 mdk_rdev_t, same_set);
3381
3382 printk(KERN_INFO "md: considering %s ...\n",
3383 bdevname(rdev0->bdev,b));
3384 INIT_LIST_HEAD(&candidates);
3385 ITERATE_RDEV_PENDING(rdev,tmp)
3386 if (super_90_load(rdev, rdev0, 0) >= 0) {
3387 printk(KERN_INFO "md: adding %s ...\n",
3388 bdevname(rdev->bdev,b));
3389 list_move(&rdev->same_set, &candidates);
3390 }
3391
3392
3393
3394
3395
3396 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) {
3397 printk(KERN_INFO "md: unit number in %s is bad: %d\n",
3398 bdevname(rdev0->bdev, b), rdev0->preferred_minor);
3399 break;
3400 }
3401 if (part)
3402 dev = MKDEV(mdp_major,
3403 rdev0->preferred_minor << MdpMinorShift);
3404 else
3405 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
3406
3407 md_probe(dev, NULL, NULL);
3408 mddev = mddev_find(dev);
3409 if (!mddev) {
3410 printk(KERN_ERR
3411 "md: cannot allocate memory for md drive.\n");
3412 break;
3413 }
3414 if (mddev_lock(mddev))
3415 printk(KERN_WARNING "md: %s locked, cannot run\n",
3416 mdname(mddev));
3417 else if (mddev->raid_disks || mddev->major_version
3418 || !list_empty(&mddev->disks)) {
3419 printk(KERN_WARNING
3420 "md: %s already running, cannot run %s\n",
3421 mdname(mddev), bdevname(rdev0->bdev,b));
3422 mddev_unlock(mddev);
3423 } else {
3424 printk(KERN_INFO "md: created %s\n", mdname(mddev));
3425 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
3426 list_del_init(&rdev->same_set);
3427 if (bind_rdev_to_array(rdev, mddev))
3428 export_rdev(rdev);
3429 }
3430 autorun_array(mddev);
3431 mddev_unlock(mddev);
3432 }
3433
3434
3435
3436 ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
3437 export_rdev(rdev);
3438 mddev_put(mddev);
3439 }
3440 printk(KERN_INFO "md: ... autorun DONE.\n");
3441}
3442
3443
3444
3445
3446
3447
3448static int autostart_array(dev_t startdev)
3449{
3450 char b[BDEVNAME_SIZE];
3451 int err = -EINVAL, i;
3452 mdp_super_t *sb = NULL;
3453 mdk_rdev_t *start_rdev = NULL, *rdev;
3454
3455 start_rdev = md_import_device(startdev, 0, 0);
3456 if (IS_ERR(start_rdev))
3457 return err;
3458
3459
3460
3461 sb = (mdp_super_t*)page_address(start_rdev->sb_page);
3462 if (sb->major_version != 0 ||
3463 sb->minor_version != 90 ) {
3464 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
3465 export_rdev(start_rdev);
3466 return err;
3467 }
3468
3469 if (test_bit(Faulty, &start_rdev->flags)) {
3470 printk(KERN_WARNING
3471 "md: can not autostart based on faulty %s!\n",
3472 bdevname(start_rdev->bdev,b));
3473 export_rdev(start_rdev);
3474 return err;
3475 }
3476 list_add(&start_rdev->same_set, &pending_raid_disks);
3477
3478 for (i = 0; i < MD_SB_DISKS; i++) {
3479 mdp_disk_t *desc = sb->disks + i;
3480 dev_t dev = MKDEV(desc->major, desc->minor);
3481
3482 if (!dev)
3483 continue;
3484 if (dev == startdev)
3485 continue;
3486 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor)
3487 continue;
3488 rdev = md_import_device(dev, 0, 0);
3489 if (IS_ERR(rdev))
3490 continue;
3491
3492 list_add(&rdev->same_set, &pending_raid_disks);
3493 }
3494
3495
3496
3497
3498 autorun_devices(0);
3499 return 0;
3500
3501}
3502
3503
3504static int get_version(void __user * arg)
3505{
3506 mdu_version_t ver;
3507
3508 ver.major = MD_MAJOR_VERSION;
3509 ver.minor = MD_MINOR_VERSION;
3510 ver.patchlevel = MD_PATCHLEVEL_VERSION;
3511
3512 if (copy_to_user(arg, &ver, sizeof(ver)))
3513 return -EFAULT;
3514
3515 return 0;
3516}
3517
3518static int get_array_info(mddev_t * mddev, void __user * arg)
3519{
3520 mdu_array_info_t info;
3521 int nr,working,active,failed,spare;
3522 mdk_rdev_t *rdev;
3523 struct list_head *tmp;
3524
3525 nr=working=active=failed=spare=0;
3526 ITERATE_RDEV(mddev,rdev,tmp) {
3527 nr++;
3528 if (test_bit(Faulty, &rdev->flags))
3529 failed++;
3530 else {
3531 working++;
3532 if (test_bit(In_sync, &rdev->flags))
3533 active++;
3534 else
3535 spare++;
3536 }
3537 }
3538
3539 info.major_version = mddev->major_version;
3540 info.minor_version = mddev->minor_version;
3541 info.patch_version = MD_PATCHLEVEL_VERSION;
3542 info.ctime = mddev->ctime;
3543 info.level = mddev->level;
3544 info.size = mddev->size;
3545 if (info.size != mddev->size)
3546 info.size = -1;
3547 info.nr_disks = nr;
3548 info.raid_disks = mddev->raid_disks;
3549 info.md_minor = mddev->md_minor;
3550 info.not_persistent= !mddev->persistent;
3551
3552 info.utime = mddev->utime;
3553 info.state = 0;
3554 if (mddev->in_sync)
3555 info.state = (1<<MD_SB_CLEAN);
3556 if (mddev->bitmap && mddev->bitmap_offset)
3557 info.state = (1<<MD_SB_BITMAP_PRESENT);
3558 info.active_disks = active;
3559 info.working_disks = working;
3560 info.failed_disks = failed;
3561 info.spare_disks = spare;
3562
3563 info.layout = mddev->layout;
3564 info.chunk_size = mddev->chunk_size;
3565
3566 if (copy_to_user(arg, &info, sizeof(info)))
3567 return -EFAULT;
3568
3569 return 0;
3570}
3571
3572static int get_bitmap_file(mddev_t * mddev, void __user * arg)
3573{
3574 mdu_bitmap_file_t *file = NULL;
3575 char *ptr, *buf = NULL;
3576 int err = -ENOMEM;
3577
3578 file = kmalloc(sizeof(*file), GFP_KERNEL);
3579 if (!file)
3580 goto out;
3581
3582
3583 if (!mddev->bitmap || !mddev->bitmap->file) {
3584 file->pathname[0] = '\0';
3585 goto copy_out;
3586 }
3587
3588 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
3589 if (!buf)
3590 goto out;
3591
3592 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
3593 if (!ptr)
3594 goto out;
3595
3596 strcpy(file->pathname, ptr);
3597
3598copy_out:
3599 err = 0;
3600 if (copy_to_user(arg, file, sizeof(*file)))
3601 err = -EFAULT;
3602out:
3603 kfree(buf);
3604 kfree(file);
3605 return err;
3606}
3607
3608static int get_disk_info(mddev_t * mddev, void __user * arg)
3609{
3610 mdu_disk_info_t info;
3611 unsigned int nr;
3612 mdk_rdev_t *rdev;
3613
3614 if (copy_from_user(&info, arg, sizeof(info)))
3615 return -EFAULT;
3616
3617 nr = info.number;
3618
3619 rdev = find_rdev_nr(mddev, nr);
3620 if (rdev) {
3621 info.major = MAJOR(rdev->bdev->bd_dev);
3622 info.minor = MINOR(rdev->bdev->bd_dev);
3623 info.raid_disk = rdev->raid_disk;
3624 info.state = 0;
3625 if (test_bit(Faulty, &rdev->flags))
3626 info.state |= (1<<MD_DISK_FAULTY);
3627 else if (test_bit(In_sync, &rdev->flags)) {
3628 info.state |= (1<<MD_DISK_ACTIVE);
3629 info.state |= (1<<MD_DISK_SYNC);
3630 }
3631 if (test_bit(WriteMostly, &rdev->flags))
3632 info.state |= (1<<MD_DISK_WRITEMOSTLY);
3633 } else {
3634 info.major = info.minor = 0;
3635 info.raid_disk = -1;
3636 info.state = (1<<MD_DISK_REMOVED);
3637 }
3638
3639 if (copy_to_user(arg, &info, sizeof(info)))
3640 return -EFAULT;
3641
3642 return 0;
3643}
3644
3645static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
3646{
3647 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3648 mdk_rdev_t *rdev;
3649 dev_t dev = MKDEV(info->major,info->minor);
3650
3651 if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
3652 return -EOVERFLOW;
3653
3654 if (!mddev->raid_disks) {
3655 int err;
3656
3657 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
3658 if (IS_ERR(rdev)) {
3659 printk(KERN_WARNING
3660 "md: md_import_device returned %ld\n",
3661 PTR_ERR(rdev));
3662 return PTR_ERR(rdev);
3663 }
3664 if (!list_empty(&mddev->disks)) {
3665 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
3666 mdk_rdev_t, same_set);
3667 int err = super_types[mddev->major_version]
3668 .load_super(rdev, rdev0, mddev->minor_version);
3669 if (err < 0) {
3670 printk(KERN_WARNING
3671 "md: %s has different UUID to %s\n",
3672 bdevname(rdev->bdev,b),
3673 bdevname(rdev0->bdev,b2));
3674 export_rdev(rdev);
3675 return -EINVAL;
3676 }
3677 }
3678 err = bind_rdev_to_array(rdev, mddev);
3679 if (err)
3680 export_rdev(rdev);
3681 return err;
3682 }
3683
3684
3685
3686
3687
3688
3689 if (mddev->pers) {
3690 int err;
3691 if (!mddev->pers->hot_add_disk) {
3692 printk(KERN_WARNING
3693 "%s: personality does not support diskops!\n",
3694 mdname(mddev));
3695 return -EINVAL;
3696 }
3697 if (mddev->persistent)
3698 rdev = md_import_device(dev, mddev->major_version,
3699 mddev->minor_version);
3700 else
3701 rdev = md_import_device(dev, -1, -1);
3702 if (IS_ERR(rdev)) {
3703 printk(KERN_WARNING
3704 "md: md_import_device returned %ld\n",
3705 PTR_ERR(rdev));
3706 return PTR_ERR(rdev);
3707 }
3708
3709 if (!mddev->persistent) {
3710 if (info->state & (1<<MD_DISK_SYNC) &&
3711 info->raid_disk < mddev->raid_disks)
3712 rdev->raid_disk = info->raid_disk;
3713 else
3714 rdev->raid_disk = -1;
3715 } else
3716 super_types[mddev->major_version].
3717 validate_super(mddev, rdev);
3718 rdev->saved_raid_disk = rdev->raid_disk;
3719
3720 clear_bit(In_sync, &rdev->flags);
3721 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
3722 set_bit(WriteMostly, &rdev->flags);
3723
3724 rdev->raid_disk = -1;
3725 err = bind_rdev_to_array(rdev, mddev);
3726 if (!err && !mddev->pers->hot_remove_disk) {
3727
3728
3729
3730
3731 super_types[mddev->major_version].
3732 validate_super(mddev, rdev);
3733 err = mddev->pers->hot_add_disk(mddev, rdev);
3734 if (err)
3735 unbind_rdev_from_array(rdev);
3736 }
3737 if (err)
3738 export_rdev(rdev);
3739
3740 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3741 md_wakeup_thread(mddev->thread);
3742 return err;
3743 }
3744
3745
3746
3747
3748 if (mddev->major_version != 0) {
3749 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
3750 mdname(mddev));
3751 return -EINVAL;
3752 }
3753
3754 if (!(info->state & (1<<MD_DISK_FAULTY))) {
3755 int err;
3756 rdev = md_import_device (dev, -1, 0);
3757 if (IS_ERR(rdev)) {
3758 printk(KERN_WARNING
3759 "md: error, md_import_device() returned %ld\n",
3760 PTR_ERR(rdev));
3761 return PTR_ERR(rdev);
3762 }
3763 rdev->desc_nr = info->number;
3764 if (info->raid_disk < mddev->raid_disks)
3765 rdev->raid_disk = info->raid_disk;
3766 else
3767 rdev->raid_disk = -1;
3768
3769 rdev->flags = 0;
3770
3771 if (rdev->raid_disk < mddev->raid_disks)
3772 if (info->state & (1<<MD_DISK_SYNC))
3773 set_bit(In_sync, &rdev->flags);
3774
3775 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
3776 set_bit(WriteMostly, &rdev->flags);
3777
3778 if (!mddev->persistent) {
3779 printk(KERN_INFO "md: nonpersistent superblock ...\n");
3780 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
3781 } else
3782 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
3783 rdev->size = calc_dev_size(rdev, mddev->chunk_size);
3784
3785 err = bind_rdev_to_array(rdev, mddev);
3786 if (err) {
3787 export_rdev(rdev);
3788 return err;
3789 }
3790 }
3791
3792 return 0;
3793}
3794
3795static int hot_remove_disk(mddev_t * mddev, dev_t dev)
3796{
3797 char b[BDEVNAME_SIZE];
3798 mdk_rdev_t *rdev;
3799
3800 if (!mddev->pers)
3801 return -ENODEV;
3802
3803 rdev = find_rdev(mddev, dev);
3804 if (!rdev)
3805 return -ENXIO;
3806
3807 if (rdev->raid_disk >= 0)
3808 goto busy;
3809
3810 kick_rdev_from_array(rdev);
3811 md_update_sb(mddev);
3812 md_new_event(mddev);
3813
3814 return 0;
3815busy:
3816 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n",
3817 bdevname(rdev->bdev,b), mdname(mddev));
3818 return -EBUSY;
3819}
3820
3821static int hot_add_disk(mddev_t * mddev, dev_t dev)
3822{
3823 char b[BDEVNAME_SIZE];
3824 int err;
3825 unsigned int size;
3826 mdk_rdev_t *rdev;
3827
3828 if (!mddev->pers)
3829 return -ENODEV;
3830
3831 if (mddev->major_version != 0) {
3832 printk(KERN_WARNING "%s: HOT_ADD may only be used with"
3833 " version-0 superblocks.\n",
3834 mdname(mddev));
3835 return -EINVAL;
3836 }
3837 if (!mddev->pers->hot_add_disk) {
3838 printk(KERN_WARNING
3839 "%s: personality does not support diskops!\n",
3840 mdname(mddev));
3841 return -EINVAL;
3842 }
3843
3844 rdev = md_import_device (dev, -1, 0);
3845 if (IS_ERR(rdev)) {
3846 printk(KERN_WARNING
3847 "md: error, md_import_device() returned %ld\n",
3848 PTR_ERR(rdev));
3849 return -EINVAL;
3850 }
3851
3852 if (mddev->persistent)
3853 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
3854 else
3855 rdev->sb_offset =
3856 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
3857
3858 size = calc_dev_size(rdev, mddev->chunk_size);
3859 rdev->size = size;
3860
3861 if (test_bit(Faulty, &rdev->flags)) {
3862 printk(KERN_WARNING
3863 "md: can not hot-add faulty %s disk to %s!\n",
3864 bdevname(rdev->bdev,b), mdname(mddev));
3865 err = -EINVAL;
3866 goto abort_export;
3867 }
3868 clear_bit(In_sync, &rdev->flags);
3869 rdev->desc_nr = -1;
3870 err = bind_rdev_to_array(rdev, mddev);
3871 if (err)
3872 goto abort_export;
3873
3874
3875
3876
3877
3878
3879 if (rdev->desc_nr == mddev->max_disks) {
3880 printk(KERN_WARNING "%s: can not hot-add to full array!\n",
3881 mdname(mddev));
3882 err = -EBUSY;
3883 goto abort_unbind_export;
3884 }
3885
3886 rdev->raid_disk = -1;
3887
3888 md_update_sb(mddev);
3889
3890
3891
3892
3893
3894 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3895 md_wakeup_thread(mddev->thread);
3896 md_new_event(mddev);
3897 return 0;
3898
3899abort_unbind_export:
3900 unbind_rdev_from_array(rdev);
3901
3902abort_export:
3903 export_rdev(rdev);
3904 return err;
3905}
3906
3907static int set_bitmap_file(mddev_t *mddev, int fd)
3908{
3909 int err;
3910
3911 if (mddev->pers) {
3912 if (!mddev->pers->quiesce)
3913 return -EBUSY;
3914 if (mddev->recovery || mddev->sync_thread)
3915 return -EBUSY;
3916
3917 }
3918
3919
3920 if (fd >= 0) {
3921 if (mddev->bitmap)
3922 return -EEXIST;
3923 mddev->bitmap_file = fget(fd);
3924
3925 if (mddev->bitmap_file == NULL) {
3926 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
3927 mdname(mddev));
3928 return -EBADF;
3929 }
3930
3931 err = deny_bitmap_write_access(mddev->bitmap_file);
3932 if (err) {
3933 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
3934 mdname(mddev));
3935 fput(mddev->bitmap_file);
3936 mddev->bitmap_file = NULL;
3937 return err;
3938 }
3939 mddev->bitmap_offset = 0;
3940 } else if (mddev->bitmap == NULL)
3941 return -ENOENT;
3942 err = 0;
3943 if (mddev->pers) {
3944 mddev->pers->quiesce(mddev, 1);
3945 if (fd >= 0)
3946 err = bitmap_create(mddev);
3947 if (fd < 0 || err) {
3948 bitmap_destroy(mddev);
3949 fd = -1;
3950 }
3951 mddev->pers->quiesce(mddev, 0);
3952 }
3953 if (fd < 0) {
3954 if (mddev->bitmap_file) {
3955 restore_bitmap_write_access(mddev->bitmap_file);
3956 fput(mddev->bitmap_file);
3957 }
3958 mddev->bitmap_file = NULL;
3959 }
3960
3961 return err;
3962}
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
3978{
3979
3980 if (info->raid_disks == 0) {
3981
3982 if (info->major_version < 0 ||
3983 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
3984 super_types[info->major_version].name == NULL) {
3985
3986 printk(KERN_INFO
3987 "md: superblock version %d not known\n",
3988 info->major_version);
3989 return -EINVAL;
3990 }
3991 mddev->major_version = info->major_version;
3992 mddev->minor_version = info->minor_version;
3993 mddev->patch_version = info->patch_version;
3994 return 0;
3995 }
3996 mddev->major_version = MD_MAJOR_VERSION;
3997 mddev->minor_version = MD_MINOR_VERSION;
3998 mddev->patch_version = MD_PATCHLEVEL_VERSION;
3999 mddev->ctime = get_seconds();
4000
4001 mddev->level = info->level;
4002 mddev->clevel[0] = 0;
4003 mddev->size = info->size;
4004 mddev->raid_disks = info->raid_disks;
4005
4006
4007
4008 if (info->state & (1<<MD_SB_CLEAN))
4009 mddev->recovery_cp = MaxSector;
4010 else
4011 mddev->recovery_cp = 0;
4012 mddev->persistent = ! info->not_persistent;
4013
4014 mddev->layout = info->layout;
4015 mddev->chunk_size = info->chunk_size;
4016
4017 mddev->max_disks = MD_SB_DISKS;
4018
4019 mddev->sb_dirty = 1;
4020
4021 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
4022 mddev->bitmap_offset = 0;
4023
4024 mddev->reshape_position = MaxSector;
4025
4026
4027
4028
4029 get_random_bytes(mddev->uuid, 16);
4030
4031 mddev->new_level = mddev->level;
4032 mddev->new_chunk = mddev->chunk_size;
4033 mddev->new_layout = mddev->layout;
4034 mddev->delta_disks = 0;
4035
4036 return 0;
4037}
4038
4039static int update_size(mddev_t *mddev, unsigned long size)
4040{
4041 mdk_rdev_t * rdev;
4042 int rv;
4043 struct list_head *tmp;
4044 int fit = (size == 0);
4045
4046 if (mddev->pers->resize == NULL)
4047 return -EINVAL;
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058 if (mddev->sync_thread)
4059 return -EBUSY;
4060 ITERATE_RDEV(mddev,rdev,tmp) {
4061 sector_t avail;
4062 if (rdev->sb_offset > rdev->data_offset)
4063 avail = (rdev->sb_offset*2) - rdev->data_offset;
4064 else
4065 avail = get_capacity(rdev->bdev->bd_disk)
4066 - rdev->data_offset;
4067 if (fit && (size == 0 || size > avail/2))
4068 size = avail/2;
4069 if (avail < ((sector_t)size << 1))
4070 return -ENOSPC;
4071 }
4072 rv = mddev->pers->resize(mddev, (sector_t)size *2);
4073 if (!rv) {
4074 struct block_device *bdev;
4075
4076 bdev = bdget_disk(mddev->gendisk, 0);
4077 if (bdev) {
4078 mutex_lock(&bdev->bd_inode->i_mutex);
4079 i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
4080 mutex_unlock(&bdev->bd_inode->i_mutex);
4081 bdput(bdev);
4082 }
4083 }
4084 return rv;
4085}
4086
4087static int update_raid_disks(mddev_t *mddev, int raid_disks)
4088{
4089 int rv;
4090
4091 if (mddev->pers->check_reshape == NULL)
4092 return -EINVAL;
4093 if (raid_disks <= 0 ||
4094 raid_disks >= mddev->max_disks)
4095 return -EINVAL;
4096 if (mddev->sync_thread || mddev->reshape_position != MaxSector)
4097 return -EBUSY;
4098 mddev->delta_disks = raid_disks - mddev->raid_disks;
4099
4100 rv = mddev->pers->check_reshape(mddev);
4101 return rv;
4102}
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
4114{
4115 int rv = 0;
4116 int cnt = 0;
4117 int state = 0;
4118
4119
4120 if (mddev->bitmap && mddev->bitmap_offset)
4121 state |= (1 << MD_SB_BITMAP_PRESENT);
4122
4123 if (mddev->major_version != info->major_version ||
4124 mddev->minor_version != info->minor_version ||
4125
4126 mddev->ctime != info->ctime ||
4127 mddev->level != info->level ||
4128
4129 !mddev->persistent != info->not_persistent||
4130 mddev->chunk_size != info->chunk_size ||
4131
4132 ((state^info->state) & 0xfffffe00)
4133 )
4134 return -EINVAL;
4135
4136 if (info->size >= 0 && mddev->size != info->size) cnt++;
4137 if (mddev->raid_disks != info->raid_disks) cnt++;
4138 if (mddev->layout != info->layout) cnt++;
4139 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
4140 if (cnt == 0) return 0;
4141 if (cnt > 1) return -EINVAL;
4142
4143 if (mddev->layout != info->layout) {
4144
4145
4146
4147
4148 if (mddev->pers->reconfig == NULL)
4149 return -EINVAL;
4150 else
4151 return mddev->pers->reconfig(mddev, info->layout, -1);
4152 }
4153 if (info->size >= 0 && mddev->size != info->size)
4154 rv = update_size(mddev, info->size);
4155
4156 if (mddev->raid_disks != info->raid_disks)
4157 rv = update_raid_disks(mddev, info->raid_disks);
4158
4159 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
4160 if (mddev->pers->quiesce == NULL)
4161 return -EINVAL;
4162 if (mddev->recovery || mddev->sync_thread)
4163 return -EBUSY;
4164 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
4165
4166 if (mddev->bitmap)
4167 return -EEXIST;
4168 if (mddev->default_bitmap_offset == 0)
4169 return -EINVAL;
4170 mddev->bitmap_offset = mddev->default_bitmap_offset;
4171 mddev->pers->quiesce(mddev, 1);
4172 rv = bitmap_create(mddev);
4173 if (rv)
4174 bitmap_destroy(mddev);
4175 mddev->pers->quiesce(mddev, 0);
4176 } else {
4177
4178 if (!mddev->bitmap)
4179 return -ENOENT;
4180 if (mddev->bitmap->file)
4181 return -EINVAL;
4182 mddev->pers->quiesce(mddev, 1);
4183 bitmap_destroy(mddev);
4184 mddev->pers->quiesce(mddev, 0);
4185 mddev->bitmap_offset = 0;
4186 }
4187 }
4188 md_update_sb(mddev);
4189 return rv;
4190}
4191
4192static int set_disk_faulty(mddev_t *mddev, dev_t dev)
4193{
4194 mdk_rdev_t *rdev;
4195
4196 if (mddev->pers == NULL)
4197 return -ENODEV;
4198
4199 rdev = find_rdev(mddev, dev);
4200 if (!rdev)
4201 return -ENODEV;
4202
4203 md_error(mddev, rdev);
4204 return 0;
4205}
4206
4207static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
4208{
4209 mddev_t *mddev = bdev->bd_disk->private_data;
4210
4211 geo->heads = 2;
4212 geo->sectors = 4;
4213 geo->cylinders = get_capacity(mddev->gendisk) / 8;
4214 return 0;
4215}
4216
4217static int md_ioctl(struct inode *inode, struct file *file,
4218 unsigned int cmd, unsigned long arg)
4219{
4220 int err = 0;
4221 void __user *argp = (void __user *)arg;
4222 mddev_t *mddev = NULL;
4223
4224 if (!capable(CAP_SYS_ADMIN))
4225 return -EACCES;
4226
4227
4228
4229
4230
4231 switch (cmd)
4232 {
4233 case RAID_VERSION:
4234 err = get_version(argp);
4235 goto done;
4236
4237 case PRINT_RAID_DEBUG:
4238 err = 0;
4239 md_print_devices();
4240 goto done;
4241
4242#ifndef MODULE
4243 case RAID_AUTORUN:
4244 err = 0;
4245 autostart_arrays(arg);
4246 goto done;
4247#endif
4248 default:;
4249 }
4250
4251
4252
4253
4254
4255 mddev = inode->i_bdev->bd_disk->private_data;
4256
4257 if (!mddev) {
4258 BUG();
4259 goto abort;
4260 }
4261
4262
4263 if (cmd == START_ARRAY) {
4264
4265
4266
4267 static int cnt = 3;
4268 if (cnt > 0 ) {
4269 printk(KERN_WARNING
4270 "md: %s(pid %d) used deprecated START_ARRAY ioctl. "
4271 "This will not be supported beyond July 2006\n",
4272 current->comm, current->pid);
4273 cnt--;
4274 }
4275 err = autostart_array(new_decode_dev(arg));
4276 if (err) {
4277 printk(KERN_WARNING "md: autostart failed!\n");
4278 goto abort;
4279 }
4280 goto done;
4281 }
4282
4283 err = mddev_lock(mddev);
4284 if (err) {
4285 printk(KERN_INFO
4286 "md: ioctl lock interrupted, reason %d, cmd %d\n",
4287 err, cmd);
4288 goto abort;
4289 }
4290
4291 switch (cmd)
4292 {
4293 case SET_ARRAY_INFO:
4294 {
4295 mdu_array_info_t info;
4296 if (!arg)
4297 memset(&info, 0, sizeof(info));
4298 else if (copy_from_user(&info, argp, sizeof(info))) {
4299 err = -EFAULT;
4300 goto abort_unlock;
4301 }
4302 if (mddev->pers) {
4303 err = update_array_info(mddev, &info);
4304 if (err) {
4305 printk(KERN_WARNING "md: couldn't update"
4306 " array info. %d\n", err);
4307 goto abort_unlock;
4308 }
4309 goto done_unlock;
4310 }
4311 if (!list_empty(&mddev->disks)) {
4312 printk(KERN_WARNING
4313 "md: array %s already has disks!\n",
4314 mdname(mddev));
4315 err = -EBUSY;
4316 goto abort_unlock;
4317 }
4318 if (mddev->raid_disks) {
4319 printk(KERN_WARNING
4320 "md: array %s already initialised!\n",
4321 mdname(mddev));
4322 err = -EBUSY;
4323 goto abort_unlock;
4324 }
4325 err = set_array_info(mddev, &info);
4326 if (err) {
4327 printk(KERN_WARNING "md: couldn't set"
4328 " array info. %d\n", err);
4329 goto abort_unlock;
4330 }
4331 }
4332 goto done_unlock;
4333
4334 default:;
4335 }
4336
4337
4338
4339
4340
4341
4342 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
4343 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) {
4344 err = -ENODEV;
4345 goto abort_unlock;
4346 }
4347
4348
4349
4350
4351 switch (cmd)
4352 {
4353 case GET_ARRAY_INFO:
4354 err = get_array_info(mddev, argp);
4355 goto done_unlock;
4356
4357 case GET_BITMAP_FILE:
4358 err = get_bitmap_file(mddev, argp);
4359 goto done_unlock;
4360
4361 case GET_DISK_INFO:
4362 err = get_disk_info(mddev, argp);
4363 goto done_unlock;
4364
4365 case RESTART_ARRAY_RW:
4366 err = restart_array(mddev);
4367 goto done_unlock;
4368
4369 case STOP_ARRAY:
4370 err = do_md_stop (mddev, 0);
4371 goto done_unlock;
4372
4373 case STOP_ARRAY_RO:
4374 err = do_md_stop (mddev, 1);
4375 goto done_unlock;
4376
4377
4378
4379
4380
4381
4382
4383 }
4384
4385
4386
4387
4388
4389
4390
4391
4392 if (_IOC_TYPE(cmd) == MD_MAJOR &&
4393 mddev->ro && mddev->pers) {
4394 if (mddev->ro == 2) {
4395 mddev->ro = 0;
4396 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4397 md_wakeup_thread(mddev->thread);
4398
4399 } else {
4400 err = -EROFS;
4401 goto abort_unlock;
4402 }
4403 }
4404
4405 switch (cmd)
4406 {
4407 case ADD_NEW_DISK:
4408 {
4409 mdu_disk_info_t info;
4410 if (copy_from_user(&info, argp, sizeof(info)))
4411 err = -EFAULT;
4412 else
4413 err = add_new_disk(mddev, &info);
4414 goto done_unlock;
4415 }
4416
4417 case HOT_REMOVE_DISK:
4418 err = hot_remove_disk(mddev, new_decode_dev(arg));
4419 goto done_unlock;
4420
4421 case HOT_ADD_DISK:
4422 err = hot_add_disk(mddev, new_decode_dev(arg));
4423 goto done_unlock;
4424
4425 case SET_DISK_FAULTY:
4426 err = set_disk_faulty(mddev, new_decode_dev(arg));
4427 goto done_unlock;
4428
4429 case RUN_ARRAY:
4430 err = do_md_run (mddev);
4431 goto done_unlock;
4432
4433 case SET_BITMAP_FILE:
4434 err = set_bitmap_file(mddev, (int)arg);
4435 goto done_unlock;
4436
4437 default:
4438 err = -EINVAL;
4439 goto abort_unlock;
4440 }
4441
4442done_unlock:
4443abort_unlock:
4444 mddev_unlock(mddev);
4445
4446 return err;
4447done:
4448 if (err)
4449 MD_BUG();
4450abort:
4451 return err;
4452}
4453
4454static int md_open(struct inode *inode, struct file *file)
4455{
4456
4457
4458
4459
4460 mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
4461 int err;
4462
4463 if ((err = mddev_lock(mddev)))
4464 goto out;
4465
4466 err = 0;
4467 mddev_get(mddev);
4468 mddev_unlock(mddev);
4469
4470 check_disk_change(inode->i_bdev);
4471 out:
4472 return err;
4473}
4474
4475static int md_release(struct inode *inode, struct file * file)
4476{
4477 mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
4478
4479 if (!mddev)
4480 BUG();
4481 mddev_put(mddev);
4482
4483 return 0;
4484}
4485
4486static int md_media_changed(struct gendisk *disk)
4487{
4488 mddev_t *mddev = disk->private_data;
4489
4490 return mddev->changed;
4491}
4492
4493static int md_revalidate(struct gendisk *disk)
4494{
4495 mddev_t *mddev = disk->private_data;
4496
4497 mddev->changed = 0;
4498 return 0;
4499}
4500static struct block_device_operations md_fops =
4501{
4502 .owner = THIS_MODULE,
4503 .open = md_open,
4504 .release = md_release,
4505 .ioctl = md_ioctl,
4506 .getgeo = md_getgeo,
4507 .media_changed = md_media_changed,
4508 .revalidate_disk= md_revalidate,
4509};
4510
4511static int md_thread(void * arg)
4512{
4513 mdk_thread_t *thread = arg;
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527 allow_signal(SIGKILL);
4528 while (!kthread_should_stop()) {
4529
4530
4531
4532
4533
4534
4535 if (signal_pending(current))
4536 flush_signals(current);
4537
4538 wait_event_interruptible_timeout
4539 (thread->wqueue,
4540 test_bit(THREAD_WAKEUP, &thread->flags)
4541 || kthread_should_stop(),
4542 thread->timeout);
4543 try_to_freeze();
4544
4545 clear_bit(THREAD_WAKEUP, &thread->flags);
4546
4547 thread->run(thread->mddev);
4548 }
4549
4550 return 0;
4551}
4552
4553void md_wakeup_thread(mdk_thread_t *thread)
4554{
4555 if (thread) {
4556 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
4557 set_bit(THREAD_WAKEUP, &thread->flags);
4558 wake_up(&thread->wqueue);
4559 }
4560}
4561
4562mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
4563 const char *name)
4564{
4565 mdk_thread_t *thread;
4566
4567 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
4568 if (!thread)
4569 return NULL;
4570
4571 init_waitqueue_head(&thread->wqueue);
4572
4573 thread->run = run;
4574 thread->mddev = mddev;
4575 thread->timeout = MAX_SCHEDULE_TIMEOUT;
4576 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
4577 if (IS_ERR(thread->tsk)) {
4578 kfree(thread);
4579 return NULL;
4580 }
4581 return thread;
4582}
4583
4584void md_unregister_thread(mdk_thread_t *thread)
4585{
4586 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
4587
4588 kthread_stop(thread->tsk);
4589 kfree(thread);
4590}
4591
4592void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
4593{
4594 if (!mddev) {
4595 MD_BUG();
4596 return;
4597 }
4598
4599 if (!rdev || test_bit(Faulty, &rdev->flags))
4600 return;
4601
4602
4603
4604
4605
4606
4607
4608 if (!mddev->pers)
4609 return;
4610 if (!mddev->pers->error_handler)
4611 return;
4612 mddev->pers->error_handler(mddev,rdev);
4613 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4614 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4615 md_wakeup_thread(mddev->thread);
4616 md_new_event_inintr(mddev);
4617}
4618
4619
4620
4621static void status_unused(struct seq_file *seq)
4622{
4623 int i = 0;
4624 mdk_rdev_t *rdev;
4625 struct list_head *tmp;
4626
4627 seq_printf(seq, "unused devices: ");
4628
4629 ITERATE_RDEV_PENDING(rdev,tmp) {
4630 char b[BDEVNAME_SIZE];
4631 i++;
4632 seq_printf(seq, "%s ",
4633 bdevname(rdev->bdev,b));
4634 }
4635 if (!i)
4636 seq_printf(seq, "<none>");
4637
4638 seq_printf(seq, "\n");
4639}
4640
4641
4642static void status_resync(struct seq_file *seq, mddev_t * mddev)
4643{
4644 sector_t max_blocks, resync, res;
4645 unsigned long dt, db, rt;
4646 int scale;
4647 unsigned int per_milli;
4648
4649 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
4650
4651 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
4652 max_blocks = mddev->resync_max_sectors >> 1;
4653 else
4654 max_blocks = mddev->size;
4655
4656
4657
4658
4659 if (!max_blocks) {
4660 MD_BUG();
4661 return;
4662 }
4663
4664
4665
4666
4667
4668 scale = 10;
4669 if (sizeof(sector_t) > sizeof(unsigned long)) {
4670 while ( max_blocks/2 > (1ULL<<(scale+32)))
4671 scale++;
4672 }
4673 res = (resync>>scale)*1000;
4674 sector_div(res, (u32)((max_blocks>>scale)+1));
4675
4676 per_milli = res;
4677 {
4678 int i, x = per_milli/50, y = 20-x;
4679 seq_printf(seq, "[");
4680 for (i = 0; i < x; i++)
4681 seq_printf(seq, "=");
4682 seq_printf(seq, ">");
4683 for (i = 0; i < y; i++)
4684 seq_printf(seq, ".");
4685 seq_printf(seq, "] ");
4686 }
4687 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
4688 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
4689 "reshape" :
4690 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
4691 "resync" : "recovery")),
4692 per_milli/10, per_milli % 10,
4693 (unsigned long long) resync,
4694 (unsigned long long) max_blocks);
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705 dt = ((jiffies - mddev->resync_mark) / HZ);
4706 if (!dt) dt++;
4707 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
4708 - mddev->resync_mark_cnt;
4709 rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;
4710
4711 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
4712
4713 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
4714}
4715
4716static void *md_seq_start(struct seq_file *seq, loff_t *pos)
4717{
4718 struct list_head *tmp;
4719 loff_t l = *pos;
4720 mddev_t *mddev;
4721
4722 if (l >= 0x10000)
4723 return NULL;
4724 if (!l--)
4725
4726 return (void*)1;
4727
4728 spin_lock(&all_mddevs_lock);
4729 list_for_each(tmp,&all_mddevs)
4730 if (!l--) {
4731 mddev = list_entry(tmp, mddev_t, all_mddevs);
4732 mddev_get(mddev);
4733 spin_unlock(&all_mddevs_lock);
4734 return mddev;
4735 }
4736 spin_unlock(&all_mddevs_lock);
4737 if (!l--)
4738 return (void*)2;
4739 return NULL;
4740}
4741
4742static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4743{
4744 struct list_head *tmp;
4745 mddev_t *next_mddev, *mddev = v;
4746
4747 ++*pos;
4748 if (v == (void*)2)
4749 return NULL;
4750
4751 spin_lock(&all_mddevs_lock);
4752 if (v == (void*)1)
4753 tmp = all_mddevs.next;
4754 else
4755 tmp = mddev->all_mddevs.next;
4756 if (tmp != &all_mddevs)
4757 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
4758 else {
4759 next_mddev = (void*)2;
4760 *pos = 0x10000;
4761 }
4762 spin_unlock(&all_mddevs_lock);
4763
4764 if (v != (void*)1)
4765 mddev_put(mddev);
4766 return next_mddev;
4767
4768}
4769
4770static void md_seq_stop(struct seq_file *seq, void *v)
4771{
4772 mddev_t *mddev = v;
4773
4774 if (mddev && v != (void*)1 && v != (void*)2)
4775 mddev_put(mddev);
4776}
4777
/* Per-open state of the mdstat seq_file. */
struct mdstat_info {
	int		event;	/* md_event_count value last recorded for this file */
};
4781
4782static int md_seq_show(struct seq_file *seq, void *v)
4783{
4784 mddev_t *mddev = v;
4785 sector_t size;
4786 struct list_head *tmp2;
4787 mdk_rdev_t *rdev;
4788 struct mdstat_info *mi = seq->private;
4789 struct bitmap *bitmap;
4790
4791 if (v == (void*)1) {
4792 struct mdk_personality *pers;
4793 seq_printf(seq, "Personalities : ");
4794 spin_lock(&pers_lock);
4795 list_for_each_entry(pers, &pers_list, list)
4796 seq_printf(seq, "[%s] ", pers->name);
4797
4798 spin_unlock(&pers_lock);
4799 seq_printf(seq, "\n");
4800 mi->event = atomic_read(&md_event_count);
4801 return 0;
4802 }
4803 if (v == (void*)2) {
4804 status_unused(seq);
4805 return 0;
4806 }
4807
4808 if (mddev_lock(mddev) < 0)
4809 return -EINTR;
4810
4811 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
4812 seq_printf(seq, "%s : %sactive", mdname(mddev),
4813 mddev->pers ? "" : "in");
4814 if (mddev->pers) {
4815 if (mddev->ro==1)
4816 seq_printf(seq, " (read-only)");
4817 if (mddev->ro==2)
4818 seq_printf(seq, "(auto-read-only)");
4819 seq_printf(seq, " %s", mddev->pers->name);
4820 }
4821
4822 size = 0;
4823 ITERATE_RDEV(mddev,rdev,tmp2) {
4824 char b[BDEVNAME_SIZE];
4825 seq_printf(seq, " %s[%d]",
4826 bdevname(rdev->bdev,b), rdev->desc_nr);
4827 if (test_bit(WriteMostly, &rdev->flags))
4828 seq_printf(seq, "(W)");
4829 if (test_bit(Faulty, &rdev->flags)) {
4830 seq_printf(seq, "(F)");
4831 continue;
4832 } else if (rdev->raid_disk < 0)
4833 seq_printf(seq, "(S)");
4834 size += rdev->size;
4835 }
4836
4837 if (!list_empty(&mddev->disks)) {
4838 if (mddev->pers)
4839 seq_printf(seq, "\n %llu blocks",
4840 (unsigned long long)mddev->array_size);
4841 else
4842 seq_printf(seq, "\n %llu blocks",
4843 (unsigned long long)size);
4844 }
4845 if (mddev->persistent) {
4846 if (mddev->major_version != 0 ||
4847 mddev->minor_version != 90) {
4848 seq_printf(seq," super %d.%d",
4849 mddev->major_version,
4850 mddev->minor_version);
4851 }
4852 } else
4853 seq_printf(seq, " super non-persistent");
4854
4855 if (mddev->pers) {
4856 mddev->pers->status (seq, mddev);
4857 seq_printf(seq, "\n ");
4858 if (mddev->pers->sync_request) {
4859 if (mddev->curr_resync > 2) {
4860 status_resync (seq, mddev);
4861 seq_printf(seq, "\n ");
4862 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
4863 seq_printf(seq, "\tresync=DELAYED\n ");
4864 else if (mddev->recovery_cp < MaxSector)
4865 seq_printf(seq, "\tresync=PENDING\n ");
4866 }
4867 } else
4868 seq_printf(seq, "\n ");
4869
4870 if ((bitmap = mddev->bitmap)) {
4871 unsigned long chunk_kb;
4872 unsigned long flags;
4873 spin_lock_irqsave(&bitmap->lock, flags);
4874 chunk_kb = bitmap->chunksize >> 10;
4875 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
4876 "%lu%s chunk",
4877 bitmap->pages - bitmap->missing_pages,
4878 bitmap->pages,
4879 (bitmap->pages - bitmap->missing_pages)
4880 << (PAGE_SHIFT - 10),
4881 chunk_kb ? chunk_kb : bitmap->chunksize,
4882 chunk_kb ? "KB" : "B");
4883 if (bitmap->file) {
4884 seq_printf(seq, ", file: ");
4885 seq_path(seq, bitmap->file->f_vfsmnt,
4886 bitmap->file->f_dentry," \t\n");
4887 }
4888
4889 seq_printf(seq, "\n");
4890 spin_unlock_irqrestore(&bitmap->lock, flags);
4891 }
4892
4893 seq_printf(seq, "\n");
4894 }
4895 mddev_unlock(mddev);
4896
4897 return 0;
4898}
4899
4900static struct seq_operations md_seq_ops = {
4901 .start = md_seq_start,
4902 .next = md_seq_next,
4903 .stop = md_seq_stop,
4904 .show = md_seq_show,
4905};
4906
4907static int md_seq_open(struct inode *inode, struct file *file)
4908{
4909 int error;
4910 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
4911 if (mi == NULL)
4912 return -ENOMEM;
4913
4914 error = seq_open(file, &md_seq_ops);
4915 if (error)
4916 kfree(mi);
4917 else {
4918 struct seq_file *p = file->private_data;
4919 p->private = mi;
4920 mi->event = atomic_read(&md_event_count);
4921 }
4922 return error;
4923}
4924
4925static int md_seq_release(struct inode *inode, struct file *file)
4926{
4927 struct seq_file *m = file->private_data;
4928 struct mdstat_info *mi = m->private;
4929 m->private = NULL;
4930 kfree(mi);
4931 return seq_release(inode, file);
4932}
4933
4934static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
4935{
4936 struct seq_file *m = filp->private_data;
4937 struct mdstat_info *mi = m->private;
4938 int mask;
4939
4940 poll_wait(filp, &md_event_waiters, wait);
4941
4942
4943 mask = POLLIN | POLLRDNORM;
4944
4945 if (mi->event != atomic_read(&md_event_count))
4946 mask |= POLLERR | POLLPRI;
4947 return mask;
4948}
4949
4950static struct file_operations md_seq_fops = {
4951 .open = md_seq_open,
4952 .read = seq_read,
4953 .llseek = seq_lseek,
4954 .release = md_seq_release,
4955 .poll = mdstat_poll,
4956};
4957
4958int register_md_personality(struct mdk_personality *p)
4959{
4960 spin_lock(&pers_lock);
4961 list_add_tail(&p->list, &pers_list);
4962 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
4963 spin_unlock(&pers_lock);
4964 return 0;
4965}
4966
4967int unregister_md_personality(struct mdk_personality *p)
4968{
4969 printk(KERN_INFO "md: %s personality unregistered\n", p->name);
4970 spin_lock(&pers_lock);
4971 list_del_init(&p->list);
4972 spin_unlock(&pers_lock);
4973 return 0;
4974}
4975
4976static int is_mddev_idle(mddev_t *mddev)
4977{
4978 mdk_rdev_t * rdev;
4979 struct list_head *tmp;
4980 int idle;
4981 unsigned long curr_events;
4982
4983 idle = 1;
4984 ITERATE_RDEV(mddev,rdev,tmp) {
4985 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
4986 curr_events = disk_stat_read(disk, sectors[0]) +
4987 disk_stat_read(disk, sectors[1]) -
4988 atomic_read(&disk->sync_io);
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002 if ((curr_events - rdev->last_events + 4096) > 8192) {
5003 rdev->last_events = curr_events;
5004 idle = 0;
5005 }
5006 }
5007 return idle;
5008}
5009
5010void md_done_sync(mddev_t *mddev, int blocks, int ok)
5011{
5012
5013 atomic_sub(blocks, &mddev->recovery_active);
5014 wake_up(&mddev->recovery_wait);
5015 if (!ok) {
5016 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
5017 md_wakeup_thread(mddev->thread);
5018
5019 }
5020}
5021
5022
5023
5024
5025
5026
5027
5028void md_write_start(mddev_t *mddev, struct bio *bi)
5029{
5030 if (bio_data_dir(bi) != WRITE)
5031 return;
5032
5033 BUG_ON(mddev->ro == 1);
5034 if (mddev->ro == 2) {
5035
5036 mddev->ro = 0;
5037 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5038 md_wakeup_thread(mddev->thread);
5039 }
5040 atomic_inc(&mddev->writes_pending);
5041 if (mddev->in_sync) {
5042 spin_lock_irq(&mddev->write_lock);
5043 if (mddev->in_sync) {
5044 mddev->in_sync = 0;
5045 mddev->sb_dirty = 3;
5046 md_wakeup_thread(mddev->thread);
5047 }
5048 spin_unlock_irq(&mddev->write_lock);
5049 }
5050 wait_event(mddev->sb_wait, mddev->sb_dirty==0);
5051}
5052
5053void md_write_end(mddev_t *mddev)
5054{
5055 if (atomic_dec_and_test(&mddev->writes_pending)) {
5056 if (mddev->safemode == 2)
5057 md_wakeup_thread(mddev->thread);
5058 else if (mddev->safemode_delay)
5059 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
5060 }
5061}
5062
5063static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
5064
5065#define SYNC_MARKS 10
5066#define SYNC_MARK_STEP (3*HZ)
5067void md_do_sync(mddev_t *mddev)
5068{
5069 mddev_t *mddev2;
5070 unsigned int currspeed = 0,
5071 window;
5072 sector_t max_sectors,j, io_sectors;
5073 unsigned long mark[SYNC_MARKS];
5074 sector_t mark_cnt[SYNC_MARKS];
5075 int last_mark,m;
5076 struct list_head *tmp;
5077 sector_t last_check;
5078 int skipped = 0;
5079 struct list_head *rtmp;
5080 mdk_rdev_t *rdev;
5081
5082
5083 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
5084 return;
5085 if (mddev->ro)
5086 return;
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104 do {
5105 mddev->curr_resync = 2;
5106
5107 try_again:
5108 if (kthread_should_stop()) {
5109 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5110 goto skip;
5111 }
5112 ITERATE_MDDEV(mddev2,tmp) {
5113 if (mddev2 == mddev)
5114 continue;
5115 if (mddev2->curr_resync &&
5116 match_mddev_units(mddev,mddev2)) {
5117 DEFINE_WAIT(wq);
5118 if (mddev < mddev2 && mddev->curr_resync == 2) {
5119
5120 mddev->curr_resync = 1;
5121 wake_up(&resync_wait);
5122 }
5123 if (mddev > mddev2 && mddev->curr_resync == 1)
5124
5125
5126
5127 continue;
5128 prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
5129 if (!kthread_should_stop() &&
5130 mddev2->curr_resync >= mddev->curr_resync) {
5131 printk(KERN_INFO "md: delaying resync of %s"
5132 " until %s has finished resync (they"
5133 " share one or more physical units)\n",
5134 mdname(mddev), mdname(mddev2));
5135 mddev_put(mddev2);
5136 schedule();
5137 finish_wait(&resync_wait, &wq);
5138 goto try_again;
5139 }
5140 finish_wait(&resync_wait, &wq);
5141 }
5142 }
5143 } while (mddev->curr_resync < 2);
5144
5145 j = 0;
5146 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5147
5148
5149
5150 max_sectors = mddev->resync_max_sectors;
5151 mddev->resync_mismatches = 0;
5152
5153 if (!mddev->bitmap &&
5154 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5155 j = mddev->recovery_cp;
5156 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5157 max_sectors = mddev->size << 1;
5158 else {
5159
5160 max_sectors = mddev->size << 1;
5161 j = MaxSector;
5162 ITERATE_RDEV(mddev,rdev,rtmp)
5163 if (rdev->raid_disk >= 0 &&
5164 !test_bit(Faulty, &rdev->flags) &&
5165 !test_bit(In_sync, &rdev->flags) &&
5166 rdev->recovery_offset < j)
5167 j = rdev->recovery_offset;
5168 }
5169
5170 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
5171 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
5172 " %d KB/sec/disc.\n", speed_min(mddev));
5173 printk(KERN_INFO "md: using maximum available idle IO bandwidth "
5174 "(but not more than %d KB/sec) for reconstruction.\n",
5175 speed_max(mddev));
5176
5177 is_mddev_idle(mddev);
5178
5179 io_sectors = 0;
5180 for (m = 0; m < SYNC_MARKS; m++) {
5181 mark[m] = jiffies;
5182 mark_cnt[m] = io_sectors;
5183 }
5184 last_mark = 0;
5185 mddev->resync_mark = mark[last_mark];
5186 mddev->resync_mark_cnt = mark_cnt[last_mark];
5187
5188
5189
5190
5191 window = 32*(PAGE_SIZE/512);
5192 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
5193 window/2,(unsigned long long) max_sectors/2);
5194
5195 atomic_set(&mddev->recovery_active, 0);
5196 init_waitqueue_head(&mddev->recovery_wait);
5197 last_check = 0;
5198
5199 if (j>2) {
5200 printk(KERN_INFO
5201 "md: resuming recovery of %s from checkpoint.\n",
5202 mdname(mddev));
5203 mddev->curr_resync = j;
5204 }
5205
5206 while (j < max_sectors) {
5207 sector_t sectors;
5208
5209 skipped = 0;
5210 sectors = mddev->pers->sync_request(mddev, j, &skipped,
5211 currspeed < speed_min(mddev));
5212 if (sectors == 0) {
5213 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
5214 goto out;
5215 }
5216
5217 if (!skipped) {
5218 io_sectors += sectors;
5219 atomic_add(sectors, &mddev->recovery_active);
5220 }
5221
5222 j += sectors;
5223 if (j>1) mddev->curr_resync = j;
5224 mddev->curr_mark_cnt = io_sectors;
5225 if (last_check == 0)
5226
5227
5228
5229 md_new_event(mddev);
5230
5231 if (last_check + window > io_sectors || j == max_sectors)
5232 continue;
5233
5234 last_check = io_sectors;
5235
5236 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
5237 test_bit(MD_RECOVERY_ERR, &mddev->recovery))
5238 break;
5239
5240 repeat:
5241 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
5242
5243 int next = (last_mark+1) % SYNC_MARKS;
5244
5245 mddev->resync_mark = mark[next];
5246 mddev->resync_mark_cnt = mark_cnt[next];
5247 mark[next] = jiffies;
5248 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
5249 last_mark = next;
5250 }
5251
5252
5253 if (kthread_should_stop()) {
5254
5255
5256
5257 printk(KERN_INFO
5258 "md: md_do_sync() got signal ... exiting\n");
5259 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5260 goto out;
5261 }
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271 mddev->queue->unplug_fn(mddev->queue);
5272 cond_resched();
5273
5274 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
5275 /((jiffies-mddev->resync_mark)/HZ +1) +1;
5276
5277 if (currspeed > speed_min(mddev)) {
5278 if ((currspeed > speed_max(mddev)) ||
5279 !is_mddev_idle(mddev)) {
5280 msleep(500);
5281 goto repeat;
5282 }
5283 }
5284 }
5285 printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
5286
5287
5288
5289 out:
5290 mddev->queue->unplug_fn(mddev->queue);
5291
5292 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
5293
5294
5295 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
5296
5297 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
5298 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
5299 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
5300 mddev->curr_resync > 2) {
5301 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5302 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5303 if (mddev->curr_resync >= mddev->recovery_cp) {
5304 printk(KERN_INFO
5305 "md: checkpointing recovery of %s.\n",
5306 mdname(mddev));
5307 mddev->recovery_cp = mddev->curr_resync;
5308 }
5309 } else
5310 mddev->recovery_cp = MaxSector;
5311 } else {
5312 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5313 mddev->curr_resync = MaxSector;
5314 ITERATE_RDEV(mddev,rdev,rtmp)
5315 if (rdev->raid_disk >= 0 &&
5316 !test_bit(Faulty, &rdev->flags) &&
5317 !test_bit(In_sync, &rdev->flags) &&
5318 rdev->recovery_offset < mddev->curr_resync)
5319 rdev->recovery_offset = mddev->curr_resync;
5320 mddev->sb_dirty = 1;
5321 }
5322 }
5323
5324 skip:
5325 mddev->curr_resync = 0;
5326 wake_up(&resync_wait);
5327 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
5328 md_wakeup_thread(mddev->thread);
5329}
5330EXPORT_SYMBOL_GPL(md_do_sync);
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355void md_check_recovery(mddev_t *mddev)
5356{
5357 mdk_rdev_t *rdev;
5358 struct list_head *rtmp;
5359
5360
5361 if (mddev->bitmap)
5362 bitmap_daemon_work(mddev->bitmap);
5363
5364 if (mddev->ro)
5365 return;
5366
5367 if (signal_pending(current)) {
5368 if (mddev->pers->sync_request) {
5369 printk(KERN_INFO "md: %s in immediate safe mode\n",
5370 mdname(mddev));
5371 mddev->safemode = 2;
5372 }
5373 flush_signals(current);
5374 }
5375
5376 if ( ! (
5377 mddev->sb_dirty ||
5378 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
5379 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
5380 (mddev->safemode == 1) ||
5381 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
5382 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
5383 ))
5384 return;
5385
5386 if (mddev_trylock(mddev)) {
5387 int spares =0;
5388
5389 spin_lock_irq(&mddev->write_lock);
5390 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
5391 !mddev->in_sync && mddev->recovery_cp == MaxSector) {
5392 mddev->in_sync = 1;
5393 mddev->sb_dirty = 3;
5394 }
5395 if (mddev->safemode == 1)
5396 mddev->safemode = 0;
5397 spin_unlock_irq(&mddev->write_lock);
5398
5399 if (mddev->sb_dirty)
5400 md_update_sb(mddev);
5401
5402
5403 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
5404 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
5405
5406 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5407 goto unlock;
5408 }
5409 if (mddev->sync_thread) {
5410
5411 md_unregister_thread(mddev->sync_thread);
5412 mddev->sync_thread = NULL;
5413 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
5414 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5415
5416
5417 mddev->pers->spare_active(mddev);
5418 }
5419 md_update_sb(mddev);
5420
5421
5422
5423
5424 if (!mddev->degraded)
5425 ITERATE_RDEV(mddev,rdev,rtmp)
5426 rdev->saved_raid_disk = -1;
5427
5428 mddev->recovery = 0;
5429
5430 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5431 md_new_event(mddev);
5432 goto unlock;
5433 }
5434
5435
5436
5437 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5438 clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
5439 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
5440 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
5441
5442 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
5443 goto unlock;
5444
5445
5446
5447
5448
5449
5450 ITERATE_RDEV(mddev,rdev,rtmp)
5451 if (rdev->raid_disk >= 0 &&
5452 (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) &&
5453 atomic_read(&rdev->nr_pending)==0) {
5454 if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) {
5455 char nm[20];
5456 sprintf(nm,"rd%d", rdev->raid_disk);
5457 sysfs_remove_link(&mddev->kobj, nm);
5458 rdev->raid_disk = -1;
5459 }
5460 }
5461
5462 if (mddev->degraded) {
5463 ITERATE_RDEV(mddev,rdev,rtmp)
5464 if (rdev->raid_disk < 0
5465 && !test_bit(Faulty, &rdev->flags)) {
5466 rdev->recovery_offset = 0;
5467 if (mddev->pers->hot_add_disk(mddev,rdev)) {
5468 char nm[20];
5469 sprintf(nm, "rd%d", rdev->raid_disk);
5470 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
5471 spares++;
5472 md_new_event(mddev);
5473 } else
5474 break;
5475 }
5476 }
5477
5478 if (spares) {
5479 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5480 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5481 } else if (mddev->recovery_cp < MaxSector) {
5482 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5483 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5484
5485 goto unlock;
5486
5487 if (mddev->pers->sync_request) {
5488 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
5489 if (spares && mddev->bitmap && ! mddev->bitmap->file) {
5490
5491
5492
5493
5494 bitmap_write_all(mddev->bitmap);
5495 }
5496 mddev->sync_thread = md_register_thread(md_do_sync,
5497 mddev,
5498 "%s_resync");
5499 if (!mddev->sync_thread) {
5500 printk(KERN_ERR "%s: could not start resync"
5501 " thread...\n",
5502 mdname(mddev));
5503
5504 mddev->recovery = 0;
5505 } else
5506 md_wakeup_thread(mddev->sync_thread);
5507 md_new_event(mddev);
5508 }
5509 unlock:
5510 mddev_unlock(mddev);
5511 }
5512}
5513
5514static int md_notify_reboot(struct notifier_block *this,
5515 unsigned long code, void *x)
5516{
5517 struct list_head *tmp;
5518 mddev_t *mddev;
5519
5520 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
5521
5522 printk(KERN_INFO "md: stopping all md devices.\n");
5523
5524 ITERATE_MDDEV(mddev,tmp)
5525 if (mddev_trylock(mddev)) {
5526 do_md_stop (mddev, 1);
5527 mddev_unlock(mddev);
5528 }
5529
5530
5531
5532
5533
5534
5535 mdelay(1000*1);
5536 }
5537 return NOTIFY_DONE;
5538}
5539
5540static struct notifier_block md_notifier = {
5541 .notifier_call = md_notify_reboot,
5542 .next = NULL,
5543 .priority = INT_MAX,
5544};
5545
5546static void md_geninit(void)
5547{
5548 struct proc_dir_entry *p;
5549
5550 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
5551
5552 p = create_proc_entry("mdstat", S_IRUGO, NULL);
5553 if (p)
5554 p->proc_fops = &md_seq_fops;
5555}
5556
5557static int __init md_init(void)
5558{
5559 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
5560 " MD_SB_DISKS=%d\n",
5561 MD_MAJOR_VERSION, MD_MINOR_VERSION,
5562 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
5563 printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI,
5564 BITMAP_MINOR);
5565
5566 if (register_blkdev(MAJOR_NR, "md"))
5567 return -1;
5568 if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
5569 unregister_blkdev(MAJOR_NR, "md");
5570 return -1;
5571 }
5572 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
5573 md_probe, NULL, NULL);
5574 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
5575 md_probe, NULL, NULL);
5576
5577 register_reboot_notifier(&md_notifier);
5578 raid_table_header = register_sysctl_table(raid_root_table, 1);
5579
5580 md_geninit();
5581 return (0);
5582}
5583
5584
5585#ifndef MODULE
5586
5587
5588
5589
5590
/*
 * Devices recorded by md_autodetect_dev(); autostart_arrays() walks the
 * first dev_cnt entries and then resets dev_cnt to 0.
 */
static dev_t detected_devices[128];
static int dev_cnt;

/*
 * Remember @dev for a later autostart_arrays() run.
 *
 * The bound is derived from the array itself so every slot can be used
 * (the previous hard-coded limit of 127 left the last entry unused).
 * Once the table is full, further devices are silently dropped.
 */
void md_autodetect_dev(dev_t dev)
{
	const int capacity = sizeof(detected_devices) /
			     sizeof(detected_devices[0]);

	if (dev_cnt >= 0 && dev_cnt < capacity)
		detected_devices[dev_cnt++] = dev;
}
5599
5600
5601static void autostart_arrays(int part)
5602{
5603 mdk_rdev_t *rdev;
5604 int i;
5605
5606 printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
5607
5608 for (i = 0; i < dev_cnt; i++) {
5609 dev_t dev = detected_devices[i];
5610
5611 rdev = md_import_device(dev,0, 0);
5612 if (IS_ERR(rdev))
5613 continue;
5614
5615 if (test_bit(Faulty, &rdev->flags)) {
5616 MD_BUG();
5617 continue;
5618 }
5619 list_add(&rdev->same_set, &pending_raid_disks);
5620 }
5621 dev_cnt = 0;
5622
5623 autorun_devices(part);
5624}
5625
5626#endif
5627
5628static __exit void md_exit(void)
5629{
5630 mddev_t *mddev;
5631 struct list_head *tmp;
5632
5633 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
5634 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
5635
5636 unregister_blkdev(MAJOR_NR,"md");
5637 unregister_blkdev(mdp_major, "mdp");
5638 unregister_reboot_notifier(&md_notifier);
5639 unregister_sysctl_table(raid_table_header);
5640 remove_proc_entry("mdstat", NULL);
5641 ITERATE_MDDEV(mddev,tmp) {
5642 struct gendisk *disk = mddev->gendisk;
5643 if (!disk)
5644 continue;
5645 export_array(mddev);
5646 del_gendisk(disk);
5647 put_disk(disk);
5648 mddev->gendisk = NULL;
5649 mddev_put(mddev);
5650 }
5651}
5652
5653module_init(md_init)
5654module_exit(md_exit)
5655
5656static int get_ro(char *buffer, struct kernel_param *kp)
5657{
5658 return sprintf(buffer, "%d", start_readonly);
5659}
5660static int set_ro(const char *val, struct kernel_param *kp)
5661{
5662 char *e;
5663 int num = simple_strtoul(val, &e, 10);
5664 if (*val && (*e == '\0' || *e == '\n')) {
5665 start_readonly = num;
5666 return 0;
5667 }
5668 return -EINVAL;
5669}
5670
5671module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
5672module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
5673
5674
5675EXPORT_SYMBOL(register_md_personality);
5676EXPORT_SYMBOL(unregister_md_personality);
5677EXPORT_SYMBOL(md_error);
5678EXPORT_SYMBOL(md_done_sync);
5679EXPORT_SYMBOL(md_write_start);
5680EXPORT_SYMBOL(md_write_end);
5681EXPORT_SYMBOL(md_register_thread);
5682EXPORT_SYMBOL(md_unregister_thread);
5683EXPORT_SYMBOL(md_wakeup_thread);
5684EXPORT_SYMBOL(md_check_recovery);
5685MODULE_LICENSE("GPL");
5686MODULE_ALIAS("md");
5687MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
5688