1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35#include <linux/kthread.h>
36#include <linux/blkdev.h>
37#include <linux/sysctl.h>
38#include <linux/seq_file.h>
39#include <linux/buffer_head.h>
40#include <linux/poll.h>
41#include <linux/ctype.h>
42#include <linux/string.h>
43#include <linux/hdreg.h>
44#include <linux/proc_fs.h>
45#include <linux/random.h>
46#include <linux/reboot.h>
47#include <linux/file.h>
48#include <linux/compat.h>
49#include <linux/delay.h>
50#include <linux/raid/md_p.h>
51#include <linux/raid/md_u.h>
52#include "md.h"
53#include "bitmap.h"
54
/* Debug switch for dprintk(): while this is 0 the printk() call below is short-circuited. */
55#define DEBUG 0
/* Forwards its arguments to printk() only when DEBUG is non-zero; the result is discarded. */
56#define dprintk(x...) ((void)(DEBUG && printk(x)))
57
58
/* autostart_arrays() is only declared when md is built in (MODULE undefined). */
59#ifndef MODULE
60static void autostart_arrays(int part);
61#endif
62
/* List of personalities walked by find_pers(), and the spinlock declared alongside it. */
63static LIST_HEAD(pers_list);
64static DEFINE_SPINLOCK(pers_lock);
65
/* Forward declaration: MD_BUG() calls this. */
66static void md_print_devices(void);
67
/* Wait queue named for resync. NOTE(review): its users are outside this chunk. */
68static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
69
/*
 * Report an internal md inconsistency: print the file and line of the call
 * site, then call md_print_devices().  Any arguments are accepted but
 * ignored.
 *
 * The body is wrapped in do { } while (0) so that "MD_BUG();" expands to
 * exactly one statement and stays correct inside an unbraced if/else.
 */
#define MD_BUG(x...) do { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } while (0)
71
72
73
74
75
76
/* Default cap on corrected read errors. NOTE(review): the consumer is outside this chunk — confirm exact semantics there. */
77#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
78
79
80
81
82
83
84
85
86
87
88
89
90
/*
 * System-wide sync speed limits.  speed_min()/speed_max() fall back to
 * these when an array has no per-array value, and raid_table exposes them
 * as the "speed_limit_min" / "speed_limit_max" sysctl entries.
 */
91static int sysctl_speed_limit_min = 1000;
92static int sysctl_speed_limit_max = 200000;
93static inline int speed_min(mddev_t *mddev)
94{
95 return mddev->sync_speed_min ?
96 mddev->sync_speed_min : sysctl_speed_limit_min;
97}
98
99static inline int speed_max(mddev_t *mddev)
100{
101 return mddev->sync_speed_max ?
102 mddev->sync_speed_max : sysctl_speed_limit_max;
103}
104
/* Header pointer for the sysctl tables below. NOTE(review): registration happens outside this chunk. */
105static struct ctl_table_header *raid_table_header;
106
/*
 * Leaf sysctl entries: two int-sized knobs backed by sysctl_speed_limit_min
 * and sysctl_speed_limit_max, readable by everyone and writable by the
 * owner, handled by proc_dointvec.  The empty entry terminates the table.
 */
107static ctl_table raid_table[] = {
108 {
109 .procname = "speed_limit_min",
110 .data = &sysctl_speed_limit_min,
111 .maxlen = sizeof(int),
112 .mode = S_IRUGO|S_IWUSR,
113 .proc_handler = proc_dointvec,
114 },
115 {
116 .procname = "speed_limit_max",
117 .data = &sysctl_speed_limit_max,
118 .maxlen = sizeof(int),
119 .mode = S_IRUGO|S_IWUSR,
120 .proc_handler = proc_dointvec,
121 },
122 { }
123};
124
/* Directory entry "raid" (read + execute for everyone) whose children are the raid_table knobs. */
125static ctl_table raid_dir_table[] = {
126 {
127 .procname = "raid",
128 .maxlen = 0,
129 .mode = S_IRUGO|S_IXUGO,
130 .child = raid_table,
131 },
132 { }
133};
134
/* Top-level directory entry "dev" (mode 0555) containing raid_dir_table, i.e. dev/raid/speed_limit_*. */
135static ctl_table raid_root_table[] = {
136 {
137 .procname = "dev",
138 .maxlen = 0,
139 .mode = 0555,
140 .child = raid_dir_table,
141 },
142 { }
143};
144
/* Forward declaration of the block device operations; the definition is outside this chunk. */
145static const struct block_device_operations md_fops;
146
/* Flag, zero by default. NOTE(review): presumably makes arrays start read-only — readers are outside this chunk. */
147static int start_readonly;
148
149
150
151
152
153
154
155
156
157
158
159static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
160static atomic_t md_event_count;
161void md_new_event(mddev_t *mddev)
162{
163 atomic_inc(&md_event_count);
164 wake_up(&md_event_waiters);
165}
166EXPORT_SYMBOL_GPL(md_new_event);
167
168
169
170
171static void md_new_event_inintr(mddev_t *mddev)
172{
173 atomic_inc(&md_event_count);
174 wake_up(&md_event_waiters);
175}
176
177
178
179
180
/* Global list of every mddev_t (linked through ->all_mddevs) and the spinlock that guards it. */
181static LIST_HEAD(all_mddevs);
182static DEFINE_SPINLOCK(all_mddevs_lock);
183
184
185
186
187
188
189
190
191
/*
 * for_each_mddev(mddev, tmp) - iterate over every entry on all_mddevs.
 * @mddev: mddev_t * cursor, valid inside the loop body
 * @tmp:   struct list_head * scratch cursor
 *
 * all_mddevs_lock is taken to read the first element and again before each
 * step to the next one.  In the loop condition a reference is taken (via
 * mddev_get) on the entry about to be visited, the lock is dropped, and the
 * reference on the previously visited entry is released (via mddev_put).
 * The loop body therefore runs without the lock but with a reference held
 * on @mddev.
 *
 * Leaving the loop early (break/return/goto) skips the mddev_put() that the
 * next iteration would have done, so the caller then still owns a reference
 * on @mddev.
 */
192#define for_each_mddev(mddev,tmp) \
193 \
194 for (({ spin_lock(&all_mddevs_lock); \
195 tmp = all_mddevs.next; \
196 mddev = NULL;}); \
197 ({ if (tmp != &all_mddevs) \
198 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
199 spin_unlock(&all_mddevs_lock); \
200 if (mddev) mddev_put(mddev); \
201 mddev = list_entry(tmp, mddev_t, all_mddevs); \
202 tmp != &all_mddevs;}); \
203 ({ spin_lock(&all_mddevs_lock); \
204 tmp = tmp->next;}) \
205 )
206
207
208
209
210
211
212
213
214
215static int md_make_request(struct request_queue *q, struct bio *bio)
216{
217 mddev_t *mddev = q->queuedata;
218 int rv;
219 if (mddev == NULL || mddev->pers == NULL) {
220 bio_io_error(bio);
221 return 0;
222 }
223 rcu_read_lock();
224 if (mddev->suspended || mddev->barrier) {
225 DEFINE_WAIT(__wait);
226 for (;;) {
227 prepare_to_wait(&mddev->sb_wait, &__wait,
228 TASK_UNINTERRUPTIBLE);
229 if (!mddev->suspended && !mddev->barrier)
230 break;
231 rcu_read_unlock();
232 schedule();
233 rcu_read_lock();
234 }
235 finish_wait(&mddev->sb_wait, &__wait);
236 }
237 atomic_inc(&mddev->active_io);
238 rcu_read_unlock();
239 rv = mddev->pers->make_request(q, bio);
240 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
241 wake_up(&mddev->sb_wait);
242
243 return rv;
244}
245
246static void mddev_suspend(mddev_t *mddev)
247{
248 BUG_ON(mddev->suspended);
249 mddev->suspended = 1;
250 synchronize_rcu();
251 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
252 mddev->pers->quiesce(mddev, 1);
253 md_unregister_thread(mddev->thread);
254 mddev->thread = NULL;
255
256
257
258
259
260}
261
262static void mddev_resume(mddev_t *mddev)
263{
264 mddev->suspended = 0;
265 wake_up(&mddev->sb_wait);
266 mddev->pers->quiesce(mddev, 0);
267}
268
269int mddev_congested(mddev_t *mddev, int bits)
270{
271 if (mddev->barrier)
272 return 1;
273 return mddev->suspended;
274}
275EXPORT_SYMBOL(mddev_congested);
276
277
278
279
280
281#define POST_REQUEST_BARRIER ((void*)1)
282
283static void md_end_barrier(struct bio *bio, int err)
284{
285 mdk_rdev_t *rdev = bio->bi_private;
286 mddev_t *mddev = rdev->mddev;
287 if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
288 set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
289
290 rdev_dec_pending(rdev, mddev);
291
292 if (atomic_dec_and_test(&mddev->flush_pending)) {
293 if (mddev->barrier == POST_REQUEST_BARRIER) {
294
295 mddev->barrier = NULL;
296 wake_up(&mddev->sb_wait);
297 } else
298
299 schedule_work(&mddev->barrier_work);
300 }
301 bio_put(bio);
302}
303
304static void submit_barriers(mddev_t *mddev)
305{
306 mdk_rdev_t *rdev;
307
308 rcu_read_lock();
309 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
310 if (rdev->raid_disk >= 0 &&
311 !test_bit(Faulty, &rdev->flags)) {
312
313
314
315
316 struct bio *bi;
317 atomic_inc(&rdev->nr_pending);
318 atomic_inc(&rdev->nr_pending);
319 rcu_read_unlock();
320 bi = bio_alloc(GFP_KERNEL, 0);
321 bi->bi_end_io = md_end_barrier;
322 bi->bi_private = rdev;
323 bi->bi_bdev = rdev->bdev;
324 atomic_inc(&mddev->flush_pending);
325 submit_bio(WRITE_BARRIER, bi);
326 rcu_read_lock();
327 rdev_dec_pending(rdev, mddev);
328 }
329 rcu_read_unlock();
330}
331
332static void md_submit_barrier(struct work_struct *ws)
333{
334 mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
335 struct bio *bio = mddev->barrier;
336
337 atomic_set(&mddev->flush_pending, 1);
338
339 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
340 bio_endio(bio, -EOPNOTSUPP);
341 else if (bio->bi_size == 0)
342
343 bio_endio(bio, 0);
344 else {
345 bio->bi_rw &= ~(1<<BIO_RW_BARRIER);
346 if (mddev->pers->make_request(mddev->queue, bio))
347 generic_make_request(bio);
348 mddev->barrier = POST_REQUEST_BARRIER;
349 submit_barriers(mddev);
350 }
351 if (atomic_dec_and_test(&mddev->flush_pending)) {
352 mddev->barrier = NULL;
353 wake_up(&mddev->sb_wait);
354 }
355}
356
357void md_barrier_request(mddev_t *mddev, struct bio *bio)
358{
359 spin_lock_irq(&mddev->write_lock);
360 wait_event_lock_irq(mddev->sb_wait,
361 !mddev->barrier,
362 mddev->write_lock, );
363 mddev->barrier = bio;
364 spin_unlock_irq(&mddev->write_lock);
365
366 atomic_set(&mddev->flush_pending, 1);
367 INIT_WORK(&mddev->barrier_work, md_submit_barrier);
368
369 submit_barriers(mddev);
370
371 if (atomic_dec_and_test(&mddev->flush_pending))
372 schedule_work(&mddev->barrier_work);
373}
374EXPORT_SYMBOL(md_barrier_request);
375
376static inline mddev_t *mddev_get(mddev_t *mddev)
377{
378 atomic_inc(&mddev->active);
379 return mddev;
380}
381
382static void mddev_delayed_delete(struct work_struct *ws);
383
384static void mddev_put(mddev_t *mddev)
385{
386 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
387 return;
388 if (!mddev->raid_disks && list_empty(&mddev->disks) &&
389 mddev->ctime == 0 && !mddev->hold_active) {
390
391
392 list_del(&mddev->all_mddevs);
393 if (mddev->gendisk) {
394
395
396
397
398
399
400 INIT_WORK(&mddev->del_work, mddev_delayed_delete);
401 schedule_work(&mddev->del_work);
402 } else
403 kfree(mddev);
404 }
405 spin_unlock(&all_mddevs_lock);
406}
407
408static mddev_t * mddev_find(dev_t unit)
409{
410 mddev_t *mddev, *new = NULL;
411
412 retry:
413 spin_lock(&all_mddevs_lock);
414
415 if (unit) {
416 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
417 if (mddev->unit == unit) {
418 mddev_get(mddev);
419 spin_unlock(&all_mddevs_lock);
420 kfree(new);
421 return mddev;
422 }
423
424 if (new) {
425 list_add(&new->all_mddevs, &all_mddevs);
426 spin_unlock(&all_mddevs_lock);
427 new->hold_active = UNTIL_IOCTL;
428 return new;
429 }
430 } else if (new) {
431
432 static int next_minor = 512;
433 int start = next_minor;
434 int is_free = 0;
435 int dev = 0;
436 while (!is_free) {
437 dev = MKDEV(MD_MAJOR, next_minor);
438 next_minor++;
439 if (next_minor > MINORMASK)
440 next_minor = 0;
441 if (next_minor == start) {
442
443 spin_unlock(&all_mddevs_lock);
444 kfree(new);
445 return NULL;
446 }
447
448 is_free = 1;
449 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
450 if (mddev->unit == dev) {
451 is_free = 0;
452 break;
453 }
454 }
455 new->unit = dev;
456 new->md_minor = MINOR(dev);
457 new->hold_active = UNTIL_STOP;
458 list_add(&new->all_mddevs, &all_mddevs);
459 spin_unlock(&all_mddevs_lock);
460 return new;
461 }
462 spin_unlock(&all_mddevs_lock);
463
464 new = kzalloc(sizeof(*new), GFP_KERNEL);
465 if (!new)
466 return NULL;
467
468 new->unit = unit;
469 if (MAJOR(unit) == MD_MAJOR)
470 new->md_minor = MINOR(unit);
471 else
472 new->md_minor = MINOR(unit) >> MdpMinorShift;
473
474 mutex_init(&new->open_mutex);
475 mutex_init(&new->reconfig_mutex);
476 mutex_init(&new->bitmap_info.mutex);
477 INIT_LIST_HEAD(&new->disks);
478 INIT_LIST_HEAD(&new->all_mddevs);
479 init_timer(&new->safemode_timer);
480 atomic_set(&new->active, 1);
481 atomic_set(&new->openers, 0);
482 atomic_set(&new->active_io, 0);
483 spin_lock_init(&new->write_lock);
484 atomic_set(&new->flush_pending, 0);
485 init_waitqueue_head(&new->sb_wait);
486 init_waitqueue_head(&new->recovery_wait);
487 new->reshape_position = MaxSector;
488 new->resync_min = 0;
489 new->resync_max = MaxSector;
490 new->level = LEVEL_NONE;
491
492 goto retry;
493}
494
495static inline int mddev_lock(mddev_t * mddev)
496{
497 return mutex_lock_interruptible(&mddev->reconfig_mutex);
498}
499
500static inline int mddev_is_locked(mddev_t *mddev)
501{
502 return mutex_is_locked(&mddev->reconfig_mutex);
503}
504
505static inline int mddev_trylock(mddev_t * mddev)
506{
507 return mutex_trylock(&mddev->reconfig_mutex);
508}
509
510static inline void mddev_unlock(mddev_t * mddev)
511{
512 mutex_unlock(&mddev->reconfig_mutex);
513
514 md_wakeup_thread(mddev->thread);
515}
516
517static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
518{
519 mdk_rdev_t *rdev;
520
521 list_for_each_entry(rdev, &mddev->disks, same_set)
522 if (rdev->desc_nr == nr)
523 return rdev;
524
525 return NULL;
526}
527
528static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
529{
530 mdk_rdev_t *rdev;
531
532 list_for_each_entry(rdev, &mddev->disks, same_set)
533 if (rdev->bdev->bd_dev == dev)
534 return rdev;
535
536 return NULL;
537}
538
539static struct mdk_personality *find_pers(int level, char *clevel)
540{
541 struct mdk_personality *pers;
542 list_for_each_entry(pers, &pers_list, list) {
543 if (level != LEVEL_NONE && pers->level == level)
544 return pers;
545 if (strcmp(pers->name, clevel)==0)
546 return pers;
547 }
548 return NULL;
549}
550
551
552static inline sector_t calc_dev_sboffset(struct block_device *bdev)
553{
554 sector_t num_sectors = bdev->bd_inode->i_size / 512;
555 return MD_NEW_SIZE_SECTORS(num_sectors);
556}
557
558static int alloc_disk_sb(mdk_rdev_t * rdev)
559{
560 if (rdev->sb_page)
561 MD_BUG();
562
563 rdev->sb_page = alloc_page(GFP_KERNEL);
564 if (!rdev->sb_page) {
565 printk(KERN_ALERT "md: out of memory.\n");
566 return -ENOMEM;
567 }
568
569 return 0;
570}
571
572static void free_disk_sb(mdk_rdev_t * rdev)
573{
574 if (rdev->sb_page) {
575 put_page(rdev->sb_page);
576 rdev->sb_loaded = 0;
577 rdev->sb_page = NULL;
578 rdev->sb_start = 0;
579 rdev->sectors = 0;
580 }
581}
582
583
584static void super_written(struct bio *bio, int error)
585{
586 mdk_rdev_t *rdev = bio->bi_private;
587 mddev_t *mddev = rdev->mddev;
588
589 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
590 printk("md: super_written gets error=%d, uptodate=%d\n",
591 error, test_bit(BIO_UPTODATE, &bio->bi_flags));
592 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
593 md_error(mddev, rdev);
594 }
595
596 if (atomic_dec_and_test(&mddev->pending_writes))
597 wake_up(&mddev->sb_wait);
598 bio_put(bio);
599}
600
601static void super_written_barrier(struct bio *bio, int error)
602{
603 struct bio *bio2 = bio->bi_private;
604 mdk_rdev_t *rdev = bio2->bi_private;
605 mddev_t *mddev = rdev->mddev;
606
607 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
608 error == -EOPNOTSUPP) {
609 unsigned long flags;
610
611 set_bit(BarriersNotsupp, &rdev->flags);
612 mddev->barriers_work = 0;
613 spin_lock_irqsave(&mddev->write_lock, flags);
614 bio2->bi_next = mddev->biolist;
615 mddev->biolist = bio2;
616 spin_unlock_irqrestore(&mddev->write_lock, flags);
617 wake_up(&mddev->sb_wait);
618 bio_put(bio);
619 } else {
620 bio_put(bio2);
621 bio->bi_private = rdev;
622 super_written(bio, error);
623 }
624}
625
626void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
627 sector_t sector, int size, struct page *page)
628{
629
630
631
632
633
634
635
636
637
638 struct bio *bio = bio_alloc(GFP_NOIO, 1);
639 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
640
641 bio->bi_bdev = rdev->bdev;
642 bio->bi_sector = sector;
643 bio_add_page(bio, page, size, 0);
644 bio->bi_private = rdev;
645 bio->bi_end_io = super_written;
646 bio->bi_rw = rw;
647
648 atomic_inc(&mddev->pending_writes);
649 if (!test_bit(BarriersNotsupp, &rdev->flags)) {
650 struct bio *rbio;
651 rw |= (1<<BIO_RW_BARRIER);
652 rbio = bio_clone(bio, GFP_NOIO);
653 rbio->bi_private = bio;
654 rbio->bi_end_io = super_written_barrier;
655 submit_bio(rw, rbio);
656 } else
657 submit_bio(rw, bio);
658}
659
660void md_super_wait(mddev_t *mddev)
661{
662
663
664
665 DEFINE_WAIT(wq);
666 for(;;) {
667 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
668 if (atomic_read(&mddev->pending_writes)==0)
669 break;
670 while (mddev->biolist) {
671 struct bio *bio;
672 spin_lock_irq(&mddev->write_lock);
673 bio = mddev->biolist;
674 mddev->biolist = bio->bi_next ;
675 bio->bi_next = NULL;
676 spin_unlock_irq(&mddev->write_lock);
677 submit_bio(bio->bi_rw, bio);
678 }
679 schedule();
680 }
681 finish_wait(&mddev->sb_wait, &wq);
682}
683
684static void bi_complete(struct bio *bio, int error)
685{
686 complete((struct completion*)bio->bi_private);
687}
688
689int sync_page_io(struct block_device *bdev, sector_t sector, int size,
690 struct page *page, int rw)
691{
692 struct bio *bio = bio_alloc(GFP_NOIO, 1);
693 struct completion event;
694 int ret;
695
696 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
697
698 bio->bi_bdev = bdev;
699 bio->bi_sector = sector;
700 bio_add_page(bio, page, size, 0);
701 init_completion(&event);
702 bio->bi_private = &event;
703 bio->bi_end_io = bi_complete;
704 submit_bio(rw, bio);
705 wait_for_completion(&event);
706
707 ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
708 bio_put(bio);
709 return ret;
710}
711EXPORT_SYMBOL_GPL(sync_page_io);
712
713static int read_disk_sb(mdk_rdev_t * rdev, int size)
714{
715 char b[BDEVNAME_SIZE];
716 if (!rdev->sb_page) {
717 MD_BUG();
718 return -EINVAL;
719 }
720 if (rdev->sb_loaded)
721 return 0;
722
723
724 if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
725 goto fail;
726 rdev->sb_loaded = 1;
727 return 0;
728
729fail:
730 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
731 bdevname(rdev->bdev,b));
732 return -EINVAL;
733}
734
735static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
736{
737 return sb1->set_uuid0 == sb2->set_uuid0 &&
738 sb1->set_uuid1 == sb2->set_uuid1 &&
739 sb1->set_uuid2 == sb2->set_uuid2 &&
740 sb1->set_uuid3 == sb2->set_uuid3;
741}
742
743static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
744{
745 int ret;
746 mdp_super_t *tmp1, *tmp2;
747
748 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
749 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
750
751 if (!tmp1 || !tmp2) {
752 ret = 0;
753 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
754 goto abort;
755 }
756
757 *tmp1 = *sb1;
758 *tmp2 = *sb2;
759
760
761
762
763 tmp1->nr_disks = 0;
764 tmp2->nr_disks = 0;
765
766 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
767abort:
768 kfree(tmp1);
769 kfree(tmp2);
770 return ret;
771}
772
773
774static u32 md_csum_fold(u32 csum)
775{
776 csum = (csum & 0xffff) + (csum >> 16);
777 return (csum & 0xffff) + (csum >> 16);
778}
779
780static unsigned int calc_sb_csum(mdp_super_t * sb)
781{
782 u64 newcsum = 0;
783 u32 *sb32 = (u32*)sb;
784 int i;
785 unsigned int disk_csum, csum;
786
787 disk_csum = sb->sb_csum;
788 sb->sb_csum = 0;
789
790 for (i = 0; i < MD_SB_BYTES/4 ; i++)
791 newcsum += sb32[i];
792 csum = (newcsum & 0xffffffff) + (newcsum>>32);
793
794
795#ifdef CONFIG_ALPHA
796
797
798
799
800
801
802
803
804 sb->sb_csum = md_csum_fold(disk_csum);
805#else
806 sb->sb_csum = disk_csum;
807#endif
808 return csum;
809}
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
/*
 * Operations for one on-disk superblock format.  The super_90_* and
 * super_1_* functions in this file match these signatures.
 */
842struct super_type {
843 char *name;
844 struct module *owner;
 /*
  * Read and check rdev's superblock.  As implemented by super_90_load /
  * super_1_load: returns 1 when there is no refdev or rdev has a higher
  * event count than refdev, 0 otherwise, negative errno on failure.
  */
845 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
846 int minor_version);
 /* Apply rdev's loaded superblock to mddev/rdev state; 0 or negative errno. */
847 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
 /* Regenerate rdev's in-memory superblock from the current mddev state. */
848 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
 /* Resize the usable area of rdev; returns 0 when the change is refused. */
849 unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev,
850 sector_t num_sectors);
851};
852
853
854
855
856
857
858
859
860
861int md_check_no_bitmap(mddev_t *mddev)
862{
863 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
864 return 0;
865 printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
866 mdname(mddev), mddev->pers->name);
867 return 1;
868}
869EXPORT_SYMBOL(md_check_no_bitmap);
870
871
872
873
874static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
875{
876 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
877 mdp_super_t *sb;
878 int ret;
879
880
881
882
883
884
885
886 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
887
888 ret = read_disk_sb(rdev, MD_SB_BYTES);
889 if (ret) return ret;
890
891 ret = -EINVAL;
892
893 bdevname(rdev->bdev, b);
894 sb = (mdp_super_t*)page_address(rdev->sb_page);
895
896 if (sb->md_magic != MD_SB_MAGIC) {
897 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
898 b);
899 goto abort;
900 }
901
902 if (sb->major_version != 0 ||
903 sb->minor_version < 90 ||
904 sb->minor_version > 91) {
905 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
906 sb->major_version, sb->minor_version,
907 b);
908 goto abort;
909 }
910
911 if (sb->raid_disks <= 0)
912 goto abort;
913
914 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
915 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
916 b);
917 goto abort;
918 }
919
920 rdev->preferred_minor = sb->md_minor;
921 rdev->data_offset = 0;
922 rdev->sb_size = MD_SB_BYTES;
923
924 if (sb->level == LEVEL_MULTIPATH)
925 rdev->desc_nr = -1;
926 else
927 rdev->desc_nr = sb->this_disk.number;
928
929 if (!refdev) {
930 ret = 1;
931 } else {
932 __u64 ev1, ev2;
933 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
934 if (!uuid_equal(refsb, sb)) {
935 printk(KERN_WARNING "md: %s has different UUID to %s\n",
936 b, bdevname(refdev->bdev,b2));
937 goto abort;
938 }
939 if (!sb_equal(refsb, sb)) {
940 printk(KERN_WARNING "md: %s has same UUID"
941 " but different superblock to %s\n",
942 b, bdevname(refdev->bdev, b2));
943 goto abort;
944 }
945 ev1 = md_event(sb);
946 ev2 = md_event(refsb);
947 if (ev1 > ev2)
948 ret = 1;
949 else
950 ret = 0;
951 }
952 rdev->sectors = rdev->sb_start;
953
954 if (rdev->sectors < sb->size * 2 && sb->level > 1)
955
956 ret = -EINVAL;
957
958 abort:
959 return ret;
960}
961
962
963
964
965static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
966{
967 mdp_disk_t *desc;
968 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
969 __u64 ev1 = md_event(sb);
970
971 rdev->raid_disk = -1;
972 clear_bit(Faulty, &rdev->flags);
973 clear_bit(In_sync, &rdev->flags);
974 clear_bit(WriteMostly, &rdev->flags);
975 clear_bit(BarriersNotsupp, &rdev->flags);
976
977 if (mddev->raid_disks == 0) {
978 mddev->major_version = 0;
979 mddev->minor_version = sb->minor_version;
980 mddev->patch_version = sb->patch_version;
981 mddev->external = 0;
982 mddev->chunk_sectors = sb->chunk_size >> 9;
983 mddev->ctime = sb->ctime;
984 mddev->utime = sb->utime;
985 mddev->level = sb->level;
986 mddev->clevel[0] = 0;
987 mddev->layout = sb->layout;
988 mddev->raid_disks = sb->raid_disks;
989 mddev->dev_sectors = sb->size * 2;
990 mddev->events = ev1;
991 mddev->bitmap_info.offset = 0;
992 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
993
994 if (mddev->minor_version >= 91) {
995 mddev->reshape_position = sb->reshape_position;
996 mddev->delta_disks = sb->delta_disks;
997 mddev->new_level = sb->new_level;
998 mddev->new_layout = sb->new_layout;
999 mddev->new_chunk_sectors = sb->new_chunk >> 9;
1000 } else {
1001 mddev->reshape_position = MaxSector;
1002 mddev->delta_disks = 0;
1003 mddev->new_level = mddev->level;
1004 mddev->new_layout = mddev->layout;
1005 mddev->new_chunk_sectors = mddev->chunk_sectors;
1006 }
1007
1008 if (sb->state & (1<<MD_SB_CLEAN))
1009 mddev->recovery_cp = MaxSector;
1010 else {
1011 if (sb->events_hi == sb->cp_events_hi &&
1012 sb->events_lo == sb->cp_events_lo) {
1013 mddev->recovery_cp = sb->recovery_cp;
1014 } else
1015 mddev->recovery_cp = 0;
1016 }
1017
1018 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1019 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1020 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1021 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1022
1023 mddev->max_disks = MD_SB_DISKS;
1024
1025 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1026 mddev->bitmap_info.file == NULL)
1027 mddev->bitmap_info.offset =
1028 mddev->bitmap_info.default_offset;
1029
1030 } else if (mddev->pers == NULL) {
1031
1032 ++ev1;
1033 if (ev1 < mddev->events)
1034 return -EINVAL;
1035 } else if (mddev->bitmap) {
1036
1037
1038
1039 if (ev1 < mddev->bitmap->events_cleared)
1040 return 0;
1041 } else {
1042 if (ev1 < mddev->events)
1043
1044 return 0;
1045 }
1046
1047 if (mddev->level != LEVEL_MULTIPATH) {
1048 desc = sb->disks + rdev->desc_nr;
1049
1050 if (desc->state & (1<<MD_DISK_FAULTY))
1051 set_bit(Faulty, &rdev->flags);
1052 else if (desc->state & (1<<MD_DISK_SYNC)
1053) {
1054 set_bit(In_sync, &rdev->flags);
1055 rdev->raid_disk = desc->raid_disk;
1056 } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1057
1058
1059
1060 if (mddev->minor_version >= 91) {
1061 rdev->recovery_offset = 0;
1062 rdev->raid_disk = desc->raid_disk;
1063 }
1064 }
1065 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1066 set_bit(WriteMostly, &rdev->flags);
1067 } else
1068 set_bit(In_sync, &rdev->flags);
1069 return 0;
1070}
1071
1072
1073
1074
1075static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1076{
1077 mdp_super_t *sb;
1078 mdk_rdev_t *rdev2;
1079 int next_spare = mddev->raid_disks;
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092 int i;
1093 int active=0, working=0,failed=0,spare=0,nr_disks=0;
1094
1095 rdev->sb_size = MD_SB_BYTES;
1096
1097 sb = (mdp_super_t*)page_address(rdev->sb_page);
1098
1099 memset(sb, 0, sizeof(*sb));
1100
1101 sb->md_magic = MD_SB_MAGIC;
1102 sb->major_version = mddev->major_version;
1103 sb->patch_version = mddev->patch_version;
1104 sb->gvalid_words = 0;
1105 memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1106 memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1107 memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1108 memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1109
1110 sb->ctime = mddev->ctime;
1111 sb->level = mddev->level;
1112 sb->size = mddev->dev_sectors / 2;
1113 sb->raid_disks = mddev->raid_disks;
1114 sb->md_minor = mddev->md_minor;
1115 sb->not_persistent = 0;
1116 sb->utime = mddev->utime;
1117 sb->state = 0;
1118 sb->events_hi = (mddev->events>>32);
1119 sb->events_lo = (u32)mddev->events;
1120
1121 if (mddev->reshape_position == MaxSector)
1122 sb->minor_version = 90;
1123 else {
1124 sb->minor_version = 91;
1125 sb->reshape_position = mddev->reshape_position;
1126 sb->new_level = mddev->new_level;
1127 sb->delta_disks = mddev->delta_disks;
1128 sb->new_layout = mddev->new_layout;
1129 sb->new_chunk = mddev->new_chunk_sectors << 9;
1130 }
1131 mddev->minor_version = sb->minor_version;
1132 if (mddev->in_sync)
1133 {
1134 sb->recovery_cp = mddev->recovery_cp;
1135 sb->cp_events_hi = (mddev->events>>32);
1136 sb->cp_events_lo = (u32)mddev->events;
1137 if (mddev->recovery_cp == MaxSector)
1138 sb->state = (1<< MD_SB_CLEAN);
1139 } else
1140 sb->recovery_cp = 0;
1141
1142 sb->layout = mddev->layout;
1143 sb->chunk_size = mddev->chunk_sectors << 9;
1144
1145 if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1146 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1147
1148 sb->disks[0].state = (1<<MD_DISK_REMOVED);
1149 list_for_each_entry(rdev2, &mddev->disks, same_set) {
1150 mdp_disk_t *d;
1151 int desc_nr;
1152 int is_active = test_bit(In_sync, &rdev2->flags);
1153
1154 if (rdev2->raid_disk >= 0 &&
1155 sb->minor_version >= 91)
1156
1157
1158
1159
1160 is_active = 1;
1161 if (rdev2->raid_disk < 0 ||
1162 test_bit(Faulty, &rdev2->flags))
1163 is_active = 0;
1164 if (is_active)
1165 desc_nr = rdev2->raid_disk;
1166 else
1167 desc_nr = next_spare++;
1168 rdev2->desc_nr = desc_nr;
1169 d = &sb->disks[rdev2->desc_nr];
1170 nr_disks++;
1171 d->number = rdev2->desc_nr;
1172 d->major = MAJOR(rdev2->bdev->bd_dev);
1173 d->minor = MINOR(rdev2->bdev->bd_dev);
1174 if (is_active)
1175 d->raid_disk = rdev2->raid_disk;
1176 else
1177 d->raid_disk = rdev2->desc_nr;
1178 if (test_bit(Faulty, &rdev2->flags))
1179 d->state = (1<<MD_DISK_FAULTY);
1180 else if (is_active) {
1181 d->state = (1<<MD_DISK_ACTIVE);
1182 if (test_bit(In_sync, &rdev2->flags))
1183 d->state |= (1<<MD_DISK_SYNC);
1184 active++;
1185 working++;
1186 } else {
1187 d->state = 0;
1188 spare++;
1189 working++;
1190 }
1191 if (test_bit(WriteMostly, &rdev2->flags))
1192 d->state |= (1<<MD_DISK_WRITEMOSTLY);
1193 }
1194
1195 for (i=0 ; i < mddev->raid_disks ; i++) {
1196 mdp_disk_t *d = &sb->disks[i];
1197 if (d->state == 0 && d->number == 0) {
1198 d->number = i;
1199 d->raid_disk = i;
1200 d->state = (1<<MD_DISK_REMOVED);
1201 d->state |= (1<<MD_DISK_FAULTY);
1202 failed++;
1203 }
1204 }
1205 sb->nr_disks = nr_disks;
1206 sb->active_disks = active;
1207 sb->working_disks = working;
1208 sb->failed_disks = failed;
1209 sb->spare_disks = spare;
1210
1211 sb->this_disk = sb->disks[rdev->desc_nr];
1212 sb->sb_csum = calc_sb_csum(sb);
1213}
1214
1215
1216
1217
1218static unsigned long long
1219super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1220{
1221 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1222 return 0;
1223 if (rdev->mddev->bitmap_info.offset)
1224 return 0;
1225 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
1226 if (!num_sectors || num_sectors > rdev->sb_start)
1227 num_sectors = rdev->sb_start;
1228 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1229 rdev->sb_page);
1230 md_super_wait(rdev->mddev);
1231 return num_sectors / 2;
1232}
1233
1234
1235
1236
1237
1238
1239static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1240{
1241 __le32 disk_csum;
1242 u32 csum;
1243 unsigned long long newcsum;
1244 int size = 256 + le32_to_cpu(sb->max_dev)*2;
1245 __le32 *isuper = (__le32*)sb;
1246 int i;
1247
1248 disk_csum = sb->sb_csum;
1249 sb->sb_csum = 0;
1250 newcsum = 0;
1251 for (i=0; size>=4; size -= 4 )
1252 newcsum += le32_to_cpu(*isuper++);
1253
1254 if (size == 2)
1255 newcsum += le16_to_cpu(*(__le16*) isuper);
1256
1257 csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1258 sb->sb_csum = disk_csum;
1259 return cpu_to_le32(csum);
1260}
1261
1262static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1263{
1264 struct mdp_superblock_1 *sb;
1265 int ret;
1266 sector_t sb_start;
1267 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1268 int bmask;
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278 switch(minor_version) {
1279 case 0:
1280 sb_start = rdev->bdev->bd_inode->i_size >> 9;
1281 sb_start -= 8*2;
1282 sb_start &= ~(sector_t)(4*2-1);
1283 break;
1284 case 1:
1285 sb_start = 0;
1286 break;
1287 case 2:
1288 sb_start = 8;
1289 break;
1290 default:
1291 return -EINVAL;
1292 }
1293 rdev->sb_start = sb_start;
1294
1295
1296
1297
1298 ret = read_disk_sb(rdev, 4096);
1299 if (ret) return ret;
1300
1301
1302 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1303
1304 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1305 sb->major_version != cpu_to_le32(1) ||
1306 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1307 le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1308 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1309 return -EINVAL;
1310
1311 if (calc_sb_1_csum(sb) != sb->sb_csum) {
1312 printk("md: invalid superblock checksum on %s\n",
1313 bdevname(rdev->bdev,b));
1314 return -EINVAL;
1315 }
1316 if (le64_to_cpu(sb->data_size) < 10) {
1317 printk("md: data_size too small on %s\n",
1318 bdevname(rdev->bdev,b));
1319 return -EINVAL;
1320 }
1321
1322 rdev->preferred_minor = 0xffff;
1323 rdev->data_offset = le64_to_cpu(sb->data_offset);
1324 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1325
1326 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1327 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1328 if (rdev->sb_size & bmask)
1329 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1330
1331 if (minor_version
1332 && rdev->data_offset < sb_start + (rdev->sb_size/512))
1333 return -EINVAL;
1334
1335 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1336 rdev->desc_nr = -1;
1337 else
1338 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1339
1340 if (!refdev) {
1341 ret = 1;
1342 } else {
1343 __u64 ev1, ev2;
1344 struct mdp_superblock_1 *refsb =
1345 (struct mdp_superblock_1*)page_address(refdev->sb_page);
1346
1347 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1348 sb->level != refsb->level ||
1349 sb->layout != refsb->layout ||
1350 sb->chunksize != refsb->chunksize) {
1351 printk(KERN_WARNING "md: %s has strangely different"
1352 " superblock to %s\n",
1353 bdevname(rdev->bdev,b),
1354 bdevname(refdev->bdev,b2));
1355 return -EINVAL;
1356 }
1357 ev1 = le64_to_cpu(sb->events);
1358 ev2 = le64_to_cpu(refsb->events);
1359
1360 if (ev1 > ev2)
1361 ret = 1;
1362 else
1363 ret = 0;
1364 }
1365 if (minor_version)
1366 rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
1367 le64_to_cpu(sb->data_offset);
1368 else
1369 rdev->sectors = rdev->sb_start;
1370 if (rdev->sectors < le64_to_cpu(sb->data_size))
1371 return -EINVAL;
1372 rdev->sectors = le64_to_cpu(sb->data_size);
1373 if (le64_to_cpu(sb->size) > rdev->sectors)
1374 return -EINVAL;
1375 return ret;
1376}
1377
1378static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1379{
1380 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1381 __u64 ev1 = le64_to_cpu(sb->events);
1382
1383 rdev->raid_disk = -1;
1384 clear_bit(Faulty, &rdev->flags);
1385 clear_bit(In_sync, &rdev->flags);
1386 clear_bit(WriteMostly, &rdev->flags);
1387 clear_bit(BarriersNotsupp, &rdev->flags);
1388
1389 if (mddev->raid_disks == 0) {
1390 mddev->major_version = 1;
1391 mddev->patch_version = 0;
1392 mddev->external = 0;
1393 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1394 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1395 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1396 mddev->level = le32_to_cpu(sb->level);
1397 mddev->clevel[0] = 0;
1398 mddev->layout = le32_to_cpu(sb->layout);
1399 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1400 mddev->dev_sectors = le64_to_cpu(sb->size);
1401 mddev->events = ev1;
1402 mddev->bitmap_info.offset = 0;
1403 mddev->bitmap_info.default_offset = 1024 >> 9;
1404
1405 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1406 memcpy(mddev->uuid, sb->set_uuid, 16);
1407
1408 mddev->max_disks = (4096-256)/2;
1409
1410 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1411 mddev->bitmap_info.file == NULL )
1412 mddev->bitmap_info.offset =
1413 (__s32)le32_to_cpu(sb->bitmap_offset);
1414
1415 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1416 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1417 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1418 mddev->new_level = le32_to_cpu(sb->new_level);
1419 mddev->new_layout = le32_to_cpu(sb->new_layout);
1420 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1421 } else {
1422 mddev->reshape_position = MaxSector;
1423 mddev->delta_disks = 0;
1424 mddev->new_level = mddev->level;
1425 mddev->new_layout = mddev->layout;
1426 mddev->new_chunk_sectors = mddev->chunk_sectors;
1427 }
1428
1429 } else if (mddev->pers == NULL) {
1430
1431 ++ev1;
1432 if (ev1 < mddev->events)
1433 return -EINVAL;
1434 } else if (mddev->bitmap) {
1435
1436
1437
1438 if (ev1 < mddev->bitmap->events_cleared)
1439 return 0;
1440 } else {
1441 if (ev1 < mddev->events)
1442
1443 return 0;
1444 }
1445 if (mddev->level != LEVEL_MULTIPATH) {
1446 int role;
1447 if (rdev->desc_nr < 0 ||
1448 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1449 role = 0xffff;
1450 rdev->desc_nr = -1;
1451 } else
1452 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1453 switch(role) {
1454 case 0xffff:
1455 break;
1456 case 0xfffe:
1457 set_bit(Faulty, &rdev->flags);
1458 break;
1459 default:
1460 if ((le32_to_cpu(sb->feature_map) &
1461 MD_FEATURE_RECOVERY_OFFSET))
1462 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1463 else
1464 set_bit(In_sync, &rdev->flags);
1465 rdev->raid_disk = role;
1466 break;
1467 }
1468 if (sb->devflags & WriteMostly1)
1469 set_bit(WriteMostly, &rdev->flags);
1470 } else
1471 set_bit(In_sync, &rdev->flags);
1472
1473 return 0;
1474}
1475
1476static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1477{
1478 struct mdp_superblock_1 *sb;
1479 mdk_rdev_t *rdev2;
1480 int max_dev, i;
1481
1482
1483 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1484
1485 sb->feature_map = 0;
1486 sb->pad0 = 0;
1487 sb->recovery_offset = cpu_to_le64(0);
1488 memset(sb->pad1, 0, sizeof(sb->pad1));
1489 memset(sb->pad2, 0, sizeof(sb->pad2));
1490 memset(sb->pad3, 0, sizeof(sb->pad3));
1491
1492 sb->utime = cpu_to_le64((__u64)mddev->utime);
1493 sb->events = cpu_to_le64(mddev->events);
1494 if (mddev->in_sync)
1495 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1496 else
1497 sb->resync_offset = cpu_to_le64(0);
1498
1499 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1500
1501 sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1502 sb->size = cpu_to_le64(mddev->dev_sectors);
1503 sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1504 sb->level = cpu_to_le32(mddev->level);
1505 sb->layout = cpu_to_le32(mddev->layout);
1506
1507 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1508 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1509 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1510 }
1511
1512 if (rdev->raid_disk >= 0 &&
1513 !test_bit(In_sync, &rdev->flags)) {
1514 sb->feature_map |=
1515 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1516 sb->recovery_offset =
1517 cpu_to_le64(rdev->recovery_offset);
1518 }
1519
1520 if (mddev->reshape_position != MaxSector) {
1521 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1522 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1523 sb->new_layout = cpu_to_le32(mddev->new_layout);
1524 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1525 sb->new_level = cpu_to_le32(mddev->new_level);
1526 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1527 }
1528
1529 max_dev = 0;
1530 list_for_each_entry(rdev2, &mddev->disks, same_set)
1531 if (rdev2->desc_nr+1 > max_dev)
1532 max_dev = rdev2->desc_nr+1;
1533
1534 if (max_dev > le32_to_cpu(sb->max_dev)) {
1535 int bmask;
1536 sb->max_dev = cpu_to_le32(max_dev);
1537 rdev->sb_size = max_dev * 2 + 256;
1538 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1539 if (rdev->sb_size & bmask)
1540 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1541 }
1542 for (i=0; i<max_dev;i++)
1543 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1544
1545 list_for_each_entry(rdev2, &mddev->disks, same_set) {
1546 i = rdev2->desc_nr;
1547 if (test_bit(Faulty, &rdev2->flags))
1548 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1549 else if (test_bit(In_sync, &rdev2->flags))
1550 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1551 else if (rdev2->raid_disk >= 0)
1552 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1553 else
1554 sb->dev_roles[i] = cpu_to_le16(0xffff);
1555 }
1556
1557 sb->sb_csum = calc_sb_1_csum(sb);
1558}
1559
1560static unsigned long long
1561super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1562{
1563 struct mdp_superblock_1 *sb;
1564 sector_t max_sectors;
1565 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1566 return 0;
1567 if (rdev->sb_start < rdev->data_offset) {
1568
1569 max_sectors = rdev->bdev->bd_inode->i_size >> 9;
1570 max_sectors -= rdev->data_offset;
1571 if (!num_sectors || num_sectors > max_sectors)
1572 num_sectors = max_sectors;
1573 } else if (rdev->mddev->bitmap_info.offset) {
1574
1575 return 0;
1576 } else {
1577
1578 sector_t sb_start;
1579 sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
1580 sb_start &= ~(sector_t)(4*2 - 1);
1581 max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1582 if (!num_sectors || num_sectors > max_sectors)
1583 num_sectors = max_sectors;
1584 rdev->sb_start = sb_start;
1585 }
1586 sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
1587 sb->data_size = cpu_to_le64(num_sectors);
1588 sb->super_offset = rdev->sb_start;
1589 sb->sb_csum = calc_sb_1_csum(sb);
1590 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1591 rdev->sb_page);
1592 md_super_wait(rdev->mddev);
1593 return num_sectors / 2;
1594}
1595
1596static struct super_type super_types[] = {
1597 [0] = {
1598 .name = "0.90.0",
1599 .owner = THIS_MODULE,
1600 .load_super = super_90_load,
1601 .validate_super = super_90_validate,
1602 .sync_super = super_90_sync,
1603 .rdev_size_change = super_90_rdev_size_change,
1604 },
1605 [1] = {
1606 .name = "md-1",
1607 .owner = THIS_MODULE,
1608 .load_super = super_1_load,
1609 .validate_super = super_1_validate,
1610 .sync_super = super_1_sync,
1611 .rdev_size_change = super_1_rdev_size_change,
1612 },
1613};
1614
1615static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1616{
1617 mdk_rdev_t *rdev, *rdev2;
1618
1619 rcu_read_lock();
1620 rdev_for_each_rcu(rdev, mddev1)
1621 rdev_for_each_rcu(rdev2, mddev2)
1622 if (rdev->bdev->bd_contains ==
1623 rdev2->bdev->bd_contains) {
1624 rcu_read_unlock();
1625 return 1;
1626 }
1627 rcu_read_unlock();
1628 return 0;
1629}
1630
1631static LIST_HEAD(pending_raid_disks);
1632
1633
1634
1635
1636
1637
1638
1639
1640int md_integrity_register(mddev_t *mddev)
1641{
1642 mdk_rdev_t *rdev, *reference = NULL;
1643
1644 if (list_empty(&mddev->disks))
1645 return 0;
1646 if (blk_get_integrity(mddev->gendisk))
1647 return 0;
1648 list_for_each_entry(rdev, &mddev->disks, same_set) {
1649
1650 if (test_bit(Faulty, &rdev->flags))
1651 continue;
1652 if (rdev->raid_disk < 0)
1653 continue;
1654
1655
1656
1657
1658 if (!bdev_get_integrity(rdev->bdev))
1659 return -EINVAL;
1660 if (!reference) {
1661
1662 reference = rdev;
1663 continue;
1664 }
1665
1666 if (blk_integrity_compare(reference->bdev->bd_disk,
1667 rdev->bdev->bd_disk) < 0)
1668 return -EINVAL;
1669 }
1670
1671
1672
1673
1674 if (blk_integrity_register(mddev->gendisk,
1675 bdev_get_integrity(reference->bdev)) != 0) {
1676 printk(KERN_ERR "md: failed to register integrity for %s\n",
1677 mdname(mddev));
1678 return -EINVAL;
1679 }
1680 printk(KERN_NOTICE "md: data integrity on %s enabled\n",
1681 mdname(mddev));
1682 return 0;
1683}
1684EXPORT_SYMBOL(md_integrity_register);
1685
1686
1687void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
1688{
1689 struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
1690 struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
1691
1692 if (!bi_mddev)
1693 return;
1694 if (rdev->raid_disk < 0)
1695 return;
1696 if (bi_rdev && blk_integrity_compare(mddev->gendisk,
1697 rdev->bdev->bd_disk) >= 0)
1698 return;
1699 printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
1700 blk_integrity_unregister(mddev->gendisk);
1701}
1702EXPORT_SYMBOL(md_integrity_add_rdev);
1703
1704static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1705{
1706 char b[BDEVNAME_SIZE];
1707 struct kobject *ko;
1708 char *s;
1709 int err;
1710
1711 if (rdev->mddev) {
1712 MD_BUG();
1713 return -EINVAL;
1714 }
1715
1716
1717 if (find_rdev(mddev, rdev->bdev->bd_dev))
1718 return -EEXIST;
1719
1720
1721 if (rdev->sectors && (mddev->dev_sectors == 0 ||
1722 rdev->sectors < mddev->dev_sectors)) {
1723 if (mddev->pers) {
1724
1725
1726
1727
1728 if (mddev->level > 0)
1729 return -ENOSPC;
1730 } else
1731 mddev->dev_sectors = rdev->sectors;
1732 }
1733
1734
1735
1736
1737
1738 if (rdev->desc_nr < 0) {
1739 int choice = 0;
1740 if (mddev->pers) choice = mddev->raid_disks;
1741 while (find_rdev_nr(mddev, choice))
1742 choice++;
1743 rdev->desc_nr = choice;
1744 } else {
1745 if (find_rdev_nr(mddev, rdev->desc_nr))
1746 return -EBUSY;
1747 }
1748 if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
1749 printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
1750 mdname(mddev), mddev->max_disks);
1751 return -EBUSY;
1752 }
1753 bdevname(rdev->bdev,b);
1754 while ( (s=strchr(b, '/')) != NULL)
1755 *s = '!';
1756
1757 rdev->mddev = mddev;
1758 printk(KERN_INFO "md: bind<%s>\n", b);
1759
1760 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
1761 goto fail;
1762
1763 ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
1764 if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
1765 kobject_del(&rdev->kobj);
1766 goto fail;
1767 }
1768 rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state");
1769
1770 list_add_rcu(&rdev->same_set, &mddev->disks);
1771 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
1772
1773
1774 mddev->recovery_disabled = 0;
1775
1776 return 0;
1777
1778 fail:
1779 printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
1780 b, mdname(mddev));
1781 return err;
1782}
1783
1784static void md_delayed_delete(struct work_struct *ws)
1785{
1786 mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
1787 kobject_del(&rdev->kobj);
1788 kobject_put(&rdev->kobj);
1789}
1790
1791static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1792{
1793 char b[BDEVNAME_SIZE];
1794 if (!rdev->mddev) {
1795 MD_BUG();
1796 return;
1797 }
1798 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1799 list_del_rcu(&rdev->same_set);
1800 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1801 rdev->mddev = NULL;
1802 sysfs_remove_link(&rdev->kobj, "block");
1803 sysfs_put(rdev->sysfs_state);
1804 rdev->sysfs_state = NULL;
1805
1806
1807
1808
1809 synchronize_rcu();
1810 INIT_WORK(&rdev->del_work, md_delayed_delete);
1811 kobject_get(&rdev->kobj);
1812 schedule_work(&rdev->del_work);
1813}
1814
1815
1816
1817
1818
1819
1820static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
1821{
1822 int err = 0;
1823 struct block_device *bdev;
1824 char b[BDEVNAME_SIZE];
1825
1826 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1827 if (IS_ERR(bdev)) {
1828 printk(KERN_ERR "md: could not open %s.\n",
1829 __bdevname(dev, b));
1830 return PTR_ERR(bdev);
1831 }
1832 err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
1833 if (err) {
1834 printk(KERN_ERR "md: could not bd_claim %s.\n",
1835 bdevname(bdev, b));
1836 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1837 return err;
1838 }
1839 if (!shared)
1840 set_bit(AllReserved, &rdev->flags);
1841 rdev->bdev = bdev;
1842 return err;
1843}
1844
1845static void unlock_rdev(mdk_rdev_t *rdev)
1846{
1847 struct block_device *bdev = rdev->bdev;
1848 rdev->bdev = NULL;
1849 if (!bdev)
1850 MD_BUG();
1851 bd_release(bdev);
1852 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1853}
1854
1855void md_autodetect_dev(dev_t dev);
1856
1857static void export_rdev(mdk_rdev_t * rdev)
1858{
1859 char b[BDEVNAME_SIZE];
1860 printk(KERN_INFO "md: export_rdev(%s)\n",
1861 bdevname(rdev->bdev,b));
1862 if (rdev->mddev)
1863 MD_BUG();
1864 free_disk_sb(rdev);
1865#ifndef MODULE
1866 if (test_bit(AutoDetected, &rdev->flags))
1867 md_autodetect_dev(rdev->bdev->bd_dev);
1868#endif
1869 unlock_rdev(rdev);
1870 kobject_put(&rdev->kobj);
1871}
1872
1873static void kick_rdev_from_array(mdk_rdev_t * rdev)
1874{
1875 unbind_rdev_from_array(rdev);
1876 export_rdev(rdev);
1877}
1878
1879static void export_array(mddev_t *mddev)
1880{
1881 mdk_rdev_t *rdev, *tmp;
1882
1883 rdev_for_each(rdev, tmp, mddev) {
1884 if (!rdev->mddev) {
1885 MD_BUG();
1886 continue;
1887 }
1888 kick_rdev_from_array(rdev);
1889 }
1890 if (!list_empty(&mddev->disks))
1891 MD_BUG();
1892 mddev->raid_disks = 0;
1893 mddev->major_version = 0;
1894}
1895
1896static void print_desc(mdp_disk_t *desc)
1897{
1898 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
1899 desc->major,desc->minor,desc->raid_disk,desc->state);
1900}
1901
1902static void print_sb_90(mdp_super_t *sb)
1903{
1904 int i;
1905
1906 printk(KERN_INFO
1907 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1908 sb->major_version, sb->minor_version, sb->patch_version,
1909 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1910 sb->ctime);
1911 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1912 sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1913 sb->md_minor, sb->layout, sb->chunk_size);
1914 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
1915 " FD:%d SD:%d CSUM:%08x E:%08lx\n",
1916 sb->utime, sb->state, sb->active_disks, sb->working_disks,
1917 sb->failed_disks, sb->spare_disks,
1918 sb->sb_csum, (unsigned long)sb->events_lo);
1919
1920 printk(KERN_INFO);
1921 for (i = 0; i < MD_SB_DISKS; i++) {
1922 mdp_disk_t *desc;
1923
1924 desc = sb->disks + i;
1925 if (desc->number || desc->major || desc->minor ||
1926 desc->raid_disk || (desc->state && (desc->state != 4))) {
1927 printk(" D %2d: ", i);
1928 print_desc(desc);
1929 }
1930 }
1931 printk(KERN_INFO "md: THIS: ");
1932 print_desc(&sb->this_disk);
1933}
1934
1935static void print_sb_1(struct mdp_superblock_1 *sb)
1936{
1937 __u8 *uuid;
1938
1939 uuid = sb->set_uuid;
1940 printk(KERN_INFO
1941 "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n"
1942 "md: Name: \"%s\" CT:%llu\n",
1943 le32_to_cpu(sb->major_version),
1944 le32_to_cpu(sb->feature_map),
1945 uuid,
1946 sb->set_name,
1947 (unsigned long long)le64_to_cpu(sb->ctime)
1948 & MD_SUPERBLOCK_1_TIME_SEC_MASK);
1949
1950 uuid = sb->device_uuid;
1951 printk(KERN_INFO
1952 "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
1953 " RO:%llu\n"
1954 "md: Dev:%08x UUID: %pU\n"
1955 "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
1956 "md: (MaxDev:%u) \n",
1957 le32_to_cpu(sb->level),
1958 (unsigned long long)le64_to_cpu(sb->size),
1959 le32_to_cpu(sb->raid_disks),
1960 le32_to_cpu(sb->layout),
1961 le32_to_cpu(sb->chunksize),
1962 (unsigned long long)le64_to_cpu(sb->data_offset),
1963 (unsigned long long)le64_to_cpu(sb->data_size),
1964 (unsigned long long)le64_to_cpu(sb->super_offset),
1965 (unsigned long long)le64_to_cpu(sb->recovery_offset),
1966 le32_to_cpu(sb->dev_number),
1967 uuid,
1968 sb->devflags,
1969 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
1970 (unsigned long long)le64_to_cpu(sb->events),
1971 (unsigned long long)le64_to_cpu(sb->resync_offset),
1972 le32_to_cpu(sb->sb_csum),
1973 le32_to_cpu(sb->max_dev)
1974 );
1975}
1976
1977static void print_rdev(mdk_rdev_t *rdev, int major_version)
1978{
1979 char b[BDEVNAME_SIZE];
1980 printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
1981 bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
1982 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1983 rdev->desc_nr);
1984 if (rdev->sb_loaded) {
1985 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
1986 switch (major_version) {
1987 case 0:
1988 print_sb_90((mdp_super_t*)page_address(rdev->sb_page));
1989 break;
1990 case 1:
1991 print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page));
1992 break;
1993 }
1994 } else
1995 printk(KERN_INFO "md: no rdev superblock!\n");
1996}
1997
1998static void md_print_devices(void)
1999{
2000 struct list_head *tmp;
2001 mdk_rdev_t *rdev;
2002 mddev_t *mddev;
2003 char b[BDEVNAME_SIZE];
2004
2005 printk("\n");
2006 printk("md: **********************************\n");
2007 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
2008 printk("md: **********************************\n");
2009 for_each_mddev(mddev, tmp) {
2010
2011 if (mddev->bitmap)
2012 bitmap_print_sb(mddev->bitmap);
2013 else
2014 printk("%s: ", mdname(mddev));
2015 list_for_each_entry(rdev, &mddev->disks, same_set)
2016 printk("<%s>", bdevname(rdev->bdev,b));
2017 printk("\n");
2018
2019 list_for_each_entry(rdev, &mddev->disks, same_set)
2020 print_rdev(rdev, mddev->major_version);
2021 }
2022 printk("md: **********************************\n");
2023 printk("\n");
2024}
2025
2026
2027static void sync_sbs(mddev_t * mddev, int nospares)
2028{
2029
2030
2031
2032
2033
2034
2035 mdk_rdev_t *rdev;
2036
2037
2038 list_for_each_entry(rdev, &mddev->disks, same_set) {
2039 if (rdev->raid_disk >= 0 &&
2040 !test_bit(In_sync, &rdev->flags) &&
2041 mddev->curr_resync_completed > rdev->recovery_offset)
2042 rdev->recovery_offset = mddev->curr_resync_completed;
2043
2044 }
2045 list_for_each_entry(rdev, &mddev->disks, same_set) {
2046 if (rdev->sb_events == mddev->events ||
2047 (nospares &&
2048 rdev->raid_disk < 0 &&
2049 (rdev->sb_events&1)==0 &&
2050 rdev->sb_events+1 == mddev->events)) {
2051
2052 rdev->sb_loaded = 2;
2053 } else {
2054 super_types[mddev->major_version].
2055 sync_super(mddev, rdev);
2056 rdev->sb_loaded = 1;
2057 }
2058 }
2059}
2060
2061static void md_update_sb(mddev_t * mddev, int force_change)
2062{
2063 mdk_rdev_t *rdev;
2064 int sync_req;
2065 int nospares = 0;
2066
2067 mddev->utime = get_seconds();
2068 if (mddev->external)
2069 return;
2070repeat:
2071 spin_lock_irq(&mddev->write_lock);
2072
2073 set_bit(MD_CHANGE_PENDING, &mddev->flags);
2074 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
2075 force_change = 1;
2076 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
2077
2078
2079
2080
2081 nospares = 1;
2082 if (force_change)
2083 nospares = 0;
2084 if (mddev->degraded)
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094 nospares = 0;
2095
2096 sync_req = mddev->in_sync;
2097
2098
2099
2100 if (nospares
2101 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2102 && (mddev->events & 1)
2103 && mddev->events != 1)
2104 mddev->events--;
2105 else {
2106
2107 mddev->events ++;
2108 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) {
2109
2110
2111 if ((mddev->events&1)==0)
2112 nospares = 0;
2113 } else {
2114
2115 if ((mddev->events&1))
2116 nospares = 0;
2117 }
2118 }
2119
2120 if (!mddev->events) {
2121
2122
2123
2124
2125
2126 MD_BUG();
2127 mddev->events --;
2128 }
2129
2130
2131
2132
2133
2134 if (!mddev->persistent) {
2135 if (!mddev->external)
2136 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2137
2138 spin_unlock_irq(&mddev->write_lock);
2139 wake_up(&mddev->sb_wait);
2140 return;
2141 }
2142 sync_sbs(mddev, nospares);
2143 spin_unlock_irq(&mddev->write_lock);
2144
2145 dprintk(KERN_INFO
2146 "md: updating %s RAID superblock on device (in sync %d)\n",
2147 mdname(mddev),mddev->in_sync);
2148
2149 bitmap_update_sb(mddev->bitmap);
2150 list_for_each_entry(rdev, &mddev->disks, same_set) {
2151 char b[BDEVNAME_SIZE];
2152 dprintk(KERN_INFO "md: ");
2153 if (rdev->sb_loaded != 1)
2154 continue;
2155 if (test_bit(Faulty, &rdev->flags))
2156 dprintk("(skipping faulty ");
2157
2158 dprintk("%s ", bdevname(rdev->bdev,b));
2159 if (!test_bit(Faulty, &rdev->flags)) {
2160 md_super_write(mddev,rdev,
2161 rdev->sb_start, rdev->sb_size,
2162 rdev->sb_page);
2163 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
2164 bdevname(rdev->bdev,b),
2165 (unsigned long long)rdev->sb_start);
2166 rdev->sb_events = mddev->events;
2167
2168 } else
2169 dprintk(")\n");
2170 if (mddev->level == LEVEL_MULTIPATH)
2171
2172 break;
2173 }
2174 md_super_wait(mddev);
2175
2176
2177 spin_lock_irq(&mddev->write_lock);
2178 if (mddev->in_sync != sync_req ||
2179 test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2180
2181 spin_unlock_irq(&mddev->write_lock);
2182 goto repeat;
2183 }
2184 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2185 spin_unlock_irq(&mddev->write_lock);
2186 wake_up(&mddev->sb_wait);
2187 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2188 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2189
2190}
2191
2192
2193
2194
/*
 * Compare user input @cmd with the keyword @str.
 * Returns 1 if they are identical, or identical except for a single
 * trailing newline in @cmd; returns 0 otherwise.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* Skip the common prefix. */
	for (; *cmd != '\0' && *str != '\0'; cmd++, str++) {
		if (*cmd != *str)
			break;
	}

	/* Allow exactly one newline after the keyword. */
	if (*cmd == '\n')
		cmd++;

	return *cmd == '\0' && *str == '\0';
}
2211
2212struct rdev_sysfs_entry {
2213 struct attribute attr;
2214 ssize_t (*show)(mdk_rdev_t *, char *);
2215 ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
2216};
2217
2218static ssize_t
2219state_show(mdk_rdev_t *rdev, char *page)
2220{
2221 char *sep = "";
2222 size_t len = 0;
2223
2224 if (test_bit(Faulty, &rdev->flags)) {
2225 len+= sprintf(page+len, "%sfaulty",sep);
2226 sep = ",";
2227 }
2228 if (test_bit(In_sync, &rdev->flags)) {
2229 len += sprintf(page+len, "%sin_sync",sep);
2230 sep = ",";
2231 }
2232 if (test_bit(WriteMostly, &rdev->flags)) {
2233 len += sprintf(page+len, "%swrite_mostly",sep);
2234 sep = ",";
2235 }
2236 if (test_bit(Blocked, &rdev->flags)) {
2237 len += sprintf(page+len, "%sblocked", sep);
2238 sep = ",";
2239 }
2240 if (!test_bit(Faulty, &rdev->flags) &&
2241 !test_bit(In_sync, &rdev->flags)) {
2242 len += sprintf(page+len, "%sspare", sep);
2243 sep = ",";
2244 }
2245 return len+sprintf(page+len, "\n");
2246}
2247
2248static ssize_t
2249state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2250{
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260 int err = -EINVAL;
2261 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2262 md_error(rdev->mddev, rdev);
2263 err = 0;
2264 } else if (cmd_match(buf, "remove")) {
2265 if (rdev->raid_disk >= 0)
2266 err = -EBUSY;
2267 else {
2268 mddev_t *mddev = rdev->mddev;
2269 kick_rdev_from_array(rdev);
2270 if (mddev->pers)
2271 md_update_sb(mddev, 1);
2272 md_new_event(mddev);
2273 err = 0;
2274 }
2275 } else if (cmd_match(buf, "writemostly")) {
2276 set_bit(WriteMostly, &rdev->flags);
2277 err = 0;
2278 } else if (cmd_match(buf, "-writemostly")) {
2279 clear_bit(WriteMostly, &rdev->flags);
2280 err = 0;
2281 } else if (cmd_match(buf, "blocked")) {
2282 set_bit(Blocked, &rdev->flags);
2283 err = 0;
2284 } else if (cmd_match(buf, "-blocked")) {
2285 clear_bit(Blocked, &rdev->flags);
2286 wake_up(&rdev->blocked_wait);
2287 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2288 md_wakeup_thread(rdev->mddev->thread);
2289
2290 err = 0;
2291 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2292 set_bit(In_sync, &rdev->flags);
2293 err = 0;
2294 }
2295 if (!err && rdev->sysfs_state)
2296 sysfs_notify_dirent(rdev->sysfs_state);
2297 return err ? err : len;
2298}
2299static struct rdev_sysfs_entry rdev_state =
2300__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
2301
2302static ssize_t
2303errors_show(mdk_rdev_t *rdev, char *page)
2304{
2305 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2306}
2307
2308static ssize_t
2309errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2310{
2311 char *e;
2312 unsigned long n = simple_strtoul(buf, &e, 10);
2313 if (*buf && (*e == 0 || *e == '\n')) {
2314 atomic_set(&rdev->corrected_errors, n);
2315 return len;
2316 }
2317 return -EINVAL;
2318}
2319static struct rdev_sysfs_entry rdev_errors =
2320__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2321
2322static ssize_t
2323slot_show(mdk_rdev_t *rdev, char *page)
2324{
2325 if (rdev->raid_disk < 0)
2326 return sprintf(page, "none\n");
2327 else
2328 return sprintf(page, "%d\n", rdev->raid_disk);
2329}
2330
2331static ssize_t
2332slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2333{
2334 char *e;
2335 int err;
2336 char nm[20];
2337 int slot = simple_strtoul(buf, &e, 10);
2338 if (strncmp(buf, "none", 4)==0)
2339 slot = -1;
2340 else if (e==buf || (*e && *e!= '\n'))
2341 return -EINVAL;
2342 if (rdev->mddev->pers && slot == -1) {
2343
2344
2345
2346
2347
2348
2349
2350 if (rdev->raid_disk == -1)
2351 return -EEXIST;
2352
2353 if (rdev->mddev->pers->hot_add_disk == NULL)
2354 return -EINVAL;
2355 err = rdev->mddev->pers->
2356 hot_remove_disk(rdev->mddev, rdev->raid_disk);
2357 if (err)
2358 return err;
2359 sprintf(nm, "rd%d", rdev->raid_disk);
2360 sysfs_remove_link(&rdev->mddev->kobj, nm);
2361 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2362 md_wakeup_thread(rdev->mddev->thread);
2363 } else if (rdev->mddev->pers) {
2364 mdk_rdev_t *rdev2;
2365
2366
2367
2368
2369 if (rdev->raid_disk != -1)
2370 return -EBUSY;
2371
2372 if (rdev->mddev->pers->hot_add_disk == NULL)
2373 return -EINVAL;
2374
2375 list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
2376 if (rdev2->raid_disk == slot)
2377 return -EEXIST;
2378
2379 rdev->raid_disk = slot;
2380 if (test_bit(In_sync, &rdev->flags))
2381 rdev->saved_raid_disk = slot;
2382 else
2383 rdev->saved_raid_disk = -1;
2384 err = rdev->mddev->pers->
2385 hot_add_disk(rdev->mddev, rdev);
2386 if (err) {
2387 rdev->raid_disk = -1;
2388 return err;
2389 } else
2390 sysfs_notify_dirent(rdev->sysfs_state);
2391 sprintf(nm, "rd%d", rdev->raid_disk);
2392 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
2393 printk(KERN_WARNING
2394 "md: cannot register "
2395 "%s for %s\n",
2396 nm, mdname(rdev->mddev));
2397
2398
2399 } else {
2400 if (slot >= rdev->mddev->raid_disks)
2401 return -ENOSPC;
2402 rdev->raid_disk = slot;
2403
2404 clear_bit(Faulty, &rdev->flags);
2405 clear_bit(WriteMostly, &rdev->flags);
2406 set_bit(In_sync, &rdev->flags);
2407 sysfs_notify_dirent(rdev->sysfs_state);
2408 }
2409 return len;
2410}
2411
2412
2413static struct rdev_sysfs_entry rdev_slot =
2414__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
2415
2416static ssize_t
2417offset_show(mdk_rdev_t *rdev, char *page)
2418{
2419 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2420}
2421
2422static ssize_t
2423offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2424{
2425 char *e;
2426 unsigned long long offset = simple_strtoull(buf, &e, 10);
2427 if (e==buf || (*e && *e != '\n'))
2428 return -EINVAL;
2429 if (rdev->mddev->pers && rdev->raid_disk >= 0)
2430 return -EBUSY;
2431 if (rdev->sectors && rdev->mddev->external)
2432
2433
2434 return -EBUSY;
2435 rdev->data_offset = offset;
2436 return len;
2437}
2438
2439static struct rdev_sysfs_entry rdev_offset =
2440__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2441
2442static ssize_t
2443rdev_size_show(mdk_rdev_t *rdev, char *page)
2444{
2445 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2446}
2447
2448static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2449{
2450
2451 if (s1+l1 <= s2)
2452 return 0;
2453 if (s2+l2 <= s1)
2454 return 0;
2455 return 1;
2456}
2457
2458static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2459{
2460 unsigned long long blocks;
2461 sector_t new;
2462
2463 if (strict_strtoull(buf, 10, &blocks) < 0)
2464 return -EINVAL;
2465
2466 if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2467 return -EINVAL;
2468
2469 new = blocks * 2;
2470 if (new != blocks * 2)
2471 return -EINVAL;
2472
2473 *sectors = new;
2474 return 0;
2475}
2476
2477static ssize_t
2478rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2479{
2480 mddev_t *my_mddev = rdev->mddev;
2481 sector_t oldsectors = rdev->sectors;
2482 sector_t sectors;
2483
2484 if (strict_blocks_to_sectors(buf, §ors) < 0)
2485 return -EINVAL;
2486 if (my_mddev->pers && rdev->raid_disk >= 0) {
2487 if (my_mddev->persistent) {
2488 sectors = super_types[my_mddev->major_version].
2489 rdev_size_change(rdev, sectors);
2490 if (!sectors)
2491 return -EBUSY;
2492 } else if (!sectors)
2493 sectors = (rdev->bdev->bd_inode->i_size >> 9) -
2494 rdev->data_offset;
2495 }
2496 if (sectors < my_mddev->dev_sectors)
2497 return -EINVAL;
2498
2499 rdev->sectors = sectors;
2500 if (sectors > oldsectors && my_mddev->external) {
2501
2502
2503
2504
2505
2506 mddev_t *mddev;
2507 int overlap = 0;
2508 struct list_head *tmp;
2509
2510 mddev_unlock(my_mddev);
2511 for_each_mddev(mddev, tmp) {
2512 mdk_rdev_t *rdev2;
2513
2514 mddev_lock(mddev);
2515 list_for_each_entry(rdev2, &mddev->disks, same_set)
2516 if (test_bit(AllReserved, &rdev2->flags) ||
2517 (rdev->bdev == rdev2->bdev &&
2518 rdev != rdev2 &&
2519 overlaps(rdev->data_offset, rdev->sectors,
2520 rdev2->data_offset,
2521 rdev2->sectors))) {
2522 overlap = 1;
2523 break;
2524 }
2525 mddev_unlock(mddev);
2526 if (overlap) {
2527 mddev_put(mddev);
2528 break;
2529 }
2530 }
2531 mddev_lock(my_mddev);
2532 if (overlap) {
2533
2534
2535
2536
2537
2538
2539 rdev->sectors = oldsectors;
2540 return -EBUSY;
2541 }
2542 }
2543 return len;
2544}
2545
2546static struct rdev_sysfs_entry rdev_size =
2547__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
2548
2549
2550static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page)
2551{
2552 unsigned long long recovery_start = rdev->recovery_offset;
2553
2554 if (test_bit(In_sync, &rdev->flags) ||
2555 recovery_start == MaxSector)
2556 return sprintf(page, "none\n");
2557
2558 return sprintf(page, "%llu\n", recovery_start);
2559}
2560
2561static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2562{
2563 unsigned long long recovery_start;
2564
2565 if (cmd_match(buf, "none"))
2566 recovery_start = MaxSector;
2567 else if (strict_strtoull(buf, 10, &recovery_start))
2568 return -EINVAL;
2569
2570 if (rdev->mddev->pers &&
2571 rdev->raid_disk >= 0)
2572 return -EBUSY;
2573
2574 rdev->recovery_offset = recovery_start;
2575 if (recovery_start == MaxSector)
2576 set_bit(In_sync, &rdev->flags);
2577 else
2578 clear_bit(In_sync, &rdev->flags);
2579 return len;
2580}
2581
2582static struct rdev_sysfs_entry rdev_recovery_start =
2583__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
2584
2585static struct attribute *rdev_default_attrs[] = {
2586 &rdev_state.attr,
2587 &rdev_errors.attr,
2588 &rdev_slot.attr,
2589 &rdev_offset.attr,
2590 &rdev_size.attr,
2591 &rdev_recovery_start.attr,
2592 NULL,
2593};
2594static ssize_t
2595rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2596{
2597 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2598 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2599 mddev_t *mddev = rdev->mddev;
2600 ssize_t rv;
2601
2602 if (!entry->show)
2603 return -EIO;
2604
2605 rv = mddev ? mddev_lock(mddev) : -EBUSY;
2606 if (!rv) {
2607 if (rdev->mddev == NULL)
2608 rv = -EBUSY;
2609 else
2610 rv = entry->show(rdev, page);
2611 mddev_unlock(mddev);
2612 }
2613 return rv;
2614}
2615
2616static ssize_t
2617rdev_attr_store(struct kobject *kobj, struct attribute *attr,
2618 const char *page, size_t length)
2619{
2620 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2621 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2622 ssize_t rv;
2623 mddev_t *mddev = rdev->mddev;
2624
2625 if (!entry->store)
2626 return -EIO;
2627 if (!capable(CAP_SYS_ADMIN))
2628 return -EACCES;
2629 rv = mddev ? mddev_lock(mddev): -EBUSY;
2630 if (!rv) {
2631 if (rdev->mddev == NULL)
2632 rv = -EBUSY;
2633 else
2634 rv = entry->store(rdev, page, length);
2635 mddev_unlock(mddev);
2636 }
2637 return rv;
2638}
2639
2640static void rdev_free(struct kobject *ko)
2641{
2642 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
2643 kfree(rdev);
2644}
2645static struct sysfs_ops rdev_sysfs_ops = {
2646 .show = rdev_attr_show,
2647 .store = rdev_attr_store,
2648};
2649static struct kobj_type rdev_ktype = {
2650 .release = rdev_free,
2651 .sysfs_ops = &rdev_sysfs_ops,
2652 .default_attrs = rdev_default_attrs,
2653};
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
2666{
2667 char b[BDEVNAME_SIZE];
2668 int err;
2669 mdk_rdev_t *rdev;
2670 sector_t size;
2671
2672 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
2673 if (!rdev) {
2674 printk(KERN_ERR "md: could not alloc mem for new device!\n");
2675 return ERR_PTR(-ENOMEM);
2676 }
2677
2678 if ((err = alloc_disk_sb(rdev)))
2679 goto abort_free;
2680
2681 err = lock_rdev(rdev, newdev, super_format == -2);
2682 if (err)
2683 goto abort_free;
2684
2685 kobject_init(&rdev->kobj, &rdev_ktype);
2686
2687 rdev->desc_nr = -1;
2688 rdev->saved_raid_disk = -1;
2689 rdev->raid_disk = -1;
2690 rdev->flags = 0;
2691 rdev->data_offset = 0;
2692 rdev->sb_events = 0;
2693 rdev->last_read_error.tv_sec = 0;
2694 rdev->last_read_error.tv_nsec = 0;
2695 atomic_set(&rdev->nr_pending, 0);
2696 atomic_set(&rdev->read_errors, 0);
2697 atomic_set(&rdev->corrected_errors, 0);
2698
2699 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2700 if (!size) {
2701 printk(KERN_WARNING
2702 "md: %s has zero or unknown size, marking faulty!\n",
2703 bdevname(rdev->bdev,b));
2704 err = -EINVAL;
2705 goto abort_free;
2706 }
2707
2708 if (super_format >= 0) {
2709 err = super_types[super_format].
2710 load_super(rdev, NULL, super_minor);
2711 if (err == -EINVAL) {
2712 printk(KERN_WARNING
2713 "md: %s does not have a valid v%d.%d "
2714 "superblock, not importing!\n",
2715 bdevname(rdev->bdev,b),
2716 super_format, super_minor);
2717 goto abort_free;
2718 }
2719 if (err < 0) {
2720 printk(KERN_WARNING
2721 "md: could not read %s's sb, not importing!\n",
2722 bdevname(rdev->bdev,b));
2723 goto abort_free;
2724 }
2725 }
2726
2727 INIT_LIST_HEAD(&rdev->same_set);
2728 init_waitqueue_head(&rdev->blocked_wait);
2729
2730 return rdev;
2731
2732abort_free:
2733 if (rdev->sb_page) {
2734 if (rdev->bdev)
2735 unlock_rdev(rdev);
2736 free_disk_sb(rdev);
2737 }
2738 kfree(rdev);
2739 return ERR_PTR(err);
2740}
2741
2742
2743
2744
2745
2746
2747static void analyze_sbs(mddev_t * mddev)
2748{
2749 int i;
2750 mdk_rdev_t *rdev, *freshest, *tmp;
2751 char b[BDEVNAME_SIZE];
2752
2753 freshest = NULL;
2754 rdev_for_each(rdev, tmp, mddev)
2755 switch (super_types[mddev->major_version].
2756 load_super(rdev, freshest, mddev->minor_version)) {
2757 case 1:
2758 freshest = rdev;
2759 break;
2760 case 0:
2761 break;
2762 default:
2763 printk( KERN_ERR \
2764 "md: fatal superblock inconsistency in %s"
2765 " -- removing from array\n",
2766 bdevname(rdev->bdev,b));
2767 kick_rdev_from_array(rdev);
2768 }
2769
2770
2771 super_types[mddev->major_version].
2772 validate_super(mddev, freshest);
2773
2774 i = 0;
2775 rdev_for_each(rdev, tmp, mddev) {
2776 if (rdev->desc_nr >= mddev->max_disks ||
2777 i > mddev->max_disks) {
2778 printk(KERN_WARNING
2779 "md: %s: %s: only %d devices permitted\n",
2780 mdname(mddev), bdevname(rdev->bdev, b),
2781 mddev->max_disks);
2782 kick_rdev_from_array(rdev);
2783 continue;
2784 }
2785 if (rdev != freshest)
2786 if (super_types[mddev->major_version].
2787 validate_super(mddev, rdev)) {
2788 printk(KERN_WARNING "md: kicking non-fresh %s"
2789 " from array!\n",
2790 bdevname(rdev->bdev,b));
2791 kick_rdev_from_array(rdev);
2792 continue;
2793 }
2794 if (mddev->level == LEVEL_MULTIPATH) {
2795 rdev->desc_nr = i++;
2796 rdev->raid_disk = rdev->desc_nr;
2797 set_bit(In_sync, &rdev->flags);
2798 } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
2799 rdev->raid_disk = -1;
2800 clear_bit(In_sync, &rdev->flags);
2801 }
2802 }
2803}
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
/*
 * Parse a non-negative decimal number with an optional fractional part
 * and return it multiplied by 10^@scale in *@res.
 *
 * At most @scale digits after the decimal point are used; further
 * fractional digits are skipped.  One trailing '\n' is accepted.  Any
 * other trailing character (including a second '.') yields -EINVAL and
 * leaves *@res untouched.  Returns 0 on success.
 */
int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
{
	unsigned long acc = 0;
	long frac_digits = -1;	/* -1 until a '.' has been consumed */

	for (;; cp++) {
		if (*cp == '.') {
			if (frac_digits >= 0)
				break;		/* second '.' ends the number */
			frac_digits = 0;
			continue;
		}
		if (!isdigit(*cp))
			break;
		if (frac_digits >= scale)
			continue;		/* excess precision is dropped */

		acc = acc * 10 + (unsigned int)(*cp - '0');
		if (frac_digits >= 0)
			frac_digits++;
	}

	if (*cp == '\n')
		cp++;
	if (*cp)
		return -EINVAL;

	if (frac_digits < 0)
		frac_digits = 0;
	/* pad with zeros up to the requested number of decimals */
	for (; frac_digits < scale; frac_digits++)
		acc *= 10;

	*res = acc;
	return 0;
}
2844
2845
2846static void md_safemode_timeout(unsigned long data);
2847
2848static ssize_t
2849safe_delay_show(mddev_t *mddev, char *page)
2850{
2851 int msec = (mddev->safemode_delay*1000)/HZ;
2852 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2853}
2854static ssize_t
2855safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2856{
2857 unsigned long msec;
2858
2859 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
2860 return -EINVAL;
2861 if (msec == 0)
2862 mddev->safemode_delay = 0;
2863 else {
2864 unsigned long old_delay = mddev->safemode_delay;
2865 mddev->safemode_delay = (msec*HZ)/1000;
2866 if (mddev->safemode_delay == 0)
2867 mddev->safemode_delay = 1;
2868 if (mddev->safemode_delay < old_delay)
2869 md_safemode_timeout((unsigned long)mddev);
2870 }
2871 return len;
2872}
2873static struct md_sysfs_entry md_safe_delay =
2874__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
2875
2876static ssize_t
2877level_show(mddev_t *mddev, char *page)
2878{
2879 struct mdk_personality *p = mddev->pers;
2880 if (p)
2881 return sprintf(page, "%s\n", p->name);
2882 else if (mddev->clevel[0])
2883 return sprintf(page, "%s\n", mddev->clevel);
2884 else if (mddev->level != LEVEL_NONE)
2885 return sprintf(page, "%d\n", mddev->level);
2886 else
2887 return 0;
2888}
2889
2890static ssize_t
2891level_store(mddev_t *mddev, const char *buf, size_t len)
2892{
2893 char level[16];
2894 ssize_t rv = len;
2895 struct mdk_personality *pers;
2896 void *priv;
2897 mdk_rdev_t *rdev;
2898
2899 if (mddev->pers == NULL) {
2900 if (len == 0)
2901 return 0;
2902 if (len >= sizeof(mddev->clevel))
2903 return -ENOSPC;
2904 strncpy(mddev->clevel, buf, len);
2905 if (mddev->clevel[len-1] == '\n')
2906 len--;
2907 mddev->clevel[len] = 0;
2908 mddev->level = LEVEL_NONE;
2909 return rv;
2910 }
2911
2912
2913
2914
2915
2916
2917
2918 if (mddev->sync_thread || mddev->reshape_position != MaxSector)
2919 return -EBUSY;
2920
2921 if (!mddev->pers->quiesce) {
2922 printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
2923 mdname(mddev), mddev->pers->name);
2924 return -EINVAL;
2925 }
2926
2927
2928 if (len == 0 || len >= sizeof(level))
2929 return -EINVAL;
2930 strncpy(level, buf, len);
2931 if (level[len-1] == '\n')
2932 len--;
2933 level[len] = 0;
2934
2935 request_module("md-%s", level);
2936 spin_lock(&pers_lock);
2937 pers = find_pers(LEVEL_NONE, level);
2938 if (!pers || !try_module_get(pers->owner)) {
2939 spin_unlock(&pers_lock);
2940 printk(KERN_WARNING "md: personality %s not loaded\n", level);
2941 return -EINVAL;
2942 }
2943 spin_unlock(&pers_lock);
2944
2945 if (pers == mddev->pers) {
2946
2947 module_put(pers->owner);
2948 return rv;
2949 }
2950 if (!pers->takeover) {
2951 module_put(pers->owner);
2952 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
2953 mdname(mddev), level);
2954 return -EINVAL;
2955 }
2956
2957
2958
2959
2960 priv = pers->takeover(mddev);
2961 if (IS_ERR(priv)) {
2962 mddev->new_level = mddev->level;
2963 mddev->new_layout = mddev->layout;
2964 mddev->new_chunk_sectors = mddev->chunk_sectors;
2965 mddev->raid_disks -= mddev->delta_disks;
2966 mddev->delta_disks = 0;
2967 module_put(pers->owner);
2968 printk(KERN_WARNING "md: %s: %s would not accept array\n",
2969 mdname(mddev), level);
2970 return PTR_ERR(priv);
2971 }
2972
2973
2974 mddev_suspend(mddev);
2975 mddev->pers->stop(mddev);
2976 module_put(mddev->pers->owner);
2977
2978 list_for_each_entry(rdev, &mddev->disks, same_set)
2979 if (rdev->raid_disk >= mddev->raid_disks) {
2980 rdev->raid_disk = -1;
2981 clear_bit(In_sync, &rdev->flags);
2982 }
2983 mddev->pers = pers;
2984 mddev->private = priv;
2985 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
2986 mddev->level = mddev->new_level;
2987 mddev->layout = mddev->new_layout;
2988 mddev->chunk_sectors = mddev->new_chunk_sectors;
2989 mddev->delta_disks = 0;
2990 pers->run(mddev);
2991 mddev_resume(mddev);
2992 set_bit(MD_CHANGE_DEVS, &mddev->flags);
2993 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2994 md_wakeup_thread(mddev->thread);
2995 return rv;
2996}
2997
2998static struct md_sysfs_entry md_level =
2999__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
3000
3001
3002static ssize_t
3003layout_show(mddev_t *mddev, char *page)
3004{
3005
3006 if (mddev->reshape_position != MaxSector &&
3007 mddev->layout != mddev->new_layout)
3008 return sprintf(page, "%d (%d)\n",
3009 mddev->new_layout, mddev->layout);
3010 return sprintf(page, "%d\n", mddev->layout);
3011}
3012
3013static ssize_t
3014layout_store(mddev_t *mddev, const char *buf, size_t len)
3015{
3016 char *e;
3017 unsigned long n = simple_strtoul(buf, &e, 10);
3018
3019 if (!*buf || (*e && *e != '\n'))
3020 return -EINVAL;
3021
3022 if (mddev->pers) {
3023 int err;
3024 if (mddev->pers->check_reshape == NULL)
3025 return -EBUSY;
3026 mddev->new_layout = n;
3027 err = mddev->pers->check_reshape(mddev);
3028 if (err) {
3029 mddev->new_layout = mddev->layout;
3030 return err;
3031 }
3032 } else {
3033 mddev->new_layout = n;
3034 if (mddev->reshape_position == MaxSector)
3035 mddev->layout = n;
3036 }
3037 return len;
3038}
3039static struct md_sysfs_entry md_layout =
3040__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
3041
3042
3043static ssize_t
3044raid_disks_show(mddev_t *mddev, char *page)
3045{
3046 if (mddev->raid_disks == 0)
3047 return 0;
3048 if (mddev->reshape_position != MaxSector &&
3049 mddev->delta_disks != 0)
3050 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
3051 mddev->raid_disks - mddev->delta_disks);
3052 return sprintf(page, "%d\n", mddev->raid_disks);
3053}
3054
3055static int update_raid_disks(mddev_t *mddev, int raid_disks);
3056
3057static ssize_t
3058raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
3059{
3060 char *e;
3061 int rv = 0;
3062 unsigned long n = simple_strtoul(buf, &e, 10);
3063
3064 if (!*buf || (*e && *e != '\n'))
3065 return -EINVAL;
3066
3067 if (mddev->pers)
3068 rv = update_raid_disks(mddev, n);
3069 else if (mddev->reshape_position != MaxSector) {
3070 int olddisks = mddev->raid_disks - mddev->delta_disks;
3071 mddev->delta_disks = n - olddisks;
3072 mddev->raid_disks = n;
3073 } else
3074 mddev->raid_disks = n;
3075 return rv ? rv : len;
3076}
3077static struct md_sysfs_entry md_raid_disks =
3078__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
3079
3080static ssize_t
3081chunk_size_show(mddev_t *mddev, char *page)
3082{
3083 if (mddev->reshape_position != MaxSector &&
3084 mddev->chunk_sectors != mddev->new_chunk_sectors)
3085 return sprintf(page, "%d (%d)\n",
3086 mddev->new_chunk_sectors << 9,
3087 mddev->chunk_sectors << 9);
3088 return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
3089}
3090
3091static ssize_t
3092chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
3093{
3094 char *e;
3095 unsigned long n = simple_strtoul(buf, &e, 10);
3096
3097 if (!*buf || (*e && *e != '\n'))
3098 return -EINVAL;
3099
3100 if (mddev->pers) {
3101 int err;
3102 if (mddev->pers->check_reshape == NULL)
3103 return -EBUSY;
3104 mddev->new_chunk_sectors = n >> 9;
3105 err = mddev->pers->check_reshape(mddev);
3106 if (err) {
3107 mddev->new_chunk_sectors = mddev->chunk_sectors;
3108 return err;
3109 }
3110 } else {
3111 mddev->new_chunk_sectors = n >> 9;
3112 if (mddev->reshape_position == MaxSector)
3113 mddev->chunk_sectors = n >> 9;
3114 }
3115 return len;
3116}
3117static struct md_sysfs_entry md_chunk_size =
3118__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
3119
3120static ssize_t
3121resync_start_show(mddev_t *mddev, char *page)
3122{
3123 if (mddev->recovery_cp == MaxSector)
3124 return sprintf(page, "none\n");
3125 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
3126}
3127
3128static ssize_t
3129resync_start_store(mddev_t *mddev, const char *buf, size_t len)
3130{
3131 char *e;
3132 unsigned long long n = simple_strtoull(buf, &e, 10);
3133
3134 if (mddev->pers)
3135 return -EBUSY;
3136 if (cmd_match(buf, "none"))
3137 n = MaxSector;
3138 else if (!*buf || (*e && *e != '\n'))
3139 return -EINVAL;
3140
3141 mddev->recovery_cp = n;
3142 return len;
3143}
3144static struct md_sysfs_entry md_resync_start =
3145__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
3184 write_pending, active_idle, bad_word};
3185static char *array_states[] = {
3186 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
3187 "write-pending", "active-idle", NULL };
3188
/*
 * Return the index of the first entry of the NULL-terminated @list that
 * cmd_match() accepts for @word, or the index of the terminator when no
 * entry matches.
 */
static int match_word(const char *word, char **list)
{
	int idx = 0;

	while (list[idx] && !cmd_match(word, list[idx]))
		idx++;

	return idx;
}
3197
3198static ssize_t
3199array_state_show(mddev_t *mddev, char *page)
3200{
3201 enum array_state st = inactive;
3202
3203 if (mddev->pers)
3204 switch(mddev->ro) {
3205 case 1:
3206 st = readonly;
3207 break;
3208 case 2:
3209 st = read_auto;
3210 break;
3211 case 0:
3212 if (mddev->in_sync)
3213 st = clean;
3214 else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
3215 st = write_pending;
3216 else if (mddev->safemode)
3217 st = active_idle;
3218 else
3219 st = active;
3220 }
3221 else {
3222 if (list_empty(&mddev->disks) &&
3223 mddev->raid_disks == 0 &&
3224 mddev->dev_sectors == 0)
3225 st = clear;
3226 else
3227 st = inactive;
3228 }
3229 return sprintf(page, "%s\n", array_states[st]);
3230}
3231
3232static int do_md_stop(mddev_t * mddev, int ro, int is_open);
3233static int do_md_run(mddev_t * mddev);
3234static int restart_array(mddev_t *mddev);
3235
3236static ssize_t
3237array_state_store(mddev_t *mddev, const char *buf, size_t len)
3238{
3239 int err = -EINVAL;
3240 enum array_state st = match_word(buf, array_states);
3241 switch(st) {
3242 case bad_word:
3243 break;
3244 case clear:
3245
3246 if (atomic_read(&mddev->openers) > 0)
3247 return -EBUSY;
3248 err = do_md_stop(mddev, 0, 0);
3249 break;
3250 case inactive:
3251
3252 if (mddev->pers) {
3253 if (atomic_read(&mddev->openers) > 0)
3254 return -EBUSY;
3255 err = do_md_stop(mddev, 2, 0);
3256 } else
3257 err = 0;
3258 break;
3259 case suspended:
3260 break;
3261 case readonly:
3262 if (mddev->pers)
3263 err = do_md_stop(mddev, 1, 0);
3264 else {
3265 mddev->ro = 1;
3266 set_disk_ro(mddev->gendisk, 1);
3267 err = do_md_run(mddev);
3268 }
3269 break;
3270 case read_auto:
3271 if (mddev->pers) {
3272 if (mddev->ro == 0)
3273 err = do_md_stop(mddev, 1, 0);
3274 else if (mddev->ro == 1)
3275 err = restart_array(mddev);
3276 if (err == 0) {
3277 mddev->ro = 2;
3278 set_disk_ro(mddev->gendisk, 0);
3279 }
3280 } else {
3281 mddev->ro = 2;
3282 err = do_md_run(mddev);
3283 }
3284 break;
3285 case clean:
3286 if (mddev->pers) {
3287 restart_array(mddev);
3288 spin_lock_irq(&mddev->write_lock);
3289 if (atomic_read(&mddev->writes_pending) == 0) {
3290 if (mddev->in_sync == 0) {
3291 mddev->in_sync = 1;
3292 if (mddev->safemode == 1)
3293 mddev->safemode = 0;
3294 if (mddev->persistent)
3295 set_bit(MD_CHANGE_CLEAN,
3296 &mddev->flags);
3297 }
3298 err = 0;
3299 } else
3300 err = -EBUSY;
3301 spin_unlock_irq(&mddev->write_lock);
3302 } else
3303 err = -EINVAL;
3304 break;
3305 case active:
3306 if (mddev->pers) {
3307 restart_array(mddev);
3308 if (mddev->external)
3309 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
3310 wake_up(&mddev->sb_wait);
3311 err = 0;
3312 } else {
3313 mddev->ro = 0;
3314 set_disk_ro(mddev->gendisk, 0);
3315 err = do_md_run(mddev);
3316 }
3317 break;
3318 case write_pending:
3319 case active_idle:
3320
3321 break;
3322 }
3323 if (err)
3324 return err;
3325 else {
3326 sysfs_notify_dirent(mddev->sysfs_state);
3327 return len;
3328 }
3329}
3330static struct md_sysfs_entry md_array_state =
3331__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3332
3333static ssize_t
3334max_corrected_read_errors_show(mddev_t *mddev, char *page) {
3335 return sprintf(page, "%d\n",
3336 atomic_read(&mddev->max_corr_read_errors));
3337}
3338
3339static ssize_t
3340max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len)
3341{
3342 char *e;
3343 unsigned long n = simple_strtoul(buf, &e, 10);
3344
3345 if (*buf && (*e == 0 || *e == '\n')) {
3346 atomic_set(&mddev->max_corr_read_errors, n);
3347 return len;
3348 }
3349 return -EINVAL;
3350}
3351
3352static struct md_sysfs_entry max_corr_read_errors =
3353__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
3354 max_corrected_read_errors_store);
3355
3356static ssize_t
3357null_show(mddev_t *mddev, char *page)
3358{
3359 return -EINVAL;
3360}
3361
3362static ssize_t
3363new_dev_store(mddev_t *mddev, const char *buf, size_t len)
3364{
3365
3366
3367
3368
3369
3370
3371
3372 char *e;
3373 int major = simple_strtoul(buf, &e, 10);
3374 int minor;
3375 dev_t dev;
3376 mdk_rdev_t *rdev;
3377 int err;
3378
3379 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
3380 return -EINVAL;
3381 minor = simple_strtoul(e+1, &e, 10);
3382 if (*e && *e != '\n')
3383 return -EINVAL;
3384 dev = MKDEV(major, minor);
3385 if (major != MAJOR(dev) ||
3386 minor != MINOR(dev))
3387 return -EOVERFLOW;
3388
3389
3390 if (mddev->persistent) {
3391 rdev = md_import_device(dev, mddev->major_version,
3392 mddev->minor_version);
3393 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
3394 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
3395 mdk_rdev_t, same_set);
3396 err = super_types[mddev->major_version]
3397 .load_super(rdev, rdev0, mddev->minor_version);
3398 if (err < 0)
3399 goto out;
3400 }
3401 } else if (mddev->external)
3402 rdev = md_import_device(dev, -2, -1);
3403 else
3404 rdev = md_import_device(dev, -1, -1);
3405
3406 if (IS_ERR(rdev))
3407 return PTR_ERR(rdev);
3408 err = bind_rdev_to_array(rdev, mddev);
3409 out:
3410 if (err)
3411 export_rdev(rdev);
3412 return err ? err : len;
3413}
3414
3415static struct md_sysfs_entry md_new_device =
3416__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
3417
3418static ssize_t
3419bitmap_store(mddev_t *mddev, const char *buf, size_t len)
3420{
3421 char *end;
3422 unsigned long chunk, end_chunk;
3423
3424 if (!mddev->bitmap)
3425 goto out;
3426
3427 while (*buf) {
3428 chunk = end_chunk = simple_strtoul(buf, &end, 0);
3429 if (buf == end) break;
3430 if (*end == '-') {
3431 buf = end + 1;
3432 end_chunk = simple_strtoul(buf, &end, 0);
3433 if (buf == end) break;
3434 }
3435 if (*end && !isspace(*end)) break;
3436 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
3437 buf = skip_spaces(end);
3438 }
3439 bitmap_unplug(mddev->bitmap);
3440out:
3441 return len;
3442}
3443
3444static struct md_sysfs_entry md_bitmap =
3445__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
3446
3447static ssize_t
3448size_show(mddev_t *mddev, char *page)
3449{
3450 return sprintf(page, "%llu\n",
3451 (unsigned long long)mddev->dev_sectors / 2);
3452}
3453
3454static int update_size(mddev_t *mddev, sector_t num_sectors);
3455
3456static ssize_t
3457size_store(mddev_t *mddev, const char *buf, size_t len)
3458{
3459
3460
3461
3462
3463 sector_t sectors;
3464 int err = strict_blocks_to_sectors(buf, §ors);
3465
3466 if (err < 0)
3467 return err;
3468 if (mddev->pers) {
3469 err = update_size(mddev, sectors);
3470 md_update_sb(mddev, 1);
3471 } else {
3472 if (mddev->dev_sectors == 0 ||
3473 mddev->dev_sectors > sectors)
3474 mddev->dev_sectors = sectors;
3475 else
3476 err = -ENOSPC;
3477 }
3478 return err ? err : len;
3479}
3480
3481static struct md_sysfs_entry md_size =
3482__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
3483
3484
3485
3486
3487
3488
3489
3490
3491static ssize_t
3492metadata_show(mddev_t *mddev, char *page)
3493{
3494 if (mddev->persistent)
3495 return sprintf(page, "%d.%d\n",
3496 mddev->major_version, mddev->minor_version);
3497 else if (mddev->external)
3498 return sprintf(page, "external:%s\n", mddev->metadata_type);
3499 else
3500 return sprintf(page, "none\n");
3501}
3502
3503static ssize_t
3504metadata_store(mddev_t *mddev, const char *buf, size_t len)
3505{
3506 int major, minor;
3507 char *e;
3508
3509
3510
3511
3512 if (mddev->external && strncmp(buf, "external:", 9) == 0)
3513 ;
3514 else if (!list_empty(&mddev->disks))
3515 return -EBUSY;
3516
3517 if (cmd_match(buf, "none")) {
3518 mddev->persistent = 0;
3519 mddev->external = 0;
3520 mddev->major_version = 0;
3521 mddev->minor_version = 90;
3522 return len;
3523 }
3524 if (strncmp(buf, "external:", 9) == 0) {
3525 size_t namelen = len-9;
3526 if (namelen >= sizeof(mddev->metadata_type))
3527 namelen = sizeof(mddev->metadata_type)-1;
3528 strncpy(mddev->metadata_type, buf+9, namelen);
3529 mddev->metadata_type[namelen] = 0;
3530 if (namelen && mddev->metadata_type[namelen-1] == '\n')
3531 mddev->metadata_type[--namelen] = 0;
3532 mddev->persistent = 0;
3533 mddev->external = 1;
3534 mddev->major_version = 0;
3535 mddev->minor_version = 90;
3536 return len;
3537 }
3538 major = simple_strtoul(buf, &e, 10);
3539 if (e==buf || *e != '.')
3540 return -EINVAL;
3541 buf = e+1;
3542 minor = simple_strtoul(buf, &e, 10);
3543 if (e==buf || (*e && *e != '\n') )
3544 return -EINVAL;
3545 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
3546 return -ENOENT;
3547 mddev->major_version = major;
3548 mddev->minor_version = minor;
3549 mddev->persistent = 1;
3550 mddev->external = 0;
3551 return len;
3552}
3553
3554static struct md_sysfs_entry md_metadata =
3555__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
3556
3557static ssize_t
3558action_show(mddev_t *mddev, char *page)
3559{
3560 char *type = "idle";
3561 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3562 type = "frozen";
3563 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3564 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
3565 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3566 type = "reshape";
3567 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3568 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
3569 type = "resync";
3570 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
3571 type = "check";
3572 else
3573 type = "repair";
3574 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
3575 type = "recover";
3576 }
3577 return sprintf(page, "%s\n", type);
3578}
3579
3580static ssize_t
3581action_store(mddev_t *mddev, const char *page, size_t len)
3582{
3583 if (!mddev->pers || !mddev->pers->sync_request)
3584 return -EINVAL;
3585
3586 if (cmd_match(page, "frozen"))
3587 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3588 else
3589 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3590
3591 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
3592 if (mddev->sync_thread) {
3593 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3594 md_unregister_thread(mddev->sync_thread);
3595 mddev->sync_thread = NULL;
3596 mddev->recovery = 0;
3597 }
3598 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3599 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
3600 return -EBUSY;
3601 else if (cmd_match(page, "resync"))
3602 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3603 else if (cmd_match(page, "recover")) {
3604 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
3605 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3606 } else if (cmd_match(page, "reshape")) {
3607 int err;
3608 if (mddev->pers->start_reshape == NULL)
3609 return -EINVAL;
3610 err = mddev->pers->start_reshape(mddev);
3611 if (err)
3612 return err;
3613 sysfs_notify(&mddev->kobj, NULL, "degraded");
3614 } else {
3615 if (cmd_match(page, "check"))
3616 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3617 else if (!cmd_match(page, "repair"))
3618 return -EINVAL;
3619 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
3620 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3621 }
3622 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3623 md_wakeup_thread(mddev->thread);
3624 sysfs_notify_dirent(mddev->sysfs_action);
3625 return len;
3626}
3627
3628static ssize_t
3629mismatch_cnt_show(mddev_t *mddev, char *page)
3630{
3631 return sprintf(page, "%llu\n",
3632 (unsigned long long) mddev->resync_mismatches);
3633}
3634
3635static struct md_sysfs_entry md_scan_mode =
3636__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
3637
3638
3639static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
3640
3641static ssize_t
3642sync_min_show(mddev_t *mddev, char *page)
3643{
3644 return sprintf(page, "%d (%s)\n", speed_min(mddev),
3645 mddev->sync_speed_min ? "local": "system");
3646}
3647
3648static ssize_t
3649sync_min_store(mddev_t *mddev, const char *buf, size_t len)
3650{
3651 int min;
3652 char *e;
3653 if (strncmp(buf, "system", 6)==0) {
3654 mddev->sync_speed_min = 0;
3655 return len;
3656 }
3657 min = simple_strtoul(buf, &e, 10);
3658 if (buf == e || (*e && *e != '\n') || min <= 0)
3659 return -EINVAL;
3660 mddev->sync_speed_min = min;
3661 return len;
3662}
3663
3664static struct md_sysfs_entry md_sync_min =
3665__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
3666
3667static ssize_t
3668sync_max_show(mddev_t *mddev, char *page)
3669{
3670 return sprintf(page, "%d (%s)\n", speed_max(mddev),
3671 mddev->sync_speed_max ? "local": "system");
3672}
3673
3674static ssize_t
3675sync_max_store(mddev_t *mddev, const char *buf, size_t len)
3676{
3677 int max;
3678 char *e;
3679 if (strncmp(buf, "system", 6)==0) {
3680 mddev->sync_speed_max = 0;
3681 return len;
3682 }
3683 max = simple_strtoul(buf, &e, 10);
3684 if (buf == e || (*e && *e != '\n') || max <= 0)
3685 return -EINVAL;
3686 mddev->sync_speed_max = max;
3687 return len;
3688}
3689
3690static struct md_sysfs_entry md_sync_max =
3691__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
3692
3693static ssize_t
3694degraded_show(mddev_t *mddev, char *page)
3695{
3696 return sprintf(page, "%d\n", mddev->degraded);
3697}
3698static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
3699
3700static ssize_t
3701sync_force_parallel_show(mddev_t *mddev, char *page)
3702{
3703 return sprintf(page, "%d\n", mddev->parallel_resync);
3704}
3705
3706static ssize_t
3707sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len)
3708{
3709 long n;
3710
3711 if (strict_strtol(buf, 10, &n))
3712 return -EINVAL;
3713
3714 if (n != 0 && n != 1)
3715 return -EINVAL;
3716
3717 mddev->parallel_resync = n;
3718
3719 if (mddev->sync_thread)
3720 wake_up(&resync_wait);
3721
3722 return len;
3723}
3724
3725
3726static struct md_sysfs_entry md_sync_force_parallel =
3727__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
3728 sync_force_parallel_show, sync_force_parallel_store);
3729
3730static ssize_t
3731sync_speed_show(mddev_t *mddev, char *page)
3732{
3733 unsigned long resync, dt, db;
3734 if (mddev->curr_resync == 0)
3735 return sprintf(page, "none\n");
3736 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
3737 dt = (jiffies - mddev->resync_mark) / HZ;
3738 if (!dt) dt++;
3739 db = resync - mddev->resync_mark_cnt;
3740 return sprintf(page, "%lu\n", db/dt/2);
3741}
3742
3743static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
3744
3745static ssize_t
3746sync_completed_show(mddev_t *mddev, char *page)
3747{
3748 unsigned long max_sectors, resync;
3749
3750 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3751 return sprintf(page, "none\n");
3752
3753 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3754 max_sectors = mddev->resync_max_sectors;
3755 else
3756 max_sectors = mddev->dev_sectors;
3757
3758 resync = mddev->curr_resync_completed;
3759 return sprintf(page, "%lu / %lu\n", resync, max_sectors);
3760}
3761
3762static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
3763
3764static ssize_t
3765min_sync_show(mddev_t *mddev, char *page)
3766{
3767 return sprintf(page, "%llu\n",
3768 (unsigned long long)mddev->resync_min);
3769}
3770static ssize_t
3771min_sync_store(mddev_t *mddev, const char *buf, size_t len)
3772{
3773 unsigned long long min;
3774 if (strict_strtoull(buf, 10, &min))
3775 return -EINVAL;
3776 if (min > mddev->resync_max)
3777 return -EINVAL;
3778 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3779 return -EBUSY;
3780
3781
3782 if (mddev->chunk_sectors) {
3783 sector_t temp = min;
3784 if (sector_div(temp, mddev->chunk_sectors))
3785 return -EINVAL;
3786 }
3787 mddev->resync_min = min;
3788
3789 return len;
3790}
3791
3792static struct md_sysfs_entry md_min_sync =
3793__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
3794
3795static ssize_t
3796max_sync_show(mddev_t *mddev, char *page)
3797{
3798 if (mddev->resync_max == MaxSector)
3799 return sprintf(page, "max\n");
3800 else
3801 return sprintf(page, "%llu\n",
3802 (unsigned long long)mddev->resync_max);
3803}
3804static ssize_t
3805max_sync_store(mddev_t *mddev, const char *buf, size_t len)
3806{
3807 if (strncmp(buf, "max", 3) == 0)
3808 mddev->resync_max = MaxSector;
3809 else {
3810 unsigned long long max;
3811 if (strict_strtoull(buf, 10, &max))
3812 return -EINVAL;
3813 if (max < mddev->resync_min)
3814 return -EINVAL;
3815 if (max < mddev->resync_max &&
3816 mddev->ro == 0 &&
3817 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3818 return -EBUSY;
3819
3820
3821 if (mddev->chunk_sectors) {
3822 sector_t temp = max;
3823 if (sector_div(temp, mddev->chunk_sectors))
3824 return -EINVAL;
3825 }
3826 mddev->resync_max = max;
3827 }
3828 wake_up(&mddev->recovery_wait);
3829 return len;
3830}
3831
3832static struct md_sysfs_entry md_max_sync =
3833__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
3834
3835static ssize_t
3836suspend_lo_show(mddev_t *mddev, char *page)
3837{
3838 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
3839}
3840
3841static ssize_t
3842suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
3843{
3844 char *e;
3845 unsigned long long new = simple_strtoull(buf, &e, 10);
3846
3847 if (mddev->pers == NULL ||
3848 mddev->pers->quiesce == NULL)
3849 return -EINVAL;
3850 if (buf == e || (*e && *e != '\n'))
3851 return -EINVAL;
3852 if (new >= mddev->suspend_hi ||
3853 (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
3854 mddev->suspend_lo = new;
3855 mddev->pers->quiesce(mddev, 2);
3856 return len;
3857 } else
3858 return -EINVAL;
3859}
3860static struct md_sysfs_entry md_suspend_lo =
3861__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
3862
3863
3864static ssize_t
3865suspend_hi_show(mddev_t *mddev, char *page)
3866{
3867 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
3868}
3869
3870static ssize_t
3871suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
3872{
3873 char *e;
3874 unsigned long long new = simple_strtoull(buf, &e, 10);
3875
3876 if (mddev->pers == NULL ||
3877 mddev->pers->quiesce == NULL)
3878 return -EINVAL;
3879 if (buf == e || (*e && *e != '\n'))
3880 return -EINVAL;
3881 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
3882 (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
3883 mddev->suspend_hi = new;
3884 mddev->pers->quiesce(mddev, 1);
3885 mddev->pers->quiesce(mddev, 0);
3886 return len;
3887 } else
3888 return -EINVAL;
3889}
/* sysfs attribute "suspend_hi": world-readable, owner-writable, served by
 * suspend_hi_show() and suspend_hi_store().
 */
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
3892
3893static ssize_t
3894reshape_position_show(mddev_t *mddev, char *page)
3895{
3896 if (mddev->reshape_position != MaxSector)
3897 return sprintf(page, "%llu\n",
3898 (unsigned long long)mddev->reshape_position);
3899 strcpy(page, "none\n");
3900 return 5;
3901}
3902
3903static ssize_t
3904reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
3905{
3906 char *e;
3907 unsigned long long new = simple_strtoull(buf, &e, 10);
3908 if (mddev->pers)
3909 return -EBUSY;
3910 if (buf == e || (*e && *e != '\n'))
3911 return -EINVAL;
3912 mddev->reshape_position = new;
3913 mddev->delta_disks = 0;
3914 mddev->new_level = mddev->level;
3915 mddev->new_layout = mddev->layout;
3916 mddev->new_chunk_sectors = mddev->chunk_sectors;
3917 return len;
3918}
3919
/* sysfs attribute "reshape_position": world-readable, owner-writable,
 * served by reshape_position_show() and reshape_position_store().
 */
static struct md_sysfs_entry md_reshape_position =
__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
       reshape_position_store);
3923
3924static ssize_t
3925array_size_show(mddev_t *mddev, char *page)
3926{
3927 if (mddev->external_size)
3928 return sprintf(page, "%llu\n",
3929 (unsigned long long)mddev->array_sectors/2);
3930 else
3931 return sprintf(page, "default\n");
3932}
3933
3934static ssize_t
3935array_size_store(mddev_t *mddev, const char *buf, size_t len)
3936{
3937 sector_t sectors;
3938
3939 if (strncmp(buf, "default", 7) == 0) {
3940 if (mddev->pers)
3941 sectors = mddev->pers->size(mddev, 0, 0);
3942 else
3943 sectors = mddev->array_sectors;
3944
3945 mddev->external_size = 0;
3946 } else {
3947 if (strict_blocks_to_sectors(buf, §ors) < 0)
3948 return -EINVAL;
3949 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
3950 return -E2BIG;
3951
3952 mddev->external_size = 1;
3953 }
3954
3955 mddev->array_sectors = sectors;
3956 set_capacity(mddev->gendisk, mddev->array_sectors);
3957 if (mddev->pers)
3958 revalidate_disk(mddev->gendisk);
3959
3960 return len;
3961}
3962
/* sysfs attribute "array_size": world-readable, owner-writable, served by
 * array_size_show() and array_size_store().
 */
static struct md_sysfs_entry md_array_size =
__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
       array_size_store);
3966
/*
 * NULL-terminated list of attributes attached to every md kobject; it is
 * installed through md_ktype.default_attrs.
 */
static struct attribute *md_default_attrs[] = {
	&md_level.attr,
	&md_layout.attr,
	&md_raid_disks.attr,
	&md_chunk_size.attr,
	&md_size.attr,
	&md_resync_start.attr,
	&md_metadata.attr,
	&md_new_device.attr,
	&md_safe_delay.attr,
	&md_array_state.attr,
	&md_reshape_position.attr,
	&md_array_size.attr,
	&max_corr_read_errors.attr,
	NULL,
};
3983
/*
 * NULL-terminated list of the attributes that make up
 * md_redundancy_group.  do_md_run() creates that group only for
 * personalities that provide ->sync_request.
 */
static struct attribute *md_redundancy_attrs[] = {
	&md_scan_mode.attr,
	&md_mismatches.attr,
	&md_sync_min.attr,
	&md_sync_max.attr,
	&md_sync_speed.attr,
	&md_sync_force_parallel.attr,
	&md_sync_completed.attr,
	&md_min_sync.attr,
	&md_max_sync.attr,
	&md_suspend_lo.attr,
	&md_suspend_hi.attr,
	&md_bitmap.attr,
	&md_degraded.attr,
	NULL,
};
/* Unnamed attribute group wrapping md_redundancy_attrs; created in
 * do_md_run() and removed in mddev_delayed_delete().
 */
static struct attribute_group md_redundancy_group = {
	.name = NULL,
	.attrs = md_redundancy_attrs,
};
4004
4005
4006static ssize_t
4007md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
4008{
4009 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4010 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
4011 ssize_t rv;
4012
4013 if (!entry->show)
4014 return -EIO;
4015 rv = mddev_lock(mddev);
4016 if (!rv) {
4017 rv = entry->show(mddev, page);
4018 mddev_unlock(mddev);
4019 }
4020 return rv;
4021}
4022
4023static ssize_t
4024md_attr_store(struct kobject *kobj, struct attribute *attr,
4025 const char *page, size_t length)
4026{
4027 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4028 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
4029 ssize_t rv;
4030
4031 if (!entry->store)
4032 return -EIO;
4033 if (!capable(CAP_SYS_ADMIN))
4034 return -EACCES;
4035 rv = mddev_lock(mddev);
4036 if (mddev->hold_active == UNTIL_IOCTL)
4037 mddev->hold_active = 0;
4038 if (!rv) {
4039 rv = entry->store(mddev, page, length);
4040 mddev_unlock(mddev);
4041 }
4042 return rv;
4043}
4044
4045static void md_free(struct kobject *ko)
4046{
4047 mddev_t *mddev = container_of(ko, mddev_t, kobj);
4048
4049 if (mddev->sysfs_state)
4050 sysfs_put(mddev->sysfs_state);
4051
4052 if (mddev->gendisk) {
4053 del_gendisk(mddev->gendisk);
4054 put_disk(mddev->gendisk);
4055 }
4056 if (mddev->queue)
4057 blk_cleanup_queue(mddev->queue);
4058
4059 kfree(mddev);
4060}
4061
/* sysfs read/write entry points for md kobjects; both dispatch to the
 * per-attribute handlers stored in struct md_sysfs_entry.
 */
static struct sysfs_ops md_sysfs_ops = {
	.show = md_attr_show,
	.store = md_attr_store,
};
/* kobject type of the "md" kobject registered in md_alloc(): md_free()
 * on release, md_sysfs_ops for attribute access, md_default_attrs as the
 * always-present attributes.
 */
static struct kobj_type md_ktype = {
	.release = md_free,
	.sysfs_ops = &md_sysfs_ops,
	.default_attrs = md_default_attrs,
};
4071
/* Major number used by autorun_devices() when building the dev_t of a
 * partitionable array; presumably assigned at registration time elsewhere
 * in the file -- confirm.
 */
int mdp_major = 0;
4073
4074static void mddev_delayed_delete(struct work_struct *ws)
4075{
4076 mddev_t *mddev = container_of(ws, mddev_t, del_work);
4077
4078 if (mddev->private) {
4079 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
4080 if (mddev->private != (void*)1)
4081 sysfs_remove_group(&mddev->kobj, mddev->private);
4082 if (mddev->sysfs_action)
4083 sysfs_put(mddev->sysfs_action);
4084 mddev->sysfs_action = NULL;
4085 mddev->private = NULL;
4086 }
4087 sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
4088 kobject_del(&mddev->kobj);
4089 kobject_put(&mddev->kobj);
4090}
4091
4092static int md_alloc(dev_t dev, char *name)
4093{
4094 static DEFINE_MUTEX(disks_mutex);
4095 mddev_t *mddev = mddev_find(dev);
4096 struct gendisk *disk;
4097 int partitioned;
4098 int shift;
4099 int unit;
4100 int error;
4101
4102 if (!mddev)
4103 return -ENODEV;
4104
4105 partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
4106 shift = partitioned ? MdpMinorShift : 0;
4107 unit = MINOR(mddev->unit) >> shift;
4108
4109
4110
4111
4112 flush_scheduled_work();
4113
4114 mutex_lock(&disks_mutex);
4115 error = -EEXIST;
4116 if (mddev->gendisk)
4117 goto abort;
4118
4119 if (name) {
4120
4121
4122 mddev_t *mddev2;
4123 spin_lock(&all_mddevs_lock);
4124
4125 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
4126 if (mddev2->gendisk &&
4127 strcmp(mddev2->gendisk->disk_name, name) == 0) {
4128 spin_unlock(&all_mddevs_lock);
4129 goto abort;
4130 }
4131 spin_unlock(&all_mddevs_lock);
4132 }
4133
4134 error = -ENOMEM;
4135 mddev->queue = blk_alloc_queue(GFP_KERNEL);
4136 if (!mddev->queue)
4137 goto abort;
4138 mddev->queue->queuedata = mddev;
4139
4140
4141 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
4142
4143 blk_queue_make_request(mddev->queue, md_make_request);
4144
4145 disk = alloc_disk(1 << shift);
4146 if (!disk) {
4147 blk_cleanup_queue(mddev->queue);
4148 mddev->queue = NULL;
4149 goto abort;
4150 }
4151 disk->major = MAJOR(mddev->unit);
4152 disk->first_minor = unit << shift;
4153 if (name)
4154 strcpy(disk->disk_name, name);
4155 else if (partitioned)
4156 sprintf(disk->disk_name, "md_d%d", unit);
4157 else
4158 sprintf(disk->disk_name, "md%d", unit);
4159 disk->fops = &md_fops;
4160 disk->private_data = mddev;
4161 disk->queue = mddev->queue;
4162
4163
4164
4165
4166 disk->flags |= GENHD_FL_EXT_DEVT;
4167 add_disk(disk);
4168 mddev->gendisk = disk;
4169 error = kobject_init_and_add(&mddev->kobj, &md_ktype,
4170 &disk_to_dev(disk)->kobj, "%s", "md");
4171 if (error) {
4172
4173
4174
4175 printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
4176 disk->disk_name);
4177 error = 0;
4178 }
4179 if (sysfs_create_group(&mddev->kobj, &md_bitmap_group))
4180 printk(KERN_DEBUG "pointless warning\n");
4181 abort:
4182 mutex_unlock(&disks_mutex);
4183 if (!error) {
4184 kobject_uevent(&mddev->kobj, KOBJ_ADD);
4185 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
4186 }
4187 mddev_put(mddev);
4188 return error;
4189}
4190
4191static struct kobject *md_probe(dev_t dev, int *part, void *data)
4192{
4193 md_alloc(dev, NULL);
4194 return NULL;
4195}
4196
4197static int add_named_array(const char *val, struct kernel_param *kp)
4198{
4199
4200
4201
4202
4203 int len = strlen(val);
4204 char buf[DISK_NAME_LEN];
4205
4206 while (len && val[len-1] == '\n')
4207 len--;
4208 if (len >= DISK_NAME_LEN)
4209 return -E2BIG;
4210 strlcpy(buf, val, len+1);
4211 if (strncmp(buf, "md_", 3) != 0)
4212 return -EINVAL;
4213 return md_alloc(0, buf);
4214}
4215
4216static void md_safemode_timeout(unsigned long data)
4217{
4218 mddev_t *mddev = (mddev_t *) data;
4219
4220 if (!atomic_read(&mddev->writes_pending)) {
4221 mddev->safemode = 1;
4222 if (mddev->external)
4223 sysfs_notify_dirent(mddev->sysfs_state);
4224 }
4225 md_wakeup_thread(mddev->thread);
4226}
4227
/* Copied into mddev->ok_start_degraded by do_md_run(); how it is set is
 * not visible here (presumably a module parameter -- confirm).
 */
static int start_dirty_degraded;
4229
/*
 * Start the array described by @mddev.
 *
 * Validates the member devices, attaches the personality matching the
 * configured level, runs it, optionally creates the bitmap, and then
 * publishes the running array (sysfs group and links, capacity, events).
 *
 * Returns 0 on success.  Errors visible here:
 *   -EINVAL  no member devices; no raid_disks on a non-persistent array;
 *            data/metadata overlap on a member; personality not loaded;
 *            reshape pending but unsupported by the personality; array
 *            size larger than what the personality provides
 *   -EBUSY   a personality is already attached
 *   other    whatever pers->run() or bitmap_create() returned
 */
static int do_md_run(mddev_t * mddev)
{
	int err;
	mdk_rdev_t *rdev;
	struct gendisk *disk;
	struct mdk_personality *pers;

	if (list_empty(&mddev->disks))
		/* an array without member devices cannot be run */
		return -EINVAL;

	if (mddev->pers)
		return -EBUSY;

	/*
	 * With raid_disks still zero the configuration has to be obtained
	 * through analyze_sbs(), which is only attempted for persistent
	 * metadata.
	 */
	if (!mddev->raid_disks) {
		if (!mddev->persistent)
			return -EINVAL;
		analyze_sbs(mddev);
	}

	/* Ask for the personality module, by numeric level or by name. */
	if (mddev->level != LEVEL_NONE)
		request_module("md-level-%d", mddev->level);
	else if (mddev->clevel[0])
		request_module("md-%s", mddev->clevel);

	/*
	 * For every non-faulty member: flush and drop cached data of the
	 * underlying block device, then check that the data area and the
	 * superblock do not overlap.
	 */
	list_for_each_entry(rdev, &mddev->disks, same_set) {
		if (test_bit(Faulty, &rdev->flags))
			continue;
		sync_blockdev(rdev->bdev);
		invalidate_bdev(rdev->bdev);

		if (rdev->data_offset < rdev->sb_start) {
			/* data comes first: it must end at or before the
			 * superblock (only checked once dev_sectors is known)
			 */
			if (mddev->dev_sectors &&
			    rdev->data_offset + mddev->dev_sectors
			    > rdev->sb_start) {
				printk("md: %s: data overlaps metadata\n",
				       mdname(mddev));
				return -EINVAL;
			}
		} else {
			/* superblock comes first: it must end at or before
			 * the data (sb_size / 512 -- sb_size is presumably
			 * in bytes)
			 */
			if (rdev->sb_start + rdev->sb_size/512
			    > rdev->data_offset) {
				printk("md: %s: metadata overlaps data\n",
				       mdname(mddev));
				return -EINVAL;
			}
		}
		sysfs_notify_dirent(rdev->sysfs_state);
	}

	disk = mddev->gendisk;

	/* Look up the personality and pin its module under pers_lock. */
	spin_lock(&pers_lock);
	pers = find_pers(mddev->level, mddev->clevel);
	if (!pers || !try_module_get(pers->owner)) {
		spin_unlock(&pers_lock);
		if (mddev->level != LEVEL_NONE)
			printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
			       mddev->level);
		else
			printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
			       mddev->clevel);
		return -EINVAL;
	}
	mddev->pers = pers;
	spin_unlock(&pers_lock);
	/* Align the numeric level and the level name with the personality
	 * that was actually found.
	 */
	if (mddev->level != pers->level) {
		mddev->level = pers->level;
		mddev->new_level = pers->level;
	}
	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));

	if (mddev->reshape_position != MaxSector &&
	    pers->start_reshape == NULL) {
		/* a reshape position is recorded but this personality has
		 * no start_reshape method
		 */
		mddev->pers = NULL;
		module_put(pers->owner);
		return -EINVAL;
	}

	if (pers->sync_request) {
		/*
		 * Warn when two members share the same containing block
		 * device (bd_contains).  The rdev < rdev2 pointer comparison
		 * makes each pair get examined exactly once.
		 */
		char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
		mdk_rdev_t *rdev2;
		int warned = 0;

		list_for_each_entry(rdev, &mddev->disks, same_set)
			list_for_each_entry(rdev2, &mddev->disks, same_set) {
				if (rdev < rdev2 &&
				    rdev->bdev->bd_contains ==
				    rdev2->bdev->bd_contains) {
					printk(KERN_WARNING
					       "%s: WARNING: %s appears to be"
					       " on the same physical disk as"
					       " %s.\n",
					       mdname(mddev),
					       bdevname(rdev->bdev,b),
					       bdevname(rdev2->bdev,b2));
					warned = 1;
				}
			}

		if (warned)
			printk(KERN_WARNING
			       "True protection against single-disk"
			       " failure might be compromised.\n");
	}

	mddev->recovery = 0;
	/* initial value; the personality's run() may change it */
	mddev->resync_max_sectors = mddev->dev_sectors;

	mddev->barriers_work = 1;
	mddev->ok_start_degraded = start_dirty_degraded;

	/* ro == 2 is undone below for personalities without sync_request */
	if (start_readonly && mddev->ro == 0)
		mddev->ro = 2;

	err = mddev->pers->run(mddev);
	if (err)
		printk(KERN_ERR "md: pers->run() failed ...\n");
	else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
		/* the configured array size exceeds what the personality can
		 * provide; that is only expected with an external size
		 */
		WARN_ONCE(!mddev->external_size, "%s: default size too small,"
			  " but 'external_size' not in effect?\n", __func__);
		printk(KERN_ERR
		       "md: invalid array_size %llu > default size %llu\n",
		       (unsigned long long)mddev->array_sectors / 2,
		       (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
		err = -EINVAL;
		mddev->pers->stop(mddev);
	}
	if (err == 0 && mddev->pers->sync_request) {
		err = bitmap_create(mddev);
		if (err) {
			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
			       mdname(mddev), err);
			mddev->pers->stop(mddev);
		}
	}
	if (err) {
		/* common unwind: release the module, detach the personality
		 * and drop any bitmap
		 */
		module_put(mddev->pers->owner);
		mddev->pers = NULL;
		bitmap_destroy(mddev);
		return err;
	}
	if (mddev->pers->sync_request) {
		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
			printk(KERN_WARNING
			       "md: cannot register extra attributes for %s\n",
			       mdname(mddev));
		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
	} else if (mddev->ro == 2)
		mddev->ro = 0;

	atomic_set(&mddev->writes_pending,0);
	atomic_set(&mddev->max_corr_read_errors,
		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
	mddev->safemode = 0;
	mddev->safemode_timer.function = md_safemode_timeout;
	mddev->safemode_timer.data = (unsigned long) mddev;
	mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec in jiffies, plus one */
	mddev->in_sync = 1;

	/* Create an "rd<N>" sysfs link for every member that holds a slot. */
	list_for_each_entry(rdev, &mddev->disks, same_set)
		if (rdev->raid_disk >= 0) {
			char nm[20];
			sprintf(nm, "rd%d", rdev->raid_disk);
			if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
				printk("md: cannot register %s for %s\n",
				       nm, mdname(mddev));
		}

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);

	/* pending flags mean the superblocks need to be written out */
	if (mddev->flags)
		md_update_sb(mddev, 0);

	set_capacity(disk, mddev->array_sectors);

	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread);

	/* Announce the now-running array: block layer, md event counter,
	 * sysfs pollers and udev.
	 */
	revalidate_disk(mddev->gendisk);
	mddev->changed = 1;
	md_new_event(mddev);
	sysfs_notify_dirent(mddev->sysfs_state);
	if (mddev->sysfs_action)
		sysfs_notify_dirent(mddev->sysfs_action);
	sysfs_notify(&mddev->kobj, NULL, "degraded");
	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
	return 0;
}
4436
4437static int restart_array(mddev_t *mddev)
4438{
4439 struct gendisk *disk = mddev->gendisk;
4440
4441
4442 if (list_empty(&mddev->disks))
4443 return -ENXIO;
4444 if (!mddev->pers)
4445 return -EINVAL;
4446 if (!mddev->ro)
4447 return -EBUSY;
4448 mddev->safemode = 0;
4449 mddev->ro = 0;
4450 set_disk_ro(disk, 0);
4451 printk(KERN_INFO "md: %s switched to read-write mode.\n",
4452 mdname(mddev));
4453
4454 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4455 md_wakeup_thread(mddev->thread);
4456 md_wakeup_thread(mddev->sync_thread);
4457 sysfs_notify_dirent(mddev->sysfs_state);
4458 return 0;
4459}
4460
4461
4462
4463static int deny_bitmap_write_access(struct file * file)
4464{
4465 struct inode *inode = file->f_mapping->host;
4466
4467 spin_lock(&inode->i_lock);
4468 if (atomic_read(&inode->i_writecount) > 1) {
4469 spin_unlock(&inode->i_lock);
4470 return -ETXTBSY;
4471 }
4472 atomic_set(&inode->i_writecount, -1);
4473 spin_unlock(&inode->i_lock);
4474
4475 return 0;
4476}
4477
4478void restore_bitmap_write_access(struct file *file)
4479{
4480 struct inode *inode = file->f_mapping->host;
4481
4482 spin_lock(&inode->i_lock);
4483 atomic_set(&inode->i_writecount, 1);
4484 spin_unlock(&inode->i_lock);
4485}
4486
4487
4488
4489
4490
4491
/*
 * Stop @mddev, or switch it to read-only.
 *
 * @mode:    1 - switch the array to read-only
 *           0 - stop the personality and reset the whole configuration
 *           2 - stop the personality but keep the configuration
 * @is_open: number of openers the caller tolerates; more than that makes
 *           the call fail with -EBUSY.
 *
 * Returns 0 on success, -EBUSY when the array is still in use, -ENXIO
 * when asked for read-only on an array that already is read-only.
 */
static int do_md_stop(mddev_t * mddev, int mode, int is_open)
{
	int err = 0;
	struct gendisk *disk = mddev->gendisk;
	mdk_rdev_t *rdev;

	mutex_lock(&mddev->open_mutex);
	if (atomic_read(&mddev->openers) > is_open) {
		printk("md: %s still in use.\n",mdname(mddev));
		err = -EBUSY;
	} else if (mddev->pers) {

		/* Interrupt and reap a running sync thread; FROZEN stays
		 * set until the end of this branch.
		 */
		if (mddev->sync_thread) {
			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			md_unregister_thread(mddev->sync_thread);
			mddev->sync_thread = NULL;
		}

		del_timer_sync(&mddev->safemode_timer);

		switch(mode) {
		case 1: /* go read-only */
			err  = -ENXIO;
			if (mddev->ro==1)
				goto out;
			mddev->ro = 1;
			break;
		case 0: /* full stop */
		case 2: /* stop, keeping the configuration */
			bitmap_flush(mddev);
			md_super_wait(mddev);
			if (mddev->ro)
				set_disk_ro(disk, 0);

			mddev->pers->stop(mddev);
			mddev->queue->merge_bvec_fn = NULL;
			mddev->queue->unplug_fn = NULL;
			mddev->queue->backing_dev_info.congested_fn = NULL;
			module_put(mddev->pers->owner);
			/* (void*)1 marks "redundancy group present, nothing
			 * else in ->private" for mddev_delayed_delete()
			 */
			if (mddev->pers->sync_request && mddev->private == NULL)
				mddev->private = (void*)1;
			mddev->pers = NULL;
			/* array_state has changed */
			sysfs_notify_dirent(mddev->sysfs_state);

			/* remove the "rd<N>" links created by do_md_run() */
			list_for_each_entry(rdev, &mddev->disks, same_set)
				if (rdev->raid_disk >= 0) {
					char nm[20];
					sprintf(nm, "rd%d", rdev->raid_disk);
					sysfs_remove_link(&mddev->kobj, nm);
				}

			set_capacity(disk, 0);
			mddev->changed = 1;

			if (mddev->ro)
				mddev->ro = 0;
		}
		if (!mddev->in_sync || mddev->flags) {
			/* write the superblocks with the array marked in-sync */
			mddev->in_sync = 1;
			md_update_sb(mddev, 1);
		}
		if (mode == 1)
			set_disk_ro(disk, 1);
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		err = 0;
	}
out:
	mutex_unlock(&mddev->open_mutex);
	if (err)
		return err;

	/*
	 * For a full stop, release the bitmap and the member devices and
	 * reset every configuration field to its initial value.
	 */
	if (mode == 0) {

		printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));

		bitmap_destroy(mddev);
		if (mddev->bitmap_info.file) {
			restore_bitmap_write_access(mddev->bitmap_info.file);
			fput(mddev->bitmap_info.file);
			mddev->bitmap_info.file = NULL;
		}
		mddev->bitmap_info.offset = 0;

		export_array(mddev);

		mddev->array_sectors = 0;
		mddev->external_size = 0;
		mddev->dev_sectors = 0;
		mddev->raid_disks = 0;
		mddev->recovery_cp = 0;
		mddev->resync_min = 0;
		mddev->resync_max = MaxSector;
		mddev->reshape_position = MaxSector;
		mddev->external = 0;
		mddev->persistent = 0;
		mddev->level = LEVEL_NONE;
		mddev->clevel[0] = 0;
		mddev->flags = 0;
		mddev->ro = 0;
		mddev->metadata_type[0] = 0;
		mddev->chunk_sectors = 0;
		mddev->ctime = mddev->utime = 0;
		mddev->layout = 0;
		mddev->max_disks = 0;
		mddev->events = 0;
		mddev->delta_disks = 0;
		mddev->new_level = LEVEL_NONE;
		mddev->new_layout = 0;
		mddev->new_chunk_sectors = 0;
		mddev->curr_resync = 0;
		mddev->resync_mismatches = 0;
		mddev->suspend_lo = mddev->suspend_hi = 0;
		mddev->sync_speed_min = mddev->sync_speed_max = 0;
		mddev->recovery = 0;
		mddev->in_sync = 0;
		mddev->changed = 0;
		mddev->degraded = 0;
		mddev->barriers_work = 0;
		mddev->safemode = 0;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.default_offset = 0;
		mddev->bitmap_info.chunksize = 0;
		mddev->bitmap_info.daemon_sleep = 0;
		mddev->bitmap_info.max_write_behind = 0;
		kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
		if (mddev->hold_active == UNTIL_STOP)
			mddev->hold_active = 0;

	} else if (mddev->pers)
		printk(KERN_INFO "md: %s switched to read-only mode.\n",
		       mdname(mddev));
	err = 0;
	blk_integrity_unregister(disk);
	md_new_event(mddev);
	sysfs_notify_dirent(mddev->sysfs_state);
	return err;
}
4634
4635#ifndef MODULE
4636static void autorun_array(mddev_t *mddev)
4637{
4638 mdk_rdev_t *rdev;
4639 int err;
4640
4641 if (list_empty(&mddev->disks))
4642 return;
4643
4644 printk(KERN_INFO "md: running: ");
4645
4646 list_for_each_entry(rdev, &mddev->disks, same_set) {
4647 char b[BDEVNAME_SIZE];
4648 printk("<%s>", bdevname(rdev->bdev,b));
4649 }
4650 printk("\n");
4651
4652 err = do_md_run(mddev);
4653 if (err) {
4654 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
4655 do_md_stop(mddev, 0, 0);
4656 }
4657}
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
/*
 * Assemble and start arrays from the devices on pending_raid_disks.
 *
 * Each round takes the first pending device (rdev0), moves every pending
 * device for which super_90_load(rdev, rdev0, 0) succeeds onto a local
 * candidate list, and tries to bind those candidates to the md device
 * named by rdev0's preferred minor and run it.  Candidates that were not
 * bound are exported at the end of the round.
 *
 * @part: non-zero to create the array under mdp_major (partitionable),
 *        zero to use MD_MAJOR.
 *
 * The loop stops early when the preferred minor does not survive the
 * dev_t round trip or when the md device cannot be set up.
 */
static void autorun_devices(int part)
{
	mdk_rdev_t *rdev0, *rdev, *tmp;
	mddev_t *mddev;
	char b[BDEVNAME_SIZE];

	printk(KERN_INFO "md: autorun ...\n");
	while (!list_empty(&pending_raid_disks)) {
		int unit;
		dev_t dev;
		LIST_HEAD(candidates);
		rdev0 = list_entry(pending_raid_disks.next,
					 mdk_rdev_t, same_set);

		printk(KERN_INFO "md: considering %s ...\n",
			bdevname(rdev0->bdev,b));
		INIT_LIST_HEAD(&candidates);
		/* rdev0 itself is on the pending list and is visited too */
		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
			if (super_90_load(rdev, rdev0, 0) >= 0) {
				printk(KERN_INFO "md:  adding %s ...\n",
					bdevname(rdev->bdev,b));
				list_move(&rdev->same_set, &candidates);
			}

		/*
		 * Build the device number from the preferred minor and
		 * derive the unit back from it, so that a minor that does
		 * not fit is detected below.
		 */
		if (part) {
			dev = MKDEV(mdp_major,
				    rdev0->preferred_minor << MdpMinorShift);
			unit = MINOR(dev) >> MdpMinorShift;
		} else {
			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
			unit = MINOR(dev);
		}
		if (rdev0->preferred_minor != unit) {
			printk(KERN_INFO "md: unit number in %s is bad: %d\n",
			       bdevname(rdev0->bdev, b), rdev0->preferred_minor);
			break;
		}

		/* md_probe() creates the device; mddev_find() then returns
		 * it with a reference that is dropped at the end of the round
		 */
		md_probe(dev, NULL, NULL);
		mddev = mddev_find(dev);
		if (!mddev || !mddev->gendisk) {
			if (mddev)
				mddev_put(mddev);
			printk(KERN_ERR
				"md: cannot allocate memory for md drive.\n");
			break;
		}
		if (mddev_lock(mddev))
			printk(KERN_WARNING "md: %s locked, cannot run\n",
			       mdname(mddev));
		else if (mddev->raid_disks || mddev->major_version
			 || !list_empty(&mddev->disks)) {
			/* the array is already configured in some way */
			printk(KERN_WARNING
				"md: %s already running, cannot run %s\n",
				mdname(mddev), bdevname(rdev0->bdev,b));
			mddev_unlock(mddev);
		} else {
			printk(KERN_INFO "md: created %s\n", mdname(mddev));
			mddev->persistent = 1;
			rdev_for_each_list(rdev, tmp, &candidates) {
				list_del_init(&rdev->same_set);
				if (bind_rdev_to_array(rdev, mddev))
					export_rdev(rdev);
			}
			autorun_array(mddev);
			mddev_unlock(mddev);
		}
		/* Anything still on the candidate list was not bound to an
		 * array in this round; export it.
		 */
		rdev_for_each_list(rdev, tmp, &candidates) {
			list_del_init(&rdev->same_set);
			export_rdev(rdev);
		}
		mddev_put(mddev);
	}
	printk(KERN_INFO "md: ... autorun DONE.\n");
}
4753#endif
4754
4755static int get_version(void __user * arg)
4756{
4757 mdu_version_t ver;
4758
4759 ver.major = MD_MAJOR_VERSION;
4760 ver.minor = MD_MINOR_VERSION;
4761 ver.patchlevel = MD_PATCHLEVEL_VERSION;
4762
4763 if (copy_to_user(arg, &ver, sizeof(ver)))
4764 return -EFAULT;
4765
4766 return 0;
4767}
4768
4769static int get_array_info(mddev_t * mddev, void __user * arg)
4770{
4771 mdu_array_info_t info;
4772 int nr,working,insync,failed,spare;
4773 mdk_rdev_t *rdev;
4774
4775 nr=working=insync=failed=spare=0;
4776 list_for_each_entry(rdev, &mddev->disks, same_set) {
4777 nr++;
4778 if (test_bit(Faulty, &rdev->flags))
4779 failed++;
4780 else {
4781 working++;
4782 if (test_bit(In_sync, &rdev->flags))
4783 insync++;
4784 else
4785 spare++;
4786 }
4787 }
4788
4789 info.major_version = mddev->major_version;
4790 info.minor_version = mddev->minor_version;
4791 info.patch_version = MD_PATCHLEVEL_VERSION;
4792 info.ctime = mddev->ctime;
4793 info.level = mddev->level;
4794 info.size = mddev->dev_sectors / 2;
4795 if (info.size != mddev->dev_sectors / 2)
4796 info.size = -1;
4797 info.nr_disks = nr;
4798 info.raid_disks = mddev->raid_disks;
4799 info.md_minor = mddev->md_minor;
4800 info.not_persistent= !mddev->persistent;
4801
4802 info.utime = mddev->utime;
4803 info.state = 0;
4804 if (mddev->in_sync)
4805 info.state = (1<<MD_SB_CLEAN);
4806 if (mddev->bitmap && mddev->bitmap_info.offset)
4807 info.state = (1<<MD_SB_BITMAP_PRESENT);
4808 info.active_disks = insync;
4809 info.working_disks = working;
4810 info.failed_disks = failed;
4811 info.spare_disks = spare;
4812
4813 info.layout = mddev->layout;
4814 info.chunk_size = mddev->chunk_sectors << 9;
4815
4816 if (copy_to_user(arg, &info, sizeof(info)))
4817 return -EFAULT;
4818
4819 return 0;
4820}
4821
4822static int get_bitmap_file(mddev_t * mddev, void __user * arg)
4823{
4824 mdu_bitmap_file_t *file = NULL;
4825 char *ptr, *buf = NULL;
4826 int err = -ENOMEM;
4827
4828 if (md_allow_write(mddev))
4829 file = kmalloc(sizeof(*file), GFP_NOIO);
4830 else
4831 file = kmalloc(sizeof(*file), GFP_KERNEL);
4832
4833 if (!file)
4834 goto out;
4835
4836
4837 if (!mddev->bitmap || !mddev->bitmap->file) {
4838 file->pathname[0] = '\0';
4839 goto copy_out;
4840 }
4841
4842 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
4843 if (!buf)
4844 goto out;
4845
4846 ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname));
4847 if (IS_ERR(ptr))
4848 goto out;
4849
4850 strcpy(file->pathname, ptr);
4851
4852copy_out:
4853 err = 0;
4854 if (copy_to_user(arg, file, sizeof(*file)))
4855 err = -EFAULT;
4856out:
4857 kfree(buf);
4858 kfree(file);
4859 return err;
4860}
4861
4862static int get_disk_info(mddev_t * mddev, void __user * arg)
4863{
4864 mdu_disk_info_t info;
4865 mdk_rdev_t *rdev;
4866
4867 if (copy_from_user(&info, arg, sizeof(info)))
4868 return -EFAULT;
4869
4870 rdev = find_rdev_nr(mddev, info.number);
4871 if (rdev) {
4872 info.major = MAJOR(rdev->bdev->bd_dev);
4873 info.minor = MINOR(rdev->bdev->bd_dev);
4874 info.raid_disk = rdev->raid_disk;
4875 info.state = 0;
4876 if (test_bit(Faulty, &rdev->flags))
4877 info.state |= (1<<MD_DISK_FAULTY);
4878 else if (test_bit(In_sync, &rdev->flags)) {
4879 info.state |= (1<<MD_DISK_ACTIVE);
4880 info.state |= (1<<MD_DISK_SYNC);
4881 }
4882 if (test_bit(WriteMostly, &rdev->flags))
4883 info.state |= (1<<MD_DISK_WRITEMOSTLY);
4884 } else {
4885 info.major = info.minor = 0;
4886 info.raid_disk = -1;
4887 info.state = (1<<MD_DISK_REMOVED);
4888 }
4889
4890 if (copy_to_user(arg, &info, sizeof(info)))
4891 return -EFAULT;
4892
4893 return 0;
4894}
4895
/*
 * Add the device described by @info to @mddev.
 *
 * Three situations are handled:
 *   1. raid_disks == 0: the array is not configured yet.  The device is
 *      imported with the array's metadata version and, unless it is the
 *      first one, its superblock is checked against the first member.
 *   2. A personality is attached: the device is added to a running array
 *      (requires pers->hot_add_disk).
 *   3. Otherwise: only permitted for major_version 0; the device is set
 *      up from the fields of @info.
 *
 * Returns 0 on success, -EOVERFLOW when major/minor do not fit a dev_t,
 * -EINVAL for unsupported operations or a superblock mismatch, or the
 * error from md_import_device(), bind_rdev_to_array() or hot_add_disk().
 */
static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	mdk_rdev_t *rdev;
	dev_t dev = MKDEV(info->major,info->minor);

	/* reject major/minor values that MKDEV() could not represent */
	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
		return -EOVERFLOW;

	if (!mddev->raid_disks) {
		int err;
		/* case 1: array not configured yet */
		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
				"md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		if (!list_empty(&mddev->disks)) {
			/* compare against the first device already present */
			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
							mdk_rdev_t, same_set);
			err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0) {
				printk(KERN_WARNING
					"md: %s has different UUID to %s\n",
					bdevname(rdev->bdev,b),
					bdevname(rdev0->bdev,b2));
				export_rdev(rdev);
				return -EINVAL;
			}
		}
		err = bind_rdev_to_array(rdev, mddev);
		if (err)
			export_rdev(rdev);
		return err;
	}

	/*
	 * case 2: a personality is attached, so this is an addition to a
	 * running array
	 */
	if (mddev->pers) {
		int err;
		if (!mddev->pers->hot_add_disk) {
			printk(KERN_WARNING
				"%s: personality does not support diskops!\n",
			       mdname(mddev));
			return -EINVAL;
		}
		if (mddev->persistent)
			rdev = md_import_device(dev, mddev->major_version,
						mddev->minor_version);
		else
			rdev = md_import_device(dev, -1, -1);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
				"md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		/* Work out the slot: from @info for non-persistent arrays,
		 * from the superblock otherwise.
		 */
		if (!mddev->persistent) {
			if (info->state & (1<<MD_DISK_SYNC)  &&
			    info->raid_disk < mddev->raid_disks)
				rdev->raid_disk = info->raid_disk;
			else
				rdev->raid_disk = -1;
		} else
			super_types[mddev->major_version].
				validate_super(mddev, rdev);
		/* remember the slot; raid_disk itself is reset below */
		rdev->saved_raid_disk = rdev->raid_disk;

		clear_bit(In_sync, &rdev->flags); /* not in sync until proven otherwise */
		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		else
			clear_bit(WriteMostly, &rdev->flags);

		rdev->raid_disk = -1;
		err = bind_rdev_to_array(rdev, mddev);
		if (!err && !mddev->pers->hot_remove_disk) {
			/* The personality can add but not remove disks:
			 * hand the device to it right away (presumably
			 * because such additions change the geometry --
			 * confirm).
			 */
			super_types[mddev->major_version].
				validate_super(mddev, rdev);
			err = mddev->pers->hot_add_disk(mddev, rdev);
			if (err)
				unbind_rdev_from_array(rdev);
		}
		if (err)
			export_rdev(rdev);
		else
			sysfs_notify_dirent(rdev->sysfs_state);

		/* the superblock update and recovery request happen whether
		 * or not the add succeeded
		 */
		md_update_sb(mddev, 1);
		if (mddev->degraded)
			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		return err;
	}

	/* case 3: configured but not running; only for version-0 metadata */
	if (mddev->major_version != 0) {
		printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
		       mdname(mddev));
		return -EINVAL;
	}

	/* a device described as faulty is not imported at all */
	if (!(info->state & (1<<MD_DISK_FAULTY))) {
		int err;
		rdev = md_import_device(dev, -1, 0);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
				"md: error, md_import_device() returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		rdev->desc_nr = info->number;
		if (info->raid_disk < mddev->raid_disks)
			rdev->raid_disk = info->raid_disk;
		else
			rdev->raid_disk = -1;

		if (rdev->raid_disk < mddev->raid_disks)
			if (info->state & (1<<MD_DISK_SYNC))
				set_bit(In_sync, &rdev->flags);

		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);

		/* Superblock location: end of the device (i_size / 512) for
		 * non-persistent arrays, calc_dev_sboffset() otherwise.  The
		 * usable size is everything in front of it.
		 */
		if (!mddev->persistent) {
			printk(KERN_INFO "md: nonpersistent superblock ...\n");
			rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
		} else
			rdev->sb_start = calc_dev_sboffset(rdev->bdev);
		rdev->sectors = rdev->sb_start;

		err = bind_rdev_to_array(rdev, mddev);
		if (err) {
			export_rdev(rdev);
			return err;
		}
	}

	return 0;
}
5050
5051static int hot_remove_disk(mddev_t * mddev, dev_t dev)
5052{
5053 char b[BDEVNAME_SIZE];
5054 mdk_rdev_t *rdev;
5055
5056 rdev = find_rdev(mddev, dev);
5057 if (!rdev)
5058 return -ENXIO;
5059
5060 if (rdev->raid_disk >= 0)
5061 goto busy;
5062
5063 kick_rdev_from_array(rdev);
5064 md_update_sb(mddev, 1);
5065 md_new_event(mddev);
5066
5067 return 0;
5068busy:
5069 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
5070 bdevname(rdev->bdev,b), mdname(mddev));
5071 return -EBUSY;
5072}
5073
/*
 * Add the device @dev to the running array @mddev as an unassigned
 * (raid_disk == -1) member and request a recovery check.
 *
 * Only available for version-0 metadata and personalities implementing
 * ->hot_add_disk.
 *
 * Returns 0 on success, -ENODEV when the array is not running, -EINVAL
 * when the operation is unsupported, the import fails or the device is
 * faulty, or the error from bind_rdev_to_array().
 */
static int hot_add_disk(mddev_t * mddev, dev_t dev)
{
	char b[BDEVNAME_SIZE];
	int err;
	mdk_rdev_t *rdev;

	if (!mddev->pers)
		return -ENODEV;

	if (mddev->major_version != 0) {
		printk(KERN_WARNING "%s: HOT_ADD may only be used with"
			" version-0 superblocks.\n",
			mdname(mddev));
		return -EINVAL;
	}
	if (!mddev->pers->hot_add_disk) {
		printk(KERN_WARNING
			"%s: personality does not support diskops!\n",
			mdname(mddev));
		return -EINVAL;
	}

	rdev = md_import_device(dev, -1, 0);
	if (IS_ERR(rdev)) {
		/* the specific import error is logged; callers get -EINVAL */
		printk(KERN_WARNING
			"md: error, md_import_device() returned %ld\n",
			PTR_ERR(rdev));
		return -EINVAL;
	}

	/* Superblock location: calc_dev_sboffset() for persistent arrays,
	 * end of the device (i_size / 512) otherwise.
	 */
	if (mddev->persistent)
		rdev->sb_start = calc_dev_sboffset(rdev->bdev);
	else
		rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;

	/* everything in front of the superblock is usable */
	rdev->sectors = rdev->sb_start;

	if (test_bit(Faulty, &rdev->flags)) {
		printk(KERN_WARNING
			"md: can not hot-add faulty %s disk to %s!\n",
			bdevname(rdev->bdev,b), mdname(mddev));
		err = -EINVAL;
		goto abort_export;
	}
	clear_bit(In_sync, &rdev->flags);
	rdev->desc_nr = -1;
	rdev->saved_raid_disk = -1;
	err = bind_rdev_to_array(rdev, mddev);
	if (err)
		goto abort_export;

	/*
	 * The device is bound without a slot; assigning one is left to the
	 * recovery requested below.
	 */
	rdev->raid_disk = -1;

	md_update_sb(mddev, 1);

	/*
	 * Flag that recovery is needed and wake the md thread so that it
	 * acts on it.
	 */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_new_event(mddev);
	return 0;

abort_export:
	export_rdev(rdev);
	return err;
}
5147
5148static int set_bitmap_file(mddev_t *mddev, int fd)
5149{
5150 int err;
5151
5152 if (mddev->pers) {
5153 if (!mddev->pers->quiesce)
5154 return -EBUSY;
5155 if (mddev->recovery || mddev->sync_thread)
5156 return -EBUSY;
5157
5158 }
5159
5160
5161 if (fd >= 0) {
5162 if (mddev->bitmap)
5163 return -EEXIST;
5164 mddev->bitmap_info.file = fget(fd);
5165
5166 if (mddev->bitmap_info.file == NULL) {
5167 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
5168 mdname(mddev));
5169 return -EBADF;
5170 }
5171
5172 err = deny_bitmap_write_access(mddev->bitmap_info.file);
5173 if (err) {
5174 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
5175 mdname(mddev));
5176 fput(mddev->bitmap_info.file);
5177 mddev->bitmap_info.file = NULL;
5178 return err;
5179 }
5180 mddev->bitmap_info.offset = 0;
5181 } else if (mddev->bitmap == NULL)
5182 return -ENOENT;
5183 err = 0;
5184 if (mddev->pers) {
5185 mddev->pers->quiesce(mddev, 1);
5186 if (fd >= 0)
5187 err = bitmap_create(mddev);
5188 if (fd < 0 || err) {
5189 bitmap_destroy(mddev);
5190 fd = -1;
5191 }
5192 mddev->pers->quiesce(mddev, 0);
5193 }
5194 if (fd < 0) {
5195 if (mddev->bitmap_info.file) {
5196 restore_bitmap_write_access(mddev->bitmap_info.file);
5197 fput(mddev->bitmap_info.file);
5198 }
5199 mddev->bitmap_info.file = NULL;
5200 }
5201
5202 return err;
5203}
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
5219{
5220
5221 if (info->raid_disks == 0) {
5222
5223 if (info->major_version < 0 ||
5224 info->major_version >= ARRAY_SIZE(super_types) ||
5225 super_types[info->major_version].name == NULL) {
5226
5227 printk(KERN_INFO
5228 "md: superblock version %d not known\n",
5229 info->major_version);
5230 return -EINVAL;
5231 }
5232 mddev->major_version = info->major_version;
5233 mddev->minor_version = info->minor_version;
5234 mddev->patch_version = info->patch_version;
5235 mddev->persistent = !info->not_persistent;
5236
5237
5238
5239 mddev->ctime = get_seconds();
5240 return 0;
5241 }
5242 mddev->major_version = MD_MAJOR_VERSION;
5243 mddev->minor_version = MD_MINOR_VERSION;
5244 mddev->patch_version = MD_PATCHLEVEL_VERSION;
5245 mddev->ctime = get_seconds();
5246
5247 mddev->level = info->level;
5248 mddev->clevel[0] = 0;
5249 mddev->dev_sectors = 2 * (sector_t)info->size;
5250 mddev->raid_disks = info->raid_disks;
5251
5252
5253
5254 if (info->state & (1<<MD_SB_CLEAN))
5255 mddev->recovery_cp = MaxSector;
5256 else
5257 mddev->recovery_cp = 0;
5258 mddev->persistent = ! info->not_persistent;
5259 mddev->external = 0;
5260
5261 mddev->layout = info->layout;
5262 mddev->chunk_sectors = info->chunk_size >> 9;
5263
5264 mddev->max_disks = MD_SB_DISKS;
5265
5266 if (mddev->persistent)
5267 mddev->flags = 0;
5268 set_bit(MD_CHANGE_DEVS, &mddev->flags);
5269
5270 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
5271 mddev->bitmap_info.offset = 0;
5272
5273 mddev->reshape_position = MaxSector;
5274
5275
5276
5277
5278 get_random_bytes(mddev->uuid, 16);
5279
5280 mddev->new_level = mddev->level;
5281 mddev->new_chunk_sectors = mddev->chunk_sectors;
5282 mddev->new_layout = mddev->layout;
5283 mddev->delta_disks = 0;
5284
5285 return 0;
5286}
5287
5288void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
5289{
5290 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
5291
5292 if (mddev->external_size)
5293 return;
5294
5295 mddev->array_sectors = array_sectors;
5296}
5297EXPORT_SYMBOL(md_set_array_sectors);
5298
5299static int update_size(mddev_t *mddev, sector_t num_sectors)
5300{
5301 mdk_rdev_t *rdev;
5302 int rv;
5303 int fit = (num_sectors == 0);
5304
5305 if (mddev->pers->resize == NULL)
5306 return -EINVAL;
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317 if (mddev->sync_thread)
5318 return -EBUSY;
5319 if (mddev->bitmap)
5320
5321
5322
5323 return -EBUSY;
5324 list_for_each_entry(rdev, &mddev->disks, same_set) {
5325 sector_t avail = rdev->sectors;
5326
5327 if (fit && (num_sectors == 0 || num_sectors > avail))
5328 num_sectors = avail;
5329 if (avail < num_sectors)
5330 return -ENOSPC;
5331 }
5332 rv = mddev->pers->resize(mddev, num_sectors);
5333 if (!rv)
5334 revalidate_disk(mddev->gendisk);
5335 return rv;
5336}
5337
5338static int update_raid_disks(mddev_t *mddev, int raid_disks)
5339{
5340 int rv;
5341
5342 if (mddev->pers->check_reshape == NULL)
5343 return -EINVAL;
5344 if (raid_disks <= 0 ||
5345 raid_disks >= mddev->max_disks)
5346 return -EINVAL;
5347 if (mddev->sync_thread || mddev->reshape_position != MaxSector)
5348 return -EBUSY;
5349 mddev->delta_disks = raid_disks - mddev->raid_disks;
5350
5351 rv = mddev->pers->check_reshape(mddev);
5352 return rv;
5353}
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5365{
5366 int rv = 0;
5367 int cnt = 0;
5368 int state = 0;
5369
5370
5371 if (mddev->bitmap && mddev->bitmap_info.offset)
5372 state |= (1 << MD_SB_BITMAP_PRESENT);
5373
5374 if (mddev->major_version != info->major_version ||
5375 mddev->minor_version != info->minor_version ||
5376
5377 mddev->ctime != info->ctime ||
5378 mddev->level != info->level ||
5379
5380 !mddev->persistent != info->not_persistent||
5381 mddev->chunk_sectors != info->chunk_size >> 9 ||
5382
5383 ((state^info->state) & 0xfffffe00)
5384 )
5385 return -EINVAL;
5386
5387 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
5388 cnt++;
5389 if (mddev->raid_disks != info->raid_disks)
5390 cnt++;
5391 if (mddev->layout != info->layout)
5392 cnt++;
5393 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
5394 cnt++;
5395 if (cnt == 0)
5396 return 0;
5397 if (cnt > 1)
5398 return -EINVAL;
5399
5400 if (mddev->layout != info->layout) {
5401
5402
5403
5404
5405 if (mddev->pers->check_reshape == NULL)
5406 return -EINVAL;
5407 else {
5408 mddev->new_layout = info->layout;
5409 rv = mddev->pers->check_reshape(mddev);
5410 if (rv)
5411 mddev->new_layout = mddev->layout;
5412 return rv;
5413 }
5414 }
5415 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
5416 rv = update_size(mddev, (sector_t)info->size * 2);
5417
5418 if (mddev->raid_disks != info->raid_disks)
5419 rv = update_raid_disks(mddev, info->raid_disks);
5420
5421 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
5422 if (mddev->pers->quiesce == NULL)
5423 return -EINVAL;
5424 if (mddev->recovery || mddev->sync_thread)
5425 return -EBUSY;
5426 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
5427
5428 if (mddev->bitmap)
5429 return -EEXIST;
5430 if (mddev->bitmap_info.default_offset == 0)
5431 return -EINVAL;
5432 mddev->bitmap_info.offset =
5433 mddev->bitmap_info.default_offset;
5434 mddev->pers->quiesce(mddev, 1);
5435 rv = bitmap_create(mddev);
5436 if (rv)
5437 bitmap_destroy(mddev);
5438 mddev->pers->quiesce(mddev, 0);
5439 } else {
5440
5441 if (!mddev->bitmap)
5442 return -ENOENT;
5443 if (mddev->bitmap->file)
5444 return -EINVAL;
5445 mddev->pers->quiesce(mddev, 1);
5446 bitmap_destroy(mddev);
5447 mddev->pers->quiesce(mddev, 0);
5448 mddev->bitmap_info.offset = 0;
5449 }
5450 }
5451 md_update_sb(mddev, 1);
5452 return rv;
5453}
5454
5455static int set_disk_faulty(mddev_t *mddev, dev_t dev)
5456{
5457 mdk_rdev_t *rdev;
5458
5459 if (mddev->pers == NULL)
5460 return -ENODEV;
5461
5462 rdev = find_rdev(mddev, dev);
5463 if (!rdev)
5464 return -ENODEV;
5465
5466 md_error(mddev, rdev);
5467 return 0;
5468}
5469
5470
5471
5472
5473
5474
5475
5476static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
5477{
5478 mddev_t *mddev = bdev->bd_disk->private_data;
5479
5480 geo->heads = 2;
5481 geo->sectors = 4;
5482 geo->cylinders = get_capacity(mddev->gendisk) / 8;
5483 return 0;
5484}
5485
5486static int md_ioctl(struct block_device *bdev, fmode_t mode,
5487 unsigned int cmd, unsigned long arg)
5488{
5489 int err = 0;
5490 void __user *argp = (void __user *)arg;
5491 mddev_t *mddev = NULL;
5492
5493 if (!capable(CAP_SYS_ADMIN))
5494 return -EACCES;
5495
5496
5497
5498
5499
5500 switch (cmd)
5501 {
5502 case RAID_VERSION:
5503 err = get_version(argp);
5504 goto done;
5505
5506 case PRINT_RAID_DEBUG:
5507 err = 0;
5508 md_print_devices();
5509 goto done;
5510
5511#ifndef MODULE
5512 case RAID_AUTORUN:
5513 err = 0;
5514 autostart_arrays(arg);
5515 goto done;
5516#endif
5517 default:;
5518 }
5519
5520
5521
5522
5523
5524 mddev = bdev->bd_disk->private_data;
5525
5526 if (!mddev) {
5527 BUG();
5528 goto abort;
5529 }
5530
5531 err = mddev_lock(mddev);
5532 if (err) {
5533 printk(KERN_INFO
5534 "md: ioctl lock interrupted, reason %d, cmd %d\n",
5535 err, cmd);
5536 goto abort;
5537 }
5538
5539 switch (cmd)
5540 {
5541 case SET_ARRAY_INFO:
5542 {
5543 mdu_array_info_t info;
5544 if (!arg)
5545 memset(&info, 0, sizeof(info));
5546 else if (copy_from_user(&info, argp, sizeof(info))) {
5547 err = -EFAULT;
5548 goto abort_unlock;
5549 }
5550 if (mddev->pers) {
5551 err = update_array_info(mddev, &info);
5552 if (err) {
5553 printk(KERN_WARNING "md: couldn't update"
5554 " array info. %d\n", err);
5555 goto abort_unlock;
5556 }
5557 goto done_unlock;
5558 }
5559 if (!list_empty(&mddev->disks)) {
5560 printk(KERN_WARNING
5561 "md: array %s already has disks!\n",
5562 mdname(mddev));
5563 err = -EBUSY;
5564 goto abort_unlock;
5565 }
5566 if (mddev->raid_disks) {
5567 printk(KERN_WARNING
5568 "md: array %s already initialised!\n",
5569 mdname(mddev));
5570 err = -EBUSY;
5571 goto abort_unlock;
5572 }
5573 err = set_array_info(mddev, &info);
5574 if (err) {
5575 printk(KERN_WARNING "md: couldn't set"
5576 " array info. %d\n", err);
5577 goto abort_unlock;
5578 }
5579 }
5580 goto done_unlock;
5581
5582 default:;
5583 }
5584
5585
5586
5587
5588
5589
5590 if ((!mddev->raid_disks && !mddev->external)
5591 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
5592 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
5593 && cmd != GET_BITMAP_FILE) {
5594 err = -ENODEV;
5595 goto abort_unlock;
5596 }
5597
5598
5599
5600
5601 switch (cmd)
5602 {
5603 case GET_ARRAY_INFO:
5604 err = get_array_info(mddev, argp);
5605 goto done_unlock;
5606
5607 case GET_BITMAP_FILE:
5608 err = get_bitmap_file(mddev, argp);
5609 goto done_unlock;
5610
5611 case GET_DISK_INFO:
5612 err = get_disk_info(mddev, argp);
5613 goto done_unlock;
5614
5615 case RESTART_ARRAY_RW:
5616 err = restart_array(mddev);
5617 goto done_unlock;
5618
5619 case STOP_ARRAY:
5620 err = do_md_stop(mddev, 0, 1);
5621 goto done_unlock;
5622
5623 case STOP_ARRAY_RO:
5624 err = do_md_stop(mddev, 1, 1);
5625 goto done_unlock;
5626
5627 }
5628
5629
5630
5631
5632
5633
5634
5635
5636 if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
5637 if (mddev->ro == 2) {
5638 mddev->ro = 0;
5639 sysfs_notify_dirent(mddev->sysfs_state);
5640 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5641 md_wakeup_thread(mddev->thread);
5642 } else {
5643 err = -EROFS;
5644 goto abort_unlock;
5645 }
5646 }
5647
5648 switch (cmd)
5649 {
5650 case ADD_NEW_DISK:
5651 {
5652 mdu_disk_info_t info;
5653 if (copy_from_user(&info, argp, sizeof(info)))
5654 err = -EFAULT;
5655 else
5656 err = add_new_disk(mddev, &info);
5657 goto done_unlock;
5658 }
5659
5660 case HOT_REMOVE_DISK:
5661 err = hot_remove_disk(mddev, new_decode_dev(arg));
5662 goto done_unlock;
5663
5664 case HOT_ADD_DISK:
5665 err = hot_add_disk(mddev, new_decode_dev(arg));
5666 goto done_unlock;
5667
5668 case SET_DISK_FAULTY:
5669 err = set_disk_faulty(mddev, new_decode_dev(arg));
5670 goto done_unlock;
5671
5672 case RUN_ARRAY:
5673 err = do_md_run(mddev);
5674 goto done_unlock;
5675
5676 case SET_BITMAP_FILE:
5677 err = set_bitmap_file(mddev, (int)arg);
5678 goto done_unlock;
5679
5680 default:
5681 err = -EINVAL;
5682 goto abort_unlock;
5683 }
5684
5685done_unlock:
5686abort_unlock:
5687 if (mddev->hold_active == UNTIL_IOCTL &&
5688 err != -EINVAL)
5689 mddev->hold_active = 0;
5690 mddev_unlock(mddev);
5691
5692 return err;
5693done:
5694 if (err)
5695 MD_BUG();
5696abort:
5697 return err;
5698}
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry: forwards to md_ioctl().  For HOT_REMOVE_DISK,
 * HOT_ADD_DISK, SET_DISK_FAULTY and SET_BITMAP_FILE @arg is passed
 * through unchanged; for every other command it is converted with
 * compat_ptr() first.
 */
static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
		    unsigned int cmd, unsigned long arg)
{
	int arg_is_plain_value = (cmd == HOT_REMOVE_DISK ||
				  cmd == HOT_ADD_DISK ||
				  cmd == SET_DISK_FAULTY ||
				  cmd == SET_BITMAP_FILE);

	if (!arg_is_plain_value)
		arg = (unsigned long)compat_ptr(arg);

	return md_ioctl(bdev, mode, cmd, arg);
}
#endif
5718
5719static int md_open(struct block_device *bdev, fmode_t mode)
5720{
5721
5722
5723
5724
5725 mddev_t *mddev = mddev_find(bdev->bd_dev);
5726 int err;
5727
5728 if (mddev->gendisk != bdev->bd_disk) {
5729
5730
5731
5732 mddev_put(mddev);
5733
5734 flush_scheduled_work();
5735
5736 return -ERESTARTSYS;
5737 }
5738 BUG_ON(mddev != bdev->bd_disk->private_data);
5739
5740 if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
5741 goto out;
5742
5743 err = 0;
5744 atomic_inc(&mddev->openers);
5745 mutex_unlock(&mddev->open_mutex);
5746
5747 check_disk_change(bdev);
5748 out:
5749 return err;
5750}
5751
5752static int md_release(struct gendisk *disk, fmode_t mode)
5753{
5754 mddev_t *mddev = disk->private_data;
5755
5756 BUG_ON(!mddev);
5757 atomic_dec(&mddev->openers);
5758 mddev_put(mddev);
5759
5760 return 0;
5761}
5762
5763static int md_media_changed(struct gendisk *disk)
5764{
5765 mddev_t *mddev = disk->private_data;
5766
5767 return mddev->changed;
5768}
5769
5770static int md_revalidate(struct gendisk *disk)
5771{
5772 mddev_t *mddev = disk->private_data;
5773
5774 mddev->changed = 0;
5775 return 0;
5776}
5777static const struct block_device_operations md_fops =
5778{
5779 .owner = THIS_MODULE,
5780 .open = md_open,
5781 .release = md_release,
5782 .ioctl = md_ioctl,
5783#ifdef CONFIG_COMPAT
5784 .compat_ioctl = md_compat_ioctl,
5785#endif
5786 .getgeo = md_getgeo,
5787 .media_changed = md_media_changed,
5788 .revalidate_disk= md_revalidate,
5789};
5790
5791static int md_thread(void * arg)
5792{
5793 mdk_thread_t *thread = arg;
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807 allow_signal(SIGKILL);
5808 while (!kthread_should_stop()) {
5809
5810
5811
5812
5813
5814
5815 if (signal_pending(current))
5816 flush_signals(current);
5817
5818 wait_event_interruptible_timeout
5819 (thread->wqueue,
5820 test_bit(THREAD_WAKEUP, &thread->flags)
5821 || kthread_should_stop(),
5822 thread->timeout);
5823
5824 clear_bit(THREAD_WAKEUP, &thread->flags);
5825
5826 thread->run(thread->mddev);
5827 }
5828
5829 return 0;
5830}
5831
5832void md_wakeup_thread(mdk_thread_t *thread)
5833{
5834 if (thread) {
5835 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
5836 set_bit(THREAD_WAKEUP, &thread->flags);
5837 wake_up(&thread->wqueue);
5838 }
5839}
5840
5841mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
5842 const char *name)
5843{
5844 mdk_thread_t *thread;
5845
5846 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
5847 if (!thread)
5848 return NULL;
5849
5850 init_waitqueue_head(&thread->wqueue);
5851
5852 thread->run = run;
5853 thread->mddev = mddev;
5854 thread->timeout = MAX_SCHEDULE_TIMEOUT;
5855 thread->tsk = kthread_run(md_thread, thread,
5856 "%s_%s",
5857 mdname(thread->mddev),
5858 name ?: mddev->pers->name);
5859 if (IS_ERR(thread->tsk)) {
5860 kfree(thread);
5861 return NULL;
5862 }
5863 return thread;
5864}
5865
5866void md_unregister_thread(mdk_thread_t *thread)
5867{
5868 if (!thread)
5869 return;
5870 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
5871
5872 kthread_stop(thread->tsk);
5873 kfree(thread);
5874}
5875
5876void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
5877{
5878 if (!mddev) {
5879 MD_BUG();
5880 return;
5881 }
5882
5883 if (!rdev || test_bit(Faulty, &rdev->flags))
5884 return;
5885
5886 if (mddev->external)
5887 set_bit(Blocked, &rdev->flags);
5888
5889
5890
5891
5892
5893
5894
5895 if (!mddev->pers)
5896 return;
5897 if (!mddev->pers->error_handler)
5898 return;
5899 mddev->pers->error_handler(mddev,rdev);
5900 if (mddev->degraded)
5901 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5902 set_bit(StateChanged, &rdev->flags);
5903 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5904 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5905 md_wakeup_thread(mddev->thread);
5906 md_new_event_inintr(mddev);
5907}
5908
5909
5910
5911static void status_unused(struct seq_file *seq)
5912{
5913 int i = 0;
5914 mdk_rdev_t *rdev;
5915
5916 seq_printf(seq, "unused devices: ");
5917
5918 list_for_each_entry(rdev, &pending_raid_disks, same_set) {
5919 char b[BDEVNAME_SIZE];
5920 i++;
5921 seq_printf(seq, "%s ",
5922 bdevname(rdev->bdev,b));
5923 }
5924 if (!i)
5925 seq_printf(seq, "<none>");
5926
5927 seq_printf(seq, "\n");
5928}
5929
5930
5931static void status_resync(struct seq_file *seq, mddev_t * mddev)
5932{
5933 sector_t max_sectors, resync, res;
5934 unsigned long dt, db;
5935 sector_t rt;
5936 int scale;
5937 unsigned int per_milli;
5938
5939 resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
5940
5941 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5942 max_sectors = mddev->resync_max_sectors;
5943 else
5944 max_sectors = mddev->dev_sectors;
5945
5946
5947
5948
5949 if (!max_sectors) {
5950 MD_BUG();
5951 return;
5952 }
5953
5954
5955
5956
5957
5958 scale = 10;
5959 if (sizeof(sector_t) > sizeof(unsigned long)) {
5960 while ( max_sectors/2 > (1ULL<<(scale+32)))
5961 scale++;
5962 }
5963 res = (resync>>scale)*1000;
5964 sector_div(res, (u32)((max_sectors>>scale)+1));
5965
5966 per_milli = res;
5967 {
5968 int i, x = per_milli/50, y = 20-x;
5969 seq_printf(seq, "[");
5970 for (i = 0; i < x; i++)
5971 seq_printf(seq, "=");
5972 seq_printf(seq, ">");
5973 for (i = 0; i < y; i++)
5974 seq_printf(seq, ".");
5975 seq_printf(seq, "] ");
5976 }
5977 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
5978 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
5979 "reshape" :
5980 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
5981 "check" :
5982 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
5983 "resync" : "recovery"))),
5984 per_milli/10, per_milli % 10,
5985 (unsigned long long) resync/2,
5986 (unsigned long long) max_sectors/2);
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002 dt = ((jiffies - mddev->resync_mark) / HZ);
6003 if (!dt) dt++;
6004 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
6005 - mddev->resync_mark_cnt;
6006
6007 rt = max_sectors - resync;
6008 sector_div(rt, db/32+1);
6009 rt *= dt;
6010 rt >>= 5;
6011
6012 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
6013 ((unsigned long)rt % 60)/6);
6014
6015 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
6016}
6017
6018static void *md_seq_start(struct seq_file *seq, loff_t *pos)
6019{
6020 struct list_head *tmp;
6021 loff_t l = *pos;
6022 mddev_t *mddev;
6023
6024 if (l >= 0x10000)
6025 return NULL;
6026 if (!l--)
6027
6028 return (void*)1;
6029
6030 spin_lock(&all_mddevs_lock);
6031 list_for_each(tmp,&all_mddevs)
6032 if (!l--) {
6033 mddev = list_entry(tmp, mddev_t, all_mddevs);
6034 mddev_get(mddev);
6035 spin_unlock(&all_mddevs_lock);
6036 return mddev;
6037 }
6038 spin_unlock(&all_mddevs_lock);
6039 if (!l--)
6040 return (void*)2;
6041 return NULL;
6042}
6043
6044static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
6045{
6046 struct list_head *tmp;
6047 mddev_t *next_mddev, *mddev = v;
6048
6049 ++*pos;
6050 if (v == (void*)2)
6051 return NULL;
6052
6053 spin_lock(&all_mddevs_lock);
6054 if (v == (void*)1)
6055 tmp = all_mddevs.next;
6056 else
6057 tmp = mddev->all_mddevs.next;
6058 if (tmp != &all_mddevs)
6059 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
6060 else {
6061 next_mddev = (void*)2;
6062 *pos = 0x10000;
6063 }
6064 spin_unlock(&all_mddevs_lock);
6065
6066 if (v != (void*)1)
6067 mddev_put(mddev);
6068 return next_mddev;
6069
6070}
6071
6072static void md_seq_stop(struct seq_file *seq, void *v)
6073{
6074 mddev_t *mddev = v;
6075
6076 if (mddev && v != (void*)1 && v != (void*)2)
6077 mddev_put(mddev);
6078}
6079
/*
 * Per-open state of the mdstat file; md_seq_open() allocates one and
 * stores it in seq_file->private.
 */
struct mdstat_info {
	int event;	/* md_event_count value last recorded for this open file */
};
6083
6084static int md_seq_show(struct seq_file *seq, void *v)
6085{
6086 mddev_t *mddev = v;
6087 sector_t sectors;
6088 mdk_rdev_t *rdev;
6089 struct mdstat_info *mi = seq->private;
6090 struct bitmap *bitmap;
6091
6092 if (v == (void*)1) {
6093 struct mdk_personality *pers;
6094 seq_printf(seq, "Personalities : ");
6095 spin_lock(&pers_lock);
6096 list_for_each_entry(pers, &pers_list, list)
6097 seq_printf(seq, "[%s] ", pers->name);
6098
6099 spin_unlock(&pers_lock);
6100 seq_printf(seq, "\n");
6101 mi->event = atomic_read(&md_event_count);
6102 return 0;
6103 }
6104 if (v == (void*)2) {
6105 status_unused(seq);
6106 return 0;
6107 }
6108
6109 if (mddev_lock(mddev) < 0)
6110 return -EINTR;
6111
6112 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
6113 seq_printf(seq, "%s : %sactive", mdname(mddev),
6114 mddev->pers ? "" : "in");
6115 if (mddev->pers) {
6116 if (mddev->ro==1)
6117 seq_printf(seq, " (read-only)");
6118 if (mddev->ro==2)
6119 seq_printf(seq, " (auto-read-only)");
6120 seq_printf(seq, " %s", mddev->pers->name);
6121 }
6122
6123 sectors = 0;
6124 list_for_each_entry(rdev, &mddev->disks, same_set) {
6125 char b[BDEVNAME_SIZE];
6126 seq_printf(seq, " %s[%d]",
6127 bdevname(rdev->bdev,b), rdev->desc_nr);
6128 if (test_bit(WriteMostly, &rdev->flags))
6129 seq_printf(seq, "(W)");
6130 if (test_bit(Faulty, &rdev->flags)) {
6131 seq_printf(seq, "(F)");
6132 continue;
6133 } else if (rdev->raid_disk < 0)
6134 seq_printf(seq, "(S)");
6135 sectors += rdev->sectors;
6136 }
6137
6138 if (!list_empty(&mddev->disks)) {
6139 if (mddev->pers)
6140 seq_printf(seq, "\n %llu blocks",
6141 (unsigned long long)
6142 mddev->array_sectors / 2);
6143 else
6144 seq_printf(seq, "\n %llu blocks",
6145 (unsigned long long)sectors / 2);
6146 }
6147 if (mddev->persistent) {
6148 if (mddev->major_version != 0 ||
6149 mddev->minor_version != 90) {
6150 seq_printf(seq," super %d.%d",
6151 mddev->major_version,
6152 mddev->minor_version);
6153 }
6154 } else if (mddev->external)
6155 seq_printf(seq, " super external:%s",
6156 mddev->metadata_type);
6157 else
6158 seq_printf(seq, " super non-persistent");
6159
6160 if (mddev->pers) {
6161 mddev->pers->status(seq, mddev);
6162 seq_printf(seq, "\n ");
6163 if (mddev->pers->sync_request) {
6164 if (mddev->curr_resync > 2) {
6165 status_resync(seq, mddev);
6166 seq_printf(seq, "\n ");
6167 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
6168 seq_printf(seq, "\tresync=DELAYED\n ");
6169 else if (mddev->recovery_cp < MaxSector)
6170 seq_printf(seq, "\tresync=PENDING\n ");
6171 }
6172 } else
6173 seq_printf(seq, "\n ");
6174
6175 if ((bitmap = mddev->bitmap)) {
6176 unsigned long chunk_kb;
6177 unsigned long flags;
6178 spin_lock_irqsave(&bitmap->lock, flags);
6179 chunk_kb = mddev->bitmap_info.chunksize >> 10;
6180 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
6181 "%lu%s chunk",
6182 bitmap->pages - bitmap->missing_pages,
6183 bitmap->pages,
6184 (bitmap->pages - bitmap->missing_pages)
6185 << (PAGE_SHIFT - 10),
6186 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
6187 chunk_kb ? "KB" : "B");
6188 if (bitmap->file) {
6189 seq_printf(seq, ", file: ");
6190 seq_path(seq, &bitmap->file->f_path, " \t\n");
6191 }
6192
6193 seq_printf(seq, "\n");
6194 spin_unlock_irqrestore(&bitmap->lock, flags);
6195 }
6196
6197 seq_printf(seq, "\n");
6198 }
6199 mddev_unlock(mddev);
6200
6201 return 0;
6202}
6203
6204static const struct seq_operations md_seq_ops = {
6205 .start = md_seq_start,
6206 .next = md_seq_next,
6207 .stop = md_seq_stop,
6208 .show = md_seq_show,
6209};
6210
6211static int md_seq_open(struct inode *inode, struct file *file)
6212{
6213 int error;
6214 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
6215 if (mi == NULL)
6216 return -ENOMEM;
6217
6218 error = seq_open(file, &md_seq_ops);
6219 if (error)
6220 kfree(mi);
6221 else {
6222 struct seq_file *p = file->private_data;
6223 p->private = mi;
6224 mi->event = atomic_read(&md_event_count);
6225 }
6226 return error;
6227}
6228
6229static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
6230{
6231 struct seq_file *m = filp->private_data;
6232 struct mdstat_info *mi = m->private;
6233 int mask;
6234
6235 poll_wait(filp, &md_event_waiters, wait);
6236
6237
6238 mask = POLLIN | POLLRDNORM;
6239
6240 if (mi->event != atomic_read(&md_event_count))
6241 mask |= POLLERR | POLLPRI;
6242 return mask;
6243}
6244
6245static const struct file_operations md_seq_fops = {
6246 .owner = THIS_MODULE,
6247 .open = md_seq_open,
6248 .read = seq_read,
6249 .llseek = seq_lseek,
6250 .release = seq_release_private,
6251 .poll = mdstat_poll,
6252};
6253
6254int register_md_personality(struct mdk_personality *p)
6255{
6256 spin_lock(&pers_lock);
6257 list_add_tail(&p->list, &pers_list);
6258 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
6259 spin_unlock(&pers_lock);
6260 return 0;
6261}
6262
6263int unregister_md_personality(struct mdk_personality *p)
6264{
6265 printk(KERN_INFO "md: %s personality unregistered\n", p->name);
6266 spin_lock(&pers_lock);
6267 list_del_init(&p->list);
6268 spin_unlock(&pers_lock);
6269 return 0;
6270}
6271
6272static int is_mddev_idle(mddev_t *mddev, int init)
6273{
6274 mdk_rdev_t * rdev;
6275 int idle;
6276 int curr_events;
6277
6278 idle = 1;
6279 rcu_read_lock();
6280 rdev_for_each_rcu(rdev, mddev) {
6281 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
6282 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
6283 (int)part_stat_read(&disk->part0, sectors[1]) -
6284 atomic_read(&disk->sync_io);
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307 if (init || curr_events - rdev->last_events > 64) {
6308 rdev->last_events = curr_events;
6309 idle = 0;
6310 }
6311 }
6312 rcu_read_unlock();
6313 return idle;
6314}
6315
6316void md_done_sync(mddev_t *mddev, int blocks, int ok)
6317{
6318
6319 atomic_sub(blocks, &mddev->recovery_active);
6320 wake_up(&mddev->recovery_wait);
6321 if (!ok) {
6322 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6323 md_wakeup_thread(mddev->thread);
6324
6325 }
6326}
6327
6328
6329
6330
6331
6332
6333
6334void md_write_start(mddev_t *mddev, struct bio *bi)
6335{
6336 int did_change = 0;
6337 if (bio_data_dir(bi) != WRITE)
6338 return;
6339
6340 BUG_ON(mddev->ro == 1);
6341 if (mddev->ro == 2) {
6342
6343 mddev->ro = 0;
6344 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6345 md_wakeup_thread(mddev->thread);
6346 md_wakeup_thread(mddev->sync_thread);
6347 did_change = 1;
6348 }
6349 atomic_inc(&mddev->writes_pending);
6350 if (mddev->safemode == 1)
6351 mddev->safemode = 0;
6352 if (mddev->in_sync) {
6353 spin_lock_irq(&mddev->write_lock);
6354 if (mddev->in_sync) {
6355 mddev->in_sync = 0;
6356 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6357 md_wakeup_thread(mddev->thread);
6358 did_change = 1;
6359 }
6360 spin_unlock_irq(&mddev->write_lock);
6361 }
6362 if (did_change)
6363 sysfs_notify_dirent(mddev->sysfs_state);
6364 wait_event(mddev->sb_wait,
6365 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
6366 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
6367}
6368
6369void md_write_end(mddev_t *mddev)
6370{
6371 if (atomic_dec_and_test(&mddev->writes_pending)) {
6372 if (mddev->safemode == 2)
6373 md_wakeup_thread(mddev->thread);
6374 else if (mddev->safemode_delay)
6375 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
6376 }
6377}
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388int md_allow_write(mddev_t *mddev)
6389{
6390 if (!mddev->pers)
6391 return 0;
6392 if (mddev->ro)
6393 return 0;
6394 if (!mddev->pers->sync_request)
6395 return 0;
6396
6397 spin_lock_irq(&mddev->write_lock);
6398 if (mddev->in_sync) {
6399 mddev->in_sync = 0;
6400 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6401 if (mddev->safemode_delay &&
6402 mddev->safemode == 0)
6403 mddev->safemode = 1;
6404 spin_unlock_irq(&mddev->write_lock);
6405 md_update_sb(mddev, 0);
6406 sysfs_notify_dirent(mddev->sysfs_state);
6407 } else
6408 spin_unlock_irq(&mddev->write_lock);
6409
6410 if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
6411 return -EAGAIN;
6412 else
6413 return 0;
6414}
6415EXPORT_SYMBOL_GPL(md_allow_write);
6416
6417#define SYNC_MARKS 10
6418#define SYNC_MARK_STEP (3*HZ)
6419void md_do_sync(mddev_t *mddev)
6420{
6421 mddev_t *mddev2;
6422 unsigned int currspeed = 0,
6423 window;
6424 sector_t max_sectors,j, io_sectors;
6425 unsigned long mark[SYNC_MARKS];
6426 sector_t mark_cnt[SYNC_MARKS];
6427 int last_mark,m;
6428 struct list_head *tmp;
6429 sector_t last_check;
6430 int skipped = 0;
6431 mdk_rdev_t *rdev;
6432 char *desc;
6433
6434
6435 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
6436 return;
6437 if (mddev->ro)
6438 return;
6439
6440 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6441 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
6442 desc = "data-check";
6443 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6444 desc = "requested-resync";
6445 else
6446 desc = "resync";
6447 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6448 desc = "reshape";
6449 else
6450 desc = "recovery";
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468 do {
6469 mddev->curr_resync = 2;
6470
6471 try_again:
6472 if (kthread_should_stop())
6473 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6474
6475 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6476 goto skip;
6477 for_each_mddev(mddev2, tmp) {
6478 if (mddev2 == mddev)
6479 continue;
6480 if (!mddev->parallel_resync
6481 && mddev2->curr_resync
6482 && match_mddev_units(mddev, mddev2)) {
6483 DEFINE_WAIT(wq);
6484 if (mddev < mddev2 && mddev->curr_resync == 2) {
6485
6486 mddev->curr_resync = 1;
6487 wake_up(&resync_wait);
6488 }
6489 if (mddev > mddev2 && mddev->curr_resync == 1)
6490
6491
6492
6493 continue;
6494
6495
6496
6497
6498 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
6499 if (!kthread_should_stop() &&
6500 mddev2->curr_resync >= mddev->curr_resync) {
6501 printk(KERN_INFO "md: delaying %s of %s"
6502 " until %s has finished (they"
6503 " share one or more physical units)\n",
6504 desc, mdname(mddev), mdname(mddev2));
6505 mddev_put(mddev2);
6506 if (signal_pending(current))
6507 flush_signals(current);
6508 schedule();
6509 finish_wait(&resync_wait, &wq);
6510 goto try_again;
6511 }
6512 finish_wait(&resync_wait, &wq);
6513 }
6514 }
6515 } while (mddev->curr_resync < 2);
6516
6517 j = 0;
6518 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6519
6520
6521
6522 max_sectors = mddev->resync_max_sectors;
6523 mddev->resync_mismatches = 0;
6524
6525 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6526 j = mddev->resync_min;
6527 else if (!mddev->bitmap)
6528 j = mddev->recovery_cp;
6529
6530 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6531 max_sectors = mddev->dev_sectors;
6532 else {
6533
6534 max_sectors = mddev->dev_sectors;
6535 j = MaxSector;
6536 rcu_read_lock();
6537 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
6538 if (rdev->raid_disk >= 0 &&
6539 !test_bit(Faulty, &rdev->flags) &&
6540 !test_bit(In_sync, &rdev->flags) &&
6541 rdev->recovery_offset < j)
6542 j = rdev->recovery_offset;
6543 rcu_read_unlock();
6544 }
6545
6546 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
6547 printk(KERN_INFO "md: minimum _guaranteed_ speed:"
6548 " %d KB/sec/disk.\n", speed_min(mddev));
6549 printk(KERN_INFO "md: using maximum available idle IO bandwidth "
6550 "(but not more than %d KB/sec) for %s.\n",
6551 speed_max(mddev), desc);
6552
6553 is_mddev_idle(mddev, 1);
6554
6555 io_sectors = 0;
6556 for (m = 0; m < SYNC_MARKS; m++) {
6557 mark[m] = jiffies;
6558 mark_cnt[m] = io_sectors;
6559 }
6560 last_mark = 0;
6561 mddev->resync_mark = mark[last_mark];
6562 mddev->resync_mark_cnt = mark_cnt[last_mark];
6563
6564
6565
6566
6567 window = 32*(PAGE_SIZE/512);
6568 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
6569 window/2,(unsigned long long) max_sectors/2);
6570
6571 atomic_set(&mddev->recovery_active, 0);
6572 last_check = 0;
6573
6574 if (j>2) {
6575 printk(KERN_INFO
6576 "md: resuming %s of %s from checkpoint.\n",
6577 desc, mdname(mddev));
6578 mddev->curr_resync = j;
6579 }
6580 mddev->curr_resync_completed = mddev->curr_resync;
6581
6582 while (j < max_sectors) {
6583 sector_t sectors;
6584
6585 skipped = 0;
6586
6587 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
6588 ((mddev->curr_resync > mddev->curr_resync_completed &&
6589 (mddev->curr_resync - mddev->curr_resync_completed)
6590 > (max_sectors >> 4)) ||
6591 (j - mddev->curr_resync_completed)*2
6592 >= mddev->resync_max - mddev->curr_resync_completed
6593 )) {
6594
6595 blk_unplug(mddev->queue);
6596 wait_event(mddev->recovery_wait,
6597 atomic_read(&mddev->recovery_active) == 0);
6598 mddev->curr_resync_completed =
6599 mddev->curr_resync;
6600 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6601 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6602 }
6603
6604 while (j >= mddev->resync_max && !kthread_should_stop()) {
6605
6606
6607
6608
6609 flush_signals(current);
6610 wait_event_interruptible(mddev->recovery_wait,
6611 mddev->resync_max > j
6612 || kthread_should_stop());
6613 }
6614
6615 if (kthread_should_stop())
6616 goto interrupted;
6617
6618 sectors = mddev->pers->sync_request(mddev, j, &skipped,
6619 currspeed < speed_min(mddev));
6620 if (sectors == 0) {
6621 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6622 goto out;
6623 }
6624
6625 if (!skipped) {
6626 io_sectors += sectors;
6627 atomic_add(sectors, &mddev->recovery_active);
6628 }
6629
6630 j += sectors;
6631 if (j>1) mddev->curr_resync = j;
6632 mddev->curr_mark_cnt = io_sectors;
6633 if (last_check == 0)
6634
6635
6636
6637 md_new_event(mddev);
6638
6639 if (last_check + window > io_sectors || j == max_sectors)
6640 continue;
6641
6642 last_check = io_sectors;
6643
6644 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6645 break;
6646
6647 repeat:
6648 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
6649
6650 int next = (last_mark+1) % SYNC_MARKS;
6651
6652 mddev->resync_mark = mark[next];
6653 mddev->resync_mark_cnt = mark_cnt[next];
6654 mark[next] = jiffies;
6655 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
6656 last_mark = next;
6657 }
6658
6659
6660 if (kthread_should_stop())
6661 goto interrupted;
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672 blk_unplug(mddev->queue);
6673 cond_resched();
6674
6675 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
6676 /((jiffies-mddev->resync_mark)/HZ +1) +1;
6677
6678 if (currspeed > speed_min(mddev)) {
6679 if ((currspeed > speed_max(mddev)) ||
6680 !is_mddev_idle(mddev, 0)) {
6681 msleep(500);
6682 goto repeat;
6683 }
6684 }
6685 }
6686 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
6687
6688
6689
6690 out:
6691 blk_unplug(mddev->queue);
6692
6693 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
6694
6695
6696 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
6697
6698 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
6699 mddev->curr_resync > 2) {
6700 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6701 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
6702 if (mddev->curr_resync >= mddev->recovery_cp) {
6703 printk(KERN_INFO
6704 "md: checkpointing %s of %s.\n",
6705 desc, mdname(mddev));
6706 mddev->recovery_cp = mddev->curr_resync;
6707 }
6708 } else
6709 mddev->recovery_cp = MaxSector;
6710 } else {
6711 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6712 mddev->curr_resync = MaxSector;
6713 rcu_read_lock();
6714 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
6715 if (rdev->raid_disk >= 0 &&
6716 !test_bit(Faulty, &rdev->flags) &&
6717 !test_bit(In_sync, &rdev->flags) &&
6718 rdev->recovery_offset < mddev->curr_resync)
6719 rdev->recovery_offset = mddev->curr_resync;
6720 rcu_read_unlock();
6721 }
6722 }
6723 set_bit(MD_CHANGE_DEVS, &mddev->flags);
6724
6725 skip:
6726 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
6727
6728 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6729 mddev->resync_min = 0;
6730 mddev->resync_max = MaxSector;
6731 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6732 mddev->resync_min = mddev->curr_resync_completed;
6733 mddev->curr_resync = 0;
6734 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6735 mddev->curr_resync_completed = 0;
6736 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6737 wake_up(&resync_wait);
6738 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
6739 md_wakeup_thread(mddev->thread);
6740 return;
6741
6742 interrupted:
6743
6744
6745
6746 printk(KERN_INFO
6747 "md: md_do_sync() got signal ... exiting\n");
6748 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6749 goto out;
6750
6751}
6752EXPORT_SYMBOL_GPL(md_do_sync);
6753
6754
6755static int remove_and_add_spares(mddev_t *mddev)
6756{
6757 mdk_rdev_t *rdev;
6758 int spares = 0;
6759
6760 mddev->curr_resync_completed = 0;
6761
6762 list_for_each_entry(rdev, &mddev->disks, same_set)
6763 if (rdev->raid_disk >= 0 &&
6764 !test_bit(Blocked, &rdev->flags) &&
6765 (test_bit(Faulty, &rdev->flags) ||
6766 ! test_bit(In_sync, &rdev->flags)) &&
6767 atomic_read(&rdev->nr_pending)==0) {
6768 if (mddev->pers->hot_remove_disk(
6769 mddev, rdev->raid_disk)==0) {
6770 char nm[20];
6771 sprintf(nm,"rd%d", rdev->raid_disk);
6772 sysfs_remove_link(&mddev->kobj, nm);
6773 rdev->raid_disk = -1;
6774 }
6775 }
6776
6777 if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) {
6778 list_for_each_entry(rdev, &mddev->disks, same_set) {
6779 if (rdev->raid_disk >= 0 &&
6780 !test_bit(In_sync, &rdev->flags) &&
6781 !test_bit(Blocked, &rdev->flags))
6782 spares++;
6783 if (rdev->raid_disk < 0
6784 && !test_bit(Faulty, &rdev->flags)) {
6785 rdev->recovery_offset = 0;
6786 if (mddev->pers->
6787 hot_add_disk(mddev, rdev) == 0) {
6788 char nm[20];
6789 sprintf(nm, "rd%d", rdev->raid_disk);
6790 if (sysfs_create_link(&mddev->kobj,
6791 &rdev->kobj, nm))
6792 printk(KERN_WARNING
6793 "md: cannot register "
6794 "%s for %s\n",
6795 nm, mdname(mddev));
6796 spares++;
6797 md_new_event(mddev);
6798 set_bit(MD_CHANGE_DEVS, &mddev->flags);
6799 } else
6800 break;
6801 }
6802 }
6803 }
6804 return spares;
6805}
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828void md_check_recovery(mddev_t *mddev)
6829{
6830 mdk_rdev_t *rdev;
6831
6832
6833 if (mddev->bitmap)
6834 bitmap_daemon_work(mddev);
6835
6836 if (mddev->ro)
6837 return;
6838
6839 if (signal_pending(current)) {
6840 if (mddev->pers->sync_request && !mddev->external) {
6841 printk(KERN_INFO "md: %s in immediate safe mode\n",
6842 mdname(mddev));
6843 mddev->safemode = 2;
6844 }
6845 flush_signals(current);
6846 }
6847
6848 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
6849 return;
6850 if ( ! (
6851 (mddev->flags && !mddev->external) ||
6852 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
6853 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
6854 (mddev->external == 0 && mddev->safemode == 1) ||
6855 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
6856 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
6857 ))
6858 return;
6859
6860 if (mddev_trylock(mddev)) {
6861 int spares = 0;
6862
6863 if (mddev->ro) {
6864
6865
6866
6867 remove_and_add_spares(mddev);
6868 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6869 goto unlock;
6870 }
6871
6872 if (!mddev->external) {
6873 int did_change = 0;
6874 spin_lock_irq(&mddev->write_lock);
6875 if (mddev->safemode &&
6876 !atomic_read(&mddev->writes_pending) &&
6877 !mddev->in_sync &&
6878 mddev->recovery_cp == MaxSector) {
6879 mddev->in_sync = 1;
6880 did_change = 1;
6881 if (mddev->persistent)
6882 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6883 }
6884 if (mddev->safemode == 1)
6885 mddev->safemode = 0;
6886 spin_unlock_irq(&mddev->write_lock);
6887 if (did_change)
6888 sysfs_notify_dirent(mddev->sysfs_state);
6889 }
6890
6891 if (mddev->flags)
6892 md_update_sb(mddev, 0);
6893
6894 list_for_each_entry(rdev, &mddev->disks, same_set)
6895 if (test_and_clear_bit(StateChanged, &rdev->flags))
6896 sysfs_notify_dirent(rdev->sysfs_state);
6897
6898
6899 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
6900 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
6901
6902 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6903 goto unlock;
6904 }
6905 if (mddev->sync_thread) {
6906
6907 md_unregister_thread(mddev->sync_thread);
6908 mddev->sync_thread = NULL;
6909 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
6910 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
6911
6912
6913 if (mddev->pers->spare_active(mddev))
6914 sysfs_notify(&mddev->kobj, NULL,
6915 "degraded");
6916 }
6917 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
6918 mddev->pers->finish_reshape)
6919 mddev->pers->finish_reshape(mddev);
6920 md_update_sb(mddev, 1);
6921
6922
6923
6924
6925 if (!mddev->degraded)
6926 list_for_each_entry(rdev, &mddev->disks, same_set)
6927 rdev->saved_raid_disk = -1;
6928
6929 mddev->recovery = 0;
6930
6931 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6932 sysfs_notify_dirent(mddev->sysfs_action);
6933 md_new_event(mddev);
6934 goto unlock;
6935 }
6936
6937
6938
6939 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6940 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6941
6942
6943
6944 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
6945 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
6946
6947 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
6948 goto unlock;
6949
6950
6951
6952
6953
6954
6955
6956 if (mddev->reshape_position != MaxSector) {
6957 if (mddev->pers->check_reshape == NULL ||
6958 mddev->pers->check_reshape(mddev) != 0)
6959
6960 goto unlock;
6961 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
6962 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6963 } else if ((spares = remove_and_add_spares(mddev))) {
6964 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6965 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
6966 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
6967 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6968 } else if (mddev->recovery_cp < MaxSector) {
6969 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6970 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6971 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
6972
6973 goto unlock;
6974
6975 if (mddev->pers->sync_request) {
6976 if (spares && mddev->bitmap && ! mddev->bitmap->file) {
6977
6978
6979
6980
6981 bitmap_write_all(mddev->bitmap);
6982 }
6983 mddev->sync_thread = md_register_thread(md_do_sync,
6984 mddev,
6985 "resync");
6986 if (!mddev->sync_thread) {
6987 printk(KERN_ERR "%s: could not start resync"
6988 " thread...\n",
6989 mdname(mddev));
6990
6991 mddev->recovery = 0;
6992 } else
6993 md_wakeup_thread(mddev->sync_thread);
6994 sysfs_notify_dirent(mddev->sysfs_action);
6995 md_new_event(mddev);
6996 }
6997 unlock:
6998 if (!mddev->sync_thread) {
6999 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7000 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
7001 &mddev->recovery))
7002 if (mddev->sysfs_action)
7003 sysfs_notify_dirent(mddev->sysfs_action);
7004 }
7005 mddev_unlock(mddev);
7006 }
7007}
7008
7009void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
7010{
7011 sysfs_notify_dirent(rdev->sysfs_state);
7012 wait_event_timeout(rdev->blocked_wait,
7013 !test_bit(Blocked, &rdev->flags),
7014 msecs_to_jiffies(5000));
7015 rdev_dec_pending(rdev, mddev);
7016}
7017EXPORT_SYMBOL(md_wait_for_blocked_rdev);
7018
7019static int md_notify_reboot(struct notifier_block *this,
7020 unsigned long code, void *x)
7021{
7022 struct list_head *tmp;
7023 mddev_t *mddev;
7024
7025 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
7026
7027 printk(KERN_INFO "md: stopping all md devices.\n");
7028
7029 for_each_mddev(mddev, tmp)
7030 if (mddev_trylock(mddev)) {
7031
7032
7033
7034
7035 do_md_stop(mddev, 1, 100);
7036 mddev_unlock(mddev);
7037 }
7038
7039
7040
7041
7042
7043
7044 mdelay(1000*1);
7045 }
7046 return NOTIFY_DONE;
7047}
7048
7049static struct notifier_block md_notifier = {
7050 .notifier_call = md_notify_reboot,
7051 .next = NULL,
7052 .priority = INT_MAX,
7053};
7054
7055static void md_geninit(void)
7056{
7057 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
7058
7059 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
7060}
7061
7062static int __init md_init(void)
7063{
7064 if (register_blkdev(MD_MAJOR, "md"))
7065 return -1;
7066 if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
7067 unregister_blkdev(MD_MAJOR, "md");
7068 return -1;
7069 }
7070 blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
7071 md_probe, NULL, NULL);
7072 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
7073 md_probe, NULL, NULL);
7074
7075 register_reboot_notifier(&md_notifier);
7076 raid_table_header = register_sysctl_table(raid_root_table);
7077
7078 md_geninit();
7079 return 0;
7080}
7081
7082
7083#ifndef MODULE
7084
7085
7086
7087
7088
7089
7090static LIST_HEAD(all_detected_devices);
7091struct detected_devices_node {
7092 struct list_head list;
7093 dev_t dev;
7094};
7095
7096void md_autodetect_dev(dev_t dev)
7097{
7098 struct detected_devices_node *node_detected_dev;
7099
7100 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
7101 if (node_detected_dev) {
7102 node_detected_dev->dev = dev;
7103 list_add_tail(&node_detected_dev->list, &all_detected_devices);
7104 } else {
7105 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
7106 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
7107 }
7108}
7109
7110
7111static void autostart_arrays(int part)
7112{
7113 mdk_rdev_t *rdev;
7114 struct detected_devices_node *node_detected_dev;
7115 dev_t dev;
7116 int i_scanned, i_passed;
7117
7118 i_scanned = 0;
7119 i_passed = 0;
7120
7121 printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
7122
7123 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
7124 i_scanned++;
7125 node_detected_dev = list_entry(all_detected_devices.next,
7126 struct detected_devices_node, list);
7127 list_del(&node_detected_dev->list);
7128 dev = node_detected_dev->dev;
7129 kfree(node_detected_dev);
7130 rdev = md_import_device(dev,0, 90);
7131 if (IS_ERR(rdev))
7132 continue;
7133
7134 if (test_bit(Faulty, &rdev->flags)) {
7135 MD_BUG();
7136 continue;
7137 }
7138 set_bit(AutoDetected, &rdev->flags);
7139 list_add(&rdev->same_set, &pending_raid_disks);
7140 i_passed++;
7141 }
7142
7143 printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
7144 i_scanned, i_passed);
7145
7146 autorun_devices(part);
7147}
7148
7149#endif
7150
7151static __exit void md_exit(void)
7152{
7153 mddev_t *mddev;
7154 struct list_head *tmp;
7155
7156 blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
7157 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
7158
7159 unregister_blkdev(MD_MAJOR,"md");
7160 unregister_blkdev(mdp_major, "mdp");
7161 unregister_reboot_notifier(&md_notifier);
7162 unregister_sysctl_table(raid_table_header);
7163 remove_proc_entry("mdstat", NULL);
7164 for_each_mddev(mddev, tmp) {
7165 export_array(mddev);
7166 mddev->hold_active = 0;
7167 }
7168}
7169
7170subsys_initcall(md_init);
7171module_exit(md_exit)
7172
7173static int get_ro(char *buffer, struct kernel_param *kp)
7174{
7175 return sprintf(buffer, "%d", start_readonly);
7176}
7177static int set_ro(const char *val, struct kernel_param *kp)
7178{
7179 char *e;
7180 int num = simple_strtoul(val, &e, 10);
7181 if (*val && (*e == '\0' || *e == '\n')) {
7182 start_readonly = num;
7183 return 0;
7184 }
7185 return -EINVAL;
7186}
7187
/*
 * Module parameters.
 *  - start_ro: owner read/write; goes through set_ro()/get_ro() above.
 *  - start_dirty_degraded: plain int, world readable, owner writable.
 *  - new_array: owner write-only; writes are handled by add_named_array()
 *    and no getter is provided.
 */
7188module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
7189module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
7190
7191module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
7192
/* Symbols made available to other modules. */
7193EXPORT_SYMBOL(register_md_personality);
7194EXPORT_SYMBOL(unregister_md_personality);
7195EXPORT_SYMBOL(md_error);
7196EXPORT_SYMBOL(md_done_sync);
7197EXPORT_SYMBOL(md_write_start);
7198EXPORT_SYMBOL(md_write_end);
7199EXPORT_SYMBOL(md_register_thread);
7200EXPORT_SYMBOL(md_unregister_thread);
7201EXPORT_SYMBOL(md_wakeup_thread);
7202EXPORT_SYMBOL(md_check_recovery);
/* Module metadata: licence, description and aliases (name and block major). */
7203MODULE_LICENSE("GPL");
7204MODULE_DESCRIPTION("MD RAID framework");
7205MODULE_ALIAS("md");
7206MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
7207