1
2
3
4
5
6#include <linux/kernel.h>
7#include <linux/wait.h>
8#include <linux/blkdev.h>
9#include <linux/slab.h>
10#include <linux/raid/md_p.h>
11#include <linux/crc32c.h>
12#include <linux/random.h>
13#include <linux/kthread.h>
14#include <linux/types.h>
15#include "md.h"
16#include "raid5.h"
17#include "md-bitmap.h"
18#include "raid5-log.h"
19
20
21
22
23
24#define BLOCK_SECTORS (8)
25#define BLOCK_SECTOR_SHIFT (3)
26
27
28
29
30
31
32
33#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2)
34#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
35
36
37#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
38
39#define R5C_FULL_STRIPE_FLUSH_BATCH(conf) (conf->max_nr_stripes / 4)
40
41#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
42
43
44
45
46
47#define R5L_POOL_SIZE 4
48
49static char *r5c_journal_mode_str[] = {"write-through",
50 "write-back"};
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82struct r5l_log {
83 struct md_rdev *rdev;
84
85 u32 uuid_checksum;
86
87 sector_t device_size;
88
89 sector_t max_free_space;
90
91
92 sector_t last_checkpoint;
93
94 u64 last_cp_seq;
95
96 sector_t log_start;
97 u64 seq;
98
99 sector_t next_checkpoint;
100
101 struct mutex io_mutex;
102 struct r5l_io_unit *current_io;
103
104 spinlock_t io_list_lock;
105 struct list_head running_ios;
106
107
108 struct list_head io_end_ios;
109
110
111 struct list_head flushing_ios;
112
113 struct list_head finished_ios;
114 struct bio flush_bio;
115
116 struct list_head no_mem_stripes;
117
118 struct kmem_cache *io_kc;
119 mempool_t io_pool;
120 struct bio_set bs;
121 mempool_t meta_pool;
122
123 struct md_thread *reclaim_thread;
124 unsigned long reclaim_target;
125
126
127
128
129
130
131 wait_queue_head_t iounit_wait;
132
133 struct list_head no_space_stripes;
134 spinlock_t no_space_stripes_lock;
135
136 bool need_cache_flush;
137
138
139 enum r5c_journal_mode r5c_journal_mode;
140
141
142 struct list_head stripe_in_journal_list;
143
144 spinlock_t stripe_in_journal_lock;
145 atomic_t stripe_in_journal_count;
146
147
148 struct work_struct deferred_io_work;
149
150 struct work_struct disable_writeback_work;
151
152
153 spinlock_t tree_lock;
154 struct radix_tree_root big_stripe_tree;
155};
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188#define R5C_RADIX_COUNT_SHIFT 2
189
190
191
192
193
194
195static inline sector_t r5c_tree_index(struct r5conf *conf,
196 sector_t sect)
197{
198 sector_div(sect, conf->chunk_sectors);
199 return sect;
200}
201
202
203
204
205
206
207
208
209struct r5l_io_unit {
210 struct r5l_log *log;
211
212 struct page *meta_page;
213 int meta_offset;
214
215 struct bio *current_bio;
216
217 atomic_t pending_stripe;
218 u64 seq;
219 sector_t log_start;
220 sector_t log_end;
221 struct list_head log_sibling;
222 struct list_head stripe_list;
223
224 int state;
225 bool need_split_bio;
226 struct bio *split_bio;
227
228 unsigned int has_flush:1;
229 unsigned int has_fua:1;
230 unsigned int has_null_flush:1;
231 unsigned int has_flush_payload:1;
232
233
234
235
236 unsigned int io_deferred:1;
237
238 struct bio_list flush_barriers;
239};
240
241
242enum r5l_io_unit_state {
243 IO_UNIT_RUNNING = 0,
244 IO_UNIT_IO_START = 1,
245
246 IO_UNIT_IO_END = 2,
247 IO_UNIT_STRIPE_END = 3,
248};
249
250bool r5c_is_writeback(struct r5l_log *log)
251{
252 return (log != NULL &&
253 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
254}
255
256static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
257{
258 start += inc;
259 if (start >= log->device_size)
260 start = start - log->device_size;
261 return start;
262}
263
264static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
265 sector_t end)
266{
267 if (end >= start)
268 return end - start;
269 else
270 return end + log->device_size - start;
271}
272
273static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
274{
275 sector_t used_size;
276
277 used_size = r5l_ring_distance(log, log->last_checkpoint,
278 log->log_start);
279
280 return log->device_size > used_size + size;
281}
282
283static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
284 enum r5l_io_unit_state state)
285{
286 if (WARN_ON(io->state >= state))
287 return;
288 io->state = state;
289}
290
291static void
292r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
293{
294 struct bio *wbi, *wbi2;
295
296 wbi = dev->written;
297 dev->written = NULL;
298 while (wbi && wbi->bi_iter.bi_sector <
299 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
300 wbi2 = r5_next_bio(conf, wbi, dev->sector);
301 md_write_end(conf->mddev);
302 bio_endio(wbi);
303 wbi = wbi2;
304 }
305}
306
307void r5c_handle_cached_data_endio(struct r5conf *conf,
308 struct stripe_head *sh, int disks)
309{
310 int i;
311
312 for (i = sh->disks; i--; ) {
313 if (sh->dev[i].written) {
314 set_bit(R5_UPTODATE, &sh->dev[i].flags);
315 r5c_return_dev_pending_writes(conf, &sh->dev[i]);
316 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
317 RAID5_STRIPE_SECTORS(conf),
318 !test_bit(STRIPE_DEGRADED, &sh->state),
319 0);
320 }
321 }
322}
323
324void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
325
326
327void r5c_check_stripe_cache_usage(struct r5conf *conf)
328{
329 int total_cached;
330
331 if (!r5c_is_writeback(conf->log))
332 return;
333
334 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
335 atomic_read(&conf->r5c_cached_full_stripes);
336
337
338
339
340
341
342
343
344
345 if (total_cached > conf->min_nr_stripes * 1 / 2 ||
346 atomic_read(&conf->empty_inactive_list_nr) > 0)
347 r5l_wake_reclaim(conf->log, 0);
348}
349
350
351
352
353
354void r5c_check_cached_full_stripe(struct r5conf *conf)
355{
356 if (!r5c_is_writeback(conf->log))
357 return;
358
359
360
361
362
363 if (atomic_read(&conf->r5c_cached_full_stripes) >=
364 min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
365 conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf)))
366 r5l_wake_reclaim(conf->log, 0);
367}
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
398{
399 struct r5l_log *log = conf->log;
400
401 if (!r5c_is_writeback(log))
402 return 0;
403
404 return BLOCK_SECTORS *
405 ((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) +
406 (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1));
407}
408
409
410
411
412
413
414
415
416static inline void r5c_update_log_state(struct r5l_log *log)
417{
418 struct r5conf *conf = log->rdev->mddev->private;
419 sector_t free_space;
420 sector_t reclaim_space;
421 bool wake_reclaim = false;
422
423 if (!r5c_is_writeback(log))
424 return;
425
426 free_space = r5l_ring_distance(log, log->log_start,
427 log->last_checkpoint);
428 reclaim_space = r5c_log_required_to_flush_cache(conf);
429 if (free_space < 2 * reclaim_space)
430 set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
431 else {
432 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
433 wake_reclaim = true;
434 clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
435 }
436 if (free_space < 3 * reclaim_space)
437 set_bit(R5C_LOG_TIGHT, &conf->cache_state);
438 else
439 clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
440
441 if (wake_reclaim)
442 r5l_wake_reclaim(log, 0);
443}
444
445
446
447
448
449void r5c_make_stripe_write_out(struct stripe_head *sh)
450{
451 struct r5conf *conf = sh->raid_conf;
452 struct r5l_log *log = conf->log;
453
454 BUG_ON(!r5c_is_writeback(log));
455
456 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
457 clear_bit(STRIPE_R5C_CACHING, &sh->state);
458
459 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
460 atomic_inc(&conf->preread_active_stripes);
461}
462
463static void r5c_handle_data_cached(struct stripe_head *sh)
464{
465 int i;
466
467 for (i = sh->disks; i--; )
468 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
469 set_bit(R5_InJournal, &sh->dev[i].flags);
470 clear_bit(R5_LOCKED, &sh->dev[i].flags);
471 }
472 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
473}
474
475
476
477
478
479static void r5c_handle_parity_cached(struct stripe_head *sh)
480{
481 int i;
482
483 for (i = sh->disks; i--; )
484 if (test_bit(R5_InJournal, &sh->dev[i].flags))
485 set_bit(R5_Wantwrite, &sh->dev[i].flags);
486}
487
488
489
490
491
492static void r5c_finish_cache_stripe(struct stripe_head *sh)
493{
494 struct r5l_log *log = sh->raid_conf->log;
495
496 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
497 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
498
499
500
501
502
503
504 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
505 } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
506 r5c_handle_data_cached(sh);
507 } else {
508 r5c_handle_parity_cached(sh);
509 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
510 }
511}
512
513static void r5l_io_run_stripes(struct r5l_io_unit *io)
514{
515 struct stripe_head *sh, *next;
516
517 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
518 list_del_init(&sh->log_list);
519
520 r5c_finish_cache_stripe(sh);
521
522 set_bit(STRIPE_HANDLE, &sh->state);
523 raid5_release_stripe(sh);
524 }
525}
526
527static void r5l_log_run_stripes(struct r5l_log *log)
528{
529 struct r5l_io_unit *io, *next;
530
531 lockdep_assert_held(&log->io_list_lock);
532
533 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
534
535 if (io->state < IO_UNIT_IO_END)
536 break;
537
538 list_move_tail(&io->log_sibling, &log->finished_ios);
539 r5l_io_run_stripes(io);
540 }
541}
542
543static void r5l_move_to_end_ios(struct r5l_log *log)
544{
545 struct r5l_io_unit *io, *next;
546
547 lockdep_assert_held(&log->io_list_lock);
548
549 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
550
551 if (io->state < IO_UNIT_IO_END)
552 break;
553 list_move_tail(&io->log_sibling, &log->io_end_ios);
554 }
555}
556
557static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
558static void r5l_log_endio(struct bio *bio)
559{
560 struct r5l_io_unit *io = bio->bi_private;
561 struct r5l_io_unit *io_deferred;
562 struct r5l_log *log = io->log;
563 unsigned long flags;
564 bool has_null_flush;
565 bool has_flush_payload;
566
567 if (bio->bi_status)
568 md_error(log->rdev->mddev, log->rdev);
569
570 bio_put(bio);
571 mempool_free(io->meta_page, &log->meta_pool);
572
573 spin_lock_irqsave(&log->io_list_lock, flags);
574 __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
575
576
577
578
579
580
581
582 has_null_flush = io->has_null_flush;
583 has_flush_payload = io->has_flush_payload;
584
585 if (log->need_cache_flush && !list_empty(&io->stripe_list))
586 r5l_move_to_end_ios(log);
587 else
588 r5l_log_run_stripes(log);
589 if (!list_empty(&log->running_ios)) {
590
591
592
593
594 io_deferred = list_first_entry(&log->running_ios,
595 struct r5l_io_unit, log_sibling);
596 if (io_deferred->io_deferred)
597 schedule_work(&log->deferred_io_work);
598 }
599
600 spin_unlock_irqrestore(&log->io_list_lock, flags);
601
602 if (log->need_cache_flush)
603 md_wakeup_thread(log->rdev->mddev->thread);
604
605
606 if (has_null_flush) {
607 struct bio *bi;
608
609 WARN_ON(bio_list_empty(&io->flush_barriers));
610 while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
611 bio_endio(bi);
612 if (atomic_dec_and_test(&io->pending_stripe)) {
613 __r5l_stripe_write_finished(io);
614 return;
615 }
616 }
617 }
618
619 if (has_flush_payload)
620 if (atomic_dec_and_test(&io->pending_stripe))
621 __r5l_stripe_write_finished(io);
622}
623
624static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
625{
626 unsigned long flags;
627
628 spin_lock_irqsave(&log->io_list_lock, flags);
629 __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
630 spin_unlock_irqrestore(&log->io_list_lock, flags);
631
632
633
634
635
636
637
638
639
640
641
642
643 if (io->split_bio) {
644 if (io->has_flush)
645 io->split_bio->bi_opf |= REQ_PREFLUSH;
646 if (io->has_fua)
647 io->split_bio->bi_opf |= REQ_FUA;
648 submit_bio(io->split_bio);
649 }
650
651 if (io->has_flush)
652 io->current_bio->bi_opf |= REQ_PREFLUSH;
653 if (io->has_fua)
654 io->current_bio->bi_opf |= REQ_FUA;
655 submit_bio(io->current_bio);
656}
657
658
659static void r5l_submit_io_async(struct work_struct *work)
660{
661 struct r5l_log *log = container_of(work, struct r5l_log,
662 deferred_io_work);
663 struct r5l_io_unit *io = NULL;
664 unsigned long flags;
665
666 spin_lock_irqsave(&log->io_list_lock, flags);
667 if (!list_empty(&log->running_ios)) {
668 io = list_first_entry(&log->running_ios, struct r5l_io_unit,
669 log_sibling);
670 if (!io->io_deferred)
671 io = NULL;
672 else
673 io->io_deferred = 0;
674 }
675 spin_unlock_irqrestore(&log->io_list_lock, flags);
676 if (io)
677 r5l_do_submit_io(log, io);
678}
679
680static void r5c_disable_writeback_async(struct work_struct *work)
681{
682 struct r5l_log *log = container_of(work, struct r5l_log,
683 disable_writeback_work);
684 struct mddev *mddev = log->rdev->mddev;
685 struct r5conf *conf = mddev->private;
686 int locked = 0;
687
688 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
689 return;
690 pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
691 mdname(mddev));
692
693
694 wait_event(mddev->sb_wait,
695 conf->log == NULL ||
696 (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) &&
697 (locked = mddev_trylock(mddev))));
698 if (locked) {
699 mddev_suspend(mddev);
700 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
701 mddev_resume(mddev);
702 mddev_unlock(mddev);
703 }
704}
705
706static void r5l_submit_current_io(struct r5l_log *log)
707{
708 struct r5l_io_unit *io = log->current_io;
709 struct r5l_meta_block *block;
710 unsigned long flags;
711 u32 crc;
712 bool do_submit = true;
713
714 if (!io)
715 return;
716
717 block = page_address(io->meta_page);
718 block->meta_size = cpu_to_le32(io->meta_offset);
719 crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
720 block->checksum = cpu_to_le32(crc);
721
722 log->current_io = NULL;
723 spin_lock_irqsave(&log->io_list_lock, flags);
724 if (io->has_flush || io->has_fua) {
725 if (io != list_first_entry(&log->running_ios,
726 struct r5l_io_unit, log_sibling)) {
727 io->io_deferred = 1;
728 do_submit = false;
729 }
730 }
731 spin_unlock_irqrestore(&log->io_list_lock, flags);
732 if (do_submit)
733 r5l_do_submit_io(log, io);
734}
735
736static struct bio *r5l_bio_alloc(struct r5l_log *log)
737{
738 struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_VECS, &log->bs);
739
740 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
741 bio_set_dev(bio, log->rdev->bdev);
742 bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
743
744 return bio;
745}
746
747static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
748{
749 log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
750
751 r5c_update_log_state(log);
752
753
754
755
756
757
758
759 if (log->log_start == 0)
760 io->need_split_bio = true;
761
762 io->log_end = log->log_start;
763}
764
765static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
766{
767 struct r5l_io_unit *io;
768 struct r5l_meta_block *block;
769
770 io = mempool_alloc(&log->io_pool, GFP_ATOMIC);
771 if (!io)
772 return NULL;
773 memset(io, 0, sizeof(*io));
774
775 io->log = log;
776 INIT_LIST_HEAD(&io->log_sibling);
777 INIT_LIST_HEAD(&io->stripe_list);
778 bio_list_init(&io->flush_barriers);
779 io->state = IO_UNIT_RUNNING;
780
781 io->meta_page = mempool_alloc(&log->meta_pool, GFP_NOIO);
782 block = page_address(io->meta_page);
783 clear_page(block);
784 block->magic = cpu_to_le32(R5LOG_MAGIC);
785 block->version = R5LOG_VERSION;
786 block->seq = cpu_to_le64(log->seq);
787 block->position = cpu_to_le64(log->log_start);
788
789 io->log_start = log->log_start;
790 io->meta_offset = sizeof(struct r5l_meta_block);
791 io->seq = log->seq++;
792
793 io->current_bio = r5l_bio_alloc(log);
794 io->current_bio->bi_end_io = r5l_log_endio;
795 io->current_bio->bi_private = io;
796 bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
797
798 r5_reserve_log_entry(log, io);
799
800 spin_lock_irq(&log->io_list_lock);
801 list_add_tail(&io->log_sibling, &log->running_ios);
802 spin_unlock_irq(&log->io_list_lock);
803
804 return io;
805}
806
807static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
808{
809 if (log->current_io &&
810 log->current_io->meta_offset + payload_size > PAGE_SIZE)
811 r5l_submit_current_io(log);
812
813 if (!log->current_io) {
814 log->current_io = r5l_new_meta(log);
815 if (!log->current_io)
816 return -ENOMEM;
817 }
818
819 return 0;
820}
821
822static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
823 sector_t location,
824 u32 checksum1, u32 checksum2,
825 bool checksum2_valid)
826{
827 struct r5l_io_unit *io = log->current_io;
828 struct r5l_payload_data_parity *payload;
829
830 payload = page_address(io->meta_page) + io->meta_offset;
831 payload->header.type = cpu_to_le16(type);
832 payload->header.flags = cpu_to_le16(0);
833 payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
834 (PAGE_SHIFT - 9));
835 payload->location = cpu_to_le64(location);
836 payload->checksum[0] = cpu_to_le32(checksum1);
837 if (checksum2_valid)
838 payload->checksum[1] = cpu_to_le32(checksum2);
839
840 io->meta_offset += sizeof(struct r5l_payload_data_parity) +
841 sizeof(__le32) * (1 + !!checksum2_valid);
842}
843
844static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
845{
846 struct r5l_io_unit *io = log->current_io;
847
848 if (io->need_split_bio) {
849 BUG_ON(io->split_bio);
850 io->split_bio = io->current_bio;
851 io->current_bio = r5l_bio_alloc(log);
852 bio_chain(io->current_bio, io->split_bio);
853 io->need_split_bio = false;
854 }
855
856 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
857 BUG();
858
859 r5_reserve_log_entry(log, io);
860}
861
862static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect)
863{
864 struct mddev *mddev = log->rdev->mddev;
865 struct r5conf *conf = mddev->private;
866 struct r5l_io_unit *io;
867 struct r5l_payload_flush *payload;
868 int meta_size;
869
870
871
872
873
874
875 if (conf->quiesce)
876 return;
877
878 mutex_lock(&log->io_mutex);
879 meta_size = sizeof(struct r5l_payload_flush) + sizeof(__le64);
880
881 if (r5l_get_meta(log, meta_size)) {
882 mutex_unlock(&log->io_mutex);
883 return;
884 }
885
886
887 io = log->current_io;
888 payload = page_address(io->meta_page) + io->meta_offset;
889 payload->header.type = cpu_to_le16(R5LOG_PAYLOAD_FLUSH);
890 payload->header.flags = cpu_to_le16(0);
891 payload->size = cpu_to_le32(sizeof(__le64));
892 payload->flush_stripes[0] = cpu_to_le64(sect);
893 io->meta_offset += meta_size;
894
895 if (!io->has_flush_payload) {
896 io->has_flush_payload = 1;
897 atomic_inc(&io->pending_stripe);
898 }
899 mutex_unlock(&log->io_mutex);
900}
901
902static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
903 int data_pages, int parity_pages)
904{
905 int i;
906 int meta_size;
907 int ret;
908 struct r5l_io_unit *io;
909
910 meta_size =
911 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
912 * data_pages) +
913 sizeof(struct r5l_payload_data_parity) +
914 sizeof(__le32) * parity_pages;
915
916 ret = r5l_get_meta(log, meta_size);
917 if (ret)
918 return ret;
919
920 io = log->current_io;
921
922 if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
923 io->has_flush = 1;
924
925 for (i = 0; i < sh->disks; i++) {
926 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
927 test_bit(R5_InJournal, &sh->dev[i].flags))
928 continue;
929 if (i == sh->pd_idx || i == sh->qd_idx)
930 continue;
931 if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
932 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
933 io->has_fua = 1;
934
935
936
937
938 io->has_flush = 1;
939 }
940 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
941 raid5_compute_blocknr(sh, i, 0),
942 sh->dev[i].log_checksum, 0, false);
943 r5l_append_payload_page(log, sh->dev[i].page);
944 }
945
946 if (parity_pages == 2) {
947 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
948 sh->sector, sh->dev[sh->pd_idx].log_checksum,
949 sh->dev[sh->qd_idx].log_checksum, true);
950 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
951 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
952 } else if (parity_pages == 1) {
953 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
954 sh->sector, sh->dev[sh->pd_idx].log_checksum,
955 0, false);
956 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
957 } else
958 BUG_ON(parity_pages != 0);
959
960 list_add_tail(&sh->log_list, &io->stripe_list);
961 atomic_inc(&io->pending_stripe);
962 sh->log_io = io;
963
964 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
965 return 0;
966
967 if (sh->log_start == MaxSector) {
968 BUG_ON(!list_empty(&sh->r5c));
969 sh->log_start = io->log_start;
970 spin_lock_irq(&log->stripe_in_journal_lock);
971 list_add_tail(&sh->r5c,
972 &log->stripe_in_journal_list);
973 spin_unlock_irq(&log->stripe_in_journal_lock);
974 atomic_inc(&log->stripe_in_journal_count);
975 }
976 return 0;
977}
978
979
980static inline void r5l_add_no_space_stripe(struct r5l_log *log,
981 struct stripe_head *sh)
982{
983 spin_lock(&log->no_space_stripes_lock);
984 list_add_tail(&sh->log_list, &log->no_space_stripes);
985 spin_unlock(&log->no_space_stripes_lock);
986}
987
988
989
990
991
992int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
993{
994 struct r5conf *conf = sh->raid_conf;
995 int write_disks = 0;
996 int data_pages, parity_pages;
997 int reserve;
998 int i;
999 int ret = 0;
1000 bool wake_reclaim = false;
1001
1002 if (!log)
1003 return -EAGAIN;
1004
1005 if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
1006 test_bit(STRIPE_SYNCING, &sh->state)) {
1007
1008 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
1009 return -EAGAIN;
1010 }
1011
1012 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
1013
1014 for (i = 0; i < sh->disks; i++) {
1015 void *addr;
1016
1017 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
1018 test_bit(R5_InJournal, &sh->dev[i].flags))
1019 continue;
1020
1021 write_disks++;
1022
1023 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
1024 continue;
1025 addr = kmap_atomic(sh->dev[i].page);
1026 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
1027 addr, PAGE_SIZE);
1028 kunmap_atomic(addr);
1029 }
1030 parity_pages = 1 + !!(sh->qd_idx >= 0);
1031 data_pages = write_disks - parity_pages;
1032
1033 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
1034
1035
1036
1037
1038 clear_bit(STRIPE_DELAYED, &sh->state);
1039 atomic_inc(&sh->count);
1040
1041 mutex_lock(&log->io_mutex);
1042
1043 reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
1044
1045 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
1046 if (!r5l_has_free_space(log, reserve)) {
1047 r5l_add_no_space_stripe(log, sh);
1048 wake_reclaim = true;
1049 } else {
1050 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
1051 if (ret) {
1052 spin_lock_irq(&log->io_list_lock);
1053 list_add_tail(&sh->log_list,
1054 &log->no_mem_stripes);
1055 spin_unlock_irq(&log->io_list_lock);
1056 }
1057 }
1058 } else {
1059
1060
1061
1062
1063 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
1064 sh->log_start == MaxSector) {
1065 r5l_add_no_space_stripe(log, sh);
1066 wake_reclaim = true;
1067 reserve = 0;
1068 } else if (!r5l_has_free_space(log, reserve)) {
1069 if (sh->log_start == log->last_checkpoint)
1070 BUG();
1071 else
1072 r5l_add_no_space_stripe(log, sh);
1073 } else {
1074 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
1075 if (ret) {
1076 spin_lock_irq(&log->io_list_lock);
1077 list_add_tail(&sh->log_list,
1078 &log->no_mem_stripes);
1079 spin_unlock_irq(&log->io_list_lock);
1080 }
1081 }
1082 }
1083
1084 mutex_unlock(&log->io_mutex);
1085 if (wake_reclaim)
1086 r5l_wake_reclaim(log, reserve);
1087 return 0;
1088}
1089
1090void r5l_write_stripe_run(struct r5l_log *log)
1091{
1092 if (!log)
1093 return;
1094 mutex_lock(&log->io_mutex);
1095 r5l_submit_current_io(log);
1096 mutex_unlock(&log->io_mutex);
1097}
1098
1099int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
1100{
1101 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
1102
1103
1104
1105
1106
1107
1108
1109 if (bio->bi_iter.bi_size == 0) {
1110 bio_endio(bio);
1111 return 0;
1112 }
1113 bio->bi_opf &= ~REQ_PREFLUSH;
1114 } else {
1115
1116 if (bio->bi_iter.bi_size == 0) {
1117 mutex_lock(&log->io_mutex);
1118 r5l_get_meta(log, 0);
1119 bio_list_add(&log->current_io->flush_barriers, bio);
1120 log->current_io->has_flush = 1;
1121 log->current_io->has_null_flush = 1;
1122 atomic_inc(&log->current_io->pending_stripe);
1123 r5l_submit_current_io(log);
1124 mutex_unlock(&log->io_mutex);
1125 return 0;
1126 }
1127 }
1128 return -EAGAIN;
1129}
1130
1131
1132static void r5l_run_no_space_stripes(struct r5l_log *log)
1133{
1134 struct stripe_head *sh;
1135
1136 spin_lock(&log->no_space_stripes_lock);
1137 while (!list_empty(&log->no_space_stripes)) {
1138 sh = list_first_entry(&log->no_space_stripes,
1139 struct stripe_head, log_list);
1140 list_del_init(&sh->log_list);
1141 set_bit(STRIPE_HANDLE, &sh->state);
1142 raid5_release_stripe(sh);
1143 }
1144 spin_unlock(&log->no_space_stripes_lock);
1145}
1146
1147
1148
1149
1150
1151
1152static sector_t r5c_calculate_new_cp(struct r5conf *conf)
1153{
1154 struct stripe_head *sh;
1155 struct r5l_log *log = conf->log;
1156 sector_t new_cp;
1157 unsigned long flags;
1158
1159 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1160 return log->next_checkpoint;
1161
1162 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1163 if (list_empty(&conf->log->stripe_in_journal_list)) {
1164
1165 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1166 return log->next_checkpoint;
1167 }
1168 sh = list_first_entry(&conf->log->stripe_in_journal_list,
1169 struct stripe_head, r5c);
1170 new_cp = sh->log_start;
1171 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1172 return new_cp;
1173}
1174
1175static sector_t r5l_reclaimable_space(struct r5l_log *log)
1176{
1177 struct r5conf *conf = log->rdev->mddev->private;
1178
1179 return r5l_ring_distance(log, log->last_checkpoint,
1180 r5c_calculate_new_cp(conf));
1181}
1182
1183static void r5l_run_no_mem_stripe(struct r5l_log *log)
1184{
1185 struct stripe_head *sh;
1186
1187 lockdep_assert_held(&log->io_list_lock);
1188
1189 if (!list_empty(&log->no_mem_stripes)) {
1190 sh = list_first_entry(&log->no_mem_stripes,
1191 struct stripe_head, log_list);
1192 list_del_init(&sh->log_list);
1193 set_bit(STRIPE_HANDLE, &sh->state);
1194 raid5_release_stripe(sh);
1195 }
1196}
1197
1198static bool r5l_complete_finished_ios(struct r5l_log *log)
1199{
1200 struct r5l_io_unit *io, *next;
1201 bool found = false;
1202
1203 lockdep_assert_held(&log->io_list_lock);
1204
1205 list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
1206
1207 if (io->state < IO_UNIT_STRIPE_END)
1208 break;
1209
1210 log->next_checkpoint = io->log_start;
1211
1212 list_del(&io->log_sibling);
1213 mempool_free(io, &log->io_pool);
1214 r5l_run_no_mem_stripe(log);
1215
1216 found = true;
1217 }
1218
1219 return found;
1220}
1221
1222static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
1223{
1224 struct r5l_log *log = io->log;
1225 struct r5conf *conf = log->rdev->mddev->private;
1226 unsigned long flags;
1227
1228 spin_lock_irqsave(&log->io_list_lock, flags);
1229 __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
1230
1231 if (!r5l_complete_finished_ios(log)) {
1232 spin_unlock_irqrestore(&log->io_list_lock, flags);
1233 return;
1234 }
1235
1236 if (r5l_reclaimable_space(log) > log->max_free_space ||
1237 test_bit(R5C_LOG_TIGHT, &conf->cache_state))
1238 r5l_wake_reclaim(log, 0);
1239
1240 spin_unlock_irqrestore(&log->io_list_lock, flags);
1241 wake_up(&log->iounit_wait);
1242}
1243
1244void r5l_stripe_write_finished(struct stripe_head *sh)
1245{
1246 struct r5l_io_unit *io;
1247
1248 io = sh->log_io;
1249 sh->log_io = NULL;
1250
1251 if (io && atomic_dec_and_test(&io->pending_stripe))
1252 __r5l_stripe_write_finished(io);
1253}
1254
1255static void r5l_log_flush_endio(struct bio *bio)
1256{
1257 struct r5l_log *log = container_of(bio, struct r5l_log,
1258 flush_bio);
1259 unsigned long flags;
1260 struct r5l_io_unit *io;
1261
1262 if (bio->bi_status)
1263 md_error(log->rdev->mddev, log->rdev);
1264
1265 spin_lock_irqsave(&log->io_list_lock, flags);
1266 list_for_each_entry(io, &log->flushing_ios, log_sibling)
1267 r5l_io_run_stripes(io);
1268 list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
1269 spin_unlock_irqrestore(&log->io_list_lock, flags);
1270}
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286void r5l_flush_stripe_to_raid(struct r5l_log *log)
1287{
1288 bool do_flush;
1289
1290 if (!log || !log->need_cache_flush)
1291 return;
1292
1293 spin_lock_irq(&log->io_list_lock);
1294
1295 if (!list_empty(&log->flushing_ios)) {
1296 spin_unlock_irq(&log->io_list_lock);
1297 return;
1298 }
1299 list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
1300 do_flush = !list_empty(&log->flushing_ios);
1301 spin_unlock_irq(&log->io_list_lock);
1302
1303 if (!do_flush)
1304 return;
1305 bio_reset(&log->flush_bio);
1306 bio_set_dev(&log->flush_bio, log->rdev->bdev);
1307 log->flush_bio.bi_end_io = r5l_log_flush_endio;
1308 log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
1309 submit_bio(&log->flush_bio);
1310}
1311
1312static void r5l_write_super(struct r5l_log *log, sector_t cp);
1313static void r5l_write_super_and_discard_space(struct r5l_log *log,
1314 sector_t end)
1315{
1316 struct block_device *bdev = log->rdev->bdev;
1317 struct mddev *mddev;
1318
1319 r5l_write_super(log, end);
1320
1321 if (!blk_queue_discard(bdev_get_queue(bdev)))
1322 return;
1323
1324 mddev = log->rdev->mddev;
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336 set_mask_bits(&mddev->sb_flags, 0,
1337 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1338 if (!mddev_trylock(mddev))
1339 return;
1340 md_update_sb(mddev, 1);
1341 mddev_unlock(mddev);
1342
1343
1344 if (log->last_checkpoint < end) {
1345 blkdev_issue_discard(bdev,
1346 log->last_checkpoint + log->rdev->data_offset,
1347 end - log->last_checkpoint, GFP_NOIO, 0);
1348 } else {
1349 blkdev_issue_discard(bdev,
1350 log->last_checkpoint + log->rdev->data_offset,
1351 log->device_size - log->last_checkpoint,
1352 GFP_NOIO, 0);
1353 blkdev_issue_discard(bdev, log->rdev->data_offset, end,
1354 GFP_NOIO, 0);
1355 }
1356}
1357
1358
1359
1360
1361
1362
1363
1364static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
1365{
1366 BUG_ON(list_empty(&sh->lru));
1367 BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1368 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
1369
1370
1371
1372
1373
1374 BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
1375 lockdep_assert_held(&conf->device_lock);
1376
1377 list_del_init(&sh->lru);
1378 atomic_inc(&sh->count);
1379
1380 set_bit(STRIPE_HANDLE, &sh->state);
1381 atomic_inc(&conf->active_stripes);
1382 r5c_make_stripe_write_out(sh);
1383
1384 if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
1385 atomic_inc(&conf->r5c_flushing_partial_stripes);
1386 else
1387 atomic_inc(&conf->r5c_flushing_full_stripes);
1388 raid5_release_stripe(sh);
1389}
1390
1391
1392
1393
1394
1395
1396
1397void r5c_flush_cache(struct r5conf *conf, int num)
1398{
1399 int count;
1400 struct stripe_head *sh, *next;
1401
1402 lockdep_assert_held(&conf->device_lock);
1403 if (!conf->log)
1404 return;
1405
1406 count = 0;
1407 list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
1408 r5c_flush_stripe(conf, sh);
1409 count++;
1410 }
1411
1412 if (count >= num)
1413 return;
1414 list_for_each_entry_safe(sh, next,
1415 &conf->r5c_partial_stripe_list, lru) {
1416 r5c_flush_stripe(conf, sh);
1417 if (++count >= num)
1418 break;
1419 }
1420}
1421
1422static void r5c_do_reclaim(struct r5conf *conf)
1423{
1424 struct r5l_log *log = conf->log;
1425 struct stripe_head *sh;
1426 int count = 0;
1427 unsigned long flags;
1428 int total_cached;
1429 int stripes_to_flush;
1430 int flushing_partial, flushing_full;
1431
1432 if (!r5c_is_writeback(log))
1433 return;
1434
1435 flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes);
1436 flushing_full = atomic_read(&conf->r5c_flushing_full_stripes);
1437 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
1438 atomic_read(&conf->r5c_cached_full_stripes) -
1439 flushing_full - flushing_partial;
1440
1441 if (total_cached > conf->min_nr_stripes * 3 / 4 ||
1442 atomic_read(&conf->empty_inactive_list_nr) > 0)
1443
1444
1445
1446
1447 stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
1448 else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
1449 atomic_read(&conf->r5c_cached_full_stripes) - flushing_full >
1450 R5C_FULL_STRIPE_FLUSH_BATCH(conf))
1451
1452
1453
1454
1455 stripes_to_flush = 0;
1456 else
1457
1458 stripes_to_flush = -1;
1459
1460 if (stripes_to_flush >= 0) {
1461 spin_lock_irqsave(&conf->device_lock, flags);
1462 r5c_flush_cache(conf, stripes_to_flush);
1463 spin_unlock_irqrestore(&conf->device_lock, flags);
1464 }
1465
1466
1467 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
1468 spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1469 spin_lock(&conf->device_lock);
1470 list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
1471
1472
1473
1474
1475
1476
1477
1478
1479 if (!list_empty(&sh->lru) &&
1480 !test_bit(STRIPE_HANDLE, &sh->state) &&
1481 atomic_read(&sh->count) == 0) {
1482 r5c_flush_stripe(conf, sh);
1483 if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
1484 break;
1485 }
1486 }
1487 spin_unlock(&conf->device_lock);
1488 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1489 }
1490
1491 if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
1492 r5l_run_no_space_stripes(log);
1493
1494 md_wakeup_thread(conf->mddev->thread);
1495}
1496
1497static void r5l_do_reclaim(struct r5l_log *log)
1498{
1499 struct r5conf *conf = log->rdev->mddev->private;
1500 sector_t reclaim_target = xchg(&log->reclaim_target, 0);
1501 sector_t reclaimable;
1502 sector_t next_checkpoint;
1503 bool write_super;
1504
1505 spin_lock_irq(&log->io_list_lock);
1506 write_super = r5l_reclaimable_space(log) > log->max_free_space ||
1507 reclaim_target != 0 || !list_empty(&log->no_space_stripes);
1508
1509
1510
1511
1512
1513 while (1) {
1514 reclaimable = r5l_reclaimable_space(log);
1515 if (reclaimable >= reclaim_target ||
1516 (list_empty(&log->running_ios) &&
1517 list_empty(&log->io_end_ios) &&
1518 list_empty(&log->flushing_ios) &&
1519 list_empty(&log->finished_ios)))
1520 break;
1521
1522 md_wakeup_thread(log->rdev->mddev->thread);
1523 wait_event_lock_irq(log->iounit_wait,
1524 r5l_reclaimable_space(log) > reclaimable,
1525 log->io_list_lock);
1526 }
1527
1528 next_checkpoint = r5c_calculate_new_cp(conf);
1529 spin_unlock_irq(&log->io_list_lock);
1530
1531 if (reclaimable == 0 || !write_super)
1532 return;
1533
1534
1535
1536
1537
1538
1539 r5l_write_super_and_discard_space(log, next_checkpoint);
1540
1541 mutex_lock(&log->io_mutex);
1542 log->last_checkpoint = next_checkpoint;
1543 r5c_update_log_state(log);
1544 mutex_unlock(&log->io_mutex);
1545
1546 r5l_run_no_space_stripes(log);
1547}
1548
1549static void r5l_reclaim_thread(struct md_thread *thread)
1550{
1551 struct mddev *mddev = thread->mddev;
1552 struct r5conf *conf = mddev->private;
1553 struct r5l_log *log = conf->log;
1554
1555 if (!log)
1556 return;
1557 r5c_do_reclaim(conf);
1558 r5l_do_reclaim(log);
1559}
1560
1561void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
1562{
1563 unsigned long target;
1564 unsigned long new = (unsigned long)space;
1565
1566 if (!log)
1567 return;
1568 do {
1569 target = log->reclaim_target;
1570 if (new < target)
1571 return;
1572 } while (cmpxchg(&log->reclaim_target, target, new) != target);
1573 md_wakeup_thread(log->reclaim_thread);
1574}
1575
1576void r5l_quiesce(struct r5l_log *log, int quiesce)
1577{
1578 struct mddev *mddev;
1579
1580 if (quiesce) {
1581
1582 mddev = log->rdev->mddev;
1583 wake_up(&mddev->sb_wait);
1584 kthread_park(log->reclaim_thread->tsk);
1585 r5l_wake_reclaim(log, MaxSector);
1586 r5l_do_reclaim(log);
1587 } else
1588 kthread_unpark(log->reclaim_thread->tsk);
1589}
1590
1591bool r5l_log_disk_error(struct r5conf *conf)
1592{
1593 struct r5l_log *log;
1594 bool ret;
1595
1596 rcu_read_lock();
1597 log = rcu_dereference(conf->log);
1598
1599 if (!log)
1600 ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1601 else
1602 ret = test_bit(Faulty, &log->rdev->flags);
1603 rcu_read_unlock();
1604 return ret;
1605}
1606
1607#define R5L_RECOVERY_PAGE_POOL_SIZE 256
1608
1609struct r5l_recovery_ctx {
1610 struct page *meta_page;
1611 sector_t meta_total_blocks;
1612 sector_t pos;
1613 u64 seq;
1614 int data_parity_stripes;
1615 int data_only_stripes;
1616 struct list_head cached_list;
1617
1618
1619
1620
1621
1622
1623
1624
1625 struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE];
1626 sector_t pool_offset;
1627 int total_pages;
1628 int valid_pages;
1629 struct bio *ra_bio;
1630};
1631
1632static int r5l_recovery_allocate_ra_pool(struct r5l_log *log,
1633 struct r5l_recovery_ctx *ctx)
1634{
1635 struct page *page;
1636
1637 ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_VECS, &log->bs);
1638 if (!ctx->ra_bio)
1639 return -ENOMEM;
1640
1641 ctx->valid_pages = 0;
1642 ctx->total_pages = 0;
1643 while (ctx->total_pages < R5L_RECOVERY_PAGE_POOL_SIZE) {
1644 page = alloc_page(GFP_KERNEL);
1645
1646 if (!page)
1647 break;
1648 ctx->ra_pool[ctx->total_pages] = page;
1649 ctx->total_pages += 1;
1650 }
1651
1652 if (ctx->total_pages == 0) {
1653 bio_put(ctx->ra_bio);
1654 return -ENOMEM;
1655 }
1656
1657 ctx->pool_offset = 0;
1658 return 0;
1659}
1660
1661static void r5l_recovery_free_ra_pool(struct r5l_log *log,
1662 struct r5l_recovery_ctx *ctx)
1663{
1664 int i;
1665
1666 for (i = 0; i < ctx->total_pages; ++i)
1667 put_page(ctx->ra_pool[i]);
1668 bio_put(ctx->ra_bio);
1669}
1670
1671
1672
1673
1674
1675
1676
1677static int r5l_recovery_fetch_ra_pool(struct r5l_log *log,
1678 struct r5l_recovery_ctx *ctx,
1679 sector_t offset)
1680{
1681 bio_reset(ctx->ra_bio);
1682 bio_set_dev(ctx->ra_bio, log->rdev->bdev);
1683 bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0);
1684 ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset;
1685
1686 ctx->valid_pages = 0;
1687 ctx->pool_offset = offset;
1688
1689 while (ctx->valid_pages < ctx->total_pages) {
1690 bio_add_page(ctx->ra_bio,
1691 ctx->ra_pool[ctx->valid_pages], PAGE_SIZE, 0);
1692 ctx->valid_pages += 1;
1693
1694 offset = r5l_ring_add(log, offset, BLOCK_SECTORS);
1695
1696 if (offset == 0)
1697 break;
1698 }
1699
1700 return submit_bio_wait(ctx->ra_bio);
1701}
1702
1703
1704
1705
1706
1707static int r5l_recovery_read_page(struct r5l_log *log,
1708 struct r5l_recovery_ctx *ctx,
1709 struct page *page,
1710 sector_t offset)
1711{
1712 int ret;
1713
1714 if (offset < ctx->pool_offset ||
1715 offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS) {
1716 ret = r5l_recovery_fetch_ra_pool(log, ctx, offset);
1717 if (ret)
1718 return ret;
1719 }
1720
1721 BUG_ON(offset < ctx->pool_offset ||
1722 offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS);
1723
1724 memcpy(page_address(page),
1725 page_address(ctx->ra_pool[(offset - ctx->pool_offset) >>
1726 BLOCK_SECTOR_SHIFT]),
1727 PAGE_SIZE);
1728 return 0;
1729}
1730
1731static int r5l_recovery_read_meta_block(struct r5l_log *log,
1732 struct r5l_recovery_ctx *ctx)
1733{
1734 struct page *page = ctx->meta_page;
1735 struct r5l_meta_block *mb;
1736 u32 crc, stored_crc;
1737 int ret;
1738
1739 ret = r5l_recovery_read_page(log, ctx, page, ctx->pos);
1740 if (ret != 0)
1741 return ret;
1742
1743 mb = page_address(page);
1744 stored_crc = le32_to_cpu(mb->checksum);
1745 mb->checksum = 0;
1746
1747 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1748 le64_to_cpu(mb->seq) != ctx->seq ||
1749 mb->version != R5LOG_VERSION ||
1750 le64_to_cpu(mb->position) != ctx->pos)
1751 return -EINVAL;
1752
1753 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1754 if (stored_crc != crc)
1755 return -EINVAL;
1756
1757 if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
1758 return -EINVAL;
1759
1760 ctx->meta_total_blocks = BLOCK_SECTORS;
1761
1762 return 0;
1763}
1764
1765static void
1766r5l_recovery_create_empty_meta_block(struct r5l_log *log,
1767 struct page *page,
1768 sector_t pos, u64 seq)
1769{
1770 struct r5l_meta_block *mb;
1771
1772 mb = page_address(page);
1773 clear_page(mb);
1774 mb->magic = cpu_to_le32(R5LOG_MAGIC);
1775 mb->version = R5LOG_VERSION;
1776 mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1777 mb->seq = cpu_to_le64(seq);
1778 mb->position = cpu_to_le64(pos);
1779}
1780
1781static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
1782 u64 seq)
1783{
1784 struct page *page;
1785 struct r5l_meta_block *mb;
1786
1787 page = alloc_page(GFP_KERNEL);
1788 if (!page)
1789 return -ENOMEM;
1790 r5l_recovery_create_empty_meta_block(log, page, pos, seq);
1791 mb = page_address(page);
1792 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
1793 mb, PAGE_SIZE));
1794 if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
1795 REQ_SYNC | REQ_FUA, false)) {
1796 __free_page(page);
1797 return -EIO;
1798 }
1799 __free_page(page);
1800 return 0;
1801}
1802
1803
1804
1805
1806
1807
1808
1809
1810static void r5l_recovery_load_data(struct r5l_log *log,
1811 struct stripe_head *sh,
1812 struct r5l_recovery_ctx *ctx,
1813 struct r5l_payload_data_parity *payload,
1814 sector_t log_offset)
1815{
1816 struct mddev *mddev = log->rdev->mddev;
1817 struct r5conf *conf = mddev->private;
1818 int dd_idx;
1819
1820 raid5_compute_sector(conf,
1821 le64_to_cpu(payload->location), 0,
1822 &dd_idx, sh);
1823 r5l_recovery_read_page(log, ctx, sh->dev[dd_idx].page, log_offset);
1824 sh->dev[dd_idx].log_checksum =
1825 le32_to_cpu(payload->checksum[0]);
1826 ctx->meta_total_blocks += BLOCK_SECTORS;
1827
1828 set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
1829 set_bit(STRIPE_R5C_CACHING, &sh->state);
1830}
1831
1832static void r5l_recovery_load_parity(struct r5l_log *log,
1833 struct stripe_head *sh,
1834 struct r5l_recovery_ctx *ctx,
1835 struct r5l_payload_data_parity *payload,
1836 sector_t log_offset)
1837{
1838 struct mddev *mddev = log->rdev->mddev;
1839 struct r5conf *conf = mddev->private;
1840
1841 ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
1842 r5l_recovery_read_page(log, ctx, sh->dev[sh->pd_idx].page, log_offset);
1843 sh->dev[sh->pd_idx].log_checksum =
1844 le32_to_cpu(payload->checksum[0]);
1845 set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
1846
1847 if (sh->qd_idx >= 0) {
1848 r5l_recovery_read_page(
1849 log, ctx, sh->dev[sh->qd_idx].page,
1850 r5l_ring_add(log, log_offset, BLOCK_SECTORS));
1851 sh->dev[sh->qd_idx].log_checksum =
1852 le32_to_cpu(payload->checksum[1]);
1853 set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
1854 }
1855 clear_bit(STRIPE_R5C_CACHING, &sh->state);
1856}
1857
1858static void r5l_recovery_reset_stripe(struct stripe_head *sh)
1859{
1860 int i;
1861
1862 sh->state = 0;
1863 sh->log_start = MaxSector;
1864 for (i = sh->disks; i--; )
1865 sh->dev[i].flags = 0;
1866}
1867
1868static void
1869r5l_recovery_replay_one_stripe(struct r5conf *conf,
1870 struct stripe_head *sh,
1871 struct r5l_recovery_ctx *ctx)
1872{
1873 struct md_rdev *rdev, *rrdev;
1874 int disk_index;
1875 int data_count = 0;
1876
1877 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1878 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1879 continue;
1880 if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
1881 continue;
1882 data_count++;
1883 }
1884
1885
1886
1887
1888
1889
1890 if (data_count == 0)
1891 goto out;
1892
1893 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1894 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1895 continue;
1896
1897
1898 rcu_read_lock();
1899 rdev = rcu_dereference(conf->disks[disk_index].rdev);
1900 if (rdev) {
1901 atomic_inc(&rdev->nr_pending);
1902 rcu_read_unlock();
1903 sync_page_io(rdev, sh->sector, PAGE_SIZE,
1904 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1905 false);
1906 rdev_dec_pending(rdev, rdev->mddev);
1907 rcu_read_lock();
1908 }
1909 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
1910 if (rrdev) {
1911 atomic_inc(&rrdev->nr_pending);
1912 rcu_read_unlock();
1913 sync_page_io(rrdev, sh->sector, PAGE_SIZE,
1914 sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1915 false);
1916 rdev_dec_pending(rrdev, rrdev->mddev);
1917 rcu_read_lock();
1918 }
1919 rcu_read_unlock();
1920 }
1921 ctx->data_parity_stripes++;
1922out:
1923 r5l_recovery_reset_stripe(sh);
1924}
1925
1926static struct stripe_head *
1927r5c_recovery_alloc_stripe(
1928 struct r5conf *conf,
1929 sector_t stripe_sect,
1930 int noblock)
1931{
1932 struct stripe_head *sh;
1933
1934 sh = raid5_get_active_stripe(conf, stripe_sect, 0, noblock, 0);
1935 if (!sh)
1936 return NULL;
1937
1938 r5l_recovery_reset_stripe(sh);
1939
1940 return sh;
1941}
1942
1943static struct stripe_head *
1944r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
1945{
1946 struct stripe_head *sh;
1947
1948 list_for_each_entry(sh, list, lru)
1949 if (sh->sector == sect)
1950 return sh;
1951 return NULL;
1952}
1953
1954static void
1955r5c_recovery_drop_stripes(struct list_head *cached_stripe_list,
1956 struct r5l_recovery_ctx *ctx)
1957{
1958 struct stripe_head *sh, *next;
1959
1960 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) {
1961 r5l_recovery_reset_stripe(sh);
1962 list_del_init(&sh->lru);
1963 raid5_release_stripe(sh);
1964 }
1965}
1966
1967static void
1968r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
1969 struct r5l_recovery_ctx *ctx)
1970{
1971 struct stripe_head *sh, *next;
1972
1973 list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
1974 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
1975 r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
1976 list_del_init(&sh->lru);
1977 raid5_release_stripe(sh);
1978 }
1979}
1980
1981
1982static int
1983r5l_recovery_verify_data_checksum(struct r5l_log *log,
1984 struct r5l_recovery_ctx *ctx,
1985 struct page *page,
1986 sector_t log_offset, __le32 log_checksum)
1987{
1988 void *addr;
1989 u32 checksum;
1990
1991 r5l_recovery_read_page(log, ctx, page, log_offset);
1992 addr = kmap_atomic(page);
1993 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
1994 kunmap_atomic(addr);
1995 return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
1996}
1997
1998
1999
2000
2001
2002static int
2003r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
2004 struct r5l_recovery_ctx *ctx)
2005{
2006 struct mddev *mddev = log->rdev->mddev;
2007 struct r5conf *conf = mddev->private;
2008 struct r5l_meta_block *mb = page_address(ctx->meta_page);
2009 sector_t mb_offset = sizeof(struct r5l_meta_block);
2010 sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2011 struct page *page;
2012 struct r5l_payload_data_parity *payload;
2013 struct r5l_payload_flush *payload_flush;
2014
2015 page = alloc_page(GFP_KERNEL);
2016 if (!page)
2017 return -ENOMEM;
2018
2019 while (mb_offset < le32_to_cpu(mb->meta_size)) {
2020 payload = (void *)mb + mb_offset;
2021 payload_flush = (void *)mb + mb_offset;
2022
2023 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
2024 if (r5l_recovery_verify_data_checksum(
2025 log, ctx, page, log_offset,
2026 payload->checksum[0]) < 0)
2027 goto mismatch;
2028 } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) {
2029 if (r5l_recovery_verify_data_checksum(
2030 log, ctx, page, log_offset,
2031 payload->checksum[0]) < 0)
2032 goto mismatch;
2033 if (conf->max_degraded == 2 &&
2034 r5l_recovery_verify_data_checksum(
2035 log, ctx, page,
2036 r5l_ring_add(log, log_offset,
2037 BLOCK_SECTORS),
2038 payload->checksum[1]) < 0)
2039 goto mismatch;
2040 } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
2041
2042 } else
2043 goto mismatch;
2044
2045 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
2046 mb_offset += sizeof(struct r5l_payload_flush) +
2047 le32_to_cpu(payload_flush->size);
2048 } else {
2049
2050 log_offset = r5l_ring_add(log, log_offset,
2051 le32_to_cpu(payload->size));
2052 mb_offset += sizeof(struct r5l_payload_data_parity) +
2053 sizeof(__le32) *
2054 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
2055 }
2056
2057 }
2058
2059 put_page(page);
2060 return 0;
2061
2062mismatch:
2063 put_page(page);
2064 return -EINVAL;
2065}
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075static int
2076r5c_recovery_analyze_meta_block(struct r5l_log *log,
2077 struct r5l_recovery_ctx *ctx,
2078 struct list_head *cached_stripe_list)
2079{
2080 struct mddev *mddev = log->rdev->mddev;
2081 struct r5conf *conf = mddev->private;
2082 struct r5l_meta_block *mb;
2083 struct r5l_payload_data_parity *payload;
2084 struct r5l_payload_flush *payload_flush;
2085 int mb_offset;
2086 sector_t log_offset;
2087 sector_t stripe_sect;
2088 struct stripe_head *sh;
2089 int ret;
2090
2091
2092
2093
2094
2095
2096 ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
2097 if (ret == -EINVAL)
2098 return -EAGAIN;
2099 else if (ret)
2100 return ret;
2101
2102 mb = page_address(ctx->meta_page);
2103 mb_offset = sizeof(struct r5l_meta_block);
2104 log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2105
2106 while (mb_offset < le32_to_cpu(mb->meta_size)) {
2107 int dd;
2108
2109 payload = (void *)mb + mb_offset;
2110 payload_flush = (void *)mb + mb_offset;
2111
2112 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
2113 int i, count;
2114
2115 count = le32_to_cpu(payload_flush->size) / sizeof(__le64);
2116 for (i = 0; i < count; ++i) {
2117 stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]);
2118 sh = r5c_recovery_lookup_stripe(cached_stripe_list,
2119 stripe_sect);
2120 if (sh) {
2121 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
2122 r5l_recovery_reset_stripe(sh);
2123 list_del_init(&sh->lru);
2124 raid5_release_stripe(sh);
2125 }
2126 }
2127
2128 mb_offset += sizeof(struct r5l_payload_flush) +
2129 le32_to_cpu(payload_flush->size);
2130 continue;
2131 }
2132
2133
2134 stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
2135 raid5_compute_sector(
2136 conf, le64_to_cpu(payload->location), 0, &dd,
2137 NULL)
2138 : le64_to_cpu(payload->location);
2139
2140 sh = r5c_recovery_lookup_stripe(cached_stripe_list,
2141 stripe_sect);
2142
2143 if (!sh) {
2144 sh = r5c_recovery_alloc_stripe(conf, stripe_sect, 1);
2145
2146
2147
2148
2149 if (!sh) {
2150 r5c_recovery_replay_stripes(
2151 cached_stripe_list, ctx);
2152 sh = r5c_recovery_alloc_stripe(
2153 conf, stripe_sect, 1);
2154 }
2155 if (!sh) {
2156 int new_size = conf->min_nr_stripes * 2;
2157 pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",
2158 mdname(mddev),
2159 new_size);
2160 ret = raid5_set_cache_size(mddev, new_size);
2161 if (conf->min_nr_stripes <= new_size / 2) {
2162 pr_err("md/raid:%s: Cannot increase cache size, ret=%d, new_size=%d, min_nr_stripes=%d, max_nr_stripes=%d\n",
2163 mdname(mddev),
2164 ret,
2165 new_size,
2166 conf->min_nr_stripes,
2167 conf->max_nr_stripes);
2168 return -ENOMEM;
2169 }
2170 sh = r5c_recovery_alloc_stripe(
2171 conf, stripe_sect, 0);
2172 }
2173 if (!sh) {
2174 pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
2175 mdname(mddev));
2176 return -ENOMEM;
2177 }
2178 list_add_tail(&sh->lru, cached_stripe_list);
2179 }
2180
2181 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
2182 if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
2183 test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
2184 r5l_recovery_replay_one_stripe(conf, sh, ctx);
2185 list_move_tail(&sh->lru, cached_stripe_list);
2186 }
2187 r5l_recovery_load_data(log, sh, ctx, payload,
2188 log_offset);
2189 } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
2190 r5l_recovery_load_parity(log, sh, ctx, payload,
2191 log_offset);
2192 else
2193 return -EINVAL;
2194
2195 log_offset = r5l_ring_add(log, log_offset,
2196 le32_to_cpu(payload->size));
2197
2198 mb_offset += sizeof(struct r5l_payload_data_parity) +
2199 sizeof(__le32) *
2200 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
2201 }
2202
2203 return 0;
2204}
2205
2206
2207
2208
2209
2210static void r5c_recovery_load_one_stripe(struct r5l_log *log,
2211 struct stripe_head *sh)
2212{
2213 struct r5dev *dev;
2214 int i;
2215
2216 for (i = sh->disks; i--; ) {
2217 dev = sh->dev + i;
2218 if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
2219 set_bit(R5_InJournal, &dev->flags);
2220 set_bit(R5_UPTODATE, &dev->flags);
2221 }
2222 }
2223}
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241static int r5c_recovery_flush_log(struct r5l_log *log,
2242 struct r5l_recovery_ctx *ctx)
2243{
2244 struct stripe_head *sh;
2245 int ret = 0;
2246
2247
2248 while (1) {
2249 if (r5l_recovery_read_meta_block(log, ctx))
2250 break;
2251
2252 ret = r5c_recovery_analyze_meta_block(log, ctx,
2253 &ctx->cached_list);
2254
2255
2256
2257
2258 if (ret && ret != -EAGAIN)
2259 break;
2260 ctx->seq++;
2261 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
2262 }
2263
2264 if (ret == -ENOMEM) {
2265 r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
2266 return ret;
2267 }
2268
2269
2270 r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
2271
2272
2273 list_for_each_entry(sh, &ctx->cached_list, lru) {
2274 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2275 r5c_recovery_load_one_stripe(log, sh);
2276 ctx->data_only_stripes++;
2277 }
2278
2279 return 0;
2280}
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351static int
2352r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
2353 struct r5l_recovery_ctx *ctx)
2354{
2355 struct stripe_head *sh;
2356 struct mddev *mddev = log->rdev->mddev;
2357 struct page *page;
2358 sector_t next_checkpoint = MaxSector;
2359
2360 page = alloc_page(GFP_KERNEL);
2361 if (!page) {
2362 pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
2363 mdname(mddev));
2364 return -ENOMEM;
2365 }
2366
2367 WARN_ON(list_empty(&ctx->cached_list));
2368
2369 list_for_each_entry(sh, &ctx->cached_list, lru) {
2370 struct r5l_meta_block *mb;
2371 int i;
2372 int offset;
2373 sector_t write_pos;
2374
2375 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2376 r5l_recovery_create_empty_meta_block(log, page,
2377 ctx->pos, ctx->seq);
2378 mb = page_address(page);
2379 offset = le32_to_cpu(mb->meta_size);
2380 write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2381
2382 for (i = sh->disks; i--; ) {
2383 struct r5dev *dev = &sh->dev[i];
2384 struct r5l_payload_data_parity *payload;
2385 void *addr;
2386
2387 if (test_bit(R5_InJournal, &dev->flags)) {
2388 payload = (void *)mb + offset;
2389 payload->header.type = cpu_to_le16(
2390 R5LOG_PAYLOAD_DATA);
2391 payload->size = cpu_to_le32(BLOCK_SECTORS);
2392 payload->location = cpu_to_le64(
2393 raid5_compute_blocknr(sh, i, 0));
2394 addr = kmap_atomic(dev->page);
2395 payload->checksum[0] = cpu_to_le32(
2396 crc32c_le(log->uuid_checksum, addr,
2397 PAGE_SIZE));
2398 kunmap_atomic(addr);
2399 sync_page_io(log->rdev, write_pos, PAGE_SIZE,
2400 dev->page, REQ_OP_WRITE, 0, false);
2401 write_pos = r5l_ring_add(log, write_pos,
2402 BLOCK_SECTORS);
2403 offset += sizeof(__le32) +
2404 sizeof(struct r5l_payload_data_parity);
2405
2406 }
2407 }
2408 mb->meta_size = cpu_to_le32(offset);
2409 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
2410 mb, PAGE_SIZE));
2411 sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
2412 REQ_OP_WRITE, REQ_SYNC | REQ_FUA, false);
2413 sh->log_start = ctx->pos;
2414 list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
2415 atomic_inc(&log->stripe_in_journal_count);
2416 ctx->pos = write_pos;
2417 ctx->seq += 1;
2418 next_checkpoint = sh->log_start;
2419 }
2420 log->next_checkpoint = next_checkpoint;
2421 __free_page(page);
2422 return 0;
2423}
2424
2425static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
2426 struct r5l_recovery_ctx *ctx)
2427{
2428 struct mddev *mddev = log->rdev->mddev;
2429 struct r5conf *conf = mddev->private;
2430 struct stripe_head *sh, *next;
2431 bool cleared_pending = false;
2432
2433 if (ctx->data_only_stripes == 0)
2434 return;
2435
2436 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2437 cleared_pending = true;
2438 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2439 }
2440 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
2441
2442 list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
2443 r5c_make_stripe_write_out(sh);
2444 set_bit(STRIPE_HANDLE, &sh->state);
2445 list_del_init(&sh->lru);
2446 raid5_release_stripe(sh);
2447 }
2448
	/* reuse conf->wait_for_quiescent in recovery */
2450 wait_event(conf->wait_for_quiescent,
2451 atomic_read(&conf->active_stripes) == 0);
2452
2453 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
2454 if (cleared_pending)
2455 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2456}
2457
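/*
 * Entry point of journal recovery: scan the log from last_checkpoint, replay
 * data-parity stripes, reload data-only stripes into the stripe cache, then
 * seal the log with a fresh meta block (or rewritten data-only stripes)
 * before normal operation resumes.
 */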
2458static int r5l_recovery_log(struct r5l_log *log)
2459{
2460 struct mddev *mddev = log->rdev->mddev;
2461 struct r5l_recovery_ctx *ctx;
2462 int ret;
2463 sector_t pos;
2464
2465 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
2466 if (!ctx)
2467 return -ENOMEM;
2468
2469 ctx->pos = log->last_checkpoint;
2470 ctx->seq = log->last_cp_seq;
2471 INIT_LIST_HEAD(&ctx->cached_list);
2472 ctx->meta_page = alloc_page(GFP_KERNEL);
2473
2474 if (!ctx->meta_page) {
2475 ret = -ENOMEM;
2476 goto meta_page;
2477 }
2478
2479 if (r5l_recovery_allocate_ra_pool(log, ctx) != 0) {
2480 ret = -ENOMEM;
2481 goto ra_pool;
2482 }
2483
2484 ret = r5c_recovery_flush_log(log, ctx);
2485
2486 if (ret)
2487 goto error;
2488
2489 pos = ctx->pos;
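	/*
	 * Bump the sequence number far beyond anything that may still sit in
	 * the old log, so stale meta blocks past ctx->pos can never be
	 * mistaken for part of the new log by a later recovery run.
	 */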
2490 ctx->seq += 10000;
2491
2492 if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0))
2493 pr_info("md/raid:%s: starting from clean shutdown\n",
2494 mdname(mddev));
2495 else
2496 pr_info("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
2497 mdname(mddev), ctx->data_only_stripes,
2498 ctx->data_parity_stripes);
2499
2500 if (ctx->data_only_stripes == 0) {
2501 log->next_checkpoint = ctx->pos;
2502 r5l_log_write_empty_meta_block(log, ctx->pos, ctx->seq++);
2503 ctx->pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2504 } else if (r5c_recovery_rewrite_data_only_stripes(log, ctx)) {
2505 pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
2506 mdname(mddev));
2507 ret = -EIO;
2508 goto error;
2509 }
2510
2511 log->log_start = ctx->pos;
2512 log->seq = ctx->seq;
2513 log->last_checkpoint = pos;
2514 r5l_write_super(log, pos);
2515
2516 r5c_recovery_flush_data_only_stripes(log, ctx);
2517 ret = 0;
2518error:
2519 r5l_recovery_free_ra_pool(log, ctx);
2520ra_pool:
2521 __free_page(ctx->meta_page);
2522meta_page:
2523 kfree(ctx);
2524 return ret;
2525}
2526
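/*
 * Record the new journal tail in the rdev and mark the superblock dirty; the
 * actual superblock write happens later via MD_SB_CHANGE_DEVS.
 */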
2527static void r5l_write_super(struct r5l_log *log, sector_t cp)
2528{
2529 struct mddev *mddev = log->rdev->mddev;
2530
2531 log->rdev->journal_tail = cp;
2532 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2533}
2534
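/* sysfs: show both journal modes, with the active one in brackets */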
2535static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
2536{
2537 struct r5conf *conf;
2538 int ret;
2539
2540 spin_lock(&mddev->lock);
2541 conf = mddev->private;
2542 if (!conf || !conf->log) {
2543 spin_unlock(&mddev->lock);
2544 return 0;
2545 }
2546
2547 switch (conf->log->r5c_journal_mode) {
2548 case R5C_JOURNAL_MODE_WRITE_THROUGH:
2549 ret = snprintf(
2550 page, PAGE_SIZE, "[%s] %s\n",
2551 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2552 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2553 break;
2554 case R5C_JOURNAL_MODE_WRITE_BACK:
2555 ret = snprintf(
2556 page, PAGE_SIZE, "%s [%s]\n",
2557 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2558 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2559 break;
2560 default:
2561 ret = 0;
2562 }
2563 spin_unlock(&mddev->lock);
2564 return ret;
2565}
2566
/*
 * Set the journal cache mode on @mddev (external API, initially needed by
 * dm-raid).
 *
 * @mode is a value from 'enum r5c_journal_mode'.
 */
2573int r5c_journal_mode_set(struct mddev *mddev, int mode)
2574{
2575 struct r5conf *conf;
2576
2577 if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH ||
2578 mode > R5C_JOURNAL_MODE_WRITE_BACK)
2579 return -EINVAL;
2580
2581 conf = mddev->private;
2582 if (!conf || !conf->log)
2583 return -ENODEV;
2584
2585 if (raid5_calc_degraded(conf) > 0 &&
2586 mode == R5C_JOURNAL_MODE_WRITE_BACK)
2587 return -EINVAL;
2588
2589 mddev_suspend(mddev);
2590 conf->log->r5c_journal_mode = mode;
2591 mddev_resume(mddev);
2592
2593 pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
2594 mdname(mddev), mode, r5c_journal_mode_str[mode]);
2595 return 0;
2596}
2597EXPORT_SYMBOL(r5c_journal_mode_set);
2598
2599static ssize_t r5c_journal_mode_store(struct mddev *mddev,
2600 const char *page, size_t length)
2601{
2602 int mode = ARRAY_SIZE(r5c_journal_mode_str);
2603 size_t len = length;
2604 int ret;
2605
2606 if (len < 2)
2607 return -EINVAL;
2608
2609 if (page[len - 1] == '\n')
2610 len--;
2611
2612 while (mode--)
2613 if (strlen(r5c_journal_mode_str[mode]) == len &&
2614 !strncmp(page, r5c_journal_mode_str[mode], len))
2615 break;
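	/* no match leaves mode == -1, which r5c_journal_mode_set() rejects */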
2616 ret = mddev_lock(mddev);
2617 if (ret)
2618 return ret;
2619 ret = r5c_journal_mode_set(mddev, mode);
2620 mddev_unlock(mddev);
2621 return ret ?: length;
2622}
2623
2624struct md_sysfs_entry
2625r5c_journal_mode = __ATTR(journal_mode, 0644,
2626 r5c_journal_mode_show, r5c_journal_mode_store);
2627
/*
 * Try to handle a write in the caching phase.  This function should only be
 * called in write-back mode.
 *
 * If all outstanding writes can be handled in the caching phase, return 0.
 * If the write requires the write-out phase, call r5c_make_stripe_write_out()
 * and return -EAGAIN.
 */
2636int r5c_try_caching_write(struct r5conf *conf,
2637 struct stripe_head *sh,
2638 struct stripe_head_state *s,
2639 int disks)
2640{
2641 struct r5l_log *log = conf->log;
2642 int i;
2643 struct r5dev *dev;
2644 int to_cache = 0;
2645 void **pslot;
2646 sector_t tree_index;
2647 int ret;
2648 uintptr_t refcount;
2649
2650 BUG_ON(!r5c_is_writeback(log));
2651
2652 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
		/*
		 * There are two different scenarios here:
		 *  1. The stripe has some data cached, and it is sent to
		 *     the write-out phase for reclaim.
		 *  2. The stripe is clean, and this is the first write.
		 *
		 * For 1, return -EAGAIN, so we continue with
		 * handle_stripe_dirtying().
		 *
		 * For 2, set STRIPE_R5C_CACHING and continue with the
		 * caching write.
		 */

		/* case 1: anything injournal or anything in written */
2667 if (s->injournal > 0 || s->written > 0)
2668 return -EAGAIN;
		/* case 2 */
2670 set_bit(STRIPE_R5C_CACHING, &sh->state);
2671 }

	/*
	 * When the array runs degraded, it is switched to write-through
	 * mode.  This check helps drain pending writes safely during the
	 * transition to write-through mode.
	 *
	 * When a stripe is syncing, the write is also handled in
	 * write-through mode.
	 */
2681 if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
2682 r5c_make_stripe_write_out(sh);
2683 return -EAGAIN;
2684 }
2685
2686 for (i = disks; i--; ) {
2687 dev = &sh->dev[i];
		/* if non-overwrite, use the write-out phase */
2689 if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
2690 !test_bit(R5_InJournal, &dev->flags)) {
2691 r5c_make_stripe_write_out(sh);
2692 return -EAGAIN;
2693 }
2694 }
2695
	/* if the stripe is not counted in big_stripe_tree, add it now */
2697 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
2698 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
2699 tree_index = r5c_tree_index(conf, sh->sector);
2700 spin_lock(&log->tree_lock);
2701 pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
2702 tree_index);
2703 if (pslot) {
2704 refcount = (uintptr_t)radix_tree_deref_slot_protected(
2705 pslot, &log->tree_lock) >>
2706 R5C_RADIX_COUNT_SHIFT;
2707 radix_tree_replace_slot(
2708 &log->big_stripe_tree, pslot,
2709 (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT));
2710 } else {
			/*
			 * this radix_tree_insert can fail safely, so there
			 * is no need to call radix_tree_preload()
			 */
2715 ret = radix_tree_insert(
2716 &log->big_stripe_tree, tree_index,
2717 (void *)(1 << R5C_RADIX_COUNT_SHIFT));
2718 if (ret) {
2719 spin_unlock(&log->tree_lock);
2720 r5c_make_stripe_write_out(sh);
2721 return -EAGAIN;
2722 }
2723 }
2724 spin_unlock(&log->tree_lock);
2725
		/*
		 * set STRIPE_R5C_PARTIAL_STRIPE: the stripe is now counted
		 * in the radix tree
		 */
2730 set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
2731 atomic_inc(&conf->r5c_cached_partial_stripes);
2732 }
2733
2734 for (i = disks; i--; ) {
2735 dev = &sh->dev[i];
2736 if (dev->towrite) {
2737 set_bit(R5_Wantwrite, &dev->flags);
2738 set_bit(R5_Wantdrain, &dev->flags);
2739 set_bit(R5_LOCKED, &dev->flags);
2740 to_cache++;
2741 }
2742 }
2743
2744 if (to_cache) {
2745 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
		/*
		 * set STRIPE_LOG_TRAPPED so that ops_run_io() sends this
		 * write through r5c_cache_data() rather than straight to
		 * the RAID disks
		 */
2751 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
2752 }
2753
2754 return 0;
2755}
2756
/*
 * Release the pages that were installed as orig_page for prexor, restoring
 * orig_page to point back at the regular stripe page.
 */
2760void r5c_release_extra_page(struct stripe_head *sh)
2761{
2762 struct r5conf *conf = sh->raid_conf;
2763 int i;
2764 bool using_disk_info_extra_page;
2765
2766 using_disk_info_extra_page =
2767 sh->dev[0].orig_page == conf->disks[0].extra_page;
2768
2769 for (i = sh->disks; i--; )
2770 if (sh->dev[i].page != sh->dev[i].orig_page) {
2771 struct page *p = sh->dev[i].orig_page;
2772
2773 sh->dev[i].orig_page = sh->dev[i].page;
2774 clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2775
2776 if (!using_disk_info_extra_page)
2777 put_page(p);
2778 }
2779
2780 if (using_disk_info_extra_page) {
2781 clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state);
2782 md_wakeup_thread(conf->mddev->thread);
2783 }
2784}
2785
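/*
 * Point each device's orig_page at the shared conf->disks[i].extra_page,
 * dropping any per-device extra page installed earlier; the pages are handed
 * back by r5c_release_extra_page().
 */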
2786void r5c_use_extra_page(struct stripe_head *sh)
2787{
2788 struct r5conf *conf = sh->raid_conf;
2789 int i;
2790 struct r5dev *dev;
2791
2792 for (i = sh->disks; i--; ) {
2793 dev = &sh->dev[i];
2794 if (dev->orig_page != dev->page)
2795 put_page(dev->orig_page);
2796 dev->orig_page = conf->disks[i].extra_page;
2797 }
2798}
2799
/*
 * Clean up the stripe (clear R5_InJournal, drop it from the journal
 * accounting and big_stripe_tree) after it has been committed to the
 * RAID disks.
 */
2804void r5c_finish_stripe_write_out(struct r5conf *conf,
2805 struct stripe_head *sh,
2806 struct stripe_head_state *s)
2807{
2808 struct r5l_log *log = conf->log;
2809 int i;
2810 int do_wakeup = 0;
2811 sector_t tree_index;
2812 void **pslot;
2813 uintptr_t refcount;
2814
2815 if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
2816 return;
2817
2818 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
2819 clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
2820
2821 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
2822 return;
2823
2824 for (i = sh->disks; i--; ) {
2825 clear_bit(R5_InJournal, &sh->dev[i].flags);
2826 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2827 do_wakeup = 1;
2828 }
2829
	/*
	 * analyse_stripe() ran before r5c_finish_stripe_write_out(), so
	 * s->injournal was computed while R5_InJournal was still set;
	 * bring it back in sync now that the flags have been cleared.
	 */
2834 s->injournal = 0;
2835
2836 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2837 if (atomic_dec_and_test(&conf->pending_full_writes))
2838 md_wakeup_thread(conf->mddev->thread);
2839
2840 if (do_wakeup)
2841 wake_up(&conf->wait_for_overlap);
2842
2843 spin_lock_irq(&log->stripe_in_journal_lock);
2844 list_del_init(&sh->r5c);
2845 spin_unlock_irq(&log->stripe_in_journal_lock);
2846 sh->log_start = MaxSector;
2847
2848 atomic_dec(&log->stripe_in_journal_count);
2849 r5c_update_log_state(log);
2850
	/* stop counting this stripe in big_stripe_tree */
2852 if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
2853 test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
2854 tree_index = r5c_tree_index(conf, sh->sector);
2855 spin_lock(&log->tree_lock);
2856 pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
2857 tree_index);
2858 BUG_ON(pslot == NULL);
2859 refcount = (uintptr_t)radix_tree_deref_slot_protected(
2860 pslot, &log->tree_lock) >>
2861 R5C_RADIX_COUNT_SHIFT;
2862 if (refcount == 1)
2863 radix_tree_delete(&log->big_stripe_tree, tree_index);
2864 else
2865 radix_tree_replace_slot(
2866 &log->big_stripe_tree, pslot,
2867 (void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT));
2868 spin_unlock(&log->tree_lock);
2869 }
2870
2871 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
2872 BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
2873 atomic_dec(&conf->r5c_flushing_partial_stripes);
2874 atomic_dec(&conf->r5c_cached_partial_stripes);
2875 }
2876
2877 if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
2878 BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
2879 atomic_dec(&conf->r5c_flushing_full_stripes);
2880 atomic_dec(&conf->r5c_cached_full_stripes);
2881 }
2882
2883 r5l_append_flush_payload(log, sh->sector);
	/* the stripe is flushed to the raid disks; resync can proceed now */
2885 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
2886 set_bit(STRIPE_HANDLE, &sh->state);
2887}
2888
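/*
 * Checksum the pages marked R5_Wantwrite and append them to the journal as a
 * data-only entry (no parity): the caching write of write-back mode.
 */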
2889int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
2890{
2891 struct r5conf *conf = sh->raid_conf;
2892 int pages = 0;
2893 int reserve;
2894 int i;
2895 int ret = 0;
2896
2897 BUG_ON(!log);
2898
2899 for (i = 0; i < sh->disks; i++) {
2900 void *addr;
2901
2902 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
2903 continue;
2904 addr = kmap_atomic(sh->dev[i].page);
2905 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
2906 addr, PAGE_SIZE);
2907 kunmap_atomic(addr);
2908 pages++;
2909 }
2910 WARN_ON(pages == 0);
2911
	/*
	 * The stripe must enter the state machine again to call endio, so
	 * don't delay it.
	 */
2916 clear_bit(STRIPE_DELAYED, &sh->state);
2917 atomic_inc(&sh->count);
2918
2919 mutex_lock(&log->io_mutex);
	/* meta block + data pages */
2921 reserve = (1 + pages) << (PAGE_SHIFT - 9);
2922
2923 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
2924 sh->log_start == MaxSector)
2925 r5l_add_no_space_stripe(log, sh);
2926 else if (!r5l_has_free_space(log, reserve)) {
2927 if (sh->log_start == log->last_checkpoint)
2928 BUG();
2929 else
2930 r5l_add_no_space_stripe(log, sh);
2931 } else {
2932 ret = r5l_log_stripe(log, sh, pages, 0);
2933 if (ret) {
2934 spin_lock_irq(&log->io_list_lock);
2935 list_add_tail(&sh->log_list, &log->no_mem_stripes);
2936 spin_unlock_irq(&log->io_list_lock);
2937 }
2938 }
2939
2940 mutex_unlock(&log->io_mutex);
2941 return 0;
2942}
2943
/* check whether this big stripe is cached in the write-back cache */
2945bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
2946{
2947 struct r5l_log *log = conf->log;
2948 sector_t tree_index;
2949 void *slot;
2950
2951 if (!log)
2952 return false;
2953
2954 WARN_ON_ONCE(!rcu_read_lock_held());
2955 tree_index = r5c_tree_index(conf, sect);
2956 slot = radix_tree_lookup(&log->big_stripe_tree, tree_index);
2957 return slot != NULL;
2958}
2959
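/*
 * Read the meta block at the recorded journal tail and validate its magic,
 * checksum and position.  If anything is wrong, start a brand-new log with a
 * random sequence number; otherwise run recovery from that checkpoint.
 */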
2960static int r5l_load_log(struct r5l_log *log)
2961{
2962 struct md_rdev *rdev = log->rdev;
2963 struct page *page;
2964 struct r5l_meta_block *mb;
2965 sector_t cp = log->rdev->journal_tail;
2966 u32 stored_crc, expected_crc;
2967 bool create_super = false;
2968 int ret = 0;
2969
	/* make sure the checkpoint from the superblock is valid */
2971 if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
2972 cp = 0;
2973 page = alloc_page(GFP_KERNEL);
2974 if (!page)
2975 return -ENOMEM;
2976
2977 if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
2978 ret = -EIO;
2979 goto ioerr;
2980 }
2981 mb = page_address(page);
2982
2983 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
2984 mb->version != R5LOG_VERSION) {
2985 create_super = true;
2986 goto create;
2987 }
2988 stored_crc = le32_to_cpu(mb->checksum);
2989 mb->checksum = 0;
2990 expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
2991 if (stored_crc != expected_crc) {
2992 create_super = true;
2993 goto create;
2994 }
2995 if (le64_to_cpu(mb->position) != cp) {
2996 create_super = true;
2997 goto create;
2998 }
2999create:
3000 if (create_super) {
3001 log->last_cp_seq = prandom_u32();
3002 cp = 0;
3003 r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
		/*
		 * Make sure the superblock points to the correct address
		 * before the log takes any data; otherwise a later recovery
		 * cannot find the log.
		 */
3009 r5l_write_super(log, cp);
3010 } else
3011 log->last_cp_seq = le64_to_cpu(mb->seq);
3012
3013 log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
3014 log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
3015 if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
3016 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
3017 log->last_checkpoint = cp;
3018
3019 __free_page(page);
3020
3021 if (create_super) {
3022 log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS);
3023 log->seq = log->last_cp_seq + 1;
3024 log->next_checkpoint = cp;
3025 } else
3026 ret = r5l_recovery_log(log);
3027
3028 r5c_update_log_state(log);
3029 return ret;
3030ioerr:
3031 __free_page(page);
3032 return ret;
3033}
3034
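/*
 * Load (and, if necessary, recover) the journal when the array starts; on
 * failure the log is torn down with r5l_exit_log().
 */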
3035int r5l_start(struct r5l_log *log)
3036{
3037 int ret;
3038
3039 if (!log)
3040 return 0;
3041
3042 ret = r5l_load_log(log);
3043 if (ret) {
3044 struct mddev *mddev = log->rdev->mddev;
3045 struct r5conf *conf = mddev->private;
3046
3047 r5l_exit_log(conf);
3048 }
3049 return ret;
3050}
3051
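/*
 * Called when a member device fails: if the array is now degraded, or the
 * journal device itself failed, write-back caching is no longer safe, so
 * schedule the switch back to write-through.
 */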
3052void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
3053{
3054 struct r5conf *conf = mddev->private;
3055 struct r5l_log *log = conf->log;
3056
3057 if (!log)
3058 return;
3059
3060 if ((raid5_calc_degraded(conf) > 0 ||
3061 test_bit(Journal, &rdev->flags)) &&
3062 conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
3063 schedule_work(&log->disable_writeback_work);
3064}
3065
3066int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
3067{
3068 struct request_queue *q = bdev_get_queue(rdev->bdev);
3069 struct r5l_log *log;
3070 char b[BDEVNAME_SIZE];
3071 int ret;
3072
3073 pr_debug("md/raid:%s: using device %s as journal\n",
3074 mdname(conf->mddev), bdevname(rdev->bdev, b));
3075
3076 if (PAGE_SIZE != 4096)
3077 return -EINVAL;
3078
	/*
	 * PAGE_SIZE must be big enough to hold one r5l_meta_block plus
	 * raid_disks r5l_payload_data_parity descriptors, so the write
	 * journal/cache cannot support very large arrays (raid_disks > 203).
	 */
3086 if (sizeof(struct r5l_meta_block) +
3087 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
3088 conf->raid_disks) > PAGE_SIZE) {
3089 pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
3090 mdname(conf->mddev), conf->raid_disks);
3091 return -EINVAL;
3092 }
3093
3094 log = kzalloc(sizeof(*log), GFP_KERNEL);
3095 if (!log)
3096 return -ENOMEM;
3097 log->rdev = rdev;
3098
3099 log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
3100
3101 log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
3102 sizeof(rdev->mddev->uuid));
3103
3104 mutex_init(&log->io_mutex);
3105
3106 spin_lock_init(&log->io_list_lock);
3107 INIT_LIST_HEAD(&log->running_ios);
3108 INIT_LIST_HEAD(&log->io_end_ios);
3109 INIT_LIST_HEAD(&log->flushing_ios);
3110 INIT_LIST_HEAD(&log->finished_ios);
3111 bio_init(&log->flush_bio, NULL, 0);
3112
3113 log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
3114 if (!log->io_kc)
3115 goto io_kc;
3116
3117 ret = mempool_init_slab_pool(&log->io_pool, R5L_POOL_SIZE, log->io_kc);
3118 if (ret)
3119 goto io_pool;
3120
3121 ret = bioset_init(&log->bs, R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS);
3122 if (ret)
3123 goto io_bs;
3124
3125 ret = mempool_init_page_pool(&log->meta_pool, R5L_POOL_SIZE, 0);
3126 if (ret)
3127 goto out_mempool;
3128
3129 spin_lock_init(&log->tree_lock);
3130 INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
3131
3132 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
3133 log->rdev->mddev, "reclaim");
3134 if (!log->reclaim_thread)
3135 goto reclaim_thread;
3136 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
3137
3138 init_waitqueue_head(&log->iounit_wait);
3139
3140 INIT_LIST_HEAD(&log->no_mem_stripes);
3141
3142 INIT_LIST_HEAD(&log->no_space_stripes);
3143 spin_lock_init(&log->no_space_stripes_lock);
3144
3145 INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
3146 INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);
3147
3148 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
3149 INIT_LIST_HEAD(&log->stripe_in_journal_list);
3150 spin_lock_init(&log->stripe_in_journal_lock);
3151 atomic_set(&log->stripe_in_journal_count, 0);
3152
3153 rcu_assign_pointer(conf->log, log);
3154
3155 set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
3156 return 0;
3157
3158reclaim_thread:
3159 mempool_exit(&log->meta_pool);
3160out_mempool:
3161 bioset_exit(&log->bs);
3162io_bs:
3163 mempool_exit(&log->io_pool);
3164io_pool:
3165 kmem_cache_destroy(log->io_kc);
3166io_kc:
3167 kfree(log);
3168 return -EINVAL;
3169}
3170
3171void r5l_exit_log(struct r5conf *conf)
3172{
3173 struct r5l_log *log = conf->log;
3174
3175 conf->log = NULL;
3176 synchronize_rcu();
3177
	/* ensure disable_writeback_work wakes up and exits */
3179 wake_up(&conf->mddev->sb_wait);
3180 flush_work(&log->disable_writeback_work);
3181 md_unregister_thread(&log->reclaim_thread);
3182 mempool_exit(&log->meta_pool);
3183 bioset_exit(&log->bs);
3184 mempool_exit(&log->io_pool);
3185 kmem_cache_destroy(log->io_kc);
3186 kfree(log);
3187}
3188