1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18#include <linux/ioprio.h>
19#include <linux/kdev_t.h>
20#include <linux/module.h>
21#include <linux/sched/signal.h>
22#include <linux/err.h>
23#include <linux/blkdev.h>
24#include <linux/backing-dev.h>
25#include <linux/slab.h>
26#include <linux/delay.h>
27#include <linux/atomic.h>
28#include <linux/ctype.h>
29#include <linux/resume_user_mode.h>
30#include <linux/psi.h>
31#include <linux/part_stat.h>
32#include "blk.h"
33#include "blk-cgroup.h"
34#include "blk-ioprio.h"
35#include "blk-throttle.h"
36
static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);

/*
 * blkcg_pol_register_mutex serializes policy [un]registration;
 * blkcg_pol_mutex protects blkcg_policy[] and the per-cgroup policy data
 * (cpd) attached to each blkcg.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

/* the root blkcg is statically allocated; its css is exported as blkcg_root_css */
struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
EXPORT_SYMBOL_GPL(blkcg_root_css);

/* registered policies, indexed by plid; protected by blkcg_pol_mutex */
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

/* list of all live blkcgs; protected by blkcg_pol_mutex */
static LIST_HEAD(all_blkcgs);

/* when true, io.stat also emits per-policy debug info (e.g. use_delay) */
bool blkcg_debug_stats = false;

/* serializes blkg iostat propagation in __blkcg_rstat_flush() */
static DEFINE_RAW_SPINLOCK(blkg_stat_lock);

/* number of blkgs destroyed per queue_lock hold in blkg_destroy_all() */
#define BLKG_DESTROY_BATCH_SIZE 64
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
/*
 * Allocate and initialize @blkcg's per-cpu lockless lists which queue the
 * blkg_iostat_sets pending a flush.  Returns 0 or -ENOMEM.
 */
static int init_blkcg_llists(struct blkcg *blkcg)
{
	int cpu;

	blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
	if (!blkcg->lhead)
		return -ENOMEM;

	for_each_possible_cpu(cpu)
		init_llist_head(per_cpu_ptr(blkcg->lhead, cpu));
	return 0;
}
95
96
97
98
99
100
101
102
/**
 * blkcg_css - find the current css to associate I/O with
 *
 * A kthread may have an explicitly attached blkcg (kthread_blkcg()); if so
 * that css takes precedence, otherwise use the current task's io controller
 * css.
 */
static struct cgroup_subsys_state *blkcg_css(void)
{
	struct cgroup_subsys_state *css;

	css = kthread_blkcg();
	if (css)
		return css;
	return task_css(current, io_cgrp_id);
}
112
/* true iff @pol is registered and currently enabled on @q */
static bool blkcg_policy_enabled(struct request_queue *q,
				 const struct blkcg_policy *pol)
{
	return pol && test_bit(pol->plid, q->blkcg_pols);
}
118
/*
 * blkg_free_workfn - deferred freeing of a blkg
 *
 * Runs from a work item (see blkg_free()) so the sleeping locks below may be
 * taken.  q->blkcg_mutex is held across pd freeing and list removal so that
 * walkers of q->blkg_list under the mutex never see a half-torn-down blkg.
 */
static void blkg_free_workfn(struct work_struct *work)
{
	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
					     free_work);
	struct request_queue *q = blkg->q;
	int i;

	/* free per-policy data for every policy that allocated one */
	mutex_lock(&q->blkcg_mutex);
	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkg->pd[i])
			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
	/* drop the reference on the parent blkg taken in blkg_create() */
	if (blkg->parent)
		blkg_put(blkg->parent);
	/* unlink from the queue's blkg list */
	spin_lock_irq(&q->queue_lock);
	list_del_init(&blkg->q_node);
	spin_unlock_irq(&q->queue_lock);
	mutex_unlock(&q->blkcg_mutex);

	blk_put_queue(q);	/* ref taken via blk_get_queue() in blkg_alloc() */
	free_percpu(blkg->iostat_cpu);
	percpu_ref_exit(&blkg->refcnt);
	kfree(blkg);
}
149
150
151
152
153
154
155
/**
 * blkg_free - free a blkg
 * @blkg: blkg to free (may be %NULL, in which case this is a no-op)
 *
 * The actual freeing is punted to a work item because blkg_free_workfn()
 * takes sleeping locks (q->blkcg_mutex) while blkg_free() may be reached
 * from atomic context (e.g. via __blkg_release() from an RCU callback).
 */
static void blkg_free(struct blkcg_gq *blkg)
{
	if (!blkg)
		return;

	INIT_WORK(&blkg->free_work, blkg_free_workfn);
	schedule_work(&blkg->free_work);
}
168
/*
 * RCU callback run after the last percpu ref to @blkg is dropped (see
 * blkg_release()).  Flushes the blkg's pending per-cpu stats before the
 * structure goes away, then drops the css ref and schedules freeing.
 */
static void __blkg_release(struct rcu_head *rcu)
{
	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
	struct blkcg *blkcg = blkg->blkcg;
	int cpu;

#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
	WARN_ON(!bio_list_empty(&blkg->async_bios));
#endif
	/*
	 * Flush all the non-empty percpu lockless lists before releasing
	 * us, given these stats belong to us.
	 */
	for_each_possible_cpu(cpu)
		__blkcg_rstat_flush(blkcg, cpu);

	/* release the css reference taken via css_tryget_online() at creation */
	css_put(&blkg->blkcg->css);
	blkg_free(blkg);
}
191
192
193
194
195
196
197
198
199
/*
 * percpu_ref release callback for blkg->refcnt.  Teardown is deferred past
 * an RCU grace period so lockless readers holding rcu_read_lock() can
 * finish with the blkg before it is torn down.
 */
static void blkg_release(struct percpu_ref *ref)
{
	struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);

	call_rcu(&blkg->rcu_head, __blkg_release);
}
206
#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
/* workqueue used to re-issue punted bios out of the submitter's context */
static struct workqueue_struct *blkcg_punt_bio_wq;

/* work item: drain a blkg's queued async bios and submit them */
static void blkg_async_bio_workfn(struct work_struct *work)
{
	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
					     async_bio_work);
	struct bio_list bios = BIO_EMPTY_LIST;
	struct bio *bio;
	struct blk_plug plug;
	bool need_plug = false;

	/* as long as there are pending bios, @blkg can't go away */
	spin_lock(&blkg->async_bio_lock);
	bio_list_merge(&bios, &blkg->async_bios);
	bio_list_init(&blkg->async_bios);
	spin_unlock(&blkg->async_bio_lock);

	/* start plug only when bio_list contains at least 2 bios */
	if (bios.head && bios.head->bi_next) {
		need_plug = true;
		blk_start_plug(&plug);
	}
	while ((bio = bio_list_pop(&bios)))
		submit_bio(bio);
	if (need_plug)
		blk_finish_plug(&plug);
}

/*
 * Punt the bio's actual submission to a dedicated per-blkg work item
 * instead of issuing it synchronously from the current context.  Bios for
 * the root cgroup (no parent blkg) are submitted directly.
 */
void blkcg_punt_bio_submit(struct bio *bio)
{
	struct blkcg_gq *blkg = bio->bi_blkg;

	if (blkg->parent) {
		spin_lock(&blkg->async_bio_lock);
		bio_list_add(&blkg->async_bios, bio);
		spin_unlock(&blkg->async_bio_lock);
		queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
	} else {
		/* never bounce for the root cgroup */
		submit_bio(bio);
	}
}
EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit);

/* create the punt workqueue at boot; fails the initcall on -ENOMEM */
static int __init blkcg_punt_bio_init(void)
{
	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
					    WQ_UNBOUND | WQ_SYSFS, 0);
	if (!blkcg_punt_bio_wq)
		return -ENOMEM;
	return 0;
}
subsys_initcall(blkcg_punt_bio_init);
#endif /* CONFIG_BLK_CGROUP_PUNT_BIO */
269
270
271
272
273
274
275
276
277
/**
 * bio_blkcg_css - return the blkcg css associated with a bio
 * @bio: target bio
 *
 * Returns the css if @bio has an associated blkg, %NULL otherwise.
 */
struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio)
{
	if (!bio || !bio->bi_blkg)
		return NULL;
	return &bio->bi_blkg->blkcg->css;
}
EXPORT_SYMBOL_GPL(bio_blkcg_css);
285
286
287
288
289
290
291
/**
 * blkcg_parent - get the parent of a blkcg
 * @blkcg: blkcg of interest
 *
 * Return the parent blkcg of @blkcg (%NULL for the root).
 */
static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
{
	return css_to_blkcg(blkcg->css.parent);
}
296
297
298
299
300
301
302
303
304
/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @disk: gendisk the new blkg is associated with
 * @gfp_mask: allocation mask to use
 *
 * Allocate a new blkg associating @blkcg and @disk's request queue, including
 * per-cpu iostats and per-policy data for every enabled policy.  Returns the
 * new blkg or %NULL on any allocation failure (everything is unwound).
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
				   gfp_t gfp_mask)
{
	struct blkcg_gq *blkg;
	int i, cpu;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node);
	if (!blkg)
		return NULL;
	if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
		goto out_free_blkg;
	blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
	if (!blkg->iostat_cpu)
		goto out_exit_refcnt;
	/* hold a queue ref for the blkg's lifetime; dropped in blkg_free_workfn() */
	if (!blk_get_queue(disk->queue))
		goto out_free_iostat;

	blkg->q = disk->queue;
	INIT_LIST_HEAD(&blkg->q_node);
	blkg->blkcg = blkcg;
#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
	spin_lock_init(&blkg->async_bio_lock);
	bio_list_init(&blkg->async_bios);
	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
#endif

	u64_stats_init(&blkg->iostat.sync);
	for_each_possible_cpu(cpu) {
		u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
		/* back-pointer used by the rstat flush path */
		per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
	}

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkg_policy_data *pd;

		if (!blkcg_policy_enabled(disk->queue, pol))
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = pol->pd_alloc_fn(disk, blkcg, gfp_mask);
		if (!pd)
			goto out_free_pds;
		blkg->pd[i] = pd;
		pd->blkg = blkg;
		pd->plid = i;
		pd->online = false;
	}

	return blkg;

out_free_pds:
	/* free only the pds allocated so far (indices below the failed one) */
	while (--i >= 0)
		if (blkg->pd[i])
			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
	blk_put_queue(disk->queue);
out_free_iostat:
	free_percpu(blkg->iostat_cpu);
out_exit_refcnt:
	percpu_ref_exit(&blkg->refcnt);
out_free_blkg:
	kfree(blkg);
	return NULL;
}
370
371
372
373
374
/*
 * blkg_create - create and insert a blkg for the @blkcg - @disk pair
 *
 * If @new_blkg is %NULL, a blkg is allocated here with GFP_NOWAIT.
 * @new_blkg is always consumed on return.  Called with queue_lock held.
 * Returns the new blkg or ERR_PTR() on failure.
 */
static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
				    struct blkcg_gq *new_blkg)
{
	struct blkcg_gq *blkg;
	int i, ret;

	lockdep_assert_held(&disk->queue->queue_lock);

	/* request_queue is dying, do not create/recreate a blkg */
	if (blk_queue_dying(disk->queue)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	/* blkg holds a reference to blkcg */
	if (!css_tryget_online(&blkcg->css)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	/* allocate */
	if (!new_blkg) {
		new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto err_put_css;
		}
	}
	blkg = new_blkg;

	/* link parent - the parent blkg must already exist (walked from root) */
	if (blkcg_parent(blkcg)) {
		blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue);
		if (WARN_ON_ONCE(!blkg->parent)) {
			ret = -ENODEV;
			goto err_put_css;
		}
		blkg_get(blkg->parent);
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_init_fn)
			pol->pd_init_fn(blkg->pd[i]);
	}

	/* insert into the radix tree and both lists under blkcg->lock */
	spin_lock(&blkcg->lock);
	ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg);
	if (likely(!ret)) {
		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
		list_add(&blkg->q_node, &disk->queue->blkg_list);

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i]) {
				if (pol->pd_online_fn)
					pol->pd_online_fn(blkg->pd[i]);
				blkg->pd[i]->online = true;
			}
		}
	}
	blkg->online = true;
	spin_unlock(&blkcg->lock);

	if (!ret)
		return blkg;

	/* @blkg failed fully initialized, use the usual release path */
	blkg_put(blkg);
	return ERR_PTR(ret);

err_put_css:
	css_put(&blkcg->css);
err_free_blkg:
	if (new_blkg)
		blkg_free(new_blkg);
	return ERR_PTR(ret);
}
457
458
459
460
461
462
463
464
465
466
467
468
469
470
/**
 * blkg_lookup_create - lookup blkg, try to create one if not there
 * @blkcg: blkcg of interest
 * @disk: gendisk of interest
 *
 * Lookup the blkg for the @blkcg - @disk pair.  If it doesn't exist, try to
 * create one.  Creation walks down from blkcg_root so that all non-root
 * blkgs have access to their parent.  Must be called under RCU read lock;
 * takes and releases @disk->queue->queue_lock internally.
 *
 * Returns the blkg, or the closest existing ancestor blkg if blkg_create()
 * fails while walking down from root.
 */
static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
					   struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct blkcg_gq *blkg;
	unsigned long flags;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* fast path: lockless lookup */
	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	spin_lock_irqsave(&q->queue_lock, flags);
	/* re-check under the lock; update the lookup hint on a hit */
	blkg = blkg_lookup(blkcg, q);
	if (blkg) {
		if (blkcg != &blkcg_root &&
		    blkg != rcu_dereference(blkcg->blkg_hint))
			rcu_assign_pointer(blkcg->blkg_hint, blkg);
		goto found;
	}

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.  Returns the closest
	 * blkg to the intended blkg should blkg_create() fail.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent = blkcg_parent(blkcg);
		struct blkcg_gq *ret_blkg = q->root_blkg;

		/* find the highest ancestor without a blkg */
		while (parent) {
			blkg = blkg_lookup(parent, q);
			if (blkg) {
				/* remember closest existing blkg as fallback */
				ret_blkg = blkg;
				break;
			}
			pos = parent;
			parent = blkcg_parent(parent);
		}

		blkg = blkg_create(pos, disk, NULL);
		if (IS_ERR(blkg)) {
			blkg = ret_blkg;
			break;
		}
		if (pos == blkcg)
			break;
	}

found:
	spin_unlock_irqrestore(&q->queue_lock, flags);
	return blkg;
}
527
/*
 * blkg_destroy - unlink @blkg and kill its percpu reference
 *
 * Caller must hold both @blkg->q->queue_lock and @blkg->blkcg->lock.
 * Policies are offlined and the blkg is removed from the radix tree and the
 * blkcg hash list; the final free happens via the refcnt release path
 * (blkg_release -> __blkg_release -> blkg_free).
 */
static void blkg_destroy(struct blkcg_gq *blkg)
{
	struct blkcg *blkcg = blkg->blkcg;
	int i;

	lockdep_assert_held(&blkg->q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/*
	 * An unhashed blkcg_node means blkg_destroy() already ran for this
	 * blkg - nothing to do.  (The blkg stays on q->blkg_list until
	 * blkg_free_workfn(), so presence there doesn't imply liveness.)
	 */
	if (hlist_unhashed(&blkg->blkcg_node))
		return;

	/* offline every policy that was onlined on this blkg */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && blkg->pd[i]->online) {
			blkg->pd[i]->online = false;
			if (pol->pd_offline_fn)
				pol->pd_offline_fn(blkg->pd[i]);
		}
	}

	blkg->online = false;

	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
	hlist_del_init_rcu(&blkg->blkcg_node);

	/*
	 * Both setting the lookup hint to and clearing it from @blkg are
	 * done under queue_lock.  If it's not pointing to @blkg now, it
	 * never will.
	 */
	if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
		rcu_assign_pointer(blkcg->blkg_hint, NULL);

	/*
	 * Kill the percpu ref taken at creation time; once all holders
	 * drop theirs, the release path tears the blkg down.
	 */
	percpu_ref_kill(&blkg->refcnt);
}
574
/*
 * blkg_destroy_all - destroy all blkgs associated with @disk
 *
 * Destroys blkgs in batches of BLKG_DESTROY_BATCH_SIZE, dropping the queue
 * lock and rescheduling between batches so queues with many blkgs don't
 * spend too long with IRQs disabled.  Also clears all enabled policy bits
 * and q->root_blkg.
 */
static void blkg_destroy_all(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct blkcg_gq *blkg, *n;
	int count = BLKG_DESTROY_BATCH_SIZE;
	int i;

restart:
	spin_lock_irq(&q->queue_lock);
	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		/* already destroyed; still on the list pending free */
		if (hlist_unhashed(&blkg->blkcg_node))
			continue;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);

		/*
		 * in order to avoid holding the spin lock for too long,
		 * release it when a batch of blkgs are destroyed.
		 */
		if (!(--count)) {
			count = BLKG_DESTROY_BATCH_SIZE;
			spin_unlock_irq(&q->queue_lock);
			cond_resched();
			goto restart;
		}
	}

	/*
	 * Mark policies deactivated since offline has been done and the
	 * frees are scheduled, so later blkcg_deactivate_policy() calls
	 * can be bypassed.
	 */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (pol)
			__clear_bit(pol->plid, q->blkcg_pols);
	}

	q->root_blkg = NULL;
	spin_unlock_irq(&q->queue_lock);
}
621
/* write handler for the legacy "reset_stats" file: zero all blkg iostats
 * and invoke each policy's stat-reset callback */
static int blkcg_reset_stats(struct cgroup_subsys_state *css,
			     struct cftype *cftype, u64 val)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;
	int i, cpu;

	mutex_lock(&blkcg_pol_mutex);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		for_each_possible_cpu(cpu) {
			struct blkg_iostat_set *bis =
				per_cpu_ptr(blkg->iostat_cpu, cpu);
			memset(bis, 0, sizeof(*bis));

			/* Re-initialize the cleared blkg_iostat_set */
			u64_stats_init(&bis->sync);
			bis->blkg = blkg;
		}
		memset(&blkg->iostat, 0, sizeof(blkg->iostat));
		u64_stats_init(&blkg->iostat.sync);

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_reset_stats_fn)
				pol->pd_reset_stats_fn(blkg->pd[i]);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	mutex_unlock(&blkcg_pol_mutex);
	return 0;
}
662
663const char *blkg_dev_name(struct blkcg_gq *blkg)
664{
665 if (!blkg->q->disk)
666 return NULL;
667 return bdi_dev_name(blkg->q->disk->bdi);
668}
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * Invokes @prfill on each blkg of @blkcg for which @pol is enabled on the
 * blkg's queue.  @prfill is invoked with @sf, the policy data and @data.
 * If @show_total is %true, the sum of the return values from @prfill is
 * printed with a "Total" label at the end.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
		       u64 (*prfill)(struct seq_file *,
				     struct blkg_policy_data *, int),
		       const struct blkcg_policy *pol, int data,
		       bool show_total)
{
	struct blkcg_gq *blkg;
	u64 total = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		/* queue_lock protects the blkg's policy data */
		spin_lock_irq(&blkg->q->queue_lock);
		if (blkcg_policy_enabled(blkg->q, pol))
			total += prfill(sf, blkg->pd[pol->plid], data);
		spin_unlock_irq(&blkg->q->queue_lock);
	}
	rcu_read_unlock();

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
710
711
712
713
714
715
716
717
718
719u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
720{
721 const char *dname = blkg_dev_name(pd->blkg);
722
723 if (!dname)
724 return 0;
725
726 seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
727 return v;
728}
729EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
730
731
732
733
734
735
736
737
738
739
/**
 * blkg_conf_init - initialize a blkg_conf_ctx
 * @ctx: blkg_conf_ctx to initialize
 * @input: input string
 *
 * Initialize @ctx which can be used to parse blkg config input string
 * @input.  All other fields are zeroed.  Must eventually be paired with
 * blkg_conf_exit().
 */
void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input)
{
	*ctx = (struct blkg_conf_ctx){ .input = input };
}
EXPORT_SYMBOL_GPL(blkg_conf_init);
745
746
747
748
749
750
751
752
753
754
755
756
757
758
/**
 * blkg_conf_open_bdev - parse and open bdev for per-blkg config update
 * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
 *
 * Parse the "MAJ:MIN" device prefix of @ctx->input, open the matching
 * whole-device bdev into @ctx->bdev and point @ctx->body past the prefix.
 * On success the bdev's rq_qos_mutex is held (released by blkg_conf_exit()).
 *
 * This function may be called multiple times on @ctx; extra calls are NOOPs.
 * Returns 0 on success, -errno on failure.
 */
int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
{
	char *input = ctx->input;
	unsigned int major, minor;
	struct block_device *bdev;
	int key_len;

	/* already opened by a previous call */
	if (ctx->bdev)
		return 0;

	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
		return -EINVAL;

	input += key_len;
	/* the device prefix must be followed by whitespace */
	if (!isspace(*input))
		return -EINVAL;
	input = skip_spaces(input);

	bdev = blkdev_get_no_open(MKDEV(major, minor));
	if (!bdev)
		return -ENODEV;
	/* only whole devices can be configured */
	if (bdev_is_partition(bdev)) {
		blkdev_put_no_open(bdev);
		return -ENODEV;
	}

	mutex_lock(&bdev->bd_queue->rq_qos_mutex);
	if (!disk_live(bdev->bd_disk)) {
		blkdev_put_no_open(bdev);
		mutex_unlock(&bdev->bd_queue->rq_qos_mutex);
		return -ENODEV;
	}

	ctx->body = input;
	ctx->bdev = bdev;
	return 0;
}
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
 *
 * Parse per-blkg config update from @ctx->input and initialize @ctx
 * accordingly.  On success, @ctx->body points to the part of @ctx->input
 * following MAJ:MIN, @ctx->bdev points to the target block device and
 * @ctx->blkg to the blkg being configured.  Looks up - and if necessary
 * creates - the blkg, walking down from blkcg_root.
 *
 * blkg_conf_open_bdev() may be called on @ctx beforehand.  On success,
 * returns with queue_lock held; must be followed by blkg_conf_exit().
 * Returns -errno on failure.
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
		   struct blkg_conf_ctx *ctx)
	__acquires(&bdev->bd_queue->queue_lock)
{
	struct gendisk *disk;
	struct request_queue *q;
	struct blkcg_gq *blkg;
	int ret;

	ret = blkg_conf_open_bdev(ctx);
	if (ret)
		return ret;

	disk = ctx->bdev->bd_disk;
	q = disk->queue;

	/*
	 * Grab q_usage_counter to prevent racing with
	 * blkcg_deactivate_policy() while we work.
	 */
	ret = blk_queue_enter(q, 0);
	if (ret)
		goto fail;

	spin_lock_irq(&q->queue_lock);

	if (!blkcg_policy_enabled(q, pol)) {
		ret = -EOPNOTSUPP;
		goto fail_unlock;
	}

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		goto success;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent;
		struct blkcg_gq *new_blkg;

		/* find the highest ancestor missing a blkg */
		parent = blkcg_parent(blkcg);
		while (parent && !blkg_lookup(parent, q)) {
			pos = parent;
			parent = blkcg_parent(parent);
		}

		/* drop the lock so the allocation can use GFP_KERNEL */
		spin_unlock_irq(&q->queue_lock);

		new_blkg = blkg_alloc(pos, disk, GFP_KERNEL);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto fail_exit_queue;
		}

		if (radix_tree_preload(GFP_KERNEL)) {
			blkg_free(new_blkg);
			ret = -ENOMEM;
			goto fail_exit_queue;
		}

		spin_lock_irq(&q->queue_lock);

		/* re-check: policy may have been disabled while unlocked */
		if (!blkcg_policy_enabled(q, pol)) {
			blkg_free(new_blkg);
			ret = -EOPNOTSUPP;
			goto fail_preloaded;
		}

		blkg = blkg_lookup(pos, q);
		if (blkg) {
			/* somebody beat us to it - discard ours */
			blkg_free(new_blkg);
		} else {
			blkg = blkg_create(pos, disk, new_blkg);
			if (IS_ERR(blkg)) {
				ret = PTR_ERR(blkg);
				goto fail_preloaded;
			}
		}

		radix_tree_preload_end();

		if (pos == blkcg)
			goto success;
	}
success:
	blk_queue_exit(q);
	ctx->blkg = blkg;
	return 0;

fail_preloaded:
	radix_tree_preload_end();
fail_unlock:
	spin_unlock_irq(&q->queue_lock);
fail_exit_queue:
	blk_queue_exit(q);
fail:
	/*
	 * If queue was bypassing, we should retry.  Do so after a short
	 * msleep().  It isn't strictly necessary but queue can be
	 * bypassing for some time and it's always nice to avoid busy
	 * looping.
	 */
	if (ret == -EBUSY) {
		msleep(10);
		ret = restart_syscall();
	}
	return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);
926
927
928
929
930
931
932
933
/**
 * blkg_conf_exit - clean up per-blkg config update
 * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
 *
 * Clean up after a per-blkg config update: drop the queue_lock held by a
 * successful blkg_conf_prep(), release the rq_qos_mutex and bdev taken by
 * blkg_conf_open_bdev().  Must be called on all initialized blkg_conf_ctx's.
 */
void blkg_conf_exit(struct blkg_conf_ctx *ctx)
	__releases(&ctx->bdev->bd_queue->queue_lock)
	__releases(&ctx->bdev->bd_queue->rq_qos_mutex)
{
	if (ctx->blkg) {
		spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
		ctx->blkg = NULL;
	}

	if (ctx->bdev) {
		mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
		blkdev_put_no_open(ctx->bdev);
		ctx->body = NULL;
		ctx->bdev = NULL;
	}
}
EXPORT_SYMBOL_GPL(blkg_conf_exit);
951
952static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
953{
954 int i;
955
956 for (i = 0; i < BLKG_IOSTAT_NR; i++) {
957 dst->bytes[i] = src->bytes[i];
958 dst->ios[i] = src->ios[i];
959 }
960}
961
962static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
963{
964 int i;
965
966 for (i = 0; i < BLKG_IOSTAT_NR; i++) {
967 dst->bytes[i] += src->bytes[i];
968 dst->ios[i] += src->ios[i];
969 }
970}
971
972static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
973{
974 int i;
975
976 for (i = 0; i < BLKG_IOSTAT_NR; i++) {
977 dst->bytes[i] -= src->bytes[i];
978 dst->ios[i] -= src->ios[i];
979 }
980}
981
/*
 * Fold the difference between @cur and @last into @blkg->iostat.cur and
 * advance @last to @cur.  The update is wrapped in a u64_stats section so
 * lockless readers see a consistent snapshot.
 */
static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
				struct blkg_iostat *last)
{
	struct blkg_iostat delta;
	unsigned long flags;

	/* propagate percpu delta to global */
	flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
	blkg_iostat_set(&delta, cur);
	blkg_iostat_sub(&delta, last);
	blkg_iostat_add(&blkg->iostat.cur, &delta);
	blkg_iostat_add(last, &delta);
	u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
996
/*
 * Flush the iostats queued on @blkcg's per-cpu lockless list for @cpu into
 * each blkg's global counters, and propagate non-root blkg stats to the
 * parent blkg.
 */
static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
{
	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
	struct llist_node *lnode;
	struct blkg_iostat_set *bisc, *next_bisc;
	unsigned long flags;

	rcu_read_lock();

	lnode = llist_del_all(lhead);
	if (!lnode)
		goto out;

	/*
	 * blkg_stat_lock serializes blkg stat updates against the
	 * concurrent flush from __blkg_release().
	 */
	raw_spin_lock_irqsave(&blkg_stat_lock, flags);

	/*
	 * Iterate only the iostat_cpu's queued in the lockless list.
	 */
	llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
		struct blkcg_gq *blkg = bisc->blkg;
		struct blkcg_gq *parent = blkg->parent;
		struct blkg_iostat cur;
		unsigned int seq;

		/* mark as dequeued so the hot path can requeue it */
		WRITE_ONCE(bisc->lqueued, false);

		/* fetch the current per-cpu values under the seqcount */
		do {
			seq = u64_stats_fetch_begin(&bisc->sync);
			blkg_iostat_set(&cur, &bisc->cur);
		} while (u64_stats_fetch_retry(&bisc->sync, seq));

		blkcg_iostat_update(blkg, &cur, &bisc->last);

		/* propagate global delta to parent (except the root blkg) */
		if (parent && parent->parent)
			blkcg_iostat_update(parent, &blkg->iostat.cur,
					    &blkg->iostat.last);
	}
	raw_spin_unlock_irqrestore(&blkg_stat_lock, flags);
out:
	rcu_read_unlock();
}
1046
/* css_rstat_flush callback for the io controller */
static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
	/* Root-level stats are sourced from system-wide IO stats instead */
	if (cgroup_parent(css->cgroup))
		__blkcg_rstat_flush(css_to_blkcg(css), cpu);
}
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
/*
 * The root cgroup's stats are not tracked through the blkg machinery;
 * instead they are simulated here by summing the system-wide per-cpu disk
 * stats of every disk into its root blkg's iostat, so the same printing
 * code can be shared with non-root cgroups.
 */
static void blkcg_fill_root_iostats(void)
{
	struct class_dev_iter iter;
	struct device *dev;

	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
		struct block_device *bdev = dev_to_bdev(dev);
		struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg;
		struct blkg_iostat tmp;
		int cpu;
		unsigned long flags;

		memset(&tmp, 0, sizeof(tmp));
		for_each_possible_cpu(cpu) {
			struct disk_stats *cpu_dkstats;

			cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu);
			tmp.ios[BLKG_IOSTAT_READ] +=
				cpu_dkstats->ios[STAT_READ];
			tmp.ios[BLKG_IOSTAT_WRITE] +=
				cpu_dkstats->ios[STAT_WRITE];
			tmp.ios[BLKG_IOSTAT_DISCARD] +=
				cpu_dkstats->ios[STAT_DISCARD];

			/* sectors are 512 bytes, hence << 9 */
			tmp.bytes[BLKG_IOSTAT_READ] +=
				cpu_dkstats->sectors[STAT_READ] << 9;
			tmp.bytes[BLKG_IOSTAT_WRITE] +=
				cpu_dkstats->sectors[STAT_WRITE] << 9;
			tmp.bytes[BLKG_IOSTAT_DISCARD] +=
				cpu_dkstats->sectors[STAT_DISCARD] << 9;
		}

		flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
		blkg_iostat_set(&blkg->iostat.cur, &tmp);
		u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
	}
}
1104
/* emit one line of io.stat output for @blkg; caller holds blkg->q->queue_lock */
static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
{
	struct blkg_iostat_set *bis = &blkg->iostat;
	u64 rbytes, wbytes, rios, wios, dbytes, dios;
	const char *dname;
	unsigned seq;
	int i;

	if (!blkg->online)
		return;

	dname = blkg_dev_name(blkg);
	if (!dname)
		return;

	seq_printf(s, "%s ", dname);

	/* snapshot the counters consistently under the u64_stats seqcount */
	do {
		seq = u64_stats_fetch_begin(&bis->sync);

		rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
		wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
		dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
		rios = bis->cur.ios[BLKG_IOSTAT_READ];
		wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
		dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
	} while (u64_stats_fetch_retry(&bis->sync, seq));

	if (rbytes || wbytes || rios || wios) {
		seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
			rbytes, wbytes, rios, wios,
			dbytes, dios);
	}

	/* optional debug info, gated by blkcg_debug_stats */
	if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
		seq_printf(s, " use_delay=%d delay_nsec=%llu",
			atomic_read(&blkg->use_delay),
			atomic64_read(&blkg->delay_nsec));
	}

	/* let each policy append its own stats */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (!blkg->pd[i] || !pol->pd_stat_fn)
			continue;

		pol->pd_stat_fn(blkg->pd[i], s);
	}

	seq_puts(s, "\n");
}
1156
/* seq_show handler for the "io.stat" cgroup file */
static int blkcg_print_stat(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct blkcg_gq *blkg;

	/* root stats come from disk stats; others from the rstat machinery */
	if (!seq_css(sf)->parent)
		blkcg_fill_root_iostats();
	else
		cgroup_rstat_flush(blkcg->css.cgroup);

	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		spin_lock_irq(&blkg->q->queue_lock);
		blkcg_print_one_stat(blkg, sf);
		spin_unlock_irq(&blkg->q->queue_lock);
	}
	rcu_read_unlock();
	return 0;
}
1176
/* interface files for the default (cgroup v2) hierarchy */
static struct cftype blkcg_files[] = {
	{
		.name = "stat",
		.seq_show = blkcg_print_stat,
	},
	{ }	/* terminate */
};
1184
/* interface files for the legacy (cgroup v1, "blkio") hierarchy */
static struct cftype blkcg_legacy_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkcg_reset_stats,
	},
	{ }	/* terminate */
};
1192
#ifdef CONFIG_CGROUP_WRITEBACK
/* return the list head anchoring @css's blkcg cgroup-writeback instances */
struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css)
{
	return &css_to_blkcg(css)->cgwb_list;
}
#endif
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
/**
 * blkcg_destroy_blkgs - responsible for shooting down blkgs
 * @blkcg: blkcg of interest
 *
 * blkgs should be removed while holding both q and blkcg locks.  As the
 * blkcg lock is nested inside the queue lock here, this function uses
 * trylock on the queue lock and backs off (releasing blkcg->lock and
 * rescheduling) on contention to avoid deadlock and softlockups when many
 * blkgs have accumulated.
 */
static void blkcg_destroy_blkgs(struct blkcg *blkcg)
{
	might_sleep();

	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
						struct blkcg_gq, blkcg_node);
		struct request_queue *q = blkg->q;

		if (need_resched() || !spin_trylock(&q->queue_lock)) {
			/*
			 * Given that the system can accumulate a huge
			 * number of blkgs in pathological cases, check to
			 * see if we need to reschedule to avoid softlockup.
			 */
			spin_unlock_irq(&blkcg->lock);
			cond_resched();
			spin_lock_irq(&blkcg->lock);
			continue;
		}

		blkg_destroy(blkg);
		spin_unlock(&q->queue_lock);
	}

	spin_unlock_irq(&blkcg->lock);
}
1261
1262
1263
1264
1265
1266
1267
1268
1269
/**
 * blkcg_pin_online - pin online state
 * @blkcg_css: blkcg of interest
 *
 * While pinned, a blkcg's online state is kept; blkg destruction (see
 * blkcg_unpin_online()) is deferred until all pins are dropped.
 */
void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css)
{
	refcount_inc(&css_to_blkcg(blkcg_css)->online_pin);
}
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
/**
 * blkcg_unpin_online - unpin online state
 * @blkcg_css: blkcg of interest
 *
 * Drop an online pin.  When a blkcg's pin count reaches zero, its blkgs
 * are destroyed and the unpin is propagated to the parent (which was
 * pinned in blkcg_css_online()), walking towards the root.
 */
void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css)
{
	struct blkcg *blkcg = css_to_blkcg(blkcg_css);

	do {
		/* stop as soon as a level is still pinned */
		if (!refcount_dec_and_test(&blkcg->online_pin))
			break;
		blkcg_destroy_blkgs(blkcg);
		blkcg = blkcg_parent(blkcg);
	} while (blkcg);
}
1295
1296
1297
1298
1299
1300
1301
1302
1303
/**
 * blkcg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * Called when @css goes offline.  Offlines the writeback side first, then
 * drops the base online pin which, once all other pins are gone, triggers
 * blkg destruction via blkcg_unpin_online().
 */
static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
	/* offline cgroup-writeback for this blkcg */
	wb_blkcg_offline(css);

	/* drop the base online pin taken at css allocation time */
	blkcg_unpin_online(css);
}
1312
/* cgroup css_free callback: release all per-policy data and the blkcg itself */
static void blkcg_css_free(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	int i;

	/* blkcg_pol_mutex protects all_blkcgs and the cpd arrays */
	mutex_lock(&blkcg_pol_mutex);

	list_del(&blkcg->all_blkcgs_node);

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	mutex_unlock(&blkcg_pol_mutex);

	free_percpu(blkcg->lhead);
	kfree(blkcg);
}
1331
/*
 * cgroup css_alloc callback: allocate and initialize a blkcg, including its
 * per-cpu llists and per-cgroup policy data for all registered policies.
 * The root blkcg uses the statically allocated blkcg_root.
 */
static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct blkcg *blkcg;
	int i;

	mutex_lock(&blkcg_pol_mutex);

	if (!parent_css) {
		/* root blkcg is statically allocated */
		blkcg = &blkcg_root;
	} else {
		blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
		if (!blkcg)
			goto unlock;
	}

	if (init_blkcg_llists(blkcg))
		goto free_blkcg;

	for (i = 0; i < BLKCG_MAX_POLS ; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg_policy_data *cpd;

		/*
		 * If the policy hasn't been attached yet, wait for it
		 * to be attached before doing anything else. Otherwise,
		 * check if the policy requires any specific per-cgroup
		 * data: if it does, allocate and initialize it.
		 */
		if (!pol || !pol->cpd_alloc_fn)
			continue;

		cpd = pol->cpd_alloc_fn(GFP_KERNEL);
		if (!cpd)
			goto free_pd_blkcg;

		blkcg->cpd[i] = cpd;
		cpd->blkcg = blkcg;
		cpd->plid = i;
	}

	spin_lock_init(&blkcg->lock);
	/* base online pin; dropped in blkcg_css_offline() */
	refcount_set(&blkcg->online_pin, 1);
	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
	INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
	INIT_LIST_HEAD(&blkcg->cgwb_list);
#endif
	list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);

	mutex_unlock(&blkcg_pol_mutex);
	return &blkcg->css;

free_pd_blkcg:
	/* unwind only the cpds allocated so far */
	for (i--; i >= 0; i--)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
	free_percpu(blkcg->lhead);
free_blkcg:
	if (blkcg != &blkcg_root)
		kfree(blkcg);
unlock:
	mutex_unlock(&blkcg_pol_mutex);
	return ERR_PTR(-ENOMEM);
}
1397
/* cgroup css_online callback */
static int blkcg_css_online(struct cgroup_subsys_state *css)
{
	struct blkcg *parent = blkcg_parent(css_to_blkcg(css));

	/*
	 * Pin the parent's online state so that parents go offline only
	 * after all their children; the pin is dropped when this blkcg's
	 * own pin count hits zero (see blkcg_unpin_online()).
	 */
	if (parent)
		blkcg_pin_online(&parent->css);
	return 0;
}
1411
/*
 * blkcg_init_disk - set up blkcg state for a new gendisk
 *
 * Creates the root blkg for @disk's queue and initializes the ioprio and
 * throttle policies.  Returns 0 or -errno; on failure everything created
 * here is torn down again.
 */
int blkcg_init_disk(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct blkcg_gq *new_blkg, *blkg;
	bool preloaded;
	int ret;

	INIT_LIST_HEAD(&q->blkg_list);
	mutex_init(&q->blkcg_mutex);

	new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
	if (!new_blkg)
		return -ENOMEM;

	preloaded = !radix_tree_preload(GFP_KERNEL);

	/* Make sure the root blkg exists. */
	spin_lock_irq(&q->queue_lock);
	blkg = blkg_create(&blkcg_root, disk, new_blkg);
	if (IS_ERR(blkg))
		goto err_unlock;
	q->root_blkg = blkg;
	spin_unlock_irq(&q->queue_lock);

	if (preloaded)
		radix_tree_preload_end();

	ret = blk_ioprio_init(disk);
	if (ret)
		goto err_destroy_all;

	ret = blk_throtl_init(disk);
	if (ret)
		goto err_ioprio_exit;

	return 0;

err_ioprio_exit:
	blk_ioprio_exit(disk);
err_destroy_all:
	blkg_destroy_all(disk);
	return ret;
err_unlock:
	spin_unlock_irq(&q->queue_lock);
	if (preloaded)
		radix_tree_preload_end();
	return PTR_ERR(blkg);
}
1461
/* counterpart of blkcg_init_disk(): tear down all blkgs and the throttler */
void blkcg_exit_disk(struct gendisk *disk)
{
	blkg_destroy_all(disk);
	blk_throtl_exit(disk);
}
1467
1468static void blkcg_exit(struct task_struct *tsk)
1469{
1470 if (tsk->throttle_disk)
1471 put_disk(tsk->throttle_disk);
1472 tsk->throttle_disk = NULL;
1473}
1474
/* the io cgroup controller ("blkio" on the legacy hierarchy) */
struct cgroup_subsys io_cgrp_subsys = {
	.css_alloc = blkcg_css_alloc,
	.css_online = blkcg_css_online,
	.css_offline = blkcg_css_offline,
	.css_free = blkcg_css_free,
	.css_rstat_flush = blkcg_rstat_flush,
	.dfl_cftypes = blkcg_files,
	.legacy_cftypes = blkcg_legacy_files,
	.legacy_name = "blkio",
	.exit = blkcg_exit,
#ifdef CONFIG_MEMCG
	/*
	 * Depend on memcg being enabled together with io on the default
	 * hierarchy so the owning cgroup can be tracked for writeback.
	 */
	.depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
/**
 * blkcg_activate_policy - activate a blkcg policy on a gendisk
 * @disk: gendisk of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @disk: allocate, init and online a blkg_policy_data for
 * every existing blkg.  For mq queues the queue is frozen while populating
 * so the IO path doesn't observe partially activated state.  Requires
 * %GFP_KERNEL context.  The caller is responsible for synchronizing
 * [de]activations and policy [un]registrations.
 *
 * Returns 0 on success, -errno on failure (with everything rolled back).
 */
int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
{
	struct request_queue *q = disk->queue;
	struct blkg_policy_data *pd_prealloc = NULL;
	struct blkcg_gq *blkg, *pinned_blkg = NULL;
	int ret;

	if (blkcg_policy_enabled(q, pol))
		return 0;

	if (queue_is_mq(q))
		blk_mq_freeze_queue(q);
retry:
	spin_lock_irq(&q->queue_lock);

	/* blkg_list entries are added at the head; walk in reverse so
	 * parents are initialized before their children */
	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
		struct blkg_policy_data *pd;

		if (blkg->pd[pol->plid])
			continue;

		/* If prealloc matches, use it; otherwise try GFP_NOWAIT */
		if (blkg == pinned_blkg) {
			pd = pd_prealloc;
			pd_prealloc = NULL;
		} else {
			pd = pol->pd_alloc_fn(disk, blkg->blkcg,
					      GFP_NOWAIT | __GFP_NOWARN);
		}

		if (!pd) {
			/*
			 * GFP_NOWAIT failed.  Pin @blkg so it stays put,
			 * drop the lock, preallocate with GFP_KERNEL and
			 * retry the whole walk.
			 */
			if (pinned_blkg)
				blkg_put(pinned_blkg);
			blkg_get(blkg);
			pinned_blkg = blkg;

			spin_unlock_irq(&q->queue_lock);

			if (pd_prealloc)
				pol->pd_free_fn(pd_prealloc);
			pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg,
						       GFP_KERNEL);
			if (pd_prealloc)
				goto retry;
			else
				goto enomem;
		}

		spin_lock(&blkg->blkcg->lock);

		pd->blkg = blkg;
		pd->plid = pol->plid;
		blkg->pd[pol->plid] = pd;

		if (pol->pd_init_fn)
			pol->pd_init_fn(pd);

		if (pol->pd_online_fn)
			pol->pd_online_fn(pd);
		pd->online = true;

		spin_unlock(&blkg->blkcg->lock);
	}

	__set_bit(pol->plid, q->blkcg_pols);
	ret = 0;

	spin_unlock_irq(&q->queue_lock);
out:
	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q);
	if (pinned_blkg)
		blkg_put(pinned_blkg);
	if (pd_prealloc)
		pol->pd_free_fn(pd_prealloc);
	return ret;

enomem:
	/* alloc failed, take down everything installed so far */
	spin_lock_irq(&q->queue_lock);
	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;
		struct blkg_policy_data *pd;

		spin_lock(&blkcg->lock);
		pd = blkg->pd[pol->plid];
		if (pd) {
			if (pd->online && pol->pd_offline_fn)
				pol->pd_offline_fn(pd);
			pd->online = false;
			pol->pd_free_fn(pd);
			blkg->pd[pol->plid] = NULL;
		}
		spin_unlock(&blkcg->lock);
	}
	spin_unlock_irq(&q->queue_lock);
	ret = -ENOMEM;
	goto out;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);
1617
1618
1619
1620
1621
1622
1623
1624
1625
/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk
 * @disk: gendisk of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @disk.  Reverses the activation side of policy setup:
 * clears the enabled bit and frees every per-blkg policy data (pd) that was
 * installed for @pol, calling ->pd_offline_fn() first for pds that went
 * online.  No-op if @pol was not enabled.
 */
void blkcg_deactivate_policy(struct gendisk *disk,
			     const struct blkcg_policy *pol)
{
	struct request_queue *q = disk->queue;
	struct blkcg_gq *blkg;

	if (!blkcg_policy_enabled(q, pol))
		return;

	/* quiesce the queue so no new requests see the policy mid-teardown */
	if (queue_is_mq(q))
		blk_mq_freeze_queue(q);

	/* blkcg_mutex serializes against concurrent blkg creation/destruction */
	mutex_lock(&q->blkcg_mutex);
	spin_lock_irq(&q->queue_lock);

	__clear_bit(pol->plid, q->blkcg_pols);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		/* per-blkcg lock nests inside queue_lock */
		spin_lock(&blkcg->lock);
		if (blkg->pd[pol->plid]) {
			/* only offline pds that were actually brought online */
			if (blkg->pd[pol->plid]->online && pol->pd_offline_fn)
				pol->pd_offline_fn(blkg->pd[pol->plid]);
			pol->pd_free_fn(blkg->pd[pol->plid]);
			blkg->pd[pol->plid] = NULL;
		}
		spin_unlock(&blkcg->lock);
	}

	spin_unlock_irq(&q->queue_lock);
	mutex_unlock(&q->blkcg_mutex);

	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
1663
1664static void blkcg_free_all_cpd(struct blkcg_policy *pol)
1665{
1666 struct blkcg *blkcg;
1667
1668 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1669 if (blkcg->cpd[pol->plid]) {
1670 pol->cpd_free_fn(blkcg->cpd[pol->plid]);
1671 blkcg->cpd[pol->plid] = NULL;
1672 }
1673 }
1674}
1675
1676
1677
1678
1679
1680
1681
1682
1683int blkcg_policy_register(struct blkcg_policy *pol)
1684{
1685 struct blkcg *blkcg;
1686 int i, ret;
1687
1688 mutex_lock(&blkcg_pol_register_mutex);
1689 mutex_lock(&blkcg_pol_mutex);
1690
1691
1692 ret = -ENOSPC;
1693 for (i = 0; i < BLKCG_MAX_POLS; i++)
1694 if (!blkcg_policy[i])
1695 break;
1696 if (i >= BLKCG_MAX_POLS) {
1697 pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
1698 goto err_unlock;
1699 }
1700
1701
1702 if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
1703 (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
1704 goto err_unlock;
1705
1706
1707 pol->plid = i;
1708 blkcg_policy[pol->plid] = pol;
1709
1710
1711 if (pol->cpd_alloc_fn) {
1712 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
1713 struct blkcg_policy_data *cpd;
1714
1715 cpd = pol->cpd_alloc_fn(GFP_KERNEL);
1716 if (!cpd)
1717 goto err_free_cpds;
1718
1719 blkcg->cpd[pol->plid] = cpd;
1720 cpd->blkcg = blkcg;
1721 cpd->plid = pol->plid;
1722 }
1723 }
1724
1725 mutex_unlock(&blkcg_pol_mutex);
1726
1727
1728 if (pol->dfl_cftypes)
1729 WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
1730 pol->dfl_cftypes));
1731 if (pol->legacy_cftypes)
1732 WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
1733 pol->legacy_cftypes));
1734 mutex_unlock(&blkcg_pol_register_mutex);
1735 return 0;
1736
1737err_free_cpds:
1738 if (pol->cpd_free_fn)
1739 blkcg_free_all_cpd(pol);
1740
1741 blkcg_policy[pol->plid] = NULL;
1742err_unlock:
1743 mutex_unlock(&blkcg_pol_mutex);
1744 mutex_unlock(&blkcg_pol_register_mutex);
1745 return ret;
1746}
1747EXPORT_SYMBOL_GPL(blkcg_policy_register);
1748
1749
1750
1751
1752
1753
1754
/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol).  Removes the policy's cgroup interface
 * files, frees its per-blkcg data on all blkcgs, and releases its slot.
 */
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
	mutex_lock(&blkcg_pol_register_mutex);

	if (WARN_ON(blkcg_policy[pol->plid] != pol))
		goto out_unlock;

	/* kill the intf files first; cftype removal may sleep */
	if (pol->dfl_cftypes)
		cgroup_rm_cftypes(pol->dfl_cftypes);
	if (pol->legacy_cftypes)
		cgroup_rm_cftypes(pol->legacy_cftypes);

	/* remove cpds and unregister; blkcg_pol_mutex protects all_blkcgs */
	mutex_lock(&blkcg_pol_mutex);

	if (pol->cpd_free_fn)
		blkcg_free_all_cpd(pol);

	blkcg_policy[pol->plid] = NULL;

	mutex_unlock(&blkcg_pol_mutex);
out_unlock:
	mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
1781
1782
1783
1784
1785
1786
1787
/*
 * Scale the accumulated delay based on how long it has been since we updated
 * the delay.  We only call this when we are adding delay, in case it's been a
 * while since we added delay, and when we are checking to see if we need to
 * delay a task, to account for any delays that may have occurred.
 */
static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
{
	u64 old = atomic64_read(&blkg->delay_start);

	/* negative use_delay means the controller manages delay itself */
	if (atomic_read(&blkg->use_delay) < 0)
		return;

	/*
	 * We only want to scale down every second.  The idea here is that we
	 * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
	 * time window.  We only want to throttle tasks for recent delay that
	 * has occurred, in 1 second time windows since that's the maximum
	 * things can be throttled.  We save the current delay window in
	 * blkg->last_delay so we know what amount is still left to be charged
	 * to the blkg from this point onward.  blkg->last_use keeps track of
	 * the use_delay counter.  The idea is if we're unthrottling the blkg
	 * we should increase the scaling of the delay, so we only scale down
	 * when use_delay is below the last recorded value; otherwise subtract
	 * only what was charged in the elapsed window.
	 *
	 * The try_cmpxchg on delay_start ensures only one CPU performs the
	 * decay per window; losers simply skip it.
	 */
	if (time_before64(old + NSEC_PER_SEC, now) &&
	    atomic64_try_cmpxchg(&blkg->delay_start, &old, now)) {
		u64 cur = atomic64_read(&blkg->delay_nsec);
		u64 sub = min_t(u64, blkg->last_delay, now - old);
		int cur_use = atomic_read(&blkg->use_delay);

		/*
		 * We've been unthrottled, subtract a larger chunk of our
		 * accumulated delay.
		 */
		if (cur_use < blkg->last_use)
			sub = max_t(u64, sub, blkg->last_delay >> 1);

		/*
		 * This shouldn't happen, but handle it anyway.  Our delay_nsec
		 * should only ever be growing except here where we subtract out
		 * min(last_delay, 1 second), but lord knows bugs happen and we
		 * could deadlock the whole cgroup if we messed this up.
		 */
		if (unlikely(cur < sub)) {
			atomic64_set(&blkg->delay_nsec, 0);
			blkg->last_delay = 0;
		} else {
			atomic64_sub(sub, &blkg->delay_nsec);
			blkg->last_delay = cur - sub;
		}
		blkg->last_use = cur_use;
	}
}
1838
1839
1840
1841
1842
1843
1844
1845static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
1846{
1847 unsigned long pflags;
1848 bool clamp;
1849 u64 now = ktime_to_ns(ktime_get());
1850 u64 exp;
1851 u64 delay_nsec = 0;
1852 int tok;
1853
1854 while (blkg->parent) {
1855 int use_delay = atomic_read(&blkg->use_delay);
1856
1857 if (use_delay) {
1858 u64 this_delay;
1859
1860 blkcg_scale_delay(blkg, now);
1861 this_delay = atomic64_read(&blkg->delay_nsec);
1862 if (this_delay > delay_nsec) {
1863 delay_nsec = this_delay;
1864 clamp = use_delay > 0;
1865 }
1866 }
1867 blkg = blkg->parent;
1868 }
1869
1870 if (!delay_nsec)
1871 return;
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882 if (clamp)
1883 delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
1884
1885 if (use_memdelay)
1886 psi_memstall_enter(&pflags);
1887
1888 exp = ktime_add_ns(now, delay_nsec);
1889 tok = io_schedule_prepare();
1890 do {
1891 __set_current_state(TASK_KILLABLE);
1892 if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
1893 break;
1894 } while (!fatal_signal_pending(current));
1895 io_schedule_finish(tok);
1896
1897 if (use_memdelay)
1898 psi_memstall_leave(&pflags);
1899}
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
/**
 * blkcg_maybe_throttle_current - throttle the current task if it has delay
 *
 * Consumes current->throttle_disk / current->use_memdelay (set by
 * blkcg_schedule_throttle()) and, if the task's blkg on that disk has
 * accumulated delay, sleeps it off via blkcg_maybe_throttle_blkg().
 * Called on return to user space.  Drops the disk reference taken when the
 * throttle was scheduled.
 */
void blkcg_maybe_throttle_current(void)
{
	struct gendisk *disk = current->throttle_disk;
	struct blkcg *blkcg;
	struct blkcg_gq *blkg;
	bool use_memdelay = current->use_memdelay;

	if (!disk)
		return;

	/* claim and clear the pending throttle before we can sleep */
	current->throttle_disk = NULL;
	current->use_memdelay = false;

	rcu_read_lock();
	blkcg = css_to_blkcg(blkcg_css());
	if (!blkcg)
		goto out;
	blkg = blkg_lookup(blkcg, disk->queue);
	if (!blkg)
		goto out;
	/* pin the blkg so we can drop the RCU read lock before sleeping */
	if (!blkg_tryget(blkg))
		goto out;
	rcu_read_unlock();

	blkcg_maybe_throttle_blkg(blkg, use_memdelay);
	blkg_put(blkg);
	put_disk(disk);
	return;
out:
	rcu_read_unlock();
}
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
/**
 * blkcg_schedule_throttle - arrange for the current task to be throttled
 * @disk: disk to throttle the task against
 * @use_memdelay: also account the stall as memory pressure
 *
 * Record @disk on the current task and set TIF_NOTIFY_RESUME so that
 * blkcg_maybe_throttle_current() runs (and sleeps off any accumulated
 * delay) on return to user space.  Takes a reference on @disk's device
 * which is dropped when the throttle fires.  Kernel threads are never
 * throttled this way.
 */
void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay)
{
	if (unlikely(current->flags & PF_KTHREAD))
		return;

	if (current->throttle_disk != disk) {
		/* don't pin a disk that's already going away */
		if (test_bit(GD_DEAD, &disk->state))
			return;
		get_device(disk_to_dev(disk));

		/* replace any previously scheduled disk */
		if (current->throttle_disk)
			put_disk(current->throttle_disk);
		current->throttle_disk = disk;
	}

	/* only ever latch use_memdelay on; never clear a pending memstall */
	if (use_memdelay)
		current->use_memdelay = use_memdelay;
	set_notify_resume(current);
}
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
1990{
1991 if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
1992 return;
1993 blkcg_scale_delay(blkg, now);
1994 atomic64_add(delta, &blkg->delay_nsec);
1995}
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
2007 struct cgroup_subsys_state *css)
2008{
2009 struct blkcg_gq *blkg, *ret_blkg = NULL;
2010
2011 rcu_read_lock();
2012 blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk);
2013 while (blkg) {
2014 if (blkg_tryget(blkg)) {
2015 ret_blkg = blkg;
2016 break;
2017 }
2018 blkg = blkg->parent;
2019 }
2020 rcu_read_unlock();
2021
2022 return ret_blkg;
2023}
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039void bio_associate_blkg_from_css(struct bio *bio,
2040 struct cgroup_subsys_state *css)
2041{
2042 if (bio->bi_blkg)
2043 blkg_put(bio->bi_blkg);
2044
2045 if (css && css->parent) {
2046 bio->bi_blkg = blkg_tryget_closest(bio, css);
2047 } else {
2048 blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg);
2049 bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg;
2050 }
2051}
2052EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063void bio_associate_blkg(struct bio *bio)
2064{
2065 struct cgroup_subsys_state *css;
2066
2067 rcu_read_lock();
2068
2069 if (bio->bi_blkg)
2070 css = bio_blkcg_css(bio);
2071 else
2072 css = blkcg_css();
2073
2074 bio_associate_blkg_from_css(bio, css);
2075
2076 rcu_read_unlock();
2077}
2078EXPORT_SYMBOL_GPL(bio_associate_blkg);
2079
2080
2081
2082
2083
2084
2085void bio_clone_blkg_association(struct bio *dst, struct bio *src)
2086{
2087 if (src->bi_blkg)
2088 bio_associate_blkg_from_css(dst, bio_blkcg_css(src));
2089}
2090EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
2091
2092static int blk_cgroup_io_type(struct bio *bio)
2093{
2094 if (op_is_discard(bio->bi_opf))
2095 return BLKG_IOSTAT_DISCARD;
2096 if (op_is_write(bio->bi_opf))
2097 return BLKG_IOSTAT_WRITE;
2098 return BLKG_IOSTAT_READ;
2099}
2100
/*
 * Account the start of @bio against its blkg's per-cpu iostats and queue the
 * stat set for rstat flushing.  Only active on the cgroup2 (default)
 * hierarchy, and skipped for the root cgroup since its stats are not
 * flushed through rstat here.
 */
void blk_cgroup_bio_start(struct bio *bio)
{
	struct blkcg *blkcg = bio->bi_blkg->blkcg;
	int rwd = blk_cgroup_io_type(bio), cpu;
	struct blkg_iostat_set *bis;
	unsigned long flags;

	if (!cgroup_subsys_on_dfl(io_cgrp_subsys))
		return;

	/* root cgroup: nothing to account through rstat */
	if (!cgroup_parent(blkcg->css.cgroup))
		return;

	/* stay on one CPU so bis and lhead refer to the same per-cpu data */
	cpu = get_cpu();
	bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
	flags = u64_stats_update_begin_irqsave(&bis->sync);

	/*
	 * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
	 * bio and we would have already accounted for the size of the bio.
	 */
	if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
		bio_set_flag(bio, BIO_CGROUP_ACCT);
		bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
	}
	bis->cur.ios[rwd]++;

	/*
	 * Publish bis onto the per-cpu lockless list exactly once per flush
	 * cycle; lqueued is cleared by the flusher.  The llist_add happens
	 * inside the u64_stats section so the flusher sees consistent stats.
	 */
	if (!READ_ONCE(bis->lqueued)) {
		struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);

		llist_add(&bis->lnode, lhead);
		WRITE_ONCE(bis->lqueued, true);
	}

	u64_stats_update_end_irqrestore(&bis->sync, flags);
	/* tell rstat this cgroup has updated stats on this cpu */
	cgroup_rstat_updated(blkcg->css.cgroup, cpu);
	put_cpu();
}
2144
2145bool blk_cgroup_congested(void)
2146{
2147 struct cgroup_subsys_state *css;
2148 bool ret = false;
2149
2150 rcu_read_lock();
2151 for (css = blkcg_css(); css; css = css->parent) {
2152 if (atomic_read(&css->cgroup->congestion_count)) {
2153 ret = true;
2154 break;
2155 }
2156 }
2157 rcu_read_unlock();
2158 return ret;
2159}
2160
2161module_param(blkcg_debug_stats, bool, 0644);
2162MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
2163