// SPDX-License-Identifier: GPL-2.0
/*
 * Common Block IO controller cgroup interface
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
#include "blk.h"

/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]register operations including cgroup file additions /
 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
 * allows grabbing it from cgroup callbacks.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
EXPORT_SYMBOL_GPL(blkcg_root_css);

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */

bool blkcg_debug_stats = false;
static struct workqueue_struct *blkcg_punt_bio_wq;
static bool blkcg_policy_enabled(struct request_queue *q,
				 const struct blkcg_policy *pol)
{
	return pol && test_bit(pol->plid, q->blkcg_pols);
}
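
/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */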
static void blkg_free(struct blkcg_gq *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkg->pd[i])
			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);

	free_percpu(blkg->iostat_cpu);
	percpu_ref_exit(&blkg->refcnt);
	kfree(blkg);
}

static void __blkg_release(struct rcu_head *rcu)
{
	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);

	WARN_ON(!bio_list_empty(&blkg->async_bios));

	/* release the blkcg and parent blkg refs this blkg has been holding */
	css_put(&blkg->blkcg->css);
	if (blkg->parent)
		blkg_put(blkg->parent);
	blkg_free(blkg);
}

/*
 * A group is RCU protected, but having an rcu lock does not mean that one
 * can access all the fields of blkg and assume these are valid.  For
 * example, don't try to follow throtl_data and request queue links.
 *
 * Having a reference to blkg under an rcu allows accesses to only values
 * local to groups like group stats and group rate limits.
 */
static void blkg_release(struct percpu_ref *ref)
{
	struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);

	call_rcu(&blkg->rcu_head, __blkg_release);
}
static void blkg_async_bio_workfn(struct work_struct *work)
{
	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
					     async_bio_work);
	struct bio_list bios = BIO_EMPTY_LIST;
	struct bio *bio;
	struct blk_plug plug;
	bool need_plug = false;

	/* as long as there are pending bios, @blkg can't go away */
	spin_lock_bh(&blkg->async_bio_lock);
	bio_list_merge(&bios, &blkg->async_bios);
	bio_list_init(&blkg->async_bios);
	spin_unlock_bh(&blkg->async_bio_lock);

	/* start plug only when bio_list contains at least 2 bios */
	if (bios.head && bios.head->bi_next) {
		need_plug = true;
		blk_start_plug(&plug);
	}
	while ((bio = bio_list_pop(&bios)))
		submit_bio(bio);
	if (need_plug)
		blk_finish_plug(&plug);
}
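
/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 * @gfp_mask: allocation mask to use
 *
 * Allocate a new blkg associating @blkcg and @q.
 */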
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
				   gfp_t gfp_mask)
{
	struct blkcg_gq *blkg;
	int i, cpu;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
	if (!blkg)
		return NULL;

	if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
		goto err_free;

	blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
	if (!blkg->iostat_cpu)
		goto err_free;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	spin_lock_init(&blkg->async_bio_lock);
	bio_list_init(&blkg->async_bios);
	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
	blkg->blkcg = blkcg;

	u64_stats_init(&blkg->iostat.sync);
	for_each_possible_cpu(cpu)
		u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkg_policy_data *pd;

		if (!blkcg_policy_enabled(q, pol))
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = pol->pd_alloc_fn(gfp_mask, q, blkcg);
		if (!pd)
			goto err_free;

		blkg->pd[i] = pd;
		pd->blkg = blkg;
		pd->plid = i;
	}

	return blkg;

err_free:
	blkg_free(blkg);
	return NULL;
}

struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
				      struct request_queue *q, bool update_hint)
{
	struct blkcg_gq *blkg;

	/*
	 * Hint didn't match.  Look up from the radix tree.  Note that the
	 * hint can only be updated under queue_lock as otherwise @blkg
	 * could have already been removed from blkg_tree.  The caller is
	 * responsible for grabbing queue_lock if @update_hint.
	 */
	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
	if (blkg && blkg->q == q) {
		if (update_hint) {
			lockdep_assert_held(&q->queue_lock);
			rcu_assign_pointer(blkcg->blkg_hint, blkg);
		}
		return blkg;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
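
/*
 * blkg_create - create a blkg for the (@blkcg, @q) pair
 *
 * If @new_blkg is %NULL, this function tries to allocate a new one as
 * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
 */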
static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
				    struct request_queue *q,
				    struct blkcg_gq *new_blkg)
{
	struct blkcg_gq *blkg;
	int i, ret;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(&q->queue_lock);

	/* request_queue is dying, do not create/recreate a blkg */
	if (blk_queue_dying(q)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	/* blkg holds a reference to blkcg */
	if (!css_tryget_online(&blkcg->css)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	/* allocate */
	if (!new_blkg) {
		new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto err_put_css;
		}
	}
	blkg = new_blkg;

	/* link parent */
	if (blkcg_parent(blkcg)) {
		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
		if (WARN_ON_ONCE(!blkg->parent)) {
			ret = -ENODEV;
			goto err_put_css;
		}
		blkg_get(blkg->parent);
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_init_fn)
			pol->pd_init_fn(blkg->pd[i]);
	}

	/* insert */
	spin_lock(&blkcg->lock);
	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
	if (likely(!ret)) {
		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
		list_add(&blkg->q_node, &q->blkg_list);

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_online_fn)
				pol->pd_online_fn(blkg->pd[i]);
		}
	}
	blkg->online = true;
	spin_unlock(&blkcg->lock);

	if (!ret)
		return blkg;

	/* @blkg failed to be fully initialized, use the usual release path */
	blkg_put(blkg);
	return ERR_PTR(ret);

err_put_css:
	css_put(&blkcg->css);
err_free_blkg:
	blkg_free(new_blkg);
	return ERR_PTR(ret);
}
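
/**
 * blkg_lookup_create - lookup blkg, try to create one if not there
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
 * create one.  blkg creation is performed recursively from blkcg_root such
 * that all non-root blkgs have access to the parent blkg.  This function
 * should be called under RCU read lock and takes @q->queue_lock.
 *
 * Returns the blkg or the closest blkg if blkg_create() fails as it walks
 * down from root.
 */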
static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
		struct request_queue *q)
{
	struct blkcg_gq *blkg;
	unsigned long flags;

	WARN_ON_ONCE(!rcu_read_lock_held());

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	spin_lock_irqsave(&q->queue_lock, flags);
	blkg = __blkg_lookup(blkcg, q, true);
	if (blkg)
		goto found;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.  Returns the closest
	 * blkg to the intended blkg should blkg_create() fail.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent = blkcg_parent(blkcg);
		struct blkcg_gq *ret_blkg = q->root_blkg;

		while (parent) {
			blkg = __blkg_lookup(parent, q, false);
			if (blkg) {
				/* remember closest blkg */
				ret_blkg = blkg;
				break;
			}
			pos = parent;
			parent = blkcg_parent(parent);
		}

		blkg = blkg_create(pos, q, NULL);
		if (IS_ERR(blkg)) {
			blkg = ret_blkg;
			break;
		}
		if (pos == blkcg)
			break;
	}

found:
	spin_unlock_irqrestore(&q->queue_lock, flags);
	return blkg;
}

static void blkg_destroy(struct blkcg_gq *blkg)
{
	struct blkcg *blkcg = blkg->blkcg;
	int i;

	lockdep_assert_held(&blkg->q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_offline_fn)
			pol->pd_offline_fn(blkg->pd[i]);
	}

	blkg->online = false;

	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	/*
	 * Both setting lookup hint to and clearing it from @blkg are done
	 * under queue_lock.  If it's not pointing to @blkg now, it never
	 * will.  Hint assignment itself can race safely.
	 */
	if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
		rcu_assign_pointer(blkcg->blkg_hint, NULL);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	percpu_ref_kill(&blkg->refcnt);
}
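
/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 *
 * Destroy all blkgs associated with @q.
 */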
static void blkg_destroy_all(struct request_queue *q)
{
	struct blkcg_gq *blkg, *n;

	spin_lock_irq(&q->queue_lock);
	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	q->root_blkg = NULL;
	spin_unlock_irq(&q->queue_lock);
}

static int blkcg_reset_stats(struct cgroup_subsys_state *css,
			     struct cftype *cftype, u64 val)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;
	int i, cpu;

	mutex_lock(&blkcg_pol_mutex);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		for_each_possible_cpu(cpu) {
			struct blkg_iostat_set *bis =
				per_cpu_ptr(blkg->iostat_cpu, cpu);
			memset(bis, 0, sizeof(*bis));
		}
		memset(&blkg->iostat, 0, sizeof(blkg->iostat));

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_reset_stats_fn)
				pol->pd_reset_stats_fn(blkg->pd[i]);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	mutex_unlock(&blkcg_pol_mutex);
	return 0;
}

const char *blkg_dev_name(struct blkcg_gq *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info->dev)
		return bdi_dev_name(blkg->q->backing_dev_info);
	return NULL;
}
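
/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data.  If @show_total is %true, the sum of the return
 * values from @prfill is printed with "Total" label at the end.
 */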
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
		       u64 (*prfill)(struct seq_file *,
				     struct blkg_policy_data *, int),
		       const struct blkcg_policy *pol, int data,
		       bool show_total)
{
	struct blkcg_gq *blkg;
	u64 total = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		spin_lock_irq(&blkg->q->queue_lock);
		if (blkcg_policy_enabled(blkg->q, pol))
			total += prfill(sf, blkg->pd[pol->plid], data);
		spin_unlock_irq(&blkg->q->queue_lock);
	}
	rcu_read_unlock();

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
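
/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */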
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/* look up the blkg for (@blkcg, @q) after checking that @pol is enabled */
static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
					  const struct blkcg_policy *pol,
					  struct request_queue *q)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(&q->queue_lock);

	if (!blkcg_policy_enabled(q, pol))
		return ERR_PTR(-EOPNOTSUPP);
	return __blkg_lookup(blkcg, q, true /* update_hint */);
}
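
/**
 * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update
 * @inputp: input string pointer
 *
 * Parse the device node prefix part, MAJ:MIN, of per-blkg config update
 * from @input and get and return the matching bdev.  *@inputp is
 * updated to point past the device node prefix.  Returns an ERR_PTR()
 * value on error.
 */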
struct block_device *blkcg_conf_open_bdev(char **inputp)
{
	char *input = *inputp;
	unsigned int major, minor;
	struct block_device *bdev;
	int key_len;

	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
		return ERR_PTR(-EINVAL);

	input += key_len;
	if (!isspace(*input))
		return ERR_PTR(-EINVAL);
	input = skip_spaces(input);

	bdev = blkdev_get_no_open(MKDEV(major, minor));
	if (!bdev)
		return ERR_PTR(-ENODEV);
	if (bdev_is_partition(bdev)) {
		blkdev_put_no_open(bdev);
		return ERR_PTR(-ENODEV);
	}

	*inputp = input;
	return bdev;
}
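
/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
 * config part of @input following MAJ:MIN.  This function returns with
 * RCU read lock and queue lock held and must be paired with
 * blkg_conf_finish().
 */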
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
		   char *input, struct blkg_conf_ctx *ctx)
	__acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock)
{
	struct block_device *bdev;
	struct request_queue *q;
	struct blkcg_gq *blkg;
	int ret;

	bdev = blkcg_conf_open_bdev(&input);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	q = bdev->bd_disk->queue;

	rcu_read_lock();
	spin_lock_irq(&q->queue_lock);

	blkg = blkg_lookup_check(blkcg, pol, q);
	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		goto fail_unlock;
	}

	if (blkg)
		goto success;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent;
		struct blkcg_gq *new_blkg;

		parent = blkcg_parent(blkcg);
		while (parent && !__blkg_lookup(parent, q, false)) {
			pos = parent;
			parent = blkcg_parent(parent);
		}

		/* Drop locks to do new blkg allocation with GFP_KERNEL. */
		spin_unlock_irq(&q->queue_lock);
		rcu_read_unlock();

		new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto fail;
		}

		if (radix_tree_preload(GFP_KERNEL)) {
			blkg_free(new_blkg);
			ret = -ENOMEM;
			goto fail;
		}

		rcu_read_lock();
		spin_lock_irq(&q->queue_lock);

		blkg = blkg_lookup_check(pos, pol, q);
		if (IS_ERR(blkg)) {
			ret = PTR_ERR(blkg);
			blkg_free(new_blkg);
			goto fail_preloaded;
		}

		if (blkg) {
			blkg_free(new_blkg);
		} else {
			blkg = blkg_create(pos, q, new_blkg);
			if (IS_ERR(blkg)) {
				ret = PTR_ERR(blkg);
				goto fail_preloaded;
			}
		}

		radix_tree_preload_end();

		if (pos == blkcg)
			goto success;
	}
success:
	ctx->bdev = bdev;
	ctx->blkg = blkg;
	ctx->body = input;
	return 0;

fail_preloaded:
	radix_tree_preload_end();
fail_unlock:
	spin_unlock_irq(&q->queue_lock);
	rcu_read_unlock();
fail:
	blkdev_put_no_open(bdev);
	/*
	 * If queue was bypassing, we should retry.  Do so after a
	 * short msleep().  It isn't strictly necessary but queue
	 * can be bypassing for some time and it's always nice to
	 * avoid busy looping.
	 */
	if (ret == -EBUSY) {
		msleep(10);
		ret = restart_syscall();
	}
	return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);
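
/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */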
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
	__releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu)
{
	spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock);
	rcu_read_unlock();
	blkdev_put_no_open(ctx->bdev);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);

static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] = src->bytes[i];
		dst->ios[i] = src->ios[i];
	}
}

static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] += src->bytes[i];
		dst->ios[i] += src->ios[i];
	}
}

static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] -= src->bytes[i];
		dst->ios[i] -= src->ios[i];
	}
}

static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;

	/* Root-level stats are sourced from system-wide IO stats */
	if (!cgroup_parent(css->cgroup))
		return;

	rcu_read_lock();

	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		struct blkcg_gq *parent = blkg->parent;
		struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
		struct blkg_iostat cur, delta;
		unsigned long flags;
		unsigned int seq;

		/* fetch the current per-cpu values */
		do {
			seq = u64_stats_fetch_begin(&bisc->sync);
			blkg_iostat_set(&cur, &bisc->cur);
		} while (u64_stats_fetch_retry(&bisc->sync, seq));

		/* propagate percpu delta to global */
		flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
		blkg_iostat_set(&delta, &cur);
		blkg_iostat_sub(&delta, &bisc->last);
		blkg_iostat_add(&blkg->iostat.cur, &delta);
		blkg_iostat_add(&bisc->last, &delta);
		u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);

		/* propagate global delta to parent (unless that's the root) */
		if (parent && parent->parent) {
			flags = u64_stats_update_begin_irqsave(&parent->iostat.sync);
			blkg_iostat_set(&delta, &blkg->iostat.cur);
			blkg_iostat_sub(&delta, &blkg->iostat.last);
			blkg_iostat_add(&parent->iostat.cur, &delta);
			blkg_iostat_add(&blkg->iostat.last, &delta);
			u64_stats_update_end_irqrestore(&parent->iostat.sync, flags);
		}
	}

	rcu_read_unlock();
}
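
/*
 * We source root cgroup stats from the system-wide stats to avoid
 * tracking the same information twice and incurring overhead when no
 * cgroups are defined.  For that reason, cgroup_rstat_flush() in
 * blkcg_print_stat() does not actually fill out the iostat in the root
 * cgroup's blkcg_gq.  Instead, the root cgroup's stats are simulated by
 * explicitly filling in the iostat with disk-level statistics.
 */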
static void blkcg_fill_root_iostats(void)
{
	struct class_dev_iter iter;
	struct device *dev;

	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
		struct block_device *bdev = dev_to_bdev(dev);
		struct blkcg_gq *blkg =
			blk_queue_root_blkg(bdev->bd_disk->queue);
		struct blkg_iostat tmp;
		int cpu;

		memset(&tmp, 0, sizeof(tmp));
		for_each_possible_cpu(cpu) {
			struct disk_stats *cpu_dkstats;
			unsigned long flags;

			cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu);
			tmp.ios[BLKG_IOSTAT_READ] +=
				cpu_dkstats->ios[STAT_READ];
			tmp.ios[BLKG_IOSTAT_WRITE] +=
				cpu_dkstats->ios[STAT_WRITE];
			tmp.ios[BLKG_IOSTAT_DISCARD] +=
				cpu_dkstats->ios[STAT_DISCARD];

			tmp.bytes[BLKG_IOSTAT_READ] +=
				cpu_dkstats->sectors[STAT_READ] << 9;
			tmp.bytes[BLKG_IOSTAT_WRITE] +=
				cpu_dkstats->sectors[STAT_WRITE] << 9;
			tmp.bytes[BLKG_IOSTAT_DISCARD] +=
				cpu_dkstats->sectors[STAT_DISCARD] << 9;

			flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
			blkg_iostat_set(&blkg->iostat.cur, &tmp);
			u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
		}
	}
}

static int blkcg_print_stat(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct blkcg_gq *blkg;

	if (!seq_css(sf)->parent)
		blkcg_fill_root_iostats();
	else
		cgroup_rstat_flush(blkcg->css.cgroup);

	rcu_read_lock();

	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		struct blkg_iostat_set *bis = &blkg->iostat;
		const char *dname;
		char *buf;
		u64 rbytes, wbytes, rios, wios, dbytes, dios;
		size_t size = seq_get_buf(sf, &buf), off = 0;
		int i;
		bool has_stats = false;
		unsigned seq;

		spin_lock_irq(&blkg->q->queue_lock);

		if (!blkg->online)
			goto skip;

		dname = blkg_dev_name(blkg);
		if (!dname)
			goto skip;

		/*
		 * Hooray string manipulation, count is the size written NOT
		 * INCLUDING THE \0, so size is now count+1 less than what we
		 * used to do above...
		 */
		off += scnprintf(buf+off, size-off, "%s ", dname);

		do {
			seq = u64_stats_fetch_begin(&bis->sync);

			rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
			wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
			dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
			rios = bis->cur.ios[BLKG_IOSTAT_READ];
			wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
			dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
		} while (u64_stats_fetch_retry(&bis->sync, seq));

		if (rbytes || wbytes || rios || wios) {
			has_stats = true;
			off += scnprintf(buf+off, size-off,
					 "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
					 rbytes, wbytes, rios, wios,
					 dbytes, dios);
		}

		if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
			has_stats = true;
			off += scnprintf(buf+off, size-off,
					 " use_delay=%d delay_nsec=%llu",
					 atomic_read(&blkg->use_delay),
					 (unsigned long long)atomic64_read(&blkg->delay_nsec));
		}

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];
			size_t written;

			if (!blkg->pd[i] || !pol->pd_stat_fn)
				continue;

			written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
			if (written)
				has_stats = true;
			off += written;
		}

		if (has_stats) {
			if (off < size - 1) {
				off += scnprintf(buf+off, size-off, "\n");
				seq_commit(sf, off);
			} else {
				seq_commit(sf, -1);
			}
		}
	skip:
		spin_unlock_irq(&blkg->q->queue_lock);
	}

	rcu_read_unlock();
	return 0;
}

static struct cftype blkcg_files[] = {
	{
		.name = "stat",
		.seq_show = blkcg_print_stat,
	},
	{ }	/* terminate */
};

static struct cftype blkcg_legacy_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkcg_reset_stats,
	},
	{ }	/* terminate */
};
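
/*
 * blkcg destruction is a three-stage process.
 *
 * 1. Destruction starts.  The blkcg_css_offline() callback is invoked
 *    which offlines writeback and drops the base online pin.  This ties
 *    the next stage of blkg destruction to the completion of writeback
 *    associated with the blkcg, so large amounts of outstanding writeback
 *    aren't punted to the root while ongoing policies are maintained.
 *
 * 2. Once all online pins are released (i.e. writeback has finished),
 *    blkcg_destroy_blkgs() is called and all blkgs are destroyed,
 *    releasing the references they hold on the blkcg's css.
 *
 * 3. When the blkcg css refcount reaches zero, blkcg_css_free() is called
 *    and the blkcg is freed.
 */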
static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);

	/* this prevents anyone from attaching or migrating to this blkcg */
	wb_blkcg_offline(blkcg);

	/* put the base online pin allowing step 2 to be triggered */
	blkcg_unpin_online(blkcg);
}
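
/**
 * blkcg_destroy_blkgs - responsible for shooting down blkgs
 * @blkcg: blkcg of interest
 *
 * blkgs should be removed while holding both q and blkcg locks.  As blkcg
 * lock is nested inside q lock, this function performs reverse double lock
 * dancing.  Destroying the blkgs releases the references held on the
 * blkcg's css, eventually allowing blkcg_css_free() to be called.
 */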
void blkcg_destroy_blkgs(struct blkcg *blkcg)
{
	might_sleep();

	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
						    struct blkcg_gq, blkcg_node);
		struct request_queue *q = blkg->q;

		if (need_resched() || !spin_trylock(&q->queue_lock)) {
			/*
			 * Given that the system can accumulate a huge number
			 * of blkgs in pathological cases, check to see if we
			 * need to reschedule to avoid a softlockup.
			 */
			spin_unlock_irq(&blkcg->lock);
			cond_resched();
			spin_lock_irq(&blkcg->lock);
			continue;
		}

		blkg_destroy(blkg);
		spin_unlock(&q->queue_lock);
	}

	spin_unlock_irq(&blkcg->lock);
}

static void blkcg_css_free(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	int i;

	mutex_lock(&blkcg_pol_mutex);

	list_del(&blkcg->all_blkcgs_node);

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	mutex_unlock(&blkcg_pol_mutex);

	kfree(blkcg);
}

static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct blkcg *blkcg;
	struct cgroup_subsys_state *ret;
	int i;

	mutex_lock(&blkcg_pol_mutex);

	if (!parent_css) {
		blkcg = &blkcg_root;
	} else {
		blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
		if (!blkcg) {
			ret = ERR_PTR(-ENOMEM);
			goto unlock;
		}
	}

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg_policy_data *cpd;

		/*
		 * If the policy hasn't been attached yet, wait for it
		 * to be attached before doing anything else.  Otherwise,
		 * check if the policy requires any specific per-cgroup
		 * data: if it does, allocate and initialize it.
		 */
		if (!pol || !pol->cpd_alloc_fn)
			continue;

		cpd = pol->cpd_alloc_fn(GFP_KERNEL);
		if (!cpd) {
			ret = ERR_PTR(-ENOMEM);
			goto free_pd_blkcg;
		}
		blkcg->cpd[i] = cpd;
		cpd->blkcg = blkcg;
		cpd->plid = i;
		if (pol->cpd_init_fn)
			pol->cpd_init_fn(cpd);
	}

	spin_lock_init(&blkcg->lock);
	refcount_set(&blkcg->online_pin, 1);
	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
	INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
	INIT_LIST_HEAD(&blkcg->cgwb_list);
#endif
	list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);

	mutex_unlock(&blkcg_pol_mutex);
	return &blkcg->css;

free_pd_blkcg:
	for (i--; i >= 0; i--)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	if (blkcg != &blkcg_root)
		kfree(blkcg);
unlock:
	mutex_unlock(&blkcg_pol_mutex);
	return ret;
}

static int blkcg_css_online(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg *parent = blkcg_parent(blkcg);

	/*
	 * blkcg_pin_online() is used to delay blkcg offline so that blkgs
	 * don't go offline while cgwbs are still active on them.  Pin the
	 * parent so that offline always happens towards the root.
	 */
	if (parent)
		blkcg_pin_online(parent);
	return 0;
}
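
/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue().  Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */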
int blkcg_init_queue(struct request_queue *q)
{
	struct blkcg_gq *new_blkg, *blkg;
	bool preloaded;
	int ret;

	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
	if (!new_blkg)
		return -ENOMEM;

	preloaded = !radix_tree_preload(GFP_KERNEL);

	/* Make sure the root blkg exists. */
	rcu_read_lock();
	spin_lock_irq(&q->queue_lock);
	blkg = blkg_create(&blkcg_root, q, new_blkg);
	if (IS_ERR(blkg))
		goto err_unlock;
	q->root_blkg = blkg;
	spin_unlock_irq(&q->queue_lock);
	rcu_read_unlock();

	if (preloaded)
		radix_tree_preload_end();

	ret = blk_throtl_init(q);
	if (ret)
		goto err_destroy_all;

	ret = blk_iolatency_init(q);
	if (ret) {
		blk_throtl_exit(q);
		goto err_destroy_all;
	}
	return 0;

err_destroy_all:
	blkg_destroy_all(q);
	return ret;
err_unlock:
	spin_unlock_irq(&q->queue_lock);
	rcu_read_unlock();
	if (preloaded)
		radix_tree_preload_end();
	return PTR_ERR(blkg);
}
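
/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_exit_queue().  Responsible for exiting blkcg part.
 */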
void blkcg_exit_queue(struct request_queue *q)
{
	blkg_destroy_all(q);
	blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkcg_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *dst_css;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, dst_css, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

static void blkcg_bind(struct cgroup_subsys_state *root_css)
{
	int i;

	mutex_lock(&blkcg_pol_mutex);

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg *blkcg;

		if (!pol || !pol->cpd_bind_fn)
			continue;

		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
			if (blkcg->cpd[pol->plid])
				pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
	}
	mutex_unlock(&blkcg_pol_mutex);
}

static void blkcg_exit(struct task_struct *tsk)
{
	if (tsk->throttle_queue)
		blk_put_queue(tsk->throttle_queue);
	tsk->throttle_queue = NULL;
}

struct cgroup_subsys io_cgrp_subsys = {
	.css_alloc = blkcg_css_alloc,
	.css_online = blkcg_css_online,
	.css_offline = blkcg_css_offline,
	.css_free = blkcg_css_free,
	.can_attach = blkcg_can_attach,
	.css_rstat_flush = blkcg_rstat_flush,
	.bind = blkcg_bind,
	.dfl_cftypes = blkcg_files,
	.legacy_cftypes = blkcg_legacy_files,
	.legacy_name = "blkio",
	.exit = blkcg_exit,
#ifdef CONFIG_MEMCG
	/*
	 * This ensures that, if available, memcg is automatically enabled
	 * together on the default hierarchy so that the owner cgroup can
	 * be retrieved from writeback pages.
	 */
	.depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);
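
/**
 * blkcg_activate_policy - activate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q is frozen while
 * its existing blkgs are populated with policy data for @pol, so nobody
 * accesses the blkgs from the IO path during activation.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registrations.  Returns 0 on success, -errno on failure.
 */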
int blkcg_activate_policy(struct request_queue *q,
			  const struct blkcg_policy *pol)
{
	struct blkg_policy_data *pd_prealloc = NULL;
	struct blkcg_gq *blkg, *pinned_blkg = NULL;
	int ret;

	if (blkcg_policy_enabled(q, pol))
		return 0;

	if (queue_is_mq(q))
		blk_mq_freeze_queue(q);
retry:
	spin_lock_irq(&q->queue_lock);

	/* blkg_list is pushed at the head, reverse walk to allocate parents first */
	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
		struct blkg_policy_data *pd;

		if (blkg->pd[pol->plid])
			continue;

		/* If prealloc matches, use it; otherwise try GFP_NOWAIT */
		if (blkg == pinned_blkg) {
			pd = pd_prealloc;
			pd_prealloc = NULL;
		} else {
			pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q,
					      blkg->blkcg);
		}

		if (!pd) {
			/*
			 * GFP_NOWAIT failed.  Free the existing one and
			 * prealloc for @blkg w/ GFP_KERNEL.
			 */
			if (pinned_blkg)
				blkg_put(pinned_blkg);
			blkg_get(blkg);
			pinned_blkg = blkg;

			spin_unlock_irq(&q->queue_lock);

			if (pd_prealloc)
				pol->pd_free_fn(pd_prealloc);
			pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q,
						       blkg->blkcg);
			if (pd_prealloc)
				goto retry;
			else
				goto enomem;
		}

		blkg->pd[pol->plid] = pd;
		pd->blkg = blkg;
		pd->plid = pol->plid;
	}

	/* all allocated, init in the same order */
	if (pol->pd_init_fn)
		list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
			pol->pd_init_fn(blkg->pd[pol->plid]);

	__set_bit(pol->plid, q->blkcg_pols);
	ret = 0;

	spin_unlock_irq(&q->queue_lock);
out:
	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q);
	if (pinned_blkg)
		blkg_put(pinned_blkg);
	if (pd_prealloc)
		pol->pd_free_fn(pd_prealloc);
	return ret;

enomem:
	/* alloc failed, nothing's initialized yet, free everything */
	spin_lock_irq(&q->queue_lock);
	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		if (blkg->pd[pol->plid]) {
			pol->pd_free_fn(blkg->pd[pol->plid]);
			blkg->pd[pol->plid] = NULL;
		}
	}
	spin_unlock_irq(&q->queue_lock);
	ret = -ENOMEM;
	goto out;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);
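
/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @q.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */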
void blkcg_deactivate_policy(struct request_queue *q,
			     const struct blkcg_policy *pol)
{
	struct blkcg_gq *blkg;

	if (!blkcg_policy_enabled(q, pol))
		return;

	if (queue_is_mq(q))
		blk_mq_freeze_queue(q);

	spin_lock_irq(&q->queue_lock);

	__clear_bit(pol->plid, q->blkcg_pols);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		if (blkg->pd[pol->plid]) {
			if (pol->pd_offline_fn)
				pol->pd_offline_fn(blkg->pd[pol->plid]);
			pol->pd_free_fn(blkg->pd[pol->plid]);
			blkg->pd[pol->plid] = NULL;
		}
	}

	spin_unlock_irq(&q->queue_lock);

	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
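
/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
 */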
int blkcg_policy_register(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;
	int i, ret;

	mutex_lock(&blkcg_pol_register_mutex);
	mutex_lock(&blkcg_pol_mutex);

	/* find an empty slot */
	ret = -ENOSPC;
	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (!blkcg_policy[i])
			break;
	if (i >= BLKCG_MAX_POLS) {
		pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
		goto err_unlock;
	}

	/* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
	if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
	    (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
		goto err_unlock;

	/* register @pol */
	pol->plid = i;
	blkcg_policy[pol->plid] = pol;

	/* allocate and install cpd's */
	if (pol->cpd_alloc_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			struct blkcg_policy_data *cpd;

			cpd = pol->cpd_alloc_fn(GFP_KERNEL);
			if (!cpd)
				goto err_free_cpds;

			blkcg->cpd[pol->plid] = cpd;
			cpd->blkcg = blkcg;
			cpd->plid = pol->plid;
			if (pol->cpd_init_fn)
				pol->cpd_init_fn(cpd);
		}
	}

	mutex_unlock(&blkcg_pol_mutex);

	/* everything is in place, add intf files for the new policy */
	if (pol->dfl_cftypes)
		WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
					       pol->dfl_cftypes));
	if (pol->legacy_cftypes)
		WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
						  pol->legacy_cftypes));
	mutex_unlock(&blkcg_pol_register_mutex);
	return 0;

err_free_cpds:
	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;
err_unlock:
	mutex_unlock(&blkcg_pol_mutex);
	mutex_unlock(&blkcg_pol_register_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(blkcg_policy_register);
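
/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol).  Might sleep.
 */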
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;

	mutex_lock(&blkcg_pol_register_mutex);

	if (WARN_ON(blkcg_policy[pol->plid] != pol))
		goto out_unlock;

	/* kill the intf files first */
	if (pol->dfl_cftypes)
		cgroup_rm_cftypes(pol->dfl_cftypes);
	if (pol->legacy_cftypes)
		cgroup_rm_cftypes(pol->legacy_cftypes);

	/* remove cpds and unregister */
	mutex_lock(&blkcg_pol_mutex);

	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;

	mutex_unlock(&blkcg_pol_mutex);
out_unlock:
	mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);

bool __blkcg_punt_bio_submit(struct bio *bio)
{
	struct blkcg_gq *blkg = bio->bi_blkg;

	/* consume the flag first */
	bio->bi_opf &= ~REQ_CGROUP_PUNT;

	/* never bounce for the root cgroup */
	if (!blkg->parent)
		return false;

	spin_lock_bh(&blkg->async_bio_lock);
	bio_list_add(&blkg->async_bios, bio);
	spin_unlock_bh(&blkg->async_bio_lock);

	queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
	return true;
}
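
/*
 * Scale the accumulated delay based on how long it has been since we updated
 * the delay.  We only call this when we are adding delay, in case it's been a
 * while since we added delay, and when we are checking to see if we need to
 * delay a task, to account for any delays that may have occurred.
 */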
static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
{
	u64 old = atomic64_read(&blkg->delay_start);

	/* negative use_delay means no scaling, see blkcg_set_delay() */
	if (atomic_read(&blkg->use_delay) < 0)
		return;

	/*
	 * We only want to scale down every second.  The idea here is that we
	 * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
	 * time window.  We only want to throttle tasks for recent delay that
	 * has occurred, in 1 second time windows since that's the maximum
	 * things can be throttled.  We save the current delay window in
	 * blkg->last_delay so we know what amount is still left to be charged
	 * to the blkg from this point onward.  blkg->last_use keeps track of
	 * the use_delay counter.  The idea is if we're unthrottling the blkg
	 * we are ok with whatever is happening now, and we can take away more
	 * of the accumulated delay as we've already throttled enough that
	 * everybody is happy with their IO latencies.
	 */
	if (time_before64(old + NSEC_PER_SEC, now) &&
	    atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
		u64 cur = atomic64_read(&blkg->delay_nsec);
		u64 sub = min_t(u64, blkg->last_delay, now - old);
		int cur_use = atomic_read(&blkg->use_delay);

		/*
		 * We've been unthrottled, subtract a larger chunk of our
		 * accumulated delay.
		 */
		if (cur_use < blkg->last_use)
			sub = max_t(u64, sub, blkg->last_delay >> 1);

		/*
		 * This shouldn't happen, but handle it anyway.  Our delay_nsec
		 * should only ever be growing except here where we subtract
		 * out min(last_delay, 1 second), but lord knows bugs happen
		 * and we could deadlock the system if we didn't account for
		 * this case.
		 */
		if (unlikely(cur < sub)) {
			atomic64_set(&blkg->delay_nsec, 0);
			blkg->last_delay = 0;
		} else {
			atomic64_sub(sub, &blkg->delay_nsec);
			blkg->last_delay = cur - sub;
		}
		blkg->last_use = cur_use;
	}
}
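
/*
 * This is called when we want to actually walk up the hierarchy and check to
 * see if we need to throttle, and then actually throttle if there is some
 * accumulated delay.  This should only be called upon return to user space so
 * we're not holding some lock that would induce a priority inversion.
 */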
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
	unsigned long pflags;
	bool clamp;
	u64 now = ktime_to_ns(ktime_get());
	u64 exp;
	u64 delay_nsec = 0;
	int tok;

	while (blkg->parent) {
		int use_delay = atomic_read(&blkg->use_delay);

		if (use_delay) {
			u64 this_delay;

			blkcg_scale_delay(blkg, now);
			this_delay = atomic64_read(&blkg->delay_nsec);
			if (this_delay > delay_nsec) {
				delay_nsec = this_delay;
				clamp = use_delay > 0;
			}
		}
		blkg = blkg->parent;
	}

	if (!delay_nsec)
		return;

	/*
	 * Let's not sleep for all eternity if we've amassed a huge delay.
	 * Swapping or metadata IO can accumulate 10's of seconds worth of
	 * delay, and we want userspace to be able to do _something_ so cap
	 * the delays at 0.25s.  If there's 10's of seconds worth of delay
	 * then the tasks will be delayed for 0.25 second for every syscall.
	 * If blkcg_set_delay() was used as indicated by negative use_delay,
	 * the caller is responsible for regulating the range.
	 */
	if (clamp)
		delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);

	if (use_memdelay)
		psi_memstall_enter(&pflags);

	exp = ktime_add_ns(now, delay_nsec);
	tok = io_schedule_prepare();
	do {
		__set_current_state(TASK_KILLABLE);
		if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
			break;
	} while (!fatal_signal_pending(current));
	io_schedule_finish(tok);

	if (use_memdelay)
		psi_memstall_leave(&pflags);
}
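
/**
 * blkcg_maybe_throttle_current - throttle the current task if it has been marked
 *
 * This is only called if we've been marked with set_notify_resume().  We can
 * be marked for reasons other than blkcg throttling, so if
 * current->throttle_queue is not set this does nothing.  It should only ever
 * be called by the resume code as it actually performs the throttling when
 * the task is set up for it.
 */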
void blkcg_maybe_throttle_current(void)
{
	struct request_queue *q = current->throttle_queue;
	struct cgroup_subsys_state *css;
	struct blkcg *blkcg;
	struct blkcg_gq *blkg;
	bool use_memdelay = current->use_memdelay;

	if (!q)
		return;

	current->throttle_queue = NULL;
	current->use_memdelay = false;

	rcu_read_lock();
	css = kthread_blkcg();
	if (css)
		blkcg = css_to_blkcg(css);
	else
		blkcg = css_to_blkcg(task_css(current, io_cgrp_id));

	if (!blkcg)
		goto out;
	blkg = blkg_lookup(blkcg, q);
	if (!blkg)
		goto out;
	if (!blkg_tryget(blkg))
		goto out;
	rcu_read_unlock();

	blkcg_maybe_throttle_blkg(blkg, use_memdelay);
	blkg_put(blkg);
	blk_put_queue(q);
	return;
out:
	rcu_read_unlock();
	blk_put_queue(q);
}
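
/**
 * blkcg_schedule_throttle - this task needs to check for throttling
 * @q: the request queue IO was submitted on
 * @use_memdelay: do we charge this to memory delay for PSI
 *
 * This is called by the IO controller when we know there's delay accumulated
 * for the blkg for this task.  We do not pass the blkg because callers may
 * only have a request_queue at hand (e.g. the swapping code).  This sets
 * notify_resume for the task so it checks whether it requires throttling
 * before returning to user space.
 *
 * We will only schedule once per syscall.  You can call this over and over
 * again and it will only do the throttling once.
 */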
void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
{
	if (unlikely(current->flags & PF_KTHREAD))
		return;

	if (current->throttle_queue != q) {
		if (!blk_get_queue(q))
			return;

		if (current->throttle_queue)
			blk_put_queue(current->throttle_queue);
		current->throttle_queue = q;
	}

	if (use_memdelay)
		current->use_memdelay = use_memdelay;
	set_notify_resume(current);
}
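
/**
 * blkcg_add_delay - add delay to this blkg
 * @blkg: blkg of interest
 * @now: the current time in nanoseconds
 * @delta: how many nanoseconds of delay to add
 *
 * Charge @delta to the blkg's current delay accumulation.  This is used to
 * throttle tasks if an IO controller thinks we need more throttling.
 */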
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
{
	if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
		return;
	blkcg_scale_delay(blkg, now);
	atomic64_add(delta, &blkg->delay_nsec);
}
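
/**
 * blkg_tryget_closest - try and get a blkg ref on the closest blkg
 * @bio: target bio
 * @css: target css
 *
 * As the failure mode here is to walk up the blkg tree, this ensures that the
 * blkg->parent pointers are always valid.  This returns the blkg that it
 * ended up taking a reference on or %NULL if no reference was taken.
 */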
static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
		struct cgroup_subsys_state *css)
{
	struct blkcg_gq *blkg, *ret_blkg = NULL;

	rcu_read_lock();
	blkg = blkg_lookup_create(css_to_blkcg(css),
				  bio->bi_bdev->bd_disk->queue);
	while (blkg) {
		if (blkg_tryget(blkg)) {
			ret_blkg = blkg;
			break;
		}
		blkg = blkg->parent;
	}
	rcu_read_unlock();

	return ret_blkg;
}
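
/**
 * bio_associate_blkg_from_css - associate a bio with a specified css
 * @bio: target bio
 * @css: target css
 *
 * Associate @bio with the blkg found by combining the css's blkg and the
 * request_queue of the @bio.  An association failure is handled by walking
 * up the blkg tree, so the blkg associated can be anything between the
 * looked-up blkg and the root blkg.  A reference is taken on the blkg and
 * released when @bio is freed.
 */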
void bio_associate_blkg_from_css(struct bio *bio,
				 struct cgroup_subsys_state *css)
{
	if (bio->bi_blkg)
		blkg_put(bio->bi_blkg);

	if (css && css->parent) {
		bio->bi_blkg = blkg_tryget_closest(bio, css);
	} else {
		blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg);
		bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg;
	}
}
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
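
/**
 * bio_associate_blkg - associate a bio with a blkg
 * @bio: target bio
 *
 * Associate @bio with the blkg found from the bio's css and request_queue.
 * If a blkg is already associated, the css is reused and the association
 * is redone as the request_queue may have changed.
 */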
void bio_associate_blkg(struct bio *bio)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();

	if (bio->bi_blkg)
		css = &bio_blkcg(bio)->css;
	else
		css = blkcg_css();

	bio_associate_blkg_from_css(bio, css);

	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(bio_associate_blkg);
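
/**
 * bio_clone_blkg_association - clone blkg association from src to dst bio
 * @dst: destination bio
 * @src: source bio
 */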
void bio_clone_blkg_association(struct bio *dst, struct bio *src)
{
	if (src->bi_blkg) {
		if (dst->bi_blkg)
			blkg_put(dst->bi_blkg);
		blkg_get(src->bi_blkg);
		dst->bi_blkg = src->bi_blkg;
	}
}
EXPORT_SYMBOL_GPL(bio_clone_blkg_association);

static int blk_cgroup_io_type(struct bio *bio)
{
	if (op_is_discard(bio->bi_opf))
		return BLKG_IOSTAT_DISCARD;
	if (op_is_write(bio->bi_opf))
		return BLKG_IOSTAT_WRITE;
	return BLKG_IOSTAT_READ;
}

void blk_cgroup_bio_start(struct bio *bio)
{
	int rwd = blk_cgroup_io_type(bio), cpu;
	struct blkg_iostat_set *bis;

	cpu = get_cpu();
	bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
	u64_stats_update_begin(&bis->sync);

	/*
	 * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
	 * bio and we would have already accounted for the size of the bio.
	 */
	if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
		bio_set_flag(bio, BIO_CGROUP_ACCT);
		bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
	}
	bis->cur.ios[rwd]++;

	u64_stats_update_end(&bis->sync);
	if (cgroup_subsys_on_dfl(io_cgrp_subsys))
		cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
	put_cpu();
}

static int __init blkcg_init(void)
{
	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
					    WQ_UNBOUND | WQ_SYSFS, 0);
	if (!blkcg_punt_bio_wq)
		return -ENOMEM;
	return 0;
}
subsys_initcall(blkcg_init);

module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");