/* SPDX-License-Identifier: GPL-2.0
 *
 * IO cost model based controller.
 *
 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
 * Copyright (C) 2019 Andy Newell <newella@fb.com>
 * Copyright (C) 2019 Facebook
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/time64.h>
#include <linux/parser.h>
#include <linux/sched/signal.h>
#include <asm/local.h>
#include <asm/local64.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk-wbt.h"
#include "blk-cgroup.h"

#ifdef CONFIG_TRACEPOINTS

/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
#define TRACE_IOCG_PATH_LEN 1024
static DEFINE_SPINLOCK(trace_iocg_path_lock);
static char trace_iocg_path[TRACE_IOCG_PATH_LEN];

#define TRACE_IOCG_PATH(type, iocg, ...)					\
	do {									\
		unsigned long flags;						\
		if (trace_iocost_##type##_enabled()) {				\
			spin_lock_irqsave(&trace_iocg_path_lock, flags);	\
			cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,	\
				    trace_iocg_path, TRACE_IOCG_PATH_LEN);	\
			trace_iocost_##type(iocg, trace_iocg_path,		\
					    ##__VA_ARGS__);			\
			spin_unlock_irqrestore(&trace_iocg_path_lock, flags);	\
		}								\
	} while (0)

#else	/* CONFIG_TRACEPOINTS */
#define TRACE_IOCG_PATH(type, iocg, ...)	do { } while (0)
#endif	/* CONFIG_TRACEPOINTS */

enum {
	MILLION			= 1000000,

	/* timer period is calculated from latency requirements, bound it */
	MIN_PERIOD		= USEC_PER_MSEC,
	MAX_PERIOD		= USEC_PER_SEC,

	/*
	 * iocg->vtime is targeted at 50% behind the device vtime, which
	 * serves as its IO credit buffer.  Surplus weight adjustment is
	 * immediately canceled if the vtime margin runs below 10%.
	 */
	MARGIN_MIN_PCT		= 10,
	MARGIN_LOW_PCT		= 20,
	MARGIN_TARGET_PCT	= 50,

	INUSE_ADJ_STEP_PCT	= 25,

	/* have some play in timer operations */
	TIMER_SLACK_PCT		= 1,

	/* 1/64k is granular enough and can easily be handled w/ u32 */
	WEIGHT_ONE		= 1 << 16,
};

enum {
	/*
	 * As vtime is used to calculate the cost of each IO, it needs to be
	 * fairly high precision while still fitting comfortably in u64
	 * arithmetic.  1s worth of vtime is 2^37, which gives sub-nanosecond
	 * granularity and ample headroom.
	 */
	VTIME_PER_SEC_SHIFT	= 37,
	VTIME_PER_SEC		= 1LLU << VTIME_PER_SEC_SHIFT,
	VTIME_PER_USEC		= VTIME_PER_SEC / USEC_PER_SEC,
	VTIME_PER_NSEC		= VTIME_PER_SEC / NSEC_PER_SEC,

	/* bound vrate adjustments within two orders of magnitude */
	VRATE_MIN_PPM		= 10000,	/* 1% */
	VRATE_MAX_PPM		= 100000000,	/* 10000% */

	VRATE_MIN		= VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
	VRATE_CLAMP_ADJ_PCT	= 4,

	/* switch autop params iff the conditions are met for this long */
	AUTOP_CYCLE_NSEC	= 10LLU * NSEC_PER_SEC,
};
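
/*
 * Worked illustration (not from the original header): at the default vrate
 * of 100%, one microsecond of device time corresponds to VTIME_PER_USEC =
 * 2^37 / 10^6 ~= 137438 vtime units.  A cgroup whose hierarchical inuse
 * weight is 50% (WEIGHT_ONE / 2) sees its vtime advance twice as fast per
 * unit of absolute cost, i.e. it effectively owns half the device capacity.
 */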

enum {
	/* if IOs end up waiting for requests, issue less */
	RQ_WAIT_BUSY_PCT	= 5,

	/* unbusy hysteresis */
	UNBUSY_THR_PCT		= 75,

	/*
	 * The effect of delay is indirect and non-linear and a huge amount of
	 * future debt can accumulate abruptly while unthrottled.  Linearly
	 * scale up delay as debt is going up and then let it decay
	 * exponentially.  This gives us quick ramp-ups while delay is
	 * accumulating and long tails which can help reducing the frequency
	 * of debt explosions on unthrottle.  The parameters are
	 * experimentally determined.
	 */
	MIN_DELAY_THR_PCT	= 500,
	MAX_DELAY_THR_PCT	= 25000,
	MIN_DELAY		= 250,
	MAX_DELAY		= 250 * USEC_PER_MSEC,

	/* halve debts if avg usage over 100ms is under 50% */
	DFGV_USAGE_PCT		= 50,
	DFGV_PERIOD		= 100 * USEC_PER_MSEC,

	/* don't let cmds which take a very long time pin lagging for too long */
	MAX_LAGGING_PERIODS	= 10,

	/*
	 * Count IO size in 4k pages.  The 12bit shift helps keeping
	 * size-proportional components of cost calculation in closer
	 * numbers of digits to per-IO cost components.
	 */
	IOC_PAGE_SHIFT		= 12,
	IOC_PAGE_SIZE		= 1 << IOC_PAGE_SHIFT,
	IOC_SECT_TO_PAGE_SHIFT	= IOC_PAGE_SHIFT - SECTOR_SHIFT,

	/* if apart further than 16M, consider randio for linear model */
	LCOEF_RANDIO_PAGES	= 4096,
};

enum ioc_running {
	IOC_IDLE,
	IOC_RUNNING,
	IOC_STOP,
};

/* io.cost.qos controls including per-dev enable of the whole controller */
enum {
	QOS_ENABLE,
	QOS_CTRL,
	NR_QOS_CTRL_PARAMS,
};

/* io.cost.qos params */
enum {
	QOS_RPPM,
	QOS_RLAT,
	QOS_WPPM,
	QOS_WLAT,
	QOS_MIN,
	QOS_MAX,
	NR_QOS_PARAMS,
};

/* io.cost.model controls */
enum {
	COST_CTRL,
	COST_MODEL,
	NR_COST_CTRL_PARAMS,
};

/* builtin linear cost model coefficients */
enum {
	I_LCOEF_RBPS,
	I_LCOEF_RSEQIOPS,
	I_LCOEF_RRANDIOPS,
	I_LCOEF_WBPS,
	I_LCOEF_WSEQIOPS,
	I_LCOEF_WRANDIOPS,
	NR_I_LCOEFS,
};

enum {
	LCOEF_RPAGE,
	LCOEF_RSEQIO,
	LCOEF_RRANDIO,
	LCOEF_WPAGE,
	LCOEF_WSEQIO,
	LCOEF_WRANDIO,
	NR_LCOEFS,
};

enum {
	AUTOP_INVALID,
	AUTOP_HDD,
	AUTOP_SSD_QD1,
	AUTOP_SSD_DFL,
	AUTOP_SSD_FAST,
};

struct ioc_params {
	u32				qos[NR_QOS_PARAMS];
	u64				i_lcoefs[NR_I_LCOEFS];
	u64				lcoefs[NR_LCOEFS];
	u32				too_fast_vrate_pct;
	u32				too_slow_vrate_pct;
};

struct ioc_margins {
	s64				min;
	s64				low;
	s64				target;
};

struct ioc_missed {
	local_t				nr_met;
	local_t				nr_missed;
	u32				last_met;
	u32				last_missed;
};

struct ioc_pcpu_stat {
	struct ioc_missed		missed[2];

	local64_t			rq_wait_ns;
	u64				last_rq_wait_ns;
};

/* per device */
struct ioc {
	struct rq_qos			rqos;

	bool				enabled;

	struct ioc_params		params;
	struct ioc_margins		margins;
	u32				period_us;
	u32				timer_slack_ns;
	u64				vrate_min;
	u64				vrate_max;

	spinlock_t			lock;
	struct timer_list		timer;
	struct list_head		active_iocgs;	/* active cgroups */
	struct ioc_pcpu_stat __percpu	*pcpu_stat;

	enum ioc_running		running;
	atomic64_t			vtime_rate;
	u64				vtime_base_rate;
	s64				vtime_err;

	seqcount_spinlock_t		period_seqcount;
	u64				period_at;	/* wallclock starttime */
	u64				period_at_vtime; /* vtime starttime */

	atomic64_t			cur_period;	/* inc'd each period */
	int				busy_level;	/* saturation history */

	bool				weights_updated;
	atomic_t			hweight_gen;	/* for lazy hweights */

	/* debt forgiveness */
	u64				dfgv_period_at;
	u64				dfgv_period_rem;
	u64				dfgv_usage_us_sum;

	u64				autop_too_fast_at;
	u64				autop_too_slow_at;
	int				autop_idx;
	bool				user_qos_params:1;
	bool				user_cost_model:1;
};

struct iocg_pcpu_stat {
	local64_t			abs_vusage;
};

struct iocg_stat {
	u64				usage_us;
	u64				wait_us;
	u64				indebt_us;
	u64				indelay_us;
};

/* per device-cgroup pair */
struct ioc_gq {
	struct blkg_policy_data		pd;
	struct ioc			*ioc;

	/*
	 * An iocg can get its weight from two sources - an explicit
	 * per-device-cgroup configuration or the default weight of the
	 * cgroup.  `cfg_weight` is the explicit per-device-cgroup
	 * configuration.  `weight` is the effective considering both sources.
	 *
	 * When an idle cgroup becomes active its `active` goes from 0 to
	 * `weight`.  `inuse` is the surplus adjusted active weight.
	 * `active` and `inuse` are used to calculate whether a cgroup has
	 * surplus budget and how much to donate respectively.
	 *
	 * `last_inuse` remembers `inuse` while a cgroup is idle to persist
	 * surplus donations.  `saved_margin` is used to determine whether
	 * the iocg is entitled to claim back the donated inuse.
	 */
	u32				cfg_weight;
	u32				weight;
	u32				active;
	u32				inuse;

	u32				last_inuse;
	s64				saved_margin;

	sector_t			cursor;		/* to detect randio */

	/*
	 * `vtime` is this iocg's vtime cursor which progresses as IOs are
	 * issued.  If lagging behind device vtime, the delta represents
	 * the currently available IO budget.  If running ahead, the overage.
	 *
	 * `done_vtime` is the same but progressed on completion rather than
	 * issue.  The delta behind `vtime` represents the cost of currently
	 * in-flight IOs.
	 */
	atomic64_t			vtime;
	atomic64_t			done_vtime;
	u64				abs_vdebt;

	/* current delay in effect and when it started */
	u64				delay;
	u64				delay_at;

	/*
	 * The period this iocg was last active in.  Used for deactivation
	 * and invalidating `vtime`.
	 */
	atomic64_t			active_period;
	struct list_head		active_list;

	/* see __propagate_weights() and current_hweight() for details */
	u64				child_active_sum;
	u64				child_inuse_sum;
	u64				child_adjusted_sum;
	int				hweight_gen;
	u32				hweight_active;
	u32				hweight_inuse;
	u32				hweight_donating;
	u32				hweight_after_donation;

	struct list_head		walk_list;
	struct list_head		surplus_list;

	struct wait_queue_head		waitq;
	struct hrtimer			waitq_timer;

	/* timestamp at the latest activation */
	u64				activated_at;

	/* statistics */
	struct iocg_pcpu_stat __percpu	*pcpu_stat;
	struct iocg_stat		stat;
	struct iocg_stat		last_stat;
	u64				last_stat_abs_vusage;
	u64				usage_delta_us;
	u64				wait_since;
	u64				indebt_since;
	u64				indelay_since;

	/* this iocg's depth in the hierarchy and ancestors including self */
	int				level;
	struct ioc_gq			*ancestors[];
};

/* per cgroup */
struct ioc_cgrp {
	struct blkcg_policy_data	cpd;
	unsigned int			dfl_weight;
};

struct ioc_now {
	u64				now_ns;
	u64				now;
	u64				vnow;
};

struct iocg_wait {
	struct wait_queue_entry		wait;
	struct bio			*bio;
	u64				abs_cost;
	bool				committed;
};

struct iocg_wake_ctx {
	struct ioc_gq			*iocg;
	u32				hw_inuse;
	s64				vbudget;
};

static const struct ioc_params autop[] = {
	[AUTOP_HDD] = {
		.qos				= {
			[QOS_RLAT]		=        250000, /* 250ms */
			[QOS_WLAT]		=        250000,
			[QOS_MIN]		= VRATE_MIN_PPM,
			[QOS_MAX]		= VRATE_MAX_PPM,
		},
		.i_lcoefs			= {
			[I_LCOEF_RBPS]		=     174019176,
			[I_LCOEF_RSEQIOPS]	=         41708,
			[I_LCOEF_RRANDIOPS]	=           370,
			[I_LCOEF_WBPS]		=     178075866,
			[I_LCOEF_WSEQIOPS]	=         42705,
			[I_LCOEF_WRANDIOPS]	=           378,
		},
	},
	[AUTOP_SSD_QD1] = {
		.qos				= {
			[QOS_RLAT]		=         25000, /* 25ms */
			[QOS_WLAT]		=         25000,
			[QOS_MIN]		= VRATE_MIN_PPM,
			[QOS_MAX]		= VRATE_MAX_PPM,
		},
		.i_lcoefs			= {
			[I_LCOEF_RBPS]		=     245855193,
			[I_LCOEF_RSEQIOPS]	=         61575,
			[I_LCOEF_RRANDIOPS]	=          6946,
			[I_LCOEF_WBPS]		=     141365009,
			[I_LCOEF_WSEQIOPS]	=         33716,
			[I_LCOEF_WRANDIOPS]	=         26796,
		},
	},
	[AUTOP_SSD_DFL] = {
		.qos				= {
			[QOS_RLAT]		=         25000, /* 25ms */
			[QOS_WLAT]		=         25000,
			[QOS_MIN]		= VRATE_MIN_PPM,
			[QOS_MAX]		= VRATE_MAX_PPM,
		},
		.i_lcoefs			= {
			[I_LCOEF_RBPS]		=     488636629,
			[I_LCOEF_RSEQIOPS]	=          8932,
			[I_LCOEF_RRANDIOPS]	=          8518,
			[I_LCOEF_WBPS]		=     427891549,
			[I_LCOEF_WSEQIOPS]	=         28755,
			[I_LCOEF_WRANDIOPS]	=         21940,
		},
		.too_fast_vrate_pct		=           500,
	},
	[AUTOP_SSD_FAST] = {
		.qos				= {
			[QOS_RLAT]		=          5000, /* 5ms */
			[QOS_WLAT]		=          5000,
			[QOS_MIN]		= VRATE_MIN_PPM,
			[QOS_MAX]		= VRATE_MAX_PPM,
		},
		.i_lcoefs			= {
			[I_LCOEF_RBPS]		=    3102524156LLU,
			[I_LCOEF_RSEQIOPS]	=        724816,
			[I_LCOEF_RRANDIOPS]	=        778122,
			[I_LCOEF_WBPS]		=    1742780862LLU,
			[I_LCOEF_WSEQIOPS]	=        425702,
			[I_LCOEF_WRANDIOPS]	=        443193,
		},
		.too_slow_vrate_pct		=            10,
	},
};

/*
 * vrate adjust % indexed by |busy_level|.  vrate is adjusted by a larger
 * step the longer the device has stayed saturated or underutilized.
 */
static u32 vrate_adj_pct[] =
	{ 0, 0, 0, 0,
	  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	  4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };

static struct blkcg_policy blkcg_policy_iocost;

/* accessors and helpers */
static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
{
	return container_of(rqos, struct ioc, rqos);
}

static struct ioc *q_to_ioc(struct request_queue *q)
{
	return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
}

static const char __maybe_unused *ioc_name(struct ioc *ioc)
{
	struct gendisk *disk = ioc->rqos.disk;

	if (!disk)
		return "<unknown>";
	return disk->disk_name;
}

static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
}

static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
{
	return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
}

static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
{
	return pd_to_blkg(&iocg->pd);
}

static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
{
	return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
			    struct ioc_cgrp, cpd);
}

/*
 * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
 * weight, the more expensive each IO.  Must round up.
 */
static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
{
	return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
}

/*
 * The inverse of abs_cost_to_cost().  Must round up.
 */
static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
{
	return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
}
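
/*
 * Worked example (illustrative, not from the original source): with
 * WEIGHT_ONE = 1 << 16, a cgroup holding half of the device's capacity has
 * hw_inuse = 32768.  An IO with abs_cost = 1000 then charges
 * abs_cost_to_cost(1000, 32768) = ceil(1000 * 65536 / 32768) = 2000 against
 * the cgroup's vtime - owning half the device makes each IO twice as
 * expensive - and cost_to_abs_cost(2000, 32768) maps it back to 1000.
 */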

static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
			    u64 abs_cost, u64 cost)
{
	struct iocg_pcpu_stat *gcs;

	bio->bi_iocost_cost = cost;
	atomic64_add(cost, &iocg->vtime);

	gcs = get_cpu_ptr(iocg->pcpu_stat);
	local64_add(abs_cost, &gcs->abs_vusage);
	put_cpu_ptr(gcs);
}

static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
{
	if (lock_ioc) {
		spin_lock_irqsave(&iocg->ioc->lock, *flags);
		spin_lock(&iocg->waitq.lock);
	} else {
		spin_lock_irqsave(&iocg->waitq.lock, *flags);
	}
}

static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
{
	if (unlock_ioc) {
		spin_unlock(&iocg->waitq.lock);
		spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
	} else {
		spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
	}
}

#define CREATE_TRACE_POINTS
#include <trace/events/iocost.h>

static void ioc_refresh_margins(struct ioc *ioc)
{
	struct ioc_margins *margins = &ioc->margins;
	u32 period_us = ioc->period_us;
	u64 vrate = ioc->vtime_base_rate;

	margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
	margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
	margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
}

/* latency QoS params changed, update period_us and all the dependent params */
static void ioc_refresh_period_us(struct ioc *ioc)
{
	u32 ppm, lat, multi, period_us;

	lockdep_assert_held(&ioc->lock);

	/* pick the higher latency target */
	if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
		ppm = ioc->params.qos[QOS_RPPM];
		lat = ioc->params.qos[QOS_RLAT];
	} else {
		ppm = ioc->params.qos[QOS_WPPM];
		lat = ioc->params.qos[QOS_WLAT];
	}

	/*
	 * We want the period to be long enough to contain a healthy number
	 * of IOs while short enough for granular control.  Define it as a
	 * multiple of the latency target.  The multiplier grows with the
	 * allowed miss ratio (MILLION - ppm) and is floored at 2.
	 */
	if (ppm)
		multi = max_t(u32, (MILLION - ppm) / 50000, 2);
	else
		multi = 2;
	period_us = multi * lat;
	period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);

	/* calculate dependent params */
	ioc->period_us = period_us;
	ioc->timer_slack_ns = div64_u64(
		(u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
		100);
	ioc_refresh_margins(ioc);
}
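
/*
 * Example (illustrative): with QOS_RLAT = 25000us and QOS_RPPM = 950000
 * (95% of reads should meet the 25ms target), multi =
 * max((1000000 - 950000) / 50000, 2) = 2, so period_us = 50000 - a 50ms
 * control period, well within the [MIN_PERIOD, MAX_PERIOD] clamp.
 */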

/*
 * @disk is passed in explicitly because ioc->rqos.disk isn't initialized
 * yet when this is called from the init path.
 */
static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
{
	int idx = ioc->autop_idx;
	const struct ioc_params *p = &autop[idx];
	u32 vrate_pct;
	u64 now_ns;

	/* rotational? */
	if (!blk_queue_nonrot(disk->queue))
		return AUTOP_HDD;

	/* handle SATA SSDs w/ broken NCQ */
	if (blk_queue_depth(disk->queue) == 1)
		return AUTOP_SSD_QD1;

	/* use one of the normal ssd sets */
	if (idx < AUTOP_SSD_DFL)
		return AUTOP_SSD_DFL;

	/* if user is overriding anything, maintain what was there */
	if (ioc->user_qos_params || ioc->user_cost_model)
		return idx;

	/* step up/down based on the vrate */
	vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
	now_ns = ktime_get_ns();

	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
		if (!ioc->autop_too_fast_at)
			ioc->autop_too_fast_at = now_ns;
		if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
			return idx + 1;
	} else {
		ioc->autop_too_fast_at = 0;
	}

	if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
		if (!ioc->autop_too_slow_at)
			ioc->autop_too_slow_at = now_ns;
		if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
			return idx - 1;
	} else {
		ioc->autop_too_slow_at = 0;
	}

	return idx;
}

/*
 * Take the followings as input
 *
 *  @bps	maximum sequential throughput
 *  @seqiops	maximum sequential 4k iops
 *  @randiops	maximum random 4k iops
 *
 * and calculate the linear model cost coefficients.
 *
 *  *@page	per-page cost		1s / (@bps / 4096)
 *  *@seqio	base cost of a seq IO	max((1s / @seqiops) - *@page, 0)
 *  *@randio	base cost of a rand IO	max((1s / @randiops) - *@page, 0)
 */
static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
			u64 *page, u64 *seqio, u64 *randio)
{
	u64 v;

	*page = *seqio = *randio = 0;

	if (bps) {
		u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE);

		if (bps_pages)
			*page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages);
		else
			*page = 1;
	}

	if (seqiops) {
		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
		if (v > *page)
			*seqio = v - *page;
	}

	if (randiops) {
		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
		if (v > *page)
			*randio = v - *page;
	}
}
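
/*
 * Illustrative numbers (powers of two chosen for round arithmetic, not from
 * any real device): bps = 1 GiB/s gives bps_pages = 2^18, so
 * *page = 2^37 / 2^18 = 2^19.  seqiops = 2^17 gives v = 2^20 and
 * *seqio = 2^20 - 2^19 = 2^19; randiops = 2^15 gives v = 2^22 and
 * *randio = 2^22 - 2^19.  A random 4k IO thus costs the per-IO random
 * overhead plus one page, 2^22 vtime units = 1/2^15 seconds of device
 * time, which matches 2^15 random IOPS at full utilization.
 */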

static void ioc_refresh_lcoefs(struct ioc *ioc)
{
	u64 *u = ioc->params.i_lcoefs;
	u64 *c = ioc->params.lcoefs;

	calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
		    &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
	calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
		    &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
}

/*
 * struct gendisk is required as an argument because ioc->rqos.disk
 * is not properly initialized when called from the init path.
 */
static bool ioc_refresh_params_disk(struct ioc *ioc, bool force,
				    struct gendisk *disk)
{
	const struct ioc_params *p;
	int idx;

	lockdep_assert_held(&ioc->lock);

	idx = ioc_autop_idx(ioc, disk);
	p = &autop[idx];

	if (idx == ioc->autop_idx && !force)
		return false;

	if (idx != ioc->autop_idx) {
		atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
		ioc->vtime_base_rate = VTIME_PER_USEC;
	}

	ioc->autop_idx = idx;
	ioc->autop_too_fast_at = 0;
	ioc->autop_too_slow_at = 0;

	if (!ioc->user_qos_params)
		memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
	if (!ioc->user_cost_model)
		memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));

	ioc_refresh_period_us(ioc);
	ioc_refresh_lcoefs(ioc);

	ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
					    VTIME_PER_USEC, MILLION);
	ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] *
					    VTIME_PER_USEC, MILLION);

	return true;
}

static bool ioc_refresh_params(struct ioc *ioc, bool force)
{
	return ioc_refresh_params_disk(ioc, force, ioc->rqos.disk);
}

/*
 * When an iocg accumulates too much vtime or gets deactivated, we throw away
 * some vtime, which lowers the overall device utilization.  As the exact
 * amount which is being thrown away is known, we can compensate by
 * accelerating the vrate accordingly so that the extra vtime generated in
 * the current period matches what got lost.
 */
static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
{
	s64 pleft = ioc->period_at + ioc->period_us - now->now;
	s64 vperiod = ioc->period_us * ioc->vtime_base_rate;
	s64 vcomp, vcomp_min, vcomp_max;

	lockdep_assert_held(&ioc->lock);

	/* we need some time left in this period */
	if (pleft <= 0)
		goto done;

	/*
	 * Calculate how much vrate should be adjusted to offset the error.
	 * Limit the amount of adjustment and deduct the adjusted amount from
	 * the error.
	 */
	vcomp = -div64_s64(ioc->vtime_err, pleft);
	vcomp_min = -(ioc->vtime_base_rate >> 1);
	vcomp_max = ioc->vtime_base_rate;
	vcomp = clamp(vcomp, vcomp_min, vcomp_max);

	ioc->vtime_err += vcomp * pleft;

	atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp);
done:
	/* bound how much error can accumulate */
	ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
}

static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
				  int nr_lagging, int nr_shortages,
				  int prev_busy_level, u32 *missed_ppm)
{
	u64 vrate = ioc->vtime_base_rate;
	u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;

	if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
		if (ioc->busy_level != prev_busy_level || nr_lagging)
			trace_iocost_ioc_vrate_adj(ioc, vrate,
						   missed_ppm, rq_wait_pct,
						   nr_lagging, nr_shortages);

		return;
	}

	/*
	 * If vrate is out of bounds, apply the clamp gradually as the
	 * bounds can change abruptly.  Otherwise, apply busy_level
	 * based adjustment.
	 */
	if (vrate < vrate_min) {
		vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100);
		vrate = min(vrate, vrate_min);
	} else if (vrate > vrate_max) {
		vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100);
		vrate = max(vrate, vrate_max);
	} else {
		int idx = min_t(int, abs(ioc->busy_level),
				ARRAY_SIZE(vrate_adj_pct) - 1);
		u32 adj_pct = vrate_adj_pct[idx];

		if (ioc->busy_level > 0)
			adj_pct = 100 - adj_pct;
		else
			adj_pct = 100 + adj_pct;

		vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
			      vrate_min, vrate_max);
	}

	trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
				   nr_lagging, nr_shortages);

	ioc->vtime_base_rate = vrate;
	ioc_refresh_margins(ioc);
}

/* take a snapshot of the current [v]time and vrate */
static void ioc_now(struct ioc *ioc, struct ioc_now *now)
{
	unsigned seq;
	u64 vrate;

	now->now_ns = ktime_get();
	now->now = ktime_to_us(now->now_ns);
	vrate = atomic64_read(&ioc->vtime_rate);

	/*
	 * The current vtime is
	 *
	 *   vtime at period start + (wallclock time since the start) * vrate
	 *
	 * As a consistent snapshot of `period_at_vtime` and `period_at` is
	 * needed, they're seqcount protected.
	 */
	do {
		seq = read_seqcount_begin(&ioc->period_seqcount);
		now->vnow = ioc->period_at_vtime +
			(now->now - ioc->period_at) * vrate;
	} while (read_seqcount_retry(&ioc->period_seqcount, seq));
}

static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
{
	WARN_ON_ONCE(ioc->running != IOC_RUNNING);

	write_seqcount_begin(&ioc->period_seqcount);
	ioc->period_at = now->now;
	ioc->period_at_vtime = now->vnow;
	write_seqcount_end(&ioc->period_seqcount);

	ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
	add_timer(&ioc->timer);
}

/*
 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
 * weight sums and propagate upwards accordingly.  If @save, the current
 * margin is saved to be used as reference for later inuse in-period
 * adjustments.
 */
static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
				bool save, struct ioc_now *now)
{
	struct ioc *ioc = iocg->ioc;
	int lvl;

	lockdep_assert_held(&ioc->lock);

	/*
	 * For an active leaf node, its inuse shouldn't be zero or exceed
	 * @active.  An active internal node's inuse is solely determined by
	 * the inuse to active ratio of its children regardless of @inuse.
	 */
	if (list_empty(&iocg->active_list) && iocg->child_active_sum) {
		inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum,
					   iocg->child_active_sum);
	} else {
		inuse = clamp_t(u32, inuse, 1, active);
	}

	iocg->last_inuse = iocg->inuse;
	if (save)
		iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime);

	if (active == iocg->active && inuse == iocg->inuse)
		return;

	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
		struct ioc_gq *parent = iocg->ancestors[lvl];
		struct ioc_gq *child = iocg->ancestors[lvl + 1];
		u32 parent_active = 0, parent_inuse = 0;

		/* update the level sums */
		parent->child_active_sum += (s32)(active - child->active);
		parent->child_inuse_sum += (s32)(inuse - child->inuse);
		/* apply the updates */
		child->active = active;
		child->inuse = inuse;

		/*
		 * The delta between inuse and active sums indicates that
		 * much of weight is being given away.  Parent's inuse
		 * and active should reflect the ratio.
		 */
		if (parent->child_active_sum) {
			parent_active = parent->weight;
			parent_inuse = DIV64_U64_ROUND_UP(
				parent_active * parent->child_inuse_sum,
				parent->child_active_sum);
		}

		/* do we need to keep walking up? */
		if (parent_active == parent->active &&
		    parent_inuse == parent->inuse)
			break;

		active = parent_active;
		inuse = parent_inuse;
	}

	ioc->weights_updated = true;
}

static void commit_weights(struct ioc *ioc)
{
	lockdep_assert_held(&ioc->lock);

	if (ioc->weights_updated) {
		/* paired with rmb in current_hweight(), see there */
		smp_wmb();
		atomic_inc(&ioc->hweight_gen);
		ioc->weights_updated = false;
	}
}

static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
			      bool save, struct ioc_now *now)
{
	__propagate_weights(iocg, active, inuse, save, now);
	commit_weights(iocg->ioc);
}

static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
{
	struct ioc *ioc = iocg->ioc;
	int lvl;
	u32 hwa, hwi;
	int ioc_gen;

	/* hot path - if uptodate, use the cached value */
	ioc_gen = atomic_read(&ioc->hweight_gen);
	if (ioc_gen == iocg->hweight_gen)
		goto out;

	/*
	 * Paired with wmb in commit_weights().  If we saw the updated
	 * hweight_gen, all the weight updates from __propagate_weights() are
	 * visible.
	 *
	 * We can race with weight updates during calculation and get it
	 * wrong.  However, hweight_gen would have changed and a future
	 * reader will recalculate and we're guaranteed to discard the
	 * wrong result soon.
	 */
	smp_rmb();

	hwa = hwi = WEIGHT_ONE;
	for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
		struct ioc_gq *parent = iocg->ancestors[lvl];
		struct ioc_gq *child = iocg->ancestors[lvl + 1];
		u64 active_sum = READ_ONCE(parent->child_active_sum);
		u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
		u32 active = READ_ONCE(child->active);
		u32 inuse = READ_ONCE(child->inuse);

		/* we can race with deactivations and either may read as zero */
		if (!active_sum || !inuse_sum)
			continue;

		active_sum = max_t(u64, active, active_sum);
		hwa = div64_u64((u64)hwa * active, active_sum);

		inuse_sum = max_t(u64, inuse, inuse_sum);
		hwi = div64_u64((u64)hwi * inuse, inuse_sum);
	}

	iocg->hweight_active = max_t(u32, hwa, 1);
	iocg->hweight_inuse = max_t(u32, hwi, 1);
	iocg->hweight_gen = ioc_gen;
out:
	if (hw_activep)
		*hw_activep = iocg->hweight_active;
	if (hw_inusep)
		*hw_inusep = iocg->hweight_inuse;
}
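
/*
 * Illustration (hypothetical hierarchy, not from the original source): root
 * has children A and B with active weights 100 and 300.  Walking root -> A
 * multiplies hwa by 100/400, so A's hweight_active is WEIGHT_ONE / 4 (25%
 * of the device).  If A has donated surplus so that its inuse is 50 while
 * B's stays 300, A's hweight_inuse becomes WEIGHT_ONE * 50 / 350 ~= 14%.
 * Deeper levels simply keep multiplying per-level fractions down the
 * ancestry chain.
 */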

/*
 * Calculate the hweight_inuse @iocg would get with max @iocg->inuse
 * assuming all the other weights stay unchanged.
 */
static u32 current_hweight_max(struct ioc_gq *iocg)
{
	u32 hwm = WEIGHT_ONE;
	u32 inuse = iocg->active;
	u64 child_inuse_sum;
	int lvl;

	lockdep_assert_held(&iocg->ioc->lock);

	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
		struct ioc_gq *parent = iocg->ancestors[lvl];
		struct ioc_gq *child = iocg->ancestors[lvl + 1];

		child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
		hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
		inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
					   parent->child_active_sum);
	}

	return max_t(u32, hwm, 1);
}

static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
{
	struct ioc *ioc = iocg->ioc;
	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
	struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
	u32 weight;

	lockdep_assert_held(&ioc->lock);

	weight = iocg->cfg_weight ?: iocc->dfl_weight;
	if (weight != iocg->weight && iocg->active)
		propagate_weights(iocg, weight, iocg->inuse, true, now);
	iocg->weight = weight;
}

static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
{
	struct ioc *ioc = iocg->ioc;
	u64 last_period, cur_period;
	u64 vtime, vtarget;
	int i;

	/*
	 * If seem to be already active, just update the stamp to tell the
	 * timer that we're still active.  We don't mind occasional races.
	 */
	if (!list_empty(&iocg->active_list)) {
		ioc_now(ioc, now);
		cur_period = atomic64_read(&ioc->cur_period);
		if (atomic64_read(&iocg->active_period) != cur_period)
			atomic64_set(&iocg->active_period, cur_period);
		return true;
	}

	/* racy check on internal node IOs, treat as root level IOs */
	if (iocg->child_active_sum)
		return false;

	spin_lock_irq(&ioc->lock);

	ioc_now(ioc, now);

	/* update period */
	cur_period = atomic64_read(&ioc->cur_period);
	last_period = atomic64_read(&iocg->active_period);
	atomic64_set(&iocg->active_period, cur_period);

	/* already activated or breaking leaf-only constraint? */
	if (!list_empty(&iocg->active_list))
		goto succeed_unlock;
	for (i = iocg->level - 1; i > 0; i--)
		if (!list_empty(&iocg->ancestors[i]->active_list))
			goto fail_unlock;

	if (iocg->child_active_sum)
		goto fail_unlock;

	/*
	 * Always start with the target budget.  On deactivation, we throw
	 * away anything above it.
	 */
	vtarget = now->vnow - ioc->margins.target;
	vtime = atomic64_read(&iocg->vtime);

	atomic64_add(vtarget - vtime, &iocg->vtime);
	atomic64_add(vtarget - vtime, &iocg->done_vtime);
	vtime = vtarget;

	/*
	 * Activate, propagate weight and start period timer if not running.
	 * Reset hweight_gen to avoid accidental match from wrapping.
	 */
	iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
	list_add(&iocg->active_list, &ioc->active_iocgs);

	propagate_weights(iocg, iocg->weight,
			  iocg->last_inuse ?: iocg->weight, true, now);

	TRACE_IOCG_PATH(iocg_activate, iocg, now,
			last_period, cur_period, vtime);

	iocg->activated_at = now->now;

	if (ioc->running == IOC_IDLE) {
		ioc->running = IOC_RUNNING;
		ioc->dfgv_period_at = now->now;
		ioc->dfgv_period_rem = 0;
		ioc_start_period(ioc, now);
	}

succeed_unlock:
	spin_unlock_irq(&ioc->lock);
	return true;

fail_unlock:
	spin_unlock_irq(&ioc->lock);
	return false;
}

static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
{
	struct ioc *ioc = iocg->ioc;
	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
	u64 tdelta, delay, new_delay;
	s64 vover, vover_pct;
	u32 hwa;

	lockdep_assert_held(&iocg->waitq.lock);

	/* calculate the current delay in effect - 1/2 every second */
	tdelta = now->now - iocg->delay_at;
	if (iocg->delay)
		delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC);
	else
		delay = 0;

	/* calculate the new delay from the debt amount */
	current_hweight(iocg, &hwa, NULL);
	vover = atomic64_read(&iocg->vtime) +
		abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
	vover_pct = div64_s64(100 * vover,
			      ioc->period_us * ioc->vtime_base_rate);

	if (vover_pct <= MIN_DELAY_THR_PCT)
		new_delay = 0;
	else if (vover_pct >= MAX_DELAY_THR_PCT)
		new_delay = MAX_DELAY;
	else
		new_delay = MIN_DELAY +
			div_u64((MAX_DELAY - MIN_DELAY) *
				(vover_pct - MIN_DELAY_THR_PCT),
				MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);

	/* pick the higher one and apply */
	if (new_delay > delay) {
		iocg->delay = new_delay;
		iocg->delay_at = now->now;
		delay = new_delay;
	}

	if (delay >= MIN_DELAY) {
		if (!iocg->indelay_since)
			iocg->indelay_since = now->now;
		blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
		return true;
	} else {
		if (iocg->indelay_since) {
			iocg->stat.indelay_us += now->now - iocg->indelay_since;
			iocg->indelay_since = 0;
		}
		iocg->delay = 0;
		blkcg_clear_delay(blkg);
		return false;
	}
}
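
/*
 * Decay example (illustrative): a delay of 64ms set 3 seconds ago has been
 * halved once per elapsed second, so the effective delay is 64ms >> 3 = 8ms.
 * Against that, the debt-based delay interpolates linearly between
 * MIN_DELAY and MAX_DELAY as the overage grows from 5x (MIN_DELAY_THR_PCT)
 * to 250x (MAX_DELAY_THR_PCT) of the period, and the larger of the two
 * wins.
 */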

static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
			    struct ioc_now *now)
{
	struct iocg_pcpu_stat *gcs;

	lockdep_assert_held(&iocg->ioc->lock);
	lockdep_assert_held(&iocg->waitq.lock);
	WARN_ON_ONCE(list_empty(&iocg->active_list));

	/*
	 * Once in debt, debt handling owns inuse.  @iocg stays at the minimum
	 * inuse donating all of its share to others until its debt is paid
	 * off.
	 */
	if (!iocg->abs_vdebt && abs_cost) {
		iocg->indebt_since = now->now;
		propagate_weights(iocg, iocg->active, 0, false, now);
	}

	iocg->abs_vdebt += abs_cost;

	gcs = get_cpu_ptr(iocg->pcpu_stat);
	local64_add(abs_cost, &gcs->abs_vusage);
	put_cpu_ptr(gcs);
}

static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
			  struct ioc_now *now)
{
	lockdep_assert_held(&iocg->ioc->lock);
	lockdep_assert_held(&iocg->waitq.lock);

	/* make sure that nobody messed with @iocg */
	WARN_ON_ONCE(list_empty(&iocg->active_list));
	WARN_ON_ONCE(iocg->inuse > 1);

	iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);

	/* if debt is paid in full, restore inuse */
	if (!iocg->abs_vdebt) {
		iocg->stat.indebt_us += now->now - iocg->indebt_since;
		iocg->indebt_since = 0;

		propagate_weights(iocg, iocg->active, iocg->last_inuse,
				  false, now);
	}
}

static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
			int flags, void *key)
{
	struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
	struct iocg_wake_ctx *ctx = key;
	u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);

	ctx->vbudget -= cost;

	if (ctx->vbudget < 0)
		return -1;

	iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
	wait->committed = true;

	/*
	 * autoremove_wake_function() removes the wait entry only when it
	 * actually changed the task state.  We want the wait always removed.
	 * Remove explicitly and use default_wake_function().  Note that the
	 * order of operations is important as finish_wait() tests whether
	 * @wq_entry is removed without grabbing the lock.
	 */
	default_wake_function(wq_entry, mode, flags, key);
	list_del_init_careful(&wq_entry->entry);
	return 0;
}

/*
 * Calculate the accumulated budget, pay debt if @pay_debt and wake up
 * waiters accordingly.  When @pay_debt is %true, the caller must be holding
 * ioc->lock in addition to iocg->waitq.lock.
 */
static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
			    struct ioc_now *now)
{
	struct ioc *ioc = iocg->ioc;
	struct iocg_wake_ctx ctx = { .iocg = iocg };
	u64 vshortage, expires, oexpires;
	s64 vbudget;
	u32 hwa;

	lockdep_assert_held(&iocg->waitq.lock);

	current_hweight(iocg, &hwa, NULL);
	vbudget = now->vnow - atomic64_read(&iocg->vtime);

	/* pay off debt */
	if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
		u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa);
		u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt);
		u64 vpay = abs_cost_to_cost(abs_vpay, hwa);

		lockdep_assert_held(&ioc->lock);

		atomic64_add(vpay, &iocg->vtime);
		atomic64_add(vpay, &iocg->done_vtime);
		iocg_pay_debt(iocg, abs_vpay, now);
		vbudget -= vpay;
	}

	if (iocg->abs_vdebt || iocg->delay)
		iocg_kick_delay(iocg, now);

	/*
	 * Debt can still be outstanding if we haven't paid all yet or the
	 * caller raced and called without @pay_debt.  Shouldn't wake up
	 * waiters under debt.  Make sure @vbudget reflects the outstanding
	 * amount and is not positive.
	 */
	if (iocg->abs_vdebt) {
		s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa);
		vbudget = min_t(s64, 0, vbudget - vdebt);
	}

	/*
	 * Wake up the ones which are due and see how much vtime we'll need
	 * for the next one.  As paying off debt restores hw_inuse, it must
	 * be read after the above debt payment.
	 */
	ctx.vbudget = vbudget;
	current_hweight(iocg, NULL, &ctx.hw_inuse);

	__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);

	if (!waitqueue_active(&iocg->waitq)) {
		if (iocg->wait_since) {
			iocg->stat.wait_us += now->now - iocg->wait_since;
			iocg->wait_since = 0;
		}
		return;
	}

	if (!iocg->wait_since)
		iocg->wait_since = now->now;

	if (WARN_ON_ONCE(ctx.vbudget >= 0))
		return;

	/* determine next wakeup, add a timer margin to guarantee chunking */
	vshortage = -ctx.vbudget;
	expires = now->now_ns +
		DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
		NSEC_PER_USEC;
	expires += ioc->timer_slack_ns;

	/* if already active and close enough, don't bother */
	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
	if (hrtimer_is_queued(&iocg->waitq_timer) &&
	    abs(oexpires - expires) <= ioc->timer_slack_ns)
		return;

	hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
			       ioc->timer_slack_ns, HRTIMER_MODE_ABS);
}

static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
{
	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
	bool pay_debt = READ_ONCE(iocg->abs_vdebt);
	struct ioc_now now;
	unsigned long flags;

	ioc_now(iocg->ioc, &now);

	iocg_lock(iocg, pay_debt, &flags);
	iocg_kick_waitq(iocg, pay_debt, &now);
	iocg_unlock(iocg, pay_debt, &flags);

	return HRTIMER_NORESTART;
}

static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
{
	u32 nr_met[2] = { };
	u32 nr_missed[2] = { };
	u64 rq_wait_ns = 0;
	int cpu, rw;

	for_each_online_cpu(cpu) {
		struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
		u64 this_rq_wait_ns;

		for (rw = READ; rw <= WRITE; rw++) {
			u32 this_met = local_read(&stat->missed[rw].nr_met);
			u32 this_missed = local_read(&stat->missed[rw].nr_missed);

			nr_met[rw] += this_met - stat->missed[rw].last_met;
			nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
			stat->missed[rw].last_met = this_met;
			stat->missed[rw].last_missed = this_missed;
		}

		this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
		rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
		stat->last_rq_wait_ns = this_rq_wait_ns;
	}

	for (rw = READ; rw <= WRITE; rw++) {
		if (nr_met[rw] + nr_missed[rw])
			missed_ppm_ar[rw] =
				DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
						   nr_met[rw] + nr_missed[rw]);
		else
			missed_ppm_ar[rw] = 0;
	}

	*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
				   ioc->period_us * NSEC_PER_USEC);
}

/* was iocg idle this period? */
static bool iocg_is_idle(struct ioc_gq *iocg)
{
	struct ioc *ioc = iocg->ioc;

	/* did something get issued this period? */
	if (atomic64_read(&iocg->active_period) ==
	    atomic64_read(&ioc->cur_period))
		return false;

	/* is something in flight? */
	if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
		return false;

	return true;
}

/*
 * Call this function on the target leaf @iocg's to build pre-order
 * traversal list of all the ancestors in @inner_walk.  The inner nodes are
 * linked through ->walk_list and are guaranteed to be in the path.
 */
static void iocg_build_inner_walk(struct ioc_gq *iocg,
				  struct list_head *inner_walk)
{
	int lvl;

	WARN_ON_ONCE(!list_empty(&iocg->walk_list));

	/* find the first ancestor which hasn't been visited yet */
	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
		if (!list_empty(&iocg->ancestors[lvl]->walk_list))
			break;
	}

	/* walk down and visit the inner nodes to get pre-order traversal */
	while (++lvl <= iocg->level - 1) {
		struct ioc_gq *inner = iocg->ancestors[lvl];

		/* record traversal order */
		list_add_tail(&inner->walk_list, inner_walk);
	}
}

/* propagate the deltas to the parent */
static void iocg_flush_stat_upward(struct ioc_gq *iocg)
{
	if (iocg->level > 0) {
		struct iocg_stat *parent_stat =
			&iocg->ancestors[iocg->level - 1]->stat;

		parent_stat->usage_us +=
			iocg->stat.usage_us - iocg->last_stat.usage_us;
		parent_stat->wait_us +=
			iocg->stat.wait_us - iocg->last_stat.wait_us;
		parent_stat->indebt_us +=
			iocg->stat.indebt_us - iocg->last_stat.indebt_us;
		parent_stat->indelay_us +=
			iocg->stat.indelay_us - iocg->last_stat.indelay_us;
	}

	iocg->last_stat = iocg->stat;
}

/* collect per-cpu counters and propagate the deltas to the parent */
static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now)
{
	struct ioc *ioc = iocg->ioc;
	u64 abs_vusage = 0;
	u64 vusage_delta;
	int cpu;

	lockdep_assert_held(&iocg->ioc->lock);

	/* collect per-cpu counters */
	for_each_possible_cpu(cpu) {
		abs_vusage += local64_read(
				per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
	}
	vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
	iocg->last_stat_abs_vusage = abs_vusage;

	iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
	iocg->stat.usage_us += iocg->usage_delta_us;

	iocg_flush_stat_upward(iocg);
}

/* get stat counters ready for reading on all active iocgs */
static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
{
	LIST_HEAD(inner_walk);
	struct ioc_gq *iocg, *tiocg;

	/* flush leaves and build inner node walk list */
	list_for_each_entry(iocg, target_iocgs, active_list) {
		iocg_flush_stat_leaf(iocg, now);
		iocg_build_inner_walk(iocg, &inner_walk);
	}

	/* keep flushing upwards by walking the inner list backwards */
	list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
		iocg_flush_stat_upward(iocg);
		list_del_init(&iocg->walk_list);
	}
}

/*
 * Determine what @iocg's hweight_inuse should be after donating unused
 * capacity.  @hwm is the upper bound and used to signal no donation.  This
 * function also throws away @iocg's excess budget.
 */
static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
				  u32 usage, struct ioc_now *now)
{
	struct ioc *ioc = iocg->ioc;
	u64 vtime = atomic64_read(&iocg->vtime);
	s64 excess, delta, target, new_hwi;

	/* debt handling owns inuse for debtors */
	if (iocg->abs_vdebt)
		return 1;

	/* see whether minimum margin requirement is met */
	if (waitqueue_active(&iocg->waitq) ||
	    time_after64(vtime, now->vnow - ioc->margins.min))
		return hwm;

	/* throw away excess above target */
	excess = now->vnow - vtime - ioc->margins.target;
	if (excess > 0) {
		atomic64_add(excess, &iocg->vtime);
		atomic64_add(excess, &iocg->done_vtime);
		vtime += excess;
		ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
	}

	/*
	 * Let's say the distance between iocg's and device's vtimes as a
	 * fraction of period duration is delta.  Assuming that the iocg
	 * will consume the usage determined above, we want to determine
	 * new_hwi so that delta equals MARGIN_TARGET at the end of the next
	 * period.
	 *
	 * We need to execute usage worth of IOs while spending the sum of
	 * the new budget (1 - MARGIN_TARGET) and the leftover from the last
	 * period (delta):
	 *
	 *   usage = (1 - MARGIN_TARGET + delta) * new_hwi
	 *
	 * Therefore, the new_hwi is:
	 *
	 *   new_hwi = usage / (1 - MARGIN_TARGET + delta)
	 */
	delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
			  now->vnow - ioc->period_at_vtime);
	target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
	new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);

	return clamp_t(s64, new_hwi, 1, hwm);
}
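
/*
 * Plugging numbers in (illustrative): if @iocg used 20% of the device
 * (usage = 0.2 * WEIGHT_ONE) and its vtime sits exactly at vnow
 * (delta = 0), then new_hwi = 0.2 / (1 - 0.5) = 0.4 * WEIGHT_ONE - it keeps
 * twice its observed usage so that, after consuming at the same rate next
 * period, its budget lands right on the 50% margin target.
 */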

/*
 * Distribute the donations among the recipients.  In outline: leaves have
 * declared how much hierarchical share they keep (hweight_after_donation)
 * vs. give up (hweight_donating).  Donations are first summed up the tree,
 * a global compensation ratio
 *
 *   gamma = (1 - t_r') / (1 - t_r)
 *
 * is derived from the root's donating (t_r) and post-donation (t_r')
 * fractions, and then a top-down pass computes each inner node's
 * post-donation hweight_inuse and adjusted child weight sum
 * (child_adjusted_sum) from which the new leaf inuse values follow.
 */
static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
{
	LIST_HEAD(over_hwa);
	LIST_HEAD(inner_walk);
	struct ioc_gq *iocg, *tiocg, *root_iocg;
	u32 after_sum, over_sum, over_target, gamma;

	/*
	 * It's pretty unlikely but possible for the total sum of
	 * hweight_after_donation's to be higher than WEIGHT_ONE, which will
	 * confuse the following calculations.  If such condition is
	 * detected, scale down everyone over its full share equally to keep
	 * the sum below WEIGHT_ONE.
	 */
	after_sum = 0;
	over_sum = 0;
	list_for_each_entry(iocg, surpluses, surplus_list) {
		u32 hwa;

		current_hweight(iocg, &hwa, NULL);
		after_sum += iocg->hweight_after_donation;

		if (iocg->hweight_after_donation > hwa) {
			over_sum += iocg->hweight_after_donation;
			list_add(&iocg->walk_list, &over_hwa);
		}
	}

	if (after_sum >= WEIGHT_ONE) {
		/*
		 * The delta should be deducted from the over_sum, calculate
		 * target over_sum value.
		 */
		u32 over_delta = after_sum - (WEIGHT_ONE - 1);
		WARN_ON_ONCE(over_sum <= over_delta);
		over_target = over_sum - over_delta;
	} else {
		over_target = 0;
	}

	list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
		if (over_target)
			iocg->hweight_after_donation =
				div_u64((u64)iocg->hweight_after_donation *
					over_target, over_sum);
		list_del_init(&iocg->walk_list);
	}

	/*
	 * Build pre-order inner node walk list and prepare for donation
	 * adjustment calculations.
	 */
	list_for_each_entry(iocg, surpluses, surplus_list) {
		iocg_build_inner_walk(iocg, &inner_walk);
	}

	root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
	WARN_ON_ONCE(root_iocg->level > 0);

	list_for_each_entry(iocg, &inner_walk, walk_list) {
		iocg->child_adjusted_sum = 0;
		iocg->hweight_donating = 0;
		iocg->hweight_after_donation = 0;
	}

	/*
	 * Propagate the donating budget (hweight_donating) and the target
	 * budget (hweight_after_donation) up the hierarchy.
	 */
	list_for_each_entry(iocg, surpluses, surplus_list) {
		struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];

		parent->hweight_donating += iocg->hweight_donating;
		parent->hweight_after_donation += iocg->hweight_after_donation;
	}

	list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
		if (iocg->level > 0) {
			struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];

			parent->hweight_donating += iocg->hweight_donating;
			parent->hweight_after_donation += iocg->hweight_after_donation;
		}
	}

	/*
	 * Calculate inner hwa's and make sure the donation values are
	 * within the accepted ranges as we're doing low resolution
	 * calculations with round ups.
	 */
	list_for_each_entry(iocg, &inner_walk, walk_list) {
		if (iocg->level) {
			struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];

			iocg->hweight_active = DIV64_U64_ROUND_UP(
				(u64)parent->hweight_active * iocg->active,
				parent->child_active_sum);

		}

		iocg->hweight_donating = min(iocg->hweight_donating,
					     iocg->hweight_active);
		iocg->hweight_after_donation = min(iocg->hweight_after_donation,
						   iocg->hweight_donating - 1);
		if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
				 iocg->hweight_donating <= 1 ||
				 iocg->hweight_after_donation == 0)) {
			pr_warn("iocg: invalid donation weights in ");
			pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
			pr_cont(": active=%u donating=%u after=%u\n",
				iocg->hweight_active, iocg->hweight_donating,
				iocg->hweight_after_donation);
		}
	}

	/*
	 * Calculate the global donation rate (gamma) - the rate to adjust
	 * non-donating budgets by.  No one on the donation path should have
	 * a donating budget of WEIGHT_ONE or above; the min() guards against
	 * divide-by-zero on bogus input.
	 *
	 *   gamma = (1 - t_r') / (1 - t_r)
	 */
	gamma = DIV_ROUND_UP(
		(WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
		WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1));

	/*
	 * Calculate adjusted hwi, child_adjusted_sum and inuse for the
	 * inner nodes.
	 */
	list_for_each_entry(iocg, &inner_walk, walk_list) {
		struct ioc_gq *parent;
		u32 inuse, wpt, wptp;
		u64 st, sf;

		if (iocg->level == 0) {
			/* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
			iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
				iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
				WEIGHT_ONE - iocg->hweight_after_donation);
			continue;
		}

		parent = iocg->ancestors[iocg->level - 1];

		/* b' = gamma * b_f + b_t' */
		iocg->hweight_inuse = DIV64_U64_ROUND_UP(
			(u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
			WEIGHT_ONE) + iocg->hweight_after_donation;

		/* w' = s' * b' / b'_p */
		inuse = DIV64_U64_ROUND_UP(
			(u64)parent->child_adjusted_sum * iocg->hweight_inuse,
			parent->hweight_inuse);

		/* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
		st = DIV64_U64_ROUND_UP(
			iocg->child_active_sum * iocg->hweight_donating,
			iocg->hweight_active);
		sf = iocg->child_active_sum - st;
		wpt = DIV64_U64_ROUND_UP(
			(u64)iocg->active * iocg->hweight_donating,
			iocg->hweight_active);
		wptp = DIV64_U64_ROUND_UP(
			(u64)inuse * iocg->hweight_after_donation,
			iocg->hweight_inuse);

		iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
	}

	/*
	 * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum
	 * and we can finally determine leaf adjustments.
	 */
	list_for_each_entry(iocg, surpluses, surplus_list) {
		struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
		u32 inuse;

		/*
		 * In-debt iocgs participated in the donation calculation
		 * with the minimum target hweight_after_donation.
		 * Configuring inuse accordingly would work fine but debt
		 * handling expects @iocg->inuse to stay at the minimum and
		 * we don't wanna interfere.
		 */
		if (iocg->abs_vdebt) {
			WARN_ON_ONCE(iocg->inuse > 1);
			continue;
		}

		/* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
		inuse = DIV64_U64_ROUND_UP(
			parent->child_adjusted_sum * iocg->hweight_after_donation,
			parent->hweight_inuse);

		TRACE_IOCG_PATH(inuse_transfer, iocg, now,
				iocg->inuse, inuse,
				iocg->hweight_inuse,
				iocg->hweight_after_donation);

		__propagate_weights(iocg, iocg->active, inuse, true, now);
	}

	/* walk list should be dissolved after use */
	list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
		list_del_init(&iocg->walk_list);
}

/*
 * A low weight iocg can amass a large amount of debt, e.g. when anonymous
 * memory gets reclaimed aggressively.  If the device stays saturated, debt
 * payment is as fast as it can be and there's nothing better to do.
 * However, if the device becomes substantially idle while debtors are still
 * being throttled, the debts are doing nothing but harm.  Monitor
 * device-wide usage and, whenever it stays below DFGV_USAGE_PCT for a full
 * DFGV_PERIOD, halve outstanding debts and delays once per elapsed
 * forgiveness period.
 */
static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
			      struct ioc_now *now)
{
	struct ioc_gq *iocg;
	u64 dur, usage_pct, nr_cycles;

	/* if no debtor, reset the cycle */
	if (!nr_debtors) {
		ioc->dfgv_period_at = now->now;
		ioc->dfgv_period_rem = 0;
		ioc->dfgv_usage_us_sum = 0;
		return;
	}

	/*
	 * Debtors can pass through a lot of writes choking the device and
	 * we don't want to be forgiving debts while the device is
	 * struggling from write bursts.  If we're missing latency targets,
	 * consider the device fully utilized.
	 */
	if (ioc->busy_level > 0)
		usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us);

	ioc->dfgv_usage_us_sum += usage_us_sum;
	if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD))
		return;

	/*
	 * At least DFGV_PERIOD has passed since the last period.  Calculate
	 * the average usage and reset the period counters.
	 */
	dur = now->now - ioc->dfgv_period_at;
	usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur);

	ioc->dfgv_period_at = now->now;
	ioc->dfgv_usage_us_sum = 0;

	/* if was too busy, reset everything */
	if (usage_pct > DFGV_USAGE_PCT) {
		ioc->dfgv_period_rem = 0;
		return;
	}

	/*
	 * Usage is lower than threshold.  Let's forgive some debts.  Debt
	 * forgiveness runs off of the usual timer but the period means
	 * nothing to debtors, so compensate by halving debts once per
	 * elapsed forgiveness period, carrying the remainder into the next
	 * cycle.
	 */
	nr_cycles = dur + ioc->dfgv_period_rem;
	ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD);

	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
		u64 __maybe_unused old_debt, __maybe_unused old_delay;

		if (!iocg->abs_vdebt && !iocg->delay)
			continue;

		spin_lock(&iocg->waitq.lock);

		old_debt = iocg->abs_vdebt;
		old_delay = iocg->delay;

		if (iocg->abs_vdebt)
			iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
		if (iocg->delay)
			iocg->delay = iocg->delay >> nr_cycles ?: 1;

		iocg_kick_waitq(iocg, true, now);

		TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct,
				old_debt, iocg->abs_vdebt,
				old_delay, iocg->delay);

		spin_unlock(&iocg->waitq.lock);
	}
}
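
/*
 * Example (illustrative): if 250ms elapsed since the last forgiveness
 * checkpoint with usage below 50%, nr_cycles = 250ms / DFGV_PERIOD = 2 with
 * a 50ms remainder carried over, and a debt of 1000 becomes 1000 >> 2 = 250.
 * Debts and delays are floored at 1 rather than cleared, so the debtor
 * stays in debt-handling mode until iocg_kick_waitq() pays off the rest.
 */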

/*
 * Check the active iocgs' state to avoid oversleeping and deactivate idle
 * iocgs.
 *
 * Since waiters determine the sleep durations based on the vrate they saw
 * at the time of sleep, if vrate has increased, some waiters could be
 * sleeping for too long.  Wake up tardy waiters which should have woken up
 * in the last period and expire idle iocgs.
 */
static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
{
	int nr_debtors = 0;
	struct ioc_gq *iocg, *tiocg;

	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
		if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
		    !iocg->delay && !iocg_is_idle(iocg))
			continue;

		spin_lock(&iocg->waitq.lock);

		/* flush wait and indebt stat deltas */
		if (iocg->wait_since) {
			iocg->stat.wait_us += now->now - iocg->wait_since;
			iocg->wait_since = now->now;
		}
		if (iocg->indebt_since) {
			iocg->stat.indebt_us +=
				now->now - iocg->indebt_since;
			iocg->indebt_since = now->now;
		}
		if (iocg->indelay_since) {
			iocg->stat.indelay_us +=
				now->now - iocg->indelay_since;
			iocg->indelay_since = now->now;
		}

		if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
		    iocg->delay) {
			/* might be oversleeping vtime / hweight changes, kick */
			iocg_kick_waitq(iocg, true, now);
			if (iocg->abs_vdebt || iocg->delay)
				nr_debtors++;
		} else if (iocg_is_idle(iocg)) {
			/* no waiter and idle, deactivate */
			u64 vtime = atomic64_read(&iocg->vtime);
			s64 excess;

			/*
			 * @iocg has been inactive for a full duration and
			 * will have a high budget.  Account anything above
			 * target as error and throw away.  On reactivation,
			 * it'll start with the target budget.
			 */
			excess = now->vnow - vtime - ioc->margins.target;
			if (excess > 0) {
				u32 old_hwi;

				current_hweight(iocg, NULL, &old_hwi);
				ioc->vtime_err -= div64_u64(excess * old_hwi,
							    WEIGHT_ONE);
			}

			TRACE_IOCG_PATH(iocg_idle, iocg, now,
					atomic64_read(&iocg->active_period),
					atomic64_read(&ioc->cur_period), vtime);
			__propagate_weights(iocg, 0, 0, false, now);
			list_del_init(&iocg->active_list);
		}

		spin_unlock(&iocg->waitq.lock);
	}

	commit_weights(ioc);
	return nr_debtors;
}

static void ioc_timer_fn(struct timer_list *timer)
{
	struct ioc *ioc = container_of(timer, struct ioc, timer);
	struct ioc_gq *iocg, *tiocg;
	struct ioc_now now;
	LIST_HEAD(surpluses);
	int nr_debtors, nr_shortages = 0, nr_lagging = 0;
	u64 usage_us_sum = 0;
	u32 ppm_rthr;
	u32 ppm_wthr;
	u32 missed_ppm[2], rq_wait_pct;
	u64 period_vtime;
	int prev_busy_level;

	/* how were the latencies during the period? */
	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);

	/* take care of active iocgs */
	spin_lock_irq(&ioc->lock);

	ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
	ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
	ioc_now(ioc, &now);

	period_vtime = now.vnow - ioc->period_at_vtime;
	if (WARN_ON_ONCE(!period_vtime)) {
		spin_unlock_irq(&ioc->lock);
		return;
	}

	nr_debtors = ioc_check_iocgs(ioc, &now);

	/*
	 * Wait and indebt stat are flushed above and the donation
	 * calculation below needs updated usage stat.  Let's bring stat
	 * up-to-date.
	 */
	iocg_flush_stat(&ioc->active_iocgs, &now);

	/* calc usage and see whether some weights need to be moved around */
	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
		u64 vdone, vtime, usage_us;
		u32 hw_active, hw_inuse;

		/*
		 * Collect unused and wind vtime closer to vnow to prevent
		 * iocgs from accumulating a large amount of budget.
		 */
		vdone = atomic64_read(&iocg->done_vtime);
		vtime = atomic64_read(&iocg->vtime);
		current_hweight(iocg, &hw_active, &hw_inuse);

		/*
		 * Latency QoS detection doesn't account for IOs which are
		 * in-flight for longer than a period.  Detect them by
		 * comparing vdone against period start.  If lagging behind
		 * device vtime, the delta represents the cost of in-flight
		 * IOs.
		 */
		if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
		    !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
		    time_after64(vtime, vdone) &&
		    time_after64(vtime, now.vnow -
				 MAX_LAGGING_PERIODS * period_vtime) &&
		    time_before64(vdone, now.vnow - period_vtime))
			nr_lagging++;

		/*
		 * Determine absolute usage factoring in in-flight IOs to
		 * avoid high-latency completions appearing as idle.
		 */
		usage_us = iocg->usage_delta_us;
		usage_us_sum += usage_us;

		/* see whether there's surplus vtime */
		WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
		if (hw_inuse < hw_active ||
		    (!waitqueue_active(&iocg->waitq) &&
		     time_before64(vtime, now.vnow - ioc->margins.low))) {
			u32 hwa, old_hwi, hwm, new_hwi, usage;
			u64 usage_dur;

			if (vdone != vtime) {
				u64 inflight_us = DIV64_U64_ROUND_UP(
					cost_to_abs_cost(vtime - vdone, hw_inuse),
					ioc->vtime_base_rate);

				usage_us = max(usage_us, inflight_us);
			}

			/* convert to hweight based usage ratio */
			if (time_after64(iocg->activated_at, ioc->period_at))
				usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
			else
				usage_dur = max_t(u64, now.now - ioc->period_at, 1);

			usage = clamp_t(u32,
				DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
						   usage_dur),
				1, WEIGHT_ONE);

			/*
			 * Already donating or accumulated enough to start.
			 * Determine the donation amount.
			 */
			current_hweight(iocg, &hwa, &old_hwi);
			hwm = current_hweight_max(iocg);
			new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
							 usage, &now);
			/*
			 * Donation calculation assumes hweight_after_donation
			 * to be positive, a condition that a donor w/ hwa < 2
			 * can't meet.  Don't bother with donation if hwa is
			 * below 2.  It's not gonna make a meaningful
			 * difference anyway.
			 */
			if (new_hwi < hwm && hwa >= 2) {
				iocg->hweight_donating = hwa;
				iocg->hweight_after_donation = new_hwi;
				list_add(&iocg->surplus_list, &surpluses);
			} else if (!iocg->abs_vdebt) {
				/*
				 * @iocg doesn't have enough to donate.  Reset
				 * its inuse to active.
				 *
				 * Don't reset debtors as their inuse's are
				 * owned by debt handling.  This shouldn't
				 * affect donation calculation in any
				 * meaningful way as @iocg doesn't have a
				 * meaningful amount of share anyway.
				 */
				TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
						iocg->inuse, iocg->active,
						iocg->hweight_inuse, new_hwi);

				__propagate_weights(iocg, iocg->active,
						    iocg->active, true, &now);
				nr_shortages++;
			}
		} else {
			/* genuinely short on vtime */
			nr_shortages++;
		}
	}

	if (!list_empty(&surpluses) && nr_shortages)
		transfer_surpluses(&surpluses, &now);

	commit_weights(ioc);

	/* surplus list should be dissolved after use */
	list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
		list_del_init(&iocg->surplus_list);

	/*
	 * If q is getting clogged or we're missing too much, we're issuing
	 * too much IO and should lower vtime rate.  If we're not missing
	 * and experiencing shortages but not surpluses, we're too quiet and
	 * should increase vtime rate.
	 */
	prev_busy_level = ioc->busy_level;
	if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
	    missed_ppm[READ] > ppm_rthr ||
	    missed_ppm[WRITE] > ppm_wthr) {
		/* clearly missing QoS targets, slow down vrate */
		ioc->busy_level = max(ioc->busy_level, 0);
		ioc->busy_level++;
	} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
		   missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
		   missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
		/* QoS targets are being met with >25% margin */
		if (nr_shortages) {
			/*
			 * We're throttling while the device has spare
			 * capacity.  If vrate was being slowed down, stop.
			 */
			ioc->busy_level = min(ioc->busy_level, 0);

			/*
			 * If there are IOs spanning multiple periods, wait
			 * them out before pushing the device harder.
			 */
			if (!nr_lagging)
				ioc->busy_level--;
		} else {
			/*
			 * Nobody is being throttled and the users aren't
			 * issuing enough IOs to saturate the device.  We
			 * simply don't know how close the device is to
			 * saturation.  Coast.
			 */
			ioc->busy_level = 0;
		}
	} else {
		/* inside the hysteresis margin, we're good */
		ioc->busy_level = 0;
	}

	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);

	ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages,
			      prev_busy_level, missed_ppm);

	ioc_refresh_params(ioc, false);

	ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now);

	/*
	 * This period is done.  Move onto the next one.  If nothing's
	 * going on with the device, stop the timer.
	 */
	atomic64_inc(&ioc->cur_period);

	if (ioc->running != IOC_STOP) {
		if (!list_empty(&ioc->active_iocgs)) {
			ioc_start_period(ioc, &now);
		} else {
			ioc->busy_level = 0;
			ioc->vtime_err = 0;
			ioc->running = IOC_IDLE;
		}

		ioc_refresh_vrate(ioc, &now);
	}

	spin_unlock_irq(&ioc->lock);
}

static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
				      u64 abs_cost, struct ioc_now *now)
{
	struct ioc *ioc = iocg->ioc;
	struct ioc_margins *margins = &ioc->margins;
	u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi;
	u32 hwi, adj_step;
	s64 margin;
	u64 cost, new_inuse;
	unsigned long flags;

	current_hweight(iocg, NULL, &hwi);
	old_hwi = hwi;
	cost = abs_cost_to_cost(abs_cost, hwi);
	margin = now->vnow - vtime - cost;

	/* debt handling owns inuse for debtors */
	if (iocg->abs_vdebt)
		return cost;

	/*
	 * We only increase inuse during period and do so if the margin has
	 * deteriorated since the previous adjustment.
	 */
	if (margin >= iocg->saved_margin || margin >= margins->low ||
	    iocg->inuse == iocg->active)
		return cost;

	spin_lock_irqsave(&ioc->lock, flags);

	/* we own inuse only when @iocg is in the normal active state */
	if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
		spin_unlock_irqrestore(&ioc->lock, flags);
		return cost;
	}

	/*
	 * Bump up inuse till @abs_cost fits in the existing budget.
	 * adj_step must be determined after acquiring ioc->lock - we might
	 * have raced and lost to another thread for activation and could
	 * be reading 0 iocg->active before ioc->lock which will lead to
	 * infinite loop.
	 */
	new_inuse = iocg->inuse;
	adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100);
	do {
		new_inuse = new_inuse + adj_step;
		propagate_weights(iocg, iocg->active, new_inuse, true, now);
		current_hweight(iocg, NULL, &hwi);
		cost = abs_cost_to_cost(abs_cost, hwi);
	} while (time_after64(vtime + cost, now->vnow) &&
		 iocg->inuse != iocg->active);

	spin_unlock_irqrestore(&ioc->lock, flags);

	TRACE_IOCG_PATH(inuse_adjust, iocg, now,
			old_inuse, iocg->inuse, old_hwi, hwi);

	return cost;
}
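
/*
 * Stepping example (illustrative): with active = 100 and
 * INUSE_ADJ_STEP_PCT = 25, adj_step is 25, so a donated-down inuse of 25
 * climbs 50 -> 75 -> 100 until either the recomputed cost fits within the
 * available vtime budget or inuse is fully restored to active.
 */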

static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
				    bool is_merge, u64 *costp)
{
	struct ioc *ioc = iocg->ioc;
	u64 coef_seqio, coef_randio, coef_page;
	u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
	u64 seek_pages = 0;
	u64 cost = 0;

	/* Can't calculate cost for empty bio */
	if (!bio->bi_iter.bi_size)
		goto out;

	switch (bio_op(bio)) {
	case REQ_OP_READ:
		coef_seqio	= ioc->params.lcoefs[LCOEF_RSEQIO];
		coef_randio	= ioc->params.lcoefs[LCOEF_RRANDIO];
		coef_page	= ioc->params.lcoefs[LCOEF_RPAGE];
		break;
	case REQ_OP_WRITE:
		coef_seqio	= ioc->params.lcoefs[LCOEF_WSEQIO];
		coef_randio	= ioc->params.lcoefs[LCOEF_WRANDIO];
		coef_page	= ioc->params.lcoefs[LCOEF_WPAGE];
		break;
	default:
		goto out;
	}

	if (iocg->cursor) {
		seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
		seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
	}

	if (!is_merge) {
		if (seek_pages > LCOEF_RANDIO_PAGES) {
			cost += coef_randio;
		} else {
			cost += coef_seqio;
		}
	}
	cost += pages * coef_page;
out:
	*costp = cost;
}
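
/*
 * Cost example (illustrative): a 64KiB read is 128 sectors, i.e.
 * pages = 128 >> IOC_SECT_TO_PAGE_SHIFT = 16.  If it starts within
 * LCOEF_RANDIO_PAGES (4096 pages = 16MiB) of the previous IO's end it is
 * charged coef_seqio + 16 * coef_page; a longer seek charges coef_randio
 * instead of coef_seqio.  Merges (is_merge) skip the per-IO component and
 * only pay the size-proportional page cost.
 */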

static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
{
	u64 cost;

	calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
	return cost;
}

static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
					 u64 *costp)
{
	unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;

	switch (req_op(rq)) {
	case REQ_OP_READ:
		*costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
		break;
	case REQ_OP_WRITE:
		*costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
		break;
	default:
		*costp = 0;
	}
}

static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
{
	u64 cost;

	calc_size_vtime_cost_builtin(rq, ioc, &cost);
	return cost;
}

static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
{
	struct blkcg_gq *blkg = bio->bi_blkg;
	struct ioc *ioc = rqos_to_ioc(rqos);
	struct ioc_gq *iocg = blkg_to_iocg(blkg);
	struct ioc_now now;
	struct iocg_wait wait;
	u64 abs_cost, cost, vtime;
	bool use_debt, ioc_locked;
	unsigned long flags;

	/* bypass IOs if disabled, still initializing, or for root cgroup */
	if (!ioc->enabled || !iocg || !iocg->level)
		return;

	/* calculate the absolute vtime cost */
	abs_cost = calc_vtime_cost(bio, iocg, false);
	if (!abs_cost)
		return;

	if (!iocg_activate(iocg, &now))
		return;

	iocg->cursor = bio_end_sector(bio);
	vtime = atomic64_read(&iocg->vtime);
	cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);

	/*
	 * If no one's waiting and within budget, issue right away.  The
	 * tests are racy but the races aren't systemic - we only miss once
	 * in a while which is fine.
	 */
	if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
	    time_before_eq64(vtime + cost, now.vnow)) {
		iocg_commit_bio(iocg, bio, abs_cost, cost);
		return;
	}

	/*
	 * We're over budget.  This can be handled in two ways.  IOs which
	 * may cause priority inversions are charged as debt and let
	 * through.  Otherwise, the issuer is blocked on @iocg->waitq.  Debt
	 * handling requires @ioc->lock, waitq handling @iocg->waitq.lock.
	 * Determine the eventual lock to be grabbed and grab it.
	 */
	use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
	ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
retry_lock:
	iocg_lock(iocg, ioc_locked, &flags);

	/*
	 * @iocg must stay activated for debt and waitq handling.
	 * Deactivation is synchronized against both ioc->lock and
	 * waitq.lock and we won't get deactivated as long as we're waiting
	 * or have debt, so we're good if we're activated here.  In the
	 * unlikely cases that we aren't, just issue the IO.
	 */
	if (unlikely(list_empty(&iocg->active_list))) {
		iocg_unlock(iocg, ioc_locked, &flags);
		iocg_commit_bio(iocg, bio, abs_cost, cost);
		return;
	}

	/*
	 * We're over budget.  If @bio has to be issued regardless, remember
	 * the abs_cost instead of advancing vtime.  iocg_kick_waitq() will
	 * pay off the debt before waking more IOs.
	 *
	 * This way, the debt is continuously paid off each period with the
	 * actual budget available to the cgroup.  If we just wound vtime,
	 * we would incorrectly use the current hw_inuse for the entire
	 * amount which, for example, can lead to the cgroup staying
	 * blocked for a long time even with substantially raised hw_inuse.
	 *
	 * An iocg with vdebt should stay online so that the timer can keep
	 * deducting its vdebt and [de]activate use_delay mechanism
	 * accordingly.  We don't want to race against the timer trying to
	 * clear them and leave @iocg inactive w/ dangling use_delay heavily
	 * punishing the cgroup and system.
	 */
	if (use_debt) {
		iocg_incur_debt(iocg, abs_cost, &now);
		if (iocg_kick_delay(iocg, &now))
			blkcg_schedule_throttle(rqos->disk,
					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
		iocg_unlock(iocg, ioc_locked, &flags);
		return;
	}

	/* guarantee that iocgs w/ waiters have maximum inuse */
	if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
		if (!ioc_locked) {
			iocg_unlock(iocg, false, &flags);
			ioc_locked = true;
			goto retry_lock;
		}
		propagate_weights(iocg, iocg->active, iocg->active, true,
				  &now);
	}

	/*
	 * Append self to the waitq and schedule the wakeup timer if we're
	 * the first waiter.  The timer duration is calculated based on the
	 * current vrate.  vtime and hweight changes can make it too short
	 * or too long.  Each wait entry records the absolute cost it's
	 * waiting for to allow re-evaluation using a custom wait entry.
	 *
	 * If too short, the timer simply reschedules itself.  If too long,
	 * the period timer will notice and trigger wakeups.
	 *
	 * All waiters are on iocg->waitq and the wait states are
	 * synchronized using waitq.lock.
	 */
	init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
	wait.wait.private = current;
	wait.bio = bio;
	wait.abs_cost = abs_cost;
	wait.committed = false;	/* will be set true by waker */

	__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
	iocg_kick_waitq(iocg, ioc_locked, &now);

	iocg_unlock(iocg, ioc_locked, &flags);

	while (true) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (wait.committed)
			break;
		io_schedule();
	}

	/* waker already committed us, proceed */
	finish_wait(&iocg->waitq, &wait.wait);
}
static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
			   struct bio *bio)
{
	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
	struct ioc *ioc = rqos_to_ioc(rqos);
	sector_t bio_end = bio_end_sector(bio);
	struct ioc_now now;
	u64 vtime, abs_cost, cost;
	unsigned long flags;

	/* bypass if disabled, still initializing, or for the root cgroup */
	if (!ioc->enabled || !iocg || !iocg->level)
		return;

	abs_cost = calc_vtime_cost(bio, iocg, true);
	if (!abs_cost)
		return;

	ioc_now(ioc, &now);

	vtime = atomic64_read(&iocg->vtime);
	cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);

	/* update cursor if backmerging into the request at the cursor */
	if (blk_rq_pos(rq) < bio_end &&
	    blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
		iocg->cursor = bio_end;

	/*
	 * Charge if there's enough vtime budget and the existing request
	 * has cost assigned.
	 */
	if (rq->bio && rq->bio->bi_iocost_cost &&
	    time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
		iocg_commit_bio(iocg, bio, abs_cost, cost);
		return;
	}

	/*
	 * Otherwise, account it as debt if @iocg is online, which it should
	 * be for the vast majority of cases.  See debt handling in
	 * ioc_rqos_throttle() for details.
	 */
	spin_lock_irqsave(&ioc->lock, flags);
	spin_lock(&iocg->waitq.lock);

	if (likely(!list_empty(&iocg->active_list))) {
		iocg_incur_debt(iocg, abs_cost, &now);
		if (iocg_kick_delay(iocg, &now))
			blkcg_schedule_throttle(rqos->disk,
					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
	} else {
		iocg_commit_bio(iocg, bio, abs_cost, cost);
	}

	spin_unlock(&iocg->waitq.lock);
	spin_unlock_irqrestore(&ioc->lock, flags);
}

static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
{
	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);

	/* move the bio's charged cost over to done_vtime on completion */
	if (iocg && bio->bi_iocost_cost)
		atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
}

static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
{
	struct ioc *ioc = rqos_to_ioc(rqos);
	struct ioc_pcpu_stat *ccs;
	u64 on_q_ns, rq_wait_ns, size_nsec;
	int pidx, rw;

	if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
		return;

	switch (req_op(rq)) {
	case REQ_OP_READ:
		pidx = QOS_RLAT;
		rw = READ;
		break;
	case REQ_OP_WRITE:
		pidx = QOS_WLAT;
		rw = WRITE;
		break;
	default:
		return;
	}

	on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
	rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
	size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);

	ccs = get_cpu_ptr(ioc->pcpu_stat);

	/*
	 * Count the completion as met if the on-queue time minus the
	 * size-proportional cost is within the configured latency target.
	 */
	if (on_q_ns <= size_nsec ||
	    on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
		local_inc(&ccs->missed[rw].nr_met);
	else
		local_inc(&ccs->missed[rw].nr_missed);

	local64_add(rq_wait_ns, &ccs->rq_wait_ns);

	put_cpu_ptr(ccs);
}

static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
{
	struct ioc *ioc = rqos_to_ioc(rqos);

	/* recompute the operating parameters when the queue depth changes */
	spin_lock_irq(&ioc->lock);
	ioc_refresh_params(ioc, false);
	spin_unlock_irq(&ioc->lock);
}

static void ioc_rqos_exit(struct rq_qos *rqos)
{
	struct ioc *ioc = rqos_to_ioc(rqos);

	blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost);

	/* stop the period timer before tearing everything down */
	spin_lock_irq(&ioc->lock);
	ioc->running = IOC_STOP;
	spin_unlock_irq(&ioc->lock);

	timer_shutdown_sync(&ioc->timer);
	free_percpu(ioc->pcpu_stat);
	kfree(ioc);
}

static const struct rq_qos_ops ioc_rqos_ops = {
	.throttle = ioc_rqos_throttle,
	.merge = ioc_rqos_merge,
	.done_bio = ioc_rqos_done_bio,
	.done = ioc_rqos_done,
	.queue_depth_changed = ioc_rqos_queue_depth_changed,
	.exit = ioc_rqos_exit,
};
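
/*
 * Note that initialization is lazy: blk_iocost_init() runs the first time
 * io.cost.qos or io.cost.model is written for a disk rather than at disk
 * registration.  Until then q_to_ioc() returns NULL and bios bypass the
 * controller entirely.
 */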
static int blk_iocost_init(struct gendisk *disk)
{
	struct ioc *ioc;
	int i, cpu, ret;

	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
	if (!ioc)
		return -ENOMEM;

	ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
	if (!ioc->pcpu_stat) {
		kfree(ioc);
		return -ENOMEM;
	}

	for_each_possible_cpu(cpu) {
		struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);

		for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
			local_set(&ccs->missed[i].nr_met, 0);
			local_set(&ccs->missed[i].nr_missed, 0);
		}
		local64_set(&ccs->rq_wait_ns, 0);
	}

	spin_lock_init(&ioc->lock);
	timer_setup(&ioc->timer, ioc_timer_fn, 0);
	INIT_LIST_HEAD(&ioc->active_iocgs);

	ioc->running = IOC_IDLE;
	ioc->vtime_base_rate = VTIME_PER_USEC;
	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
	seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
	ioc->period_at = ktime_to_us(ktime_get());
	atomic64_set(&ioc->cur_period, 0);
	atomic_set(&ioc->hweight_gen, 0);

	spin_lock_irq(&ioc->lock);
	ioc->autop_idx = AUTOP_INVALID;
	ioc_refresh_params_disk(ioc, true, disk);
	spin_unlock_irq(&ioc->lock);

	/*
	 * rqos must be added before blkcg_activate_policy() so that
	 * ioc_pd_init() can look up the ioc from the queue.  This also
	 * means the rqos methods may run before activation completes, so
	 * they can't assume the target bio has an iocg and must check.
	 */
	ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
	if (ret)
		goto err_free_ioc;

	ret = blkcg_activate_policy(disk, &blkcg_policy_iocost);
	if (ret)
		goto err_del_qos;
	return 0;

err_del_qos:
	rq_qos_del(&ioc->rqos);
err_free_ioc:
	free_percpu(ioc->pcpu_stat);
	kfree(ioc);
	return ret;
}

static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
{
	struct ioc_cgrp *iocc;

	iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
	if (!iocc)
		return NULL;

	iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
	return &iocc->cpd;
}

static void ioc_cpd_free(struct blkcg_policy_data *cpd)
{
	kfree(container_of(cpd, struct ioc_cgrp, cpd));
}

static struct blkg_policy_data *ioc_pd_alloc(struct gendisk *disk,
					     struct blkcg *blkcg, gfp_t gfp)
{
	int levels = blkcg->css.cgroup->level + 1;	/* self + all ancestors */
	struct ioc_gq *iocg;

	iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp,
			    disk->node_id);
	if (!iocg)
		return NULL;

	iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
	if (!iocg->pcpu_stat) {
		kfree(iocg);
		return NULL;
	}

	return &iocg->pd;
}

static void ioc_pd_init(struct blkg_policy_data *pd)
{
	struct ioc_gq *iocg = pd_to_iocg(pd);
	struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
	struct ioc *ioc = q_to_ioc(blkg->q);
	struct ioc_now now;
	struct blkcg_gq *tblkg;
	unsigned long flags;

	ioc_now(ioc, &now);

	iocg->ioc = ioc;
	atomic64_set(&iocg->vtime, now.vnow);
	atomic64_set(&iocg->done_vtime, now.vnow);
	atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
	INIT_LIST_HEAD(&iocg->active_list);
	INIT_LIST_HEAD(&iocg->walk_list);
	INIT_LIST_HEAD(&iocg->surplus_list);
	iocg->hweight_active = WEIGHT_ONE;
	iocg->hweight_inuse = WEIGHT_ONE;

	init_waitqueue_head(&iocg->waitq);
	hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	iocg->waitq_timer.function = iocg_waitq_timer_fn;

	iocg->level = blkg->blkcg->css.cgroup->level;

	/* cache the ancestor chain for fast hierarchical weight walks */
	for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
		struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
		iocg->ancestors[tiocg->level] = tiocg;
	}

	spin_lock_irqsave(&ioc->lock, flags);
	weight_updated(iocg, &now);
	spin_unlock_irqrestore(&ioc->lock, flags);
}

static void ioc_pd_free(struct blkg_policy_data *pd)
{
	struct ioc_gq *iocg = pd_to_iocg(pd);
	struct ioc *ioc = iocg->ioc;
	unsigned long flags;

	if (ioc) {
		spin_lock_irqsave(&ioc->lock, flags);

		if (!list_empty(&iocg->active_list)) {
			struct ioc_now now;

			ioc_now(ioc, &now);
			propagate_weights(iocg, 0, 0, false, &now);
			list_del_init(&iocg->active_list);
		}

		WARN_ON_ONCE(!list_empty(&iocg->walk_list));
		WARN_ON_ONCE(!list_empty(&iocg->surplus_list));

		spin_unlock_irqrestore(&ioc->lock, flags);

		hrtimer_cancel(&iocg->waitq_timer);
	}
	free_percpu(iocg->pcpu_stat);
	kfree(iocg);
}

static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
{
	struct ioc_gq *iocg = pd_to_iocg(pd);
	struct ioc *ioc = iocg->ioc;

	if (!ioc->enabled)
		return;

	/* print vrate as a percentage with two decimals on the root line */
	if (iocg->level == 0) {
		unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
			ioc->vtime_base_rate * 10000,
			VTIME_PER_USEC);
		seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100);
	}

	seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us);

	if (blkcg_debug_stats)
		seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
			iocg->last_stat.wait_us,
			iocg->last_stat.indebt_us,
			iocg->last_stat.indelay_us);
}
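
/*
 * Illustrative sketch of the extra fields this adds to an io.stat line
 * (numbers invented, not measured); cost.vrate only appears on the root
 * cgroup's line and the cost.wait/indebt/indelay triple only with
 * blkcg_debug_stats set:
 *
 *   8:16 rbytes=... wbytes=... cost.vrate=137.50 cost.usage=4321
 */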

static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
			     int off)
{
	const char *dname = blkg_dev_name(pd->blkg);
	struct ioc_gq *iocg = pd_to_iocg(pd);

	if (dname && iocg->cfg_weight)
		seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
	return 0;
}

static int ioc_weight_show(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);

	seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
	blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
	return 0;
}

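/*
 * Example io.weight usage (device numbers illustrative).  A bare or
 * "default"-prefixed value sets this cgroup's default weight; "MAJ:MIN
 * <weight>" overrides it for one device and "MAJ:MIN default" clears the
 * override:
 *
 *   echo 200 > io.weight
 *   echo "8:16 50" > io.weight
 *   echo "8:16 default" > io.weight
 */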
static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
				size_t nbytes, loff_t off)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
	struct blkg_conf_ctx ctx;
	struct ioc_now now;
	struct ioc_gq *iocg;
	u32 v;
	int ret;

	/* no device specified - update the cgroup-wide default weight */
	if (!strchr(buf, ':')) {
		struct blkcg_gq *blkg;

		if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
			return -EINVAL;

		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
			return -EINVAL;

		spin_lock_irq(&blkcg->lock);
		iocc->dfl_weight = v * WEIGHT_ONE;
		hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
			struct ioc_gq *iocg = blkg_to_iocg(blkg);

			if (iocg) {
				spin_lock(&iocg->ioc->lock);
				ioc_now(iocg->ioc, &now);
				weight_updated(iocg, &now);
				spin_unlock(&iocg->ioc->lock);
			}
		}
		spin_unlock_irq(&blkcg->lock);

		return nbytes;
	}

	blkg_conf_init(&ctx, buf);

	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx);
	if (ret)
		goto err;

	iocg = blkg_to_iocg(ctx.blkg);

	if (!strncmp(ctx.body, "default", 7)) {
		v = 0;	/* clear the per-device override */
	} else {
		if (!sscanf(ctx.body, "%u", &v))
			goto einval;
		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
			goto einval;
	}

	spin_lock(&iocg->ioc->lock);
	iocg->cfg_weight = v * WEIGHT_ONE;
	ioc_now(iocg->ioc, &now);
	weight_updated(iocg, &now);
	spin_unlock(&iocg->ioc->lock);

	blkg_conf_exit(&ctx);
	return nbytes;

einval:
	ret = -EINVAL;
err:
	blkg_conf_exit(&ctx);
	return ret;
}

static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
			  int off)
{
	const char *dname = blkg_dev_name(pd->blkg);
	struct ioc *ioc = pd_to_iocg(pd)->ioc;

	if (!dname)
		return 0;

	spin_lock_irq(&ioc->lock);
	seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
		   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
		   ioc->params.qos[QOS_RPPM] / 10000,
		   ioc->params.qos[QOS_RPPM] % 10000 / 100,
		   ioc->params.qos[QOS_RLAT],
		   ioc->params.qos[QOS_WPPM] / 10000,
		   ioc->params.qos[QOS_WPPM] % 10000 / 100,
		   ioc->params.qos[QOS_WLAT],
		   ioc->params.qos[QOS_MIN] / 10000,
		   ioc->params.qos[QOS_MIN] % 10000 / 100,
		   ioc->params.qos[QOS_MAX] / 10000,
		   ioc->params.qos[QOS_MAX] % 10000 / 100);
	spin_unlock_irq(&ioc->lock);
	return 0;
}

static int ioc_qos_show(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));

	blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
	return 0;
}

static const match_table_t qos_ctrl_tokens = {
	{ QOS_ENABLE,		"enable=%u"	},
	{ QOS_CTRL,		"ctrl=%s"	},
	{ NR_QOS_CTRL_PARAMS,	NULL		},
};

static const match_table_t qos_tokens = {
	{ QOS_RPPM,		"rpct=%s"	},
	{ QOS_RLAT,		"rlat=%u"	},
	{ QOS_WPPM,		"wpct=%s"	},
	{ QOS_WLAT,		"wlat=%u"	},
	{ QOS_MIN,		"min=%s"	},
	{ QOS_MAX,		"max=%s"	},
	{ NR_QOS_PARAMS,	NULL		},
};
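
/*
 * Example io.cost.qos write (values illustrative): enable the controller
 * and request that 95% of reads complete within 10ms and 95% of writes
 * within 20ms, with vrate allowed to float between 50% and 150%:
 *
 *   echo "8:16 enable=1 rpct=95.00 rlat=10000 wpct=95.00 wlat=20000 min=50.00 max=150.00" > io.cost.qos
 */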
static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
			     size_t nbytes, loff_t off)
{
	struct blkg_conf_ctx ctx;
	struct gendisk *disk;
	struct ioc *ioc;
	u32 qos[NR_QOS_PARAMS];
	bool enable, user;
	char *body, *p;
	int ret;

	blkg_conf_init(&ctx, input);

	ret = blkg_conf_open_bdev(&ctx);
	if (ret)
		goto err;

	body = ctx.body;
	disk = ctx.bdev->bd_disk;
	if (!queue_is_mq(disk->queue)) {
		ret = -EOPNOTSUPP;
		goto err;
	}

	ioc = q_to_ioc(disk->queue);
	if (!ioc) {
		ret = blk_iocost_init(disk);
		if (ret)
			goto err;
		ioc = q_to_ioc(disk->queue);
	}

	/*
	 * Freeze and quiesce the queue so that enablement and parameter
	 * changes don't race against in-flight IO.
	 */
	blk_mq_freeze_queue(disk->queue);
	blk_mq_quiesce_queue(disk->queue);

	spin_lock_irq(&ioc->lock);
	memcpy(qos, ioc->params.qos, sizeof(qos));
	enable = ioc->enabled;
	user = ioc->user_qos_params;

	while ((p = strsep(&body, " \t\n"))) {
		substring_t args[MAX_OPT_ARGS];
		char buf[32];
		int tok;
		s64 v;

		if (!*p)
			continue;

		switch (match_token(p, qos_ctrl_tokens, args)) {
		case QOS_ENABLE:
			if (match_u64(&args[0], &v))
				goto einval;
			enable = v;
			continue;
		case QOS_CTRL:
			match_strlcpy(buf, &args[0], sizeof(buf));
			if (!strcmp(buf, "auto"))
				user = false;
			else if (!strcmp(buf, "user"))
				user = true;
			else
				goto einval;
			continue;
		}

		tok = match_token(p, qos_tokens, args);
		switch (tok) {
		case QOS_RPPM:
		case QOS_WPPM:
			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
			    sizeof(buf))
				goto einval;
			if (cgroup_parse_float(buf, 2, &v))
				goto einval;
			if (v < 0 || v > 10000)
				goto einval;
			qos[tok] = v * 100;
			break;
		case QOS_RLAT:
		case QOS_WLAT:
			if (match_u64(&args[0], &v))
				goto einval;
			qos[tok] = v;
			break;
		case QOS_MIN:
		case QOS_MAX:
			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
			    sizeof(buf))
				goto einval;
			if (cgroup_parse_float(buf, 2, &v))
				goto einval;
			if (v < 0)
				goto einval;
			qos[tok] = clamp_t(s64, v * 100,
					   VRATE_MIN_PPM, VRATE_MAX_PPM);
			break;
		default:
			goto einval;
		}
		user = true;
	}

	if (qos[QOS_MIN] > qos[QOS_MAX])
		goto einval;

	/* iocost needs rq alloc timestamps - toggle them with enablement */
	if (enable && !ioc->enabled) {
		blk_stat_enable_accounting(disk->queue);
		blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
		ioc->enabled = true;
	} else if (!enable && ioc->enabled) {
		blk_stat_disable_accounting(disk->queue);
		blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
		ioc->enabled = false;
	}

	if (user) {
		memcpy(ioc->params.qos, qos, sizeof(qos));
		ioc->user_qos_params = true;
	} else {
		ioc->user_qos_params = false;
	}

	ioc_refresh_params(ioc, true);
	spin_unlock_irq(&ioc->lock);

	/* iocost takes over latency control - don't run wbt alongside it */
	if (enable)
		wbt_disable_default(disk);
	else
		wbt_enable_default(disk);

	blk_mq_unquiesce_queue(disk->queue);
	blk_mq_unfreeze_queue(disk->queue);

	blkg_conf_exit(&ctx);
	return nbytes;
einval:
	spin_unlock_irq(&ioc->lock);

	blk_mq_unquiesce_queue(disk->queue);
	blk_mq_unfreeze_queue(disk->queue);

	ret = -EINVAL;
err:
	blkg_conf_exit(&ctx);
	return ret;
}

static u64 ioc_cost_model_prfill(struct seq_file *sf,
				 struct blkg_policy_data *pd, int off)
{
	const char *dname = blkg_dev_name(pd->blkg);
	struct ioc *ioc = pd_to_iocg(pd)->ioc;
	u64 *u = ioc->params.i_lcoefs;

	if (!dname)
		return 0;

	spin_lock_irq(&ioc->lock);
	seq_printf(sf, "%s ctrl=%s model=linear "
		   "rbps=%llu rseqiops=%llu rrandiops=%llu "
		   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
		   dname, ioc->user_cost_model ? "user" : "auto",
		   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
		   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
	spin_unlock_irq(&ioc->lock);
	return 0;
}

static int ioc_cost_model_show(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));

	blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
	return 0;
}

static const match_table_t cost_ctrl_tokens = {
	{ COST_CTRL,		"ctrl=%s"	},
	{ COST_MODEL,		"model=%s"	},
	{ NR_COST_CTRL_PARAMS,	NULL		},
};

static const match_table_t i_lcoef_tokens = {
	{ I_LCOEF_RBPS,		"rbps=%u"	},
	{ I_LCOEF_RSEQIOPS,	"rseqiops=%u"	},
	{ I_LCOEF_RRANDIOPS,	"rrandiops=%u"	},
	{ I_LCOEF_WBPS,		"wbps=%u"	},
	{ I_LCOEF_WSEQIOPS,	"wseqiops=%u"	},
	{ I_LCOEF_WRANDIOPS,	"wrandiops=%u"	},
	{ NR_I_LCOEFS,		NULL		},
};
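
/*
 * Example io.cost.model write (coefficients illustrative, not from a real
 * device); supplying any coefficient switches ctrl to "user":
 *
 *   echo "8:16 rbps=2000000000 rseqiops=200000 rrandiops=50000 wbps=1000000000 wseqiops=100000 wrandiops=25000" > io.cost.model
 */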
static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
				    size_t nbytes, loff_t off)
{
	struct blkg_conf_ctx ctx;
	struct request_queue *q;
	struct ioc *ioc;
	u64 u[NR_I_LCOEFS];
	bool user;
	char *body, *p;
	int ret;

	blkg_conf_init(&ctx, input);

	ret = blkg_conf_open_bdev(&ctx);
	if (ret)
		goto err;

	body = ctx.body;
	q = bdev_get_queue(ctx.bdev);
	if (!queue_is_mq(q)) {
		ret = -EOPNOTSUPP;
		goto err;
	}

	ioc = q_to_ioc(q);
	if (!ioc) {
		ret = blk_iocost_init(ctx.bdev->bd_disk);
		if (ret)
			goto err;
		ioc = q_to_ioc(q);
	}

	blk_mq_freeze_queue(q);
	blk_mq_quiesce_queue(q);

	spin_lock_irq(&ioc->lock);
	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
	user = ioc->user_cost_model;

	while ((p = strsep(&body, " \t\n"))) {
		substring_t args[MAX_OPT_ARGS];
		char buf[32];
		int tok;
		u64 v;

		if (!*p)
			continue;

		switch (match_token(p, cost_ctrl_tokens, args)) {
		case COST_CTRL:
			match_strlcpy(buf, &args[0], sizeof(buf));
			if (!strcmp(buf, "auto"))
				user = false;
			else if (!strcmp(buf, "user"))
				user = true;
			else
				goto einval;
			continue;
		case COST_MODEL:
			match_strlcpy(buf, &args[0], sizeof(buf));
			if (strcmp(buf, "linear"))
				goto einval;
			continue;
		}

		tok = match_token(p, i_lcoef_tokens, args);
		if (tok == NR_I_LCOEFS)
			goto einval;
		if (match_u64(&args[0], &v))
			goto einval;
		u[tok] = v;
		user = true;
	}

	if (user) {
		memcpy(ioc->params.i_lcoefs, u, sizeof(u));
		ioc->user_cost_model = true;
	} else {
		ioc->user_cost_model = false;
	}
	ioc_refresh_params(ioc, true);
	spin_unlock_irq(&ioc->lock);

	blk_mq_unquiesce_queue(q);
	blk_mq_unfreeze_queue(q);

	blkg_conf_exit(&ctx);
	return nbytes;

einval:
	spin_unlock_irq(&ioc->lock);

	blk_mq_unquiesce_queue(q);
	blk_mq_unfreeze_queue(q);

	ret = -EINVAL;
err:
	blkg_conf_exit(&ctx);
	return ret;
}

static struct cftype ioc_files[] = {
	{
		.name = "weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = ioc_weight_show,
		.write = ioc_weight_write,
	},
	{
		.name = "cost.qos",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = ioc_qos_show,
		.write = ioc_qos_write,
	},
	{
		.name = "cost.model",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = ioc_cost_model_show,
		.write = ioc_cost_model_write,
	},
	{}
};

static struct blkcg_policy blkcg_policy_iocost = {
	.dfl_cftypes	= ioc_files,
	.cpd_alloc_fn	= ioc_cpd_alloc,
	.cpd_free_fn	= ioc_cpd_free,
	.pd_alloc_fn	= ioc_pd_alloc,
	.pd_init_fn	= ioc_pd_init,
	.pd_free_fn	= ioc_pd_free,
	.pd_stat_fn	= ioc_pd_stat,
};

static int __init ioc_init(void)
{
	return blkcg_policy_register(&blkcg_policy_iocost);
}

static void __exit ioc_exit(void)
{
	blkcg_policy_unregister(&blkcg_policy_iocost);
}

module_init(ioc_init);
module_exit(ioc_exit);