// SPDX-License-Identifier: GPL-2.0
/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 */
#include "sched.h"

/*
 * Targeted preemption latency for CPU-bound tasks:
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 *
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_latency = 6000000ULL;
static unsigned int normalized_sysctl_sched_latency = 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable
 *
 * Options are:
 *
 *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
 *   SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
 *   SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
 *
 * (default SCHED_TUNABLESCALING_LOG)
 */
unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 *
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

/*
 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity,
 * see sched_update_scaling().
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 *
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

/* A recently-run task is considered cache-hot for this long (default: 0.5 msec) */
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

/*
 * Shift applied when decaying thermal pressure; settable on the kernel
 * command line via sched_thermal_decay_shift= and clamped to [0, 10].
 */
int sched_thermal_decay_shift;
static int __init setup_sched_thermal_decay_shift(char *str)
{
	int _shift = 0;

	if (kstrtoint(str, 0, &_shift))
		pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");

	sched_thermal_decay_shift = clamp(_shift, 0, 10);
	return 1;
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);

#ifdef CONFIG_SMP
/*
 * For asym packing, by default the lower numbered CPU has higher priority.
 */
int __weak arch_asym_cpu_priority(int cpu)
{
	return -cpu;
}

/*
 * The margin used when comparing utilization with CPU capacity:
 * util * 1280 < capacity * 1024, i.e. "fits" only with ~20% headroom.
 */
#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)

/*
 * The margin used when comparing CPU capacities:
 * is 'cap1' noticeably greater than 'cap2'?
 *
 * (default: ~5%)
 */
#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
#endif

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from the global (tg) bandwidth pool to a
 * local (per-CPU) pool on each refill.
 *
 * (default: 5 msec, units: microseconds)
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
138
139static inline void update_load_add(struct load_weight *lw, unsigned long inc)
140{
141 lw->weight += inc;
142 lw->inv_weight = 0;
143}
144
145static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
146{
147 lw->weight -= dec;
148 lw->inv_weight = 0;
149}
150
151static inline void update_load_set(struct load_weight *lw, unsigned long w)
152{
153 lw->weight = w;
154 lw->inv_weight = 0;
155}

/*
 * Increase the granularity values when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs (capped at 8), as selected by
 * sysctl_sched_tunable_scaling.
 */
166static unsigned int get_update_sysctl_factor(void)
167{
168 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
169 unsigned int factor;
170
171 switch (sysctl_sched_tunable_scaling) {
172 case SCHED_TUNABLESCALING_NONE:
173 factor = 1;
174 break;
175 case SCHED_TUNABLESCALING_LINEAR:
176 factor = cpus;
177 break;
178 case SCHED_TUNABLESCALING_LOG:
179 default:
180 factor = 1 + ilog2(cpus);
181 break;
182 }
183
184 return factor;
185}
186
187static void update_sysctl(void)
188{
189 unsigned int factor = get_update_sysctl_factor();
190
191#define SET_SYSCTL(name) \
192 (sysctl_##name = (factor) * normalized_sysctl_##name)
193 SET_SYSCTL(sched_min_granularity);
194 SET_SYSCTL(sched_latency);
195 SET_SYSCTL(sched_wakeup_granularity);
196#undef SET_SYSCTL
197}
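
/*
 * Example of the scaling above: with SCHED_TUNABLESCALING_LOG and 8 or
 * more online CPUs, factor = 1 + ilog2(8) = 4, so the effective values
 * become sched_latency = 24ms, sched_min_granularity = 3ms and
 * sched_wakeup_granularity = 4ms.
 */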
198
199void __init sched_init_granularity(void)
200{
201 update_sysctl();
202}
203
204#define WMULT_CONST (~0U)
205#define WMULT_SHIFT 32
206
207static void __update_inv_weight(struct load_weight *lw)
208{
209 unsigned long w;
210
211 if (likely(lw->inv_weight))
212 return;
213
214 w = scale_load_down(lw->weight);
215
216 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
217 lw->inv_weight = 1;
218 else if (unlikely(!w))
219 lw->inv_weight = WMULT_CONST;
220 else
221 lw->inv_weight = WMULT_CONST / w;
222}
223
224
225
226
227
228
229
230
231
232
233
234
235
236static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
237{
238 u64 fact = scale_load_down(weight);
239 u32 fact_hi = (u32)(fact >> 32);
240 int shift = WMULT_SHIFT;
241 int fs;
242
243 __update_inv_weight(lw);
244
245 if (unlikely(fact_hi)) {
246 fs = fls(fact_hi);
247 shift -= fs;
248 fact >>= fs;
249 }
250
251 fact = mul_u32_u32(fact, lw->inv_weight);
252
253 fact_hi = (u32)(fact >> 32);
254 if (fact_hi) {
255 fs = fls(fact_hi);
256 shift -= fs;
257 fact >>= fs;
258 }
259
260 return mul_u64_u32_shr(delta_exec, fact, shift);
261}
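
/*
 * In other words, __calc_delta(delta_exec, weight, lw) approximates
 * delta_exec * weight / lw->weight without a 64-bit division:
 * lw->inv_weight caches ~2^32 / lw->weight, so the result is roughly
 * (delta_exec * weight * inv_weight) >> 32, with the fls() pre-shifts
 * above keeping the intermediate product within 64 bits.
 */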
262
263
264const struct sched_class fair_sched_class;
265
266
267
268
269
270#ifdef CONFIG_FAIR_GROUP_SCHED
271
272
273#define for_each_sched_entity(se) \
274 for (; se; se = se->parent)
275
276static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
277{
278 if (!path)
279 return;
280
281 if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
282 autogroup_path(cfs_rq->tg, path, len);
283 else if (cfs_rq && cfs_rq->tg->css.cgroup)
284 cgroup_path(cfs_rq->tg->css.cgroup, path, len);
285 else
286 strlcpy(path, "(null)", len);
287}
288
289static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
290{
291 struct rq *rq = rq_of(cfs_rq);
292 int cpu = cpu_of(rq);
293
294 if (cfs_rq->on_list)
295 return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
296
297 cfs_rq->on_list = 1;
298
299
300
301
302
303
304
305
306
307
308 if (cfs_rq->tg->parent &&
309 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
310
311
312
313
314
315
316 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
317 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
318
319
320
321
322
323 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
324 return true;
325 }
326
327 if (!cfs_rq->tg->parent) {
328
329
330
331
332 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
333 &rq->leaf_cfs_rq_list);
334
335
336
337
338 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
339 return true;
340 }
341
342
343
344
345
346
347
348 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
349
350
351
352
353 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
354 return false;
355}
356
357static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
358{
359 if (cfs_rq->on_list) {
360 struct rq *rq = rq_of(cfs_rq);
361
362
363
364
365
366
367
368
369 if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
370 rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
371
372 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
373 cfs_rq->on_list = 0;
374 }
375}
376
377static inline void assert_list_leaf_cfs_rq(struct rq *rq)
378{
379 SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
380}
381
382
383#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
384 list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
385 leaf_cfs_rq_list)
386
387
388static inline struct cfs_rq *
389is_same_group(struct sched_entity *se, struct sched_entity *pse)
390{
391 if (se->cfs_rq == pse->cfs_rq)
392 return se->cfs_rq;
393
394 return NULL;
395}
396
397static inline struct sched_entity *parent_entity(struct sched_entity *se)
398{
399 return se->parent;
400}
401
402static void
403find_matching_se(struct sched_entity **se, struct sched_entity **pse)
404{
405 int se_depth, pse_depth;
406
407
408
409
410
411
412
413
414
415 se_depth = (*se)->depth;
416 pse_depth = (*pse)->depth;
417
418 while (se_depth > pse_depth) {
419 se_depth--;
420 *se = parent_entity(*se);
421 }
422
423 while (pse_depth > se_depth) {
424 pse_depth--;
425 *pse = parent_entity(*pse);
426 }
427
428 while (!is_same_group(*se, *pse)) {
429 *se = parent_entity(*se);
430 *pse = parent_entity(*pse);
431 }
432}
433
434#else
435
436#define for_each_sched_entity(se) \
437 for (; se; se = NULL)
438
439static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
440{
441 if (path)
442 strlcpy(path, "(null)", len);
443}
444
445static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
446{
447 return true;
448}
449
450static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
451{
452}
453
454static inline void assert_list_leaf_cfs_rq(struct rq *rq)
455{
456}
457
458#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
459 for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
460
461static inline struct sched_entity *parent_entity(struct sched_entity *se)
462{
463 return NULL;
464}
465
466static inline void
467find_matching_se(struct sched_entity **se, struct sched_entity **pse)
468{
469}
470
471#endif
472
473static __always_inline
474void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
475
476
477
478
479
480static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
481{
482 s64 delta = (s64)(vruntime - max_vruntime);
483 if (delta > 0)
484 max_vruntime = vruntime;
485
486 return max_vruntime;
487}
488
489static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
490{
491 s64 delta = (s64)(vruntime - min_vruntime);
492 if (delta < 0)
493 min_vruntime = vruntime;
494
495 return min_vruntime;
496}
497
498static inline bool entity_before(struct sched_entity *a,
499 struct sched_entity *b)
500{
501 return (s64)(a->vruntime - b->vruntime) < 0;
502}
503
504#define __node_2_se(node) \
505 rb_entry((node), struct sched_entity, run_node)
506
507static void update_min_vruntime(struct cfs_rq *cfs_rq)
508{
509 struct sched_entity *curr = cfs_rq->curr;
510 struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
511
512 u64 vruntime = cfs_rq->min_vruntime;
513
514 if (curr) {
515 if (curr->on_rq)
516 vruntime = curr->vruntime;
517 else
518 curr = NULL;
519 }
520
521 if (leftmost) {
522 struct sched_entity *se = __node_2_se(leftmost);
523
524 if (!curr)
525 vruntime = se->vruntime;
526 else
527 vruntime = min_vruntime(vruntime, se->vruntime);
528 }
529
530
531 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
532#ifndef CONFIG_64BIT
533 smp_wmb();
534 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
535#endif
536}
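
/*
 * Note that min_vruntime only ever moves forward: it is the max of its
 * previous value and the smallest vruntime still queued, so entities
 * placed relative to it cannot gain credit by sleeping. The
 * min_vruntime_copy/smp_wmb() pair lets 32-bit readers detect a torn
 * 64-bit update and retry.
 */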
537
538static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
539{
540 return entity_before(__node_2_se(a), __node_2_se(b));
541}
542
543
544
545
546static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
547{
548 rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
549}
550
551static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
552{
553 rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
554}
555
556struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
557{
558 struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
559
560 if (!left)
561 return NULL;
562
563 return __node_2_se(left);
564}
565
566static struct sched_entity *__pick_next_entity(struct sched_entity *se)
567{
568 struct rb_node *next = rb_next(&se->run_node);
569
570 if (!next)
571 return NULL;
572
573 return __node_2_se(next);
574}
575
576#ifdef CONFIG_SCHED_DEBUG
577struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
578{
579 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
580
581 if (!last)
582 return NULL;
583
584 return __node_2_se(last);
585}
586
587
588
589
590
591int sched_update_scaling(void)
592{
593 unsigned int factor = get_update_sysctl_factor();
594
595 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
596 sysctl_sched_min_granularity);
597
598#define WRT_SYSCTL(name) \
599 (normalized_sysctl_##name = sysctl_##name / (factor))
600 WRT_SYSCTL(sched_min_granularity);
601 WRT_SYSCTL(sched_latency);
602 WRT_SYSCTL(sched_wakeup_granularity);
603#undef WRT_SYSCTL
604
605 return 0;
606}
607#endif
608
609
610
611
612static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
613{
614 if (unlikely(se->load.weight != NICE_0_LOAD))
615 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
616
617 return delta;
618}
619

/*
 * The idea is to set a period in which each runnable task runs once.
 *
 * When there are too many tasks (sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
628static u64 __sched_period(unsigned long nr_running)
629{
630 if (unlikely(nr_running > sched_nr_latency))
631 return nr_running * sysctl_sched_min_granularity;
632 else
633 return sysctl_sched_latency;
634}
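
/*
 * With the defaults (sched_latency = 6ms, sched_min_granularity = 0.75ms,
 * sched_nr_latency = 8): up to 8 runnable tasks share a 6ms period; with
 * more than 8 the period stretches to nr_running * 0.75ms so no slice
 * shrinks below the minimum granularity.
 */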
635

/*
 * We calculate the wall-clock slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
642static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
643{
644 unsigned int nr_running = cfs_rq->nr_running;
645 u64 slice;
646
647 if (sched_feat(ALT_PERIOD))
648 nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
649
650 slice = __sched_period(nr_running + !se->on_rq);
651
652 for_each_sched_entity(se) {
653 struct load_weight *load;
654 struct load_weight lw;
655
656 cfs_rq = cfs_rq_of(se);
657 load = &cfs_rq->load;
658
659 if (unlikely(!se->on_rq)) {
660 lw = cfs_rq->load;
661
662 update_load_add(&lw, se->load.weight);
663 load = &lw;
664 }
665 slice = __calc_delta(slice, se->load.weight, load);
666 }
667
668 if (sched_feat(BASE_SLICE))
669 slice = max(slice, (u64)sysctl_sched_min_granularity);
670
671 return slice;
672}
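
/*
 * Example: two runnable nice-0 tasks split the 6ms default period into
 * 3ms wall-clock slices each; with unequal weights the split is
 * proportional to each entity's share of its cfs_rq load (and, for group
 * scheduling, of every ancestor cfs_rq's load).
 */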
673
674
675
676
677
678
679static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
680{
681 return calc_delta_fair(sched_slice(cfs_rq, se), se);
682}
683
684#include "pelt.h"
685#ifdef CONFIG_SMP
686
687static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
688static unsigned long task_h_load(struct task_struct *p);
689static unsigned long capacity_of(int cpu);
690
691
692void init_entity_runnable_average(struct sched_entity *se)
693{
694 struct sched_avg *sa = &se->avg;
695
696 memset(sa, 0, sizeof(*sa));
697
698
699
700
701
702
703
704 if (entity_is_task(se))
705 sa->load_avg = scale_load_down(se->load.weight);
706
707
708}
709
710static void attach_entity_cfs_rq(struct sched_entity *se);
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738void post_init_entity_util_avg(struct task_struct *p)
739{
740 struct sched_entity *se = &p->se;
741 struct cfs_rq *cfs_rq = cfs_rq_of(se);
742 struct sched_avg *sa = &se->avg;
743 long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
744 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
745
746 if (cap > 0) {
747 if (cfs_rq->avg.util_avg != 0) {
748 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
749 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
750
751 if (sa->util_avg > cap)
752 sa->util_avg = cap;
753 } else {
754 sa->util_avg = cap;
755 }
756 }
757
758 sa->runnable_avg = sa->util_avg;
759
760 if (p->sched_class != &fair_sched_class) {
761
762
763
764
765
766
767
768
769
770
771 se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
772 return;
773 }
774
775 attach_entity_cfs_rq(se);
776}
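
/*
 * Summary of the bootstrapping above: a new task starts with a util_avg
 * derived from the cfs_rq's current util_avg scaled by the task's weight
 * relative to the cfs_rq load, capped at half of the CPU capacity that is
 * still unused, so forks do not instantly look like full-CPU loads.
 */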
777
778#else
779void init_entity_runnable_average(struct sched_entity *se)
780{
781}
782void post_init_entity_util_avg(struct task_struct *p)
783{
784}
785static void update_tg_load_avg(struct cfs_rq *cfs_rq)
786{
787}
788#endif
789
790
791
792
793static void update_curr(struct cfs_rq *cfs_rq)
794{
795 struct sched_entity *curr = cfs_rq->curr;
796 u64 now = rq_clock_task(rq_of(cfs_rq));
797 u64 delta_exec;
798
799 if (unlikely(!curr))
800 return;
801
802 delta_exec = now - curr->exec_start;
803 if (unlikely((s64)delta_exec <= 0))
804 return;
805
806 curr->exec_start = now;
807
808 schedstat_set(curr->statistics.exec_max,
809 max(delta_exec, curr->statistics.exec_max));
810
811 curr->sum_exec_runtime += delta_exec;
812 schedstat_add(cfs_rq->exec_clock, delta_exec);
813
814 curr->vruntime += calc_delta_fair(delta_exec, curr);
815 update_min_vruntime(cfs_rq);
816
817 if (entity_is_task(curr)) {
818 struct task_struct *curtask = task_of(curr);
819
820 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
821 cgroup_account_cputime(curtask, delta_exec);
822 account_group_exec_runtime(curtask, delta_exec);
823 }
824
825 account_cfs_rq_runtime(cfs_rq, delta_exec);
826}
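
/*
 * Because the weighting is applied here, vruntime advances at
 * NICE_0_LOAD / weight of wall-clock speed: a heavier (lower nice) task
 * accumulates vruntime more slowly and therefore keeps the CPU for
 * proportionally longer before the rbtree picks someone else.
 */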
827
828static void update_curr_fair(struct rq *rq)
829{
830 update_curr(cfs_rq_of(&rq->curr->se));
831}
832
833static inline void
834update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
835{
836 u64 wait_start, prev_wait_start;
837
838 if (!schedstat_enabled())
839 return;
840
841 wait_start = rq_clock(rq_of(cfs_rq));
842 prev_wait_start = schedstat_val(se->statistics.wait_start);
843
844 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
845 likely(wait_start > prev_wait_start))
846 wait_start -= prev_wait_start;
847
848 __schedstat_set(se->statistics.wait_start, wait_start);
849}
850
851static inline void
852update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
853{
854 struct task_struct *p;
855 u64 delta;
856
857 if (!schedstat_enabled())
858 return;
859
860
861
862
863
864
865
866 if (unlikely(!schedstat_val(se->statistics.wait_start)))
867 return;
868
869 delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
870
871 if (entity_is_task(se)) {
872 p = task_of(se);
873 if (task_on_rq_migrating(p)) {
874
875
876
877
878
879 __schedstat_set(se->statistics.wait_start, delta);
880 return;
881 }
882 trace_sched_stat_wait(p, delta);
883 }
884
885 __schedstat_set(se->statistics.wait_max,
886 max(schedstat_val(se->statistics.wait_max), delta));
887 __schedstat_inc(se->statistics.wait_count);
888 __schedstat_add(se->statistics.wait_sum, delta);
889 __schedstat_set(se->statistics.wait_start, 0);
890}
891
892static inline void
893update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
894{
895 struct task_struct *tsk = NULL;
896 u64 sleep_start, block_start;
897
898 if (!schedstat_enabled())
899 return;
900
901 sleep_start = schedstat_val(se->statistics.sleep_start);
902 block_start = schedstat_val(se->statistics.block_start);
903
904 if (entity_is_task(se))
905 tsk = task_of(se);
906
907 if (sleep_start) {
908 u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
909
910 if ((s64)delta < 0)
911 delta = 0;
912
913 if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
914 __schedstat_set(se->statistics.sleep_max, delta);
915
916 __schedstat_set(se->statistics.sleep_start, 0);
917 __schedstat_add(se->statistics.sum_sleep_runtime, delta);
918
919 if (tsk) {
920 account_scheduler_latency(tsk, delta >> 10, 1);
921 trace_sched_stat_sleep(tsk, delta);
922 }
923 }
924 if (block_start) {
925 u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
926
927 if ((s64)delta < 0)
928 delta = 0;
929
930 if (unlikely(delta > schedstat_val(se->statistics.block_max)))
931 __schedstat_set(se->statistics.block_max, delta);
932
933 __schedstat_set(se->statistics.block_start, 0);
934 __schedstat_add(se->statistics.sum_sleep_runtime, delta);
935
936 if (tsk) {
937 if (tsk->in_iowait) {
938 __schedstat_add(se->statistics.iowait_sum, delta);
939 __schedstat_inc(se->statistics.iowait_count);
940 trace_sched_stat_iowait(tsk, delta);
941 }
942
943 trace_sched_stat_blocked(tsk, delta);
944
945
946
947
948
949
950 if (unlikely(prof_on == SLEEP_PROFILING)) {
951 profile_hits(SLEEP_PROFILING,
952 (void *)get_wchan(tsk),
953 delta >> 20);
954 }
955 account_scheduler_latency(tsk, delta >> 10, 0);
956 }
957 }
958}
959
960
961
962
963static inline void
964update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
965{
966 if (!schedstat_enabled())
967 return;
968
969
970
971
972
973 if (se != cfs_rq->curr)
974 update_stats_wait_start(cfs_rq, se);
975
976 if (flags & ENQUEUE_WAKEUP)
977 update_stats_enqueue_sleeper(cfs_rq, se);
978}
979
980static inline void
981update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
982{
983
984 if (!schedstat_enabled())
985 return;
986
987
988
989
990
991 if (se != cfs_rq->curr)
992 update_stats_wait_end(cfs_rq, se);
993
994 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
995 struct task_struct *tsk = task_of(se);
996 unsigned int state;
997
998
999 state = READ_ONCE(tsk->__state);
1000 if (state & TASK_INTERRUPTIBLE)
1001 __schedstat_set(se->statistics.sleep_start,
1002 rq_clock(rq_of(cfs_rq)));
1003 if (state & TASK_UNINTERRUPTIBLE)
1004 __schedstat_set(se->statistics.block_start,
1005 rq_clock(rq_of(cfs_rq)));
1006 }
1007}
1008
1009
1010
1011
1012static inline void
1013update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1014{
1015
1016
1017
1018 se->exec_start = rq_clock_task(rq_of(cfs_rq));
1019}
1020

/**************************************************
 * Scheduling class queueing methods:
 */

#ifdef CONFIG_NUMA_BALANCING
/*
 * Approximate time to scan a full NUMA task in ms. The task scan period is
 * calculated based on the tasks virtual memory size and
 * sysctl_numa_balancing_scan_size.
 */
unsigned int sysctl_numa_balancing_scan_period_min = 1000;
unsigned int sysctl_numa_balancing_scan_period_max = 60000;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
1039
1040struct numa_group {
1041 refcount_t refcount;
1042
1043 spinlock_t lock;
1044 int nr_tasks;
1045 pid_t gid;
1046 int active_nodes;
1047
1048 struct rcu_head rcu;
1049 unsigned long total_faults;
1050 unsigned long max_faults_cpu;
1051
1052
1053
1054
1055
1056 unsigned long *faults_cpu;
1057 unsigned long faults[];
1058};
1059
1060
1061
1062
1063
1064static struct numa_group *deref_task_numa_group(struct task_struct *p)
1065{
1066 return rcu_dereference_check(p->numa_group, p == current ||
1067 (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
1068}
1069
1070static struct numa_group *deref_curr_numa_group(struct task_struct *p)
1071{
1072 return rcu_dereference_protected(p->numa_group, p == current);
1073}
1074
1075static inline unsigned long group_faults_priv(struct numa_group *ng);
1076static inline unsigned long group_faults_shared(struct numa_group *ng);
1077
1078static unsigned int task_nr_scan_windows(struct task_struct *p)
1079{
1080 unsigned long rss = 0;
1081 unsigned long nr_scan_pages;
1082
1083
1084
1085
1086
1087
1088 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1089 rss = get_mm_rss(p->mm);
1090 if (!rss)
1091 rss = nr_scan_pages;
1092
1093 rss = round_up(rss, nr_scan_pages);
1094 return rss / nr_scan_pages;
1095}
1096
1097
1098#define MAX_SCAN_WINDOW 2560
1099
1100static unsigned int task_scan_min(struct task_struct *p)
1101{
1102 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1103 unsigned int scan, floor;
1104 unsigned int windows = 1;
1105
1106 if (scan_size < MAX_SCAN_WINDOW)
1107 windows = MAX_SCAN_WINDOW / scan_size;
1108 floor = 1000 / windows;
1109
1110 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1111 return max_t(unsigned int, floor, scan);
1112}
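
/*
 * Continuing the example above: MAX_SCAN_WINDOW / scan_size = 2560 / 256
 * gives 10 windows and a floor of 100ms; the 1GB task gets
 * 1000ms / 4 windows = 250ms, i.e. a minimum scan period of 250ms.
 */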
1113
1114static unsigned int task_scan_start(struct task_struct *p)
1115{
1116 unsigned long smin = task_scan_min(p);
1117 unsigned long period = smin;
1118 struct numa_group *ng;
1119
1120
1121 rcu_read_lock();
1122 ng = rcu_dereference(p->numa_group);
1123 if (ng) {
1124 unsigned long shared = group_faults_shared(ng);
1125 unsigned long private = group_faults_priv(ng);
1126
1127 period *= refcount_read(&ng->refcount);
1128 period *= shared + 1;
1129 period /= private + shared + 1;
1130 }
1131 rcu_read_unlock();
1132
1133 return max(smin, period);
1134}
1135
1136static unsigned int task_scan_max(struct task_struct *p)
1137{
1138 unsigned long smin = task_scan_min(p);
1139 unsigned long smax;
1140 struct numa_group *ng;
1141
1142
1143 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1144
1145
1146 ng = deref_curr_numa_group(p);
1147 if (ng) {
1148 unsigned long shared = group_faults_shared(ng);
1149 unsigned long private = group_faults_priv(ng);
1150 unsigned long period = smax;
1151
1152 period *= refcount_read(&ng->refcount);
1153 period *= shared + 1;
1154 period /= private + shared + 1;
1155
1156 smax = max(smax, period);
1157 }
1158
1159 return max(smin, smax);
1160}
1161
1162static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1163{
1164 rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
1165 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1166}
1167
1168static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1169{
1170 rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
1171 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1172}
1173

/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2

/* Memory and CPU locality */
#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)

/* Averaged statistics, and temporary buffers. */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1182
1183pid_t task_numa_group_id(struct task_struct *p)
1184{
1185 struct numa_group *ng;
1186 pid_t gid = 0;
1187
1188 rcu_read_lock();
1189 ng = rcu_dereference(p->numa_group);
1190 if (ng)
1191 gid = ng->gid;
1192 rcu_read_unlock();
1193
1194 return gid;
1195}
1196
1197
1198
1199
1200
1201
1202
1203static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1204{
1205 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1206}
1207
1208static inline unsigned long task_faults(struct task_struct *p, int nid)
1209{
1210 if (!p->numa_faults)
1211 return 0;
1212
1213 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1214 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1215}
1216
1217static inline unsigned long group_faults(struct task_struct *p, int nid)
1218{
1219 struct numa_group *ng = deref_task_numa_group(p);
1220
1221 if (!ng)
1222 return 0;
1223
1224 return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1225 ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1226}
1227
1228static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1229{
1230 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1231 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1232}
1233
1234static inline unsigned long group_faults_priv(struct numa_group *ng)
1235{
1236 unsigned long faults = 0;
1237 int node;
1238
1239 for_each_online_node(node) {
1240 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1241 }
1242
1243 return faults;
1244}
1245
1246static inline unsigned long group_faults_shared(struct numa_group *ng)
1247{
1248 unsigned long faults = 0;
1249 int node;
1250
1251 for_each_online_node(node) {
1252 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1253 }
1254
1255 return faults;
1256}
1257

/*
 * A node triggering more than 1/3 as many NUMA faults as the maximum is
 * considered part of a numa group's pseudo-interleaving set. Migrations
 * between these nodes are slowed down, to allow things to settle down.
 */
#define ACTIVE_NODE_FRACTION 3
1264
1265static bool numa_is_active_node(int nid, struct numa_group *ng)
1266{
1267 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1268}
1269
1270
1271static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1272 int maxdist, bool task)
1273{
1274 unsigned long score = 0;
1275 int node;
1276
1277
1278
1279
1280
1281 if (sched_numa_topology_type == NUMA_DIRECT)
1282 return 0;
1283
1284
1285
1286
1287
1288 for_each_online_node(node) {
1289 unsigned long faults;
1290 int dist = node_distance(nid, node);
1291
1292
1293
1294
1295
1296 if (dist == sched_max_numa_distance || node == nid)
1297 continue;
1298
1299
1300
1301
1302
1303
1304
1305
1306 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1307 dist >= maxdist)
1308 continue;
1309
1310
1311 if (task)
1312 faults = task_faults(p, node);
1313 else
1314 faults = group_faults(p, node);
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1325 faults *= (sched_max_numa_distance - dist);
1326 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1327 }
1328
1329 score += faults;
1330 }
1331
1332 return score;
1333}
1334
1335
1336
1337
1338
1339
1340
1341static inline unsigned long task_weight(struct task_struct *p, int nid,
1342 int dist)
1343{
1344 unsigned long faults, total_faults;
1345
1346 if (!p->numa_faults)
1347 return 0;
1348
1349 total_faults = p->total_numa_faults;
1350
1351 if (!total_faults)
1352 return 0;
1353
1354 faults = task_faults(p, nid);
1355 faults += score_nearby_nodes(p, nid, dist, true);
1356
1357 return 1000 * faults / total_faults;
1358}
1359
1360static inline unsigned long group_weight(struct task_struct *p, int nid,
1361 int dist)
1362{
1363 struct numa_group *ng = deref_task_numa_group(p);
1364 unsigned long faults, total_faults;
1365
1366 if (!ng)
1367 return 0;
1368
1369 total_faults = ng->total_faults;
1370
1371 if (!total_faults)
1372 return 0;
1373
1374 faults = group_faults(p, nid);
1375 faults += score_nearby_nodes(p, nid, dist, false);
1376
1377 return 1000 * faults / total_faults;
1378}
1379
1380bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1381 int src_nid, int dst_cpu)
1382{
1383 struct numa_group *ng = deref_curr_numa_group(p);
1384 int dst_nid = cpu_to_node(dst_cpu);
1385 int last_cpupid, this_cpupid;
1386
1387 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1388 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1389
1390
1391
1392
1393
1394
1395
1396 if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1397 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1398 return true;
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417 if (!cpupid_pid_unset(last_cpupid) &&
1418 cpupid_to_nid(last_cpupid) != dst_nid)
1419 return false;
1420
1421
1422 if (cpupid_match_pid(p, last_cpupid))
1423 return true;
1424
1425
1426 if (!ng)
1427 return true;
1428
1429
1430
1431
1432
1433 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1434 ACTIVE_NODE_FRACTION)
1435 return true;
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1446 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1447}
1448
1449
1450
1451
1452enum numa_type {
1453
1454 node_has_spare = 0,
1455
1456
1457
1458
1459 node_fully_busy,
1460
1461
1462
1463
1464 node_overloaded
1465};
1466
1467
1468struct numa_stats {
1469 unsigned long load;
1470 unsigned long runnable;
1471 unsigned long util;
1472
1473 unsigned long compute_capacity;
1474 unsigned int nr_running;
1475 unsigned int weight;
1476 enum numa_type node_type;
1477 int idle_cpu;
1478};
1479
static inline bool is_core_idle(int cpu)
{
#ifdef CONFIG_SCHED_SMT
	int sibling;

	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
		if (cpu == sibling)
			continue;

		/* The core is idle only if every SMT sibling is idle. */
		if (!idle_cpu(sibling))
			return false;
	}
#endif

	return true;
}
1496
1497struct task_numa_env {
1498 struct task_struct *p;
1499
1500 int src_cpu, src_nid;
1501 int dst_cpu, dst_nid;
1502
1503 struct numa_stats src_stats, dst_stats;
1504
1505 int imbalance_pct;
1506 int dist;
1507
1508 struct task_struct *best_task;
1509 long best_imp;
1510 int best_cpu;
1511};
1512
1513static unsigned long cpu_load(struct rq *rq);
1514static unsigned long cpu_runnable(struct rq *rq);
1515static unsigned long cpu_util(int cpu);
1516static inline long adjust_numa_imbalance(int imbalance,
1517 int dst_running, int dst_weight);
1518
1519static inline enum
1520numa_type numa_classify(unsigned int imbalance_pct,
1521 struct numa_stats *ns)
1522{
1523 if ((ns->nr_running > ns->weight) &&
1524 (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
1525 ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
1526 return node_overloaded;
1527
1528 if ((ns->nr_running < ns->weight) ||
1529 (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
1530 ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
1531 return node_has_spare;
1532
1533 return node_fully_busy;
1534}
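
/*
 * In short: a node is "overloaded" when it runs more tasks than it has
 * CPUs and either utilization or runnable pressure exceeds its capacity
 * scaled by imbalance_pct; it "has spare" capacity when it runs fewer
 * tasks than CPUs or both pressure checks pass with room left over;
 * anything in between counts as fully busy.
 */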
1535
1536#ifdef CONFIG_SCHED_SMT
1537
1538static inline bool test_idle_cores(int cpu, bool def);
1539static inline int numa_idle_core(int idle_core, int cpu)
1540{
1541 if (!static_branch_likely(&sched_smt_present) ||
1542 idle_core >= 0 || !test_idle_cores(cpu, false))
1543 return idle_core;
1544
1545
1546
1547
1548
1549 if (is_core_idle(cpu))
1550 idle_core = cpu;
1551
1552 return idle_core;
1553}
1554#else
1555static inline int numa_idle_core(int idle_core, int cpu)
1556{
1557 return idle_core;
1558}
1559#endif
1560
1561
1562
1563
1564
1565
1566
1567static void update_numa_stats(struct task_numa_env *env,
1568 struct numa_stats *ns, int nid,
1569 bool find_idle)
1570{
1571 int cpu, idle_core = -1;
1572
1573 memset(ns, 0, sizeof(*ns));
1574 ns->idle_cpu = -1;
1575
1576 rcu_read_lock();
1577 for_each_cpu(cpu, cpumask_of_node(nid)) {
1578 struct rq *rq = cpu_rq(cpu);
1579
1580 ns->load += cpu_load(rq);
1581 ns->runnable += cpu_runnable(rq);
1582 ns->util += cpu_util(cpu);
1583 ns->nr_running += rq->cfs.h_nr_running;
1584 ns->compute_capacity += capacity_of(cpu);
1585
1586 if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
1587 if (READ_ONCE(rq->numa_migrate_on) ||
1588 !cpumask_test_cpu(cpu, env->p->cpus_ptr))
1589 continue;
1590
1591 if (ns->idle_cpu == -1)
1592 ns->idle_cpu = cpu;
1593
1594 idle_core = numa_idle_core(idle_core, cpu);
1595 }
1596 }
1597 rcu_read_unlock();
1598
1599 ns->weight = cpumask_weight(cpumask_of_node(nid));
1600
1601 ns->node_type = numa_classify(env->imbalance_pct, ns);
1602
1603 if (idle_core >= 0)
1604 ns->idle_cpu = idle_core;
1605}
1606
1607static void task_numa_assign(struct task_numa_env *env,
1608 struct task_struct *p, long imp)
1609{
1610 struct rq *rq = cpu_rq(env->dst_cpu);
1611
1612
1613 if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
1614 int cpu;
1615 int start = env->dst_cpu;
1616
1617
1618 for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
1619 if (cpu == env->best_cpu || !idle_cpu(cpu) ||
1620 !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
1621 continue;
1622 }
1623
1624 env->dst_cpu = cpu;
1625 rq = cpu_rq(env->dst_cpu);
1626 if (!xchg(&rq->numa_migrate_on, 1))
1627 goto assign;
1628 }
1629
1630
1631 return;
1632 }
1633
1634assign:
1635
1636
1637
1638
1639 if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
1640 rq = cpu_rq(env->best_cpu);
1641 WRITE_ONCE(rq->numa_migrate_on, 0);
1642 }
1643
1644 if (env->best_task)
1645 put_task_struct(env->best_task);
1646 if (p)
1647 get_task_struct(p);
1648
1649 env->best_task = p;
1650 env->best_imp = imp;
1651 env->best_cpu = env->dst_cpu;
1652}
1653
1654static bool load_too_imbalanced(long src_load, long dst_load,
1655 struct task_numa_env *env)
1656{
1657 long imb, old_imb;
1658 long orig_src_load, orig_dst_load;
1659 long src_capacity, dst_capacity;
1660
1661
1662
1663
1664
1665
1666
1667
1668 src_capacity = env->src_stats.compute_capacity;
1669 dst_capacity = env->dst_stats.compute_capacity;
1670
1671 imb = abs(dst_load * src_capacity - src_load * dst_capacity);
1672
1673 orig_src_load = env->src_stats.load;
1674 orig_dst_load = env->dst_stats.load;
1675
1676 old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
1677
1678
1679 return (imb > old_imb);
1680}
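
/*
 * i.e. compare the capacity-normalized imbalance after the proposed
 * move/swap against the pre-existing imbalance; the migration is only
 * vetoed when it would make the imbalance between the two nodes worse.
 */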
1681
1682
1683
1684
1685
1686
1687#define SMALLIMP 30
1688
1689
1690
1691
1692
1693
1694
1695static bool task_numa_compare(struct task_numa_env *env,
1696 long taskimp, long groupimp, bool maymove)
1697{
1698 struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
1699 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1700 long imp = p_ng ? groupimp : taskimp;
1701 struct task_struct *cur;
1702 long src_load, dst_load;
1703 int dist = env->dist;
1704 long moveimp = imp;
1705 long load;
1706 bool stopsearch = false;
1707
1708 if (READ_ONCE(dst_rq->numa_migrate_on))
1709 return false;
1710
1711 rcu_read_lock();
1712 cur = rcu_dereference(dst_rq->curr);
1713 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
1714 cur = NULL;
1715
1716
1717
1718
1719
1720 if (cur == env->p) {
1721 stopsearch = true;
1722 goto unlock;
1723 }
1724
1725 if (!cur) {
1726 if (maymove && moveimp >= env->best_imp)
1727 goto assign;
1728 else
1729 goto unlock;
1730 }
1731
1732
1733 if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1734 goto unlock;
1735
1736
1737
1738
1739
1740 if (env->best_task &&
1741 env->best_task->numa_preferred_nid == env->src_nid &&
1742 cur->numa_preferred_nid != env->src_nid) {
1743 goto unlock;
1744 }
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756 cur_ng = rcu_dereference(cur->numa_group);
1757 if (cur_ng == p_ng) {
1758 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1759 task_weight(cur, env->dst_nid, dist);
1760
1761
1762
1763
1764 if (cur_ng)
1765 imp -= imp / 16;
1766 } else {
1767
1768
1769
1770
1771 if (cur_ng && p_ng)
1772 imp += group_weight(cur, env->src_nid, dist) -
1773 group_weight(cur, env->dst_nid, dist);
1774 else
1775 imp += task_weight(cur, env->src_nid, dist) -
1776 task_weight(cur, env->dst_nid, dist);
1777 }
1778
1779
1780 if (cur->numa_preferred_nid == env->dst_nid)
1781 imp -= imp / 16;
1782
1783
1784
1785
1786
1787
1788
1789 if (cur->numa_preferred_nid == env->src_nid)
1790 imp += imp / 8;
1791
1792 if (maymove && moveimp > imp && moveimp > env->best_imp) {
1793 imp = moveimp;
1794 cur = NULL;
1795 goto assign;
1796 }
1797
1798
1799
1800
1801
1802 if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
1803 env->best_task->numa_preferred_nid != env->src_nid) {
1804 goto assign;
1805 }
1806
1807
1808
1809
1810
1811
1812
1813 if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
1814 goto unlock;
1815
1816
1817
1818
1819 load = task_h_load(env->p) - task_h_load(cur);
1820 if (!load)
1821 goto assign;
1822
1823 dst_load = env->dst_stats.load + load;
1824 src_load = env->src_stats.load - load;
1825
1826 if (load_too_imbalanced(src_load, dst_load, env))
1827 goto unlock;
1828
1829assign:
1830
1831 if (!cur) {
1832 int cpu = env->dst_stats.idle_cpu;
1833
1834
1835 if (cpu < 0)
1836 cpu = env->dst_cpu;
1837
1838
1839
1840
1841
1842 if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
1843 idle_cpu(env->best_cpu)) {
1844 cpu = env->best_cpu;
1845 }
1846
1847 env->dst_cpu = cpu;
1848 }
1849
1850 task_numa_assign(env, cur, imp);
1851
1852
1853
1854
1855
1856
1857 if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
1858 stopsearch = true;
1859
1860
1861
1862
1863
1864 if (!maymove && env->best_task &&
1865 env->best_task->numa_preferred_nid == env->src_nid) {
1866 stopsearch = true;
1867 }
1868unlock:
1869 rcu_read_unlock();
1870
1871 return stopsearch;
1872}
1873
1874static void task_numa_find_cpu(struct task_numa_env *env,
1875 long taskimp, long groupimp)
1876{
1877 bool maymove = false;
1878 int cpu;
1879
1880
1881
1882
1883
1884 if (env->dst_stats.node_type == node_has_spare) {
1885 unsigned int imbalance;
1886 int src_running, dst_running;
1887
1888
1889
1890
1891
1892
1893
1894 src_running = env->src_stats.nr_running - 1;
1895 dst_running = env->dst_stats.nr_running + 1;
1896 imbalance = max(0, dst_running - src_running);
1897 imbalance = adjust_numa_imbalance(imbalance, dst_running,
1898 env->dst_stats.weight);
1899
1900
1901 if (!imbalance) {
1902 maymove = true;
1903 if (env->dst_stats.idle_cpu >= 0) {
1904 env->dst_cpu = env->dst_stats.idle_cpu;
1905 task_numa_assign(env, NULL, 0);
1906 return;
1907 }
1908 }
1909 } else {
1910 long src_load, dst_load, load;
1911
1912
1913
1914
1915 load = task_h_load(env->p);
1916 dst_load = env->dst_stats.load + load;
1917 src_load = env->src_stats.load - load;
1918 maymove = !load_too_imbalanced(src_load, dst_load, env);
1919 }
1920
1921 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1922
1923 if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
1924 continue;
1925
1926 env->dst_cpu = cpu;
1927 if (task_numa_compare(env, taskimp, groupimp, maymove))
1928 break;
1929 }
1930}
1931
1932static int task_numa_migrate(struct task_struct *p)
1933{
1934 struct task_numa_env env = {
1935 .p = p,
1936
1937 .src_cpu = task_cpu(p),
1938 .src_nid = task_node(p),
1939
1940 .imbalance_pct = 112,
1941
1942 .best_task = NULL,
1943 .best_imp = 0,
1944 .best_cpu = -1,
1945 };
1946 unsigned long taskweight, groupweight;
1947 struct sched_domain *sd;
1948 long taskimp, groupimp;
1949 struct numa_group *ng;
1950 struct rq *best_rq;
1951 int nid, ret, dist;
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961 rcu_read_lock();
1962 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1963 if (sd)
1964 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1965 rcu_read_unlock();
1966
1967
1968
1969
1970
1971
1972
1973 if (unlikely(!sd)) {
1974 sched_setnuma(p, task_node(p));
1975 return -EINVAL;
1976 }
1977
1978 env.dst_nid = p->numa_preferred_nid;
1979 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1980 taskweight = task_weight(p, env.src_nid, dist);
1981 groupweight = group_weight(p, env.src_nid, dist);
1982 update_numa_stats(&env, &env.src_stats, env.src_nid, false);
1983 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1984 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1985 update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
1986
1987
1988 task_numa_find_cpu(&env, taskimp, groupimp);
1989
1990
1991
1992
1993
1994
1995
1996
1997 ng = deref_curr_numa_group(p);
1998 if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
1999 for_each_online_node(nid) {
2000 if (nid == env.src_nid || nid == p->numa_preferred_nid)
2001 continue;
2002
2003 dist = node_distance(env.src_nid, env.dst_nid);
2004 if (sched_numa_topology_type == NUMA_BACKPLANE &&
2005 dist != env.dist) {
2006 taskweight = task_weight(p, env.src_nid, dist);
2007 groupweight = group_weight(p, env.src_nid, dist);
2008 }
2009
2010
2011 taskimp = task_weight(p, nid, dist) - taskweight;
2012 groupimp = group_weight(p, nid, dist) - groupweight;
2013 if (taskimp < 0 && groupimp < 0)
2014 continue;
2015
2016 env.dist = dist;
2017 env.dst_nid = nid;
2018 update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2019 task_numa_find_cpu(&env, taskimp, groupimp);
2020 }
2021 }
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031 if (ng) {
2032 if (env.best_cpu == -1)
2033 nid = env.src_nid;
2034 else
2035 nid = cpu_to_node(env.best_cpu);
2036
2037 if (nid != p->numa_preferred_nid)
2038 sched_setnuma(p, nid);
2039 }
2040
2041
2042 if (env.best_cpu == -1) {
2043 trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
2044 return -EAGAIN;
2045 }
2046
2047 best_rq = cpu_rq(env.best_cpu);
2048 if (env.best_task == NULL) {
2049 ret = migrate_task_to(p, env.best_cpu);
2050 WRITE_ONCE(best_rq->numa_migrate_on, 0);
2051 if (ret != 0)
2052 trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
2053 return ret;
2054 }
2055
2056 ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
2057 WRITE_ONCE(best_rq->numa_migrate_on, 0);
2058
2059 if (ret != 0)
2060 trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
2061 put_task_struct(env.best_task);
2062 return ret;
2063}
2064
2065
2066static void numa_migrate_preferred(struct task_struct *p)
2067{
2068 unsigned long interval = HZ;
2069
2070
2071 if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
2072 return;
2073
2074
2075 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
2076 p->numa_migrate_retry = jiffies + interval;
2077
2078
2079 if (task_node(p) == p->numa_preferred_nid)
2080 return;
2081
2082
2083 task_numa_migrate(p);
2084}
2085
2086
2087
2088
2089
2090
2091
2092static void numa_group_count_active_nodes(struct numa_group *numa_group)
2093{
2094 unsigned long faults, max_faults = 0;
2095 int nid, active_nodes = 0;
2096
2097 for_each_online_node(nid) {
2098 faults = group_faults_cpu(numa_group, nid);
2099 if (faults > max_faults)
2100 max_faults = faults;
2101 }
2102
2103 for_each_online_node(nid) {
2104 faults = group_faults_cpu(numa_group, nid);
2105 if (faults * ACTIVE_NODE_FRACTION > max_faults)
2106 active_nodes++;
2107 }
2108
2109 numa_group->max_faults_cpu = max_faults;
2110 numa_group->active_nodes = active_nodes;
2111}
2112
2113
2114
2115
2116
2117
2118
2119
2120#define NUMA_PERIOD_SLOTS 10
2121#define NUMA_PERIOD_THRESHOLD 7
2122
2123
2124
2125
2126
2127
2128
2129static void update_task_scan_period(struct task_struct *p,
2130 unsigned long shared, unsigned long private)
2131{
2132 unsigned int period_slot;
2133 int lr_ratio, ps_ratio;
2134 int diff;
2135
2136 unsigned long remote = p->numa_faults_locality[0];
2137 unsigned long local = p->numa_faults_locality[1];
2138
2139
2140
2141
2142
2143
2144
2145
2146 if (local + shared == 0 || p->numa_faults_locality[2]) {
2147 p->numa_scan_period = min(p->numa_scan_period_max,
2148 p->numa_scan_period << 1);
2149
2150 p->mm->numa_next_scan = jiffies +
2151 msecs_to_jiffies(p->numa_scan_period);
2152
2153 return;
2154 }
2155
2156
2157
2158
2159
2160
2161
2162 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
2163 lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
2164 ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
2165
2166 if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
2167
2168
2169
2170
2171 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
2172 if (!slot)
2173 slot = 1;
2174 diff = slot * period_slot;
2175 } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
2176
2177
2178
2179
2180
2181 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
2182 if (!slot)
2183 slot = 1;
2184 diff = slot * period_slot;
2185 } else {
2186
2187
2188
2189
2190
2191 int ratio = max(lr_ratio, ps_ratio);
2192 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
2193 }
2194
2195 p->numa_scan_period = clamp(p->numa_scan_period + diff,
2196 task_scan_min(p), task_scan_max(p));
2197 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2198}
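
/*
 * Example with NUMA_PERIOD_SLOTS = 10 and NUMA_PERIOD_THRESHOLD = 7:
 * if 90% of faults are private (ps_ratio = 9) the period grows by
 * (9 - 7) * period_slot, i.e. roughly 20%, slowing the scanner down;
 * a mostly-shared, mostly-remote workload instead shrinks the period
 * so placement can be corrected sooner.
 */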
2199
2200
2201
2202
2203
2204
2205
2206
2207static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2208{
2209 u64 runtime, delta, now;
2210
2211 now = p->se.exec_start;
2212 runtime = p->se.sum_exec_runtime;
2213
2214 if (p->last_task_numa_placement) {
2215 delta = runtime - p->last_sum_exec_runtime;
2216 *period = now - p->last_task_numa_placement;
2217
2218
2219 if (unlikely((s64)*period < 0))
2220 *period = 0;
2221 } else {
2222 delta = p->se.avg.load_sum;
2223 *period = LOAD_AVG_MAX;
2224 }
2225
2226 p->last_sum_exec_runtime = runtime;
2227 p->last_task_numa_placement = now;
2228
2229 return delta;
2230}
2231
2232
2233
2234
2235
2236
2237static int preferred_group_nid(struct task_struct *p, int nid)
2238{
2239 nodemask_t nodes;
2240 int dist;
2241
2242
2243 if (sched_numa_topology_type == NUMA_DIRECT)
2244 return nid;
2245
2246
2247
2248
2249
2250
2251 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2252 unsigned long score, max_score = 0;
2253 int node, max_node = nid;
2254
2255 dist = sched_max_numa_distance;
2256
2257 for_each_online_node(node) {
2258 score = group_weight(p, node, dist);
2259 if (score > max_score) {
2260 max_score = score;
2261 max_node = node;
2262 }
2263 }
2264 return max_node;
2265 }
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276 nodes = node_online_map;
2277 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2278 unsigned long max_faults = 0;
2279 nodemask_t max_group = NODE_MASK_NONE;
2280 int a, b;
2281
2282
2283 if (!find_numa_distance(dist))
2284 continue;
2285
2286 for_each_node_mask(a, nodes) {
2287 unsigned long faults = 0;
2288 nodemask_t this_group;
2289 nodes_clear(this_group);
2290
2291
2292 for_each_node_mask(b, nodes) {
2293 if (node_distance(a, b) < dist) {
2294 faults += group_faults(p, b);
2295 node_set(b, this_group);
2296 node_clear(b, nodes);
2297 }
2298 }
2299
2300
2301 if (faults > max_faults) {
2302 max_faults = faults;
2303 max_group = this_group;
2304
2305
2306
2307
2308
2309 nid = a;
2310 }
2311 }
2312
2313 if (!max_faults)
2314 break;
2315 nodes = max_group;
2316 }
2317 return nid;
2318}
2319
2320static void task_numa_placement(struct task_struct *p)
2321{
2322 int seq, nid, max_nid = NUMA_NO_NODE;
2323 unsigned long max_faults = 0;
2324 unsigned long fault_types[2] = { 0, 0 };
2325 unsigned long total_faults;
2326 u64 runtime, period;
2327 spinlock_t *group_lock = NULL;
2328 struct numa_group *ng;
2329
2330
2331
2332
2333
2334
2335 seq = READ_ONCE(p->mm->numa_scan_seq);
2336 if (p->numa_scan_seq == seq)
2337 return;
2338 p->numa_scan_seq = seq;
2339 p->numa_scan_period_max = task_scan_max(p);
2340
2341 total_faults = p->numa_faults_locality[0] +
2342 p->numa_faults_locality[1];
2343 runtime = numa_get_avg_runtime(p, &period);
2344
2345
2346 ng = deref_curr_numa_group(p);
2347 if (ng) {
2348 group_lock = &ng->lock;
2349 spin_lock_irq(group_lock);
2350 }
2351
2352
2353 for_each_online_node(nid) {
2354
2355 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2356 unsigned long faults = 0, group_faults = 0;
2357 int priv;
2358
2359 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2360 long diff, f_diff, f_weight;
2361
2362 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2363 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2364 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2365 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2366
2367
2368 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2369 fault_types[priv] += p->numa_faults[membuf_idx];
2370 p->numa_faults[membuf_idx] = 0;
2371
2372
2373
2374
2375
2376
2377
2378
2379 f_weight = div64_u64(runtime << 16, period + 1);
2380 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2381 (total_faults + 1);
2382 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2383 p->numa_faults[cpubuf_idx] = 0;
2384
2385 p->numa_faults[mem_idx] += diff;
2386 p->numa_faults[cpu_idx] += f_diff;
2387 faults += p->numa_faults[mem_idx];
2388 p->total_numa_faults += diff;
2389 if (ng) {
2390
2391
2392
2393
2394
2395
2396
2397 ng->faults[mem_idx] += diff;
2398 ng->faults_cpu[mem_idx] += f_diff;
2399 ng->total_faults += diff;
2400 group_faults += ng->faults[mem_idx];
2401 }
2402 }
2403
2404 if (!ng) {
2405 if (faults > max_faults) {
2406 max_faults = faults;
2407 max_nid = nid;
2408 }
2409 } else if (group_faults > max_faults) {
2410 max_faults = group_faults;
2411 max_nid = nid;
2412 }
2413 }
2414
2415 if (ng) {
2416 numa_group_count_active_nodes(ng);
2417 spin_unlock_irq(group_lock);
2418 max_nid = preferred_group_nid(p, max_nid);
2419 }
2420
2421 if (max_faults) {
2422
2423 if (max_nid != p->numa_preferred_nid)
2424 sched_setnuma(p, max_nid);
2425 }
2426
2427 update_task_scan_period(p, fault_types[0], fault_types[1]);
2428}
2429
2430static inline int get_numa_group(struct numa_group *grp)
2431{
2432 return refcount_inc_not_zero(&grp->refcount);
2433}
2434
2435static inline void put_numa_group(struct numa_group *grp)
2436{
2437 if (refcount_dec_and_test(&grp->refcount))
2438 kfree_rcu(grp, rcu);
2439}
2440
2441static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2442 int *priv)
2443{
2444 struct numa_group *grp, *my_grp;
2445 struct task_struct *tsk;
2446 bool join = false;
2447 int cpu = cpupid_to_cpu(cpupid);
2448 int i;
2449
2450 if (unlikely(!deref_curr_numa_group(p))) {
2451 unsigned int size = sizeof(struct numa_group) +
2452 4*nr_node_ids*sizeof(unsigned long);
2453
2454 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2455 if (!grp)
2456 return;
2457
2458 refcount_set(&grp->refcount, 1);
2459 grp->active_nodes = 1;
2460 grp->max_faults_cpu = 0;
2461 spin_lock_init(&grp->lock);
2462 grp->gid = p->pid;
2463
2464 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2465 nr_node_ids;
2466
2467 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2468 grp->faults[i] = p->numa_faults[i];
2469
2470 grp->total_faults = p->total_numa_faults;
2471
2472 grp->nr_tasks++;
2473 rcu_assign_pointer(p->numa_group, grp);
2474 }
2475
2476 rcu_read_lock();
2477 tsk = READ_ONCE(cpu_rq(cpu)->curr);
2478
2479 if (!cpupid_match_pid(tsk, cpupid))
2480 goto no_join;
2481
2482 grp = rcu_dereference(tsk->numa_group);
2483 if (!grp)
2484 goto no_join;
2485
2486 my_grp = deref_curr_numa_group(p);
2487 if (grp == my_grp)
2488 goto no_join;
2489
2490
2491
2492
2493
2494 if (my_grp->nr_tasks > grp->nr_tasks)
2495 goto no_join;
2496
2497
2498
2499
2500 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2501 goto no_join;
2502
2503
2504 if (tsk->mm == current->mm)
2505 join = true;
2506
2507
2508 if (flags & TNF_SHARED)
2509 join = true;
2510
2511
2512 *priv = !join;
2513
2514 if (join && !get_numa_group(grp))
2515 goto no_join;
2516
2517 rcu_read_unlock();
2518
2519 if (!join)
2520 return;
2521
2522 BUG_ON(irqs_disabled());
2523 double_lock_irq(&my_grp->lock, &grp->lock);
2524
2525 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2526 my_grp->faults[i] -= p->numa_faults[i];
2527 grp->faults[i] += p->numa_faults[i];
2528 }
2529 my_grp->total_faults -= p->total_numa_faults;
2530 grp->total_faults += p->total_numa_faults;
2531
2532 my_grp->nr_tasks--;
2533 grp->nr_tasks++;
2534
2535 spin_unlock(&my_grp->lock);
2536 spin_unlock_irq(&grp->lock);
2537
2538 rcu_assign_pointer(p->numa_group, grp);
2539
2540 put_numa_group(my_grp);
2541 return;
2542
2543no_join:
2544 rcu_read_unlock();
2545 return;
2546}
2547
2548
2549
2550
2551
2552
2553
2554
2555void task_numa_free(struct task_struct *p, bool final)
2556{
2557
2558 struct numa_group *grp = rcu_dereference_raw(p->numa_group);
2559 unsigned long *numa_faults = p->numa_faults;
2560 unsigned long flags;
2561 int i;
2562
2563 if (!numa_faults)
2564 return;
2565
2566 if (grp) {
2567 spin_lock_irqsave(&grp->lock, flags);
2568 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2569 grp->faults[i] -= p->numa_faults[i];
2570 grp->total_faults -= p->total_numa_faults;
2571
2572 grp->nr_tasks--;
2573 spin_unlock_irqrestore(&grp->lock, flags);
2574 RCU_INIT_POINTER(p->numa_group, NULL);
2575 put_numa_group(grp);
2576 }
2577
2578 if (final) {
2579 p->numa_faults = NULL;
2580 kfree(numa_faults);
2581 } else {
2582 p->total_numa_faults = 0;
2583 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2584 numa_faults[i] = 0;
2585 }
2586}
2587
2588
2589
2590
2591void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2592{
2593 struct task_struct *p = current;
2594 bool migrated = flags & TNF_MIGRATED;
2595 int cpu_node = task_node(current);
2596 int local = !!(flags & TNF_FAULT_LOCAL);
2597 struct numa_group *ng;
2598 int priv;
2599
2600 if (!static_branch_likely(&sched_numa_balancing))
2601 return;
2602
2603
2604 if (!p->mm)
2605 return;
2606
2607
2608 if (unlikely(!p->numa_faults)) {
2609 int size = sizeof(*p->numa_faults) *
2610 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2611
2612 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2613 if (!p->numa_faults)
2614 return;
2615
2616 p->total_numa_faults = 0;
2617 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2618 }
2619
2620
2621
2622
2623
2624 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2625 priv = 1;
2626 } else {
2627 priv = cpupid_match_pid(p, last_cpupid);
2628 if (!priv && !(flags & TNF_NO_GROUP))
2629 task_numa_group(p, last_cpupid, flags, &priv);
2630 }
2631
2632
2633
2634
2635
2636
2637
2638 ng = deref_curr_numa_group(p);
2639 if (!priv && !local && ng && ng->active_nodes > 1 &&
2640 numa_is_active_node(cpu_node, ng) &&
2641 numa_is_active_node(mem_node, ng))
2642 local = 1;
2643
2644
2645
2646
2647
2648 if (time_after(jiffies, p->numa_migrate_retry)) {
2649 task_numa_placement(p);
2650 numa_migrate_preferred(p);
2651 }
2652
2653 if (migrated)
2654 p->numa_pages_migrated += pages;
2655 if (flags & TNF_MIGRATE_FAIL)
2656 p->numa_faults_locality[2] += pages;
2657
2658 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2659 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2660 p->numa_faults_locality[local] += pages;
2661}
2662
2663static void reset_ptenuma_scan(struct task_struct *p)
2664{
2665
2666
2667
2668
2669
2670
2671
2672
2673 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2674 p->mm->numa_scan_offset = 0;
2675}
2676
2677
2678
2679
2680
2681static void task_numa_work(struct callback_head *work)
2682{
2683 unsigned long migrate, next_scan, now = jiffies;
2684 struct task_struct *p = current;
2685 struct mm_struct *mm = p->mm;
2686 u64 runtime = p->se.sum_exec_runtime;
2687 struct vm_area_struct *vma;
2688 unsigned long start, end;
2689 unsigned long nr_pte_updates = 0;
2690 long pages, virtpages;
2691
2692 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2693
2694 work->next = work;
2695
2696
2697
2698
2699
2700
2701
2702
2703 if (p->flags & PF_EXITING)
2704 return;
2705
2706 if (!mm->numa_next_scan) {
2707 mm->numa_next_scan = now +
2708 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2709 }
2710
2711
2712
2713
2714 migrate = mm->numa_next_scan;
2715 if (time_before(now, migrate))
2716 return;
2717
2718 if (p->numa_scan_period == 0) {
2719 p->numa_scan_period_max = task_scan_max(p);
2720 p->numa_scan_period = task_scan_start(p);
2721 }
2722
2723 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2724 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2725 return;
2726
2727
2728
2729
2730
2731 p->node_stamp += 2 * TICK_NSEC;
2732
2733 start = mm->numa_scan_offset;
2734 pages = sysctl_numa_balancing_scan_size;
2735 pages <<= 20 - PAGE_SHIFT;
2736 virtpages = pages * 8;
2737 if (!pages)
2738 return;
2739
2740
2741 if (!mmap_read_trylock(mm))
2742 return;
2743 vma = find_vma(mm, start);
2744 if (!vma) {
2745 reset_ptenuma_scan(p);
2746 start = 0;
2747 vma = mm->mmap;
2748 }
2749 for (; vma; vma = vma->vm_next) {
2750 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2751 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2752 continue;
2753 }
2754
2755
2756
2757
2758
2759
2760
2761 if (!vma->vm_mm ||
2762 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2763 continue;
2764
2765
2766
2767
2768
2769 if (!vma_is_accessible(vma))
2770 continue;
2771
2772 do {
2773 start = max(start, vma->vm_start);
2774 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2775 end = min(end, vma->vm_end);
2776 nr_pte_updates = change_prot_numa(vma, start, end);
2777
2778 /*
2779 * Try to scan sysctl_numa_balancing_scan_size worth of
2780 * hpages that have at least one present PTE that
2781 * is not already pte-numa. If the VMA contains
2782 * areas that are unused or already full of prot_numa
2783 * PTEs, scan up to virtpages, to skip through those
2784 * areas faster.
2785 */
2786 if (nr_pte_updates)
2787 pages -= (end - start) >> PAGE_SHIFT;
2788 virtpages -= (end - start) >> PAGE_SHIFT;
2789
2790 start = end;
2791 if (pages <= 0 || virtpages <= 0)
2792 goto out;
2793
2794 cond_resched();
2795 } while (end != vma->vm_end);
2796 }
2797
2798out:
2799 /*
2800 * It is possible to reach the end of the VMA list but the last few
2801 * VMAs are not guaranteed to be vma_migratable. If they are not, we
2802 * would find the !migratable VMA on the next scan but not reset the
2803 * scanner to the start so check it now.
2804 */
2805 if (vma)
2806 mm->numa_scan_offset = start;
2807 else
2808 reset_ptenuma_scan(p);
2809 mmap_read_unlock(mm);
2810
2811 /*
2812 * Make sure tasks use at least 32x as much time to run other code
2813 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2814 * Usually update_task_scan_period slows down scanning enough; on an
2815 * overloaded system we need to limit overhead on a per task basis.
2816 */
2817 if (unlikely(p->se.sum_exec_runtime != runtime)) {
2818 u64 diff = p->se.sum_exec_runtime - runtime;
2819 p->node_stamp += 32 * diff;
2820 }
2821}
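
/*
 * Example of the 32x back-off above (illustrative): if this scan pass
 * consumed diff = 1ms of sum_exec_runtime, node_stamp is pushed 32ms into
 * the task's future CPU time, so PTE scanning is limited to roughly 3% of
 * the time the task actually spends running.
 */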
2822
2823void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2824{
2825 int mm_users = 0;
2826 struct mm_struct *mm = p->mm;
2827
2828 if (mm) {
2829 mm_users = atomic_read(&mm->mm_users);
2830 if (mm_users == 1) {
2831 mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2832 mm->numa_scan_seq = 0;
2833 }
2834 }
2835 p->node_stamp = 0;
2836 p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
2837 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2838
2839 p->numa_work.next = &p->numa_work;
2840 p->numa_faults = NULL;
2841 RCU_INIT_POINTER(p->numa_group, NULL);
2842 p->last_task_numa_placement = 0;
2843 p->last_sum_exec_runtime = 0;
2844
2845 init_task_work(&p->numa_work, task_numa_work);
2846
2847
2848 if (!(clone_flags & CLONE_VM)) {
2849 p->numa_preferred_nid = NUMA_NO_NODE;
2850 return;
2851 }
2852
2853 /*
2854 * New thread: keep the existing numa_preferred_nid (already copied by
2855 * arch_dup_task_struct()), but stagger when scans start.
2856 */
2857 if (mm) {
2858 unsigned int delay;
2859
2860 delay = min_t(unsigned int, task_scan_max(current),
2861 current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2862 delay += 2 * TICK_NSEC;
2863 p->node_stamp = delay;
2864 }
2865}
2866
2867/*
2868 * Drive the periodic memory faults.
2869 */
2870static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2871{
2872 struct callback_head *work = &curr->numa_work;
2873 u64 period, now;
2874
2875
2876
2877
2878 if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
2879 return;
2880
2881 /*
2882 * Using runtime rather than walltime has the dual advantage that
2883 * we (mostly) drive the selection from busy threads and that the
2884 * task needs to have done some actual work before we bother with
2885 * NUMA placement.
2886 */
2887 now = curr->se.sum_exec_runtime;
2888 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2889
2890 if (now > curr->node_stamp + period) {
2891 if (!curr->node_stamp)
2892 curr->numa_scan_period = task_scan_start(curr);
2893 curr->node_stamp += period;
2894
2895 if (!time_before(jiffies, curr->mm->numa_next_scan))
2896 task_work_add(curr, work, TWA_RESUME);
2897 }
2898}
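
/*
 * Example (illustrative): with numa_scan_period == 1000ms the work is only
 * queued once the task has accumulated node_stamp + 1s of CPU time *and*
 * jiffies has passed mm->numa_next_scan, so short-lived or mostly idle
 * tasks never pay for NUMA hinting faults.
 */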
2899
2900static void update_scan_period(struct task_struct *p, int new_cpu)
2901{
2902 int src_nid = cpu_to_node(task_cpu(p));
2903 int dst_nid = cpu_to_node(new_cpu);
2904
2905 if (!static_branch_likely(&sched_numa_balancing))
2906 return;
2907
2908 if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
2909 return;
2910
2911 if (src_nid == dst_nid)
2912 return;
2913
2914 /*
2915 * Allow resets if faults have been trapped before one scan has
2916 * completed. This is most likely due to a new task that is pulled
2917 * cross-node due to wakeups or load balancing.
2918 */
2919 if (p->numa_scan_seq) {
2920 /*
2921 * Avoid scan adjustments if moving to the preferred
2922 * node or if the task was not previously running on
2923 * the preferred node.
2924 */
2925 if (dst_nid == p->numa_preferred_nid ||
2926 (p->numa_preferred_nid != NUMA_NO_NODE &&
2927 src_nid != p->numa_preferred_nid))
2928 return;
2929 }
2930
2931 p->numa_scan_period = task_scan_start(p);
2932}
2933
2934#else
2935static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2936{
2937}
2938
2939static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2940{
2941}
2942
2943static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2944{
2945}
2946
2947static inline void update_scan_period(struct task_struct *p, int new_cpu)
2948{
2949}
2950
2951#endif
2952
2953static void
2954account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2955{
2956 update_load_add(&cfs_rq->load, se->load.weight);
2957#ifdef CONFIG_SMP
2958 if (entity_is_task(se)) {
2959 struct rq *rq = rq_of(cfs_rq);
2960
2961 account_numa_enqueue(rq, task_of(se));
2962 list_add(&se->group_node, &rq->cfs_tasks);
2963 }
2964#endif
2965 cfs_rq->nr_running++;
2966}
2967
2968static void
2969account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2970{
2971 update_load_sub(&cfs_rq->load, se->load.weight);
2972#ifdef CONFIG_SMP
2973 if (entity_is_task(se)) {
2974 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2975 list_del_init(&se->group_node);
2976 }
2977#endif
2978 cfs_rq->nr_running--;
2979}
2980
2981/*
2982 * Signed add and clamp on underflow.
2983 *
2984 * Explicitly do a load-store to ensure the intermediate value never hits
2985 * memory. This allows lockless observations without ever seeing the negative
2986 * values.
2987 */
2988#define add_positive(_ptr, _val) do { \
2989 typeof(_ptr) ptr = (_ptr); \
2990 typeof(_val) val = (_val); \
2991 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2992 \
2993 res = var + val; \
2994 \
2995 if (val < 0 && res > var) \
2996 res = 0; \
2997 \
2998 WRITE_ONCE(*ptr, res); \
2999} while (0)
3000
3001/*
3002 * Unsigned subtract and clamp on underflow.
3003 *
3004 * Explicitly do a load-store to ensure the intermediate value never hits
3005 * memory. This allows lockless observations without ever seeing the negative
3006 * values.
3007 */
3008#define sub_positive(_ptr, _val) do { \
3009 typeof(_ptr) ptr = (_ptr); \
3010 typeof(*ptr) val = (_val); \
3011 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3012 res = var - val; \
3013 if (res > var) \
3014 res = 0; \
3015 WRITE_ONCE(*ptr, res); \
3016} while (0)
3017
3018/*
3019 * Remove and clamp on negative, from a local variable.
3020 *
3021 * A variant of sub_positive(), which does not use explicit load-store
3022 * and is thus optimized for local variable updates.
3023 */
3024#define lsub_positive(_ptr, _val) do { \
3025 typeof(_ptr) ptr = (_ptr); \
3026 *ptr -= min_t(typeof(*ptr), *ptr, _val); \
3027} while (0)
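
/*
 * Example of the clamping behaviour (illustrative): with an unsigned
 * average of 5, sub_positive(&avg, 7) computes res = 5 - 7, which wraps
 * above var and is therefore clamped, leaving 0 rather than a huge bogus
 * value; add_positive(&avg, -7) behaves the same way for a signed delta.
 * lsub_positive() does the equivalent for a local variable, without the
 * READ_ONCE/WRITE_ONCE pair.
 */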
3028
3029#ifdef CONFIG_SMP
3030static inline void
3031enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3032{
3033 cfs_rq->avg.load_avg += se->avg.load_avg;
3034 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
3035}
3036
3037static inline void
3038dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3039{
3040 u32 divider = get_pelt_divider(&se->avg);
3041 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3042 cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
3043}
3044#else
3045static inline void
3046enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3047static inline void
3048dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3049#endif
3050
3051static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
3052 unsigned long weight)
3053{
3054 if (se->on_rq) {
3055 /* commit outstanding execution time */
3056 if (cfs_rq->curr == se)
3057 update_curr(cfs_rq);
3058 update_load_sub(&cfs_rq->load, se->load.weight);
3059 }
3060 dequeue_load_avg(cfs_rq, se);
3061
3062 update_load_set(&se->load, weight);
3063
3064#ifdef CONFIG_SMP
3065 do {
3066 u32 divider = get_pelt_divider(&se->avg);
3067
3068 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
3069 } while (0);
3070#endif
3071
3072 enqueue_load_avg(cfs_rq, se);
3073 if (se->on_rq)
3074 update_load_add(&cfs_rq->load, se->load.weight);
3075
3076}
3077
3078void reweight_task(struct task_struct *p, int prio)
3079{
3080 struct sched_entity *se = &p->se;
3081 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3082 struct load_weight *load = &se->load;
3083 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
3084
3085 reweight_entity(cfs_rq, se, weight);
3086 load->inv_weight = sched_prio_to_wmult[prio];
3087}
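
/*
 * Example (illustrative, using the default nice-to-weight table): renicing
 * a task from nice 0 (weight 1024) to nice 10 (weight 110) makes
 * reweight_entity() dequeue the old contribution, install the new weight
 * and rescale load_avg = se_weight(se) * load_sum / divider, so the
 * entity's load_avg drops by roughly a factor of ten while its PELT sums
 * are left untouched.
 */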
3088
3089#ifdef CONFIG_FAIR_GROUP_SCHED
3090#ifdef CONFIG_SMP
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144/*
3145 * The weight of a group entity should be the group's weight, distributed
3146 * in proportion to the per-CPU group runqueue weights:
3147 *
3148 *                     tg->weight * grq->load.weight
3149 *   ge->load.weight = -----------------------------
3150 *                       \Sum grq->load.weight
3151 *
3152 * Computing that global sum exactly is prohibitively expensive, so the
3153 * denominator is approximated with the slower-moving load averages, and
3154 * the local runqueue's own term uses max(grq->load.weight,
3155 * grq->avg.load_avg) so that a previously idle group gets weight as soon
3156 * as its first task is enqueued:
3157 *
3158 *                          tg->shares * load
3159 *   shares = -----------------------------------------------
3160 *            tg->load_avg - grq->tg_load_avg_contrib + load
3161 *
3162 * with load as above, clamped to [MIN_SHARES, tg->shares] below.
3163 */
3164static long calc_group_shares(struct cfs_rq *cfs_rq)
3165{
3166 long tg_weight, tg_shares, load, shares;
3167 struct task_group *tg = cfs_rq->tg;
3168
3169 tg_shares = READ_ONCE(tg->shares);
3170
3171 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
3172
3173 tg_weight = atomic_long_read(&tg->load_avg);
3174
3175
3176 tg_weight -= cfs_rq->tg_load_avg_contrib;
3177 tg_weight += load;
3178
3179 shares = (tg_shares * load);
3180 if (tg_weight)
3181 shares /= tg_weight;
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195 return clamp_t(long, shares, MIN_SHARES, tg_shares);
3196}
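
/*
 * Worked example (illustrative): with tg_shares = 1024 and two busy CPUs
 * whose group runqueues carry load 512 and 1536, the CPU with load 512
 * sees tg_weight ~= 512 + 1536 = 2048 and computes
 * shares = 1024 * 512 / 2048 = 256, i.e. its group entity gets a quarter
 * of the group's weight, mirroring its share of the group's load.
 */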
3197#endif
3198
3199static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3200
3201/*
3202 * Recomputes the group entity based on the current state of its group
3203 * runqueue.
3204 */
3205static void update_cfs_group(struct sched_entity *se)
3206{
3207 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3208 long shares;
3209
3210 if (!gcfs_rq)
3211 return;
3212
3213 if (throttled_hierarchy(gcfs_rq))
3214 return;
3215
3216#ifndef CONFIG_SMP
3217 shares = READ_ONCE(gcfs_rq->tg->shares);
3218
3219 if (likely(se->load.weight == shares))
3220 return;
3221#else
3222 shares = calc_group_shares(gcfs_rq);
3223#endif
3224
3225 reweight_entity(cfs_rq_of(se), se, shares);
3226}
3227
3228#else
3229static inline void update_cfs_group(struct sched_entity *se)
3230{
3231}
3232#endif
3233
3234static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3235{
3236 struct rq *rq = rq_of(cfs_rq);
3237
3238 if (&rq->cfs == cfs_rq) {
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253 cpufreq_update_util(rq, flags);
3254 }
3255}
3256
3257#ifdef CONFIG_SMP
3258#ifdef CONFIG_FAIR_GROUP_SCHED
3259/*
3260 * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
3261 * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
3262 * bottom-up, we only have to test whether the cfs_rq before us on the list
3263 * is our child.
3264 * If cfs_rq is not on the list, test whether a child still needs to be
3265 * added to connect a branch to the tree (see list_add_leaf_cfs_rq()).
3266 */
3267static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
3268{
3269 struct cfs_rq *prev_cfs_rq;
3270 struct list_head *prev;
3271
3272 if (cfs_rq->on_list) {
3273 prev = cfs_rq->leaf_cfs_rq_list.prev;
3274 } else {
3275 struct rq *rq = rq_of(cfs_rq);
3276
3277 prev = rq->tmp_alone_branch;
3278 }
3279
3280 prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
3281
3282 return (prev_cfs_rq->tg->parent == cfs_rq->tg);
3283}
3284
3285static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
3286{
3287 if (cfs_rq->load.weight)
3288 return false;
3289
3290 if (cfs_rq->avg.load_sum)
3291 return false;
3292
3293 if (cfs_rq->avg.util_sum)
3294 return false;
3295
3296 if (cfs_rq->avg.runnable_sum)
3297 return false;
3298
3299 if (child_cfs_rq_on_list(cfs_rq))
3300 return false;
3301
3302 /*
3303 * _avg must be null when _sum are null because _avg = _sum / divider.
3304 * Make sure that rounding and/or propagation of PELT values never
3305 * break this.
3306 */
3307 SCHED_WARN_ON(cfs_rq->avg.load_avg ||
3308 cfs_rq->avg.util_avg ||
3309 cfs_rq->avg.runnable_avg);
3310
3311 return true;
3312}
3313
3314
3315
3316
3317
3318
3319/*
3320 * update_tg_load_avg - update the tg's load avg
3321 * @cfs_rq: the cfs_rq whose avg changed
3322 *
3323 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
3324 * Because tg->load_avg is a global value, updating it is expensive; a
3325 * differential update based on the last value propagated
3326 * (tg_load_avg_contrib) lets us skip updates when the change is small.
3327 */
3328static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
3329{
3330 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
3331
3332
3333
3334
3335 if (cfs_rq->tg == &root_task_group)
3336 return;
3337
3338 if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3339 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3340 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3341 }
3342}
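
/*
 * Example of the 1/64 filter above (illustrative): with a previous
 * contribution of 6400, tg->load_avg is only touched once the local
 * load_avg has drifted by more than 100, which keeps updates to the
 * shared atomic rare on large machines.
 */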
3343
3344/*
3345 * Called within set_task_rq() right before setting a task's CPU. The
3346 * caller only guarantees p->pi_lock is held; no other assumptions,
3347 * including the state of rq->lock, should be made.
3348 */
3349void set_task_rq_fair(struct sched_entity *se,
3350 struct cfs_rq *prev, struct cfs_rq *next)
3351{
3352 u64 p_last_update_time;
3353 u64 n_last_update_time;
3354
3355 if (!sched_feat(ATTACH_AGE_LOAD))
3356 return;
3357
3358
3359
3360
3361
3362
3363
3364
3365 if (!(se->avg.last_update_time && prev))
3366 return;
3367
3368#ifndef CONFIG_64BIT
3369 {
3370 u64 p_last_update_time_copy;
3371 u64 n_last_update_time_copy;
3372
3373 do {
3374 p_last_update_time_copy = prev->load_last_update_time_copy;
3375 n_last_update_time_copy = next->load_last_update_time_copy;
3376
3377 smp_rmb();
3378
3379 p_last_update_time = prev->avg.last_update_time;
3380 n_last_update_time = next->avg.last_update_time;
3381
3382 } while (p_last_update_time != p_last_update_time_copy ||
3383 n_last_update_time != n_last_update_time_copy);
3384 }
3385#else
3386 p_last_update_time = prev->avg.last_update_time;
3387 n_last_update_time = next->avg.last_update_time;
3388#endif
3389 __update_load_avg_blocked_se(p_last_update_time, se);
3390 se->avg.last_update_time = n_last_update_time;
3391}
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447/*
3448 * When the state of a child cfs_rq changes (e.g. a task is attached or
3449 * detached somewhere below a group entity), the change has to be propagated
3450 * up the hierarchy so that every group entity keeps reflecting the state of
3451 * its group runqueue:
3452 *
3453 * - util_avg and runnable_avg of the group entity can simply be set to the
3454 *   child cfs_rq's values, since a group entity is running/runnable exactly
3455 *   when its runqueue is (update_tg_cfs_util()/update_tg_cfs_runnable());
3456 *
3457 * - load is harder, because the entity's weight is generally not the sum of
3458 *   its runqueue's weights.  update_tg_cfs_load() therefore estimates a new
3459 *   load_sum for the entity from the propagated prop_runnable_sum and
3460 *   derives load_avg from it.
3461 */
3462static inline void
3463update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3464{
3465 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3466 u32 divider;
3467
3468
3469 if (!delta)
3470 return;
3471
3472
3473
3474
3475
3476 divider = get_pelt_divider(&cfs_rq->avg);
3477
3478
3479 se->avg.util_avg = gcfs_rq->avg.util_avg;
3480 se->avg.util_sum = se->avg.util_avg * divider;
3481
3482
3483 add_positive(&cfs_rq->avg.util_avg, delta);
3484 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
3485}
3486
3487static inline void
3488update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3489{
3490 long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
3491 u32 divider;
3492
3493
3494 if (!delta)
3495 return;
3496
3497
3498
3499
3500
3501 divider = get_pelt_divider(&cfs_rq->avg);
3502
3503
3504 se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
3505 se->avg.runnable_sum = se->avg.runnable_avg * divider;
3506
3507
3508 add_positive(&cfs_rq->avg.runnable_avg, delta);
3509 cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
3510}
3511
3512static inline void
3513update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3514{
3515 long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3516 unsigned long load_avg;
3517 u64 load_sum = 0;
3518 u32 divider;
3519
3520 if (!runnable_sum)
3521 return;
3522
3523 gcfs_rq->prop_runnable_sum = 0;
3524
3525
3526
3527
3528
3529 divider = get_pelt_divider(&cfs_rq->avg);
3530
3531 if (runnable_sum >= 0) {
3532
3533
3534
3535
3536 runnable_sum += se->avg.load_sum;
3537 runnable_sum = min_t(long, runnable_sum, divider);
3538 } else {
3539
3540
3541
3542
3543 if (scale_load_down(gcfs_rq->load.weight)) {
3544 load_sum = div_s64(gcfs_rq->avg.load_sum,
3545 scale_load_down(gcfs_rq->load.weight));
3546 }
3547
3548
3549 runnable_sum = min(se->avg.load_sum, load_sum);
3550 }
3551
3552
3553
3554
3555
3556
3557
3558 running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
3559 runnable_sum = max(runnable_sum, running_sum);
3560
3561 load_sum = (s64)se_weight(se) * runnable_sum;
3562 load_avg = div_s64(load_sum, divider);
3563
3564 se->avg.load_sum = runnable_sum;
3565
3566 delta = load_avg - se->avg.load_avg;
3567 if (!delta)
3568 return;
3569
3570 se->avg.load_avg = load_avg;
3571
3572 add_positive(&cfs_rq->avg.load_avg, delta);
3573 cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
3574}
3575
3576static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
3577{
3578 cfs_rq->propagate = 1;
3579 cfs_rq->prop_runnable_sum += runnable_sum;
3580}
3581
3582
3583static inline int propagate_entity_load_avg(struct sched_entity *se)
3584{
3585 struct cfs_rq *cfs_rq, *gcfs_rq;
3586
3587 if (entity_is_task(se))
3588 return 0;
3589
3590 gcfs_rq = group_cfs_rq(se);
3591 if (!gcfs_rq->propagate)
3592 return 0;
3593
3594 gcfs_rq->propagate = 0;
3595
3596 cfs_rq = cfs_rq_of(se);
3597
3598 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
3599
3600 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3601 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3602 update_tg_cfs_load(cfs_rq, se, gcfs_rq);
3603
3604 trace_pelt_cfs_tp(cfs_rq);
3605 trace_pelt_se_tp(se);
3606
3607 return 1;
3608}
3609
3610
3611
3612
3613
3614static inline bool skip_blocked_update(struct sched_entity *se)
3615{
3616 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3617
3618
3619
3620
3621
3622 if (se->avg.load_avg || se->avg.util_avg)
3623 return false;
3624
3625
3626
3627
3628
3629 if (gcfs_rq->propagate)
3630 return false;
3631
3632
3633
3634
3635
3636
3637 return true;
3638}
3639
3640#else
3641
3642static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
3643
3644static inline int propagate_entity_load_avg(struct sched_entity *se)
3645{
3646 return 0;
3647}
3648
3649static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
3650
3651#endif
3652
3653/*
3654 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
3655 * @now: current time, as per cfs_rq_clock_pelt()
3656 * @cfs_rq: cfs_rq to update
3657 *
3658 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
3659 * avg. The immediate corollary is that all (fair) tasks must be attached, see
3660 * post_init_entity_util_avg().
3661 *
3662 * cfs_rq->avg is used for task_h_load() and update_cfs_group() for example.
3663 *
3664 * Return: true if the load decayed or we removed load.
3665 *
3666 * Since both these conditions indicate a changed cfs_rq->avg.load we should
3667 * call update_tg_load_avg() when this function returns true.
3668 */
3669static inline int
3670update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3671{
3672 unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
3673 struct sched_avg *sa = &cfs_rq->avg;
3674 int decayed = 0;
3675
3676 if (cfs_rq->removed.nr) {
3677 unsigned long r;
3678 u32 divider = get_pelt_divider(&cfs_rq->avg);
3679
3680 raw_spin_lock(&cfs_rq->removed.lock);
3681 swap(cfs_rq->removed.util_avg, removed_util);
3682 swap(cfs_rq->removed.load_avg, removed_load);
3683 swap(cfs_rq->removed.runnable_avg, removed_runnable);
3684 cfs_rq->removed.nr = 0;
3685 raw_spin_unlock(&cfs_rq->removed.lock);
3686
3687 r = removed_load;
3688 sub_positive(&sa->load_avg, r);
3689 sa->load_sum = sa->load_avg * divider;
3690
3691 r = removed_util;
3692 sub_positive(&sa->util_avg, r);
3693 sa->util_sum = sa->util_avg * divider;
3694
3695 r = removed_runnable;
3696 sub_positive(&sa->runnable_avg, r);
3697 sa->runnable_sum = sa->runnable_avg * divider;
3698
3699
3700
3701
3702
3703 add_tg_cfs_propagate(cfs_rq,
3704 -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
3705
3706 decayed = 1;
3707 }
3708
3709 decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
3710
3711#ifndef CONFIG_64BIT
3712 smp_wmb();
3713 cfs_rq->load_last_update_time_copy = sa->last_update_time;
3714#endif
3715
3716 return decayed;
3717}
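
/*
 * Example (illustrative): when a task with load_avg = 200 migrates away,
 * remove_entity_load_avg() adds 200 to cfs_rq->removed.load_avg; the next
 * call here folds that in by subtracting 200 from sa->load_avg, recomputing
 * load_sum as load_avg * divider, and returning nonzero so the caller knows
 * to refresh tg->load_avg.
 */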
3718
3719
3720
3721
3722/*
3723 * attach_entity_load_avg - attach this entity to its cfs_rq load avg.
3724 * Must call update_cfs_rq_load_avg() before this, since we rely on
3725 * cfs_rq->avg.last_update_time being current.
3726 */
3727static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3728{
3729
3730
3731
3732
3733 u32 divider = get_pelt_divider(&cfs_rq->avg);
3734
3735
3736
3737
3738
3739
3740
3741
3742 se->avg.last_update_time = cfs_rq->avg.last_update_time;
3743 se->avg.period_contrib = cfs_rq->avg.period_contrib;
3744
3745
3746
3747
3748
3749
3750
3751 se->avg.util_sum = se->avg.util_avg * divider;
3752
3753 se->avg.runnable_sum = se->avg.runnable_avg * divider;
3754
3755 se->avg.load_sum = divider;
3756 if (se_weight(se)) {
3757 se->avg.load_sum =
3758 div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3759 }
3760
3761 enqueue_load_avg(cfs_rq, se);
3762 cfs_rq->avg.util_avg += se->avg.util_avg;
3763 cfs_rq->avg.util_sum += se->avg.util_sum;
3764 cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
3765 cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
3766
3767 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3768
3769 cfs_rq_util_change(cfs_rq, 0);
3770
3771 trace_pelt_cfs_tp(cfs_rq);
3772}
3773
3774
3775
3776
3777/*
3778 * detach_entity_load_avg - detach this entity from its cfs_rq load avg.
3779 * Must call update_cfs_rq_load_avg() before this, since we rely on
3780 * cfs_rq->avg.last_update_time being current.
3781 */
3782static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3783{
3784
3785
3786
3787
3788 u32 divider = get_pelt_divider(&cfs_rq->avg);
3789
3790 dequeue_load_avg(cfs_rq, se);
3791 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3792 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
3793 sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
3794 cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
3795
3796 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3797
3798 cfs_rq_util_change(cfs_rq, 0);
3799
3800 trace_pelt_cfs_tp(cfs_rq);
3801}
3802
3803
3804
3805
3806#define UPDATE_TG 0x1
3807#define SKIP_AGE_LOAD 0x2
3808#define DO_ATTACH 0x4
3809
3810/* Update task and its cfs_rq load average */
3811static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3812{
3813 u64 now = cfs_rq_clock_pelt(cfs_rq);
3814 int decayed;
3815
3816
3817
3818
3819
3820 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3821 __update_load_avg_se(now, cfs_rq, se);
3822
3823 decayed = update_cfs_rq_load_avg(now, cfs_rq);
3824 decayed |= propagate_entity_load_avg(se);
3825
3826 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3827
3828
3829
3830
3831
3832
3833
3834
3835 attach_entity_load_avg(cfs_rq, se);
3836 update_tg_load_avg(cfs_rq);
3837
3838 } else if (decayed) {
3839 cfs_rq_util_change(cfs_rq, 0);
3840
3841 if (flags & UPDATE_TG)
3842 update_tg_load_avg(cfs_rq);
3843 }
3844}
3845
3846#ifndef CONFIG_64BIT
3847static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3848{
3849 u64 last_update_time_copy;
3850 u64 last_update_time;
3851
3852 do {
3853 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3854 smp_rmb();
3855 last_update_time = cfs_rq->avg.last_update_time;
3856 } while (last_update_time != last_update_time_copy);
3857
3858 return last_update_time;
3859}
3860#else
3861static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3862{
3863 return cfs_rq->avg.last_update_time;
3864}
3865#endif
3866
3867/*
3868 * Synchronize entity load avg of dequeued entity without locking
3869 * the previous rq.
3870 */
3871static void sync_entity_load_avg(struct sched_entity *se)
3872{
3873 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3874 u64 last_update_time;
3875
3876 last_update_time = cfs_rq_last_update_time(cfs_rq);
3877 __update_load_avg_blocked_se(last_update_time, se);
3878}
3879
3880/*
3881 * Task first catches up with cfs_rq, and then subtracts itself from the
3882 * cfs_rq (task must be off the queue now).
3883 */
3884static void remove_entity_load_avg(struct sched_entity *se)
3885{
3886 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3887 unsigned long flags;
3888
3889
3890
3891
3892
3893
3894
3895 sync_entity_load_avg(se);
3896
3897 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
3898 ++cfs_rq->removed.nr;
3899 cfs_rq->removed.util_avg += se->avg.util_avg;
3900 cfs_rq->removed.load_avg += se->avg.load_avg;
3901 cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
3902 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
3903}
3904
3905static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
3906{
3907 return cfs_rq->avg.runnable_avg;
3908}
3909
3910static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3911{
3912 return cfs_rq->avg.load_avg;
3913}
3914
3915static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
3916
3917static inline unsigned long task_util(struct task_struct *p)
3918{
3919 return READ_ONCE(p->se.avg.util_avg);
3920}
3921
3922static inline unsigned long _task_util_est(struct task_struct *p)
3923{
3924 struct util_est ue = READ_ONCE(p->se.avg.util_est);
3925
3926 return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
3927}
3928
3929static inline unsigned long task_util_est(struct task_struct *p)
3930{
3931 return max(task_util(p), _task_util_est(p));
3932}
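
/*
 * Example (illustrative, ignoring the UTIL_AVG_UNCHANGED flag bit): a task
 * with util_avg = 300 whose estimator holds ewma = 400 and enqueued = 450
 * reports _task_util_est() = 450 and task_util_est() = max(300, 450) = 450,
 * so a task that is ramping down is still sized by its recent peak until
 * the EWMA decays.
 */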
3933
3934#ifdef CONFIG_UCLAMP_TASK
3935static inline unsigned long uclamp_task_util(struct task_struct *p)
3936{
3937 return clamp(task_util_est(p),
3938 uclamp_eff_value(p, UCLAMP_MIN),
3939 uclamp_eff_value(p, UCLAMP_MAX));
3940}
3941#else
3942static inline unsigned long uclamp_task_util(struct task_struct *p)
3943{
3944 return task_util_est(p);
3945}
3946#endif
3947
3948static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
3949 struct task_struct *p)
3950{
3951 unsigned int enqueued;
3952
3953 if (!sched_feat(UTIL_EST))
3954 return;
3955
3956
3957 enqueued = cfs_rq->avg.util_est.enqueued;
3958 enqueued += _task_util_est(p);
3959 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3960
3961 trace_sched_util_est_cfs_tp(cfs_rq);
3962}
3963
3964static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
3965 struct task_struct *p)
3966{
3967 unsigned int enqueued;
3968
3969 if (!sched_feat(UTIL_EST))
3970 return;
3971
3972
3973 enqueued = cfs_rq->avg.util_est.enqueued;
3974 enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
3975 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3976
3977 trace_sched_util_est_cfs_tp(cfs_rq);
3978}
3979
3980#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
3981
3982/*
3983 * Check if a (signed) value is within a specified (unsigned) margin,
3984 * based on the observation that:
3985 *
3986 *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
3987 *
3988 * NOTE: this only works when value + margin < INT_MAX.
3989 */
3990static inline bool within_margin(int value, int margin)
3991{
3992 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
3993}
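
/*
 * Example (illustrative): UTIL_EST_MARGIN is 1024/100 = 10, so
 * within_margin(value, 10) reduces to (unsigned)(value + 9) < 19, which is
 * true exactly for -9 <= value <= 9, i.e. |value| < 10, without any branch
 * on the sign of value.
 */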
3994
3995static inline void util_est_update(struct cfs_rq *cfs_rq,
3996 struct task_struct *p,
3997 bool task_sleep)
3998{
3999 long last_ewma_diff, last_enqueued_diff;
4000 struct util_est ue;
4001
4002 if (!sched_feat(UTIL_EST))
4003 return;
4004
4005
4006
4007
4008
4009 if (!task_sleep)
4010 return;
4011
4012
4013
4014
4015
4016 ue = p->se.avg.util_est;
4017 if (ue.enqueued & UTIL_AVG_UNCHANGED)
4018 return;
4019
4020 last_enqueued_diff = ue.enqueued;
4021
4022
4023
4024
4025
4026 ue.enqueued = task_util(p);
4027 if (sched_feat(UTIL_EST_FASTUP)) {
4028 if (ue.ewma < ue.enqueued) {
4029 ue.ewma = ue.enqueued;
4030 goto done;
4031 }
4032 }
4033
4034
4035
4036
4037
4038 last_ewma_diff = ue.enqueued - ue.ewma;
4039 last_enqueued_diff -= ue.enqueued;
4040 if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
4041 if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
4042 goto done;
4043
4044 return;
4045 }
4046
4047
4048
4049
4050
4051 if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
4052 return;
4053
4054 /*
4055 * Update Task's estimated utilization
4056 *
4057 * When *p completes an activation we can consolidate another sample
4058 * of the task size. This is done by storing the current PELT value
4059 * as ue.enqueued and by using this value to update the Exponential
4060 * Weighted Moving Average (EWMA):
4061 *
4062 *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
4063 *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
4064 *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
4065 *          = w * (      last_ewma_diff            ) +     ewma(t-1)
4066 *          = w * (last_ewma_diff  +  ewma(t-1) / w)
4067 *
4068 * Where 'w' is the weight of new samples, which is configured to be
4069 * 0.25, thus making w = 1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
4070 */
4071 ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
4072 ue.ewma += last_ewma_diff;
4073 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
4074done:
4075 ue.enqueued |= UTIL_AVG_UNCHANGED;
4076 WRITE_ONCE(p->se.avg.util_est, ue);
4077
4078 trace_sched_util_est_se_tp(&p->se);
4079}
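
/*
 * Worked example of the EWMA step above (illustrative, assuming
 * UTIL_EST_WEIGHT_SHIFT == 2, i.e. w = 1/4): for ewma = 400 and a new
 * sample ue.enqueued = 300, last_ewma_diff = -100 and the update computes
 *
 *   ((400 << 2) - 100) >> 2 = 1500 / 4 = 375
 *
 * which matches ewma' = ewma + (sample - ewma) / 4.
 */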
4080
4081static inline int task_fits_capacity(struct task_struct *p, long capacity)
4082{
4083 return fits_capacity(uclamp_task_util(p), capacity);
4084}
4085
4086static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
4087{
4088 if (!static_branch_unlikely(&sched_asym_cpucapacity))
4089 return;
4090
4091 if (!p || p->nr_cpus_allowed == 1) {
4092 rq->misfit_task_load = 0;
4093 return;
4094 }
4095
4096 if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
4097 rq->misfit_task_load = 0;
4098 return;
4099 }
4100
4101
4102
4103
4104
4105 rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
4106}
4107
4108#else
4109
4110static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
4111{
4112 return true;
4113}
4114
4115#define UPDATE_TG 0x0
4116#define SKIP_AGE_LOAD 0x0
4117#define DO_ATTACH 0x0
4118
4119static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
4120{
4121 cfs_rq_util_change(cfs_rq, 0);
4122}
4123
4124static inline void remove_entity_load_avg(struct sched_entity *se) {}
4125
4126static inline void
4127attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4128static inline void
4129detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4130
4131static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
4132{
4133 return 0;
4134}
4135
4136static inline void
4137util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4138
4139static inline void
4140util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4141
4142static inline void
4143util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
4144 bool task_sleep) {}
4145static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
4146
4147#endif
4148
4149static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
4150{
4151#ifdef CONFIG_SCHED_DEBUG
4152 s64 d = se->vruntime - cfs_rq->min_vruntime;
4153
4154 if (d < 0)
4155 d = -d;
4156
4157 if (d > 3*sysctl_sched_latency)
4158 schedstat_inc(cfs_rq->nr_spread_over);
4159#endif
4160}
4161
4162static void
4163place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
4164{
4165 u64 vruntime = cfs_rq->min_vruntime;
4166
4167 /*
4168 * The 'current' period is already promised to the current tasks,
4169 * however the extra weight of the new task will slow them down a
4170 * little, place the new task so that it fits in the slot that
4171 * stays open at the end.
4172 */
4173 if (initial && sched_feat(START_DEBIT))
4174 vruntime += sched_vslice(cfs_rq, se);
4175
4176 /* sleeps up to a single latency don't count. */
4177 if (!initial) {
4178 unsigned long thresh = sysctl_sched_latency;
4179
4180 /*
4181 * Halve their sleep time's effect, to allow
4182 * for a gentler effect of sleepers:
4183 */
4184 if (sched_feat(GENTLE_FAIR_SLEEPERS))
4185 thresh >>= 1;
4186
4187 vruntime -= thresh;
4188 }
4189
4190 /* ensure we never gain time by being placed backwards. */
4191 se->vruntime = max_vruntime(se->vruntime, vruntime);
4192}
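
/*
 * Example (illustrative, assuming the default 6ms sysctl_sched_latency
 * before CPU-count scaling, with GENTLE_FAIR_SLEEPERS enabled): a task
 * waking after a long sleep is placed at min_vruntime - 3ms, giving it a
 * modest wakeup preference, while the final max_vruntime() ensures a task
 * that slept only briefly keeps its own, larger vruntime and gains nothing
 * from sleeping.
 */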
4193
4194static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
4195
4196static inline void check_schedstat_required(void)
4197{
4198#ifdef CONFIG_SCHEDSTATS
4199 if (schedstat_enabled())
4200 return;
4201
4202
4203 if (trace_sched_stat_wait_enabled() ||
4204 trace_sched_stat_sleep_enabled() ||
4205 trace_sched_stat_iowait_enabled() ||
4206 trace_sched_stat_blocked_enabled() ||
4207 trace_sched_stat_runtime_enabled()) {
4208 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
4209 "stat_blocked and stat_runtime require the "
4210 "kernel parameter schedstats=enable or "
4211 "kernel.sched_schedstats=1\n");
4212 }
4213#endif
4214}
4215
4216static inline bool cfs_bandwidth_used(void);
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238/*
4239 * MIGRATION: on dequeue, after update_curr()/update_min_vruntime(), the
4240 * entity's vruntime is made relative (vruntime -= min_vruntime); on the
4241 * enqueue side it is made absolute again (vruntime += min_vruntime), so the
4242 * transition between runqueues happens with both min_vruntime up to date.
4243 *
4244 * WAKEUP (remote): migrate_task_rq_fair() subtracts min_vruntime while
4245 * p->state == TASK_WAKING, and enqueue adds the destination's min_vruntime
4246 * back, so we never rely on a stale min_vruntime from the originating CPU.
4247 */
4248static void
4249enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4250{
4251 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
4252 bool curr = cfs_rq->curr == se;
4253
4254
4255
4256
4257
4258 if (renorm && curr)
4259 se->vruntime += cfs_rq->min_vruntime;
4260
4261 update_curr(cfs_rq);
4262
4263
4264
4265
4266
4267
4268
4269 if (renorm && !curr)
4270 se->vruntime += cfs_rq->min_vruntime;
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
4281 se_update_runnable(se);
4282 update_cfs_group(se);
4283 account_entity_enqueue(cfs_rq, se);
4284
4285 if (flags & ENQUEUE_WAKEUP)
4286 place_entity(cfs_rq, se, 0);
4287
4288 check_schedstat_required();
4289 update_stats_enqueue(cfs_rq, se, flags);
4290 check_spread(cfs_rq, se);
4291 if (!curr)
4292 __enqueue_entity(cfs_rq, se);
4293 se->on_rq = 1;
4294
4295
4296
4297
4298
4299
4300 if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
4301 list_add_leaf_cfs_rq(cfs_rq);
4302
4303 if (cfs_rq->nr_running == 1)
4304 check_enqueue_throttle(cfs_rq);
4305}
4306
4307static void __clear_buddies_last(struct sched_entity *se)
4308{
4309 for_each_sched_entity(se) {
4310 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4311 if (cfs_rq->last != se)
4312 break;
4313
4314 cfs_rq->last = NULL;
4315 }
4316}
4317
4318static void __clear_buddies_next(struct sched_entity *se)
4319{
4320 for_each_sched_entity(se) {
4321 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4322 if (cfs_rq->next != se)
4323 break;
4324
4325 cfs_rq->next = NULL;
4326 }
4327}
4328
4329static void __clear_buddies_skip(struct sched_entity *se)
4330{
4331 for_each_sched_entity(se) {
4332 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4333 if (cfs_rq->skip != se)
4334 break;
4335
4336 cfs_rq->skip = NULL;
4337 }
4338}
4339
4340static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4341{
4342 if (cfs_rq->last == se)
4343 __clear_buddies_last(se);
4344
4345 if (cfs_rq->next == se)
4346 __clear_buddies_next(se);
4347
4348 if (cfs_rq->skip == se)
4349 __clear_buddies_skip(se);
4350}
4351
4352static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4353
4354static void
4355dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4356{
4357
4358
4359
4360 update_curr(cfs_rq);
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370 update_load_avg(cfs_rq, se, UPDATE_TG);
4371 se_update_runnable(se);
4372
4373 update_stats_dequeue(cfs_rq, se, flags);
4374
4375 clear_buddies(cfs_rq, se);
4376
4377 if (se != cfs_rq->curr)
4378 __dequeue_entity(cfs_rq, se);
4379 se->on_rq = 0;
4380 account_entity_dequeue(cfs_rq, se);
4381
4382
4383
4384
4385
4386
4387
4388 if (!(flags & DEQUEUE_SLEEP))
4389 se->vruntime -= cfs_rq->min_vruntime;
4390
4391
4392 return_cfs_rq_runtime(cfs_rq);
4393
4394 update_cfs_group(se);
4395
4396
4397
4398
4399
4400
4401
4402 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
4403 update_min_vruntime(cfs_rq);
4404}
4405
4406
4407
4408
4409static void
4410check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4411{
4412 unsigned long ideal_runtime, delta_exec;
4413 struct sched_entity *se;
4414 s64 delta;
4415
4416 ideal_runtime = sched_slice(cfs_rq, curr);
4417 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4418 if (delta_exec > ideal_runtime) {
4419 resched_curr(rq_of(cfs_rq));
4420
4421
4422
4423
4424 clear_buddies(cfs_rq, curr);
4425 return;
4426 }
4427
4428
4429
4430
4431
4432
4433 if (delta_exec < sysctl_sched_min_granularity)
4434 return;
4435
4436 se = __pick_first_entity(cfs_rq);
4437 delta = curr->vruntime - se->vruntime;
4438
4439 if (delta < 0)
4440 return;
4441
4442 if (delta > ideal_runtime)
4443 resched_curr(rq_of(cfs_rq));
4444}
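
/*
 * Example (illustrative, with default tunables): if sched_slice() grants
 * curr an ideal_runtime of 3ms and it has already run 4ms since it was
 * last picked, resched_curr() is called immediately; otherwise curr is
 * allowed at least sysctl_sched_min_granularity of wall-clock runtime
 * before the leftmost entity's vruntime lead is even considered.
 */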
4445
4446static void
4447set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4448{
4449 clear_buddies(cfs_rq, se);
4450
4451
4452 if (se->on_rq) {
4453
4454
4455
4456
4457
4458 update_stats_wait_end(cfs_rq, se);
4459 __dequeue_entity(cfs_rq, se);
4460 update_load_avg(cfs_rq, se, UPDATE_TG);
4461 }
4462
4463 update_stats_curr_start(cfs_rq, se);
4464 cfs_rq->curr = se;
4465
4466
4467
4468
4469
4470
4471 if (schedstat_enabled() &&
4472 rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
4473 schedstat_set(se->statistics.slice_max,
4474 max((u64)schedstat_val(se->statistics.slice_max),
4475 se->sum_exec_runtime - se->prev_sum_exec_runtime));
4476 }
4477
4478 se->prev_sum_exec_runtime = se->sum_exec_runtime;
4479}
4480
4481static int
4482wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4483
4484
4485
4486
4487
4488
4489
4490
4491static struct sched_entity *
4492pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4493{
4494 struct sched_entity *left = __pick_first_entity(cfs_rq);
4495 struct sched_entity *se;
4496
4497
4498
4499
4500
4501 if (!left || (curr && entity_before(curr, left)))
4502 left = curr;
4503
4504 se = left;
4505
4506
4507
4508
4509
4510 if (cfs_rq->skip && cfs_rq->skip == se) {
4511 struct sched_entity *second;
4512
4513 if (se == curr) {
4514 second = __pick_first_entity(cfs_rq);
4515 } else {
4516 second = __pick_next_entity(se);
4517 if (!second || (curr && entity_before(curr, second)))
4518 second = curr;
4519 }
4520
4521 if (second && wakeup_preempt_entity(second, left) < 1)
4522 se = second;
4523 }
4524
4525 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
4526
4527
4528
4529 se = cfs_rq->next;
4530 } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
4531
4532
4533
4534 se = cfs_rq->last;
4535 }
4536
4537 return se;
4538}
4539
4540static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4541
4542static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4543{
4544
4545
4546
4547
4548 if (prev->on_rq)
4549 update_curr(cfs_rq);
4550
4551
4552 check_cfs_rq_runtime(cfs_rq);
4553
4554 check_spread(cfs_rq, prev);
4555
4556 if (prev->on_rq) {
4557 update_stats_wait_start(cfs_rq, prev);
4558
4559 __enqueue_entity(cfs_rq, prev);
4560
4561 update_load_avg(cfs_rq, prev, 0);
4562 }
4563 cfs_rq->curr = NULL;
4564}
4565
4566static void
4567entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4568{
4569
4570
4571
4572 update_curr(cfs_rq);
4573
4574
4575
4576
4577 update_load_avg(cfs_rq, curr, UPDATE_TG);
4578 update_cfs_group(curr);
4579
4580#ifdef CONFIG_SCHED_HRTICK
4581
4582
4583
4584
4585 if (queued) {
4586 resched_curr(rq_of(cfs_rq));
4587 return;
4588 }
4589
4590
4591
4592 if (!sched_feat(DOUBLE_TICK) &&
4593 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4594 return;
4595#endif
4596
4597 if (cfs_rq->nr_running > 1)
4598 check_preempt_tick(cfs_rq, curr);
4599}
4600
4601
4602
4603
4604
4605
4606#ifdef CONFIG_CFS_BANDWIDTH
4607
4608#ifdef CONFIG_JUMP_LABEL
4609static struct static_key __cfs_bandwidth_used;
4610
4611static inline bool cfs_bandwidth_used(void)
4612{
4613 return static_key_false(&__cfs_bandwidth_used);
4614}
4615
4616void cfs_bandwidth_usage_inc(void)
4617{
4618 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
4619}
4620
4621void cfs_bandwidth_usage_dec(void)
4622{
4623 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
4624}