// SPDX-License-Identifier: GPL-2.0-only
/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */
9#define CREATE_TRACE_POINTS
10#include <trace/events/sched.h>
11#undef CREATE_TRACE_POINTS
12
13#include "sched.h"
14
15#include <linux/nospec.h>
16
17#include <linux/kcov.h>
18#include <linux/scs.h>
19
20#include <asm/switch_to.h>
21#include <asm/tlb.h>
22
23#include "../workqueue_internal.h"
24#include "../../fs/io-wq.h"
25#include "../smpboot.h"
26
27#include "pelt.h"
28#include "smp.h"
29
/*
 * Export tracepoints that act as a bare tracehook (ie: have no trace event
 * associated with them) to allow external modules to probe them.
 */
34EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
35EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
36EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
37EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
38EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
39EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
40EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
41EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
42EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
43EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
44
45DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
46
47#ifdef CONFIG_SCHED_DEBUG
/*
 * Debugging: various feature bits
 *
 * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
 * sysctl_sched_features, defined in sched.h, to allow constants propagation
 * at compile time and compiler optimization based on features default.
 */
55#define SCHED_FEAT(name, enabled) \
56 (1UL << __SCHED_FEAT_##name) * enabled |
57const_debug unsigned int sysctl_sched_features =
58#include "features.h"
59 0;
60#undef SCHED_FEAT
61
/*
 * Print a warning if need_resched is set for the given duration (if
 * LATENCY_WARN is enabled).
 *
 * If sysctl_resched_latency_warn_once is set, only one warning will be shown
 * per boot.
 */
69__read_mostly int sysctl_resched_latency_warn_ms = 100;
70__read_mostly int sysctl_resched_latency_warn_once = 1;
71#endif

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
77const_debug unsigned int sysctl_sched_nr_migrate = 32;
78
/*
 * period over which we measure -rt task CPU usage in us.
 * default: 1s
 */
83unsigned int sysctl_sched_rt_period = 1000000;
84
85__read_mostly int scheduler_running;
86
87#ifdef CONFIG_SCHED_CORE
88
89DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
90
91
92static inline int __task_prio(struct task_struct *p)
93{
94 if (p->sched_class == &stop_sched_class)
95 return -2;
96
97 if (rt_prio(p->prio))
98 return p->prio;
99
100 if (p->sched_class == &idle_sched_class)
101 return MAX_RT_PRIO + NICE_WIDTH;
102
103 return MAX_RT_PRIO + MAX_NICE;
104}
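
/*
 * The resulting scale of __task_prio() (lower value == higher priority),
 * assuming the usual MAX_RT_PRIO = 100, MAX_NICE = 19, NICE_WIDTH = 40:
 *
 *   stop task             -> -2
 *   SCHED_DEADLINE tasks  -> -1 (their p->prio, via the rt_prio() branch)
 *   SCHED_FIFO/RR tasks   ->  0..99 (p->prio)
 *   fair (CFS) tasks      -> 119
 *   the idle task         -> 140
 */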
105
/*
 * l(a,b)
 * le(a,b) := !l(b,a)
 * g(a,b)  := l(b,a)
 * ge(a,b) := !l(a,b)
 */

/* real prio, less is less */
114static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
115{
116
117 int pa = __task_prio(a), pb = __task_prio(b);
118
119 if (-pa < -pb)
120 return true;
121
122 if (-pb < -pa)
123 return false;
124
125 if (pa == -1)
126 return !dl_time_before(a->dl.deadline, b->dl.deadline);
127
128 if (pa == MAX_RT_PRIO + MAX_NICE)
129 return cfs_prio_less(a, b, in_fi);
130
131 return false;
132}
133
134static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
135{
136 if (a->core_cookie < b->core_cookie)
137 return true;
138
139 if (a->core_cookie > b->core_cookie)
140 return false;
141
142
143 if (prio_less(b, a, task_rq(a)->core->core_forceidle))
144 return true;
145
146 return false;
147}
148
149#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)
150
151static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
152{
153 return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
154}
155
156static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
157{
158 const struct task_struct *p = __node_2_sc(node);
159 unsigned long cookie = (unsigned long)key;
160
161 if (cookie < p->core_cookie)
162 return -1;
163
164 if (cookie > p->core_cookie)
165 return 1;
166
167 return 0;
168}
169
170void sched_core_enqueue(struct rq *rq, struct task_struct *p)
171{
172 rq->core->core_task_seq++;
173
174 if (!p->core_cookie)
175 return;
176
177 rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
178}
179
180void sched_core_dequeue(struct rq *rq, struct task_struct *p)
181{
182 rq->core->core_task_seq++;
183
184 if (!sched_core_enqueued(p))
185 return;
186
187 rb_erase(&p->core_node, &rq->core_tree);
188 RB_CLEAR_NODE(&p->core_node);
189}
190
/*
 * Find left-most (aka, highest priority) task matching @cookie.
 */
194static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
195{
196 struct rb_node *node;
197
198 node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
199
	/*
	 * The idle task always matches any cookie!
	 */
202 if (!node)
203 return idle_sched_class.pick_task(rq);
204
205 return __node_2_sc(node);
206}
207
208static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
209{
210 struct rb_node *node = &p->core_node;
211
212 node = rb_next(node);
213 if (!node)
214 return NULL;
215
216 p = container_of(node, struct task_struct, core_node);
217 if (p->core_cookie != cookie)
218 return NULL;
219
220 return p;
221}
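
/*
 * sched_core_find() returns the highest-priority task on @rq whose
 * core_cookie matches @cookie, falling back to the idle task when nothing
 * matches.  sched_core_next() continues the walk through further tasks
 * with the same cookie in the cookie-ordered rbtree and returns NULL once
 * the cookie changes.
 */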
222
/*
 * The static key + per-rq core_enabled flag below must be flipped such
 * that:
 *
 *	raw_spin_rq_lock(rq);
 *	...
 *	raw_spin_rq_unlock(rq);
 *
 * always ends up locking and unlocking the *same* lock, and such that all
 * CPUs agree on which lock a given rq uses.  sched_core_get()/put()
 * serialize the transitions via sched_core_mutex and track the number of
 * users in sched_core_count.
 */
236static DEFINE_MUTEX(sched_core_mutex);
237static atomic_t sched_core_count;
238static struct cpumask sched_core_mask;
239
240static void sched_core_lock(int cpu, unsigned long *flags)
241{
242 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
243 int t, i = 0;
244
245 local_irq_save(*flags);
246 for_each_cpu(t, smt_mask)
247 raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
248}
249
250static void sched_core_unlock(int cpu, unsigned long *flags)
251{
252 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
253 int t;
254
255 for_each_cpu(t, smt_mask)
256 raw_spin_unlock(&cpu_rq(t)->__lock);
257 local_irq_restore(*flags);
258}
259
260static void __sched_core_flip(bool enabled)
261{
262 unsigned long flags;
263 int cpu, t;
264
265 cpus_read_lock();
266
267
268
269
270 cpumask_copy(&sched_core_mask, cpu_online_mask);
271 for_each_cpu(cpu, &sched_core_mask) {
272 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
273
274 sched_core_lock(cpu, &flags);
275
276 for_each_cpu(t, smt_mask)
277 cpu_rq(t)->core_enabled = enabled;
278
279 sched_core_unlock(cpu, &flags);
280
281 cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
282 }
283
284
285
286
287 cpumask_copy(&sched_core_mask, cpu_possible_mask);
288 cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
289
290 for_each_cpu(cpu, &sched_core_mask)
291 cpu_rq(cpu)->core_enabled = enabled;
292
293 cpus_read_unlock();
294}
295
296static void sched_core_assert_empty(void)
297{
298 int cpu;
299
300 for_each_possible_cpu(cpu)
301 WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
302}
303
304static void __sched_core_enable(void)
305{
306 static_branch_enable(&__sched_core_enabled);
307
	/*
	 * Ensure all previous instances of raw_spin_rq_*lock() have finished
	 * and future ones will observe !sched_core_disabled().
	 */
311 synchronize_rcu();
312 __sched_core_flip(true);
313 sched_core_assert_empty();
314}
315
316static void __sched_core_disable(void)
317{
318 sched_core_assert_empty();
319 __sched_core_flip(false);
320 static_branch_disable(&__sched_core_enabled);
321}
322
323void sched_core_get(void)
324{
325 if (atomic_inc_not_zero(&sched_core_count))
326 return;
327
328 mutex_lock(&sched_core_mutex);
329 if (!atomic_read(&sched_core_count))
330 __sched_core_enable();
331
332 smp_mb__before_atomic();
333 atomic_inc(&sched_core_count);
334 mutex_unlock(&sched_core_mutex);
335}
336
337static void __sched_core_put(struct work_struct *work)
338{
339 if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
340 __sched_core_disable();
341 mutex_unlock(&sched_core_mutex);
342 }
343}
344
345void sched_core_put(void)
346{
347 static DECLARE_WORK(_work, __sched_core_put);
348
349
350
351
352
353
354
355
356 if (!atomic_add_unless(&sched_core_count, -1, 1))
357 schedule_work(&_work);
358}
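
/*
 * sched_core_get()/sched_core_put() refcount the users of core scheduling
 * (anything that sets a core cookie).  The first get enables the static
 * key and flips every rq to the per-core lock; the last put undoes that.
 * Because sched_core_put() can be called from contexts that must not block
 * on sched_core_mutex, atomic_add_unless(..., -1, 1) refuses to drop the
 * final reference here and defers the disable to the workqueue instead.
 */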
359
360#else
361
362static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
363static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
364
365#endif
366
/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
371int sysctl_sched_rt_runtime = 950000;
372
/*
 * Serialization rules (condensed):
 *
 * Lock order:
 *
 *   p->pi_lock
 *     rq->lock
 *       hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
 *
 *   rq1->lock
 *     rq2->lock  where: rq1 < rq2
 *
 * Regular scheduling state is serialized by rq->lock: __schedule() takes
 * the local CPU's rq->lock and only looks at local data structures to pick
 * the next task.  Task enqueue is also done under rq->lock, possibly taken
 * from another CPU.
 *
 * System calls and anything external use task_rq_lock(), which acquires
 * both p->pi_lock and rq->lock, so the state they change (affinity, nice,
 * policy/priority, task group, uclamp values, ...) is stable while holding
 * either lock.
 *
 * Wakeups, and in particular wakeups that involve migration, are carefully
 * ordered so that only one of the two rq->locks has to be taken; see
 * try_to_wake_up() and task_rq_lock() below for the memory-ordering games
 * this requires.
 */
466
467void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
468{
469 raw_spinlock_t *lock;
470
471
472 preempt_disable();
473 if (sched_core_disabled()) {
474 raw_spin_lock_nested(&rq->__lock, subclass);
475
476 preempt_enable_no_resched();
477 return;
478 }
479
480 for (;;) {
481 lock = __rq_lockp(rq);
482 raw_spin_lock_nested(lock, subclass);
483 if (likely(lock == __rq_lockp(rq))) {
484
485 preempt_enable_no_resched();
486 return;
487 }
488 raw_spin_unlock(lock);
489 }
490}
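
/*
 * Note the retry loop above: with core scheduling the lock returned by
 * __rq_lockp(rq) can change (between the per-rq and the per-core lock)
 * while we were spinning on it, so after acquiring we re-check that the
 * rq still maps to the same lock and otherwise drop it and try again.
 * The same pattern is used by raw_spin_rq_trylock() below.
 */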
491
492bool raw_spin_rq_trylock(struct rq *rq)
493{
494 raw_spinlock_t *lock;
495 bool ret;
496
497
498 preempt_disable();
499 if (sched_core_disabled()) {
500 ret = raw_spin_trylock(&rq->__lock);
501 preempt_enable();
502 return ret;
503 }
504
505 for (;;) {
506 lock = __rq_lockp(rq);
507 ret = raw_spin_trylock(lock);
508 if (!ret || (likely(lock == __rq_lockp(rq)))) {
509 preempt_enable();
510 return ret;
511 }
512 raw_spin_unlock(lock);
513 }
514}
515
516void raw_spin_rq_unlock(struct rq *rq)
517{
518 raw_spin_unlock(rq_lockp(rq));
519}
520
521#ifdef CONFIG_SMP
522
523
524
525void double_rq_lock(struct rq *rq1, struct rq *rq2)
526{
527 lockdep_assert_irqs_disabled();
528
529 if (rq_order_less(rq2, rq1))
530 swap(rq1, rq2);
531
532 raw_spin_rq_lock(rq1);
533 if (__rq_lockp(rq1) == __rq_lockp(rq2))
534 return;
535
536 raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
537}
538#endif
539
540
541
542
543struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
544 __acquires(rq->lock)
545{
546 struct rq *rq;
547
548 lockdep_assert_held(&p->pi_lock);
549
550 for (;;) {
551 rq = task_rq(p);
552 raw_spin_rq_lock(rq);
553 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
554 rq_pin_lock(rq, rf);
555 return rq;
556 }
557 raw_spin_rq_unlock(rq);
558
559 while (unlikely(task_on_rq_migrating(p)))
560 cpu_relax();
561 }
562}
563
564
565
566
567struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
568 __acquires(p->pi_lock)
569 __acquires(rq->lock)
570{
571 struct rq *rq;
572
573 for (;;) {
574 raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
575 rq = task_rq(p);
576 raw_spin_rq_lock(rq);
577
		/*
		 *	move_queued_task()		task_rq_lock()
		 *
		 *	ACQUIRE (rq->lock)
		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
		 *	[S] ->cpu = new_cpu		[L] task_rq()
		 *					[L] ->on_rq
		 *	RELEASE (rq->lock)
		 *
		 * If we observe the old CPU in task_rq_lock(), the acquire of
		 * the old rq->lock will fully serialize against the stores.
		 *
		 * If we observe the new CPU in task_rq_lock(), the address
		 * dependency headed by '[L] rq = task_rq()' and the acquire
		 * will pair with the WMB to ensure we then also see migrating.
		 */
594 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
595 rq_pin_lock(rq, rf);
596 return rq;
597 }
598 raw_spin_rq_unlock(rq);
599 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
600
601 while (unlikely(task_on_rq_migrating(p)))
602 cpu_relax();
603 }
604}
605
606
607
608
609
610static void update_rq_clock_task(struct rq *rq, s64 delta)
611{
612
613
614
615
616 s64 __maybe_unused steal = 0, irq_delta = 0;
617
618#ifdef CONFIG_IRQ_TIME_ACCOUNTING
619 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight miss-attribution of {soft,}irq
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
636 if (irq_delta > delta)
637 irq_delta = delta;
638
639 rq->prev_irq_time += irq_delta;
640 delta -= irq_delta;
641#endif
642#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
644 steal = paravirt_steal_clock(cpu_of(rq));
645 steal -= rq->prev_steal_time_rq;
646
647 if (unlikely(steal > delta))
648 steal = delta;
649
650 rq->prev_steal_time_rq += steal;
651 delta -= steal;
652 }
653#endif
654
655 rq->clock_task += delta;
656
657#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
658 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
659 update_irq_load_avg(rq, irq_delta + steal);
660#endif
661 update_rq_clock_pelt(rq, delta);
662}
663
664void update_rq_clock(struct rq *rq)
665{
666 s64 delta;
667
668 lockdep_assert_rq_held(rq);
669
670 if (rq->clock_update_flags & RQCF_ACT_SKIP)
671 return;
672
673#ifdef CONFIG_SCHED_DEBUG
674 if (sched_feat(WARN_DOUBLE_CLOCK))
675 SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
676 rq->clock_update_flags |= RQCF_UPDATED;
677#endif
678
679 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
680 if (delta < 0)
681 return;
682 rq->clock += delta;
683 update_rq_clock_task(rq, delta);
684}
685
686#ifdef CONFIG_SCHED_HRTICK
687
688
689
690
691static void hrtick_clear(struct rq *rq)
692{
693 if (hrtimer_active(&rq->hrtick_timer))
694 hrtimer_cancel(&rq->hrtick_timer);
695}
696
697
698
699
700
701static enum hrtimer_restart hrtick(struct hrtimer *timer)
702{
703 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
704 struct rq_flags rf;
705
706 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
707
708 rq_lock(rq, &rf);
709 update_rq_clock(rq);
710 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
711 rq_unlock(rq, &rf);
712
713 return HRTIMER_NORESTART;
714}
715
716#ifdef CONFIG_SMP
717
718static void __hrtick_restart(struct rq *rq)
719{
720 struct hrtimer *timer = &rq->hrtick_timer;
721 ktime_t time = rq->hrtick_time;
722
723 hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
724}
725
726
727
728
729static void __hrtick_start(void *arg)
730{
731 struct rq *rq = arg;
732 struct rq_flags rf;
733
734 rq_lock(rq, &rf);
735 __hrtick_restart(rq);
736 rq_unlock(rq, &rf);
737}
738
739
740
741
742
743
744void hrtick_start(struct rq *rq, u64 delay)
745{
746 struct hrtimer *timer = &rq->hrtick_timer;
747 s64 delta;
748
749
750
751
752
753 delta = max_t(s64, delay, 10000LL);
754 rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
755
756 if (rq == this_rq())
757 __hrtick_restart(rq);
758 else
759 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
760}
761
762#else
763
764
765
766
767
768void hrtick_start(struct rq *rq, u64 delay)
769{
770
771
772
773
774 delay = max_t(u64, delay, 10000LL);
775 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
776 HRTIMER_MODE_REL_PINNED_HARD);
777}
778
779#endif
780
781static void hrtick_rq_init(struct rq *rq)
782{
783#ifdef CONFIG_SMP
784 INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
785#endif
786 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
787 rq->hrtick_timer.function = hrtick;
788}
789#else
790static inline void hrtick_clear(struct rq *rq)
791{
792}
793
794static inline void hrtick_rq_init(struct rq *rq)
795{
796}
797#endif
798
799
800
801
802#define fetch_or(ptr, mask) \
803 ({ \
804 typeof(ptr) _ptr = (ptr); \
805 typeof(mask) _mask = (mask); \
806 typeof(*_ptr) _old, _val = *_ptr; \
807 \
808 for (;;) { \
809 _old = cmpxchg(_ptr, _val, _val | _mask); \
810 if (_old == _val) \
811 break; \
812 _val = _old; \
813 } \
814 _old; \
815})
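
/*
 * Example: atomically set TIF_NEED_RESCHED while observing the previous
 * flags word in one step:
 *
 *	old = fetch_or(&ti->flags, _TIF_NEED_RESCHED);
 *	if (!(old & _TIF_POLLING_NRFLAG))
 *		smp_send_reschedule(cpu);
 *
 * which is exactly the pattern set_nr_and_not_polling() and resched_curr()
 * implement below.
 */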
816
817#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
818
819
820
821
822
823static bool set_nr_and_not_polling(struct task_struct *p)
824{
825 struct thread_info *ti = task_thread_info(p);
826 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
827}
828
829
830
831
832
833
834
835static bool set_nr_if_polling(struct task_struct *p)
836{
837 struct thread_info *ti = task_thread_info(p);
838 typeof(ti->flags) old, val = READ_ONCE(ti->flags);
839
840 for (;;) {
841 if (!(val & _TIF_POLLING_NRFLAG))
842 return false;
843 if (val & _TIF_NEED_RESCHED)
844 return true;
845 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
846 if (old == val)
847 break;
848 val = old;
849 }
850 return true;
851}
852
853#else
854static bool set_nr_and_not_polling(struct task_struct *p)
855{
856 set_tsk_need_resched(p);
857 return true;
858}
859
860#ifdef CONFIG_SMP
861static bool set_nr_if_polling(struct task_struct *p)
862{
863 return false;
864}
865#endif
866#endif
867
868static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
869{
870 struct wake_q_node *node = &task->wake_q;
871
872
873
874
875
876
877
878
879
880 smp_mb__before_atomic();
881 if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
882 return false;
883
884
885
886
887 *head->lastp = node;
888 head->lastp = &node->next;
889 return true;
890}
891
892
893
894
895
896
897
898
899
900
901
902
903
904void wake_q_add(struct wake_q_head *head, struct task_struct *task)
905{
906 if (__wake_q_add(head, task))
907 get_task_struct(task);
908}
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
928{
929 if (!__wake_q_add(head, task))
930 put_task_struct(task);
931}
932
933void wake_up_q(struct wake_q_head *head)
934{
935 struct wake_q_node *node = head->first;
936
937 while (node != WAKE_Q_TAIL) {
938 struct task_struct *task;
939
940 task = container_of(node, struct task_struct, wake_q);
941
942 node = node->next;
943 task->wake_q.next = NULL;
944
945
946
947
948
949 wake_up_process(task);
950 put_task_struct(task);
951 }
952}
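
/*
 * Typical wake_q usage (illustrative sketch):
 *
 *	DEFINE_WAKE_Q(wake_q);
 *
 *	raw_spin_lock(&some_lock);
 *	wake_q_add(&wake_q, task);
 *	raw_spin_unlock(&some_lock);
 *
 *	wake_up_q(&wake_q);
 *
 * Wakeups are queued while holding a lock and issued afterwards, avoiding
 * the wakeup-vs-lock latency; wake_q_add() takes a task reference which
 * wake_up_q() drops, so the tasks cannot disappear in between.
 */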
953
954
955
956
957
958
959
960
961void resched_curr(struct rq *rq)
962{
963 struct task_struct *curr = rq->curr;
964 int cpu;
965
966 lockdep_assert_rq_held(rq);
967
968 if (test_tsk_need_resched(curr))
969 return;
970
971 cpu = cpu_of(rq);
972
973 if (cpu == smp_processor_id()) {
974 set_tsk_need_resched(curr);
975 set_preempt_need_resched();
976 return;
977 }
978
979 if (set_nr_and_not_polling(curr))
980 smp_send_reschedule(cpu);
981 else
982 trace_sched_wake_idle_without_ipi(cpu);
983}
984
985void resched_cpu(int cpu)
986{
987 struct rq *rq = cpu_rq(cpu);
988 unsigned long flags;
989
990 raw_spin_rq_lock_irqsave(rq, flags);
991 if (cpu_online(cpu) || cpu == smp_processor_id())
992 resched_curr(rq);
993 raw_spin_rq_unlock_irqrestore(rq, flags);
994}
995
996#ifdef CONFIG_SMP
997#ifdef CONFIG_NO_HZ_COMMON
/*
 * In the semi idle case, use the nearest busy CPU for migrating timers
 * from an idle CPU.  This is good for power-savings.
 *
 * We don't do similar optimization for completely idle system, as
 * selecting an idle CPU will add more delays to the timers than intended
 * (as that CPU's timer base may not be uptodate wrt jiffies etc).
 */
1006int get_nohz_timer_target(void)
1007{
1008 int i, cpu = smp_processor_id(), default_cpu = -1;
1009 struct sched_domain *sd;
1010
1011 if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
1012 if (!idle_cpu(cpu))
1013 return cpu;
1014 default_cpu = cpu;
1015 }
1016
1017 rcu_read_lock();
1018 for_each_domain(cpu, sd) {
1019 for_each_cpu_and(i, sched_domain_span(sd),
1020 housekeeping_cpumask(HK_FLAG_TIMER)) {
1021 if (cpu == i)
1022 continue;
1023
1024 if (!idle_cpu(i)) {
1025 cpu = i;
1026 goto unlock;
1027 }
1028 }
1029 }
1030
1031 if (default_cpu == -1)
1032 default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
1033 cpu = default_cpu;
1034unlock:
1035 rcu_read_unlock();
1036 return cpu;
1037}
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049static void wake_up_idle_cpu(int cpu)
1050{
1051 struct rq *rq = cpu_rq(cpu);
1052
1053 if (cpu == smp_processor_id())
1054 return;
1055
1056 if (set_nr_and_not_polling(rq->idle))
1057 smp_send_reschedule(cpu);
1058 else
1059 trace_sched_wake_idle_without_ipi(cpu);
1060}
1061
1062static bool wake_up_full_nohz_cpu(int cpu)
1063{
1064
1065
1066
1067
1068
1069
1070 if (cpu_is_offline(cpu))
1071 return true;
1072 if (tick_nohz_full_cpu(cpu)) {
1073 if (cpu != smp_processor_id() ||
1074 tick_nohz_tick_stopped())
1075 tick_nohz_full_kick_cpu(cpu);
1076 return true;
1077 }
1078
1079 return false;
1080}
1081
1082
1083
1084
1085
1086
1087void wake_up_nohz_cpu(int cpu)
1088{
1089 if (!wake_up_full_nohz_cpu(cpu))
1090 wake_up_idle_cpu(cpu);
1091}
1092
1093static void nohz_csd_func(void *info)
1094{
1095 struct rq *rq = info;
1096 int cpu = cpu_of(rq);
1097 unsigned int flags;
1098
1099
1100
1101
1102 flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
1103 WARN_ON(!(flags & NOHZ_KICK_MASK));
1104
1105 rq->idle_balance = idle_cpu(cpu);
1106 if (rq->idle_balance && !need_resched()) {
1107 rq->nohz_idle_balance = flags;
1108 raise_softirq_irqoff(SCHED_SOFTIRQ);
1109 }
1110}
1111
1112#endif
1113
1114#ifdef CONFIG_NO_HZ_FULL
1115bool sched_can_stop_tick(struct rq *rq)
1116{
1117 int fifo_nr_running;
1118
1119
1120 if (rq->dl.dl_nr_running)
1121 return false;
1122
1123
1124
1125
1126
1127 if (rq->rt.rr_nr_running) {
1128 if (rq->rt.rr_nr_running == 1)
1129 return true;
1130 else
1131 return false;
1132 }
1133
1134
1135
1136
1137
1138 fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
1139 if (fifo_nr_running)
1140 return true;
1141
1142
1143
1144
1145
1146
1147 if (rq->nr_running > 1)
1148 return false;
1149
1150 return true;
1151}
1152#endif
1153#endif
1154
1155#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1156 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1157
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
1163int walk_tg_tree_from(struct task_group *from,
1164 tg_visitor down, tg_visitor up, void *data)
1165{
1166 struct task_group *parent, *child;
1167 int ret;
1168
1169 parent = from;
1170
1171down:
1172 ret = (*down)(parent, data);
1173 if (ret)
1174 goto out;
1175 list_for_each_entry_rcu(child, &parent->children, siblings) {
1176 parent = child;
1177 goto down;
1178
1179up:
1180 continue;
1181 }
1182 ret = (*up)(parent, data);
1183 if (ret || parent == from)
1184 goto out;
1185
1186 child = parent;
1187 parent = parent->parent;
1188 if (parent)
1189 goto up;
1190out:
1191 return ret;
1192}
1193
1194int tg_nop(struct task_group *tg, void *data)
1195{
1196 return 0;
1197}
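
/*
 * tg_nop() is the identity visitor: pass it as @down or @up to
 * walk_tg_tree_from() when only one direction matters, e.g. (illustrative)
 *
 *	walk_tg_tree_from(tg, tg_nop, tg_unthrottle_up, (void *)rq);
 *
 * roughly as the CFS bandwidth code does when unthrottling a hierarchy.
 */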
1198#endif
1199
1200static void set_load_weight(struct task_struct *p, bool update_load)
1201{
1202 int prio = p->static_prio - MAX_RT_PRIO;
1203 struct load_weight *load = &p->se.load;
1204
1205
1206
1207
1208 if (task_has_idle_policy(p)) {
1209 load->weight = scale_load(WEIGHT_IDLEPRIO);
1210 load->inv_weight = WMULT_IDLEPRIO;
1211 return;
1212 }
1213
1214
1215
1216
1217
1218 if (update_load && p->sched_class == &fair_sched_class) {
1219 reweight_task(p, prio);
1220 } else {
1221 load->weight = scale_load(sched_prio_to_weight[prio]);
1222 load->inv_weight = sched_prio_to_wmult[prio];
1223 }
1224}
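
/*
 * Example: a nice-0 task has static_prio = 120, so prio = 20 above and
 * load->weight = scale_load(sched_prio_to_weight[20]) = scale_load(1024),
 * i.e. NICE_0_LOAD.  The table is built so that each nice level differs
 * by roughly 25% in weight.
 */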
1225
1226#ifdef CONFIG_UCLAMP_TASK
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237static DEFINE_MUTEX(uclamp_mutex);
1238
1239
1240unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
1241
1242
1243unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
1261
1262
1263static struct uclamp_se uclamp_default[UCLAMP_CNT];
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
1284
1285
1286#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
1287
1288#define for_each_clamp_id(clamp_id) \
1289 for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
1290
1291static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
1292{
1293 return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
1294}
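
/*
 * Worked example, assuming the default SCHED_CAPACITY_SCALE of 1024 and
 * CONFIG_UCLAMP_BUCKETS_COUNT of 5: UCLAMP_BUCKET_DELTA becomes
 * DIV_ROUND_CLOSEST(1024, 5) = 205, so a clamp value of 512 lands in
 * bucket 512 / 205 = 2; the min_t() above guards against rounding pushing
 * SCHED_CAPACITY_SCALE past the last bucket for other bucket counts.
 */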
1295
1296static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
1297{
1298 if (clamp_id == UCLAMP_MIN)
1299 return 0;
1300 return SCHED_CAPACITY_SCALE;
1301}
1302
1303static inline void uclamp_se_set(struct uclamp_se *uc_se,
1304 unsigned int value, bool user_defined)
1305{
1306 uc_se->value = value;
1307 uc_se->bucket_id = uclamp_bucket_id(value);
1308 uc_se->user_defined = user_defined;
1309}
1310
1311static inline unsigned int
1312uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
1313 unsigned int clamp_value)
1314{
1315
1316
1317
1318
1319
1320 if (clamp_id == UCLAMP_MAX) {
1321 rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
1322 return clamp_value;
1323 }
1324
1325 return uclamp_none(UCLAMP_MIN);
1326}
1327
1328static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
1329 unsigned int clamp_value)
1330{
1331
1332 if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1333 return;
1334
1335 WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
1336}
1337
1338static inline
1339unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
1340 unsigned int clamp_value)
1341{
1342 struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
1343 int bucket_id = UCLAMP_BUCKETS - 1;
1344
1345
1346
1347
1348
1349 for ( ; bucket_id >= 0; bucket_id--) {
1350 if (!bucket[bucket_id].tasks)
1351 continue;
1352 return bucket[bucket_id].value;
1353 }
1354
1355
1356 return uclamp_idle_value(rq, clamp_id, clamp_value);
1357}
1358
1359static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1360{
1361 unsigned int default_util_min;
1362 struct uclamp_se *uc_se;
1363
1364 lockdep_assert_held(&p->pi_lock);
1365
1366 uc_se = &p->uclamp_req[UCLAMP_MIN];
1367
1368
1369 if (uc_se->user_defined)
1370 return;
1371
1372 default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1373 uclamp_se_set(uc_se, default_util_min, false);
1374}
1375
1376static void uclamp_update_util_min_rt_default(struct task_struct *p)
1377{
1378 struct rq_flags rf;
1379 struct rq *rq;
1380
1381 if (!rt_task(p))
1382 return;
1383
1384
1385 rq = task_rq_lock(p, &rf);
1386 __uclamp_update_util_min_rt_default(p);
1387 task_rq_unlock(rq, p, &rf);
1388}
1389
1390static void uclamp_sync_util_min_rt_default(void)
1391{
1392 struct task_struct *g, *p;
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407 read_lock(&tasklist_lock);
1408 smp_mb__after_spinlock();
1409 read_unlock(&tasklist_lock);
1410
1411 rcu_read_lock();
1412 for_each_process_thread(g, p)
1413 uclamp_update_util_min_rt_default(p);
1414 rcu_read_unlock();
1415}
1416
1417static inline struct uclamp_se
1418uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
1419{
1420
1421 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
1422#ifdef CONFIG_UCLAMP_TASK_GROUP
1423 unsigned int tg_min, tg_max, value;
1424
1425
1426
1427
1428
1429 if (task_group_is_autogroup(task_group(p)))
1430 return uc_req;
1431 if (task_group(p) == &root_task_group)
1432 return uc_req;
1433
1434 tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
1435 tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
1436 value = uc_req.value;
1437 value = clamp(value, tg_min, tg_max);
1438 uclamp_se_set(&uc_req, value, false);
1439#endif
1440
1441 return uc_req;
1442}
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452static inline struct uclamp_se
1453uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
1454{
1455 struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
1456 struct uclamp_se uc_max = uclamp_default[clamp_id];
1457
1458
1459 if (unlikely(uc_req.value > uc_max.value))
1460 return uc_max;
1461
1462 return uc_req;
1463}
1464
1465unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
1466{
1467 struct uclamp_se uc_eff;
1468
1469
1470 if (p->uclamp[clamp_id].active)
1471 return (unsigned long)p->uclamp[clamp_id].value;
1472
1473 uc_eff = uclamp_eff_get(p, clamp_id);
1474
1475 return (unsigned long)uc_eff.value;
1476}
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
1489 enum uclamp_id clamp_id)
1490{
1491 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1492 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1493 struct uclamp_bucket *bucket;
1494
1495 lockdep_assert_rq_held(rq);
1496
1497
1498 p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
1499
1500 bucket = &uc_rq->bucket[uc_se->bucket_id];
1501 bucket->tasks++;
1502 uc_se->active = true;
1503
1504 uclamp_idle_reset(rq, clamp_id, uc_se->value);
1505
1506
1507
1508
1509
1510 if (bucket->tasks == 1 || uc_se->value > bucket->value)
1511 bucket->value = uc_se->value;
1512
1513 if (uc_se->value > READ_ONCE(uc_rq->value))
1514 WRITE_ONCE(uc_rq->value, uc_se->value);
1515}
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
1527 enum uclamp_id clamp_id)
1528{
1529 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1530 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1531 struct uclamp_bucket *bucket;
1532 unsigned int bkt_clamp;
1533 unsigned int rq_clamp;
1534
1535 lockdep_assert_rq_held(rq);
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560 if (unlikely(!uc_se->active))
1561 return;
1562
1563 bucket = &uc_rq->bucket[uc_se->bucket_id];
1564
1565 SCHED_WARN_ON(!bucket->tasks);
1566 if (likely(bucket->tasks))
1567 bucket->tasks--;
1568
1569 uc_se->active = false;
1570
1571
1572
1573
1574
1575
1576
1577 if (likely(bucket->tasks))
1578 return;
1579
1580 rq_clamp = READ_ONCE(uc_rq->value);
1581
1582
1583
1584
1585 SCHED_WARN_ON(bucket->value > rq_clamp);
1586 if (bucket->value >= rq_clamp) {
1587 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
1588 WRITE_ONCE(uc_rq->value, bkt_clamp);
1589 }
1590}
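
/*
 * Note: rq->uclamp[clamp_id].value is a max aggregate over the per-bucket
 * values.  It is only ever lowered here, when the last task of the bucket
 * that currently defines the max leaves the rq; uclamp_rq_max_value() then
 * re-scans the remaining buckets, or falls back to the "idle" value when
 * no clamped tasks are left.
 */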
1591
1592static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
1593{
1594 enum uclamp_id clamp_id;
1595
1596
1597
1598
1599
1600
1601
1602 if (!static_branch_unlikely(&sched_uclamp_used))
1603 return;
1604
1605 if (unlikely(!p->sched_class->uclamp_enabled))
1606 return;
1607
1608 for_each_clamp_id(clamp_id)
1609 uclamp_rq_inc_id(rq, p, clamp_id);
1610
1611
1612 if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
1613 rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1614}
1615
1616static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
1617{
1618 enum uclamp_id clamp_id;
1619
1620
1621
1622
1623
1624
1625
1626 if (!static_branch_unlikely(&sched_uclamp_used))
1627 return;
1628
1629 if (unlikely(!p->sched_class->uclamp_enabled))
1630 return;
1631
1632 for_each_clamp_id(clamp_id)
1633 uclamp_rq_dec_id(rq, p, clamp_id);
1634}
1635
1636static inline void
1637uclamp_update_active(struct task_struct *p)
1638{
1639 enum uclamp_id clamp_id;
1640 struct rq_flags rf;
1641 struct rq *rq;
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651 rq = task_rq_lock(p, &rf);
1652
1653
1654
1655
1656
1657
1658
1659 for_each_clamp_id(clamp_id) {
1660 if (p->uclamp[clamp_id].active) {
1661 uclamp_rq_dec_id(rq, p, clamp_id);
1662 uclamp_rq_inc_id(rq, p, clamp_id);
1663 }
1664 }
1665
1666 task_rq_unlock(rq, p, &rf);
1667}
1668
1669#ifdef CONFIG_UCLAMP_TASK_GROUP
1670static inline void
1671uclamp_update_active_tasks(struct cgroup_subsys_state *css)
1672{
1673 struct css_task_iter it;
1674 struct task_struct *p;
1675
1676 css_task_iter_start(css, 0, &it);
1677 while ((p = css_task_iter_next(&it)))
1678 uclamp_update_active(p);
1679 css_task_iter_end(&it);
1680}
1681
1682static void cpu_util_update_eff(struct cgroup_subsys_state *css);
1683static void uclamp_update_root_tg(void)
1684{
1685 struct task_group *tg = &root_task_group;
1686
1687 uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
1688 sysctl_sched_uclamp_util_min, false);
1689 uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
1690 sysctl_sched_uclamp_util_max, false);
1691
1692 rcu_read_lock();
1693 cpu_util_update_eff(&root_task_group.css);
1694 rcu_read_unlock();
1695}
1696#else
1697static void uclamp_update_root_tg(void) { }
1698#endif
1699
1700int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1701 void *buffer, size_t *lenp, loff_t *ppos)
1702{
1703 bool update_root_tg = false;
1704 int old_min, old_max, old_min_rt;
1705 int result;
1706
1707 mutex_lock(&uclamp_mutex);
1708 old_min = sysctl_sched_uclamp_util_min;
1709 old_max = sysctl_sched_uclamp_util_max;
1710 old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
1711
1712 result = proc_dointvec(table, write, buffer, lenp, ppos);
1713 if (result)
1714 goto undo;
1715 if (!write)
1716 goto done;
1717
1718 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1719 sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1720 sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1721
1722 result = -EINVAL;
1723 goto undo;
1724 }
1725
1726 if (old_min != sysctl_sched_uclamp_util_min) {
1727 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
1728 sysctl_sched_uclamp_util_min, false);
1729 update_root_tg = true;
1730 }
1731 if (old_max != sysctl_sched_uclamp_util_max) {
1732 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
1733 sysctl_sched_uclamp_util_max, false);
1734 update_root_tg = true;
1735 }
1736
1737 if (update_root_tg) {
1738 static_branch_enable(&sched_uclamp_used);
1739 uclamp_update_root_tg();
1740 }
1741
1742 if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1743 static_branch_enable(&sched_uclamp_used);
1744 uclamp_sync_util_min_rt_default();
1745 }
1746
1747
1748
1749
1750
1751
1752
1753 goto done;
1754
1755undo:
1756 sysctl_sched_uclamp_util_min = old_min;
1757 sysctl_sched_uclamp_util_max = old_max;
1758 sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
1759done:
1760 mutex_unlock(&uclamp_mutex);
1761
1762 return result;
1763}
1764
1765static int uclamp_validate(struct task_struct *p,
1766 const struct sched_attr *attr)
1767{
1768 int util_min = p->uclamp_req[UCLAMP_MIN].value;
1769 int util_max = p->uclamp_req[UCLAMP_MAX].value;
1770
1771 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1772 util_min = attr->sched_util_min;
1773
1774 if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
1775 return -EINVAL;
1776 }
1777
1778 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1779 util_max = attr->sched_util_max;
1780
1781 if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
1782 return -EINVAL;
1783 }
1784
1785 if (util_min != -1 && util_max != -1 && util_min > util_max)
1786 return -EINVAL;
1787
1788
1789
1790
1791
1792
1793
1794
1795 static_branch_enable(&sched_uclamp_used);
1796
1797 return 0;
1798}
1799
1800static bool uclamp_reset(const struct sched_attr *attr,
1801 enum uclamp_id clamp_id,
1802 struct uclamp_se *uc_se)
1803{
1804
1805 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
1806 !uc_se->user_defined)
1807 return true;
1808
1809
1810 if (clamp_id == UCLAMP_MIN &&
1811 attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1812 attr->sched_util_min == -1) {
1813 return true;
1814 }
1815
1816 if (clamp_id == UCLAMP_MAX &&
1817 attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1818 attr->sched_util_max == -1) {
1819 return true;
1820 }
1821
1822 return false;
1823}
1824
1825static void __setscheduler_uclamp(struct task_struct *p,
1826 const struct sched_attr *attr)
1827{
1828 enum uclamp_id clamp_id;
1829
1830 for_each_clamp_id(clamp_id) {
1831 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1832 unsigned int value;
1833
1834 if (!uclamp_reset(attr, clamp_id, uc_se))
1835 continue;
1836
1837
1838
1839
1840
1841 if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1842 value = sysctl_sched_uclamp_util_min_rt_default;
1843 else
1844 value = uclamp_none(clamp_id);
1845
1846 uclamp_se_set(uc_se, value, false);
1847
1848 }
1849
1850 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
1851 return;
1852
1853 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1854 attr->sched_util_min != -1) {
1855 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
1856 attr->sched_util_min, true);
1857 }
1858
1859 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1860 attr->sched_util_max != -1) {
1861 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
1862 attr->sched_util_max, true);
1863 }
1864}
1865
1866static void uclamp_fork(struct task_struct *p)
1867{
1868 enum uclamp_id clamp_id;
1869
1870
1871
1872
1873
1874 for_each_clamp_id(clamp_id)
1875 p->uclamp[clamp_id].active = false;
1876
1877 if (likely(!p->sched_reset_on_fork))
1878 return;
1879
1880 for_each_clamp_id(clamp_id) {
1881 uclamp_se_set(&p->uclamp_req[clamp_id],
1882 uclamp_none(clamp_id), false);
1883 }
1884}
1885
1886static void uclamp_post_fork(struct task_struct *p)
1887{
1888 uclamp_update_util_min_rt_default(p);
1889}
1890
1891static void __init init_uclamp_rq(struct rq *rq)
1892{
1893 enum uclamp_id clamp_id;
1894 struct uclamp_rq *uc_rq = rq->uclamp;
1895
1896 for_each_clamp_id(clamp_id) {
1897 uc_rq[clamp_id] = (struct uclamp_rq) {
1898 .value = uclamp_none(clamp_id)
1899 };
1900 }
1901
1902 rq->uclamp_flags = 0;
1903}
1904
1905static void __init init_uclamp(void)
1906{
1907 struct uclamp_se uc_max = {};
1908 enum uclamp_id clamp_id;
1909 int cpu;
1910
1911 for_each_possible_cpu(cpu)
1912 init_uclamp_rq(cpu_rq(cpu));
1913
1914 for_each_clamp_id(clamp_id) {
1915 uclamp_se_set(&init_task.uclamp_req[clamp_id],
1916 uclamp_none(clamp_id), false);
1917 }
1918
1919
1920 uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
1921 for_each_clamp_id(clamp_id) {
1922 uclamp_default[clamp_id] = uc_max;
1923#ifdef CONFIG_UCLAMP_TASK_GROUP
1924 root_task_group.uclamp_req[clamp_id] = uc_max;
1925 root_task_group.uclamp[clamp_id] = uc_max;
1926#endif
1927 }
1928}
1929
1930#else
1931static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
1932static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
1933static inline int uclamp_validate(struct task_struct *p,
1934 const struct sched_attr *attr)
1935{
1936 return -EOPNOTSUPP;
1937}
1938static void __setscheduler_uclamp(struct task_struct *p,
1939 const struct sched_attr *attr) { }
1940static inline void uclamp_fork(struct task_struct *p) { }
1941static inline void uclamp_post_fork(struct task_struct *p) { }
1942static inline void init_uclamp(void) { }
1943#endif
1944
1945bool sched_task_on_rq(struct task_struct *p)
1946{
1947 return task_on_rq_queued(p);
1948}
1949
1950static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1951{
1952 if (!(flags & ENQUEUE_NOCLOCK))
1953 update_rq_clock(rq);
1954
1955 if (!(flags & ENQUEUE_RESTORE)) {
1956 sched_info_enqueue(rq, p);
1957 psi_enqueue(p, flags & ENQUEUE_WAKEUP);
1958 }
1959
1960 uclamp_rq_inc(rq, p);
1961 p->sched_class->enqueue_task(rq, p, flags);
1962
1963 if (sched_core_enabled(rq))
1964 sched_core_enqueue(rq, p);
1965}
1966
1967static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1968{
1969 if (sched_core_enabled(rq))
1970 sched_core_dequeue(rq, p);
1971
1972 if (!(flags & DEQUEUE_NOCLOCK))
1973 update_rq_clock(rq);
1974
1975 if (!(flags & DEQUEUE_SAVE)) {
1976 sched_info_dequeue(rq, p);
1977 psi_dequeue(p, flags & DEQUEUE_SLEEP);
1978 }
1979
1980 uclamp_rq_dec(rq, p);
1981 p->sched_class->dequeue_task(rq, p, flags);
1982}
1983
1984void activate_task(struct rq *rq, struct task_struct *p, int flags)
1985{
1986 enqueue_task(rq, p, flags);
1987
1988 p->on_rq = TASK_ON_RQ_QUEUED;
1989}
1990
1991void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1992{
1993 p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
1994
1995 dequeue_task(rq, p, flags);
1996}
1997
1998static inline int __normal_prio(int policy, int rt_prio, int nice)
1999{
2000 int prio;
2001
2002 if (dl_policy(policy))
2003 prio = MAX_DL_PRIO - 1;
2004 else if (rt_policy(policy))
2005 prio = MAX_RT_PRIO - 1 - rt_prio;
2006 else
2007 prio = NICE_TO_PRIO(nice);
2008
2009 return prio;
2010}
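
/*
 * Examples (with MAX_RT_PRIO = 100 and MAX_DL_PRIO = 0):
 *
 *   SCHED_DEADLINE              -> -1
 *   SCHED_FIFO, rt_priority 10  -> 100 - 1 - 10 = 89
 *   SCHED_NORMAL, nice 0        -> NICE_TO_PRIO(0) = 120
 */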
2011
2012
2013
2014
2015
2016
2017
2018
2019static inline int normal_prio(struct task_struct *p)
2020{
2021 return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
2022}
2023
2024
2025
2026
2027
2028
2029
2030
2031static int effective_prio(struct task_struct *p)
2032{
2033 p->normal_prio = normal_prio(p);
2034
2035
2036
2037
2038
2039 if (!rt_prio(p->prio))
2040 return p->normal_prio;
2041 return p->prio;
2042}
2043
2044
2045
2046
2047
2048
2049
2050inline int task_curr(const struct task_struct *p)
2051{
2052 return cpu_curr(task_cpu(p)) == p;
2053}
2054
2055
2056
2057
2058
2059
2060
2061
2062static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2063 const struct sched_class *prev_class,
2064 int oldprio)
2065{
2066 if (prev_class != p->sched_class) {
2067 if (prev_class->switched_from)
2068 prev_class->switched_from(rq, p);
2069
2070 p->sched_class->switched_to(rq, p);
2071 } else if (oldprio != p->prio || dl_task(p))
2072 p->sched_class->prio_changed(rq, p, oldprio);
2073}
2074
2075void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2076{
2077 if (p->sched_class == rq->curr->sched_class)
2078 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2079 else if (p->sched_class > rq->curr->sched_class)
2080 resched_curr(rq);
2081
2082
2083
2084
2085
2086 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
2087 rq_clock_skip_update(rq);
2088}
2089
2090#ifdef CONFIG_SMP
2091
2092static void
2093__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
2094
2095static int __set_cpus_allowed_ptr(struct task_struct *p,
2096 const struct cpumask *new_mask,
2097 u32 flags);
2098
2099static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
2100{
2101 if (likely(!p->migration_disabled))
2102 return;
2103
2104 if (p->cpus_ptr != &p->cpus_mask)
2105 return;
2106
2107
2108
2109
2110 __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
2111}
2112
2113void migrate_disable(void)
2114{
2115 struct task_struct *p = current;
2116
2117 if (p->migration_disabled) {
2118 p->migration_disabled++;
2119 return;
2120 }
2121
2122 preempt_disable();
2123 this_rq()->nr_pinned++;
2124 p->migration_disabled = 1;
2125 preempt_enable();
2126}
2127EXPORT_SYMBOL_GPL(migrate_disable);
2128
2129void migrate_enable(void)
2130{
2131 struct task_struct *p = current;
2132
2133 if (p->migration_disabled > 1) {
2134 p->migration_disabled--;
2135 return;
2136 }
2137
2138
2139
2140
2141
2142 preempt_disable();
2143 if (p->cpus_ptr != &p->cpus_mask)
2144 __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
2145
2146
2147
2148
2149
2150 barrier();
2151 p->migration_disabled = 0;
2152 this_rq()->nr_pinned--;
2153 preempt_enable();
2154}
2155EXPORT_SYMBOL_GPL(migrate_enable);
2156
2157static inline bool rq_has_pinned_tasks(struct rq *rq)
2158{
2159 return rq->nr_pinned;
2160}
2161
2162
2163
2164
2165
2166static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
2167{
2168
2169 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
2170 return false;
2171
2172
2173 if (is_migration_disabled(p))
2174 return cpu_online(cpu);
2175
2176
2177 if (!(p->flags & PF_KTHREAD))
2178 return cpu_active(cpu);
2179
2180
2181 if (kthread_is_per_cpu(p))
2182 return cpu_online(cpu);
2183
2184
2185 if (cpu_dying(cpu))
2186 return false;
2187
2188
2189 return cpu_online(cpu);
2190}
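
/*
 * Summary of the checks above: the CPU must be in p->cpus_ptr; for
 * migration-disabled tasks and per-CPU kthreads it only additionally has
 * to be online, regular tasks need it to be active, and other kthreads
 * may use online CPUs as long as they are not in the process of dying.
 */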
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
2212 struct task_struct *p, int new_cpu)
2213{
2214 lockdep_assert_rq_held(rq);
2215
2216 deactivate_task(rq, p, DEQUEUE_NOCLOCK);
2217 set_task_cpu(p, new_cpu);
2218 rq_unlock(rq, rf);
2219
2220 rq = cpu_rq(new_cpu);
2221
2222 rq_lock(rq, rf);
2223 BUG_ON(task_cpu(p) != new_cpu);
2224 activate_task(rq, p, 0);
2225 check_preempt_curr(rq, p, 0);
2226
2227 return rq;
2228}
2229
2230struct migration_arg {
2231 struct task_struct *task;
2232 int dest_cpu;
2233 struct set_affinity_pending *pending;
2234};
2235
2236
2237
2238
2239
2240struct set_affinity_pending {
2241 refcount_t refs;
2242 unsigned int stop_pending;
2243 struct completion done;
2244 struct cpu_stop_work stop_work;
2245 struct migration_arg arg;
2246};
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
2258 struct task_struct *p, int dest_cpu)
2259{
2260
2261 if (!is_cpu_allowed(p, dest_cpu))
2262 return rq;
2263
2264 update_rq_clock(rq);
2265 rq = move_queued_task(rq, rf, p, dest_cpu);
2266
2267 return rq;
2268}
2269
2270
2271
2272
2273
2274
2275static int migration_cpu_stop(void *data)
2276{
2277 struct migration_arg *arg = data;
2278 struct set_affinity_pending *pending = arg->pending;
2279 struct task_struct *p = arg->task;
2280 struct rq *rq = this_rq();
2281 bool complete = false;
2282 struct rq_flags rf;
2283
2284
2285
2286
2287
2288 local_irq_save(rf.flags);
2289
2290
2291
2292
2293
2294 flush_smp_call_function_from_idle();
2295
2296 raw_spin_lock(&p->pi_lock);
2297 rq_lock(rq, &rf);
2298
2299
2300
2301
2302
2303 WARN_ON_ONCE(pending && pending != p->migration_pending);
2304
2305
2306
2307
2308
2309
2310 if (task_rq(p) == rq) {
2311 if (is_migration_disabled(p))
2312 goto out;
2313
2314 if (pending) {
2315 p->migration_pending = NULL;
2316 complete = true;
2317
2318 if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
2319 goto out;
2320 }
2321
2322 if (task_on_rq_queued(p))
2323 rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
2324 else
2325 p->wake_cpu = arg->dest_cpu;
2326
2327
2328
2329
2330
2331
2332
2333
2334 } else if (pending) {
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349 if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
2350 p->migration_pending = NULL;
2351 complete = true;
2352 goto out;
2353 }
2354
2355
2356
2357
2358
2359
2360 WARN_ON_ONCE(!pending->stop_pending);
2361 task_rq_unlock(rq, p, &rf);
2362 stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
2363 &pending->arg, &pending->stop_work);
2364 return 0;
2365 }
2366out:
2367 if (pending)
2368 pending->stop_pending = false;
2369 task_rq_unlock(rq, p, &rf);
2370
2371 if (complete)
2372 complete_all(&pending->done);
2373
2374 return 0;
2375}
2376
2377int push_cpu_stop(void *arg)
2378{
2379 struct rq *lowest_rq = NULL, *rq = this_rq();
2380 struct task_struct *p = arg;
2381
2382 raw_spin_lock_irq(&p->pi_lock);
2383 raw_spin_rq_lock(rq);
2384
2385 if (task_rq(p) != rq)
2386 goto out_unlock;
2387
2388 if (is_migration_disabled(p)) {
2389 p->migration_flags |= MDF_PUSH;
2390 goto out_unlock;
2391 }
2392
2393 p->migration_flags &= ~MDF_PUSH;
2394
2395 if (p->sched_class->find_lock_rq)
2396 lowest_rq = p->sched_class->find_lock_rq(p, rq);
2397
2398 if (!lowest_rq)
2399 goto out_unlock;
2400
2401
2402 if (task_rq(p) == rq) {
2403 deactivate_task(rq, p, 0);
2404 set_task_cpu(p, lowest_rq->cpu);
2405 activate_task(lowest_rq, p, 0);
2406 resched_curr(lowest_rq);
2407 }
2408
2409 double_unlock_balance(rq, lowest_rq);
2410
2411out_unlock:
2412 rq->push_busy = false;
2413 raw_spin_rq_unlock(rq);
2414 raw_spin_unlock_irq(&p->pi_lock);
2415
2416 put_task_struct(p);
2417 return 0;
2418}
2419
2420
2421
2422
2423
2424void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
2425{
2426 if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
2427 p->cpus_ptr = new_mask;
2428 return;
2429 }
2430
2431 cpumask_copy(&p->cpus_mask, new_mask);
2432 p->nr_cpus_allowed = cpumask_weight(new_mask);
2433}
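
/*
 * For SCA_MIGRATE_DISABLE/ENABLE only the ->cpus_ptr indirection is
 * switched (to a single-CPU mask and back to &p->cpus_mask); the
 * user-visible ->cpus_mask and ->nr_cpus_allowed stay untouched so that
 * migrate_enable() can restore the original affinity.
 */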
2434
2435static void
2436__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
2437{
2438 struct rq *rq = task_rq(p);
2439 bool queued, running;
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453 if (flags & SCA_MIGRATE_DISABLE)
2454 SCHED_WARN_ON(!p->on_cpu);
2455 else
2456 lockdep_assert_held(&p->pi_lock);
2457
2458 queued = task_on_rq_queued(p);
2459 running = task_current(rq, p);
2460
2461 if (queued) {
2462
2463
2464
2465
2466 lockdep_assert_rq_held(rq);
2467 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
2468 }
2469 if (running)
2470 put_prev_task(rq, p);
2471
2472 p->sched_class->set_cpus_allowed(p, new_mask, flags);
2473
2474 if (queued)
2475 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
2476 if (running)
2477 set_next_task(rq, p);
2478}
2479
2480void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2481{
2482 __do_set_cpus_allowed(p, new_mask, 0);
2483}
2484
2485
/*
 * affine_move_task() migrates @p off CPUs it is no longer allowed on.
 *
 * When @p is currently running (or waking), it cannot simply be moved:
 * the caller installs a struct set_affinity_pending on the task, kicks
 * migration_cpu_stop() on the task's CPU and, unless called with
 * SCA_MIGRATE_ENABLE, waits on the embedded completion.
 *
 * Concurrent callers do not stack: they reuse the already-installed
 * pending (updating ->arg.dest_cpu) and take an extra reference on it, so
 * a single stopper invocation completes all waiters at once.  The
 * refcount keeps the original caller's *on-stack* pending alive until
 * every user is done with it, which is what the final wait_var_event()
 * is for.
 */
2561static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
2562 int dest_cpu, unsigned int flags)
2563{
2564 struct set_affinity_pending my_pending = { }, *pending = NULL;
2565 bool stop_pending, complete = false;
2566
2567
2568 if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
2569 struct task_struct *push_task = NULL;
2570
2571 if ((flags & SCA_MIGRATE_ENABLE) &&
2572 (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
2573 rq->push_busy = true;
2574 push_task = get_task_struct(p);
2575 }
2576
2577
2578
2579
2580
2581 pending = p->migration_pending;
2582 if (pending && !pending->stop_pending) {
2583 p->migration_pending = NULL;
2584 complete = true;
2585 }
2586
2587 task_rq_unlock(rq, p, rf);
2588
2589 if (push_task) {
2590 stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
2591 p, &rq->push_work);
2592 }
2593
2594 if (complete)
2595 complete_all(&pending->done);
2596
2597 return 0;
2598 }
2599
2600 if (!(flags & SCA_MIGRATE_ENABLE)) {
2601
2602 if (!p->migration_pending) {
2603
2604 refcount_set(&my_pending.refs, 1);
2605 init_completion(&my_pending.done);
2606 my_pending.arg = (struct migration_arg) {
2607 .task = p,
2608 .dest_cpu = dest_cpu,
2609 .pending = &my_pending,
2610 };
2611
2612 p->migration_pending = &my_pending;
2613 } else {
2614 pending = p->migration_pending;
2615 refcount_inc(&pending->refs);
2616
2617
2618
2619
2620
2621
2622
2623
2624 pending->arg.dest_cpu = dest_cpu;
2625 }
2626 }
2627 pending = p->migration_pending;
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640 if (WARN_ON_ONCE(!pending)) {
2641 task_rq_unlock(rq, p, rf);
2642 return -EINVAL;
2643 }
2644
2645 if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
2646
2647
2648
2649
2650
2651 stop_pending = pending->stop_pending;
2652 if (!stop_pending)
2653 pending->stop_pending = true;
2654
2655 if (flags & SCA_MIGRATE_ENABLE)
2656 p->migration_flags &= ~MDF_PUSH;
2657
2658 task_rq_unlock(rq, p, rf);
2659
2660 if (!stop_pending) {
2661 stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
2662 &pending->arg, &pending->stop_work);
2663 }
2664
2665 if (flags & SCA_MIGRATE_ENABLE)
2666 return 0;
2667 } else {
2668
2669 if (!is_migration_disabled(p)) {
2670 if (task_on_rq_queued(p))
2671 rq = move_queued_task(rq, rf, p, dest_cpu);
2672
2673 if (!pending->stop_pending) {
2674 p->migration_pending = NULL;
2675 complete = true;
2676 }
2677 }
2678 task_rq_unlock(rq, p, rf);
2679
2680 if (complete)
2681 complete_all(&pending->done);
2682 }
2683
2684 wait_for_completion(&pending->done);
2685
2686 if (refcount_dec_and_test(&pending->refs))
2687 wake_up_var(&pending->refs);
2688
2689
2690
2691
2692
2693 wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
2694
2695
2696 WARN_ON_ONCE(my_pending.stop_pending);
2697
2698 return 0;
2699}
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710static int __set_cpus_allowed_ptr(struct task_struct *p,
2711 const struct cpumask *new_mask,
2712 u32 flags)
2713{
2714 const struct cpumask *cpu_valid_mask = cpu_active_mask;
2715 unsigned int dest_cpu;
2716 struct rq_flags rf;
2717 struct rq *rq;
2718 int ret = 0;
2719
2720 rq = task_rq_lock(p, &rf);
2721 update_rq_clock(rq);
2722
2723 if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734 cpu_valid_mask = cpu_online_mask;
2735 }
2736
2737
2738
2739
2740
2741 if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
2742 ret = -EINVAL;
2743 goto out;
2744 }
2745
2746 if (!(flags & SCA_MIGRATE_ENABLE)) {
2747 if (cpumask_equal(&p->cpus_mask, new_mask))
2748 goto out;
2749
2750 if (WARN_ON_ONCE(p == current &&
2751 is_migration_disabled(p) &&
2752 !cpumask_test_cpu(task_cpu(p), new_mask))) {
2753 ret = -EBUSY;
2754 goto out;
2755 }
2756 }
2757
2758
2759
2760
2761
2762
2763 dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
2764 if (dest_cpu >= nr_cpu_ids) {
2765 ret = -EINVAL;
2766 goto out;
2767 }
2768
2769 __do_set_cpus_allowed(p, new_mask, flags);
2770
2771 return affine_move_task(rq, p, &rf, dest_cpu, flags);
2772
2773out:
2774 task_rq_unlock(rq, p, &rf);
2775
2776 return ret;
2777}
2778
2779int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
2780{
2781 return __set_cpus_allowed_ptr(p, new_mask, 0);
2782}
2783EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
2784
2785void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2786{
2787#ifdef CONFIG_SCHED_DEBUG
2788 unsigned int state = READ_ONCE(p->__state);
2789
2790
2791
2792
2793
2794 WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
2795
2796
2797
2798
2799
2800
2801 WARN_ON_ONCE(state == TASK_RUNNING &&
2802 p->sched_class == &fair_sched_class &&
2803 (p->on_rq && !task_on_rq_migrating(p)));
2804
2805#ifdef CONFIG_LOCKDEP
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2817 lockdep_is_held(__rq_lockp(task_rq(p)))));
2818#endif
2819
2820
2821
2822 WARN_ON_ONCE(!cpu_online(new_cpu));
2823
2824 WARN_ON_ONCE(is_migration_disabled(p));
2825#endif
2826
2827 trace_sched_migrate_task(p, new_cpu);
2828
2829 if (task_cpu(p) != new_cpu) {
2830 if (p->sched_class->migrate_task_rq)
2831 p->sched_class->migrate_task_rq(p, new_cpu);
2832 p->se.nr_migrations++;
2833 rseq_migrate(p);
2834 perf_event_task_migrate(p);
2835 }
2836
2837 __set_task_cpu(p, new_cpu);
2838}
2839
2840#ifdef CONFIG_NUMA_BALANCING
2841static void __migrate_swap_task(struct task_struct *p, int cpu)
2842{
2843 if (task_on_rq_queued(p)) {
2844 struct rq *src_rq, *dst_rq;
2845 struct rq_flags srf, drf;
2846
2847 src_rq = task_rq(p);
2848 dst_rq = cpu_rq(cpu);
2849
2850 rq_pin_lock(src_rq, &srf);
2851 rq_pin_lock(dst_rq, &drf);
2852
2853 deactivate_task(src_rq, p, 0);
2854 set_task_cpu(p, cpu);
2855 activate_task(dst_rq, p, 0);
2856 check_preempt_curr(dst_rq, p, 0);
2857
2858 rq_unpin_lock(dst_rq, &drf);
2859 rq_unpin_lock(src_rq, &srf);
2860
2861 } else {
2862
2863
2864
2865
2866
2867 p->wake_cpu = cpu;
2868 }
2869}
2870
2871struct migration_swap_arg {
2872 struct task_struct *src_task, *dst_task;
2873 int src_cpu, dst_cpu;
2874};
2875
2876static int migrate_swap_stop(void *data)
2877{
2878 struct migration_swap_arg *arg = data;
2879 struct rq *src_rq, *dst_rq;
2880 int ret = -EAGAIN;
2881
2882 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
2883 return -EAGAIN;
2884
2885 src_rq = cpu_rq(arg->src_cpu);
2886 dst_rq = cpu_rq(arg->dst_cpu);
2887
2888 double_raw_lock(&arg->src_task->pi_lock,
2889 &arg->dst_task->pi_lock);
2890 double_rq_lock(src_rq, dst_rq);
2891
2892 if (task_cpu(arg->dst_task) != arg->dst_cpu)
2893 goto unlock;
2894
2895 if (task_cpu(arg->src_task) != arg->src_cpu)
2896 goto unlock;
2897
2898 if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
2899 goto unlock;
2900
2901 if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
2902 goto unlock;
2903
2904 __migrate_swap_task(arg->src_task, arg->dst_cpu);
2905 __migrate_swap_task(arg->dst_task, arg->src_cpu);
2906
2907 ret = 0;
2908
2909unlock:
2910 double_rq_unlock(src_rq, dst_rq);
2911 raw_spin_unlock(&arg->dst_task->pi_lock);
2912 raw_spin_unlock(&arg->src_task->pi_lock);
2913
2914 return ret;
2915}
2916
2917
2918
2919
2920int migrate_swap(struct task_struct *cur, struct task_struct *p,
2921 int target_cpu, int curr_cpu)
2922{
2923 struct migration_swap_arg arg;
2924 int ret = -EINVAL;
2925
2926 arg = (struct migration_swap_arg){
2927 .src_task = cur,
2928 .src_cpu = curr_cpu,
2929 .dst_task = p,
2930 .dst_cpu = target_cpu,
2931 };
2932
2933 if (arg.src_cpu == arg.dst_cpu)
2934 goto out;
2935
2936
2937
2938
2939
2940 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
2941 goto out;
2942
2943 if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
2944 goto out;
2945
2946 if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
2947 goto out;
2948
2949 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
2950 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
2951
2952out:
2953 return ret;
2954}
2955#endif
2956
/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
 * not expected to change.  If it changes, i.e. @p might have woken up,
 * then return zero.  When we succeed in waiting for @p to be off its CPU,
 * we return a positive number (its total switch count).  If a second call
 * a short while later returns the same number, the caller can be sure that
 * @p has remained unscheduled the whole time.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a very long time.  This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
2973unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
2974{
2975 int running, queued;
2976 struct rq_flags rf;
2977 unsigned long ncsw;
2978 struct rq *rq;
2979
2980 for (;;) {
2981
2982
2983
2984
2985
2986
2987 rq = task_rq(p);
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000 while (task_running(rq, p)) {
3001 if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
3002 return 0;
3003 cpu_relax();
3004 }
3005
3006
3007
3008
3009
3010
3011 rq = task_rq_lock(p, &rf);
3012 trace_sched_wait_task(p);
3013 running = task_running(rq, p);
3014 queued = task_on_rq_queued(p);
3015 ncsw = 0;
3016 if (!match_state || READ_ONCE(p->__state) == match_state)
3017 ncsw = p->nvcsw | LONG_MIN;
3018 task_rq_unlock(rq, p, &rf);
3019
3020
3021
3022
3023 if (unlikely(!ncsw))
3024 break;
3025
3026
3027
3028
3029
3030
3031
3032 if (unlikely(running)) {
3033 cpu_relax();
3034 continue;
3035 }
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046 if (unlikely(queued)) {
3047 ktime_t to = NSEC_PER_SEC / HZ;
3048
3049 set_current_state(TASK_UNINTERRUPTIBLE);
3050 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
3051 continue;
3052 }
3053
3054
3055
3056
3057
3058
3059 break;
3060 }
3061
3062 return ncsw;
3063}
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078void kick_process(struct task_struct *p)
3079{
3080 int cpu;
3081
3082 preempt_disable();
3083 cpu = task_cpu(p);
3084 if ((cpu != smp_processor_id()) && task_curr(p))
3085 smp_send_reschedule(cpu);
3086 preempt_enable();
3087}
3088EXPORT_SYMBOL_GPL(kick_process);
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
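/*
 * select_fallback_rq - find a usable CPU for @p when its chosen CPU is no
 * longer available.  Prefer active CPUs on the task's NUMA node, then any
 * allowed CPU; failing that, widen the affinity via the cpuset fallback
 * and finally cpu_possible_mask.  A rate-limited message is printed when
 * a user task loses its original affinity this way.
 */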
3112static int select_fallback_rq(int cpu, struct task_struct *p)
3113{
3114 int nid = cpu_to_node(cpu);
3115 const struct cpumask *nodemask = NULL;
3116 enum { cpuset, possible, fail } state = cpuset;
3117 int dest_cpu;
3118
3119
3120
3121
3122
3123
3124 if (nid != -1) {
3125 nodemask = cpumask_of_node(nid);
3126
3127
3128 for_each_cpu(dest_cpu, nodemask) {
3129 if (!cpu_active(dest_cpu))
3130 continue;
3131 if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
3132 return dest_cpu;
3133 }
3134 }
3135
3136 for (;;) {
3137
3138 for_each_cpu(dest_cpu, p->cpus_ptr) {
3139 if (!is_cpu_allowed(p, dest_cpu))
3140 continue;
3141
3142 goto out;
3143 }
3144
3145
3146 switch (state) {
3147 case cpuset:
3148 if (IS_ENABLED(CONFIG_CPUSETS)) {
3149 cpuset_cpus_allowed_fallback(p);
3150 state = possible;
3151 break;
3152 }
3153 fallthrough;
3154 case possible:
3155
3156
3157
3158
3159
3160
3161 do_set_cpus_allowed(p, cpu_possible_mask);
3162 state = fail;
3163 break;
3164
3165 case fail:
3166 BUG();
3167 break;
3168 }
3169 }
3170
3171out:
3172 if (state != cpuset) {
3173
3174
3175
3176
3177
3178 if (p->mm && printk_ratelimit()) {
3179 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
3180 task_pid_nr(p), p->comm, cpu);
3181 }
3182 }
3183
3184 return dest_cpu;
3185}
3186
3187
3188
3189
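/*
 * select_task_rq - let the task's scheduling class pick a wakeup CPU, or
 * take any allowed CPU when migration is disabled or only one CPU is
 * permitted.  The result is re-checked with is_cpu_allowed(); if it is
 * not usable, select_fallback_rq() picks a valid CPU instead.
 */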
3190static inline
3191int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
3192{
3193 lockdep_assert_held(&p->pi_lock);
3194
3195 if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
3196 cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
3197 else
3198 cpu = cpumask_any(p->cpus_ptr);
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210 if (unlikely(!is_cpu_allowed(p, cpu)))
3211 cpu = select_fallback_rq(task_cpu(p), p);
3212
3213 return cpu;
3214}
3215
3216void sched_set_stop_task(int cpu, struct task_struct *stop)
3217{
3218 static struct lock_class_key stop_pi_lock;
3219 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
3220 struct task_struct *old_stop = cpu_rq(cpu)->stop;
3221
3222 if (stop) {
3223
3224
3225
3226
3227
3228
3229
3230
	sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
3232
3233 stop->sched_class = &stop_sched_class;
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247 lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
3248 }
3249
3250 cpu_rq(cpu)->stop = stop;
3251
3252 if (old_stop) {
3253
3254
3255
3256
3257 old_stop->sched_class = &rt_sched_class;
3258 }
3259}
3260
3261#else
3262
3263static inline int __set_cpus_allowed_ptr(struct task_struct *p,
3264 const struct cpumask *new_mask,
3265 u32 flags)
3266{
3267 return set_cpus_allowed_ptr(p, new_mask);
3268}
3269
3270static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
3271
3272static inline bool rq_has_pinned_tasks(struct rq *rq)
3273{
3274 return false;
3275}
3276
3277#endif
3278
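/*
 * Account wakeup schedstats for @p being woken towards @cpu: local vs.
 * remote (attributed to the spanning sched_domain), migrated and sync
 * wakeups.  No-op unless schedstats are enabled.
 */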
3279static void
3280ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
3281{
3282 struct rq *rq;
3283
3284 if (!schedstat_enabled())
3285 return;
3286
3287 rq = this_rq();
3288
3289#ifdef CONFIG_SMP
3290 if (cpu == rq->cpu) {
3291 __schedstat_inc(rq->ttwu_local);
3292 __schedstat_inc(p->se.statistics.nr_wakeups_local);
3293 } else {
3294 struct sched_domain *sd;
3295
3296 __schedstat_inc(p->se.statistics.nr_wakeups_remote);
3297 rcu_read_lock();
3298 for_each_domain(rq->cpu, sd) {
3299 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
3300 __schedstat_inc(sd->ttwu_wake_remote);
3301 break;
3302 }
3303 }
3304 rcu_read_unlock();
3305 }
3306
3307 if (wake_flags & WF_MIGRATED)
3308 __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
3309#endif
3310
3311 __schedstat_inc(rq->ttwu_count);
3312 __schedstat_inc(p->se.statistics.nr_wakeups);
3313
3314 if (wake_flags & WF_SYNC)
3315 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
3316}
3317
3318
3319
3320
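/*
 * Mark the woken task runnable: check whether it should preempt the
 * current task, set TASK_RUNNING, invoke the class's task_woken() hook
 * and fold the runqueue's idle time into its avg_idle statistics.
 */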
3321static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
3322 struct rq_flags *rf)
3323{
3324 check_preempt_curr(rq, p, wake_flags);
3325 WRITE_ONCE(p->__state, TASK_RUNNING);
3326 trace_sched_wakeup(p);
3327
3328#ifdef CONFIG_SMP
3329 if (p->sched_class->task_woken) {
3330
3331
3332
3333
3334 rq_unpin_lock(rq, rf);
3335 p->sched_class->task_woken(rq, p);
3336 rq_repin_lock(rq, rf);
3337 }
3338
3339 if (rq->idle_stamp) {
3340 u64 delta = rq_clock(rq) - rq->idle_stamp;
3341 u64 max = 2*rq->max_idle_balance_cost;
3342
3343 update_avg(&rq->avg_idle, delta);
3344
3345 if (rq->avg_idle > max)
3346 rq->avg_idle = max;
3347
3348 rq->wake_stamp = jiffies;
3349 rq->wake_avg_idle = rq->avg_idle / 2;
3350
3351 rq->idle_stamp = 0;
3352 }
3353#endif
3354}
3355
3356static void
3357ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
3358 struct rq_flags *rf)
3359{
3360 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
3361
3362 lockdep_assert_rq_held(rq);
3363
3364 if (p->sched_contributes_to_load)
3365 rq->nr_uninterruptible--;
3366
3367#ifdef CONFIG_SMP
3368 if (wake_flags & WF_MIGRATED)
3369 en_flags |= ENQUEUE_MIGRATED;
3370 else
3371#endif
3372 if (p->in_iowait) {
3373 delayacct_blkio_end(p);
3374 atomic_dec(&task_rq(p)->nr_iowait);
3375 }
3376
3377 activate_task(rq, p, en_flags);
3378 ttwu_do_wakeup(rq, p, wake_flags, rf);
3379}
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
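/*
 * ttwu_runnable - handle a wakeup of a task that is still on its runqueue
 * (it never fully descheduled).  Such a task only needs ttwu_do_wakeup()
 * to become TASK_RUNNING again; returns 1 in that case, 0 if the caller
 * must do a full activation.
 */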
3406static int ttwu_runnable(struct task_struct *p, int wake_flags)
3407{
3408 struct rq_flags rf;
3409 struct rq *rq;
3410 int ret = 0;
3411
3412 rq = __task_rq_lock(p, &rf);
3413 if (task_on_rq_queued(p)) {
3414
3415 update_rq_clock(rq);
3416 ttwu_do_wakeup(rq, p, wake_flags, &rf);
3417 ret = 1;
3418 }
3419 __task_rq_unlock(rq, &rf);
3420
3421 return ret;
3422}
3423
3424#ifdef CONFIG_SMP
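/*
 * sched_ttwu_pending - process this CPU's remote-wakeup list: clear
 * rq->ttwu_pending, then activate every task queued on the llist onto the
 * local runqueue.
 */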
3425void sched_ttwu_pending(void *arg)
3426{
3427 struct llist_node *llist = arg;
3428 struct rq *rq = this_rq();
3429 struct task_struct *p, *t;
3430 struct rq_flags rf;
3431
3432 if (!llist)
3433 return;
3434
3435
3436
3437
3438
3439
3440 WRITE_ONCE(rq->ttwu_pending, 0);
3441
3442 rq_lock_irqsave(rq, &rf);
3443 update_rq_clock(rq);
3444
3445 llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
3446 if (WARN_ON_ONCE(p->on_cpu))
3447 smp_cond_load_acquire(&p->on_cpu, !VAL);
3448
3449 if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
3450 set_task_cpu(p, cpu_of(rq));
3451
3452 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
3453 }
3454
3455 rq_unlock_irqrestore(rq, &rf);
3456}
3457
3458void send_call_function_single_ipi(int cpu)
3459{
3460 struct rq *rq = cpu_rq(cpu);
3461
3462 if (!set_nr_if_polling(rq->idle))
3463 arch_send_call_function_single_ipi(cpu);
3464 else
3465 trace_sched_wake_idle_without_ipi(cpu);
3466}
3467
3468
3469
3470
3471
3472
3473
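/*
 * Queue @p on @cpu's wakeup llist and mark rq->ttwu_pending; the target
 * CPU completes the wakeup from sched_ttwu_pending().
 */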
3474static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3475{
3476 struct rq *rq = cpu_rq(cpu);
3477
3478 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
3479
3480 WRITE_ONCE(rq->ttwu_pending, 1);
3481 __smp_call_single_queue(cpu, &p->wake_entry.llist);
3482}
3483
3484void wake_up_if_idle(int cpu)
3485{
3486 struct rq *rq = cpu_rq(cpu);
3487 struct rq_flags rf;
3488
3489 rcu_read_lock();
3490
3491 if (!is_idle_task(rcu_dereference(rq->curr)))
3492 goto out;
3493
3494 if (set_nr_if_polling(rq->idle)) {
3495 trace_sched_wake_idle_without_ipi(cpu);
3496 } else {
3497 rq_lock_irqsave(rq, &rf);
3498 if (is_idle_task(rq->curr))
3499 smp_send_reschedule(cpu);
3500
3501 rq_unlock_irqrestore(rq, &rf);
3502 }
3503
3504out:
3505 rcu_read_unlock();
3506}
3507
3508bool cpus_share_cache(int this_cpu, int that_cpu)
3509{
3510 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
3511}
3512
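/*
 * Decide whether a wakeup should be queued to the target CPU rather than
 * performed locally: only for active CPUs, and only when the waker and
 * wakee do not share a cache, or the wakee is still on_cpu and its
 * runqueue is (nearly) idle.
 */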
3513static inline bool ttwu_queue_cond(int cpu, int wake_flags)
3514{
3515
3516
3517
3518
3519 if (!cpu_active(cpu))
3520 return false;
3521
3522
3523
3524
3525
3526 if (!cpus_share_cache(smp_processor_id(), cpu))
3527 return true;
3528
3529
3530
3531
3532
3533
3534
3535 if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
3536 return true;
3537
3538 return false;
3539}
3540
3541static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3542{
3543 if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
3544 if (WARN_ON_ONCE(cpu == smp_processor_id()))
3545 return false;
3546
3547 sched_clock_cpu(cpu);
3548 __ttwu_queue_wakelist(p, cpu, wake_flags);
3549 return true;
3550 }
3551
3552 return false;
3553}
3554
3555#else
3556
3557static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3558{
3559 return false;
3560}
3561
3562#endif
3563
3564static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
3565{
3566 struct rq *rq = cpu_rq(cpu);
3567 struct rq_flags rf;
3568
3569 if (ttwu_queue_wakelist(p, cpu, wake_flags))
3570 return;
3571
3572 rq_lock(rq, &rf);
3573 update_rq_clock(rq);
3574 ttwu_do_activate(rq, p, wake_flags, &rf);
3575 rq_unlock(rq, &rf);
3576}
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
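/*
 * try_to_wake_up - wake up a thread.
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Conceptually: if (p->state & state) p->state = TASK_RUNNING.
 *
 * Handles, in order: the task waking itself, a task that is still on its
 * runqueue (ttwu_runnable()), a task that is still running on its old CPU
 * (queued remotely via the wakelist), and finally the full path of
 * selecting a runqueue and activating the task there.
 *
 * Returns 1 if a wakeup was performed, 0 if the task's state did not
 * match @state.
 */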
3698static int
3699try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
3700{
3701 unsigned long flags;
3702 int cpu, success = 0;
3703
3704 preempt_disable();
3705 if (p == current) {
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717 if (!(READ_ONCE(p->__state) & state))
3718 goto out;
3719
3720 success = 1;
3721 trace_sched_waking(p);
3722 WRITE_ONCE(p->__state, TASK_RUNNING);
3723 trace_sched_wakeup(p);
3724 goto out;
3725 }
3726
3727
3728
3729
3730
3731
3732
3733 raw_spin_lock_irqsave(&p->pi_lock, flags);
3734 smp_mb__after_spinlock();
3735 if (!(READ_ONCE(p->__state) & state))
3736 goto unlock;
3737
3738 trace_sched_waking(p);
3739
3740
3741 success = 1;
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765 smp_rmb();
3766 if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
3767 goto unlock;
3768
3769#ifdef CONFIG_SMP
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793 smp_acquire__after_ctrl_dep();
3794
3795
3796
3797
3798
3799
3800
3801 WRITE_ONCE(p->__state, TASK_WAKING);
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822 if (smp_load_acquire(&p->on_cpu) &&
3823 ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
3824 goto unlock;
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
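	/*
	 * Wait until the task has finished switching out on its old CPU;
	 * pairs with the smp_store_release() of ->on_cpu in finish_task().
	 */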
3835 smp_cond_load_acquire(&p->on_cpu, !VAL);
3836
3837 cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
3838 if (task_cpu(p) != cpu) {
3839 if (p->in_iowait) {
3840 delayacct_blkio_end(p);
3841 atomic_dec(&task_rq(p)->nr_iowait);
3842 }
3843
3844 wake_flags |= WF_MIGRATED;
3845 psi_ttwu_dequeue(p);
3846 set_task_cpu(p, cpu);
3847 }
3848#else
3849 cpu = task_cpu(p);
3850#endif
3851
3852 ttwu_queue(p, cpu, wake_flags);
3853unlock:
3854 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3855out:
3856 if (success)
3857 ttwu_stat(p, task_cpu(p), wake_flags);
3858 preempt_enable();
3859
3860 return success;
3861}
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
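/*
 * try_invoke_on_locked_down_task - invoke @func on @p only if the task
 * can be pinned: with its runqueue locked when it is queued, or under
 * p->pi_lock after re-checking !p->on_rq for a blocked task.  Returns the
 * callback's result, or false if the task could not be pinned.
 */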
3881bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
3882{
3883 struct rq_flags rf;
3884 bool ret = false;
3885 struct rq *rq;
3886
3887 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3888 if (p->on_rq) {
3889 rq = __task_rq_lock(p, &rf);
3890 if (task_rq(p) == rq)
3891 ret = func(p, arg);
3892 rq_unlock(rq, &rf);
3893 } else {
3894 switch (READ_ONCE(p->__state)) {
3895 case TASK_RUNNING:
3896 case TASK_WAKING:
3897 break;
3898 default:
3899 smp_rmb();
3900 if (!p->on_rq)
3901 ret = func(p, arg);
3902 }
3903 }
3904 raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
3905 return ret;
3906}
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
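/*
 * wake_up_process - wake up a specific process from any TASK_NORMAL
 * (interruptible or uninterruptible) sleep.  Returns non-zero if a wakeup
 * was performed, 0 if the task was not in a wakeable state.
 */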
3919int wake_up_process(struct task_struct *p)
3920{
3921 return try_to_wake_up(p, TASK_NORMAL, 0);
3922}
3923EXPORT_SYMBOL(wake_up_process);
3924
3925int wake_up_state(struct task_struct *p, unsigned int state)
3926{
3927 return try_to_wake_up(p, state, 0);
3928}
3929
3930
3931
3932
3933
3934
3935
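/*
 * __sched_fork - basic scheduler setup for a newly forked task: reset the
 * sched_entity, deadline and RT run-state fields as well as the per-task
 * wakeup and migration bookkeeping.
 */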
3936static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
3937{
3938 p->on_rq = 0;
3939
3940 p->se.on_rq = 0;
3941 p->se.exec_start = 0;
3942 p->se.sum_exec_runtime = 0;
3943 p->se.prev_sum_exec_runtime = 0;
3944 p->se.nr_migrations = 0;
3945 p->se.vruntime = 0;
3946 INIT_LIST_HEAD(&p->se.group_node);
3947
3948#ifdef CONFIG_FAIR_GROUP_SCHED
3949 p->se.cfs_rq = NULL;
3950#endif
3951
3952#ifdef CONFIG_SCHEDSTATS
3953
3954 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
3955#endif
3956
3957 RB_CLEAR_NODE(&p->dl.rb_node);
3958 init_dl_task_timer(&p->dl);
3959 init_dl_inactive_task_timer(&p->dl);
3960 __dl_clear_params(p);
3961
3962 INIT_LIST_HEAD(&p->rt.run_list);
3963 p->rt.timeout = 0;
3964 p->rt.time_slice = sched_rr_timeslice;
3965 p->rt.on_rq = 0;
3966 p->rt.on_list = 0;
3967
3968#ifdef CONFIG_PREEMPT_NOTIFIERS
3969 INIT_HLIST_HEAD(&p->preempt_notifiers);
3970#endif
3971
3972#ifdef CONFIG_COMPACTION
3973 p->capture_control = NULL;
3974#endif
3975 init_numa_balancing(clone_flags, p);
3976#ifdef CONFIG_SMP
3977 p->wake_entry.u_flags = CSD_TYPE_TTWU;
3978 p->migration_pending = NULL;
3979#endif
3980}
3981
3982DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
3983
3984#ifdef CONFIG_NUMA_BALANCING
3985
3986void set_numabalancing_state(bool enabled)
3987{
3988 if (enabled)
3989 static_branch_enable(&sched_numa_balancing);
3990 else
3991 static_branch_disable(&sched_numa_balancing);
3992}
3993
3994#ifdef CONFIG_PROC_SYSCTL
3995int sysctl_numa_balancing(struct ctl_table *table, int write,
3996 void *buffer, size_t *lenp, loff_t *ppos)
3997{
3998 struct ctl_table t;
3999 int err;
4000 int state = static_branch_likely(&sched_numa_balancing);
4001
4002 if (write && !capable(CAP_SYS_ADMIN))
4003 return -EPERM;
4004
4005 t = *table;
4006 t.data = &state;
4007 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
4008 if (err < 0)
4009 return err;
4010 if (write)
4011 set_numabalancing_state(state);
4012 return err;
4013}
4014#endif
4015#endif
4016
4017#ifdef CONFIG_SCHEDSTATS
4018
4019DEFINE_STATIC_KEY_FALSE(sched_schedstats);
4020
4021static void set_schedstats(bool enabled)
4022{
4023 if (enabled)
4024 static_branch_enable(&sched_schedstats);
4025 else
4026 static_branch_disable(&sched_schedstats);
4027}
4028
4029void force_schedstat_enabled(void)
4030{
4031 if (!schedstat_enabled()) {
4032 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
4033 static_branch_enable(&sched_schedstats);
4034 }
4035}
4036
4037static int __init setup_schedstats(char *str)
4038{
4039 int ret = 0;
4040 if (!str)
4041 goto out;
4042
4043 if (!strcmp(str, "enable")) {
4044 set_schedstats(true);
4045 ret = 1;
4046 } else if (!strcmp(str, "disable")) {
4047 set_schedstats(false);
4048 ret = 1;
4049 }
4050out:
4051 if (!ret)
4052 pr_warn("Unable to parse schedstats=\n");
4053
4054 return ret;
4055}
4056__setup("schedstats=", setup_schedstats);
4057
4058#ifdef CONFIG_PROC_SYSCTL
4059int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
4060 size_t *lenp, loff_t *ppos)
4061{
4062 struct ctl_table t;
4063 int err;
4064 int state = static_branch_likely(&sched_schedstats);
4065
4066 if (write && !capable(CAP_SYS_ADMIN))
4067 return -EPERM;
4068
4069 t = *table;
4070 t.data = &state;
4071 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
4072 if (err < 0)
4073 return err;
4074 if (write)
4075 set_schedstats(state);
4076 return err;
4077}
4078#endif
4079#endif
4080
4081
4082
4083
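/*
 * sched_fork - fork()/clone()-time scheduler setup: mark the child
 * TASK_NEW, give it the parent's normal priority, optionally reset its
 * policy when sched_reset_on_fork is set, pick its scheduling class and
 * assign its first CPU.  Deadline tasks cannot be forked; -EAGAIN is
 * returned for them.
 */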
4084int sched_fork(unsigned long clone_flags, struct task_struct *p)
4085{
4086 unsigned long flags;
4087
4088 __sched_fork(clone_flags, p);
4089
4090
4091
4092
4093
4094 p->__state = TASK_NEW;
4095
4096
4097
4098
4099 p->prio = current->normal_prio;
4100
4101 uclamp_fork(p);
4102
4103
4104
4105
4106 if (unlikely(p->sched_reset_on_fork)) {
4107 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
4108 p->policy = SCHED_NORMAL;
4109 p->static_prio = NICE_TO_PRIO(0);
4110 p->rt_priority = 0;
4111 } else if (PRIO_TO_NICE(p->static_prio) < 0)
4112 p->static_prio = NICE_TO_PRIO(0);
4113
4114 p->prio = p->normal_prio = p->static_prio;
4115 set_load_weight(p, false);
4116
4117
4118
4119
4120
4121 p->sched_reset_on_fork = 0;
4122 }
4123
4124 if (dl_prio(p->prio))
4125 return -EAGAIN;
4126 else if (rt_prio(p->prio))
4127 p->sched_class = &rt_sched_class;
4128 else
4129 p->sched_class = &fair_sched_class;
4130
4131 init_entity_runnable_average(&p->se);
4132
4133
4134
4135
4136
4137
4138
4139
4140 raw_spin_lock_irqsave(&p->pi_lock, flags);
4141 rseq_migrate(p);
4142
4143
4144
4145
4146 __set_task_cpu(p, smp_processor_id());
4147 if (p->sched_class->task_fork)
4148 p->sched_class->task_fork(p);
4149 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4150
4151#ifdef CONFIG_SCHED_INFO
4152 if (likely(sched_info_on()))
4153 memset(&p->sched_info, 0, sizeof(p->sched_info));
4154#endif
4155#if defined(CONFIG_SMP)
4156 p->on_cpu = 0;
4157#endif
4158 init_task_preempt_count(p);
4159#ifdef CONFIG_SMP
4160 plist_node_init(&p->pushable_tasks, MAX_PRIO);
4161 RB_CLEAR_NODE(&p->pushable_dl_tasks);
4162#endif
4163 return 0;
4164}
4165
4166void sched_post_fork(struct task_struct *p)
4167{
4168 uclamp_post_fork(p);
4169}
4170
4171unsigned long to_ratio(u64 period, u64 runtime)
4172{
4173 if (runtime == RUNTIME_INF)
4174 return BW_UNIT;
4175
4176
4177
4178
4179
4180
4181 if (period == 0)
4182 return 0;
4183
4184 return div64_u64(runtime << BW_SHIFT, period);
4185}
4186
4187
4188
4189
4190
4191
4192
4193
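/*
 * wake_up_new_task - first wakeup of a freshly forked task: select its
 * first runqueue, finish the load-average initialization, activate it and
 * check whether it should preempt the currently running task.
 */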
4194void wake_up_new_task(struct task_struct *p)
4195{
4196 struct rq_flags rf;
4197 struct rq *rq;
4198
4199 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
4200 WRITE_ONCE(p->__state, TASK_RUNNING);
4201#ifdef CONFIG_SMP
4202
4203
4204
4205
4206
4207
4208
4209
4210 p->recent_used_cpu = task_cpu(p);
4211 rseq_migrate(p);
4212 __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
4213#endif
4214 rq = __task_rq_lock(p, &rf);
4215 update_rq_clock(rq);
4216 post_init_entity_util_avg(p);
4217
4218 activate_task(rq, p, ENQUEUE_NOCLOCK);
4219 trace_sched_wakeup_new(p);
4220 check_preempt_curr(rq, p, WF_FORK);
4221#ifdef CONFIG_SMP
4222 if (p->sched_class->task_woken) {
4223
4224
4225
4226
4227 rq_unpin_lock(rq, &rf);
4228 p->sched_class->task_woken(rq, p);
4229 rq_repin_lock(rq, &rf);
4230 }
4231#endif
4232 task_rq_unlock(rq, p, &rf);
4233}
4234
4235#ifdef CONFIG_PREEMPT_NOTIFIERS
4236
4237static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
4238
4239void preempt_notifier_inc(void)
4240{
4241 static_branch_inc(&preempt_notifier_key);
4242}
4243EXPORT_SYMBOL_GPL(preempt_notifier_inc);
4244
4245void preempt_notifier_dec(void)
4246{
4247 static_branch_dec(&preempt_notifier_key);
4248}
4249EXPORT_SYMBOL_GPL(preempt_notifier_dec);
4250
4251
4252
4253
4254
4255void preempt_notifier_register(struct preempt_notifier *notifier)
4256{
4257 if (!static_branch_unlikely(&preempt_notifier_key))
4258 WARN(1, "registering preempt_notifier while notifiers disabled\n");
4259
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
4261}
4262EXPORT_SYMBOL_GPL(preempt_notifier_register);
4263
4264
4265
4266
4267
4268
4269
4270void preempt_notifier_unregister(struct preempt_notifier *notifier)
4271{
	hlist_del(&notifier->link);
4273}
4274EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
4275
4276static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
4277{
4278 struct preempt_notifier *notifier;
4279
4280 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
4281 notifier->ops->sched_in(notifier, raw_smp_processor_id());
4282}
4283
4284static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
4285{
4286 if (static_branch_unlikely(&preempt_notifier_key))
4287 __fire_sched_in_preempt_notifiers(curr);
4288}
4289
4290static void
4291__fire_sched_out_preempt_notifiers(struct task_struct *curr,
4292 struct task_struct *next)
4293{
4294 struct preempt_notifier *notifier;
4295
4296 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
4297 notifier->ops->sched_out(notifier, next);
4298}
4299
4300static __always_inline void
4301fire_sched_out_preempt_notifiers(struct task_struct *curr,
4302 struct task_struct *next)
4303{
4304 if (static_branch_unlikely(&preempt_notifier_key))
4305 __fire_sched_out_preempt_notifiers(curr, next);
4306}
4307
4308#else
4309
4310static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
4311{
4312}
4313
4314static inline void
4315fire_sched_out_preempt_notifiers(struct task_struct *curr,
4316 struct task_struct *next)
4317{
4318}
4319
4320#endif
4321
4322static inline void prepare_task(struct task_struct *next)
4323{
4324#ifdef CONFIG_SMP
4325
4326
4327
4328
4329
4330
4331 WRITE_ONCE(next->on_cpu, 1);
4332#endif
4333}
4334
4335static inline void finish_task(struct task_struct *prev)
4336{
4337#ifdef CONFIG_SMP
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
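	/*
	 * Release ->on_cpu: once this store is visible the task can be
	 * scheduled on another CPU.  Pairs with the smp_cond_load_acquire()
	 * on ->on_cpu in try_to_wake_up().
	 */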
4349 smp_store_release(&prev->on_cpu, 0);
4350#endif
4351}
4352
4353#ifdef CONFIG_SMP
4354
4355static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
4356{
4357 void (*func)(struct rq *rq);
4358 struct callback_head *next;
4359
4360 lockdep_assert_rq_held(rq);
4361
4362 while (head) {
4363 func = (void (*)(struct rq *))head->func;
4364 next = head->next;
4365 head->next = NULL;
4366 head = next;
4367
4368 func(rq);
4369 }
4370}
4371
4372static void balance_push(struct rq *rq);
4373
4374struct callback_head balance_push_callback = {
4375 .next = NULL,
4376 .func = (void (*)(struct callback_head *))balance_push,
4377};
4378
4379static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4380{
4381 struct callback_head *head = rq->balance_callback;
4382
4383 lockdep_assert_rq_held(rq);
4384 if (head)
4385 rq->balance_callback = NULL;
4386
4387 return head;
4388}
4389
4390static void __balance_callbacks(struct rq *rq)
4391{
4392 do_balance_callbacks(rq, splice_balance_callbacks(rq));
4393}
4394
4395static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4396{
4397 unsigned long flags;
4398
4399 if (unlikely(head)) {
4400 raw_spin_rq_lock_irqsave(rq, flags);
4401 do_balance_callbacks(rq, head);
4402 raw_spin_rq_unlock_irqrestore(rq, flags);
4403 }
4404}
4405
4406#else
4407
4408static inline void __balance_callbacks(struct rq *rq)
4409{
4410}
4411
4412static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4413{
4414 return NULL;
4415}
4416
4417static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4418{
4419}
4420
4421#endif
4422
4423static inline void
4424prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
4425{
4426
4427
4428
4429
4430
4431
4432 rq_unpin_lock(rq, rf);
4433 spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_);
4434#ifdef CONFIG_DEBUG_SPINLOCK
4435
4436 rq_lockp(rq)->owner = next;
4437#endif
4438}
4439
4440static inline void finish_lock_switch(struct rq *rq)
4441{
4442
4443
4444
4445
4446
4447 spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
4448 __balance_callbacks(rq);
4449 raw_spin_rq_unlock_irq(rq);
4450}
4451
4452
4453
4454
4455
4456#ifndef prepare_arch_switch
4457# define prepare_arch_switch(next) do { } while (0)
4458#endif
4459
4460#ifndef finish_arch_post_lock_switch
4461# define finish_arch_post_lock_switch() do { } while (0)
4462#endif
4463
4464static inline void kmap_local_sched_out(void)
4465{
4466#ifdef CONFIG_KMAP_LOCAL
4467 if (unlikely(current->kmap_ctrl.idx))
4468 __kmap_local_sched_out();
4469#endif
4470}
4471
4472static inline void kmap_local_sched_in(void)
4473{
4474#ifdef CONFIG_KMAP_LOCAL
4475 if (unlikely(current->kmap_ctrl.idx))
4476 __kmap_local_sched_in();
4477#endif
4478}
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
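/*
 * prepare_task_switch - called with the runqueue lock held and interrupts
 * disabled, just before switching from @prev to @next; must be paired
 * with a later finish_task_switch().  Runs the switch-out bookkeeping
 * (kcov, sched_info, perf, rseq, preempt notifiers, kmap_local) and
 * claims @next via prepare_task().
 */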
4493static inline void
4494prepare_task_switch(struct rq *rq, struct task_struct *prev,
4495 struct task_struct *next)
4496{
4497 kcov_prepare_switch(prev);
4498 sched_info_switch(rq, prev, next);
4499 perf_event_task_sched_out(prev, next);
4500 rseq_preempt(prev);
4501 fire_sched_out_preempt_notifiers(prev, next);
4502 kmap_local_sched_out();
4503 prepare_task(next);
4504 prepare_arch_switch(next);
4505}
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
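/*
 * finish_task_switch - clean up after a task switch; entered with the
 * runqueue lock still held and interrupts disabled.  Drops the lock,
 * releases @prev's ->on_cpu, drops a borrowed active_mm if there was one,
 * and puts the task's stack and final reference when it was exiting
 * (TASK_DEAD).  Returns this CPU's runqueue for the caller.
 */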
4526static struct rq *finish_task_switch(struct task_struct *prev)
4527 __releases(rq->lock)
4528{
4529 struct rq *rq = this_rq();
4530 struct mm_struct *mm = rq->prev_mm;
4531 long prev_state;
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
4545 "corrupted preempt_count: %s/%d/0x%x\n",
4546 current->comm, current->pid, preempt_count()))
4547 preempt_count_set(FORK_PREEMPT_COUNT);
4548
4549 rq->prev_mm = NULL;
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562 prev_state = READ_ONCE(prev->__state);
4563 vtime_task_switch(prev);
4564 perf_event_task_sched_in(prev, current);
4565 finish_task(prev);
4566 tick_nohz_task_switch();
4567 finish_lock_switch(rq);
4568 finish_arch_post_lock_switch();
4569 kcov_finish_switch(current);
4570
4571
4572
4573
4574
4575
4576
4577 kmap_local_sched_in();
4578
4579 fire_sched_in_preempt_notifiers(current);
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592 if (mm) {
4593 membarrier_mm_sync_core_before_usermode(mm);
4594 mmdrop(mm);
4595 }
4596 if (unlikely(prev_state == TASK_DEAD)) {
4597 if (prev->sched_class->task_dead)
4598 prev->sched_class->task_dead(prev);
4599
4600
4601
4602
4603
4604 kprobe_flush_task(prev);
4605
4606
4607 put_task_stack(prev);
4608
4609 put_task_struct_rcu_user(prev);
4610 }
4611
4612 return rq;
4613}
4614
4615
4616
4617
4618
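/*
 * schedule_tail - first thing a freshly forked thread must call: finish
 * the task switch begun on its behalf, re-enable preemption, report the
 * child's TID to user space if requested and recompute pending signals.
 */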
4619asmlinkage __visible void schedule_tail(struct task_struct *prev)
4620 __releases(rq->lock)
4621{
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631 finish_task_switch(prev);
4632 preempt_enable();
4633
4634 if (current->set_child_tid)
4635 put_user(task_pid_vnr(current), current->set_child_tid);
4636
4637 calculate_sigpending();
4638}
4639
4640
4641
4642
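/*
 * context_switch - switch to @next's address space and register state.
 * Kernel threads borrow the previous task's active_mm (lazy TLB); a
 * borrowed mm is stashed in rq->prev_mm and dropped later in
 * finish_task_switch(), whose return value is passed back to the caller.
 */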
4643static __always_inline struct rq *
4644context_switch(struct rq *rq, struct task_struct *prev,
4645 struct task_struct *next, struct rq_flags *rf)
4646{
4647 prepare_task_switch(rq, prev, next);
4648
4649
4650
4651
4652
4653
4654 arch_start_context_switch(prev);
4655
4656
4657
4658
4659
4660
4661
4662
4663 if (!next->mm) {
4664 enter_lazy_tlb(prev->active_mm, next);
4665
4666 next->active_mm = prev->active_mm;
4667 if (prev->mm)
4668 mmgrab(prev->active_mm);
4669 else
4670 prev->active_mm = NULL;
4671 } else {
4672 membarrier_switch_mm(rq, prev->active_mm, next->mm);
4673
4674
4675
4676
4677
4678
4679
4680
4681 switch_mm_irqs_off(prev->active_mm, next->mm, next);
4682
4683 if (!prev->mm) {
4684
4685 rq->prev_mm = prev->active_mm;
4686 prev->active_mm = NULL;
4687 }
4688 }
4689
4690 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
4691
4692 prepare_lock_switch(rq, next, rf);
4693
4694
4695 switch_to(prev, next, prev);
4696 barrier();
4697
4698 return finish_task_switch(prev);
4699}
4700
4701
4702
4703
4704
4705
4706
4707unsigned int nr_running(void)
4708{
4709 unsigned int i, sum = 0;
4710
4711 for_each_online_cpu(i)
4712 sum += cpu_rq(i)->nr_running;
4713
4714 return sum;
4715}
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730bool single_task_running(void)
4731{
4732 return raw_rq()->nr_running == 1;
4733}
4734EXPORT_SYMBOL(single_task_running);
4735
4736unsigned long long nr_context_switches(void)
4737{
4738 int i;
4739 unsigned long long sum = 0;
4740
4741 for_each_possible_cpu(i)
4742 sum += cpu_rq(i)->nr_switches;
4743
4744 return sum;
4745}
4746
4747
4748
4749
4750
4751
4752
4753
4754unsigned int nr_iowait_cpu(int cpu)
4755{
4756 return atomic_read(&cpu_rq(cpu)->nr_iowait);
4757}
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
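/*
 * nr_iowait - number of tasks currently blocked waiting on IO, summed
 * over all possible CPUs.  The per-CPU numbers are only approximate: a
 * sleeper is charged to the CPU it started waiting on, which need not be
 * the CPU it eventually wakes on.
 */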
4789unsigned int nr_iowait(void)
4790{
4791 unsigned int i, sum = 0;
4792
4793 for_each_possible_cpu(i)
4794 sum += nr_iowait_cpu(i);
4795
4796 return sum;
4797}
4798
4799#ifdef CONFIG_SMP
4800
4801
4802
4803
4804
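/*
 * sched_exec - execve() is used as a rebalancing opportunity, since the
 * task's memory and cache footprint is at its smallest there.  If the
 * scheduling class picks a different active CPU, migrate via the
 * migration stopper.
 */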
4805void sched_exec(void)
4806{
4807 struct task_struct *p = current;
4808 unsigned long flags;
4809 int dest_cpu;
4810
4811 raw_spin_lock_irqsave(&p->pi_lock, flags);
4812 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
4813 if (dest_cpu == smp_processor_id())
4814 goto unlock;
4815
4816 if (likely(cpu_active(dest_cpu))) {
4817 struct migration_arg arg = { p, dest_cpu };
4818
4819 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4820 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
4821 return;
4822 }
4823unlock:
4824 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4825}
4826
4827#endif
4828
4829DEFINE_PER_CPU(struct kernel_stat, kstat);
4830DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
4831
4832EXPORT_PER_CPU_SYMBOL(kstat);
4833EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
4834
4835
4836
4837
4838
4839
4840
4841static inline void prefetch_curr_exec_start(struct task_struct *p)
4842{
4843#ifdef CONFIG_FAIR_GROUP_SCHED
4844 struct sched_entity *curr = (&p->se)->cfs_rq->curr;
4845#else
4846 struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
4847#endif
4848 prefetch(curr);
4849 prefetch(&curr->exec_start);
4850}
4851
4852
4853
4854
4855
4856
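/*
 * task_sched_runtime - return the task's accumulated CPU time.  If the
 * task is currently running, its class's update_curr() is called first so
 * that the not-yet-accounted delta is included.
 */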
4857unsigned long long task_sched_runtime(struct task_struct *p)
4858{
4859 struct rq_flags rf;
4860 struct rq *rq;
4861 u64 ns;
4862
4863#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875 if (!p->on_cpu || !task_on_rq_queued(p))
4876 return p->se.sum_exec_runtime;
4877#endif
4878
4879 rq = task_rq_lock(p, &rf);
4880
4881
4882
4883
4884
4885 if (task_current(rq, p) && task_on_rq_queued(p)) {
4886 prefetch_curr_exec_start(p);
4887 update_rq_clock(rq);
4888 p->sched_class->update_curr(rq);
4889 }
4890 ns = p->se.sum_exec_runtime;
4891 task_rq_unlock(rq, p, &rf);
4892
4893 return ns;
4894}
4895
4896#ifdef CONFIG_SCHED_DEBUG
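/*
 * cpu_resched_latency - track how long need_resched() has been set on
 * this runqueue without a reschedule.  Returns the latency once it
 * exceeds sysctl_resched_latency_warn_ms (and only once when the
 * warn-once sysctl is set), 0 otherwise.
 */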
4897static u64 cpu_resched_latency(struct rq *rq)
4898{
4899 int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
4900 u64 resched_latency, now = rq_clock(rq);
4901 static bool warned_once;
4902
4903 if (sysctl_resched_latency_warn_once && warned_once)
4904 return 0;
4905
4906 if (!need_resched() || !latency_warn_ms)
4907 return 0;
4908
4909 if (system_state == SYSTEM_BOOTING)
4910 return 0;
4911
4912 if (!rq->last_seen_need_resched_ns) {
4913 rq->last_seen_need_resched_ns = now;
4914 rq->ticks_without_resched = 0;
4915 return 0;
4916 }
4917
4918 rq->ticks_without_resched++;
4919 resched_latency = now - rq->last_seen_need_resched_ns;
4920 if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
4921 return 0;
4922
4923 warned_once = true;
4924
4925 return resched_latency;
4926}
4927
4928static int __init setup_resched_latency_warn_ms(char *str)
4929{
4930 long val;
4931
4932 if ((kstrtol(str, 0, &val))) {
4933 pr_warn("Unable to set resched_latency_warn_ms\n");
4934 return 1;
4935 }
4936
4937 sysctl_resched_latency_warn_ms = val;
4938 return 1;
4939}
4940__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
4941#else
4942static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
4943#endif
4944
4945
4946
4947
4948
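/*
 * scheduler_tick - per-CPU periodic (HZ) tick: update the runqueue clock
 * and thermal pressure, run the current class's task_tick(), feed the
 * global load average and, with the LATENCY_WARN feature, the
 * resched-latency diagnostics.
 */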
4949void scheduler_tick(void)
4950{
4951 int cpu = smp_processor_id();
4952 struct rq *rq = cpu_rq(cpu);
4953 struct task_struct *curr = rq->curr;
4954 struct rq_flags rf;
4955 unsigned long thermal_pressure;
4956 u64 resched_latency;
4957
4958 arch_scale_freq_tick();
4959 sched_clock_tick();
4960
4961 rq_lock(rq, &rf);
4962
4963 update_rq_clock(rq);
4964 thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
4965 update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
4966 curr->sched_class->task_tick(rq, curr, 0);
4967 if (sched_feat(LATENCY_WARN))
4968 resched_latency = cpu_resched_latency(rq);
4969 calc_global_load_tick(rq);
4970
4971 rq_unlock(rq, &rf);
4972
4973 if (sched_feat(LATENCY_WARN) && resched_latency)
4974 resched_