1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23#include <linux/latencytop.h>
24#include <linux/sched.h>
25#include <linux/cpumask.h>
26#include <linux/slab.h>
27#include <linux/profile.h>
28#include <linux/interrupt.h>
29
30#include <trace/events/sched.h>
31
32#include "sched.h"
33
34
35
36
37
38
39
40
41
42
43
44
45
46unsigned int sysctl_sched_latency = 6000000ULL;
47unsigned int normalized_sysctl_sched_latency = 6000000ULL;
48
49
50
51
52
53
54
55
56
57
58enum sched_tunable_scaling sysctl_sched_tunable_scaling
59 = SCHED_TUNABLESCALING_LOG;
60
61
62
63
64
65unsigned int sysctl_sched_min_granularity = 750000ULL;
66unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
67
68
69
70
71static unsigned int sched_nr_latency = 8;
72
73
74
75
76
77unsigned int sysctl_sched_child_runs_first __read_mostly;
78
79
80
81
82
83
84
85
86
87unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
88unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
89
90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
91
92
93
94
95
96
97unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
98
99#ifdef CONFIG_CFS_BANDWIDTH
100
101
102
103
104
105
106
107
108
109
110unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
111#endif
112
113
114
115
116
117
118
119
120
121
122static int get_update_sysctl_factor(void)
123{
124 unsigned int cpus = min_t(int, num_online_cpus(), 8);
125 unsigned int factor;
126
127 switch (sysctl_sched_tunable_scaling) {
128 case SCHED_TUNABLESCALING_NONE:
129 factor = 1;
130 break;
131 case SCHED_TUNABLESCALING_LINEAR:
132 factor = cpus;
133 break;
134 case SCHED_TUNABLESCALING_LOG:
135 default:
136 factor = 1 + ilog2(cpus);
137 break;
138 }
139
140 return factor;
141}
142
143static void update_sysctl(void)
144{
145 unsigned int factor = get_update_sysctl_factor();
146
147#define SET_SYSCTL(name) \
148 (sysctl_##name = (factor) * normalized_sysctl_##name)
149 SET_SYSCTL(sched_min_granularity);
150 SET_SYSCTL(sched_latency);
151 SET_SYSCTL(sched_wakeup_granularity);
152#undef SET_SYSCTL
153}
154
155void sched_init_granularity(void)
156{
157 update_sysctl();
158}
159
160#if BITS_PER_LONG == 32
161# define WMULT_CONST (~0UL)
162#else
163# define WMULT_CONST (1UL << 32)
164#endif
165
166#define WMULT_SHIFT 32
167
168
169
170
171#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
172
173
174
175
176static unsigned long
177calc_delta_mine(unsigned long delta_exec, unsigned long weight,
178 struct load_weight *lw)
179{
180 u64 tmp;
181
182
183
184
185
186
187 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
188 tmp = (u64)delta_exec * scale_load_down(weight);
189 else
190 tmp = (u64)delta_exec;
191
192 if (!lw->inv_weight) {
193 unsigned long w = scale_load_down(lw->weight);
194
195 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
196 lw->inv_weight = 1;
197 else if (unlikely(!w))
198 lw->inv_weight = WMULT_CONST;
199 else
200 lw->inv_weight = WMULT_CONST / w;
201 }
202
203
204
205
206 if (unlikely(tmp > WMULT_CONST))
207 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
208 WMULT_SHIFT/2);
209 else
210 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
211
212 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
213}
214
215
216const struct sched_class fair_sched_class;
217
218
219
220
221
222#ifdef CONFIG_FAIR_GROUP_SCHED
223
224
225static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
226{
227 return cfs_rq->rq;
228}
229
230
231#define entity_is_task(se) (!se->my_q)
232
233static inline struct task_struct *task_of(struct sched_entity *se)
234{
235#ifdef CONFIG_SCHED_DEBUG
236 WARN_ON_ONCE(!entity_is_task(se));
237#endif
238 return container_of(se, struct task_struct, se);
239}
240
241
242#define for_each_sched_entity(se) \
243 for (; se; se = se->parent)
244
245static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
246{
247 return p->se.cfs_rq;
248}
249
250
251static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
252{
253 return se->cfs_rq;
254}
255
256
257static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
258{
259 return grp->my_q;
260}
261
262static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
263{
264 if (!cfs_rq->on_list) {
265
266
267
268
269
270
271 if (cfs_rq->tg->parent &&
272 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
273 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
274 &rq_of(cfs_rq)->leaf_cfs_rq_list);
275 } else {
276 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
277 &rq_of(cfs_rq)->leaf_cfs_rq_list);
278 }
279
280 cfs_rq->on_list = 1;
281 }
282}
283
284static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
285{
286 if (cfs_rq->on_list) {
287 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
288 cfs_rq->on_list = 0;
289 }
290}
291
292
293#define for_each_leaf_cfs_rq(rq, cfs_rq) \
294 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
295
296
297static inline int
298is_same_group(struct sched_entity *se, struct sched_entity *pse)
299{
300 if (se->cfs_rq == pse->cfs_rq)
301 return 1;
302
303 return 0;
304}
305
306static inline struct sched_entity *parent_entity(struct sched_entity *se)
307{
308 return se->parent;
309}
310
311
312static inline int depth_se(struct sched_entity *se)
313{
314 int depth = 0;
315
316 for_each_sched_entity(se)
317 depth++;
318
319 return depth;
320}
321
322static void
323find_matching_se(struct sched_entity **se, struct sched_entity **pse)
324{
325 int se_depth, pse_depth;
326
327
328
329
330
331
332
333
334
335 se_depth = depth_se(*se);
336 pse_depth = depth_se(*pse);
337
338 while (se_depth > pse_depth) {
339 se_depth--;
340 *se = parent_entity(*se);
341 }
342
343 while (pse_depth > se_depth) {
344 pse_depth--;
345 *pse = parent_entity(*pse);
346 }
347
348 while (!is_same_group(*se, *pse)) {
349 *se = parent_entity(*se);
350 *pse = parent_entity(*pse);
351 }
352}
353
354#else
355
356static inline struct task_struct *task_of(struct sched_entity *se)
357{
358 return container_of(se, struct task_struct, se);
359}
360
361static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
362{
363 return container_of(cfs_rq, struct rq, cfs);
364}
365
366#define entity_is_task(se) 1
367
368#define for_each_sched_entity(se) \
369 for (; se; se = NULL)
370
371static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
372{
373 return &task_rq(p)->cfs;
374}
375
376static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
377{
378 struct task_struct *p = task_of(se);
379 struct rq *rq = task_rq(p);
380
381 return &rq->cfs;
382}
383
384
385static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
386{
387 return NULL;
388}
389
390static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
391{
392}
393
394static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
395{
396}
397
398#define for_each_leaf_cfs_rq(rq, cfs_rq) \
399 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
400
401static inline int
402is_same_group(struct sched_entity *se, struct sched_entity *pse)
403{
404 return 1;
405}
406
407static inline struct sched_entity *parent_entity(struct sched_entity *se)
408{
409 return NULL;
410}
411
412static inline void
413find_matching_se(struct sched_entity **se, struct sched_entity **pse)
414{
415}
416
417#endif
418
419static __always_inline
420void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
421
422
423
424
425
426static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
427{
428 s64 delta = (s64)(vruntime - min_vruntime);
429 if (delta > 0)
430 min_vruntime = vruntime;
431
432 return min_vruntime;
433}
434
435static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
436{
437 s64 delta = (s64)(vruntime - min_vruntime);
438 if (delta < 0)
439 min_vruntime = vruntime;
440
441 return min_vruntime;
442}
443
444static inline int entity_before(struct sched_entity *a,
445 struct sched_entity *b)
446{
447 return (s64)(a->vruntime - b->vruntime) < 0;
448}
449
450static void update_min_vruntime(struct cfs_rq *cfs_rq)
451{
452 u64 vruntime = cfs_rq->min_vruntime;
453
454 if (cfs_rq->curr)
455 vruntime = cfs_rq->curr->vruntime;
456
457 if (cfs_rq->rb_leftmost) {
458 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
459 struct sched_entity,
460 run_node);
461
462 if (!cfs_rq->curr)
463 vruntime = se->vruntime;
464 else
465 vruntime = min_vruntime(vruntime, se->vruntime);
466 }
467
468 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
469#ifndef CONFIG_64BIT
470 smp_wmb();
471 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
472#endif
473}
474
475
476
477
478static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
479{
480 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
481 struct rb_node *parent = NULL;
482 struct sched_entity *entry;
483 int leftmost = 1;
484
485
486
487
488 while (*link) {
489 parent = *link;
490 entry = rb_entry(parent, struct sched_entity, run_node);
491
492
493
494
495 if (entity_before(se, entry)) {
496 link = &parent->rb_left;
497 } else {
498 link = &parent->rb_right;
499 leftmost = 0;
500 }
501 }
502
503
504
505
506
507 if (leftmost)
508 cfs_rq->rb_leftmost = &se->run_node;
509
510 rb_link_node(&se->run_node, parent, link);
511 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
512}
513
514static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
515{
516 if (cfs_rq->rb_leftmost == &se->run_node) {
517 struct rb_node *next_node;
518
519 next_node = rb_next(&se->run_node);
520 cfs_rq->rb_leftmost = next_node;
521 }
522
523 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
524}
525
526struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
527{
528 struct rb_node *left = cfs_rq->rb_leftmost;
529
530 if (!left)
531 return NULL;
532
533 return rb_entry(left, struct sched_entity, run_node);
534}
535
536static struct sched_entity *__pick_next_entity(struct sched_entity *se)
537{
538 struct rb_node *next = rb_next(&se->run_node);
539
540 if (!next)
541 return NULL;
542
543 return rb_entry(next, struct sched_entity, run_node);
544}
545
546#ifdef CONFIG_SCHED_DEBUG
547struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
548{
549 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
550
551 if (!last)
552 return NULL;
553
554 return rb_entry(last, struct sched_entity, run_node);
555}
556
557
558
559
560
561int sched_proc_update_handler(struct ctl_table *table, int write,
562 void __user *buffer, size_t *lenp,
563 loff_t *ppos)
564{
565 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
566 int factor = get_update_sysctl_factor();
567
568 if (ret || !write)
569 return ret;
570
571 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
572 sysctl_sched_min_granularity);
573
574#define WRT_SYSCTL(name) \
575 (normalized_sysctl_##name = sysctl_##name / (factor))
576 WRT_SYSCTL(sched_min_granularity);
577 WRT_SYSCTL(sched_latency);
578 WRT_SYSCTL(sched_wakeup_granularity);
579#undef WRT_SYSCTL
580
581 return 0;
582}
583#endif
584
585
586
587
588static inline unsigned long
589calc_delta_fair(unsigned long delta, struct sched_entity *se)
590{
591 if (unlikely(se->load.weight != NICE_0_LOAD))
592 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
593
594 return delta;
595}
596
597
598
599
600
601
602
603
604
605static u64 __sched_period(unsigned long nr_running)
606{
607 u64 period = sysctl_sched_latency;
608 unsigned long nr_latency = sched_nr_latency;
609
610 if (unlikely(nr_running > nr_latency)) {
611 period = sysctl_sched_min_granularity;
612 period *= nr_running;
613 }
614
615 return period;
616}
617
618
619
620
621
622
623
624static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
625{
626 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
627
628 for_each_sched_entity(se) {
629 struct load_weight *load;
630 struct load_weight lw;
631
632 cfs_rq = cfs_rq_of(se);
633 load = &cfs_rq->load;
634
635 if (unlikely(!se->on_rq)) {
636 lw = cfs_rq->load;
637
638 update_load_add(&lw, se->load.weight);
639 load = &lw;
640 }
641 slice = calc_delta_mine(slice, se->load.weight, load);
642 }
643 return slice;
644}
645
646
647
648
649
650
651static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
652{
653 return calc_delta_fair(sched_slice(cfs_rq, se), se);
654}
655
656static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
657static void update_cfs_shares(struct cfs_rq *cfs_rq);
658
659
660
661
662
663static inline void
664__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
665 unsigned long delta_exec)
666{
667 unsigned long delta_exec_weighted;
668
669 schedstat_set(curr->statistics.exec_max,
670 max((u64)delta_exec, curr->statistics.exec_max));
671
672 curr->sum_exec_runtime += delta_exec;
673 schedstat_add(cfs_rq, exec_clock, delta_exec);
674 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
675
676 curr->vruntime += delta_exec_weighted;
677 update_min_vruntime(cfs_rq);
678
679#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
680 cfs_rq->load_unacc_exec_time += delta_exec;
681#endif
682}
683
684static void update_curr(struct cfs_rq *cfs_rq)
685{
686 struct sched_entity *curr = cfs_rq->curr;
687 u64 now = rq_of(cfs_rq)->clock_task;
688 unsigned long delta_exec;
689
690 if (unlikely(!curr))
691 return;
692
693
694
695
696
697
698 delta_exec = (unsigned long)(now - curr->exec_start);
699 if (!delta_exec)
700 return;
701
702 __update_curr(cfs_rq, curr, delta_exec);
703 curr->exec_start = now;
704
705 if (entity_is_task(curr)) {
706 struct task_struct *curtask = task_of(curr);
707
708 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
709 cpuacct_charge(curtask, delta_exec);
710 account_group_exec_runtime(curtask, delta_exec);
711 }
712
713 account_cfs_rq_runtime(cfs_rq, delta_exec);
714}
715
716static inline void
717update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
718{
719 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
720}
721
722
723
724
725static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
726{
727
728
729
730
731 if (se != cfs_rq->curr)
732 update_stats_wait_start(cfs_rq, se);
733}
734
735static void
736update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
737{
738 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
739 rq_of(cfs_rq)->clock - se->statistics.wait_start));
740 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
741 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
742 rq_of(cfs_rq)->clock - se->statistics.wait_start);
743#ifdef CONFIG_SCHEDSTATS
744 if (entity_is_task(se)) {
745 trace_sched_stat_wait(task_of(se),
746 rq_of(cfs_rq)->clock - se->statistics.wait_start);
747 }
748#endif
749 schedstat_set(se->statistics.wait_start, 0);
750}
751
752static inline void
753update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
754{
755
756
757
758
759 if (se != cfs_rq->curr)
760 update_stats_wait_end(cfs_rq, se);
761}
762
763
764
765
766static inline void
767update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
768{
769
770
771
772 se->exec_start = rq_of(cfs_rq)->clock_task;
773}
774
775
776
777
778
779static void
780account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
781{
782 update_load_add(&cfs_rq->load, se->load.weight);
783 if (!parent_entity(se))
784 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
785#ifdef CONFIG_SMP
786 if (entity_is_task(se))
787 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
788#endif
789 cfs_rq->nr_running++;
790}
791
792static void
793account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
794{
795 update_load_sub(&cfs_rq->load, se->load.weight);
796 if (!parent_entity(se))
797 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
798 if (entity_is_task(se))
799 list_del_init(&se->group_node);
800 cfs_rq->nr_running--;
801}
802
803#ifdef CONFIG_FAIR_GROUP_SCHED
804
805static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
806# ifdef CONFIG_SMP
807static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
808 int global_update)
809{
810 struct task_group *tg = cfs_rq->tg;
811 long load_avg;
812
813 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
814 load_avg -= cfs_rq->load_contribution;
815
816 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
817 atomic_add(load_avg, &tg->load_weight);
818 cfs_rq->load_contribution += load_avg;
819 }
820}
821
822static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
823{
824 u64 period = sysctl_sched_shares_window;
825 u64 now, delta;
826 unsigned long load = cfs_rq->load.weight;
827
828 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
829 return;
830
831 now = rq_of(cfs_rq)->clock_task;
832 delta = now - cfs_rq->load_stamp;
833
834
835 if (cfs_rq->load_stamp > cfs_rq->load_last &&
836 now - cfs_rq->load_last > 4 * period) {
837 cfs_rq->load_period = 0;
838 cfs_rq->load_avg = 0;
839 delta = period - 1;
840 }
841
842 cfs_rq->load_stamp = now;
843 cfs_rq->load_unacc_exec_time = 0;
844 cfs_rq->load_period += delta;
845 if (load) {
846 cfs_rq->load_last = now;
847 cfs_rq->load_avg += delta * load;
848 }
849
850
851 if (global_update || cfs_rq->load_period > period
852 || !cfs_rq->load_period)
853 update_cfs_rq_load_contribution(cfs_rq, global_update);
854
855 while (cfs_rq->load_period > period) {
856
857
858
859
860
861 asm("" : "+rm" (cfs_rq->load_period));
862 cfs_rq->load_period /= 2;
863 cfs_rq->load_avg /= 2;
864 }
865
866 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
867 list_del_leaf_cfs_rq(cfs_rq);
868}
869
870static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
871{
872 long tg_weight;
873
874
875
876
877
878
879 tg_weight = atomic_read(&tg->load_weight);
880 tg_weight -= cfs_rq->load_contribution;
881 tg_weight += cfs_rq->load.weight;
882
883 return tg_weight;
884}
885
886static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
887{
888 long tg_weight, load, shares;
889
890 tg_weight = calc_tg_weight(tg, cfs_rq);
891 load = cfs_rq->load.weight;
892
893 shares = (tg->shares * load);
894 if (tg_weight)
895 shares /= tg_weight;
896
897 if (shares < MIN_SHARES)
898 shares = MIN_SHARES;
899 if (shares > tg->shares)
900 shares = tg->shares;
901
902 return shares;
903}
904
905static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
906{
907 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
908 update_cfs_load(cfs_rq, 0);
909 update_cfs_shares(cfs_rq);
910 }
911}
912# else
913static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
914{
915}
916
917static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
918{
919 return tg->shares;
920}
921
922static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
923{
924}
925# endif
926static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
927 unsigned long weight)
928{
929 if (se->on_rq) {
930
931 if (cfs_rq->curr == se)
932 update_curr(cfs_rq);
933 account_entity_dequeue(cfs_rq, se);
934 }
935
936 update_load_set(&se->load, weight);
937
938 if (se->on_rq)
939 account_entity_enqueue(cfs_rq, se);
940}
941
942static void update_cfs_shares(struct cfs_rq *cfs_rq)
943{
944 struct task_group *tg;
945 struct sched_entity *se;
946 long shares;
947
948 tg = cfs_rq->tg;
949 se = tg->se[cpu_of(rq_of(cfs_rq))];
950 if (!se || throttled_hierarchy(cfs_rq))
951 return;
952#ifndef CONFIG_SMP
953 if (likely(se->load.weight == tg->shares))
954 return;
955#endif
956 shares = calc_cfs_shares(cfs_rq, tg);
957
958 reweight_entity(cfs_rq_of(se), se, shares);
959}
960#else
961static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
962{
963}
964
965static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
966{
967}
968
969static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
970{
971}
972#endif
973
974static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
975{
976#ifdef CONFIG_SCHEDSTATS
977 struct task_struct *tsk = NULL;
978
979 if (entity_is_task(se))
980 tsk = task_of(se);
981
982 if (se->statistics.sleep_start) {
983 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
984
985 if ((s64)delta < 0)
986 delta = 0;
987
988 if (unlikely(delta > se->statistics.sleep_max))
989 se->statistics.sleep_max = delta;
990
991 se->statistics.sleep_start = 0;
992 se->statistics.sum_sleep_runtime += delta;
993
994 if (tsk) {
995 account_scheduler_latency(tsk, delta >> 10, 1);
996 trace_sched_stat_sleep(tsk, delta);
997 }
998 }
999 if (se->statistics.block_start) {
1000 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
1001
1002 if ((s64)delta < 0)
1003 delta = 0;
1004
1005 if (unlikely(delta > se->statistics.block_max))
1006 se->statistics.block_max = delta;
1007
1008 se->statistics.block_start = 0;
1009 se->statistics.sum_sleep_runtime += delta;
1010
1011 if (tsk) {
1012 if (tsk->in_iowait) {
1013 se->statistics.iowait_sum += delta;
1014 se->statistics.iowait_count++;
1015 trace_sched_stat_iowait(tsk, delta);
1016 }
1017
1018 trace_sched_stat_blocked(tsk, delta);
1019
1020
1021
1022
1023
1024
1025 if (unlikely(prof_on == SLEEP_PROFILING)) {
1026 profile_hits(SLEEP_PROFILING,
1027 (void *)get_wchan(tsk),
1028 delta >> 20);
1029 }
1030 account_scheduler_latency(tsk, delta >> 10, 0);
1031 }
1032 }
1033#endif
1034}
1035
1036static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
1037{
1038#ifdef CONFIG_SCHED_DEBUG
1039 s64 d = se->vruntime - cfs_rq->min_vruntime;
1040
1041 if (d < 0)
1042 d = -d;
1043
1044 if (d > 3*sysctl_sched_latency)
1045 schedstat_inc(cfs_rq, nr_spread_over);
1046#endif
1047}
1048
1049static void
1050place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1051{
1052 u64 vruntime = cfs_rq->min_vruntime;
1053
1054
1055
1056
1057
1058
1059
1060 if (initial && sched_feat(START_DEBIT))
1061 vruntime += sched_vslice(cfs_rq, se);
1062
1063
1064 if (!initial) {
1065 unsigned long thresh = sysctl_sched_latency;
1066
1067
1068
1069
1070
1071 if (sched_feat(GENTLE_FAIR_SLEEPERS))
1072 thresh >>= 1;
1073
1074 vruntime -= thresh;
1075 }
1076
1077
1078 vruntime = max_vruntime(se->vruntime, vruntime);
1079
1080 se->vruntime = vruntime;
1081}
1082
1083static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
1084
1085static void
1086enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1087{
1088
1089
1090
1091
1092 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
1093 se->vruntime += cfs_rq->min_vruntime;
1094
1095
1096
1097
1098 update_curr(cfs_rq);
1099 update_cfs_load(cfs_rq, 0);
1100 account_entity_enqueue(cfs_rq, se);
1101 update_cfs_shares(cfs_rq);
1102
1103 if (flags & ENQUEUE_WAKEUP) {
1104 place_entity(cfs_rq, se, 0);
1105 enqueue_sleeper(cfs_rq, se);
1106 }
1107
1108 update_stats_enqueue(cfs_rq, se);
1109 check_spread(cfs_rq, se);
1110 if (se != cfs_rq->curr)
1111 __enqueue_entity(cfs_rq, se);
1112 se->on_rq = 1;
1113
1114 if (cfs_rq->nr_running == 1) {
1115 list_add_leaf_cfs_rq(cfs_rq);
1116 check_enqueue_throttle(cfs_rq);
1117 }
1118}
1119
1120static void __clear_buddies_last(struct sched_entity *se)
1121{
1122 for_each_sched_entity(se) {
1123 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1124 if (cfs_rq->last == se)
1125 cfs_rq->last = NULL;
1126 else
1127 break;
1128 }
1129}
1130
1131static void __clear_buddies_next(struct sched_entity *se)
1132{
1133 for_each_sched_entity(se) {
1134 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1135 if (cfs_rq->next == se)
1136 cfs_rq->next = NULL;
1137 else
1138 break;
1139 }
1140}
1141
1142static void __clear_buddies_skip(struct sched_entity *se)
1143{
1144 for_each_sched_entity(se) {
1145 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1146 if (cfs_rq->skip == se)
1147 cfs_rq->skip = NULL;
1148 else
1149 break;
1150 }
1151}
1152
1153static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1154{
1155 if (cfs_rq->last == se)
1156 __clear_buddies_last(se);
1157
1158 if (cfs_rq->next == se)
1159 __clear_buddies_next(se);
1160
1161 if (cfs_rq->skip == se)
1162 __clear_buddies_skip(se);
1163}
1164
1165static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1166
1167static void
1168dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1169{
1170
1171
1172
1173 update_curr(cfs_rq);
1174
1175 update_stats_dequeue(cfs_rq, se);
1176 if (flags & DEQUEUE_SLEEP) {
1177#ifdef CONFIG_SCHEDSTATS
1178 if (entity_is_task(se)) {
1179 struct task_struct *tsk = task_of(se);
1180
1181 if (tsk->state & TASK_INTERRUPTIBLE)
1182 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
1183 if (tsk->state & TASK_UNINTERRUPTIBLE)
1184 se->statistics.block_start = rq_of(cfs_rq)->clock;
1185 }
1186#endif
1187 }
1188
1189 clear_buddies(cfs_rq, se);
1190
1191 if (se != cfs_rq->curr)
1192 __dequeue_entity(cfs_rq, se);
1193 se->on_rq = 0;
1194 update_cfs_load(cfs_rq, 0);
1195 account_entity_dequeue(cfs_rq, se);
1196
1197
1198
1199
1200
1201
1202 if (!(flags & DEQUEUE_SLEEP))
1203 se->vruntime -= cfs_rq->min_vruntime;
1204
1205
1206 return_cfs_rq_runtime(cfs_rq);
1207
1208 update_min_vruntime(cfs_rq);
1209 update_cfs_shares(cfs_rq);
1210}
1211
1212
1213
1214
1215static void
1216check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1217{
1218 unsigned long ideal_runtime, delta_exec;
1219 struct sched_entity *se;
1220 s64 delta;
1221
1222 ideal_runtime = sched_slice(cfs_rq, curr);
1223 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1224 if (delta_exec > ideal_runtime) {
1225 resched_task(rq_of(cfs_rq)->curr);
1226
1227
1228
1229
1230 clear_buddies(cfs_rq, curr);
1231 return;
1232 }
1233
1234
1235
1236
1237
1238
1239 if (delta_exec < sysctl_sched_min_granularity)
1240 return;
1241
1242 se = __pick_first_entity(cfs_rq);
1243 delta = curr->vruntime - se->vruntime;
1244
1245 if (delta < 0)
1246 return;
1247
1248 if (delta > ideal_runtime)
1249 resched_task(rq_of(cfs_rq)->curr);
1250}
1251
1252static void
1253set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1254{
1255
1256 if (se->on_rq) {
1257
1258
1259
1260
1261
1262 update_stats_wait_end(cfs_rq, se);
1263 __dequeue_entity(cfs_rq, se);
1264 }
1265
1266 update_stats_curr_start(cfs_rq, se);
1267 cfs_rq->curr = se;
1268#ifdef CONFIG_SCHEDSTATS
1269
1270
1271
1272
1273
1274 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
1275 se->statistics.slice_max = max(se->statistics.slice_max,
1276 se->sum_exec_runtime - se->prev_sum_exec_runtime);
1277 }
1278#endif
1279 se->prev_sum_exec_runtime = se->sum_exec_runtime;
1280}
1281
1282static int
1283wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1284
1285
1286
1287
1288
1289
1290
1291
1292static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1293{
1294 struct sched_entity *se = __pick_first_entity(cfs_rq);
1295 struct sched_entity *left = se;
1296
1297
1298
1299
1300
1301 if (cfs_rq->skip == se) {
1302 struct sched_entity *second = __pick_next_entity(se);
1303 if (second && wakeup_preempt_entity(second, left) < 1)
1304 se = second;
1305 }
1306
1307
1308
1309
1310 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1311 se = cfs_rq->last;
1312
1313
1314
1315
1316 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1317 se = cfs_rq->next;
1318
1319 clear_buddies(cfs_rq, se);
1320
1321 return se;
1322}
1323
1324static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1325
1326static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1327{
1328
1329
1330
1331
1332 if (prev->on_rq)
1333 update_curr(cfs_rq);
1334
1335
1336 check_cfs_rq_runtime(cfs_rq);
1337
1338 check_spread(cfs_rq, prev);
1339 if (prev->on_rq) {
1340 update_stats_wait_start(cfs_rq, prev);
1341
1342 __enqueue_entity(cfs_rq, prev);
1343 }
1344 cfs_rq->curr = NULL;
1345}
1346
1347static void
1348entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1349{
1350
1351
1352
1353 update_curr(cfs_rq);
1354
1355
1356
1357
1358 update_entity_shares_tick(cfs_rq);
1359
1360#ifdef CONFIG_SCHED_HRTICK
1361
1362
1363
1364
1365 if (queued) {
1366 resched_task(rq_of(cfs_rq)->curr);
1367 return;
1368 }
1369
1370
1371
1372 if (!sched_feat(DOUBLE_TICK) &&
1373 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
1374 return;
1375#endif
1376
1377 if (cfs_rq->nr_running > 1)
1378 check_preempt_tick(cfs_rq, curr);
1379}
1380
1381
1382
1383
1384
1385
1386#ifdef CONFIG_CFS_BANDWIDTH
1387
1388#ifdef HAVE_JUMP_LABEL
1389static struct static_key __cfs_bandwidth_used;
1390
1391static inline bool cfs_bandwidth_used(void)
1392{
1393 return static_key_false(&__cfs_bandwidth_used);
1394}
1395
1396void account_cfs_bandwidth_used(int enabled, int was_enabled)
1397{
1398
1399 if (enabled && !was_enabled)
1400 static_key_slow_inc(&__cfs_bandwidth_used);
1401 else if (!enabled && was_enabled)
1402 static_key_slow_dec(&__cfs_bandwidth_used);
1403}
1404#else
1405static bool cfs_bandwidth_used(void)
1406{
1407 return true;
1408}
1409
1410void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
1411#endif
1412
1413
1414
1415
1416
1417static inline u64 default_cfs_period(void)
1418{
1419 return 100000000ULL;
1420}
1421
1422static inline u64 sched_cfs_bandwidth_slice(void)
1423{
1424 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
1425}
1426
1427
1428
1429
1430
1431
1432
1433
1434void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1435{
1436 u64 now;
1437
1438 if (cfs_b->quota == RUNTIME_INF)
1439 return;
1440
1441 now = sched_clock_cpu(smp_processor_id());
1442 cfs_b->runtime = cfs_b->quota;
1443 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1444}
1445
1446static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1447{
1448 return &tg->cfs_bandwidth;
1449}
1450
1451
1452static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1453{
1454 struct task_group *tg = cfs_rq->tg;
1455 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
1456 u64 amount = 0, min_amount, expires;
1457
1458
1459 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
1460
1461 raw_spin_lock(&cfs_b->lock);
1462 if (cfs_b->quota == RUNTIME_INF)
1463 amount = min_amount;
1464 else {
1465
1466
1467
1468
1469
1470
1471 if (!cfs_b->timer_active) {
1472 __refill_cfs_bandwidth_runtime(cfs_b);
1473 __start_cfs_bandwidth(cfs_b);
1474 }
1475
1476 if (cfs_b->runtime > 0) {
1477 amount = min(cfs_b->runtime, min_amount);
1478 cfs_b->runtime -= amount;
1479 cfs_b->idle = 0;
1480 }
1481 }
1482 expires = cfs_b->runtime_expires;
1483 raw_spin_unlock(&cfs_b->lock);
1484
1485 cfs_rq->runtime_remaining += amount;
1486
1487
1488
1489
1490
1491 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
1492 cfs_rq->runtime_expires = expires;
1493
1494 return cfs_rq->runtime_remaining > 0;
1495}
1496
1497
1498
1499
1500
1501static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1502{
1503 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1504 struct rq *rq = rq_of(cfs_rq);
1505
1506
1507 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
1508 return;
1509
1510 if (cfs_rq->runtime_remaining < 0)
1511 return;
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
1523
1524 cfs_rq->runtime_expires += TICK_NSEC;
1525 } else {
1526
1527 cfs_rq->runtime_remaining = 0;
1528 }
1529}
1530
1531static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1532 unsigned long delta_exec)
1533{
1534
1535 cfs_rq->runtime_remaining -= delta_exec;
1536 expire_cfs_rq_runtime(cfs_rq);
1537
1538 if (likely(cfs_rq->runtime_remaining > 0))
1539 return;
1540
1541
1542
1543
1544
1545 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
1546 resched_task(rq_of(cfs_rq)->curr);
1547}
1548
1549static __always_inline
1550void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
1551{
1552 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1553 return;
1554
1555 __account_cfs_rq_runtime(cfs_rq, delta_exec);
1556}
1557
1558static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1559{
1560 return cfs_bandwidth_used() && cfs_rq->throttled;
1561}
1562
1563
1564static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1565{
1566 return cfs_bandwidth_used() && cfs_rq->throttle_count;
1567}
1568
1569
1570
1571
1572
1573
1574static inline int throttled_lb_pair(struct task_group *tg,
1575 int src_cpu, int dest_cpu)
1576{
1577 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
1578
1579 src_cfs_rq = tg->cfs_rq[src_cpu];
1580 dest_cfs_rq = tg->cfs_rq[dest_cpu];
1581
1582 return throttled_hierarchy(src_cfs_rq) ||
1583 throttled_hierarchy(dest_cfs_rq);
1584}
1585
1586
1587static int tg_unthrottle_up(struct task_group *tg, void *data)
1588{
1589 struct rq *rq = data;
1590 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1591
1592 cfs_rq->throttle_count--;
1593#ifdef CONFIG_SMP
1594 if (!cfs_rq->throttle_count) {
1595 u64 delta = rq->clock_task - cfs_rq->load_stamp;
1596
1597
1598 cfs_rq->load_stamp += delta;
1599 cfs_rq->load_last += delta;
1600
1601
1602 update_cfs_shares(cfs_rq);
1603 }
1604#endif
1605
1606 return 0;
1607}
1608
1609static int tg_throttle_down(struct task_group *tg, void *data)
1610{
1611 struct rq *rq = data;
1612 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1613
1614
1615 if (!cfs_rq->throttle_count)
1616 update_cfs_load(cfs_rq, 0);
1617 cfs_rq->throttle_count++;
1618
1619 return 0;
1620}
1621
1622static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1623{
1624 struct rq *rq = rq_of(cfs_rq);
1625 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1626 struct sched_entity *se;
1627 long task_delta, dequeue = 1;
1628
1629 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1630
1631
1632 rcu_read_lock();
1633 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1634 rcu_read_unlock();
1635
1636 task_delta = cfs_rq->h_nr_running;
1637 for_each_sched_entity(se) {
1638 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
1639
1640 if (!se->on_rq)
1641 break;
1642
1643 if (dequeue)
1644 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
1645 qcfs_rq->h_nr_running -= task_delta;
1646
1647 if (qcfs_rq->load.weight)
1648 dequeue = 0;
1649 }
1650
1651 if (!se)
1652 rq->nr_running -= task_delta;
1653
1654 cfs_rq->throttled = 1;
1655 cfs_rq->throttled_timestamp = rq->clock;
1656 raw_spin_lock(&cfs_b->lock);
1657 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1658 raw_spin_unlock(&cfs_b->lock);
1659}
1660
1661void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1662{
1663 struct rq *rq = rq_of(cfs_rq);
1664 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1665 struct sched_entity *se;
1666 int enqueue = 1;
1667 long task_delta;
1668
1669 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1670
1671 cfs_rq->throttled = 0;
1672 raw_spin_lock(&cfs_b->lock);
1673 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
1674 list_del_rcu(&cfs_rq->throttled_list);
1675 raw_spin_unlock(&cfs_b->lock);
1676 cfs_rq->throttled_timestamp = 0;
1677
1678 update_rq_clock(rq);
1679
1680 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
1681
1682 if (!cfs_rq->load.weight)
1683 return;
1684
1685 task_delta = cfs_rq->h_nr_running;
1686 for_each_sched_entity(se) {
1687 if (se->on_rq)
1688 enqueue = 0;
1689
1690 cfs_rq = cfs_rq_of(se);
1691 if (enqueue)
1692 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
1693 cfs_rq->h_nr_running += task_delta;
1694
1695 if (cfs_rq_throttled(cfs_rq))
1696 break;
1697 }
1698
1699 if (!se)
1700 rq->nr_running += task_delta;
1701
1702
1703 if (rq->curr == rq->idle && rq->cfs.nr_running)
1704 resched_task(rq->curr);
1705}
1706
1707static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
1708 u64 remaining, u64 expires)
1709{
1710 struct cfs_rq *cfs_rq;
1711 u64 runtime = remaining;
1712
1713 rcu_read_lock();
1714 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
1715 throttled_list) {
1716 struct rq *rq = rq_of(cfs_rq);
1717
1718 raw_spin_lock(&rq->lock);
1719 if (!cfs_rq_throttled(cfs_rq))
1720 goto next;
1721
1722 runtime = -cfs_rq->runtime_remaining + 1;
1723 if (runtime > remaining)
1724 runtime = remaining;
1725 remaining -= runtime;
1726
1727 cfs_rq->runtime_remaining += runtime;
1728 cfs_rq->runtime_expires = expires;
1729
1730
1731 if (cfs_rq->runtime_remaining > 0)
1732 unthrottle_cfs_rq(cfs_rq);
1733
1734next:
1735 raw_spin_unlock(&rq->lock);
1736
1737 if (!remaining)
1738 break;
1739 }
1740 rcu_read_unlock();
1741
1742 return remaining;
1743}
1744
1745
1746
1747
1748
1749
1750
1751static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
1752{
1753 u64 runtime, runtime_expires;
1754 int idle = 1, throttled;
1755
1756 raw_spin_lock(&cfs_b->lock);
1757
1758 if (cfs_b->quota == RUNTIME_INF)
1759 goto out_unlock;
1760
1761 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1762
1763 idle = cfs_b->idle && !throttled;
1764 cfs_b->nr_periods += overrun;
1765
1766
1767 if (idle)
1768 goto out_unlock;
1769
1770 __refill_cfs_bandwidth_runtime(cfs_b);
1771
1772 if (!throttled) {
1773
1774 cfs_b->idle = 1;
1775 goto out_unlock;
1776 }
1777
1778
1779 cfs_b->nr_throttled += overrun;
1780
1781
1782
1783
1784
1785
1786
1787 runtime = cfs_b->runtime;
1788 runtime_expires = cfs_b->runtime_expires;
1789 cfs_b->runtime = 0;
1790
1791
1792
1793
1794
1795
1796 while (throttled && runtime > 0) {
1797 raw_spin_unlock(&cfs_b->lock);
1798
1799 runtime = distribute_cfs_runtime(cfs_b, runtime,
1800 runtime_expires);
1801 raw_spin_lock(&cfs_b->lock);
1802
1803 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1804 }
1805
1806
1807 cfs_b->runtime = runtime;
1808
1809
1810
1811
1812
1813
1814 cfs_b->idle = 0;
1815out_unlock:
1816 if (idle)
1817 cfs_b->timer_active = 0;
1818 raw_spin_unlock(&cfs_b->lock);
1819
1820 return idle;
1821}
1822
1823
1824static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
1825
1826static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
1827
1828static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
1829
1830
1831static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
1832{
1833 struct hrtimer *refresh_timer = &cfs_b->period_timer;
1834 u64 remaining;
1835
1836
1837 if (hrtimer_callback_running(refresh_timer))
1838 return 1;
1839
1840
1841 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
1842 if (remaining < min_expire)
1843 return 1;
1844
1845 return 0;
1846}
1847
1848static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
1849{
1850 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
1851
1852
1853 if (runtime_refresh_within(cfs_b, min_left))
1854 return;
1855
1856 start_bandwidth_timer(&cfs_b->slack_timer,
1857 ns_to_ktime(cfs_bandwidth_slack_period));
1858}
1859
1860
1861static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1862{
1863 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1864 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
1865
1866 if (slack_runtime <= 0)
1867 return;
1868
1869 raw_spin_lock(&cfs_b->lock);
1870 if (cfs_b->quota != RUNTIME_INF &&
1871 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
1872 cfs_b->runtime += slack_runtime;
1873
1874
1875 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
1876 !list_empty(&cfs_b->throttled_cfs_rq))
1877 start_cfs_slack_bandwidth(cfs_b);
1878 }
1879 raw_spin_unlock(&cfs_b->lock);
1880
1881
1882 cfs_rq->runtime_remaining -= slack_runtime;
1883}
1884
1885static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1886{
1887 if (!cfs_bandwidth_used())
1888 return;
1889
1890 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
1891 return;
1892
1893 __return_cfs_rq_runtime(cfs_rq);
1894}
1895
1896
1897
1898
1899
1900static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1901{
1902 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
1903 u64 expires;
1904
1905
1906 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
1907 return;
1908
1909 raw_spin_lock(&cfs_b->lock);
1910 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
1911 runtime = cfs_b->runtime;
1912 cfs_b->runtime = 0;
1913 }
1914 expires = cfs_b->runtime_expires;
1915 raw_spin_unlock(&cfs_b->lock);
1916
1917 if (!runtime)
1918 return;
1919
1920 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
1921
1922 raw_spin_lock(&cfs_b->lock);
1923 if (expires == cfs_b->runtime_expires)
1924 cfs_b->runtime = runtime;
1925 raw_spin_unlock(&cfs_b->lock);
1926}
1927
1928
1929
1930
1931
1932
1933static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1934{
1935 if (!cfs_bandwidth_used())
1936 return;
1937
1938
1939 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1940 return;
1941
1942
1943 if (cfs_rq_throttled(cfs_rq))
1944 return;
1945
1946
1947 account_cfs_rq_runtime(cfs_rq, 0);
1948 if (cfs_rq->runtime_remaining <= 0)
1949 throttle_cfs_rq(cfs_rq);
1950}
1951
1952
1953static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1954{
1955 if (!cfs_bandwidth_used())
1956 return;
1957
1958 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1959 return;
1960
1961
1962
1963
1964
1965 if (cfs_rq_throttled(cfs_rq))
1966 return;
1967
1968 throttle_cfs_rq(cfs_rq);
1969}
1970
1971static inline u64 default_cfs_period(void);
1972static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
1973static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
1974
1975static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
1976{
1977 struct cfs_bandwidth *cfs_b =
1978 container_of(timer, struct cfs_bandwidth, slack_timer);
1979 do_sched_cfs_slack_timer(cfs_b);
1980
1981 return HRTIMER_NORESTART;
1982}
1983
1984static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
1985{
1986 struct cfs_bandwidth *cfs_b =
1987 container_of(timer, struct cfs_bandwidth, period_timer);
1988 ktime_t now;
1989 int overrun;
1990 int idle = 0;
1991
1992 for (;;) {
1993 now = hrtimer_cb_get_time(timer);
1994 overrun = hrtimer_forward(timer, now, cfs_b->period);
1995
1996 if (!overrun)
1997 break;
1998
1999 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2000 }
2001
2002 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2003}
2004
2005void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2006{
2007 raw_spin_lock_init(&cfs_b->lock);
2008 cfs_b->runtime = 0;
2009 cfs_b->quota = RUNTIME_INF;
2010 cfs_b->period = ns_to_ktime(default_cfs_period());
2011
2012 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2013 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2014 cfs_b->period_timer.function = sched_cfs_period_timer;
2015 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2016 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2017}
2018
2019static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2020{
2021 cfs_rq->runtime_enabled = 0;
2022 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2023}
2024
2025
2026void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2027{
2028
2029
2030
2031
2032
2033
2034 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2035 raw_spin_unlock(&cfs_b->lock);
2036
2037 hrtimer_cancel(&cfs_b->period_timer);
2038
2039 raw_spin_lock(&cfs_b->lock);
2040
2041 if (cfs_b->timer_active)
2042 return;
2043 }
2044
2045 cfs_b->timer_active = 1;
2046 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2047}
2048
2049static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2050{
2051 hrtimer_cancel(&cfs_b->period_timer);
2052 hrtimer_cancel(&cfs_b->slack_timer);
2053}
2054
2055static void unthrottle_offline_cfs_rqs(struct rq *rq)
2056{
2057 struct cfs_rq *cfs_rq;
2058
2059 for_each_leaf_cfs_rq(rq, cfs_rq) {
2060 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2061
2062 if (!cfs_rq->runtime_enabled)
2063 continue;
2064
2065
2066
2067
2068
2069 cfs_rq->runtime_remaining = cfs_b->quota;
2070 if (cfs_rq_throttled(cfs_rq))
2071 unthrottle_cfs_rq(cfs_rq);
2072 }
2073}
2074
2075#else
2076static __always_inline
2077void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
2078static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2079static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2080static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2081
2082static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2083{
2084 return 0;
2085}
2086
2087static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2088{
2089 return 0;
2090}
2091
2092static inline int throttled_lb_pair(struct task_group *tg,
2093 int src_cpu, int dest_cpu)
2094{
2095 return 0;
2096}
2097
2098void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2099
2100#ifdef CONFIG_FAIR_GROUP_SCHED
2101static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2102#endif
2103
2104static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2105{
2106 return NULL;
2107}
2108static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2109static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2110
2111#endif
2112
2113
2114
2115
2116
2117#ifdef CONFIG_SCHED_HRTICK
2118static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2119{
2120 struct sched_entity *se = &p->se;
2121 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2122
2123 WARN_ON(task_rq(p) != rq);
2124
2125 if (cfs_rq->nr_running > 1) {
2126 u64 slice = sched_slice(cfs_rq, se);
2127 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
2128 s64 delta = slice - ran;
2129
2130 if (delta < 0) {
2131 if (rq->curr == p)
2132 resched_task(p);
2133 return;
2134 }
2135
2136
2137
2138
2139
2140 if (rq->curr != p)
2141 delta = max_t(s64, 10000LL, delta);
2142
2143 hrtick_start(rq, delta);
2144 }
2145}
2146
2147
2148
2149
2150
2151
2152static void hrtick_update(struct rq *rq)
2153{
2154 struct task_struct *curr = rq->curr;
2155
2156 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
2157 return;
2158
2159 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2160 hrtick_start_fair(rq, curr);
2161}
2162#else
2163static inline void
2164hrtick_start_fair(struct rq *rq, struct task_struct *p)
2165{
2166}
2167
2168static inline void hrtick_update(struct rq *rq)
2169{
2170}
2171#endif
2172
2173
2174
2175
2176
2177
2178static void
2179enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2180{
2181 struct cfs_rq *cfs_rq;
2182 struct sched_entity *se = &p->se;
2183
2184 for_each_sched_entity(se) {
2185 if (se->on_rq)
2186 break;
2187 cfs_rq = cfs_rq_of(se);
2188 enqueue_entity(cfs_rq, se, flags);
2189
2190
2191
2192
2193
2194
2195
2196 if (cfs_rq_throttled(cfs_rq))
2197 break;
2198 cfs_rq->h_nr_running++;
2199
2200 flags = ENQUEUE_WAKEUP;
2201 }
2202
2203 for_each_sched_entity(se) {
2204 cfs_rq = cfs_rq_of(se);
2205 cfs_rq->h_nr_running++;
2206
2207 if (cfs_rq_throttled(cfs_rq))
2208 break;
2209
2210 update_cfs_load(cfs_rq, 0);
2211 update_cfs_shares(cfs_rq);
2212 }
2213
2214 if (!se)
2215 inc_nr_running(rq);
2216 hrtick_update(rq);
2217}
2218
2219static void set_next_buddy(struct sched_entity *se);
2220
2221
2222
2223
2224
2225
2226static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2227{
2228 struct cfs_rq *cfs_rq;
2229 struct sched_entity *se = &p->se;
2230 int task_sleep = flags & DEQUEUE_SLEEP;
2231
2232 for_each_sched_entity(se) {
2233 cfs_rq = cfs_rq_of(se);
2234 dequeue_entity(cfs_rq, se, flags);
2235
2236
2237
2238
2239
2240
2241
2242 if (cfs_rq_throttled(cfs_rq))
2243 break;
2244 cfs_rq->h_nr_running--;
2245
2246
2247 if (cfs_rq->load.weight) {
2248
2249
2250
2251
2252 if (task_sleep && parent_entity(se))
2253 set_next_buddy(parent_entity(se));
2254
2255
2256 se = parent_entity(se);
2257 break;
2258 }
2259 flags |= DEQUEUE_SLEEP;
2260 }
2261
2262 for_each_sched_entity(se) {
2263 cfs_rq = cfs_rq_of(se);
2264 cfs_rq->h_nr_running--;
2265
2266 if (cfs_rq_throttled(cfs_rq))
2267 break;
2268
2269 update_cfs_load(cfs_rq, 0);
2270 update_cfs_shares(cfs_rq);
2271 }
2272
2273 if (!se)
2274 dec_nr_running(rq);
2275 hrtick_update(rq);
2276}
2277
2278#ifdef CONFIG_SMP
2279
2280static unsigned long weighted_cpuload(const int cpu)
2281{
2282 return cpu_rq(cpu)->load.weight;
2283}
2284
2285
2286
2287
2288
2289
2290
2291
2292static unsigned long source_load(int cpu, int type)
2293{
2294 struct rq *rq = cpu_rq(cpu);
2295 unsigned long total = weighted_cpuload(cpu);
2296
2297 if (type == 0 || !sched_feat(LB_BIAS))
2298 return total;
2299
2300 return min(rq->cpu_load[type-1], total);
2301}
2302
2303
2304
2305
2306
2307static unsigned long target_load(int cpu, int type)
2308{
2309 struct rq *rq = cpu_rq(cpu);
2310 unsigned long total = weighted_cpuload(cpu);
2311
2312 if (type == 0 || !sched_feat(LB_BIAS))
2313 return total;
2314
2315 return max(rq->cpu_load[type-1], total);
2316}
2317
2318static unsigned long power_of(int cpu)
2319{
2320 return cpu_rq(cpu)->cpu_power;
2321}
2322
2323static unsigned long cpu_avg_load_per_task(int cpu)
2324{
2325 struct rq *rq = cpu_rq(cpu);
2326 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2327
2328 if (nr_running)
2329 return rq->load.weight / nr_running;
2330
2331 return 0;
2332}
2333
2334
2335static void task_waking_fair(struct task_struct *p)
2336{
2337 struct sched_entity *se = &p->se;
2338 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2339 u64 min_vruntime;
2340
2341#ifndef CONFIG_64BIT
2342 u64 min_vruntime_copy;
2343
2344 do {
2345 min_vruntime_copy = cfs_rq->min_vruntime_copy;
2346 smp_rmb();
2347 min_vruntime = cfs_rq->min_vruntime;
2348 } while (min_vruntime != min_vruntime_copy);
2349#else
2350 min_vruntime = cfs_rq->min_vruntime;
2351#endif
2352
2353 se->vruntime -= min_vruntime;
2354}
2355
2356#ifdef CONFIG_FAIR_GROUP_SCHED
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
2408{
2409 struct sched_entity *se = tg->se[cpu];
2410
2411 if (!tg->parent)
2412 return wl;
2413
2414 for_each_sched_entity(se) {
2415 long w, W;
2416
2417 tg = se->my_q->tg;
2418
2419
2420
2421
2422 W = wg + calc_tg_weight(tg, se->my_q);
2423
2424
2425
2426
2427 w = se->my_q->load.weight + wl;
2428
2429
2430
2431
2432 if (W > 0 && w < W)
2433 wl = (w * tg->shares) / W;
2434 else
2435 wl = tg->shares;
2436
2437
2438
2439
2440
2441
2442 if (wl < MIN_SHARES)
2443 wl = MIN_SHARES;
2444
2445
2446
2447
2448 wl -= se->load.weight;
2449
2450
2451
2452
2453
2454
2455
2456
2457 wg = 0;
2458 }
2459
2460 return wl;
2461}
2462#else
2463
2464static inline unsigned long effective_load(struct task_group *tg, int cpu,
2465 unsigned long wl, unsigned long wg)
2466{
2467 return wl;
2468}
2469
2470#endif
2471
2472static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
2473{
2474 s64 this_load, load;
2475 int idx, this_cpu, prev_cpu;
2476 unsigned long tl_per_task;
2477 struct task_group *tg;
2478 unsigned long weight;
2479 int balanced;
2480
2481 idx = sd->wake_idx;
2482 this_cpu = smp_processor_id();
2483 prev_cpu = task_cpu(p);
2484 load = source_load(prev_cpu, idx);
2485 this_load = target_load(this_cpu, idx);
2486
2487
2488
2489
2490
2491
2492 if (sync) {
2493 tg = task_group(current);
2494 weight = current->se.load.weight;
2495
2496 this_load += effective_load(tg, this_cpu, -weight, -weight);
2497 load += effective_load(tg, prev_cpu, 0, -weight);
2498 }
2499
2500 tg = task_group(p);
2501 weight = p->se.load.weight;
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512 if (this_load > 0) {
2513 s64 this_eff_load, prev_eff_load;
2514
2515 this_eff_load = 100;
2516 this_eff_load *= power_of(prev_cpu);
2517 this_eff_load *= this_load +
2518 effective_load(tg, this_cpu, weight, weight);
2519
2520 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
2521 prev_eff_load *= power_of(this_cpu);
2522 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
2523
2524 balanced = this_eff_load <= prev_eff_load;
2525 } else
2526 balanced = true;
2527
2528
2529
2530
2531
2532
2533 if (sync && balanced)
2534 return 1;
2535
2536 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
2537 tl_per_task = cpu_avg_load_per_task(this_cpu);
2538
2539 if (balanced ||
2540 (this_load <= load &&
2541 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
2542
2543
2544
2545
2546
2547 schedstat_inc(sd, ttwu_move_affine);
2548 schedstat_inc(p, se.statistics.nr_wakeups_affine);
2549
2550 return 1;
2551 }
2552 return 0;
2553}
2554
2555
2556
2557
2558
2559static struct sched_group *
2560find_idlest_group(struct sched_domain *sd, struct task_struct *p,
2561 int this_cpu, int load_idx)
2562{
2563 struct sched_group *idlest = NULL, *group = sd->groups;
2564 unsigned long min_load = ULONG_MAX, this_load = 0;
2565 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2566
2567 do {
2568 unsigned long load, avg_load;
2569 int local_group;
2570 int i;
2571
2572
2573 if (!cpumask_intersects(sched_group_cpus(group),
2574 tsk_cpus_allowed(p)))
2575 continue;
2576
2577 local_group = cpumask_test_cpu(this_cpu,
2578 sched_group_cpus(group));
2579
2580
2581 avg_load = 0;
2582
2583 for_each_cpu(i, sched_group_cpus(group)) {
2584
2585 if (local_group)
2586 load = source_load(i, load_idx);
2587 else
2588 load = target_load(i, load_idx);
2589
2590 avg_load += load;
2591 }
2592
2593
2594 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
2595
2596 if (local_group) {
2597 this_load = avg_load;
2598 } else if (avg_load < min_load) {
2599 min_load = avg_load;
2600 idlest = group;
2601 }
2602 } while (group = group->next, group != sd->groups);
2603
2604 if (!idlest || 100*this_load < imbalance*min_load)
2605 return NULL;
2606 return idlest;
2607}
2608
2609
2610
2611
2612static int
2613find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2614{
2615 unsigned long load, min_load = ULONG_MAX;
2616 int idlest = -1;
2617 int i;
2618
2619
2620 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
2621 load = weighted_cpuload(i);
2622
2623 if (load < min_load || (load == min_load && i == this_cpu)) {
2624 min_load = load;
2625 idlest = i;
2626 }
2627 }
2628
2629 return idlest;
2630}
2631
2632
2633
2634
2635static int select_idle_sibling(struct task_struct *p, int target)
2636{
2637 int cpu = smp_processor_id();
2638 int prev_cpu = task_cpu(p);
2639 struct sched_domain *sd;
2640 struct sched_group *sg;
2641 int i;
2642
2643
2644
2645
2646
2647 if (target == cpu && idle_cpu(cpu))
2648 return cpu;
2649
2650
2651
2652
2653
2654 if (target == prev_cpu && idle_cpu(prev_cpu))
2655 return prev_cpu;
2656
2657
2658
2659
2660 sd = rcu_dereference(per_cpu(sd_llc, target));
2661 for_each_lower_domain(sd) {
2662 sg = sd->groups;
2663 do {
2664 if (!cpumask_intersects(sched_group_cpus(sg),
2665 tsk_cpus_allowed(p)))
2666 goto next;
2667
2668 for_each_cpu(i, sched_group_cpus(sg)) {
2669 if (!idle_cpu(i))
2670 goto next;
2671 }
2672
2673 target = cpumask_first_and(sched_group_cpus(sg),
2674 tsk_cpus_allowed(p));
2675 goto done;
2676next:
2677 sg = sg->next;
2678 } while (sg != sd->groups);
2679 }
2680done:
2681 return target;
2682}
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695static int
2696select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2697{
2698 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
2699 int cpu = smp_processor_id();
2700 int prev_cpu = task_cpu(p);
2701 int new_cpu = cpu;
2702 int want_affine = 0;
2703 int want_sd = 1;
2704 int sync = wake_flags & WF_SYNC;
2705
2706 if (p->nr_cpus_allowed == 1)
2707 return prev_cpu;
2708
2709 if (sd_flag & SD_BALANCE_WAKE) {
2710 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
2711 want_affine = 1;
2712 new_cpu = prev_cpu;
2713 }
2714
2715 rcu_read_lock();
2716 for_each_domain(cpu, tmp) {
2717 if (!(tmp->flags & SD_LOAD_BALANCE))
2718 continue;
2719
2720
2721
2722
2723
2724 if (tmp->flags & (SD_PREFER_LOCAL)) {
2725 unsigned long power = 0;
2726 unsigned long nr_running = 0;
2727 unsigned long capacity;
2728 int i;
2729
2730 for_each_cpu(i, sched_domain_span(tmp)) {
2731 power += power_of(i);
2732 nr_running += cpu_rq(i)->cfs.nr_running;
2733 }
2734
2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
2736
2737 if (nr_running < capacity)
2738 want_sd = 0;
2739 }
2740
2741
2742
2743
2744
2745 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
2746 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
2747 affine_sd = tmp;
2748 want_affine = 0;
2749 }
2750
2751 if (!want_sd && !want_affine)
2752 break;
2753
2754 if (!(tmp->flags & sd_flag))
2755 continue;
2756
2757 if (want_sd)
2758 sd = tmp;
2759 }
2760
2761 if (affine_sd) {
2762 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
2763 prev_cpu = cpu;
2764
2765 new_cpu = select_idle_sibling(p, prev_cpu);
2766 goto unlock;
2767 }
2768
2769 while (sd) {
2770 int load_idx = sd->forkexec_idx;
2771 struct sched_group *group;
2772 int weight;
2773
2774 if (!(sd->flags & sd_flag)) {
2775 sd = sd->child;
2776 continue;
2777 }
2778
2779 if (sd_flag & SD_BALANCE_WAKE)
2780 load_idx = sd->wake_idx;
2781
2782 group = find_idlest_group(sd, p, cpu, load_idx);
2783 if (!group) {
2784 sd = sd->child;
2785 continue;
2786 }
2787
2788 new_cpu = find_idlest_cpu(group, p, cpu);
2789 if (new_cpu == -1 || new_cpu == cpu) {
2790
2791 sd = sd->child;
2792 continue;
2793 }
2794
2795
2796 cpu = new_cpu;
2797 weight = sd->span_weight;
2798 sd = NULL;
2799 for_each_domain(cpu, tmp) {
2800 if (weight <= tmp->span_weight)
2801 break;
2802 if (tmp->flags & sd_flag)
2803 sd = tmp;
2804 }
2805
2806 }
2807unlock:
2808 rcu_read_unlock();
2809
2810 return new_cpu;
2811}
2812#endif
2813
2814static unsigned long
2815wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
2816{
2817 unsigned long gran = sysctl_sched_wakeup_granularity;
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832 return calc_delta_fair(gran, se);
2833}
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849static int
2850wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
2851{
2852 s64 gran, vdiff = curr->vruntime - se->vruntime;
2853
2854 if (vdiff <= 0)
2855 return -1;
2856
2857 gran = wakeup_gran(curr, se);
2858 if (vdiff > gran)
2859 return 1;
2860
2861 return 0;
2862}
2863
2864static void set_last_buddy(struct sched_entity *se)
2865{
2866 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
2867 return;
2868
2869 for_each_sched_entity(se)
2870 cfs_rq_of(se)->last = se;
2871}
2872
2873static void set_next_buddy(struct sched_entity *se)
2874{
2875 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
2876 return;
2877
2878 for_each_sched_entity(se)
2879 cfs_rq_of(se)->next = se;
2880}
2881
2882static void set_skip_buddy(struct sched_entity *se)
2883{
2884 for_each_sched_entity(se)
2885 cfs_rq_of(se)->skip = se;
2886}
2887
2888
2889
2890
2891static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2892{
2893 struct task_struct *curr = rq->curr;
2894 struct sched_entity *se = &curr->se, *pse = &p->se;
2895 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
2896 int scale = cfs_rq->nr_running >= sched_nr_latency;
2897 int next_buddy_marked = 0;
2898
2899 if (unlikely(se == pse))
2900 return;
2901
2902
2903
2904
2905
2906
2907
2908 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
2909 return;
2910
2911 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
2912 set_next_buddy(pse);
2913 next_buddy_marked = 1;
2914 }
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926 if (test_tsk_need_resched(curr))
2927 return;
2928
2929
2930 if (unlikely(curr->policy == SCHED_IDLE) &&
2931 likely(p->policy != SCHED_IDLE))
2932 goto preempt;
2933
2934
2935
2936
2937
2938 if (unlikely(p->policy != SCHED_NORMAL))
2939 return;
2940
2941 find_matching_se(&se, &pse);
2942 update_curr(cfs_rq_of(se));
2943 BUG_ON(!pse);
2944 if (wakeup_preempt_entity(se, pse) == 1) {
2945
2946
2947
2948
2949 if (!next_buddy_marked)
2950 set_next_buddy(pse);
2951 goto preempt;
2952 }
2953
2954 return;
2955
2956preempt:
2957 resched_task(curr);
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967 if (unlikely(!se->on_rq || curr == rq->idle))
2968 return;
2969
2970 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
2971 set_last_buddy(se);
2972}
2973
2974static struct task_struct *pick_next_task_fair(struct rq *rq)
2975{
2976 struct task_struct *p;
2977 struct cfs_rq *cfs_rq = &rq->cfs;
2978 struct sched_entity *se;
2979
2980 if (!cfs_rq->nr_running)
2981 return NULL;
2982
2983 do {
2984 se = pick_next_entity(cfs_rq);
2985 set_next_entity(cfs_rq, se);
2986 cfs_rq = group_cfs_rq(se);
2987 } while (cfs_rq);
2988
2989 p = task_of(se);
2990 if (hrtick_enabled(rq))
2991 hrtick_start_fair(rq, p);
2992
2993 return p;
2994}
2995
2996
2997
2998
2999static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
3000{
3001 struct sched_entity *se = &prev->se;
3002 struct cfs_rq *cfs_rq;
3003
3004 for_each_sched_entity(se) {
3005 cfs_rq = cfs_rq_of(se);
3006 put_prev_entity(cfs_rq, se);
3007 }
3008}
3009
3010
3011
3012
3013
3014
3015static void yield_task_fair(struct rq *rq)
3016{
3017 struct task_struct *curr = rq->curr;
3018 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3019 struct sched_entity *se = &curr->se;
3020
3021
3022
3023
3024 if (unlikely(rq->nr_running == 1))
3025 return;
3026
3027 clear_buddies(cfs_rq, se);
3028
3029 if (curr->policy != SCHED_BATCH) {
3030 update_rq_clock(rq);
3031
3032
3033
3034 update_curr(cfs_rq);
3035
3036
3037
3038
3039
3040 rq->skip_clock_update = 1;
3041 }
3042
3043 set_skip_buddy(se);
3044}
3045
3046static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
3047{
3048 struct sched_entity *se = &p->se;
3049
3050
3051 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
3052 return false;
3053
3054
3055 set_next_buddy(se);
3056
3057 yield_task_fair(rq);
3058
3059 return true;
3060}
3061
3062#ifdef CONFIG_SMP
3063
3064
3065
3066
3067static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3068
3069#define LBF_ALL_PINNED 0x01
3070#define LBF_NEED_BREAK 0x02
3071#define LBF_SOME_PINNED 0x04
3072
3073struct lb_env {
3074 struct sched_domain *sd;
3075
3076 struct rq *src_rq;
3077 int src_cpu;
3078
3079 int dst_cpu;
3080 struct rq *dst_rq;
3081
3082 struct cpumask *dst_grpmask;
3083 int new_dst_cpu;
3084 enum cpu_idle_type idle;
3085 long imbalance;
3086
3087 struct cpumask *cpus;
3088
3089 unsigned int flags;
3090
3091 unsigned int loop;
3092 unsigned int loop_break;
3093 unsigned int loop_max;
3094};
3095
3096
3097
3098
3099
3100static void move_task(struct task_struct *p, struct lb_env *env)
3101{
3102 deactivate_task(env->src_rq, p, 0);
3103 set_task_cpu(p, env->dst_cpu);
3104 activate_task(env->dst_rq, p, 0);
3105 check_preempt_curr(env->dst_rq, p, 0);
3106}
3107
3108
3109
3110
3111static int
3112task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3113{
3114 s64 delta;
3115
3116 if (p->sched_class != &fair_sched_class)
3117 return 0;
3118
3119 if (unlikely(p->policy == SCHED_IDLE))
3120 return 0;
3121
3122
3123
3124
3125 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3126 (&p->se == cfs_rq_of(&p->se)->next ||
3127 &p->se == cfs_rq_of(&p->se)->last))
3128 return 1;
3129
3130 if (sysctl_sched_migration_cost == -1)
3131 return 1;
3132 if (sysctl_sched_migration_cost == 0)
3133 return 0;
3134
3135 delta = now - p->se.exec_start;
3136
3137 return delta < (s64)sysctl_sched_migration_cost;
3138}
3139
3140
3141
3142
3143static
3144int can_migrate_task(struct task_struct *p, struct lb_env *env)
3145{
3146 int tsk_cache_hot = 0;
3147
3148
3149
3150
3151
3152
3153 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3154 int new_dst_cpu;
3155
3156 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3167 return 0;
3168
3169 new_dst_cpu = cpumask_first_and(env->dst_grpmask,
3170 tsk_cpus_allowed(p));
3171 if (new_dst_cpu < nr_cpu_ids) {
3172 env->flags |= LBF_SOME_PINNED;
3173 env->new_dst_cpu = new_dst_cpu;
3174 }
3175 return 0;
3176 }
3177
3178
3179 env->flags &= ~LBF_ALL_PINNED;
3180
3181 if (task_running(env->src_rq, p)) {
3182 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
3183 return 0;
3184 }
3185
3186
3187
3188
3189
3190
3191
3192 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3193 if (!tsk_cache_hot ||
3194 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3195#ifdef CONFIG_SCHEDSTATS
3196 if (tsk_cache_hot) {
3197 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3198 schedstat_inc(p, se.statistics.nr_forced_migrations);
3199 }
3200#endif
3201 return 1;
3202 }
3203
3204 if (tsk_cache_hot) {
3205 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
3206 return 0;
3207 }
3208 return 1;
3209}
3210
3211
3212
3213
3214
3215
3216
3217
3218static int move_one_task(struct lb_env *env)
3219{
3220 struct task_struct *p, *n;
3221
3222 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3223 if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3224 continue;
3225
3226 if (!can_migrate_task(p, env))
3227 continue;
3228
3229 move_task(p, env);
3230
3231
3232
3233
3234
3235 schedstat_inc(env->sd, lb_gained[env->idle]);
3236 return 1;
3237 }
3238 return 0;
3239}
3240
3241static unsigned long task_h_load(struct task_struct *p);
3242
3243static const unsigned int sched_nr_migrate_break = 32;
3244
3245
3246
3247
3248
3249
3250
3251
3252static int move_tasks(struct lb_env *env)
3253{
3254 struct list_head *tasks = &env->src_rq->cfs_tasks;
3255 struct task_struct *p;
3256 unsigned long load;
3257 int pulled = 0;
3258
3259 if (env->imbalance <= 0)
3260 return 0;
3261
3262 while (!list_empty(tasks)) {
3263 p = list_first_entry(tasks, struct task_struct, se.group_node);
3264
3265 env->loop++;
3266
3267 if (env->loop > env->loop_max)
3268 break;
3269
3270
3271 if (env->loop > env->loop_break) {
3272 env->loop_break += sched_nr_migrate_break;
3273 env->flags |= LBF_NEED_BREAK;
3274 break;
3275 }
3276
3277 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3278 goto next;
3279
3280 load = task_h_load(p);
3281
3282 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
3283 goto next;
3284
3285 if ((load / 2) > env->imbalance)
3286 goto next;
3287
3288 if (!can_migrate_task(p, env))
3289 goto next;
3290
3291 move_task(p, env);
3292 pulled++;
3293 env->imbalance -= load;
3294
3295#ifdef CONFIG_PREEMPT
3296
3297
3298
3299
3300
3301 if (env->idle == CPU_NEWLY_IDLE)
3302 break;
3303#endif
3304
3305
3306
3307
3308
3309 if (env->imbalance <= 0)
3310 break;
3311
3312 continue;
3313next:
3314 list_move_tail(&p->se.group_node, tasks);
3315 }
3316
3317
3318
3319
3320
3321
3322 schedstat_add(env->sd, lb_gained[env->idle], pulled);
3323
3324 return pulled;
3325}
3326
3327#ifdef CONFIG_FAIR_GROUP_SCHED
3328
3329
3330
3331static int update_shares_cpu(struct task_group *tg, int cpu)
3332{
3333 struct cfs_rq *cfs_rq;
3334 unsigned long flags;
3335 struct rq *rq;
3336
3337 if (!tg->se[cpu])
3338 return 0;
3339
3340 rq = cpu_rq(cpu);
3341 cfs_rq = tg->cfs_rq[cpu];
3342
3343 raw_spin_lock_irqsave(&rq->lock, flags);
3344
3345 update_rq_clock(rq);
3346 update_cfs_load(cfs_rq, 1);
3347
3348
3349
3350
3351
3352 update_cfs_shares(cfs_rq);
3353
3354 raw_spin_unlock_irqrestore(&rq->lock, flags);
3355
3356 return 0;
3357}
3358
3359static void update_shares(int cpu)
3360{
3361 struct cfs_rq *cfs_rq;
3362 struct rq *rq = cpu_rq(cpu);
3363
3364 rcu_read_lock();
3365
3366
3367
3368
3369 for_each_leaf_cfs_rq(rq, cfs_rq) {
3370
3371 if (throttled_hierarchy(cfs_rq))
3372 continue;
3373
3374 update_shares_cpu(cfs_rq->tg, cpu);
3375 }
3376 rcu_read_unlock();
3377}
3378
3379
3380
3381
3382
3383
3384static int tg_load_down(struct task_group *tg, void *data)
3385{
3386 unsigned long load;
3387 long cpu = (long)data;
3388
3389 if (!tg->parent) {
3390 load = cpu_rq(cpu)->load.weight;
3391 } else {
3392 load = tg->parent->cfs_rq[cpu]->h_load;
3393 load *= tg->se[cpu]->load.weight;
3394 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
3395 }
3396
3397 tg->cfs_rq[cpu]->h_load = load;
3398
3399 return 0;
3400}
3401
3402static void update_h_load(long cpu)
3403{
3404 struct rq *rq = cpu_rq(cpu);
3405 unsigned long now = jiffies;
3406
3407 if (rq->h_load_throttle == now)
3408 return;
3409
3410 rq->h_load_throttle = now;
3411
3412 rcu_read_lock();
3413 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
3414 rcu_read_unlock();
3415}
3416
3417static unsigned long task_h_load(struct task_struct *p)
3418{
3419 struct cfs_rq *cfs_rq = task_cfs_rq(p);
3420 unsigned long load;
3421
3422 load = p->se.load.weight;
3423 load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
3424
3425 return load;
3426}
3427#else
3428static inline void update_shares(int cpu)
3429{
3430}
3431
3432static inline void update_h_load(long cpu)
3433{
3434}
3435
3436static unsigned long task_h_load(struct task_struct *p)
3437{
3438 return p->se.load.weight;
3439}
3440#endif
3441
3442
3443
3444
3445
3446
3447struct sd_lb_stats {
3448 struct sched_group *busiest;
3449 struct sched_group *this;
3450 unsigned long total_load;
3451 unsigned long total_pwr;
3452 unsigned long avg_load;
3453
3454
3455 unsigned long this_load;
3456 unsigned long this_load_per_task;
3457 unsigned long this_nr_running;
3458 unsigned long this_has_capacity;
3459 unsigned int this_idle_cpus;
3460
3461
3462 unsigned int busiest_idle_cpus;
3463 unsigned long max_load;
3464 unsigned long busiest_load_per_task;
3465 unsigned long busiest_nr_running;
3466 unsigned long busiest_group_capacity;
3467 unsigned long busiest_has_capacity;
3468 unsigned int busiest_group_weight;
3469
3470 int group_imb;
3471};
3472
3473
3474
3475
3476struct sg_lb_stats {
3477 unsigned long avg_load;
3478 unsigned long group_load;
3479 unsigned long sum_nr_running;
3480 unsigned long sum_weighted_load;
3481 unsigned long group_capacity;
3482 unsigned long idle_cpus;
3483 unsigned long group_weight;
3484 int group_imb;
3485 int group_has_capacity;
3486};
3487
3488
3489
3490
3491
3492
3493static inline int get_sd_load_idx(struct sched_domain *sd,
3494 enum cpu_idle_type idle)
3495{
3496 int load_idx;
3497
3498 switch (idle) {
3499 case CPU_NOT_IDLE:
3500 load_idx = sd->busy_idx;
3501 break;
3502
3503 case CPU_NEWLY_IDLE:
3504 load_idx = sd->newidle_idx;
3505 break;
3506 default:
3507 load_idx = sd->idle_idx;
3508 break;
3509 }
3510
3511 return load_idx;
3512}
3513
3514unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3515{
3516 return SCHED_POWER_SCALE;
3517}
3518
3519unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3520{
3521 return default_scale_freq_power(sd, cpu);
3522}
3523
3524unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3525{
3526 unsigned long weight = sd->span_weight;
3527 unsigned long smt_gain = sd->smt_gain;
3528
3529 smt_gain /= weight;
3530
3531 return smt_gain;
3532}
3533
3534unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3535{
3536 return default_scale_smt_power(sd, cpu);
3537}
3538
3539unsigned long scale_rt_power(int cpu)
3540{
3541 struct rq *rq = cpu_rq(cpu);
3542 u64 total, available, age_stamp, avg;
3543
3544
3545
3546
3547
3548 age_stamp = ACCESS_ONCE(rq->age_stamp);
3549 avg = ACCESS_ONCE(rq->rt_avg);
3550
3551 total = sched_avg_period() + (rq->clock - age_stamp);
3552
3553 if (unlikely(total < avg)) {
3554
3555 available = 0;
3556 } else {
3557 available = total - avg;
3558 }
3559
3560 if (unlikely((s64)total < SCHED_POWER_SCALE))
3561 total = SCHED_POWER_SCALE;
3562
3563 total >>= SCHED_POWER_SHIFT;
3564
3565 return div_u64(available, total);
3566}
3567
3568static void update_cpu_power(struct sched_domain *sd, int cpu)
3569{
3570 unsigned long weight = sd->span_weight;
3571 unsigned long power = SCHED_POWER_SCALE;
3572 struct sched_group *sdg = sd->groups;
3573
3574 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3575 if (sched_feat(ARCH_POWER))
3576 power *= arch_scale_smt_power(sd, cpu);
3577 else
3578 power *= default_scale_smt_power(sd, cpu);
3579
3580 power >>= SCHED_POWER_SHIFT;
3581 }
3582
3583 sdg->sgp->power_orig = power;
3584
3585 if (sched_feat(ARCH_POWER))
3586 power *= arch_scale_freq_power(sd, cpu);
3587 else
3588 power *= default_scale_freq_power(sd, cpu);
3589
3590 power >>= SCHED_POWER_SHIFT;
3591
3592 power *= scale_rt_power(cpu);
3593 power >>= SCHED_POWER_SHIFT;
3594
3595 if (!power)
3596 power = 1;
3597
3598 cpu_rq(cpu)->cpu_power = power;
3599 sdg->sgp->power = power;
3600}
3601
3602void update_group_power(struct sched_domain *sd, int cpu)
3603{
3604 struct sched_domain *child = sd->child;
3605 struct sched_group *group, *sdg = sd->groups;
3606 unsigned long power;
3607 unsigned long interval;
3608
3609 interval = msecs_to_jiffies(sd->balance_interval);
3610 interval = clamp(interval, 1UL, max_load_balance_interval);
3611 sdg->sgp->next_update = jiffies + interval;
3612
3613 if (!child) {
3614 update_cpu_power(sd, cpu);
3615 return;
3616 }
3617
3618 power = 0;
3619
3620 if (child->flags & SD_OVERLAP) {
3621
3622
3623
3624
3625
3626 for_each_cpu(cpu, sched_group_cpus(sdg))
3627 power += power_of(cpu);
3628 } else {
3629
3630
3631
3632
3633
3634 group = child->groups;
3635 do {
3636 power += group->sgp->power;
3637 group = group->next;
3638 } while (group != child->groups);
3639 }
3640
3641 sdg->sgp->power_orig = sdg->sgp->power = power;
3642}
3643
3644
3645
3646
3647
3648
3649
3650
3651static inline int
3652fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3653{
3654
3655
3656
3657 if (!(sd->flags & SD_SHARE_CPUPOWER))
3658 return 0;
3659
3660
3661
3662
3663 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
3664 return 1;
3665
3666 return 0;
3667}
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678static inline void update_sg_lb_stats(struct lb_env *env,
3679 struct sched_group *group, int load_idx,
3680 int local_group, int *balance, struct sg_lb_stats *sgs)
3681{
3682 unsigned long nr_running, max_nr_running, min_nr_running;
3683 unsigned long load, max_cpu_load, min_cpu_load;
3684 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3685 unsigned long avg_load_per_task = 0;
3686 int i;
3687
3688 if (local_group)
3689 balance_cpu = group_balance_cpu(group);
3690
3691
3692 max_cpu_load = 0;
3693 min_cpu_load = ~0UL;
3694 max_nr_running = 0;
3695 min_nr_running = ~0UL;
3696
3697 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
3698 struct rq *rq = cpu_rq(i);
3699
3700 nr_running = rq->nr_running;
3701
3702
3703 if (local_group) {
3704 if (idle_cpu(i) && !first_idle_cpu &&
3705 cpumask_test_cpu(i, sched_group_mask(group))) {
3706 first_idle_cpu = 1;
3707 balance_cpu = i;
3708 }
3709
3710 load = target_load(i, load_idx);
3711 } else {
3712 load = source_load(i, load_idx);
3713 if (load > max_cpu_load)
3714 max_cpu_load = load;
3715 if (min_cpu_load > load)
3716 min_cpu_load = load;
3717
3718 if (nr_running > max_nr_running)
3719 max_nr_running = nr_running;
3720 if (min_nr_running > nr_running)
3721 min_nr_running = nr_running;
3722 }
3723
3724 sgs->group_load += load;
3725 sgs->sum_nr_running += nr_running;
3726 sgs->sum_weighted_load += weighted_cpuload(i);
3727 if (idle_cpu(i))
3728 sgs->idle_cpus++;
3729 }
3730
3731
3732
3733
3734
3735
3736
3737 if (local_group) {
3738 if (env->idle != CPU_NEWLY_IDLE) {
3739 if (balance_cpu != env->dst_cpu) {
3740 *balance = 0;
3741 return;
3742 }
3743 update_group_power(env->sd, env->dst_cpu);
3744 } else if (time_after_eq(jiffies, group->sgp->next_update))
3745 update_group_power(env->sd, env->dst_cpu);
3746 }
3747
3748
3749 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760 if (sgs->sum_nr_running)
3761 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
3762
3763 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
3764 (max_nr_running - min_nr_running) > 1)
3765 sgs->group_imb = 1;
3766
3767 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
3768 SCHED_POWER_SCALE);
3769 if (!sgs->group_capacity)
3770 sgs->group_capacity = fix_small_capacity(env->sd, group);
3771 sgs->group_weight = group->group_weight;
3772
3773 if (sgs->group_capacity > sgs->sum_nr_running)
3774 sgs->group_has_capacity = 1;
3775}
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787static bool update_sd_pick_busiest(struct lb_env *env,
3788 struct sd_lb_stats *sds,
3789 struct sched_group *sg,
3790 struct sg_lb_stats *sgs)
3791{
3792 if (sgs->avg_load <= sds->max_load)
3793 return false;
3794
3795 if (sgs->sum_nr_running > sgs->group_capacity)
3796 return true;
3797
3798 if (sgs->group_imb)
3799 return true;
3800
3801
3802
3803
3804
3805
3806 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
3807 env->dst_cpu < group_first_cpu(sg)) {
3808 if (!sds->busiest)
3809 return true;
3810
3811 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
3812 return true;
3813 }
3814
3815 return false;
3816}
3817
3818
3819
3820
3821
3822
3823
3824static inline void update_sd_lb_stats(struct lb_env *env,
3825 int *balance, struct sd_lb_stats *sds)
3826{
3827 struct sched_domain *child = env->sd->child;
3828 struct sched_group *sg = env->sd->groups;
3829 struct sg_lb_stats sgs;
3830 int load_idx, prefer_sibling = 0;
3831
3832 if (child && child->flags & SD_PREFER_SIBLING)
3833 prefer_sibling = 1;
3834
3835 load_idx = get_sd_load_idx(env->sd, env->idle);
3836
3837 do {
3838 int local_group;
3839
3840 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
3841 memset(&sgs, 0, sizeof(sgs));
3842 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
3843
3844 if (local_group && !(*balance))
3845 return;
3846
3847 sds->total_load += sgs.group_load;
3848 sds->total_pwr += sg->sgp->power;
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860 if (prefer_sibling && !local_group && sds->this_has_capacity)
3861 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3862
3863 if (local_group) {
3864 sds->this_load = sgs.avg_load;
3865 sds->this = sg;
3866 sds->this_nr_running = sgs.sum_nr_running;
3867 sds->this_load_per_task = sgs.sum_weighted_load;
3868 sds->this_has_capacity = sgs.group_has_capacity;
3869 sds->this_idle_cpus = sgs.idle_cpus;
3870 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
3871 sds->max_load = sgs.avg_load;
3872 sds->busiest = sg;
3873 sds->busiest_nr_running = sgs.sum_nr_running;
3874 sds->busiest_idle_cpus = sgs.idle_cpus;
3875 sds->busiest_group_capacity = sgs.group_capacity;
3876 sds->busiest_load_per_task = sgs.sum_weighted_load;
3877 sds->busiest_has_capacity = sgs.group_has_capacity;
3878 sds->busiest_group_weight = sgs.group_weight;
3879 sds->group_imb = sgs.group_imb;
3880 }
3881
3882 sg = sg->next;
3883 } while (sg != env->sd->groups);
3884}
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
3910{
3911 int busiest_cpu;
3912
3913 if (!(env->sd->flags & SD_ASYM_PACKING))
3914 return 0;
3915
3916 if (!sds->busiest)
3917 return 0;
3918
3919 busiest_cpu = group_first_cpu(sds->busiest);
3920 if (env->dst_cpu > busiest_cpu)
3921 return 0;
3922
3923 env->imbalance = DIV_ROUND_CLOSEST(
3924 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
3925
3926 return 1;
3927}
3928
3929
3930
3931
3932
3933
3934
3935
3936static inline
3937void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
3938{
3939 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3940 unsigned int imbn = 2;
3941 unsigned long scaled_busy_load_per_task;
3942
3943 if (sds->this_nr_running) {
3944 sds->this_load_per_task /= sds->this_nr_running;
3945 if (sds->busiest_load_per_task >
3946 sds->this_load_per_task)
3947 imbn = 1;
3948 } else {
3949 sds->this_load_per_task =
3950 cpu_avg_load_per_task(env->dst_cpu);
3951 }
3952
3953 scaled_busy_load_per_task = sds->busiest_load_per_task
3954 * SCHED_POWER_SCALE;
3955 scaled_busy_load_per_task /= sds->busiest->sgp->power;
3956
3957 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
3958 (scaled_busy_load_per_task * imbn)) {
3959 env->imbalance = sds->busiest_load_per_task;
3960 return;
3961 }
3962
3963
3964
3965
3966
3967
3968
3969 pwr_now += sds->busiest->sgp->power *
3970 min(sds->busiest_load_per_task, sds->max_load);
3971 pwr_now += sds->this->sgp->power *
3972 min(sds->this_load_per_task, sds->this_load);
3973 pwr_now /= SCHED_POWER_SCALE;
3974
3975
3976 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
3977 sds->busiest->sgp->power;
3978 if (sds->max_load > tmp)
3979 pwr_move += sds->busiest->sgp->power *
3980 min(sds->busiest_load_per_task, sds->max_load - tmp);
3981
3982
3983 if (sds->max_load * sds->busiest->sgp->power <
3984 sds->busiest_load_per_task * SCHED_POWER_SCALE)
3985 tmp = (sds->max_load * sds->busiest->sgp->power) /
3986 sds->this->sgp->power;
3987 else
3988 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
3989 sds->this->sgp->power;
3990 pwr_move += sds->this->sgp->power *
3991 min(sds->this_load_per_task, sds->this_load + tmp);
3992 pwr_move /= SCHED_POWER_SCALE;
3993
3994
3995 if (pwr_move > pwr_now)
3996 env->imbalance = sds->busiest_load_per_task;
3997}
3998
3999
4000
4001
4002
4003
4004
4005static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4006{
4007 unsigned long max_pull, load_above_capacity = ~0UL;
4008
4009 sds->busiest_load_per_task /= sds->busiest_nr_running;
4010 if (sds->group_imb) {
4011 sds->busiest_load_per_task =
4012 min(sds->busiest_load_per_task, sds->avg_load);
4013 }
4014
4015
4016
4017
4018
4019
4020 if (sds->max_load < sds->avg_load) {
4021 env->imbalance = 0;
4022 return fix_small_imbalance(env, sds);
4023 }
4024
4025 if (!sds->group_imb) {
4026
4027
4028
4029 load_above_capacity = (sds->busiest_nr_running -
4030 sds->busiest_group_capacity);
4031
4032 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
4033
4034 load_above_capacity /= sds->busiest->sgp->power;
4035 }
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4048
4049
4050 env->imbalance = min(max_pull * sds->busiest->sgp->power,
4051 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
4052 / SCHED_POWER_SCALE;
4053
4054
4055
4056
4057
4058
4059
4060 if (env->imbalance < sds->busiest_load_per_task)
4061 return fix_small_imbalance(env, sds);
4062
4063}
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086static struct sched_group *
4087find_busiest_group(struct lb_env *env, int *balance)
4088{
4089 struct sd_lb_stats sds;
4090
4091 memset(&sds, 0, sizeof(sds));
4092
4093
4094
4095
4096
4097 update_sd_lb_stats(env, balance, &sds);
4098
4099
4100
4101
4102
4103 if (!(*balance))
4104 goto ret;
4105
4106 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4107 check_asym_packing(env, &sds))
4108 return sds.busiest;
4109
4110
4111 if (!sds.busiest || sds.busiest_nr_running == 0)
4112 goto out_balanced;
4113
4114 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
4115
4116
4117
4118
4119
4120
4121 if (sds.group_imb)
4122 goto force_balance;
4123
4124
4125 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4126 !sds.busiest_has_capacity)
4127 goto force_balance;
4128
4129
4130
4131
4132
4133 if (sds.this_load >= sds.max_load)
4134 goto out_balanced;
4135
4136
4137
4138
4139
4140 if (sds.this_load >= sds.avg_load)
4141 goto out_balanced;
4142
4143 if (env->idle == CPU_IDLE) {
4144
4145
4146
4147
4148
4149
4150 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
4151 sds.busiest_nr_running <= sds.busiest_group_weight)
4152 goto out_balanced;
4153 } else {
4154
4155
4156
4157
4158 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4159 goto out_balanced;
4160 }
4161
4162force_balance:
4163
4164 calculate_imbalance(env, &sds);
4165 return sds.busiest;
4166
4167out_balanced:
4168ret:
4169 env->imbalance = 0;
4170 return NULL;
4171}
4172
4173
4174
4175
4176static struct rq *find_busiest_queue(struct lb_env *env,
4177 struct sched_group *group)
4178{
4179 struct rq *busiest = NULL, *rq;
4180 unsigned long max_load = 0;
4181 int i;
4182
4183 for_each_cpu(i, sched_group_cpus(group)) {
4184 unsigned long power = power_of(i);
4185 unsigned long capacity = DIV_ROUND_CLOSEST(power,
4186 SCHED_POWER_SCALE);
4187 unsigned long wl;
4188
4189 if (!capacity)
4190 capacity = fix_small_capacity(env->sd, group);
4191
4192 if (!cpumask_test_cpu(i, env->cpus))
4193 continue;
4194
4195 rq = cpu_rq(i);
4196 wl = weighted_cpuload(i);
4197
4198
4199
4200
4201
4202 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4203 continue;
4204
4205
4206
4207
4208
4209
4210
4211 wl = (wl * SCHED_POWER_SCALE) / power;
4212
4213 if (wl > max_load) {
4214 max_load = wl;
4215 busiest = rq;
4216 }
4217 }
4218
4219 return busiest;
4220}
4221
4222
4223
4224
4225
4226#define MAX_PINNED_INTERVAL 512
4227
4228
4229DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4230
4231static int need_active_balance(struct lb_env *env)
4232{
4233 struct sched_domain *sd = env->sd;
4234
4235 if (env->idle == CPU_NEWLY_IDLE) {
4236
4237
4238
4239
4240
4241
4242 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4243 return 1;
4244 }
4245
4246 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
4247}
4248
4249static int active_load_balance_cpu_stop(void *data);
4250
4251
4252
4253
4254
4255static int load_balance(int this_cpu, struct rq *this_rq,
4256 struct sched_domain *sd, enum cpu_idle_type idle,
4257 int *balance)
4258{
4259 int ld_moved, cur_ld_moved, active_balance = 0;
4260 int lb_iterations, max_lb_iterations;
4261 struct sched_group *group;
4262 struct rq *busiest;
4263 unsigned long flags;
4264 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4265
4266 struct lb_env env = {
4267 .sd = sd,
4268 .dst_cpu = this_cpu,
4269 .dst_rq = this_rq,
4270 .dst_grpmask = sched_group_cpus(sd->groups),
4271 .idle = idle,
4272 .loop_break = sched_nr_migrate_break,
4273 .cpus = cpus,
4274 };
4275
4276 cpumask_copy(cpus, cpu_active_mask);
4277 max_lb_iterations = cpumask_weight(env.dst_grpmask);
4278
4279 schedstat_inc(sd, lb_count[idle]);
4280
4281redo:
4282 group = find_busiest_group(&env, balance);
4283
4284 if (*balance == 0)
4285 goto out_balanced;
4286
4287 if (!group) {
4288 schedstat_inc(sd, lb_nobusyg[idle]);
4289 goto out_balanced;
4290 }
4291
4292 busiest = find_busiest_queue(&env, group);
4293 if (!busiest) {
4294 schedstat_inc(sd, lb_nobusyq[idle]);
4295 goto out_balanced;
4296 }
4297
4298 BUG_ON(busiest == this_rq);
4299
4300 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4301
4302 ld_moved = 0;
4303 lb_iterations = 1;
4304 if (busiest->nr_running > 1) {
4305
4306
4307
4308
4309
4310
4311 env.flags |= LBF_ALL_PINNED;
4312 env.src_cpu = busiest->cpu;
4313 env.src_rq = busiest;
4314 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
4315
4316 update_h_load(env.src_cpu);
4317more_balance:
4318 local_irq_save(flags);
4319 double_rq_lock(this_rq, busiest);
4320
4321
4322
4323
4324
4325 cur_ld_moved = move_tasks(&env);
4326 ld_moved += cur_ld_moved;
4327 double_rq_unlock(this_rq, busiest);
4328 local_irq_restore(flags);
4329
4330 if (env.flags & LBF_NEED_BREAK) {
4331 env.flags &= ~LBF_NEED_BREAK;
4332 goto more_balance;
4333 }
4334
4335
4336
4337
4338 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
4339 resched_cpu(env.dst_cpu);
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
4361 lb_iterations++ < max_lb_iterations) {
4362
4363 this_rq = cpu_rq(env.new_dst_cpu);
4364 env.dst_rq = this_rq;
4365 env.dst_cpu = env.new_dst_cpu;
4366 env.flags &= ~LBF_SOME_PINNED;
4367 env.loop = 0;
4368 env.loop_break = sched_nr_migrate_break;
4369
4370
4371
4372
4373 goto more_balance;
4374 }
4375
4376
4377 if (unlikely(env.flags & LBF_ALL_PINNED)) {
4378 cpumask_clear_cpu(cpu_of(busiest), cpus);
4379 if (!cpumask_empty(cpus)) {
4380 env.loop = 0;
4381 env.loop_break = sched_nr_migrate_break;
4382 goto redo;
4383 }
4384 goto out_balanced;
4385 }
4386 }
4387
4388 if (!ld_moved) {
4389 schedstat_inc(sd, lb_failed[idle]);
4390
4391
4392
4393
4394
4395
4396 if (idle != CPU_NEWLY_IDLE)
4397 sd->nr_balance_failed++;
4398
4399 if (need_active_balance(&env)) {
4400 raw_spin_lock_irqsave(&busiest->lock, flags);
4401
4402
4403
4404
4405
4406 if (!cpumask_test_cpu(this_cpu,
4407 tsk_cpus_allowed(busiest->curr))) {
4408 raw_spin_unlock_irqrestore(&busiest->lock,
4409 flags);
4410 env.flags |= LBF_ALL_PINNED;
4411 goto out_one_pinned;
4412 }
4413
4414
4415
4416
4417
4418
4419 if (!busiest->active_balance) {
4420 busiest->active_balance = 1;
4421 busiest->push_cpu = this_cpu;
4422 active_balance = 1;
4423 }
4424 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4425
4426 if (active_balance) {
4427 stop_one_cpu_nowait(cpu_of(busiest),
4428 active_load_balance_cpu_stop, busiest,
4429 &busiest->active_balance_work);
4430 }
4431
4432
4433
4434
4435
4436 sd->nr_balance_failed = sd->cache_nice_tries+1;
4437 }
4438 } else
4439 sd->nr_balance_failed = 0;
4440
4441 if (likely(!active_balance)) {
4442
4443 sd->balance_interval = sd->min_interval;
4444 } else {
4445
4446
4447
4448
4449
4450
4451 if (sd->balance_interval < sd->max_interval)
4452 sd->balance_interval *= 2;
4453 }
4454
4455 goto out;
4456
4457out_balanced:
4458 schedstat_inc(sd, lb_balanced[idle]);
4459
4460 sd->nr_balance_failed = 0;
4461
4462out_one_pinned:
4463
4464 if (((env.flags & LBF_ALL_PINNED) &&
4465 sd->balance_interval < MAX_PINNED_INTERVAL) ||
4466 (sd->balance_interval < sd->max_interval))
4467 sd->balance_interval *= 2;
4468
4469 ld_moved = 0;
4470out:
4471 return ld_moved;
4472}
4473
4474
4475
4476
4477
4478void idle_balance(int this_cpu, struct rq *this_rq)
4479{
4480 struct sched_domain *sd;
4481 int pulled_task = 0;
4482 unsigned long next_balance = jiffies + HZ;
4483
4484 this_rq->idle_stamp = this_rq->clock;
4485
4486 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4487 return;
4488
4489
4490
4491
4492 raw_spin_unlock(&this_rq->lock);
4493
4494 update_shares(this_cpu);
4495 rcu_read_lock();
4496 for_each_domain(this_cpu, sd) {
4497 unsigned long interval;
4498 int balance = 1;
4499
4500 if (!(sd->flags & SD_LOAD_BALANCE))
4501 continue;
4502
4503 if (sd->flags & SD_BALANCE_NEWIDLE) {
4504
4505 pulled_task = load_balance(this_cpu, this_rq,
4506 sd, CPU_NEWLY_IDLE, &balance);
4507 }
4508
4509 interval = msecs_to_jiffies(sd->balance_interval);
4510 if (time_after(next_balance, sd->last_balance + interval))
4511 next_balance = sd->last_balance + interval;
4512 if (pulled_task) {
4513 this_rq->idle_stamp = 0;
4514 break;
4515 }
4516 }
4517 rcu_read_unlock();
4518
4519 raw_spin_lock(&this_rq->lock);
4520
4521 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4522
4523
4524
4525
4526 this_rq->next_balance = next_balance;
4527 }
4528}
4529
4530
4531
4532
4533
4534
4535
4536static int active_load_balance_cpu_stop(void *data)
4537{
4538 struct rq *busiest_rq = data;
4539 int busiest_cpu = cpu_of(busiest_rq);
4540 int target_cpu = busiest_rq->push_cpu;
4541 struct rq *target_rq = cpu_rq(target_cpu);
4542 struct sched_domain *sd;
4543
4544 raw_spin_lock_irq(&busiest_rq->lock);
4545
4546
4547 if (unlikely(busiest_cpu != smp_processor_id() ||
4548 !busiest_rq->active_balance))
4549 goto out_unlock;
4550
4551
4552 if (busiest_rq->nr_running <= 1)
4553 goto out_unlock;
4554
4555
4556
4557
4558
4559
4560 BUG_ON(busiest_rq == target_rq);
4561
4562
4563 double_lock_balance(busiest_rq, target_rq);
4564
4565
4566 rcu_read_lock();
4567 for_each_domain(target_cpu, sd) {
4568 if ((sd->flags & SD_LOAD_BALANCE) &&
4569 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4570 break;
4571 }
4572
4573 if (likely(sd)) {
4574 struct lb_env env = {
4575 .sd = sd,
4576 .dst_cpu = target_cpu,
4577 .dst_rq = target_rq,
4578 .src_cpu = busiest_rq->cpu,
4579 .src_rq = busiest_rq,
4580 .idle = CPU_IDLE,
4581 };
4582
4583 schedstat_inc(sd, alb_count);
4584
4585 if (move_one_task(&env))
4586 schedstat_inc(sd, alb_pushed);
4587 else
4588 schedstat_inc(sd, alb_failed);
4589 }
4590 rcu_read_unlock();
4591 double_unlock_balance(busiest_rq, target_rq);
4592out_unlock:
4593 busiest_rq->active_balance = 0;
4594 raw_spin_unlock_irq(&busiest_rq->lock);
4595 return 0;
4596}
4597
4598#ifdef CONFIG_NO_HZ
4599
4600
4601
4602
4603
4604
4605static struct {
4606 cpumask_var_t idle_cpus_mask;
4607 atomic_t nr_cpus;
4608 unsigned long next_balance;
4609} nohz ____cacheline_aligned;
4610
4611static inline int find_new_ilb(int call_cpu)
4612{
4613 int ilb = cpumask_first(nohz.idle_cpus_mask);
4614
4615 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4616 return ilb;
4617
4618 return nr_cpu_ids;
4619}
4620
4621
4622
4623
4624
4625
4626static void nohz_balancer_kick(int cpu)
4627{
4628 int ilb_cpu;
4629
4630 nohz.next_balance++;
4631
4632 ilb_cpu = find_new_ilb(cpu);
4633
4634 if (ilb_cpu >= nr_cpu_ids)
4635 return;
4636
4637 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
4638 return;
4639
4640
4641
4642
4643
4644
4645 smp_send_reschedule(ilb_cpu);
4646 return;
4647}
4648
4649static inline void clear_nohz_tick_stopped(int cpu)
4650{
4651 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
4652 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
4653 atomic_dec(&nohz.nr_cpus);
4654 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4655 }
4656}
4657
4658static inline void set_cpu_sd_state_busy(void)
4659{
4660 struct sched_domain *sd;
4661 int cpu = smp_processor_id();
4662
4663 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4664 return;
4665 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
4666
4667 rcu_read_lock();
4668 for_each_domain(cpu, sd)
4669 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
4670 rcu_read_unlock();
4671}
4672
4673void set_cpu_sd_state_idle(void)
4674{
4675 struct sched_domain *sd;
4676 int cpu = smp_processor_id();
4677
4678 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4679 return;
4680 set_bit(NOHZ_IDLE, nohz_flags(cpu));
4681
4682 rcu_read_lock();
4683 for_each_domain(cpu, sd)
4684 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
4685 rcu_read_unlock();
4686}
4687
4688
4689
4690
4691
4692void select_nohz_load_balancer(int stop_tick)
4693{
4694 int cpu = smp_processor_id();
4695
4696
4697
4698
4699 if (!cpu_active(cpu))
4700 return;
4701
4702 if (stop_tick) {
4703 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
4704 return;
4705
4706 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
4707 atomic_inc(&nohz.nr_cpus);
4708 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4709 }
4710 return;
4711}
4712
4713static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
4714 unsigned long action, void *hcpu)
4715{
4716 switch (action & ~CPU_TASKS_FROZEN) {
4717 case CPU_DYING:
4718 clear_nohz_tick_stopped(smp_processor_id());
4719 return NOTIFY_OK;
4720 default:
4721 return NOTIFY_DONE;
4722 }
4723}
4724#endif
4725
4726static DEFINE_SPINLOCK(balancing);
4727
4728
4729
4730
4731
4732void update_max_interval(void)
4733{
4734 max_load_balance_interval = HZ*num_online_cpus()/10;
4735}
4736
4737
4738
4739
4740
4741
4742
4743static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4744{
4745 int balance = 1;
4746 struct rq *rq = cpu_rq(cpu);
4747 unsigned long interval;
4748 struct sched_domain *sd;
4749
4750 unsigned long next_balance = jiffies + 60*HZ;
4751 int update_next_balance = 0;
4752 int need_serialize;
4753
4754 update_shares(cpu);
4755
4756 rcu_read_lock();
4757 for_each_domain(cpu, sd) {
4758 if (!(sd->flags & SD_LOAD_BALANCE))
4759 continue;
4760
4761 interval = sd->balance_interval;
4762 if (idle != CPU_IDLE)
4763 interval *= sd->busy_factor;
4764
4765
4766 interval = msecs_to_jiffies(interval);
4767 interval = clamp(interval, 1UL, max_load_balance_interval);
4768
4769 need_serialize = sd->flags & SD_SERIALIZE;
4770
4771 if (need_serialize) {
4772 if (!spin_trylock(&balancing))
4773 goto out;
4774 }
4775
4776 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4777 if (load_balance(cpu, rq, sd, idle, &balance)) {
4778
4779
4780
4781
4782 idle = CPU_NOT_IDLE;
4783 }
4784 sd->last_balance = jiffies;
4785 }
4786 if (need_serialize)
4787 spin_unlock(&balancing);
4788out:
4789 if (time_after(next_balance, sd->last_balance + interval)) {
4790 next_balance = sd->last_balance + interval;
4791 update_next_balance = 1;
4792 }
4793
4794
4795
4796
4797
4798
4799 if (!balance)
4800 break;
4801 }
4802 rcu_read_unlock();
4803
4804
4805
4806
4807
4808
4809 if (likely(update_next_balance))
4810 rq->next_balance = next_balance;
4811}
4812
4813#ifdef CONFIG_NO_HZ
4814
4815
4816
4817
4818static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4819{
4820 struct rq *this_rq = cpu_rq(this_cpu);
4821 struct rq *rq;
4822 int balance_cpu;
4823
4824 if (idle != CPU_IDLE ||
4825 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
4826 goto end;
4827
4828 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
4829 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
4830 continue;
4831
4832
4833
4834
4835
4836
4837 if (need_resched())
4838 break;
4839
4840 raw_spin_lock_irq(&this_rq->lock);
4841 update_rq_clock(this_rq);
4842 update_idle_cpu_load(this_rq);
4843 raw_spin_unlock_irq(&this_rq->lock);
4844
4845 rebalance_domains(balance_cpu, CPU_IDLE);
4846
4847 rq = cpu_rq(balance_cpu);
4848 if (time_after(this_rq->next_balance, rq->next_balance))
4849 this_rq->next_balance = rq->next_balance;
4850 }
4851 nohz.next_balance = this_rq->next_balance;
4852end:
4853 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
4854}
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865static inline int nohz_kick_needed(struct rq *rq, int cpu)
4866{
4867 unsigned long now = jiffies;
4868 struct sched_domain *sd;
4869
4870 if (unlikely(idle_cpu(cpu)))
4871 return 0;
4872
4873
4874
4875
4876
4877 set_cpu_sd_state_busy();
4878 clear_nohz_tick_stopped(cpu);
4879
4880
4881
4882
4883
4884 if (likely(!atomic_read(&nohz.nr_cpus)))
4885 return 0;
4886
4887 if (time_before(now, nohz.next_balance))
4888 return 0;
4889
4890 if (rq->nr_running >= 2)
4891 goto need_kick;
4892
4893 rcu_read_lock();
4894 for_each_domain(cpu, sd) {
4895 struct sched_group *sg = sd->groups;
4896 struct sched_group_power *sgp = sg->sgp;
4897 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
4898
4899 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
4900 goto need_kick_unlock;
4901
4902 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
4903 && (cpumask_first_and(nohz.idle_cpus_mask,
4904 sched_domain_span(sd)) < cpu))
4905 goto need_kick_unlock;
4906
4907 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
4908 break;
4909 }
4910 rcu_read_unlock();
4911 return 0;
4912
4913need_kick_unlock:
4914 rcu_read_unlock();
4915need_kick:
4916 return 1;
4917}
4918#else
4919static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
4920#endif
4921
4922
4923
4924
4925
4926static void run_rebalance_domains(struct softirq_action *h)
4927{
4928 int this_cpu = smp_processor_id();
4929 struct rq *this_rq = cpu_rq(this_cpu);
4930 enum cpu_idle_type idle = this_rq->idle_balance ?
4931 CPU_IDLE : CPU_NOT_IDLE;
4932
4933 rebalance_domains(this_cpu, idle);
4934
4935
4936
4937
4938
4939
4940 nohz_idle_balance(this_cpu, idle);
4941}
4942
4943static inline int on_null_domain(int cpu)
4944{
4945 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
4946}
4947
4948
4949
4950
4951void trigger_load_balance(struct rq *rq, int cpu)
4952{
4953
4954 if (time_after_eq(jiffies, rq->next_balance) &&
4955 likely(!on_null_domain(cpu)))
4956 raise_softirq(SCHED_SOFTIRQ);
4957#ifdef CONFIG_NO_HZ
4958 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
4959 nohz_balancer_kick(cpu);
4960#endif
4961}
4962
4963static void rq_online_fair(struct rq *rq)
4964{
4965 update_sysctl();
4966}
4967
4968static void rq_offline_fair(struct rq *rq)
4969{
4970 update_sysctl();
4971
4972
4973 unthrottle_offline_cfs_rqs(rq);
4974}
4975
4976#endif
4977
4978
4979
4980
4981static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4982{
4983 struct cfs_rq *cfs_rq;
4984 struct sched_entity *se = &curr->se;
4985
4986 for_each_sched_entity(se) {
4987 cfs_rq = cfs_rq_of(se);
4988 entity_tick(cfs_rq, se, queued);
4989 }
4990}
4991
4992
4993
4994
4995
4996
4997static void task_fork_fair(struct task_struct *p)
4998{
4999 struct cfs_rq *cfs_rq;
5000 struct sched_entity *se = &p->se, *curr;
5001 int this_cpu = smp_processor_id();
5002 struct rq *rq = this_rq();
5003 unsigned long flags;
5004
5005 raw_spin_lock_irqsave(&rq->lock, flags);
5006
5007 update_rq_clock(rq);
5008
5009 cfs_rq = task_cfs_rq(current);
5010 curr = cfs_rq->curr;
5011
5012 if (unlikely(task_cpu(p) != this_cpu)) {
5013 rcu_read_lock();
5014 __set_task_cpu(p, this_cpu);
5015 rcu_read_unlock();
5016 }
5017
5018 update_curr(cfs_rq);
5019
5020 if (curr)
5021 se->vruntime = curr->vruntime;
5022 place_entity(cfs_rq, se, 1);
5023
5024 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
5025
5026
5027
5028
5029 swap(curr->vruntime, se->vruntime);
5030 resched_task(rq->curr);
5031 }
5032
5033 se->vruntime -= cfs_rq->min_vruntime;
5034
5035 raw_spin_unlock_irqrestore(&rq->lock, flags);
5036}
5037
5038
5039
5040
5041
5042static void
5043prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
5044{
5045 if (!p->se.on_rq)
5046 return;
5047
5048
5049
5050
5051
5052
5053 if (rq->curr == p) {
5054 if (p->prio > oldprio)
5055 resched_task(rq->curr);
5056 } else
5057 check_preempt_curr(rq, p, 0);
5058}
5059
5060static void switched_from_fair(struct rq *rq, struct task_struct *p)
5061{
5062 struct sched_entity *se = &p->se;
5063 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074 if (!se->on_rq && p->state != TASK_RUNNING) {
5075
5076
5077
5078
5079 place_entity(cfs_rq, se, 0);
5080 se->vruntime -= cfs_rq->min_vruntime;
5081 }
5082}
5083
5084
5085
5086
5087static void switched_to_fair(struct rq *rq, struct task_struct *p)
5088{
5089 if (!p->se.on_rq)
5090 return;
5091
5092
5093
5094
5095
5096
5097 if (rq->curr == p)
5098 resched_task(rq->curr);
5099 else
5100 check_preempt_curr(rq, p, 0);
5101}
5102
5103
5104
5105
5106
5107
5108static void set_curr_task_fair(struct rq *rq)
5109{
5110 struct sched_entity *se = &rq->curr->se;
5111
5112 for_each_sched_entity(se) {
5113 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5114
5115 set_next_entity(cfs_rq, se);
5116
5117 account_cfs_rq_runtime(cfs_rq, 0);
5118 }
5119}
5120
5121void init_cfs_rq(struct cfs_rq *cfs_rq)
5122{
5123 cfs_rq->tasks_timeline = RB_ROOT;
5124 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5125#ifndef CONFIG_64BIT
5126 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5127#endif
5128}
5129
5130#ifdef CONFIG_FAIR_GROUP_SCHED
5131static void task_move_group_fair(struct task_struct *p, int on_rq)
5132{
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
5159 on_rq = 1;
5160
5161 if (!on_rq)
5162 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5163 set_task_rq(p, task_cpu(p));
5164 if (!on_rq)
5165 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
5166}
5167
5168void free_fair_sched_group(struct task_group *tg)
5169{
5170 int i;
5171
5172 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5173
5174 for_each_possible_cpu(i) {
5175 if (tg->cfs_rq)
5176 kfree(tg->cfs_rq[i]);
5177 if (tg->se)
5178 kfree(tg->se[i]);
5179 }
5180
5181 kfree(tg->cfs_rq);
5182 kfree(tg->se);
5183}
5184
5185int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5186{
5187 struct cfs_rq *cfs_rq;
5188 struct sched_entity *se;
5189 int i;
5190
5191 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5192 if (!tg->cfs_rq)
5193 goto err;
5194 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5195 if (!tg->se)
5196 goto err;
5197
5198 tg->shares = NICE_0_LOAD;
5199
5200 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5201
5202 for_each_possible_cpu(i) {
5203 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5204 GFP_KERNEL, cpu_to_node(i));
5205 if (!cfs_rq)
5206 goto err;
5207
5208 se = kzalloc_node(sizeof(struct sched_entity),
5209 GFP_KERNEL, cpu_to_node(i));
5210 if (!se)
5211 goto err_free_rq;
5212
5213 init_cfs_rq(cfs_rq);
5214 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5215 }
5216
5217 return 1;
5218
5219err_free_rq:
5220 kfree(cfs_rq);
5221err:
5222 return 0;
5223}
5224
5225void unregister_fair_sched_group(struct task_group *tg, int cpu)
5226{
5227 struct rq *rq = cpu_rq(cpu);
5228 unsigned long flags;
5229
5230
5231
5232
5233
5234 if (!tg->cfs_rq[cpu]->on_list)
5235 return;
5236
5237 raw_spin_lock_irqsave(&rq->lock, flags);
5238 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
5239 raw_spin_unlock_irqrestore(&rq->lock, flags);
5240}
5241
5242void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5243 struct sched_entity *se, int cpu,
5244 struct sched_entity *parent)
5245{
5246 struct rq *rq = cpu_rq(cpu);
5247
5248 cfs_rq->tg = tg;
5249 cfs_rq->rq = rq;
5250#ifdef CONFIG_SMP
5251
5252 cfs_rq->load_stamp = 1;
5253#endif
5254 init_cfs_rq_runtime(cfs_rq);
5255
5256 tg->cfs_rq[cpu] = cfs_rq;
5257 tg->se[cpu] = se;
5258
5259
5260 if (!se)
5261 return;
5262
5263 if (!parent)
5264 se->cfs_rq = &rq->cfs;
5265 else
5266 se->cfs_rq = parent->my_q;
5267
5268 se->my_q = cfs_rq;
5269 update_load_set(&se->load, 0);
5270 se->parent = parent;
5271}
5272
5273static DEFINE_MUTEX(shares_mutex);
5274
5275int sched_group_set_shares(struct task_group *tg, unsigned long shares)
5276{
5277 int i;
5278 unsigned long flags;
5279
5280
5281
5282
5283 if (!tg->se[0])
5284 return -EINVAL;
5285
5286 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
5287
5288 mutex_lock(&shares_mutex);
5289 if (tg->shares == shares)
5290 goto done;
5291
5292 tg->shares = shares;
5293 for_each_possible_cpu(i) {
5294 struct rq *rq = cpu_rq(i);
5295 struct sched_entity *se;
5296
5297 se = tg->se[i];
5298
5299 raw_spin_lock_irqsave(&rq->lock, flags);
5300 for_each_sched_entity(se)
5301 update_cfs_shares(group_cfs_rq(se));
5302 raw_spin_unlock_irqrestore(&rq->lock, flags);
5303 }
5304
5305done:
5306 mutex_unlock(&shares_mutex);
5307 return 0;
5308}
5309#else
5310
5311void free_fair_sched_group(struct task_group *tg) { }
5312
5313int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5314{
5315 return 1;
5316}
5317
5318void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
5319
5320#endif
5321
5322
5323static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
5324{
5325 struct sched_entity *se = &task->se;
5326 unsigned int rr_interval = 0;
5327
5328
5329
5330
5331
5332 if (rq->cfs.load.weight)
5333 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
5334
5335 return rr_interval;
5336}
5337
5338
5339
5340
5341const struct sched_class fair_sched_class = {
5342 .next = &idle_sched_class,
5343 .enqueue_task = enqueue_task_fair,
5344 .dequeue_task = dequeue_task_fair,
5345 .yield_task = yield_task_fair,
5346 .yield_to_task = yield_to_task_fair,
5347
5348 .check_preempt_curr = check_preempt_wakeup,
5349
5350 .pick_next_task = pick_next_task_fair,
5351 .put_prev_task = put_prev_task_fair,
5352
5353#ifdef CONFIG_SMP
5354 .select_task_rq = select_task_rq_fair,
5355
5356 .rq_online = rq_online_fair,
5357 .rq_offline = rq_offline_fair,
5358
5359 .task_waking = task_waking_fair,
5360#endif
5361
5362 .set_curr_task = set_curr_task_fair,
5363 .task_tick = task_tick_fair,
5364 .task_fork = task_fork_fair,
5365
5366 .prio_changed = prio_changed_fair,
5367 .switched_from = switched_from_fair,
5368 .switched_to = switched_to_fair,
5369
5370 .get_rr_interval = get_rr_interval_fair,
5371
5372#ifdef CONFIG_FAIR_GROUP_SCHED
5373 .task_move_group = task_move_group_fair,
5374#endif
5375};
5376
5377#ifdef CONFIG_SCHED_DEBUG
5378void print_cfs_stats(struct seq_file *m, int cpu)
5379{
5380 struct cfs_rq *cfs_rq;
5381
5382 rcu_read_lock();
5383 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
5384 print_cfs_rq(m, cpu, cfs_rq);
5385 rcu_read_unlock();
5386}
5387#endif
5388
5389__init void init_sched_fair_class(void)
5390{
5391#ifdef CONFIG_SMP
5392 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
5393
5394#ifdef CONFIG_NO_HZ
5395 nohz.next_balance = jiffies;
5396 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
5397 cpu_notifier(sched_ilb_notifier, 0);
5398#endif
5399#endif
5400
5401}
5402