1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23#include <linux/latencytop.h>
24#include <linux/sched.h>
25#include <linux/cpumask.h>
26
27
28
29
30
31
32
33
34
35
36
37
38
39unsigned int sysctl_sched_latency = 6000000ULL;
40unsigned int normalized_sysctl_sched_latency = 6000000ULL;
41
42
43
44
45
46
47
48
49
50
51enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 = SCHED_TUNABLESCALING_LOG;
53
54
55
56
57
58unsigned int sysctl_sched_min_granularity = 750000ULL;
59unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
60
61
62
63
64static unsigned int sched_nr_latency = 8;
65
66
67
68
69
70unsigned int sysctl_sched_child_runs_first __read_mostly;
71
72
73
74
75
76
77
78
79
80unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
81unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
82
83const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
84
85
86
87
88
89
90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
91
92#ifdef CONFIG_CFS_BANDWIDTH
93
94
95
96
97
98
99
100
101
102
103unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
104#endif
105
106static const struct sched_class fair_sched_class;
107
108
109
110
111
112#ifdef CONFIG_FAIR_GROUP_SCHED
113
114
115static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
116{
117 return cfs_rq->rq;
118}
119
120
121#define entity_is_task(se) (!se->my_q)
122
123static inline struct task_struct *task_of(struct sched_entity *se)
124{
125#ifdef CONFIG_SCHED_DEBUG
126 WARN_ON_ONCE(!entity_is_task(se));
127#endif
128 return container_of(se, struct task_struct, se);
129}
130
131
132#define for_each_sched_entity(se) \
133 for (; se; se = se->parent)
134
135static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
136{
137 return p->se.cfs_rq;
138}
139
140
141static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
142{
143 return se->cfs_rq;
144}
145
146
147static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
148{
149 return grp->my_q;
150}
151
152static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
153{
154 if (!cfs_rq->on_list) {
155
156
157
158
159
160
161 if (cfs_rq->tg->parent &&
162 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
163 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
164 &rq_of(cfs_rq)->leaf_cfs_rq_list);
165 } else {
166 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
167 &rq_of(cfs_rq)->leaf_cfs_rq_list);
168 }
169
170 cfs_rq->on_list = 1;
171 }
172}
173
174static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
175{
176 if (cfs_rq->on_list) {
177 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
178 cfs_rq->on_list = 0;
179 }
180}
181
182
183#define for_each_leaf_cfs_rq(rq, cfs_rq) \
184 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
185
186
187static inline int
188is_same_group(struct sched_entity *se, struct sched_entity *pse)
189{
190 if (se->cfs_rq == pse->cfs_rq)
191 return 1;
192
193 return 0;
194}
195
196static inline struct sched_entity *parent_entity(struct sched_entity *se)
197{
198 return se->parent;
199}
200
201
202static inline int depth_se(struct sched_entity *se)
203{
204 int depth = 0;
205
206 for_each_sched_entity(se)
207 depth++;
208
209 return depth;
210}
211
212static void
213find_matching_se(struct sched_entity **se, struct sched_entity **pse)
214{
215 int se_depth, pse_depth;
216
217
218
219
220
221
222
223
224
225 se_depth = depth_se(*se);
226 pse_depth = depth_se(*pse);
227
228 while (se_depth > pse_depth) {
229 se_depth--;
230 *se = parent_entity(*se);
231 }
232
233 while (pse_depth > se_depth) {
234 pse_depth--;
235 *pse = parent_entity(*pse);
236 }
237
238 while (!is_same_group(*se, *pse)) {
239 *se = parent_entity(*se);
240 *pse = parent_entity(*pse);
241 }
242}
243
244#else
245
246static inline struct task_struct *task_of(struct sched_entity *se)
247{
248 return container_of(se, struct task_struct, se);
249}
250
251static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
252{
253 return container_of(cfs_rq, struct rq, cfs);
254}
255
256#define entity_is_task(se) 1
257
258#define for_each_sched_entity(se) \
259 for (; se; se = NULL)
260
261static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
262{
263 return &task_rq(p)->cfs;
264}
265
266static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
267{
268 struct task_struct *p = task_of(se);
269 struct rq *rq = task_rq(p);
270
271 return &rq->cfs;
272}
273
274
275static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
276{
277 return NULL;
278}
279
280static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
281{
282}
283
284static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
285{
286}
287
288#define for_each_leaf_cfs_rq(rq, cfs_rq) \
289 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
290
291static inline int
292is_same_group(struct sched_entity *se, struct sched_entity *pse)
293{
294 return 1;
295}
296
297static inline struct sched_entity *parent_entity(struct sched_entity *se)
298{
299 return NULL;
300}
301
302static inline void
303find_matching_se(struct sched_entity **se, struct sched_entity **pse)
304{
305}
306
307#endif
308
309static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
310 unsigned long delta_exec);
311
312
313
314
315
316static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
317{
318 s64 delta = (s64)(vruntime - min_vruntime);
319 if (delta > 0)
320 min_vruntime = vruntime;
321
322 return min_vruntime;
323}
324
325static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
326{
327 s64 delta = (s64)(vruntime - min_vruntime);
328 if (delta < 0)
329 min_vruntime = vruntime;
330
331 return min_vruntime;
332}
333
334static inline int entity_before(struct sched_entity *a,
335 struct sched_entity *b)
336{
337 return (s64)(a->vruntime - b->vruntime) < 0;
338}
339
340static void update_min_vruntime(struct cfs_rq *cfs_rq)
341{
342 u64 vruntime = cfs_rq->min_vruntime;
343
344 if (cfs_rq->curr)
345 vruntime = cfs_rq->curr->vruntime;
346
347 if (cfs_rq->rb_leftmost) {
348 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
349 struct sched_entity,
350 run_node);
351
352 if (!cfs_rq->curr)
353 vruntime = se->vruntime;
354 else
355 vruntime = min_vruntime(vruntime, se->vruntime);
356 }
357
358 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
359#ifndef CONFIG_64BIT
360 smp_wmb();
361 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
362#endif
363}
364
365
366
367
368static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
369{
370 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
371 struct rb_node *parent = NULL;
372 struct sched_entity *entry;
373 int leftmost = 1;
374
375
376
377
378 while (*link) {
379 parent = *link;
380 entry = rb_entry(parent, struct sched_entity, run_node);
381
382
383
384
385 if (entity_before(se, entry)) {
386 link = &parent->rb_left;
387 } else {
388 link = &parent->rb_right;
389 leftmost = 0;
390 }
391 }
392
393
394
395
396
397 if (leftmost)
398 cfs_rq->rb_leftmost = &se->run_node;
399
400 rb_link_node(&se->run_node, parent, link);
401 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
402}
403
404static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
405{
406 if (cfs_rq->rb_leftmost == &se->run_node) {
407 struct rb_node *next_node;
408
409 next_node = rb_next(&se->run_node);
410 cfs_rq->rb_leftmost = next_node;
411 }
412
413 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
414}
415
416static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
417{
418 struct rb_node *left = cfs_rq->rb_leftmost;
419
420 if (!left)
421 return NULL;
422
423 return rb_entry(left, struct sched_entity, run_node);
424}
425
426static struct sched_entity *__pick_next_entity(struct sched_entity *se)
427{
428 struct rb_node *next = rb_next(&se->run_node);
429
430 if (!next)
431 return NULL;
432
433 return rb_entry(next, struct sched_entity, run_node);
434}
435
436#ifdef CONFIG_SCHED_DEBUG
437static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
438{
439 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
440
441 if (!last)
442 return NULL;
443
444 return rb_entry(last, struct sched_entity, run_node);
445}
446
447
448
449
450
451int sched_proc_update_handler(struct ctl_table *table, int write,
452 void __user *buffer, size_t *lenp,
453 loff_t *ppos)
454{
455 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
456 int factor = get_update_sysctl_factor();
457
458 if (ret || !write)
459 return ret;
460
461 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
462 sysctl_sched_min_granularity);
463
464#define WRT_SYSCTL(name) \
465 (normalized_sysctl_##name = sysctl_##name / (factor))
466 WRT_SYSCTL(sched_min_granularity);
467 WRT_SYSCTL(sched_latency);
468 WRT_SYSCTL(sched_wakeup_granularity);
469#undef WRT_SYSCTL
470
471 return 0;
472}
473#endif
474
475
476
477
478static inline unsigned long
479calc_delta_fair(unsigned long delta, struct sched_entity *se)
480{
481 if (unlikely(se->load.weight != NICE_0_LOAD))
482 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
483
484 return delta;
485}
486
487
488
489
490
491
492
493
494
495static u64 __sched_period(unsigned long nr_running)
496{
497 u64 period = sysctl_sched_latency;
498 unsigned long nr_latency = sched_nr_latency;
499
500 if (unlikely(nr_running > nr_latency)) {
501 period = sysctl_sched_min_granularity;
502 period *= nr_running;
503 }
504
505 return period;
506}
507
508
509
510
511
512
513
514static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
515{
516 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
517
518 for_each_sched_entity(se) {
519 struct load_weight *load;
520 struct load_weight lw;
521
522 cfs_rq = cfs_rq_of(se);
523 load = &cfs_rq->load;
524
525 if (unlikely(!se->on_rq)) {
526 lw = cfs_rq->load;
527
528 update_load_add(&lw, se->load.weight);
529 load = &lw;
530 }
531 slice = calc_delta_mine(slice, se->load.weight, load);
532 }
533 return slice;
534}
535
536
537
538
539
540
541static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
542{
543 return calc_delta_fair(sched_slice(cfs_rq, se), se);
544}
545
546static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
547static void update_cfs_shares(struct cfs_rq *cfs_rq);
548
549
550
551
552
553static inline void
554__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
555 unsigned long delta_exec)
556{
557 unsigned long delta_exec_weighted;
558
559 schedstat_set(curr->statistics.exec_max,
560 max((u64)delta_exec, curr->statistics.exec_max));
561
562 curr->sum_exec_runtime += delta_exec;
563 schedstat_add(cfs_rq, exec_clock, delta_exec);
564 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
565
566 curr->vruntime += delta_exec_weighted;
567 update_min_vruntime(cfs_rq);
568
569#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
570 cfs_rq->load_unacc_exec_time += delta_exec;
571#endif
572}
573
574static void update_curr(struct cfs_rq *cfs_rq)
575{
576 struct sched_entity *curr = cfs_rq->curr;
577 u64 now = rq_of(cfs_rq)->clock_task;
578 unsigned long delta_exec;
579
580 if (unlikely(!curr))
581 return;
582
583
584
585
586
587
588 delta_exec = (unsigned long)(now - curr->exec_start);
589 if (!delta_exec)
590 return;
591
592 __update_curr(cfs_rq, curr, delta_exec);
593 curr->exec_start = now;
594
595 if (entity_is_task(curr)) {
596 struct task_struct *curtask = task_of(curr);
597
598 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
599 cpuacct_charge(curtask, delta_exec);
600 account_group_exec_runtime(curtask, delta_exec);
601 }
602
603 account_cfs_rq_runtime(cfs_rq, delta_exec);
604}
605
606static inline void
607update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
608{
609 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
610}
611
612
613
614
615static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
616{
617
618
619
620
621 if (se != cfs_rq->curr)
622 update_stats_wait_start(cfs_rq, se);
623}
624
625static void
626update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
627{
628 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
629 rq_of(cfs_rq)->clock - se->statistics.wait_start));
630 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
631 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
632 rq_of(cfs_rq)->clock - se->statistics.wait_start);
633#ifdef CONFIG_SCHEDSTATS
634 if (entity_is_task(se)) {
635 trace_sched_stat_wait(task_of(se),
636 rq_of(cfs_rq)->clock - se->statistics.wait_start);
637 }
638#endif
639 schedstat_set(se->statistics.wait_start, 0);
640}
641
642static inline void
643update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
644{
645
646
647
648
649 if (se != cfs_rq->curr)
650 update_stats_wait_end(cfs_rq, se);
651}
652
653
654
655
656static inline void
657update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
658{
659
660
661
662 se->exec_start = rq_of(cfs_rq)->clock_task;
663}
664
665
666
667
668
669#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
670static void
671add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
672{
673 cfs_rq->task_weight += weight;
674}
675#else
676static inline void
677add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
678{
679}
680#endif
681
682static void
683account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
684{
685 update_load_add(&cfs_rq->load, se->load.weight);
686 if (!parent_entity(se))
687 inc_cpu_load(rq_of(cfs_rq), se->load.weight);
688 if (entity_is_task(se)) {
689 add_cfs_task_weight(cfs_rq, se->load.weight);
690 list_add(&se->group_node, &cfs_rq->tasks);
691 }
692 cfs_rq->nr_running++;
693}
694
695static void
696account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
697{
698 update_load_sub(&cfs_rq->load, se->load.weight);
699 if (!parent_entity(se))
700 dec_cpu_load(rq_of(cfs_rq), se->load.weight);
701 if (entity_is_task(se)) {
702 add_cfs_task_weight(cfs_rq, -se->load.weight);
703 list_del_init(&se->group_node);
704 }
705 cfs_rq->nr_running--;
706}
707
708#ifdef CONFIG_FAIR_GROUP_SCHED
709
710static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
711# ifdef CONFIG_SMP
712static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
713 int global_update)
714{
715 struct task_group *tg = cfs_rq->tg;
716 long load_avg;
717
718 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
719 load_avg -= cfs_rq->load_contribution;
720
721 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
722 atomic_add(load_avg, &tg->load_weight);
723 cfs_rq->load_contribution += load_avg;
724 }
725}
726
727static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
728{
729 u64 period = sysctl_sched_shares_window;
730 u64 now, delta;
731 unsigned long load = cfs_rq->load.weight;
732
733 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
734 return;
735
736 now = rq_of(cfs_rq)->clock_task;
737 delta = now - cfs_rq->load_stamp;
738
739
740 if (cfs_rq->load_stamp > cfs_rq->load_last &&
741 now - cfs_rq->load_last > 4 * period) {
742 cfs_rq->load_period = 0;
743 cfs_rq->load_avg = 0;
744 delta = period - 1;
745 }
746
747 cfs_rq->load_stamp = now;
748 cfs_rq->load_unacc_exec_time = 0;
749 cfs_rq->load_period += delta;
750 if (load) {
751 cfs_rq->load_last = now;
752 cfs_rq->load_avg += delta * load;
753 }
754
755
756 if (global_update || cfs_rq->load_period > period
757 || !cfs_rq->load_period)
758 update_cfs_rq_load_contribution(cfs_rq, global_update);
759
760 while (cfs_rq->load_period > period) {
761
762
763
764
765
766 asm("" : "+rm" (cfs_rq->load_period));
767 cfs_rq->load_period /= 2;
768 cfs_rq->load_avg /= 2;
769 }
770
771 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
772 list_del_leaf_cfs_rq(cfs_rq);
773}
774
775static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
776{
777 long tg_weight;
778
779
780
781
782
783
784 tg_weight = atomic_read(&tg->load_weight);
785 tg_weight -= cfs_rq->load_contribution;
786 tg_weight += cfs_rq->load.weight;
787
788 return tg_weight;
789}
790
791static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
792{
793 long tg_weight, load, shares;
794
795 tg_weight = calc_tg_weight(tg, cfs_rq);
796 load = cfs_rq->load.weight;
797
798 shares = (tg->shares * load);
799 if (tg_weight)
800 shares /= tg_weight;
801
802 if (shares < MIN_SHARES)
803 shares = MIN_SHARES;
804 if (shares > tg->shares)
805 shares = tg->shares;
806
807 return shares;
808}
809
810static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
811{
812 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
813 update_cfs_load(cfs_rq, 0);
814 update_cfs_shares(cfs_rq);
815 }
816}
817# else
818static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
819{
820}
821
822static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
823{
824 return tg->shares;
825}
826
827static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
828{
829}
830# endif
831static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
832 unsigned long weight)
833{
834 if (se->on_rq) {
835
836 if (cfs_rq->curr == se)
837 update_curr(cfs_rq);
838 account_entity_dequeue(cfs_rq, se);
839 }
840
841 update_load_set(&se->load, weight);
842
843 if (se->on_rq)
844 account_entity_enqueue(cfs_rq, se);
845}
846
847static void update_cfs_shares(struct cfs_rq *cfs_rq)
848{
849 struct task_group *tg;
850 struct sched_entity *se;
851 long shares;
852
853 tg = cfs_rq->tg;
854 se = tg->se[cpu_of(rq_of(cfs_rq))];
855 if (!se || throttled_hierarchy(cfs_rq))
856 return;
857#ifndef CONFIG_SMP
858 if (likely(se->load.weight == tg->shares))
859 return;
860#endif
861 shares = calc_cfs_shares(cfs_rq, tg);
862
863 reweight_entity(cfs_rq_of(se), se, shares);
864}
865#else
866static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
867{
868}
869
870static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
871{
872}
873
874static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
875{
876}
877#endif
878
879static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
880{
881#ifdef CONFIG_SCHEDSTATS
882 struct task_struct *tsk = NULL;
883
884 if (entity_is_task(se))
885 tsk = task_of(se);
886
887 if (se->statistics.sleep_start) {
888 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
889
890 if ((s64)delta < 0)
891 delta = 0;
892
893 if (unlikely(delta > se->statistics.sleep_max))
894 se->statistics.sleep_max = delta;
895
896 se->statistics.sleep_start = 0;
897 se->statistics.sum_sleep_runtime += delta;
898
899 if (tsk) {
900 account_scheduler_latency(tsk, delta >> 10, 1);
901 trace_sched_stat_sleep(tsk, delta);
902 }
903 }
904 if (se->statistics.block_start) {
905 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
906
907 if ((s64)delta < 0)
908 delta = 0;
909
910 if (unlikely(delta > se->statistics.block_max))
911 se->statistics.block_max = delta;
912
913 se->statistics.block_start = 0;
914 se->statistics.sum_sleep_runtime += delta;
915
916 if (tsk) {
917 if (tsk->in_iowait) {
918 se->statistics.iowait_sum += delta;
919 se->statistics.iowait_count++;
920 trace_sched_stat_iowait(tsk, delta);
921 }
922
923
924
925
926
927
928 if (unlikely(prof_on == SLEEP_PROFILING)) {
929 profile_hits(SLEEP_PROFILING,
930 (void *)get_wchan(tsk),
931 delta >> 20);
932 }
933 account_scheduler_latency(tsk, delta >> 10, 0);
934 }
935 }
936#endif
937}
938
939static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
940{
941#ifdef CONFIG_SCHED_DEBUG
942 s64 d = se->vruntime - cfs_rq->min_vruntime;
943
944 if (d < 0)
945 d = -d;
946
947 if (d > 3*sysctl_sched_latency)
948 schedstat_inc(cfs_rq, nr_spread_over);
949#endif
950}
951
952static void
953place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
954{
955 u64 vruntime = cfs_rq->min_vruntime;
956
957
958
959
960
961
962
963 if (initial && sched_feat(START_DEBIT))
964 vruntime += sched_vslice(cfs_rq, se);
965
966
967 if (!initial) {
968 unsigned long thresh = sysctl_sched_latency;
969
970
971
972
973
974 if (sched_feat(GENTLE_FAIR_SLEEPERS))
975 thresh >>= 1;
976
977 vruntime -= thresh;
978 }
979
980
981 vruntime = max_vruntime(se->vruntime, vruntime);
982
983 se->vruntime = vruntime;
984}
985
986static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
987
988static void
989enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
990{
991
992
993
994
995 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
996 se->vruntime += cfs_rq->min_vruntime;
997
998
999
1000
1001 update_curr(cfs_rq);
1002 update_cfs_load(cfs_rq, 0);
1003 account_entity_enqueue(cfs_rq, se);
1004 update_cfs_shares(cfs_rq);
1005
1006 if (flags & ENQUEUE_WAKEUP) {
1007 place_entity(cfs_rq, se, 0);
1008 enqueue_sleeper(cfs_rq, se);
1009 }
1010
1011 update_stats_enqueue(cfs_rq, se);
1012 check_spread(cfs_rq, se);
1013 if (se != cfs_rq->curr)
1014 __enqueue_entity(cfs_rq, se);
1015 se->on_rq = 1;
1016
1017 if (cfs_rq->nr_running == 1) {
1018 list_add_leaf_cfs_rq(cfs_rq);
1019 check_enqueue_throttle(cfs_rq);
1020 }
1021}
1022
1023static void __clear_buddies_last(struct sched_entity *se)
1024{
1025 for_each_sched_entity(se) {
1026 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1027 if (cfs_rq->last == se)
1028 cfs_rq->last = NULL;
1029 else
1030 break;
1031 }
1032}
1033
1034static void __clear_buddies_next(struct sched_entity *se)
1035{
1036 for_each_sched_entity(se) {
1037 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1038 if (cfs_rq->next == se)
1039 cfs_rq->next = NULL;
1040 else
1041 break;
1042 }
1043}
1044
1045static void __clear_buddies_skip(struct sched_entity *se)
1046{
1047 for_each_sched_entity(se) {
1048 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1049 if (cfs_rq->skip == se)
1050 cfs_rq->skip = NULL;
1051 else
1052 break;
1053 }
1054}
1055
1056static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1057{
1058 if (cfs_rq->last == se)
1059 __clear_buddies_last(se);
1060
1061 if (cfs_rq->next == se)
1062 __clear_buddies_next(se);
1063
1064 if (cfs_rq->skip == se)
1065 __clear_buddies_skip(se);
1066}
1067
1068static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1069
1070static void
1071dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1072{
1073
1074
1075
1076 update_curr(cfs_rq);
1077
1078 update_stats_dequeue(cfs_rq, se);
1079 if (flags & DEQUEUE_SLEEP) {
1080#ifdef CONFIG_SCHEDSTATS
1081 if (entity_is_task(se)) {
1082 struct task_struct *tsk = task_of(se);
1083
1084 if (tsk->state & TASK_INTERRUPTIBLE)
1085 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
1086 if (tsk->state & TASK_UNINTERRUPTIBLE)
1087 se->statistics.block_start = rq_of(cfs_rq)->clock;
1088 }
1089#endif
1090 }
1091
1092 clear_buddies(cfs_rq, se);
1093
1094 if (se != cfs_rq->curr)
1095 __dequeue_entity(cfs_rq, se);
1096 se->on_rq = 0;
1097 update_cfs_load(cfs_rq, 0);
1098 account_entity_dequeue(cfs_rq, se);
1099
1100
1101
1102
1103
1104
1105 if (!(flags & DEQUEUE_SLEEP))
1106 se->vruntime -= cfs_rq->min_vruntime;
1107
1108
1109 return_cfs_rq_runtime(cfs_rq);
1110
1111 update_min_vruntime(cfs_rq);
1112 update_cfs_shares(cfs_rq);
1113}
1114
1115
1116
1117
1118static void
1119check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1120{
1121 unsigned long ideal_runtime, delta_exec;
1122 struct sched_entity *se;
1123 s64 delta;
1124
1125 ideal_runtime = sched_slice(cfs_rq, curr);
1126 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1127 if (delta_exec > ideal_runtime) {
1128 resched_task(rq_of(cfs_rq)->curr);
1129
1130
1131
1132
1133 clear_buddies(cfs_rq, curr);
1134 return;
1135 }
1136
1137
1138
1139
1140
1141
1142 if (delta_exec < sysctl_sched_min_granularity)
1143 return;
1144
1145 se = __pick_first_entity(cfs_rq);
1146 delta = curr->vruntime - se->vruntime;
1147
1148 if (delta < 0)
1149 return;
1150
1151 if (delta > ideal_runtime)
1152 resched_task(rq_of(cfs_rq)->curr);
1153}
1154
1155static void
1156set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1157{
1158
1159 if (se->on_rq) {
1160
1161
1162
1163
1164
1165 update_stats_wait_end(cfs_rq, se);
1166 __dequeue_entity(cfs_rq, se);
1167 }
1168
1169 update_stats_curr_start(cfs_rq, se);
1170 cfs_rq->curr = se;
1171#ifdef CONFIG_SCHEDSTATS
1172
1173
1174
1175
1176
1177 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
1178 se->statistics.slice_max = max(se->statistics.slice_max,
1179 se->sum_exec_runtime - se->prev_sum_exec_runtime);
1180 }
1181#endif
1182 se->prev_sum_exec_runtime = se->sum_exec_runtime;
1183}
1184
1185static int
1186wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1187
1188
1189
1190
1191
1192
1193
1194
1195static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1196{
1197 struct sched_entity *se = __pick_first_entity(cfs_rq);
1198 struct sched_entity *left = se;
1199
1200
1201
1202
1203
1204 if (cfs_rq->skip == se) {
1205 struct sched_entity *second = __pick_next_entity(se);
1206 if (second && wakeup_preempt_entity(second, left) < 1)
1207 se = second;
1208 }
1209
1210
1211
1212
1213 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1214 se = cfs_rq->last;
1215
1216
1217
1218
1219 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1220 se = cfs_rq->next;
1221
1222 clear_buddies(cfs_rq, se);
1223
1224 return se;
1225}
1226
1227static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1228
1229static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1230{
1231
1232
1233
1234
1235 if (prev->on_rq)
1236 update_curr(cfs_rq);
1237
1238
1239 check_cfs_rq_runtime(cfs_rq);
1240
1241 check_spread(cfs_rq, prev);
1242 if (prev->on_rq) {
1243 update_stats_wait_start(cfs_rq, prev);
1244
1245 __enqueue_entity(cfs_rq, prev);
1246 }
1247 cfs_rq->curr = NULL;
1248}
1249
1250static void
1251entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1252{
1253
1254
1255
1256 update_curr(cfs_rq);
1257
1258
1259
1260
1261 update_entity_shares_tick(cfs_rq);
1262
1263#ifdef CONFIG_SCHED_HRTICK
1264
1265
1266
1267
1268 if (queued) {
1269 resched_task(rq_of(cfs_rq)->curr);
1270 return;
1271 }
1272
1273
1274
1275 if (!sched_feat(DOUBLE_TICK) &&
1276 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
1277 return;
1278#endif
1279
1280 if (cfs_rq->nr_running > 1)
1281 check_preempt_tick(cfs_rq, curr);
1282}
1283
1284
1285
1286
1287
1288
1289#ifdef CONFIG_CFS_BANDWIDTH
1290
1291
1292
1293
1294static inline u64 default_cfs_period(void)
1295{
1296 return 100000000ULL;
1297}
1298
1299static inline u64 sched_cfs_bandwidth_slice(void)
1300{
1301 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
1302}
1303
1304
1305
1306
1307
1308
1309
1310
1311static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1312{
1313 u64 now;
1314
1315 if (cfs_b->quota == RUNTIME_INF)
1316 return;
1317
1318 now = sched_clock_cpu(smp_processor_id());
1319 cfs_b->runtime = cfs_b->quota;
1320 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1321}
1322
1323
1324static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1325{
1326 struct task_group *tg = cfs_rq->tg;
1327 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
1328 u64 amount = 0, min_amount, expires;
1329
1330
1331 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
1332
1333 raw_spin_lock(&cfs_b->lock);
1334 if (cfs_b->quota == RUNTIME_INF)
1335 amount = min_amount;
1336 else {
1337
1338
1339
1340
1341
1342
1343 if (!cfs_b->timer_active) {
1344 __refill_cfs_bandwidth_runtime(cfs_b);
1345 __start_cfs_bandwidth(cfs_b);
1346 }
1347
1348 if (cfs_b->runtime > 0) {
1349 amount = min(cfs_b->runtime, min_amount);
1350 cfs_b->runtime -= amount;
1351 cfs_b->idle = 0;
1352 }
1353 }
1354 expires = cfs_b->runtime_expires;
1355 raw_spin_unlock(&cfs_b->lock);
1356
1357 cfs_rq->runtime_remaining += amount;
1358
1359
1360
1361
1362
1363 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
1364 cfs_rq->runtime_expires = expires;
1365
1366 return cfs_rq->runtime_remaining > 0;
1367}
1368
1369
1370
1371
1372
1373static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1374{
1375 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1376 struct rq *rq = rq_of(cfs_rq);
1377
1378
1379 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
1380 return;
1381
1382 if (cfs_rq->runtime_remaining < 0)
1383 return;
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
1395
1396 cfs_rq->runtime_expires += TICK_NSEC;
1397 } else {
1398
1399 cfs_rq->runtime_remaining = 0;
1400 }
1401}
1402
1403static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1404 unsigned long delta_exec)
1405{
1406
1407 cfs_rq->runtime_remaining -= delta_exec;
1408 expire_cfs_rq_runtime(cfs_rq);
1409
1410 if (likely(cfs_rq->runtime_remaining > 0))
1411 return;
1412
1413
1414
1415
1416
1417 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
1418 resched_task(rq_of(cfs_rq)->curr);
1419}
1420
1421static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1422 unsigned long delta_exec)
1423{
1424 if (!cfs_rq->runtime_enabled)
1425 return;
1426
1427 __account_cfs_rq_runtime(cfs_rq, delta_exec);
1428}
1429
1430static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1431{
1432 return cfs_rq->throttled;
1433}
1434
1435
1436static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1437{
1438 return cfs_rq->throttle_count;
1439}
1440
1441
1442
1443
1444
1445
1446static inline int throttled_lb_pair(struct task_group *tg,
1447 int src_cpu, int dest_cpu)
1448{
1449 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
1450
1451 src_cfs_rq = tg->cfs_rq[src_cpu];
1452 dest_cfs_rq = tg->cfs_rq[dest_cpu];
1453
1454 return throttled_hierarchy(src_cfs_rq) ||
1455 throttled_hierarchy(dest_cfs_rq);
1456}
1457
1458
1459static int tg_unthrottle_up(struct task_group *tg, void *data)
1460{
1461 struct rq *rq = data;
1462 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1463
1464 cfs_rq->throttle_count--;
1465#ifdef CONFIG_SMP
1466 if (!cfs_rq->throttle_count) {
1467 u64 delta = rq->clock_task - cfs_rq->load_stamp;
1468
1469
1470 cfs_rq->load_stamp += delta;
1471 cfs_rq->load_last += delta;
1472
1473
1474 update_cfs_shares(cfs_rq);
1475 }
1476#endif
1477
1478 return 0;
1479}
1480
1481static int tg_throttle_down(struct task_group *tg, void *data)
1482{
1483 struct rq *rq = data;
1484 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1485
1486
1487 if (!cfs_rq->throttle_count)
1488 update_cfs_load(cfs_rq, 0);
1489 cfs_rq->throttle_count++;
1490
1491 return 0;
1492}
1493
1494static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1495{
1496 struct rq *rq = rq_of(cfs_rq);
1497 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1498 struct sched_entity *se;
1499 long task_delta, dequeue = 1;
1500
1501 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1502
1503
1504 rcu_read_lock();
1505 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1506 rcu_read_unlock();
1507
1508 task_delta = cfs_rq->h_nr_running;
1509 for_each_sched_entity(se) {
1510 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
1511
1512 if (!se->on_rq)
1513 break;
1514
1515 if (dequeue)
1516 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
1517 qcfs_rq->h_nr_running -= task_delta;
1518
1519 if (qcfs_rq->load.weight)
1520 dequeue = 0;
1521 }
1522
1523 if (!se)
1524 rq->nr_running -= task_delta;
1525
1526 cfs_rq->throttled = 1;
1527 cfs_rq->throttled_timestamp = rq->clock;
1528 raw_spin_lock(&cfs_b->lock);
1529 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1530 raw_spin_unlock(&cfs_b->lock);
1531}
1532
1533static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1534{
1535 struct rq *rq = rq_of(cfs_rq);
1536 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1537 struct sched_entity *se;
1538 int enqueue = 1;
1539 long task_delta;
1540
1541 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1542
1543 cfs_rq->throttled = 0;
1544 raw_spin_lock(&cfs_b->lock);
1545 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
1546 list_del_rcu(&cfs_rq->throttled_list);
1547 raw_spin_unlock(&cfs_b->lock);
1548 cfs_rq->throttled_timestamp = 0;
1549
1550 update_rq_clock(rq);
1551
1552 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
1553
1554 if (!cfs_rq->load.weight)
1555 return;
1556
1557 task_delta = cfs_rq->h_nr_running;
1558 for_each_sched_entity(se) {
1559 if (se->on_rq)
1560 enqueue = 0;
1561
1562 cfs_rq = cfs_rq_of(se);
1563 if (enqueue)
1564 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
1565 cfs_rq->h_nr_running += task_delta;
1566
1567 if (cfs_rq_throttled(cfs_rq))
1568 break;
1569 }
1570
1571 if (!se)
1572 rq->nr_running += task_delta;
1573
1574
1575 if (rq->curr == rq->idle && rq->cfs.nr_running)
1576 resched_task(rq->curr);
1577}
1578
1579static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
1580 u64 remaining, u64 expires)
1581{
1582 struct cfs_rq *cfs_rq;
1583 u64 runtime = remaining;
1584
1585 rcu_read_lock();
1586 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
1587 throttled_list) {
1588 struct rq *rq = rq_of(cfs_rq);
1589
1590 raw_spin_lock(&rq->lock);
1591 if (!cfs_rq_throttled(cfs_rq))
1592 goto next;
1593
1594 runtime = -cfs_rq->runtime_remaining + 1;
1595 if (runtime > remaining)
1596 runtime = remaining;
1597 remaining -= runtime;
1598
1599 cfs_rq->runtime_remaining += runtime;
1600 cfs_rq->runtime_expires = expires;
1601
1602
1603 if (cfs_rq->runtime_remaining > 0)
1604 unthrottle_cfs_rq(cfs_rq);
1605
1606next:
1607 raw_spin_unlock(&rq->lock);
1608
1609 if (!remaining)
1610 break;
1611 }
1612 rcu_read_unlock();
1613
1614 return remaining;
1615}
1616
1617
1618
1619
1620
1621
1622
1623static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
1624{
1625 u64 runtime, runtime_expires;
1626 int idle = 1, throttled;
1627
1628 raw_spin_lock(&cfs_b->lock);
1629
1630 if (cfs_b->quota == RUNTIME_INF)
1631 goto out_unlock;
1632
1633 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1634
1635 idle = cfs_b->idle && !throttled;
1636 cfs_b->nr_periods += overrun;
1637
1638
1639 if (idle)
1640 goto out_unlock;
1641
1642 __refill_cfs_bandwidth_runtime(cfs_b);
1643
1644 if (!throttled) {
1645
1646 cfs_b->idle = 1;
1647 goto out_unlock;
1648 }
1649
1650
1651 cfs_b->nr_throttled += overrun;
1652
1653
1654
1655
1656
1657
1658
1659 runtime = cfs_b->runtime;
1660 runtime_expires = cfs_b->runtime_expires;
1661 cfs_b->runtime = 0;
1662
1663
1664
1665
1666
1667
1668 while (throttled && runtime > 0) {
1669 raw_spin_unlock(&cfs_b->lock);
1670
1671 runtime = distribute_cfs_runtime(cfs_b, runtime,
1672 runtime_expires);
1673 raw_spin_lock(&cfs_b->lock);
1674
1675 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1676 }
1677
1678
1679 cfs_b->runtime = runtime;
1680
1681
1682
1683
1684
1685
1686 cfs_b->idle = 0;
1687out_unlock:
1688 if (idle)
1689 cfs_b->timer_active = 0;
1690 raw_spin_unlock(&cfs_b->lock);
1691
1692 return idle;
1693}
1694
1695
1696static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
1697
1698static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
1699
1700static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
1701
1702
1703static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
1704{
1705 struct hrtimer *refresh_timer = &cfs_b->period_timer;
1706 u64 remaining;
1707
1708
1709 if (hrtimer_callback_running(refresh_timer))
1710 return 1;
1711
1712
1713 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
1714 if (remaining < min_expire)
1715 return 1;
1716
1717 return 0;
1718}
1719
1720static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
1721{
1722 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
1723
1724
1725 if (runtime_refresh_within(cfs_b, min_left))
1726 return;
1727
1728 start_bandwidth_timer(&cfs_b->slack_timer,
1729 ns_to_ktime(cfs_bandwidth_slack_period));
1730}
1731
1732
1733static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1734{
1735 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1736 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
1737
1738 if (slack_runtime <= 0)
1739 return;
1740
1741 raw_spin_lock(&cfs_b->lock);
1742 if (cfs_b->quota != RUNTIME_INF &&
1743 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
1744 cfs_b->runtime += slack_runtime;
1745
1746
1747 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
1748 !list_empty(&cfs_b->throttled_cfs_rq))
1749 start_cfs_slack_bandwidth(cfs_b);
1750 }
1751 raw_spin_unlock(&cfs_b->lock);
1752
1753
1754 cfs_rq->runtime_remaining -= slack_runtime;
1755}
1756
1757static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1758{
1759 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
1760 return;
1761
1762 __return_cfs_rq_runtime(cfs_rq);
1763}
1764
1765
1766
1767
1768
1769static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1770{
1771 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
1772 u64 expires;
1773
1774
1775 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
1776 return;
1777
1778 raw_spin_lock(&cfs_b->lock);
1779 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
1780 runtime = cfs_b->runtime;
1781 cfs_b->runtime = 0;
1782 }
1783 expires = cfs_b->runtime_expires;
1784 raw_spin_unlock(&cfs_b->lock);
1785
1786 if (!runtime)
1787 return;
1788
1789 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
1790
1791 raw_spin_lock(&cfs_b->lock);
1792 if (expires == cfs_b->runtime_expires)
1793 cfs_b->runtime = runtime;
1794 raw_spin_unlock(&cfs_b->lock);
1795}
1796
1797
1798
1799
1800
1801
1802static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1803{
1804
1805 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1806 return;
1807
1808
1809 if (cfs_rq_throttled(cfs_rq))
1810 return;
1811
1812
1813 account_cfs_rq_runtime(cfs_rq, 0);
1814 if (cfs_rq->runtime_remaining <= 0)
1815 throttle_cfs_rq(cfs_rq);
1816}
1817
1818
1819static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1820{
1821 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1822 return;
1823
1824
1825
1826
1827
1828 if (cfs_rq_throttled(cfs_rq))
1829 return;
1830
1831 throttle_cfs_rq(cfs_rq);
1832}
1833#else
1834static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1835 unsigned long delta_exec) {}
1836static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1837static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
1838static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1839
1840static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1841{
1842 return 0;
1843}
1844
1845static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1846{
1847 return 0;
1848}
1849
1850static inline int throttled_lb_pair(struct task_group *tg,
1851 int src_cpu, int dest_cpu)
1852{
1853 return 0;
1854}
1855#endif
1856
1857
1858
1859
1860
1861#ifdef CONFIG_SCHED_HRTICK
1862static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
1863{
1864 struct sched_entity *se = &p->se;
1865 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1866
1867 WARN_ON(task_rq(p) != rq);
1868
1869 if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
1870 u64 slice = sched_slice(cfs_rq, se);
1871 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1872 s64 delta = slice - ran;
1873
1874 if (delta < 0) {
1875 if (rq->curr == p)
1876 resched_task(p);
1877 return;
1878 }
1879
1880
1881
1882
1883
1884 if (rq->curr != p)
1885 delta = max_t(s64, 10000LL, delta);
1886
1887 hrtick_start(rq, delta);
1888 }
1889}
1890
1891
1892
1893
1894
1895
1896static void hrtick_update(struct rq *rq)
1897{
1898 struct task_struct *curr = rq->curr;
1899
1900 if (curr->sched_class != &fair_sched_class)
1901 return;
1902
1903 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
1904 hrtick_start_fair(rq, curr);
1905}
1906#else
1907static inline void
1908hrtick_start_fair(struct rq *rq, struct task_struct *p)
1909{
1910}
1911
1912static inline void hrtick_update(struct rq *rq)
1913{
1914}
1915#endif
1916
1917
1918
1919
1920
1921
1922static void
1923enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1924{
1925 struct cfs_rq *cfs_rq;
1926 struct sched_entity *se = &p->se;
1927
1928 for_each_sched_entity(se) {
1929 if (se->on_rq)
1930 break;
1931 cfs_rq = cfs_rq_of(se);
1932 enqueue_entity(cfs_rq, se, flags);
1933
1934
1935
1936
1937
1938
1939
1940 if (cfs_rq_throttled(cfs_rq))
1941 break;
1942 cfs_rq->h_nr_running++;
1943
1944 flags = ENQUEUE_WAKEUP;
1945 }
1946
1947 for_each_sched_entity(se) {
1948 cfs_rq = cfs_rq_of(se);
1949 cfs_rq->h_nr_running++;
1950
1951 if (cfs_rq_throttled(cfs_rq))
1952 break;
1953
1954 update_cfs_load(cfs_rq, 0);
1955 update_cfs_shares(cfs_rq);
1956 }
1957
1958 if (!se)
1959 inc_nr_running(rq);
1960 hrtick_update(rq);
1961}
1962
1963static void set_next_buddy(struct sched_entity *se);
1964
1965
1966
1967
1968
1969
1970static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1971{
1972 struct cfs_rq *cfs_rq;
1973 struct sched_entity *se = &p->se;
1974 int task_sleep = flags & DEQUEUE_SLEEP;
1975
1976 for_each_sched_entity(se) {
1977 cfs_rq = cfs_rq_of(se);
1978 dequeue_entity(cfs_rq, se, flags);
1979
1980
1981
1982
1983
1984
1985
1986 if (cfs_rq_throttled(cfs_rq))
1987 break;
1988 cfs_rq->h_nr_running--;
1989
1990
1991 if (cfs_rq->load.weight) {
1992
1993
1994
1995
1996 if (task_sleep && parent_entity(se))
1997 set_next_buddy(parent_entity(se));
1998
1999
2000 se = parent_entity(se);
2001 break;
2002 }
2003 flags |= DEQUEUE_SLEEP;
2004 }
2005
2006 for_each_sched_entity(se) {
2007 cfs_rq = cfs_rq_of(se);
2008 cfs_rq->h_nr_running--;
2009
2010 if (cfs_rq_throttled(cfs_rq))
2011 break;
2012
2013 update_cfs_load(cfs_rq, 0);
2014 update_cfs_shares(cfs_rq);
2015 }
2016
2017 if (!se)
2018 dec_nr_running(rq);
2019 hrtick_update(rq);
2020}
2021
2022#ifdef CONFIG_SMP
2023
2024static void task_waking_fair(struct task_struct *p)
2025{
2026 struct sched_entity *se = &p->se;
2027 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2028 u64 min_vruntime;
2029
2030#ifndef CONFIG_64BIT
2031 u64 min_vruntime_copy;
2032
2033 do {
2034 min_vruntime_copy = cfs_rq->min_vruntime_copy;
2035 smp_rmb();
2036 min_vruntime = cfs_rq->min_vruntime;
2037 } while (min_vruntime != min_vruntime_copy);
2038#else
2039 min_vruntime = cfs_rq->min_vruntime;
2040#endif
2041
2042 se->vruntime -= min_vruntime;
2043}
2044
2045#ifdef CONFIG_FAIR_GROUP_SCHED
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
2097{
2098 struct sched_entity *se = tg->se[cpu];
2099
2100 if (!tg->parent)
2101 return wl;
2102
2103 for_each_sched_entity(se) {
2104 long w, W;
2105
2106 tg = se->my_q->tg;
2107
2108
2109
2110
2111 W = wg + calc_tg_weight(tg, se->my_q);
2112
2113
2114
2115
2116 w = se->my_q->load.weight + wl;
2117
2118
2119
2120
2121 if (W > 0 && w < W)
2122 wl = (w * tg->shares) / W;
2123 else
2124 wl = tg->shares;
2125
2126
2127
2128
2129
2130
2131 if (wl < MIN_SHARES)
2132 wl = MIN_SHARES;
2133
2134
2135
2136
2137 wl -= se->load.weight;
2138
2139
2140
2141
2142
2143
2144
2145
2146 wg = 0;
2147 }
2148
2149 return wl;
2150}
2151#else
2152
2153static inline unsigned long effective_load(struct task_group *tg, int cpu,
2154 unsigned long wl, unsigned long wg)
2155{
2156 return wl;
2157}
2158
2159#endif
2160
2161static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
2162{
2163 s64 this_load, load;
2164 int idx, this_cpu, prev_cpu;
2165 unsigned long tl_per_task;
2166 struct task_group *tg;
2167 unsigned long weight;
2168 int balanced;
2169
2170 idx = sd->wake_idx;
2171 this_cpu = smp_processor_id();
2172 prev_cpu = task_cpu(p);
2173 load = source_load(prev_cpu, idx);
2174 this_load = target_load(this_cpu, idx);
2175
2176
2177
2178
2179
2180
2181 if (sync) {
2182 tg = task_group(current);
2183 weight = current->se.load.weight;
2184
2185 this_load += effective_load(tg, this_cpu, -weight, -weight);
2186 load += effective_load(tg, prev_cpu, 0, -weight);
2187 }
2188
2189 tg = task_group(p);
2190 weight = p->se.load.weight;
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201 if (this_load > 0) {
2202 s64 this_eff_load, prev_eff_load;
2203
2204 this_eff_load = 100;
2205 this_eff_load *= power_of(prev_cpu);
2206 this_eff_load *= this_load +
2207 effective_load(tg, this_cpu, weight, weight);
2208
2209 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
2210 prev_eff_load *= power_of(this_cpu);
2211 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
2212
2213 balanced = this_eff_load <= prev_eff_load;
2214 } else
2215 balanced = true;
2216
2217
2218
2219
2220
2221
2222 if (sync && balanced)
2223 return 1;
2224
2225 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
2226 tl_per_task = cpu_avg_load_per_task(this_cpu);
2227
2228 if (balanced ||
2229 (this_load <= load &&
2230 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
2231
2232
2233
2234
2235
2236 schedstat_inc(sd, ttwu_move_affine);
2237 schedstat_inc(p, se.statistics.nr_wakeups_affine);
2238
2239 return 1;
2240 }
2241 return 0;
2242}
2243
2244
2245
2246
2247
2248static struct sched_group *
2249find_idlest_group(struct sched_domain *sd, struct task_struct *p,
2250 int this_cpu, int load_idx)
2251{
2252 struct sched_group *idlest = NULL, *group = sd->groups;
2253 unsigned long min_load = ULONG_MAX, this_load = 0;
2254 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2255
2256 do {
2257 unsigned long load, avg_load;
2258 int local_group;
2259 int i;
2260
2261
2262 if (!cpumask_intersects(sched_group_cpus(group),
2263 tsk_cpus_allowed(p)))
2264 continue;
2265
2266 local_group = cpumask_test_cpu(this_cpu,
2267 sched_group_cpus(group));
2268
2269
2270 avg_load = 0;
2271
2272 for_each_cpu(i, sched_group_cpus(group)) {
2273
2274 if (local_group)
2275 load = source_load(i, load_idx);
2276 else
2277 load = target_load(i, load_idx);
2278
2279 avg_load += load;
2280 }
2281
2282
2283 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
2284
2285 if (local_group) {
2286 this_load = avg_load;
2287 } else if (avg_load < min_load) {
2288 min_load = avg_load;
2289 idlest = group;
2290 }
2291 } while (group = group->next, group != sd->groups);
2292
2293 if (!idlest || 100*this_load < imbalance*min_load)
2294 return NULL;
2295 return idlest;
2296}
2297
2298
2299
2300
2301static int
2302find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2303{
2304 unsigned long load, min_load = ULONG_MAX;
2305 int idlest = -1;
2306 int i;
2307
2308
2309 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
2310 load = weighted_cpuload(i);
2311
2312 if (load < min_load || (load == min_load && i == this_cpu)) {
2313 min_load = load;
2314 idlest = i;
2315 }
2316 }
2317
2318 return idlest;
2319}
2320
2321
2322
2323
2324static int select_idle_sibling(struct task_struct *p, int target)
2325{
2326 int cpu = smp_processor_id();
2327 int prev_cpu = task_cpu(p);
2328 struct sched_domain *sd;
2329 struct sched_group *sg;
2330 int i, smt = 0;
2331
2332
2333
2334
2335
2336 if (target == cpu && idle_cpu(cpu))
2337 return cpu;
2338
2339
2340
2341
2342
2343 if (target == prev_cpu && idle_cpu(prev_cpu))
2344 return prev_cpu;
2345
2346
2347
2348
2349 rcu_read_lock();
2350again:
2351 for_each_domain(target, sd) {
2352 if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
2353 continue;
2354
2355 if (smt && !(sd->flags & SD_SHARE_CPUPOWER))
2356 break;
2357
2358 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
2359 break;
2360
2361 sg = sd->groups;
2362 do {
2363 if (!cpumask_intersects(sched_group_cpus(sg),
2364 tsk_cpus_allowed(p)))
2365 goto next;
2366
2367 for_each_cpu(i, sched_group_cpus(sg)) {
2368 if (!idle_cpu(i))
2369 goto next;
2370 }
2371
2372 target = cpumask_first_and(sched_group_cpus(sg),
2373 tsk_cpus_allowed(p));
2374 goto done;
2375next:
2376 sg = sg->next;
2377 } while (sg != sd->groups);
2378 }
2379 if (!smt) {
2380 smt = 1;
2381 goto again;
2382 }
2383done:
2384 rcu_read_unlock();
2385
2386 return target;
2387}
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400static int
2401select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2402{
2403 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
2404 int cpu = smp_processor_id();
2405 int prev_cpu = task_cpu(p);
2406 int new_cpu = cpu;
2407 int want_affine = 0;
2408 int want_sd = 1;
2409 int sync = wake_flags & WF_SYNC;
2410
2411 if (sd_flag & SD_BALANCE_WAKE) {
2412 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
2413 want_affine = 1;
2414 new_cpu = prev_cpu;
2415 }
2416
2417 rcu_read_lock();
2418 for_each_domain(cpu, tmp) {
2419 if (!(tmp->flags & SD_LOAD_BALANCE))
2420 continue;
2421
2422
2423
2424
2425
2426 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
2427 unsigned long power = 0;
2428 unsigned long nr_running = 0;
2429 unsigned long capacity;
2430 int i;
2431
2432 for_each_cpu(i, sched_domain_span(tmp)) {
2433 power += power_of(i);
2434 nr_running += cpu_rq(i)->cfs.nr_running;
2435 }
2436
2437 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
2438
2439 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2440 nr_running /= 2;
2441
2442 if (nr_running < capacity)
2443 want_sd = 0;
2444 }
2445
2446
2447
2448
2449
2450 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
2451 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
2452 affine_sd = tmp;
2453 want_affine = 0;
2454 }
2455
2456 if (!want_sd && !want_affine)
2457 break;
2458
2459 if (!(tmp->flags & sd_flag))
2460 continue;
2461
2462 if (want_sd)
2463 sd = tmp;
2464 }
2465
2466 if (affine_sd) {
2467 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
2468 prev_cpu = cpu;
2469
2470 new_cpu = select_idle_sibling(p, prev_cpu);
2471 goto unlock;
2472 }
2473
2474 while (sd) {
2475 int load_idx = sd->forkexec_idx;
2476 struct sched_group *group;
2477 int weight;
2478
2479 if (!(sd->flags & sd_flag)) {
2480 sd = sd->child;
2481 continue;
2482 }
2483
2484 if (sd_flag & SD_BALANCE_WAKE)
2485 load_idx = sd->wake_idx;
2486
2487 group = find_idlest_group(sd, p, cpu, load_idx);
2488 if (!group) {
2489 sd = sd->child;
2490 continue;
2491 }
2492
2493 new_cpu = find_idlest_cpu(group, p, cpu);
2494 if (new_cpu == -1 || new_cpu == cpu) {
2495
2496 sd = sd->child;
2497 continue;
2498 }
2499
2500
2501 cpu = new_cpu;
2502 weight = sd->span_weight;
2503 sd = NULL;
2504 for_each_domain(cpu, tmp) {
2505 if (weight <= tmp->span_weight)
2506 break;
2507 if (tmp->flags & sd_flag)
2508 sd = tmp;
2509 }
2510
2511 }
2512unlock:
2513 rcu_read_unlock();
2514
2515 return new_cpu;
2516}
2517#endif
2518
2519static unsigned long
2520wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
2521{
2522 unsigned long gran = sysctl_sched_wakeup_granularity;
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537 return calc_delta_fair(gran, se);
2538}
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554static int
2555wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
2556{
2557 s64 gran, vdiff = curr->vruntime - se->vruntime;
2558
2559 if (vdiff <= 0)
2560 return -1;
2561
2562 gran = wakeup_gran(curr, se);
2563 if (vdiff > gran)
2564 return 1;
2565
2566 return 0;
2567}
2568
2569static void set_last_buddy(struct sched_entity *se)
2570{
2571 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
2572 return;
2573
2574 for_each_sched_entity(se)
2575 cfs_rq_of(se)->last = se;
2576}
2577
2578static void set_next_buddy(struct sched_entity *se)
2579{
2580 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
2581 return;
2582
2583 for_each_sched_entity(se)
2584 cfs_rq_of(se)->next = se;
2585}
2586
2587static void set_skip_buddy(struct sched_entity *se)
2588{
2589 for_each_sched_entity(se)
2590 cfs_rq_of(se)->skip = se;
2591}
2592
2593
2594
2595
2596static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2597{
2598 struct task_struct *curr = rq->curr;
2599 struct sched_entity *se = &curr->se, *pse = &p->se;
2600 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
2601 int scale = cfs_rq->nr_running >= sched_nr_latency;
2602 int next_buddy_marked = 0;
2603
2604 if (unlikely(se == pse))
2605 return;
2606
2607
2608
2609
2610
2611
2612
2613 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
2614 return;
2615
2616 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
2617 set_next_buddy(pse);
2618 next_buddy_marked = 1;
2619 }
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631 if (test_tsk_need_resched(curr))
2632 return;
2633
2634
2635 if (unlikely(curr->policy == SCHED_IDLE) &&
2636 likely(p->policy != SCHED_IDLE))
2637 goto preempt;
2638
2639
2640
2641
2642
2643 if (unlikely(p->policy != SCHED_NORMAL))
2644 return;
2645
2646 find_matching_se(&se, &pse);
2647 update_curr(cfs_rq_of(se));
2648 BUG_ON(!pse);
2649 if (wakeup_preempt_entity(se, pse) == 1) {
2650
2651
2652
2653
2654 if (!next_buddy_marked)
2655 set_next_buddy(pse);
2656 goto preempt;
2657 }
2658
2659 return;
2660
2661preempt:
2662 resched_task(curr);
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672 if (unlikely(!se->on_rq || curr == rq->idle))
2673 return;
2674
2675 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
2676 set_last_buddy(se);
2677}
2678
2679static struct task_struct *pick_next_task_fair(struct rq *rq)
2680{
2681 struct task_struct *p;
2682 struct cfs_rq *cfs_rq = &rq->cfs;
2683 struct sched_entity *se;
2684
2685 if (!cfs_rq->nr_running)
2686 return NULL;
2687
2688 do {
2689 se = pick_next_entity(cfs_rq);
2690 set_next_entity(cfs_rq, se);
2691 cfs_rq = group_cfs_rq(se);
2692 } while (cfs_rq);
2693
2694 p = task_of(se);
2695 hrtick_start_fair(rq, p);
2696
2697 return p;
2698}
2699
2700
2701
2702
2703static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
2704{
2705 struct sched_entity *se = &prev->se;
2706 struct cfs_rq *cfs_rq;
2707
2708 for_each_sched_entity(se) {
2709 cfs_rq = cfs_rq_of(se);
2710 put_prev_entity(cfs_rq, se);
2711 }
2712}
2713
2714
2715
2716
2717
2718
2719static void yield_task_fair(struct rq *rq)
2720{
2721 struct task_struct *curr = rq->curr;
2722 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
2723 struct sched_entity *se = &curr->se;
2724
2725
2726
2727
2728 if (unlikely(rq->nr_running == 1))
2729 return;
2730
2731 clear_buddies(cfs_rq, se);
2732
2733 if (curr->policy != SCHED_BATCH) {
2734 update_rq_clock(rq);
2735
2736
2737
2738 update_curr(cfs_rq);
2739 }
2740
2741 set_skip_buddy(se);
2742}
2743
2744static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
2745{
2746 struct sched_entity *se = &p->se;
2747
2748
2749 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
2750 return false;
2751
2752
2753 set_next_buddy(se);
2754
2755 yield_task_fair(rq);
2756
2757 return true;
2758}
2759
2760#ifdef CONFIG_SMP
2761
2762
2763
2764
2765
2766
2767
2768
2769static void pull_task(struct rq *src_rq, struct task_struct *p,
2770 struct rq *this_rq, int this_cpu)
2771{
2772 deactivate_task(src_rq, p, 0);
2773 set_task_cpu(p, this_cpu);
2774 activate_task(this_rq, p, 0);
2775 check_preempt_curr(this_rq, p, 0);
2776}
2777
2778
2779
2780
2781static
2782int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2783 struct sched_domain *sd, enum cpu_idle_type idle,
2784 int *all_pinned)
2785{
2786 int tsk_cache_hot = 0;
2787
2788
2789
2790
2791
2792
2793 if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
2794 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
2795 return 0;
2796 }
2797 *all_pinned = 0;
2798
2799 if (task_running(rq, p)) {
2800 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
2801 return 0;
2802 }
2803
2804
2805
2806
2807
2808
2809
2810 tsk_cache_hot = task_hot(p, rq->clock_task, sd);
2811 if (!tsk_cache_hot ||
2812 sd->nr_balance_failed > sd->cache_nice_tries) {
2813#ifdef CONFIG_SCHEDSTATS
2814 if (tsk_cache_hot) {
2815 schedstat_inc(sd, lb_hot_gained[idle]);
2816 schedstat_inc(p, se.statistics.nr_forced_migrations);
2817 }
2818#endif
2819 return 1;
2820 }
2821
2822 if (tsk_cache_hot) {
2823 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
2824 return 0;
2825 }
2826 return 1;
2827}
2828
2829
2830
2831
2832
2833
2834
2835
2836static int
2837move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2838 struct sched_domain *sd, enum cpu_idle_type idle)
2839{
2840 struct task_struct *p, *n;
2841 struct cfs_rq *cfs_rq;
2842 int pinned = 0;
2843
2844 for_each_leaf_cfs_rq(busiest, cfs_rq) {
2845 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
2846 if (throttled_lb_pair(task_group(p),
2847 busiest->cpu, this_cpu))
2848 break;
2849
2850 if (!can_migrate_task(p, busiest, this_cpu,
2851 sd, idle, &pinned))
2852 continue;
2853
2854 pull_task(busiest, p, this_rq, this_cpu);
2855
2856
2857
2858
2859
2860 schedstat_inc(sd, lb_gained[idle]);
2861 return 1;
2862 }
2863 }
2864
2865 return 0;
2866}
2867
2868static unsigned long
2869balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2870 unsigned long max_load_move, struct sched_domain *sd,
2871 enum cpu_idle_type idle, int *all_pinned,
2872 struct cfs_rq *busiest_cfs_rq)
2873{
2874 int loops = 0, pulled = 0;
2875 long rem_load_move = max_load_move;
2876 struct task_struct *p, *n;
2877
2878 if (max_load_move == 0)
2879 goto out;
2880
2881 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
2882 if (loops++ > sysctl_sched_nr_migrate)
2883 break;
2884
2885 if ((p->se.load.weight >> 1) > rem_load_move ||
2886 !can_migrate_task(p, busiest, this_cpu, sd, idle,
2887 all_pinned))
2888 continue;
2889
2890 pull_task(busiest, p, this_rq, this_cpu);
2891 pulled++;
2892 rem_load_move -= p->se.load.weight;
2893
2894#ifdef CONFIG_PREEMPT
2895
2896
2897
2898
2899
2900 if (idle == CPU_NEWLY_IDLE)
2901 break;
2902#endif
2903
2904
2905
2906
2907
2908 if (rem_load_move <= 0)
2909 break;
2910 }
2911out:
2912
2913
2914
2915
2916
2917 schedstat_add(sd, lb_gained[idle], pulled);
2918
2919 return max_load_move - rem_load_move;
2920}
2921
2922#ifdef CONFIG_FAIR_GROUP_SCHED
2923
2924
2925
2926static int update_shares_cpu(struct task_group *tg, int cpu)
2927{
2928 struct cfs_rq *cfs_rq;
2929 unsigned long flags;
2930 struct rq *rq;
2931
2932 if (!tg->se[cpu])
2933 return 0;
2934
2935 rq = cpu_rq(cpu);
2936 cfs_rq = tg->cfs_rq[cpu];
2937
2938 raw_spin_lock_irqsave(&rq->lock, flags);
2939
2940 update_rq_clock(rq);
2941 update_cfs_load(cfs_rq, 1);
2942
2943
2944
2945
2946
2947 update_cfs_shares(cfs_rq);
2948
2949 raw_spin_unlock_irqrestore(&rq->lock, flags);
2950
2951 return 0;
2952}
2953
2954static void update_shares(int cpu)
2955{
2956 struct cfs_rq *cfs_rq;
2957 struct rq *rq = cpu_rq(cpu);
2958
2959 rcu_read_lock();
2960
2961
2962
2963
2964 for_each_leaf_cfs_rq(rq, cfs_rq) {
2965
2966 if (throttled_hierarchy(cfs_rq))
2967 continue;
2968
2969 update_shares_cpu(cfs_rq->tg, cpu);
2970 }
2971 rcu_read_unlock();
2972}
2973
2974
2975
2976
2977
2978
2979static int tg_load_down(struct task_group *tg, void *data)
2980{
2981 unsigned long load;
2982 long cpu = (long)data;
2983
2984 if (!tg->parent) {
2985 load = cpu_rq(cpu)->load.weight;
2986 } else {
2987 load = tg->parent->cfs_rq[cpu]->h_load;
2988 load *= tg->se[cpu]->load.weight;
2989 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
2990 }
2991
2992 tg->cfs_rq[cpu]->h_load = load;
2993
2994 return 0;
2995}
2996
2997static void update_h_load(long cpu)
2998{
2999 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
3000}
3001
3002static unsigned long
3003load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3004 unsigned long max_load_move,
3005 struct sched_domain *sd, enum cpu_idle_type idle,
3006 int *all_pinned)
3007{
3008 long rem_load_move = max_load_move;
3009 struct cfs_rq *busiest_cfs_rq;
3010
3011 rcu_read_lock();
3012 update_h_load(cpu_of(busiest));
3013
3014 for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
3015 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
3016 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
3017 u64 rem_load, moved_load;
3018
3019
3020
3021
3022 if (!busiest_cfs_rq->task_weight ||
3023 throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
3024 continue;
3025
3026 rem_load = (u64)rem_load_move * busiest_weight;
3027 rem_load = div_u64(rem_load, busiest_h_load + 1);
3028
3029 moved_load = balance_tasks(this_rq, this_cpu, busiest,
3030 rem_load, sd, idle, all_pinned,
3031 busiest_cfs_rq);
3032
3033 if (!moved_load)
3034 continue;
3035
3036 moved_load *= busiest_h_load;
3037 moved_load = div_u64(moved_load, busiest_weight + 1);
3038
3039 rem_load_move -= moved_load;
3040 if (rem_load_move < 0)
3041 break;
3042 }
3043 rcu_read_unlock();
3044
3045 return max_load_move - rem_load_move;
3046}
3047#else
3048static inline void update_shares(int cpu)
3049{
3050}
3051
3052static unsigned long
3053load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3054 unsigned long max_load_move,
3055 struct sched_domain *sd, enum cpu_idle_type idle,
3056 int *all_pinned)
3057{
3058 return balance_tasks(this_rq, this_cpu, busiest,
3059 max_load_move, sd, idle, all_pinned,
3060 &busiest->cfs);
3061}
3062#endif
3063
3064
3065
3066
3067
3068
3069
3070
3071static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3072 unsigned long max_load_move,
3073 struct sched_domain *sd, enum cpu_idle_type idle,
3074 int *all_pinned)
3075{
3076 unsigned long total_load_moved = 0, load_moved;
3077
3078 do {
3079 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
3080 max_load_move - total_load_moved,
3081 sd, idle, all_pinned);
3082
3083 total_load_moved += load_moved;
3084
3085#ifdef CONFIG_PREEMPT
3086
3087
3088
3089
3090
3091 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3092 break;
3093
3094 if (raw_spin_is_contended(&this_rq->lock) ||
3095 raw_spin_is_contended(&busiest->lock))
3096 break;
3097#endif
3098 } while (load_moved && max_load_move > total_load_moved);
3099
3100 return total_load_moved > 0;
3101}
3102
3103
3104
3105
3106
3107
3108struct sd_lb_stats {
3109 struct sched_group *busiest;
3110 struct sched_group *this;
3111 unsigned long total_load;
3112 unsigned long total_pwr;
3113 unsigned long avg_load;
3114
3115
3116 unsigned long this_load;
3117 unsigned long this_load_per_task;
3118 unsigned long this_nr_running;
3119 unsigned long this_has_capacity;
3120 unsigned int this_idle_cpus;
3121
3122
3123 unsigned int busiest_idle_cpus;
3124 unsigned long max_load;
3125 unsigned long busiest_load_per_task;
3126 unsigned long busiest_nr_running;
3127 unsigned long busiest_group_capacity;
3128 unsigned long busiest_has_capacity;
3129 unsigned int busiest_group_weight;
3130
3131 int group_imb;
3132#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3133 int power_savings_balance;
3134 struct sched_group *group_min;
3135 struct sched_group *group_leader;
3136 unsigned long min_load_per_task;
3137 unsigned long leader_nr_running;
3138 unsigned long min_nr_running;
3139#endif
3140};
3141
3142
3143
3144
3145struct sg_lb_stats {
3146 unsigned long avg_load;
3147 unsigned long group_load;
3148 unsigned long sum_nr_running;
3149 unsigned long sum_weighted_load;
3150 unsigned long group_capacity;
3151 unsigned long idle_cpus;
3152 unsigned long group_weight;
3153 int group_imb;
3154 int group_has_capacity;
3155};
3156
3157
3158
3159
3160
3161static inline unsigned int group_first_cpu(struct sched_group *group)
3162{
3163 return cpumask_first(sched_group_cpus(group));
3164}
3165
3166
3167
3168
3169
3170
3171static inline int get_sd_load_idx(struct sched_domain *sd,
3172 enum cpu_idle_type idle)
3173{
3174 int load_idx;
3175
3176 switch (idle) {
3177 case CPU_NOT_IDLE:
3178 load_idx = sd->busy_idx;
3179 break;
3180
3181 case CPU_NEWLY_IDLE:
3182 load_idx = sd->newidle_idx;
3183 break;
3184 default:
3185 load_idx = sd->idle_idx;
3186 break;
3187 }
3188
3189 return load_idx;
3190}
3191
3192
3193#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3194
3195
3196
3197
3198
3199
3200
3201
3202static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3203 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3204{
3205
3206
3207
3208
3209 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3210 sds->power_savings_balance = 0;
3211 else {
3212 sds->power_savings_balance = 1;
3213 sds->min_nr_running = ULONG_MAX;
3214 sds->leader_nr_running = 0;
3215 }
3216}
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228static inline void update_sd_power_savings_stats(struct sched_group *group,
3229 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3230{
3231
3232 if (!sds->power_savings_balance)
3233 return;
3234
3235
3236
3237
3238
3239 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3240 !sds->this_nr_running))
3241 sds->power_savings_balance = 0;
3242
3243
3244
3245
3246
3247 if (!sds->power_savings_balance ||
3248 sgs->sum_nr_running >= sgs->group_capacity ||
3249 !sgs->sum_nr_running)
3250 return;
3251
3252
3253
3254
3255
3256
3257 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3258 (sgs->sum_nr_running == sds->min_nr_running &&
3259 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3260 sds->group_min = group;
3261 sds->min_nr_running = sgs->sum_nr_running;
3262 sds->min_load_per_task = sgs->sum_weighted_load /
3263 sgs->sum_nr_running;
3264 }
3265
3266
3267
3268
3269
3270
3271 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3272 return;
3273
3274 if (sgs->sum_nr_running > sds->leader_nr_running ||
3275 (sgs->sum_nr_running == sds->leader_nr_running &&
3276 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3277 sds->group_leader = group;
3278 sds->leader_nr_running = sgs->sum_nr_running;
3279 }
3280}
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3298 int this_cpu, unsigned long *imbalance)
3299{
3300 if (!sds->power_savings_balance)
3301 return 0;
3302
3303 if (sds->this != sds->group_leader ||
3304 sds->group_leader == sds->group_min)
3305 return 0;
3306
3307 *imbalance = sds->min_load_per_task;
3308 sds->busiest = sds->group_min;
3309
3310 return 1;
3311
3312}
3313#else
3314static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3315 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3316{
3317 return;
3318}
3319
3320static inline void update_sd_power_savings_stats(struct sched_group *group,
3321 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3322{
3323 return;
3324}
3325
3326static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3327 int this_cpu, unsigned long *imbalance)
3328{
3329 return 0;
3330}
3331#endif
3332
3333
3334unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3335{
3336 return SCHED_POWER_SCALE;
3337}
3338
3339unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3340{
3341 return default_scale_freq_power(sd, cpu);
3342}
3343
3344unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3345{
3346 unsigned long weight = sd->span_weight;
3347 unsigned long smt_gain = sd->smt_gain;
3348
3349 smt_gain /= weight;
3350
3351 return smt_gain;
3352}
3353
3354unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3355{
3356 return default_scale_smt_power(sd, cpu);
3357}
3358
3359unsigned long scale_rt_power(int cpu)
3360{
3361 struct rq *rq = cpu_rq(cpu);
3362 u64 total, available;
3363
3364 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3365
3366 if (unlikely(total < rq->rt_avg)) {
3367
3368 available = 0;
3369 } else {
3370 available = total - rq->rt_avg;
3371 }
3372
3373 if (unlikely((s64)total < SCHED_POWER_SCALE))
3374 total = SCHED_POWER_SCALE;
3375
3376 total >>= SCHED_POWER_SHIFT;
3377
3378 return div_u64(available, total);
3379}
3380
3381static void update_cpu_power(struct sched_domain *sd, int cpu)
3382{
3383 unsigned long weight = sd->span_weight;
3384 unsigned long power = SCHED_POWER_SCALE;
3385 struct sched_group *sdg = sd->groups;
3386
3387 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3388 if (sched_feat(ARCH_POWER))
3389 power *= arch_scale_smt_power(sd, cpu);
3390 else
3391 power *= default_scale_smt_power(sd, cpu);
3392
3393 power >>= SCHED_POWER_SHIFT;
3394 }
3395
3396 sdg->sgp->power_orig = power;
3397
3398 if (sched_feat(ARCH_POWER))
3399 power *= arch_scale_freq_power(sd, cpu);
3400 else
3401 power *= default_scale_freq_power(sd, cpu);
3402
3403 power >>= SCHED_POWER_SHIFT;
3404
3405 power *= scale_rt_power(cpu);
3406 power >>= SCHED_POWER_SHIFT;
3407
3408 if (!power)
3409 power = 1;
3410
3411 cpu_rq(cpu)->cpu_power = power;
3412 sdg->sgp->power = power;
3413}
3414
3415static void update_group_power(struct sched_domain *sd, int cpu)
3416{
3417 struct sched_domain *child = sd->child;
3418 struct sched_group *group, *sdg = sd->groups;
3419 unsigned long power;
3420
3421 if (!child) {
3422 update_cpu_power(sd, cpu);
3423 return;
3424 }
3425
3426 power = 0;
3427
3428 group = child->groups;
3429 do {
3430 power += group->sgp->power;
3431 group = group->next;
3432 } while (group != child->groups);
3433
3434 sdg->sgp->power = power;
3435}
3436
3437
3438
3439
3440
3441
3442
3443
3444static inline int
3445fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3446{
3447
3448
3449
3450 if (!(sd->flags & SD_SHARE_CPUPOWER))
3451 return 0;
3452
3453
3454
3455
3456 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
3457 return 1;
3458
3459 return 0;
3460}
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474static inline void update_sg_lb_stats(struct sched_domain *sd,
3475 struct sched_group *group, int this_cpu,
3476 enum cpu_idle_type idle, int load_idx,
3477 int local_group, const struct cpumask *cpus,
3478 int *balance, struct sg_lb_stats *sgs)
3479{
3480 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
3481 int i;
3482 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3483 unsigned long avg_load_per_task = 0;
3484
3485 if (local_group)
3486 balance_cpu = group_first_cpu(group);
3487
3488
3489 max_cpu_load = 0;
3490 min_cpu_load = ~0UL;
3491 max_nr_running = 0;
3492
3493 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3494 struct rq *rq = cpu_rq(i);
3495
3496
3497 if (local_group) {
3498 if (idle_cpu(i) && !first_idle_cpu) {
3499 first_idle_cpu = 1;
3500 balance_cpu = i;
3501 }
3502
3503 load = target_load(i, load_idx);
3504 } else {
3505 load = source_load(i, load_idx);
3506 if (load > max_cpu_load) {
3507 max_cpu_load = load;
3508 max_nr_running = rq->nr_running;
3509 }
3510 if (min_cpu_load > load)
3511 min_cpu_load = load;
3512 }
3513
3514 sgs->group_load += load;
3515 sgs->sum_nr_running += rq->nr_running;
3516 sgs->sum_weighted_load += weighted_cpuload(i);
3517 if (idle_cpu(i))
3518 sgs->idle_cpus++;
3519 }
3520
3521
3522
3523
3524
3525
3526
3527 if (idle != CPU_NEWLY_IDLE && local_group) {
3528 if (balance_cpu != this_cpu) {
3529 *balance = 0;
3530 return;
3531 }
3532 update_group_power(sd, this_cpu);
3533 }
3534
3535
3536 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547 if (sgs->sum_nr_running)
3548 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
3549
3550 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
3551 sgs->group_imb = 1;
3552
3553 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
3554 SCHED_POWER_SCALE);
3555 if (!sgs->group_capacity)
3556 sgs->group_capacity = fix_small_capacity(sd, group);
3557 sgs->group_weight = group->group_weight;
3558
3559 if (sgs->group_capacity > sgs->sum_nr_running)
3560 sgs->group_has_capacity = 1;
3561}
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574static bool update_sd_pick_busiest(struct sched_domain *sd,
3575 struct sd_lb_stats *sds,
3576 struct sched_group *sg,
3577 struct sg_lb_stats *sgs,
3578 int this_cpu)
3579{
3580 if (sgs->avg_load <= sds->max_load)
3581 return false;
3582
3583 if (sgs->sum_nr_running > sgs->group_capacity)
3584 return true;
3585
3586 if (sgs->group_imb)
3587 return true;
3588
3589
3590
3591
3592
3593
3594 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
3595 this_cpu < group_first_cpu(sg)) {
3596 if (!sds->busiest)
3597 return true;
3598
3599 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
3600 return true;
3601 }
3602
3603 return false;
3604}
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3616 enum cpu_idle_type idle, const struct cpumask *cpus,
3617 int *balance, struct sd_lb_stats *sds)
3618{
3619 struct sched_domain *child = sd->child;
3620 struct sched_group *sg = sd->groups;
3621 struct sg_lb_stats sgs;
3622 int load_idx, prefer_sibling = 0;
3623
3624 if (child && child->flags & SD_PREFER_SIBLING)
3625 prefer_sibling = 1;
3626
3627 init_sd_power_savings_stats(sd, sds, idle);
3628 load_idx = get_sd_load_idx(sd, idle);
3629
3630 do {
3631 int local_group;
3632
3633 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
3634 memset(&sgs, 0, sizeof(sgs));
3635 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
3636 local_group, cpus, balance, &sgs);
3637
3638 if (local_group && !(*balance))
3639 return;
3640
3641 sds->total_load += sgs.group_load;
3642 sds->total_pwr += sg->sgp->power;
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654 if (prefer_sibling && !local_group && sds->this_has_capacity)
3655 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3656
3657 if (local_group) {
3658 sds->this_load = sgs.avg_load;
3659 sds->this = sg;
3660 sds->this_nr_running = sgs.sum_nr_running;
3661 sds->this_load_per_task = sgs.sum_weighted_load;
3662 sds->this_has_capacity = sgs.group_has_capacity;
3663 sds->this_idle_cpus = sgs.idle_cpus;
3664 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
3665 sds->max_load = sgs.avg_load;
3666 sds->busiest = sg;
3667 sds->busiest_nr_running = sgs.sum_nr_running;
3668 sds->busiest_idle_cpus = sgs.idle_cpus;
3669 sds->busiest_group_capacity = sgs.group_capacity;
3670 sds->busiest_load_per_task = sgs.sum_weighted_load;
3671 sds->busiest_has_capacity = sgs.group_has_capacity;
3672 sds->busiest_group_weight = sgs.group_weight;
3673 sds->group_imb = sgs.group_imb;
3674 }
3675
3676 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
3677 sg = sg->next;
3678 } while (sg != sd->groups);
3679}
3680
3681int __weak arch_sd_sibling_asym_packing(void)
3682{
3683 return 0*SD_ASYM_PACKING;
3684}
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711static int check_asym_packing(struct sched_domain *sd,
3712 struct sd_lb_stats *sds,
3713 int this_cpu, unsigned long *imbalance)
3714{
3715 int busiest_cpu;
3716
3717 if (!(sd->flags & SD_ASYM_PACKING))
3718 return 0;
3719
3720 if (!sds->busiest)
3721 return 0;
3722
3723 busiest_cpu = group_first_cpu(sds->busiest);
3724 if (this_cpu > busiest_cpu)
3725 return 0;
3726
3727 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
3728 SCHED_POWER_SCALE);
3729 return 1;
3730}
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3741 int this_cpu, unsigned long *imbalance)
3742{
3743 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3744 unsigned int imbn = 2;
3745 unsigned long scaled_busy_load_per_task;
3746
3747 if (sds->this_nr_running) {
3748 sds->this_load_per_task /= sds->this_nr_running;
3749 if (sds->busiest_load_per_task >
3750 sds->this_load_per_task)
3751 imbn = 1;
3752 } else
3753 sds->this_load_per_task =
3754 cpu_avg_load_per_task(this_cpu);
3755
3756 scaled_busy_load_per_task = sds->busiest_load_per_task
3757 * SCHED_POWER_SCALE;
3758 scaled_busy_load_per_task /= sds->busiest->sgp->power;
3759
3760 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
3761 (scaled_busy_load_per_task * imbn)) {
3762 *imbalance = sds->busiest_load_per_task;
3763 return;
3764 }
3765
3766
3767
3768
3769
3770
3771
3772 pwr_now += sds->busiest->sgp->power *
3773 min(sds->busiest_load_per_task, sds->max_load);
3774 pwr_now += sds->this->sgp->power *
3775 min(sds->this_load_per_task, sds->this_load);
3776 pwr_now /= SCHED_POWER_SCALE;
3777
3778
3779 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
3780 sds->busiest->sgp->power;
3781 if (sds->max_load > tmp)
3782 pwr_move += sds->busiest->sgp->power *
3783 min(sds->busiest_load_per_task, sds->max_load - tmp);
3784
3785
3786 if (sds->max_load * sds->busiest->sgp->power <
3787 sds->busiest_load_per_task * SCHED_POWER_SCALE)
3788 tmp = (sds->max_load * sds->busiest->sgp->power) /
3789 sds->this->sgp->power;
3790 else
3791 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
3792 sds->this->sgp->power;
3793 pwr_move += sds->this->sgp->power *
3794 min(sds->this_load_per_task, sds->this_load + tmp);
3795 pwr_move /= SCHED_POWER_SCALE;
3796
3797
3798 if (pwr_move > pwr_now)
3799 *imbalance = sds->busiest_load_per_task;
3800}
3801
3802
3803
3804
3805
3806
3807
3808
3809static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3810 unsigned long *imbalance)
3811{
3812 unsigned long max_pull, load_above_capacity = ~0UL;
3813
3814 sds->busiest_load_per_task /= sds->busiest_nr_running;
3815 if (sds->group_imb) {
3816 sds->busiest_load_per_task =
3817 min(sds->busiest_load_per_task, sds->avg_load);
3818 }
3819
3820
3821
3822
3823
3824
3825 if (sds->max_load < sds->avg_load) {
3826 *imbalance = 0;
3827 return fix_small_imbalance(sds, this_cpu, imbalance);
3828 }
3829
3830 if (!sds->group_imb) {
3831
3832
3833
3834 load_above_capacity = (sds->busiest_nr_running -
3835 sds->busiest_group_capacity);
3836
3837 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
3838
3839 load_above_capacity /= sds->busiest->sgp->power;
3840 }
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
3853
3854
3855 *imbalance = min(max_pull * sds->busiest->sgp->power,
3856 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
3857 / SCHED_POWER_SCALE;
3858
3859
3860
3861
3862
3863
3864
3865 if (*imbalance < sds->busiest_load_per_task)
3866 return fix_small_imbalance(sds, this_cpu, imbalance);
3867
3868}
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896static struct sched_group *
3897find_busiest_group(struct sched_domain *sd, int this_cpu,
3898 unsigned long *imbalance, enum cpu_idle_type idle,
3899 const struct cpumask *cpus, int *balance)
3900{
3901 struct sd_lb_stats sds;
3902
3903 memset(&sds, 0, sizeof(sds));
3904
3905
3906
3907
3908
3909 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
3910
3911
3912
3913
3914
3915 if (!(*balance))
3916 goto ret;
3917
3918 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
3919 check_asym_packing(sd, &sds, this_cpu, imbalance))
3920 return sds.busiest;
3921
3922
3923 if (!sds.busiest || sds.busiest_nr_running == 0)
3924 goto out_balanced;
3925
3926 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
3927
3928
3929
3930
3931
3932
3933 if (sds.group_imb)
3934 goto force_balance;
3935
3936
3937 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
3938 !sds.busiest_has_capacity)
3939 goto force_balance;
3940
3941
3942
3943
3944
3945 if (sds.this_load >= sds.max_load)
3946 goto out_balanced;
3947
3948
3949
3950
3951
3952 if (sds.this_load >= sds.avg_load)
3953 goto out_balanced;
3954
3955 if (idle == CPU_IDLE) {
3956
3957
3958
3959
3960
3961
3962 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
3963 sds.busiest_nr_running <= sds.busiest_group_weight)
3964 goto out_balanced;
3965 } else {
3966
3967
3968
3969
3970 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3971 goto out_balanced;
3972 }
3973
3974force_balance:
3975
3976 calculate_imbalance(&sds, this_cpu, imbalance);
3977 return sds.busiest;
3978
3979out_balanced:
3980
3981
3982
3983
3984 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
3985 return sds.busiest;
3986ret:
3987 *imbalance = 0;
3988 return NULL;
3989}
3990
3991
3992
3993
3994static struct rq *
3995find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3996 enum cpu_idle_type idle, unsigned long imbalance,
3997 const struct cpumask *cpus)
3998{
3999 struct rq *busiest = NULL, *rq;
4000 unsigned long max_load = 0;
4001 int i;
4002
4003 for_each_cpu(i, sched_group_cpus(group)) {
4004 unsigned long power = power_of(i);
4005 unsigned long capacity = DIV_ROUND_CLOSEST(power,
4006 SCHED_POWER_SCALE);
4007 unsigned long wl;
4008
4009 if (!capacity)
4010 capacity = fix_small_capacity(sd, group);
4011
4012 if (!cpumask_test_cpu(i, cpus))
4013 continue;
4014
4015 rq = cpu_rq(i);
4016 wl = weighted_cpuload(i);
4017
4018
4019
4020
4021
4022 if (capacity && rq->nr_running == 1 && wl > imbalance)
4023 continue;
4024
4025
4026
4027
4028
4029
4030
4031 wl = (wl * SCHED_POWER_SCALE) / power;
4032
4033 if (wl > max_load) {
4034 max_load = wl;
4035 busiest = rq;
4036 }
4037 }
4038
4039 return busiest;
4040}
4041
4042
4043
4044
4045
4046#define MAX_PINNED_INTERVAL 512
4047
4048
4049static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4050
4051static int need_active_balance(struct sched_domain *sd, int idle,
4052 int busiest_cpu, int this_cpu)
4053{
4054 if (idle == CPU_NEWLY_IDLE) {
4055
4056
4057
4058
4059
4060
4061 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
4062 return 1;
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4084 return 0;
4085 }
4086
4087 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
4088}
4089
4090static int active_load_balance_cpu_stop(void *data);
4091
4092
4093
4094
4095
4096static int load_balance(int this_cpu, struct rq *this_rq,
4097 struct sched_domain *sd, enum cpu_idle_type idle,
4098 int *balance)
4099{
4100 int ld_moved, all_pinned = 0, active_balance = 0;
4101 struct sched_group *group;
4102 unsigned long imbalance;
4103 struct rq *busiest;
4104 unsigned long flags;
4105 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4106
4107 cpumask_copy(cpus, cpu_active_mask);
4108
4109 schedstat_inc(sd, lb_count[idle]);
4110
4111redo:
4112 group = find_busiest_group(sd, this_cpu, &imbalance, idle,
4113 cpus, balance);
4114
4115 if (*balance == 0)
4116 goto out_balanced;
4117
4118 if (!group) {
4119 schedstat_inc(sd, lb_nobusyg[idle]);
4120 goto out_balanced;
4121 }
4122
4123 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
4124 if (!busiest) {
4125 schedstat_inc(sd, lb_nobusyq[idle]);
4126 goto out_balanced;
4127 }
4128
4129 BUG_ON(busiest == this_rq);
4130
4131 schedstat_add(sd, lb_imbalance[idle], imbalance);
4132
4133 ld_moved = 0;
4134 if (busiest->nr_running > 1) {
4135
4136
4137
4138
4139
4140
4141 all_pinned = 1;
4142 local_irq_save(flags);
4143 double_rq_lock(this_rq, busiest);
4144 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4145 imbalance, sd, idle, &all_pinned);
4146 double_rq_unlock(this_rq, busiest);
4147 local_irq_restore(flags);
4148
4149
4150
4151
4152 if (ld_moved && this_cpu != smp_processor_id())
4153 resched_cpu(this_cpu);
4154
4155
4156 if (unlikely(all_pinned)) {
4157 cpumask_clear_cpu(cpu_of(busiest), cpus);
4158 if (!cpumask_empty(cpus))
4159 goto redo;
4160 goto out_balanced;
4161 }
4162 }
4163
4164 if (!ld_moved) {
4165 schedstat_inc(sd, lb_failed[idle]);
4166
4167
4168
4169
4170
4171
4172 if (idle != CPU_NEWLY_IDLE)
4173 sd->nr_balance_failed++;
4174
4175 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
4176 raw_spin_lock_irqsave(&busiest->lock, flags);
4177
4178
4179
4180
4181
4182 if (!cpumask_test_cpu(this_cpu,
4183 tsk_cpus_allowed(busiest->curr))) {
4184 raw_spin_unlock_irqrestore(&busiest->lock,
4185 flags);
4186 all_pinned = 1;
4187 goto out_one_pinned;
4188 }
4189
4190
4191
4192
4193
4194
4195 if (!busiest->active_balance) {
4196 busiest->active_balance = 1;
4197 busiest->push_cpu = this_cpu;
4198 active_balance = 1;
4199 }
4200 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4201
4202 if (active_balance)
4203 stop_one_cpu_nowait(cpu_of(busiest),
4204 active_load_balance_cpu_stop, busiest,
4205 &busiest->active_balance_work);
4206
4207
4208
4209
4210
4211 sd->nr_balance_failed = sd->cache_nice_tries+1;
4212 }
4213 } else
4214 sd->nr_balance_failed = 0;
4215
4216 if (likely(!active_balance)) {
4217
4218 sd->balance_interval = sd->min_interval;
4219 } else {
4220
4221
4222
4223
4224
4225
4226 if (sd->balance_interval < sd->max_interval)
4227 sd->balance_interval *= 2;
4228 }
4229
4230 goto out;
4231
4232out_balanced:
4233 schedstat_inc(sd, lb_balanced[idle]);
4234
4235 sd->nr_balance_failed = 0;
4236
4237out_one_pinned:
4238
4239 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4240 (sd->balance_interval < sd->max_interval))
4241 sd->balance_interval *= 2;
4242
4243 ld_moved = 0;
4244out:
4245 return ld_moved;
4246}
4247
4248
4249
4250
4251
4252static void idle_balance(int this_cpu, struct rq *this_rq)
4253{
4254 struct sched_domain *sd;
4255 int pulled_task = 0;
4256 unsigned long next_balance = jiffies + HZ;
4257
4258 this_rq->idle_stamp = this_rq->clock;
4259
4260 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4261 return;
4262
4263
4264
4265
4266 raw_spin_unlock(&this_rq->lock);
4267
4268 update_shares(this_cpu);
4269 rcu_read_lock();
4270 for_each_domain(this_cpu, sd) {
4271 unsigned long interval;
4272 int balance = 1;
4273
4274 if (!(sd->flags & SD_LOAD_BALANCE))
4275 continue;
4276
4277 if (sd->flags & SD_BALANCE_NEWIDLE) {
4278
4279 pulled_task = load_balance(this_cpu, this_rq,
4280 sd, CPU_NEWLY_IDLE, &balance);
4281 }
4282
4283 interval = msecs_to_jiffies(sd->balance_interval);
4284 if (time_after(next_balance, sd->last_balance + interval))
4285 next_balance = sd->last_balance + interval;
4286 if (pulled_task) {
4287 this_rq->idle_stamp = 0;
4288 break;
4289 }
4290 }
4291 rcu_read_unlock();
4292
4293 raw_spin_lock(&this_rq->lock);
4294
4295 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4296
4297
4298
4299
4300 this_rq->next_balance = next_balance;
4301 }
4302}
4303
4304
4305
4306
4307
4308
4309
4310static int active_load_balance_cpu_stop(void *data)
4311{
4312 struct rq *busiest_rq = data;
4313 int busiest_cpu = cpu_of(busiest_rq);
4314 int target_cpu = busiest_rq->push_cpu;
4315 struct rq *target_rq = cpu_rq(target_cpu);
4316 struct sched_domain *sd;
4317
4318 raw_spin_lock_irq(&busiest_rq->lock);
4319
4320
4321 if (unlikely(busiest_cpu != smp_processor_id() ||
4322 !busiest_rq->active_balance))
4323 goto out_unlock;
4324
4325
4326 if (busiest_rq->nr_running <= 1)
4327 goto out_unlock;
4328
4329
4330
4331
4332
4333
4334 BUG_ON(busiest_rq == target_rq);
4335
4336
4337 double_lock_balance(busiest_rq, target_rq);
4338
4339
4340 rcu_read_lock();
4341 for_each_domain(target_cpu, sd) {
4342 if ((sd->flags & SD_LOAD_BALANCE) &&
4343 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4344 break;
4345 }
4346
4347 if (likely(sd)) {
4348 schedstat_inc(sd, alb_count);
4349
4350 if (move_one_task(target_rq, target_cpu, busiest_rq,
4351 sd, CPU_IDLE))
4352 schedstat_inc(sd, alb_pushed);
4353 else
4354 schedstat_inc(sd, alb_failed);
4355 }
4356 rcu_read_unlock();
4357 double_unlock_balance(busiest_rq, target_rq);
4358out_unlock:
4359 busiest_rq->active_balance = 0;
4360 raw_spin_unlock_irq(&busiest_rq->lock);
4361 return 0;
4362}
4363
4364#ifdef CONFIG_NO_HZ
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375static struct {
4376 atomic_t load_balancer;
4377 atomic_t first_pick_cpu;
4378 atomic_t second_pick_cpu;
4379 cpumask_var_t idle_cpus_mask;
4380 cpumask_var_t grp_idle_mask;
4381 unsigned long next_balance;
4382} nohz ____cacheline_aligned;
4383
4384int get_nohz_load_balancer(void)
4385{
4386 return atomic_read(&nohz.load_balancer);
4387}
4388
4389#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4400{
4401 struct sched_domain *sd;
4402
4403 for_each_domain(cpu, sd)
4404 if (sd->flags & flag)
4405 break;
4406
4407 return sd;
4408}
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420#define for_each_flag_domain(cpu, sd, flag) \
4421 for (sd = lowest_flag_domain(cpu, flag); \
4422 (sd && (sd->flags & flag)); sd = sd->parent)
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434static inline int is_semi_idle_group(struct sched_group *ilb_group)
4435{
4436 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
4437 sched_group_cpus(ilb_group));
4438
4439
4440
4441
4442
4443 if (cpumask_empty(nohz.grp_idle_mask))
4444 return 0;
4445
4446 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
4447 return 0;
4448
4449 return 1;
4450}
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463static int find_new_ilb(int cpu)
4464{
4465 struct sched_domain *sd;
4466 struct sched_group *ilb_group;
4467 int ilb = nr_cpu_ids;
4468
4469
4470
4471
4472
4473 if (!(sched_smt_power_savings || sched_mc_power_savings))
4474 goto out_done;
4475
4476
4477
4478
4479
4480 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
4481 goto out_done;
4482
4483 rcu_read_lock();
4484 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4485 ilb_group = sd->groups;
4486
4487 do {
4488 if (is_semi_idle_group(ilb_group)) {
4489 ilb = cpumask_first(nohz.grp_idle_mask);
4490 goto unlock;
4491 }
4492
4493 ilb_group = ilb_group->next;
4494
4495 } while (ilb_group != sd->groups);
4496 }
4497unlock:
4498 rcu_read_unlock();
4499
4500out_done:
4501 return ilb;
4502}
4503#else
4504static inline int find_new_ilb(int call_cpu)
4505{
4506 return nr_cpu_ids;
4507}
4508#endif
4509
4510
4511
4512
4513
4514
4515static void nohz_balancer_kick(int cpu)
4516{
4517 int ilb_cpu;
4518
4519 nohz.next_balance++;
4520
4521 ilb_cpu = get_nohz_load_balancer();
4522
4523 if (ilb_cpu >= nr_cpu_ids) {
4524 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
4525 if (ilb_cpu >= nr_cpu_ids)
4526 return;
4527 }
4528
4529 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
4530 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
4531
4532 smp_mb();
4533
4534
4535
4536
4537
4538
4539 smp_send_reschedule(ilb_cpu);
4540 }
4541 return;
4542}
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557void select_nohz_load_balancer(int stop_tick)
4558{
4559 int cpu = smp_processor_id();
4560
4561 if (stop_tick) {
4562 if (!cpu_active(cpu)) {
4563 if (atomic_read(&nohz.load_balancer) != cpu)
4564 return;
4565
4566
4567
4568
4569
4570 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
4571 nr_cpu_ids) != cpu)
4572 BUG();
4573
4574 return;
4575 }
4576
4577 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
4578
4579 if (atomic_read(&nohz.first_pick_cpu) == cpu)
4580 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
4581 if (atomic_read(&nohz.second_pick_cpu) == cpu)
4582 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
4583
4584 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
4585 int new_ilb;
4586
4587
4588 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
4589 cpu) != nr_cpu_ids)
4590 return;
4591
4592
4593
4594
4595
4596 new_ilb = find_new_ilb(cpu);
4597 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4598 atomic_set(&nohz.load_balancer, nr_cpu_ids);
4599 resched_cpu(new_ilb);
4600 return;
4601 }
4602 return;
4603 }
4604 } else {
4605 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
4606 return;
4607
4608 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
4609
4610 if (atomic_read(&nohz.load_balancer) == cpu)
4611 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
4612 nr_cpu_ids) != cpu)
4613 BUG();
4614 }
4615 return;
4616}
4617#endif
4618
4619static DEFINE_SPINLOCK(balancing);
4620
4621static unsigned long __read_mostly max_load_balance_interval = HZ/10;
4622
4623
4624
4625
4626
4627static void update_max_interval(void)
4628{
4629 max_load_balance_interval = HZ*num_online_cpus()/10;
4630}
4631
4632
4633
4634
4635
4636
4637
4638static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4639{
4640 int balance = 1;
4641 struct rq *rq = cpu_rq(cpu);
4642 unsigned long interval;
4643 struct sched_domain *sd;
4644
4645 unsigned long next_balance = jiffies + 60*HZ;
4646 int update_next_balance = 0;
4647 int need_serialize;
4648
4649 update_shares(cpu);
4650
4651 rcu_read_lock();
4652 for_each_domain(cpu, sd) {
4653 if (!(sd->flags & SD_LOAD_BALANCE))
4654 continue;
4655
4656 interval = sd->balance_interval;
4657 if (idle != CPU_IDLE)
4658 interval *= sd->busy_factor;
4659
4660
4661 interval = msecs_to_jiffies(interval);
4662 interval = clamp(interval, 1UL, max_load_balance_interval);
4663
4664 need_serialize = sd->flags & SD_SERIALIZE;
4665
4666 if (need_serialize) {
4667 if (!spin_trylock(&balancing))
4668 goto out;
4669 }
4670
4671 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4672 if (load_balance(cpu, rq, sd, idle, &balance)) {
4673
4674
4675
4676
4677 idle = CPU_NOT_IDLE;
4678 }
4679 sd->last_balance = jiffies;
4680 }
4681 if (need_serialize)
4682 spin_unlock(&balancing);
4683out:
4684 if (time_after(next_balance, sd->last_balance + interval)) {
4685 next_balance = sd->last_balance + interval;
4686 update_next_balance = 1;
4687 }
4688
4689
4690
4691
4692
4693
4694 if (!balance)
4695 break;
4696 }
4697 rcu_read_unlock();
4698
4699
4700
4701
4702
4703
4704 if (likely(update_next_balance))
4705 rq->next_balance = next_balance;
4706}
4707
4708#ifdef CONFIG_NO_HZ
4709
4710
4711
4712
4713static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4714{
4715 struct rq *this_rq = cpu_rq(this_cpu);
4716 struct rq *rq;
4717 int balance_cpu;
4718
4719 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
4720 return;
4721
4722 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
4723 if (balance_cpu == this_cpu)
4724 continue;
4725
4726
4727
4728
4729
4730
4731 if (need_resched()) {
4732 this_rq->nohz_balance_kick = 0;
4733 break;
4734 }
4735
4736 raw_spin_lock_irq(&this_rq->lock);
4737 update_rq_clock(this_rq);
4738 update_cpu_load(this_rq);
4739 raw_spin_unlock_irq(&this_rq->lock);
4740
4741 rebalance_domains(balance_cpu, CPU_IDLE);
4742
4743 rq = cpu_rq(balance_cpu);
4744 if (time_after(this_rq->next_balance, rq->next_balance))
4745 this_rq->next_balance = rq->next_balance;
4746 }
4747 nohz.next_balance = this_rq->next_balance;
4748 this_rq->nohz_balance_kick = 0;
4749}
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763static inline int nohz_kick_needed(struct rq *rq, int cpu)
4764{
4765 unsigned long now = jiffies;
4766 int ret;
4767 int first_pick_cpu, second_pick_cpu;
4768
4769 if (time_before(now, nohz.next_balance))
4770 return 0;
4771
4772 if (idle_cpu(cpu))
4773 return 0;
4774
4775 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
4776 second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
4777
4778 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
4779 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
4780 return 0;
4781
4782 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
4783 if (ret == nr_cpu_ids || ret == cpu) {
4784 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
4785 if (rq->nr_running > 1)
4786 return 1;
4787 } else {
4788 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
4789 if (ret == nr_cpu_ids || ret == cpu) {
4790 if (rq->nr_running)
4791 return 1;
4792 }
4793 }
4794 return 0;
4795}
4796#else
4797static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
4798#endif
4799
4800
4801
4802
4803
4804static void run_rebalance_domains(struct softirq_action *h)
4805{
4806 int this_cpu = smp_processor_id();
4807 struct rq *this_rq = cpu_rq(this_cpu);
4808 enum cpu_idle_type idle = this_rq->idle_balance ?
4809 CPU_IDLE : CPU_NOT_IDLE;
4810
4811 rebalance_domains(this_cpu, idle);
4812
4813
4814
4815
4816
4817
4818 nohz_idle_balance(this_cpu, idle);
4819}
4820
4821static inline int on_null_domain(int cpu)
4822{
4823 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
4824}
4825
4826
4827
4828
4829static inline void trigger_load_balance(struct rq *rq, int cpu)
4830{
4831
4832 if (time_after_eq(jiffies, rq->next_balance) &&
4833 likely(!on_null_domain(cpu)))
4834 raise_softirq(SCHED_SOFTIRQ);
4835#ifdef CONFIG_NO_HZ
4836 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
4837 nohz_balancer_kick(cpu);
4838#endif
4839}
4840
4841static void rq_online_fair(struct rq *rq)
4842{
4843 update_sysctl();
4844}
4845
4846static void rq_offline_fair(struct rq *rq)
4847{
4848 update_sysctl();
4849}
4850
4851#else
4852
4853
4854
4855
4856static inline void idle_balance(int cpu, struct rq *rq)
4857{
4858}
4859
4860#endif
4861
4862
4863
4864
4865static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4866{
4867 struct cfs_rq *cfs_rq;
4868 struct sched_entity *se = &curr->se;
4869
4870 for_each_sched_entity(se) {
4871 cfs_rq = cfs_rq_of(se);
4872 entity_tick(cfs_rq, se, queued);
4873 }
4874}
4875
4876
4877
4878
4879
4880
4881static void task_fork_fair(struct task_struct *p)
4882{
4883 struct cfs_rq *cfs_rq = task_cfs_rq(current);
4884 struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
4885 int this_cpu = smp_processor_id();
4886 struct rq *rq = this_rq();
4887 unsigned long flags;
4888
4889 raw_spin_lock_irqsave(&rq->lock, flags);
4890
4891 update_rq_clock(rq);
4892
4893 if (unlikely(task_cpu(p) != this_cpu)) {
4894 rcu_read_lock();
4895 __set_task_cpu(p, this_cpu);
4896 rcu_read_unlock();
4897 }
4898
4899 update_curr(cfs_rq);
4900
4901 if (curr)
4902 se->vruntime = curr->vruntime;
4903 place_entity(cfs_rq, se, 1);
4904
4905 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
4906
4907
4908
4909
4910 swap(curr->vruntime, se->vruntime);
4911 resched_task(rq->curr);
4912 }
4913
4914 se->vruntime -= cfs_rq->min_vruntime;
4915
4916 raw_spin_unlock_irqrestore(&rq->lock, flags);
4917}
4918
4919
4920
4921
4922
4923static void
4924prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
4925{
4926 if (!p->se.on_rq)
4927 return;
4928
4929
4930
4931
4932
4933
4934 if (rq->curr == p) {
4935 if (p->prio > oldprio)
4936 resched_task(rq->curr);
4937 } else
4938 check_preempt_curr(rq, p, 0);
4939}
4940
4941static void switched_from_fair(struct rq *rq, struct task_struct *p)
4942{
4943 struct sched_entity *se = &p->se;
4944 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955 if (!se->on_rq && p->state != TASK_RUNNING) {
4956
4957
4958
4959
4960 place_entity(cfs_rq, se, 0);
4961 se->vruntime -= cfs_rq->min_vruntime;
4962 }
4963}
4964
4965
4966
4967
4968static void switched_to_fair(struct rq *rq, struct task_struct *p)
4969{
4970 if (!p->se.on_rq)
4971 return;
4972
4973
4974
4975
4976
4977
4978 if (rq->curr == p)
4979 resched_task(rq->curr);
4980 else
4981 check_preempt_curr(rq, p, 0);
4982}
4983
4984
4985
4986
4987
4988
4989static void set_curr_task_fair(struct rq *rq)
4990{
4991 struct sched_entity *se = &rq->curr->se;
4992
4993 for_each_sched_entity(se) {
4994 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4995
4996 set_next_entity(cfs_rq, se);
4997
4998 account_cfs_rq_runtime(cfs_rq, 0);
4999 }
5000}
5001
5002#ifdef CONFIG_FAIR_GROUP_SCHED
5003static void task_move_group_fair(struct task_struct *p, int on_rq)
5004{
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018 if (!on_rq)
5019 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5020 set_task_rq(p, task_cpu(p));
5021 if (!on_rq)
5022 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
5023}
5024#endif
5025
5026static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
5027{
5028 struct sched_entity *se = &task->se;
5029 unsigned int rr_interval = 0;
5030
5031
5032
5033
5034
5035 if (rq->cfs.load.weight)
5036 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
5037
5038 return rr_interval;
5039}
5040
5041
5042
5043
5044static const struct sched_class fair_sched_class = {
5045 .next = &idle_sched_class,
5046 .enqueue_task = enqueue_task_fair,
5047 .dequeue_task = dequeue_task_fair,
5048 .yield_task = yield_task_fair,
5049 .yield_to_task = yield_to_task_fair,
5050
5051 .check_preempt_curr = check_preempt_wakeup,
5052
5053 .pick_next_task = pick_next_task_fair,
5054 .put_prev_task = put_prev_task_fair,
5055
5056#ifdef CONFIG_SMP
5057 .select_task_rq = select_task_rq_fair,
5058
5059 .rq_online = rq_online_fair,
5060 .rq_offline = rq_offline_fair,
5061
5062 .task_waking = task_waking_fair,
5063#endif
5064
5065 .set_curr_task = set_curr_task_fair,
5066 .task_tick = task_tick_fair,
5067 .task_fork = task_fork_fair,
5068
5069 .prio_changed = prio_changed_fair,
5070 .switched_from = switched_from_fair,
5071 .switched_to = switched_to_fair,
5072
5073 .get_rr_interval = get_rr_interval_fair,
5074
5075#ifdef CONFIG_FAIR_GROUP_SCHED
5076 .task_move_group = task_move_group_fair,
5077#endif
5078};
5079
5080#ifdef CONFIG_SCHED_DEBUG
5081static void print_cfs_stats(struct seq_file *m, int cpu)
5082{
5083 struct cfs_rq *cfs_rq;
5084
5085 rcu_read_lock();
5086 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
5087 print_cfs_rq(m, cpu, cfs_rq);
5088 rcu_read_unlock();
5089}
5090#endif
5091