/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 * Runnable entities are kept in a vruntime-ordered red-black tree; the
 * entity with the smallest vruntime is picked to run next.
 */
#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>

/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 */
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
 *
 * Options are:
 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling
	= SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

/*
 * is kept equal to sysctl_sched_latency / sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

/*
 * The exponential sliding window over which load is averaged for shares
 * distribution.
 * (default: 10msec)
 */
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

static const struct sched_class fair_sched_class;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}

/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!se->my_q)

static inline struct task_struct *task_of(struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!entity_is_task(se));
#endif
	return container_of(se, struct task_struct, se);
}

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)
120
121static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
122{
123 return p->se.cfs_rq;
124}
125
126
127static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
128{
129 return se->cfs_rq;
130}
131
132
133static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
134{
135 return grp->my_q;
136}
137
138
139
140
141static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
142{
143 return cfs_rq->tg->cfs_rq[this_cpu];
144}
145
146static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
147{
148 if (!cfs_rq->on_list) {
149
150
151
152
153
154
155 if (cfs_rq->tg->parent &&
156 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
157 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
158 &rq_of(cfs_rq)->leaf_cfs_rq_list);
159 } else {
160 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
161 &rq_of(cfs_rq)->leaf_cfs_rq_list);
162 }
163
164 cfs_rq->on_list = 1;
165 }
166}
167
168static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
169{
170 if (cfs_rq->on_list) {
171 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
172 cfs_rq->on_list = 0;
173 }
174}
175
176
177#define for_each_leaf_cfs_rq(rq, cfs_rq) \
178 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
179
180
181static inline int
182is_same_group(struct sched_entity *se, struct sched_entity *pse)
183{
184 if (se->cfs_rq == pse->cfs_rq)
185 return 1;
186
187 return 0;
188}
189
190static inline struct sched_entity *parent_entity(struct sched_entity *se)
191{
192 return se->parent;
193}
194
195
196static inline int depth_se(struct sched_entity *se)
197{
198 int depth = 0;
199
200 for_each_sched_entity(se)
201 depth++;
202
203 return depth;
204}
205
206static void
207find_matching_se(struct sched_entity **se, struct sched_entity **pse)
208{
209 int se_depth, pse_depth;
210
211
212
213
214
215
216
217
218
219 se_depth = depth_se(*se);
220 pse_depth = depth_se(*pse);
221
222 while (se_depth > pse_depth) {
223 se_depth--;
224 *se = parent_entity(*se);
225 }
226
227 while (pse_depth > se_depth) {
228 pse_depth--;
229 *pse = parent_entity(*pse);
230 }
231
232 while (!is_same_group(*se, *pse)) {
233 *se = parent_entity(*se);
234 *pse = parent_entity(*pse);
235 }
236}
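
/*
 * depth_se() and find_matching_se() above support hierarchical wakeup
 * preemption: the two entities are first walked up to the same nesting
 * depth and then up to a common cfs_rq, so that their vruntimes can be
 * compared meaningfully by wakeup_preempt_entity().
 */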
237
238#else
239
240static inline struct task_struct *task_of(struct sched_entity *se)
241{
242 return container_of(se, struct task_struct, se);
243}
244
245static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
246{
247 return container_of(cfs_rq, struct rq, cfs);
248}
249
250#define entity_is_task(se) 1
251
252#define for_each_sched_entity(se) \
253 for (; se; se = NULL)
254
255static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
256{
257 return &task_rq(p)->cfs;
258}
259
260static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
261{
262 struct task_struct *p = task_of(se);
263 struct rq *rq = task_rq(p);
264
265 return &rq->cfs;
266}
267
268
269static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
270{
271 return NULL;
272}
273
274static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
275{
276 return &cpu_rq(this_cpu)->cfs;
277}
278
279static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
280{
281}
282
283static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
284{
285}
286
287#define for_each_leaf_cfs_rq(rq, cfs_rq) \
288 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
289
290static inline int
291is_same_group(struct sched_entity *se, struct sched_entity *pse)
292{
293 return 1;
294}
295
296static inline struct sched_entity *parent_entity(struct sched_entity *se)
297{
298 return NULL;
299}
300
301static inline void
302find_matching_se(struct sched_entity **se, struct sched_entity **pse)
303{
304}
305
306#endif

/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta > 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline int entity_before(struct sched_entity *a,
				struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}

static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return se->vruntime - cfs_rq->min_vruntime;
}

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	u64 vruntime = cfs_rq->min_vruntime;

	if (cfs_rq->curr)
		vruntime = cfs_rq->curr->vruntime;

	if (cfs_rq->rb_leftmost) {
		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
						   struct sched_entity,
						   run_node);

		if (!cfs_rq->curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
}
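
/*
 * Note: cfs_rq->min_vruntime only ever moves forward; max_vruntime()
 * guards against it going backwards even across u64 wrap.  It serves as
 * the per-cfs_rq zero point used by entity_key() and place_entity().
 * Purely illustrative example: with curr->vruntime == 105 and the
 * leftmost queued entity at 102, min_vruntime is advanced to
 * max(old min_vruntime, 102).
 */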

/*
 * Enqueue an entity into the rb-tree:
 */
366static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
367{
368 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
369 struct rb_node *parent = NULL;
370 struct sched_entity *entry;
371 s64 key = entity_key(cfs_rq, se);
372 int leftmost = 1;
373
374
375
376
377 while (*link) {
378 parent = *link;
379 entry = rb_entry(parent, struct sched_entity, run_node);
380
381
382
383
384 if (key < entity_key(cfs_rq, entry)) {
385 link = &parent->rb_left;
386 } else {
387 link = &parent->rb_right;
388 leftmost = 0;
389 }
390 }
391
392
393
394
395
396 if (leftmost)
397 cfs_rq->rb_leftmost = &se->run_node;
398
399 rb_link_node(&se->run_node, parent, link);
400 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
401}
402
403static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
404{
405 if (cfs_rq->rb_leftmost == &se->run_node) {
406 struct rb_node *next_node;
407
408 next_node = rb_next(&se->run_node);
409 cfs_rq->rb_leftmost = next_node;
410 }
411
412 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
413}
414
415static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
416{
417 struct rb_node *left = cfs_rq->rb_leftmost;
418
419 if (!left)
420 return NULL;
421
422 return rb_entry(left, struct sched_entity, run_node);
423}
424
425static struct sched_entity *__pick_next_entity(struct sched_entity *se)
426{
427 struct rb_node *next = rb_next(&se->run_node);
428
429 if (!next)
430 return NULL;
431
432 return rb_entry(next, struct sched_entity, run_node);
433}
434
435#ifdef CONFIG_SCHED_DEBUG
436static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
437{
438 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
439
440 if (!last)
441 return NULL;
442
443 return rb_entry(last, struct sched_entity, run_node);
444}

/**************************************************************
 * Scheduling class statistics methods:
 */

450int sched_proc_update_handler(struct ctl_table *table, int write,
451 void __user *buffer, size_t *lenp,
452 loff_t *ppos)
453{
454 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
455 int factor = get_update_sysctl_factor();
456
457 if (ret || !write)
458 return ret;
459
460 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
461 sysctl_sched_min_granularity);
462
463#define WRT_SYSCTL(name) \
464 (normalized_sysctl_##name = sysctl_##name / (factor))
465 WRT_SYSCTL(sched_min_granularity);
466 WRT_SYSCTL(sched_latency);
467 WRT_SYSCTL(sched_wakeup_granularity);
468#undef WRT_SYSCTL
469
470 return 0;
471}
472#endif

/*
 * delta /= w
 */
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);

	return delta;
}

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (> sched_nr_latency) we have to
 * stretch this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sched_nr_latency;

	if (unlikely(nr_running > nr_latency)) {
		period = sysctl_sched_min_granularity;
		period *= nr_running;
	}

	return period;
}

/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = calc_delta_mine(slice, se->load.weight, load);
	}
	return slice;
}

/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s/w
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
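
/*
 * Worked example (illustrative numbers only, ignoring the ilog(ncpus)
 * scaling of the sysctls): with sysctl_sched_latency = 6ms,
 * sysctl_sched_min_granularity = 0.75ms and sched_nr_latency = 8,
 * three runnable nice-0 tasks share a 6ms period and each gets a 2ms
 * wall-time slice; with 12 runnable tasks (> sched_nr_latency) the
 * period stretches to 12 * 0.75ms = 9ms, keeping each slice at the
 * minimum granularity.
 */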
544
545static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
546static void update_cfs_shares(struct cfs_rq *cfs_rq);

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
552static inline void
553__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
554 unsigned long delta_exec)
555{
556 unsigned long delta_exec_weighted;
557
558 schedstat_set(curr->statistics.exec_max,
559 max((u64)delta_exec, curr->statistics.exec_max));
560
561 curr->sum_exec_runtime += delta_exec;
562 schedstat_add(cfs_rq, exec_clock, delta_exec);
563 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
564
565 curr->vruntime += delta_exec_weighted;
566 update_min_vruntime(cfs_rq);
567
568#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
569 cfs_rq->load_unacc_exec_time += delta_exec;
570#endif
571}
572
573static void update_curr(struct cfs_rq *cfs_rq)
574{
575 struct sched_entity *curr = cfs_rq->curr;
576 u64 now = rq_of(cfs_rq)->clock_task;
577 unsigned long delta_exec;
578
579 if (unlikely(!curr))
580 return;
581
582
583
584
585
586
587 delta_exec = (unsigned long)(now - curr->exec_start);
588 if (!delta_exec)
589 return;
590
591 __update_curr(cfs_rq, curr, delta_exec);
592 curr->exec_start = now;
593
594 if (entity_is_task(curr)) {
595 struct task_struct *curtask = task_of(curr);
596
597 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
598 cpuacct_charge(curtask, delta_exec);
599 account_group_exec_runtime(curtask, delta_exec);
600 }
601}
602
603static inline void
604update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
605{
606 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
607}
608
609
610
611
612static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
613{
614
615
616
617
618 if (se != cfs_rq->curr)
619 update_stats_wait_start(cfs_rq, se);
620}
621
622static void
623update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
624{
625 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
626 rq_of(cfs_rq)->clock - se->statistics.wait_start));
627 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
628 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
629 rq_of(cfs_rq)->clock - se->statistics.wait_start);
630#ifdef CONFIG_SCHEDSTATS
631 if (entity_is_task(se)) {
632 trace_sched_stat_wait(task_of(se),
633 rq_of(cfs_rq)->clock - se->statistics.wait_start);
634 }
635#endif
636 schedstat_set(se->statistics.wait_start, 0);
637}
638
639static inline void
640update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
641{
642
643
644
645
646 if (se != cfs_rq->curr)
647 update_stats_wait_end(cfs_rq, se);
648}
649
650
651
652
653static inline void
654update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
655{
656
657
658
659 se->exec_start = rq_of(cfs_rq)->clock_task;
660}
661
662
663
664
665
666#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
667static void
668add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
669{
670 cfs_rq->task_weight += weight;
671}
672#else
673static inline void
674add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
675{
676}
677#endif
678
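/*
 * account_entity_enqueue()/account_entity_dequeue() keep the cfs_rq's
 * aggregate load weight, nr_running count and (for tasks) the
 * cfs_rq->tasks list in sync with the set of queued entities; top-level
 * entities are also added to / removed from the cpu load.
 */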
679static void
680account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
681{
682 update_load_add(&cfs_rq->load, se->load.weight);
683 if (!parent_entity(se))
684 inc_cpu_load(rq_of(cfs_rq), se->load.weight);
685 if (entity_is_task(se)) {
686 add_cfs_task_weight(cfs_rq, se->load.weight);
687 list_add(&se->group_node, &cfs_rq->tasks);
688 }
689 cfs_rq->nr_running++;
690}
691
692static void
693account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
694{
695 update_load_sub(&cfs_rq->load, se->load.weight);
696 if (!parent_entity(se))
697 dec_cpu_load(rq_of(cfs_rq), se->load.weight);
698 if (entity_is_task(se)) {
699 add_cfs_task_weight(cfs_rq, -se->load.weight);
700 list_del_init(&se->group_node);
701 }
702 cfs_rq->nr_running--;
703}
704
705#ifdef CONFIG_FAIR_GROUP_SCHED
706# ifdef CONFIG_SMP
707static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
708 int global_update)
709{
710 struct task_group *tg = cfs_rq->tg;
711 long load_avg;
712
713 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
714 load_avg -= cfs_rq->load_contribution;
715
716 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
717 atomic_add(load_avg, &tg->load_weight);
718 cfs_rq->load_contribution += load_avg;
719 }
720}
721
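/*
 * update_cfs_load() maintains a time-weighted average of this cfs_rq's
 * load over sysctl_sched_shares_window sized periods (halving the
 * accumulated average for each full window that elapses) and folds
 * significant changes into the task group's global load_weight via
 * update_cfs_rq_load_contribution().
 */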
722static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
723{
724 u64 period = sysctl_sched_shares_window;
725 u64 now, delta;
726 unsigned long load = cfs_rq->load.weight;
727
728 if (cfs_rq->tg == &root_task_group)
729 return;
730
731 now = rq_of(cfs_rq)->clock_task;
732 delta = now - cfs_rq->load_stamp;
733
734
735 if (cfs_rq->load_stamp > cfs_rq->load_last &&
736 now - cfs_rq->load_last > 4 * period) {
737 cfs_rq->load_period = 0;
738 cfs_rq->load_avg = 0;
739 delta = period - 1;
740 }
741
742 cfs_rq->load_stamp = now;
743 cfs_rq->load_unacc_exec_time = 0;
744 cfs_rq->load_period += delta;
745 if (load) {
746 cfs_rq->load_last = now;
747 cfs_rq->load_avg += delta * load;
748 }
749
750
751 if (global_update || cfs_rq->load_period > period
752 || !cfs_rq->load_period)
753 update_cfs_rq_load_contribution(cfs_rq, global_update);
754
755 while (cfs_rq->load_period > period) {
756
757
758
759
760
761 asm("" : "+rm" (cfs_rq->load_period));
762 cfs_rq->load_period /= 2;
763 cfs_rq->load_avg /= 2;
764 }
765
766 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
767 list_del_leaf_cfs_rq(cfs_rq);
768}
769
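/*
 * calc_cfs_shares() computes this cpu's portion of the group's shares:
 * roughly tg->shares * local_load / total_group_load, clamped to the
 * range [MIN_SHARES, tg->shares].
 */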
770static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
771{
772 long load_weight, load, shares;
773
774 load = cfs_rq->load.weight;
775
776 load_weight = atomic_read(&tg->load_weight);
777 load_weight += load;
778 load_weight -= cfs_rq->load_contribution;
779
780 shares = (tg->shares * load);
781 if (load_weight)
782 shares /= load_weight;
783
784 if (shares < MIN_SHARES)
785 shares = MIN_SHARES;
786 if (shares > tg->shares)
787 shares = tg->shares;
788
789 return shares;
790}
791
792static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
793{
794 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
795 update_cfs_load(cfs_rq, 0);
796 update_cfs_shares(cfs_rq);
797 }
798}
799# else
800static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
801{
802}
803
804static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
805{
806 return tg->shares;
807}
808
809static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
810{
811}
812# endif
813static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
814 unsigned long weight)
815{
816 if (se->on_rq) {
817
818 if (cfs_rq->curr == se)
819 update_curr(cfs_rq);
820 account_entity_dequeue(cfs_rq, se);
821 }
822
823 update_load_set(&se->load, weight);
824
825 if (se->on_rq)
826 account_entity_enqueue(cfs_rq, se);
827}
828
829static void update_cfs_shares(struct cfs_rq *cfs_rq)
830{
831 struct task_group *tg;
832 struct sched_entity *se;
833 long shares;
834
835 tg = cfs_rq->tg;
836 se = tg->se[cpu_of(rq_of(cfs_rq))];
837 if (!se)
838 return;
839#ifndef CONFIG_SMP
840 if (likely(se->load.weight == tg->shares))
841 return;
842#endif
843 shares = calc_cfs_shares(cfs_rq, tg);
844
845 reweight_entity(cfs_rq_of(se), se, shares);
846}
847#else
848static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
849{
850}
851
852static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
853{
854}
855
856static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
857{
858}
859#endif
860
861static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
862{
863#ifdef CONFIG_SCHEDSTATS
864 struct task_struct *tsk = NULL;
865
866 if (entity_is_task(se))
867 tsk = task_of(se);
868
869 if (se->statistics.sleep_start) {
870 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
871
872 if ((s64)delta < 0)
873 delta = 0;
874
875 if (unlikely(delta > se->statistics.sleep_max))
876 se->statistics.sleep_max = delta;
877
878 se->statistics.sleep_start = 0;
879 se->statistics.sum_sleep_runtime += delta;
880
881 if (tsk) {
882 account_scheduler_latency(tsk, delta >> 10, 1);
883 trace_sched_stat_sleep(tsk, delta);
884 }
885 }
886 if (se->statistics.block_start) {
887 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
888
889 if ((s64)delta < 0)
890 delta = 0;
891
892 if (unlikely(delta > se->statistics.block_max))
893 se->statistics.block_max = delta;
894
895 se->statistics.block_start = 0;
896 se->statistics.sum_sleep_runtime += delta;
897
898 if (tsk) {
899 if (tsk->in_iowait) {
900 se->statistics.iowait_sum += delta;
901 se->statistics.iowait_count++;
902 trace_sched_stat_iowait(tsk, delta);
903 }
904
905
906
907
908
909
910 if (unlikely(prof_on == SLEEP_PROFILING)) {
911 profile_hits(SLEEP_PROFILING,
912 (void *)get_wchan(tsk),
913 delta >> 20);
914 }
915 account_scheduler_latency(tsk, delta >> 10, 0);
916 }
917 }
918#endif
919}
920
921static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
922{
923#ifdef CONFIG_SCHED_DEBUG
924 s64 d = se->vruntime - cfs_rq->min_vruntime;
925
926 if (d < 0)
927 d = -d;
928
929 if (d > 3*sysctl_sched_latency)
930 schedstat_inc(cfs_rq, nr_spread_over);
931#endif
932}
933
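/*
 * place_entity() picks a starting vruntime for an entity being placed on
 * this cfs_rq: newly forked entities (with START_DEBIT) are debited one
 * vslice beyond min_vruntime, while waking sleepers get up to half (with
 * GENTLE_FAIR_SLEEPERS) or one full sched_latency of credit -- but an
 * entity's vruntime is never moved backwards relative to its own
 * previous value.
 */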
934static void
935place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
936{
937 u64 vruntime = cfs_rq->min_vruntime;
938
939
940
941
942
943
944
945 if (initial && sched_feat(START_DEBIT))
946 vruntime += sched_vslice(cfs_rq, se);
947
948
949 if (!initial) {
950 unsigned long thresh = sysctl_sched_latency;
951
952
953
954
955
956 if (sched_feat(GENTLE_FAIR_SLEEPERS))
957 thresh >>= 1;
958
959 vruntime -= thresh;
960 }
961
962
963 vruntime = max_vruntime(se->vruntime, vruntime);
964
965 se->vruntime = vruntime;
966}
967
968static void
969enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
970{
971
972
973
974
975 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
976 se->vruntime += cfs_rq->min_vruntime;
977
978
979
980
981 update_curr(cfs_rq);
982 update_cfs_load(cfs_rq, 0);
983 account_entity_enqueue(cfs_rq, se);
984 update_cfs_shares(cfs_rq);
985
986 if (flags & ENQUEUE_WAKEUP) {
987 place_entity(cfs_rq, se, 0);
988 enqueue_sleeper(cfs_rq, se);
989 }
990
991 update_stats_enqueue(cfs_rq, se);
992 check_spread(cfs_rq, se);
993 if (se != cfs_rq->curr)
994 __enqueue_entity(cfs_rq, se);
995 se->on_rq = 1;
996
997 if (cfs_rq->nr_running == 1)
998 list_add_leaf_cfs_rq(cfs_rq);
999}
1000
1001static void __clear_buddies_last(struct sched_entity *se)
1002{
1003 for_each_sched_entity(se) {
1004 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1005 if (cfs_rq->last == se)
1006 cfs_rq->last = NULL;
1007 else
1008 break;
1009 }
1010}
1011
1012static void __clear_buddies_next(struct sched_entity *se)
1013{
1014 for_each_sched_entity(se) {
1015 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1016 if (cfs_rq->next == se)
1017 cfs_rq->next = NULL;
1018 else
1019 break;
1020 }
1021}
1022
1023static void __clear_buddies_skip(struct sched_entity *se)
1024{
1025 for_each_sched_entity(se) {
1026 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1027 if (cfs_rq->skip == se)
1028 cfs_rq->skip = NULL;
1029 else
1030 break;
1031 }
1032}
1033
1034static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1035{
1036 if (cfs_rq->last == se)
1037 __clear_buddies_last(se);
1038
1039 if (cfs_rq->next == se)
1040 __clear_buddies_next(se);
1041
1042 if (cfs_rq->skip == se)
1043 __clear_buddies_skip(se);
1044}
1045
1046static void
1047dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1048{
1049
1050
1051
1052 update_curr(cfs_rq);
1053
1054 update_stats_dequeue(cfs_rq, se);
1055 if (flags & DEQUEUE_SLEEP) {
1056#ifdef CONFIG_SCHEDSTATS
1057 if (entity_is_task(se)) {
1058 struct task_struct *tsk = task_of(se);
1059
1060 if (tsk->state & TASK_INTERRUPTIBLE)
1061 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
1062 if (tsk->state & TASK_UNINTERRUPTIBLE)
1063 se->statistics.block_start = rq_of(cfs_rq)->clock;
1064 }
1065#endif
1066 }
1067
1068 clear_buddies(cfs_rq, se);
1069
1070 if (se != cfs_rq->curr)
1071 __dequeue_entity(cfs_rq, se);
1072 se->on_rq = 0;
1073 update_cfs_load(cfs_rq, 0);
1074 account_entity_dequeue(cfs_rq, se);
1075 update_min_vruntime(cfs_rq);
1076 update_cfs_shares(cfs_rq);
1077
1078
1079
1080
1081
1082
1083 if (!(flags & DEQUEUE_SLEEP))
1084 se->vruntime -= cfs_rq->min_vruntime;
1085}
1086
1087
1088
1089
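/*
 * Tick-driven preemption check: resched the current task once it has
 * exceeded its ideal wall-time slice, or (after at least
 * sysctl_sched_min_granularity has elapsed) when the leftmost queued
 * entity trails current by more than that slice in vruntime.
 */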
1090static void
1091check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1092{
1093 unsigned long ideal_runtime, delta_exec;
1094
1095 ideal_runtime = sched_slice(cfs_rq, curr);
1096 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1097 if (delta_exec > ideal_runtime) {
1098 resched_task(rq_of(cfs_rq)->curr);
1099
1100
1101
1102
1103 clear_buddies(cfs_rq, curr);
1104 return;
1105 }
1106
1107
1108
1109
1110
1111
1112 if (!sched_feat(WAKEUP_PREEMPT))
1113 return;
1114
1115 if (delta_exec < sysctl_sched_min_granularity)
1116 return;
1117
1118 if (cfs_rq->nr_running > 1) {
1119 struct sched_entity *se = __pick_first_entity(cfs_rq);
1120 s64 delta = curr->vruntime - se->vruntime;
1121
1122 if (delta < 0)
1123 return;
1124
1125 if (delta > ideal_runtime)
1126 resched_task(rq_of(cfs_rq)->curr);
1127 }
1128}
1129
1130static void
1131set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1132{
1133
1134 if (se->on_rq) {
1135
1136
1137
1138
1139
1140 update_stats_wait_end(cfs_rq, se);
1141 __dequeue_entity(cfs_rq, se);
1142 }
1143
1144 update_stats_curr_start(cfs_rq, se);
1145 cfs_rq->curr = se;
1146#ifdef CONFIG_SCHEDSTATS
1147
1148
1149
1150
1151
1152 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
1153 se->statistics.slice_max = max(se->statistics.slice_max,
1154 se->sum_exec_runtime - se->prev_sum_exec_runtime);
1155 }
1156#endif
1157 se->prev_sum_exec_runtime = se->sum_exec_runtime;
1158}
1159
1160static int
1161wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1162
1163
1164
1165
1166
1167
1168
1169
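/*
 * Pick the next entity, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" buddy, since someone really wants that to run
 * 3) pick the "last" buddy, for cache locality
 * 4) do not run the "skip" entity, if something else is available
 */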
1170static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1171{
1172 struct sched_entity *se = __pick_first_entity(cfs_rq);
1173 struct sched_entity *left = se;
1174
1175
1176
1177
1178
1179 if (cfs_rq->skip == se) {
1180 struct sched_entity *second = __pick_next_entity(se);
1181 if (second && wakeup_preempt_entity(second, left) < 1)
1182 se = second;
1183 }
1184
1185
1186
1187
1188 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1189 se = cfs_rq->last;
1190
1191
1192
1193
1194 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1195 se = cfs_rq->next;
1196
1197 clear_buddies(cfs_rq, se);
1198
1199 return se;
1200}
1201
1202static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1203{
1204
1205
1206
1207
1208 if (prev->on_rq)
1209 update_curr(cfs_rq);
1210
1211 check_spread(cfs_rq, prev);
1212 if (prev->on_rq) {
1213 update_stats_wait_start(cfs_rq, prev);
1214
1215 __enqueue_entity(cfs_rq, prev);
1216 }
1217 cfs_rq->curr = NULL;
1218}
1219
1220static void
1221entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1222{
1223
1224
1225
1226 update_curr(cfs_rq);
1227
1228
1229
1230
1231 update_entity_shares_tick(cfs_rq);
1232
1233#ifdef CONFIG_SCHED_HRTICK
1234
1235
1236
1237
1238 if (queued) {
1239 resched_task(rq_of(cfs_rq)->curr);
1240 return;
1241 }
1242
1243
1244
1245 if (!sched_feat(DOUBLE_TICK) &&
1246 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
1247 return;
1248#endif
1249
1250 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
1251 check_preempt_tick(cfs_rq, curr);
1252}
1253
1254
1255
1256
1257
1258#ifdef CONFIG_SCHED_HRTICK
1259static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
1260{
1261 struct sched_entity *se = &p->se;
1262 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1263
1264 WARN_ON(task_rq(p) != rq);
1265
1266 if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
1267 u64 slice = sched_slice(cfs_rq, se);
1268 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1269 s64 delta = slice - ran;
1270
1271 if (delta < 0) {
1272 if (rq->curr == p)
1273 resched_task(p);
1274 return;
1275 }
1276
1277
1278
1279
1280
1281 if (rq->curr != p)
1282 delta = max_t(s64, 10000LL, delta);
1283
1284 hrtick_start(rq, delta);
1285 }
1286}
1287
1288
1289
1290
1291
1292
1293static void hrtick_update(struct rq *rq)
1294{
1295 struct task_struct *curr = rq->curr;
1296
1297 if (curr->sched_class != &fair_sched_class)
1298 return;
1299
1300 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
1301 hrtick_start_fair(rq, curr);
1302}
1303#else
1304static inline void
1305hrtick_start_fair(struct rq *rq, struct task_struct *p)
1306{
1307}
1308
1309static inline void hrtick_update(struct rq *rq)
1310{
1311}
1312#endif
1313
1314
1315
1316
1317
1318
1319static void
1320enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1321{
1322 struct cfs_rq *cfs_rq;
1323 struct sched_entity *se = &p->se;
1324
1325 for_each_sched_entity(se) {
1326 if (se->on_rq)
1327 break;
1328 cfs_rq = cfs_rq_of(se);
1329 enqueue_entity(cfs_rq, se, flags);
1330 flags = ENQUEUE_WAKEUP;
1331 }
1332
1333 for_each_sched_entity(se) {
1334 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1335
1336 update_cfs_load(cfs_rq, 0);
1337 update_cfs_shares(cfs_rq);
1338 }
1339
1340 hrtick_update(rq);
1341}
1342
1343
1344
1345
1346
1347
1348static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1349{
1350 struct cfs_rq *cfs_rq;
1351 struct sched_entity *se = &p->se;
1352
1353 for_each_sched_entity(se) {
1354 cfs_rq = cfs_rq_of(se);
1355 dequeue_entity(cfs_rq, se, flags);
1356
1357
1358 if (cfs_rq->load.weight)
1359 break;
1360 flags |= DEQUEUE_SLEEP;
1361 }
1362
1363 for_each_sched_entity(se) {
1364 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1365
1366 update_cfs_load(cfs_rq, 0);
1367 update_cfs_shares(cfs_rq);
1368 }
1369
1370 hrtick_update(rq);
1371}
1372
1373#ifdef CONFIG_SMP
1374
1375static void task_waking_fair(struct rq *rq, struct task_struct *p)
1376{
1377 struct sched_entity *se = &p->se;
1378 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1379
1380 se->vruntime -= cfs_rq->min_vruntime;
1381}
1382
1383#ifdef CONFIG_FAIR_GROUP_SCHED
1384
1385
1386
1387
1388
1389
1390
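/*
 * effective_load(): estimate the change in this group's load as seen at
 * the root task group when 'wl' units of weight are added on 'cpu'
 * ('wg' being the weight simultaneously added to the group itself),
 * propagating the change through each level's share recalculation.
 */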
1391static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1392{
1393 struct sched_entity *se = tg->se[cpu];
1394
1395 if (!tg->parent)
1396 return wl;
1397
1398 for_each_sched_entity(se) {
1399 long lw, w;
1400
1401 tg = se->my_q->tg;
1402 w = se->my_q->load.weight;
1403
1404
1405 lw = atomic_read(&tg->load_weight);
1406 lw -= se->my_q->load_contribution;
1407 lw += w + wg;
1408
1409 wl += w;
1410
1411 if (lw > 0 && wl < lw)
1412 wl = (wl * tg->shares) / lw;
1413 else
1414 wl = tg->shares;
1415
1416
1417 if (wl < MIN_SHARES)
1418 wl = MIN_SHARES;
1419 wl -= se->load.weight;
1420 wg = 0;
1421 }
1422
1423 return wl;
1424}
1425
1426#else
1427
1428static inline unsigned long effective_load(struct task_group *tg, int cpu,
1429 unsigned long wl, unsigned long wg)
1430{
1431 return wl;
1432}
1433
1434#endif
1435
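/*
 * wake_affine(): decide whether the waking task should be pulled to the
 * waking cpu.  Compares the imbalance-weighted effective load of this
 * cpu (with the task added, and the sync waker removed) against that of
 * the task's previous cpu; returns 1 to allow an affine wakeup.
 */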
1436static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1437{
1438 s64 this_load, load;
1439 int idx, this_cpu, prev_cpu;
1440 unsigned long tl_per_task;
1441 struct task_group *tg;
1442 unsigned long weight;
1443 int balanced;
1444
1445 idx = sd->wake_idx;
1446 this_cpu = smp_processor_id();
1447 prev_cpu = task_cpu(p);
1448 load = source_load(prev_cpu, idx);
1449 this_load = target_load(this_cpu, idx);
1450
1451
1452
1453
1454
1455
1456 rcu_read_lock();
1457 if (sync) {
1458 tg = task_group(current);
1459 weight = current->se.load.weight;
1460
1461 this_load += effective_load(tg, this_cpu, -weight, -weight);
1462 load += effective_load(tg, prev_cpu, 0, -weight);
1463 }
1464
1465 tg = task_group(p);
1466 weight = p->se.load.weight;
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477 if (this_load > 0) {
1478 s64 this_eff_load, prev_eff_load;
1479
1480 this_eff_load = 100;
1481 this_eff_load *= power_of(prev_cpu);
1482 this_eff_load *= this_load +
1483 effective_load(tg, this_cpu, weight, weight);
1484
1485 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
1486 prev_eff_load *= power_of(this_cpu);
1487 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
1488
1489 balanced = this_eff_load <= prev_eff_load;
1490 } else
1491 balanced = true;
1492 rcu_read_unlock();
1493
1494
1495
1496
1497
1498
1499 if (sync && balanced)
1500 return 1;
1501
1502 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
1503 tl_per_task = cpu_avg_load_per_task(this_cpu);
1504
1505 if (balanced ||
1506 (this_load <= load &&
1507 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
1508
1509
1510
1511
1512
1513 schedstat_inc(sd, ttwu_move_affine);
1514 schedstat_inc(p, se.statistics.nr_wakeups_affine);
1515
1516 return 1;
1517 }
1518 return 0;
1519}
1520
1521
1522
1523
1524
1525static struct sched_group *
1526find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1527 int this_cpu, int load_idx)
1528{
1529 struct sched_group *idlest = NULL, *group = sd->groups;
1530 unsigned long min_load = ULONG_MAX, this_load = 0;
1531 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1532
1533 do {
1534 unsigned long load, avg_load;
1535 int local_group;
1536 int i;
1537
1538
1539 if (!cpumask_intersects(sched_group_cpus(group),
1540 &p->cpus_allowed))
1541 continue;
1542
1543 local_group = cpumask_test_cpu(this_cpu,
1544 sched_group_cpus(group));
1545
1546
1547 avg_load = 0;
1548
1549 for_each_cpu(i, sched_group_cpus(group)) {
1550
1551 if (local_group)
1552 load = source_load(i, load_idx);
1553 else
1554 load = target_load(i, load_idx);
1555
1556 avg_load += load;
1557 }
1558
1559
1560 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1561
1562 if (local_group) {
1563 this_load = avg_load;
1564 } else if (avg_load < min_load) {
1565 min_load = avg_load;
1566 idlest = group;
1567 }
1568 } while (group = group->next, group != sd->groups);
1569
1570 if (!idlest || 100*this_load < imbalance*min_load)
1571 return NULL;
1572 return idlest;
1573}
1574
1575
1576
1577
1578static int
1579find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1580{
1581 unsigned long load, min_load = ULONG_MAX;
1582 int idlest = -1;
1583 int i;
1584
1585
1586 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
1587 load = weighted_cpuload(i);
1588
1589 if (load < min_load || (load == min_load && i == this_cpu)) {
1590 min_load = load;
1591 idlest = i;
1592 }
1593 }
1594
1595 return idlest;
1596}
1597
1598
1599
1600
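/*
 * Try to find an idle cpu close to 'target', walking only domains that
 * share package resources (SD_SHARE_PKG_RESOURCES); fall back to
 * 'target' itself if none is found.
 */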
1601static int select_idle_sibling(struct task_struct *p, int target)
1602{
1603 int cpu = smp_processor_id();
1604 int prev_cpu = task_cpu(p);
1605 struct sched_domain *sd;
1606 int i;
1607
1608
1609
1610
1611
1612 if (target == cpu && idle_cpu(cpu))
1613 return cpu;
1614
1615
1616
1617
1618
1619 if (target == prev_cpu && idle_cpu(prev_cpu))
1620 return prev_cpu;
1621
1622
1623
1624
1625 for_each_domain(target, sd) {
1626 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1627 break;
1628
1629 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1630 if (idle_cpu(i)) {
1631 target = i;
1632 break;
1633 }
1634 }
1635
1636
1637
1638
1639
1640 if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
1641 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1642 break;
1643 }
1644
1645 return target;
1646}
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659static int
1660select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
1661{
1662 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1663 int cpu = smp_processor_id();
1664 int prev_cpu = task_cpu(p);
1665 int new_cpu = cpu;
1666 int want_affine = 0;
1667 int want_sd = 1;
1668 int sync = wake_flags & WF_SYNC;
1669
1670 if (sd_flag & SD_BALANCE_WAKE) {
1671 if (cpumask_test_cpu(cpu, &p->cpus_allowed))
1672 want_affine = 1;
1673 new_cpu = prev_cpu;
1674 }
1675
1676 for_each_domain(cpu, tmp) {
1677 if (!(tmp->flags & SD_LOAD_BALANCE))
1678 continue;
1679
1680
1681
1682
1683
1684 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
1685 unsigned long power = 0;
1686 unsigned long nr_running = 0;
1687 unsigned long capacity;
1688 int i;
1689
1690 for_each_cpu(i, sched_domain_span(tmp)) {
1691 power += power_of(i);
1692 nr_running += cpu_rq(i)->cfs.nr_running;
1693 }
1694
1695 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
1696
1697 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1698 nr_running /= 2;
1699
1700 if (nr_running < capacity)
1701 want_sd = 0;
1702 }
1703
1704
1705
1706
1707
1708 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1709 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1710 affine_sd = tmp;
1711 want_affine = 0;
1712 }
1713
1714 if (!want_sd && !want_affine)
1715 break;
1716
1717 if (!(tmp->flags & sd_flag))
1718 continue;
1719
1720 if (want_sd)
1721 sd = tmp;
1722 }
1723
1724 if (affine_sd) {
1725 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1726 return select_idle_sibling(p, cpu);
1727 else
1728 return select_idle_sibling(p, prev_cpu);
1729 }
1730
1731 while (sd) {
1732 int load_idx = sd->forkexec_idx;
1733 struct sched_group *group;
1734 int weight;
1735
1736 if (!(sd->flags & sd_flag)) {
1737 sd = sd->child;
1738 continue;
1739 }
1740
1741 if (sd_flag & SD_BALANCE_WAKE)
1742 load_idx = sd->wake_idx;
1743
1744 group = find_idlest_group(sd, p, cpu, load_idx);
1745 if (!group) {
1746 sd = sd->child;
1747 continue;
1748 }
1749
1750 new_cpu = find_idlest_cpu(group, p, cpu);
1751 if (new_cpu == -1 || new_cpu == cpu) {
1752
1753 sd = sd->child;
1754 continue;
1755 }
1756
1757
1758 cpu = new_cpu;
1759 weight = sd->span_weight;
1760 sd = NULL;
1761 for_each_domain(cpu, tmp) {
1762 if (weight <= tmp->span_weight)
1763 break;
1764 if (tmp->flags & sd_flag)
1765 sd = tmp;
1766 }
1767
1768 }
1769
1770 return new_cpu;
1771}
1772#endif
1773
1774static unsigned long
1775wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1776{
1777 unsigned long gran = sysctl_sched_wakeup_granularity;
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792 if (unlikely(se->load.weight != NICE_0_LOAD))
1793 gran = calc_delta_fair(gran, se);
1794
1795 return gran;
1796}
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
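/*
 * Should 'se' preempt 'curr'?  Returns 1 when curr's vruntime leads se's
 * by more than the (weight-scaled) wakeup granularity, 0 when it leads
 * by less than that, and -1 when se is not ahead at all.
 */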
1812static int
1813wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1814{
1815 s64 gran, vdiff = curr->vruntime - se->vruntime;
1816
1817 if (vdiff <= 0)
1818 return -1;
1819
1820 gran = wakeup_gran(curr, se);
1821 if (vdiff > gran)
1822 return 1;
1823
1824 return 0;
1825}
1826
1827static void set_last_buddy(struct sched_entity *se)
1828{
1829 if (likely(task_of(se)->policy != SCHED_IDLE)) {
1830 for_each_sched_entity(se)
1831 cfs_rq_of(se)->last = se;
1832 }
1833}
1834
1835static void set_next_buddy(struct sched_entity *se)
1836{
1837 if (likely(task_of(se)->policy != SCHED_IDLE)) {
1838 for_each_sched_entity(se)
1839 cfs_rq_of(se)->next = se;
1840 }
1841}
1842
1843static void set_skip_buddy(struct sched_entity *se)
1844{
1845 if (likely(task_of(se)->policy != SCHED_IDLE)) {
1846 for_each_sched_entity(se)
1847 cfs_rq_of(se)->skip = se;
1848 }
1849}
1850
1851
1852
1853
1854static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1855{
1856 struct task_struct *curr = rq->curr;
1857 struct sched_entity *se = &curr->se, *pse = &p->se;
1858 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1859 int scale = cfs_rq->nr_running >= sched_nr_latency;
1860
1861 if (unlikely(se == pse))
1862 return;
1863
1864 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
1865 set_next_buddy(pse);
1866
1867
1868
1869
1870
1871 if (test_tsk_need_resched(curr))
1872 return;
1873
1874
1875 if (unlikely(curr->policy == SCHED_IDLE) &&
1876 likely(p->policy != SCHED_IDLE))
1877 goto preempt;
1878
1879
1880
1881
1882
1883 if (unlikely(p->policy != SCHED_NORMAL))
1884 return;
1885
1886
1887 if (!sched_feat(WAKEUP_PREEMPT))
1888 return;
1889
1890 update_curr(cfs_rq);
1891 find_matching_se(&se, &pse);
1892 BUG_ON(!pse);
1893 if (wakeup_preempt_entity(se, pse) == 1)
1894 goto preempt;
1895
1896 return;
1897
1898preempt:
1899 resched_task(curr);
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909 if (unlikely(!se->on_rq || curr == rq->idle))
1910 return;
1911
1912 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
1913 set_last_buddy(se);
1914}
1915
1916static struct task_struct *pick_next_task_fair(struct rq *rq)
1917{
1918 struct task_struct *p;
1919 struct cfs_rq *cfs_rq = &rq->cfs;
1920 struct sched_entity *se;
1921
1922 if (!cfs_rq->nr_running)
1923 return NULL;
1924
1925 do {
1926 se = pick_next_entity(cfs_rq);
1927 set_next_entity(cfs_rq, se);
1928 cfs_rq = group_cfs_rq(se);
1929 } while (cfs_rq);
1930
1931 p = task_of(se);
1932 hrtick_start_fair(rq, p);
1933
1934 return p;
1935}
1936
1937
1938
1939
1940static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1941{
1942 struct sched_entity *se = &prev->se;
1943 struct cfs_rq *cfs_rq;
1944
1945 for_each_sched_entity(se) {
1946 cfs_rq = cfs_rq_of(se);
1947 put_prev_entity(cfs_rq, se);
1948 }
1949}
1950
1951
1952
1953
1954
1955
1956static void yield_task_fair(struct rq *rq)
1957{
1958 struct task_struct *curr = rq->curr;
1959 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1960 struct sched_entity *se = &curr->se;
1961
1962
1963
1964
1965 if (unlikely(rq->nr_running == 1))
1966 return;
1967
1968 clear_buddies(cfs_rq, se);
1969
1970 if (curr->policy != SCHED_BATCH) {
1971 update_rq_clock(rq);
1972
1973
1974
1975 update_curr(cfs_rq);
1976 }
1977
1978 set_skip_buddy(se);
1979}
1980
1981static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
1982{
1983 struct sched_entity *se = &p->se;
1984
1985 if (!se->on_rq)
1986 return false;
1987
1988
1989 set_next_buddy(se);
1990
1991 yield_task_fair(rq);
1992
1993 return true;
1994}
1995
1996#ifdef CONFIG_SMP
1997
1998
1999
2000
2001
2002
2003
2004
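/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 */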
2005static void pull_task(struct rq *src_rq, struct task_struct *p,
2006 struct rq *this_rq, int this_cpu)
2007{
2008 deactivate_task(src_rq, p, 0);
2009 set_task_cpu(p, this_cpu);
2010 activate_task(this_rq, p, 0);
2011 check_preempt_curr(this_rq, p, 0);
2012}
2013
2014
2015
2016
2017static
2018int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2019 struct sched_domain *sd, enum cpu_idle_type idle,
2020 int *all_pinned)
2021{
2022 int tsk_cache_hot = 0;
2023
2024
2025
2026
2027
2028
2029 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
2030 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
2031 return 0;
2032 }
2033 *all_pinned = 0;
2034
2035 if (task_running(rq, p)) {
2036 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
2037 return 0;
2038 }
2039
2040
2041
2042
2043
2044
2045
2046 tsk_cache_hot = task_hot(p, rq->clock_task, sd);
2047 if (!tsk_cache_hot ||
2048 sd->nr_balance_failed > sd->cache_nice_tries) {
2049#ifdef CONFIG_SCHEDSTATS
2050 if (tsk_cache_hot) {
2051 schedstat_inc(sd, lb_hot_gained[idle]);
2052 schedstat_inc(p, se.statistics.nr_forced_migrations);
2053 }
2054#endif
2055 return 1;
2056 }
2057
2058 if (tsk_cache_hot) {
2059 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
2060 return 0;
2061 }
2062 return 1;
2063}
2064
2065
2066
2067
2068
2069
2070
2071
2072static int
2073move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2074 struct sched_domain *sd, enum cpu_idle_type idle)
2075{
2076 struct task_struct *p, *n;
2077 struct cfs_rq *cfs_rq;
2078 int pinned = 0;
2079
2080 for_each_leaf_cfs_rq(busiest, cfs_rq) {
2081 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
2082
2083 if (!can_migrate_task(p, busiest, this_cpu,
2084 sd, idle, &pinned))
2085 continue;
2086
2087 pull_task(busiest, p, this_rq, this_cpu);
2088
2089
2090
2091
2092
2093 schedstat_inc(sd, lb_gained[idle]);
2094 return 1;
2095 }
2096 }
2097
2098 return 0;
2099}
2100
2101static unsigned long
2102balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2103 unsigned long max_load_move, struct sched_domain *sd,
2104 enum cpu_idle_type idle, int *all_pinned,
2105 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
2106{
2107 int loops = 0, pulled = 0;
2108 long rem_load_move = max_load_move;
2109 struct task_struct *p, *n;
2110
2111 if (max_load_move == 0)
2112 goto out;
2113
2114 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
2115 if (loops++ > sysctl_sched_nr_migrate)
2116 break;
2117
2118 if ((p->se.load.weight >> 1) > rem_load_move ||
2119 !can_migrate_task(p, busiest, this_cpu, sd, idle,
2120 all_pinned))
2121 continue;
2122
2123 pull_task(busiest, p, this_rq, this_cpu);
2124 pulled++;
2125 rem_load_move -= p->se.load.weight;
2126
2127#ifdef CONFIG_PREEMPT
2128
2129
2130
2131
2132
2133 if (idle == CPU_NEWLY_IDLE)
2134 break;
2135#endif
2136
2137
2138
2139
2140
2141 if (rem_load_move <= 0)
2142 break;
2143
2144 if (p->prio < *this_best_prio)
2145 *this_best_prio = p->prio;
2146 }
2147out:
2148
2149
2150
2151
2152
2153 schedstat_add(sd, lb_gained[idle], pulled);
2154
2155 return max_load_move - rem_load_move;
2156}
2157
2158#ifdef CONFIG_FAIR_GROUP_SCHED
2159
2160
2161
2162static int update_shares_cpu(struct task_group *tg, int cpu)
2163{
2164 struct cfs_rq *cfs_rq;
2165 unsigned long flags;
2166 struct rq *rq;
2167
2168 if (!tg->se[cpu])
2169 return 0;
2170
2171 rq = cpu_rq(cpu);
2172 cfs_rq = tg->cfs_rq[cpu];
2173
2174 raw_spin_lock_irqsave(&rq->lock, flags);
2175
2176 update_rq_clock(rq);
2177 update_cfs_load(cfs_rq, 1);
2178
2179
2180
2181
2182
2183 update_cfs_shares(cfs_rq);
2184
2185 raw_spin_unlock_irqrestore(&rq->lock, flags);
2186
2187 return 0;
2188}
2189
2190static void update_shares(int cpu)
2191{
2192 struct cfs_rq *cfs_rq;
2193 struct rq *rq = cpu_rq(cpu);
2194
2195 rcu_read_lock();
2196 for_each_leaf_cfs_rq(rq, cfs_rq)
2197 update_shares_cpu(cfs_rq->tg, cpu);
2198 rcu_read_unlock();
2199}
2200
2201static unsigned long
2202load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2203 unsigned long max_load_move,
2204 struct sched_domain *sd, enum cpu_idle_type idle,
2205 int *all_pinned, int *this_best_prio)
2206{
2207 long rem_load_move = max_load_move;
2208 int busiest_cpu = cpu_of(busiest);
2209 struct task_group *tg;
2210
2211 rcu_read_lock();
2212 update_h_load(busiest_cpu);
2213
2214 list_for_each_entry_rcu(tg, &task_groups, list) {
2215 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
2216 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
2217 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
2218 u64 rem_load, moved_load;
2219
2220
2221
2222
2223 if (!busiest_cfs_rq->task_weight)
2224 continue;
2225
2226 rem_load = (u64)rem_load_move * busiest_weight;
2227 rem_load = div_u64(rem_load, busiest_h_load + 1);
2228
2229 moved_load = balance_tasks(this_rq, this_cpu, busiest,
2230 rem_load, sd, idle, all_pinned, this_best_prio,
2231 busiest_cfs_rq);
2232
2233 if (!moved_load)
2234 continue;
2235
2236 moved_load *= busiest_h_load;
2237 moved_load = div_u64(moved_load, busiest_weight + 1);
2238
2239 rem_load_move -= moved_load;
2240 if (rem_load_move < 0)
2241 break;
2242 }
2243 rcu_read_unlock();
2244
2245 return max_load_move - rem_load_move;
2246}
2247#else
2248static inline void update_shares(int cpu)
2249{
2250}
2251
2252static unsigned long
2253load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2254 unsigned long max_load_move,
2255 struct sched_domain *sd, enum cpu_idle_type idle,
2256 int *all_pinned, int *this_best_prio)
2257{
2258 return balance_tasks(this_rq, this_cpu, busiest,
2259 max_load_move, sd, idle, all_pinned,
2260 this_best_prio, &busiest->cfs);
2261}
2262#endif
2263
2264
2265
2266
2267
2268
2269
2270
2271static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2272 unsigned long max_load_move,
2273 struct sched_domain *sd, enum cpu_idle_type idle,
2274 int *all_pinned)
2275{
2276 unsigned long total_load_moved = 0, load_moved;
2277 int this_best_prio = this_rq->curr->prio;
2278
2279 do {
2280 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2281 max_load_move - total_load_moved,
2282 sd, idle, all_pinned, &this_best_prio);
2283
2284 total_load_moved += load_moved;
2285
2286#ifdef CONFIG_PREEMPT
2287
2288
2289
2290
2291
2292 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
2293 break;
2294
2295 if (raw_spin_is_contended(&this_rq->lock) ||
2296 raw_spin_is_contended(&busiest->lock))
2297 break;
2298#endif
2299 } while (load_moved && max_load_move > total_load_moved);
2300
2301 return total_load_moved > 0;
2302}
2303
2304
2305
2306
2307
2308
2309struct sd_lb_stats {
2310 struct sched_group *busiest;
2311 struct sched_group *this;
2312 unsigned long total_load;
2313 unsigned long total_pwr;
2314 unsigned long avg_load;
2315
2316
2317 unsigned long this_load;
2318 unsigned long this_load_per_task;
2319 unsigned long this_nr_running;
2320 unsigned long this_has_capacity;
2321 unsigned int this_idle_cpus;
2322
2323
2324 unsigned int busiest_idle_cpus;
2325 unsigned long max_load;
2326 unsigned long busiest_load_per_task;
2327 unsigned long busiest_nr_running;
2328 unsigned long busiest_group_capacity;
2329 unsigned long busiest_has_capacity;
2330 unsigned int busiest_group_weight;
2331
2332 int group_imb;
2333#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2334 int power_savings_balance;
2335 struct sched_group *group_min;
2336 struct sched_group *group_leader;
2337 unsigned long min_load_per_task;
2338 unsigned long leader_nr_running;
2339 unsigned long min_nr_running;
2340#endif
2341};
2342
2343
2344
2345
2346struct sg_lb_stats {
2347 unsigned long avg_load;
2348 unsigned long group_load;
2349 unsigned long sum_nr_running;
2350 unsigned long sum_weighted_load;
2351 unsigned long group_capacity;
2352 unsigned long idle_cpus;
2353 unsigned long group_weight;
2354 int group_imb;
2355 int group_has_capacity;
2356};
2357
2358
2359
2360
2361
2362static inline unsigned int group_first_cpu(struct sched_group *group)
2363{
2364 return cpumask_first(sched_group_cpus(group));
2365}
2366
2367
2368
2369
2370
2371
2372static inline int get_sd_load_idx(struct sched_domain *sd,
2373 enum cpu_idle_type idle)
2374{
2375 int load_idx;
2376
2377 switch (idle) {
2378 case CPU_NOT_IDLE:
2379 load_idx = sd->busy_idx;
2380 break;
2381
2382 case CPU_NEWLY_IDLE:
2383 load_idx = sd->newidle_idx;
2384 break;
2385 default:
2386 load_idx = sd->idle_idx;
2387 break;
2388 }
2389
2390 return load_idx;
2391}
2392
2393
2394#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2395
2396
2397
2398
2399
2400
2401
2402
2403static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2404 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2405{
2406
2407
2408
2409
2410 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2411 sds->power_savings_balance = 0;
2412 else {
2413 sds->power_savings_balance = 1;
2414 sds->min_nr_running = ULONG_MAX;
2415 sds->leader_nr_running = 0;
2416 }
2417}
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429static inline void update_sd_power_savings_stats(struct sched_group *group,
2430 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2431{
2432
2433 if (!sds->power_savings_balance)
2434 return;
2435
2436
2437
2438
2439
2440 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
2441 !sds->this_nr_running))
2442 sds->power_savings_balance = 0;
2443
2444
2445
2446
2447
2448 if (!sds->power_savings_balance ||
2449 sgs->sum_nr_running >= sgs->group_capacity ||
2450 !sgs->sum_nr_running)
2451 return;
2452
2453
2454
2455
2456
2457
2458 if ((sgs->sum_nr_running < sds->min_nr_running) ||
2459 (sgs->sum_nr_running == sds->min_nr_running &&
2460 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
2461 sds->group_min = group;
2462 sds->min_nr_running = sgs->sum_nr_running;
2463 sds->min_load_per_task = sgs->sum_weighted_load /
2464 sgs->sum_nr_running;
2465 }
2466
2467
2468
2469
2470
2471
2472 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
2473 return;
2474
2475 if (sgs->sum_nr_running > sds->leader_nr_running ||
2476 (sgs->sum_nr_running == sds->leader_nr_running &&
2477 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
2478 sds->group_leader = group;
2479 sds->leader_nr_running = sgs->sum_nr_running;
2480 }
2481}
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2499 int this_cpu, unsigned long *imbalance)
2500{
2501 if (!sds->power_savings_balance)
2502 return 0;
2503
2504 if (sds->this != sds->group_leader ||
2505 sds->group_leader == sds->group_min)
2506 return 0;
2507
2508 *imbalance = sds->min_load_per_task;
2509 sds->busiest = sds->group_min;
2510
2511 return 1;
2512
2513}
2514#else
2515static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2516 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2517{
2518 return;
2519}
2520
2521static inline void update_sd_power_savings_stats(struct sched_group *group,
2522 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2523{
2524 return;
2525}
2526
2527static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2528 int this_cpu, unsigned long *imbalance)
2529{
2530 return 0;
2531}
2532#endif
2533
2534
2535unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2536{
2537 return SCHED_LOAD_SCALE;
2538}
2539
2540unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2541{
2542 return default_scale_freq_power(sd, cpu);
2543}
2544
2545unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2546{
2547 unsigned long weight = sd->span_weight;
2548 unsigned long smt_gain = sd->smt_gain;
2549
2550 smt_gain /= weight;
2551
2552 return smt_gain;
2553}
2554
2555unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
2556{
2557 return default_scale_smt_power(sd, cpu);
2558}
2559
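/*
 * scale_rt_power(): the fraction of the recent averaging period left
 * over after the time tracked in rq->rt_avg (e.g. realtime execution),
 * expressed relative to SCHED_LOAD_SCALE; used by update_cpu_power() to
 * scale down the cpu_power available to CFS tasks.
 */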
2560unsigned long scale_rt_power(int cpu)
2561{
2562 struct rq *rq = cpu_rq(cpu);
2563 u64 total, available;
2564
2565 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2566
2567 if (unlikely(total < rq->rt_avg)) {
2568
2569 available = 0;
2570 } else {
2571 available = total - rq->rt_avg;
2572 }
2573
2574 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2575 total = SCHED_LOAD_SCALE;
2576
2577 total >>= SCHED_LOAD_SHIFT;
2578
2579 return div_u64(available, total);
2580}
2581
2582static void update_cpu_power(struct sched_domain *sd, int cpu)
2583{
2584 unsigned long weight = sd->span_weight;
2585 unsigned long power = SCHED_LOAD_SCALE;
2586 struct sched_group *sdg = sd->groups;
2587
2588 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2589 if (sched_feat(ARCH_POWER))
2590 power *= arch_scale_smt_power(sd, cpu);
2591 else
2592 power *= default_scale_smt_power(sd, cpu);
2593
2594 power >>= SCHED_LOAD_SHIFT;
2595 }
2596
2597 sdg->cpu_power_orig = power;
2598
2599 if (sched_feat(ARCH_POWER))
2600 power *= arch_scale_freq_power(sd, cpu);
2601 else
2602 power *= default_scale_freq_power(sd, cpu);
2603
2604 power >>= SCHED_LOAD_SHIFT;
2605
2606 power *= scale_rt_power(cpu);
2607 power >>= SCHED_LOAD_SHIFT;
2608
2609 if (!power)
2610 power = 1;
2611
2612 cpu_rq(cpu)->cpu_power = power;
2613 sdg->cpu_power = power;
2614}
2615
2616static void update_group_power(struct sched_domain *sd, int cpu)
2617{
2618 struct sched_domain *child = sd->child;
2619 struct sched_group *group, *sdg = sd->groups;
2620 unsigned long power;
2621
2622 if (!child) {
2623 update_cpu_power(sd, cpu);
2624 return;
2625 }
2626
2627 power = 0;
2628
2629 group = child->groups;
2630 do {
2631 power += group->cpu_power;
2632 group = group->next;
2633 } while (group != child->groups);
2634
2635 sdg->cpu_power = power;
2636}
2637
2638
2639
2640
2641
2642
2643
2644
2645static inline int
2646fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2647{
2648
2649
2650
2651 if (sd->level != SD_LV_SIBLING)
2652 return 0;
2653
2654
2655
2656
2657 if (group->cpu_power * 32 > group->cpu_power_orig * 29)
2658 return 1;
2659
2660 return 0;
2661}
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675static inline void update_sg_lb_stats(struct sched_domain *sd,
2676 struct sched_group *group, int this_cpu,
2677 enum cpu_idle_type idle, int load_idx,
2678 int local_group, const struct cpumask *cpus,
2679 int *balance, struct sg_lb_stats *sgs)
2680{
2681 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
2682 int i;
2683 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2684 unsigned long avg_load_per_task = 0;
2685
2686 if (local_group)
2687 balance_cpu = group_first_cpu(group);
2688
2689
2690 max_cpu_load = 0;
2691 min_cpu_load = ~0UL;
2692 max_nr_running = 0;
2693
2694 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2695 struct rq *rq = cpu_rq(i);
2696
2697
2698 if (local_group) {
2699 if (idle_cpu(i) && !first_idle_cpu) {
2700 first_idle_cpu = 1;
2701 balance_cpu = i;
2702 }
2703
2704 load = target_load(i, load_idx);
2705 } else {
2706 load = source_load(i, load_idx);
2707 if (load > max_cpu_load) {
2708 max_cpu_load = load;
2709 max_nr_running = rq->nr_running;
2710 }
2711 if (min_cpu_load > load)
2712 min_cpu_load = load;
2713 }
2714
2715 sgs->group_load += load;
2716 sgs->sum_nr_running += rq->nr_running;
2717 sgs->sum_weighted_load += weighted_cpuload(i);
2718 if (idle_cpu(i))
2719 sgs->idle_cpus++;
2720 }
2721
2722
2723
2724
2725
2726
2727
2728 if (idle != CPU_NEWLY_IDLE && local_group) {
2729 if (balance_cpu != this_cpu) {
2730 *balance = 0;
2731 return;
2732 }
2733 update_group_power(sd, this_cpu);
2734 }
2735
2736
2737 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748 if (sgs->sum_nr_running)
2749 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2750
2751 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
2752 sgs->group_imb = 1;
2753
2754 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2755 if (!sgs->group_capacity)
2756 sgs->group_capacity = fix_small_capacity(sd, group);
2757 sgs->group_weight = group->group_weight;
2758
2759 if (sgs->group_capacity > sgs->sum_nr_running)
2760 sgs->group_has_capacity = 1;
2761}
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
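/**
 * update_sd_pick_busiest - return 1 on busiest group
 * @sd: sched_domain whose statistics are to be checked
 * @sds: sched_domain statistics
 * @sg: sched_group candidate to be checked for being the busiest
 * @sgs: sched_group statistics
 * @this_cpu: the current cpu
 *
 * Determine if @sg is a busier group than the previously selected
 * busiest group.
 */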
2774static bool update_sd_pick_busiest(struct sched_domain *sd,
2775 struct sd_lb_stats *sds,
2776 struct sched_group *sg,
2777 struct sg_lb_stats *sgs,
2778 int this_cpu)
2779{
2780 if (sgs->avg_load <= sds->max_load)
2781 return false;
2782
2783 if (sgs->sum_nr_running > sgs->group_capacity)
2784 return true;
2785
2786 if (sgs->group_imb)
2787 return true;
2788
2789
2790
2791
2792
2793
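	/*
	 * SD_ASYM_PACKING moves work towards the lowest-numbered CPUs: any
	 * non-empty group whose first CPU lies above this_cpu is a
	 * candidate busiest group, and the lowest-numbered such group is
	 * preferred.
	 */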
2794 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
2795 this_cpu < group_first_cpu(sg)) {
2796 if (!sds->busiest)
2797 return true;
2798
2799 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
2800 return true;
2801 }
2802
2803 return false;
2804}
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
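/**
 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
 * @sd: sched_domain whose statistics are to be updated.
 * @this_cpu: Cpu for which load balance is currently performed.
 * @idle: Idle status of this_cpu.
 * @cpus: Set of cpus considered for load balancing.
 * @balance: Should we balance.
 * @sds: variable to hold the statistics for this sched_domain.
 */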
2815static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2816 enum cpu_idle_type idle, const struct cpumask *cpus,
2817 int *balance, struct sd_lb_stats *sds)
2818{
2819 struct sched_domain *child = sd->child;
2820 struct sched_group *sg = sd->groups;
2821 struct sg_lb_stats sgs;
2822 int load_idx, prefer_sibling = 0;
2823
2824 if (child && child->flags & SD_PREFER_SIBLING)
2825 prefer_sibling = 1;
2826
2827 init_sd_power_savings_stats(sd, sds, idle);
2828 load_idx = get_sd_load_idx(sd, idle);
2829
2830 do {
2831 int local_group;
2832
2833 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2834 memset(&sgs, 0, sizeof(sgs));
2835 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
2836 local_group, cpus, balance, &sgs);
2837
2838 if (local_group && !(*balance))
2839 return;
2840
2841 sds->total_load += sgs.group_load;
2842 sds->total_pwr += sg->cpu_power;
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
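		/*
		 * If the child domain prefers tasks to go to siblings
		 * first, cap each non-local group's capacity at one task
		 * so the excess spills over, but only while the local
		 * group still has room to take it.
		 */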
2854 if (prefer_sibling && !local_group && sds->this_has_capacity)
2855 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2856
2857 if (local_group) {
2858 sds->this_load = sgs.avg_load;
2859 sds->this = sg;
2860 sds->this_nr_running = sgs.sum_nr_running;
2861 sds->this_load_per_task = sgs.sum_weighted_load;
2862 sds->this_has_capacity = sgs.group_has_capacity;
2863 sds->this_idle_cpus = sgs.idle_cpus;
2864 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2865 sds->max_load = sgs.avg_load;
2866 sds->busiest = sg;
2867 sds->busiest_nr_running = sgs.sum_nr_running;
2868 sds->busiest_idle_cpus = sgs.idle_cpus;
2869 sds->busiest_group_capacity = sgs.group_capacity;
2870 sds->busiest_load_per_task = sgs.sum_weighted_load;
2871 sds->busiest_has_capacity = sgs.group_has_capacity;
2872 sds->busiest_group_weight = sgs.group_weight;
2873 sds->group_imb = sgs.group_imb;
2874 }
2875
2876 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
2877 sg = sg->next;
2878 } while (sg != sd->groups);
2879}
2880
2881int __weak arch_sd_sibling_asym_packing(void)
2882{
2883	return 0 * SD_ASYM_PACKING;
2884}
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
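/**
 * check_asym_packing - check for packing on SD_ASYM_PACKING domains.
 *
 * If a busiest group was found and this_cpu is numbered below that
 * group's first CPU, set *imbalance to the busiest group's load
 * (de-scaled by its cpu_power) and return 1 so the load gets packed
 * onto this lower-numbered CPU; otherwise return 0.
 */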
2911static int check_asym_packing(struct sched_domain *sd,
2912 struct sd_lb_stats *sds,
2913 int this_cpu, unsigned long *imbalance)
2914{
2915 int busiest_cpu;
2916
2917 if (!(sd->flags & SD_ASYM_PACKING))
2918 return 0;
2919
2920 if (!sds->busiest)
2921 return 0;
2922
2923 busiest_cpu = group_first_cpu(sds->busiest);
2924 if (this_cpu > busiest_cpu)
2925 return 0;
2926
2927 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
2928 SCHED_LOAD_SCALE);
2929 return 1;
2930}
2931
2932
2933
2934
2935
2936
2937
2938
2939
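/**
 * fix_small_imbalance - Calculate the minor imbalance that exists
 *			amongst the groups of a sched_domain, during
 *			load balancing.
 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
 * @imbalance: Variable to store the imbalance.
 */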
2940static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2941 int this_cpu, unsigned long *imbalance)
2942{
2943 unsigned long tmp, pwr_now = 0, pwr_move = 0;
2944 unsigned int imbn = 2;
2945 unsigned long scaled_busy_load_per_task;
2946
2947 if (sds->this_nr_running) {
2948 sds->this_load_per_task /= sds->this_nr_running;
2949 if (sds->busiest_load_per_task >
2950 sds->this_load_per_task)
2951 imbn = 1;
2952 } else
2953 sds->this_load_per_task =
2954 cpu_avg_load_per_task(this_cpu);
2955
2956 scaled_busy_load_per_task = sds->busiest_load_per_task
2957 * SCHED_LOAD_SCALE;
2958 scaled_busy_load_per_task /= sds->busiest->cpu_power;
2959
2960 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2961 (scaled_busy_load_per_task * imbn)) {
2962 *imbalance = sds->busiest_load_per_task;
2963 return;
2964 }
2965
2966
2967
2968
2969
2970
2971
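	/*
	 * Moving a whole task would overshoot the remaining imbalance.
	 * Estimate the total useful cpu power before and after such a
	 * move, and only set the imbalance to one task's load if the
	 * move would still be a net win.
	 */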
2972 pwr_now += sds->busiest->cpu_power *
2973 min(sds->busiest_load_per_task, sds->max_load);
2974 pwr_now += sds->this->cpu_power *
2975 min(sds->this_load_per_task, sds->this_load);
2976 pwr_now /= SCHED_LOAD_SCALE;
2977
2978
2979 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2980 sds->busiest->cpu_power;
2981 if (sds->max_load > tmp)
2982 pwr_move += sds->busiest->cpu_power *
2983 min(sds->busiest_load_per_task, sds->max_load - tmp);
2984
2985
2986 if (sds->max_load * sds->busiest->cpu_power <
2987 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
2988 tmp = (sds->max_load * sds->busiest->cpu_power) /
2989 sds->this->cpu_power;
2990 else
2991 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2992 sds->this->cpu_power;
2993 pwr_move += sds->this->cpu_power *
2994 min(sds->this_load_per_task, sds->this_load + tmp);
2995 pwr_move /= SCHED_LOAD_SCALE;
2996
2997
2998 if (pwr_move > pwr_now)
2999 *imbalance = sds->busiest_load_per_task;
3000}
3001
3002
3003
3004
3005
3006
3007
3008
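/**
 * calculate_imbalance - Calculate the amount of imbalance present within the
 *			 groups of a given sched_domain during load balance.
 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 * @this_cpu: Cpu for which currently load balance is being performed.
 * @imbalance: The variable to store the imbalance.
 */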
3009static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3010 unsigned long *imbalance)
3011{
3012 unsigned long max_pull, load_above_capacity = ~0UL;
3013
3014 sds->busiest_load_per_task /= sds->busiest_nr_running;
3015 if (sds->group_imb) {
3016 sds->busiest_load_per_task =
3017 min(sds->busiest_load_per_task, sds->avg_load);
3018 }
3019
3020
3021
3022
3023
3024
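	/*
	 * If the busiest group's load has fallen below the domain average
	 * there is nothing meaningful to pull in bulk; let
	 * fix_small_imbalance() decide whether moving a single task helps.
	 */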
3025 if (sds->max_load < sds->avg_load) {
3026 *imbalance = 0;
3027 return fix_small_imbalance(sds, this_cpu, imbalance);
3028 }
3029
3030 if (!sds->group_imb) {
3031
3032
3033
3034 load_above_capacity = (sds->busiest_nr_running -
3035 sds->busiest_group_capacity);
3036
3037 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
3038
3039 load_above_capacity /= sds->busiest->cpu_power;
3040 }
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
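	/*
	 * Pull no more than the smaller of the busiest group's excess over
	 * the domain average and the load sitting above that group's
	 * capacity; pulling more would only push the local group above
	 * the average in turn.
	 */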
3052 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
3053
3054
3055 *imbalance = min(max_pull * sds->busiest->cpu_power,
3056 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3057 / SCHED_LOAD_SCALE;
3058
3059
3060
3061
3062
3063
3064
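	/*
	 * If the computed imbalance is smaller than the average load of a
	 * single task on the busiest queue, a whole task might not be
	 * movable; let fix_small_imbalance() make the final call.
	 */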
3065 if (*imbalance < sds->busiest_load_per_task)
3066 return fix_small_imbalance(sds, this_cpu, imbalance);
3067
3068}
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
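/**
 * find_busiest_group - Returns the busiest group within the sched_domain
 *			if there is an imbalance.
 * @sd: The sched_domain whose busiest group is to be returned.
 * @this_cpu: The cpu for which load balancing is currently being performed.
 * @imbalance: Variable which stores amount of weighted load which should
 *		be moved to restore balance.
 * @idle: The idle status of this_cpu.
 * @cpus: The set of CPUs under consideration for load-balancing.
 * @balance: Pointer to a variable indicating if this_cpu
 *	is the appropriate cpu to perform load balancing at this level.
 *
 * Returns the busiest group if an imbalance exists, otherwise NULL with
 * *imbalance set to zero.
 */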
3096static struct sched_group *
3097find_busiest_group(struct sched_domain *sd, int this_cpu,
3098 unsigned long *imbalance, enum cpu_idle_type idle,
3099 const struct cpumask *cpus, int *balance)
3100{
3101 struct sd_lb_stats sds;
3102
3103 memset(&sds, 0, sizeof(sds));
3104
3105
3106
3107
3108
3109 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
3110
3111
3112
3113
3114
3115 if (!(*balance))
3116 goto ret;
3117
3118 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
3119 check_asym_packing(sd, &sds, this_cpu, imbalance))
3120 return sds.busiest;
3121
3122
3123 if (!sds.busiest || sds.busiest_nr_running == 0)
3124 goto out_balanced;
3125
3126 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3127
3128
3129
3130
3131
3132
3133 if (sds.group_imb)
3134 goto force_balance;
3135
3136
3137 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
3138 !sds.busiest_has_capacity)
3139 goto force_balance;
3140
3141
3142
3143
3144
3145 if (sds.this_load >= sds.max_load)
3146 goto out_balanced;
3147
3148
3149
3150
3151
3152 if (sds.this_load >= sds.avg_load)
3153 goto out_balanced;
3154
3155 if (idle == CPU_IDLE) {
3156
3157
3158
3159
3160
3161
3162 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
3163 sds.busiest_nr_running <= sds.busiest_group_weight)
3164 goto out_balanced;
3165 } else {
3166
3167
3168
3169
3170 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3171 goto out_balanced;
3172 }
3173
3174force_balance:
3175
3176 calculate_imbalance(&sds, this_cpu, imbalance);
3177 return sds.busiest;
3178
3179out_balanced:
3180
3181
3182
3183
3184 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
3185 return sds.busiest;
3186ret:
3187 *imbalance = 0;
3188 return NULL;
3189}
3190
3191
3192
3193
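/*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */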
3194static struct rq *
3195find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3196 enum cpu_idle_type idle, unsigned long imbalance,
3197 const struct cpumask *cpus)
3198{
3199 struct rq *busiest = NULL, *rq;
3200 unsigned long max_load = 0;
3201 int i;
3202
3203 for_each_cpu(i, sched_group_cpus(group)) {
3204 unsigned long power = power_of(i);
3205 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
3206 unsigned long wl;
3207
3208 if (!capacity)
3209 capacity = fix_small_capacity(sd, group);
3210
3211 if (!cpumask_test_cpu(i, cpus))
3212 continue;
3213
3214 rq = cpu_rq(i);
3215 wl = weighted_cpuload(i);
3216
3217
3218
3219
3220
3221 if (capacity && rq->nr_running == 1 && wl > imbalance)
3222 continue;
3223
3224
3225
3226
3227
3228
3229
3230 wl = (wl * SCHED_LOAD_SCALE) / power;
3231
3232 if (wl > max_load) {
3233 max_load = wl;
3234 busiest = rq;
3235 }
3236 }
3237
3238 return busiest;
3239}
3240
3241
3242
3243
3244
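/* Cap on sd->balance_interval growth when the busiest rq's tasks are all pinned */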
3245#define MAX_PINNED_INTERVAL 512
3246
3247
3248static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3249
3250static int need_active_balance(struct sched_domain *sd, int idle,
3251 int busiest_cpu, int this_cpu)
3252{
3253 if (idle == CPU_NEWLY_IDLE) {
3254
3255
3256
3257
3258
3259
3260 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
3261 return 1;
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
3283 return 0;
3284 }
3285
3286 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
3287}
3288
3289static int active_load_balance_cpu_stop(void *data);
3290
3291
3292
3293
3294
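/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */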
3295static int load_balance(int this_cpu, struct rq *this_rq,
3296 struct sched_domain *sd, enum cpu_idle_type idle,
3297 int *balance)
3298{
3299 int ld_moved, all_pinned = 0, active_balance = 0;
3300 struct sched_group *group;
3301 unsigned long imbalance;
3302 struct rq *busiest;
3303 unsigned long flags;
3304 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3305
3306 cpumask_copy(cpus, cpu_active_mask);
3307
3308 schedstat_inc(sd, lb_count[idle]);
3309
3310redo:
3311 group = find_busiest_group(sd, this_cpu, &imbalance, idle,
3312 cpus, balance);
3313
3314 if (*balance == 0)
3315 goto out_balanced;
3316
3317 if (!group) {
3318 schedstat_inc(sd, lb_nobusyg[idle]);
3319 goto out_balanced;
3320 }
3321
3322 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
3323 if (!busiest) {
3324 schedstat_inc(sd, lb_nobusyq[idle]);
3325 goto out_balanced;
3326 }
3327
3328 BUG_ON(busiest == this_rq);
3329
3330 schedstat_add(sd, lb_imbalance[idle], imbalance);
3331
3332 ld_moved = 0;
3333 if (busiest->nr_running > 1) {
3334
3335
3336
3337
3338
3339
3340 all_pinned = 1;
3341 local_irq_save(flags);
3342 double_rq_lock(this_rq, busiest);
3343 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3344 imbalance, sd, idle, &all_pinned);
3345 double_rq_unlock(this_rq, busiest);
3346 local_irq_restore(flags);
3347
3348
3349
3350
3351 if (ld_moved && this_cpu != smp_processor_id())
3352 resched_cpu(this_cpu);
3353
3354
3355 if (unlikely(all_pinned)) {
3356 cpumask_clear_cpu(cpu_of(busiest), cpus);
3357 if (!cpumask_empty(cpus))
3358 goto redo;
3359 goto out_balanced;
3360 }
3361 }
3362
3363 if (!ld_moved) {
3364 schedstat_inc(sd, lb_failed[idle]);
3365
3366
3367
3368
3369
3370
3371 if (idle != CPU_NEWLY_IDLE)
3372 sd->nr_balance_failed++;
3373
3374 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
3375 raw_spin_lock_irqsave(&busiest->lock, flags);
3376
3377
3378
3379
3380
3381 if (!cpumask_test_cpu(this_cpu,
3382 &busiest->curr->cpus_allowed)) {
3383 raw_spin_unlock_irqrestore(&busiest->lock,
3384 flags);
3385 all_pinned = 1;
3386 goto out_one_pinned;
3387 }
3388
3389
3390
3391
3392
3393
3394 if (!busiest->active_balance) {
3395 busiest->active_balance = 1;
3396 busiest->push_cpu = this_cpu;
3397 active_balance = 1;
3398 }
3399 raw_spin_unlock_irqrestore(&busiest->lock, flags);
3400
3401 if (active_balance)
3402 stop_one_cpu_nowait(cpu_of(busiest),
3403 active_load_balance_cpu_stop, busiest,
3404 &busiest->active_balance_work);
3405
3406
3407
3408
3409
3410 sd->nr_balance_failed = sd->cache_nice_tries+1;
3411 }
3412 } else
3413 sd->nr_balance_failed = 0;
3414
3415 if (likely(!active_balance)) {
3416
3417 sd->balance_interval = sd->min_interval;
3418 } else {
3419
3420
3421
3422
3423
3424
3425 if (sd->balance_interval < sd->max_interval)
3426 sd->balance_interval *= 2;
3427 }
3428
3429 goto out;
3430
3431out_balanced:
3432 schedstat_inc(sd, lb_balanced[idle]);
3433
3434 sd->nr_balance_failed = 0;
3435
3436out_one_pinned:
3437
3438 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3439 (sd->balance_interval < sd->max_interval))
3440 sd->balance_interval *= 2;
3441
3442 ld_moved = 0;
3443out:
3444 return ld_moved;
3445}
3446
3447
3448
3449
3450
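/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */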
3451static void idle_balance(int this_cpu, struct rq *this_rq)
3452{
3453 struct sched_domain *sd;
3454 int pulled_task = 0;
3455 unsigned long next_balance = jiffies + HZ;
3456
3457 this_rq->idle_stamp = this_rq->clock;
3458
3459 if (this_rq->avg_idle < sysctl_sched_migration_cost)
3460 return;
3461
3462
3463
3464
3465 raw_spin_unlock(&this_rq->lock);
3466
3467 update_shares(this_cpu);
3468 for_each_domain(this_cpu, sd) {
3469 unsigned long interval;
3470 int balance = 1;
3471
3472 if (!(sd->flags & SD_LOAD_BALANCE))
3473 continue;
3474
3475 if (sd->flags & SD_BALANCE_NEWIDLE) {
3476
3477 pulled_task = load_balance(this_cpu, this_rq,
3478 sd, CPU_NEWLY_IDLE, &balance);
3479 }
3480
3481 interval = msecs_to_jiffies(sd->balance_interval);
3482 if (time_after(next_balance, sd->last_balance + interval))
3483 next_balance = sd->last_balance + interval;
3484 if (pulled_task) {
3485 this_rq->idle_stamp = 0;
3486 break;
3487 }
3488 }
3489
3490 raw_spin_lock(&this_rq->lock);
3491
3492 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3493
3494
3495
3496
3497 this_rq->next_balance = next_balance;
3498 }
3499}
3500
3501
3502
3503
3504
3505
3506
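/*
 * active_load_balance_cpu_stop is run by the CPU stopper on the busiest
 * CPU.  It pushes a currently running task from the busiest runqueue to
 * the idle CPU that requested it (busiest_rq->push_cpu), then clears the
 * active_balance flag.
 */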
3507static int active_load_balance_cpu_stop(void *data)
3508{
3509 struct rq *busiest_rq = data;
3510 int busiest_cpu = cpu_of(busiest_rq);
3511 int target_cpu = busiest_rq->push_cpu;
3512 struct rq *target_rq = cpu_rq(target_cpu);
3513 struct sched_domain *sd;
3514
3515 raw_spin_lock_irq(&busiest_rq->lock);
3516
3517
3518 if (unlikely(busiest_cpu != smp_processor_id() ||
3519 !busiest_rq->active_balance))
3520 goto out_unlock;
3521
3522
3523 if (busiest_rq->nr_running <= 1)
3524 goto out_unlock;
3525
3526
3527
3528
3529
3530
3531 BUG_ON(busiest_rq == target_rq);
3532
3533
3534 double_lock_balance(busiest_rq, target_rq);
3535
3536
3537 for_each_domain(target_cpu, sd) {
3538 if ((sd->flags & SD_LOAD_BALANCE) &&
3539 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3540 break;
3541 }
3542
3543 if (likely(sd)) {
3544 schedstat_inc(sd, alb_count);
3545
3546 if (move_one_task(target_rq, target_cpu, busiest_rq,
3547 sd, CPU_IDLE))
3548 schedstat_inc(sd, alb_pushed);
3549 else
3550 schedstat_inc(sd, alb_failed);
3551 }
3552 double_unlock_balance(busiest_rq, target_rq);
3553out_unlock:
3554 busiest_rq->active_balance = 0;
3555 raw_spin_unlock_irq(&busiest_rq->lock);
3556 return 0;
3557}
3558
3559#ifdef CONFIG_NO_HZ
3560
3561static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3562
3563static void trigger_sched_softirq(void *data)
3564{
3565 raise_softirq_irqoff(SCHED_SOFTIRQ);
3566}
3567
3568static inline void init_sched_softirq_csd(struct call_single_data *csd)
3569{
3570 csd->func = trigger_sched_softirq;
3571 csd->info = NULL;
3572 csd->flags = 0;
3573 csd->priv = 0;
3574}
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
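/*
 * NO_HZ idle load-balancing state:
 *  - load_balancer: CPU nominated to balance on behalf of all tickless
 *    idle CPUs (nr_cpu_ids when nobody is nominated)
 *  - first_pick_cpu/second_pick_cpu: busy CPUs tracked by
 *    nohz_kick_needed() to decide when a kick is required
 *  - idle_cpus_mask: CPUs currently in tickless idle
 *  - grp_idle_mask: scratch mask for the semi-idle group search
 *  - next_balance: earliest time idle balancing is due again
 */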
3586static struct {
3587 atomic_t load_balancer;
3588 atomic_t first_pick_cpu;
3589 atomic_t second_pick_cpu;
3590 cpumask_var_t idle_cpus_mask;
3591 cpumask_var_t grp_idle_mask;
3592 unsigned long next_balance;
3593} nohz ____cacheline_aligned;
3594
3595int get_nohz_load_balancer(void)
3596{
3597 return atomic_read(&nohz.load_balancer);
3598}
3599
3600#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3601
3602
3603
3604
3605
3606
3607
3608
3609
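/**
 * lowest_flag_domain - Return lowest sched_domain containing flag.
 * @cpu:	The cpu whose lowest level of sched domain is to
 *		be returned.
 * @flag:	The flag to check for the lowest sched_domain
 *		for the given cpu.
 *
 * Returns the lowest sched_domain of a cpu which contains the given flag.
 */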
3610static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3611{
3612 struct sched_domain *sd;
3613
3614 for_each_domain(cpu, sd)
3615 if (sd && (sd->flags & flag))
3616 break;
3617
3618 return sd;
3619}
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631#define for_each_flag_domain(cpu, sd, flag) \
3632 for (sd = lowest_flag_domain(cpu, flag); \
3633 (sd && (sd->flags & flag)); sd = sd->parent)
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
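/*
 * is_semi_idle_group - return 1 if the given sched_group contains both
 * idle and busy CPUs (recording its idle CPUs in nohz.grp_idle_mask),
 * 0 if it is fully idle or fully busy.
 */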
3645static inline int is_semi_idle_group(struct sched_group *ilb_group)
3646{
3647 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
3648 sched_group_cpus(ilb_group));
3649
3650
3651
3652
3653
3654 if (cpumask_empty(nohz.grp_idle_mask))
3655 return 0;
3656
3657 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
3658 return 0;
3659
3660 return 1;
3661}
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
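/*
 * find_new_ilb - with power-savings balancing enabled, try to hand the
 * idle load balancer role to an idle CPU inside a semi-idle sched_group,
 * so that completely idle groups are left undisturbed.  Returns
 * nr_cpu_ids when no better candidate exists.
 */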
3674static int find_new_ilb(int cpu)
3675{
3676 struct sched_domain *sd;
3677 struct sched_group *ilb_group;
3678
3679
3680
3681
3682
3683 if (!(sched_smt_power_savings || sched_mc_power_savings))
3684 goto out_done;
3685
3686
3687
3688
3689
3690 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3691 goto out_done;
3692
3693 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3694 ilb_group = sd->groups;
3695
3696 do {
3697 if (is_semi_idle_group(ilb_group))
3698 return cpumask_first(nohz.grp_idle_mask);
3699
3700 ilb_group = ilb_group->next;
3701
3702 } while (ilb_group != sd->groups);
3703 }
3704
3705out_done:
3706 return nr_cpu_ids;
3707}
3708#else
3709static inline int find_new_ilb(int call_cpu)
3710{
3711 return nr_cpu_ids;
3712}
3713#endif
3714
3715
3716
3717
3718
3719
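/*
 * Kick the current idle load balancer (or, failing that, the first
 * tickless idle CPU) by sending it a call-function IPI that raises
 * SCHED_SOFTIRQ, so it runs nohz_idle_balance() on behalf of all the
 * idle CPUs.
 */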
3720static void nohz_balancer_kick(int cpu)
3721{
3722 int ilb_cpu;
3723
3724 nohz.next_balance++;
3725
3726 ilb_cpu = get_nohz_load_balancer();
3727
3728 if (ilb_cpu >= nr_cpu_ids) {
3729 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
3730 if (ilb_cpu >= nr_cpu_ids)
3731 return;
3732 }
3733
3734 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
3735 struct call_single_data *cp;
3736
3737 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
3738 cp = &per_cpu(remote_sched_softirq_cb, cpu);
3739 __smp_call_function_single(ilb_cpu, cp, 0);
3740 }
3741 return;
3742}
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
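/*
 * select_nohz_load_balancer - with @stop_tick set, the calling CPU is
 * going tickless idle: it is added to nohz.idle_cpus_mask and takes over
 * the idle load balancer role if nobody holds it (handing it on to a
 * better power-savings candidate when one exists).  With @stop_tick
 * clear, the CPU leaves tickless idle and drops the role.
 */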
3757void select_nohz_load_balancer(int stop_tick)
3758{
3759 int cpu = smp_processor_id();
3760
3761 if (stop_tick) {
3762 if (!cpu_active(cpu)) {
3763 if (atomic_read(&nohz.load_balancer) != cpu)
3764 return;
3765
3766
3767
3768
3769
3770 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3771 nr_cpu_ids) != cpu)
3772 BUG();
3773
3774 return;
3775 }
3776
3777 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
3778
3779 if (atomic_read(&nohz.first_pick_cpu) == cpu)
3780 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
3781 if (atomic_read(&nohz.second_pick_cpu) == cpu)
3782 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3783
3784 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
3785 int new_ilb;
3786
3787
3788 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
3789 cpu) != nr_cpu_ids)
3790 return;
3791
3792
3793
3794
3795
3796 new_ilb = find_new_ilb(cpu);
3797 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3798 atomic_set(&nohz.load_balancer, nr_cpu_ids);
3799 resched_cpu(new_ilb);
3800 return;
3801 }
3802 return;
3803 }
3804 } else {
3805 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
3806 return;
3807
3808 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
3809
3810 if (atomic_read(&nohz.load_balancer) == cpu)
3811 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3812 nr_cpu_ids) != cpu)
3813 BUG();
3814 }
3815 return;
3816}
3817#endif
3818
3819static DEFINE_SPINLOCK(balancing);
3820
3821static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3822
3823
3824
3825
3826
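/*
 * Recompute the clamp applied to per-domain balance intervals; it scales
 * with the number of online CPUs.
 */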
3827static void update_max_interval(void)
3828{
3829	max_load_balance_interval = HZ * num_online_cpus() / 10;
3830}
3831
3832
3833
3834
3835
3836
3837
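/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 */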
3838static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3839{
3840 int balance = 1;
3841 struct rq *rq = cpu_rq(cpu);
3842 unsigned long interval;
3843 struct sched_domain *sd;
3844
3845 unsigned long next_balance = jiffies + 60*HZ;
3846 int update_next_balance = 0;
3847 int need_serialize;
3848
3849 update_shares(cpu);
3850
3851 for_each_domain(cpu, sd) {
3852 if (!(sd->flags & SD_LOAD_BALANCE))
3853 continue;
3854
3855 interval = sd->balance_interval;
3856 if (idle != CPU_IDLE)
3857 interval *= sd->busy_factor;
3858
3859
3860 interval = msecs_to_jiffies(interval);
3861 interval = clamp(interval, 1UL, max_load_balance_interval);
3862
3863 need_serialize = sd->flags & SD_SERIALIZE;
3864
3865 if (need_serialize) {
3866 if (!spin_trylock(&balancing))
3867 goto out;
3868 }
3869
3870 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3871 if (load_balance(cpu, rq, sd, idle, &balance)) {
3872
3873
3874
3875
3876 idle = CPU_NOT_IDLE;
3877 }
3878 sd->last_balance = jiffies;
3879 }
3880 if (need_serialize)
3881 spin_unlock(&balancing);
3882out:
3883 if (time_after(next_balance, sd->last_balance + interval)) {
3884 next_balance = sd->last_balance + interval;
3885 update_next_balance = 1;
3886 }
3887
3888
3889
3890
3891
3892
3893 if (!balance)
3894 break;
3895 }
3896
3897
3898
3899
3900
3901
3902 if (likely(update_next_balance))
3903 rq->next_balance = next_balance;
3904}
3905
3906#ifdef CONFIG_NO_HZ
3907
3908
3909
3910
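/*
 * In the CONFIG_NO_HZ case, the idle load balancer (this_cpu) rebalances
 * on behalf of every tickless idle CPU in nohz.idle_cpus_mask, aborting
 * if this_cpu itself gets work to do.
 */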
3911static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
3912{
3913 struct rq *this_rq = cpu_rq(this_cpu);
3914 struct rq *rq;
3915 int balance_cpu;
3916
3917 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
3918 return;
3919
3920 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
3921 if (balance_cpu == this_cpu)
3922 continue;
3923
3924
3925
3926
3927
3928
3929 if (need_resched()) {
3930 this_rq->nohz_balance_kick = 0;
3931 break;
3932 }
3933
3934 raw_spin_lock_irq(&this_rq->lock);
3935 update_rq_clock(this_rq);
3936 update_cpu_load(this_rq);
3937 raw_spin_unlock_irq(&this_rq->lock);
3938
3939 rebalance_domains(balance_cpu, CPU_IDLE);
3940
3941 rq = cpu_rq(balance_cpu);
3942 if (time_after(this_rq->next_balance, rq->next_balance))
3943 this_rq->next_balance = rq->next_balance;
3944 }
3945 nohz.next_balance = this_rq->next_balance;
3946 this_rq->nohz_balance_kick = 0;
3947}
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
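/*
 * Heuristic, evaluated on busy CPUs, for kicking the nohz idle load
 * balancer: the first busy CPU (first_pick_cpu) kicks once it runs more
 * than one task; a second, distinct busy CPU (second_pick_cpu) kicks as
 * soon as it has any load.  Returns 1 when nohz_balancer_kick() should
 * be called.
 */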
3961static inline int nohz_kick_needed(struct rq *rq, int cpu)
3962{
3963 unsigned long now = jiffies;
3964 int ret;
3965 int first_pick_cpu, second_pick_cpu;
3966
3967 if (time_before(now, nohz.next_balance))
3968 return 0;
3969
3970 if (rq->idle_at_tick)
3971 return 0;
3972
3973 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
3974 second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
3975
3976 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
3977 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
3978 return 0;
3979
3980 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
3981 if (ret == nr_cpu_ids || ret == cpu) {
3982 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3983 if (rq->nr_running > 1)
3984 return 1;
3985 } else {
3986 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
3987 if (ret == nr_cpu_ids || ret == cpu) {
3988 if (rq->nr_running)
3989 return 1;
3990 }
3991 }
3992 return 0;
3993}
3994#else
3995static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
3996#endif
3997
3998
3999
4000
4001
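/*
 * SCHED_SOFTIRQ handler: rebalance this CPU's domains and, if this CPU
 * was kicked as the nohz idle load balancer, balance on behalf of the
 * other tickless idle CPUs as well.
 */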
4002static void run_rebalance_domains(struct softirq_action *h)
4003{
4004 int this_cpu = smp_processor_id();
4005 struct rq *this_rq = cpu_rq(this_cpu);
4006 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4007 CPU_IDLE : CPU_NOT_IDLE;
4008
4009 rebalance_domains(this_cpu, idle);
4010
4011
4012
4013
4014
4015
4016 nohz_idle_balance(this_cpu, idle);
4017}
4018
4019static inline int on_null_domain(int cpu)
4020{
4021 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
4022}
4023
4024
4025
4026
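/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 */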
4027static inline void trigger_load_balance(struct rq *rq, int cpu)
4028{
4029
4030 if (time_after_eq(jiffies, rq->next_balance) &&
4031 likely(!on_null_domain(cpu)))
4032 raise_softirq(SCHED_SOFTIRQ);
4033#ifdef CONFIG_NO_HZ
4034 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
4035 nohz_balancer_kick(cpu);
4036#endif
4037}
4038
4039static void rq_online_fair(struct rq *rq)
4040{
4041 update_sysctl();
4042}
4043
4044static void rq_offline_fair(struct rq *rq)
4045{
4046 update_sysctl();
4047}
4048
4049#else
4050
4051
4052
4053
4054static inline void idle_balance(int cpu, struct rq *rq)
4055{
4056}
4057
4058#endif
4059
4060
4061
4062
4063static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4064{
4065 struct cfs_rq *cfs_rq;
4066 struct sched_entity *se = &curr->se;
4067
4068 for_each_sched_entity(se) {
4069 cfs_rq = cfs_rq_of(se);
4070 entity_tick(cfs_rq, se, queued);
4071 }
4072}
4073
4074
4075
4076
4077
4078
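/*
 * Called at fork time with the new child as argument: give the child a
 * vruntime next to its parent's and, when sysctl_sched_child_runs_first
 * is set, make sure the child will run before the parent.
 */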
4079static void task_fork_fair(struct task_struct *p)
4080{
4081 struct cfs_rq *cfs_rq = task_cfs_rq(current);
4082 struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
4083 int this_cpu = smp_processor_id();
4084 struct rq *rq = this_rq();
4085 unsigned long flags;
4086
4087 raw_spin_lock_irqsave(&rq->lock, flags);
4088
4089 update_rq_clock(rq);
4090
4091 if (unlikely(task_cpu(p) != this_cpu)) {
4092 rcu_read_lock();
4093 __set_task_cpu(p, this_cpu);
4094 rcu_read_unlock();
4095 }
4096
4097 update_curr(cfs_rq);
4098
4099 if (curr)
4100 se->vruntime = curr->vruntime;
4101 place_entity(cfs_rq, se, 1);
4102
4103 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
4104
4105
4106
4107
4108 swap(curr->vruntime, se->vruntime);
4109 resched_task(rq->curr);
4110 }
4111
4112 se->vruntime -= cfs_rq->min_vruntime;
4113
4114 raw_spin_unlock_irqrestore(&rq->lock, flags);
4115}
4116
4117
4118
4119
4120
4121static void
4122prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
4123{
4124 if (!p->se.on_rq)
4125 return;
4126
4127
4128
4129
4130
4131
4132 if (rq->curr == p) {
4133 if (p->prio > oldprio)
4134 resched_task(rq->curr);
4135 } else
4136 check_preempt_curr(rq, p, 0);
4137}
4138
4139static void switched_from_fair(struct rq *rq, struct task_struct *p)
4140{
4141 struct sched_entity *se = &p->se;
4142 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153 if (!se->on_rq && p->state != TASK_RUNNING) {
4154
4155
4156
4157
4158 place_entity(cfs_rq, se, 0);
4159 se->vruntime -= cfs_rq->min_vruntime;
4160 }
4161}
4162
4163
4164
4165
4166static void switched_to_fair(struct rq *rq, struct task_struct *p)
4167{
4168 if (!p->se.on_rq)
4169 return;
4170
4171
4172
4173
4174
4175
4176 if (rq->curr == p)
4177 resched_task(rq->curr);
4178 else
4179 check_preempt_curr(rq, p, 0);
4180}
4181
4182
4183
4184
4185
4186
4187static void set_curr_task_fair(struct rq *rq)
4188{
4189 struct sched_entity *se = &rq->curr->se;
4190
4191 for_each_sched_entity(se)
4192 set_next_entity(cfs_rq_of(se), se);
4193}
4194
4195#ifdef CONFIG_FAIR_GROUP_SCHED
4196static void task_move_group_fair(struct task_struct *p, int on_rq)
4197{
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211 if (!on_rq)
4212 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
4213 set_task_rq(p, task_cpu(p));
4214 if (!on_rq)
4215 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
4216}
4217#endif
4218
4219static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
4220{
4221 struct sched_entity *se = &task->se;
4222 unsigned int rr_interval = 0;
4223
4224
4225
4226
4227
4228 if (rq->cfs.load.weight)
4229 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
4230
4231 return rr_interval;
4232}
4233
4234
4235
4236
4237static const struct sched_class fair_sched_class = {
4238 .next = &idle_sched_class,
4239 .enqueue_task = enqueue_task_fair,
4240 .dequeue_task = dequeue_task_fair,
4241 .yield_task = yield_task_fair,
4242 .yield_to_task = yield_to_task_fair,
4243
4244 .check_preempt_curr = check_preempt_wakeup,
4245
4246 .pick_next_task = pick_next_task_fair,
4247 .put_prev_task = put_prev_task_fair,
4248
4249#ifdef CONFIG_SMP
4250 .select_task_rq = select_task_rq_fair,
4251
4252 .rq_online = rq_online_fair,
4253 .rq_offline = rq_offline_fair,
4254
4255 .task_waking = task_waking_fair,
4256#endif
4257
4258 .set_curr_task = set_curr_task_fair,
4259 .task_tick = task_tick_fair,
4260 .task_fork = task_fork_fair,
4261
4262 .prio_changed = prio_changed_fair,
4263 .switched_from = switched_from_fair,
4264 .switched_to = switched_to_fair,
4265
4266 .get_rr_interval = get_rr_interval_fair,
4267
4268#ifdef CONFIG_FAIR_GROUP_SCHED
4269 .task_move_group = task_move_group_fair,
4270#endif
4271};
4272
4273#ifdef CONFIG_SCHED_DEBUG
4274static void print_cfs_stats(struct seq_file *m, int cpu)
4275{
4276 struct cfs_rq *cfs_rq;
4277
4278 rcu_read_lock();
4279 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
4280 print_cfs_rq(m, cpu, cfs_rq);
4281 rcu_read_unlock();
4282}
4283#endif
4284