/*
 *  kernel/sched.c
 *
 *  Core kernel CPU scheduler: per-CPU runqueues, load weights, group
 *  scheduling, real-time bandwidth control, the task wakeup path and the
 *  load balancer.  The class-specific policies are pulled in further down
 *  from sched_idletask.c, sched_fair.c and sched_rt.c.
 */
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h>
37#include <linux/interrupt.h>
38#include <linux/capability.h>
39#include <linux/completion.h>
40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h>
42#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
45#include <linux/freezer.h>
46#include <linux/vmalloc.h>
47#include <linux/blkdev.h>
48#include <linux/delay.h>
49#include <linux/pid_namespace.h>
50#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
57#include <linux/kthread.h>
58#include <linux/proc_fs.h>
59#include <linux/seq_file.h>
60#include <linux/sysctl.h>
61#include <linux/syscalls.h>
62#include <linux/times.h>
63#include <linux/tsacct_kern.h>
64#include <linux/kprobes.h>
65#include <linux/delayacct.h>
66#include <linux/reciprocal_div.h>
67#include <linux/unistd.h>
68#include <linux/pagemap.h>
69#include <linux/hrtimer.h>
70#include <linux/tick.h>
71#include <linux/bootmem.h>
72#include <linux/debugfs.h>
73#include <linux/ctype.h>
74#include <linux/ftrace.h>
75#include <trace/sched.h>
76
77#include <asm/tlb.h>
78#include <asm/irq_regs.h>
79
80#include "sched_cpupri.h"
81
82
83
84
85
86
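/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */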
87#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
88#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
89#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
90
91
92
93
94
95
96#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
97#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
98#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
99
100
101
102
103#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
104
105#define NICE_0_LOAD SCHED_LOAD_SCALE
106#define NICE_0_SHIFT SCHED_LOAD_SHIFT
107
108
109
110
111
112
113
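/*
 * Default timeslice is 100 msecs, expressed in jiffies
 * (used only for SCHED_RR tasks).
 */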
114#define DEF_TIMESLICE (100 * HZ / 1000)
115
116
117
118
119#define RUNTIME_INF ((u64)~0ULL)
120
121DEFINE_TRACE(sched_wait_task);
122DEFINE_TRACE(sched_wakeup);
123DEFINE_TRACE(sched_wakeup_new);
124DEFINE_TRACE(sched_switch);
125DEFINE_TRACE(sched_migrate_task);
126
127#ifdef CONFIG_SMP
128
129static void double_rq_lock(struct rq *rq1, struct rq *rq2);
130
131
132
133
134
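/*
 * Divide a load by a sched group cpu_power: (load / sg->__cpu_power).
 * The cached reciprocal (sg->reciprocal_cpu_power) turns the division
 * into a multiply and shift on the hot path; sg_inc_cpu_power() below
 * keeps the reciprocal in sync whenever __cpu_power changes.
 */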
135static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
136{
137 return reciprocal_divide(load, sg->reciprocal_cpu_power);
138}
139
140
141
142
143
144static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
145{
146 sg->__cpu_power += val;
147 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
148}
149#endif
150
151static inline int rt_policy(int policy)
152{
153 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
154 return 1;
155 return 0;
156}
157
158static inline int task_has_rt_policy(struct task_struct *p)
159{
160 return rt_policy(p->policy);
161}
162
163
164
165
166struct rt_prio_array {
167 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);
168 struct list_head queue[MAX_RT_PRIO];
169};
170
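/*
 * Bandwidth control for the real-time scheduling classes: at most
 * rt_runtime nanoseconds of RT execution are allowed in each rt_period.
 * rt_period_timer periodically replenishes the budget; rt_runtime_lock
 * protects the fields.
 */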
171struct rt_bandwidth {
172
173 spinlock_t rt_runtime_lock;
174 ktime_t rt_period;
175 u64 rt_runtime;
176 struct hrtimer rt_period_timer;
177};
178
179static struct rt_bandwidth def_rt_bandwidth;
180
181static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
182
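/*
 * Per-period replenishment timer: forward the timer by rt_period for
 * every elapsed period and hand the overrun count to
 * do_sched_rt_period_timer(); stop rearming once the last replenishment
 * reports that everything is idle.
 */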
183static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
184{
185 struct rt_bandwidth *rt_b =
186 container_of(timer, struct rt_bandwidth, rt_period_timer);
187 ktime_t now;
188 int overrun;
189 int idle = 0;
190
191 for (;;) {
192 now = hrtimer_cb_get_time(timer);
193 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
194
195 if (!overrun)
196 break;
197
198 idle = do_sched_rt_period_timer(rt_b, overrun);
199 }
200
201 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
202}
203
204static
205void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
206{
207 rt_b->rt_period = ns_to_ktime(period);
208 rt_b->rt_runtime = runtime;
209
210 spin_lock_init(&rt_b->rt_runtime_lock);
211
212 hrtimer_init(&rt_b->rt_period_timer,
213 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
214 rt_b->rt_period_timer.function = sched_rt_period_timer;
215}
216
217static inline int rt_bandwidth_enabled(void)
218{
219 return sysctl_sched_rt_runtime >= 0;
220}
221
222static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
223{
224 ktime_t now;
225
226 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
227 return;
228
229 if (hrtimer_active(&rt_b->rt_period_timer))
230 return;
231
232 spin_lock(&rt_b->rt_runtime_lock);
233 for (;;) {
234 unsigned long delta;
235 ktime_t soft, hard;
236
237 if (hrtimer_active(&rt_b->rt_period_timer))
238 break;
239
240 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
241 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
242
243 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
244 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
245 delta = ktime_to_ns(ktime_sub(hard, soft));
246 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
247 HRTIMER_MODE_ABS, 0);
248 }
249 spin_unlock(&rt_b->rt_runtime_lock);
250}
251
252#ifdef CONFIG_RT_GROUP_SCHED
253static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
254{
255 hrtimer_cancel(&rt_b->rt_period_timer);
256}
257#endif
258
259
260
261
262
263static DEFINE_MUTEX(sched_domains_mutex);
264
265#ifdef CONFIG_GROUP_SCHED
266
267#include <linux/cgroup.h>
268
269struct cfs_rq;
270
271static LIST_HEAD(task_groups);
272
273
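/*
 * Task group: one scheduling entity and one runqueue per CPU for each
 * class the group participates in (CFS and/or RT), the group's CFS
 * shares and RT bandwidth, and parent/children links that form the
 * group hierarchy.
 */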
274struct task_group {
275#ifdef CONFIG_CGROUP_SCHED
276 struct cgroup_subsys_state css;
277#endif
278
279#ifdef CONFIG_USER_SCHED
280 uid_t uid;
281#endif
282
283#ifdef CONFIG_FAIR_GROUP_SCHED
284
285 struct sched_entity **se;
286
287 struct cfs_rq **cfs_rq;
288 unsigned long shares;
289#endif
290
291#ifdef CONFIG_RT_GROUP_SCHED
292 struct sched_rt_entity **rt_se;
293 struct rt_rq **rt_rq;
294
295 struct rt_bandwidth rt_bandwidth;
296#endif
297
298 struct rcu_head rcu;
299 struct list_head list;
300
301 struct task_group *parent;
302 struct list_head siblings;
303 struct list_head children;
304};
305
306#ifdef CONFIG_USER_SCHED
307
308
309void set_tg_uid(struct user_struct *user)
310{
311 user->tg->uid = user->uid;
312}
313
314
315
316
317
318
319struct task_group root_task_group;
320
321#ifdef CONFIG_FAIR_GROUP_SCHED
322
323static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
324
325static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
326#endif
327
328#ifdef CONFIG_RT_GROUP_SCHED
329static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
330static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
331#endif
332#else
333#define root_task_group init_task_group
334#endif
335
336
337
338
339static DEFINE_SPINLOCK(task_group_lock);
340
341#ifdef CONFIG_SMP
342static int root_task_group_empty(void)
343{
344 return list_empty(&root_task_group.children);
345}
346#endif
347
348#ifdef CONFIG_FAIR_GROUP_SCHED
349#ifdef CONFIG_USER_SCHED
350# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
351#else
352# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
353#endif
354
355
356
357
358
359
360
361
362
363#define MIN_SHARES 2
364#define MAX_SHARES (1UL << 18)
365
366static int init_task_group_load = INIT_TASK_GROUP_LOAD;
367#endif
368
369
370
371
372struct task_group init_task_group;
373
374
375static inline struct task_group *task_group(struct task_struct *p)
376{
377 struct task_group *tg;
378
379#ifdef CONFIG_USER_SCHED
380 rcu_read_lock();
381 tg = __task_cred(p)->user->tg;
382 rcu_read_unlock();
383#elif defined(CONFIG_CGROUP_SCHED)
384 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
385 struct task_group, css);
386#else
387 tg = &init_task_group;
388#endif
389 return tg;
390}
391
392
393static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
394{
395#ifdef CONFIG_FAIR_GROUP_SCHED
396 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
397 p->se.parent = task_group(p)->se[cpu];
398#endif
399
400#ifdef CONFIG_RT_GROUP_SCHED
401 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
402 p->rt.parent = task_group(p)->rt_se[cpu];
403#endif
404}
405
406#else
407
408#ifdef CONFIG_SMP
409static int root_task_group_empty(void)
410{
411 return 1;
412}
413#endif
414
415static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
416static inline struct task_group *task_group(struct task_struct *p)
417{
418 return NULL;
419}
420
421#endif
422
423
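/*
 * CFS-related fields in a runqueue: 'load' is the total weight of all
 * queued entities, 'tasks_timeline' is a red-black tree of entities
 * ordered by vruntime with 'rb_leftmost' caching the leftmost (next to
 * run) node, and 'min_vruntime' tracks the monotonically increasing
 * minimum vruntime of the queue.
 */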
424struct cfs_rq {
425 struct load_weight load;
426 unsigned long nr_running;
427
428 u64 exec_clock;
429 u64 min_vruntime;
430
431 struct rb_root tasks_timeline;
432 struct rb_node *rb_leftmost;
433
434 struct list_head tasks;
435 struct list_head *balance_iterator;
436
437
438
439
440
441 struct sched_entity *curr, *next, *last;
442
443 unsigned int nr_spread_over;
444
445#ifdef CONFIG_FAIR_GROUP_SCHED
446 struct rq *rq;
447
448
449
450
451
452
453
454
455
456 struct list_head leaf_cfs_rq_list;
457 struct task_group *tg;
458
459#ifdef CONFIG_SMP
460
461
462
463 unsigned long task_weight;
464
465
466
467
468
469
470
471 unsigned long h_load;
472
473
474
475
476 unsigned long shares;
477
478
479
480
481 unsigned long rq_weight;
482#endif
483#endif
484};
485
486
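/*
 * Real-time classes' related fields in a runqueue: 'active' is a
 * priority array (a bitmap plus one list per RT priority),
 * 'highest_prio' caches the best queued priority for the RT push/pull
 * logic, and rt_time versus rt_runtime implements RT throttling.
 */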
487struct rt_rq {
488 struct rt_prio_array active;
489 unsigned long rt_nr_running;
490#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
491 struct {
492 int curr;
493#ifdef CONFIG_SMP
494 int next;
495#endif
496 } highest_prio;
497#endif
498#ifdef CONFIG_SMP
499 unsigned long rt_nr_migratory;
500 int overloaded;
501 struct plist_head pushable_tasks;
502#endif
503 int rt_throttled;
504 u64 rt_time;
505 u64 rt_runtime;
506
507 spinlock_t rt_runtime_lock;
508
509#ifdef CONFIG_RT_GROUP_SCHED
510 unsigned long rt_nr_boosted;
511
512 struct rq *rq;
513 struct list_head leaf_rt_rq_list;
514 struct task_group *tg;
515 struct sched_rt_entity *rt_se;
516#endif
517};
518
519#ifdef CONFIG_SMP
520
521
522
523
524
525
526
527
528
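/*
 * Root domain: one per exclusive cpuset partition of the system.
 * 'span' and 'online' cover the member CPUs; 'rto_mask'/'rto_count'
 * track CPUs that are overloaded with runnable real-time tasks, and
 * 'cpupri' lets the RT balancer quickly find a CPU running at a lower
 * priority.
 */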
529struct root_domain {
530 atomic_t refcount;
531 cpumask_var_t span;
532 cpumask_var_t online;
533
534
535
536
537
538 cpumask_var_t rto_mask;
539 atomic_t rto_count;
540#ifdef CONFIG_SMP
541 struct cpupri cpupri;
542#endif
543#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
544
545
546
547
548
549 unsigned int sched_mc_preferred_wakeup_cpu;
550#endif
551};
552
553
554
555
556
557static struct root_domain def_root_domain;
558
559#endif
560
561
562
563
564
565
566
567
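/*
 * This is the main, per-CPU runqueue data structure. 'lock' protects it,
 * 'nr_running' counts runnable tasks, and 'cpu_load[]' keeps a set of
 * progressively slower-decaying load averages used by the load balancer.
 * The class-specific runqueues (cfs, rt) are embedded here.
 */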
568struct rq {
569
570 spinlock_t lock;
571
572
573
574
575
576 unsigned long nr_running;
577 #define CPU_LOAD_IDX_MAX 5
578 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
579#ifdef CONFIG_NO_HZ
580 unsigned long last_tick_seen;
581 unsigned char in_nohz_recently;
582#endif
583
584 struct load_weight load;
585 unsigned long nr_load_updates;
586 u64 nr_switches;
587
588 struct cfs_rq cfs;
589 struct rt_rq rt;
590
591#ifdef CONFIG_FAIR_GROUP_SCHED
592
593 struct list_head leaf_cfs_rq_list;
594#endif
595#ifdef CONFIG_RT_GROUP_SCHED
596 struct list_head leaf_rt_rq_list;
597#endif
598
599
600
601
602
603
604
605 unsigned long nr_uninterruptible;
606
607 struct task_struct *curr, *idle;
608 unsigned long next_balance;
609 struct mm_struct *prev_mm;
610
611 u64 clock;
612
613 atomic_t nr_iowait;
614
615#ifdef CONFIG_SMP
616 struct root_domain *rd;
617 struct sched_domain *sd;
618
619 unsigned char idle_at_tick;
620
621 int active_balance;
622 int push_cpu;
623
624 int cpu;
625 int online;
626
627 unsigned long avg_load_per_task;
628
629 struct task_struct *migration_thread;
630 struct list_head migration_queue;
631#endif
632
633#ifdef CONFIG_SCHED_HRTICK
634#ifdef CONFIG_SMP
635 int hrtick_csd_pending;
636 struct call_single_data hrtick_csd;
637#endif
638 struct hrtimer hrtick_timer;
639#endif
640
641#ifdef CONFIG_SCHEDSTATS
642
643 struct sched_info rq_sched_info;
644 unsigned long long rq_cpu_time;
645
646
647
648 unsigned int yld_count;
649
650
651 unsigned int sched_switch;
652 unsigned int sched_count;
653 unsigned int sched_goidle;
654
655
656 unsigned int ttwu_count;
657 unsigned int ttwu_local;
658
659
660 unsigned int bkl_count;
661#endif
662};
663
664static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
665
666static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
667{
668 rq->curr->sched_class->check_preempt_curr(rq, p, sync);
669}
670
671static inline int cpu_of(struct rq *rq)
672{
673#ifdef CONFIG_SMP
674 return rq->cpu;
675#else
676 return 0;
677#endif
678}
679
680
681
682
683
684
685
686
687#define for_each_domain(cpu, __sd) \
688 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
689
690#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
691#define this_rq() (&__get_cpu_var(runqueues))
692#define task_rq(p) cpu_rq(task_cpu(p))
693#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
694
695static inline void update_rq_clock(struct rq *rq)
696{
697 rq->clock = sched_clock_cpu(cpu_of(rq));
698}
699
700
701
702
703#ifdef CONFIG_SCHED_DEBUG
704# define const_debug __read_mostly
705#else
706# define const_debug static const
707#endif
708
709
710
711
712
713
714
715
716int runqueue_is_locked(void)
717{
718 int cpu = get_cpu();
719 struct rq *rq = cpu_rq(cpu);
720 int ret;
721
722 ret = spin_is_locked(&rq->lock);
723 put_cpu();
724 return ret;
725}
726
727
728
729
730
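/*
 * sched_features.h is included several times with different SCHED_FEAT()
 * definitions: first to build the __SCHED_FEAT_* enum, then to compute
 * the default sysctl_sched_features bitmask, and (with SCHED_DEBUG) once
 * more for the feature name strings behind the "sched_features" debugfs
 * file.
 */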
731#define SCHED_FEAT(name, enabled) \
732 __SCHED_FEAT_##name ,
733
734enum {
735#include "sched_features.h"
736};
737
738#undef SCHED_FEAT
739
740#define SCHED_FEAT(name, enabled) \
741 (1UL << __SCHED_FEAT_##name) * enabled |
742
743const_debug unsigned int sysctl_sched_features =
744#include "sched_features.h"
745 0;
746
747#undef SCHED_FEAT
748
749#ifdef CONFIG_SCHED_DEBUG
750#define SCHED_FEAT(name, enabled) \
751 #name ,
752
753static __read_mostly char *sched_feat_names[] = {
754#include "sched_features.h"
755 NULL
756};
757
758#undef SCHED_FEAT
759
760static int sched_feat_show(struct seq_file *m, void *v)
761{
762 int i;
763
764 for (i = 0; sched_feat_names[i]; i++) {
765 if (!(sysctl_sched_features & (1UL << i)))
766 seq_puts(m, "NO_");
767 seq_printf(m, "%s ", sched_feat_names[i]);
768 }
769 seq_puts(m, "\n");
770
771 return 0;
772}
773
774static ssize_t
775sched_feat_write(struct file *filp, const char __user *ubuf,
776 size_t cnt, loff_t *ppos)
777{
778 char buf[64];
779 char *cmp = buf;
780 int neg = 0;
781 int i;
782
783 if (cnt > 63)
784 cnt = 63;
785
786 if (copy_from_user(&buf, ubuf, cnt))
787 return -EFAULT;
788
789 buf[cnt] = 0;
790
791 if (strncmp(buf, "NO_", 3) == 0) {
792 neg = 1;
793 cmp += 3;
794 }
795
796 for (i = 0; sched_feat_names[i]; i++) {
797 int len = strlen(sched_feat_names[i]);
798
799 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
800 if (neg)
801 sysctl_sched_features &= ~(1UL << i);
802 else
803 sysctl_sched_features |= (1UL << i);
804 break;
805 }
806 }
807
808 if (!sched_feat_names[i])
809 return -EINVAL;
810
811 filp->f_pos += cnt;
812
813 return cnt;
814}
815
816static int sched_feat_open(struct inode *inode, struct file *filp)
817{
818 return single_open(filp, sched_feat_show, NULL);
819}
820
821static struct file_operations sched_feat_fops = {
822 .open = sched_feat_open,
823 .write = sched_feat_write,
824 .read = seq_read,
825 .llseek = seq_lseek,
826 .release = single_release,
827};
828
829static __init int sched_init_debug(void)
830{
831 debugfs_create_file("sched_features", 0644, NULL, NULL,
832 &sched_feat_fops);
833
834 return 0;
835}
836late_initcall(sched_init_debug);
837
838#endif
839
840#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
841
842
843
844
845
846const_debug unsigned int sysctl_sched_nr_migrate = 32;
847
848
849
850
851
852unsigned int sysctl_sched_shares_ratelimit = 250000;
853
854
855
856
857
858
859unsigned int sysctl_sched_shares_thresh = 4;
860
861
862
863
864
865unsigned int sysctl_sched_rt_period = 1000000;
866
867static __read_mostly int scheduler_running;
868
869
870
871
872
873int sysctl_sched_rt_runtime = 950000;
874
875static inline u64 global_rt_period(void)
876{
877 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
878}
879
880static inline u64 global_rt_runtime(void)
881{
882 if (sysctl_sched_rt_runtime < 0)
883 return RUNTIME_INF;
884
885 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
886}
887
888#ifndef prepare_arch_switch
889# define prepare_arch_switch(next) do { } while (0)
890#endif
891#ifndef finish_arch_switch
892# define finish_arch_switch(prev) do { } while (0)
893#endif
894
895static inline int task_current(struct rq *rq, struct task_struct *p)
896{
897 return rq->curr == p;
898}
899
900#ifndef __ARCH_WANT_UNLOCKED_CTXSW
901static inline int task_running(struct rq *rq, struct task_struct *p)
902{
903 return task_current(rq, p);
904}
905
906static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
907{
908}
909
910static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
911{
912#ifdef CONFIG_DEBUG_SPINLOCK
913
914 rq->lock.owner = current;
915#endif
916
917
918
919
920
921 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
922
923 spin_unlock_irq(&rq->lock);
924}
925
926#else
927static inline int task_running(struct rq *rq, struct task_struct *p)
928{
929#ifdef CONFIG_SMP
930 return p->oncpu;
931#else
932 return task_current(rq, p);
933#endif
934}
935
936static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
937{
938#ifdef CONFIG_SMP
939
940
941
942
943
944 next->oncpu = 1;
945#endif
946#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
947 spin_unlock_irq(&rq->lock);
948#else
949 spin_unlock(&rq->lock);
950#endif
951}
952
953static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
954{
955#ifdef CONFIG_SMP
956
957
958
959
960
961 smp_wmb();
962 prev->oncpu = 0;
963#endif
964#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
965 local_irq_enable();
966#endif
967}
968#endif
969
970
971
972
973
974static inline struct rq *__task_rq_lock(struct task_struct *p)
975 __acquires(rq->lock)
976{
977 for (;;) {
978 struct rq *rq = task_rq(p);
979 spin_lock(&rq->lock);
980 if (likely(rq == task_rq(p)))
981 return rq;
982 spin_unlock(&rq->lock);
983 }
984}
985
986
987
988
989
990
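/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. The task can migrate between looking up task_rq() and
 * taking the lock, so re-check and retry until the two agree.
 */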
991static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
992 __acquires(rq->lock)
993{
994 struct rq *rq;
995
996 for (;;) {
997 local_irq_save(*flags);
998 rq = task_rq(p);
999 spin_lock(&rq->lock);
1000 if (likely(rq == task_rq(p)))
1001 return rq;
1002 spin_unlock_irqrestore(&rq->lock, *flags);
1003 }
1004}
1005
1006void task_rq_unlock_wait(struct task_struct *p)
1007{
1008 struct rq *rq = task_rq(p);
1009
1010 smp_mb();
1011 spin_unlock_wait(&rq->lock);
1012}
1013
1014static void __task_rq_unlock(struct rq *rq)
1015 __releases(rq->lock)
1016{
1017 spin_unlock(&rq->lock);
1018}
1019
1020static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
1021 __releases(rq->lock)
1022{
1023 spin_unlock_irqrestore(&rq->lock, *flags);
1024}
1025
1026
1027
1028
1029static struct rq *this_rq_lock(void)
1030 __acquires(rq->lock)
1031{
1032 struct rq *rq;
1033
1034 local_irq_disable();
1035 rq = this_rq();
1036 spin_lock(&rq->lock);
1037
1038 return rq;
1039}
1040
1041#ifdef CONFIG_SCHED_HRTICK
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058static inline int hrtick_enabled(struct rq *rq)
1059{
1060 if (!sched_feat(HRTICK))
1061 return 0;
1062 if (!cpu_active(cpu_of(rq)))
1063 return 0;
1064 return hrtimer_is_hres_active(&rq->hrtick_timer);
1065}
1066
1067static void hrtick_clear(struct rq *rq)
1068{
1069 if (hrtimer_active(&rq->hrtick_timer))
1070 hrtimer_cancel(&rq->hrtick_timer);
1071}
1072
1073
1074
1075
1076
1077static enum hrtimer_restart hrtick(struct hrtimer *timer)
1078{
1079 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1080
1081 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1082
1083 spin_lock(&rq->lock);
1084 update_rq_clock(rq);
1085 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1086 spin_unlock(&rq->lock);
1087
1088 return HRTIMER_NORESTART;
1089}
1090
1091#ifdef CONFIG_SMP
1092
1093
1094
1095static void __hrtick_start(void *arg)
1096{
1097 struct rq *rq = arg;
1098
1099 spin_lock(&rq->lock);
1100 hrtimer_restart(&rq->hrtick_timer);
1101 rq->hrtick_csd_pending = 0;
1102 spin_unlock(&rq->lock);
1103}
1104
1105
1106
1107
1108
1109
1110static void hrtick_start(struct rq *rq, u64 delay)
1111{
1112 struct hrtimer *timer = &rq->hrtick_timer;
1113 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1114
1115 hrtimer_set_expires(timer, time);
1116
1117 if (rq == this_rq()) {
1118 hrtimer_restart(timer);
1119 } else if (!rq->hrtick_csd_pending) {
1120 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1121 rq->hrtick_csd_pending = 1;
1122 }
1123}
1124
1125static int
1126hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1127{
1128 int cpu = (int)(long)hcpu;
1129
1130 switch (action) {
1131 case CPU_UP_CANCELED:
1132 case CPU_UP_CANCELED_FROZEN:
1133 case CPU_DOWN_PREPARE:
1134 case CPU_DOWN_PREPARE_FROZEN:
1135 case CPU_DEAD:
1136 case CPU_DEAD_FROZEN:
1137 hrtick_clear(cpu_rq(cpu));
1138 return NOTIFY_OK;
1139 }
1140
1141 return NOTIFY_DONE;
1142}
1143
1144static __init void init_hrtick(void)
1145{
1146 hotcpu_notifier(hotplug_hrtick, 0);
1147}
1148#else
1149
1150
1151
1152
1153
1154static void hrtick_start(struct rq *rq, u64 delay)
1155{
1156 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1157 HRTIMER_MODE_REL, 0);
1158}
1159
1160static inline void init_hrtick(void)
1161{
1162}
1163#endif
1164
1165static void init_rq_hrtick(struct rq *rq)
1166{
1167#ifdef CONFIG_SMP
1168 rq->hrtick_csd_pending = 0;
1169
1170 rq->hrtick_csd.flags = 0;
1171 rq->hrtick_csd.func = __hrtick_start;
1172 rq->hrtick_csd.info = rq;
1173#endif
1174
1175 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1176 rq->hrtick_timer.function = hrtick;
1177}
1178#else
1179static inline void hrtick_clear(struct rq *rq)
1180{
1181}
1182
1183static inline void init_rq_hrtick(struct rq *rq)
1184{
1185}
1186
1187static inline void init_hrtick(void)
1188{
1189}
1190#endif
1191
1192
1193
1194
1195
1196
1197
1198
1199#ifdef CONFIG_SMP
1200
1201#ifndef tsk_is_polling
1202#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1203#endif
1204
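/*
 * resched_task - mark a task 'to be rescheduled now'. On UP this just
 * sets TIF_NEED_RESCHED; on SMP it may also send a reschedule IPI to the
 * task's CPU, unless that CPU is polling need_resched and will notice
 * the flag by itself.
 */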
1205static void resched_task(struct task_struct *p)
1206{
1207 int cpu;
1208
1209 assert_spin_locked(&task_rq(p)->lock);
1210
1211 if (test_tsk_need_resched(p))
1212 return;
1213
1214 set_tsk_need_resched(p);
1215
1216 cpu = task_cpu(p);
1217 if (cpu == smp_processor_id())
1218 return;
1219
1220
1221 smp_mb();
1222 if (!tsk_is_polling(p))
1223 smp_send_reschedule(cpu);
1224}
1225
1226static void resched_cpu(int cpu)
1227{
1228 struct rq *rq = cpu_rq(cpu);
1229 unsigned long flags;
1230
1231 if (!spin_trylock_irqsave(&rq->lock, flags))
1232 return;
1233 resched_task(cpu_curr(cpu));
1234 spin_unlock_irqrestore(&rq->lock, flags);
1235}
1236
1237#ifdef CONFIG_NO_HZ
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248void wake_up_idle_cpu(int cpu)
1249{
1250 struct rq *rq = cpu_rq(cpu);
1251
1252 if (cpu == smp_processor_id())
1253 return;
1254
1255
1256
1257
1258
1259
1260
1261
1262 if (rq->curr != rq->idle)
1263 return;
1264
1265
1266
1267
1268
1269
1270 set_tsk_need_resched(rq->idle);
1271
1272
1273 smp_mb();
1274 if (!tsk_is_polling(rq->idle))
1275 smp_send_reschedule(cpu);
1276}
1277#endif
1278
1279#else
1280static void resched_task(struct task_struct *p)
1281{
1282 assert_spin_locked(&task_rq(p)->lock);
1283 set_tsk_need_resched(p);
1284}
1285#endif
1286
1287#if BITS_PER_LONG == 32
1288# define WMULT_CONST (~0UL)
1289#else
1290# define WMULT_CONST (1UL << 32)
1291#endif
1292
1293#define WMULT_SHIFT 32
1294
1295
1296
1297
1298#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1299
1300
1301
1302
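/*
 * delta_exec * weight / lw->weight, computed without a 64-bit division:
 * lw->inv_weight caches roughly 2^32 / lw->weight, so the quotient
 * becomes a multiply followed by a rounded (via SRR()) right shift by
 * WMULT_SHIFT.
 */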
1303static unsigned long
1304calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1305 struct load_weight *lw)
1306{
1307 u64 tmp;
1308
1309 if (!lw->inv_weight) {
1310 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1311 lw->inv_weight = 1;
1312 else
1313 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1314 / (lw->weight+1);
1315 }
1316
1317 tmp = (u64)delta_exec * weight;
1318
1319
1320
1321 if (unlikely(tmp > WMULT_CONST))
1322 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1323 WMULT_SHIFT/2);
1324 else
1325 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1326
1327 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1328}
1329
1330static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1331{
1332 lw->weight += inc;
1333 lw->inv_weight = 0;
1334}
1335
1336static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1337{
1338 lw->weight -= dec;
1339 lw->inv_weight = 0;
1340}
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351#define WEIGHT_IDLEPRIO 3
1352#define WMULT_IDLEPRIO 1431655765
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
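/*
 * Nice levels are multiplicative: adjacent levels are about 25% apart in
 * weight (1024 for nice 0), which works out to roughly a 10% CPU share
 * shift between two competing tasks per nice step. prio_to_wmult[]
 * below caches 2^32 / weight for use by calc_delta_mine().
 */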
1366static const int prio_to_weight[40] = {
1367 88761, 71755, 56483, 46273, 36291,
1368 29154, 23254, 18705, 14949, 11916,
1369 9548, 7620, 6100, 4904, 3906,
1370 3121, 2501, 1991, 1586, 1277,
1371 1024, 820, 655, 526, 423,
1372 335, 272, 215, 172, 137,
1373 110, 87, 70, 56, 45,
1374 36, 29, 23, 18, 15,
1375};
1376
1377
1378
1379
1380
1381
1382
1383
1384static const u32 prio_to_wmult[40] = {
1385 48388, 59856, 76040, 92818, 118348,
1386 147320, 184698, 229616, 287308, 360437,
1387 449829, 563644, 704093, 875809, 1099582,
1388 1376151, 1717300, 2157191, 2708050, 3363326,
1389 4194304, 5237765, 6557202, 8165337, 10153587,
1390 12820798, 15790321, 19976592, 24970740, 31350126,
1391 39045157, 49367440, 61356676, 76695844, 95443717,
1392 119304647, 148102320, 186737708, 238609294, 286331153,
1393};
1394
1395static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1396
1397
1398
1399
1400
1401
1402struct rq_iterator {
1403 void *arg;
1404 struct task_struct *(*start)(void *);
1405 struct task_struct *(*next)(void *);
1406};
1407
1408#ifdef CONFIG_SMP
1409static unsigned long
1410balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1411 unsigned long max_load_move, struct sched_domain *sd,
1412 enum cpu_idle_type idle, int *all_pinned,
1413 int *this_best_prio, struct rq_iterator *iterator);
1414
1415static int
1416iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1417 struct sched_domain *sd, enum cpu_idle_type idle,
1418 struct rq_iterator *iterator);
1419#endif
1420
1421
1422enum cpuacct_stat_index {
1423 CPUACCT_STAT_USER,
1424 CPUACCT_STAT_SYSTEM,
1425
1426 CPUACCT_STAT_NSTATS,
1427};
1428
1429#ifdef CONFIG_CGROUP_CPUACCT
1430static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1431static void cpuacct_update_stats(struct task_struct *tsk,
1432 enum cpuacct_stat_index idx, cputime_t val);
1433#else
1434static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1435static inline void cpuacct_update_stats(struct task_struct *tsk,
1436 enum cpuacct_stat_index idx, cputime_t val) {}
1437#endif
1438
1439static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1440{
1441 update_load_add(&rq->load, load);
1442}
1443
1444static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1445{
1446 update_load_sub(&rq->load, load);
1447}
1448
1449#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1450typedef int (*tg_visitor)(struct task_group *, void *);
1451
1452
1453
1454
1455
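/*
 * Iterate the full task_group tree, calling @down when first entering a
 * node and @up when leaving it for the final time. The walk runs under
 * rcu_read_lock() and aborts early if either callback returns non-zero.
 */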
1456static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1457{
1458 struct task_group *parent, *child;
1459 int ret;
1460
1461 rcu_read_lock();
1462 parent = &root_task_group;
1463down:
1464 ret = (*down)(parent, data);
1465 if (ret)
1466 goto out_unlock;
1467 list_for_each_entry_rcu(child, &parent->children, siblings) {
1468 parent = child;
1469 goto down;
1470
1471up:
1472 continue;
1473 }
1474 ret = (*up)(parent, data);
1475 if (ret)
1476 goto out_unlock;
1477
1478 child = parent;
1479 parent = parent->parent;
1480 if (parent)
1481 goto up;
1482out_unlock:
1483 rcu_read_unlock();
1484
1485 return ret;
1486}
1487
1488static int tg_nop(struct task_group *tg, void *data)
1489{
1490 return 0;
1491}
1492#endif
1493
1494#ifdef CONFIG_SMP
1495static unsigned long source_load(int cpu, int type);
1496static unsigned long target_load(int cpu, int type);
1497static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1498
1499static unsigned long cpu_avg_load_per_task(int cpu)
1500{
1501 struct rq *rq = cpu_rq(cpu);
1502 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1503
1504 if (nr_running)
1505 rq->avg_load_per_task = rq->load.weight / nr_running;
1506 else
1507 rq->avg_load_per_task = 0;
1508
1509 return rq->avg_load_per_task;
1510}
1511
1512#ifdef CONFIG_FAIR_GROUP_SCHED
1513
1514static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1515
1516
1517
1518
1519static void
1520update_group_shares_cpu(struct task_group *tg, int cpu,
1521 unsigned long sd_shares, unsigned long sd_rq_weight)
1522{
1523 unsigned long shares;
1524 unsigned long rq_weight;
1525
1526 if (!tg->se[cpu])
1527 return;
1528
1529 rq_weight = tg->cfs_rq[cpu]->rq_weight;
1530
1531
1532
1533
1534
1535
1536
1537 shares = (sd_shares * rq_weight) / sd_rq_weight;
1538 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1539
1540 if (abs(shares - tg->se[cpu]->load.weight) >
1541 sysctl_sched_shares_thresh) {
1542 struct rq *rq = cpu_rq(cpu);
1543 unsigned long flags;
1544
1545 spin_lock_irqsave(&rq->lock, flags);
1546 tg->cfs_rq[cpu]->shares = shares;
1547
1548 __set_se_shares(tg->se[cpu], shares);
1549 spin_unlock_irqrestore(&rq->lock, flags);
1550 }
1551}
1552
1553
1554
1555
1556
1557
1558static int tg_shares_up(struct task_group *tg, void *data)
1559{
1560 unsigned long weight, rq_weight = 0;
1561 unsigned long shares = 0;
1562 struct sched_domain *sd = data;
1563 int i;
1564
1565 for_each_cpu(i, sched_domain_span(sd)) {
1566
1567
1568
1569
1570
1571 weight = tg->cfs_rq[i]->load.weight;
1572 if (!weight)
1573 weight = NICE_0_LOAD;
1574
1575 tg->cfs_rq[i]->rq_weight = weight;
1576 rq_weight += weight;
1577 shares += tg->cfs_rq[i]->shares;
1578 }
1579
1580 if ((!shares && rq_weight) || shares > tg->shares)
1581 shares = tg->shares;
1582
1583 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1584 shares = tg->shares;
1585
1586 for_each_cpu(i, sched_domain_span(sd))
1587 update_group_shares_cpu(tg, i, shares, rq_weight);
1588
1589 return 0;
1590}
1591
1592
1593
1594
1595
1596
1597static int tg_load_down(struct task_group *tg, void *data)
1598{
1599 unsigned long load;
1600 long cpu = (long)data;
1601
1602 if (!tg->parent) {
1603 load = cpu_rq(cpu)->load.weight;
1604 } else {
1605 load = tg->parent->cfs_rq[cpu]->h_load;
1606 load *= tg->cfs_rq[cpu]->shares;
1607 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1608 }
1609
1610 tg->cfs_rq[cpu]->h_load = load;
1611
1612 return 0;
1613}
1614
1615static void update_shares(struct sched_domain *sd)
1616{
1617 u64 now = cpu_clock(raw_smp_processor_id());
1618 s64 elapsed = now - sd->last_update;
1619
1620 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1621 sd->last_update = now;
1622 walk_tg_tree(tg_nop, tg_shares_up, sd);
1623 }
1624}
1625
1626static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1627{
1628 spin_unlock(&rq->lock);
1629 update_shares(sd);
1630 spin_lock(&rq->lock);
1631}
1632
1633static void update_h_load(long cpu)
1634{
1635 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1636}
1637
1638#else
1639
1640static inline void update_shares(struct sched_domain *sd)
1641{
1642}
1643
1644static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1645{
1646}
1647
1648#endif
1649
1650#ifdef CONFIG_PREEMPT
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1661 __releases(this_rq->lock)
1662 __acquires(busiest->lock)
1663 __acquires(this_rq->lock)
1664{
1665 spin_unlock(&this_rq->lock);
1666 double_rq_lock(this_rq, busiest);
1667
1668 return 1;
1669}
1670
1671#else
1672
1673
1674
1675
1676
1677
1678
1679static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1680 __releases(this_rq->lock)
1681 __acquires(busiest->lock)
1682 __acquires(this_rq->lock)
1683{
1684 int ret = 0;
1685
1686 if (unlikely(!spin_trylock(&busiest->lock))) {
1687 if (busiest < this_rq) {
1688 spin_unlock(&this_rq->lock);
1689 spin_lock(&busiest->lock);
1690 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
1691 ret = 1;
1692 } else
1693 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
1694 }
1695 return ret;
1696}
1697
1698#endif
1699
1700
1701
1702
1703static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1704{
1705 if (unlikely(!irqs_disabled())) {
1706
1707 spin_unlock(&this_rq->lock);
1708 BUG_ON(1);
1709 }
1710
1711 return _double_lock_balance(this_rq, busiest);
1712}
1713
1714static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1715 __releases(busiest->lock)
1716{
1717 spin_unlock(&busiest->lock);
1718 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1719}
1720#endif
1721
1722#ifdef CONFIG_FAIR_GROUP_SCHED
1723static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1724{
1725#ifdef CONFIG_SMP
1726 cfs_rq->shares = shares;
1727#endif
1728}
1729#endif
1730
1731#include "sched_stats.h"
1732#include "sched_idletask.c"
1733#include "sched_fair.c"
1734#include "sched_rt.c"
1735#ifdef CONFIG_SCHED_DEBUG
1736# include "sched_debug.c"
1737#endif
1738
1739#define sched_class_highest (&rt_sched_class)
1740#define for_each_class(class) \
1741 for (class = sched_class_highest; class; class = class->next)
1742
1743static void inc_nr_running(struct rq *rq)
1744{
1745 rq->nr_running++;
1746}
1747
1748static void dec_nr_running(struct rq *rq)
1749{
1750 rq->nr_running--;
1751}
1752
1753static void set_load_weight(struct task_struct *p)
1754{
1755 if (task_has_rt_policy(p)) {
1756 p->se.load.weight = prio_to_weight[0] * 2;
1757 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1758 return;
1759 }
1760
1761
1762
1763
1764 if (p->policy == SCHED_IDLE) {
1765 p->se.load.weight = WEIGHT_IDLEPRIO;
1766 p->se.load.inv_weight = WMULT_IDLEPRIO;
1767 return;
1768 }
1769
1770 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1771 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1772}
1773
1774static void update_avg(u64 *avg, u64 sample)
1775{
1776 s64 diff = sample - *avg;
1777 *avg += diff >> 3;
1778}
1779
1780static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1781{
1782 if (wakeup)
1783 p->se.start_runtime = p->se.sum_exec_runtime;
1784
1785 sched_info_queued(p);
1786 p->sched_class->enqueue_task(rq, p, wakeup);
1787 p->se.on_rq = 1;
1788}
1789
1790static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1791{
1792 if (sleep) {
1793 if (p->se.last_wakeup) {
1794 update_avg(&p->se.avg_overlap,
1795 p->se.sum_exec_runtime - p->se.last_wakeup);
1796 p->se.last_wakeup = 0;
1797 } else {
1798 update_avg(&p->se.avg_wakeup,
1799 sysctl_sched_wakeup_granularity);
1800 }
1801 }
1802
1803 sched_info_dequeued(p);
1804 p->sched_class->dequeue_task(rq, p, sleep);
1805 p->se.on_rq = 0;
1806}
1807
1808
1809
1810
1811static inline int __normal_prio(struct task_struct *p)
1812{
1813 return p->static_prio;
1814}
1815
1816
1817
1818
1819
1820
1821
1822
1823static inline int normal_prio(struct task_struct *p)
1824{
1825 int prio;
1826
1827 if (task_has_rt_policy(p))
1828 prio = MAX_RT_PRIO-1 - p->rt_priority;
1829 else
1830 prio = __normal_prio(p);
1831 return prio;
1832}
1833
1834
1835
1836
1837
1838
1839
1840
1841static int effective_prio(struct task_struct *p)
1842{
1843 p->normal_prio = normal_prio(p);
1844
1845
1846
1847
1848
1849 if (!rt_prio(p->prio))
1850 return p->normal_prio;
1851 return p->prio;
1852}
1853
1854
1855
1856
1857static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1858{
1859 if (task_contributes_to_load(p))
1860 rq->nr_uninterruptible--;
1861
1862 enqueue_task(rq, p, wakeup);
1863 inc_nr_running(rq);
1864}
1865
1866
1867
1868
1869static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1870{
1871 if (task_contributes_to_load(p))
1872 rq->nr_uninterruptible++;
1873
1874 dequeue_task(rq, p, sleep);
1875 dec_nr_running(rq);
1876}
1877
1878
1879
1880
1881
1882inline int task_curr(const struct task_struct *p)
1883{
1884 return cpu_curr(task_cpu(p)) == p;
1885}
1886
1887static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1888{
1889 set_task_rq(p, cpu);
1890#ifdef CONFIG_SMP
1891
1892
1893
1894
1895
1896 smp_wmb();
1897 task_thread_info(p)->cpu = cpu;
1898#endif
1899}
1900
1901static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1902 const struct sched_class *prev_class,
1903 int oldprio, int running)
1904{
1905 if (prev_class != p->sched_class) {
1906 if (prev_class->switched_from)
1907 prev_class->switched_from(rq, p, running);
1908 p->sched_class->switched_to(rq, p, running);
1909 } else
1910 p->sched_class->prio_changed(rq, p, oldprio, running);
1911}
1912
1913#ifdef CONFIG_SMP
1914
1915
1916static unsigned long weighted_cpuload(const int cpu)
1917{
1918 return cpu_rq(cpu)->load.weight;
1919}
1920
1921
1922
1923
1924static int
1925task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1926{
1927 s64 delta;
1928
1929
1930
1931
1932 if (sched_feat(CACHE_HOT_BUDDY) &&
1933 (&p->se == cfs_rq_of(&p->se)->next ||
1934 &p->se == cfs_rq_of(&p->se)->last))
1935 return 1;
1936
1937 if (p->sched_class != &fair_sched_class)
1938 return 0;
1939
1940 if (sysctl_sched_migration_cost == -1)
1941 return 1;
1942 if (sysctl_sched_migration_cost == 0)
1943 return 0;
1944
1945 delta = now - p->se.exec_start;
1946
1947 return delta < (s64)sysctl_sched_migration_cost;
1948}
1949
1950
1951void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1952{
1953 int old_cpu = task_cpu(p);
1954 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1955 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
1956 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
1957 u64 clock_offset;
1958
1959 clock_offset = old_rq->clock - new_rq->clock;
1960
1961 trace_sched_migrate_task(p, task_cpu(p), new_cpu);
1962
1963#ifdef CONFIG_SCHEDSTATS
1964 if (p->se.wait_start)
1965 p->se.wait_start -= clock_offset;
1966 if (p->se.sleep_start)
1967 p->se.sleep_start -= clock_offset;
1968 if (p->se.block_start)
1969 p->se.block_start -= clock_offset;
1970 if (old_cpu != new_cpu) {
1971 schedstat_inc(p, se.nr_migrations);
1972 if (task_hot(p, old_rq->clock, NULL))
1973 schedstat_inc(p, se.nr_forced2_migrations);
1974 }
1975#endif
1976 p->se.vruntime -= old_cfsrq->min_vruntime -
1977 new_cfsrq->min_vruntime;
1978
1979 __set_task_cpu(p, new_cpu);
1980}
1981
1982struct migration_req {
1983 struct list_head list;
1984
1985 struct task_struct *task;
1986 int dest_cpu;
1987
1988 struct completion done;
1989};
1990
1991
1992
1993
1994
1995static int
1996migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1997{
1998 struct rq *rq = task_rq(p);
1999
2000
2001
2002
2003
2004 if (!p->se.on_rq && !task_running(rq, p)) {
2005 set_task_cpu(p, dest_cpu);
2006 return 0;
2007 }
2008
2009 init_completion(&req->done);
2010 req->task = p;
2011 req->dest_cpu = dest_cpu;
2012 list_add(&req->list, &rq->migration_queue);
2013
2014 return 1;
2015}
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
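/*
 * wait_task_inactive - wait for a thread to unschedule. If @match_state
 * is nonzero, it's the @p->state value just checked and not expected to
 * change. Spins while the task is still on a CPU, sleeps a jiffy while
 * it is merely queued, and returns a nonzero context-switch-count cookie
 * on success or 0 if the state no longer matched.
 */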
2033unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2034{
2035 unsigned long flags;
2036 int running, on_rq;
2037 unsigned long ncsw;
2038 struct rq *rq;
2039
2040 for (;;) {
2041
2042
2043
2044
2045
2046
2047 rq = task_rq(p);
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060 while (task_running(rq, p)) {
2061 if (match_state && unlikely(p->state != match_state))
2062 return 0;
2063 cpu_relax();
2064 }
2065
2066
2067
2068
2069
2070
2071 rq = task_rq_lock(p, &flags);
2072 trace_sched_wait_task(rq, p);
2073 running = task_running(rq, p);
2074 on_rq = p->se.on_rq;
2075 ncsw = 0;
2076 if (!match_state || p->state == match_state)
2077 ncsw = p->nvcsw | LONG_MIN;
2078 task_rq_unlock(rq, &flags);
2079
2080
2081
2082
2083 if (unlikely(!ncsw))
2084 break;
2085
2086
2087
2088
2089
2090
2091
2092 if (unlikely(running)) {
2093 cpu_relax();
2094 continue;
2095 }
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106 if (unlikely(on_rq)) {
2107 schedule_timeout_uninterruptible(1);
2108 continue;
2109 }
2110
2111
2112
2113
2114
2115
2116 break;
2117 }
2118
2119 return ncsw;
2120}
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135void kick_process(struct task_struct *p)
2136{
2137 int cpu;
2138
2139 preempt_disable();
2140 cpu = task_cpu(p);
2141 if ((cpu != smp_processor_id()) && task_curr(p))
2142 smp_send_reschedule(cpu);
2143 preempt_enable();
2144}
2145
2146
2147
2148
2149
2150
2151
2152
2153static unsigned long source_load(int cpu, int type)
2154{
2155 struct rq *rq = cpu_rq(cpu);
2156 unsigned long total = weighted_cpuload(cpu);
2157
2158 if (type == 0 || !sched_feat(LB_BIAS))
2159 return total;
2160
2161 return min(rq->cpu_load[type-1], total);
2162}
2163
2164
2165
2166
2167
2168static unsigned long target_load(int cpu, int type)
2169{
2170 struct rq *rq = cpu_rq(cpu);
2171 unsigned long total = weighted_cpuload(cpu);
2172
2173 if (type == 0 || !sched_feat(LB_BIAS))
2174 return total;
2175
2176 return max(rq->cpu_load[type-1], total);
2177}
2178
2179
2180
2181
2182
2183static struct sched_group *
2184find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2185{
2186 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2187 unsigned long min_load = ULONG_MAX, this_load = 0;
2188 int load_idx = sd->forkexec_idx;
2189 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2190
2191 do {
2192 unsigned long load, avg_load;
2193 int local_group;
2194 int i;
2195
2196
2197 if (!cpumask_intersects(sched_group_cpus(group),
2198 &p->cpus_allowed))
2199 continue;
2200
2201 local_group = cpumask_test_cpu(this_cpu,
2202 sched_group_cpus(group));
2203
2204
2205 avg_load = 0;
2206
2207 for_each_cpu(i, sched_group_cpus(group)) {
2208
2209 if (local_group)
2210 load = source_load(i, load_idx);
2211 else
2212 load = target_load(i, load_idx);
2213
2214 avg_load += load;
2215 }
2216
2217
2218 avg_load = sg_div_cpu_power(group,
2219 avg_load * SCHED_LOAD_SCALE);
2220
2221 if (local_group) {
2222 this_load = avg_load;
2223 this = group;
2224 } else if (avg_load < min_load) {
2225 min_load = avg_load;
2226 idlest = group;
2227 }
2228 } while (group = group->next, group != sd->groups);
2229
2230 if (!idlest || 100*this_load < imbalance*min_load)
2231 return NULL;
2232 return idlest;
2233}
2234
2235
2236
2237
2238static int
2239find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2240{
2241 unsigned long load, min_load = ULONG_MAX;
2242 int idlest = -1;
2243 int i;
2244
2245
2246 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2247 load = weighted_cpuload(i);
2248
2249 if (load < min_load || (load == min_load && i == this_cpu)) {
2250 min_load = load;
2251 idlest = i;
2252 }
2253 }
2254
2255 return idlest;
2256}
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269static int sched_balance_self(int cpu, int flag)
2270{
2271 struct task_struct *t = current;
2272 struct sched_domain *tmp, *sd = NULL;
2273
2274 for_each_domain(cpu, tmp) {
2275
2276
2277
2278 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2279 break;
2280 if (tmp->flags & flag)
2281 sd = tmp;
2282 }
2283
2284 if (sd)
2285 update_shares(sd);
2286
2287 while (sd) {
2288 struct sched_group *group;
2289 int new_cpu, weight;
2290
2291 if (!(sd->flags & flag)) {
2292 sd = sd->child;
2293 continue;
2294 }
2295
2296 group = find_idlest_group(sd, t, cpu);
2297 if (!group) {
2298 sd = sd->child;
2299 continue;
2300 }
2301
2302 new_cpu = find_idlest_cpu(group, t, cpu);
2303 if (new_cpu == -1 || new_cpu == cpu) {
2304
2305 sd = sd->child;
2306 continue;
2307 }
2308
2309
2310 cpu = new_cpu;
2311 weight = cpumask_weight(sched_domain_span(sd));
2312 sd = NULL;
2313 for_each_domain(cpu, tmp) {
2314 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2315 break;
2316 if (tmp->flags & flag)
2317 sd = tmp;
2318 }
2319
2320 }
2321
2322 return cpu;
2323}
2324
2325#endif
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
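/*
 * try_to_wake_up - wake up a thread. Returns 1 if the task was woken
 * here (its state was in @state and it was not already on a runqueue),
 * 0 otherwise. May move the task to another CPU via the scheduling
 * class's select_task_rq() before activating it, and checks whether the
 * woken task should preempt the current task on that CPU.
 */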
2341static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2342{
2343 int cpu, orig_cpu, this_cpu, success = 0;
2344 unsigned long flags;
2345 long old_state;
2346 struct rq *rq;
2347
2348 if (!sched_feat(SYNC_WAKEUPS))
2349 sync = 0;
2350
2351#ifdef CONFIG_SMP
2352 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2353 struct sched_domain *sd;
2354
2355 this_cpu = raw_smp_processor_id();
2356 cpu = task_cpu(p);
2357
2358 for_each_domain(this_cpu, sd) {
2359 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2360 update_shares(sd);
2361 break;
2362 }
2363 }
2364 }
2365#endif
2366
2367 smp_wmb();
2368 rq = task_rq_lock(p, &flags);
2369 update_rq_clock(rq);
2370 old_state = p->state;
2371 if (!(old_state & state))
2372 goto out;
2373
2374 if (p->se.on_rq)
2375 goto out_running;
2376
2377 cpu = task_cpu(p);
2378 orig_cpu = cpu;
2379 this_cpu = smp_processor_id();
2380
2381#ifdef CONFIG_SMP
2382 if (unlikely(task_running(rq, p)))
2383 goto out_activate;
2384
2385 cpu = p->sched_class->select_task_rq(p, sync);
2386 if (cpu != orig_cpu) {
2387 set_task_cpu(p, cpu);
2388 task_rq_unlock(rq, &flags);
2389
2390 rq = task_rq_lock(p, &flags);
2391 old_state = p->state;
2392 if (!(old_state & state))
2393 goto out;
2394 if (p->se.on_rq)
2395 goto out_running;
2396
2397 this_cpu = smp_processor_id();
2398 cpu = task_cpu(p);
2399 }
2400
2401#ifdef CONFIG_SCHEDSTATS
2402 schedstat_inc(rq, ttwu_count);
2403 if (cpu == this_cpu)
2404 schedstat_inc(rq, ttwu_local);
2405 else {
2406 struct sched_domain *sd;
2407 for_each_domain(this_cpu, sd) {
2408 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2409 schedstat_inc(sd, ttwu_wake_remote);
2410 break;
2411 }
2412 }
2413 }
2414#endif
2415
2416out_activate:
2417#endif
2418 schedstat_inc(p, se.nr_wakeups);
2419 if (sync)
2420 schedstat_inc(p, se.nr_wakeups_sync);
2421 if (orig_cpu != cpu)
2422 schedstat_inc(p, se.nr_wakeups_migrate);
2423 if (cpu == this_cpu)
2424 schedstat_inc(p, se.nr_wakeups_local);
2425 else
2426 schedstat_inc(p, se.nr_wakeups_remote);
2427 activate_task(rq, p, 1);
2428 success = 1;
2429
2430
2431
2432
2433 if (!in_interrupt()) {
2434 struct sched_entity *se = &current->se;
2435 u64 sample = se->sum_exec_runtime;
2436
2437 if (se->last_wakeup)
2438 sample -= se->last_wakeup;
2439 else
2440 sample -= se->start_runtime;
2441 update_avg(&se->avg_wakeup, sample);
2442
2443 se->last_wakeup = se->sum_exec_runtime;
2444 }
2445
2446out_running:
2447 trace_sched_wakeup(rq, p, success);
2448 check_preempt_curr(rq, p, sync);
2449
2450 p->state = TASK_RUNNING;
2451#ifdef CONFIG_SMP
2452 if (p->sched_class->task_wake_up)
2453 p->sched_class->task_wake_up(rq, p);
2454#endif
2455out:
2456 task_rq_unlock(rq, &flags);
2457
2458 return success;
2459}
2460
2461int wake_up_process(struct task_struct *p)
2462{
2463 return try_to_wake_up(p, TASK_ALL, 0);
2464}
2465EXPORT_SYMBOL(wake_up_process);
2466
2467int wake_up_state(struct task_struct *p, unsigned int state)
2468{
2469 return try_to_wake_up(p, state, 0);
2470}
2471
2472
2473
2474
2475
2476
2477
2478static void __sched_fork(struct task_struct *p)
2479{
2480 p->se.exec_start = 0;
2481 p->se.sum_exec_runtime = 0;
2482 p->se.prev_sum_exec_runtime = 0;
2483 p->se.last_wakeup = 0;
2484 p->se.avg_overlap = 0;
2485 p->se.start_runtime = 0;
2486 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2487
2488#ifdef CONFIG_SCHEDSTATS
2489 p->se.wait_start = 0;
2490 p->se.sum_sleep_runtime = 0;
2491 p->se.sleep_start = 0;
2492 p->se.block_start = 0;
2493 p->se.sleep_max = 0;
2494 p->se.block_max = 0;
2495 p->se.exec_max = 0;
2496 p->se.slice_max = 0;
2497 p->se.wait_max = 0;
2498#endif
2499
2500 INIT_LIST_HEAD(&p->rt.run_list);
2501 p->se.on_rq = 0;
2502 INIT_LIST_HEAD(&p->se.group_node);
2503
2504#ifdef CONFIG_PREEMPT_NOTIFIERS
2505 INIT_HLIST_HEAD(&p->preempt_notifiers);
2506#endif
2507
2508
2509
2510
2511
2512
2513
2514 p->state = TASK_RUNNING;
2515}
2516
2517
2518
2519
2520void sched_fork(struct task_struct *p, int clone_flags)
2521{
2522 int cpu = get_cpu();
2523
2524 __sched_fork(p);
2525
2526#ifdef CONFIG_SMP
2527 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2528#endif
2529 set_task_cpu(p, cpu);
2530
2531
2532
2533
2534 p->prio = current->normal_prio;
2535 if (!rt_prio(p->prio))
2536 p->sched_class = &fair_sched_class;
2537
2538#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2539 if (likely(sched_info_on()))
2540 memset(&p->sched_info, 0, sizeof(p->sched_info));
2541#endif
2542#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2543 p->oncpu = 0;
2544#endif
2545#ifdef CONFIG_PREEMPT
2546
2547 task_thread_info(p)->preempt_count = 1;
2548#endif
2549 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2550
2551 put_cpu();
2552}
2553
2554
2555
2556
2557
2558
2559
2560
2561void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2562{
2563 unsigned long flags;
2564 struct rq *rq;
2565
2566 rq = task_rq_lock(p, &flags);
2567 BUG_ON(p->state != TASK_RUNNING);
2568 update_rq_clock(rq);
2569
2570 p->prio = effective_prio(p);
2571
2572 if (!p->sched_class->task_new || !current->se.on_rq) {
2573 activate_task(rq, p, 0);
2574 } else {
2575
2576
2577
2578
2579 p->sched_class->task_new(rq, p);
2580 inc_nr_running(rq);
2581 }
2582 trace_sched_wakeup_new(rq, p, 1);
2583 check_preempt_curr(rq, p, 0);
2584#ifdef CONFIG_SMP
2585 if (p->sched_class->task_wake_up)
2586 p->sched_class->task_wake_up(rq, p);
2587#endif
2588 task_rq_unlock(rq, &flags);
2589}
2590
2591#ifdef CONFIG_PREEMPT_NOTIFIERS
2592
2593
2594
2595
2596
2597void preempt_notifier_register(struct preempt_notifier *notifier)
2598{
2599 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2600}
2601EXPORT_SYMBOL_GPL(preempt_notifier_register);
2602
2603
2604
2605
2606
2607
2608
2609void preempt_notifier_unregister(struct preempt_notifier *notifier)
2610{
2611 hlist_del(&notifier->link);
2612}
2613EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2614
2615static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2616{
2617 struct preempt_notifier *notifier;
2618 struct hlist_node *node;
2619
2620 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2621 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2622}
2623
2624static void
2625fire_sched_out_preempt_notifiers(struct task_struct *curr,
2626 struct task_struct *next)
2627{
2628 struct preempt_notifier *notifier;
2629 struct hlist_node *node;
2630
2631 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2632 notifier->ops->sched_out(notifier, next);
2633}
2634
2635#else
2636
2637static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2638{
2639}
2640
2641static void
2642fire_sched_out_preempt_notifiers(struct task_struct *curr,
2643 struct task_struct *next)
2644{
2645}
2646
2647#endif
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662static inline void
2663prepare_task_switch(struct rq *rq, struct task_struct *prev,
2664 struct task_struct *next)
2665{
2666 fire_sched_out_preempt_notifiers(prev, next);
2667 prepare_lock_switch(rq, next);
2668 prepare_arch_switch(next);
2669}
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
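/*
 * finish_task_switch - clean up after a task-switch, entered with the
 * runqueue lock still held and this CPU now running 'current'. Drops the
 * lock, fires the preempt notifiers, releases the previous task's
 * borrowed mm (if any) and, if the previous task was dying, its
 * task_struct.
 */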
2686static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2687 __releases(rq->lock)
2688{
2689 struct mm_struct *mm = rq->prev_mm;
2690 long prev_state;
2691#ifdef CONFIG_SMP
2692 int post_schedule = 0;
2693
2694 if (current->sched_class->needs_post_schedule)
2695 post_schedule = current->sched_class->needs_post_schedule(rq);
2696#endif
2697
2698 rq->prev_mm = NULL;
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711 prev_state = prev->state;
2712 finish_arch_switch(prev);
2713 finish_lock_switch(rq, prev);
2714#ifdef CONFIG_SMP
2715 if (post_schedule)
2716 current->sched_class->post_schedule(rq);
2717#endif
2718
2719 fire_sched_in_preempt_notifiers(current);
2720 if (mm)
2721 mmdrop(mm);
2722 if (unlikely(prev_state == TASK_DEAD)) {
2723
2724
2725
2726
2727 kprobe_flush_task(prev);
2728 put_task_struct(prev);
2729 }
2730}
2731
2732
2733
2734
2735
2736asmlinkage void schedule_tail(struct task_struct *prev)
2737 __releases(rq->lock)
2738{
2739 struct rq *rq = this_rq();
2740
2741 finish_task_switch(rq, prev);
2742#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2743
2744 preempt_enable();
2745#endif
2746 if (current->set_child_tid)
2747 put_user(task_pid_vnr(current), current->set_child_tid);
2748}
2749
2750
2751
2752
2753
2754static inline void
2755context_switch(struct rq *rq, struct task_struct *prev,
2756 struct task_struct *next)
2757{
2758 struct mm_struct *mm, *oldmm;
2759
2760 prepare_task_switch(rq, prev, next);
2761 trace_sched_switch(rq, prev, next);
2762 mm = next->mm;
2763 oldmm = prev->active_mm;
2764
2765
2766
2767
2768
2769 arch_enter_lazy_cpu_mode();
2770
2771 if (unlikely(!mm)) {
2772 next->active_mm = oldmm;
2773 atomic_inc(&oldmm->mm_count);
2774 enter_lazy_tlb(oldmm, next);
2775 } else
2776 switch_mm(oldmm, mm, next);
2777
2778 if (unlikely(!prev->mm)) {
2779 prev->active_mm = NULL;
2780 rq->prev_mm = oldmm;
2781 }
2782
2783
2784
2785
2786
2787
2788#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2789 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2790#endif
2791
2792
2793 switch_to(prev, next, prev);
2794
2795 barrier();
2796
2797
2798
2799
2800
2801 finish_task_switch(this_rq(), prev);
2802}
2803
2804
2805
2806
2807
2808
2809
2810
2811unsigned long nr_running(void)
2812{
2813 unsigned long i, sum = 0;
2814
2815 for_each_online_cpu(i)
2816 sum += cpu_rq(i)->nr_running;
2817
2818 return sum;
2819}
2820
2821unsigned long nr_uninterruptible(void)
2822{
2823 unsigned long i, sum = 0;
2824
2825 for_each_possible_cpu(i)
2826 sum += cpu_rq(i)->nr_uninterruptible;
2827
2828
2829
2830
2831
2832 if (unlikely((long)sum < 0))
2833 sum = 0;
2834
2835 return sum;
2836}
2837
2838unsigned long long nr_context_switches(void)
2839{
2840 int i;
2841 unsigned long long sum = 0;
2842
2843 for_each_possible_cpu(i)
2844 sum += cpu_rq(i)->nr_switches;
2845
2846 return sum;
2847}
2848
2849unsigned long nr_iowait(void)
2850{
2851 unsigned long i, sum = 0;
2852
2853 for_each_possible_cpu(i)
2854 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2855
2856 return sum;
2857}
2858
2859unsigned long nr_active(void)
2860{
2861 unsigned long i, running = 0, uninterruptible = 0;
2862
2863 for_each_online_cpu(i) {
2864 running += cpu_rq(i)->nr_running;
2865 uninterruptible += cpu_rq(i)->nr_uninterruptible;
2866 }
2867
2868 if (unlikely((long)uninterruptible < 0))
2869 uninterruptible = 0;
2870
2871 return running + uninterruptible;
2872}
2873
2874
2875
2876
2877
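/*
 * Update rq->cpu_load[] statistics. cpu_load[0] is the instantaneous
 * runqueue weight; higher indexes decay exponentially more slowly (the
 * scale factor doubles per index). The division is rounded up when load
 * rises so the average does not get stuck just below the new value.
 */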
2878static void update_cpu_load(struct rq *this_rq)
2879{
2880 unsigned long this_load = this_rq->load.weight;
2881 int i, scale;
2882
2883 this_rq->nr_load_updates++;
2884
2885
2886 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2887 unsigned long old_load, new_load;
2888
2889
2890
2891 old_load = this_rq->cpu_load[i];
2892 new_load = this_load;
2893
2894
2895
2896
2897
2898 if (new_load > old_load)
2899 new_load += scale-1;
2900 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2901 }
2902}
2903
2904#ifdef CONFIG_SMP
2905
2906
2907
2908
2909
2910
2911
2912static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2913 __acquires(rq1->lock)
2914 __acquires(rq2->lock)
2915{
2916 BUG_ON(!irqs_disabled());
2917 if (rq1 == rq2) {
2918 spin_lock(&rq1->lock);
2919 __acquire(rq2->lock);
2920 } else {
2921 if (rq1 < rq2) {
2922 spin_lock(&rq1->lock);
2923 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
2924 } else {
2925 spin_lock(&rq2->lock);
2926 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
2927 }
2928 }
2929 update_rq_clock(rq1);
2930 update_rq_clock(rq2);
2931}
2932
2933
2934
2935
2936
2937
2938
2939static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2940 __releases(rq1->lock)
2941 __releases(rq2->lock)
2942{
2943 spin_unlock(&rq1->lock);
2944 if (rq1 != rq2)
2945 spin_unlock(&rq2->lock);
2946 else
2947 __release(rq2->lock);
2948}
2949
/*
 * If dest_cpu is allowed for this process, migrate the task to it.
 * The migration is performed by queueing a migration request, waking
 * the per-cpu migration thread and waiting for it to complete.
 * Used by sched_exec() below.
 */
2956static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2957{
2958 struct migration_req req;
2959 unsigned long flags;
2960 struct rq *rq;
2961
2962 rq = task_rq_lock(p, &flags);
2963 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
2964 || unlikely(!cpu_active(dest_cpu)))
2965 goto out;
2966
2967
2968 if (migrate_task(p, dest_cpu, &req)) {
2969
2970 struct task_struct *mt = rq->migration_thread;
2971
2972 get_task_struct(mt);
2973 task_rq_unlock(rq, &flags);
2974 wake_up_process(mt);
2975 put_task_struct(mt);
2976 wait_for_completion(&req.done);
2977
2978 return;
2979 }
2980out:
2981 task_rq_unlock(rq, &flags);
2982}
2983
/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */
2988void sched_exec(void)
2989{
2990 int new_cpu, this_cpu = get_cpu();
2991 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2992 put_cpu();
2993 if (new_cpu != this_cpu)
2994 sched_migrate_task(current, new_cpu);
2995}
2996
/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 */
3001static void pull_task(struct rq *src_rq, struct task_struct *p,
3002 struct rq *this_rq, int this_cpu)
3003{
3004 deactivate_task(src_rq, p, 0);
3005 set_task_cpu(p, this_cpu);
3006 activate_task(this_rq, p, 0);
3007
3008
3009
3010
3011 check_preempt_curr(this_rq, p, 0);
3012}
3013
/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
3017static
3018int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3019 struct sched_domain *sd, enum cpu_idle_type idle,
3020 int *all_pinned)
3021{
3022 int tsk_cache_hot = 0;
3023
3024
3025
3026
3027
3028
3029 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3030 schedstat_inc(p, se.nr_failed_migrations_affine);
3031 return 0;
3032 }
3033 *all_pinned = 0;
3034
3035 if (task_running(rq, p)) {
3036 schedstat_inc(p, se.nr_failed_migrations_running);
3037 return 0;
3038 }
3039
3040
3041
3042
3043
3044
3045
3046 tsk_cache_hot = task_hot(p, rq->clock, sd);
3047 if (!tsk_cache_hot ||
3048 sd->nr_balance_failed > sd->cache_nice_tries) {
3049#ifdef CONFIG_SCHEDSTATS
3050 if (tsk_cache_hot) {
3051 schedstat_inc(sd, lb_hot_gained[idle]);
3052 schedstat_inc(p, se.nr_forced_migrations);
3053 }
3054#endif
3055 return 1;
3056 }
3057
3058 if (tsk_cache_hot) {
3059 schedstat_inc(p, se.nr_failed_migrations_hot);
3060 return 0;
3061 }
3062 return 1;
3063}
3064
3065static unsigned long
3066balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3067 unsigned long max_load_move, struct sched_domain *sd,
3068 enum cpu_idle_type idle, int *all_pinned,
3069 int *this_best_prio, struct rq_iterator *iterator)
3070{
3071 int loops = 0, pulled = 0, pinned = 0;
3072 struct task_struct *p;
3073 long rem_load_move = max_load_move;
3074
3075 if (max_load_move == 0)
3076 goto out;
3077
3078 pinned = 1;
3079
3080
3081
3082
3083 p = iterator->start(iterator->arg);
3084next:
3085 if (!p || loops++ > sysctl_sched_nr_migrate)
3086 goto out;
3087
3088 if ((p->se.load.weight >> 1) > rem_load_move ||
3089 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3090 p = iterator->next(iterator->arg);
3091 goto next;
3092 }
3093
3094 pull_task(busiest, p, this_rq, this_cpu);
3095 pulled++;
3096 rem_load_move -= p->se.load.weight;
3097
3098#ifdef CONFIG_PREEMPT
3099
3100
3101
3102
3103
3104 if (idle == CPU_NEWLY_IDLE)
3105 goto out;
3106#endif
3107
3108
3109
3110
3111 if (rem_load_move > 0) {
3112 if (p->prio < *this_best_prio)
3113 *this_best_prio = p->prio;
3114 p = iterator->next(iterator->arg);
3115 goto next;
3116 }
3117out:
3118
3119
3120
3121
3122
3123 schedstat_add(sd, lb_gained[idle], pulled);
3124
3125 if (all_pinned)
3126 *all_pinned = pinned;
3127
3128 return max_load_move - rem_load_move;
3129}
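
/*
 * balance_tasks() returns the amount of weighted load actually moved,
 * never more than max_load_move, and visits at most sysctl_sched_nr_migrate
 * tasks per call.  It is driven through the rq_iterator supplied by a
 * scheduling class' ->load_balance() hook (e.g. the fair class walks its
 * cfs_rq this way); the iterator indirection keeps this code class-agnostic.
 */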
3130
3131
3132
3133
3134
3135
3136
3137
3138static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3139 unsigned long max_load_move,
3140 struct sched_domain *sd, enum cpu_idle_type idle,
3141 int *all_pinned)
3142{
3143 const struct sched_class *class = sched_class_highest;
3144 unsigned long total_load_moved = 0;
3145 int this_best_prio = this_rq->curr->prio;
3146
3147 do {
3148 total_load_moved +=
3149 class->load_balance(this_rq, this_cpu, busiest,
3150 max_load_move - total_load_moved,
3151 sd, idle, all_pinned, &this_best_prio);
3152 class = class->next;
3153
3154#ifdef CONFIG_PREEMPT
3155
3156
3157
3158
3159
3160 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3161 break;
3162#endif
3163 } while (class && max_load_move > total_load_moved);
3164
3165 return total_load_moved > 0;
3166}
3167
3168static int
3169iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3170 struct sched_domain *sd, enum cpu_idle_type idle,
3171 struct rq_iterator *iterator)
3172{
3173 struct task_struct *p = iterator->start(iterator->arg);
3174 int pinned = 0;
3175
3176 while (p) {
3177 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3178 pull_task(busiest, p, this_rq, this_cpu);
3179
3180
3181
3182
3183
3184 schedstat_inc(sd, lb_gained[idle]);
3185
3186 return 1;
3187 }
3188 p = iterator->next(iterator->arg);
3189 }
3190
3191 return 0;
3192}
3193
3194
3195
3196
3197
3198
3199
3200
3201static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3202 struct sched_domain *sd, enum cpu_idle_type idle)
3203{
3204 const struct sched_class *class;
3205
3206 for (class = sched_class_highest; class; class = class->next)
3207 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3208 return 1;
3209
3210 return 0;
3211}
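
/*
 * move_one_task() is the single-task variant used by active_load_balance()
 * below: the migration thread of a busy CPU uses it to push one task over
 * to the chosen target CPU when regular pulling has repeatedly failed.
 */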
3212
3213
3214
3215
3216
3217struct sd_lb_stats {
3218 struct sched_group *busiest;
3219 struct sched_group *this;
3220 unsigned long total_load;
3221 unsigned long total_pwr;
3222 unsigned long avg_load;
3223
3224
3225 unsigned long this_load;
3226 unsigned long this_load_per_task;
3227 unsigned long this_nr_running;
3228
3229
3230 unsigned long max_load;
3231 unsigned long busiest_load_per_task;
3232 unsigned long busiest_nr_running;
3233
3234 int group_imb;
3235#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3236 int power_savings_balance;
3237 struct sched_group *group_min;
3238 struct sched_group *group_leader;
3239 unsigned long min_load_per_task;
3240 unsigned long leader_nr_running;
3241 unsigned long min_nr_running;
3242#endif
3243};
3244
3245
3246
3247
3248struct sg_lb_stats {
3249 unsigned long avg_load;
3250 unsigned long group_load;
3251 unsigned long sum_nr_running;
3252 unsigned long sum_weighted_load;
3253 unsigned long group_capacity;
3254 int group_imb;
3255};
3256
3257
3258
3259
3260
3261static inline unsigned int group_first_cpu(struct sched_group *group)
3262{
3263 return cpumask_first(sched_group_cpus(group));
3264}
3265
3266
3267
3268
3269
3270
3271static inline int get_sd_load_idx(struct sched_domain *sd,
3272 enum cpu_idle_type idle)
3273{
3274 int load_idx;
3275
3276 switch (idle) {
3277 case CPU_NOT_IDLE:
3278 load_idx = sd->busy_idx;
3279 break;
3280
3281 case CPU_NEWLY_IDLE:
3282 load_idx = sd->newidle_idx;
3283 break;
3284 default:
3285 load_idx = sd->idle_idx;
3286 break;
3287 }
3288
3289 return load_idx;
3290}
3291
3292
3293#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3294
3295
3296
3297
3298
3299
3300
3301
3302static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3303 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3304{
3305
3306
3307
3308
3309 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3310 sds->power_savings_balance = 0;
3311 else {
3312 sds->power_savings_balance = 1;
3313 sds->min_nr_running = ULONG_MAX;
3314 sds->leader_nr_running = 0;
3315 }
3316}
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328static inline void update_sd_power_savings_stats(struct sched_group *group,
3329 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3330{
3331
3332 if (!sds->power_savings_balance)
3333 return;
3334
3335
3336
3337
3338
3339 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3340 !sds->this_nr_running))
3341 sds->power_savings_balance = 0;
3342
3343
3344
3345
3346
3347 if (!sds->power_savings_balance ||
3348 sgs->sum_nr_running >= sgs->group_capacity ||
3349 !sgs->sum_nr_running)
3350 return;
3351
3352
3353
3354
3355
3356
3357 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3358 (sgs->sum_nr_running == sds->min_nr_running &&
3359 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3360 sds->group_min = group;
3361 sds->min_nr_running = sgs->sum_nr_running;
3362 sds->min_load_per_task = sgs->sum_weighted_load /
3363 sgs->sum_nr_running;
3364 }
3365
3366
3367
3368
3369
3370
3371 if (sgs->sum_nr_running > sgs->group_capacity - 1)
3372 return;
3373
3374 if (sgs->sum_nr_running > sds->leader_nr_running ||
3375 (sgs->sum_nr_running == sds->leader_nr_running &&
3376 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3377 sds->group_leader = group;
3378 sds->leader_nr_running = sgs->sum_nr_running;
3379 }
3380}
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3398 int this_cpu, unsigned long *imbalance)
3399{
3400 if (!sds->power_savings_balance)
3401 return 0;
3402
3403 if (sds->this != sds->group_leader ||
3404 sds->group_leader == sds->group_min)
3405 return 0;
3406
3407 *imbalance = sds->min_load_per_task;
3408 sds->busiest = sds->group_min;
3409
3410 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3411 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3412 group_first_cpu(sds->group_leader);
3413 }
3414
3415 return 1;
3416
3417}
3418#else
3419static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3420 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3421{
3422 return;
3423}
3424
3425static inline void update_sd_power_savings_stats(struct sched_group *group,
3426 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3427{
3428 return;
3429}
3430
3431static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3432 int this_cpu, unsigned long *imbalance)
3433{
3434 return 0;
3435}
3436#endif
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3452 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3453 int local_group, const struct cpumask *cpus,
3454 int *balance, struct sg_lb_stats *sgs)
3455{
3456 unsigned long load, max_cpu_load, min_cpu_load;
3457 int i;
3458 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3459 unsigned long sum_avg_load_per_task;
3460 unsigned long avg_load_per_task;
3461
3462 if (local_group)
3463 balance_cpu = group_first_cpu(group);
3464
3465
3466 sum_avg_load_per_task = avg_load_per_task = 0;
3467 max_cpu_load = 0;
3468 min_cpu_load = ~0UL;
3469
3470 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3471 struct rq *rq = cpu_rq(i);
3472
3473 if (*sd_idle && rq->nr_running)
3474 *sd_idle = 0;
3475
3476
3477 if (local_group) {
3478 if (idle_cpu(i) && !first_idle_cpu) {
3479 first_idle_cpu = 1;
3480 balance_cpu = i;
3481 }
3482
3483 load = target_load(i, load_idx);
3484 } else {
3485 load = source_load(i, load_idx);
3486 if (load > max_cpu_load)
3487 max_cpu_load = load;
3488 if (min_cpu_load > load)
3489 min_cpu_load = load;
3490 }
3491
3492 sgs->group_load += load;
3493 sgs->sum_nr_running += rq->nr_running;
3494 sgs->sum_weighted_load += weighted_cpuload(i);
3495
3496 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3497 }
3498
3499
3500
3501
3502
3503
3504
3505 if (idle != CPU_NEWLY_IDLE && local_group &&
3506 balance_cpu != this_cpu && balance) {
3507 *balance = 0;
3508 return;
3509 }
3510
3511
3512 sgs->avg_load = sg_div_cpu_power(group,
3513 sgs->group_load * SCHED_LOAD_SCALE);
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525 avg_load_per_task = sg_div_cpu_power(group,
3526 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3527
3528 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3529 sgs->group_imb = 1;
3530
3531 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3532
3533}
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3546 enum cpu_idle_type idle, int *sd_idle,
3547 const struct cpumask *cpus, int *balance,
3548 struct sd_lb_stats *sds)
3549{
3550 struct sched_group *group = sd->groups;
3551 struct sg_lb_stats sgs;
3552 int load_idx;
3553
3554 init_sd_power_savings_stats(sd, sds, idle);
3555 load_idx = get_sd_load_idx(sd, idle);
3556
3557 do {
3558 int local_group;
3559
3560 local_group = cpumask_test_cpu(this_cpu,
3561 sched_group_cpus(group));
3562 memset(&sgs, 0, sizeof(sgs));
3563 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
3564 local_group, cpus, balance, &sgs);
3565
3566 if (local_group && balance && !(*balance))
3567 return;
3568
3569 sds->total_load += sgs.group_load;
3570 sds->total_pwr += group->__cpu_power;
3571
3572 if (local_group) {
3573 sds->this_load = sgs.avg_load;
3574 sds->this = group;
3575 sds->this_nr_running = sgs.sum_nr_running;
3576 sds->this_load_per_task = sgs.sum_weighted_load;
3577 } else if (sgs.avg_load > sds->max_load &&
3578 (sgs.sum_nr_running > sgs.group_capacity ||
3579 sgs.group_imb)) {
3580 sds->max_load = sgs.avg_load;
3581 sds->busiest = group;
3582 sds->busiest_nr_running = sgs.sum_nr_running;
3583 sds->busiest_load_per_task = sgs.sum_weighted_load;
3584 sds->group_imb = sgs.group_imb;
3585 }
3586
3587 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3588 group = group->next;
3589 } while (group != sd->groups);
3590
3591}
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3602 int this_cpu, unsigned long *imbalance)
3603{
3604 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3605 unsigned int imbn = 2;
3606
3607 if (sds->this_nr_running) {
3608 sds->this_load_per_task /= sds->this_nr_running;
3609 if (sds->busiest_load_per_task >
3610 sds->this_load_per_task)
3611 imbn = 1;
3612 } else
3613 sds->this_load_per_task =
3614 cpu_avg_load_per_task(this_cpu);
3615
3616 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3617 sds->busiest_load_per_task * imbn) {
3618 *imbalance = sds->busiest_load_per_task;
3619 return;
3620 }
3621
3622
3623
3624
3625
3626
3627
3628 pwr_now += sds->busiest->__cpu_power *
3629 min(sds->busiest_load_per_task, sds->max_load);
3630 pwr_now += sds->this->__cpu_power *
3631 min(sds->this_load_per_task, sds->this_load);
3632 pwr_now /= SCHED_LOAD_SCALE;
3633
3634
3635 tmp = sg_div_cpu_power(sds->busiest,
3636 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3637 if (sds->max_load > tmp)
3638 pwr_move += sds->busiest->__cpu_power *
3639 min(sds->busiest_load_per_task, sds->max_load - tmp);
3640
3641
3642 if (sds->max_load * sds->busiest->__cpu_power <
3643 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3644 tmp = sg_div_cpu_power(sds->this,
3645 sds->max_load * sds->busiest->__cpu_power);
3646 else
3647 tmp = sg_div_cpu_power(sds->this,
3648 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3649 pwr_move += sds->this->__cpu_power *
3650 min(sds->this_load_per_task, sds->this_load + tmp);
3651 pwr_move /= SCHED_LOAD_SCALE;
3652
3653
3654 if (pwr_move > pwr_now)
3655 *imbalance = sds->busiest_load_per_task;
3656}
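
/*
 * In other words: fix_small_imbalance() is reached when the computed
 * imbalance is smaller than one task's weighted load.  It compares the
 * power-weighted load before (pwr_now) and after (pwr_move) hypothetically
 * moving one average busiest-group task, and only if the move helps
 * (pwr_move > pwr_now) does it report an imbalance of exactly one task's
 * load, so that a single task gets migrated.
 */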
3657
3658
3659
3660
3661
3662
3663
3664
3665static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3666 unsigned long *imbalance)
3667{
3668 unsigned long max_pull;
3669
3670
3671
3672
3673
3674 if (sds->max_load < sds->avg_load) {
3675 *imbalance = 0;
3676 return fix_small_imbalance(sds, this_cpu, imbalance);
3677 }
3678
3679
3680 max_pull = min(sds->max_load - sds->avg_load,
3681 sds->max_load - sds->busiest_load_per_task);
3682
3683
3684 *imbalance = min(max_pull * sds->busiest->__cpu_power,
3685 (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
3686 / SCHED_LOAD_SCALE;
3687
3688
3689
3690
3691
3692
3693
3694 if (*imbalance < sds->busiest_load_per_task)
3695 return fix_small_imbalance(sds, this_cpu, imbalance);
3696
3697}
3698
3699
/******* find_busiest_group() helpers end here *********************/

/**
 * find_busiest_group - Returns the busiest group within the sched_domain
 * if there is an imbalance. If there isn't an imbalance, and
 * the user has opted for power-savings, it returns a group whose
 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
 * such a group exists.
 *
 * Also calculates the amount of weighted load which should be moved
 * to restore balance.
 *
 * @sd: The sched_domain whose busiest group is to be returned.
 * @this_cpu: The cpu for which load balancing is currently being performed.
 * @imbalance: Variable which stores amount of weighted load which should
 *		be moved to restore balance/put a group to idle.
 * @idle: The idle status of this_cpu.
 * @sd_idle: The idleness of sd.
 * @cpus: The set of CPUs under consideration for load-balancing.
 * @balance: Pointer to a variable indicating if this_cpu
 *	is the appropriate cpu to perform load balancing at this level.
 *
 * Returns:	- the busiest group if an imbalance exists.
 *		- If no imbalance and the user has opted for power-savings
 *		  balance, return the least loaded group whose CPUs can be
 *		  put to idle by rebalancing its tasks onto our group.
 */
3725static struct sched_group *
3726find_busiest_group(struct sched_domain *sd, int this_cpu,
3727 unsigned long *imbalance, enum cpu_idle_type idle,
3728 int *sd_idle, const struct cpumask *cpus, int *balance)
3729{
3730 struct sd_lb_stats sds;
3731
3732 memset(&sds, 0, sizeof(sds));
3733
3734
3735
3736
3737
3738 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
3739 balance, &sds);
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751 if (balance && !(*balance))
3752 goto ret;
3753
3754 if (!sds.busiest || sds.busiest_nr_running == 0)
3755 goto out_balanced;
3756
3757 if (sds.this_load >= sds.max_load)
3758 goto out_balanced;
3759
3760 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3761
3762 if (sds.this_load >= sds.avg_load)
3763 goto out_balanced;
3764
3765 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3766 goto out_balanced;
3767
3768 sds.busiest_load_per_task /= sds.busiest_nr_running;
3769 if (sds.group_imb)
3770 sds.busiest_load_per_task =
3771 min(sds.busiest_load_per_task, sds.avg_load);
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784 if (sds.max_load <= sds.busiest_load_per_task)
3785 goto out_balanced;
3786
3787
3788 calculate_imbalance(&sds, this_cpu, imbalance);
3789 return sds.busiest;
3790
3791out_balanced:
3792
3793
3794
3795
3796 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
3797 return sds.busiest;
3798ret:
3799 *imbalance = 0;
3800 return NULL;
3801}
3802
3803
3804
3805
3806static struct rq *
3807find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3808 unsigned long imbalance, const struct cpumask *cpus)
3809{
3810 struct rq *busiest = NULL, *rq;
3811 unsigned long max_load = 0;
3812 int i;
3813
3814 for_each_cpu(i, sched_group_cpus(group)) {
3815 unsigned long wl;
3816
3817 if (!cpumask_test_cpu(i, cpus))
3818 continue;
3819
3820 rq = cpu_rq(i);
3821 wl = weighted_cpuload(i);
3822
3823 if (rq->nr_running == 1 && wl > imbalance)
3824 continue;
3825
3826 if (wl > max_load) {
3827 max_load = wl;
3828 busiest = rq;
3829 }
3830 }
3831
3832 return busiest;
3833}
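
/*
 * Within the group chosen by find_busiest_group(), this picks the runqueue
 * with the highest weighted load.  CPUs outside 'cpus' are skipped, as are
 * CPUs whose single running task already weighs more than the requested
 * imbalance: moving it would overshoot, and a currently-running task
 * cannot be pulled by can_migrate_task() anyway.
 */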
3834
3835
3836
3837
3838
3839#define MAX_PINNED_INTERVAL 512
3840
3841
3842static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3843
/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
3848static int load_balance(int this_cpu, struct rq *this_rq,
3849 struct sched_domain *sd, enum cpu_idle_type idle,
3850 int *balance)
3851{
3852 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3853 struct sched_group *group;
3854 unsigned long imbalance;
3855 struct rq *busiest;
3856 unsigned long flags;
3857 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3858
3859 cpumask_setall(cpus);
3860
3861
3862
3863
3864
3865
3866
3867 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
3868 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3869 sd_idle = 1;
3870
3871 schedstat_inc(sd, lb_count[idle]);
3872
3873redo:
3874 update_shares(sd);
3875 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3876 cpus, balance);
3877
3878 if (*balance == 0)
3879 goto out_balanced;
3880
3881 if (!group) {
3882 schedstat_inc(sd, lb_nobusyg[idle]);
3883 goto out_balanced;
3884 }
3885
3886 busiest = find_busiest_queue(group, idle, imbalance, cpus);
3887 if (!busiest) {
3888 schedstat_inc(sd, lb_nobusyq[idle]);
3889 goto out_balanced;
3890 }
3891
3892 BUG_ON(busiest == this_rq);
3893
3894 schedstat_add(sd, lb_imbalance[idle], imbalance);
3895
3896 ld_moved = 0;
3897 if (busiest->nr_running > 1) {
3898
3899
3900
3901
3902
3903
3904 local_irq_save(flags);
3905 double_rq_lock(this_rq, busiest);
3906 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3907 imbalance, sd, idle, &all_pinned);
3908 double_rq_unlock(this_rq, busiest);
3909 local_irq_restore(flags);
3910
3911
3912
3913
3914 if (ld_moved && this_cpu != smp_processor_id())
3915 resched_cpu(this_cpu);
3916
3917
3918 if (unlikely(all_pinned)) {
3919 cpumask_clear_cpu(cpu_of(busiest), cpus);
3920 if (!cpumask_empty(cpus))
3921 goto redo;
3922 goto out_balanced;
3923 }
3924 }
3925
3926 if (!ld_moved) {
3927 schedstat_inc(sd, lb_failed[idle]);
3928 sd->nr_balance_failed++;
3929
3930 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
3931
3932 spin_lock_irqsave(&busiest->lock, flags);
3933
3934
3935
3936
3937 if (!cpumask_test_cpu(this_cpu,
3938 &busiest->curr->cpus_allowed)) {
3939 spin_unlock_irqrestore(&busiest->lock, flags);
3940 all_pinned = 1;
3941 goto out_one_pinned;
3942 }
3943
3944 if (!busiest->active_balance) {
3945 busiest->active_balance = 1;
3946 busiest->push_cpu = this_cpu;
3947 active_balance = 1;
3948 }
3949 spin_unlock_irqrestore(&busiest->lock, flags);
3950 if (active_balance)
3951 wake_up_process(busiest->migration_thread);
3952
3953
3954
3955
3956
3957 sd->nr_balance_failed = sd->cache_nice_tries+1;
3958 }
3959 } else
3960 sd->nr_balance_failed = 0;
3961
3962 if (likely(!active_balance)) {
3963
3964 sd->balance_interval = sd->min_interval;
3965 } else {
3966
3967
3968
3969
3970
3971
3972 if (sd->balance_interval < sd->max_interval)
3973 sd->balance_interval *= 2;
3974 }
3975
3976 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3977 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3978 ld_moved = -1;
3979
3980 goto out;
3981
3982out_balanced:
3983 schedstat_inc(sd, lb_balanced[idle]);
3984
3985 sd->nr_balance_failed = 0;
3986
3987out_one_pinned:
3988
3989 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3990 (sd->balance_interval < sd->max_interval))
3991 sd->balance_interval *= 2;
3992
3993 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3994 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3995 ld_moved = -1;
3996 else
3997 ld_moved = 0;
3998out:
3999 if (ld_moved)
4000 update_shares(sd);
4001 return ld_moved;
4002}
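
/*
 * The return value is effectively used as a boolean by rebalance_domains():
 * any non-zero result (including the -1 used for the SD_SHARE_CPUPOWER /
 * power-savings cases) makes the caller treat this CPU as no longer idle
 * for the remaining, larger domains in the same pass.
 */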
4003
/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 *
 * Called from idle_balance(), i.e. when this_rq is about to become idle
 * (CPU_NEWLY_IDLE).  this_rq is locked.
 */
4011static int
4012load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4013{
4014 struct sched_group *group;
4015 struct rq *busiest = NULL;
4016 unsigned long imbalance;
4017 int ld_moved = 0;
4018 int sd_idle = 0;
4019 int all_pinned = 0;
4020 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4021
4022 cpumask_setall(cpus);
4023
4024
4025
4026
4027
4028
4029
4030 if (sd->flags & SD_SHARE_CPUPOWER &&
4031 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4032 sd_idle = 1;
4033
4034 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4035redo:
4036 update_shares_locked(this_rq, sd);
4037 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4038 &sd_idle, cpus, NULL);
4039 if (!group) {
4040 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4041 goto out_balanced;
4042 }
4043
4044 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4045 if (!busiest) {
4046 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4047 goto out_balanced;
4048 }
4049
4050 BUG_ON(busiest == this_rq);
4051
4052 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4053
4054 ld_moved = 0;
4055 if (busiest->nr_running > 1) {
4056
4057 double_lock_balance(this_rq, busiest);
4058
4059 update_rq_clock(busiest);
4060 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4061 imbalance, sd, CPU_NEWLY_IDLE,
4062 &all_pinned);
4063 double_unlock_balance(this_rq, busiest);
4064
4065 if (unlikely(all_pinned)) {
4066 cpumask_clear_cpu(cpu_of(busiest), cpus);
4067 if (!cpumask_empty(cpus))
4068 goto redo;
4069 }
4070 }
4071
4072 if (!ld_moved) {
4073 int active_balance = 0;
4074
4075 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4076 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4077 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4078 return -1;
4079
4080 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4081 return -1;
4082
4083 if (sd->nr_balance_failed++ < 2)
4084 return -1;
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109 double_lock_balance(this_rq, busiest);
4110
4111
4112
4113
4114
4115 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4116 double_unlock_balance(this_rq, busiest);
4117 all_pinned = 1;
4118 return ld_moved;
4119 }
4120
4121 if (!busiest->active_balance) {
4122 busiest->active_balance = 1;
4123 busiest->push_cpu = this_cpu;
4124 active_balance = 1;
4125 }
4126
4127 double_unlock_balance(this_rq, busiest);
4128
4129
4130
4131 spin_unlock(&this_rq->lock);
4132 if (active_balance)
4133 wake_up_process(busiest->migration_thread);
4134 spin_lock(&this_rq->lock);
4135
4136 } else
4137 sd->nr_balance_failed = 0;
4138
4139 update_shares_locked(this_rq, sd);
4140 return ld_moved;
4141
4142out_balanced:
4143 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4144 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4145 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4146 return -1;
4147 sd->nr_balance_failed = 0;
4148
4149 return 0;
4150}
4151
/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */
4156static void idle_balance(int this_cpu, struct rq *this_rq)
4157{
4158 struct sched_domain *sd;
4159 int pulled_task = 0;
4160 unsigned long next_balance = jiffies + HZ;
4161
4162 for_each_domain(this_cpu, sd) {
4163 unsigned long interval;
4164
4165 if (!(sd->flags & SD_LOAD_BALANCE))
4166 continue;
4167
4168 if (sd->flags & SD_BALANCE_NEWIDLE)
4169
4170 pulled_task = load_balance_newidle(this_cpu, this_rq,
4171 sd);
4172
4173 interval = msecs_to_jiffies(sd->balance_interval);
4174 if (time_after(next_balance, sd->last_balance + interval))
4175 next_balance = sd->last_balance + interval;
4176 if (pulled_task)
4177 break;
4178 }
4179 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4180
4181
4182
4183
4184 this_rq->next_balance = next_balance;
4185 }
4186}
4187
/*
 * active_load_balance is run by migration threads. It pushes a running
 * task off the busy CPU onto an idle target CPU. It requires at least
 * one task to be running on each physical CPU where possible, and
 * avoids physical/logical imbalances.
 *
 * Called with busiest_rq locked.
 */
4196static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4197{
4198 int target_cpu = busiest_rq->push_cpu;
4199 struct sched_domain *sd;
4200 struct rq *target_rq;
4201
4202
4203 if (busiest_rq->nr_running <= 1)
4204 return;
4205
4206 target_rq = cpu_rq(target_cpu);
4207
4208
4209
4210
4211
4212
4213 BUG_ON(busiest_rq == target_rq);
4214
4215
4216 double_lock_balance(busiest_rq, target_rq);
4217 update_rq_clock(busiest_rq);
4218 update_rq_clock(target_rq);
4219
4220
4221 for_each_domain(target_cpu, sd) {
4222 if ((sd->flags & SD_LOAD_BALANCE) &&
4223 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4224 break;
4225 }
4226
4227 if (likely(sd)) {
4228 schedstat_inc(sd, alb_count);
4229
4230 if (move_one_task(target_rq, target_cpu, busiest_rq,
4231 sd, CPU_IDLE))
4232 schedstat_inc(sd, alb_pushed);
4233 else
4234 schedstat_inc(sd, alb_failed);
4235 }
4236 double_unlock_balance(busiest_rq, target_rq);
4237}
4238
4239#ifdef CONFIG_NO_HZ
4240static struct {
4241 atomic_t load_balancer;
4242 cpumask_var_t cpu_mask;
4243} nohz ____cacheline_aligned = {
4244 .load_balancer = ATOMIC_INIT(-1),
4245};
4246
/*
 * This routine tries to nominate the ilb (idle load balancing) owner
 * among the cpus whose ticks are stopped. The ilb owner does the idle
 * load balancing on behalf of all those cpus. If all the cpus in the
 * system go into this tickless mode, then there will be no ilb owner
 * (as there is no need for one) and all the cpus will sleep till the
 * next wakeup event arrives.
 *
 * For the ilb owner, the tick is not stopped, and that tick is used
 * for idle load balancing. The ilb owner stays part of nohz.cpu_mask.
 *
 * While stopping the tick, this cpu becomes the ilb owner if there is
 * no other owner. The ownership is dropped again when the cpu restarts
 * its tick, or when the whole system has gone idle.
 */
4267int select_nohz_load_balancer(int stop_tick)
4268{
4269 int cpu = smp_processor_id();
4270
4271 if (stop_tick) {
4272 cpu_rq(cpu)->in_nohz_recently = 1;
4273
4274 if (!cpu_active(cpu)) {
4275 if (atomic_read(&nohz.load_balancer) != cpu)
4276 return 0;
4277
4278
4279
4280
4281
4282 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4283 BUG();
4284
4285 return 0;
4286 }
4287
4288 cpumask_set_cpu(cpu, nohz.cpu_mask);
4289
4290
4291 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4292 if (atomic_read(&nohz.load_balancer) == cpu)
4293 atomic_set(&nohz.load_balancer, -1);
4294 return 0;
4295 }
4296
4297 if (atomic_read(&nohz.load_balancer) == -1) {
4298
4299 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4300 return 1;
4301 } else if (atomic_read(&nohz.load_balancer) == cpu)
4302 return 1;
4303 } else {
4304 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4305 return 0;
4306
4307 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4308
4309 if (atomic_read(&nohz.load_balancer) == cpu)
4310 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4311 BUG();
4312 }
4313 return 0;
4314}
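
/*
 * Summary of the state machine above: with stop_tick set, the CPU is added
 * to nohz.cpu_mask and claims ilb ownership via cmpxchg if nobody owns it;
 * if every online CPU has gone idle, the owner gives the role up so the
 * whole system can sleep.  With stop_tick clear, the CPU leaves the mask
 * and relinquishes ownership.  A return of 1 means "this CPU is the ilb
 * owner"; the NO_HZ tick code is expected to keep its tick running in that
 * case (the caller's behaviour lives outside this file).
 */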
4315#endif
4316
4317static DEFINE_SPINLOCK(balancing);
4318
/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in arch_init_sched_domains.
 */
4325static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4326{
4327 int balance = 1;
4328 struct rq *rq = cpu_rq(cpu);
4329 unsigned long interval;
4330 struct sched_domain *sd;
4331
4332 unsigned long next_balance = jiffies + 60*HZ;
4333 int update_next_balance = 0;
4334 int need_serialize;
4335
4336 for_each_domain(cpu, sd) {
4337 if (!(sd->flags & SD_LOAD_BALANCE))
4338 continue;
4339
4340 interval = sd->balance_interval;
4341 if (idle != CPU_IDLE)
4342 interval *= sd->busy_factor;
4343
4344
4345 interval = msecs_to_jiffies(interval);
4346 if (unlikely(!interval))
4347 interval = 1;
4348 if (interval > HZ*NR_CPUS/10)
4349 interval = HZ*NR_CPUS/10;
4350
4351 need_serialize = sd->flags & SD_SERIALIZE;
4352
4353 if (need_serialize) {
4354 if (!spin_trylock(&balancing))
4355 goto out;
4356 }
4357
4358 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4359 if (load_balance(cpu, rq, sd, idle, &balance)) {
4360
4361
4362
4363
4364
4365 idle = CPU_NOT_IDLE;
4366 }
4367 sd->last_balance = jiffies;
4368 }
4369 if (need_serialize)
4370 spin_unlock(&balancing);
4371out:
4372 if (time_after(next_balance, sd->last_balance + interval)) {
4373 next_balance = sd->last_balance + interval;
4374 update_next_balance = 1;
4375 }
4376
4377
4378
4379
4380
4381
4382 if (!balance)
4383 break;
4384 }
4385
4386
4387
4388
4389
4390
4391 if (likely(update_next_balance))
4392 rq->next_balance = next_balance;
4393}
4394
4395
4396
4397
4398
4399
4400static void run_rebalance_domains(struct softirq_action *h)
4401{
4402 int this_cpu = smp_processor_id();
4403 struct rq *this_rq = cpu_rq(this_cpu);
4404 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4405 CPU_IDLE : CPU_NOT_IDLE;
4406
4407 rebalance_domains(this_cpu, idle);
4408
4409#ifdef CONFIG_NO_HZ
4410
4411
4412
4413
4414
4415 if (this_rq->idle_at_tick &&
4416 atomic_read(&nohz.load_balancer) == this_cpu) {
4417 struct rq *rq;
4418 int balance_cpu;
4419
4420 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4421 if (balance_cpu == this_cpu)
4422 continue;
4423
4424
4425
4426
4427
4428
4429 if (need_resched())
4430 break;
4431
4432 rebalance_domains(balance_cpu, CPU_IDLE);
4433
4434 rq = cpu_rq(balance_cpu);
4435 if (time_after(this_rq->next_balance, rq->next_balance))
4436 this_rq->next_balance = rq->next_balance;
4437 }
4438 }
4439#endif
4440}
4441
4442static inline int on_null_domain(int cpu)
4443{
4444 return !rcu_dereference(cpu_rq(cpu)->sd);
4445}
4446
/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 *
 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
 * idle load balancing owner or decide to stop the periodic load balancing
 * for the whole system, if this cpu is the owner.
 */
4454static inline void trigger_load_balance(struct rq *rq, int cpu)
4455{
4456#ifdef CONFIG_NO_HZ
4457
4458
4459
4460
4461
4462 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4463 rq->in_nohz_recently = 0;
4464
4465 if (atomic_read(&nohz.load_balancer) == cpu) {
4466 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4467 atomic_set(&nohz.load_balancer, -1);
4468 }
4469
4470 if (atomic_read(&nohz.load_balancer) == -1) {
4471
4472
4473
4474
4475
4476
4477
4478
4479 int ilb = cpumask_first(nohz.cpu_mask);
4480
4481 if (ilb < nr_cpu_ids)
4482 resched_cpu(ilb);
4483 }
4484 }
4485
4486
4487
4488
4489
4490 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4491 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4492 resched_cpu(cpu);
4493 return;
4494 }
4495
4496
4497
4498
4499
4500 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4501 cpumask_test_cpu(cpu, nohz.cpu_mask))
4502 return;
4503#endif
4504
4505 if (time_after_eq(jiffies, rq->next_balance) &&
4506 likely(!on_null_domain(cpu)))
4507 raise_softirq(SCHED_SOFTIRQ);
4508}
4509
4510#else
4511
4512
4513
4514
4515static inline void idle_balance(int cpu, struct rq *rq)
4516{
4517}
4518
4519#endif
4520
4521DEFINE_PER_CPU(struct kernel_stat, kstat);
4522
4523EXPORT_PER_CPU_SYMBOL(kstat);
4524
/*
 * Return any ns on the sched_clock that have not yet been accounted in
 * case the task is currently running.
 *
 * Called with task_rq_lock() held on @rq.
 */
4531static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
4532{
4533 u64 ns = 0;
4534
4535 if (task_current(rq, p)) {
4536 update_rq_clock(rq);
4537 ns = rq->clock - p->se.exec_start;
4538 if ((s64)ns < 0)
4539 ns = 0;
4540 }
4541
4542 return ns;
4543}
4544
4545unsigned long long task_delta_exec(struct task_struct *p)
4546{
4547 unsigned long flags;
4548 struct rq *rq;
4549 u64 ns = 0;
4550
4551 rq = task_rq_lock(p, &flags);
4552 ns = do_task_delta_exec(p, rq);
4553 task_rq_unlock(rq, &flags);
4554
4555 return ns;
4556}
4557
4558
4559
4560
4561
4562
4563unsigned long long task_sched_runtime(struct task_struct *p)
4564{
4565 unsigned long flags;
4566 struct rq *rq;
4567 u64 ns = 0;
4568
4569 rq = task_rq_lock(p, &flags);
4570 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
4571 task_rq_unlock(rq, &flags);
4572
4573 return ns;
4574}
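
/*
 * These helpers presumably back the per-task CPU-clock interfaces (for
 * example the POSIX CPU timers / CLOCK_THREAD_CPUTIME_ID path); the extra
 * do_task_delta_exec() term makes sure time the task is executing right
 * now, but has not yet banked, is included in the reported runtime.
 */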
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585unsigned long long thread_group_sched_runtime(struct task_struct *p)
4586{
4587 struct task_cputime totals;
4588 unsigned long flags;
4589 struct rq *rq;
4590 u64 ns;
4591
4592 rq = task_rq_lock(p, &flags);
4593 thread_group_cputime(p, &totals);
4594 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
4595 task_rq_unlock(rq, &flags);
4596
4597 return ns;
4598}
4599
/*
 * Account user cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in user space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */
4606void account_user_time(struct task_struct *p, cputime_t cputime,
4607 cputime_t cputime_scaled)
4608{
4609 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4610 cputime64_t tmp;
4611
4612
4613 p->utime = cputime_add(p->utime, cputime);
4614 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4615 account_group_user_time(p, cputime);
4616
4617
4618 tmp = cputime_to_cputime64(cputime);
4619 if (TASK_NICE(p) > 0)
4620 cpustat->nice = cputime64_add(cpustat->nice, tmp);
4621 else
4622 cpustat->user = cputime64_add(cpustat->user, tmp);
4623
4624 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
4625
4626 acct_update_integrals(p);
4627}
4628
4629
4630
4631
4632
4633
4634
4635static void account_guest_time(struct task_struct *p, cputime_t cputime,
4636 cputime_t cputime_scaled)
4637{
4638 cputime64_t tmp;
4639 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4640
4641 tmp = cputime_to_cputime64(cputime);
4642
4643
4644 p->utime = cputime_add(p->utime, cputime);
4645 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4646 account_group_user_time(p, cputime);
4647 p->gtime = cputime_add(p->gtime, cputime);
4648
4649
4650 cpustat->user = cputime64_add(cpustat->user, tmp);
4651 cpustat->guest = cputime64_add(cpustat->guest, tmp);
4652}
4653
/*
 * Account system cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the cpu time spent in kernel space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */
4661void account_system_time(struct task_struct *p, int hardirq_offset,
4662 cputime_t cputime, cputime_t cputime_scaled)
4663{
4664 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4665 cputime64_t tmp;
4666
4667 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4668 account_guest_time(p, cputime, cputime_scaled);
4669 return;
4670 }
4671
4672
4673 p->stime = cputime_add(p->stime, cputime);
4674 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
4675 account_group_system_time(p, cputime);
4676
4677
4678 tmp = cputime_to_cputime64(cputime);
4679 if (hardirq_count() - hardirq_offset)
4680 cpustat->irq = cputime64_add(cpustat->irq, tmp);
4681 else if (softirq_count())
4682 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
4683 else
4684 cpustat->system = cputime64_add(cpustat->system, tmp);
4685
4686 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
4687
4688
4689 acct_update_integrals(p);
4690}
4691
4692
4693
4694
4695
4696void account_steal_time(cputime_t cputime)
4697{
4698 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4699 cputime64_t cputime64 = cputime_to_cputime64(cputime);
4700
4701 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
4702}
4703
4704
4705
4706
4707
4708void account_idle_time(cputime_t cputime)
4709{
4710 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4711 cputime64_t cputime64 = cputime_to_cputime64(cputime);
4712 struct rq *rq = this_rq();
4713
4714 if (atomic_read(&rq->nr_iowait) > 0)
4715 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
4716 else
4717 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
4718}
4719
4720#ifndef CONFIG_VIRT_CPU_ACCOUNTING
4721
4722
4723
4724
4725
4726
4727void account_process_tick(struct task_struct *p, int user_tick)
4728{
4729 cputime_t one_jiffy = jiffies_to_cputime(1);
4730 cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
4731 struct rq *rq = this_rq();
4732
4733 if (user_tick)
4734 account_user_time(p, one_jiffy, one_jiffy_scaled);
4735 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
4736 account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
4737 one_jiffy_scaled);
4738 else
4739 account_idle_time(one_jiffy);
4740}
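
/*
 * Illustrative accounting: with HZ == 1000 each call hands out one jiffy
 * (1 ms) of cputime.  A tick that interrupts user code goes to utime (or
 * the 'nice' bucket), a tick in kernel, hardirq or softirq context goes to
 * the corresponding system buckets via account_system_time(), and a tick
 * on the idle task with no hardirq pending is counted as idle (or iowait
 * when tasks are blocked on I/O).
 */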
4741
4742
4743
4744
4745
4746
4747void account_steal_ticks(unsigned long ticks)
4748{
4749 account_steal_time(jiffies_to_cputime(ticks));
4750}
4751
4752
4753
4754
4755
4756void account_idle_ticks(unsigned long ticks)
4757{
4758 account_idle_time(jiffies_to_cputime(ticks));
4759}
4760
4761#endif
4762
4763
4764
4765
4766#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4767cputime_t task_utime(struct task_struct *p)
4768{
4769 return p->utime;
4770}
4771
4772cputime_t task_stime(struct task_struct *p)
4773{
4774 return p->stime;
4775}
4776#else
4777cputime_t task_utime(struct task_struct *p)
4778{
4779 clock_t utime = cputime_to_clock_t(p->utime),
4780 total = utime + cputime_to_clock_t(p->stime);
4781 u64 temp;
4782
4783
4784
4785
4786 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
4787
4788 if (total) {
4789 temp *= utime;
4790 do_div(temp, total);
4791 }
4792 utime = (clock_t)temp;
4793
4794 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
4795 return p->prev_utime;
4796}
4797
4798cputime_t task_stime(struct task_struct *p)
4799{
4800 clock_t stime;
4801
4802
4803
4804
4805
4806
4807 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
4808 cputime_to_clock_t(task_utime(p));
4809
4810 if (stime >= 0)
4811 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
4812
4813 return p->prev_stime;
4814}
4815#endif
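
/*
 * Worked example for the !CONFIG_VIRT_CPU_ACCOUNTING case above
 * (illustrative numbers): the tick-sampled utime/stime are only used as a
 * ratio, and the precise sum_exec_runtime is split in that ratio.  If
 * sum_exec_runtime corresponds to 1000 clock_t and the samples are
 * utime = 30, stime = 10 ticks, then task_utime() reports ~750 and
 * task_stime() ~250.  prev_utime/prev_stime keep the reported values
 * monotonic even when the sampled ratio shifts between calls.
 */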
4816
4817inline cputime_t task_gtime(struct task_struct *p)
4818{
4819 return p->gtime;
4820}
4821
/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */
4829void scheduler_tick(void)
4830{
4831 int cpu = smp_processor_id();
4832 struct rq *rq = cpu_rq(cpu);
4833 struct task_struct *curr = rq->curr;
4834
4835 sched_clock_tick();
4836
4837 spin_lock(&rq->lock);
4838 update_rq_clock(rq);
4839 update_cpu_load(rq);
4840 curr->sched_class->task_tick(rq, curr, 0);
4841 spin_unlock(&rq->lock);
4842
4843#ifdef CONFIG_SMP
4844 rq->idle_at_tick = idle_cpu(cpu);
4845 trigger_load_balance(rq, cpu);
4846#endif
4847}
4848
4849notrace unsigned long get_parent_ip(unsigned long addr)
4850{
4851 if (in_lock_functions(addr)) {
4852 addr = CALLER_ADDR2;
4853 if (in_lock_functions(addr))
4854 addr = CALLER_ADDR3;
4855 }
4856 return addr;
4857}
4858
4859#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4860 defined(CONFIG_PREEMPT_TRACER))
4861
4862void __kprobes add_preempt_count(int val)
4863{
4864#ifdef CONFIG_DEBUG_PREEMPT
4865
4866
4867
4868 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4869 return;
4870#endif
4871 preempt_count() += val;
4872#ifdef CONFIG_DEBUG_PREEMPT
4873
4874
4875
4876 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4877 PREEMPT_MASK - 10);
4878#endif
4879 if (preempt_count() == val)
4880 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4881}
4882EXPORT_SYMBOL(add_preempt_count);
4883
4884void __kprobes sub_preempt_count(int val)
4885{
4886#ifdef CONFIG_DEBUG_PREEMPT
4887
4888
4889
4890 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4891 return;
4892
4893
4894
4895 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4896 !(preempt_count() & PREEMPT_MASK)))
4897 return;
4898#endif
4899
4900 if (preempt_count() == val)
4901 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4902 preempt_count() -= val;
4903}
4904EXPORT_SYMBOL(sub_preempt_count);
4905
4906#endif
4907
4908
4909
4910
4911static noinline void __schedule_bug(struct task_struct *prev)
4912{
4913 struct pt_regs *regs = get_irq_regs();
4914
4915 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4916 prev->comm, prev->pid, preempt_count());
4917
4918 debug_show_held_locks(prev);
4919 print_modules();
4920 if (irqs_disabled())
4921 print_irqtrace_events(prev);
4922
4923 if (regs)
4924 show_regs(regs);
4925 else
4926 dump_stack();
4927}
4928
4929
4930
4931
4932static inline void schedule_debug(struct task_struct *prev)
4933{
4934
4935
4936
4937
4938
4939 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4940 __schedule_bug(prev);
4941
4942 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4943
4944 schedstat_inc(this_rq(), sched_count);
4945#ifdef CONFIG_SCHEDSTATS
4946 if (unlikely(prev->lock_depth >= 0)) {
4947 schedstat_inc(this_rq(), bkl_count);
4948 schedstat_inc(prev, sched_info.bkl_count);
4949 }
4950#endif
4951}
4952
4953static void put_prev_task(struct rq *rq, struct task_struct *prev)
4954{
4955 if (prev->state == TASK_RUNNING) {
4956 u64 runtime = prev->se.sum_exec_runtime;
4957
4958 runtime -= prev->se.prev_sum_exec_runtime;
4959 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970 update_avg(&prev->se.avg_overlap, runtime);
4971 }
4972 prev->sched_class->put_prev_task(rq, prev);
4973}
4974
4975
4976
4977
4978static inline struct task_struct *
4979pick_next_task(struct rq *rq)
4980{
4981 const struct sched_class *class;
4982 struct task_struct *p;
4983
	/*
	 * Optimization: we know that if all tasks are in
	 * the fair class we can call that function directly:
	 */
4988 if (likely(rq->nr_running == rq->cfs.nr_running)) {
4989 p = fair_sched_class.pick_next_task(rq);
4990 if (likely(p))
4991 return p;
4992 }
4993
4994 class = sched_class_highest;
4995 for ( ; ; ) {
4996 p = class->pick_next_task(rq);
4997 if (p)
4998 return p;
4999
5000
5001
5002
5003 class = class->next;
5004 }
5005}
5006
/*
 * __schedule() is the main scheduler function.
 */
5010asmlinkage void __sched __schedule(void)
5011{
5012 struct task_struct *prev, *next;
5013 unsigned long *switch_count;
5014 struct rq *rq;
5015 int cpu;
5016
5017 cpu = smp_processor_id();
5018 rq = cpu_rq(cpu);
5019 rcu_qsctr_inc(cpu);
5020 prev = rq->curr;
5021 switch_count = &prev->nivcsw;
5022
5023 release_kernel_lock(prev);
5024need_resched_nonpreemptible:
5025
5026 schedule_debug(prev);
5027
5028 if (sched_feat(HRTICK))
5029 hrtick_clear(rq);
5030
5031 spin_lock_irq(&rq->lock);
5032 update_rq_clock(rq);
5033 clear_tsk_need_resched(prev);
5034
5035 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
5036 if (unlikely(signal_pending_state(prev->state, prev)))
5037 prev->state = TASK_RUNNING;
5038 else
5039 deactivate_task(rq, prev, 1);
5040 switch_count = &prev->nvcsw;
5041 }
5042
5043#ifdef CONFIG_SMP
5044 if (prev->sched_class->pre_schedule)
5045 prev->sched_class->pre_schedule(rq, prev);
5046#endif
5047
5048 if (unlikely(!rq->nr_running))
5049 idle_balance(cpu, rq);
5050
5051 put_prev_task(rq, prev);
5052 next = pick_next_task(rq);
5053