/*
 *  kernel/sched.c
 *
 *  Kernel scheduler and related syscalls
 */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/reciprocal_div.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <trace/sched.h>

#include <asm/tlb.h>
#include <asm/irq_regs.h>

#include "sched_cpupri.h"
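/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */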
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)

#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))

#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

#define NICE_0_LOAD SCHED_LOAD_SCALE
#define NICE_0_SHIFT SCHED_LOAD_SHIFT

#define DEF_TIMESLICE (100 * HZ / 1000)

#define RUNTIME_INF ((u64)~0ULL)
120
121DEFINE_TRACE(sched_wait_task);
122DEFINE_TRACE(sched_wakeup);
123DEFINE_TRACE(sched_wakeup_new);
124DEFINE_TRACE(sched_switch);
125DEFINE_TRACE(sched_migrate_task);
126
127#ifdef CONFIG_SMP
128
129static void double_rq_lock(struct rq *rq1, struct rq *rq2);
130
131
132
133
134
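/*
 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
 * Since cpu_power is a 'constant', we can use a reciprocal divide.
 */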
135static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
136{
137 return reciprocal_divide(load, sg->reciprocal_cpu_power);
138}
139
140
141
142
143
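/*
 * Each time a sched group cpu_power is changed,
 * we must compute its reciprocal value.
 */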
144static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
145{
146 sg->__cpu_power += val;
147 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
148}
149#endif
150
151static inline int rt_policy(int policy)
152{
153 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
154 return 1;
155 return 0;
156}
157
158static inline int task_has_rt_policy(struct task_struct *p)
159{
160 return rt_policy(p->policy);
161}
162
163
164
165
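/*
 * This is the priority-queue data structure of the RT scheduling class:
 */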
166struct rt_prio_array {
167 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);
168 struct list_head queue[MAX_RT_PRIO];
169};
170
171struct rt_bandwidth {
172
173 spinlock_t rt_runtime_lock;
174 ktime_t rt_period;
175 u64 rt_runtime;
176 struct hrtimer rt_period_timer;
177};
178
179static struct rt_bandwidth def_rt_bandwidth;
180
181static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
182
183static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
184{
185 struct rt_bandwidth *rt_b =
186 container_of(timer, struct rt_bandwidth, rt_period_timer);
187 ktime_t now;
188 int overrun;
189 int idle = 0;
190
191 for (;;) {
192 now = hrtimer_cb_get_time(timer);
193 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
194
195 if (!overrun)
196 break;
197
198 idle = do_sched_rt_period_timer(rt_b, overrun);
199 }
200
201 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
202}
203
204static
205void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
206{
207 rt_b->rt_period = ns_to_ktime(period);
208 rt_b->rt_runtime = runtime;
209
210 spin_lock_init(&rt_b->rt_runtime_lock);
211
212 hrtimer_init(&rt_b->rt_period_timer,
213 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
214 rt_b->rt_period_timer.function = sched_rt_period_timer;
215}
216
217static inline int rt_bandwidth_enabled(void)
218{
219 return sysctl_sched_rt_runtime >= 0;
220}
221
222static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
223{
224 ktime_t now;
225
226 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
227 return;
228
229 if (hrtimer_active(&rt_b->rt_period_timer))
230 return;
231
232 spin_lock(&rt_b->rt_runtime_lock);
233 for (;;) {
234 unsigned long delta;
235 ktime_t soft, hard;
236
237 if (hrtimer_active(&rt_b->rt_period_timer))
238 break;
239
240 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
241 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
242
243 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
244 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
245 delta = ktime_to_ns(ktime_sub(hard, soft));
246 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
247 HRTIMER_MODE_ABS, 0);
248 }
249 spin_unlock(&rt_b->rt_runtime_lock);
250}
251
252#ifdef CONFIG_RT_GROUP_SCHED
253static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
254{
255 hrtimer_cancel(&rt_b->rt_period_timer);
256}
257#endif
258
259
260
261
262
263static DEFINE_MUTEX(sched_domains_mutex);
264
265#ifdef CONFIG_GROUP_SCHED
266
267#include <linux/cgroup.h>
268
269struct cfs_rq;
270
271static LIST_HEAD(task_groups);
272
273
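/* Task group related information */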
274struct task_group {
275#ifdef CONFIG_CGROUP_SCHED
276 struct cgroup_subsys_state css;
277#endif
278
279#ifdef CONFIG_USER_SCHED
280 uid_t uid;
281#endif
282
283#ifdef CONFIG_FAIR_GROUP_SCHED
284
285 struct sched_entity **se;
286
287 struct cfs_rq **cfs_rq;
288 unsigned long shares;
289#endif
290
291#ifdef CONFIG_RT_GROUP_SCHED
292 struct sched_rt_entity **rt_se;
293 struct rt_rq **rt_rq;
294
295 struct rt_bandwidth rt_bandwidth;
296#endif
297
298 struct rcu_head rcu;
299 struct list_head list;
300
301 struct task_group *parent;
302 struct list_head siblings;
303 struct list_head children;
304};
305
306#ifdef CONFIG_USER_SCHED
307
308
309void set_tg_uid(struct user_struct *user)
310{
311 user->tg->uid = user->uid;
312}
313
314
315
316
317
318
319struct task_group root_task_group;
320
321#ifdef CONFIG_FAIR_GROUP_SCHED
322
323static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
324
325static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
326#endif
327
328#ifdef CONFIG_RT_GROUP_SCHED
329static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
330static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
331#endif
332#else
333#define root_task_group init_task_group
334#endif
335
336
337
338
339static DEFINE_SPINLOCK(task_group_lock);
340
341#ifdef CONFIG_SMP
342static int root_task_group_empty(void)
343{
344 return list_empty(&root_task_group.children);
345}
346#endif
347
348#ifdef CONFIG_FAIR_GROUP_SCHED
349#ifdef CONFIG_USER_SCHED
350# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
351#else
352# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
353#endif
354
355
356
357
358
359
360
361
362
363#define MIN_SHARES 2
364#define MAX_SHARES (1UL << 18)
365
366static int init_task_group_load = INIT_TASK_GROUP_LOAD;
367#endif
368
369
370
371
372struct task_group init_task_group;
373
374
375static inline struct task_group *task_group(struct task_struct *p)
376{
377 struct task_group *tg;
378
379#ifdef CONFIG_USER_SCHED
380 rcu_read_lock();
381 tg = __task_cred(p)->user->tg;
382 rcu_read_unlock();
383#elif defined(CONFIG_CGROUP_SCHED)
384 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
385 struct task_group, css);
386#else
387 tg = &init_task_group;
388#endif
389 return tg;
390}
391
392
393static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
394{
395#ifdef CONFIG_FAIR_GROUP_SCHED
396 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
397 p->se.parent = task_group(p)->se[cpu];
398#endif
399
400#ifdef CONFIG_RT_GROUP_SCHED
401 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
402 p->rt.parent = task_group(p)->rt_se[cpu];
403#endif
404}
405
406#else
407
408#ifdef CONFIG_SMP
409static int root_task_group_empty(void)
410{
411 return 1;
412}
413#endif
414
415static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
416static inline struct task_group *task_group(struct task_struct *p)
417{
418 return NULL;
419}
420
421#endif
422
423
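/* CFS-related fields in a runqueue */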
424struct cfs_rq {
425 struct load_weight load;
426 unsigned long nr_running;
427
428 u64 exec_clock;
429 u64 min_vruntime;
430
431 struct rb_root tasks_timeline;
432 struct rb_node *rb_leftmost;
433
434 struct list_head tasks;
435 struct list_head *balance_iterator;
436
437
438
439
440
441 struct sched_entity *curr, *next, *last;
442
443 unsigned int nr_spread_over;
444
445#ifdef CONFIG_FAIR_GROUP_SCHED
446 struct rq *rq;
447
448
449
450
451
452
453
454
455
456 struct list_head leaf_cfs_rq_list;
457 struct task_group *tg;
458
459#ifdef CONFIG_SMP
460
461
462
463 unsigned long task_weight;
464
465
466
467
468
469
470
471 unsigned long h_load;
472
473
474
475
476 unsigned long shares;
477
478
479
480
481 unsigned long rq_weight;
482#endif
483#endif
484};
485
486
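/* Real-Time classes' related field in a runqueue: */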
487struct rt_rq {
488 struct rt_prio_array active;
489 unsigned long rt_nr_running;
490#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
491 struct {
492 int curr;
493#ifdef CONFIG_SMP
494 int next;
495#endif
496 } highest_prio;
497#endif
498#ifdef CONFIG_SMP
499 unsigned long rt_nr_migratory;
500 unsigned long rt_nr_total;
501 int overloaded;
502 struct plist_head pushable_tasks;
503#endif
504 int rt_throttled;
505 u64 rt_time;
506 u64 rt_runtime;
507
508 spinlock_t rt_runtime_lock;
509
510#ifdef CONFIG_RT_GROUP_SCHED
511 unsigned long rt_nr_boosted;
512
513 struct rq *rq;
514 struct list_head leaf_rt_rq_list;
515 struct task_group *tg;
516 struct sched_rt_entity *rt_se;
517#endif
518};
519
520#ifdef CONFIG_SMP
521
522
523
524
525
526
527
528
529
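/*
 * We add the notion of a root-domain which will be used to define per-domain
 * variables. Each exclusive cpuset essentially defines an island domain by
 * fully partitioning the member cpus from any other cpuset.
 */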
530struct root_domain {
531 atomic_t refcount;
532 cpumask_var_t span;
533 cpumask_var_t online;
534
535
536
537
538
539 cpumask_var_t rto_mask;
540 atomic_t rto_count;
541#ifdef CONFIG_SMP
542 struct cpupri cpupri;
543#endif
544#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
545
546
547
548
549
550 unsigned int sched_mc_preferred_wakeup_cpu;
551#endif
552};
553
554
555
556
557
558static struct root_domain def_root_domain;
559
560#endif
561
562
563
564
565
566
567
568
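/*
 * This is the main, per-CPU runqueue data structure.
 */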
569struct rq {
570
571 spinlock_t lock;
572
573
574
575
576
577 unsigned long nr_running;
578 #define CPU_LOAD_IDX_MAX 5
579 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
580#ifdef CONFIG_NO_HZ
581 unsigned long last_tick_seen;
582 unsigned char in_nohz_recently;
583#endif
584
585 struct load_weight load;
586 unsigned long nr_load_updates;
587 u64 nr_switches;
588
589 struct cfs_rq cfs;
590 struct rt_rq rt;
591
592#ifdef CONFIG_FAIR_GROUP_SCHED
593
594 struct list_head leaf_cfs_rq_list;
595#endif
596#ifdef CONFIG_RT_GROUP_SCHED
597 struct list_head leaf_rt_rq_list;
598#endif
599
600
601
602
603
604
605
606 unsigned long nr_uninterruptible;
607
608 struct task_struct *curr, *idle;
609 unsigned long next_balance;
610 struct mm_struct *prev_mm;
611
612 u64 clock;
613
614 atomic_t nr_iowait;
615
616#ifdef CONFIG_SMP
617 struct root_domain *rd;
618 struct sched_domain *sd;
619
620 unsigned char idle_at_tick;
621
622 int active_balance;
623 int push_cpu;
624
625 int cpu;
626 int online;
627
628 unsigned long avg_load_per_task;
629
630 struct task_struct *migration_thread;
631 struct list_head migration_queue;
632#endif
633
634#ifdef CONFIG_SCHED_HRTICK
635#ifdef CONFIG_SMP
636 int hrtick_csd_pending;
637 struct call_single_data hrtick_csd;
638#endif
639 struct hrtimer hrtick_timer;
640#endif
641
642#ifdef CONFIG_SCHEDSTATS
643
644 struct sched_info rq_sched_info;
645 unsigned long long rq_cpu_time;
646
647
648
649 unsigned int yld_count;
650
651
652 unsigned int sched_switch;
653 unsigned int sched_count;
654 unsigned int sched_goidle;
655
656
657 unsigned int ttwu_count;
658 unsigned int ttwu_local;
659
660
661 unsigned int bkl_count;
662#endif
663};
664
665static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
666
667static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
668{
669 rq->curr->sched_class->check_preempt_curr(rq, p, sync);
670}
671
672static inline int cpu_of(struct rq *rq)
673{
674#ifdef CONFIG_SMP
675 return rq->cpu;
676#else
677 return 0;
678#endif
679}
680
681
682
683
684
685
686
687
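/*
 * The domain tree (rq->sd) is protected by RCU; it may only be
 * traversed from within a preempt-disabled section.
 */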
688#define for_each_domain(cpu, __sd) \
689 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
690
691#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
692#define this_rq() (&__get_cpu_var(runqueues))
693#define task_rq(p) cpu_rq(task_cpu(p))
694#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
695
696static inline void update_rq_clock(struct rq *rq)
697{
698 rq->clock = sched_clock_cpu(cpu_of(rq));
699}
700
701
702
703
704#ifdef CONFIG_SCHED_DEBUG
705# define const_debug __read_mostly
706#else
707# define const_debug static const
708#endif
709
710
711
712
713
714
715
716
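/**
 * runqueue_is_locked
 *
 * Returns true if the current cpu runqueue is locked.
 * This interface allows printk to be called with the runqueue lock
 * held and know whether or not it is OK to wake up the klogd.
 */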
717int runqueue_is_locked(void)
718{
719 int cpu = get_cpu();
720 struct rq *rq = cpu_rq(cpu);
721 int ret;
722
723 ret = spin_is_locked(&rq->lock);
724 put_cpu();
725 return ret;
726}
727
728
729
730
731
732#define SCHED_FEAT(name, enabled) \
733 __SCHED_FEAT_##name ,
734
735enum {
736#include "sched_features.h"
737};
738
739#undef SCHED_FEAT
740
741#define SCHED_FEAT(name, enabled) \
742 (1UL << __SCHED_FEAT_##name) * enabled |
743
744const_debug unsigned int sysctl_sched_features =
745#include "sched_features.h"
746 0;
747
748#undef SCHED_FEAT
749
750#ifdef CONFIG_SCHED_DEBUG
751#define SCHED_FEAT(name, enabled) \
752 #name ,
753
754static __read_mostly char *sched_feat_names[] = {
755#include "sched_features.h"
756 NULL
757};
758
759#undef SCHED_FEAT
760
761static int sched_feat_show(struct seq_file *m, void *v)
762{
763 int i;
764
765 for (i = 0; sched_feat_names[i]; i++) {
766 if (!(sysctl_sched_features & (1UL << i)))
767 seq_puts(m, "NO_");
768 seq_printf(m, "%s ", sched_feat_names[i]);
769 }
770 seq_puts(m, "\n");
771
772 return 0;
773}
774
775static ssize_t
776sched_feat_write(struct file *filp, const char __user *ubuf,
777 size_t cnt, loff_t *ppos)
778{
779 char buf[64];
780 char *cmp = buf;
781 int neg = 0;
782 int i;
783
784 if (cnt > 63)
785 cnt = 63;
786
787 if (copy_from_user(&buf, ubuf, cnt))
788 return -EFAULT;
789
790 buf[cnt] = 0;
791
792 if (strncmp(buf, "NO_", 3) == 0) {
793 neg = 1;
794 cmp += 3;
795 }
796
797 for (i = 0; sched_feat_names[i]; i++) {
798 int len = strlen(sched_feat_names[i]);
799
800 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
801 if (neg)
802 sysctl_sched_features &= ~(1UL << i);
803 else
804 sysctl_sched_features |= (1UL << i);
805 break;
806 }
807 }
808
809 if (!sched_feat_names[i])
810 return -EINVAL;
811
812 filp->f_pos += cnt;
813
814 return cnt;
815}
816
817static int sched_feat_open(struct inode *inode, struct file *filp)
818{
819 return single_open(filp, sched_feat_show, NULL);
820}
821
822static struct file_operations sched_feat_fops = {
823 .open = sched_feat_open,
824 .write = sched_feat_write,
825 .read = seq_read,
826 .llseek = seq_lseek,
827 .release = single_release,
828};
829
830static __init int sched_init_debug(void)
831{
832 debugfs_create_file("sched_features", 0644, NULL, NULL,
833 &sched_feat_fops);
834
835 return 0;
836}
837late_initcall(sched_init_debug);
838
839#endif
840
841#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
842
843
844
845
846
847const_debug unsigned int sysctl_sched_nr_migrate = 32;
848
849
850
851
852
853unsigned int sysctl_sched_shares_ratelimit = 250000;
854
855
856
857
858
859
860unsigned int sysctl_sched_shares_thresh = 4;
861
862
863
864
865
866unsigned int sysctl_sched_rt_period = 1000000;
867
868static __read_mostly int scheduler_running;
869
870
871
872
873
874int sysctl_sched_rt_runtime = 950000;
875
876static inline u64 global_rt_period(void)
877{
878 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
879}
880
881static inline u64 global_rt_runtime(void)
882{
883 if (sysctl_sched_rt_runtime < 0)
884 return RUNTIME_INF;
885
886 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
887}
888
889#ifndef prepare_arch_switch
890# define prepare_arch_switch(next) do { } while (0)
891#endif
892#ifndef finish_arch_switch
893# define finish_arch_switch(prev) do { } while (0)
894#endif
895
896static inline int task_current(struct rq *rq, struct task_struct *p)
897{
898 return rq->curr == p;
899}
900
901#ifndef __ARCH_WANT_UNLOCKED_CTXSW
902static inline int task_running(struct rq *rq, struct task_struct *p)
903{
904 return task_current(rq, p);
905}
906
907static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
908{
909}
910
911static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
912{
913#ifdef CONFIG_DEBUG_SPINLOCK
914
915 rq->lock.owner = current;
916#endif
917
918
919
920
921
922 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
923
924 spin_unlock_irq(&rq->lock);
925}
926
927#else
928static inline int task_running(struct rq *rq, struct task_struct *p)
929{
930#ifdef CONFIG_SMP
931 return p->oncpu;
932#else
933 return task_current(rq, p);
934#endif
935}
936
937static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
938{
939#ifdef CONFIG_SMP
940
941
942
943
944
945 next->oncpu = 1;
946#endif
947#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
948 spin_unlock_irq(&rq->lock);
949#else
950 spin_unlock(&rq->lock);
951#endif
952}
953
954static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
955{
956#ifdef CONFIG_SMP
957
958
959
960
961
962 smp_wmb();
963 prev->oncpu = 0;
964#endif
965#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
966 local_irq_enable();
967#endif
968}
969#endif
970
971
972
973
974
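/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called with interrupts disabled.
 */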
975static inline struct rq *__task_rq_lock(struct task_struct *p)
976 __acquires(rq->lock)
977{
978 for (;;) {
979 struct rq *rq = task_rq(p);
980 spin_lock(&rq->lock);
981 if (likely(rq == task_rq(p)))
982 return rq;
983 spin_unlock(&rq->lock);
984 }
985}
986
987
988
989
990
991
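/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */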
992static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
993 __acquires(rq->lock)
994{
995 struct rq *rq;
996
997 for (;;) {
998 local_irq_save(*flags);
999 rq = task_rq(p);
1000 spin_lock(&rq->lock);
1001 if (likely(rq == task_rq(p)))
1002 return rq;
1003 spin_unlock_irqrestore(&rq->lock, *flags);
1004 }
1005}
1006
1007void task_rq_unlock_wait(struct task_struct *p)
1008{
1009 struct rq *rq = task_rq(p);
1010
1011 smp_mb();
1012 spin_unlock_wait(&rq->lock);
1013}
1014
1015static void __task_rq_unlock(struct rq *rq)
1016 __releases(rq->lock)
1017{
1018 spin_unlock(&rq->lock);
1019}
1020
1021static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
1022 __releases(rq->lock)
1023{
1024 spin_unlock_irqrestore(&rq->lock, *flags);
1025}
1026
1027
1028
1029
1030static struct rq *this_rq_lock(void)
1031 __acquires(rq->lock)
1032{
1033 struct rq *rq;
1034
1035 local_irq_disable();
1036 rq = this_rq();
1037 spin_lock(&rq->lock);
1038
1039 return rq;
1040}
1041
1042#ifdef CONFIG_SCHED_HRTICK
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
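/*
 * Use hrtick when:
 *  - enabled by features
 *  - hrtimer is actually high res
 */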
1059static inline int hrtick_enabled(struct rq *rq)
1060{
1061 if (!sched_feat(HRTICK))
1062 return 0;
1063 if (!cpu_active(cpu_of(rq)))
1064 return 0;
1065 return hrtimer_is_hres_active(&rq->hrtick_timer);
1066}
1067
1068static void hrtick_clear(struct rq *rq)
1069{
1070 if (hrtimer_active(&rq->hrtick_timer))
1071 hrtimer_cancel(&rq->hrtick_timer);
1072}
1073
1074
1075
1076
1077
1078static enum hrtimer_restart hrtick(struct hrtimer *timer)
1079{
1080 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1081
1082 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1083
1084 spin_lock(&rq->lock);
1085 update_rq_clock(rq);
1086 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1087 spin_unlock(&rq->lock);
1088
1089 return HRTIMER_NORESTART;
1090}
1091
1092#ifdef CONFIG_SMP
1093
1094
1095
1096static void __hrtick_start(void *arg)
1097{
1098 struct rq *rq = arg;
1099
1100 spin_lock(&rq->lock);
1101 hrtimer_restart(&rq->hrtick_timer);
1102 rq->hrtick_csd_pending = 0;
1103 spin_unlock(&rq->lock);
1104}
1105
1106
1107
1108
1109
1110
1111static void hrtick_start(struct rq *rq, u64 delay)
1112{
1113 struct hrtimer *timer = &rq->hrtick_timer;
1114 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1115
1116 hrtimer_set_expires(timer, time);
1117
1118 if (rq == this_rq()) {
1119 hrtimer_restart(timer);
1120 } else if (!rq->hrtick_csd_pending) {
1121 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1122 rq->hrtick_csd_pending = 1;
1123 }
1124}
1125
1126static int
1127hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1128{
1129 int cpu = (int)(long)hcpu;
1130
1131 switch (action) {
1132 case CPU_UP_CANCELED:
1133 case CPU_UP_CANCELED_FROZEN:
1134 case CPU_DOWN_PREPARE:
1135 case CPU_DOWN_PREPARE_FROZEN:
1136 case CPU_DEAD:
1137 case CPU_DEAD_FROZEN:
1138 hrtick_clear(cpu_rq(cpu));
1139 return NOTIFY_OK;
1140 }
1141
1142 return NOTIFY_DONE;
1143}
1144
1145static __init void init_hrtick(void)
1146{
1147 hotcpu_notifier(hotplug_hrtick, 0);
1148}
1149#else
1150
1151
1152
1153
1154
1155static void hrtick_start(struct rq *rq, u64 delay)
1156{
1157 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1158 HRTIMER_MODE_REL, 0);
1159}
1160
1161static inline void init_hrtick(void)
1162{
1163}
1164#endif
1165
1166static void init_rq_hrtick(struct rq *rq)
1167{
1168#ifdef CONFIG_SMP
1169 rq->hrtick_csd_pending = 0;
1170
1171 rq->hrtick_csd.flags = 0;
1172 rq->hrtick_csd.func = __hrtick_start;
1173 rq->hrtick_csd.info = rq;
1174#endif
1175
1176 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1177 rq->hrtick_timer.function = hrtick;
1178}
1179#else
1180static inline void hrtick_clear(struct rq *rq)
1181{
1182}
1183
1184static inline void init_rq_hrtick(struct rq *rq)
1185{
1186}
1187
1188static inline void init_hrtick(void)
1189{
1190}
1191#endif
1192
1193
1194
1195
1196
1197
1198
1199
1200#ifdef CONFIG_SMP
1201
1202#ifndef tsk_is_polling
1203#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1204#endif
1205
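/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */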
1206static void resched_task(struct task_struct *p)
1207{
1208 int cpu;
1209
1210 assert_spin_locked(&task_rq(p)->lock);
1211
1212 if (test_tsk_need_resched(p))
1213 return;
1214
1215 set_tsk_need_resched(p);
1216
1217 cpu = task_cpu(p);
1218 if (cpu == smp_processor_id())
1219 return;
1220
1221
1222 smp_mb();
1223 if (!tsk_is_polling(p))
1224 smp_send_reschedule(cpu);
1225}
1226
1227static void resched_cpu(int cpu)
1228{
1229 struct rq *rq = cpu_rq(cpu);
1230 unsigned long flags;
1231
1232 if (!spin_trylock_irqsave(&rq->lock, flags))
1233 return;
1234 resched_task(cpu_curr(cpu));
1235 spin_unlock_irqrestore(&rq->lock, flags);
1236}
1237
1238#ifdef CONFIG_NO_HZ
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249void wake_up_idle_cpu(int cpu)
1250{
1251 struct rq *rq = cpu_rq(cpu);
1252
1253 if (cpu == smp_processor_id())
1254 return;
1255
1256
1257
1258
1259
1260
1261
1262
1263 if (rq->curr != rq->idle)
1264 return;
1265
1266
1267
1268
1269
1270
1271 set_tsk_need_resched(rq->idle);
1272
1273
1274 smp_mb();
1275 if (!tsk_is_polling(rq->idle))
1276 smp_send_reschedule(cpu);
1277}
1278#endif
1279
1280#else
1281static void resched_task(struct task_struct *p)
1282{
1283 assert_spin_locked(&task_rq(p)->lock);
1284 set_tsk_need_resched(p);
1285}
1286#endif
1287
1288#if BITS_PER_LONG == 32
1289# define WMULT_CONST (~0UL)
1290#else
1291# define WMULT_CONST (1UL << 32)
1292#endif
1293
1294#define WMULT_SHIFT 32
1295
1296
1297
1298
1299#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1300
1301
1302
1303
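/*
 * delta *= weight / lw->weight, computed via the precalculated
 * inverse weight: (delta * weight * lw->inv_weight) >> WMULT_SHIFT.
 */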
1304static unsigned long
1305calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1306 struct load_weight *lw)
1307{
1308 u64 tmp;
1309
1310 if (!lw->inv_weight) {
1311 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1312 lw->inv_weight = 1;
1313 else
1314 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1315 / (lw->weight+1);
1316 }
1317
1318 tmp = (u64)delta_exec * weight;
1319
1320
1321
1322 if (unlikely(tmp > WMULT_CONST))
1323 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1324 WMULT_SHIFT/2);
1325 else
1326 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1327
1328 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1329}
1330
1331static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1332{
1333 lw->weight += inc;
1334 lw->inv_weight = 0;
1335}
1336
1337static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1338{
1339 lw->weight -= dec;
1340 lw->inv_weight = 0;
1341}
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352#define WEIGHT_IDLEPRIO 3
1353#define WMULT_IDLEPRIO 1431655765
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
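/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: to achieve that we use
 * a multiplier of ~1.25 between adjacent nice levels.
 */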
1367static const int prio_to_weight[40] = {
1368 88761, 71755, 56483, 46273, 36291,
1369 29154, 23254, 18705, 14949, 11916,
1370 9548, 7620, 6100, 4904, 3906,
1371 3121, 2501, 1991, 1586, 1277,
1372 1024, 820, 655, 526, 423,
1373 335, 272, 215, 172, 137,
1374 110, 87, 70, 56, 45,
1375 36, 29, 23, 18, 15,
1376};
1377
1378
1379
1380
1381
1382
1383
1384
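/*
 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications.
 */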
1385static const u32 prio_to_wmult[40] = {
1386 48388, 59856, 76040, 92818, 118348,
1387 147320, 184698, 229616, 287308, 360437,
1388 449829, 563644, 704093, 875809, 1099582,
1389 1376151, 1717300, 2157191, 2708050, 3363326,
1390 4194304, 5237765, 6557202, 8165337, 10153587,
1391 12820798, 15790321, 19976592, 24970740, 31350126,
1392 39045157, 49367440, 61356676, 76695844, 95443717,
1393 119304647, 148102320, 186737708, 238609294, 286331153,
1394};
1395
1396static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1397
1398
1399
1400
1401
1402
1403struct rq_iterator {
1404 void *arg;
1405 struct task_struct *(*start)(void *);
1406 struct task_struct *(*next)(void *);
1407};
1408
1409#ifdef CONFIG_SMP
1410static unsigned long
1411balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1412 unsigned long max_load_move, struct sched_domain *sd,
1413 enum cpu_idle_type idle, int *all_pinned,
1414 int *this_best_prio, struct rq_iterator *iterator);
1415
1416static int
1417iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1418 struct sched_domain *sd, enum cpu_idle_type idle,
1419 struct rq_iterator *iterator);
1420#endif
1421
1422
1423enum cpuacct_stat_index {
1424 CPUACCT_STAT_USER,
1425 CPUACCT_STAT_SYSTEM,
1426
1427 CPUACCT_STAT_NSTATS,
1428};
1429
1430#ifdef CONFIG_CGROUP_CPUACCT
1431static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1432static void cpuacct_update_stats(struct task_struct *tsk,
1433 enum cpuacct_stat_index idx, cputime_t val);
1434#else
1435static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1436static inline void cpuacct_update_stats(struct task_struct *tsk,
1437 enum cpuacct_stat_index idx, cputime_t val) {}
1438#endif
1439
1440static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1441{
1442 update_load_add(&rq->load, load);
1443}
1444
1445static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1446{
1447 update_load_sub(&rq->load, load);
1448}
1449
1450#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1451typedef int (*tg_visitor)(struct task_group *, void *);
1452
1453
1454
1455
1456
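/*
 * Iterate the full task-group tree, calling @down when first entering a
 * node and @up when leaving it for the final time.
 */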
1457static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1458{
1459 struct task_group *parent, *child;
1460 int ret;
1461
1462 rcu_read_lock();
1463 parent = &root_task_group;
1464down:
1465 ret = (*down)(parent, data);
1466 if (ret)
1467 goto out_unlock;
1468 list_for_each_entry_rcu(child, &parent->children, siblings) {
1469 parent = child;
1470 goto down;
1471
1472up:
1473 continue;
1474 }
1475 ret = (*up)(parent, data);
1476 if (ret)
1477 goto out_unlock;
1478
1479 child = parent;
1480 parent = parent->parent;
1481 if (parent)
1482 goto up;
1483out_unlock:
1484 rcu_read_unlock();
1485
1486 return ret;
1487}
1488
1489static int tg_nop(struct task_group *tg, void *data)
1490{
1491 return 0;
1492}
1493#endif
1494
1495#ifdef CONFIG_SMP
1496static unsigned long source_load(int cpu, int type);
1497static unsigned long target_load(int cpu, int type);
1498static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1499
1500static unsigned long cpu_avg_load_per_task(int cpu)
1501{
1502 struct rq *rq = cpu_rq(cpu);
1503 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1504
1505 if (nr_running)
1506 rq->avg_load_per_task = rq->load.weight / nr_running;
1507 else
1508 rq->avg_load_per_task = 0;
1509
1510 return rq->avg_load_per_task;
1511}
1512
1513#ifdef CONFIG_FAIR_GROUP_SCHED
1514
1515static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1516
1517
1518
1519
1520static void
1521update_group_shares_cpu(struct task_group *tg, int cpu,
1522 unsigned long sd_shares, unsigned long sd_rq_weight)
1523{
1524 unsigned long shares;
1525 unsigned long rq_weight;
1526
1527 if (!tg->se[cpu])
1528 return;
1529
1530 rq_weight = tg->cfs_rq[cpu]->rq_weight;
1531
1532
1533
1534
1535
1536
1537
1538 shares = (sd_shares * rq_weight) / sd_rq_weight;
1539 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1540
1541 if (abs(shares - tg->se[cpu]->load.weight) >
1542 sysctl_sched_shares_thresh) {
1543 struct rq *rq = cpu_rq(cpu);
1544 unsigned long flags;
1545
1546 spin_lock_irqsave(&rq->lock, flags);
1547 tg->cfs_rq[cpu]->shares = shares;
1548
1549 __set_se_shares(tg->se[cpu], shares);
1550 spin_unlock_irqrestore(&rq->lock, flags);
1551 }
1552}
1553
1554
1555
1556
1557
1558
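/*
 * Re-compute a task group's per-cpu shares over the given domain.
 * This needs to be done in a bottom-up fashion because the rq weight of a
 * parent group depends on the shares of its child groups.
 */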
1559static int tg_shares_up(struct task_group *tg, void *data)
1560{
1561 unsigned long weight, rq_weight = 0;
1562 unsigned long shares = 0;
1563 struct sched_domain *sd = data;
1564 int i;
1565
1566 for_each_cpu(i, sched_domain_span(sd)) {
1567
1568
1569
1570
1571
1572 weight = tg->cfs_rq[i]->load.weight;
1573 if (!weight)
1574 weight = NICE_0_LOAD;
1575
1576 tg->cfs_rq[i]->rq_weight = weight;
1577 rq_weight += weight;
1578 shares += tg->cfs_rq[i]->shares;
1579 }
1580
1581 if ((!shares && rq_weight) || shares > tg->shares)
1582 shares = tg->shares;
1583
1584 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1585 shares = tg->shares;
1586
1587 for_each_cpu(i, sched_domain_span(sd))
1588 update_group_shares_cpu(tg, i, shares, rq_weight);
1589
1590 return 0;
1591}
1592
1593
1594
1595
1596
1597
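/*
 * Compute the cpu's hierarchical load factor for each task group.
 * This needs to be done in a top-down fashion because the load of a child
 * group is a fraction of its parent's load.
 */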
1598static int tg_load_down(struct task_group *tg, void *data)
1599{
1600 unsigned long load;
1601 long cpu = (long)data;
1602
1603 if (!tg->parent) {
1604 load = cpu_rq(cpu)->load.weight;
1605 } else {
1606 load = tg->parent->cfs_rq[cpu]->h_load;
1607 load *= tg->cfs_rq[cpu]->shares;
1608 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1609 }
1610
1611 tg->cfs_rq[cpu]->h_load = load;
1612
1613 return 0;
1614}
1615
1616static void update_shares(struct sched_domain *sd)
1617{
1618 u64 now = cpu_clock(raw_smp_processor_id());
1619 s64 elapsed = now - sd->last_update;
1620
1621 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1622 sd->last_update = now;
1623 walk_tg_tree(tg_nop, tg_shares_up, sd);
1624 }
1625}
1626
1627static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1628{
1629 spin_unlock(&rq->lock);
1630 update_shares(sd);
1631 spin_lock(&rq->lock);
1632}
1633
1634static void update_h_load(long cpu)
1635{
1636 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1637}
1638
1639#else
1640
1641static inline void update_shares(struct sched_domain *sd)
1642{
1643}
1644
1645static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1646{
1647}
1648
1649#endif
1650
1651#ifdef CONFIG_PREEMPT
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1662 __releases(this_rq->lock)
1663 __acquires(busiest->lock)
1664 __acquires(this_rq->lock)
1665{
1666 spin_unlock(&this_rq->lock);
1667 double_rq_lock(this_rq, busiest);
1668
1669 return 1;
1670}
1671
1672#else
1673
1674
1675
1676
1677
1678
1679
1680static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1681 __releases(this_rq->lock)
1682 __acquires(busiest->lock)
1683 __acquires(this_rq->lock)
1684{
1685 int ret = 0;
1686
1687 if (unlikely(!spin_trylock(&busiest->lock))) {
1688 if (busiest < this_rq) {
1689 spin_unlock(&this_rq->lock);
1690 spin_lock(&busiest->lock);
1691 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
1692 ret = 1;
1693 } else
1694 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
1695 }
1696 return ret;
1697}
1698
1699#endif
1700
1701
1702
1703
1704static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1705{
1706 if (unlikely(!irqs_disabled())) {
1707
1708 spin_unlock(&this_rq->lock);
1709 BUG_ON(1);
1710 }
1711
1712 return _double_lock_balance(this_rq, busiest);
1713}
1714
1715static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1716 __releases(busiest->lock)
1717{
1718 spin_unlock(&busiest->lock);
1719 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1720}
1721#endif
1722
1723#ifdef CONFIG_FAIR_GROUP_SCHED
1724static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1725{
1726#ifdef CONFIG_SMP
1727 cfs_rq->shares = shares;
1728#endif
1729}
1730#endif
1731
1732#include "sched_stats.h"
1733#include "sched_idletask.c"
1734#include "sched_fair.c"
1735#include "sched_rt.c"
1736#ifdef CONFIG_SCHED_DEBUG
1737# include "sched_debug.c"
1738#endif
1739
1740#define sched_class_highest (&rt_sched_class)
1741#define for_each_class(class) \
1742 for (class = sched_class_highest; class; class = class->next)
1743
1744static void inc_nr_running(struct rq *rq)
1745{
1746 rq->nr_running++;
1747}
1748
1749static void dec_nr_running(struct rq *rq)
1750{
1751 rq->nr_running--;
1752}
1753
1754static void set_load_weight(struct task_struct *p)
1755{
1756 if (task_has_rt_policy(p)) {
1757 p->se.load.weight = prio_to_weight[0] * 2;
1758 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1759 return;
1760 }
1761
1762
1763
1764
1765 if (p->policy == SCHED_IDLE) {
1766 p->se.load.weight = WEIGHT_IDLEPRIO;
1767 p->se.load.inv_weight = WMULT_IDLEPRIO;
1768 return;
1769 }
1770
1771 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1772 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1773}
1774
1775static void update_avg(u64 *avg, u64 sample)
1776{
1777 s64 diff = sample - *avg;
1778 *avg += diff >> 3;
1779}
1780
1781static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1782{
1783 if (wakeup)
1784 p->se.start_runtime = p->se.sum_exec_runtime;
1785
1786 sched_info_queued(p);
1787 p->sched_class->enqueue_task(rq, p, wakeup);
1788 p->se.on_rq = 1;
1789}
1790
1791static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1792{
1793 if (sleep) {
1794 if (p->se.last_wakeup) {
1795 update_avg(&p->se.avg_overlap,
1796 p->se.sum_exec_runtime - p->se.last_wakeup);
1797 p->se.last_wakeup = 0;
1798 } else {
1799 update_avg(&p->se.avg_wakeup,
1800 sysctl_sched_wakeup_granularity);
1801 }
1802 }
1803
1804 sched_info_dequeued(p);
1805 p->sched_class->dequeue_task(rq, p, sleep);
1806 p->se.on_rq = 0;
1807}
1808
1809
1810
1811
1812static inline int __normal_prio(struct task_struct *p)
1813{
1814 return p->static_prio;
1815}
1816
1817
1818
1819
1820
1821
1822
1823
1824static inline int normal_prio(struct task_struct *p)
1825{
1826 int prio;
1827
1828 if (task_has_rt_policy(p))
1829 prio = MAX_RT_PRIO-1 - p->rt_priority;
1830 else
1831 prio = __normal_prio(p);
1832 return prio;
1833}
1834
1835
1836
1837
1838
1839
1840
1841
1842static int effective_prio(struct task_struct *p)
1843{
1844 p->normal_prio = normal_prio(p);
1845
1846
1847
1848
1849
1850 if (!rt_prio(p->prio))
1851 return p->normal_prio;
1852 return p->prio;
1853}
1854
1855
1856
1857
1858static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1859{
1860 if (task_contributes_to_load(p))
1861 rq->nr_uninterruptible--;
1862
1863 enqueue_task(rq, p, wakeup);
1864 inc_nr_running(rq);
1865}
1866
1867
1868
1869
1870static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1871{
1872 if (task_contributes_to_load(p))
1873 rq->nr_uninterruptible++;
1874
1875 dequeue_task(rq, p, sleep);
1876 dec_nr_running(rq);
1877}
1878
1879
1880
1881
1882
1883inline int task_curr(const struct task_struct *p)
1884{
1885 return cpu_curr(task_cpu(p)) == p;
1886}
1887
1888static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1889{
1890 set_task_rq(p, cpu);
1891#ifdef CONFIG_SMP
1892
1893
1894
1895
1896
1897 smp_wmb();
1898 task_thread_info(p)->cpu = cpu;
1899#endif
1900}
1901
1902static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1903 const struct sched_class *prev_class,
1904 int oldprio, int running)
1905{
1906 if (prev_class != p->sched_class) {
1907 if (prev_class->switched_from)
1908 prev_class->switched_from(rq, p, running);
1909 p->sched_class->switched_to(rq, p, running);
1910 } else
1911 p->sched_class->prio_changed(rq, p, oldprio, running);
1912}
1913
1914#ifdef CONFIG_SMP
1915
1916
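/* Used instead of source_load when we know the type == 0 */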
1917static unsigned long weighted_cpuload(const int cpu)
1918{
1919 return cpu_rq(cpu)->load.weight;
1920}
1921
1922
1923
1924
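/*
 * Is this task likely cache-hot:
 */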
1925static int
1926task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1927{
1928 s64 delta;
1929
1930
1931
1932
1933 if (sched_feat(CACHE_HOT_BUDDY) &&
1934 (&p->se == cfs_rq_of(&p->se)->next ||
1935 &p->se == cfs_rq_of(&p->se)->last))
1936 return 1;
1937
1938 if (p->sched_class != &fair_sched_class)
1939 return 0;
1940
1941 if (sysctl_sched_migration_cost == -1)
1942 return 1;
1943 if (sysctl_sched_migration_cost == 0)
1944 return 0;
1945
1946 delta = now - p->se.exec_start;
1947
1948 return delta < (s64)sysctl_sched_migration_cost;
1949}
1950
1951
1952void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1953{
1954 int old_cpu = task_cpu(p);
1955 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1956 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
1957 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
1958 u64 clock_offset;
1959
1960 clock_offset = old_rq->clock - new_rq->clock;
1961
1962 trace_sched_migrate_task(p, task_cpu(p), new_cpu);
1963
1964#ifdef CONFIG_SCHEDSTATS
1965 if (p->se.wait_start)
1966 p->se.wait_start -= clock_offset;
1967 if (p->se.sleep_start)
1968 p->se.sleep_start -= clock_offset;
1969 if (p->se.block_start)
1970 p->se.block_start -= clock_offset;
1971 if (old_cpu != new_cpu) {
1972 schedstat_inc(p, se.nr_migrations);
1973 if (task_hot(p, old_rq->clock, NULL))
1974 schedstat_inc(p, se.nr_forced2_migrations);
1975 }
1976#endif
1977 p->se.vruntime -= old_cfsrq->min_vruntime -
1978 new_cfsrq->min_vruntime;
1979
1980 __set_task_cpu(p, new_cpu);
1981}
1982
1983struct migration_req {
1984 struct list_head list;
1985
1986 struct task_struct *task;
1987 int dest_cpu;
1988
1989 struct completion done;
1990};
1991
1992
1993
1994
1995
1996static int
1997migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1998{
1999 struct rq *rq = task_rq(p);
2000
2001
2002
2003
2004
2005 if (!p->se.on_rq && !task_running(rq, p)) {
2006 set_task_cpu(p, dest_cpu);
2007 return 0;
2008 }
2009
2010 init_completion(&req->done);
2011 req->task = p;
2012 req->dest_cpu = dest_cpu;
2013 list_add(&req->list, &rq->migration_queue);
2014
2015 return 1;
2016}
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
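/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
 * not expected to change. If it changes, i.e. @p might have woken up,
 * then return zero. When we succeed in waiting for @p to be off its CPU,
 * we return a positive number (its total switch count); a zero return
 * value tells the caller that @p changed state or was still running.
 */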
2034unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2035{
2036 unsigned long flags;
2037 int running, on_rq;
2038 unsigned long ncsw;
2039 struct rq *rq;
2040
2041 for (;;) {
2042
2043
2044
2045
2046
2047
2048 rq = task_rq(p);
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061 while (task_running(rq, p)) {
2062 if (match_state && unlikely(p->state != match_state))
2063 return 0;
2064 cpu_relax();
2065 }
2066
2067
2068
2069
2070
2071
2072 rq = task_rq_lock(p, &flags);
2073 trace_sched_wait_task(rq, p);
2074 running = task_running(rq, p);
2075 on_rq = p->se.on_rq;
2076 ncsw = 0;
2077 if (!match_state || p->state == match_state)
2078 ncsw = p->nvcsw | LONG_MIN;
2079 task_rq_unlock(rq, &flags);
2080
2081
2082
2083
2084 if (unlikely(!ncsw))
2085 break;
2086
2087
2088
2089
2090
2091
2092
2093 if (unlikely(running)) {
2094 cpu_relax();
2095 continue;
2096 }
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107 if (unlikely(on_rq)) {
2108 schedule_timeout_uninterruptible(1);
2109 continue;
2110 }
2111
2112
2113
2114
2115
2116
2117 break;
2118 }
2119
2120 return ncsw;
2121}
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
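/*
 * kick_process - kick a running thread to enter/exit the kernel
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay (to get signals handled).
 */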
2136void kick_process(struct task_struct *p)
2137{
2138 int cpu;
2139
2140 preempt_disable();
2141 cpu = task_cpu(p);
2142 if ((cpu != smp_processor_id()) && task_curr(p))
2143 smp_send_reschedule(cpu);
2144 preempt_enable();
2145}
2146
2147
2148
2149
2150
2151
2152
2153
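/*
 * Return a low guess at the load of a migration-source cpu weighted
 * according to the scheduling class and "nice" value.
 *
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
 */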
2154static unsigned long source_load(int cpu, int type)
2155{
2156 struct rq *rq = cpu_rq(cpu);
2157 unsigned long total = weighted_cpuload(cpu);
2158
2159 if (type == 0 || !sched_feat(LB_BIAS))
2160 return total;
2161
2162 return min(rq->cpu_load[type-1], total);
2163}
2164
2165
2166
2167
2168
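/*
 * Return a high guess at the load of a migration-target cpu weighted
 * according to the scheduling class and "nice" value.
 */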
2169static unsigned long target_load(int cpu, int type)
2170{
2171 struct rq *rq = cpu_rq(cpu);
2172 unsigned long total = weighted_cpuload(cpu);
2173
2174 if (type == 0 || !sched_feat(LB_BIAS))
2175 return total;
2176
2177 return max(rq->cpu_load[type-1], total);
2178}
2179
2180
2181
2182
2183
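/*
 * find_idlest_group finds and returns the least busy CPU group within the
 * domain.
 */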
2184static struct sched_group *
2185find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2186{
2187 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2188 unsigned long min_load = ULONG_MAX, this_load = 0;
2189 int load_idx = sd->forkexec_idx;
2190 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2191
2192 do {
2193 unsigned long load, avg_load;
2194 int local_group;
2195 int i;
2196
2197
2198 if (!cpumask_intersects(sched_group_cpus(group),
2199 &p->cpus_allowed))
2200 continue;
2201
2202 local_group = cpumask_test_cpu(this_cpu,
2203 sched_group_cpus(group));
2204
2205
2206 avg_load = 0;
2207
2208 for_each_cpu(i, sched_group_cpus(group)) {
2209
2210 if (local_group)
2211 load = source_load(i, load_idx);
2212 else
2213 load = target_load(i, load_idx);
2214
2215 avg_load += load;
2216 }
2217
2218
2219 avg_load = sg_div_cpu_power(group,
2220 avg_load * SCHED_LOAD_SCALE);
2221
2222 if (local_group) {
2223 this_load = avg_load;
2224 this = group;
2225 } else if (avg_load < min_load) {
2226 min_load = avg_load;
2227 idlest = group;
2228 }
2229 } while (group = group->next, group != sd->groups);
2230
2231 if (!idlest || 100*this_load < imbalance*min_load)
2232 return NULL;
2233 return idlest;
2234}
2235
2236
2237
2238
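/*
 * find_idlest_cpu - find the idlest cpu among the cpus in group.
 */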
2239static int
2240find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2241{
2242 unsigned long load, min_load = ULONG_MAX;
2243 int idlest = -1;
2244 int i;
2245
2246
2247 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2248 load = weighted_cpuload(i);
2249
2250 if (load < min_load || (load == min_load && i == this_cpu)) {
2251 min_load = load;
2252 idlest = i;
2253 }
2254 }
2255
2256 return idlest;
2257}
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
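/*
 * sched_balance_self: balance the current task (running on cpu) in domains
 * that have the 'flag' flag set. In practice this is SD_BALANCE_FORK and
 * SD_BALANCE_EXEC.
 *
 * Balance, ie. select the least loaded group.
 *
 * Returns the target CPU number, or the same CPU if no balancing is needed.
 *
 * preempt must be disabled.
 */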
2270static int sched_balance_self(int cpu, int flag)
2271{
2272 struct task_struct *t = current;
2273 struct sched_domain *tmp, *sd = NULL;
2274
2275 for_each_domain(cpu, tmp) {
2276
2277
2278
2279 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2280 break;
2281 if (tmp->flags & flag)
2282 sd = tmp;
2283 }
2284
2285 if (sd)
2286 update_shares(sd);
2287
2288 while (sd) {
2289 struct sched_group *group;
2290 int new_cpu, weight;
2291
2292 if (!(sd->flags & flag)) {
2293 sd = sd->child;
2294 continue;
2295 }
2296
2297 group = find_idlest_group(sd, t, cpu);
2298 if (!group) {
2299 sd = sd->child;
2300 continue;
2301 }
2302
2303 new_cpu = find_idlest_cpu(group, t, cpu);
2304 if (new_cpu == -1 || new_cpu == cpu) {
2305
2306 sd = sd->child;
2307 continue;
2308 }
2309
2310
2311 cpu = new_cpu;
2312 weight = cpumask_weight(sched_domain_span(sd));
2313 sd = NULL;
2314 for_each_domain(cpu, tmp) {
2315 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2316 break;
2317 if (tmp->flags & flag)
2318 sd = tmp;
2319 }
2320
2321 }
2322
2323 return cpu;
2324}
2325
2326#endif
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
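/*
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
 * @state: the mask of task states that can be woken
 * @sync: do a synchronous wakeup?
 *
 * Put it on the run-queue if it's not already there.
 *
 * Returns failure only if the task is already active.
 */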
2342static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2343{
2344 int cpu, orig_cpu, this_cpu, success = 0;
2345 unsigned long flags;
2346 long old_state;
2347 struct rq *rq;
2348
2349 if (!sched_feat(SYNC_WAKEUPS))
2350 sync = 0;
2351
2352#ifdef CONFIG_SMP
2353 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2354 struct sched_domain *sd;
2355
2356 this_cpu = raw_smp_processor_id();
2357 cpu = task_cpu(p);
2358
2359 for_each_domain(this_cpu, sd) {
2360 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2361 update_shares(sd);
2362 break;
2363 }
2364 }
2365 }
2366#endif
2367
2368 smp_wmb();
2369 rq = task_rq_lock(p, &flags);
2370 update_rq_clock(rq);
2371 old_state = p->state;
2372 if (!(old_state & state))
2373 goto out;
2374
2375 if (p->se.on_rq)
2376 goto out_running;
2377
2378 cpu = task_cpu(p);
2379 orig_cpu = cpu;
2380 this_cpu = smp_processor_id();
2381
2382#ifdef CONFIG_SMP
2383 if (unlikely(task_running(rq, p)))
2384 goto out_activate;
2385
2386 cpu = p->sched_class->select_task_rq(p, sync);
2387 if (cpu != orig_cpu) {
2388 set_task_cpu(p, cpu);
2389 task_rq_unlock(rq, &flags);
2390
2391 rq = task_rq_lock(p, &flags);
2392 old_state = p->state;
2393 if (!(old_state & state))
2394 goto out;
2395 if (p->se.on_rq)
2396 goto out_running;
2397
2398 this_cpu = smp_processor_id();
2399 cpu = task_cpu(p);
2400 }
2401
2402#ifdef CONFIG_SCHEDSTATS
2403 schedstat_inc(rq, ttwu_count);
2404 if (cpu == this_cpu)
2405 schedstat_inc(rq, ttwu_local);
2406 else {
2407 struct sched_domain *sd;
2408 for_each_domain(this_cpu, sd) {
2409 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2410 schedstat_inc(sd, ttwu_wake_remote);
2411 break;
2412 }
2413 }
2414 }
2415#endif
2416
2417out_activate:
2418#endif
2419 schedstat_inc(p, se.nr_wakeups);
2420 if (sync)
2421 schedstat_inc(p, se.nr_wakeups_sync);
2422 if (orig_cpu != cpu)
2423 schedstat_inc(p, se.nr_wakeups_migrate);
2424 if (cpu == this_cpu)
2425 schedstat_inc(p, se.nr_wakeups_local);
2426 else
2427 schedstat_inc(p, se.nr_wakeups_remote);
2428 activate_task(rq, p, 1);
2429 success = 1;
2430
2431
2432
2433
2434 if (!in_interrupt()) {
 struct sched_entity *se = &current->se;
2436 u64 sample = se->sum_exec_runtime;
2437
2438 if (se->last_wakeup)
2439 sample -= se->last_wakeup;
2440 else
2441 sample -= se->start_runtime;
2442 update_avg(&se->avg_wakeup, sample);
2443
2444 se->last_wakeup = se->sum_exec_runtime;
2445 }
2446
2447out_running:
2448 trace_sched_wakeup(rq, p, success);
2449 check_preempt_curr(rq, p, sync);
2450
2451 p->state = TASK_RUNNING;
2452#ifdef CONFIG_SMP
2453 if (p->sched_class->task_wake_up)
2454 p->sched_class->task_wake_up(rq, p);
2455#endif
2456out:
2457 task_rq_unlock(rq, &flags);
2458
2459 return success;
2460}
2461
2462int wake_up_process(struct task_struct *p)
2463{
2464 return try_to_wake_up(p, TASK_ALL, 0);
2465}
2466EXPORT_SYMBOL(wake_up_process);
2467
2468int wake_up_state(struct task_struct *p, unsigned int state)
2469{
2470 return try_to_wake_up(p, state, 0);
2471}
2472
2473
2474
2475
2476
2477
2478
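/*
 * Perform scheduler related setup for a newly forked process p.
 * p is forked by current.
 */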
2479static void __sched_fork(struct task_struct *p)
2480{
2481 p->se.exec_start = 0;
2482 p->se.sum_exec_runtime = 0;
2483 p->se.prev_sum_exec_runtime = 0;
2484 p->se.last_wakeup = 0;
2485 p->se.avg_overlap = 0;
2486 p->se.start_runtime = 0;
2487 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2488
2489#ifdef CONFIG_SCHEDSTATS
2490 p->se.wait_start = 0;
2491 p->se.sum_sleep_runtime = 0;
2492 p->se.sleep_start = 0;
2493 p->se.block_start = 0;
2494 p->se.sleep_max = 0;
2495 p->se.block_max = 0;
2496 p->se.exec_max = 0;
2497 p->se.slice_max = 0;
2498 p->se.wait_max = 0;
2499#endif
2500
2501 INIT_LIST_HEAD(&p->rt.run_list);
2502 p->se.on_rq = 0;
2503 INIT_LIST_HEAD(&p->se.group_node);
2504
2505#ifdef CONFIG_PREEMPT_NOTIFIERS
2506 INIT_HLIST_HEAD(&p->preempt_notifiers);
2507#endif
2508
2509
2510
2511
2512
2513
2514
2515 p->state = TASK_RUNNING;
2516}
2517
2518
2519
2520
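/*
 * fork()/clone()-time setup:
 */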
2521void sched_fork(struct task_struct *p, int clone_flags)
2522{
2523 int cpu = get_cpu();
2524
2525 __sched_fork(p);
2526
2527#ifdef CONFIG_SMP
2528 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2529#endif
2530 set_task_cpu(p, cpu);
2531
2532
2533
2534
2535 p->prio = current->normal_prio;
2536 if (!rt_prio(p->prio))
2537 p->sched_class = &fair_sched_class;
2538
2539#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2540 if (likely(sched_info_on()))
2541 memset(&p->sched_info, 0, sizeof(p->sched_info));
2542#endif
2543#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2544 p->oncpu = 0;
2545#endif
2546#ifdef CONFIG_PREEMPT
2547
2548 task_thread_info(p)->preempt_count = 1;
2549#endif
2550 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2551
2552 put_cpu();
2553}
2554
2555
2556
2557
2558
2559
2560
2561
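/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */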
2562void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2563{
2564 unsigned long flags;
2565 struct rq *rq;
2566
2567 rq = task_rq_lock(p, &flags);
2568 BUG_ON(p->state != TASK_RUNNING);
2569 update_rq_clock(rq);
2570
2571 p->prio = effective_prio(p);
2572
2573 if (!p->sched_class->task_new || !current->se.on_rq) {
2574 activate_task(rq, p, 0);
2575 } else {
2576
2577
2578
2579
2580 p->sched_class->task_new(rq, p);
2581 inc_nr_running(rq);
2582 }
2583 trace_sched_wakeup_new(rq, p, 1);
2584 check_preempt_curr(rq, p, 0);
2585#ifdef CONFIG_SMP
2586 if (p->sched_class->task_wake_up)
2587 p->sched_class->task_wake_up(rq, p);
2588#endif
2589 task_rq_unlock(rq, &flags);
2590}
2591
2592#ifdef CONFIG_PREEMPT_NOTIFIERS
2593
2594
2595
2596
2597
2598void preempt_notifier_register(struct preempt_notifier *notifier)
2599{
 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2601}
2602EXPORT_SYMBOL_GPL(preempt_notifier_register);
2603
2604
2605
2606
2607
2608
2609
2610void preempt_notifier_unregister(struct preempt_notifier *notifier)
2611{
 hlist_del(&notifier->link);
2613}
2614EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2615
2616static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2617{
2618 struct preempt_notifier *notifier;
2619 struct hlist_node *node;
2620
2621 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2622 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2623}
2624
2625static void
2626fire_sched_out_preempt_notifiers(struct task_struct *curr,
2627 struct task_struct *next)
2628{
2629 struct preempt_notifier *notifier;
2630 struct hlist_node *node;
2631
2632 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2633 notifier->ops->sched_out(notifier, next);
2634}
2635
2636#else
2637
2638static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2639{
2640}
2641
2642static void
2643fire_sched_out_preempt_notifiers(struct task_struct *curr,
2644 struct task_struct *next)
2645{
2646}
2647
2648#endif
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
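/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to
 *
 * Called with the rq lock held and interrupts off. Must be paired with a
 * subsequent finish_task_switch after the context switch.
 */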
2663static inline void
2664prepare_task_switch(struct rq *rq, struct task_struct *prev,
2665 struct task_struct *next)
2666{
2667 fire_sched_out_preempt_notifiers(prev, next);
2668 prepare_lock_switch(rq, next);
2669 prepare_arch_switch(next);
2670}
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
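/**
 * finish_task_switch - clean up after a task-switch
 * @rq: runqueue associated with task-switch
 * @prev: the thread we just switched away from
 *
 * Must be called after the context switch, paired with a prepare_task_switch
 * call before the context switch. It reconciles the locking set up by
 * prepare_task_switch and does any other architecture-specific cleanup.
 */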
2687static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2688 __releases(rq->lock)
2689{
2690 struct mm_struct *mm = rq->prev_mm;
2691 long prev_state;
2692#ifdef CONFIG_SMP
2693 int post_schedule = 0;
2694
2695 if (current->sched_class->needs_post_schedule)
2696 post_schedule = current->sched_class->needs_post_schedule(rq);
2697#endif
2698
2699 rq->prev_mm = NULL;
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712 prev_state = prev->state;
2713 finish_arch_switch(prev);
2714 finish_lock_switch(rq, prev);
2715#ifdef CONFIG_SMP
2716 if (post_schedule)
2717 current->sched_class->post_schedule(rq);
2718#endif
2719
2720 fire_sched_in_preempt_notifiers(current);
2721 if (mm)
2722 mmdrop(mm);
2723 if (unlikely(prev_state == TASK_DEAD)) {
2724
2725
2726
2727
2728 kprobe_flush_task(prev);
2729 put_task_struct(prev);
2730 }
2731}
2732
2733
2734
2735
2736
2737asmlinkage void schedule_tail(struct task_struct *prev)
2738 __releases(rq->lock)
2739{
2740 struct rq *rq = this_rq();
2741
2742 finish_task_switch(rq, prev);
2743#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2744
2745 preempt_enable();
2746#endif
2747 if (current->set_child_tid)
2748 put_user(task_pid_vnr(current), current->set_child_tid);
2749}
2750
2751
2752
2753
2754
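/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */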
2755static inline void
2756context_switch(struct rq *rq, struct task_struct *prev,
2757 struct task_struct *next)
2758{
2759 struct mm_struct *mm, *oldmm;
2760
2761 prepare_task_switch(rq, prev, next);
2762 trace_sched_switch(rq, prev, next);
2763 mm = next->mm;
2764 oldmm = prev->active_mm;
2765
2766
2767
2768
2769
2770 arch_enter_lazy_cpu_mode();
2771
2772 if (unlikely(!mm)) {
2773 next->active_mm = oldmm;
2774 atomic_inc(&oldmm->mm_count);
2775 enter_lazy_tlb(oldmm, next);
2776 } else
2777 switch_mm(oldmm, mm, next);
2778
2779 if (unlikely(!prev->mm)) {
2780 prev->active_mm = NULL;
2781 rq->prev_mm = oldmm;
2782 }
2783
2784
2785
2786
2787
2788
2789#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2790 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2791#endif
2792
2793
2794 switch_to(prev, next, prev);
2795
2796 barrier();
2797
2798
2799
2800
2801
2802 finish_task_switch(this_rq(), prev);
2803}
2804
2805
2806
2807
2808
2809
2810
2811
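/*
 * nr_running, nr_uninterruptible and nr_context_switches:
 *
 * externally visible scheduler statistics: current number of runnable
 * threads, current number of uninterruptible-sleeping threads, total
 * number of context switches performed since bootup.
 */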
2812unsigned long nr_running(void)
2813{
2814 unsigned long i, sum = 0;
2815
2816 for_each_online_cpu(i)
2817 sum += cpu_rq(i)->nr_running;
2818
2819 return sum;
2820}
2821
2822unsigned long nr_uninterruptible(void)
2823{
2824 unsigned long i, sum = 0;
2825
2826 for_each_possible_cpu(i)
2827 sum += cpu_rq(i)->nr_uninterruptible;
2828
2829
2830
2831
2832
2833 if (unlikely((long)sum < 0))
2834 sum = 0;
2835
2836 return sum;
2837}
2838
2839unsigned long long nr_context_switches(void)
2840{
2841 int i;
2842 unsigned long long sum = 0;
2843
2844 for_each_possible_cpu(i)
2845 sum += cpu_rq(i)->nr_switches;
2846
2847 return sum;
2848}
2849
2850unsigned long nr_iowait(void)
2851{
2852 unsigned long i, sum = 0;
2853
2854 for_each_possible_cpu(i)
2855 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2856
2857 return sum;
2858}
2859
2860unsigned long nr_active(void)
2861{
2862 unsigned long i, running = 0, uninterruptible = 0;
2863
2864 for_each_online_cpu(i) {
2865 running += cpu_rq(i)->nr_running;
2866 uninterruptible += cpu_rq(i)->nr_uninterruptible;
2867 }
2868
2869 if (unlikely((long)uninterruptible < 0))
2870 uninterruptible = 0;
2871
2872 return running + uninterruptible;
2873}
2874
2875
2876
2877
2878
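/*
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC).
 */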
2879static void update_cpu_load(struct rq *this_rq)
2880{
2881 unsigned long this_load = this_rq->load.weight;
2882 int i, scale;
2883
2884 this_rq->nr_load_updates++;
2885
2886
2887 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2888 unsigned long old_load, new_load;
2889
2890
2891
2892 old_load = this_rq->cpu_load[i];
2893 new_load = this_load;
2894
2895
2896
2897
2898
2899 if (new_load > old_load)
2900 new_load += scale-1;
2901 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2902 }
2903}
2904
2905#ifdef CONFIG_SMP
2906
2907
2908
2909
2910
2911
2912
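/*
 * double_rq_lock - safely lock two runqueues
 *
 * Note this does not disable interrupts like task_rq_lock,
 * you need to do so manually before calling.
 */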
2913static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2914 __acquires(rq1->lock)
2915 __acquires(rq2->lock)
2916{
2917 BUG_ON(!irqs_disabled());
2918 if (rq1 == rq2) {
2919 spin_lock(&rq1->lock);
2920 __acquire(rq2->lock);
2921 } else {
2922 if (rq1 < rq2) {
2923 spin_lock(&rq1->lock);
2924 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
2925 } else {
2926 spin_lock(&rq2->lock);
2927 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
2928 }
2929 }
2930 update_rq_clock(rq1);
2931 update_rq_clock(rq2);
2932}
2933
2934
2935
2936
2937
2938
2939
2940static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2941 __releases(rq1->lock)
2942 __releases(rq2->lock)
2943{
2944 spin_unlock(&rq1->lock);
2945 if (rq1 != rq2)
2946 spin_unlock(&rq2->lock);
2947 else
2948 __release(rq2->lock);
2949}
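
/*
 * Typical usage of the pair above (a sketch of the pattern used by
 * load_balance() further down): interrupts are disabled by the caller, both
 * locks are taken in a deadlock-safe order and the runqueue clocks are
 * refreshed under the locks:
 *
 *	local_irq_save(flags);
 *	double_rq_lock(this_rq, busiest);
 *	...move tasks between the two runqueues...
 *	double_rq_unlock(this_rq, busiest);
 *	local_irq_restore(flags);
 *
 * The address comparison in double_rq_lock() is what guarantees a global
 * lock ordering, so two cpus locking the same pair concurrently cannot
 * deadlock.
 */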
2950
/*
 * If dest_cpu is allowed for this process, migrate the task to it.
 * The actual migration is done by the per-cpu migration thread: a
 * request is queued on the source runqueue, the thread is woken up
 * and we wait for the request to complete.
 */
2957static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2958{
2959 struct migration_req req;
2960 unsigned long flags;
2961 struct rq *rq;
2962
2963 rq = task_rq_lock(p, &flags);
2964 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
2965 || unlikely(!cpu_active(dest_cpu)))
2966 goto out;
2967
2968
2969 if (migrate_task(p, dest_cpu, &req)) {
2970
2971 struct task_struct *mt = rq->migration_thread;
2972
2973 get_task_struct(mt);
2974 task_rq_unlock(rq, &flags);
2975 wake_up_process(mt);
2976 put_task_struct(mt);
2977 wait_for_completion(&req.done);
2978
2979 return;
2980 }
2981out:
2982 task_rq_unlock(rq, &flags);
2983}
2984
/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */
2989void sched_exec(void)
2990{
2991 int new_cpu, this_cpu = get_cpu();
2992 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2993 put_cpu();
2994 if (new_cpu != this_cpu)
2995 sched_migrate_task(current, new_cpu);
2996}
2997
/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 */
3002static void pull_task(struct rq *src_rq, struct task_struct *p,
3003 struct rq *this_rq, int this_cpu)
3004{
3005 deactivate_task(src_rq, p, 0);
3006 set_task_cpu(p, this_cpu);
3007 activate_task(this_rq, p, 0);
3008
3009
3010
3011
3012 check_preempt_curr(this_rq, p, 0);
3013}
3014
/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
3018static
3019int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3020 struct sched_domain *sd, enum cpu_idle_type idle,
3021 int *all_pinned)
3022{
3023 int tsk_cache_hot = 0;
3024
 /*
  * We do not migrate tasks that are:
  * 1) running (obviously), or
  * 2) cannot be migrated to this CPU due to cpus_allowed, or
  * 3) are cache-hot on their current CPU.
  */
3030 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3031 schedstat_inc(p, se.nr_failed_migrations_affine);
3032 return 0;
3033 }
3034 *all_pinned = 0;
3035
3036 if (task_running(rq, p)) {
3037 schedstat_inc(p, se.nr_failed_migrations_running);
3038 return 0;
3039 }
3040
 /*
  * Aggressive migration if:
  * 1) task is cache cold, or
  * 2) too many balance attempts have failed.
  */
3047 tsk_cache_hot = task_hot(p, rq->clock, sd);
3048 if (!tsk_cache_hot ||
3049 sd->nr_balance_failed > sd->cache_nice_tries) {
3050#ifdef CONFIG_SCHEDSTATS
3051 if (tsk_cache_hot) {
3052 schedstat_inc(sd, lb_hot_gained[idle]);
3053 schedstat_inc(p, se.nr_forced_migrations);
3054 }
3055#endif
3056 return 1;
3057 }
3058
3059 if (tsk_cache_hot) {
3060 schedstat_inc(p, se.nr_failed_migrations_hot);
3061 return 0;
3062 }
3063 return 1;
3064}
3065
3066static unsigned long
3067balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3068 unsigned long max_load_move, struct sched_domain *sd,
3069 enum cpu_idle_type idle, int *all_pinned,
3070 int *this_best_prio, struct rq_iterator *iterator)
3071{
3072 int loops = 0, pulled = 0, pinned = 0;
3073 struct task_struct *p;
3074 long rem_load_move = max_load_move;
3075
3076 if (max_load_move == 0)
3077 goto out;
3078
3079 pinned = 1;
3080
3081
3082
3083
3084 p = iterator->start(iterator->arg);
3085next:
3086 if (!p || loops++ > sysctl_sched_nr_migrate)
3087 goto out;
3088
3089 if ((p->se.load.weight >> 1) > rem_load_move ||
3090 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3091 p = iterator->next(iterator->arg);
3092 goto next;
3093 }
3094
3095 pull_task(busiest, p, this_rq, this_cpu);
3096 pulled++;
3097 rem_load_move -= p->se.load.weight;
3098
3099#ifdef CONFIG_PREEMPT
 /*
  * NEWIDLE balancing is a source of latency, so preemptible kernels
  * will stop after the first task is pulled to minimize the critical
  * section.
  */
3105 if (idle == CPU_NEWLY_IDLE)
3106 goto out;
3107#endif
3108
 /*
  * We only want to steal up to the prescribed amount of weighted load.
  */
3112 if (rem_load_move > 0) {
3113 if (p->prio < *this_best_prio)
3114 *this_best_prio = p->prio;
3115 p = iterator->next(iterator->arg);
3116 goto next;
3117 }
3118out:
3119
3120
3121
3122
3123
3124 schedstat_add(sd, lb_gained[idle], pulled);
3125
3126 if (all_pinned)
3127 *all_pinned = pinned;
3128
3129 return max_load_move - rem_load_move;
3130}
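
/*
 * The iterator contract assumed above: iterator->start(arg) yields the first
 * candidate task or NULL, iterator->next(arg) yields the next candidate or
 * NULL, and both run with the source runqueue locked (the callers of
 * balance_tasks() hold both runqueue locks). In pseudo-code the loop above
 * is simply:
 *
 *	for (p = iterator->start(iterator->arg); p != NULL;
 *	     p = iterator->next(iterator->arg)) {
 *		if (<too heavy or not migratable>)
 *			continue;
 *		pull_task(busiest, p, this_rq, this_cpu);
 *		if (<pulled enough weighted load>)
 *			break;
 *	}
 *
 * with the angle-bracketed conditions standing in for the checks coded with
 * gotos above.
 */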
3131
3132
/*
 * move_tasks tries to move up to max_load_move weighted load from busiest to
 * this_rq, as part of a balancing operation within domain "sd".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
3139static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3140 unsigned long max_load_move,
3141 struct sched_domain *sd, enum cpu_idle_type idle,
3142 int *all_pinned)
3143{
3144 const struct sched_class *class = sched_class_highest;
3145 unsigned long total_load_moved = 0;
3146 int this_best_prio = this_rq->curr->prio;
3147
3148 do {
3149 total_load_moved +=
3150 class->load_balance(this_rq, this_cpu, busiest,
3151 max_load_move - total_load_moved,
3152 sd, idle, all_pinned, &this_best_prio);
3153 class = class->next;
3154
3155#ifdef CONFIG_PREEMPT
3156
3157
3158
3159
3160
3161 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3162 break;
3163#endif
3164 } while (class && max_load_move > total_load_moved);
3165
3166 return total_load_moved > 0;
3167}
3168
3169static int
3170iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3171 struct sched_domain *sd, enum cpu_idle_type idle,
3172 struct rq_iterator *iterator)
3173{
3174 struct task_struct *p = iterator->start(iterator->arg);
3175 int pinned = 0;
3176
3177 while (p) {
3178 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3179 pull_task(busiest, p, this_rq, this_cpu);
3180
3181
3182
3183
3184
3185 schedstat_inc(sd, lb_gained[idle]);
3186
3187 return 1;
3188 }
3189 p = iterator->next(iterator->arg);
3190 }
3191
3192 return 0;
3193}
3194
/*
 * move_one_task tries to move exactly one task from busiest to this_rq, as
 * part of active balancing operations within "domain".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
3202static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3203 struct sched_domain *sd, enum cpu_idle_type idle)
3204{
3205 const struct sched_class *class;
3206
3207 for (class = sched_class_highest; class; class = class->next)
3208 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3209 return 1;
3210
3211 return 0;
3212}
3213
3214
3215
3216
3217
3218struct sd_lb_stats {
3219 struct sched_group *busiest;
3220 struct sched_group *this;
3221 unsigned long total_load;
3222 unsigned long total_pwr;
3223 unsigned long avg_load;
3224
3225
3226 unsigned long this_load;
3227 unsigned long this_load_per_task;
3228 unsigned long this_nr_running;
3229
3230
3231 unsigned long max_load;
3232 unsigned long busiest_load_per_task;
3233 unsigned long busiest_nr_running;
3234
3235 int group_imb;
3236#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3237 int power_savings_balance;
3238 struct sched_group *group_min;
3239 struct sched_group *group_leader;
3240 unsigned long min_load_per_task;
3241 unsigned long leader_nr_running;
3242 unsigned long min_nr_running;
3243#endif
3244};
3245
3246
3247
3248
3249struct sg_lb_stats {
3250 unsigned long avg_load;
3251 unsigned long group_load;
3252 unsigned long sum_nr_running;
3253 unsigned long sum_weighted_load;
3254 unsigned long group_capacity;
3255 int group_imb;
3256};
3257
3258
3259
3260
3261
3262static inline unsigned int group_first_cpu(struct sched_group *group)
3263{
3264 return cpumask_first(sched_group_cpus(group));
3265}
3266
3267
3268
3269
3270
3271
3272static inline int get_sd_load_idx(struct sched_domain *sd,
3273 enum cpu_idle_type idle)
3274{
3275 int load_idx;
3276
3277 switch (idle) {
3278 case CPU_NOT_IDLE:
3279 load_idx = sd->busy_idx;
3280 break;
3281
3282 case CPU_NEWLY_IDLE:
3283 load_idx = sd->newidle_idx;
3284 break;
3285 default:
3286 load_idx = sd->idle_idx;
3287 break;
3288 }
3289
3290 return load_idx;
3291}
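
/*
 * In tabular form, the mapping implemented above is:
 *
 *	CPU_NOT_IDLE	-> sd->busy_idx
 *	CPU_NEWLY_IDLE	-> sd->newidle_idx
 *	otherwise	-> sd->idle_idx
 *
 * The returned index selects which entry of rq->cpu_load[] (and therefore
 * how strongly damped a view of the load) source_load()/target_load() use
 * for this balancing pass.
 */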
3292
3293
3294#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3295
3296
3297
3298
3299
3300
3301
3302
3303static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3304 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3305{
3306
3307
3308
3309
3310 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3311 sds->power_savings_balance = 0;
3312 else {
3313 sds->power_savings_balance = 1;
3314 sds->min_nr_running = ULONG_MAX;
3315 sds->leader_nr_running = 0;
3316 }
3317}
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329static inline void update_sd_power_savings_stats(struct sched_group *group,
3330 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3331{
3332
3333 if (!sds->power_savings_balance)
3334 return;
3335
3336
3337
3338
3339
3340 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3341 !sds->this_nr_running))
3342 sds->power_savings_balance = 0;
3343
3344
3345
3346
3347
3348 if (!sds->power_savings_balance ||
3349 sgs->sum_nr_running >= sgs->group_capacity ||
3350 !sgs->sum_nr_running)
3351 return;
3352
3353
3354
3355
3356
3357
3358 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3359 (sgs->sum_nr_running == sds->min_nr_running &&
3360 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3361 sds->group_min = group;
3362 sds->min_nr_running = sgs->sum_nr_running;
3363 sds->min_load_per_task = sgs->sum_weighted_load /
3364 sgs->sum_nr_running;
3365 }
3366
3367
3368
3369
3370
3371
3372 if (sgs->sum_nr_running > sgs->group_capacity - 1)
3373 return;
3374
3375 if (sgs->sum_nr_running > sds->leader_nr_running ||
3376 (sgs->sum_nr_running == sds->leader_nr_running &&
3377 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3378 sds->group_leader = group;
3379 sds->leader_nr_running = sgs->sum_nr_running;
3380 }
3381}
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3399 int this_cpu, unsigned long *imbalance)
3400{
3401 if (!sds->power_savings_balance)
3402 return 0;
3403
3404 if (sds->this != sds->group_leader ||
3405 sds->group_leader == sds->group_min)
3406 return 0;
3407
3408 *imbalance = sds->min_load_per_task;
3409 sds->busiest = sds->group_min;
3410
3411 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3412 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3413 group_first_cpu(sds->group_leader);
3414 }
3415
3416 return 1;
3417
3418}
3419#else
3420static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3421 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3422{
3423 return;
3424}
3425
3426static inline void update_sd_power_savings_stats(struct sched_group *group,
3427 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3428{
3429 return;
3430}
3431
3432static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3433 int this_cpu, unsigned long *imbalance)
3434{
3435 return 0;
3436}
3437#endif
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3453 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3454 int local_group, const struct cpumask *cpus,
3455 int *balance, struct sg_lb_stats *sgs)
3456{
3457 unsigned long load, max_cpu_load, min_cpu_load;
3458 int i;
3459 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3460 unsigned long sum_avg_load_per_task;
3461 unsigned long avg_load_per_task;
3462
3463 if (local_group)
3464 balance_cpu = group_first_cpu(group);
3465
3466
3467 sum_avg_load_per_task = avg_load_per_task = 0;
3468 max_cpu_load = 0;
3469 min_cpu_load = ~0UL;
3470
3471 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3472 struct rq *rq = cpu_rq(i);
3473
3474 if (*sd_idle && rq->nr_running)
3475 *sd_idle = 0;
3476
3477
3478 if (local_group) {
3479 if (idle_cpu(i) && !first_idle_cpu) {
3480 first_idle_cpu = 1;
3481 balance_cpu = i;
3482 }
3483
3484 load = target_load(i, load_idx);
3485 } else {
3486 load = source_load(i, load_idx);
3487 if (load > max_cpu_load)
3488 max_cpu_load = load;
3489 if (min_cpu_load > load)
3490 min_cpu_load = load;
3491 }
3492
3493 sgs->group_load += load;
3494 sgs->sum_nr_running += rq->nr_running;
3495 sgs->sum_weighted_load += weighted_cpuload(i);
3496
3497 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3498 }
3499
3500
3501
3502
3503
3504
3505
3506 if (idle != CPU_NEWLY_IDLE && local_group &&
3507 balance_cpu != this_cpu && balance) {
3508 *balance = 0;
3509 return;
3510 }
3511
3512
3513 sgs->avg_load = sg_div_cpu_power(group,
3514 sgs->group_load * SCHED_LOAD_SCALE);
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526 avg_load_per_task = sg_div_cpu_power(group,
3527 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3528
3529 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3530 sgs->group_imb = 1;
3531
3532 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3533
3534}
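
/*
 * Worked example for the normalisation above (illustrative numbers,
 * assuming SCHED_LOAD_SCALE == 1024): three nice-0 tasks of weight 1024
 * spread over a group whose __cpu_power is 2 * 1024 (two full cpus) give
 *
 *	avg_load = (3 * 1024) * 1024 / (2 * 1024) = 1536
 *
 * i.e. avg_load is load per unit of cpu power, which makes groups of
 * different sizes comparable. group_imb is set when the spread between the
 * most and least loaded cpu of the group exceeds twice the average task
 * load, flagging an imbalance that is internal to the group.
 */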
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3547 enum cpu_idle_type idle, int *sd_idle,
3548 const struct cpumask *cpus, int *balance,
3549 struct sd_lb_stats *sds)
3550{
3551 struct sched_group *group = sd->groups;
3552 struct sg_lb_stats sgs;
3553 int load_idx;
3554
3555 init_sd_power_savings_stats(sd, sds, idle);
3556 load_idx = get_sd_load_idx(sd, idle);
3557
3558 do {
3559 int local_group;
3560
3561 local_group = cpumask_test_cpu(this_cpu,
3562 sched_group_cpus(group));
3563 memset(&sgs, 0, sizeof(sgs));
3564 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
3565 local_group, cpus, balance, &sgs);
3566
3567 if (local_group && balance && !(*balance))
3568 return;
3569
3570 sds->total_load += sgs.group_load;
3571 sds->total_pwr += group->__cpu_power;
3572
3573 if (local_group) {
3574 sds->this_load = sgs.avg_load;
3575 sds->this = group;
3576 sds->this_nr_running = sgs.sum_nr_running;
3577 sds->this_load_per_task = sgs.sum_weighted_load;
3578 } else if (sgs.avg_load > sds->max_load &&
3579 (sgs.sum_nr_running > sgs.group_capacity ||
3580 sgs.group_imb)) {
3581 sds->max_load = sgs.avg_load;
3582 sds->busiest = group;
3583 sds->busiest_nr_running = sgs.sum_nr_running;
3584 sds->busiest_load_per_task = sgs.sum_weighted_load;
3585 sds->group_imb = sgs.group_imb;
3586 }
3587
3588 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3589 group = group->next;
3590 } while (group != sd->groups);
3591
3592}
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3603 int this_cpu, unsigned long *imbalance)
3604{
3605 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3606 unsigned int imbn = 2;
3607
3608 if (sds->this_nr_running) {
3609 sds->this_load_per_task /= sds->this_nr_running;
3610 if (sds->busiest_load_per_task >
3611 sds->this_load_per_task)
3612 imbn = 1;
3613 } else
3614 sds->this_load_per_task =
3615 cpu_avg_load_per_task(this_cpu);
3616
3617 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3618 sds->busiest_load_per_task * imbn) {
3619 *imbalance = sds->busiest_load_per_task;
3620 return;
3621 }
3622
3623
3624
3625
3626
3627
3628
3629 pwr_now += sds->busiest->__cpu_power *
3630 min(sds->busiest_load_per_task, sds->max_load);
3631 pwr_now += sds->this->__cpu_power *
3632 min(sds->this_load_per_task, sds->this_load);
3633 pwr_now /= SCHED_LOAD_SCALE;
3634
3635
3636 tmp = sg_div_cpu_power(sds->busiest,
3637 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3638 if (sds->max_load > tmp)
3639 pwr_move += sds->busiest->__cpu_power *
3640 min(sds->busiest_load_per_task, sds->max_load - tmp);
3641
3642
3643 if (sds->max_load * sds->busiest->__cpu_power <
3644 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3645 tmp = sg_div_cpu_power(sds->this,
3646 sds->max_load * sds->busiest->__cpu_power);
3647 else
3648 tmp = sg_div_cpu_power(sds->this,
3649 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3650 pwr_move += sds->this->__cpu_power *
3651 min(sds->this_load_per_task, sds->this_load + tmp);
3652 pwr_move /= SCHED_LOAD_SCALE;
3653
3654
3655 if (pwr_move > pwr_now)
3656 *imbalance = sds->busiest_load_per_task;
3657}
3658
3659
3660
3661
3662
3663
3664
3665
3666static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3667 unsigned long *imbalance)
3668{
3669 unsigned long max_pull;
3670
3671
3672
3673
3674
3675 if (sds->max_load < sds->avg_load) {
3676 *imbalance = 0;
3677 return fix_small_imbalance(sds, this_cpu, imbalance);
3678 }
3679
3680
3681 max_pull = min(sds->max_load - sds->avg_load,
3682 sds->max_load - sds->busiest_load_per_task);
3683
3684
3685 *imbalance = min(max_pull * sds->busiest->__cpu_power,
3686 (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
3687 / SCHED_LOAD_SCALE;
3688
3689
3690
3691
3692
3693
3694
3695 if (*imbalance < sds->busiest_load_per_task)
3696 return fix_small_imbalance(sds, this_cpu, imbalance);
3697
3698}
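
/*
 * Worked example for the formula above (illustrative numbers, all in the
 * same weighted-load units and with both groups at nominal cpu power, so
 * the __cpu_power factors and the final division cancel): with
 * max_load = 3000, avg_load = 2000, this_load = 1000 and
 * busiest_load_per_task = 512,
 *
 *	max_pull  = min(3000 - 2000, 3000 - 512) = 1000
 *	imbalance = min(1000, 2000 - 1000)       = 1000
 *
 * so at most the busiest group's excess over the average is requested, and
 * never more than this group's deficit; both sides end up at or below the
 * average. Had the result been smaller than busiest_load_per_task, the
 * fix_small_imbalance() path above would refine it instead.
 */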
3699
/**
 * find_busiest_group - Returns the busiest group within the sched_domain
 * if there is an imbalance. If there isn't an imbalance, and
 * the user has opted for power-savings, it returns a group whose
 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
 * such a group exists.
 *
 * Also calculates the amount of weighted load which should be moved
 * to restore balance.
 *
 * @sd: The sched_domain whose busiest group is to be returned.
 * @this_cpu: The cpu for which load balancing is currently being performed.
 * @imbalance: Variable which stores amount of weighted load which should
 *		be moved to restore balance/put a group to idle.
 * @idle: The idle status of this_cpu.
 * @sd_idle: The idleness of sd.
 * @cpus: The set of CPUs under consideration for load-balancing.
 * @balance: Pointer to a variable indicating if this_cpu
 *	is the appropriate cpu to perform load balancing at this level.
 *
 * Returns:	- the busiest group if an imbalance exists.
 *		- If no imbalance and the user has opted for power-savings
 *		  balance, return the least loaded group whose CPUs can be
 *		  put to idle by rebalancing its tasks onto our group.
 */
3726static struct sched_group *
3727find_busiest_group(struct sched_domain *sd, int this_cpu,
3728 unsigned long *imbalance, enum cpu_idle_type idle,
3729 int *sd_idle, const struct cpumask *cpus, int *balance)
3730{
3731 struct sd_lb_stats sds;
3732
3733 memset(&sds, 0, sizeof(sds));
3734
 /*
  * Compute the various statistics relevant for load balancing at
  * this level.
  */
3739 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
3740 balance, &sds);
3741
 /* Cases where imbalance does not exist from the POV of this_cpu:
  * 1) this_cpu is not the appropriate cpu to perform load balancing
  *    at this level.
  * 2) There is no busy sibling group to pull from.
  * 3) This group is the busiest group.
  * 4) This group is more busy than the average busyness at this
  *    sched_domain.
  * 5) The imbalance is within the specified limit.
  * 6) Any rebalance would lead to ping-pong.
  */
3752 if (balance && !(*balance))
3753 goto ret;
3754
3755 if (!sds.busiest || sds.busiest_nr_running == 0)
3756 goto out_balanced;
3757
3758 if (sds.this_load >= sds.max_load)
3759 goto out_balanced;
3760
3761 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3762
3763 if (sds.this_load >= sds.avg_load)
3764 goto out_balanced;
3765
3766 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3767 goto out_balanced;
3768
3769 sds.busiest_load_per_task /= sds.busiest_nr_running;
3770 if (sds.group_imb)
3771 sds.busiest_load_per_task =
3772 min(sds.busiest_load_per_task, sds.avg_load);
3773
 /*
  * We're trying to get all the cpus to the average_load, so we don't
  * want to push ourselves above the average load, nor do we wish to
  * reduce the max loaded cpu below the average load, as either of these
  * actions would just result in more rebalancing later, and ping-pong
  * tasks around. Thus we look for the minimum possible imbalance.
  * Negative imbalances (*we* are more loaded than anyone else) will
  * be counted as no imbalance for these purposes -- we can't fix that
  * by pulling tasks to us. Be careful of negative numbers as they'll
  * appear as very large values with unsigned longs.
  */
3785 if (sds.max_load <= sds.busiest_load_per_task)
3786 goto out_balanced;
3787
3788
3789 calculate_imbalance(&sds, this_cpu, imbalance);
3790 return sds.busiest;
3791
3792out_balanced:
3793
3794
3795
3796
3797 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
3798 return sds.busiest;
3799ret:
3800 *imbalance = 0;
3801 return NULL;
3802}
3803
/*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */
3807static struct rq *
3808find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3809 unsigned long imbalance, const struct cpumask *cpus)
3810{
3811 struct rq *busiest = NULL, *rq;
3812 unsigned long max_load = 0;
3813 int i;
3814
3815 for_each_cpu(i, sched_group_cpus(group)) {
3816 unsigned long wl;
3817
3818 if (!cpumask_test_cpu(i, cpus))
3819 continue;
3820
3821 rq = cpu_rq(i);
3822 wl = weighted_cpuload(i);
3823
3824 if (rq->nr_running == 1 && wl > imbalance)
3825 continue;
3826
3827 if (wl > max_load) {
3828 max_load = wl;
3829 busiest = rq;
3830 }
3831 }
3832
3833 return busiest;
3834}
3835
/*
 * Max backoff if we still have tasks pinned in the current sched domain
 * (see the all_pinned handling in load_balance).
 */
3840#define MAX_PINNED_INTERVAL 512
3841
/* Working cpumask for load_balance and load_balance_newidle. */
3843static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3844
/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
3849static int load_balance(int this_cpu, struct rq *this_rq,
3850 struct sched_domain *sd, enum cpu_idle_type idle,
3851 int *balance)
3852{
3853 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3854 struct sched_group *group;
3855 unsigned long imbalance;
3856 struct rq *busiest;
3857 unsigned long flags;
3858 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3859
3860 cpumask_setall(cpus);
3861
3862
3863
3864
3865
3866
3867
3868 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
3869 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3870 sd_idle = 1;
3871
3872 schedstat_inc(sd, lb_count[idle]);
3873
3874redo:
3875 update_shares(sd);
3876 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3877 cpus, balance);
3878
3879 if (*balance == 0)
3880 goto out_balanced;
3881
3882 if (!group) {
3883 schedstat_inc(sd, lb_nobusyg[idle]);
3884 goto out_balanced;
3885 }
3886
3887 busiest = find_busiest_queue(group, idle, imbalance, cpus);
3888 if (!busiest) {
3889 schedstat_inc(sd, lb_nobusyq[idle]);
3890 goto out_balanced;
3891 }
3892
3893 BUG_ON(busiest == this_rq);
3894
3895 schedstat_add(sd, lb_imbalance[idle], imbalance);
3896
3897 ld_moved = 0;
3898 if (busiest->nr_running > 1) {
3899
3900
3901
3902
3903
3904
3905 local_irq_save(flags);
3906 double_rq_lock(this_rq, busiest);
3907 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3908 imbalance, sd, idle, &all_pinned);
3909 double_rq_unlock(this_rq, busiest);
3910 local_irq_restore(flags);
3911
3912
3913
3914
3915 if (ld_moved && this_cpu != smp_processor_id())
3916 resched_cpu(this_cpu);
3917
3918
3919 if (unlikely(all_pinned)) {
3920 cpumask_clear_cpu(cpu_of(busiest), cpus);
3921 if (!cpumask_empty(cpus))
3922 goto redo;
3923 goto out_balanced;
3924 }
3925 }
3926
3927 if (!ld_moved) {
3928 schedstat_inc(sd, lb_failed[idle]);
3929 sd->nr_balance_failed++;
3930
3931 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
3932
3933 spin_lock_irqsave(&busiest->lock, flags);
3934
3935
3936
3937
3938 if (!cpumask_test_cpu(this_cpu,
3939 &busiest->curr->cpus_allowed)) {
3940 spin_unlock_irqrestore(&busiest->lock, flags);
3941 all_pinned = 1;
3942 goto out_one_pinned;
3943 }
3944
3945 if (!busiest->active_balance) {
3946 busiest->active_balance = 1;
3947 busiest->push_cpu = this_cpu;
3948 active_balance = 1;
3949 }
3950 spin_unlock_irqrestore(&busiest->lock, flags);
3951 if (active_balance)
3952 wake_up_process(busiest->migration_thread);
3953
3954
3955
3956
3957
3958 sd->nr_balance_failed = sd->cache_nice_tries+1;
3959 }
3960 } else
3961 sd->nr_balance_failed = 0;
3962
3963 if (likely(!active_balance)) {
3964
3965 sd->balance_interval = sd->min_interval;
3966 } else {
3967
3968
3969
3970
3971
3972
3973 if (sd->balance_interval < sd->max_interval)
3974 sd->balance_interval *= 2;
3975 }
3976
3977 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3978 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3979 ld_moved = -1;
3980
3981 goto out;
3982
3983out_balanced:
3984 schedstat_inc(sd, lb_balanced[idle]);
3985
3986 sd->nr_balance_failed = 0;
3987
3988out_one_pinned:
3989
3990 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3991 (sd->balance_interval < sd->max_interval))
3992 sd->balance_interval *= 2;
3993
3994 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3995 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3996 ld_moved = -1;
3997 else
3998 ld_moved = 0;
3999out:
4000 if (ld_moved)
4001 update_shares(sd);
4002 return ld_moved;
4003}
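
/*
 * Control flow of load_balance() in brief (a summary of the code above):
 *
 *   1. find_busiest_group()/find_busiest_queue() pick a source under the
 *      current cpu mask; no candidate means we are already balanced.
 *   2. move_tasks() pulls up to "imbalance" weighted load with both
 *      runqueue locks held and interrupts off.
 *   3. If every candidate task was pinned, the source cpu is removed from
 *      the mask and the scan is redone.
 *   4. After repeated failures the busiest cpu's migration thread is asked
 *      to push its running task away (active balancing), and the balance
 *      interval is widened to back off.
 */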
4004
/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 *
 * Called from schedule when this_rq is about to become idle
 * (CPU_NEWLY_IDLE). this_rq is locked.
 */
4012static int
4013load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4014{
4015 struct sched_group *group;
4016 struct rq *busiest = NULL;
4017 unsigned long imbalance;
4018 int ld_moved = 0;
4019 int sd_idle = 0;
4020 int all_pinned = 0;
4021 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4022
4023 cpumask_setall(cpus);
4024
4025
4026
4027
4028
4029
4030
4031 if (sd->flags & SD_SHARE_CPUPOWER &&
4032 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4033 sd_idle = 1;
4034
4035 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4036redo:
4037 update_shares_locked(this_rq, sd);
4038 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4039 &sd_idle, cpus, NULL);
4040 if (!group) {
4041 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4042 goto out_balanced;
4043 }
4044
4045 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4046 if (!busiest) {
4047 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4048 goto out_balanced;
4049 }
4050
4051 BUG_ON(busiest == this_rq);
4052
4053 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4054
4055 ld_moved = 0;
4056 if (busiest->nr_running > 1) {
4057
4058 double_lock_balance(this_rq, busiest);
4059
4060 update_rq_clock(busiest);
4061 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4062 imbalance, sd, CPU_NEWLY_IDLE,
4063 &all_pinned);
4064 double_unlock_balance(this_rq, busiest);
4065
4066 if (unlikely(all_pinned)) {
4067 cpumask_clear_cpu(cpu_of(busiest), cpus);
4068 if (!cpumask_empty(cpus))
4069 goto redo;
4070 }
4071 }
4072
4073 if (!ld_moved) {
4074 int active_balance = 0;
4075
4076 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4077 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4078 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4079 return -1;
4080
4081 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4082 return -1;
4083
4084 if (sd->nr_balance_failed++ < 2)
4085 return -1;
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110 double_lock_balance(this_rq, busiest);
4111
4112
4113
4114
4115
4116 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4117 double_unlock_balance(this_rq, busiest);
4118 all_pinned = 1;
4119 return ld_moved;
4120 }
4121
4122 if (!busiest->active_balance) {
4123 busiest->active_balance = 1;
4124 busiest->push_cpu = this_cpu;
4125 active_balance = 1;
4126 }
4127
4128 double_unlock_balance(this_rq, busiest);
4129
4130
4131
4132 spin_unlock(&this_rq->lock);
4133 if (active_balance)
4134 wake_up_process(busiest->migration_thread);
4135 spin_lock(&this_rq->lock);
4136
4137 } else
4138 sd->nr_balance_failed = 0;
4139
4140 update_shares_locked(this_rq, sd);
4141 return ld_moved;
4142
4143out_balanced:
4144 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4145 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4146 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4147 return -1;
4148 sd->nr_balance_failed = 0;
4149
4150 return 0;
4151}
4152
/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */
4157static void idle_balance(int this_cpu, struct rq *this_rq)
4158{
4159 struct sched_domain *sd;
4160 int pulled_task = 0;
4161 unsigned long next_balance = jiffies + HZ;
4162
4163 for_each_domain(this_cpu, sd) {
4164 unsigned long interval;
4165
4166 if (!(sd->flags & SD_LOAD_BALANCE))
4167 continue;
4168
4169 if (sd->flags & SD_BALANCE_NEWIDLE)
4170
4171 pulled_task = load_balance_newidle(this_cpu, this_rq,
4172 sd);
4173
4174 interval = msecs_to_jiffies(sd->balance_interval);
4175 if (time_after(next_balance, sd->last_balance + interval))
4176 next_balance = sd->last_balance + interval;
4177 if (pulled_task)
4178 break;
4179 }
4180 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4181
4182
4183
4184
4185 this_rq->next_balance = next_balance;
4186 }
4187}
4188
/*
 * active_load_balance is run by migration threads. It pushes a running
 * task off the busy CPU onto an idle CPU. It requires at least 1 task to
 * be running on each physical CPU where possible, and avoids physical/
 * logical imbalances.
 *
 * Called with busiest_rq locked.
 */
4197static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4198{
4199 int target_cpu = busiest_rq->push_cpu;
4200 struct sched_domain *sd;
4201 struct rq *target_rq;
4202
4203
4204 if (busiest_rq->nr_running <= 1)
4205 return;
4206
4207 target_rq = cpu_rq(target_cpu);
4208
4209
4210
4211
4212
4213
4214 BUG_ON(busiest_rq == target_rq);
4215
4216
4217 double_lock_balance(busiest_rq, target_rq);
4218 update_rq_clock(busiest_rq);
4219 update_rq_clock(target_rq);
4220
4221
4222 for_each_domain(target_cpu, sd) {
4223 if ((sd->flags & SD_LOAD_BALANCE) &&
4224 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4225 break;
4226 }
4227
4228 if (likely(sd)) {
4229 schedstat_inc(sd, alb_count);
4230
4231 if (move_one_task(target_rq, target_cpu, busiest_rq,
4232 sd, CPU_IDLE))
4233 schedstat_inc(sd, alb_pushed);
4234 else
4235 schedstat_inc(sd, alb_failed);
4236 }
4237 double_unlock_balance(busiest_rq, target_rq);
4238}
4239
4240#ifdef CONFIG_NO_HZ
4241static struct {
4242 atomic_t load_balancer;
4243 cpumask_var_t cpu_mask;
4244} nohz ____cacheline_aligned = {
4245 .load_balancer = ATOMIC_INIT(-1),
4246};
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268int select_nohz_load_balancer(int stop_tick)
4269{
4270 int cpu = smp_processor_id();
4271
4272 if (stop_tick) {
4273 cpu_rq(cpu)->in_nohz_recently = 1;
4274
4275 if (!cpu_active(cpu)) {
4276 if (atomic_read(&nohz.load_balancer) != cpu)
4277 return 0;
4278
4279
4280
4281
4282
4283 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4284 BUG();
4285
4286 return 0;
4287 }
4288
4289 cpumask_set_cpu(cpu, nohz.cpu_mask);
4290
4291
4292 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4293 if (atomic_read(&nohz.load_balancer) == cpu)
4294 atomic_set(&nohz.load_balancer, -1);
4295 return 0;
4296 }
4297
4298 if (atomic_read(&nohz.load_balancer) == -1) {
4299
4300 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4301 return 1;
4302 } else if (atomic_read(&nohz.load_balancer) == cpu)
4303 return 1;
4304 } else {
4305 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4306 return 0;
4307
4308 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4309
4310 if (atomic_read(&nohz.load_balancer) == cpu)
4311 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4312 BUG();
4313 }
4314 return 0;
4315}
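
/*
 * Summary of the contract implemented above: a return value of 1 means the
 * calling cpu is now (or remains) the idle load balancer and is expected to
 * keep its tick so it can balance on behalf of the other tickless idle
 * cpus; 0 means it has no such duty (or, with stop_tick == 0, that it has
 * just given the role up). How the tick code consumes this return value is
 * outside this file, so treat the tick-related wording as intent rather
 * than a description of that code.
 */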
4316#endif
4317
4318static DEFINE_SPINLOCK(balancing);
4319
/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in arch_init_sched_domains.
 */
4326static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4327{
4328 int balance = 1;
4329 struct rq *rq = cpu_rq(cpu);
4330 unsigned long interval;
4331 struct sched_domain *sd;
4332
4333 unsigned long next_balance = jiffies + 60*HZ;
4334 int update_next_balance = 0;
4335 int need_serialize;
4336
4337 for_each_domain(cpu, sd) {
4338 if (!(sd->flags & SD_LOAD_BALANCE))
4339 continue;
4340
4341 interval = sd->balance_interval;
4342 if (idle != CPU_IDLE)
4343 interval *= sd->busy_factor;
4344
4345
4346 interval = msecs_to_jiffies(interval);
4347 if (unlikely(!interval))
4348 interval = 1;
4349 if (interval > HZ*NR_CPUS/10)
4350 interval = HZ*NR_CPUS/10;
4351
4352 need_serialize = sd->flags & SD_SERIALIZE;
4353
4354 if (need_serialize) {
4355 if (!spin_trylock(&balancing))
4356 goto out;
4357 }
4358
4359 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4360 if (load_balance(cpu, rq, sd, idle, &balance)) {
4361
4362
4363
4364
4365
4366 idle = CPU_NOT_IDLE;
4367 }
4368 sd->last_balance = jiffies;
4369 }
4370 if (need_serialize)
4371 spin_unlock(&balancing);
4372out:
4373 if (time_after(next_balance, sd->last_balance + interval)) {
4374 next_balance = sd->last_balance + interval;
4375 update_next_balance = 1;
4376 }
4377
4378
4379
4380
4381
4382
4383 if (!balance)
4384 break;
4385 }
4386
4387
4388
4389
4390
4391
4392 if (likely(update_next_balance))
4393 rq->next_balance = next_balance;
4394}
4395
4396
4397
4398
4399
4400
4401static void run_rebalance_domains(struct softirq_action *h)
4402{
4403 int this_cpu = smp_processor_id();
4404 struct rq *this_rq = cpu_rq(this_cpu);
4405 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4406 CPU_IDLE : CPU_NOT_IDLE;
4407
4408 rebalance_domains(this_cpu, idle);
4409
4410#ifdef CONFIG_NO_HZ
4411
4412
4413
4414
4415
4416 if (this_rq->idle_at_tick &&
4417 atomic_read(&nohz.load_balancer) == this_cpu) {
4418 struct rq *rq;
4419 int balance_cpu;
4420
4421 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4422 if (balance_cpu == this_cpu)
4423 continue;
4424
4425
4426
4427
4428
4429
4430 if (need_resched())
4431 break;
4432
4433 rebalance_domains(balance_cpu, CPU_IDLE);
4434
4435 rq = cpu_rq(balance_cpu);
4436 if (time_after(this_rq->next_balance, rq->next_balance))
4437 this_rq->next_balance = rq->next_balance;
4438 }
4439 }
4440#endif
4441}
4442
4443static inline int on_null_domain(int cpu)
4444{
4445 return !rcu_dereference(cpu_rq(cpu)->sd);
4446}
4447
/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 *
 * In the CONFIG_NO_HZ case, this is also the place where we nominate a new
 * idle load balancing owner or decide to stop the periodic load balancing
 * if the whole system is idle.
 */
4455static inline void trigger_load_balance(struct rq *rq, int cpu)
4456{
4457#ifdef CONFIG_NO_HZ
4458
4459
4460
4461
4462
4463 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4464 rq->in_nohz_recently = 0;
4465
4466 if (atomic_read(&nohz.load_balancer) == cpu) {
4467 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4468 atomic_set(&nohz.load_balancer, -1);
4469 }
4470
4471 if (atomic_read(&nohz.load_balancer) == -1) {
4472
4473
4474
4475
4476
4477
4478
4479
4480 int ilb = cpumask_first(nohz.cpu_mask);
4481
4482 if (ilb < nr_cpu_ids)
4483 resched_cpu(ilb);
4484 }
4485 }
4486
4487
4488
4489
4490
4491 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4492 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4493 resched_cpu(cpu);
4494 return;
4495 }
4496
4497
4498
4499
4500
4501 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4502 cpumask_test_cpu(cpu, nohz.cpu_mask))
4503 return;
4504#endif
4505
4506 if (time_after_eq(jiffies, rq->next_balance) &&
4507 likely(!on_null_domain(cpu)))
4508 raise_softirq(SCHED_SOFTIRQ);
4509}
4510
4511#else
4512
4513
4514
4515
4516static inline void idle_balance(int cpu, struct rq *rq)
4517{
4518}
4519
4520#endif
4521
4522DEFINE_PER_CPU(struct kernel_stat, kstat);
4523
4524EXPORT_PER_CPU_SYMBOL(kstat);
4525
/*
 * Return any ns on the sched_clock that have not yet been accounted in
 * @p in case that task is currently running.
 */
4532static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
4533{
4534 u64 ns = 0;
4535
4536 if (task_current(rq, p)) {
4537 update_rq_clock(rq);
4538 ns = rq->clock - p->se.exec_start;
4539 if ((s64)ns < 0)
4540 ns = 0;
4541 }
4542
4543 return ns;
4544}
4545
4546unsigned long long task_delta_exec(struct task_struct *p)
4547{
4548 unsigned long flags;
4549 struct rq *rq;
4550 u64 ns = 0;
4551
4552 rq = task_rq_lock(p, &flags);
4553 ns = do_task_delta_exec(p, rq);
4554 task_rq_unlock(rq, &flags);
4555
4556 return ns;
4557}
4558
/*
 * Return accumulated runtime for the task.
 * In case the task is currently running, return the runtime plus current's
 * pending runtime that has not been accounted yet.
 */
4564unsigned long long task_sched_runtime(struct task_struct *p)
4565{
4566 unsigned long flags;
4567 struct rq *rq;
4568 u64 ns = 0;
4569
4570 rq = task_rq_lock(p, &flags);
4571 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
4572 task_rq_unlock(rq, &flags);
4573
4574 return ns;
4575}
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586unsigned long long thread_group_sched_runtime(struct task_struct *p)
4587{
4588 struct task_cputime totals;
4589 unsigned long flags;
4590 struct rq *rq;
4591 u64 ns;
4592
4593 rq = task_rq_lock(p, &flags);
4594 thread_group_cputime(p, &totals);
4595 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
4596 task_rq_unlock(rq, &flags);
4597
4598 return ns;
4599}
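
/*
 * These runtime accessors feed the kind of figures the POSIX cpu-clock code
 * reports (an assumption about callers that live outside this file):
 * CLOCK_THREAD_CPUTIME_ID corresponds to per-task runtime and
 * CLOCK_PROCESS_CPUTIME_ID to the thread-group sum. A user-space sketch
 * (illustrative only; link with -lrt on older glibc):
 *
 *	#include <stdio.h>
 *	#include <time.h>
 *
 *	int main(void)
 *	{
 *		struct timespec ts;
 *
 *		if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts))
 *			return 1;
 *		printf("thread cpu time: %ld.%09ld s\n",
 *		       (long)ts.tv_sec, ts.tv_nsec);
 *		return 0;
 *	}
 */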
4600
/*
 * Account user cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in user space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */
4607void account_user_time(struct task_struct *p, cputime_t cputime,
4608 cputime_t cputime_scaled)
4609{
4610 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4611 cputime64_t tmp;
4612
4613
4614 p->utime = cputime_add(p->utime, cputime);
4615 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4616 account_group_user_time(p, cputime);
4617
4618
4619 tmp = cputime_to_cputime64(cputime);
4620 if (TASK_NICE(p) > 0)
4621 cpustat->nice = cputime64_add(cpustat->nice, tmp);
4622 else
4623 cpustat->user = cputime64_add(cpustat->user, tmp);
4624
4625 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
4626
4627 acct_update_integrals(p);
4628}
4629
4630
4631
4632
4633
4634
4635
4636static void account_guest_time(struct task_struct *p, cputime_t cputime,
4637 cputime_t cputime_scaled)
4638{
4639 cputime64_t tmp;
4640 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4641
4642 tmp = cputime_to_cputime64(cputime);
4643
4644
4645 p->utime = cputime_add(p->utime, cputime);
4646 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4647 account_group_user_time(p, cputime);
4648 p->gtime = cputime_add(p->gtime, cputime);
4649
4650
4651 cpustat->user = cputime64_add(cpustat->user, tmp);
4652 cpustat->guest = cputime64_add(cpustat->guest, tmp);
4653}
4654
/*
 * Account system cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the cpu time spent in kernel space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */
4662void account_system_time(struct task_struct *p, int hardirq_offset,
4663 cputime_t cputime, cputime_t cputime_scaled)
4664{
4665 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4666 cputime64_t tmp;
4667
4668 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4669 account_guest_time(p, cputime, cputime_scaled);
4670 return;
4671 }
4672
4673
4674 p->stime = cputime_add(p->stime, cputime);
4675 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
4676 account_group_system_time(p, cputime);
4677
4678
4679 tmp = cputime_to_cputime64(cputime);
4680 if (hardirq_count() - hardirq_offset)
4681 cpustat->irq = cputime64_add(cpustat->irq, tmp);
4682 else if (softirq_count())
4683 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
4684 else
4685 cpustat->system = cputime64_add(cpustat->system, tmp);
4686
4687 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
4688
4689
4690 acct_update_integrals(p);
4691}
4692
4693
4694
4695
4696
4697void account_steal_time(cputime_t cputime)
4698{
4699 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4700 cputime64_t cputime64 = cputime_to_cputime64(cputime);
4701
4702 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
4703}
4704
4705
4706
4707
4708
4709void account_idle_time(cputime_t cputime)
4710{
4711 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4712 cputime64_t cputime64 = cputime_to_cputime64(cputime);
4713 struct rq *rq = this_rq();
4714
4715 if (atomic_read(&rq->nr_iowait) > 0)
4716 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
4717 else
4718 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
4719}
4720
4721#ifndef CONFIG_VIRT_CPU_ACCOUNTING
/*
 * Account a single tick of cpu time.
 * @p: the process that the cpu time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 */
4728void account_process_tick(struct task_struct *p, int user_tick)
4729{
4730 cputime_t one_jiffy = jiffies_to_cputime(1);
4731 cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
4732 struct rq *rq = this_rq();
4733
4734 if (user_tick)
4735 account_user_time(p, one_jiffy, one_jiffy_scaled);
4736 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
4737 account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
4738 one_jiffy_scaled);
4739 else
4740 account_idle_time(one_jiffy);
4741}
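
/*
 * The per-category counters filled in above surface as the "cpu" lines of
 * /proc/stat (user, nice, system, idle, iowait, irq, softirq, steal, guest,
 * in that order). A minimal user-space sketch (illustrative only) that reads
 * the aggregate line back, assuming that standard format:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long long user, nice, system, idle;
 *		FILE *f = fopen("/proc/stat", "r");
 *
 *		if (!f)
 *			return 1;
 *		if (fscanf(f, "cpu %llu %llu %llu %llu",
 *			   &user, &nice, &system, &idle) != 4) {
 *			fclose(f);
 *			return 1;
 *		}
 *		fclose(f);
 *		printf("user=%llu nice=%llu system=%llu idle=%llu\n",
 *		       user, nice, system, idle);
 *		return 0;
 *	}
 */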
4742
4743
4744
4745
4746
4747
4748void account_steal_ticks(unsigned long ticks)
4749{
4750 account_steal_time(jiffies_to_cputime(ticks));
4751}
4752
4753
4754
4755
4756
4757void account_idle_ticks(unsigned long ticks)
4758{
4759 account_idle_time(jiffies_to_cputime(ticks));
4760}
4761
4762#endif
4763
4764
4765
4766
4767#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4768cputime_t task_utime(struct task_struct *p)
4769{
4770 return p->utime;
4771}
4772
4773cputime_t task_stime(struct task_struct *p)
4774{
4775 return p->stime;
4776}
4777#else
4778cputime_t task_utime(struct task_struct *p)
4779{
4780 clock_t utime = cputime_to_clock_t(p->utime),
4781 total = utime + cputime_to_clock_t(p->stime);
4782 u64 temp;
4783
4784
4785
4786
4787 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
4788
4789 if (total) {
4790 temp *= utime;
4791 do_div(temp, total);
4792 }
4793 utime = (clock_t)temp;
4794
4795 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
4796 return p->prev_utime;
4797}
4798
4799cputime_t task_stime(struct task_struct *p)
4800{
4801 clock_t stime;
4802
4803
4804
4805
4806
4807
4808 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
4809 cputime_to_clock_t(task_utime(p));
4810
4811 if (stime >= 0)
4812 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
4813
4814 return p->prev_stime;
4815}
4816#endif
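
/*
 * In the !CONFIG_VIRT_CPU_ACCOUNTING case above, the precise scheduler
 * runtime is split into user and system parts in proportion to the sampled
 * utime/stime ratio, i.e. roughly
 *
 *	utime_reported = sum_exec_runtime * utime / (utime + stime)
 *	stime_reported = sum_exec_runtime - utime_reported
 *
 * and both are clamped to be monotonic via prev_utime/prev_stime, so a
 * task's reported times never go backwards between two readings even though
 * the ratio may shift.
 */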
4817
4818inline cputime_t task_gtime(struct task_struct *p)
4819{
4820 return p->gtime;
4821}
4822
/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */
4830void scheduler_tick(void)
4831{
4832 int cpu = smp_processor_id();
4833 struct rq *rq = cpu_rq(cpu);
4834 struct task_struct *curr = rq->curr;
4835
4836 sched_clock_tick();
4837
4838 spin_lock(&rq->lock);
4839 update_rq_clock(rq);
4840 update_cpu_load(rq);
4841 curr->sched_class->task_tick(rq, curr, 0);
4842 spin_unlock(&rq->lock);
4843
4844#ifdef CONFIG_SMP
4845 rq->idle_at_tick = idle_cpu(cpu);
4846 trigger_load_balance(rq, cpu);
4847#endif
4848}
4849
4850notrace unsigned long get_parent_ip(unsigned long addr)
4851{
4852 if (in_lock_functions(addr)) {
4853 addr = CALLER_ADDR2;
4854 if (in_lock_functions(addr))
4855 addr = CALLER_ADDR3;
4856 }
4857 return addr;
4858}
4859
4860#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4861 defined(CONFIG_PREEMPT_TRACER))
4862
4863void __kprobes add_preempt_count(int val)
4864{
4865#ifdef CONFIG_DEBUG_PREEMPT
4866
4867
4868
4869 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4870 return;
4871#endif
4872 preempt_count() += val;
4873#ifdef CONFIG_DEBUG_PREEMPT
4874
4875
4876
4877 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4878 PREEMPT_MASK - 10);
4879#endif
4880 if (preempt_count() == val)
4881 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4882}
4883EXPORT_SYMBOL(add_preempt_count);
4884
4885void __kprobes sub_preempt_count(int val)
4886{
4887#ifdef CONFIG_DEBUG_PREEMPT
4888
4889
4890
4891 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4892 return;
4893
4894
4895
4896 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4897 !(preempt_count() & PREEMPT_MASK)))
4898 return;
4899#endif
4900
4901 if (preempt_count() == val)
4902 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4903 preempt_count() -= val;
4904}
4905EXPORT_SYMBOL(sub_preempt_count);
4906
4907#endif
4908
4909
4910
4911
4912static noinline void __schedule_bug(struct task_struct *prev)
4913{
4914 struct pt_regs *regs = get_irq_regs();
4915
4916 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4917 prev->comm, prev->pid, preempt_count());
4918
4919 debug_show_held_locks(prev);
4920 print_modules();
4921 if (irqs_disabled())
4922 print_irqtrace_events(prev);
4923
4924 if (regs)
4925 show_regs(regs);
4926 else
4927 dump_stack();
4928}
4929
/*
 * Various schedule()-time debugging checks and statistics:
 */
4933static inline void schedule_debug(struct task_struct *prev)
4934{
 /*
  * Test if we are atomic. Since do_exit() needs to call into
  * schedule() atomically, we ignore that path for now.
  * Otherwise, whine if we are scheduling when we should not be.
  */
4940 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4941 __schedule_bug(prev);
4942
4943 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4944
4945 schedstat_inc(this_rq(), sched_count);
4946#ifdef CONFIG_SCHEDSTATS
4947 if (unlikely(prev->lock_depth >= 0)) {
4948 schedstat_inc(this_rq(), bkl_count);
4949 schedstat_inc(prev, sched_info.bkl_count);
4950 }
4951#endif
4952}
4953
4954static void put_prev_task(struct rq *rq, struct task_struct *prev)
4955{
4956 if (prev->state == TASK_RUNNING) {
4957 u64 runtime = prev->se.sum_exec_runtime;
4958
4959 runtime -= prev->se.prev_sum_exec_runtime;
4960 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971 update_avg(&prev->se.avg_overlap, runtime);
4972 }
4973 prev->sched_class->put_prev_task(rq, prev);
4974}
4975
/*
 * Pick up the highest-prio task:
 */
4979static inline struct task_struct *
4980pick_next_task(struct rq *rq)
4981{
4982 const struct sched_class *class;
4983 struct task_struct *p;
4984
 /*
  * Optimization: we know that if all tasks are in
  * the fair class we can call that function directly:
  */
4989 if (likely(rq->nr_running == rq->cfs.nr_running)) {
4990 p = fair_sched_class.pick_next_task(rq);
4991 if (likely(p))
4992 return p;
4993 }
4994
4995 class = sched_class_highest;
4996 for ( ; ; ) {
4997 p = class->pick_next_task(rq);
4998 if (p)
4999 return p;
5000
5001
5002
5003
5004 class = class->next;
5005 }
5006}
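
/*
 * The generic path above walks the classes from sched_class_highest
 * downwards (real-time first, then fair, then idle), so the loop always
 * terminates: the idle class can always supply a task. The fast path
 * short-circuits this walk when every runnable task on the runqueue is in
 * the fair class, which is the common case.
 */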
5007
/*
 * schedule() is the main scheduler function.
 */
5011asmlinkage void __sched __schedule(void)
5012{
5013 struct task_struct *prev, *next;
5014 unsigned long *switch_count;
5015 struct rq *rq;
5016 int cpu;
5017
5018 cpu = smp_processor_id();
5019 rq = cpu_rq(cpu);
5020 rcu_qsctr_inc(cpu);
5021 prev = rq->curr;
5022 switch_count = &prev->nivcsw;
5023
5024 release_kernel_lock(prev);
5025need_resched_nonpreemptible:
5026
5027 schedule_debug(prev);
5028
5029 if (sched_feat(HRTICK))
5030 hrtick_clear(rq);
5031
5032 spin_lock_irq(&rq->lock);
5033 update_rq_clock(rq);
5034 clear_tsk_need_resched(prev);
5035
5036 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
5037 if (unlikely(signal_pending_state(prev->state, prev)))
5038 prev->state = TASK_RUNNING;
5039 else
5040 deactivate_task(rq, prev, 1);
5041 switch_count = &prev->nvcsw;
5042 }
5043
5044#ifdef CONFIG_SMP
5045 if (prev->sched_class->pre_schedule)
5046 prev->sched_class->pre_schedule(rq, prev);
5047#endif
5048
5049 if (unlikely(!rq->nr_running))
5050 idle_balance(cpu, rq);
5051
5052 put_prev_task(rq, prev);
5053 next = pick_next_task