1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h>
37#include <linux/interrupt.h>
38#include <linux/capability.h>
39#include <linux/completion.h>
40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h>
42#include <linux/perf_counter.h>
43#include <linux/security.h>
44#include <linux/notifier.h>
45#include <linux/profile.h>
46#include <linux/freezer.h>
47#include <linux/vmalloc.h>
48#include <linux/blkdev.h>
49#include <linux/delay.h>
50#include <linux/pid_namespace.h>
51#include <linux/smp.h>
52#include <linux/threads.h>
53#include <linux/timer.h>
54#include <linux/rcupdate.h>
55#include <linux/cpu.h>
56#include <linux/cpuset.h>
57#include <linux/percpu.h>
58#include <linux/kthread.h>
59#include <linux/proc_fs.h>
60#include <linux/seq_file.h>
61#include <linux/sysctl.h>
62#include <linux/syscalls.h>
63#include <linux/times.h>
64#include <linux/tsacct_kern.h>
65#include <linux/kprobes.h>
66#include <linux/delayacct.h>
67#include <linux/reciprocal_div.h>
68#include <linux/unistd.h>
69#include <linux/pagemap.h>
70#include <linux/hrtimer.h>
71#include <linux/tick.h>
72#include <linux/debugfs.h>
73#include <linux/ctype.h>
74#include <linux/ftrace.h>
75
76#include <asm/tlb.h>
77#include <asm/irq_regs.h>
78
79#include "sched_cpupri.h"
80
81#define CREATE_TRACE_POINTS
82#include <trace/events/sched.h>
83
84
85
86
87
88
/*
 * Conversions between a nice value and a static priority: the nice range
 * sits directly above the real-time priorities, shifted up by 20.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)

/*
 * "User priority": a priority expressed relative to MAX_RT_PRIO, so that
 * MAX_RT_PRIO itself maps to 0 and MAX_PRIO maps to MAX_USER_PRIO.
 */
#define USER_PRIO(p)		((p) - MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

/*
 * Convert a nanosecond count to jiffies. The value is first cast to
 * unsigned long and the division truncates.
 */
#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

/* Aliases for the scheduler load scale and shift. */
#define NICE_0_LOAD		SCHED_LOAD_SCALE
#define NICE_0_SHIFT		SCHED_LOAD_SHIFT

/* Default timeslice: 100 milliseconds expressed in jiffies. */
#define DEF_TIMESLICE		(100 * HZ / 1000)

/*
 * All-ones u64 used as the "no runtime limit" marker: see
 * global_rt_runtime() and start_rt_bandwidth().
 */
#define RUNTIME_INF		((u64)~0ULL)
122
123#ifdef CONFIG_SMP
124
125static void double_rq_lock(struct rq *rq1, struct rq *rq2);
126
127
128
129
130
131static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
132{
133 return reciprocal_divide(load, sg->reciprocal_cpu_power);
134}
135
136
137
138
139
140static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
141{
142 sg->__cpu_power += val;
143 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
144}
145#endif
146
147static inline int rt_policy(int policy)
148{
149 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
150 return 1;
151 return 0;
152}
153
154static inline int task_has_rt_policy(struct task_struct *p)
155{
156 return rt_policy(p->policy);
157}
158
159
160
161
162struct rt_prio_array {
163 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);
164 struct list_head queue[MAX_RT_PRIO];
165};
166
167struct rt_bandwidth {
168
169 spinlock_t rt_runtime_lock;
170 ktime_t rt_period;
171 u64 rt_runtime;
172 struct hrtimer rt_period_timer;
173};
174
175static struct rt_bandwidth def_rt_bandwidth;
176
177static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
178
179static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
180{
181 struct rt_bandwidth *rt_b =
182 container_of(timer, struct rt_bandwidth, rt_period_timer);
183 ktime_t now;
184 int overrun;
185 int idle = 0;
186
187 for (;;) {
188 now = hrtimer_cb_get_time(timer);
189 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
190
191 if (!overrun)
192 break;
193
194 idle = do_sched_rt_period_timer(rt_b, overrun);
195 }
196
197 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
198}
199
200static
201void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
202{
203 rt_b->rt_period = ns_to_ktime(period);
204 rt_b->rt_runtime = runtime;
205
206 spin_lock_init(&rt_b->rt_runtime_lock);
207
208 hrtimer_init(&rt_b->rt_period_timer,
209 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
210 rt_b->rt_period_timer.function = sched_rt_period_timer;
211}
212
213static inline int rt_bandwidth_enabled(void)
214{
215 return sysctl_sched_rt_runtime >= 0;
216}
217
218static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
219{
220 ktime_t now;
221
222 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
223 return;
224
225 if (hrtimer_active(&rt_b->rt_period_timer))
226 return;
227
228 spin_lock(&rt_b->rt_runtime_lock);
229 for (;;) {
230 unsigned long delta;
231 ktime_t soft, hard;
232
233 if (hrtimer_active(&rt_b->rt_period_timer))
234 break;
235
236 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
237 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
238
239 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
240 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
241 delta = ktime_to_ns(ktime_sub(hard, soft));
242 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
243 HRTIMER_MODE_ABS_PINNED, 0);
244 }
245 spin_unlock(&rt_b->rt_runtime_lock);
246}
247
248#ifdef CONFIG_RT_GROUP_SCHED
249static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
250{
251 hrtimer_cancel(&rt_b->rt_period_timer);
252}
253#endif
254
255
256
257
258
/* Mutex for the sched domains; its users lie outside this part of the file. */
static DEFINE_MUTEX(sched_domains_mutex);
260
261#ifdef CONFIG_GROUP_SCHED
262
263#include <linux/cgroup.h>
264
265struct cfs_rq;
266
267static LIST_HEAD(task_groups);
268
269
270struct task_group {
271#ifdef CONFIG_CGROUP_SCHED
272 struct cgroup_subsys_state css;
273#endif
274
275#ifdef CONFIG_USER_SCHED
276 uid_t uid;
277#endif
278
279#ifdef CONFIG_FAIR_GROUP_SCHED
280
281 struct sched_entity **se;
282
283 struct cfs_rq **cfs_rq;
284 unsigned long shares;
285#endif
286
287#ifdef CONFIG_RT_GROUP_SCHED
288 struct sched_rt_entity **rt_se;
289 struct rt_rq **rt_rq;
290
291 struct rt_bandwidth rt_bandwidth;
292#endif
293
294 struct rcu_head rcu;
295 struct list_head list;
296
297 struct task_group *parent;
298 struct list_head siblings;
299 struct list_head children;
300};
301
302#ifdef CONFIG_USER_SCHED
303
304
305void set_tg_uid(struct user_struct *user)
306{
307 user->tg->uid = user->uid;
308}
309
310
311
312
313
314
315struct task_group root_task_group;
316
317#ifdef CONFIG_FAIR_GROUP_SCHED
318
319static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
320
321static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
322#endif
323
324#ifdef CONFIG_RT_GROUP_SCHED
325static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
326static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
327#endif
328#else
329#define root_task_group init_task_group
330#endif
331
332
333
334
/* Spinlock for task group bookkeeping; its users are outside this chunk. */
static DEFINE_SPINLOCK(task_group_lock);

#ifdef CONFIG_SMP
/* Return non-zero if the root task group has no child groups. */
static int root_task_group_empty(void)
{
	const struct list_head *kids = &root_task_group.children;

	return list_empty(kids);
}
#endif
343
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Initial load of the default task group: twice the nice-0 load with
 * CONFIG_USER_SCHED, exactly the nice-0 load otherwise.
 */
#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
#else
# define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
#endif

/*
 * Bounds for a group's per-cpu shares; update_group_shares_cpu() clamps
 * its result to [MIN_SHARES, MAX_SHARES].
 */
#define MIN_SHARES	2
#define MAX_SHARES	(1UL << 18)

static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif
364
365
366
367
368struct task_group init_task_group;
369
370
371static inline struct task_group *task_group(struct task_struct *p)
372{
373 struct task_group *tg;
374
375#ifdef CONFIG_USER_SCHED
376 rcu_read_lock();
377 tg = __task_cred(p)->user->tg;
378 rcu_read_unlock();
379#elif defined(CONFIG_CGROUP_SCHED)
380 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
381 struct task_group, css);
382#else
383 tg = &init_task_group;
384#endif
385 return tg;
386}
387
388
/*
 * Point @p's scheduling entities at the per-cpu runqueues and parent
 * entities that its task group owns on @cpu. Which pairs are updated
 * depends on the fair/RT group scheduling options.
 */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
	/* Fair class: group runqueue and parent entity for @cpu. */
	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
	p->se.parent = task_group(p)->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	/* RT class: group runqueue and parent entity for @cpu. */
	p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
	p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}
401
402#else
403
404#ifdef CONFIG_SMP
405static int root_task_group_empty(void)
406{
407 return 1;
408}
409#endif
410
411static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
412static inline struct task_group *task_group(struct task_struct *p)
413{
414 return NULL;
415}
416
417#endif
418
419
420struct cfs_rq {
421 struct load_weight load;
422 unsigned long nr_running;
423
424 u64 exec_clock;
425 u64 min_vruntime;
426
427 struct rb_root tasks_timeline;
428 struct rb_node *rb_leftmost;
429
430 struct list_head tasks;
431 struct list_head *balance_iterator;
432
433
434
435
436
437 struct sched_entity *curr, *next, *last;
438
439 unsigned int nr_spread_over;
440
441#ifdef CONFIG_FAIR_GROUP_SCHED
442 struct rq *rq;
443
444
445
446
447
448
449
450
451
452 struct list_head leaf_cfs_rq_list;
453 struct task_group *tg;
454
455#ifdef CONFIG_SMP
456
457
458
459 unsigned long task_weight;
460
461
462
463
464
465
466
467 unsigned long h_load;
468
469
470
471
472 unsigned long shares;
473
474
475
476
477 unsigned long rq_weight;
478#endif
479#endif
480};
481
482
483struct rt_rq {
484 struct rt_prio_array active;
485 unsigned long rt_nr_running;
486#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
487 struct {
488 int curr;
489#ifdef CONFIG_SMP
490 int next;
491#endif
492 } highest_prio;
493#endif
494#ifdef CONFIG_SMP
495 unsigned long rt_nr_migratory;
496 unsigned long rt_nr_total;
497 int overloaded;
498 struct plist_head pushable_tasks;
499#endif
500 int rt_throttled;
501 u64 rt_time;
502 u64 rt_runtime;
503
504 spinlock_t rt_runtime_lock;
505
506#ifdef CONFIG_RT_GROUP_SCHED
507 unsigned long rt_nr_boosted;
508
509 struct rq *rq;
510 struct list_head leaf_rt_rq_list;
511 struct task_group *tg;
512 struct sched_rt_entity *rt_se;
513#endif
514};
515
516#ifdef CONFIG_SMP
517
518
519
520
521
522
523
524
525
526struct root_domain {
527 atomic_t refcount;
528 cpumask_var_t span;
529 cpumask_var_t online;
530
531
532
533
534
535 cpumask_var_t rto_mask;
536 atomic_t rto_count;
537#ifdef CONFIG_SMP
538 struct cpupri cpupri;
539#endif
540#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
541
542
543
544
545
546 unsigned int sched_mc_preferred_wakeup_cpu;
547#endif
548};
549
550
551
552
553
554static struct root_domain def_root_domain;
555
556#endif
557
558
559
560
561
562
563
564
565struct rq {
566
567 spinlock_t lock;
568
569
570
571
572
573 unsigned long nr_running;
574 #define CPU_LOAD_IDX_MAX 5
575 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
576#ifdef CONFIG_NO_HZ
577 unsigned long last_tick_seen;
578 unsigned char in_nohz_recently;
579#endif
580
581 struct load_weight load;
582 unsigned long nr_load_updates;
583 u64 nr_switches;
584 u64 nr_migrations_in;
585
586 struct cfs_rq cfs;
587 struct rt_rq rt;
588
589#ifdef CONFIG_FAIR_GROUP_SCHED
590
591 struct list_head leaf_cfs_rq_list;
592#endif
593#ifdef CONFIG_RT_GROUP_SCHED
594 struct list_head leaf_rt_rq_list;
595#endif
596
597
598
599
600
601
602
603 unsigned long nr_uninterruptible;
604
605 struct task_struct *curr, *idle;
606 unsigned long next_balance;
607 struct mm_struct *prev_mm;
608
609 u64 clock;
610
611 atomic_t nr_iowait;
612
613#ifdef CONFIG_SMP
614 struct root_domain *rd;
615 struct sched_domain *sd;
616
617 unsigned char idle_at_tick;
618
619 int active_balance;
620 int push_cpu;
621
622 int cpu;
623 int online;
624
625 unsigned long avg_load_per_task;
626
627 struct task_struct *migration_thread;
628 struct list_head migration_queue;
629#endif
630
631
632 unsigned long calc_load_update;
633 long calc_load_active;
634
635#ifdef CONFIG_SCHED_HRTICK
636#ifdef CONFIG_SMP
637 int hrtick_csd_pending;
638 struct call_single_data hrtick_csd;
639#endif
640 struct hrtimer hrtick_timer;
641#endif
642
643#ifdef CONFIG_SCHEDSTATS
644
645 struct sched_info rq_sched_info;
646 unsigned long long rq_cpu_time;
647
648
649
650 unsigned int yld_count;
651
652
653 unsigned int sched_switch;
654 unsigned int sched_count;
655 unsigned int sched_goidle;
656
657
658 unsigned int ttwu_count;
659 unsigned int ttwu_local;
660
661
662 unsigned int bkl_count;
663#endif
664};
665
666static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
667
668static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
669{
670 rq->curr->sched_class->check_preempt_curr(rq, p, sync);
671}
672
/* Cpu number of @rq; always 0 on uniprocessor builds. */
static inline int cpu_of(struct rq *rq)
{
#ifndef CONFIG_SMP
	return 0;
#else
	return rq->cpu;
#endif
}
681
682
683
684
685
686
687
688
/*
 * Walk the sched domains of @cpu from rq->sd up through ->parent. The
 * first pointer is fetched with rcu_dereference(), so callers are
 * presumably expected to be in an RCU read-side section.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

/* Runqueue accessors. */
#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		(&__get_cpu_var(runqueues))
#define task_rq(p)		cpu_rq(task_cpu(p))
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
696
697inline void update_rq_clock(struct rq *rq)
698{
699 rq->clock = sched_clock_cpu(cpu_of(rq));
700}
701
702
703
704
/*
 * Qualifier for tunables: writable (and __read_mostly) when
 * CONFIG_SCHED_DEBUG is set, a static constant otherwise.
 */
#ifdef CONFIG_SCHED_DEBUG
# define const_debug	__read_mostly
#else
# define const_debug	static const
#endif
710
711
712
713
714
715
716
717
718int runqueue_is_locked(void)
719{
720 int cpu = get_cpu();
721 struct rq *rq = cpu_rq(cpu);
722 int ret;
723
724 ret = spin_is_locked(&rq->lock);
725 put_cpu();
726 return ret;
727}
728
729
730
731
732
733#define SCHED_FEAT(name, enabled) \
734 __SCHED_FEAT_##name ,
735
736enum {
737#include "sched_features.h"
738};
739
740#undef SCHED_FEAT
741
742#define SCHED_FEAT(name, enabled) \
743 (1UL << __SCHED_FEAT_##name) * enabled |
744
745const_debug unsigned int sysctl_sched_features =
746#include "sched_features.h"
747 0;
748
749#undef SCHED_FEAT
750
751#ifdef CONFIG_SCHED_DEBUG
752#define SCHED_FEAT(name, enabled) \
753 #name ,
754
755static __read_mostly char *sched_feat_names[] = {
756#include "sched_features.h"
757 NULL
758};
759
760#undef SCHED_FEAT
761
762static int sched_feat_show(struct seq_file *m, void *v)
763{
764 int i;
765
766 for (i = 0; sched_feat_names[i]; i++) {
767 if (!(sysctl_sched_features & (1UL << i)))
768 seq_puts(m, "NO_");
769 seq_printf(m, "%s ", sched_feat_names[i]);
770 }
771 seq_puts(m, "\n");
772
773 return 0;
774}
775
776static ssize_t
777sched_feat_write(struct file *filp, const char __user *ubuf,
778 size_t cnt, loff_t *ppos)
779{
780 char buf[64];
781 char *cmp = buf;
782 int neg = 0;
783 int i;
784
785 if (cnt > 63)
786 cnt = 63;
787
788 if (copy_from_user(&buf, ubuf, cnt))
789 return -EFAULT;
790
791 buf[cnt] = 0;
792
793 if (strncmp(buf, "NO_", 3) == 0) {
794 neg = 1;
795 cmp += 3;
796 }
797
798 for (i = 0; sched_feat_names[i]; i++) {
799 int len = strlen(sched_feat_names[i]);
800
801 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
802 if (neg)
803 sysctl_sched_features &= ~(1UL << i);
804 else
805 sysctl_sched_features |= (1UL << i);
806 break;
807 }
808 }
809
810 if (!sched_feat_names[i])
811 return -EINVAL;
812
813 filp->f_pos += cnt;
814
815 return cnt;
816}
817
818static int sched_feat_open(struct inode *inode, struct file *filp)
819{
820 return single_open(filp, sched_feat_show, NULL);
821}
822
823static struct file_operations sched_feat_fops = {
824 .open = sched_feat_open,
825 .write = sched_feat_write,
826 .read = seq_read,
827 .llseek = seq_lseek,
828 .release = single_release,
829};
830
831static __init int sched_init_debug(void)
832{
833 debugfs_create_file("sched_features", 0644, NULL, NULL,
834 &sched_feat_fops);
835
836 return 0;
837}
838late_initcall(sched_init_debug);
839
840#endif
841
/* Non-zero if scheduler feature @x is enabled in sysctl_sched_features. */
#define sched_feat(x)	(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
843
844
845
846
847
848const_debug unsigned int sysctl_sched_nr_migrate = 32;
849
850
851
852
853
854unsigned int sysctl_sched_shares_ratelimit = 250000;
855
856
857
858
859
860
861unsigned int sysctl_sched_shares_thresh = 4;
862
863
864
865
866
867unsigned int sysctl_sched_rt_period = 1000000;
868
869static __read_mostly int scheduler_running;
870
871
872
873
874
875int sysctl_sched_rt_runtime = 950000;
876
877static inline u64 global_rt_period(void)
878{
879 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
880}
881
882static inline u64 global_rt_runtime(void)
883{
884 if (sysctl_sched_rt_runtime < 0)
885 return RUNTIME_INF;
886
887 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
888}
889
/* Empty defaults for architectures that do not provide these hooks. */
#ifndef prepare_arch_switch
# define prepare_arch_switch(next)	do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev)	do { } while (0)
#endif
896
897static inline int task_current(struct rq *rq, struct task_struct *p)
898{
899 return rq->curr == p;
900}
901
902#ifndef __ARCH_WANT_UNLOCKED_CTXSW
/* Locked context switch: a task is running iff it is rq->curr. */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	const int is_curr = task_current(rq, p);

	return is_curr;
}
907
/* Locked context switch: nothing to prepare. */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
	/* intentionally empty */
}
911
912static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
913{
914#ifdef CONFIG_DEBUG_SPINLOCK
915
916 rq->lock.owner = current;
917#endif
918
919
920
921
922
923 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
924
925 spin_unlock_irq(&rq->lock);
926}
927
928#else
/*
 * Unlocked context switch: on SMP a task counts as running while its
 * ->oncpu flag is set; on UP it is running iff it is rq->curr.
 */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifndef CONFIG_SMP
	return task_current(rq, p);
#else
	return p->oncpu;
#endif
}
937
938static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
939{
940#ifdef CONFIG_SMP
941
942
943
944
945
946 next->oncpu = 1;
947#endif
948#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
949 spin_unlock_irq(&rq->lock);
950#else
951 spin_unlock(&rq->lock);
952#endif
953}
954
/*
 * Unlocked context switch, second half: clear @prev's ->oncpu flag (SMP
 * only) and re-enable interrupts unless they were already enabled for the
 * switch.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/* Order earlier stores before ->oncpu is cleared. */
	smp_wmb();
	prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
970#endif
971
972
973
974
975
976static inline struct rq *__task_rq_lock(struct task_struct *p)
977 __acquires(rq->lock)
978{
979 for (;;) {
980 struct rq *rq = task_rq(p);
981 spin_lock(&rq->lock);
982 if (likely(rq == task_rq(p)))
983 return rq;
984 spin_unlock(&rq->lock);
985 }
986}
987
988
989
990
991
992
993static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
994 __acquires(rq->lock)
995{
996 struct rq *rq;
997
998 for (;;) {
999 local_irq_save(*flags);
1000 rq = task_rq(p);
1001 spin_lock(&rq->lock);
1002 if (likely(rq == task_rq(p)))
1003 return rq;
1004 spin_unlock_irqrestore(&rq->lock, *flags);
1005 }
1006}
1007
1008void task_rq_unlock_wait(struct task_struct *p)
1009{
1010 struct rq *rq = task_rq(p);
1011
1012 smp_mb();
1013 spin_unlock_wait(&rq->lock);
1014}
1015
1016static void __task_rq_unlock(struct rq *rq)
1017 __releases(rq->lock)
1018{
1019 spin_unlock(&rq->lock);
1020}
1021
1022static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
1023 __releases(rq->lock)
1024{
1025 spin_unlock_irqrestore(&rq->lock, *flags);
1026}
1027
1028
1029
1030
1031static struct rq *this_rq_lock(void)
1032 __acquires(rq->lock)
1033{
1034 struct rq *rq;
1035
1036 local_irq_disable();
1037 rq = this_rq();
1038 spin_lock(&rq->lock);
1039
1040 return rq;
1041}
1042
1043#ifdef CONFIG_SCHED_HRTICK
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060static inline int hrtick_enabled(struct rq *rq)
1061{
1062 if (!sched_feat(HRTICK))
1063 return 0;
1064 if (!cpu_active(cpu_of(rq)))
1065 return 0;
1066 return hrtimer_is_hres_active(&rq->hrtick_timer);
1067}
1068
1069static void hrtick_clear(struct rq *rq)
1070{
1071 if (hrtimer_active(&rq->hrtick_timer))
1072 hrtimer_cancel(&rq->hrtick_timer);
1073}
1074
1075
1076
1077
1078
1079static enum hrtimer_restart hrtick(struct hrtimer *timer)
1080{
1081 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1082
1083 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1084
1085 spin_lock(&rq->lock);
1086 update_rq_clock(rq);
1087 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1088 spin_unlock(&rq->lock);
1089
1090 return HRTIMER_NORESTART;
1091}
1092
1093#ifdef CONFIG_SMP
1094
1095
1096
1097static void __hrtick_start(void *arg)
1098{
1099 struct rq *rq = arg;
1100
1101 spin_lock(&rq->lock);
1102 hrtimer_restart(&rq->hrtick_timer);
1103 rq->hrtick_csd_pending = 0;
1104 spin_unlock(&rq->lock);
1105}
1106
1107
1108
1109
1110
1111
1112static void hrtick_start(struct rq *rq, u64 delay)
1113{
1114 struct hrtimer *timer = &rq->hrtick_timer;
1115 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1116
1117 hrtimer_set_expires(timer, time);
1118
1119 if (rq == this_rq()) {
1120 hrtimer_restart(timer);
1121 } else if (!rq->hrtick_csd_pending) {
1122 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1123 rq->hrtick_csd_pending = 1;
1124 }
1125}
1126
1127static int
1128hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1129{
1130 int cpu = (int)(long)hcpu;
1131
1132 switch (action) {
1133 case CPU_UP_CANCELED:
1134 case CPU_UP_CANCELED_FROZEN:
1135 case CPU_DOWN_PREPARE:
1136 case CPU_DOWN_PREPARE_FROZEN:
1137 case CPU_DEAD:
1138 case CPU_DEAD_FROZEN:
1139 hrtick_clear(cpu_rq(cpu));
1140 return NOTIFY_OK;
1141 }
1142
1143 return NOTIFY_DONE;
1144}
1145
1146static __init void init_hrtick(void)
1147{
1148 hotcpu_notifier(hotplug_hrtick, 0);
1149}
1150#else
1151
1152
1153
1154
1155
1156static void hrtick_start(struct rq *rq, u64 delay)
1157{
1158 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1159 HRTIMER_MODE_REL_PINNED, 0);
1160}
1161
1162static inline void init_hrtick(void)
1163{
1164}
1165#endif
1166
1167static void init_rq_hrtick(struct rq *rq)
1168{
1169#ifdef CONFIG_SMP
1170 rq->hrtick_csd_pending = 0;
1171
1172 rq->hrtick_csd.flags = 0;
1173 rq->hrtick_csd.func = __hrtick_start;
1174 rq->hrtick_csd.info = rq;
1175#endif
1176
1177 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1178 rq->hrtick_timer.function = hrtick;
1179}
1180#else
/* Stubs used when CONFIG_SCHED_HRTICK is not set: all are no-ops. */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}

static inline void init_hrtick(void)
{
}
1192#endif
1193
1194
1195
1196
1197
1198
1199
1200
1201#ifdef CONFIG_SMP
1202
1203#ifndef tsk_is_polling
1204#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1205#endif
1206
1207static void resched_task(struct task_struct *p)
1208{
1209 int cpu;
1210
1211 assert_spin_locked(&task_rq(p)->lock);
1212
1213 if (test_tsk_need_resched(p))
1214 return;
1215
1216 set_tsk_need_resched(p);
1217
1218 cpu = task_cpu(p);
1219 if (cpu == smp_processor_id())
1220 return;
1221
1222
1223 smp_mb();
1224 if (!tsk_is_polling(p))
1225 smp_send_reschedule(cpu);
1226}
1227
1228static void resched_cpu(int cpu)
1229{
1230 struct rq *rq = cpu_rq(cpu);
1231 unsigned long flags;
1232
1233 if (!spin_trylock_irqsave(&rq->lock, flags))
1234 return;
1235 resched_task(cpu_curr(cpu));
1236 spin_unlock_irqrestore(&rq->lock, flags);
1237}
1238
1239#ifdef CONFIG_NO_HZ
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250void wake_up_idle_cpu(int cpu)
1251{
1252 struct rq *rq = cpu_rq(cpu);
1253
1254 if (cpu == smp_processor_id())
1255 return;
1256
1257
1258
1259
1260
1261
1262
1263
1264 if (rq->curr != rq->idle)
1265 return;
1266
1267
1268
1269
1270
1271
1272 set_tsk_need_resched(rq->idle);
1273
1274
1275 smp_mb();
1276 if (!tsk_is_polling(rq->idle))
1277 smp_send_reschedule(cpu);
1278}
1279#endif
1280
1281#else
1282static void resched_task(struct task_struct *p)
1283{
1284 assert_spin_locked(&task_rq(p)->lock);
1285 set_tsk_need_resched(p);
1286}
1287#endif
1288
/*
 * Fixed-point scale for inverse weights: 2^32 where unsigned long can hold
 * it, otherwise the largest unsigned long (2^32 - 1).
 */
#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

/* Shift right by @y with rounding: adds half of 2^y before shifting. */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1301
1302
1303
1304
1305static unsigned long
1306calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1307 struct load_weight *lw)
1308{
1309 u64 tmp;
1310
1311 if (!lw->inv_weight) {
1312 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1313 lw->inv_weight = 1;
1314 else
1315 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1316 / (lw->weight+1);
1317 }
1318
1319 tmp = (u64)delta_exec * weight;
1320
1321
1322
1323 if (unlikely(tmp > WMULT_CONST))
1324 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1325 WMULT_SHIFT/2);
1326 else
1327 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1328
1329 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1330}
1331
1332static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1333{
1334 lw->weight += inc;
1335 lw->inv_weight = 0;
1336}
1337
1338static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1339{
1340 lw->weight -= dec;
1341 lw->inv_weight = 0;
1342}
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353#define WEIGHT_IDLEPRIO 3
1354#define WMULT_IDLEPRIO 1431655765
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368static const int prio_to_weight[40] = {
1369 88761, 71755, 56483, 46273, 36291,
1370 29154, 23254, 18705, 14949, 11916,
1371 9548, 7620, 6100, 4904, 3906,
1372 3121, 2501, 1991, 1586, 1277,
1373 1024, 820, 655, 526, 423,
1374 335, 272, 215, 172, 137,
1375 110, 87, 70, 56, 45,
1376 36, 29, 23, 18, 15,
1377};
1378
1379
1380
1381
1382
1383
1384
1385
1386static const u32 prio_to_wmult[40] = {
1387 48388, 59856, 76040, 92818, 118348,
1388 147320, 184698, 229616, 287308, 360437,
1389 449829, 563644, 704093, 875809, 1099582,
1390 1376151, 1717300, 2157191, 2708050, 3363326,
1391 4194304, 5237765, 6557202, 8165337, 10153587,
1392 12820798, 15790321, 19976592, 24970740, 31350126,
1393 39045157, 49367440, 61356676, 76695844, 95443717,
1394 119304647, 148102320, 186737708, 238609294, 286331153,
1395};
1396
1397static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1398
1399
1400
1401
1402
1403
/*
 * Callback-based task iterator handed to balance_tasks() and
 * iter_move_one_task(): @start and @next each take @arg and return a task.
 */
struct rq_iterator {
	void *arg;					/* passed to the callbacks */
	struct task_struct *(*start)(void *);		/* begin the iteration */
	struct task_struct *(*next)(void *);		/* advance the iteration */
};
1409
1410#ifdef CONFIG_SMP
1411static unsigned long
1412balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1413 unsigned long max_load_move, struct sched_domain *sd,
1414 enum cpu_idle_type idle, int *all_pinned,
1415 int *this_best_prio, struct rq_iterator *iterator);
1416
1417static int
1418iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1419 struct sched_domain *sd, enum cpu_idle_type idle,
1420 struct rq_iterator *iterator);
1421#endif
1422
1423
/* Index of a cpu accounting statistic; CPUACCT_STAT_NSTATS is the count. */
enum cpuacct_stat_index {
	CPUACCT_STAT_USER,	/* user time */
	CPUACCT_STAT_SYSTEM,	/* system time */

	CPUACCT_STAT_NSTATS,
};
1430
1431#ifdef CONFIG_CGROUP_CPUACCT
1432static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1433static void cpuacct_update_stats(struct task_struct *tsk,
1434 enum cpuacct_stat_index idx, cputime_t val);
1435#else
1436static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1437static inline void cpuacct_update_stats(struct task_struct *tsk,
1438 enum cpuacct_stat_index idx, cputime_t val) {}
1439#endif
1440
1441static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1442{
1443 update_load_add(&rq->load, load);
1444}
1445
1446static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1447{
1448 update_load_sub(&rq->load, load);
1449}
1450
1451#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1452typedef int (*tg_visitor)(struct task_group *, void *);
1453
1454
1455
1456
1457
1458static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1459{
1460 struct task_group *parent, *child;
1461 int ret;
1462
1463 rcu_read_lock();
1464 parent = &root_task_group;
1465down:
1466 ret = (*down)(parent, data);
1467 if (ret)
1468 goto out_unlock;
1469 list_for_each_entry_rcu(child, &parent->children, siblings) {
1470 parent = child;
1471 goto down;
1472
1473up:
1474 continue;
1475 }
1476 ret = (*up)(parent, data);
1477 if (ret)
1478 goto out_unlock;
1479
1480 child = parent;
1481 parent = parent->parent;
1482 if (parent)
1483 goto up;
1484out_unlock:
1485 rcu_read_unlock();
1486
1487 return ret;
1488}
1489
/* Visitor that does nothing; used for the unused direction of a walk. */
static int tg_nop(struct task_group *tg, void *data)
{
	(void)tg;
	(void)data;

	return 0;
}
1494#endif
1495
1496#ifdef CONFIG_SMP
1497static unsigned long source_load(int cpu, int type);
1498static unsigned long target_load(int cpu, int type);
1499static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1500
1501static unsigned long cpu_avg_load_per_task(int cpu)
1502{
1503 struct rq *rq = cpu_rq(cpu);
1504 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1505
1506 if (nr_running)
1507 rq->avg_load_per_task = rq->load.weight / nr_running;
1508 else
1509 rq->avg_load_per_task = 0;
1510
1511 return rq->avg_load_per_task;
1512}
1513
1514#ifdef CONFIG_FAIR_GROUP_SCHED
1515
1516static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1517
1518
1519
1520
1521static void
1522update_group_shares_cpu(struct task_group *tg, int cpu,
1523 unsigned long sd_shares, unsigned long sd_rq_weight)
1524{
1525 unsigned long shares;
1526 unsigned long rq_weight;
1527
1528 if (!tg->se[cpu])
1529 return;
1530
1531 rq_weight = tg->cfs_rq[cpu]->rq_weight;
1532
1533
1534
1535
1536
1537
1538
1539 shares = (sd_shares * rq_weight) / sd_rq_weight;
1540 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1541
1542 if (abs(shares - tg->se[cpu]->load.weight) >
1543 sysctl_sched_shares_thresh) {
1544 struct rq *rq = cpu_rq(cpu);
1545 unsigned long flags;
1546
1547 spin_lock_irqsave(&rq->lock, flags);
1548 tg->cfs_rq[cpu]->shares = shares;
1549
1550 __set_se_shares(tg->se[cpu], shares);
1551 spin_unlock_irqrestore(&rq->lock, flags);
1552 }
1553}
1554
1555
1556
1557
1558
1559
1560static int tg_shares_up(struct task_group *tg, void *data)
1561{
1562 unsigned long weight, rq_weight = 0;
1563 unsigned long shares = 0;
1564 struct sched_domain *sd = data;
1565 int i;
1566
1567 for_each_cpu(i, sched_domain_span(sd)) {
1568
1569
1570
1571
1572
1573 weight = tg->cfs_rq[i]->load.weight;
1574 if (!weight)
1575 weight = NICE_0_LOAD;
1576
1577 tg->cfs_rq[i]->rq_weight = weight;
1578 rq_weight += weight;
1579 shares += tg->cfs_rq[i]->shares;
1580 }
1581
1582 if ((!shares && rq_weight) || shares > tg->shares)
1583 shares = tg->shares;
1584
1585 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1586 shares = tg->shares;
1587
1588 for_each_cpu(i, sched_domain_span(sd))
1589 update_group_shares_cpu(tg, i, shares, rq_weight);
1590
1591 return 0;
1592}
1593
1594
1595
1596
1597
1598
1599static int tg_load_down(struct task_group *tg, void *data)
1600{
1601 unsigned long load;
1602 long cpu = (long)data;
1603
1604 if (!tg->parent) {
1605 load = cpu_rq(cpu)->load.weight;
1606 } else {
1607 load = tg->parent->cfs_rq[cpu]->h_load;
1608 load *= tg->cfs_rq[cpu]->shares;
1609 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1610 }
1611
1612 tg->cfs_rq[cpu]->h_load = load;
1613
1614 return 0;
1615}
1616
1617static void update_shares(struct sched_domain *sd)
1618{
1619 u64 now = cpu_clock(raw_smp_processor_id());
1620 s64 elapsed = now - sd->last_update;
1621
1622 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1623 sd->last_update = now;
1624 walk_tg_tree(tg_nop, tg_shares_up, sd);
1625 }
1626}
1627
1628static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1629{
1630 spin_unlock(&rq->lock);
1631 update_shares(sd);
1632 spin_lock(&rq->lock);
1633}
1634
1635static void update_h_load(long cpu)
1636{
1637 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1638}
1639
1640#else
1641
/* Without fair group scheduling there are no shares to update. */
static inline void update_shares(struct sched_domain *sd)
{
	/* no-op */
}

static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
{
	/* no-op */
}
1649
1650#endif
1651
1652#ifdef CONFIG_PREEMPT
1653
1654
1655
1656
1657
1658
1659
1660
1661
/*
 * CONFIG_PREEMPT variant: unconditionally drop this_rq->lock and take both
 * runqueue locks through double_rq_lock().
 *
 * Always returns 1, i.e. this_rq->lock was released along the way.
 */
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(this_rq->lock)
	__acquires(busiest->lock)
	__acquires(this_rq->lock)
{
	spin_unlock(&this_rq->lock);
	double_rq_lock(this_rq, busiest);

	return 1;
}
1672
1673#else
1674
1675
1676
1677
1678
1679
1680
/*
 * Non-preempt variant: try to take busiest->lock while keeping this_rq->lock.
 *
 * If the trylock fails and busiest sits at a lower address than this_rq,
 * this_rq->lock is dropped and both locks are taken in ascending address
 * order; otherwise busiest->lock is simply waited for.
 *
 * Returns 1 if this_rq->lock was dropped and re-acquired, 0 if it was held
 * throughout.
 */
static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(this_rq->lock)
	__acquires(busiest->lock)
	__acquires(this_rq->lock)
{
	int ret = 0;

	if (unlikely(!spin_trylock(&busiest->lock))) {
		if (busiest < this_rq) {
			spin_unlock(&this_rq->lock);
			spin_lock(&busiest->lock);
			spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
			ret = 1;
		} else
			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
	}
	return ret;
}
1699
1700#endif
1701
1702
1703
1704
/*
 * Lock @busiest in addition to the already-held @this_rq lock.
 *
 * Must be called with interrupts disabled; otherwise this_rq->lock is
 * released and the kernel BUGs.  Returns the result of
 * _double_lock_balance(): non-zero if this_rq->lock was dropped meanwhile.
 */
static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
	if (unlikely(!irqs_disabled())) {
		/* release the runqueue lock before reporting the bug */
		spin_unlock(&this_rq->lock);
		BUG_ON(1);
	}

	return _double_lock_balance(this_rq, busiest);
}
1715
/*
 * Undo double_lock_balance(): release busiest->lock and reset the lockdep
 * subclass of this_rq->lock (which stays held) back to 0.
 */
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(busiest->lock)
{
	spin_unlock(&busiest->lock);
	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
1722#endif
1723
1724#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Record @shares in @cfs_rq.  The field is only written on CONFIG_SMP
 * builds; otherwise this is a no-op.
 */
static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
{
#ifdef CONFIG_SMP
	cfs_rq->shares = shares;
#endif
}
1731#endif
1732
1733static void calc_load_account_active(struct rq *this_rq);
1734
1735#include "sched_stats.h"
1736#include "sched_idletask.c"
1737#include "sched_fair.c"
1738#include "sched_rt.c"
1739#ifdef CONFIG_SCHED_DEBUG
1740# include "sched_debug.c"
1741#endif
1742
/* First scheduling class visited by for_each_class(). */
#define sched_class_highest (&rt_sched_class)
/* Walk @class from sched_class_highest along ->next until it is NULL. */
#define for_each_class(class) \
	for (class = sched_class_highest; class; class = class->next)
1746
/* Account one more runnable task on @rq. */
static void inc_nr_running(struct rq *rq)
{
	rq->nr_running++;
}
1751
/* Account one fewer runnable task on @rq. */
static void dec_nr_running(struct rq *rq)
{
	rq->nr_running--;
}
1756
1757static void set_load_weight(struct task_struct *p)
1758{
1759 if (task_has_rt_policy(p)) {
1760 p->se.load.weight = prio_to_weight[0] * 2;
1761 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1762 return;
1763 }
1764
1765
1766
1767
1768 if (p->policy == SCHED_IDLE) {
1769 p->se.load.weight = WEIGHT_IDLEPRIO;
1770 p->se.load.inv_weight = WMULT_IDLEPRIO;
1771 return;
1772 }
1773
1774 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1775 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1776}
1777
1778static void update_avg(u64 *avg, u64 sample)
1779{
1780 s64 diff = sample - *avg;
1781 *avg += diff >> 3;
1782}
1783
/*
 * Put @p on @rq through its scheduling class and mark it as queued.
 *
 * @wakeup: non-zero when the enqueue is due to a wakeup; in that case the
 *          current sum_exec_runtime is remembered as start_runtime.
 */
static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
{
	if (wakeup)
		p->se.start_runtime = p->se.sum_exec_runtime;

	sched_info_queued(p);
	p->sched_class->enqueue_task(rq, p, wakeup);
	p->se.on_rq = 1;
}
1793
1794static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1795{
1796 if (sleep) {
1797 if (p->se.last_wakeup) {
1798 update_avg(&p->se.avg_overlap,
1799 p->se.sum_exec_runtime - p->se.last_wakeup);
1800 p->se.last_wakeup = 0;
1801 } else {
1802 update_avg(&p->se.avg_wakeup,
1803 sysctl_sched_wakeup_granularity);
1804 }
1805 }
1806
1807 sched_info_dequeued(p);
1808 p->sched_class->dequeue_task(rq, p, sleep);
1809 p->se.on_rq = 0;
1810}
1811
1812
1813
1814
/* Normal priority of a non-RT task: simply its static priority. */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}
1819
1820
1821
1822
1823
1824
1825
1826
1827static inline int normal_prio(struct task_struct *p)
1828{
1829 int prio;
1830
1831 if (task_has_rt_policy(p))
1832 prio = MAX_RT_PRIO-1 - p->rt_priority;
1833 else
1834 prio = __normal_prio(p);
1835 return prio;
1836}
1837
1838
1839
1840
1841
1842
1843
1844
1845static int effective_prio(struct task_struct *p)
1846{
1847 p->normal_prio = normal_prio(p);
1848
1849
1850
1851
1852
1853 if (!rt_prio(p->prio))
1854 return p->normal_prio;
1855 return p->prio;
1856}
1857
1858
1859
1860
/*
 * activate_task - enqueue @p on @rq and bump the runnable count.
 *
 * A task that was counted in nr_uninterruptible is removed from that count
 * first.
 */
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, wakeup);
	inc_nr_running(rq);
}
1869
1870
1871
1872
/*
 * deactivate_task - dequeue @p from @rq and drop the runnable count.
 *
 * A task for which task_contributes_to_load() holds is added to
 * nr_uninterruptible first.
 */
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, sleep);
	dec_nr_running(rq);
}
1881
1882
1883
1884
1885
/*
 * task_curr - is @p the task currently executing on its CPU?
 *
 * Returns non-zero if @p is the current task of the CPU it is assigned to.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}
1890
/*
 * Low-level CPU assignment: attach @p to @cpu's runqueue structures and, on
 * SMP, publish the new CPU number in the task's thread_info.
 */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_rq(p, cpu);
#ifdef CONFIG_SMP
	/*
	 * The write barrier orders the set_task_rq() stores before the
	 * store of the new cpu number below.
	 */
	smp_wmb();
	task_thread_info(p)->cpu = cpu;
#endif
}
1904
1905static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1906 const struct sched_class *prev_class,
1907 int oldprio, int running)
1908{
1909 if (prev_class != p->sched_class) {
1910 if (prev_class->switched_from)
1911 prev_class->switched_from(rq, p, running);
1912 p->sched_class->switched_to(rq, p, running);
1913 } else
1914 p->sched_class->prio_changed(rq, p, oldprio, running);
1915}
1916
1917#ifdef CONFIG_SMP
1918
1919
/* Return the current load weight of @cpu's runqueue. */
static unsigned long weighted_cpuload(const int cpu)
{
	return cpu_rq(cpu)->load.weight;
}
1924
1925
1926
1927
1928static int
1929task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1930{
1931 s64 delta;
1932
1933
1934
1935
1936 if (sched_feat(CACHE_HOT_BUDDY) &&
1937 (&p->se == cfs_rq_of(&p->se)->next ||
1938 &p->se == cfs_rq_of(&p->se)->last))
1939 return 1;
1940
1941 if (p->sched_class != &fair_sched_class)
1942 return 0;
1943
1944 if (sysctl_sched_migration_cost == -1)
1945 return 1;
1946 if (sysctl_sched_migration_cost == 0)
1947 return 0;
1948
1949 delta = now - p->se.exec_start;
1950
1951 return delta < (s64)sysctl_sched_migration_cost;
1952}
1953
1954
/*
 * set_task_cpu - move the bookkeeping of @p from its current CPU to @new_cpu.
 *
 * Rebases the task's vruntime and (with schedstats) its timestamps onto the
 * destination runqueue, updates migration counters when the CPU actually
 * changes, and finally records the new CPU via __set_task_cpu().
 */
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
	int old_cpu = task_cpu(p);
	struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
	struct cfs_rq *old_cfsrq = task_cfs_rq(p),
		      *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
	u64 clock_offset;

	/* difference between the two runqueue clocks */
	clock_offset = old_rq->clock - new_rq->clock;

	trace_sched_migrate_task(p, new_cpu);

#ifdef CONFIG_SCHEDSTATS
	/* shift pending (non-zero) timestamps into the new rq's clock base */
	if (p->se.wait_start)
		p->se.wait_start -= clock_offset;
	if (p->se.sleep_start)
		p->se.sleep_start -= clock_offset;
	if (p->se.block_start)
		p->se.block_start -= clock_offset;
#endif
	if (old_cpu != new_cpu) {
		p->se.nr_migrations++;
		new_rq->nr_migrations_in++;
#ifdef CONFIG_SCHEDSTATS
		if (task_hot(p, old_rq->clock, NULL))
			schedstat_inc(p, se.nr_forced2_migrations);
#endif
		perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
				     1, 1, NULL, 0);
	}
	/* keep vruntime at the same distance from min_vruntime on the new rq */
	p->se.vruntime -= old_cfsrq->min_vruntime -
					 new_cfsrq->min_vruntime;

	__set_task_cpu(p, new_cpu);
}
1990
/* A request to move one task to another CPU, queued on a runqueue. */
struct migration_req {
	/* link in rq->migration_queue (see migrate_task()) */
	struct list_head list;

	/* task to move and the CPU it should go to */
	struct task_struct *task;
	int dest_cpu;

	/* waited on by the requester (see sched_migrate_task()) */
	struct completion done;
};
1999
2000
2001
2002
2003
/*
 * Start migrating @p to @dest_cpu.
 *
 * Returns 0 if the task was neither queued nor running and was therefore
 * moved on the spot.  Returns 1 if @req was filled in and added to the
 * runqueue's migration_queue; the caller then has to get the request
 * processed (sched_migrate_task() wakes rq->migration_thread and waits on
 * req->done).
 */
static int
migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
{
	struct rq *rq = task_rq(p);

	/*
	 * If the task is not on a runqueue (and not running), then
	 * it is sufficient to simply update the task's cpu field.
	 */
	if (!p->se.on_rq && !task_running(rq, p)) {
		set_task_cpu(p, dest_cpu);
		return 0;
	}

	init_completion(&req->done);
	req->task = p;
	req->dest_cpu = dest_cpu;
	list_add(&req->list, &rq->migration_queue);

	return 1;
}
2025
2026
2027
2028
2029
2030
2031
/*
 * wait_task_context_switch - busy-wait until @p has been switched out.
 *
 * Returns once @p is observed not running, or once either of its context
 * switch counters has advanced by more than one since entry.
 */
void wait_task_context_switch(struct task_struct *p)
{
	unsigned long nvcsw, nivcsw, flags;
	int running;
	struct rq *rq;

	/* snapshot the switch counters to detect progress later */
	nvcsw = p->nvcsw;
	nivcsw = p->nivcsw;
	for (;;) {
		/*
		 * Sample the running state under the runqueue lock; the lock
		 * is released again right away.
		 */
		rq = task_rq_lock(p, &flags);
		running = task_running(rq, p);
		task_rq_unlock(rq, &flags);

		if (likely(!running))
			break;
		/*
		 * Still running: give up waiting only after a counter moved
		 * by at least two.
		 */
		if ((p->nvcsw - nvcsw) > 1)
			break;
		if ((p->nivcsw - nivcsw) > 1)
			break;

		cpu_relax();
	}
}
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
/*
 * wait_task_inactive - wait for a task to be off its CPU and off the runqueue.
 *
 * @p:           task to wait for
 * @match_state: if non-zero, the wait is abandoned (returning 0) as soon as
 *               @p->state is seen to differ from it
 *
 * Returns 0 on a state mismatch.  Otherwise returns @p->nvcsw with the top
 * bit (LONG_MIN) set, which is therefore always non-zero.
 */
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
	unsigned long flags;
	int running, on_rq;
	unsigned long ncsw;
	struct rq *rq;

	for (;;) {
		/*
		 * Look up the runqueue without taking its lock; the result
		 * is only used for the lockless spin below.
		 */
		rq = task_rq(p);

		/*
		 * Spin without holding any lock while the task is running.
		 * The real check is repeated under the lock afterwards.
		 */
		while (task_running(rq, p)) {
			if (match_state && unlikely(p->state != match_state))
				return 0;
			cpu_relax();
		}

		/*
		 * Now take the runqueue lock and sample the task's state
		 * consistently.
		 */
		rq = task_rq_lock(p, &flags);
		trace_sched_wait_task(rq, p);
		running = task_running(rq, p);
		on_rq = p->se.on_rq;
		ncsw = 0;
		if (!match_state || p->state == match_state)
			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
		task_rq_unlock(rq, &flags);

		/*
		 * ncsw still 0 means the state did not match: fail.
		 */
		if (unlikely(!ncsw))
			break;

		/*
		 * The task started running again between the lockless spin
		 * and taking the lock: relax and retry.
		 */
		if (unlikely(running)) {
			cpu_relax();
			continue;
		}

		/*
		 * Not running but still queued: sleep for one tick via
		 * schedule_timeout_uninterruptible(1) instead of spinning,
		 * then retry.
		 */
		if (unlikely(on_rq)) {
			schedule_timeout_uninterruptible(1);
			continue;
		}

		/*
		 * Neither running nor queued: done.
		 */
		break;
	}

	return ncsw;
}
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
/*
 * kick_process - ask the CPU running @p to reschedule.
 *
 * Only acts if @p is the current task of a CPU other than the caller's; in
 * that case smp_send_reschedule() is issued to that CPU.  Preemption is
 * disabled around the check.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
2198
2199
2200
2201
2202
2203
2204
2205
2206static unsigned long source_load(int cpu, int type)
2207{
2208 struct rq *rq = cpu_rq(cpu);
2209 unsigned long total = weighted_cpuload(cpu);
2210
2211 if (type == 0 || !sched_feat(LB_BIAS))
2212 return total;
2213
2214 return min(rq->cpu_load[type-1], total);
2215}
2216
2217
2218
2219
2220
2221static unsigned long target_load(int cpu, int type)
2222{
2223 struct rq *rq = cpu_rq(cpu);
2224 unsigned long total = weighted_cpuload(cpu);
2225
2226 if (type == 0 || !sched_feat(LB_BIAS))
2227 return total;
2228
2229 return max(rq->cpu_load[type-1], total);
2230}
2231
2232
2233
2234
2235
/*
 * find_idlest_group - find the least loaded group in domain @sd for task @p.
 *
 * Returns NULL if no other eligible group exists or if the local group
 * (the one containing @this_cpu) is not sufficiently more loaded than the
 * idlest one, as judged by the domain's imbalance_pct.
 */
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
	unsigned long min_load = ULONG_MAX, this_load = 0;
	int load_idx = sd->forkexec_idx;
	/* half of the domain's imbalance margin, in percent */
	int imbalance = 100 + (sd->imbalance_pct-100)/2;

	do {
		unsigned long load, avg_load;
		int local_group;
		int i;

		/* Skip over this group if it has no CPUs allowed */
		if (!cpumask_intersects(sched_group_cpus(group),
					&p->cpus_allowed))
			continue;

		local_group = cpumask_test_cpu(this_cpu,
					       sched_group_cpus(group));

		/* Tally up the load of all CPUs in the group */
		avg_load = 0;

		for_each_cpu(i, sched_group_cpus(group)) {
			/* Low estimate for the local group, high for others */
			if (local_group)
				load = source_load(i, load_idx);
			else
				load = target_load(i, load_idx);

			avg_load += load;
		}

		/* Scale the summed load by the group's cpu power */
		avg_load = sg_div_cpu_power(group,
				avg_load * SCHED_LOAD_SCALE);

		if (local_group) {
			this_load = avg_load;
			this = group;
		} else if (avg_load < min_load) {
			min_load = avg_load;
			idlest = group;
		}
	} while (group = group->next, group != sd->groups);

	if (!idlest || 100*this_load < imbalance*min_load)
		return NULL;
	return idlest;
}
2287
2288
2289
2290
2291static int
2292find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2293{
2294 unsigned long load, min_load = ULONG_MAX;
2295 int idlest = -1;
2296 int i;
2297
2298
2299 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2300 load = weighted_cpuload(i);
2301
2302 if (load < min_load || (load == min_load && i == this_cpu)) {
2303 min_load = load;
2304 idlest = i;
2305 }
2306 }
2307
2308 return idlest;
2309}
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
/*
 * sched_balance_self - choose a CPU for the current task.
 *
 * @cpu:  CPU to start the search from
 * @flag: domain flag (the callers in this file pass SD_BALANCE_FORK or
 *        SD_BALANCE_EXEC) selecting which domains take part
 *
 * Starts at the highest domain of @cpu carrying @flag and repeatedly moves
 * to the idlest CPU of the idlest group, descending one level each time.
 * Returns the chosen CPU, which may be @cpu itself.
 */
static int sched_balance_self(int cpu, int flag)
{
	struct task_struct *t = current;
	struct sched_domain *tmp, *sd = NULL;

	for_each_domain(cpu, tmp) {
		/*
		 * Stop at the first domain with power-savings balancing set.
		 */
		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
			break;
		if (tmp->flags & flag)
			sd = tmp;
	}

	if (sd)
		update_shares(sd);

	while (sd) {
		struct sched_group *group;
		int new_cpu, weight;

		if (!(sd->flags & flag)) {
			sd = sd->child;
			continue;
		}

		group = find_idlest_group(sd, t, cpu);
		if (!group) {
			sd = sd->child;
			continue;
		}

		new_cpu = find_idlest_cpu(group, t, cpu);
		if (new_cpu == -1 || new_cpu == cpu) {
			/* Now try balancing at a lower domain level of cpu */
			sd = sd->child;
			continue;
		}

		/* Now try balancing at a lower domain level of new_cpu */
		cpu = new_cpu;
		weight = cpumask_weight(sched_domain_span(sd));
		sd = NULL;
		for_each_domain(cpu, tmp) {
			if (weight <= cpumask_weight(sched_domain_span(tmp)))
				break;
			if (tmp->flags & flag)
				sd = tmp;
		}
		/* while loop will break here if sd == NULL */
	}

	return cpu;
}
2377
2378#endif
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
/*
 * task_oncpu_function_call - call @func(@info) on the CPU where @p runs.
 *
 * The call is only made if @p is the current task of its CPU at the time of
 * the check; otherwise nothing happens.  Preemption is disabled around the
 * check and the call.
 */
void task_oncpu_function_call(struct task_struct *p,
			      void (*func) (void *info), void *info)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if (task_curr(p))
		smp_call_function_single(cpu, func, info, 1);
	preempt_enable();
}
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2416{
2417 int cpu, orig_cpu, this_cpu, success = 0;
2418 unsigned long flags;
2419 long old_state;
2420 struct rq *rq;
2421
2422 if (!sched_feat(SYNC_WAKEUPS))
2423 sync = 0;
2424
2425#ifdef CONFIG_SMP
2426 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2427 struct sched_domain *sd;
2428
2429 this_cpu = raw_smp_processor_id();
2430 cpu = task_cpu(p);
2431
2432 for_each_domain(this_cpu, sd) {
2433 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2434 update_shares(sd);
2435 break;
2436 }
2437 }
2438 }
2439#endif
2440
2441 smp_wmb();
2442 rq = task_rq_lock(p, &flags);
2443 update_rq_clock(rq);
2444 old_state = p->state;
2445 if (!(old_state & state))
2446 goto out;
2447
2448 if (p->se.on_rq)
2449 goto out_running;
2450
2451 cpu = task_cpu(p);
2452 orig_cpu = cpu;
2453 this_cpu = smp_processor_id();
2454
2455#ifdef CONFIG_SMP
2456 if (unlikely(task_running(rq, p)))
2457 goto out_activate;
2458
2459 cpu = p->sched_class->select_task_rq(p, sync);
2460 if (cpu != orig_cpu) {
2461 set_task_cpu(p, cpu);
2462 task_rq_unlock(rq, &flags);
2463
2464 rq = task_rq_lock(p, &flags);
2465 old_state = p->state;
2466 if (!(old_state & state))
2467 goto out;
2468 if (p->se.on_rq)
2469 goto out_running;
2470
2471 this_cpu = smp_processor_id();
2472 cpu = task_cpu(p);
2473 }
2474
2475#ifdef CONFIG_SCHEDSTATS
2476 schedstat_inc(rq, ttwu_count);
2477 if (cpu == this_cpu)
2478 schedstat_inc(rq, ttwu_local);
2479 else {
2480 struct sched_domain *sd;
2481 for_each_domain(this_cpu, sd) {
2482 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2483 schedstat_inc(sd, ttwu_wake_remote);
2484 break;
2485 }
2486 }
2487 }
2488#endif
2489
2490out_activate:
2491#endif
2492 schedstat_inc(p, se.nr_wakeups);
2493 if (sync)
2494 schedstat_inc(p, se.nr_wakeups_sync);
2495 if (orig_cpu != cpu)
2496 schedstat_inc(p, se.nr_wakeups_migrate);
2497 if (cpu == this_cpu)
2498 schedstat_inc(p, se.nr_wakeups_local);
2499 else
2500 schedstat_inc(p, se.nr_wakeups_remote);
2501 activate_task(rq, p, 1);
2502 success = 1;
2503
2504
2505
2506
2507 if (!in_interrupt()) {
2508 struct sched_entity *se = ¤t->se;
2509 u64 sample = se->sum_exec_runtime;
2510
2511 if (se->last_wakeup)
2512 sample -= se->last_wakeup;
2513 else
2514 sample -= se->start_runtime;
2515 update_avg(&se->avg_wakeup, sample);
2516
2517 se->last_wakeup = se->sum_exec_runtime;
2518 }
2519
2520out_running:
2521 trace_sched_wakeup(rq, p, success);
2522 check_preempt_curr(rq, p, sync);
2523
2524 p->state = TASK_RUNNING;
2525#ifdef CONFIG_SMP
2526 if (p->sched_class->task_wake_up)
2527 p->sched_class->task_wake_up(rq, p);
2528#endif
2529out:
2530 task_rq_unlock(rq, &flags);
2531
2532 return success;
2533}
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
/*
 * wake_up_process - wake up @p from any state in TASK_ALL.
 *
 * Returns the result of try_to_wake_up(): 1 if the task was activated by
 * this call, 0 otherwise.
 */
int wake_up_process(struct task_struct *p)
{
	return try_to_wake_up(p, TASK_ALL, 0);
}
EXPORT_SYMBOL(wake_up_process);
2551
/*
 * wake_up_state - wake up @p only if its state matches the @state mask.
 *
 * Returns the result of try_to_wake_up().
 */
int wake_up_state(struct task_struct *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);
}
2556
2557
2558
2559
2560
2561
2562
/*
 * Reset the scheduler-private state of a freshly created task @p:
 * runtime accounting, optional schedstats, list heads and the task state.
 */
static void __sched_fork(struct task_struct *p)
{
	p->se.exec_start		= 0;
	p->se.sum_exec_runtime		= 0;
	p->se.prev_sum_exec_runtime	= 0;
	p->se.nr_migrations		= 0;
	p->se.last_wakeup		= 0;
	p->se.avg_overlap		= 0;
	p->se.start_runtime		= 0;
	/* avg_wakeup starts at the wakeup granularity rather than at zero */
	p->se.avg_wakeup		= sysctl_sched_wakeup_granularity;

#ifdef CONFIG_SCHEDSTATS
	p->se.wait_start			= 0;
	p->se.wait_max				= 0;
	p->se.wait_count			= 0;
	p->se.wait_sum				= 0;

	p->se.sleep_start			= 0;
	p->se.sleep_max				= 0;
	p->se.sum_sleep_runtime			= 0;

	p->se.block_start			= 0;
	p->se.block_max				= 0;
	p->se.exec_max				= 0;
	p->se.slice_max				= 0;

	p->se.nr_migrations_cold		= 0;
	p->se.nr_failed_migrations_affine	= 0;
	p->se.nr_failed_migrations_running	= 0;
	p->se.nr_failed_migrations_hot		= 0;
	p->se.nr_forced_migrations		= 0;
	p->se.nr_forced2_migrations		= 0;

	p->se.nr_wakeups			= 0;
	p->se.nr_wakeups_sync			= 0;
	p->se.nr_wakeups_migrate		= 0;
	p->se.nr_wakeups_local			= 0;
	p->se.nr_wakeups_remote			= 0;
	p->se.nr_wakeups_affine			= 0;
	p->se.nr_wakeups_affine_attempts	= 0;
	p->se.nr_wakeups_passive		= 0;
	p->se.nr_wakeups_idle			= 0;

#endif

	INIT_LIST_HEAD(&p->rt.run_list);
	p->se.on_rq = 0;
	INIT_LIST_HEAD(&p->se.group_node);

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif

	/*
	 * The task is marked TASK_RUNNING here although it has not been put
	 * on a runqueue yet (on_rq was cleared above); wake_up_new_task()
	 * insists on this state.
	 */
	p->state = TASK_RUNNING;
}
2624
2625
2626
2627
/*
 * sched_fork - scheduler setup for a newly forked task @p.
 *
 * Resets the scheduler state, picks a CPU (balanced with SD_BALANCE_FORK on
 * SMP) and initialises priority and class.  @clone_flags is not used by the
 * body.  The work is bracketed by get_cpu()/put_cpu().
 */
void sched_fork(struct task_struct *p, int clone_flags)
{
	int cpu = get_cpu();

	__sched_fork(p);

#ifdef CONFIG_SMP
	cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
#endif
	set_task_cpu(p, cpu);

	/*
	 * The child starts from the parent's normal priority; if that is
	 * not an RT priority the child is placed in the fair class.
	 */
	p->prio = current->normal_prio;
	if (!rt_prio(p->prio))
		p->sched_class = &fair_sched_class;

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
	if (likely(sched_info_on()))
		memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
	p->oncpu = 0;
#endif
#ifdef CONFIG_PREEMPT
	/* The child starts out with a preempt count of 1. */
	task_thread_info(p)->preempt_count = 1;
#endif
	plist_node_init(&p->pushable_tasks, MAX_PRIO);

	put_cpu();
}
2661
2662
2663
2664
2665
2666
2667
2668
/*
 * wake_up_new_task - put a newly created task on its runqueue for the
 * first time.
 *
 * @p must be in TASK_RUNNING state (as left by __sched_fork()).
 * @clone_flags is not used by the body.
 */
void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
	unsigned long flags;
	struct rq *rq;

	rq = task_rq_lock(p, &flags);
	BUG_ON(p->state != TASK_RUNNING);
	update_rq_clock(rq);

	p->prio = effective_prio(p);

	if (!p->sched_class->task_new || !current->se.on_rq) {
		activate_task(rq, p, 0);
	} else {
		/*
		 * Let the scheduling class do new task startup
		 * management (if any); nr_running is bumped here since
		 * activate_task() is bypassed on this path.
		 */
		p->sched_class->task_new(rq, p);
		inc_nr_running(rq);
	}
	trace_sched_wakeup_new(rq, p, 1);
	check_preempt_curr(rq, p, 0);
#ifdef CONFIG_SMP
	if (p->sched_class->task_wake_up)
		p->sched_class->task_wake_up(rq, p);
#endif
	task_rq_unlock(rq, &flags);
}
2698
2699#ifdef CONFIG_PREEMPT_NOTIFIERS
2700
2701
2702
2703
2704
2705void preempt_notifier_register(struct preempt_notifier *notifier)
2706{
2707 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers);
2708}
2709EXPORT_SYMBOL_GPL(preempt_notifier_register);
2710
2711
2712
2713
2714
2715
2716
2717void preempt_notifier_unregister(struct preempt_notifier *notifier)
2718{
2719 hlist_del(¬ifier->link);
2720}
2721EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2722
/*
 * Invoke ->sched_in() of every preempt notifier registered on @curr,
 * passing the id of the CPU this code runs on.
 */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	struct preempt_notifier *notifier;
	struct hlist_node *node;

	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
		notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
2731
/*
 * Invoke ->sched_out() of every preempt notifier registered on @curr,
 * passing the task (@next) that is about to run instead.
 */
static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	struct preempt_notifier *notifier;
	struct hlist_node *node;

	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
		notifier->ops->sched_out(notifier, next);
}
2742
2743#else
2744
/* !CONFIG_PREEMPT_NOTIFIERS: nothing to fire. */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}
2748
/* !CONFIG_PREEMPT_NOTIFIERS: nothing to fire. */
static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
}
2754
2755#endif
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
/*
 * prepare_task_switch - get ready to switch from @prev to @next on @rq.
 *
 * Fires the sched-out preempt notifiers of @prev, then runs the lock and
 * architecture preparation hooks for @next.  finish_task_switch() is the
 * counterpart executed after the switch.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	fire_sched_out_preempt_notifiers(prev, next);
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
/*
 * finish_task_switch - clean up after a task switch.
 *
 * @rq:   runqueue associated with the switch
 * @prev: the task that was just switched away from
 *
 * Completes the arch and lock halves of the switch, runs the class
 * post_schedule hook on SMP, fires sched-in preempt notifiers, drops the mm
 * reference parked in rq->prev_mm and, if @prev was TASK_DEAD, releases the
 * reference to its task structure.
 */
static void finish_task_switch(struct rq *rq, struct task_struct *prev)
	__releases(rq->lock)
{
	struct mm_struct *mm = rq->prev_mm;
	long prev_state;
#ifdef CONFIG_SMP
	int post_schedule = 0;

	if (current->sched_class->needs_post_schedule)
		post_schedule = current->sched_class->needs_post_schedule(rq);
#endif

	rq->prev_mm = NULL;

	/*
	 * prev->state is sampled before finish_lock_switch() below; the
	 * TASK_DEAD test further down uses this saved value rather than
	 * re-reading prev->state afterwards.
	 * NOTE(review): presumably prev may be woken and run elsewhere once
	 * the lock switch is finished -- confirm against the lock-switch
	 * helpers.
	 */
	prev_state = prev->state;
	finish_arch_switch(prev);
	perf_counter_task_sched_in(current, cpu_of(rq));
	finish_lock_switch(rq, prev);
#ifdef CONFIG_SMP
	if (post_schedule)
		current->sched_class->post_schedule(rq);
#endif

	fire_sched_in_preempt_notifiers(current);
	if (mm)
		mmdrop(mm);
	if (unlikely(prev_state == TASK_DEAD)) {
		/*
		 * The previous task is dead: flush its kprobe state and
		 * drop the task_struct reference.
		 */
		kprobe_flush_task(prev);
		put_task_struct(prev);
	}
}
2840
2841
2842
2843
2844
/*
 * schedule_tail - finish the task switch on behalf of a new task.
 *
 * @prev: the task that was switched away from
 *
 * Runs finish_task_switch() for this CPU's runqueue and, if
 * current->set_child_tid is set, stores the task's pid (as seen in its own
 * pid namespace) to that user address.
 */
asmlinkage void schedule_tail(struct task_struct *prev)
	__releases(rq->lock)
{
	struct rq *rq = this_rq();

	finish_task_switch(rq, prev);
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
	/* Balances a preempt count held across the unlocked switch. */
	preempt_enable();
#endif
	if (current->set_child_tid)
		put_user(task_pid_vnr(current), current->set_child_tid);
}
2858
2859
2860
2861
2862
/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 *
 * @rq:   runqueue on which the switch happens
 * @prev: task being switched out
 * @next: task being switched in
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);
	trace_sched_switch(rq, prev, next);
	mm = next->mm;
	oldmm = prev->active_mm;
	/*
	 * Architecture hook called before the mm and register state are
	 * switched.
	 */
	arch_start_context_switch(prev);

	if (unlikely(!mm)) {
		/* next has no mm of its own: borrow prev's active mm */
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm(oldmm, mm, next);

	if (unlikely(!prev->mm)) {
		/*
		 * prev was itself borrowing an mm: park it in rq->prev_mm
		 * so finish_task_switch() can drop the reference.
		 */
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}
	/*
	 * Tell lockdep that rq->lock is released here; the unlock itself is
	 * performed later, after the switch (finish_task_switch() carries
	 * the __releases(rq->lock) annotation).
	 */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);

	barrier();
	/*
	 * this_rq() is evaluated afresh rather than reusing @rq.
	 * NOTE(review): presumably because the task may resume on another
	 * CPU after switch_to() -- confirm.
	 */
	finish_task_switch(this_rq(), prev);
}
2912
2913
2914
2915
2916
2917
2918
2919
2920unsigned long nr_running(void)
2921{
2922 unsigned long i, sum = 0;
2923
2924 for_each_online_cpu(i)
2925 sum += cpu_rq(i)->nr_running;
2926
2927 return sum;
2928}
2929
2930unsigned long nr_uninterruptible(void)
2931{
2932 unsigned long i, sum = 0;
2933
2934 for_each_possible_cpu(i)
2935 sum += cpu_rq(i)->nr_uninterruptible;
2936
2937
2938
2939
2940
2941 if (unlikely((long)sum < 0))
2942 sum = 0;
2943
2944 return sum;
2945}
2946
2947unsigned long long nr_context_switches(void)
2948{
2949 int i;
2950 unsigned long long sum = 0;
2951
2952 for_each_possible_cpu(i)
2953 sum += cpu_rq(i)->nr_switches;
2954
2955 return sum;
2956}
2957
2958unsigned long nr_iowait(void)
2959{
2960 unsigned long i, sum = 0;
2961
2962 for_each_possible_cpu(i)
2963 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2964
2965 return sum;
2966}
2967
2968
/* Variables and functions for calc_load */
/* global count of active tasks, fed by calc_load_account_active() */
static atomic_long_t calc_load_tasks;
/* jiffies value gating the next avenrun[] update in calc_global_load() */
static unsigned long calc_load_update;
/* the three load averages, updated with EXP_1, EXP_5 and EXP_15 */
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun);
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2983{
2984 loads[0] = (avenrun[0] + offset) << shift;
2985 loads[1] = (avenrun[1] + offset) << shift;
2986 loads[2] = (avenrun[2] + offset) << shift;
2987}
2988
2989static unsigned long
2990calc_load(unsigned long load, unsigned long exp, unsigned long active)
2991{
2992 load *= exp;
2993 load += active * (FIXED_1 - exp);
2994 return load >> FSHIFT;
2995}
2996
2997
2998
2999
3000
3001void calc_global_load(void)
3002{
3003 unsigned long upd = calc_load_update + 10;
3004 long active;
3005
3006 if (time_before(jiffies, upd))
3007 return;
3008
3009 active = atomic_long_read(&calc_load_tasks);
3010 active = active > 0 ? active * FIXED_1 : 0;
3011
3012 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
3013 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
3014 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
3015
3016 calc_load_update += LOAD_FREQ;
3017}
3018
3019
3020
3021
3022static void calc_load_account_active(struct rq *this_rq)
3023{
3024 long nr_active, delta;
3025
3026 nr_active = this_rq->nr_running;
3027 nr_active += (long) this_rq->nr_uninterruptible;
3028
3029 if (nr_active != this_rq->calc_load_active) {
3030 delta = nr_active - this_rq->calc_load_active;
3031 this_rq->calc_load_active = nr_active;
3032 atomic_long_add(delta, &calc_load_tasks);
3033 }
3034}
3035
3036
3037
3038
3039
/*
 * Return the nr_migrations_in counter of @cpu's runqueue, i.e. the number
 * of task migrations onto that CPU counted by set_task_cpu().
 */
u64 cpu_nr_migrations(int cpu)
{
	return cpu_rq(cpu)->nr_migrations_in;
}
3044
3045
3046
3047
3048
3049static void update_cpu_load(struct rq *this_rq)
3050{
3051 unsigned long this_load = this_rq->load.weight;
3052 int i, scale;
3053
3054 this_rq->nr_load_updates++;
3055
3056
3057 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3058 unsigned long old_load, new_load;
3059
3060
3061
3062 old_load = this_rq->cpu_load[i];
3063 new_load = this_load;
3064
3065
3066
3067
3068
3069 if (new_load > old_load)
3070 new_load += scale-1;
3071 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3072 }
3073
3074 if (time_after_eq(jiffies, this_rq->calc_load_update)) {
3075 this_rq->calc_load_update += LOAD_FREQ;
3076 calc_load_account_active(this_rq);
3077 }
3078}
3079
3080#ifdef CONFIG_SMP
3081
3082
3083
3084
3085
3086
3087
/*
 * double_rq_lock - safely lock two runqueues
 *
 * Interrupts must already be disabled (BUG otherwise).  Distinct runqueues
 * are locked in ascending address order; if both arguments name the same
 * runqueue it is locked once.  Both runqueue clocks are updated afterwards.
 */
static void double_rq_lock(struct rq *rq1, struct rq *rq2)
	__acquires(rq1->lock)
	__acquires(rq2->lock)
{
	BUG_ON(!irqs_disabled());
	if (rq1 == rq2) {
		spin_lock(&rq1->lock);
		__acquire(rq2->lock);	/* Fake it out ;) */
	} else {
		if (rq1 < rq2) {
			spin_lock(&rq1->lock);
			spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
		} else {
			spin_lock(&rq2->lock);
			spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
		}
	}
	update_rq_clock(rq1);
	update_rq_clock(rq2);
}
3108
3109
3110
3111
3112
3113
3114
/*
 * double_rq_unlock - safely unlock two runqueues
 *
 * Counterpart of double_rq_lock(): a runqueue passed twice is unlocked only
 * once.  Interrupt state is not touched here.
 */
static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
	__releases(rq1->lock)
	__releases(rq2->lock)
{
	spin_unlock(&rq1->lock);
	if (rq1 != rq2)
		spin_unlock(&rq2->lock);
	else
		__release(rq2->lock);
}
3125
3126
3127
3128
3129
3130
3131
/*
 * sched_migrate_task - move task @p to @dest_cpu if that is permitted.
 *
 * Nothing happens if @dest_cpu is not in p->cpus_allowed or is not active.
 * If the task cannot be moved on the spot, a migration request is queued,
 * the runqueue's migration thread is woken and this function waits for the
 * request to complete.
 */
static void sched_migrate_task(struct task_struct *p, int dest_cpu)
{
	struct migration_req req;
	unsigned long flags;
	struct rq *rq;

	rq = task_rq_lock(p, &flags);
	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
	    || unlikely(!cpu_active(dest_cpu)))
		goto out;

	/* force the process onto the specified CPU */
	if (migrate_task(p, dest_cpu, &req)) {
		/* Need to wait for migration thread (might exit: take ref). */
		struct task_struct *mt = rq->migration_thread;

		get_task_struct(mt);
		task_rq_unlock(rq, &flags);
		wake_up_process(mt);
		put_task_struct(mt);
		wait_for_completion(&req.done);

		return;
	}
out:
	task_rq_unlock(rq, &flags);
}
3159
3160
3161
3162
3163
/*
 * sched_exec - balance the current task across CPUs using SD_BALANCE_EXEC.
 *
 * If sched_balance_self() picks a CPU other than the current one, the
 * current task is migrated there.
 */
void sched_exec(void)
{
	int new_cpu, this_cpu = get_cpu();
	new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
	put_cpu();
	if (new_cpu != this_cpu)
		sched_migrate_task(current, new_cpu);
}
3172
3173
3174
3175
3176
/*
 * pull_task - move task @p from @src_rq to @this_rq (CPU @this_cpu).
 *
 * The task is dequeued, re-homed with set_task_cpu() and enqueued on the
 * destination.  NOTE(review): callers presumably hold both runqueue locks --
 * nothing is locked here; confirm at the call sites.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
		      struct rq *this_rq, int this_cpu)
{
	deactivate_task(src_rq, p, 0);
	set_task_cpu(p, this_cpu);
	activate_task(this_rq, p, 0);
	/*
	 * Let the destination runqueue decide whether the pulled task
	 * should preempt what is currently running there.
	 */
	check_preempt_curr(this_rq, p, 0);
}
3189
3190
3191
3192
3193static
3194int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3195 struct sched_domain *sd, enum cpu_idle_type idle,
3196 int *all_pinned)
3197{
3198 int tsk_cache_hot = 0;
3199
3200
3201
3202
3203
3204
3205 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3206 schedstat_inc(p, se.nr_failed_migrations_affine);
3207 return 0;
3208 }
3209 *all_pinned = 0;
3210
3211 if (task_running(rq, p)) {
3212 schedstat_inc(p, se.nr_failed_migrations_running);
3213 return 0;
3214 }
3215
3216
3217
3218
3219
3220
3221
3222 tsk_cache_hot = task_hot(p, rq->clock, sd);
3223 if (!tsk_cache_hot ||
3224 sd->nr_balance_failed > sd->cache_nice_tries) {
3225#ifdef CONFIG_SCHEDSTATS
3226 if (tsk_cache_hot) {
3227 schedstat_inc(sd, lb_hot_gained[idle]);
3228 schedstat_inc(p, se.nr_forced_migrations);
3229 }
3230#endif
3231 return 1;
3232 }
3233
3234 if (tsk_cache_hot) {
3235 schedstat_inc(p, se.nr_failed_migrations_hot);
3236 return 0;
3237 }
3238 return 1;
3239}
3240
/*
 * balance_tasks - pull tasks from @busiest to @this_rq until roughly
 * @max_load_move worth of load weight has been moved.
 *
 * @all_pinned:     if non-NULL, set to 1 when no candidate task passed the
 *                  affinity check in can_migrate_task(), else 0
 * @this_best_prio: lowered to the prio of a pulled task when that is
 *                  numerically smaller
 * @iterator:       supplies the candidate tasks via ->start()/->next()
 *
 * Returns the amount of load weight actually moved.
 */
static unsigned long
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
	      unsigned long max_load_move, struct sched_domain *sd,
	      enum cpu_idle_type idle, int *all_pinned,
	      int *this_best_prio, struct rq_iterator *iterator)
{
	int loops = 0, pulled = 0, pinned = 0;
	struct task_struct *p;
	long rem_load_move = max_load_move;

	if (max_load_move == 0)
		goto out;

	/* assume everything is pinned until can_migrate_task() says otherwise */
	pinned = 1;

	/*
	 * Start the load-balancing iterator:
	 */
	p = iterator->start(iterator->arg);
next:
	/* stop at the end of the list or once the loop budget is used up */
	if (!p || loops++ > sysctl_sched_nr_migrate)
		goto out;

	/* skip tasks that are too heavy for the remaining load, or unmovable */
	if ((p->se.load.weight >> 1) > rem_load_move ||
	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
		p = iterator->next(iterator->arg);
		goto next;
	}

	pull_task(busiest, p, this_rq, this_cpu);
	pulled++;
	rem_load_move -= p->se.load.weight;

#ifdef CONFIG_PREEMPT
	/*
	 * With kernel preemption, a newly idle CPU stops after pulling a
	 * single task.
	 */
	if (idle == CPU_NEWLY_IDLE)
		goto out;
#endif

	/*
	 * We only want to steal up to the prescribed amount of weighted load.
	 */
	if (rem_load_move > 0) {
		if (p->prio < *this_best_prio)
			*this_best_prio = p->prio;
		p = iterator->next(iterator->arg);
		goto next;
	}
out:
	/*
	 * Account the number of pulled tasks in the domain's lb_gained[]
	 * statistics in one go.
	 */
	schedstat_add(sd, lb_gained[idle], pulled);

	if (all_pinned)
		*all_pinned = pinned;

	return max_load_move - rem_load_move;
}
3306
3307
3308
3309
3310
3311
3312
3313
/*
 * move_tasks tries to move up to max_load_move weighted load from busiest to
 * this_rq, as part of a balancing operation within domain "sd".
 * Each scheduling class is asked in turn, starting from the highest.
 * Returns 1 if some load was moved, 0 otherwise.
 */
static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
		      unsigned long max_load_move,
		      struct sched_domain *sd, enum cpu_idle_type idle,
		      int *all_pinned)
{
	const struct sched_class *class = sched_class_highest;
	unsigned long total_load_moved = 0;
	int this_best_prio = this_rq->curr->prio;

	do {
		total_load_moved +=
			class->load_balance(this_rq, this_cpu, busiest,
				max_load_move - total_load_moved,
				sd, idle, all_pinned, &this_best_prio);
		class = class->next;

#ifdef CONFIG_PREEMPT
		/*
		 * With kernel preemption, a newly idle CPU stops as soon as
		 * it has something to run.
		 */
		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
			break;
#endif
	} while (class && max_load_move > total_load_moved);

	return total_load_moved > 0;
}
3343
3344static int
3345iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3346 struct sched_domain *sd, enum cpu_idle_type idle,
3347 struct rq_iterator *iterator)
3348{
3349 struct task_struct *p = iterator->start(iterator->arg);
3350 int pinned = 0;
3351
3352 while (p) {
3353 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3354 pull_task(busiest, p, this_rq, this_cpu);
3355
3356
3357
3358
3359
3360 schedstat_inc(sd, lb_gained[idle]);
3361
3362 return 1;
3363 }
3364 p = iterator->next(iterator->arg);
3365 }
3366
3367 return 0;
3368}
3369
3370
3371
3372
3373
3374
3375
3376
3377static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3378 struct sched_domain *sd, enum cpu_idle_type idle)
3379{
3380 const struct sched_class *class;
3381
3382 for (class = sched_class_highest; class; class = class->next)
3383 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3384 return 1;
3385
3386 return 0;
3387}
3388
3389
3390
3391
3392
/*
 * sd_lb_stats - statistics collected over all groups of one sched_domain
 * while searching for the busiest group during load balancing.
 */
struct sd_lb_stats {
	struct sched_group *busiest;	/* busiest non-local group seen so far */
	struct sched_group *this;	/* group containing the balancing CPU */
	unsigned long total_load;	/* sum of group_load over all groups */
	unsigned long total_pwr;	/* sum of __cpu_power over all groups */
	unsigned long avg_load;		/* total_load scaled by total_pwr */

	/* Statistics of the local group ("this") */
	unsigned long this_load;
	/* holds the group's sum_weighted_load until fix_small_imbalance() divides it */
	unsigned long this_load_per_task;
	unsigned long this_nr_running;

	/* Statistics of the busiest group */
	unsigned long max_load;
	/* holds sum_weighted_load until find_busiest_group() divides it */
	unsigned long busiest_load_per_task;
	unsigned long busiest_nr_running;

	int group_imb;	/* group_imb flag copied from the busiest group */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
	int power_savings_balance;	/* non-zero while power-savings balance is still possible */
	struct sched_group *group_min;	/* non-idle, below-capacity group with fewest tasks */
	struct sched_group *group_leader;	/* below-capacity group with most tasks */
	unsigned long min_load_per_task;	/* weighted load per task in group_min */
	unsigned long leader_nr_running;	/* sum_nr_running of group_leader */
	unsigned long min_nr_running;	/* sum_nr_running of group_min */
#endif
};
3420
3421
3422
3423
/*
 * sg_lb_stats - statistics of a single sched_group, filled in by
 * update_sg_lb_stats().
 */
struct sg_lb_stats {
	unsigned long avg_load;	/* group_load * SCHED_LOAD_SCALE, scaled by the group's cpu power */
	unsigned long group_load;	/* sum of the per-CPU load values */
	unsigned long sum_nr_running;	/* sum of rq->nr_running */
	unsigned long sum_weighted_load;	/* sum of weighted_cpuload() */
	unsigned long group_capacity;	/* __cpu_power / SCHED_LOAD_SCALE */
	int group_imb;	/* set when max and min CPU load in the group differ widely */
};
3432
3433
3434
3435
3436
/*
 * group_first_cpu - return the first CPU in the cpumask of @group.
 */
static inline unsigned int group_first_cpu(struct sched_group *group)
{
	const struct cpumask *span = sched_group_cpus(group);

	return cpumask_first(span);
}
3441
3442
3443
3444
3445
3446
3447static inline int get_sd_load_idx(struct sched_domain *sd,
3448 enum cpu_idle_type idle)
3449{
3450 int load_idx;
3451
3452 switch (idle) {
3453 case CPU_NOT_IDLE:
3454 load_idx = sd->busy_idx;
3455 break;
3456
3457 case CPU_NEWLY_IDLE:
3458 load_idx = sd->newidle_idx;
3459 break;
3460 default:
3461 load_idx = sd->idle_idx;
3462 break;
3463 }
3464
3465 return load_idx;
3466}
3467
3468
3469#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3470
3471
3472
3473
3474
3475
3476
3477
3478static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3479 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3480{
3481
3482
3483
3484
3485 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3486 sds->power_savings_balance = 0;
3487 else {
3488 sds->power_savings_balance = 1;
3489 sds->min_nr_running = ULONG_MAX;
3490 sds->leader_nr_running = 0;
3491 }
3492}
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504static inline void update_sd_power_savings_stats(struct sched_group *group,
3505 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3506{
3507
3508 if (!sds->power_savings_balance)
3509 return;
3510
3511
3512
3513
3514
3515 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3516 !sds->this_nr_running))
3517 sds->power_savings_balance = 0;
3518
3519
3520
3521
3522
3523 if (!sds->power_savings_balance ||
3524 sgs->sum_nr_running >= sgs->group_capacity ||
3525 !sgs->sum_nr_running)
3526 return;
3527
3528
3529
3530
3531
3532
3533 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3534 (sgs->sum_nr_running == sds->min_nr_running &&
3535 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3536 sds->group_min = group;
3537 sds->min_nr_running = sgs->sum_nr_running;
3538 sds->min_load_per_task = sgs->sum_weighted_load /
3539 sgs->sum_nr_running;
3540 }
3541
3542
3543
3544
3545
3546
3547 if (sgs->sum_nr_running > sgs->group_capacity - 1)
3548 return;
3549
3550 if (sgs->sum_nr_running > sds->leader_nr_running ||
3551 (sgs->sum_nr_running == sds->leader_nr_running &&
3552 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3553 sds->group_leader = group;
3554 sds->leader_nr_running = sgs->sum_nr_running;
3555 }
3556}
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3574 int this_cpu, unsigned long *imbalance)
3575{
3576 if (!sds->power_savings_balance)
3577 return 0;
3578
3579 if (sds->this != sds->group_leader ||
3580 sds->group_leader == sds->group_min)
3581 return 0;
3582
3583 *imbalance = sds->min_load_per_task;
3584 sds->busiest = sds->group_min;
3585
3586 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3587 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3588 group_first_cpu(sds->group_leader);
3589 }
3590
3591 return 1;
3592
3593}
3594#else
3595static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3596 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3597{
3598 return;
3599}
3600
/* No-op variant used when neither CONFIG_SCHED_MC nor CONFIG_SCHED_SMT is set. */
static inline void update_sd_power_savings_stats(struct sched_group *group,
	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{
}
3606
/*
 * Variant used when neither CONFIG_SCHED_MC nor CONFIG_SCHED_SMT is set:
 * never selects a group for power-savings balancing, so it always
 * returns 0 and leaves @sds and *@imbalance untouched.
 */
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
					int this_cpu, unsigned long *imbalance)
{
	return 0;
}
3612#endif
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
/*
 * update_sg_lb_stats - gather load-balancing statistics for one group.
 *
 * @group:       group to examine
 * @this_cpu:    CPU on whose behalf balancing is done
 * @idle:        idle state of @this_cpu
 * @load_idx:    load index passed on to target_load()/source_load()
 * @sd_idle:     cleared (set to 0) as soon as a CPU with running tasks is seen
 * @local_group: non-zero if @group contains @this_cpu
 * @cpus:        only CPUs that are also in this mask are considered
 * @balance:     optional; set to 0 when @this_cpu is not the CPU that
 *               should balance the local group
 * @sgs:         output; the caller is expected to have zeroed it, since
 *               the sums below are only ever added to
 */
static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
			enum cpu_idle_type idle, int load_idx, int *sd_idle,
			int local_group, const struct cpumask *cpus,
			int *balance, struct sg_lb_stats *sgs)
{
	unsigned long load, max_cpu_load, min_cpu_load;
	int i;
	unsigned int balance_cpu = -1, first_idle_cpu = 0;
	unsigned long sum_avg_load_per_task;
	unsigned long avg_load_per_task;

	/* Default balancer of the local group is its first CPU. */
	if (local_group)
		balance_cpu = group_first_cpu(group);

	/* Tally up the load of all CPUs in the group */
	sum_avg_load_per_task = avg_load_per_task = 0;
	max_cpu_load = 0;
	min_cpu_load = ~0UL;

	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
		struct rq *rq = cpu_rq(i);

		if (*sd_idle && rq->nr_running)
			*sd_idle = 0;

		/* Local and remote groups use different load estimators. */
		if (local_group) {
			/* The first idle CPU found takes over as balancer. */
			if (idle_cpu(i) && !first_idle_cpu) {
				first_idle_cpu = 1;
				balance_cpu = i;
			}

			load = target_load(i, load_idx);
		} else {
			/* min/max are tracked for remote groups only. */
			load = source_load(i, load_idx);
			if (load > max_cpu_load)
				max_cpu_load = load;
			if (min_cpu_load > load)
				min_cpu_load = load;
		}

		sgs->group_load += load;
		sgs->sum_nr_running += rq->nr_running;
		sgs->sum_weighted_load += weighted_cpuload(i);

		sum_avg_load_per_task += cpu_avg_load_per_task(i);
	}

	/*
	 * Unless this CPU is newly idle, only the chosen balance_cpu of the
	 * local group carries on; every other CPU reports *balance = 0 and
	 * stops here, leaving the remaining fields of @sgs unset.
	 */
	if (idle != CPU_NEWLY_IDLE && local_group &&
	    balance_cpu != this_cpu && balance) {
		*balance = 0;
		return;
	}

	/* Adjust by relative CPU power of the group */
	sgs->avg_load = sg_div_cpu_power(group,
			sgs->group_load * SCHED_LOAD_SCALE);

	/*
	 * Flag the group as imbalanced when the spread between its most and
	 * least loaded CPU exceeds twice the power-scaled average load per
	 * task.
	 * NOTE(review): max/min keep their initial values (0 and ~0UL) when
	 * they were never updated, e.g. for the local group, so the unsigned
	 * subtraction wraps to 1 - confirm this is harmless for callers.
	 */
	avg_load_per_task = sg_div_cpu_power(group,
			sum_avg_load_per_task * SCHED_LOAD_SCALE);

	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
		sgs->group_imb = 1;

	sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;

}
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3722 enum cpu_idle_type idle, int *sd_idle,
3723 const struct cpumask *cpus, int *balance,
3724 struct sd_lb_stats *sds)
3725{
3726 struct sched_group *group = sd->groups;
3727 struct sg_lb_stats sgs;
3728 int load_idx;
3729
3730 init_sd_power_savings_stats(sd, sds, idle);
3731 load_idx = get_sd_load_idx(sd, idle);
3732
3733 do {
3734 int local_group;
3735
3736 local_group = cpumask_test_cpu(this_cpu,
3737 sched_group_cpus(group));
3738 memset(&sgs, 0, sizeof(sgs));
3739 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
3740 local_group, cpus, balance, &sgs);
3741
3742 if (local_group && balance && !(*balance))
3743 return;
3744
3745 sds->total_load += sgs.group_load;
3746 sds->total_pwr += group->__cpu_power;
3747
3748 if (local_group) {
3749 sds->this_load = sgs.avg_load;
3750 sds->this = group;
3751 sds->this_nr_running = sgs.sum_nr_running;
3752 sds->this_load_per_task = sgs.sum_weighted_load;
3753 } else if (sgs.avg_load > sds->max_load &&
3754 (sgs.sum_nr_running > sgs.group_capacity ||
3755 sgs.group_imb)) {
3756 sds->max_load = sgs.avg_load;
3757 sds->busiest = group;
3758 sds->busiest_nr_running = sgs.sum_nr_running;
3759 sds->busiest_load_per_task = sgs.sum_weighted_load;
3760 sds->group_imb = sgs.group_imb;
3761 }
3762
3763 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3764 group = group->next;
3765 } while (group != sd->groups);
3766
3767}
3768
3769
3770
3771
3772
3773
3774
3775
3776
/*
 * fix_small_imbalance - decide whether moving a single task is worthwhile
 * when the computed imbalance is smaller than one task's load.
 *
 * @sds:       domain statistics; this_load_per_task is finalised here
 * @this_cpu:  CPU on whose behalf balancing is done
 * @imbalance: set to busiest_load_per_task if a move looks beneficial,
 *             otherwise left as the caller set it
 */
static inline void fix_small_imbalance(struct sd_lb_stats *sds,
				int this_cpu, unsigned long *imbalance)
{
	unsigned long tmp, pwr_now = 0, pwr_move = 0;
	unsigned int imbn = 2;

	/*
	 * Turn the local group's summed weighted load into a per-task
	 * value; an empty local group falls back to this CPU's average.
	 */
	if (sds->this_nr_running) {
		sds->this_load_per_task /= sds->this_nr_running;
		if (sds->busiest_load_per_task >
				sds->this_load_per_task)
			imbn = 1;
	} else
		sds->this_load_per_task =
			cpu_avg_load_per_task(this_cpu);

	/* The gap is at least (imbn - 1) busiest tasks wide: move one task. */
	if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
			sds->busiest_load_per_task * imbn) {
		*imbalance = sds->busiest_load_per_task;
		return;
	}

	/*
	 * Otherwise compare an estimate of the power-weighted load handled
	 * now (pwr_now) with the same estimate after moving one task
	 * (pwr_move), and move only if that estimate grows.
	 */
	pwr_now += sds->busiest->__cpu_power *
			min(sds->busiest_load_per_task, sds->max_load);
	pwr_now += sds->this->__cpu_power *
			min(sds->this_load_per_task, sds->this_load);
	pwr_now /= SCHED_LOAD_SCALE;

	/* Amount of load we'd subtract from the busiest group */
	tmp = sg_div_cpu_power(sds->busiest,
			sds->busiest_load_per_task * SCHED_LOAD_SCALE);
	if (sds->max_load > tmp)
		pwr_move += sds->busiest->__cpu_power *
			min(sds->busiest_load_per_task, sds->max_load - tmp);

	/* Amount of load we'd add to the local group */
	if (sds->max_load * sds->busiest->__cpu_power <
		sds->busiest_load_per_task * SCHED_LOAD_SCALE)
		tmp = sg_div_cpu_power(sds->this,
			sds->max_load * sds->busiest->__cpu_power);
	else
		tmp = sg_div_cpu_power(sds->this,
			sds->busiest_load_per_task * SCHED_LOAD_SCALE);
	pwr_move += sds->this->__cpu_power *
			min(sds->this_load_per_task, sds->this_load + tmp);
	pwr_move /= SCHED_LOAD_SCALE;

	/* Move if we gain throughput */
	if (pwr_move > pwr_now)
		*imbalance = sds->busiest_load_per_task;
}
3833
3834
3835
3836
3837
3838
3839
3840
3841static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3842 unsigned long *imbalance)
3843{
3844 unsigned long max_pull;
3845
3846
3847
3848
3849
3850 if (sds->max_load < sds->avg_load) {
3851 *imbalance = 0;
3852 return fix_small_imbalance(sds, this_cpu, imbalance);
3853 }
3854
3855
3856 max_pull = min(sds->max_load - sds->avg_load,
3857 sds->max_load - sds->busiest_load_per_task);
3858
3859
3860 *imbalance = min(max_pull * sds->busiest->__cpu_power,
3861 (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
3862 / SCHED_LOAD_SCALE;
3863
3864
3865
3866
3867
3868
3869
3870 if (*imbalance < sds->busiest_load_per_task)
3871 return fix_small_imbalance(sds, this_cpu, imbalance);
3872
3873}
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
/*
 * find_busiest_group - find the busiest group in @sd, if the domain is
 * imbalanced, and compute how much load should be moved.
 *
 * @sd:        domain to search
 * @this_cpu:  CPU on whose behalf balancing is done
 * @imbalance: output; amount of load to move, 0 when NULL is returned
 * @idle:      idle state of @this_cpu
 * @sd_idle:   passed through to the statistics gathering
 * @cpus:      CPUs that may be considered
 * @balance:   optional; set to 0 by the statistics code when @this_cpu is
 *             not the CPU that should balance at this level
 *
 * Returns the busiest group, or NULL when there is nothing to do. With
 * power-savings balancing a group may be returned even though the domain
 * is otherwise considered balanced.
 */
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
		   unsigned long *imbalance, enum cpu_idle_type idle,
		   int *sd_idle, const struct cpumask *cpus, int *balance)
{
	struct sd_lb_stats sds;

	memset(&sds, 0, sizeof(sds));

	/* Gather the statistics of every group in the domain. */
	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
					balance, &sds);

	/*
	 * The checks below declare the domain balanced when:
	 *  - this CPU is not the one that should balance here,
	 *  - there is no busiest group or it runs nothing,
	 *  - the local group is at least as loaded as the busiest one,
	 *  - the local group is at or above the domain average,
	 *  - the busiest group is within imbalance_pct of the local one.
	 */
	if (balance && !(*balance))
		goto ret;

	if (!sds.busiest || sds.busiest_nr_running == 0)
		goto out_balanced;

	if (sds.this_load >= sds.max_load)
		goto out_balanced;

	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;

	if (sds.this_load >= sds.avg_load)
		goto out_balanced;

	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
		goto out_balanced;

	/* Convert the summed weighted load into a per-task value. */
	sds.busiest_load_per_task /= sds.busiest_nr_running;
	if (sds.group_imb)
		sds.busiest_load_per_task =
			min(sds.busiest_load_per_task, sds.avg_load);

	/*
	 * If the busiest group's load does not exceed a single task's load,
	 * there is nothing that could usefully be pulled.
	 */
	if (sds.max_load <= sds.busiest_load_per_task)
		goto out_balanced;

	/* Looks like there is an imbalance. Compute it */
	calculate_imbalance(&sds, this_cpu, imbalance);
	return sds.busiest;

out_balanced:
	/*
	 * No load imbalance, but power-savings balancing may still want to
	 * pull from a group.
	 */
	if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
		return sds.busiest;
ret:
	*imbalance = 0;
	return NULL;
}
3978
3979
3980
3981
3982static struct rq *
3983find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3984 unsigned long imbalance, const struct cpumask *cpus)
3985{
3986 struct rq *busiest = NULL, *rq;
3987 unsigned long max_load = 0;
3988 int i;
3989
3990 for_each_cpu(i, sched_group_cpus(group)) {
3991 unsigned long wl;
3992
3993 if (!cpumask_test_cpu(i, cpus))
3994 continue;
3995
3996 rq = cpu_rq(i);
3997 wl = weighted_cpuload(i);
3998
3999 if (rq->nr_running == 1 && wl > imbalance)
4000 continue;
4001
4002 if (wl > max_load) {
4003 max_load = wl;
4004 busiest = rq;
4005 }
4006 }
4007
4008 return busiest;
4009}
4010
4011
4012
4013
4014
/*
 * Upper bound used by load_balance() when doubling balance_interval in
 * the all-pinned case.
 */
#define MAX_PINNED_INTERVAL	512

/* Per-CPU scratch cpumask used by load_balance() and load_balance_newidle(). */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4019
4020
4021
4022
4023
/*
 * load_balance - check whether @sd is imbalanced from the point of view of
 * @this_cpu and try to pull tasks over if it is.
 *
 * @this_cpu: CPU to balance for
 * @this_rq:  runqueue of @this_cpu
 * @sd:       domain to balance
 * @idle:     idle state of @this_cpu
 * @balance:  must be non-NULL; set to 0 (via find_busiest_group()) when
 *            @this_cpu is not the CPU that should balance at this level
 *
 * Returns non-zero if tasks were moved, 0 otherwise. -1 is returned
 * instead of 0 when nothing was moved, the domain has SD_SHARE_CPUPOWER,
 * its parent lacks SD_POWERSAVINGS_BALANCE and sd_idle is clear.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *balance)
{
	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
	struct sched_group *group;
	unsigned long imbalance;
	struct rq *busiest;
	unsigned long flags;
	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

	/* Start with every CPU eligible; all-pinned CPUs are removed below. */
	cpumask_setall(cpus);

	/*
	 * sd_idle starts set for an idle CPU in an SD_SHARE_CPUPOWER domain
	 * whose parent lacks SD_POWERSAVINGS_BALANCE; the statistics code
	 * clears it once a CPU with running tasks is seen.
	 */
	if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		sd_idle = 1;

	schedstat_inc(sd, lb_count[idle]);

redo:
	update_shares(sd);
	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
				   cpus, balance);

	if (*balance == 0)
		goto out_balanced;

	if (!group) {
		schedstat_inc(sd, lb_nobusyg[idle]);
		goto out_balanced;
	}

	busiest = find_busiest_queue(group, idle, imbalance, cpus);
	if (!busiest) {
		schedstat_inc(sd, lb_nobusyq[idle]);
		goto out_balanced;
	}

	BUG_ON(busiest == this_rq);

	schedstat_add(sd, lb_imbalance[idle], imbalance);

	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/*
		 * Attempt to move tasks. Both runqueue locks are held with
		 * interrupts disabled while doing so. With at most one task
		 * on busiest, ld_moved stays 0 and the failure handling below
		 * applies.
		 */
		local_irq_save(flags);
		double_rq_lock(this_rq, busiest);
		ld_moved = move_tasks(this_rq, this_cpu, busiest,
				      imbalance, sd, idle, &all_pinned);
		double_rq_unlock(this_rq, busiest);
		local_irq_restore(flags);

		/*
		 * When balancing on behalf of another CPU, make that CPU
		 * reschedule so it notices the tasks that were pulled.
		 */
		if (ld_moved && this_cpu != smp_processor_id())
			resched_cpu(this_cpu);

		/* All tasks on this runqueue were pinned by CPU affinity */
		if (unlikely(all_pinned)) {
			cpumask_clear_cpu(cpu_of(busiest), cpus);
			if (!cpumask_empty(cpus))
				goto redo;
			goto out_balanced;
		}
	}

	if (!ld_moved) {
		schedstat_inc(sd, lb_failed[idle]);
		sd->nr_balance_failed++;

		/* Repeated failures: ask busiest to push a task to this CPU. */
		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

			spin_lock_irqsave(&busiest->lock, flags);

			/*
			 * Don't kick the migration thread if the task currently
			 * running on busiest may not run on this CPU.
			 */
			if (!cpumask_test_cpu(this_cpu,
					      &busiest->curr->cpus_allowed)) {
				spin_unlock_irqrestore(&busiest->lock, flags);
				all_pinned = 1;
				goto out_one_pinned;
			}

			if (!busiest->active_balance) {
				busiest->active_balance = 1;
				busiest->push_cpu = this_cpu;
				active_balance = 1;
			}
			spin_unlock_irqrestore(&busiest->lock, flags);
			if (active_balance)
				wake_up_process(busiest->migration_thread);

			/*
			 * Back off the failure count so that active balancing
			 * is not requested again on the very next failure.
			 */
			sd->nr_balance_failed = sd->cache_nice_tries+1;
		}
	} else
		sd->nr_balance_failed = 0;

	if (likely(!active_balance)) {
		/* We were unbalanced, so reset the balancing interval */
		sd->balance_interval = sd->min_interval;
	} else {
		/*
		 * Active balancing was requested: back off by doubling the
		 * interval, up to max_interval.
		 */
		if (sd->balance_interval < sd->max_interval)
			sd->balance_interval *= 2;
	}

	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		ld_moved = -1;

	goto out;

out_balanced:
	schedstat_inc(sd, lb_balanced[idle]);

	sd->nr_balance_failed = 0;

out_one_pinned:
	/* tune up the balancing interval */
	if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
			(sd->balance_interval < sd->max_interval))
		sd->balance_interval *= 2;

	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		ld_moved = -1;
	else
		ld_moved = 0;
out:
	if (ld_moved)
		update_shares(sd);
	return ld_moved;
}
4179
4180
4181
4182
4183
4184
4185
4186
/*
 * load_balance_newidle - try to pull tasks to @this_cpu, which is about to
 * become idle (CPU_NEWLY_IDLE balancing).
 *
 * @this_cpu: CPU that is going idle
 * @this_rq:  its runqueue. The code drops and re-takes this_rq->lock
 *            around wake_up_process(), so the caller presumably holds
 *            that lock on entry - confirm against the caller.
 * @sd:       domain to balance
 *
 * Returns non-zero if tasks were moved, 0 or -1 otherwise.
 */
static int
load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
{
	struct sched_group *group;
	struct rq *busiest = NULL;
	unsigned long imbalance;
	int ld_moved = 0;
	int sd_idle = 0;
	int all_pinned = 0;
	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

	/* Start with every CPU eligible; all-pinned CPUs are removed below. */
	cpumask_setall(cpus);

	/*
	 * sd_idle starts set in an SD_SHARE_CPUPOWER domain whose parent
	 * lacks SD_POWERSAVINGS_BALANCE; the statistics code clears it once
	 * a CPU with running tasks is seen.
	 */
	if (sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		sd_idle = 1;

	schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
redo:
	update_shares_locked(this_rq, sd);
	group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
				   &sd_idle, cpus, NULL);
	if (!group) {
		schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
		goto out_balanced;
	}

	busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
	if (!busiest) {
		schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
		goto out_balanced;
	}

	BUG_ON(busiest == this_rq);

	schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);

	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/* Attempt to move tasks */
		double_lock_balance(this_rq, busiest);
		/* this_rq->clock is already updated */
		update_rq_clock(busiest);
		ld_moved = move_tasks(this_rq, this_cpu, busiest,
					imbalance, sd, CPU_NEWLY_IDLE,
					&all_pinned);
		double_unlock_balance(this_rq, busiest);

		/* Everything on busiest was pinned: retry without that CPU. */
		if (unlikely(all_pinned)) {
			cpumask_clear_cpu(cpu_of(busiest), cpus);
			if (!cpumask_empty(cpus))
				goto redo;
		}
	}

	if (!ld_moved) {
		int active_balance = 0;

		schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
			return -1;

		/* Active balancing below is a power-savings feature only. */
		if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
			return -1;

		if (sd->nr_balance_failed++ < 2)
			return -1;

		/*
		 * Nothing could be pulled several times in a row: ask the
		 * busiest runqueue's migration thread to push a task over to
		 * this CPU (active balancing).
		 */
		double_lock_balance(this_rq, busiest);

		/*
		 * Don't kick the migration thread if the task currently
		 * running on busiest may not run on this CPU.
		 */
		if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
			double_unlock_balance(this_rq, busiest);
			all_pinned = 1;
			return ld_moved;
		}

		if (!busiest->active_balance) {
			busiest->active_balance = 1;
			busiest->push_cpu = this_cpu;
			active_balance = 1;
		}

		double_unlock_balance(this_rq, busiest);
		/*
		 * this_rq->lock is released around wake_up_process() and
		 * taken again afterwards.
		 */
		spin_unlock(&this_rq->lock);
		if (active_balance)
			wake_up_process(busiest->migration_thread);
		spin_lock(&this_rq->lock);

	} else
		sd->nr_balance_failed = 0;

	update_shares_locked(this_rq, sd);
	return ld_moved;

out_balanced:
	schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		return -1;
	sd->nr_balance_failed = 0;

	return 0;
}
4327
4328
4329
4330
4331
4332static void idle_balance(int this_cpu, struct rq *this_rq)
4333{
4334 struct sched_domain *sd;
4335 int pulled_task = 0;
4336 unsigned long next_balance = jiffies + HZ;
4337
4338 for_each_domain(this_cpu, sd) {
4339 unsigned long interval;
4340
4341 if (!(sd->flags & SD_LOAD_BALANCE))
4342 continue;
4343
4344 if (sd->flags & SD_BALANCE_NEWIDLE)
4345
4346 pulled_task = load_balance_newidle(this_cpu, this_rq,
4347 sd);
4348
4349 interval = msecs_to_jiffies(sd->balance_interval);
4350 if (time_after(next_balance, sd->last_balance + interval))
4351 next_balance = sd->last_balance + interval;
4352 if (pulled_task)
4353 break;
4354 }
4355 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4356
4357
4358
4359
4360 this_rq->next_balance = next_balance;
4361 }
4362}
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4373{
4374 int target_cpu = busiest_rq->push_cpu;
4375 struct sched_domain *sd;
4376 struct rq *target_rq;
4377
4378
4379 if (busiest_rq->nr_running <= 1)
4380 return;
4381
4382 target_rq = cpu_rq(target_cpu);
4383
4384
4385
4386
4387
4388
4389 BUG_ON(busiest_rq == target_rq);
4390
4391
4392 double_lock_balance(busiest_rq, target_rq);
4393 update_rq_clock(busiest_rq);
4394 update_rq_clock(target_rq);
4395
4396
4397 for_each_domain(target_cpu, sd) {
4398 if ((sd->flags & SD_LOAD_BALANCE) &&
4399 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4400 break;
4401 }
4402
4403 if (likely(sd)) {
4404 schedstat_inc(sd, alb_count);
4405
4406 if (move_one_task(target_rq, target_cpu, busiest_rq,
4407 sd, CPU_IDLE))
4408 schedstat_inc(sd, alb_pushed);
4409 else
4410 schedstat_inc(sd, alb_failed);
4411 }
4412 double_unlock_balance(busiest_rq, target_rq);
4413}
4414
4415#ifdef CONFIG_NO_HZ
/*
 * State shared by the idle load balancing code when CONFIG_NO_HZ is set:
 *  load_balancer     - CPU currently owning idle load balancing, or -1
 *                      when there is no owner
 *  cpu_mask          - CPUs that went through select_nohz_load_balancer()
 *                      with stop_tick set and have not restarted since
 *  ilb_grp_nohz_mask - scratch mask filled by is_semi_idle_group()
 */
static struct {
	atomic_t load_balancer;
	cpumask_var_t cpu_mask;
	cpumask_var_t ilb_grp_nohz_mask;
} nohz ____cacheline_aligned = {
	.load_balancer = ATOMIC_INIT(-1),
};
4423
4424int get_nohz_load_balancer(void)
4425{
4426 return atomic_read(&nohz.load_balancer);
4427}
4428
4429#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4440{
4441 struct sched_domain *sd;
4442
4443 for_each_domain(cpu, sd)
4444 if (sd && (sd->flags & flag))
4445 break;
4446
4447 return sd;
4448}
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
/*
 * for_each_flag_domain - iterate over the domains of @cpu that have @flag
 * set, starting at the one returned by lowest_flag_domain() and following
 * ->parent until a domain without @flag (or the end of the chain) is hit.
 *
 * @cpu:  CPU whose domains are walked
 * @sd:   struct sched_domain * used as the loop cursor
 * @flag: flag(s) the domain must have set
 *
 * The macro arguments are parenthesized so that an expression such as
 * (A | B) passed as @flag is tested as a whole.
 */
#define for_each_flag_domain(cpu, sd, flag) \
	for ((sd) = lowest_flag_domain(cpu, flag); \
		((sd) && ((sd)->flags & (flag))); (sd) = (sd)->parent)
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474static inline int is_semi_idle_group(struct sched_group *ilb_group)
4475{
4476 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4477 sched_group_cpus(ilb_group));
4478
4479
4480
4481
4482
4483 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4484 return 0;
4485
4486 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4487 return 0;
4488
4489 return 1;
4490}
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503static int find_new_ilb(int cpu)
4504{
4505 struct sched_domain *sd;
4506 struct sched_group *ilb_group;
4507
4508
4509
4510
4511
4512 if (!(sched_smt_power_savings || sched_mc_power_savings))
4513 goto out_done;
4514
4515
4516
4517
4518
4519 if (cpumask_weight(nohz.cpu_mask) < 2)
4520 goto out_done;
4521
4522 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4523 ilb_group = sd->groups;
4524
4525 do {
4526 if (is_semi_idle_group(ilb_group))
4527 return cpumask_first(nohz.ilb_grp_nohz_mask);
4528
4529 ilb_group = ilb_group->next;
4530
4531 } while (ilb_group != sd->groups);
4532 }
4533
4534out_done:
4535 return cpumask_first(nohz.cpu_mask);
4536}
4537#else
4538static inline int find_new_ilb(int call_cpu)
4539{
4540 return cpumask_first(nohz.cpu_mask);
4541}
4542#endif
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
/*
 * select_nohz_load_balancer - update the idle load balancing state for the
 * calling CPU.
 *
 * @stop_tick: non-zero when the CPU wants to stop its tick, zero when
 *             the tick is being restarted
 *
 * Returns 1 if the calling CPU owns idle load balancing after the call,
 * i.e. it just took ownership or keeps it, and 0 in every other case.
 */
int select_nohz_load_balancer(int stop_tick)
{
	int cpu = smp_processor_id();

	if (stop_tick) {
		cpu_rq(cpu)->in_nohz_recently = 1;

		if (!cpu_active(cpu)) {
			if (atomic_read(&nohz.load_balancer) != cpu)
				return 0;

			/*
			 * An inactive CPU must not stay the owner: give up
			 * ownership. Nobody else may have changed it in the
			 * meantime, hence the BUG().
			 */
			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
				BUG();

			return 0;
		}

		cpumask_set_cpu(cpu, nohz.cpu_mask);

		/* Every online CPU is in cpu_mask now: no owner is needed. */
		if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
			if (atomic_read(&nohz.load_balancer) == cpu)
				atomic_set(&nohz.load_balancer, -1);
			return 0;
		}

		if (atomic_read(&nohz.load_balancer) == -1) {
			/* No owner yet: try to become the owner. */
			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
				return 1;
		} else if (atomic_read(&nohz.load_balancer) == cpu) {
			int new_ilb;

			if (!(sched_smt_power_savings ||
						sched_mc_power_savings))
				return 1;
			/*
			 * With power savings enabled, check whether another
			 * CPU should own idle load balancing instead.
			 */
			new_ilb = find_new_ilb(cpu);
			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
				/* Give up ownership and kick the nominee. */
				atomic_set(&nohz.load_balancer, -1);
				resched_cpu(new_ilb);
				return 0;
			}
			return 1;
		}
	} else {
		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
			return 0;

		cpumask_clear_cpu(cpu, nohz.cpu_mask);

		/* A CPU that restarts its tick gives up ownership. */
		if (atomic_read(&nohz.load_balancer) == cpu)
			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
				BUG();
	}
	return 0;
}
4628#endif
4629
/* Taken with spin_trylock() by rebalance_domains() for SD_SERIALIZE domains. */
static DEFINE_SPINLOCK(balancing);
4631
4632
4633
4634
4635
4636
4637
/*
 * rebalance_domains - check each domain of @cpu and balance the ones whose
 * balance interval has expired.
 *
 * @cpu:  CPU to balance for
 * @idle: idle state of @cpu; switched to CPU_NOT_IDLE for the remaining
 *        domains once load_balance() returns non-zero
 *
 * Also updates rq->next_balance to the earliest time any visited domain
 * is due for balancing again.
 */
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
	int balance = 1;
	struct rq *rq = cpu_rq(cpu);
	unsigned long interval;
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
	int update_next_balance = 0;
	int need_serialize;

	for_each_domain(cpu, sd) {
		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		/* A CPU that is not idle balances less often. */
		interval = sd->balance_interval;
		if (idle != CPU_IDLE)
			interval *= sd->busy_factor;

		/* scale ms to jiffies, clamped to [1, HZ*NR_CPUS/10] */
		interval = msecs_to_jiffies(interval);
		if (unlikely(!interval))
			interval = 1;
		if (interval > HZ*NR_CPUS/10)
			interval = HZ*NR_CPUS/10;

		need_serialize = sd->flags & SD_SERIALIZE;

		/* Serialized domains are skipped while the lock is busy. */
		if (need_serialize) {
			if (!spin_trylock(&balancing))
				goto out;
		}

		if (time_after_eq(jiffies, sd->last_balance + interval)) {
			if (load_balance(cpu, rq, sd, idle, &balance)) {
				/*
				 * load_balance() reported a non-zero result:
				 * treat the CPU as not idle for the remaining
				 * domains.
				 */
				idle = CPU_NOT_IDLE;
			}
			sd->last_balance = jiffies;
		}
		if (need_serialize)
			spin_unlock(&balancing);
out:
		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}

		/*
		 * load_balance() cleared "balance": this CPU is not the one
		 * that should balance at this level, so stop the walk here.
		 */
		if (!balance)
			break;
	}

	/*
	 * next_balance is only meaningful if at least one domain updated it
	 * above.
	 */
	if (likely(update_next_balance))
		rq->next_balance = next_balance;
}
4707
4708
4709
4710
4711
4712
4713static void run_rebalance_domains(struct softirq_action *h)
4714{
4715 int this_cpu = smp_processor_id();
4716 struct rq *this_rq = cpu_rq(this_cpu);
4717 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4718 CPU_IDLE : CPU_NOT_IDLE;
4719
4720 rebalance_domains(this_cpu, idle);
4721
4722#ifdef CONFIG_NO_HZ
4723
4724
4725
4726
4727
4728 if (this_rq->idle_at_tick &&
4729 atomic_read(&nohz.load_balancer) == this_cpu) {
4730 struct rq *rq;
4731 int balance_cpu;
4732
4733 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4734 if (balance_cpu == this_cpu)
4735 continue;
4736
4737
4738
4739
4740
4741
4742 if (need_resched())
4743 break;
4744
4745 rebalance_domains(balance_cpu, CPU_IDLE);
4746
4747 rq = cpu_rq(balance_cpu);
4748 if (time_after(this_rq->next_balance, rq->next_balance))
4749 this_rq->next_balance = rq->next_balance;
4750 }
4751 }
4752#endif
4753}
4754
4755static inline int on_null_domain(int cpu)
4756{
4757 return !rcu_dereference(cpu_rq(cpu)->sd);
4758}
4759
4760
4761
4762
4763
4764
4765
4766
/*
 * trigger_load_balance - raise SCHED_SOFTIRQ when @cpu is due for
 * rebalancing.
 *
 * @rq:  runqueue of @cpu
 * @cpu: CPU to check
 *
 * With CONFIG_NO_HZ this also maintains ownership of idle load balancing
 * and skips the softirq for idle CPUs that another CPU balances for.
 */
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
#ifdef CONFIG_NO_HZ
	/*
	 * This CPU stopped its tick at some point (in_nohz_recently) and is
	 * busy now: give up ownership if it had it and, when there is no
	 * owner, kick a newly nominated CPU.
	 */
	if (rq->in_nohz_recently && !rq->idle_at_tick) {
		rq->in_nohz_recently = 0;

		if (atomic_read(&nohz.load_balancer) == cpu) {
			cpumask_clear_cpu(cpu, nohz.cpu_mask);
			atomic_set(&nohz.load_balancer, -1);
		}

		if (atomic_read(&nohz.load_balancer) == -1) {
			int ilb = find_new_ilb(cpu);

			if (ilb < nr_cpu_ids)
				resched_cpu(ilb);
		}
	}

	/*
	 * This idle CPU is the owner and every online CPU is in cpu_mask:
	 * reschedule this CPU instead of raising the softirq.
	 */
	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
	    cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
		resched_cpu(cpu);
		return;
	}

	/*
	 * This idle CPU is in cpu_mask but is not the owner: the owner
	 * balances on its behalf, so there is nothing to do here.
	 */
	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
	    cpumask_test_cpu(cpu, nohz.cpu_mask))
		return;
#endif
	/* Don't need to rebalance while attached to NULL domain */
	if (time_after_eq(jiffies, rq->next_balance) &&
	    likely(!on_null_domain(cpu)))
		raise_softirq(SCHED_SOFTIRQ);
}
4814
4815#else
4816
4817
4818
4819
/*
 * No-op variant of idle_balance() for the #else branch of the enclosing
 * conditional (its #if is outside this view - presumably CONFIG_SMP).
 */
static inline void idle_balance(int cpu, struct rq *rq)
{
}
4823
4824#endif
4825
/*
 * Per-CPU kernel statistics, exported for use by modules. The
 * account_*_time() functions below update cpustat through kstat_this_cpu,
 * which presumably refers to this CPU's instance of it.
 */
DEFINE_PER_CPU(struct kernel_stat, kstat);

EXPORT_PER_CPU_SYMBOL(kstat);
4829
4830
4831
4832
4833
4834
4835
4836static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
4837{
4838 u64 ns = 0;
4839
4840 if (task_current(rq, p)) {
4841 update_rq_clock(rq);
4842 ns = rq->clock - p->se.exec_start;
4843 if ((s64)ns < 0)
4844 ns = 0;
4845 }
4846
4847 return ns;
4848}
4849
4850unsigned long long task_delta_exec(struct task_struct *p)
4851{
4852 unsigned long flags;
4853 struct rq *rq;
4854 u64 ns = 0;
4855
4856 rq = task_rq_lock(p, &flags);
4857 ns = do_task_delta_exec(p, rq);
4858 task_rq_unlock(rq, &flags);
4859
4860 return ns;
4861}
4862
4863
4864
4865
4866
4867
4868unsigned long long task_sched_runtime(struct task_struct *p)
4869{
4870 unsigned long flags;
4871 struct rq *rq;
4872 u64 ns = 0;
4873
4874 rq = task_rq_lock(p, &flags);
4875 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
4876 task_rq_unlock(rq, &flags);
4877
4878 return ns;
4879}
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890unsigned long long thread_group_sched_runtime(struct task_struct *p)
4891{
4892 struct task_cputime totals;
4893 unsigned long flags;
4894 struct rq *rq;
4895 u64 ns;
4896
4897 rq = task_rq_lock(p, &flags);
4898 thread_group_cputime(p, &totals);
4899 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
4900 task_rq_unlock(rq, &flags);
4901
4902 return ns;
4903}
4904
4905
4906
4907
4908
4909
4910
4911void account_user_time(struct task_struct *p, cputime_t cputime,
4912 cputime_t cputime_scaled)
4913{
4914 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4915 cputime64_t tmp;
4916
4917
4918 p->utime = cputime_add(p->utime, cputime);
4919 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4920 account_group_user_time(p, cputime);
4921
4922
4923 tmp = cputime_to_cputime64(cputime);
4924 if (TASK_NICE(p) > 0)
4925 cpustat->nice = cputime64_add(cpustat->nice, tmp);
4926 else
4927 cpustat->user = cputime64_add(cpustat->user, tmp);
4928
4929 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
4930
4931 acct_update_integrals(p);
4932}
4933
4934
4935
4936
4937
4938
4939
4940static void account_guest_time(struct task_struct *p, cputime_t cputime,
4941 cputime_t cputime_scaled)
4942{
4943 cputime64_t tmp;
4944 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4945
4946 tmp = cputime_to_cputime64(cputime);
4947
4948
4949 p->utime = cputime_add(p->utime, cputime);
4950 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4951 account_group_user_time(p, cputime);
4952 p->gtime = cputime_add(p->gtime, cputime);
4953
4954
4955 cpustat->user = cputime64_add(cpustat->user, tmp);
4956 cpustat->guest = cputime64_add(cpustat->guest, tmp);
4957}
4958
4959
4960
4961
4962
4963
4964
4965
4966void account_system_time(struct task_struct *p, int hardirq_offset,
4967 cputime_t cputime, cputime_t cputime_scaled)
4968{
4969 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4970 cputime64_t tmp;
4971
4972 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4973 account_guest_time(p, cputime, cputime_scaled);
4974 return;
4975 }
4976
4977
4978 p->stime = cputime_add(p->stime, cputime);
4979 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
4980 account_group_system_time(p, cputime);
4981
4982
4983 tmp = cputime_to_cputime64(cputime);
4984 if (hardirq_count() - hardirq_offset)
4985 cpustat->irq = cputime64_add(cpustat->irq, tmp);
4986 else if (softirq_count())
4987 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
4988 else
4989 cpustat->system = cputime64_add(cpustat->system, tmp);
4990
4991 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
4992
4993
4994 acct_update_integrals(p);
4995}
4996
4997
4998
4999
5000
5001void account_steal_time(cputime_t cputime)
5002{
5003 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5004 cputime64_t cputime64 = cputime_to_cputime64(cputime);
5005
5006 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
5007}
5008
5009
5010
5011
5012
5013void account_idle_time(cputime_t cputime)
5014{
5015 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5016 cputime64_t cputime64 = cputime_to_cputime64(cputime);
5017 struct rq *rq = this_rq();
5018
5019 if (atomic_read(&rq->nr_iowait) > 0)
5020 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
5021 else
5022 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
5023}
5024
5025#ifndef CONFIG_VIRT_CPU_ACCOUNTING
5026
5027
5028
5029
5030
5031
5032void account_process_tick(struct task_struct *p, int user_tick)
5033{
5034 cputime_t one_jiffy = jiffies_to_cputime(1);
5035 cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
5036 struct rq *rq = this_rq();
5037
5038 if (user_tick)
5039 account_user_time(p, one_jiffy, one_jiffy_scaled);
5040 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
5041 account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
5042 one_jiffy_scaled);
5043 else
5044 account_idle_time(one_jiffy);
5045}
5046
5047
5048
5049
5050
5051
/*
 * Account multiple ticks of steal time.
 * @ticks: number of stolen ticks; converted with jiffies_to_cputime() and
 *         passed on to account_steal_time()
 */
void account_steal_ticks(unsigned long ticks)
{
	account_steal_time(jiffies_to_cputime(ticks));
}
5056
5057
5058
5059
5060
/*
 * Account multiple ticks of idle time.
 * @ticks: number of idle ticks; converted with jiffies_to_cputime() and
 *         passed on to account_idle_time()
 */
void account_idle_ticks(unsigned long ticks)
{
	account_idle_time(jiffies_to_cputime(ticks));
}
5065
5066#endif
5067
5068
5069
5070
5071#ifdef CONFIG_VIRT_CPU_ACCOUNTING
/*
 * CONFIG_VIRT_CPU_ACCOUNTING variant: the stored user time of @p is
 * returned unmodified.
 */
cputime_t task_utime(struct task_struct *p)
{
	return p->utime;
}
5076
/*
 * CONFIG_VIRT_CPU_ACCOUNTING variant: the stored system time of @p is
 * returned unmodified.
 */
cputime_t task_stime(struct task_struct *p)
{
	return p->stime;
}
5081#else
5082cputime_t task_utime(struct task_struct *p)
5083{
5084 clock_t utime = cputime_to_clock_t(p->utime),
5085 total = utime + cputime_to_clock_t(p->stime);
5086 u64 temp;
5087
5088
5089
5090
5091 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
5092
5093 if (total) {
5094 temp *= utime;
5095 do_div(temp, total);
5096 }
5097 utime = (clock_t)temp;
5098
5099 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
5100 return p->prev_utime;
5101}
5102
5103cputime_t task_stime(struct task_struct *p)
5104{
5105 clock_t stime;
5106
5107
5108
5109
5110
5111
5112 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
5113 cputime_to_clock_t(task_utime(p));
5114
5115 if (stime >= 0)
5116 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
5117
5118 return p->prev_stime;
5119}
5120#endif
5121
/* Return the guest time accumulated in p->gtime. */
inline cputime_t task_gtime(struct task_struct *p)
{
	return p->gtime;
}
5126
5127
5128
5129
5130
5131
5132
5133
/*
 * Periodic scheduler housekeeping for the CPU this runs on.
 *
 * Under rq->lock it refreshes the runqueue clock and cpu load and calls
 * the task_tick() hook of the current task's scheduling class. On SMP it
 * afterwards records whether the CPU is idle and calls
 * trigger_load_balance().
 *
 * NOTE(review): rq->lock is taken with plain spin_lock(), so this is
 * presumably called with interrupts already disabled (timer tick path) --
 * confirm against the caller.
 */
void scheduler_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr = rq->curr;

	sched_clock_tick();

	spin_lock(&rq->lock);
	update_rq_clock(rq);
	update_cpu_load(rq);
	curr->sched_class->task_tick(rq, curr, 0);
	spin_unlock(&rq->lock);

	perf_counter_task_tick(curr, cpu);

#ifdef CONFIG_SMP
	/* Snapshot the idle state at this tick before kicking the balancer. */
	rq->idle_at_tick = idle_cpu(cpu);
	trigger_load_balance(rq, cpu);
#endif
}
5155
5156notrace unsigned long get_parent_ip(unsigned long addr)
5157{
5158 if (in_lock_functions(addr)) {
5159 addr = CALLER_ADDR2;
5160 if (in_lock_functions(addr))
5161 addr = CALLER_ADDR3;
5162 }
5163 return addr;
5164}
5165
5166#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
5167 defined(CONFIG_PREEMPT_TRACER))
5168
/*
 * Out-of-line preempt_count() increment, built only for CONFIG_PREEMPT
 * with preempt debugging or the preempt tracer enabled.
 *
 * @val: amount added to preempt_count()
 *
 * With CONFIG_DEBUG_PREEMPT the count is sanity-checked before and after
 * the update. When the count goes from zero to @val, trace_preempt_off()
 * is called.
 */
void __kprobes add_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow? A negative count means it is already corrupted;
	 * warn and leave it untouched.
	 */
	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
		return;
#endif
	preempt_count() += val;
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Warn when the PREEMPT_MASK portion of the count is within 10 of
	 * its maximum, i.e. about to overflow.
	 */
	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
				PREEMPT_MASK - 10);
#endif
	/* Count was zero before this call: preemption has just been disabled. */
	if (preempt_count() == val)
		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
EXPORT_SYMBOL(add_preempt_count);
5190
/*
 * Out-of-line preempt_count() decrement, counterpart of
 * add_preempt_count().
 *
 * @val: amount subtracted from preempt_count()
 *
 * With CONFIG_DEBUG_PREEMPT an impossible decrement is warned about and
 * skipped. When the count is about to drop to zero, trace_preempt_on() is
 * called before the update.
 */
void __kprobes sub_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow? Refuse to subtract more than is currently held.
	 */
	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
		return;
	/*
	 * A decrement smaller than PREEMPT_MASK targets the PREEMPT_MASK
	 * portion of the count; warn if that portion is already zero.
	 */
	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
			!(preempt_count() & PREEMPT_MASK)))
		return;
#endif

	/* Count will be zero after this call: preemption is being re-enabled. */
	if (preempt_count() == val)
		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
	preempt_count() -= val;
}
EXPORT_SYMBOL(sub_preempt_count);
5212
5213#endif
5214
5215
5216
5217
5218static noinline void __schedule_bug(struct task_struct *prev)
5219{
5220 struct pt_regs *regs = get_irq_regs();
5221
5222 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
5223 prev->comm, prev->pid, preempt_count());
5224
5225 debug_show_held_locks(prev);
5226 print_modules();
5227 if (irqs_disabled())
5228 print_irqtrace_events(prev);
5229
5230 if (regs)
5231 show_regs(regs);
5232 else
5233 dump_stack();
5234}
5235
5236
5237
5238
/*
 * Various schedule()-time debugging checks and statistics.
 */
static inline void schedule_debug(struct task_struct *prev)
{
	/*
	 * Test if we are atomic. A task with a non-zero exit_state is
	 * exempt from the report.
	 */
	if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
		__schedule_bug(prev);

	profile_hit(SCHED_PROFILING, __builtin_return_address(0));

	schedstat_inc(this_rq(), sched_count);
#ifdef CONFIG_SCHEDSTATS
	/* lock_depth >= 0: count this schedule in the bkl_count statistics. */
	if (unlikely(prev->lock_depth >= 0)) {
		schedstat_inc(this_rq(), bkl_count);
		schedstat_inc(prev, sched_info.bkl_count);
	}
#endif
}
5259
5260static void put_prev_task(struct rq *rq, struct task_struct *prev)
5261{
5262 if (prev->state == TASK_RUNNING) {
5263 u64 runtime = prev->se.sum_exec_runtime;
5264
5265 runtime -= prev->se.prev_sum_exec_runtime;
5266 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277 update_avg(&prev->se.avg_overlap, runtime);
5278 }
5279 prev->sched_class->put_prev_task(rq, prev);
5280}
5281
5282
5283
5284
5285static inline struct task_struct *
5286pick_next_task(struct rq *rq)
5287{
5288 const struct sched_class *class;
5289 struct task_struct *p;
5290
5291
5292
5293
5294
5295 if (likely(rq->nr_running == rq->cfs.nr_running)) {
5296 p = fair_sched_class.pick_next_task(rq);
5297 if (likely(p))
5298 return p;
5299 }
5300
5301 class = sched_class_highest;
5302 for ( ; ; ) {
5303 p = class->pick_next_task(rq);
5304 if (p)
5305 return p;
5306
5307
5308
5309
5310 class = class->next;
5311 }
5312}
5313
5314
5315
5316
/*
 * schedule() is the main scheduler function: it selects the next task to
 * run on this CPU and switches to it.
 */
asmlinkage void __sched schedule(void)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	struct rq *rq;
	int cpu;

need_resched:
	preempt_disable();
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	rcu_qsctr_inc(cpu);
	prev = rq->curr;
	/* Assume an involuntary switch; corrected below if prev blocks. */
	switch_count = &prev->nivcsw;

	release_kernel_lock(prev);
need_resched_nonpreemptible:

	schedule_debug(prev);

	if (sched_feat(HRTICK))
		hrtick_clear(rq);

	spin_lock_irq(&rq->lock);
	update_rq_clock(rq);
	clear_tsk_need_resched(prev);

	/*
	 * prev is not TASK_RUNNING and this is not a PREEMPT_ACTIVE entry:
	 * either a pending signal makes it runnable again, or it is taken
	 * off the runqueue. Both cases count as a voluntary switch.
	 */
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		if (unlikely(signal_pending_state(prev->state, prev)))
			prev->state = TASK_RUNNING;
		else
			deactivate_task(rq, prev, 1);
		switch_count = &prev->nvcsw;
	}

#ifdef CONFIG_SMP
	if (prev->sched_class->pre_schedule)
		prev->sched_class->pre_schedule(rq, prev);
#endif

	/* Nothing runnable here: try idle_balance() before picking a task. */
	if (unlikely(!rq->nr_running))
		idle_balance(cpu, rq);

	put_prev_task(rq, prev);
	next = pick_next_task(rq);

	if (likely(prev != next)) {
		sched_info_switch(prev, next);
		perf_counter_task_sched_out(prev, next, cpu);

		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		context_switch(rq, prev, next); /* unlocks the rq */
		/*
		 * cpu and rq are re-read after the switch: the values
		 * cached before it may no longer describe the CPU this
		 * task now runs on.
		 */
		cpu = smp_processor_id();
		rq = cpu_rq(cpu);
	} else
		spin_unlock_irq(&rq->lock);

	/* A negative return asks for another pass without the preamble. */
	if (unlikely(reacquire_kernel_lock(current) < 0))
		goto need_resched_nonpreemptible;

	preempt_enable_no_resched();
	if (need_resched())
		goto need_resched;
}
EXPORT_SYMBOL(schedule);
5389
5390#ifdef CONFIG_SMP
5391
5392
5393
5394
/*
 * Spin while @owner still owns @lock and is running on its CPU.
 *
 * Returns 0 when the caller should stop spinning (the OWNER_SPIN feature
 * is off, the owner is no longer the current task of its CPU, or this
 * task needs to reschedule), and 1 otherwise (the owner changed, or
 * the owner's CPU could not be determined or is not online).
 *
 * NOTE(review): @owner is dereferenced without any lock visible here, so
 * the checks on the cpu number below presumably guard against a stale
 * pointer -- confirm against the mutex code.
 */
int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
{
	unsigned int cpu;
	struct rq *rq;

	if (!sched_feat(OWNER_SPIN))
		return 0;

#ifdef CONFIG_DEBUG_PAGEALLOC
	/*
	 * With DEBUG_PAGEALLOC the read of owner->cpu goes through
	 * probe_kernel_address(); a failed read ends the attempt.
	 */
	if (probe_kernel_address(&owner->cpu, cpu))
		goto out;
#else
	cpu = owner->cpu;
#endif

	/*
	 * Reject a cpu number outside the cpumask range before using it as
	 * an index.
	 */
	if (cpu >= nr_cpumask_bits)
		goto out;

	/*
	 * Only cpu_rq() of an online CPU is looked at.
	 */
	if (!cpu_online(cpu))
		goto out;

	rq = cpu_rq(cpu);

	for (;;) {
		/*
		 * Owner changed, break to re-assess state.
		 */
		if (lock->owner != owner)
			break;

		/*
		 * Is that owner really running on that cpu?
		 */
		if (task_thread_info(rq->curr) != owner || need_resched())
			return 0;

		cpu_relax();
	}
out:
	return 1;
}
5449#endif
5450
5451#ifdef CONFIG_PREEMPT
5452
5453
5454
5455
5456
/*
 * Entry point for in-kernel preemption (CONFIG_PREEMPT): calls schedule()
 * with PREEMPT_ACTIVE set in the preempt count, and repeats while a
 * reschedule is still pending.
 */
asmlinkage void __sched preempt_schedule(void)
{
	struct thread_info *ti = current_thread_info();

	/*
	 * If there is a non-zero preempt_count or interrupts are disabled,
	 * we do not want to preempt the current task. Just return..
	 */
	if (likely(ti->preempt_count || irqs_disabled()))
		return;

	do {
		add_preempt_count(PREEMPT_ACTIVE);
		schedule();
		sub_preempt_count(PREEMPT_ACTIVE);

		/*
		 * Compiler barrier between dropping PREEMPT_ACTIVE and the
		 * need_resched() test in the loop condition.
		 */
		barrier();
	} while (need_resched());
}
EXPORT_SYMBOL(preempt_schedule);
5481
5482
5483
5484
5485
5486
5487
/*
 * Variant of preempt_schedule() that must be entered with a zero preempt
 * count and interrupts disabled (enforced by the BUG_ON below).
 * Interrupts are enabled only around the schedule() call and are disabled
 * again on return.
 */
asmlinkage void __sched preempt_schedule_irq(void)
{
	struct thread_info *ti = current_thread_info();

	/* Catch callers which need to be fixed */
	BUG_ON(ti->preempt_count || !irqs_disabled());

	do {
		add_preempt_count(PREEMPT_ACTIVE);
		local_irq_enable();
		schedule();
		local_irq_disable();
		sub_preempt_count(PREEMPT_ACTIVE);

		/*
		 * Compiler barrier between dropping PREEMPT_ACTIVE and the
		 * need_resched() test in the loop condition.
		 */
		barrier();
	} while (need_resched());
}
5509
5510#endif
5511
/*
 * Wait-queue callback that wakes the task stored in curr->private via
 * try_to_wake_up() and returns its result. @key is not used.
 */
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
			  void *key)
{
	return try_to_wake_up(curr->private, mode, sync);
}
EXPORT_SYMBOL(default_wake_function);
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5529 int nr_exclusive, int sync, void *key)
5530{
5531 wait_queue_t *curr, *next;
5532
5533 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5534 unsigned flags = curr->flags;
5535
5536 if (curr->func(curr, mode, sync, key) &&
5537 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5538 break;
5539 }
5540}
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552void __wake_up(wait_queue_head_t *q, unsigned int mode,
5553 int nr_exclusive, void *key)
5554{
5555 unsigned long flags;
5556
5557 spin_lock_irqsave(&q->lock, flags);
5558 __wake_up_common(q, mode, nr_exclusive, 0, key);
5559 spin_unlock_irqrestore(&q->lock, flags);
5560}
5561EXPORT_SYMBOL(__wake_up);
5562
5563
5564
5565
/*
 * Same as __wake_up() but takes no lock itself; as the name says, it is
 * meant to be called with q->lock already held. Wakes at most one
 * exclusive waiter, non-sync, with a NULL key.
 */
void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
{
	__wake_up_common(q, mode, 1, 0, NULL);
}
5570
/*
 * Like __wake_up_locked(), but passes @key on to the wakeup callbacks.
 * Takes no lock itself.
 */
void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
{
	__wake_up_common(q, mode, 1, 0, key);
}
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5594 int nr_exclusive, void *key)
5595{
5596 unsigned long flags;
5597 int sync = 1;
5598
5599 if (unlikely(!q))
5600 return;
5601
5602 if (unlikely(!nr_exclusive))
5603 sync = 0;
5604
5605 spin_lock_irqsave(&q->lock, flags);
5606 __wake_up_common(q, mode, nr_exclusive, sync, key);
5607 spin_unlock_irqrestore(&q->lock, flags);
5608}
5609EXPORT_SYMBOL_GPL(__wake_up_sync_key);
5610
5611
5612
5613
/*
 * __wake_up_sync - see __wake_up_sync_key(); this variant passes a NULL
 * key.
 */
void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
{
	__wake_up_sync_key(q, mode, nr_exclusive, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632void complete(struct completion *x)
5633{
5634 unsigned long flags;
5635
5636 spin_lock_irqsave(&x->wait.lock, flags);
5637 x->done++;
5638 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
5639 spin_unlock_irqrestore(&x->wait.lock, flags);
5640}
5641EXPORT_SYMBOL(complete);
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652void complete_all(struct completion *x)
5653{
5654 unsigned long flags;
5655
5656 spin_lock_irqsave(&x->wait.lock, flags);
5657 x->done += UINT_MAX/2;
5658 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
5659 spin_unlock_irqrestore(&x->wait.lock, flags);
5660}
5661EXPORT_SYMBOL(complete_all);
5662
/*
 * Core of the completion wait. Called with x->wait.lock held and
 * interrupts disabled (see wait_for_common()); the lock is dropped only
 * around schedule_timeout().
 *
 * @x:       completion to wait on
 * @timeout: timeout handed to schedule_timeout()
 * @state:   task state to sleep in
 *
 * Returns -ERESTARTSYS if a signal relevant to @state is pending while
 * the completion is not done, 0 if the timeout ran out first, and
 * otherwise the remaining timeout (at least 1) after consuming one
 * "done" count.
 */
static inline long __sched
do_wait_for_common(struct completion *x, long timeout, int state)
{
	if (!x->done) {
		DECLARE_WAITQUEUE(wait, current);

		/* Queue at the tail as an exclusive waiter. */
		wait.flags |= WQ_FLAG_EXCLUSIVE;
		__add_wait_queue_tail(&x->wait, &wait);
		do {
			if (signal_pending_state(state, current)) {
				timeout = -ERESTARTSYS;
				break;
			}
			/* State is set before the lock is released. */
			__set_current_state(state);
			spin_unlock_irq(&x->wait.lock);
			timeout = schedule_timeout(timeout);
			spin_lock_irq(&x->wait.lock);
		} while (!x->done && timeout);
		__remove_wait_queue(&x->wait, &wait);
		/* Signal or timeout: report it without touching x->done. */
		if (!x->done)
			return timeout;
	}
	x->done--;
	/* Success must be non-zero even if the timeout just hit 0. */
	return timeout ?: 1;
}
5688
5689static long __sched
5690wait_for_common(struct completion *x, long timeout, int state)
5691{
5692 might_sleep();
5693
5694 spin_lock_irq(&x->wait.lock);
5695 timeout = do_wait_for_common(x, timeout, state);
5696 spin_unlock_irq(&x->wait.lock);
5697 return timeout;
5698}
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
/**
 * wait_for_completion: - waits for completion of a task
 * @x:  holds the state of this particular completion
 *
 * Waits in TASK_UNINTERRUPTIBLE with MAX_SCHEDULE_TIMEOUT as the
 * timeout.
 */
void __sched wait_for_completion(struct completion *x)
{
	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion);
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
/**
 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
 * @x:  holds the state of this particular completion
 * @timeout:  timeout value in jiffies
 *
 * Waits in TASK_UNINTERRUPTIBLE. Returns the value of wait_for_common():
 * 0 if the timeout elapsed first, otherwise a non-zero remaining time.
 */
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
{
	return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_timeout);
5731
5732
5733
5734
5735
5736
5737
5738
5739int __sched wait_for_completion_interruptible(struct completion *x)
5740{
5741 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
5742 if (t == -ERESTARTSYS)
5743 return t;
5744 return 0;
5745}
5746EXPORT_SYMBOL(wait_for_completion_interruptible);
5747
5748
5749
5750
5751
5752
5753
5754
5755
/**
 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
 * @x:  holds the state of this particular completion
 * @timeout:  timeout value in jiffies
 *
 * Waits in TASK_INTERRUPTIBLE and returns the value of wait_for_common()
 * converted to unsigned long. Note that a -ERESTARTSYS result therefore
 * comes back as a large unsigned value.
 */
unsigned long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
					  unsigned long timeout)
{
	return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
5763
5764
5765
5766
5767
5768
5769
5770
5771int __sched wait_for_completion_killable(struct completion *x)
5772{
5773 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
5774 if (t == -ERESTARTSYS)
5775 return t;
5776 return 0;
5777}
5778EXPORT_SYMBOL(wait_for_completion_killable);
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792bool try_wait_for_completion(struct completion *x)
5793{
5794 int ret = 1;
5795
5796 spin_lock_irq(&x->wait.lock);
5797 if (!x->done)
5798 ret = 0;
5799 else
5800 x->done--;
5801 spin_unlock_irq(&x->wait.lock);
5802 return ret;
5803}
5804EXPORT_SYMBOL(try_wait_for_completion);
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814bool completion_done(struct completion *x)
5815{
5816 int ret = 1;
5817
5818 spin_lock_irq(&x->wait.lock);
5819 if (!x->done)
5820 ret = 0;
5821 spin_unlock_irq(&x->wait.lock);
5822 return ret;
5823}
5824EXPORT_SYMBOL(completion_done);
5825
/*
 * Shared implementation of the sleep_on() family.
 *
 * @q:       wait queue to sleep on
 * @state:   task state to sleep in
 * @timeout: timeout handed to schedule_timeout()
 *
 * Returns whatever schedule_timeout() returned.
 *
 * NOTE(review): the task state is set before the entry is put on the
 * queue and no condition is re-checked, so a wakeup issued before the
 * enqueue is not seen by this function -- callers must tolerate that.
 */
static long __sched
sleep_on_common(wait_queue_head_t *q, int state, long timeout)
{
	unsigned long flags;
	wait_queue_t wait;

	init_waitqueue_entry(&wait, current);

	__set_current_state(state);

	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue(q, &wait);
	/* Only the lock is dropped here; flags are restored at the end. */
	spin_unlock(&q->lock);
	timeout = schedule_timeout(timeout);
	spin_lock_irq(&q->lock);
	__remove_wait_queue(q, &wait);
	spin_unlock_irqrestore(&q->lock, flags);

	return timeout;
}
5846
/*
 * Sleep on @q in TASK_INTERRUPTIBLE with MAX_SCHEDULE_TIMEOUT as the
 * timeout; see sleep_on_common().
 */
void __sched interruptible_sleep_on(wait_queue_head_t *q)
{
	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(interruptible_sleep_on);
5852
/*
 * Sleep on @q in TASK_INTERRUPTIBLE for at most @timeout; returns the
 * value of sleep_on_common().
 */
long __sched
interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
}
EXPORT_SYMBOL(interruptible_sleep_on_timeout);
5859
/*
 * Sleep on @q in TASK_UNINTERRUPTIBLE with MAX_SCHEDULE_TIMEOUT as the
 * timeout; see sleep_on_common().
 */
void __sched sleep_on(wait_queue_head_t *q)
{
	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(sleep_on);
5865
/*
 * Sleep on @q in TASK_UNINTERRUPTIBLE for at most @timeout; returns the
 * value of sleep_on_common().
 */
long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
}
EXPORT_SYMBOL(sleep_on_timeout);
5871
5872#ifdef CONFIG_RT_MUTEXES
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task
 * @prio: prio value (kernel-internal form), must be in [0, MAX_PRIO]
 *
 * Changes p->prio and switches the task to the rt or fair scheduling
 * class according to rt_prio(@prio). The task is taken off its runqueue
 * and put back around the change so the class sees a consistent state.
 * Built only with CONFIG_RT_MUTEXES.
 */
void rt_mutex_setprio(struct task_struct *p, int prio)
{
	unsigned long flags;
	int oldprio, on_rq, running;
	struct rq *rq;
	const struct sched_class *prev_class = p->sched_class;

	BUG_ON(prio < 0 || prio > MAX_PRIO);

	rq = task_rq_lock(p, &flags);
	update_rq_clock(rq);

	oldprio = p->prio;
	on_rq = p->se.on_rq;
	running = task_current(rq, p);
	/* Detach from the old class before changing class and priority. */
	if (on_rq)
		dequeue_task(rq, p, 0);
	if (running)
		p->sched_class->put_prev_task(rq, p);

	if (rt_prio(prio))
		p->sched_class = &rt_sched_class;
	else
		p->sched_class = &fair_sched_class;

	p->prio = prio;

	/* Re-attach using the (possibly new) class. */
	if (running)
		p->sched_class->set_curr_task(rq);
	if (on_rq) {
		enqueue_task(rq, p, 0);

		check_class_changed(rq, p, prev_class, oldprio, running);
	}
	task_rq_unlock(rq, &flags);
}
5920
5921#endif
5922
/*
 * Set the nice value of @p.
 *
 * Nothing is done if @nice equals the task's current nice value or lies
 * outside [-20, 19].
 */
void set_user_nice(struct task_struct *p, long nice)
{
	int old_prio, delta, on_rq;
	unsigned long flags;
	struct rq *rq;

	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
		return;
	/*
	 * The runqueue lock of @p is held for the whole update.
	 */
	rq = task_rq_lock(p, &flags);
	update_rq_clock(rq);
	/*
	 * For a task with an RT policy only static_prio is updated; its
	 * effective priority, load weight and runqueue position are left
	 * alone.
	 */
	if (task_has_rt_policy(p)) {
		p->static_prio = NICE_TO_PRIO(nice);
		goto out_unlock;
	}
	on_rq = p->se.on_rq;
	if (on_rq)
		dequeue_task(rq, p, 0);

	p->static_prio = NICE_TO_PRIO(nice);
	set_load_weight(p);
	old_prio = p->prio;
	p->prio = effective_prio(p);
	delta = p->prio - old_prio;

	if (on_rq) {
		enqueue_task(rq, p, 0);
		/*
		 * If the task increased its priority or is running and
		 * lowered its priority, then reschedule its CPU:
		 */
		if (delta < 0 || (delta > 0 && task_running(rq, p)))
			resched_task(rq->curr);
	}
out_unlock:
	task_rq_unlock(rq, &flags);
}
EXPORT_SYMBOL(set_user_nice);
5970
5971
5972
5973
5974
5975
5976int can_nice(const struct task_struct *p, const int nice)
5977{
5978
5979 int nice_rlim = 20 - nice;
5980
5981 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
5982 capable(CAP_SYS_NICE));
5983}
5984
5985#ifdef __ARCH_WANT_SYS_NICE
5986
5987
5988
5989
5990
5991
5992
5993
5994SYSCALL_DEFINE1(nice, int, increment)
5995{
5996 long nice, retval;
5997
5998
5999
6000
6001
6002
6003 if (increment < -40)
6004 increment = -40;
6005 if (increment > 40)
6006 increment = 40;
6007
6008 nice = TASK_NICE(current) + increment;
6009 if (nice < -20)
6010 nice = -20;
6011 if (nice > 19)
6012 nice = 19;
6013
6014 if (increment < 0 && !can_nice(current, nice))
6015 return -EPERM;
6016
6017 retval = security_task_setnice(current, nice);
6018 if (retval)
6019 return retval;
6020
6021 set_user_nice(current, nice);
6022 return 0;
6023}
6024
6025#endif
6026
6027
6028
6029
6030
6031
6032
6033
6034
/**
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * Returns p->prio shifted down by MAX_RT_PRIO, so priorities below
 * MAX_RT_PRIO come out negative.
 */
int task_prio(const struct task_struct *p)
{
	return p->prio - MAX_RT_PRIO;
}
6039
6040
6041
6042
6043
/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 *
 * Derived from p->static_prio via TASK_NICE().
 */
int task_nice(const struct task_struct *p)
{
	return TASK_NICE(p);
}
EXPORT_SYMBOL(task_nice);
6049
6050
6051
6052
6053
/**
 * idle_cpu - is a given cpu idle currently?
 * @cpu: the processor in question.
 *
 * Returns non-zero when the task currently running on @cpu is that
 * runqueue's idle task.
 */
int idle_cpu(int cpu)
{
	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
}
6058
6059
6060
6061
6062
/**
 * idle_task - return the idle task for a given cpu.
 * @cpu: the processor in question.
 */
struct task_struct *idle_task(int cpu)
{
	return cpu_rq(cpu)->idle;
}
6067
6068
6069
6070
6071
6072static struct task_struct *find_process_by_pid(pid_t pid)
6073{
6074 return pid ? find_task_by_vpid(pid) : current;
6075}
6076
6077
6078static void
6079__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
6080{
6081 BUG_ON(p->se.on_rq);
6082
6083 p->policy = policy;
6084 switch (p->policy) {
6085 case SCHED_NORMAL:
6086 case SCHED_BATCH:
6087 case SCHED_IDLE:
6088 p->sched_class = &fair_sched_class;
6089 break;
6090 case SCHED_FIFO:
6091 case SCHED_RR:
6092 p->sched_class = &rt_sched_class;
6093 break;
6094 }
6095
6096 p->rt_priority = prio;
6097 p->normal_prio = normal_prio(p);
6098
6099 p->prio = rt_mutex_getprio(p);
6100 set_load_weight(p);
6101}
6102
6103
6104
6105
6106static bool check_same_owner(struct task_struct *p)
6107{
6108 const struct cred *cred = current_cred(), *pcred;
6109 bool match;
6110
6111 rcu_read_lock();
6112 pcred = __task_cred(p);
6113 match = (cred->euid == pcred->euid ||
6114 cred->euid == pcred->uid);
6115 rcu_read_unlock();
6116 return match;
6117}
6118
/*
 * Change the scheduling policy and/or RT priority of @p.
 *
 * @p:      the task in question
 * @policy: new policy, or a negative value to keep the current policy and
 *          only change the parameters
 * @param:  structure containing the new RT priority
 * @user:   true to apply the permission, rt-group and security-hook
 *          checks; false to skip them
 *
 * Returns 0 on success, -EINVAL for an invalid policy/priority
 * combination, -EPERM when a check fails, -ESRCH when the task's sighand
 * could not be locked, or the security hook's error.
 */
static int __sched_setscheduler(struct task_struct *p, int policy,
				struct sched_param *param, bool user)
{
	int retval, oldprio, oldpolicy = -1, on_rq, running;
	unsigned long flags;
	const struct sched_class *prev_class = p->sched_class;
	struct rq *rq;

	/* Must not be called from interrupt context. */
	BUG_ON(in_interrupt());
recheck:
	/* Policy is double-checked below once the rq lock is held. */
	if (policy < 0)
		policy = oldpolicy = p->policy;
	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
			policy != SCHED_NORMAL && policy != SCHED_BATCH &&
			policy != SCHED_IDLE)
		return -EINVAL;
	/*
	 * RT policies require a priority of 1..MAX_USER_RT_PRIO-1 for
	 * tasks with an mm (1..MAX_RT_PRIO-1 for tasks without one); all
	 * other policies require priority 0.
	 */
	if (param->sched_priority < 0 ||
	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
		return -EINVAL;
	if (rt_policy(policy) != (param->sched_priority != 0))
		return -EINVAL;

	/*
	 * Checks for callers without CAP_SYS_NICE.
	 */
	if (user && !capable(CAP_SYS_NICE)) {
		if (rt_policy(policy)) {
			unsigned long rlim_rtprio;

			if (!lock_task_sighand(p, &flags))
				return -ESRCH;
			rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
			unlock_task_sighand(p, &flags);

			/* can't set/change the rt policy */
			if (policy != p->policy && !rlim_rtprio)
				return -EPERM;

			/* can't increase priority */
			if (param->sched_priority > p->rt_priority &&
			    param->sched_priority > rlim_rtprio)
				return -EPERM;
		}
		/*
		 * A SCHED_IDLE task may not be moved to any other policy.
		 */
		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
			return -EPERM;

		/* can't change other user's priorities */
		if (!check_same_owner(p))
			return -EPERM;
	}

	if (user) {
#ifdef CONFIG_RT_GROUP_SCHED
		/*
		 * Do not allow realtime tasks into groups that have no
		 * runtime assigned.
		 */
		if (rt_bandwidth_enabled() && rt_policy(policy) &&
				task_group(p)->rt_bandwidth.rt_runtime == 0)
			return -EPERM;
#endif

		retval = security_task_setscheduler(p, policy, param);
		if (retval)
			return retval;
	}

	/*
	 * pi_lock is held across the change and is the outer lock; the
	 * runqueue lock nests inside it.
	 */
	spin_lock_irqsave(&p->pi_lock, flags);
	/*
	 * p->policy is only changed below with the runqueue lock held.
	 */
	rq = __task_rq_lock(p);
	/* recheck policy now with rq lock held */
	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
		policy = oldpolicy = -1;
		__task_rq_unlock(rq);
		spin_unlock_irqrestore(&p->pi_lock, flags);
		goto recheck;
	}
	update_rq_clock(rq);
	on_rq = p->se.on_rq;
	running = task_current(rq, p);
	/* Detach from the old class; __setscheduler() needs !on_rq. */
	if (on_rq)
		deactivate_task(rq, p, 0);
	if (running)
		p->sched_class->put_prev_task(rq, p);

	oldprio = p->prio;
	__setscheduler(rq, p, policy, param->sched_priority);

	/* Re-attach using the (possibly new) class. */
	if (running)
		p->sched_class->set_curr_task(rq);
	if (on_rq) {
		activate_task(rq, p, 0);

		check_class_changed(rq, p, prev_class, oldprio, running);
	}
	__task_rq_unlock(rq);
	spin_unlock_irqrestore(&p->pi_lock, flags);

	rt_mutex_adjust_pi(p);

	return 0;
}
6240
6241
6242
6243
6244
6245
6246
6247
6248
/**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Calls __sched_setscheduler() with the user checks enabled.
 */
int sched_setscheduler(struct task_struct *p, int policy,
		       struct sched_param *param)
{
	return __sched_setscheduler(p, policy, param, true);
}
EXPORT_SYMBOL_GPL(sched_setscheduler);
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
/**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Same as sched_setscheduler(), but calls __sched_setscheduler() with the
 * user checks (capability, ownership, rt-group and security hook)
 * disabled.
 */
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
			       struct sched_param *param)
{
	return __sched_setscheduler(p, policy, param, false);
}
6272
6273static int
6274do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
6275{
6276 struct sched_param lparam;
6277 struct task_struct *p;
6278 int retval;
6279
6280 if (!param || pid < 0)
6281 return -EINVAL;
6282 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
6283 return -EFAULT;
6284
6285 rcu_read_lock();
6286 retval = -ESRCH;
6287 p = find_process_by_pid(pid);
6288 if (p != NULL)
6289 retval = sched_setscheduler(p, policy, &lparam);
6290 rcu_read_unlock();
6291
6292 return retval;
6293}
6294
6295
6296
6297
6298
6299
6300
/**
 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
 * @pid: the pid in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 */
SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
		struct sched_param __user *, param)
{
	/* negative values for policy are not valid */
	if (policy < 0)
		return -EINVAL;

	return do_sched_setscheduler(pid, policy, param);
}
6310
6311
6312
6313
6314
6315
/**
 * sys_sched_setparam - set/change the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the new RT priority.
 *
 * Passes a policy of -1, which makes __sched_setscheduler() keep the
 * task's current policy.
 */
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
	return do_sched_setscheduler(pid, -1, param);
}
6320
6321
6322
6323
6324
6325SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6326{
6327 struct task_struct *p;
6328 int retval;
6329
6330 if (pid < 0)
6331 return -EINVAL;
6332
6333 retval = -ESRCH;
6334 read_lock(&tasklist_lock);
6335 p = find_process_by_pid(pid);
6336 if (p) {
6337 retval = security_task_getscheduler(p);
6338 if (!retval)
6339 retval = p->policy;
6340 }
6341 read_unlock(&tasklist_lock);
6342 return retval;
6343}
6344
6345
6346
6347
6348
6349
6350SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6351{
6352 struct sched_param lp;
6353 struct task_struct *p;
6354 int retval;
6355
6356 if (!param || pid < 0)
6357 return -EINVAL;
6358
6359 read_lock(&tasklist_lock);
6360 p = find_process_by_pid(pid);
6361 retval = -ESRCH;
6362 if (!p)
6363 goto out_unlock;
6364
6365 retval = security_task_getscheduler(p);
6366 if (retval)
6367 goto out_unlock;
6368
6369 lp.sched_priority = p->rt_priority;
6370 read_unlock(&tasklist_lock);
6371
6372
6373
6374
6375 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
6376
6377 return retval;
6378
6379out_unlock:
6380 read_unlock(&tasklist_lock);
6381 return retval;
6382}
6383
/*
 * Set the cpu affinity of the task identified by @pid to @in_mask,
 * restricted to the cpus its cpuset allows.
 *
 * Returns 0 on success, -ESRCH if no task matches, -ENOMEM if a temporary
 * mask cannot be allocated, -EPERM if the caller neither owns the task
 * nor has CAP_SYS_NICE, or the error from the security hook or
 * set_cpus_allowed_ptr().
 */
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
	cpumask_var_t cpus_allowed, new_mask;
	struct task_struct *p;
	int retval;

	get_online_cpus();
	read_lock(&tasklist_lock);

	p = find_process_by_pid(pid);
	if (!p) {
		read_unlock(&tasklist_lock);
		put_online_cpus();
		return -ESRCH;
	}

	/*
	 * Take a reference on the task and drop tasklist_lock: the rest
	 * of the function (GFP_KERNEL allocations, set_cpus_allowed_ptr())
	 * runs without it.
	 */
	get_task_struct(p);
	read_unlock(&tasklist_lock);

	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
		retval = -ENOMEM;
		goto out_put_task;
	}
	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
		retval = -ENOMEM;
		goto out_free_cpus_allowed;
	}
	retval = -EPERM;
	if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
		goto out_unlock;

	retval = security_task_setscheduler(p, 0, NULL);
	if (retval)
		goto out_unlock;

	cpuset_cpus_allowed(p, cpus_allowed);
	cpumask_and(new_mask, in_mask, cpus_allowed);
 again:
	retval = set_cpus_allowed_ptr(p, new_mask);

	if (!retval) {
		cpuset_cpus_allowed(p, cpus_allowed);
		if (!cpumask_subset(new_mask, cpus_allowed)) {
			/*
			 * The cpuset's allowed cpus no longer cover the
			 * mask just applied (presumably a concurrent
			 * cpuset update -- confirm). Fall back to the
			 * cpuset's mask and try again.
			 */
			cpumask_copy(new_mask, cpus_allowed);
			goto again;
		}
	}
out_unlock:
	free_cpumask_var(new_mask);
out_free_cpus_allowed:
	free_cpumask_var(cpus_allowed);
out_put_task:
	put_task_struct(p);
	put_online_cpus();
	return retval;
}
6450
6451static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
6452 struct cpumask *new_mask)
6453{
6454 if (len < cpumask_size())
6455 cpumask_clear(new_mask);
6456 else if (len > cpumask_size())
6457 len = cpumask_size();
6458
6459 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
6460}
6461
6462
6463
6464
6465
6466
6467
6468SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6469 unsigned long __user *, user_mask_ptr)
6470{
6471 cpumask_var_t new_mask;
6472 int retval;
6473
6474 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
6475 return -ENOMEM;
6476
6477 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
6478 if (retval == 0)
6479 retval = sched_setaffinity(pid, new_mask);
6480 free_cpumask_var(new_mask);
6481 return retval;
6482}
6483
6484long sched_getaffinity(pid_t pid, struct cpumask *mask)
6485{
6486 struct task_struct *p;
6487 int retval;
6488
6489 get_online_cpus();
6490 read_lock(&tasklist_lock);
6491
6492 retval = -ESRCH;
6493 p = find_process_by_pid(pid);
6494 if (!p)
6495 goto out_unlock;
6496
6497 retval = security_task_getscheduler(p);
6498 if (retval)
6499 goto out_unlock;
6500
6501 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
6502
6503out_unlock:
6504 read_unlock(&tasklist_lock);
6505 put_online_cpus();
6506
6507 return retval;
6508}
6509
6510
6511
6512
6513
6514
6515
6516SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6517 unsigned long __user *, user_mask_ptr)
6518{
6519 int ret;
6520 cpumask_var_t mask;
6521
6522 if (len < cpumask_size())
6523 return -EINVAL;
6524
6525 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
6526 return -ENOMEM;
6527
6528 ret = sched_getaffinity(pid, mask);
6529 if (ret == 0) {
6530 if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
6531 ret = -EFAULT;
6532 else
6533 ret = cpumask_size();
6534 }
6535 free_cpumask_var(mask);
6536
6537 return ret;
6538}
6539
6540
6541
6542
6543
6544
6545
/*
 * sys_sched_yield - let the current task's scheduling class yield the CPU,
 * then call schedule().  Always returns 0.
 */
SYSCALL_DEFINE0(sched_yield)
{
	struct rq *rq = this_rq_lock();

	schedstat_inc(rq, yld_count);
	current->sched_class->yield_task(rq);

	/*
	 * Release the runqueue lock by hand (sparse annotation, lockdep
	 * release, raw unlock) and re-enable preemption without triggering
	 * a reschedule: schedule() is called explicitly right below.
	 */
	__release(rq->lock);
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
	_raw_spin_unlock(&rq->lock);
	preempt_enable_no_resched();

	schedule();

	return 0;
}
6566
6567static inline int should_resched(void)
6568{
6569 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
6570}
6571
/*
 * Call schedule() with PREEMPT_ACTIVE set in the preempt count, repeating
 * for as long as need_resched() remains true.
 */
static void __cond_resched(void)
{
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
	__might_sleep(__FILE__, __LINE__);
#endif

	/*
	 * PREEMPT_ACTIVE is raised around each schedule() call and dropped
	 * again afterwards; should_resched() tests this same bit.
	 */
	do {
		add_preempt_count(PREEMPT_ACTIVE);
		schedule();
		sub_preempt_count(PREEMPT_ACTIVE);
	} while (need_resched());
}
6588
6589int __sched _cond_resched(void)
6590{
6591 if (should_resched()) {
6592 __cond_resched();
6593 return 1;
6594 }
6595 return 0;
6596}
6597EXPORT_SYMBOL(_cond_resched);
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607int cond_resched_lock(spinlock_t *lock)
6608{
6609 int resched = should_resched();
6610 int ret = 0;
6611
6612 if (spin_needbreak(lock) || resched) {
6613 spin_unlock(lock);
6614 if (resched)
6615 __cond_resched();
6616 else
6617 cpu_relax();
6618 ret = 1;
6619 spin_lock(lock);
6620 }
6621 return ret;
6622}
6623EXPORT_SYMBOL(cond_resched_lock);
6624
6625int __sched cond_resched_softirq(void)
6626{
6627 BUG_ON(!in_softirq());
6628
6629 if (should_resched()) {
6630 local_bh_enable();
6631 __cond_resched();
6632 local_bh_disable();
6633 return 1;
6634 }
6635 return 0;
6636}
6637EXPORT_SYMBOL(cond_resched_softirq);
6638
6639
6640
6641
6642
6643
6644
/*
 * yield - in-kernel wrapper around sys_sched_yield().
 *
 * The task state is forced to TASK_RUNNING before yielding.
 */
void __sched yield(void)
{
	set_current_state(TASK_RUNNING);
	sys_sched_yield();
}
EXPORT_SYMBOL(yield);
6651
6652
6653
6654
6655
6656
6657
6658
/*
 * io_schedule - schedule() with I/O-wait accounting.
 *
 * Block-I/O delay accounting is started and the nr_iowait counter of the
 * runqueue sampled on entry is incremented before schedule(); both are
 * undone, in reverse order, once schedule() returns.
 */
void __sched io_schedule(void)
{
	struct rq *rq = &__raw_get_cpu_var(runqueues);

	delayacct_blkio_start();
	atomic_inc(&rq->nr_iowait);
	schedule();
	/* Decrements the same rq that was incremented above. */
	atomic_dec(&rq->nr_iowait);
	delayacct_blkio_end();
}
EXPORT_SYMBOL(io_schedule);
6670
/*
 * io_schedule_timeout - schedule_timeout() with I/O-wait accounting.
 * @timeout: passed through unchanged to schedule_timeout()
 *
 * Same bracketing as io_schedule(): delay accounting and the runqueue's
 * nr_iowait counter wrap the sleep.
 *
 * Returns whatever schedule_timeout() returned.
 */
long __sched io_schedule_timeout(long timeout)
{
	struct rq *rq = &__raw_get_cpu_var(runqueues);
	long ret;

	delayacct_blkio_start();
	atomic_inc(&rq->nr_iowait);
	ret = schedule_timeout(timeout);
	atomic_dec(&rq->nr_iowait);
	delayacct_blkio_end();
	return ret;
}
6683
6684
6685
6686
6687
6688
6689
6690
6691SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
6692{
6693 int ret = -EINVAL;
6694
6695 switch (policy) {
6696 case SCHED_FIFO:
6697 case SCHED_RR:
6698 ret = MAX_USER_RT_PRIO-1;
6699 break;
6700 case SCHED_NORMAL:
6701 case SCHED_BATCH:
6702 case SCHED_IDLE:
6703 ret = 0;
6704 break;
6705 }
6706 return ret;
6707}
6708
6709
6710
6711
6712
6713
6714
6715
6716SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
6717{
6718 int ret = -EINVAL;
6719
6720 switch (policy) {
6721 case SCHED_FIFO:
6722 case SCHED_RR:
6723 ret = 1;
6724 break;
6725 case SCHED_NORMAL:
6726 case SCHED_BATCH:
6727 case SCHED_IDLE:
6728 ret = 0;
6729 }
6730 return ret;
6731}
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6742 struct timespec __user *, interval)
6743{
6744 struct task_struct *p;
6745 unsigned int time_slice;
6746 int retval;
6747 struct timespec t;
6748
6749 if (pid < 0)
6750 return -EINVAL;
6751
6752 retval = -ESRCH;
6753 read_lock(&tasklist_lock);
6754 p = find_process_by_pid(pid);
6755 if (!p)
6756 goto out_unlock;
6757
6758 retval = security_task_getscheduler(p);
6759 if (retval)
6760 goto out_unlock;
6761
6762
6763
6764
6765
6766 time_slice = 0;
6767 if (p->policy == SCHED_RR) {
6768 time_slice = DEF_TIMESLICE;
6769 } else if (p->policy != SCHED_FIFO) {
6770 struct sched_entity *se = &p->se;
6771 unsigned long flags;
6772 struct rq *rq;
6773
6774 rq = task_rq_lock(p, &flags);
6775 if (rq->cfs.load.weight)
6776 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
6777 task_rq_unlock(rq, &flags);
6778 }
6779 read_unlock(&tasklist_lock);
6780 jiffies_to_timespec(time_slice, &t);
6781 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
6782 return retval;
6783
6784out_unlock:
6785 read_unlock(&tasklist_lock);
6786 return retval;
6787}
6788
/* One character per task state, indexed by the value computed below. */
static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;

/*
 * Print a one-line summary of task @p (comm, state character, saved PC or
 * a "running" marker, unused stack, pid, parent pid, thread-info flags)
 * and then its stack trace.
 */
void sched_show_task(struct task_struct *p)
{
	unsigned long free = 0;
	unsigned state;

	/* Index of the lowest set state bit plus one; 0 when p->state is 0. */
	state = p->state ? __ffs(p->state) + 1 : 0;
	printk(KERN_INFO "%-13.13s %c", p->comm,
		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if BITS_PER_LONG == 32
	if (state == TASK_RUNNING)
		printk(KERN_CONT " running ");
	else
		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
#else
	if (state == TASK_RUNNING)
		printk(KERN_CONT " running task ");
	else
		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
#endif
	/* 'free' stays 0 unless stack-usage debugging is configured. */
#ifdef CONFIG_DEBUG_STACK_USAGE
	free = stack_not_used(p);
#endif
	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
		task_pid_nr(p), task_pid_nr(p->real_parent),
		(unsigned long)task_thread_info(p)->flags);

	show_stack(p, NULL);
}
6819
/*
 * Dump every thread whose state matches @state_filter; a filter of 0
 * matches all threads.  The walk runs under a read lock on tasklist_lock.
 */
void show_state_filter(unsigned long state_filter)
{
	struct task_struct *g, *p;

	/* Column header; a separate literal is kept per word size. */
#if BITS_PER_LONG == 32
	printk(KERN_INFO
		" task PC stack pid father\n");
#else
	printk(KERN_INFO
		" task PC stack pid father\n");
#endif
	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		/*
		 * Touch the NMI watchdog on every iteration; presumably the
		 * dump can take long enough to trip it otherwise.
		 */
		touch_nmi_watchdog();
		if (!state_filter || (p->state & state_filter))
			sched_show_task(p);
	} while_each_thread(g, p);

	touch_all_softlockup_watchdogs();

#ifdef CONFIG_SCHED_DEBUG
	sysrq_sched_debug_show();
#endif
	read_unlock(&tasklist_lock);

	/* Lock state is dumped only when the filter has every bit set. */
	if (state_filter == -1)
		debug_show_all_locks();
}
6854
/* Put @idle into the idle scheduling class. */
void __cpuinit init_idle_bootup_task(struct task_struct *idle)
{
	idle->sched_class = &idle_sched_class;
}
6859
6860
6861
6862
6863
6864
6865
6866
6867
/*
 * init_idle - set up @idle as the idle task of @cpu
 * @idle: task to initialise
 * @cpu: CPU the task becomes the idle thread for
 *
 * Under the runqueue lock the task's scheduler state is reset, it is
 * restricted to @cpu, and it is installed as both rq->curr and rq->idle.
 * Afterwards its preempt count is set and it is moved into the idle
 * scheduling class.
 */
void __cpuinit init_idle(struct task_struct *idle, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	spin_lock_irqsave(&rq->lock, flags);

	__sched_fork(idle);
	idle->se.exec_start = sched_clock();

	idle->prio = idle->normal_prio = MAX_PRIO;
	cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
	__set_task_cpu(idle, cpu);

	rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
	idle->oncpu = 1;
#endif
	spin_unlock_irqrestore(&rq->lock, flags);

	/*
	 * With CONFIG_PREEMPT the preempt count is 1 if the task has a
	 * non-negative lock_depth and 0 otherwise; without it, always 0.
	 */
#if defined(CONFIG_PREEMPT)
	task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
#else
	task_thread_info(idle)->preempt_count = 0;
#endif

	/* Only now does the task enter the idle scheduling class. */
	idle->sched_class = &idle_sched_class;
	ftrace_graph_init_task(idle);
}
6900
6901
6902
6903
6904
6905
6906
6907
/*
 * Global CPU mask.  NOTE(review): the name suggests it tracks CPUs that
 * are in tickless (no-HZ) state; it is not used in this part of the
 * file - confirm against its users.
 */
cpumask_var_t nohz_cpu_mask;
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919static inline void sched_init_granularity(void)
6920{
6921 unsigned int factor = 1 + ilog2(num_online_cpus());
6922 const unsigned long limit = 200000000;
6923
6924 sysctl_sched_min_granularity *= factor;
6925 if (sysctl_sched_min_granularity > limit)
6926 sysctl_sched_min_granularity = limit;
6927
6928 sysctl_sched_latency *= factor;
6929 if (sysctl_sched_latency > limit)
6930 sysctl_sched_latency = limit;
6931
6932 sysctl_sched_wakeup_granularity *= factor;
6933
6934 sysctl_sched_shares_ratelimit *= factor;
6935}
6936
6937#ifdef CONFIG_SMP
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
/*
 * set_cpus_allowed_ptr - change the set of CPUs task @p may run on
 * @p: task to modify
 * @new_mask: new allowed-CPU mask
 *
 * If the task's current CPU is not in @new_mask, a migration request is
 * queued, the runqueue's migration thread is woken, and this function
 * waits for the request to complete.
 *
 * Returns 0 on success, or -EINVAL if @new_mask contains no online CPU or
 * if @p is a PF_THREAD_BOUND task other than current and the mask would
 * actually change.
 */
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
	struct migration_req req;
	unsigned long flags;
	struct rq *rq;
	int ret = 0;

	rq = task_rq_lock(p, &flags);
	if (!cpumask_intersects(new_mask, cpu_online_mask)) {
		ret = -EINVAL;
		goto out;
	}

	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
		     !cpumask_equal(&p->cpus_allowed, new_mask))) {
		ret = -EINVAL;
		goto out;
	}

	/* Let the scheduling class apply the mask if it has a hook. */
	if (p->sched_class->set_cpus_allowed)
		p->sched_class->set_cpus_allowed(p, new_mask);
	else {
		cpumask_copy(&p->cpus_allowed, new_mask);
		p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
	}

	/* Nothing more to do if the task already sits on an allowed CPU. */
	if (cpumask_test_cpu(task_cpu(p), new_mask))
		goto out;

	if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
		/* Drop the rq lock before waking the migration thread. */
		task_rq_unlock(rq, &flags);
		wake_up_process(rq->migration_thread);
		wait_for_completion(&req.done);
		tlb_migrate_finish(p->mm);
		return 0;
	}
out:
	task_rq_unlock(rq, &flags);

	return ret;
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
/*
 * Move task @p from @src_cpu to @dest_cpu.
 *
 * Returns 1 if the task was moved or is no longer on @src_cpu, and 0 if
 * @dest_cpu is not active or not in the task's allowed mask.
 *
 * Both runqueue locks are taken with double_rq_lock(); the callers in
 * this file invoke this with interrupts disabled.
 */
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
	struct rq *rq_dest, *rq_src;
	int ret = 0, on_rq;

	if (unlikely(!cpu_active(dest_cpu)))
		return ret;

	rq_src = cpu_rq(src_cpu);
	rq_dest = cpu_rq(dest_cpu);

	double_rq_lock(rq_src, rq_dest);
	/* The task has already left src_cpu: counts as success. */
	if (task_cpu(p) != src_cpu)
		goto done;
	/* The allowed mask no longer includes dest_cpu: report failure. */
	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
		goto fail;

	on_rq = p->se.on_rq;
	if (on_rq)
		deactivate_task(rq_src, p, 0);

	set_task_cpu(p, dest_cpu);
	if (on_rq) {
		activate_task(rq_dest, p, 0);
		check_preempt_curr(rq_dest, p, 0);
	}
done:
	ret = 1;
fail:
	double_rq_unlock(rq_src, rq_dest);
	return ret;
}
7053
7054
7055
7056
7057
7058
/*
 * Per-CPU migration kthread.
 * @data: the CPU number this thread serves, cast to a pointer
 *
 * It loops until told to stop or until its CPU goes offline.  Each pass
 * runs active load balancing if requested, then takes one request off the
 * runqueue's migration_queue, performs the migration and completes the
 * request.  With an empty queue it sleeps in TASK_INTERRUPTIBLE.
 */
static int migration_thread(void *data)
{
	int cpu = (long)data;
	struct rq *rq;

	rq = cpu_rq(cpu);
	BUG_ON(rq->migration_thread != current);

	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		struct migration_req *req;
		struct list_head *head;

		spin_lock_irq(&rq->lock);

		if (cpu_is_offline(cpu)) {
			spin_unlock_irq(&rq->lock);
			break;
		}

		if (rq->active_balance) {
			active_load_balance(rq, cpu);
			rq->active_balance = 0;
		}

		head = &rq->migration_queue;

		if (list_empty(head)) {
			spin_unlock_irq(&rq->lock);
			schedule();
			set_current_state(TASK_INTERRUPTIBLE);
			continue;
		}
		req = list_entry(head->next, struct migration_req, list);
		list_del_init(head->next);

		/*
		 * Only the lock is dropped here; interrupts stay disabled
		 * across __migrate_task() and are re-enabled afterwards.
		 */
		spin_unlock(&rq->lock);
		__migrate_task(req->task, cpu, req->dest_cpu);
		local_irq_enable();

		complete(&req->done);
	}
	__set_current_state(TASK_RUNNING);

	return 0;
}
7105
7106#ifdef CONFIG_HOTPLUG_CPU
7107
/*
 * Run __migrate_task() with local interrupts disabled and re-enable them
 * afterwards.  Returns __migrate_task()'s result.
 */
static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
{
	int moved;

	local_irq_disable();
	moved = __migrate_task(p, src_cpu, dest_cpu);
	local_irq_enable();

	return moved;
}
7117
7118
7119
7120
/*
 * Find a destination CPU for @p, which is being moved off @dead_cpu, and
 * migrate it there.  The whole selection is retried until
 * __migrate_task_irq() reports success.
 */
static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
	int dest_cpu;
	const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));

again:
	/* First choice: an allowed, online CPU on the dead CPU's node. */
	for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
		if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
			goto move;

	/* Second choice: any allowed, online CPU. */
	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
	if (dest_cpu < nr_cpu_ids)
		goto move;

	/*
	 * No allowed CPU is online: reset the task's mask via the cpuset
	 * code and pick again.  (This test is always true when reached,
	 * since the branch above already handled the other case.)
	 */
	if (dest_cpu >= nr_cpu_ids) {
		cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
		dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);

		/*
		 * The message is rate-limited and printed only for tasks
		 * that have an mm.
		 */
		if (p->mm && printk_ratelimit()) {
			printk(KERN_INFO "process %d (%s) no "
			       "longer affine to cpu%d\n",
			       task_pid_nr(p), p->comm, dead_cpu);
		}
	}

move:
	/* A failed migration restarts the selection from the top. */
	if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
		goto again;
}
7159
7160
7161
7162
7163
7164
7165
7166
7167static void migrate_nr_uninterruptible(struct rq *rq_src)
7168{
7169 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
7170 unsigned long flags;
7171
7172 local_irq_save(flags);
7173 double_rq_lock(rq_src, rq_dest);
7174 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
7175 rq_src->nr_uninterruptible = 0;
7176 double_rq_unlock(rq_src, rq_dest);
7177 local_irq_restore(flags);
7178}
7179
7180
/*
 * Walk every thread in the system under tasklist_lock and move those
 * whose CPU is @src_cpu off it.  The calling task itself is skipped.
 */
static void migrate_live_tasks(int src_cpu)
{
	struct task_struct *p, *t;

	read_lock(&tasklist_lock);

	do_each_thread(t, p) {
		if (p == current)
			continue;

		if (task_cpu(p) == src_cpu)
			move_task_off_dead_cpu(src_cpu, p);
	} while_each_thread(t, p);

	read_unlock(&tasklist_lock);
}
7197
7198
7199
7200
7201
7202
/*
 * Make this CPU's idle task runnable as a SCHED_FIFO task at priority
 * MAX_RT_PRIO-1.  Must be called on a CPU that is already marked offline.
 */
void sched_idle_next(void)
{
	int this_cpu = smp_processor_id();
	struct rq *rq = cpu_rq(this_cpu);
	struct task_struct *p = rq->idle;
	unsigned long flags;

	/* The CPU must no longer be online at this point. */
	BUG_ON(cpu_online(this_cpu));

	spin_lock_irqsave(&rq->lock, flags);

	__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);

	update_rq_clock(rq);
	activate_task(rq, p, 0);

	spin_unlock_irqrestore(&rq->lock, flags);
}
7226
7227
7228
7229
7230
7231void idle_task_exit(void)
7232{
7233 struct mm_struct *mm = current->active_mm;
7234
7235 BUG_ON(cpu_online(smp_processor_id()));
7236
7237 if (mm != &init_mm)
7238 switch_mm(mm, &init_mm, current);
7239 mmdrop(mm);
7240}
7241
7242
/*
 * Move an exiting task @p off @dead_cpu.
 *
 * Entered and left with the dead CPU's rq->lock held (irqs off); the lock
 * is dropped around the actual move.
 */
static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
{
	struct rq *rq = cpu_rq(dead_cpu);

	/* Only tasks that have an exit_state may be passed in here. */
	BUG_ON(!p->exit_state);

	/* A task already in TASK_DEAD state is not expected here. */
	BUG_ON(p->state == TASK_DEAD);

	/* Hold a reference so @p stays valid while rq->lock is dropped. */
	get_task_struct(p);

	/*
	 * The runqueue lock is released for the duration of the move and
	 * re-acquired before returning to the caller.
	 */
	spin_unlock_irq(&rq->lock);
	move_task_off_dead_cpu(dead_cpu, p);
	spin_lock_irq(&rq->lock);

	put_task_struct(p);
}
7266
7267
7268static void migrate_dead_tasks(unsigned int dead_cpu)
7269{
7270 struct rq *rq = cpu_rq(dead_cpu);
7271 struct task_struct *next;
7272
7273 for ( ; ; ) {
7274 if (!rq->nr_running)
7275 break;
7276 update_rq_clock(rq);
7277 next = pick_next_task(rq);
7278 if (!next)
7279 break;
7280 next->sched_class->put_prev_task(rq, next);
7281 migrate_dead(dead_cpu, next);
7282
7283 }
7284}
7285
7286
7287
7288
/*
 * Subtract @rq's calc_load_active contribution from the global
 * calc_load_tasks counter and reset the per-rq value to zero.
 */
static void calc_global_load_remove(struct rq *rq)
{
	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
	rq->calc_load_active = 0;
}
7294#endif
7295
7296#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
7297
/*
 * The "sched_domain" sysctl directory.  Its .child pointer is filled in
 * by register_sched_domain_sysctl() and cleared again on unregister.
 */
static struct ctl_table sd_ctl_dir[] = {
	{
		.procname	= "sched_domain",
		.mode		= 0555,
	},
	{0, },
};
7305
/* Root table handed to register_sysctl_table(): "kernel" -> sd_ctl_dir. */
static struct ctl_table sd_ctl_root[] = {
	{
		.ctl_name	= CTL_KERN,
		.procname	= "kernel",
		.mode		= 0555,
		.child		= sd_ctl_dir,
	},
	{0, },
};
7315
7316static struct ctl_table *sd_alloc_ctl_entry(int n)
7317{
7318 struct ctl_table *entry =
7319 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
7320
7321 return entry;
7322}
7323
7324static void sd_free_ctl_entry(struct ctl_table **tablep)
7325{
7326 struct ctl_table *entry;
7327
7328
7329
7330
7331
7332
7333
7334 for (entry = *tablep; entry->mode; entry++) {
7335 if (entry->child)
7336 sd_free_ctl_entry(&entry->child);
7337 if (entry->proc_handler == NULL)
7338 kfree(entry->procname);
7339 }
7340
7341 kfree(*tablep);
7342 *tablep = NULL;
7343}
7344
7345static void
7346set_table_entry(struct ctl_table *entry,
7347 const char *procname, void *data, int maxlen,
7348 mode_t mode, proc_handler *proc_handler)
7349{
7350 entry->procname = procname;
7351 entry->data = data;
7352 entry->maxlen = maxlen;
7353 entry->mode = mode;
7354 entry->proc_handler = proc_handler;
7355}
7356
7357static struct ctl_table *
7358sd_alloc_ctl_domain_table(struct sched_domain *sd)
7359{
7360 struct ctl_table *table = sd_alloc_ctl_entry(13);
7361
7362 if (table == NULL)
7363 return NULL;
7364
7365 set_table_entry(&table[0], "min_interval", &sd->min_interval,
7366 sizeof(long), 0644, proc_doulongvec_minmax);
7367 set_table_entry(&table[1], "max_interval", &sd->max_interval,
7368 sizeof(long), 0644, proc_doulongvec_minmax);
7369 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
7370 sizeof(int), 0644, proc_dointvec_minmax);
7371 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
7372 sizeof(int), 0644, proc_dointvec_minmax);
7373 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
7374 sizeof(int), 0644, proc_dointvec_minmax);
7375 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
7376 sizeof(int), 0644, proc_dointvec_minmax);
7377 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
7378 sizeof(int), 0644, proc_dointvec_minmax);
7379 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
7380 sizeof(int), 0644, proc_dointvec_minmax);
7381 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
7382 sizeof(int), 0644, proc_dointvec_minmax);
7383 set_table_entry(&table[9], "cache_nice_tries",
7384 &sd->cache_nice_tries,
7385 sizeof(int), 0644, proc_dointvec_minmax);
7386 set_table_entry(&table[10], "flags", &sd->flags,
7387 sizeof(int), 0644, proc_dointvec_minmax);
7388 set_table_entry(&table[11], "name", sd->name,
7389 CORENAME_MAX_SIZE, 0444, proc_dostring);
7390
7391
7392 return table;
7393}
7394
7395static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
7396{
7397 struct ctl_table *entry, *table;
7398 struct sched_domain *sd;
7399 int domain_num = 0, i;
7400 char buf[32];
7401
7402 for_each_domain(cpu, sd)
7403 domain_num++;
7404 entry = table = sd_alloc_ctl_entry(domain_num + 1);
7405 if (table == NULL)
7406 return NULL;
7407
7408 i = 0;
7409 for_each_domain(cpu, sd) {
7410 snprintf(buf, 32, "domain%d", i);
7411 entry->procname = kstrdup(buf, GFP_KERNEL);
7412 entry->mode = 0555;
7413 entry->child = sd_alloc_ctl_domain_table(sd);
7414 entry++;
7415 i++;
7416 }
7417 return table;
7418}
7419
7420static struct ctl_table_header *sd_sysctl_header;
7421static void register_sched_domain_sysctl(void)
7422{
7423 int i, cpu_num = num_online_cpus();
7424 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
7425 char buf[32];
7426
7427 WARN_ON(sd_ctl_dir[0].child);
7428 sd_ctl_dir[0].child = entry;
7429
7430 if (entry == NULL)
7431 return;
7432
7433 for_each_online_cpu(i) {
7434 snprintf(buf, 32, "cpu%d", i);
7435 entry->procname = kstrdup(buf, GFP_KERNEL);
7436 entry->mode = 0555;
7437 entry->child = sd_alloc_ctl_cpu_table(i);
7438 entry++;
7439 }
7440
7441 WARN_ON(sd_sysctl_header);
7442 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
7443}
7444
7445
7446static void unregister_sched_domain_sysctl(void)
7447{
7448 if (sd_sysctl_header)
7449 unregister_sysctl_table(sd_sysctl_header);
7450 sd_sysctl_header = NULL;
7451 if (sd_ctl_dir[0].child)
7452 sd_free_ctl_entry(&sd_ctl_dir[0].child);
7453}
7454#else
/*
 * No-op stubs used when CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are not both
 * enabled.
 */
static void register_sched_domain_sysctl(void)
{
}
static void unregister_sched_domain_sysctl(void)
{
}
7461#endif
7462
7463static void set_rq_online(struct rq *rq)
7464{
7465 if (!rq->online) {
7466 const struct sched_class *class;
7467
7468 cpumask_set_cpu(rq->cpu, rq->rd->online);
7469 rq->online = 1;
7470
7471 for_each_class(class) {
7472 if (class->rq_online)
7473 class->rq_online(rq);
7474 }
7475 }
7476}
7477
7478static void set_rq_offline(struct rq *rq)
7479{
7480 if (rq->online) {
7481 const struct sched_class *class;
7482
7483 for_each_class(class) {
7484 if (class->rq_offline)
7485 class->rq_offline(rq);
7486 }
7487
7488 cpumask_clear_cpu(rq->cpu, rq->rd->online);
7489 rq->online = 0;
7490 }
7491}
7492
7493
7494
7495
7496
/*
 * CPU hotplug notifier for the scheduler's migration machinery.
 * @nfb: notifier block (unused)
 * @action: hotplug event
 * @hcpu: CPU number the event refers to, cast to a pointer
 *
 * Returns NOTIFY_BAD if the migration thread cannot be created on
 * CPU_UP_PREPARE, NOTIFY_OK in every other case.
 */
static int __cpuinit
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	struct task_struct *p;
	int cpu = (long)hcpu;
	unsigned long flags;
	struct rq *rq;

	switch (action) {

	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		/* Create the per-CPU migration thread and bind it to @cpu. */
		p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
		if (IS_ERR(p))
			return NOTIFY_BAD;
		kthread_bind(p, cpu);
		/* Run it as SCHED_FIFO at priority MAX_RT_PRIO-1. */
		rq = task_rq_lock(p, &flags);
		__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
		task_rq_unlock(rq, &flags);
		/* Reference dropped again in CPU_UP_CANCELED / CPU_DEAD. */
		get_task_struct(p);
		cpu_rq(cpu)->migration_thread = p;
		rq->calc_load_update = calc_load_update;
		break;

	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		/* Start the migration thread created during UP_PREPARE. */
		wake_up_process(cpu_rq(cpu)->migration_thread);

		/* Mark the runqueue online within its root domain, if any. */
		rq = cpu_rq(cpu);
		spin_lock_irqsave(&rq->lock, flags);
		if (rq->rd) {
			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));

			set_rq_online(rq);
		}
		spin_unlock_irqrestore(&rq->lock, flags);
		break;

#ifdef CONFIG_HOTPLUG_CPU
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		if (!cpu_rq(cpu)->migration_thread)
			break;
		/* Rebind the thread to some online CPU before stopping it. */
		kthread_bind(cpu_rq(cpu)->migration_thread,
			     cpumask_any(cpu_online_mask));
		kthread_stop(cpu_rq(cpu)->migration_thread);
		put_task_struct(cpu_rq(cpu)->migration_thread);
		cpu_rq(cpu)->migration_thread = NULL;
		break;

	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		/* Move live tasks away with the cpuset lock held. */
		cpuset_lock();
		migrate_live_tasks(cpu);
		rq = cpu_rq(cpu);
		kthread_stop(rq->migration_thread);
		put_task_struct(rq->migration_thread);
		rq->migration_thread = NULL;
		/*
		 * Take the idle task off the runqueue, put it back into the
		 * idle class, then migrate whatever is still queued.
		 */
		spin_lock_irq(&rq->lock);
		update_rq_clock(rq);
		deactivate_task(rq, rq->idle, 0);
		rq->idle->static_prio = MAX_PRIO;
		__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
		rq->idle->sched_class = &idle_sched_class;
		migrate_dead_tasks(cpu);
		spin_unlock_irq(&rq->lock);
		cpuset_unlock();
		migrate_nr_uninterruptible(rq);
		BUG_ON(rq->nr_running != 0);
		calc_global_load_remove(rq);

		/*
		 * Complete any migration requests still queued on this
		 * runqueue; the migration thread has been stopped, so
		 * nothing else will service them.  The lock is dropped
		 * around each complete() call.
		 */
		spin_lock_irq(&rq->lock);
		while (!list_empty(&rq->migration_queue)) {
			struct migration_req *req;

			req = list_entry(rq->migration_queue.next,
					 struct migration_req, list);
			list_del_init(&req->list);
			spin_unlock_irq(&rq->lock);
			complete(&req->done);
			spin_lock_irq(&rq->lock);
		}
		spin_unlock_irq(&rq->lock);
		break;

	case CPU_DYING:
	case CPU_DYING_FROZEN:
		/* Mark the runqueue offline within its root domain, if any. */
		rq = cpu_rq(cpu);
		spin_lock_irqsave(&rq->lock, flags);
		if (rq->rd) {
			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
			set_rq_offline(rq);
		}
		spin_unlock_irqrestore(&rq->lock, flags);
		break;
#endif
	}
	return NOTIFY_OK;
}
7606
7607
7608
7609
7610
7611
/* Hotplug notifier block for migration_call(), registered with priority 10. */
static struct notifier_block __cpuinitdata migration_notifier = {
	.notifier_call = migration_call,
	.priority = 10
};
7616
7617static int __init migration_init(void)
7618{
7619 void *cpu = (void *)(long)smp_processor_id();
7620 int err;
7621
7622
7623 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
7624 BUG_ON(err == NOTIFY_BAD);
7625 migration_call(&migration_notifier, CPU_ONLINE, cpu);
7626 register_cpu_notifier(&migration_notifier);
7627
7628 return err;
7629}
7630early_initcall(migration_init);
7631#endif
7632
7633#ifdef CONFIG_SMP
7634
7635#ifdef CONFIG_SCHED_DEBUG
7636
7637static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7638 struct cpumask *groupmask)
7639{
7640 struct sched_group *group = sd->groups;
7641 char str[256];
7642
7643 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
7644 cpumask_clear(groupmask);
7645
7646 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
7647
7648 if (!(sd->flags & SD_LOAD_BALANCE)) {
7649 printk("does not load-balance\n");
7650 if (sd->parent)
7651 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
7652 " has parent");
7653 return -1;
7654 }
7655
7656 printk(KERN_CONT "span %s level %s\n", str, sd->name);
7657
7658 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
7659 printk(KERN_ERR "ERROR: domain->span does not contain "
7660 "CPU%d\n", cpu);
7661 }
7662 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
7663 printk(KERN_ERR "ERROR: domain->groups does not contain"
7664 " CPU%d\n", cpu);
7665 }
7666
7667 printk(KERN_DEBUG "%*s groups:", level + 1, "");
7668 do {
7669 if (!group) {
7670 printk("\n");
7671 printk(KERN_ERR "ERROR: group is NULL\n");
7672 break;
7673 }
7674
7675 if (!group->__cpu_power) {
7676 printk(KERN_CONT "\n");
7677 printk(KERN_ERR "ERROR: domain->cpu_power not "
7678 "set\n");
7679 break;
7680 }
7681
7682 if (!cpumask_weight(sched_group_cpus(group))) {
7683 printk(KERN_CONT "\n");
7684 printk(KERN_ERR "ERROR: empty group\n");
7685 break;
7686 }
7687
7688 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
7689 printk(KERN_CONT "\n");
7690 printk(KERN_ERR "ERROR: repeated CPUs\n");
7691 break;
7692 }
7693
7694 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
7695
7696 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7697
7698 printk(KERN_CONT " %s", str);
7699 if (group->__cpu_power != SCHED_LOAD_SCALE) {
7700 printk(KERN_CONT " (__cpu_power = %d)",
7701 group->__cpu_power);
7702 }
7703
7704 group = group->next;
7705 } while (group != sd->groups);
7706 printk(KERN_CONT "\n");
7707
7708 if (!cpumask_equal(sched_domain_span(sd), groupmask))
7709 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
7710
7711 if (sd->parent &&
7712 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
7713 printk(KERN_ERR "ERROR: parent span is not a superset "
7714 "of domain->span\n");
7715 return 0;
7716}
7717
7718static void sched_domain_debug(struct sched_domain *sd, int cpu)
7719{
7720 cpumask_var_t groupmask;
7721 int level = 0;
7722
7723 if (!sd) {
7724 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7725 return;
7726 }
7727
7728 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
7729
7730 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
7731 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
7732 return;
7733 }
7734
7735 for (;;) {
7736 if (sched_domain_debug_one(sd, cpu, level, groupmask))
7737 break;
7738 level++;
7739 sd = sd->parent;
7740 if (!sd)
7741 break;
7742 }
7743 free_cpumask_var(groupmask);
7744}
7745#else
7746# define sched_domain_debug(sd, cpu) do { } while (0)
7747#endif
7748
7749static int sd_degenerate(struct sched_domain *sd)
7750{
7751 if (cpumask_weight(sched_domain_span(sd)) == 1)
7752 return 1;
7753
7754
7755 if (sd->flags & (SD_LOAD_BALANCE |
7756 SD_BALANCE_NEWIDLE |
7757 SD_BALANCE_FORK |
7758 SD_BALANCE_EXEC |
7759 SD_SHARE_CPUPOWER |
7760 SD_SHARE_PKG_RESOURCES)) {
7761 if (sd->groups != sd->groups->next)
7762 return 0;
7763 }
7764
7765
7766 if (sd->flags & (SD_WAKE_IDLE |
7767 SD_WAKE_AFFINE |
7768 SD_WAKE_BALANCE))
7769 return 0;
7770
7771 return 1;
7772}
7773
7774static int
7775sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7776{
7777 unsigned long cflags = sd->flags, pflags = parent->flags;
7778
7779 if (sd_degenerate(parent))
7780 return 1;
7781
7782 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
7783 return 0;
7784
7785
7786
7787 if (cflags & SD_WAKE_AFFINE)
7788 pflags &= ~SD_WAKE_BALANCE;
7789
7790 if (parent->groups == parent->groups->next) {
7791 pflags &= ~(SD_LOAD_BALANCE |
7792 SD_BALANCE_NEWIDLE |
7793 SD_BALANCE_FORK |
7794 SD_BALANCE_EXEC |
7795 SD_SHARE_CPUPOWER |
7796 SD_SHARE_PKG_RESOURCES);
7797 if (nr_node_ids == 1)
7798 pflags &= ~SD_SERIALIZE;
7799 }
7800 if (~cflags & pflags)
7801 return 0;
7802
7803 return 1;
7804}
7805
/*
 * Release everything owned by root domain @rd (cpupri state and the three
 * cpumasks) and then the structure itself.
 */
static void free_rootdomain(struct root_domain *rd)
{
	cpupri_cleanup(&rd->cpupri);

	free_cpumask_var(rd->rto_mask);
	free_cpumask_var(rd->online);
	free_cpumask_var(rd->span);
	kfree(rd);
}
7815
/*
 * Attach runqueue @rq to root domain @rd, detaching it from its previous
 * root domain first.  The old root domain is freed if this was its last
 * reference.
 */
static void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
	struct root_domain *old_rd = NULL;
	unsigned long flags;

	spin_lock_irqsave(&rq->lock, flags);

	if (rq->rd) {
		old_rd = rq->rd;

		if (cpumask_test_cpu(rq->cpu, old_rd->online))
			set_rq_offline(rq);

		cpumask_clear_cpu(rq->cpu, old_rd->span);

		/*
		 * Keep old_rd only if this drops the last reference; it is
		 * then freed below, after the rq lock has been released.
		 */
		if (!atomic_dec_and_test(&old_rd->refcount))
			old_rd = NULL;
	}

	atomic_inc(&rd->refcount);
	rq->rd = rd;

	cpumask_set_cpu(rq->cpu, rd->span);
	if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
		set_rq_online(rq);

	spin_unlock_irqrestore(&rq->lock, flags);

	if (old_rd)
		free_rootdomain(old_rd);
}
7852
7853static int init_rootdomain(struct root_domain *rd, bool bootmem)
7854{
7855 gfp_t gfp = GFP_KERNEL;
7856
7857 memset(rd, 0, sizeof(*rd));
7858
7859 if (bootmem)
7860 gfp = GFP_NOWAIT;
7861
7862 if (!alloc_cpumask_var(&rd->span, gfp))
7863 goto out;
7864 if (!alloc_cpumask_var(&rd->online, gfp))
7865 goto free_span;
7866 if (!alloc_cpumask_var(&rd->rto_mask, gfp))
7867 goto free_online;
7868
7869 if (cpupri_init(&rd->cpupri, bootmem) != 0)
7870 goto free_rto_mask;
7871 return 0;
7872
7873free_rto_mask:
7874 free_cpumask_var(rd->rto_mask);
7875free_online:
7876 free_cpumask_var(rd->online);
7877free_span:
7878 free_cpumask_var(rd->span);
7879out:
7880 return -ENOMEM;
7881}
7882
/*
 * Initialise the default root domain using boot-time allocation and give
 * it an initial reference count of 1.  The return value of
 * init_rootdomain() is not checked here.
 */
static void init_defrootdomain(void)
{
	init_rootdomain(&def_root_domain, true);

	atomic_set(&def_root_domain.refcount, 1);
}
7889
7890static struct root_domain *alloc_rootdomain(void)
7891{
7892 struct root_domain *rd;
7893
7894 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
7895 if (!rd)
7896 return NULL;
7897
7898 if (init_rootdomain(rd, false) != 0) {
7899 kfree(rd);
7900 return NULL;
7901 }
7902
7903 return rd;
7904}
7905
7906
7907
7908
7909
/*
 * Attach the domain hierarchy starting at @sd, and root domain @rd, to
 * @cpu's runqueue.  Degenerate domains are unlinked from the hierarchy
 * before it is published.
 */
static void
cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct sched_domain *tmp;

	/* Unlink every parent that adds nothing over its child. */
	for (tmp = sd; tmp; ) {
		struct sched_domain *parent = tmp->parent;
		if (!parent)
			break;

		if (sd_parent_degenerate(tmp, parent)) {
			tmp->parent = parent->parent;
			if (parent->parent)
				parent->parent->child = tmp;
		} else
			tmp = tmp->parent;
	}

	/* Skip a degenerate base domain as well. */
	if (sd && sd_degenerate(sd)) {
		sd = sd->parent;
		if (sd)
			sd->child = NULL;
	}

	sched_domain_debug(sd, cpu);

	rq_attach_root(rq, rd);
	/* Publish the new hierarchy through rcu_assign_pointer(). */
	rcu_assign_pointer(rq->sd, sd);
}
7941
7942
/* CPUs named on the "isolcpus=" boot command-line option. */
static cpumask_var_t cpu_isolated_map;

/*
 * Parse the CPU list given to "isolcpus=" into cpu_isolated_map.
 * Always returns 1; the result of cpulist_parse() is not checked.
 * NOTE(review): assumes cpu_isolated_map is usable when boot parameters
 * are parsed - its allocation is not visible here; confirm.
 */
static int __init isolated_cpu_setup(char *str)
{
	cpulist_parse(str, cpu_isolated_map);
	return 1;
}

__setup("isolcpus=", isolated_cpu_setup);
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
/*
 * Build a circular list of sched groups covering the CPUs in @span.
 * @span: CPUs to partition into groups
 * @cpu_map: passed through unchanged to @group_fn
 * @group_fn: returns the group id of a CPU and, if its @sg argument is
 *            non-NULL, stores the matching sched_group there
 * @covered: scratch mask tracking CPUs already placed in a group
 * @tmpmask: scratch mask passed through to @group_fn
 *
 * Each group's CPU mask is rebuilt from scratch and its __cpu_power is
 * reset to 0.
 */
static void
init_sched_build_groups(const struct cpumask *span,
			const struct cpumask *cpu_map,
			int (*group_fn)(int cpu, const struct cpumask *cpu_map,
					struct sched_group **sg,
					struct cpumask *tmpmask),
			struct cpumask *covered, struct cpumask *tmpmask)
{
	struct sched_group *first = NULL, *last = NULL;
	int i;

	cpumask_clear(covered);

	for_each_cpu(i, span) {
		struct sched_group *sg;
		int group = group_fn(i, cpu_map, &sg, tmpmask);
		int j;

		/* This CPU was already added to an earlier group. */
		if (cpumask_test_cpu(i, covered))
			continue;

		cpumask_clear(sched_group_cpus(sg));
		sg->__cpu_power = 0;

		/* Collect every CPU in the span sharing this group id. */
		for_each_cpu(j, span) {
			if (group_fn(j, cpu_map, NULL, tmpmask) != group)
				continue;

			cpumask_set_cpu(j, covered);
			cpumask_set_cpu(j, sched_group_cpus(sg));
		}
		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	/* Close the ring.  NOTE(review): assumes @span is not empty. */
	last->next = first;
}
8003
/*
 * Maximum number of nodes sched_domain_node_span() merges into one span;
 * __build_sched_domains() also uses it to decide whether an all-nodes level
 * is needed.
 */
#define SD_NODES_PER_DOMAIN 16
8005
8006#ifdef CONFIG_NUMA
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018static int find_next_best_node(int node, nodemask_t *used_nodes)
8019{
8020 int i, n, val, min_val, best_node = 0;
8021
8022 min_val = INT_MAX;
8023
8024 for (i = 0; i < nr_node_ids; i++) {
8025
8026 n = (node + i) % nr_node_ids;
8027
8028 if (!nr_cpus_node(n))
8029 continue;
8030
8031
8032 if (node_isset(n, *used_nodes))
8033 continue;
8034
8035
8036 val = node_distance(node, n);
8037
8038 if (val < min_val) {
8039 min_val = val;
8040 best_node = n;
8041 }
8042 }
8043
8044 node_set(best_node, *used_nodes);
8045 return best_node;
8046}
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057static void sched_domain_node_span(int node, struct cpumask *span)
8058{
8059 nodemask_t used_nodes;
8060 int i;
8061
8062 cpumask_clear(span);
8063 nodes_clear(used_nodes);
8064
8065 cpumask_or(span, span, cpumask_of_node(node));
8066 node_set(node, used_nodes);
8067
8068 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
8069 int next_node = find_next_best_node(node, &used_nodes);
8070
8071 cpumask_or(span, span, cpumask_of_node(next_node));
8072 }
8073}
8074#endif
8075
/* Power-savings balance levels; both start at 0. */
int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
8077
8078
8079
8080
8081
8082
8083
/*
 * Statically sized wrappers used for the per-cpu domain and group storage:
 * each places a CONFIG_NR_CPUS-bit bitmap directly after the embedded
 * structure.
 * NOTE(review): presumably sched_group_cpus()/sched_domain_span() resolve to
 * this trailing bitmap — confirm against their definitions.
 */
struct static_sched_group {
	struct sched_group sg;
	DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
};

struct static_sched_domain {
	struct sched_domain sd;
	DECLARE_BITMAP(span, CONFIG_NR_CPUS);
};
8093
8094
8095
8096
8097#ifdef CONFIG_SCHED_SMT
8098static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
8099static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
8100
8101static int
8102cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
8103 struct sched_group **sg, struct cpumask *unused)
8104{
8105 if (sg)
8106 *sg = &per_cpu(sched_group_cpus, cpu).sg;
8107 return cpu;
8108}
8109#endif
8110
8111
8112
8113
8114#ifdef CONFIG_SCHED_MC
/* Per-cpu storage for the multi-core (MC) level domains and their groups. */
static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
8117#endif
8118
8119#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
8120static int
8121cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
8122 struct sched_group **sg, struct cpumask *mask)
8123{
8124 int group;
8125
8126 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
8127 group = cpumask_first(mask);
8128 if (sg)
8129 *sg = &per_cpu(sched_group_core, group).sg;
8130 return group;
8131}
8132#elif defined(CONFIG_SCHED_MC)
8133static int
8134cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
8135 struct sched_group **sg, struct cpumask *unused)
8136{
8137 if (sg)
8138 *sg = &per_cpu(sched_group_core, cpu).sg;
8139 return cpu;
8140}
8141#endif
8142
8143static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
8144static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
8145
8146static int
8147cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
8148 struct sched_group **sg, struct cpumask *mask)
8149{
8150 int group;
8151#ifdef CONFIG_SCHED_MC
8152 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
8153 group = cpumask_first(mask);
8154#elif defined(CONFIG_SCHED_SMT)
8155 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
8156 group = cpumask_first(mask);
8157#else
8158 group = cpu;
8159#endif
8160 if (sg)
8161 *sg = &per_cpu(sched_group_phys, group).sg;
8162 return group;
8163}
8164
8165#ifdef CONFIG_NUMA
8166
8167
8168
8169
8170
8171static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
8172static struct sched_group ***sched_group_nodes_bycpu;
8173
8174static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
8175static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
8176
8177static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
8178 struct sched_group **sg,
8179 struct cpumask *nodemask)
8180{
8181 int group;
8182
8183 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
8184 group = cpumask_first(nodemask);
8185
8186 if (sg)
8187 *sg = &per_cpu(sched_group_allnodes, group).sg;
8188 return group;
8189}
8190
8191static void init_numa_sched_groups_power(struct sched_group *group_head)
8192{
8193 struct sched_group *sg = group_head;
8194 int j;
8195
8196 if (!sg)
8197 return;
8198 do {
8199 for_each_cpu(j, sched_group_cpus(sg)) {
8200 struct sched_domain *sd;
8201
8202 sd = &per_cpu(phys_domains, j).sd;
8203 if (j != group_first_cpu(sd->groups)) {
8204
8205
8206
8207
8208 continue;
8209 }
8210
8211 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
8212 }
8213 sg = sg->next;
8214 } while (sg != group_head);
8215}
8216#endif
8217
8218#ifdef CONFIG_NUMA
8219
8220static void free_sched_groups(const struct cpumask *cpu_map,
8221 struct cpumask *nodemask)
8222{
8223 int cpu, i;
8224
8225 for_each_cpu(cpu, cpu_map) {
8226 struct sched_group **sched_group_nodes
8227 = sched_group_nodes_bycpu[cpu];
8228
8229 if (!sched_group_nodes)
8230 continue;
8231
8232 for (i = 0; i < nr_node_ids; i++) {
8233 struct sched_group *oldsg, *sg = sched_group_nodes[i];
8234
8235 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
8236 if (cpumask_empty(nodemask))
8237 continue;
8238
8239 if (sg == NULL)
8240 continue;
8241 sg = sg->next;
8242next_sg:
8243 oldsg = sg;
8244 sg = sg->next;
8245 kfree(oldsg);
8246 if (oldsg != sched_group_nodes[i])
8247 goto next_sg;
8248 }
8249 kfree(sched_group_nodes);
8250 sched_group_nodes_bycpu[cpu] = NULL;
8251 }
8252}
8253#else
/* Non-NUMA build: this variant releases nothing. */
static void free_sched_groups(const struct cpumask *cpu_map,
			      struct cpumask *nodemask)
{
}
8258#endif
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8275{
8276 struct sched_domain *child;
8277 struct sched_group *group;
8278
8279 WARN_ON(!sd || !sd->groups);
8280
8281 if (cpu != group_first_cpu(sd->groups))
8282 return;
8283
8284 child = sd->child;
8285
8286 sd->groups->__cpu_power = 0;
8287
8288
8289
8290
8291
8292
8293
8294
8295 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
8296 (child->flags &
8297 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
8298 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
8299 return;
8300 }
8301
8302
8303
8304
8305 group = child->groups;
8306 do {
8307 sg_inc_cpu_power(sd->groups, group->__cpu_power);
8308 group = group->next;
8309 } while (group != child->groups);
8310}
8311
8312
8313
8314
8315
8316
/*
 * Per-level sched_domain initializers.
 *
 * SD_INIT_NAME() records the level name as a string when
 * CONFIG_SCHED_DEBUG is set and expands to nothing otherwise.
 */
#ifdef CONFIG_SCHED_DEBUG
# define SD_INIT_NAME(sd, type)		sd->name = #type
#else
# define SD_INIT_NAME(sd, type)		do { } while (0)
#endif

/* SD_INIT(sd, type) calls the sd_init_<type>() function generated below. */
#define SD_INIT(sd, type)	sd_init_##type(sd)

/*
 * SD_INIT_FUNC(type) generates sd_init_<type>(): zero the domain, copy in
 * the SD_<type>_INIT template, then set its level and (debug only) name.
 */
#define SD_INIT_FUNC(type)	\
static noinline void sd_init_##type(struct sched_domain *sd)	\
{								\
	memset(sd, 0, sizeof(*sd));				\
	*sd = SD_##type##_INIT;					\
	sd->level = SD_LV_##type;				\
	SD_INIT_NAME(sd, type);					\
}

SD_INIT_FUNC(CPU)
#ifdef CONFIG_NUMA
 SD_INIT_FUNC(ALLNODES)
 SD_INIT_FUNC(NODE)
#endif
#ifdef CONFIG_SCHED_SMT
 SD_INIT_FUNC(SIBLING)
#endif
#ifdef CONFIG_SCHED_MC
 SD_INIT_FUNC(MC)
#endif
8345
8346static int default_relax_domain_level = -1;
8347
8348static int __init setup_relax_domain_level(char *str)
8349{
8350 unsigned long val;
8351
8352 val = simple_strtoul(str, NULL, 0);
8353 if (val < SD_LV_MAX)
8354 default_relax_domain_level = val;
8355
8356 return 1;
8357}
8358__setup("relax_domain_level=", setup_relax_domain_level);
8359
8360static void set_domain_attribute(struct sched_domain *sd,
8361 struct sched_domain_attr *attr)
8362{
8363 int request;
8364
8365 if (!attr || attr->relax_domain_level < 0) {
8366 if (default_relax_domain_level < 0)
8367 return;
8368 else
8369 request = default_relax_domain_level;
8370 } else
8371 request = attr->relax_domain_level;
8372 if (request < sd->level) {
8373
8374 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
8375 } else {
8376
8377 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
8378 }
8379}
8380
8381
8382
8383
8384
8385static int __build_sched_domains(const struct cpumask *cpu_map,
8386 struct sched_domain_attr *attr)
8387{
8388 int i, err = -ENOMEM;
8389 struct root_domain *rd;
8390 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
8391 tmpmask;
8392#ifdef CONFIG_NUMA
8393 cpumask_var_t domainspan, covered, notcovered;
8394 struct sched_group **sched_group_nodes = NULL;
8395 int sd_allnodes = 0;
8396
8397 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
8398 goto out;
8399 if (!alloc_cpumask_var(&covered, GFP_KERNEL))
8400 goto free_domainspan;
8401 if (!alloc_cpumask_var(¬covered, GFP_KERNEL))
8402 goto free_covered;
8403#endif
8404
8405 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
8406 goto free_notcovered;
8407 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
8408 goto free_nodemask;
8409 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
8410 goto free_this_sibling_map;
8411 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
8412 goto free_this_core_map;
8413 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
8414 goto free_send_covered;
8415
8416#ifdef CONFIG_NUMA
8417
8418
8419
8420 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
8421 GFP_KERNEL);
8422 if (!sched_group_nodes) {
8423 printk(KERN_WARNING "Can not alloc sched group node list\n");
8424 goto free_tmpmask;
8425 }
8426#endif
8427
8428 rd = alloc_rootdomain();
8429 if (!rd) {
8430 printk(KERN_WARNING "Cannot alloc root domain\n");
8431 goto free_sched_groups;
8432 }
8433
8434#ifdef CONFIG_NUMA
8435 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
8436#endif
8437
8438
8439
8440
8441 for_each_cpu(i, cpu_map) {
8442 struct sched_domain *sd = NULL, *p;
8443
8444 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
8445
8446#ifdef CONFIG_NUMA
8447 if (cpumask_weight(cpu_map) >
8448 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
8449 sd = &per_cpu(allnodes_domains, i).sd;
8450 SD_INIT(sd, ALLNODES);
8451 set_domain_attribute(sd, attr);
8452 cpumask_copy(sched_domain_span(sd), cpu_map);
8453 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
8454 p = sd;
8455 sd_allnodes = 1;
8456 } else
8457 p = NULL;
8458
8459 sd = &per_cpu(node_domains, i).sd;
8460 SD_INIT(sd, NODE);
8461 set_domain_attribute(sd, attr);
8462 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
8463 sd->parent = p;
8464 if (p)
8465 p->child = sd;
8466 cpumask_and(sched_domain_span(sd),
8467 sched_domain_span(sd), cpu_map);
8468#endif
8469
8470 p = sd;
8471 sd = &per_cpu(phys_domains, i).sd;
8472 SD_INIT(sd, CPU);
8473 set_domain_attribute(sd, attr);
8474 cpumask_copy(sched_domain_span(sd), nodemask);
8475 sd->parent = p;
8476 if (p)
8477 p->child = sd;
8478 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
8479
8480#ifdef CONFIG_SCHED_MC
8481 p = sd;
8482 sd = &per_cpu(core_domains, i).sd;
8483 SD_INIT(sd, MC);
8484 set_domain_attribute(sd, attr);
8485 cpumask_and(sched_domain_span(sd), cpu_map,
8486 cpu_coregroup_mask(i));
8487 sd->parent = p;
8488 p->child = sd;
8489 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
8490#endif
8491
8492#ifdef CONFIG_SCHED_SMT
8493 p = sd;
8494 sd = &per_cpu(cpu_domains, i).sd;
8495 SD_INIT(sd, SIBLING);
8496 set_domain_attribute(sd, attr);
8497 cpumask_and(sched_domain_span(sd),
8498 topology_thread_cpumask(i), cpu_map);
8499 sd->parent = p;
8500 p->child = sd;
8501 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
8502#endif
8503 }
8504
8505#ifdef CONFIG_SCHED_SMT
8506
8507 for_each_cpu(i, cpu_map) {
8508 cpumask_and(this_sibling_map,
8509 topology_thread_cpumask(i), cpu_map);
8510 if (i != cpumask_first(this_sibling_map))
8511 continue;
8512
8513 init_sched_build_groups(this_sibling_map, cpu_map,
8514 &cpu_to_cpu_group,
8515 send_covered, tmpmask);
8516 }
8517#endif
8518
8519#ifdef CONFIG_SCHED_MC
8520
8521 for_each_cpu(i, cpu_map) {
8522 cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
8523 if (i != cpumask_first(this_core_map))
8524 continue;
8525
8526 init_sched_build_groups(this_core_map, cpu_map,
8527 &cpu_to_core_group,
8528 send_covered, tmpmask);
8529 }
8530#endif
8531
8532
8533 for (i = 0; i < nr_node_ids; i++) {
8534 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
8535 if (cpumask_empty(nodemask))
8536 continue;
8537
8538 init_sched_build_groups(nodemask, cpu_map,
8539 &cpu_to_phys_group,
8540 send_covered, tmpmask);
8541 }
8542
8543#ifdef CONFIG_NUMA
8544
8545 if (sd_allnodes) {
8546 init_sched_build_groups(cpu_map, cpu_map,
8547 &cpu_to_allnodes_group,
8548 send_covered, tmpmask);
8549 }
8550
8551 for (i = 0; i < nr_node_ids; i++) {
8552
8553 struct sched_group *sg, *prev;
8554 int j;
8555
8556 cpumask_clear(covered);
8557 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
8558 if (cpumask_empty(nodemask)) {
8559 sched_group_nodes[i] = NULL;
8560 continue;
8561 }
8562
8563 sched_domain_node_span(i, domainspan);
8564 cpumask_and(domainspan, domainspan, cpu_map);
8565
8566 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8567 GFP_KERNEL, i);
8568 if (!sg) {
8569 printk(KERN_WARNING "Can not alloc domain group for "
8570 "node %d\n", i);
8571 goto error;
8572 }
8573 sched_group_nodes[i] = sg;
8574 for_each_cpu(j, nodemask) {
8575 struct sched_domain *sd;
8576
8577 sd = &per_cpu(node_domains, j).sd;
8578 sd->groups = sg;
8579 }
8580 sg->__cpu_power = 0;
8581 cpumask_copy(sched_group_cpus(sg), nodemask);
8582 sg->next = sg;
8583 cpumask_or(covered, covered, nodemask);
8584 prev = sg;
8585
8586 for (j = 0; j < nr_node_ids; j++) {
8587 int n = (i + j) % nr_node_ids;
8588
8589 cpumask_complement(notcovered, covered);
8590 cpumask_and(tmpmask, notcovered, cpu_map);
8591 cpumask_and(tmpmask, tmpmask, domainspan);
8592 if (cpumask_empty(tmpmask))
8593 break;
8594
8595 cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
8596 if (cpumask_empty(tmpmask))
8597 continue;
8598
8599 sg = kmalloc_node(sizeof(struct sched_group) +
8600 cpumask_size(),
8601 GFP_KERNEL, i);
8602 if (!sg) {
8603 printk(KERN_WARNING
8604 "Can not alloc domain group for node %d\n", j);
8605 goto error;
8606 }
8607 sg->__cpu_power = 0;
8608 cpumask_copy(sched_group_cpus(sg), tmpmask);
8609 sg->next = prev->next;
8610 cpumask_or(covered, covered, tmpmask);
8611 prev->next = sg;
8612 prev = sg;
8613 }
8614 }
8615#endif
8616
8617
8618#ifdef CONFIG_SCHED_SMT
8619 for_each_cpu(i, cpu_map) {
8620 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
8621
8622 init_sched_groups_power(i, sd);
8623 }
8624#endif
8625#ifdef CONFIG_SCHED_MC
8626 for_each_cpu(i, cpu_map) {
8627 struct sched_domain *sd = &per_cpu(core_domains, i).sd;
8628
8629 init_sched_groups_power(i, sd);
8630 }
8631#endif
8632
8633 for_each_cpu(i, cpu_map) {
8634 struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
8635
8636 init_sched_groups_power(i, sd);
8637 }
8638
8639#ifdef CONFIG_NUMA
8640 for (i = 0; i < nr_node_ids; i++)
8641 init_numa_sched_groups_power(sched_group_nodes[i]);
8642
8643 if (sd_allnodes) {
8644 struct sched_group *sg;
8645
8646 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
8647 tmpmask);
8648 init_numa_sched_groups_power(sg);
8649 }
8650#endif
8651
8652
8653 for_each_cpu(i, cpu_map) {
8654 struct sched_domain *sd;
8655#ifdef CONFIG_SCHED_SMT
8656 sd = &per_cpu(cpu_domains, i).sd;
8657#elif defined(CONFIG_SCHED_MC)
8658 sd = &per_cpu(core_domains, i).sd;
8659#else
8660 sd = &per_cpu(phys_domains, i).sd;
8661#endif
8662 cpu_attach_domain(sd, rd, i);
8663 }
8664
8665 err = 0;
8666
8667free_tmpmask:
8668 free_cpumask_var(tmpmask);
8669free_send_covered:
8670 free_cpumask_var(send_covered);
8671free_this_core_map:
8672 free_cpumask_var(this_core_map);
8673free_this_sibling_map:
8674 free_cpumask_var(this_sibling_map);
8675free_nodemask:
8676 free_cpumask_var(nodemask);
8677free_notcovered:
8678#ifdef CONFIG_NUMA
8679 free_cpumask_var(notcovered);
8680free_covered:
8681 free_cpumask_var(covered);
8682free_domainspan:
8683 free_cpumask_var(domainspan);
8684out:
8685#endif
8686 return err;
8687
8688free_sched_groups:
8689#ifdef CONFIG_NUMA
8690 kfree(sched_group_nodes);
8691#endif
8692 goto free_tmpmask;
8693
8694#ifdef CONFIG_NUMA
8695error:
8696 free_sched_groups(cpu_map, tmpmask);
8697 free_rootdomain(rd);
8698 goto free_tmpmask;
8699#endif
8700}
8701
/* Build sched domains for 'cpu_map' without any domain attributes. */
static int build_sched_domains(const struct cpumask *cpu_map)
{
	return __build_sched_domains(cpu_map, NULL);
}
8706
/* Current partition: array of domain cpumasks, its length and attributes. */
static struct cpumask *doms_cur;	/* current sched domains */
static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
static struct sched_domain_attr *dattr_cur;
				/* attributes of the domains in 'doms_cur' */

/*
 * Single-domain fallback mask: used as 'doms_cur' when the kmalloc in
 * arch_init_sched_domains() fails and when partition_sched_domains() is
 * called with doms_new == NULL.
 */
static cpumask_var_t fallback_doms;
8718
8719
8720
8721
8722
8723
/*
 * Weak default that an architecture may override.  This version always
 * returns 0; partition_sched_domains() treats a non-zero return as "topology
 * changed" and then skips its domain matching.
 */
int __attribute__((weak)) arch_update_cpu_topology(void)
{
	return 0;
}
8728
8729
8730
8731
8732
8733
8734static int arch_init_sched_domains(const struct cpumask *cpu_map)
8735{
8736 int err;
8737
8738 arch_update_cpu_topology();
8739 ndoms_cur = 1;
8740 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
8741 if (!doms_cur)
8742 doms_cur = fallback_doms;
8743 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
8744 dattr_cur = NULL;
8745 err = build_sched_domains(doms_cur);
8746 register_sched_domain_sysctl();
8747
8748 return err;
8749}
8750
/* Release the sched groups of 'cpu_map'; 'tmpmask' is scratch space. */
static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
				       struct cpumask *tmpmask)
{
	free_sched_groups(cpu_map, tmpmask);
}
8756
8757
8758
8759
8760
8761static void detach_destroy_domains(const struct cpumask *cpu_map)
8762{
8763
8764 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
8765 int i;
8766
8767 for_each_cpu(i, cpu_map)
8768 cpu_attach_domain(NULL, &def_root_domain, i);
8769 synchronize_sched();
8770 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
8771}
8772
8773
8774static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8775 struct sched_domain_attr *new, int idx_new)
8776{
8777 struct sched_domain_attr tmp;
8778
8779
8780 if (!new && !cur)
8781 return 1;
8782
8783 tmp = SD_ATTR_INIT;
8784 return !memcmp(cur ? (cur + idx_cur) : &tmp,
8785 new ? (new + idx_new) : &tmp,
8786 sizeof(struct sched_domain_attr));
8787}
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8817 struct sched_domain_attr *dattr_new)
8818{
8819 int i, j, n;
8820 int new_topology;
8821
8822 mutex_lock(&sched_domains_mutex);
8823
8824
8825 unregister_sched_domain_sysctl();
8826
8827
8828 new_topology = arch_update_cpu_topology();
8829
8830 n = doms_new ? ndoms_new : 0;
8831
8832
8833 for (i = 0; i < ndoms_cur; i++) {
8834 for (j = 0; j < n && !new_topology; j++) {
8835 if (cpumask_equal(&doms_cur[i], &doms_new[j])
8836 && dattrs_equal(dattr_cur, i, dattr_new, j))
8837 goto match1;
8838 }
8839
8840 detach_destroy_domains(doms_cur + i);
8841match1:
8842 ;
8843 }
8844
8845 if (doms_new == NULL) {
8846 ndoms_cur = 0;
8847 doms_new = fallback_doms;
8848 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
8849 WARN_ON_ONCE(dattr_new);
8850 }
8851
8852
8853 for (i = 0; i < ndoms_new; i++) {
8854 for (j = 0; j < ndoms_cur && !new_topology; j++) {
8855 if (cpumask_equal(&doms_new[i], &doms_cur[j])
8856 && dattrs_equal(dattr_new, i, dattr_cur, j))
8857 goto match2;
8858 }
8859
8860 __build_sched_domains(doms_new + i,
8861 dattr_new ? dattr_new + i : NULL);
8862match2:
8863 ;
8864 }
8865
8866
8867 if (doms_cur != fallback_doms)
8868 kfree(doms_cur);
8869 kfree(dattr_cur);
8870 doms_cur = doms_new;
8871 dattr_cur = dattr_new;
8872 ndoms_cur = ndoms_new;
8873
8874 register_sched_domain_sysctl();
8875
8876 mutex_unlock(&sched_domains_mutex);
8877}
8878
8879#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
/*
 * Tear down and rebuild all sched domains, with the online-CPU set pinned
 * by get_online_cpus()/put_online_cpus() for the duration.
 */
static void arch_reinit_sched_domains(void)
{
	get_online_cpus();

	/* Destroy domains first to force the rebuild */
	partition_sched_domains(0, NULL, NULL);

	rebuild_sched_domains();
	put_online_cpus();
}
8890
8891static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
8892{
8893 unsigned int level = 0;
8894
8895 if (sscanf(buf, "%u", &level) != 1)
8896 return -EINVAL;
8897
8898
8899
8900
8901
8902
8903
8904
8905 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
8906 return -EINVAL;
8907
8908 if (smt)
8909 sched_smt_power_savings = level;
8910 else
8911 sched_mc_power_savings = level;
8912
8913 arch_reinit_sched_domains();
8914
8915 return count;
8916}
8917
8918#ifdef CONFIG_SCHED_MC
/* sysfs show: print the current multi-core power-savings level. */
static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
					   char *page)
{
	return sprintf(page, "%u\n", sched_mc_power_savings);
}
/* sysfs store: forward to the common handler, selecting the MC knob. */
static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
					    const char *buf, size_t count)
{
	return sched_power_savings_store(buf, count, 0);
}
static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
			 sched_mc_power_savings_show,
			 sched_mc_power_savings_store);
8932#endif
8933
8934#ifdef CONFIG_SCHED_SMT
/* sysfs show: print the current SMT power-savings level. */
static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
					    char *page)
{
	return sprintf(page, "%u\n", sched_smt_power_savings);
}
/* sysfs store: forward to the common handler, selecting the SMT knob. */
static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
					     const char *buf, size_t count)
{
	return sched_power_savings_store(buf, count, 1);
}
static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
		   sched_smt_power_savings_show,
		   sched_smt_power_savings_store);
8948#endif
8949
/*
 * Create the power-savings sysfs files under 'cls'.  The SMT file is created
 * when smt_capable() is true; the MC file when mc_capable() is true and no
 * earlier error occurred.  Returns 0 or the first sysfs_create_file() error.
 */
int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
{
	int err = 0;

#ifdef CONFIG_SCHED_SMT
	if (smt_capable())
		err = sysfs_create_file(&cls->kset.kobj,
					&attr_sched_smt_power_savings.attr);
#endif
#ifdef CONFIG_SCHED_MC
	if (!err && mc_capable())
		err = sysfs_create_file(&cls->kset.kobj,
					&attr_sched_mc_power_savings.attr);
#endif
	return err;
}
8966#endif
8967
8968#ifndef CONFIG_CPUSETS
8969
8970
8971
8972
8973static int update_sched_domains(struct notifier_block *nfb,
8974 unsigned long action, void *hcpu)
8975{
8976 switch (action) {
8977 case CPU_ONLINE:
8978 case CPU_ONLINE_FROZEN:
8979 case CPU_DEAD:
8980 case CPU_DEAD_FROZEN:
8981 partition_sched_domains(1, NULL, NULL);
8982 return NOTIFY_OK;
8983
8984 default:
8985 return NOTIFY_DONE;
8986 }
8987}
8988#endif
8989
8990static int update_runtime(struct notifier_block *nfb,
8991 unsigned long action, void *hcpu)
8992{
8993 int cpu = (int)(long)hcpu;
8994
8995 switch (action) {
8996 case CPU_DOWN_PREPARE:
8997 case CPU_DOWN_PREPARE_FROZEN:
8998 disable_runtime(cpu_rq(cpu));
8999 return NOTIFY_OK;
9000
9001 case CPU_DOWN_FAILED:
9002 case CPU_DOWN_FAILED_FROZEN:
9003 case CPU_ONLINE:
9004 case CPU_ONLINE_FROZEN:
9005 enable_runtime(cpu_rq(cpu));
9006 return NOTIFY_OK;
9007
9008 default:
9009 return NOTIFY_DONE;
9010 }
9011}
9012
9013void __init sched_init_smp(void)
9014{
9015 cpumask_var_t non_isolated_cpus;
9016
9017 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
9018
9019#if defined(CONFIG_NUMA)
9020 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
9021 GFP_KERNEL);
9022 BUG_ON(sched_group_nodes_bycpu == NULL);
9023#endif
9024 get_online_cpus();
9025 mutex_lock(&sched_domains_mutex);
9026 arch_init_sched_domains(cpu_online_mask);
9027 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
9028 if (cpumask_empty(non_isolated_cpus))
9029 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
9030 mutex_unlock(&sched_domains_mutex);
9031 put_online_cpus();
9032
9033#ifndef CONFIG_CPUSETS
9034
9035 hotcpu_notifier(update_sched_domains, 0);
9036#endif
9037
9038
9039 hotcpu_notifier(update_runtime, 0);
9040
9041 init_hrtick();
9042
9043
9044 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
9045 BUG();
9046 sched_init_granularity();
9047 free_cpumask_var(non_isolated_cpus);
9048
9049 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
9050 init_sched_rt_class();
9051}
9052#else
/* Alternate build variant (the #else branch): only the granularity is set. */
void __init sched_init_smp(void)
{
	sched_init_granularity();
}
9057#endif
9058
/* Timer-migration switch; defaults to 1 (not referenced within this part of the file). */
const_debug unsigned int sysctl_timer_migration = 1;
9060
9061int in_sched_functions(unsigned long addr)
9062{
9063 return in_lock_functions(addr) ||
9064 (addr >= (unsigned long)__sched_text_start
9065 && addr < (unsigned long)__sched_text_end);
9066}
9067
/*
 * Initialize a CFS runqueue: empty rbtree timeline, empty task list and,
 * with group scheduling, the back-pointer to the owning 'rq'.
 */
static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
{
	cfs_rq->tasks_timeline = RB_ROOT;
	INIT_LIST_HEAD(&cfs_rq->tasks);
#ifdef CONFIG_FAIR_GROUP_SCHED
	cfs_rq->rq = rq;
#endif
	/* Start min_vruntime at -(1 << 20), i.e. close to the u64 wrap point. */
	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
}
9077
9078static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9079{
9080 struct rt_prio_array *array;
9081 int i;
9082
9083 array = &rt_rq->active;
9084 for (i = 0; i < MAX_RT_PRIO; i++) {
9085 INIT_LIST_HEAD(array->queue + i);
9086 __clear_bit(i, array->bitmap);
9087 }
9088
9089 __set_bit(MAX_RT_PRIO, array->bitmap);
9090
9091#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
9092 rt_rq->highest_prio.curr = MAX_RT_PRIO;
9093#ifdef CONFIG_SMP
9094 rt_rq->highest_prio.next = MAX_RT_PRIO;
9095#endif
9096#endif
9097#ifdef CONFIG_SMP
9098 rt_rq->rt_nr_migratory = 0;
9099 rt_rq->overloaded = 0;
9100 plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
9101#endif
9102
9103 rt_rq->rt_time = 0;
9104 rt_rq->rt_throttled = 0;
9105 rt_rq->rt_runtime = 0;
9106 spin_lock_init(&rt_rq->rt_runtime_lock);
9107
9108#ifdef CONFIG_RT_GROUP_SCHED
9109 rt_rq->rt_nr_boosted = 0;
9110 rt_rq->rq = rq;
9111#endif
9112}
9113
9114#ifdef CONFIG_FAIR_GROUP_SCHED
9115static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
9116 struct sched_entity *se, int cpu, int add,
9117 struct sched_entity *parent)
9118{
9119 struct rq *rq = cpu_rq(cpu);
9120 tg->cfs_rq[cpu] = cfs_rq;
9121 init_cfs_rq(cfs_rq, rq);
9122 cfs_rq->tg = tg;
9123 if (add)
9124 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
9125
9126 tg->se[cpu] = se;
9127
9128 if (!se)
9129 return;
9130
9131 if (!parent)
9132 se->cfs_rq = &rq->cfs;
9133 else
9134 se->cfs_rq = parent->my_q;
9135
9136 se->my_q = cfs_rq;
9137 se->load.weight = tg->shares;
9138 se->load.inv_weight = 0;
9139 se->parent = parent;
9140}
9141#endif
9142
9143#ifdef CONFIG_RT_GROUP_SCHED
9144static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9145 struct sched_rt_entity *rt_se, int cpu, int add,
9146 struct sched_rt_entity *parent)
9147{
9148 struct rq *rq = cpu_rq(cpu);
9149
9150 tg->rt_rq[cpu] = rt_rq;
9151 init_rt_rq(rt_rq, rq);
9152 rt_rq->tg = tg;
9153 rt_rq->rt_se = rt_se;
9154 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9155 if (add)
9156 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
9157
9158 tg->rt_se[cpu] = rt_se;
9159 if (!rt_se)
9160 return;
9161
9162 if (!parent)
9163 rt_se->rt_rq = &rq->rt;
9164 else
9165 rt_se->rt_rq = parent->my_q;
9166
9167 rt_se->my_q = rt_rq;
9168 rt_se->parent = parent;
9169 INIT_LIST_HEAD(&rt_se->run_list);
9170}
9171#endif
9172
/*
 * Boot-time scheduler initialization: carve up one allocation for the
 * group-scheduling pointer arrays and per-cpu masks, initialize every
 * possible CPU's runqueue, turn the current task into the idle task of this
 * CPU, and finally mark the scheduler as running.
 */
void __init sched_init(void)
{
	int i, j;
	unsigned long alloc_size = 0, ptr;

	/* Size the single allocation: two pointer arrays per group class... */
#ifdef CONFIG_FAIR_GROUP_SCHED
	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
	/* ...doubled because root_task_group needs the same arrays... */
#ifdef CONFIG_USER_SCHED
	alloc_size *= 2;
#endif
	/* ...plus one cpumask per possible CPU. */
#ifdef CONFIG_CPUMASK_OFFSTACK
	alloc_size += num_possible_cpus() * cpumask_size();
#endif
	/*
	 * NOTE(review): the kzalloc() result below is used without a NULL
	 * check — presumably an allocation failure this early is not
	 * survivable anyway; confirm.
	 */
	if (alloc_size) {
		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);

		/* Hand out consecutive slices of the buffer. */
#ifdef CONFIG_FAIR_GROUP_SCHED
		init_task_group.se = (struct sched_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		init_task_group.cfs_rq = (struct cfs_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

#ifdef CONFIG_USER_SCHED
		root_task_group.se = (struct sched_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
		init_task_group.rt_se = (struct sched_rt_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		init_task_group.rt_rq = (struct rt_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

#ifdef CONFIG_USER_SCHED
		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		root_task_group.rt_rq = (struct rt_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CPUMASK_OFFSTACK
		for_each_possible_cpu(i) {
			per_cpu(load_balance_tmpmask, i) = (void *)ptr;
			ptr += cpumask_size();
		}
#endif /* CONFIG_CPUMASK_OFFSTACK */
	}

#ifdef CONFIG_SMP
	init_defrootdomain();
#endif

	init_rt_bandwidth(&def_rt_bandwidth,
			global_rt_period(), global_rt_runtime());

#ifdef CONFIG_RT_GROUP_SCHED
	init_rt_bandwidth(&init_task_group.rt_bandwidth,
			global_rt_period(), global_rt_runtime());
#ifdef CONFIG_USER_SCHED
	init_rt_bandwidth(&root_task_group.rt_bandwidth,
			global_rt_period(), RUNTIME_INF);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_GROUP_SCHED
	list_add(&init_task_group.list, &task_groups);
	INIT_LIST_HEAD(&init_task_group.children);

#ifdef CONFIG_USER_SCHED
	/* With user scheduling, init_task_group is a child of root_task_group. */
	INIT_LIST_HEAD(&root_task_group.children);
	init_task_group.parent = &root_task_group;
	list_add(&init_task_group.siblings, &root_task_group.children);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_GROUP_SCHED */

	/* Per-runqueue initialization for every possible CPU. */
	for_each_possible_cpu(i) {
		struct rq *rq;

		rq = cpu_rq(i);
		spin_lock_init(&rq->lock);
		rq->nr_running = 0;
		rq->calc_load_active = 0;
		rq->calc_load_update = jiffies + LOAD_FREQ;
		init_cfs_rq(&rq->cfs, rq);
		init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
		init_task_group.shares = init_task_group_load;
		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
#ifdef CONFIG_CGROUP_SCHED
		/*
		 * cgroup case: init_task_group uses rq->cfs directly and has
		 * no scheduling entity of its own on this CPU.
		 */
		init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
#elif defined CONFIG_USER_SCHED
		/*
		 * user-sched case: root_task_group owns rq->cfs (not added to
		 * the leaf list, no entity).
		 */
		root_task_group.shares = NICE_0_LOAD;
		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
		/*
		 * init_task_group gets its own per-cpu cfs_rq and entity,
		 * with root_task_group.se[i] passed as parent (set to NULL by
		 * the call above).
		 */
		init_tg_cfs_entry(&init_task_group,
				&per_cpu(init_cfs_rq, i),
				&per_cpu(init_sched_entity, i), i, 1,
				root_task_group.se[i]);

#endif
#endif /* CONFIG_FAIR_GROUP_SCHED */

		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
#ifdef CONFIG_CGROUP_SCHED
		init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
#elif defined CONFIG_USER_SCHED
		/* Same split as for CFS above, applied to the RT class. */
		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
		init_tg_rt_entry(&init_task_group,
				&per_cpu(init_rt_rq, i),
				&per_cpu(init_sched_rt_entity, i), i, 1,
				root_task_group.rt_se[i]);
#endif
#endif

		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
			rq->cpu_load[j] = 0;
#ifdef CONFIG_SMP
		rq->sd = NULL;
		rq->rd = NULL;
		rq->active_balance = 0;
		rq->next_balance = jiffies;
		rq->push_cpu = 0;
		rq->cpu = i;
		rq->online = 0;
		rq->migration_thread = NULL;
		INIT_LIST_HEAD(&rq->migration_queue);
		/* Every runqueue starts out in the default root domain. */
		rq_attach_root(rq, &def_root_domain);
#endif
		init_rq_hrtick(rq);
		atomic_set(&rq->nr_iowait, 0);
	}

	set_load_weight(&init_task);

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif

#ifdef CONFIG_SMP
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#endif

#ifdef CONFIG_RT_MUTEXES
	plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
#endif

	/*
	 * Take a reference on init_mm and enter lazy-TLB mode with it for
	 * the current (boot) task.
	 */
	atomic_inc(&init_mm.mm_count);
	enter_lazy_tlb(&init_mm, current);

	/* Set up the current task as the idle task of this CPU. */
	init_idle(current, smp_processor_id());

	calc_load_update = jiffies + LOAD_FREQ;

	/* During early bootup the current task runs in the fair class. */
	current->sched_class = &fair_sched_class;

	/*
	 * Allocate the remaining global cpumasks.
	 * NOTE(review): alloc_cpumask_var() is not seen to zero its result
	 * here — confirm these masks are initialized before first use.
	 */
	alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
	alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
#endif
	alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
#endif /* SMP */

	perf_counter_init();

	scheduler_running = 1;
}
9399
9400#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
/*
 * __might_sleep - complain when a sleeping function is called atomically
 * @file: source file of the call site, printed in the report
 * @line: source line of the call site, printed in the report
 *
 * Does nothing unless the in_atomic macro is defined.  A report is printed
 * only when we are atomic or have interrupts disabled, the system is in
 * SYSTEM_RUNNING state and no oops is in progress.  Reports are rate
 * limited to one per HZ jiffies via a static timestamp.
 */
void __might_sleep(char *file, int line)
{
#ifdef in_atomic
	static unsigned long prev_jiffy;	/* jiffies of the last report */

	/* Context is allowed to sleep: nothing to report. */
	if (!in_atomic() && !irqs_disabled())
		return;

	/* Stay quiet during boot/shutdown and while an oops is printing. */
	if (system_state != SYSTEM_RUNNING)
		return;
	if (oops_in_progress)
		return;

	/* Rate limit; prev_jiffy == 0 means nothing has been reported yet. */
	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
		return;

	prev_jiffy = jiffies;

	printk(KERN_ERR
		"BUG: sleeping function called from invalid context at %s:%d\n",
			file, line);
	printk(KERN_ERR
		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
			in_atomic(), irqs_disabled(),
			current->pid, current->comm);

	debug_show_held_locks(current);
	if (irqs_disabled())
		print_irqtrace_events(current);
	dump_stack();
#endif
}
EXPORT_SYMBOL(__might_sleep);
9428#endif
9429
9430#ifdef CONFIG_MAGIC_SYSRQ
9431static void normalize_task(struct rq *rq, struct task_struct *p)
9432{
9433 int on_rq;
9434
9435 update_rq_clock(rq);
9436 on_rq = p->se.on_rq;
9437 if (on_rq)
9438 deactivate_task(rq, p, 0);
9439 __setscheduler(rq, p, SCHED_NORMAL, 0);
9440 if (on_rq) {
9441 activate_task(rq, p, 0);
9442 resched_task(rq->curr);
9443 }
9444}
9445
9446void normalize_rt_tasks(void)
9447{
9448 struct task_struct *g, *p;
9449 unsigned long flags;
9450 struct rq *rq;
9451
9452 read_lock_irqsave(&tasklist_lock, flags);
9453 do_each_thread(g, p) {
9454
9455
9456
9457 if (!p->mm)
9458 continue;
9459
9460 p->se.exec_start = 0;
9461#ifdef CONFIG_SCHEDSTATS
9462 p->se.wait_start = 0;
9463 p->se.sleep_start = 0;
9464 p->se.block_start = 0;
9465#endif
9466
9467 if (!rt_task(p)) {
9468
9469
9470
9471
9472 if (TASK_NICE(p) < 0 && p->mm)
9473 set_user_nice(p, 0);
9474 continue;
9475 }
9476
9477 spin_lock(&p->pi_lock);
9478 rq = __task_rq_lock(p);
9479
9480 normalize_task(rq, p);
9481
9482 __task_rq_unlock(rq);
9483 spin_unlock(&p->pi_lock);
9484 } while_each_thread(g, p);
9485
9486 read_unlock_irqrestore(&tasklist_lock, flags);
9487}
9488
9489#endif
9490
9491#ifdef CONFIG_IA64
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
/*
 * curr_task - return the task recorded as current for a CPU
 * @cpu: processor to query
 *
 * Returns whatever cpu_curr(@cpu) currently holds; no locking is done here.
 */
struct task_struct *curr_task(int cpu)
{
	struct task_struct *running = cpu_curr(cpu);

	return running;
}
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528void set_curr_task(int cpu, struct task_struct *p)
9529{
9530 cpu_curr(cpu) = p;
9531}
9532
9533#endif
9534
9535#ifdef CONFIG_FAIR_GROUP_SCHED
9536static void free_fair_sched_group(struct task_group *tg)
9537{
9538 int i;
9539
9540 for_each_possible_cpu(i) {
9541 if (tg->cfs_rq)
9542 kfree(tg->cfs_rq[i]);
9543 if (tg->se)
9544 kfree(tg->se[i]);
9545 }
9546
9547 kfree(tg->cfs_rq);
9548 kfree(tg->se);
9549}
9550
9551static
9552int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9553{
9554 struct cfs_rq *cfs_rq;
9555 struct sched_entity *se;
9556 struct rq *rq;
9557 int i;
9558
9559 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
9560 if (!tg->cfs_rq)
9561 goto err;
9562 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
9563 if (!tg->se)
9564 goto err;
9565
9566 tg->shares = NICE_0_LOAD;
9567
9568 for_each_possible_cpu(i) {
9569 rq = cpu_rq(i);
9570
9571 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
9572 GFP_KERNEL, cpu_to_node(i));
9573 if (!cfs_rq)
9574 goto err;
9575
9576 se = kzalloc_node(sizeof(struct sched_entity),
9577 GFP_KERNEL, cpu_to_node(i));
9578 if (!se)
9579 goto err;
9580
9581 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
9582 }
9583
9584 return 1;
9585
9586 err:
9587 return 0;
9588}
9589
9590static inline void register_fair_sched_group(struct task_group *tg, int cpu)
9591{
9592 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
9593 &cpu_rq(cpu)->leaf_cfs_rq_list);
9594}
9595
9596static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
9597{
9598 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
9599}
9600#else
/*
 * !CONFIG_FAIR_GROUP_SCHED: there is no per-group CFS state, so the
 * helpers below are no-ops and allocation trivially succeeds.
 */

/* Nothing was allocated, nothing to free. */
static inline void free_fair_sched_group(struct task_group *tg)
{
}

/* Always reports success (1), matching the real allocator's convention. */
static inline int
alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

/* No leaf cfs_rq list to join. */
static inline void register_fair_sched_group(struct task_group *tg, int cpu)
{
}

/* No leaf cfs_rq list to leave. */
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
}
9618#endif
9619
9620#ifdef CONFIG_RT_GROUP_SCHED
9621static void free_rt_sched_group(struct task_group *tg)
9622{
9623 int i;
9624
9625 destroy_rt_bandwidth(&tg->rt_bandwidth);
9626
9627 for_each_possible_cpu(i) {
9628 if (tg->rt_rq)
9629 kfree(tg->rt_rq[i]);
9630 if (tg->rt_se)
9631 kfree(tg->rt_se[i]);
9632 }
9633
9634 kfree(tg->rt_rq);
9635 kfree(tg->rt_se);
9636}
9637
9638static
9639int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9640{
9641 struct rt_rq *rt_rq;
9642 struct sched_rt_entity *rt_se;
9643 struct rq *rq;
9644 int i;
9645
9646 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
9647 if (!tg->rt_rq)
9648 goto err;
9649 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
9650 if (!tg->rt_se)
9651 goto err;
9652
9653 init_rt_bandwidth(&tg->rt_bandwidth,
9654 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
9655
9656 for_each_possible_cpu(i) {
9657 rq = cpu_rq(i);
9658
9659 rt_rq = kzalloc_node(sizeof(struct rt_rq),
9660 GFP_KERNEL, cpu_to_node(i));
9661 if (!rt_rq)
9662 goto err;
9663
9664 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
9665 GFP_KERNEL, cpu_to_node(i));
9666 if (!rt_se)
9667 goto err;
9668
9669 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
9670 }
9671
9672 return 1;
9673
9674 err:
9675 return 0;
9676}
9677
9678static inline void register_rt_sched_group(struct task_group *tg, int cpu)
9679{
9680 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
9681 &cpu_rq(cpu)->leaf_rt_rq_list);
9682}
9683
9684static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
9685{
9686 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
9687}
9688#else
/*
 * !CONFIG_RT_GROUP_SCHED: there is no per-group RT state, so the helpers
 * below are no-ops and allocation trivially succeeds.
 */

/* Nothing was allocated, nothing to free. */
static inline void free_rt_sched_group(struct task_group *tg)
{
}

/* Always reports success (1), matching the real allocator's convention. */
static inline int
alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

/* No leaf rt_rq list to join. */
static inline void register_rt_sched_group(struct task_group *tg, int cpu)
{
}

/* No leaf rt_rq list to leave. */
static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
{
}
9706#endif
9707
9708#ifdef CONFIG_GROUP_SCHED
/*
 * free_sched_group - release a task group and everything hanging off it
 * @tg: group to free; must not be used afterwards
 *
 * Frees the fair-class state, then the RT-class state, then @tg itself.
 */
static void free_sched_group(struct task_group *tg)
{
	/* Per-class state first ... */
	free_fair_sched_group(tg);
	free_rt_sched_group(tg);

	/* ... then the group structure. */
	kfree(tg);
}
9715
9716
9717struct task_group *sched_create_group(struct task_group *parent)
9718{
9719 struct task_group *tg;
9720 unsigned long flags;
9721 int i;
9722
9723 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
9724 if (!tg)
9725 return ERR_PTR(-ENOMEM);
9726
9727 if (!alloc_fair_sched_group(tg, parent))
9728 goto err;
9729
9730 if (!alloc_rt_sched_group(tg, parent))
9731 goto err;
9732
9733 spin_lock_irqsave(&task_group_lock, flags);
9734 for_each_possible_cpu(i) {
9735 register_fair_sched_group(tg, i);
9736 register_rt_sched_group(tg, i);
9737 }
9738 list_add_rcu(&tg->list, &task_groups);
9739
9740 WARN_ON(!parent);
9741
9742 tg->parent = parent;
9743 INIT_LIST_HEAD(&tg->children);
9744 list_add_rcu(&tg->siblings, &parent->children);
9745 spin_unlock_irqrestore(&task_group_lock, flags);
9746
9747 return tg;
9748
9749err:
9750 free_sched_group(tg);
9751 return ERR_PTR(-ENOMEM);
9752}
9753
9754
9755static void free_sched_group_rcu(struct rcu_head *rhp)
9756{
9757
9758 free_sched_group(container_of(rhp, struct task_group, rcu));
9759}
9760
9761
9762void sched_destroy_group(struct task_group *tg)
9763{
9764 unsigned long flags;
9765 int i;
9766
9767 spin_lock_irqsave(&task_group_lock, flags);
9768 for_each_possible_cpu(i) {
9769 unregister_fair_sched_group(tg, i);
9770 unregister_rt_sched_group(tg, i);
9771 }
9772 list_del_rcu(&tg->list);
9773 list_del_rcu(&tg->siblings);
9774 spin_unlock_irqrestore(&task_group_lock, flags);
9775
9776
9777 call_rcu(&tg->rcu, free_sched_group_rcu);
9778}
9779
9780
9781
9782
9783
9784
9785void sched_move_task(struct task_struct *tsk)
9786{
9787 int on_rq, running;
9788 unsigned long flags;
9789 struct rq *rq;
9790
9791 rq = task_rq_lock(tsk, &flags);
9792
9793 update_rq_clock(rq);
9794
9795 running = task_current(rq, tsk);
9796 on_rq = tsk->se.on_rq;
9797
9798 if (on_rq)
9799 dequeue_task(rq, tsk, 0);
9800 if (unlikely(running))
9801 tsk->sched_class->put_prev_task(rq, tsk);
9802
9803 set_task_rq(tsk, task_cpu(tsk));
9804
9805#ifdef CONFIG_FAIR_GROUP_SCHED
9806 if (tsk->sched_class->moved_group)
9807 tsk->sched_class->moved_group(tsk);
9808#endif
9809
9810 if (unlikely(running))
9811 tsk->sched_class->set_curr_task(rq);
9812 if (on_rq)
9813 enqueue_task(rq, tsk, 0);
9814
9815 task_rq_unlock(rq, &flags);
9816}
9817#endif
9818
9819#ifdef CONFIG_FAIR_GROUP_SCHED
9820static void __set_se_shares(struct sched_entity *se, unsigned long shares)
9821{
9822 struct cfs_rq *cfs_rq = se->cfs_rq;
9823 int on_rq;
9824
9825 on_rq = se->on_rq;
9826 if (on_rq)
9827 dequeue_entity(cfs_rq, se, 0);
9828
9829 se->load.weight = shares;
9830 se->load.inv_weight = 0;
9831
9832 if (on_rq)
9833 enqueue_entity(cfs_rq, se, 0);
9834}
9835
9836static void set_se_shares(struct sched_entity *se, unsigned long shares)
9837{
9838 struct cfs_rq *cfs_rq = se->cfs_rq;
9839 struct rq *rq = cfs_rq->rq;
9840 unsigned long flags;
9841
9842 spin_lock_irqsave(&rq->lock, flags);
9843 __set_se_shares(se, shares);
9844 spin_unlock_irqrestore(&rq->lock, flags);
9845}
9846
9847static DEFINE_MUTEX(shares_mutex);
9848
9849int sched_group_set_shares(struct task_group *tg, unsigned long shares)
9850{
9851 int i;
9852 unsigned long flags;
9853
9854
9855
9856
9857 if (!tg->se[0])
9858 return -EINVAL;
9859
9860 if (shares < MIN_SHARES)
9861 shares = MIN_SHARES;
9862 else if (shares > MAX_SHARES)
9863 shares = MAX_SHARES;
9864
9865 mutex_lock(&shares_mutex);
9866 if (tg->shares == shares)
9867 goto done;
9868
9869 spin_lock_irqsave(&task_group_lock, flags);
9870 for_each_possible_cpu(i)
9871 unregister_fair_sched_group(tg, i);
9872 list_del_rcu(&tg->siblings);
9873 spin_unlock_irqrestore(&task_group_lock, flags);
9874
9875
9876 synchronize_sched();
9877
9878
9879
9880
9881
9882 tg->shares = shares;
9883 for_each_possible_cpu(i) {
9884
9885
9886
9887 cfs_rq_set_shares(tg->cfs_rq[i], 0);
9888 set_se_shares(tg->se[i], shares);
9889 }
9890
9891
9892
9893
9894
9895 spin_lock_irqsave(&task_group_lock, flags);
9896 for_each_possible_cpu(i)
9897 register_fair_sched_group(tg, i);
9898 list_add_rcu(&tg->siblings, &tg->parent->children);
9899 spin_unlock_irqrestore(&task_group_lock, flags);
9900done:
9901 mutex_unlock(&shares_mutex);
9902 return 0;
9903}
9904
9905unsigned long sched_group_shares(struct task_group *tg)
9906{
9907 return tg->shares;
9908}
9909#endif
9910
9911#ifdef CONFIG_RT_GROUP_SCHED
9912
9913
9914
9915static DEFINE_MUTEX(rt_constraints_mutex);
9916
9917static unsigned long to_ratio(u64 period, u64 runtime)
9918{
9919 if (runtime == RUNTIME_INF)
9920 return 1ULL << 20;
9921
9922 return div64_u64(runtime << 20, period);
9923}
9924
9925
9926static inline int tg_has_rt_tasks(struct task_group *tg)
9927{
9928 struct task_struct *g, *p;
9929
9930 do_each_thread(g, p) {
9931 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
9932 return 1;
9933 } while_each_thread(g, p);
9934
9935 return 0;
9936}
9937
9938struct rt_schedulable_data {
9939 struct task_group *tg;
9940 u64 rt_period;
9941 u64 rt_runtime;
9942};
9943
9944static int tg_schedulable(struct task_group *tg, void *data)
9945{
9946 struct rt_schedulable_data *d = data;
9947 struct task_group *child;
9948 unsigned long total, sum = 0;
9949 u64 period, runtime;
9950
9951 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
9952 runtime = tg->rt_bandwidth.rt_runtime;
9953
9954 if (tg == d->tg) {
9955 period = d->rt_period;
9956 runtime = d->rt_runtime;
9957 }
9958
9959#ifdef CONFIG_USER_SCHED
9960 if (tg == &root_task_group) {
9961 period = global_rt_period();
9962 runtime = global_rt_runtime();
9963 }
9964#endif
9965
9966
9967
9968
9969 if (runtime > period && runtime != RUNTIME_INF)
9970 return -EINVAL;
9971
9972
9973
9974
9975 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
9976 return -EBUSY;
9977
9978 total = to_ratio(period, runtime);
9979
9980
9981
9982
9983 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
9984 return -EINVAL;
9985
9986
9987
9988
9989 list_for_each_entry_rcu(child, &tg->children, siblings) {
9990 period = ktime_to_ns(child->rt_bandwidth.rt_period);
9991 runtime = child->rt_bandwidth.rt_runtime;
9992
9993 if (child == d->tg) {
9994 period = d->rt_period;
9995 runtime = d->rt_runtime;
9996 }
9997
9998 sum += to_ratio(period, runtime);
9999 }
10000
10001 if (sum > total)
10002 return -EINVAL;
10003
10004 return 0;
10005}
10006
10007static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
10008{
10009 struct rt_schedulable_data data = {
10010 .tg = tg,
10011 .rt_period = period,
10012 .rt_runtime = runtime,
10013 };
10014
10015 return walk_tg_tree(tg_schedulable, tg_nop, &data);
10016}
10017
10018static int tg_set_bandwidth(struct task_group *tg,
10019 u64 rt_period, u64 rt_runtime)
10020{
10021 int i, err = 0;
10022
10023 mutex_lock(&rt_constraints_mutex);
10024 read_lock(&tasklist_lock);
10025 err = __rt_schedulable(tg, rt_period, rt_runtime);
10026 if (err)
10027 goto unlock;
10028
10029 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10030 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
10031 tg->rt_bandwidth.rt_runtime = rt_runtime;
10032
10033 for_each_possible_cpu(i) {
10034 struct rt_rq *rt_rq = tg->rt_rq[i];
10035
10036 spin_lock(&rt_rq->rt_runtime_lock);
10037 rt_rq->rt_runtime = rt_runtime;
10038 spin_unlock(&rt_rq->rt_runtime_lock);
10039 }
10040 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10041 unlock:
10042 read_unlock(&tasklist_lock);
10043 mutex_unlock(&rt_constraints_mutex);
10044
10045 return err;
10046}
10047
10048int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
10049{
10050 u64 rt_runtime, rt_period;
10051
10052 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
10053 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
10054 if (rt_runtime_us < 0)
10055 rt_runtime = RUNTIME_INF;
10056
10057 return tg_set_bandwidth(tg, rt_period, rt_runtime);
10058}
10059
10060long sched_group_rt_runtime(struct task_group *tg)
10061{
10062 u64 rt_runtime_us;
10063
10064 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
10065 return -1;
10066
10067 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
10068 do_div(rt_runtime_us, NSEC_PER_USEC);
10069 return rt_runtime_us;
10070}
10071
10072int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
10073{
10074 u64 rt_runtime, rt_period;
10075
10076 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
10077 rt_runtime = tg->rt_bandwidth.rt_runtime;
10078
10079 if (rt_period == 0)
10080 return -EINVAL;
10081
10082 return tg_set_bandwidth(tg, rt_period, rt_runtime);
10083}
10084
10085long sched_group_rt_period(struct task_group *tg)
10086{
10087 u64 rt_period_us;
10088
10089 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
10090 do_div(rt_period_us, NSEC_PER_USEC);
10091 return rt_period_us;
10092}
10093
10094static int sched_rt_global_constraints(void)
10095{
10096 u64 runtime, period;
10097 int ret = 0;
10098
10099 if (sysctl_sched_rt_period <= 0)
10100 return -EINVAL;
10101
10102 runtime = global_rt_runtime();
10103 period = global_rt_period();
10104
10105
10106
10107
10108 if (runtime > period && runtime != RUNTIME_INF)
10109 return -EINVAL;
10110
10111 mutex_lock(&rt_constraints_mutex);
10112 read_lock(&tasklist_lock);
10113 ret = __rt_schedulable(NULL, 0, 0);
10114 read_unlock(&tasklist_lock);
10115 mutex_unlock(&rt_constraints_mutex);
10116
10117 return ret;
10118}
10119
10120int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
10121{
10122
10123 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
10124 return 0;
10125
10126 return 1;
10127}
10128
10129#else
10130static int sched_rt_global_constraints(void)
10131{
10132 unsigned long flags;
10133 int i;
10134
10135 if (sysctl_sched_rt_period <= 0)
10136 return -EINVAL;
10137
10138
10139
10140
10141
10142 if (sysctl_sched_rt_runtime == 0)
10143 return -EBUSY;
10144
10145 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
10146 for_each_possible_cpu(i) {
10147 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
10148
10149 spin_lock(&rt_rq->rt_runtime_lock);
10150 rt_rq->rt_runtime = global_rt_runtime();
10151 spin_unlock(&rt_rq->rt_runtime_lock);
10152 }
10153 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
10154
10155 return 0;
10156}
10157#endif
10158
10159int sched_rt_handler(struct ctl_table *table, int write,
10160 struct file *filp, void __user *buffer, size_t *lenp,
10161 loff_t *ppos)
10162{
10163 int ret;
10164 int old_period, old_runtime;
10165 static DEFINE_MUTEX(mutex);
10166
10167 mutex_lock(&mutex);
10168 old_period = sysctl_sched_rt_period;
10169 old_runtime = sysctl_sched_rt_runtime;
10170
10171 ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
10172
10173 if (!ret && write) {
10174 ret = sched_rt_global_constraints();
10175 if (ret) {
10176 sysctl_sched_rt_period = old_period;
10177 sysctl_sched_rt_runtime = old_runtime;
10178 } else {
10179 def_rt_bandwidth.rt_runtime = global_rt_runtime();
10180 def_rt_bandwidth.rt_period =
10181 ns_to_ktime(global_rt_period());
10182 }
10183 }
10184 mutex_unlock(&mutex);
10185
10186 return ret;
10187}
10188
10189#ifdef CONFIG_CGROUP_SCHED
10190
10191
10192static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
10193{
10194 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
10195 struct task_group, css);
10196}
10197
10198static struct cgroup_subsys_state *
10199cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
10200{
10201 struct task_group *tg, *parent;
10202
10203 if (!cgrp->parent) {
10204
10205 return &init_task_group.css;
10206 }
10207
10208 parent = cgroup_tg(cgrp->parent);
10209 tg = sched_create_group(parent);
10210 if (IS_ERR(tg))
10211 return ERR_PTR(-ENOMEM);
10212
10213 return &tg->css;
10214}
10215
/* cgroup "destroy" callback: tear down the task group behind @cgrp. */
static void
cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	sched_destroy_group(cgroup_tg(cgrp));
}
10223
10224static int
10225cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10226 struct task_struct *tsk)
10227{
10228#ifdef CONFIG_RT_GROUP_SCHED
10229 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
10230 return -EINVAL;
10231#else
10232
10233 if (tsk->sched_class != &fair_sched_class)
10234 return -EINVAL;
10235#endif
10236
10237 return 0;
10238}
10239
/*
 * cgroup "attach" callback: the task already belongs to its new cgroup,
 * so only the scheduler side needs updating via sched_move_task().
 * @cgrp and @old_cont are not used.
 */
static void
cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			struct cgroup *old_cont, struct task_struct *tsk)
{
	sched_move_task(tsk);
}
10246
10247#ifdef CONFIG_FAIR_GROUP_SCHED
10248static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
10249 u64 shareval)
10250{
10251 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
10252}
10253
10254static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
10255{
10256 struct task_group *tg = cgroup_tg(cgrp);
10257
10258 return (u64) tg->shares;
10259}
10260#endif
10261
10262#ifdef CONFIG_RT_GROUP_SCHED
10263static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
10264 s64 val)
10265{
10266 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
10267}
10268
10269static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
10270{
10271 return sched_group_rt_runtime(cgroup_tg(cgrp));
10272}
10273
10274static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
10275 u64 rt_period_us)
10276{
10277 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
10278}
10279
10280static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
10281{
10282 return sched_group_rt_period(cgroup_tg(cgrp));
10283}
10284#endif
10285
10286static struct cftype cpu_files[] = {
10287#ifdef CONFIG_FAIR_GROUP_SCHED
10288 {
10289 .name = "shares",
10290 .read_u64 = cpu_shares_read_u64,
10291 .write_u64 = cpu_shares_write_u64,
10292 },
10293#endif
10294#ifdef CONFIG_RT_GROUP_SCHED
10295 {
10296 .name = "rt_runtime_us",
10297 .read_s64 = cpu_rt_runtime_read,
10298 .write_s64 = cpu_rt_runtime_write,
10299 },
10300 {
10301 .name = "rt_period_us",
10302 .read_u64 = cpu_rt_period_read_uint,
10303 .write_u64 = cpu_rt_period_write_uint,
10304 },
10305#endif
10306};
10307
10308static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
10309{
10310 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
10311}
10312
10313struct cgroup_subsys cpu_cgroup_subsys = {
10314 .name = "cpu",
10315 .create = cpu_cgroup_create,
10316 .destroy = cpu_cgroup_destroy,
10317 .can_attach = cpu_cgroup_can_attach,
10318 .attach = cpu_cgroup_attach,
10319 .populate = cpu_cgroup_populate,
10320 .subsys_id = cpu_cgroup_subsys_id,
10321 .early_init = 1,
10322};
10323
10324#endif
10325
10326#ifdef CONFIG_CGROUP_CPUACCT
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336struct cpuacct {
10337 struct cgroup_subsys_state css;
10338
10339 u64 *cpuusage;
10340 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
10341 struct cpuacct *parent;
10342};
10343
10344struct cgroup_subsys cpuacct_subsys;
10345
10346
10347static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
10348{
10349 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
10350 struct cpuacct, css);
10351}
10352
10353
10354static inline struct cpuacct *task_ca(struct task_struct *tsk)
10355{
10356 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
10357 struct cpuacct, css);
10358}
10359
10360
10361static struct cgroup_subsys_state *cpuacct_create(
10362 struct cgroup_subsys *ss, struct cgroup *cgrp)
10363{
10364 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
10365 int i;
10366
10367 if (!ca)
10368 goto out;
10369
10370 ca->cpuusage = alloc_percpu(u64);
10371 if (!ca->cpuusage)
10372 goto out_free_ca;
10373
10374 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
10375 if (percpu_counter_init(&ca->cpustat[i], 0))
10376 goto out_free_counters;
10377
10378 if (cgrp->parent)
10379 ca->parent = cgroup_ca(cgrp->parent);
10380
10381 return &ca->css;
10382
10383out_free_counters:
10384 while (--i >= 0)
10385 percpu_counter_destroy(&ca->cpustat[i]);
10386 free_percpu(ca->cpuusage);
10387out_free_ca:
10388 kfree(ca);
10389out:
10390 return ERR_PTR(-ENOMEM);
10391}
10392
10393
10394static void
10395cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
10396{
10397 struct cpuacct *ca = cgroup_ca(cgrp);
10398 int i;
10399
10400 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
10401 percpu_counter_destroy(&ca->cpustat[i]);
10402 free_percpu(ca->cpuusage);
10403 kfree(ca);
10404}
10405
10406static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
10407{
10408 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
10409 u64 data;
10410
10411#ifndef CONFIG_64BIT
10412
10413
10414
10415 spin_lock_irq(&cpu_rq(cpu)->lock);
10416 data = *cpuusage;
10417 spin_unlock_irq(&cpu_rq(cpu)->lock);
10418#else
10419 data = *cpuusage;
10420#endif
10421
10422 return data;
10423}
10424
10425static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
10426{
10427 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
10428
10429#ifndef CONFIG_64BIT
10430
10431
10432
10433 spin_lock_irq(&cpu_rq(cpu)->lock);
10434 *cpuusage = val;
10435 spin_unlock_irq(&cpu_rq(cpu)->lock);
10436#else
10437 *cpuusage = val;
10438#endif
10439}
10440
10441
10442static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
10443{
10444 struct cpuacct *ca = cgroup_ca(cgrp);
10445 u64 totalcpuusage = 0;
10446 int i;
10447
10448 for_each_present_cpu(i)
10449 totalcpuusage += cpuacct_cpuusage_read(ca, i);
10450
10451 return totalcpuusage;
10452}
10453
10454static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
10455 u64 reset)
10456{
10457 struct cpuacct *ca = cgroup_ca(cgrp);
10458 int err = 0;
10459 int i;
10460
10461 if (reset) {
10462 err = -EINVAL;
10463 goto out;
10464 }
10465
10466 for_each_present_cpu(i)
10467 cpuacct_cpuusage_write(ca, i, 0);
10468
10469out:
10470 return err;
10471}
10472
10473static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
10474 struct seq_file *m)
10475{
10476 struct cpuacct *ca = cgroup_ca(cgroup);
10477 u64 percpu;
10478 int i;
10479
10480 for_each_present_cpu(i) {
10481 percpu = cpuacct_cpuusage_read(ca, i);
10482 seq_printf(m, "%llu ", (unsigned long long) percpu);
10483 }
10484 seq_printf(m, "\n");
10485 return 0;
10486}
10487
10488static const char *cpuacct_stat_desc[] = {
10489 [CPUACCT_STAT_USER] = "user",
10490 [CPUACCT_STAT_SYSTEM] = "system",
10491};
10492
10493static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
10494 struct cgroup_map_cb *cb)
10495{
10496 struct cpuacct *ca = cgroup_ca(cgrp);
10497 int i;
10498
10499 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
10500 s64 val = percpu_counter_read(&ca->cpustat[i]);
10501 val = cputime64_to_clock_t(val);
10502 cb->fill(cb, cpuacct_stat_desc[i], val);
10503 }
10504 return 0;
10505}
10506
10507static struct cftype files[] = {
10508 {
10509 .name = "usage",
10510 .read_u64 = cpuusage_read,
10511 .write_u64 = cpuusage_write,
10512 },
10513 {
10514 .name = "usage_percpu",
10515 .read_seq_string = cpuacct_percpu_seq_read,
10516 },
10517 {
10518 .name = "stat",
10519 .read_map = cpuacct_stats_show,
10520 },
10521};
10522
10523static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
10524{
10525 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
10526}
10527
10528
10529
10530
10531
10532
10533static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10534{
10535 struct cpuacct *ca;
10536 int cpu;
10537
10538 if (unlikely(!cpuacct_subsys.active))
10539 return;
10540
10541 cpu = task_cpu(tsk);
10542
10543 rcu_read_lock();
10544
10545 ca = task_ca(tsk);
10546
10547 for (; ca; ca = ca->parent) {
10548 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
10549 *cpuusage += cputime;
10550 }
10551
10552 rcu_read_unlock();
10553}
10554
10555
10556
10557
10558static void cpuacct_update_stats(struct task_struct *tsk,
10559 enum cpuacct_stat_index idx, cputime_t val)
10560{
10561 struct cpuacct *ca;
10562
10563 if (unlikely(!cpuacct_subsys.active))
10564 return;
10565
10566 rcu_read_lock();
10567 ca = task_ca(tsk);
10568
10569 do {
10570 percpu_counter_add(&ca->cpustat[idx], val);
10571 ca = ca->parent;
10572 } while (ca);
10573 rcu_read_unlock();
10574}
10575
10576struct cgroup_subsys cpuacct_subsys = {
10577 .name = "cpuacct",
10578 .create = cpuacct_create,
10579 .destroy = cpuacct_destroy,
10580 .populate = cpuacct_populate,
10581 .subsys_id = cpuacct_subsys_id,
10582};
10583#endif
10584