1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h>
37#include <linux/interrupt.h>
38#include <linux/capability.h>
39#include <linux/completion.h>
40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h>
42#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
45#include <linux/freezer.h>
46#include <linux/vmalloc.h>
47#include <linux/blkdev.h>
48#include <linux/delay.h>
49#include <linux/pid_namespace.h>
50#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
57#include <linux/kthread.h>
58#include <linux/seq_file.h>
59#include <linux/sysctl.h>
60#include <linux/syscalls.h>
61#include <linux/times.h>
62#include <linux/tsacct_kern.h>
63#include <linux/kprobes.h>
64#include <linux/delayacct.h>
65#include <linux/reciprocal_div.h>
66#include <linux/unistd.h>
67#include <linux/pagemap.h>
68#include <linux/hrtimer.h>
69#include <linux/tick.h>
70#include <linux/bootmem.h>
71#include <linux/debugfs.h>
72#include <linux/ctype.h>
73
74#include <asm/tlb.h>
75#include <asm/irq_regs.h>
76
77
78
79
80
81
/*
 * Convert between nice values and static priorities. The two macros are
 * inverses of each other: a fixed offset of MAX_RT_PRIO + 20 is applied.
 */
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)

/*
 * "User priority" is a priority expressed relative to MAX_RT_PRIO:
 * USER_PRIO(MAX_RT_PRIO) is 0 and MAX_USER_PRIO is USER_PRIO(MAX_PRIO).
 */
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))

/*
 * Convert a nanosecond count to jiffies.
 * Note: TIME is cast to unsigned long *before* the division.
 */
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

/* Load weight of a nice-0 task, and the matching shift. */
#define NICE_0_LOAD SCHED_LOAD_SCALE
#define NICE_0_SHIFT SCHED_LOAD_SHIFT

/*
 * HZ / 10 -- presumably the default timeslice (100 msec) expressed in
 * jiffies; NOTE(review): confirm against the users of this macro.
 */
#define DEF_TIMESLICE (100 * HZ / 1000)

/*
 * All-ones u64 used as the "unlimited runtime" sentinel
 * (see start_rt_bandwidth() and global_rt_runtime()).
 */
#define RUNTIME_INF ((u64)~0ULL)
115
116#ifdef CONFIG_SMP
117
118
119
120
121static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
122{
123 return reciprocal_divide(load, sg->reciprocal_cpu_power);
124}
125
126
127
128
129
130static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
131{
132 sg->__cpu_power += val;
133 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
134}
135#endif
136
137static inline int rt_policy(int policy)
138{
139 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
140 return 1;
141 return 0;
142}
143
/* Return non-zero if task @p currently has a real-time policy. */
static inline int task_has_rt_policy(struct task_struct *p)
{
	return rt_policy(p->policy);
}
148
149
150
151
/*
 * Priority array of the real-time runqueue: one list head per RT
 * priority level plus a bitmap over those levels.
 */
struct rt_prio_array {
	/* one bit more than there are queues -- presumably a search delimiter */
	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);
	struct list_head queue[MAX_RT_PRIO];
};
156
/*
 * Real-time bandwidth control state: a runtime budget per period, and
 * the hrtimer that fires once per period.
 */
struct rt_bandwidth {
	/* held by start_rt_bandwidth() while it (re)arms the period timer */
	spinlock_t rt_runtime_lock;
	ktime_t rt_period;		/* timer interval, set from ns in init_rt_bandwidth() */
	u64 rt_runtime;			/* RUNTIME_INF means the timer is never started */
	struct hrtimer rt_period_timer;	/* callback: sched_rt_period_timer() */
};
164
/* Default rt_bandwidth instance; its users are not visible in this part of the file. */
static struct rt_bandwidth def_rt_bandwidth;

/* Defined later; called once per overrun batch from sched_rt_period_timer(). */
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
168
169static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
170{
171 struct rt_bandwidth *rt_b =
172 container_of(timer, struct rt_bandwidth, rt_period_timer);
173 ktime_t now;
174 int overrun;
175 int idle = 0;
176
177 for (;;) {
178 now = hrtimer_cb_get_time(timer);
179 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
180
181 if (!overrun)
182 break;
183
184 idle = do_sched_rt_period_timer(rt_b, overrun);
185 }
186
187 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
188}
189
190static
191void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
192{
193 rt_b->rt_period = ns_to_ktime(period);
194 rt_b->rt_runtime = runtime;
195
196 spin_lock_init(&rt_b->rt_runtime_lock);
197
198 hrtimer_init(&rt_b->rt_period_timer,
199 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
200 rt_b->rt_period_timer.function = sched_rt_period_timer;
201 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
202}
203
204static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
205{
206 ktime_t now;
207
208 if (rt_b->rt_runtime == RUNTIME_INF)
209 return;
210
211 if (hrtimer_active(&rt_b->rt_period_timer))
212 return;
213
214 spin_lock(&rt_b->rt_runtime_lock);
215 for (;;) {
216 if (hrtimer_active(&rt_b->rt_period_timer))
217 break;
218
219 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
220 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
221 hrtimer_start(&rt_b->rt_period_timer,
222 rt_b->rt_period_timer.expires,
223 HRTIMER_MODE_ABS);
224 }
225 spin_unlock(&rt_b->rt_runtime_lock);
226}
227
228#ifdef CONFIG_RT_GROUP_SCHED
/* Tear down @rt_b: cancel its period timer. */
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}
233#endif
234
235
236
237
238
/* Mutex for sched-domain state -- NOTE(review): its users are outside this part of the file. */
static DEFINE_MUTEX(sched_domains_mutex);
240
241#ifdef CONFIG_GROUP_SCHED
242
243#include <linux/cgroup.h>
244
245struct cfs_rq;
246
247static LIST_HEAD(task_groups);
248
249
/*
 * A group of tasks that is scheduled as a unit. The per-class arrays
 * below are indexed by cpu number (see set_task_rq()).
 */
struct task_group {
#ifdef CONFIG_CGROUP_SCHED
	/* embedded cgroup state; task_group() maps it back with container_of() */
	struct cgroup_subsys_state css;
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* per cpu: entity that becomes the parent of the group's tasks */
	struct sched_entity **se;
	/* per cpu: cfs runqueue the group's tasks are placed on */
	struct cfs_rq **cfs_rq;
	unsigned long shares;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	/* per cpu: rt entity / rt runqueue, same roles as se / cfs_rq above */
	struct sched_rt_entity **rt_se;
	struct rt_rq **rt_rq;

	struct rt_bandwidth rt_bandwidth;
#endif

	struct rcu_head rcu;
	struct list_head list;

	/* group hierarchy linkage */
	struct task_group *parent;
	struct list_head siblings;
	struct list_head children;
};
277
#ifdef CONFIG_USER_SCHED

/*
 * With CONFIG_USER_SCHED there is a distinct root task group, together
 * with statically allocated per-cpu entities and runqueues.
 * NOTE(review): where the init_* per-cpu objects below get attached is
 * not visible in this part of the file.
 */
struct task_group root_task_group;

#ifdef CONFIG_FAIR_GROUP_SCHED
/* per-cpu fair-class scheduling entity */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* per-cpu fair-class runqueue */
static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
/* per-cpu rt-class scheduling entity and runqueue */
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
#endif
#else
/* without CONFIG_USER_SCHED the root group is simply init_task_group */
#define root_task_group init_task_group
#endif
301
302
303
304
/*
 * Spinlock for task-group bookkeeping -- NOTE(review): the data it
 * protects is not visible in this part of the file.
 */
static DEFINE_SPINLOCK(task_group_lock);

#ifdef CONFIG_FAIR_GROUP_SCHED
/* Initial load of a task group: twice the nice-0 load with CONFIG_USER_SCHED. */
#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
#else
# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
#endif

/*
 * Presumably the lower and upper bound for a group's 'shares' value;
 * NOTE(review): confirm against the code that clamps shares.
 */
#define MIN_SHARES 2
#define MAX_SHARES (1UL << 18)

static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif

/*
 * The default task group: task_group() returns it when neither
 * CONFIG_USER_SCHED nor CONFIG_CGROUP_SCHED is enabled.
 */
struct task_group init_task_group;
332
333
334static inline struct task_group *task_group(struct task_struct *p)
335{
336 struct task_group *tg;
337
338#ifdef CONFIG_USER_SCHED
339 tg = p->user->tg;
340#elif defined(CONFIG_CGROUP_SCHED)
341 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
342 struct task_group, css);
343#else
344 tg = &init_task_group;
345#endif
346 return tg;
347}
348
349
/*
 * Point the per-class scheduling entities of @p at the runqueue and
 * parent entity that the task's group owns on @cpu.
 */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
	{
		struct sched_entity *se = &p->se;

		se->cfs_rq = task_group(p)->cfs_rq[cpu];
		se->parent = task_group(p)->se[cpu];
	}
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	{
		struct sched_rt_entity *rt_se = &p->rt;

		rt_se->rt_rq = task_group(p)->rt_rq[cpu];
		rt_se->parent = task_group(p)->rt_se[cpu];
	}
#endif
}
362
363#else
364
/* Without CONFIG_GROUP_SCHED there is no per-group runqueue to select: no-op. */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
366
367#endif
368
369
/* Runqueue state of the fair (CFS) scheduling class. */
struct cfs_rq {
	struct load_weight load;
	unsigned long nr_running;

	u64 exec_clock;
	u64 min_vruntime;

	/* rbtree of queued entities, with a cached pointer to its leftmost node */
	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;

	struct list_head tasks;
	struct list_head *balance_iterator;

	/*
	 * 'curr' and 'next' entities of this runqueue; task_hot() treats the
	 * entity in 'next' as cache hot when CACHE_HOT_BUDDY is enabled.
	 */
	struct sched_entity *curr, *next;

	unsigned long nr_spread_over;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* back pointer -- presumably the cpu runqueue this cfs_rq is attached to */
	struct rq *rq;

	/*
	 * Link for a list of leaf cfs_rqs (struct rq has a list head of the
	 * same name), and the task group owning this cfs_rq.
	 */
	struct list_head leaf_cfs_rq_list;
	struct task_group *tg;
#endif
};
406
407
/* Runqueue state of the real-time scheduling class. */
struct rt_rq {
	struct rt_prio_array active;	/* per-priority queues plus bitmap */
	unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
	int highest_prio;
#endif
#ifdef CONFIG_SMP
	unsigned long rt_nr_migratory;
	int overloaded;
#endif
	/* bandwidth accounting; same field names as in struct rt_bandwidth */
	int rt_throttled;
	u64 rt_time;
	u64 rt_runtime;

	spinlock_t rt_runtime_lock;

#ifdef CONFIG_RT_GROUP_SCHED
	unsigned long rt_nr_boosted;

	/* group scheduling linkage, mirroring struct cfs_rq */
	struct rq *rq;
	struct list_head leaf_rt_rq_list;
	struct task_group *tg;
	struct sched_rt_entity *rt_se;
#endif
};
433
434#ifdef CONFIG_SMP
435
436
437
438
439
440
441
442
443
/*
 * Root domain: reference-counted object that runqueues point to through
 * rq->rd. It carries cpumasks for the cpus it spans and those online,
 * plus the "rto" mask and counter.
 * NOTE(review): 'rto' presumably stands for RT overload -- confirm in
 * sched_rt.c.
 */
struct root_domain {
	atomic_t refcount;
	cpumask_t span;
	cpumask_t online;

	cpumask_t rto_mask;
	atomic_t rto_count;
};
456
457
458
459
460
/* Default root domain instance; where it is attached is not visible in this part of the file. */
static struct root_domain def_root_domain;
462
463#endif
464
465
466
467
468
469
470
471
/*
 * The per-cpu runqueue (instantiated as the per-cpu variable
 * 'runqueues'). The helpers in this file that modify it take rq->lock.
 */
struct rq {
	/* runqueue lock, see __task_rq_lock() / task_rq_lock() / this_rq_lock() */
	spinlock_t lock;

	/* number of runnable tasks, maintained by inc/dec_nr_running() */
	unsigned long nr_running;
	#define CPU_LOAD_IDX_MAX 5
	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
	unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
	unsigned long last_tick_seen;
	unsigned char in_nohz_recently;
#endif
	/* summed load weight of the queued tasks, see inc_load() / dec_load() */
	struct load_weight load;
	unsigned long nr_load_updates;
	u64 nr_switches;

	/* embedded runqueues of the fair and the real-time class */
	struct cfs_rq cfs;
	struct rt_rq rt;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* list head for the leaf cfs_rqs on this cpu */
	struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	struct list_head leaf_rt_rq_list;
#endif

	/*
	 * Adjusted in activate_task() / deactivate_task() for tasks for which
	 * task_contributes_to_load() is true.
	 */
	unsigned long nr_uninterruptible;

	/* task currently running on this cpu, and this cpu's idle task */
	struct task_struct *curr, *idle;
	unsigned long next_balance;
	struct mm_struct *prev_mm;

	/* refreshed from sched_clock_cpu() by update_rq_clock() */
	u64 clock;

	atomic_t nr_iowait;

#ifdef CONFIG_SMP
	struct root_domain *rd;
	struct sched_domain *sd;	/* base domain, walked by for_each_domain() */

	/* active load balancing state */
	int active_balance;
	int push_cpu;
	/* cpu this runqueue belongs to, returned by cpu_of() */
	int cpu;

	struct task_struct *migration_thread;
	struct list_head migration_queue;
#endif

#ifdef CONFIG_SCHED_HRTICK
	/* HRTICK_* bits, requested expiry time and the hrtick timer itself */
	unsigned long hrtick_flags;
	ktime_t hrtick_expire;
	struct hrtimer hrtick_timer;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* statistics only below this point */
	struct sched_info rq_sched_info;

	/* yield statistics */
	unsigned int yld_exp_empty;
	unsigned int yld_act_empty;
	unsigned int yld_both_empty;
	unsigned int yld_count;

	/* schedule() statistics */
	unsigned int sched_switch;
	unsigned int sched_count;
	unsigned int sched_goidle;

	/* wakeup (ttwu) statistics */
	unsigned int ttwu_count;
	unsigned int ttwu_local;

	/* presumably counts big-kernel-lock usage -- NOTE(review): confirm */
	unsigned int bkl_count;
#endif
	struct lock_class_key rq_lock_key;
};
564
/* One runqueue per cpu; accessed through cpu_rq(), this_rq() and task_rq(). */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
566
567static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
568{
569 rq->curr->sched_class->check_preempt_curr(rq, p);
570}
571
/* Return the cpu that @rq belongs to; always 0 on uniprocessor builds. */
static inline int cpu_of(struct rq *rq)
{
	int cpu = 0;

#ifdef CONFIG_SMP
	cpu = rq->cpu;
#endif
	return cpu;
}
580
581
582
583
584
585
586
587
/*
 * Walk the sched-domain chain of @cpu from its base domain upwards via
 * ->parent. The base pointer is fetched with rcu_dereference(), so the
 * caller is expected to provide the matching RCU protection.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

/* Runqueue accessors: by cpu number, for the local cpu, and for a task. */
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() (&__get_cpu_var(runqueues))
#define task_rq(p) cpu_rq(task_cpu(p))
/* The task currently running on @cpu. */
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
595
596static inline void update_rq_clock(struct rq *rq)
597{
598 rq->clock = sched_clock_cpu(cpu_of(rq));
599}
600
601
602
603
/*
 * Tunables are writable (and __read_mostly) with CONFIG_SCHED_DEBUG and
 * compile-time constants otherwise.
 */
#ifdef CONFIG_SCHED_DEBUG
# define const_debug __read_mostly
#else
# define const_debug static const
#endif

/*
 * sched_features.h is included several times with different definitions
 * of SCHED_FEAT(). First pass: one enum constant (the bit number) per
 * feature.
 */
#define SCHED_FEAT(name, enabled) \
	__SCHED_FEAT_##name ,

enum {
#include "sched_features.h"
};

#undef SCHED_FEAT

/* Second pass: OR together the bits of all features that default to enabled. */
#define SCHED_FEAT(name, enabled) \
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "sched_features.h"
	0;

#undef SCHED_FEAT
631
632#ifdef CONFIG_SCHED_DEBUG
/*
 * Third pass over sched_features.h: the feature names as strings, in bit
 * order, terminated by NULL. Index i corresponds to bit i of
 * sysctl_sched_features.
 */
#define SCHED_FEAT(name, enabled) \
	#name ,

static __read_mostly char *sched_feat_names[] = {
#include "sched_features.h"
	NULL
};

#undef SCHED_FEAT
642
/* open(): copy the inode's private pointer into the file. Always succeeds. */
static int sched_feat_open(struct inode *inode, struct file *filp)
{
	filp->private_data = inode->i_private;
	return 0;
}
648
649static ssize_t
650sched_feat_read(struct file *filp, char __user *ubuf,
651 size_t cnt, loff_t *ppos)
652{
653 char *buf;
654 int r = 0;
655 int len = 0;
656 int i;
657
658 for (i = 0; sched_feat_names[i]; i++) {
659 len += strlen(sched_feat_names[i]);
660 len += 4;
661 }
662
663 buf = kmalloc(len + 2, GFP_KERNEL);
664 if (!buf)
665 return -ENOMEM;
666
667 for (i = 0; sched_feat_names[i]; i++) {
668 if (sysctl_sched_features & (1UL << i))
669 r += sprintf(buf + r, "%s ", sched_feat_names[i]);
670 else
671 r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
672 }
673
674 r += sprintf(buf + r, "\n");
675 WARN_ON(r >= len + 2);
676
677 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
678
679 kfree(buf);
680
681 return r;
682}
683
684static ssize_t
685sched_feat_write(struct file *filp, const char __user *ubuf,
686 size_t cnt, loff_t *ppos)
687{
688 char buf[64];
689 char *cmp = buf;
690 int neg = 0;
691 int i;
692
693 if (cnt > 63)
694 cnt = 63;
695
696 if (copy_from_user(&buf, ubuf, cnt))
697 return -EFAULT;
698
699 buf[cnt] = 0;
700
701 if (strncmp(buf, "NO_", 3) == 0) {
702 neg = 1;
703 cmp += 3;
704 }
705
706 for (i = 0; sched_feat_names[i]; i++) {
707 int len = strlen(sched_feat_names[i]);
708
709 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
710 if (neg)
711 sysctl_sched_features &= ~(1UL << i);
712 else
713 sysctl_sched_features |= (1UL << i);
714 break;
715 }
716 }
717
718 if (!sched_feat_names[i])
719 return -EINVAL;
720
721 filp->f_pos += cnt;
722
723 return cnt;
724}
725
726static struct file_operations sched_feat_fops = {
727 .open = sched_feat_open,
728 .read = sched_feat_read,
729 .write = sched_feat_write,
730};
731
732static __init int sched_init_debug(void)
733{
734 debugfs_create_file("sched_features", 0644, NULL, NULL,
735 &sched_feat_fops);
736
737 return 0;
738}
739late_initcall(sched_init_debug);
740
741#endif
742
/* Non-zero if scheduler feature @x is currently enabled. */
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
744
745
746
747
748
/*
 * Presumably the maximum number of tasks migrated in one load-balancing
 * pass -- NOTE(review): its users are outside this part of the file.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * Period over which real-time runtime is accounted, in microseconds
 * (global_rt_period() multiplies it by NSEC_PER_USEC). Default: 1s.
 */
unsigned int sysctl_sched_rt_period = 1000000;

/* While zero, __cpu_clock() returns 0 instead of reading the clock. */
static __read_mostly int scheduler_running;

/*
 * Real-time runtime per period, in microseconds
 * (see global_rt_runtime()). Default: 0.95s.
 */
int sysctl_sched_rt_runtime = 950000;
764
765static inline u64 global_rt_period(void)
766{
767 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
768}
769
770static inline u64 global_rt_runtime(void)
771{
772 if (sysctl_sched_rt_period < 0)
773 return RUNTIME_INF;
774
775 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
776}
777
/*
 * cpu_clock() re-synchronises a cpu's clock with the global clock once
 * it has advanced by more than this since the last synchronisation
 * (same unit as the values returned by sched_clock_cpu()).
 */
unsigned long long time_sync_thresh = 100000;

/* per cpu: offset added to the raw clock, and time of the last sync */
static DEFINE_PER_CPU(unsigned long long, time_offset);
static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);

/*
 * Largest time handed out by __sync_cpu_clock() so far, and the lock
 * protecting it.
 */
static DEFINE_SPINLOCK(time_sync_lock);
static unsigned long long prev_global_time;
791
792static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
793{
794
795
796
797
798 spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
799 __raw_spin_lock(&time_sync_lock.raw_lock);
800
801 if (time < prev_global_time) {
802 per_cpu(time_offset, cpu) += prev_global_time - time;
803 time = prev_global_time;
804 } else {
805 prev_global_time = time;
806 }
807
808 __raw_spin_unlock(&time_sync_lock.raw_lock);
809 spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
810
811 return time;
812}
813
814static unsigned long long __cpu_clock(int cpu)
815{
816 unsigned long long now;
817
818
819
820
821
822 if (unlikely(!scheduler_running))
823 return 0;
824
825 now = sched_clock_cpu(cpu);
826
827 return now;
828}
829
830
831
832
833
834unsigned long long cpu_clock(int cpu)
835{
836 unsigned long long prev_cpu_time, time, delta_time;
837 unsigned long flags;
838
839 local_irq_save(flags);
840 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
841 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
842 delta_time = time-prev_cpu_time;
843
844 if (unlikely(delta_time > time_sync_thresh)) {
845 time = __sync_cpu_clock(time, cpu);
846 per_cpu(prev_cpu_time, cpu) = time;
847 }
848 local_irq_restore(flags);
849
850 return time;
851}
852EXPORT_SYMBOL_GPL(cpu_clock);
853
/* Architectures may provide these context-switch hooks; default to no-ops. */
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev) do { } while (0)
#endif
860
/* Return non-zero if @p is the task recorded as current on @rq. */
static inline int task_current(struct rq *rq, struct task_struct *p)
{
	return rq->curr == p;
}
865
866#ifndef __ARCH_WANT_UNLOCKED_CTXSW
/* Locked context switch: a task is running iff it is rq->curr. */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	return task_current(rq, p);
}
871
/* Locked context switch: nothing to do before the switch, rq->lock stays held. */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}
875
/*
 * Locked context switch: drop rq->lock (and enable interrupts) after
 * the switch. prepare_lock_switch() did not release the lock, so it is
 * the task running now that unlocks it.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
	/* make the spinlock debugging code see the current task as owner */
	rq->lock.owner = current;
#endif
	/*
	 * Tell lockdep that the lock is held in this context, so that the
	 * unlock below is balanced from its point of view.
	 */
	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

	spin_unlock_irq(&rq->lock);
}
891
892#else
/*
 * Unlocked context switch: on SMP a task counts as running while its
 * ->oncpu flag is set; on UP it is running iff it is rq->curr.
 */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	int running;

#ifdef CONFIG_SMP
	running = p->oncpu;
#else
	running = task_current(rq, p);
#endif
	return running;
}
901
/*
 * Unlocked context switch: mark @next as on a cpu, then release
 * rq->lock before the switch happens.
 */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * Set while rq->lock is still held, so task_running() reports the
	 * task as running from here on.
	 */
	next->oncpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	/* this architecture switches with interrupts enabled */
	spin_unlock_irq(&rq->lock);
#else
	/* interrupts stay off until finish_lock_switch() */
	spin_unlock(&rq->lock);
#endif
}
918
/*
 * Unlocked context switch: clear @prev's on-cpu marker after the switch
 * and, unless the architecture switched with interrupts on, re-enable
 * interrupts.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * Write barrier: all earlier stores are ordered before ->oncpu is
	 * cleared, i.e. before other cpus may see @prev as no longer running.
	 */
	smp_wmb();
	prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
934#endif
935
936
937
938
939
940static inline struct rq *__task_rq_lock(struct task_struct *p)
941 __acquires(rq->lock)
942{
943 for (;;) {
944 struct rq *rq = task_rq(p);
945 spin_lock(&rq->lock);
946 if (likely(rq == task_rq(p)))
947 return rq;
948 spin_unlock(&rq->lock);
949 }
950}
951
952
953
954
955
956
957static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
958 __acquires(rq->lock)
959{
960 struct rq *rq;
961
962 for (;;) {
963 local_irq_save(*flags);
964 rq = task_rq(p);
965 spin_lock(&rq->lock);
966 if (likely(rq == task_rq(p)))
967 return rq;
968 spin_unlock_irqrestore(&rq->lock, *flags);
969 }
970}
971
/* Counterpart of __task_rq_lock(): release rq->lock, interrupt state untouched. */
static void __task_rq_unlock(struct rq *rq)
	__releases(rq->lock)
{
	spin_unlock(&rq->lock);
}
977
/* Counterpart of task_rq_lock(): release rq->lock and restore the saved irq state. */
static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
	__releases(rq->lock)
{
	spin_unlock_irqrestore(&rq->lock, *flags);
}
983
984
985
986
/*
 * Disable local interrupts, then lock and return the runqueue of the
 * current cpu. The previous interrupt state is not saved.
 */
static struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	/* interrupts go off before this_rq() is evaluated */
	local_irq_disable();
	rq = this_rq();
	spin_lock(&rq->lock);

	return rq;
}
998
999static void __resched_task(struct task_struct *p, int tif_bit);
1000
/* Request a reschedule of @p by setting TIF_NEED_RESCHED, see __resched_task(). */
static inline void resched_task(struct task_struct *p)
{
	__resched_task(p, TIF_NEED_RESCHED);
}
1005
1006#ifdef CONFIG_SCHED_HRTICK
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
/*
 * Flag @p with TIF_HRTICK_RESCHED, which makes hrtick_resched() program
 * the hrtick timer through hrtick_set().
 */
static inline void resched_hrt(struct task_struct *p)
{
	__resched_task(p, TIF_HRTICK_RESCHED);
}
1021
/* Take rq->lock (irq-safe) and request a reschedule of the task running on @rq. */
static inline void resched_rq(struct rq *rq)
{
	unsigned long flags;

	spin_lock_irqsave(&rq->lock, flags);
	resched_task(rq->curr);
	spin_unlock_irqrestore(&rq->lock, flags);
}
1030
/* Bit numbers within rq->hrtick_flags. */
enum {
	HRTICK_SET,		/* set by hrtick_start(): the timer is to be programmed */
	HRTICK_RESET,		/* set by hrtick_start(..., reset): consumed by hrtick_set() */
	HRTICK_BLOCK,		/* set across cpu hotplug: hrtick_enabled() returns 0 */
};
1036
1037
1038
1039
1040
1041
1042static inline int hrtick_enabled(struct rq *rq)
1043{
1044 if (!sched_feat(HRTICK))
1045 return 0;
1046 if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags)))
1047 return 0;
1048 return hrtimer_is_hres_active(&rq->hrtick_timer);
1049}
1050
1051
1052
1053
1054
1055
1056static void hrtick_start(struct rq *rq, u64 delay, int reset)
1057{
1058 assert_spin_locked(&rq->lock);
1059
1060
1061
1062
1063 rq->hrtick_expire =
1064 ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
1065
1066
1067
1068 __set_bit(HRTICK_SET, &rq->hrtick_flags);
1069 if (reset)
1070 __set_bit(HRTICK_RESET, &rq->hrtick_flags);
1071
1072
1073
1074
1075
1076 if (reset)
1077 resched_hrt(rq->curr);
1078}
1079
1080static void hrtick_clear(struct rq *rq)
1081{
1082 if (hrtimer_active(&rq->hrtick_timer))
1083 hrtimer_cancel(&rq->hrtick_timer);
1084}
1085
1086
1087
1088
1089static void hrtick_set(struct rq *rq)
1090{
1091 ktime_t time;
1092 int set, reset;
1093 unsigned long flags;
1094
1095 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1096
1097 spin_lock_irqsave(&rq->lock, flags);
1098 set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
1099 reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
1100 time = rq->hrtick_expire;
1101 clear_thread_flag(TIF_HRTICK_RESCHED);
1102 spin_unlock_irqrestore(&rq->lock, flags);
1103
1104 if (set) {
1105 hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
1106 if (reset && !hrtimer_active(&rq->hrtick_timer))
1107 resched_rq(rq);
1108 } else
1109 hrtick_clear(rq);
1110}
1111
1112
1113
1114
1115
1116static enum hrtimer_restart hrtick(struct hrtimer *timer)
1117{
1118 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1119
1120 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1121
1122 spin_lock(&rq->lock);
1123 update_rq_clock(rq);
1124 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1125 spin_unlock(&rq->lock);
1126
1127 return HRTIMER_NORESTART;
1128}
1129
1130#ifdef CONFIG_SMP
1131static void hotplug_hrtick_disable(int cpu)
1132{
1133 struct rq *rq = cpu_rq(cpu);
1134 unsigned long flags;
1135
1136 spin_lock_irqsave(&rq->lock, flags);
1137 rq->hrtick_flags = 0;
1138 __set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
1139 spin_unlock_irqrestore(&rq->lock, flags);
1140
1141 hrtick_clear(rq);
1142}
1143
1144static void hotplug_hrtick_enable(int cpu)
1145{
1146 struct rq *rq = cpu_rq(cpu);
1147 unsigned long flags;
1148
1149 spin_lock_irqsave(&rq->lock, flags);
1150 __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags);
1151 spin_unlock_irqrestore(&rq->lock, flags);
1152}
1153
1154static int
1155hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1156{
1157 int cpu = (int)(long)hcpu;
1158
1159 switch (action) {
1160 case CPU_UP_CANCELED:
1161 case CPU_UP_CANCELED_FROZEN:
1162 case CPU_DOWN_PREPARE:
1163 case CPU_DOWN_PREPARE_FROZEN:
1164 case CPU_DEAD:
1165 case CPU_DEAD_FROZEN:
1166 hotplug_hrtick_disable(cpu);
1167 return NOTIFY_OK;
1168
1169 case CPU_UP_PREPARE:
1170 case CPU_UP_PREPARE_FROZEN:
1171 case CPU_DOWN_FAILED:
1172 case CPU_DOWN_FAILED_FROZEN:
1173 case CPU_ONLINE:
1174 case CPU_ONLINE_FROZEN:
1175 hotplug_hrtick_enable(cpu);
1176 return NOTIFY_OK;
1177 }
1178
1179 return NOTIFY_DONE;
1180}
1181
/* Register hotplug_hrtick() as cpu hotplug notifier with priority 0. */
static void init_hrtick(void)
{
	hotcpu_notifier(hotplug_hrtick, 0);
}
1186#endif
1187
1188static void init_rq_hrtick(struct rq *rq)
1189{
1190 rq->hrtick_flags = 0;
1191 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1192 rq->hrtick_timer.function = hrtick;
1193 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1194}
1195
1196void hrtick_resched(void)
1197{
1198 struct rq *rq;
1199 unsigned long flags;
1200
1201 if (!test_thread_flag(TIF_HRTICK_RESCHED))
1202 return;
1203
1204 local_irq_save(flags);
1205 rq = cpu_rq(smp_processor_id());
1206 hrtick_set(rq);
1207 local_irq_restore(flags);
1208}
1209#else
/* !CONFIG_SCHED_HRTICK: the whole hrtick interface collapses into no-ops. */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void hrtick_set(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}

/* not static: this one is callable from outside this file */
void hrtick_resched(void)
{
}

static inline void init_hrtick(void)
{
}
1229#endif
1230
1231
1232
1233
1234
1235
1236
1237
1238#ifdef CONFIG_SMP
1239
/*
 * Default test for "task @t polls the need-resched flag"; callers below
 * skip the reschedule IPI for such tasks. May be overridden elsewhere.
 */
#ifndef tsk_is_polling
#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif
1243
1244static void __resched_task(struct task_struct *p, int tif_bit)
1245{
1246 int cpu;
1247
1248 assert_spin_locked(&task_rq(p)->lock);
1249
1250 if (unlikely(test_tsk_thread_flag(p, tif_bit)))
1251 return;
1252
1253 set_tsk_thread_flag(p, tif_bit);
1254
1255 cpu = task_cpu(p);
1256 if (cpu == smp_processor_id())
1257 return;
1258
1259
1260 smp_mb();
1261 if (!tsk_is_polling(p))
1262 smp_send_reschedule(cpu);
1263}
1264
1265static void resched_cpu(int cpu)
1266{
1267 struct rq *rq = cpu_rq(cpu);
1268 unsigned long flags;
1269
1270 if (!spin_trylock_irqsave(&rq->lock, flags))
1271 return;
1272 resched_task(cpu_curr(cpu));
1273 spin_unlock_irqrestore(&rq->lock, flags);
1274}
1275
1276#ifdef CONFIG_NO_HZ
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287void wake_up_idle_cpu(int cpu)
1288{
1289 struct rq *rq = cpu_rq(cpu);
1290
1291 if (cpu == smp_processor_id())
1292 return;
1293
1294
1295
1296
1297
1298
1299
1300
1301 if (rq->curr != rq->idle)
1302 return;
1303
1304
1305
1306
1307
1308
1309 set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
1310
1311
1312 smp_mb();
1313 if (!tsk_is_polling(rq->idle))
1314 smp_send_reschedule(cpu);
1315}
1316#endif
1317
1318#else
/*
 * Uniprocessor variant: there is no other cpu to notify, so setting the
 * thread flag is all that is needed. rq->lock must be held.
 */
static void __resched_task(struct task_struct *p, int tif_bit)
{
	assert_spin_locked(&task_rq(p)->lock);
	set_tsk_thread_flag(p, tif_bit);
}
1324#endif
1325
/*
 * Fixed-point constants for calc_delta_mine(): inverse weights are
 * scaled by 2^32. On 32-bit, 2^32 does not fit into an unsigned long,
 * so the largest representable value is used instead.
 */
#if BITS_PER_LONG == 32
# define WMULT_CONST (~0UL)
#else
# define WMULT_CONST (1UL << 32)
#endif

#define WMULT_SHIFT 32

/* Shift right by y bits and round: half of the divisor is added first. */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1338
1339static unsigned long
1340calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1341 struct load_weight *lw)
1342{
1343 u64 tmp;
1344
1345 if (!lw->inv_weight) {
1346 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1347 lw->inv_weight = 1;
1348 else
1349 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1350 / (lw->weight+1);
1351 }
1352
1353 tmp = (u64)delta_exec * weight;
1354
1355
1356
1357 if (unlikely(tmp > WMULT_CONST))
1358 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1359 WMULT_SHIFT/2);
1360 else
1361 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1362
1363 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1364}
1365
/* calc_delta_mine() with the nice-0 load as weight: scales @delta_exec by NICE_0_LOAD / lw->weight. */
static inline unsigned long
calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
{
	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
}
1371
/* Add @inc to the weight of @lw. */
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	/* invalidate the cached inverse, calc_delta_mine() recomputes it */
	lw->inv_weight = 0;
}
1377
/* Subtract @dec from the weight of @lw. */
static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	/* invalidate the cached inverse, calc_delta_mine() recomputes it */
	lw->inv_weight = 0;
}
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
/*
 * Load weight of SCHED_IDLE tasks and its fixed-point inverse,
 * 2^32 / WEIGHT_IDLEPRIO = 2^31.
 *
 * The inverse must be an unsigned constant: (1 << 31) overflows a
 * signed int and, being negative in practice, sign-extends to
 * 0xffffffff80000000 when stored into a 64-bit unsigned long
 * inv_weight.
 */
#define WEIGHT_IDLEPRIO 2
#define WMULT_IDLEPRIO (1U << 31)
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
/*
 * Load weight per nice level. The table is indexed with
 * static_prio - MAX_RT_PRIO (see set_load_weight()), i.e. nice + 20:
 * entry 0 is nice -20, entry 20 is nice 0 (weight 1024), entry 39 is
 * nice 19. Each row below covers five consecutive nice levels.
 */
static const int prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};
1418
1419
1420
1421
1422
1423
1424
1425
/*
 * Precomputed fixed-point inverses of prio_to_weight[], same indexing:
 * each entry is 2^32 divided by the matching weight (for instance
 * 4194304 == 2^32 / 1024 for nice 0). set_load_weight() stores them as
 * inv_weight so that calc_delta_mine() can multiply instead of divide.
 */
static const u32 prio_to_wmult[40] = {
 /* -20 */     48388,     59856,     76040,     92818,    118348,
 /* -15 */    147320,    184698,    229616,    287308,    360437,
 /* -10 */    449829,    563644,    704093,    875809,   1099582,
 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
1436
1437static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1438
1439
1440
1441
1442
1443
/*
 * Iterator over tasks, passed to balance_tasks() and
 * iter_move_one_task(): start() and next() each return a task and get
 * 'arg' as their only argument. NOTE(review): presumably they return
 * NULL at the end of the iteration -- confirm in the implementations.
 */
struct rq_iterator {
	void *arg;
	struct task_struct *(*start)(void *);
	struct task_struct *(*next)(void *);
};
1449
1450#ifdef CONFIG_SMP
1451static unsigned long
1452balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1453 unsigned long max_load_move, struct sched_domain *sd,
1454 enum cpu_idle_type idle, int *all_pinned,
1455 int *this_best_prio, struct rq_iterator *iterator);
1456
1457static int
1458iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1459 struct sched_domain *sd, enum cpu_idle_type idle,
1460 struct rq_iterator *iterator);
1461#endif
1462
/* Charge @cputime to @tsk for cpu accounting; no-op without CONFIG_CGROUP_CPUACCT. */
#ifdef CONFIG_CGROUP_CPUACCT
static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
#endif
1468
/* Add @load to the load weight of @rq. */
static inline void inc_cpu_load(struct rq *rq, unsigned long load)
{
	update_load_add(&rq->load, load);
}
1473
/* Subtract @load from the load weight of @rq. */
static inline void dec_cpu_load(struct rq *rq, unsigned long load)
{
	update_load_sub(&rq->load, load);
}
1478
1479#ifdef CONFIG_SMP
1480static unsigned long source_load(int cpu, int type);
1481static unsigned long target_load(int cpu, int type);
1482static unsigned long cpu_avg_load_per_task(int cpu);
1483static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1484#else
1485
1486#ifdef CONFIG_FAIR_GROUP_SCHED
/* !CONFIG_SMP stub: does nothing on uniprocessor builds. */
static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
{
}
1490#endif
1491
1492#endif
1493
1494#include "sched_stats.h"
1495#include "sched_idletask.c"
1496#include "sched_fair.c"
1497#include "sched_rt.c"
1498#ifdef CONFIG_SCHED_DEBUG
1499# include "sched_debug.c"
1500#endif
1501
1502#define sched_class_highest (&rt_sched_class)
1503
1504static inline void inc_load(struct rq *rq, const struct task_struct *p)
1505{
1506 update_load_add(&rq->load, p->se.load.weight);
1507}
1508
1509static inline void dec_load(struct rq *rq, const struct task_struct *p)
1510{
1511 update_load_sub(&rq->load, p->se.load.weight);
1512}
1513
/* Account @p as runnable on @rq: bump the task count and add its load weight. */
static void inc_nr_running(struct task_struct *p, struct rq *rq)
{
	rq->nr_running++;
	inc_load(rq, p);
}
1519
/* Inverse of inc_nr_running(): drop the task count and remove the load weight of @p. */
static void dec_nr_running(struct task_struct *p, struct rq *rq)
{
	rq->nr_running--;
	dec_load(rq, p);
}
1525
1526static void set_load_weight(struct task_struct *p)
1527{
1528 if (task_has_rt_policy(p)) {
1529 p->se.load.weight = prio_to_weight[0] * 2;
1530 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1531 return;
1532 }
1533
1534
1535
1536
1537 if (p->policy == SCHED_IDLE) {
1538 p->se.load.weight = WEIGHT_IDLEPRIO;
1539 p->se.load.inv_weight = WMULT_IDLEPRIO;
1540 return;
1541 }
1542
1543 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1544 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1545}
1546
1547static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1548{
1549 sched_info_queued(p);
1550 p->sched_class->enqueue_task(rq, p, wakeup);
1551 p->se.on_rq = 1;
1552}
1553
1554static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1555{
1556 p->sched_class->dequeue_task(rq, p, sleep);
1557 p->se.on_rq = 0;
1558}
1559
1560
1561
1562
/* Normal priority of a non-real-time task: simply its static priority. */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}
1567
1568
1569
1570
1571
1572
1573
1574
1575static inline int normal_prio(struct task_struct *p)
1576{
1577 int prio;
1578
1579 if (task_has_rt_policy(p))
1580 prio = MAX_RT_PRIO-1 - p->rt_priority;
1581 else
1582 prio = __normal_prio(p);
1583 return prio;
1584}
1585
1586
1587
1588
1589
1590
1591
1592
1593static int effective_prio(struct task_struct *p)
1594{
1595 p->normal_prio = normal_prio(p);
1596
1597
1598
1599
1600
1601 if (!rt_prio(p->prio))
1602 return p->normal_prio;
1603 return p->prio;
1604}
1605
1606
1607
1608
/* Move @p onto runqueue @rq and account it as running. */
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
{
	/* the task stops counting as uninterruptible load */
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, wakeup);
	inc_nr_running(p, rq);
}
1617
1618
1619
1620
1621static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1622{
1623 if (task_contributes_to_load(p))
1624 rq->nr_uninterruptible++;
1625
1626 dequeue_task(rq, p, sleep);
1627 dec_nr_running(p, rq);
1628}
1629
1630
1631
1632
1633
/*
 * Return non-zero when @p is the task recorded as current on the CPU
 * it is assigned to.
 */
inline int task_curr(const struct task_struct *p)
{
	const struct task_struct *running = cpu_curr(task_cpu(p));

	return running == p;
}
1638
1639
1640unsigned long weighted_cpuload(const int cpu)
1641{
1642 return cpu_rq(cpu)->load.weight;
1643}
1644
/*
 * Low-level helper that records @cpu as the CPU of @p.
 *
 * set_task_rq() is always called.  On SMP builds the thread_info CPU
 * field is updated as well, after a write barrier so that the stores
 * made by set_task_rq() are ordered before the new ->cpu value.
 */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_rq(p, cpu);
#ifdef CONFIG_SMP
	/* Order the set_task_rq() stores before publishing the new CPU. */
	smp_wmb();
	task_thread_info(p)->cpu = cpu;
#endif
}
1658
1659static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1660 const struct sched_class *prev_class,
1661 int oldprio, int running)
1662{
1663 if (prev_class != p->sched_class) {
1664 if (prev_class->switched_from)
1665 prev_class->switched_from(rq, p, running);
1666 p->sched_class->switched_to(rq, p, running);
1667 } else
1668 p->sched_class->prio_changed(rq, p, oldprio, running);
1669}
1670
1671#ifdef CONFIG_SMP
1672
1673
1674
1675
1676static int
1677task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1678{
1679 s64 delta;
1680
1681
1682
1683
1684 if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
1685 return 1;
1686
1687 if (p->sched_class != &fair_sched_class)
1688 return 0;
1689
1690 if (sysctl_sched_migration_cost == -1)
1691 return 1;
1692 if (sysctl_sched_migration_cost == 0)
1693 return 0;
1694
1695 delta = now - p->se.exec_start;
1696
1697 return delta < (s64)sysctl_sched_migration_cost;
1698}
1699
1700
1701void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1702{
1703 int old_cpu = task_cpu(p);
1704 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1705 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
1706 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
1707 u64 clock_offset;
1708
1709 clock_offset = old_rq->clock - new_rq->clock;
1710
1711#ifdef CONFIG_SCHEDSTATS
1712 if (p->se.wait_start)
1713 p->se.wait_start -= clock_offset;
1714 if (p->se.sleep_start)
1715 p->se.sleep_start -= clock_offset;
1716 if (p->se.block_start)
1717 p->se.block_start -= clock_offset;
1718 if (old_cpu != new_cpu) {
1719 schedstat_inc(p, se.nr_migrations);
1720 if (task_hot(p, old_rq->clock, NULL))
1721 schedstat_inc(p, se.nr_forced2_migrations);
1722 }
1723#endif
1724 p->se.vruntime -= old_cfsrq->min_vruntime -
1725 new_cfsrq->min_vruntime;
1726
1727 __set_task_cpu(p, new_cpu);
1728}
1729
1730struct migration_req {
1731 struct list_head list;
1732
1733 struct task_struct *task;
1734 int dest_cpu;
1735
1736 struct completion done;
1737};
1738
1739
1740
1741
1742
1743static int
1744migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1745{
1746 struct rq *rq = task_rq(p);
1747
1748
1749
1750
1751
1752 if (!p->se.on_rq && !task_running(rq, p)) {
1753 set_task_cpu(p, dest_cpu);
1754 return 0;
1755 }
1756
1757 init_completion(&req->done);
1758 req->task = p;
1759 req->dest_cpu = dest_cpu;
1760 list_add(&req->list, &rq->migration_queue);
1761
1762 return 1;
1763}
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774void wait_task_inactive(struct task_struct *p)
1775{
1776 unsigned long flags;
1777 int running, on_rq;
1778 struct rq *rq;
1779
1780 for (;;) {
1781
1782
1783
1784
1785
1786
1787 rq = task_rq(p);
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800 while (task_running(rq, p))
1801 cpu_relax();
1802
1803
1804
1805
1806
1807
1808 rq = task_rq_lock(p, &flags);
1809 running = task_running(rq, p);
1810 on_rq = p->se.on_rq;
1811 task_rq_unlock(rq, &flags);
1812
1813
1814
1815
1816
1817
1818
1819 if (unlikely(running)) {
1820 cpu_relax();
1821 continue;
1822 }
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833 if (unlikely(on_rq)) {
1834 schedule_timeout_uninterruptible(1);
1835 continue;
1836 }
1837
1838
1839
1840
1841
1842
1843 break;
1844 }
1845}
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
/*
 * Send a reschedule IPI to the CPU of @p, but only when @p is the
 * current task there and that CPU is not the one we are running on.
 * Preemption is disabled around the check.
 */
void kick_process(struct task_struct *p)
{
	int target;

	preempt_disable();
	target = task_cpu(p);
	if (target != smp_processor_id() && task_curr(p))
		smp_send_reschedule(target);
	preempt_enable();
}
1870
1871
1872
1873
1874
1875
1876
1877
1878static unsigned long source_load(int cpu, int type)
1879{
1880 struct rq *rq = cpu_rq(cpu);
1881 unsigned long total = weighted_cpuload(cpu);
1882
1883 if (type == 0)
1884 return total;
1885
1886 return min(rq->cpu_load[type-1], total);
1887}
1888
1889
1890
1891
1892
1893static unsigned long target_load(int cpu, int type)
1894{
1895 struct rq *rq = cpu_rq(cpu);
1896 unsigned long total = weighted_cpuload(cpu);
1897
1898 if (type == 0)
1899 return total;
1900
1901 return max(rq->cpu_load[type-1], total);
1902}
1903
1904
1905
1906
1907static unsigned long cpu_avg_load_per_task(int cpu)
1908{
1909 struct rq *rq = cpu_rq(cpu);
1910 unsigned long total = weighted_cpuload(cpu);
1911 unsigned long n = rq->nr_running;
1912
1913 return n ? total / n : SCHED_LOAD_SCALE;
1914}
1915
1916
1917
1918
1919
1920static struct sched_group *
1921find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1922{
1923 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1924 unsigned long min_load = ULONG_MAX, this_load = 0;
1925 int load_idx = sd->forkexec_idx;
1926 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1927
1928 do {
1929 unsigned long load, avg_load;
1930 int local_group;
1931 int i;
1932
1933
1934 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1935 continue;
1936
1937 local_group = cpu_isset(this_cpu, group->cpumask);
1938
1939
1940 avg_load = 0;
1941
1942 for_each_cpu_mask(i, group->cpumask) {
1943
1944 if (local_group)
1945 load = source_load(i, load_idx);
1946 else
1947 load = target_load(i, load_idx);
1948
1949 avg_load += load;
1950 }
1951
1952
1953 avg_load = sg_div_cpu_power(group,
1954 avg_load * SCHED_LOAD_SCALE);
1955
1956 if (local_group) {
1957 this_load = avg_load;
1958 this = group;
1959 } else if (avg_load < min_load) {
1960 min_load = avg_load;
1961 idlest = group;
1962 }
1963 } while (group = group->next, group != sd->groups);
1964
1965 if (!idlest || 100*this_load < imbalance*min_load)
1966 return NULL;
1967 return idlest;
1968}
1969
1970
1971
1972
1973static int
1974find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
1975 cpumask_t *tmp)
1976{
1977 unsigned long load, min_load = ULONG_MAX;
1978 int idlest = -1;
1979 int i;
1980
1981
1982 cpus_and(*tmp, group->cpumask, p->cpus_allowed);
1983
1984 for_each_cpu_mask(i, *tmp) {
1985 load = weighted_cpuload(i);
1986
1987 if (load < min_load || (load == min_load && i == this_cpu)) {
1988 min_load = load;
1989 idlest = i;
1990 }
1991 }
1992
1993 return idlest;
1994}
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007static int sched_balance_self(int cpu, int flag)
2008{
2009 struct task_struct *t = current;
2010 struct sched_domain *tmp, *sd = NULL;
2011
2012 for_each_domain(cpu, tmp) {
2013
2014
2015
2016 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2017 break;
2018 if (tmp->flags & flag)
2019 sd = tmp;
2020 }
2021
2022 while (sd) {
2023 cpumask_t span, tmpmask;
2024 struct sched_group *group;
2025 int new_cpu, weight;
2026
2027 if (!(sd->flags & flag)) {
2028 sd = sd->child;
2029 continue;
2030 }
2031
2032 span = sd->span;
2033 group = find_idlest_group(sd, t, cpu);
2034 if (!group) {
2035 sd = sd->child;
2036 continue;
2037 }
2038
2039 new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
2040 if (new_cpu == -1 || new_cpu == cpu) {
2041
2042 sd = sd->child;
2043 continue;
2044 }
2045
2046
2047 cpu = new_cpu;
2048 sd = NULL;
2049 weight = cpus_weight(span);
2050 for_each_domain(cpu, tmp) {
2051 if (weight <= cpus_weight(tmp->span))
2052 break;
2053 if (tmp->flags & flag)
2054 sd = tmp;
2055 }
2056
2057 }
2058
2059 return cpu;
2060}
2061
2062#endif
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2079{
2080 int cpu, orig_cpu, this_cpu, success = 0;
2081 unsigned long flags;
2082 long old_state;
2083 struct rq *rq;
2084
2085 if (!sched_feat(SYNC_WAKEUPS))
2086 sync = 0;
2087
2088 smp_wmb();
2089 rq = task_rq_lock(p, &flags);
2090 old_state = p->state;
2091 if (!(old_state & state))
2092 goto out;
2093
2094 if (p->se.on_rq)
2095 goto out_running;
2096
2097 cpu = task_cpu(p);
2098 orig_cpu = cpu;
2099 this_cpu = smp_processor_id();
2100
2101#ifdef CONFIG_SMP
2102 if (unlikely(task_running(rq, p)))
2103 goto out_activate;
2104
2105 cpu = p->sched_class->select_task_rq(p, sync);
2106 if (cpu != orig_cpu) {
2107 set_task_cpu(p, cpu);
2108 task_rq_unlock(rq, &flags);
2109
2110 rq = task_rq_lock(p, &flags);
2111 old_state = p->state;
2112 if (!(old_state & state))
2113 goto out;
2114 if (p->se.on_rq)
2115 goto out_running;
2116
2117 this_cpu = smp_processor_id();
2118 cpu = task_cpu(p);
2119 }
2120
2121#ifdef CONFIG_SCHEDSTATS
2122 schedstat_inc(rq, ttwu_count);
2123 if (cpu == this_cpu)
2124 schedstat_inc(rq, ttwu_local);
2125 else {
2126 struct sched_domain *sd;
2127 for_each_domain(this_cpu, sd) {
2128 if (cpu_isset(cpu, sd->span)) {
2129 schedstat_inc(sd, ttwu_wake_remote);
2130 break;
2131 }
2132 }
2133 }
2134#endif
2135
2136out_activate:
2137#endif
2138 schedstat_inc(p, se.nr_wakeups);
2139 if (sync)
2140 schedstat_inc(p, se.nr_wakeups_sync);
2141 if (orig_cpu != cpu)
2142 schedstat_inc(p, se.nr_wakeups_migrate);
2143 if (cpu == this_cpu)
2144 schedstat_inc(p, se.nr_wakeups_local);
2145 else
2146 schedstat_inc(p, se.nr_wakeups_remote);
2147 update_rq_clock(rq);
2148 activate_task(rq, p, 1);
2149 success = 1;
2150
2151out_running:
2152 check_preempt_curr(rq, p);
2153
2154 p->state = TASK_RUNNING;
2155#ifdef CONFIG_SMP
2156 if (p->sched_class->task_wake_up)
2157 p->sched_class->task_wake_up(rq, p);
2158#endif
2159out:
2160 task_rq_unlock(rq, &flags);
2161
2162 return success;
2163}
2164
2165int wake_up_process(struct task_struct *p)
2166{
2167 return try_to_wake_up(p, TASK_ALL, 0);
2168}
2169EXPORT_SYMBOL(wake_up_process);
2170
/*
 * Wake up @p only if it is in one of the states in the @state mask.
 * Non-sync wakeup; returns the result of try_to_wake_up().
 */
int wake_up_state(struct task_struct *p, unsigned int state)
{
	int woken = try_to_wake_up(p, state, 0);

	return woken;
}
2175
2176
2177
2178
2179
2180
2181
2182static void __sched_fork(struct task_struct *p)
2183{
2184 p->se.exec_start = 0;
2185 p->se.sum_exec_runtime = 0;
2186 p->se.prev_sum_exec_runtime = 0;
2187 p->se.last_wakeup = 0;
2188 p->se.avg_overlap = 0;
2189
2190#ifdef CONFIG_SCHEDSTATS
2191 p->se.wait_start = 0;
2192 p->se.sum_sleep_runtime = 0;
2193 p->se.sleep_start = 0;
2194 p->se.block_start = 0;
2195 p->se.sleep_max = 0;
2196 p->se.block_max = 0;
2197 p->se.exec_max = 0;
2198 p->se.slice_max = 0;
2199 p->se.wait_max = 0;
2200#endif
2201
2202 INIT_LIST_HEAD(&p->rt.run_list);
2203 p->se.on_rq = 0;
2204 INIT_LIST_HEAD(&p->se.group_node);
2205
2206#ifdef CONFIG_PREEMPT_NOTIFIERS
2207 INIT_HLIST_HEAD(&p->preempt_notifiers);
2208#endif
2209
2210
2211
2212
2213
2214
2215
2216 p->state = TASK_RUNNING;
2217}
2218
2219
2220
2221
2222void sched_fork(struct task_struct *p, int clone_flags)
2223{
2224 int cpu = get_cpu();
2225
2226 __sched_fork(p);
2227
2228#ifdef CONFIG_SMP
2229 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2230#endif
2231 set_task_cpu(p, cpu);
2232
2233
2234
2235
2236 p->prio = current->normal_prio;
2237 if (!rt_prio(p->prio))
2238 p->sched_class = &fair_sched_class;
2239
2240#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2241 if (likely(sched_info_on()))
2242 memset(&p->sched_info, 0, sizeof(p->sched_info));
2243#endif
2244#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2245 p->oncpu = 0;
2246#endif
2247#ifdef CONFIG_PREEMPT
2248
2249 task_thread_info(p)->preempt_count = 1;
2250#endif
2251 put_cpu();
2252}
2253
2254
2255
2256
2257
2258
2259
2260
2261void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2262{
2263 unsigned long flags;
2264 struct rq *rq;
2265
2266 rq = task_rq_lock(p, &flags);
2267 BUG_ON(p->state != TASK_RUNNING);
2268 update_rq_clock(rq);
2269
2270 p->prio = effective_prio(p);
2271
2272 if (!p->sched_class->task_new || !current->se.on_rq) {
2273 activate_task(rq, p, 0);
2274 } else {
2275
2276
2277
2278
2279 p->sched_class->task_new(rq, p);
2280 inc_nr_running(p, rq);
2281 }
2282 check_preempt_curr(rq, p);
2283#ifdef CONFIG_SMP
2284 if (p->sched_class->task_wake_up)
2285 p->sched_class->task_wake_up(rq, p);
2286#endif
2287 task_rq_unlock(rq, &flags);
2288}
2289
2290#ifdef CONFIG_PREEMPT_NOTIFIERS
2291
2292
2293
2294
2295
2296void preempt_notifier_register(struct preempt_notifier *notifier)
2297{
2298 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers);
2299}
2300EXPORT_SYMBOL_GPL(preempt_notifier_register);
2301
2302
2303
2304
2305
2306
2307
2308void preempt_notifier_unregister(struct preempt_notifier *notifier)
2309{
2310 hlist_del(¬ifier->link);
2311}
2312EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2313
2314static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2315{
2316 struct preempt_notifier *notifier;
2317 struct hlist_node *node;
2318
2319 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2320 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2321}
2322
2323static void
2324fire_sched_out_preempt_notifiers(struct task_struct *curr,
2325 struct task_struct *next)
2326{
2327 struct preempt_notifier *notifier;
2328 struct hlist_node *node;
2329
2330 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2331 notifier->ops->sched_out(notifier, next);
2332}
2333
2334#else
2335
/* No-op variant used when CONFIG_PREEMPT_NOTIFIERS is not set. */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}
2339
/* No-op variant used when CONFIG_PREEMPT_NOTIFIERS is not set. */
static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
}
2345
2346#endif
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
/*
 * Work done just before switching from @prev to @next on @rq:
 * sched-out preempt notifiers of @prev first, then the lock-switch
 * and architecture preparation hooks for @next.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	/* Notifiers run first, while @prev is still the current task. */
	fire_sched_out_preempt_notifiers(prev, next);

	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2386 __releases(rq->lock)
2387{
2388 struct mm_struct *mm = rq->prev_mm;
2389 long prev_state;
2390
2391 rq->prev_mm = NULL;
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404 prev_state = prev->state;
2405 finish_arch_switch(prev);
2406 finish_lock_switch(rq, prev);
2407#ifdef CONFIG_SMP
2408 if (current->sched_class->post_schedule)
2409 current->sched_class->post_schedule(rq);
2410#endif
2411
2412 fire_sched_in_preempt_notifiers(current);
2413 if (mm)
2414 mmdrop(mm);
2415 if (unlikely(prev_state == TASK_DEAD)) {
2416
2417
2418
2419
2420 kprobe_flush_task(prev);
2421 put_task_struct(prev);
2422 }
2423}
2424
2425
2426
2427
2428
2429asmlinkage void schedule_tail(struct task_struct *prev)
2430 __releases(rq->lock)
2431{
2432 struct rq *rq = this_rq();
2433
2434 finish_task_switch(rq, prev);
2435#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2436
2437 preempt_enable();
2438#endif
2439 if (current->set_child_tid)
2440 put_user(task_pid_vnr(current), current->set_child_tid);
2441}
2442
2443
2444
2445
2446
2447static inline void
2448context_switch(struct rq *rq, struct task_struct *prev,
2449 struct task_struct *next)
2450{
2451 struct mm_struct *mm, *oldmm;
2452
2453 prepare_task_switch(rq, prev, next);
2454 mm = next->mm;
2455 oldmm = prev->active_mm;
2456
2457
2458
2459
2460
2461 arch_enter_lazy_cpu_mode();
2462
2463 if (unlikely(!mm)) {
2464 next->active_mm = oldmm;
2465 atomic_inc(&oldmm->mm_count);
2466 enter_lazy_tlb(oldmm, next);
2467 } else
2468 switch_mm(oldmm, mm, next);
2469
2470 if (unlikely(!prev->mm)) {
2471 prev->active_mm = NULL;
2472 rq->prev_mm = oldmm;
2473 }
2474
2475
2476
2477
2478
2479
2480#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2481 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2482#endif
2483
2484
2485 switch_to(prev, next, prev);
2486
2487 barrier();
2488
2489
2490
2491
2492
2493 finish_task_switch(this_rq(), prev);
2494}
2495
2496
2497
2498
2499
2500
2501
2502
2503unsigned long nr_running(void)
2504{
2505 unsigned long i, sum = 0;
2506
2507 for_each_online_cpu(i)
2508 sum += cpu_rq(i)->nr_running;
2509
2510 return sum;
2511}
2512
2513unsigned long nr_uninterruptible(void)
2514{
2515 unsigned long i, sum = 0;
2516
2517 for_each_possible_cpu(i)
2518 sum += cpu_rq(i)->nr_uninterruptible;
2519
2520
2521
2522
2523
2524 if (unlikely((long)sum < 0))
2525 sum = 0;
2526
2527 return sum;
2528}
2529
2530unsigned long long nr_context_switches(void)
2531{
2532 int i;
2533 unsigned long long sum = 0;
2534
2535 for_each_possible_cpu(i)
2536 sum += cpu_rq(i)->nr_switches;
2537
2538 return sum;
2539}
2540
2541unsigned long nr_iowait(void)
2542{
2543 unsigned long i, sum = 0;
2544
2545 for_each_possible_cpu(i)
2546 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2547
2548 return sum;
2549}
2550
2551unsigned long nr_active(void)
2552{
2553 unsigned long i, running = 0, uninterruptible = 0;
2554
2555 for_each_online_cpu(i) {
2556 running += cpu_rq(i)->nr_running;
2557 uninterruptible += cpu_rq(i)->nr_uninterruptible;
2558 }
2559
2560 if (unlikely((long)uninterruptible < 0))
2561 uninterruptible = 0;
2562
2563 return running + uninterruptible;
2564}
2565
2566
2567
2568
2569
2570static void update_cpu_load(struct rq *this_rq)
2571{
2572 unsigned long this_load = this_rq->load.weight;
2573 int i, scale;
2574
2575 this_rq->nr_load_updates++;
2576
2577
2578 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2579 unsigned long old_load, new_load;
2580
2581
2582
2583 old_load = this_rq->cpu_load[i];
2584 new_load = this_load;
2585
2586
2587
2588
2589
2590 if (new_load > old_load)
2591 new_load += scale-1;
2592 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2593 }
2594}
2595
2596#ifdef CONFIG_SMP
2597
2598
2599
2600
2601
2602
2603
2604static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2605 __acquires(rq1->lock)
2606 __acquires(rq2->lock)
2607{
2608 BUG_ON(!irqs_disabled());
2609 if (rq1 == rq2) {
2610 spin_lock(&rq1->lock);
2611 __acquire(rq2->lock);
2612 } else {
2613 if (rq1 < rq2) {
2614 spin_lock(&rq1->lock);
2615 spin_lock(&rq2->lock);
2616 } else {
2617 spin_lock(&rq2->lock);
2618 spin_lock(&rq1->lock);
2619 }
2620 }
2621 update_rq_clock(rq1);
2622 update_rq_clock(rq2);
2623}
2624
2625
2626
2627
2628
2629
2630
2631static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2632 __releases(rq1->lock)
2633 __releases(rq2->lock)
2634{
2635 spin_unlock(&rq1->lock);
2636 if (rq1 != rq2)
2637 spin_unlock(&rq2->lock);
2638 else
2639 __release(rq2->lock);
2640}
2641
2642
2643
2644
2645static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2646 __releases(this_rq->lock)
2647 __acquires(busiest->lock)
2648 __acquires(this_rq->lock)
2649{
2650 int ret = 0;
2651
2652 if (unlikely(!irqs_disabled())) {
2653
2654 spin_unlock(&this_rq->lock);
2655 BUG_ON(1);
2656 }
2657 if (unlikely(!spin_trylock(&busiest->lock))) {
2658 if (busiest < this_rq) {
2659 spin_unlock(&this_rq->lock);
2660 spin_lock(&busiest->lock);
2661 spin_lock(&this_rq->lock);
2662 ret = 1;
2663 } else
2664 spin_lock(&busiest->lock);
2665 }
2666 return ret;
2667}
2668
2669
2670
2671
2672
2673
2674
2675static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2676{
2677 struct migration_req req;
2678 unsigned long flags;
2679 struct rq *rq;
2680
2681 rq = task_rq_lock(p, &flags);
2682 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2683 || unlikely(cpu_is_offline(dest_cpu)))
2684 goto out;
2685
2686
2687 if (migrate_task(p, dest_cpu, &req)) {
2688
2689 struct task_struct *mt = rq->migration_thread;
2690
2691 get_task_struct(mt);
2692 task_rq_unlock(rq, &flags);
2693 wake_up_process(mt);
2694 put_task_struct(mt);
2695 wait_for_completion(&req.done);
2696
2697 return;
2698 }
2699out:
2700 task_rq_unlock(rq, &flags);
2701}
2702
2703
2704
2705
2706
2707void sched_exec(void)
2708{
2709 int new_cpu, this_cpu = get_cpu();
2710 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2711 put_cpu();
2712 if (new_cpu != this_cpu)
2713 sched_migrate_task(current, new_cpu);
2714}
2715
2716
2717
2718
2719
/*
 * Move @p from @src_rq to @this_rq (the runqueue of @this_cpu):
 * deactivate on the source, retarget the task, activate on the
 * destination, then check whether it should preempt the task
 * currently running there.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
		      struct rq *this_rq, int this_cpu)
{
	deactivate_task(src_rq, p, 0);
	set_task_cpu(p, this_cpu);
	activate_task(this_rq, p, 0);

	/* The pulled task may deserve the CPU right away. */
	check_preempt_curr(this_rq, p);
}
2732
2733
2734
2735
2736static
2737int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2738 struct sched_domain *sd, enum cpu_idle_type idle,
2739 int *all_pinned)
2740{
2741
2742
2743
2744
2745
2746
2747 if (!cpu_isset(this_cpu, p->cpus_allowed)) {
2748 schedstat_inc(p, se.nr_failed_migrations_affine);
2749 return 0;
2750 }
2751 *all_pinned = 0;
2752
2753 if (task_running(rq, p)) {
2754 schedstat_inc(p, se.nr_failed_migrations_running);
2755 return 0;
2756 }
2757
2758
2759
2760
2761
2762
2763
2764 if (!task_hot(p, rq->clock, sd) ||
2765 sd->nr_balance_failed > sd->cache_nice_tries) {
2766#ifdef CONFIG_SCHEDSTATS
2767 if (task_hot(p, rq->clock, sd)) {
2768 schedstat_inc(sd, lb_hot_gained[idle]);
2769 schedstat_inc(p, se.nr_forced_migrations);
2770 }
2771#endif
2772 return 1;
2773 }
2774
2775 if (task_hot(p, rq->clock, sd)) {
2776 schedstat_inc(p, se.nr_failed_migrations_hot);
2777 return 0;
2778 }
2779 return 1;
2780}
2781
2782static unsigned long
2783balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2784 unsigned long max_load_move, struct sched_domain *sd,
2785 enum cpu_idle_type idle, int *all_pinned,
2786 int *this_best_prio, struct rq_iterator *iterator)
2787{
2788 int loops = 0, pulled = 0, pinned = 0, skip_for_load;
2789 struct task_struct *p;
2790 long rem_load_move = max_load_move;
2791
2792 if (max_load_move == 0)
2793 goto out;
2794
2795 pinned = 1;
2796
2797
2798
2799
2800 p = iterator->start(iterator->arg);
2801next:
2802 if (!p || loops++ > sysctl_sched_nr_migrate)
2803 goto out;
2804
2805
2806
2807
2808
2809 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2810 SCHED_LOAD_SCALE_FUZZ;
2811 if ((skip_for_load && p->prio >= *this_best_prio) ||
2812 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2813 p = iterator->next(iterator->arg);
2814 goto next;
2815 }
2816
2817 pull_task(busiest, p, this_rq, this_cpu);
2818 pulled++;
2819 rem_load_move -= p->se.load.weight;
2820
2821
2822
2823
2824 if (rem_load_move > 0) {
2825 if (p->prio < *this_best_prio)
2826 *this_best_prio = p->prio;
2827 p = iterator->next(iterator->arg);
2828 goto next;
2829 }
2830out:
2831
2832
2833
2834
2835
2836 schedstat_add(sd, lb_gained[idle], pulled);
2837
2838 if (all_pinned)
2839 *all_pinned = pinned;
2840
2841 return max_load_move - rem_load_move;
2842}
2843
2844
2845
2846
2847
2848
2849
2850
2851static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2852 unsigned long max_load_move,
2853 struct sched_domain *sd, enum cpu_idle_type idle,
2854 int *all_pinned)
2855{
2856 const struct sched_class *class = sched_class_highest;
2857 unsigned long total_load_moved = 0;
2858 int this_best_prio = this_rq->curr->prio;
2859
2860 do {
2861 total_load_moved +=
2862 class->load_balance(this_rq, this_cpu, busiest,
2863 max_load_move - total_load_moved,
2864 sd, idle, all_pinned, &this_best_prio);
2865 class = class->next;
2866 } while (class && max_load_move > total_load_moved);
2867
2868 return total_load_moved > 0;
2869}
2870
2871static int
2872iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2873 struct sched_domain *sd, enum cpu_idle_type idle,
2874 struct rq_iterator *iterator)
2875{
2876 struct task_struct *p = iterator->start(iterator->arg);
2877 int pinned = 0;
2878
2879 while (p) {
2880 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2881 pull_task(busiest, p, this_rq, this_cpu);
2882
2883
2884
2885
2886
2887 schedstat_inc(sd, lb_gained[idle]);
2888
2889 return 1;
2890 }
2891 p = iterator->next(iterator->arg);
2892 }
2893
2894 return 0;
2895}
2896
2897
2898
2899
2900
2901
2902
2903
2904static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2905 struct sched_domain *sd, enum cpu_idle_type idle)
2906{
2907 const struct sched_class *class;
2908
2909 for (class = sched_class_highest; class; class = class->next)
2910 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
2911 return 1;
2912
2913 return 0;
2914}
2915
2916
2917
2918
2919
2920
2921static struct sched_group *
2922find_busiest_group(struct sched_domain *sd, int this_cpu,
2923 unsigned long *imbalance, enum cpu_idle_type idle,
2924 int *sd_idle, const cpumask_t *cpus, int *balance)
2925{
2926 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2927 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2928 unsigned long max_pull;
2929 unsigned long busiest_load_per_task, busiest_nr_running;
2930 unsigned long this_load_per_task, this_nr_running;
2931 int load_idx, group_imb = 0;
2932#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2933 int power_savings_balance = 1;
2934 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2935 unsigned long min_nr_running = ULONG_MAX;
2936 struct sched_group *group_min = NULL, *group_leader = NULL;
2937#endif
2938
2939 max_load = this_load = total_load = total_pwr = 0;
2940 busiest_load_per_task = busiest_nr_running = 0;
2941 this_load_per_task = this_nr_running = 0;
2942 if (idle == CPU_NOT_IDLE)
2943 load_idx = sd->busy_idx;
2944 else if (idle == CPU_NEWLY_IDLE)
2945 load_idx = sd->newidle_idx;
2946 else
2947 load_idx = sd->idle_idx;
2948
2949 do {
2950 unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
2951 int local_group;
2952 int i;
2953 int __group_imb = 0;
2954 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2955 unsigned long sum_nr_running, sum_weighted_load;
2956
2957 local_group = cpu_isset(this_cpu, group->cpumask);
2958
2959 if (local_group)
2960 balance_cpu = first_cpu(group->cpumask);
2961
2962
2963 sum_weighted_load = sum_nr_running = avg_load = 0;
2964 max_cpu_load = 0;
2965 min_cpu_load = ~0UL;
2966
2967 for_each_cpu_mask(i, group->cpumask) {
2968 struct rq *rq;
2969
2970 if (!cpu_isset(i, *cpus))
2971 continue;
2972
2973 rq = cpu_rq(i);
2974
2975 if (*sd_idle && rq->nr_running)
2976 *sd_idle = 0;
2977
2978
2979 if (local_group) {
2980 if (idle_cpu(i) && !first_idle_cpu) {
2981 first_idle_cpu = 1;
2982 balance_cpu = i;
2983 }
2984
2985 load = target_load(i, load_idx);
2986 } else {
2987 load = source_load(i, load_idx);
2988 if (load > max_cpu_load)
2989 max_cpu_load = load;
2990 if (min_cpu_load > load)
2991 min_cpu_load = load;
2992 }
2993
2994 avg_load += load;
2995 sum_nr_running += rq->nr_running;
2996 sum_weighted_load += weighted_cpuload(i);
2997 }
2998
2999
3000
3001
3002
3003
3004
3005 if (idle != CPU_NEWLY_IDLE && local_group &&
3006 balance_cpu != this_cpu && balance) {
3007 *balance = 0;
3008 goto ret;
3009 }
3010
3011 total_load += avg_load;
3012 total_pwr += group->__cpu_power;
3013
3014
3015 avg_load = sg_div_cpu_power(group,
3016 avg_load * SCHED_LOAD_SCALE);
3017
3018 if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
3019 __group_imb = 1;
3020
3021 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3022
3023 if (local_group) {
3024 this_load = avg_load;
3025 this = group;
3026 this_nr_running = sum_nr_running;
3027 this_load_per_task = sum_weighted_load;
3028 } else if (avg_load > max_load &&
3029 (sum_nr_running > group_capacity || __group_imb)) {
3030 max_load = avg_load;
3031 busiest = group;
3032 busiest_nr_running = sum_nr_running;
3033 busiest_load_per_task = sum_weighted_load;
3034 group_imb = __group_imb;
3035 }
3036
3037#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3038
3039
3040
3041
3042 if (idle == CPU_NOT_IDLE ||
3043 !(sd->flags & SD_POWERSAVINGS_BALANCE))
3044 goto group_next;
3045
3046
3047
3048
3049
3050 if (local_group && (this_nr_running >= group_capacity ||
3051 !this_nr_running))
3052 power_savings_balance = 0;
3053
3054
3055
3056
3057
3058 if (!power_savings_balance || sum_nr_running >= group_capacity
3059 || !sum_nr_running)
3060 goto group_next;
3061
3062
3063
3064
3065
3066
3067 if ((sum_nr_running < min_nr_running) ||
3068 (sum_nr_running == min_nr_running &&
3069 first_cpu(group->cpumask) <
3070 first_cpu(group_min->cpumask))) {
3071 group_min = group;
3072 min_nr_running = sum_nr_running;
3073 min_load_per_task = sum_weighted_load /
3074 sum_nr_running;
3075 }
3076
3077
3078
3079
3080
3081
3082 if (sum_nr_running <= group_capacity - 1) {
3083 if (sum_nr_running > leader_nr_running ||
3084 (sum_nr_running == leader_nr_running &&
3085 first_cpu(group->cpumask) >
3086 first_cpu(group_leader->cpumask))) {
3087 group_leader = group;
3088 leader_nr_running = sum_nr_running;
3089 }
3090 }
3091group_next:
3092#endif
3093 group = group->next;
3094 } while (group != sd->groups);
3095
3096 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
3097 goto out_balanced;
3098
3099 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
3100
3101 if (this_load >= avg_load ||
3102 100*max_load <= sd->imbalance_pct*this_load)
3103 goto out_balanced;
3104
3105 busiest_load_per_task /= busiest_nr_running;
3106 if (group_imb)
3107 busiest_load_per_task = min(busiest_load_per_task, avg_load);
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120 if (max_load <= busiest_load_per_task)
3121 goto out_balanced;
3122
3123
3124
3125
3126
3127
3128 if (max_load < avg_load) {
3129 *imbalance = 0;
3130 goto small_imbalance;
3131 }
3132
3133
3134 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
3135
3136
3137 *imbalance = min(max_pull * busiest->__cpu_power,
3138 (avg_load - this_load) * this->__cpu_power)
3139 / SCHED_LOAD_SCALE;
3140
3141
3142
3143
3144
3145
3146
3147 if (*imbalance < busiest_load_per_task) {
3148 unsigned long tmp, pwr_now, pwr_move;
3149 unsigned int imbn;
3150
3151small_imbalance:
3152 pwr_move = pwr_now = 0;
3153 imbn = 2;
3154 if (this_nr_running) {
3155 this_load_per_task /= this_nr_running;
3156 if (busiest_load_per_task > this_load_per_task)
3157 imbn = 1;
3158 } else
3159 this_load_per_task = SCHED_LOAD_SCALE;
3160
3161 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
3162 busiest_load_per_task * imbn) {
3163 *imbalance = busiest_load_per_task;
3164 return busiest;
3165 }
3166
3167
3168
3169
3170
3171
3172
3173 pwr_now += busiest->__cpu_power *
3174 min(busiest_load_per_task, max_load);
3175 pwr_now += this->__cpu_power *
3176 min(this_load_per_task, this_load);
3177 pwr_now /= SCHED_LOAD_SCALE;
3178
3179
3180 tmp = sg_div_cpu_power(busiest,
3181 busiest_load_per_task * SCHED_LOAD_SCALE);
3182 if (max_load > tmp)
3183 pwr_move += busiest->__cpu_power *
3184 min(busiest_load_per_task, max_load - tmp);
3185
3186
3187 if (max_load * busiest->__cpu_power <
3188 busiest_load_per_task * SCHED_LOAD_SCALE)
3189 tmp = sg_div_cpu_power(this,
3190 max_load * busiest->__cpu_power);
3191 else
3192 tmp = sg_div_cpu_power(this,
3193 busiest_load_per_task * SCHED_LOAD_SCALE);
3194 pwr_move += this->__cpu_power *
3195 min(this_load_per_task, this_load + tmp);
3196 pwr_move /= SCHED_LOAD_SCALE;
3197
3198
3199 if (pwr_move > pwr_now)
3200 *imbalance = busiest_load_per_task;
3201 }
3202
3203 return busiest;
3204
3205out_balanced:
3206#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3207 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3208 goto ret;
3209
3210 if (this == group_leader && group_leader != group_min) {
3211 *imbalance = min_load_per_task;
3212 return group_min;
3213 }
3214#endif
3215ret:
3216 *imbalance = 0;
3217 return NULL;
3218}
3219
3220
3221
3222
3223static struct rq *
3224find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3225 unsigned long imbalance, const cpumask_t *cpus)
3226{
3227 struct rq *busiest = NULL, *rq;
3228 unsigned long max_load = 0;
3229 int i;
3230
3231 for_each_cpu_mask(i, group->cpumask) {
3232 unsigned long wl;
3233
3234 if (!cpu_isset(i, *cpus))
3235 continue;
3236
3237 rq = cpu_rq(i);
3238 wl = weighted_cpuload(i);
3239
3240 if (rq->nr_running == 1 && wl > imbalance)
3241 continue;
3242
3243 if (wl > max_load) {
3244 max_load = wl;
3245 busiest = rq;
3246 }
3247 }
3248
3249 return busiest;
3250}
3251
3252
3253
3254
3255
/*
 * Ceiling that load_balance() compares sd->balance_interval against before
 * doubling it on the path taken when the tasks it looked at were pinned.
 */
#define MAX_PINNED_INTERVAL 512
3257
3258
3259
3260
3261
3262static int load_balance(int this_cpu, struct rq *this_rq,
3263 struct sched_domain *sd, enum cpu_idle_type idle,
3264 int *balance, cpumask_t *cpus)
3265{
3266 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3267 struct sched_group *group;
3268 unsigned long imbalance;
3269 struct rq *busiest;
3270 unsigned long flags;
3271
3272 cpus_setall(*cpus);
3273
3274
3275
3276
3277
3278
3279
3280 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
3281 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3282 sd_idle = 1;
3283
3284 schedstat_inc(sd, lb_count[idle]);
3285
3286redo:
3287 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3288 cpus, balance);
3289
3290 if (*balance == 0)
3291 goto out_balanced;
3292
3293 if (!group) {
3294 schedstat_inc(sd, lb_nobusyg[idle]);
3295 goto out_balanced;
3296 }
3297
3298 busiest = find_busiest_queue(group, idle, imbalance, cpus);
3299 if (!busiest) {
3300 schedstat_inc(sd, lb_nobusyq[idle]);
3301 goto out_balanced;
3302 }
3303
3304 BUG_ON(busiest == this_rq);
3305
3306 schedstat_add(sd, lb_imbalance[idle], imbalance);
3307
3308 ld_moved = 0;
3309 if (busiest->nr_running > 1) {
3310
3311
3312
3313
3314
3315
3316 local_irq_save(flags);
3317 double_rq_lock(this_rq, busiest);
3318 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3319 imbalance, sd, idle, &all_pinned);
3320 double_rq_unlock(this_rq, busiest);
3321 local_irq_restore(flags);
3322
3323
3324
3325
3326 if (ld_moved && this_cpu != smp_processor_id())
3327 resched_cpu(this_cpu);
3328
3329
3330 if (unlikely(all_pinned)) {
3331 cpu_clear(cpu_of(busiest), *cpus);
3332 if (!cpus_empty(*cpus))
3333 goto redo;
3334 goto out_balanced;
3335 }
3336 }
3337
3338 if (!ld_moved) {
3339 schedstat_inc(sd, lb_failed[idle]);
3340 sd->nr_balance_failed++;
3341
3342 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
3343
3344 spin_lock_irqsave(&busiest->lock, flags);
3345
3346
3347
3348
3349 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
3350 spin_unlock_irqrestore(&busiest->lock, flags);
3351 all_pinned = 1;
3352 goto out_one_pinned;
3353 }
3354
3355 if (!busiest->active_balance) {
3356 busiest->active_balance = 1;
3357 busiest->push_cpu = this_cpu;
3358 active_balance = 1;
3359 }
3360 spin_unlock_irqrestore(&busiest->lock, flags);
3361 if (active_balance)
3362 wake_up_process(busiest->migration_thread);
3363
3364
3365
3366
3367
3368 sd->nr_balance_failed = sd->cache_nice_tries+1;
3369 }
3370 } else
3371 sd->nr_balance_failed = 0;
3372
3373 if (likely(!active_balance)) {
3374
3375 sd->balance_interval = sd->min_interval;
3376 } else {
3377
3378
3379
3380
3381
3382
3383 if (sd->balance_interval < sd->max_interval)
3384 sd->balance_interval *= 2;
3385 }
3386
3387 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3388 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3389 return -1;
3390 return ld_moved;
3391
3392out_balanced:
3393 schedstat_inc(sd, lb_balanced[idle]);
3394
3395 sd->nr_balance_failed = 0;
3396
3397out_one_pinned:
3398
3399 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3400 (sd->balance_interval < sd->max_interval))
3401 sd->balance_interval *= 2;
3402
3403 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3404 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3405 return -1;
3406 return 0;
3407}
3408
3409
3410
3411
3412
3413
3414
3415
3416static int
3417load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3418 cpumask_t *cpus)
3419{
3420 struct sched_group *group;
3421 struct rq *busiest = NULL;
3422 unsigned long imbalance;
3423 int ld_moved = 0;
3424 int sd_idle = 0;
3425 int all_pinned = 0;
3426
3427 cpus_setall(*cpus);
3428
3429
3430
3431
3432
3433
3434
3435 if (sd->flags & SD_SHARE_CPUPOWER &&
3436 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3437 sd_idle = 1;
3438
3439 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
3440redo:
3441 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
3442 &sd_idle, cpus, NULL);
3443 if (!group) {
3444 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
3445 goto out_balanced;
3446 }
3447
3448 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
3449 if (!busiest) {
3450 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
3451 goto out_balanced;
3452 }
3453
3454 BUG_ON(busiest == this_rq);
3455
3456 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
3457
3458 ld_moved = 0;
3459 if (busiest->nr_running > 1) {
3460
3461 double_lock_balance(this_rq, busiest);
3462
3463 update_rq_clock(busiest);
3464 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3465 imbalance, sd, CPU_NEWLY_IDLE,
3466 &all_pinned);
3467 spin_unlock(&busiest->lock);
3468
3469 if (unlikely(all_pinned)) {
3470 cpu_clear(cpu_of(busiest), *cpus);
3471 if (!cpus_empty(*cpus))
3472 goto redo;
3473 }
3474 }
3475
3476 if (!ld_moved) {
3477 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
3478 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3479 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3480 return -1;
3481 } else
3482 sd->nr_balance_failed = 0;
3483
3484 return ld_moved;
3485
3486out_balanced:
3487 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
3488 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3489 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3490 return -1;
3491 sd->nr_balance_failed = 0;
3492
3493 return 0;
3494}
3495
3496
3497
3498
3499
3500static void idle_balance(int this_cpu, struct rq *this_rq)
3501{
3502 struct sched_domain *sd;
3503 int pulled_task = -1;
3504 unsigned long next_balance = jiffies + HZ;
3505 cpumask_t tmpmask;
3506
3507 for_each_domain(this_cpu, sd) {
3508 unsigned long interval;
3509
3510 if (!(sd->flags & SD_LOAD_BALANCE))
3511 continue;
3512
3513 if (sd->flags & SD_BALANCE_NEWIDLE)
3514
3515 pulled_task = load_balance_newidle(this_cpu, this_rq,
3516 sd, &tmpmask);
3517
3518 interval = msecs_to_jiffies(sd->balance_interval);
3519 if (time_after(next_balance, sd->last_balance + interval))
3520 next_balance = sd->last_balance + interval;
3521 if (pulled_task)
3522 break;
3523 }
3524 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3525
3526
3527
3528
3529 this_rq->next_balance = next_balance;
3530 }
3531}
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3542{
3543 int target_cpu = busiest_rq->push_cpu;
3544 struct sched_domain *sd;
3545 struct rq *target_rq;
3546
3547
3548 if (busiest_rq->nr_running <= 1)
3549 return;
3550
3551 target_rq = cpu_rq(target_cpu);
3552
3553
3554
3555
3556
3557
3558 BUG_ON(busiest_rq == target_rq);
3559
3560
3561 double_lock_balance(busiest_rq, target_rq);
3562 update_rq_clock(busiest_rq);
3563 update_rq_clock(target_rq);
3564
3565
3566 for_each_domain(target_cpu, sd) {
3567 if ((sd->flags & SD_LOAD_BALANCE) &&
3568 cpu_isset(busiest_cpu, sd->span))
3569 break;
3570 }
3571
3572 if (likely(sd)) {
3573 schedstat_inc(sd, alb_count);
3574
3575 if (move_one_task(target_rq, target_cpu, busiest_rq,
3576 sd, CPU_IDLE))
3577 schedstat_inc(sd, alb_pushed);
3578 else
3579 schedstat_inc(sd, alb_failed);
3580 }
3581 spin_unlock(&target_rq->lock);
3582}
3583
3584#ifdef CONFIG_NO_HZ
/*
 * State shared by the CONFIG_NO_HZ idle load balancing code:
 *  - load_balancer: CPU number currently acting as the idle load balancer,
 *    or -1 when there is none (initial value).
 *  - cpu_mask: CPUs that registered through select_nohz_load_balancer()
 *    with a stopped tick (initially empty).
 */
static struct {
	atomic_t load_balancer;
	cpumask_t cpu_mask;
} nohz ____cacheline_aligned = {
	.load_balancer = ATOMIC_INIT(-1),
	.cpu_mask = CPU_MASK_NONE,
};
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613int select_nohz_load_balancer(int stop_tick)
3614{
3615 int cpu = smp_processor_id();
3616
3617 if (stop_tick) {
3618 cpu_set(cpu, nohz.cpu_mask);
3619 cpu_rq(cpu)->in_nohz_recently = 1;
3620
3621
3622
3623
3624 if (cpu_is_offline(cpu) &&
3625 atomic_read(&nohz.load_balancer) == cpu) {
3626 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3627 BUG();
3628 return 0;
3629 }
3630
3631
3632 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3633 if (atomic_read(&nohz.load_balancer) == cpu)
3634 atomic_set(&nohz.load_balancer, -1);
3635 return 0;
3636 }
3637
3638 if (atomic_read(&nohz.load_balancer) == -1) {
3639
3640 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3641 return 1;
3642 } else if (atomic_read(&nohz.load_balancer) == cpu)
3643 return 1;
3644 } else {
3645 if (!cpu_isset(cpu, nohz.cpu_mask))
3646 return 0;
3647
3648 cpu_clear(cpu, nohz.cpu_mask);
3649
3650 if (atomic_read(&nohz.load_balancer) == cpu)
3651 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3652 BUG();
3653 }
3654 return 0;
3655}
3656#endif
3657
/*
 * Lock that rebalance_domains() try-locks around load_balance() for
 * domains that have SD_SERIALIZE set.
 */
static DEFINE_SPINLOCK(balancing);
3659
3660
3661
3662
3663
3664
3665
3666static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3667{
3668 int balance = 1;
3669 struct rq *rq = cpu_rq(cpu);
3670 unsigned long interval;
3671 struct sched_domain *sd;
3672
3673 unsigned long next_balance = jiffies + 60*HZ;
3674 int update_next_balance = 0;
3675 cpumask_t tmp;
3676
3677 for_each_domain(cpu, sd) {
3678 if (!(sd->flags & SD_LOAD_BALANCE))
3679 continue;
3680
3681 interval = sd->balance_interval;
3682 if (idle != CPU_IDLE)
3683 interval *= sd->busy_factor;
3684
3685
3686 interval = msecs_to_jiffies(interval);
3687 if (unlikely(!interval))
3688 interval = 1;
3689 if (interval > HZ*NR_CPUS/10)
3690 interval = HZ*NR_CPUS/10;
3691
3692
3693 if (sd->flags & SD_SERIALIZE) {
3694 if (!spin_trylock(&balancing))
3695 goto out;
3696 }
3697
3698 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3699 if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
3700
3701
3702
3703
3704
3705 idle = CPU_NOT_IDLE;
3706 }
3707 sd->last_balance = jiffies;
3708 }
3709 if (sd->flags & SD_SERIALIZE)
3710 spin_unlock(&balancing);
3711out:
3712 if (time_after(next_balance, sd->last_balance + interval)) {
3713 next_balance = sd->last_balance + interval;
3714 update_next_balance = 1;
3715 }
3716
3717
3718
3719
3720
3721
3722 if (!balance)
3723 break;
3724 }
3725
3726
3727
3728
3729
3730
3731 if (likely(update_next_balance))
3732 rq->next_balance = next_balance;
3733}
3734
3735
3736
3737
3738
3739
3740static void run_rebalance_domains(struct softirq_action *h)
3741{
3742 int this_cpu = smp_processor_id();
3743 struct rq *this_rq = cpu_rq(this_cpu);
3744 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3745 CPU_IDLE : CPU_NOT_IDLE;
3746
3747 rebalance_domains(this_cpu, idle);
3748
3749#ifdef CONFIG_NO_HZ
3750
3751
3752
3753
3754
3755 if (this_rq->idle_at_tick &&
3756 atomic_read(&nohz.load_balancer) == this_cpu) {
3757 cpumask_t cpus = nohz.cpu_mask;
3758 struct rq *rq;
3759 int balance_cpu;
3760
3761 cpu_clear(this_cpu, cpus);
3762 for_each_cpu_mask(balance_cpu, cpus) {
3763
3764
3765
3766
3767
3768 if (need_resched())
3769 break;
3770
3771 rebalance_domains(balance_cpu, CPU_IDLE);
3772
3773 rq = cpu_rq(balance_cpu);
3774 if (time_after(this_rq->next_balance, rq->next_balance))
3775 this_rq->next_balance = rq->next_balance;
3776 }
3777 }
3778#endif
3779}
3780
3781
3782
3783
3784
3785
3786
3787
3788static inline void trigger_load_balance(struct rq *rq, int cpu)
3789{
3790#ifdef CONFIG_NO_HZ
3791
3792
3793
3794
3795
3796 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3797 rq->in_nohz_recently = 0;
3798
3799 if (atomic_read(&nohz.load_balancer) == cpu) {
3800 cpu_clear(cpu, nohz.cpu_mask);
3801 atomic_set(&nohz.load_balancer, -1);
3802 }
3803
3804 if (atomic_read(&nohz.load_balancer) == -1) {
3805
3806
3807
3808
3809
3810
3811
3812
3813 int ilb = first_cpu(nohz.cpu_mask);
3814
3815 if (ilb < nr_cpu_ids)
3816 resched_cpu(ilb);
3817 }
3818 }
3819
3820
3821
3822
3823
3824 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3825 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3826 resched_cpu(cpu);
3827 return;
3828 }
3829
3830
3831
3832
3833
3834 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3835 cpu_isset(cpu, nohz.cpu_mask))
3836 return;
3837#endif
3838 if (time_after_eq(jiffies, rq->next_balance))
3839 raise_softirq(SCHED_SOFTIRQ);
3840}
3841
3842#else
3843
3844
3845
3846
/*
 * No-op idle_balance() for the #else branch of the enclosing conditional
 * (presumably the non-SMP build -- the opening #if is not visible here).
 */
static inline void idle_balance(int cpu, struct rq *rq)
{
}
3850
3851#endif
3852
/* Per-CPU kernel statistics; the accounting helpers below update its cpustat. */
DEFINE_PER_CPU(struct kernel_stat, kstat);

EXPORT_PER_CPU_SYMBOL(kstat);
3856
3857
3858
3859
3860
3861unsigned long long task_sched_runtime(struct task_struct *p)
3862{
3863 unsigned long flags;
3864 u64 ns, delta_exec;
3865 struct rq *rq;
3866
3867 rq = task_rq_lock(p, &flags);
3868 ns = p->se.sum_exec_runtime;
3869 if (task_current(rq, p)) {
3870 update_rq_clock(rq);
3871 delta_exec = rq->clock - p->se.exec_start;
3872 if ((s64)delta_exec > 0)
3873 ns += delta_exec;
3874 }
3875 task_rq_unlock(rq, &flags);
3876
3877 return ns;
3878}
3879
3880
3881
3882
3883
3884
3885void account_user_time(struct task_struct *p, cputime_t cputime)
3886{
3887 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3888 cputime64_t tmp;
3889
3890 p->utime = cputime_add(p->utime, cputime);
3891
3892
3893 tmp = cputime_to_cputime64(cputime);
3894 if (TASK_NICE(p) > 0)
3895 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3896 else
3897 cpustat->user = cputime64_add(cpustat->user, tmp);
3898}
3899
3900
3901
3902
3903
3904
3905static void account_guest_time(struct task_struct *p, cputime_t cputime)
3906{
3907 cputime64_t tmp;
3908 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3909
3910 tmp = cputime_to_cputime64(cputime);
3911
3912 p->utime = cputime_add(p->utime, cputime);
3913 p->gtime = cputime_add(p->gtime, cputime);
3914
3915 cpustat->user = cputime64_add(cpustat->user, tmp);
3916 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3917}
3918
3919
3920
3921
3922
3923
3924void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
3925{
3926 p->utimescaled = cputime_add(p->utimescaled, cputime);
3927}
3928
3929
3930
3931
3932
3933
3934
3935void account_system_time(struct task_struct *p, int hardirq_offset,
3936 cputime_t cputime)
3937{
3938 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3939 struct rq *rq = this_rq();
3940 cputime64_t tmp;
3941
3942 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3943 account_guest_time(p, cputime);
3944 return;
3945 }
3946
3947 p->stime = cputime_add(p->stime, cputime);
3948
3949
3950 tmp = cputime_to_cputime64(cputime);
3951 if (hardirq_count() - hardirq_offset)
3952 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3953 else if (softirq_count())
3954 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3955 else if (p != rq->idle)
3956 cpustat->system = cputime64_add(cpustat->system, tmp);
3957 else if (atomic_read(&rq->nr_iowait) > 0)
3958 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3959 else
3960 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3961
3962 acct_update_integrals(p);
3963}
3964
3965
3966
3967
3968
3969
3970
3971void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
3972{
3973 p->stimescaled = cputime_add(p->stimescaled, cputime);
3974}
3975
3976
3977
3978
3979
3980
3981void account_steal_time(struct task_struct *p, cputime_t steal)
3982{
3983 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3984 cputime64_t tmp = cputime_to_cputime64(steal);
3985 struct rq *rq = this_rq();
3986
3987 if (p == rq->idle) {
3988 p->stime = cputime_add(p->stime, steal);
3989 if (atomic_read(&rq->nr_iowait) > 0)
3990 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3991 else
3992 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3993 } else
3994 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3995}
3996
3997
3998
3999
4000#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4001cputime_t task_utime(struct task_struct *p)
4002{
4003 return p->utime;
4004}
4005
4006cputime_t task_stime(struct task_struct *p)
4007{
4008 return p->stime;
4009}
4010#else
4011cputime_t task_utime(struct task_struct *p)
4012{
4013 clock_t utime = cputime_to_clock_t(p->utime),
4014 total = utime + cputime_to_clock_t(p->stime);
4015 u64 temp;
4016
4017
4018
4019
4020 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
4021
4022 if (total) {
4023 temp *= utime;
4024 do_div(temp, total);
4025 }
4026 utime = (clock_t)temp;
4027
4028 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
4029 return p->prev_utime;
4030}
4031
4032cputime_t task_stime(struct task_struct *p)
4033{
4034 clock_t stime;
4035
4036
4037
4038
4039
4040
4041 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
4042 cputime_to_clock_t(task_utime(p));
4043
4044 if (stime >= 0)
4045 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
4046
4047 return p->prev_stime;
4048}
4049#endif
4050
4051inline cputime_t task_gtime(struct task_struct *p)
4052{
4053 return p->gtime;
4054}
4055
4056
4057
4058
4059
4060
4061
4062
4063void scheduler_tick(void)
4064{
4065 int cpu = smp_processor_id();
4066 struct rq *rq = cpu_rq(cpu);
4067 struct task_struct *curr = rq->curr;
4068
4069 sched_clock_tick();
4070
4071 spin_lock(&rq->lock);
4072 update_rq_clock(rq);
4073 update_cpu_load(rq);
4074 curr->sched_class->task_tick(rq, curr, 0);
4075 spin_unlock(&rq->lock);
4076
4077#ifdef CONFIG_SMP
4078 rq->idle_at_tick = idle_cpu(cpu);
4079 trigger_load_balance(rq, cpu);
4080#endif
4081}
4082
4083#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
4084
4085void __kprobes add_preempt_count(int val)
4086{
4087
4088
4089
4090 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4091 return;
4092 preempt_count() += val;
4093
4094
4095
4096 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4097 PREEMPT_MASK - 10);
4098}
4099EXPORT_SYMBOL(add_preempt_count);
4100
4101void __kprobes sub_preempt_count(int val)
4102{
4103
4104
4105
4106 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4107 return;
4108
4109
4110
4111 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4112 !(preempt_count() & PREEMPT_MASK)))
4113 return;
4114
4115 preempt_count() -= val;
4116}
4117EXPORT_SYMBOL(sub_preempt_count);
4118
4119#endif
4120
4121
4122
4123
4124static noinline void __schedule_bug(struct task_struct *prev)
4125{
4126 struct pt_regs *regs = get_irq_regs();
4127
4128 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4129 prev->comm, prev->pid, preempt_count());
4130
4131 debug_show_held_locks(prev);
4132 if (irqs_disabled())
4133 print_irqtrace_events(prev);
4134
4135 if (regs)
4136 show_regs(regs);
4137 else
4138 dump_stack();
4139}
4140
4141
4142
4143
/*
 * schedule_debug - sanity checks and statistics performed on entry to
 * schedule() for the outgoing task @prev.
 */
static inline void schedule_debug(struct task_struct *prev)
{
	/*
	 * Complain when scheduling from atomic context, except for a task
	 * that already has an exit_state set.
	 */
	if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
		__schedule_bug(prev);

	/* Record the caller of schedule() for scheduler profiling. */
	profile_hit(SCHED_PROFILING, __builtin_return_address(0));

	schedstat_inc(this_rq(), sched_count);
#ifdef CONFIG_SCHEDSTATS
	/* lock_depth >= 0: count it in the bkl_count statistics. */
	if (unlikely(prev->lock_depth >= 0)) {
		schedstat_inc(this_rq(), bkl_count);
		schedstat_inc(prev, sched_info.bkl_count);
	}
#endif
}
4164
4165
4166
4167
4168static inline struct task_struct *
4169pick_next_task(struct rq *rq, struct task_struct *prev)
4170{
4171 const struct sched_class *class;
4172 struct task_struct *p;
4173
4174
4175
4176
4177
4178 if (likely(rq->nr_running == rq->cfs.nr_running)) {
4179 p = fair_sched_class.pick_next_task(rq);
4180 if (likely(p))
4181 return p;
4182 }
4183
4184 class = sched_class_highest;
4185 for ( ; ; ) {
4186 p = class->pick_next_task(rq);
4187 if (p)
4188 return p;
4189
4190
4191
4192
4193 class = class->next;
4194 }
4195}
4196
4197
4198
4199
/*
 * schedule - the main scheduler entry point: put the current task away if
 * it is no longer runnable, pick the next task for this CPU and switch to
 * it.
 */
asmlinkage void __sched schedule(void)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	struct rq *rq;
	int cpu;

need_resched:
	preempt_disable();
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	rcu_qsctr_inc(cpu);
	prev = rq->curr;
	/* Default to the involuntary counter; switched to nvcsw below. */
	switch_count = &prev->nivcsw;

	release_kernel_lock(prev);
need_resched_nonpreemptible:

	schedule_debug(prev);

	hrtick_clear(rq);

	/*
	 * Interrupts are disabled before the clock update so that the update
	 * and the lock acquisition happen without interruption.
	 */
	local_irq_disable();
	update_rq_clock(rq);
	spin_lock(&rq->lock);
	clear_tsk_need_resched(prev);

	/*
	 * prev is not in the running state and this is not a PREEMPT_ACTIVE
	 * entry: either make it runnable again because a signal is pending
	 * for its state, or take it off the runqueue. Either way the switch
	 * is counted in nvcsw.
	 */
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		if (unlikely(signal_pending_state(prev->state, prev)))
			prev->state = TASK_RUNNING;
		else
			deactivate_task(rq, prev, 1);
		switch_count = &prev->nvcsw;
	}

#ifdef CONFIG_SMP
	/* Optional per-class hook run before the next task is picked. */
	if (prev->sched_class->pre_schedule)
		prev->sched_class->pre_schedule(rq, prev);
#endif

	/* Nothing runnable here: try to pull work from other CPUs. */
	if (unlikely(!rq->nr_running))
		idle_balance(cpu, rq);

	prev->sched_class->put_prev_task(rq, prev);
	next = pick_next_task(rq, prev);

	if (likely(prev != next)) {
		sched_info_switch(prev, next);

		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		/*
		 * rq->lock is not released in this branch; presumably
		 * context_switch() (or what it calls) drops it -- confirm.
		 */
		context_switch(rq, prev, next);
		/*
		 * cpu and rq are re-read after the switch rather than
		 * trusted; presumably they may be stale once this task is
		 * running again -- confirm.
		 */
		cpu = smp_processor_id();
		rq = cpu_rq(cpu);
	} else
		spin_unlock_irq(&rq->lock);

	hrtick_set(rq);

	/* A negative result sends us back without redoing the top part. */
	if (unlikely(reacquire_kernel_lock(current) < 0))
		goto need_resched_nonpreemptible;

	preempt_enable_no_resched();
	/* A reschedule was requested in the meantime: go around again. */
	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
		goto need_resched;
}
EXPORT_SYMBOL(schedule);
4276
4277#ifdef CONFIG_PREEMPT
4278
4279
4280
4281
4282
4283asmlinkage void __sched preempt_schedule(void)
4284{
4285 struct thread_info *ti = current_thread_info();
4286
4287
4288
4289
4290
4291 if (likely(ti->preempt_count || irqs_disabled()))
4292 return;
4293
4294 do {
4295 add_preempt_count(PREEMPT_ACTIVE);
4296 schedule();
4297 sub_preempt_count(PREEMPT_ACTIVE);
4298
4299
4300
4301
4302
4303 barrier();
4304 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
4305}
4306EXPORT_SYMBOL(preempt_schedule);
4307
4308
4309
4310
4311
4312
4313
4314asmlinkage void __sched preempt_schedule_irq(void)
4315{
4316 struct thread_info *ti = current_thread_info();
4317
4318
4319 BUG_ON(ti->preempt_count || !irqs_disabled());
4320
4321 do {
4322 add_preempt_count(PREEMPT_ACTIVE);
4323 local_irq_enable();
4324 schedule();
4325 local_irq_disable();
4326 sub_preempt_count(PREEMPT_ACTIVE);
4327
4328
4329
4330
4331
4332 barrier();
4333 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
4334}
4335
4336#endif
4337
4338int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
4339 void *key)
4340{
4341 return try_to_wake_up(curr->private, mode, sync);
4342}
4343EXPORT_SYMBOL(default_wake_function);
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4355 int nr_exclusive, int sync, void *key)
4356{
4357 wait_queue_t *curr, *next;
4358
4359 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
4360 unsigned flags = curr->flags;
4361
4362 if (curr->func(curr, mode, sync, key) &&
4363 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
4364 break;
4365 }
4366}
4367
4368
4369
4370
4371
4372
4373
4374
4375void __wake_up(wait_queue_head_t *q, unsigned int mode,
4376 int nr_exclusive, void *key)
4377{
4378 unsigned long flags;
4379
4380 spin_lock_irqsave(&q->lock, flags);
4381 __wake_up_common(q, mode, nr_exclusive, 0, key);
4382 spin_unlock_irqrestore(&q->lock, flags);
4383}
4384EXPORT_SYMBOL(__wake_up);
4385
4386
4387
4388
4389void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4390{
4391 __wake_up_common(q, mode, 1, 0, NULL);
4392}
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407void
4408__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4409{
4410 unsigned long flags;
4411 int sync = 1;
4412
4413 if (unlikely(!q))
4414 return;
4415
4416 if (unlikely(!nr_exclusive))
4417 sync = 0;
4418
4419 spin_lock_irqsave(&q->lock, flags);
4420 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
4421 spin_unlock_irqrestore(&q->lock, flags);
4422}
4423EXPORT_SYMBOL_GPL(__wake_up_sync);
4424
4425void complete(struct completion *x)
4426{
4427 unsigned long flags;
4428
4429 spin_lock_irqsave(&x->wait.lock, flags);
4430 x->done++;
4431 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
4432 spin_unlock_irqrestore(&x->wait.lock, flags);
4433}
4434EXPORT_SYMBOL(complete);
4435
4436void complete_all(struct completion *x)
4437{
4438 unsigned long flags;
4439
4440 spin_lock_irqsave(&x->wait.lock, flags);
4441 x->done += UINT_MAX/2;
4442 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
4443 spin_unlock_irqrestore(&x->wait.lock, flags);
4444}
4445EXPORT_SYMBOL(complete_all);
4446
4447static inline long __sched
4448do_wait_for_common(struct completion *x, long timeout, int state)
4449{
4450 if (!x->done) {
4451 DECLARE_WAITQUEUE(wait, current);
4452
4453 wait.flags |= WQ_FLAG_EXCLUSIVE;
4454 __add_wait_queue_tail(&x->wait, &wait);
4455 do {
4456 if ((state == TASK_INTERRUPTIBLE &&
4457 signal_pending(current)) ||
4458 (state == TASK_KILLABLE &&
4459 fatal_signal_pending(current))) {
4460 timeout = -ERESTARTSYS;
4461 break;
4462 }
4463 __set_current_state(state);
4464 spin_unlock_irq(&x->wait.lock);
4465 timeout = schedule_timeout(timeout);
4466 spin_lock_irq(&x->wait.lock);
4467 } while (!x->done && timeout);
4468 __remove_wait_queue(&x->wait, &wait);
4469 if (!x->done)
4470 return timeout;
4471 }
4472 x->done--;
4473 return timeout ?: 1;
4474}
4475
4476static long __sched
4477wait_for_common(struct completion *x, long timeout, int state)
4478{
4479 might_sleep();
4480
4481 spin_lock_irq(&x->wait.lock);
4482 timeout = do_wait_for_common(x, timeout, state);
4483 spin_unlock_irq(&x->wait.lock);
4484 return timeout;
4485}
4486
4487void __sched wait_for_completion(struct completion *x)
4488{
4489 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4490}
4491EXPORT_SYMBOL(wait_for_completion);
4492
4493unsigned long __sched
4494wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4495{
4496 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4497}
4498EXPORT_SYMBOL(wait_for_completion_timeout);
4499
4500int __sched wait_for_completion_interruptible(struct completion *x)
4501{
4502 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4503 if (t == -ERESTARTSYS)
4504 return t;
4505 return 0;
4506}
4507EXPORT_SYMBOL(wait_for_completion_interruptible);
4508
4509unsigned long __sched
4510wait_for_completion_interruptible_timeout(struct completion *x,
4511 unsigned long timeout)
4512{
4513 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4514}
4515EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4516
4517int __sched wait_for_completion_killable(struct completion *x)
4518{
4519 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4520 if (t == -ERESTARTSYS)
4521 return t;
4522 return 0;
4523}
4524EXPORT_SYMBOL(wait_for_completion_killable);
4525
4526static long __sched
4527sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4528{
4529 unsigned long flags;
4530 wait_queue_t wait;
4531
4532 init_waitqueue_entry(&wait, current);
4533
4534 __set_current_state(state);
4535
4536 spin_lock_irqsave(&q->lock, flags);
4537 __add_wait_queue(q, &wait);
4538 spin_unlock(&q->lock);
4539 timeout = schedule_timeout(timeout);
4540 spin_lock_irq(&q->lock);
4541 __remove_wait_queue(q, &wait);
4542 spin_unlock_irqrestore(&q->lock, flags);
4543
4544 return timeout;
4545}
4546
4547void __sched interruptible_sleep_on(wait_queue_head_t *q)
4548{
4549 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4550}
4551EXPORT_SYMBOL(interruptible_sleep_on);
4552
4553long __sched
4554interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4555{
4556 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4557}
4558EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4559
4560void __sched sleep_on(wait_queue_head_t *q)
4561{
4562 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4563}
4564EXPORT_SYMBOL(sleep_on);
4565
4566long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4567{
4568 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4569}
4570EXPORT_SYMBOL(sleep_on_timeout);
4571
4572#ifdef CONFIG_RT_MUTEXES
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584void rt_mutex_setprio(struct task_struct *p, int prio)
4585{
4586 unsigned long flags;
4587 int oldprio, on_rq, running;
4588 struct rq *rq;
4589 const struct sched_class *prev_class = p->sched_class;
4590
4591 BUG_ON(prio < 0 || prio > MAX_PRIO);
4592
4593 rq = task_rq_lock(p, &flags);
4594 update_rq_clock(rq);
4595
4596 oldprio = p->prio;
4597 on_rq = p->se.on_rq;
4598 running = task_current(rq, p);
4599 if (on_rq)
4600 dequeue_task(rq, p, 0);
4601 if (running)
4602 p->sched_class->put_prev_task(rq, p);
4603
4604 if (rt_prio(prio))
4605 p->sched_class = &rt_sched_class;
4606 else
4607 p->sched_class = &fair_sched_class;
4608
4609 p->prio = prio;
4610
4611 if (running)
4612 p->sched_class->set_curr_task(rq);
4613 if (on_rq) {
4614 enqueue_task(rq, p, 0);
4615
4616 check_class_changed(rq, p, prev_class, oldprio, running);
4617 }
4618 task_rq_unlock(rq, &flags);
4619}
4620
4621#endif
4622
4623void set_user_nice(struct task_struct *p, long nice)
4624{
4625 int old_prio, delta, on_rq;
4626 unsigned long flags;
4627 struct rq *rq;
4628
4629 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4630 return;
4631
4632
4633
4634
4635 rq = task_rq_lock(p, &flags);
4636 update_rq_clock(rq);
4637
4638
4639
4640
4641
4642
4643 if (task_has_rt_policy(p)) {
4644 p->static_prio = NICE_TO_PRIO(nice);
4645 goto out_unlock;
4646 }
4647 on_rq = p->se.on_rq;
4648 if (on_rq) {
4649 dequeue_task(rq, p, 0);
4650 dec_load(rq, p);
4651 }
4652
4653 p->static_prio = NICE_TO_PRIO(nice);
4654 set_load_weight(p);
4655 old_prio = p->prio;
4656 p->prio = effective_prio(p);
4657 delta = p->prio - old_prio;
4658
4659 if (on_rq) {
4660 enqueue_task(rq, p, 0);
4661 inc_load(rq, p);
4662
4663
4664
4665
4666 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4667 resched_task(rq->curr);
4668 }
4669out_unlock:
4670 task_rq_unlock(rq, &flags);
4671}
4672EXPORT_SYMBOL(set_user_nice);
4673
4674
4675
4676
4677
4678
4679int can_nice(const struct task_struct *p, const int nice)
4680{
4681
4682 int nice_rlim = 20 - nice;
4683
4684 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
4685 capable(CAP_SYS_NICE));
4686}
4687
4688#ifdef __ARCH_WANT_SYS_NICE
4689
4690
4691
4692
4693
4694
4695
4696
4697asmlinkage long sys_nice(int increment)
4698{
4699 long nice, retval;
4700
4701
4702
4703
4704
4705
4706 if (increment < -40)
4707 increment = -40;
4708 if (increment > 40)
4709 increment = 40;
4710
4711 nice = PRIO_TO_NICE(current->static_prio) + increment;
4712 if (nice < -20)
4713 nice = -20;
4714 if (nice > 19)
4715 nice = 19;
4716
4717 if (increment < 0 && !can_nice(current, nice))
4718 return -EPERM;
4719
4720 retval = security_task_setnice(current, nice);
4721 if (retval)
4722 return retval;
4723
4724 set_user_nice(current, nice);
4725 return 0;
4726}
4727
4728#endif
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738int task_prio(const struct task_struct *p)
4739{
4740 return p->prio - MAX_RT_PRIO;
4741}
4742
4743
4744
4745
4746
/* task_nice - return the nice value derived from p->static_prio. */
int task_nice(const struct task_struct *p)
{
	int nice = TASK_NICE(p);

	return nice;
}
EXPORT_SYMBOL(task_nice);
4752
4753
4754
4755
4756
4757int idle_cpu(int cpu)
4758{
4759 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4760}
4761
4762
4763
4764
4765
4766struct task_struct *idle_task(int cpu)
4767{
4768 return cpu_rq(cpu)->idle;
4769}
4770
4771
4772
4773
4774
4775static struct task_struct *find_process_by_pid(pid_t pid)
4776{
4777 return pid ? find_task_by_vpid(pid) : current;
4778}
4779
4780
4781static void
4782__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4783{
4784 BUG_ON(p->se.on_rq);
4785
4786 p->policy = policy;
4787 switch (p->policy) {
4788 case SCHED_NORMAL:
4789 case SCHED_BATCH:
4790 case SCHED_IDLE:
4791 p->sched_class = &fair_sched_class;
4792 break;
4793 case SCHED_FIFO:
4794 case SCHED_RR:
4795 p->sched_class = &rt_sched_class;
4796 break;
4797 }
4798
4799 p->rt_priority = prio;
4800 p->normal_prio = normal_prio(p);
4801
4802 p->prio = rt_mutex_getprio(p);
4803 set_load_weight(p);
4804}
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814int sched_setscheduler(struct task_struct *p, int policy,
4815 struct sched_param *param)
4816{
4817 int retval, oldprio, oldpolicy = -1, on_rq, running;
4818 unsigned long flags;
4819 const struct sched_class *prev_class = p->sched_class;
4820 struct rq *rq;
4821
4822
4823 BUG_ON(in_interrupt());
4824recheck:
4825
4826 if (policy < 0)
4827 policy = oldpolicy = p->policy;
4828 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4829 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4830 policy != SCHED_IDLE)
4831 return -EINVAL;
4832
4833
4834
4835
4836
4837 if (param->sched_priority < 0 ||
4838 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4839 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4840 return -EINVAL;
4841 if (rt_policy(policy) != (param->sched_priority != 0))
4842 return -EINVAL;
4843
4844
4845
4846
4847 if (!capable(CAP_SYS_NICE)) {
4848 if (rt_policy(policy)) {
4849 unsigned long rlim_rtprio;
4850
4851 if (!lock_task_sighand(p, &flags))
4852 return -ESRCH;
4853 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4854 unlock_task_sighand(p, &flags);
4855
4856
4857 if (policy != p->policy && !rlim_rtprio)
4858 return -EPERM;
4859
4860
4861 if (param->sched_priority > p->rt_priority &&
4862 param->sched_priority > rlim_rtprio)
4863 return -EPERM;
4864 }
4865
4866
4867
4868
4869 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4870 return -EPERM;
4871
4872
4873 if ((current->euid != p->euid) &&
4874 (current->euid != p->uid))
4875 return -EPERM;
4876 }
4877
4878#ifdef CONFIG_RT_GROUP_SCHED
4879
4880
4881
4882
4883 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
4884 return -EPERM;
4885#endif
4886
4887 retval = security_task_setscheduler(p, policy, param);
4888 if (retval)
4889 return retval;
4890
4891
4892
4893
4894 spin_lock_irqsave(&p->pi_lock, flags);
4895
4896
4897
4898
4899 rq = __task_rq_lock(p);
4900
4901 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4902 policy = oldpolicy = -1;
4903 __task_rq_unlock(rq);
4904 spin_unlock_irqrestore(&p->pi_lock, flags);
4905 goto recheck;
4906 }
4907 update_rq_clock(rq);
4908 on_rq = p->se.on_rq;
4909 running = task_current(rq, p);
4910 if (on_rq)
4911 deactivate_task(rq, p, 0);
4912 if (running)
4913 p->sched_class->put_prev_task(rq, p);
4914
4915 oldprio = p->prio;
4916 __setscheduler(rq, p, policy, param->sched_priority);
4917
4918 if (running)
4919 p->sched_class->set_curr_task(rq);
4920 if (on_rq) {
4921 activate_task(rq, p, 0);
4922
4923 check_class_changed(rq, p, prev_class, oldprio, running);
4924 }
4925 __task_rq_unlock(rq);
4926 spin_unlock_irqrestore(&p->pi_lock, flags);
4927
4928 rt_mutex_adjust_pi(p);
4929
4930 return 0;
4931}
4932EXPORT_SYMBOL_GPL(sched_setscheduler);
4933
4934static int
4935do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4936{
4937 struct sched_param lparam;
4938 struct task_struct *p;
4939 int retval;
4940
4941 if (!param || pid < 0)
4942 return -EINVAL;
4943 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4944 return -EFAULT;
4945
4946 rcu_read_lock();
4947 retval = -ESRCH;
4948 p = find_process_by_pid(pid);
4949 if (p != NULL)
4950 retval = sched_setscheduler(p, policy, &lparam);
4951 rcu_read_unlock();
4952
4953 return retval;
4954}
4955
4956
4957
4958
4959
4960
4961
4962asmlinkage long
4963sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4964{
4965
4966 if (policy < 0)
4967 return -EINVAL;
4968
4969 return do_sched_setscheduler(pid, policy, param);
4970}
4971
4972
4973
4974
4975
4976
4977asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4978{
4979 return do_sched_setscheduler(pid, -1, param);
4980}
4981
4982
4983
4984
4985
4986asmlinkage long sys_sched_getscheduler(pid_t pid)
4987{
4988 struct task_struct *p;
4989 int retval;
4990
4991 if (pid < 0)
4992 return -EINVAL;
4993
4994 retval = -ESRCH;
4995 read_lock(&tasklist_lock);
4996 p = find_process_by_pid(pid);
4997 if (p) {
4998 retval = security_task_getscheduler(p);
4999 if (!retval)
5000 retval = p->policy;
5001 }
5002 read_unlock(&tasklist_lock);
5003 return retval;
5004}
5005
5006
5007
5008
5009
5010
5011asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
5012{
5013 struct sched_param lp;
5014 struct task_struct *p;
5015 int retval;
5016
5017 if (!param || pid < 0)
5018 return -EINVAL;
5019
5020 read_lock(&tasklist_lock);
5021 p = find_process_by_pid(pid);
5022 retval = -ESRCH;
5023 if (!p)
5024 goto out_unlock;
5025
5026 retval = security_task_getscheduler(p);
5027 if (retval)
5028 goto out_unlock;
5029
5030 lp.sched_priority = p->rt_priority;
5031 read_unlock(&tasklist_lock);
5032
5033
5034
5035
5036 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5037
5038 return retval;
5039
5040out_unlock:
5041 read_unlock(&tasklist_lock);
5042 return retval;
5043}
5044
5045long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5046{
5047 cpumask_t cpus_allowed;
5048 cpumask_t new_mask = *in_mask;
5049 struct task_struct *p;
5050 int retval;
5051
5052 get_online_cpus();
5053 read_lock(&tasklist_lock);
5054
5055 p = find_process_by_pid(pid);
5056 if (!p) {
5057 read_unlock(&tasklist_lock);
5058 put_online_cpus();
5059 return -ESRCH;
5060 }
5061
5062
5063
5064
5065
5066
5067 get_task_struct(p);
5068 read_unlock(&tasklist_lock);
5069
5070 retval = -EPERM;
5071 if ((current->euid != p->euid) && (current->euid != p->uid) &&
5072 !capable(CAP_SYS_NICE))
5073 goto out_unlock;
5074
5075 retval = security_task_setscheduler(p, 0, NULL);
5076 if (retval)
5077 goto out_unlock;
5078
5079 cpuset_cpus_allowed(p, &cpus_allowed);
5080 cpus_and(new_mask, new_mask, cpus_allowed);
5081 again:
5082 retval = set_cpus_allowed_ptr(p, &new_mask);
5083
5084 if (!retval) {
5085 cpuset_cpus_allowed(p, &cpus_allowed);
5086 if (!cpus_subset(new_mask, cpus_allowed)) {
5087
5088
5089
5090
5091
5092 new_mask = cpus_allowed;
5093 goto again;
5094 }
5095 }
5096out_unlock:
5097 put_task_struct(p);
5098 put_online_cpus();
5099 return retval;
5100}
5101
5102static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5103 cpumask_t *new_mask)
5104{
5105 if (len < sizeof(cpumask_t)) {
5106 memset(new_mask, 0, sizeof(cpumask_t));
5107 } else if (len > sizeof(cpumask_t)) {
5108 len = sizeof(cpumask_t);
5109 }
5110 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5111}
5112
5113
5114
5115
5116
5117
5118
5119asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5120 unsigned long __user *user_mask_ptr)
5121{
5122 cpumask_t new_mask;
5123 int retval;
5124
5125 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
5126 if (retval)
5127 return retval;
5128
5129 return sched_setaffinity(pid, &new_mask);
5130}
5131
5132
5133
5134
5135
5136
5137
5138
5139cpumask_t cpu_present_map __read_mostly;
5140EXPORT_SYMBOL(cpu_present_map);
5141
5142#ifndef CONFIG_SMP
5143cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
5144EXPORT_SYMBOL(cpu_online_map);
5145
5146cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
5147EXPORT_SYMBOL(cpu_possible_map);
5148#endif
5149
5150long sched_getaffinity(pid_t pid, cpumask_t *mask)
5151{
5152 struct task_struct *p;
5153 int retval;
5154
5155 get_online_cpus();
5156 read_lock(&tasklist_lock);
5157
5158 retval = -ESRCH;
5159 p = find_process_by_pid(pid);
5160 if (!p)
5161 goto out_unlock;
5162
5163 retval = security_task_getscheduler(p);
5164 if (retval)
5165 goto out_unlock;
5166
5167 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
5168
5169out_unlock:
5170 read_unlock(&tasklist_lock);
5171 put_online_cpus();
5172
5173 return retval;
5174}
5175
5176
5177
5178
5179
5180
5181
5182asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
5183 unsigned long __user *user_mask_ptr)
5184{
5185 int ret;
5186 cpumask_t mask;
5187
5188 if (len < sizeof(cpumask_t))
5189 return -EINVAL;
5190
5191 ret = sched_getaffinity(pid, &mask);
5192 if (ret < 0)
5193 return ret;
5194
5195 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
5196 return -EFAULT;
5197
5198 return sizeof(cpumask_t);
5199}
5200
5201
5202
5203
5204
5205
5206
5207asmlinkage long sys_sched_yield(void)
5208{
5209 struct rq *rq = this_rq_lock();
5210
5211 schedstat_inc(rq, yld_count);
5212 current->sched_class->yield_task(rq);
5213
5214
5215
5216
5217
5218 __release(rq->lock);
5219 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
5220 _raw_spin_unlock(&rq->lock);
5221 preempt_enable_no_resched();
5222
5223 schedule();
5224
5225 return 0;
5226}
5227
5228static void __cond_resched(void)
5229{
5230#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
5231 __might_sleep(__FILE__, __LINE__);
5232#endif
5233
5234
5235
5236
5237
5238 do {
5239 add_preempt_count(PREEMPT_ACTIVE);
5240 schedule();
5241 sub_preempt_count(PREEMPT_ACTIVE);
5242 } while (need_resched());
5243}
5244
5245int __sched _cond_resched(void)
5246{
5247 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
5248 system_state == SYSTEM_RUNNING) {
5249 __cond_resched();
5250 return 1;
5251 }
5252 return 0;
5253}
5254EXPORT_SYMBOL(_cond_resched);
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264int cond_resched_lock(spinlock_t *lock)
5265{
5266 int resched = need_resched() && system_state == SYSTEM_RUNNING;
5267 int ret = 0;
5268
5269 if (spin_needbreak(lock) || resched) {
5270 spin_unlock(lock);
5271 if (resched && need_resched())
5272 __cond_resched();
5273 else
5274 cpu_relax();
5275 ret = 1;
5276 spin_lock(lock);
5277 }
5278 return ret;
5279}
5280EXPORT_SYMBOL(cond_resched_lock);
5281
5282int __sched cond_resched_softirq(void)
5283{
5284 BUG_ON(!in_softirq());
5285
5286 if (need_resched() && system_state == SYSTEM_RUNNING) {
5287 local_bh_enable();
5288 __cond_resched();
5289 local_bh_disable();
5290 return 1;
5291 }
5292 return 0;
5293}
5294EXPORT_SYMBOL(cond_resched_softirq);
5295
5296
5297
5298
5299
5300
5301
5302void __sched yield(void)
5303{
5304 set_current_state(TASK_RUNNING);
5305 sys_sched_yield();
5306}
5307EXPORT_SYMBOL(yield);
5308
5309
5310
5311
5312
5313
5314
5315
5316void __sched io_schedule(void)
5317{
5318 struct rq *rq = &__raw_get_cpu_var(runqueues);
5319
5320 delayacct_blkio_start();
5321 atomic_inc(&rq->nr_iowait);
5322 schedule();
5323 atomic_dec(&rq->nr_iowait);
5324 delayacct_blkio_end();
5325}
5326EXPORT_SYMBOL(io_schedule);
5327
5328long __sched io_schedule_timeout(long timeout)
5329{
5330 struct rq *rq = &__raw_get_cpu_var(runqueues);
5331 long ret;
5332
5333 delayacct_blkio_start();
5334 atomic_inc(&rq->nr_iowait);
5335 ret = schedule_timeout(timeout);
5336 atomic_dec(&rq->nr_iowait);
5337 delayacct_blkio_end();
5338 return ret;
5339}
5340
5341
5342
5343
5344
5345
5346
5347
5348asmlinkage long sys_sched_get_priority_max(int policy)
5349{
5350 int ret = -EINVAL;
5351
5352 switch (policy) {
5353 case SCHED_FIFO:
5354 case SCHED_RR:
5355 ret = MAX_USER_RT_PRIO-1;
5356 break;
5357 case SCHED_NORMAL:
5358 case SCHED_BATCH:
5359 case SCHED_IDLE:
5360 ret = 0;
5361 break;
5362 }
5363 return ret;
5364}
5365
5366
5367
5368
5369
5370
5371
5372
5373asmlinkage long sys_sched_get_priority_min(int policy)
5374{
5375 int ret = -EINVAL;
5376
5377 switch (policy) {
5378 case SCHED_FIFO:
5379 case SCHED_RR:
5380 ret = 1;
5381 break;
5382 case SCHED_NORMAL:
5383 case SCHED_BATCH:
5384 case SCHED_IDLE:
5385 ret = 0;
5386 }
5387 return ret;
5388}
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398asmlinkage
5399long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
5400{
5401 struct task_struct *p;
5402 unsigned int time_slice;
5403 int retval;
5404 struct timespec t;
5405
5406 if (pid < 0)
5407 return -EINVAL;
5408
5409 retval = -ESRCH;
5410 read_lock(&tasklist_lock);
5411 p = find_process_by_pid(pid);
5412 if (!p)
5413 goto out_unlock;
5414
5415 retval = security_task_getscheduler(p);
5416 if (retval)
5417 goto out_unlock;
5418
5419
5420
5421
5422
5423 time_slice = 0;
5424 if (p->policy == SCHED_RR) {
5425 time_slice = DEF_TIMESLICE;
5426 } else if (p->policy != SCHED_FIFO) {
5427 struct sched_entity *se = &p->se;
5428 unsigned long flags;
5429 struct rq *rq;
5430
5431 rq = task_rq_lock(p, &flags);
5432 if (rq->cfs.load.weight)
5433 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
5434 task_rq_unlock(rq, &flags);
5435 }
5436 read_unlock(&tasklist_lock);
5437 jiffies_to_timespec(time_slice, &t);
5438 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5439 return retval;
5440
5441out_unlock:
5442 read_unlock(&tasklist_lock);
5443 return retval;
5444}
5445
5446static const char stat_nam[] = "RSDTtZX";
5447
5448void sched_show_task(struct task_struct *p)
5449{
5450 unsigned long free = 0;
5451 unsigned state;
5452
5453 state = p->state ? __ffs(p->state) + 1 : 0;
5454 printk(KERN_INFO "%-13.13s %c", p->comm,
5455 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5456#if BITS_PER_LONG == 32
5457 if (state == TASK_RUNNING)
5458 printk(KERN_CONT " running ");
5459 else
5460 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5461#else
5462 if (state == TASK_RUNNING)
5463 printk(KERN_CONT " running task ");
5464 else
5465 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5466#endif
5467#ifdef CONFIG_DEBUG_STACK_USAGE
5468 {
5469 unsigned long *n = end_of_stack(p);
5470 while (!*n)
5471 n++;
5472 free = (unsigned long)n - (unsigned long)end_of_stack(p);
5473 }
5474#endif
5475 printk(KERN_CONT "%5lu %5d %6d\n", free,
5476 task_pid_nr(p), task_pid_nr(p->real_parent));
5477
5478 show_stack(p, NULL);
5479}
5480
5481void show_state_filter(unsigned long state_filter)
5482{
5483 struct task_struct *g, *p;
5484
5485#if BITS_PER_LONG == 32
5486 printk(KERN_INFO
5487 " task PC stack pid father\n");
5488#else
5489 printk(KERN_INFO
5490 " task PC stack pid father\n");
5491#endif
5492 read_lock(&tasklist_lock);
5493 do_each_thread(g, p) {
5494
5495
5496
5497
5498 touch_nmi_watchdog();
5499 if (!state_filter || (p->state & state_filter))
5500 sched_show_task(p);
5501 } while_each_thread(g, p);
5502
5503 touch_all_softlockup_watchdogs();
5504
5505#ifdef CONFIG_SCHED_DEBUG
5506 sysrq_sched_debug_show();
5507#endif
5508 read_unlock(&tasklist_lock);
5509
5510
5511
5512 if (state_filter == -1)
5513 debug_show_all_locks();
5514}
5515
5516void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5517{
5518 idle->sched_class = &idle_sched_class;
5519}
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529void __cpuinit init_idle(struct task_struct *idle, int cpu)
5530{
5531 struct rq *rq = cpu_rq(cpu);
5532 unsigned long flags;
5533
5534 __sched_fork(idle);
5535 idle->se.exec_start = sched_clock();
5536
5537 idle->prio = idle->normal_prio = MAX_PRIO;
5538 idle->cpus_allowed = cpumask_of_cpu(cpu);
5539 __set_task_cpu(idle, cpu);
5540
5541 spin_lock_irqsave(&rq->lock, flags);
5542 rq->curr = rq->idle = idle;
5543#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5544 idle->oncpu = 1;
5545#endif
5546 spin_unlock_irqrestore(&rq->lock, flags);
5547
5548
5549#if defined(CONFIG_PREEMPT)
5550 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5551#else
5552 task_thread_info(idle)->preempt_count = 0;
5553#endif
5554
5555
5556
5557 idle->sched_class = &idle_sched_class;
5558}
5559
5560
5561
5562
5563
5564
5565
5566
5567cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578static inline void sched_init_granularity(void)
5579{
5580 unsigned int factor = 1 + ilog2(num_online_cpus());
5581 const unsigned long limit = 200000000;
5582
5583 sysctl_sched_min_granularity *= factor;
5584 if (sysctl_sched_min_granularity > limit)
5585 sysctl_sched_min_granularity = limit;
5586
5587 sysctl_sched_latency *= factor;
5588 if (sysctl_sched_latency > limit)
5589 sysctl_sched_latency = limit;
5590
5591 sysctl_sched_wakeup_granularity *= factor;
5592}
5593
5594#ifdef CONFIG_SMP
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5621{
5622 struct migration_req req;
5623 unsigned long flags;
5624 struct rq *rq;
5625 int ret = 0;
5626
5627 rq = task_rq_lock(p, &flags);
5628 if (!cpus_intersects(*new_mask, cpu_online_map)) {
5629 ret = -EINVAL;
5630 goto out;
5631 }
5632
5633 if (p->sched_class->set_cpus_allowed)
5634 p->sched_class->set_cpus_allowed(p, new_mask);
5635 else {
5636 p->cpus_allowed = *new_mask;
5637 p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
5638 }
5639
5640
5641 if (cpu_isset(task_cpu(p), *new_mask))
5642 goto out;
5643
5644 if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
5645
5646 task_rq_unlock(rq, &flags);
5647 wake_up_process(rq->migration_thread);
5648 wait_for_completion(&req.done);
5649 tlb_migrate_finish(p->mm);
5650 return 0;
5651 }
5652out:
5653 task_rq_unlock(rq, &flags);
5654
5655 return ret;
5656}
5657EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5671{
5672 struct rq *rq_dest, *rq_src;
5673 int ret = 0, on_rq;
5674
5675 if (unlikely(cpu_is_offline(dest_cpu)))
5676 return ret;
5677
5678 rq_src = cpu_rq(src_cpu);
5679 rq_dest = cpu_rq(dest_cpu);
5680
5681 double_rq_lock(rq_src, rq_dest);
5682
5683 if (task_cpu(p) != src_cpu)
5684 goto done;
5685
5686 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5687 goto fail;
5688
5689 on_rq = p->se.on_rq;
5690 if (on_rq)
5691 deactivate_task(rq_src, p, 0);
5692
5693 set_task_cpu(p, dest_cpu);
5694 if (on_rq) {
5695 activate_task(rq_dest, p, 0);
5696 check_preempt_curr(rq_dest, p);
5697 }
5698done:
5699 ret = 1;
5700fail:
5701 double_rq_unlock(rq_src, rq_dest);
5702 return ret;
5703}
5704
5705
5706
5707
5708
5709
5710static int migration_thread(void *data)
5711{
5712 int cpu = (long)data;
5713 struct rq *rq;
5714
5715 rq = cpu_rq(cpu);
5716 BUG_ON(rq->migration_thread != current);
5717
5718 set_current_state(TASK_INTERRUPTIBLE);
5719 while (!kthread_should_stop()) {
5720 struct migration_req *req;
5721 struct list_head *head;
5722
5723 spin_lock_irq(&rq->lock);
5724
5725 if (cpu_is_offline(cpu)) {
5726 spin_unlock_irq(&rq->lock);
5727 goto wait_to_die;
5728 }
5729
5730 if (rq->active_balance) {
5731 active_load_balance(rq, cpu);
5732 rq->active_balance = 0;
5733 }
5734
5735 head = &rq->migration_queue;
5736
5737 if (list_empty(head)) {
5738 spin_unlock_irq(&rq->lock);
5739 schedule();
5740 set_current_state(TASK_INTERRUPTIBLE);
5741 continue;
5742 }
5743 req = list_entry(head->next, struct migration_req, list);
5744 list_del_init(head->next);
5745
5746 spin_unlock(&rq->lock);
5747 __migrate_task(req->task, cpu, req->dest_cpu);
5748 local_irq_enable();
5749
5750 complete(&req->done);
5751 }
5752 __set_current_state(TASK_RUNNING);
5753 return 0;
5754
5755wait_to_die:
5756
5757 set_current_state(TASK_INTERRUPTIBLE);
5758 while (!kthread_should_stop()) {
5759 schedule();
5760 set_current_state(TASK_INTERRUPTIBLE);
5761 }
5762 __set_current_state(TASK_RUNNING);
5763 return 0;
5764}
5765
5766#ifdef CONFIG_HOTPLUG_CPU
5767
/*
 * __migrate_task_irq - __migrate_task() with interrupts disabled around it.
 *
 * Interrupts are unconditionally enabled again on return. Returns the
 * result of __migrate_task().
 */
static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
{
	int moved;

	local_irq_disable();
	moved = __migrate_task(p, src_cpu, dest_cpu);
	local_irq_enable();

	return moved;
}
5777
5778
5779
5780
5781
5782static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5783{
5784 unsigned long flags;
5785 cpumask_t mask;
5786 struct rq *rq;
5787 int dest_cpu;
5788
5789 do {
5790
5791 mask = node_to_cpumask(cpu_to_node(dead_cpu));
5792 cpus_and(mask, mask, p->cpus_allowed);
5793 dest_cpu = any_online_cpu(mask);
5794
5795
5796 if (dest_cpu >= nr_cpu_ids)
5797 dest_cpu = any_online_cpu(p->cpus_allowed);
5798
5799
5800 if (dest_cpu >= nr_cpu_ids) {
5801 cpumask_t cpus_allowed;
5802
5803 cpuset_cpus_allowed_locked(p, &cpus_allowed);
5804
5805
5806
5807
5808
5809
5810
5811 rq = task_rq_lock(p, &flags);
5812 p->cpus_allowed = cpus_allowed;
5813 dest_cpu = any_online_cpu(p->cpus_allowed);
5814 task_rq_unlock(rq, &flags);
5815
5816
5817
5818
5819
5820
5821 if (p->mm && printk_ratelimit()) {
5822 printk(KERN_INFO "process %d (%s) no "
5823 "longer affine to cpu%d\n",
5824 task_pid_nr(p), p->comm, dead_cpu);
5825 }
5826 }
5827 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
5828}
5829
5830
5831
5832
5833
5834
5835
5836
5837static void migrate_nr_uninterruptible(struct rq *rq_src)
5838{
5839 struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
5840 unsigned long flags;
5841
5842 local_irq_save(flags);
5843 double_rq_lock(rq_src, rq_dest);
5844 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5845 rq_src->nr_uninterruptible = 0;
5846 double_rq_unlock(rq_src, rq_dest);
5847 local_irq_restore(flags);
5848}
5849
5850
5851static void migrate_live_tasks(int src_cpu)
5852{
5853 struct task_struct *p, *t;
5854
5855 read_lock(&tasklist_lock);
5856
5857 do_each_thread(t, p) {
5858 if (p == current)
5859 continue;
5860
5861 if (task_cpu(p) == src_cpu)
5862 move_task_off_dead_cpu(src_cpu, p);
5863 } while_each_thread(t, p);
5864
5865 read_unlock(&tasklist_lock);
5866}
5867
5868
5869
5870
5871
5872
5873void sched_idle_next(void)
5874{
5875 int this_cpu = smp_processor_id();
5876 struct rq *rq = cpu_rq(this_cpu);
5877 struct task_struct *p = rq->idle;
5878 unsigned long flags;
5879
5880
5881 BUG_ON(cpu_online(this_cpu));
5882
5883
5884
5885
5886
5887 spin_lock_irqsave(&rq->lock, flags);
5888
5889 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5890
5891 update_rq_clock(rq);
5892 activate_task(rq, p, 0);
5893
5894 spin_unlock_irqrestore(&rq->lock, flags);
5895}
5896
5897
5898
5899
5900
5901void idle_task_exit(void)
5902{
5903 struct mm_struct *mm = current->active_mm;
5904
5905 BUG_ON(cpu_online(smp_processor_id()));
5906
5907 if (mm != &init_mm)
5908 switch_mm(mm, &init_mm, current);
5909 mmdrop(mm);
5910}
5911
5912
5913static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5914{
5915 struct rq *rq = cpu_rq(dead_cpu);
5916
5917
5918 BUG_ON(!p->exit_state);
5919
5920
5921 BUG_ON(p->state == TASK_DEAD);
5922
5923 get_task_struct(p);
5924
5925
5926
5927
5928
5929
5930 spin_unlock_irq(&rq->lock);
5931 move_task_off_dead_cpu(dead_cpu, p);
5932 spin_lock_irq(&rq->lock);
5933
5934 put_task_struct(p);
5935}
5936
5937
5938static void migrate_dead_tasks(unsigned int dead_cpu)
5939{
5940 struct rq *rq = cpu_rq(dead_cpu);
5941 struct task_struct *next;
5942
5943 for ( ; ; ) {
5944 if (!rq->nr_running)
5945 break;
5946 update_rq_clock(rq);
5947 next = pick_next_task(rq, rq->curr);
5948 if (!next)
5949 break;
5950 next->sched_class->put_prev_task(rq, next);
5951 migrate_dead(dead_cpu, next);
5952
5953 }
5954}
5955#endif
5956
5957#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5958
5959static struct ctl_table sd_ctl_dir[] = {
5960 {
5961 .procname = "sched_domain",
5962 .mode = 0555,
5963 },
5964 {0, },
5965};
5966
5967static struct ctl_table sd_ctl_root[] = {
5968 {
5969 .ctl_name = CTL_KERN,
5970 .procname = "kernel",
5971 .mode = 0555,
5972 .child = sd_ctl_dir,
5973 },
5974 {0, },
5975};
5976
5977static struct ctl_table *sd_alloc_ctl_entry(int n)
5978{
5979 struct ctl_table *entry =
5980 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5981
5982 return entry;
5983}
5984
5985static void sd_free_ctl_entry(struct ctl_table **tablep)
5986{
5987 struct ctl_table *entry;
5988
5989
5990
5991
5992
5993
5994
5995 for (entry = *tablep; entry->mode; entry++) {
5996 if (entry->child)
5997 sd_free_ctl_entry(&entry->child);
5998 if (entry->proc_handler == NULL)
5999 kfree(entry->procname);
6000 }
6001
6002 kfree(*tablep);
6003 *tablep = NULL;
6004}
6005
6006static void
6007set_table_entry(struct ctl_table *entry,
6008 const char *procname, void *data, int maxlen,
6009 mode_t mode, proc_handler *proc_handler)
6010{
6011 entry->procname = procname;
6012 entry->data = data;
6013 entry->maxlen = maxlen;
6014 entry->mode = mode;
6015 entry->proc_handler = proc_handler;
6016}
6017
6018static struct ctl_table *
6019sd_alloc_ctl_domain_table(struct sched_domain *sd)
6020{
6021 struct ctl_table *table = sd_alloc_ctl_entry(12);
6022
6023 if (table == NULL)
6024 return NULL;
6025
6026 set_table_entry(&table[0], "min_interval", &sd->min_interval,
6027 sizeof(long), 0644, proc_doulongvec_minmax);
6028 set_table_entry(&table[1], "max_interval", &sd->max_interval,
6029 sizeof(long), 0644, proc_doulongvec_minmax);
6030 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
6031 sizeof(int), 0644, proc_dointvec_minmax);
6032 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
6033 sizeof(int), 0644, proc_dointvec_minmax);
6034 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
6035 sizeof(int), 0644, proc_dointvec_minmax);
6036 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
6037 sizeof(int), 0644, proc_dointvec_minmax);
6038 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
6039 sizeof(int), 0644, proc_dointvec_minmax);
6040 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
6041 sizeof(int), 0644, proc_dointvec_minmax);
6042 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
6043 sizeof(int), 0644, proc_dointvec_minmax);
6044 set_table_entry(&table[9], "cache_nice_tries",
6045 &sd->cache_nice_tries,
6046 sizeof(int), 0644, proc_dointvec_minmax);
6047 set_table_entry(&table[10], "flags", &sd->flags,
6048 sizeof(int), 0644, proc_dointvec_minmax);
6049
6050
6051 return table;
6052}
6053
6054static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
6055{
6056 struct ctl_table *entry, *table;
6057 struct sched_domain *sd;
6058 int domain_num = 0, i;
6059 char buf[32];
6060
6061 for_each_domain(cpu, sd)
6062 domain_num++;
6063 entry = table = sd_alloc_ctl_entry(domain_num + 1);
6064 if (table == NULL)
6065 return NULL;
6066
6067 i = 0;
6068 for_each_domain(cpu, sd) {
6069 snprintf(buf, 32, "domain%d", i);
6070 entry->procname = kstrdup(buf, GFP_KERNEL);
6071 entry->mode = 0555;
6072 entry->child = sd_alloc_ctl_domain_table(sd);
6073 entry++;
6074 i++;
6075 }
6076 return table;
6077}
6078
6079static struct ctl_table_header *sd_sysctl_header;
6080static void register_sched_domain_sysctl(void)
6081{
6082 int i, cpu_num = num_online_cpus();
6083 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
6084 char buf[32];
6085
6086 WARN_ON(sd_ctl_dir[0].child);
6087 sd_ctl_dir[0].child = entry;
6088
6089 if (entry == NULL)
6090 return;
6091
6092 for_each_online_cpu(i) {
6093 snprintf(buf, 32, "cpu%d", i);
6094 entry->procname = kstrdup(buf, GFP_KERNEL);
6095 entry->mode = 0555;
6096 entry->child = sd_alloc_ctl_cpu_table(i);
6097 entry++;
6098 }
6099
6100 WARN_ON(sd_sysctl_header);
6101 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
6102}
6103
6104
6105static void unregister_sched_domain_sysctl(void)
6106{
6107 if (sd_sysctl_header)
6108 unregister_sysctl_table(sd_sysctl_header);
6109 sd_sysctl_header = NULL;
6110 if (sd_ctl_dir[0].child)
6111 sd_free_ctl_entry(&sd_ctl_dir[0].child);
6112}
6113#else
/* No-op stub used when CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are not both set. */
static void register_sched_domain_sysctl(void) { }
/* No-op stub used when CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are not both set. */
static void unregister_sched_domain_sysctl(void) { }
6120#endif
6121
6122
6123
6124
6125
6126static int __cpuinit
6127migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6128{
6129 struct task_struct *p;
6130 int cpu = (long)hcpu;
6131 unsigned long flags;
6132 struct rq *rq;
6133
6134 switch (action) {
6135
6136 case CPU_UP_PREPARE:
6137 case CPU_UP_PREPARE_FROZEN:
6138 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
6139 if (IS_ERR(p))
6140 return NOTIFY_BAD;
6141 kthread_bind(p, cpu);
6142
6143 rq = task_rq_lock(p, &flags);
6144 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6145 task_rq_unlock(rq, &flags);
6146 cpu_rq(cpu)->migration_thread = p;
6147 break;
6148
6149 case CPU_ONLINE:
6150 case CPU_ONLINE_FROZEN:
6151
6152 wake_up_process(cpu_rq(cpu)->migration_thread);
6153
6154
6155 rq = cpu_rq(cpu);
6156 spin_lock_irqsave(&rq->lock, flags);
6157 if (rq->rd) {
6158 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6159 cpu_set(cpu, rq->rd->online);
6160 }
6161 spin_unlock_irqrestore(&rq->lock, flags);
6162 break;
6163
6164#ifdef CONFIG_HOTPLUG_CPU
6165 case CPU_UP_CANCELED:
6166 case CPU_UP_CANCELED_FROZEN:
6167 if (!cpu_rq(cpu)->migration_thread)
6168 break;
6169
6170 kthread_bind(cpu_rq(cpu)->migration_thread,
6171 any_online_cpu(cpu_online_map));
6172 kthread_stop(cpu_rq(cpu)->migration_thread);
6173 cpu_rq(cpu)->migration_thread = NULL;
6174 break;
6175
6176 case CPU_DEAD:
6177 case CPU_DEAD_FROZEN:
6178 cpuset_lock();
6179 migrate_live_tasks(cpu);
6180 rq = cpu_rq(cpu);
6181 kthread_stop(rq->migration_thread);
6182 rq->migration_thread = NULL;
6183
6184 spin_lock_irq(&rq->lock);
6185 update_rq_clock(rq);
6186 deactivate_task(rq, rq->idle, 0);
6187 rq->idle->static_prio = MAX_PRIO;
6188 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6189 rq->idle->sched_class = &idle_sched_class;
6190 migrate_dead_tasks(cpu);
6191 spin_unlock_irq(&rq->lock);
6192 cpuset_unlock();
6193 migrate_nr_uninterruptible(rq);
6194 BUG_ON(rq->nr_running != 0);
6195
6196
6197
6198
6199
6200
6201 spin_lock_irq(&rq->lock);
6202 while (!list_empty(&rq->migration_queue)) {
6203 struct migration_req *req;
6204
6205 req = list_entry(rq->migration_queue.next,
6206 struct migration_req, list);
6207 list_del_init(&req->list);
6208 complete(&req->done);
6209 }
6210 spin_unlock_irq(&rq->lock);
6211 break;
6212
6213 case CPU_DYING:
6214 case CPU_DYING_FROZEN:
6215
6216 rq = cpu_rq(cpu);
6217 spin_lock_irqsave(&rq->lock, flags);
6218 if (rq->rd) {
6219 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6220 cpu_clear(cpu, rq->rd->online);
6221 }
6222 spin_unlock_irqrestore(&rq->lock, flags);
6223 break;
6224#endif
6225 }
6226 return NOTIFY_OK;
6227}
6228
6229
6230
6231
6232static struct notifier_block __cpuinitdata migration_notifier = {
6233 .notifier_call = migration_call,
6234 .priority = 10
6235};
6236
6237void __init migration_init(void)
6238{
6239 void *cpu = (void *)(long)smp_processor_id();
6240 int err;
6241
6242
6243 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6244 BUG_ON(err == NOTIFY_BAD);
6245 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6246 register_cpu_notifier(&migration_notifier);
6247}
6248#endif
6249
6250#ifdef CONFIG_SMP
6251
6252#ifdef CONFIG_SCHED_DEBUG
6253
6254static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6255 cpumask_t *groupmask)
6256{
6257 struct sched_group *group = sd->groups;
6258 char str[256];
6259
6260 cpulist_scnprintf(str, sizeof(str), sd->span);
6261 cpus_clear(*groupmask);
6262
6263 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6264
6265 if (!(sd->flags & SD_LOAD_BALANCE)) {
6266 printk("does not load-balance\n");
6267 if (sd->parent)
6268 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
6269 " has parent");
6270 return -1;
6271 }
6272
6273 printk(KERN_CONT "span %s\n", str);
6274
6275 if (!cpu_isset(cpu, sd->span)) {
6276 printk(KERN_ERR "ERROR: domain->span does not contain "
6277 "CPU%d\n", cpu);
6278 }
6279 if (!cpu_isset(cpu, group->cpumask)) {
6280 printk(KERN_ERR "ERROR: domain->groups does not contain"
6281 " CPU%d\n", cpu);
6282 }
6283
6284 printk(KERN_DEBUG "%*s groups:", level + 1, "");
6285 do {
6286 if (!group) {
6287 printk("\n");
6288 printk(KERN_ERR "ERROR: group is NULL\n");
6289 break;
6290 }
6291
6292 if (!group->__cpu_power) {
6293 printk(KERN_CONT "\n");
6294 printk(KERN_ERR "ERROR: domain->cpu_power not "
6295 "set\n");
6296 break;
6297 }
6298
6299 if (!cpus_weight(group->cpumask)) {
6300 printk(KERN_CONT "\n");
6301 printk(KERN_ERR "ERROR: empty group\n");
6302 break;
6303 }
6304
6305 if (cpus_intersects(*groupmask, group->cpumask)) {
6306 printk(KERN_CONT "\n");
6307 printk(KERN_ERR "ERROR: repeated CPUs\n");
6308 break;
6309 }
6310
6311 cpus_or(*groupmask, *groupmask, group->cpumask);
6312
6313 cpulist_scnprintf(str, sizeof(str), group->cpumask);
6314 printk(KERN_CONT " %s", str);
6315
6316 group = group->next;
6317 } while (group != sd->groups);
6318 printk(KERN_CONT "\n");
6319
6320 if (!cpus_equal(sd->span, *groupmask))
6321 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6322
6323 if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
6324 printk(KERN_ERR "ERROR: parent span is not a superset "
6325 "of domain->span\n");
6326 return 0;
6327}
6328
6329static void sched_domain_debug(struct sched_domain *sd, int cpu)
6330{
6331 cpumask_t *groupmask;
6332 int level = 0;
6333
6334 if (!sd) {
6335 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6336 return;
6337 }
6338
6339 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6340
6341 groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
6342 if (!groupmask) {
6343 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6344 return;
6345 }
6346
6347 for (;;) {
6348 if (sched_domain_debug_one(sd, cpu, level, groupmask))
6349 break;
6350 level++;
6351 sd = sd->parent;
6352 if (!sd)
6353 break;
6354 }
6355 kfree(groupmask);
6356}
6357#else
/* Without CONFIG_SCHED_DEBUG the domain dump expands to an empty statement. */
# define sched_domain_debug(sd, cpu) do { } while (0)
6359#endif
6360
6361static int sd_degenerate(struct sched_domain *sd)
6362{
6363 if (cpus_weight(sd->span) == 1)
6364 return 1;
6365
6366
6367 if (sd->flags & (SD_LOAD_BALANCE |
6368 SD_BALANCE_NEWIDLE |
6369 SD_BALANCE_FORK |
6370 SD_BALANCE_EXEC |
6371 SD_SHARE_CPUPOWER |
6372 SD_SHARE_PKG_RESOURCES)) {
6373 if (sd->groups != sd->groups->next)
6374 return 0;
6375 }
6376
6377
6378 if (sd->flags & (SD_WAKE_IDLE |
6379 SD_WAKE_AFFINE |
6380 SD_WAKE_BALANCE))
6381 return 0;
6382
6383 return 1;
6384}
6385
6386static int
6387sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6388{
6389 unsigned long cflags = sd->flags, pflags = parent->flags;
6390
6391 if (sd_degenerate(parent))
6392 return 1;
6393
6394 if (!cpus_equal(sd->span, parent->span))
6395 return 0;
6396
6397
6398
6399 if (cflags & SD_WAKE_AFFINE)
6400 pflags &= ~SD_WAKE_BALANCE;
6401
6402 if (parent->groups == parent->groups->next) {
6403 pflags &= ~(SD_LOAD_BALANCE |
6404 SD_BALANCE_NEWIDLE |
6405 SD_BALANCE_FORK |
6406 SD_BALANCE_EXEC |
6407 SD_SHARE_CPUPOWER |
6408 SD_SHARE_PKG_RESOURCES);
6409 }
6410 if (~cflags & pflags)
6411 return 0;
6412
6413 return 1;
6414}
6415
6416static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6417{
6418 unsigned long flags;
6419 const struct sched_class *class;
6420
6421 spin_lock_irqsave(&rq->lock, flags);
6422
6423 if (rq->rd) {
6424 struct root_domain *old_rd = rq->rd;
6425
6426 for (class = sched_class_highest; class; class = class->next) {
6427 if (class->leave_domain)
6428 class->leave_domain(rq);
6429 }
6430
6431 cpu_clear(rq->cpu, old_rd->span);
6432 cpu_clear(rq->cpu, old_rd->online);
6433
6434 if (atomic_dec_and_test(&old_rd->refcount))
6435 kfree(old_rd);
6436 }
6437
6438 atomic_inc(&rd->refcount);
6439 rq->rd = rd;
6440
6441 cpu_set(rq->cpu, rd->span);
6442 if (cpu_isset(rq->cpu, cpu_online_map))
6443 cpu_set(rq->cpu, rd->online);
6444
6445 for (class = sched_class_highest; class; class = class->next) {
6446 if (class->join_domain)
6447 class->join_domain(rq);
6448 }
6449
6450 spin_unlock_irqrestore(&rq->lock, flags);
6451}
6452
6453static void init_rootdomain(struct root_domain *rd)
6454{
6455 memset(rd, 0, sizeof(*rd));
6456
6457 cpus_clear(rd->span);
6458 cpus_clear(rd->online);
6459}
6460
6461static void init_defrootdomain(void)
6462{
6463 init_rootdomain(&def_root_domain);
6464 atomic_set(&def_root_domain.refcount, 1);
6465}
6466
6467static struct root_domain *alloc_rootdomain(void)
6468{
6469 struct root_domain *rd;
6470
6471 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6472 if (!rd)
6473 return NULL;
6474
6475 init_rootdomain(rd);
6476
6477 return rd;
6478}
6479
6480
6481
6482
6483
6484static void
6485cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6486{
6487 struct rq *rq = cpu_rq(cpu);
6488 struct sched_domain *tmp;
6489
6490
6491 for (tmp = sd; tmp; tmp = tmp->parent) {
6492 struct sched_domain *parent = tmp->parent;
6493 if (!parent)
6494 break;
6495 if (sd_parent_degenerate(tmp, parent)) {
6496 tmp->parent = parent->parent;
6497 if (parent->parent)
6498 parent->parent->child = tmp;
6499 }
6500 }
6501
6502 if (sd && sd_degenerate(sd)) {
6503 sd = sd->parent;
6504 if (sd)
6505 sd->child = NULL;
6506 }
6507
6508 sched_domain_debug(sd, cpu);
6509
6510 rq_attach_root(rq, rd);
6511 rcu_assign_pointer(rq->sd, sd);
6512}
6513
6514
6515static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
6516
6517
6518static int __init isolated_cpu_setup(char *str)
6519{
6520 int ints[NR_CPUS], i;
6521
6522 str = get_options(str, ARRAY_SIZE(ints), ints);
6523 cpus_clear(cpu_isolated_map);
6524 for (i = 1; i <= ints[0]; i++)
6525 if (ints[i] < NR_CPUS)
6526 cpu_set(ints[i], cpu_isolated_map);
6527 return 1;
6528}
6529
6530__setup("isolcpus=", isolated_cpu_setup);
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542static void
6543init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
6544 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
6545 struct sched_group **sg,
6546 cpumask_t *tmpmask),
6547 cpumask_t *covered, cpumask_t *tmpmask)
6548{
6549 struct sched_group *first = NULL, *last = NULL;
6550 int i;
6551
6552 cpus_clear(*covered);
6553
6554 for_each_cpu_mask(i, *span) {
6555 struct sched_group *sg;
6556 int group = group_fn(i, cpu_map, &sg, tmpmask);
6557 int j;
6558
6559 if (cpu_isset(i, *covered))
6560 continue;
6561
6562 cpus_clear(sg->cpumask);
6563 sg->__cpu_power = 0;
6564
6565 for_each_cpu_mask(j, *span) {
6566 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6567 continue;
6568
6569 cpu_set(j, *covered);
6570 cpu_set(j, sg->cpumask);
6571 }
6572 if (!first)
6573 first = sg;
6574 if (last)
6575 last->next = sg;
6576 last = sg;
6577 }
6578 last->next = first;
6579}
6580
/*
 * Number of nodes (the starting node included) that
 * sched_domain_node_span() merges into one node-level span.
 * __build_sched_domains() also uses it to decide whether an all-nodes
 * domain is needed on top.
 */
#define SD_NODES_PER_DOMAIN 16
6582
6583#ifdef CONFIG_NUMA
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595static int find_next_best_node(int node, nodemask_t *used_nodes)
6596{
6597 int i, n, val, min_val, best_node = 0;
6598
6599 min_val = INT_MAX;
6600
6601 for (i = 0; i < MAX_NUMNODES; i++) {
6602
6603 n = (node + i) % MAX_NUMNODES;
6604
6605 if (!nr_cpus_node(n))
6606 continue;
6607
6608
6609 if (node_isset(n, *used_nodes))
6610 continue;
6611
6612
6613 val = node_distance(node, n);
6614
6615 if (val < min_val) {
6616 min_val = val;
6617 best_node = n;
6618 }
6619 }
6620
6621 node_set(best_node, *used_nodes);
6622 return best_node;
6623}
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634static void sched_domain_node_span(int node, cpumask_t *span)
6635{
6636 nodemask_t used_nodes;
6637 node_to_cpumask_ptr(nodemask, node);
6638 int i;
6639
6640 cpus_clear(*span);
6641 nodes_clear(used_nodes);
6642
6643 cpus_or(*span, *span, *nodemask);
6644 node_set(node, used_nodes);
6645
6646 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6647 int next_node = find_next_best_node(node, &used_nodes);
6648
6649 node_to_cpumask_ptr_next(nodemask, next_node);
6650 cpus_or(*span, *span, *nodemask);
6651 }
6652}
6653#endif
6654
/*
 * Power-savings switches for SMT and multi-core balancing.  Both start
 * at 0; sched_power_savings_store() sets them to 0 or 1.
 */
int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6656
6657
6658
6659
6660#ifdef CONFIG_SCHED_SMT
6661static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6662static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
6663
6664static int
6665cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6666 cpumask_t *unused)
6667{
6668 if (sg)
6669 *sg = &per_cpu(sched_group_cpus, cpu);
6670 return cpu;
6671}
6672#endif
6673
6674
6675
6676
#ifdef CONFIG_SCHED_MC
/* Per-CPU domain and group storage for the multi-core level. */
static DEFINE_PER_CPU(struct sched_domain, core_domains);
static DEFINE_PER_CPU(struct sched_group, sched_group_core);
#endif

#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
/*
 * Group function for the multi-core level when SMT is configured: a
 * group is one set of siblings, identified by the first sibling that is
 * also in @cpu_map.  @mask is used as scratch space.
 */
static int
cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
		  cpumask_t *mask)
{
	int leader;

	*mask = per_cpu(cpu_sibling_map, cpu);
	cpus_and(*mask, *mask, *cpu_map);
	leader = first_cpu(*mask);

	if (sg != NULL)
		*sg = &per_cpu(sched_group_core, leader);

	return leader;
}
#elif defined(CONFIG_SCHED_MC)
/*
 * Group function for the multi-core level without SMT: every CPU is its
 * own group.  @cpu_map and @unused are ignored.
 */
static int
cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
		  cpumask_t *unused)
{
	if (sg != NULL)
		*sg = &per_cpu(sched_group_core, cpu);

	return cpu;
}
#endif
6706
6707static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6708static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
6709
6710static int
6711cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6712 cpumask_t *mask)
6713{
6714 int group;
6715#ifdef CONFIG_SCHED_MC
6716 *mask = cpu_coregroup_map(cpu);
6717 cpus_and(*mask, *mask, *cpu_map);
6718 group = first_cpu(*mask);
6719#elif defined(CONFIG_SCHED_SMT)
6720 *mask = per_cpu(cpu_sibling_map, cpu);
6721 cpus_and(*mask, *mask, *cpu_map);
6722 group = first_cpu(*mask);
6723#else
6724 group = cpu;
6725#endif
6726 if (sg)
6727 *sg = &per_cpu(sched_group_phys, group);
6728 return group;
6729}
6730
6731#ifdef CONFIG_NUMA
6732
6733
6734
6735
6736
6737static DEFINE_PER_CPU(struct sched_domain, node_domains);
6738static struct sched_group ***sched_group_nodes_bycpu;
6739
6740static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6741static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
6742
6743static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
6744 struct sched_group **sg, cpumask_t *nodemask)
6745{
6746 int group;
6747
6748 *nodemask = node_to_cpumask(cpu_to_node(cpu));
6749 cpus_and(*nodemask, *nodemask, *cpu_map);
6750 group = first_cpu(*nodemask);
6751
6752 if (sg)
6753 *sg = &per_cpu(sched_group_allnodes, group);
6754 return group;
6755}
6756
6757static void init_numa_sched_groups_power(struct sched_group *group_head)
6758{
6759 struct sched_group *sg = group_head;
6760 int j;
6761
6762 if (!sg)
6763 return;
6764 do {
6765 for_each_cpu_mask(j, sg->cpumask) {
6766 struct sched_domain *sd;
6767
6768 sd = &per_cpu(phys_domains, j);
6769 if (j != first_cpu(sd->groups->cpumask)) {
6770
6771
6772
6773
6774 continue;
6775 }
6776
6777 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
6778 }
6779 sg = sg->next;
6780 } while (sg != group_head);
6781}
6782#endif
6783
6784#ifdef CONFIG_NUMA
6785
6786static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6787{
6788 int cpu, i;
6789
6790 for_each_cpu_mask(cpu, *cpu_map) {
6791 struct sched_group **sched_group_nodes
6792 = sched_group_nodes_bycpu[cpu];
6793
6794 if (!sched_group_nodes)
6795 continue;
6796
6797 for (i = 0; i < MAX_NUMNODES; i++) {
6798 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6799
6800 *nodemask = node_to_cpumask(i);
6801 cpus_and(*nodemask, *nodemask, *cpu_map);
6802 if (cpus_empty(*nodemask))
6803 continue;
6804
6805 if (sg == NULL)
6806 continue;
6807 sg = sg->next;
6808next_sg:
6809 oldsg = sg;
6810 sg = sg->next;
6811 kfree(oldsg);
6812 if (oldsg != sched_group_nodes[i])
6813 goto next_sg;
6814 }
6815 kfree(sched_group_nodes);
6816 sched_group_nodes_bycpu[cpu] = NULL;
6817 }
6818}
6819#else
6820static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6821{
6822}
6823#endif
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6840{
6841 struct sched_domain *child;
6842 struct sched_group *group;
6843
6844 WARN_ON(!sd || !sd->groups);
6845
6846 if (cpu != first_cpu(sd->groups->cpumask))
6847 return;
6848
6849 child = sd->child;
6850
6851 sd->groups->__cpu_power = 0;
6852
6853
6854
6855
6856
6857
6858
6859
6860 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
6861 (child->flags &
6862 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
6863 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
6864 return;
6865 }
6866
6867
6868
6869
6870 group = child->groups;
6871 do {
6872 sg_inc_cpu_power(sd->groups, group->__cpu_power);
6873 group = group->next;
6874 } while (group != child->groups);
6875}
6876
6877
6878
6879
6880
6881
/*
 * SD_INIT(sd, type) expands to a call of the matching sd_init_<type>()
 * helper.  SD_INIT_FUNC(type) defines that helper: it zeroes *sd, assigns
 * the SD_<type>_INIT template to it and sets sd->level to SD_LV_<type>.
 */
#define SD_INIT(sd, type) sd_init_##type(sd)
#define SD_INIT_FUNC(type) \
static noinline void sd_init_##type(struct sched_domain *sd) \
{ \
	memset(sd, 0, sizeof(*sd)); \
	*sd = SD_##type##_INIT; \
	sd->level = SD_LV_##type; \
}

/* One initialiser per configured domain level. */
SD_INIT_FUNC(CPU)
#ifdef CONFIG_NUMA
 SD_INIT_FUNC(ALLNODES)
 SD_INIT_FUNC(NODE)
#endif
#ifdef CONFIG_SCHED_SMT
 SD_INIT_FUNC(SIBLING)
#endif
#ifdef CONFIG_SCHED_MC
 SD_INIT_FUNC(MC)
#endif
6902
6903
6904
6905
6906
6907
/*
 * Scratch cpumasks used by __build_sched_domains().  Individual members
 * are reached through SCHED_CPUMASK_VAR().
 */
struct allmasks {
	/* Must stay first: the struct pointer is cast to cpumask_t * for it. */
	cpumask_t tmpmask;
	/* Used one at a time by separate build loops, hence the union. */
	union {
		cpumask_t nodemask;
		cpumask_t this_sibling_map;
		cpumask_t this_core_map;
	};
	/* "covered" scratch mask passed to init_sched_build_groups(). */
	cpumask_t send_covered;

#ifdef CONFIG_NUMA
	/* Working masks for building the per-node group rings. */
	cpumask_t domainspan;
	cpumask_t covered;
	cpumask_t notcovered;
#endif
};
6923
/*
 * With more than 128 CPUs struct allmasks is allocated with kmalloc()
 * (SCHED_CPUMASK_DECLARE only declares the pointer and
 * SCHED_CPUMASK_FREE kfree()s it); otherwise it is an automatic variable
 * and SCHED_CPUMASK_FREE expands to nothing.
 */
#if NR_CPUS > 128
#define SCHED_CPUMASK_ALLOC 1
#define SCHED_CPUMASK_FREE(v) kfree(v)
#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
#else
#define SCHED_CPUMASK_ALLOC 0
#define SCHED_CPUMASK_FREE(v)
#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
#endif

/*
 * Declare a local cpumask_t pointer named v that points at member v of
 * the struct allmasks located at address a.
 */
#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
 ((unsigned long)(a) + offsetof(struct allmasks, v))
6936
6937static int default_relax_domain_level = -1;
6938
6939static int __init setup_relax_domain_level(char *str)
6940{
6941 unsigned long val;
6942
6943 val = simple_strtoul(str, NULL, 0);
6944 if (val < SD_LV_MAX)
6945 default_relax_domain_level = val;
6946
6947 return 1;
6948}
6949__setup("relax_domain_level=", setup_relax_domain_level);
6950
6951static void set_domain_attribute(struct sched_domain *sd,
6952 struct sched_domain_attr *attr)
6953{
6954 int request;
6955
6956 if (!attr || attr->relax_domain_level < 0) {
6957 if (default_relax_domain_level < 0)
6958 return;
6959 else
6960 request = default_relax_domain_level;
6961 } else
6962 request = attr->relax_domain_level;
6963 if (request < sd->level) {
6964
6965 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
6966 } else {
6967
6968 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
6969 }
6970}
6971
6972
6973
6974
6975
6976static int __build_sched_domains(const cpumask_t *cpu_map,
6977 struct sched_domain_attr *attr)
6978{
6979 int i;
6980 struct root_domain *rd;
6981 SCHED_CPUMASK_DECLARE(allmasks);
6982 cpumask_t *tmpmask;
6983#ifdef CONFIG_NUMA
6984 struct sched_group **sched_group_nodes = NULL;
6985 int sd_allnodes = 0;
6986
6987
6988
6989
6990 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
6991 GFP_KERNEL);
6992 if (!sched_group_nodes) {
6993 printk(KERN_WARNING "Can not alloc sched group node list\n");
6994 return -ENOMEM;
6995 }
6996#endif
6997
6998 rd = alloc_rootdomain();
6999 if (!rd) {
7000 printk(KERN_WARNING "Cannot alloc root domain\n");
7001#ifdef CONFIG_NUMA
7002 kfree(sched_group_nodes);
7003#endif
7004 return -ENOMEM;
7005 }
7006
7007#if SCHED_CPUMASK_ALLOC
7008
7009 allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
7010 if (!allmasks) {
7011 printk(KERN_WARNING "Cannot alloc cpumask array\n");
7012 kfree(rd);
7013#ifdef CONFIG_NUMA
7014 kfree(sched_group_nodes);
7015#endif
7016 return -ENOMEM;
7017 }
7018#endif
7019 tmpmask = (cpumask_t *)allmasks;
7020
7021
7022#ifdef CONFIG_NUMA
7023 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
7024#endif
7025
7026
7027
7028
7029 for_each_cpu_mask(i, *cpu_map) {
7030 struct sched_domain *sd = NULL, *p;
7031 SCHED_CPUMASK_VAR(nodemask, allmasks);
7032
7033 *nodemask = node_to_cpumask(cpu_to_node(i));
7034 cpus_and(*nodemask, *nodemask, *cpu_map);
7035
7036#ifdef CONFIG_NUMA
7037 if (cpus_weight(*cpu_map) >
7038 SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
7039 sd = &per_cpu(allnodes_domains, i);
7040 SD_INIT(sd, ALLNODES);
7041 set_domain_attribute(sd, attr);
7042 sd->span = *cpu_map;
7043 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
7044 p = sd;
7045 sd_allnodes = 1;
7046 } else
7047 p = NULL;
7048
7049 sd = &per_cpu(node_domains, i);
7050 SD_INIT(sd, NODE);
7051 set_domain_attribute(sd, attr);
7052 sched_domain_node_span(cpu_to_node(i), &sd->span);
7053 sd->parent = p;
7054 if (p)
7055 p->child = sd;
7056 cpus_and(sd->span, sd->span, *cpu_map);
7057#endif
7058
7059 p = sd;
7060 sd = &per_cpu(phys_domains, i);
7061 SD_INIT(sd, CPU);
7062 set_domain_attribute(sd, attr);
7063 sd->span = *nodemask;
7064 sd->parent = p;
7065 if (p)
7066 p->child = sd;
7067 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
7068
7069#ifdef CONFIG_SCHED_MC
7070 p = sd;
7071 sd = &per_cpu(core_domains, i);
7072 SD_INIT(sd, MC);
7073 set_domain_attribute(sd, attr);
7074 sd->span = cpu_coregroup_map(i);
7075 cpus_and(sd->span, sd->span, *cpu_map);
7076 sd->parent = p;
7077 p->child = sd;
7078 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
7079#endif
7080
7081#ifdef CONFIG_SCHED_SMT
7082 p = sd;
7083 sd = &per_cpu(cpu_domains, i);
7084 SD_INIT(sd, SIBLING);
7085 set_domain_attribute(sd, attr);
7086 sd->span = per_cpu(cpu_sibling_map, i);
7087 cpus_and(sd->span, sd->span, *cpu_map);
7088 sd->parent = p;
7089 p->child = sd;
7090 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
7091#endif
7092 }
7093
7094#ifdef CONFIG_SCHED_SMT
7095
7096 for_each_cpu_mask(i, *cpu_map) {
7097 SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
7098 SCHED_CPUMASK_VAR(send_covered, allmasks);
7099
7100 *this_sibling_map = per_cpu(cpu_sibling_map, i);
7101 cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
7102 if (i != first_cpu(*this_sibling_map))
7103 continue;
7104
7105 init_sched_build_groups(this_sibling_map, cpu_map,
7106 &cpu_to_cpu_group,
7107 send_covered, tmpmask);
7108 }
7109#endif
7110
7111#ifdef CONFIG_SCHED_MC
7112
7113 for_each_cpu_mask(i, *cpu_map) {
7114 SCHED_CPUMASK_VAR(this_core_map, allmasks);
7115 SCHED_CPUMASK_VAR(send_covered, allmasks);
7116
7117 *this_core_map = cpu_coregroup_map(i);
7118 cpus_and(*this_core_map, *this_core_map, *cpu_map);
7119 if (i != first_cpu(*this_core_map))
7120 continue;
7121
7122 init_sched_build_groups(this_core_map, cpu_map,
7123 &cpu_to_core_group,
7124 send_covered, tmpmask);
7125 }
7126#endif
7127
7128
7129 for (i = 0; i < MAX_NUMNODES; i++) {
7130 SCHED_CPUMASK_VAR(nodemask, allmasks);
7131 SCHED_CPUMASK_VAR(send_covered, allmasks);
7132
7133 *nodemask = node_to_cpumask(i);
7134 cpus_and(*nodemask, *nodemask, *cpu_map);
7135 if (cpus_empty(*nodemask))
7136 continue;
7137
7138 init_sched_build_groups(nodemask, cpu_map,
7139 &cpu_to_phys_group,
7140 send_covered, tmpmask);
7141 }
7142
7143#ifdef CONFIG_NUMA
7144
7145 if (sd_allnodes) {
7146 SCHED_CPUMASK_VAR(send_covered, allmasks);
7147
7148 init_sched_build_groups(cpu_map, cpu_map,
7149 &cpu_to_allnodes_group,
7150 send_covered, tmpmask);
7151 }
7152
7153 for (i = 0; i < MAX_NUMNODES; i++) {
7154
7155 struct sched_group *sg, *prev;
7156 SCHED_CPUMASK_VAR(nodemask, allmasks);
7157 SCHED_CPUMASK_VAR(domainspan, allmasks);
7158 SCHED_CPUMASK_VAR(covered, allmasks);
7159 int j;
7160
7161 *nodemask = node_to_cpumask(i);
7162 cpus_clear(*covered);
7163
7164 cpus_and(*nodemask, *nodemask, *cpu_map);
7165 if (cpus_empty(*nodemask)) {
7166 sched_group_nodes[i] = NULL;
7167 continue;
7168 }
7169
7170 sched_domain_node_span(i, domainspan);
7171 cpus_and(*domainspan, *domainspan, *cpu_map);
7172
7173 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
7174 if (!sg) {
7175 printk(KERN_WARNING "Can not alloc domain group for "
7176 "node %d\n", i);
7177 goto error;
7178 }
7179 sched_group_nodes[i] = sg;
7180 for_each_cpu_mask(j, *nodemask) {
7181 struct sched_domain *sd;
7182
7183 sd = &per_cpu(node_domains, j);
7184 sd->groups = sg;
7185 }
7186 sg->__cpu_power = 0;
7187 sg->cpumask = *nodemask;
7188 sg->next = sg;
7189 cpus_or(*covered, *covered, *nodemask);
7190 prev = sg;
7191
7192 for (j = 0; j < MAX_NUMNODES; j++) {
7193 SCHED_CPUMASK_VAR(notcovered, allmasks);
7194 int n = (i + j) % MAX_NUMNODES;
7195 node_to_cpumask_ptr(pnodemask, n);
7196
7197 cpus_complement(*notcovered, *covered);
7198 cpus_and(*tmpmask, *notcovered, *cpu_map);
7199 cpus_and(*tmpmask, *tmpmask, *domainspan);
7200 if (cpus_empty(*tmpmask))
7201 break;
7202
7203 cpus_and(*tmpmask, *tmpmask, *pnodemask);
7204 if (cpus_empty(*tmpmask))
7205 continue;
7206
7207 sg = kmalloc_node(sizeof(struct sched_group),
7208 GFP_KERNEL, i);
7209 if (!sg) {
7210 printk(KERN_WARNING
7211 "Can not alloc domain group for node %d\n", j);
7212 goto error;
7213 }
7214 sg->__cpu_power = 0;
7215 sg->cpumask = *tmpmask;
7216 sg->next = prev->next;
7217 cpus_or(*covered, *covered, *tmpmask);
7218 prev->next = sg;
7219 prev = sg;
7220 }
7221 }
7222#endif
7223
7224
7225#ifdef CONFIG_SCHED_SMT
7226 for_each_cpu_mask(i, *cpu_map) {
7227 struct sched_domain *sd = &per_cpu(cpu_domains, i);
7228
7229 init_sched_groups_power(i, sd);
7230 }
7231#endif
7232#ifdef CONFIG_SCHED_MC
7233 for_each_cpu_mask(i, *cpu_map) {
7234 struct sched_domain *sd = &per_cpu(core_domains, i);
7235
7236 init_sched_groups_power(i, sd);
7237 }
7238#endif
7239
7240 for_each_cpu_mask(i, *cpu_map) {
7241 struct sched_domain *sd = &per_cpu(phys_domains, i);
7242
7243 init_sched_groups_power(i, sd);
7244 }
7245
7246#ifdef CONFIG_NUMA
7247 for (i = 0; i < MAX_NUMNODES; i++)
7248 init_numa_sched_groups_power(sched_group_nodes[i]);
7249
7250 if (sd_allnodes) {
7251 struct sched_group *sg;
7252
7253 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
7254 tmpmask);
7255 init_numa_sched_groups_power(sg);
7256 }
7257#endif
7258
7259
7260 for_each_cpu_mask(i, *cpu_map) {
7261 struct sched_domain *sd;
7262#ifdef CONFIG_SCHED_SMT
7263 sd = &per_cpu(cpu_domains, i);
7264#elif defined(CONFIG_SCHED_MC)
7265 sd = &per_cpu(core_domains, i);
7266#else
7267 sd = &per_cpu(phys_domains, i);
7268#endif
7269 cpu_attach_domain(sd, rd, i);
7270 }
7271
7272 SCHED_CPUMASK_FREE((void *)allmasks);
7273 return 0;
7274
7275#ifdef CONFIG_NUMA
7276error:
7277 free_sched_groups(cpu_map, tmpmask);
7278 SCHED_CPUMASK_FREE((void *)allmasks);
7279 return -ENOMEM;
7280#endif
7281}
7282
7283static int build_sched_domains(const cpumask_t *cpu_map)
7284{
7285 return __build_sched_domains(cpu_map, NULL);
7286}
7287
/* Current sched domain partitioning: array of CPU masks ... */
static cpumask_t *doms_cur;
/* ... number of entries in doms_cur ... */
static int ndoms_cur;
/* ... and the matching attribute array (may be NULL). */
static struct sched_domain_attr *dattr_cur;

/*
 * Statically allocated single-entry partition.  doms_cur points here when
 * no dynamically allocated array is in use (allocation failure, or after
 * free_sched_domains()); it is never passed to kfree().
 */
static cpumask_t fallback_doms;
7299
/*
 * Weak default hook, called by arch_init_sched_domains() before domains
 * are built.  Does nothing unless a strong definition overrides it.
 */
void __attribute__((weak)) arch_update_cpu_topology(void)
{
	return;
}
7303
7304
7305
7306
7307
7308static void free_sched_domains(void)
7309{
7310 ndoms_cur = 0;
7311 if (doms_cur != &fallback_doms)
7312 kfree(doms_cur);
7313 doms_cur = &fallback_doms;
7314}
7315
7316
7317
7318
7319
7320
7321static int arch_init_sched_domains(const cpumask_t *cpu_map)
7322{
7323 int err;
7324
7325 arch_update_cpu_topology();
7326 ndoms_cur = 1;
7327 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
7328 if (!doms_cur)
7329 doms_cur = &fallback_doms;
7330 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
7331 dattr_cur = NULL;
7332 err = build_sched_domains(doms_cur);
7333 register_sched_domain_sysctl();
7334
7335 return err;
7336}
7337
7338static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
7339 cpumask_t *tmpmask)
7340{
7341 free_sched_groups(cpu_map, tmpmask);
7342}
7343
7344
7345
7346
7347
7348static void detach_destroy_domains(const cpumask_t *cpu_map)
7349{
7350 cpumask_t tmpmask;
7351 int i;
7352
7353 unregister_sched_domain_sysctl();
7354
7355 for_each_cpu_mask(i, *cpu_map)
7356 cpu_attach_domain(NULL, &def_root_domain, i);
7357 synchronize_sched();
7358 arch_destroy_sched_domains(cpu_map, &tmpmask);
7359}
7360
7361
7362static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7363 struct sched_domain_attr *new, int idx_new)
7364{
7365 struct sched_domain_attr tmp;
7366
7367
7368 if (!new && !cur)
7369 return 1;
7370
7371 tmp = SD_ATTR_INIT;
7372 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7373 new ? (new + idx_new) : &tmp,
7374 sizeof(struct sched_domain_attr));
7375}
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7399 struct sched_domain_attr *dattr_new)
7400{
7401 int i, j;
7402
7403 mutex_lock(&sched_domains_mutex);
7404
7405
7406 unregister_sched_domain_sysctl();
7407
7408 if (doms_new == NULL) {
7409 ndoms_new = 1;
7410 doms_new = &fallback_doms;
7411 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7412 dattr_new = NULL;
7413 }
7414
7415
7416 for (i = 0; i < ndoms_cur; i++) {
7417 for (j = 0; j < ndoms_new; j++) {
7418 if (cpus_equal(doms_cur[i], doms_new[j])
7419 && dattrs_equal(dattr_cur, i, dattr_new, j))
7420 goto match1;
7421 }
7422
7423 detach_destroy_domains(doms_cur + i);
7424match1:
7425 ;
7426 }
7427
7428
7429 for (i = 0; i < ndoms_new; i++) {
7430 for (j = 0; j < ndoms_cur; j++) {
7431 if (cpus_equal(doms_new[i], doms_cur[j])
7432 && dattrs_equal(dattr_new, i, dattr_cur, j))
7433 goto match2;
7434 }
7435
7436 __build_sched_domains(doms_new + i,
7437 dattr_new ? dattr_new + i : NULL);
7438match2:
7439 ;
7440 }
7441
7442
7443 if (doms_cur != &fallback_doms)
7444 kfree(doms_cur);
7445 kfree(dattr_cur);
7446 doms_cur = doms_new;
7447 dattr_cur = dattr_new;
7448 ndoms_cur = ndoms_new;
7449
7450 register_sched_domain_sysctl();
7451
7452 mutex_unlock(&sched_domains_mutex);
7453}
7454
7455#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7456int arch_reinit_sched_domains(void)
7457{
7458 int err;
7459
7460 get_online_cpus();
7461 mutex_lock(&sched_domains_mutex);
7462 detach_destroy_domains(&cpu_online_map);
7463 free_sched_domains();
7464 err = arch_init_sched_domains(&cpu_online_map);
7465 mutex_unlock(&sched_domains_mutex);
7466 put_online_cpus();
7467
7468 return err;
7469}
7470
7471static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7472{
7473 int ret;
7474
7475 if (buf[0] != '0' && buf[0] != '1')
7476 return -EINVAL;
7477
7478 if (smt)
7479 sched_smt_power_savings = (buf[0] == '1');
7480 else
7481 sched_mc_power_savings = (buf[0] == '1');
7482
7483 ret = arch_reinit_sched_domains();
7484
7485 return ret ? ret : count;
7486}
7487
7488#ifdef CONFIG_SCHED_MC
7489static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
7490{
7491 return sprintf(page, "%u\n", sched_mc_power_savings);
7492}
7493static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
7494 const char *buf, size_t count)
7495{
7496 return sched_power_savings_store(buf, count, 0);
7497}
7498static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
7499 sched_mc_power_savings_store);
7500#endif
7501
7502#ifdef CONFIG_SCHED_SMT
7503static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
7504{
7505 return sprintf(page, "%u\n", sched_smt_power_savings);
7506}
7507static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
7508 const char *buf, size_t count)
7509{
7510 return sched_power_savings_store(buf, count, 1);
7511}
7512static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
7513 sched_smt_power_savings_store);
7514#endif
7515
/*
 * Create the power-savings sysfs files under @cls.
 *
 * The SMT file is created when smt_capable(); the MC file when
 * mc_capable() and no earlier error occurred.  Returns 0 on success or
 * the first sysfs_create_file() error.
 */
int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
{
	int ret = 0;

#ifdef CONFIG_SCHED_SMT
	if (smt_capable()) {
		ret = sysfs_create_file(&cls->kset.kobj,
					&attr_sched_smt_power_savings.attr);
	}
#endif
#ifdef CONFIG_SCHED_MC
	if (!ret && mc_capable()) {
		ret = sysfs_create_file(&cls->kset.kobj,
					&attr_sched_mc_power_savings.attr);
	}
#endif
	return ret;
}
7532#endif
7533
7534
7535
7536
7537
7538
7539
7540static int update_sched_domains(struct notifier_block *nfb,
7541 unsigned long action, void *hcpu)
7542{
7543 switch (action) {
7544 case CPU_UP_PREPARE:
7545 case CPU_UP_PREPARE_FROZEN:
7546 case CPU_DOWN_PREPARE:
7547 case CPU_DOWN_PREPARE_FROZEN:
7548 detach_destroy_domains(&cpu_online_map);
7549 free_sched_domains();
7550 return NOTIFY_OK;
7551
7552 case CPU_UP_CANCELED:
7553 case CPU_UP_CANCELED_FROZEN:
7554 case CPU_DOWN_FAILED:
7555 case CPU_DOWN_FAILED_FROZEN:
7556 case CPU_ONLINE:
7557 case CPU_ONLINE_FROZEN:
7558 case CPU_DEAD:
7559 case CPU_DEAD_FROZEN:
7560
7561
7562
7563 break;
7564 default:
7565 return NOTIFY_DONE;
7566 }
7567
7568#ifndef CONFIG_CPUSETS
7569
7570
7571
7572
7573
7574
7575
7576 arch_init_sched_domains(&cpu_online_map);
7577#endif
7578
7579 return NOTIFY_OK;
7580}
7581
7582void __init sched_init_smp(void)
7583{
7584 cpumask_t non_isolated_cpus;
7585
7586#if defined(CONFIG_NUMA)
7587 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7588 GFP_KERNEL);
7589 BUG_ON(sched_group_nodes_bycpu == NULL);
7590#endif
7591 get_online_cpus();
7592 mutex_lock(&sched_domains_mutex);
7593 arch_init_sched_domains(&cpu_online_map);
7594 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
7595 if (cpus_empty(non_isolated_cpus))
7596 cpu_set(smp_processor_id(), non_isolated_cpus);
7597 mutex_unlock(&sched_domains_mutex);
7598 put_online_cpus();
7599
7600 hotcpu_notifier(update_sched_domains, 0);
7601 init_hrtick();
7602
7603
7604 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
7605 BUG();
7606 sched_init_granularity();
7607}
7608#else
7609void __init sched_init_smp(void)
7610{
7611 sched_init_granularity();
7612}
7613#endif
7614
7615int in_sched_functions(unsigned long addr)
7616{
7617 return in_lock_functions(addr) ||
7618 (addr >= (unsigned long)__sched_text_start
7619 && addr < (unsigned long)__sched_text_end);
7620}
7621
7622static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7623{
7624 cfs_rq->tasks_timeline = RB_ROOT;
7625 INIT_LIST_HEAD(&cfs_rq->tasks);
7626#ifdef CONFIG_FAIR_GROUP_SCHED
7627 cfs_rq->rq = rq;
7628#endif
7629 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7630}
7631
7632static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7633{
7634 struct rt_prio_array *array;
7635 int i;
7636
7637 array = &rt_rq->active;
7638 for (i = 0; i < MAX_RT_PRIO; i++) {
7639 INIT_LIST_HEAD(array->queue + i);
7640 __clear_bit(i, array->bitmap);
7641 }
7642
7643 __set_bit(MAX_RT_PRIO, array->bitmap);
7644
7645#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
7646 rt_rq->highest_prio = MAX_RT_PRIO;
7647#endif
7648#ifdef CONFIG_SMP
7649 rt_rq->rt_nr_migratory = 0;
7650 rt_rq->overloaded = 0;
7651#endif
7652
7653 rt_rq->rt_time = 0;
7654 rt_rq->rt_throttled = 0;
7655 rt_rq->rt_runtime = 0;
7656 spin_lock_init(&rt_rq->rt_runtime_lock);
7657
7658#ifdef CONFIG_RT_GROUP_SCHED
7659 rt_rq->rt_nr_boosted = 0;
7660 rt_rq->rq = rq;
7661#endif
7662}
7663
7664#ifdef CONFIG_FAIR_GROUP_SCHED
7665static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7666 struct sched_entity *se, int cpu, int add,
7667 struct sched_entity *parent)
7668{
7669 struct rq *rq = cpu_rq(cpu);
7670 tg->cfs_rq[cpu] = cfs_rq;
7671 init_cfs_rq(cfs_rq, rq);
7672 cfs_rq->tg = tg;
7673 if (add)
7674 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7675
7676 tg->se[cpu] = se;
7677
7678 if (!se)
7679 return;
7680
7681 if (!parent)
7682 se->cfs_rq = &rq->cfs;
7683 else
7684 se->cfs_rq = parent->my_q;
7685
7686 se->my_q = cfs_rq;
7687 se->load.weight = tg->shares;
7688 se->load.inv_weight = 0;
7689 se->parent = parent;
7690}
7691#endif
7692
7693#ifdef CONFIG_RT_GROUP_SCHED
7694static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7695 struct sched_rt_entity *rt_se, int cpu, int add,
7696 struct sched_rt_entity *parent)
7697{
7698 struct rq *rq = cpu_rq(cpu);
7699
7700 tg->rt_rq[cpu] = rt_rq;
7701 init_rt_rq(rt_rq, rq);
7702 rt_rq->tg = tg;
7703 rt_rq->rt_se = rt_se;
7704 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7705 if (add)
7706 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7707
7708 tg->rt_se[cpu] = rt_se;
7709 if (!rt_se)
7710 return;
7711
7712 if (!parent)
7713 rt_se->rt_rq = &rq->rt;
7714 else
7715 rt_se->rt_rq = parent->my_q;
7716
7717 rt_se->my_q = rt_rq;
7718 rt_se->parent = parent;
7719 INIT_LIST_HEAD(&rt_se->run_list);
7720}
7721#endif
7722
/*
 * sched_init - boot-time initialisation of the scheduler's static state.
 *
 * In order, this function:
 *  - allocates (from bootmem) and hands out the per-CPU pointer arrays of
 *    init_task_group and, with CONFIG_USER_SCHED, root_task_group;
 *  - initialises the default RT bandwidth and the group RT bandwidths;
 *  - links init_task_group into the task_groups list (and under
 *    root_task_group with CONFIG_USER_SCHED);
 *  - initialises every possible CPU's runqueue;
 *  - finishes set-up of init_task / current and sets scheduler_running.
 */
void __init sched_init(void)
{
	int i, j;
	unsigned long alloc_size = 0, ptr;

	/*
	 * Size of the pointer arrays: each enabled group class needs two
	 * arrays of nr_cpu_ids pointers (entities and runqueues).
	 */
#ifdef CONFIG_FAIR_GROUP_SCHED
	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_USER_SCHED
	/* a second set of arrays is needed for root_task_group */
	alloc_size *= 2;
#endif

	/*
	 * alloc_size stays 0 when no group scheduling class is configured, in
	 * which case nothing is allocated. Otherwise one bootmem block is
	 * carved up below, advancing ptr by one array at a time.
	 */
	if (alloc_size) {
		ptr = (unsigned long)alloc_bootmem(alloc_size);

#ifdef CONFIG_FAIR_GROUP_SCHED
		init_task_group.se = (struct sched_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		init_task_group.cfs_rq = (struct cfs_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

#ifdef CONFIG_USER_SCHED
		root_task_group.se = (struct sched_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);
#endif
#endif
#ifdef CONFIG_RT_GROUP_SCHED
		init_task_group.rt_se = (struct sched_rt_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		init_task_group.rt_rq = (struct rt_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

#ifdef CONFIG_USER_SCHED
		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		root_task_group.rt_rq = (struct rt_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);
#endif
#endif
	}

#ifdef CONFIG_SMP
	init_defrootdomain();
#endif

	/* default RT bandwidth comes from the global period/runtime values */
	init_rt_bandwidth(&def_rt_bandwidth,
			global_rt_period(), global_rt_runtime());

#ifdef CONFIG_RT_GROUP_SCHED
	init_rt_bandwidth(&init_task_group.rt_bandwidth,
			global_rt_period(), global_rt_runtime());
#ifdef CONFIG_USER_SCHED
	/* root_task_group gets the global period with RUNTIME_INF runtime */
	init_rt_bandwidth(&root_task_group.rt_bandwidth,
			global_rt_period(), RUNTIME_INF);
#endif
#endif

#ifdef CONFIG_GROUP_SCHED
	list_add(&init_task_group.list, &task_groups);
	INIT_LIST_HEAD(&init_task_group.children);

#ifdef CONFIG_USER_SCHED
	/* init_task_group becomes a child of root_task_group */
	INIT_LIST_HEAD(&root_task_group.children);
	init_task_group.parent = &root_task_group;
	list_add(&init_task_group.siblings, &root_task_group.children);
#endif
#endif

	/* per-CPU runqueue initialisation */
	for_each_possible_cpu(i) {
		struct rq *rq;

		rq = cpu_rq(i);
		spin_lock_init(&rq->lock);
		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
		rq->nr_running = 0;
		init_cfs_rq(&rq->cfs, rq);
		init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
		/* note: assigned on every loop iteration, always the same value */
		init_task_group.shares = init_task_group_load;
		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
#ifdef CONFIG_CGROUP_SCHED
		/*
		 * With CONFIG_CGROUP_SCHED, init_task_group uses the CPU's own
		 * rq->cfs directly, has no scheduling entity (se == NULL) and
		 * no parent entity, and is added to the leaf list (add == 1).
		 */
		init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
#elif defined CONFIG_USER_SCHED
		/*
		 * With CONFIG_USER_SCHED, root_task_group takes the CPU's own
		 * rq->cfs (no entity, not added to the leaf list) ...
		 */
		root_task_group.shares = NICE_0_LOAD;
		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
		/*
		 * ... and init_task_group gets the per-CPU init_cfs_rq and
		 * init_sched_entity. The parent passed is root_task_group.se[i],
		 * which the call above has just set to NULL, so the entity is
		 * queued on rq->cfs.
		 */
		init_tg_cfs_entry(&init_task_group,
				&per_cpu(init_cfs_rq, i),
				&per_cpu(init_sched_entity, i), i, 1,
				root_task_group.se[i]);

#endif
#endif /* CONFIG_FAIR_GROUP_SCHED */

		/* init_rt_rq() left rt_runtime at 0; apply the default budget */
		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
#ifdef CONFIG_CGROUP_SCHED
		init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
#elif defined CONFIG_USER_SCHED
		/* same layout as the fair-class set-up above, for the RT class */
		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
		init_tg_rt_entry(&init_task_group,
				&per_cpu(init_rt_rq, i),
				&per_cpu(init_sched_rt_entity, i), i, 1,
				root_task_group.rt_se[i]);
#endif
#endif

		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
			rq->cpu_load[j] = 0;
#ifdef CONFIG_SMP
		rq->sd = NULL;
		rq->rd = NULL;
		rq->active_balance = 0;
		rq->next_balance = jiffies;
		rq->push_cpu = 0;
		rq->cpu = i;
		rq->migration_thread = NULL;
		INIT_LIST_HEAD(&rq->migration_queue);
		/* every runqueue starts out attached to the default root domain */
		rq_attach_root(rq, &def_root_domain);
#endif
		init_rq_hrtick(rq);
		atomic_set(&rq->nr_iowait, 0);
	}

	set_load_weight(&init_task);

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif

#ifdef CONFIG_SMP
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
#endif

#ifdef CONFIG_RT_MUTEXES
	plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
#endif

	/*
	 * Take a reference on init_mm and put the current task into
	 * lazy-TLB mode on it.
	 */
	atomic_inc(&init_mm.mm_count);
	enter_lazy_tlb(&init_mm, current);

	/*
	 * Register the currently running task as the idle task of the
	 * CPU we are executing on.
	 */
	init_idle(current, smp_processor_id());

	/*
	 * Switch the current task to the fair scheduling class.
	 * NOTE(review): presumably so early boot code runs as a normal
	 * task even though init_idle() was just called on it - confirm.
	 */
	current->sched_class = &fair_sched_class;

	scheduler_running = 1;
}
7923
7924#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
/*
 * __might_sleep - complain when a possibly-sleeping function is called
 * from a context that must not sleep.
 * @file: source file name of the call site (used only for the message)
 * @line: source line of the call site
 *
 * A report is printed when the caller is atomic or has interrupts disabled,
 * the system is in SYSTEM_RUNNING state and no oops is in progress. Reports
 * are rate-limited to one per HZ jiffies through prev_jiffy. The whole body
 * compiles away when in_atomic is not defined as a macro.
 */
void __might_sleep(char *file, int line)
{
#ifdef in_atomic
	static unsigned long prev_jiffy;	/* time of the last report */

	if ((in_atomic() || irqs_disabled()) &&
	    system_state == SYSTEM_RUNNING && !oops_in_progress) {
		/* prev_jiffy == 0 means nothing has been reported yet */
		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
			return;
		prev_jiffy = jiffies;
		printk(KERN_ERR "BUG: sleeping function called from invalid"
				" context at %s:%d\n", file, line);
		/*
		 * Give the detail line an explicit level as well, so both
		 * lines of the report are logged at the same severity instead
		 * of the second one falling back to the default level.
		 */
		printk(KERN_ERR "in_atomic():%d, irqs_disabled():%d\n",
				in_atomic(), irqs_disabled());
		debug_show_held_locks(current);
		if (irqs_disabled())
			print_irqtrace_events(current);
		dump_stack();
	}
#endif
}
EXPORT_SYMBOL(__might_sleep);
7947#endif
7948
7949#ifdef CONFIG_MAGIC_SYSRQ
7950static void normalize_task(struct rq *rq, struct task_struct *p)
7951{
7952 int on_rq;
7953
7954 update_rq_clock(rq);
7955 on_rq = p->se.on_rq;
7956 if (on_rq)
7957 deactivate_task(rq, p, 0);
7958 __setscheduler(rq, p, SCHED_NORMAL, 0);
7959 if (on_rq) {
7960 activate_task(rq, p, 0);
7961 resched_task(rq->curr);
7962 }
7963}
7964
7965void normalize_rt_tasks(void)
7966{
7967 struct task_struct *g, *p;
7968 unsigned long flags;
7969 struct rq *rq;
7970
7971 read_lock_irqsave(&tasklist_lock, flags);
7972 do_each_thread(g, p) {
7973
7974
7975
7976 if (!p->mm)
7977 continue;
7978
7979 p->se.exec_start = 0;
7980#ifdef CONFIG_SCHEDSTATS
7981 p->se.wait_start = 0;
7982 p->se.sleep_start = 0;
7983 p->se.block_start = 0;
7984#endif
7985
7986 if (!rt_task(p)) {
7987
7988
7989
7990
7991 if (TASK_NICE(p) < 0 && p->mm)
7992 set_user_nice(p, 0);
7993 continue;
7994 }
7995
7996 spin_lock(&p->pi_lock);
7997 rq = __task_rq_lock(p);
7998
7999 normalize_task(rq, p);
8000
8001 __task_rq_unlock(rq);
8002 spin_unlock(&p->pi_lock);
8003 } while_each_thread(g, p);
8004
8005 read_unlock_irqrestore(&tasklist_lock, flags);
8006}
8007
8008#endif
8009
8010#ifdef CONFIG_IA64
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
/*
 * curr_task - return the task recorded as current on @cpu.
 * @cpu: CPU to query
 *
 * Thin accessor around cpu_curr(); only built when CONFIG_IA64 is set.
 * No locking is done here.
 */
struct task_struct *curr_task(int cpu)
{
	return cpu_curr(cpu);
}
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
/*
 * set_curr_task - overwrite the task recorded as current on @cpu.
 * @cpu: CPU whose current-task pointer is changed
 * @p:   task to record
 *
 * Plain assignment to cpu_curr(); only built when CONFIG_IA64 is set.
 * No locking or scheduler-state update is done here.
 * NOTE(review): the caller is presumably responsible for making this
 * safe - confirm.
 */
void set_curr_task(int cpu, struct task_struct *p)
{
	cpu_curr(cpu) = p;
}
8051
8052#endif
8053
8054#ifdef CONFIG_FAIR_GROUP_SCHED
8055static void free_fair_sched_group(struct task_group *tg)
8056{
8057 int i;
8058
8059 for_each_possible_cpu(i) {
8060 if (tg->cfs_rq)
8061 kfree(tg->cfs_rq[i]);
8062 if (tg->se)
8063 kfree(tg->se[i]);
8064 }
8065
8066 kfree(tg->cfs_rq);
8067 kfree(tg->se);
8068}
8069
8070static
8071int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8072{
8073 struct cfs_rq *cfs_rq;
8074 struct sched_entity *se, *parent_se;
8075 struct rq *rq;
8076 int i;
8077
8078 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8079 if (!tg->cfs_rq)
8080 goto err;
8081 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8082 if (!tg->se)
8083 goto err;
8084
8085 tg->shares = NICE_0_LOAD;
8086
8087 for_each_possible_cpu(i) {
8088 rq = cpu_rq(i);
8089
8090 cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
8091 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
8092 if (!cfs_rq)
8093 goto err;
8094
8095 se = kmalloc_node(sizeof(struct sched_entity),
8096 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
8097 if (!se)
8098 goto err;
8099
8100 parent_se = parent ? parent->se[i] : NULL;
8101 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
8102 }
8103
8104 return 1;
8105
8106 err:
8107 return 0;
8108}
8109
/*
 * Link the group's CFS runqueue for @cpu into that CPU's leaf_cfs_rq_list
 * using the RCU list primitive. No lock is taken here; the callers visible
 * in this file hold task_group_lock.
 */
static inline void register_fair_sched_group(struct task_group *tg, int cpu)
{
	list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
			&cpu_rq(cpu)->leaf_cfs_rq_list);
}

/*
 * Unlink the group's CFS runqueue for @cpu from the leaf list it is on,
 * using the RCU list primitive. No lock is taken here; the callers visible
 * in this file hold task_group_lock.
 */
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
	list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
}
8120#else
/*
 * Stubs used when CONFIG_FAIR_GROUP_SCHED is not set: there is nothing to
 * free, allocate or (un)register, so callers need no #ifdefs.
 */
static inline void free_fair_sched_group(struct task_group *tg)
{
}

/* Always reports success (1), matching the real function's convention. */
static inline
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

static inline void register_fair_sched_group(struct task_group *tg, int cpu)
{
}

static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
}
8138#endif
8139
8140#ifdef CONFIG_RT_GROUP_SCHED
8141static void free_rt_sched_group(struct task_group *tg)
8142{
8143 int i;
8144
8145 destroy_rt_bandwidth(&tg->rt_bandwidth);
8146
8147 for_each_possible_cpu(i) {
8148 if (tg->rt_rq)
8149 kfree(tg->rt_rq[i]);
8150 if (tg->rt_se)
8151 kfree(tg->rt_se[i]);
8152 }
8153
8154 kfree(tg->rt_rq);
8155 kfree(tg->rt_se);
8156}
8157
8158static
8159int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8160{
8161 struct rt_rq *rt_rq;
8162 struct sched_rt_entity *rt_se, *parent_se;
8163 struct rq *rq;
8164 int i;
8165
8166 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8167 if (!tg->rt_rq)
8168 goto err;
8169 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8170 if (!tg->rt_se)
8171 goto err;
8172
8173 init_rt_bandwidth(&tg->rt_bandwidth,
8174 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8175
8176 for_each_possible_cpu(i) {
8177 rq = cpu_rq(i);
8178
8179 rt_rq = kmalloc_node(sizeof(struct rt_rq),
8180 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
8181 if (!rt_rq)
8182 goto err;
8183
8184 rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
8185 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
8186 if (!rt_se)
8187 goto err;
8188
8189 parent_se = parent ? parent->rt_se[i] : NULL;
8190 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
8191 }
8192
8193 return 1;
8194
8195 err:
8196 return 0;
8197}
8198
/*
 * Link the group's RT runqueue for @cpu into that CPU's leaf_rt_rq_list
 * using the RCU list primitive. No lock is taken here; the callers visible
 * in this file hold task_group_lock.
 */
static inline void register_rt_sched_group(struct task_group *tg, int cpu)
{
	list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
			&cpu_rq(cpu)->leaf_rt_rq_list);
}

/*
 * Unlink the group's RT runqueue for @cpu from the leaf list it is on,
 * using the RCU list primitive. No lock is taken here; the callers visible
 * in this file hold task_group_lock.
 */
static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
{
	list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
}
8209#else
/*
 * Stubs used when CONFIG_RT_GROUP_SCHED is not set: there is nothing to
 * free, allocate or (un)register, so callers need no #ifdefs.
 */
static inline void free_rt_sched_group(struct task_group *tg)
{
}

/* Always reports success (1), matching the real function's convention. */
static inline
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

static inline void register_rt_sched_group(struct task_group *tg, int cpu)
{
}

static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
{
}
8227#endif
8228
8229#ifdef CONFIG_GROUP_SCHED
/*
 * free_sched_group - release everything owned by @tg, then @tg itself.
 *
 * Also used on the error path of sched_create_group(), so it must cope with
 * a partially constructed group.
 */
static void free_sched_group(struct task_group *tg)
{
	free_fair_sched_group(tg);
	free_rt_sched_group(tg);
	kfree(tg);
}
8236
8237
8238struct task_group *sched_create_group(struct task_group *parent)
8239{
8240 struct task_group *tg;
8241 unsigned long flags;
8242 int i;
8243
8244 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8245 if (!tg)
8246 return ERR_PTR(-ENOMEM);
8247
8248 if (!alloc_fair_sched_group(tg, parent))
8249 goto err;
8250
8251 if (!alloc_rt_sched_group(tg, parent))
8252 goto err;
8253
8254 spin_lock_irqsave(&task_group_lock, flags);
8255 for_each_possible_cpu(i) {
8256 register_fair_sched_group(tg, i);
8257 register_rt_sched_group(tg, i);
8258 }
8259 list_add_rcu(&tg->list, &task_groups);
8260
8261 WARN_ON(!parent);
8262
8263 tg->parent = parent;
8264 list_add_rcu(&tg->siblings, &parent->children);
8265 INIT_LIST_HEAD(&tg->children);
8266 spin_unlock_irqrestore(&task_group_lock, flags);
8267
8268 return tg;
8269
8270err:
8271 free_sched_group(tg);
8272 return ERR_PTR(-ENOMEM);
8273}
8274
8275
/*
 * RCU callback (queued by sched_destroy_group() via call_rcu()): recover the
 * task_group that embeds @rhp and free it.
 */
static void free_sched_group_rcu(struct rcu_head *rhp)
{
	/* @rhp is the 'rcu' member embedded in struct task_group */
	free_sched_group(container_of(rhp, struct task_group, rcu));
}
8281
8282
/*
 * sched_destroy_group - unregister a task group and free it after an RCU
 * grace period.
 * @tg: group to destroy
 *
 * Under task_group_lock the group's per-CPU runqueues are removed from the
 * leaf lists and the group is unlinked from the global and sibling lists.
 * The memory itself is released later by free_sched_group_rcu().
 */
void sched_destroy_group(struct task_group *tg)
{
	unsigned long flags;
	int i;

	spin_lock_irqsave(&task_group_lock, flags);
	for_each_possible_cpu(i) {
		unregister_fair_sched_group(tg, i);
		unregister_rt_sched_group(tg, i);
	}
	list_del_rcu(&tg->list);
	list_del_rcu(&tg->siblings);
	spin_unlock_irqrestore(&task_group_lock, flags);

	/* defer the actual free: RCU readers may still be looking at @tg */
	call_rcu(&tg->rcu, free_sched_group_rcu);
}
8300
8301
8302
8303
8304
8305
/*
 * sched_move_task - re-home @tsk after its task group has changed.
 * @tsk: task whose group membership changed
 *
 * With the task's runqueue locked, the task is dequeued (if queued) and
 * put_prev_task()'d (if currently running), set_task_rq() is called for the
 * task's CPU, and the task is then restored to its previous queued/running
 * state.
 * NOTE(review): set_task_rq() presumably re-points the task at its new
 * group's per-CPU queues - confirm.
 */
void sched_move_task(struct task_struct *tsk)
{
	int on_rq, running;
	unsigned long flags;
	struct rq *rq;

	rq = task_rq_lock(tsk, &flags);

	update_rq_clock(rq);

	/* remember the state so it can be restored after the switch */
	running = task_current(rq, tsk);
	on_rq = tsk->se.on_rq;

	if (on_rq)
		dequeue_task(rq, tsk, 0);
	if (unlikely(running))
		tsk->sched_class->put_prev_task(rq, tsk);

	set_task_rq(tsk, task_cpu(tsk));

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* optional per-class hook, called only if the class provides it */
	if (tsk->sched_class->moved_group)
		tsk->sched_class->moved_group(tsk);
#endif

	if (unlikely(running))
		tsk->sched_class->set_curr_task(rq);
	if (on_rq)
		enqueue_task(rq, tsk, 0);

	task_rq_unlock(rq, &flags);
}
8338#endif
8339
8340#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * set_se_shares - change the load weight of one group scheduling entity.
 * @se:     entity to update
 * @shares: new weight
 *
 * Runs under the lock of the runqueue that @se's cfs_rq belongs to, taken
 * with spin_lock_irq(), so this must not be called with interrupts already
 * disabled. A queued entity is dequeued around the change and enqueued again
 * afterwards.
 */
static void set_se_shares(struct sched_entity *se, unsigned long shares)
{
	struct cfs_rq *cfs_rq = se->cfs_rq;
	struct rq *rq = cfs_rq->rq;
	int on_rq;

	spin_lock_irq(&rq->lock);

	on_rq = se->on_rq;
	if (on_rq)
		dequeue_entity(cfs_rq, se, 0);

	se->load.weight = shares;
	/* reset the inverse weight together with the weight */
	se->load.inv_weight = 0;

	if (on_rq)
		enqueue_entity(cfs_rq, se, 0);

	spin_unlock_irq(&rq->lock);
}
8361
/* Taken by sched_group_set_shares() around a change of a group's shares. */
static DEFINE_MUTEX(shares_mutex);
8363
/*
 * sched_group_set_shares - set the CPU shares of a task group.
 * @tg:     group to change
 * @shares: requested weight; clamped to [MIN_SHARES, MAX_SHARES]
 *
 * Returns -EINVAL for a group without a scheduling entity on CPU 0,
 * otherwise 0 (including when the value is unchanged).
 *
 * May sleep (mutex, synchronize_sched()).
 */
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
	int i;
	unsigned long flags;

	/*
	 * A group whose se[0] is NULL has no entity to re-weight, so its
	 * shares cannot be changed.
	 */
	if (!tg->se[0])
		return -EINVAL;

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	else if (shares > MAX_SHARES)
		shares = MAX_SHARES;

	mutex_lock(&shares_mutex);
	if (tg->shares == shares)
		goto done;

	/* take the group off the leaf and sibling lists while it changes */
	spin_lock_irqsave(&task_group_lock, flags);
	for_each_possible_cpu(i)
		unregister_fair_sched_group(tg, i);
	list_del_rcu(&tg->siblings);
	spin_unlock_irqrestore(&task_group_lock, flags);

	/* wait for a grace period after the RCU list removals above */
	synchronize_sched();

	/*
	 * Update the group weight and every per-CPU entity; set_se_shares()
	 * takes the relevant runqueue lock itself.
	 */
	tg->shares = shares;
	for_each_possible_cpu(i)
		set_se_shares(tg->se[i], shares);

	/* put the group back on the leaf and sibling lists */
	spin_lock_irqsave(&task_group_lock, flags);
	for_each_possible_cpu(i)
		register_fair_sched_group(tg, i);
	list_add_rcu(&tg->siblings, &tg->parent->children);
	spin_unlock_irqrestore(&task_group_lock, flags);
done:
	mutex_unlock(&shares_mutex);
	return 0;
}
8414
/* Return the group's current shares value; no locking is done. */
unsigned long sched_group_shares(struct task_group *tg)
{
	return tg->shares;
}
8419#endif
8420
8421#ifdef CONFIG_RT_GROUP_SCHED
8422
8423
8424
/*
 * Held by tg_set_bandwidth() and by the CONFIG_RT_GROUP_SCHED variant of
 * sched_rt_global_constraints() around their schedulability checks.
 */
static DEFINE_MUTEX(rt_constraints_mutex);
8426
8427static unsigned long to_ratio(u64 period, u64 runtime)
8428{
8429 if (runtime == RUNTIME_INF)
8430 return 1ULL << 16;
8431
8432 return div64_u64(runtime << 16, period);
8433}
8434
8435#ifdef CONFIG_CGROUP_SCHED
8436static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8437{
8438 struct task_group *tgi, *parent = tg ? tg->parent : NULL;
8439 unsigned long total = 0;
8440
8441 if (!parent) {
8442 if (global_rt_period() < period)
8443 return 0;
8444
8445 return to_ratio(period, runtime) <
8446 to_ratio(global_rt_period(), global_rt_runtime());
8447 }
8448
8449 if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
8450 return 0;
8451
8452 rcu_read_lock();
8453 list_for_each_entry_rcu(tgi, &parent->children, siblings) {
8454 if (tgi == tg)
8455 continue;
8456
8457 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
8458 tgi->rt_bandwidth.rt_runtime);
8459 }
8460 rcu_read_unlock();
8461
8462 return total + to_ratio(period, runtime) <
8463 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
8464 parent->rt_bandwidth.rt_runtime);
8465}
8466#elif defined CONFIG_USER_SCHED
8467static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8468{
8469 struct task_group *tgi;
8470 unsigned long total = 0;
8471 unsigned long global_ratio =
8472 to_ratio(global_rt_period(), global_rt_runtime());
8473
8474 rcu_read_lock();
8475 list_for_each_entry_rcu(tgi, &task_groups, list) {
8476 if (tgi == tg)
8477 continue;
8478
8479 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
8480 tgi->rt_bandwidth.rt_runtime);
8481 }
8482 rcu_read_unlock();
8483
8484 return total + to_ratio(period, runtime) < global_ratio;
8485}
8486#endif
8487
8488
8489static inline int tg_has_rt_tasks(struct task_group *tg)
8490{
8491 struct task_struct *g, *p;
8492 do_each_thread(g, p) {
8493 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8494 return 1;
8495 } while_each_thread(g, p);
8496 return 0;
8497}
8498
/*
 * tg_set_bandwidth - validate and apply a new RT period/runtime for @tg.
 * @tg:         group to change
 * @rt_period:  new period in ns
 * @rt_runtime: new runtime in ns (or RUNTIME_INF)
 *
 * Returns 0 on success, -EBUSY when the runtime would become 0 while the
 * group still has RT tasks, and -EINVAL when __rt_schedulable() rejects the
 * values.
 *
 * Lock nesting used here: rt_constraints_mutex, then tasklist_lock (read),
 * then the group's rt_runtime_lock (interrupts disabled), then each per-CPU
 * rt_rq's rt_runtime_lock.
 */
static int tg_set_bandwidth(struct task_group *tg,
		u64 rt_period, u64 rt_runtime)
{
	int i, err = 0;

	mutex_lock(&rt_constraints_mutex);
	read_lock(&tasklist_lock);
	/* refuse to take away all runtime from a group that runs RT tasks */
	if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
		err = -EBUSY;
		goto unlock;
	}
	if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
		err = -EINVAL;
		goto unlock;
	}

	spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
	tg->rt_bandwidth.rt_runtime = rt_runtime;

	/* propagate the new runtime to every per-CPU runqueue of the group */
	for_each_possible_cpu(i) {
		struct rt_rq *rt_rq = tg->rt_rq[i];

		spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_runtime;
		spin_unlock(&rt_rq->rt_runtime_lock);
	}
	spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
unlock:
	read_unlock(&tasklist_lock);
	mutex_unlock(&rt_constraints_mutex);

	return err;
}
8533
8534int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8535{
8536 u64 rt_runtime, rt_period;
8537
8538 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8539 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8540 if (rt_runtime_us < 0)
8541 rt_runtime = RUNTIME_INF;
8542
8543 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8544}
8545
8546long sched_group_rt_runtime(struct task_group *tg)
8547{
8548 u64 rt_runtime_us;
8549
8550 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
8551 return -1;
8552
8553 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
8554 do_div(rt_runtime_us, NSEC_PER_USEC);
8555 return rt_runtime_us;
8556}
8557
8558int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8559{
8560 u64 rt_runtime, rt_period;
8561
8562 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8563 rt_runtime = tg->rt_bandwidth.rt_runtime;
8564
8565 if (rt_period == 0)
8566 return -EINVAL;
8567
8568 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8569}
8570
/*
 * sched_group_rt_period - read back a group's RT period in microseconds.
 */
long sched_group_rt_period(struct task_group *tg)
{
	u64 rt_period_us;

	/* ns -> us; do_div() divides its first argument in place */
	rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
	do_div(rt_period_us, NSEC_PER_USEC);
	return rt_period_us;
}
8579
8580static int sched_rt_global_constraints(void)
8581{
8582 int ret = 0;
8583
8584 mutex_lock(&rt_constraints_mutex);
8585 if (!__rt_schedulable(NULL, 1, 0))
8586 ret = -EINVAL;
8587 mutex_unlock(&rt_constraints_mutex);
8588
8589 return ret;
8590}
8591#else
/*
 * sched_rt_global_constraints - variant without CONFIG_RT_GROUP_SCHED:
 * copy the global RT runtime into every CPU's RT runqueue.
 *
 * Nothing is validated here; always returns 0.
 *
 * Lock nesting: def_rt_bandwidth.rt_runtime_lock (interrupts disabled), then
 * each runqueue's rt_runtime_lock.
 */
static int sched_rt_global_constraints(void)
{
	unsigned long flags;
	int i;

	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
	for_each_possible_cpu(i) {
		struct rt_rq *rt_rq = &cpu_rq(i)->rt;

		spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = global_rt_runtime();
		spin_unlock(&rt_rq->rt_runtime_lock);
	}
	spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);

	return 0;
}
8609#endif
8610
8611int sched_rt_handler(struct ctl_table *table, int write,
8612 struct file *filp, void __user *buffer, size_t *lenp,
8613 loff_t *ppos)
8614{
8615 int ret;
8616 int old_period, old_runtime;
8617 static DEFINE_MUTEX(mutex);
8618
8619 mutex_lock(&mutex);
8620 old_period = sysctl_sched_rt_period;
8621 old_runtime = sysctl_sched_rt_runtime;
8622
8623 ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
8624
8625 if (!ret && write) {
8626 ret = sched_rt_global_constraints();
8627 if (ret) {
8628 sysctl_sched_rt_period = old_period;
8629 sysctl_sched_rt_runtime = old_runtime;
8630 } else {
8631 def_rt_bandwidth.rt_runtime = global_rt_runtime();
8632 def_rt_bandwidth.rt_period =
8633 ns_to_ktime(global_rt_period());
8634 }
8635 }
8636 mutex_unlock(&mutex);
8637
8638 return ret;
8639}
8640
8641#ifdef CONFIG_CGROUP_SCHED
8642
8643
/*
 * Map a cgroup to its task_group: look up the cgroup's state for the cpu
 * subsystem and step back to the task_group that embeds it as 'css'.
 */
static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
{
	return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
			    struct task_group, css);
}
8649
8650static struct cgroup_subsys_state *
8651cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8652{
8653 struct task_group *tg, *parent;
8654
8655 if (!cgrp->parent) {
8656
8657 init_task_group.css.cgroup = cgrp;
8658 return &init_task_group.css;
8659 }
8660
8661 parent = cgroup_tg(cgrp->parent);
8662 tg = sched_create_group(parent);
8663 if (IS_ERR(tg))
8664 return ERR_PTR(-ENOMEM);
8665
8666
8667 tg->css.cgroup = cgrp;
8668
8669 return &tg->css;
8670}
8671
/*
 * 'destroy' callback of the cpu cgroup subsystem: destroy the task group
 * that belongs to @cgrp.
 */
static void
cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	struct task_group *tg = cgroup_tg(cgrp);

	sched_destroy_group(tg);
}
8679
/*
 * 'can_attach' callback of the cpu cgroup subsystem.
 *
 * Returns 0 if @tsk may be moved into @cgrp, -EINVAL otherwise.
 */
static int
cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
		      struct task_struct *tsk)
{
#ifdef CONFIG_RT_GROUP_SCHED
	/* an RT task cannot join a group that has no RT runtime at all */
	if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
		return -EINVAL;
#else
	/* without RT group scheduling only fair-class tasks may be moved */
	if (tsk->sched_class != &fair_sched_class)
		return -EINVAL;
#endif

	return 0;
}
8696
/*
 * 'attach' callback of the cpu cgroup subsystem: let the scheduler move
 * @tsk. The cgroup arguments are not used here.
 */
static void
cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			struct cgroup *old_cont, struct task_struct *tsk)
{
	sched_move_task(tsk);
}
8703
8704#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Write handler of the "shares" control file: forward the value to
 * sched_group_set_shares() and return its result.
 */
static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
				u64 shareval)
{
	return sched_group_set_shares(cgroup_tg(cgrp), shareval);
}

/* Read handler of the "shares" control file: the group's current shares. */
static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
	struct task_group *tg = cgroup_tg(cgrp);

	return (u64) tg->shares;
}
8717#endif
8718
8719#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Write handler of "rt_runtime_us": forward to sched_group_set_rt_runtime().
 * Note that the s64 value is passed on as a long.
 */
static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
				s64 val)
{
	return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
}

/*
 * Read handler of "rt_runtime_us": the group's runtime in microseconds
 * (-1 for unlimited, see sched_group_rt_runtime()).
 */
static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
{
	return sched_group_rt_runtime(cgroup_tg(cgrp));
}

/*
 * Write handler of "rt_period_us": forward to sched_group_set_rt_period().
 * Note that the u64 value is passed on as a long.
 */
static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
		u64 rt_period_us)
{
	return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
}

/* Read handler of "rt_period_us": the group's period in microseconds. */
static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
{
	return sched_group_rt_period(cgroup_tg(cgrp));
}
8741#endif
8742
/*
 * Control files of the cpu cgroup subsystem, registered by
 * cpu_cgroup_populate(). Which entries exist depends on the configured
 * group scheduling classes.
 */
static struct cftype cpu_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
	{
		.name = "shares",
		.read_u64 = cpu_shares_read_u64,
		.write_u64 = cpu_shares_write_u64,
	},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	{
		/* signed: -1 is used for "unlimited" */
		.name = "rt_runtime_us",
		.read_s64 = cpu_rt_runtime_read,
		.write_s64 = cpu_rt_runtime_write,
	},
	{
		.name = "rt_period_us",
		.read_u64 = cpu_rt_period_read_uint,
		.write_u64 = cpu_rt_period_write_uint,
	},
#endif
};
8764
/*
 * 'populate' callback of the cpu cgroup subsystem: add all entries of
 * cpu_files[] to @cont and return the result of cgroup_add_files().
 */
static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
{
	return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
}
8769
/* Descriptor that ties the callbacks above into the cgroup framework. */
struct cgroup_subsys cpu_cgroup_subsys = {
	.name		= "cpu",
	.create		= cpu_cgroup_create,
	.destroy	= cpu_cgroup_destroy,
	.can_attach	= cpu_cgroup_can_attach,
	.attach		= cpu_cgroup_attach,
	.populate	= cpu_cgroup_populate,
	.subsys_id	= cpu_cgroup_subsys_id,
	/* NOTE(review): presumably requests initialisation early in boot - confirm */
	.early_init	= 1,
};
8780
8781#endif
8782
8783#ifdef CONFIG_CGROUP_CPUACCT
8784
8785
8786
8787
8788
8789
8790
8791
8792
/* Per-cgroup CPU accounting state. */
struct cpuacct {
	/* cgroup framework state; container_of() on it yields the cpuacct */
	struct cgroup_subsys_state css;
	/* per-CPU usage counters, allocated with alloc_percpu(u64) */
	u64 *cpuusage;
};
8798
/* Tentative declaration; the initialised definition follows further down. */
struct cgroup_subsys cpuacct_subsys;
8800
8801
/* Return the cpuacct structure that belongs to cgroup @cgrp. */
static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
{
	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
			    struct cpuacct, css);
}

/* Return the cpuacct structure of the accounting group task @tsk is in. */
static inline struct cpuacct *task_ca(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
			    struct cpuacct, css);
}
8814
8815
8816static struct cgroup_subsys_state *cpuacct_create(
8817 struct cgroup_subsys *ss, struct cgroup *cgrp)
8818{
8819 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8820
8821 if (!ca)
8822 return ERR_PTR(-ENOMEM);
8823
8824 ca->cpuusage = alloc_percpu(u64);
8825 if (!ca->cpuusage) {
8826 kfree(ca);
8827 return ERR_PTR(-ENOMEM);
8828 }
8829
8830 return &ca->css;
8831}
8832
8833
/*
 * 'destroy' callback of the cpuacct subsystem: free the per-CPU counters
 * and then the cpuacct structure itself.
 */
static void
cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	struct cpuacct *ca = cgroup_ca(cgrp);

	free_percpu(ca->cpuusage);
	kfree(ca);
}
8842
8843
8844static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8845{
8846 struct cpuacct *ca = cgroup_ca(cgrp);
8847 u64 totalcpuusage = 0;
8848 int i;
8849
8850 for_each_possible_cpu(i) {
8851 u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
8852
8853
8854
8855
8856
8857 spin_lock_irq(&cpu_rq(i)->lock);
8858 totalcpuusage += *cpuusage;
8859 spin_unlock_irq(&cpu_rq(i)->lock);
8860 }
8861
8862 return totalcpuusage;
8863}
8864
8865static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8866 u64 reset)
8867{
8868 struct cpuacct *ca = cgroup_ca(cgrp);
8869 int err = 0;
8870 int i;
8871
8872 if (reset) {
8873 err = -EINVAL;
8874 goto out;
8875 }
8876
8877 for_each_possible_cpu(i) {
8878 u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
8879
8880 spin_lock_irq(&cpu_rq(i)->lock);
8881 *cpuusage = 0;
8882 spin_unlock_irq(&cpu_rq(i)->lock);
8883 }
8884out:
8885 return err;
8886}
8887
/*
 * Control files of the cpuacct subsystem, registered by cpuacct_populate():
 * a single "usage" file that is read as a sum and reset by writing 0.
 */
static struct cftype files[] = {
	{
		.name = "usage",
		.read_u64 = cpuusage_read,
		.write_u64 = cpuusage_write,
	},
};
8895
/*
 * 'populate' callback of the cpuacct subsystem: add all entries of files[]
 * to @cgrp and return the result of cgroup_add_files().
 */
static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
}
8900
8901
8902
8903
8904
8905
8906static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8907{
8908 struct cpuacct *ca;
8909
8910 if (!cpuacct_subsys.active)
8911 return;
8912
8913 ca = task_ca(tsk);
8914 if (ca) {
8915 u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
8916
8917 *cpuusage += cputime;
8918 }
8919}
8920
/* Descriptor that ties the cpuacct callbacks into the cgroup framework. */
struct cgroup_subsys cpuacct_subsys = {
	.name = "cpuacct",
	.create = cpuacct_create,
	.destroy = cpuacct_destroy,
	.populate = cpuacct_populate,
	.subsys_id = cpuacct_subsys_id,
};
8928#endif
8929