1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h>
37#include <linux/interrupt.h>
38#include <linux/capability.h>
39#include <linux/completion.h>
40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h>
42#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
45#include <linux/freezer.h>
46#include <linux/vmalloc.h>
47#include <linux/blkdev.h>
48#include <linux/delay.h>
49#include <linux/pid_namespace.h>
50#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
57#include <linux/kthread.h>
58#include <linux/seq_file.h>
59#include <linux/sysctl.h>
60#include <linux/syscalls.h>
61#include <linux/times.h>
62#include <linux/tsacct_kern.h>
63#include <linux/kprobes.h>
64#include <linux/delayacct.h>
65#include <linux/reciprocal_div.h>
66#include <linux/unistd.h>
67#include <linux/pagemap.h>
68#include <linux/hrtimer.h>
69#include <linux/tick.h>
70#include <linux/bootmem.h>
71#include <linux/debugfs.h>
72#include <linux/ctype.h>
73
74#include <asm/tlb.h>
75#include <asm/irq_regs.h>
76
77
78
79
80
81
/*
 * Convert between a nice value and a static priority, in both directions.
 * The +20/-20 offset places nice -20 at MAX_RT_PRIO.
 */
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)

/*
 * "User priority" is a priority rebased so that MAX_RT_PRIO becomes 0.
 * MAX_USER_PRIO is the size of that range, MAX_PRIO - MAX_RT_PRIO.
 */
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))

/*
 * Convert nanoseconds to jiffies. TIME is first cast to unsigned long and
 * the division truncates.
 */
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

/* Aliases for SCHED_LOAD_SCALE / SCHED_LOAD_SHIFT; calc_delta_fair() uses NICE_0_LOAD as its weight. */
#define NICE_0_LOAD SCHED_LOAD_SCALE
#define NICE_0_SHIFT SCHED_LOAD_SHIFT

/* 100 msec expressed in jiffies. */
#define DEF_TIMESLICE (100 * HZ / 1000)

/*
 * Sentinel rt_runtime value (all 64 bits set). start_rt_bandwidth() does
 * not arm the period timer for a bandwidth whose runtime equals this.
 */
#define RUNTIME_INF ((u64)~0ULL)
115
116#ifdef CONFIG_SMP
117
118
119
120
121static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
122{
123 return reciprocal_divide(load, sg->reciprocal_cpu_power);
124}
125
126
127
128
129
130static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
131{
132 sg->__cpu_power += val;
133 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
134}
135#endif
136
137static inline int rt_policy(int policy)
138{
139 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
140 return 1;
141 return 0;
142}
143
144static inline int task_has_rt_policy(struct task_struct *p)
145{
146 return rt_policy(p->policy);
147}
148
149
150
151
/*
 * Priority-indexed queues of realtime entities: one list head per
 * realtime priority plus a bitmap with one extra bit (MAX_RT_PRIO + 1).
 * NOTE(review): the extra bit is presumably a search delimiter - confirm
 * against the code that initialises the bitmap.
 */
struct rt_prio_array {
 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);
 struct list_head queue[MAX_RT_PRIO];
};
156
/*
 * Realtime bandwidth control state: a runtime budget per period, plus the
 * hrtimer that fires once per period (see sched_rt_period_timer()).
 */
struct rt_bandwidth {
 /* taken by start_rt_bandwidth() while (re)arming the period timer */
 spinlock_t rt_runtime_lock;
 /* period length, stored as ktime (set from nanoseconds in init_rt_bandwidth()) */
 ktime_t rt_period;
 /* runtime budget; RUNTIME_INF means the period timer is never started */
 u64 rt_runtime;
 struct hrtimer rt_period_timer;
};
164
165static struct rt_bandwidth def_rt_bandwidth;
166
167static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
168
169static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
170{
171 struct rt_bandwidth *rt_b =
172 container_of(timer, struct rt_bandwidth, rt_period_timer);
173 ktime_t now;
174 int overrun;
175 int idle = 0;
176
177 for (;;) {
178 now = hrtimer_cb_get_time(timer);
179 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
180
181 if (!overrun)
182 break;
183
184 idle = do_sched_rt_period_timer(rt_b, overrun);
185 }
186
187 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
188}
189
190static
191void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
192{
193 rt_b->rt_period = ns_to_ktime(period);
194 rt_b->rt_runtime = runtime;
195
196 spin_lock_init(&rt_b->rt_runtime_lock);
197
198 hrtimer_init(&rt_b->rt_period_timer,
199 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
200 rt_b->rt_period_timer.function = sched_rt_period_timer;
201 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
202}
203
204static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
205{
206 ktime_t now;
207
208 if (rt_b->rt_runtime == RUNTIME_INF)
209 return;
210
211 if (hrtimer_active(&rt_b->rt_period_timer))
212 return;
213
214 spin_lock(&rt_b->rt_runtime_lock);
215 for (;;) {
216 if (hrtimer_active(&rt_b->rt_period_timer))
217 break;
218
219 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
220 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
221 hrtimer_start(&rt_b->rt_period_timer,
222 rt_b->rt_period_timer.expires,
223 HRTIMER_MODE_ABS);
224 }
225 spin_unlock(&rt_b->rt_runtime_lock);
226}
227
228#ifdef CONFIG_RT_GROUP_SCHED
229static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
230{
231 hrtimer_cancel(&rt_b->rt_period_timer);
232}
233#endif
234
235
236
237
238
239static DEFINE_MUTEX(sched_domains_mutex);
240
241#ifdef CONFIG_GROUP_SCHED
242
243#include <linux/cgroup.h>
244
245struct cfs_rq;
246
247static LIST_HEAD(task_groups);
248
249
/*
 * Per-group scheduling state.
 *
 * The se/cfs_rq/rt_se/rt_rq members are arrays indexed by CPU number:
 * set_task_rq() reads element [cpu] of each to attach a task to its
 * group's per-cpu queue and parent entity.
 */
struct task_group {
#ifdef CONFIG_CGROUP_SCHED
 /* embedded cgroup state; task_group() recovers the group from it via container_of() */
 struct cgroup_subsys_state css;
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
 /* per-cpu entity representing this group (becomes a task's se.parent) */
 struct sched_entity **se;
 /* per-cpu fair runqueue owned by this group */
 struct cfs_rq **cfs_rq;
 unsigned long shares;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
 /* per-cpu realtime entity / runqueue, same layout as the fair ones above */
 struct sched_rt_entity **rt_se;
 struct rt_rq **rt_rq;

 struct rt_bandwidth rt_bandwidth;
#endif

 struct rcu_head rcu;
 /* NOTE(review): presumably links into the global task_groups list - confirm */
 struct list_head list;

 /* group hierarchy */
 struct task_group *parent;
 struct list_head siblings;
 struct list_head children;
};
277
278#ifdef CONFIG_USER_SCHED
279
280
281
282
283
284
285struct task_group root_task_group;
286
287#ifdef CONFIG_FAIR_GROUP_SCHED
288
289static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
290
291static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
292#endif
293
294#ifdef CONFIG_RT_GROUP_SCHED
295static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
296static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
297#endif
298#else
299#define root_task_group init_task_group
300#endif
301
302
303
304
305static DEFINE_SPINLOCK(task_group_lock);
306
307#ifdef CONFIG_FAIR_GROUP_SCHED
308#ifdef CONFIG_USER_SCHED
309# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
310#else
311# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
312#endif
313
314
315
316
317
318
319
320
321
322#define MIN_SHARES 2
323#define MAX_SHARES (1UL << 18)
324
325static int init_task_group_load = INIT_TASK_GROUP_LOAD;
326#endif
327
328
329
330
331struct task_group init_task_group;
332
333
334static inline struct task_group *task_group(struct task_struct *p)
335{
336 struct task_group *tg;
337
338#ifdef CONFIG_USER_SCHED
339 tg = p->user->tg;
340#elif defined(CONFIG_CGROUP_SCHED)
341 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
342 struct task_group, css);
343#else
344 tg = &init_task_group;
345#endif
346 return tg;
347}
348
349
/*
 * Point the fair and realtime entities of @p at the per-cpu runqueue and
 * parent entity that @p's task group owns on @cpu.
 */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
	{
		struct task_group *fair_tg = task_group(p);

		p->se.cfs_rq = fair_tg->cfs_rq[cpu];
		p->se.parent = fair_tg->se[cpu];
	}
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	{
		struct task_group *rt_tg = task_group(p);

		p->rt.rt_rq = rt_tg->rt_rq[cpu];
		p->rt.parent = rt_tg->rt_se[cpu];
	}
#endif
}
362
363#else
364
/* Without CONFIG_GROUP_SCHED there are no per-group runqueues to attach to. */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
366
367#endif
368
369
/* Runqueue state of the fair scheduling class. */
struct cfs_rq {
 struct load_weight load;
 unsigned long nr_running;

 u64 exec_clock;
 u64 min_vruntime;

 /* rbtree of entities plus a cached pointer to its leftmost node */
 struct rb_root tasks_timeline;
 struct rb_node *rb_leftmost;

 struct list_head tasks;
 struct list_head *balance_iterator;

 /*
  * 'curr' and 'next' reference entities on this queue; task_hot()
  * treats a task whose entity is 'next' as cache hot.
  */
 struct sched_entity *curr, *next;

 unsigned long nr_spread_over;

#ifdef CONFIG_FAIR_GROUP_SCHED
 /* NOTE(review): presumably the cpu runqueue this cfs_rq is attached to - confirm */
 struct rq *rq;

 /* list linkage; struct rq has a list head of the same name */
 struct list_head leaf_cfs_rq_list;
 /* owning task group */
 struct task_group *tg;
#endif
};
406
407
/* Runqueue state of the realtime scheduling class. */
struct rt_rq {
 /* priority-indexed queues, see struct rt_prio_array */
 struct rt_prio_array active;
 unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 int highest_prio;
#endif
#ifdef CONFIG_SMP
 unsigned long rt_nr_migratory;
 int overloaded;
#endif
 /* bandwidth accounting; NOTE(review): units presumably match rt_bandwidth (ns) - confirm */
 int rt_throttled;
 u64 rt_time;
 u64 rt_runtime;

 spinlock_t rt_runtime_lock;

#ifdef CONFIG_RT_GROUP_SCHED
 unsigned long rt_nr_boosted;

 struct rq *rq;
 /* list linkage; struct rq has a list head of the same name */
 struct list_head leaf_rt_rq_list;
 /* owning task group and the entity representing this queue */
 struct task_group *tg;
 struct sched_rt_entity *rt_se;
#endif
};
433
434#ifdef CONFIG_SMP
435
436
437
438
439
440
441
442
443
/*
 * State shared by a set of CPUs; each runqueue points at one through
 * rq->rd. Reference counted.
 */
struct root_domain {
 atomic_t refcount;
 /* CPU masks; NOTE(review): presumably member CPUs and the online subset - confirm */
 cpumask_t span;
 cpumask_t online;

 /* NOTE(review): presumably mask/count of CPUs overloaded with realtime tasks - confirm */
 cpumask_t rto_mask;
 atomic_t rto_count;
};
456
457
458
459
460
461static struct root_domain def_root_domain;
462
463#endif
464
465
466
467
468
469
470
471
/*
 * Per-CPU runqueue. One instance exists per CPU (the per-cpu 'runqueues'
 * variable) and is reached through cpu_rq()/this_rq()/task_rq().
 */
struct rq {
 /* runqueue lock, taken by task_rq_lock(), this_rq_lock() and friends */
 spinlock_t lock;

 /* maintained by inc_nr_running()/dec_nr_running() */
 unsigned long nr_running;
 #define CPU_LOAD_IDX_MAX 5
 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
 unsigned long last_tick_seen;
 unsigned char in_nohz_recently;
#endif

 /* total load weight of queued tasks; see inc_load()/dec_load() */
 struct load_weight load;
 unsigned long nr_load_updates;
 u64 nr_switches;

 /* embedded per-class runqueues */
 struct cfs_rq cfs;
 struct rt_rq rt;

#ifdef CONFIG_FAIR_GROUP_SCHED
 /* list of cfs_rq's, linked through cfs_rq::leaf_cfs_rq_list - TODO confirm */
 struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
 struct list_head leaf_rt_rq_list;
#endif

 /*
  * Adjusted by activate_task() (decrement) and deactivate_task()
  * (increment) for tasks that contribute to load.
  */
 unsigned long nr_uninterruptible;

 /* currently running task and this CPU's idle task */
 struct task_struct *curr, *idle;
 unsigned long next_balance;
 struct mm_struct *prev_mm;

 /* refreshed from sched_clock_cpu() by update_rq_clock() */
 u64 clock;

 atomic_t nr_iowait;

#ifdef CONFIG_SMP
 struct root_domain *rd;
 /* base scheduling domain; walked upwards by for_each_domain() */
 struct sched_domain *sd;

 int active_balance;
 int push_cpu;
 /* CPU number of this runqueue, returned by cpu_of() */
 int cpu;

 struct task_struct *migration_thread;
 struct list_head migration_queue;
#endif

#ifdef CONFIG_SCHED_HRTICK
 /* HRTICK_* bits; expiry requested by hrtick_start(); the timer itself */
 unsigned long hrtick_flags;
 ktime_t hrtick_expire;
 struct hrtimer hrtick_timer;
#endif

#ifdef CONFIG_SCHEDSTATS
 /* statistics counters */
 struct sched_info rq_sched_info;

 unsigned int yld_exp_empty;
 unsigned int yld_act_empty;
 unsigned int yld_both_empty;
 unsigned int yld_count;

 unsigned int sched_switch;
 unsigned int sched_count;
 unsigned int sched_goidle;

 unsigned int ttwu_count;
 unsigned int ttwu_local;

 unsigned int bkl_count;
#endif
 struct lock_class_key rq_lock_key;
};
564
565static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
566
567static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
568{
569 rq->curr->sched_class->check_preempt_curr(rq, p);
570}
571
/* Return the CPU number @rq belongs to; always 0 on uniprocessor builds. */
static inline int cpu_of(struct rq *rq)
{
#ifndef CONFIG_SMP
	return 0;
#else
	return rq->cpu;
#endif
}
580
581
582
583
584
585
586
587
/*
 * Walk the scheduling domains of @cpu from rq->sd upwards through
 * ->parent. The base pointer is read with rcu_dereference(), so the
 * caller is expected to be in a context where that is legal.
 */
#define for_each_domain(cpu, __sd) \
 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

/* Accessors for the per-cpu runqueues. */
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() (&__get_cpu_var(runqueues))
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
595
596static inline void update_rq_clock(struct rq *rq)
597{
598 rq->clock = sched_clock_cpu(cpu_of(rq));
599}
600
601
602
603
/*
 * Qualifier for scheduler tunables: writable (and __read_mostly) when
 * CONFIG_SCHED_DEBUG is set, compile-time constants otherwise.
 */
#ifdef CONFIG_SCHED_DEBUG
# define const_debug __read_mostly
#else
# define const_debug static const
#endif

/*
 * sched_features.h is included several times, each time with a different
 * definition of SCHED_FEAT(). This first pass builds an enum that gives
 * every feature a bit index named __SCHED_FEAT_<name>.
 */
#define SCHED_FEAT(name, enabled) \
 __SCHED_FEAT_##name ,

enum {
#include "sched_features.h"
};

#undef SCHED_FEAT

/*
 * Second pass: OR together the bit of every feature whose 'enabled'
 * value is nonzero to form the initial feature mask.
 */
#define SCHED_FEAT(name, enabled) \
 (1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "sched_features.h"
 0;

#undef SCHED_FEAT
631
632#ifdef CONFIG_SCHED_DEBUG
/*
 * Third pass over sched_features.h: a NULL-terminated table of feature
 * names, in the same order as the __SCHED_FEAT_* bit indices, so that
 * entry i names bit i of sysctl_sched_features.
 */
#define SCHED_FEAT(name, enabled) \
 #name ,

static __read_mostly char *sched_feat_names[] = {
#include "sched_features.h"
 NULL
};

#undef SCHED_FEAT
642
643static int sched_feat_open(struct inode *inode, struct file *filp)
644{
645 filp->private_data = inode->i_private;
646 return 0;
647}
648
649static ssize_t
650sched_feat_read(struct file *filp, char __user *ubuf,
651 size_t cnt, loff_t *ppos)
652{
653 char *buf;
654 int r = 0;
655 int len = 0;
656 int i;
657
658 for (i = 0; sched_feat_names[i]; i++) {
659 len += strlen(sched_feat_names[i]);
660 len += 4;
661 }
662
663 buf = kmalloc(len + 2, GFP_KERNEL);
664 if (!buf)
665 return -ENOMEM;
666
667 for (i = 0; sched_feat_names[i]; i++) {
668 if (sysctl_sched_features & (1UL << i))
669 r += sprintf(buf + r, "%s ", sched_feat_names[i]);
670 else
671 r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
672 }
673
674 r += sprintf(buf + r, "\n");
675 WARN_ON(r >= len + 2);
676
677 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
678
679 kfree(buf);
680
681 return r;
682}
683
684static ssize_t
685sched_feat_write(struct file *filp, const char __user *ubuf,
686 size_t cnt, loff_t *ppos)
687{
688 char buf[64];
689 char *cmp = buf;
690 int neg = 0;
691 int i;
692
693 if (cnt > 63)
694 cnt = 63;
695
696 if (copy_from_user(&buf, ubuf, cnt))
697 return -EFAULT;
698
699 buf[cnt] = 0;
700
701 if (strncmp(buf, "NO_", 3) == 0) {
702 neg = 1;
703 cmp += 3;
704 }
705
706 for (i = 0; sched_feat_names[i]; i++) {
707 int len = strlen(sched_feat_names[i]);
708
709 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
710 if (neg)
711 sysctl_sched_features &= ~(1UL << i);
712 else
713 sysctl_sched_features |= (1UL << i);
714 break;
715 }
716 }
717
718 if (!sched_feat_names[i])
719 return -EINVAL;
720
721 filp->f_pos += cnt;
722
723 return cnt;
724}
725
/* File operations backing the "sched_features" debugfs file. */
static struct file_operations sched_feat_fops = {
 .open = sched_feat_open,
 .read = sched_feat_read,
 .write = sched_feat_write,
};
731
732static __init int sched_init_debug(void)
733{
734 debugfs_create_file("sched_features", 0644, NULL, NULL,
735 &sched_feat_fops);
736
737 return 0;
738}
739late_initcall(sched_init_debug);
740
741#endif
742
/* Test whether scheduler feature @x is enabled in sysctl_sched_features. */
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

/*
 * NOTE(review): presumably the number of tasks a load-balance pass may
 * iterate over - not referenced in this part of the file, confirm.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * Global realtime period in microseconds (global_rt_period() multiplies
 * it by NSEC_PER_USEC).
 */
unsigned int sysctl_sched_rt_period = 1000000;

/* While this is zero, __cpu_clock() returns 0. */
static __read_mostly int scheduler_running;

/*
 * Global realtime runtime in microseconds (global_rt_runtime() multiplies
 * it by NSEC_PER_USEC). Signed, unlike the period.
 */
int sysctl_sched_rt_runtime = 950000;
764
765static inline u64 global_rt_period(void)
766{
767 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
768}
769
770static inline u64 global_rt_runtime(void)
771{
772 if (sysctl_sched_rt_period < 0)
773 return RUNTIME_INF;
774
775 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
776}
777
/*
 * cpu_clock() only synchronises against the global time when more than
 * this much time (in cpu_clock() units) has passed since the CPU's last
 * synchronisation.
 */
unsigned long long time_sync_thresh = 100000;

/* Per-cpu correction added to the raw clock, and the time of the last sync. */
static DEFINE_PER_CPU(unsigned long long, time_offset);
static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);

/*
 * Largest time handed out by __sync_cpu_clock() so far, protected by
 * time_sync_lock.
 */
static DEFINE_SPINLOCK(time_sync_lock);
static unsigned long long prev_global_time;
791
792static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
793{
794
795
796
797
798 spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
799 __raw_spin_lock(&time_sync_lock.raw_lock);
800
801 if (time < prev_global_time) {
802 per_cpu(time_offset, cpu) += prev_global_time - time;
803 time = prev_global_time;
804 } else {
805 prev_global_time = time;
806 }
807
808 __raw_spin_unlock(&time_sync_lock.raw_lock);
809 spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
810
811 return time;
812}
813
814static unsigned long long __cpu_clock(int cpu)
815{
816 unsigned long long now;
817
818
819
820
821
822 if (unlikely(!scheduler_running))
823 return 0;
824
825 now = sched_clock_cpu(cpu);
826
827 return now;
828}
829
830
831
832
833
834unsigned long long cpu_clock(int cpu)
835{
836 unsigned long long prev_cpu_time, time, delta_time;
837 unsigned long flags;
838
839 local_irq_save(flags);
840 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
841 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
842 delta_time = time-prev_cpu_time;
843
844 if (unlikely(delta_time > time_sync_thresh)) {
845 time = __sync_cpu_clock(time, cpu);
846 per_cpu(prev_cpu_time, cpu) = time;
847 }
848 local_irq_restore(flags);
849
850 return time;
851}
852EXPORT_SYMBOL_GPL(cpu_clock);
853
/*
 * Architecture hooks around a context switch. Architectures that do not
 * define them get empty defaults.
 */
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev) do { } while (0)
#endif
860
861static inline int task_current(struct rq *rq, struct task_struct *p)
862{
863 return rq->curr == p;
864}
865
866#ifndef __ARCH_WANT_UNLOCKED_CTXSW
867static inline int task_running(struct rq *rq, struct task_struct *p)
868{
869 return task_current(rq, p);
870}
871
/*
 * Locked context-switch variant: nothing to do before the switch; the
 * runqueue lock is released by finish_lock_switch().
 */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}
875
/*
 * Locked context-switch variant: release rq->lock (and enable interrupts)
 * once the switch is done. The lock is being released by a different
 * task than the one that took it, so the debugging state is patched up
 * first.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
 /* make the spinlock debug code see the current task as lock owner */
 rq->lock.owner = current;
#endif

 /* tell the lock validator that this task now holds the lock ... */
 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

 /* ... so that this unlock is balanced from its point of view */
 spin_unlock_irq(&rq->lock);
}
891
892#else
/*
 * Unlocked context-switch variant: on SMP a task counts as running while
 * its oncpu flag is set; on UP it is running when it is rq->curr.
 */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifndef CONFIG_SMP
	return task_current(rq, p);
#else
	return p->oncpu;
#endif
}
901
/*
 * Unlocked context-switch variant: mark @next as on a CPU and drop
 * rq->lock before the switch itself.
 */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
 /*
  * Set while the lock is still held, so task_running() reports the
  * task as running once the lock is gone.
  */
 next->oncpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 /* this architecture switches with interrupts enabled */
 spin_unlock_irq(&rq->lock);
#else
 /* interrupts stay off until finish_lock_switch() */
 spin_unlock(&rq->lock);
#endif
}
918
/*
 * Unlocked context-switch variant: mark @prev as no longer on a CPU and
 * re-enable interrupts if prepare_lock_switch() left them disabled.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
 /*
  * Write barrier: all earlier stores are ordered before oncpu is
  * cleared.
  */
 smp_wmb();
 prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 local_irq_enable();
#endif
}
934#endif
935
936
937
938
939
940static inline struct rq *__task_rq_lock(struct task_struct *p)
941 __acquires(rq->lock)
942{
943 for (;;) {
944 struct rq *rq = task_rq(p);
945 spin_lock(&rq->lock);
946 if (likely(rq == task_rq(p)))
947 return rq;
948 spin_unlock(&rq->lock);
949 }
950}
951
952
953
954
955
956
957static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
958 __acquires(rq->lock)
959{
960 struct rq *rq;
961
962 for (;;) {
963 local_irq_save(*flags);
964 rq = task_rq(p);
965 spin_lock(&rq->lock);
966 if (likely(rq == task_rq(p)))
967 return rq;
968 spin_unlock_irqrestore(&rq->lock, *flags);
969 }
970}
971
972static void __task_rq_unlock(struct rq *rq)
973 __releases(rq->lock)
974{
975 spin_unlock(&rq->lock);
976}
977
978static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
979 __releases(rq->lock)
980{
981 spin_unlock_irqrestore(&rq->lock, *flags);
982}
983
984
985
986
987static struct rq *this_rq_lock(void)
988 __acquires(rq->lock)
989{
990 struct rq *rq;
991
992 local_irq_disable();
993 rq = this_rq();
994 spin_lock(&rq->lock);
995
996 return rq;
997}
998
999static void __resched_task(struct task_struct *p, int tif_bit);
1000
1001static inline void resched_task(struct task_struct *p)
1002{
1003 __resched_task(p, TIF_NEED_RESCHED);
1004}
1005
1006#ifdef CONFIG_SCHED_HRTICK
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017static inline void resched_hrt(struct task_struct *p)
1018{
1019 __resched_task(p, TIF_HRTICK_RESCHED);
1020}
1021
1022static inline void resched_rq(struct rq *rq)
1023{
1024 unsigned long flags;
1025
1026 spin_lock_irqsave(&rq->lock, flags);
1027 resched_task(rq->curr);
1028 spin_unlock_irqrestore(&rq->lock, flags);
1029}
1030
/* Bit numbers within rq->hrtick_flags. */
enum {
 HRTICK_SET, /* timer must be programmed: set by hrtick_start(), consumed by hrtick_set() */
 HRTICK_RESET, /* set by hrtick_start() when called with reset; consumed by hrtick_set() */
 HRTICK_BLOCK, /* hrtick disabled on this rq: set/cleared by the hotplug handlers */
};
1036
1037
1038
1039
1040
1041
1042static inline int hrtick_enabled(struct rq *rq)
1043{
1044 if (!sched_feat(HRTICK))
1045 return 0;
1046 if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags)))
1047 return 0;
1048 return hrtimer_is_hres_active(&rq->hrtick_timer);
1049}
1050
1051
1052
1053
1054
1055
1056static void hrtick_start(struct rq *rq, u64 delay, int reset)
1057{
1058 assert_spin_locked(&rq->lock);
1059
1060
1061
1062
1063 rq->hrtick_expire =
1064 ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
1065
1066
1067
1068 __set_bit(HRTICK_SET, &rq->hrtick_flags);
1069 if (reset)
1070 __set_bit(HRTICK_RESET, &rq->hrtick_flags);
1071
1072
1073
1074
1075
1076 if (reset)
1077 resched_hrt(rq->curr);
1078}
1079
1080static void hrtick_clear(struct rq *rq)
1081{
1082 if (hrtimer_active(&rq->hrtick_timer))
1083 hrtimer_cancel(&rq->hrtick_timer);
1084}
1085
1086
1087
1088
1089static void hrtick_set(struct rq *rq)
1090{
1091 ktime_t time;
1092 int set, reset;
1093 unsigned long flags;
1094
1095 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1096
1097 spin_lock_irqsave(&rq->lock, flags);
1098 set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
1099 reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
1100 time = rq->hrtick_expire;
1101 clear_thread_flag(TIF_HRTICK_RESCHED);
1102 spin_unlock_irqrestore(&rq->lock, flags);
1103
1104 if (set) {
1105 hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
1106 if (reset && !hrtimer_active(&rq->hrtick_timer))
1107 resched_rq(rq);
1108 } else
1109 hrtick_clear(rq);
1110}
1111
1112
1113
1114
1115
1116static enum hrtimer_restart hrtick(struct hrtimer *timer)
1117{
1118 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1119
1120 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1121
1122 spin_lock(&rq->lock);
1123 update_rq_clock(rq);
1124 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1125 spin_unlock(&rq->lock);
1126
1127 return HRTIMER_NORESTART;
1128}
1129
1130#ifdef CONFIG_SMP
1131static void hotplug_hrtick_disable(int cpu)
1132{
1133 struct rq *rq = cpu_rq(cpu);
1134 unsigned long flags;
1135
1136 spin_lock_irqsave(&rq->lock, flags);
1137 rq->hrtick_flags = 0;
1138 __set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
1139 spin_unlock_irqrestore(&rq->lock, flags);
1140
1141 hrtick_clear(rq);
1142}
1143
1144static void hotplug_hrtick_enable(int cpu)
1145{
1146 struct rq *rq = cpu_rq(cpu);
1147 unsigned long flags;
1148
1149 spin_lock_irqsave(&rq->lock, flags);
1150 __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags);
1151 spin_unlock_irqrestore(&rq->lock, flags);
1152}
1153
1154static int
1155hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1156{
1157 int cpu = (int)(long)hcpu;
1158
1159 switch (action) {
1160 case CPU_UP_CANCELED:
1161 case CPU_UP_CANCELED_FROZEN:
1162 case CPU_DOWN_PREPARE:
1163 case CPU_DOWN_PREPARE_FROZEN:
1164 case CPU_DEAD:
1165 case CPU_DEAD_FROZEN:
1166 hotplug_hrtick_disable(cpu);
1167 return NOTIFY_OK;
1168
1169 case CPU_UP_PREPARE:
1170 case CPU_UP_PREPARE_FROZEN:
1171 case CPU_DOWN_FAILED:
1172 case CPU_DOWN_FAILED_FROZEN:
1173 case CPU_ONLINE:
1174 case CPU_ONLINE_FROZEN:
1175 hotplug_hrtick_enable(cpu);
1176 return NOTIFY_OK;
1177 }
1178
1179 return NOTIFY_DONE;
1180}
1181
1182static void init_hrtick(void)
1183{
1184 hotcpu_notifier(hotplug_hrtick, 0);
1185}
1186#endif
1187
1188static void init_rq_hrtick(struct rq *rq)
1189{
1190 rq->hrtick_flags = 0;
1191 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1192 rq->hrtick_timer.function = hrtick;
1193 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1194}
1195
1196void hrtick_resched(void)
1197{
1198 struct rq *rq;
1199 unsigned long flags;
1200
1201 if (!test_thread_flag(TIF_HRTICK_RESCHED))
1202 return;
1203
1204 local_irq_save(flags);
1205 rq = cpu_rq(smp_processor_id());
1206 hrtick_set(rq);
1207 local_irq_restore(flags);
1208}
1209#else
/*
 * Empty stand-ins used when CONFIG_SCHED_HRTICK is not set, so callers
 * need no #ifdefs of their own.
 */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void hrtick_set(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}

void hrtick_resched(void)
{
}

static inline void init_hrtick(void)
{
}
1229#endif
1230
1231
1232
1233
1234
1235
1236
1237
1238#ifdef CONFIG_SMP
1239
1240#ifndef tsk_is_polling
1241#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1242#endif
1243
1244static void __resched_task(struct task_struct *p, int tif_bit)
1245{
1246 int cpu;
1247
1248 assert_spin_locked(&task_rq(p)->lock);
1249
1250 if (unlikely(test_tsk_thread_flag(p, tif_bit)))
1251 return;
1252
1253 set_tsk_thread_flag(p, tif_bit);
1254
1255 cpu = task_cpu(p);
1256 if (cpu == smp_processor_id())
1257 return;
1258
1259
1260 smp_mb();
1261 if (!tsk_is_polling(p))
1262 smp_send_reschedule(cpu);
1263}
1264
1265static void resched_cpu(int cpu)
1266{
1267 struct rq *rq = cpu_rq(cpu);
1268 unsigned long flags;
1269
1270 if (!spin_trylock_irqsave(&rq->lock, flags))
1271 return;
1272 resched_task(cpu_curr(cpu));
1273 spin_unlock_irqrestore(&rq->lock, flags);
1274}
1275
1276#ifdef CONFIG_NO_HZ
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287void wake_up_idle_cpu(int cpu)
1288{
1289 struct rq *rq = cpu_rq(cpu);
1290
1291 if (cpu == smp_processor_id())
1292 return;
1293
1294
1295
1296
1297
1298
1299
1300
1301 if (rq->curr != rq->idle)
1302 return;
1303
1304
1305
1306
1307
1308
1309 set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
1310
1311
1312 smp_mb();
1313 if (!tsk_is_polling(rq->idle))
1314 smp_send_reschedule(cpu);
1315}
1316#endif
1317
1318#else
1319static void __resched_task(struct task_struct *p, int tif_bit)
1320{
1321 assert_spin_locked(&task_rq(p)->lock);
1322 set_tsk_thread_flag(p, tif_bit);
1323}
1324#endif
1325
/*
 * Fixed-point scale for inverse weights: 2^32 where a long can hold it,
 * the largest unsigned long (2^32 - 1) on 32-bit.
 */
#if BITS_PER_LONG == 32
# define WMULT_CONST (~0UL)
#else
# define WMULT_CONST (1UL << 32)
#endif

#define WMULT_SHIFT 32

/* Shift right by @y bits, rounding to nearest instead of truncating. */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1338
/*
 * Compute approximately delta_exec * weight / lw->weight without a
 * division, by multiplying with a fixed-point inverse of lw->weight.
 *
 * lw->inv_weight caches that inverse; a value of 0 (as left behind by
 * update_load_add()/update_load_sub()) means it must be recomputed, which
 * is done here as a side effect. The result is clamped to LONG_MAX.
 */
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 struct load_weight *lw)
{
 u64 tmp;

 /* (re)build the cached inverse if it was invalidated */
 if (!lw->inv_weight) {
 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
 lw->inv_weight = 1;
 else
 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
 / (lw->weight+1);
 }

 tmp = (u64)delta_exec * weight;

 /*
  * For large products, shift in two halves (before and after the
  * multiplication) so the intermediate value keeps fitting in 64 bits.
  */
 if (unlikely(tmp > WMULT_CONST))
 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
 WMULT_SHIFT/2);
 else
 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
1365
1366static inline unsigned long
1367calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
1368{
1369 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
1370}
1371
1372static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1373{
1374 lw->weight += inc;
1375 lw->inv_weight = 0;
1376}
1377
1378static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1379{
1380 lw->weight -= dec;
1381 lw->inv_weight = 0;
1382}
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
/*
 * Load weight and matching inverse weight that set_load_weight() gives to
 * SCHED_IDLE tasks. The inverse is 2^32 / WEIGHT_IDLEPRIO = 2^31.
 *
 * It is built from an unsigned constant: shifting the signed int 1 left by
 * 31 overflows (undefined behaviour), and where it produces INT_MIN the
 * value sign-extends when stored in a wider unsigned field.
 */
#define WEIGHT_IDLEPRIO 2
#define WMULT_IDLEPRIO (1UL << 31)
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
/*
 * Load weight per nice level, indexed by static_prio - MAX_RT_PRIO (see
 * set_load_weight()): entry 0 is the heaviest, entry 20 is 1024, and each
 * step to the next entry shrinks the weight by roughly a factor of 1.25.
 */
static const int prio_to_weight[40] = {
 88761, 71755, 56483, 46273, 36291,
 29154, 23254, 18705, 14949, 11916,
 9548, 7620, 6100, 4904, 3906,
 3121, 2501, 1991, 1586, 1277,
 1024, 820, 655, 526, 423,
 335, 272, 215, 172, 137,
 110, 87, 70, 56, 45,
 36, 29, 23, 18, 15,
};
1418
1419
1420
1421
1422
1423
1424
1425
/*
 * Precomputed inverse weights, entry for entry 2^32 / prio_to_weight[i]
 * (e.g. entry 20: 2^32 / 1024 = 4194304). Used to seed load.inv_weight in
 * set_load_weight().
 */
static const u32 prio_to_wmult[40] = {
 48388, 59856, 76040, 92818, 118348,
 147320, 184698, 229616, 287308, 360437,
 449829, 563644, 704093, 875809, 1099582,
 1376151, 1717300, 2157191, 2708050, 3363326,
 4194304, 5237765, 6557202, 8165337, 10153587,
 12820798, 15790321, 19976592, 24970740, 31350126,
 39045157, 49367440, 61356676, 76695844, 95443717,
 119304647, 148102320, 186737708, 238609294, 286331153,
};
1436
1437static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1438
1439
1440
1441
1442
1443
/*
 * Task iterator handed to balance_tasks()/iter_move_one_task(): @start
 * and @next each take @arg and return a task.
 * NOTE(review): presumably NULL ends the iteration - confirm in the users.
 */
struct rq_iterator {
 void *arg;
 struct task_struct *(*start)(void *);
 struct task_struct *(*next)(void *);
};
1449
1450#ifdef CONFIG_SMP
1451static unsigned long
1452balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1453 unsigned long max_load_move, struct sched_domain *sd,
1454 enum cpu_idle_type idle, int *all_pinned,
1455 int *this_best_prio, struct rq_iterator *iterator);
1456
1457static int
1458iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1459 struct sched_domain *sd, enum cpu_idle_type idle,
1460 struct rq_iterator *iterator);
1461#endif
1462
/*
 * cpuacct_charge() is defined elsewhere when CONFIG_CGROUP_CPUACCT is set;
 * otherwise it is an empty stub.
 */
#ifdef CONFIG_CGROUP_CPUACCT
static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
#endif
1468
1469static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1470{
1471 update_load_add(&rq->load, load);
1472}
1473
1474static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1475{
1476 update_load_sub(&rq->load, load);
1477}
1478
1479#ifdef CONFIG_SMP
1480static unsigned long source_load(int cpu, int type);
1481static unsigned long target_load(int cpu, int type);
1482static unsigned long cpu_avg_load_per_task(int cpu);
1483static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1484#else
1485
1486#ifdef CONFIG_FAIR_GROUP_SCHED
/* Uniprocessor build with fair group scheduling: intentionally a no-op. */
static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
{
}
1490#endif
1491
1492#endif
1493
1494#include "sched_stats.h"
1495#include "sched_idletask.c"
1496#include "sched_fair.c"
1497#include "sched_rt.c"
1498#ifdef CONFIG_SCHED_DEBUG
1499# include "sched_debug.c"
1500#endif
1501
1502#define sched_class_highest (&rt_sched_class)
1503
1504static inline void inc_load(struct rq *rq, const struct task_struct *p)
1505{
1506 update_load_add(&rq->load, p->se.load.weight);
1507}
1508
1509static inline void dec_load(struct rq *rq, const struct task_struct *p)
1510{
1511 update_load_sub(&rq->load, p->se.load.weight);
1512}
1513
1514static void inc_nr_running(struct task_struct *p, struct rq *rq)
1515{
1516 rq->nr_running++;
1517 inc_load(rq, p);
1518}
1519
1520static void dec_nr_running(struct task_struct *p, struct rq *rq)
1521{
1522 rq->nr_running--;
1523 dec_load(rq, p);
1524}
1525
1526static void set_load_weight(struct task_struct *p)
1527{
1528 if (task_has_rt_policy(p)) {
1529 p->se.load.weight = prio_to_weight[0] * 2;
1530 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1531 return;
1532 }
1533
1534
1535
1536
1537 if (p->policy == SCHED_IDLE) {
1538 p->se.load.weight = WEIGHT_IDLEPRIO;
1539 p->se.load.inv_weight = WMULT_IDLEPRIO;
1540 return;
1541 }
1542
1543 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1544 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1545}
1546
1547static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1548{
1549 sched_info_queued(p);
1550 p->sched_class->enqueue_task(rq, p, wakeup);
1551 p->se.on_rq = 1;
1552}
1553
1554static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1555{
1556 p->sched_class->dequeue_task(rq, p, sleep);
1557 p->se.on_rq = 0;
1558}
1559
1560
1561
1562
1563static inline int __normal_prio(struct task_struct *p)
1564{
1565 return p->static_prio;
1566}
1567
1568
1569
1570
1571
1572
1573
1574
1575static inline int normal_prio(struct task_struct *p)
1576{
1577 int prio;
1578
1579 if (task_has_rt_policy(p))
1580 prio = MAX_RT_PRIO-1 - p->rt_priority;
1581 else
1582 prio = __normal_prio(p);
1583 return prio;
1584}
1585
1586
1587
1588
1589
1590
1591
1592
1593static int effective_prio(struct task_struct *p)
1594{
1595 p->normal_prio = normal_prio(p);
1596
1597
1598
1599
1600
1601 if (!rt_prio(p->prio))
1602 return p->normal_prio;
1603 return p->prio;
1604}
1605
1606
1607
1608
1609static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1610{
1611 if (task_contributes_to_load(p))
1612 rq->nr_uninterruptible--;
1613
1614 enqueue_task(rq, p, wakeup);
1615 inc_nr_running(p, rq);
1616}
1617
1618
1619
1620
1621static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1622{
1623 if (task_contributes_to_load(p))
1624 rq->nr_uninterruptible++;
1625
1626 dequeue_task(rq, p, sleep);
1627 dec_nr_running(p, rq);
1628}
1629
1630
1631
1632
1633
/*
 * Return non-zero if @p is the task recorded as current on the CPU that
 * task_cpu(@p) reports.
 */
inline int task_curr(const struct task_struct *p)
{
	int is_curr = (cpu_curr(task_cpu(p)) == p);

	return is_curr;
}
1638
1639
1640unsigned long weighted_cpuload(const int cpu)
1641{
1642 return cpu_rq(cpu)->load.weight;
1643}
1644
/*
 * Low-level "task @p now belongs to @cpu" update: re-home the task via
 * set_task_rq() and, on SMP, record the CPU in the task's thread_info.
 */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_rq(p, cpu);
#ifdef CONFIG_SMP
	/*
	 * Write barrier between the set_task_rq() stores and the ->cpu
	 * store, so the new ->cpu value is not made visible before them.
	 * NOTE(review): presumably pairs with lockless readers of
	 * task_cpu()/task_rq() - confirm before reordering.
	 */
	smp_wmb();
	task_thread_info(p)->cpu = cpu;
#endif
}
1658
1659static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1660 const struct sched_class *prev_class,
1661 int oldprio, int running)
1662{
1663 if (prev_class != p->sched_class) {
1664 if (prev_class->switched_from)
1665 prev_class->switched_from(rq, p, running);
1666 p->sched_class->switched_to(rq, p, running);
1667 } else
1668 p->sched_class->prio_changed(rq, p, oldprio, running);
1669}
1670
1671#ifdef CONFIG_SMP
1672
1673
1674
1675
1676static int
1677task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1678{
1679 s64 delta;
1680
1681
1682
1683
1684 if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
1685 return 1;
1686
1687 if (p->sched_class != &fair_sched_class)
1688 return 0;
1689
1690 if (sysctl_sched_migration_cost == -1)
1691 return 1;
1692 if (sysctl_sched_migration_cost == 0)
1693 return 0;
1694
1695 delta = now - p->se.exec_start;
1696
1697 return delta < (s64)sysctl_sched_migration_cost;
1698}
1699
1700
/*
 * Move @p's scheduler bookkeeping from its current CPU to @new_cpu.
 *
 * Timestamps kept under CONFIG_SCHEDSTATS and the entity's vruntime are
 * rebased from the old runqueue's frame of reference to the new one before
 * the CPU assignment itself is changed via __set_task_cpu().
 */
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
	int old_cpu = task_cpu(p);
	struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
	struct cfs_rq *old_cfsrq = task_cfs_rq(p),
		      *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
	u64 clock_offset;

	/* Difference between the two runqueue clocks (only used for stats). */
	clock_offset = old_rq->clock - new_rq->clock;

#ifdef CONFIG_SCHEDSTATS
	/* Rebase only the timestamps that are set (non-zero). */
	if (p->se.wait_start)
		p->se.wait_start -= clock_offset;
	if (p->se.sleep_start)
		p->se.sleep_start -= clock_offset;
	if (p->se.block_start)
		p->se.block_start -= clock_offset;
	if (old_cpu != new_cpu) {
		schedstat_inc(p, se.nr_migrations);
		/* A migration of a still-hot task is counted separately. */
		if (task_hot(p, old_rq->clock, NULL))
			schedstat_inc(p, se.nr_forced2_migrations);
	}
#endif
	/* Shift vruntime by the difference of the two queues' min_vruntime. */
	p->se.vruntime -= old_cfsrq->min_vruntime -
					 new_cfsrq->min_vruntime;

	__set_task_cpu(p, new_cpu);
}
1729
/*
 * A queued request to move a task to another CPU; filled in and queued by
 * migrate_task().
 */
struct migration_req {
	/* Links the request into a runqueue's ->migration_queue. */
	struct list_head list;

	/* Task to move and the CPU it should be moved to. */
	struct task_struct *task;
	int dest_cpu;

	/* Initialised by migrate_task(); the requester waits on it. */
	struct completion done;
};
1738
1739
1740
1741
1742
1743static int
1744migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1745{
1746 struct rq *rq = task_rq(p);
1747
1748
1749
1750
1751
1752 if (!p->se.on_rq && !task_running(rq, p)) {
1753 set_task_cpu(p, dest_cpu);
1754 return 0;
1755 }
1756
1757 init_completion(&req->done);
1758 req->task = p;
1759 req->dest_cpu = dest_cpu;
1760 list_add(&req->list, &rq->migration_queue);
1761
1762 return 1;
1763}
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
/*
 * Wait until @p is neither running on a CPU nor queued on a runqueue.
 *
 * The function loops: it first busy-waits without the runqueue lock, then
 * takes the lock to sample the task's state reliably, and retries until
 * both "running" and "on_rq" are observed clear.
 */
void wait_task_inactive(struct task_struct *p)
{
	unsigned long flags;
	int running, on_rq;
	struct rq *rq;

	for (;;) {
		/* Unlocked look-up of the runqueue the task is on right now. */
		rq = task_rq(p);

		/* Spin without the lock for as long as the task is running. */
		while (task_running(rq, p))
			cpu_relax();

		/*
		 * Now take the runqueue lock and sample both flags under it;
		 * the unlocked check above is only an optimisation.
		 */
		rq = task_rq_lock(p, &flags);
		running = task_running(rq, p);
		on_rq = p->se.on_rq;
		task_rq_unlock(rq, &flags);

		/* Still (or again) running: go back to spinning. */
		if (unlikely(running)) {
			cpu_relax();
			continue;
		}

		/*
		 * Not running but still queued: sleep for one tick instead of
		 * spinning, then look again.
		 */
		if (unlikely(on_rq)) {
			schedule_timeout_uninterruptible(1);
			continue;
		}

		/* Neither running nor queued: done. */
		break;
	}
}
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
/*
 * Send a reschedule IPI to the CPU @p is on, but only if that is a remote
 * CPU and @p is the task currently recorded as running there. Preemption
 * is disabled around the check so smp_processor_id() stays meaningful.
 */
void kick_process(struct task_struct *p)
{
	int target;

	preempt_disable();
	target = task_cpu(p);
	if (target != smp_processor_id()) {
		if (task_curr(p))
			smp_send_reschedule(target);
	}
	preempt_enable();
}
1870
1871
1872
1873
1874
1875
1876
1877
1878static unsigned long source_load(int cpu, int type)
1879{
1880 struct rq *rq = cpu_rq(cpu);
1881 unsigned long total = weighted_cpuload(cpu);
1882
1883 if (type == 0)
1884 return total;
1885
1886 return min(rq->cpu_load[type-1], total);
1887}
1888
1889
1890
1891
1892
1893static unsigned long target_load(int cpu, int type)
1894{
1895 struct rq *rq = cpu_rq(cpu);
1896 unsigned long total = weighted_cpuload(cpu);
1897
1898 if (type == 0)
1899 return total;
1900
1901 return max(rq->cpu_load[type-1], total);
1902}
1903
1904
1905
1906
1907static unsigned long cpu_avg_load_per_task(int cpu)
1908{
1909 struct rq *rq = cpu_rq(cpu);
1910 unsigned long total = weighted_cpuload(cpu);
1911 unsigned long n = rq->nr_running;
1912
1913 return n ? total / n : SCHED_LOAD_SCALE;
1914}
1915
1916
1917
1918
1919
1920static struct sched_group *
1921find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1922{
1923 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1924 unsigned long min_load = ULONG_MAX, this_load = 0;
1925 int load_idx = sd->forkexec_idx;
1926 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1927
1928 do {
1929 unsigned long load, avg_load;
1930 int local_group;
1931 int i;
1932
1933
1934 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1935 continue;
1936
1937 local_group = cpu_isset(this_cpu, group->cpumask);
1938
1939
1940 avg_load = 0;
1941
1942 for_each_cpu_mask(i, group->cpumask) {
1943
1944 if (local_group)
1945 load = source_load(i, load_idx);
1946 else
1947 load = target_load(i, load_idx);
1948
1949 avg_load += load;
1950 }
1951
1952
1953 avg_load = sg_div_cpu_power(group,
1954 avg_load * SCHED_LOAD_SCALE);
1955
1956 if (local_group) {
1957 this_load = avg_load;
1958 this = group;
1959 } else if (avg_load < min_load) {
1960 min_load = avg_load;
1961 idlest = group;
1962 }
1963 } while (group = group->next, group != sd->groups);
1964
1965 if (!idlest || 100*this_load < imbalance*min_load)
1966 return NULL;
1967 return idlest;
1968}
1969
1970
1971
1972
1973static int
1974find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
1975 cpumask_t *tmp)
1976{
1977 unsigned long load, min_load = ULONG_MAX;
1978 int idlest = -1;
1979 int i;
1980
1981
1982 cpus_and(*tmp, group->cpumask, p->cpus_allowed);
1983
1984 for_each_cpu_mask(i, *tmp) {
1985 load = weighted_cpuload(i);
1986
1987 if (load < min_load || (load == min_load && i == this_cpu)) {
1988 min_load = load;
1989 idlest = i;
1990 }
1991 }
1992
1993 return idlest;
1994}
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
/*
 * Pick a CPU for the current task by descending through the scheduling
 * domains of @cpu that have @flag set, at each level looking for the
 * idlest group and then the idlest CPU inside it.
 *
 * Returns the chosen CPU, which is @cpu itself if nothing better is found.
 */
static int sched_balance_self(int cpu, int flag)
{
	struct task_struct *t = current;
	struct sched_domain *tmp, *sd = NULL;

	/*
	 * Remember the last domain (in for_each_domain() order) that has
	 * @flag set, but stop as soon as a domain doing power-savings
	 * balancing is reached.
	 */
	for_each_domain(cpu, tmp) {
		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
			break;
		if (tmp->flags & flag)
			sd = tmp;
	}

	while (sd) {
		cpumask_t span, tmpmask;
		struct sched_group *group;
		int new_cpu, weight;

		/* Domains without @flag are skipped by descending. */
		if (!(sd->flags & flag)) {
			sd = sd->child;
			continue;
		}

		span = sd->span;
		group = find_idlest_group(sd, t, cpu);
		if (!group) {
			sd = sd->child;
			continue;
		}

		new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
		if (new_cpu == -1 || new_cpu == cpu) {
			/* Nothing better at this level: try the child domain. */
			sd = sd->child;
			continue;
		}

		/*
		 * Switch to the new CPU and continue from its domain hierarchy:
		 * pick the last @flag domain whose span is strictly smaller
		 * than the span just examined.
		 */
		cpu = new_cpu;
		sd = NULL;
		weight = cpus_weight(span);
		for_each_domain(cpu, tmp) {
			if (weight <= cpus_weight(tmp->span))
				break;
			if (tmp->flags & flag)
				sd = tmp;
		}
		/* sd == NULL here terminates the while loop. */
	}

	return cpu;
}
2061
2062#endif
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
/*
 * Wake up @p if its state matches the @state mask.
 *
 * @sync is a hint passed to the class' select_task_rq() hook; it is forced
 * to 0 when the SYNC_WAKEUPS feature is off.
 *
 * Returns 1 if the task was put on a runqueue by this call, 0 otherwise
 * (state did not match, or the task was already queued).
 */
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
{
	int cpu, orig_cpu, this_cpu, success = 0;
	unsigned long flags;
	long old_state;
	struct rq *rq;

	if (!sched_feat(SYNC_WAKEUPS))
		sync = 0;

	/*
	 * NOTE(review): presumably orders the caller's earlier stores
	 * (e.g. the wake-up condition) before the ->state read below -
	 * confirm before touching.
	 */
	smp_wmb();
	rq = task_rq_lock(p, &flags);
	old_state = p->state;
	if (!(old_state & state))
		goto out;

	/* Already queued: only the state and preemption check remain. */
	if (p->se.on_rq)
		goto out_running;

	cpu = task_cpu(p);
	orig_cpu = cpu;
	this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
	/* Still executing on its CPU: activate it where it is. */
	if (unlikely(task_running(rq, p)))
		goto out_activate;

	cpu = p->sched_class->select_task_rq(p, sync);
	if (cpu != orig_cpu) {
		set_task_cpu(p, cpu);
		task_rq_unlock(rq, &flags);
		/*
		 * The lock was dropped to switch runqueues, so everything
		 * checked above has to be checked again under the new lock.
		 */
		rq = task_rq_lock(p, &flags);
		old_state = p->state;
		if (!(old_state & state))
			goto out;
		if (p->se.on_rq)
			goto out_running;

		this_cpu = smp_processor_id();
		cpu = task_cpu(p);
	}

#ifdef CONFIG_SCHEDSTATS
	schedstat_inc(rq, ttwu_count);
	if (cpu == this_cpu)
		schedstat_inc(rq, ttwu_local);
	else {
		struct sched_domain *sd;
		/* Charge the first domain of this CPU that spans the target. */
		for_each_domain(this_cpu, sd) {
			if (cpu_isset(cpu, sd->span)) {
				schedstat_inc(sd, ttwu_wake_remote);
				break;
			}
		}
	}
#endif

out_activate:
#endif /* CONFIG_SMP */
	schedstat_inc(p, se.nr_wakeups);
	if (sync)
		schedstat_inc(p, se.nr_wakeups_sync);
	if (orig_cpu != cpu)
		schedstat_inc(p, se.nr_wakeups_migrate);
	if (cpu == this_cpu)
		schedstat_inc(p, se.nr_wakeups_local);
	else
		schedstat_inc(p, se.nr_wakeups_remote);
	update_rq_clock(rq);
	activate_task(rq, p, 1);
	success = 1;

out_running:
	check_preempt_curr(rq, p);

	p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
	/* Optional per-class hook run after the task has been woken. */
	if (p->sched_class->task_wake_up)
		p->sched_class->task_wake_up(rq, p);
#endif
out:
	task_rq_unlock(rq, &flags);

	return success;
}
2164
2165int wake_up_process(struct task_struct *p)
2166{
2167 return try_to_wake_up(p, TASK_ALL, 0);
2168}
2169EXPORT_SYMBOL(wake_up_process);
2170
/*
 * Wake up @p only if its state matches the @state mask, without the sync
 * hint. Returns try_to_wake_up()'s result.
 */
int wake_up_state(struct task_struct *p, unsigned int state)
{
	int woken = try_to_wake_up(p, state, 0);

	return woken;
}
2175
2176
2177
2178
2179
2180
2181
/*
 * Reset the scheduler-private state of a task structure @p so that it
 * starts out with clean accounting and is not linked on any list.
 */
static void __sched_fork(struct task_struct *p)
{
	/* Runtime accounting of the scheduling entity starts from zero. */
	p->se.exec_start		= 0;
	p->se.sum_exec_runtime		= 0;
	p->se.prev_sum_exec_runtime	= 0;
	p->se.last_wakeup		= 0;
	p->se.avg_overlap		= 0;

#ifdef CONFIG_SCHEDSTATS
	/* Statistics fields only exist with CONFIG_SCHEDSTATS. */
	p->se.wait_start		= 0;
	p->se.sum_sleep_runtime		= 0;
	p->se.sleep_start		= 0;
	p->se.block_start		= 0;
	p->se.sleep_max			= 0;
	p->se.block_max			= 0;
	p->se.exec_max			= 0;
	p->se.slice_max			= 0;
	p->se.wait_max			= 0;
#endif

	/* Not queued anywhere yet. */
	INIT_LIST_HEAD(&p->rt.run_list);
	p->se.on_rq = 0;
	INIT_LIST_HEAD(&p->se.group_node);

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif

	/*
	 * The task is marked TASK_RUNNING although it is not on a runqueue
	 * yet (on_rq was just cleared); wake_up_new_task() asserts this state.
	 */
	p->state = TASK_RUNNING;
}
2218
2219
2220
2221
/*
 * Scheduler setup for a newly forked task @p: reset its scheduler state,
 * choose and assign a CPU, and derive its priority and class from the
 * current task. @clone_flags is not used by this function.
 */
void sched_fork(struct task_struct *p, int clone_flags)
{
	/* get_cpu()/put_cpu() bracket the whole setup. */
	int cpu = get_cpu();

	__sched_fork(p);

#ifdef CONFIG_SMP
	/* On SMP the child may be started on a different, idler CPU. */
	cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
#endif
	set_task_cpu(p, cpu);

	/*
	 * The child starts from the parent's *normal* priority, not its
	 * current ->prio; a non-RT priority always gets the fair class.
	 */
	p->prio = current->normal_prio;
	if (!rt_prio(p->prio))
		p->sched_class = &fair_sched_class;

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
	if (likely(sched_info_on()))
		memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
	p->oncpu = 0;
#endif
#ifdef CONFIG_PREEMPT
	/* The child begins life with a preempt count of 1. */
	task_thread_info(p)->preempt_count = 1;
#endif
	put_cpu();
}
2253
2254
2255
2256
2257
2258
2259
2260
/*
 * Put a freshly created task @p on its runqueue for the first time.
 * The task must be in TASK_RUNNING state. @clone_flags is not used here.
 */
void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
	unsigned long flags;
	struct rq *rq;

	rq = task_rq_lock(p, &flags);
	BUG_ON(p->state != TASK_RUNNING);
	update_rq_clock(rq);

	p->prio = effective_prio(p);

	if (!p->sched_class->task_new || !current->se.on_rq) {
		/* No class-specific hook (or parent not queued): plain enqueue. */
		activate_task(rq, p, 0);
	} else {
		/*
		 * The class' ->task_new() hook takes care of queueing the
		 * task; only the running count is updated here.
		 */
		p->sched_class->task_new(rq, p);
		inc_nr_running(p, rq);
	}
	check_preempt_curr(rq, p);
#ifdef CONFIG_SMP
	if (p->sched_class->task_wake_up)
		p->sched_class->task_wake_up(rq, p);
#endif
	task_rq_unlock(rq, &flags);
}
2289
2290#ifdef CONFIG_PREEMPT_NOTIFIERS
2291
2292
2293
2294
2295
2296void preempt_notifier_register(struct preempt_notifier *notifier)
2297{
2298 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers);
2299}
2300EXPORT_SYMBOL_GPL(preempt_notifier_register);
2301
2302
2303
2304
2305
2306
2307
2308void preempt_notifier_unregister(struct preempt_notifier *notifier)
2309{
2310 hlist_del(¬ifier->link);
2311}
2312EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2313
/*
 * Invoke the ->sched_in() callback of every notifier registered on @curr,
 * passing the id of the CPU this code is executing on.
 */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	struct preempt_notifier *notifier;
	struct hlist_node *node;

	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
		notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
2322
/*
 * Invoke the ->sched_out() callback of every notifier registered on @curr,
 * telling each one which task (@next) is about to run instead.
 */
static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	struct preempt_notifier *notifier;
	struct hlist_node *node;

	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
		notifier->ops->sched_out(notifier, next);
}
2333
2334#else
2335
/* !CONFIG_PREEMPT_NOTIFIERS: there are no notifiers, so nothing to fire. */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}
2339
/* !CONFIG_PREEMPT_NOTIFIERS: there are no notifiers, so nothing to fire. */
static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
}
2345
2346#endif
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
/*
 * Work done right before switching from @prev to @next on @rq: fire the
 * outgoing task's sched-out notifiers, then run the lock-switch and
 * architecture preparation hooks for @next.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	fire_sched_out_preempt_notifiers(prev, next);
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
/*
 * Clean-up performed by the incoming task after a context switch away from
 * @prev on @rq: finish the arch and lock parts of the switch, run the
 * post-schedule and sched-in hooks, drop a pending mm reference and release
 * @prev if it has died.
 */
static void finish_task_switch(struct rq *rq, struct task_struct *prev)
	__releases(rq->lock)
{
	/* mm reference parked in rq->prev_mm by context_switch(), if any. */
	struct mm_struct *mm = rq->prev_mm;
	long prev_state;

	rq->prev_mm = NULL;

	/*
	 * prev->state is sampled *before* the switch is finished and the
	 * lock released; the TASK_DEAD test below relies on this snapshot.
	 * NOTE(review): presumably because @prev may be woken or reaped on
	 * another CPU once the lock is dropped - keep this ordering.
	 */
	prev_state = prev->state;
	finish_arch_switch(prev);
	finish_lock_switch(rq, prev);
#ifdef CONFIG_SMP
	if (current->sched_class->post_schedule)
		current->sched_class->post_schedule(rq);
#endif

	fire_sched_in_preempt_notifiers(current);
	if (mm)
		mmdrop(mm);
	if (unlikely(prev_state == TASK_DEAD)) {
		/*
		 * @prev is gone for good: flush its kprobe state and drop a
		 * reference on its task structure.
		 */
		kprobe_flush_task(prev);
		put_task_struct(prev);
	}
}
2424
2425
2426
2427
2428
/*
 * Finish the context switch on behalf of a task that is running for the
 * first time: complete the switch away from @prev and, if requested via
 * ->set_child_tid, store the task's pid at that user address.
 */
asmlinkage void schedule_tail(struct task_struct *prev)
	__releases(rq->lock)
{
	struct rq *rq = this_rq();

	finish_task_switch(rq, prev);
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
	/* Only needed when the arch switches contexts unlocked. */
	preempt_enable();
#endif
	/* The result of put_user() is deliberately not checked here. */
	if (current->set_child_tid)
		put_user(task_pid_vnr(current), current->set_child_tid);
}
2442
2443
2444
2445
2446
/*
 * Switch from @prev to @next on @rq: prepare the switch, switch (or
 * borrow) the address space, switch the register state and stack via
 * switch_to(), and finish the switch on the other side.
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);
	mm = next->mm;
	oldmm = prev->active_mm;

	arch_enter_lazy_cpu_mode();

	if (unlikely(!mm)) {
		/*
		 * @next has no mm of its own: it borrows the previous
		 * active_mm, taking an extra mm_count reference on it.
		 */
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm(oldmm, mm, next);

	if (unlikely(!prev->mm)) {
		/*
		 * @prev was itself borrowing an mm: hand the reference to
		 * finish_task_switch() through rq->prev_mm, which drops it.
		 */
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}

	/*
	 * Tell lock debugging that rq->lock is released here.
	 * NOTE(review): presumably because the lock is really dropped by the
	 * next task in finish_task_switch() - confirm.
	 */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

	/* Here the register state and the stack are actually switched. */
	switch_to(prev, next, prev);

	barrier();

	/*
	 * The runqueue is looked up again with this_rq() rather than
	 * reusing @rq.
	 * NOTE(review): presumably because the task may resume on another
	 * CPU after switch_to() - confirm.
	 */
	finish_task_switch(this_rq(), prev);
}
2495
2496
2497
2498
2499
2500
2501
2502
2503unsigned long nr_running(void)
2504{
2505 unsigned long i, sum = 0;
2506
2507 for_each_online_cpu(i)
2508 sum += cpu_rq(i)->nr_running;
2509
2510 return sum;
2511}
2512
2513unsigned long nr_uninterruptible(void)
2514{
2515 unsigned long i, sum = 0;
2516
2517 for_each_possible_cpu(i)
2518 sum += cpu_rq(i)->nr_uninterruptible;
2519
2520
2521
2522
2523
2524 if (unlikely((long)sum < 0))
2525 sum = 0;
2526
2527 return sum;
2528}
2529
2530unsigned long long nr_context_switches(void)
2531{
2532 int i;
2533 unsigned long long sum = 0;
2534
2535 for_each_possible_cpu(i)
2536 sum += cpu_rq(i)->nr_switches;
2537
2538 return sum;
2539}
2540
2541unsigned long nr_iowait(void)
2542{
2543 unsigned long i, sum = 0;
2544
2545 for_each_possible_cpu(i)
2546 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2547
2548 return sum;
2549}
2550
2551unsigned long nr_active(void)
2552{
2553 unsigned long i, running = 0, uninterruptible = 0;
2554
2555 for_each_online_cpu(i) {
2556 running += cpu_rq(i)->nr_running;
2557 uninterruptible += cpu_rq(i)->nr_uninterruptible;
2558 }
2559
2560 if (unlikely((long)uninterruptible < 0))
2561 uninterruptible = 0;
2562
2563 return running + uninterruptible;
2564}
2565
2566
2567
2568
2569
2570static void update_cpu_load(struct rq *this_rq)
2571{
2572 unsigned long this_load = this_rq->load.weight;
2573 int i, scale;
2574
2575 this_rq->nr_load_updates++;
2576
2577
2578 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2579 unsigned long old_load, new_load;
2580
2581
2582
2583 old_load = this_rq->cpu_load[i];
2584 new_load = this_load;
2585
2586
2587
2588
2589
2590 if (new_load > old_load)
2591 new_load += scale-1;
2592 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2593 }
2594}
2595
2596#ifdef CONFIG_SMP
2597
2598
2599
2600
2601
2602
2603
2604static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2605 __acquires(rq1->lock)
2606 __acquires(rq2->lock)
2607{
2608 BUG_ON(!irqs_disabled());
2609 if (rq1 == rq2) {
2610 spin_lock(&rq1->lock);
2611 __acquire(rq2->lock);
2612 } else {
2613 if (rq1 < rq2) {
2614 spin_lock(&rq1->lock);
2615 spin_lock(&rq2->lock);
2616 } else {
2617 spin_lock(&rq2->lock);
2618 spin_lock(&rq1->lock);
2619 }
2620 }
2621 update_rq_clock(rq1);
2622 update_rq_clock(rq2);
2623}
2624
2625
2626
2627
2628
2629
2630
2631static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2632 __releases(rq1->lock)
2633 __releases(rq2->lock)
2634{
2635 spin_unlock(&rq1->lock);
2636 if (rq1 != rq2)
2637 spin_unlock(&rq2->lock);
2638 else
2639 __release(rq2->lock);
2640}
2641
2642
2643
2644
2645static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2646 __releases(this_rq->lock)
2647 __acquires(busiest->lock)
2648 __acquires(this_rq->lock)
2649{
2650 int ret = 0;
2651
2652 if (unlikely(!irqs_disabled())) {
2653
2654 spin_unlock(&this_rq->lock);
2655 BUG_ON(1);
2656 }
2657 if (unlikely(!spin_trylock(&busiest->lock))) {
2658 if (busiest < this_rq) {
2659 spin_unlock(&this_rq->lock);
2660 spin_lock(&busiest->lock);
2661 spin_lock(&this_rq->lock);
2662 ret = 1;
2663 } else
2664 spin_lock(&busiest->lock);
2665 }
2666 return ret;
2667}
2668
2669
2670
2671
2672
2673
2674
2675static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2676{
2677 struct migration_req req;
2678 unsigned long flags;
2679 struct rq *rq;
2680
2681 rq = task_rq_lock(p, &flags);
2682 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2683 || unlikely(cpu_is_offline(dest_cpu)))
2684 goto out;
2685
2686
2687 if (migrate_task(p, dest_cpu, &req)) {
2688
2689 struct task_struct *mt = rq->migration_thread;
2690
2691 get_task_struct(mt);
2692 task_rq_unlock(rq, &flags);
2693 wake_up_process(mt);
2694 put_task_struct(mt);
2695 wait_for_completion(&req.done);
2696
2697 return;
2698 }
2699out:
2700 task_rq_unlock(rq, &flags);
2701}
2702
2703
2704
2705
2706
2707void sched_exec(void)
2708{
2709 int new_cpu, this_cpu = get_cpu();
2710 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2711 put_cpu();
2712 if (new_cpu != this_cpu)
2713 sched_migrate_task(current, new_cpu);
2714}
2715
2716
2717
2718
2719
/*
 * Move task @p from @src_rq to @this_rq (the runqueue of @this_cpu):
 * dequeue it from the source, re-home it, enqueue it on the destination.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
		      struct rq *this_rq, int this_cpu)
{
	deactivate_task(src_rq, p, 0);
	set_task_cpu(p, this_cpu);
	activate_task(this_rq, p, 0);
	/* The newcomer may have to preempt what is running on this_rq. */
	check_preempt_curr(this_rq, p);
}
2732
2733
2734
2735
2736static
2737int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2738 struct sched_domain *sd, enum cpu_idle_type idle,
2739 int *all_pinned)
2740{
2741
2742
2743
2744
2745
2746
2747 if (!cpu_isset(this_cpu, p->cpus_allowed)) {
2748 schedstat_inc(p, se.nr_failed_migrations_affine);
2749 return 0;
2750 }
2751 *all_pinned = 0;
2752
2753 if (task_running(rq, p)) {
2754 schedstat_inc(p, se.nr_failed_migrations_running);
2755 return 0;
2756 }
2757
2758
2759
2760
2761
2762
2763
2764 if (!task_hot(p, rq->clock, sd) ||
2765 sd->nr_balance_failed > sd->cache_nice_tries) {
2766#ifdef CONFIG_SCHEDSTATS
2767 if (task_hot(p, rq->clock, sd)) {
2768 schedstat_inc(sd, lb_hot_gained[idle]);
2769 schedstat_inc(p, se.nr_forced_migrations);
2770 }
2771#endif
2772 return 1;
2773 }
2774
2775 if (task_hot(p, rq->clock, sd)) {
2776 schedstat_inc(p, se.nr_failed_migrations_hot);
2777 return 0;
2778 }
2779 return 1;
2780}
2781
/*
 * Pull tasks delivered by @iterator from @busiest to @this_rq until about
 * @max_load_move worth of load weight has been moved, the iterator is
 * exhausted, or more than sysctl_sched_nr_migrate candidates were examined.
 *
 * *@all_pinned (if non-NULL) is set to 1 when no examined task was allowed
 * on @this_cpu, and to 0 otherwise (also when @max_load_move is 0).
 * *@this_best_prio tracks the best (numerically lowest) priority pulled.
 *
 * Returns the amount of load weight actually moved.
 */
static unsigned long
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
	      unsigned long max_load_move, struct sched_domain *sd,
	      enum cpu_idle_type idle, int *all_pinned,
	      int *this_best_prio, struct rq_iterator *iterator)
{
	int loops = 0, pulled = 0, pinned = 0, skip_for_load;
	struct task_struct *p;
	long rem_load_move = max_load_move;

	if (max_load_move == 0)
		goto out;

	/* Assume everything is pinned until can_migrate_task() says otherwise. */
	pinned = 1;

	/* Start the load-balancing iterator. */
	p = iterator->start(iterator->arg);
next:
	if (!p || loops++ > sysctl_sched_nr_migrate)
		goto out;
	/*
	 * Skip a task whose half weight exceeds the remaining load to move
	 * (plus fuzz), unless it has a better priority than anything seen so
	 * far; also skip tasks that cannot be migrated at all.
	 */
	skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
							 SCHED_LOAD_SCALE_FUZZ;
	if ((skip_for_load && p->prio >= *this_best_prio) ||
	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
		p = iterator->next(iterator->arg);
		goto next;
	}

	pull_task(busiest, p, this_rq, this_cpu);
	pulled++;
	rem_load_move -= p->se.load.weight;

	/* Keep going only while there is still load left to move. */
	if (rem_load_move > 0) {
		if (p->prio < *this_best_prio)
			*this_best_prio = p->prio;
		p = iterator->next(iterator->arg);
		goto next;
	}
out:
	/* Account all pulled tasks in one go. */
	schedstat_add(sd, lb_gained[idle], pulled);

	if (all_pinned)
		*all_pinned = pinned;

	return max_load_move - rem_load_move;
}
2843
2844
2845
2846
2847
2848
2849
2850
2851static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2852 unsigned long max_load_move,
2853 struct sched_domain *sd, enum cpu_idle_type idle,
2854 int *all_pinned)
2855{
2856 const struct sched_class *class = sched_class_highest;
2857 unsigned long total_load_moved = 0;
2858 int this_best_prio = this_rq->curr->prio;
2859
2860 do {
2861 total_load_moved +=
2862 class->load_balance(this_rq, this_cpu, busiest,
2863 max_load_move - total_load_moved,
2864 sd, idle, all_pinned, &this_best_prio);
2865 class = class->next;
2866 } while (class && max_load_move > total_load_moved);
2867
2868 return total_load_moved > 0;
2869}
2870
2871static int
2872iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2873 struct sched_domain *sd, enum cpu_idle_type idle,
2874 struct rq_iterator *iterator)
2875{
2876 struct task_struct *p = iterator->start(iterator->arg);
2877 int pinned = 0;
2878
2879 while (p) {
2880 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2881 pull_task(busiest, p, this_rq, this_cpu);
2882
2883
2884
2885
2886
2887 schedstat_inc(sd, lb_gained[idle]);
2888
2889 return 1;
2890 }
2891 p = iterator->next(iterator->arg);
2892 }
2893
2894 return 0;
2895}
2896
2897
2898
2899
2900
2901
2902
2903
2904static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2905 struct sched_domain *sd, enum cpu_idle_type idle)
2906{
2907 const struct sched_class *class;
2908
2909 for (class = sched_class_highest; class; class = class->next)
2910 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
2911 return 1;
2912
2913 return 0;
2914}
2915
2916
2917
2918
2919
2920
/*
 * Find the busiest group within domain @sd, as seen from @this_cpu, and
 * work out how much load should be moved from it.
 *
 * @imbalance: out - amount of weighted load to move (0 if balanced)
 * @idle:      idle state of @this_cpu; selects which cpu_load index is used
 * @sd_idle:   in/out - cleared as soon as a CPU with running tasks is seen
 * @cpus:      only CPUs set in this mask are taken into account
 * @balance:   optional out - set to 0 when @this_cpu is not the CPU that
 *             should do the balancing for its group
 *
 * Returns the group to pull from, or NULL (with *@imbalance == 0) if no
 * balancing is needed. With power-savings balancing the returned group may
 * be the least loaded one instead (see out_balanced).
 */
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
		   unsigned long *imbalance, enum cpu_idle_type idle,
		   int *sd_idle, const cpumask_t *cpus, int *balance)
{
	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
	unsigned long max_pull;
	unsigned long busiest_load_per_task, busiest_nr_running;
	unsigned long this_load_per_task, this_nr_running;
	int load_idx, group_imb = 0;
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
	int power_savings_balance = 1;
	unsigned long leader_nr_running = 0, min_load_per_task = 0;
	unsigned long min_nr_running = ULONG_MAX;
	struct sched_group *group_min = NULL, *group_leader = NULL;
#endif

	max_load = this_load = total_load = total_pwr = 0;
	busiest_load_per_task = busiest_nr_running = 0;
	this_load_per_task = this_nr_running = 0;
	/* Pick the cpu_load[] index that matches this CPU's idle state. */
	if (idle == CPU_NOT_IDLE)
		load_idx = sd->busy_idx;
	else if (idle == CPU_NEWLY_IDLE)
		load_idx = sd->newidle_idx;
	else
		load_idx = sd->idle_idx;

	/* Pass 1: gather per-group statistics over the whole group ring. */
	do {
		unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
		int local_group;
		int i;
		int __group_imb = 0;
		unsigned int balance_cpu = -1, first_idle_cpu = 0;
		unsigned long sum_nr_running, sum_weighted_load;

		local_group = cpu_isset(this_cpu, group->cpumask);

		/* Default balancer of the local group: its first CPU. */
		if (local_group)
			balance_cpu = first_cpu(group->cpumask);

		/* Tally up the load of all CPUs in the group */
		sum_weighted_load = sum_nr_running = avg_load = 0;
		max_cpu_load = 0;
		min_cpu_load = ~0UL;

		for_each_cpu_mask(i, group->cpumask) {
			struct rq *rq;

			if (!cpu_isset(i, *cpus))
				continue;

			rq = cpu_rq(i);

			if (*sd_idle && rq->nr_running)
				*sd_idle = 0;

			/* Bias balancing toward cpus of our domain */
			if (local_group) {
				/* The first idle CPU found takes over as balancer. */
				if (idle_cpu(i) && !first_idle_cpu) {
					first_idle_cpu = 1;
					balance_cpu = i;
				}

				load = target_load(i, load_idx);
			} else {
				load = source_load(i, load_idx);
				if (load > max_cpu_load)
					max_cpu_load = load;
				if (min_cpu_load > load)
					min_cpu_load = load;
			}

			avg_load += load;
			sum_nr_running += rq->nr_running;
			sum_weighted_load += weighted_cpuload(i);
		}

		/*
		 * Except for newly-idle balancing, only the designated
		 * balance_cpu of the local group goes on; every other CPU
		 * reports *balance = 0 and bails out.
		 */
		if (idle != CPU_NEWLY_IDLE && local_group &&
		    balance_cpu != this_cpu && balance) {
			*balance = 0;
			goto ret;
		}

		total_load += avg_load;
		total_pwr += group->__cpu_power;

		/* Adjust by relative CPU power of the group */
		avg_load = sg_div_cpu_power(group,
				avg_load * SCHED_LOAD_SCALE);

		/* Large spread between CPUs of one group: flag it imbalanced. */
		if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
			__group_imb = 1;

		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;

		if (local_group) {
			this_load = avg_load;
			this = group;
			this_nr_running = sum_nr_running;
			/* Holds the *sum* for now; divided in small_imbalance. */
			this_load_per_task = sum_weighted_load;
		} else if (avg_load > max_load &&
			   (sum_nr_running > group_capacity || __group_imb)) {
			max_load = avg_load;
			busiest = group;
			busiest_nr_running = sum_nr_running;
			/* Holds the *sum* for now; divided after the loop. */
			busiest_load_per_task = sum_weighted_load;
			group_imb = __group_imb;
		}

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
		/*
		 * Power-savings bookkeeping is skipped when this CPU is busy
		 * or the domain does not do power-savings balancing.
		 */
		if (idle == CPU_NOT_IDLE ||
				!(sd->flags & SD_POWERSAVINGS_BALANCE))
			goto group_next;

		/*
		 * A local group that is already at capacity, or completely
		 * idle, disables power-savings balancing.
		 */
		if (local_group && (this_nr_running >= group_capacity ||
				    !this_nr_running))
			power_savings_balance = 0;

		/* Groups at/over capacity or without tasks are no candidates. */
		if (!power_savings_balance || sum_nr_running >= group_capacity
		    || !sum_nr_running)
			goto group_next;

		/*
		 * Track the group with the fewest running tasks; ties go to
		 * the group with the lower first CPU number.
		 */
		if ((sum_nr_running < min_nr_running) ||
		    (sum_nr_running == min_nr_running &&
		     first_cpu(group->cpumask) <
		     first_cpu(group_min->cpumask))) {
			group_min = group;
			min_nr_running = sum_nr_running;
			min_load_per_task = sum_weighted_load /
						sum_nr_running;
		}

		/*
		 * Track the "leader": the group with the most running tasks
		 * that still has room for at least one more; ties go to the
		 * group with the higher first CPU number.
		 */
		if (sum_nr_running <= group_capacity - 1) {
			if (sum_nr_running > leader_nr_running ||
			    (sum_nr_running == leader_nr_running &&
			     first_cpu(group->cpumask) >
			      first_cpu(group_leader->cpumask))) {
				group_leader = group;
				leader_nr_running = sum_nr_running;
			}
		}
group_next:
#endif
		group = group->next;
	} while (group != sd->groups);

	/* Pass 2: decide whether, and by how much, we are imbalanced. */
	if (!busiest || this_load >= max_load || busiest_nr_running == 0)
		goto out_balanced;

	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;

	if (this_load >= avg_load ||
			100*max_load <= sd->imbalance_pct*this_load)
		goto out_balanced;

	busiest_load_per_task /= busiest_nr_running;
	if (group_imb)
		busiest_load_per_task = min(busiest_load_per_task, avg_load);

	/*
	 * If the busiest group's load does not even exceed one average task
	 * there, there is nothing worth moving.
	 */
	if (max_load <= busiest_load_per_task)
		goto out_balanced;

	/*
	 * The busiest group is below the domain average: skip the regular
	 * computation (max_load - avg_load would underflow) and go straight
	 * to the small-imbalance handling.
	 */
	if (max_load < avg_load) {
		*imbalance = 0;
		goto small_imbalance;
	}

	/* Don't want to pull so many tasks that a group would go idle */
	max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);

	/* How much load to actually move to equalise the imbalance */
	*imbalance = min(max_pull * busiest->__cpu_power,
				(avg_load - this_load) * this->__cpu_power)
			/ SCHED_LOAD_SCALE;

	/*
	 * The imbalance is smaller than one average task: check whether
	 * moving a whole task anyway would still be an improvement.
	 */
	if (*imbalance < busiest_load_per_task) {
		unsigned long tmp, pwr_now, pwr_move;
		unsigned int imbn;

small_imbalance:
		pwr_move = pwr_now = 0;
		imbn = 2;
		if (this_nr_running) {
			this_load_per_task /= this_nr_running;
			if (busiest_load_per_task > this_load_per_task)
				imbn = 1;
		} else
			this_load_per_task = SCHED_LOAD_SCALE;

		if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
					busiest_load_per_task * imbn) {
			*imbalance = busiest_load_per_task;
			return busiest;
		}

		/*
		 * OK, we don't have enough imbalance to justify moving tasks,
		 * however we may be able to increase total CPU power used by
		 * moving them. Compare throughput now ...
		 */
		pwr_now += busiest->__cpu_power *
				min(busiest_load_per_task, max_load);
		pwr_now += this->__cpu_power *
				min(this_load_per_task, this_load);
		pwr_now /= SCHED_LOAD_SCALE;

		/* ... with throughput after moving: load removed from busiest */
		tmp = sg_div_cpu_power(busiest,
				busiest_load_per_task * SCHED_LOAD_SCALE);
		if (max_load > tmp)
			pwr_move += busiest->__cpu_power *
				min(busiest_load_per_task, max_load - tmp);

		/* ... and load gained by this group */
		if (max_load * busiest->__cpu_power <
				busiest_load_per_task * SCHED_LOAD_SCALE)
			tmp = sg_div_cpu_power(this,
					max_load * busiest->__cpu_power);
		else
			tmp = sg_div_cpu_power(this,
				busiest_load_per_task * SCHED_LOAD_SCALE);
		pwr_move += this->__cpu_power *
				min(this_load_per_task, this_load + tmp);
		pwr_move /= SCHED_LOAD_SCALE;

		/* Move if we gain throughput */
		if (pwr_move > pwr_now)
			*imbalance = busiest_load_per_task;
	}

	return busiest;

out_balanced:
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
		goto ret;

	/*
	 * Power savings: if the local group is the leader, pull one average
	 * task from the least loaded group.
	 * NOTE(review): presumably so that group can go fully idle.
	 */
	if (this == group_leader && group_leader != group_min) {
		*imbalance = min_load_per_task;
		return group_min;
	}
#endif
ret:
	*imbalance = 0;
	return NULL;
}
3219
3220
3221
3222
3223static struct rq *
3224find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3225 unsigned long imbalance, const cpumask_t *cpus)
3226{
3227 struct rq *busiest = NULL, *rq;
3228 unsigned long max_load = 0;
3229 int i;
3230
3231 for_each_cpu_mask(i, group->cpumask) {
3232 unsigned long wl;
3233
3234 if (!cpu_isset(i, *cpus))
3235 continue;
3236
3237 rq = cpu_rq(i);
3238 wl = weighted_cpuload(i);
3239
3240 if (rq->nr_running == 1 && wl > imbalance)
3241 continue;
3242
3243 if (wl > max_load) {
3244 max_load = wl;
3245 busiest = rq;
3246 }
3247 }
3248
3249 return busiest;
3250}
3251
3252
3253
3254
3255
/*
 * Bound used by load_balance(): while every candidate task turned out to be
 * pinned, sd->balance_interval keeps doubling as long as it is below this
 * value (same unit as sd->balance_interval).
 */
#define MAX_PINNED_INTERVAL	512
3257
3258
3259
3260
3261
/*
 * load_balance - try to pull load from the busiest runqueue of @sd to this_cpu.
 * @this_cpu: CPU that receives the tasks
 * @this_rq:  runqueue of @this_cpu
 * @sd:       sched domain being balanced
 * @idle:     idle type of @this_cpu; indexes the schedstat counters and is
 *            forwarded to the helpers
 * @balance:  flag handed to find_busiest_group(); when it comes back as 0
 *            the domain is treated as balanced
 * @cpus:     scratch mask, reset to "all CPUs" here and shrunk each time a
 *            source runqueue turns out to hold only pinned tasks
 *
 * Returns the value reported by move_tasks() when something was attempted,
 * 0 on the balanced / pinned exits, or -1 when nothing was moved, sd_idle is
 * clear, the domain has SD_SHARE_CPUPOWER and its parent lacks
 * SD_POWERSAVINGS_BALANCE.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *balance, cpumask_t *cpus)
{
	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
	struct sched_group *group;
	unsigned long imbalance;
	struct rq *busiest;
	unsigned long flags;

	cpus_setall(*cpus);

	/*
	 * sd_idle is only set for a non-busy CPU in a SD_SHARE_CPUPOWER
	 * domain whose parent does not request power-savings balancing;
	 * find_busiest_group() may update it through the pointer.
	 */
	if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		sd_idle = 1;

	schedstat_inc(sd, lb_count[idle]);

redo:
	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
				   cpus, balance);

	if (*balance == 0)
		goto out_balanced;

	if (!group) {
		schedstat_inc(sd, lb_nobusyg[idle]);
		goto out_balanced;
	}

	busiest = find_busiest_queue(group, idle, imbalance, cpus);
	if (!busiest) {
		schedstat_inc(sd, lb_nobusyq[idle]);
		goto out_balanced;
	}

	BUG_ON(busiest == this_rq);

	schedstat_add(sd, lb_imbalance[idle], imbalance);

	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/*
		 * Only attempt a pull when the source has more than one
		 * task; both runqueue locks are held with interrupts off
		 * for the duration of move_tasks().
		 */
		local_irq_save(flags);
		double_rq_lock(this_rq, busiest);
		ld_moved = move_tasks(this_rq, this_cpu, busiest,
				      imbalance, sd, idle, &all_pinned);
		double_rq_unlock(this_rq, busiest);
		local_irq_restore(flags);

		/* balancing on behalf of another CPU: make it reschedule */
		if (ld_moved && this_cpu != smp_processor_id())
			resched_cpu(this_cpu);

		/* everything on busiest was pinned: drop it and retry */
		if (unlikely(all_pinned)) {
			cpu_clear(cpu_of(busiest), *cpus);
			if (!cpus_empty(*cpus))
				goto redo;
			goto out_balanced;
		}
	}

	if (!ld_moved) {
		schedstat_inc(sd, lb_failed[idle]);
		sd->nr_balance_failed++;

		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

			spin_lock_irqsave(&busiest->lock, flags);

			/*
			 * The task currently running on busiest is not
			 * allowed on this_cpu, so asking for it to be pushed
			 * here is pointless.
			 */
			if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
				spin_unlock_irqrestore(&busiest->lock, flags);
				all_pinned = 1;
				goto out_one_pinned;
			}

			/* request an active push unless one is already pending */
			if (!busiest->active_balance) {
				busiest->active_balance = 1;
				busiest->push_cpu = this_cpu;
				active_balance = 1;
			}
			spin_unlock_irqrestore(&busiest->lock, flags);
			if (active_balance)
				wake_up_process(busiest->migration_thread);

			/*
			 * Fall back to just below the threshold so that the
			 * next couple of failures re-trigger this path.
			 */
			sd->nr_balance_failed = sd->cache_nice_tries+1;
		}
	} else
		sd->nr_balance_failed = 0;

	if (likely(!active_balance)) {
		/* no active push was requested: restart at the minimum interval */
		sd->balance_interval = sd->min_interval;
	} else {
		/* an active push was requested: back off, bounded by max_interval */
		if (sd->balance_interval < sd->max_interval)
			sd->balance_interval *= 2;
	}

	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		return -1;
	return ld_moved;

out_balanced:
	schedstat_inc(sd, lb_balanced[idle]);

	sd->nr_balance_failed = 0;

out_one_pinned:
	/* back off; the pinned case may grow up to MAX_PINNED_INTERVAL */
	if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
			(sd->balance_interval < sd->max_interval))
		sd->balance_interval *= 2;

	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		return -1;
	return 0;
}
3408
3409
3410
3411
3412
3413
3414
3415
/*
 * load_balance_newidle - pull load to this_cpu using the CPU_NEWLY_IDLE type.
 * @this_cpu: CPU that receives the tasks
 * @this_rq:  runqueue of @this_cpu
 * @sd:       sched domain being balanced
 * @cpus:     scratch mask, reset to "all CPUs" and shrunk on pinned retries
 *
 * Same overall flow as load_balance() but without the failure accounting
 * that leads to active balancing and without touching sd->balance_interval.
 * Only busiest->lock is released after move_tasks(), so this_rq->lock is
 * presumably already held by the caller - NOTE(review): confirm.
 *
 * Returns the value reported by move_tasks(), 0 when balanced, or -1 when
 * nothing was moved, sd_idle is clear, the domain has SD_SHARE_CPUPOWER and
 * its parent lacks SD_POWERSAVINGS_BALANCE.
 */
static int
load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
			cpumask_t *cpus)
{
	struct sched_group *group;
	struct rq *busiest = NULL;
	unsigned long imbalance;
	int ld_moved = 0;
	int sd_idle = 0;
	int all_pinned = 0;

	cpus_setall(*cpus);

	/* see load_balance(): same sd_idle rule, minus the idle-type test */
	if (sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		sd_idle = 1;

	schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
redo:
	/* no "balance" out-parameter is used on this path (NULL) */
	group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
				   &sd_idle, cpus, NULL);
	if (!group) {
		schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
		goto out_balanced;
	}

	busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
	if (!busiest) {
		schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
		goto out_balanced;
	}

	BUG_ON(busiest == this_rq);

	schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);

	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/* take busiest->lock in addition to this_rq's */
		double_lock_balance(this_rq, busiest);
		/* refresh the source clock before moving tasks */
		update_rq_clock(busiest);
		ld_moved = move_tasks(this_rq, this_cpu, busiest,
					imbalance, sd, CPU_NEWLY_IDLE,
					&all_pinned);
		spin_unlock(&busiest->lock);

		/* everything on busiest was pinned: drop it and retry */
		if (unlikely(all_pinned)) {
			cpu_clear(cpu_of(busiest), *cpus);
			if (!cpus_empty(*cpus))
				goto redo;
		}
	}

	if (!ld_moved) {
		schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
			return -1;
	} else
		sd->nr_balance_failed = 0;

	return ld_moved;

out_balanced:
	schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		return -1;
	sd->nr_balance_failed = 0;

	return 0;
}
3495
3496
3497
3498
3499
3500static void idle_balance(int this_cpu, struct rq *this_rq)
3501{
3502 struct sched_domain *sd;
3503 int pulled_task = -1;
3504 unsigned long next_balance = jiffies + HZ;
3505 cpumask_t tmpmask;
3506
3507 for_each_domain(this_cpu, sd) {
3508 unsigned long interval;
3509
3510 if (!(sd->flags & SD_LOAD_BALANCE))
3511 continue;
3512
3513 if (sd->flags & SD_BALANCE_NEWIDLE)
3514
3515 pulled_task = load_balance_newidle(this_cpu, this_rq,
3516 sd, &tmpmask);
3517
3518 interval = msecs_to_jiffies(sd->balance_interval);
3519 if (time_after(next_balance, sd->last_balance + interval))
3520 next_balance = sd->last_balance + interval;
3521 if (pulled_task)
3522 break;
3523 }
3524 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3525
3526
3527
3528
3529 this_rq->next_balance = next_balance;
3530 }
3531}
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3542{
3543 int target_cpu = busiest_rq->push_cpu;
3544 struct sched_domain *sd;
3545 struct rq *target_rq;
3546
3547
3548 if (busiest_rq->nr_running <= 1)
3549 return;
3550
3551 target_rq = cpu_rq(target_cpu);
3552
3553
3554
3555
3556
3557
3558 BUG_ON(busiest_rq == target_rq);
3559
3560
3561 double_lock_balance(busiest_rq, target_rq);
3562 update_rq_clock(busiest_rq);
3563 update_rq_clock(target_rq);
3564
3565
3566 for_each_domain(target_cpu, sd) {
3567 if ((sd->flags & SD_LOAD_BALANCE) &&
3568 cpu_isset(busiest_cpu, sd->span))
3569 break;
3570 }
3571
3572 if (likely(sd)) {
3573 schedstat_inc(sd, alb_count);
3574
3575 if (move_one_task(target_rq, target_cpu, busiest_rq,
3576 sd, CPU_IDLE))
3577 schedstat_inc(sd, alb_pushed);
3578 else
3579 schedstat_inc(sd, alb_failed);
3580 }
3581 spin_unlock(&target_rq->lock);
3582}
3583
3584#ifdef CONFIG_NO_HZ
/*
 * State shared by the tickless-idle balancing code below:
 *   load_balancer - CPU number currently elected to balance on behalf of
 *                   the CPUs in cpu_mask, or -1 when nobody holds the role
 *   cpu_mask      - CPUs that registered through
 *                   select_nohz_load_balancer(stop_tick != 0)
 */
static struct {
	atomic_t load_balancer;
	cpumask_t  cpu_mask;
} nohz ____cacheline_aligned = {
	.load_balancer = ATOMIC_INIT(-1),
	.cpu_mask = CPU_MASK_NONE,
};
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
/*
 * select_nohz_load_balancer - register/unregister the calling CPU in
 * nohz.cpu_mask and maintain the nohz.load_balancer election.
 * @stop_tick: non-zero to add this CPU to the mask (presumably called when
 *             its tick is being stopped - confirm against callers), zero to
 *             remove it again
 *
 * Returns 1 when the calling CPU is, or has just become, the elected
 * balancer; 0 in every other case.
 */
int select_nohz_load_balancer(int stop_tick)
{
	int cpu = smp_processor_id();

	if (stop_tick) {
		cpu_set(cpu, nohz.cpu_mask);
		cpu_rq(cpu)->in_nohz_recently = 1;

		/*
		 * An offline CPU must not keep the role: hand it back.  The
		 * cmpxchg can only fail if someone else changed the owner
		 * under us, which is treated as a bug.
		 */
		if (cpu_is_offline(cpu) &&
		    atomic_read(&nohz.load_balancer) == cpu) {
			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
				BUG();
			return 0;
		}

		/* every online CPU is registered: no balancer is kept */
		if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
			if (atomic_read(&nohz.load_balancer) == cpu)
				atomic_set(&nohz.load_balancer, -1);
			return 0;
		}

		if (atomic_read(&nohz.load_balancer) == -1) {
			/* role is vacant: try to claim it */
			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
				return 1;
		} else if (atomic_read(&nohz.load_balancer) == cpu)
			return 1;
	} else {
		/* not registered: nothing to undo */
		if (!cpu_isset(cpu, nohz.cpu_mask))
			return 0;

		cpu_clear(cpu, nohz.cpu_mask);

		/* give up the role if we were holding it */
		if (atomic_read(&nohz.load_balancer) == cpu)
			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
				BUG();
	}
	return 0;
}
3656#endif
3657
/*
 * Taken with spin_trylock() by rebalance_domains() around the balancing of
 * any domain that has SD_SERIALIZE set.
 */
static DEFINE_SPINLOCK(balancing);
3659
3660
3661
3662
3663
3664
3665
/*
 * rebalance_domains - run load_balance() on every domain of @cpu that is due.
 * @cpu:  CPU whose domains are walked
 * @idle: idle type passed to load_balance(); forced to CPU_NOT_IDLE for the
 *        remaining domains once load_balance() returns non-zero
 *
 * Also records in cpu_rq(cpu)->next_balance the earliest time at which any
 * of the visited domains becomes due again.
 */
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
	int balance = 1;
	struct rq *rq = cpu_rq(cpu);
	unsigned long interval;
	struct sched_domain *sd;
	/* upper bound for the next balance time: 60 seconds from now */
	unsigned long next_balance = jiffies + 60*HZ;
	int update_next_balance = 0;
	cpumask_t tmp;

	for_each_domain(cpu, sd) {
		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		interval = sd->balance_interval;
		/* stretch the interval by busy_factor unless the CPU is idle */
		if (idle != CPU_IDLE)
			interval *= sd->busy_factor;

		/* convert to jiffies and clamp to [1, HZ*NR_CPUS/10] */
		interval = msecs_to_jiffies(interval);
		if (unlikely(!interval))
			interval = 1;
		if (interval > HZ*NR_CPUS/10)
			interval = HZ*NR_CPUS/10;

		/* serialized domains: skip this round if the lock is busy */
		if (sd->flags & SD_SERIALIZE) {
			if (!spin_trylock(&balancing))
				goto out;
		}

		if (time_after_eq(jiffies, sd->last_balance + interval)) {
			if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
				/*
				 * Non-zero result: balance the remaining
				 * domains as a busy CPU.
				 */
				idle = CPU_NOT_IDLE;
			}
			sd->last_balance = jiffies;
		}
		if (sd->flags & SD_SERIALIZE)
			spin_unlock(&balancing);
out:
		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}

		/*
		 * load_balance() cleared "balance": stop walking further
		 * domains.
		 */
		if (!balance)
			break;
	}

	/* only publish next_balance if some domain actually lowered it */
	if (likely(update_next_balance))
		rq->next_balance = next_balance;
}
3734
3735
3736
3737
3738
3739
3740static void run_rebalance_domains(struct softirq_action *h)
3741{
3742 int this_cpu = smp_processor_id();
3743 struct rq *this_rq = cpu_rq(this_cpu);
3744 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3745 CPU_IDLE : CPU_NOT_IDLE;
3746
3747 rebalance_domains(this_cpu, idle);
3748
3749#ifdef CONFIG_NO_HZ
3750
3751
3752
3753
3754
3755 if (this_rq->idle_at_tick &&
3756 atomic_read(&nohz.load_balancer) == this_cpu) {
3757 cpumask_t cpus = nohz.cpu_mask;
3758 struct rq *rq;
3759 int balance_cpu;
3760
3761 cpu_clear(this_cpu, cpus);
3762 for_each_cpu_mask(balance_cpu, cpus) {
3763
3764
3765
3766
3767
3768 if (need_resched())
3769 break;
3770
3771 rebalance_domains(balance_cpu, CPU_IDLE);
3772
3773 rq = cpu_rq(balance_cpu);
3774 if (time_after(this_rq->next_balance, rq->next_balance))
3775 this_rq->next_balance = rq->next_balance;
3776 }
3777 }
3778#endif
3779}
3780
3781
3782
3783
3784
3785
3786
3787
/*
 * trigger_load_balance - raise SCHED_SOFTIRQ when @rq is due for balancing.
 * @rq:  runqueue of @cpu
 * @cpu: CPU being considered (the caller in this chunk, scheduler_tick(),
 *       passes the current CPU)
 *
 * With CONFIG_NO_HZ this first maintains the nohz balancer election and may
 * return early without raising the softirq.
 */
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
#ifdef CONFIG_NO_HZ
	/*
	 * This CPU registered in nohz.cpu_mask recently but was busy at this
	 * tick: clear the marker and, if it held the balancer role, resign.
	 */
	if (rq->in_nohz_recently && !rq->idle_at_tick) {
		rq->in_nohz_recently = 0;

		if (atomic_read(&nohz.load_balancer) == cpu) {
			cpu_clear(cpu, nohz.cpu_mask);
			atomic_set(&nohz.load_balancer, -1);
		}

		if (atomic_read(&nohz.load_balancer) == -1) {
			/*
			 * The role is vacant: kick the first CPU still in
			 * nohz.cpu_mask, if any, with resched_cpu().
			 * Presumably that CPU then re-enters the election -
			 * NOTE(review): confirm.
			 */
			int ilb = first_cpu(nohz.cpu_mask);

			if (ilb < nr_cpu_ids)
				resched_cpu(ilb);
		}
	}

	/*
	 * This idle CPU is the balancer and every online CPU is registered
	 * in nohz.cpu_mask: kick ourselves instead of balancing.
	 */
	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
	    cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
		resched_cpu(cpu);
		return;
	}

	/*
	 * Idle, registered in nohz.cpu_mask, and not the balancer: the
	 * elected balancer covers this CPU, so no softirq is raised.
	 */
	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
	    cpu_isset(cpu, nohz.cpu_mask))
		return;
#endif
	if (time_after_eq(jiffies, rq->next_balance))
		raise_softirq(SCHED_SOFTIRQ);
}
3841
3842#else
3843
3844
3845
3846
/*
 * Stub for the other branch of the surrounding #else: there is nothing to
 * pull, so idle_balance() does nothing.
 */
static inline void idle_balance(int cpu, struct rq *rq)
{
}
3850
3851#endif
3852
/*
 * Per-CPU kernel statistics; the account_*_time() helpers below update its
 * cpustat member through kstat_this_cpu.
 */
DEFINE_PER_CPU(struct kernel_stat, kstat);

EXPORT_PER_CPU_SYMBOL(kstat);
3856
3857
3858
3859
3860
3861unsigned long long task_sched_runtime(struct task_struct *p)
3862{
3863 unsigned long flags;
3864 u64 ns, delta_exec;
3865 struct rq *rq;
3866
3867 rq = task_rq_lock(p, &flags);
3868 ns = p->se.sum_exec_runtime;
3869 if (task_current(rq, p)) {
3870 update_rq_clock(rq);
3871 delta_exec = rq->clock - p->se.exec_start;
3872 if ((s64)delta_exec > 0)
3873 ns += delta_exec;
3874 }
3875 task_rq_unlock(rq, &flags);
3876
3877 return ns;
3878}
3879
3880
3881
3882
3883
3884
3885void account_user_time(struct task_struct *p, cputime_t cputime)
3886{
3887 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3888 cputime64_t tmp;
3889
3890 p->utime = cputime_add(p->utime, cputime);
3891
3892
3893 tmp = cputime_to_cputime64(cputime);
3894 if (TASK_NICE(p) > 0)
3895 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3896 else
3897 cpustat->user = cputime64_add(cpustat->user, tmp);
3898}
3899
3900
3901
3902
3903
3904
3905static void account_guest_time(struct task_struct *p, cputime_t cputime)
3906{
3907 cputime64_t tmp;
3908 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3909
3910 tmp = cputime_to_cputime64(cputime);
3911
3912 p->utime = cputime_add(p->utime, cputime);
3913 p->gtime = cputime_add(p->gtime, cputime);
3914
3915 cpustat->user = cputime64_add(cpustat->user, tmp);
3916 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3917}
3918
3919
3920
3921
3922
3923
3924void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
3925{
3926 p->utimescaled = cputime_add(p->utimescaled, cputime);
3927}
3928
3929
3930
3931
3932
3933
3934
3935void account_system_time(struct task_struct *p, int hardirq_offset,
3936 cputime_t cputime)
3937{
3938 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3939 struct rq *rq = this_rq();
3940 cputime64_t tmp;
3941
3942 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3943 account_guest_time(p, cputime);
3944 return;
3945 }
3946
3947 p->stime = cputime_add(p->stime, cputime);
3948
3949
3950 tmp = cputime_to_cputime64(cputime);
3951 if (hardirq_count() - hardirq_offset)
3952 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3953 else if (softirq_count())
3954 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3955 else if (p != rq->idle)
3956 cpustat->system = cputime64_add(cpustat->system, tmp);
3957 else if (atomic_read(&rq->nr_iowait) > 0)
3958 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3959 else
3960 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3961
3962 acct_update_integrals(p);
3963}
3964
3965
3966
3967
3968
3969
3970
3971void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
3972{
3973 p->stimescaled = cputime_add(p->stimescaled, cputime);
3974}
3975
3976
3977
3978
3979
3980
3981void account_steal_time(struct task_struct *p, cputime_t steal)
3982{
3983 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3984 cputime64_t tmp = cputime_to_cputime64(steal);
3985 struct rq *rq = this_rq();
3986
3987 if (p == rq->idle) {
3988 p->stime = cputime_add(p->stime, steal);
3989 if (atomic_read(&rq->nr_iowait) > 0)
3990 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3991 else
3992 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3993 } else
3994 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3995}
3996
3997
3998
3999
4000
4001
4002
4003
4004void scheduler_tick(void)
4005{
4006 int cpu = smp_processor_id();
4007 struct rq *rq = cpu_rq(cpu);
4008 struct task_struct *curr = rq->curr;
4009
4010 sched_clock_tick();
4011
4012 spin_lock(&rq->lock);
4013 update_rq_clock(rq);
4014 update_cpu_load(rq);
4015 curr->sched_class->task_tick(rq, curr, 0);
4016 spin_unlock(&rq->lock);
4017
4018#ifdef CONFIG_SMP
4019 rq->idle_at_tick = idle_cpu(cpu);
4020 trigger_load_balance(rq, cpu);
4021#endif
4022}
4023
4024#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
4025
4026void __kprobes add_preempt_count(int val)
4027{
4028
4029
4030
4031 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4032 return;
4033 preempt_count() += val;
4034
4035
4036
4037 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4038 PREEMPT_MASK - 10);
4039}
4040EXPORT_SYMBOL(add_preempt_count);
4041
4042void __kprobes sub_preempt_count(int val)
4043{
4044
4045
4046
4047 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4048 return;
4049
4050
4051
4052 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4053 !(preempt_count() & PREEMPT_MASK)))
4054 return;
4055
4056 preempt_count() -= val;
4057}
4058EXPORT_SYMBOL(sub_preempt_count);
4059
4060#endif
4061
4062
4063
4064
4065static noinline void __schedule_bug(struct task_struct *prev)
4066{
4067 struct pt_regs *regs = get_irq_regs();
4068
4069 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4070 prev->comm, prev->pid, preempt_count());
4071
4072 debug_show_held_locks(prev);
4073 if (irqs_disabled())
4074 print_irqtrace_events(prev);
4075
4076 if (regs)
4077 show_regs(regs);
4078 else
4079 dump_stack();
4080}
4081
4082
4083
4084
4085static inline void schedule_debug(struct task_struct *prev)
4086{
4087
4088
4089
4090
4091
4092 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4093 __schedule_bug(prev);
4094
4095 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4096
4097 schedstat_inc(this_rq(), sched_count);
4098#ifdef CONFIG_SCHEDSTATS
4099 if (unlikely(prev->lock_depth >= 0)) {
4100 schedstat_inc(this_rq(), bkl_count);
4101 schedstat_inc(prev, sched_info.bkl_count);
4102 }
4103#endif
4104}
4105
4106
4107
4108
4109static inline struct task_struct *
4110pick_next_task(struct rq *rq, struct task_struct *prev)
4111{
4112 const struct sched_class *class;
4113 struct task_struct *p;
4114
4115
4116
4117
4118
4119 if (likely(rq->nr_running == rq->cfs.nr_running)) {
4120 p = fair_sched_class.pick_next_task(rq);
4121 if (likely(p))
4122 return p;
4123 }
4124
4125 class = sched_class_highest;
4126 for ( ; ; ) {
4127 p = class->pick_next_task(rq);
4128 if (p)
4129 return p;
4130
4131
4132
4133
4134 class = class->next;
4135 }
4136}
4137
4138
4139
4140
/*
 * schedule - the main scheduler entry point.
 *
 * Picks the next task for this CPU's runqueue and switches to it.  Loops
 * until TIF_NEED_RESCHED is clear on return.
 */
asmlinkage void __sched schedule(void)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	struct rq *rq;
	int cpu;

need_resched:
	preempt_disable();
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	rcu_qsctr_inc(cpu);
	prev = rq->curr;
	/* counted in nivcsw unless prev is found to be blocking below */
	switch_count = &prev->nivcsw;

	release_kernel_lock(prev);
need_resched_nonpreemptible:

	schedule_debug(prev);

	hrtick_clear(rq);

	/*
	 * Interrupts are disabled before the runqueue clock is updated and
	 * the runqueue lock is taken.
	 */
	local_irq_disable();
	update_rq_clock(rq);
	spin_lock(&rq->lock);
	clear_tsk_need_resched(prev);

	/*
	 * prev is not runnable and PREEMPT_ACTIVE is not set: either put it
	 * back to TASK_RUNNING because a signal is pending for its state, or
	 * take it off the runqueue.  Either way it is counted in nvcsw.
	 */
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		if (unlikely(signal_pending_state(prev->state, prev)))
			prev->state = TASK_RUNNING;
		else
			deactivate_task(rq, prev, 1);
		switch_count = &prev->nvcsw;
	}

#ifdef CONFIG_SMP
	/* optional per-class hook, called before the next task is picked */
	if (prev->sched_class->pre_schedule)
		prev->sched_class->pre_schedule(rq, prev);
#endif

	/* nothing left to run: try to pull work from other CPUs */
	if (unlikely(!rq->nr_running))
		idle_balance(cpu, rq);

	prev->sched_class->put_prev_task(rq, prev);
	next = pick_next_task(rq, prev);

	if (likely(prev != next)) {
		sched_info_switch(prev, next);

		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		context_switch(rq, prev, next);
		/*
		 * cpu and rq are re-read after the switch because the
		 * values from before it may be stale.  rq->lock is not
		 * dropped on this path in this function; presumably
		 * context_switch() takes care of it - NOTE(review): confirm.
		 */
		cpu = smp_processor_id();
		rq = cpu_rq(cpu);
	} else
		spin_unlock_irq(&rq->lock);

	hrtick_set(rq);

	/* a negative return restarts from the label past release_kernel_lock() */
	if (unlikely(reacquire_kernel_lock(current) < 0))
		goto need_resched_nonpreemptible;

	preempt_enable_no_resched();
	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
		goto need_resched;
}
EXPORT_SYMBOL(schedule);
4217
4218#ifdef CONFIG_PREEMPT
4219
4220
4221
4222
4223
4224asmlinkage void __sched preempt_schedule(void)
4225{
4226 struct thread_info *ti = current_thread_info();
4227
4228
4229
4230
4231
4232 if (likely(ti->preempt_count || irqs_disabled()))
4233 return;
4234
4235 do {
4236 add_preempt_count(PREEMPT_ACTIVE);
4237 schedule();
4238 sub_preempt_count(PREEMPT_ACTIVE);
4239
4240
4241
4242
4243
4244 barrier();
4245 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
4246}
4247EXPORT_SYMBOL(preempt_schedule);
4248
4249
4250
4251
4252
4253
4254
4255asmlinkage void __sched preempt_schedule_irq(void)
4256{
4257 struct thread_info *ti = current_thread_info();
4258
4259
4260 BUG_ON(ti->preempt_count || !irqs_disabled());
4261
4262 do {
4263 add_preempt_count(PREEMPT_ACTIVE);
4264 local_irq_enable();
4265 schedule();
4266 local_irq_disable();
4267 sub_preempt_count(PREEMPT_ACTIVE);
4268
4269
4270
4271
4272
4273 barrier();
4274 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
4275}
4276
4277#endif
4278
4279int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
4280 void *key)
4281{
4282 return try_to_wake_up(curr->private, mode, sync);
4283}
4284EXPORT_SYMBOL(default_wake_function);
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4296 int nr_exclusive, int sync, void *key)
4297{
4298 wait_queue_t *curr, *next;
4299
4300 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
4301 unsigned flags = curr->flags;
4302
4303 if (curr->func(curr, mode, sync, key) &&
4304 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
4305 break;
4306 }
4307}
4308
4309
4310
4311
4312
4313
4314
4315
4316void __wake_up(wait_queue_head_t *q, unsigned int mode,
4317 int nr_exclusive, void *key)
4318{
4319 unsigned long flags;
4320
4321 spin_lock_irqsave(&q->lock, flags);
4322 __wake_up_common(q, mode, nr_exclusive, 0, key);
4323 spin_unlock_irqrestore(&q->lock, flags);
4324}
4325EXPORT_SYMBOL(__wake_up);
4326
4327
4328
4329
4330void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4331{
4332 __wake_up_common(q, mode, 1, 0, NULL);
4333}
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348void
4349__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4350{
4351 unsigned long flags;
4352 int sync = 1;
4353
4354 if (unlikely(!q))
4355 return;
4356
4357 if (unlikely(!nr_exclusive))
4358 sync = 0;
4359
4360 spin_lock_irqsave(&q->lock, flags);
4361 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
4362 spin_unlock_irqrestore(&q->lock, flags);
4363}
4364EXPORT_SYMBOL_GPL(__wake_up_sync);
4365
4366void complete(struct completion *x)
4367{
4368 unsigned long flags;
4369
4370 spin_lock_irqsave(&x->wait.lock, flags);
4371 x->done++;
4372 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
4373 spin_unlock_irqrestore(&x->wait.lock, flags);
4374}
4375EXPORT_SYMBOL(complete);
4376
4377void complete_all(struct completion *x)
4378{
4379 unsigned long flags;
4380
4381 spin_lock_irqsave(&x->wait.lock, flags);
4382 x->done += UINT_MAX/2;
4383 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
4384 spin_unlock_irqrestore(&x->wait.lock, flags);
4385}
4386EXPORT_SYMBOL(complete_all);
4387
/*
 * do_wait_for_common - wait for @x to be completed; x->wait.lock is released
 * and re-taken with the _irq variants around each sleep, so the caller must
 * hold it on entry (wait_for_common() does).
 * @x:       completion to wait for
 * @timeout: maximum time to sleep, as accepted by schedule_timeout()
 * @state:   task state to sleep in
 *
 * Returns -ERESTARTSYS if a signal (TASK_INTERRUPTIBLE) or a fatal signal
 * (TASK_KILLABLE) was pending before a sleep, 0 if the timeout expired
 * without completion, and otherwise the remaining timeout (at least 1)
 * after consuming one completion event.
 */
static inline long __sched
do_wait_for_common(struct completion *x, long timeout, int state)
{
	if (!x->done) {
		DECLARE_WAITQUEUE(wait, current);

		/* queue at the tail as an exclusive waiter */
		wait.flags |= WQ_FLAG_EXCLUSIVE;
		__add_wait_queue_tail(&x->wait, &wait);
		do {
			/* x->done is known to be zero whenever this runs */
			if ((state == TASK_INTERRUPTIBLE &&
			     signal_pending(current)) ||
			    (state == TASK_KILLABLE &&
			     fatal_signal_pending(current))) {
				timeout = -ERESTARTSYS;
				break;
			}
			/* state is set before the lock is dropped */
			__set_current_state(state);
			spin_unlock_irq(&x->wait.lock);
			timeout = schedule_timeout(timeout);
			spin_lock_irq(&x->wait.lock);
		} while (!x->done && timeout);
		__remove_wait_queue(&x->wait, &wait);
		/* interrupted or timed out: report without consuming */
		if (!x->done)
			return timeout;
	}
	/* consume one completion event */
	x->done--;
	/* never report 0 on success, even if the timeout just ran out */
	return timeout ?: 1;
}
4416
4417static long __sched
4418wait_for_common(struct completion *x, long timeout, int state)
4419{
4420 might_sleep();
4421
4422 spin_lock_irq(&x->wait.lock);
4423 timeout = do_wait_for_common(x, timeout, state);
4424 spin_unlock_irq(&x->wait.lock);
4425 return timeout;
4426}
4427
4428void __sched wait_for_completion(struct completion *x)
4429{
4430 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4431}
4432EXPORT_SYMBOL(wait_for_completion);
4433
4434unsigned long __sched
4435wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4436{
4437 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4438}
4439EXPORT_SYMBOL(wait_for_completion_timeout);
4440
4441int __sched wait_for_completion_interruptible(struct completion *x)
4442{
4443 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4444 if (t == -ERESTARTSYS)
4445 return t;
4446 return 0;
4447}
4448EXPORT_SYMBOL(wait_for_completion_interruptible);
4449
4450unsigned long __sched
4451wait_for_completion_interruptible_timeout(struct completion *x,
4452 unsigned long timeout)
4453{
4454 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4455}
4456EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4457
4458int __sched wait_for_completion_killable(struct completion *x)
4459{
4460 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4461 if (t == -ERESTARTSYS)
4462 return t;
4463 return 0;
4464}
4465EXPORT_SYMBOL(wait_for_completion_killable);
4466
/*
 * sleep_on_common - put the current task on wait queue @q and sleep.
 * @q:       wait queue to sleep on
 * @state:   task state to sleep in
 * @timeout: maximum time to sleep, as accepted by schedule_timeout()
 *
 * Returns the value of schedule_timeout(), i.e. the remaining time.
 * There is no condition check between setting the state and sleeping; the
 * function just sleeps once and removes itself from the queue afterwards.
 */
static long __sched
sleep_on_common(wait_queue_head_t *q, int state, long timeout)
{
	unsigned long flags;
	wait_queue_t wait;

	init_waitqueue_entry(&wait, current);

	__set_current_state(state);

	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue(q, &wait);
	/*
	 * Only the lock is dropped here; the interrupt state saved in
	 * "flags" is restored by the final unlock below.
	 */
	spin_unlock(&q->lock);
	timeout = schedule_timeout(timeout);
	spin_lock_irq(&q->lock);
	__remove_wait_queue(q, &wait);
	spin_unlock_irqrestore(&q->lock, flags);

	return timeout;
}
4487
4488void __sched interruptible_sleep_on(wait_queue_head_t *q)
4489{
4490 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4491}
4492EXPORT_SYMBOL(interruptible_sleep_on);
4493
4494long __sched
4495interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4496{
4497 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4498}
4499EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4500
4501void __sched sleep_on(wait_queue_head_t *q)
4502{
4503 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4504}
4505EXPORT_SYMBOL(sleep_on);
4506
4507long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4508{
4509 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4510}
4511EXPORT_SYMBOL(sleep_on_timeout);
4512
4513#ifdef CONFIG_RT_MUTEXES
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
/*
 * rt_mutex_setprio - set the effective priority of a task.
 * @p:    task to modify
 * @prio: new value for p->prio; must lie in [0, MAX_PRIO]
 *
 * Going by the name and the CONFIG_RT_MUTEXES guard this is the hook used
 * for priority inheritance - NOTE(review): confirm against callers.
 * The scheduling class is switched to the RT or fair class according to
 * rt_prio(@prio).  Runs under the task's runqueue lock.
 */
void rt_mutex_setprio(struct task_struct *p, int prio)
{
	unsigned long flags;
	int oldprio, on_rq, running;
	struct rq *rq;
	const struct sched_class *prev_class = p->sched_class;

	BUG_ON(prio < 0 || prio > MAX_PRIO);

	rq = task_rq_lock(p, &flags);
	update_rq_clock(rq);

	oldprio = p->prio;
	on_rq = p->se.on_rq;
	running = task_current(rq, p);
	/* take the task out of its old class before changing anything */
	if (on_rq)
		dequeue_task(rq, p, 0);
	if (running)
		p->sched_class->put_prev_task(rq, p);

	if (rt_prio(prio))
		p->sched_class = &rt_sched_class;
	else
		p->sched_class = &fair_sched_class;

	p->prio = prio;

	/* and put it back using the (possibly new) class */
	if (running)
		p->sched_class->set_curr_task(rq);
	if (on_rq) {
		enqueue_task(rq, p, 0);

		check_class_changed(rq, p, prev_class, oldprio, running);
	}
	task_rq_unlock(rq, &flags);
}
4561
4562#endif
4563
/*
 * set_user_nice - change the nice value of a task.
 * @p:    task to modify
 * @nice: new nice value; silently ignored when outside [-20, 19] or equal
 *        to the current value
 *
 * Runs under the task's runqueue lock.
 */
void set_user_nice(struct task_struct *p, long nice)
{
	int old_prio, delta, on_rq;
	unsigned long flags;
	struct rq *rq;

	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
		return;
	/*
	 * The runqueue lock is needed because the task may have to be taken
	 * off and put back on its runqueue below.
	 */
	rq = task_rq_lock(p, &flags);
	update_rq_clock(rq);
	/*
	 * For a task with an RT policy only static_prio is recorded; the
	 * effective priority, load weight and queueing are left untouched.
	 */
	if (task_has_rt_policy(p)) {
		p->static_prio = NICE_TO_PRIO(nice);
		goto out_unlock;
	}
	on_rq = p->se.on_rq;
	if (on_rq) {
		dequeue_task(rq, p, 0);
		dec_load(rq, p);
	}

	p->static_prio = NICE_TO_PRIO(nice);
	set_load_weight(p);
	old_prio = p->prio;
	p->prio = effective_prio(p);
	delta = p->prio - old_prio;

	if (on_rq) {
		enqueue_task(rq, p, 0);
		inc_load(rq, p);
		/*
		 * Reschedule the runqueue's current task if p->prio went
		 * down numerically, or went up while @p itself is running.
		 */
		if (delta < 0 || (delta > 0 && task_running(rq, p)))
			resched_task(rq->curr);
	}
out_unlock:
	task_rq_unlock(rq, &flags);
}
EXPORT_SYMBOL(set_user_nice);
4614
4615
4616
4617
4618
4619
4620int can_nice(const struct task_struct *p, const int nice)
4621{
4622
4623 int nice_rlim = 20 - nice;
4624
4625 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
4626 capable(CAP_SYS_NICE));
4627}
4628
4629#ifdef __ARCH_WANT_SYS_NICE
4630
4631
4632
4633
4634
4635
4636
4637
4638asmlinkage long sys_nice(int increment)
4639{
4640 long nice, retval;
4641
4642
4643
4644
4645
4646
4647 if (increment < -40)
4648 increment = -40;
4649 if (increment > 40)
4650 increment = 40;
4651
4652 nice = PRIO_TO_NICE(current->static_prio) + increment;
4653 if (nice < -20)
4654 nice = -20;
4655 if (nice > 19)
4656 nice = 19;
4657
4658 if (increment < 0 && !can_nice(current, nice))
4659 return -EPERM;
4660
4661 retval = security_task_setnice(current, nice);
4662 if (retval)
4663 return retval;
4664
4665 set_user_nice(current, nice);
4666 return 0;
4667}
4668
4669#endif
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679int task_prio(const struct task_struct *p)
4680{
4681 return p->prio - MAX_RT_PRIO;
4682}
4683
4684
4685
4686
4687
/*
 * task_nice - nice value of a task, derived from its static priority.
 * @p: task in question
 */
int task_nice(const struct task_struct *p)
{
	int nice = TASK_NICE(p);

	return nice;
}
EXPORT_SYMBOL(task_nice);
4693
4694
4695
4696
4697
4698int idle_cpu(int cpu)
4699{
4700 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4701}
4702
4703
4704
4705
4706
4707struct task_struct *idle_task(int cpu)
4708{
4709 return cpu_rq(cpu)->idle;
4710}
4711
4712
4713
4714
4715
4716static struct task_struct *find_process_by_pid(pid_t pid)
4717{
4718 return pid ? find_task_by_vpid(pid) : current;
4719}
4720
4721
4722static void
4723__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4724{
4725 BUG_ON(p->se.on_rq);
4726
4727 p->policy = policy;
4728 switch (p->policy) {
4729 case SCHED_NORMAL:
4730 case SCHED_BATCH:
4731 case SCHED_IDLE:
4732 p->sched_class = &fair_sched_class;
4733 break;
4734 case SCHED_FIFO:
4735 case SCHED_RR:
4736 p->sched_class = &rt_sched_class;
4737 break;
4738 }
4739
4740 p->rt_priority = prio;
4741 p->normal_prio = normal_prio(p);
4742
4743 p->prio = rt_mutex_getprio(p);
4744 set_load_weight(p);
4745}
4746
4747
4748
4749
4750
4751
4752
4753
4754
/**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a task.
 * @p: the task in question.
 * @policy: new policy; a negative value means "keep p's current policy".
 * @param: structure containing the new RT priority (sched_priority).
 *
 * Returns 0 on success, -EINVAL for an unknown policy or an out-of-range
 * priority, -EPERM when a permission check fails, -ESRCH when the task's
 * sighand lock cannot be taken, or the error returned by
 * security_task_setscheduler().
 */
4755int sched_setscheduler(struct task_struct *p, int policy,
4756 struct sched_param *param)
4757{
4758 int retval, oldprio, oldpolicy = -1, on_rq, running;
4759 unsigned long flags;
4760 const struct sched_class *prev_class = p->sched_class;
4761 struct rq *rq;
4762
4763
 /* must not be called from interrupt context */
4764 BUG_ON(in_interrupt());
4765recheck:
4766
 /*
  * A negative policy keeps the current one. It is remembered in oldpolicy
  * so the check made under the locks below can detect a concurrent change.
  */
4767 if (policy < 0)
4768 policy = oldpolicy = p->policy;
4769 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4770 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4771 policy != SCHED_IDLE)
4772 return -EINVAL;
4773
4774
4775
4776
4777
 /*
  * Priority range check: tasks with an mm are limited to
  * MAX_USER_RT_PRIO-1, tasks without one to MAX_RT_PRIO-1. RT policies
  * require a non-zero priority; all other policies require zero.
  */
4778 if (param->sched_priority < 0 ||
4779 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4780 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4781 return -EINVAL;
4782 if (rt_policy(policy) != (param->sched_priority != 0))
4783 return -EINVAL;
4784
4785
4786
4787
 /* additional restrictions for callers without CAP_SYS_NICE */
4788 if (!capable(CAP_SYS_NICE)) {
4789 if (rt_policy(policy)) {
4790 unsigned long rlim_rtprio;
4791
 /* RLIMIT_RTPRIO is sampled under the task's sighand lock */
4792 if (!lock_task_sighand(p, &flags))
4793 return -ESRCH;
4794 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4795 unlock_task_sighand(p, &flags);
4796
4797
 /* switching to a different RT policy needs a non-zero rlimit */
4798 if (policy != p->policy && !rlim_rtprio)
4799 return -EPERM;
4800
4801
 /* refuse a priority above both the current one and the rlimit */
4802 if (param->sched_priority > p->rt_priority &&
4803 param->sched_priority > rlim_rtprio)
4804 return -EPERM;
4805 }
4806
4807
4808
4809
 /* a SCHED_IDLE task may not be moved to another policy */
4810 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4811 return -EPERM;
4812
4813
 /* caller's euid must match the target's euid or uid */
4814 if ((current->euid != p->euid) &&
4815 (current->euid != p->uid))
4816 return -EPERM;
4817 }
4818
4819#ifdef CONFIG_RT_GROUP_SCHED
4820
4821
4822
4823
 /* no RT policy for tasks whose group has a zero rt_runtime */
4824 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
4825 return -EPERM;
4826#endif
4827
4828 retval = security_task_setscheduler(p, policy, param);
4829 if (retval)
4830 return retval;
4831
4832
4833
4834
 /* lock order: p->pi_lock (irqs off) first, then the runqueue lock */
4835 spin_lock_irqsave(&p->pi_lock, flags);
4836
4837
4838
4839
4840 rq = __task_rq_lock(p);
4841
 /*
  * If the policy was inherited from the task and has changed since it was
  * read without the locks, drop both locks and redo every check.
  */
4842 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4843 policy = oldpolicy = -1;
4844 __task_rq_unlock(rq);
4845 spin_unlock_irqrestore(&p->pi_lock, flags);
4846 goto recheck;
4847 }
4848 update_rq_clock(rq);
4849 on_rq = p->se.on_rq;
4850 running = task_current(rq, p);
 /* take the task off the runqueue; __setscheduler() requires that */
4851 if (on_rq)
4852 deactivate_task(rq, p, 0);
4853 if (running)
4854 p->sched_class->put_prev_task(rq, p);
4855
4856 oldprio = p->prio;
4857 __setscheduler(rq, p, policy, param->sched_priority);
4858
 /* put the task back under its (possibly new) scheduling class */
4859 if (running)
4860 p->sched_class->set_curr_task(rq);
4861 if (on_rq) {
4862 activate_task(rq, p, 0);
4863
4864 check_class_changed(rq, p, prev_class, oldprio, running);
4865 }
4866 __task_rq_unlock(rq);
4867 spin_unlock_irqrestore(&p->pi_lock, flags);
4868
 /* called only after both locks have been released */
4869 rt_mutex_adjust_pi(p);
4870
4871 return 0;
4872}
4873EXPORT_SYMBOL_GPL(sched_setscheduler);
4874
4875static int
4876do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4877{
4878 struct sched_param lparam;
4879 struct task_struct *p;
4880 int retval;
4881
4882 if (!param || pid < 0)
4883 return -EINVAL;
4884 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4885 return -EFAULT;
4886
4887 rcu_read_lock();
4888 retval = -ESRCH;
4889 p = find_process_by_pid(pid);
4890 if (p != NULL)
4891 retval = sched_setscheduler(p, policy, &lparam);
4892 rcu_read_unlock();
4893
4894 return retval;
4895}
4896
4897
4898
4899
4900
4901
4902
4903asmlinkage long
4904sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4905{
4906
4907 if (policy < 0)
4908 return -EINVAL;
4909
4910 return do_sched_setscheduler(pid, policy, param);
4911}
4912
4913
4914
4915
4916
4917
4918asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4919{
4920 return do_sched_setscheduler(pid, -1, param);
4921}
4922
4923
4924
4925
4926
4927asmlinkage long sys_sched_getscheduler(pid_t pid)
4928{
4929 struct task_struct *p;
4930 int retval;
4931
4932 if (pid < 0)
4933 return -EINVAL;
4934
4935 retval = -ESRCH;
4936 read_lock(&tasklist_lock);
4937 p = find_process_by_pid(pid);
4938 if (p) {
4939 retval = security_task_getscheduler(p);
4940 if (!retval)
4941 retval = p->policy;
4942 }
4943 read_unlock(&tasklist_lock);
4944 return retval;
4945}
4946
4947
4948
4949
4950
4951
4952asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4953{
4954 struct sched_param lp;
4955 struct task_struct *p;
4956 int retval;
4957
4958 if (!param || pid < 0)
4959 return -EINVAL;
4960
4961 read_lock(&tasklist_lock);
4962 p = find_process_by_pid(pid);
4963 retval = -ESRCH;
4964 if (!p)
4965 goto out_unlock;
4966
4967 retval = security_task_getscheduler(p);
4968 if (retval)
4969 goto out_unlock;
4970
4971 lp.sched_priority = p->rt_priority;
4972 read_unlock(&tasklist_lock);
4973
4974
4975
4976
4977 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4978
4979 return retval;
4980
4981out_unlock:
4982 read_unlock(&tasklist_lock);
4983 return retval;
4984}
4985
4986long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
4987{
4988 cpumask_t cpus_allowed;
4989 cpumask_t new_mask = *in_mask;
4990 struct task_struct *p;
4991 int retval;
4992
4993 get_online_cpus();
4994 read_lock(&tasklist_lock);
4995
4996 p = find_process_by_pid(pid);
4997 if (!p) {
4998 read_unlock(&tasklist_lock);
4999 put_online_cpus();
5000 return -ESRCH;
5001 }
5002
5003
5004
5005
5006
5007
5008 get_task_struct(p);
5009 read_unlock(&tasklist_lock);
5010
5011 retval = -EPERM;
5012 if ((current->euid != p->euid) && (current->euid != p->uid) &&
5013 !capable(CAP_SYS_NICE))
5014 goto out_unlock;
5015
5016 retval = security_task_setscheduler(p, 0, NULL);
5017 if (retval)
5018 goto out_unlock;
5019
5020 cpuset_cpus_allowed(p, &cpus_allowed);
5021 cpus_and(new_mask, new_mask, cpus_allowed);
5022 again:
5023 retval = set_cpus_allowed_ptr(p, &new_mask);
5024
5025 if (!retval) {
5026 cpuset_cpus_allowed(p, &cpus_allowed);
5027 if (!cpus_subset(new_mask, cpus_allowed)) {
5028
5029
5030
5031
5032
5033 new_mask = cpus_allowed;
5034 goto again;
5035 }
5036 }
5037out_unlock:
5038 put_task_struct(p);
5039 put_online_cpus();
5040 return retval;
5041}
5042
5043static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5044 cpumask_t *new_mask)
5045{
5046 if (len < sizeof(cpumask_t)) {
5047 memset(new_mask, 0, sizeof(cpumask_t));
5048 } else if (len > sizeof(cpumask_t)) {
5049 len = sizeof(cpumask_t);
5050 }
5051 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5052}
5053
5054
5055
5056
5057
5058
5059
5060asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5061 unsigned long __user *user_mask_ptr)
5062{
5063 cpumask_t new_mask;
5064 int retval;
5065
5066 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
5067 if (retval)
5068 return retval;
5069
5070 return sched_setaffinity(pid, &new_mask);
5071}
5072
5073
5074
5075
5076
5077
5078
5079
/*
 * cpu_present_map is defined and exported here unconditionally. It has no
 * initializer in this file.
 */
5080cpumask_t cpu_present_map __read_mostly;
5081EXPORT_SYMBOL(cpu_present_map);
5082
/*
 * Without CONFIG_SMP the online and possible maps are defined here as
 * well, both initialised to CPU_MASK_ALL.
 */
5083#ifndef CONFIG_SMP
5084cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
5085EXPORT_SYMBOL(cpu_online_map);
5086
5087cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
5088EXPORT_SYMBOL(cpu_possible_map);
5089#endif
5090
5091long sched_getaffinity(pid_t pid, cpumask_t *mask)
5092{
5093 struct task_struct *p;
5094 int retval;
5095
5096 get_online_cpus();
5097 read_lock(&tasklist_lock);
5098
5099 retval = -ESRCH;
5100 p = find_process_by_pid(pid);
5101 if (!p)
5102 goto out_unlock;
5103
5104 retval = security_task_getscheduler(p);
5105 if (retval)
5106 goto out_unlock;
5107
5108 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
5109
5110out_unlock:
5111 read_unlock(&tasklist_lock);
5112 put_online_cpus();
5113
5114 return retval;
5115}
5116
5117
5118
5119
5120
5121
5122
5123asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
5124 unsigned long __user *user_mask_ptr)
5125{
5126 int ret;
5127 cpumask_t mask;
5128
5129 if (len < sizeof(cpumask_t))
5130 return -EINVAL;
5131
5132 ret = sched_getaffinity(pid, &mask);
5133 if (ret < 0)
5134 return ret;
5135
5136 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
5137 return -EFAULT;
5138
5139 return sizeof(cpumask_t);
5140}
5141
5142
5143
5144
5145
5146
5147
/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * Asks the current task's scheduling class to yield, then calls
 * schedule(). Always returns 0.
 */
5148asmlinkage long sys_sched_yield(void)
5149{
 /* presumably returns with rq->lock held; the unlock below pairs with it */
5150 struct rq *rq = this_rq_lock();
5151
5152 schedstat_inc(rq, yld_count);
5153 current->sched_class->yield_task(rq);
5154
5155
5156
5157
5158
 /*
  * rq->lock is released piece by piece: the checker annotation and the
  * lockdep release come first, then the raw unlock, then preemption is
  * re-enabled without a resched check, since schedule() is called
  * immediately afterwards anyway.
  */
5159 __release(rq->lock);
5160 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
5161 _raw_spin_unlock(&rq->lock);
5162 preempt_enable_no_resched();
5163
5164 schedule();
5165
5166 return 0;
5167}
5168
5169static void __cond_resched(void)
5170{
5171#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
5172 __might_sleep(__FILE__, __LINE__);
5173#endif
5174
5175
5176
5177
5178
5179 do {
5180 add_preempt_count(PREEMPT_ACTIVE);
5181 schedule();
5182 sub_preempt_count(PREEMPT_ACTIVE);
5183 } while (need_resched());
5184}
5185
5186int __sched _cond_resched(void)
5187{
5188 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
5189 system_state == SYSTEM_RUNNING) {
5190 __cond_resched();
5191 return 1;
5192 }
5193 return 0;
5194}
5195EXPORT_SYMBOL(_cond_resched);
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205int cond_resched_lock(spinlock_t *lock)
5206{
5207 int resched = need_resched() && system_state == SYSTEM_RUNNING;
5208 int ret = 0;
5209
5210 if (spin_needbreak(lock) || resched) {
5211 spin_unlock(lock);
5212 if (resched && need_resched())
5213 __cond_resched();
5214 else
5215 cpu_relax();
5216 ret = 1;
5217 spin_lock(lock);
5218 }
5219 return ret;
5220}
5221EXPORT_SYMBOL(cond_resched_lock);
5222
5223int __sched cond_resched_softirq(void)
5224{
5225 BUG_ON(!in_softirq());
5226
5227 if (need_resched() && system_state == SYSTEM_RUNNING) {
5228 local_bh_enable();
5229 __cond_resched();
5230 local_bh_disable();
5231 return 1;
5232 }
5233 return 0;
5234}
5235EXPORT_SYMBOL(cond_resched_softirq);
5236
5237
5238
5239
5240
5241
5242
5243void __sched yield(void)
5244{
5245 set_current_state(TASK_RUNNING);
5246 sys_sched_yield();
5247}
5248EXPORT_SYMBOL(yield);
5249
5250
5251
5252
5253
5254
5255
5256
5257void __sched io_schedule(void)
5258{
5259 struct rq *rq = &__raw_get_cpu_var(runqueues);
5260
5261 delayacct_blkio_start();
5262 atomic_inc(&rq->nr_iowait);
5263 schedule();
5264 atomic_dec(&rq->nr_iowait);
5265 delayacct_blkio_end();
5266}
5267EXPORT_SYMBOL(io_schedule);
5268
5269long __sched io_schedule_timeout(long timeout)
5270{
5271 struct rq *rq = &__raw_get_cpu_var(runqueues);
5272 long ret;
5273
5274 delayacct_blkio_start();
5275 atomic_inc(&rq->nr_iowait);
5276 ret = schedule_timeout(timeout);
5277 atomic_dec(&rq->nr_iowait);
5278 delayacct_blkio_end();
5279 return ret;
5280}
5281
5282
5283
5284
5285
5286
5287
5288
5289asmlinkage long sys_sched_get_priority_max(int policy)
5290{
5291 int ret = -EINVAL;
5292
5293 switch (policy) {
5294 case SCHED_FIFO:
5295 case SCHED_RR:
5296 ret = MAX_USER_RT_PRIO-1;
5297 break;
5298 case SCHED_NORMAL:
5299 case SCHED_BATCH:
5300 case SCHED_IDLE:
5301 ret = 0;
5302 break;
5303 }
5304 return ret;
5305}
5306
5307
5308
5309
5310
5311
5312
5313
5314asmlinkage long sys_sched_get_priority_min(int policy)
5315{
5316 int ret = -EINVAL;
5317
5318 switch (policy) {
5319 case SCHED_FIFO:
5320 case SCHED_RR:
5321 ret = 1;
5322 break;
5323 case SCHED_NORMAL:
5324 case SCHED_BATCH:
5325 case SCHED_IDLE:
5326 ret = 0;
5327 }
5328 return ret;
5329}
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339asmlinkage
5340long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
5341{
5342 struct task_struct *p;
5343 unsigned int time_slice;
5344 int retval;
5345 struct timespec t;
5346
5347 if (pid < 0)
5348 return -EINVAL;
5349
5350 retval = -ESRCH;
5351 read_lock(&tasklist_lock);
5352 p = find_process_by_pid(pid);
5353 if (!p)
5354 goto out_unlock;
5355
5356 retval = security_task_getscheduler(p);
5357 if (retval)
5358 goto out_unlock;
5359
5360
5361
5362
5363
5364 time_slice = 0;
5365 if (p->policy == SCHED_RR) {
5366 time_slice = DEF_TIMESLICE;
5367 } else if (p->policy != SCHED_FIFO) {
5368 struct sched_entity *se = &p->se;
5369 unsigned long flags;
5370 struct rq *rq;
5371
5372 rq = task_rq_lock(p, &flags);
5373 if (rq->cfs.load.weight)
5374 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
5375 task_rq_unlock(rq, &flags);
5376 }
5377 read_unlock(&tasklist_lock);
5378 jiffies_to_timespec(time_slice, &t);
5379 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5380 return retval;
5381
5382out_unlock:
5383 read_unlock(&tasklist_lock);
5384 return retval;
5385}
5386
5387static const char stat_nam[] = "RSDTtZX";
5388
5389void sched_show_task(struct task_struct *p)
5390{
5391 unsigned long free = 0;
5392 unsigned state;
5393
5394 state = p->state ? __ffs(p->state) + 1 : 0;
5395 printk(KERN_INFO "%-13.13s %c", p->comm,
5396 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5397#if BITS_PER_LONG == 32
5398 if (state == TASK_RUNNING)
5399 printk(KERN_CONT " running ");
5400 else
5401 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5402#else
5403 if (state == TASK_RUNNING)
5404 printk(KERN_CONT " running task ");
5405 else
5406 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5407#endif
5408#ifdef CONFIG_DEBUG_STACK_USAGE
5409 {
5410 unsigned long *n = end_of_stack(p);
5411 while (!*n)
5412 n++;
5413 free = (unsigned long)n - (unsigned long)end_of_stack(p);
5414 }
5415#endif
5416 printk(KERN_CONT "%5lu %5d %6d\n", free,
5417 task_pid_nr(p), task_pid_nr(p->real_parent));
5418
5419 show_stack(p, NULL);
5420}
5421
5422void show_state_filter(unsigned long state_filter)
5423{
5424 struct task_struct *g, *p;
5425
5426#if BITS_PER_LONG == 32
5427 printk(KERN_INFO
5428 " task PC stack pid father\n");
5429#else
5430 printk(KERN_INFO
5431 " task PC stack pid father\n");
5432#endif
5433 read_lock(&tasklist_lock);
5434 do_each_thread(g, p) {
5435
5436
5437
5438
5439 touch_nmi_watchdog();
5440 if (!state_filter || (p->state & state_filter))
5441 sched_show_task(p);
5442 } while_each_thread(g, p);
5443
5444 touch_all_softlockup_watchdogs();
5445
5446#ifdef CONFIG_SCHED_DEBUG
5447 sysrq_sched_debug_show();
5448#endif
5449 read_unlock(&tasklist_lock);
5450
5451
5452
5453 if (state_filter == -1)
5454 debug_show_all_locks();
5455}
5456
5457void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5458{
5459 idle->sched_class = &idle_sched_class;
5460}
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470void __cpuinit init_idle(struct task_struct *idle, int cpu)
5471{
5472 struct rq *rq = cpu_rq(cpu);
5473 unsigned long flags;
5474
5475 __sched_fork(idle);
5476 idle->se.exec_start = sched_clock();
5477
5478 idle->prio = idle->normal_prio = MAX_PRIO;
5479 idle->cpus_allowed = cpumask_of_cpu(cpu);
5480 __set_task_cpu(idle, cpu);
5481
5482 spin_lock_irqsave(&rq->lock, flags);
5483 rq->curr = rq->idle = idle;
5484#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5485 idle->oncpu = 1;
5486#endif
5487 spin_unlock_irqrestore(&rq->lock, flags);
5488
5489
5490#if defined(CONFIG_PREEMPT)
5491 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5492#else
5493 task_thread_info(idle)->preempt_count = 0;
5494#endif
5495
5496
5497
5498 idle->sched_class = &idle_sched_class;
5499}
5500
5501
5502
5503
5504
5505
5506
5507
/*
 * Global CPU mask, initially empty (CPU_MASK_NONE). It is not referenced
 * anywhere else in this part of the file.
 * NOTE(review): by its name this presumably tracks CPUs whose periodic
 * tick is stopped; confirm against its users.
 */
5508cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519static inline void sched_init_granularity(void)
5520{
5521 unsigned int factor = 1 + ilog2(num_online_cpus());
5522 const unsigned long limit = 200000000;
5523
5524 sysctl_sched_min_granularity *= factor;
5525 if (sysctl_sched_min_granularity > limit)
5526 sysctl_sched_min_granularity = limit;
5527
5528 sysctl_sched_latency *= factor;
5529 if (sysctl_sched_latency > limit)
5530 sysctl_sched_latency = limit;
5531
5532 sysctl_sched_wakeup_granularity *= factor;
5533}
5534
5535#ifdef CONFIG_SMP
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5562{
5563 struct migration_req req;
5564 unsigned long flags;
5565 struct rq *rq;
5566 int ret = 0;
5567
5568 rq = task_rq_lock(p, &flags);
5569 if (!cpus_intersects(*new_mask, cpu_online_map)) {
5570 ret = -EINVAL;
5571 goto out;
5572 }
5573
5574 if (p->sched_class->set_cpus_allowed)
5575 p->sched_class->set_cpus_allowed(p, new_mask);
5576 else {
5577 p->cpus_allowed = *new_mask;
5578 p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
5579 }
5580
5581
5582 if (cpu_isset(task_cpu(p), *new_mask))
5583 goto out;
5584
5585 if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
5586
5587 task_rq_unlock(rq, &flags);
5588 wake_up_process(rq->migration_thread);
5589 wait_for_completion(&req.done);
5590 tlb_migrate_finish(p->mm);
5591 return 0;
5592 }
5593out:
5594 task_rq_unlock(rq, &flags);
5595
5596 return ret;
5597}
5598EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5612{
5613 struct rq *rq_dest, *rq_src;
5614 int ret = 0, on_rq;
5615
5616 if (unlikely(cpu_is_offline(dest_cpu)))
5617 return ret;
5618
5619 rq_src = cpu_rq(src_cpu);
5620 rq_dest = cpu_rq(dest_cpu);
5621
5622 double_rq_lock(rq_src, rq_dest);
5623
5624 if (task_cpu(p) != src_cpu)
5625 goto done;
5626
5627 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5628 goto fail;
5629
5630 on_rq = p->se.on_rq;
5631 if (on_rq)
5632 deactivate_task(rq_src, p, 0);
5633
5634 set_task_cpu(p, dest_cpu);
5635 if (on_rq) {
5636 activate_task(rq_dest, p, 0);
5637 check_preempt_curr(rq_dest, p);
5638 }
5639done:
5640 ret = 1;
5641fail:
5642 double_rq_unlock(rq_src, rq_dest);
5643 return ret;
5644}
5645
5646
5647
5648
5649
5650
/*
 * migration_thread - per-CPU kthread body that services rq->migration_queue.
 * @data: CPU number, passed as a pointer-sized integer.
 *
 * Loops until kthread_should_stop(): runs active_load_balance() when
 * rq->active_balance is set, then takes one migration_req off the queue,
 * migrates its task with __migrate_task() and completes req->done. If its
 * CPU is found offline it parks in wait_to_die until stopped. Returns 0.
 */
5651static int migration_thread(void *data)
5652{
5653 int cpu = (long)data;
5654 struct rq *rq;
5655
5656 rq = cpu_rq(cpu);
 /* this thread must be the one registered for the runqueue */
5657 BUG_ON(rq->migration_thread != current);
5658
5659 set_current_state(TASK_INTERRUPTIBLE);
5660 while (!kthread_should_stop()) {
5661 struct migration_req *req;
5662 struct list_head *head;
5663
5664 spin_lock_irq(&rq->lock);
5665
5666 if (cpu_is_offline(cpu)) {
5667 spin_unlock_irq(&rq->lock);
5668 goto wait_to_die;
5669 }
5670
5671 if (rq->active_balance) {
5672 active_load_balance(rq, cpu);
5673 rq->active_balance = 0;
5674 }
5675
5676 head = &rq->migration_queue;
5677
 /* nothing queued: sleep; the state is re-armed before looping */
5678 if (list_empty(head)) {
5679 spin_unlock_irq(&rq->lock);
5680 schedule();
5681 set_current_state(TASK_INTERRUPTIBLE);
5682 continue;
5683 }
5684 req = list_entry(head->next, struct migration_req, list);
5685 list_del_init(head->next);
5686
 /*
  * Only the lock is dropped here. Interrupts stay disabled across
  * __migrate_task() and are re-enabled explicitly afterwards.
  */
5687 spin_unlock(&rq->lock);
5688 __migrate_task(req->task, cpu, req->dest_cpu);
5689 local_irq_enable();
5690
5691 complete(&req->done);
5692 }
5693 __set_current_state(TASK_RUNNING);
5694 return 0;
5695
5696wait_to_die:
5697
 /* CPU went offline: just sleep until kthread_stop() is called */
5698 set_current_state(TASK_INTERRUPTIBLE);
5699 while (!kthread_should_stop()) {
5700 schedule();
5701 set_current_state(TASK_INTERRUPTIBLE);
5702 }
5703 __set_current_state(TASK_RUNNING);
5704 return 0;
5705}
5706
5707#ifdef CONFIG_HOTPLUG_CPU
5708
/*
 * __migrate_task_irq - run __migrate_task() with local interrupts disabled.
 * @p: task to move.
 * @src_cpu: CPU the task is expected to be on.
 * @dest_cpu: CPU to move it to.
 *
 * Interrupts are unconditionally enabled again afterwards. Returns the
 * result of __migrate_task().
 */
static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
{
	int moved;

	local_irq_disable();
	moved = __migrate_task(p, src_cpu, dest_cpu);
	local_irq_enable();

	return moved;
}
5718
5719
5720
5721
5722
5723static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5724{
5725 unsigned long flags;
5726 cpumask_t mask;
5727 struct rq *rq;
5728 int dest_cpu;
5729
5730 do {
5731
5732 mask = node_to_cpumask(cpu_to_node(dead_cpu));
5733 cpus_and(mask, mask, p->cpus_allowed);
5734 dest_cpu = any_online_cpu(mask);
5735
5736
5737 if (dest_cpu >= nr_cpu_ids)
5738 dest_cpu = any_online_cpu(p->cpus_allowed);
5739
5740
5741 if (dest_cpu >= nr_cpu_ids) {
5742 cpumask_t cpus_allowed;
5743
5744 cpuset_cpus_allowed_locked(p, &cpus_allowed);
5745
5746
5747
5748
5749
5750
5751
5752 rq = task_rq_lock(p, &flags);
5753 p->cpus_allowed = cpus_allowed;
5754 dest_cpu = any_online_cpu(p->cpus_allowed);
5755 task_rq_unlock(rq, &flags);
5756
5757
5758
5759
5760
5761
5762 if (p->mm && printk_ratelimit()) {
5763 printk(KERN_INFO "process %d (%s) no "
5764 "longer affine to cpu%d\n",
5765 task_pid_nr(p), p->comm, dead_cpu);
5766 }
5767 }
5768 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
5769}
5770
5771
5772
5773
5774
5775
5776
5777
5778static void migrate_nr_uninterruptible(struct rq *rq_src)
5779{
5780 struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
5781 unsigned long flags;
5782
5783 local_irq_save(flags);
5784 double_rq_lock(rq_src, rq_dest);
5785 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5786 rq_src->nr_uninterruptible = 0;
5787 double_rq_unlock(rq_src, rq_dest);
5788 local_irq_restore(flags);
5789}
5790
5791
5792static void migrate_live_tasks(int src_cpu)
5793{
5794 struct task_struct *p, *t;
5795
5796 read_lock(&tasklist_lock);
5797
5798 do_each_thread(t, p) {
5799 if (p == current)
5800 continue;
5801
5802 if (task_cpu(p) == src_cpu)
5803 move_task_off_dead_cpu(src_cpu, p);
5804 } while_each_thread(t, p);
5805
5806 read_unlock(&tasklist_lock);
5807}
5808
5809
5810
5811
5812
5813
5814void sched_idle_next(void)
5815{
5816 int this_cpu = smp_processor_id();
5817 struct rq *rq = cpu_rq(this_cpu);
5818 struct task_struct *p = rq->idle;
5819 unsigned long flags;
5820
5821
5822 BUG_ON(cpu_online(this_cpu));
5823
5824
5825
5826
5827
5828 spin_lock_irqsave(&rq->lock, flags);
5829
5830 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5831
5832 update_rq_clock(rq);
5833 activate_task(rq, p, 0);
5834
5835 spin_unlock_irqrestore(&rq->lock, flags);
5836}
5837
5838
5839
5840
5841
5842void idle_task_exit(void)
5843{
5844 struct mm_struct *mm = current->active_mm;
5845
5846 BUG_ON(cpu_online(smp_processor_id()));
5847
5848 if (mm != &init_mm)
5849 switch_mm(mm, &init_mm, current);
5850 mmdrop(mm);
5851}
5852
5853
5854static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5855{
5856 struct rq *rq = cpu_rq(dead_cpu);
5857
5858
5859 BUG_ON(!p->exit_state);
5860
5861
5862 BUG_ON(p->state == TASK_DEAD);
5863
5864 get_task_struct(p);
5865
5866
5867
5868
5869
5870
5871 spin_unlock_irq(&rq->lock);
5872 move_task_off_dead_cpu(dead_cpu, p);
5873 spin_lock_irq(&rq->lock);
5874
5875 put_task_struct(p);
5876}
5877
5878
5879static void migrate_dead_tasks(unsigned int dead_cpu)
5880{
5881 struct rq *rq = cpu_rq(dead_cpu);
5882 struct task_struct *next;
5883
5884 for ( ; ; ) {
5885 if (!rq->nr_running)
5886 break;
5887 update_rq_clock(rq);
5888 next = pick_next_task(rq, rq->curr);
5889 if (!next)
5890 break;
5891 next->sched_class->put_prev_task(rq, next);
5892 migrate_dead(dead_cpu, next);
5893
5894 }
5895}
5896#endif
5897
5898#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5899
/*
 * "sched_domain" sysctl directory (mode 0555). Its .child pointer is
 * filled in by register_sched_domain_sysctl() and freed again by
 * unregister_sched_domain_sysctl(). The zeroed second entry ends the table.
 */
5900static struct ctl_table sd_ctl_dir[] = {
5901 {
5902 .procname = "sched_domain",
5903 .mode = 0555,
5904 },
5905 {0, },
5906};
5907
/*
 * Root of the table handed to register_sysctl_table(): a "kernel"
 * directory (CTL_KERN, mode 0555) whose only child is sd_ctl_dir. The
 * zeroed second entry ends the table.
 */
5908static struct ctl_table sd_ctl_root[] = {
5909 {
5910 .ctl_name = CTL_KERN,
5911 .procname = "kernel",
5912 .mode = 0555,
5913 .child = sd_ctl_dir,
5914 },
5915 {0, },
5916};
5917
5918static struct ctl_table *sd_alloc_ctl_entry(int n)
5919{
5920 struct ctl_table *entry =
5921 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5922
5923 return entry;
5924}
5925
5926static void sd_free_ctl_entry(struct ctl_table **tablep)
5927{
5928 struct ctl_table *entry;
5929
5930
5931
5932
5933
5934
5935
5936 for (entry = *tablep; entry->mode; entry++) {
5937 if (entry->child)
5938 sd_free_ctl_entry(&entry->child);
5939 if (entry->proc_handler == NULL)
5940 kfree(entry->procname);
5941 }
5942
5943 kfree(*tablep);
5944 *tablep = NULL;
5945}
5946
5947static void
5948set_table_entry(struct ctl_table *entry,
5949 const char *procname, void *data, int maxlen,
5950 mode_t mode, proc_handler *proc_handler)
5951{
5952 entry->procname = procname;
5953 entry->data = data;
5954 entry->maxlen = maxlen;
5955 entry->mode = mode;
5956 entry->proc_handler = proc_handler;
5957}
5958
5959static struct ctl_table *
5960sd_alloc_ctl_domain_table(struct sched_domain *sd)
5961{
5962 struct ctl_table *table = sd_alloc_ctl_entry(12);
5963
5964 if (table == NULL)
5965 return NULL;
5966
5967 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5968 sizeof(long), 0644, proc_doulongvec_minmax);
5969 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5970 sizeof(long), 0644, proc_doulongvec_minmax);
5971 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5972 sizeof(int), 0644, proc_dointvec_minmax);
5973 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5974 sizeof(int), 0644, proc_dointvec_minmax);
5975 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5976 sizeof(int), 0644, proc_dointvec_minmax);
5977 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5978 sizeof(int), 0644, proc_dointvec_minmax);
5979 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5980 sizeof(int), 0644, proc_dointvec_minmax);
5981 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5982 sizeof(int), 0644, proc_dointvec_minmax);
5983 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5984 sizeof(int), 0644, proc_dointvec_minmax);
5985 set_table_entry(&table[9], "cache_nice_tries",
5986 &sd->cache_nice_tries,
5987 sizeof(int), 0644, proc_dointvec_minmax);
5988 set_table_entry(&table[10], "flags", &sd->flags,
5989 sizeof(int), 0644, proc_dointvec_minmax);
5990
5991
5992 return table;
5993}
5994
5995static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5996{
5997 struct ctl_table *entry, *table;
5998 struct sched_domain *sd;
5999 int domain_num = 0, i;
6000 char buf[32];
6001
6002 for_each_domain(cpu, sd)
6003 domain_num++;
6004 entry = table = sd_alloc_ctl_entry(domain_num + 1);
6005 if (table == NULL)
6006 return NULL;
6007
6008 i = 0;
6009 for_each_domain(cpu, sd) {
6010 snprintf(buf, 32, "domain%d", i);
6011 entry->procname = kstrdup(buf, GFP_KERNEL);
6012 entry->mode = 0555;
6013 entry->child = sd_alloc_ctl_domain_table(sd);
6014 entry++;
6015 i++;
6016 }
6017 return table;
6018}
6019
6020static struct ctl_table_header *sd_sysctl_header;
6021static void register_sched_domain_sysctl(void)
6022{
6023 int i, cpu_num = num_online_cpus();
6024 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
6025 char buf[32];
6026
6027 WARN_ON(sd_ctl_dir[0].child);
6028 sd_ctl_dir[0].child = entry;
6029
6030 if (entry == NULL)
6031 return;
6032
6033 for_each_online_cpu(i) {
6034 snprintf(buf, 32, "cpu%d", i);
6035 entry->procname = kstrdup(buf, GFP_KERNEL);
6036 entry->mode = 0555;
6037 entry->child = sd_alloc_ctl_cpu_table(i);
6038 entry++;
6039 }
6040
6041 WARN_ON(sd_sysctl_header);
6042 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
6043}
6044
6045
6046static void unregister_sched_domain_sysctl(void)
6047{
6048 if (sd_sysctl_header)
6049 unregister_sysctl_table(sd_sysctl_header);
6050 sd_sysctl_header = NULL;
6051 if (sd_ctl_dir[0].child)
6052 sd_free_ctl_entry(&sd_ctl_dir[0].child);
6053}
6054#else
/*
 * No-op stand-ins used when CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are not
 * both enabled, so callers need no conditionals of their own.
 */
6055static void register_sched_domain_sysctl(void)
6056{
6057}
6058static void unregister_sched_domain_sysctl(void)
6059{
6060}
6061#endif
6062
6063
6064
6065
6066
/*
 * migration_call - CPU hotplug notifier callback for the scheduler.
 * @nfb: notifier block (not referenced by the body).
 * @action: hotplug event being delivered.
 * @hcpu: CPU number, passed as a pointer-sized integer.
 *
 * Creates, wakes and stops the per-CPU migration thread as the CPU comes
 * and goes, evacuates tasks from a dead CPU, and keeps the root domain's
 * online mask in step. Returns NOTIFY_BAD if the migration thread cannot
 * be created, NOTIFY_OK otherwise (including for unhandled actions).
 */
6067static int __cpuinit
6068migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6069{
6070 struct task_struct *p;
6071 int cpu = (long)hcpu;
6072 unsigned long flags;
6073 struct rq *rq;
6074
6075 switch (action) {
6076
 /* create the migration thread, bind it and make it top-priority FIFO */
6077 case CPU_UP_PREPARE:
6078 case CPU_UP_PREPARE_FROZEN:
6079 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
6080 if (IS_ERR(p))
6081 return NOTIFY_BAD;
6082 kthread_bind(p, cpu);
6083
6084 rq = task_rq_lock(p, &flags);
6085 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6086 task_rq_unlock(rq, &flags);
6087 cpu_rq(cpu)->migration_thread = p;
6088 break;
6089
6090 case CPU_ONLINE:
6091 case CPU_ONLINE_FROZEN:
6092
 /* start the thread created in CPU_UP_PREPARE */
6093 wake_up_process(cpu_rq(cpu)->migration_thread);
6094
6095
 /* mark the CPU online in its root domain, if it has one */
6096 rq = cpu_rq(cpu);
6097 spin_lock_irqsave(&rq->lock, flags);
6098 if (rq->rd) {
6099 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6100 cpu_set(cpu, rq->rd->online);
6101 }
6102 spin_unlock_irqrestore(&rq->lock, flags);
6103 break;
6104
6105#ifdef CONFIG_HOTPLUG_CPU
6106 case CPU_UP_CANCELED:
6107 case CPU_UP_CANCELED_FROZEN:
6108 if (!cpu_rq(cpu)->migration_thread)
6109 break;
6110
 /* rebind the thread to some online CPU before stopping it */
6111 kthread_bind(cpu_rq(cpu)->migration_thread,
6112 any_online_cpu(cpu_online_map));
6113 kthread_stop(cpu_rq(cpu)->migration_thread);
6114 cpu_rq(cpu)->migration_thread = NULL;
6115 break;
6116
6117 case CPU_DEAD:
6118 case CPU_DEAD_FROZEN:
 /* live tasks are moved away under cpuset_lock() */
6119 cpuset_lock();
6120 migrate_live_tasks(cpu);
6121 rq = cpu_rq(cpu);
6122 kthread_stop(rq->migration_thread);
6123 rq->migration_thread = NULL;
6124
 /*
  * Dequeue the idle task and put it back into the idle class at
  * MAX_PRIO, then migrate whatever is left on the runqueue.
  */
6125 spin_lock_irq(&rq->lock);
6126 update_rq_clock(rq);
6127 deactivate_task(rq, rq->idle, 0);
6128 rq->idle->static_prio = MAX_PRIO;
6129 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6130 rq->idle->sched_class = &idle_sched_class;
6131 migrate_dead_tasks(cpu);
6132 spin_unlock_irq(&rq->lock);
6133 cpuset_unlock();
6134 migrate_nr_uninterruptible(rq);
6135 BUG_ON(rq->nr_running != 0);
6136
6137
6138
6139
6140
6141
 /*
  * The migration thread is gone, so complete every request still queued;
  * otherwise its waiter would never be woken.
  */
6142 spin_lock_irq(&rq->lock);
6143 while (!list_empty(&rq->migration_queue)) {
6144 struct migration_req *req;
6145
6146 req = list_entry(rq->migration_queue.next,
6147 struct migration_req, list);
6148 list_del_init(&req->list);
6149 complete(&req->done);
6150 }
6151 spin_unlock_irq(&rq->lock);
6152 break;
6153
6154 case CPU_DYING:
6155 case CPU_DYING_FROZEN:
6156
 /* clear the CPU from its root domain's online mask, if it has one */
6157 rq = cpu_rq(cpu);
6158 spin_lock_irqsave(&rq->lock, flags);
6159 if (rq->rd) {
6160 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6161 cpu_clear(cpu, rq->rd->online);
6162 }
6163 spin_unlock_irqrestore(&rq->lock, flags);
6164 break;
6165#endif
6166 }
6167 return NOTIFY_OK;
6168}
6169
6170
6171
6172
/*
 * Hotplug notifier block that routes CPU events to migration_call(),
 * registered with priority 10 by migration_init().
 */
6173static struct notifier_block __cpuinitdata migration_notifier = {
6174 .notifier_call = migration_call,
6175 .priority = 10
6176};
6177
6178void __init migration_init(void)
6179{
6180 void *cpu = (void *)(long)smp_processor_id();
6181 int err;
6182
6183
6184 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6185 BUG_ON(err == NOTIFY_BAD);
6186 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6187 register_cpu_notifier(&migration_notifier);
6188}
6189#endif
6190
6191#ifdef CONFIG_SMP
6192
6193#ifdef CONFIG_SCHED_DEBUG
6194
6195static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6196 cpumask_t *groupmask)
6197{
6198 struct sched_group *group = sd->groups;
6199 char str[256];
6200
6201 cpulist_scnprintf(str, sizeof(str), sd->span);
6202 cpus_clear(*groupmask);
6203
6204 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6205
6206 if (!(sd->flags & SD_LOAD_BALANCE)) {
6207 printk("does not load-balance\n");
6208 if (sd->parent)
6209 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
6210 " has parent");
6211 return -1;
6212 }
6213
6214 printk(KERN_CONT "span %s\n", str);
6215
6216 if (!cpu_isset(cpu, sd->span)) {
6217 printk(KERN_ERR "ERROR: domain->span does not contain "
6218 "CPU%d\n", cpu);
6219 }
6220 if (!cpu_isset(cpu, group->cpumask)) {
6221 printk(KERN_ERR "ERROR: domain->groups does not contain"
6222 " CPU%d\n", cpu);
6223 }
6224
6225 printk(KERN_DEBUG "%*s groups:", level + 1, "");
6226 do {
6227 if (!group) {
6228 printk("\n");
6229 printk(KERN_ERR "ERROR: group is NULL\n");
6230 break;
6231 }
6232
6233 if (!group->__cpu_power) {
6234 printk(KERN_CONT "\n");
6235 printk(KERN_ERR "ERROR: domain->cpu_power not "
6236 "set\n");
6237 break;
6238 }
6239
6240 if (!cpus_weight(group->cpumask)) {
6241 printk(KERN_CONT "\n");
6242 printk(KERN_ERR "ERROR: empty group\n");
6243 break;
6244 }
6245
6246 if (cpus_intersects(*groupmask, group->cpumask)) {
6247 printk(KERN_CONT "\n");
6248 printk(KERN_ERR "ERROR: repeated CPUs\n");
6249 break;
6250 }
6251
6252 cpus_or(*groupmask, *groupmask, group->cpumask);
6253
6254 cpulist_scnprintf(str, sizeof(str), group->cpumask);
6255 printk(KERN_CONT " %s", str);
6256
6257 group = group->next;
6258 } while (group != sd->groups);
6259 printk(KERN_CONT "\n");
6260
6261 if (!cpus_equal(sd->span, *groupmask))
6262 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6263
6264 if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
6265 printk(KERN_ERR "ERROR: parent span is not a superset "
6266 "of domain->span\n");
6267 return 0;
6268}
6269
6270static void sched_domain_debug(struct sched_domain *sd, int cpu)
6271{
6272 cpumask_t *groupmask;
6273 int level = 0;
6274
6275 if (!sd) {
6276 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6277 return;
6278 }
6279
6280 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6281
6282 groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
6283 if (!groupmask) {
6284 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6285 return;
6286 }
6287
6288 for (;;) {
6289 if (sched_domain_debug_one(sd, cpu, level, groupmask))
6290 break;
6291 level++;
6292 sd = sd->parent;
6293 if (!sd)
6294 break;
6295 }
6296 kfree(groupmask);
6297}
6298#else
/* No-op stand-in used when the debug implementation above is compiled out. */
# define sched_domain_debug(sd, cpu) do { } while (0)
6300#endif
6301
6302static int sd_degenerate(struct sched_domain *sd)
6303{
6304 if (cpus_weight(sd->span) == 1)
6305 return 1;
6306
6307
6308 if (sd->flags & (SD_LOAD_BALANCE |
6309 SD_BALANCE_NEWIDLE |
6310 SD_BALANCE_FORK |
6311 SD_BALANCE_EXEC |
6312 SD_SHARE_CPUPOWER |
6313 SD_SHARE_PKG_RESOURCES)) {
6314 if (sd->groups != sd->groups->next)
6315 return 0;
6316 }
6317
6318
6319 if (sd->flags & (SD_WAKE_IDLE |
6320 SD_WAKE_AFFINE |
6321 SD_WAKE_BALANCE))
6322 return 0;
6323
6324 return 1;
6325}
6326
6327static int
6328sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6329{
6330 unsigned long cflags = sd->flags, pflags = parent->flags;
6331
6332 if (sd_degenerate(parent))
6333 return 1;
6334
6335 if (!cpus_equal(sd->span, parent->span))
6336 return 0;
6337
6338
6339
6340 if (cflags & SD_WAKE_AFFINE)
6341 pflags &= ~SD_WAKE_BALANCE;
6342
6343 if (parent->groups == parent->groups->next) {
6344 pflags &= ~(SD_LOAD_BALANCE |
6345 SD_BALANCE_NEWIDLE |
6346 SD_BALANCE_FORK |
6347 SD_BALANCE_EXEC |
6348 SD_SHARE_CPUPOWER |
6349 SD_SHARE_PKG_RESOURCES);
6350 }
6351 if (~cflags & pflags)
6352 return 0;
6353
6354 return 1;
6355}
6356
6357static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6358{
6359 unsigned long flags;
6360 const struct sched_class *class;
6361
6362 spin_lock_irqsave(&rq->lock, flags);
6363
6364 if (rq->rd) {
6365 struct root_domain *old_rd = rq->rd;
6366
6367 for (class = sched_class_highest; class; class = class->next) {
6368 if (class->leave_domain)
6369 class->leave_domain(rq);
6370 }
6371
6372 cpu_clear(rq->cpu, old_rd->span);
6373 cpu_clear(rq->cpu, old_rd->online);
6374
6375 if (atomic_dec_and_test(&old_rd->refcount))
6376 kfree(old_rd);
6377 }
6378
6379 atomic_inc(&rd->refcount);
6380 rq->rd = rd;
6381
6382 cpu_set(rq->cpu, rd->span);
6383 if (cpu_isset(rq->cpu, cpu_online_map))
6384 cpu_set(rq->cpu, rd->online);
6385
6386 for (class = sched_class_highest; class; class = class->next) {
6387 if (class->join_domain)
6388 class->join_domain(rq);
6389 }
6390
6391 spin_unlock_irqrestore(&rq->lock, flags);
6392}
6393
6394static void init_rootdomain(struct root_domain *rd)
6395{
6396 memset(rd, 0, sizeof(*rd));
6397
6398 cpus_clear(rd->span);
6399 cpus_clear(rd->online);
6400}
6401
6402static void init_defrootdomain(void)
6403{
6404 init_rootdomain(&def_root_domain);
6405 atomic_set(&def_root_domain.refcount, 1);
6406}
6407
6408static struct root_domain *alloc_rootdomain(void)
6409{
6410 struct root_domain *rd;
6411
6412 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6413 if (!rd)
6414 return NULL;
6415
6416 init_rootdomain(rd);
6417
6418 return rd;
6419}
6420
6421
6422
6423
6424
6425static void
6426cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6427{
6428 struct rq *rq = cpu_rq(cpu);
6429 struct sched_domain *tmp;
6430
6431
6432 for (tmp = sd; tmp; tmp = tmp->parent) {
6433 struct sched_domain *parent = tmp->parent;
6434 if (!parent)
6435 break;
6436 if (sd_parent_degenerate(tmp, parent)) {
6437 tmp->parent = parent->parent;
6438 if (parent->parent)
6439 parent->parent->child = tmp;
6440 }
6441 }
6442
6443 if (sd && sd_degenerate(sd)) {
6444 sd = sd->parent;
6445 if (sd)
6446 sd->child = NULL;
6447 }
6448
6449 sched_domain_debug(sd, cpu);
6450
6451 rq_attach_root(rq, rd);
6452 rcu_assign_pointer(rq->sd, sd);
6453}
6454
6455
6456static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
6457
6458
6459static int __init isolated_cpu_setup(char *str)
6460{
6461 int ints[NR_CPUS], i;
6462
6463 str = get_options(str, ARRAY_SIZE(ints), ints);
6464 cpus_clear(cpu_isolated_map);
6465 for (i = 1; i <= ints[0]; i++)
6466 if (ints[i] < NR_CPUS)
6467 cpu_set(ints[i], cpu_isolated_map);
6468 return 1;
6469}
6470
6471__setup("isolcpus=", isolated_cpu_setup);
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483static void
6484init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
6485 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
6486 struct sched_group **sg,
6487 cpumask_t *tmpmask),
6488 cpumask_t *covered, cpumask_t *tmpmask)
6489{
6490 struct sched_group *first = NULL, *last = NULL;
6491 int i;
6492
6493 cpus_clear(*covered);
6494
6495 for_each_cpu_mask(i, *span) {
6496 struct sched_group *sg;
6497 int group = group_fn(i, cpu_map, &sg, tmpmask);
6498 int j;
6499
6500 if (cpu_isset(i, *covered))
6501 continue;
6502
6503 cpus_clear(sg->cpumask);
6504 sg->__cpu_power = 0;
6505
6506 for_each_cpu_mask(j, *span) {
6507 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6508 continue;
6509
6510 cpu_set(j, *covered);
6511 cpu_set(j, sg->cpumask);
6512 }
6513 if (!first)
6514 first = sg;
6515 if (last)
6516 last->next = sg;
6517 last = sg;
6518 }
6519 last->next = first;
6520}
6521
/*
 * Number of nodes (the starting node included) that
 * sched_domain_node_span() gathers into one node domain.
 */
#define SD_NODES_PER_DOMAIN 16
6523
6524#ifdef CONFIG_NUMA
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536static int find_next_best_node(int node, nodemask_t *used_nodes)
6537{
6538 int i, n, val, min_val, best_node = 0;
6539
6540 min_val = INT_MAX;
6541
6542 for (i = 0; i < MAX_NUMNODES; i++) {
6543
6544 n = (node + i) % MAX_NUMNODES;
6545
6546 if (!nr_cpus_node(n))
6547 continue;
6548
6549
6550 if (node_isset(n, *used_nodes))
6551 continue;
6552
6553
6554 val = node_distance(node, n);
6555
6556 if (val < min_val) {
6557 min_val = val;
6558 best_node = n;
6559 }
6560 }
6561
6562 node_set(best_node, *used_nodes);
6563 return best_node;
6564}
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575static void sched_domain_node_span(int node, cpumask_t *span)
6576{
6577 nodemask_t used_nodes;
6578 node_to_cpumask_ptr(nodemask, node);
6579 int i;
6580
6581 cpus_clear(*span);
6582 nodes_clear(used_nodes);
6583
6584 cpus_or(*span, *span, *nodemask);
6585 node_set(node, used_nodes);
6586
6587 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6588 int next_node = find_next_best_node(node, &used_nodes);
6589
6590 node_to_cpumask_ptr_next(nodemask, next_node);
6591 cpus_or(*span, *span, *nodemask);
6592 }
6593}
6594#endif
6595
/*
 * Power-savings switches, both off by default.
 * sched_power_savings_store() sets them to 0 or 1.
 */
int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6597
6598
6599
6600
6601#ifdef CONFIG_SCHED_SMT
6602static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6603static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
6604
6605static int
6606cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6607 cpumask_t *unused)
6608{
6609 if (sg)
6610 *sg = &per_cpu(sched_group_cpus, cpu);
6611 return cpu;
6612}
6613#endif
6614
6615
6616
6617
6618#ifdef CONFIG_SCHED_MC
/* Per-CPU storage for the multi-core (MC) level domains and groups. */
static DEFINE_PER_CPU(struct sched_domain, core_domains);
static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6621#endif
6622
6623#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6624static int
6625cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6626 cpumask_t *mask)
6627{
6628 int group;
6629
6630 *mask = per_cpu(cpu_sibling_map, cpu);
6631 cpus_and(*mask, *mask, *cpu_map);
6632 group = first_cpu(*mask);
6633 if (sg)
6634 *sg = &per_cpu(sched_group_core, group);
6635 return group;
6636}
6637#elif defined(CONFIG_SCHED_MC)
6638static int
6639cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6640 cpumask_t *unused)
6641{
6642 if (sg)
6643 *sg = &per_cpu(sched_group_core, cpu);
6644 return cpu;
6645}
6646#endif
6647
6648static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6649static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
6650
6651static int
6652cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6653 cpumask_t *mask)
6654{
6655 int group;
6656#ifdef CONFIG_SCHED_MC
6657 *mask = cpu_coregroup_map(cpu);
6658 cpus_and(*mask, *mask, *cpu_map);
6659 group = first_cpu(*mask);
6660#elif defined(CONFIG_SCHED_SMT)
6661 *mask = per_cpu(cpu_sibling_map, cpu);
6662 cpus_and(*mask, *mask, *cpu_map);
6663 group = first_cpu(*mask);
6664#else
6665 group = cpu;
6666#endif
6667 if (sg)
6668 *sg = &per_cpu(sched_group_phys, group);
6669 return group;
6670}
6671
6672#ifdef CONFIG_NUMA
6673
6674
6675
6676
6677
6678static DEFINE_PER_CPU(struct sched_domain, node_domains);
6679static struct sched_group ***sched_group_nodes_bycpu;
6680
6681static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6682static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
6683
6684static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
6685 struct sched_group **sg, cpumask_t *nodemask)
6686{
6687 int group;
6688
6689 *nodemask = node_to_cpumask(cpu_to_node(cpu));
6690 cpus_and(*nodemask, *nodemask, *cpu_map);
6691 group = first_cpu(*nodemask);
6692
6693 if (sg)
6694 *sg = &per_cpu(sched_group_allnodes, group);
6695 return group;
6696}
6697
6698static void init_numa_sched_groups_power(struct sched_group *group_head)
6699{
6700 struct sched_group *sg = group_head;
6701 int j;
6702
6703 if (!sg)
6704 return;
6705 do {
6706 for_each_cpu_mask(j, sg->cpumask) {
6707 struct sched_domain *sd;
6708
6709 sd = &per_cpu(phys_domains, j);
6710 if (j != first_cpu(sd->groups->cpumask)) {
6711
6712
6713
6714
6715 continue;
6716 }
6717
6718 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
6719 }
6720 sg = sg->next;
6721 } while (sg != group_head);
6722}
6723#endif
6724
6725#ifdef CONFIG_NUMA
6726
6727static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6728{
6729 int cpu, i;
6730
6731 for_each_cpu_mask(cpu, *cpu_map) {
6732 struct sched_group **sched_group_nodes
6733 = sched_group_nodes_bycpu[cpu];
6734
6735 if (!sched_group_nodes)
6736 continue;
6737
6738 for (i = 0; i < MAX_NUMNODES; i++) {
6739 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6740
6741 *nodemask = node_to_cpumask(i);
6742 cpus_and(*nodemask, *nodemask, *cpu_map);
6743 if (cpus_empty(*nodemask))
6744 continue;
6745
6746 if (sg == NULL)
6747 continue;
6748 sg = sg->next;
6749next_sg:
6750 oldsg = sg;
6751 sg = sg->next;
6752 kfree(oldsg);
6753 if (oldsg != sched_group_nodes[i])
6754 goto next_sg;
6755 }
6756 kfree(sched_group_nodes);
6757 sched_group_nodes_bycpu[cpu] = NULL;
6758 }
6759}
6760#else
/* Without CONFIG_NUMA there are no allocated node groups to free. */
static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
{
}
6764#endif
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6781{
6782 struct sched_domain *child;
6783 struct sched_group *group;
6784
6785 WARN_ON(!sd || !sd->groups);
6786
6787 if (cpu != first_cpu(sd->groups->cpumask))
6788 return;
6789
6790 child = sd->child;
6791
6792 sd->groups->__cpu_power = 0;
6793
6794
6795
6796
6797
6798
6799
6800
6801 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
6802 (child->flags &
6803 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
6804 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
6805 return;
6806 }
6807
6808
6809
6810
6811 group = child->groups;
6812 do {
6813 sg_inc_cpu_power(sd->groups, group->__cpu_power);
6814 group = group->next;
6815 } while (group != child->groups);
6816}
6817
6818
6819
6820
6821
6822
/*
 * SD_INIT(sd, type) calls the per-type initialiser sd_init_<type>().
 *
 * SD_INIT_FUNC(type) generates that initialiser as a noinline function: it
 * zeroes *sd, assigns the SD_<type>_INIT template to it and sets sd->level
 * to SD_LV_<type>.
 */
#define SD_INIT(sd, type) sd_init_##type(sd)
#define SD_INIT_FUNC(type) \
static noinline void sd_init_##type(struct sched_domain *sd) \
{ \
 memset(sd, 0, sizeof(*sd)); \
 *sd = SD_##type##_INIT; \
 sd->level = SD_LV_##type; \
}

/* Instantiate one initialiser per configured domain level. */
SD_INIT_FUNC(CPU)
#ifdef CONFIG_NUMA
 SD_INIT_FUNC(ALLNODES)
 SD_INIT_FUNC(NODE)
#endif
#ifdef CONFIG_SCHED_SMT
 SD_INIT_FUNC(SIBLING)
#endif
#ifdef CONFIG_SCHED_MC
 SD_INIT_FUNC(MC)
#endif
6843
6844
6845
6846
6847
6848
/*
 * Scratch cpumasks used while building sched domains.  Individual members
 * are reached through SCHED_CPUMASK_VAR(); tmpmask is the first member, so
 * a pointer to the struct also serves as a pointer to it.
 */
struct allmasks {
 cpumask_t tmpmask;
 /* These three masks share storage. */
 union {
 cpumask_t nodemask;
 cpumask_t this_sibling_map;
 cpumask_t this_core_map;
 };
 cpumask_t send_covered;

#ifdef CONFIG_NUMA
 cpumask_t domainspan;
 cpumask_t covered;
 cpumask_t notcovered;
#endif
};
6864
/*
 * Storage policy for struct allmasks.  With NR_CPUS > 128,
 * SCHED_CPUMASK_DECLARE() only declares a pointer (the user must allocate
 * it when SCHED_CPUMASK_ALLOC is set) and SCHED_CPUMASK_FREE() kfree()s it.
 * Otherwise the struct is a local variable and freeing is a no-op.
 */
#if NR_CPUS > 128
#define SCHED_CPUMASK_ALLOC 1
#define SCHED_CPUMASK_FREE(v) kfree(v)
#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
#else
#define SCHED_CPUMASK_ALLOC 0
#define SCHED_CPUMASK_FREE(v)
#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
#endif

/*
 * Declare a local 'cpumask_t *v' pointing at member 'v' of the
 * struct allmasks located at 'a'.
 */
#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
 ((unsigned long)(a) + offsetof(struct allmasks, v))
6877
6878static int default_relax_domain_level = -1;
6879
6880static int __init setup_relax_domain_level(char *str)
6881{
6882 unsigned long val;
6883
6884 val = simple_strtoul(str, NULL, 0);
6885 if (val < SD_LV_MAX)
6886 default_relax_domain_level = val;
6887
6888 return 1;
6889}
6890__setup("relax_domain_level=", setup_relax_domain_level);
6891
6892static void set_domain_attribute(struct sched_domain *sd,
6893 struct sched_domain_attr *attr)
6894{
6895 int request;
6896
6897 if (!attr || attr->relax_domain_level < 0) {
6898 if (default_relax_domain_level < 0)
6899 return;
6900 else
6901 request = default_relax_domain_level;
6902 } else
6903 request = attr->relax_domain_level;
6904 if (request < sd->level) {
6905
6906 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
6907 } else {
6908
6909 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
6910 }
6911}
6912
6913
6914
6915
6916
6917static int __build_sched_domains(const cpumask_t *cpu_map,
6918 struct sched_domain_attr *attr)
6919{
6920 int i;
6921 struct root_domain *rd;
6922 SCHED_CPUMASK_DECLARE(allmasks);
6923 cpumask_t *tmpmask;
6924#ifdef CONFIG_NUMA
6925 struct sched_group **sched_group_nodes = NULL;
6926 int sd_allnodes = 0;
6927
6928
6929
6930
6931 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
6932 GFP_KERNEL);
6933 if (!sched_group_nodes) {
6934 printk(KERN_WARNING "Can not alloc sched group node list\n");
6935 return -ENOMEM;
6936 }
6937#endif
6938
6939 rd = alloc_rootdomain();
6940 if (!rd) {
6941 printk(KERN_WARNING "Cannot alloc root domain\n");
6942#ifdef CONFIG_NUMA
6943 kfree(sched_group_nodes);
6944#endif
6945 return -ENOMEM;
6946 }
6947
6948#if SCHED_CPUMASK_ALLOC
6949
6950 allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
6951 if (!allmasks) {
6952 printk(KERN_WARNING "Cannot alloc cpumask array\n");
6953 kfree(rd);
6954#ifdef CONFIG_NUMA
6955 kfree(sched_group_nodes);
6956#endif
6957 return -ENOMEM;
6958 }
6959#endif
6960 tmpmask = (cpumask_t *)allmasks;
6961
6962
6963#ifdef CONFIG_NUMA
6964 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
6965#endif
6966
6967
6968
6969
6970 for_each_cpu_mask(i, *cpu_map) {
6971 struct sched_domain *sd = NULL, *p;
6972 SCHED_CPUMASK_VAR(nodemask, allmasks);
6973
6974 *nodemask = node_to_cpumask(cpu_to_node(i));
6975 cpus_and(*nodemask, *nodemask, *cpu_map);
6976
6977#ifdef CONFIG_NUMA
6978 if (cpus_weight(*cpu_map) >
6979 SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
6980 sd = &per_cpu(allnodes_domains, i);
6981 SD_INIT(sd, ALLNODES);
6982 set_domain_attribute(sd, attr);
6983 sd->span = *cpu_map;
6984 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
6985 p = sd;
6986 sd_allnodes = 1;
6987 } else
6988 p = NULL;
6989
6990 sd = &per_cpu(node_domains, i);
6991 SD_INIT(sd, NODE);
6992 set_domain_attribute(sd, attr);
6993 sched_domain_node_span(cpu_to_node(i), &sd->span);
6994 sd->parent = p;
6995 if (p)
6996 p->child = sd;
6997 cpus_and(sd->span, sd->span, *cpu_map);
6998#endif
6999
7000 p = sd;
7001 sd = &per_cpu(phys_domains, i);
7002 SD_INIT(sd, CPU);
7003 set_domain_attribute(sd, attr);
7004 sd->span = *nodemask;
7005 sd->parent = p;
7006 if (p)
7007 p->child = sd;
7008 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
7009
7010#ifdef CONFIG_SCHED_MC
7011 p = sd;
7012 sd = &per_cpu(core_domains, i);
7013 SD_INIT(sd, MC);
7014 set_domain_attribute(sd, attr);
7015 sd->span = cpu_coregroup_map(i);
7016 cpus_and(sd->span, sd->span, *cpu_map);
7017 sd->parent = p;
7018 p->child = sd;
7019 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
7020#endif
7021
7022#ifdef CONFIG_SCHED_SMT
7023 p = sd;
7024 sd = &per_cpu(cpu_domains, i);
7025 SD_INIT(sd, SIBLING);
7026 set_domain_attribute(sd, attr);
7027 sd->span = per_cpu(cpu_sibling_map, i);
7028 cpus_and(sd->span, sd->span, *cpu_map);
7029 sd->parent = p;
7030 p->child = sd;
7031 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
7032#endif
7033 }
7034
7035#ifdef CONFIG_SCHED_SMT
7036
7037 for_each_cpu_mask(i, *cpu_map) {
7038 SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
7039 SCHED_CPUMASK_VAR(send_covered, allmasks);
7040
7041 *this_sibling_map = per_cpu(cpu_sibling_map, i);
7042 cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
7043 if (i != first_cpu(*this_sibling_map))
7044 continue;
7045
7046 init_sched_build_groups(this_sibling_map, cpu_map,
7047 &cpu_to_cpu_group,
7048 send_covered, tmpmask);
7049 }
7050#endif
7051
7052#ifdef CONFIG_SCHED_MC
7053
7054 for_each_cpu_mask(i, *cpu_map) {
7055 SCHED_CPUMASK_VAR(this_core_map, allmasks);
7056 SCHED_CPUMASK_VAR(send_covered, allmasks);
7057
7058 *this_core_map = cpu_coregroup_map(i);
7059 cpus_and(*this_core_map, *this_core_map, *cpu_map);
7060 if (i != first_cpu(*this_core_map))
7061 continue;
7062
7063 init_sched_build_groups(this_core_map, cpu_map,
7064 &cpu_to_core_group,
7065 send_covered, tmpmask);
7066 }
7067#endif
7068
7069
7070 for (i = 0; i < MAX_NUMNODES; i++) {
7071 SCHED_CPUMASK_VAR(nodemask, allmasks);
7072 SCHED_CPUMASK_VAR(send_covered, allmasks);
7073
7074 *nodemask = node_to_cpumask(i);
7075 cpus_and(*nodemask, *nodemask, *cpu_map);
7076 if (cpus_empty(*nodemask))
7077 continue;
7078
7079 init_sched_build_groups(nodemask, cpu_map,
7080 &cpu_to_phys_group,
7081 send_covered, tmpmask);
7082 }
7083
7084#ifdef CONFIG_NUMA
7085
7086 if (sd_allnodes) {
7087 SCHED_CPUMASK_VAR(send_covered, allmasks);
7088
7089 init_sched_build_groups(cpu_map, cpu_map,
7090 &cpu_to_allnodes_group,
7091 send_covered, tmpmask);
7092 }
7093
7094 for (i = 0; i < MAX_NUMNODES; i++) {
7095
7096 struct sched_group *sg, *prev;
7097 SCHED_CPUMASK_VAR(nodemask, allmasks);
7098 SCHED_CPUMASK_VAR(domainspan, allmasks);
7099 SCHED_CPUMASK_VAR(covered, allmasks);
7100 int j;
7101
7102 *nodemask = node_to_cpumask(i);
7103 cpus_clear(*covered);
7104
7105 cpus_and(*nodemask, *nodemask, *cpu_map);
7106 if (cpus_empty(*nodemask)) {
7107 sched_group_nodes[i] = NULL;
7108 continue;
7109 }
7110
7111 sched_domain_node_span(i, domainspan);
7112 cpus_and(*domainspan, *domainspan, *cpu_map);
7113
7114 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
7115 if (!sg) {
7116 printk(KERN_WARNING "Can not alloc domain group for "
7117 "node %d\n", i);
7118 goto error;
7119 }
7120 sched_group_nodes[i] = sg;
7121 for_each_cpu_mask(j, *nodemask) {
7122 struct sched_domain *sd;
7123
7124 sd = &per_cpu(node_domains, j);
7125 sd->groups = sg;
7126 }
7127 sg->__cpu_power = 0;
7128 sg->cpumask = *nodemask;
7129 sg->next = sg;
7130 cpus_or(*covered, *covered, *nodemask);
7131 prev = sg;
7132
7133 for (j = 0; j < MAX_NUMNODES; j++) {
7134 SCHED_CPUMASK_VAR(notcovered, allmasks);
7135 int n = (i + j) % MAX_NUMNODES;
7136 node_to_cpumask_ptr(pnodemask, n);
7137
7138 cpus_complement(*notcovered, *covered);
7139 cpus_and(*tmpmask, *notcovered, *cpu_map);
7140 cpus_and(*tmpmask, *tmpmask, *domainspan);
7141 if (cpus_empty(*tmpmask))
7142 break;
7143
7144 cpus_and(*tmpmask, *tmpmask, *pnodemask);
7145 if (cpus_empty(*tmpmask))
7146 continue;
7147
7148 sg = kmalloc_node(sizeof(struct sched_group),
7149 GFP_KERNEL, i);
7150 if (!sg) {
7151 printk(KERN_WARNING
7152 "Can not alloc domain group for node %d\n", j);
7153 goto error;
7154 }
7155 sg->__cpu_power = 0;
7156 sg->cpumask = *tmpmask;
7157 sg->next = prev->next;
7158 cpus_or(*covered, *covered, *tmpmask);
7159 prev->next = sg;
7160 prev = sg;
7161 }
7162 }
7163#endif
7164
7165
7166#ifdef CONFIG_SCHED_SMT
7167 for_each_cpu_mask(i, *cpu_map) {
7168 struct sched_domain *sd = &per_cpu(cpu_domains, i);
7169
7170 init_sched_groups_power(i, sd);
7171 }
7172#endif
7173#ifdef CONFIG_SCHED_MC
7174 for_each_cpu_mask(i, *cpu_map) {
7175 struct sched_domain *sd = &per_cpu(core_domains, i);
7176
7177 init_sched_groups_power(i, sd);
7178 }
7179#endif
7180
7181 for_each_cpu_mask(i, *cpu_map) {
7182 struct sched_domain *sd = &per_cpu(phys_domains, i);
7183
7184 init_sched_groups_power(i, sd);
7185 }
7186
7187#ifdef CONFIG_NUMA
7188 for (i = 0; i < MAX_NUMNODES; i++)
7189 init_numa_sched_groups_power(sched_group_nodes[i]);
7190
7191 if (sd_allnodes) {
7192 struct sched_group *sg;
7193
7194 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
7195 tmpmask);
7196 init_numa_sched_groups_power(sg);
7197 }
7198#endif
7199
7200
7201 for_each_cpu_mask(i, *cpu_map) {
7202 struct sched_domain *sd;
7203#ifdef CONFIG_SCHED_SMT
7204 sd = &per_cpu(cpu_domains, i);
7205#elif defined(CONFIG_SCHED_MC)
7206 sd = &per_cpu(core_domains, i);
7207#else
7208 sd = &per_cpu(phys_domains, i);
7209#endif
7210 cpu_attach_domain(sd, rd, i);
7211 }
7212
7213 SCHED_CPUMASK_FREE((void *)allmasks);
7214 return 0;
7215
7216#ifdef CONFIG_NUMA
7217error:
7218 free_sched_groups(cpu_map, tmpmask);
7219 SCHED_CPUMASK_FREE((void *)allmasks);
7220 return -ENOMEM;
7221#endif
7222}
7223
7224static int build_sched_domains(const cpumask_t *cpu_map)
7225{
7226 return __build_sched_domains(cpu_map, NULL);
7227}
7228
/* Current partitioning: array of domain cpumasks, its length, and attrs. */
static cpumask_t *doms_cur;
static int ndoms_cur;
static struct sched_domain_attr *dattr_cur;

/*
 * Statically allocated single-domain mask, used when no dynamically
 * allocated array is available (it is never passed to kfree()).
 */
static cpumask_t fallback_doms;

/* Weak default hook, a no-op here; called from arch_init_sched_domains(). */
void __attribute__((weak)) arch_update_cpu_topology(void)
{
}
7244
7245
7246
7247
7248
7249static void free_sched_domains(void)
7250{
7251 ndoms_cur = 0;
7252 if (doms_cur != &fallback_doms)
7253 kfree(doms_cur);
7254 doms_cur = &fallback_doms;
7255}
7256
7257
7258
7259
7260
7261
7262static int arch_init_sched_domains(const cpumask_t *cpu_map)
7263{
7264 int err;
7265
7266 arch_update_cpu_topology();
7267 ndoms_cur = 1;
7268 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
7269 if (!doms_cur)
7270 doms_cur = &fallback_doms;
7271 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
7272 dattr_cur = NULL;
7273 err = build_sched_domains(doms_cur);
7274 register_sched_domain_sysctl();
7275
7276 return err;
7277}
7278
/*
 * Release the sched groups of the CPUs in @cpu_map; @tmpmask is scratch
 * space handed through to free_sched_groups().
 */
static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
 cpumask_t *tmpmask)
{
 free_sched_groups(cpu_map, tmpmask);
}
7284
7285
7286
7287
7288
7289static void detach_destroy_domains(const cpumask_t *cpu_map)
7290{
7291 cpumask_t tmpmask;
7292 int i;
7293
7294 unregister_sched_domain_sysctl();
7295
7296 for_each_cpu_mask(i, *cpu_map)
7297 cpu_attach_domain(NULL, &def_root_domain, i);
7298 synchronize_sched();
7299 arch_destroy_sched_domains(cpu_map, &tmpmask);
7300}
7301
7302
7303static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7304 struct sched_domain_attr *new, int idx_new)
7305{
7306 struct sched_domain_attr tmp;
7307
7308
7309 if (!new && !cur)
7310 return 1;
7311
7312 tmp = SD_ATTR_INIT;
7313 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7314 new ? (new + idx_new) : &tmp,
7315 sizeof(struct sched_domain_attr));
7316}
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7340 struct sched_domain_attr *dattr_new)
7341{
7342 int i, j;
7343
7344 mutex_lock(&sched_domains_mutex);
7345
7346
7347 unregister_sched_domain_sysctl();
7348
7349 if (doms_new == NULL) {
7350 ndoms_new = 1;
7351 doms_new = &fallback_doms;
7352 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7353 dattr_new = NULL;
7354 }
7355
7356
7357 for (i = 0; i < ndoms_cur; i++) {
7358 for (j = 0; j < ndoms_new; j++) {
7359 if (cpus_equal(doms_cur[i], doms_new[j])
7360 && dattrs_equal(dattr_cur, i, dattr_new, j))
7361 goto match1;
7362 }
7363
7364 detach_destroy_domains(doms_cur + i);
7365match1:
7366 ;
7367 }
7368
7369
7370 for (i = 0; i < ndoms_new; i++) {
7371 for (j = 0; j < ndoms_cur; j++) {
7372 if (cpus_equal(doms_new[i], doms_cur[j])
7373 && dattrs_equal(dattr_new, i, dattr_cur, j))
7374 goto match2;
7375 }
7376
7377 __build_sched_domains(doms_new + i,
7378 dattr_new ? dattr_new + i : NULL);
7379match2:
7380 ;
7381 }
7382
7383
7384 if (doms_cur != &fallback_doms)
7385 kfree(doms_cur);
7386 kfree(dattr_cur);
7387 doms_cur = doms_new;
7388 dattr_cur = dattr_new;
7389 ndoms_cur = ndoms_new;
7390
7391 register_sched_domain_sysctl();
7392
7393 mutex_unlock(&sched_domains_mutex);
7394}
7395
7396#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7397int arch_reinit_sched_domains(void)
7398{
7399 int err;
7400
7401 get_online_cpus();
7402 mutex_lock(&sched_domains_mutex);
7403 detach_destroy_domains(&cpu_online_map);
7404 free_sched_domains();
7405 err = arch_init_sched_domains(&cpu_online_map);
7406 mutex_unlock(&sched_domains_mutex);
7407 put_online_cpus();
7408
7409 return err;
7410}
7411
7412static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7413{
7414 int ret;
7415
7416 if (buf[0] != '0' && buf[0] != '1')
7417 return -EINVAL;
7418
7419 if (smt)
7420 sched_smt_power_savings = (buf[0] == '1');
7421 else
7422 sched_mc_power_savings = (buf[0] == '1');
7423
7424 ret = arch_reinit_sched_domains();
7425
7426 return ret ? ret : count;
7427}
7428
7429#ifdef CONFIG_SCHED_MC
/* sysfs show: print sched_mc_power_savings followed by a newline. */
static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
{
 return sprintf(page, "%u\n", sched_mc_power_savings);
}
/* sysfs store: forward to the common handler, selecting the MC switch. */
static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
 const char *buf, size_t count)
{
 return sched_power_savings_store(buf, count, 0);
}
/* Attribute "sched_mc_power_savings", mode 0644. */
static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
 sched_mc_power_savings_store);
7441#endif
7442
7443#ifdef CONFIG_SCHED_SMT
/* sysfs show: print sched_smt_power_savings followed by a newline. */
static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
{
 return sprintf(page, "%u\n", sched_smt_power_savings);
}
/* sysfs store: forward to the common handler, selecting the SMT switch. */
static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
 const char *buf, size_t count)
{
 return sched_power_savings_store(buf, count, 1);
}
/* Attribute "sched_smt_power_savings", mode 0644. */
static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
 sched_smt_power_savings_store);
7455#endif
7456
/*
 * Create the power-savings sysfs files under @cls: the SMT file when
 * smt_capable(), then the MC file when mc_capable() and nothing has failed
 * so far.  Returns 0 on success or the first sysfs_create_file() error.
 */
int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
{
	int err = 0;

#ifdef CONFIG_SCHED_SMT
	if (smt_capable()) {
		err = sysfs_create_file(&cls->kset.kobj,
					&attr_sched_smt_power_savings.attr);
	}
#endif
#ifdef CONFIG_SCHED_MC
	if (err)
		return err;
	if (mc_capable()) {
		err = sysfs_create_file(&cls->kset.kobj,
					&attr_sched_mc_power_savings.attr);
	}
#endif
	return err;
}
7473#endif
7474
7475
7476
7477
7478
7479
7480
7481static int update_sched_domains(struct notifier_block *nfb,
7482 unsigned long action, void *hcpu)
7483{
7484 switch (action) {
7485 case CPU_UP_PREPARE:
7486 case CPU_UP_PREPARE_FROZEN:
7487 case CPU_DOWN_PREPARE:
7488 case CPU_DOWN_PREPARE_FROZEN:
7489 detach_destroy_domains(&cpu_online_map);
7490 free_sched_domains();
7491 return NOTIFY_OK;
7492
7493 case CPU_UP_CANCELED:
7494 case CPU_UP_CANCELED_FROZEN:
7495 case CPU_DOWN_FAILED:
7496 case CPU_DOWN_FAILED_FROZEN:
7497 case CPU_ONLINE:
7498 case CPU_ONLINE_FROZEN:
7499 case CPU_DEAD:
7500 case CPU_DEAD_FROZEN:
7501
7502
7503
7504 break;
7505 default:
7506 return NOTIFY_DONE;
7507 }
7508
7509#ifndef CONFIG_CPUSETS
7510
7511
7512
7513
7514
7515
7516
7517 arch_init_sched_domains(&cpu_online_map);
7518#endif
7519
7520 return NOTIFY_OK;
7521}
7522
7523void __init sched_init_smp(void)
7524{
7525 cpumask_t non_isolated_cpus;
7526
7527#if defined(CONFIG_NUMA)
7528 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7529 GFP_KERNEL);
7530 BUG_ON(sched_group_nodes_bycpu == NULL);
7531#endif
7532 get_online_cpus();
7533 mutex_lock(&sched_domains_mutex);
7534 arch_init_sched_domains(&cpu_online_map);
7535 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
7536 if (cpus_empty(non_isolated_cpus))
7537 cpu_set(smp_processor_id(), non_isolated_cpus);
7538 mutex_unlock(&sched_domains_mutex);
7539 put_online_cpus();
7540
7541 hotcpu_notifier(update_sched_domains, 0);
7542 init_hrtick();
7543
7544
7545 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
7546 BUG();
7547 sched_init_granularity();
7548}
7549#else
/* Variant for the #else branch: only the granularity setup is needed. */
void __init sched_init_smp(void)
{
 sched_init_granularity();
}
7554#endif
7555
7556int in_sched_functions(unsigned long addr)
7557{
7558 return in_lock_functions(addr) ||
7559 (addr >= (unsigned long)__sched_text_start
7560 && addr < (unsigned long)__sched_text_end);
7561}
7562
7563static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7564{
7565 cfs_rq->tasks_timeline = RB_ROOT;
7566 INIT_LIST_HEAD(&cfs_rq->tasks);
7567#ifdef CONFIG_FAIR_GROUP_SCHED
7568 cfs_rq->rq = rq;
7569#endif
7570 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7571}
7572
7573static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7574{
7575 struct rt_prio_array *array;
7576 int i;
7577
7578 array = &rt_rq->active;
7579 for (i = 0; i < MAX_RT_PRIO; i++) {
7580 INIT_LIST_HEAD(array->queue + i);
7581 __clear_bit(i, array->bitmap);
7582 }
7583
7584 __set_bit(MAX_RT_PRIO, array->bitmap);
7585
7586#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
7587 rt_rq->highest_prio = MAX_RT_PRIO;
7588#endif
7589#ifdef CONFIG_SMP
7590 rt_rq->rt_nr_migratory = 0;
7591 rt_rq->overloaded = 0;
7592#endif
7593
7594 rt_rq->rt_time = 0;
7595 rt_rq->rt_throttled = 0;
7596 rt_rq->rt_runtime = 0;
7597 spin_lock_init(&rt_rq->rt_runtime_lock);
7598
7599#ifdef CONFIG_RT_GROUP_SCHED
7600 rt_rq->rt_nr_boosted = 0;
7601 rt_rq->rq = rq;
7602#endif
7603}
7604
7605#ifdef CONFIG_FAIR_GROUP_SCHED
7606static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7607 struct sched_entity *se, int cpu, int add,
7608 struct sched_entity *parent)
7609{
7610 struct rq *rq = cpu_rq(cpu);
7611 tg->cfs_rq[cpu] = cfs_rq;
7612 init_cfs_rq(cfs_rq, rq);
7613 cfs_rq->tg = tg;
7614 if (add)
7615 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7616
7617 tg->se[cpu] = se;
7618
7619 if (!se)
7620 return;
7621
7622 if (!parent)
7623 se->cfs_rq = &rq->cfs;
7624 else
7625 se->cfs_rq = parent->my_q;
7626
7627 se->my_q = cfs_rq;
7628 se->load.weight = tg->shares;
7629 se->load.inv_weight = 0;
7630 se->parent = parent;
7631}
7632#endif
7633
7634#ifdef CONFIG_RT_GROUP_SCHED
7635static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7636 struct sched_rt_entity *rt_se, int cpu, int add,
7637 struct sched_rt_entity *parent)
7638{
7639 struct rq *rq = cpu_rq(cpu);
7640
7641 tg->rt_rq[cpu] = rt_rq;
7642 init_rt_rq(rt_rq, rq);
7643 rt_rq->tg = tg;
7644 rt_rq->rt_se = rt_se;
7645 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7646 if (add)
7647 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7648
7649 tg->rt_se[cpu] = rt_se;
7650 if (!rt_se)
7651 return;
7652
7653 if (!parent)
7654 rt_se->rt_rq = &rq->rt;
7655 else
7656 rt_se->rt_rq = parent->my_q;
7657
7658 rt_se->my_q = rt_rq;
7659 rt_se->parent = parent;
7660 INIT_LIST_HEAD(&rt_se->run_list);
7661}
7662#endif
7663
7664void __init sched_init(void)
7665{
7666 int i, j;
7667 unsigned long alloc_size = 0, ptr;
7668
7669#ifdef CONFIG_FAIR_GROUP_SCHED
7670 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7671#endif
7672#ifdef CONFIG_RT_GROUP_SCHED
7673 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7674#endif
7675#ifdef CONFIG_USER_SCHED
7676 alloc_size *= 2;
7677#endif
7678
7679
7680
7681
7682 if (alloc_size) {
7683 ptr = (unsigned long)alloc_bootmem(alloc_size);
7684
7685#ifdef CONFIG_FAIR_GROUP_SCHED
7686 init_task_group.se = (struct sched_entity **)ptr;
7687 ptr += nr_cpu_ids * sizeof(void **);
7688
7689 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
7690 ptr += nr_cpu_ids * sizeof(void **);
7691
7692#ifdef CONFIG_USER_SCHED
7693 root_task_group.se = (struct sched_entity **)ptr;
7694 ptr += nr_cpu_ids * sizeof(void **);
7695
7696 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7697 ptr += nr_cpu_ids * sizeof(void **);
7698#endif
7699#endif
7700#ifdef CONFIG_RT_GROUP_SCHED
7701 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
7702 ptr += nr_cpu_ids * sizeof(void **);
7703
7704 init_task_group.rt_rq = (struct rt_rq **)ptr;
7705 ptr += nr_cpu_ids * sizeof(void **);
7706
7707#ifdef CONFIG_USER_SCHED
7708 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7709 ptr += nr_cpu_ids * sizeof(void **);
7710
7711 root_task_group.rt_rq = (struct rt_rq **)ptr;
7712 ptr += nr_cpu_ids * sizeof(void **);
7713#endif
7714#endif
7715 }
7716
7717#ifdef CONFIG_SMP
7718 init_defrootdomain();
7719#endif
7720
7721 init_rt_bandwidth(&def_rt_bandwidth,
7722 global_rt_period(), global_rt_runtime());
7723
7724#ifdef CONFIG_RT_GROUP_SCHED
7725 init_rt_bandwidth(&init_task_group.rt_bandwidth,
7726 global_rt_period(), global_rt_runtime());
7727#ifdef CONFIG_USER_SCHED
7728 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7729 global_rt_period(), RUNTIME_INF);
7730#endif
7731#endif
7732
7733#ifdef CONFIG_GROUP_SCHED
7734 list_add(&init_task_group.list, &task_groups);
7735 INIT_LIST_HEAD(&init_task_group.children);
7736
7737#ifdef CONFIG_USER_SCHED
7738 INIT_LIST_HEAD(&root_task_group.children);
7739 init_task_group.parent = &root_task_group;
7740 list_add(&init_task_group.siblings, &root_task_group.children);
7741#endif
7742#endif
7743
7744 for_each_possible_cpu(i) {
7745 struct rq *rq;
7746
7747 rq = cpu_rq(i);
7748 spin_lock_init(&rq->lock);
7749 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
7750 rq->nr_running = 0;
7751 init_cfs_rq(&rq->cfs, rq);
7752 init_rt_rq(&rq->rt, rq);
7753#ifdef CONFIG_FAIR_GROUP_SCHED
7754 init_task_group.shares = init_task_group_load;
7755 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7756#ifdef CONFIG_CGROUP_SCHED
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
7777#elif defined CONFIG_USER_SCHED
7778 root_task_group.shares = NICE_0_LOAD;
7779 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791 init_tg_cfs_entry(&init_task_group,
7792 &per_cpu(init_cfs_rq, i),
7793 &per_cpu(init_sched_entity, i), i, 1,
7794 root_task_group.se[i]);
7795
7796#endif
7797#endif
7798
7799 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7800#ifdef CONFIG_RT_GROUP_SCHED
7801 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7802#ifdef CONFIG_CGROUP_SCHED
7803 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
7804#elif defined CONFIG_USER_SCHED
7805 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
7806 init_tg_rt_entry(&init_task_group,
7807 &per_cpu(init_rt_rq, i),
7808 &per_cpu(init_sched_rt_entity, i), i, 1,
7809 root_task_group.rt_se[i]);
7810#endif
7811#endif
7812
7813 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7814 rq->cpu_load[j] = 0;
7815#ifdef CONFIG_SMP
7816 rq->sd = NULL;
7817 rq->rd = NULL;
7818 rq->active_balance = 0;
7819 rq->next_balance = jiffies;
7820 rq->push_cpu = 0;
7821 rq->cpu = i;
7822 rq->migration_thread = NULL;
7823 INIT_LIST_HEAD(&rq->migration_queue);
7824 rq_attach_root(rq, &def_root_domain);
7825#endif
7826 init_rq_hrtick(rq);
7827 atomic_set(&rq->nr_iowait, 0);
7828 }
7829
7830 set_load_weight(&init_task);
7831
7832#ifdef CONFIG_PREEMPT_NOTIFIERS
7833 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
7834#endif
7835
7836#ifdef CONFIG_SMP
7837 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
7838#endif
7839
7840#ifdef CONFIG_RT_MUTEXES
7841 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
7842#endif
7843
7844
7845
7846
7847 atomic_inc(&init_mm.mm_count);
7848 enter_lazy_tlb(&init_mm, current);
7849
7850
7851
7852
7853
7854
7855
7856 init_idle(current, smp_processor_id());
7857
7858
7859
7860 current->sched_class = &fair_sched_class;
7861
7862 scheduler_running = 1;
7863}
7864
7865#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
/*
 * Debug check behind might_sleep(): complain when called from atomic
 * context or with interrupts disabled.
 *
 * @file, @line: source location reported in the message.
 *
 * Only reports while the system is in SYSTEM_RUNNING state and no oops is
 * in progress, and at most once per HZ jiffies.
 */
void __might_sleep(char *file, int line)
{
#ifdef in_atomic
	static unsigned long prev_jiffy;	/* time of the last report */

	if (!in_atomic() && !irqs_disabled())
		return;
	if (system_state != SYSTEM_RUNNING || oops_in_progress)
		return;
	/* Rate limit; prev_jiffy == 0 means nothing was reported yet. */
	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
		return;
	prev_jiffy = jiffies;

	printk(KERN_ERR "BUG: sleeping function called from invalid"
			" context at %s:%d\n", file, line);
	printk("in_atomic():%d, irqs_disabled():%d\n",
		in_atomic(), irqs_disabled());
	debug_show_held_locks(current);
	if (irqs_disabled())
		print_irqtrace_events(current);
	dump_stack();
#endif
}
EXPORT_SYMBOL(__might_sleep);
7888#endif
7889
7890#ifdef CONFIG_MAGIC_SYSRQ
7891static void normalize_task(struct rq *rq, struct task_struct *p)
7892{
7893 int on_rq;
7894
7895 update_rq_clock(rq);
7896 on_rq = p->se.on_rq;
7897 if (on_rq)
7898 deactivate_task(rq, p, 0);
7899 __setscheduler(rq, p, SCHED_NORMAL, 0);
7900 if (on_rq) {
7901 activate_task(rq, p, 0);
7902 resched_task(rq->curr);
7903 }
7904}
7905
7906void normalize_rt_tasks(void)
7907{
7908 struct task_struct *g, *p;
7909 unsigned long flags;
7910 struct rq *rq;
7911
7912 read_lock_irqsave(&tasklist_lock, flags);
7913 do_each_thread(g, p) {
7914
7915
7916
7917 if (!p->mm)
7918 continue;
7919
7920 p->se.exec_start = 0;
7921#ifdef CONFIG_SCHEDSTATS
7922 p->se.wait_start = 0;
7923 p->se.sleep_start = 0;
7924 p->se.block_start = 0;
7925#endif
7926
7927 if (!rt_task(p)) {
7928
7929
7930
7931
7932 if (TASK_NICE(p) < 0 && p->mm)
7933 set_user_nice(p, 0);
7934 continue;
7935 }
7936
7937 spin_lock(&p->pi_lock);
7938 rq = __task_rq_lock(p);
7939
7940 normalize_task(rq, p);
7941
7942 __task_rq_unlock(rq);
7943 spin_unlock(&p->pi_lock);
7944 } while_each_thread(g, p);
7945
7946 read_unlock_irqrestore(&tasklist_lock, flags);
7947}
7948
7949#endif
7950
7951#ifdef CONFIG_IA64
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
/*
 * curr_task - return the task recorded as current on @cpu.
 *
 * Plain read of cpu_curr(@cpu); no locking is done here.
 */
struct task_struct *curr_task(int cpu)
{
	struct task_struct *curr = cpu_curr(cpu);

	return curr;
}
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988void set_curr_task(int cpu, struct task_struct *p)
7989{
7990 cpu_curr(cpu) = p;
7991}
7992
7993#endif
7994
7995#ifdef CONFIG_FAIR_GROUP_SCHED
7996static void free_fair_sched_group(struct task_group *tg)
7997{
7998 int i;
7999
8000 for_each_possible_cpu(i) {
8001 if (tg->cfs_rq)
8002 kfree(tg->cfs_rq[i]);
8003 if (tg->se)
8004 kfree(tg->se[i]);
8005 }
8006
8007 kfree(tg->cfs_rq);
8008 kfree(tg->se);
8009}
8010
8011static
8012int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8013{
8014 struct cfs_rq *cfs_rq;
8015 struct sched_entity *se, *parent_se;
8016 struct rq *rq;
8017 int i;
8018
8019 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8020 if (!tg->cfs_rq)
8021 goto err;
8022 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8023 if (!tg->se)
8024 goto err;
8025
8026 tg->shares = NICE_0_LOAD;
8027
8028 for_each_possible_cpu(i) {
8029 rq = cpu_rq(i);
8030
8031 cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
8032 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
8033 if (!cfs_rq)
8034 goto err;
8035
8036 se = kmalloc_node(sizeof(struct sched_entity),
8037 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
8038 if (!se)
8039 goto err;
8040
8041 parent_se = parent ? parent->se[i] : NULL;
8042 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
8043 }
8044
8045 return 1;
8046
8047 err:
8048 return 0;
8049}
8050
8051static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8052{
8053 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8054 &cpu_rq(cpu)->leaf_cfs_rq_list);
8055}
8056
8057static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8058{
8059 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8060}
8061#else
/* !CONFIG_FAIR_GROUP_SCHED stub: there is nothing to free. */
static inline void free_fair_sched_group(struct task_group *tg)
{
}
8065
/* !CONFIG_FAIR_GROUP_SCHED stub: nothing to allocate, always reports success (1). */
static inline
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}
8071
/* !CONFIG_FAIR_GROUP_SCHED stub: no leaf list to join. */
static inline void register_fair_sched_group(struct task_group *tg, int cpu)
{
}
8075
/* !CONFIG_FAIR_GROUP_SCHED stub: no leaf list to leave. */
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
}
8079#endif
8080
8081#ifdef CONFIG_RT_GROUP_SCHED
8082static void free_rt_sched_group(struct task_group *tg)
8083{
8084 int i;
8085
8086 destroy_rt_bandwidth(&tg->rt_bandwidth);
8087
8088 for_each_possible_cpu(i) {
8089 if (tg->rt_rq)
8090 kfree(tg->rt_rq[i]);
8091 if (tg->rt_se)
8092 kfree(tg->rt_se[i]);
8093 }
8094
8095 kfree(tg->rt_rq);
8096 kfree(tg->rt_se);
8097}
8098
8099static
8100int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8101{
8102 struct rt_rq *rt_rq;
8103 struct sched_rt_entity *rt_se, *parent_se;
8104 struct rq *rq;
8105 int i;
8106
8107 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8108 if (!tg->rt_rq)
8109 goto err;
8110 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8111 if (!tg->rt_se)
8112 goto err;
8113
8114 init_rt_bandwidth(&tg->rt_bandwidth,
8115 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8116
8117 for_each_possible_cpu(i) {
8118 rq = cpu_rq(i);
8119
8120 rt_rq = kmalloc_node(sizeof(struct rt_rq),
8121 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
8122 if (!rt_rq)
8123 goto err;
8124
8125 rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
8126 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
8127 if (!rt_se)
8128 goto err;
8129
8130 parent_se = parent ? parent->rt_se[i] : NULL;
8131 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
8132 }
8133
8134 return 1;
8135
8136 err:
8137 return 0;
8138}
8139
8140static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8141{
8142 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8143 &cpu_rq(cpu)->leaf_rt_rq_list);
8144}
8145
8146static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8147{
8148 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8149}
8150#else
/* !CONFIG_RT_GROUP_SCHED stub: there is nothing to free. */
static inline void free_rt_sched_group(struct task_group *tg)
{
}
8154
/* !CONFIG_RT_GROUP_SCHED stub: nothing to allocate, always reports success (1). */
static inline
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}
8160
/* !CONFIG_RT_GROUP_SCHED stub: no leaf list to join. */
static inline void register_rt_sched_group(struct task_group *tg, int cpu)
{
}
8164
/* !CONFIG_RT_GROUP_SCHED stub: no leaf list to leave. */
static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
{
}
8168#endif
8169
8170#ifdef CONFIG_GROUP_SCHED
/*
 * Release everything owned by @tg: the fair-class state, the RT-class
 * state, and finally the task_group structure itself.
 */
static void free_sched_group(struct task_group *tg)
{
	free_fair_sched_group(tg);
	free_rt_sched_group(tg);

	kfree(tg);
}
8177
8178
8179struct task_group *sched_create_group(struct task_group *parent)
8180{
8181 struct task_group *tg;
8182 unsigned long flags;
8183 int i;
8184
8185 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8186 if (!tg)
8187 return ERR_PTR(-ENOMEM);
8188
8189 if (!alloc_fair_sched_group(tg, parent))
8190 goto err;
8191
8192 if (!alloc_rt_sched_group(tg, parent))
8193 goto err;
8194
8195 spin_lock_irqsave(&task_group_lock, flags);
8196 for_each_possible_cpu(i) {
8197 register_fair_sched_group(tg, i);
8198 register_rt_sched_group(tg, i);
8199 }
8200 list_add_rcu(&tg->list, &task_groups);
8201
8202 WARN_ON(!parent);
8203
8204 tg->parent = parent;
8205 list_add_rcu(&tg->siblings, &parent->children);
8206 INIT_LIST_HEAD(&tg->children);
8207 spin_unlock_irqrestore(&task_group_lock, flags);
8208
8209 return tg;
8210
8211err:
8212 free_sched_group(tg);
8213 return ERR_PTR(-ENOMEM);
8214}
8215
8216
8217static void free_sched_group_rcu(struct rcu_head *rhp)
8218{
8219
8220 free_sched_group(container_of(rhp, struct task_group, rcu));
8221}
8222
8223
8224void sched_destroy_group(struct task_group *tg)
8225{
8226 unsigned long flags;
8227 int i;
8228
8229 spin_lock_irqsave(&task_group_lock, flags);
8230 for_each_possible_cpu(i) {
8231 unregister_fair_sched_group(tg, i);
8232 unregister_rt_sched_group(tg, i);
8233 }
8234 list_del_rcu(&tg->list);
8235 list_del_rcu(&tg->siblings);
8236 spin_unlock_irqrestore(&task_group_lock, flags);
8237
8238
8239 call_rcu(&tg->rcu, free_sched_group_rcu);
8240}
8241
8242
8243
8244
8245
8246
8247void sched_move_task(struct task_struct *tsk)
8248{
8249 int on_rq, running;
8250 unsigned long flags;
8251 struct rq *rq;
8252
8253 rq = task_rq_lock(tsk, &flags);
8254
8255 update_rq_clock(rq);
8256
8257 running = task_current(rq, tsk);
8258 on_rq = tsk->se.on_rq;
8259
8260 if (on_rq)
8261 dequeue_task(rq, tsk, 0);
8262 if (unlikely(running))
8263 tsk->sched_class->put_prev_task(rq, tsk);
8264
8265 set_task_rq(tsk, task_cpu(tsk));
8266
8267#ifdef CONFIG_FAIR_GROUP_SCHED
8268 if (tsk->sched_class->moved_group)
8269 tsk->sched_class->moved_group(tsk);
8270#endif
8271
8272 if (unlikely(running))
8273 tsk->sched_class->set_curr_task(rq);
8274 if (on_rq)
8275 enqueue_task(rq, tsk, 0);
8276
8277 task_rq_unlock(rq, &flags);
8278}
8279#endif
8280
8281#ifdef CONFIG_FAIR_GROUP_SCHED
8282static void set_se_shares(struct sched_entity *se, unsigned long shares)
8283{
8284 struct cfs_rq *cfs_rq = se->cfs_rq;
8285 struct rq *rq = cfs_rq->rq;
8286 int on_rq;
8287
8288 spin_lock_irq(&rq->lock);
8289
8290 on_rq = se->on_rq;
8291 if (on_rq)
8292 dequeue_entity(cfs_rq, se, 0);
8293
8294 se->load.weight = shares;
8295 se->load.inv_weight = 0;
8296
8297 if (on_rq)
8298 enqueue_entity(cfs_rq, se, 0);
8299
8300 spin_unlock_irq(&rq->lock);
8301}
8302
8303static DEFINE_MUTEX(shares_mutex);
8304
8305int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8306{
8307 int i;
8308 unsigned long flags;
8309
8310
8311
8312
8313 if (!tg->se[0])
8314 return -EINVAL;
8315
8316 if (shares < MIN_SHARES)
8317 shares = MIN_SHARES;
8318 else if (shares > MAX_SHARES)
8319 shares = MAX_SHARES;
8320
8321 mutex_lock(&shares_mutex);
8322 if (tg->shares == shares)
8323 goto done;
8324
8325 spin_lock_irqsave(&task_group_lock, flags);
8326 for_each_possible_cpu(i)
8327 unregister_fair_sched_group(tg, i);
8328 list_del_rcu(&tg->siblings);
8329 spin_unlock_irqrestore(&task_group_lock, flags);
8330
8331
8332 synchronize_sched();
8333
8334
8335
8336
8337
8338 tg->shares = shares;
8339 for_each_possible_cpu(i)
8340 set_se_shares(tg->se[i], shares);
8341
8342
8343
8344
8345
8346 spin_lock_irqsave(&task_group_lock, flags);
8347 for_each_possible_cpu(i)
8348 register_fair_sched_group(tg, i);
8349 list_add_rcu(&tg->siblings, &tg->parent->children);
8350 spin_unlock_irqrestore(&task_group_lock, flags);
8351done:
8352 mutex_unlock(&shares_mutex);
8353 return 0;
8354}
8355
8356unsigned long sched_group_shares(struct task_group *tg)
8357{
8358 return tg->shares;
8359}
8360#endif
8361
8362#ifdef CONFIG_RT_GROUP_SCHED
8363
8364
8365
/* Serialises the RT bandwidth schedulability checks and updates below. */
static DEFINE_MUTEX(rt_constraints_mutex);
8367
8368static unsigned long to_ratio(u64 period, u64 runtime)
8369{
8370 if (runtime == RUNTIME_INF)
8371 return 1ULL << 16;
8372
8373 return div64_u64(runtime << 16, period);
8374}
8375
8376#ifdef CONFIG_CGROUP_SCHED
8377static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8378{
8379 struct task_group *tgi, *parent = tg ? tg->parent : NULL;
8380 unsigned long total = 0;
8381
8382 if (!parent) {
8383 if (global_rt_period() < period)
8384 return 0;
8385
8386 return to_ratio(period, runtime) <
8387 to_ratio(global_rt_period(), global_rt_runtime());
8388 }
8389
8390 if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
8391 return 0;
8392
8393 rcu_read_lock();
8394 list_for_each_entry_rcu(tgi, &parent->children, siblings) {
8395 if (tgi == tg)
8396 continue;
8397
8398 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
8399 tgi->rt_bandwidth.rt_runtime);
8400 }
8401 rcu_read_unlock();
8402
8403 return total + to_ratio(period, runtime) <
8404 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
8405 parent->rt_bandwidth.rt_runtime);
8406}
8407#elif defined CONFIG_USER_SCHED
8408static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8409{
8410 struct task_group *tgi;
8411 unsigned long total = 0;
8412 unsigned long global_ratio =
8413 to_ratio(global_rt_period(), global_rt_runtime());
8414
8415 rcu_read_lock();
8416 list_for_each_entry_rcu(tgi, &task_groups, list) {
8417 if (tgi == tg)
8418 continue;
8419
8420 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
8421 tgi->rt_bandwidth.rt_runtime);
8422 }
8423 rcu_read_unlock();
8424
8425 return total + to_ratio(period, runtime) < global_ratio;
8426}
8427#endif
8428
8429
/*
 * Return 1 if any thread in the system is an RT task whose RT runqueue
 * belongs to @tg, 0 otherwise. Walks the full thread list; the visible
 * caller holds tasklist_lock for reading.
 */
static inline int tg_has_rt_tasks(struct task_struct_group_placeholder_unused *unused_never_declared);
8439
8440static int tg_set_bandwidth(struct task_group *tg,
8441 u64 rt_period, u64 rt_runtime)
8442{
8443 int i, err = 0;
8444
8445 mutex_lock(&rt_constraints_mutex);
8446 read_lock(&tasklist_lock);
8447 if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
8448 err = -EBUSY;
8449 goto unlock;
8450 }
8451 if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
8452 err = -EINVAL;
8453 goto unlock;
8454 }
8455
8456 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8457 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8458 tg->rt_bandwidth.rt_runtime = rt_runtime;
8459
8460 for_each_possible_cpu(i) {
8461 struct rt_rq *rt_rq = tg->rt_rq[i];
8462
8463 spin_lock(&rt_rq->rt_runtime_lock);
8464 rt_rq->rt_runtime = rt_runtime;
8465 spin_unlock(&rt_rq->rt_runtime_lock);
8466 }
8467 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8468 unlock:
8469 read_unlock(&tasklist_lock);
8470 mutex_unlock(&rt_constraints_mutex);
8471
8472 return err;
8473}
8474
8475int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8476{
8477 u64 rt_runtime, rt_period;
8478
8479 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8480 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8481 if (rt_runtime_us < 0)
8482 rt_runtime = RUNTIME_INF;
8483
8484 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8485}
8486
8487long sched_group_rt_runtime(struct task_group *tg)
8488{
8489 u64 rt_runtime_us;
8490
8491 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
8492 return -1;
8493
8494 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
8495 do_div(rt_runtime_us, NSEC_PER_USEC);
8496 return rt_runtime_us;
8497}
8498
8499int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8500{
8501 u64 rt_runtime, rt_period;
8502
8503 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8504 rt_runtime = tg->rt_bandwidth.rt_runtime;
8505
8506 if (rt_period == 0)
8507 return -EINVAL;
8508
8509 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8510}
8511
8512long sched_group_rt_period(struct task_group *tg)
8513{
8514 u64 rt_period_us;
8515
8516 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
8517 do_div(rt_period_us, NSEC_PER_USEC);
8518 return rt_period_us;
8519}
8520
8521static int sched_rt_global_constraints(void)
8522{
8523 int ret = 0;
8524
8525 mutex_lock(&rt_constraints_mutex);
8526 if (!__rt_schedulable(NULL, 1, 0))
8527 ret = -EINVAL;
8528 mutex_unlock(&rt_constraints_mutex);
8529
8530 return ret;
8531}
8532#else
8533static int sched_rt_global_constraints(void)
8534{
8535 unsigned long flags;
8536 int i;
8537
8538 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8539 for_each_possible_cpu(i) {
8540 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
8541
8542 spin_lock(&rt_rq->rt_runtime_lock);
8543 rt_rq->rt_runtime = global_rt_runtime();
8544 spin_unlock(&rt_rq->rt_runtime_lock);
8545 }
8546 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
8547
8548 return 0;
8549}
8550#endif
8551
8552int sched_rt_handler(struct ctl_table *table, int write,
8553 struct file *filp, void __user *buffer, size_t *lenp,
8554 loff_t *ppos)
8555{
8556 int ret;
8557 int old_period, old_runtime;
8558 static DEFINE_MUTEX(mutex);
8559
8560 mutex_lock(&mutex);
8561 old_period = sysctl_sched_rt_period;
8562 old_runtime = sysctl_sched_rt_runtime;
8563
8564 ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
8565
8566 if (!ret && write) {
8567 ret = sched_rt_global_constraints();
8568 if (ret) {
8569 sysctl_sched_rt_period = old_period;
8570 sysctl_sched_rt_runtime = old_runtime;
8571 } else {
8572 def_rt_bandwidth.rt_runtime = global_rt_runtime();
8573 def_rt_bandwidth.rt_period =
8574 ns_to_ktime(global_rt_period());
8575 }
8576 }
8577 mutex_unlock(&mutex);
8578
8579 return ret;
8580}
8581
8582#ifdef CONFIG_CGROUP_SCHED
8583
8584
8585static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
8586{
8587 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
8588 struct task_group, css);
8589}
8590
8591static struct cgroup_subsys_state *
8592cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8593{
8594 struct task_group *tg, *parent;
8595
8596 if (!cgrp->parent) {
8597
8598 init_task_group.css.cgroup = cgrp;
8599 return &init_task_group.css;
8600 }
8601
8602 parent = cgroup_tg(cgrp->parent);
8603 tg = sched_create_group(parent);
8604 if (IS_ERR(tg))
8605 return ERR_PTR(-ENOMEM);
8606
8607
8608 tg->css.cgroup = cgrp;
8609
8610 return &tg->css;
8611}
8612
/* cgroup "destroy" callback: tear down the task group backing @cgrp. */
static void
cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	sched_destroy_group(cgroup_tg(cgrp));
}
8620
8621static int
8622cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8623 struct task_struct *tsk)
8624{
8625#ifdef CONFIG_RT_GROUP_SCHED
8626
8627 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
8628 return -EINVAL;
8629#else
8630
8631 if (tsk->sched_class != &fair_sched_class)
8632 return -EINVAL;
8633#endif
8634
8635 return 0;
8636}
8637
/*
 * cgroup "attach" callback: @tsk has been moved between cgroups, so
 * re-attach it to the matching group runqueues. Only @tsk is used here.
 */
static void
cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			struct cgroup *old_cont, struct task_struct *tsk)
{
	sched_move_task(tsk);
}
8644
8645#ifdef CONFIG_FAIR_GROUP_SCHED
8646static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8647 u64 shareval)
8648{
8649 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
8650}
8651
8652static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8653{
8654 struct task_group *tg = cgroup_tg(cgrp);
8655
8656 return (u64) tg->shares;
8657}
8658#endif
8659
8660#ifdef CONFIG_RT_GROUP_SCHED
8661static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
8662 s64 val)
8663{
8664 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
8665}
8666
8667static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
8668{
8669 return sched_group_rt_runtime(cgroup_tg(cgrp));
8670}
8671
8672static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
8673 u64 rt_period_us)
8674{
8675 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
8676}
8677
8678static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8679{
8680 return sched_group_rt_period(cgroup_tg(cgrp));
8681}
8682#endif
8683
8684static struct cftype cpu_files[] = {
8685#ifdef CONFIG_FAIR_GROUP_SCHED
8686 {
8687 .name = "shares",
8688 .read_u64 = cpu_shares_read_u64,
8689 .write_u64 = cpu_shares_write_u64,
8690 },
8691#endif
8692#ifdef CONFIG_RT_GROUP_SCHED
8693 {
8694 .name = "rt_runtime_us",
8695 .read_s64 = cpu_rt_runtime_read,
8696 .write_s64 = cpu_rt_runtime_write,
8697 },
8698 {
8699 .name = "rt_period_us",
8700 .read_u64 = cpu_rt_period_read_uint,
8701 .write_u64 = cpu_rt_period_write_uint,
8702 },
8703#endif
8704};
8705
8706static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
8707{
8708 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
8709}
8710
8711struct cgroup_subsys cpu_cgroup_subsys = {
8712 .name = "cpu",
8713 .create = cpu_cgroup_create,
8714 .destroy = cpu_cgroup_destroy,
8715 .can_attach = cpu_cgroup_can_attach,
8716 .attach = cpu_cgroup_attach,
8717 .populate = cpu_cgroup_populate,
8718 .subsys_id = cpu_cgroup_subsys_id,
8719 .early_init = 1,
8720};
8721
8722#endif
8723
8724#ifdef CONFIG_CGROUP_CPUACCT
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734struct cpuacct {
8735 struct cgroup_subsys_state css;
8736
8737 u64 *cpuusage;
8738};
8739
8740struct cgroup_subsys cpuacct_subsys;
8741
8742
8743static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
8744{
8745 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
8746 struct cpuacct, css);
8747}
8748
8749
8750static inline struct cpuacct *task_ca(struct task_struct *tsk)
8751{
8752 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
8753 struct cpuacct, css);
8754}
8755
8756
8757static struct cgroup_subsys_state *cpuacct_create(
8758 struct cgroup_subsys *ss, struct cgroup *cgrp)
8759{
8760 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8761
8762 if (!ca)
8763 return ERR_PTR(-ENOMEM);
8764
8765 ca->cpuusage = alloc_percpu(u64);
8766 if (!ca->cpuusage) {
8767 kfree(ca);
8768 return ERR_PTR(-ENOMEM);
8769 }
8770
8771 return &ca->css;
8772}
8773
8774
8775static void
8776cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8777{
8778 struct cpuacct *ca = cgroup_ca(cgrp);
8779
8780 free_percpu(ca->cpuusage);
8781 kfree(ca);
8782}
8783
8784
8785static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8786{
8787 struct cpuacct *ca = cgroup_ca(cgrp);
8788 u64 totalcpuusage = 0;
8789 int i;
8790
8791 for_each_possible_cpu(i) {
8792 u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
8793
8794
8795
8796
8797
8798 spin_lock_irq(&cpu_rq(i)->lock);
8799 totalcpuusage += *cpuusage;
8800 spin_unlock_irq(&cpu_rq(i)->lock);
8801 }
8802
8803 return totalcpuusage;
8804}
8805
8806static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8807 u64 reset)
8808{
8809 struct cpuacct *ca = cgroup_ca(cgrp);
8810 int err = 0;
8811 int i;
8812
8813 if (reset) {
8814 err = -EINVAL;
8815 goto out;
8816 }
8817
8818 for_each_possible_cpu(i) {
8819 u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
8820
8821 spin_lock_irq(&cpu_rq(i)->lock);
8822 *cpuusage = 0;
8823 spin_unlock_irq(&cpu_rq(i)->lock);
8824 }
8825out:
8826 return err;
8827}
8828
8829static struct cftype files[] = {
8830 {
8831 .name = "usage",
8832 .read_u64 = cpuusage_read,
8833 .write_u64 = cpuusage_write,
8834 },
8835};
8836
8837static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8838{
8839 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8840}
8841
8842
8843
8844
8845
8846
8847static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8848{
8849 struct cpuacct *ca;
8850
8851 if (!cpuacct_subsys.active)
8852 return;
8853
8854 ca = task_ca(tsk);
8855 if (ca) {
8856 u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
8857
8858 *cpuusage += cputime;
8859 }
8860}
8861
8862struct cgroup_subsys cpuacct_subsys = {
8863 .name = "cpuacct",
8864 .create = cpuacct_create,
8865 .destroy = cpuacct_destroy,
8866 .populate = cpuacct_populate,
8867 .subsys_id = cpuacct_subsys_id,
8868};
8869#endif
8870