/*
 *  kernel/sched.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/reciprocal_div.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>

#include <asm/tlb.h>
#include <asm/irq_regs.h>

/*
 * Scheduler clock - returns current time in nanosec units.
 * This is the default implementation; architectures and
 * sub-architectures can override it.
 */
unsigned long long __attribute__((weak)) sched_clock(void)
{
	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
}

/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], and back.
 */
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))

/*
 * Helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

#define NICE_0_LOAD SCHED_LOAD_SCALE
#define NICE_0_SHIFT SCHED_LOAD_SHIFT

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 * Timeslices get refilled after they expire.
 */
#define DEF_TIMESLICE (100 * HZ / 1000)

#ifdef CONFIG_SMP
/*
 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
 * Since cpu_power is a 'constant', we can use a reciprocal divide.
 */
static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
{
	return reciprocal_divide(load, sg->reciprocal_cpu_power);
}

/*
 * Each time a sched group cpu_power is changed,
 * we must also update its reciprocal value
 */
static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
{
	sg->__cpu_power += val;
	sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
}
#endif

static inline int rt_policy(int policy)
{
	if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
		return 1;
	return 0;
}

static inline int task_has_rt_policy(struct task_struct *p)
{
	return rt_policy(p->policy);
}

/*
 * This is the priority-queue data structure of the RT scheduling class:
 */
struct rt_prio_array {
	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
	struct list_head queue[MAX_RT_PRIO];
};

#ifdef CONFIG_GROUP_SCHED

#include <linux/cgroup.h>

struct cfs_rq;

static LIST_HEAD(task_groups);

/* task group related information */
struct task_group {
#ifdef CONFIG_CGROUP_SCHED
	struct cgroup_subsys_state css;
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* schedulable entities of this group on each cpu */
	struct sched_entity **se;
	/* runqueue "owned" by this group on each cpu */
	struct cfs_rq **cfs_rq;
	unsigned long shares;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity **rt_se;
	struct rt_rq **rt_rq;

	u64 rt_runtime;
#endif

	struct rcu_head rcu;
	struct list_head list;
};

#ifdef CONFIG_FAIR_GROUP_SCHED
/* Default task group's sched entity on each cpu */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* Default task group's cfs_rq on each cpu */
static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;

static struct sched_entity *init_sched_entity_p[NR_CPUS];
static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;

static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
static struct rt_rq *init_rt_rq_p[NR_CPUS];
#endif

/*
 * task_group_lock serializes add/remove of task groups and also changes to
 * a task group's cpu shares.
 */
static DEFINE_SPINLOCK(task_group_lock);

/* doms_cur_mutex serializes access to doms_cur[] array */
static DEFINE_MUTEX(doms_cur_mutex);

#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
#else
# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
#endif

static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif

/*
 * Default task group.
 * Every task in the system belongs to this group at bootup.
 */
struct task_group init_task_group = {
#ifdef CONFIG_FAIR_GROUP_SCHED
	.se = init_sched_entity_p,
	.cfs_rq = init_cfs_rq_p,
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	.rt_se = init_sched_rt_entity_p,
	.rt_rq = init_rt_rq_p,
#endif
};

/* return group to which a task belongs */
static inline struct task_group *task_group(struct task_struct *p)
{
	struct task_group *tg;

#ifdef CONFIG_USER_SCHED
	tg = p->user->tg;
#elif defined(CONFIG_CGROUP_SCHED)
	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
			struct task_group, css);
#else
	tg = &init_task_group;
#endif
	return tg;
}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
	p->se.parent = task_group(p)->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	p->rt.rt_rq = task_group(p)->rt_rq[cpu];
	p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}

static inline void lock_doms_cur(void)
{
	mutex_lock(&doms_cur_mutex);
}

static inline void unlock_doms_cur(void)
{
	mutex_unlock(&doms_cur_mutex);
}

#else

static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline void lock_doms_cur(void) { }
static inline void unlock_doms_cur(void) { }

#endif	/* CONFIG_GROUP_SCHED */

/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight load;
	unsigned long nr_running;

	u64 exec_clock;
	u64 min_vruntime;

	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;
	struct rb_node *rb_load_balance_curr;
	/*
	 * 'curr' points to the currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e. when none are currently running).
	 */
	struct sched_entity *curr, *next;

	unsigned long nr_spread_over;

#ifdef CONFIG_FAIR_GROUP_SCHED
	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */

	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf cfs_rqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together the list of leaf cfs_rq's in a cpu.
	 * This list is used during load balance.
	 */
	struct list_head leaf_cfs_rq_list;
	struct task_group *tg;	/* group that "owns" this runqueue */
#endif
};

/* Real-Time classes' related fields in a runqueue: */
struct rt_rq {
	struct rt_prio_array active;
	unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
	int highest_prio; /* highest queued rt task prio */
#endif
#ifdef CONFIG_SMP
	unsigned long rt_nr_migratory;
	int overloaded;
#endif
	int rt_throttled;
	u64 rt_time;

#ifdef CONFIG_RT_GROUP_SCHED
	unsigned long rt_nr_boosted;

	struct rq *rq;
	struct list_head leaf_rt_rq_list;
	struct task_group *tg;
	struct sched_rt_entity *rt_se;
#endif
};

#ifdef CONFIG_SMP

/*
 * We add the notion of a root-domain which will be used to define per-domain
 * variables. Each exclusive cpuset essentially defines an island domain by
 * fully partitioning the member cpus from any other cpuset. Whenever a new
 * exclusive cpuset is created, we also create and attach a new root-domain
 * object.
 */
struct root_domain {
	atomic_t refcount;
	cpumask_t span;
	cpumask_t online;

	/*
	 * The "RT overload" flag: it gets set if a CPU has more than
	 * one runnable RT task.
	 */
	cpumask_t rto_mask;
	atomic_t rto_count;
};

/*
 * By default the system creates a single root-domain with all cpus as
 * members (mimicking the global state we have today).
 */
static struct root_domain def_root_domain;

#endif

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
	/* runqueue lock: */
	spinlock_t lock;

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	unsigned long nr_running;
	#define CPU_LOAD_IDX_MAX 5
	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
	unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
	unsigned char in_nohz_recently;
#endif
	/* capture load from *all* tasks on this cpu: */
	struct load_weight load;
	unsigned long nr_load_updates;
	u64 nr_switches;

	struct cfs_rq cfs;
	struct rt_rq rt;
	u64 rt_period_expire;
	int rt_throttled;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* list of leaf cfs_rq on this cpu: */
	struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	struct list_head leaf_rt_rq_list;
#endif

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU. Always updated under the runqueue lock:
	 */
	unsigned long nr_uninterruptible;

	struct task_struct *curr, *idle;
	unsigned long next_balance;
	struct mm_struct *prev_mm;

	u64 clock, prev_clock_raw;
	s64 clock_max_delta;

	unsigned int clock_warps, clock_overflows, clock_underflows;
	u64 idle_clock;
	unsigned int clock_deep_idle_events;
	u64 tick_timestamp;

	atomic_t nr_iowait;

#ifdef CONFIG_SMP
	struct root_domain *rd;
	struct sched_domain *sd;

	/* For active balancing */
	int active_balance;
	int push_cpu;
	/* cpu of this runqueue: */
	int cpu;

	struct task_struct *migration_thread;
	struct list_head migration_queue;
#endif

#ifdef CONFIG_SCHED_HRTICK
	unsigned long hrtick_flags;
	ktime_t hrtick_expire;
	struct hrtimer hrtick_timer;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* latency stats */
	struct sched_info rq_sched_info;

	/* sys_sched_yield() stats */
	unsigned int yld_exp_empty;
	unsigned int yld_act_empty;
	unsigned int yld_both_empty;
	unsigned int yld_count;

	/* schedule() stats */
	unsigned int sched_switch;
	unsigned int sched_count;
	unsigned int sched_goidle;

	/* try_to_wake_up() stats */
	unsigned int ttwu_count;
	unsigned int ttwu_local;

	/* BKL stats */
	unsigned int bkl_count;
#endif
	struct lock_class_key rq_lock_key;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
{
	rq->curr->sched_class->check_preempt_curr(rq, p);
}

static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq->cpu;
#else
	return 0;
#endif
}

/*
 * Update the per-runqueue clock, as finegrained as the platform can give
 * us, but without assuming monotonicity, etc.:
 */
static void __update_rq_clock(struct rq *rq)
{
	u64 prev_raw = rq->prev_clock_raw;
	u64 now = sched_clock();
	s64 delta = now - prev_raw;
	u64 clock = rq->clock;

#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
#endif
	/*
	 * Protect against sched_clock() occasionally going backwards:
	 */
	if (unlikely(delta < 0)) {
		clock++;
		rq->clock_warps++;
	} else {
		/*
		 * Catch too large forward jumps too:
		 */
		if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) {
			if (clock < rq->tick_timestamp + TICK_NSEC)
				clock = rq->tick_timestamp + TICK_NSEC;
			else
				clock++;
			rq->clock_overflows++;
		} else {
			if (unlikely(delta > rq->clock_max_delta))
				rq->clock_max_delta = delta;
			clock += delta;
		}
	}

	rq->prev_clock_raw = now;
	rq->clock = clock;
}

static void update_rq_clock(struct rq *rq)
{
	if (likely(smp_processor_id() == cpu_of(rq)))
		__update_rq_clock(rq);
}
548
549
550
551
552
553
554
555
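/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */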
556#define for_each_domain(cpu, __sd) \
557 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
558
559#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
560#define this_rq() (&__get_cpu_var(runqueues))
561#define task_rq(p) cpu_rq(task_cpu(p))
562#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
563
564unsigned long rt_needs_cpu(int cpu)
565{
566 struct rq *rq = cpu_rq(cpu);
567 u64 delta;
568
569 if (!rq->rt_throttled)
570 return 0;
571
572 if (rq->clock > rq->rt_period_expire)
573 return 1;
574
575 delta = rq->rt_period_expire - rq->clock;
576 do_div(delta, NSEC_PER_SEC / HZ);
577
578 return (unsigned long)delta;
579}
580
581
582
583
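/*
 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 */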
584#ifdef CONFIG_SCHED_DEBUG
585# define const_debug __read_mostly
586#else
587# define const_debug static const
588#endif
589
590
591
592
593enum {
594 SCHED_FEAT_NEW_FAIR_SLEEPERS = 1,
595 SCHED_FEAT_WAKEUP_PREEMPT = 2,
596 SCHED_FEAT_START_DEBIT = 4,
597 SCHED_FEAT_HRTICK = 8,
598 SCHED_FEAT_DOUBLE_TICK = 16,
599};
600
601const_debug unsigned int sysctl_sched_features =
602 SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 |
603 SCHED_FEAT_WAKEUP_PREEMPT * 1 |
604 SCHED_FEAT_START_DEBIT * 1 |
605 SCHED_FEAT_HRTICK * 1 |
606 SCHED_FEAT_DOUBLE_TICK * 0;
607
608#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
609
610
611
612
613
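/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */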
614const_debug unsigned int sysctl_sched_nr_migrate = 32;
615
616
617
618
619
620unsigned int sysctl_sched_rt_period = 1000000;
621
622static __read_mostly int scheduler_running;
623
624
625
626
627
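/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */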
628int sysctl_sched_rt_runtime = 950000;
629
630
631
632
633#define RUNTIME_INF ((u64)~0ULL)
634
635
636
637
638
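/*
 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
 * clock constructed from sched_clock():
 */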
639unsigned long long cpu_clock(int cpu)
640{
641 unsigned long long now;
642 unsigned long flags;
643 struct rq *rq;
644
645
646
647
648
649 if (unlikely(!scheduler_running))
650 return 0;
651
652 local_irq_save(flags);
653 rq = cpu_rq(cpu);
654 update_rq_clock(rq);
655 now = rq->clock;
656 local_irq_restore(flags);
657
658 return now;
659}
660EXPORT_SYMBOL_GPL(cpu_clock);
661
662#ifndef prepare_arch_switch
663# define prepare_arch_switch(next) do { } while (0)
664#endif
665#ifndef finish_arch_switch
666# define finish_arch_switch(prev) do { } while (0)
667#endif
668
669static inline int task_current(struct rq *rq, struct task_struct *p)
670{
671 return rq->curr == p;
672}
673
674#ifndef __ARCH_WANT_UNLOCKED_CTXSW
675static inline int task_running(struct rq *rq, struct task_struct *p)
676{
677 return task_current(rq, p);
678}
679
680static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
681{
682}
683
684static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
685{
686#ifdef CONFIG_DEBUG_SPINLOCK
687
688 rq->lock.owner = current;
689#endif
690
691
692
693
694
695 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
696
697 spin_unlock_irq(&rq->lock);
698}
699
700#else
701static inline int task_running(struct rq *rq, struct task_struct *p)
702{
703#ifdef CONFIG_SMP
704 return p->oncpu;
705#else
706 return task_current(rq, p);
707#endif
708}
709
710static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
711{
712#ifdef CONFIG_SMP
713
714
715
716
717
718 next->oncpu = 1;
719#endif
720#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
721 spin_unlock_irq(&rq->lock);
722#else
723 spin_unlock(&rq->lock);
724#endif
725}
726
727static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
728{
729#ifdef CONFIG_SMP
730
731
732
733
734
735 smp_wmb();
736 prev->oncpu = 0;
737#endif
738#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
739 local_irq_enable();
740#endif
741}
742#endif
743
744
745
746
747
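/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called interrupts disabled.
 */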
748static inline struct rq *__task_rq_lock(struct task_struct *p)
749 __acquires(rq->lock)
750{
751 for (;;) {
752 struct rq *rq = task_rq(p);
753 spin_lock(&rq->lock);
754 if (likely(rq == task_rq(p)))
755 return rq;
756 spin_unlock(&rq->lock);
757 }
758}
759
760
761
762
763
764
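/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */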
765static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
766 __acquires(rq->lock)
767{
768 struct rq *rq;
769
770 for (;;) {
771 local_irq_save(*flags);
772 rq = task_rq(p);
773 spin_lock(&rq->lock);
774 if (likely(rq == task_rq(p)))
775 return rq;
776 spin_unlock_irqrestore(&rq->lock, *flags);
777 }
778}
779
780static void __task_rq_unlock(struct rq *rq)
781 __releases(rq->lock)
782{
783 spin_unlock(&rq->lock);
784}
785
786static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
787 __releases(rq->lock)
788{
789 spin_unlock_irqrestore(&rq->lock, *flags);
790}
791
792
793
794
795static struct rq *this_rq_lock(void)
796 __acquires(rq->lock)
797{
798 struct rq *rq;
799
800 local_irq_disable();
801 rq = this_rq();
802 spin_lock(&rq->lock);
803
804 return rq;
805}
806
807
808
809
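/*
 * We are going deep-idle (irqs are disabled):
 */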
810void sched_clock_idle_sleep_event(void)
811{
812 struct rq *rq = cpu_rq(smp_processor_id());
813
814 spin_lock(&rq->lock);
815 __update_rq_clock(rq);
816 spin_unlock(&rq->lock);
817 rq->clock_deep_idle_events++;
818}
819EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
820
821
822
823
824void sched_clock_idle_wakeup_event(u64 delta_ns)
825{
826 struct rq *rq = cpu_rq(smp_processor_id());
827 u64 now = sched_clock();
828
829 rq->idle_clock += delta_ns;
830
831
832
833
834
835
836 spin_lock(&rq->lock);
837 rq->prev_clock_raw = now;
838 rq->clock += delta_ns;
839 spin_unlock(&rq->lock);
840 touch_softlockup_watchdog();
841}
842EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
843
844static void __resched_task(struct task_struct *p, int tif_bit);
845
846static inline void resched_task(struct task_struct *p)
847{
848 __resched_task(p, TIF_NEED_RESCHED);
849}
850
851#ifdef CONFIG_SCHED_HRTICK
852
853
854
855
856
857
858
859
860
861
862static inline void resched_hrt(struct task_struct *p)
863{
864 __resched_task(p, TIF_HRTICK_RESCHED);
865}
866
867static inline void resched_rq(struct rq *rq)
868{
869 unsigned long flags;
870
871 spin_lock_irqsave(&rq->lock, flags);
872 resched_task(rq->curr);
873 spin_unlock_irqrestore(&rq->lock, flags);
874}
875
876enum {
877 HRTICK_SET,
878 HRTICK_RESET,
879 HRTICK_BLOCK,
880};
881
882
883
884
885
886
887static inline int hrtick_enabled(struct rq *rq)
888{
889 if (!sched_feat(HRTICK))
890 return 0;
891 if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags)))
892 return 0;
893 return hrtimer_is_hres_active(&rq->hrtick_timer);
894}
895
896
897
898
899
900
901static void hrtick_start(struct rq *rq, u64 delay, int reset)
902{
903 assert_spin_locked(&rq->lock);
904
905
906
907
908 rq->hrtick_expire =
909 ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
910
911
912
913 __set_bit(HRTICK_SET, &rq->hrtick_flags);
914 if (reset)
915 __set_bit(HRTICK_RESET, &rq->hrtick_flags);
916
917
918
919
920
921 if (reset)
922 resched_hrt(rq->curr);
923}
924
925static void hrtick_clear(struct rq *rq)
926{
927 if (hrtimer_active(&rq->hrtick_timer))
928 hrtimer_cancel(&rq->hrtick_timer);
929}
930
931
932
933
934static void hrtick_set(struct rq *rq)
935{
936 ktime_t time;
937 int set, reset;
938 unsigned long flags;
939
940 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
941
942 spin_lock_irqsave(&rq->lock, flags);
943 set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
944 reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
945 time = rq->hrtick_expire;
946 clear_thread_flag(TIF_HRTICK_RESCHED);
947 spin_unlock_irqrestore(&rq->lock, flags);
948
949 if (set) {
950 hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
951 if (reset && !hrtimer_active(&rq->hrtick_timer))
952 resched_rq(rq);
953 } else
954 hrtick_clear(rq);
955}
956
957
958
959
960
961static enum hrtimer_restart hrtick(struct hrtimer *timer)
962{
963 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
964
965 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
966
967 spin_lock(&rq->lock);
968 __update_rq_clock(rq);
969 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
970 spin_unlock(&rq->lock);
971
972 return HRTIMER_NORESTART;
973}
974
975static void hotplug_hrtick_disable(int cpu)
976{
977 struct rq *rq = cpu_rq(cpu);
978 unsigned long flags;
979
980 spin_lock_irqsave(&rq->lock, flags);
981 rq->hrtick_flags = 0;
982 __set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
983 spin_unlock_irqrestore(&rq->lock, flags);
984
985 hrtick_clear(rq);
986}
987
988static void hotplug_hrtick_enable(int cpu)
989{
990 struct rq *rq = cpu_rq(cpu);
991 unsigned long flags;
992
993 spin_lock_irqsave(&rq->lock, flags);
994 __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags);
995 spin_unlock_irqrestore(&rq->lock, flags);
996}
997
998static int
999hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1000{
1001 int cpu = (int)(long)hcpu;
1002
1003 switch (action) {
1004 case CPU_UP_CANCELED:
1005 case CPU_UP_CANCELED_FROZEN:
1006 case CPU_DOWN_PREPARE:
1007 case CPU_DOWN_PREPARE_FROZEN:
1008 case CPU_DEAD:
1009 case CPU_DEAD_FROZEN:
1010 hotplug_hrtick_disable(cpu);
1011 return NOTIFY_OK;
1012
1013 case CPU_UP_PREPARE:
1014 case CPU_UP_PREPARE_FROZEN:
1015 case CPU_DOWN_FAILED:
1016 case CPU_DOWN_FAILED_FROZEN:
1017 case CPU_ONLINE:
1018 case CPU_ONLINE_FROZEN:
1019 hotplug_hrtick_enable(cpu);
1020 return NOTIFY_OK;
1021 }
1022
1023 return NOTIFY_DONE;
1024}
1025
1026static void init_hrtick(void)
1027{
1028 hotcpu_notifier(hotplug_hrtick, 0);
1029}
1030
1031static void init_rq_hrtick(struct rq *rq)
1032{
1033 rq->hrtick_flags = 0;
1034 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1035 rq->hrtick_timer.function = hrtick;
1036 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1037}
1038
1039void hrtick_resched(void)
1040{
1041 struct rq *rq;
1042 unsigned long flags;
1043
1044 if (!test_thread_flag(TIF_HRTICK_RESCHED))
1045 return;
1046
1047 local_irq_save(flags);
1048 rq = cpu_rq(smp_processor_id());
1049 hrtick_set(rq);
1050 local_irq_restore(flags);
1051}
1052#else
1053static inline void hrtick_clear(struct rq *rq)
1054{
1055}
1056
1057static inline void hrtick_set(struct rq *rq)
1058{
1059}
1060
1061static inline void init_rq_hrtick(struct rq *rq)
1062{
1063}
1064
1065void hrtick_resched(void)
1066{
1067}
1068
1069static inline void init_hrtick(void)
1070{
1071}
1072#endif
1073
1074
1075
1076
1077
1078
1079
1080
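/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */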
1081#ifdef CONFIG_SMP
1082
1083#ifndef tsk_is_polling
1084#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1085#endif
1086
1087static void __resched_task(struct task_struct *p, int tif_bit)
1088{
1089 int cpu;
1090
1091 assert_spin_locked(&task_rq(p)->lock);
1092
1093 if (unlikely(test_tsk_thread_flag(p, tif_bit)))
1094 return;
1095
1096 set_tsk_thread_flag(p, tif_bit);
1097
1098 cpu = task_cpu(p);
1099 if (cpu == smp_processor_id())
1100 return;
1101
1102
1103 smp_mb();
1104 if (!tsk_is_polling(p))
1105 smp_send_reschedule(cpu);
1106}
1107
1108static void resched_cpu(int cpu)
1109{
1110 struct rq *rq = cpu_rq(cpu);
1111 unsigned long flags;
1112
1113 if (!spin_trylock_irqsave(&rq->lock, flags))
1114 return;
1115 resched_task(cpu_curr(cpu));
1116 spin_unlock_irqrestore(&rq->lock, flags);
1117}
1118
1119#ifdef CONFIG_NO_HZ
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130void wake_up_idle_cpu(int cpu)
1131{
1132 struct rq *rq = cpu_rq(cpu);
1133
1134 if (cpu == smp_processor_id())
1135 return;
1136
1137
1138
1139
1140
1141
1142
1143
1144 if (rq->curr != rq->idle)
1145 return;
1146
1147
1148
1149
1150
1151
1152 set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
1153
1154
1155 smp_mb();
1156 if (!tsk_is_polling(rq->idle))
1157 smp_send_reschedule(cpu);
1158}
1159#endif
1160
1161#else
1162static void __resched_task(struct task_struct *p, int tif_bit)
1163{
1164 assert_spin_locked(&task_rq(p)->lock);
1165 set_tsk_thread_flag(p, tif_bit);
1166}
1167#endif
1168
1169#if BITS_PER_LONG == 32
1170# define WMULT_CONST (~0UL)
1171#else
1172# define WMULT_CONST (1UL << 32)
1173#endif
1174
1175#define WMULT_SHIFT 32
1176
1177
1178
1179
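/*
 * Shift right and round:
 */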
1180#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1181
1182static unsigned long
1183calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1184 struct load_weight *lw)
1185{
1186 u64 tmp;
1187
1188 if (unlikely(!lw->inv_weight))
1189 lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1);
1190
1191 tmp = (u64)delta_exec * weight;
1192
1193
1194
1195 if (unlikely(tmp > WMULT_CONST))
1196 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1197 WMULT_SHIFT/2);
1198 else
1199 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1200
1201 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1202}
1203
1204static inline unsigned long
1205calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
1206{
1207 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
1208}
1209
1210static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1211{
1212 lw->weight += inc;
1213 lw->inv_weight = 0;
1214}
1215
1216static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1217{
1218 lw->weight -= dec;
1219 lw->inv_weight = 0;
1220}
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231#define WEIGHT_IDLEPRIO 2
1232#define WMULT_IDLEPRIO (1 << 31)
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
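/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 */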
1246static const int prio_to_weight[40] = {
1247 88761, 71755, 56483, 46273, 36291,
1248 29154, 23254, 18705, 14949, 11916,
1249 9548, 7620, 6100, 4904, 3906,
1250 3121, 2501, 1991, 1586, 1277,
1251 1024, 820, 655, 526, 423,
1252 335, 272, 215, 172, 137,
1253 110, 87, 70, 56, 45,
1254 36, 29, 23, 18, 15,
1255};
1256
1257
1258
1259
1260
1261
1262
1263
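/*
 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications.
 */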
1264static const u32 prio_to_wmult[40] = {
1265 48388, 59856, 76040, 92818, 118348,
1266 147320, 184698, 229616, 287308, 360437,
1267 449829, 563644, 704093, 875809, 1099582,
1268 1376151, 1717300, 2157191, 2708050, 3363326,
1269 4194304, 5237765, 6557202, 8165337, 10153587,
1270 12820798, 15790321, 19976592, 24970740, 31350126,
1271 39045157, 49367440, 61356676, 76695844, 95443717,
1272 119304647, 148102320, 186737708, 238609294, 286331153,
1273};
1274
1275static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1276
1277
1278
1279
1280
1281
1282struct rq_iterator {
1283 void *arg;
1284 struct task_struct *(*start)(void *);
1285 struct task_struct *(*next)(void *);
1286};
1287
1288#ifdef CONFIG_SMP
1289static unsigned long
1290balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1291 unsigned long max_load_move, struct sched_domain *sd,
1292 enum cpu_idle_type idle, int *all_pinned,
1293 int *this_best_prio, struct rq_iterator *iterator);
1294
1295static int
1296iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1297 struct sched_domain *sd, enum cpu_idle_type idle,
1298 struct rq_iterator *iterator);
1299#endif
1300
1301#ifdef CONFIG_CGROUP_CPUACCT
1302static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1303#else
1304static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1305#endif
1306
1307#ifdef CONFIG_SMP
1308static unsigned long source_load(int cpu, int type);
1309static unsigned long target_load(int cpu, int type);
1310static unsigned long cpu_avg_load_per_task(int cpu);
1311static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1312#endif
1313
1314#include "sched_stats.h"
1315#include "sched_idletask.c"
1316#include "sched_fair.c"
1317#include "sched_rt.c"
1318#ifdef CONFIG_SCHED_DEBUG
1319# include "sched_debug.c"
1320#endif
1321
1322#define sched_class_highest (&rt_sched_class)
1323
1324static inline void inc_load(struct rq *rq, const struct task_struct *p)
1325{
1326 update_load_add(&rq->load, p->se.load.weight);
1327}
1328
1329static inline void dec_load(struct rq *rq, const struct task_struct *p)
1330{
1331 update_load_sub(&rq->load, p->se.load.weight);
1332}
1333
1334static void inc_nr_running(struct task_struct *p, struct rq *rq)
1335{
1336 rq->nr_running++;
1337 inc_load(rq, p);
1338}
1339
1340static void dec_nr_running(struct task_struct *p, struct rq *rq)
1341{
1342 rq->nr_running--;
1343 dec_load(rq, p);
1344}
1345
1346static void set_load_weight(struct task_struct *p)
1347{
1348 if (task_has_rt_policy(p)) {
1349 p->se.load.weight = prio_to_weight[0] * 2;
1350 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1351 return;
1352 }
1353
1354
1355
1356
1357 if (p->policy == SCHED_IDLE) {
1358 p->se.load.weight = WEIGHT_IDLEPRIO;
1359 p->se.load.inv_weight = WMULT_IDLEPRIO;
1360 return;
1361 }
1362
1363 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1364 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1365}
1366
1367static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1368{
1369 sched_info_queued(p);
1370 p->sched_class->enqueue_task(rq, p, wakeup);
1371 p->se.on_rq = 1;
1372}
1373
1374static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1375{
1376 p->sched_class->dequeue_task(rq, p, sleep);
1377 p->se.on_rq = 0;
1378}
1379
1380
1381
1382
1383static inline int __normal_prio(struct task_struct *p)
1384{
1385 return p->static_prio;
1386}
1387
1388
1389
1390
1391
1392
1393
1394
1395static inline int normal_prio(struct task_struct *p)
1396{
1397 int prio;
1398
1399 if (task_has_rt_policy(p))
1400 prio = MAX_RT_PRIO-1 - p->rt_priority;
1401 else
1402 prio = __normal_prio(p);
1403 return prio;
1404}
1405
1406
1407
1408
1409
1410
1411
1412
1413static int effective_prio(struct task_struct *p)
1414{
1415 p->normal_prio = normal_prio(p);
1416
1417
1418
1419
1420
1421 if (!rt_prio(p->prio))
1422 return p->normal_prio;
1423 return p->prio;
1424}
1425
1426
1427
1428
1429static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1430{
1431 if (task_contributes_to_load(p))
1432 rq->nr_uninterruptible--;
1433
1434 enqueue_task(rq, p, wakeup);
1435 inc_nr_running(p, rq);
1436}
1437
1438
1439
1440
1441static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1442{
1443 if (task_contributes_to_load(p))
1444 rq->nr_uninterruptible++;
1445
1446 dequeue_task(rq, p, sleep);
1447 dec_nr_running(p, rq);
1448}
1449
1450
1451
1452
1453
1454inline int task_curr(const struct task_struct *p)
1455{
1456 return cpu_curr(task_cpu(p)) == p;
1457}
1458
1459
1460unsigned long weighted_cpuload(const int cpu)
1461{
1462 return cpu_rq(cpu)->load.weight;
1463}
1464
1465static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1466{
1467 set_task_rq(p, cpu);
1468#ifdef CONFIG_SMP
1469
1470
1471
1472
1473
1474 smp_wmb();
1475 task_thread_info(p)->cpu = cpu;
1476#endif
1477}
1478
1479static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1480 const struct sched_class *prev_class,
1481 int oldprio, int running)
1482{
1483 if (prev_class != p->sched_class) {
1484 if (prev_class->switched_from)
1485 prev_class->switched_from(rq, p, running);
1486 p->sched_class->switched_to(rq, p, running);
1487 } else
1488 p->sched_class->prio_changed(rq, p, oldprio, running);
1489}
1490
1491#ifdef CONFIG_SMP
1492
1493
1494
1495
1496static int
1497task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1498{
1499 s64 delta;
1500
1501
1502
1503
1504 if (&p->se == cfs_rq_of(&p->se)->next)
1505 return 1;
1506
1507 if (p->sched_class != &fair_sched_class)
1508 return 0;
1509
1510 if (sysctl_sched_migration_cost == -1)
1511 return 1;
1512 if (sysctl_sched_migration_cost == 0)
1513 return 0;
1514
1515 delta = now - p->se.exec_start;
1516
1517 return delta < (s64)sysctl_sched_migration_cost;
1518}
1519
1520
1521void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1522{
1523 int old_cpu = task_cpu(p);
1524 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1525 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
1526 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
1527 u64 clock_offset;
1528
1529 clock_offset = old_rq->clock - new_rq->clock;
1530
1531#ifdef CONFIG_SCHEDSTATS
1532 if (p->se.wait_start)
1533 p->se.wait_start -= clock_offset;
1534 if (p->se.sleep_start)
1535 p->se.sleep_start -= clock_offset;
1536 if (p->se.block_start)
1537 p->se.block_start -= clock_offset;
1538 if (old_cpu != new_cpu) {
1539 schedstat_inc(p, se.nr_migrations);
1540 if (task_hot(p, old_rq->clock, NULL))
1541 schedstat_inc(p, se.nr_forced2_migrations);
1542 }
1543#endif
1544 p->se.vruntime -= old_cfsrq->min_vruntime -
1545 new_cfsrq->min_vruntime;
1546
1547 __set_task_cpu(p, new_cpu);
1548}
1549
1550struct migration_req {
1551 struct list_head list;
1552
1553 struct task_struct *task;
1554 int dest_cpu;
1555
1556 struct completion done;
1557};
1558
1559
1560
1561
1562
1563static int
1564migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1565{
1566 struct rq *rq = task_rq(p);
1567
1568
1569
1570
1571
1572 if (!p->se.on_rq && !task_running(rq, p)) {
1573 set_task_cpu(p, dest_cpu);
1574 return 0;
1575 }
1576
1577 init_completion(&req->done);
1578 req->task = p;
1579 req->dest_cpu = dest_cpu;
1580 list_add(&req->list, &rq->migration_queue);
1581
1582 return 1;
1583}
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
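/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a long time. This function can't be
 * called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */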
1594void wait_task_inactive(struct task_struct *p)
1595{
1596 unsigned long flags;
1597 int running, on_rq;
1598 struct rq *rq;
1599
1600 for (;;) {
1601
1602
1603
1604
1605
1606
1607 rq = task_rq(p);
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620 while (task_running(rq, p))
1621 cpu_relax();
1622
1623
1624
1625
1626
1627
1628 rq = task_rq_lock(p, &flags);
1629 running = task_running(rq, p);
1630 on_rq = p->se.on_rq;
1631 task_rq_unlock(rq, &flags);
1632
1633
1634
1635
1636
1637
1638
1639 if (unlikely(running)) {
1640 cpu_relax();
1641 continue;
1642 }
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653 if (unlikely(on_rq)) {
1654 schedule_timeout_uninterruptible(1);
1655 continue;
1656 }
1657
1658
1659
1660
1661
1662
1663 break;
1664 }
1665}
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
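/*
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 */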
1680void kick_process(struct task_struct *p)
1681{
1682 int cpu;
1683
1684 preempt_disable();
1685 cpu = task_cpu(p);
1686 if ((cpu != smp_processor_id()) && task_curr(p))
1687 smp_send_reschedule(cpu);
1688 preempt_enable();
1689}
1690
1691
1692
1693
1694
1695
1696
1697
1698static unsigned long source_load(int cpu, int type)
1699{
1700 struct rq *rq = cpu_rq(cpu);
1701 unsigned long total = weighted_cpuload(cpu);
1702
1703 if (type == 0)
1704 return total;
1705
1706 return min(rq->cpu_load[type-1], total);
1707}
1708
1709
1710
1711
1712
1713static unsigned long target_load(int cpu, int type)
1714{
1715 struct rq *rq = cpu_rq(cpu);
1716 unsigned long total = weighted_cpuload(cpu);
1717
1718 if (type == 0)
1719 return total;
1720
1721 return max(rq->cpu_load[type-1], total);
1722}
1723
1724
1725
1726
1727static unsigned long cpu_avg_load_per_task(int cpu)
1728{
1729 struct rq *rq = cpu_rq(cpu);
1730 unsigned long total = weighted_cpuload(cpu);
1731 unsigned long n = rq->nr_running;
1732
1733 return n ? total / n : SCHED_LOAD_SCALE;
1734}
1735
1736
1737
1738
1739
1740static struct sched_group *
1741find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1742{
1743 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1744 unsigned long min_load = ULONG_MAX, this_load = 0;
1745 int load_idx = sd->forkexec_idx;
1746 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1747
1748 do {
1749 unsigned long load, avg_load;
1750 int local_group;
1751 int i;
1752
1753
1754 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1755 continue;
1756
1757 local_group = cpu_isset(this_cpu, group->cpumask);
1758
1759
1760 avg_load = 0;
1761
1762 for_each_cpu_mask(i, group->cpumask) {
1763
1764 if (local_group)
1765 load = source_load(i, load_idx);
1766 else
1767 load = target_load(i, load_idx);
1768
1769 avg_load += load;
1770 }
1771
1772
1773 avg_load = sg_div_cpu_power(group,
1774 avg_load * SCHED_LOAD_SCALE);
1775
1776 if (local_group) {
1777 this_load = avg_load;
1778 this = group;
1779 } else if (avg_load < min_load) {
1780 min_load = avg_load;
1781 idlest = group;
1782 }
1783 } while (group = group->next, group != sd->groups);
1784
1785 if (!idlest || 100*this_load < imbalance*min_load)
1786 return NULL;
1787 return idlest;
1788}
1789
1790
1791
1792
1793static int
1794find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1795{
1796 cpumask_t tmp;
1797 unsigned long load, min_load = ULONG_MAX;
1798 int idlest = -1;
1799 int i;
1800
1801
1802 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1803
1804 for_each_cpu_mask(i, tmp) {
1805 load = weighted_cpuload(i);
1806
1807 if (load < min_load || (load == min_load && i == this_cpu)) {
1808 min_load = load;
1809 idlest = i;
1810 }
1811 }
1812
1813 return idlest;
1814}
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827static int sched_balance_self(int cpu, int flag)
1828{
1829 struct task_struct *t = current;
1830 struct sched_domain *tmp, *sd = NULL;
1831
1832 for_each_domain(cpu, tmp) {
1833
1834
1835
1836 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1837 break;
1838 if (tmp->flags & flag)
1839 sd = tmp;
1840 }
1841
1842 while (sd) {
1843 cpumask_t span;
1844 struct sched_group *group;
1845 int new_cpu, weight;
1846
1847 if (!(sd->flags & flag)) {
1848 sd = sd->child;
1849 continue;
1850 }
1851
1852 span = sd->span;
1853 group = find_idlest_group(sd, t, cpu);
1854 if (!group) {
1855 sd = sd->child;
1856 continue;
1857 }
1858
1859 new_cpu = find_idlest_cpu(group, t, cpu);
1860 if (new_cpu == -1 || new_cpu == cpu) {
1861
1862 sd = sd->child;
1863 continue;
1864 }
1865
1866
1867 cpu = new_cpu;
1868 sd = NULL;
1869 weight = cpus_weight(span);
1870 for_each_domain(cpu, tmp) {
1871 if (weight <= cpus_weight(tmp->span))
1872 break;
1873 if (tmp->flags & flag)
1874 sd = tmp;
1875 }
1876
1877 }
1878
1879 return cpu;
1880}
1881
1882#endif
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
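/*
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
 * @state: the mask of task states that can be woken
 * @sync: do a synchronous wakeup?
 *
 * Put it on the run-queue if it's not already there.
 *
 * Returns failure only if the task is already active.
 */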
1898static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1899{
1900 int cpu, orig_cpu, this_cpu, success = 0;
1901 unsigned long flags;
1902 long old_state;
1903 struct rq *rq;
1904
1905 smp_wmb();
1906 rq = task_rq_lock(p, &flags);
1907 old_state = p->state;
1908 if (!(old_state & state))
1909 goto out;
1910
1911 if (p->se.on_rq)
1912 goto out_running;
1913
1914 cpu = task_cpu(p);
1915 orig_cpu = cpu;
1916 this_cpu = smp_processor_id();
1917
1918#ifdef CONFIG_SMP
1919 if (unlikely(task_running(rq, p)))
1920 goto out_activate;
1921
1922 cpu = p->sched_class->select_task_rq(p, sync);
1923 if (cpu != orig_cpu) {
1924 set_task_cpu(p, cpu);
1925 task_rq_unlock(rq, &flags);
1926
1927 rq = task_rq_lock(p, &flags);
1928 old_state = p->state;
1929 if (!(old_state & state))
1930 goto out;
1931 if (p->se.on_rq)
1932 goto out_running;
1933
1934 this_cpu = smp_processor_id();
1935 cpu = task_cpu(p);
1936 }
1937
1938#ifdef CONFIG_SCHEDSTATS
1939 schedstat_inc(rq, ttwu_count);
1940 if (cpu == this_cpu)
1941 schedstat_inc(rq, ttwu_local);
1942 else {
1943 struct sched_domain *sd;
1944 for_each_domain(this_cpu, sd) {
1945 if (cpu_isset(cpu, sd->span)) {
1946 schedstat_inc(sd, ttwu_wake_remote);
1947 break;
1948 }
1949 }
1950 }
1951#endif
1952
1953out_activate:
1954#endif
1955 schedstat_inc(p, se.nr_wakeups);
1956 if (sync)
1957 schedstat_inc(p, se.nr_wakeups_sync);
1958 if (orig_cpu != cpu)
1959 schedstat_inc(p, se.nr_wakeups_migrate);
1960 if (cpu == this_cpu)
1961 schedstat_inc(p, se.nr_wakeups_local);
1962 else
1963 schedstat_inc(p, se.nr_wakeups_remote);
1964 update_rq_clock(rq);
1965 activate_task(rq, p, 1);
1966 success = 1;
1967
1968out_running:
1969 check_preempt_curr(rq, p);
1970
1971 p->state = TASK_RUNNING;
1972#ifdef CONFIG_SMP
1973 if (p->sched_class->task_wake_up)
1974 p->sched_class->task_wake_up(rq, p);
1975#endif
1976out:
1977 task_rq_unlock(rq, &flags);
1978
1979 return success;
1980}
1981
1982int wake_up_process(struct task_struct *p)
1983{
1984 return try_to_wake_up(p, TASK_ALL, 0);
1985}
1986EXPORT_SYMBOL(wake_up_process);
1987
1988int wake_up_state(struct task_struct *p, unsigned int state)
1989{
1990 return try_to_wake_up(p, state, 0);
1991}
1992
1993
1994
1995
1996
1997
1998
1999static void __sched_fork(struct task_struct *p)
2000{
2001 p->se.exec_start = 0;
2002 p->se.sum_exec_runtime = 0;
2003 p->se.prev_sum_exec_runtime = 0;
2004 p->se.last_wakeup = 0;
2005 p->se.avg_overlap = 0;
2006
2007#ifdef CONFIG_SCHEDSTATS
2008 p->se.wait_start = 0;
2009 p->se.sum_sleep_runtime = 0;
2010 p->se.sleep_start = 0;
2011 p->se.block_start = 0;
2012 p->se.sleep_max = 0;
2013 p->se.block_max = 0;
2014 p->se.exec_max = 0;
2015 p->se.slice_max = 0;
2016 p->se.wait_max = 0;
2017#endif
2018
2019 INIT_LIST_HEAD(&p->rt.run_list);
2020 p->se.on_rq = 0;
2021
2022#ifdef CONFIG_PREEMPT_NOTIFIERS
2023 INIT_HLIST_HEAD(&p->preempt_notifiers);
2024#endif
2025
2026
2027
2028
2029
2030
2031
2032 p->state = TASK_RUNNING;
2033}
2034
2035
2036
2037
2038void sched_fork(struct task_struct *p, int clone_flags)
2039{
2040 int cpu = get_cpu();
2041
2042 __sched_fork(p);
2043
2044#ifdef CONFIG_SMP
2045 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2046#endif
2047 set_task_cpu(p, cpu);
2048
2049
2050
2051
2052 p->prio = current->normal_prio;
2053 if (!rt_prio(p->prio))
2054 p->sched_class = &fair_sched_class;
2055
2056#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2057 if (likely(sched_info_on()))
2058 memset(&p->sched_info, 0, sizeof(p->sched_info));
2059#endif
2060#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2061 p->oncpu = 0;
2062#endif
2063#ifdef CONFIG_PREEMPT
2064
2065 task_thread_info(p)->preempt_count = 1;
2066#endif
2067 put_cpu();
2068}
2069
2070
2071
2072
2073
2074
2075
2076
2077void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2078{
2079 unsigned long flags;
2080 struct rq *rq;
2081
2082 rq = task_rq_lock(p, &flags);
2083 BUG_ON(p->state != TASK_RUNNING);
2084 update_rq_clock(rq);
2085
2086 p->prio = effective_prio(p);
2087
2088 if (!p->sched_class->task_new || !current->se.on_rq) {
2089 activate_task(rq, p, 0);
2090 } else {
2091
2092
2093
2094
2095 p->sched_class->task_new(rq, p);
2096 inc_nr_running(p, rq);
2097 }
2098 check_preempt_curr(rq, p);
2099#ifdef CONFIG_SMP
2100 if (p->sched_class->task_wake_up)
2101 p->sched_class->task_wake_up(rq, p);
2102#endif
2103 task_rq_unlock(rq, &flags);
2104}
2105
2106#ifdef CONFIG_PREEMPT_NOTIFIERS
2107
2108
2109
2110
2111
2112void preempt_notifier_register(struct preempt_notifier *notifier)
2113{
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
2115}
2116EXPORT_SYMBOL_GPL(preempt_notifier_register);
2117
2118
2119
2120
2121
2122
2123
2124void preempt_notifier_unregister(struct preempt_notifier *notifier)
2125{
	hlist_del(&notifier->link);
2127}
2128EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2129
2130static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2131{
2132 struct preempt_notifier *notifier;
2133 struct hlist_node *node;
2134
2135 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2136 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2137}
2138
2139static void
2140fire_sched_out_preempt_notifiers(struct task_struct *curr,
2141 struct task_struct *next)
2142{
2143 struct preempt_notifier *notifier;
2144 struct hlist_node *node;
2145
2146 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2147 notifier->ops->sched_out(notifier, next);
2148}
2149
2150#else
2151
2152static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2153{
2154}
2155
2156static void
2157fire_sched_out_preempt_notifiers(struct task_struct *curr,
2158 struct task_struct *next)
2159{
2160}
2161
2162#endif
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177static inline void
2178prepare_task_switch(struct rq *rq, struct task_struct *prev,
2179 struct task_struct *next)
2180{
2181 fire_sched_out_preempt_notifiers(prev, next);
2182 prepare_lock_switch(rq, next);
2183 prepare_arch_switch(next);
2184}
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
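/*
 * finish_task_switch - clean up after a task-switch
 * @rq: runqueue associated with task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
 * It reconciles locking set up by prepare_task_switch and does any
 * other architecture-specific cleanup actions.
 */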
2201static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2202 __releases(rq->lock)
2203{
2204 struct mm_struct *mm = rq->prev_mm;
2205 long prev_state;
2206
2207 rq->prev_mm = NULL;
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220 prev_state = prev->state;
2221 finish_arch_switch(prev);
2222 finish_lock_switch(rq, prev);
2223#ifdef CONFIG_SMP
2224 if (current->sched_class->post_schedule)
2225 current->sched_class->post_schedule(rq);
2226#endif
2227
2228 fire_sched_in_preempt_notifiers(current);
2229 if (mm)
2230 mmdrop(mm);
2231 if (unlikely(prev_state == TASK_DEAD)) {
2232
2233
2234
2235
2236 kprobe_flush_task(prev);
2237 put_task_struct(prev);
2238 }
2239}
2240
2241
2242
2243
2244
2245asmlinkage void schedule_tail(struct task_struct *prev)
2246 __releases(rq->lock)
2247{
2248 struct rq *rq = this_rq();
2249
2250 finish_task_switch(rq, prev);
2251#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2252
2253 preempt_enable();
2254#endif
2255 if (current->set_child_tid)
2256 put_user(task_pid_vnr(current), current->set_child_tid);
2257}
2258
2259
2260
2261
2262
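/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */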
2263static inline void
2264context_switch(struct rq *rq, struct task_struct *prev,
2265 struct task_struct *next)
2266{
2267 struct mm_struct *mm, *oldmm;
2268
2269 prepare_task_switch(rq, prev, next);
2270 mm = next->mm;
2271 oldmm = prev->active_mm;
2272
2273
2274
2275
2276
2277 arch_enter_lazy_cpu_mode();
2278
2279 if (unlikely(!mm)) {
2280 next->active_mm = oldmm;
2281 atomic_inc(&oldmm->mm_count);
2282 enter_lazy_tlb(oldmm, next);
2283 } else
2284 switch_mm(oldmm, mm, next);
2285
2286 if (unlikely(!prev->mm)) {
2287 prev->active_mm = NULL;
2288 rq->prev_mm = oldmm;
2289 }
2290
2291
2292
2293
2294
2295
2296#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2297 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2298#endif
2299
2300
2301 switch_to(prev, next, prev);
2302
2303 barrier();
2304
2305
2306
2307
2308
2309 finish_task_switch(this_rq(), prev);
2310}
2311
2312
2313
2314
2315
2316
2317
2318
2319unsigned long nr_running(void)
2320{
2321 unsigned long i, sum = 0;
2322
2323 for_each_online_cpu(i)
2324 sum += cpu_rq(i)->nr_running;
2325
2326 return sum;
2327}
2328
2329unsigned long nr_uninterruptible(void)
2330{
2331 unsigned long i, sum = 0;
2332
2333 for_each_possible_cpu(i)
2334 sum += cpu_rq(i)->nr_uninterruptible;
2335
2336
2337
2338
2339
2340 if (unlikely((long)sum < 0))
2341 sum = 0;
2342
2343 return sum;
2344}
2345
2346unsigned long long nr_context_switches(void)
2347{
2348 int i;
2349 unsigned long long sum = 0;
2350
2351 for_each_possible_cpu(i)
2352 sum += cpu_rq(i)->nr_switches;
2353
2354 return sum;
2355}
2356
2357unsigned long nr_iowait(void)
2358{
2359 unsigned long i, sum = 0;
2360
2361 for_each_possible_cpu(i)
2362 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2363
2364 return sum;
2365}
2366
2367unsigned long nr_active(void)
2368{
2369 unsigned long i, running = 0, uninterruptible = 0;
2370
2371 for_each_online_cpu(i) {
2372 running += cpu_rq(i)->nr_running;
2373 uninterruptible += cpu_rq(i)->nr_uninterruptible;
2374 }
2375
2376 if (unlikely((long)uninterruptible < 0))
2377 uninterruptible = 0;
2378
2379 return running + uninterruptible;
2380}
2381
2382
2383
2384
2385
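/*
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC).
 */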
2386static void update_cpu_load(struct rq *this_rq)
2387{
2388 unsigned long this_load = this_rq->load.weight;
2389 int i, scale;
2390
2391 this_rq->nr_load_updates++;
2392
2393
2394 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2395 unsigned long old_load, new_load;
2396
2397
2398
2399 old_load = this_rq->cpu_load[i];
2400 new_load = this_load;
2401
2402
2403
2404
2405
2406 if (new_load > old_load)
2407 new_load += scale-1;
2408 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2409 }
2410}
2411
2412#ifdef CONFIG_SMP
2413
2414
2415
2416
2417
2418
2419
2420static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2421 __acquires(rq1->lock)
2422 __acquires(rq2->lock)
2423{
2424 BUG_ON(!irqs_disabled());
2425 if (rq1 == rq2) {
2426 spin_lock(&rq1->lock);
2427 __acquire(rq2->lock);
2428 } else {
2429 if (rq1 < rq2) {
2430 spin_lock(&rq1->lock);
2431 spin_lock(&rq2->lock);
2432 } else {
2433 spin_lock(&rq2->lock);
2434 spin_lock(&rq1->lock);
2435 }
2436 }
2437 update_rq_clock(rq1);
2438 update_rq_clock(rq2);
2439}
2440
2441
2442
2443
2444
2445
2446
2447static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2448 __releases(rq1->lock)
2449 __releases(rq2->lock)
2450{
2451 spin_unlock(&rq1->lock);
2452 if (rq1 != rq2)
2453 spin_unlock(&rq2->lock);
2454 else
2455 __release(rq2->lock);
2456}
2457
2458
2459
2460
2461static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2462 __releases(this_rq->lock)
2463 __acquires(busiest->lock)
2464 __acquires(this_rq->lock)
2465{
2466 int ret = 0;
2467
2468 if (unlikely(!irqs_disabled())) {
2469
2470 spin_unlock(&this_rq->lock);
2471 BUG_ON(1);
2472 }
2473 if (unlikely(!spin_trylock(&busiest->lock))) {
2474 if (busiest < this_rq) {
2475 spin_unlock(&this_rq->lock);
2476 spin_lock(&busiest->lock);
2477 spin_lock(&this_rq->lock);
2478 ret = 1;
2479 } else
2480 spin_lock(&busiest->lock);
2481 }
2482 return ret;
2483}
2484
2485
2486
2487
2488
2489
2490
2491static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2492{
2493 struct migration_req req;
2494 unsigned long flags;
2495 struct rq *rq;
2496
2497 rq = task_rq_lock(p, &flags);
2498 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2499 || unlikely(cpu_is_offline(dest_cpu)))
2500 goto out;
2501
2502
2503 if (migrate_task(p, dest_cpu, &req)) {
2504
2505 struct task_struct *mt = rq->migration_thread;
2506
2507 get_task_struct(mt);
2508 task_rq_unlock(rq, &flags);
2509 wake_up_process(mt);
2510 put_task_struct(mt);
2511 wait_for_completion(&req.done);
2512
2513 return;
2514 }
2515out:
2516 task_rq_unlock(rq, &flags);
2517}
2518
2519
2520
2521
2522
2523void sched_exec(void)
2524{
2525 int new_cpu, this_cpu = get_cpu();
2526 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2527 put_cpu();
2528 if (new_cpu != this_cpu)
2529 sched_migrate_task(current, new_cpu);
2530}
2531
2532
2533
2534
2535
2536static void pull_task(struct rq *src_rq, struct task_struct *p,
2537 struct rq *this_rq, int this_cpu)
2538{
2539 deactivate_task(src_rq, p, 0);
2540 set_task_cpu(p, this_cpu);
2541 activate_task(this_rq, p, 0);
2542
2543
2544
2545
2546 check_preempt_curr(this_rq, p);
2547}
2548
2549
2550
2551
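/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */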
2552static
2553int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2554 struct sched_domain *sd, enum cpu_idle_type idle,
2555 int *all_pinned)
2556{
2557
2558
2559
2560
2561
2562
2563 if (!cpu_isset(this_cpu, p->cpus_allowed)) {
2564 schedstat_inc(p, se.nr_failed_migrations_affine);
2565 return 0;
2566 }
2567 *all_pinned = 0;
2568
2569 if (task_running(rq, p)) {
2570 schedstat_inc(p, se.nr_failed_migrations_running);
2571 return 0;
2572 }
2573
2574
2575
2576
2577
2578
2579
2580 if (!task_hot(p, rq->clock, sd) ||
2581 sd->nr_balance_failed > sd->cache_nice_tries) {
2582#ifdef CONFIG_SCHEDSTATS
2583 if (task_hot(p, rq->clock, sd)) {
2584 schedstat_inc(sd, lb_hot_gained[idle]);
2585 schedstat_inc(p, se.nr_forced_migrations);
2586 }
2587#endif
2588 return 1;
2589 }
2590
2591 if (task_hot(p, rq->clock, sd)) {
2592 schedstat_inc(p, se.nr_failed_migrations_hot);
2593 return 0;
2594 }
2595 return 1;
2596}
2597
2598static unsigned long
2599balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2600 unsigned long max_load_move, struct sched_domain *sd,
2601 enum cpu_idle_type idle, int *all_pinned,
2602 int *this_best_prio, struct rq_iterator *iterator)
2603{
2604 int loops = 0, pulled = 0, pinned = 0, skip_for_load;
2605 struct task_struct *p;
2606 long rem_load_move = max_load_move;
2607
2608 if (max_load_move == 0)
2609 goto out;
2610
2611 pinned = 1;
2612
2613
2614
2615
2616 p = iterator->start(iterator->arg);
2617next:
2618 if (!p || loops++ > sysctl_sched_nr_migrate)
2619 goto out;
2620
2621
2622
2623
2624
2625 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2626 SCHED_LOAD_SCALE_FUZZ;
2627 if ((skip_for_load && p->prio >= *this_best_prio) ||
2628 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2629 p = iterator->next(iterator->arg);
2630 goto next;
2631 }
2632
2633 pull_task(busiest, p, this_rq, this_cpu);
2634 pulled++;
2635 rem_load_move -= p->se.load.weight;
2636
2637
2638
2639
2640 if (rem_load_move > 0) {
2641 if (p->prio < *this_best_prio)
2642 *this_best_prio = p->prio;
2643 p = iterator->next(iterator->arg);
2644 goto next;
2645 }
2646out:
2647
2648
2649
2650
2651
2652 schedstat_add(sd, lb_gained[idle], pulled);
2653
2654 if (all_pinned)
2655 *all_pinned = pinned;
2656
2657 return max_load_move - rem_load_move;
2658}
2659
2660
2661
2662
2663
2664
2665
2666
2667static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2668 unsigned long max_load_move,
2669 struct sched_domain *sd, enum cpu_idle_type idle,
2670 int *all_pinned)
2671{
2672 const struct sched_class *class = sched_class_highest;
2673 unsigned long total_load_moved = 0;
2674 int this_best_prio = this_rq->curr->prio;
2675
2676 do {
2677 total_load_moved +=
2678 class->load_balance(this_rq, this_cpu, busiest,
2679 max_load_move - total_load_moved,
2680 sd, idle, all_pinned, &this_best_prio);
2681 class = class->next;
2682 } while (class && max_load_move > total_load_moved);
2683
2684 return total_load_moved > 0;
2685}
2686
2687static int
2688iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2689 struct sched_domain *sd, enum cpu_idle_type idle,
2690 struct rq_iterator *iterator)
2691{
2692 struct task_struct *p = iterator->start(iterator->arg);
2693 int pinned = 0;
2694
2695 while (p) {
2696 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2697 pull_task(busiest, p, this_rq, this_cpu);
2698
2699
2700
2701
2702
2703 schedstat_inc(sd, lb_gained[idle]);
2704
2705 return 1;
2706 }
2707 p = iterator->next(iterator->arg);
2708 }
2709
2710 return 0;
2711}
2712
2713
2714
2715
2716
2717
2718
2719
2720static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2721 struct sched_domain *sd, enum cpu_idle_type idle)
2722{
2723 const struct sched_class *class;
2724
2725 for (class = sched_class_highest; class; class = class->next)
2726 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
2727 return 1;
2728
2729 return 0;
2730}
2731
2732
2733
2734
2735
2736
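/*
 * find_busiest_group finds and returns the busiest CPU group within the
 * domain. It calculates and returns the amount of weighted load which
 * should be moved to restore balance via the imbalance parameter.
 */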
2737static struct sched_group *
2738find_busiest_group(struct sched_domain *sd, int this_cpu,
2739 unsigned long *imbalance, enum cpu_idle_type idle,
2740 int *sd_idle, cpumask_t *cpus, int *balance)
2741{
2742 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2743 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2744 unsigned long max_pull;
2745 unsigned long busiest_load_per_task, busiest_nr_running;
2746 unsigned long this_load_per_task, this_nr_running;
2747 int load_idx, group_imb = 0;
2748#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2749 int power_savings_balance = 1;
2750 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2751 unsigned long min_nr_running = ULONG_MAX;
2752 struct sched_group *group_min = NULL, *group_leader = NULL;
2753#endif
2754
2755 max_load = this_load = total_load = total_pwr = 0;
2756 busiest_load_per_task = busiest_nr_running = 0;
2757 this_load_per_task = this_nr_running = 0;
2758 if (idle == CPU_NOT_IDLE)
2759 load_idx = sd->busy_idx;
2760 else if (idle == CPU_NEWLY_IDLE)
2761 load_idx = sd->newidle_idx;
2762 else
2763 load_idx = sd->idle_idx;
2764
2765 do {
2766 unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
2767 int local_group;
2768 int i;
2769 int __group_imb = 0;
2770 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2771 unsigned long sum_nr_running, sum_weighted_load;
2772
2773 local_group = cpu_isset(this_cpu, group->cpumask);
2774
2775 if (local_group)
2776 balance_cpu = first_cpu(group->cpumask);
2777
2778
2779 sum_weighted_load = sum_nr_running = avg_load = 0;
2780 max_cpu_load = 0;
2781 min_cpu_load = ~0UL;
2782
2783 for_each_cpu_mask(i, group->cpumask) {
2784 struct rq *rq;
2785
2786 if (!cpu_isset(i, *cpus))
2787 continue;
2788
2789 rq = cpu_rq(i);
2790
2791 if (*sd_idle && rq->nr_running)
2792 *sd_idle = 0;
2793
2794
2795 if (local_group) {
2796 if (idle_cpu(i) && !first_idle_cpu) {
2797 first_idle_cpu = 1;
2798 balance_cpu = i;
2799 }
2800
2801 load = target_load(i, load_idx);
2802 } else {
2803 load = source_load(i, load_idx);
2804 if (load > max_cpu_load)
2805 max_cpu_load = load;
2806 if (min_cpu_load > load)
2807 min_cpu_load = load;
2808 }
2809
2810 avg_load += load;
2811 sum_nr_running += rq->nr_running;
2812 sum_weighted_load += weighted_cpuload(i);
2813 }
2814
2815
2816
2817
2818
2819
2820
2821 if (idle != CPU_NEWLY_IDLE && local_group &&
2822 balance_cpu != this_cpu && balance) {
2823 *balance = 0;
2824 goto ret;
2825 }
2826
2827 total_load += avg_load;
2828 total_pwr += group->__cpu_power;
2829
2830
2831 avg_load = sg_div_cpu_power(group,
2832 avg_load * SCHED_LOAD_SCALE);
2833
2834 if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
2835 __group_imb = 1;
2836
2837 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
2838
2839 if (local_group) {
2840 this_load = avg_load;
2841 this = group;
2842 this_nr_running = sum_nr_running;
2843 this_load_per_task = sum_weighted_load;
2844 } else if (avg_load > max_load &&
2845 (sum_nr_running > group_capacity || __group_imb)) {
2846 max_load = avg_load;
2847 busiest = group;
2848 busiest_nr_running = sum_nr_running;
2849 busiest_load_per_task = sum_weighted_load;
2850 group_imb = __group_imb;
2851 }
2852
2853#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2854
2855
2856
2857
2858 if (idle == CPU_NOT_IDLE ||
2859 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2860 goto group_next;
2861
2862
2863
2864
2865
2866 if (local_group && (this_nr_running >= group_capacity ||
2867 !this_nr_running))
2868 power_savings_balance = 0;
2869
2870
2871
2872
2873
2874 if (!power_savings_balance || sum_nr_running >= group_capacity
2875 || !sum_nr_running)
2876 goto group_next;
2877
2878
2879
2880
2881
2882
2883 if ((sum_nr_running < min_nr_running) ||
2884 (sum_nr_running == min_nr_running &&
2885 first_cpu(group->cpumask) <
2886 first_cpu(group_min->cpumask))) {
2887 group_min = group;
2888 min_nr_running = sum_nr_running;
2889 min_load_per_task = sum_weighted_load /
2890 sum_nr_running;
2891 }
2892
2893
2894
2895
2896
2897
2898 if (sum_nr_running <= group_capacity - 1) {
2899 if (sum_nr_running > leader_nr_running ||
2900 (sum_nr_running == leader_nr_running &&
2901 first_cpu(group->cpumask) >
2902 first_cpu(group_leader->cpumask))) {
2903 group_leader = group;
2904 leader_nr_running = sum_nr_running;
2905 }
2906 }
2907group_next:
2908#endif
2909 group = group->next;
2910 } while (group != sd->groups);
2911
2912 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2913 goto out_balanced;
2914
2915 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2916
2917 if (this_load >= avg_load ||
2918 100*max_load <= sd->imbalance_pct*this_load)
2919 goto out_balanced;
2920
2921 busiest_load_per_task /= busiest_nr_running;
2922 if (group_imb)
2923 busiest_load_per_task = min(busiest_load_per_task, avg_load);
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936 if (max_load <= busiest_load_per_task)
2937 goto out_balanced;
2938
2939
2940
2941
2942
2943
2944 if (max_load < avg_load) {
2945 *imbalance = 0;
2946 goto small_imbalance;
2947 }
2948
2949
2950 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2951
2952
2953 *imbalance = min(max_pull * busiest->__cpu_power,
2954 (avg_load - this_load) * this->__cpu_power)
2955 / SCHED_LOAD_SCALE;
2956
2957
2958
2959
2960
2961
2962
2963 if (*imbalance < busiest_load_per_task) {
2964 unsigned long tmp, pwr_now, pwr_move;
2965 unsigned int imbn;
2966
2967small_imbalance:
2968 pwr_move = pwr_now = 0;
2969 imbn = 2;
2970 if (this_nr_running) {
2971 this_load_per_task /= this_nr_running;
2972 if (busiest_load_per_task > this_load_per_task)
2973 imbn = 1;
2974 } else
2975 this_load_per_task = SCHED_LOAD_SCALE;
2976
2977 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2978 busiest_load_per_task * imbn) {
2979 *imbalance = busiest_load_per_task;
2980 return busiest;
2981 }
2982
2983
2984
2985
2986
2987
2988
2989 pwr_now += busiest->__cpu_power *
2990 min(busiest_load_per_task, max_load);
2991 pwr_now += this->__cpu_power *
2992 min(this_load_per_task, this_load);
2993 pwr_now /= SCHED_LOAD_SCALE;
2994
2995
2996 tmp = sg_div_cpu_power(busiest,
2997 busiest_load_per_task * SCHED_LOAD_SCALE);
2998 if (max_load > tmp)
2999 pwr_move += busiest->__cpu_power *
3000 min(busiest_load_per_task, max_load - tmp);
3001
3002
3003 if (max_load * busiest->__cpu_power <
3004 busiest_load_per_task * SCHED_LOAD_SCALE)
3005 tmp = sg_div_cpu_power(this,
3006 max_load * busiest->__cpu_power);
3007 else
3008 tmp = sg_div_cpu_power(this,
3009 busiest_load_per_task * SCHED_LOAD_SCALE);
3010 pwr_move += this->__cpu_power *
3011 min(this_load_per_task, this_load + tmp);
3012 pwr_move /= SCHED_LOAD_SCALE;
3013
3014
3015 if (pwr_move > pwr_now)
3016 *imbalance = busiest_load_per_task;
3017 }
3018
3019 return busiest;
3020
3021out_balanced:
3022#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3023 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3024 goto ret;
3025
3026 if (this == group_leader && group_leader != group_min) {
3027 *imbalance = min_load_per_task;
3028 return group_min;
3029 }
3030#endif
3031ret:
3032 *imbalance = 0;
3033 return NULL;
3034}
3035
3036
3037
3038
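/*
 * find_busiest_queue - find the busiest runqueue among the CPUs in @group,
 * skipping CPUs that are not in *cpus and runqueues whose single running
 * task already exceeds the requested imbalance.
 */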
3039static struct rq *
3040find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3041 unsigned long imbalance, cpumask_t *cpus)
3042{
3043 struct rq *busiest = NULL, *rq;
3044 unsigned long max_load = 0;
3045 int i;
3046
3047 for_each_cpu_mask(i, group->cpumask) {
3048 unsigned long wl;
3049
3050 if (!cpu_isset(i, *cpus))
3051 continue;
3052
3053 rq = cpu_rq(i);
3054 wl = weighted_cpuload(i);
3055
3056 if (rq->nr_running == 1 && wl > imbalance)
3057 continue;
3058
3059 if (wl > max_load) {
3060 max_load = wl;
3061 busiest = rq;
3062 }
3063 }
3064
3065 return busiest;
3066}
3067
3068
3069
3070
3071
3072#define MAX_PINNED_INTERVAL 512
3073
3074
3075
3076
3077
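/*
 * Check this_cpu to ensure it is balanced within its domain. Attempt to
 * move tasks if there is an imbalance.
 */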
3078static int load_balance(int this_cpu, struct rq *this_rq,
3079 struct sched_domain *sd, enum cpu_idle_type idle,
3080 int *balance)
3081{
3082 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3083 struct sched_group *group;
3084 unsigned long imbalance;
3085 struct rq *busiest;
3086 cpumask_t cpus = CPU_MASK_ALL;
3087 unsigned long flags;
3088
3089
3090
3091
3092
3093
3094
3095 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
3096 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3097 sd_idle = 1;
3098
3099 schedstat_inc(sd, lb_count[idle]);
3100
3101redo:
3102 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3103 &cpus, balance);
3104
3105 if (*balance == 0)
3106 goto out_balanced;
3107
3108 if (!group) {
3109 schedstat_inc(sd, lb_nobusyg[idle]);
3110 goto out_balanced;
3111 }
3112
3113 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
3114 if (!busiest) {
3115 schedstat_inc(sd, lb_nobusyq[idle]);
3116 goto out_balanced;
3117 }
3118
3119 BUG_ON(busiest == this_rq);
3120
3121 schedstat_add(sd, lb_imbalance[idle], imbalance);
3122
3123 ld_moved = 0;
3124 if (busiest->nr_running > 1) {
3125
3126
3127
3128
3129
3130
3131 local_irq_save(flags);
3132 double_rq_lock(this_rq, busiest);
3133 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3134 imbalance, sd, idle, &all_pinned);
3135 double_rq_unlock(this_rq, busiest);
3136 local_irq_restore(flags);
3137
3138
3139
3140
3141 if (ld_moved && this_cpu != smp_processor_id())
3142 resched_cpu(this_cpu);
3143
3144
3145 if (unlikely(all_pinned)) {
3146 cpu_clear(cpu_of(busiest), cpus);
3147 if (!cpus_empty(cpus))
3148 goto redo;
3149 goto out_balanced;
3150 }
3151 }
3152
3153 if (!ld_moved) {
3154 schedstat_inc(sd, lb_failed[idle]);
3155 sd->nr_balance_failed++;
3156
3157 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
3158
3159 spin_lock_irqsave(&busiest->lock, flags);
3160
3161
3162
3163
3164 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
3165 spin_unlock_irqrestore(&busiest->lock, flags);
3166 all_pinned = 1;
3167 goto out_one_pinned;
3168 }
3169
3170 if (!busiest->active_balance) {
3171 busiest->active_balance = 1;
3172 busiest->push_cpu = this_cpu;
3173 active_balance = 1;
3174 }
3175 spin_unlock_irqrestore(&busiest->lock, flags);
3176 if (active_balance)
3177 wake_up_process(busiest->migration_thread);
3178
3179
3180
3181
3182
3183 sd->nr_balance_failed = sd->cache_nice_tries+1;
3184 }
3185 } else
3186 sd->nr_balance_failed = 0;
3187
3188 if (likely(!active_balance)) {
3189
3190 sd->balance_interval = sd->min_interval;
3191 } else {
3192
3193
3194
3195
3196
3197
3198 if (sd->balance_interval < sd->max_interval)
3199 sd->balance_interval *= 2;
3200 }
3201
3202 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3203 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3204 return -1;
3205 return ld_moved;
3206
3207out_balanced:
3208 schedstat_inc(sd, lb_balanced[idle]);
3209
3210 sd->nr_balance_failed = 0;
3211
3212out_one_pinned:
3213
3214 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3215 (sd->balance_interval < sd->max_interval))
3216 sd->balance_interval *= 2;
3217
3218 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3219 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3220 return -1;
3221 return 0;
3222}
3223
3224
3225
3226
3227
3228
3229
3230
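/*
 * Check this_cpu to ensure it is balanced within its domain. Attempt to
 * move tasks if there is an imbalance.
 *
 * Called from idle_balance() in the CPU_NEWLY_IDLE case; this_rq->lock is
 * expected to be held already (see the double_lock_balance() below).
 */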
3231static int
3232load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
3233{
3234 struct sched_group *group;
3235 struct rq *busiest = NULL;
3236 unsigned long imbalance;
3237 int ld_moved = 0;
3238 int sd_idle = 0;
3239 int all_pinned = 0;
3240 cpumask_t cpus = CPU_MASK_ALL;
3241
3242
3243
3244
3245
3246
3247
3248 if (sd->flags & SD_SHARE_CPUPOWER &&
3249 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3250 sd_idle = 1;
3251
3252 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
3253redo:
3254 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
3255 &sd_idle, &cpus, NULL);
3256 if (!group) {
3257 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
3258 goto out_balanced;
3259 }
3260
3261 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
3262 &cpus);
3263 if (!busiest) {
3264 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
3265 goto out_balanced;
3266 }
3267
3268 BUG_ON(busiest == this_rq);
3269
3270 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
3271
3272 ld_moved = 0;
3273 if (busiest->nr_running > 1) {
3274
3275 double_lock_balance(this_rq, busiest);
3276
3277 update_rq_clock(busiest);
3278 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3279 imbalance, sd, CPU_NEWLY_IDLE,
3280 &all_pinned);
3281 spin_unlock(&busiest->lock);
3282
3283 if (unlikely(all_pinned)) {
3284 cpu_clear(cpu_of(busiest), cpus);
3285 if (!cpus_empty(cpus))
3286 goto redo;
3287 }
3288 }
3289
3290 if (!ld_moved) {
3291 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
3292 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3293 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3294 return -1;
3295 } else
3296 sd->nr_balance_failed = 0;
3297
3298 return ld_moved;
3299
3300out_balanced:
3301 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
3302 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3303 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3304 return -1;
3305 sd->nr_balance_failed = 0;
3306
3307 return 0;
3308}
3309
3310
3311
3312
3313
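/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */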
3314static void idle_balance(int this_cpu, struct rq *this_rq)
3315{
3316 struct sched_domain *sd;
3317 int pulled_task = -1;
3318 unsigned long next_balance = jiffies + HZ;
3319
3320 for_each_domain(this_cpu, sd) {
3321 unsigned long interval;
3322
3323 if (!(sd->flags & SD_LOAD_BALANCE))
3324 continue;
3325
3326 if (sd->flags & SD_BALANCE_NEWIDLE)
3327
3328 pulled_task = load_balance_newidle(this_cpu,
3329 this_rq, sd);
3330
3331 interval = msecs_to_jiffies(sd->balance_interval);
3332 if (time_after(next_balance, sd->last_balance + interval))
3333 next_balance = sd->last_balance + interval;
3334 if (pulled_task)
3335 break;
3336 }
3337 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3338
3339
3340
3341
3342 this_rq->next_balance = next_balance;
3343 }
3344}
3345
3346
3347
3348
3349
3350
3351
3352
3353
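/*
 * active_load_balance is run by migration threads. It pushes a running
 * task from the busiest runqueue towards the CPU recorded in ->push_cpu.
 * Called with busiest_rq locked; it releases target_rq (but not
 * busiest_rq) before returning.
 */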
3354static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3355{
3356 int target_cpu = busiest_rq->push_cpu;
3357 struct sched_domain *sd;
3358 struct rq *target_rq;
3359
3360
3361 if (busiest_rq->nr_running <= 1)
3362 return;
3363
3364 target_rq = cpu_rq(target_cpu);
3365
3366
3367
3368
3369
3370
3371 BUG_ON(busiest_rq == target_rq);
3372
3373
3374 double_lock_balance(busiest_rq, target_rq);
3375 update_rq_clock(busiest_rq);
3376 update_rq_clock(target_rq);
3377
3378
3379 for_each_domain(target_cpu, sd) {
3380 if ((sd->flags & SD_LOAD_BALANCE) &&
3381 cpu_isset(busiest_cpu, sd->span))
3382 break;
3383 }
3384
3385 if (likely(sd)) {
3386 schedstat_inc(sd, alb_count);
3387
3388 if (move_one_task(target_rq, target_cpu, busiest_rq,
3389 sd, CPU_IDLE))
3390 schedstat_inc(sd, alb_pushed);
3391 else
3392 schedstat_inc(sd, alb_failed);
3393 }
3394 spin_unlock(&target_rq->lock);
3395}
3396
3397#ifdef CONFIG_NO_HZ
3398static struct {
3399 atomic_t load_balancer;
3400 cpumask_t cpu_mask;
3401} nohz ____cacheline_aligned = {
3402 .load_balancer = ATOMIC_INIT(-1),
3403 .cpu_mask = CPU_MASK_NONE,
3404};
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
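/*
 * When the tick is stopped on idle CPUs (CONFIG_NO_HZ), one of them is
 * elected as the "idle load balancer" and keeps a minimal tick so it can
 * rebalance on behalf of all the other nohz idle CPUs.
 *
 * select_nohz_load_balancer(1) is called when this CPU is about to stop
 * its tick: it returns 1 if this CPU should stay semi-idle as the
 * balancer, 0 if it may stop the tick completely.
 * select_nohz_load_balancer(0) is called when the tick is restarted; it
 * drops the CPU from nohz.cpu_mask and resigns the balancer role if held.
 */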
3426int select_nohz_load_balancer(int stop_tick)
3427{
3428 int cpu = smp_processor_id();
3429
3430 if (stop_tick) {
3431 cpu_set(cpu, nohz.cpu_mask);
3432 cpu_rq(cpu)->in_nohz_recently = 1;
3433
3434
3435
3436
3437 if (cpu_is_offline(cpu) &&
3438 atomic_read(&nohz.load_balancer) == cpu) {
3439 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3440 BUG();
3441 return 0;
3442 }
3443
3444
3445 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3446 if (atomic_read(&nohz.load_balancer) == cpu)
3447 atomic_set(&nohz.load_balancer, -1);
3448 return 0;
3449 }
3450
3451 if (atomic_read(&nohz.load_balancer) == -1) {
3452
3453 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3454 return 1;
3455 } else if (atomic_read(&nohz.load_balancer) == cpu)
3456 return 1;
3457 } else {
3458 if (!cpu_isset(cpu, nohz.cpu_mask))
3459 return 0;
3460
3461 cpu_clear(cpu, nohz.cpu_mask);
3462
3463 if (atomic_read(&nohz.load_balancer) == cpu)
3464 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3465 BUG();
3466 }
3467 return 0;
3468}
3469#endif
3470
3471static DEFINE_SPINLOCK(balancing);
3472
3473
3474
3475
3476
3477
3478
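/*
 * Check each scheduling domain of this CPU to see whether it is due to be
 * balanced, and initiate a balancing operation if so.  Domains flagged
 * SD_SERIALIZE are balanced one at a time via the global 'balancing' lock.
 */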
3479static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3480{
3481 int balance = 1;
3482 struct rq *rq = cpu_rq(cpu);
3483 unsigned long interval;
3484 struct sched_domain *sd;
3485
3486 unsigned long next_balance = jiffies + 60*HZ;
3487 int update_next_balance = 0;
3488
3489 for_each_domain(cpu, sd) {
3490 if (!(sd->flags & SD_LOAD_BALANCE))
3491 continue;
3492
3493 interval = sd->balance_interval;
3494 if (idle != CPU_IDLE)
3495 interval *= sd->busy_factor;
3496
3497
3498 interval = msecs_to_jiffies(interval);
3499 if (unlikely(!interval))
3500 interval = 1;
3501 if (interval > HZ*NR_CPUS/10)
3502 interval = HZ*NR_CPUS/10;
3503
3504
3505 if (sd->flags & SD_SERIALIZE) {
3506 if (!spin_trylock(&balancing))
3507 goto out;
3508 }
3509
3510 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3511 if (load_balance(cpu, rq, sd, idle, &balance)) {
3512
3513
3514
3515
3516
3517 idle = CPU_NOT_IDLE;
3518 }
3519 sd->last_balance = jiffies;
3520 }
3521 if (sd->flags & SD_SERIALIZE)
3522 spin_unlock(&balancing);
3523out:
3524 if (time_after(next_balance, sd->last_balance + interval)) {
3525 next_balance = sd->last_balance + interval;
3526 update_next_balance = 1;
3527 }
3528
3529
3530
3531
3532
3533
3534 if (!balance)
3535 break;
3536 }
3537
3538
3539
3540
3541
3542
3543 if (likely(update_next_balance))
3544 rq->next_balance = next_balance;
3545}
3546
3547
3548
3549
3550
3551
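/*
 * run_rebalance_domains is triggered via the SCHED_SOFTIRQ raised by
 * trigger_load_balance().  In the CONFIG_NO_HZ case the idle load
 * balancer additionally rebalances on behalf of all other nohz idle CPUs.
 */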
3552static void run_rebalance_domains(struct softirq_action *h)
3553{
3554 int this_cpu = smp_processor_id();
3555 struct rq *this_rq = cpu_rq(this_cpu);
3556 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3557 CPU_IDLE : CPU_NOT_IDLE;
3558
3559 rebalance_domains(this_cpu, idle);
3560
3561#ifdef CONFIG_NO_HZ
3562
3563
3564
3565
3566
3567 if (this_rq->idle_at_tick &&
3568 atomic_read(&nohz.load_balancer) == this_cpu) {
3569 cpumask_t cpus = nohz.cpu_mask;
3570 struct rq *rq;
3571 int balance_cpu;
3572
3573 cpu_clear(this_cpu, cpus);
3574 for_each_cpu_mask(balance_cpu, cpus) {
3575
3576
3577
3578
3579
3580 if (need_resched())
3581 break;
3582
3583 rebalance_domains(balance_cpu, CPU_IDLE);
3584
3585 rq = cpu_rq(balance_cpu);
3586 if (time_after(this_rq->next_balance, rq->next_balance))
3587 this_rq->next_balance = rq->next_balance;
3588 }
3589 }
3590#endif
3591}
3592
3593
3594
3595
3596
3597
3598
3599
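/*
 * Raise SCHED_SOFTIRQ when it is time for periodic load balancing.
 * In the CONFIG_NO_HZ case this also manages the idle load balancer:
 * a busy CPU that was recently in nohz mode resigns the role and kicks
 * a new owner, while fully idle CPUs leave balancing to the owner.
 */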
3600static inline void trigger_load_balance(struct rq *rq, int cpu)
3601{
3602#ifdef CONFIG_NO_HZ
3603
3604
3605
3606
3607
3608 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3609 rq->in_nohz_recently = 0;
3610
3611 if (atomic_read(&nohz.load_balancer) == cpu) {
3612 cpu_clear(cpu, nohz.cpu_mask);
3613 atomic_set(&nohz.load_balancer, -1);
3614 }
3615
3616 if (atomic_read(&nohz.load_balancer) == -1) {
3617
3618
3619
3620
3621
3622
3623
3624
3625 int ilb = first_cpu(nohz.cpu_mask);
3626
3627 if (ilb != NR_CPUS)
3628 resched_cpu(ilb);
3629 }
3630 }
3631
3632
3633
3634
3635
3636 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3637 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3638 resched_cpu(cpu);
3639 return;
3640 }
3641
3642
3643
3644
3645
3646 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3647 cpu_isset(cpu, nohz.cpu_mask))
3648 return;
3649#endif
3650 if (time_after_eq(jiffies, rq->next_balance))
3651 raise_softirq(SCHED_SOFTIRQ);
3652}
3653
3654#else
3655
3656
3657
3658
3659static inline void idle_balance(int cpu, struct rq *rq)
3660{
3661}
3662
3663#endif
3664
3665DEFINE_PER_CPU(struct kernel_stat, kstat);
3666
3667EXPORT_PER_CPU_SYMBOL(kstat);
3668
3669
3670
3671
3672
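/*
 * Return accumulated execution time (in nanoseconds) for the given task,
 * including any runtime of the current slice that has not been banked
 * into se.sum_exec_runtime yet.
 */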
3673unsigned long long task_sched_runtime(struct task_struct *p)
3674{
3675 unsigned long flags;
3676 u64 ns, delta_exec;
3677 struct rq *rq;
3678
3679 rq = task_rq_lock(p, &flags);
3680 ns = p->se.sum_exec_runtime;
3681 if (task_current(rq, p)) {
3682 update_rq_clock(rq);
3683 delta_exec = rq->clock - p->se.exec_start;
3684 if ((s64)delta_exec > 0)
3685 ns += delta_exec;
3686 }
3687 task_rq_unlock(rq, &flags);
3688
3689 return ns;
3690}
3691
3692
3693
3694
3695
3696
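/*
 * Account user CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in user space since the last update
 */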
3697void account_user_time(struct task_struct *p, cputime_t cputime)
3698{
3699 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3700 cputime64_t tmp;
3701
3702 p->utime = cputime_add(p->utime, cputime);
3703
3704
3705 tmp = cputime_to_cputime64(cputime);
3706 if (TASK_NICE(p) > 0)
3707 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3708 else
3709 cpustat->user = cputime64_add(cpustat->user, tmp);
3710}
3711
3712
3713
3714
3715
3716
3717static void account_guest_time(struct task_struct *p, cputime_t cputime)
3718{
3719 cputime64_t tmp;
3720 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3721
3722 tmp = cputime_to_cputime64(cputime);
3723
3724 p->utime = cputime_add(p->utime, cputime);
3725 p->gtime = cputime_add(p->gtime, cputime);
3726
3727 cpustat->user = cputime64_add(cpustat->user, tmp);
3728 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3729}
3730
3731
3732
3733
3734
3735
3736void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
3737{
3738 p->utimescaled = cputime_add(p->utimescaled, cputime);
3739}
3740
3741
3742
3743
3744
3745
3746
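/*
 * Account system CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the CPU time spent in kernel space since the last update
 */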
3747void account_system_time(struct task_struct *p, int hardirq_offset,
3748 cputime_t cputime)
3749{
3750 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3751 struct rq *rq = this_rq();
3752 cputime64_t tmp;
3753
3754 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
3755 return account_guest_time(p, cputime);
3756
3757 p->stime = cputime_add(p->stime, cputime);
3758
3759
3760 tmp = cputime_to_cputime64(cputime);
3761 if (hardirq_count() - hardirq_offset)
3762 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3763 else if (softirq_count())
3764 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3765 else if (p != rq->idle)
3766 cpustat->system = cputime64_add(cpustat->system, tmp);
3767 else if (atomic_read(&rq->nr_iowait) > 0)
3768 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3769 else
3770 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3771
3772 acct_update_integrals(p);
3773}
3774
3775
3776
3777
3778
3779
3780
3781void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
3782{
3783 p->stimescaled = cputime_add(p->stimescaled, cputime);
3784}
3785
3786
3787
3788
3789
3790
3791void account_steal_time(struct task_struct *p, cputime_t steal)
3792{
3793 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3794 cputime64_t tmp = cputime_to_cputime64(steal);
3795 struct rq *rq = this_rq();
3796
3797 if (p == rq->idle) {
3798 p->stime = cputime_add(p->stime, steal);
3799 if (atomic_read(&rq->nr_iowait) > 0)
3800 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3801 else
3802 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3803 } else
3804 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3805}
3806
3807
3808
3809
3810
3811
3812
3813
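/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */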
3814void scheduler_tick(void)
3815{
3816 int cpu = smp_processor_id();
3817 struct rq *rq = cpu_rq(cpu);
3818 struct task_struct *curr = rq->curr;
3819 u64 next_tick = rq->tick_timestamp + TICK_NSEC;
3820
3821 spin_lock(&rq->lock);
3822 __update_rq_clock(rq);
3823
3824
3825
3826 if (unlikely(rq->clock < next_tick)) {
3827 rq->clock = next_tick;
3828 rq->clock_underflows++;
3829 }
3830 rq->tick_timestamp = rq->clock;
3831 update_cpu_load(rq);
3832 curr->sched_class->task_tick(rq, curr, 0);
3833 update_sched_rt_period(rq);
3834 spin_unlock(&rq->lock);
3835
3836#ifdef CONFIG_SMP
3837 rq->idle_at_tick = idle_cpu(cpu);
3838 trigger_load_balance(rq, cpu);
3839#endif
3840}
3841
3842#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3843
3844void __kprobes add_preempt_count(int val)
3845{
3846
3847
3848
3849 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3850 return;
3851 preempt_count() += val;
3852
3853
3854
3855 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3856 PREEMPT_MASK - 10);
3857}
3858EXPORT_SYMBOL(add_preempt_count);
3859
3860void __kprobes sub_preempt_count(int val)
3861{
3862
3863
3864
3865 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3866 return;
3867
3868
3869
3870 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3871 !(preempt_count() & PREEMPT_MASK)))
3872 return;
3873
3874 preempt_count() -= val;
3875}
3876EXPORT_SYMBOL(sub_preempt_count);
3877
3878#endif
3879
3880
3881
3882
3883static noinline void __schedule_bug(struct task_struct *prev)
3884{
3885 struct pt_regs *regs = get_irq_regs();
3886
3887 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3888 prev->comm, prev->pid, preempt_count());
3889
3890 debug_show_held_locks(prev);
3891 if (irqs_disabled())
3892 print_irqtrace_events(prev);
3893
3894 if (regs)
3895 show_regs(regs);
3896 else
3897 dump_stack();
3898}
3899
3900
3901
3902
3903static inline void schedule_debug(struct task_struct *prev)
3904{
3905
3906
3907
3908
3909
3910 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3911 __schedule_bug(prev);
3912
3913 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3914
3915 schedstat_inc(this_rq(), sched_count);
3916#ifdef CONFIG_SCHEDSTATS
3917 if (unlikely(prev->lock_depth >= 0)) {
3918 schedstat_inc(this_rq(), bkl_count);
3919 schedstat_inc(prev, sched_info.bkl_count);
3920 }
3921#endif
3922}
3923
3924
3925
3926
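/*
 * Pick up the highest-prio task: try the CFS fast path first (the common
 * case when all runnable tasks belong to the fair class), otherwise walk
 * the scheduling classes from highest to lowest priority.
 */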
3927static inline struct task_struct *
3928pick_next_task(struct rq *rq, struct task_struct *prev)
3929{
3930 const struct sched_class *class;
3931 struct task_struct *p;
3932
3933
3934
3935
3936
3937 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3938 p = fair_sched_class.pick_next_task(rq);
3939 if (likely(p))
3940 return p;
3941 }
3942
3943 class = sched_class_highest;
3944 for ( ; ; ) {
3945 p = class->pick_next_task(rq);
3946 if (p)
3947 return p;
3948
3949
3950
3951
3952 class = class->next;
3953 }
3954}
3955
3956
3957
3958
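/*
 * schedule() is the main scheduler function.
 */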
3959asmlinkage void __sched schedule(void)
3960{
3961 struct task_struct *prev, *next;
3962 unsigned long *switch_count;
3963 struct rq *rq;
3964 int cpu;
3965
3966need_resched:
3967 preempt_disable();
3968 cpu = smp_processor_id();
3969 rq = cpu_rq(cpu);
3970 rcu_qsctr_inc(cpu);
3971 prev = rq->curr;
3972 switch_count = &prev->nivcsw;
3973
3974 release_kernel_lock(prev);
3975need_resched_nonpreemptible:
3976
3977 schedule_debug(prev);
3978
3979 hrtick_clear(rq);
3980
3981
3982
3983
3984 local_irq_disable();
3985 __update_rq_clock(rq);
3986 spin_lock(&rq->lock);
3987 clear_tsk_need_resched(prev);
3988
3989 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3990 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3991 signal_pending(prev))) {
3992 prev->state = TASK_RUNNING;
3993 } else {
3994 deactivate_task(rq, prev, 1);
3995 }
3996 switch_count = &prev->nvcsw;
3997 }
3998
3999#ifdef CONFIG_SMP
4000 if (prev->sched_class->pre_schedule)
4001 prev->sched_class->pre_schedule(rq, prev);
4002#endif
4003
4004 if (unlikely(!rq->nr_running))
4005 idle_balance(cpu, rq);
4006
4007 prev->sched_class->put_prev_task(rq, prev);
4008 next = pick_next_task(rq, prev);
4009
4010 sched_info_switch(prev, next);
4011
4012 if (likely(prev != next)) {
4013 rq->nr_switches++;
4014 rq->curr = next;
4015 ++*switch_count;
4016
4017 context_switch(rq, prev, next);
4018
4019
4020
4021
4022 cpu = smp_processor_id();
4023 rq = cpu_rq(cpu);
4024 } else
4025 spin_unlock_irq(&rq->lock);
4026
4027 hrtick_set(rq);
4028
4029 if (unlikely(reacquire_kernel_lock(current) < 0))
4030 goto need_resched_nonpreemptible;
4031
4032 preempt_enable_no_resched();
4033 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
4034 goto need_resched;
4035}
4036EXPORT_SYMBOL(schedule);
4037
4038#ifdef CONFIG_PREEMPT
4039
4040
4041
4042
4043
4044asmlinkage void __sched preempt_schedule(void)
4045{
4046 struct thread_info *ti = current_thread_info();
4047 struct task_struct *task = current;
4048 int saved_lock_depth;
4049
4050
4051
4052
4053
4054 if (likely(ti->preempt_count || irqs_disabled()))
4055 return;
4056
4057 do {
4058 add_preempt_count(PREEMPT_ACTIVE);
4059
4060
4061
4062
4063
4064
4065 saved_lock_depth = task->lock_depth;
4066 task->lock_depth = -1;
4067 schedule();
4068 task->lock_depth = saved_lock_depth;
4069 sub_preempt_count(PREEMPT_ACTIVE);
4070
4071
4072
4073
4074
4075 barrier();
4076 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
4077}
4078EXPORT_SYMBOL(preempt_schedule);
4079
4080
4081
4082
4083
4084
4085
4086asmlinkage void __sched preempt_schedule_irq(void)
4087{
4088 struct thread_info *ti = current_thread_info();
4089 struct task_struct *task = current;
4090 int saved_lock_depth;
4091
4092
4093 BUG_ON(ti->preempt_count || !irqs_disabled());
4094
4095 do {
4096 add_preempt_count(PREEMPT_ACTIVE);
4097
4098
4099
4100
4101
4102
4103 saved_lock_depth = task->lock_depth;
4104 task->lock_depth = -1;
4105 local_irq_enable();
4106 schedule();
4107 local_irq_disable();
4108 task->lock_depth = saved_lock_depth;
4109 sub_preempt_count(PREEMPT_ACTIVE);
4110
4111
4112
4113
4114
4115 barrier();
4116 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
4117}
4118
4119#endif
4120
4121int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
4122 void *key)
4123{
4124 return try_to_wake_up(curr->private, mode, sync);
4125}
4126EXPORT_SYMBOL(default_wake_function);
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4138 int nr_exclusive, int sync, void *key)
4139{
4140 wait_queue_t *curr, *next;
4141
4142 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
4143 unsigned flags = curr->flags;
4144
4145 if (curr->func(curr, mode, sync, key) &&
4146 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
4147 break;
4148 }
4149}
4150
4151
4152
4153
4154
4155
4156
4157
4158void __wake_up(wait_queue_head_t *q, unsigned int mode,
4159 int nr_exclusive, void *key)
4160{
4161 unsigned long flags;
4162
4163 spin_lock_irqsave(&q->lock, flags);
4164 __wake_up_common(q, mode, nr_exclusive, 0, key);
4165 spin_unlock_irqrestore(&q->lock, flags);
4166}
4167EXPORT_SYMBOL(__wake_up);
4168
4169
4170
4171
4172void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4173{
4174 __wake_up_common(q, mode, 1, 0, NULL);
4175}
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190void
4191__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4192{
4193 unsigned long flags;
4194 int sync = 1;
4195
4196 if (unlikely(!q))
4197 return;
4198
4199 if (unlikely(!nr_exclusive))
4200 sync = 0;
4201
4202 spin_lock_irqsave(&q->lock, flags);
4203 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
4204 spin_unlock_irqrestore(&q->lock, flags);
4205}
4206EXPORT_SYMBOL_GPL(__wake_up_sync);
4207
4208void complete(struct completion *x)
4209{
4210 unsigned long flags;
4211
4212 spin_lock_irqsave(&x->wait.lock, flags);
4213 x->done++;
4214 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
4215 spin_unlock_irqrestore(&x->wait.lock, flags);
4216}
4217EXPORT_SYMBOL(complete);
4218
4219void complete_all(struct completion *x)
4220{
4221 unsigned long flags;
4222
4223 spin_lock_irqsave(&x->wait.lock, flags);
4224 x->done += UINT_MAX/2;
4225 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
4226 spin_unlock_irqrestore(&x->wait.lock, flags);
4227}
4228EXPORT_SYMBOL(complete_all);
4229
4230static inline long __sched
4231do_wait_for_common(struct completion *x, long timeout, int state)
4232{
4233 if (!x->done) {
4234 DECLARE_WAITQUEUE(wait, current);
4235
4236 wait.flags |= WQ_FLAG_EXCLUSIVE;
4237 __add_wait_queue_tail(&x->wait, &wait);
4238 do {
4239 if ((state == TASK_INTERRUPTIBLE &&
4240 signal_pending(current)) ||
4241 (state == TASK_KILLABLE &&
4242 fatal_signal_pending(current))) {
4243 __remove_wait_queue(&x->wait, &wait);
4244 return -ERESTARTSYS;
4245 }
4246 __set_current_state(state);
4247 spin_unlock_irq(&x->wait.lock);
4248 timeout = schedule_timeout(timeout);
4249 spin_lock_irq(&x->wait.lock);
4250 if (!timeout) {
4251 __remove_wait_queue(&x->wait, &wait);
4252 return timeout;
4253 }
4254 } while (!x->done);
4255 __remove_wait_queue(&x->wait, &wait);
4256 }
4257 x->done--;
4258 return timeout;
4259}
4260
4261static long __sched
4262wait_for_common(struct completion *x, long timeout, int state)
4263{
4264 might_sleep();
4265
4266 spin_lock_irq(&x->wait.lock);
4267 timeout = do_wait_for_common(x, timeout, state);
4268 spin_unlock_irq(&x->wait.lock);
4269 return timeout;
4270}
4271
4272void __sched wait_for_completion(struct completion *x)
4273{
4274 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4275}
4276EXPORT_SYMBOL(wait_for_completion);
4277
4278unsigned long __sched
4279wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4280{
4281 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4282}
4283EXPORT_SYMBOL(wait_for_completion_timeout);
4284
4285int __sched wait_for_completion_interruptible(struct completion *x)
4286{
4287 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4288 if (t == -ERESTARTSYS)
4289 return t;
4290 return 0;
4291}
4292EXPORT_SYMBOL(wait_for_completion_interruptible);
4293
4294unsigned long __sched
4295wait_for_completion_interruptible_timeout(struct completion *x,
4296 unsigned long timeout)
4297{
4298 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4299}
4300EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4301
4302int __sched wait_for_completion_killable(struct completion *x)
4303{
4304 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4305 if (t == -ERESTARTSYS)
4306 return t;
4307 return 0;
4308}
4309EXPORT_SYMBOL(wait_for_completion_killable);
4310
4311static long __sched
4312sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4313{
4314 unsigned long flags;
4315 wait_queue_t wait;
4316
4317 init_waitqueue_entry(&wait, current);
4318
4319 __set_current_state(state);
4320
4321 spin_lock_irqsave(&q->lock, flags);
4322 __add_wait_queue(q, &wait);
4323 spin_unlock(&q->lock);
4324 timeout = schedule_timeout(timeout);
4325 spin_lock_irq(&q->lock);
4326 __remove_wait_queue(q, &wait);
4327 spin_unlock_irqrestore(&q->lock, flags);
4328
4329 return timeout;
4330}
4331
4332void __sched interruptible_sleep_on(wait_queue_head_t *q)
4333{
4334 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4335}
4336EXPORT_SYMBOL(interruptible_sleep_on);
4337
4338long __sched
4339interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4340{
4341 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4342}
4343EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4344
4345void __sched sleep_on(wait_queue_head_t *q)
4346{
4347 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4348}
4349EXPORT_SYMBOL(sleep_on);
4350
4351long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4352{
4353 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4354}
4355EXPORT_SYMBOL(sleep_on_timeout);
4356
4357#ifdef CONFIG_RT_MUTEXES
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
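/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task
 * @prio: prio value (kernel-internal form)
 *
 * This function changes the 'effective' priority of a task; it does not
 * touch ->normal_prio as __setscheduler() does.  Used by the rt_mutex
 * code to implement priority inheritance.
 */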
4369void rt_mutex_setprio(struct task_struct *p, int prio)
4370{
4371 unsigned long flags;
4372 int oldprio, on_rq, running;
4373 struct rq *rq;
4374 const struct sched_class *prev_class = p->sched_class;
4375
4376 BUG_ON(prio < 0 || prio > MAX_PRIO);
4377
4378 rq = task_rq_lock(p, &flags);
4379 update_rq_clock(rq);
4380
4381 oldprio = p->prio;
4382 on_rq = p->se.on_rq;
4383 running = task_current(rq, p);
4384 if (on_rq)
4385 dequeue_task(rq, p, 0);
4386 if (running)
4387 p->sched_class->put_prev_task(rq, p);
4388
4389 if (rt_prio(prio))
4390 p->sched_class = &rt_sched_class;
4391 else
4392 p->sched_class = &fair_sched_class;
4393
4394 p->prio = prio;
4395
4396 if (running)
4397 p->sched_class->set_curr_task(rq);
4398 if (on_rq) {
4399 enqueue_task(rq, p, 0);
4400
4401 check_class_changed(rq, p, prev_class, oldprio, running);
4402 }
4403 task_rq_unlock(rq, &flags);
4404}
4405
4406#endif
4407
4408void set_user_nice(struct task_struct *p, long nice)
4409{
4410 int old_prio, delta, on_rq;
4411 unsigned long flags;
4412 struct rq *rq;
4413
4414 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4415 return;
4416
4417
4418
4419
4420 rq = task_rq_lock(p, &flags);
4421 update_rq_clock(rq);
4422
4423
4424
4425
4426
4427
4428 if (task_has_rt_policy(p)) {
4429 p->static_prio = NICE_TO_PRIO(nice);
4430 goto out_unlock;
4431 }
4432 on_rq = p->se.on_rq;
4433 if (on_rq) {
4434 dequeue_task(rq, p, 0);
4435 dec_load(rq, p);
4436 }
4437
4438 p->static_prio = NICE_TO_PRIO(nice);
4439 set_load_weight(p);
4440 old_prio = p->prio;
4441 p->prio = effective_prio(p);
4442 delta = p->prio - old_prio;
4443
4444 if (on_rq) {
4445 enqueue_task(rq, p, 0);
4446 inc_load(rq, p);
4447
4448
4449
4450
4451 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4452 resched_task(rq->curr);
4453 }
4454out_unlock:
4455 task_rq_unlock(rq, &flags);
4456}
4457EXPORT_SYMBOL(set_user_nice);
4458
4459
4460
4461
4462
4463
4464int can_nice(const struct task_struct *p, const int nice)
4465{
4466
4467 int nice_rlim = 20 - nice;
4468
4469 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
4470 capable(CAP_SYS_NICE));
4471}
4472
4473#ifdef __ARCH_WANT_SYS_NICE
4474
4475
4476
4477
4478
4479
4480
4481
4482asmlinkage long sys_nice(int increment)
4483{
4484 long nice, retval;
4485
4486
4487
4488
4489
4490
4491 if (increment < -40)
4492 increment = -40;
4493 if (increment > 40)
4494 increment = 40;
4495
4496 nice = PRIO_TO_NICE(current->static_prio) + increment;
4497 if (nice < -20)
4498 nice = -20;
4499 if (nice > 19)
4500 nice = 19;
4501
4502 if (increment < 0 && !can_nice(current, nice))
4503 return -EPERM;
4504
4505 retval = security_task_setnice(current, nice);
4506 if (retval)
4507 return retval;
4508
4509 set_user_nice(current, nice);
4510 return 0;
4511}
4512
4513#endif
4514
4515
4516
4517
4518
4519
4520
4521
4522
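/*
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * This is the value shown in /proc: RT tasks map to negative values,
 * normal tasks to nice + 20 (i.e. 0..39).
 */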
4523int task_prio(const struct task_struct *p)
4524{
4525 return p->prio - MAX_RT_PRIO;
4526}
4527
4528
4529
4530
4531
4532int task_nice(const struct task_struct *p)
4533{
4534 return TASK_NICE(p);
4535}
4536EXPORT_SYMBOL(task_nice);
4537
4538
4539
4540
4541
4542int idle_cpu(int cpu)
4543{
4544 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4545}
4546
4547
4548
4549
4550
4551struct task_struct *idle_task(int cpu)
4552{
4553 return cpu_rq(cpu)->idle;
4554}
4555
4556
4557
4558
4559
4560static struct task_struct *find_process_by_pid(pid_t pid)
4561{
4562 return pid ? find_task_by_vpid(pid) : current;
4563}
4564
4565
4566static void
4567__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4568{
4569 BUG_ON(p->se.on_rq);
4570
4571 p->policy = policy;
4572 switch (p->policy) {
4573 case SCHED_NORMAL:
4574 case SCHED_BATCH:
4575 case SCHED_IDLE:
4576 p->sched_class = &fair_sched_class;
4577 break;
4578 case SCHED_FIFO:
4579 case SCHED_RR:
4580 p->sched_class = &rt_sched_class;
4581 break;
4582 }
4583
4584 p->rt_priority = prio;
4585 p->normal_prio = normal_prio(p);
4586
4587 p->prio = rt_mutex_getprio(p);
4588 set_load_weight(p);
4589}
4590
4591
4592
4593
4594
4595
4596
4597
4598
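/*
 * sched_setscheduler - change the scheduling policy and/or RT priority
 * of a thread.
 * @p: the task in question.
 * @policy: new policy (a negative value keeps the current policy).
 * @param: structure containing the new RT priority.
 *
 * NOTE that the task may be already dead.
 *
 * Illustrative usage only (hypothetical caller):
 *	struct sched_param sp = { .sched_priority = MAX_RT_PRIO - 1 };
 *	sched_setscheduler(p, SCHED_FIFO, &sp);
 */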
4599int sched_setscheduler(struct task_struct *p, int policy,
4600 struct sched_param *param)
4601{
4602 int retval, oldprio, oldpolicy = -1, on_rq, running;
4603 unsigned long flags;
4604 const struct sched_class *prev_class = p->sched_class;
4605 struct rq *rq;
4606
4607
4608 BUG_ON(in_interrupt());
4609recheck:
4610
4611 if (policy < 0)
4612 policy = oldpolicy = p->policy;
4613 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4614 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4615 policy != SCHED_IDLE)
4616 return -EINVAL;
4617
4618
4619
4620
4621
4622 if (param->sched_priority < 0 ||
4623 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4624 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4625 return -EINVAL;
4626 if (rt_policy(policy) != (param->sched_priority != 0))
4627 return -EINVAL;
4628
4629
4630
4631
4632 if (!capable(CAP_SYS_NICE)) {
4633 if (rt_policy(policy)) {
4634 unsigned long rlim_rtprio;
4635
4636 if (!lock_task_sighand(p, &flags))
4637 return -ESRCH;
4638 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4639 unlock_task_sighand(p, &flags);
4640
4641
4642 if (policy != p->policy && !rlim_rtprio)
4643 return -EPERM;
4644
4645
4646 if (param->sched_priority > p->rt_priority &&
4647 param->sched_priority > rlim_rtprio)
4648 return -EPERM;
4649 }
4650
4651
4652
4653
4654 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4655 return -EPERM;
4656
4657
4658 if ((current->euid != p->euid) &&
4659 (current->euid != p->uid))
4660 return -EPERM;
4661 }
4662
4663#ifdef CONFIG_RT_GROUP_SCHED
4664
4665
4666
4667
4668 if (rt_policy(policy) && task_group(p)->rt_runtime == 0)
4669 return -EPERM;
4670#endif
4671
4672 retval = security_task_setscheduler(p, policy, param);
4673 if (retval)
4674 return retval;
4675
4676
4677
4678
4679 spin_lock_irqsave(&p->pi_lock, flags);
4680
4681
4682
4683
4684 rq = __task_rq_lock(p);
4685
4686 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4687 policy = oldpolicy = -1;
4688 __task_rq_unlock(rq);
4689 spin_unlock_irqrestore(&p->pi_lock, flags);
4690 goto recheck;
4691 }
4692 update_rq_clock(rq);
4693 on_rq = p->se.on_rq;
4694 running = task_current(rq, p);
4695 if (on_rq)
4696 deactivate_task(rq, p, 0);
4697 if (running)
4698 p->sched_class->put_prev_task(rq, p);
4699
4700 oldprio = p->prio;
4701 __setscheduler(rq, p, policy, param->sched_priority);
4702
4703 if (running)
4704 p->sched_class->set_curr_task(rq);
4705 if (on_rq) {
4706 activate_task(rq, p, 0);
4707
4708 check_class_changed(rq, p, prev_class, oldprio, running);
4709 }
4710 __task_rq_unlock(rq);
4711 spin_unlock_irqrestore(&p->pi_lock, flags);
4712
4713 rt_mutex_adjust_pi(p);
4714
4715 return 0;
4716}
4717EXPORT_SYMBOL_GPL(sched_setscheduler);
4718
4719static int
4720do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4721{
4722 struct sched_param lparam;
4723 struct task_struct *p;
4724 int retval;
4725
4726 if (!param || pid < 0)
4727 return -EINVAL;
4728 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4729 return -EFAULT;
4730
4731 rcu_read_lock();
4732 retval = -ESRCH;
4733 p = find_process_by_pid(pid);
4734 if (p != NULL)
4735 retval = sched_setscheduler(p, policy, &lparam);
4736 rcu_read_unlock();
4737
4738 return retval;
4739}
4740
4741
4742
4743
4744
4745
4746
4747asmlinkage long
4748sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4749{
4750
4751 if (policy < 0)
4752 return -EINVAL;
4753
4754 return do_sched_setscheduler(pid, policy, param);
4755}
4756
4757
4758
4759
4760
4761
4762asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4763{
4764 return do_sched_setscheduler(pid, -1, param);
4765}
4766
4767
4768
4769
4770
4771asmlinkage long sys_sched_getscheduler(pid_t pid)
4772{
4773 struct task_struct *p;
4774 int retval;
4775
4776 if (pid < 0)
4777 return -EINVAL;
4778
4779 retval = -ESRCH;
4780 read_lock(&tasklist_lock);
4781 p = find_process_by_pid(pid);
4782 if (p) {
4783 retval = security_task_getscheduler(p);
4784 if (!retval)
4785 retval = p->policy;
4786 }
4787 read_unlock(&tasklist_lock);
4788 return retval;
4789}
4790
4791
4792
4793
4794
4795
4796asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4797{
4798 struct sched_param lp;
4799 struct task_struct *p;
4800 int retval;
4801
4802 if (!param || pid < 0)
4803 return -EINVAL;
4804
4805 read_lock(&tasklist_lock);
4806 p = find_process_by_pid(pid);
4807 retval = -ESRCH;
4808 if (!p)
4809 goto out_unlock;
4810
4811 retval = security_task_getscheduler(p);
4812 if (retval)
4813 goto out_unlock;
4814
4815 lp.sched_priority = p->rt_priority;
4816 read_unlock(&tasklist_lock);
4817
4818
4819
4820
4821 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4822
4823 return retval;
4824
4825out_unlock:
4826 read_unlock(&tasklist_lock);
4827 return retval;
4828}
4829
4830long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4831{
4832 cpumask_t cpus_allowed;
4833 struct task_struct *p;
4834 int retval;
4835
4836 get_online_cpus();
4837 read_lock(&tasklist_lock);
4838
4839 p = find_process_by_pid(pid);
4840 if (!p) {
4841 read_unlock(&tasklist_lock);
4842 put_online_cpus();
4843 return -ESRCH;
4844 }
4845
4846
4847
4848
4849
4850
4851 get_task_struct(p);
4852 read_unlock(&tasklist_lock);
4853
4854 retval = -EPERM;
4855 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4856 !capable(CAP_SYS_NICE))
4857 goto out_unlock;
4858
4859 retval = security_task_setscheduler(p, 0, NULL);
4860 if (retval)
4861 goto out_unlock;
4862
4863 cpus_allowed = cpuset_cpus_allowed(p);
4864 cpus_and(new_mask, new_mask, cpus_allowed);
4865 again:
4866 retval = set_cpus_allowed(p, new_mask);
4867
4868 if (!retval) {
4869 cpus_allowed = cpuset_cpus_allowed(p);
4870 if (!cpus_subset(new_mask, cpus_allowed)) {
4871
4872
4873
4874
4875
4876 new_mask = cpus_allowed;
4877 goto again;
4878 }
4879 }
4880out_unlock:
4881 put_task_struct(p);
4882 put_online_cpus();
4883 return retval;
4884}
4885
4886static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4887 cpumask_t *new_mask)
4888{
4889 if (len < sizeof(cpumask_t)) {
4890 memset(new_mask, 0, sizeof(cpumask_t));
4891 } else if (len > sizeof(cpumask_t)) {
4892 len = sizeof(cpumask_t);
4893 }
4894 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4895}
4896
4897
4898
4899
4900
4901
4902
4903asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4904 unsigned long __user *user_mask_ptr)
4905{
4906 cpumask_t new_mask;
4907 int retval;
4908
4909 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4910 if (retval)
4911 return retval;
4912
4913 return sched_setaffinity(pid, new_mask);
4914}
4915
4916
4917
4918
4919
4920
4921
4922
4923cpumask_t cpu_present_map __read_mostly;
4924EXPORT_SYMBOL(cpu_present_map);
4925
4926#ifndef CONFIG_SMP
4927cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4928EXPORT_SYMBOL(cpu_online_map);
4929
4930cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4931EXPORT_SYMBOL(cpu_possible_map);
4932#endif
4933
4934long sched_getaffinity(pid_t pid, cpumask_t *mask)
4935{
4936 struct task_struct *p;
4937 int retval;
4938
4939 get_online_cpus();
4940 read_lock(&tasklist_lock);
4941
4942 retval = -ESRCH;
4943 p = find_process_by_pid(pid);
4944 if (!p)
4945 goto out_unlock;
4946
4947 retval = security_task_getscheduler(p);
4948 if (retval)
4949 goto out_unlock;
4950
4951 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4952
4953out_unlock:
4954 read_unlock(&tasklist_lock);
4955 put_online_cpus();
4956
4957 return retval;
4958}
4959
4960
4961
4962
4963
4964
4965
4966asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4967 unsigned long __user *user_mask_ptr)
4968{
4969 int ret;
4970 cpumask_t mask;
4971
4972 if (len < sizeof(cpumask_t))
4973 return -EINVAL;
4974
4975 ret = sched_getaffinity(pid, &mask);
4976 if (ret < 0)
4977 return ret;
4978
4979 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4980 return -EFAULT;
4981
4982 return sizeof(cpumask_t);
4983}
4984
4985
4986
4987
4988
4989
4990
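/*
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks; if there are no
 * other runnable tasks on this CPU the caller simply continues.
 */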
4991asmlinkage long sys_sched_yield(void)
4992{
4993 struct rq *rq = this_rq_lock();
4994
4995 schedstat_inc(rq, yld_count);
4996 current->sched_class->yield_task(rq);
4997
4998
4999
5000
5001
5002 __release(rq->lock);
5003 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
5004 _raw_spin_unlock(&rq->lock);
5005 preempt_enable_no_resched();
5006
5007 schedule();
5008
5009 return 0;
5010}
5011
5012static void __cond_resched(void)
5013{
5014#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
5015 __might_sleep(__FILE__, __LINE__);
5016#endif
5017
5018
5019
5020
5021
5022 do {
5023 add_preempt_count(PREEMPT_ACTIVE);
5024 schedule();
5025 sub_preempt_count(PREEMPT_ACTIVE);
5026 } while (need_resched());
5027}
5028
5029#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
5030int __sched _cond_resched(void)
5031{
5032 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
5033 system_state == SYSTEM_RUNNING) {
5034 __cond_resched();
5035 return 1;
5036 }
5037 return 0;
5038}
5039EXPORT_SYMBOL(_cond_resched);
5040#endif
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050int cond_resched_lock(spinlock_t *lock)
5051{
5052 int resched = need_resched() && system_state == SYSTEM_RUNNING;
5053 int ret = 0;
5054
5055 if (spin_needbreak(lock) || resched) {
5056 spin_unlock(lock);
5057 if (resched && need_resched())
5058 __cond_resched();
5059 else
5060 cpu_relax();
5061 ret = 1;
5062 spin_lock(lock);
5063 }
5064 return ret;
5065}
5066EXPORT_SYMBOL(cond_resched_lock);
5067
5068int __sched cond_resched_softirq(void)
5069{
5070 BUG_ON(!in_softirq());
5071
5072 if (need_resched() && system_state == SYSTEM_RUNNING) {
5073 local_bh_enable();
5074 __cond_resched();
5075 local_bh_disable();
5076 return 1;
5077 }
5078 return 0;
5079}
5080EXPORT_SYMBOL(cond_resched_softirq);
5081
5082
5083
5084
5085
5086
5087
5088void __sched yield(void)
5089{
5090 set_current_state(TASK_RUNNING);
5091 sys_sched_yield();
5092}
5093EXPORT_SYMBOL(yield);
5094
5095
5096
5097
5098
5099
5100
5101
5102void __sched io_schedule(void)
5103{
5104 struct rq *rq = &__raw_get_cpu_var(runqueues);
5105
5106 delayacct_blkio_start();
5107 atomic_inc(&rq->nr_iowait);
5108 schedule();
5109 atomic_dec(&rq->nr_iowait);
5110 delayacct_blkio_end();
5111}
5112EXPORT_SYMBOL(io_schedule);
5113
5114long __sched io_schedule_timeout(long timeout)
5115{
5116 struct rq *rq = &__raw_get_cpu_var(runqueues);
5117 long ret;
5118
5119 delayacct_blkio_start();
5120 atomic_inc(&rq->nr_iowait);
5121 ret = schedule_timeout(timeout);
5122 atomic_dec(&rq->nr_iowait);
5123 delayacct_blkio_end();
5124 return ret;
5125}
5126
5127
5128
5129
5130
5131
5132
5133
5134asmlinkage long sys_sched_get_priority_max(int policy)
5135{
5136 int ret = -EINVAL;
5137
5138 switch (policy) {
5139 case SCHED_FIFO:
5140 case SCHED_RR:
5141 ret = MAX_USER_RT_PRIO-1;
5142 break;
5143 case SCHED_NORMAL:
5144 case SCHED_BATCH:
5145 case SCHED_IDLE:
5146 ret = 0;
5147 break;
5148 }
5149 return ret;
5150}
5151
5152
5153
5154
5155
5156
5157
5158
5159asmlinkage long sys_sched_get_priority_min(int policy)
5160{
5161 int ret = -EINVAL;
5162
5163 switch (policy) {
5164 case SCHED_FIFO:
5165 case SCHED_RR:
5166 ret = 1;
5167 break;
5168 case SCHED_NORMAL:
5169 case SCHED_BATCH:
5170 case SCHED_IDLE:
5171 ret = 0;
5172 }
5173 return ret;
5174}
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184asmlinkage
5185long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
5186{
5187 struct task_struct *p;
5188 unsigned int time_slice;
5189 int retval;
5190 struct timespec t;
5191
5192 if (pid < 0)
5193 return -EINVAL;
5194
5195 retval = -ESRCH;
5196 read_lock(&tasklist_lock);
5197 p = find_process_by_pid(pid);
5198 if (!p)
5199 goto out_unlock;
5200
5201 retval = security_task_getscheduler(p);
5202 if (retval)
5203 goto out_unlock;
5204
5205
5206
5207
5208
5209 time_slice = 0;
5210 if (p->policy == SCHED_RR) {
5211 time_slice = DEF_TIMESLICE;
5212 } else if (p->policy != SCHED_FIFO) {
5213 struct sched_entity *se = &p->se;
5214 unsigned long flags;
5215 struct rq *rq;
5216
5217 rq = task_rq_lock(p, &flags);
5218 if (rq->cfs.load.weight)
5219 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
5220 task_rq_unlock(rq, &flags);
5221 }
5222 read_unlock(&tasklist_lock);
5223 jiffies_to_timespec(time_slice, &t);
5224 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5225 return retval;
5226
5227out_unlock:
5228 read_unlock(&tasklist_lock);
5229 return retval;
5230}
5231
5232static const char stat_nam[] = "RSDTtZX";
5233
5234void sched_show_task(struct task_struct *p)
5235{
5236 unsigned long free = 0;
5237 unsigned state;
5238
5239 state = p->state ? __ffs(p->state) + 1 : 0;
5240 printk(KERN_INFO "%-13.13s %c", p->comm,
5241 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5242#if BITS_PER_LONG == 32
5243 if (state == TASK_RUNNING)
5244 printk(KERN_CONT " running ");
5245 else
5246 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5247#else
5248 if (state == TASK_RUNNING)
5249 printk(KERN_CONT " running task ");
5250 else
5251 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5252#endif
5253#ifdef CONFIG_DEBUG_STACK_USAGE
5254 {
5255 unsigned long *n = end_of_stack(p);
5256 while (!*n)
5257 n++;
5258 free = (unsigned long)n - (unsigned long)end_of_stack(p);
5259 }
5260#endif
5261 printk(KERN_CONT "%5lu %5d %6d\n", free,
5262 task_pid_nr(p), task_pid_nr(p->real_parent));
5263
5264 show_stack(p, NULL);
5265}
5266
5267void show_state_filter(unsigned long state_filter)
5268{
5269 struct task_struct *g, *p;
5270
5271#if BITS_PER_LONG == 32
5272 printk(KERN_INFO
5273 "  task                PC stack   pid father\n");
5274#else
5275 printk(KERN_INFO
5276 "  task                        PC stack   pid father\n");
5277#endif
5278 read_lock(&tasklist_lock);
5279 do_each_thread(g, p) {
5280
5281
5282
5283
5284 touch_nmi_watchdog();
5285 if (!state_filter || (p->state & state_filter))
5286 sched_show_task(p);
5287 } while_each_thread(g, p);
5288
5289 touch_all_softlockup_watchdogs();
5290
5291#ifdef CONFIG_SCHED_DEBUG
5292 sysrq_sched_debug_show();
5293#endif
5294 read_unlock(&tasklist_lock);
5295
5296
5297
5298 if (state_filter == -1)
5299 debug_show_all_locks();
5300}
5301
5302void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5303{
5304 idle->sched_class = &idle_sched_class;
5305}
5306
5307
5308
5309
5310
5311
5312
5313
5314
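/*
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: CPU the idle task belongs to
 */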
5315void __cpuinit init_idle(struct task_struct *idle, int cpu)
5316{
5317 struct rq *rq = cpu_rq(cpu);
5318 unsigned long flags;
5319
5320 __sched_fork(idle);
5321 idle->se.exec_start = sched_clock();
5322
5323 idle->prio = idle->normal_prio = MAX_PRIO;
5324 idle->cpus_allowed = cpumask_of_cpu(cpu);
5325 __set_task_cpu(idle, cpu);
5326
5327 spin_lock_irqsave(&rq->lock, flags);
5328 rq->curr = rq->idle = idle;
5329#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5330 idle->oncpu = 1;
5331#endif
5332 spin_unlock_irqrestore(&rq->lock, flags);
5333
5334
5335 task_thread_info(idle)->preempt_count = 0;
5336
5337
5338
5339
5340 idle->sched_class = &idle_sched_class;
5341}
5342
5343
5344
5345
5346
5347
5348
5349
5350cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361static inline void sched_init_granularity(void)
5362{
5363 unsigned int factor = 1 + ilog2(num_online_cpus());
5364 const unsigned long limit = 200000000;
5365
5366 sysctl_sched_min_granularity *= factor;
5367 if (sysctl_sched_min_granularity > limit)
5368 sysctl_sched_min_granularity = limit;
5369
5370 sysctl_sched_latency *= factor;
5371 if (sysctl_sched_latency > limit)
5372 sysctl_sched_latency = limit;
5373
5374 sysctl_sched_wakeup_granularity *= factor;
5375 sysctl_sched_batch_wakeup_granularity *= factor;
5376}
5377
5378#ifdef CONFIG_SMP
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
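/*
 * set_cpus_allowed - change a given task's CPU affinity.
 * @p: task to change
 * @new_mask: the new CPU mask
 *
 * If the task is not currently running on an allowed CPU, a migration
 * request is queued and the per-runqueue migration thread is woken to
 * move it; the call waits for that migration to complete.
 */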
5404int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
5405{
5406 struct migration_req req;
5407 unsigned long flags;
5408 struct rq *rq;
5409 int ret = 0;
5410
5411 rq = task_rq_lock(p, &flags);
5412 if (!cpus_intersects(new_mask, cpu_online_map)) {
5413 ret = -EINVAL;
5414 goto out;
5415 }
5416
5417 if (p->sched_class->set_cpus_allowed)
5418 p->sched_class->set_cpus_allowed(p, &new_mask);
5419 else {
5420 p->cpus_allowed = new_mask;
5421 p->rt.nr_cpus_allowed = cpus_weight(new_mask);
5422 }
5423
5424
5425 if (cpu_isset(task_cpu(p), new_mask))
5426 goto out;
5427
5428 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
5429
5430 task_rq_unlock(rq, &flags);
5431 wake_up_process(rq->migration_thread);
5432 wait_for_completion(&req.done);
5433 tlb_migrate_finish(p->mm);
5434 return 0;
5435 }
5436out:
5437 task_rq_unlock(rq, &flags);
5438
5439 return ret;
5440}
5441EXPORT_SYMBOL_GPL(set_cpus_allowed);
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5455{
5456 struct rq *rq_dest, *rq_src;
5457 int ret = 0, on_rq;
5458
5459 if (unlikely(cpu_is_offline(dest_cpu)))
5460 return ret;
5461
5462 rq_src = cpu_rq(src_cpu);
5463 rq_dest = cpu_rq(dest_cpu);
5464
5465 double_rq_lock(rq_src, rq_dest);
5466
5467 if (task_cpu(p) != src_cpu)
5468 goto out;
5469
5470 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5471 goto out;
5472
5473 on_rq = p->se.on_rq;
5474 if (on_rq)
5475 deactivate_task(rq_src, p, 0);
5476
5477 set_task_cpu(p, dest_cpu);
5478 if (on_rq) {
5479 activate_task(rq_dest, p, 0);
5480 check_preempt_curr(rq_dest, p);
5481 }
5482 ret = 1;
5483out:
5484 double_rq_unlock(rq_src, rq_dest);
5485 return ret;
5486}
5487
5488
5489
5490
5491
5492
5493static int migration_thread(void *data)
5494{
5495 int cpu = (long)data;
5496 struct rq *rq;
5497
5498 rq = cpu_rq(cpu);
5499 BUG_ON(rq->migration_thread != current);
5500
5501 set_current_state(TASK_INTERRUPTIBLE);
5502 while (!kthread_should_stop()) {
5503 struct migration_req *req;
5504 struct list_head *head;
5505
5506 spin_lock_irq(&rq->lock);
5507
5508 if (cpu_is_offline(cpu)) {
5509 spin_unlock_irq(&rq->lock);
5510 goto wait_to_die;
5511 }
5512
5513 if (rq->active_balance) {
5514 active_load_balance(rq, cpu);
5515 rq->active_balance = 0;
5516 }
5517
5518 head = &rq->migration_queue;
5519
5520 if (list_empty(head)) {
5521 spin_unlock_irq(&rq->lock);
5522 schedule();
5523 set_current_state(TASK_INTERRUPTIBLE);
5524 continue;
5525 }
5526 req = list_entry(head->next, struct migration_req, list);
5527 list_del_init(head->next);
5528
5529 spin_unlock(&rq->lock);
5530 __migrate_task(req->task, cpu, req->dest_cpu);
5531 local_irq_enable();
5532
5533 complete(&req->done);
5534 }
5535 __set_current_state(TASK_RUNNING);
5536 return 0;
5537
5538wait_to_die:
5539
5540 set_current_state(TASK_INTERRUPTIBLE);
5541 while (!kthread_should_stop()) {
5542 schedule();
5543 set_current_state(TASK_INTERRUPTIBLE);
5544 }
5545 __set_current_state(TASK_RUNNING);
5546 return 0;
5547}
5548
5549#ifdef CONFIG_HOTPLUG_CPU
5550
5551static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
5552{
5553 int ret;
5554
5555 local_irq_disable();
5556 ret = __migrate_task(p, src_cpu, dest_cpu);
5557 local_irq_enable();
5558 return ret;
5559}
5560
5561
5562
5563
5564
5565static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5566{
5567 unsigned long flags;
5568 cpumask_t mask;
5569 struct rq *rq;
5570 int dest_cpu;
5571
5572 do {
5573
5574 mask = node_to_cpumask(cpu_to_node(dead_cpu));
5575 cpus_and(mask, mask, p->cpus_allowed);
5576 dest_cpu = any_online_cpu(mask);
5577
5578
5579 if (dest_cpu == NR_CPUS)
5580 dest_cpu = any_online_cpu(p->cpus_allowed);
5581
5582
5583 if (dest_cpu == NR_CPUS) {
5584 cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p);
5585
5586
5587
5588
5589
5590
5591
5592 rq = task_rq_lock(p, &flags);
5593 p->cpus_allowed = cpus_allowed;
5594 dest_cpu = any_online_cpu(p->cpus_allowed);
5595 task_rq_unlock(rq, &flags);
5596
5597
5598
5599
5600
5601
5602 if (p->mm && printk_ratelimit()) {
5603 printk(KERN_INFO "process %d (%s) no "
5604 "longer affine to cpu%d\n",
5605 task_pid_nr(p), p->comm, dead_cpu);
5606 }
5607 }
5608 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
5609}
5610
5611
5612
5613
5614
5615
5616
5617
5618static void migrate_nr_uninterruptible(struct rq *rq_src)
5619{
5620 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5621 unsigned long flags;
5622
5623 local_irq_save(flags);
5624 double_rq_lock(rq_src, rq_dest);
5625 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5626 rq_src->nr_uninterruptible = 0;
5627 double_rq_unlock(rq_src, rq_dest);
5628 local_irq_restore(flags);
5629}
5630
5631
5632static void migrate_live_tasks(int src_cpu)
5633{
5634 struct task_struct *p, *t;
5635
5636 read_lock(&tasklist_lock);
5637
5638 do_each_thread(t, p) {
5639 if (p == current)
5640 continue;
5641
5642 if (task_cpu(p) == src_cpu)
5643 move_task_off_dead_cpu(src_cpu, p);
5644 } while_each_thread(t, p);
5645
5646 read_unlock(&tasklist_lock);
5647}
5648
5649
5650
5651
5652
5653
5654void sched_idle_next(void)
5655{
5656 int this_cpu = smp_processor_id();
5657 struct rq *rq = cpu_rq(this_cpu);
5658 struct task_struct *p = rq->idle;
5659 unsigned long flags;
5660
5661
5662 BUG_ON(cpu_online(this_cpu));
5663
5664
5665
5666
5667
5668 spin_lock_irqsave(&rq->lock, flags);
5669
5670 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5671
5672 update_rq_clock(rq);
5673 activate_task(rq, p, 0);
5674
5675 spin_unlock_irqrestore(&rq->lock, flags);
5676}
5677
5678
5679
5680
5681
5682void idle_task_exit(void)
5683{
5684 struct mm_struct *mm = current->active_mm;
5685
5686 BUG_ON(cpu_online(smp_processor_id()));
5687
5688 if (mm != &init_mm)
5689 switch_mm(mm, &init_mm, current);
5690 mmdrop(mm);
5691}
5692
5693
5694static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5695{
5696 struct rq *rq = cpu_rq(dead_cpu);
5697
5698
5699 BUG_ON(!p->exit_state);
5700
5701
5702 BUG_ON(p->state == TASK_DEAD);
5703
5704 get_task_struct(p);
5705
	/*
	 * Drop the runqueue lock while migrating the task; if someone
	 * else moves it first, that is fine.  No new task can be added
	 * to this dead CPU, so the caller's iteration stays safe.
	 */
5711 spin_unlock_irq(&rq->lock);
5712 move_task_off_dead_cpu(dead_cpu, p);
5713 spin_lock_irq(&rq->lock);
5714
5715 put_task_struct(p);
5716}
5717
5718
5719static void migrate_dead_tasks(unsigned int dead_cpu)
5720{
5721 struct rq *rq = cpu_rq(dead_cpu);
5722 struct task_struct *next;
5723
5724 for ( ; ; ) {
5725 if (!rq->nr_running)
5726 break;
5727 update_rq_clock(rq);
5728 next = pick_next_task(rq, rq->curr);
5729 if (!next)
5730 break;
5731 migrate_dead(dead_cpu, next);
5732
5733 }
5734}
5735#endif
5736
5737#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5738
5739static struct ctl_table sd_ctl_dir[] = {
5740 {
5741 .procname = "sched_domain",
5742 .mode = 0555,
5743 },
5744 {0, },
5745};
5746
5747static struct ctl_table sd_ctl_root[] = {
5748 {
5749 .ctl_name = CTL_KERN,
5750 .procname = "kernel",
5751 .mode = 0555,
5752 .child = sd_ctl_dir,
5753 },
5754 {0, },
5755};
5756
5757static struct ctl_table *sd_alloc_ctl_entry(int n)
5758{
5759 struct ctl_table *entry =
5760 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5761
5762 return entry;
5763}
5764
5765static void sd_free_ctl_entry(struct ctl_table **tablep)
5766{
5767 struct ctl_table *entry;
5768
5769
	/*
	 * In the intermediate directories both the child table and the
	 * procname were allocated dynamically; the leaf entries use
	 * static procnames and always have a proc_handler, hence the
	 * proc_handler == NULL test below.
	 */
5775 for (entry = *tablep; entry->mode; entry++) {
5776 if (entry->child)
5777 sd_free_ctl_entry(&entry->child);
5778 if (entry->proc_handler == NULL)
5779 kfree(entry->procname);
5780 }
5781
5782 kfree(*tablep);
5783 *tablep = NULL;
5784}
5785
5786static void
5787set_table_entry(struct ctl_table *entry,
5788 const char *procname, void *data, int maxlen,
5789 mode_t mode, proc_handler *proc_handler)
5790{
5791 entry->procname = procname;
5792 entry->data = data;
5793 entry->maxlen = maxlen;
5794 entry->mode = mode;
5795 entry->proc_handler = proc_handler;
5796}
5797
5798static struct ctl_table *
5799sd_alloc_ctl_domain_table(struct sched_domain *sd)
5800{
5801 struct ctl_table *table = sd_alloc_ctl_entry(12);
5802
5803 if (table == NULL)
5804 return NULL;
5805
5806 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5807 sizeof(long), 0644, proc_doulongvec_minmax);
5808 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5809 sizeof(long), 0644, proc_doulongvec_minmax);
5810 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5811 sizeof(int), 0644, proc_dointvec_minmax);
5812 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5813 sizeof(int), 0644, proc_dointvec_minmax);
5814 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5815 sizeof(int), 0644, proc_dointvec_minmax);
5816 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5817 sizeof(int), 0644, proc_dointvec_minmax);
5818 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5819 sizeof(int), 0644, proc_dointvec_minmax);
5820 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5821 sizeof(int), 0644, proc_dointvec_minmax);
5822 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5823 sizeof(int), 0644, proc_dointvec_minmax);
5824 set_table_entry(&table[9], "cache_nice_tries",
5825 &sd->cache_nice_tries,
5826 sizeof(int), 0644, proc_dointvec_minmax);
5827 set_table_entry(&table[10], "flags", &sd->flags,
5828 sizeof(int), 0644, proc_dointvec_minmax);
	/* &table[11] is left zeroed and terminates the table */

5831 return table;
5832}
5833
5834static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5835{
5836 struct ctl_table *entry, *table;
5837 struct sched_domain *sd;
5838 int domain_num = 0, i;
5839 char buf[32];
5840
5841 for_each_domain(cpu, sd)
5842 domain_num++;
5843 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5844 if (table == NULL)
5845 return NULL;
5846
5847 i = 0;
5848 for_each_domain(cpu, sd) {
5849 snprintf(buf, 32, "domain%d", i);
5850 entry->procname = kstrdup(buf, GFP_KERNEL);
5851 entry->mode = 0555;
5852 entry->child = sd_alloc_ctl_domain_table(sd);
5853 entry++;
5854 i++;
5855 }
5856 return table;
5857}
5858
5859static struct ctl_table_header *sd_sysctl_header;
5860static void register_sched_domain_sysctl(void)
5861{
5862 int i, cpu_num = num_online_cpus();
5863 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5864 char buf[32];
5865
5866 WARN_ON(sd_ctl_dir[0].child);
5867 sd_ctl_dir[0].child = entry;
5868
5869 if (entry == NULL)
5870 return;
5871
5872 for_each_online_cpu(i) {
5873 snprintf(buf, 32, "cpu%d", i);
5874 entry->procname = kstrdup(buf, GFP_KERNEL);
5875 entry->mode = 0555;
5876 entry->child = sd_alloc_ctl_cpu_table(i);
5877 entry++;
5878 }
5879
5880 WARN_ON(sd_sysctl_header);
5881 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5882}
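
/*
 * The table registered above shows up under /proc/sys and can be tuned
 * at run time, e.g. (illustrative paths; the number of domain levels
 * depends on the machine topology):
 *
 *	/proc/sys/kernel/sched_domain/cpu0/domain0/min_interval
 *	/proc/sys/kernel/sched_domain/cpu0/domain1/flags
 */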
5883
5884
5885static void unregister_sched_domain_sysctl(void)
5886{
5887 if (sd_sysctl_header)
5888 unregister_sysctl_table(sd_sysctl_header);
5889 sd_sysctl_header = NULL;
5890 if (sd_ctl_dir[0].child)
5891 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5892}
5893#else
5894static void register_sched_domain_sysctl(void)
5895{
5896}
5897static void unregister_sched_domain_sysctl(void)
5898{
5899}
5900#endif
5901
5902
5903
5904
5905
5906static int __cpuinit
5907migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5908{
5909 struct task_struct *p;
5910 int cpu = (long)hcpu;
5911 unsigned long flags;
5912 struct rq *rq;
5913
5914 switch (action) {
5915
5916 case CPU_UP_PREPARE:
5917 case CPU_UP_PREPARE_FROZEN:
5918 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5919 if (IS_ERR(p))
5920 return NOTIFY_BAD;
5921 kthread_bind(p, cpu);
5922
5923 rq = task_rq_lock(p, &flags);
5924 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5925 task_rq_unlock(rq, &flags);
5926 cpu_rq(cpu)->migration_thread = p;
5927 break;
5928
5929 case CPU_ONLINE:
5930 case CPU_ONLINE_FROZEN:
5931
5932 wake_up_process(cpu_rq(cpu)->migration_thread);
5933
5934
5935 rq = cpu_rq(cpu);
5936 spin_lock_irqsave(&rq->lock, flags);
5937 if (rq->rd) {
5938 BUG_ON(!cpu_isset(cpu, rq->rd->span));
5939 cpu_set(cpu, rq->rd->online);
5940 }
5941 spin_unlock_irqrestore(&rq->lock, flags);
5942 break;
5943
5944#ifdef CONFIG_HOTPLUG_CPU
5945 case CPU_UP_CANCELED:
5946 case CPU_UP_CANCELED_FROZEN:
5947 if (!cpu_rq(cpu)->migration_thread)
5948 break;
5949
5950 kthread_bind(cpu_rq(cpu)->migration_thread,
5951 any_online_cpu(cpu_online_map));
5952 kthread_stop(cpu_rq(cpu)->migration_thread);
5953 cpu_rq(cpu)->migration_thread = NULL;
5954 break;
5955
5956 case CPU_DEAD:
5957 case CPU_DEAD_FROZEN:
5958 cpuset_lock();
5959 migrate_live_tasks(cpu);
5960 rq = cpu_rq(cpu);
5961 kthread_stop(rq->migration_thread);
5962 rq->migration_thread = NULL;
5963
5964 spin_lock_irq(&rq->lock);
5965 update_rq_clock(rq);
5966 deactivate_task(rq, rq->idle, 0);
5967 rq->idle->static_prio = MAX_PRIO;
5968 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5969 rq->idle->sched_class = &idle_sched_class;
5970 migrate_dead_tasks(cpu);
5971 spin_unlock_irq(&rq->lock);
5972 cpuset_unlock();
5973 migrate_nr_uninterruptible(rq);
5974 BUG_ON(rq->nr_running != 0);
5975
5976
5977
5978
5979
5980
5981 spin_lock_irq(&rq->lock);
5982 while (!list_empty(&rq->migration_queue)) {
5983 struct migration_req *req;
5984
5985 req = list_entry(rq->migration_queue.next,
5986 struct migration_req, list);
5987 list_del_init(&req->list);
5988 complete(&req->done);
5989 }
5990 spin_unlock_irq(&rq->lock);
5991 break;
5992
5993 case CPU_DYING:
5994 case CPU_DYING_FROZEN:
5995
5996 rq = cpu_rq(cpu);
5997 spin_lock_irqsave(&rq->lock, flags);
5998 if (rq->rd) {
5999 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6000 cpu_clear(cpu, rq->rd->online);
6001 }
6002 spin_unlock_irqrestore(&rq->lock, flags);
6003 break;
6004#endif
6005 }
6006 return NOTIFY_OK;
6007}
6008
/*
 * Register at elevated priority so the migration callback runs before
 * other CPU-hotplug notifiers and the migration thread is available
 * early.
 */
6012static struct notifier_block __cpuinitdata migration_notifier = {
6013 .notifier_call = migration_call,
6014 .priority = 10
6015};
6016
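/*
 * migration_init - create and start the migration thread for the boot
 * CPU by hand, then register the hotplug notifier so that later CPUs
 * get their threads from the notifier path.  Called once at boot.
 */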
6017void __init migration_init(void)
6018{
6019 void *cpu = (void *)(long)smp_processor_id();
6020 int err;
6021
6022
6023 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6024 BUG_ON(err == NOTIFY_BAD);
6025 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6026 register_cpu_notifier(&migration_notifier);
6027}
6028#endif
6029
6030#ifdef CONFIG_SMP
6031
6032
6033int nr_cpu_ids __read_mostly = NR_CPUS;
6034EXPORT_SYMBOL(nr_cpu_ids);
6035
6036#ifdef CONFIG_SCHED_DEBUG
6037
6038static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level)
6039{
6040 struct sched_group *group = sd->groups;
6041 cpumask_t groupmask;
6042 char str[NR_CPUS];
6043
6044 cpumask_scnprintf(str, NR_CPUS, sd->span);
6045 cpus_clear(groupmask);
6046
6047 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6048
6049 if (!(sd->flags & SD_LOAD_BALANCE)) {
6050 printk("does not load-balance\n");
6051 if (sd->parent)
			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
					" has parent\n");
6054 return -1;
6055 }
6056
6057 printk(KERN_CONT "span %s\n", str);
6058
6059 if (!cpu_isset(cpu, sd->span)) {
6060 printk(KERN_ERR "ERROR: domain->span does not contain "
6061 "CPU%d\n", cpu);
6062 }
6063 if (!cpu_isset(cpu, group->cpumask)) {
6064 printk(KERN_ERR "ERROR: domain->groups does not contain"
6065 " CPU%d\n", cpu);
6066 }
6067
6068 printk(KERN_DEBUG "%*s groups:", level + 1, "");
6069 do {
6070 if (!group) {
6071 printk("\n");
6072 printk(KERN_ERR "ERROR: group is NULL\n");
6073 break;
6074 }
6075
6076 if (!group->__cpu_power) {
6077 printk(KERN_CONT "\n");
6078 printk(KERN_ERR "ERROR: domain->cpu_power not "
6079 "set\n");
6080 break;
6081 }
6082
6083 if (!cpus_weight(group->cpumask)) {
6084 printk(KERN_CONT "\n");
6085 printk(KERN_ERR "ERROR: empty group\n");
6086 break;
6087 }
6088
6089 if (cpus_intersects(groupmask, group->cpumask)) {
6090 printk(KERN_CONT "\n");
6091 printk(KERN_ERR "ERROR: repeated CPUs\n");
6092 break;
6093 }
6094
6095 cpus_or(groupmask, groupmask, group->cpumask);
6096
6097 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
6098 printk(KERN_CONT " %s", str);
6099
6100 group = group->next;
6101 } while (group != sd->groups);
6102 printk(KERN_CONT "\n");
6103
6104 if (!cpus_equal(sd->span, groupmask))
6105 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6106
6107 if (sd->parent && !cpus_subset(groupmask, sd->parent->span))
6108 printk(KERN_ERR "ERROR: parent span is not a superset "
6109 "of domain->span\n");
6110 return 0;
6111}
6112
6113static void sched_domain_debug(struct sched_domain *sd, int cpu)
6114{
6115 int level = 0;
6116
6117 if (!sd) {
6118 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6119 return;
6120 }
6121
6122 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6123
6124 for (;;) {
6125 if (sched_domain_debug_one(sd, cpu, level))
6126 break;
6127 level++;
6128 sd = sd->parent;
6129 if (!sd)
6130 break;
6131 }
6132}
6133#else
6134# define sched_domain_debug(sd, cpu) do { } while (0)
6135#endif
6136
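/*
 * A sched domain is "degenerate" when attaching it would not change
 * scheduling behaviour: it spans a single CPU, or it has no wake-up
 * flags and its balancing flags are moot because it contains only one
 * group.
 */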
6137static int sd_degenerate(struct sched_domain *sd)
6138{
6139 if (cpus_weight(sd->span) == 1)
6140 return 1;
6141
6142
6143 if (sd->flags & (SD_LOAD_BALANCE |
6144 SD_BALANCE_NEWIDLE |
6145 SD_BALANCE_FORK |
6146 SD_BALANCE_EXEC |
6147 SD_SHARE_CPUPOWER |
6148 SD_SHARE_PKG_RESOURCES)) {
6149 if (sd->groups != sd->groups->next)
6150 return 0;
6151 }
6152
6153
6154 if (sd->flags & (SD_WAKE_IDLE |
6155 SD_WAKE_AFFINE |
6156 SD_WAKE_BALANCE))
6157 return 0;
6158
6159 return 1;
6160}
6161
6162static int
6163sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6164{
6165 unsigned long cflags = sd->flags, pflags = parent->flags;
6166
6167 if (sd_degenerate(parent))
6168 return 1;
6169
6170 if (!cpus_equal(sd->span, parent->span))
6171 return 0;
6172
6173
6174
6175 if (cflags & SD_WAKE_AFFINE)
6176 pflags &= ~SD_WAKE_BALANCE;
6177
6178 if (parent->groups == parent->groups->next) {
6179 pflags &= ~(SD_LOAD_BALANCE |
6180 SD_BALANCE_NEWIDLE |
6181 SD_BALANCE_FORK |
6182 SD_BALANCE_EXEC |
6183 SD_SHARE_CPUPOWER |
6184 SD_SHARE_PKG_RESOURCES);
6185 }
6186 if (~cflags & pflags)
6187 return 0;
6188
6189 return 1;
6190}
6191
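/*
 * Attach the runqueue to the root domain 'rd': leave the old root
 * domain (dropping its reference and freeing it if this was the last
 * user), then join the new one and update its span/online masks.
 */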
6192static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6193{
6194 unsigned long flags;
6195 const struct sched_class *class;
6196
6197 spin_lock_irqsave(&rq->lock, flags);
6198
6199 if (rq->rd) {
6200 struct root_domain *old_rd = rq->rd;
6201
6202 for (class = sched_class_highest; class; class = class->next) {
6203 if (class->leave_domain)
6204 class->leave_domain(rq);
6205 }
6206
6207 cpu_clear(rq->cpu, old_rd->span);
6208 cpu_clear(rq->cpu, old_rd->online);
6209
6210 if (atomic_dec_and_test(&old_rd->refcount))
6211 kfree(old_rd);
6212 }
6213
6214 atomic_inc(&rd->refcount);
6215 rq->rd = rd;
6216
6217 cpu_set(rq->cpu, rd->span);
6218 if (cpu_isset(rq->cpu, cpu_online_map))
6219 cpu_set(rq->cpu, rd->online);
6220
6221 for (class = sched_class_highest; class; class = class->next) {
6222 if (class->join_domain)
6223 class->join_domain(rq);
6224 }
6225
6226 spin_unlock_irqrestore(&rq->lock, flags);
6227}
6228
6229static void init_rootdomain(struct root_domain *rd)
6230{
6231 memset(rd, 0, sizeof(*rd));
6232
6233 cpus_clear(rd->span);
6234 cpus_clear(rd->online);
6235}
6236
6237static void init_defrootdomain(void)
6238{
6239 init_rootdomain(&def_root_domain);
6240 atomic_set(&def_root_domain.refcount, 1);
6241}
6242
6243static struct root_domain *alloc_rootdomain(void)
6244{
6245 struct root_domain *rd;
6246
6247 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6248 if (!rd)
6249 return NULL;
6250
6251 init_rootdomain(rd);
6252
6253 return rd;
6254}
6255
6256
/*
 * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
 * hold the hotplug lock.
 */
6260static void
6261cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6262{
6263 struct rq *rq = cpu_rq(cpu);
6264 struct sched_domain *tmp;
6265
6266
6267 for (tmp = sd; tmp; tmp = tmp->parent) {
6268 struct sched_domain *parent = tmp->parent;
6269 if (!parent)
6270 break;
6271 if (sd_parent_degenerate(tmp, parent)) {
6272 tmp->parent = parent->parent;
6273 if (parent->parent)
6274 parent->parent->child = tmp;
6275 }
6276 }
6277
6278 if (sd && sd_degenerate(sd)) {
6279 sd = sd->parent;
6280 if (sd)
6281 sd->child = NULL;
6282 }
6283
6284 sched_domain_debug(sd, cpu);
6285
6286 rq_attach_root(rq, rd);
6287 rcu_assign_pointer(rq->sd, sd);
6288}
6289
6290
6291static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
6292
6293
6294static int __init isolated_cpu_setup(char *str)
6295{
6296 int ints[NR_CPUS], i;
6297
6298 str = get_options(str, ARRAY_SIZE(ints), ints);
6299 cpus_clear(cpu_isolated_map);
6300 for (i = 1; i <= ints[0]; i++)
6301 if (ints[i] < NR_CPUS)
6302 cpu_set(ints[i], cpu_isolated_map);
6303 return 1;
6304}
6305
6306__setup("isolcpus=", isolated_cpu_setup);
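
/*
 * Example (boot command line, illustrative):
 *
 *	isolcpus=2,3
 *
 * keeps CPUs 2 and 3 out of the sched-domain hierarchy, so the
 * scheduler never balances tasks onto them; only tasks explicitly bound
 * to those CPUs will run there.
 */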
6307
6308
/*
 * init_sched_build_groups takes the span we wish to cover and a
 * group_fn which maps each CPU (and, optionally, its sched_group) to
 * the group it belongs to.  It links the groups covering the span into
 * a circular list and initializes each group's ->cpumask and
 * ->__cpu_power.
 */
6318static void
6319init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
6320 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
6321 struct sched_group **sg))
6322{
6323 struct sched_group *first = NULL, *last = NULL;
6324 cpumask_t covered = CPU_MASK_NONE;
6325 int i;
6326
6327 for_each_cpu_mask(i, span) {
6328 struct sched_group *sg;
6329 int group = group_fn(i, cpu_map, &sg);
6330 int j;
6331
6332 if (cpu_isset(i, covered))
6333 continue;
6334
6335 sg->cpumask = CPU_MASK_NONE;
6336 sg->__cpu_power = 0;
6337
6338 for_each_cpu_mask(j, span) {
6339 if (group_fn(j, cpu_map, NULL) != group)
6340 continue;
6341
6342 cpu_set(j, covered);
6343 cpu_set(j, sg->cpumask);
6344 }
6345 if (!first)
6346 first = sg;
6347 if (last)
6348 last->next = sg;
6349 last = sg;
6350 }
6351 last->next = first;
6352}
6353
6354#define SD_NODES_PER_DOMAIN 16
6355
6356#ifdef CONFIG_NUMA
6357
/**
 * find_next_best_node - find the next node to include in a sched_domain
 * @node: node whose sched_domain we're building
 * @used_nodes: nodes already in the sched_domain
 *
 * Find the next node to include in a given scheduling domain.  Simply
 * finds the closest node not already in the @used_nodes map.
 */
6368static int find_next_best_node(int node, unsigned long *used_nodes)
6369{
6370 int i, n, val, min_val, best_node = 0;
6371
6372 min_val = INT_MAX;
6373
6374 for (i = 0; i < MAX_NUMNODES; i++) {
6375
6376 n = (node + i) % MAX_NUMNODES;
6377
6378 if (!nr_cpus_node(n))
6379 continue;
6380
6381
6382 if (test_bit(n, used_nodes))
6383 continue;
6384
6385
6386 val = node_distance(node, n);
6387
6388 if (val < min_val) {
6389 min_val = val;
6390 best_node = n;
6391 }
6392 }
6393
6394 set_bit(best_node, used_nodes);
6395 return best_node;
6396}
6397
/**
 * sched_domain_node_span - get a cpumask for a node's sched_domain
 * @node: node whose cpumask we're constructing
 *
 * Given a node, construct a good cpumask for its sched_domain to span:
 * one that avoids unnecessary balancing but still spreads tasks out
 * over the nearby nodes.
 */
6407static cpumask_t sched_domain_node_span(int node)
6408{
6409 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
6410 cpumask_t span, nodemask;
6411 int i;
6412
6413 cpus_clear(span);
6414 bitmap_zero(used_nodes, MAX_NUMNODES);
6415
6416 nodemask = node_to_cpumask(node);
6417 cpus_or(span, span, nodemask);
6418 set_bit(node, used_nodes);
6419
6420 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6421 int next_node = find_next_best_node(node, used_nodes);
6422
6423 nodemask = node_to_cpumask(next_node);
6424 cpus_or(span, span, nodemask);
6425 }
6426
6427 return span;
6428}
6429#endif
6430
6431int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6432
6433
6434
6435
6436#ifdef CONFIG_SCHED_SMT
6437static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6438static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
6439
6440static int
6441cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
6442{
6443 if (sg)
6444 *sg = &per_cpu(sched_group_cpus, cpu);
6445 return cpu;
6446}
6447#endif
6448
6449
6450
6451
6452#ifdef CONFIG_SCHED_MC
6453static DEFINE_PER_CPU(struct sched_domain, core_domains);
6454static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6455#endif
6456
6457#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6458static int
6459cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
6460{
6461 int group;
6462 cpumask_t mask = per_cpu(cpu_sibling_map, cpu);
6463 cpus_and(mask, mask, *cpu_map);
6464 group = first_cpu(mask);
6465 if (sg)
6466 *sg = &per_cpu(sched_group_core, group);
6467 return group;
6468}
6469#elif defined(CONFIG_SCHED_MC)
6470static int
6471cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
6472{
6473 if (sg)
6474 *sg = &per_cpu(sched_group_core, cpu);
6475 return cpu;
6476}
6477#endif
6478
6479static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6480static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
6481
6482static int
6483cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
6484{
6485 int group;
6486#ifdef CONFIG_SCHED_MC
6487 cpumask_t mask = cpu_coregroup_map(cpu);
6488 cpus_and(mask, mask, *cpu_map);
6489 group = first_cpu(mask);
6490#elif defined(CONFIG_SCHED_SMT)
6491 cpumask_t mask = per_cpu(cpu_sibling_map, cpu);
6492 cpus_and(mask, mask, *cpu_map);
6493 group = first_cpu(mask);
6494#else
6495 group = cpu;
6496#endif
6497 if (sg)
6498 *sg = &per_cpu(sched_group_phys, group);
6499 return group;
6500}
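
/*
 * Each cpu_to_*_group() helper above maps a CPU to the group it belongs
 * to at one topology level (identified by the group's first CPU) and,
 * when 'sg' is non-NULL, also returns the per-cpu sched_group for it.
 */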
6501
6502#ifdef CONFIG_NUMA
/*
 * init_sched_build_groups() can't handle what we want to do with the
 * NUMA node groups, so roll our own: each node gets its own dynamically
 * allocated list of groups.
 */
6508static DEFINE_PER_CPU(struct sched_domain, node_domains);
6509static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
6510
6511static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6512static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
6513
6514static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
6515 struct sched_group **sg)
6516{
6517 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
6518 int group;
6519
6520 cpus_and(nodemask, nodemask, *cpu_map);
6521 group = first_cpu(nodemask);
6522
6523 if (sg)
6524 *sg = &per_cpu(sched_group_allnodes, group);
6525 return group;
6526}
6527
6528static void init_numa_sched_groups_power(struct sched_group *group_head)
6529{
6530 struct sched_group *sg = group_head;
6531 int j;
6532
6533 if (!sg)
6534 return;
6535 do {
6536 for_each_cpu_mask(j, sg->cpumask) {
6537 struct sched_domain *sd;
6538
6539 sd = &per_cpu(phys_domains, j);
6540 if (j != first_cpu(sd->groups->cpumask)) {
6541
6542
6543
6544
6545 continue;
6546 }
6547
6548 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
6549 }
6550 sg = sg->next;
6551 } while (sg != group_head);
6552}
6553#endif
6554
6555#ifdef CONFIG_NUMA
6556
6557static void free_sched_groups(const cpumask_t *cpu_map)
6558{
6559 int cpu, i;
6560
6561 for_each_cpu_mask(cpu, *cpu_map) {
6562 struct sched_group **sched_group_nodes
6563 = sched_group_nodes_bycpu[cpu];
6564
6565 if (!sched_group_nodes)
6566 continue;
6567
6568 for (i = 0; i < MAX_NUMNODES; i++) {
6569 cpumask_t nodemask = node_to_cpumask(i);
6570 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6571
6572 cpus_and(nodemask, nodemask, *cpu_map);
6573 if (cpus_empty(nodemask))
6574 continue;
6575
6576 if (sg == NULL)
6577 continue;
6578 sg = sg->next;
6579next_sg:
6580 oldsg = sg;
6581 sg = sg->next;
6582 kfree(oldsg);
6583 if (oldsg != sched_group_nodes[i])
6584 goto next_sg;
6585 }
6586 kfree(sched_group_nodes);
6587 sched_group_nodes_bycpu[cpu] = NULL;
6588 }
6589}
6590#else
6591static void free_sched_groups(const cpumask_t *cpu_map)
6592{
6593}
6594#endif
6595
/*
 * Initialize sched groups' cpu_power.
 *
 * cpu_power indicates the capacity of a sched group, which is used
 * while distributing load between the groups of a sched domain.
 * Typically every group in a domain gets the same cpu_power unless the
 * topology is asymmetric; a group with more cpu_power picks up
 * proportionally more load.
 *
 * cpu_power is a multiple of SCHED_LOAD_SCALE; the multiple represents
 * the number of tasks the group can handle while other groups in the
 * same domain are idle or lightly loaded.
 */
6610static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6611{
6612 struct sched_domain *child;
6613 struct sched_group *group;
6614
6615 WARN_ON(!sd || !sd->groups);
6616
6617 if (cpu != first_cpu(sd->groups->cpumask))
6618 return;
6619
6620 child = sd->child;
6621
6622 sd->groups->__cpu_power = 0;
6623
	/*
	 * If there is no child domain, or the child's groups share CPU
	 * power or package resources (SMT siblings, shared cache) and we
	 * are not balancing for power savings, give this group a single
	 * SCHED_LOAD_SCALE of power so it is treated as able to run one
	 * task while other groups in the domain are idle.
	 */
6631 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
6632 (child->flags &
6633 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
6634 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
6635 return;
6636 }
6637
6638
6639
6640
6641 group = child->groups;
6642 do {
6643 sg_inc_cpu_power(sd->groups, group->__cpu_power);
6644 group = group->next;
6645 } while (group != child->groups);
6646}
6647
6648
/*
 * Build sched domains for the given set of CPUs and attach them to the
 * individual CPUs.
 */
6652static int build_sched_domains(const cpumask_t *cpu_map)
6653{
6654 int i;
6655 struct root_domain *rd;
6656#ifdef CONFIG_NUMA
6657 struct sched_group **sched_group_nodes = NULL;
6658 int sd_allnodes = 0;
6659
6660
6661
6662
6663 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
6664 GFP_KERNEL);
6665 if (!sched_group_nodes) {
		printk(KERN_WARNING "Cannot allocate sched group node list\n");
6667 return -ENOMEM;
6668 }
6669 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
6670#endif
6671
6672 rd = alloc_rootdomain();
6673 if (!rd) {
6674 printk(KERN_WARNING "Cannot alloc root domain\n");
6675 return -ENOMEM;
6676 }
6677
6678
6679
6680
6681 for_each_cpu_mask(i, *cpu_map) {
6682 struct sched_domain *sd = NULL, *p;
6683 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
6684
6685 cpus_and(nodemask, nodemask, *cpu_map);
6686
6687#ifdef CONFIG_NUMA
6688 if (cpus_weight(*cpu_map) >
6689 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6690 sd = &per_cpu(allnodes_domains, i);
6691 *sd = SD_ALLNODES_INIT;
6692 sd->span = *cpu_map;
6693 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
6694 p = sd;
6695 sd_allnodes = 1;
6696 } else
6697 p = NULL;
6698
6699 sd = &per_cpu(node_domains, i);
6700 *sd = SD_NODE_INIT;
6701 sd->span = sched_domain_node_span(cpu_to_node(i));
6702 sd->parent = p;
6703 if (p)
6704 p->child = sd;
6705 cpus_and(sd->span, sd->span, *cpu_map);
6706#endif
6707
6708 p = sd;
6709 sd = &per_cpu(phys_domains, i);
6710 *sd = SD_CPU_INIT;
6711 sd->span = nodemask;
6712 sd->parent = p;
6713 if (p)
6714 p->child = sd;
6715 cpu_to_phys_group(i, cpu_map, &sd->groups);
6716
6717#ifdef CONFIG_SCHED_MC
6718 p = sd;
6719 sd = &per_cpu(core_domains, i);
6720 *sd = SD_MC_INIT;
6721 sd->span = cpu_coregroup_map(i);
6722 cpus_and(sd->span, sd->span, *cpu_map);
6723 sd->parent = p;
6724 p->child = sd;
6725 cpu_to_core_group(i, cpu_map, &sd->groups);
6726#endif
6727
6728#ifdef CONFIG_SCHED_SMT
6729 p = sd;
6730 sd = &per_cpu(cpu_domains, i);
6731 *sd = SD_SIBLING_INIT;
6732 sd->span = per_cpu(cpu_sibling_map, i);
6733 cpus_and(sd->span, sd->span, *cpu_map);
6734 sd->parent = p;
6735 p->child = sd;
6736 cpu_to_cpu_group(i, cpu_map, &sd->groups);
6737#endif
6738 }
6739
6740#ifdef CONFIG_SCHED_SMT
6741
6742 for_each_cpu_mask(i, *cpu_map) {
6743 cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i);
6744 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
6745 if (i != first_cpu(this_sibling_map))
6746 continue;
6747
6748 init_sched_build_groups(this_sibling_map, cpu_map,
6749 &cpu_to_cpu_group);
6750 }
6751#endif
6752
6753#ifdef CONFIG_SCHED_MC
6754
6755 for_each_cpu_mask(i, *cpu_map) {
6756 cpumask_t this_core_map = cpu_coregroup_map(i);
6757 cpus_and(this_core_map, this_core_map, *cpu_map);
6758 if (i != first_cpu(this_core_map))
6759 continue;
6760 init_sched_build_groups(this_core_map, cpu_map,
6761 &cpu_to_core_group);
6762 }
6763#endif
6764
6765
6766 for (i = 0; i < MAX_NUMNODES; i++) {
6767 cpumask_t nodemask = node_to_cpumask(i);
6768
6769 cpus_and(nodemask, nodemask, *cpu_map);
6770 if (cpus_empty(nodemask))
6771 continue;
6772
6773 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
6774 }
6775
6776#ifdef CONFIG_NUMA
6777
6778 if (sd_allnodes)
6779 init_sched_build_groups(*cpu_map, cpu_map,
6780 &cpu_to_allnodes_group);
6781
6782 for (i = 0; i < MAX_NUMNODES; i++) {
6783
6784 struct sched_group *sg, *prev;
6785 cpumask_t nodemask = node_to_cpumask(i);
6786 cpumask_t domainspan;
6787 cpumask_t covered = CPU_MASK_NONE;
6788 int j;
6789
6790 cpus_and(nodemask, nodemask, *cpu_map);
6791 if (cpus_empty(nodemask)) {
6792 sched_group_nodes[i] = NULL;
6793 continue;
6794 }
6795
6796 domainspan = sched_domain_node_span(i);
6797 cpus_and(domainspan, domainspan, *cpu_map);
6798
6799 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6800 if (!sg) {
6801 printk(KERN_WARNING "Can not alloc domain group for "
6802 "node %d\n", i);
6803 goto error;
6804 }
6805 sched_group_nodes[i] = sg;
6806 for_each_cpu_mask(j, nodemask) {
6807 struct sched_domain *sd;
6808
6809 sd = &per_cpu(node_domains, j);
6810 sd->groups = sg;
6811 }
6812 sg->__cpu_power = 0;
6813 sg->cpumask = nodemask;
6814 sg->next = sg;
6815 cpus_or(covered, covered, nodemask);
6816 prev = sg;
6817
6818 for (j = 0; j < MAX_NUMNODES; j++) {
6819 cpumask_t tmp, notcovered;
6820 int n = (i + j) % MAX_NUMNODES;
6821
6822 cpus_complement(notcovered, covered);
6823 cpus_and(tmp, notcovered, *cpu_map);
6824 cpus_and(tmp, tmp, domainspan);
6825 if (cpus_empty(tmp))
6826 break;
6827
6828 nodemask = node_to_cpumask(n);
6829 cpus_and(tmp, tmp, nodemask);
6830 if (cpus_empty(tmp))
6831 continue;
6832
6833 sg = kmalloc_node(sizeof(struct sched_group),
6834 GFP_KERNEL, i);
6835 if (!sg) {
6836 printk(KERN_WARNING
6837 "Can not alloc domain group for node %d\n", j);
6838 goto error;
6839 }
6840 sg->__cpu_power = 0;
6841 sg->cpumask = tmp;
6842 sg->next = prev->next;
6843 cpus_or(covered, covered, tmp);
6844 prev->next = sg;
6845 prev = sg;
6846 }
6847 }
6848#endif
6849
6850
6851#ifdef CONFIG_SCHED_SMT
6852 for_each_cpu_mask(i, *cpu_map) {
6853 struct sched_domain *sd = &per_cpu(cpu_domains, i);
6854
6855 init_sched_groups_power(i, sd);
6856 }
6857#endif
6858#ifdef CONFIG_SCHED_MC
6859 for_each_cpu_mask(i, *cpu_map) {
6860 struct sched_domain *sd = &per_cpu(core_domains, i);
6861
6862 init_sched_groups_power(i, sd);
6863 }
6864#endif
6865
6866 for_each_cpu_mask(i, *cpu_map) {
6867 struct sched_domain *sd = &per_cpu(phys_domains, i);
6868
6869 init_sched_groups_power(i, sd);
6870 }
6871
6872#ifdef CONFIG_NUMA
6873 for (i = 0; i < MAX_NUMNODES; i++)
6874 init_numa_sched_groups_power(sched_group_nodes[i]);
6875
6876 if (sd_allnodes) {
6877 struct sched_group *sg;
6878
6879 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6880 init_numa_sched_groups_power(sg);
6881 }
6882#endif
6883
6884
6885 for_each_cpu_mask(i, *cpu_map) {
6886 struct sched_domain *sd;
6887#ifdef CONFIG_SCHED_SMT
6888 sd = &per_cpu(cpu_domains, i);
6889#elif defined(CONFIG_SCHED_MC)
6890 sd = &per_cpu(core_domains, i);
6891#else
6892 sd = &per_cpu(phys_domains, i);
6893#endif
6894 cpu_attach_domain(sd, rd, i);
6895 }
6896
6897 return 0;
6898
6899#ifdef CONFIG_NUMA
6900error:
6901 free_sched_groups(cpu_map);
6902 return -ENOMEM;
6903#endif
6904}
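
/*
 * The per-CPU hierarchy built above is, from the bottom up (a level
 * exists only when the corresponding option and topology are present):
 *
 *	SMT siblings			(CONFIG_SCHED_SMT)
 *	multi-core			(CONFIG_SCHED_MC)
 *	physical package / node
 *	NUMA node and all-nodes levels	(CONFIG_NUMA, large machines)
 */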
6905
6906static cpumask_t *doms_cur;
6907static int ndoms_cur;
6908
/*
 * Special case: if allocating the doms_cur array fails, fall back to a
 * single sched domain described by this one cpumask.
 */
6914static cpumask_t fallback_doms;
6915
6916void __attribute__((weak)) arch_update_cpu_topology(void)
6917{
6918}
6919
/*
 * Set up the scheduler domains and groups.  Callers must hold the
 * hotplug lock.  For now this just excludes isolated CPUs, but it could
 * be used to exclude other special cases in the future.
 */
6925static int arch_init_sched_domains(const cpumask_t *cpu_map)
6926{
6927 int err;
6928
6929 arch_update_cpu_topology();
6930 ndoms_cur = 1;
6931 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
6932 if (!doms_cur)
6933 doms_cur = &fallback_doms;
6934 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
6935 err = build_sched_domains(doms_cur);
6936 register_sched_domain_sysctl();
6937
6938 return err;
6939}
6940
6941static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6942{
6943 free_sched_groups(cpu_map);
6944}
6945
/*
 * Detach sched domains from the CPUs specified in cpu_map.  These CPUs
 * are then attached to the NULL domain and the default root domain.
 */
6950static void detach_destroy_domains(const cpumask_t *cpu_map)
6951{
6952 int i;
6953
6954 unregister_sched_domain_sysctl();
6955
6956 for_each_cpu_mask(i, *cpu_map)
6957 cpu_attach_domain(NULL, &def_root_domain, i);
6958 synchronize_sched();
6959 arch_destroy_sched_domains(cpu_map);
6960}
6961
6962
/*
 * Partition the sched domains as specified by the 'ndoms_new' cpumasks
 * in the array doms_new[].  This compares doms_new[] to the current
 * partitioning in doms_cur[], destroys every deleted domain and builds
 * every new one.
 *
 * The masks in doms_new[] must not intersect; one sched domain is set
 * up per mask, and CPUs not covered by any mask are not load balanced.
 * If a mask appears both in doms_cur[] and doms_new[], its domain is
 * left untouched.
 *
 * doms_new should be kmalloc'd by the caller; this routine takes
 * ownership of it and kfrees it when done.  If the caller's allocation
 * failed it may pass doms_new == NULL, in which case we fall back to
 * the single partition 'fallback_doms'.
 *
 * Call with the hotplug lock held.
 */
6983void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
6984{
6985 int i, j;
6986
6987 lock_doms_cur();
6988
6989
6990 unregister_sched_domain_sysctl();
6991
6992 if (doms_new == NULL) {
6993 ndoms_new = 1;
6994 doms_new = &fallback_doms;
6995 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
6996 }
6997
6998
6999 for (i = 0; i < ndoms_cur; i++) {
7000 for (j = 0; j < ndoms_new; j++) {
7001 if (cpus_equal(doms_cur[i], doms_new[j]))
7002 goto match1;
7003 }
7004
7005 detach_destroy_domains(doms_cur + i);
7006match1:
7007 ;
7008 }
7009
7010
7011 for (i = 0; i < ndoms_new; i++) {
7012 for (j = 0; j < ndoms_cur; j++) {
7013 if (cpus_equal(doms_new[i], doms_cur[j]))
7014 goto match2;
7015 }
7016
7017 build_sched_domains(doms_new + i);
7018match2:
7019 ;
7020 }
7021
7022
7023 if (doms_cur != &fallback_doms)
7024 kfree(doms_cur);
7025 doms_cur = doms_new;
7026 ndoms_cur = ndoms_new;
7027
7028 register_sched_domain_sysctl();
7029
7030 unlock_doms_cur();
7031}
7032
7033#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7034int arch_reinit_sched_domains(void)
7035{
7036 int err;
7037
7038 get_online_cpus();
7039 detach_destroy_domains(&cpu_online_map);
7040 err = arch_init_sched_domains(&cpu_online_map);
7041 put_online_cpus();
7042
7043 return err;
7044}
7045
7046static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7047{
7048 int ret;
7049
7050 if (buf[0] != '0' && buf[0] != '1')
7051 return -EINVAL;
7052
7053 if (smt)
7054 sched_smt_power_savings = (buf[0] == '1');
7055 else
7056 sched_mc_power_savings = (buf[0] == '1');
7057
7058 ret = arch_reinit_sched_domains();
7059
7060 return ret ? ret : count;
7061}
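
/*
 * The sysfs attributes defined below accept "0" or "1" and rebuild the
 * whole sched-domain hierarchy, e.g. (illustrative):
 *
 *	echo 1 > /sys/devices/system/cpu/sched_mc_power_savings
 */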
7062
7063#ifdef CONFIG_SCHED_MC
7064static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
7065{
7066 return sprintf(page, "%u\n", sched_mc_power_savings);
7067}
7068static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
7069 const char *buf, size_t count)
7070{
7071 return sched_power_savings_store(buf, count, 0);
7072}
7073static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
7074 sched_mc_power_savings_store);
7075#endif
7076
7077#ifdef CONFIG_SCHED_SMT
7078static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
7079{
7080 return sprintf(page, "%u\n", sched_smt_power_savings);
7081}
7082static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
7083 const char *buf, size_t count)
7084{
7085 return sched_power_savings_store(buf, count, 1);
7086}
7087static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
7088 sched_smt_power_savings_store);
7089#endif
7090
7091int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7092{
7093 int err = 0;
7094
7095#ifdef CONFIG_SCHED_SMT
7096 if (smt_capable())
7097 err = sysfs_create_file(&cls->kset.kobj,
7098 &attr_sched_smt_power_savings.attr);
7099#endif
7100#ifdef CONFIG_SCHED_MC
7101 if (!err && mc_capable())
7102 err = sysfs_create_file(&cls->kset.kobj,
7103 &attr_sched_mc_power_savings.attr);
7104#endif
7105 return err;
7106}
7107#endif
7108
/*
 * Force a reinitialization of the sched domain hierarchy.  The domains
 * and groups cannot be updated in place without racing with the hotplug
 * notification callbacks, so we tear everything down before a hotplug
 * event and rebuild it once the event has been handled.
 */
7115static int update_sched_domains(struct notifier_block *nfb,
7116 unsigned long action, void *hcpu)
7117{
7118 switch (action) {
7119 case CPU_UP_PREPARE:
7120 case CPU_UP_PREPARE_FROZEN:
7121 case CPU_DOWN_PREPARE:
7122 case CPU_DOWN_PREPARE_FROZEN:
7123 detach_destroy_domains(&cpu_online_map);
7124 return NOTIFY_OK;
7125
7126 case CPU_UP_CANCELED:
7127 case CPU_UP_CANCELED_FROZEN:
7128 case CPU_DOWN_FAILED:
7129 case CPU_DOWN_FAILED_FROZEN:
7130 case CPU_ONLINE:
7131 case CPU_ONLINE_FROZEN:
7132 case CPU_DEAD:
7133 case CPU_DEAD_FROZEN:
7134
7135
7136
7137 break;
7138 default:
7139 return NOTIFY_DONE;
7140 }
7141
7142
7143 arch_init_sched_domains(&cpu_online_map);
7144
7145 return NOTIFY_OK;
7146}
7147
7148void __init sched_init_smp(void)
7149{
7150 cpumask_t non_isolated_cpus;
7151
7152 get_online_cpus();
7153 arch_init_sched_domains(&cpu_online_map);
7154 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
7155 if (cpus_empty(non_isolated_cpus))
7156 cpu_set(smp_processor_id(), non_isolated_cpus);
7157 put_online_cpus();
7158
7159 hotcpu_notifier(update_sched_domains, 0);
7160 init_hrtick();
7161
7162
7163 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
7164 BUG();
7165 sched_init_granularity();
7166}
7167#else
7168void __init sched_init_smp(void)
7169{
7170 sched_init_granularity();
7171}
7172#endif
7173
7174int in_sched_functions(unsigned long addr)
7175{
7176 return in_lock_functions(addr) ||
7177 (addr >= (unsigned long)__sched_text_start
7178 && addr < (unsigned long)__sched_text_end);
7179}
7180
7181static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7182{
7183 cfs_rq->tasks_timeline = RB_ROOT;
7184#ifdef CONFIG_FAIR_GROUP_SCHED
7185 cfs_rq->rq = rq;
7186#endif
7187 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7188}
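
/*
 * Note: min_vruntime is started at -(1 << 20) so that the unsigned
 * 64-bit vruntime wraps soon after boot, exercising the signed-delta
 * comparisons that have to cope with wrap-around.
 */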
7189
7190static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7191{
7192 struct rt_prio_array *array;
7193 int i;
7194
7195 array = &rt_rq->active;
7196 for (i = 0; i < MAX_RT_PRIO; i++) {
7197 INIT_LIST_HEAD(array->queue + i);
7198 __clear_bit(i, array->bitmap);
7199 }
7200
7201 __set_bit(MAX_RT_PRIO, array->bitmap);
7202
7203#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
7204 rt_rq->highest_prio = MAX_RT_PRIO;
7205#endif
7206#ifdef CONFIG_SMP
7207 rt_rq->rt_nr_migratory = 0;
7208 rt_rq->overloaded = 0;
7209#endif
7210
7211 rt_rq->rt_time = 0;
7212 rt_rq->rt_throttled = 0;
7213
7214#ifdef CONFIG_RT_GROUP_SCHED
7215 rt_rq->rt_nr_boosted = 0;
7216 rt_rq->rq = rq;
7217#endif
7218}
7219
7220#ifdef CONFIG_FAIR_GROUP_SCHED
7221static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
7222 struct cfs_rq *cfs_rq, struct sched_entity *se,
7223 int cpu, int add)
7224{
7225 tg->cfs_rq[cpu] = cfs_rq;
7226 init_cfs_rq(cfs_rq, rq);
7227 cfs_rq->tg = tg;
7228 if (add)
7229 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7230
7231 tg->se[cpu] = se;
7232 se->cfs_rq = &rq->cfs;
7233 se->my_q = cfs_rq;
7234 se->load.weight = tg->shares;
7235 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
7236 se->parent = NULL;
7237}
7238#endif
7239
7240#ifdef CONFIG_RT_GROUP_SCHED
7241static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
7242 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
7243 int cpu, int add)
7244{
7245 tg->rt_rq[cpu] = rt_rq;
7246 init_rt_rq(rt_rq, rq);
7247 rt_rq->tg = tg;
7248 rt_rq->rt_se = rt_se;
7249 if (add)
7250 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7251
7252 tg->rt_se[cpu] = rt_se;
7253 rt_se->rt_rq = &rq->rt;
7254 rt_se->my_q = rt_rq;
7255 rt_se->parent = NULL;
7256 INIT_LIST_HEAD(&rt_se->run_list);
7257}
7258#endif
7259
7260void __init sched_init(void)
7261{
7262 int highest_cpu = 0;
7263 int i, j;
7264
7265#ifdef CONFIG_SMP
7266 init_defrootdomain();
7267#endif
7268
7269#ifdef CONFIG_GROUP_SCHED
7270 list_add(&init_task_group.list, &task_groups);
7271#endif
7272
7273 for_each_possible_cpu(i) {
7274 struct rq *rq;
7275
7276 rq = cpu_rq(i);
7277 spin_lock_init(&rq->lock);
7278 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
7279 rq->nr_running = 0;
7280 rq->clock = 1;
7281 init_cfs_rq(&rq->cfs, rq);
7282 init_rt_rq(&rq->rt, rq);
7283#ifdef CONFIG_FAIR_GROUP_SCHED
7284 init_task_group.shares = init_task_group_load;
7285 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7286 init_tg_cfs_entry(rq, &init_task_group,
7287 &per_cpu(init_cfs_rq, i),
7288 &per_cpu(init_sched_entity, i), i, 1);
7289
7290#endif
7291#ifdef CONFIG_RT_GROUP_SCHED
7292 init_task_group.rt_runtime =
7293 sysctl_sched_rt_runtime * NSEC_PER_USEC;
7294 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7295 init_tg_rt_entry(rq, &init_task_group,
7296 &per_cpu(init_rt_rq, i),
7297 &per_cpu(init_sched_rt_entity, i), i, 1);
7298#endif
7299 rq->rt_period_expire = 0;
7300 rq->rt_throttled = 0;
7301
7302 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7303 rq->cpu_load[j] = 0;
7304#ifdef CONFIG_SMP
7305 rq->sd = NULL;
7306 rq->rd = NULL;
7307 rq->active_balance = 0;
7308 rq->next_balance = jiffies;
7309 rq->push_cpu = 0;
7310 rq->cpu = i;
7311 rq->migration_thread = NULL;
7312 INIT_LIST_HEAD(&rq->migration_queue);
7313 rq_attach_root(rq, &def_root_domain);
7314#endif
7315 init_rq_hrtick(rq);
7316 atomic_set(&rq->nr_iowait, 0);
7317 highest_cpu = i;
7318 }
7319
7320 set_load_weight(&init_task);
7321
7322#ifdef CONFIG_PREEMPT_NOTIFIERS
7323 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
7324#endif
7325
7326#ifdef CONFIG_SMP
7327 nr_cpu_ids = highest_cpu + 1;
7328 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
7329#endif
7330
7331#ifdef CONFIG_RT_MUTEXES
7332 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
7333#endif
7334
7335
7336
7337
7338 atomic_inc(&init_mm.mm_count);
7339 enter_lazy_tlb(&init_mm, current);
7340
	/*
	 * Make us the idle thread.  Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be,
	 * but because we are the idle thread, we just pick up running
	 * again when this runqueue becomes "idle".
	 */
7347 init_idle(current, smp_processor_id());
7348
	/*
	 * During early bootup we pretend to be a normal task:
	 */
7351 current->sched_class = &fair_sched_class;
7352
7353 scheduler_running = 1;
7354}
7355
7356#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
7357void __might_sleep(char *file, int line)
7358{
7359#ifdef in_atomic
7360 static unsigned long prev_jiffy;
7361
7362 if ((in_atomic() || irqs_disabled()) &&
7363 system_state == SYSTEM_RUNNING && !oops_in_progress) {
7364 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7365 return;
7366 prev_jiffy = jiffies;
7367 printk(KERN_ERR "BUG: sleeping function called from invalid"
7368 " context at %s:%d\n", file, line);
7369 printk("in_atomic():%d, irqs_disabled():%d\n",
7370 in_atomic(), irqs_disabled());
7371 debug_show_held_locks(current);
7372 if (irqs_disabled())
7373 print_irqtrace_events(current);
7374 dump_stack();
7375 }
7376#endif
7377}
7378EXPORT_SYMBOL(__might_sleep);
7379#endif
7380
7381#ifdef CONFIG_MAGIC_SYSRQ
7382static void normalize_task(struct rq *rq, struct task_struct *p)
7383{
7384 int on_rq;
7385 update_rq_clock(rq);
7386 on_rq = p->se.on_rq;
7387 if (on_rq)
7388 deactivate_task(rq, p, 0);
7389 __setscheduler(rq, p, SCHED_NORMAL, 0);
7390 if (on_rq) {
7391 activate_task(rq, p, 0);
7392 resched_task(rq->curr);
7393 }
7394}
7395
7396void normalize_rt_tasks(void)
7397{
7398 struct task_struct *g, *p;
7399 unsigned long flags;
7400 struct rq *rq;
7401
7402 read_lock_irqsave(&tasklist_lock, flags);
7403 do_each_thread(g, p) {
		/*
		 * Only normalize user tasks:
		 */
7407 if (!p->mm)
7408 continue;
7409
7410 p->se.exec_start = 0;
7411#ifdef CONFIG_SCHEDSTATS
7412 p->se.wait_start = 0;
7413 p->se.sleep_start = 0;
7414 p->se.block_start = 0;
7415#endif
7416 task_rq(p)->clock = 0;
7417
7418 if (!rt_task(p)) {
7419
7420
7421
7422
7423 if (TASK_NICE(p) < 0 && p->mm)
7424 set_user_nice(p, 0);
7425 continue;
7426 }
7427
7428 spin_lock(&p->pi_lock);
7429 rq = __task_rq_lock(p);
7430
7431 normalize_task(rq, p);
7432
7433 __task_rq_unlock(rq);
7434 spin_unlock(&p->pi_lock);
7435 } while_each_thread(g, p);
7436
7437 read_unlock_irqrestore(&tasklist_lock, flags);
7438}
7439
7440#endif
7441
7442#ifdef CONFIG_IA64
/*
 * These functions are only useful for IA64 MCA handling.
 *
 * They can only be called when the whole system has been
 * stopped - every CPU needs to be quiescent, and no scheduling
 * activity can take place.  Using them for anything else would
 * be a serious bug, and as a result, they aren't even visible
 * under any other configuration.
 */

/**
 * curr_task - return the current task for a given cpu.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
7459struct task_struct *curr_task(int cpu)
7460{
7461 return cpu_curr(cpu);
7462}
7463
/**
 * set_curr_task - set the current task for a given cpu.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable
 * interrupts are serviced on a separate stack.  It allows the
 * architecture to switch the notion of the current task on a cpu in a
 * non-blocking manner.  It must be called with all CPUs synchronized
 * and interrupts disabled; the caller must save the original value of
 * the current task (see curr_task() above) and restore it before
 * re-enabling interrupts.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
7479void set_curr_task(int cpu, struct task_struct *p)
7480{
7481 cpu_curr(cpu) = p;
7482}
7483
7484#endif
7485
7486#ifdef CONFIG_GROUP_SCHED
7487
7488#ifdef CONFIG_FAIR_GROUP_SCHED
7489static void free_fair_sched_group(struct task_group *tg)
7490{
7491 int i;
7492
7493 for_each_possible_cpu(i) {
7494 if (tg->cfs_rq)
7495 kfree(tg->cfs_rq[i]);
7496 if (tg->se)
7497 kfree(tg->se[i]);
7498 }
7499
7500 kfree(tg->cfs_rq);
7501 kfree(tg->se);
7502}
7503
7504static int alloc_fair_sched_group(struct task_group *tg)
7505{
7506 struct cfs_rq *cfs_rq;
7507 struct sched_entity *se;
7508 struct rq *rq;
7509 int i;
7510
7511 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
7512 if (!tg->cfs_rq)
7513 goto err;
7514 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
7515 if (!tg->se)
7516 goto err;
7517
7518 tg->shares = NICE_0_LOAD;
7519
7520 for_each_possible_cpu(i) {
7521 rq = cpu_rq(i);
7522
7523 cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
7524 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7525 if (!cfs_rq)
7526 goto err;
7527
7528 se = kmalloc_node(sizeof(struct sched_entity),
7529 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7530 if (!se)
7531 goto err;
7532
7533 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
7534 }
7535
7536 return 1;
7537
7538 err:
7539 return 0;
7540}
7541
7542static inline void register_fair_sched_group(struct task_group *tg, int cpu)
7543{
7544 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
7545 &cpu_rq(cpu)->leaf_cfs_rq_list);
7546}
7547
7548static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
7549{
7550 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
7551}
7552#else
7553static inline void free_fair_sched_group(struct task_group *tg)
7554{
7555}
7556
7557static inline int alloc_fair_sched_group(struct task_group *tg)
7558{
7559 return 1;
7560}
7561
7562static inline void register_fair_sched_group(struct task_group *tg, int cpu)
7563{
7564}
7565
7566static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
7567{
7568}
7569#endif
7570
7571#ifdef CONFIG_RT_GROUP_SCHED
7572static void free_rt_sched_group(struct task_group *tg)
7573{
7574 int i;
7575
7576 for_each_possible_cpu(i) {
7577 if (tg->rt_rq)
7578 kfree(tg->rt_rq[i]);
7579 if (tg->rt_se)
7580 kfree(tg->rt_se[i]);
7581 }
7582
7583 kfree(tg->rt_rq);
7584 kfree(tg->rt_se);
7585}
7586
7587static int alloc_rt_sched_group(struct task_group *tg)
7588{
7589 struct rt_rq *rt_rq;
7590 struct sched_rt_entity *rt_se;
7591 struct rq *rq;
7592 int i;
7593
7594 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
7595 if (!tg->rt_rq)
7596 goto err;
7597 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
7598 if (!tg->rt_se)
7599 goto err;
7600
7601 tg->rt_runtime = 0;
7602
7603 for_each_possible_cpu(i) {
7604 rq = cpu_rq(i);
7605
7606 rt_rq = kmalloc_node(sizeof(struct rt_rq),
7607 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7608 if (!rt_rq)
7609 goto err;
7610
7611 rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
7612 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7613 if (!rt_se)
7614 goto err;
7615
7616 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
7617 }
7618
7619 return 1;
7620
7621 err:
7622 return 0;
7623}
7624
7625static inline void register_rt_sched_group(struct task_group *tg, int cpu)
7626{
7627 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
7628 &cpu_rq(cpu)->leaf_rt_rq_list);
7629}
7630
7631static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
7632{
7633 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
7634}
7635#else
7636static inline void free_rt_sched_group(struct task_group *tg)
7637{
7638}
7639
7640static inline int alloc_rt_sched_group(struct task_group *tg)
7641{
7642 return 1;
7643}
7644
7645static inline void register_rt_sched_group(struct task_group *tg, int cpu)
7646{
7647}
7648
7649static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
7650{
7651}
7652#endif
7653
7654static void free_sched_group(struct task_group *tg)
7655{
7656 free_fair_sched_group(tg);
7657 free_rt_sched_group(tg);
7658 kfree(tg);
7659}
7660
7661
7662struct task_group *sched_create_group(void)
7663{
7664 struct task_group *tg;
7665 unsigned long flags;
7666 int i;
7667
7668 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7669 if (!tg)
7670 return ERR_PTR(-ENOMEM);
7671
7672 if (!alloc_fair_sched_group(tg))
7673 goto err;
7674
7675 if (!alloc_rt_sched_group(tg))
7676 goto err;
7677
7678 spin_lock_irqsave(&task_group_lock, flags);
7679 for_each_possible_cpu(i) {
7680 register_fair_sched_group(tg, i);
7681 register_rt_sched_group(tg, i);
7682 }
7683 list_add_rcu(&tg->list, &task_groups);
7684 spin_unlock_irqrestore(&task_group_lock, flags);
7685
7686 return tg;
7687
7688err:
7689 free_sched_group(tg);
7690 return ERR_PTR(-ENOMEM);
7691}
7692
7693
7694static void free_sched_group_rcu(struct rcu_head *rhp)
7695{
7696
7697 free_sched_group(container_of(rhp, struct task_group, rcu));
7698}
7699
7700
7701void sched_destroy_group(struct task_group *tg)
7702{
7703 unsigned long flags;
7704 int i;
7705
7706 spin_lock_irqsave(&task_group_lock, flags);
7707 for_each_possible_cpu(i) {
7708 unregister_fair_sched_group(tg, i);
7709 unregister_rt_sched_group(tg, i);
7710 }
7711 list_del_rcu(&tg->list);
7712 spin_unlock_irqrestore(&task_group_lock, flags);
7713
7714
7715 call_rcu(&tg->rcu, free_sched_group_rcu);
7716}
7717
7718
/*
 * Move a task to a different task group: re-queue it on the runqueues
 * of its new group.  The caller must already have switched the task's
 * group (e.g. via the cgroup attach path) before calling this.
 */
7723void sched_move_task(struct task_struct *tsk)
7724{
7725 int on_rq, running;
7726 unsigned long flags;
7727 struct rq *rq;
7728
7729 rq = task_rq_lock(tsk, &flags);
7730
7731 update_rq_clock(rq);
7732
7733 running = task_current(rq, tsk);
7734 on_rq = tsk->se.on_rq;
7735
7736 if (on_rq)
7737 dequeue_task(rq, tsk, 0);
7738 if (unlikely(running))
7739 tsk->sched_class->put_prev_task(rq, tsk);
7740
7741 set_task_rq(tsk, task_cpu(tsk));
7742
7743#ifdef CONFIG_FAIR_GROUP_SCHED
7744 if (tsk->sched_class->moved_group)
7745 tsk->sched_class->moved_group(tsk);
7746#endif
7747
7748 if (unlikely(running))
7749 tsk->sched_class->set_curr_task(rq);
7750 if (on_rq)
7751 enqueue_task(rq, tsk, 0);
7752
7753 task_rq_unlock(rq, &flags);
7754}
7755
7756#ifdef CONFIG_FAIR_GROUP_SCHED
7757static void set_se_shares(struct sched_entity *se, unsigned long shares)
7758{
7759 struct cfs_rq *cfs_rq = se->cfs_rq;
7760 struct rq *rq = cfs_rq->rq;
7761 int on_rq;
7762
7763 spin_lock_irq(&rq->lock);
7764
7765 on_rq = se->on_rq;
7766 if (on_rq)
7767 dequeue_entity(cfs_rq, se, 0);
7768
7769 se->load.weight = shares;
7770 se->load.inv_weight = div64_64((1ULL<<32), shares);
7771
7772 if (on_rq)
7773 enqueue_entity(cfs_rq, se, 0);
7774
7775 spin_unlock_irq(&rq->lock);
7776}
7777
7778static DEFINE_MUTEX(shares_mutex);
7779
7780int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7781{
7782 int i;
7783 unsigned long flags;
7784
	/*
	 * A weight of 0 or 1 can cause arithmetic problems.
	 * (The default weight is 1024 - so there's no practical
	 *  limitation from this.)
	 */
7790 if (shares < 2)
7791 shares = 2;
7792
7793 mutex_lock(&shares_mutex);
7794 if (tg->shares == shares)
7795 goto done;
7796
7797 spin_lock_irqsave(&task_group_lock, flags);
7798 for_each_possible_cpu(i)
7799 unregister_fair_sched_group(tg, i);
7800 spin_unlock_irqrestore(&task_group_lock, flags);
7801
7802
7803 synchronize_sched();
7804
7805
7806
7807
7808
7809 tg->shares = shares;
7810 for_each_possible_cpu(i)
7811 set_se_shares(tg->se[i], shares);
7812
	/*
	 * Enable load-balance activity on this group again by putting it
	 * back on each CPU's leaf_cfs_rq_list.
	 */
7817 spin_lock_irqsave(&task_group_lock, flags);
7818 for_each_possible_cpu(i)
7819 register_fair_sched_group(tg, i);
7820 spin_unlock_irqrestore(&task_group_lock, flags);
7821done:
7822 mutex_unlock(&shares_mutex);
7823 return 0;
7824}
7825
7826unsigned long sched_group_shares(struct task_group *tg)
7827{
7828 return tg->shares;
7829}
7830#endif
7831
7832#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Ensure that the realtime bandwidth admitted over all task groups
 * stays within the globally allowed rt_runtime/rt_period ratio.
 */
7836static DEFINE_MUTEX(rt_constraints_mutex);
7837
7838static unsigned long to_ratio(u64 period, u64 runtime)
7839{
7840 if (runtime == RUNTIME_INF)
7841 return 1ULL << 16;
7842
7843 return div64_64(runtime << 16, period);
7844}
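
/*
 * to_ratio() expresses runtime/period as a 16.16-style fixed-point
 * fraction, with RUNTIME_INF mapping to 1.0 (1 << 16).
 * __rt_schedulable() below sums these fractions over all task groups
 * and checks the total against the global limit.
 */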
7845
7846static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7847{
7848 struct task_group *tgi;
7849 unsigned long total = 0;
7850 unsigned long global_ratio =
7851 to_ratio(sysctl_sched_rt_period,
7852 sysctl_sched_rt_runtime < 0 ?
7853 RUNTIME_INF : sysctl_sched_rt_runtime);
7854
7855 rcu_read_lock();
7856 list_for_each_entry_rcu(tgi, &task_groups, list) {
7857 if (tgi == tg)
7858 continue;
7859
7860 total += to_ratio(period, tgi->rt_runtime);
7861 }
7862 rcu_read_unlock();
7863
7864 return total + to_ratio(period, runtime) < global_ratio;
7865}
7866
7867
7868static inline int tg_has_rt_tasks(struct task_group *tg)
7869{
7870 struct task_struct *g, *p;
7871 do_each_thread(g, p) {
7872 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
7873 return 1;
7874 } while_each_thread(g, p);
7875 return 0;
7876}
7877
7878int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7879{
7880 u64 rt_runtime, rt_period;
7881 int err = 0;
7882
7883 rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
7884 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7885 if (rt_runtime_us == -1)
7886 rt_runtime = RUNTIME_INF;
7887
7888 mutex_lock(&rt_constraints_mutex);
7889 read_lock(&tasklist_lock);
7890 if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) {
7891 err = -EBUSY;
7892 goto unlock;
7893 }
7894 if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
7895 err = -EINVAL;
7896 goto unlock;
7897 }
7898 tg->rt_runtime = rt_runtime;
7899 unlock:
7900 read_unlock(&tasklist_lock);
7901 mutex_unlock(&rt_constraints_mutex);
7902
7903 return err;
7904}
7905
7906long sched_group_rt_runtime(struct task_group *tg)
7907{
7908 u64 rt_runtime_us;
7909
7910 if (tg->rt_runtime == RUNTIME_INF)
7911 return -1;
7912
7913 rt_runtime_us = tg->rt_runtime;
7914 do_div(rt_runtime_us, NSEC_PER_USEC);
7915 return rt_runtime_us;
7916}
7917#endif
7918#endif
7919
7920#ifdef CONFIG_CGROUP_SCHED
7921
7922
7923static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7924{
7925 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
7926 struct task_group, css);
7927}
7928
7929static struct cgroup_subsys_state *
7930cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7931{
7932 struct task_group *tg;
7933
7934 if (!cgrp->parent) {
7935
7936 init_task_group.css.cgroup = cgrp;
7937 return &init_task_group.css;
7938 }
7939
7940
7941 if (cgrp->parent->parent)
7942 return ERR_PTR(-EINVAL);
7943
7944 tg = sched_create_group();
7945 if (IS_ERR(tg))
7946 return ERR_PTR(-ENOMEM);
7947
7948
7949 tg->css.cgroup = cgrp;
7950
7951 return &tg->css;
7952}
7953
7954static void
7955cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
7956{
7957 struct task_group *tg = cgroup_tg(cgrp);
7958
7959 sched_destroy_group(tg);
7960}
7961
7962static int
7963cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7964 struct task_struct *tsk)
7965{
7966#ifdef CONFIG_RT_GROUP_SCHED
7967
7968 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0)
7969 return -EINVAL;
7970#else
7971
7972 if (tsk->sched_class != &fair_sched_class)
7973 return -EINVAL;
7974#endif
7975
7976 return 0;
7977}
7978
7979static void
7980cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7981 struct cgroup *old_cont, struct task_struct *tsk)
7982{
7983 sched_move_task(tsk);
7984}
7985
7986#ifdef CONFIG_FAIR_GROUP_SCHED
7987static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7988 u64 shareval)
7989{
7990 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
7991}
7992
7993static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7994{
7995 struct task_group *tg = cgroup_tg(cgrp);
7996
7997 return (u64) tg->shares;
7998}
7999#endif
8000
8001#ifdef CONFIG_RT_GROUP_SCHED
8002static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
8003 struct file *file,
8004 const char __user *userbuf,
8005 size_t nbytes, loff_t *unused_ppos)
8006{
8007 char buffer[64];
8008 int retval = 0;
8009 s64 val;
8010 char *end;
8011
8012 if (!nbytes)
8013 return -EINVAL;
8014 if (nbytes >= sizeof(buffer))
8015 return -E2BIG;
8016 if (copy_from_user(buffer, userbuf, nbytes))
8017 return -EFAULT;
8018
8019 buffer[nbytes] = 0;
8020
8021
8022 if (nbytes && (buffer[nbytes-1] == '\n'))
8023 buffer[nbytes-1] = 0;
8024 val = simple_strtoll(buffer, &end, 0);
8025 if (*end)
8026 return -EINVAL;
8027
8028
8029 retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
8030 if (!retval)
8031 retval = nbytes;
8032 return retval;
8033}
8034
8035static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
8036 struct file *file,
8037 char __user *buf, size_t nbytes,
8038 loff_t *ppos)
8039{
8040 char tmp[64];
8041 long val = sched_group_rt_runtime(cgroup_tg(cgrp));
8042 int len = sprintf(tmp, "%ld\n", val);
8043
8044 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
8045}
8046#endif
8047
8048static struct cftype cpu_files[] = {
8049#ifdef CONFIG_FAIR_GROUP_SCHED
8050 {
8051 .name = "shares",
8052 .read_uint = cpu_shares_read_uint,
8053 .write_uint = cpu_shares_write_uint,
8054 },
8055#endif
8056#ifdef CONFIG_RT_GROUP_SCHED
8057 {
8058 .name = "rt_runtime_us",
8059 .read = cpu_rt_runtime_read,
8060 .write = cpu_rt_runtime_write,
8061 },
8062#endif
8063};
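
/*
 * With the cpu cgroup subsystem mounted, the files above appear inside
 * every group directory, e.g. (illustrative mount point and group):
 *
 *	echo 2048 > /cgroups/cpu/mygroup/cpu.shares
 *	echo 500000 > /cgroups/cpu/mygroup/cpu.rt_runtime_us
 */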
8064
8065static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
8066{
8067 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
8068}
8069
8070struct cgroup_subsys cpu_cgroup_subsys = {
8071 .name = "cpu",
8072 .create = cpu_cgroup_create,
8073 .destroy = cpu_cgroup_destroy,
8074 .can_attach = cpu_cgroup_can_attach,
8075 .attach = cpu_cgroup_attach,
8076 .populate = cpu_cgroup_populate,
8077 .subsys_id = cpu_cgroup_subsys_id,
8078 .early_init = 1,
8079};
8080
8081#endif
8082
8083#ifdef CONFIG_CGROUP_CPUACCT
8084
/*
 * CPU accounting code for task groups: exposes the cumulative CPU time
 * consumed by all tasks in a cgroup through the cpuacct subsystem.
 */

/* Track the cpu usage of a group of tasks: */
8093struct cpuacct {
8094 struct cgroup_subsys_state css;
8095
8096 u64 *cpuusage;
8097};
8098
8099struct cgroup_subsys cpuacct_subsys;
8100
8101
8102static inline struct cpuacct *cgroup_ca(struct cgroup *cont)
8103{
8104 return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id),
8105 struct cpuacct, css);
8106}
8107
8108
8109static inline struct cpuacct *task_ca(struct task_struct *tsk)
8110{
8111 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
8112 struct cpuacct, css);
8113}
8114
8115
8116static struct cgroup_subsys_state *cpuacct_create(
8117 struct cgroup_subsys *ss, struct cgroup *cont)
8118{
8119 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8120
8121 if (!ca)
8122 return ERR_PTR(-ENOMEM);
8123
8124 ca->cpuusage = alloc_percpu(u64);
8125 if (!ca->cpuusage) {
8126 kfree(ca);
8127 return ERR_PTR(-ENOMEM);
8128 }
8129
8130 return &ca->css;
8131}
8132
8133
8134static void
8135cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
8136{
8137 struct cpuacct *ca = cgroup_ca(cont);
8138
8139 free_percpu(ca->cpuusage);
8140 kfree(ca);
8141}
8142
8143
8144static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft)
8145{
8146 struct cpuacct *ca = cgroup_ca(cont);
8147 u64 totalcpuusage = 0;
8148 int i;
8149
8150 for_each_possible_cpu(i) {
8151 u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
8152
		/*
		 * Take rq->lock so the 64-bit read is safe on 32-bit
		 * platforms and does not race with concurrent updates of
		 * the usage counter.
		 */
8157 spin_lock_irq(&cpu_rq(i)->lock);
8158 totalcpuusage += *cpuusage;
8159 spin_unlock_irq(&cpu_rq(i)->lock);
8160 }
8161
8162 return totalcpuusage;
8163}
8164
8165static struct cftype files[] = {
8166 {
8167 .name = "usage",
8168 .read_uint = cpuusage_read,
8169 },
8170};
8171
8172static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont)
8173{
8174 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
8175}
8176
8177
/*
 * Charge this task's execution time to its accounting group.
 *
 * Called with rq->lock held.
 */
8182static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8183{
8184 struct cpuacct *ca;
8185
8186 if (!cpuacct_subsys.active)
8187 return;
8188
8189 ca = task_ca(tsk);
8190 if (ca) {
8191 u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
8192
8193 *cpuusage += cputime;
8194 }
8195}
8196
8197struct cgroup_subsys cpuacct_subsys = {
8198 .name = "cpuacct",
8199 .create = cpuacct_create,
8200 .destroy = cpuacct_destroy,
8201 .populate = cpuacct_populate,
8202 .subsys_id = cpuacct_subsys_id,
8203};
8204#endif
8205