1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h>
37#include <linux/interrupt.h>
38#include <linux/capability.h>
39#include <linux/completion.h>
40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h>
42#include <linux/perf_event.h>
43#include <linux/security.h>
44#include <linux/notifier.h>
45#include <linux/profile.h>
46#include <linux/freezer.h>
47#include <linux/vmalloc.h>
48#include <linux/blkdev.h>
49#include <linux/delay.h>
50#include <linux/pid_namespace.h>
51#include <linux/smp.h>
52#include <linux/threads.h>
53#include <linux/timer.h>
54#include <linux/rcupdate.h>
55#include <linux/cpu.h>
56#include <linux/cpuset.h>
57#include <linux/percpu.h>
58#include <linux/kthread.h>
59#include <linux/proc_fs.h>
60#include <linux/seq_file.h>
61#include <linux/sysctl.h>
62#include <linux/syscalls.h>
63#include <linux/times.h>
64#include <linux/tsacct_kern.h>
65#include <linux/kprobes.h>
66#include <linux/delayacct.h>
67#include <linux/unistd.h>
68#include <linux/pagemap.h>
69#include <linux/hrtimer.h>
70#include <linux/tick.h>
71#include <linux/debugfs.h>
72#include <linux/ctype.h>
73#include <linux/ftrace.h>
74#include <linux/slab.h>
75
76#include <asm/tlb.h>
77#include <asm/irq_regs.h>
78
79#include "sched_cpupri.h"
80
81#define CREATE_TRACE_POINTS
82#include <trace/events/sched.h>
83
84
85
86
87
88
/*
 * Conversions between a nice value and a static priority.  A priority is
 * the nice value shifted up by MAX_RT_PRIO + 20; PRIO_TO_NICE() is the
 * inverse, and TASK_NICE() applies it to a task's static_prio.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)

/*
 * A "user priority" is a priority with MAX_RT_PRIO subtracted.
 * MAX_USER_PRIO is the user priority that corresponds to MAX_PRIO.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
101
102
103
104
/* Convert nanoseconds to jiffies by dividing by the length of one tick. */
#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

/* Aliases for the load scale and its shift. */
#define NICE_0_LOAD		SCHED_LOAD_SCALE
#define NICE_0_SHIFT		SCHED_LOAD_SHIFT

/* Default timeslice: 100 milliseconds expressed in jiffies. */
#define DEF_TIMESLICE		(100 * HZ / 1000)

/*
 * All-ones u64.  In this file it marks an unlimited runtime:
 * global_rt_runtime() returns it for a negative sysctl value, and
 * start_rt_bandwidth() does nothing when rt_runtime equals it.
 */
#define RUNTIME_INF		((u64)~0ULL)
122
123static inline int rt_policy(int policy)
124{
125 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
126 return 1;
127 return 0;
128}
129
130static inline int task_has_rt_policy(struct task_struct *p)
131{
132 return rt_policy(p->policy);
133}
134
135
136
137
138struct rt_prio_array {
139 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);
140 struct list_head queue[MAX_RT_PRIO];
141};
142
143struct rt_bandwidth {
144
145 raw_spinlock_t rt_runtime_lock;
146 ktime_t rt_period;
147 u64 rt_runtime;
148 struct hrtimer rt_period_timer;
149};
150
151static struct rt_bandwidth def_rt_bandwidth;
152
153static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
154
155static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
156{
157 struct rt_bandwidth *rt_b =
158 container_of(timer, struct rt_bandwidth, rt_period_timer);
159 ktime_t now;
160 int overrun;
161 int idle = 0;
162
163 for (;;) {
164 now = hrtimer_cb_get_time(timer);
165 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
166
167 if (!overrun)
168 break;
169
170 idle = do_sched_rt_period_timer(rt_b, overrun);
171 }
172
173 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
174}
175
176static
177void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
178{
179 rt_b->rt_period = ns_to_ktime(period);
180 rt_b->rt_runtime = runtime;
181
182 raw_spin_lock_init(&rt_b->rt_runtime_lock);
183
184 hrtimer_init(&rt_b->rt_period_timer,
185 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
186 rt_b->rt_period_timer.function = sched_rt_period_timer;
187}
188
189static inline int rt_bandwidth_enabled(void)
190{
191 return sysctl_sched_rt_runtime >= 0;
192}
193
194static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
195{
196 ktime_t now;
197
198 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
199 return;
200
201 if (hrtimer_active(&rt_b->rt_period_timer))
202 return;
203
204 raw_spin_lock(&rt_b->rt_runtime_lock);
205 for (;;) {
206 unsigned long delta;
207 ktime_t soft, hard;
208
209 if (hrtimer_active(&rt_b->rt_period_timer))
210 break;
211
212 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
213 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
214
215 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
216 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
217 delta = ktime_to_ns(ktime_sub(hard, soft));
218 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
219 HRTIMER_MODE_ABS_PINNED, 0);
220 }
221 raw_spin_unlock(&rt_b->rt_runtime_lock);
222}
223
#ifdef CONFIG_RT_GROUP_SCHED
/* Tear down @rt_b by cancelling its period timer. */
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	struct hrtimer *period_timer = &rt_b->rt_period_timer;

	hrtimer_cancel(period_timer);
}
#endif
230
231
232
233
234
/*
 * Mutex for the sched-domain structures.  Holding it is one of the two
 * conditions accepted by rcu_dereference_check_sched_domain().
 */
static DEFINE_MUTEX(sched_domains_mutex);
236
237#ifdef CONFIG_CGROUP_SCHED
238
239#include <linux/cgroup.h>
240
241struct cfs_rq;
242
243static LIST_HEAD(task_groups);
244
245
246struct task_group {
247 struct cgroup_subsys_state css;
248
249#ifdef CONFIG_FAIR_GROUP_SCHED
250
251 struct sched_entity **se;
252
253 struct cfs_rq **cfs_rq;
254 unsigned long shares;
255#endif
256
257#ifdef CONFIG_RT_GROUP_SCHED
258 struct sched_rt_entity **rt_se;
259 struct rt_rq **rt_rq;
260
261 struct rt_bandwidth rt_bandwidth;
262#endif
263
264 struct rcu_head rcu;
265 struct list_head list;
266
267 struct task_group *parent;
268 struct list_head siblings;
269 struct list_head children;
270};
271
/* The root of the task group hierarchy is init_task_group. */
#define root_task_group init_task_group

/* Spinlock for task group state (what exactly it covers is not shown here). */
static DEFINE_SPINLOCK(task_group_lock);
278
279#ifdef CONFIG_FAIR_GROUP_SCHED
280
#ifdef CONFIG_SMP
/* Non-zero when the root task group has no child groups. */
static int root_task_group_empty(void)
{
	struct list_head *kids = &root_task_group.children;

	return list_empty(kids);
}
#endif
287
288# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
289
290
291
292
293
294
295
296
297
298#define MIN_SHARES 2
299#define MAX_SHARES (1UL << 18)
300
301static int init_task_group_load = INIT_TASK_GROUP_LOAD;
302#endif
303
304
305
306
307struct task_group init_task_group;
308
309
310static inline struct task_group *task_group(struct task_struct *p)
311{
312 struct task_group *tg;
313
314#ifdef CONFIG_CGROUP_SCHED
315 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
316 struct task_group, css);
317#else
318 tg = &init_task_group;
319#endif
320 return tg;
321}
322
323
/*
 * set_task_rq - point @p's scheduling entities at the queues for @cpu
 * @p:   task to update
 * @cpu: cpu index into the per-cpu arrays of the task's group
 *
 * Under rcu_read_lock(), the fair entity's cfs_rq/parent and the rt
 * entity's rt_rq/parent are set from the task group of @p, each only if
 * the matching group-scheduling option is configured.
 */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
	/* task_group(p) is looked up inside an RCU read-side section */
	rcu_read_lock();

#ifdef CONFIG_FAIR_GROUP_SCHED
	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
	p->se.parent = task_group(p)->se[cpu];
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
	p->rt.parent = task_group(p)->rt_se[cpu];
#endif

	rcu_read_unlock();
}
346
347#else
348
349static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
350static inline struct task_group *task_group(struct task_struct *p)
351{
352 return NULL;
353}
354
355#endif
356
357
358struct cfs_rq {
359 struct load_weight load;
360 unsigned long nr_running;
361
362 u64 exec_clock;
363 u64 min_vruntime;
364
365 struct rb_root tasks_timeline;
366 struct rb_node *rb_leftmost;
367
368 struct list_head tasks;
369 struct list_head *balance_iterator;
370
371
372
373
374
375 struct sched_entity *curr, *next, *last;
376
377 unsigned int nr_spread_over;
378
379#ifdef CONFIG_FAIR_GROUP_SCHED
380 struct rq *rq;
381
382
383
384
385
386
387
388
389
390 struct list_head leaf_cfs_rq_list;
391 struct task_group *tg;
392
393#ifdef CONFIG_SMP
394
395
396
397 unsigned long task_weight;
398
399
400
401
402
403
404
405 unsigned long h_load;
406
407
408
409
410 unsigned long shares;
411
412
413
414
415 unsigned long rq_weight;
416#endif
417#endif
418};
419
420
421struct rt_rq {
422 struct rt_prio_array active;
423 unsigned long rt_nr_running;
424#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
425 struct {
426 int curr;
427#ifdef CONFIG_SMP
428 int next;
429#endif
430 } highest_prio;
431#endif
432#ifdef CONFIG_SMP
433 unsigned long rt_nr_migratory;
434 unsigned long rt_nr_total;
435 int overloaded;
436 struct plist_head pushable_tasks;
437#endif
438 int rt_throttled;
439 u64 rt_time;
440 u64 rt_runtime;
441
442 raw_spinlock_t rt_runtime_lock;
443
444#ifdef CONFIG_RT_GROUP_SCHED
445 unsigned long rt_nr_boosted;
446
447 struct rq *rq;
448 struct list_head leaf_rt_rq_list;
449 struct task_group *tg;
450#endif
451};
452
453#ifdef CONFIG_SMP
454
455
456
457
458
459
460
461
462
463struct root_domain {
464 atomic_t refcount;
465 cpumask_var_t span;
466 cpumask_var_t online;
467
468
469
470
471
472 cpumask_var_t rto_mask;
473 atomic_t rto_count;
474#ifdef CONFIG_SMP
475 struct cpupri cpupri;
476#endif
477};
478
479
480
481
482
483static struct root_domain def_root_domain;
484
485#endif
486
487
488
489
490
491
492
493
494struct rq {
495
496 raw_spinlock_t lock;
497
498
499
500
501
502 unsigned long nr_running;
503 #define CPU_LOAD_IDX_MAX 5
504 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
505#ifdef CONFIG_NO_HZ
506 unsigned char in_nohz_recently;
507#endif
508
509 struct load_weight load;
510 unsigned long nr_load_updates;
511 u64 nr_switches;
512
513 struct cfs_rq cfs;
514 struct rt_rq rt;
515
516#ifdef CONFIG_FAIR_GROUP_SCHED
517
518 struct list_head leaf_cfs_rq_list;
519#endif
520#ifdef CONFIG_RT_GROUP_SCHED
521 struct list_head leaf_rt_rq_list;
522#endif
523
524
525
526
527
528
529
530 unsigned long nr_uninterruptible;
531
532 struct task_struct *curr, *idle;
533 unsigned long next_balance;
534 struct mm_struct *prev_mm;
535
536 u64 clock;
537
538 atomic_t nr_iowait;
539
540#ifdef CONFIG_SMP
541 struct root_domain *rd;
542 struct sched_domain *sd;
543
544 unsigned char idle_at_tick;
545
546 int post_schedule;
547 int active_balance;
548 int push_cpu;
549
550 int cpu;
551 int online;
552
553 unsigned long avg_load_per_task;
554
555 struct task_struct *migration_thread;
556 struct list_head migration_queue;
557
558 u64 rt_avg;
559 u64 age_stamp;
560 u64 idle_stamp;
561 u64 avg_idle;
562#endif
563
564
565 unsigned long calc_load_update;
566 long calc_load_active;
567
568#ifdef CONFIG_SCHED_HRTICK
569#ifdef CONFIG_SMP
570 int hrtick_csd_pending;
571 struct call_single_data hrtick_csd;
572#endif
573 struct hrtimer hrtick_timer;
574#endif
575
576#ifdef CONFIG_SCHEDSTATS
577
578 struct sched_info rq_sched_info;
579 unsigned long long rq_cpu_time;
580
581
582
583 unsigned int yld_count;
584
585
586 unsigned int sched_switch;
587 unsigned int sched_count;
588 unsigned int sched_goidle;
589
590
591 unsigned int ttwu_count;
592 unsigned int ttwu_local;
593
594
595 unsigned int bkl_count;
596#endif
597};
598
599static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
600
601static inline
602void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
603{
604 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
605}
606
/* Return the cpu number of @rq: rq->cpu on SMP builds, 0 otherwise. */
static inline int cpu_of(struct rq *rq)
{
	int cpu = 0;

#ifdef CONFIG_SMP
	cpu = rq->cpu;
#endif
	return cpu;
}
615
/*
 * Dereference a sched-domain pointer.  The access is accepted when the
 * caller is in a sched-RCU read-side section or holds sched_domains_mutex.
 */
#define rcu_dereference_check_sched_domain(p) \
	rcu_dereference_check((p), \
			      rcu_read_lock_sched_held() || \
			      lockdep_is_held(&sched_domains_mutex))

/*
 * Walk the domain chain of @cpu: start at the runqueue's ->sd pointer
 * and follow ->parent until it is NULL.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

/* Runqueue accessors built on the per-cpu "runqueues" variable. */
#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		(&__get_cpu_var(runqueues))
#define task_rq(p)		cpu_rq(task_cpu(p))
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
#define raw_rq()		(&__raw_get_cpu_var(runqueues))
636
637inline void update_rq_clock(struct rq *rq)
638{
639 rq->clock = sched_clock_cpu(cpu_of(rq));
640}
641
642
643
644
/*
 * Qualifier for tunables: "static const" in normal builds, but a
 * writable __read_mostly variable when CONFIG_SCHED_DEBUG is set.
 */
#ifndef CONFIG_SCHED_DEBUG
# define const_debug static const
#else
# define const_debug __read_mostly
#endif
650
651
652
653
654
655
656
657
658
659int runqueue_is_locked(int cpu)
660{
661 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
662}
663
664
665
666
667
668#define SCHED_FEAT(name, enabled) \
669 __SCHED_FEAT_##name ,
670
671enum {
672#include "sched_features.h"
673};
674
675#undef SCHED_FEAT
676
677#define SCHED_FEAT(name, enabled) \
678 (1UL << __SCHED_FEAT_##name) * enabled |
679
680const_debug unsigned int sysctl_sched_features =
681#include "sched_features.h"
682 0;
683
684#undef SCHED_FEAT
685
686#ifdef CONFIG_SCHED_DEBUG
687#define SCHED_FEAT(name, enabled) \
688 #name ,
689
690static __read_mostly char *sched_feat_names[] = {
691#include "sched_features.h"
692 NULL
693};
694
695#undef SCHED_FEAT
696
697static int sched_feat_show(struct seq_file *m, void *v)
698{
699 int i;
700
701 for (i = 0; sched_feat_names[i]; i++) {
702 if (!(sysctl_sched_features & (1UL << i)))
703 seq_puts(m, "NO_");
704 seq_printf(m, "%s ", sched_feat_names[i]);
705 }
706 seq_puts(m, "\n");
707
708 return 0;
709}
710
711static ssize_t
712sched_feat_write(struct file *filp, const char __user *ubuf,
713 size_t cnt, loff_t *ppos)
714{
715 char buf[64];
716 char *cmp = buf;
717 int neg = 0;
718 int i;
719
720 if (cnt > 63)
721 cnt = 63;
722
723 if (copy_from_user(&buf, ubuf, cnt))
724 return -EFAULT;
725
726 buf[cnt] = 0;
727
728 if (strncmp(buf, "NO_", 3) == 0) {
729 neg = 1;
730 cmp += 3;
731 }
732
733 for (i = 0; sched_feat_names[i]; i++) {
734 int len = strlen(sched_feat_names[i]);
735
736 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
737 if (neg)
738 sysctl_sched_features &= ~(1UL << i);
739 else
740 sysctl_sched_features |= (1UL << i);
741 break;
742 }
743 }
744
745 if (!sched_feat_names[i])
746 return -EINVAL;
747
748 *ppos += cnt;
749
750 return cnt;
751}
752
753static int sched_feat_open(struct inode *inode, struct file *filp)
754{
755 return single_open(filp, sched_feat_show, NULL);
756}
757
758static const struct file_operations sched_feat_fops = {
759 .open = sched_feat_open,
760 .write = sched_feat_write,
761 .read = seq_read,
762 .llseek = seq_lseek,
763 .release = single_release,
764};
765
766static __init int sched_init_debug(void)
767{
768 debugfs_create_file("sched_features", 0644, NULL, NULL,
769 &sched_feat_fops);
770
771 return 0;
772}
773late_initcall(sched_init_debug);
774
775#endif
776
/* Non-zero when feature @x has its bit set in sysctl_sched_features. */
#define sched_feat(x)	(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
778
779
780
781
782
783const_debug unsigned int sysctl_sched_nr_migrate = 32;
784
785
786
787
788
789unsigned int sysctl_sched_shares_ratelimit = 250000;
790unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
791
792
793
794
795
796
797unsigned int sysctl_sched_shares_thresh = 4;
798
799
800
801
802
803
804
805const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
806
807
808
809
810
811unsigned int sysctl_sched_rt_period = 1000000;
812
813static __read_mostly int scheduler_running;
814
815
816
817
818
819int sysctl_sched_rt_runtime = 950000;
820
821static inline u64 global_rt_period(void)
822{
823 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
824}
825
826static inline u64 global_rt_runtime(void)
827{
828 if (sysctl_sched_rt_runtime < 0)
829 return RUNTIME_INF;
830
831 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
832}
833
/* No-op defaults for the context-switch hooks an architecture may define. */
#ifndef prepare_arch_switch
# define prepare_arch_switch(next)	do { } while (0)
#endif

#ifndef finish_arch_switch
# define finish_arch_switch(prev)	do { } while (0)
#endif
840
841static inline int task_current(struct rq *rq, struct task_struct *p)
842{
843 return rq->curr == p;
844}
845
846#ifndef __ARCH_WANT_UNLOCKED_CTXSW
/* Locked context switch: a task is running iff it is rq->curr. */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	int is_curr = task_current(rq, p);

	return is_curr;
}

/* Locked context switch: nothing to prepare. */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}
855
856static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
857{
858#ifdef CONFIG_DEBUG_SPINLOCK
859
860 rq->lock.owner = current;
861#endif
862
863
864
865
866
867 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
868
869 raw_spin_unlock_irq(&rq->lock);
870}
871
872#else
/*
 * Unlocked context switch: on SMP a task counts as running while its
 * ->oncpu flag is set; on UP it is running iff it is rq->curr.
 */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifndef CONFIG_SMP
	return task_current(rq, p);
#else
	return p->oncpu;
#endif
}
881
882static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
883{
884#ifdef CONFIG_SMP
885
886
887
888
889
890 next->oncpu = 1;
891#endif
892#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
893 raw_spin_unlock_irq(&rq->lock);
894#else
895 raw_spin_unlock(&rq->lock);
896#endif
897}
898
/*
 * Unlocked context switch: after the switch, clear @prev's ->oncpu flag
 * (SMP only, behind a write barrier so earlier stores are ordered before
 * the clear).  Interrupts are enabled here unless they were already
 * turned on for the switch.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/* order earlier stores before the flag is cleared */
	smp_wmb();
	prev->oncpu = 0;
#endif

#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
914#endif
915
916
917
918
919
920
921
922
923
924static inline int task_is_waking(struct task_struct *p)
925{
926 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
927}
928
929
930
931
932
933static inline struct rq *__task_rq_lock(struct task_struct *p)
934 __acquires(rq->lock)
935{
936 struct rq *rq;
937
938 for (;;) {
939 while (task_is_waking(p))
940 cpu_relax();
941 rq = task_rq(p);
942 raw_spin_lock(&rq->lock);
943 if (likely(rq == task_rq(p) && !task_is_waking(p)))
944 return rq;
945 raw_spin_unlock(&rq->lock);
946 }
947}
948
949
950
951
952
953
954static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
955 __acquires(rq->lock)
956{
957 struct rq *rq;
958
959 for (;;) {
960 while (task_is_waking(p))
961 cpu_relax();
962 local_irq_save(*flags);
963 rq = task_rq(p);
964 raw_spin_lock(&rq->lock);
965 if (likely(rq == task_rq(p) && !task_is_waking(p)))
966 return rq;
967 raw_spin_unlock_irqrestore(&rq->lock, *flags);
968 }
969}
970
971void task_rq_unlock_wait(struct task_struct *p)
972{
973 struct rq *rq = task_rq(p);
974
975 smp_mb();
976 raw_spin_unlock_wait(&rq->lock);
977}
978
979static void __task_rq_unlock(struct rq *rq)
980 __releases(rq->lock)
981{
982 raw_spin_unlock(&rq->lock);
983}
984
985static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
986 __releases(rq->lock)
987{
988 raw_spin_unlock_irqrestore(&rq->lock, *flags);
989}
990
991
992
993
994static struct rq *this_rq_lock(void)
995 __acquires(rq->lock)
996{
997 struct rq *rq;
998
999 local_irq_disable();
1000 rq = this_rq();
1001 raw_spin_lock(&rq->lock);
1002
1003 return rq;
1004}
1005
1006#ifdef CONFIG_SCHED_HRTICK
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023static inline int hrtick_enabled(struct rq *rq)
1024{
1025 if (!sched_feat(HRTICK))
1026 return 0;
1027 if (!cpu_active(cpu_of(rq)))
1028 return 0;
1029 return hrtimer_is_hres_active(&rq->hrtick_timer);
1030}
1031
1032static void hrtick_clear(struct rq *rq)
1033{
1034 if (hrtimer_active(&rq->hrtick_timer))
1035 hrtimer_cancel(&rq->hrtick_timer);
1036}
1037
1038
1039
1040
1041
1042static enum hrtimer_restart hrtick(struct hrtimer *timer)
1043{
1044 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1045
1046 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1047
1048 raw_spin_lock(&rq->lock);
1049 update_rq_clock(rq);
1050 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1051 raw_spin_unlock(&rq->lock);
1052
1053 return HRTIMER_NORESTART;
1054}
1055
1056#ifdef CONFIG_SMP
1057
1058
1059
1060static void __hrtick_start(void *arg)
1061{
1062 struct rq *rq = arg;
1063
1064 raw_spin_lock(&rq->lock);
1065 hrtimer_restart(&rq->hrtick_timer);
1066 rq->hrtick_csd_pending = 0;
1067 raw_spin_unlock(&rq->lock);
1068}
1069
1070
1071
1072
1073
1074
1075static void hrtick_start(struct rq *rq, u64 delay)
1076{
1077 struct hrtimer *timer = &rq->hrtick_timer;
1078 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1079
1080 hrtimer_set_expires(timer, time);
1081
1082 if (rq == this_rq()) {
1083 hrtimer_restart(timer);
1084 } else if (!rq->hrtick_csd_pending) {
1085 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1086 rq->hrtick_csd_pending = 1;
1087 }
1088}
1089
1090static int
1091hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1092{
1093 int cpu = (int)(long)hcpu;
1094
1095 switch (action) {
1096 case CPU_UP_CANCELED:
1097 case CPU_UP_CANCELED_FROZEN:
1098 case CPU_DOWN_PREPARE:
1099 case CPU_DOWN_PREPARE_FROZEN:
1100 case CPU_DEAD:
1101 case CPU_DEAD_FROZEN:
1102 hrtick_clear(cpu_rq(cpu));
1103 return NOTIFY_OK;
1104 }
1105
1106 return NOTIFY_DONE;
1107}
1108
1109static __init void init_hrtick(void)
1110{
1111 hotcpu_notifier(hotplug_hrtick, 0);
1112}
1113#else
1114
1115
1116
1117
1118
1119static void hrtick_start(struct rq *rq, u64 delay)
1120{
1121 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1122 HRTIMER_MODE_REL_PINNED, 0);
1123}
1124
1125static inline void init_hrtick(void)
1126{
1127}
1128#endif
1129
1130static void init_rq_hrtick(struct rq *rq)
1131{
1132#ifdef CONFIG_SMP
1133 rq->hrtick_csd_pending = 0;
1134
1135 rq->hrtick_csd.flags = 0;
1136 rq->hrtick_csd.func = __hrtick_start;
1137 rq->hrtick_csd.info = rq;
1138#endif
1139
1140 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1141 rq->hrtick_timer.function = hrtick;
1142}
1143#else
/* Without CONFIG_SCHED_HRTICK the hrtick helpers are empty stubs. */
static inline void hrtick_clear(struct rq *rq) { }

static inline void init_rq_hrtick(struct rq *rq) { }

static inline void init_hrtick(void) { }
1155#endif
1156
1157
1158
1159
1160
1161
1162
1163
1164#ifdef CONFIG_SMP
1165
/* Default: a task is polling when its TIF_POLLING_NRFLAG thread flag is set. */
#ifndef tsk_is_polling
#define tsk_is_polling(t)	test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif
1169
1170static void resched_task(struct task_struct *p)
1171{
1172 int cpu;
1173
1174 assert_raw_spin_locked(&task_rq(p)->lock);
1175
1176 if (test_tsk_need_resched(p))
1177 return;
1178
1179 set_tsk_need_resched(p);
1180
1181 cpu = task_cpu(p);
1182 if (cpu == smp_processor_id())
1183 return;
1184
1185
1186 smp_mb();
1187 if (!tsk_is_polling(p))
1188 smp_send_reschedule(cpu);
1189}
1190
1191static void resched_cpu(int cpu)
1192{
1193 struct rq *rq = cpu_rq(cpu);
1194 unsigned long flags;
1195
1196 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1197 return;
1198 resched_task(cpu_curr(cpu));
1199 raw_spin_unlock_irqrestore(&rq->lock, flags);
1200}
1201
#ifdef CONFIG_NO_HZ
/*
 * wake_up_idle_cpu - kick an idle remote cpu
 * @cpu: cpu to wake
 *
 * Nothing is done for the calling cpu, or when the target's current
 * task is not its idle task.  That check is made without taking the
 * runqueue lock.  Otherwise the idle task is flagged as needing a
 * reschedule and, unless it is polling, a reschedule IPI is sent.
 */
void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id() || rq->curr != rq->idle)
		return;

	set_tsk_need_resched(rq->idle);

	/* make the need_resched store visible before testing for polling */
	smp_mb();
	if (tsk_is_polling(rq->idle))
		return;

	smp_send_reschedule(cpu);
}
#endif
1243
1244static u64 sched_avg_period(void)
1245{
1246 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1247}
1248
1249static void sched_avg_update(struct rq *rq)
1250{
1251 s64 period = sched_avg_period();
1252
1253 while ((s64)(rq->clock - rq->age_stamp) > period) {
1254 rq->age_stamp += period;
1255 rq->rt_avg /= 2;
1256 }
1257}
1258
1259static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1260{
1261 rq->rt_avg += rt_delta;
1262 sched_avg_update(rq);
1263}
1264
1265#else
1266static void resched_task(struct task_struct *p)
1267{
1268 assert_raw_spin_locked(&task_rq(p)->lock);
1269 set_tsk_need_resched(p);
1270}
1271
1272static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1273{
1274}
1275#endif
1276
/*
 * Fixed-point constants for calc_delta_mine(): WMULT_CONST is 2^32 where
 * an unsigned long can hold it, and the all-ones value on 32-bit.
 */
#if BITS_PER_LONG != 32
# define WMULT_CONST	(1UL << 32)
#else
# define WMULT_CONST	(~0UL)
#endif

#define WMULT_SHIFT	32

/* Shift right and round: add half of 2^y before shifting x right by y. */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1289
1290
1291
1292
1293static unsigned long
1294calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1295 struct load_weight *lw)
1296{
1297 u64 tmp;
1298
1299 if (!lw->inv_weight) {
1300 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1301 lw->inv_weight = 1;
1302 else
1303 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1304 / (lw->weight+1);
1305 }
1306
1307 tmp = (u64)delta_exec * weight;
1308
1309
1310
1311 if (unlikely(tmp > WMULT_CONST))
1312 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1313 WMULT_SHIFT/2);
1314 else
1315 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1316
1317 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1318}
1319
1320static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1321{
1322 lw->weight += inc;
1323 lw->inv_weight = 0;
1324}
1325
1326static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1327{
1328 lw->weight -= dec;
1329 lw->inv_weight = 0;
1330}
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
/*
 * Load weight for idle-priority tasks, and its inverse multiplier
 * (1431655765 is 2^32 / 3, rounded down).
 */
#define WEIGHT_IDLEPRIO		3
#define WMULT_IDLEPRIO		1431655765
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
/*
 * Load weight for each of the 40 nice levels.  Index 0 is the highest
 * weight and index 20 is 1024; given NICE_TO_PRIO()'s offset of 20,
 * rows presumably start at nice -20 -- TODO confirm against the lookup
 * site.
 */
static const int prio_to_weight[40] = {
	/* index  0 */ 88761, 71755, 56483, 46273, 36291,
	/* index  5 */ 29154, 23254, 18705, 14949, 11916,
	/* index 10 */  9548,  7620,  6100,  4904,  3906,
	/* index 15 */  3121,  2501,  1991,  1586,  1277,
	/* index 20 */  1024,   820,   655,   526,   423,
	/* index 25 */   335,   272,   215,   172,   137,
	/* index 30 */   110,    87,    70,    56,    45,
	/* index 35 */    36,    29,    23,    18,    15,
};
1366
1367
1368
1369
1370
1371
1372
1373
1374static const u32 prio_to_wmult[40] = {
1375 48388, 59856, 76040, 92818, 118348,
1376 147320, 184698, 229616, 287308, 360437,
1377 449829, 563644, 704093, 875809, 1099582,
1378 1376151, 1717300, 2157191, 2708050, 3363326,
1379 4194304, 5237765, 6557202, 8165337, 10153587,
1380 12820798, 15790321, 19976592, 24970740, 31350126,
1381 39045157, 49367440, 61356676, 76695844, 95443717,
1382 119304647, 148102320, 186737708, 238609294, 286331153,
1383};
1384
1385
/* Indices of the cpu accounting statistics; NSTATS is the entry count. */
enum cpuacct_stat_index {
	CPUACCT_STAT_USER,	/* user-mode time */
	CPUACCT_STAT_SYSTEM,	/* system-mode time */

	CPUACCT_STAT_NSTATS,
};
1392
1393#ifdef CONFIG_CGROUP_CPUACCT
1394static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1395static void cpuacct_update_stats(struct task_struct *tsk,
1396 enum cpuacct_stat_index idx, cputime_t val);
1397#else
1398static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1399static inline void cpuacct_update_stats(struct task_struct *tsk,
1400 enum cpuacct_stat_index idx, cputime_t val) {}
1401#endif
1402
1403static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1404{
1405 update_load_add(&rq->load, load);
1406}
1407
1408static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1409{
1410 update_load_sub(&rq->load, load);
1411}
1412
1413#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1414typedef int (*tg_visitor)(struct task_group *, void *);
1415
1416
1417
1418
1419
1420static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1421{
1422 struct task_group *parent, *child;
1423 int ret;
1424
1425 rcu_read_lock();
1426 parent = &root_task_group;
1427down:
1428 ret = (*down)(parent, data);
1429 if (ret)
1430 goto out_unlock;
1431 list_for_each_entry_rcu(child, &parent->children, siblings) {
1432 parent = child;
1433 goto down;
1434
1435up:
1436 continue;
1437 }
1438 ret = (*up)(parent, data);
1439 if (ret)
1440 goto out_unlock;
1441
1442 child = parent;
1443 parent = parent->parent;
1444 if (parent)
1445 goto up;
1446out_unlock:
1447 rcu_read_unlock();
1448
1449 return ret;
1450}
1451
/* tg_visitor that does nothing; returns 0 so the walk continues. */
static int tg_nop(struct task_group *tg, void *data)
{
	(void)tg;
	(void)data;

	return 0;
}
1456#endif
1457
1458#ifdef CONFIG_SMP
1459
1460static unsigned long weighted_cpuload(const int cpu)
1461{
1462 return cpu_rq(cpu)->load.weight;
1463}
1464
1465
1466
1467
1468
1469
1470
1471
1472static unsigned long source_load(int cpu, int type)
1473{
1474 struct rq *rq = cpu_rq(cpu);
1475 unsigned long total = weighted_cpuload(cpu);
1476
1477 if (type == 0 || !sched_feat(LB_BIAS))
1478 return total;
1479
1480 return min(rq->cpu_load[type-1], total);
1481}
1482
1483
1484
1485
1486
1487static unsigned long target_load(int cpu, int type)
1488{
1489 struct rq *rq = cpu_rq(cpu);
1490 unsigned long total = weighted_cpuload(cpu);
1491
1492 if (type == 0 || !sched_feat(LB_BIAS))
1493 return total;
1494
1495 return max(rq->cpu_load[type-1], total);
1496}
1497
1498static struct sched_group *group_of(int cpu)
1499{
1500 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1501
1502 if (!sd)
1503 return NULL;
1504
1505 return sd->groups;
1506}
1507
1508static unsigned long power_of(int cpu)
1509{
1510 struct sched_group *group = group_of(cpu);
1511
1512 if (!group)
1513 return SCHED_LOAD_SCALE;
1514
1515 return group->cpu_power;
1516}
1517
1518static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1519
1520static unsigned long cpu_avg_load_per_task(int cpu)
1521{
1522 struct rq *rq = cpu_rq(cpu);
1523 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1524
1525 if (nr_running)
1526 rq->avg_load_per_task = rq->load.weight / nr_running;
1527 else
1528 rq->avg_load_per_task = 0;
1529
1530 return rq->avg_load_per_task;
1531}
1532
1533#ifdef CONFIG_FAIR_GROUP_SCHED
1534
1535static __read_mostly unsigned long __percpu *update_shares_data;
1536
1537static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1538
1539
1540
1541
1542static void update_group_shares_cpu(struct task_group *tg, int cpu,
1543 unsigned long sd_shares,
1544 unsigned long sd_rq_weight,
1545 unsigned long *usd_rq_weight)
1546{
1547 unsigned long shares, rq_weight;
1548 int boost = 0;
1549
1550 rq_weight = usd_rq_weight[cpu];
1551 if (!rq_weight) {
1552 boost = 1;
1553 rq_weight = NICE_0_LOAD;
1554 }
1555
1556
1557
1558
1559
1560
1561 shares = (sd_shares * rq_weight) / sd_rq_weight;
1562 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1563
1564 if (abs(shares - tg->se[cpu]->load.weight) >
1565 sysctl_sched_shares_thresh) {
1566 struct rq *rq = cpu_rq(cpu);
1567 unsigned long flags;
1568
1569 raw_spin_lock_irqsave(&rq->lock, flags);
1570 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1571 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1572 __set_se_shares(tg->se[cpu], shares);
1573 raw_spin_unlock_irqrestore(&rq->lock, flags);
1574 }
1575}
1576
1577
1578
1579
1580
1581
1582static int tg_shares_up(struct task_group *tg, void *data)
1583{
1584 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1585 unsigned long *usd_rq_weight;
1586 struct sched_domain *sd = data;
1587 unsigned long flags;
1588 int i;
1589
1590 if (!tg->se[0])
1591 return 0;
1592
1593 local_irq_save(flags);
1594 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1595
1596 for_each_cpu(i, sched_domain_span(sd)) {
1597 weight = tg->cfs_rq[i]->load.weight;
1598 usd_rq_weight[i] = weight;
1599
1600 rq_weight += weight;
1601
1602
1603
1604
1605
1606 if (!weight)
1607 weight = NICE_0_LOAD;
1608
1609 sum_weight += weight;
1610 shares += tg->cfs_rq[i]->shares;
1611 }
1612
1613 if (!rq_weight)
1614 rq_weight = sum_weight;
1615
1616 if ((!shares && rq_weight) || shares > tg->shares)
1617 shares = tg->shares;
1618
1619 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1620 shares = tg->shares;
1621
1622 for_each_cpu(i, sched_domain_span(sd))
1623 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1624
1625 local_irq_restore(flags);
1626
1627 return 0;
1628}
1629
1630
1631
1632
1633
1634
1635static int tg_load_down(struct task_group *tg, void *data)
1636{
1637 unsigned long load;
1638 long cpu = (long)data;
1639
1640 if (!tg->parent) {
1641 load = cpu_rq(cpu)->load.weight;
1642 } else {
1643 load = tg->parent->cfs_rq[cpu]->h_load;
1644 load *= tg->cfs_rq[cpu]->shares;
1645 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1646 }
1647
1648 tg->cfs_rq[cpu]->h_load = load;
1649
1650 return 0;
1651}
1652
1653static void update_shares(struct sched_domain *sd)
1654{
1655 s64 elapsed;
1656 u64 now;
1657
1658 if (root_task_group_empty())
1659 return;
1660
1661 now = cpu_clock(raw_smp_processor_id());
1662 elapsed = now - sd->last_update;
1663
1664 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1665 sd->last_update = now;
1666 walk_tg_tree(tg_nop, tg_shares_up, sd);
1667 }
1668}
1669
1670static void update_h_load(long cpu)
1671{
1672 if (root_task_group_empty())
1673 return;
1674
1675 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1676}
1677
1678#else
1679
/*
 * No-op stand-in for update_shares(), compiled when the preceding #if
 * branch (which holds the real implementation) is disabled.
 */
static inline void update_shares(struct sched_domain *sd)
{
}
1683
1684#endif
1685
1686#ifdef CONFIG_PREEMPT
1687
/* Defined further down; needed by the CONFIG_PREEMPT variant below. */
static void double_rq_lock(struct rq *rq1, struct rq *rq2);

/*
 * CONFIG_PREEMPT variant of _double_lock_balance().
 *
 * Unconditionally drops this_rq->lock and then takes both runqueue locks
 * through double_rq_lock(), which imposes the address-based lock order.
 *
 * Always returns 1: this_rq->lock was released, so the caller must assume
 * this_rq may have changed while it was unlocked.
 * NOTE(review): the rationale for always dropping the lock under PREEMPT
 * is not visible in this file chunk - confirm against scheduler history.
 */
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(this_rq->lock)
	__acquires(busiest->lock)
	__acquires(this_rq->lock)
{
	raw_spin_unlock(&this_rq->lock);
	double_rq_lock(this_rq, busiest);

	return 1;
}
1708
1709#else
1710
1711
1712
1713
1714
1715
1716
1717static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1718 __releases(this_rq->lock)
1719 __acquires(busiest->lock)
1720 __acquires(this_rq->lock)
1721{
1722 int ret = 0;
1723
1724 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1725 if (busiest < this_rq) {
1726 raw_spin_unlock(&this_rq->lock);
1727 raw_spin_lock(&busiest->lock);
1728 raw_spin_lock_nested(&this_rq->lock,
1729 SINGLE_DEPTH_NESTING);
1730 ret = 1;
1731 } else
1732 raw_spin_lock_nested(&busiest->lock,
1733 SINGLE_DEPTH_NESTING);
1734 }
1735 return ret;
1736}
1737
1738#endif
1739
1740
1741
1742
/*
 * Lock @busiest in addition to the already-held @this_rq lock.
 *
 * Must be called with interrupts disabled; if they are not, this_rq->lock
 * is released before BUG_ON() fires.
 * NOTE(review): presumably the unlock is there so the BUG report is not
 * emitted under rq->lock - confirm.
 *
 * Returns the result of _double_lock_balance(): non-zero when this_rq->lock
 * was dropped and re-acquired.
 */
static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
	if (unlikely(!irqs_disabled())) {

		raw_spin_unlock(&this_rq->lock);
		BUG_ON(1);
	}

	return _double_lock_balance(this_rq, busiest);
}
1753
/*
 * Undo double_lock_balance(): release busiest->lock and reset the lockdep
 * subclass of this_rq->lock to 0 (it may have been taken with
 * SINGLE_DEPTH_NESTING). this_rq->lock itself stays held.
 */
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(busiest->lock)
{
	raw_spin_unlock(&busiest->lock);
	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
1760
1761
1762
1763
1764
1765
1766
1767static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1768 __acquires(rq1->lock)
1769 __acquires(rq2->lock)
1770{
1771 BUG_ON(!irqs_disabled());
1772 if (rq1 == rq2) {
1773 raw_spin_lock(&rq1->lock);
1774 __acquire(rq2->lock);
1775 } else {
1776 if (rq1 < rq2) {
1777 raw_spin_lock(&rq1->lock);
1778 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1779 } else {
1780 raw_spin_lock(&rq2->lock);
1781 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1782 }
1783 }
1784 update_rq_clock(rq1);
1785 update_rq_clock(rq2);
1786}
1787
1788
1789
1790
1791
1792
1793
1794static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1795 __releases(rq1->lock)
1796 __releases(rq2->lock)
1797{
1798 raw_spin_unlock(&rq1->lock);
1799 if (rq1 != rq2)
1800 raw_spin_unlock(&rq2->lock);
1801 else
1802 __release(rq2->lock);
1803}
1804
1805#endif
1806
1807#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Record @shares in @cfs_rq. The store only exists on CONFIG_SMP builds;
 * on other builds this function is a no-op.
 */
static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
{
#ifdef CONFIG_SMP
	cfs_rq->shares = shares;
#endif
}
1814#endif
1815
/* Forward declarations for helpers used before their definitions. */
static void calc_load_account_active(struct rq *this_rq);
static void update_sysctl(void);
static int get_update_sysctl_factor(void);
1819
/*
 * Bind @p to @cpu: update its runqueue association via set_task_rq() and,
 * on SMP, publish the new cpu number in the task's thread_info.
 */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_rq(p, cpu);
#ifdef CONFIG_SMP
	/*
	 * The write barrier orders the stores done by set_task_rq() before
	 * the ->cpu store below.
	 * NOTE(review): presumably so that another cpu observing the new
	 * ->cpu also sees the updated per-task data - confirm against
	 * task_rq_lock().
	 */
	smp_wmb();
	task_thread_info(p)->cpu = cpu;
#endif
}
1833
/* Tentative declaration; the definition comes from an included file. */
static const struct sched_class rt_sched_class;

/* Scheduling classes are iterated starting from the rt class ... */
#define sched_class_highest (&rt_sched_class)
/* ... and following ->next until it is NULL. */
#define for_each_class(class) \
   for (class = sched_class_highest; class; class = class->next)
1839
1840#include "sched_stats.h"
1841
1842static void inc_nr_running(struct rq *rq)
1843{
1844 rq->nr_running++;
1845}
1846
1847static void dec_nr_running(struct rq *rq)
1848{
1849 rq->nr_running--;
1850}
1851
1852static void set_load_weight(struct task_struct *p)
1853{
1854 if (task_has_rt_policy(p)) {
1855 p->se.load.weight = prio_to_weight[0] * 2;
1856 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1857 return;
1858 }
1859
1860
1861
1862
1863 if (p->policy == SCHED_IDLE) {
1864 p->se.load.weight = WEIGHT_IDLEPRIO;
1865 p->se.load.inv_weight = WMULT_IDLEPRIO;
1866 return;
1867 }
1868
1869 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1870 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1871}
1872
1873static void update_avg(u64 *avg, u64 sample)
1874{
1875 s64 diff = sample - *avg;
1876 *avg += diff >> 3;
1877}
1878
1879static void
1880enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1881{
1882 if (wakeup)
1883 p->se.start_runtime = p->se.sum_exec_runtime;
1884
1885 sched_info_queued(p);
1886 p->sched_class->enqueue_task(rq, p, wakeup, head);
1887 p->se.on_rq = 1;
1888}
1889
1890static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1891{
1892 if (sleep) {
1893 if (p->se.last_wakeup) {
1894 update_avg(&p->se.avg_overlap,
1895 p->se.sum_exec_runtime - p->se.last_wakeup);
1896 p->se.last_wakeup = 0;
1897 } else {
1898 update_avg(&p->se.avg_wakeup,
1899 sysctl_sched_wakeup_granularity);
1900 }
1901 }
1902
1903 sched_info_dequeued(p);
1904 p->sched_class->dequeue_task(rq, p, sleep);
1905 p->se.on_rq = 0;
1906}
1907
1908
1909
1910
1911static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1912{
1913 if (task_contributes_to_load(p))
1914 rq->nr_uninterruptible--;
1915
1916 enqueue_task(rq, p, wakeup, false);
1917 inc_nr_running(rq);
1918}
1919
1920
1921
1922
1923static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1924{
1925 if (task_contributes_to_load(p))
1926 rq->nr_uninterruptible++;
1927
1928 dequeue_task(rq, p, sleep);
1929 dec_nr_running(rq);
1930}
1931
1932#include "sched_idletask.c"
1933#include "sched_fair.c"
1934#include "sched_rt.c"
1935#ifdef CONFIG_SCHED_DEBUG
1936# include "sched_debug.c"
1937#endif
1938
1939
1940
1941
1942static inline int __normal_prio(struct task_struct *p)
1943{
1944 return p->static_prio;
1945}
1946
1947
1948
1949
1950
1951
1952
1953
1954static inline int normal_prio(struct task_struct *p)
1955{
1956 int prio;
1957
1958 if (task_has_rt_policy(p))
1959 prio = MAX_RT_PRIO-1 - p->rt_priority;
1960 else
1961 prio = __normal_prio(p);
1962 return prio;
1963}
1964
1965
1966
1967
1968
1969
1970
1971
1972static int effective_prio(struct task_struct *p)
1973{
1974 p->normal_prio = normal_prio(p);
1975
1976
1977
1978
1979
1980 if (!rt_prio(p->prio))
1981 return p->normal_prio;
1982 return p->prio;
1983}
1984
1985
1986
1987
1988
/* Return non-zero iff @p is the task currently running on its cpu. */
inline int task_curr(const struct task_struct *p)
{
	int cpu = task_cpu(p);

	return cpu_curr(cpu) == p;
}
1993
1994static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1995 const struct sched_class *prev_class,
1996 int oldprio, int running)
1997{
1998 if (prev_class != p->sched_class) {
1999 if (prev_class->switched_from)
2000 prev_class->switched_from(rq, p, running);
2001 p->sched_class->switched_to(rq, p, running);
2002 } else
2003 p->sched_class->prio_changed(rq, p, oldprio, running);
2004}
2005
2006#ifdef CONFIG_SMP
2007
2008
2009
2010static int
2011task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2012{
2013 s64 delta;
2014
2015 if (p->sched_class != &fair_sched_class)
2016 return 0;
2017
2018
2019
2020
2021 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2022 (&p->se == cfs_rq_of(&p->se)->next ||
2023 &p->se == cfs_rq_of(&p->se)->last))
2024 return 1;
2025
2026 if (sysctl_sched_migration_cost == -1)
2027 return 1;
2028 if (sysctl_sched_migration_cost == 0)
2029 return 0;
2030
2031 delta = now - p->se.exec_start;
2032
2033 return delta < (s64)sysctl_sched_migration_cost;
2034}
2035
/*
 * Move @p to @new_cpu: emit the migration tracepoint, account the
 * migration (only if the cpu actually changes) and update the task's
 * cpu/runqueue association via __set_task_cpu().
 */
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
	/*
	 * Debug check: a task being moved should be TASK_RUNNING or
	 * TASK_WAKING, or have PREEMPT_ACTIVE set in its preempt count.
	 */
	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
#endif

	trace_sched_migrate_task(p, new_cpu);

	/* Only a real cpu change counts as a migration. */
	if (task_cpu(p) != new_cpu) {
		p->se.nr_migrations++;
		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
	}

	__set_task_cpu(p, new_cpu);
}
2056
/* A request to move a task to another cpu, queued on rq->migration_queue. */
struct migration_req {
	struct list_head list;		/* link in the runqueue's migration_queue */

	struct task_struct *task;	/* task to be migrated */
	int dest_cpu;			/* cpu the task should end up on */

	struct completion done;		/* waited on by the requester (see sched_exec()) */
};
2065
2066
2067
2068
2069
2070static int
2071migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2072{
2073 struct rq *rq = task_rq(p);
2074
2075
2076
2077
2078
2079 if (!p->se.on_rq && !task_running(rq, p))
2080 return 0;
2081
2082 init_completion(&req->done);
2083 req->task = p;
2084 req->dest_cpu = dest_cpu;
2085 list_add(&req->list, &rq->migration_queue);
2086
2087 return 1;
2088}
2089
2090
2091
2092
2093
2094
2095
2096void wait_task_context_switch(struct task_struct *p)
2097{
2098 unsigned long nvcsw, nivcsw, flags;
2099 int running;
2100 struct rq *rq;
2101
2102 nvcsw = p->nvcsw;
2103 nivcsw = p->nivcsw;
2104 for (;;) {
2105
2106
2107
2108
2109
2110
2111
2112
2113 rq = task_rq_lock(p, &flags);
2114 running = task_running(rq, p);
2115 task_rq_unlock(rq, &flags);
2116
2117 if (likely(!running))
2118 break;
2119
2120
2121
2122
2123
2124 if ((p->nvcsw - nvcsw) > 1)
2125 break;
2126 if ((p->nivcsw - nivcsw) > 1)
2127 break;
2128
2129 cpu_relax();
2130 }
2131}
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
/*
 * Wait until @p is neither running on a cpu nor queued on a runqueue.
 *
 * @match_state: if non-zero, give up as soon as p->state differs from it.
 *
 * Returns 0 if @match_state was given and the task's state stopped
 * matching. Otherwise returns the task's voluntary context-switch count
 * with the top bit (LONG_MIN) or-ed in, which makes a successful result
 * always non-zero.
 */
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
	unsigned long flags;
	int running, on_rq;
	unsigned long ncsw;
	struct rq *rq;

	for (;;) {
		/*
		 * Unlocked peek at the task's runqueue; the result is only
		 * used for the lockless spin below and re-validated under
		 * the lock afterwards.
		 */
		rq = task_rq(p);

		/*
		 * Spin without taking the rq lock while the task is on a cpu,
		 * bailing out if the requested state no longer matches.
		 */
		while (task_running(rq, p)) {
			if (match_state && unlikely(p->state != match_state))
				return 0;
			cpu_relax();
		}

		/*
		 * Now sample running/queued state and the switch count
		 * consistently under the runqueue lock. ncsw stays 0 when a
		 * requested state does not match.
		 */
		rq = task_rq_lock(p, &flags);
		trace_sched_wait_task(rq, p);
		running = task_running(rq, p);
		on_rq = p->se.on_rq;
		ncsw = 0;
		if (!match_state || p->state == match_state)
			ncsw = p->nvcsw | LONG_MIN;
		task_rq_unlock(rq, &flags);

		/* State mismatch: report failure (0) to the caller. */
		if (unlikely(!ncsw))
			break;

		/* Started running again in the meantime: retry the spin. */
		if (unlikely(running)) {
			cpu_relax();
			continue;
		}

		/*
		 * Not running but still queued: sleep for one tick instead of
		 * spinning, then re-check.
		 */
		if (unlikely(on_rq)) {
			schedule_timeout_uninterruptible(1);
			continue;
		}

		/* Neither running nor queued: done. */
		break;
	}

	return ncsw;
}
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
/*
 * If @p is currently executing on another cpu, send that cpu a reschedule
 * IPI. Nothing happens when the task is on the local cpu or not running.
 * Preemption is disabled around the check so that the local cpu number
 * stays valid.
 */
void kick_process(struct task_struct *p)
{
	int target;

	preempt_disable();
	target = task_cpu(p);
	if (target != smp_processor_id() && task_curr(p))
		smp_send_reschedule(target);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
2262#endif
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
/*
 * Call @func(@info) on the cpu @p is running on, if @p is currently
 * running at all.
 *
 * The cpu is sampled before the task_curr() check and the call waits for
 * @func to finish (last argument 1). If the task is not current on its
 * cpu, @func is not called. Preemption is disabled for the duration.
 */
void task_oncpu_function_call(struct task_struct *p,
			      void (*func) (void *info), void *info)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if (task_curr(p))
		smp_call_function_single(cpu, func, info, 1);
	preempt_enable();
}
2284
2285#ifdef CONFIG_SMP
2286static int select_fallback_rq(int cpu, struct task_struct *p)
2287{
2288 int dest_cpu;
2289 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2290
2291
2292 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2293 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2294 return dest_cpu;
2295
2296
2297 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2298 if (dest_cpu < nr_cpu_ids)
2299 return dest_cpu;
2300
2301
2302 if (dest_cpu >= nr_cpu_ids) {
2303 rcu_read_lock();
2304 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2305 rcu_read_unlock();
2306 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2307
2308
2309
2310
2311
2312
2313 if (p->mm && printk_ratelimit()) {
2314 printk(KERN_INFO "process %d (%s) no "
2315 "longer affine to cpu%d\n",
2316 task_pid_nr(p), p->comm, cpu);
2317 }
2318 }
2319
2320 return dest_cpu;
2321}
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331static inline
2332int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2333{
2334 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2347 !cpu_online(cpu)))
2348 cpu = select_fallback_rq(task_cpu(p), p);
2349
2350 return cpu;
2351}
2352#endif
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368static int try_to_wake_up(struct task_struct *p, unsigned int state,
2369 int wake_flags)
2370{
2371 int cpu, orig_cpu, this_cpu, success = 0;
2372 unsigned long flags;
2373 struct rq *rq;
2374
2375 if (!sched_feat(SYNC_WAKEUPS))
2376 wake_flags &= ~WF_SYNC;
2377
2378 this_cpu = get_cpu();
2379
2380 smp_wmb();
2381 rq = task_rq_lock(p, &flags);
2382 update_rq_clock(rq);
2383 if (!(p->state & state))
2384 goto out;
2385
2386 if (p->se.on_rq)
2387 goto out_running;
2388
2389 cpu = task_cpu(p);
2390 orig_cpu = cpu;
2391
2392#ifdef CONFIG_SMP
2393 if (unlikely(task_running(rq, p)))
2394 goto out_activate;
2395
2396
2397
2398
2399
2400
2401
2402 if (task_contributes_to_load(p))
2403 rq->nr_uninterruptible--;
2404 p->state = TASK_WAKING;
2405
2406 if (p->sched_class->task_waking)
2407 p->sched_class->task_waking(rq, p);
2408
2409 __task_rq_unlock(rq);
2410
2411 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2412 if (cpu != orig_cpu) {
2413
2414
2415
2416
2417
2418 set_task_cpu(p, cpu);
2419 }
2420
2421 rq = cpu_rq(cpu);
2422 raw_spin_lock(&rq->lock);
2423 update_rq_clock(rq);
2424
2425
2426
2427
2428
2429
2430
2431 WARN_ON(task_cpu(p) != cpu);
2432 WARN_ON(p->state != TASK_WAKING);
2433
2434#ifdef CONFIG_SCHEDSTATS
2435 schedstat_inc(rq, ttwu_count);
2436 if (cpu == this_cpu)
2437 schedstat_inc(rq, ttwu_local);
2438 else {
2439 struct sched_domain *sd;
2440 for_each_domain(this_cpu, sd) {
2441 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2442 schedstat_inc(sd, ttwu_wake_remote);
2443 break;
2444 }
2445 }
2446 }
2447#endif
2448
2449out_activate:
2450#endif
2451 schedstat_inc(p, se.nr_wakeups);
2452 if (wake_flags & WF_SYNC)
2453 schedstat_inc(p, se.nr_wakeups_sync);
2454 if (orig_cpu != cpu)
2455 schedstat_inc(p, se.nr_wakeups_migrate);
2456 if (cpu == this_cpu)
2457 schedstat_inc(p, se.nr_wakeups_local);
2458 else
2459 schedstat_inc(p, se.nr_wakeups_remote);
2460 activate_task(rq, p, 1);
2461 success = 1;
2462
2463
2464
2465
2466 if (!in_interrupt()) {
2467 struct sched_entity *se = ¤t->se;
2468 u64 sample = se->sum_exec_runtime;
2469
2470 if (se->last_wakeup)
2471 sample -= se->last_wakeup;
2472 else
2473 sample -= se->start_runtime;
2474 update_avg(&se->avg_wakeup, sample);
2475
2476 se->last_wakeup = se->sum_exec_runtime;
2477 }
2478
2479out_running:
2480 trace_sched_wakeup(rq, p, success);
2481 check_preempt_curr(rq, p, wake_flags);
2482
2483 p->state = TASK_RUNNING;
2484#ifdef CONFIG_SMP
2485 if (p->sched_class->task_woken)
2486 p->sched_class->task_woken(rq, p);
2487
2488 if (unlikely(rq->idle_stamp)) {
2489 u64 delta = rq->clock - rq->idle_stamp;
2490 u64 max = 2*sysctl_sched_migration_cost;
2491
2492 if (delta > max)
2493 rq->avg_idle = max;
2494 else
2495 update_avg(&rq->avg_idle, delta);
2496 rq->idle_stamp = 0;
2497 }
2498#endif
2499out:
2500 task_rq_unlock(rq, &flags);
2501 put_cpu();
2502
2503 return success;
2504}
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
/*
 * Wake up @p regardless of which sleeping state it is in (TASK_ALL).
 * Returns the result of try_to_wake_up(): 1 if the task was activated by
 * this call, 0 otherwise.
 */
int wake_up_process(struct task_struct *p)
{
	return try_to_wake_up(p, TASK_ALL, 0);
}
EXPORT_SYMBOL(wake_up_process);
2522
/*
 * Wake up @p only if its current state is contained in the @state mask.
 * Returns the result of try_to_wake_up().
 */
int wake_up_state(struct task_struct *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);
}
2527
2528
2529
2530
2531
2532
2533
2534static void __sched_fork(struct task_struct *p)
2535{
2536 p->se.exec_start = 0;
2537 p->se.sum_exec_runtime = 0;
2538 p->se.prev_sum_exec_runtime = 0;
2539 p->se.nr_migrations = 0;
2540 p->se.last_wakeup = 0;
2541 p->se.avg_overlap = 0;
2542 p->se.start_runtime = 0;
2543 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2544
2545#ifdef CONFIG_SCHEDSTATS
2546 p->se.wait_start = 0;
2547 p->se.wait_max = 0;
2548 p->se.wait_count = 0;
2549 p->se.wait_sum = 0;
2550
2551 p->se.sleep_start = 0;
2552 p->se.sleep_max = 0;
2553 p->se.sum_sleep_runtime = 0;
2554
2555 p->se.block_start = 0;
2556 p->se.block_max = 0;
2557 p->se.exec_max = 0;
2558 p->se.slice_max = 0;
2559
2560 p->se.nr_migrations_cold = 0;
2561 p->se.nr_failed_migrations_affine = 0;
2562 p->se.nr_failed_migrations_running = 0;
2563 p->se.nr_failed_migrations_hot = 0;
2564 p->se.nr_forced_migrations = 0;
2565
2566 p->se.nr_wakeups = 0;
2567 p->se.nr_wakeups_sync = 0;
2568 p->se.nr_wakeups_migrate = 0;
2569 p->se.nr_wakeups_local = 0;
2570 p->se.nr_wakeups_remote = 0;
2571 p->se.nr_wakeups_affine = 0;
2572 p->se.nr_wakeups_affine_attempts = 0;
2573 p->se.nr_wakeups_passive = 0;
2574 p->se.nr_wakeups_idle = 0;
2575
2576#endif
2577
2578 INIT_LIST_HEAD(&p->rt.run_list);
2579 p->se.on_rq = 0;
2580 INIT_LIST_HEAD(&p->se.group_node);
2581
2582#ifdef CONFIG_PREEMPT_NOTIFIERS
2583 INIT_HLIST_HEAD(&p->preempt_notifiers);
2584#endif
2585}
2586
2587
2588
2589
/*
 * Scheduler setup for a newly forked task @p. @clone_flags is not used in
 * this function.
 *
 * Resets the scheduler state, parks the task in TASK_WAKING, applies the
 * reset-on-fork policy, picks the initial priority and class, and assigns
 * the task to the cpu this code runs on. The task is not put on a runqueue
 * here; wake_up_new_task() does that.
 */
void sched_fork(struct task_struct *p, int clone_flags)
{
	int cpu = get_cpu();

	__sched_fork(p);

	/*
	 * The child stays in TASK_WAKING until wake_up_new_task(), which
	 * BUG()s on any other state.
	 * NOTE(review): presumably this also keeps ordinary wakeups from
	 * queueing the half-constructed task - confirm.
	 */
	p->state = TASK_WAKING;

	/* Revert to the default policy/priority if the parent asked for it. */
	if (unlikely(p->sched_reset_on_fork)) {
		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
			p->policy = SCHED_NORMAL;
			p->normal_prio = p->static_prio;
		}

		/* Negative nice values are reset to nice 0. */
		if (PRIO_TO_NICE(p->static_prio) < 0) {
			p->static_prio = NICE_TO_PRIO(0);
			p->normal_prio = p->static_prio;
			set_load_weight(p);
		}

		/* The flag is one-shot: the child does not keep it. */
		p->sched_reset_on_fork = 0;
	}

	/*
	 * Start from the parent's normal_prio rather than its prio.
	 * NOTE(review): presumably so a temporary priority boost of the
	 * parent is not inherited - confirm.
	 */
	p->prio = current->normal_prio;

	if (!rt_prio(p->prio))
		p->sched_class = &fair_sched_class;

	if (p->sched_class->task_fork)
		p->sched_class->task_fork(p);

	set_task_cpu(p, cpu);

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
	if (likely(sched_info_on()))
		memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
	p->oncpu = 0;
#endif
#ifdef CONFIG_PREEMPT
	/* The child starts with a preempt count of 1 (preemption off). */
	task_thread_info(p)->preempt_count = 1;
#endif
	plist_node_init(&p->pushable_tasks, MAX_PRIO);

	put_cpu();
}
2652
2653
2654
2655
2656
2657
2658
2659
/*
 * Make a newly created task @p runnable for the first time.
 *
 * On SMP a runqueue is chosen with SD_BALANCE_FORK before any rq lock is
 * taken; the task is then switched from TASK_WAKING to TASK_RUNNING,
 * activated on that runqueue and checked for preemption of the current
 * task. @clone_flags is not used in this function.
 */
void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
	unsigned long flags;
	struct rq *rq;
	int cpu __maybe_unused = get_cpu();

#ifdef CONFIG_SMP
	/*
	 * Fork balancing; done without holding a runqueue lock.
	 * NOTE(review): presumably safe because the task is still in
	 * TASK_WAKING and thus invisible to other wakers - confirm.
	 */
	cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
	set_task_cpu(p, cpu);
#endif

	/* Lock the target runqueue directly; it was fixed just above. */
	rq = cpu_rq(cpu);
	raw_spin_lock_irqsave(&rq->lock, flags);

	/* sched_fork() left the task in TASK_WAKING. */
	BUG_ON(p->state != TASK_WAKING);
	p->state = TASK_RUNNING;
	update_rq_clock(rq);
	activate_task(rq, p, 0);
	trace_sched_wakeup_new(rq, p, 1);
	check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken)
		p->sched_class->task_woken(rq, p);
#endif
	task_rq_unlock(rq, &flags);
	put_cpu();
}
2700
2701#ifdef CONFIG_PREEMPT_NOTIFIERS
2702
2703
2704
2705
2706
2707void preempt_notifier_register(struct preempt_notifier *notifier)
2708{
2709 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers);
2710}
2711EXPORT_SYMBOL_GPL(preempt_notifier_register);
2712
2713
2714
2715
2716
2717
2718
2719void preempt_notifier_unregister(struct preempt_notifier *notifier)
2720{
2721 hlist_del(¬ifier->link);
2722}
2723EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2724
2725static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2726{
2727 struct preempt_notifier *notifier;
2728 struct hlist_node *node;
2729
2730 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2731 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2732}
2733
2734static void
2735fire_sched_out_preempt_notifiers(struct task_struct *curr,
2736 struct task_struct *next)
2737{
2738 struct preempt_notifier *notifier;
2739 struct hlist_node *node;
2740
2741 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2742 notifier->ops->sched_out(notifier, next);
2743}
2744
2745#else
2746
/* !CONFIG_PREEMPT_NOTIFIERS: there are no notifiers to call. */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}
2750
/* !CONFIG_PREEMPT_NOTIFIERS: there are no notifiers to call. */
static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
}
2756
2757#endif
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
/*
 * prepare_task_switch - set things up before switching from @prev to @next
 * @rq:   the runqueue the switch happens on
 * @prev: the task being switched out
 * @next: the task being switched in
 *
 * Fires the sched-out preempt notifiers of @prev, then runs the lock and
 * architecture specific switch preparation hooks. Paired with
 * finish_task_switch() after the switch.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	fire_sched_out_preempt_notifiers(prev, next);
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
/*
 * finish_task_switch - clean up after a context switch
 * @rq:   runqueue the new task runs on
 * @prev: the task that was just switched out
 *
 * Runs in the context of the new task. Completes the architecture and
 * lock parts of the switch, fires the sched-in preempt notifiers, drops
 * the mm reference handed over through rq->prev_mm and, if @prev died,
 * releases its task_struct reference.
 */
static void finish_task_switch(struct rq *rq, struct task_struct *prev)
	__releases(rq->lock)
{
	struct mm_struct *mm = rq->prev_mm;
	long prev_state;

	rq->prev_mm = NULL;

	/*
	 * prev->state is sampled here, before finish_lock_switch() runs, and
	 * only the sampled value is tested for TASK_DEAD below.
	 * NOTE(review): presumably because prev may start running (and
	 * change state) on another cpu once the switch is finished -
	 * confirm.
	 */
	prev_state = prev->state;
	finish_arch_switch(prev);
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_disable();
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
	/* Called with interrupts off even on irqs-on-ctxsw architectures. */
	perf_event_task_sched_in(current);
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
	finish_lock_switch(rq, prev);

	fire_sched_in_preempt_notifiers(current);
	/* Drop the mm reference context_switch() stashed in rq->prev_mm. */
	if (mm)
		mmdrop(mm);
	if (unlikely(prev_state == TASK_DEAD)) {
		/*
		 * The dead task will not run again: flush its kprobe state
		 * and drop a task_struct reference.
		 */
		kprobe_flush_task(prev);
		put_task_struct(prev);
	}
}
2838
2839#ifdef CONFIG_SMP
2840
2841
2842static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2843{
2844 if (prev->sched_class->pre_schedule)
2845 prev->sched_class->pre_schedule(rq, prev);
2846}
2847
2848
2849static inline void post_schedule(struct rq *rq)
2850{
2851 if (rq->post_schedule) {
2852 unsigned long flags;
2853
2854 raw_spin_lock_irqsave(&rq->lock, flags);
2855 if (rq->curr->sched_class->post_schedule)
2856 rq->curr->sched_class->post_schedule(rq);
2857 raw_spin_unlock_irqrestore(&rq->lock, flags);
2858
2859 rq->post_schedule = 0;
2860 }
2861}
2862
2863#else
2864
/* !CONFIG_SMP: no pre-schedule work. */
static inline void pre_schedule(struct rq *rq, struct task_struct *p)
{
}
2868
/* !CONFIG_SMP: no post-schedule work. */
static inline void post_schedule(struct rq *rq)
{
}
2872
2873#endif
2874
2875
2876
2877
2878
/*
 * schedule_tail - first thing a newly created task runs
 * @prev: the task that was running before the switch to the new task
 *
 * Finishes the context switch that brought the new task onto the cpu,
 * runs pending post-schedule work and, if requested through
 * set_child_tid, stores the task's pid (as seen from its own pid
 * namespace) at that user-space address.
 */
asmlinkage void schedule_tail(struct task_struct *prev)
	__releases(rq->lock)
{
	struct rq *rq = this_rq();

	finish_task_switch(rq, prev);

	/* Must follow finish_task_switch(): it takes rq->lock itself. */
	post_schedule(rq);

#ifdef __ARCH_WANT_UNLOCKED_CTXSW
	/* In this case, finish_task_switch does not reenable preemption */
	preempt_enable();
#endif
	if (current->set_child_tid)
		put_user(task_pid_vnr(current), current->set_child_tid);
}
2899
2900
2901
2902
2903
/*
 * context_switch - switch to the new mm and the new thread's register
 * state
 * @rq:   the runqueue the switch happens on
 * @prev: the task being switched out
 * @next: the task being switched in
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);
	trace_sched_switch(rq, prev, next);
	mm = next->mm;
	oldmm = prev->active_mm;

	/* Architecture hook run before the mm and register state change. */
	arch_start_context_switch(prev);

	/*
	 * A task without its own mm borrows the previous active mm (taking
	 * a reference on it); otherwise switch address spaces.
	 */
	if (likely(!mm)) {
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm(oldmm, mm, next);

	/*
	 * If prev was itself borrowing an mm, hand the reference over to
	 * finish_task_switch() through rq->prev_mm, which mmdrop()s it.
	 */
	if (likely(!prev->mm)) {
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}

	/*
	 * Tell lockdep that rq->lock is released by this context; the next
	 * task completes the unlock in finish_task_switch().
	 */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);

	barrier();

	/*
	 * this_rq() is evaluated afresh instead of reusing @rq.
	 * NOTE(review): presumably because this code resumes on whatever cpu
	 * the task is later switched back in on - confirm.
	 */
	finish_task_switch(this_rq(), prev);
}
2953
2954
2955
2956
2957
2958
2959
2960
2961unsigned long nr_running(void)
2962{
2963 unsigned long i, sum = 0;
2964
2965 for_each_online_cpu(i)
2966 sum += cpu_rq(i)->nr_running;
2967
2968 return sum;
2969}
2970
2971unsigned long nr_uninterruptible(void)
2972{
2973 unsigned long i, sum = 0;
2974
2975 for_each_possible_cpu(i)
2976 sum += cpu_rq(i)->nr_uninterruptible;
2977
2978
2979
2980
2981
2982 if (unlikely((long)sum < 0))
2983 sum = 0;
2984
2985 return sum;
2986}
2987
2988unsigned long long nr_context_switches(void)
2989{
2990 int i;
2991 unsigned long long sum = 0;
2992
2993 for_each_possible_cpu(i)
2994 sum += cpu_rq(i)->nr_switches;
2995
2996 return sum;
2997}
2998
2999unsigned long nr_iowait(void)
3000{
3001 unsigned long i, sum = 0;
3002
3003 for_each_possible_cpu(i)
3004 sum += atomic_read(&cpu_rq(i)->nr_iowait);
3005
3006 return sum;
3007}
3008
3009unsigned long nr_iowait_cpu(void)
3010{
3011 struct rq *this = this_rq();
3012 return atomic_read(&this->nr_iowait);
3013}
3014
3015unsigned long this_cpu_load(void)
3016{
3017 struct rq *this = this_rq();
3018 return this->cpu_load[0];
3019}
3020
3021
3022
/* Global load-average state. */
static atomic_long_t calc_load_tasks;	/* sum of the per-rq calc_load_active values */
static unsigned long calc_load_update;	/* jiffies base of the next avenrun update */
unsigned long avenrun[3];		/* fixed-point averages (EXP_1, EXP_5, EXP_15) */
EXPORT_SYMBOL(avenrun);
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3037{
3038 loads[0] = (avenrun[0] + offset) << shift;
3039 loads[1] = (avenrun[1] + offset) << shift;
3040 loads[2] = (avenrun[2] + offset) << shift;
3041}
3042
3043static unsigned long
3044calc_load(unsigned long load, unsigned long exp, unsigned long active)
3045{
3046 load *= exp;
3047 load += active * (FIXED_1 - exp);
3048 return load >> FSHIFT;
3049}
3050
3051
3052
3053
3054
3055void calc_global_load(void)
3056{
3057 unsigned long upd = calc_load_update + 10;
3058 long active;
3059
3060 if (time_before(jiffies, upd))
3061 return;
3062
3063 active = atomic_long_read(&calc_load_tasks);
3064 active = active > 0 ? active * FIXED_1 : 0;
3065
3066 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
3067 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
3068 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
3069
3070 calc_load_update += LOAD_FREQ;
3071}
3072
3073
3074
3075
3076static void calc_load_account_active(struct rq *this_rq)
3077{
3078 long nr_active, delta;
3079
3080 nr_active = this_rq->nr_running;
3081 nr_active += (long) this_rq->nr_uninterruptible;
3082
3083 if (nr_active != this_rq->calc_load_active) {
3084 delta = nr_active - this_rq->calc_load_active;
3085 this_rq->calc_load_active = nr_active;
3086 atomic_long_add(delta, &calc_load_tasks);
3087 }
3088}
3089
3090
3091
3092
3093
3094static void update_cpu_load(struct rq *this_rq)
3095{
3096 unsigned long this_load = this_rq->load.weight;
3097 int i, scale;
3098
3099 this_rq->nr_load_updates++;
3100
3101
3102 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3103 unsigned long old_load, new_load;
3104
3105
3106
3107 old_load = this_rq->cpu_load[i];
3108 new_load = this_load;
3109
3110
3111
3112
3113
3114 if (new_load > old_load)
3115 new_load += scale-1;
3116 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3117 }
3118
3119 if (time_after_eq(jiffies, this_rq->calc_load_update)) {
3120 this_rq->calc_load_update += LOAD_FREQ;
3121 calc_load_account_active(this_rq);
3122 }
3123}
3124
3125#ifdef CONFIG_SMP
3126
3127
3128
3129
3130
/*
 * sched_exec - balance the current task at exec time
 *
 * Asks select_task_rq() (SD_BALANCE_EXEC) for a cpu. If that is a
 * different cpu, a migration request is queued, the migration thread of
 * the current runqueue is woken, and the caller blocks until the request
 * has completed.
 */
void sched_exec(void)
{
	struct task_struct *p = current;
	struct migration_req req;
	int dest_cpu, this_cpu;
	unsigned long flags;
	struct rq *rq;

again:
	this_cpu = get_cpu();
	dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
	if (dest_cpu == this_cpu) {
		put_cpu();
		return;
	}

	rq = task_rq_lock(p, &flags);
	put_cpu();

	/*
	 * The selection above ran without the rq lock: re-validate the
	 * choice under the lock and start over if it is no longer allowed
	 * or no longer active.
	 */
	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
	    || unlikely(!cpu_active(dest_cpu))) {
		task_rq_unlock(rq, &flags);
		goto again;
	}

	/* Force the task onto the selected cpu. */
	if (migrate_task(p, dest_cpu, &req)) {
		/*
		 * Hold a reference on the migration thread across the wakeup,
		 * since it is woken after rq->lock has been dropped.
		 */
		struct task_struct *mt = rq->migration_thread;

		get_task_struct(mt);
		task_rq_unlock(rq, &flags);
		wake_up_process(mt);
		put_task_struct(mt);
		wait_for_completion(&req.done);

		return;
	}
	task_rq_unlock(rq, &flags);
}
3174
3175#endif
3176
/* Per-cpu kernel statistics; the cpustat part is updated by account_*_time(). */
DEFINE_PER_CPU(struct kernel_stat, kstat);

EXPORT_PER_CPU_SYMBOL(kstat);
3180
3181
3182
3183
3184
3185
3186
3187static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3188{
3189 u64 ns = 0;
3190
3191 if (task_current(rq, p)) {
3192 update_rq_clock(rq);
3193 ns = rq->clock - p->se.exec_start;
3194 if ((s64)ns < 0)
3195 ns = 0;
3196 }
3197
3198 return ns;
3199}
3200
3201unsigned long long task_delta_exec(struct task_struct *p)
3202{
3203 unsigned long flags;
3204 struct rq *rq;
3205 u64 ns = 0;
3206
3207 rq = task_rq_lock(p, &flags);
3208 ns = do_task_delta_exec(p, rq);
3209 task_rq_unlock(rq, &flags);
3210
3211 return ns;
3212}
3213
3214
3215
3216
3217
3218
3219unsigned long long task_sched_runtime(struct task_struct *p)
3220{
3221 unsigned long flags;
3222 struct rq *rq;
3223 u64 ns = 0;
3224
3225 rq = task_rq_lock(p, &flags);
3226 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3227 task_rq_unlock(rq, &flags);
3228
3229 return ns;
3230}
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241unsigned long long thread_group_sched_runtime(struct task_struct *p)
3242{
3243 struct task_cputime totals;
3244 unsigned long flags;
3245 struct rq *rq;
3246 u64 ns;
3247
3248 rq = task_rq_lock(p, &flags);
3249 thread_group_cputime(p, &totals);
3250 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3251 task_rq_unlock(rq, &flags);
3252
3253 return ns;
3254}
3255
3256
3257
3258
3259
3260
3261
3262void account_user_time(struct task_struct *p, cputime_t cputime,
3263 cputime_t cputime_scaled)
3264{
3265 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3266 cputime64_t tmp;
3267
3268
3269 p->utime = cputime_add(p->utime, cputime);
3270 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3271 account_group_user_time(p, cputime);
3272
3273
3274 tmp = cputime_to_cputime64(cputime);
3275 if (TASK_NICE(p) > 0)
3276 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3277 else
3278 cpustat->user = cputime64_add(cpustat->user, tmp);
3279
3280 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3281
3282 acct_update_integrals(p);
3283}
3284
3285
3286
3287
3288
3289
3290
3291static void account_guest_time(struct task_struct *p, cputime_t cputime,
3292 cputime_t cputime_scaled)
3293{
3294 cputime64_t tmp;
3295 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3296
3297 tmp = cputime_to_cputime64(cputime);
3298
3299
3300 p->utime = cputime_add(p->utime, cputime);
3301 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3302 account_group_user_time(p, cputime);
3303 p->gtime = cputime_add(p->gtime, cputime);
3304
3305
3306 if (TASK_NICE(p) > 0) {
3307 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3308 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
3309 } else {
3310 cpustat->user = cputime64_add(cpustat->user, tmp);
3311 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3312 }
3313}
3314
3315
3316
3317
3318
3319
3320
3321
3322void account_system_time(struct task_struct *p, int hardirq_offset,
3323 cputime_t cputime, cputime_t cputime_scaled)
3324{
3325 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3326 cputime64_t tmp;
3327
3328 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3329 account_guest_time(p, cputime, cputime_scaled);
3330 return;
3331 }
3332
3333
3334 p->stime = cputime_add(p->stime, cputime);
3335 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3336 account_group_system_time(p, cputime);
3337
3338
3339 tmp = cputime_to_cputime64(cputime);
3340 if (hardirq_count() - hardirq_offset)
3341 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3342 else if (softirq_count())
3343 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3344 else
3345 cpustat->system = cputime64_add(cpustat->system, tmp);
3346
3347 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3348
3349
3350 acct_update_integrals(p);
3351}
3352
3353
3354
3355
3356
3357void account_steal_time(cputime_t cputime)
3358{
3359 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3360 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3361
3362 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
3363}
3364
3365
3366
3367
3368
3369void account_idle_time(cputime_t cputime)
3370{
3371 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3372 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3373 struct rq *rq = this_rq();
3374
3375 if (atomic_read(&rq->nr_iowait) > 0)
3376 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
3377 else
3378 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3379}
3380
3381#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3382
3383
3384
3385
3386
3387
3388void account_process_tick(struct task_struct *p, int user_tick)
3389{
3390 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3391 struct rq *rq = this_rq();
3392
3393 if (user_tick)
3394 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3395 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
3396 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
3397 one_jiffy_scaled);
3398 else
3399 account_idle_time(cputime_one_jiffy);
3400}
3401
3402
3403
3404
3405
3406
3407void account_steal_ticks(unsigned long ticks)
3408{
3409 account_steal_time(jiffies_to_cputime(ticks));
3410}
3411
3412
3413
3414
3415
3416void account_idle_ticks(unsigned long ticks)
3417{
3418 account_idle_time(jiffies_to_cputime(ticks));
3419}
3420
3421#endif
3422
3423
3424
3425
3426#ifdef CONFIG_VIRT_CPU_ACCOUNTING
/*
 * CONFIG_VIRT_CPU_ACCOUNTING variant: report the task's utime and stime
 * exactly as stored, with no scaling or monotonicity adjustment.
 * @p: task to query
 * @ut: receives p->utime
 * @st: receives p->stime
 */
void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	*ut = p->utime;
	*st = p->stime;
}
3432
3433void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3434{
3435 struct task_cputime cputime;
3436
3437 thread_group_cputime(p, &cputime);
3438
3439 *ut = cputime.utime;
3440 *st = cputime.stime;
3441}
3442#else
3443
/*
 * Fallback used only when nothing included earlier has defined
 * nsecs_to_cputime(): convert nanoseconds via nsecs_to_jiffies().
 */
#ifndef nsecs_to_cputime
# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
#endif
3447
3448void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3449{
3450 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
3451
3452
3453
3454
3455 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3456
3457 if (total) {
3458 u64 temp;
3459
3460 temp = (u64)(rtime * utime);
3461 do_div(temp, total);
3462 utime = (cputime_t)temp;
3463 } else
3464 utime = rtime;
3465
3466
3467
3468
3469 p->prev_utime = max(p->prev_utime, utime);
3470 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
3471
3472 *ut = p->prev_utime;
3473 *st = p->prev_stime;
3474}
3475
3476
3477
3478
3479void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3480{
3481 struct signal_struct *sig = p->signal;
3482 struct task_cputime cputime;
3483 cputime_t rtime, utime, total;
3484
3485 thread_group_cputime(p, &cputime);
3486
3487 total = cputime_add(cputime.utime, cputime.stime);
3488 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3489
3490 if (total) {
3491 u64 temp;
3492
3493 temp = (u64)(rtime * cputime.utime);
3494 do_div(temp, total);
3495 utime = (cputime_t)temp;
3496 } else
3497 utime = rtime;
3498
3499 sig->prev_utime = max(sig->prev_utime, utime);
3500 sig->prev_stime = max(sig->prev_stime,
3501 cputime_sub(rtime, sig->prev_utime));
3502
3503 *ut = sig->prev_utime;
3504 *st = sig->prev_stime;
3505}
3506#endif
3507
3508
3509
3510
3511
3512
3513
3514
3515void scheduler_tick(void)
3516{
3517 int cpu = smp_processor_id();
3518 struct rq *rq = cpu_rq(cpu);
3519 struct task_struct *curr = rq->curr;
3520
3521 sched_clock_tick();
3522
3523 raw_spin_lock(&rq->lock);
3524 update_rq_clock(rq);
3525 update_cpu_load(rq);
3526 curr->sched_class->task_tick(rq, curr, 0);
3527 raw_spin_unlock(&rq->lock);
3528
3529 perf_event_task_tick(curr);
3530
3531#ifdef CONFIG_SMP
3532 rq->idle_at_tick = idle_cpu(cpu);
3533 trigger_load_balance(rq, cpu);
3534#endif
3535}
3536
3537notrace unsigned long get_parent_ip(unsigned long addr)
3538{
3539 if (in_lock_functions(addr)) {
3540 addr = CALLER_ADDR2;
3541 if (in_lock_functions(addr))
3542 addr = CALLER_ADDR3;
3543 }
3544 return addr;
3545}
3546
3547#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3548 defined(CONFIG_PREEMPT_TRACER))
3549
/*
 * Out-of-line preempt count increment, built only with CONFIG_PREEMPT plus
 * preempt debugging or the preempt tracer.
 * @val: amount to add to preempt_count()
 */
void __kprobes add_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow?  A negative count is already corrupt: warn and leave
	 * it untouched.
	 */
	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
		return;
#endif
	preempt_count() += val;
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Warn when the PREEMPT_MASK portion of the count gets within 10 of
	 * its maximum, i.e. it is about to overflow.
	 */
	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
				PREEMPT_MASK - 10);
#endif
	/* The count equals @val only if it was zero before: report the 0 -> non-0 transition. */
	if (preempt_count() == val)
		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
EXPORT_SYMBOL(add_preempt_count);
3571
/*
 * Out-of-line preempt count decrement, counterpart of add_preempt_count().
 * @val: amount to subtract from preempt_count()
 */
void __kprobes sub_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow?  Refuse to subtract more than is currently held.
	 */
	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
		return;
	/*
	 * A small decrement (below PREEMPT_MASK) while the PREEMPT_MASK bits
	 * are all zero would underflow that portion of the count.
	 */
	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
			!(preempt_count() & PREEMPT_MASK)))
		return;
#endif

	/* The count is about to drop to zero: report it before decrementing. */
	if (preempt_count() == val)
		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
	preempt_count() -= val;
}
EXPORT_SYMBOL(sub_preempt_count);
3593
3594#endif
3595
3596
3597
3598
3599static noinline void __schedule_bug(struct task_struct *prev)
3600{
3601 struct pt_regs *regs = get_irq_regs();
3602
3603 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3604 prev->comm, prev->pid, preempt_count());
3605
3606 debug_show_held_locks(prev);
3607 print_modules();
3608 if (irqs_disabled())
3609 print_irqtrace_events(prev);
3610
3611 if (regs)
3612 show_regs(regs);
3613 else
3614 dump_stack();
3615}
3616
3617
3618
3619
3620static inline void schedule_debug(struct task_struct *prev)
3621{
3622
3623
3624
3625
3626
3627 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
3628 __schedule_bug(prev);
3629
3630 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3631
3632 schedstat_inc(this_rq(), sched_count);
3633#ifdef CONFIG_SCHEDSTATS
3634 if (unlikely(prev->lock_depth >= 0)) {
3635 schedstat_inc(this_rq(), bkl_count);
3636 schedstat_inc(prev, sched_info.bkl_count);
3637 }
3638#endif
3639}
3640
3641static void put_prev_task(struct rq *rq, struct task_struct *prev)
3642{
3643 if (prev->state == TASK_RUNNING) {
3644 u64 runtime = prev->se.sum_exec_runtime;
3645
3646 runtime -= prev->se.prev_sum_exec_runtime;
3647 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658 update_avg(&prev->se.avg_overlap, runtime);
3659 }
3660 prev->sched_class->put_prev_task(rq, prev);
3661}
3662
3663
3664
3665
3666static inline struct task_struct *
3667pick_next_task(struct rq *rq)
3668{
3669 const struct sched_class *class;
3670 struct task_struct *p;
3671
3672
3673
3674
3675
3676 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3677 p = fair_sched_class.pick_next_task(rq);
3678 if (likely(p))
3679 return p;
3680 }
3681
3682 class = sched_class_highest;
3683 for ( ; ; ) {
3684 p = class->pick_next_task(rq);
3685 if (p)
3686 return p;
3687
3688
3689
3690
3691 class = class->next;
3692 }
3693}
3694
3695
3696
3697
/*
 * schedule() is the main scheduler function: it picks the next task for the
 * current CPU and switches to it.  It loops for as long as a reschedule is
 * still requested when it is about to return.
 */
asmlinkage void __sched schedule(void)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	struct rq *rq;
	int cpu;

need_resched:
	preempt_disable();
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	rcu_sched_qs(cpu);
	prev = rq->curr;
	/* Default to the involuntary counter; switched to nvcsw below if prev blocks. */
	switch_count = &prev->nivcsw;

	release_kernel_lock(prev);
need_resched_nonpreemptible:

	schedule_debug(prev);

	if (sched_feat(HRTICK))
		hrtick_clear(rq);

	raw_spin_lock_irq(&rq->lock);
	update_rq_clock(rq);
	clear_tsk_need_resched(prev);

	/*
	 * prev is in a non-running state and this is not a preemption
	 * (PREEMPT_ACTIVE clear): either a pending signal puts it back to
	 * TASK_RUNNING, or it is removed from the runqueue.  Both cases
	 * count as a voluntary switch.
	 */
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		if (unlikely(signal_pending_state(prev->state, prev)))
			prev->state = TASK_RUNNING;
		else
			deactivate_task(rq, prev, 1);
		switch_count = &prev->nvcsw;
	}

	pre_schedule(rq, prev);

	/* Nothing runnable here: try to balance work over before going idle. */
	if (unlikely(!rq->nr_running))
		idle_balance(cpu, rq);

	put_prev_task(rq, prev);
	next = pick_next_task(rq);

	if (likely(prev != next)) {
		sched_info_switch(prev, next);
		perf_event_task_sched_out(prev, next);

		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		/*
		 * rq->lock is not dropped on this path here; presumably
		 * context_switch() releases it - confirm in its definition.
		 */
		context_switch(rq, prev, next); /* unlocks the rq */
		/*
		 * Execution resumes here on a possibly different stack, so
		 * the cached cpu and rq are re-read.
		 */
		cpu = smp_processor_id();
		rq = cpu_rq(cpu);
	} else
		raw_spin_unlock_irq(&rq->lock);

	post_schedule(rq);

	/*
	 * A negative return from reacquire_kernel_lock() restarts the
	 * scheduling pass from the non-preemptible entry point.
	 */
	if (unlikely(reacquire_kernel_lock(current) < 0)) {
		prev = rq->curr;
		switch_count = &prev->nivcsw;
		goto need_resched_nonpreemptible;
	}

	preempt_enable_no_resched();
	if (need_resched())
		goto need_resched;
}
EXPORT_SYMBOL(schedule);
3772
3773#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3774
3775
3776
3777
3778int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3779{
3780 unsigned int cpu;
3781 struct rq *rq;
3782
3783 if (!sched_feat(OWNER_SPIN))
3784 return 0;
3785
3786#ifdef CONFIG_DEBUG_PAGEALLOC
3787
3788
3789
3790
3791
3792 if (probe_kernel_address(&owner->cpu, cpu))
3793 return 0;
3794#else
3795 cpu = owner->cpu;
3796#endif
3797
3798
3799
3800
3801
3802 if (cpu >= nr_cpumask_bits)
3803 return 0;
3804
3805
3806
3807
3808
3809 if (!cpu_online(cpu))
3810 return 0;
3811
3812 rq = cpu_rq(cpu);
3813
3814 for (;;) {
3815
3816
3817
3818 if (lock->owner != owner)
3819 break;
3820
3821
3822
3823
3824 if (task_thread_info(rq->curr) != owner || need_resched())
3825 return 0;
3826
3827 cpu_relax();
3828 }
3829
3830 return 1;
3831}
3832#endif
3833
3834#ifdef CONFIG_PREEMPT
3835
3836
3837
3838
3839
3840asmlinkage void __sched preempt_schedule(void)
3841{
3842 struct thread_info *ti = current_thread_info();
3843
3844
3845
3846
3847
3848 if (likely(ti->preempt_count || irqs_disabled()))
3849 return;
3850
3851 do {
3852 add_preempt_count(PREEMPT_ACTIVE);
3853 schedule();
3854 sub_preempt_count(PREEMPT_ACTIVE);
3855
3856
3857
3858
3859
3860 barrier();
3861 } while (need_resched());
3862}
3863EXPORT_SYMBOL(preempt_schedule);
3864
3865
3866
3867
3868
3869
3870
3871asmlinkage void __sched preempt_schedule_irq(void)
3872{
3873 struct thread_info *ti = current_thread_info();
3874
3875
3876 BUG_ON(ti->preempt_count || !irqs_disabled());
3877
3878 do {
3879 add_preempt_count(PREEMPT_ACTIVE);
3880 local_irq_enable();
3881 schedule();
3882 local_irq_disable();
3883 sub_preempt_count(PREEMPT_ACTIVE);
3884
3885
3886
3887
3888
3889 barrier();
3890 } while (need_resched());
3891}
3892
3893#endif
3894
3895int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3896 void *key)
3897{
3898 return try_to_wake_up(curr->private, mode, wake_flags);
3899}
3900EXPORT_SYMBOL(default_wake_function);
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3912 int nr_exclusive, int wake_flags, void *key)
3913{
3914 wait_queue_t *curr, *next;
3915
3916 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3917 unsigned flags = curr->flags;
3918
3919 if (curr->func(curr, mode, wake_flags, key) &&
3920 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3921 break;
3922 }
3923}
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935void __wake_up(wait_queue_head_t *q, unsigned int mode,
3936 int nr_exclusive, void *key)
3937{
3938 unsigned long flags;
3939
3940 spin_lock_irqsave(&q->lock, flags);
3941 __wake_up_common(q, mode, nr_exclusive, 0, key);
3942 spin_unlock_irqrestore(&q->lock, flags);
3943}
3944EXPORT_SYMBOL(__wake_up);
3945
3946
3947
3948
3949void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3950{
3951 __wake_up_common(q, mode, 1, 0, NULL);
3952}
3953
3954void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3955{
3956 __wake_up_common(q, mode, 1, 0, key);
3957}
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
3977 int nr_exclusive, void *key)
3978{
3979 unsigned long flags;
3980 int wake_flags = WF_SYNC;
3981
3982 if (unlikely(!q))
3983 return;
3984
3985 if (unlikely(!nr_exclusive))
3986 wake_flags = 0;
3987
3988 spin_lock_irqsave(&q->lock, flags);
3989 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
3990 spin_unlock_irqrestore(&q->lock, flags);
3991}
3992EXPORT_SYMBOL_GPL(__wake_up_sync_key);
3993
3994
3995
3996
/*
 * __wake_up_sync_key() with a NULL key.
 * @q: the wait queue
 * @mode: which task states to wake
 * @nr_exclusive: how many exclusive waiters to wake at most
 */
void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
{
	__wake_up_sync_key(q, mode, nr_exclusive, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015void complete(struct completion *x)
4016{
4017 unsigned long flags;
4018
4019 spin_lock_irqsave(&x->wait.lock, flags);
4020 x->done++;
4021 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
4022 spin_unlock_irqrestore(&x->wait.lock, flags);
4023}
4024EXPORT_SYMBOL(complete);
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035void complete_all(struct completion *x)
4036{
4037 unsigned long flags;
4038
4039 spin_lock_irqsave(&x->wait.lock, flags);
4040 x->done += UINT_MAX/2;
4041 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
4042 spin_unlock_irqrestore(&x->wait.lock, flags);
4043}
4044EXPORT_SYMBOL(complete_all);
4045
4046static inline long __sched
4047do_wait_for_common(struct completion *x, long timeout, int state)
4048{
4049 if (!x->done) {
4050 DECLARE_WAITQUEUE(wait, current);
4051
4052 wait.flags |= WQ_FLAG_EXCLUSIVE;
4053 __add_wait_queue_tail(&x->wait, &wait);
4054 do {
4055 if (signal_pending_state(state, current)) {
4056 timeout = -ERESTARTSYS;
4057 break;
4058 }
4059 __set_current_state(state);
4060 spin_unlock_irq(&x->wait.lock);
4061 timeout = schedule_timeout(timeout);
4062 spin_lock_irq(&x->wait.lock);
4063 } while (!x->done && timeout);
4064 __remove_wait_queue(&x->wait, &wait);
4065 if (!x->done)
4066 return timeout;
4067 }
4068 x->done--;
4069 return timeout ?: 1;
4070}
4071
4072static long __sched
4073wait_for_common(struct completion *x, long timeout, int state)
4074{
4075 might_sleep();
4076
4077 spin_lock_irq(&x->wait.lock);
4078 timeout = do_wait_for_common(x, timeout, state);
4079 spin_unlock_irq(&x->wait.lock);
4080 return timeout;
4081}
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
/*
 * Wait for a completion to be signalled.
 * @x: the completion
 *
 * Sleeps in TASK_UNINTERRUPTIBLE with MAX_SCHEDULE_TIMEOUT, i.e. it is not
 * interruptible and uses the largest timeout value available.
 */
void __sched wait_for_completion(struct completion *x)
{
	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion);
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108unsigned long __sched
4109wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4110{
4111 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4112}
4113EXPORT_SYMBOL(wait_for_completion_timeout);
4114
4115
4116
4117
4118
4119
4120
4121
4122int __sched wait_for_completion_interruptible(struct completion *x)
4123{
4124 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4125 if (t == -ERESTARTSYS)
4126 return t;
4127 return 0;
4128}
4129EXPORT_SYMBOL(wait_for_completion_interruptible);
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139unsigned long __sched
4140wait_for_completion_interruptible_timeout(struct completion *x,
4141 unsigned long timeout)
4142{
4143 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4144}
4145EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4146
4147
4148
4149
4150
4151
4152
4153
4154int __sched wait_for_completion_killable(struct completion *x)
4155{
4156 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4157 if (t == -ERESTARTSYS)
4158 return t;
4159 return 0;
4160}
4161EXPORT_SYMBOL(wait_for_completion_killable);
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175bool try_wait_for_completion(struct completion *x)
4176{
4177 unsigned long flags;
4178 int ret = 1;
4179
4180 spin_lock_irqsave(&x->wait.lock, flags);
4181 if (!x->done)
4182 ret = 0;
4183 else
4184 x->done--;
4185 spin_unlock_irqrestore(&x->wait.lock, flags);
4186 return ret;
4187}
4188EXPORT_SYMBOL(try_wait_for_completion);
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198bool completion_done(struct completion *x)
4199{
4200 unsigned long flags;
4201 int ret = 1;
4202
4203 spin_lock_irqsave(&x->wait.lock, flags);
4204 if (!x->done)
4205 ret = 0;
4206 spin_unlock_irqrestore(&x->wait.lock, flags);
4207 return ret;
4208}
4209EXPORT_SYMBOL(completion_done);
4210
4211static long __sched
4212sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4213{
4214 unsigned long flags;
4215 wait_queue_t wait;
4216
4217 init_waitqueue_entry(&wait, current);
4218
4219 __set_current_state(state);
4220
4221 spin_lock_irqsave(&q->lock, flags);
4222 __add_wait_queue(q, &wait);
4223 spin_unlock(&q->lock);
4224 timeout = schedule_timeout(timeout);
4225 spin_lock_irq(&q->lock);
4226 __remove_wait_queue(q, &wait);
4227 spin_unlock_irqrestore(&q->lock, flags);
4228
4229 return timeout;
4230}
4231
/*
 * Sleep on @q in TASK_INTERRUPTIBLE state with MAX_SCHEDULE_TIMEOUT.
 * The value returned by sleep_on_common() is discarded.
 */
void __sched interruptible_sleep_on(wait_queue_head_t *q)
{
	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(interruptible_sleep_on);
4237
4238long __sched
4239interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4240{
4241 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4242}
4243EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4244
/*
 * Sleep on @q in TASK_UNINTERRUPTIBLE state with MAX_SCHEDULE_TIMEOUT.
 * The value returned by sleep_on_common() is discarded.
 */
void __sched sleep_on(wait_queue_head_t *q)
{
	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(sleep_on);
4250
4251long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4252{
4253 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4254}
4255EXPORT_SYMBOL(sleep_on_timeout);
4256
4257#ifdef CONFIG_RT_MUTEXES
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269void rt_mutex_setprio(struct task_struct *p, int prio)
4270{
4271 unsigned long flags;
4272 int oldprio, on_rq, running;
4273 struct rq *rq;
4274 const struct sched_class *prev_class;
4275
4276 BUG_ON(prio < 0 || prio > MAX_PRIO);
4277
4278 rq = task_rq_lock(p, &flags);
4279 update_rq_clock(rq);
4280
4281 oldprio = p->prio;
4282 prev_class = p->sched_class;
4283 on_rq = p->se.on_rq;
4284 running = task_current(rq, p);
4285 if (on_rq)
4286 dequeue_task(rq, p, 0);
4287 if (running)
4288 p->sched_class->put_prev_task(rq, p);
4289
4290 if (rt_prio(prio))
4291 p->sched_class = &rt_sched_class;
4292 else
4293 p->sched_class = &fair_sched_class;
4294
4295 p->prio = prio;
4296
4297 if (running)
4298 p->sched_class->set_curr_task(rq);
4299 if (on_rq) {
4300 enqueue_task(rq, p, 0, oldprio < prio);
4301
4302 check_class_changed(rq, p, prev_class, oldprio, running);
4303 }
4304 task_rq_unlock(rq, &flags);
4305}
4306
4307#endif
4308
4309void set_user_nice(struct task_struct *p, long nice)
4310{
4311 int old_prio, delta, on_rq;
4312 unsigned long flags;
4313 struct rq *rq;
4314
4315 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4316 return;
4317
4318
4319
4320
4321 rq = task_rq_lock(p, &flags);
4322 update_rq_clock(rq);
4323
4324
4325
4326
4327
4328
4329 if (task_has_rt_policy(p)) {
4330 p->static_prio = NICE_TO_PRIO(nice);
4331 goto out_unlock;
4332 }
4333 on_rq = p->se.on_rq;
4334 if (on_rq)
4335 dequeue_task(rq, p, 0);
4336
4337 p->static_prio = NICE_TO_PRIO(nice);
4338 set_load_weight(p);
4339 old_prio = p->prio;
4340 p->prio = effective_prio(p);
4341 delta = p->prio - old_prio;
4342
4343 if (on_rq) {
4344 enqueue_task(rq, p, 0, false);
4345
4346
4347
4348
4349 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4350 resched_task(rq->curr);
4351 }
4352out_unlock:
4353 task_rq_unlock(rq, &flags);
4354}
4355EXPORT_SYMBOL(set_user_nice);
4356
4357
4358
4359
4360
4361
4362int can_nice(const struct task_struct *p, const int nice)
4363{
4364
4365 int nice_rlim = 20 - nice;
4366
4367 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4368 capable(CAP_SYS_NICE));
4369}
4370
4371#ifdef __ARCH_WANT_SYS_NICE
4372
4373
4374
4375
4376
4377
4378
4379
4380SYSCALL_DEFINE1(nice, int, increment)
4381{
4382 long nice, retval;
4383
4384
4385
4386
4387
4388
4389 if (increment < -40)
4390 increment = -40;
4391 if (increment > 40)
4392 increment = 40;
4393
4394 nice = TASK_NICE(current) + increment;
4395 if (nice < -20)
4396 nice = -20;
4397 if (nice > 19)
4398 nice = 19;
4399
4400 if (increment < 0 && !can_nice(current, nice))
4401 return -EPERM;
4402
4403 retval = security_task_setnice(current, nice);
4404 if (retval)
4405 return retval;
4406
4407 set_user_nice(current, nice);
4408 return 0;
4409}
4410
4411#endif
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421int task_prio(const struct task_struct *p)
4422{
4423 return p->prio - MAX_RT_PRIO;
4424}
4425
4426
4427
4428
4429
/*
 * Return the nice value of a task, derived from its static_prio.
 * @p: the task in question
 */
int task_nice(const struct task_struct *p)
{
	return TASK_NICE(p);
}
EXPORT_SYMBOL(task_nice);
4435
4436
4437
4438
4439
4440int idle_cpu(int cpu)
4441{
4442 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4443}
4444
4445
4446
4447
4448
4449struct task_struct *idle_task(int cpu)
4450{
4451 return cpu_rq(cpu)->idle;
4452}
4453
4454
4455
4456
4457
4458static struct task_struct *find_process_by_pid(pid_t pid)
4459{
4460 return pid ? find_task_by_vpid(pid) : current;
4461}
4462
4463
4464static void
4465__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4466{
4467 BUG_ON(p->se.on_rq);
4468
4469 p->policy = policy;
4470 p->rt_priority = prio;
4471 p->normal_prio = normal_prio(p);
4472
4473 p->prio = rt_mutex_getprio(p);
4474 if (rt_prio(p->prio))
4475 p->sched_class = &rt_sched_class;
4476 else
4477 p->sched_class = &fair_sched_class;
4478 set_load_weight(p);
4479}
4480
4481
4482
4483
4484static bool check_same_owner(struct task_struct *p)
4485{
4486 const struct cred *cred = current_cred(), *pcred;
4487 bool match;
4488
4489 rcu_read_lock();
4490 pcred = __task_cred(p);
4491 match = (cred->euid == pcred->euid ||
4492 cred->euid == pcred->uid);
4493 rcu_read_unlock();
4494 return match;
4495}
4496
/*
 * Change the scheduling policy and/or RT priority of a task.
 * @p: task to modify
 * @policy: new policy, optionally OR-ed with SCHED_RESET_ON_FORK; a
 *          negative value keeps the task's current policy
 * @param: holds the new sched_priority
 * @user: when true, permission and security checks are applied
 *
 * Returns 0 on success, -EINVAL for an invalid policy/priority
 * combination, -EPERM when a permission check fails, -ESRCH when the
 * task's sighand cannot be locked, or the security hook's error.
 */
static int __sched_setscheduler(struct task_struct *p, int policy,
				struct sched_param *param, bool user)
{
	int retval, oldprio, oldpolicy = -1, on_rq, running;
	unsigned long flags;
	const struct sched_class *prev_class;
	struct rq *rq;
	int reset_on_fork;

	/* Must not be called from interrupt context. */
	BUG_ON(in_interrupt());
recheck:
	/* Negative policy: keep the current one; it is re-checked under the rq lock. */
	if (policy < 0) {
		reset_on_fork = p->sched_reset_on_fork;
		policy = oldpolicy = p->policy;
	} else {
		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
		policy &= ~SCHED_RESET_ON_FORK;

		if (policy != SCHED_FIFO && policy != SCHED_RR &&
				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
				policy != SCHED_IDLE)
			return -EINVAL;
	}

	/*
	 * Valid priorities: 0..MAX_USER_RT_PRIO-1 for tasks with an mm,
	 * 0..MAX_RT_PRIO-1 for tasks without one.  RT policies require a
	 * non-zero priority, all other policies require 0.
	 */
	if (param->sched_priority < 0 ||
	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
		return -EINVAL;
	if (rt_policy(policy) != (param->sched_priority != 0))
		return -EINVAL;

	/*
	 * Restrictions for callers without CAP_SYS_NICE.
	 */
	if (user && !capable(CAP_SYS_NICE)) {
		if (rt_policy(policy)) {
			unsigned long rlim_rtprio;

			if (!lock_task_sighand(p, &flags))
				return -ESRCH;
			rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
			unlock_task_sighand(p, &flags);

			/* can't set/change the rt policy */
			if (policy != p->policy && !rlim_rtprio)
				return -EPERM;

			/* can't increase priority beyond both current and rlimit */
			if (param->sched_priority > p->rt_priority &&
			    param->sched_priority > rlim_rtprio)
				return -EPERM;
		}
		/*
		 * A SCHED_IDLE task may not be moved to any other policy
		 * by an unprivileged caller.
		 */
		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
			return -EPERM;

		/* can't change other user's priorities */
		if (!check_same_owner(p))
			return -EPERM;

		/* Unprivileged callers may not clear the reset-on-fork flag. */
		if (p->sched_reset_on_fork && !reset_on_fork)
			return -EPERM;
	}

	if (user) {
#ifdef CONFIG_RT_GROUP_SCHED
		/*
		 * Do not allow realtime tasks into groups that have no
		 * runtime assigned.
		 */
		if (rt_bandwidth_enabled() && rt_policy(policy) &&
				task_group(p)->rt_bandwidth.rt_runtime == 0)
			return -EPERM;
#endif

		retval = security_task_setscheduler(p, policy, param);
		if (retval)
			return retval;
	}

	/*
	 * p->pi_lock is held across the change (presumably so priority
	 * inheritance waiters cannot arrive or leave meanwhile - confirm).
	 */
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	/*
	 * The task's runqueue lock is held while policy, priority and class
	 * are modified.
	 */
	rq = __task_rq_lock(p);
	/* The policy was sampled unlocked above: if it changed since, start over. */
	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
		policy = oldpolicy = -1;
		__task_rq_unlock(rq);
		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
		goto recheck;
	}
	update_rq_clock(rq);
	on_rq = p->se.on_rq;
	running = task_current(rq, p);
	/* Take the task off the runqueue around the change (__setscheduler() BUGs otherwise). */
	if (on_rq)
		deactivate_task(rq, p, 0);
	if (running)
		p->sched_class->put_prev_task(rq, p);

	p->sched_reset_on_fork = reset_on_fork;

	oldprio = p->prio;
	prev_class = p->sched_class;
	__setscheduler(rq, p, policy, param->sched_priority);

	if (running)
		p->sched_class->set_curr_task(rq);
	if (on_rq) {
		activate_task(rq, p, 0);

		check_class_changed(rq, p, prev_class, oldprio, running);
	}
	__task_rq_unlock(rq);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

	/* Done after both locks have been dropped. */
	rt_mutex_adjust_pi(p);

	return 0;
}
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642int sched_setscheduler(struct task_struct *p, int policy,
4643 struct sched_param *param)
4644{
4645 return __sched_setscheduler(p, policy, param, true);
4646}
4647EXPORT_SYMBOL_GPL(sched_setscheduler);
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4661 struct sched_param *param)
4662{
4663 return __sched_setscheduler(p, policy, param, false);
4664}
4665
4666static int
4667do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4668{
4669 struct sched_param lparam;
4670 struct task_struct *p;
4671 int retval;
4672
4673 if (!param || pid < 0)
4674 return -EINVAL;
4675 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4676 return -EFAULT;
4677
4678 rcu_read_lock();
4679 retval = -ESRCH;
4680 p = find_process_by_pid(pid);
4681 if (p != NULL)
4682 retval = sched_setscheduler(p, policy, &lparam);
4683 rcu_read_unlock();
4684
4685 return retval;
4686}
4687
4688
4689
4690
4691
4692
4693
4694SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4695 struct sched_param __user *, param)
4696{
4697
4698 if (policy < 0)
4699 return -EINVAL;
4700
4701 return do_sched_setscheduler(pid, policy, param);
4702}
4703
4704
4705
4706
4707
4708
/*
 * sys_sched_setparam - set/change the RT priority of a thread
 * @pid: the pid in question
 * @param: structure containing the new RT priority
 *
 * Passes policy -1, which makes __sched_setscheduler() keep the task's
 * current policy.
 */
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
	return do_sched_setscheduler(pid, -1, param);
}
4713
4714
4715
4716
4717
4718SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4719{
4720 struct task_struct *p;
4721 int retval;
4722
4723 if (pid < 0)
4724 return -EINVAL;
4725
4726 retval = -ESRCH;
4727 rcu_read_lock();
4728 p = find_process_by_pid(pid);
4729 if (p) {
4730 retval = security_task_getscheduler(p);
4731 if (!retval)
4732 retval = p->policy
4733 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4734 }
4735 rcu_read_unlock();
4736 return retval;
4737}
4738
4739
4740
4741
4742
4743
4744SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4745{
4746 struct sched_param lp;
4747 struct task_struct *p;
4748 int retval;
4749
4750 if (!param || pid < 0)
4751 return -EINVAL;
4752
4753 rcu_read_lock();
4754 p = find_process_by_pid(pid);
4755 retval = -ESRCH;
4756 if (!p)
4757 goto out_unlock;
4758
4759 retval = security_task_getscheduler(p);
4760 if (retval)
4761 goto out_unlock;
4762
4763 lp.sched_priority = p->rt_priority;
4764 rcu_read_unlock();
4765
4766
4767
4768
4769 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4770
4771 return retval;
4772
4773out_unlock:
4774 rcu_read_unlock();
4775 return retval;
4776}
4777
4778long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4779{
4780 cpumask_var_t cpus_allowed, new_mask;
4781 struct task_struct *p;
4782 int retval;
4783
4784 get_online_cpus();
4785 rcu_read_lock();
4786
4787 p = find_process_by_pid(pid);
4788 if (!p) {
4789 rcu_read_unlock();
4790 put_online_cpus();
4791 return -ESRCH;
4792 }
4793
4794
4795 get_task_struct(p);
4796 rcu_read_unlock();
4797
4798 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4799 retval = -ENOMEM;
4800 goto out_put_task;
4801 }
4802 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4803 retval = -ENOMEM;
4804 goto out_free_cpus_allowed;
4805 }
4806 retval = -EPERM;
4807 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
4808 goto out_unlock;
4809
4810 retval = security_task_setscheduler(p, 0, NULL);
4811 if (retval)
4812 goto out_unlock;
4813
4814 cpuset_cpus_allowed(p, cpus_allowed);
4815 cpumask_and(new_mask, in_mask, cpus_allowed);
4816 again:
4817 retval = set_cpus_allowed_ptr(p, new_mask);
4818
4819 if (!retval) {
4820 cpuset_cpus_allowed(p, cpus_allowed);
4821 if (!cpumask_subset(new_mask, cpus_allowed)) {
4822
4823
4824
4825
4826
4827 cpumask_copy(new_mask, cpus_allowed);
4828 goto again;
4829 }
4830 }
4831out_unlock:
4832 free_cpumask_var(new_mask);
4833out_free_cpus_allowed:
4834 free_cpumask_var(cpus_allowed);
4835out_put_task:
4836 put_task_struct(p);
4837 put_online_cpus();
4838 return retval;
4839}
4840
4841static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4842 struct cpumask *new_mask)
4843{
4844 if (len < cpumask_size())
4845 cpumask_clear(new_mask);
4846 else if (len > cpumask_size())
4847 len = cpumask_size();
4848
4849 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4850}
4851
4852
4853
4854
4855
4856
4857
4858SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4859 unsigned long __user *, user_mask_ptr)
4860{
4861 cpumask_var_t new_mask;
4862 int retval;
4863
4864 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4865 return -ENOMEM;
4866
4867 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4868 if (retval == 0)
4869 retval = sched_setaffinity(pid, new_mask);
4870 free_cpumask_var(new_mask);
4871 return retval;
4872}
4873
4874long sched_getaffinity(pid_t pid, struct cpumask *mask)
4875{
4876 struct task_struct *p;
4877 unsigned long flags;
4878 struct rq *rq;
4879 int retval;
4880
4881 get_online_cpus();
4882 rcu_read_lock();
4883
4884 retval = -ESRCH;
4885 p = find_process_by_pid(pid);
4886 if (!p)
4887 goto out_unlock;
4888
4889 retval = security_task_getscheduler(p);
4890 if (retval)
4891 goto out_unlock;
4892
4893 rq = task_rq_lock(p, &flags);
4894 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
4895 task_rq_unlock(rq, &flags);
4896
4897out_unlock:
4898 rcu_read_unlock();
4899 put_online_cpus();
4900
4901 return retval;
4902}
4903
4904
4905
4906
4907
4908
4909
4910SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4911 unsigned long __user *, user_mask_ptr)
4912{
4913 int ret;
4914 cpumask_var_t mask;
4915
4916 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4917 return -EINVAL;
4918 if (len & (sizeof(unsigned long)-1))
4919 return -EINVAL;
4920
4921 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4922 return -ENOMEM;
4923
4924 ret = sched_getaffinity(pid, mask);
4925 if (ret == 0) {
4926 size_t retlen = min_t(size_t, len, cpumask_size());
4927
4928 if (copy_to_user(user_mask_ptr, mask, retlen))
4929 ret = -EFAULT;
4930 else
4931 ret = retlen;
4932 }
4933 free_cpumask_var(mask);
4934
4935 return ret;
4936}
4937
4938
4939
4940
4941
4942
4943
/*
 * sys_sched_yield - yield the current processor to other threads.
 *
 * Calls the current task's sched_class->yield_task() hook under the
 * runqueue lock and then schedules.  Always returns 0.
 */
SYSCALL_DEFINE0(sched_yield)
{
	struct rq *rq = this_rq_lock();

	schedstat_inc(rq, yld_count);
	current->sched_class->yield_task(rq);

	/*
	 * The unlock is open-coded: the lock annotation, the lockdep state
	 * and the raw lock are released one by one, and preemption is
	 * re-enabled without a reschedule check, because schedule() is
	 * called right below anyway.
	 */
	__release(rq->lock);
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
	do_raw_spin_unlock(&rq->lock);
	preempt_enable_no_resched();

	schedule();

	return 0;
}
4964
4965static inline int should_resched(void)
4966{
4967 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
4968}
4969
4970static void __cond_resched(void)
4971{
4972 add_preempt_count(PREEMPT_ACTIVE);
4973 schedule();
4974 sub_preempt_count(PREEMPT_ACTIVE);
4975}
4976
4977int __sched _cond_resched(void)
4978{
4979 if (should_resched()) {
4980 __cond_resched();
4981 return 1;
4982 }
4983 return 0;
4984}
4985EXPORT_SYMBOL(_cond_resched);
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995int __cond_resched_lock(spinlock_t *lock)
4996{
4997 int resched = should_resched();
4998 int ret = 0;
4999
5000 lockdep_assert_held(lock);
5001
5002 if (spin_needbreak(lock) || resched) {
5003 spin_unlock(lock);
5004 if (resched)
5005 __cond_resched();
5006 else
5007 cpu_relax();
5008 ret = 1;
5009 spin_lock(lock);
5010 }
5011 return ret;
5012}
5013EXPORT_SYMBOL(__cond_resched_lock);
5014
5015int __sched __cond_resched_softirq(void)
5016{
5017 BUG_ON(!in_softirq());
5018
5019 if (should_resched()) {
5020 local_bh_enable();
5021 __cond_resched();
5022 local_bh_disable();
5023 return 1;
5024 }
5025 return 0;
5026}
5027EXPORT_SYMBOL(__cond_resched_softirq);
5028
5029
5030
5031
5032
5033
5034
5035void __sched yield(void)
5036{
5037 set_current_state(TASK_RUNNING);
5038 sys_sched_yield();
5039}
5040EXPORT_SYMBOL(yield);
5041
5042
5043
5044
5045
5046void __sched io_schedule(void)
5047{
5048 struct rq *rq = raw_rq();
5049
5050 delayacct_blkio_start();
5051 atomic_inc(&rq->nr_iowait);
5052 current->in_iowait = 1;
5053 schedule();
5054 current->in_iowait = 0;
5055 atomic_dec(&rq->nr_iowait);
5056 delayacct_blkio_end();
5057}
5058EXPORT_SYMBOL(io_schedule);
5059
5060long __sched io_schedule_timeout(long timeout)
5061{
5062 struct rq *rq = raw_rq();
5063 long ret;
5064
5065 delayacct_blkio_start();
5066 atomic_inc(&rq->nr_iowait);
5067 current->in_iowait = 1;
5068 ret = schedule_timeout(timeout);
5069 current->in_iowait = 0;
5070 atomic_dec(&rq->nr_iowait);
5071 delayacct_blkio_end();
5072 return ret;
5073}
5074
5075
5076
5077
5078
5079
5080
5081
5082SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5083{
5084 int ret = -EINVAL;
5085
5086 switch (policy) {
5087 case SCHED_FIFO:
5088 case SCHED_RR:
5089 ret = MAX_USER_RT_PRIO-1;
5090 break;
5091 case SCHED_NORMAL:
5092 case SCHED_BATCH:
5093 case SCHED_IDLE:
5094 ret = 0;
5095 break;
5096 }
5097 return ret;
5098}
5099
5100
5101
5102
5103
5104
5105
5106
5107SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5108{
5109 int ret = -EINVAL;
5110
5111 switch (policy) {
5112 case SCHED_FIFO:
5113 case SCHED_RR:
5114 ret = 1;
5115 break;
5116 case SCHED_NORMAL:
5117 case SCHED_BATCH:
5118 case SCHED_IDLE:
5119 ret = 0;
5120 }
5121 return ret;
5122}
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5133 struct timespec __user *, interval)
5134{
5135 struct task_struct *p;
5136 unsigned int time_slice;
5137 unsigned long flags;
5138 struct rq *rq;
5139 int retval;
5140 struct timespec t;
5141
5142 if (pid < 0)
5143 return -EINVAL;
5144
5145 retval = -ESRCH;
5146 rcu_read_lock();
5147 p = find_process_by_pid(pid);
5148 if (!p)
5149 goto out_unlock;
5150
5151 retval = security_task_getscheduler(p);
5152 if (retval)
5153 goto out_unlock;
5154
5155 rq = task_rq_lock(p, &flags);
5156 time_slice = p->sched_class->get_rr_interval(rq, p);
5157 task_rq_unlock(rq, &flags);
5158
5159 rcu_read_unlock();
5160 jiffies_to_timespec(time_slice, &t);
5161 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5162 return retval;
5163
5164out_unlock:
5165 rcu_read_unlock();
5166 return retval;
5167}
5168
5169static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5170
5171void sched_show_task(struct task_struct *p)
5172{
5173 unsigned long free = 0;
5174 unsigned state;
5175
5176 state = p->state ? __ffs(p->state) + 1 : 0;
5177 printk(KERN_INFO "%-13.13s %c", p->comm,
5178 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5179#if BITS_PER_LONG == 32
5180 if (state == TASK_RUNNING)
5181 printk(KERN_CONT " running ");
5182 else
5183 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5184#else
5185 if (state == TASK_RUNNING)
5186 printk(KERN_CONT " running task ");
5187 else
5188 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5189#endif
5190#ifdef CONFIG_DEBUG_STACK_USAGE
5191 free = stack_not_used(p);
5192#endif
5193 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5194 task_pid_nr(p), task_pid_nr(p->real_parent),
5195 (unsigned long)task_thread_info(p)->flags);
5196
5197 show_stack(p, NULL);
5198}
5199
5200void show_state_filter(unsigned long state_filter)
5201{
5202 struct task_struct *g, *p;
5203
5204#if BITS_PER_LONG == 32
5205 printk(KERN_INFO
5206 " task PC stack pid father\n");
5207#else
5208 printk(KERN_INFO
5209 " task PC stack pid father\n");
5210#endif
5211 read_lock(&tasklist_lock);
5212 do_each_thread(g, p) {
5213
5214
5215
5216
5217 touch_nmi_watchdog();
5218 if (!state_filter || (p->state & state_filter))
5219 sched_show_task(p);
5220 } while_each_thread(g, p);
5221
5222 touch_all_softlockup_watchdogs();
5223
5224#ifdef CONFIG_SCHED_DEBUG
5225 sysrq_sched_debug_show();
5226#endif
5227 read_unlock(&tasklist_lock);
5228
5229
5230
5231 if (!state_filter)
5232 debug_show_all_locks();
5233}
5234
5235void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5236{
5237 idle->sched_class = &idle_sched_class;
5238}
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248void __cpuinit init_idle(struct task_struct *idle, int cpu)
5249{
5250 struct rq *rq = cpu_rq(cpu);
5251 unsigned long flags;
5252
5253 raw_spin_lock_irqsave(&rq->lock, flags);
5254
5255 __sched_fork(idle);
5256 idle->state = TASK_RUNNING;
5257 idle->se.exec_start = sched_clock();
5258
5259 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5260 __set_task_cpu(idle, cpu);
5261
5262 rq->curr = rq->idle = idle;
5263#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5264 idle->oncpu = 1;
5265#endif
5266 raw_spin_unlock_irqrestore(&rq->lock, flags);
5267
5268
5269#if defined(CONFIG_PREEMPT)
5270 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5271#else
5272 task_thread_info(idle)->preempt_count = 0;
5273#endif
5274
5275
5276
5277 idle->sched_class = &idle_sched_class;
5278 ftrace_graph_init_task(idle);
5279}
5280
5281
5282
5283
5284
5285
5286
5287
/*
 * Global cpumask with external linkage.
 * NOTE(review): not referenced in this part of the file; presumably it
 * tracks CPUs that are in tickless (nohz) idle -- confirm against users.
 */
cpumask_var_t nohz_cpu_mask;
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299static int get_update_sysctl_factor(void)
5300{
5301 unsigned int cpus = min_t(int, num_online_cpus(), 8);
5302 unsigned int factor;
5303
5304 switch (sysctl_sched_tunable_scaling) {
5305 case SCHED_TUNABLESCALING_NONE:
5306 factor = 1;
5307 break;
5308 case SCHED_TUNABLESCALING_LINEAR:
5309 factor = cpus;
5310 break;
5311 case SCHED_TUNABLESCALING_LOG:
5312 default:
5313 factor = 1 + ilog2(cpus);
5314 break;
5315 }
5316
5317 return factor;
5318}
5319
5320static void update_sysctl(void)
5321{
5322 unsigned int factor = get_update_sysctl_factor();
5323
5324#define SET_SYSCTL(name) \
5325 (sysctl_##name = (factor) * normalized_sysctl_##name)
5326 SET_SYSCTL(sched_min_granularity);
5327 SET_SYSCTL(sched_latency);
5328 SET_SYSCTL(sched_wakeup_granularity);
5329 SET_SYSCTL(sched_shares_ratelimit);
5330#undef SET_SYSCTL
5331}
5332
/* Initialise the scaled scheduler tunables; thin wrapper. */
static inline void sched_init_granularity(void)
{
	update_sysctl();
}
5337
5338#ifdef CONFIG_SMP
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5365{
5366 struct migration_req req;
5367 unsigned long flags;
5368 struct rq *rq;
5369 int ret = 0;
5370
5371 rq = task_rq_lock(p, &flags);
5372
5373 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5374 ret = -EINVAL;
5375 goto out;
5376 }
5377
5378 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5379 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5380 ret = -EINVAL;
5381 goto out;
5382 }
5383
5384 if (p->sched_class->set_cpus_allowed)
5385 p->sched_class->set_cpus_allowed(p, new_mask);
5386 else {
5387 cpumask_copy(&p->cpus_allowed, new_mask);
5388 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5389 }
5390
5391
5392 if (cpumask_test_cpu(task_cpu(p), new_mask))
5393 goto out;
5394
5395 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
5396
5397 struct task_struct *mt = rq->migration_thread;
5398
5399 get_task_struct(mt);
5400 task_rq_unlock(rq, &flags);
5401 wake_up_process(mt);
5402 put_task_struct(mt);
5403 wait_for_completion(&req.done);
5404 tlb_migrate_finish(p->mm);
5405 return 0;
5406 }
5407out:
5408 task_rq_unlock(rq, &flags);
5409
5410 return ret;
5411}
5412EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5426{
5427 struct rq *rq_dest, *rq_src;
5428 int ret = 0;
5429
5430 if (unlikely(!cpu_active(dest_cpu)))
5431 return ret;
5432
5433 rq_src = cpu_rq(src_cpu);
5434 rq_dest = cpu_rq(dest_cpu);
5435
5436 double_rq_lock(rq_src, rq_dest);
5437
5438 if (task_cpu(p) != src_cpu)
5439 goto done;
5440
5441 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
5442 goto fail;
5443
5444
5445
5446
5447
5448 if (p->se.on_rq) {
5449 deactivate_task(rq_src, p, 0);
5450 set_task_cpu(p, dest_cpu);
5451 activate_task(rq_dest, p, 0);
5452 check_preempt_curr(rq_dest, p, 0);
5453 }
5454done:
5455 ret = 1;
5456fail:
5457 double_rq_unlock(rq_src, rq_dest);
5458 return ret;
5459}
5460
5461#define RCU_MIGRATION_IDLE 0
5462#define RCU_MIGRATION_NEED_QS 1
5463#define RCU_MIGRATION_GOT_QS 2
5464#define RCU_MIGRATION_MUST_SYNC 3
5465
5466
5467
5468
5469
5470
5471static int migration_thread(void *data)
5472{
5473 int badcpu;
5474 int cpu = (long)data;
5475 struct rq *rq;
5476
5477 rq = cpu_rq(cpu);
5478 BUG_ON(rq->migration_thread != current);
5479
5480 set_current_state(TASK_INTERRUPTIBLE);
5481 while (!kthread_should_stop()) {
5482 struct migration_req *req;
5483 struct list_head *head;
5484
5485 raw_spin_lock_irq(&rq->lock);
5486
5487 if (cpu_is_offline(cpu)) {
5488 raw_spin_unlock_irq(&rq->lock);
5489 break;
5490 }
5491
5492 if (rq->active_balance) {
5493 active_load_balance(rq, cpu);
5494 rq->active_balance = 0;
5495 }
5496
5497 head = &rq->migration_queue;
5498
5499 if (list_empty(head)) {
5500 raw_spin_unlock_irq(&rq->lock);
5501 schedule();
5502 set_current_state(TASK_INTERRUPTIBLE);
5503 continue;
5504 }
5505 req = list_entry(head->next, struct migration_req, list);
5506 list_del_init(head->next);
5507
5508 if (req->task != NULL) {
5509 raw_spin_unlock(&rq->lock);
5510 __migrate_task(req->task, cpu, req->dest_cpu);
5511 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
5512 req->dest_cpu = RCU_MIGRATION_GOT_QS;
5513 raw_spin_unlock(&rq->lock);
5514 } else {
5515 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
5516 raw_spin_unlock(&rq->lock);
5517 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
5518 }
5519 local_irq_enable();
5520
5521 complete(&req->done);
5522 }
5523 __set_current_state(TASK_RUNNING);
5524
5525 return 0;
5526}
5527
5528#ifdef CONFIG_HOTPLUG_CPU
5529
/*
 * Run __migrate_task() with local interrupts disabled.  Interrupts are
 * unconditionally enabled again on return.
 */
static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
{
	int moved;

	local_irq_disable();
	moved = __migrate_task(p, src_cpu, dest_cpu);
	local_irq_enable();

	return moved;
}
5539
5540
5541
5542
/*
 * Move @p off @dead_cpu.  A fallback CPU is picked afresh for every
 * attempt, and the attempt is repeated until __migrate_task_irq()
 * reports success.
 */
static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
	int dest_cpu;

	do {
		dest_cpu = select_fallback_rq(dead_cpu, p);
	} while (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)));
}
5554
5555
5556
5557
5558
5559
5560
5561
5562static void migrate_nr_uninterruptible(struct rq *rq_src)
5563{
5564 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5565 unsigned long flags;
5566
5567 local_irq_save(flags);
5568 double_rq_lock(rq_src, rq_dest);
5569 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5570 rq_src->nr_uninterruptible = 0;
5571 double_rq_unlock(rq_src, rq_dest);
5572 local_irq_restore(flags);
5573}
5574
5575
5576static void migrate_live_tasks(int src_cpu)
5577{
5578 struct task_struct *p, *t;
5579
5580 read_lock(&tasklist_lock);
5581
5582 do_each_thread(t, p) {
5583 if (p == current)
5584 continue;
5585
5586 if (task_cpu(p) == src_cpu)
5587 move_task_off_dead_cpu(src_cpu, p);
5588 } while_each_thread(t, p);
5589
5590 read_unlock(&tasklist_lock);
5591}
5592
5593
5594
5595
5596
5597
5598void sched_idle_next(void)
5599{
5600 int this_cpu = smp_processor_id();
5601 struct rq *rq = cpu_rq(this_cpu);
5602 struct task_struct *p = rq->idle;
5603 unsigned long flags;
5604
5605
5606 BUG_ON(cpu_online(this_cpu));
5607
5608
5609
5610
5611
5612 raw_spin_lock_irqsave(&rq->lock, flags);
5613
5614 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5615
5616 update_rq_clock(rq);
5617 activate_task(rq, p, 0);
5618
5619 raw_spin_unlock_irqrestore(&rq->lock, flags);
5620}
5621
5622
5623
5624
5625
5626void idle_task_exit(void)
5627{
5628 struct mm_struct *mm = current->active_mm;
5629
5630 BUG_ON(cpu_online(smp_processor_id()));
5631
5632 if (mm != &init_mm)
5633 switch_mm(mm, &init_mm, current);
5634 mmdrop(mm);
5635}
5636
5637
5638static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5639{
5640 struct rq *rq = cpu_rq(dead_cpu);
5641
5642
5643 BUG_ON(!p->exit_state);
5644
5645
5646 BUG_ON(p->state == TASK_DEAD);
5647
5648 get_task_struct(p);
5649
5650
5651
5652
5653
5654
5655 raw_spin_unlock_irq(&rq->lock);
5656 move_task_off_dead_cpu(dead_cpu, p);
5657 raw_spin_lock_irq(&rq->lock);
5658
5659 put_task_struct(p);
5660}
5661
5662
5663static void migrate_dead_tasks(unsigned int dead_cpu)
5664{
5665 struct rq *rq = cpu_rq(dead_cpu);
5666 struct task_struct *next;
5667
5668 for ( ; ; ) {
5669 if (!rq->nr_running)
5670 break;
5671 update_rq_clock(rq);
5672 next = pick_next_task(rq);
5673 if (!next)
5674 break;
5675 next->sched_class->put_prev_task(rq, next);
5676 migrate_dead(dead_cpu, next);
5677
5678 }
5679}
5680
5681
5682
5683
5684static void calc_global_load_remove(struct rq *rq)
5685{
5686 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5687 rq->calc_load_active = 0;
5688}
5689#endif
5690
5691#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5692
/*
 * "sched_domain" directory (read/execute only).  Its .child is left
 * unset here; register_sched_domain_sysctl() fills it in with the
 * per-cpu tables and unregister_sched_domain_sysctl() frees it again.
 */
static struct ctl_table sd_ctl_dir[] = {
	{
		.procname	= "sched_domain",
		.mode		= 0555,
	},
	{}
};

/* "kernel" directory containing sd_ctl_dir; this is what gets registered. */
static struct ctl_table sd_ctl_root[] = {
	{
		.procname	= "kernel",
		.mode		= 0555,
		.child		= sd_ctl_dir,
	},
	{}
};
5709
5710static struct ctl_table *sd_alloc_ctl_entry(int n)
5711{
5712 struct ctl_table *entry =
5713 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5714
5715 return entry;
5716}
5717
5718static void sd_free_ctl_entry(struct ctl_table **tablep)
5719{
5720 struct ctl_table *entry;
5721
5722
5723
5724
5725
5726
5727
5728 for (entry = *tablep; entry->mode; entry++) {
5729 if (entry->child)
5730 sd_free_ctl_entry(&entry->child);
5731 if (entry->proc_handler == NULL)
5732 kfree(entry->procname);
5733 }
5734
5735 kfree(*tablep);
5736 *tablep = NULL;
5737}
5738
5739static void
5740set_table_entry(struct ctl_table *entry,
5741 const char *procname, void *data, int maxlen,
5742 mode_t mode, proc_handler *proc_handler)
5743{
5744 entry->procname = procname;
5745 entry->data = data;
5746 entry->maxlen = maxlen;
5747 entry->mode = mode;
5748 entry->proc_handler = proc_handler;
5749}
5750
5751static struct ctl_table *
5752sd_alloc_ctl_domain_table(struct sched_domain *sd)
5753{
5754 struct ctl_table *table = sd_alloc_ctl_entry(13);
5755
5756 if (table == NULL)
5757 return NULL;
5758
5759 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5760 sizeof(long), 0644, proc_doulongvec_minmax);
5761 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5762 sizeof(long), 0644, proc_doulongvec_minmax);
5763 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5764 sizeof(int), 0644, proc_dointvec_minmax);
5765 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5766 sizeof(int), 0644, proc_dointvec_minmax);
5767 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5768 sizeof(int), 0644, proc_dointvec_minmax);
5769 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5770 sizeof(int), 0644, proc_dointvec_minmax);
5771 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5772 sizeof(int), 0644, proc_dointvec_minmax);
5773 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5774 sizeof(int), 0644, proc_dointvec_minmax);
5775 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5776 sizeof(int), 0644, proc_dointvec_minmax);
5777 set_table_entry(&table[9], "cache_nice_tries",
5778 &sd->cache_nice_tries,
5779 sizeof(int), 0644, proc_dointvec_minmax);
5780 set_table_entry(&table[10], "flags", &sd->flags,
5781 sizeof(int), 0644, proc_dointvec_minmax);
5782 set_table_entry(&table[11], "name", sd->name,
5783 CORENAME_MAX_SIZE, 0444, proc_dostring);
5784
5785
5786 return table;
5787}
5788
5789static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5790{
5791 struct ctl_table *entry, *table;
5792 struct sched_domain *sd;
5793 int domain_num = 0, i;
5794 char buf[32];
5795
5796 for_each_domain(cpu, sd)
5797 domain_num++;
5798 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5799 if (table == NULL)
5800 return NULL;
5801
5802 i = 0;
5803 for_each_domain(cpu, sd) {
5804 snprintf(buf, 32, "domain%d", i);
5805 entry->procname = kstrdup(buf, GFP_KERNEL);
5806 entry->mode = 0555;
5807 entry->child = sd_alloc_ctl_domain_table(sd);
5808 entry++;
5809 i++;
5810 }
5811 return table;
5812}
5813
5814static struct ctl_table_header *sd_sysctl_header;
5815static void register_sched_domain_sysctl(void)
5816{
5817 int i, cpu_num = num_possible_cpus();
5818 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5819 char buf[32];
5820
5821 WARN_ON(sd_ctl_dir[0].child);
5822 sd_ctl_dir[0].child = entry;
5823
5824 if (entry == NULL)
5825 return;
5826
5827 for_each_possible_cpu(i) {
5828 snprintf(buf, 32, "cpu%d", i);
5829 entry->procname = kstrdup(buf, GFP_KERNEL);
5830 entry->mode = 0555;
5831 entry->child = sd_alloc_ctl_cpu_table(i);
5832 entry++;
5833 }
5834
5835 WARN_ON(sd_sysctl_header);
5836 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5837}
5838
5839
5840static void unregister_sched_domain_sysctl(void)
5841{
5842 if (sd_sysctl_header)
5843 unregister_sysctl_table(sd_sysctl_header);
5844 sd_sysctl_header = NULL;
5845 if (sd_ctl_dir[0].child)
5846 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5847}
5848#else
/* No-op stubs used when the sched_domain sysctl interface is compiled out. */
static void register_sched_domain_sysctl(void)
{
}

static void unregister_sched_domain_sysctl(void)
{
}
5855#endif
5856
5857static void set_rq_online(struct rq *rq)
5858{
5859 if (!rq->online) {
5860 const struct sched_class *class;
5861
5862 cpumask_set_cpu(rq->cpu, rq->rd->online);
5863 rq->online = 1;
5864
5865 for_each_class(class) {
5866 if (class->rq_online)
5867 class->rq_online(rq);
5868 }
5869 }
5870}
5871
5872static void set_rq_offline(struct rq *rq)
5873{
5874 if (rq->online) {
5875 const struct sched_class *class;
5876
5877 for_each_class(class) {
5878 if (class->rq_offline)
5879 class->rq_offline(rq);
5880 }
5881
5882 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5883 rq->online = 0;
5884 }
5885}
5886
5887
5888
5889
5890
5891static int __cpuinit
5892migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5893{
5894 struct task_struct *p;
5895 int cpu = (long)hcpu;
5896 unsigned long flags;
5897 struct rq *rq;
5898
5899 switch (action) {
5900
5901 case CPU_UP_PREPARE:
5902 case CPU_UP_PREPARE_FROZEN:
5903 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5904 if (IS_ERR(p))
5905 return NOTIFY_BAD;
5906 kthread_bind(p, cpu);
5907
5908 rq = task_rq_lock(p, &flags);
5909 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5910 task_rq_unlock(rq, &flags);
5911 get_task_struct(p);
5912 cpu_rq(cpu)->migration_thread = p;
5913 rq->calc_load_update = calc_load_update;
5914 break;
5915
5916 case CPU_ONLINE:
5917 case CPU_ONLINE_FROZEN:
5918
5919 wake_up_process(cpu_rq(cpu)->migration_thread);
5920
5921
5922 rq = cpu_rq(cpu);
5923 raw_spin_lock_irqsave(&rq->lock, flags);
5924 if (rq->rd) {
5925 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5926
5927 set_rq_online(rq);
5928 }
5929 raw_spin_unlock_irqrestore(&rq->lock, flags);
5930 break;
5931
5932#ifdef CONFIG_HOTPLUG_CPU
5933 case CPU_UP_CANCELED:
5934 case CPU_UP_CANCELED_FROZEN:
5935 if (!cpu_rq(cpu)->migration_thread)
5936 break;
5937
5938 kthread_bind(cpu_rq(cpu)->migration_thread,
5939 cpumask_any(cpu_online_mask));
5940 kthread_stop(cpu_rq(cpu)->migration_thread);
5941 put_task_struct(cpu_rq(cpu)->migration_thread);
5942 cpu_rq(cpu)->migration_thread = NULL;
5943 break;
5944
5945 case CPU_DEAD:
5946 case CPU_DEAD_FROZEN:
5947 cpuset_lock();
5948 migrate_live_tasks(cpu);
5949 rq = cpu_rq(cpu);
5950 kthread_stop(rq->migration_thread);
5951 put_task_struct(rq->migration_thread);
5952 rq->migration_thread = NULL;
5953
5954 raw_spin_lock_irq(&rq->lock);
5955 update_rq_clock(rq);
5956 deactivate_task(rq, rq->idle, 0);
5957 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5958 rq->idle->sched_class = &idle_sched_class;
5959 migrate_dead_tasks(cpu);
5960 raw_spin_unlock_irq(&rq->lock);
5961 cpuset_unlock();
5962 migrate_nr_uninterruptible(rq);
5963 BUG_ON(rq->nr_running != 0);
5964 calc_global_load_remove(rq);
5965
5966
5967
5968
5969
5970 raw_spin_lock_irq(&rq->lock);
5971 while (!list_empty(&rq->migration_queue)) {
5972 struct migration_req *req;
5973
5974 req = list_entry(rq->migration_queue.next,
5975 struct migration_req, list);
5976 list_del_init(&req->list);
5977 raw_spin_unlock_irq(&rq->lock);
5978 complete(&req->done);
5979 raw_spin_lock_irq(&rq->lock);
5980 }
5981 raw_spin_unlock_irq(&rq->lock);
5982 break;
5983
5984 case CPU_DYING:
5985 case CPU_DYING_FROZEN:
5986
5987 rq = cpu_rq(cpu);
5988 raw_spin_lock_irqsave(&rq->lock, flags);
5989 if (rq->rd) {
5990 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5991 set_rq_offline(rq);
5992 }
5993 raw_spin_unlock_irqrestore(&rq->lock, flags);
5994 break;
5995#endif
5996 }
5997 return NOTIFY_OK;
5998}
5999
6000
6001
6002
6003
6004
6005static struct notifier_block __cpuinitdata migration_notifier = {
6006 .notifier_call = migration_call,
6007 .priority = 10
6008};
6009
6010static int __init migration_init(void)
6011{
6012 void *cpu = (void *)(long)smp_processor_id();
6013 int err;
6014
6015
6016 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6017 BUG_ON(err == NOTIFY_BAD);
6018 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6019 register_cpu_notifier(&migration_notifier);
6020
6021 return 0;
6022}
6023early_initcall(migration_init);
6024#endif
6025
6026#ifdef CONFIG_SMP
6027
6028#ifdef CONFIG_SCHED_DEBUG
6029
6030static __read_mostly int sched_domain_debug_enabled;
6031
6032static int __init sched_domain_debug_setup(char *str)
6033{
6034 sched_domain_debug_enabled = 1;
6035
6036 return 0;
6037}
6038early_param("sched_debug", sched_domain_debug_setup);
6039
6040static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6041 struct cpumask *groupmask)
6042{
6043 struct sched_group *group = sd->groups;
6044 char str[256];
6045
6046 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
6047 cpumask_clear(groupmask);
6048
6049 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6050
6051 if (!(sd->flags & SD_LOAD_BALANCE)) {
6052 printk("does not load-balance\n");
6053 if (sd->parent)
6054 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
6055 " has parent");
6056 return -1;
6057 }
6058
6059 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6060
6061 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
6062 printk(KERN_ERR "ERROR: domain->span does not contain "
6063 "CPU%d\n", cpu);
6064 }
6065 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
6066 printk(KERN_ERR "ERROR: domain->groups does not contain"
6067 " CPU%d\n", cpu);
6068 }
6069
6070 printk(KERN_DEBUG "%*s groups:", level + 1, "");
6071 do {
6072 if (!group) {
6073 printk("\n");
6074 printk(KERN_ERR "ERROR: group is NULL\n");
6075 break;
6076 }
6077
6078 if (!group->cpu_power) {
6079 printk(KERN_CONT "\n");
6080 printk(KERN_ERR "ERROR: domain->cpu_power not "
6081 "set\n");
6082 break;
6083 }
6084
6085 if (!cpumask_weight(sched_group_cpus(group))) {
6086 printk(KERN_CONT "\n");
6087 printk(KERN_ERR "ERROR: empty group\n");
6088 break;
6089 }
6090
6091 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
6092 printk(KERN_CONT "\n");
6093 printk(KERN_ERR "ERROR: repeated CPUs\n");
6094 break;
6095 }
6096
6097 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6098
6099 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6100
6101 printk(KERN_CONT " %s", str);
6102 if (group->cpu_power != SCHED_LOAD_SCALE) {
6103 printk(KERN_CONT " (cpu_power = %d)",
6104 group->cpu_power);
6105 }
6106
6107 group = group->next;
6108 } while (