1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h>
37#include <linux/interrupt.h>
38#include <linux/capability.h>
39#include <linux/completion.h>
40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h>
42#include <linux/perf_event.h>
43#include <linux/security.h>
44#include <linux/notifier.h>
45#include <linux/profile.h>
46#include <linux/freezer.h>
47#include <linux/vmalloc.h>
48#include <linux/blkdev.h>
49#include <linux/delay.h>
50#include <linux/pid_namespace.h>
51#include <linux/smp.h>
52#include <linux/threads.h>
53#include <linux/timer.h>
54#include <linux/rcupdate.h>
55#include <linux/cpu.h>
56#include <linux/cpuset.h>
57#include <linux/percpu.h>
58#include <linux/proc_fs.h>
59#include <linux/seq_file.h>
60#include <linux/stop_machine.h>
61#include <linux/sysctl.h>
62#include <linux/syscalls.h>
63#include <linux/times.h>
64#include <linux/tsacct_kern.h>
65#include <linux/kprobes.h>
66#include <linux/delayacct.h>
67#include <linux/unistd.h>
68#include <linux/pagemap.h>
69#include <linux/hrtimer.h>
70#include <linux/tick.h>
71#include <linux/debugfs.h>
72#include <linux/ctype.h>
73#include <linux/ftrace.h>
74#include <linux/slab.h>
75
76#include <asm/tlb.h>
77#include <asm/irq_regs.h>
78
79#include "sched_cpupri.h"
80#include "workqueue_sched.h"
81
82#define CREATE_TRACE_POINTS
83#include <trace/events/sched.h>
84
85
86
87
88
89
90#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
91#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
92#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
93
94
95
96
97
98
99#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
100#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
101#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
102
103
104
105
106#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
107
108#define NICE_0_LOAD SCHED_LOAD_SCALE
109#define NICE_0_SHIFT SCHED_LOAD_SHIFT
110
111
112
113
114
115
116
117#define DEF_TIMESLICE (100 * HZ / 1000)
118
119
120
121
122#define RUNTIME_INF ((u64)~0ULL)
123
124static inline int rt_policy(int policy)
125{
126 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
127 return 1;
128 return 0;
129}
130
131static inline int task_has_rt_policy(struct task_struct *p)
132{
133 return rt_policy(p->policy);
134}
135
136
137
138
139struct rt_prio_array {
140 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);
141 struct list_head queue[MAX_RT_PRIO];
142};
143
144struct rt_bandwidth {
145
146 raw_spinlock_t rt_runtime_lock;
147 ktime_t rt_period;
148 u64 rt_runtime;
149 struct hrtimer rt_period_timer;
150};
151
152static struct rt_bandwidth def_rt_bandwidth;
153
154static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
155
156static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
157{
158 struct rt_bandwidth *rt_b =
159 container_of(timer, struct rt_bandwidth, rt_period_timer);
160 ktime_t now;
161 int overrun;
162 int idle = 0;
163
164 for (;;) {
165 now = hrtimer_cb_get_time(timer);
166 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
167
168 if (!overrun)
169 break;
170
171 idle = do_sched_rt_period_timer(rt_b, overrun);
172 }
173
174 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
175}
176
177static
178void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
179{
180 rt_b->rt_period = ns_to_ktime(period);
181 rt_b->rt_runtime = runtime;
182
183 raw_spin_lock_init(&rt_b->rt_runtime_lock);
184
185 hrtimer_init(&rt_b->rt_period_timer,
186 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
187 rt_b->rt_period_timer.function = sched_rt_period_timer;
188}
189
190static inline int rt_bandwidth_enabled(void)
191{
192 return sysctl_sched_rt_runtime >= 0;
193}
194
195static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
196{
197 ktime_t now;
198
199 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
200 return;
201
202 if (hrtimer_active(&rt_b->rt_period_timer))
203 return;
204
205 raw_spin_lock(&rt_b->rt_runtime_lock);
206 for (;;) {
207 unsigned long delta;
208 ktime_t soft, hard;
209
210 if (hrtimer_active(&rt_b->rt_period_timer))
211 break;
212
213 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
214 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
215
216 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
217 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
218 delta = ktime_to_ns(ktime_sub(hard, soft));
219 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
220 HRTIMER_MODE_ABS_PINNED, 0);
221 }
222 raw_spin_unlock(&rt_b->rt_runtime_lock);
223}
224
225#ifdef CONFIG_RT_GROUP_SCHED
226static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
227{
228 hrtimer_cancel(&rt_b->rt_period_timer);
229}
230#endif
231
232
233
234
235
236static DEFINE_MUTEX(sched_domains_mutex);
237
238#ifdef CONFIG_CGROUP_SCHED
239
240#include <linux/cgroup.h>
241
242struct cfs_rq;
243
244static LIST_HEAD(task_groups);
245
246
247struct task_group {
248 struct cgroup_subsys_state css;
249
250#ifdef CONFIG_FAIR_GROUP_SCHED
251
252 struct sched_entity **se;
253
254 struct cfs_rq **cfs_rq;
255 unsigned long shares;
256#endif
257
258#ifdef CONFIG_RT_GROUP_SCHED
259 struct sched_rt_entity **rt_se;
260 struct rt_rq **rt_rq;
261
262 struct rt_bandwidth rt_bandwidth;
263#endif
264
265 struct rcu_head rcu;
266 struct list_head list;
267
268 struct task_group *parent;
269 struct list_head siblings;
270 struct list_head children;
271};
272
273#define root_task_group init_task_group
274
275
276
277
278static DEFINE_SPINLOCK(task_group_lock);
279
280#ifdef CONFIG_FAIR_GROUP_SCHED
281
282#ifdef CONFIG_SMP
283static int root_task_group_empty(void)
284{
285 return list_empty(&root_task_group.children);
286}
287#endif
288
289# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
290
291
292
293
294
295
296
297
298
299#define MIN_SHARES 2
300#define MAX_SHARES (1UL << 18)
301
302static int init_task_group_load = INIT_TASK_GROUP_LOAD;
303#endif
304
305
306
307
308struct task_group init_task_group;
309
310#endif
311
312
313struct cfs_rq {
314 struct load_weight load;
315 unsigned long nr_running;
316
317 u64 exec_clock;
318 u64 min_vruntime;
319
320 struct rb_root tasks_timeline;
321 struct rb_node *rb_leftmost;
322
323 struct list_head tasks;
324 struct list_head *balance_iterator;
325
326
327
328
329
330 struct sched_entity *curr, *next, *last;
331
332 unsigned int nr_spread_over;
333
334#ifdef CONFIG_FAIR_GROUP_SCHED
335 struct rq *rq;
336
337
338
339
340
341
342
343
344
345 struct list_head leaf_cfs_rq_list;
346 struct task_group *tg;
347
348#ifdef CONFIG_SMP
349
350
351
352 unsigned long task_weight;
353
354
355
356
357
358
359
360 unsigned long h_load;
361
362
363
364
365 unsigned long shares;
366
367
368
369
370 unsigned long rq_weight;
371#endif
372#endif
373};
374
375
376struct rt_rq {
377 struct rt_prio_array active;
378 unsigned long rt_nr_running;
379#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
380 struct {
381 int curr;
382#ifdef CONFIG_SMP
383 int next;
384#endif
385 } highest_prio;
386#endif
387#ifdef CONFIG_SMP
388 unsigned long rt_nr_migratory;
389 unsigned long rt_nr_total;
390 int overloaded;
391 struct plist_head pushable_tasks;
392#endif
393 int rt_throttled;
394 u64 rt_time;
395 u64 rt_runtime;
396
397 raw_spinlock_t rt_runtime_lock;
398
399#ifdef CONFIG_RT_GROUP_SCHED
400 unsigned long rt_nr_boosted;
401
402 struct rq *rq;
403 struct list_head leaf_rt_rq_list;
404 struct task_group *tg;
405#endif
406};
407
408#ifdef CONFIG_SMP
409
410
411
412
413
414
415
416
417
418struct root_domain {
419 atomic_t refcount;
420 cpumask_var_t span;
421 cpumask_var_t online;
422
423
424
425
426
427 cpumask_var_t rto_mask;
428 atomic_t rto_count;
429 struct cpupri cpupri;
430};
431
432
433
434
435
436static struct root_domain def_root_domain;
437
438#endif
439
440
441
442
443
444
445
446
447struct rq {
448
449 raw_spinlock_t lock;
450
451
452
453
454
455 unsigned long nr_running;
456 #define CPU_LOAD_IDX_MAX 5
457 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
458 unsigned long last_load_update_tick;
459#ifdef CONFIG_NO_HZ
460 u64 nohz_stamp;
461 unsigned char nohz_balance_kick;
462#endif
463 unsigned int skip_clock_update;
464
465
466 struct load_weight load;
467 unsigned long nr_load_updates;
468 u64 nr_switches;
469
470 struct cfs_rq cfs;
471 struct rt_rq rt;
472
473#ifdef CONFIG_FAIR_GROUP_SCHED
474
475 struct list_head leaf_cfs_rq_list;
476#endif
477#ifdef CONFIG_RT_GROUP_SCHED
478 struct list_head leaf_rt_rq_list;
479#endif
480
481
482
483
484
485
486
487 unsigned long nr_uninterruptible;
488
489 struct task_struct *curr, *idle, *stop;
490 unsigned long next_balance;
491 struct mm_struct *prev_mm;
492
493 u64 clock;
494 u64 clock_task;
495
496 atomic_t nr_iowait;
497
498#ifdef CONFIG_SMP
499 struct root_domain *rd;
500 struct sched_domain *sd;
501
502 unsigned long cpu_power;
503
504 unsigned char idle_at_tick;
505
506 int post_schedule;
507 int active_balance;
508 int push_cpu;
509 struct cpu_stop_work active_balance_work;
510
511 int cpu;
512 int online;
513
514 unsigned long avg_load_per_task;
515
516 u64 rt_avg;
517 u64 age_stamp;
518 u64 idle_stamp;
519 u64 avg_idle;
520#endif
521
522#ifdef CONFIG_IRQ_TIME_ACCOUNTING
523 u64 prev_irq_time;
524#endif
525
526
527 unsigned long calc_load_update;
528 long calc_load_active;
529
530#ifdef CONFIG_SCHED_HRTICK
531#ifdef CONFIG_SMP
532 int hrtick_csd_pending;
533 struct call_single_data hrtick_csd;
534#endif
535 struct hrtimer hrtick_timer;
536#endif
537
538#ifdef CONFIG_SCHEDSTATS
539
540 struct sched_info rq_sched_info;
541 unsigned long long rq_cpu_time;
542
543
544
545 unsigned int yld_count;
546
547
548 unsigned int sched_switch;
549 unsigned int sched_count;
550 unsigned int sched_goidle;
551
552
553 unsigned int ttwu_count;
554 unsigned int ttwu_local;
555
556
557 unsigned int bkl_count;
558#endif
559};
560
561static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
562
563
564static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
565
566static inline int cpu_of(struct rq *rq)
567{
568#ifdef CONFIG_SMP
569 return rq->cpu;
570#else
571 return 0;
572#endif
573}
574
575#define rcu_dereference_check_sched_domain(p) \
576 rcu_dereference_check((p), \
577 rcu_read_lock_sched_held() || \
578 lockdep_is_held(&sched_domains_mutex))
579
580
581
582
583
584
585
586
587#define for_each_domain(cpu, __sd) \
588 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
589
590#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
591#define this_rq() (&__get_cpu_var(runqueues))
592#define task_rq(p) cpu_rq(task_cpu(p))
593#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
594#define raw_rq() (&__raw_get_cpu_var(runqueues))
595
596#ifdef CONFIG_CGROUP_SCHED
597
598
599
600
601
602
603
604
605
606static inline struct task_group *task_group(struct task_struct *p)
607{
608 struct cgroup_subsys_state *css;
609
610 if (p->flags & PF_EXITING)
611 return &root_task_group;
612
613 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
614 lockdep_is_held(&task_rq(p)->lock));
615 return container_of(css, struct task_group, css);
616}
617
618
619static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
620{
621#ifdef CONFIG_FAIR_GROUP_SCHED
622 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
623 p->se.parent = task_group(p)->se[cpu];
624#endif
625
626#ifdef CONFIG_RT_GROUP_SCHED
627 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
628 p->rt.parent = task_group(p)->rt_se[cpu];
629#endif
630}
631
632#else
633
634static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
635static inline struct task_group *task_group(struct task_struct *p)
636{
637 return NULL;
638}
639
640#endif
641
642static void update_rq_clock_task(struct rq *rq, s64 delta);
643
644static void update_rq_clock(struct rq *rq)
645{
646 s64 delta;
647
648 if (rq->skip_clock_update)
649 return;
650
651 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
652 rq->clock += delta;
653 update_rq_clock_task(rq, delta);
654}
655
656
657
658
659#ifdef CONFIG_SCHED_DEBUG
660# define const_debug __read_mostly
661#else
662# define const_debug static const
663#endif
664
665
666
667
668
669
670
671
672
673int runqueue_is_locked(int cpu)
674{
675 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
676}
677
678
679
680
681
682#define SCHED_FEAT(name, enabled) \
683 __SCHED_FEAT_##name ,
684
685enum {
686#include "sched_features.h"
687};
688
689#undef SCHED_FEAT
690
691#define SCHED_FEAT(name, enabled) \
692 (1UL << __SCHED_FEAT_##name) * enabled |
693
694const_debug unsigned int sysctl_sched_features =
695#include "sched_features.h"
696 0;
697
698#undef SCHED_FEAT
699
700#ifdef CONFIG_SCHED_DEBUG
701#define SCHED_FEAT(name, enabled) \
702 #name ,
703
704static __read_mostly char *sched_feat_names[] = {
705#include "sched_features.h"
706 NULL
707};
708
709#undef SCHED_FEAT
710
711static int sched_feat_show(struct seq_file *m, void *v)
712{
713 int i;
714
715 for (i = 0; sched_feat_names[i]; i++) {
716 if (!(sysctl_sched_features & (1UL << i)))
717 seq_puts(m, "NO_");
718 seq_printf(m, "%s ", sched_feat_names[i]);
719 }
720 seq_puts(m, "\n");
721
722 return 0;
723}
724
725static ssize_t
726sched_feat_write(struct file *filp, const char __user *ubuf,
727 size_t cnt, loff_t *ppos)
728{
729 char buf[64];
730 char *cmp;
731 int neg = 0;
732 int i;
733
734 if (cnt > 63)
735 cnt = 63;
736
737 if (copy_from_user(&buf, ubuf, cnt))
738 return -EFAULT;
739
740 buf[cnt] = 0;
741 cmp = strstrip(buf);
742
743 if (strncmp(buf, "NO_", 3) == 0) {
744 neg = 1;
745 cmp += 3;
746 }
747
748 for (i = 0; sched_feat_names[i]; i++) {
749 if (strcmp(cmp, sched_feat_names[i]) == 0) {
750 if (neg)
751 sysctl_sched_features &= ~(1UL << i);
752 else
753 sysctl_sched_features |= (1UL << i);
754 break;
755 }
756 }
757
758 if (!sched_feat_names[i])
759 return -EINVAL;
760
761 *ppos += cnt;
762
763 return cnt;
764}
765
766static int sched_feat_open(struct inode *inode, struct file *filp)
767{
768 return single_open(filp, sched_feat_show, NULL);
769}
770
771static const struct file_operations sched_feat_fops = {
772 .open = sched_feat_open,
773 .write = sched_feat_write,
774 .read = seq_read,
775 .llseek = seq_lseek,
776 .release = single_release,
777};
778
779static __init int sched_init_debug(void)
780{
781 debugfs_create_file("sched_features", 0644, NULL, NULL,
782 &sched_feat_fops);
783
784 return 0;
785}
786late_initcall(sched_init_debug);
787
788#endif
789
790#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
791
792
793
794
795
796const_debug unsigned int sysctl_sched_nr_migrate = 32;
797
798
799
800
801
802unsigned int sysctl_sched_shares_ratelimit = 250000;
803unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
804
805
806
807
808
809
810unsigned int sysctl_sched_shares_thresh = 4;
811
812
813
814
815
816
817
818const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
819
820
821
822
823
824unsigned int sysctl_sched_rt_period = 1000000;
825
826static __read_mostly int scheduler_running;
827
828
829
830
831
832int sysctl_sched_rt_runtime = 950000;
833
834static inline u64 global_rt_period(void)
835{
836 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
837}
838
839static inline u64 global_rt_runtime(void)
840{
841 if (sysctl_sched_rt_runtime < 0)
842 return RUNTIME_INF;
843
844 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
845}
846
847#ifndef prepare_arch_switch
848# define prepare_arch_switch(next) do { } while (0)
849#endif
850#ifndef finish_arch_switch
851# define finish_arch_switch(prev) do { } while (0)
852#endif
853
854static inline int task_current(struct rq *rq, struct task_struct *p)
855{
856 return rq->curr == p;
857}
858
859#ifndef __ARCH_WANT_UNLOCKED_CTXSW
860static inline int task_running(struct rq *rq, struct task_struct *p)
861{
862 return task_current(rq, p);
863}
864
865static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
866{
867}
868
869static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
870{
871#ifdef CONFIG_DEBUG_SPINLOCK
872
873 rq->lock.owner = current;
874#endif
875
876
877
878
879
880 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
881
882 raw_spin_unlock_irq(&rq->lock);
883}
884
885#else
886static inline int task_running(struct rq *rq, struct task_struct *p)
887{
888#ifdef CONFIG_SMP
889 return p->oncpu;
890#else
891 return task_current(rq, p);
892#endif
893}
894
895static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
896{
897#ifdef CONFIG_SMP
898
899
900
901
902
903 next->oncpu = 1;
904#endif
905#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
906 raw_spin_unlock_irq(&rq->lock);
907#else
908 raw_spin_unlock(&rq->lock);
909#endif
910}
911
912static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
913{
914#ifdef CONFIG_SMP
915
916
917
918
919
920 smp_wmb();
921 prev->oncpu = 0;
922#endif
923#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
924 local_irq_enable();
925#endif
926}
927#endif
928
929
930
931
932
933static inline int task_is_waking(struct task_struct *p)
934{
935 return unlikely(p->state == TASK_WAKING);
936}
937
938
939
940
941
942static inline struct rq *__task_rq_lock(struct task_struct *p)
943 __acquires(rq->lock)
944{
945 struct rq *rq;
946
947 for (;;) {
948 rq = task_rq(p);
949 raw_spin_lock(&rq->lock);
950 if (likely(rq == task_rq(p)))
951 return rq;
952 raw_spin_unlock(&rq->lock);
953 }
954}
955
956
957
958
959
960
961static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
962 __acquires(rq->lock)
963{
964 struct rq *rq;
965
966 for (;;) {
967 local_irq_save(*flags);
968 rq = task_rq(p);
969 raw_spin_lock(&rq->lock);
970 if (likely(rq == task_rq(p)))
971 return rq;
972 raw_spin_unlock_irqrestore(&rq->lock, *flags);
973 }
974}
975
976static void __task_rq_unlock(struct rq *rq)
977 __releases(rq->lock)
978{
979 raw_spin_unlock(&rq->lock);
980}
981
982static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
983 __releases(rq->lock)
984{
985 raw_spin_unlock_irqrestore(&rq->lock, *flags);
986}
987
988
989
990
991static struct rq *this_rq_lock(void)
992 __acquires(rq->lock)
993{
994 struct rq *rq;
995
996 local_irq_disable();
997 rq = this_rq();
998 raw_spin_lock(&rq->lock);
999
1000 return rq;
1001}
1002
1003#ifdef CONFIG_SCHED_HRTICK
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020static inline int hrtick_enabled(struct rq *rq)
1021{
1022 if (!sched_feat(HRTICK))
1023 return 0;
1024 if (!cpu_active(cpu_of(rq)))
1025 return 0;
1026 return hrtimer_is_hres_active(&rq->hrtick_timer);
1027}
1028
1029static void hrtick_clear(struct rq *rq)
1030{
1031 if (hrtimer_active(&rq->hrtick_timer))
1032 hrtimer_cancel(&rq->hrtick_timer);
1033}
1034
1035
1036
1037
1038
1039static enum hrtimer_restart hrtick(struct hrtimer *timer)
1040{
1041 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1042
1043 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1044
1045 raw_spin_lock(&rq->lock);
1046 update_rq_clock(rq);
1047 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1048 raw_spin_unlock(&rq->lock);
1049
1050 return HRTIMER_NORESTART;
1051}
1052
1053#ifdef CONFIG_SMP
1054
1055
1056
1057static void __hrtick_start(void *arg)
1058{
1059 struct rq *rq = arg;
1060
1061 raw_spin_lock(&rq->lock);
1062 hrtimer_restart(&rq->hrtick_timer);
1063 rq->hrtick_csd_pending = 0;
1064 raw_spin_unlock(&rq->lock);
1065}
1066
1067
1068
1069
1070
1071
1072static void hrtick_start(struct rq *rq, u64 delay)
1073{
1074 struct hrtimer *timer = &rq->hrtick_timer;
1075 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1076
1077 hrtimer_set_expires(timer, time);
1078
1079 if (rq == this_rq()) {
1080 hrtimer_restart(timer);
1081 } else if (!rq->hrtick_csd_pending) {
1082 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1083 rq->hrtick_csd_pending = 1;
1084 }
1085}
1086
1087static int
1088hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1089{
1090 int cpu = (int)(long)hcpu;
1091
1092 switch (action) {
1093 case CPU_UP_CANCELED:
1094 case CPU_UP_CANCELED_FROZEN:
1095 case CPU_DOWN_PREPARE:
1096 case CPU_DOWN_PREPARE_FROZEN:
1097 case CPU_DEAD:
1098 case CPU_DEAD_FROZEN:
1099 hrtick_clear(cpu_rq(cpu));
1100 return NOTIFY_OK;
1101 }
1102
1103 return NOTIFY_DONE;
1104}
1105
1106static __init void init_hrtick(void)
1107{
1108 hotcpu_notifier(hotplug_hrtick, 0);
1109}
1110#else
1111
1112
1113
1114
1115
1116static void hrtick_start(struct rq *rq, u64 delay)
1117{
1118 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1119 HRTIMER_MODE_REL_PINNED, 0);
1120}
1121
1122static inline void init_hrtick(void)
1123{
1124}
1125#endif
1126
1127static void init_rq_hrtick(struct rq *rq)
1128{
1129#ifdef CONFIG_SMP
1130 rq->hrtick_csd_pending = 0;
1131
1132 rq->hrtick_csd.flags = 0;
1133 rq->hrtick_csd.func = __hrtick_start;
1134 rq->hrtick_csd.info = rq;
1135#endif
1136
1137 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1138 rq->hrtick_timer.function = hrtick;
1139}
1140#else
1141static inline void hrtick_clear(struct rq *rq)
1142{
1143}
1144
1145static inline void init_rq_hrtick(struct rq *rq)
1146{
1147}
1148
1149static inline void init_hrtick(void)
1150{
1151}
1152#endif
1153
1154
1155
1156
1157
1158
1159
1160
1161#ifdef CONFIG_SMP
1162
1163#ifndef tsk_is_polling
1164#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1165#endif
1166
1167static void resched_task(struct task_struct *p)
1168{
1169 int cpu;
1170
1171 assert_raw_spin_locked(&task_rq(p)->lock);
1172
1173 if (test_tsk_need_resched(p))
1174 return;
1175
1176 set_tsk_need_resched(p);
1177
1178 cpu = task_cpu(p);
1179 if (cpu == smp_processor_id())
1180 return;
1181
1182
1183 smp_mb();
1184 if (!tsk_is_polling(p))
1185 smp_send_reschedule(cpu);
1186}
1187
1188static void resched_cpu(int cpu)
1189{
1190 struct rq *rq = cpu_rq(cpu);
1191 unsigned long flags;
1192
1193 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1194 return;
1195 resched_task(cpu_curr(cpu));
1196 raw_spin_unlock_irqrestore(&rq->lock, flags);
1197}
1198
1199#ifdef CONFIG_NO_HZ
1200
1201
1202
1203
1204
1205
1206
1207
1208int get_nohz_timer_target(void)
1209{
1210 int cpu = smp_processor_id();
1211 int i;
1212 struct sched_domain *sd;
1213
1214 for_each_domain(cpu, sd) {
1215 for_each_cpu(i, sched_domain_span(sd))
1216 if (!idle_cpu(i))
1217 return i;
1218 }
1219 return cpu;
1220}
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231void wake_up_idle_cpu(int cpu)
1232{
1233 struct rq *rq = cpu_rq(cpu);
1234
1235 if (cpu == smp_processor_id())
1236 return;
1237
1238
1239
1240
1241
1242
1243
1244
1245 if (rq->curr != rq->idle)
1246 return;
1247
1248
1249
1250
1251
1252
1253 set_tsk_need_resched(rq->idle);
1254
1255
1256 smp_mb();
1257 if (!tsk_is_polling(rq->idle))
1258 smp_send_reschedule(cpu);
1259}
1260
1261#endif
1262
1263static u64 sched_avg_period(void)
1264{
1265 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1266}
1267
1268static void sched_avg_update(struct rq *rq)
1269{
1270 s64 period = sched_avg_period();
1271
1272 while ((s64)(rq->clock - rq->age_stamp) > period) {
1273
1274
1275
1276
1277
1278 asm("" : "+rm" (rq->age_stamp));
1279 rq->age_stamp += period;
1280 rq->rt_avg /= 2;
1281 }
1282}
1283
1284static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1285{
1286 rq->rt_avg += rt_delta;
1287 sched_avg_update(rq);
1288}
1289
1290#else
1291static void resched_task(struct task_struct *p)
1292{
1293 assert_raw_spin_locked(&task_rq(p)->lock);
1294 set_tsk_need_resched(p);
1295}
1296
1297static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1298{
1299}
1300
1301static void sched_avg_update(struct rq *rq)
1302{
1303}
1304#endif
1305
1306#if BITS_PER_LONG == 32
1307# define WMULT_CONST (~0UL)
1308#else
1309# define WMULT_CONST (1UL << 32)
1310#endif
1311
1312#define WMULT_SHIFT 32
1313
1314
1315
1316
1317#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1318
1319
1320
1321
1322static unsigned long
1323calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1324 struct load_weight *lw)
1325{
1326 u64 tmp;
1327
1328 if (!lw->inv_weight) {
1329 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1330 lw->inv_weight = 1;
1331 else
1332 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1333 / (lw->weight+1);
1334 }
1335
1336 tmp = (u64)delta_exec * weight;
1337
1338
1339
1340 if (unlikely(tmp > WMULT_CONST))
1341 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1342 WMULT_SHIFT/2);
1343 else
1344 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1345
1346 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1347}
1348
1349static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1350{
1351 lw->weight += inc;
1352 lw->inv_weight = 0;
1353}
1354
1355static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1356{
1357 lw->weight -= dec;
1358 lw->inv_weight = 0;
1359}
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370#define WEIGHT_IDLEPRIO 3
1371#define WMULT_IDLEPRIO 1431655765
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385static const int prio_to_weight[40] = {
1386 88761, 71755, 56483, 46273, 36291,
1387 29154, 23254, 18705, 14949, 11916,
1388 9548, 7620, 6100, 4904, 3906,
1389 3121, 2501, 1991, 1586, 1277,
1390 1024, 820, 655, 526, 423,
1391 335, 272, 215, 172, 137,
1392 110, 87, 70, 56, 45,
1393 36, 29, 23, 18, 15,
1394};
1395
1396
1397
1398
1399
1400
1401
1402
1403static const u32 prio_to_wmult[40] = {
1404 48388, 59856, 76040, 92818, 118348,
1405 147320, 184698, 229616, 287308, 360437,
1406 449829, 563644, 704093, 875809, 1099582,
1407 1376151, 1717300, 2157191, 2708050, 3363326,
1408 4194304, 5237765, 6557202, 8165337, 10153587,
1409 12820798, 15790321, 19976592, 24970740, 31350126,
1410 39045157, 49367440, 61356676, 76695844, 95443717,
1411 119304647, 148102320, 186737708, 238609294, 286331153,
1412};
1413
1414
1415enum cpuacct_stat_index {
1416 CPUACCT_STAT_USER,
1417 CPUACCT_STAT_SYSTEM,
1418
1419 CPUACCT_STAT_NSTATS,
1420};
1421
1422#ifdef CONFIG_CGROUP_CPUACCT
1423static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1424static void cpuacct_update_stats(struct task_struct *tsk,
1425 enum cpuacct_stat_index idx, cputime_t val);
1426#else
1427static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1428static inline void cpuacct_update_stats(struct task_struct *tsk,
1429 enum cpuacct_stat_index idx, cputime_t val) {}
1430#endif
1431
1432static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1433{
1434 update_load_add(&rq->load, load);
1435}
1436
1437static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1438{
1439 update_load_sub(&rq->load, load);
1440}
1441
1442#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1443typedef int (*tg_visitor)(struct task_group *, void *);
1444
1445
1446
1447
1448
1449static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1450{
1451 struct task_group *parent, *child;
1452 int ret;
1453
1454 rcu_read_lock();
1455 parent = &root_task_group;
1456down:
1457 ret = (*down)(parent, data);
1458 if (ret)
1459 goto out_unlock;
1460 list_for_each_entry_rcu(child, &parent->children, siblings) {
1461 parent = child;
1462 goto down;
1463
1464up:
1465 continue;
1466 }
1467 ret = (*up)(parent, data);
1468 if (ret)
1469 goto out_unlock;
1470
1471 child = parent;
1472 parent = parent->parent;
1473 if (parent)
1474 goto up;
1475out_unlock:
1476 rcu_read_unlock();
1477
1478 return ret;
1479}
1480
1481static int tg_nop(struct task_group *tg, void *data)
1482{
1483 return 0;
1484}
1485#endif
1486
1487#ifdef CONFIG_SMP
1488
1489static unsigned long weighted_cpuload(const int cpu)
1490{
1491 return cpu_rq(cpu)->load.weight;
1492}
1493
1494
1495
1496
1497
1498
1499
1500
1501static unsigned long source_load(int cpu, int type)
1502{
1503 struct rq *rq = cpu_rq(cpu);
1504 unsigned long total = weighted_cpuload(cpu);
1505
1506 if (type == 0 || !sched_feat(LB_BIAS))
1507 return total;
1508
1509 return min(rq->cpu_load[type-1], total);
1510}
1511
1512
1513
1514
1515
1516static unsigned long target_load(int cpu, int type)
1517{
1518 struct rq *rq = cpu_rq(cpu);
1519 unsigned long total = weighted_cpuload(cpu);
1520
1521 if (type == 0 || !sched_feat(LB_BIAS))
1522 return total;
1523
1524 return max(rq->cpu_load[type-1], total);
1525}
1526
1527static unsigned long power_of(int cpu)
1528{
1529 return cpu_rq(cpu)->cpu_power;
1530}
1531
1532static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1533
1534static unsigned long cpu_avg_load_per_task(int cpu)
1535{
1536 struct rq *rq = cpu_rq(cpu);
1537 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1538
1539 if (nr_running)
1540 rq->avg_load_per_task = rq->load.weight / nr_running;
1541 else
1542 rq->avg_load_per_task = 0;
1543
1544 return rq->avg_load_per_task;
1545}
1546
1547#ifdef CONFIG_FAIR_GROUP_SCHED
1548
1549static __read_mostly unsigned long __percpu *update_shares_data;
1550
1551static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1552
1553
1554
1555
1556static void update_group_shares_cpu(struct task_group *tg, int cpu,
1557 unsigned long sd_shares,
1558 unsigned long sd_rq_weight,
1559 unsigned long *usd_rq_weight)
1560{
1561 unsigned long shares, rq_weight;
1562 int boost = 0;
1563
1564 rq_weight = usd_rq_weight[cpu];
1565 if (!rq_weight) {
1566 boost = 1;
1567 rq_weight = NICE_0_LOAD;
1568 }
1569
1570
1571
1572
1573
1574
1575 shares = (sd_shares * rq_weight) / sd_rq_weight;
1576 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1577
1578 if (abs(shares - tg->se[cpu]->load.weight) >
1579 sysctl_sched_shares_thresh) {
1580 struct rq *rq = cpu_rq(cpu);
1581 unsigned long flags;
1582
1583 raw_spin_lock_irqsave(&rq->lock, flags);
1584 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1585 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1586 __set_se_shares(tg->se[cpu], shares);
1587 raw_spin_unlock_irqrestore(&rq->lock, flags);
1588 }
1589}
1590
1591
1592
1593
1594
1595
1596static int tg_shares_up(struct task_group *tg, void *data)
1597{
1598 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1599 unsigned long *usd_rq_weight;
1600 struct sched_domain *sd = data;
1601 unsigned long flags;
1602 int i;
1603
1604 if (!tg->se[0])
1605 return 0;
1606
1607 local_irq_save(flags);
1608 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1609
1610 for_each_cpu(i, sched_domain_span(sd)) {
1611 weight = tg->cfs_rq[i]->load.weight;
1612 usd_rq_weight[i] = weight;
1613
1614 rq_weight += weight;
1615
1616
1617
1618
1619
1620 if (!weight)
1621 weight = NICE_0_LOAD;
1622
1623 sum_weight += weight;
1624 shares += tg->cfs_rq[i]->shares;
1625 }
1626
1627 if (!rq_weight)
1628 rq_weight = sum_weight;
1629
1630 if ((!shares && rq_weight) || shares > tg->shares)
1631 shares = tg->shares;
1632
1633 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1634 shares = tg->shares;
1635
1636 for_each_cpu(i, sched_domain_span(sd))
1637 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1638
1639 local_irq_restore(flags);
1640
1641 return 0;
1642}
1643
1644
1645
1646
1647
1648
1649static int tg_load_down(struct task_group *tg, void *data)
1650{
1651 unsigned long load;
1652 long cpu = (long)data;
1653
1654 if (!tg->parent) {
1655 load = cpu_rq(cpu)->load.weight;
1656 } else {
1657 load = tg->parent->cfs_rq[cpu]->h_load;
1658 load *= tg->cfs_rq[cpu]->shares;
1659 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1660 }
1661
1662 tg->cfs_rq[cpu]->h_load = load;
1663
1664 return 0;
1665}
1666
1667static void update_shares(struct sched_domain *sd)
1668{
1669 s64 elapsed;
1670 u64 now;
1671
1672 if (root_task_group_empty())
1673 return;
1674
1675 now = local_clock();
1676 elapsed = now - sd->last_update;
1677
1678 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1679 sd->last_update = now;
1680 walk_tg_tree(tg_nop, tg_shares_up, sd);
1681 }
1682}
1683
1684static void update_h_load(long cpu)
1685{
1686 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1687}
1688
1689#else
1690
1691static inline void update_shares(struct sched_domain *sd)
1692{
1693}
1694
1695#endif
1696
1697#ifdef CONFIG_PREEMPT
1698
1699static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1710 __releases(this_rq->lock)
1711 __acquires(busiest->lock)
1712 __acquires(this_rq->lock)
1713{
1714 raw_spin_unlock(&this_rq->lock);
1715 double_rq_lock(this_rq, busiest);
1716
1717 return 1;
1718}
1719
1720#else
1721
1722
1723
1724
1725
1726
1727
1728static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1729 __releases(this_rq->lock)
1730 __acquires(busiest->lock)
1731 __acquires(this_rq->lock)
1732{
1733 int ret = 0;
1734
1735 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1736 if (busiest < this_rq) {
1737 raw_spin_unlock(&this_rq->lock);
1738 raw_spin_lock(&busiest->lock);
1739 raw_spin_lock_nested(&this_rq->lock,
1740 SINGLE_DEPTH_NESTING);
1741 ret = 1;
1742 } else
1743 raw_spin_lock_nested(&busiest->lock,
1744 SINGLE_DEPTH_NESTING);
1745 }
1746 return ret;
1747}
1748
1749#endif
1750
1751
1752
1753
1754static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1755{
1756 if (unlikely(!irqs_disabled())) {
1757
1758 raw_spin_unlock(&this_rq->lock);
1759 BUG_ON(1);
1760 }
1761
1762 return _double_lock_balance(this_rq, busiest);
1763}
1764
1765static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1766 __releases(busiest->lock)
1767{
1768 raw_spin_unlock(&busiest->lock);
1769 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1770}
1771
1772
1773
1774
1775
1776
1777
1778static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1779 __acquires(rq1->lock)
1780 __acquires(rq2->lock)
1781{
1782 BUG_ON(!irqs_disabled());
1783 if (rq1 == rq2) {
1784 raw_spin_lock(&rq1->lock);
1785 __acquire(rq2->lock);
1786 } else {
1787 if (rq1 < rq2) {
1788 raw_spin_lock(&rq1->lock);
1789 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1790 } else {
1791 raw_spin_lock(&rq2->lock);
1792 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1793 }
1794 }
1795}
1796
1797
1798
1799
1800
1801
1802
1803static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1804 __releases(rq1->lock)
1805 __releases(rq2->lock)
1806{
1807 raw_spin_unlock(&rq1->lock);
1808 if (rq1 != rq2)
1809 raw_spin_unlock(&rq2->lock);
1810 else
1811 __release(rq2->lock);
1812}
1813
1814#endif
1815
1816#ifdef CONFIG_FAIR_GROUP_SCHED
1817static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1818{
1819#ifdef CONFIG_SMP
1820 cfs_rq->shares = shares;
1821#endif
1822}
1823#endif
1824
1825static void calc_load_account_idle(struct rq *this_rq);
1826static void update_sysctl(void);
1827static int get_update_sysctl_factor(void);
1828static void update_cpu_load(struct rq *this_rq);
1829
1830static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1831{
1832 set_task_rq(p, cpu);
1833#ifdef CONFIG_SMP
1834
1835
1836
1837
1838
1839 smp_wmb();
1840 task_thread_info(p)->cpu = cpu;
1841#endif
1842}
1843
1844static const struct sched_class rt_sched_class;
1845
1846#define sched_class_highest (&stop_sched_class)
1847#define for_each_class(class) \
1848 for (class = sched_class_highest; class; class = class->next)
1849
1850#include "sched_stats.h"
1851
1852static void inc_nr_running(struct rq *rq)
1853{
1854 rq->nr_running++;
1855}
1856
1857static void dec_nr_running(struct rq *rq)
1858{
1859 rq->nr_running--;
1860}
1861
1862static void set_load_weight(struct task_struct *p)
1863{
1864
1865
1866
1867 if (p->policy == SCHED_IDLE) {
1868 p->se.load.weight = WEIGHT_IDLEPRIO;
1869 p->se.load.inv_weight = WMULT_IDLEPRIO;
1870 return;
1871 }
1872
1873 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1874 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1875}
1876
1877static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1878{
1879 update_rq_clock(rq);
1880 sched_info_queued(p);
1881 p->sched_class->enqueue_task(rq, p, flags);
1882 p->se.on_rq = 1;
1883}
1884
1885static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1886{
1887 update_rq_clock(rq);
1888 sched_info_dequeued(p);
1889 p->sched_class->dequeue_task(rq, p, flags);
1890 p->se.on_rq = 0;
1891}
1892
1893
1894
1895
1896static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1897{
1898 if (task_contributes_to_load(p))
1899 rq->nr_uninterruptible--;
1900
1901 enqueue_task(rq, p, flags);
1902 inc_nr_running(rq);
1903}
1904
1905
1906
1907
1908static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1909{
1910 if (task_contributes_to_load(p))
1911 rq->nr_uninterruptible++;
1912
1913 dequeue_task(rq, p, flags);
1914 dec_nr_running(rq);
1915}
1916
1917#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1931static DEFINE_PER_CPU(u64, cpu_softirq_time);
1932
1933static DEFINE_PER_CPU(u64, irq_start_time);
1934static int sched_clock_irqtime;
1935
1936void enable_sched_clock_irqtime(void)
1937{
1938 sched_clock_irqtime = 1;
1939}
1940
1941void disable_sched_clock_irqtime(void)
1942{
1943 sched_clock_irqtime = 0;
1944}
1945
1946#ifndef CONFIG_64BIT
1947static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1948
1949static inline void irq_time_write_begin(void)
1950{
1951 __this_cpu_inc(irq_time_seq.sequence);
1952 smp_wmb();
1953}
1954
1955static inline void irq_time_write_end(void)
1956{
1957 smp_wmb();
1958 __this_cpu_inc(irq_time_seq.sequence);
1959}
1960
1961static inline u64 irq_time_read(int cpu)
1962{
1963 u64 irq_time;
1964 unsigned seq;
1965
1966 do {
1967 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1968 irq_time = per_cpu(cpu_softirq_time, cpu) +
1969 per_cpu(cpu_hardirq_time, cpu);
1970 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1971
1972 return irq_time;
1973}
1974#else
1975static inline void irq_time_write_begin(void)
1976{
1977}
1978
1979static inline void irq_time_write_end(void)
1980{
1981}
1982
1983static inline u64 irq_time_read(int cpu)
1984{
1985 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1986}
1987#endif
1988
1989
1990
1991
1992
1993void account_system_vtime(struct task_struct *curr)
1994{
1995 unsigned long flags;
1996 s64 delta;
1997 int cpu;
1998
1999 if (!sched_clock_irqtime)
2000 return;
2001
2002 local_irq_save(flags);
2003
2004 cpu = smp_processor_id();
2005 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
2006 __this_cpu_add(irq_start_time, delta);
2007
2008 irq_time_write_begin();
2009
2010
2011
2012
2013
2014
2015 if (hardirq_count())
2016 __this_cpu_add(cpu_hardirq_time, delta);
2017 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
2018 __this_cpu_add(cpu_softirq_time, delta);
2019
2020 irq_time_write_end();
2021 local_irq_restore(flags);
2022}
2023EXPORT_SYMBOL_GPL(account_system_vtime);
2024
2025static void update_rq_clock_task(struct rq *rq, s64 delta)
2026{
2027 s64 irq_delta;
2028
2029 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046 if (irq_delta > delta)
2047 irq_delta = delta;
2048
2049 rq->prev_irq_time += irq_delta;
2050 delta -= irq_delta;
2051 rq->clock_task += delta;
2052
2053 if (irq_delta && sched_feat(NONIRQ_POWER))
2054 sched_rt_avg_update(rq, irq_delta);
2055}
2056
2057#else
2058
2059static void update_rq_clock_task(struct rq *rq, s64 delta)
2060{
2061 rq->clock_task += delta;
2062}
2063
2064#endif
2065
2066#include "sched_idletask.c"
2067#include "sched_fair.c"
2068#include "sched_rt.c"
2069#include "sched_stoptask.c"
2070#ifdef CONFIG_SCHED_DEBUG
2071# include "sched_debug.c"
2072#endif
2073
2074void sched_set_stop_task(int cpu, struct task_struct *stop)
2075{
2076 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2077 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2078
2079 if (stop) {
2080
2081
2082
2083
2084
2085
2086
2087
2088 sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m);
2089
2090 stop->sched_class = &stop_sched_class;
2091 }
2092
2093 cpu_rq(cpu)->stop = stop;
2094
2095 if (old_stop) {
2096
2097
2098
2099
2100 old_stop->sched_class = &rt_sched_class;
2101 }
2102}
2103
2104
2105
2106
2107static inline int __normal_prio(struct task_struct *p)
2108{
2109 return p->static_prio;
2110}
2111
2112
2113
2114
2115
2116
2117
2118
2119static inline int normal_prio(struct task_struct *p)
2120{
2121 int prio;
2122
2123 if (task_has_rt_policy(p))
2124 prio = MAX_RT_PRIO-1 - p->rt_priority;
2125 else
2126 prio = __normal_prio(p);
2127 return prio;
2128}
2129
2130
2131
2132
2133
2134
2135
2136
2137static int effective_prio(struct task_struct *p)
2138{
2139 p->normal_prio = normal_prio(p);
2140
2141
2142
2143
2144
2145 if (!rt_prio(p->prio))
2146 return p->normal_prio;
2147 return p->prio;
2148}
2149
2150
2151
2152
2153
2154inline int task_curr(const struct task_struct *p)
2155{
2156 return cpu_curr(task_cpu(p)) == p;
2157}
2158
2159static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2160 const struct sched_class *prev_class,
2161 int oldprio, int running)
2162{
2163 if (prev_class != p->sched_class) {
2164 if (prev_class->switched_from)
2165 prev_class->switched_from(rq, p, running);
2166 p->sched_class->switched_to(rq, p, running);
2167 } else
2168 p->sched_class->prio_changed(rq, p, oldprio, running);
2169}
2170
2171static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2172{
2173 const struct sched_class *class;
2174
2175 if (p->sched_class == rq->curr->sched_class) {
2176 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2177 } else {
2178 for_each_class(class) {
2179 if (class == rq->curr->sched_class)
2180 break;
2181 if (class == p->sched_class) {
2182 resched_task(rq->curr);
2183 break;
2184 }
2185 }
2186 }
2187
2188
2189
2190
2191
2192 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
2193 rq->skip_clock_update = 1;
2194}
2195
2196#ifdef CONFIG_SMP
2197
2198
2199
2200static int
2201task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2202{
2203 s64 delta;
2204
2205 if (p->sched_class != &fair_sched_class)
2206 return 0;
2207
2208 if (unlikely(p->policy == SCHED_IDLE))
2209 return 0;
2210
2211
2212
2213
2214 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2215 (&p->se == cfs_rq_of(&p->se)->next ||
2216 &p->se == cfs_rq_of(&p->se)->last))
2217 return 1;
2218
2219 if (sysctl_sched_migration_cost == -1)
2220 return 1;
2221 if (sysctl_sched_migration_cost == 0)
2222 return 0;
2223
2224 delta = now - p->se.exec_start;
2225
2226 return delta < (s64)sysctl_sched_migration_cost;
2227}
2228
2229void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2230{
2231#ifdef CONFIG_SCHED_DEBUG
2232
2233
2234
2235
2236 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2237 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2238#endif
2239
2240 trace_sched_migrate_task(p, new_cpu);
2241
2242 if (task_cpu(p) != new_cpu) {
2243 p->se.nr_migrations++;
2244 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2245 }
2246
2247 __set_task_cpu(p, new_cpu);
2248}
2249
2250struct migration_arg {
2251 struct task_struct *task;
2252 int dest_cpu;
2253};
2254
2255static int migration_cpu_stop(void *data);
2256
2257
2258
2259
2260
2261static bool migrate_task(struct task_struct *p, int dest_cpu)
2262{
2263 struct rq *rq = task_rq(p);
2264
2265
2266
2267
2268
2269 return p->se.on_rq || task_running(rq, p);
2270}
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2289{
2290 unsigned long flags;
2291 int running, on_rq;
2292 unsigned long ncsw;
2293 struct rq *rq;
2294
2295 for (;;) {
2296
2297
2298
2299
2300
2301
2302 rq = task_rq(p);
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315 while (task_running(rq, p)) {
2316 if (match_state && unlikely(p->state != match_state))
2317 return 0;
2318 cpu_relax();
2319 }
2320
2321
2322
2323
2324
2325
2326 rq = task_rq_lock(p, &flags);
2327 trace_sched_wait_task(p);
2328 running = task_running(rq, p);
2329 on_rq = p->se.on_rq;
2330 ncsw = 0;
2331 if (!match_state || p->state == match_state)
2332 ncsw = p->nvcsw | LONG_MIN;
2333 task_rq_unlock(rq, &flags);
2334
2335
2336
2337
2338 if (unlikely(!ncsw))
2339 break;
2340
2341
2342
2343
2344
2345
2346
2347 if (unlikely(running)) {
2348 cpu_relax();
2349 continue;
2350 }
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361 if (unlikely(on_rq)) {
2362 schedule_timeout_uninterruptible(1);
2363 continue;
2364 }
2365
2366
2367
2368
2369
2370
2371 break;
2372 }
2373
2374 return ncsw;
2375}
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390void kick_process(struct task_struct *p)
2391{
2392 int cpu;
2393
2394 preempt_disable();
2395 cpu = task_cpu(p);
2396 if ((cpu != smp_processor_id()) && task_curr(p))
2397 smp_send_reschedule(cpu);
2398 preempt_enable();
2399}
2400EXPORT_SYMBOL_GPL(kick_process);
2401#endif
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412void task_oncpu_function_call(struct task_struct *p,
2413 void (*func) (void *info), void *info)
2414{
2415 int cpu;
2416
2417 preempt_disable();
2418 cpu = task_cpu(p);
2419 if (task_curr(p))
2420 smp_call_function_single(cpu, func, info, 1);
2421 preempt_enable();
2422}
2423
2424#ifdef CONFIG_SMP
2425
2426
2427
2428static int select_fallback_rq(int cpu, struct task_struct *p)
2429{
2430 int dest_cpu;
2431 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2432
2433
2434 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2435 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2436 return dest_cpu;
2437
2438
2439 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2440 if (dest_cpu < nr_cpu_ids)
2441 return dest_cpu;
2442
2443
2444 if (unlikely(dest_cpu >= nr_cpu_ids)) {
2445 dest_cpu = cpuset_cpus_allowed_fallback(p);
2446
2447
2448
2449
2450
2451 if (p->mm && printk_ratelimit()) {
2452 printk(KERN_INFO "process %d (%s) no "
2453 "longer affine to cpu%d\n",
2454 task_pid_nr(p), p->comm, cpu);
2455 }
2456 }
2457
2458 return dest_cpu;
2459}
2460
2461
2462
2463
2464static inline
2465int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
2466{
2467 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2480 !cpu_online(cpu)))
2481 cpu = select_fallback_rq(task_cpu(p), p);
2482
2483 return cpu;
2484}
2485
2486static void update_avg(u64 *avg, u64 sample)
2487{
2488 s64 diff = sample - *avg;
2489 *avg += diff >> 3;
2490}
2491#endif
2492
2493static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
2494 bool is_sync, bool is_migrate, bool is_local,
2495 unsigned long en_flags)
2496{
2497 schedstat_inc(p, se.statistics.nr_wakeups);
2498 if (is_sync)
2499 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2500 if (is_migrate)
2501 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2502 if (is_local)
2503 schedstat_inc(p, se.statistics.nr_wakeups_local);
2504 else
2505 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2506
2507 activate_task(rq, p, en_flags);
2508}
2509
2510static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2511 int wake_flags, bool success)
2512{
2513 trace_sched_wakeup(p, success);
2514 check_preempt_curr(rq, p, wake_flags);
2515
2516 p->state = TASK_RUNNING;
2517#ifdef CONFIG_SMP
2518 if (p->sched_class->task_woken)
2519 p->sched_class->task_woken(rq, p);
2520
2521 if (unlikely(rq->idle_stamp)) {
2522 u64 delta = rq->clock - rq->idle_stamp;
2523 u64 max = 2*sysctl_sched_migration_cost;
2524
2525 if (delta > max)
2526 rq->avg_idle = max;
2527 else
2528 update_avg(&rq->avg_idle, delta);
2529 rq->idle_stamp = 0;
2530 }
2531#endif
2532
2533 if ((p->flags & PF_WQ_WORKER) && success)
2534 wq_worker_waking_up(p, cpu_of(rq));
2535}
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552static int try_to_wake_up(struct task_struct *p, unsigned int state,
2553 int wake_flags)
2554{
2555 int cpu, orig_cpu, this_cpu, success = 0;
2556 unsigned long flags;
2557 unsigned long en_flags = ENQUEUE_WAKEUP;
2558 struct rq *rq;
2559
2560 this_cpu = get_cpu();
2561
2562 smp_wmb();
2563 rq = task_rq_lock(p, &flags);
2564 if (!(p->state & state))
2565 goto out;
2566
2567 if (p->se.on_rq)
2568 goto out_running;
2569
2570 cpu = task_cpu(p);
2571 orig_cpu = cpu;
2572
2573#ifdef CONFIG_SMP
2574 if (unlikely(task_running(rq, p)))
2575 goto out_activate;
2576
2577
2578
2579
2580
2581
2582
2583 if (task_contributes_to_load(p)) {
2584 if (likely(cpu_online(orig_cpu)))
2585 rq->nr_uninterruptible--;
2586 else
2587 this_rq()->nr_uninterruptible--;
2588 }
2589 p->state = TASK_WAKING;
2590
2591 if (p->sched_class->task_waking) {
2592 p->sched_class->task_waking(rq, p);
2593 en_flags |= ENQUEUE_WAKING;
2594 }
2595
2596 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2597 if (cpu != orig_cpu)
2598 set_task_cpu(p, cpu);
2599 __task_rq_unlock(rq);
2600
2601 rq = cpu_rq(cpu);
2602 raw_spin_lock(&rq->lock);
2603
2604
2605
2606
2607
2608
2609
2610 WARN_ON(task_cpu(p) != cpu);
2611 WARN_ON(p->state != TASK_WAKING);
2612
2613#ifdef CONFIG_SCHEDSTATS
2614 schedstat_inc(rq, ttwu_count);
2615 if (cpu == this_cpu)
2616 schedstat_inc(rq, ttwu_local);
2617 else {
2618 struct sched_domain *sd;
2619 for_each_domain(this_cpu, sd) {
2620 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2621 schedstat_inc(sd, ttwu_wake_remote);
2622 break;
2623 }
2624 }
2625 }
2626#endif
2627
2628out_activate:
2629#endif
2630 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
2631 cpu == this_cpu, en_flags);
2632 success = 1;
2633out_running:
2634 ttwu_post_activation(p, rq, wake_flags, success);
2635out:
2636 task_rq_unlock(rq, &flags);
2637 put_cpu();
2638
2639 return success;
2640}
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650static void try_to_wake_up_local(struct task_struct *p)
2651{
2652 struct rq *rq = task_rq(p);
2653 bool success = false;
2654
2655 BUG_ON(rq != this_rq());
2656 BUG_ON(p == current);
2657 lockdep_assert_held(&rq->lock);
2658
2659 if (!(p->state & TASK_NORMAL))
2660 return;
2661
2662 if (!p->se.on_rq) {
2663 if (likely(!task_running(rq, p))) {
2664 schedstat_inc(rq, ttwu_count);
2665 schedstat_inc(rq, ttwu_local);
2666 }
2667 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
2668 success = true;
2669 }
2670 ttwu_post_activation(p, rq, 0, success);
2671}
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684int wake_up_process(struct task_struct *p)
2685{
2686 return try_to_wake_up(p, TASK_ALL, 0);
2687}
2688EXPORT_SYMBOL(wake_up_process);
2689
2690int wake_up_state(struct task_struct *p, unsigned int state)
2691{
2692 return try_to_wake_up(p, state, 0);
2693}
2694
2695
2696
2697
2698
2699
2700
2701static void __sched_fork(struct task_struct *p)
2702{
2703 p->se.exec_start = 0;
2704 p->se.sum_exec_runtime = 0;
2705 p->se.prev_sum_exec_runtime = 0;
2706 p->se.nr_migrations = 0;
2707
2708#ifdef CONFIG_SCHEDSTATS
2709 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2710#endif
2711
2712 INIT_LIST_HEAD(&p->rt.run_list);
2713 p->se.on_rq = 0;
2714 INIT_LIST_HEAD(&p->se.group_node);
2715
2716#ifdef CONFIG_PREEMPT_NOTIFIERS
2717 INIT_HLIST_HEAD(&p->preempt_notifiers);
2718#endif
2719}
2720
2721
2722
2723
2724void sched_fork(struct task_struct *p, int clone_flags)
2725{
2726 int cpu = get_cpu();
2727
2728 __sched_fork(p);
2729
2730
2731
2732
2733
2734 p->state = TASK_RUNNING;
2735
2736
2737
2738
2739 if (unlikely(p->sched_reset_on_fork)) {
2740 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2741 p->policy = SCHED_NORMAL;
2742 p->normal_prio = p->static_prio;
2743 }
2744
2745 if (PRIO_TO_NICE(p->static_prio) < 0) {
2746 p->static_prio = NICE_TO_PRIO(0);
2747 p->normal_prio = p->static_prio;
2748 set_load_weight(p);
2749 }
2750
2751
2752
2753
2754
2755 p->sched_reset_on_fork = 0;
2756 }
2757
2758
2759
2760
2761 p->prio = current->normal_prio;
2762
2763 if (!rt_prio(p->prio))
2764 p->sched_class = &fair_sched_class;
2765
2766 if (p->sched_class->task_fork)
2767 p->sched_class->task_fork(p);
2768
2769
2770
2771
2772
2773
2774
2775
2776 rcu_read_lock();
2777 set_task_cpu(p, cpu);
2778 rcu_read_unlock();
2779
2780#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2781 if (likely(sched_info_on()))
2782 memset(&p->sched_info, 0, sizeof(p->sched_info));
2783#endif
2784#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2785 p->oncpu = 0;
2786#endif
2787#ifdef CONFIG_PREEMPT
2788
2789 task_thread_info(p)->preempt_count = 1;
2790#endif
2791 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2792
2793 put_cpu();
2794}
2795
2796
2797
2798
2799
2800
2801
2802
2803void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2804{
2805 unsigned long flags;
2806 struct rq *rq;
2807 int cpu __maybe_unused = get_cpu();
2808
2809#ifdef CONFIG_SMP
2810 rq = task_rq_lock(p, &flags);
2811 p->state = TASK_WAKING;
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
2822 set_task_cpu(p, cpu);
2823
2824 p->state = TASK_RUNNING;
2825 task_rq_unlock(rq, &flags);
2826#endif
2827
2828 rq = task_rq_lock(p, &flags);
2829 activate_task(rq, p, 0);
2830 trace_sched_wakeup_new(p, 1);
2831 check_preempt_curr(rq, p, WF_FORK);
2832#ifdef CONFIG_SMP
2833 if (p->sched_class->task_woken)
2834 p->sched_class->task_woken(rq, p);
2835#endif
2836 task_rq_unlock(rq, &flags);
2837 put_cpu();
2838}
2839
2840#ifdef CONFIG_PREEMPT_NOTIFIERS
2841
2842
2843
2844
2845
2846void preempt_notifier_register(struct preempt_notifier *notifier)
2847{
2848 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers);
2849}
2850EXPORT_SYMBOL_GPL(preempt_notifier_register);
2851
2852
2853
2854
2855
2856
2857
2858void preempt_notifier_unregister(struct preempt_notifier *notifier)
2859{
2860 hlist_del(¬ifier->link);
2861}
2862EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2863
2864static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2865{
2866 struct preempt_notifier *notifier;
2867 struct hlist_node *node;
2868
2869 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2870 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2871}
2872
2873static void
2874fire_sched_out_preempt_notifiers(struct task_struct *curr,
2875 struct task_struct *next)
2876{
2877 struct preempt_notifier *notifier;
2878 struct hlist_node *node;
2879
2880 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2881 notifier->ops->sched_out(notifier, next);
2882}
2883
2884#else
2885
2886static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2887{
2888}
2889
2890static void
2891fire_sched_out_preempt_notifiers(struct task_struct *curr,
2892 struct task_struct *next)
2893{
2894}
2895
2896#endif
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911static inline void
2912prepare_task_switch(struct rq *rq, struct task_struct *prev,
2913 struct task_struct *next)
2914{
2915 fire_sched_out_preempt_notifiers(prev, next);
2916 prepare_lock_switch(rq, next);
2917 prepare_arch_switch(next);
2918}
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2936 __releases(rq->lock)
2937{
2938 struct mm_struct *mm = rq->prev_mm;
2939 long prev_state;
2940
2941 rq->prev_mm = NULL;
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954 prev_state = prev->state;
2955 finish_arch_switch(prev);
2956#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2957 local_irq_disable();
2958#endif
2959 perf_event_task_sched_in(current);
2960#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2961 local_irq_enable();
2962#endif
2963 finish_lock_switch(rq, prev);
2964
2965 fire_sched_in_preempt_notifiers(current);
2966 if (mm)
2967 mmdrop(mm);
2968 if (unlikely(prev_state == TASK_DEAD)) {
2969
2970
2971
2972
2973 kprobe_flush_task(prev);
2974 put_task_struct(prev);
2975 }
2976}
2977
2978#ifdef CONFIG_SMP
2979
2980
2981static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2982{
2983 if (prev->sched_class->pre_schedule)
2984 prev->sched_class->pre_schedule(rq, prev);
2985}
2986
2987
2988static inline void post_schedule(struct rq *rq)
2989{
2990 if (rq->post_schedule) {
2991 unsigned long flags;
2992
2993 raw_spin_lock_irqsave(&rq->lock, flags);
2994 if (rq->curr->sched_class->post_schedule)
2995 rq->curr->sched_class->post_schedule(rq);
2996 raw_spin_unlock_irqrestore(&rq->lock, flags);
2997
2998 rq->post_schedule = 0;
2999 }
3000}
3001
3002#else
3003
3004static inline void pre_schedule(struct rq *rq, struct task_struct *p)
3005{
3006}
3007
3008static inline void post_schedule(struct rq *rq)
3009{
3010}
3011
3012#endif
3013
3014
3015
3016
3017
3018asmlinkage void schedule_tail(struct task_struct *prev)
3019 __releases(rq->lock)
3020{
3021 struct rq *rq = this_rq();
3022
3023 finish_task_switch(rq, prev);
3024
3025
3026
3027
3028
3029 post_schedule(rq);
3030
3031#ifdef __ARCH_WANT_UNLOCKED_CTXSW
3032
3033 preempt_enable();
3034#endif
3035 if (current->set_child_tid)
3036 put_user(task_pid_vnr(current), current->set_child_tid);
3037}
3038
3039
3040
3041
3042
3043static inline void
3044context_switch(struct rq *rq, struct task_struct *prev,
3045 struct task_struct *next)
3046{
3047 struct mm_struct *mm, *oldmm;
3048
3049 prepare_task_switch(rq, prev, next);
3050 trace_sched_switch(prev, next);
3051 mm = next->mm;
3052 oldmm = prev->active_mm;
3053
3054
3055
3056
3057
3058 arch_start_context_switch(prev);
3059
3060 if (!mm) {
3061 next->active_mm = oldmm;
3062 atomic_inc(&oldmm->mm_count);
3063 enter_lazy_tlb(oldmm, next);
3064 } else
3065 switch_mm(oldmm, mm, next);
3066
3067 if (!prev->mm) {
3068 prev->active_mm = NULL;
3069 rq->prev_mm = oldmm;
3070 }
3071
3072
3073
3074
3075
3076
3077#ifndef __ARCH_WANT_UNLOCKED_CTXSW
3078 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3079#endif
3080
3081
3082 switch_to(prev, next, prev);
3083
3084 barrier();
3085
3086
3087
3088
3089
3090 finish_task_switch(this_rq(), prev);
3091}
3092
3093
3094
3095
3096
3097
3098
3099
3100unsigned long nr_running(void)
3101{
3102 unsigned long i, sum = 0;
3103
3104 for_each_online_cpu(i)
3105 sum += cpu_rq(i)->nr_running;
3106
3107 return sum;
3108}
3109
3110unsigned long nr_uninterruptible(void)
3111{
3112 unsigned long i, sum = 0;
3113
3114 for_each_possible_cpu(i)
3115 sum += cpu_rq(i)->nr_uninterruptible;
3116
3117
3118
3119
3120
3121 if (unlikely((long)sum < 0))
3122 sum = 0;
3123
3124 return sum;
3125}
3126
3127unsigned long long nr_context_switches(void)
3128{
3129 int i;
3130 unsigned long long sum = 0;
3131
3132 for_each_possible_cpu(i)
3133 sum += cpu_rq(i)->nr_switches;
3134
3135 return sum;
3136}
3137
3138unsigned long nr_iowait(void)
3139{
3140 unsigned long i, sum = 0;
3141
3142 for_each_possible_cpu(i)
3143 sum += atomic_read(&cpu_rq(i)->nr_iowait);
3144
3145 return sum;
3146}
3147
3148unsigned long nr_iowait_cpu(int cpu)
3149{
3150 struct rq *this = cpu_rq(cpu);
3151 return atomic_read(&this->nr_iowait);
3152}
3153
3154unsigned long this_cpu_load(void)
3155{
3156 struct rq *this = this_rq();
3157 return this->cpu_load[0];
3158}
3159
3160
3161
3162static atomic_long_t calc_load_tasks;
3163static unsigned long calc_load_update;
3164unsigned long avenrun[3];
3165EXPORT_SYMBOL(avenrun);
3166
3167static long calc_load_fold_active(struct rq *this_rq)
3168{
3169 long nr_active, delta = 0;
3170
3171 nr_active = this_rq->nr_running;
3172 nr_active += (long) this_rq->nr_uninterruptible;
3173
3174 if (nr_active != this_rq->calc_load_active) {
3175 delta = nr_active - this_rq->calc_load_active;
3176 this_rq->calc_load_active = nr_active;
3177 }
3178
3179 return delta;
3180}
3181
3182static unsigned long
3183calc_load(unsigned long load, unsigned long exp, unsigned long active)
3184{
3185 load *= exp;
3186 load += active * (FIXED_1 - exp);
3187 load += 1UL << (FSHIFT - 1);
3188 return load >> FSHIFT;
3189}
3190
3191#ifdef CONFIG_NO_HZ
3192
3193
3194
3195
3196
3197static atomic_long_t calc_load_tasks_idle;
3198
3199static void calc_load_account_idle(struct rq *this_rq)
3200{
3201 long delta;
3202
3203 delta = calc_load_fold_active(this_rq);
3204 if (delta)
3205 atomic_long_add(delta, &calc_load_tasks_idle);
3206}
3207
3208static long calc_load_fold_idle(void)
3209{
3210 long delta = 0;
3211
3212
3213
3214
3215 if (atomic_long_read(&calc_load_tasks_idle))
3216 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
3217
3218 return delta;
3219}
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236static unsigned long
3237fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3238{
3239 unsigned long result = 1UL << frac_bits;
3240
3241 if (n) for (;;) {
3242 if (n & 1) {
3243 result *= x;
3244 result += 1UL << (frac_bits - 1);
3245 result >>= frac_bits;
3246 }
3247 n >>= 1;
3248 if (!n)
3249 break;
3250 x *= x;
3251 x += 1UL << (frac_bits - 1);
3252 x >>= frac_bits;
3253 }
3254
3255 return result;
3256}
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281static unsigned long
3282calc_load_n(unsigned long load, unsigned long exp,
3283 unsigned long active, unsigned int n)
3284{
3285
3286 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3287}
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298static void calc_global_nohz(unsigned long ticks)
3299{
3300 long delta, active, n;
3301
3302 if (time_before(jiffies, calc_load_update))
3303 return;
3304
3305
3306
3307
3308
3309
3310
3311 delta = calc_load_fold_idle();
3312 if (delta)
3313 atomic_long_add(delta, &calc_load_tasks);
3314
3315
3316
3317
3318 if (ticks >= LOAD_FREQ) {
3319 n = ticks / LOAD_FREQ;
3320
3321 active = atomic_long_read(&calc_load_tasks);
3322 active = active > 0 ? active * FIXED_1 : 0;
3323
3324 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3325 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3326 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3327
3328 calc_load_update += n * LOAD_FREQ;
3329 }
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341}
3342#else
3343static void calc_load_account_idle(struct rq *this_rq)
3344{
3345}
3346
3347static inline long calc_load_fold_idle(void)
3348{
3349 return 0;
3350}
3351
3352static void calc_global_nohz(unsigned long ticks)
3353{
3354}
3355#endif
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3366{
3367 loads[0] = (avenrun[0] + offset) << shift;
3368 loads[1] = (avenrun[1] + offset) << shift;
3369 loads[2] = (avenrun[2] + offset) << shift;
3370}
3371
3372
3373
3374
3375
3376void calc_global_load(unsigned long ticks)
3377{
3378 long active;
3379
3380 calc_global_nohz(ticks);
3381
3382 if (time_before(jiffies, calc_load_update + 10))
3383 return;
3384
3385 active = atomic_long_read(&calc_load_tasks);
3386 active = active > 0 ? active * FIXED_1 : 0;
3387
3388 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
3389 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
3390 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
3391
3392 calc_load_update += LOAD_FREQ;
3393}
3394
3395
3396
3397
3398
3399static void calc_load_account_active(struct rq *this_rq)
3400{
3401 long delta;
3402
3403 if (time_before(jiffies, this_rq->calc_load_update))
3404 return;
3405
3406 delta = calc_load_fold_active(this_rq);
3407 delta += calc_load_fold_idle();
3408 if (delta)
3409 atomic_long_add(delta, &calc_load_tasks);
3410
3411 this_rq->calc_load_update += LOAD_FREQ;
3412}
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441#define DEGRADE_SHIFT 7
3442static const unsigned char
3443 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3444static const unsigned char
3445 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3446 {0, 0, 0, 0, 0, 0, 0, 0},
3447 {64, 32, 8, 0, 0, 0, 0, 0},
3448 {96, 72, 40, 12, 1, 0, 0},
3449 {112, 98, 75, 43, 15, 1, 0},
3450 {120, 112, 98, 76, 45, 16, 2} };
3451
3452
3453
3454
3455
3456
3457static unsigned long
3458decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3459{
3460 int j = 0;
3461
3462 if (!missed_updates)
3463 return load;
3464
3465 if (missed_updates >= degrade_zero_ticks[idx])
3466 return 0;
3467
3468 if (idx == 1)
3469 return load >> missed_updates;
3470
3471 while (missed_updates) {
3472 if (missed_updates % 2)
3473 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3474
3475 missed_updates >>= 1;
3476 j++;
3477 }
3478 return load;
3479}
3480
3481
3482
3483
3484
3485
3486static void update_cpu_load(struct rq *this_rq)
3487{
3488 unsigned long this_load = this_rq->load.weight;
3489 unsigned long curr_jiffies = jiffies;
3490 unsigned long pending_updates;
3491 int i, scale;
3492
3493 this_rq->nr_load_updates++;
3494
3495
3496 if (curr_jiffies == this_rq->last_load_update_tick)
3497 return;
3498
3499 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3500 this_rq->last_load_update_tick = curr_jiffies;
3501
3502
3503 this_rq->cpu_load[0] = this_load;
3504 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3505 unsigned long old_load, new_load;
3506
3507
3508
3509 old_load = this_rq->cpu_load[i];
3510 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3511 new_load = this_load;
3512
3513
3514
3515
3516
3517 if (new_load > old_load)
3518 new_load += scale - 1;
3519
3520 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3521 }
3522
3523 sched_avg_update(this_rq);
3524}
3525
3526static void update_cpu_load_active(struct rq *this_rq)
3527{
3528 update_cpu_load(this_rq);
3529
3530 calc_load_account_active(this_rq);
3531}
3532
3533#ifdef CONFIG_SMP
3534
3535
3536
3537
3538
3539void sched_exec(void)
3540{
3541 struct task_struct *p = current;
3542 unsigned long flags;
3543 struct rq *rq;
3544 int dest_cpu;
3545
3546 rq = task_rq_lock(p, &flags);
3547 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
3548 if (dest_cpu == smp_processor_id())
3549 goto unlock;
3550
3551
3552
3553
3554 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3555 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3556 struct migration_arg arg = { p, dest_cpu };
3557
3558 task_rq_unlock(rq, &flags);
3559 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
3560 return;
3561 }
3562unlock:
3563 task_rq_unlock(rq, &flags);
3564}
3565
3566#endif
3567
3568DEFINE_PER_CPU(struct kernel_stat, kstat);
3569
3570EXPORT_PER_CPU_SYMBOL(kstat);
3571
3572
3573
3574
3575
3576
3577
3578static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3579{
3580 u64 ns = 0;
3581
3582 if (task_current(rq, p)) {
3583 update_rq_clock(rq);
3584 ns = rq->clock_task - p->se.exec_start;
3585 if ((s64)ns < 0)
3586 ns = 0;
3587 }
3588
3589 return ns;
3590}
3591
3592unsigned long long task_delta_exec(struct task_struct *p)
3593{
3594 unsigned long flags;
3595 struct rq *rq;
3596 u64 ns = 0;
3597
3598 rq = task_rq_lock(p, &flags);
3599 ns = do_task_delta_exec(p, rq);
3600 task_rq_unlock(rq, &flags);
3601
3602 return ns;
3603}
3604
3605
3606
3607
3608
3609
3610unsigned long long task_sched_runtime(struct task_struct *p)
3611{
3612 unsigned long flags;
3613 struct rq *rq;
3614 u64 ns = 0;
3615
3616 rq = task_rq_lock(p, &flags);
3617 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3618 task_rq_unlock(rq, &flags);
3619
3620 return ns;
3621}
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632unsigned long long thread_group_sched_runtime(struct task_struct *p)
3633{
3634 struct task_cputime totals;
3635 unsigned long flags;
3636 struct rq *rq;
3637 u64 ns;
3638
3639 rq = task_rq_lock(p, &flags);
3640 thread_group_cputime(p, &totals);
3641 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3642 task_rq_unlock(rq, &flags);
3643
3644 return ns;
3645}
3646
3647
3648
3649
3650
3651
3652
3653void account_user_time(struct task_struct *p, cputime_t cputime,
3654 cputime_t cputime_scaled)
3655{
3656 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3657 cputime64_t tmp;
3658
3659
3660 p->utime = cputime_add(p->utime, cputime);
3661 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3662 account_group_user_time(p, cputime);
3663
3664
3665 tmp = cputime_to_cputime64(cputime);
3666 if (TASK_NICE(p) > 0)
3667 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3668 else
3669 cpustat->user = cputime64_add(cpustat->user, tmp);
3670
3671 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3672
3673 acct_update_integrals(p);
3674}
3675
3676
3677
3678
3679
3680
3681
3682static void account_guest_time(struct task_struct *p, cputime_t cputime,
3683 cputime_t cputime_scaled)
3684{
3685 cputime64_t tmp;
3686 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3687
3688 tmp = cputime_to_cputime64(cputime);
3689
3690
3691 p->utime = cputime_add(p->utime, cputime);
3692 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3693 account_group_user_time(p, cputime);
3694 p->gtime = cputime_add(p->gtime, cputime);
3695
3696
3697 if (TASK_NICE(p) > 0) {
3698 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3699 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
3700 } else {
3701 cpustat->user = cputime64_add(cpustat->user, tmp);
3702 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3703 }
3704}
3705
3706
3707
3708
3709
3710
3711
3712
3713void account_system_time(struct task_struct *p, int hardirq_offset,
3714 cputime_t cputime, cputime_t cputime_scaled)
3715{
3716 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3717 cputime64_t tmp;
3718
3719 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3720 account_guest_time(p, cputime, cputime_scaled);
3721 return;
3722 }
3723
3724
3725 p->stime = cputime_add(p->stime, cputime);
3726 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3727 account_group_system_time(p, cputime);
3728
3729
3730 tmp = cputime_to_cputime64(cputime);
3731 if (hardirq_count() - hardirq_offset)
3732 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3733 else if (in_serving_softirq())
3734 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3735 else
3736 cpustat->system = cputime64_add(cpustat->system, tmp);
3737
3738 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3739
3740
3741 acct_update_integrals(p);
3742}
3743
3744
3745
3746
3747
3748void account_steal_time(cputime_t cputime)
3749{
3750 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3751 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3752
3753 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
3754}
3755
3756
3757
3758
3759
3760void account_idle_time(cputime_t cputime)
3761{
3762 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3763 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3764 struct rq *rq = this_rq();
3765
3766 if (atomic_read(&rq->nr_iowait) > 0)
3767 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
3768 else
3769 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3770}
3771
3772#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3773
3774
3775
3776
3777
3778
3779void account_process_tick(struct task_struct *p, int user_tick)
3780{
3781 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3782 struct rq *rq = this_rq();
3783
3784 if (user_tick)
3785 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3786 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
3787 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
3788 one_jiffy_scaled);
3789 else
3790 account_idle_time(cputime_one_jiffy);
3791}
3792
3793
3794
3795
3796
3797
3798void account_steal_ticks(unsigned long ticks)
3799{
3800 account_steal_time(jiffies_to_cputime(ticks));
3801}
3802
3803
3804
3805
3806
3807void account_idle_ticks(unsigned long ticks)
3808{
3809 account_idle_time(jiffies_to_cputime(ticks));
3810}
3811
3812#endif
3813
3814
3815
3816
3817#ifdef CONFIG_VIRT_CPU_ACCOUNTING
3818void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3819{
3820 *ut = p->utime;
3821 *st = p->stime;
3822}
3823
3824void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3825{
3826 struct task_cputime cputime;
3827
3828 thread_group_cputime(p, &cputime);
3829
3830 *ut = cputime.utime;
3831 *st = cputime.stime;
3832}
3833#else
3834
3835#ifndef nsecs_to_cputime
3836# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3837#endif
3838
3839void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3840{
3841 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
3842
3843
3844
3845
3846 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3847
3848 if (total) {
3849 u64 temp = rtime;
3850
3851 temp *= utime;
3852 do_div(temp, total);
3853 utime = (cputime_t)temp;
3854 } else
3855 utime = rtime;
3856
3857
3858
3859
3860 p->prev_utime = max(p->prev_utime, utime);
3861 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
3862
3863 *ut = p->prev_utime;
3864 *st = p->prev_stime;
3865}
3866
3867
3868
3869
3870void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3871{
3872 struct signal_struct *sig = p->signal;
3873 struct task_cputime cputime;
3874 cputime_t rtime, utime, total;
3875
3876 thread_group_cputime(p, &cputime);
3877
3878 total = cputime_add(cputime.utime, cputime.stime);
3879 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3880
3881 if (total) {
3882 u64 temp = rtime;
3883
3884 temp *= cputime.utime;
3885 do_div(temp, total);
3886 utime = (cputime_t)temp;
3887 } else
3888 utime = rtime;
3889
3890 sig->prev_utime = max(sig->prev_utime, utime);
3891 sig->prev_stime = max(sig->prev_stime,
3892 cputime_sub(rtime, sig->prev_utime));
3893
3894 *ut = sig->prev_utime;
3895 *st = sig->prev_stime;
3896}
3897#endif
3898
3899
3900
3901
3902
3903
3904
3905
3906void scheduler_tick(void)
3907{
3908 int cpu = smp_processor_id();
3909 struct rq *rq = cpu_rq(cpu);
3910 struct task_struct *curr = rq->curr;
3911
3912 sched_clock_tick();
3913
3914 raw_spin_lock(&rq->lock);
3915 update_rq_clock(rq);
3916 update_cpu_load_active(rq);
3917 curr->sched_class->task_tick(rq, curr, 0);
3918 raw_spin_unlock(&rq->lock);
3919
3920 perf_event_task_tick();
3921
3922#ifdef CONFIG_SMP
3923 rq->idle_at_tick = idle_cpu(cpu);
3924 trigger_load_balance(rq, cpu);
3925#endif
3926}
3927
3928notrace unsigned long get_parent_ip(unsigned long addr)
3929{
3930 if (in_lock_functions(addr)) {
3931 addr = CALLER_ADDR2;
3932 if (in_lock_functions(addr))
3933 addr = CALLER_ADDR3;
3934 }
3935 return addr;
3936}
3937
3938#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3939 defined(CONFIG_PREEMPT_TRACER))
3940
3941void __kprobes add_preempt_count(int val)
3942{
3943#ifdef CONFIG_DEBUG_PREEMPT
3944
3945
3946
3947 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3948 return;
3949#endif
3950 preempt_count() += val;
3951#ifdef CONFIG_DEBUG_PREEMPT
3952
3953
3954
3955 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3956 PREEMPT_MASK - 10);
3957#endif
3958 if (preempt_count() == val)
3959 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3960}
3961EXPORT_SYMBOL(add_preempt_count);
3962
3963void __kprobes sub_preempt_count(int val)
3964{
3965#ifdef CONFIG_DEBUG_PREEMPT
3966
3967
3968
3969 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3970 return;
3971
3972
3973
3974 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3975 !(preempt_count() & PREEMPT_MASK)))
3976 return;
3977#endif
3978
3979 if (preempt_count() == val)
3980 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3981 preempt_count() -= val;
3982}
3983EXPORT_SYMBOL(sub_preempt_count);
3984
3985#endif
3986
3987
3988
3989
3990static noinline void __schedule_bug(struct task_struct *prev)
3991{
3992 struct pt_regs *regs = get_irq_regs();
3993
3994 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3995 prev->comm, prev->pid, preempt_count());
3996
3997 debug_show_held_locks(prev);
3998 print_modules();
3999 if (irqs_disabled())
4000 print_irqtrace_events(prev);
4001
4002 if (regs)
4003 show_regs(regs);
4004 else
4005 dump_stack();
4006}
4007
4008
4009
4010
4011static inline void schedule_debug(struct task_struct *prev)
4012{
4013
4014
4015
4016
4017
4018 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4019 __schedule_bug(prev);
4020
4021 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4022
4023 schedstat_inc(this_rq(), sched_count);
4024#ifdef CONFIG_SCHEDSTATS
4025 if (unlikely(prev->lock_depth >= 0)) {
4026 schedstat_inc(this_rq(), bkl_count);
4027 schedstat_inc(prev, sched_info.bkl_count);
4028 }
4029#endif
4030}
4031
4032static void put_prev_task(struct rq *rq, struct task_struct *prev)
4033{
4034 if (prev->se.on_rq)
4035 update_rq_clock(rq);
4036 prev->sched_class->put_prev_task(rq, prev);
4037}
4038
4039
4040
4041
4042static inline struct task_struct *
4043pick_next_task(struct rq *rq)
4044{
4045 const struct sched_class *class;
4046 struct task_struct *p;
4047
4048
4049
4050
4051
4052 if (likely(rq->nr_running == rq->cfs.nr_running)) {
4053 p = fair_sched_class.pick_next_task(rq);
4054 if (likely(p))
4055 return p;
4056 }
4057
4058 for_each_class(class) {
4059 p = class->pick_next_task(rq);
4060 if (p)
4061 return p;
4062 }
4063
4064 BUG();
4065}
4066
4067
4068
4069
4070asmlinkage void __sched schedule(void)
4071{
4072 struct task_struct *prev, *next;
4073 unsigned long *switch_count;
4074 struct rq *rq;
4075 int cpu;
4076
4077need_resched:
4078 preempt_disable();
4079 cpu = smp_processor_id();
4080 rq = cpu_rq(cpu);
4081 rcu_note_context_switch(cpu);
4082 prev = rq->curr;
4083
4084 release_kernel_lock(prev);
4085need_resched_nonpreemptible:
4086
4087 schedule_debug(prev);
4088
4089 if (sched_feat(HRTICK))
4090 hrtick_clear(rq);
4091
4092 raw_spin_lock_irq(&rq->lock);
4093
4094 switch_count = &prev->nivcsw;
4095 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
4096 if (unlikely(signal_pending_state(prev->state, prev))) {
4097 prev->state = TASK_RUNNING;
4098 } else {
4099
4100
4101
4102
4103
4104
4105 if (prev->flags & PF_WQ_WORKER) {
4106 struct task_struct *to_wakeup;
4107
4108 to_wakeup = wq_worker_sleeping(prev, cpu);
4109 if (to_wakeup)
4110 try_to_wake_up_local(to_wakeup);
4111 }
4112 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4113 }
4114 switch_count = &prev->nvcsw;
4115 }
4116
4117 pre_schedule(rq, prev);
4118
4119 if (unlikely(!rq->nr_running))
4120 idle_balance(cpu, rq);
4121
4122 put_prev_task(rq, prev);
4123 next = pick_next_task(rq);
4124 clear_tsk_need_resched(prev);
4125 rq->skip_clock_update = 0;
4126
4127 if (likely(prev != next)) {
4128 sched_info_switch(prev, next);
4129 perf_event_task_sched_out(prev, next);
4130
4131 rq->nr_switches++;
4132 rq->curr = next;
4133 ++*switch_count;
4134
4135 context_switch(rq, prev, next);
4136
4137
4138
4139
4140
4141
4142 cpu = smp_processor_id();
4143 rq = cpu_rq(cpu);
4144 } else
4145 raw_spin_unlock_irq(&rq->lock);
4146
4147 post_schedule(rq);
4148
4149 if (unlikely(reacquire_kernel_lock(prev)))
4150 goto need_resched_nonpreemptible;
4151
4152 preempt_enable_no_resched();
4153 if (need_resched())
4154 goto need_resched;
4155}
4156EXPORT_SYMBOL(schedule);
4157
4158#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4159
4160
4161
4162
4163int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4164{
4165 unsigned int cpu;
4166 struct rq *rq;
4167
4168 if (!sched_feat(OWNER_SPIN))
4169 return 0;
4170
4171#ifdef CONFIG_DEBUG_PAGEALLOC
4172
4173
4174
4175
4176
4177 if (probe_kernel_address(&owner->cpu, cpu))
4178 return 0;
4179#else
4180 cpu = owner->cpu;
4181#endif
4182
4183
4184
4185
4186
4187 if (cpu >= nr_cpumask_bits)
4188 return 0;
4189
4190
4191
4192
4193
4194 if (!cpu_online(cpu))
4195 return 0;
4196
4197 rq = cpu_rq(cpu);
4198
4199 for (;;) {
4200
4201
4202
4203 if (lock->owner != owner) {
4204
4205
4206
4207
4208
4209 if (lock->owner)
4210 return 0;
4211 break;
4212 }
4213
4214
4215
4216
4217 if (task_thread_info(rq->curr) != owner || need_resched())
4218 return 0;
4219
4220 cpu_relax();
4221 }
4222
4223 return 1;
4224}
4225#endif
4226
4227#ifdef CONFIG_PREEMPT
4228
4229
4230
4231
4232
4233asmlinkage void __sched notrace preempt_schedule(void)
4234{
4235 struct thread_info *ti = current_thread_info();
4236
4237
4238
4239
4240
4241 if (likely(ti->preempt_count || irqs_disabled()))
4242 return;
4243
4244 do {
4245 add_preempt_count_notrace(PREEMPT_ACTIVE);
4246 schedule();
4247 sub_preempt_count_notrace(PREEMPT_ACTIVE);
4248
4249
4250
4251
4252
4253 barrier();
4254 } while (need_resched());
4255}
4256EXPORT_SYMBOL(preempt_schedule);
4257
4258
4259
4260
4261
4262
4263
4264asmlinkage void __sched preempt_schedule_irq(void)
4265{
4266 struct thread_info *ti = current_thread_info();
4267
4268
4269 BUG_ON(ti->preempt_count || !irqs_disabled());
4270
4271 do {
4272 add_preempt_count(PREEMPT_ACTIVE);
4273 local_irq_enable();
4274 schedule();
4275 local_irq_disable();
4276 sub_preempt_count(PREEMPT_ACTIVE);
4277
4278
4279
4280
4281
4282 barrier();
4283 } while (need_resched());
4284}
4285
4286#endif
4287
4288int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
4289 void *key)
4290{
4291 return try_to_wake_up(curr->private, mode, wake_flags);
4292}
4293EXPORT_SYMBOL(default_wake_function);
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4305 int nr_exclusive, int wake_flags, void *key)
4306{
4307 wait_queue_t *curr, *next;
4308
4309 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
4310 unsigned flags = curr->flags;
4311
4312 if (curr->func(curr, mode, wake_flags, key) &&
4313 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
4314 break;
4315 }
4316}
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328void __wake_up(wait_queue_head_t *q, unsigned int mode,
4329 int nr_exclusive, void *key)
4330{
4331 unsigned long flags;
4332
4333 spin_lock_irqsave(&q->lock, flags);
4334 __wake_up_common(q, mode, nr_exclusive, 0, key);
4335 spin_unlock_irqrestore(&q->lock, flags);
4336}
4337EXPORT_SYMBOL(__wake_up);
4338
4339
4340
4341
4342void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4343{
4344 __wake_up_common(q, mode, 1, 0, NULL);
4345}
4346EXPORT_SYMBOL_GPL(__wake_up_locked);
4347
4348void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4349{
4350 __wake_up_common(q, mode, 1, 0, key);
4351}
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
4371 int nr_exclusive, void *key)
4372{
4373 unsigned long flags;
4374 int wake_flags = WF_SYNC;
4375
4376 if (unlikely(!q))
4377 return;
4378
4379 if (unlikely(!nr_exclusive))
4380 wake_flags = 0;
4381
4382 spin_lock_irqsave(&q->lock, flags);
4383 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
4384 spin_unlock_irqrestore(&q->lock, flags);
4385}
4386EXPORT_SYMBOL_GPL(__wake_up_sync_key);
4387
4388
4389
4390
4391void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4392{
4393 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
4394}
4395EXPORT_SYMBOL_GPL(__wake_up_sync);
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409void complete(struct completion *x)
4410{
4411 unsigned long flags;
4412
4413 spin_lock_irqsave(&x->wait.lock, flags);
4414 x->done++;
4415 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
4416 spin_unlock_irqrestore(&x->wait.lock, flags);
4417}
4418EXPORT_SYMBOL(complete);
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429void complete_all(struct completion *x)
4430{
4431 unsigned long flags;
4432
4433 spin_lock_irqsave(&x->wait.lock, flags);
4434 x->done += UINT_MAX/2;
4435 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
4436 spin_unlock_irqrestore(&x->wait.lock, flags);
4437}
4438EXPORT_SYMBOL(complete_all);
4439
4440static inline long __sched
4441do_wait_for_common(struct completion *x, long timeout, int state)
4442{
4443 if (!x->done) {
4444 DECLARE_WAITQUEUE(wait, current);
4445
4446 __add_wait_queue_tail_exclusive(&x->wait, &wait);
4447 do {
4448 if (signal_pending_state(state, current)) {
4449 timeout = -ERESTARTSYS;
4450 break;
4451 }
4452 __set_current_state(state);
4453 spin_unlock_irq(&x->wait.lock);
4454 timeout = schedule_timeout(timeout);
4455 spin_lock_irq(&x->wait.lock);
4456 } while (!x->done && timeout);
4457 __remove_wait_queue(&x->wait, &wait);
4458 if (!x->done)
4459 return timeout;
4460 }
4461 x->done--;
4462 return timeout ?: 1;
4463}
4464
4465static long __sched
4466wait_for_common(struct completion *x, long timeout, int state)
4467{
4468 might_sleep();
4469
4470 spin_lock_irq(&x->wait.lock);
4471 timeout = do_wait_for_common(x, timeout, state);
4472 spin_unlock_irq(&x->wait.lock);
4473 return timeout;
4474}
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486void __sched wait_for_completion(struct completion *x)
4487{
4488 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4489}
4490EXPORT_SYMBOL(wait_for_completion);
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501unsigned long __sched
4502wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4503{
4504 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4505}
4506EXPORT_SYMBOL(wait_for_completion_timeout);
4507
4508
4509
4510
4511
4512
4513
4514
4515int __sched wait_for_completion_interruptible(struct completion *x)
4516{
4517 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4518 if (t == -ERESTARTSYS)
4519 return t;
4520 return 0;
4521}
4522EXPORT_SYMBOL(wait_for_completion_interruptible);
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532long __sched
4533wait_for_completion_interruptible_timeout(struct completion *x,
4534 unsigned long timeout)
4535{
4536 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4537}
4538EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4539
4540
4541
4542
4543
4544
4545
4546
4547int __sched wait_for_completion_killable(struct completion *x)
4548{
4549 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4550 if (t == -ERESTARTSYS)
4551 return t;
4552 return 0;
4553}
4554EXPORT_SYMBOL(wait_for_completion_killable);
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565long __sched
4566wait_for_completion_killable_timeout(struct completion *x,
4567 unsigned long timeout)
4568{
4569 return wait_for_common(x, timeout, TASK_KILLABLE);
4570}
4571EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585bool try_wait_for_completion(struct completion *x)
4586{
4587 unsigned long flags;
4588 int ret = 1;
4589
4590 spin_lock_irqsave(&x->wait.lock, flags);
4591 if (!x->done)
4592 ret = 0;
4593 else
4594 x->done--;
4595 spin_unlock_irqrestore(&x->wait.lock, flags);
4596 return ret;
4597}
4598EXPORT_SYMBOL(try_wait_for_completion);
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608bool completion_done(struct completion *x)
4609{
4610 unsigned long flags;
4611 int ret = 1;
4612
4613 spin_lock_irqsave(&x->wait.lock, flags);
4614 if (!x->done)
4615 ret = 0;
4616 spin_unlock_irqrestore(&x->wait.lock, flags);
4617 return ret;
4618}
4619EXPORT_SYMBOL(completion_done);
4620
4621static long __sched
4622sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4623{
4624 unsigned long flags;
4625 wait_queue_t wait;
4626
4627 init_waitqueue_entry(&wait, current);
4628
4629 __set_current_state(state);
4630
4631 spin_lock_irqsave(&q->lock, flags);
4632 __add_wait_queue(q, &wait);
4633 spin_unlock(&q->lock);
4634 timeout = schedule_timeout(timeout);
4635 spin_lock_irq(&q->lock);
4636 __remove_wait_queue(q, &wait);
4637 spin_unlock_irqrestore(&q->lock, flags);
4638
4639 return timeout;
4640}
4641
4642void __sched interruptible_sleep_on(wait_queue_head_t *q)
4643{
4644 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4645}
4646EXPORT_SYMBOL(interruptible_sleep_on);
4647
4648long __sched
4649interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4650{
4651 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4652}
4653EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4654
4655void __sched sleep_on(wait_queue_head_t *q)
4656{
4657 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4658}
4659EXPORT_SYMBOL(sleep_on);
4660
4661long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4662{
4663 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4664}
4665EXPORT_SYMBOL(sleep_on_timeout);
4666
4667#ifdef CONFIG_RT_MUTEXES
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679void rt_mutex_setprio(struct task_struct *p, int prio)
4680{
4681 unsigned long flags;
4682 int oldprio, on_rq, running;
4683 struct rq *rq;
4684 const struct sched_class *prev_class;
4685
4686 BUG_ON(prio < 0 || prio > MAX_PRIO);
4687
4688 rq = task_rq_lock(p, &flags);
4689
4690 trace_sched_pi_setprio(p, prio);
4691 oldprio = p->prio;
4692 prev_class = p->sched_class;
4693 on_rq = p->se.on_rq;
4694 running = task_current(rq, p);
4695 if (on_rq)
4696 dequeue_task(rq, p, 0);
4697 if (running)
4698 p->sched_class->put_prev_task(rq, p);
4699
4700 if (rt_prio(prio))
4701 p->sched_class = &rt_sched_class;
4702 else
4703 p->sched_class = &fair_sched_class;
4704
4705 p->prio = prio;
4706
4707 if (running)
4708 p->sched_class->set_curr_task(rq);
4709 if (on_rq) {
4710 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4711
4712 check_class_changed(rq, p, prev_class, oldprio, running);
4713 }
4714 task_rq_unlock(rq, &flags);
4715}
4716
4717#endif
4718
4719void set_user_nice(struct task_struct *p, long nice)
4720{
4721 int old_prio, delta, on_rq;
4722 unsigned long flags;
4723 struct rq *rq;
4724
4725 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4726 return;
4727
4728
4729
4730
4731 rq = task_rq_lock(p, &flags);
4732
4733
4734
4735
4736
4737
4738 if (task_has_rt_policy(p)) {
4739 p->static_prio = NICE_TO_PRIO(nice);
4740 goto out_unlock;
4741 }
4742 on_rq = p->se.on_rq;
4743 if (on_rq)
4744 dequeue_task(rq, p, 0);
4745
4746 p->static_prio = NICE_TO_PRIO(nice);
4747 set_load_weight(p);
4748 old_prio = p->prio;
4749 p->prio = effective_prio(p);
4750 delta = p->prio - old_prio;
4751
4752 if (on_rq) {
4753 enqueue_task(rq, p, 0);
4754
4755
4756
4757
4758 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4759 resched_task(rq->curr);
4760 }
4761out_unlock:
4762 task_rq_unlock(rq, &flags);
4763}
4764EXPORT_SYMBOL(set_user_nice);
4765
4766
4767
4768
4769
4770
4771int can_nice(const struct task_struct *p, const int nice)
4772{
4773
4774 int nice_rlim = 20 - nice;
4775
4776 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4777 capable(CAP_SYS_NICE));
4778}
4779
4780#ifdef __ARCH_WANT_SYS_NICE
4781
4782
4783
4784
4785
4786
4787
4788
4789SYSCALL_DEFINE1(nice, int, increment)
4790{
4791 long nice, retval;
4792
4793
4794
4795
4796
4797
4798 if (increment < -40)
4799 increment = -40;
4800 if (increment > 40)
4801 increment = 40;
4802
4803 nice = TASK_NICE(current) + increment;
4804 if (nice < -20)
4805 nice = -20;
4806 if (nice > 19)
4807 nice = 19;
4808
4809 if (increment < 0 && !can_nice(current, nice))
4810 return -EPERM;
4811
4812 retval = security_task_setnice(current, nice);
4813 if (retval)
4814 return retval;
4815
4816 set_user_nice(current, nice);
4817 return 0;
4818}
4819
4820#endif
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830int task_prio(const struct task_struct *p)
4831{
4832 return p->prio - MAX_RT_PRIO;
4833}
4834
4835
4836
4837
4838
4839int task_nice(const struct task_struct *p)
4840{
4841 return TASK_NICE(p);
4842}
4843EXPORT_SYMBOL(task_nice);
4844
4845
4846
4847
4848
4849int idle_cpu(int cpu)
4850{
4851 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4852}
4853
4854
4855
4856
4857
4858struct task_struct *idle_task(int cpu)
4859{
4860 return cpu_rq(cpu)->idle;
4861}
4862
4863
4864
4865
4866
4867static struct task_struct *find_process_by_pid(pid_t pid)
4868{
4869 return pid ? find_task_by_vpid(pid) : current;
4870}
4871
4872
4873static void
4874__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4875{
4876 BUG_ON(p->se.on_rq);
4877
4878 p->policy = policy;
4879 p->rt_priority = prio;
4880 p->normal_prio = normal_prio(p);
4881
4882 p->prio = rt_mutex_getprio(p);
4883 if (rt_prio(p->prio))
4884 p->sched_class = &rt_sched_class;
4885 else
4886 p->sched_class = &fair_sched_class;
4887 set_load_weight(p);
4888}
4889
4890
4891
4892
4893static bool check_same_owner(struct task_struct *p)
4894{
4895 const struct cred *cred = current_cred(), *pcred;
4896 bool match;
4897
4898 rcu_read_lock();
4899 pcred = __task_cred(p);
4900 match = (cred->euid == pcred->euid ||
4901 cred->euid == pcred->uid);
4902 rcu_read_unlock();
4903 return match;
4904}
4905
4906static int __sched_setscheduler(struct task_struct *p, int policy,
4907 struct sched_param *param, bool user)
4908{
4909 int retval, oldprio, oldpolicy = -1, on_rq, running;
4910 unsigned long flags;
4911 const struct sched_class *prev_class;
4912 struct rq *rq;
4913 int reset_on_fork;
4914
4915
4916 BUG_ON(in_interrupt());
4917recheck:
4918
4919 if (policy < 0) {
4920 reset_on_fork = p->sched_reset_on_fork;
4921 policy = oldpolicy = p->policy;
4922 } else {
4923 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
4924 policy &= ~SCHED_RESET_ON_FORK;
4925
4926 if (policy != SCHED_FIFO && policy != SCHED_RR &&
4927 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4928 policy != SCHED_IDLE)
4929 return -EINVAL;
4930 }
4931
4932
4933
4934
4935
4936
4937 if (param->sched_priority < 0 ||
4938 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4939 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4940 return -EINVAL;
4941 if (rt_policy(policy) != (param->sched_priority != 0))
4942 return -EINVAL;
4943
4944
4945
4946
4947 if (user && !capable(CAP_SYS_NICE)) {
4948 if (rt_policy(policy)) {
4949 unsigned long rlim_rtprio =
4950 task_rlimit(p, RLIMIT_RTPRIO);
4951
4952
4953 if (policy != p->policy && !rlim_rtprio)
4954 return -EPERM;
4955
4956
4957 if (param->sched_priority > p->rt_priority &&
4958 param->sched_priority > rlim_rtprio)
4959 return -EPERM;
4960 }
4961
4962
4963
4964
4965 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4966 return -EPERM;
4967
4968
4969 if (!check_same_owner(p))
4970 return -EPERM;
4971
4972
4973 if (p->sched_reset_on_fork && !reset_on_fork)
4974 return -EPERM;
4975 }
4976
4977 if (user) {
4978 retval = security_task_setscheduler(p);
4979 if (retval)
4980 return retval;
4981 }
4982
4983
4984
4985
4986
4987 raw_spin_lock_irqsave(&p->pi_lock, flags);
4988
4989
4990
4991
4992 rq = __task_rq_lock(p);
4993
4994
4995
4996
4997 if (p == rq->stop) {
4998 __task_rq_unlock(rq);
4999 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5000 return -EINVAL;
5001 }
5002
5003#ifdef CONFIG_RT_GROUP_SCHED
5004 if (user) {
5005
5006
5007
5008
5009 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5010 task_group(p)->rt_bandwidth.rt_runtime == 0) {
5011 __task_rq_unlock(rq);
5012 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5013 return -EPERM;
5014 }
5015 }
5016#endif
5017
5018
5019 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5020 policy = oldpolicy = -1;
5021 __task_rq_unlock(rq);
5022 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5023 goto recheck;
5024 }
5025 on_rq = p->se.on_rq;
5026 running = task_current(rq, p);
5027 if (on_rq)
5028 deactivate_task(rq, p, 0);
5029 if (running)
5030 p->sched_class->put_prev_task(rq, p);
5031
5032 p->sched_reset_on_fork = reset_on_fork;
5033
5034 oldprio = p->prio;
5035 prev_class = p->sched_class;
5036 __setscheduler(rq, p, policy, param->sched_priority);
5037
5038 if (running)
5039 p->sched_class->set_curr_task(rq);
5040 if (on_rq) {
5041 activate_task(rq, p, 0);
5042
5043 check_class_changed(rq, p, prev_class, oldprio, running);
5044 }
5045 __task_rq_unlock(rq);
5046 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5047
5048 rt_mutex_adjust_pi(p);
5049
5050 return 0;
5051}
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061int sched_setscheduler(struct task_struct *p, int policy,
5062 struct sched_param *param)
5063{
5064 return __sched_setscheduler(p, policy, param, true);
5065}
5066EXPORT_SYMBOL_GPL(sched_setscheduler);
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5080 struct sched_param *param)
5081{
5082 return __sched_setscheduler(p, policy, param, false);
5083}
5084
5085static int
5086do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5087{
5088 struct sched_param lparam;
5089 struct task_struct *p;
5090 int retval;
5091
5092 if (!param || pid < 0)
5093 return -EINVAL;
5094 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5095 return -EFAULT;
5096
5097 rcu_read_lock();
5098 retval = -ESRCH;
5099 p = find_process_by_pid(pid);
5100 if (p != NULL)
5101 retval = sched_setscheduler(p, policy, &lparam);
5102 rcu_read_unlock();
5103
5104 return retval;
5105}
5106
5107
5108
5109
5110
5111
5112
5113SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
5114 struct sched_param __user *, param)
5115{
5116
5117 if (policy < 0)
5118 return -EINVAL;
5119
5120 return do_sched_setscheduler(pid, policy, param);
5121}
5122
5123
5124
5125
5126
5127
5128SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
5129{
5130 return do_sched_setscheduler(pid, -1, param);
5131}
5132
5133
5134
5135
5136
5137SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5138{
5139 struct task_struct *p;
5140 int retval;
5141
5142 if (pid < 0)
5143 return -EINVAL;
5144
5145 retval = -ESRCH;
5146 rcu_read_lock();
5147 p = find_process_by_pid(pid);
5148 if (p) {
5149 retval = security_task_getscheduler(p);
5150 if (!retval)
5151 retval = p->policy
5152 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
5153 }
5154 rcu_read_unlock();
5155 return retval;
5156}
5157
5158
5159
5160
5161
5162
5163SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5164{
5165 struct sched_param lp;
5166 struct task_struct *p;
5167 int retval;
5168
5169 if (!param || pid < 0)
5170 return -EINVAL;
5171
5172 rcu_read_lock();
5173 p = find_process_by_pid(pid);
5174 retval = -ESRCH;
5175 if (!p)
5176 goto out_unlock;
5177
5178 retval = security_task_getscheduler(p);
5179 if (retval)
5180 goto out_unlock;
5181
5182 lp.sched_priority = p->rt_priority;
5183 rcu_read_unlock();
5184
5185
5186
5187
5188 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5189
5190 return retval;
5191
5192out_unlock:
5193 rcu_read_unlock();
5194 return retval;
5195}
5196
5197long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5198{
5199 cpumask_var_t cpus_allowed, new_mask;
5200 struct task_struct *p;
5201 int retval;
5202
5203 get_online_cpus();
5204 rcu_read_lock();
5205
5206 p = find_process_by_pid(pid);
5207 if (!p) {
5208 rcu_read_unlock();
5209 put_online_cpus();
5210 return -ESRCH;
5211 }
5212
5213
5214 get_task_struct(p);
5215 rcu_read_unlock();
5216
5217 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5218 retval = -ENOMEM;
5219 goto out_put_task;
5220 }
5221 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5222 retval = -ENOMEM;
5223 goto out_free_cpus_allowed;
5224 }
5225 retval = -EPERM;
5226 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
5227 goto out_unlock;
5228
5229 retval = security_task_setscheduler(p);
5230 if (retval)
5231 goto out_unlock;
5232
5233 cpuset_cpus_allowed(p, cpus_allowed);
5234 cpumask_and(new_mask, in_mask, cpus_allowed);
5235again:
5236 retval = set_cpus_allowed_ptr(p, new_mask);
5237
5238 if (!retval) {
5239 cpuset_cpus_allowed(p, cpus_allowed);
5240 if (!cpumask_subset(new_mask, cpus_allowed)) {
5241
5242
5243
5244
5245
5246 cpumask_copy(new_mask, cpus_allowed);
5247 goto again;
5248 }
5249 }
5250out_unlock:
5251 free_cpumask_var(new_mask);
5252out_free_cpus_allowed:
5253 free_cpumask_var(cpus_allowed);
5254out_put_task:
5255 put_task_struct(p);
5256 put_online_cpus();
5257 return retval;
5258}
5259
5260static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5261 struct cpumask *new_mask)
5262{
5263 if (len < cpumask_size())
5264 cpumask_clear(new_mask);
5265 else if (len > cpumask_size())
5266 len = cpumask_size();
5267
5268 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5269}
5270
5271
5272
5273
5274
5275
5276
5277SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
5278 unsigned long __user *, user_mask_ptr)
5279{
5280 cpumask_var_t new_mask;
5281 int retval;
5282
5283 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5284 return -ENOMEM;
5285
5286 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5287 if (retval == 0)
5288 retval = sched_setaffinity(pid, new_mask);
5289 free_cpumask_var(new_mask);
5290 return retval;
5291}
5292
5293long sched_getaffinity(pid_t pid, struct cpumask *mask)
5294{
5295 struct task_struct *p;
5296 unsigned long flags;
5297 struct rq *rq;
5298 int retval;
5299
5300 get_online_cpus();
5301 rcu_read_lock();
5302
5303 retval = -ESRCH;
5304 p = find_process_by_pid(pid);
5305 if (!p)
5306 goto out_unlock;
5307
5308 retval = security_task_getscheduler(p);
5309 if (retval)
5310 goto out_unlock;
5311
5312 rq = task_rq_lock(p, &flags);
5313 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5314 task_rq_unlock(rq, &flags);
5315
5316out_unlock:
5317 rcu_read_unlock();
5318 put_online_cpus();
5319
5320 return retval;
5321}
5322
5323
5324
5325
5326
5327
5328
5329SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
5330 unsigned long __user *, user_mask_ptr)
5331{
5332 int ret;
5333 cpumask_var_t mask;
5334
5335 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
5336 return -EINVAL;
5337 if (len & (sizeof(unsigned long)-1))
5338 return -EINVAL;
5339
5340 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5341 return -ENOMEM;
5342
5343 ret = sched_getaffinity(pid, mask);
5344 if (ret == 0) {
5345 size_t retlen = min_t(size_t, len, cpumask_size());
5346
5347 if (copy_to_user(user_mask_ptr, mask, retlen))
5348 ret = -EFAULT;
5349 else
5350 ret = retlen;
5351 }
5352 free_cpumask_var(mask);
5353
5354 return ret;
5355}
5356
5357
5358
5359
5360
5361
5362
5363SYSCALL_DEFINE0(sched_yield)
5364{
5365 struct rq *rq = this_rq_lock();
5366
5367 schedstat_inc(rq, yld_count);
5368 current->sched_class->yield_task(rq);
5369
5370
5371
5372
5373
5374 __release(rq->lock);
5375 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
5376 do_raw_spin_unlock(&rq->lock);
5377 preempt_enable_no_resched();
5378
5379 schedule();
5380
5381 return 0;
5382}
5383
5384static inline int should_resched(void)
5385{
5386 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
5387}
5388
5389static void __cond_resched(void)
5390{
5391 add_preempt_count(PREEMPT_ACTIVE);
5392 schedule();
5393 sub_preempt_count(PREEMPT_ACTIVE);
5394}
5395
5396int __sched _cond_resched(void)
5397{
5398 if (should_resched()) {
5399 __cond_resched();
5400 return 1;
5401 }
5402 return 0;
5403}
5404EXPORT_SYMBOL(_cond_resched);
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414int __cond_resched_lock(spinlock_t *lock)
5415{
5416 int resched = should_resched();
5417 int ret = 0;
5418
5419 lockdep_assert_held(lock);
5420
5421 if (spin_needbreak(lock) || resched) {
5422 spin_unlock(lock);
5423 if (resched)
5424 __cond_resched();
5425 else
5426 cpu_relax();
5427 ret = 1;
5428 spin_lock(lock);
5429 }
5430 return ret;
5431}
5432EXPORT_SYMBOL(__cond_resched_lock);
5433
5434int __sched __cond_resched_softirq(void)
5435{
5436 BUG_ON(!in_softirq());
5437
5438 if (should_resched()) {
5439 local_bh_enable();
5440 __cond_resched();
5441 local_bh_disable();
5442 return 1;
5443 }
5444 return 0;
5445}
5446EXPORT_SYMBOL(__cond_resched_softirq);
5447
5448
5449
5450
5451
5452
5453
5454void __sched yield(void)
5455{
5456 set_current_state(TASK_RUNNING);
5457 sys_sched_yield();
5458}
5459EXPORT_SYMBOL(yield);
5460
5461
5462
5463
5464
5465void __sched io_schedule(void)
5466{
5467 struct rq *rq = raw_rq();
5468
5469 delayacct_blkio_start();
5470 atomic_inc(&rq->nr_iowait);
5471 current->in_iowait = 1;
5472 schedule();
5473 current->in_iowait = 0;
5474 atomic_dec(&rq->nr_iowait);
5475 delayacct_blkio_end();
5476}
5477EXPORT_SYMBOL(io_schedule);
5478
5479long __sched io_schedule_timeout(long timeout)
5480{
5481 struct rq *rq = raw_rq();
5482 long ret;
5483
5484 delayacct_blkio_start();
5485 atomic_inc(&rq->nr_iowait);
5486 current->in_iowait = 1;
5487 ret = schedule_timeout(timeout);
5488 current->in_iowait = 0;
5489 atomic_dec(&rq->nr_iowait);
5490 delayacct_blkio_end();
5491 return ret;
5492}
5493
5494
5495
5496
5497
5498
5499
5500
5501SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5502{
5503 int ret = -EINVAL;
5504
5505 switch (policy) {
5506 case SCHED_FIFO:
5507 case SCHED_RR:
5508 ret = MAX_USER_RT_PRIO-1;
5509 break;
5510 case SCHED_NORMAL:
5511 case SCHED_BATCH:
5512 case SCHED_IDLE:
5513 ret = 0;
5514 break;
5515 }
5516 return ret;
5517}
5518
5519
5520
5521
5522
5523
5524
5525
5526SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5527{
5528 int ret = -EINVAL;
5529
5530 switch (policy) {
5531 case SCHED_FIFO:
5532 case SCHED_RR:
5533 ret = 1;
5534 break;
5535 case SCHED_NORMAL:
5536 case SCHED_BATCH:
5537 case SCHED_IDLE:
5538 ret = 0;
5539 }
5540 return ret;
5541}
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5552 struct timespec __user *, interval)
5553{
5554 struct task_struct *p;
5555 unsigned int time_slice;
5556 unsigned long flags;
5557 struct rq *rq;
5558 int retval;
5559 struct timespec t;
5560
5561 if (pid < 0)
5562 return -EINVAL;
5563
5564 retval = -ESRCH;
5565 rcu_read_lock();
5566 p = find_process_by_pid(pid);
5567 if (!p)
5568 goto out_unlock;
5569
5570 retval = security_task_getscheduler(p);
5571 if (retval)
5572 goto out_unlock;
5573
5574 rq = task_rq_lock(p, &flags);
5575 time_slice = p->sched_class->get_rr_interval(rq, p);
5576 task_rq_unlock(rq, &flags);
5577
5578 rcu_read_unlock();
5579 jiffies_to_timespec(time_slice, &t);
5580 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5581 return retval;
5582
5583out_unlock:
5584 rcu_read_unlock();
5585 return retval;
5586}
5587
5588static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5589
5590void sched_show_task(struct task_struct *p)
5591{
5592 unsigned long free = 0;
5593 unsigned state;
5594
5595 state = p->state ? __ffs(p->state) + 1 : 0;
5596 printk(KERN_INFO "%-13.13s %c", p->comm,
5597 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5598#if BITS_PER_LONG == 32
5599 if (state == TASK_RUNNING)
5600 printk(KERN_CONT " running ");
5601 else
5602 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5603#else
5604 if (state == TASK_RUNNING)
5605 printk(KERN_CONT " running task ");
5606 else
5607 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5608#endif
5609#ifdef CONFIG_DEBUG_STACK_USAGE
5610 free = stack_not_used(p);
5611#endif
5612 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5613 task_pid_nr(p), task_pid_nr(p->real_parent),
5614 (unsigned long)task_thread_info(p)->flags);
5615
5616 show_stack(p, NULL);
5617}
5618
5619void show_state_filter(unsigned long state_filter)
5620{
5621 struct task_struct *g, *p;
5622
5623#if BITS_PER_LONG == 32
5624 printk(KERN_INFO
5625 " task PC stack pid father\n");
5626#else
5627 printk(KERN_INFO
5628 " task PC stack pid father\n");
5629#endif
5630 read_lock(&tasklist_lock);
5631 do_each_thread(g, p) {
5632
5633
5634
5635
5636 touch_nmi_watchdog();
5637 if (!state_filter || (p->state & state_filter))
5638 sched_show_task(p);
5639 } while_each_thread(g, p);
5640
5641 touch_all_softlockup_watchdogs();
5642
5643#ifdef CONFIG_SCHED_DEBUG
5644 sysrq_sched_debug_show();
5645#endif
5646 read_unlock(&tasklist_lock);
5647
5648
5649
5650 if (!state_filter)
5651 debug_show_all_locks();
5652}
5653
5654void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5655{
5656 idle->sched_class = &idle_sched_class;
5657}
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667void __cpuinit init_idle(struct task_struct *idle, int cpu)
5668{
5669 struct rq *rq = cpu_rq(cpu);
5670 unsigned long flags;
5671
5672 raw_spin_lock_irqsave(&rq->lock, flags);
5673
5674 __sched_fork(idle);
5675 idle->state = TASK_RUNNING;
5676 idle->se.exec_start = sched_clock();
5677
5678 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689 rcu_read_lock();
5690 __set_task_cpu(idle, cpu);
5691 rcu_read_unlock();
5692
5693 rq->curr = rq->idle = idle;
5694#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5695 idle->oncpu = 1;
5696#endif
5697 raw_spin_unlock_irqrestore(&rq->lock, flags);
5698
5699
5700#if defined(CONFIG_PREEMPT)
5701 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5702#else
5703 task_thread_info(idle)->preempt_count = 0;
5704#endif
5705
5706
5707
5708 idle->sched_class = &idle_sched_class;
5709 ftrace_graph_init_idle_task(idle, cpu);
5710}
5711
5712
5713
5714
5715
5716
5717
5718
5719cpumask_var_t nohz_cpu_mask;
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730static int get_update_sysctl_factor(void)
5731{
5732 unsigned int cpus = min_t(int, num_online_cpus(), 8);
5733 unsigned int factor;
5734
5735 switch (sysctl_sched_tunable_scaling) {
5736 case SCHED_TUNABLESCALING_NONE:
5737 factor = 1;
5738 break;
5739 case SCHED_TUNABLESCALING_LINEAR:
5740 factor = cpus;
5741 break;
5742 case SCHED_TUNABLESCALING_LOG:
5743 default:
5744 factor = 1 + ilog2(cpus);
5745 break;
5746 }
5747
5748 return factor;
5749}
5750
5751static void update_sysctl(void)
5752{
5753 unsigned int factor = get_update_sysctl_factor();
5754
5755#define SET_SYSCTL(name) \
5756 (sysctl_##name = (factor) * normalized_sysctl_##name)
5757 SET_SYSCTL(sched_min_granularity);
5758 SET_SYSCTL(sched_latency);
5759 SET_SYSCTL(sched_wakeup_granularity);
5760 SET_SYSCTL(sched_shares_ratelimit);
5761#undef SET_SYSCTL
5762}
5763
5764static inline void sched_init_granularity(void)
5765{
5766 update_sysctl();
5767}
5768
5769#ifdef CONFIG_SMP
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5794{
5795 unsigned long flags;
5796 struct rq *rq;
5797 unsigned int dest_cpu;
5798 int ret = 0;
5799
5800
5801
5802
5803
5804again:
5805 while (task_is_waking(p))
5806 cpu_relax();
5807 rq = task_rq_lock(p, &flags);
5808 if (task_is_waking(p)) {
5809 task_rq_unlock(rq, &flags);
5810 goto again;
5811 }
5812
5813 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5814 ret = -EINVAL;
5815 goto out;
5816 }
5817
5818 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5819 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5820 ret = -EINVAL;
5821 goto out;
5822 }
5823
5824 if (p->sched_class->set_cpus_allowed)
5825 p->sched_class->set_cpus_allowed(p, new_mask);
5826 else {
5827 cpumask_copy(&p->cpus_allowed, new_mask);
5828 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5829 }
5830
5831
5832 if (cpumask_test_cpu(task_cpu(p), new_mask))
5833 goto out;
5834
5835 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5836 if (migrate_task(p, dest_cpu)) {
5837 struct migration_arg arg = { p, dest_cpu };
5838
5839 task_rq_unlock(rq, &flags);
5840 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5841 tlb_migrate_finish(p->mm);
5842 return 0;
5843 }
5844out:
5845 task_rq_unlock(rq, &flags);
5846
5847 return ret;
5848}
5849EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5863{
5864 struct rq *rq_dest, *rq_src;
5865 int ret = 0;
5866
5867 if (unlikely(!cpu_active(dest_cpu)))
5868 return ret;
5869
5870 rq_src = cpu_rq(src_cpu);
5871 rq_dest = cpu_rq(dest_cpu);
5872
5873 double_rq_lock(rq_src, rq_dest);
5874
5875 if (task_cpu(p) != src_cpu)
5876 goto done;
5877
5878 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
5879 goto fail;
5880
5881
5882
5883
5884
5885 if (p->se.on_rq) {
5886 deactivate_task(rq_src, p, 0);
5887 set_task_cpu(p, dest_cpu);
5888 activate_task(rq_dest, p, 0);
5889 check_preempt_curr(rq_dest, p, 0);
5890 }
5891done:
5892 ret = 1;
5893fail:
5894 double_rq_unlock(rq_src, rq_dest);
5895 return ret;
5896}
5897
5898
5899
5900
5901
5902
5903static int migration_cpu_stop(void *data)
5904{
5905 struct migration_arg *arg = data;
5906
5907
5908
5909
5910
5911 local_irq_disable();
5912 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
5913 local_irq_enable();
5914 return 0;
5915}
5916
5917#ifdef CONFIG_HOTPLUG_CPU
5918
5919
5920
5921void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5922{
5923 struct rq *rq = cpu_rq(dead_cpu);
5924 int needs_cpu, uninitialized_var(dest_cpu);
5925 unsigned long flags;
5926
5927 local_irq_save(flags);
5928
5929 raw_spin_lock(&rq->lock);
5930 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
5931 if (needs_cpu)
5932 dest_cpu = select_fallback_rq(dead_cpu, p);
5933 raw_spin_unlock(&rq->lock);
5934
5935
5936
5937
5938 if (needs_cpu)
5939 __migrate_task(p, dead_cpu, dest_cpu);
5940 local_irq_restore(flags);
5941}
5942
5943
5944
5945
5946
5947
5948
5949
5950static void migrate_nr_uninterruptible(struct rq *rq_src)
5951{
5952 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5953 unsigned long flags;
5954
5955 local_irq_save(flags);
5956 double_rq_lock(rq_src, rq_dest);
5957 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5958 rq_src->nr_uninterruptible = 0;
5959 double_rq_unlock(rq_src, rq_dest);
5960 local_irq_restore(flags);
5961}
5962
5963
5964static void migrate_live_tasks(int src_cpu)
5965{
5966 struct task_struct *p, *t;
5967
5968 read_lock(&tasklist_lock);
5969
5970 do_each_thread(t, p) {
5971 if (p == current)
5972 continue;
5973
5974 if (task_cpu(p) == src_cpu)
5975 move_task_off_dead_cpu(src_cpu, p);
5976 } while_each_thread(t, p);
5977
5978 read_unlock(&tasklist_lock);
5979}
5980
5981
5982
5983
5984
5985
5986void sched_idle_next(void)
5987{
5988 int this_cpu = smp_processor_id();
5989 struct rq *rq = cpu_rq(this_cpu);
5990 struct task_struct *p = rq->idle;
5991 unsigned long flags;
5992
5993
5994 BUG_ON(cpu_online(this_cpu));
5995
5996
5997
5998
5999
6000 raw_spin_lock_irqsave(&rq->lock, flags);
6001
6002 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6003
6004 activate_task(rq, p, 0);
6005
6006 raw_spin_unlock_irqrestore(&rq->lock, flags);
6007}
6008
6009
6010
6011
6012
6013void idle_task_exit(void)
6014{
6015 struct mm_struct *mm = current->active_mm;
6016
6017 BUG_ON(cpu_online(smp_processor_id()));
6018
6019 if (mm != &init_mm)
6020 switch_mm(mm, &init_mm, current);
6021 mmdrop(mm);
6022}
6023
6024
6025static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
6026{
6027 struct rq *rq = cpu_rq(dead_cpu);
6028
6029
6030 BUG_ON(!p->exit_state);
6031
6032
6033 BUG_ON(p->state == TASK_DEAD);
6034
6035 get_task_struct(p);
6036
6037
6038
6039
6040
6041
6042 raw_spin_unlock_irq(&rq->lock);
6043 move_task_off_dead_cpu(dead_cpu, p);
6044 raw_spin_lock_irq(&rq->lock);
6045
6046 put_task_struct(p);
6047}
6048
6049
6050static void migrate_dead_tasks(unsigned int dead_cpu)
6051{
6052 struct rq *rq = cpu_rq(dead_cpu);
6053 struct task_struct *next;
6054
6055 for ( ; ; ) {
6056 if (!rq->nr_running)
6057 break;
6058 next = pick_next_task(rq);
6059 if (!next)
6060 break;
6061 next->sched_class->put_prev_task(rq, next);
6062 migrate_dead(dead_cpu, next);
6063
6064 }
6065}
6066
6067
6068
6069
6070static void calc_global_load_remove(struct rq *rq)
6071{
6072 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
6073 rq->calc_load_active = 0;
6074}
6075#endif
6076
6077#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
6078
6079static struct ctl_table sd_ctl_dir[] = {
6080 {
6081 .procname = "sched_domain",
6082 .mode = 0555,
6083 },
6084 {}
6085};
6086
6087static struct ctl_table sd_ctl_root[] = {
6088 {
6089 .procname = "kernel",
6090 .mode = 0555,
6091 .child = sd_ctl_dir,
6092 },
6093 {}
6094};
6095
6096static struct ctl_table *sd_alloc_ctl_entry(int n)
6097{
6098 struct ctl_table *entry =
6099 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
6100
6101 return entry;
6102}
6103
6104static void sd_free_ctl_entry(struct ctl_table **tablep)
6105{
6106 struct ctl_table *entry;
6107
6108
6109
6110
6111
6112
6113
6114 for (entry = *tablep; entry->mode; entry++) {
6115 if (entry->child)
6116 sd_free_ctl_entry(&entry->child);
6117 if (entry->proc_handler == NULL)
6118 kfree(entry->procname);
6119 }
6120
6121 kfree(*tablep);
6122 *tablep = NULL;
6123}
6124
6125static void
6126set_table_entry(struct ctl_table *entry,
6127 const char *procname, void *data, int maxlen,
6128 mode_t mode, proc_handler *proc_handler)
6129{
6130 entry->procname = procname;
6131 entry->data = data;
6132 entry->maxlen = maxlen;
6133 entry->mode = mode;
6134 entry->proc_handler = proc_handler;
6135}
6136
6137static struct ctl_table *
6138sd_alloc_ctl_domain_table(struct sched_domain *sd)
6139{
6140 struct ctl_table *table = sd_alloc_ctl_entry(13);
6141
6142 if (table == NULL)
6143 return NULL;
6144
6145 set_table_entry(&table[0], "min_interval", &sd->min_interval,
6146 sizeof(long), 0644, proc_doulongvec_minmax);
6147 set_table_entry(&table[1], "max_interval", &sd->max_interval,
6148 sizeof(long), 0644, proc_doulongvec_minmax);
6149 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
6150 sizeof(int), 0644, proc_dointvec_minmax);
6151 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
6152 sizeof(int), 0644, proc_dointvec_minmax);
6153 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
6154 sizeof(int), 0644, proc_dointvec_minmax);
6155 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
6156 sizeof(int), 0644, proc_dointvec_minmax);
6157 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
6158 sizeof(int), 0644, proc_dointvec_minmax);
6159 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
6160 sizeof(int), 0644, proc_dointvec_minmax);
6161 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
6162 sizeof(int), 0644, proc_dointvec_minmax);
6163 set_table_entry(&table[9], "cache_nice_tries",
6164 &sd->cache_nice_tries,
6165 sizeof(int), 0644, proc_dointvec_minmax);
6166 set_table_entry(&table[10], "flags", &sd->flags,
6167 sizeof(int), 0644, proc_dointvec_minmax);
6168 set_table_entry(&table[11], "name", sd->name,
6169 CORENAME_MAX_SIZE, 0444, proc_dostring);
6170
6171
6172 return table;
6173}