1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h>
37#include <linux/interrupt.h>
38#include <linux/capability.h>
39#include <linux/completion.h>
40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h>
42#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
45#include <linux/freezer.h>
46#include <linux/vmalloc.h>
47#include <linux/blkdev.h>
48#include <linux/delay.h>
49#include <linux/pid_namespace.h>
50#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
57#include <linux/kthread.h>
58#include <linux/proc_fs.h>
59#include <linux/seq_file.h>
60#include <linux/sysctl.h>
61#include <linux/syscalls.h>
62#include <linux/times.h>
63#include <linux/tsacct_kern.h>
64#include <linux/kprobes.h>
65#include <linux/delayacct.h>
66#include <linux/reciprocal_div.h>
67#include <linux/unistd.h>
68#include <linux/pagemap.h>
69#include <linux/hrtimer.h>
70#include <linux/tick.h>
71#include <linux/bootmem.h>
72#include <linux/debugfs.h>
73#include <linux/ctype.h>
74#include <linux/ftrace.h>
75#include <trace/sched.h>
76
77#include <asm/tlb.h>
78#include <asm/irq_regs.h>
79
80#include "sched_cpupri.h"
81
82
83
84
85
86
/*
 * Convert between nice values and static priorities: the nice value is
 * shifted by 20 and placed above the MAX_RT_PRIO priority levels.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)

/*
 * A 'user priority' is a priority with the MAX_RT_PRIO offset removed.
 * MAX_USER_PRIO is the user priority that corresponds to MAX_PRIO.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

/*
 * Convert a nanosecond count into jiffies (integer division, truncating).
 */
#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

/* NICE_0 names for the generic load scale and shift. */
#define NICE_0_LOAD		SCHED_LOAD_SCALE
#define NICE_0_SHIFT		SCHED_LOAD_SHIFT

/*
 * Default timeslice: 100 msecs, expressed in jiffies.
 */
#define DEF_TIMESLICE		(100 * HZ / 1000)

/*
 * All-ones u64; compared against rt_runtime values to mean "no limit"
 * (see global_rt_runtime()).
 */
#define RUNTIME_INF	((u64)~0ULL)
120
121#ifdef CONFIG_SMP
122
123
124
125
126static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
127{
128 return reciprocal_divide(load, sg->reciprocal_cpu_power);
129}
130
131
132
133
134
135static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
136{
137 sg->__cpu_power += val;
138 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
139}
140#endif
141
142static inline int rt_policy(int policy)
143{
144 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
145 return 1;
146 return 0;
147}
148
149static inline int task_has_rt_policy(struct task_struct *p)
150{
151 return rt_policy(p->policy);
152}
153
154
155
156
157struct rt_prio_array {
158 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);
159 struct list_head queue[MAX_RT_PRIO];
160};
161
162struct rt_bandwidth {
163
164 spinlock_t rt_runtime_lock;
165 ktime_t rt_period;
166 u64 rt_runtime;
167 struct hrtimer rt_period_timer;
168};
169
170static struct rt_bandwidth def_rt_bandwidth;
171
172static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
173
174static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
175{
176 struct rt_bandwidth *rt_b =
177 container_of(timer, struct rt_bandwidth, rt_period_timer);
178 ktime_t now;
179 int overrun;
180 int idle = 0;
181
182 for (;;) {
183 now = hrtimer_cb_get_time(timer);
184 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
185
186 if (!overrun)
187 break;
188
189 idle = do_sched_rt_period_timer(rt_b, overrun);
190 }
191
192 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
193}
194
195static
196void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
197{
198 rt_b->rt_period = ns_to_ktime(period);
199 rt_b->rt_runtime = runtime;
200
201 spin_lock_init(&rt_b->rt_runtime_lock);
202
203 hrtimer_init(&rt_b->rt_period_timer,
204 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
205 rt_b->rt_period_timer.function = sched_rt_period_timer;
206 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
207}
208
209static inline int rt_bandwidth_enabled(void)
210{
211 return sysctl_sched_rt_runtime >= 0;
212}
213
214static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
215{
216 ktime_t now;
217
218 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
219 return;
220
221 if (hrtimer_active(&rt_b->rt_period_timer))
222 return;
223
224 spin_lock(&rt_b->rt_runtime_lock);
225 for (;;) {
226 if (hrtimer_active(&rt_b->rt_period_timer))
227 break;
228
229 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
230 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
231 hrtimer_start_expires(&rt_b->rt_period_timer,
232 HRTIMER_MODE_ABS);
233 }
234 spin_unlock(&rt_b->rt_runtime_lock);
235}
236
237#ifdef CONFIG_RT_GROUP_SCHED
238static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
239{
240 hrtimer_cancel(&rt_b->rt_period_timer);
241}
242#endif
243
244
245
246
247
/* Mutex for sched-domain related state; its users are outside this excerpt. */
static DEFINE_MUTEX(sched_domains_mutex);
249
250#ifdef CONFIG_GROUP_SCHED
251
252#include <linux/cgroup.h>
253
254struct cfs_rq;
255
256static LIST_HEAD(task_groups);
257
258
259struct task_group {
260#ifdef CONFIG_CGROUP_SCHED
261 struct cgroup_subsys_state css;
262#endif
263
264#ifdef CONFIG_FAIR_GROUP_SCHED
265
266 struct sched_entity **se;
267
268 struct cfs_rq **cfs_rq;
269 unsigned long shares;
270#endif
271
272#ifdef CONFIG_RT_GROUP_SCHED
273 struct sched_rt_entity **rt_se;
274 struct rt_rq **rt_rq;
275
276 struct rt_bandwidth rt_bandwidth;
277#endif
278
279 struct rcu_head rcu;
280 struct list_head list;
281
282 struct task_group *parent;
283 struct list_head siblings;
284 struct list_head children;
285};
286
#ifdef CONFIG_USER_SCHED

/*
 * With user scheduling a dedicated root task group exists, together with
 * statically allocated per-cpu entities and runqueues.
 */
struct task_group root_task_group;

#ifdef CONFIG_FAIR_GROUP_SCHED
/* per-cpu fair-class entity */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* per-cpu fair-class runqueue */
static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
#endif /* CONFIG_FAIR_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED
/* per-cpu rt-class entity and runqueue */
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
#endif /* CONFIG_RT_GROUP_SCHED */

#else /* !CONFIG_USER_SCHED */

/* Without user scheduling the root group is simply init_task_group. */
#define root_task_group init_task_group

#endif /* CONFIG_USER_SCHED */
310
311
312
313
314static DEFINE_SPINLOCK(task_group_lock);
315
316#ifdef CONFIG_FAIR_GROUP_SCHED
317#ifdef CONFIG_USER_SCHED
318# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
319#else
320# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
321#endif
322
323
324
325
326
327
328
329
330
331#define MIN_SHARES 2
332#define MAX_SHARES (1UL << 18)
333
334static int init_task_group_load = INIT_TASK_GROUP_LOAD;
335#endif
336
337
338
339
340struct task_group init_task_group;
341
342
343static inline struct task_group *task_group(struct task_struct *p)
344{
345 struct task_group *tg;
346
347#ifdef CONFIG_USER_SCHED
348 tg = p->user->tg;
349#elif defined(CONFIG_CGROUP_SCHED)
350 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
351 struct task_group, css);
352#else
353 tg = &init_task_group;
354#endif
355 return tg;
356}
357
358
/*
 * Point the fair and/or rt scheduling entities of @p at the runqueue and
 * parent entity that its task group owns on @cpu.
 */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
	/* fair class: group cfs_rq and group entity for this cpu */
	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
	p->se.parent = task_group(p)->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	/* rt class: group rt_rq and group rt entity for this cpu */
	p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
	p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}
371
372#else
373
374static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
375static inline struct task_group *task_group(struct task_struct *p)
376{
377 return NULL;
378}
379
380#endif
381
382
383struct cfs_rq {
384 struct load_weight load;
385 unsigned long nr_running;
386
387 u64 exec_clock;
388 u64 min_vruntime;
389
390 struct rb_root tasks_timeline;
391 struct rb_node *rb_leftmost;
392
393 struct list_head tasks;
394 struct list_head *balance_iterator;
395
396
397
398
399
400 struct sched_entity *curr, *next, *last;
401
402 unsigned int nr_spread_over;
403
404#ifdef CONFIG_FAIR_GROUP_SCHED
405 struct rq *rq;
406
407
408
409
410
411
412
413
414
415 struct list_head leaf_cfs_rq_list;
416 struct task_group *tg;
417
418#ifdef CONFIG_SMP
419
420
421
422 unsigned long task_weight;
423
424
425
426
427
428
429
430 unsigned long h_load;
431
432
433
434
435 unsigned long shares;
436
437
438
439
440 unsigned long rq_weight;
441#endif
442#endif
443};
444
445
446struct rt_rq {
447 struct rt_prio_array active;
448 unsigned long rt_nr_running;
449#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
450 int highest_prio;
451#endif
452#ifdef CONFIG_SMP
453 unsigned long rt_nr_migratory;
454 int overloaded;
455#endif
456 int rt_throttled;
457 u64 rt_time;
458 u64 rt_runtime;
459
460 spinlock_t rt_runtime_lock;
461
462#ifdef CONFIG_RT_GROUP_SCHED
463 unsigned long rt_nr_boosted;
464
465 struct rq *rq;
466 struct list_head leaf_rt_rq_list;
467 struct task_group *tg;
468 struct sched_rt_entity *rt_se;
469#endif
470};
471
472#ifdef CONFIG_SMP
473
474
475
476
477
478
479
480
481
482struct root_domain {
483 atomic_t refcount;
484 cpumask_t span;
485 cpumask_t online;
486
487
488
489
490
491 cpumask_t rto_mask;
492 atomic_t rto_count;
493#ifdef CONFIG_SMP
494 struct cpupri cpupri;
495#endif
496};
497
498
499
500
501
502static struct root_domain def_root_domain;
503
504#endif
505
506
507
508
509
510
511
512
513struct rq {
514
515 spinlock_t lock;
516
517
518
519
520
521 unsigned long nr_running;
522 #define CPU_LOAD_IDX_MAX 5
523 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
524 unsigned char idle_at_tick;
525#ifdef CONFIG_NO_HZ
526 unsigned long last_tick_seen;
527 unsigned char in_nohz_recently;
528#endif
529
530 struct load_weight load;
531 unsigned long nr_load_updates;
532 u64 nr_switches;
533
534 struct cfs_rq cfs;
535 struct rt_rq rt;
536
537#ifdef CONFIG_FAIR_GROUP_SCHED
538
539 struct list_head leaf_cfs_rq_list;
540#endif
541#ifdef CONFIG_RT_GROUP_SCHED
542 struct list_head leaf_rt_rq_list;
543#endif
544
545
546
547
548
549
550
551 unsigned long nr_uninterruptible;
552
553 struct task_struct *curr, *idle;
554 unsigned long next_balance;
555 struct mm_struct *prev_mm;
556
557 u64 clock;
558
559 atomic_t nr_iowait;
560
561#ifdef CONFIG_SMP
562 struct root_domain *rd;
563 struct sched_domain *sd;
564
565
566 int active_balance;
567 int push_cpu;
568
569 int cpu;
570 int online;
571
572 unsigned long avg_load_per_task;
573
574 struct task_struct *migration_thread;
575 struct list_head migration_queue;
576#endif
577
578#ifdef CONFIG_SCHED_HRTICK
579#ifdef CONFIG_SMP
580 int hrtick_csd_pending;
581 struct call_single_data hrtick_csd;
582#endif
583 struct hrtimer hrtick_timer;
584#endif
585
586#ifdef CONFIG_SCHEDSTATS
587
588 struct sched_info rq_sched_info;
589
590
591 unsigned int yld_exp_empty;
592 unsigned int yld_act_empty;
593 unsigned int yld_both_empty;
594 unsigned int yld_count;
595
596
597 unsigned int sched_switch;
598 unsigned int sched_count;
599 unsigned int sched_goidle;
600
601
602 unsigned int ttwu_count;
603 unsigned int ttwu_local;
604
605
606 unsigned int bkl_count;
607#endif
608};
609
610static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
611
612static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
613{
614 rq->curr->sched_class->check_preempt_curr(rq, p, sync);
615}
616
/* Return the cpu number @rq belongs to; always 0 on uniprocessor builds. */
static inline int cpu_of(struct rq *rq)
{
#ifndef CONFIG_SMP
	return 0;
#else
	return rq->cpu;
#endif
}
625
626
627
628
629
630
631
632
/*
 * Walk the sched-domain chain of @cpu from rq->sd upwards via ->parent.
 * The head pointer is fetched with rcu_dereference().
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

/* Runqueue accessors: by cpu number, for this cpu, for a task's cpu. */
#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		(&__get_cpu_var(runqueues))
#define task_rq(p)		cpu_rq(task_cpu(p))
/* Task currently running on @cpu. */
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
640
641static inline void update_rq_clock(struct rq *rq)
642{
643 rq->clock = sched_clock_cpu(cpu_of(rq));
644}
645
646
647
648
/*
 * Tunables declared 'const_debug' are writable (__read_mostly) only when
 * CONFIG_SCHED_DEBUG is set; otherwise they are static constants.
 */
#ifndef CONFIG_SCHED_DEBUG
# define const_debug static const
#else
# define const_debug __read_mostly
#endif
654
655
656
657
658
659
660
661
662int runqueue_is_locked(void)
663{
664 int cpu = get_cpu();
665 struct rq *rq = cpu_rq(cpu);
666 int ret;
667
668 ret = spin_is_locked(&rq->lock);
669 put_cpu();
670 return ret;
671}
672
673
674
675
676
677#define SCHED_FEAT(name, enabled) \
678 __SCHED_FEAT_##name ,
679
680enum {
681#include "sched_features.h"
682};
683
684#undef SCHED_FEAT
685
686#define SCHED_FEAT(name, enabled) \
687 (1UL << __SCHED_FEAT_##name) * enabled |
688
689const_debug unsigned int sysctl_sched_features =
690#include "sched_features.h"
691 0;
692
693#undef SCHED_FEAT
694
695#ifdef CONFIG_SCHED_DEBUG
696#define SCHED_FEAT(name, enabled) \
697 #name ,
698
699static __read_mostly char *sched_feat_names[] = {
700#include "sched_features.h"
701 NULL
702};
703
704#undef SCHED_FEAT
705
706static int sched_feat_open(struct inode *inode, struct file *filp)
707{
708 filp->private_data = inode->i_private;
709 return 0;
710}
711
712static ssize_t
713sched_feat_read(struct file *filp, char __user *ubuf,
714 size_t cnt, loff_t *ppos)
715{
716 char *buf;
717 int r = 0;
718 int len = 0;
719 int i;
720
721 for (i = 0; sched_feat_names[i]; i++) {
722 len += strlen(sched_feat_names[i]);
723 len += 4;
724 }
725
726 buf = kmalloc(len + 2, GFP_KERNEL);
727 if (!buf)
728 return -ENOMEM;
729
730 for (i = 0; sched_feat_names[i]; i++) {
731 if (sysctl_sched_features & (1UL << i))
732 r += sprintf(buf + r, "%s ", sched_feat_names[i]);
733 else
734 r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
735 }
736
737 r += sprintf(buf + r, "\n");
738 WARN_ON(r >= len + 2);
739
740 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
741
742 kfree(buf);
743
744 return r;
745}
746
747static ssize_t
748sched_feat_write(struct file *filp, const char __user *ubuf,
749 size_t cnt, loff_t *ppos)
750{
751 char buf[64];
752 char *cmp = buf;
753 int neg = 0;
754 int i;
755
756 if (cnt > 63)
757 cnt = 63;
758
759 if (copy_from_user(&buf, ubuf, cnt))
760 return -EFAULT;
761
762 buf[cnt] = 0;
763
764 if (strncmp(buf, "NO_", 3) == 0) {
765 neg = 1;
766 cmp += 3;
767 }
768
769 for (i = 0; sched_feat_names[i]; i++) {
770 int len = strlen(sched_feat_names[i]);
771
772 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
773 if (neg)
774 sysctl_sched_features &= ~(1UL << i);
775 else
776 sysctl_sched_features |= (1UL << i);
777 break;
778 }
779 }
780
781 if (!sched_feat_names[i])
782 return -EINVAL;
783
784 filp->f_pos += cnt;
785
786 return cnt;
787}
788
789static struct file_operations sched_feat_fops = {
790 .open = sched_feat_open,
791 .read = sched_feat_read,
792 .write = sched_feat_write,
793};
794
795static __init int sched_init_debug(void)
796{
797 debugfs_create_file("sched_features", 0644, NULL, NULL,
798 &sched_feat_fops);
799
800 return 0;
801}
802late_initcall(sched_init_debug);
803
804#endif
805
/* Non-zero when scheduler feature @x is set in sysctl_sched_features. */
#define sched_feat(x)	(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
807
808
809
810
811
812const_debug unsigned int sysctl_sched_nr_migrate = 32;
813
814
815
816
817
818unsigned int sysctl_sched_shares_ratelimit = 250000;
819
820
821
822
823
824
825unsigned int sysctl_sched_shares_thresh = 4;
826
827
828
829
830
831unsigned int sysctl_sched_rt_period = 1000000;
832
833static __read_mostly int scheduler_running;
834
835
836
837
838
839int sysctl_sched_rt_runtime = 950000;
840
841static inline u64 global_rt_period(void)
842{
843 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
844}
845
846static inline u64 global_rt_runtime(void)
847{
848 if (sysctl_sched_rt_runtime < 0)
849 return RUNTIME_INF;
850
851 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
852}
853
/* No-op defaults for architectures that do not provide these hooks. */
#ifndef prepare_arch_switch
# define prepare_arch_switch(next)	do { } while (0)
#endif

#ifndef finish_arch_switch
# define finish_arch_switch(prev)	do { } while (0)
#endif
860
861static inline int task_current(struct rq *rq, struct task_struct *p)
862{
863 return rq->curr == p;
864}
865
866#ifndef __ARCH_WANT_UNLOCKED_CTXSW
/* Locked context switch: a task is running iff it is rq->curr. */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	int running = task_current(rq, p);

	return running;
}
871
/* Locked context switch: nothing to prepare. */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
	/* intentionally empty */
}
875
876static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
877{
878#ifdef CONFIG_DEBUG_SPINLOCK
879
880 rq->lock.owner = current;
881#endif
882
883
884
885
886
887 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
888
889 spin_unlock_irq(&rq->lock);
890}
891
892#else
/*
 * Unlocked context switch: on SMP a task counts as running while its
 * ->oncpu flag is set; on UP it is running iff it is rq->curr.
 */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifndef CONFIG_SMP
	return task_current(rq, p);
#else
	return p->oncpu;
#endif
}
901
902static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
903{
904#ifdef CONFIG_SMP
905
906
907
908
909
910 next->oncpu = 1;
911#endif
912#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
913 spin_unlock_irq(&rq->lock);
914#else
915 spin_unlock(&rq->lock);
916#endif
917}
918
/*
 * Unlocked context switch: clear @prev's ->oncpu flag (SMP only) and, if
 * interrupts were kept off across the switch, turn them back on.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/* Order all earlier stores before the flag is seen as cleared. */
	smp_wmb();
	prev->oncpu = 0;
#endif

#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
934#endif
935
936
937
938
939
940static inline struct rq *__task_rq_lock(struct task_struct *p)
941 __acquires(rq->lock)
942{
943 for (;;) {
944 struct rq *rq = task_rq(p);
945 spin_lock(&rq->lock);
946 if (likely(rq == task_rq(p)))
947 return rq;
948 spin_unlock(&rq->lock);
949 }
950}
951
952
953
954
955
956
957static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
958 __acquires(rq->lock)
959{
960 struct rq *rq;
961
962 for (;;) {
963 local_irq_save(*flags);
964 rq = task_rq(p);
965 spin_lock(&rq->lock);
966 if (likely(rq == task_rq(p)))
967 return rq;
968 spin_unlock_irqrestore(&rq->lock, *flags);
969 }
970}
971
972void task_rq_unlock_wait(struct task_struct *p)
973{
974 struct rq *rq = task_rq(p);
975
976 smp_mb();
977 spin_unlock_wait(&rq->lock);
978}
979
980static void __task_rq_unlock(struct rq *rq)
981 __releases(rq->lock)
982{
983 spin_unlock(&rq->lock);
984}
985
986static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
987 __releases(rq->lock)
988{
989 spin_unlock_irqrestore(&rq->lock, *flags);
990}
991
992
993
994
995static struct rq *this_rq_lock(void)
996 __acquires(rq->lock)
997{
998 struct rq *rq;
999
1000 local_irq_disable();
1001 rq = this_rq();
1002 spin_lock(&rq->lock);
1003
1004 return rq;
1005}
1006
1007#ifdef CONFIG_SCHED_HRTICK
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024static inline int hrtick_enabled(struct rq *rq)
1025{
1026 if (!sched_feat(HRTICK))
1027 return 0;
1028 if (!cpu_active(cpu_of(rq)))
1029 return 0;
1030 return hrtimer_is_hres_active(&rq->hrtick_timer);
1031}
1032
1033static void hrtick_clear(struct rq *rq)
1034{
1035 if (hrtimer_active(&rq->hrtick_timer))
1036 hrtimer_cancel(&rq->hrtick_timer);
1037}
1038
1039
1040
1041
1042
1043static enum hrtimer_restart hrtick(struct hrtimer *timer)
1044{
1045 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1046
1047 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1048
1049 spin_lock(&rq->lock);
1050 update_rq_clock(rq);
1051 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1052 spin_unlock(&rq->lock);
1053
1054 return HRTIMER_NORESTART;
1055}
1056
1057#ifdef CONFIG_SMP
1058
1059
1060
1061static void __hrtick_start(void *arg)
1062{
1063 struct rq *rq = arg;
1064
1065 spin_lock(&rq->lock);
1066 hrtimer_restart(&rq->hrtick_timer);
1067 rq->hrtick_csd_pending = 0;
1068 spin_unlock(&rq->lock);
1069}
1070
1071
1072
1073
1074
1075
1076static void hrtick_start(struct rq *rq, u64 delay)
1077{
1078 struct hrtimer *timer = &rq->hrtick_timer;
1079 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1080
1081 hrtimer_set_expires(timer, time);
1082
1083 if (rq == this_rq()) {
1084 hrtimer_restart(timer);
1085 } else if (!rq->hrtick_csd_pending) {
1086 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
1087 rq->hrtick_csd_pending = 1;
1088 }
1089}
1090
1091static int
1092hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1093{
1094 int cpu = (int)(long)hcpu;
1095
1096 switch (action) {
1097 case CPU_UP_CANCELED:
1098 case CPU_UP_CANCELED_FROZEN:
1099 case CPU_DOWN_PREPARE:
1100 case CPU_DOWN_PREPARE_FROZEN:
1101 case CPU_DEAD:
1102 case CPU_DEAD_FROZEN:
1103 hrtick_clear(cpu_rq(cpu));
1104 return NOTIFY_OK;
1105 }
1106
1107 return NOTIFY_DONE;
1108}
1109
1110static __init void init_hrtick(void)
1111{
1112 hotcpu_notifier(hotplug_hrtick, 0);
1113}
1114#else
1115
1116
1117
1118
1119
1120static void hrtick_start(struct rq *rq, u64 delay)
1121{
1122 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
1123}
1124
/* UP: no hotplug notifier is needed. */
static inline void init_hrtick(void)
{
	/* nothing to do */
}
1128#endif
1129
1130static void init_rq_hrtick(struct rq *rq)
1131{
1132#ifdef CONFIG_SMP
1133 rq->hrtick_csd_pending = 0;
1134
1135 rq->hrtick_csd.flags = 0;
1136 rq->hrtick_csd.func = __hrtick_start;
1137 rq->hrtick_csd.info = rq;
1138#endif
1139
1140 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1141 rq->hrtick_timer.function = hrtick;
1142 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1143}
1144#else
/* !CONFIG_SCHED_HRTICK: the hrtick helpers compile away to nothing. */
static inline void hrtick_clear(struct rq *rq)
{
	/* no hrtick timer to cancel */
}

static inline void init_rq_hrtick(struct rq *rq)
{
	/* no hrtick state to initialise */
}

static inline void init_hrtick(void)
{
	/* no notifier to register */
}
1156#endif
1157
1158
1159
1160
1161
1162
1163
1164
1165#ifdef CONFIG_SMP
1166
/*
 * Default definition, overridable by the architecture: a task is polling
 * when its TIF_POLLING_NRFLAG thread flag is set.
 */
#ifndef tsk_is_polling
#define tsk_is_polling(t)	test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif
1170
1171static void resched_task(struct task_struct *p)
1172{
1173 int cpu;
1174
1175 assert_spin_locked(&task_rq(p)->lock);
1176
1177 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
1178 return;
1179
1180 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1181
1182 cpu = task_cpu(p);
1183 if (cpu == smp_processor_id())
1184 return;
1185
1186
1187 smp_mb();
1188 if (!tsk_is_polling(p))
1189 smp_send_reschedule(cpu);
1190}
1191
1192static void resched_cpu(int cpu)
1193{
1194 struct rq *rq = cpu_rq(cpu);
1195 unsigned long flags;
1196
1197 if (!spin_trylock_irqsave(&rq->lock, flags))
1198 return;
1199 resched_task(cpu_curr(cpu));
1200 spin_unlock_irqrestore(&rq->lock, flags);
1201}
1202
1203#ifdef CONFIG_NO_HZ
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214void wake_up_idle_cpu(int cpu)
1215{
1216 struct rq *rq = cpu_rq(cpu);
1217
1218 if (cpu == smp_processor_id())
1219 return;
1220
1221
1222
1223
1224
1225
1226
1227
1228 if (rq->curr != rq->idle)
1229 return;
1230
1231
1232
1233
1234
1235
1236 set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
1237
1238
1239 smp_mb();
1240 if (!tsk_is_polling(rq->idle))
1241 smp_send_reschedule(cpu);
1242}
1243#endif
1244
1245#else
1246static void resched_task(struct task_struct *p)
1247{
1248 assert_spin_locked(&task_rq(p)->lock);
1249 set_tsk_need_resched(p);
1250}
1251#endif
1252
/*
 * Fixed-point constants for calc_delta_mine(): WMULT_CONST stands for
 * 2^32, approximated by the largest unsigned long on 32-bit builds.
 */
#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

/*
 * Shift right and round: adds half of the divisor (1 << (y - 1)) before
 * shifting @x right by @y bits.
 */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1265
1266
1267
1268
1269static unsigned long
1270calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1271 struct load_weight *lw)
1272{
1273 u64 tmp;
1274
1275 if (!lw->inv_weight) {
1276 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1277 lw->inv_weight = 1;
1278 else
1279 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1280 / (lw->weight+1);
1281 }
1282
1283 tmp = (u64)delta_exec * weight;
1284
1285
1286
1287 if (unlikely(tmp > WMULT_CONST))
1288 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1289 WMULT_SHIFT/2);
1290 else
1291 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1292
1293 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1294}
1295
1296static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1297{
1298 lw->weight += inc;
1299 lw->inv_weight = 0;
1300}
1301
1302static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1303{
1304 lw->weight -= dec;
1305 lw->inv_weight = 0;
1306}
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
/*
 * Load weight and inverse weight used by set_load_weight() for SCHED_IDLE
 * tasks.  The inverse is 2^32 / WEIGHT_IDLEPRIO = 2^31.
 *
 * The shift is done on an unsigned long: "1 << 31" shifts into the sign
 * bit of an int, which is undefined in C and in practice yields a
 * negative value that sign-extends when stored in a wider unsigned field.
 */
#define WEIGHT_IDLEPRIO		2
#define WMULT_IDLEPRIO		(1UL << 31)
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
/*
 * Load weight for each of the 40 priority slots above MAX_RT_PRIO;
 * set_load_weight() indexes it with static_prio - MAX_RT_PRIO.  Slot 20
 * holds 1024.
 */
static const int prio_to_weight[40] = {
 /*  0 */     88761,     71755,     56483,     46273,     36291,
 /*  5 */     29154,     23254,     18705,     14949,     11916,
 /* 10 */      9548,      7620,      6100,      4904,      3906,
 /* 15 */      3121,      2501,      1991,      1586,      1277,
 /* 20 */      1024,       820,       655,       526,       423,
 /* 25 */       335,       272,       215,       172,       137,
 /* 30 */       110,        87,        70,        56,        45,
 /* 35 */        36,        29,        23,        18,        15,
};
1342
1343
1344
1345
1346
1347
1348
1349
1350static const u32 prio_to_wmult[40] = {
1351 48388, 59856, 76040, 92818, 118348,
1352 147320, 184698, 229616, 287308, 360437,
1353 449829, 563644, 704093, 875809, 1099582,
1354 1376151, 1717300, 2157191, 2708050, 3363326,
1355 4194304, 5237765, 6557202, 8165337, 10153587,
1356 12820798, 15790321, 19976592, 24970740, 31350126,
1357 39045157, 49367440, 61356676, 76695844, 95443717,
1358 119304647, 148102320, 186737708, 238609294, 286331153,
1359};
1360
/* Defined further down; declared early for the included class files. */
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);

/*
 * Generic iterator over tasks: @start and @next are each called with
 * @arg and return a task (presumably NULL once exhausted -- TODO confirm).
 * It is handed to balance_tasks() and iter_move_one_task().
 */
struct rq_iterator {
	void *arg;
	struct task_struct *(*start)(void *);
	struct task_struct *(*next)(void *);
};
1373
1374#ifdef CONFIG_SMP
1375static unsigned long
1376balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1377 unsigned long max_load_move, struct sched_domain *sd,
1378 enum cpu_idle_type idle, int *all_pinned,
1379 int *this_best_prio, struct rq_iterator *iterator);
1380
1381static int
1382iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1383 struct sched_domain *sd, enum cpu_idle_type idle,
1384 struct rq_iterator *iterator);
1385#endif
1386
1387#ifdef CONFIG_CGROUP_CPUACCT
1388static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1389#else
1390static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1391#endif
1392
1393static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1394{
1395 update_load_add(&rq->load, load);
1396}
1397
1398static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1399{
1400 update_load_sub(&rq->load, load);
1401}
1402
1403#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1404typedef int (*tg_visitor)(struct task_group *, void *);
1405
1406
1407
1408
1409
1410static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1411{
1412 struct task_group *parent, *child;
1413 int ret;
1414
1415 rcu_read_lock();
1416 parent = &root_task_group;
1417down:
1418 ret = (*down)(parent, data);
1419 if (ret)
1420 goto out_unlock;
1421 list_for_each_entry_rcu(child, &parent->children, siblings) {
1422 parent = child;
1423 goto down;
1424
1425up:
1426 continue;
1427 }
1428 ret = (*up)(parent, data);
1429 if (ret)
1430 goto out_unlock;
1431
1432 child = parent;
1433 parent = parent->parent;
1434 if (parent)
1435 goto up;
1436out_unlock:
1437 rcu_read_unlock();
1438
1439 return ret;
1440}
1441
/* Visitor that does nothing; used when only one walk direction matters. */
static int tg_nop(struct task_group *tg, void *data)
{
	(void)tg;
	(void)data;

	return 0;
}
1446#endif
1447
1448#ifdef CONFIG_SMP
1449static unsigned long source_load(int cpu, int type);
1450static unsigned long target_load(int cpu, int type);
1451static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1452
1453static unsigned long cpu_avg_load_per_task(int cpu)
1454{
1455 struct rq *rq = cpu_rq(cpu);
1456 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1457
1458 if (nr_running)
1459 rq->avg_load_per_task = rq->load.weight / nr_running;
1460 else
1461 rq->avg_load_per_task = 0;
1462
1463 return rq->avg_load_per_task;
1464}
1465
1466#ifdef CONFIG_FAIR_GROUP_SCHED
1467
1468static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1469
1470
1471
1472
1473static void
1474update_group_shares_cpu(struct task_group *tg, int cpu,
1475 unsigned long sd_shares, unsigned long sd_rq_weight)
1476{
1477 int boost = 0;
1478 unsigned long shares;
1479 unsigned long rq_weight;
1480
1481 if (!tg->se[cpu])
1482 return;
1483
1484 rq_weight = tg->cfs_rq[cpu]->load.weight;
1485
1486
1487
1488
1489
1490
1491 if (!rq_weight) {
1492 boost = 1;
1493 rq_weight = NICE_0_LOAD;
1494 }
1495
1496 if (unlikely(rq_weight > sd_rq_weight))
1497 rq_weight = sd_rq_weight;
1498
1499
1500
1501
1502
1503
1504
1505 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
1506 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1507
1508 if (abs(shares - tg->se[cpu]->load.weight) >
1509 sysctl_sched_shares_thresh) {
1510 struct rq *rq = cpu_rq(cpu);
1511 unsigned long flags;
1512
1513 spin_lock_irqsave(&rq->lock, flags);
1514
1515
1516
1517 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1518 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1519
1520 __set_se_shares(tg->se[cpu], shares);
1521 spin_unlock_irqrestore(&rq->lock, flags);
1522 }
1523}
1524
1525
1526
1527
1528
1529
1530static int tg_shares_up(struct task_group *tg, void *data)
1531{
1532 unsigned long rq_weight = 0;
1533 unsigned long shares = 0;
1534 struct sched_domain *sd = data;
1535 int i;
1536
1537 for_each_cpu_mask(i, sd->span) {
1538 rq_weight += tg->cfs_rq[i]->load.weight;
1539 shares += tg->cfs_rq[i]->shares;
1540 }
1541
1542 if ((!shares && rq_weight) || shares > tg->shares)
1543 shares = tg->shares;
1544
1545 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1546 shares = tg->shares;
1547
1548 if (!rq_weight)
1549 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1550
1551 for_each_cpu_mask(i, sd->span)
1552 update_group_shares_cpu(tg, i, shares, rq_weight);
1553
1554 return 0;
1555}
1556
1557
1558
1559
1560
1561
1562static int tg_load_down(struct task_group *tg, void *data)
1563{
1564 unsigned long load;
1565 long cpu = (long)data;
1566
1567 if (!tg->parent) {
1568 load = cpu_rq(cpu)->load.weight;
1569 } else {
1570 load = tg->parent->cfs_rq[cpu]->h_load;
1571 load *= tg->cfs_rq[cpu]->shares;
1572 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1573 }
1574
1575 tg->cfs_rq[cpu]->h_load = load;
1576
1577 return 0;
1578}
1579
1580static void update_shares(struct sched_domain *sd)
1581{
1582 u64 now = cpu_clock(raw_smp_processor_id());
1583 s64 elapsed = now - sd->last_update;
1584
1585 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1586 sd->last_update = now;
1587 walk_tg_tree(tg_nop, tg_shares_up, sd);
1588 }
1589}
1590
1591static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1592{
1593 spin_unlock(&rq->lock);
1594 update_shares(sd);
1595 spin_lock(&rq->lock);
1596}
1597
1598static void update_h_load(long cpu)
1599{
1600 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1601}
1602
1603#else
1604
/* !CONFIG_FAIR_GROUP_SCHED: there are no group shares to maintain. */
static inline void update_shares(struct sched_domain *sd)
{
	/* nothing to do */
}

static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
{
	/* nothing to do; rq->lock stays held */
}
1612
1613#endif
1614
1615#endif
1616
1617#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Store @shares in @cfs_rq.  The field only exists on SMP builds, so this
 * is a no-op on uniprocessor kernels.
 */
static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
{
#ifdef CONFIG_SMP
	cfs_rq->shares = shares;
#endif /* CONFIG_SMP */
}
1624#endif
1625
1626#include "sched_stats.h"
1627#include "sched_idletask.c"
1628#include "sched_fair.c"
1629#include "sched_rt.c"
1630#ifdef CONFIG_SCHED_DEBUG
1631# include "sched_debug.c"
1632#endif
1633
/*
 * Scheduling classes form a list linked through ->next; iteration starts
 * at the rt class.
 */
#define sched_class_highest	(&rt_sched_class)
#define for_each_class(class) \
	for (class = sched_class_highest; class; class = class->next)
1637
1638static void inc_nr_running(struct rq *rq)
1639{
1640 rq->nr_running++;
1641}
1642
1643static void dec_nr_running(struct rq *rq)
1644{
1645 rq->nr_running--;
1646}
1647
1648static void set_load_weight(struct task_struct *p)
1649{
1650 if (task_has_rt_policy(p)) {
1651 p->se.load.weight = prio_to_weight[0] * 2;
1652 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1653 return;
1654 }
1655
1656
1657
1658
1659 if (p->policy == SCHED_IDLE) {
1660 p->se.load.weight = WEIGHT_IDLEPRIO;
1661 p->se.load.inv_weight = WMULT_IDLEPRIO;
1662 return;
1663 }
1664
1665 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1666 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1667}
1668
1669static void update_avg(u64 *avg, u64 sample)
1670{
1671 s64 diff = sample - *avg;
1672 *avg += diff >> 3;
1673}
1674
1675static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1676{
1677 sched_info_queued(p);
1678 p->sched_class->enqueue_task(rq, p, wakeup);
1679 p->se.on_rq = 1;
1680}
1681
1682static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1683{
1684 if (sleep && p->se.last_wakeup) {
1685 update_avg(&p->se.avg_overlap,
1686 p->se.sum_exec_runtime - p->se.last_wakeup);
1687 p->se.last_wakeup = 0;
1688 }
1689
1690 sched_info_dequeued(p);
1691 p->sched_class->dequeue_task(rq, p, sleep);
1692 p->se.on_rq = 0;
1693}
1694
1695
1696
1697
1698static inline int __normal_prio(struct task_struct *p)
1699{
1700 return p->static_prio;
1701}
1702
1703
1704
1705
1706
1707
1708
1709
1710static inline int normal_prio(struct task_struct *p)
1711{
1712 int prio;
1713
1714 if (task_has_rt_policy(p))
1715 prio = MAX_RT_PRIO-1 - p->rt_priority;
1716 else
1717 prio = __normal_prio(p);
1718 return prio;
1719}
1720
1721
1722
1723
1724
1725
1726
1727
1728static int effective_prio(struct task_struct *p)
1729{
1730 p->normal_prio = normal_prio(p);
1731
1732
1733
1734
1735
1736 if (!rt_prio(p->prio))
1737 return p->normal_prio;
1738 return p->prio;
1739}
1740
1741
1742
1743
1744static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1745{
1746 if (task_contributes_to_load(p))
1747 rq->nr_uninterruptible--;
1748
1749 enqueue_task(rq, p, wakeup);
1750 inc_nr_running(rq);
1751}
1752
1753
1754
1755
1756static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1757{
1758 if (task_contributes_to_load(p))
1759 rq->nr_uninterruptible++;
1760
1761 dequeue_task(rq, p, sleep);
1762 dec_nr_running(rq);
1763}
1764
1765
1766
1767
1768
/* Return non-zero if @p is the task currently running on its own CPU. */
inline int task_curr(const struct task_struct *p)
{
	return p == cpu_curr(task_cpu(p));
}
1773
/*
 * Bind @p to @cpu: update its runqueue association via set_task_rq()
 * and, on SMP, record the CPU number in the task's thread_info.
 */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_rq(p, cpu);
#ifdef CONFIG_SMP
	/*
	 * Write barrier: the stores above are ordered before the new CPU
	 * number becomes visible.
	 */
	smp_wmb();
	task_thread_info(p)->cpu = cpu;
#endif
}
1787
1788static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1789 const struct sched_class *prev_class,
1790 int oldprio, int running)
1791{
1792 if (prev_class != p->sched_class) {
1793 if (prev_class->switched_from)
1794 prev_class->switched_from(rq, p, running);
1795 p->sched_class->switched_to(rq, p, running);
1796 } else
1797 p->sched_class->prio_changed(rq, p, oldprio, running);
1798}
1799
1800#ifdef CONFIG_SMP
1801
1802
1803static unsigned long weighted_cpuload(const int cpu)
1804{
1805 return cpu_rq(cpu)->load.weight;
1806}
1807
1808
1809
1810
1811static int
1812task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1813{
1814 s64 delta;
1815
1816
1817
1818
1819 if (sched_feat(CACHE_HOT_BUDDY) &&
1820 (&p->se == cfs_rq_of(&p->se)->next ||
1821 &p->se == cfs_rq_of(&p->se)->last))
1822 return 1;
1823
1824 if (p->sched_class != &fair_sched_class)
1825 return 0;
1826
1827 if (sysctl_sched_migration_cost == -1)
1828 return 1;
1829 if (sysctl_sched_migration_cost == 0)
1830 return 0;
1831
1832 delta = now - p->se.exec_start;
1833
1834 return delta < (s64)sysctl_sched_migration_cost;
1835}
1836
1837
1838void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1839{
1840 int old_cpu = task_cpu(p);
1841 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1842 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
1843 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
1844 u64 clock_offset;
1845
1846 clock_offset = old_rq->clock - new_rq->clock;
1847
1848#ifdef CONFIG_SCHEDSTATS
1849 if (p->se.wait_start)
1850 p->se.wait_start -= clock_offset;
1851 if (p->se.sleep_start)
1852 p->se.sleep_start -= clock_offset;
1853 if (p->se.block_start)
1854 p->se.block_start -= clock_offset;
1855 if (old_cpu != new_cpu) {
1856 schedstat_inc(p, se.nr_migrations);
1857 if (task_hot(p, old_rq->clock, NULL))
1858 schedstat_inc(p, se.nr_forced2_migrations);
1859 }
1860#endif
1861 p->se.vruntime -= old_cfsrq->min_vruntime -
1862 new_cfsrq->min_vruntime;
1863
1864 __set_task_cpu(p, new_cpu);
1865}
1866
1867struct migration_req {
1868 struct list_head list;
1869
1870 struct task_struct *task;
1871 int dest_cpu;
1872
1873 struct completion done;
1874};
1875
1876
1877
1878
1879
1880static int
1881migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1882{
1883 struct rq *rq = task_rq(p);
1884
1885
1886
1887
1888
1889 if (!p->se.on_rq && !task_running(rq, p)) {
1890 set_task_cpu(p, dest_cpu);
1891 return 0;
1892 }
1893
1894 init_completion(&req->done);
1895 req->task = p;
1896 req->dest_cpu = dest_cpu;
1897 list_add(&req->list, &rq->migration_queue);
1898
1899 return 1;
1900}
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1919{
1920 unsigned long flags;
1921 int running, on_rq;
1922 unsigned long ncsw;
1923 struct rq *rq;
1924
1925 for (;;) {
1926
1927
1928
1929
1930
1931
1932 rq = task_rq(p);
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945 while (task_running(rq, p)) {
1946 if (match_state && unlikely(p->state != match_state))
1947 return 0;
1948 cpu_relax();
1949 }
1950
1951
1952
1953
1954
1955
1956 rq = task_rq_lock(p, &flags);
1957 trace_sched_wait_task(rq, p);
1958 running = task_running(rq, p);
1959 on_rq = p->se.on_rq;
1960 ncsw = 0;
1961 if (!match_state || p->state == match_state)
1962 ncsw = p->nvcsw | LONG_MIN;
1963 task_rq_unlock(rq, &flags);
1964
1965
1966
1967
1968 if (unlikely(!ncsw))
1969 break;
1970
1971
1972
1973
1974
1975
1976
1977 if (unlikely(running)) {
1978 cpu_relax();
1979 continue;
1980 }
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991 if (unlikely(on_rq)) {
1992 schedule_timeout_uninterruptible(1);
1993 continue;
1994 }
1995
1996
1997
1998
1999
2000
2001 break;
2002 }
2003
2004 return ncsw;
2005}
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
/*
 * Send a reschedule IPI to the CPU @p is on, but only if that is not
 * the current CPU and @p is the task currently running there.
 * Preemption is disabled around the check.
 */
void kick_process(struct task_struct *p)
{
	int target;

	preempt_disable();
	target = task_cpu(p);
	if ((target != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(target);
	preempt_enable();
}
2030
2031
2032
2033
2034
2035
2036
2037
2038static unsigned long source_load(int cpu, int type)
2039{
2040 struct rq *rq = cpu_rq(cpu);
2041 unsigned long total = weighted_cpuload(cpu);
2042
2043 if (type == 0 || !sched_feat(LB_BIAS))
2044 return total;
2045
2046 return min(rq->cpu_load[type-1], total);
2047}
2048
2049
2050
2051
2052
2053static unsigned long target_load(int cpu, int type)
2054{
2055 struct rq *rq = cpu_rq(cpu);
2056 unsigned long total = weighted_cpuload(cpu);
2057
2058 if (type == 0 || !sched_feat(LB_BIAS))
2059 return total;
2060
2061 return max(rq->cpu_load[type-1], total);
2062}
2063
2064
2065
2066
2067
2068static struct sched_group *
2069find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2070{
2071 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2072 unsigned long min_load = ULONG_MAX, this_load = 0;
2073 int load_idx = sd->forkexec_idx;
2074 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2075
2076 do {
2077 unsigned long load, avg_load;
2078 int local_group;
2079 int i;
2080
2081
2082 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
2083 continue;
2084
2085 local_group = cpu_isset(this_cpu, group->cpumask);
2086
2087
2088 avg_load = 0;
2089
2090 for_each_cpu_mask_nr(i, group->cpumask) {
2091
2092 if (local_group)
2093 load = source_load(i, load_idx);
2094 else
2095 load = target_load(i, load_idx);
2096
2097 avg_load += load;
2098 }
2099
2100
2101 avg_load = sg_div_cpu_power(group,
2102 avg_load * SCHED_LOAD_SCALE);
2103
2104 if (local_group) {
2105 this_load = avg_load;
2106 this = group;
2107 } else if (avg_load < min_load) {
2108 min_load = avg_load;
2109 idlest = group;
2110 }
2111 } while (group = group->next, group != sd->groups);
2112
2113 if (!idlest || 100*this_load < imbalance*min_load)
2114 return NULL;
2115 return idlest;
2116}
2117
2118
2119
2120
2121static int
2122find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
2123 cpumask_t *tmp)
2124{
2125 unsigned long load, min_load = ULONG_MAX;
2126 int idlest = -1;
2127 int i;
2128
2129
2130 cpus_and(*tmp, group->cpumask, p->cpus_allowed);
2131
2132 for_each_cpu_mask_nr(i, *tmp) {
2133 load = weighted_cpuload(i);
2134
2135 if (load < min_load || (load == min_load && i == this_cpu)) {
2136 min_load = load;
2137 idlest = i;
2138 }
2139 }
2140
2141 return idlest;
2142}
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155static int sched_balance_self(int cpu, int flag)
2156{
2157 struct task_struct *t = current;
2158 struct sched_domain *tmp, *sd = NULL;
2159
2160 for_each_domain(cpu, tmp) {
2161
2162
2163
2164 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2165 break;
2166 if (tmp->flags & flag)
2167 sd = tmp;
2168 }
2169
2170 if (sd)
2171 update_shares(sd);
2172
2173 while (sd) {
2174 cpumask_t span, tmpmask;
2175 struct sched_group *group;
2176 int new_cpu, weight;
2177
2178 if (!(sd->flags & flag)) {
2179 sd = sd->child;
2180 continue;
2181 }
2182
2183 span = sd->span;
2184 group = find_idlest_group(sd, t, cpu);
2185 if (!group) {
2186 sd = sd->child;
2187 continue;
2188 }
2189
2190 new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
2191 if (new_cpu == -1 || new_cpu == cpu) {
2192
2193 sd = sd->child;
2194 continue;
2195 }
2196
2197
2198 cpu = new_cpu;
2199 sd = NULL;
2200 weight = cpus_weight(span);
2201 for_each_domain(cpu, tmp) {
2202 if (weight <= cpus_weight(tmp->span))
2203 break;
2204 if (tmp->flags & flag)
2205 sd = tmp;
2206 }
2207
2208 }
2209
2210 return cpu;
2211}
2212
2213#endif
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2230{
2231 int cpu, orig_cpu, this_cpu, success = 0;
2232 unsigned long flags;
2233 long old_state;
2234 struct rq *rq;
2235
2236 if (!sched_feat(SYNC_WAKEUPS))
2237 sync = 0;
2238
2239#ifdef CONFIG_SMP
2240 if (sched_feat(LB_WAKEUP_UPDATE)) {
2241 struct sched_domain *sd;
2242
2243 this_cpu = raw_smp_processor_id();
2244 cpu = task_cpu(p);
2245
2246 for_each_domain(this_cpu, sd) {
2247 if (cpu_isset(cpu, sd->span)) {
2248 update_shares(sd);
2249 break;
2250 }
2251 }
2252 }
2253#endif
2254
2255 smp_wmb();
2256 rq = task_rq_lock(p, &flags);
2257 old_state = p->state;
2258 if (!(old_state & state))
2259 goto out;
2260
2261 if (p->se.on_rq)
2262 goto out_running;
2263
2264 cpu = task_cpu(p);
2265 orig_cpu = cpu;
2266 this_cpu = smp_processor_id();
2267
2268#ifdef CONFIG_SMP
2269 if (unlikely(task_running(rq, p)))
2270 goto out_activate;
2271
2272 cpu = p->sched_class->select_task_rq(p, sync);
2273 if (cpu != orig_cpu) {
2274 set_task_cpu(p, cpu);
2275 task_rq_unlock(rq, &flags);
2276
2277 rq = task_rq_lock(p, &flags);
2278 old_state = p->state;
2279 if (!(old_state & state))
2280 goto out;
2281 if (p->se.on_rq)
2282 goto out_running;
2283
2284 this_cpu = smp_processor_id();
2285 cpu = task_cpu(p);
2286 }
2287
2288#ifdef CONFIG_SCHEDSTATS
2289 schedstat_inc(rq, ttwu_count);
2290 if (cpu == this_cpu)
2291 schedstat_inc(rq, ttwu_local);
2292 else {
2293 struct sched_domain *sd;
2294 for_each_domain(this_cpu, sd) {
2295 if (cpu_isset(cpu, sd->span)) {
2296 schedstat_inc(sd, ttwu_wake_remote);
2297 break;
2298 }
2299 }
2300 }
2301#endif
2302
2303out_activate:
2304#endif
2305 schedstat_inc(p, se.nr_wakeups);
2306 if (sync)
2307 schedstat_inc(p, se.nr_wakeups_sync);
2308 if (orig_cpu != cpu)
2309 schedstat_inc(p, se.nr_wakeups_migrate);
2310 if (cpu == this_cpu)
2311 schedstat_inc(p, se.nr_wakeups_local);
2312 else
2313 schedstat_inc(p, se.nr_wakeups_remote);
2314 update_rq_clock(rq);
2315 activate_task(rq, p, 1);
2316 success = 1;
2317
2318out_running:
2319 trace_sched_wakeup(rq, p);
2320 check_preempt_curr(rq, p, sync);
2321
2322 p->state = TASK_RUNNING;
2323#ifdef CONFIG_SMP
2324 if (p->sched_class->task_wake_up)
2325 p->sched_class->task_wake_up(rq, p);
2326#endif
2327out:
2328 current->se.last_wakeup = current->se.sum_exec_runtime;
2329
2330 task_rq_unlock(rq, &flags);
2331
2332 return success;
2333}
2334
2335int wake_up_process(struct task_struct *p)
2336{
2337 return try_to_wake_up(p, TASK_ALL, 0);
2338}
2339EXPORT_SYMBOL(wake_up_process);
2340
/*
 * Wake @p only if its state matches the @state mask, as a non-sync
 * wakeup.  Returns try_to_wake_up()'s result.
 */
int wake_up_state(struct task_struct *p, unsigned int state)
{
	const int sync = 0;

	return try_to_wake_up(p, state, sync);
}
2345
2346
2347
2348
2349
2350
2351
2352static void __sched_fork(struct task_struct *p)
2353{
2354 p->se.exec_start = 0;
2355 p->se.sum_exec_runtime = 0;
2356 p->se.prev_sum_exec_runtime = 0;
2357 p->se.last_wakeup = 0;
2358 p->se.avg_overlap = 0;
2359
2360#ifdef CONFIG_SCHEDSTATS
2361 p->se.wait_start = 0;
2362 p->se.sum_sleep_runtime = 0;
2363 p->se.sleep_start = 0;
2364 p->se.block_start = 0;
2365 p->se.sleep_max = 0;
2366 p->se.block_max = 0;
2367 p->se.exec_max = 0;
2368 p->se.slice_max = 0;
2369 p->se.wait_max = 0;
2370#endif
2371
2372 INIT_LIST_HEAD(&p->rt.run_list);
2373 p->se.on_rq = 0;
2374 INIT_LIST_HEAD(&p->se.group_node);
2375
2376#ifdef CONFIG_PREEMPT_NOTIFIERS
2377 INIT_HLIST_HEAD(&p->preempt_notifiers);
2378#endif
2379
2380
2381
2382
2383
2384
2385
2386 p->state = TASK_RUNNING;
2387}
2388
2389
2390
2391
2392void sched_fork(struct task_struct *p, int clone_flags)
2393{
2394 int cpu = get_cpu();
2395
2396 __sched_fork(p);
2397
2398#ifdef CONFIG_SMP
2399 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2400#endif
2401 set_task_cpu(p, cpu);
2402
2403
2404
2405
2406 p->prio = current->normal_prio;
2407 if (!rt_prio(p->prio))
2408 p->sched_class = &fair_sched_class;
2409
2410#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2411 if (likely(sched_info_on()))
2412 memset(&p->sched_info, 0, sizeof(p->sched_info));
2413#endif
2414#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2415 p->oncpu = 0;
2416#endif
2417#ifdef CONFIG_PREEMPT
2418
2419 task_thread_info(p)->preempt_count = 1;
2420#endif
2421 put_cpu();
2422}
2423
2424
2425
2426
2427
2428
2429
2430
2431void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2432{
2433 unsigned long flags;
2434 struct rq *rq;
2435
2436 rq = task_rq_lock(p, &flags);
2437 BUG_ON(p->state != TASK_RUNNING);
2438 update_rq_clock(rq);
2439
2440 p->prio = effective_prio(p);
2441
2442 if (!p->sched_class->task_new || !current->se.on_rq) {
2443 activate_task(rq, p, 0);
2444 } else {
2445
2446
2447
2448
2449 p->sched_class->task_new(rq, p);
2450 inc_nr_running(rq);
2451 }
2452 trace_sched_wakeup_new(rq, p);
2453 check_preempt_curr(rq, p, 0);
2454#ifdef CONFIG_SMP
2455 if (p->sched_class->task_wake_up)
2456 p->sched_class->task_wake_up(rq, p);
2457#endif
2458 task_rq_unlock(rq, &flags);
2459}
2460
2461#ifdef CONFIG_PREEMPT_NOTIFIERS
2462
2463
2464
2465
2466
2467void preempt_notifier_register(struct preempt_notifier *notifier)
2468{
2469 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers);
2470}
2471EXPORT_SYMBOL_GPL(preempt_notifier_register);
2472
2473
2474
2475
2476
2477
2478
2479void preempt_notifier_unregister(struct preempt_notifier *notifier)
2480{
2481 hlist_del(¬ifier->link);
2482}
2483EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2484
2485static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2486{
2487 struct preempt_notifier *notifier;
2488 struct hlist_node *node;
2489
2490 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2491 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2492}
2493
2494static void
2495fire_sched_out_preempt_notifiers(struct task_struct *curr,
2496 struct task_struct *next)
2497{
2498 struct preempt_notifier *notifier;
2499 struct hlist_node *node;
2500
2501 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2502 notifier->ops->sched_out(notifier, next);
2503}
2504
2505#else
2506
/* Without CONFIG_PREEMPT_NOTIFIERS there is nothing to notify. */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	/* intentionally empty */
}
2510
/* Without CONFIG_PREEMPT_NOTIFIERS there is nothing to notify. */
static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	/* intentionally empty */
}
2516
2517#endif
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
/*
 * Work done before switching from @prev to @next: fire @prev's
 * sched-out notifiers, then run the lock-switch and arch-switch
 * preparation hooks for @next.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	fire_sched_out_preempt_notifiers(prev, next);

	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2557 __releases(rq->lock)
2558{
2559 struct mm_struct *mm = rq->prev_mm;
2560 long prev_state;
2561
2562 rq->prev_mm = NULL;
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575 prev_state = prev->state;
2576 finish_arch_switch(prev);
2577 finish_lock_switch(rq, prev);
2578#ifdef CONFIG_SMP
2579 if (current->sched_class->post_schedule)
2580 current->sched_class->post_schedule(rq);
2581#endif
2582
2583 fire_sched_in_preempt_notifiers(current);
2584 if (mm)
2585 mmdrop(mm);
2586 if (unlikely(prev_state == TASK_DEAD)) {
2587
2588
2589
2590
2591 kprobe_flush_task(prev);
2592 put_task_struct(prev);
2593 }
2594}
2595
2596
2597
2598
2599
2600asmlinkage void schedule_tail(struct task_struct *prev)
2601 __releases(rq->lock)
2602{
2603 struct rq *rq = this_rq();
2604
2605 finish_task_switch(rq, prev);
2606#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2607
2608 preempt_enable();
2609#endif
2610 if (current->set_child_tid)
2611 put_user(task_pid_vnr(current), current->set_child_tid);
2612}
2613
2614
2615
2616
2617
2618static inline void
2619context_switch(struct rq *rq, struct task_struct *prev,
2620 struct task_struct *next)
2621{
2622 struct mm_struct *mm, *oldmm;
2623
2624 prepare_task_switch(rq, prev, next);
2625 trace_sched_switch(rq, prev, next);
2626 mm = next->mm;
2627 oldmm = prev->active_mm;
2628
2629
2630
2631
2632
2633 arch_enter_lazy_cpu_mode();
2634
2635 if (unlikely(!mm)) {
2636 next->active_mm = oldmm;
2637 atomic_inc(&oldmm->mm_count);
2638 enter_lazy_tlb(oldmm, next);
2639 } else
2640 switch_mm(oldmm, mm, next);
2641
2642 if (unlikely(!prev->mm)) {
2643 prev->active_mm = NULL;
2644 rq->prev_mm = oldmm;
2645 }
2646
2647
2648
2649
2650
2651
2652#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2653 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2654#endif
2655
2656
2657 switch_to(prev, next, prev);
2658
2659 barrier();
2660
2661
2662
2663
2664
2665 finish_task_switch(this_rq(), prev);
2666}
2667
2668
2669
2670
2671
2672
2673
2674
2675unsigned long nr_running(void)
2676{
2677 unsigned long i, sum = 0;
2678
2679 for_each_online_cpu(i)
2680 sum += cpu_rq(i)->nr_running;
2681
2682 return sum;
2683}
2684
2685unsigned long nr_uninterruptible(void)
2686{
2687 unsigned long i, sum = 0;
2688
2689 for_each_possible_cpu(i)
2690 sum += cpu_rq(i)->nr_uninterruptible;
2691
2692
2693
2694
2695
2696 if (unlikely((long)sum < 0))
2697 sum = 0;
2698
2699 return sum;
2700}
2701
2702unsigned long long nr_context_switches(void)
2703{
2704 int i;
2705 unsigned long long sum = 0;
2706
2707 for_each_possible_cpu(i)
2708 sum += cpu_rq(i)->nr_switches;
2709
2710 return sum;
2711}
2712
2713unsigned long nr_iowait(void)
2714{
2715 unsigned long i, sum = 0;
2716
2717 for_each_possible_cpu(i)
2718 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2719
2720 return sum;
2721}
2722
2723unsigned long nr_active(void)
2724{
2725 unsigned long i, running = 0, uninterruptible = 0;
2726
2727 for_each_online_cpu(i) {
2728 running += cpu_rq(i)->nr_running;
2729 uninterruptible += cpu_rq(i)->nr_uninterruptible;
2730 }
2731
2732 if (unlikely((long)uninterruptible < 0))
2733 uninterruptible = 0;
2734
2735 return running + uninterruptible;
2736}
2737
2738
2739
2740
2741
2742static void update_cpu_load(struct rq *this_rq)
2743{
2744 unsigned long this_load = this_rq->load.weight;
2745 int i, scale;
2746
2747 this_rq->nr_load_updates++;
2748
2749
2750 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2751 unsigned long old_load, new_load;
2752
2753
2754
2755 old_load = this_rq->cpu_load[i];
2756 new_load = this_load;
2757
2758
2759
2760
2761
2762 if (new_load > old_load)
2763 new_load += scale-1;
2764 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2765 }
2766}
2767
2768#ifdef CONFIG_SMP
2769
2770
2771
2772
2773
2774
2775
2776static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2777 __acquires(rq1->lock)
2778 __acquires(rq2->lock)
2779{
2780 BUG_ON(!irqs_disabled());
2781 if (rq1 == rq2) {
2782 spin_lock(&rq1->lock);
2783 __acquire(rq2->lock);
2784 } else {
2785 if (rq1 < rq2) {
2786 spin_lock(&rq1->lock);
2787 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
2788 } else {
2789 spin_lock(&rq2->lock);
2790 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
2791 }
2792 }
2793 update_rq_clock(rq1);
2794 update_rq_clock(rq2);
2795}
2796
2797
2798
2799
2800
2801
2802
2803static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2804 __releases(rq1->lock)
2805 __releases(rq2->lock)
2806{
2807 spin_unlock(&rq1->lock);
2808 if (rq1 != rq2)
2809 spin_unlock(&rq2->lock);
2810 else
2811 __release(rq2->lock);
2812}
2813
2814
2815
2816
2817static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2818 __releases(this_rq->lock)
2819 __acquires(busiest->lock)
2820 __acquires(this_rq->lock)
2821{
2822 int ret = 0;
2823
2824 if (unlikely(!irqs_disabled())) {
2825
2826 spin_unlock(&this_rq->lock);
2827 BUG_ON(1);
2828 }
2829 if (unlikely(!spin_trylock(&busiest->lock))) {
2830 if (busiest < this_rq) {
2831 spin_unlock(&this_rq->lock);
2832 spin_lock(&busiest->lock);
2833 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
2834 ret = 1;
2835 } else
2836 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
2837 }
2838 return ret;
2839}
2840
2841static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
2842 __releases(busiest->lock)
2843{
2844 spin_unlock(&busiest->lock);
2845 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
2846}
2847
2848
2849
2850
2851
2852
2853
2854static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2855{
2856 struct migration_req req;
2857 unsigned long flags;
2858 struct rq *rq;
2859
2860 rq = task_rq_lock(p, &flags);
2861 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2862 || unlikely(!cpu_active(dest_cpu)))
2863 goto out;
2864
2865 trace_sched_migrate_task(rq, p, dest_cpu);
2866
2867 if (migrate_task(p, dest_cpu, &req)) {
2868
2869 struct task_struct *mt = rq->migration_thread;
2870
2871 get_task_struct(mt);
2872 task_rq_unlock(rq, &flags);
2873 wake_up_process(mt);
2874 put_task_struct(mt);
2875 wait_for_completion(&req.done);
2876
2877 return;
2878 }
2879out:
2880 task_rq_unlock(rq, &flags);
2881}
2882
2883
2884
2885
2886
2887void sched_exec(void)
2888{
2889 int new_cpu, this_cpu = get_cpu();
2890 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2891 put_cpu();
2892 if (new_cpu != this_cpu)
2893 sched_migrate_task(current, new_cpu);
2894}
2895
2896
2897
2898
2899
/*
 * Move @p from @src_rq to @this_rq (the runqueue of @this_cpu), then
 * check whether it should preempt what is currently running there.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
		      struct rq *this_rq, int this_cpu)
{
	/* Dequeue, re-home, enqueue. */
	deactivate_task(src_rq, p, 0);
	set_task_cpu(p, this_cpu);
	activate_task(this_rq, p, 0);

	check_preempt_curr(this_rq, p, 0);
}
2912
2913
2914
2915
2916static
2917int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2918 struct sched_domain *sd, enum cpu_idle_type idle,
2919 int *all_pinned)
2920{
2921
2922
2923
2924
2925
2926
2927 if (!cpu_isset(this_cpu, p->cpus_allowed)) {
2928 schedstat_inc(p, se.nr_failed_migrations_affine);
2929 return 0;
2930 }
2931 *all_pinned = 0;
2932
2933 if (task_running(rq, p)) {
2934 schedstat_inc(p, se.nr_failed_migrations_running);
2935 return 0;
2936 }
2937
2938
2939
2940
2941
2942
2943
2944 if (!task_hot(p, rq->clock, sd) ||
2945 sd->nr_balance_failed > sd->cache_nice_tries) {
2946#ifdef CONFIG_SCHEDSTATS
2947 if (task_hot(p, rq->clock, sd)) {
2948 schedstat_inc(sd, lb_hot_gained[idle]);
2949 schedstat_inc(p, se.nr_forced_migrations);
2950 }
2951#endif
2952 return 1;
2953 }
2954
2955 if (task_hot(p, rq->clock, sd)) {
2956 schedstat_inc(p, se.nr_failed_migrations_hot);
2957 return 0;
2958 }
2959 return 1;
2960}
2961
2962static unsigned long
2963balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2964 unsigned long max_load_move, struct sched_domain *sd,
2965 enum cpu_idle_type idle, int *all_pinned,
2966 int *this_best_prio, struct rq_iterator *iterator)
2967{
2968 int loops = 0, pulled = 0, pinned = 0;
2969 struct task_struct *p;
2970 long rem_load_move = max_load_move;
2971
2972 if (max_load_move == 0)
2973 goto out;
2974
2975 pinned = 1;
2976
2977
2978
2979
2980 p = iterator->start(iterator->arg);
2981next:
2982 if (!p || loops++ > sysctl_sched_nr_migrate)
2983 goto out;
2984
2985 if ((p->se.load.weight >> 1) > rem_load_move ||
2986 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2987 p = iterator->next(iterator->arg);
2988 goto next;
2989 }
2990
2991 pull_task(busiest, p, this_rq, this_cpu);
2992 pulled++;
2993 rem_load_move -= p->se.load.weight;
2994
2995
2996
2997
2998 if (rem_load_move > 0) {
2999 if (p->prio < *this_best_prio)
3000 *this_best_prio = p->prio;
3001 p = iterator->next(iterator->arg);
3002 goto next;
3003 }
3004out:
3005
3006
3007
3008
3009
3010 schedstat_add(sd, lb_gained[idle], pulled);
3011
3012 if (all_pinned)
3013 *all_pinned = pinned;
3014
3015 return max_load_move - rem_load_move;
3016}
3017
3018
3019
3020
3021
3022
3023
3024
3025static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3026 unsigned long max_load_move,
3027 struct sched_domain *sd, enum cpu_idle_type idle,
3028 int *all_pinned)
3029{
3030 const struct sched_class *class = sched_class_highest;
3031 unsigned long total_load_moved = 0;
3032 int this_best_prio = this_rq->curr->prio;
3033
3034 do {
3035 total_load_moved +=
3036 class->load_balance(this_rq, this_cpu, busiest,
3037 max_load_move - total_load_moved,
3038 sd, idle, all_pinned, &this_best_prio);
3039 class = class->next;
3040
3041 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3042 break;
3043
3044 } while (class && max_load_move > total_load_moved);
3045
3046 return total_load_moved > 0;
3047}
3048
3049static int
3050iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3051 struct sched_domain *sd, enum cpu_idle_type idle,
3052 struct rq_iterator *iterator)
3053{
3054 struct task_struct *p = iterator->start(iterator->arg);
3055 int pinned = 0;
3056
3057 while (p) {
3058 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3059 pull_task(busiest, p, this_rq, this_cpu);
3060
3061
3062
3063
3064
3065 schedstat_inc(sd, lb_gained[idle]);
3066
3067 return 1;
3068 }
3069 p = iterator->next(iterator->arg);
3070 }
3071
3072 return 0;
3073}
3074
3075
3076
3077
3078
3079
3080
3081
3082static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3083 struct sched_domain *sd, enum cpu_idle_type idle)
3084{
3085 const struct sched_class *class;
3086
3087 for (class = sched_class_highest; class; class = class->next)
3088 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3089 return 1;
3090
3091 return 0;
3092}
3093
3094
3095
3096
3097
3098
3099static struct sched_group *
3100find_busiest_group(struct sched_domain *sd, int this_cpu,
3101 unsigned long *imbalance, enum cpu_idle_type idle,
3102 int *sd_idle, const cpumask_t *cpus, int *balance)
3103{
3104 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
3105 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
3106 unsigned long max_pull;
3107 unsigned long busiest_load_per_task, busiest_nr_running;
3108 unsigned long this_load_per_task, this_nr_running;
3109 int load_idx, group_imb = 0;
3110#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3111 int power_savings_balance = 1;
3112 unsigned long leader_nr_running = 0, min_load_per_task = 0;
3113 unsigned long min_nr_running = ULONG_MAX;
3114 struct sched_group *group_min = NULL, *group_leader = NULL;
3115#endif
3116
3117 max_load = this_load = total_load = total_pwr = 0;
3118 busiest_load_per_task = busiest_nr_running = 0;
3119 this_load_per_task = this_nr_running = 0;
3120
3121 if (idle == CPU_NOT_IDLE)
3122 load_idx = sd->busy_idx;
3123 else if (idle == CPU_NEWLY_IDLE)
3124 load_idx = sd->newidle_idx;
3125 else
3126 load_idx = sd->idle_idx;
3127
3128 do {
3129 unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
3130 int local_group;
3131 int i;
3132 int __group_imb = 0;
3133 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3134 unsigned long sum_nr_running, sum_weighted_load;
3135 unsigned long sum_avg_load_per_task;
3136 unsigned long avg_load_per_task;
3137
3138 local_group = cpu_isset(this_cpu, group->cpumask);
3139
3140 if (local_group)
3141 balance_cpu = first_cpu(group->cpumask);
3142
3143
3144 sum_weighted_load = sum_nr_running = avg_load = 0;
3145 sum_avg_load_per_task = avg_load_per_task = 0;
3146
3147 max_cpu_load = 0;
3148 min_cpu_load = ~0UL;
3149
3150 for_each_cpu_mask_nr(i, group->cpumask) {
3151 struct rq *rq;
3152
3153 if (!cpu_isset(i, *cpus))
3154 continue;
3155
3156 rq = cpu_rq(i);
3157
3158 if (*sd_idle && rq->nr_running)
3159 *sd_idle = 0;
3160
3161
3162 if (local_group) {
3163 if (idle_cpu(i) && !first_idle_cpu) {
3164 first_idle_cpu = 1;
3165 balance_cpu = i;
3166 }
3167
3168 load = target_load(i, load_idx);
3169 } else {
3170 load = source_load(i, load_idx);
3171 if (load > max_cpu_load)
3172 max_cpu_load = load;
3173 if (min_cpu_load > load)
3174 min_cpu_load = load;
3175 }
3176
3177 avg_load += load;
3178 sum_nr_running += rq->nr_running;
3179 sum_weighted_load += weighted_cpuload(i);
3180
3181 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3182 }
3183
3184
3185
3186
3187
3188
3189
3190 if (idle != CPU_NEWLY_IDLE && local_group &&
3191 balance_cpu != this_cpu && balance) {
3192 *balance = 0;
3193 goto ret;
3194 }
3195
3196 total_load += avg_load;
3197 total_pwr += group->__cpu_power;
3198
3199
3200 avg_load = sg_div_cpu_power(group,
3201 avg_load * SCHED_LOAD_SCALE);
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213 avg_load_per_task = sg_div_cpu_power(group,
3214 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3215
3216 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3217 __group_imb = 1;
3218
3219 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3220
3221 if (local_group) {
3222 this_load = avg_load;
3223 this = group;
3224 this_nr_running = sum_nr_running;
3225 this_load_per_task = sum_weighted_load;
3226 } else if (avg_load > max_load &&
3227 (sum_nr_running > group_capacity || __group_imb)) {
3228 max_load = avg_load;
3229 busiest = group;
3230 busiest_nr_running = sum_nr_running;
3231 busiest_load_per_task = sum_weighted_load;
3232 group_imb = __group_imb;
3233 }
3234
3235#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3236
3237
3238
3239
3240 if (idle == CPU_NOT_IDLE ||
3241 !(sd->flags & SD_POWERSAVINGS_BALANCE))
3242 goto group_next;
3243
3244
3245
3246
3247
3248 if (local_group && (this_nr_running >= group_capacity ||
3249 !this_nr_running))
3250 power_savings_balance = 0;
3251
3252
3253
3254
3255
3256 if (!power_savings_balance || sum_nr_running >= group_capacity
3257 || !sum_nr_running)
3258 goto group_next;
3259
3260
3261
3262
3263
3264
3265 if ((sum_nr_running < min_nr_running) ||
3266 (sum_nr_running == min_nr_running &&
3267 first_cpu(group->cpumask) <
3268 first_cpu(group_min->cpumask))) {
3269 group_min = group;
3270 min_nr_running = sum_nr_running;
3271 min_load_per_task = sum_weighted_load /
3272 sum_nr_running;
3273 }
3274
3275
3276
3277
3278
3279
3280 if (sum_nr_running <= group_capacity - 1) {
3281 if (sum_nr_running > leader_nr_running ||
3282 (sum_nr_running == leader_nr_running &&
3283 first_cpu(group->cpumask) >
3284 first_cpu(group_leader->cpumask))) {
3285 group_leader = group;
3286 leader_nr_running = sum_nr_running;
3287 }
3288 }
3289group_next:
3290#endif
3291 group = group->next;
3292 } while (group != sd->groups);
3293
3294 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
3295 goto out_balanced;
3296
3297 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
3298
3299 if (this_load >= avg_load ||
3300 100*max_load <= sd->imbalance_pct*this_load)
3301 goto out_balanced;
3302
3303 busiest_load_per_task /= busiest_nr_running;
3304 if (group_imb)
3305 busiest_load_per_task = min(busiest_load_per_task, avg_load);
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318 if (max_load <= busiest_load_per_task)
3319 goto out_balanced;
3320
3321
3322
3323
3324
3325
3326 if (max_load < avg_load) {
3327 *imbalance = 0;
3328 goto small_imbalance;
3329 }
3330
3331
3332 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
3333
3334
3335 *imbalance = min(max_pull * busiest->__cpu_power,
3336 (avg_load - this_load) * this->__cpu_power)
3337 / SCHED_LOAD_SCALE;
3338
3339
3340
3341
3342
3343
3344
3345 if (*imbalance < busiest_load_per_task) {
3346 unsigned long tmp, pwr_now, pwr_move;
3347 unsigned int imbn;
3348
3349small_imbalance:
3350 pwr_move = pwr_now = 0;
3351 imbn = 2;
3352 if (this_nr_running) {
3353 this_load_per_task /= this_nr_running;
3354 if (busiest_load_per_task > this_load_per_task)
3355 imbn = 1;
3356 } else
3357 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3358
3359 if (max_load - this_load + busiest_load_per_task >=
3360 busiest_load_per_task * imbn) {
3361 *imbalance = busiest_load_per_task;
3362 return busiest;
3363 }
3364
3365
3366
3367
3368
3369
3370
3371 pwr_now += busiest->__cpu_power *
3372 min(busiest_load_per_task, max_load);
3373 pwr_now += this->__cpu_power *
3374 min(this_load_per_task, this_load);
3375 pwr_now /= SCHED_LOAD_SCALE;
3376
3377
3378 tmp = sg_div_cpu_power(busiest,
3379 busiest_load_per_task * SCHED_LOAD_SCALE);
3380 if (max_load > tmp)
3381 pwr_move += busiest->__cpu_power *
3382 min(busiest_load_per_task, max_load - tmp);
3383
3384
3385 if (max_load * busiest->__cpu_power <
3386 busiest_load_per_task * SCHED_LOAD_SCALE)
3387 tmp = sg_div_cpu_power(this,
3388 max_load * busiest->__cpu_power);
3389 else
3390 tmp = sg_div_cpu_power(this,
3391 busiest_load_per_task * SCHED_LOAD_SCALE);
3392 pwr_move += this->__cpu_power *
3393 min(this_load_per_task, this_load + tmp);
3394 pwr_move /= SCHED_LOAD_SCALE;
3395
3396
3397 if (pwr_move > pwr_now)
3398 *imbalance = busiest_load_per_task;
3399 }
3400
3401 return busiest;
3402
3403out_balanced:
3404#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3405 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3406 goto ret;
3407
3408 if (this == group_leader && group_leader != group_min) {
3409 *imbalance = min_load_per_task;
3410 return group_min;
3411 }
3412#endif
3413ret:
3414 *imbalance = 0;
3415 return NULL;
3416}
3417
3418
3419
3420
3421static struct rq *
3422find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3423 unsigned long imbalance, const cpumask_t *cpus)
3424{
3425 struct rq *busiest = NULL, *rq;
3426 unsigned long max_load = 0;
3427 int i;
3428
3429 for_each_cpu_mask_nr(i, group->cpumask) {
3430 unsigned long wl;
3431
3432 if (!cpu_isset(i, *cpus))
3433 continue;
3434
3435 rq = cpu_rq(i);
3436 wl = weighted_cpuload(i);
3437
3438 if (rq->nr_running == 1 && wl > imbalance)
3439 continue;
3440
3441 if (wl > max_load) {
3442 max_load = wl;
3443 busiest = rq;
3444 }
3445 }
3446
3447 return busiest;
3448}
3449
3450
3451
3452
3453
/*
 * Ceiling used by load_balance(): while all candidate tasks were found
 * pinned, sd->balance_interval keeps doubling as long as it is below this
 * value.  balance_interval is fed to msecs_to_jiffies() elsewhere in this
 * file, so the unit is presumably milliseconds.
 */
#define MAX_PINNED_INTERVAL 512
3455
3456
3457
3458
3459
/*
 * load_balance - look for an imbalance in @sd and try to pull tasks
 * towards @this_cpu.
 *
 * @this_cpu: CPU that should receive tasks.
 * @this_rq:  runqueue of @this_cpu.
 * @sd:       scheduling domain being balanced.
 * @idle:     idle classification of @this_cpu; indexes the schedstat
 *            counters and is handed to the helpers.
 * @balance:  forwarded to find_busiest_group(); when it reads back as 0
 *            the domain is treated as balanced.  It is dereferenced
 *            unconditionally, so it must not be NULL.
 * @cpus:     scratch cpumask, reset to all-ones here and narrowed each
 *            time a fully pinned busiest CPU is excluded.
 *
 * Returns the move_tasks() result when tasks were moved, 0 when the domain
 * was balanced or nothing could be moved, or -1 when nothing was moved and
 * the domain has SD_SHARE_CPUPOWER, sd_idle is clear and the parent does
 * not have SD_POWERSAVINGS_BALANCE.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *balance, cpumask_t *cpus)
{
	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
	struct sched_group *group;
	unsigned long imbalance;
	struct rq *busiest;
	unsigned long flags;

	/* Start with every CPU eligible as a source. */
	cpus_setall(*cpus);

	/*
	 * Remember that we entered idle on a domain whose CPUs share power
	 * and whose parent does not do power-savings balancing.  sd_idle is
	 * also passed by pointer to find_busiest_group(), which may change it.
	 */
	if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		sd_idle = 1;

	schedstat_inc(sd, lb_count[idle]);

redo:
	/* Re-entered after a fully pinned busiest CPU was removed from *cpus. */
	update_shares(sd);
	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
				   cpus, balance);

	/* find_busiest_group() cleared *balance: nothing to do here. */
	if (*balance == 0)
		goto out_balanced;

	if (!group) {
		schedstat_inc(sd, lb_nobusyg[idle]);
		goto out_balanced;
	}

	busiest = find_busiest_queue(group, idle, imbalance, cpus);
	if (!busiest) {
		schedstat_inc(sd, lb_nobusyq[idle]);
		goto out_balanced;
	}

	BUG_ON(busiest == this_rq);

	schedstat_add(sd, lb_imbalance[idle], imbalance);

	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/*
		 * Only try to pull when the source has more than one task.
		 * Both runqueue locks are held, with interrupts disabled,
		 * around move_tasks().
		 */
		local_irq_save(flags);
		double_rq_lock(this_rq, busiest);
		ld_moved = move_tasks(this_rq, this_cpu, busiest,
				      imbalance, sd, idle, &all_pinned);
		double_rq_unlock(this_rq, busiest);
		local_irq_restore(flags);

		/*
		 * Tasks were moved to a CPU other than the one executing
		 * this code: ask that CPU to reschedule.
		 */
		if (ld_moved && this_cpu != smp_processor_id())
			resched_cpu(this_cpu);

		/*
		 * move_tasks() reported all_pinned: drop that CPU from the
		 * candidate mask and retry, or give up once the mask is empty.
		 */
		if (unlikely(all_pinned)) {
			cpu_clear(cpu_of(busiest), *cpus);
			if (!cpus_empty(*cpus))
				goto redo;
			goto out_balanced;
		}
	}

	if (!ld_moved) {
		schedstat_inc(sd, lb_failed[idle]);
		sd->nr_balance_failed++;

		/* Enough consecutive failures: request an active balance. */
		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

			spin_lock_irqsave(&busiest->lock, flags);

			/*
			 * The task currently running on busiest is not
			 * allowed on this_cpu: treat it as pinned and skip
			 * the active balance request.
			 */
			if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
				spin_unlock_irqrestore(&busiest->lock, flags);
				all_pinned = 1;
				goto out_one_pinned;
			}

			/* Claim the active-balance slot unless already taken. */
			if (!busiest->active_balance) {
				busiest->active_balance = 1;
				busiest->push_cpu = this_cpu;
				active_balance = 1;
			}
			spin_unlock_irqrestore(&busiest->lock, flags);
			if (active_balance)
				wake_up_process(busiest->migration_thread);

			/* Set the failure counter back to cache_nice_tries+1. */
			sd->nr_balance_failed = sd->cache_nice_tries+1;
		}
	} else
		sd->nr_balance_failed = 0;

	if (likely(!active_balance)) {
		/* No active balance requested: use the minimum interval. */
		sd->balance_interval = sd->min_interval;
	} else {
		/* Active balance requested: double the interval, up to max_interval. */
		if (sd->balance_interval < sd->max_interval)
			sd->balance_interval *= 2;
	}

	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		ld_moved = -1;

	goto out;

out_balanced:
	schedstat_inc(sd, lb_balanced[idle]);

	sd->nr_balance_failed = 0;

out_one_pinned:
	/*
	 * Back off: double the interval while it is below max_interval, or
	 * below MAX_PINNED_INTERVAL when everything was pinned.
	 */
	if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
			(sd->balance_interval < sd->max_interval))
		sd->balance_interval *= 2;

	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		ld_moved = -1;
	else
		ld_moved = 0;
out:
	if (ld_moved)
		update_shares(sd);
	return ld_moved;
}
3613
3614
3615
3616
3617
3618
3619
3620
/*
 * load_balance_newidle - balance @sd on behalf of a CPU that is about to
 * go idle (every helper is called with CPU_NEWLY_IDLE).
 *
 * @this_cpu: CPU that should receive tasks.
 * @this_rq:  runqueue of @this_cpu.  NOTE(review): double_lock_balance()
 *            and update_shares_locked() are used on it, which suggests the
 *            caller already holds this_rq->lock - confirm against caller.
 * @sd:       scheduling domain to balance.
 * @cpus:     scratch cpumask, reset to all-ones here.
 *
 * Returns the move_tasks() result, 0 when balanced or nothing moved, or -1
 * when nothing was moved and the domain has SD_SHARE_CPUPOWER, sd_idle is
 * clear and the parent lacks SD_POWERSAVINGS_BALANCE.
 */
static int
load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
			cpumask_t *cpus)
{
	struct sched_group *group;
	struct rq *busiest = NULL;
	unsigned long imbalance;
	int ld_moved = 0;
	int sd_idle = 0;
	int all_pinned = 0;

	cpus_setall(*cpus);

	/*
	 * Same sd_idle rule as load_balance(), minus the idle-state test.
	 * sd_idle is passed by pointer to find_busiest_group(), which may
	 * change it.
	 */
	if (sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		sd_idle = 1;

	schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
redo:
	/* Re-entered after a fully pinned busiest CPU was removed from *cpus. */
	update_shares_locked(this_rq, sd);
	/* No balance out-parameter is used on this path (NULL is passed). */
	group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
				   &sd_idle, cpus, NULL);
	if (!group) {
		schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
		goto out_balanced;
	}

	busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
	if (!busiest) {
		schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
		goto out_balanced;
	}

	BUG_ON(busiest == this_rq);

	schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);

	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/* Take busiest's lock in addition to this_rq's. */
		double_lock_balance(this_rq, busiest);
		/* Refresh the source runqueue clock before moving tasks. */
		update_rq_clock(busiest);
		ld_moved = move_tasks(this_rq, this_cpu, busiest,
					imbalance, sd, CPU_NEWLY_IDLE,
					&all_pinned);
		double_unlock_balance(this_rq, busiest);

		/*
		 * Everything on busiest was pinned: exclude it and retry
		 * while candidate CPUs remain.
		 */
		if (unlikely(all_pinned)) {
			cpu_clear(cpu_of(busiest), *cpus);
			if (!cpus_empty(*cpus))
				goto redo;
		}
	}

	if (!ld_moved) {
		schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
		/* Note: this early return skips update_shares_locked() below. */
		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
			return -1;
	} else
		sd->nr_balance_failed = 0;

	update_shares_locked(this_rq, sd);
	return ld_moved;

out_balanced:
	schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		return -1;
	sd->nr_balance_failed = 0;

	return 0;
}
3702
3703
3704
3705
3706
/*
 * idle_balance - try to pull work before @this_cpu goes idle.
 *
 * schedule() calls this when the runqueue has no runnable tasks.  The
 * domains of @this_cpu are walked from the bottom up; each domain with
 * both SD_LOAD_BALANCE and SD_BALANCE_NEWIDLE is balanced through
 * load_balance_newidle().  The walk stops at the first domain after which
 * pulled_task is non-zero.
 */
static void idle_balance(int this_cpu, struct rq *this_rq)
{
	struct sched_domain *sd;
	/*
	 * NOTE(review): pulled_task starts at -1, so a SD_LOAD_BALANCE
	 * domain without SD_BALANCE_NEWIDLE leaves it non-zero and ends the
	 * walk at the "if (pulled_task)" test below - confirm this is the
	 * intended behaviour.
	 */
	int pulled_task = -1;
	/* Default: next balance no later than HZ jiffies from now. */
	unsigned long next_balance = jiffies + HZ;
	cpumask_t tmpmask;

	for_each_domain(this_cpu, sd) {
		unsigned long interval;

		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		if (sd->flags & SD_BALANCE_NEWIDLE)
			/* tmpmask is scratch space for the balancer. */
			pulled_task = load_balance_newidle(this_cpu, this_rq,
							   sd, &tmpmask);

		/* Track the earliest time any visited domain is due again. */
		interval = msecs_to_jiffies(sd->balance_interval);
		if (time_after(next_balance, sd->last_balance + interval))
			next_balance = sd->last_balance + interval;
		if (pulled_task)
			break;
	}
	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
		/*
		 * Either the walk ended with a non-zero pulled_task or the
		 * runqueue's next_balance is already in the past: refresh it.
		 */
		this_rq->next_balance = next_balance;
	}
}
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3749{
3750 int target_cpu = busiest_rq->push_cpu;
3751 struct sched_domain *sd;
3752 struct rq *target_rq;
3753
3754
3755 if (busiest_rq->nr_running <= 1)
3756 return;
3757
3758 target_rq = cpu_rq(target_cpu);
3759
3760
3761
3762
3763
3764
3765 BUG_ON(busiest_rq == target_rq);
3766
3767
3768 double_lock_balance(busiest_rq, target_rq);
3769 update_rq_clock(busiest_rq);
3770 update_rq_clock(target_rq);
3771
3772
3773 for_each_domain(target_cpu, sd) {
3774 if ((sd->flags & SD_LOAD_BALANCE) &&
3775 cpu_isset(busiest_cpu, sd->span))
3776 break;
3777 }
3778
3779 if (likely(sd)) {
3780 schedstat_inc(sd, alb_count);
3781
3782 if (move_one_task(target_rq, target_cpu, busiest_rq,
3783 sd, CPU_IDLE))
3784 schedstat_inc(sd, alb_pushed);
3785 else
3786 schedstat_inc(sd, alb_failed);
3787 }
3788 double_unlock_balance(busiest_rq, target_rq);
3789}
3790
3791#ifdef CONFIG_NO_HZ
/*
 * State shared by the CONFIG_NO_HZ idle load balancing code below.
 *
 * load_balancer: CPU number that currently owns idle load balancing, or -1
 *                when no CPU owns it (ownership is claimed and released
 *                with atomic_cmpxchg() in select_nohz_load_balancer()).
 * cpu_mask:      CPUs that called select_nohz_load_balancer() with a
 *                non-zero stop_tick and have not been cleared since.
 */
static struct {
	atomic_t load_balancer;
	cpumask_t cpu_mask;
} nohz ____cacheline_aligned = {
	.load_balancer = ATOMIC_INIT(-1),
	.cpu_mask = CPU_MASK_NONE,
};
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
/*
 * select_nohz_load_balancer - update this CPU's membership in nohz.cpu_mask
 * and its ownership of nohz.load_balancer.
 *
 * @stop_tick: non-zero when the CPU is being added to nohz.cpu_mask,
 *             zero when it is being removed.
 *
 * Returns 1 when this CPU is, or has just become, the nohz load balancer;
 * 0 in every other case.
 */
int select_nohz_load_balancer(int stop_tick)
{
	int cpu = smp_processor_id();

	if (stop_tick) {
		cpu_set(cpu, nohz.cpu_mask);
		cpu_rq(cpu)->in_nohz_recently = 1;

		/*
		 * A CPU that is no longer active must not stay the owner:
		 * hand ownership back.  The cmpxchg is expected to succeed
		 * because we just observed ourselves as owner.
		 */
		if (!cpu_active(cpu) &&
		    atomic_read(&nohz.load_balancer) == cpu) {
			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
				BUG();
			return 0;
		}

		/* Every online CPU is in the mask: no owner is kept. */
		if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
			if (atomic_read(&nohz.load_balancer) == cpu)
				atomic_set(&nohz.load_balancer, -1);
			return 0;
		}

		if (atomic_read(&nohz.load_balancer) == -1) {
			/* No owner yet: try to claim the role (may lose a race). */
			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
				return 1;
		} else if (atomic_read(&nohz.load_balancer) == cpu)
			return 1;
	} else {
		/* Not in the mask: nothing to undo. */
		if (!cpu_isset(cpu, nohz.cpu_mask))
			return 0;

		cpu_clear(cpu, nohz.cpu_mask);

		/* Give up ownership if we held it. */
		if (atomic_read(&nohz.load_balancer) == cpu)
			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
				BUG();
	}
	return 0;
}
3863#endif
3864
/*
 * Taken with spin_trylock() in rebalance_domains() around the balancing of
 * domains that have SD_SERIALIZE set.
 */
static DEFINE_SPINLOCK(balancing);
3866
3867
3868
3869
3870
3871
3872
/*
 * rebalance_domains - run load_balance() on every domain of @cpu whose
 * balance interval has expired, and compute when the runqueue should be
 * balanced next.
 *
 * @cpu:  CPU whose domains are walked.
 * @idle: idle classification used for the interval and for load_balance();
 *        switched to CPU_NOT_IDLE for the remaining domains once a
 *        load_balance() call returns non-zero.
 */
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
	int balance = 1;
	struct rq *rq = cpu_rq(cpu);
	unsigned long interval;
	struct sched_domain *sd;
	/* Earliest time any visited domain is due again; starts far out. */
	unsigned long next_balance = jiffies + 60*HZ;
	int update_next_balance = 0;
	int need_serialize;
	cpumask_t tmp;

	for_each_domain(cpu, sd) {
		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		interval = sd->balance_interval;
		/* Balance less often unless the CPU is idle. */
		if (idle != CPU_IDLE)
			interval *= sd->busy_factor;

		/* Convert to jiffies and clamp to [1, HZ*NR_CPUS/10]. */
		interval = msecs_to_jiffies(interval);
		if (unlikely(!interval))
			interval = 1;
		if (interval > HZ*NR_CPUS/10)
			interval = HZ*NR_CPUS/10;

		need_serialize = sd->flags & SD_SERIALIZE;

		/*
		 * Serialized domains are balanced by one CPU at a time; if
		 * the lock is busy, skip this domain for now.
		 */
		if (need_serialize) {
			if (!spin_trylock(&balancing))
				goto out;
		}

		if (time_after_eq(jiffies, sd->last_balance + interval)) {
			if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
				/*
				 * load_balance() returned non-zero: treat the
				 * CPU as not idle for the remaining domains.
				 */
				idle = CPU_NOT_IDLE;
			}
			sd->last_balance = jiffies;
		}
		if (need_serialize)
			spin_unlock(&balancing);
out:
		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}

		/*
		 * load_balance() cleared balance: stop walking up the
		 * domain hierarchy.
		 */
		if (!balance)
			break;
	}

	/*
	 * next_balance is only meaningful when at least one domain updated
	 * it; otherwise leave rq->next_balance untouched.
	 */
	if (likely(update_next_balance))
		rq->next_balance = next_balance;
}
3943
3944
3945
3946
3947
3948
/*
 * run_rebalance_domains - rebalance the domains of the current CPU and,
 * under CONFIG_NO_HZ, of the CPUs in nohz.cpu_mask when this CPU is the
 * nohz load balancer.
 *
 * Takes a softirq_action argument; presumably registered as the
 * SCHED_SOFTIRQ handler that trigger_load_balance() raises - the
 * registration is not in this part of the file.  @h is not used.
 */
static void run_rebalance_domains(struct softirq_action *h)
{
	int this_cpu = smp_processor_id();
	struct rq *this_rq = cpu_rq(this_cpu);
	enum cpu_idle_type idle = this_rq->idle_at_tick ?
						CPU_IDLE : CPU_NOT_IDLE;

	rebalance_domains(this_cpu, idle);

#ifdef CONFIG_NO_HZ
	/*
	 * This CPU was idle at the last tick and owns nohz load balancing:
	 * balance on behalf of the other CPUs in nohz.cpu_mask.
	 */
	if (this_rq->idle_at_tick &&
	    atomic_read(&nohz.load_balancer) == this_cpu) {
		/* Work on a private copy of the mask, without ourselves. */
		cpumask_t cpus = nohz.cpu_mask;
		struct rq *rq;
		int balance_cpu;

		cpu_clear(this_cpu, cpus);
		for_each_cpu_mask_nr(balance_cpu, cpus) {
			/*
			 * Work has shown up for this CPU: stop balancing for
			 * the others.
			 */
			if (need_resched())
				break;

			rebalance_domains(balance_cpu, CPU_IDLE);

			/* Adopt the earliest next_balance of the CPUs handled. */
			rq = cpu_rq(balance_cpu);
			if (time_after(this_rq->next_balance, rq->next_balance))
				this_rq->next_balance = rq->next_balance;
		}
	}
#endif
}
3989
3990
3991
3992
3993
3994
3995
3996
/*
 * trigger_load_balance - raise SCHED_SOFTIRQ when @rq is due for
 * rebalancing.  Called from scheduler_tick().
 *
 * Under CONFIG_NO_HZ it also maintains nohz load balancer ownership and
 * may return early without raising the softirq.
 */
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
#ifdef CONFIG_NO_HZ
	/*
	 * First non-idle tick after this CPU was added to nohz.cpu_mask:
	 * clear the marker and, if we owned idle load balancing, give it up.
	 */
	if (rq->in_nohz_recently && !rq->idle_at_tick) {
		rq->in_nohz_recently = 0;

		if (atomic_read(&nohz.load_balancer) == cpu) {
			cpu_clear(cpu, nohz.cpu_mask);
			atomic_set(&nohz.load_balancer, -1);
		}

		if (atomic_read(&nohz.load_balancer) == -1) {
			/*
			 * Nobody owns idle load balancing: kick the first CPU
			 * in nohz.cpu_mask, if there is one.
			 */
			int ilb = first_cpu(nohz.cpu_mask);

			if (ilb < nr_cpu_ids)
				resched_cpu(ilb);
		}
	}

	/*
	 * This idle CPU owns idle load balancing but every online CPU is in
	 * nohz.cpu_mask: reschedule ourselves and skip the softirq.
	 */
	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
	    cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
		resched_cpu(cpu);
		return;
	}

	/*
	 * This idle CPU is in nohz.cpu_mask and is not the owner: the owner
	 * balances on its behalf (see run_rebalance_domains()).
	 */
	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
	    cpu_isset(cpu, nohz.cpu_mask))
		return;
#endif
	if (time_after_eq(jiffies, rq->next_balance))
		raise_softirq(SCHED_SOFTIRQ);
}
4050
4051#else
4052
4053
4054
4055
/*
 * Empty idle_balance() for the #else branch of a conditional opened
 * earlier in this file (presumably !CONFIG_SMP), so that schedule() can
 * call it unconditionally.
 */
static inline void idle_balance(int cpu, struct rq *rq)
{
	/* Nothing to balance in this configuration. */
}
4059
4060#endif
4061
/*
 * Per-CPU kernel statistics, exported to modules.  The accounting
 * functions below update its cpustat member through kstat_this_cpu
 * (presumably an accessor for this variable - defined outside this part
 * of the file).
 */
DEFINE_PER_CPU(struct kernel_stat, kstat);

EXPORT_PER_CPU_SYMBOL(kstat);
4065
4066
4067
4068
4069
4070
4071
4072static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
4073{
4074 u64 ns = 0;
4075
4076 if (task_current(rq, p)) {
4077 update_rq_clock(rq);
4078 ns = rq->clock - p->se.exec_start;
4079 if ((s64)ns < 0)
4080 ns = 0;
4081 }
4082
4083 return ns;
4084}
4085
4086unsigned long long task_delta_exec(struct task_struct *p)
4087{
4088 unsigned long flags;
4089 struct rq *rq;
4090 u64 ns = 0;
4091
4092 rq = task_rq_lock(p, &flags);
4093 ns = do_task_delta_exec(p, rq);
4094 task_rq_unlock(rq, &flags);
4095
4096 return ns;
4097}
4098
4099
4100
4101
4102
4103
4104unsigned long long task_sched_runtime(struct task_struct *p)
4105{
4106 unsigned long flags;
4107 struct rq *rq;
4108 u64 ns = 0;
4109
4110 rq = task_rq_lock(p, &flags);
4111 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
4112 task_rq_unlock(rq, &flags);
4113
4114 return ns;
4115}
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126unsigned long long thread_group_sched_runtime(struct task_struct *p)
4127{
4128 struct task_cputime totals;
4129 unsigned long flags;
4130 struct rq *rq;
4131 u64 ns;
4132
4133 rq = task_rq_lock(p, &flags);
4134 thread_group_cputime(p, &totals);
4135 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
4136 task_rq_unlock(rq, &flags);
4137
4138 return ns;
4139}
4140
4141
4142
4143
4144
4145
4146void account_user_time(struct task_struct *p, cputime_t cputime)
4147{
4148 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4149 cputime64_t tmp;
4150
4151 p->utime = cputime_add(p->utime, cputime);
4152 account_group_user_time(p, cputime);
4153
4154
4155 tmp = cputime_to_cputime64(cputime);
4156 if (TASK_NICE(p) > 0)
4157 cpustat->nice = cputime64_add(cpustat->nice, tmp);
4158 else
4159 cpustat->user = cputime64_add(cpustat->user, tmp);
4160
4161 acct_update_integrals(p);
4162}
4163
4164
4165
4166
4167
4168
4169static void account_guest_time(struct task_struct *p, cputime_t cputime)
4170{
4171 cputime64_t tmp;
4172 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4173
4174 tmp = cputime_to_cputime64(cputime);
4175
4176 p->utime = cputime_add(p->utime, cputime);
4177 account_group_user_time(p, cputime);
4178 p->gtime = cputime_add(p->gtime, cputime);
4179
4180 cpustat->user = cputime64_add(cpustat->user, tmp);
4181 cpustat->guest = cputime64_add(cpustat->guest, tmp);
4182}
4183
4184
4185
4186
4187
4188
4189void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
4190{
4191 p->utimescaled = cputime_add(p->utimescaled, cputime);
4192}
4193
4194
4195
4196
4197
4198
4199
4200void account_system_time(struct task_struct *p, int hardirq_offset,
4201 cputime_t cputime)
4202{
4203 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4204 struct rq *rq = this_rq();
4205 cputime64_t tmp;
4206
4207 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4208 account_guest_time(p, cputime);
4209 return;
4210 }
4211
4212 p->stime = cputime_add(p->stime, cputime);
4213 account_group_system_time(p, cputime);
4214
4215
4216 tmp = cputime_to_cputime64(cputime);
4217 if (hardirq_count() - hardirq_offset)
4218 cpustat->irq = cputime64_add(cpustat->irq, tmp);
4219 else if (softirq_count())
4220 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
4221 else if (p != rq->idle)
4222 cpustat->system = cputime64_add(cpustat->system, tmp);
4223 else if (atomic_read(&rq->nr_iowait) > 0)
4224 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4225 else
4226 cpustat->idle = cputime64_add(cpustat->idle, tmp);
4227
4228 acct_update_integrals(p);
4229}
4230
4231
4232
4233
4234
4235
4236
4237void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
4238{
4239 p->stimescaled = cputime_add(p->stimescaled, cputime);
4240}
4241
4242
4243
4244
4245
4246
4247void account_steal_time(struct task_struct *p, cputime_t steal)
4248{
4249 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4250 cputime64_t tmp = cputime_to_cputime64(steal);
4251 struct rq *rq = this_rq();
4252
4253 if (p == rq->idle) {
4254 p->stime = cputime_add(p->stime, steal);
4255 account_group_system_time(p, steal);
4256 if (atomic_read(&rq->nr_iowait) > 0)
4257 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4258 else
4259 cpustat->idle = cputime64_add(cpustat->idle, tmp);
4260 } else
4261 cpustat->steal = cputime64_add(cpustat->steal, tmp);
4262}
4263
4264
4265
4266
4267#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4268cputime_t task_utime(struct task_struct *p)
4269{
4270 return p->utime;
4271}
4272
4273cputime_t task_stime(struct task_struct *p)
4274{
4275 return p->stime;
4276}
4277#else
/*
 * task_utime - user time of @p, derived from its scheduler runtime
 * (variant used without CONFIG_VIRT_CPU_ACCOUNTING).
 *
 * The runtime in p->se.sum_exec_runtime is split in the ratio
 * utime / (utime + stime) of the task's sampled counters.  The result is
 * folded into p->prev_utime with max(), so the returned value never
 * decreases between calls.
 */
cputime_t task_utime(struct task_struct *p)
{
	clock_t utime = cputime_to_clock_t(p->utime),
		total = utime + cputime_to_clock_t(p->stime);
	u64 temp;

	/* Total runtime, converted from nanoseconds to clock_t units. */
	temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);

	/* Scale by utime/total; with no samples the whole runtime is used. */
	if (total) {
		temp *= utime;
		do_div(temp, total);
	}
	utime = (clock_t)temp;

	/* Remember the largest value reported so far and return that. */
	p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
	return p->prev_utime;
}
4298
/*
 * task_stime - system time of @p (variant used without
 * CONFIG_VIRT_CPU_ACCOUNTING).
 *
 * Computed as the task's scheduler runtime minus task_utime(p).  Note that
 * the task_utime() call updates p->prev_utime as a side effect.  A
 * non-negative result is folded into p->prev_stime with max(), so the
 * returned value never decreases; a negative result leaves p->prev_stime
 * unchanged.
 */
cputime_t task_stime(struct task_struct *p)
{
	clock_t stime;

	/* Whatever part of the runtime is not user time is system time. */
	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
			cputime_to_clock_t(task_utime(p));

	if (stime >= 0)
		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));

	return p->prev_stime;
}
4316#endif
4317
4318inline cputime_t task_gtime(struct task_struct *p)
4319{
4320 return p->gtime;
4321}
4322
4323
4324
4325
4326
4327
4328
4329
/*
 * scheduler_tick - periodic scheduler bookkeeping for the current CPU.
 *
 * Under the runqueue lock it refreshes the runqueue clock and CPU load and
 * lets the current task's scheduling class handle the tick.  On SMP it
 * then records whether the CPU was idle at this tick and triggers load
 * balancing if due.
 *
 * NOTE(review): rq->lock is taken with plain spin_lock(), so this
 * presumably runs with interrupts already disabled - confirm against
 * caller.
 */
void scheduler_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr = rq->curr;

	sched_clock_tick();

	spin_lock(&rq->lock);
	update_rq_clock(rq);
	update_cpu_load(rq);
	/* Third argument 0: see the class's task_tick() for its meaning. */
	curr->sched_class->task_tick(rq, curr, 0);
	spin_unlock(&rq->lock);

#ifdef CONFIG_SMP
	/* Sampled outside the lock; read by the load balancing code. */
	rq->idle_at_tick = idle_cpu(cpu);
	trigger_load_balance(rq, cpu);
#endif
}
4349
4350#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4351 defined(CONFIG_PREEMPT_TRACER))
4352
4353static inline unsigned long get_parent_ip(unsigned long addr)
4354{
4355 if (in_lock_functions(addr)) {
4356 addr = CALLER_ADDR2;
4357 if (in_lock_functions(addr))
4358 addr = CALLER_ADDR3;
4359 }
4360 return addr;
4361}
4362
/*
 * add_preempt_count - add @val to the current preempt count.
 *
 * With CONFIG_DEBUG_PREEMPT the count is sanity-checked before and after
 * the update.  When the count goes from zero to @val, trace_preempt_off()
 * is called with the caller addresses.
 */
void __kprobes add_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/* Underflow: the count is already negative; refuse to touch it. */
	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
		return;
#endif
	preempt_count() += val;
#ifdef CONFIG_DEBUG_PREEMPT
	/* Warn when the preempt part of the count is close to overflowing. */
	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
				PREEMPT_MASK - 10);
#endif
	/* The count was zero before this call. */
	if (preempt_count() == val)
		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
EXPORT_SYMBOL(add_preempt_count);
4384
/*
 * sub_preempt_count - subtract @val from the current preempt count.
 *
 * With CONFIG_DEBUG_PREEMPT the request is sanity-checked first and
 * ignored (with a warning) when it looks wrong.  When the count is about
 * to drop to zero, trace_preempt_on() is called before the update.
 */
void __kprobes sub_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/* Underflow: trying to subtract more than is currently counted. */
	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
		return;
	/*
	 * A value below PREEMPT_MASK is being subtracted although the
	 * preempt part of the count is already zero.
	 */
	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
			!(preempt_count() & PREEMPT_MASK)))
		return;
#endif

	/* The count will be zero after this call. */
	if (preempt_count() == val)
		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
	preempt_count() -= val;
}
EXPORT_SYMBOL(sub_preempt_count);
4406
4407#endif
4408
4409
4410
4411
/*
 * __schedule_bug - report a "scheduling while atomic" condition for @prev.
 *
 * Prints the task name, pid and preempt count, then the held locks, the
 * loaded modules, the irq trace events (only when interrupts are
 * disabled) and finally either the saved interrupt registers or a stack
 * dump.
 */
static noinline void __schedule_bug(struct task_struct *prev)
{
	/* Registers saved at interrupt entry, or NULL when there are none. */
	struct pt_regs *regs = get_irq_regs();

	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
		prev->comm, prev->pid, preempt_count());

	debug_show_held_locks(prev);
	print_modules();
	if (irqs_disabled())
		print_irqtrace_events(prev);

	if (regs)
		show_regs(regs);
	else
		dump_stack();
}
4429
4430
4431
4432
/*
 * schedule_debug - debugging checks and statistics run by schedule()
 * before it picks the next task.
 */
static inline void schedule_debug(struct task_struct *prev)
{
	/*
	 * Complain when schedule() is entered in atomic context, unless the
	 * task is exiting (exit_state is non-zero).
	 */
	if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
		__schedule_bug(prev);

	profile_hit(SCHED_PROFILING, __builtin_return_address(0));

	schedstat_inc(this_rq(), sched_count);
#ifdef CONFIG_SCHEDSTATS
	/* A non-negative lock_depth is counted in the bkl_count statistics. */
	if (unlikely(prev->lock_depth >= 0)) {
		schedstat_inc(this_rq(), bkl_count);
		schedstat_inc(prev, sched_info.bkl_count);
	}
#endif
}
4453
4454
4455
4456
4457static inline struct task_struct *
4458pick_next_task(struct rq *rq, struct task_struct *prev)
4459{
4460 const struct sched_class *class;
4461 struct task_struct *p;
4462
4463
4464
4465
4466
4467 if (likely(rq->nr_running == rq->cfs.nr_running)) {
4468 p = fair_sched_class.pick_next_task(rq);
4469 if (likely(p))
4470 return p;
4471 }
4472
4473 class = sched_class_highest;
4474 for ( ; ; ) {
4475 p = class->pick_next_task(rq);
4476 if (p)
4477 return p;
4478
4479
4480
4481
4482 class = class->next;
4483 }
4484}
4485
4486
4487
4488
/*
 * schedule - the main scheduler entry point: pick the next task for this
 * CPU and switch to it.
 */
asmlinkage void __sched schedule(void)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	struct rq *rq;
	int cpu;

need_resched:
	preempt_disable();
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	rcu_qsctr_inc(cpu);
	prev = rq->curr;
	/* Count as involuntary unless prev turns out to be blocking below. */
	switch_count = &prev->nivcsw;

	release_kernel_lock(prev);
need_resched_nonpreemptible:

	schedule_debug(prev);

	if (sched_feat(HRTICK))
		hrtick_clear(rq);

	spin_lock_irq(&rq->lock);
	update_rq_clock(rq);
	clear_tsk_need_resched(prev);

	/*
	 * prev is in a non-running state and this is not a preemption:
	 * either a pending signal puts it back to TASK_RUNNING, or it is
	 * taken off the runqueue.  Both count as a voluntary switch.
	 */
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		if (unlikely(signal_pending_state(prev->state, prev)))
			prev->state = TASK_RUNNING;
		else
			deactivate_task(rq, prev, 1);
		switch_count = &prev->nvcsw;
	}

#ifdef CONFIG_SMP
	/* Optional per-class hook run before the next task is picked. */
	if (prev->sched_class->pre_schedule)
		prev->sched_class->pre_schedule(rq, prev);
#endif

	/* Nothing left to run: try to pull work from other CPUs. */
	if (unlikely(!rq->nr_running))
		idle_balance(cpu, rq);

	prev->sched_class->put_prev_task(rq, prev);
	next = pick_next_task(rq, prev);

	if (likely(prev != next)) {
		sched_info_switch(prev, next);

		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		/*
		 * NOTE(review): rq->lock is not released explicitly on this
		 * path, so context_switch() presumably does it - confirm.
		 */
		context_switch(rq, prev, next);
		/*
		 * Re-read cpu and rq after the switch; presumably the task
		 * may resume on a different CPU - confirm.
		 */
		cpu = smp_processor_id();
		rq = cpu_rq(cpu);
	} else
		spin_unlock_irq(&rq->lock);

	/* A negative return means we must go through the scheduler again. */
	if (unlikely(reacquire_kernel_lock(current) < 0))
		goto need_resched_nonpreemptible;

	preempt_enable_no_resched();
	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
		goto need_resched;
}
EXPORT_SYMBOL(schedule);
4560
4561#ifdef CONFIG_PREEMPT
4562
4563
4564
4565
4566
/*
 * preempt_schedule - reschedule on behalf of kernel preemption
 * (CONFIG_PREEMPT only).
 *
 * Returns immediately when the preempt count is non-zero or interrupts
 * are disabled.  Otherwise calls schedule() with PREEMPT_ACTIVE set in the
 * preempt count, repeating while TIF_NEED_RESCHED is set.
 */
asmlinkage void __sched preempt_schedule(void)
{
	struct thread_info *ti = current_thread_info();

	/* Preemption is not allowed right now: do nothing. */
	if (likely(ti->preempt_count || irqs_disabled()))
		return;

	do {
		add_preempt_count(PREEMPT_ACTIVE);
		schedule();
		sub_preempt_count(PREEMPT_ACTIVE);

		/*
		 * Compiler barrier so the flag test below is not merged with
		 * an earlier read.
		 */
		barrier();
	} while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
}
EXPORT_SYMBOL(preempt_schedule);
4591
4592
4593
4594
4595
4596
4597
/*
 * preempt_schedule_irq - preemption entry point that must be called with
 * interrupts disabled and a zero preempt count (enforced with BUG_ON).
 *
 * Interrupts are enabled around each schedule() call and disabled again
 * afterwards, so the function also returns with interrupts disabled.
 */
asmlinkage void __sched preempt_schedule_irq(void)
{
	struct thread_info *ti = current_thread_info();

	/* Catch callers that do not meet the entry conditions. */
	BUG_ON(ti->preempt_count || !irqs_disabled());

	do {
		add_preempt_count(PREEMPT_ACTIVE);
		local_irq_enable();
		schedule();
		local_irq_disable();
		sub_preempt_count(PREEMPT_ACTIVE);

		/*
		 * Compiler barrier so the flag test below is not merged with
		 * an earlier read.
		 */
		barrier();
	} while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
}
4619
4620#endif
4621
4622int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
4623 void *key)
4624{
4625 return try_to_wake_up(curr->private, mode, sync);
4626}
4627EXPORT_SYMBOL(default_wake_function);
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4639 int nr_exclusive, int sync, void *key)
4640{
4641 wait_queue_t *curr, *next;
4642
4643 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
4644 unsigned flags = curr->flags;
4645
4646 if (curr->func(curr, mode, sync, key) &&
4647 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
4648 break;
4649 }
4650}
4651
4652
4653
4654
4655
4656
4657
4658
4659void __wake_up(wait_queue_head_t *q, unsigned int mode,
4660 int nr_exclusive, void *key)
4661{
4662 unsigned long flags;
4663
4664 spin_lock_irqsave(&q->lock, flags);
4665 __wake_up_common(q, mode, nr_exclusive, 0, key);
4666 spin_unlock_irqrestore(&q->lock, flags);
4667}
4668EXPORT_SYMBOL(__wake_up);
4669
4670
4671
4672
4673void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4674{
4675 __wake_up_common(q, mode, 1, 0, NULL);
4676}
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691void
4692__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4693{
4694 unsigned long flags;
4695 int sync = 1;
4696
4697 if (unlikely(!q))
4698 return;
4699
4700 if (unlikely(!nr_exclusive))
4701 sync = 0;
4702
4703 spin_lock_irqsave(&q->lock, flags);
4704 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
4705 spin_unlock_irqrestore(&q->lock, flags);
4706}
4707EXPORT_SYMBOL_GPL(__wake_up_sync);
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718void complete(struct completion *x)
4719{
4720 unsigned long flags;
4721
4722 spin_lock_irqsave(&x->wait.lock, flags);
4723 x->done++;
4724 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
4725 spin_unlock_irqrestore(&x->wait.lock, flags);
4726}
4727EXPORT_SYMBOL(complete);
4728
4729
4730
4731
4732
4733
4734
4735void complete_all(struct completion *x)
4736{
4737 unsigned long flags;
4738
4739 spin_lock_irqsave(&x->wait.lock, flags);
4740 x->done += UINT_MAX/2;
4741 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
4742 spin_unlock_irqrestore(&x->wait.lock, flags);
4743}
4744EXPORT_SYMBOL(complete_all);
4745
/*
 * do_wait_for_common - wait for @x to be completed, with x->wait.lock held.
 *
 * Called by wait_for_common() with x->wait.lock taken via spin_lock_irq();
 * the lock is dropped around each schedule_timeout() and held again on
 * return.
 *
 * @timeout: maximum time to sleep, as accepted by schedule_timeout().
 * @state:   task state to sleep in; also decides which signals interrupt
 *           the wait (see signal_pending_state()).
 *
 * Returns:
 *   - the remaining timeout (at least 1) after consuming one completion;
 *   - 0 when the timeout expired without a completion;
 *   - -ERESTARTSYS when a signal ended the wait without a completion.
 */
static inline long __sched
do_wait_for_common(struct completion *x, long timeout, int state)
{
	if (!x->done) {
		DECLARE_WAITQUEUE(wait, current);

		/* Queue at the tail as an exclusive waiter. */
		wait.flags |= WQ_FLAG_EXCLUSIVE;
		__add_wait_queue_tail(&x->wait, &wait);
		do {
			if (signal_pending_state(state, current)) {
				timeout = -ERESTARTSYS;
				break;
			}
			/* Set the state before unlocking, then sleep. */
			__set_current_state(state);
			spin_unlock_irq(&x->wait.lock);
			timeout = schedule_timeout(timeout);
			spin_lock_irq(&x->wait.lock);
		} while (!x->done && timeout);
		__remove_wait_queue(&x->wait, &wait);
		/* Timed out or interrupted without a completion. */
		if (!x->done)
			return timeout;
	}
	/* Consume one completion; never report success as 0. */
	x->done--;
	return timeout ?: 1;
}
4771
4772static long __sched
4773wait_for_common(struct completion *x, long timeout, int state)
4774{
4775 might_sleep();
4776
4777 spin_lock_irq(&x->wait.lock);
4778 timeout = do_wait_for_common(x, timeout, state);
4779 spin_unlock_irq(&x->wait.lock);
4780 return timeout;
4781}
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793void __sched wait_for_completion(struct completion *x)
4794{
4795 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4796}
4797EXPORT_SYMBOL(wait_for_completion);
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808unsigned long __sched
4809wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4810{
4811 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4812}
4813EXPORT_SYMBOL(wait_for_completion_timeout);
4814
4815
4816
4817
4818
4819
4820
4821
4822int __sched wait_for_completion_interruptible(struct completion *x)
4823{
4824 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4825 if (t == -ERESTARTSYS)
4826 return t;
4827 return 0;
4828}
4829EXPORT_SYMBOL(wait_for_completion_interruptible);
4830
4831
4832
4833
4834
4835
4836
4837
4838
/*
 * wait_for_completion_interruptible_timeout - wait for a completion in
 * TASK_INTERRUPTIBLE state, bounded by a timeout.
 * @x:       completion to wait on
 * @timeout: timeout handed to schedule_timeout() via wait_for_common()
 *
 * Returns the result of wait_for_common() converted to unsigned long.
 * NOTE(review): wait_for_common() can yield -ERESTARTSYS; because this
 * function returns unsigned long, that shows up as a very large positive
 * value unless the caller casts the result back to a signed long.
 */
unsigned long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
					  unsigned long timeout)
{
	return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4846
4847
4848
4849
4850
4851
4852
4853
4854int __sched wait_for_completion_killable(struct completion *x)
4855{
4856 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4857 if (t == -ERESTARTSYS)
4858 return t;
4859 return 0;
4860}
4861EXPORT_SYMBOL(wait_for_completion_killable);
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875bool try_wait_for_completion(struct completion *x)
4876{
4877 int ret = 1;
4878
4879 spin_lock_irq(&x->wait.lock);
4880 if (!x->done)
4881 ret = 0;
4882 else
4883 x->done--;
4884 spin_unlock_irq(&x->wait.lock);
4885 return ret;
4886}
4887EXPORT_SYMBOL(try_wait_for_completion);
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897bool completion_done(struct completion *x)
4898{
4899 int ret = 1;
4900
4901 spin_lock_irq(&x->wait.lock);
4902 if (!x->done)
4903 ret = 0;
4904 spin_unlock_irq(&x->wait.lock);
4905 return ret;
4906}
4907EXPORT_SYMBOL(completion_done);
4908
/*
 * sleep_on_common - queue the current task on @q and sleep.
 * @q:       wait queue to sleep on
 * @state:   task state to sleep in
 * @timeout: timeout handed to schedule_timeout()
 *
 * Returns the value returned by schedule_timeout().
 *
 * NOTE(review): the task state is set and the entry queued without checking
 * any condition under q->lock, so a wakeup issued before the entry is on the
 * queue appears to be missed - confirm callers tolerate this.
 */
static long __sched
sleep_on_common(wait_queue_head_t *q, int state, long timeout)
{
	unsigned long flags;
	wait_queue_t wait;

	init_waitqueue_entry(&wait, current);

	__set_current_state(state);

	/*
	 * The caller's IRQ state is saved here and restored only by the final
	 * unlock; the intermediate unlock/lock pair uses the plain and _irq
	 * variants respectively. Keep this exact pairing.
	 */
	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue(q, &wait);
	spin_unlock(&q->lock);
	timeout = schedule_timeout(timeout);
	spin_lock_irq(&q->lock);
	__remove_wait_queue(q, &wait);
	spin_unlock_irqrestore(&q->lock, flags);

	return timeout;
}
4929
/*
 * interruptible_sleep_on - sleep on @q in TASK_INTERRUPTIBLE state with
 * MAX_SCHEDULE_TIMEOUT; the result of sleep_on_common() is discarded.
 */
void __sched interruptible_sleep_on(wait_queue_head_t *q)
{
	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(interruptible_sleep_on);
4935
/*
 * interruptible_sleep_on_timeout - sleep on @q in TASK_INTERRUPTIBLE state
 * for at most @timeout; returns the value from sleep_on_common().
 */
long __sched
interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
}
EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4942
/*
 * sleep_on - sleep on @q in TASK_UNINTERRUPTIBLE state with
 * MAX_SCHEDULE_TIMEOUT; the result of sleep_on_common() is discarded.
 */
void __sched sleep_on(wait_queue_head_t *q)
{
	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(sleep_on);
4948
/*
 * sleep_on_timeout - sleep on @q in TASK_UNINTERRUPTIBLE state for at most
 * @timeout; returns the value from sleep_on_common().
 */
long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
}
EXPORT_SYMBOL(sleep_on_timeout);
4954
4955#ifdef CONFIG_RT_MUTEXES
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
/*
 * rt_mutex_setprio - set the effective priority of a task.
 * @p:    task to modify
 * @prio: new value for p->prio; must be within [0, MAX_PRIO]
 *
 * Runs under the task's runqueue lock. The task is taken off the runqueue
 * (and put as previous task if it is the current one), its scheduling class
 * is chosen from rt_prio(@prio), p->prio is updated, and the task is put
 * back in the reverse order.
 */
void rt_mutex_setprio(struct task_struct *p, int prio)
{
	unsigned long flags;
	int oldprio, on_rq, running;
	struct rq *rq;
	const struct sched_class *prev_class = p->sched_class;

	BUG_ON(prio < 0 || prio > MAX_PRIO);

	rq = task_rq_lock(p, &flags);
	update_rq_clock(rq);

	oldprio = p->prio;
	on_rq = p->se.on_rq;
	running = task_current(rq, p);
	/* Detach using the OLD class before anything is changed. */
	if (on_rq)
		dequeue_task(rq, p, 0);
	if (running)
		p->sched_class->put_prev_task(rq, p);

	if (rt_prio(prio))
		p->sched_class = &rt_sched_class;
	else
		p->sched_class = &fair_sched_class;

	p->prio = prio;

	/* Re-attach using the NEW class. */
	if (running)
		p->sched_class->set_curr_task(rq);
	if (on_rq) {
		enqueue_task(rq, p, 0);

		check_class_changed(rq, p, prev_class, oldprio, running);
	}
	task_rq_unlock(rq, &flags);
}
5003
5004#endif
5005
/*
 * set_user_nice - change the nice value of a task.
 * @p:    task to modify
 * @nice: new nice value; values outside [-20, 19] are silently ignored
 *
 * Does nothing if @nice equals the task's current nice value.
 */
void set_user_nice(struct task_struct *p, long nice)
{
	int old_prio, delta, on_rq;
	unsigned long flags;
	struct rq *rq;

	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
		return;
	rq = task_rq_lock(p, &flags);
	update_rq_clock(rq);
	/*
	 * For tasks with an RT policy only static_prio is recorded; prio,
	 * load weight and runqueue position are left untouched.
	 */
	if (task_has_rt_policy(p)) {
		p->static_prio = NICE_TO_PRIO(nice);
		goto out_unlock;
	}
	on_rq = p->se.on_rq;
	if (on_rq)
		dequeue_task(rq, p, 0);

	p->static_prio = NICE_TO_PRIO(nice);
	set_load_weight(p);
	old_prio = p->prio;
	p->prio = effective_prio(p);
	delta = p->prio - old_prio;

	if (on_rq) {
		enqueue_task(rq, p, 0);
		/*
		 * Reschedule the runqueue's current task if prio decreased
		 * numerically, or if it increased numerically while the task
		 * is running.
		 */
		if (delta < 0 || (delta > 0 && task_running(rq, p)))
			resched_task(rq->curr);
	}
out_unlock:
	task_rq_unlock(rq, &flags);
}
EXPORT_SYMBOL(set_user_nice);
5053
5054
5055
5056
5057
5058
5059int can_nice(const struct task_struct *p, const int nice)
5060{
5061
5062 int nice_rlim = 20 - nice;
5063
5064 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
5065 capable(CAP_SYS_NICE));
5066}
5067
5068#ifdef __ARCH_WANT_SYS_NICE
5069
5070
5071
5072
5073
5074
5075
5076
5077SYSCALL_DEFINE1(nice, int, increment)
5078{
5079 long nice, retval;
5080
5081
5082
5083
5084
5085
5086 if (increment < -40)
5087 increment = -40;
5088 if (increment > 40)
5089 increment = 40;
5090
5091 nice = PRIO_TO_NICE(current->static_prio) + increment;
5092 if (nice < -20)
5093 nice = -20;
5094 if (nice > 19)
5095 nice = 19;
5096
5097 if (increment < 0 && !can_nice(current, nice))
5098 return -EPERM;
5099
5100 retval = security_task_setnice(current, nice);
5101 if (retval)
5102 return retval;
5103
5104 set_user_nice(current, nice);
5105 return 0;
5106}
5107
5108#endif
5109
5110
5111
5112
5113
5114
5115
5116
5117
/*
 * task_prio - return a task's p->prio offset by MAX_RT_PRIO.
 * @p: task in question
 *
 * The result is negative when p->prio is below MAX_RT_PRIO.
 */
int task_prio(const struct task_struct *p)
{
	return p->prio - MAX_RT_PRIO;
}
5122
5123
5124
5125
5126
/*
 * task_nice - return the nice value derived from a task's static_prio.
 * @p: task in question
 */
int task_nice(const struct task_struct *p)
{
	return TASK_NICE(p);
}
EXPORT_SYMBOL(task_nice);
5132
5133
5134
5135
5136
/*
 * idle_cpu - report whether @cpu's current task is that CPU's idle task.
 * @cpu: CPU to test
 *
 * Returns non-zero if so. No lock is taken, so the answer is a snapshot.
 */
int idle_cpu(int cpu)
{
	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
}
5141
5142
5143
5144
5145
/*
 * idle_task - return the idle task recorded in @cpu's runqueue.
 * @cpu: CPU whose idle task is wanted
 */
struct task_struct *idle_task(int cpu)
{
	return cpu_rq(cpu)->idle;
}
5150
5151
5152
5153
5154
5155static struct task_struct *find_process_by_pid(pid_t pid)
5156{
5157 return pid ? find_task_by_vpid(pid) : current;
5158}
5159
5160
/*
 * __setscheduler - install a new policy and RT priority on a task.
 * @rq:     runqueue of the task (not referenced in this function)
 * @p:      task to modify; must not be on a runqueue (BUG otherwise)
 * @policy: new scheduling policy
 * @prio:   new value for p->rt_priority
 *
 * Selects the scheduling class from the policy, then recomputes normal_prio,
 * prio and the load weight. A policy outside the listed cases leaves
 * p->sched_class unchanged.
 */
static void
__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
{
	BUG_ON(p->se.on_rq);

	p->policy = policy;
	switch (p->policy) {
	case SCHED_NORMAL:
	case SCHED_BATCH:
	case SCHED_IDLE:
		p->sched_class = &fair_sched_class;
		break;
	case SCHED_FIFO:
	case SCHED_RR:
		p->sched_class = &rt_sched_class;
		break;
	}

	p->rt_priority = prio;
	p->normal_prio = normal_prio(p);

	/* The only caller in this file holds p->pi_lock at this point. */
	p->prio = rt_mutex_getprio(p);
	set_load_weight(p);
}
5185
/*
 * __sched_setscheduler - validate and apply a policy/priority change.
 * @p:      task to modify
 * @policy: new policy, or a negative value to keep the task's current policy
 * @param:  carries the new sched_priority
 * @user:   true to apply the permission and security checks
 *
 * Must not be called from interrupt context (BUG otherwise).
 *
 * Returns 0 on success, -EINVAL for an invalid policy/priority combination,
 * -EPERM when a permission check fails, -ESRCH if the task's sighand could
 * not be locked, or the error from security_task_setscheduler().
 */
static int __sched_setscheduler(struct task_struct *p, int policy,
				struct sched_param *param, bool user)
{
	int retval, oldprio, oldpolicy = -1, on_rq, running;
	unsigned long flags;
	const struct sched_class *prev_class = p->sched_class;
	struct rq *rq;

	BUG_ON(in_interrupt());
recheck:
	/* A negative policy means "keep the current one"; remember it for the race check below. */
	if (policy < 0)
		policy = oldpolicy = p->policy;
	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
			policy != SCHED_NORMAL && policy != SCHED_BATCH &&
			policy != SCHED_IDLE)
		return -EINVAL;
	/*
	 * Priority range: [0, MAX_USER_RT_PRIO-1] for tasks with an mm,
	 * [0, MAX_RT_PRIO-1] for tasks without one.
	 */
	if (param->sched_priority < 0 ||
	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
		return -EINVAL;
	/* RT policies require a non-zero priority; non-RT policies require 0. */
	if (rt_policy(policy) != (param->sched_priority != 0))
		return -EINVAL;

	/* Restrictions that apply to callers without CAP_SYS_NICE. */
	if (user && !capable(CAP_SYS_NICE)) {
		if (rt_policy(policy)) {
			unsigned long rlim_rtprio;

			if (!lock_task_sighand(p, &flags))
				return -ESRCH;
			rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
			unlock_task_sighand(p, &flags);

			/* A zero RLIMIT_RTPRIO forbids switching to a different RT policy. */
			if (policy != p->policy && !rlim_rtprio)
				return -EPERM;

			/* Raising the priority is allowed only up to RLIMIT_RTPRIO. */
			if (param->sched_priority > p->rt_priority &&
			    param->sched_priority > rlim_rtprio)
				return -EPERM;
		}

		/* A SCHED_IDLE task may not move to any other policy. */
		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
			return -EPERM;

		/* The caller's euid must match the target's euid or uid. */
		if ((current->euid != p->euid) &&
		    (current->euid != p->uid))
			return -EPERM;
	}

	if (user) {
#ifdef CONFIG_RT_GROUP_SCHED
		/* Refuse an RT policy when the task's group has zero RT runtime. */
		if (rt_bandwidth_enabled() && rt_policy(policy) &&
		    task_group(p)->rt_bandwidth.rt_runtime == 0)
			return -EPERM;
#endif

		retval = security_task_setscheduler(p, policy, param);
		if (retval)
			return retval;
	}

	/* Lock order: p->pi_lock first, then the task's runqueue lock. */
	spin_lock_irqsave(&p->pi_lock, flags);
	rq = __task_rq_lock(p);

	/* The policy changed between the unlocked checks and now: redo them. */
	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
		policy = oldpolicy = -1;
		__task_rq_unlock(rq);
		spin_unlock_irqrestore(&p->pi_lock, flags);
		goto recheck;
	}
	update_rq_clock(rq);
	on_rq = p->se.on_rq;
	running = task_current(rq, p);
	/* Detach with the old class, change the parameters, re-attach with the new one. */
	if (on_rq)
		deactivate_task(rq, p, 0);
	if (running)
		p->sched_class->put_prev_task(rq, p);

	oldprio = p->prio;
	__setscheduler(rq, p, policy, param->sched_priority);

	if (running)
		p->sched_class->set_curr_task(rq);
	if (on_rq) {
		activate_task(rq, p, 0);

		check_class_changed(rq, p, prev_class, oldprio, running);
	}
	__task_rq_unlock(rq);
	spin_unlock_irqrestore(&p->pi_lock, flags);

	/* Called after both locks have been dropped. */
	rt_mutex_adjust_pi(p);

	return 0;
}
5308
5309
5310
5311
5312
5313
5314
5315
5316
/*
 * sched_setscheduler - change a task's policy and/or RT priority with the
 * permission checks enabled (user == true).
 * @p:      task to modify
 * @policy: new policy
 * @param:  carries the new sched_priority
 *
 * Returns the result of __sched_setscheduler().
 */
int sched_setscheduler(struct task_struct *p, int policy,
		       struct sched_param *param)
{
	return __sched_setscheduler(p, policy, param, true);
}
EXPORT_SYMBOL_GPL(sched_setscheduler);
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
/*
 * sched_setscheduler_nocheck - change a task's policy and/or RT priority
 * with the permission checks disabled (user == false).
 * @p:      task to modify
 * @policy: new policy
 * @param:  carries the new sched_priority
 *
 * Parameter validation in __sched_setscheduler() still applies. Returns the
 * result of __sched_setscheduler().
 */
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
			       struct sched_param *param)
{
	return __sched_setscheduler(p, policy, param, false);
}
5340
5341static int
5342do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5343{
5344 struct sched_param lparam;
5345 struct task_struct *p;
5346 int retval;
5347
5348 if (!param || pid < 0)
5349 return -EINVAL;
5350 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5351 return -EFAULT;
5352
5353 rcu_read_lock();
5354 retval = -ESRCH;
5355 p = find_process_by_pid(pid);
5356 if (p != NULL)
5357 retval = sched_setscheduler(p, policy, &lparam);
5358 rcu_read_unlock();
5359
5360 return retval;
5361}
5362
5363
5364
5365
5366
5367
5368
/*
 * sys_sched_setscheduler - set the policy and RT priority of a task.
 * @pid:    target pid
 * @policy: new policy; negative values are rejected with -EINVAL
 * @param:  user pointer to the new parameters
 */
SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
		struct sched_param __user *, param)
{
	/* A negative policy is reserved internally for "keep current policy". */
	if (policy < 0)
		return -EINVAL;

	return do_sched_setscheduler(pid, policy, param);
}
5378
5379
5380
5381
5382
5383
/*
 * sys_sched_setparam - set the RT priority of a task, keeping its policy.
 * @pid:   target pid
 * @param: user pointer to the new parameters
 *
 * Passes -1 as the policy, which do_sched_setscheduler() forwards as "keep
 * the current policy".
 */
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
	return do_sched_setscheduler(pid, -1, param);
}
5388
5389
5390
5391
5392
5393SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5394{
5395 struct task_struct *p;
5396 int retval;
5397
5398 if (pid < 0)
5399 return -EINVAL;
5400
5401 retval = -ESRCH;
5402 read_lock(&tasklist_lock);
5403 p = find_process_by_pid(pid);
5404 if (p) {
5405 retval = security_task_getscheduler(p);
5406 if (!retval)
5407 retval = p->policy;
5408 }
5409 read_unlock(&tasklist_lock);
5410 return retval;
5411}
5412
5413
5414
5415
5416
5417
5418SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5419{
5420 struct sched_param lp;
5421 struct task_struct *p;
5422 int retval;
5423
5424 if (!param || pid < 0)
5425 return -EINVAL;
5426
5427 read_lock(&tasklist_lock);
5428 p = find_process_by_pid(pid);
5429 retval = -ESRCH;
5430 if (!p)
5431 goto out_unlock;
5432
5433 retval = security_task_getscheduler(p);
5434 if (retval)
5435 goto out_unlock;
5436
5437 lp.sched_priority = p->rt_priority;
5438 read_unlock(&tasklist_lock);
5439
5440
5441
5442
5443 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5444
5445 return retval;
5446
5447out_unlock:
5448 read_unlock(&tasklist_lock);
5449 return retval;
5450}
5451
/*
 * sched_setaffinity - set the CPU affinity mask of a task.
 * @pid:     target pid (0 selects the caller)
 * @in_mask: requested mask; it is intersected with the task's cpuset mask
 *
 * Returns 0 on success, -ESRCH if no task matches, -EPERM if the caller's
 * euid matches neither the target's euid nor uid and CAP_SYS_NICE is not
 * held, the error from security_task_setscheduler(), or the error from
 * set_cpus_allowed_ptr().
 */
long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
{
	cpumask_t cpus_allowed;
	cpumask_t new_mask = *in_mask;
	struct task_struct *p;
	int retval;

	get_online_cpus();
	read_lock(&tasklist_lock);

	p = find_process_by_pid(pid);
	if (!p) {
		read_unlock(&tasklist_lock);
		put_online_cpus();
		return -ESRCH;
	}
	/*
	 * Take a reference on the task before dropping tasklist_lock; the
	 * rest of the function runs without that lock.
	 */
	get_task_struct(p);
	read_unlock(&tasklist_lock);

	retval = -EPERM;
	if ((current->euid != p->euid) && (current->euid != p->uid) &&
			!capable(CAP_SYS_NICE))
		goto out_unlock;

	retval = security_task_setscheduler(p, 0, NULL);
	if (retval)
		goto out_unlock;

	cpuset_cpus_allowed(p, &cpus_allowed);
	cpus_and(new_mask, new_mask, cpus_allowed);
 again:
	retval = set_cpus_allowed_ptr(p, &new_mask);

	if (!retval) {
		cpuset_cpus_allowed(p, &cpus_allowed);
		if (!cpus_subset(new_mask, cpus_allowed)) {
			/*
			 * The cpuset mask no longer covers what was just
			 * applied: retry with the current cpuset mask.
			 */
			new_mask = cpus_allowed;
			goto again;
		}
	}
out_unlock:
	put_task_struct(p);
	put_online_cpus();
	return retval;
}
5508
5509static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5510 cpumask_t *new_mask)
5511{
5512 if (len < sizeof(cpumask_t)) {
5513 memset(new_mask, 0, sizeof(cpumask_t));
5514 } else if (len > sizeof(cpumask_t)) {
5515 len = sizeof(cpumask_t);
5516 }
5517 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5518}
5519
5520
5521
5522
5523
5524
5525
/*
 * sys_sched_setaffinity - set the CPU affinity of a task from a user mask.
 * @pid:           target pid
 * @len:           length in bytes of the mask at @user_mask_ptr
 * @user_mask_ptr: user pointer to the new CPU mask
 *
 * Returns the error from get_user_cpu_mask(), otherwise the result of
 * sched_setaffinity().
 */
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
		unsigned long __user *, user_mask_ptr)
{
	cpumask_t new_mask;
	int retval;

	retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
	if (retval)
		return retval;

	return sched_setaffinity(pid, &new_mask);
}
5538
5539long sched_getaffinity(pid_t pid, cpumask_t *mask)
5540{
5541 struct task_struct *p;
5542 int retval;
5543
5544 get_online_cpus();
5545 read_lock(&tasklist_lock);
5546
5547 retval = -ESRCH;
5548 p = find_process_by_pid(pid);
5549 if (!p)
5550 goto out_unlock;
5551
5552 retval = security_task_getscheduler(p);
5553 if (retval)
5554 goto out_unlock;
5555
5556 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
5557
5558out_unlock:
5559 read_unlock(&tasklist_lock);
5560 put_online_cpus();
5561
5562 return retval;
5563}
5564
5565
5566
5567
5568
5569
5570
/*
 * sys_sched_getaffinity - copy a task's CPU affinity mask to user space.
 * @pid:           target pid
 * @len:           size in bytes of the buffer at @user_mask_ptr
 * @user_mask_ptr: user buffer receiving the mask
 *
 * Returns sizeof(cpumask_t) (the number of bytes written) on success,
 * -EINVAL if @len is smaller than sizeof(cpumask_t), the error from
 * sched_getaffinity(), or -EFAULT if the copy fails.
 */
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
		unsigned long __user *, user_mask_ptr)
{
	int ret;
	cpumask_t mask;

	if (len < sizeof(cpumask_t))
		return -EINVAL;

	ret = sched_getaffinity(pid, &mask);
	if (ret < 0)
		return ret;

	if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
		return -EFAULT;

	return sizeof(cpumask_t);
}
5589
5590
5591
5592
5593
5594
5595
/*
 * sys_sched_yield - let the calling task's scheduling class yield, then
 * reschedule.
 *
 * Always returns 0.
 */
SYSCALL_DEFINE0(sched_yield)
{
	struct rq *rq = this_rq_lock();

	schedstat_inc(rq, yld_count);
	current->sched_class->yield_task(rq);
	/*
	 * The runqueue lock is released by hand with the raw primitives and
	 * preemption is re-enabled without a reschedule check, because
	 * schedule() is called immediately afterwards.
	 */
	__release(rq->lock);
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
	_raw_spin_unlock(&rq->lock);
	preempt_enable_no_resched();

	schedule();

	return 0;
}
5616
/*
 * __cond_resched - call schedule() repeatedly until need_resched() is clear.
 *
 * Each schedule() call is bracketed by adding and removing PREEMPT_ACTIVE
 * in the preempt count.
 * NOTE(review): PREEMPT_ACTIVE presumably prevents recursive preemption and
 * keeps the task's state untouched across schedule() - confirm in schedule().
 */
static void __cond_resched(void)
{
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
	__might_sleep(__FILE__, __LINE__);
#endif
	do {
		add_preempt_count(PREEMPT_ACTIVE);
		schedule();
		sub_preempt_count(PREEMPT_ACTIVE);
	} while (need_resched());
}
5633
5634int __sched _cond_resched(void)
5635{
5636 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
5637 system_state == SYSTEM_RUNNING) {
5638 __cond_resched();
5639 return 1;
5640 }
5641 return 0;
5642}
5643EXPORT_SYMBOL(_cond_resched);
5644
5645
5646
5647
5648
5649
5650
5651
5652
/*
 * cond_resched_lock - briefly drop a held spinlock if a reschedule is
 * pending or spin_needbreak() reports true for the lock.
 * @lock: spinlock currently held by the caller
 *
 * Returns 1 if the lock was dropped and re-acquired (the caller must then
 * revalidate whatever the lock protects), 0 if it was held throughout.
 */
int cond_resched_lock(spinlock_t *lock)
{
	/* Sampled once up front; re-checked after the lock is dropped. */
	int resched = need_resched() && system_state == SYSTEM_RUNNING;
	int ret = 0;

	if (spin_needbreak(lock) || resched) {
		spin_unlock(lock);
		if (resched && need_resched())
			__cond_resched();
		else
			cpu_relax();
		ret = 1;
		spin_lock(lock);
	}
	return ret;
}
EXPORT_SYMBOL(cond_resched_lock);
5670
5671int __sched cond_resched_softirq(void)
5672{
5673 BUG_ON(!in_softirq());
5674
5675 if (need_resched() && system_state == SYSTEM_RUNNING) {
5676 local_bh_enable();
5677 __cond_resched();
5678 local_bh_disable();
5679 return 1;
5680 }
5681 return 0;
5682}
5683EXPORT_SYMBOL(cond_resched_softirq);
5684
5685
5686
5687
5688
5689
5690
/*
 * yield - give up the CPU from kernel code.
 *
 * Forces the current task's state to TASK_RUNNING and then invokes the
 * sched_yield system call implementation directly.
 */
void __sched yield(void)
{
	set_current_state(TASK_RUNNING);
	sys_sched_yield();
}
EXPORT_SYMBOL(yield);
5697
5698
5699
5700
5701
5702
5703
5704
/*
 * io_schedule - call schedule() while accounting the sleep as an I/O wait.
 *
 * Brackets schedule() with block-I/O delay accounting and with an increment
 * and decrement of nr_iowait on the runqueue obtained before sleeping. The
 * decrement uses that same runqueue pointer.
 */
void __sched io_schedule(void)
{
	struct rq *rq = &__raw_get_cpu_var(runqueues);

	delayacct_blkio_start();
	atomic_inc(&rq->nr_iowait);
	schedule();
	atomic_dec(&rq->nr_iowait);
	delayacct_blkio_end();
}
EXPORT_SYMBOL(io_schedule);
5716
/*
 * io_schedule_timeout - like io_schedule(), but sleeps via
 * schedule_timeout().
 * @timeout: timeout handed to schedule_timeout()
 *
 * Returns the value returned by schedule_timeout().
 */
long __sched io_schedule_timeout(long timeout)
{
	struct rq *rq = &__raw_get_cpu_var(runqueues);
	long ret;

	delayacct_blkio_start();
	atomic_inc(&rq->nr_iowait);
	ret = schedule_timeout(timeout);
	atomic_dec(&rq->nr_iowait);
	delayacct_blkio_end();
	return ret;
}
5729
5730
5731
5732
5733
5734
5735
5736
5737SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5738{
5739 int ret = -EINVAL;
5740
5741 switch (policy) {
5742 case SCHED_FIFO:
5743 case SCHED_RR:
5744 ret = MAX_USER_RT_PRIO-1;
5745 break;
5746 case SCHED_NORMAL:
5747 case SCHED_BATCH:
5748 case SCHED_IDLE:
5749 ret = 0;
5750 break;
5751 }
5752 return ret;
5753}
5754
5755
5756
5757
5758
5759
5760
5761
5762SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5763{
5764 int ret = -EINVAL;
5765
5766 switch (policy) {
5767 case SCHED_FIFO:
5768 case SCHED_RR:
5769 ret = 1;
5770 break;
5771 case SCHED_NORMAL:
5772 case SCHED_BATCH:
5773 case SCHED_IDLE:
5774 ret = 0;
5775 }
5776 return ret;
5777}
5778
5779
5780
5781
5782
5783
5784
5785
5786
/*
 * sys_sched_rr_get_interval - report the timeslice of a task.
 * @pid:      target pid (0 selects the caller)
 * @interval: user pointer receiving the timeslice as a timespec
 *
 * Returns 0 on success, -EINVAL for a negative pid, -ESRCH if no task
 * matches, the error from security_task_getscheduler(), or -EFAULT if the
 * copy to user space fails.
 */
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
		struct timespec __user *, interval)
{
	struct task_struct *p;
	unsigned int time_slice;
	int retval;
	struct timespec t;

	if (pid < 0)
		return -EINVAL;

	retval = -ESRCH;
	read_lock(&tasklist_lock);
	p = find_process_by_pid(pid);
	if (!p)
		goto out_unlock;

	retval = security_task_getscheduler(p);
	if (retval)
		goto out_unlock;
	/*
	 * SCHED_FIFO reports 0, SCHED_RR reports DEF_TIMESLICE, and every
	 * other policy reports the CFS slice of the task (0 if the CFS
	 * runqueue carries no load).
	 */
	time_slice = 0;
	if (p->policy == SCHED_RR) {
		time_slice = DEF_TIMESLICE;
	} else if (p->policy != SCHED_FIFO) {
		struct sched_entity *se = &p->se;
		unsigned long flags;
		struct rq *rq;

		rq = task_rq_lock(p, &flags);
		if (rq->cfs.load.weight)
			time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
		task_rq_unlock(rq, &flags);
	}
	read_unlock(&tasklist_lock);
	/* Convert and copy out only after tasklist_lock has been dropped. */
	jiffies_to_timespec(time_slice, &t);
	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
	return retval;

out_unlock:
	read_unlock(&tasklist_lock);
	return retval;
}
5834
/* One character per task state, indexed by the state number computed below. */
static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;

/*
 * sched_show_task - print a one-line summary of a task followed by its
 * stack trace.
 * @p: task to show
 *
 * The line contains the command name, a state character, either a "running"
 * marker or the saved PC, the unused stack size (0 unless
 * CONFIG_DEBUG_STACK_USAGE is set), the pid and the parent's pid.
 */
void sched_show_task(struct task_struct *p)
{
	unsigned long free = 0;
	unsigned state;

	/* Index of the lowest set state bit plus one; 0 when p->state is 0. */
	state = p->state ? __ffs(p->state) + 1 : 0;
	printk(KERN_INFO "%-13.13s %c", p->comm,
		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if BITS_PER_LONG == 32
	if (state == TASK_RUNNING)
		printk(KERN_CONT " running ");
	else
		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
#else
	if (state == TASK_RUNNING)
		printk(KERN_CONT " running task ");
	else
		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
	{
		/* Count the bytes from the end of the stack that are still zero. */
		unsigned long *n = end_of_stack(p);
		while (!*n)
			n++;
		free = (unsigned long)n - (unsigned long)end_of_stack(p);
	}
#endif
	printk(KERN_CONT "%5lu %5d %6d\n", free,
		task_pid_nr(p), task_pid_nr(p->real_parent));

	show_stack(p, NULL);
}
5869
/*
 * show_state_filter - print a summary of all threads matching a state mask.
 * @state_filter: mask of task states to show; 0 shows every thread
 *
 * Walks all threads under tasklist_lock and calls sched_show_task() on each
 * match. When @state_filter has all bits set, debug_show_all_locks() is
 * called as well.
 */
void show_state_filter(unsigned long state_filter)
{
	struct task_struct *g, *p;

#if BITS_PER_LONG == 32
	printk(KERN_INFO
		" task PC stack pid father\n");
#else
	printk(KERN_INFO
		" task PC stack pid father\n");
#endif
	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		/* Touched on every iteration: the walk can take a long time. */
		touch_nmi_watchdog();
		if (!state_filter || (p->state & state_filter))
			sched_show_task(p);
	} while_each_thread(g, p);

	touch_all_softlockup_watchdogs();

#ifdef CONFIG_SCHED_DEBUG
	sysrq_sched_debug_show();
#endif
	read_unlock(&tasklist_lock);
	/* -1 converts to an all-ones unsigned long, i.e. "every state". */
	if (state_filter == -1)
		debug_show_all_locks();
}
5904
/*
 * init_idle_bootup_task - assign the idle scheduling class to a task.
 * @idle: task to modify
 */
void __cpuinit init_idle_bootup_task(struct task_struct *idle)
{
	idle->sched_class = &idle_sched_class;
}
5909
5910
5911
5912
5913
5914
5915
5916
5917
/*
 * init_idle - set up a task as the idle task of a CPU.
 * @idle: task to become the idle task
 * @cpu:  CPU the idle task belongs to
 *
 * Under the CPU's runqueue lock the task is reset with __sched_fork(), given
 * priority MAX_PRIO, restricted to @cpu, and installed as both the current
 * and the idle task of that runqueue.
 */
void __cpuinit init_idle(struct task_struct *idle, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	spin_lock_irqsave(&rq->lock, flags);

	__sched_fork(idle);
	idle->se.exec_start = sched_clock();

	idle->prio = idle->normal_prio = MAX_PRIO;
	idle->cpus_allowed = cpumask_of_cpu(cpu);
	__set_task_cpu(idle, cpu);

	rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
	idle->oncpu = 1;
#endif
	spin_unlock_irqrestore(&rq->lock, flags);

	/* The preempt count is written only after the runqueue lock is released. */
#if defined(CONFIG_PREEMPT)
	task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
#else
	task_thread_info(idle)->preempt_count = 0;
#endif
	/* Idle tasks use their own scheduling class. */
	idle->sched_class = &idle_sched_class;
}
5949
5950
5951
5952
5953
5954
5955
5956
/*
 * Global CPU mask, initially empty.
 * NOTE(review): presumably tracks CPUs whose periodic tick is stopped -
 * confirm against the users of this mask.
 */
cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968static inline void sched_init_granularity(void)
5969{
5970 unsigned int factor = 1 + ilog2(num_online_cpus());
5971 const unsigned long limit = 200000000;
5972
5973 sysctl_sched_min_granularity *= factor;
5974 if (sysctl_sched_min_granularity > limit)
5975 sysctl_sched_min_granularity = limit;
5976
5977 sysctl_sched_latency *= factor;
5978 if (sysctl_sched_latency > limit)
5979 sysctl_sched_latency = limit;
5980
5981 sysctl_sched_wakeup_granularity *= factor;
5982
5983 sysctl_sched_shares_ratelimit *= factor;
5984}
5985
5986#ifdef CONFIG_SMP
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
6013{
6014 struct migration_req req;
6015 unsigned long flags;
6016 struct rq *rq;
6017 int ret = 0;
6018
6019 rq = task_rq_lock(p, &flags);
6020 if (!cpus_intersects(*new_mask, cpu_online_map)) {
6021 ret = -EINVAL;
6022 goto out;
6023 }
6024
6025 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
6026 !cpus_equal(p->cpus_allowed, *new_mask))) {
6027 ret = -EINVAL;
6028 goto out;
6029 }
6030
6031 if (p->sched_class->set_cpus_allowed)
6032 p->sched_class->set_cpus_allowed(p, new_mask);
6033 else {
6034 p->cpus_allowed = *new_mask;
6035 p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
6036 }
6037
6038
6039 if (cpu_isset(task_cpu(p), *new_mask))
6040 goto out;
6041
6042 if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
6043
6044 task_rq_unlock(rq, &flags);
6045 wake_up_process(rq->migration_thread);
6046 wait_for_completion(&req.done);
6047 tlb_migrate_finish(p->mm);