1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h>
37#include <linux/interrupt.h>
38#include <linux/capability.h>
39#include <linux/completion.h>
40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h>
42#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
45#include <linux/freezer.h>
46#include <linux/vmalloc.h>
47#include <linux/blkdev.h>
48#include <linux/delay.h>
49#include <linux/pid_namespace.h>
50#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
57#include <linux/kthread.h>
58#include <linux/seq_file.h>
59#include <linux/sysctl.h>
60#include <linux/syscalls.h>
61#include <linux/times.h>
62#include <linux/tsacct_kern.h>
63#include <linux/kprobes.h>
64#include <linux/delayacct.h>
65#include <linux/reciprocal_div.h>
66#include <linux/unistd.h>
67#include <linux/pagemap.h>
68#include <linux/hrtimer.h>
69#include <linux/tick.h>
70#include <linux/bootmem.h>
71#include <linux/debugfs.h>
72#include <linux/ctype.h>
73#include <linux/ftrace.h>
74
75#include <asm/tlb.h>
76#include <asm/irq_regs.h>
77
78#include "sched_cpupri.h"
79
80
81
82
83
84
/*
 * Convert between nice values and static priorities: a nice value is
 * shifted by 20 and placed directly above the MAX_RT_PRIO range.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)

/*
 * "User priority" is a priority expressed relative to MAX_RT_PRIO, so it
 * runs from 0 up to MAX_USER_PRIO - 1.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

/* Convert a nanosecond count into jiffies (integer division, truncating). */
#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

/* Weight and shift of a nice-0 task, aliased to the SCHED_LOAD_* scale. */
#define NICE_0_LOAD		SCHED_LOAD_SCALE
#define NICE_0_SHIFT		SCHED_LOAD_SHIFT

/* Default timeslice: 100 milliseconds expressed in jiffies. */
#define DEF_TIMESLICE		(100 * HZ / 1000)

/* All-ones u64; the RT bandwidth code in this file treats it as "no limit". */
#define RUNTIME_INF		((u64)~0ULL)
118
119#ifdef CONFIG_SMP
120
121
122
123
124static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
125{
126 return reciprocal_divide(load, sg->reciprocal_cpu_power);
127}
128
129
130
131
132
133static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
134{
135 sg->__cpu_power += val;
136 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
137}
138#endif
139
140static inline int rt_policy(int policy)
141{
142 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
143 return 1;
144 return 0;
145}
146
147static inline int task_has_rt_policy(struct task_struct *p)
148{
149 return rt_policy(p->policy);
150}
151
152
153
154
155struct rt_prio_array {
156 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);
157 struct list_head queue[MAX_RT_PRIO];
158};
159
160struct rt_bandwidth {
161
162 spinlock_t rt_runtime_lock;
163 ktime_t rt_period;
164 u64 rt_runtime;
165 struct hrtimer rt_period_timer;
166};
167
168static struct rt_bandwidth def_rt_bandwidth;
169
170static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
171
172static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
173{
174 struct rt_bandwidth *rt_b =
175 container_of(timer, struct rt_bandwidth, rt_period_timer);
176 ktime_t now;
177 int overrun;
178 int idle = 0;
179
180 for (;;) {
181 now = hrtimer_cb_get_time(timer);
182 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
183
184 if (!overrun)
185 break;
186
187 idle = do_sched_rt_period_timer(rt_b, overrun);
188 }
189
190 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
191}
192
193static
194void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
195{
196 rt_b->rt_period = ns_to_ktime(period);
197 rt_b->rt_runtime = runtime;
198
199 spin_lock_init(&rt_b->rt_runtime_lock);
200
201 hrtimer_init(&rt_b->rt_period_timer,
202 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
203 rt_b->rt_period_timer.function = sched_rt_period_timer;
204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
205}
206
207static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
208{
209 ktime_t now;
210
211 if (rt_b->rt_runtime == RUNTIME_INF)
212 return;
213
214 if (hrtimer_active(&rt_b->rt_period_timer))
215 return;
216
217 spin_lock(&rt_b->rt_runtime_lock);
218 for (;;) {
219 if (hrtimer_active(&rt_b->rt_period_timer))
220 break;
221
222 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
223 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
224 hrtimer_start(&rt_b->rt_period_timer,
225 rt_b->rt_period_timer.expires,
226 HRTIMER_MODE_ABS);
227 }
228 spin_unlock(&rt_b->rt_runtime_lock);
229}
230
231#ifdef CONFIG_RT_GROUP_SCHED
232static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
233{
234 hrtimer_cancel(&rt_b->rt_period_timer);
235}
236#endif
237
238
239
240
241
/*
 * Mutex for the scheduler-domain code (NOTE(review): its users are outside
 * this part of the file -- confirm exactly what it serialises).
 */
static DEFINE_MUTEX(sched_domains_mutex);
243
244#ifdef CONFIG_GROUP_SCHED
245
246#include <linux/cgroup.h>
247
248struct cfs_rq;
249
250static LIST_HEAD(task_groups);
251
252
253struct task_group {
254#ifdef CONFIG_CGROUP_SCHED
255 struct cgroup_subsys_state css;
256#endif
257
258#ifdef CONFIG_FAIR_GROUP_SCHED
259
260 struct sched_entity **se;
261
262 struct cfs_rq **cfs_rq;
263 unsigned long shares;
264#endif
265
266#ifdef CONFIG_RT_GROUP_SCHED
267 struct sched_rt_entity **rt_se;
268 struct rt_rq **rt_rq;
269
270 struct rt_bandwidth rt_bandwidth;
271#endif
272
273 struct rcu_head rcu;
274 struct list_head list;
275
276 struct task_group *parent;
277 struct list_head siblings;
278 struct list_head children;
279};
280
#ifdef CONFIG_USER_SCHED

/*
 * With user-based grouping there is a dedicated root task group, and the
 * statically allocated per-CPU entities and runqueues below exist for the
 * enabled group-scheduling classes.
 */
struct task_group root_task_group;

#ifdef CONFIG_FAIR_GROUP_SCHED
/* per-CPU fair scheduling entity */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* per-CPU fair runqueue */
static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
/* per-CPU real-time scheduling entity and runqueue */
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
#endif
#else
/* Without user-based grouping the root group is simply init_task_group. */
#define root_task_group init_task_group
#endif
304
305
306
307
308static DEFINE_SPINLOCK(task_group_lock);
309
310#ifdef CONFIG_FAIR_GROUP_SCHED
311#ifdef CONFIG_USER_SCHED
312# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
313#else
314# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
315#endif
316
317
318
319
320
321
322
323
324
325#define MIN_SHARES 2
326#define MAX_SHARES (1UL << 18)
327
328static int init_task_group_load = INIT_TASK_GROUP_LOAD;
329#endif
330
331
332
333
334struct task_group init_task_group;
335
336
337static inline struct task_group *task_group(struct task_struct *p)
338{
339 struct task_group *tg;
340
341#ifdef CONFIG_USER_SCHED
342 tg = p->user->tg;
343#elif defined(CONFIG_CGROUP_SCHED)
344 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
345 struct task_group, css);
346#else
347 tg = &init_task_group;
348#endif
349 return tg;
350}
351
352
/*
 * Point the per-class runqueue and parent-entity pointers of @p at the
 * entries its task group owns for @cpu.
 */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
	struct task_group *tg = task_group(p);
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	p->se.cfs_rq = tg->cfs_rq[cpu];
	p->se.parent = tg->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	p->rt.rt_rq = tg->rt_rq[cpu];
	p->rt.parent = tg->rt_se[cpu];
#endif
}
365
366#else
367
368static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
369static inline struct task_group *task_group(struct task_struct *p)
370{
371 return NULL;
372}
373
374#endif
375
376
377struct cfs_rq {
378 struct load_weight load;
379 unsigned long nr_running;
380
381 u64 exec_clock;
382 u64 min_vruntime;
383 u64 pair_start;
384
385 struct rb_root tasks_timeline;
386 struct rb_node *rb_leftmost;
387
388 struct list_head tasks;
389 struct list_head *balance_iterator;
390
391
392
393
394
395 struct sched_entity *curr, *next;
396
397 unsigned long nr_spread_over;
398
399#ifdef CONFIG_FAIR_GROUP_SCHED
400 struct rq *rq;
401
402
403
404
405
406
407
408
409
410 struct list_head leaf_cfs_rq_list;
411 struct task_group *tg;
412
413#ifdef CONFIG_SMP
414
415
416
417 unsigned long task_weight;
418
419
420
421
422
423
424
425 unsigned long h_load;
426
427
428
429
430 unsigned long shares;
431
432
433
434
435 unsigned long rq_weight;
436#endif
437#endif
438};
439
440
441struct rt_rq {
442 struct rt_prio_array active;
443 unsigned long rt_nr_running;
444#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
445 int highest_prio;
446#endif
447#ifdef CONFIG_SMP
448 unsigned long rt_nr_migratory;
449 int overloaded;
450#endif
451 int rt_throttled;
452 u64 rt_time;
453 u64 rt_runtime;
454
455 spinlock_t rt_runtime_lock;
456
457#ifdef CONFIG_RT_GROUP_SCHED
458 unsigned long rt_nr_boosted;
459
460 struct rq *rq;
461 struct list_head leaf_rt_rq_list;
462 struct task_group *tg;
463 struct sched_rt_entity *rt_se;
464#endif
465};
466
467#ifdef CONFIG_SMP
468
469
470
471
472
473
474
475
476
477struct root_domain {
478 atomic_t refcount;
479 cpumask_t span;
480 cpumask_t online;
481
482
483
484
485
486 cpumask_t rto_mask;
487 atomic_t rto_count;
488#ifdef CONFIG_SMP
489 struct cpupri cpupri;
490#endif
491};
492
493
494
495
496
497static struct root_domain def_root_domain;
498
499#endif
500
501
502
503
504
505
506
507
508struct rq {
509
510 spinlock_t lock;
511
512
513
514
515
516 unsigned long nr_running;
517 #define CPU_LOAD_IDX_MAX 5
518 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
519 unsigned char idle_at_tick;
520#ifdef CONFIG_NO_HZ
521 unsigned long last_tick_seen;
522 unsigned char in_nohz_recently;
523#endif
524
525 struct load_weight load;
526 unsigned long nr_load_updates;
527 u64 nr_switches;
528
529 struct cfs_rq cfs;
530 struct rt_rq rt;
531
532#ifdef CONFIG_FAIR_GROUP_SCHED
533
534 struct list_head leaf_cfs_rq_list;
535#endif
536#ifdef CONFIG_RT_GROUP_SCHED
537 struct list_head leaf_rt_rq_list;
538#endif
539
540
541
542
543
544
545
546 unsigned long nr_uninterruptible;
547
548 struct task_struct *curr, *idle;
549 unsigned long next_balance;
550 struct mm_struct *prev_mm;
551
552 u64 clock;
553
554 atomic_t nr_iowait;
555
556#ifdef CONFIG_SMP
557 struct root_domain *rd;
558 struct sched_domain *sd;
559
560
561 int active_balance;
562 int push_cpu;
563
564 int cpu;
565 int online;
566
567 unsigned long avg_load_per_task;
568
569 struct task_struct *migration_thread;
570 struct list_head migration_queue;
571#endif
572
573#ifdef CONFIG_SCHED_HRTICK
574#ifdef CONFIG_SMP
575 int hrtick_csd_pending;
576 struct call_single_data hrtick_csd;
577#endif
578 struct hrtimer hrtick_timer;
579#endif
580
581#ifdef CONFIG_SCHEDSTATS
582
583 struct sched_info rq_sched_info;
584
585
586 unsigned int yld_exp_empty;
587 unsigned int yld_act_empty;
588 unsigned int yld_both_empty;
589 unsigned int yld_count;
590
591
592 unsigned int sched_switch;
593 unsigned int sched_count;
594 unsigned int sched_goidle;
595
596
597 unsigned int ttwu_count;
598 unsigned int ttwu_local;
599
600
601 unsigned int bkl_count;
602#endif
603};
604
605static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
606
607static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
608{
609 rq->curr->sched_class->check_preempt_curr(rq, p, sync);
610}
611
/* Return the CPU number @rq belongs to; always 0 on uniprocessor builds. */
static inline int cpu_of(struct rq *rq)
{
#ifndef CONFIG_SMP
	return 0;
#else
	return rq->cpu;
#endif
}
620
621
622
623
624
625
626
627
/*
 * Walk the sched-domain chain of @cpu from its base domain up through the
 * ->parent links.  The base pointer is fetched with rcu_dereference().
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

/* Runqueue accessors. */
#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))	/* rq of @cpu */
#define this_rq()		(&__get_cpu_var(runqueues))	/* rq of this CPU */
#define task_rq(p)		cpu_rq(task_cpu(p))		/* rq of @p's CPU */
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)		/* task on @cpu */
635
636static inline void update_rq_clock(struct rq *rq)
637{
638 rq->clock = sched_clock_cpu(cpu_of(rq));
639}
640
641
642
643
/*
 * Storage qualifier for scheduler tunables: writable (and __read_mostly)
 * only when CONFIG_SCHED_DEBUG is set, a static constant otherwise.
 */
#ifndef CONFIG_SCHED_DEBUG
# define const_debug static const
#else
# define const_debug __read_mostly
#endif
649
650
651
652
653
654
655
656
657int runqueue_is_locked(void)
658{
659 int cpu = get_cpu();
660 struct rq *rq = cpu_rq(cpu);
661 int ret;
662
663 ret = spin_is_locked(&rq->lock);
664 put_cpu();
665 return ret;
666}
667
668
669
670
671
672#define SCHED_FEAT(name, enabled) \
673 __SCHED_FEAT_##name ,
674
675enum {
676#include "sched_features.h"
677};
678
679#undef SCHED_FEAT
680
681#define SCHED_FEAT(name, enabled) \
682 (1UL << __SCHED_FEAT_##name) * enabled |
683
684const_debug unsigned int sysctl_sched_features =
685#include "sched_features.h"
686 0;
687
688#undef SCHED_FEAT
689
690#ifdef CONFIG_SCHED_DEBUG
691#define SCHED_FEAT(name, enabled) \
692 #name ,
693
694static __read_mostly char *sched_feat_names[] = {
695#include "sched_features.h"
696 NULL
697};
698
699#undef SCHED_FEAT
700
701static int sched_feat_open(struct inode *inode, struct file *filp)
702{
703 filp->private_data = inode->i_private;
704 return 0;
705}
706
707static ssize_t
708sched_feat_read(struct file *filp, char __user *ubuf,
709 size_t cnt, loff_t *ppos)
710{
711 char *buf;
712 int r = 0;
713 int len = 0;
714 int i;
715
716 for (i = 0; sched_feat_names[i]; i++) {
717 len += strlen(sched_feat_names[i]);
718 len += 4;
719 }
720
721 buf = kmalloc(len + 2, GFP_KERNEL);
722 if (!buf)
723 return -ENOMEM;
724
725 for (i = 0; sched_feat_names[i]; i++) {
726 if (sysctl_sched_features & (1UL << i))
727 r += sprintf(buf + r, "%s ", sched_feat_names[i]);
728 else
729 r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
730 }
731
732 r += sprintf(buf + r, "\n");
733 WARN_ON(r >= len + 2);
734
735 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
736
737 kfree(buf);
738
739 return r;
740}
741
742static ssize_t
743sched_feat_write(struct file *filp, const char __user *ubuf,
744 size_t cnt, loff_t *ppos)
745{
746 char buf[64];
747 char *cmp;
748 int neg = 0;
749 int i;
750
751 if (cnt > 63)
752 cnt = 63;
753
754 if (copy_from_user(&buf, ubuf, cnt))
755 return -EFAULT;
756
757 buf[cnt] = 0;
758 cmp = strstrip(buf);
759
760 if (strncmp(buf, "NO_", 3) == 0) {
761 neg = 1;
762 cmp += 3;
763 }
764
765 for (i = 0; sched_feat_names[i]; i++) {
766 if (strcmp(cmp, sched_feat_names[i]) == 0) {
767 if (neg)
768 sysctl_sched_features &= ~(1UL << i);
769 else
770 sysctl_sched_features |= (1UL << i);
771 break;
772 }
773 }
774
775 if (!sched_feat_names[i])
776 return -EINVAL;
777
778 filp->f_pos += cnt;
779
780 return cnt;
781}
782
783static struct file_operations sched_feat_fops = {
784 .open = sched_feat_open,
785 .read = sched_feat_read,
786 .write = sched_feat_write,
787};
788
789static __init int sched_init_debug(void)
790{
791 debugfs_create_file("sched_features", 0644, NULL, NULL,
792 &sched_feat_fops);
793
794 return 0;
795}
796late_initcall(sched_init_debug);
797
798#endif
799
800#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
801
802
803
804
805
806const_debug unsigned int sysctl_sched_nr_migrate = 32;
807
808
809
810
811
812unsigned int sysctl_sched_shares_ratelimit = 250000;
813
814
815
816
817
818unsigned int sysctl_sched_rt_period = 1000000;
819
820static __read_mostly int scheduler_running;
821
822
823
824
825
826int sysctl_sched_rt_runtime = 950000;
827
828static inline u64 global_rt_period(void)
829{
830 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
831}
832
833static inline u64 global_rt_runtime(void)
834{
835 if (sysctl_sched_rt_runtime < 0)
836 return RUNTIME_INF;
837
838 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
839}
840
/*
 * Architecture hooks around a context switch.  Both default to no-ops
 * unless the architecture has already defined them.
 */
#ifndef prepare_arch_switch
# define prepare_arch_switch(next)	do { } while (0)
#endif

#ifndef finish_arch_switch
# define finish_arch_switch(prev)	do { } while (0)
#endif
847
848static inline int task_current(struct rq *rq, struct task_struct *p)
849{
850 return rq->curr == p;
851}
852
853#ifndef __ARCH_WANT_UNLOCKED_CTXSW
/*
 * Variant for context switches done with the runqueue lock held: a task
 * is running exactly when it is the current task of @rq.
 */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	int running = task_current(rq, p);

	return running;
}

/* Nothing to prepare in this variant. */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}
862
863static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
864{
865#ifdef CONFIG_DEBUG_SPINLOCK
866
867 rq->lock.owner = current;
868#endif
869
870
871
872
873
874 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
875
876 spin_unlock_irq(&rq->lock);
877}
878
879#else
/*
 * Variant for context switches done without the runqueue lock: on SMP the
 * per-task ->oncpu flag says whether @p is running, otherwise @p is
 * running exactly when it is the current task of @rq.
 */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifndef CONFIG_SMP
	return task_current(rq, p);
#else
	return p->oncpu;
#endif
}
888
889static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
890{
891#ifdef CONFIG_SMP
892
893
894
895
896
897 next->oncpu = 1;
898#endif
899#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
900 spin_unlock_irq(&rq->lock);
901#else
902 spin_unlock(&rq->lock);
903#endif
904}
905
/*
 * Finish a context switch that ran without the runqueue lock: clear the
 * on-CPU flag of @prev (SMP only) and re-enable interrupts unless they
 * were already enabled for the switch.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * The write barrier orders all earlier stores before the store
	 * that clears ->oncpu.
	 */
	smp_wmb();
	prev->oncpu = 0;
#endif

#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
921#endif
922
923
924
925
926
927static inline struct rq *__task_rq_lock(struct task_struct *p)
928 __acquires(rq->lock)
929{
930 for (;;) {
931 struct rq *rq = task_rq(p);
932 spin_lock(&rq->lock);
933 if (likely(rq == task_rq(p)))
934 return rq;
935 spin_unlock(&rq->lock);
936 }
937}
938
939
940
941
942
943
944static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
945 __acquires(rq->lock)
946{
947 struct rq *rq;
948
949 for (;;) {
950 local_irq_save(*flags);
951 rq = task_rq(p);
952 spin_lock(&rq->lock);
953 if (likely(rq == task_rq(p)))
954 return rq;
955 spin_unlock_irqrestore(&rq->lock, *flags);
956 }
957}
958
959static void __task_rq_unlock(struct rq *rq)
960 __releases(rq->lock)
961{
962 spin_unlock(&rq->lock);
963}
964
965static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
966 __releases(rq->lock)
967{
968 spin_unlock_irqrestore(&rq->lock, *flags);
969}
970
971
972
973
974static struct rq *this_rq_lock(void)
975 __acquires(rq->lock)
976{
977 struct rq *rq;
978
979 local_irq_disable();
980 rq = this_rq();
981 spin_lock(&rq->lock);
982
983 return rq;
984}
985
986#ifdef CONFIG_SCHED_HRTICK
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003static inline int hrtick_enabled(struct rq *rq)
1004{
1005 if (!sched_feat(HRTICK))
1006 return 0;
1007 if (!cpu_active(cpu_of(rq)))
1008 return 0;
1009 return hrtimer_is_hres_active(&rq->hrtick_timer);
1010}
1011
1012static void hrtick_clear(struct rq *rq)
1013{
1014 if (hrtimer_active(&rq->hrtick_timer))
1015 hrtimer_cancel(&rq->hrtick_timer);
1016}
1017
1018
1019
1020
1021
1022static enum hrtimer_restart hrtick(struct hrtimer *timer)
1023{
1024 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1025
1026 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1027
1028 spin_lock(&rq->lock);
1029 update_rq_clock(rq);
1030 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1031 spin_unlock(&rq->lock);
1032
1033 return HRTIMER_NORESTART;
1034}
1035
1036#ifdef CONFIG_SMP
1037
1038
1039
1040static void __hrtick_start(void *arg)
1041{
1042 struct rq *rq = arg;
1043
1044 spin_lock(&rq->lock);
1045 hrtimer_restart(&rq->hrtick_timer);
1046 rq->hrtick_csd_pending = 0;
1047 spin_unlock(&rq->lock);
1048}
1049
1050
1051
1052
1053
1054
1055static void hrtick_start(struct rq *rq, u64 delay)
1056{
1057 struct hrtimer *timer = &rq->hrtick_timer;
1058 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1059
1060 timer->expires = time;
1061
1062 if (rq == this_rq()) {
1063 hrtimer_restart(timer);
1064 } else if (!rq->hrtick_csd_pending) {
1065 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
1066 rq->hrtick_csd_pending = 1;
1067 }
1068}
1069
1070static int
1071hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1072{
1073 int cpu = (int)(long)hcpu;
1074
1075 switch (action) {
1076 case CPU_UP_CANCELED:
1077 case CPU_UP_CANCELED_FROZEN:
1078 case CPU_DOWN_PREPARE:
1079 case CPU_DOWN_PREPARE_FROZEN:
1080 case CPU_DEAD:
1081 case CPU_DEAD_FROZEN:
1082 hrtick_clear(cpu_rq(cpu));
1083 return NOTIFY_OK;
1084 }
1085
1086 return NOTIFY_DONE;
1087}
1088
1089static __init void init_hrtick(void)
1090{
1091 hotcpu_notifier(hotplug_hrtick, 0);
1092}
1093#else
1094
1095
1096
1097
1098
1099static void hrtick_start(struct rq *rq, u64 delay)
1100{
1101 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
1102}
1103
1104static void init_hrtick(void)
1105{
1106}
1107#endif
1108
1109static void init_rq_hrtick(struct rq *rq)
1110{
1111#ifdef CONFIG_SMP
1112 rq->hrtick_csd_pending = 0;
1113
1114 rq->hrtick_csd.flags = 0;
1115 rq->hrtick_csd.func = __hrtick_start;
1116 rq->hrtick_csd.info = rq;
1117#endif
1118
1119 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1120 rq->hrtick_timer.function = hrtick;
1121 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1122}
1123#else
/* CONFIG_SCHED_HRTICK disabled: the hrtick helpers are empty stubs. */

static inline void hrtick_clear(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}

static inline void init_hrtick(void)
{
}
1135#endif
1136
1137
1138
1139
1140
1141
1142
1143
1144#ifdef CONFIG_SMP
1145
1146#ifndef tsk_is_polling
1147#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1148#endif
1149
1150static void resched_task(struct task_struct *p)
1151{
1152 int cpu;
1153
1154 assert_spin_locked(&task_rq(p)->lock);
1155
1156 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
1157 return;
1158
1159 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1160
1161 cpu = task_cpu(p);
1162 if (cpu == smp_processor_id())
1163 return;
1164
1165
1166 smp_mb();
1167 if (!tsk_is_polling(p))
1168 smp_send_reschedule(cpu);
1169}
1170
1171static void resched_cpu(int cpu)
1172{
1173 struct rq *rq = cpu_rq(cpu);
1174 unsigned long flags;
1175
1176 if (!spin_trylock_irqsave(&rq->lock, flags))
1177 return;
1178 resched_task(cpu_curr(cpu));
1179 spin_unlock_irqrestore(&rq->lock, flags);
1180}
1181
1182#ifdef CONFIG_NO_HZ
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193void wake_up_idle_cpu(int cpu)
1194{
1195 struct rq *rq = cpu_rq(cpu);
1196
1197 if (cpu == smp_processor_id())
1198 return;
1199
1200
1201
1202
1203
1204
1205
1206
1207 if (rq->curr != rq->idle)
1208 return;
1209
1210
1211
1212
1213
1214
1215 set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
1216
1217
1218 smp_mb();
1219 if (!tsk_is_polling(rq->idle))
1220 smp_send_reschedule(cpu);
1221}
1222#endif
1223
1224#else
1225static void resched_task(struct task_struct *p)
1226{
1227 assert_spin_locked(&task_rq(p)->lock);
1228 set_tsk_need_resched(p);
1229}
1230#endif
1231
/*
 * Fixed-point constants used by calc_delta_mine().  WMULT_CONST is 2^32
 * where an unsigned long can hold it and all-ones on 32-bit builds.
 */
#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

/* Shift right and round: adds half of 2^y to x before shifting by y. */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1244
1245
1246
1247
1248static unsigned long
1249calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1250 struct load_weight *lw)
1251{
1252 u64 tmp;
1253
1254 if (!lw->inv_weight) {
1255 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1256 lw->inv_weight = 1;
1257 else
1258 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1259 / (lw->weight+1);
1260 }
1261
1262 tmp = (u64)delta_exec * weight;
1263
1264
1265
1266 if (unlikely(tmp > WMULT_CONST))
1267 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1268 WMULT_SHIFT/2);
1269 else
1270 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1271
1272 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1273}
1274
1275static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1276{
1277 lw->weight += inc;
1278 lw->inv_weight = 0;
1279}
1280
1281static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1282{
1283 lw->weight -= dec;
1284 lw->inv_weight = 0;
1285}
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
/*
 * Load weight and matching inverse weight used for SCHED_IDLE tasks (see
 * set_load_weight()).
 *
 * The inverse is 2^31.  It is written with an unsigned long literal:
 * "1 << 31" overflows a 32-bit signed int, which is undefined behaviour
 * and in practice yields a negative value that sign-extends when stored
 * in a 64-bit unsigned long.
 */
#define WEIGHT_IDLEPRIO		2
#define WMULT_IDLEPRIO		(1UL << 31)
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
/*
 * Load weight for each of the 40 nice levels, indexed by
 * static_prio - MAX_RT_PRIO (see set_load_weight()).  Index 20, i.e.
 * nice 0, has weight 1024.
 */
static const int prio_to_weight[40] = {
	/* index  0.. 4 */ 88761, 71755, 56483, 46273, 36291,
	/* index  5.. 9 */ 29154, 23254, 18705, 14949, 11916,
	/* index 10..14 */  9548,  7620,  6100,  4904,  3906,
	/* index 15..19 */  3121,  2501,  1991,  1586,  1277,
	/* index 20..24 */  1024,   820,   655,   526,   423,
	/* index 25..29 */   335,   272,   215,   172,   137,
	/* index 30..34 */   110,    87,    70,    56,    45,
	/* index 35..39 */    36,    29,    23,    18,    15,
};
1321
1322
1323
1324
1325
1326
1327
1328
1329static const u32 prio_to_wmult[40] = {
1330 48388, 59856, 76040, 92818, 118348,
1331 147320, 184698, 229616, 287308, 360437,
1332 449829, 563644, 704093, 875809, 1099582,
1333 1376151, 1717300, 2157191, 2708050, 3363326,
1334 4194304, 5237765, 6557202, 8165337, 10153587,
1335 12820798, 15790321, 19976592, 24970740, 31350126,
1336 39045157, 49367440, 61356676, 76695844, 95443717,
1337 119304647, 148102320, 186737708, 238609294, 286331153,
1338};
1339
1340static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1341
1342
1343
1344
1345
1346
/*
 * Callback-based iterator over tasks: start() and next() each return a
 * task and both receive the opaque @arg cookie.
 */
struct rq_iterator {
	void *arg;
	struct task_struct *(*start)(void *);
	struct task_struct *(*next)(void *);
};
1352
1353#ifdef CONFIG_SMP
1354static unsigned long
1355balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1356 unsigned long max_load_move, struct sched_domain *sd,
1357 enum cpu_idle_type idle, int *all_pinned,
1358 int *this_best_prio, struct rq_iterator *iterator);
1359
1360static int
1361iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1362 struct sched_domain *sd, enum cpu_idle_type idle,
1363 struct rq_iterator *iterator);
1364#endif
1365
1366#ifdef CONFIG_CGROUP_CPUACCT
1367static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1368#else
1369static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1370#endif
1371
1372static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1373{
1374 update_load_add(&rq->load, load);
1375}
1376
1377static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1378{
1379 update_load_sub(&rq->load, load);
1380}
1381
1382#ifdef CONFIG_SMP
1383static unsigned long source_load(int cpu, int type);
1384static unsigned long target_load(int cpu, int type);
1385static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1386
1387static unsigned long cpu_avg_load_per_task(int cpu)
1388{
1389 struct rq *rq = cpu_rq(cpu);
1390
1391 if (rq->nr_running)
1392 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1393
1394 return rq->avg_load_per_task;
1395}
1396
1397#ifdef CONFIG_FAIR_GROUP_SCHED
1398
1399typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
1400
1401
1402
1403
1404
1405static void
1406walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1407{
1408 struct task_group *parent, *child;
1409
1410 rcu_read_lock();
1411 parent = &root_task_group;
1412down:
1413 (*down)(parent, cpu, sd);
1414 list_for_each_entry_rcu(child, &parent->children, siblings) {
1415 parent = child;
1416 goto down;
1417
1418up:
1419 continue;
1420 }
1421 (*up)(parent, cpu, sd);
1422
1423 child = parent;
1424 parent = parent->parent;
1425 if (parent)
1426 goto up;
1427 rcu_read_unlock();
1428}
1429
1430static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1431
1432
1433
1434
1435static void
1436__update_group_shares_cpu(struct task_group *tg, int cpu,
1437 unsigned long sd_shares, unsigned long sd_rq_weight)
1438{
1439 int boost = 0;
1440 unsigned long shares;
1441 unsigned long rq_weight;
1442
1443 if (!tg->se[cpu])
1444 return;
1445
1446 rq_weight = tg->cfs_rq[cpu]->load.weight;
1447
1448
1449
1450
1451
1452
1453 if (!rq_weight) {
1454 boost = 1;
1455 rq_weight = NICE_0_LOAD;
1456 }
1457
1458 if (unlikely(rq_weight > sd_rq_weight))
1459 rq_weight = sd_rq_weight;
1460
1461
1462
1463
1464
1465
1466
1467 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
1468
1469
1470
1471
1472 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1473 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1474
1475 if (shares < MIN_SHARES)
1476 shares = MIN_SHARES;
1477 else if (shares > MAX_SHARES)
1478 shares = MAX_SHARES;
1479
1480 __set_se_shares(tg->se[cpu], shares);
1481}
1482
1483
1484
1485
1486
1487
1488static void
1489tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1490{
1491 unsigned long rq_weight = 0;
1492 unsigned long shares = 0;
1493 int i;
1494
1495 for_each_cpu_mask(i, sd->span) {
1496 rq_weight += tg->cfs_rq[i]->load.weight;
1497 shares += tg->cfs_rq[i]->shares;
1498 }
1499
1500 if ((!shares && rq_weight) || shares > tg->shares)
1501 shares = tg->shares;
1502
1503 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1504 shares = tg->shares;
1505
1506 if (!rq_weight)
1507 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1508
1509 for_each_cpu_mask(i, sd->span) {
1510 struct rq *rq = cpu_rq(i);
1511 unsigned long flags;
1512
1513 spin_lock_irqsave(&rq->lock, flags);
1514 __update_group_shares_cpu(tg, i, shares, rq_weight);
1515 spin_unlock_irqrestore(&rq->lock, flags);
1516 }
1517}
1518
1519
1520
1521
1522
1523
1524static void
1525tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1526{
1527 unsigned long load;
1528
1529 if (!tg->parent) {
1530 load = cpu_rq(cpu)->load.weight;
1531 } else {
1532 load = tg->parent->cfs_rq[cpu]->h_load;
1533 load *= tg->cfs_rq[cpu]->shares;
1534 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1535 }
1536
1537 tg->cfs_rq[cpu]->h_load = load;
1538}
1539
/* walk_tg_tree() visitor that does nothing; used for the unused direction. */
static void
tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
{
}
1544
1545static void update_shares(struct sched_domain *sd)
1546{
1547 u64 now = cpu_clock(raw_smp_processor_id());
1548 s64 elapsed = now - sd->last_update;
1549
1550 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1551 sd->last_update = now;
1552 walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
1553 }
1554}
1555
1556static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1557{
1558 spin_unlock(&rq->lock);
1559 update_shares(sd);
1560 spin_lock(&rq->lock);
1561}
1562
1563static void update_h_load(int cpu)
1564{
1565 walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
1566}
1567
1568#else
1569
/* Fair group scheduling disabled: share updates are no-ops. */

static inline void update_shares(struct sched_domain *sd)
{
}

static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
{
}
1577
1578#endif
1579
1580#endif
1581
1582#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Store @shares in @cfs_rq.  The field only exists on SMP builds, so this
 * does nothing otherwise.
 */
static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
{
#ifdef CONFIG_SMP
	cfs_rq->shares = shares;
#endif
}
1589#endif
1590
1591#include "sched_stats.h"
1592#include "sched_idletask.c"
1593#include "sched_fair.c"
1594#include "sched_rt.c"
1595#ifdef CONFIG_SCHED_DEBUG
1596# include "sched_debug.c"
1597#endif
1598
/*
 * Scheduling classes form a chain through ->next, headed by the real-time
 * class; for_each_class() walks that chain from the head to its end.
 */
#define sched_class_highest (&rt_sched_class)
#define for_each_class(class) \
	for (class = sched_class_highest; class; class = class->next)
1602
1603static void inc_nr_running(struct rq *rq)
1604{
1605 rq->nr_running++;
1606}
1607
1608static void dec_nr_running(struct rq *rq)
1609{
1610 rq->nr_running--;
1611}
1612
1613static void set_load_weight(struct task_struct *p)
1614{
1615 if (task_has_rt_policy(p)) {
1616 p->se.load.weight = prio_to_weight[0] * 2;
1617 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1618 return;
1619 }
1620
1621
1622
1623
1624 if (p->policy == SCHED_IDLE) {
1625 p->se.load.weight = WEIGHT_IDLEPRIO;
1626 p->se.load.inv_weight = WMULT_IDLEPRIO;
1627 return;
1628 }
1629
1630 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1631 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1632}
1633
1634static void update_avg(u64 *avg, u64 sample)
1635{
1636 s64 diff = sample - *avg;
1637 *avg += diff >> 3;
1638}
1639
/*
 * Queue @p on @rq through its scheduling class and mark its entity as
 * queued (se.on_rq = 1).  @wakeup is handed to the class hook unchanged.
 */
static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
{
	sched_info_queued(p);
	p->sched_class->enqueue_task(rq, p, wakeup);
	p->se.on_rq = 1;
}
1646
/*
 * Remove @p from @rq through its scheduling class and clear se.on_rq.
 * @sleep is handed to the class hook unchanged.
 */
static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
{
	/*
	 * Going to sleep with a last_wakeup stamp recorded: fold the
	 * runtime accumulated since that stamp into avg_overlap, then
	 * clear the stamp so it is only consumed once.
	 */
	if (sleep && p->se.last_wakeup) {
		update_avg(&p->se.avg_overlap,
			   p->se.sum_exec_runtime - p->se.last_wakeup);
		p->se.last_wakeup = 0;
	}

	sched_info_dequeued(p);
	p->sched_class->dequeue_task(rq, p, sleep);
	p->se.on_rq = 0;
}
1659
1660
1661
1662
/* Normal priority of a non-RT task: simply its static priority. */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}
1667
1668
1669
1670
1671
1672
1673
1674
1675static inline int normal_prio(struct task_struct *p)
1676{
1677 int prio;
1678
1679 if (task_has_rt_policy(p))
1680 prio = MAX_RT_PRIO-1 - p->rt_priority;
1681 else
1682 prio = __normal_prio(p);
1683 return prio;
1684}
1685
1686
1687
1688
1689
1690
1691
1692
1693static int effective_prio(struct task_struct *p)
1694{
1695 p->normal_prio = normal_prio(p);
1696
1697
1698
1699
1700
1701 if (!rt_prio(p->prio))
1702 return p->normal_prio;
1703 return p->prio;
1704}
1705
1706
1707
1708
/*
 * Make @p runnable on @rq: remove it from the nr_uninterruptible count
 * if it was contributing to load, enqueue it and bump rq->nr_running.
 */
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, wakeup);
	inc_nr_running(rq);
}
1717
1718
1719
1720
/*
 * Take @p off @rq: add it to the nr_uninterruptible count if it
 * contributes to load, dequeue it and drop rq->nr_running.
 */
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, sleep);
	dec_nr_running(rq);
}
1729
1730
1731
1732
1733
/*
 * Return non-zero if @p is the task currently running on the CPU it is
 * assigned to.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}
1738
/*
 * Bind @p to @cpu: update its runqueue association via set_task_rq()
 * and, on SMP, record the CPU number in the task's thread_info.
 */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_rq(p, cpu);
#ifdef CONFIG_SMP
	/*
	 * Write barrier between the set_task_rq() stores and the ->cpu
	 * store, so the new CPU number does not become visible before
	 * them.
	 */
	smp_wmb();
	task_thread_info(p)->cpu = cpu;
#endif
}
1752
1753static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1754 const struct sched_class *prev_class,
1755 int oldprio, int running)
1756{
1757 if (prev_class != p->sched_class) {
1758 if (prev_class->switched_from)
1759 prev_class->switched_from(rq, p, running);
1760 p->sched_class->switched_to(rq, p, running);
1761 } else
1762 p->sched_class->prio_changed(rq, p, oldprio, running);
1763}
1764
1765#ifdef CONFIG_SMP
1766
1767
/* Return the current load weight of @cpu's runqueue. */
static unsigned long weighted_cpuload(const int cpu)
{
	return cpu_rq(cpu)->load.weight;
}
1772
1773
1774
1775
1776static int
1777task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1778{
1779 s64 delta;
1780
1781
1782
1783
1784 if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
1785 return 1;
1786
1787 if (p->sched_class != &fair_sched_class)
1788 return 0;
1789
1790 if (sysctl_sched_migration_cost == -1)
1791 return 1;
1792 if (sysctl_sched_migration_cost == 0)
1793 return 0;
1794
1795 delta = now - p->se.exec_start;
1796
1797 return delta < (s64)sysctl_sched_migration_cost;
1798}
1799
1800
/*
 * Move @p's scheduler bookkeeping from its current CPU to @new_cpu.
 *
 * Only per-task state is rebased here (schedstat timestamps, vruntime)
 * before __set_task_cpu() records the new CPU; this function itself
 * neither queues nor dequeues the task.
 */
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
	int old_cpu = task_cpu(p);
	struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
	struct cfs_rq *old_cfsrq = task_cfs_rq(p),
		      *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
	u64 clock_offset;

	/* How far the old runqueue's clock is ahead of the new one's. */
	clock_offset = old_rq->clock - new_rq->clock;

#ifdef CONFIG_SCHEDSTATS
	/* Rebase any pending timestamps onto the new runqueue's clock. */
	if (p->se.wait_start)
		p->se.wait_start -= clock_offset;
	if (p->se.sleep_start)
		p->se.sleep_start -= clock_offset;
	if (p->se.block_start)
		p->se.block_start -= clock_offset;
	if (old_cpu != new_cpu) {
		schedstat_inc(p, se.nr_migrations);
		/* Separately count migrations of tasks task_hot() calls hot. */
		if (task_hot(p, old_rq->clock, NULL))
			schedstat_inc(p, se.nr_forced2_migrations);
	}
#endif
	/*
	 * Shift vruntime by the difference between the two cfs_rqs'
	 * min_vruntime, i.e. keep it relative to the queue it belongs to.
	 */
	p->se.vruntime -= old_cfsrq->min_vruntime -
					 new_cfsrq->min_vruntime;

	__set_task_cpu(p, new_cpu);
}
1829
/* One queued request to move a task to another CPU. */
struct migration_req {
	/* Link in a runqueue's migration_queue (see migrate_task()). */
	struct list_head list;

	/* Task to move and the CPU it should end up on. */
	struct task_struct *task;
	int dest_cpu;

	/* The requester waits on this (see sched_migrate_task()). */
	struct completion done;
};
1838
1839
1840
1841
1842
/*
 * Try to move @p to @dest_cpu.
 *
 * Returns 0 if the task could simply be retargeted here, or 1 if @req
 * was filled in and queued on the task's current runqueue, in which case
 * the caller has to get the request processed and wait for req->done
 * (sched_migrate_task() shows the expected usage, including holding the
 * task's runqueue lock around this call).
 */
static int
migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
{
	struct rq *rq = task_rq(p);

	/* Neither queued nor running: changing its CPU is all it takes. */
	if (!p->se.on_rq && !task_running(rq, p)) {
		set_task_cpu(p, dest_cpu);
		return 0;
	}

	init_completion(&req->done);
	req->task = p;
	req->dest_cpu = dest_cpu;
	list_add(&req->list, &rq->migration_queue);

	return 1;
}
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
/*
 * Wait until @p is neither running on a CPU nor queued on a runqueue.
 *
 * If @match_state is non-zero, the wait is abandoned and 0 is returned
 * as soon as p->state is seen to differ from it.  On success the return
 * value is p->nivcsw + p->nvcsw as sampled under the runqueue lock,
 * replaced by 1 if that sum is 0, so success is always non-zero.
 */
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
	unsigned long flags;
	int running, on_rq;
	unsigned long ncsw;
	struct rq *rq;

	for (;;) {
		/*
		 * Lockless look at the runqueue; everything that matters
		 * is checked again under the lock further down.
		 */
		rq = task_rq(p);

		/*
		 * Spin without the lock for as long as the task is on a
		 * CPU, bailing out if its state stops matching.
		 */
		while (task_running(rq, p)) {
			if (match_state && unlikely(p->state != match_state))
				return 0;
			cpu_relax();
		}

		/*
		 * Take the runqueue lock and sample the running/queued
		 * state together with the context-switch count.
		 */
		rq = task_rq_lock(p, &flags);
		running = task_running(rq, p);
		on_rq = p->se.on_rq;
		ncsw = 0;
		if (!match_state || p->state == match_state) {
			ncsw = p->nivcsw + p->nvcsw;
			/* Keep 0 reserved for the "state mismatch" result. */
			if (unlikely(!ncsw))
				ncsw = 1;
		}
		task_rq_unlock(rq, &flags);

		/* State mismatch: give up, the function returns 0. */
		if (unlikely(!ncsw))
			break;

		/* It was on a CPU again when sampled under the lock: retry. */
		if (unlikely(running)) {
			cpu_relax();
			continue;
		}

		/*
		 * Off the CPU but still queued: sleep with a timeout of 1
		 * instead of spinning, then retry.
		 */
		if (unlikely(on_rq)) {
			schedule_timeout_uninterruptible(1);
			continue;
		}

		/* Neither running nor queued: done. */
		break;
	}

	return ncsw;
}
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
/*
 * If @p is currently running on a CPU other than the calling one, send
 * that CPU a reschedule request.  Nothing is done when @p is not
 * running or is running on the local CPU.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	/* Preemption is off so smp_processor_id() stays valid for the test. */
	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}
1995
1996
1997
1998
1999
2000
2001
2002
2003static unsigned long source_load(int cpu, int type)
2004{
2005 struct rq *rq = cpu_rq(cpu);
2006 unsigned long total = weighted_cpuload(cpu);
2007
2008 if (type == 0 || !sched_feat(LB_BIAS))
2009 return total;
2010
2011 return min(rq->cpu_load[type-1], total);
2012}
2013
2014
2015
2016
2017
2018static unsigned long target_load(int cpu, int type)
2019{
2020 struct rq *rq = cpu_rq(cpu);
2021 unsigned long total = weighted_cpuload(cpu);
2022
2023 if (type == 0 || !sched_feat(LB_BIAS))
2024 return total;
2025
2026 return max(rq->cpu_load[type-1], total);
2027}
2028
2029
2030
2031
2032
2033static struct sched_group *
2034find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2035{
2036 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2037 unsigned long min_load = ULONG_MAX, this_load = 0;
2038 int load_idx = sd->forkexec_idx;
2039 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2040
2041 do {
2042 unsigned long load, avg_load;
2043 int local_group;
2044 int i;
2045
2046
2047 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
2048 continue;
2049
2050 local_group = cpu_isset(this_cpu, group->cpumask);
2051
2052
2053 avg_load = 0;
2054
2055 for_each_cpu_mask_nr(i, group->cpumask) {
2056
2057 if (local_group)
2058 load = source_load(i, load_idx);
2059 else
2060 load = target_load(i, load_idx);
2061
2062 avg_load += load;
2063 }
2064
2065
2066 avg_load = sg_div_cpu_power(group,
2067 avg_load * SCHED_LOAD_SCALE);
2068
2069 if (local_group) {
2070 this_load = avg_load;
2071 this = group;
2072 } else if (avg_load < min_load) {
2073 min_load = avg_load;
2074 idlest = group;
2075 }
2076 } while (group = group->next, group != sd->groups);
2077
2078 if (!idlest || 100*this_load < imbalance*min_load)
2079 return NULL;
2080 return idlest;
2081}
2082
2083
2084
2085
2086static int
2087find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
2088 cpumask_t *tmp)
2089{
2090 unsigned long load, min_load = ULONG_MAX;
2091 int idlest = -1;
2092 int i;
2093
2094
2095 cpus_and(*tmp, group->cpumask, p->cpus_allowed);
2096
2097 for_each_cpu_mask_nr(i, *tmp) {
2098 load = weighted_cpuload(i);
2099
2100 if (load < min_load || (load == min_load && i == this_cpu)) {
2101 min_load = load;
2102 idlest = i;
2103 }
2104 }
2105
2106 return idlest;
2107}
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
/*
 * Pick a CPU for the current task, starting from @cpu, by working
 * through the scheduling domains that have @flag set and, in each of
 * them, moving to the idlest CPU of the idlest group.
 *
 * Returns the chosen CPU, which is @cpu itself if nothing better was
 * found.
 */
static int sched_balance_self(int cpu, int flag)
{
	struct task_struct *t = current;
	struct sched_domain *tmp, *sd = NULL;

	/*
	 * Remember the last domain of @cpu that has @flag set, but stop
	 * at the first domain marked SD_POWERSAVINGS_BALANCE.
	 */
	for_each_domain(cpu, tmp) {
		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
			break;
		if (tmp->flags & flag)
			sd = tmp;
	}

	if (sd)
		update_shares(sd);

	while (sd) {
		cpumask_t span, tmpmask;
		struct sched_group *group;
		int new_cpu, weight;

		/* Domains without @flag are skipped: go to the child. */
		if (!(sd->flags & flag)) {
			sd = sd->child;
			continue;
		}

		span = sd->span;
		group = find_idlest_group(sd, t, cpu);
		if (!group) {
			sd = sd->child;
			continue;
		}

		new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
		if (new_cpu == -1 || new_cpu == cpu) {
			/* Nothing better at this level: go to the child. */
			sd = sd->child;
			continue;
		}

		/*
		 * Continue from the new CPU: pick the last of its domains
		 * with @flag set that spans fewer CPUs than the domain
		 * just handled.
		 */
		cpu = new_cpu;
		sd = NULL;
		weight = cpus_weight(span);
		for_each_domain(cpu, tmp) {
			if (weight <= cpus_weight(tmp->span))
				break;
			if (tmp->flags & flag)
				sd = tmp;
		}
		/* sd == NULL here ends the loop. */
	}

	return cpu;
}
2177
2178#endif
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
/*
 * Wake up @p if its current state is included in the @state mask.
 *
 * @sync is forwarded to the scheduling-class hooks and is forced to 0
 * when the SYNC_WAKEUPS feature is off.
 *
 * Returns 1 if this call put the task on a runqueue, 0 otherwise (its
 * state did not match, or it was already queued).
 */
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
{
	int cpu, orig_cpu, this_cpu, success = 0;
	unsigned long flags;
	long old_state;
	struct rq *rq;

	if (!sched_feat(SYNC_WAKEUPS))
		sync = 0;

#ifdef CONFIG_SMP
	if (sched_feat(LB_WAKEUP_UPDATE)) {
		struct sched_domain *sd;

		this_cpu = raw_smp_processor_id();
		cpu = task_cpu(p);

		/*
		 * Refresh shares in the first domain of this CPU that
		 * also spans the task's CPU.
		 */
		for_each_domain(this_cpu, sd) {
			if (cpu_isset(cpu, sd->span)) {
				update_shares(sd);
				break;
			}
		}
	}
#endif

	/* Write barrier ahead of taking the lock and reading p->state. */
	smp_wmb();
	rq = task_rq_lock(p, &flags);
	old_state = p->state;
	if (!(old_state & state))
		goto out;

	/* Already queued: no activation needed, only the state update. */
	if (p->se.on_rq)
		goto out_running;

	cpu = task_cpu(p);
	orig_cpu = cpu;
	this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
	/* Still on a CPU: activate it where it is, without picking a CPU. */
	if (unlikely(task_running(rq, p)))
		goto out_activate;

	cpu = p->sched_class->select_task_rq(p, sync);
	if (cpu != orig_cpu) {
		set_task_cpu(p, cpu);
		task_rq_unlock(rq, &flags);
		/*
		 * The old runqueue lock was dropped: take the lock of the
		 * task's runqueue again and redo the state checks.
		 */
		rq = task_rq_lock(p, &flags);
		old_state = p->state;
		if (!(old_state & state))
			goto out;
		if (p->se.on_rq)
			goto out_running;

		this_cpu = smp_processor_id();
		cpu = task_cpu(p);
	}

#ifdef CONFIG_SCHEDSTATS
	schedstat_inc(rq, ttwu_count);
	if (cpu == this_cpu)
		schedstat_inc(rq, ttwu_local);
	else {
		struct sched_domain *sd;
		/* Charge the remote wakeup to the first domain spanning @cpu. */
		for_each_domain(this_cpu, sd) {
			if (cpu_isset(cpu, sd->span)) {
				schedstat_inc(sd, ttwu_wake_remote);
				break;
			}
		}
	}
#endif

out_activate:
#endif /* CONFIG_SMP */
	schedstat_inc(p, se.nr_wakeups);
	if (sync)
		schedstat_inc(p, se.nr_wakeups_sync);
	if (orig_cpu != cpu)
		schedstat_inc(p, se.nr_wakeups_migrate);
	if (cpu == this_cpu)
		schedstat_inc(p, se.nr_wakeups_local);
	else
		schedstat_inc(p, se.nr_wakeups_remote);
	update_rq_clock(rq);
	activate_task(rq, p, 1);
	success = 1;

out_running:
	trace_mark(kernel_sched_wakeup,
		"pid %d state %ld ## rq %p task %p rq->curr %p",
		p->pid, p->state, rq, p, rq->curr);
	check_preempt_curr(rq, p, sync);

	p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
	if (p->sched_class->task_wake_up)
		p->sched_class->task_wake_up(rq, p);
#endif
out:
	/*
	 * Stamp the waker (current) with its runtime so far; the stamp
	 * is consumed by dequeue_task() when the waker goes to sleep.
	 */
	current->se.last_wakeup = current->se.sum_exec_runtime;

	task_rq_unlock(rq, &flags);

	return success;
}
2301
/*
 * Wake up @p regardless of which of the TASK_ALL states it is in, as a
 * non-sync wakeup.  Returns try_to_wake_up()'s result: 1 if the task
 * was activated by this call, 0 otherwise.
 */
int wake_up_process(struct task_struct *p)
{
	return try_to_wake_up(p, TASK_ALL, 0);
}
EXPORT_SYMBOL(wake_up_process);
2307
/*
 * Wake up @p only if its state is included in the @state mask, as a
 * non-sync wakeup.  Returns try_to_wake_up()'s result.
 */
int wake_up_state(struct task_struct *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);
}
2312
2313
2314
2315
2316
2317
2318
/*
 * Reset the scheduler-related fields of a new task @p: zero its runtime
 * accounting (and schedstats, when configured), initialise its list
 * heads, mark it as not queued and set its state to TASK_RUNNING.
 */
static void __sched_fork(struct task_struct *p)
{
	p->se.exec_start		= 0;
	p->se.sum_exec_runtime		= 0;
	p->se.prev_sum_exec_runtime	= 0;
	p->se.last_wakeup		= 0;
	p->se.avg_overlap		= 0;

#ifdef CONFIG_SCHEDSTATS
	p->se.wait_start		= 0;
	p->se.sum_sleep_runtime		= 0;
	p->se.sleep_start		= 0;
	p->se.block_start		= 0;
	p->se.sleep_max			= 0;
	p->se.block_max			= 0;
	p->se.exec_max			= 0;
	p->se.slice_max			= 0;
	p->se.wait_max			= 0;
#endif

	INIT_LIST_HEAD(&p->rt.run_list);
	p->se.on_rq = 0;
	INIT_LIST_HEAD(&p->se.group_node);

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif

	/*
	 * The task is marked TASK_RUNNING although it is not on any
	 * runqueue yet (se.on_rq was just cleared above);
	 * wake_up_new_task() insists on this state.
	 */
	p->state = TASK_RUNNING;
}
2355
2356
2357
2358
/*
 * Scheduler setup for a freshly forked task @p: reset its scheduler
 * state, choose and record a CPU for it, and give it its initial
 * priority and class.  @clone_flags is not used here.
 */
void sched_fork(struct task_struct *p, int clone_flags)
{
	/* get_cpu()/put_cpu() bracket the whole setup. */
	int cpu = get_cpu();

	__sched_fork(p);

#ifdef CONFIG_SMP
	cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
#endif
	set_task_cpu(p, cpu);

	/*
	 * The child starts from the parent's normal priority, not from
	 * whatever ->prio the parent currently runs at.
	 */
	p->prio = current->normal_prio;
	if (!rt_prio(p->prio))
		p->sched_class = &fair_sched_class;

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
	if (likely(sched_info_on()))
		memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
	p->oncpu = 0;
#endif
#ifdef CONFIG_PREEMPT
	/* The new task starts out with a preempt count of 1. */
	task_thread_info(p)->preempt_count = 1;
#endif
	put_cpu();
}
2390
2391
2392
2393
2394
2395
2396
2397
/*
 * Put a newly created task @p on its runqueue for the first time.
 * @p must be in TASK_RUNNING state (enforced with BUG_ON).
 * @clone_flags is not used here.
 */
void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
	unsigned long flags;
	struct rq *rq;

	rq = task_rq_lock(p, &flags);
	BUG_ON(p->state != TASK_RUNNING);
	update_rq_clock(rq);

	p->prio = effective_prio(p);

	if (!p->sched_class->task_new || !current->se.on_rq) {
		activate_task(rq, p, 0);
	} else {
		/*
		 * The class has its own hook for new tasks: let it do
		 * the queueing and only account the task here.
		 */
		p->sched_class->task_new(rq, p);
		inc_nr_running(rq);
	}
	trace_mark(kernel_sched_wakeup_new,
		"pid %d state %ld ## rq %p task %p rq->curr %p",
		p->pid, p->state, rq, p, rq->curr);
	check_preempt_curr(rq, p, 0);
#ifdef CONFIG_SMP
	if (p->sched_class->task_wake_up)
		p->sched_class->task_wake_up(rq, p);
#endif
	task_rq_unlock(rq, &flags);
}
2429
2430#ifdef CONFIG_PREEMPT_NOTIFIERS
2431
2432
2433
2434
2435
2436void preempt_notifier_register(struct preempt_notifier *notifier)
2437{
2438 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers);
2439}
2440EXPORT_SYMBOL_GPL(preempt_notifier_register);
2441
2442
2443
2444
2445
2446
2447
2448void preempt_notifier_unregister(struct preempt_notifier *notifier)
2449{
2450 hlist_del(¬ifier->link);
2451}
2452EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2453
/*
 * Call ->sched_in() on every preempt notifier registered on @curr,
 * passing the id of the CPU this code is executing on.
 */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	struct preempt_notifier *notifier;
	struct hlist_node *node;

	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
		notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
2462
/*
 * Call ->sched_out() on every preempt notifier registered on @curr,
 * passing @next, the task being switched to.
 */
static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	struct preempt_notifier *notifier;
	struct hlist_node *node;

	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
		notifier->ops->sched_out(notifier, next);
}
2473
2474#else
2475
/* No-op stub used when CONFIG_PREEMPT_NOTIFIERS is not set. */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}
2479
/* No-op stub used when CONFIG_PREEMPT_NOTIFIERS is not set. */
static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
}
2485
2486#endif
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
/*
 * Preparation done before switching from @prev to @next on @rq: fire
 * @prev's sched-out preempt notifiers, then run the lock-switch and
 * architecture-specific preparation hooks for @next.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	fire_sched_out_preempt_notifiers(prev, next);
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
/*
 * Clean-up that runs in the context of the new task after a switch away
 * from @prev: finish the arch and lock parts of the switch (rq->lock is
 * released, per the __releases annotation), run the class's
 * post_schedule hook, fire sched-in notifiers, and drop the references
 * the switch left behind.
 */
static void finish_task_switch(struct rq *rq, struct task_struct *prev)
	__releases(rq->lock)
{
	/* mm handed over by context_switch(), if any; consumed here. */
	struct mm_struct *mm = rq->prev_mm;
	long prev_state;

	rq->prev_mm = NULL;

	/*
	 * prev->state is sampled before finish_arch_switch() and
	 * finish_lock_switch(); only this snapshot is used below.
	 * NOTE(review): presumably because @prev may change state (or
	 * run elsewhere) once the switch is finished -- confirm.
	 */
	prev_state = prev->state;
	finish_arch_switch(prev);
	finish_lock_switch(rq, prev);
#ifdef CONFIG_SMP
	if (current->sched_class->post_schedule)
		current->sched_class->post_schedule(rq);
#endif

	fire_sched_in_preempt_notifiers(current);
	if (mm)
		mmdrop(mm);
	if (unlikely(prev_state == TASK_DEAD)) {
		/*
		 * @prev was dead when it was switched out: flush its
		 * kprobe state and drop a reference on its task_struct.
		 */
		kprobe_flush_task(prev);
		put_task_struct(prev);
	}
}
2564
2565
2566
2567
2568
/*
 * Finish the switch away from @prev on this CPU's runqueue and then, if
 * the current task has set_child_tid set, store its pid (as returned by
 * task_pid_vnr()) at that user-space address.
 */
asmlinkage void schedule_tail(struct task_struct *prev)
	__releases(rq->lock)
{
	struct rq *rq = this_rq();

	finish_task_switch(rq, prev);
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
	/* Extra preempt_enable() on architectures that switch unlocked. */
	preempt_enable();
#endif
	if (current->set_child_tid)
		put_user(task_pid_vnr(current), current->set_child_tid);
}
2582
2583
2584
2585
2586
/*
 * Switch from @prev to @next on @rq: prepare the switch, switch the
 * address space (or lazily borrow the previous one), switch the
 * register state and stack with switch_to(), then run
 * finish_task_switch().
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);
	trace_mark(kernel_sched_schedule,
		   "prev_pid %d next_pid %d prev_state %ld "
		   "## rq %p prev %p next %p",
		   prev->pid, next->pid, prev->state,
		   rq, prev, next);
	mm = next->mm;
	oldmm = prev->active_mm;

	arch_enter_lazy_cpu_mode();

	if (unlikely(!mm)) {
		/*
		 * @next has no mm of its own: it keeps running on
		 * @prev's active mm and takes a reference on it.
		 */
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm(oldmm, mm, next);

	if (unlikely(!prev->mm)) {
		/*
		 * @prev was itself borrowing oldmm: hand it to
		 * rq->prev_mm so finish_task_switch() can mmdrop() it.
		 */
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}

#ifndef __ARCH_WANT_UNLOCKED_CTXSW
	/* Lock-dependency annotation for rq->lock ahead of the switch. */
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);

	barrier();

	/*
	 * this_rq() is looked up again instead of reusing @rq.
	 * NOTE(review): presumably because the values in this stack
	 * frame may predate a migration of the task -- confirm.
	 */
	finish_task_switch(this_rq(), prev);
}
2640
2641
2642
2643
2644
2645
2646
2647
2648unsigned long nr_running(void)
2649{
2650 unsigned long i, sum = 0;
2651
2652 for_each_online_cpu(i)
2653 sum += cpu_rq(i)->nr_running;
2654
2655 return sum;
2656}
2657
2658unsigned long nr_uninterruptible(void)
2659{
2660 unsigned long i, sum = 0;
2661
2662 for_each_possible_cpu(i)
2663 sum += cpu_rq(i)->nr_uninterruptible;
2664
2665
2666
2667
2668
2669 if (unlikely((long)sum < 0))
2670 sum = 0;
2671
2672 return sum;
2673}
2674
2675unsigned long long nr_context_switches(void)
2676{
2677 int i;
2678 unsigned long long sum = 0;
2679
2680 for_each_possible_cpu(i)
2681 sum += cpu_rq(i)->nr_switches;
2682
2683 return sum;
2684}
2685
2686unsigned long nr_iowait(void)
2687{
2688 unsigned long i, sum = 0;
2689
2690 for_each_possible_cpu(i)
2691 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2692
2693 return sum;
2694}
2695
2696unsigned long nr_active(void)
2697{
2698 unsigned long i, running = 0, uninterruptible = 0;
2699
2700 for_each_online_cpu(i) {
2701 running += cpu_rq(i)->nr_running;
2702 uninterruptible += cpu_rq(i)->nr_uninterruptible;
2703 }
2704
2705 if (unlikely((long)uninterruptible < 0))
2706 uninterruptible = 0;
2707
2708 return running + uninterruptible;
2709}
2710
2711
2712
2713
2714
2715static void update_cpu_load(struct rq *this_rq)
2716{
2717 unsigned long this_load = this_rq->load.weight;
2718 int i, scale;
2719
2720 this_rq->nr_load_updates++;
2721
2722
2723 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2724 unsigned long old_load, new_load;
2725
2726
2727
2728 old_load = this_rq->cpu_load[i];
2729 new_load = this_load;
2730
2731
2732
2733
2734
2735 if (new_load > old_load)
2736 new_load += scale-1;
2737 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2738 }
2739}
2740
2741#ifdef CONFIG_SMP
2742
2743
2744
2745
2746
2747
2748
2749static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2750 __acquires(rq1->lock)
2751 __acquires(rq2->lock)
2752{
2753 BUG_ON(!irqs_disabled());
2754 if (rq1 == rq2) {
2755 spin_lock(&rq1->lock);
2756 __acquire(rq2->lock);
2757 } else {
2758 if (rq1 < rq2) {
2759 spin_lock(&rq1->lock);
2760 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
2761 } else {
2762 spin_lock(&rq2->lock);
2763 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
2764 }
2765 }
2766 update_rq_clock(rq1);
2767 update_rq_clock(rq2);
2768}
2769
2770
2771
2772
2773
2774
2775
2776static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2777 __releases(rq1->lock)
2778 __releases(rq2->lock)
2779{
2780 spin_unlock(&rq1->lock);
2781 if (rq1 != rq2)
2782 spin_unlock(&rq2->lock);
2783 else
2784 __release(rq2->lock);
2785}
2786
2787
2788
2789
/*
 * With this_rq->lock already held, additionally take busiest->lock.
 *
 * Returns 1 if this_rq->lock had to be dropped and re-taken on the way
 * (so state protected by it may have changed), 0 otherwise.
 */
static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(this_rq->lock)
	__acquires(busiest->lock)
	__acquires(this_rq->lock)
{
	int ret = 0;

	if (unlikely(!irqs_disabled())) {
		/* Called with interrupts enabled: unlock, then BUG. */
		spin_unlock(&this_rq->lock);
		BUG_ON(1);
	}
	if (unlikely(!spin_trylock(&busiest->lock))) {
		/*
		 * Contended.  Runqueue locks are taken lower address
		 * first (as in double_rq_lock()), so if busiest sorts
		 * first we must let go of this_rq->lock before waiting.
		 */
		if (busiest < this_rq) {
			spin_unlock(&this_rq->lock);
			spin_lock(&busiest->lock);
			spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
			ret = 1;
		} else
			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
	}
	return ret;
}
2813
/*
 * Release busiest->lock while keeping this_rq->lock held, and set the
 * lock-dependency subclass recorded for this_rq->lock back to 0.
 */
static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(busiest->lock)
{
	spin_unlock(&busiest->lock);
	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
2820
2821
2822
2823
2824
2825
2826
/*
 * Move @p to @dest_cpu, provided that CPU is in p->cpus_allowed and is
 * active; otherwise do nothing.
 *
 * If the task cannot simply be retargeted, a request is queued, the
 * runqueue's migration thread is woken and this function blocks until
 * the request has completed.
 */
static void sched_migrate_task(struct task_struct *p, int dest_cpu)
{
	struct migration_req req;
	unsigned long flags;
	struct rq *rq;

	rq = task_rq_lock(p, &flags);
	if (!cpu_isset(dest_cpu, p->cpus_allowed)
	    || unlikely(!cpu_active(dest_cpu)))
		goto out;

	/* migrate_task() returns 1 when @req has been queued. */
	if (migrate_task(p, dest_cpu, &req)) {
		/*
		 * Hold a reference on the migration thread while it is
		 * woken outside the runqueue lock.
		 */
		struct task_struct *mt = rq->migration_thread;

		get_task_struct(mt);
		task_rq_unlock(rq, &flags);
		wake_up_process(mt);
		put_task_struct(mt);
		/* @req lives on this stack: wait until it is done. */
		wait_for_completion(&req.done);

		return;
	}
out:
	task_rq_unlock(rq, &flags);
}
2854
2855
2856
2857
2858
2859void sched_exec(void)
2860{
2861 int new_cpu, this_cpu = get_cpu();
2862 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2863 put_cpu();
2864 if (new_cpu != this_cpu)
2865 sched_migrate_task(current, new_cpu);
2866}
2867
2868
2869
2870
2871
/*
 * Move @p from @src_rq to @this_rq (the runqueue of @this_cpu): dequeue
 * it, retarget it to @this_cpu and enqueue it again.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
		      struct rq *this_rq, int this_cpu)
{
	deactivate_task(src_rq, p, 0);
	set_task_cpu(p, this_cpu);
	activate_task(this_rq, p, 0);
	/* Let the pulled task preempt this_rq's current task if it should. */
	check_preempt_curr(this_rq, p, 0);
}
2884
2885
2886
2887
2888static
2889int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2890 struct sched_domain *sd, enum cpu_idle_type idle,
2891 int *all_pinned)
2892{
2893
2894
2895
2896
2897
2898
2899 if (!cpu_isset(this_cpu, p->cpus_allowed)) {
2900 schedstat_inc(p, se.nr_failed_migrations_affine);
2901 return 0;
2902 }
2903 *all_pinned = 0;
2904
2905 if (task_running(rq, p)) {
2906 schedstat_inc(p, se.nr_failed_migrations_running);
2907 return 0;
2908 }
2909
2910
2911
2912
2913
2914
2915
2916 if (!task_hot(p, rq->clock, sd) ||
2917 sd->nr_balance_failed > sd->cache_nice_tries) {
2918#ifdef CONFIG_SCHEDSTATS
2919 if (task_hot(p, rq->clock, sd)) {
2920 schedstat_inc(sd, lb_hot_gained[idle]);
2921 schedstat_inc(p, se.nr_forced_migrations);
2922 }
2923#endif
2924 return 1;
2925 }
2926
2927 if (task_hot(p, rq->clock, sd)) {
2928 schedstat_inc(p, se.nr_failed_migrations_hot);
2929 return 0;
2930 }
2931 return 1;
2932}
2933
2934static unsigned long
2935balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2936 unsigned long max_load_move, struct sched_domain *sd,
2937 enum cpu_idle_type idle, int *all_pinned,
2938 int *this_best_prio, struct rq_iterator *iterator)
2939{
2940 int loops = 0, pulled = 0, pinned = 0;
2941 struct task_struct *p;
2942 long rem_load_move = max_load_move;
2943
2944 if (max_load_move == 0)
2945 goto out;
2946
2947 pinned = 1;
2948
2949
2950
2951
2952 p = iterator->start(iterator->arg);
2953next:
2954 if (!p || loops++ > sysctl_sched_nr_migrate)
2955 goto out;
2956
2957 if ((p->se.load.weight >> 1) > rem_load_move ||
2958 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2959 p = iterator->next(iterator->arg);
2960 goto next;
2961 }
2962
2963 pull_task(busiest, p, this_rq, this_cpu);
2964 pulled++;
2965 rem_load_move -= p->se.load.weight;
2966
2967
2968
2969
2970 if (rem_load_move > 0) {
2971 if (p->prio < *this_best_prio)
2972 *this_best_prio = p->prio;
2973 p = iterator->next(iterator->arg);
2974 goto next;
2975 }
2976out:
2977
2978
2979
2980
2981
2982 schedstat_add(sd, lb_gained[idle], pulled);
2983
2984 if (all_pinned)
2985 *all_pinned = pinned;
2986
2987 return max_load_move - rem_load_move;
2988}
2989
2990
2991
2992
2993
2994
2995
2996
2997static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2998 unsigned long max_load_move,
2999 struct sched_domain *sd, enum cpu_idle_type idle,
3000 int *all_pinned)
3001{
3002 const struct sched_class *class = sched_class_highest;
3003 unsigned long total_load_moved = 0;
3004 int this_best_prio = this_rq->curr->prio;
3005
3006 do {
3007 total_load_moved +=
3008 class->load_balance(this_rq, this_cpu, busiest,
3009 max_load_move - total_load_moved,
3010 sd, idle, all_pinned, &this_best_prio);
3011 class = class->next;
3012
3013 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3014 break;
3015
3016 } while (class && max_load_move > total_load_moved);
3017
3018 return total_load_moved > 0;
3019}
3020
3021static int
3022iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3023 struct sched_domain *sd, enum cpu_idle_type idle,
3024 struct rq_iterator *iterator)
3025{
3026 struct task_struct *p = iterator->start(iterator->arg);
3027 int pinned = 0;
3028
3029 while (p) {
3030 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3031 pull_task(busiest, p, this_rq, this_cpu);
3032
3033
3034
3035
3036
3037 schedstat_inc(sd, lb_gained[idle]);
3038
3039 return 1;
3040 }
3041 p = iterator->next(iterator->arg);
3042 }
3043
3044 return 0;
3045}
3046
3047
3048
3049
3050
3051
3052
3053
3054static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3055 struct sched_domain *sd, enum cpu_idle_type idle)
3056{
3057 const struct sched_class *class;
3058
3059 for (class = sched_class_highest; class; class = class->next)
3060 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3061 return 1;
3062
3063 return 0;
3064}
3065
3066
3067
3068
3069
3070
3071static struct sched_group *
3072find_busiest_group(struct sched_domain *sd, int this_cpu,
3073 unsigned long *imbalance, enum cpu_idle_type idle,
3074 int *sd_idle, const cpumask_t *cpus, int *balance)
3075{
3076 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
3077 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
3078 unsigned long max_pull;
3079 unsigned long busiest_load_per_task, busiest_nr_running;
3080 unsigned long this_load_per_task, this_nr_running;
3081 int load_idx, group_imb = 0;
3082#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3083 int power_savings_balance = 1;
3084 unsigned long leader_nr_running = 0, min_load_per_task = 0;
3085 unsigned long min_nr_running = ULONG_MAX;
3086 struct sched_group *group_min = NULL, *group_leader = NULL;
3087#endif
3088
3089 max_load = this_load = total_load = total_pwr = 0;
3090 busiest_load_per_task = busiest_nr_running = 0;
3091 this_load_per_task = this_nr_running = 0;
3092
3093 if (idle == CPU_NOT_IDLE)
3094 load_idx = sd->busy_idx;
3095 else if (idle == CPU_NEWLY_IDLE)
3096 load_idx = sd->newidle_idx;
3097 else
3098 load_idx = sd->idle_idx;
3099
3100 do {
3101 unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
3102 int local_group;
3103 int i;
3104 int __group_imb = 0;
3105 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3106 unsigned long sum_nr_running, sum_weighted_load;
3107 unsigned long sum_avg_load_per_task;
3108 unsigned long avg_load_per_task;
3109
3110 local_group = cpu_isset(this_cpu, group->cpumask);
3111
3112 if (local_group)
3113 balance_cpu = first_cpu(group->cpumask);
3114
3115
3116 sum_weighted_load = sum_nr_running = avg_load = 0;
3117 sum_avg_load_per_task = avg_load_per_task = 0;
3118
3119 max_cpu_load = 0;
3120 min_cpu_load = ~0UL;
3121
3122 for_each_cpu_mask_nr(i, group->cpumask) {
3123 struct rq *rq;
3124
3125 if (!cpu_isset(i, *cpus))
3126 continue;
3127
3128 rq = cpu_rq(i);
3129
3130 if (*sd_idle && rq->nr_running)
3131 *sd_idle = 0;
3132
3133
3134 if (local_group) {
3135 if (idle_cpu(i) && !first_idle_cpu) {
3136 first_idle_cpu = 1;
3137 balance_cpu = i;
3138 }
3139
3140 load = target_load(i, load_idx);
3141 } else {
3142 load = source_load(i, load_idx);
3143 if (load > max_cpu_load)
3144 max_cpu_load = load;
3145 if (min_cpu_load > load)
3146 min_cpu_load = load;
3147 }
3148
3149 avg_load += load;
3150 sum_nr_running += rq->nr_running;
3151 sum_weighted_load += weighted_cpuload(i);
3152
3153 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3154 }
3155
3156
3157
3158
3159
3160
3161
3162 if (idle != CPU_NEWLY_IDLE && local_group &&
3163 balance_cpu != this_cpu && balance) {
3164 *balance = 0;
3165 goto ret;
3166 }
3167
3168 total_load += avg_load;
3169 total_pwr += group->__cpu_power;
3170
3171
3172 avg_load = sg_div_cpu_power(group,
3173 avg_load * SCHED_LOAD_SCALE);
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185 avg_load_per_task = sg_div_cpu_power(group,
3186 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3187
3188 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3189 __group_imb = 1;
3190
3191 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3192
3193 if (local_group) {
3194 this_load = avg_load;
3195 this = group;
3196 this_nr_running = sum_nr_running;
3197 this_load_per_task = sum_weighted_load;
3198 } else if (avg_load > max_load &&
3199 (sum_nr_running > group_capacity || __group_imb)) {
3200 max_load = avg_load;
3201 busiest = group;
3202 busiest_nr_running = sum_nr_running;
3203 busiest_load_per_task = sum_weighted_load;
3204 group_imb = __group_imb;
3205 }
3206
3207#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3208
3209
3210
3211
3212 if (idle == CPU_NOT_IDLE ||
3213 !(sd->flags & SD_POWERSAVINGS_BALANCE))
3214 goto group_next;
3215
3216
3217
3218
3219
3220 if (local_group && (this_nr_running >= group_capacity ||
3221 !this_nr_running))
3222 power_savings_balance = 0;
3223
3224
3225
3226
3227
3228 if (!power_savings_balance || sum_nr_running >= group_capacity
3229 || !sum_nr_running)
3230 goto group_next;
3231
3232
3233
3234
3235
3236
3237 if ((sum_nr_running < min_nr_running) ||
3238 (sum_nr_running == min_nr_running &&
3239 first_cpu(group->cpumask) <
3240 first_cpu(group_min->cpumask))) {
3241 group_min = group;
3242 min_nr_running = sum_nr_running;
3243 min_load_per_task = sum_weighted_load /
3244 sum_nr_running;
3245 }
3246
3247
3248
3249
3250
3251
3252 if (sum_nr_running <= group_capacity - 1) {
3253 if (sum_nr_running > leader_nr_running ||
3254 (sum_nr_running == leader_nr_running &&
3255 first_cpu(group->cpumask) >
3256 first_cpu(group_leader->cpumask))) {
3257 group_leader = group;
3258 leader_nr_running = sum_nr_running;
3259 }
3260 }
3261group_next:
3262#endif
3263 group = group->next;
3264 } while (group != sd->groups);
3265
3266 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
3267 goto out_balanced;
3268
3269 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
3270
3271 if (this_load >= avg_load ||
3272 100*max_load <= sd->imbalance_pct*this_load)
3273 goto out_balanced;
3274
3275 busiest_load_per_task /= busiest_nr_running;
3276 if (group_imb)
3277 busiest_load_per_task = min(busiest_load_per_task, avg_load);
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290 if (max_load <= busiest_load_per_task)
3291 goto out_balanced;
3292
3293
3294
3295
3296
3297
3298 if (max_load < avg_load) {
3299 *imbalance = 0;
3300 goto small_imbalance;
3301 }
3302
3303
3304 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
3305
3306
3307 *imbalance = min(max_pull * busiest->__cpu_power,
3308 (avg_load - this_load) * this->__cpu_power)
3309 / SCHED_LOAD_SCALE;
3310
3311
3312
3313
3314
3315
3316
3317 if (*imbalance < busiest_load_per_task) {
3318 unsigned long tmp, pwr_now, pwr_move;
3319 unsigned int imbn;
3320
3321small_imbalance:
3322 pwr_move = pwr_now = 0;
3323 imbn = 2;
3324 if (this_nr_running) {
3325 this_load_per_task /= this_nr_running;
3326 if (busiest_load_per_task > this_load_per_task)
3327 imbn = 1;
3328 } else
3329 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3330
3331 if (max_load - this_load + 2*busiest_load_per_task >=
3332 busiest_load_per_task * imbn) {
3333 *imbalance = busiest_load_per_task;
3334 return busiest;
3335 }
3336
3337
3338
3339
3340
3341
3342
3343 pwr_now += busiest->__cpu_power *
3344 min(busiest_load_per_task, max_load);
3345 pwr_now += this->__cpu_power *
3346 min(this_load_per_task, this_load);
3347 pwr_now /= SCHED_LOAD_SCALE;
3348
3349
3350 tmp = sg_div_cpu_power(busiest,
3351 busiest_load_per_task * SCHED_LOAD_SCALE);
3352 if (max_load > tmp)
3353 pwr_move += busiest->__cpu_power *
3354 min(busiest_load_per_task, max_load - tmp);
3355
3356
3357 if (max_load * busiest->__cpu_power <
3358 busiest_load_per_task * SCHED_LOAD_SCALE)
3359 tmp = sg_div_cpu_power(this,
3360 max_load * busiest->__cpu_power);
3361 else
3362 tmp = sg_div_cpu_power(this,
3363 busiest_load_per_task * SCHED_LOAD_SCALE);
3364 pwr_move += this->__cpu_power *
3365 min(this_load_per_task, this_load + tmp);
3366 pwr_move /= SCHED_LOAD_SCALE;
3367
3368
3369 if (pwr_move > pwr_now)
3370 *imbalance = busiest_load_per_task;
3371 }
3372
3373 return busiest;
3374
3375out_balanced:
3376#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3377 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3378 goto ret;
3379
3380 if (this == group_leader && group_leader != group_min) {
3381 *imbalance = min_load_per_task;
3382 return group_min;
3383 }
3384#endif
3385ret:
3386 *imbalance = 0;
3387 return NULL;
3388}
3389
3390
3391
3392
3393static struct rq *
3394find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3395 unsigned long imbalance, const cpumask_t *cpus)
3396{
3397 struct rq *busiest = NULL, *rq;
3398 unsigned long max_load = 0;
3399 int i;
3400
3401 for_each_cpu_mask_nr(i, group->cpumask) {
3402 unsigned long wl;
3403
3404 if (!cpu_isset(i, *cpus))
3405 continue;
3406
3407 rq = cpu_rq(i);
3408 wl = weighted_cpuload(i);
3409
3410 if (rq->nr_running == 1 && wl > imbalance)
3411 continue;
3412
3413 if (wl > max_load) {
3414 max_load = wl;
3415 busiest = rq;
3416 }
3417 }
3418
3419 return busiest;
3420}
3421
3422
3423
3424
3425
/*
 * Limit, in the same units as sd->balance_interval, below which
 * load_balance() keeps doubling the interval when all tasks were pinned.
 */
3426#define MAX_PINNED_INTERVAL 512
3427
3428
3429
3430
3431
/*
 * load_balance - try to move tasks to @this_cpu from the busiest runqueue
 * found in sched domain @sd.
 *
 * @this_cpu, @this_rq: destination CPU and its runqueue.
 * @sd:      domain being balanced; its balance_interval and
 *           nr_balance_failed fields are updated here.
 * @idle:    idle state of the destination; also indexes the schedstat
 *           counters.
 * @balance: handed to find_busiest_group(); if it reads back as 0 the
 *           domain is treated as balanced.
 * @cpus:    scratch mask, set to all-ones on entry; the busiest CPU is
 *           cleared from it before retrying when all_pinned was reported.
 *
 * Returns the non-zero result of move_tasks() when tasks were moved,
 * 0 when nothing was moved, or -1 when nothing was moved, sd_idle is
 * clear, @sd has SD_SHARE_CPUPOWER and its parent lacks
 * SD_POWERSAVINGS_BALANCE.
 */
3432static int load_balance(int this_cpu, struct rq *this_rq,
3433 struct sched_domain *sd, enum cpu_idle_type idle,
3434 int *balance, cpumask_t *cpus)
3435{
3436 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3437 struct sched_group *group;
3438 unsigned long imbalance;
3439 struct rq *busiest;
3440 unsigned long flags;
3441
3442 cpus_setall(*cpus);
3443
3444
3445
3446
3447
3448
3449
/*
 * sd_idle starts at 1 only for a not-busy destination in a domain that
 * shares CPU power and whose parent does not do powersavings balancing;
 * find_busiest_group() receives a pointer to it and may change it.
 */
3450 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
3451 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3452 sd_idle = 1;
3453
3454 schedstat_inc(sd, lb_count[idle]);
3455
3456redo:
3457 update_shares(sd);
3458 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3459 cpus, balance);
3460
3461 if (*balance == 0)
3462 goto out_balanced;
3463
3464 if (!group) {
3465 schedstat_inc(sd, lb_nobusyg[idle]);
3466 goto out_balanced;
3467 }
3468
3469 busiest = find_busiest_queue(group, idle, imbalance, cpus);
3470 if (!busiest) {
3471 schedstat_inc(sd, lb_nobusyq[idle]);
3472 goto out_balanced;
3473 }
3474
3475 BUG_ON(busiest == this_rq);
3476
3477 schedstat_add(sd, lb_imbalance[idle], imbalance);
3478
3479 ld_moved = 0;
/* Only attempt a move when the source has more than one runnable task. */
3480 if (busiest->nr_running > 1) {
3481
3482
3483
3484
3485
3486
/* Both runqueue locks are taken with local interrupts disabled. */
3487 local_irq_save(flags);
3488 double_rq_lock(this_rq, busiest);
3489 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3490 imbalance, sd, idle, &all_pinned);
3491 double_rq_unlock(this_rq, busiest);
3492 local_irq_restore(flags);
3493
3494
3495
3496
/* Tasks were moved to a CPU other than the one running this code. */
3497 if (ld_moved && this_cpu != smp_processor_id())
3498 resched_cpu(this_cpu);
3499
3500
/*
 * all_pinned is passed to move_tasks() by pointer (presumably set when
 * no task could be migrated because of affinity): drop that CPU from
 * the candidate mask and search again while candidates remain.
 */
3501 if (unlikely(all_pinned)) {
3502 cpu_clear(cpu_of(busiest), *cpus);
3503 if (!cpus_empty(*cpus))
3504 goto redo;
3505 goto out_balanced;
3506 }
3507 }
3508
3509 if (!ld_moved) {
3510 schedstat_inc(sd, lb_failed[idle]);
3511 sd->nr_balance_failed++;
3512
/* Too many consecutive failures: ask the busiest CPU to push a task. */
3513 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
3514
3515 spin_lock_irqsave(&busiest->lock, flags);
3516
3517
3518
3519
/* The task currently running there may not run on this_cpu. */
3520 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
3521 spin_unlock_irqrestore(&busiest->lock, flags);
3522 all_pinned = 1;
3523 goto out_one_pinned;
3524 }
3525
3526 if (!busiest->active_balance) {
3527 busiest->active_balance = 1;
3528 busiest->push_cpu = this_cpu;
3529 active_balance = 1;
3530 }
3531 spin_unlock_irqrestore(&busiest->lock, flags);
/* Wake the migration thread only after the lock has been dropped. */
3532 if (active_balance)
3533 wake_up_process(busiest->migration_thread);
3534
3535
3536
3537
3538
3539 sd->nr_balance_failed = sd->cache_nice_tries+1;
3540 }
3541 } else
3542 sd->nr_balance_failed = 0;
3543
3544 if (likely(!active_balance)) {
3545
/* No active balance requested: reset the interval to its minimum. */
3546 sd->balance_interval = sd->min_interval;
3547 } else {
3548
3549
3550
3551
3552
3553
/* Active balance requested: back off, up to max_interval. */
3554 if (sd->balance_interval < sd->max_interval)
3555 sd->balance_interval *= 2;
3556 }
3557
3558 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3559 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3560 ld_moved = -1;
3561
3562 goto out;
3563
3564out_balanced:
3565 schedstat_inc(sd, lb_balanced[idle]);
3566
3567 sd->nr_balance_failed = 0;
3568
3569out_one_pinned:
3570
/* Back off: double the interval while below the applicable limit. */
3571 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3572 (sd->balance_interval < sd->max_interval))
3573 sd->balance_interval *= 2;
3574
3575 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3576 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3577 ld_moved = -1;
3578 else
3579 ld_moved = 0;
3580out:
/* Any non-zero result (including -1) triggers another shares update. */
3581 if (ld_moved)
3582 update_shares(sd);
3583 return ld_moved;
3584}
3585
3586
3587
3588
3589
3590
3591
3592
/*
 * load_balance_newidle - variant of load_balance() that always balances
 * with CPU_NEWLY_IDLE as the idle type.
 *
 * @this_cpu, @this_rq: destination CPU and its runqueue.
 * @sd:   domain to search for a busier runqueue.
 * @cpus: scratch mask, set to all-ones on entry; the busiest CPU is
 *        cleared from it before retrying when all_pinned was reported.
 *
 * NOTE(review): uses update_shares_locked() and double_lock_balance(),
 * so this_rq->lock is presumably held by the caller - confirm.
 *
 * Returns the result of move_tasks() when tasks were moved, 0 when
 * nothing was moved, or -1 when nothing was moved, sd_idle is clear,
 * @sd has SD_SHARE_CPUPOWER and its parent lacks SD_POWERSAVINGS_BALANCE.
 */
3593static int
3594load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3595 cpumask_t *cpus)
3596{
3597 struct sched_group *group;
3598 struct rq *busiest = NULL;
3599 unsigned long imbalance;
3600 int ld_moved = 0;
3601 int sd_idle = 0;
3602 int all_pinned = 0;
3603
3604 cpus_setall(*cpus);
3605
3606
3607
3608
3609
3610
3611
/* Same sd_idle rule as load_balance(), minus the idle-type test. */
3612 if (sd->flags & SD_SHARE_CPUPOWER &&
3613 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3614 sd_idle = 1;
3615
3616 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
3617redo:
3618 update_shares_locked(this_rq, sd);
/* No balance out-parameter is supplied here (NULL). */
3619 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
3620 &sd_idle, cpus, NULL);
3621 if (!group) {
3622 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
3623 goto out_balanced;
3624 }
3625
3626 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
3627 if (!busiest) {
3628 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
3629 goto out_balanced;
3630 }
3631
3632 BUG_ON(busiest == this_rq);
3633
3634 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
3635
3636 ld_moved = 0;
3637 if (busiest->nr_running > 1) {
3638
3639 double_lock_balance(this_rq, busiest);
3640
/* Refresh the source runqueue clock before moving tasks off it. */
3641 update_rq_clock(busiest);
3642 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3643 imbalance, sd, CPU_NEWLY_IDLE,
3644 &all_pinned);
3645 double_unlock_balance(this_rq, busiest);
3646
/* Retry without that CPU while other candidates remain. */
3647 if (unlikely(all_pinned)) {
3648 cpu_clear(cpu_of(busiest), *cpus);
3649 if (!cpus_empty(*cpus))
3650 goto redo;
3651 }
3652 }
3653
3654 if (!ld_moved) {
3655 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
3656 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3657 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3658 return -1;
3659 } else
3660 sd->nr_balance_failed = 0;
3661
3662 update_shares_locked(this_rq, sd);
3663 return ld_moved;
3664
3665out_balanced:
3666 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
3667 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3668 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3669 return -1;
3670 sd->nr_balance_failed = 0;
3671
3672 return 0;
3673}
3674
3675
3676
3677
3678
3679static void idle_balance(int this_cpu, struct rq *this_rq)
3680{
3681 struct sched_domain *sd;
3682 int pulled_task = -1;
3683 unsigned long next_balance = jiffies + HZ;
3684 cpumask_t tmpmask;
3685
3686 for_each_domain(this_cpu, sd) {
3687 unsigned long interval;
3688
3689 if (!(sd->flags & SD_LOAD_BALANCE))
3690 continue;
3691
3692 if (sd->flags & SD_BALANCE_NEWIDLE)
3693
3694 pulled_task = load_balance_newidle(this_cpu, this_rq,
3695 sd, &tmpmask);
3696
3697 interval = msecs_to_jiffies(sd->balance_interval);
3698 if (time_after(next_balance, sd->last_balance + interval))
3699 next_balance = sd->last_balance + interval;
3700 if (pulled_task)
3701 break;
3702 }
3703 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3704
3705
3706
3707
3708 this_rq->next_balance = next_balance;
3709 }
3710}
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3721{
3722 int target_cpu = busiest_rq->push_cpu;
3723 struct sched_domain *sd;
3724 struct rq *target_rq;
3725
3726
3727 if (busiest_rq->nr_running <= 1)
3728 return;
3729
3730 target_rq = cpu_rq(target_cpu);
3731
3732
3733
3734
3735
3736
3737 BUG_ON(busiest_rq == target_rq);
3738
3739
3740 double_lock_balance(busiest_rq, target_rq);
3741 update_rq_clock(busiest_rq);
3742 update_rq_clock(target_rq);
3743
3744
3745 for_each_domain(target_cpu, sd) {
3746 if ((sd->flags & SD_LOAD_BALANCE) &&
3747 cpu_isset(busiest_cpu, sd->span))
3748 break;
3749 }
3750
3751 if (likely(sd)) {
3752 schedstat_inc(sd, alb_count);
3753
3754 if (move_one_task(target_rq, target_cpu, busiest_rq,
3755 sd, CPU_IDLE))
3756 schedstat_inc(sd, alb_pushed);
3757 else
3758 schedstat_inc(sd, alb_failed);
3759 }
3760 double_unlock_balance(busiest_rq, target_rq);
3761}
3762
3763#ifdef CONFIG_NO_HZ
/*
 * State shared by the CONFIG_NO_HZ balancing code below:
 *  - load_balancer: id of the CPU currently owning the balancer role,
 *    or -1 when nobody owns it;
 *  - cpu_mask: CPUs added by select_nohz_load_balancer(stop_tick != 0).
 */
3764static struct {
3765 atomic_t load_balancer;
3766 cpumask_t cpu_mask;
3767} nohz ____cacheline_aligned = {
3768 .load_balancer = ATOMIC_INIT(-1),
3769 .cpu_mask = CPU_MASK_NONE,
3770};
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792int select_nohz_load_balancer(int stop_tick)
3793{
3794 int cpu = smp_processor_id();
3795
3796 if (stop_tick) {
3797 cpu_set(cpu, nohz.cpu_mask);
3798 cpu_rq(cpu)->in_nohz_recently = 1;
3799
3800
3801
3802
3803 if (!cpu_active(cpu) &&
3804 atomic_read(&nohz.load_balancer) == cpu) {
3805 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3806 BUG();
3807 return 0;
3808 }
3809
3810
3811 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3812 if (atomic_read(&nohz.load_balancer) == cpu)
3813 atomic_set(&nohz.load_balancer, -1);
3814 return 0;
3815 }
3816
3817 if (atomic_read(&nohz.load_balancer) == -1) {
3818
3819 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3820 return 1;
3821 } else if (atomic_read(&nohz.load_balancer) == cpu)
3822 return 1;
3823 } else {
3824 if (!cpu_isset(cpu, nohz.cpu_mask))
3825 return 0;
3826
3827 cpu_clear(cpu, nohz.cpu_mask);
3828
3829 if (atomic_read(&nohz.load_balancer) == cpu)
3830 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3831 BUG();
3832 }
3833 return 0;
3834}
3835#endif
3836
/* Taken with spin_trylock() by rebalance_domains() for SD_SERIALIZE domains. */
3837static DEFINE_SPINLOCK(balancing);
3838
3839
3840
3841
3842
3843
3844
/*
 * rebalance_domains - run load_balance() on every sched domain of @cpu
 * whose balancing interval has expired, and record in rq->next_balance
 * the earliest time a domain is due again.
 *
 * @cpu:  CPU whose domains are walked.
 * @idle: idle state used for balancing; switched to CPU_NOT_IDLE for the
 *        remaining domains once load_balance() returns non-zero.
 */
3845static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3846{
3847 int balance = 1;
3848 struct rq *rq = cpu_rq(cpu);
3849 unsigned long interval;
3850 struct sched_domain *sd;
3851
3852 unsigned long next_balance = jiffies + 60*HZ;
3853 int update_next_balance = 0;
3854 int need_serialize;
3855 cpumask_t tmp;
3856
3857 for_each_domain(cpu, sd) {
3858 if (!(sd->flags & SD_LOAD_BALANCE))
3859 continue;
3860
3861 interval = sd->balance_interval;
/* Any state other than CPU_IDLE stretches the interval by busy_factor. */
3862 if (idle != CPU_IDLE)
3863 interval *= sd->busy_factor;
3864
3865
/* Convert to jiffies and clamp to [1, HZ*NR_CPUS/10]. */
3866 interval = msecs_to_jiffies(interval);
3867 if (unlikely(!interval))
3868 interval = 1;
3869 if (interval > HZ*NR_CPUS/10)
3870 interval = HZ*NR_CPUS/10;
3871
3872 need_serialize = sd->flags & SD_SERIALIZE;
3873
/* Serialized domains are skipped this round if the lock is contended. */
3874 if (need_serialize) {
3875 if (!spin_trylock(&balancing))
3876 goto out;
3877 }
3878
3879 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3880 if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
3881
3882
3883
3884
3885
/* Non-zero result: stop treating this CPU as idle for later domains. */
3886 idle = CPU_NOT_IDLE;
3887 }
3888 sd->last_balance = jiffies;
3889 }
3890 if (need_serialize)
3891 spin_unlock(&balancing);
3892out:
3893 if (time_after(next_balance, sd->last_balance + interval)) {
3894 next_balance = sd->last_balance + interval;
3895 update_next_balance = 1;
3896 }
3897
3898
3899
3900
3901
3902
/* load_balance() cleared 'balance': do not continue to further domains. */
3903 if (!balance)
3904 break;
3905 }
3906
3907
3908
3909
3910
3911
/* Only overwrite rq->next_balance if some domain supplied a deadline. */
3912 if (likely(update_next_balance))
3913 rq->next_balance = next_balance;
3914}
3915
3916
3917
3918
3919
3920
3921static void run_rebalance_domains(struct softirq_action *h)
3922{
3923 int this_cpu = smp_processor_id();
3924 struct rq *this_rq = cpu_rq(this_cpu);
3925 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3926 CPU_IDLE : CPU_NOT_IDLE;
3927
3928 rebalance_domains(this_cpu, idle);
3929
3930#ifdef CONFIG_NO_HZ
3931
3932
3933
3934
3935
3936 if (this_rq->idle_at_tick &&
3937 atomic_read(&nohz.load_balancer) == this_cpu) {
3938 cpumask_t cpus = nohz.cpu_mask;
3939 struct rq *rq;
3940 int balance_cpu;
3941
3942 cpu_clear(this_cpu, cpus);
3943 for_each_cpu_mask_nr(balance_cpu, cpus) {
3944
3945
3946
3947
3948
3949 if (need_resched())
3950 break;
3951
3952 rebalance_domains(balance_cpu, CPU_IDLE);
3953
3954 rq = cpu_rq(balance_cpu);
3955 if (time_after(this_rq->next_balance, rq->next_balance))
3956 this_rq->next_balance = rq->next_balance;
3957 }
3958 }
3959#endif
3960}
3961
3962
3963
3964
3965
3966
3967
3968
/*
 * trigger_load_balance - raise SCHED_SOFTIRQ once rq->next_balance has
 * been reached.  Called from scheduler_tick().
 *
 * @rq:  runqueue of @cpu.
 * @cpu: CPU taking the tick.
 *
 * With CONFIG_NO_HZ it first maintains the nohz balancer state and may
 * return without raising the softirq.
 */
3969static inline void trigger_load_balance(struct rq *rq, int cpu)
3970{
3971#ifdef CONFIG_NO_HZ
3972
3973
3974
3975
3976
/* CPU was recently added to the nohz set but is busy at this tick. */
3977 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3978 rq->in_nohz_recently = 0;
3979
/* A busy CPU gives up the balancer role. */
3980 if (atomic_read(&nohz.load_balancer) == cpu) {
3981 cpu_clear(cpu, nohz.cpu_mask);
3982 atomic_set(&nohz.load_balancer, -1);
3983 }
3984
3985 if (atomic_read(&nohz.load_balancer) == -1) {
3986
3987
3988
3989
3990
3991
3992
3993
/* No owner: poke the first CPU in the nohz mask, if there is one. */
3994 int ilb = first_cpu(nohz.cpu_mask);
3995
3996 if (ilb < nr_cpu_ids)
3997 resched_cpu(ilb);
3998 }
3999 }
4000
4001
4002
4003
4004
/* Idle owner while every online CPU is in the nohz mask: reschedule it. */
4005 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4006 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
4007 resched_cpu(cpu);
4008 return;
4009 }
4010
4011
4012
4013
4014
/* Idle non-owner that is in the nohz mask: nothing to raise here. */
4015 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4016 cpu_isset(cpu, nohz.cpu_mask))
4017 return;
4018#endif
4019 if (time_after_eq(jiffies, rq->next_balance))
4020 raise_softirq(SCHED_SOFTIRQ);
4021}
4022
4023#else
4024
4025
4026
4027
/*
 * Empty idle_balance() for the #else configuration (presumably !CONFIG_SMP;
 * the matching #if is outside this view - confirm): nothing to balance.
 */
4028static inline void idle_balance(int cpu, struct rq *rq)
4029{
4030}
4031
4032#endif
4033
/* Per-CPU kernel statistics; the accounting helpers below update its cpustat member. */
4034DEFINE_PER_CPU(struct kernel_stat, kstat);
4035
4036EXPORT_PER_CPU_SYMBOL(kstat);
4037
4038
4039
4040
4041
4042unsigned long long task_sched_runtime(struct task_struct *p)
4043{
4044 unsigned long flags;
4045 u64 ns, delta_exec;
4046 struct rq *rq;
4047
4048 rq = task_rq_lock(p, &flags);
4049 ns = p->se.sum_exec_runtime;
4050 if (task_current(rq, p)) {
4051 update_rq_clock(rq);
4052 delta_exec = rq->clock - p->se.exec_start;
4053 if ((s64)delta_exec > 0)
4054 ns += delta_exec;
4055 }
4056 task_rq_unlock(rq, &flags);
4057
4058 return ns;
4059}
4060
4061
4062
4063
4064
4065
4066void account_user_time(struct task_struct *p, cputime_t cputime)
4067{
4068 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4069 cputime64_t tmp;
4070
4071 p->utime = cputime_add(p->utime, cputime);
4072
4073
4074 tmp = cputime_to_cputime64(cputime);
4075 if (TASK_NICE(p) > 0)
4076 cpustat->nice = cputime64_add(cpustat->nice, tmp);
4077 else
4078 cpustat->user = cputime64_add(cpustat->user, tmp);
4079
4080 acct_update_integrals(p);
4081}
4082
4083
4084
4085
4086
4087
4088static void account_guest_time(struct task_struct *p, cputime_t cputime)
4089{
4090 cputime64_t tmp;
4091 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4092
4093 tmp = cputime_to_cputime64(cputime);
4094
4095 p->utime = cputime_add(p->utime, cputime);
4096 p->gtime = cputime_add(p->gtime, cputime);
4097
4098 cpustat->user = cputime64_add(cpustat->user, tmp);
4099 cpustat->guest = cputime64_add(cpustat->guest, tmp);
4100}
4101
4102
4103
4104
4105
4106
4107void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
4108{
4109 p->utimescaled = cputime_add(p->utimescaled, cputime);
4110}
4111
4112
4113
4114
4115
4116
4117
4118void account_system_time(struct task_struct *p, int hardirq_offset,
4119 cputime_t cputime)
4120{
4121 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4122 struct rq *rq = this_rq();
4123 cputime64_t tmp;
4124
4125 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4126 account_guest_time(p, cputime);
4127 return;
4128 }
4129
4130 p->stime = cputime_add(p->stime, cputime);
4131
4132
4133 tmp = cputime_to_cputime64(cputime);
4134 if (hardirq_count() - hardirq_offset)
4135 cpustat->irq = cputime64_add(cpustat->irq, tmp);
4136 else if (softirq_count())
4137 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
4138 else if (p != rq->idle)
4139 cpustat->system = cputime64_add(cpustat->system, tmp);
4140 else if (atomic_read(&rq->nr_iowait) > 0)
4141 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4142 else
4143 cpustat->idle = cputime64_add(cpustat->idle, tmp);
4144
4145 acct_update_integrals(p);
4146}
4147
4148
4149
4150
4151
4152
4153
4154void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
4155{
4156 p->stimescaled = cputime_add(p->stimescaled, cputime);
4157}
4158
4159
4160
4161
4162
4163
4164void account_steal_time(struct task_struct *p, cputime_t steal)
4165{
4166 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4167 cputime64_t tmp = cputime_to_cputime64(steal);
4168 struct rq *rq = this_rq();
4169
4170 if (p == rq->idle) {
4171 p->stime = cputime_add(p->stime, steal);
4172 if (atomic_read(&rq->nr_iowait) > 0)
4173 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4174 else
4175 cpustat->idle = cputime64_add(cpustat->idle, tmp);
4176 } else
4177 cpustat->steal = cputime64_add(cpustat->steal, tmp);
4178}
4179
4180
4181
4182
4183#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4184cputime_t task_utime(struct task_struct *p)
4185{
4186 return p->utime;
4187}
4188
4189cputime_t task_stime(struct task_struct *p)
4190{
4191 return p->stime;
4192}
4193#else
4194cputime_t task_utime(struct task_struct *p)
4195{
4196 clock_t utime = cputime_to_clock_t(p->utime),
4197 total = utime + cputime_to_clock_t(p->stime);
4198 u64 temp;
4199
4200
4201
4202
4203 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
4204
4205 if (total) {
4206 temp *= utime;
4207 do_div(temp, total);
4208 }
4209 utime = (clock_t)temp;
4210
4211 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
4212 return p->prev_utime;
4213}
4214
4215cputime_t task_stime(struct task_struct *p)
4216{
4217 clock_t stime;
4218
4219
4220
4221
4222
4223
4224 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
4225 cputime_to_clock_t(task_utime(p));
4226
4227 if (stime >= 0)
4228 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
4229
4230 return p->prev_stime;
4231}
4232#endif
4233
4234inline cputime_t task_gtime(struct task_struct *p)
4235{
4236 return p->gtime;
4237}
4238
4239
4240
4241
4242
4243
4244
4245
4246void scheduler_tick(void)
4247{
4248 int cpu = smp_processor_id();
4249 struct rq *rq = cpu_rq(cpu);
4250 struct task_struct *curr = rq->curr;
4251
4252 sched_clock_tick();
4253
4254 spin_lock(&rq->lock);
4255 update_rq_clock(rq);
4256 update_cpu_load(rq);
4257 curr->sched_class->task_tick(rq, curr, 0);
4258 spin_unlock(&rq->lock);
4259
4260#ifdef CONFIG_SMP
4261 rq->idle_at_tick = idle_cpu(cpu);
4262 trigger_load_balance(rq, cpu);
4263#endif
4264}
4265
4266#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4267 defined(CONFIG_PREEMPT_TRACER))
4268
4269static inline unsigned long get_parent_ip(unsigned long addr)
4270{
4271 if (in_lock_functions(addr)) {
4272 addr = CALLER_ADDR2;
4273 if (in_lock_functions(addr))
4274 addr = CALLER_ADDR3;
4275 }
4276 return addr;
4277}
4278
4279void __kprobes add_preempt_count(int val)
4280{
4281#ifdef CONFIG_DEBUG_PREEMPT
4282
4283
4284
4285 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4286 return;
4287#endif
4288 preempt_count() += val;
4289#ifdef CONFIG_DEBUG_PREEMPT
4290
4291
4292
4293 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4294 PREEMPT_MASK - 10);
4295#endif
4296 if (preempt_count() == val)
4297 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4298}
4299EXPORT_SYMBOL(add_preempt_count);
4300
4301void __kprobes sub_preempt_count(int val)
4302{
4303#ifdef CONFIG_DEBUG_PREEMPT
4304
4305
4306
4307 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4308 return;
4309
4310
4311
4312 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4313 !(preempt_count() & PREEMPT_MASK)))
4314 return;
4315#endif
4316
4317 if (preempt_count() == val)
4318 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4319 preempt_count() -= val;
4320}
4321EXPORT_SYMBOL(sub_preempt_count);
4322
4323#endif
4324
4325
4326
4327
4328static noinline void __schedule_bug(struct task_struct *prev)
4329{
4330 struct pt_regs *regs = get_irq_regs();
4331
4332 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4333 prev->comm, prev->pid, preempt_count());
4334
4335 debug_show_held_locks(prev);
4336 print_modules();
4337 if (irqs_disabled())
4338 print_irqtrace_events(prev);
4339
4340 if (regs)
4341 show_regs(regs);
4342 else
4343 dump_stack();
4344}
4345
4346
4347
4348
4349static inline void schedule_debug(struct task_struct *prev)
4350{
4351
4352
4353
4354
4355
4356 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4357 __schedule_bug(prev);
4358
4359 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4360
4361 schedstat_inc(this_rq(), sched_count);
4362#ifdef CONFIG_SCHEDSTATS
4363 if (unlikely(prev->lock_depth >= 0)) {
4364 schedstat_inc(this_rq(), bkl_count);
4365 schedstat_inc(prev, sched_info.bkl_count);
4366 }
4367#endif
4368}
4369
4370
4371
4372
4373static inline struct task_struct *
4374pick_next_task(struct rq *rq, struct task_struct *prev)
4375{
4376 const struct sched_class *class;
4377 struct task_struct *p;
4378
4379
4380
4381
4382
4383 if (likely(rq->nr_running == rq->cfs.nr_running)) {
4384 p = fair_sched_class.pick_next_task(rq);
4385 if (likely(p))
4386 return p;
4387 }
4388
4389 class = sched_class_highest;
4390 for ( ; ; ) {
4391 p = class->pick_next_task(rq);
4392 if (p)
4393 return p;
4394
4395
4396
4397
4398 class = class->next;
4399 }
4400}
4401
4402
4403
4404
/*
 * schedule - main scheduler entry point: select the next task for the
 * current CPU's runqueue and switch to it.
 *
 * Loops back to need_resched while TIF_NEED_RESCHED is set on return, and
 * to need_resched_nonpreemptible when reacquire_kernel_lock() reports a
 * negative value.
 */
4405asmlinkage void __sched schedule(void)
4406{
4407 struct task_struct *prev, *next;
4408 unsigned long *switch_count;
4409 struct rq *rq;
4410 int cpu;
4411
4412need_resched:
4413 preempt_disable();
4414 cpu = smp_processor_id();
4415 rq = cpu_rq(cpu);
4416 rcu_qsctr_inc(cpu);
4417 prev = rq->curr;
/* Default to the involuntary counter; switched to nvcsw below if prev blocks. */
4418 switch_count = &prev->nivcsw;
4419
4420 release_kernel_lock(prev);
4421need_resched_nonpreemptible:
4422
4423 schedule_debug(prev);
4424
4425 if (sched_feat(HRTICK))
4426 hrtick_clear(rq);
4427
4428
4429
4430
/* The runqueue clock is updated with irqs off, before rq->lock is taken. */
4431 local_irq_disable();
4432 update_rq_clock(rq);
4433 spin_lock(&rq->lock);
4434 clear_tsk_need_resched(prev);
4435
/*
 * prev is not TASK_RUNNING and this is not a PREEMPT_ACTIVE call: either
 * make it runnable again because a signal is pending for its state, or
 * take it off the runqueue.
 */
4436 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
4437 if (unlikely(signal_pending_state(prev->state, prev)))
4438 prev->state = TASK_RUNNING;
4439 else
4440 deactivate_task(rq, prev, 1);
4441 switch_count = &prev->nvcsw;
4442 }
4443
4444#ifdef CONFIG_SMP
4445 if (prev->sched_class->pre_schedule)
4446 prev->sched_class->pre_schedule(rq, prev);
4447#endif
4448
/* Nothing runnable left: try to pull work from other CPUs. */
4449 if (unlikely(!rq->nr_running))
4450 idle_balance(cpu, rq);
4451
4452 prev->sched_class->put_prev_task(rq, prev);
4453 next = pick_next_task(rq, prev);
4454
4455 if (likely(prev != next)) {
4456 sched_info_switch(prev, next);
4457
4458 rq->nr_switches++;
4459 rq->curr = next;
4460 ++*switch_count;
4461
4462 context_switch(rq, prev, next);
4463
4464
4465
4466
/* cpu and rq are re-read after the switch rather than reused. */
4467 cpu = smp_processor_id();
4468 rq = cpu_rq(cpu);
4469 } else
4470 spin_unlock_irq(&rq->lock);
4471
4472 if (unlikely(reacquire_kernel_lock(current) < 0))
4473 goto need_resched_nonpreemptible;
4474
4475 preempt_enable_no_resched();
4476 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
4477 goto need_resched;
4478}
4479EXPORT_SYMBOL(schedule);
4480
4481#ifdef CONFIG_PREEMPT
4482
4483
4484
4485
4486
4487asmlinkage void __sched preempt_schedule(void)
4488{
4489 struct thread_info *ti = current_thread_info();
4490
4491
4492
4493
4494
4495 if (likely(ti->preempt_count || irqs_disabled()))
4496 return;
4497
4498 do {
4499 add_preempt_count(PREEMPT_ACTIVE);
4500 schedule();
4501 sub_preempt_count(PREEMPT_ACTIVE);
4502
4503
4504
4505
4506
4507 barrier();
4508 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
4509}
4510EXPORT_SYMBOL(preempt_schedule);
4511
4512
4513
4514
4515
4516
4517
4518asmlinkage void __sched preempt_schedule_irq(void)
4519{
4520 struct thread_info *ti = current_thread_info();
4521
4522
4523 BUG_ON(ti->preempt_count || !irqs_disabled());
4524
4525 do {
4526 add_preempt_count(PREEMPT_ACTIVE);
4527 local_irq_enable();
4528 schedule();
4529 local_irq_disable();
4530 sub_preempt_count(PREEMPT_ACTIVE);
4531
4532
4533
4534
4535
4536 barrier();
4537 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
4538}
4539
4540#endif
4541
4542int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
4543 void *key)
4544{
4545 return try_to_wake_up(curr->private, mode, sync);
4546}
4547EXPORT_SYMBOL(default_wake_function);
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4559 int nr_exclusive, int sync, void *key)
4560{
4561 wait_queue_t *curr, *next;
4562
4563 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
4564 unsigned flags = curr->flags;
4565
4566 if (curr->func(curr, mode, sync, key) &&
4567 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
4568 break;
4569 }
4570}
4571
4572
4573
4574
4575
4576
4577
4578
4579void __wake_up(wait_queue_head_t *q, unsigned int mode,
4580 int nr_exclusive, void *key)
4581{
4582 unsigned long flags;
4583
4584 spin_lock_irqsave(&q->lock, flags);
4585 __wake_up_common(q, mode, nr_exclusive, 0, key);
4586 spin_unlock_irqrestore(&q->lock, flags);
4587}
4588EXPORT_SYMBOL(__wake_up);
4589
4590
4591
4592
4593void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4594{
4595 __wake_up_common(q, mode, 1, 0, NULL);
4596}
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611void
4612__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4613{
4614 unsigned long flags;
4615 int sync = 1;
4616
4617 if (unlikely(!q))
4618 return;
4619
4620 if (unlikely(!nr_exclusive))
4621 sync = 0;
4622
4623 spin_lock_irqsave(&q->lock, flags);
4624 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
4625 spin_unlock_irqrestore(&q->lock, flags);
4626}
4627EXPORT_SYMBOL_GPL(__wake_up_sync);
4628
4629void complete(struct completion *x)
4630{
4631 unsigned long flags;
4632
4633 spin_lock_irqsave(&x->wait.lock, flags);
4634 x->done++;
4635 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
4636 spin_unlock_irqrestore(&x->wait.lock, flags);
4637}
4638EXPORT_SYMBOL(complete);
4639
4640void complete_all(struct completion *x)
4641{
4642 unsigned long flags;
4643
4644 spin_lock_irqsave(&x->wait.lock, flags);
4645 x->done += UINT_MAX/2;
4646 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
4647 spin_unlock_irqrestore(&x->wait.lock, flags);
4648}
4649EXPORT_SYMBOL(complete_all);
4650
4651static inline long __sched
4652do_wait_for_common(struct completion *x, long timeout, int state)
4653{
4654 if (!x->done) {
4655 DECLARE_WAITQUEUE(wait, current);
4656
4657 wait.flags |= WQ_FLAG_EXCLUSIVE;
4658 __add_wait_queue_tail(&x->wait, &wait);
4659 do {
4660 if ((state == TASK_INTERRUPTIBLE &&
4661 signal_pending(current)) ||
4662 (state == TASK_KILLABLE &&
4663 fatal_signal_pending(current))) {
4664 timeout = -ERESTARTSYS;
4665 break;
4666 }
4667 __set_current_state(state);
4668 spin_unlock_irq(&x->wait.lock);
4669 timeout = schedule_timeout(timeout);
4670 spin_lock_irq(&x->wait.lock);
4671 } while (!x->done && timeout);
4672 __remove_wait_queue(&x->wait, &wait);
4673 if (!x->done)
4674 return timeout;
4675 }
4676 x->done--;
4677 return timeout ?: 1;
4678}
4679
4680static long __sched
4681wait_for_common(struct completion *x, long timeout, int state)
4682{
4683 might_sleep();
4684
4685 spin_lock_irq(&x->wait.lock);
4686 timeout = do_wait_for_common(x, timeout, state);
4687 spin_unlock_irq(&x->wait.lock);
4688 return timeout;
4689}
4690
4691void __sched wait_for_completion(struct completion *x)
4692{
4693 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4694}
4695EXPORT_SYMBOL(wait_for_completion);
4696
4697unsigned long __sched
4698wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4699{
4700 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4701}
4702EXPORT_SYMBOL(wait_for_completion_timeout);
4703
4704int __sched wait_for_completion_interruptible(struct completion *x)
4705{
4706 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4707 if (t == -ERESTARTSYS)
4708 return t;
4709 return 0;
4710}
4711EXPORT_SYMBOL(wait_for_completion_interruptible);
4712
4713unsigned long __sched
4714wait_for_completion_interruptible_timeout(struct completion *x,
4715 unsigned long timeout)
4716{
4717 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4718}
4719EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4720
4721int __sched wait_for_completion_killable(struct completion *x)
4722{
4723 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4724 if (t == -ERESTARTSYS)
4725 return t;
4726 return 0;
4727}
4728EXPORT_SYMBOL(wait_for_completion_killable);
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742bool try_wait_for_completion(struct completion *x)
4743{
4744 int ret = 1;
4745
4746 spin_lock_irq(&x->wait.lock);
4747 if (!x->done)
4748 ret = 0;
4749 else
4750 x->done--;
4751 spin_unlock_irq(&x->wait.lock);
4752 return ret;
4753}
4754EXPORT_SYMBOL(try_wait_for_completion);
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764bool completion_done(struct completion *x)
4765{
4766 int ret = 1;
4767
4768 spin_lock_irq(&x->wait.lock);
4769 if (!x->done)
4770 ret = 0;
4771 spin_unlock_irq(&x->wait.lock);
4772 return ret;
4773}
4774EXPORT_SYMBOL(completion_done);
4775
/*
 * sleep_on_common - put the current task on wait queue @q in @state and
 * sleep via schedule_timeout(@timeout).
 *
 * Returns the value returned by schedule_timeout().
 *
 * NOTE(review): the task state is set before the task is queued and
 * before q->lock is taken - confirm callers tolerate a wake-up that
 * arrives before the entry is on the queue.
 */
4776static long __sched
4777sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4778{
4779 unsigned long flags;
4780 wait_queue_t wait;
4781
4782 init_waitqueue_entry(&wait, current);
4783
4784 __set_current_state(state);
4785
4786 spin_lock_irqsave(&q->lock, flags);
4787 __add_wait_queue(q, &wait);
/*
 * Plain unlock: the interrupt state saved in 'flags' is only restored by
 * the final spin_unlock_irqrestore() below.
 */
4788 spin_unlock(&q->lock);
4789 timeout = schedule_timeout(timeout);
4790 spin_lock_irq(&q->lock);
4791 __remove_wait_queue(q, &wait);
4792 spin_unlock_irqrestore(&q->lock, flags);
4793
4794 return timeout;
4795}
4796
/*
 * interruptible_sleep_on - sleep on @q in TASK_INTERRUPTIBLE state with
 * MAX_SCHEDULE_TIMEOUT; the result of sleep_on_common() is discarded.
 */
void __sched interruptible_sleep_on(wait_queue_head_t *q)
{
	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(interruptible_sleep_on);
4802
/*
 * interruptible_sleep_on_timeout - sleep on @q in TASK_INTERRUPTIBLE state
 * for at most @timeout; returns the value produced by sleep_on_common().
 */
long __sched
interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
}
EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4809
/*
 * sleep_on - sleep on @q in TASK_UNINTERRUPTIBLE state with
 * MAX_SCHEDULE_TIMEOUT; the result of sleep_on_common() is discarded.
 */
void __sched sleep_on(wait_queue_head_t *q)
{
	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(sleep_on);
4815
/*
 * sleep_on_timeout - sleep on @q in TASK_UNINTERRUPTIBLE state for at most
 * @timeout; returns the value produced by sleep_on_common().
 */
long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
}
EXPORT_SYMBOL(sleep_on_timeout);
4821
4822#ifdef CONFIG_RT_MUTEXES
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task to modify
 * @prio: new value for p->prio; must lie in [0, MAX_PRIO] (BUG otherwise)
 *
 * Changes p->prio and, depending on rt_prio(@prio), switches the task to
 * rt_sched_class or fair_sched_class. The whole update runs under the
 * task's runqueue lock.
 */
void rt_mutex_setprio(struct task_struct *p, int prio)
{
	unsigned long flags;
	int oldprio, on_rq, running;
	struct rq *rq;
	/* Class on entry, handed to check_class_changed() below. */
	const struct sched_class *prev_class = p->sched_class;

	BUG_ON(prio < 0 || prio > MAX_PRIO);

	rq = task_rq_lock(p, &flags);
	update_rq_clock(rq);

	oldprio = p->prio;
	on_rq = p->se.on_rq;
	running = task_current(rq, p);
	/* Take the task off the queue (using its old class) while it changes. */
	if (on_rq)
		dequeue_task(rq, p, 0);
	if (running)
		p->sched_class->put_prev_task(rq, p);

	if (rt_prio(prio))
		p->sched_class = &rt_sched_class;
	else
		p->sched_class = &fair_sched_class;

	p->prio = prio;

	/* Re-install the task through its (possibly new) class. */
	if (running)
		p->sched_class->set_curr_task(rq);
	if (on_rq) {
		enqueue_task(rq, p, 0);

		check_class_changed(rq, p, prev_class, oldprio, running);
	}
	task_rq_unlock(rq, &flags);
}
4870
4871#endif
4872
4873void set_user_nice(struct task_struct *p, long nice)
4874{
4875 int old_prio, delta, on_rq;
4876 unsigned long flags;
4877 struct rq *rq;
4878
4879 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4880 return;
4881
4882
4883
4884
4885 rq = task_rq_lock(p, &flags);
4886 update_rq_clock(rq);
4887
4888
4889
4890
4891
4892
4893 if (task_has_rt_policy(p)) {
4894 p->static_prio = NICE_TO_PRIO(nice);
4895 goto out_unlock;
4896 }
4897 on_rq = p->se.on_rq;
4898 if (on_rq)
4899 dequeue_task(rq, p, 0);
4900
4901 p->static_prio = NICE_TO_PRIO(nice);
4902 set_load_weight(p);
4903 old_prio = p->prio;
4904 p->prio = effective_prio(p);
4905 delta = p->prio - old_prio;
4906
4907 if (on_rq) {
4908 enqueue_task(rq, p, 0);
4909
4910
4911
4912
4913 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4914 resched_task(rq->curr);
4915 }
4916out_unlock:
4917 task_rq_unlock(rq, &flags);
4918}
4919EXPORT_SYMBOL(set_user_nice);
4920
4921
4922
4923
4924
4925
4926int can_nice(const struct task_struct *p, const int nice)
4927{
4928
4929 int nice_rlim = 20 - nice;
4930
4931 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
4932 capable(CAP_SYS_NICE));
4933}
4934
4935#ifdef __ARCH_WANT_SYS_NICE
4936
4937
4938
4939
4940
4941
4942
4943
4944SYSCALL_DEFINE1(nice, int, increment)
4945{
4946 long nice, retval;
4947
4948
4949
4950
4951
4952
4953 if (increment < -40)
4954 increment = -40;
4955 if (increment > 40)
4956 increment = 40;
4957
4958 nice = PRIO_TO_NICE(current->static_prio) + increment;
4959 if (nice < -20)
4960 nice = -20;
4961 if (nice > 19)
4962 nice = 19;
4963
4964 if (increment < 0 && !can_nice(current, nice))
4965 return -EPERM;
4966
4967 retval = security_task_setnice(current, nice);
4968 if (retval)
4969 return retval;
4970
4971 set_user_nice(current, nice);
4972 return 0;
4973}
4974
4975#endif
4976
4977
4978
4979
4980
4981
4982
4983
4984
/*
 * task_prio - return a task's current priority relative to MAX_RT_PRIO
 * @p: task in question
 *
 * Returns p->prio - MAX_RT_PRIO.
 */
int task_prio(const struct task_struct *p)
{
	return p->prio - MAX_RT_PRIO;
}
4989
4990
4991
4992
4993
/*
 * task_nice - return the nice value of a task
 * @p: task in question
 *
 * The value is derived from p->static_prio via TASK_NICE().
 */
int task_nice(const struct task_struct *p)
{
	return TASK_NICE(p);
}
EXPORT_SYMBOL(task_nice);
4999
5000
5001
5002
5003
/*
 * idle_cpu - test whether a CPU is currently running its idle task
 * @cpu: CPU number
 *
 * Returns non-zero when the task currently running on @cpu is the idle
 * task of that CPU's runqueue.
 */
int idle_cpu(int cpu)
{
	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
}
5008
5009
5010
5011
5012
/*
 * idle_task - return the idle task recorded in a CPU's runqueue
 * @cpu: CPU number
 */
struct task_struct *idle_task(int cpu)
{
	return cpu_rq(cpu)->idle;
}
5017
5018
5019
5020
5021
5022static struct task_struct *find_process_by_pid(pid_t pid)
5023{
5024 return pid ? find_task_by_vpid(pid) : current;
5025}
5026
5027
5028static void
5029__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
5030{
5031 BUG_ON(p->se.on_rq);
5032
5033 p->policy = policy;
5034 switch (p->policy) {
5035 case SCHED_NORMAL:
5036 case SCHED_BATCH:
5037 case SCHED_IDLE:
5038 p->sched_class = &fair_sched_class;
5039 break;
5040 case SCHED_FIFO:
5041 case SCHED_RR:
5042 p->sched_class = &rt_sched_class;
5043 break;
5044 }
5045
5046 p->rt_priority = prio;
5047 p->normal_prio = normal_prio(p);
5048
5049 p->prio = rt_mutex_getprio(p);
5050 set_load_weight(p);
5051}
5052
/*
 * __sched_setscheduler - change the scheduling policy and/or RT priority
 * @p: task to modify
 * @policy: new policy, or a negative value to keep the current one
 * @param: carries the new sched_priority
 * @user: true to apply the permission and security checks below
 *
 * Returns 0 on success, -EINVAL for an invalid policy/priority
 * combination, -EPERM when a permission check fails, -ESRCH when the
 * task's sighand cannot be locked, or the error returned by
 * security_task_setscheduler().
 */
static int __sched_setscheduler(struct task_struct *p, int policy,
				struct sched_param *param, bool user)
{
	int retval, oldprio, oldpolicy = -1, on_rq, running;
	unsigned long flags;
	const struct sched_class *prev_class = p->sched_class;
	struct rq *rq;

	/* Must not be called from interrupt context. */
	BUG_ON(in_interrupt());
recheck:
	/*
	 * A negative policy means "keep the current policy"; remember what
	 * was read in oldpolicy so it can be re-validated under the lock.
	 */
	if (policy < 0)
		policy = oldpolicy = p->policy;
	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
			policy != SCHED_NORMAL && policy != SCHED_BATCH &&
			policy != SCHED_IDLE)
		return -EINVAL;

	/*
	 * Priority range: 0..MAX_USER_RT_PRIO-1 for tasks with an mm,
	 * 0..MAX_RT_PRIO-1 for tasks without one. RT policies require a
	 * non-zero priority, all other policies require zero.
	 */
	if (param->sched_priority < 0 ||
	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
		return -EINVAL;
	if (rt_policy(policy) != (param->sched_priority != 0))
		return -EINVAL;

	/*
	 * Restrictions for user requests made without CAP_SYS_NICE.
	 */
	if (user && !capable(CAP_SYS_NICE)) {
		if (rt_policy(policy)) {
			unsigned long rlim_rtprio;

			if (!lock_task_sighand(p, &flags))
				return -ESRCH;
			rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
			unlock_task_sighand(p, &flags);

			/* A zero RLIMIT_RTPRIO forbids changing the policy. */
			if (policy != p->policy && !rlim_rtprio)
				return -EPERM;

			/* Priority may not rise above both the old value and the rlimit. */
			if (param->sched_priority > p->rt_priority &&
			    param->sched_priority > rlim_rtprio)
				return -EPERM;
		}

		/*
		 * A SCHED_IDLE task may not be moved to a different policy
		 * on this path.
		 */
		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
			return -EPERM;

		/* The caller's euid must match the target's euid or uid. */
		if ((current->euid != p->euid) &&
		    (current->euid != p->uid))
			return -EPERM;
	}

	if (user) {
#ifdef CONFIG_RT_GROUP_SCHED
		/*
		 * Refuse an RT policy when the task's group has a zero
		 * rt_runtime.
		 */
		if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
			return -EPERM;
#endif

		retval = security_task_setscheduler(p, policy, param);
		if (retval)
			return retval;
	}

	/*
	 * Lock order: p->pi_lock (with interrupts disabled) first, then the
	 * task's runqueue lock.
	 */
	spin_lock_irqsave(&p->pi_lock, flags);

	rq = __task_rq_lock(p);

	/*
	 * If the policy read before taking the locks no longer matches,
	 * drop both locks and redo the checks against the current policy.
	 */
	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
		policy = oldpolicy = -1;
		__task_rq_unlock(rq);
		spin_unlock_irqrestore(&p->pi_lock, flags);
		goto recheck;
	}
	update_rq_clock(rq);
	on_rq = p->se.on_rq;
	running = task_current(rq, p);
	/* __setscheduler() requires the task to be off the runqueue. */
	if (on_rq)
		deactivate_task(rq, p, 0);
	if (running)
		p->sched_class->put_prev_task(rq, p);

	oldprio = p->prio;
	__setscheduler(rq, p, policy, param->sched_priority);

	if (running)
		p->sched_class->set_curr_task(rq);
	if (on_rq) {
		activate_task(rq, p, 0);

		check_class_changed(rq, p, prev_class, oldprio, running);
	}
	__task_rq_unlock(rq);
	spin_unlock_irqrestore(&p->pi_lock, flags);

	/* Called only after both locks have been released. */
	rt_mutex_adjust_pi(p);

	return 0;
}
5174
5175
5176
5177
5178
5179
5180
5181
5182
/**
 * sched_setscheduler - change the scheduling policy and/or RT priority
 * @p: task to modify
 * @policy: new policy
 * @param: structure containing the new RT priority
 *
 * Calls __sched_setscheduler() with user == true, i.e. with its
 * permission and security checks enabled.
 */
int sched_setscheduler(struct task_struct *p, int policy,
		       struct sched_param *param)
{
	return __sched_setscheduler(p, policy, param, true);
}
EXPORT_SYMBOL_GPL(sched_setscheduler);
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
/**
 * sched_setscheduler_nocheck - change policy and/or RT priority, unchecked
 * @p: task to modify
 * @policy: new policy
 * @param: structure containing the new RT priority
 *
 * Calls __sched_setscheduler() with user == false, which skips the checks
 * that function guards with its 'user' flag.
 */
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
			       struct sched_param *param)
{
	return __sched_setscheduler(p, policy, param, false);
}
5206
5207static int
5208do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5209{
5210 struct sched_param lparam;
5211 struct task_struct *p;
5212 int retval;
5213
5214 if (!param || pid < 0)
5215 return -EINVAL;
5216 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5217 return -EFAULT;
5218
5219 rcu_read_lock();
5220 retval = -ESRCH;
5221 p = find_process_by_pid(pid);
5222 if (p != NULL)
5223 retval = sched_setscheduler(p, policy, &lparam);
5224 rcu_read_unlock();
5225
5226 return retval;
5227}
5228
5229
5230
5231
5232
5233
5234
5235SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
5236 struct sched_param __user *, param)
5237{
5238
5239 if (policy < 0)
5240 return -EINVAL;
5241
5242 return do_sched_setscheduler(pid, policy, param);
5243}
5244
5245
5246
5247
5248
5249
/**
 * sys_sched_setparam - set the scheduling parameters of a task
 * @pid: the pid in question
 * @param: user-space structure containing the new parameters
 *
 * Passes -1 as the policy to do_sched_setscheduler().
 */
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
	return do_sched_setscheduler(pid, -1, param);
}
5254
5255
5256
5257
5258
5259SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5260{
5261 struct task_struct *p;
5262 int retval;
5263
5264 if (pid < 0)
5265 return -EINVAL;
5266
5267 retval = -ESRCH;
5268 read_lock(&tasklist_lock);
5269 p = find_process_by_pid(pid);
5270 if (p) {
5271 retval = security_task_getscheduler(p);
5272 if (!retval)
5273 retval = p->policy;
5274 }
5275 read_unlock(&tasklist_lock);
5276 return retval;
5277}
5278
5279
5280
5281
5282
5283
5284SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5285{
5286 struct sched_param lp;
5287 struct task_struct *p;
5288 int retval;
5289
5290 if (!param || pid < 0)
5291 return -EINVAL;
5292
5293 read_lock(&tasklist_lock);
5294 p = find_process_by_pid(pid);
5295 retval = -ESRCH;
5296 if (!p)
5297 goto out_unlock;
5298
5299 retval = security_task_getscheduler(p);
5300 if (retval)
5301 goto out_unlock;
5302
5303 lp.sched_priority = p->rt_priority;
5304 read_unlock(&tasklist_lock);
5305
5306
5307
5308
5309 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5310
5311 return retval;
5312
5313out_unlock:
5314 read_unlock(&tasklist_lock);
5315 return retval;
5316}
5317
/*
 * sched_setaffinity - set the CPU affinity mask of a task
 * @pid: pid of the task (0 selects the current task)
 * @in_mask: requested mask; it is copied, the caller's mask is not modified
 *
 * Returns 0 on success, -ESRCH if no task is found, -EPERM if the caller
 * is neither the owner nor CAP_SYS_NICE capable, or the error from
 * security_task_setscheduler() / set_cpus_allowed_ptr().
 */
long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
{
	cpumask_t cpus_allowed;
	cpumask_t new_mask = *in_mask;
	struct task_struct *p;
	int retval;

	get_online_cpus();
	read_lock(&tasklist_lock);

	p = find_process_by_pid(pid);
	if (!p) {
		read_unlock(&tasklist_lock);
		put_online_cpus();
		return -ESRCH;
	}

	/*
	 * Take a reference on the task before tasklist_lock is released;
	 * it is dropped again at out_unlock.
	 */
	get_task_struct(p);
	read_unlock(&tasklist_lock);

	retval = -EPERM;
	if ((current->euid != p->euid) && (current->euid != p->uid) &&
			!capable(CAP_SYS_NICE))
		goto out_unlock;

	retval = security_task_setscheduler(p, 0, NULL);
	if (retval)
		goto out_unlock;

	/* Restrict the request to what the task's cpuset allows. */
	cpuset_cpus_allowed(p, &cpus_allowed);
	cpus_and(new_mask, new_mask, cpus_allowed);
again:
	retval = set_cpus_allowed_ptr(p, &new_mask);

	if (!retval) {
		cpuset_cpus_allowed(p, &cpus_allowed);
		if (!cpus_subset(new_mask, cpus_allowed)) {
			/*
			 * The cpuset's allowed mask no longer covers the mask
			 * just applied: fall back to the cpuset's mask and
			 * apply again.
			 */
			new_mask = cpus_allowed;
			goto again;
		}
	}
out_unlock:
	put_task_struct(p);
	put_online_cpus();
	return retval;
}
5374
5375static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5376 cpumask_t *new_mask)
5377{
5378 if (len < sizeof(cpumask_t)) {
5379 memset(new_mask, 0, sizeof(cpumask_t));
5380 } else if (len > sizeof(cpumask_t)) {
5381 len = sizeof(cpumask_t);
5382 }
5383 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5384}
5385
5386
5387
5388
5389
5390
5391
5392SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
5393 unsigned long __user *, user_mask_ptr)
5394{
5395 cpumask_t new_mask;
5396 int retval;
5397
5398 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
5399 if (retval)
5400 return retval;
5401
5402 return sched_setaffinity(pid, &new_mask);
5403}
5404
5405long sched_getaffinity(pid_t pid, cpumask_t *mask)
5406{
5407 struct task_struct *p;
5408 int retval;
5409
5410 get_online_cpus();
5411 read_lock(&tasklist_lock);
5412
5413 retval = -ESRCH;
5414 p = find_process_by_pid(pid);
5415 if (!p)
5416 goto out_unlock;
5417
5418 retval = security_task_getscheduler(p);
5419 if (retval)
5420 goto out_unlock;
5421
5422 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
5423
5424out_unlock:
5425 read_unlock(&tasklist_lock);
5426 put_online_cpus();
5427
5428 return retval;
5429}
5430
5431
5432
5433
5434
5435
5436
5437SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
5438 unsigned long __user *, user_mask_ptr)
5439{
5440 int ret;
5441 cpumask_t mask;
5442
5443 if (len < sizeof(cpumask_t))
5444 return -EINVAL;
5445
5446 ret = sched_getaffinity(pid, &mask);
5447 if (ret < 0)
5448 return ret;
5449
5450 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
5451 return -EFAULT;
5452
5453 return sizeof(cpumask_t);
5454}
5455
5456
5457
5458
5459
5460
5461
/**
 * sys_sched_yield - yield the current processor
 *
 * Invokes the current task's sched_class->yield_task() under the local
 * runqueue lock and then calls schedule(). Always returns 0.
 */
SYSCALL_DEFINE0(sched_yield)
{
	struct rq *rq = this_rq_lock();

	schedstat_inc(rq, yld_count);
	current->sched_class->yield_task(rq);

	/*
	 * The runqueue lock is released by hand (annotation, lockdep release,
	 * raw unlock) and preemption is re-enabled with the no_resched
	 * variant, because schedule() is called directly right afterwards.
	 */
	__release(rq->lock);
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
	_raw_spin_unlock(&rq->lock);
	preempt_enable_no_resched();

	schedule();

	return 0;
}
5482
/*
 * __cond_resched - reschedule until need_resched() is clear
 *
 * Common helper for the cond_resched*() variants in this file.
 */
static void __cond_resched(void)
{
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
	__might_sleep(__FILE__, __LINE__);
#endif
	/*
	 * PREEMPT_ACTIVE is added to the preempt count around each
	 * schedule() call; _cond_resched() tests the same bit and does
	 * nothing while it is set.
	 */
	do {
		add_preempt_count(PREEMPT_ACTIVE);
		schedule();
		sub_preempt_count(PREEMPT_ACTIVE);
	} while (need_resched());
}
5499
5500int __sched _cond_resched(void)
5501{
5502 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
5503 system_state == SYSTEM_RUNNING) {
5504 __cond_resched();
5505 return 1;
5506 }
5507 return 0;
5508}
5509EXPORT_SYMBOL(_cond_resched);
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519int cond_resched_lock(spinlock_t *lock)
5520{
5521 int resched = need_resched() && system_state == SYSTEM_RUNNING;
5522 int ret = 0;
5523
5524 if (spin_needbreak(lock) || resched) {
5525 spin_unlock(lock);
5526 if (resched && need_resched())
5527 __cond_resched();
5528 else
5529 cpu_relax();
5530 ret = 1;
5531 spin_lock(lock);
5532 }
5533 return ret;
5534}
5535EXPORT_SYMBOL(cond_resched_lock);
5536
5537int __sched cond_resched_softirq(void)
5538{
5539 BUG_ON(!in_softirq());
5540
5541 if (need_resched() && system_state == SYSTEM_RUNNING) {
5542 local_bh_enable();
5543 __cond_resched();
5544 local_bh_disable();
5545 return 1;
5546 }
5547 return 0;
5548}
5549EXPORT_SYMBOL(cond_resched_softirq);
5550
5551
5552
5553
5554
5555
5556
/**
 * yield - yield the current processor to other threads
 *
 * Sets the current task's state to TASK_RUNNING and then invokes the
 * sched_yield system call implementation.
 */
void __sched yield(void)
{
	set_current_state(TASK_RUNNING);
	sys_sched_yield();
}
EXPORT_SYMBOL(yield);
5563
5564
5565
5566
5567
5568
5569
5570
/*
 * io_schedule - schedule() with I/O-wait accounting
 *
 * Brackets schedule() with block-I/O delay accounting and holds the
 * nr_iowait counter of the runqueue looked up on entry incremented for
 * the duration of the sleep.
 */
void __sched io_schedule(void)
{
	/* rq is fetched once; the same runqueue is decremented after waking. */
	struct rq *rq = &__raw_get_cpu_var(runqueues);

	delayacct_blkio_start();
	atomic_inc(&rq->nr_iowait);
	schedule();
	atomic_dec(&rq->nr_iowait);
	delayacct_blkio_end();
}
EXPORT_SYMBOL(io_schedule);
5582
/*
 * io_schedule_timeout - schedule_timeout() with I/O-wait accounting
 * @timeout: timeout passed to schedule_timeout()
 *
 * Same accounting as io_schedule(). Returns the value returned by
 * schedule_timeout().
 */
long __sched io_schedule_timeout(long timeout)
{
	/* rq is fetched once; the same runqueue is decremented after waking. */
	struct rq *rq = &__raw_get_cpu_var(runqueues);
	long ret;

	delayacct_blkio_start();
	atomic_inc(&rq->nr_iowait);
	ret = schedule_timeout(timeout);
	atomic_dec(&rq->nr_iowait);
	delayacct_blkio_end();
	return ret;
}
5595
5596
5597
5598
5599
5600
5601
5602
5603SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5604{
5605 int ret = -EINVAL;
5606
5607 switch (policy) {
5608 case SCHED_FIFO:
5609 case SCHED_RR:
5610 ret = MAX_USER_RT_PRIO-1;
5611 break;
5612 case SCHED_NORMAL:
5613 case SCHED_BATCH:
5614 case SCHED_IDLE:
5615 ret = 0;
5616 break;
5617 }
5618 return ret;
5619}
5620
5621
5622
5623
5624
5625
5626
5627
5628SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5629{
5630 int ret = -EINVAL;
5631
5632 switch (policy) {
5633 case SCHED_FIFO:
5634 case SCHED_RR:
5635 ret = 1;
5636 break;
5637 case SCHED_NORMAL:
5638 case SCHED_BATCH:
5639 case SCHED_IDLE:
5640 ret = 0;
5641 }
5642 return ret;
5643}
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5654 struct timespec __user *, interval)
5655{
5656 struct task_struct *p;
5657 unsigned int time_slice;
5658 int retval;
5659 struct timespec t;
5660
5661 if (pid < 0)
5662 return -EINVAL;
5663
5664 retval = -ESRCH;
5665 read_lock(&tasklist_lock);
5666 p = find_process_by_pid(pid);
5667 if (!p)
5668 goto out_unlock;
5669
5670 retval = security_task_getscheduler(p);
5671 if (retval)
5672 goto out_unlock;
5673
5674
5675
5676
5677
5678 time_slice = 0;
5679 if (p->policy == SCHED_RR) {
5680 time_slice = DEF_TIMESLICE;
5681 } else if (p->policy != SCHED_FIFO) {
5682 struct sched_entity *se = &p->se;
5683 unsigned long flags;
5684 struct rq *rq;
5685
5686 rq = task_rq_lock(p, &flags);
5687 if (rq->cfs.load.weight)
5688 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
5689 task_rq_unlock(rq, &flags);
5690 }
5691 read_unlock(&tasklist_lock);
5692 jiffies_to_timespec(time_slice, &t);
5693 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5694 return retval;
5695
5696out_unlock:
5697 read_unlock(&tasklist_lock);
5698 return retval;
5699}
5700
/* One character per task state, indexed by the 'state' value computed below. */
static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;

/*
 * sched_show_task - print a one-line summary and the stack of a task
 * @p: task to show
 *
 * The line contains the command name, a state character, either a
 * "running" marker or the saved PC, the free stack figure (0 unless
 * CONFIG_DEBUG_STACK_USAGE is set), the pid and the parent's pid.
 */
void sched_show_task(struct task_struct *p)
{
	unsigned long free = 0;
	unsigned state;

	/* Index into stat_nam: 0 for state 0, else lowest set bit number + 1. */
	state = p->state ? __ffs(p->state) + 1 : 0;
	printk(KERN_INFO "%-13.13s %c", p->comm,
		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if BITS_PER_LONG == 32
	if (state == TASK_RUNNING)
		printk(KERN_CONT " running ");
	else
		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
#else
	if (state == TASK_RUNNING)
		printk(KERN_CONT " running task ");
	else
		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
	{
		/*
		 * Count the zero words starting at end_of_stack(p) and
		 * report their size in bytes as the free stack space.
		 */
		unsigned long *n = end_of_stack(p);
		while (!*n)
			n++;
		free = (unsigned long)n - (unsigned long)end_of_stack(p);
	}
#endif
	printk(KERN_CONT "%5lu %5d %6d\n", free,
		task_pid_nr(p), task_pid_nr(p->real_parent));

	show_stack(p, NULL);
}
5735
/*
 * show_state_filter - dump every thread matching a state filter
 * @state_filter: bitmask tested against p->state; 0 selects all threads
 *
 * Prints a header line and then calls sched_show_task() for each selected
 * thread while holding tasklist_lock for reading.
 */
void show_state_filter(unsigned long state_filter)
{
	struct task_struct *g, *p;

#if BITS_PER_LONG == 32
	printk(KERN_INFO
		" task PC stack pid father\n");
#else
	printk(KERN_INFO
		" task PC stack pid father\n");
#endif
	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		/*
		 * The NMI watchdog is touched on every iteration.
		 * NOTE(review): presumably because dumping many tasks to the
		 * console can take long enough to trip it — confirm.
		 */
		touch_nmi_watchdog();
		if (!state_filter || (p->state & state_filter))
			sched_show_task(p);
	} while_each_thread(g, p);

	touch_all_softlockup_watchdogs();

#ifdef CONFIG_SCHED_DEBUG
	sysrq_sched_debug_show();
#endif
	read_unlock(&tasklist_lock);
	/*
	 * Lock information is dumped only when the filter has all bits set.
	 */
	if (state_filter == -1)
		debug_show_all_locks();
}
5770
/*
 * init_idle_bootup_task - assign idle_sched_class to the given task
 * @idle: task to modify
 */
void __cpuinit init_idle_bootup_task(struct task_struct *idle)
{
	idle->sched_class = &idle_sched_class;
}
5775
5776
5777
5778
5779
5780
5781
5782
5783
/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: CPU the idle task belongs to
 *
 * Resets the task's scheduler state, pins it to @cpu and installs it as
 * both the current and the idle task of that CPU's runqueue.
 */
void __cpuinit init_idle(struct task_struct *idle, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	__sched_fork(idle);
	idle->se.exec_start = sched_clock();

	idle->prio = idle->normal_prio = MAX_PRIO;
	/* The idle task is only allowed to run on its own CPU. */
	idle->cpus_allowed = cpumask_of_cpu(cpu);
	__set_task_cpu(idle, cpu);

	spin_lock_irqsave(&rq->lock, flags);
	rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
	idle->oncpu = 1;
#endif
	spin_unlock_irqrestore(&rq->lock, flags);

	/*
	 * Set the preempt count: with CONFIG_PREEMPT it is 1 when
	 * lock_depth >= 0 and 0 otherwise; without CONFIG_PREEMPT it is 0.
	 */
#if defined(CONFIG_PREEMPT)
	task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
#else
	task_thread_info(idle)->preempt_count = 0;
#endif
	/*
	 * The scheduling class is set last, after the rest of the idle
	 * task's state is in place.
	 */
	idle->sched_class = &idle_sched_class;
}
5814
5815
5816
5817
5818
5819
5820
5821
/*
 * Global CPU mask, initially empty.
 * NOTE(review): the name suggests it tracks CPUs whose periodic tick is
 * stopped (nohz); the users are outside this section — confirm there.
 */
cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833static inline void sched_init_granularity(void)
5834{
5835 unsigned int factor = 1 + ilog2(num_online_cpus());
5836 const unsigned long limit = 200000000;
5837
5838 sysctl_sched_min_granularity *= factor;
5839 if (sysctl_sched_min_granularity > limit)
5840 sysctl_sched_min_granularity = limit;
5841
5842 sysctl_sched_latency *= factor;
5843 if (sysctl_sched_latency > limit)
5844 sysctl_sched_latency = limit;
5845
5846 sysctl_sched_wakeup_granularity *= factor;
5847
5848 sysctl_sched_shares_ratelimit *= factor;
5849}
5850
5851#ifdef CONFIG_SMP
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
/*
 * set_cpus_allowed_ptr - change the set of CPUs a task may run on
 * @p: task to modify
 * @new_mask: new allowed-CPU mask
 *
 * Returns 0 on success. Returns -EINVAL when @new_mask contains no online
 * CPU, or when @p is PF_THREAD_BOUND, is not the current task and
 * @new_mask differs from its present mask.
 *
 * If the task's current CPU is not in @new_mask and migrate_task()
 * queues a request, this function waits for that request to complete
 * before returning.
 */
int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
{
	struct migration_req req;
	unsigned long flags;
	struct rq *rq;
	int ret = 0;

	rq = task_rq_lock(p, &flags);
	if (!cpus_intersects(*new_mask, cpu_online_map)) {
		ret = -EINVAL;
		goto out;
	}

	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
		     !cpus_equal(p->cpus_allowed, *new_mask))) {
		ret = -EINVAL;
		goto out;
	}

	/* Let the scheduling class apply the mask if it provides a hook. */
	if (p->sched_class->set_cpus_allowed)
		p->sched_class->set_cpus_allowed(p, new_mask);
	else {
		p->cpus_allowed = *new_mask;
		p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
	}

	/* Nothing more to do if the task's current CPU is still allowed. */
	if (cpu_isset(task_cpu(p), *new_mask))
		goto out;

	if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
		/* Drop the lock, wake the migration thread and wait for it. */
		task_rq_unlock(rq, &flags);
		wake_up_process(rq->migration_thread);
		wait_for_completion(&req.done);
		tlb_migrate_finish(p->mm);
		return 0;
	}
out:
	task_rq_unlock(rq, &flags);

	return ret;
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
/*
 * __migrate_task - move a task from one CPU's runqueue to another's
 * @p: task to move
 * @src_cpu: CPU the task is expected to be on
 * @dest_cpu: CPU to move it to
 *
 * Returns 1 if the task was moved or was already off @src_cpu. Returns 0
 * if @dest_cpu is not active or is not in the task's allowed mask.
 */
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
	struct rq *rq_dest, *rq_src;
	int ret = 0, on_rq;

	if (unlikely(!cpu_active(dest_cpu)))
		return ret;

	rq_src = cpu_rq(src_cpu);
	rq_dest = cpu_rq(dest_cpu);

	double_rq_lock(rq_src, rq_dest);
	/* The task has already left src_cpu: report success. */
	if (task_cpu(p) != src_cpu)
		goto done;
	/* dest_cpu is not in the task's allowed mask: report failure. */
	if (!cpu_isset(dest_cpu, p->cpus_allowed))
		goto fail;

	on_rq = p->se.on_rq;
	if (on_rq)
		deactivate_task(rq_src, p, 0);

	set_task_cpu(p, dest_cpu);
	if (on_rq) {
		activate_task(rq_dest, p, 0);
		check_preempt_curr(rq_dest, p, 0);
	}
done:
	ret = 1;
fail:
	double_rq_unlock(rq_src, rq_dest);
	return ret;
}
5967
5968
5969
5970
5971
5972
/*
 * migration_thread - per-CPU kernel thread servicing migration requests
 * @data: CPU number, passed as a pointer-sized integer
 *
 * Loops until kthread_should_stop(): performs a pending active load
 * balance, then takes one request from the runqueue's migration_queue,
 * migrates the task and completes the request. Sleeps when the queue is
 * empty. Always returns 0.
 */
static int migration_thread(void *data)
{
	int cpu = (long)data;
	struct rq *rq;

	rq = cpu_rq(cpu);
	BUG_ON(rq->migration_thread != current);

	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		struct migration_req *req;
		struct list_head *head;

		spin_lock_irq(&rq->lock);

		/* Once this CPU is offline, stop working and wait to be stopped. */
		if (cpu_is_offline(cpu)) {
			spin_unlock_irq(&rq->lock);
			goto wait_to_die;
		}

		if (rq->active_balance) {
			active_load_balance(rq, cpu);
			rq->active_balance = 0;
		}

		head = &rq->migration_queue;

		if (list_empty(head)) {
			spin_unlock_irq(&rq->lock);
			schedule();
			set_current_state(TASK_INTERRUPTIBLE);
			continue;
		}
		req = list_entry(head->next, struct migration_req, list);
		list_del_init(head->next);

		/*
		 * Only the lock is dropped here; interrupts are re-enabled
		 * after __migrate_task() has returned.
		 */
		spin_unlock(&rq->lock);
		__migrate_task(req->task, cpu, req->dest_cpu);
		local_irq_enable();

		complete(&req->done);
	}
	__set_current_state(TASK_RUNNING);
	return 0;

wait_to_die:
	/* Sleep until kthread_should_stop() becomes true. */
	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		schedule();
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}
6028
6029#ifdef CONFIG_HOTPLUG_CPU
6030
6031static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
6032{
6033 int ret;
6034