/*
 *  Core kernel scheduler code and related syscalls.
 */
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <asm/mmu_context.h>
36#include <linux/interrupt.h>
37#include <linux/capability.h>
38#include <linux/completion.h>
39#include <linux/kernel_stat.h>
40#include <linux/debug_locks.h>
41#include <linux/perf_event.h>
42#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
45#include <linux/freezer.h>
46#include <linux/vmalloc.h>
47#include <linux/blkdev.h>
48#include <linux/delay.h>
49#include <linux/pid_namespace.h>
50#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
57#include <linux/proc_fs.h>
58#include <linux/seq_file.h>
59#include <linux/sysctl.h>
60#include <linux/syscalls.h>
61#include <linux/times.h>
62#include <linux/tsacct_kern.h>
63#include <linux/kprobes.h>
64#include <linux/delayacct.h>
65#include <linux/unistd.h>
66#include <linux/pagemap.h>
67#include <linux/hrtimer.h>
68#include <linux/tick.h>
69#include <linux/debugfs.h>
70#include <linux/ctype.h>
71#include <linux/ftrace.h>
72#include <linux/slab.h>
73#include <linux/init_task.h>
74#include <linux/binfmts.h>
75#include <linux/context_tracking.h>
76
77#include <asm/switch_to.h>
78#include <asm/tlb.h>
79#include <asm/irq_regs.h>
80#include <asm/mutex.h>
81#ifdef CONFIG_PARAVIRT
82#include <asm/paravirt.h>
83#endif
84
85#include "sched.h"
86#include "../workqueue_sched.h"
87#include "../smpboot.h"
88
89#define CREATE_TRACE_POINTS
90#include <trace/events/sched.h>
91
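/*
 * Arm the bandwidth period timer: forward it past the current time in
 * whole periods and (re)start it, retrying until the timer is seen
 * active.
 */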
92void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
93{
94 unsigned long delta;
95 ktime_t soft, hard, now;
96
97 for (;;) {
98 if (hrtimer_active(period_timer))
99 break;
100
101 now = hrtimer_cb_get_time(period_timer);
102 hrtimer_forward(period_timer, now, period);
103
104 soft = hrtimer_get_softexpires(period_timer);
105 hard = hrtimer_get_expires(period_timer);
106 delta = ktime_to_ns(ktime_sub(hard, soft));
107 __hrtimer_start_range_ns(period_timer, soft, delta,
108 HRTIMER_MODE_ABS_PINNED, 0);
109 }
110}
111
112DEFINE_MUTEX(sched_domains_mutex);
113DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
114
115static void update_rq_clock_task(struct rq *rq, s64 delta);
116
117void update_rq_clock(struct rq *rq)
118{
119 s64 delta;
120
121 if (rq->skip_clock_update > 0)
122 return;
123
124 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
125 rq->clock += delta;
126 update_rq_clock_task(rq, delta);
127}
128
129
130
131
132
133#define SCHED_FEAT(name, enabled) \
134 (1UL << __SCHED_FEAT_##name) * enabled |
135
136const_debug unsigned int sysctl_sched_features =
137#include "features.h"
138 0;
139
140#undef SCHED_FEAT
141
142#ifdef CONFIG_SCHED_DEBUG
143#define SCHED_FEAT(name, enabled) \
144 #name ,
145
146static const char * const sched_feat_names[] = {
147#include "features.h"
148};
149
150#undef SCHED_FEAT
151
152static int sched_feat_show(struct seq_file *m, void *v)
153{
154 int i;
155
156 for (i = 0; i < __SCHED_FEAT_NR; i++) {
157 if (!(sysctl_sched_features & (1UL << i)))
158 seq_puts(m, "NO_");
159 seq_printf(m, "%s ", sched_feat_names[i]);
160 }
161 seq_puts(m, "\n");
162
163 return 0;
164}
165
166#ifdef HAVE_JUMP_LABEL
167
168#define jump_label_key__true STATIC_KEY_INIT_TRUE
169#define jump_label_key__false STATIC_KEY_INIT_FALSE
170
171#define SCHED_FEAT(name, enabled) \
172 jump_label_key__##enabled ,
173
174struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
175#include "features.h"
176};
177
178#undef SCHED_FEAT
179
180static void sched_feat_disable(int i)
181{
182 if (static_key_enabled(&sched_feat_keys[i]))
183 static_key_slow_dec(&sched_feat_keys[i]);
184}
185
186static void sched_feat_enable(int i)
187{
188 if (!static_key_enabled(&sched_feat_keys[i]))
189 static_key_slow_inc(&sched_feat_keys[i]);
190}
191#else
192static void sched_feat_disable(int i) { };
193static void sched_feat_enable(int i) { };
194#endif
195
196static int sched_feat_set(char *cmp)
197{
198 int i;
199 int neg = 0;
200
201 if (strncmp(cmp, "NO_", 3) == 0) {
202 neg = 1;
203 cmp += 3;
204 }
205
206 for (i = 0; i < __SCHED_FEAT_NR; i++) {
207 if (strcmp(cmp, sched_feat_names[i]) == 0) {
208 if (neg) {
209 sysctl_sched_features &= ~(1UL << i);
210 sched_feat_disable(i);
211 } else {
212 sysctl_sched_features |= (1UL << i);
213 sched_feat_enable(i);
214 }
215 break;
216 }
217 }
218
219 return i;
220}
221
222static ssize_t
223sched_feat_write(struct file *filp, const char __user *ubuf,
224 size_t cnt, loff_t *ppos)
225{
226 char buf[64];
227 char *cmp;
228 int i;
229
230 if (cnt > 63)
231 cnt = 63;
232
233 if (copy_from_user(&buf, ubuf, cnt))
234 return -EFAULT;
235
236 buf[cnt] = 0;
237 cmp = strstrip(buf);
238
239 i = sched_feat_set(cmp);
240 if (i == __SCHED_FEAT_NR)
241 return -EINVAL;
242
243 *ppos += cnt;
244
245 return cnt;
246}
247
248static int sched_feat_open(struct inode *inode, struct file *filp)
249{
250 return single_open(filp, sched_feat_show, NULL);
251}
252
253static const struct file_operations sched_feat_fops = {
254 .open = sched_feat_open,
255 .write = sched_feat_write,
256 .read = seq_read,
257 .llseek = seq_lseek,
258 .release = single_release,
259};
260
261static __init int sched_init_debug(void)
262{
263 debugfs_create_file("sched_features", 0644, NULL, NULL,
264 &sched_feat_fops);
265
266 return 0;
267}
268late_initcall(sched_init_debug);
269#endif
270
271
272
273
274
275const_debug unsigned int sysctl_sched_nr_migrate = 32;
276
277
278
279
280
281
282
283const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
284
285
286
287
288
289unsigned int sysctl_sched_rt_period = 1000000;
290
291__read_mostly int scheduler_running;
292
293
294
295
296
297int sysctl_sched_rt_runtime = 950000;
298
299
300
301
302
303
304static inline struct rq *__task_rq_lock(struct task_struct *p)
305 __acquires(rq->lock)
306{
307 struct rq *rq;
308
309 lockdep_assert_held(&p->pi_lock);
310
311 for (;;) {
312 rq = task_rq(p);
313 raw_spin_lock(&rq->lock);
314 if (likely(rq == task_rq(p)))
315 return rq;
316 raw_spin_unlock(&rq->lock);
317 }
318}
319
320
321
322
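/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */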
323static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
324 __acquires(p->pi_lock)
325 __acquires(rq->lock)
326{
327 struct rq *rq;
328
329 for (;;) {
330 raw_spin_lock_irqsave(&p->pi_lock, *flags);
331 rq = task_rq(p);
332 raw_spin_lock(&rq->lock);
333 if (likely(rq == task_rq(p)))
334 return rq;
335 raw_spin_unlock(&rq->lock);
336 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
337 }
338}
339
340static void __task_rq_unlock(struct rq *rq)
341 __releases(rq->lock)
342{
343 raw_spin_unlock(&rq->lock);
344}
345
346static inline void
347task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
348 __releases(rq->lock)
349 __releases(p->pi_lock)
350{
351 raw_spin_unlock(&rq->lock);
352 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
353}
354
355
356
357
358static struct rq *this_rq_lock(void)
359 __acquires(rq->lock)
360{
361 struct rq *rq;
362
363 local_irq_disable();
364 rq = this_rq();
365 raw_spin_lock(&rq->lock);
366
367 return rq;
368}
369
370#ifdef CONFIG_SCHED_HRTICK
371
372
373
374
375
376
377
378
379
380
381
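/*
 * High-resolution scheduler tick (CONFIG_SCHED_HRTICK): use hrtimers to
 * deliver accurate preemption points.
 */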
382static void hrtick_clear(struct rq *rq)
383{
384 if (hrtimer_active(&rq->hrtick_timer))
385 hrtimer_cancel(&rq->hrtick_timer);
386}
387
388
389
390
391
392static enum hrtimer_restart hrtick(struct hrtimer *timer)
393{
394 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
395
396 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
397
398 raw_spin_lock(&rq->lock);
399 update_rq_clock(rq);
400 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
401 raw_spin_unlock(&rq->lock);
402
403 return HRTIMER_NORESTART;
404}
405
406#ifdef CONFIG_SMP
407
408
409
410static void __hrtick_start(void *arg)
411{
412 struct rq *rq = arg;
413
414 raw_spin_lock(&rq->lock);
415 hrtimer_restart(&rq->hrtick_timer);
416 rq->hrtick_csd_pending = 0;
417 raw_spin_unlock(&rq->lock);
418}
419
420
421
422
423
424
425void hrtick_start(struct rq *rq, u64 delay)
426{
427 struct hrtimer *timer = &rq->hrtick_timer;
428 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
429
430 hrtimer_set_expires(timer, time);
431
432 if (rq == this_rq()) {
433 hrtimer_restart(timer);
434 } else if (!rq->hrtick_csd_pending) {
435 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
436 rq->hrtick_csd_pending = 1;
437 }
438}
439
440static int
441hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
442{
443 int cpu = (int)(long)hcpu;
444
445 switch (action) {
446 case CPU_UP_CANCELED:
447 case CPU_UP_CANCELED_FROZEN:
448 case CPU_DOWN_PREPARE:
449 case CPU_DOWN_PREPARE_FROZEN:
450 case CPU_DEAD:
451 case CPU_DEAD_FROZEN:
452 hrtick_clear(cpu_rq(cpu));
453 return NOTIFY_OK;
454 }
455
456 return NOTIFY_DONE;
457}
458
459static __init void init_hrtick(void)
460{
461 hotcpu_notifier(hotplug_hrtick, 0);
462}
463#else
464
465
466
467
468
469void hrtick_start(struct rq *rq, u64 delay)
470{
471 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
472 HRTIMER_MODE_REL_PINNED, 0);
473}
474
475static inline void init_hrtick(void)
476{
477}
478#endif
479
480static void init_rq_hrtick(struct rq *rq)
481{
482#ifdef CONFIG_SMP
483 rq->hrtick_csd_pending = 0;
484
485 rq->hrtick_csd.flags = 0;
486 rq->hrtick_csd.func = __hrtick_start;
487 rq->hrtick_csd.info = rq;
488#endif
489
490 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
491 rq->hrtick_timer.function = hrtick;
492}
493#else
494static inline void hrtick_clear(struct rq *rq)
495{
496}
497
498static inline void init_rq_hrtick(struct rq *rq)
499{
500}
501
502static inline void init_hrtick(void)
503{
504}
505#endif
506
507
508
509
510
511
512
513
514#ifdef CONFIG_SMP
515
516#ifndef tsk_is_polling
517#define tsk_is_polling(t) 0
518#endif
519
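/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * Sets the need_resched flag and, if the task is running on another
 * CPU that is not polling need_resched, sends a reschedule IPI.
 */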
520void resched_task(struct task_struct *p)
521{
522 int cpu;
523
524 assert_raw_spin_locked(&task_rq(p)->lock);
525
526 if (test_tsk_need_resched(p))
527 return;
528
529 set_tsk_need_resched(p);
530
531 cpu = task_cpu(p);
532 if (cpu == smp_processor_id())
533 return;
534
535
536 smp_mb();
537 if (!tsk_is_polling(p))
538 smp_send_reschedule(cpu);
539}
540
541void resched_cpu(int cpu)
542{
543 struct rq *rq = cpu_rq(cpu);
544 unsigned long flags;
545
546 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
547 return;
548 resched_task(cpu_curr(cpu));
549 raw_spin_unlock_irqrestore(&rq->lock, flags);
550}
551
552#ifdef CONFIG_NO_HZ
553
554
555
556
557
558
559
560
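/*
 * Pick a target CPU for migratable timers: prefer a non-idle CPU found
 * in this CPU's sched domains so that a fully idle CPU need not be
 * woken up just to service a timer.
 */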
561int get_nohz_timer_target(void)
562{
563 int cpu = smp_processor_id();
564 int i;
565 struct sched_domain *sd;
566
567 rcu_read_lock();
568 for_each_domain(cpu, sd) {
569 for_each_cpu(i, sched_domain_span(sd)) {
570 if (!idle_cpu(i)) {
571 cpu = i;
572 goto unlock;
573 }
574 }
575 }
576unlock:
577 rcu_read_unlock();
578 return cpu;
579}
580
581
582
583
584
585
586
587
588
589
590void wake_up_idle_cpu(int cpu)
591{
592 struct rq *rq = cpu_rq(cpu);
593
594 if (cpu == smp_processor_id())
595 return;
596
597
598
599
600
601
602
603
604 if (rq->curr != rq->idle)
605 return;
606
607
608
609
610
611
612 set_tsk_need_resched(rq->idle);
613
614
615 smp_mb();
616 if (!tsk_is_polling(rq->idle))
617 smp_send_reschedule(cpu);
618}
619
620static inline bool got_nohz_idle_kick(void)
621{
622 int cpu = smp_processor_id();
623 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
624}
625
626#else
627
628static inline bool got_nohz_idle_kick(void)
629{
630 return false;
631}
632
633#endif
634
635void sched_avg_update(struct rq *rq)
636{
637 s64 period = sched_avg_period();
638
639 while ((s64)(rq->clock - rq->age_stamp) > period) {
640
641
642
643
644
645 asm("" : "+rm" (rq->age_stamp));
646 rq->age_stamp += period;
647 rq->rt_avg /= 2;
648 }
649}
650
651#else
652void resched_task(struct task_struct *p)
653{
654 assert_raw_spin_locked(&task_rq(p)->lock);
655 set_tsk_need_resched(p);
656}
657#endif
658
659#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
660 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
661
662
663
664
665
666
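/*
 * walk_tg_tree_from - iterate the task_group tree rooted at *from,
 * calling @down when first visiting a group and @up when leaving it.
 * A non-zero return value from either callback aborts the walk.
 * Must be called with the RCU read lock (or equivalent) held.
 */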
667int walk_tg_tree_from(struct task_group *from,
668 tg_visitor down, tg_visitor up, void *data)
669{
670 struct task_group *parent, *child;
671 int ret;
672
673 parent = from;
674
675down:
676 ret = (*down)(parent, data);
677 if (ret)
678 goto out;
679 list_for_each_entry_rcu(child, &parent->children, siblings) {
680 parent = child;
681 goto down;
682
683up:
684 continue;
685 }
686 ret = (*up)(parent, data);
687 if (ret || parent == from)
688 goto out;
689
690 child = parent;
691 parent = parent->parent;
692 if (parent)
693 goto up;
694out:
695 return ret;
696}
697
698int tg_nop(struct task_group *tg, void *data)
699{
700 return 0;
701}
702#endif
703
704static void set_load_weight(struct task_struct *p)
705{
706 int prio = p->static_prio - MAX_RT_PRIO;
707 struct load_weight *load = &p->se.load;
708
709
710
711
712 if (p->policy == SCHED_IDLE) {
713 load->weight = scale_load(WEIGHT_IDLEPRIO);
714 load->inv_weight = WMULT_IDLEPRIO;
715 return;
716 }
717
718 load->weight = scale_load(prio_to_weight[prio]);
719 load->inv_weight = prio_to_wmult[prio];
720}
721
722static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
723{
724 update_rq_clock(rq);
725 sched_info_queued(p);
726 p->sched_class->enqueue_task(rq, p, flags);
727}
728
729static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
730{
731 update_rq_clock(rq);
732 sched_info_dequeued(p);
733 p->sched_class->dequeue_task(rq, p, flags);
734}
735
736void activate_task(struct rq *rq, struct task_struct *p, int flags)
737{
738 if (task_contributes_to_load(p))
739 rq->nr_uninterruptible--;
740
741 enqueue_task(rq, p, flags);
742}
743
744void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
745{
746 if (task_contributes_to_load(p))
747 rq->nr_uninterruptible++;
748
749 dequeue_task(rq, p, flags);
750}
751
752static void update_rq_clock_task(struct rq *rq, s64 delta)
753{
754
755
756
757
758#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
759 s64 steal = 0, irq_delta = 0;
760#endif
761#ifdef CONFIG_IRQ_TIME_ACCOUNTING
762 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779 if (irq_delta > delta)
780 irq_delta = delta;
781
782 rq->prev_irq_time += irq_delta;
783 delta -= irq_delta;
784#endif
785#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
787 u64 st;
788
789 steal = paravirt_steal_clock(cpu_of(rq));
790 steal -= rq->prev_steal_time_rq;
791
792 if (unlikely(steal > delta))
793 steal = delta;
794
795 st = steal_ticks(steal);
796 steal = st * TICK_NSEC;
797
798 rq->prev_steal_time_rq += steal;
799
800 delta -= steal;
801 }
802#endif
803
804 rq->clock_task += delta;
805
806#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
807 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
808 sched_rt_avg_update(rq, irq_delta + steal);
809#endif
810}
811
812void sched_set_stop_task(int cpu, struct task_struct *stop)
813{
814 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
815 struct task_struct *old_stop = cpu_rq(cpu)->stop;
816
817 if (stop) {
818
819
820
821
822
823
824
825
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
827
828 stop->sched_class = &stop_sched_class;
829 }
830
831 cpu_rq(cpu)->stop = stop;
832
833 if (old_stop) {
834
835
836
837
838 old_stop->sched_class = &rt_sched_class;
839 }
840}
841
842
843
844
845static inline int __normal_prio(struct task_struct *p)
846{
847 return p->static_prio;
848}
849
850
851
852
853
854
855
856
857static inline int normal_prio(struct task_struct *p)
858{
859 int prio;
860
861 if (task_has_rt_policy(p))
862 prio = MAX_RT_PRIO-1 - p->rt_priority;
863 else
864 prio = __normal_prio(p);
865 return prio;
866}
867
868
869
870
871
872
873
874
875static int effective_prio(struct task_struct *p)
876{
877 p->normal_prio = normal_prio(p);
878
879
880
881
882
883 if (!rt_prio(p->prio))
884 return p->normal_prio;
885 return p->prio;
886}
887
888
889
890
891
892inline int task_curr(const struct task_struct *p)
893{
894 return cpu_curr(task_cpu(p)) == p;
895}
896
897static inline void check_class_changed(struct rq *rq, struct task_struct *p,
898 const struct sched_class *prev_class,
899 int oldprio)
900{
901 if (prev_class != p->sched_class) {
902 if (prev_class->switched_from)
903 prev_class->switched_from(rq, p);
904 p->sched_class->switched_to(rq, p);
905 } else if (oldprio != p->prio)
906 p->sched_class->prio_changed(rq, p, oldprio);
907}
908
909void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
910{
911 const struct sched_class *class;
912
913 if (p->sched_class == rq->curr->sched_class) {
914 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
915 } else {
916 for_each_class(class) {
917 if (class == rq->curr->sched_class)
918 break;
919 if (class == p->sched_class) {
920 resched_task(rq->curr);
921 break;
922 }
923 }
924 }
925
926
927
928
929
930 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
931 rq->skip_clock_update = 1;
932}
933
934static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
935
936void register_task_migration_notifier(struct notifier_block *n)
937{
938 atomic_notifier_chain_register(&task_migration_notifier, n);
939}
940
941#ifdef CONFIG_SMP
942void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
943{
944#ifdef CONFIG_SCHED_DEBUG
945
946
947
948
949 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
950 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
951
952#ifdef CONFIG_LOCKDEP
953
954
955
956
957
958
959
960
961
962
963 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
964 lockdep_is_held(&task_rq(p)->lock)));
965#endif
966#endif
967
968 trace_sched_migrate_task(p, new_cpu);
969
970 if (task_cpu(p) != new_cpu) {
971 struct task_migration_notifier tmn;
972
973 if (p->sched_class->migrate_task_rq)
974 p->sched_class->migrate_task_rq(p, new_cpu);
975 p->se.nr_migrations++;
976 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
977
978 tmn.task = p;
979 tmn.from_cpu = task_cpu(p);
980 tmn.to_cpu = new_cpu;
981
982 atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
983 }
984
985 __set_task_cpu(p, new_cpu);
986}
987
988struct migration_arg {
989 struct task_struct *task;
990 int dest_cpu;
991};
992
993static int migration_cpu_stop(void *data);
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
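/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero and the task's state stops matching it,
 * return 0.  Otherwise spin (and, if the task is still queued, sleep)
 * until the task is completely off its runqueue, then return a nonzero
 * value derived from its voluntary context-switch count.
 */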
1011unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1012{
1013 unsigned long flags;
1014 int running, on_rq;
1015 unsigned long ncsw;
1016 struct rq *rq;
1017
1018 for (;;) {
1019
1020
1021
1022
1023
1024
1025 rq = task_rq(p);
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038 while (task_running(rq, p)) {
1039 if (match_state && unlikely(p->state != match_state))
1040 return 0;
1041 cpu_relax();
1042 }
1043
1044
1045
1046
1047
1048
1049 rq = task_rq_lock(p, &flags);
1050 trace_sched_wait_task(p);
1051 running = task_running(rq, p);
1052 on_rq = p->on_rq;
1053 ncsw = 0;
1054 if (!match_state || p->state == match_state)
1055 ncsw = p->nvcsw | LONG_MIN;
1056 task_rq_unlock(rq, p, &flags);
1057
1058
1059
1060
1061 if (unlikely(!ncsw))
1062 break;
1063
1064
1065
1066
1067
1068
1069
1070 if (unlikely(running)) {
1071 cpu_relax();
1072 continue;
1073 }
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084 if (unlikely(on_rq)) {
1085 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1086
1087 set_current_state(TASK_UNINTERRUPTIBLE);
1088 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1089 continue;
1090 }
1091
1092
1093
1094
1095
1096
1097 break;
1098 }
1099
1100 return ncsw;
1101}
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
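/*
 * kick_process - send a reschedule IPI to the CPU a remote task is
 * currently running on, forcing it through a kernel entry/exit cycle.
 */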
1116void kick_process(struct task_struct *p)
1117{
1118 int cpu;
1119
1120 preempt_disable();
1121 cpu = task_cpu(p);
1122 if ((cpu != smp_processor_id()) && task_curr(p))
1123 smp_send_reschedule(cpu);
1124 preempt_enable();
1125}
1126EXPORT_SYMBOL_GPL(kick_process);
1127#endif
1128
1129#ifdef CONFIG_SMP
1130
1131
1132
1133static int select_fallback_rq(int cpu, struct task_struct *p)
1134{
1135 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
1136 enum { cpuset, possible, fail } state = cpuset;
1137 int dest_cpu;
1138
1139
1140 for_each_cpu(dest_cpu, nodemask) {
1141 if (!cpu_online(dest_cpu))
1142 continue;
1143 if (!cpu_active(dest_cpu))
1144 continue;
1145 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1146 return dest_cpu;
1147 }
1148
1149 for (;;) {
1150
1151 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1152 if (!cpu_online(dest_cpu))
1153 continue;
1154 if (!cpu_active(dest_cpu))
1155 continue;
1156 goto out;
1157 }
1158
1159 switch (state) {
1160 case cpuset:
1161
1162 cpuset_cpus_allowed_fallback(p);
1163 state = possible;
1164 break;
1165
1166 case possible:
1167 do_set_cpus_allowed(p, cpu_possible_mask);
1168 state = fail;
1169 break;
1170
1171 case fail:
1172 BUG();
1173 break;
1174 }
1175 }
1176
1177out:
1178 if (state != cpuset) {
1179
1180
1181
1182
1183
1184 if (p->mm && printk_ratelimit()) {
1185 printk_sched("process %d (%s) no longer affine to cpu%d\n",
1186 task_pid_nr(p), p->comm, cpu);
1187 }
1188 }
1189
1190 return dest_cpu;
1191}
1192
1193
1194
1195
1196static inline
1197int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
1198{
1199 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1212 !cpu_online(cpu)))
1213 cpu = select_fallback_rq(task_cpu(p), p);
1214
1215 return cpu;
1216}
1217
1218static void update_avg(u64 *avg, u64 sample)
1219{
1220 s64 diff = sample - *avg;
1221 *avg += diff >> 3;
1222}
1223#endif
1224
1225static void
1226ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1227{
1228#ifdef CONFIG_SCHEDSTATS
1229 struct rq *rq = this_rq();
1230
1231#ifdef CONFIG_SMP
1232 int this_cpu = smp_processor_id();
1233
1234 if (cpu == this_cpu) {
1235 schedstat_inc(rq, ttwu_local);
1236 schedstat_inc(p, se.statistics.nr_wakeups_local);
1237 } else {
1238 struct sched_domain *sd;
1239
1240 schedstat_inc(p, se.statistics.nr_wakeups_remote);
1241 rcu_read_lock();
1242 for_each_domain(this_cpu, sd) {
1243 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1244 schedstat_inc(sd, ttwu_wake_remote);
1245 break;
1246 }
1247 }
1248 rcu_read_unlock();
1249 }
1250
1251 if (wake_flags & WF_MIGRATED)
1252 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
1253
1254#endif
1255
1256 schedstat_inc(rq, ttwu_count);
1257 schedstat_inc(p, se.statistics.nr_wakeups);
1258
1259 if (wake_flags & WF_SYNC)
1260 schedstat_inc(p, se.statistics.nr_wakeups_sync);
1261
1262#endif
1263}
1264
1265static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1266{
1267 activate_task(rq, p, en_flags);
1268 p->on_rq = 1;
1269
1270
1271 if (p->flags & PF_WQ_WORKER)
1272 wq_worker_waking_up(p, cpu_of(rq));
1273}
1274
1275
1276
1277
1278static void
1279ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1280{
1281 trace_sched_wakeup(p, true);
1282 check_preempt_curr(rq, p, wake_flags);
1283
1284 p->state = TASK_RUNNING;
1285#ifdef CONFIG_SMP
1286 if (p->sched_class->task_woken)
1287 p->sched_class->task_woken(rq, p);
1288
1289 if (rq->idle_stamp) {
1290 u64 delta = rq->clock - rq->idle_stamp;
1291 u64 max = 2*sysctl_sched_migration_cost;
1292
1293 if (delta > max)
1294 rq->avg_idle = max;
1295 else
1296 update_avg(&rq->avg_idle, delta);
1297 rq->idle_stamp = 0;
1298 }
1299#endif
1300}
1301
1302static void
1303ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1304{
1305#ifdef CONFIG_SMP
1306 if (p->sched_contributes_to_load)
1307 rq->nr_uninterruptible--;
1308#endif
1309
1310 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
1311 ttwu_do_wakeup(rq, p, wake_flags);
1312}
1313
1314
1315
1316
1317
1318
1319
1320static int ttwu_remote(struct task_struct *p, int wake_flags)
1321{
1322 struct rq *rq;
1323 int ret = 0;
1324
1325 rq = __task_rq_lock(p);
1326 if (p->on_rq) {
1327 ttwu_do_wakeup(rq, p, wake_flags);
1328 ret = 1;
1329 }
1330 __task_rq_unlock(rq);
1331
1332 return ret;
1333}
1334
1335#ifdef CONFIG_SMP
1336static void sched_ttwu_pending(void)
1337{
1338 struct rq *rq = this_rq();
1339 struct llist_node *llist = llist_del_all(&rq->wake_list);
1340 struct task_struct *p;
1341
1342 raw_spin_lock(&rq->lock);
1343
1344 while (llist) {
1345 p = llist_entry(llist, struct task_struct, wake_entry);
1346 llist = llist_next(llist);
1347 ttwu_do_activate(rq, p, 0);
1348 }
1349
1350 raw_spin_unlock(&rq->lock);
1351}
1352
1353void scheduler_ipi(void)
1354{
1355 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1356 return;
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371 irq_enter();
1372 sched_ttwu_pending();
1373
1374
1375
1376
1377 if (unlikely(got_nohz_idle_kick() && !need_resched())) {
1378 this_rq()->idle_balance = 1;
1379 raise_softirq_irqoff(SCHED_SOFTIRQ);
1380 }
1381 irq_exit();
1382}
1383
1384static void ttwu_queue_remote(struct task_struct *p, int cpu)
1385{
1386 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
1387 smp_send_reschedule(cpu);
1388}
1389
1390bool cpus_share_cache(int this_cpu, int that_cpu)
1391{
1392 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1393}
1394#endif
1395
1396static void ttwu_queue(struct task_struct *p, int cpu)
1397{
1398 struct rq *rq = cpu_rq(cpu);
1399
1400#if defined(CONFIG_SMP)
1401 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1402 sched_clock_cpu(cpu);
1403 ttwu_queue_remote(p, cpu);
1404 return;
1405 }
1406#endif
1407
1408 raw_spin_lock(&rq->lock);
1409 ttwu_do_activate(rq, p, 0);
1410 raw_spin_unlock(&rq->lock);
1411}
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
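/*
 * try_to_wake_up - wake up a thread.
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Put @p on the runqueue if it is not already there.  Returns 1 if a
 * wakeup was performed, 0 if @p's state did not match @state.
 */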
1428static int
1429try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1430{
1431 unsigned long flags;
1432 int cpu, success = 0;
1433
1434 smp_wmb();
1435 raw_spin_lock_irqsave(&p->pi_lock, flags);
1436 if (!(p->state & state))
1437 goto out;
1438
1439 success = 1;
1440 cpu = task_cpu(p);
1441
1442 if (p->on_rq && ttwu_remote(p, wake_flags))
1443 goto stat;
1444
1445#ifdef CONFIG_SMP
1446
1447
1448
1449
1450 while (p->on_cpu)
1451 cpu_relax();
1452
1453
1454
1455 smp_rmb();
1456
1457 p->sched_contributes_to_load = !!task_contributes_to_load(p);
1458 p->state = TASK_WAKING;
1459
1460 if (p->sched_class->task_waking)
1461 p->sched_class->task_waking(p);
1462
1463 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
1464 if (task_cpu(p) != cpu) {
1465 wake_flags |= WF_MIGRATED;
1466 set_task_cpu(p, cpu);
1467 }
1468#endif
1469
1470 ttwu_queue(p, cpu);
1471stat:
1472 ttwu_stat(p, cpu, wake_flags);
1473out:
1474 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1475
1476 return success;
1477}
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487static void try_to_wake_up_local(struct task_struct *p)
1488{
1489 struct rq *rq = task_rq(p);
1490
1491 BUG_ON(rq != this_rq());
1492 BUG_ON(p == current);
1493 lockdep_assert_held(&rq->lock);
1494
1495 if (!raw_spin_trylock(&p->pi_lock)) {
1496 raw_spin_unlock(&rq->lock);
1497 raw_spin_lock(&p->pi_lock);
1498 raw_spin_lock(&rq->lock);
1499 }
1500
1501 if (!(p->state & TASK_NORMAL))
1502 goto out;
1503
1504 if (!p->on_rq)
1505 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1506
1507 ttwu_do_wakeup(rq, p, 0);
1508 ttwu_stat(p, smp_processor_id(), 0);
1509out:
1510 raw_spin_unlock(&p->pi_lock);
1511}
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524int wake_up_process(struct task_struct *p)
1525{
1526 WARN_ON(task_is_stopped_or_traced(p));
1527 return try_to_wake_up(p, TASK_NORMAL, 0);
1528}
1529EXPORT_SYMBOL(wake_up_process);
1530
1531int wake_up_state(struct task_struct *p, unsigned int state)
1532{
1533 return try_to_wake_up(p, state, 0);
1534}
1535
1536
1537
1538
1539
1540
1541
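/*
 * Perform scheduler-related setup for a newly forked task @p.
 */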
1542static void __sched_fork(struct task_struct *p)
1543{
1544 p->on_rq = 0;
1545
1546 p->se.on_rq = 0;
1547 p->se.exec_start = 0;
1548 p->se.sum_exec_runtime = 0;
1549 p->se.prev_sum_exec_runtime = 0;
1550 p->se.nr_migrations = 0;
1551 p->se.vruntime = 0;
1552 INIT_LIST_HEAD(&p->se.group_node);
1553
1554
1555
1556
1557
1558
1559#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1560 p->se.avg.runnable_avg_period = 0;
1561 p->se.avg.runnable_avg_sum = 0;
1562#endif
1563#ifdef CONFIG_SCHEDSTATS
1564 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1565#endif
1566
1567 INIT_LIST_HEAD(&p->rt.run_list);
1568
1569#ifdef CONFIG_PREEMPT_NOTIFIERS
1570 INIT_HLIST_HEAD(&p->preempt_notifiers);
1571#endif
1572
1573#ifdef CONFIG_NUMA_BALANCING
1574 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1575 p->mm->numa_next_scan = jiffies;
1576 p->mm->numa_next_reset = jiffies;
1577 p->mm->numa_scan_seq = 0;
1578 }
1579
1580 p->node_stamp = 0ULL;
1581 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1582 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1583 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1584 p->numa_work.next = &p->numa_work;
1585#endif
1586}
1587
1588#ifdef CONFIG_NUMA_BALANCING
1589#ifdef CONFIG_SCHED_DEBUG
1590void set_numabalancing_state(bool enabled)
1591{
1592 if (enabled)
1593 sched_feat_set("NUMA");
1594 else
1595 sched_feat_set("NO_NUMA");
1596}
1597#else
1598__read_mostly bool numabalancing_enabled;
1599
1600void set_numabalancing_state(bool enabled)
1601{
1602 numabalancing_enabled = enabled;
1603}
1604#endif
1605#endif
1606
1607
1608
1609
1610void sched_fork(struct task_struct *p)
1611{
1612 unsigned long flags;
1613 int cpu = get_cpu();
1614
1615 __sched_fork(p);
1616
1617
1618
1619
1620
1621 p->state = TASK_RUNNING;
1622
1623
1624
1625
1626 p->prio = current->normal_prio;
1627
1628
1629
1630
1631 if (unlikely(p->sched_reset_on_fork)) {
1632 if (task_has_rt_policy(p)) {
1633 p->policy = SCHED_NORMAL;
1634 p->static_prio = NICE_TO_PRIO(0);
1635 p->rt_priority = 0;
1636 } else if (PRIO_TO_NICE(p->static_prio) < 0)
1637 p->static_prio = NICE_TO_PRIO(0);
1638
1639 p->prio = p->normal_prio = __normal_prio(p);
1640 set_load_weight(p);
1641
1642
1643
1644
1645
1646 p->sched_reset_on_fork = 0;
1647 }
1648
1649 if (!rt_prio(p->prio))
1650 p->sched_class = &fair_sched_class;
1651
1652 if (p->sched_class->task_fork)
1653 p->sched_class->task_fork(p);
1654
1655
1656
1657
1658
1659
1660
1661
1662 raw_spin_lock_irqsave(&p->pi_lock, flags);
1663 set_task_cpu(p, cpu);
1664 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1665
1666#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1667 if (likely(sched_info_on()))
1668 memset(&p->sched_info, 0, sizeof(p->sched_info));
1669#endif
1670#if defined(CONFIG_SMP)
1671 p->on_cpu = 0;
1672#endif
1673#ifdef CONFIG_PREEMPT_COUNT
1674
1675 task_thread_info(p)->preempt_count = 1;
1676#endif
1677#ifdef CONFIG_SMP
1678 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1679#endif
1680
1681 put_cpu();
1682}
1683
1684
1685
1686
1687
1688
1689
1690
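/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * Select a runqueue for the new task, activate it there and let the
 * scheduling class react to the wakeup.
 */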
1691void wake_up_new_task(struct task_struct *p)
1692{
1693 unsigned long flags;
1694 struct rq *rq;
1695
1696 raw_spin_lock_irqsave(&p->pi_lock, flags);
1697#ifdef CONFIG_SMP
1698
1699
1700
1701
1702
1703 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
1704#endif
1705
1706 rq = __task_rq_lock(p);
1707 activate_task(rq, p, 0);
1708 p->on_rq = 1;
1709 trace_sched_wakeup_new(p, true);
1710 check_preempt_curr(rq, p, WF_FORK);
1711#ifdef CONFIG_SMP
1712 if (p->sched_class->task_woken)
1713 p->sched_class->task_woken(rq, p);
1714#endif
1715 task_rq_unlock(rq, p, &flags);
1716}
1717
1718#ifdef CONFIG_PREEMPT_NOTIFIERS
1719
1720
1721
1722
1723
1724void preempt_notifier_register(struct preempt_notifier *notifier)
1725{
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
1727}
1728EXPORT_SYMBOL_GPL(preempt_notifier_register);
1729
1730
1731
1732
1733
1734
1735
1736void preempt_notifier_unregister(struct preempt_notifier *notifier)
1737{
	hlist_del(&notifier->link);
1739}
1740EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1741
1742static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1743{
1744 struct preempt_notifier *notifier;
1745 struct hlist_node *node;
1746
1747 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1748 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1749}
1750
1751static void
1752fire_sched_out_preempt_notifiers(struct task_struct *curr,
1753 struct task_struct *next)
1754{
1755 struct preempt_notifier *notifier;
1756 struct hlist_node *node;
1757
1758 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1759 notifier->ops->sched_out(notifier, next);
1760}
1761
1762#else
1763
1764static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1765{
1766}
1767
1768static void
1769fire_sched_out_preempt_notifiers(struct task_struct *curr,
1770 struct task_struct *next)
1771{
1772}
1773
1774#endif
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789static inline void
1790prepare_task_switch(struct rq *rq, struct task_struct *prev,
1791 struct task_struct *next)
1792{
1793 trace_sched_switch(prev, next);
1794 sched_info_switch(prev, next);
1795 perf_event_task_sched_out(prev, next);
1796 fire_sched_out_preempt_notifiers(prev, next);
1797 prepare_lock_switch(rq, next);
1798 prepare_arch_switch(next);
1799}
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
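/*
 * finish_task_switch - clean up after a task switch.
 * @rq: runqueue associated with the task switch
 * @prev: the thread we just switched away from
 *
 * Called with rq->lock held (and releases it).  Drops the previous
 * lazy mm reference and reaps @prev if it is TASK_DEAD.
 */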
1816static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1817 __releases(rq->lock)
1818{
1819 struct mm_struct *mm = rq->prev_mm;
1820 long prev_state;
1821
1822 rq->prev_mm = NULL;
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835 prev_state = prev->state;
1836 vtime_task_switch(prev);
1837 finish_arch_switch(prev);
1838 perf_event_task_sched_in(prev, current);
1839 finish_lock_switch(rq, prev);
1840 finish_arch_post_lock_switch();
1841
1842 fire_sched_in_preempt_notifiers(current);
1843 if (mm)
1844 mmdrop(mm);
1845 if (unlikely(prev_state == TASK_DEAD)) {
1846
1847
1848
1849
1850 kprobe_flush_task(prev);
1851 put_task_struct(prev);
1852 }
1853}
1854
1855#ifdef CONFIG_SMP
1856
1857
1858static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
1859{
1860 if (prev->sched_class->pre_schedule)
1861 prev->sched_class->pre_schedule(rq, prev);
1862}
1863
1864
1865static inline void post_schedule(struct rq *rq)
1866{
1867 if (rq->post_schedule) {
1868 unsigned long flags;
1869
1870 raw_spin_lock_irqsave(&rq->lock, flags);
1871 if (rq->curr->sched_class->post_schedule)
1872 rq->curr->sched_class->post_schedule(rq);
1873 raw_spin_unlock_irqrestore(&rq->lock, flags);
1874
1875 rq->post_schedule = 0;
1876 }
1877}
1878
1879#else
1880
1881static inline void pre_schedule(struct rq *rq, struct task_struct *p)
1882{
1883}
1884
1885static inline void post_schedule(struct rq *rq)
1886{
1887}
1888
1889#endif
1890
1891
1892
1893
1894
1895asmlinkage void schedule_tail(struct task_struct *prev)
1896 __releases(rq->lock)
1897{
1898 struct rq *rq = this_rq();
1899
1900 finish_task_switch(rq, prev);
1901
1902
1903
1904
1905
1906 post_schedule(rq);
1907
1908#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1909
1910 preempt_enable();
1911#endif
1912 if (current->set_child_tid)
1913 put_user(task_pid_vnr(current), current->set_child_tid);
1914}
1915
1916
1917
1918
1919
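/*
 * context_switch - switch to the new mm and the new thread's register
 * state.
 */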
1920static inline void
1921context_switch(struct rq *rq, struct task_struct *prev,
1922 struct task_struct *next)
1923{
1924 struct mm_struct *mm, *oldmm;
1925
1926 prepare_task_switch(rq, prev, next);
1927
1928 mm = next->mm;
1929 oldmm = prev->active_mm;
1930
1931
1932
1933
1934
1935 arch_start_context_switch(prev);
1936
1937 if (!mm) {
1938 next->active_mm = oldmm;
1939 atomic_inc(&oldmm->mm_count);
1940 enter_lazy_tlb(oldmm, next);
1941 } else
1942 switch_mm(oldmm, mm, next);
1943
1944 if (!prev->mm) {
1945 prev->active_mm = NULL;
1946 rq->prev_mm = oldmm;
1947 }
1948
1949
1950
1951
1952
1953
1954#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1955 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1956#endif
1957
1958 context_tracking_task_switch(prev, next);
1959
1960 switch_to(prev, next, prev);
1961
1962 barrier();
1963
1964
1965
1966
1967
1968 finish_task_switch(this_rq(), prev);
1969}
1970
1971
1972
1973
1974
1975
1976
1977
1978unsigned long nr_running(void)
1979{
1980 unsigned long i, sum = 0;
1981
1982 for_each_online_cpu(i)
1983 sum += cpu_rq(i)->nr_running;
1984
1985 return sum;
1986}
1987
1988unsigned long nr_uninterruptible(void)
1989{
1990 unsigned long i, sum = 0;
1991
1992 for_each_possible_cpu(i)
1993 sum += cpu_rq(i)->nr_uninterruptible;
1994
1995
1996
1997
1998
1999 if (unlikely((long)sum < 0))
2000 sum = 0;
2001
2002 return sum;
2003}
2004
2005unsigned long long nr_context_switches(void)
2006{
2007 int i;
2008 unsigned long long sum = 0;
2009
2010 for_each_possible_cpu(i)
2011 sum += cpu_rq(i)->nr_switches;
2012
2013 return sum;
2014}
2015
2016unsigned long nr_iowait(void)
2017{
2018 unsigned long i, sum = 0;
2019
2020 for_each_possible_cpu(i)
2021 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2022
2023 return sum;
2024}
2025
2026unsigned long nr_iowait_cpu(int cpu)
2027{
2028 struct rq *this = cpu_rq(cpu);
2029 return atomic_read(&this->nr_iowait);
2030}
2031
2032unsigned long this_cpu_load(void)
2033{
2034 struct rq *this = this_rq();
2035 return this->cpu_load[0];
2036}
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
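/*
 * Global load-average calculation: an exponentially decaying average of
 * nr_running + nr_uninterruptible, folded from per-runqueue deltas into
 * calc_load_tasks and sampled every LOAD_FREQ ticks.
 */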
2087static atomic_long_t calc_load_tasks;
2088static unsigned long calc_load_update;
2089unsigned long avenrun[3];
2090EXPORT_SYMBOL(avenrun);
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2101{
2102 loads[0] = (avenrun[0] + offset) << shift;
2103 loads[1] = (avenrun[1] + offset) << shift;
2104 loads[2] = (avenrun[2] + offset) << shift;
2105}
2106
2107static long calc_load_fold_active(struct rq *this_rq)
2108{
2109 long nr_active, delta = 0;
2110
2111 nr_active = this_rq->nr_running;
2112 nr_active += (long) this_rq->nr_uninterruptible;
2113
2114 if (nr_active != this_rq->calc_load_active) {
2115 delta = nr_active - this_rq->calc_load_active;
2116 this_rq->calc_load_active = nr_active;
2117 }
2118
2119 return delta;
2120}
2121
2122
2123
2124
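/*
 * Single decay step of the load average, in FSHIFT fixed-point:
 *
 *   a1 = a0 * e + a * (1 - e)
 */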
2125static unsigned long
2126calc_load(unsigned long load, unsigned long exp, unsigned long active)
2127{
2128 load *= exp;
2129 load += active * (FIXED_1 - exp);
2130 load += 1UL << (FSHIFT - 1);
2131 return load >> FSHIFT;
2132}
2133
2134#ifdef CONFIG_NO_HZ
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177static atomic_long_t calc_load_idle[2];
2178static int calc_load_idx;
2179
2180static inline int calc_load_write_idx(void)
2181{
2182 int idx = calc_load_idx;
2183
2184
2185
2186
2187
2188 smp_rmb();
2189
2190
2191
2192
2193
2194 if (!time_before(jiffies, calc_load_update))
2195 idx++;
2196
2197 return idx & 1;
2198}
2199
2200static inline int calc_load_read_idx(void)
2201{
2202 return calc_load_idx & 1;
2203}
2204
2205void calc_load_enter_idle(void)
2206{
2207 struct rq *this_rq = this_rq();
2208 long delta;
2209
2210
2211
2212
2213
2214 delta = calc_load_fold_active(this_rq);
2215 if (delta) {
2216 int idx = calc_load_write_idx();
2217 atomic_long_add(delta, &calc_load_idle[idx]);
2218 }
2219}
2220
2221void calc_load_exit_idle(void)
2222{
2223 struct rq *this_rq = this_rq();
2224
2225
2226
2227
2228 if (time_before(jiffies, this_rq->calc_load_update))
2229 return;
2230
2231
2232
2233
2234
2235
2236 this_rq->calc_load_update = calc_load_update;
2237 if (time_before(jiffies, this_rq->calc_load_update + 10))
2238 this_rq->calc_load_update += LOAD_FREQ;
2239}
2240
2241static long calc_load_fold_idle(void)
2242{
2243 int idx = calc_load_read_idx();
2244 long delta = 0;
2245
2246 if (atomic_long_read(&calc_load_idle[idx]))
2247 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2248
2249 return delta;
2250}
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267static unsigned long
2268fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
2269{
2270 unsigned long result = 1UL << frac_bits;
2271
2272 if (n) for (;;) {
2273 if (n & 1) {
2274 result *= x;
2275 result += 1UL << (frac_bits - 1);
2276 result >>= frac_bits;
2277 }
2278 n >>= 1;
2279 if (!n)
2280 break;
2281 x *= x;
2282 x += 1UL << (frac_bits - 1);
2283 x >>= frac_bits;
2284 }
2285
2286 return result;
2287}
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
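/*
 * calc_load_n - apply @n consecutive decay steps in one go:
 *
 *   a_n = a_0 * e^n + a * (1 - e^n)
 *
 * with e^n computed by fixed_power_int().
 */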
2312static unsigned long
2313calc_load_n(unsigned long load, unsigned long exp,
2314 unsigned long active, unsigned int n)
2315{
2316
2317 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
2318}
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329static void calc_global_nohz(void)
2330{
2331 long delta, active, n;
2332
2333 if (!time_before(jiffies, calc_load_update + 10)) {
2334
2335
2336
2337 delta = jiffies - calc_load_update - 10;
2338 n = 1 + (delta / LOAD_FREQ);
2339
2340 active = atomic_long_read(&calc_load_tasks);
2341 active = active > 0 ? active * FIXED_1 : 0;
2342
2343 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2344 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2345 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2346
2347 calc_load_update += n * LOAD_FREQ;
2348 }
2349
2350
2351
2352
2353
2354
2355
2356
2357 smp_wmb();
2358 calc_load_idx++;
2359}
2360#else
2361
2362static inline long calc_load_fold_idle(void) { return 0; }
2363static inline void calc_global_nohz(void) { }
2364
2365#endif
2366
2367
2368
2369
2370
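/*
 * calc_global_load - update the avenrun load estimates 10 ticks after
 * the CPUs have updated calc_load_tasks.
 */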
2371void calc_global_load(unsigned long ticks)
2372{
2373 long active, delta;
2374
2375 if (time_before(jiffies, calc_load_update + 10))
2376 return;
2377
2378
2379
2380
2381 delta = calc_load_fold_idle();
2382 if (delta)
2383 atomic_long_add(delta, &calc_load_tasks);
2384
2385 active = atomic_long_read(&calc_load_tasks);
2386 active = active > 0 ? active * FIXED_1 : 0;
2387
2388 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2389 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2390 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2391
2392 calc_load_update += LOAD_FREQ;
2393
2394
2395
2396
2397 calc_global_nohz();
2398}
2399
2400
2401
2402
2403
2404static void calc_load_account_active(struct rq *this_rq)
2405{
2406 long delta;
2407
2408 if (time_before(jiffies, this_rq->calc_load_update))
2409 return;
2410
2411 delta = calc_load_fold_active(this_rq);
2412 if (delta)
2413 atomic_long_add(delta, &calc_load_tasks);
2414
2415 this_rq->calc_load_update += LOAD_FREQ;
2416}
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449#define DEGRADE_SHIFT 7
2450static const unsigned char
2451 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
2452static const unsigned char
2453 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
2454 {0, 0, 0, 0, 0, 0, 0, 0},
2455 {64, 32, 8, 0, 0, 0, 0, 0},
2456 {96, 72, 40, 12, 1, 0, 0},
2457 {112, 98, 75, 43, 15, 1, 0},
2458 {120, 112, 98, 76, 45, 16, 2} };
2459
2460
2461
2462
2463
2464
2465static unsigned long
2466decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2467{
2468 int j = 0;
2469
2470 if (!missed_updates)
2471 return load;
2472
2473 if (missed_updates >= degrade_zero_ticks[idx])
2474 return 0;
2475
2476 if (idx == 1)
2477 return load >> missed_updates;
2478
2479 while (missed_updates) {
2480 if (missed_updates % 2)
2481 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
2482
2483 missed_updates >>= 1;
2484 j++;
2485 }
2486 return load;
2487}
2488
2489
2490
2491
2492
2493
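/*
 * Update rq->cpu_load[] statistics.  Usually called every scheduler
 * tick; @pending_updates is the number of ticks elapsed since the last
 * update, so missed ticks can be decayed via decay_load_missed().
 */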
2494static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2495 unsigned long pending_updates)
2496{
2497 int i, scale;
2498
2499 this_rq->nr_load_updates++;
2500
2501
2502 this_rq->cpu_load[0] = this_load;
2503 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2504 unsigned long old_load, new_load;
2505
2506
2507
2508 old_load = this_rq->cpu_load[i];
2509 old_load = decay_load_missed(old_load, pending_updates - 1, i);
2510 new_load = this_load;
2511
2512
2513
2514
2515
2516 if (new_load > old_load)
2517 new_load += scale - 1;
2518
2519 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
2520 }
2521
2522 sched_avg_update(this_rq);
2523}
2524
2525#ifdef CONFIG_NO_HZ
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543void update_idle_cpu_load(struct rq *this_rq)
2544{
2545 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2546 unsigned long load = this_rq->load.weight;
2547 unsigned long pending_updates;
2548
2549
2550
2551
2552 if (load || curr_jiffies == this_rq->last_load_update_tick)
2553 return;
2554
2555 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2556 this_rq->last_load_update_tick = curr_jiffies;
2557
2558 __update_cpu_load(this_rq, load, pending_updates);
2559}
2560
2561
2562
2563
2564void update_cpu_load_nohz(void)
2565{
2566 struct rq *this_rq = this_rq();
2567 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2568 unsigned long pending_updates;
2569
2570 if (curr_jiffies == this_rq->last_load_update_tick)
2571 return;
2572
2573 raw_spin_lock(&this_rq->lock);
2574 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2575 if (pending_updates) {
2576 this_rq->last_load_update_tick = curr_jiffies;
2577
2578
2579
2580
2581 __update_cpu_load(this_rq, 0, pending_updates);
2582 }
2583 raw_spin_unlock(&this_rq->lock);
2584}
2585#endif
2586
2587
2588
2589
2590static void update_cpu_load_active(struct rq *this_rq)
2591{
2592
2593
2594
2595 this_rq->last_load_update_tick = jiffies;
2596 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2597
2598 calc_load_account_active(this_rq);
2599}
2600
2601#ifdef CONFIG_SMP
2602
2603
2604
2605
2606
2607void sched_exec(void)
2608{
2609 struct task_struct *p = current;
2610 unsigned long flags;
2611 int dest_cpu;
2612
2613 raw_spin_lock_irqsave(&p->pi_lock, flags);
2614 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
2615 if (dest_cpu == smp_processor_id())
2616 goto unlock;
2617
2618 if (likely(cpu_active(dest_cpu))) {
2619 struct migration_arg arg = { p, dest_cpu };
2620
2621 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2622 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2623 return;
2624 }
2625unlock:
2626 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2627}
2628
2629#endif
2630
2631DEFINE_PER_CPU(struct kernel_stat, kstat);
2632DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2633
2634EXPORT_PER_CPU_SYMBOL(kstat);
2635EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2636
2637
2638
2639
2640
2641
2642
2643static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2644{
2645 u64 ns = 0;
2646
2647 if (task_current(rq, p)) {
2648 update_rq_clock(rq);
2649 ns = rq->clock_task - p->se.exec_start;
2650 if ((s64)ns < 0)
2651 ns = 0;
2652 }
2653
2654 return ns;
2655}
2656
2657unsigned long long task_delta_exec(struct task_struct *p)
2658{
2659 unsigned long flags;
2660 struct rq *rq;
2661 u64 ns = 0;
2662
2663 rq = task_rq_lock(p, &flags);
2664 ns = do_task_delta_exec(p, rq);
2665 task_rq_unlock(rq, p, &flags);
2666
2667 return ns;
2668}
2669
2670
2671
2672
2673
2674
2675unsigned long long task_sched_runtime(struct task_struct *p)
2676{
2677 unsigned long flags;
2678 struct rq *rq;
2679 u64 ns = 0;
2680
2681 rq = task_rq_lock(p, &flags);
2682 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2683 task_rq_unlock(rq, p, &flags);
2684
2685 return ns;
2686}
2687
2688
2689
2690
2691
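/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */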
2692void scheduler_tick(void)
2693{
2694 int cpu = smp_processor_id();
2695 struct rq *rq = cpu_rq(cpu);
2696 struct task_struct *curr = rq->curr;
2697
2698 sched_clock_tick();
2699
2700 raw_spin_lock(&rq->lock);
2701 update_rq_clock(rq);
2702 update_cpu_load_active(rq);
2703 curr->sched_class->task_tick(rq, curr, 0);
2704 raw_spin_unlock(&rq->lock);
2705
2706 perf_event_task_tick();
2707
2708#ifdef CONFIG_SMP
2709 rq->idle_balance = idle_cpu(cpu);
2710 trigger_load_balance(rq, cpu);
2711#endif
2712}
2713
2714notrace unsigned long get_parent_ip(unsigned long addr)
2715{
2716 if (in_lock_functions(addr)) {
2717 addr = CALLER_ADDR2;
2718 if (in_lock_functions(addr))
2719 addr = CALLER_ADDR3;
2720 }
2721 return addr;
2722}
2723
2724#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2725 defined(CONFIG_PREEMPT_TRACER))
2726
2727void __kprobes add_preempt_count(int val)
2728{
2729#ifdef CONFIG_DEBUG_PREEMPT
2730
2731
2732
2733 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2734 return;
2735#endif
2736 preempt_count() += val;
2737#ifdef CONFIG_DEBUG_PREEMPT
2738
2739
2740
2741 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2742 PREEMPT_MASK - 10);
2743#endif
2744 if (preempt_count() == val)
2745 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2746}
2747EXPORT_SYMBOL(add_preempt_count);
2748
2749void __kprobes sub_preempt_count(int val)
2750{
2751#ifdef CONFIG_DEBUG_PREEMPT
2752
2753
2754
2755 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
2756 return;
2757
2758
2759
2760 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
2761 !(preempt_count() & PREEMPT_MASK)))
2762 return;
2763#endif
2764
2765 if (preempt_count() == val)
2766 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2767 preempt_count() -= val;
2768}
2769EXPORT_SYMBOL(sub_preempt_count);
2770
2771#endif
2772
2773
2774
2775
2776static noinline void __schedule_bug(struct task_struct *prev)
2777{
2778 if (oops_in_progress)
2779 return;
2780
2781 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
2782 prev->comm, prev->pid, preempt_count());
2783
2784 debug_show_held_locks(prev);
2785 print_modules();
2786 if (irqs_disabled())
2787 print_irqtrace_events(prev);
2788 dump_stack();
2789 add_taint(TAINT_WARN);
2790}
2791
2792
2793
2794
2795static inline void schedule_debug(struct task_struct *prev)
2796{
2797
2798
2799
2800
2801
2802 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
2803 __schedule_bug(prev);
2804 rcu_sleep_check();
2805
2806 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2807
2808 schedstat_inc(this_rq(), sched_count);
2809}
2810
2811static void put_prev_task(struct rq *rq, struct task_struct *prev)
2812{
2813 if (prev->on_rq || rq->skip_clock_update < 0)
2814 update_rq_clock(rq);
2815 prev->sched_class->put_prev_task(rq, prev);
2816}
2817
2818
2819
2820
2821static inline struct task_struct *
2822pick_next_task(struct rq *rq)
2823{
2824 const struct sched_class *class;
2825 struct task_struct *p;
2826
2827
2828
2829
2830
2831 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
2832 p = fair_sched_class.pick_next_task(rq);
2833 if (likely(p))
2834 return p;
2835 }
2836
2837 for_each_class(class) {
2838 p = class->pick_next_task(rq);
2839 if (p)
2840 return p;
2841 }
2842
2843 BUG();
2844}
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
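/*
 * __schedule() is the main scheduler function: pick the highest
 * priority runnable task and context-switch to it, repeating for as
 * long as need_resched is set when it returns.
 */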
2883static void __sched __schedule(void)
2884{
2885 struct task_struct *prev, *next;
2886 unsigned long *switch_count;
2887 struct rq *rq;
2888 int cpu;
2889
2890need_resched:
2891 preempt_disable();
2892 cpu = smp_processor_id();
2893 rq = cpu_rq(cpu);
2894 rcu_note_context_switch(cpu);
2895 prev = rq->curr;
2896
2897 schedule_debug(prev);
2898
2899 if (sched_feat(HRTICK))
2900 hrtick_clear(rq);
2901
2902 raw_spin_lock_irq(&rq->lock);
2903
2904 switch_count = &prev->nivcsw;
2905 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2906 if (unlikely(signal_pending_state(prev->state, prev))) {
2907 prev->state = TASK_RUNNING;
2908 } else {
2909 deactivate_task(rq, prev, DEQUEUE_SLEEP);
2910 prev->on_rq = 0;
2911
2912
2913
2914
2915
2916
2917 if (prev->flags & PF_WQ_WORKER) {
2918 struct task_struct *to_wakeup;
2919
2920 to_wakeup = wq_worker_sleeping(prev, cpu);
2921 if (to_wakeup)
2922 try_to_wake_up_local(to_wakeup);
2923 }
2924 }
2925 switch_count = &prev->nvcsw;
2926 }
2927
2928 pre_schedule(rq, prev);
2929
2930 if (unlikely(!rq->nr_running))
2931 idle_balance(cpu, rq);
2932
2933 put_prev_task(rq, prev);
2934 next = pick_next_task(rq);
2935 clear_tsk_need_resched(prev);
2936 rq->skip_clock_update = 0;
2937
2938 if (likely(prev != next)) {
2939 rq->nr_switches++;
2940 rq->curr = next;
2941 ++*switch_count;
2942
2943 context_switch(rq, prev, next);
2944
2945
2946
2947
2948
2949
2950 cpu = smp_processor_id();
2951 rq = cpu_rq(cpu);
2952 } else
2953 raw_spin_unlock_irq(&rq->lock);
2954
2955 post_schedule(rq);
2956
2957 sched_preempt_enable_no_resched();
2958 if (need_resched())
2959 goto need_resched;
2960}
2961
2962static inline void sched_submit_work(struct task_struct *tsk)
2963{
2964 if (!tsk->state || tsk_is_pi_blocked(tsk))
2965 return;
2966
2967
2968
2969
2970 if (blk_needs_flush_plug(tsk))
2971 blk_schedule_flush_plug(tsk);
2972}
2973
2974asmlinkage void __sched schedule(void)
2975{
2976 struct task_struct *tsk = current;
2977
2978 sched_submit_work(tsk);
2979 __schedule();
2980}
2981EXPORT_SYMBOL(schedule);
2982
2983#ifdef CONFIG_CONTEXT_TRACKING
2984asmlinkage void __sched schedule_user(void)
2985{
2986
2987
2988
2989
2990
2991
2992 user_exit();
2993 schedule();
2994 user_enter();
2995}
2996#endif
2997
2998
2999
3000
3001
3002
3003void __sched schedule_preempt_disabled(void)
3004{
3005 sched_preempt_enable_no_resched();
3006 schedule();
3007 preempt_disable();
3008}
3009
3010#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3011
3012static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
3013{
3014 if (lock->owner != owner)
3015 return false;
3016
3017
3018
3019
3020
3021
3022
3023 barrier();
3024
3025 return owner->on_cpu;
3026}
3027
3028
3029
3030
3031
3032int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
3033{
3034 if (!sched_feat(OWNER_SPIN))
3035 return 0;
3036
3037 rcu_read_lock();
3038 while (owner_running(lock, owner)) {
3039 if (need_resched())
3040 break;
3041
3042 arch_mutex_cpu_relax();
3043 }
3044 rcu_read_unlock();
3045
3046
3047
3048
3049
3050
3051 return lock->owner == NULL;
3052}
3053#endif
3054
3055#ifdef CONFIG_PREEMPT
3056
3057
3058
3059
3060
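/*
 * Entry point into schedule() for in-kernel preemption from the
 * preempt_enable() path.  Returns immediately if preemption is
 * disabled or interrupts are off.
 */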
3061asmlinkage void __sched notrace preempt_schedule(void)
3062{
3063 struct thread_info *ti = current_thread_info();
3064
3065
3066
3067
3068
3069 if (likely(ti->preempt_count || irqs_disabled()))
3070 return;
3071
3072 do {
3073 add_preempt_count_notrace(PREEMPT_ACTIVE);
3074 __schedule();
3075 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3076
3077
3078
3079
3080
3081 barrier();
3082 } while (need_resched());
3083}
3084EXPORT_SYMBOL(preempt_schedule);
3085
3086
3087
3088
3089
3090
3091
3092asmlinkage void __sched preempt_schedule_irq(void)
3093{
3094 struct thread_info *ti = current_thread_info();
3095
3096
3097 BUG_ON(ti->preempt_count || !irqs_disabled());
3098
3099 user_exit();
3100 do {
3101 add_preempt_count(PREEMPT_ACTIVE);
3102 local_irq_enable();
3103 __schedule();
3104 local_irq_disable();
3105 sub_preempt_count(PREEMPT_ACTIVE);
3106
3107
3108
3109
3110
3111 barrier();
3112 } while (need_resched());
3113}
3114
3115#endif
3116
3117int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3118 void *key)
3119{
3120 return try_to_wake_up(curr->private, mode, wake_flags);
3121}
3122EXPORT_SYMBOL(default_wake_function);
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3134 int nr_exclusive, int wake_flags, void *key)
3135{
3136 wait_queue_t *curr, *next;
3137
3138 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3139 unsigned flags = curr->flags;
3140
3141 if (curr->func(curr, mode, wake_flags, key) &&
3142 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3143 break;
3144 }
3145}
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
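/*
 * __wake_up - wake up threads blocked on a waitqueue.
 * @q: the waitqueue
 * @mode: which threads
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
 * @key: is directly passed to the wakeup function
 */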
3157void __wake_up(wait_queue_head_t *q, unsigned int mode,
3158 int nr_exclusive, void *key)
3159{
3160 unsigned long flags;
3161
3162 spin_lock_irqsave(&q->lock, flags);
3163 __wake_up_common(q, mode, nr_exclusive, 0, key);
3164 spin_unlock_irqrestore(&q->lock, flags);
3165}
3166EXPORT_SYMBOL(__wake_up);
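
/*
 * Usage sketch (illustrative; `my_wq` and `ready` are placeholders). A
 * sleeper/waker pair is normally written with the wait_event()/wake_up()
 * wrappers, which expand to __wake_up() and friends:
 *
 *      static DECLARE_WAIT_QUEUE_HEAD(my_wq);
 *      static int ready;
 *
 *      // sleeper
 *      wait_event_interruptible(my_wq, ready);
 *
 *      // waker
 *      ready = 1;
 *      wake_up(&my_wq);        // __wake_up(&my_wq, TASK_NORMAL, 1, NULL)
 */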
3167
3168/*
3169 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3170 */
3171void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
3172{
3173 __wake_up_common(q, mode, nr, 0, NULL);
3174}
3175EXPORT_SYMBOL_GPL(__wake_up_locked);
3176
3177void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3178{
3179 __wake_up_common(q, mode, 1, 0, key);
3180}
3181EXPORT_SYMBOL_GPL(__wake_up_locked_key);
3182
3183/**
3184 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
3185 * @q: the waitqueue
3186 * @mode: which threads
3187 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3188 * @key: opaque value to be passed to wakeup targets
3189 *
3190 * The sync wakeup differs in that the waker knows that it will schedule
3191 * away soon, so while the target thread will be woken up, it will not
3192 * be migrated to another CPU - ie. the two threads are 'synchronized'
3193 * with each other. This can prevent needless bouncing between CPUs.
3194 *
3195 * On UP it can prevent extra preemption.
3196 *
3197 * It may be assumed that this function implies a write memory barrier before
3198 * changing the task state if and only if any tasks are woken up.
3199 */
3200void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
3201 int nr_exclusive, void *key)
3202{
3203 unsigned long flags;
3204 int wake_flags = WF_SYNC;
3205
3206 if (unlikely(!q))
3207 return;
3208
3209 if (unlikely(!nr_exclusive))
3210 wake_flags = 0;
3211
3212 spin_lock_irqsave(&q->lock, flags);
3213 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
3214 spin_unlock_irqrestore(&q->lock, flags);
3215}
3216EXPORT_SYMBOL_GPL(__wake_up_sync_key);
3217
3218/*
3219 * __wake_up_sync - see __wake_up_sync_key()
3220 */
3221void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3222{
3223 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
3224}
3225EXPORT_SYMBOL_GPL(__wake_up_sync);
3226
3227/**
3228 * complete: - signals a single thread waiting on this completion
3229 * @x:  holds the state of this particular completion
3230 *
3231 * This will wake up a single thread waiting on this completion. Threads will
3232 * be awakened in the same order in which they were queued.
3233 *
3234 * See also complete_all(), wait_for_completion() and related routines.
3235 *
3236 * It may be assumed that this function implies a write memory barrier before
3237 * changing the task state if and only if any tasks are woken up.
3238 */
3239void complete(struct completion *x)
3240{
3241 unsigned long flags;
3242
3243 spin_lock_irqsave(&x->wait.lock, flags);
3244 x->done++;
3245 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
3246 spin_unlock_irqrestore(&x->wait.lock, flags);
3247}
3248EXPORT_SYMBOL(complete);
3249
3250/**
3251 * complete_all: - signals all threads waiting on this completion
3252 * @x:  holds the state of this particular completion
3253 *
3254 * This will wake up all threads waiting on this particular completion event.
3255 *
3256 * It may be assumed that this function implies a write memory barrier before
3257 * changing the task state if and only if any tasks are woken up.
3258 */
3259void complete_all(struct completion *x)
3260{
3261 unsigned long flags;
3262
3263 spin_lock_irqsave(&x->wait.lock, flags);
3264 x->done += UINT_MAX/2;
3265 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
3266 spin_unlock_irqrestore(&x->wait.lock, flags);
3267}
3268EXPORT_SYMBOL(complete_all);
3269
3270static inline long __sched
3271do_wait_for_common(struct completion *x, long timeout, int state)
3272{
3273 if (!x->done) {
3274 DECLARE_WAITQUEUE(wait, current);
3275
3276 __add_wait_queue_tail_exclusive(&x->wait, &wait);
3277 do {
3278 if (signal_pending_state(state, current)) {
3279 timeout = -ERESTARTSYS;
3280 break;
3281 }
3282 __set_current_state(state);
3283 spin_unlock_irq(&x->wait.lock);
3284 timeout = schedule_timeout(timeout);
3285 spin_lock_irq(&x->wait.lock);
3286 } while (!x->done && timeout);
3287 __remove_wait_queue(&x->wait, &wait);
3288 if (!x->done)
3289 return timeout;
3290 }
3291 x->done--;
3292 return timeout ?: 1;
3293}
3294
3295static long __sched
3296wait_for_common(struct completion *x, long timeout, int state)
3297{
3298 might_sleep();
3299
3300 spin_lock_irq(&x->wait.lock);
3301 timeout = do_wait_for_common(x, timeout, state);
3302 spin_unlock_irq(&x->wait.lock);
3303 return timeout;
3304}
3305
3306/**
3307 * wait_for_completion: - waits for completion of a task
3308 * @x:  holds the state of this particular completion
3309 *
3310 * This waits to be signaled for completion of a specific task. It is NOT
3311 * interruptible and there is no timeout.
3312 *
3313 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
3314 * and interrupt capability. Also see complete().
3315 */
3316void __sched wait_for_completion(struct completion *x)
3317{
3318 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3319}
3320EXPORT_SYMBOL(wait_for_completion);
3321
3322/**
3323 * wait_for_completion_timeout: - waits for completion of a task (w/ timeout)
3324 * @x:  holds the state of this particular completion
3325 * @timeout:  timeout value in jiffies
3326 *
3327 * This waits for either a completion of a specific task to be signaled or for
3328 * a specified timeout to expire. The timeout is in jiffies. It is not
3329 * interruptible.
3330 *
3331 * The return value is 0 if timed out, and positive (at least 1, or number of
3332 * jiffies left till timeout) if completed.
3333 */
3334unsigned long __sched
3335wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3336{
3337 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
3338}
3339EXPORT_SYMBOL(wait_for_completion_timeout);
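
/*
 * Usage sketch (illustrative; `done` is a placeholder completion): a driver
 * thread bounding its wait for an interrupt handler to one second:
 *
 *      DECLARE_COMPLETION_ONSTACK(done);
 *      ...
 *      if (!wait_for_completion_timeout(&done, HZ))
 *              return -ETIMEDOUT;      // nobody signalled us in time
 *
 * The interrupt side just calls complete(&done). Because ->done counts
 * signals, a complete() issued before the wait starts is not lost.
 */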
3340
3341/**
3342 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
3343 * @x:  holds the state of this particular completion
3344 *
3345 * This waits for completion of a specific task to be signaled. It is
3346 * interruptible.
3347 *
3348 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
3349 */
3350int __sched wait_for_completion_interruptible(struct completion *x)
3351{
3352 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
3353 if (t == -ERESTARTSYS)
3354 return t;
3355 return 0;
3356}
3357EXPORT_SYMBOL(wait_for_completion_interruptible);
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370long __sched
3371wait_for_completion_interruptible_timeout(struct completion *x,
3372 unsigned long timeout)
3373{
3374 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
3375}
3376EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387int __sched wait_for_completion_killable(struct completion *x)
3388{
3389 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
3390 if (t == -ERESTARTSYS)
3391 return t;
3392 return 0;
3393}
3394EXPORT_SYMBOL(wait_for_completion_killable);
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408long __sched
3409wait_for_completion_killable_timeout(struct completion *x,
3410 unsigned long timeout)
3411{
3412 return wait_for_common(x, timeout, TASK_KILLABLE);
3413}
3414EXPORT_SYMBOL(wait_for_completion_killable_timeout);
3415
3416/**
3417 * try_wait_for_completion - try to decrement a completion without blocking
3418 * @x:  completion structure
3419 *
3420 * Returns: 0 if a decrement cannot be done without blocking
3421 *          1 if a decrement succeeded.
3422 *
3423 * If a completion is being used as a counting completion,
3424 * attempt to decrement the counter without blocking. This
3425 * enables us to avoid waiting if the resource the completion
3426 * is protecting is not available.
3427 */
3428bool try_wait_for_completion(struct completion *x)
3429{
3430 unsigned long flags;
3431 int ret = 1;
3432
3433 spin_lock_irqsave(&x->wait.lock, flags);
3434 if (!x->done)
3435 ret = 0;
3436 else
3437 x->done--;
3438 spin_unlock_irqrestore(&x->wait.lock, flags);
3439 return ret;
3440}
3441EXPORT_SYMBOL(try_wait_for_completion);
3442
3443/**
3444 * completion_done - Test to see if a completion has any waiters
3445 * @x:  completion structure
3446 *
3447 * Returns: 0 if there are waiters (wait_for_completion() in progress)
3448 *          1 if there are no waiters.
3449 *
3450 */
3451bool completion_done(struct completion *x)
3452{
3453 unsigned long flags;
3454 int ret = 1;
3455
3456 spin_lock_irqsave(&x->wait.lock, flags);
3457 if (!x->done)
3458 ret = 0;
3459 spin_unlock_irqrestore(&x->wait.lock, flags);
3460 return ret;
3461}
3462EXPORT_SYMBOL(completion_done);
3463
3464static long __sched
3465sleep_on_common(wait_queue_head_t *q, int state, long timeout)
3466{
3467 unsigned long flags;
3468 wait_queue_t wait;
3469
3470 init_waitqueue_entry(&wait, current);
3471
3472 __set_current_state(state);
3473
3474 spin_lock_irqsave(&q->lock, flags);
3475 __add_wait_queue(q, &wait);
3476 spin_unlock(&q->lock);
3477 timeout = schedule_timeout(timeout);
3478 spin_lock_irq(&q->lock);
3479 __remove_wait_queue(q, &wait);
3480 spin_unlock_irqrestore(&q->lock, flags);
3481
3482 return timeout;
3483}
3484
3485void __sched interruptible_sleep_on(wait_queue_head_t *q)
3486{
3487 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3488}
3489EXPORT_SYMBOL(interruptible_sleep_on);
3490
3491long __sched
3492interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3493{
3494 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
3495}
3496EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3497
3498void __sched sleep_on(wait_queue_head_t *q)
3499{
3500 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3501}
3502EXPORT_SYMBOL(sleep_on);
3503
3504long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3505{
3506 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
3507}
3508EXPORT_SYMBOL(sleep_on_timeout);
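
/*
 * Note on the sleep_on() family above: the caller tests its wakeup
 * condition before calling in, and nothing ties that test to the enqueue
 * on q, so a wakeup arriving in that window is lost and the sleeper can
 * block forever. New code is expected to use the race-free form instead
 * (my_cond is a placeholder):
 *
 *      wait_event_timeout(*q, my_cond, timeout);
 *
 * which re-checks the condition after setting the task state. The
 * interfaces above remain mainly for their historical callers.
 */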
3509
3510#ifdef CONFIG_RT_MUTEXES
3511
3512/*
3513 * rt_mutex_setprio - set the current priority of a task
3514 * @p: task
3515 * @prio: prio value (kernel-internal form)
3516 *
3517 * This function changes the 'effective' priority of a task. It does
3518 * not touch ->normal_prio like __setscheduler().
3519 *
3520 * Used by the rt_mutex code to implement priority inheritance logic.
3521 */
3522void rt_mutex_setprio(struct task_struct *p, int prio)
3523{
3524 int oldprio, on_rq, running;
3525 struct rq *rq;
3526 const struct sched_class *prev_class;
3527
3528 BUG_ON(prio < 0 || prio > MAX_PRIO);
3529
3530 rq = __task_rq_lock(p);
3531
3532 /*
3533 * Idle task boosting is a nono in general. There is one
3534 * exception, when PREEMPT_RT and NOHZ is active:
3535 *
3536 * The idle task calls get_next_timer_interrupt() and holds
3537 * the timer wheel base->lock on the CPU and another CPU wants
3538 * to access the timer (probably to cancel it). We can safely
3539 * modify the timer state as long as timer->base->lock is held.
3540 *
3541 * To keep things simple, never boost the prio of the idle task:
3542 * it cannot block on a PI lock anyway, so warn and bail out below.
3543 */
3544 if (unlikely(p == rq->idle)) {
3545 WARN_ON(p != rq->curr);
3546 WARN_ON(p->pi_blocked_on);
3547 goto out_unlock;
3548 }
3549
3550 trace_sched_pi_setprio(p, prio);
3551 oldprio = p->prio;
3552 prev_class = p->sched_class;
3553 on_rq = p->on_rq;
3554 running = task_current(rq, p);
3555 if (on_rq)
3556 dequeue_task(rq, p, 0);
3557 if (running)
3558 p->sched_class->put_prev_task(rq, p);
3559
3560 if (rt_prio(prio))
3561 p->sched_class = &rt_sched_class;
3562 else
3563 p->sched_class = &fair_sched_class;
3564
3565 p->prio = prio;
3566
3567 if (running)
3568 p->sched_class->set_curr_task(rq);
3569 if (on_rq)
3570 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
3571
3572 check_class_changed(rq, p, prev_class, oldprio);
3573out_unlock:
3574 __task_rq_unlock(rq);
3575}
3576#endif
3577void set_user_nice(struct task_struct *p, long nice)
3578{
3579 int old_prio, delta, on_rq;
3580 unsigned long flags;
3581 struct rq *rq;
3582
3583 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3584 return;
3585
3586
3587
3588
3589 rq = task_rq_lock(p, &flags);
3590 /*
3591 * The RT priorities are set via sched_setscheduler(), but we still
3592 * allow the 'normal' nice value to be set - but as expected
3593 * it won't have any effect on scheduling until the task is
3594 * SCHED_FIFO/SCHED_RR:
3595 */
3596 if (task_has_rt_policy(p)) {
3597 p->static_prio = NICE_TO_PRIO(nice);
3598 goto out_unlock;
3599 }
3600 on_rq = p->on_rq;
3601 if (on_rq)
3602 dequeue_task(rq, p, 0);
3603
3604 p->static_prio = NICE_TO_PRIO(nice);
3605 set_load_weight(p);
3606 old_prio = p->prio;
3607 p->prio = effective_prio(p);
3608 delta = p->prio - old_prio;
3609
3610 if (on_rq) {
3611 enqueue_task(rq, p, 0);
3612
3613
3614
3615
3616 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3617 resched_task(rq->curr);
3618 }
3619out_unlock:
3620 task_rq_unlock(rq, p, &flags);
3621}
3622EXPORT_SYMBOL(set_user_nice);
3623
3624/*
3625 * can_nice - check if a task can reduce its nice value
3626 * @p: task
3627 * @nice: nice value
3628 */
3629int can_nice(const struct task_struct *p, const int nice)
3630{
3631 /* convert nice value [19,-20] to rlimit style value [1,40] */
3632 int nice_rlim = 20 - nice;
3633
3634 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3635 capable(CAP_SYS_NICE));
3636}
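
/*
 * Worked example of the mapping above, assuming the usual RLIMIT_NICE
 * convention (1..40, where 40 permits nice -20 and 1 permits only 19):
 * asking for nice -5 gives nice_rlim = 20 - (-5) = 25, so the request is
 * allowed when task_rlimit(p, RLIMIT_NICE) >= 25 or the caller has
 * CAP_SYS_NICE.
 */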
3637
3638#ifdef __ARCH_WANT_SYS_NICE
3639
3640
3641
3642
3643
3644
3645
3646
3647SYSCALL_DEFINE1(nice, int, increment)
3648{
3649 long nice, retval;
3650
3651
3652
3653
3654
3655
3656 if (increment < -40)
3657 increment = -40;
3658 if (increment > 40)
3659 increment = 40;
3660
3661 nice = TASK_NICE(current) + increment;
3662 if (nice < -20)
3663 nice = -20;
3664 if (nice > 19)
3665 nice = 19;
3666
3667 if (increment < 0 && !can_nice(current, nice))
3668 return -EPERM;
3669
3670 retval = security_task_setnice(current, nice);
3671 if (retval)
3672 return retval;
3673
3674 set_user_nice(current, nice);
3675 return 0;
3676}
3677
3678#endif
3679
3680/**
3681 * task_prio - return the priority value of a given task.
3682 * @p: the task in question.
3683 *
3684 * This is the priority value as seen by users in /proc: RT tasks
3685 * yield negative values, while normal tasks map their nice level
3686 * (-20..19) onto 0..39, with nice 0 corresponding to 20.
3687 */
3688int task_prio(const struct task_struct *p)
3689{
3690 return p->prio - MAX_RT_PRIO;
3691}
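
/*
 * Worked example, assuming the conventional MAX_RT_PRIO of 100 and
 * NICE_TO_PRIO(nice) == 120 + nice: a nice-0 task has p->prio == 120 and
 * task_prio() returns 20 (the "PR 20" shown by top(1)); an RT task with
 * rt_priority 50 has p->prio == 49 and yields -51.
 */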
3692
3693
3694
3695
3696
3697int task_nice(const struct task_struct *p)
3698{
3699 return TASK_NICE(p);
3700}
3701EXPORT_SYMBOL(task_nice);
3702
3703
3704
3705
3706
3707int idle_cpu(int cpu)
3708{
3709 struct rq *rq = cpu_rq(cpu);
3710
3711 if (rq->curr != rq->idle)
3712 return 0;
3713
3714 if (rq->nr_running)
3715 return 0;
3716
3717#ifdef CONFIG_SMP
3718 if (!llist_empty(&rq->wake_list))
3719 return 0;
3720#endif
3721
3722 return 1;
3723}
3724
3725
3726
3727
3728
3729struct task_struct *idle_task(int cpu)
3730{
3731 return cpu_rq(cpu)->idle;
3732}
3733
3734
3735
3736
3737
3738static struct task_struct *find_process_by_pid(pid_t pid)
3739{
3740 return pid ? find_task_by_vpid(pid) : current;
3741}
3742
3743
3744static void
3745__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
3746{
3747 p->policy = policy;
3748 p->rt_priority = prio;
3749 p->normal_prio = normal_prio(p);
3750
3751 p->prio = rt_mutex_getprio(p);
3752 if (rt_prio(p->prio))
3753 p->sched_class = &rt_sched_class;
3754 else
3755 p->sched_class = &fair_sched_class;
3756 set_load_weight(p);
3757}
3758
3759
3760
3761
3762static bool check_same_owner(struct task_struct *p)
3763{
3764 const struct cred *cred = current_cred(), *pcred;
3765 bool match;
3766
3767 rcu_read_lock();
3768 pcred = __task_cred(p);
3769 match = (uid_eq(cred->euid, pcred->euid) ||
3770 uid_eq(cred->euid, pcred->uid));
3771 rcu_read_unlock();
3772 return match;
3773}
3774
3775static int __sched_setscheduler(struct task_struct *p, int policy,
3776 const struct sched_param *param, bool user)
3777{
3778 int retval, oldprio, oldpolicy = -1, on_rq, running;
3779 unsigned long flags;
3780 const struct sched_class *prev_class;
3781 struct rq *rq;
3782 int reset_on_fork;
3783
3784
3785 BUG_ON(in_interrupt());
3786recheck:
3787
3788 if (policy < 0) {
3789 reset_on_fork = p->sched_reset_on_fork;
3790 policy = oldpolicy = p->policy;
3791 } else {
3792 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
3793 policy &= ~SCHED_RESET_ON_FORK;
3794
3795 if (policy != SCHED_FIFO && policy != SCHED_RR &&
3796 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3797 policy != SCHED_IDLE)
3798 return -EINVAL;
3799 }
3800
3801 /*
3802 * Valid priorities for SCHED_FIFO and SCHED_RR are
3803 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
3804 * SCHED_BATCH and SCHED_IDLE is 0.
3805 */
3806 if (param->sched_priority < 0 ||
3807 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
3808 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3809 return -EINVAL;
3810 if (rt_policy(policy) != (param->sched_priority != 0))
3811 return -EINVAL;
3812
3813
3814
3815
3816 if (user && !capable(CAP_SYS_NICE)) {
3817 if (rt_policy(policy)) {
3818 unsigned long rlim_rtprio =
3819 task_rlimit(p, RLIMIT_RTPRIO);
3820
3821
3822 if (policy != p->policy && !rlim_rtprio)
3823 return -EPERM;
3824
3825
3826 if (param->sched_priority > p->rt_priority &&
3827 param->sched_priority > rlim_rtprio)
3828 return -EPERM;
3829 }
3830
3831
3832
3833
3834
3835 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3836 if (!can_nice(p, TASK_NICE(p)))
3837 return -EPERM;
3838 }
3839
3840
3841 if (!check_same_owner(p))
3842 return -EPERM;
3843
3844
3845 if (p->sched_reset_on_fork && !reset_on_fork)
3846 return -EPERM;
3847 }
3848
3849 if (user) {
3850 retval = security_task_setscheduler(p);
3851 if (retval)
3852 return retval;
3853 }
3854
3855 /*
3856 * Make sure no PI-waiters arrive (or leave) while we are
3857 * changing the priority of the task:
3858 *
3859 * To be able to change p->policy safely, the appropriate
3860 * runqueue lock must be held.
3861 */
3862 rq = task_rq_lock(p, &flags);
3863
3864
3865
3866
3867 if (p == rq->stop) {
3868 task_rq_unlock(rq, p, &flags);
3869 return -EINVAL;
3870 }
3871
3872
3873
3874
3875 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
3876 param->sched_priority == p->rt_priority))) {
3877 task_rq_unlock(rq, p, &flags);
3878 return 0;
3879 }
3880
3881#ifdef CONFIG_RT_GROUP_SCHED
3882 if (user) {
3883
3884
3885
3886
3887 if (rt_bandwidth_enabled() && rt_policy(policy) &&
3888 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
3889 !task_group_is_autogroup(task_group(p))) {
3890 task_rq_unlock(rq, p, &flags);
3891 return -EPERM;
3892 }
3893 }
3894#endif
3895
3896
3897 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3898 policy = oldpolicy = -1;
3899 task_rq_unlock(rq, p, &flags);
3900 goto recheck;
3901 }
3902 on_rq = p->on_rq;
3903 running = task_current(rq, p);
3904 if (on_rq)
3905 dequeue_task(rq, p, 0);
3906 if (running)
3907 p->sched_class->put_prev_task(rq, p);
3908
3909 p->sched_reset_on_fork = reset_on_fork;
3910
3911 oldprio = p->prio;
3912 prev_class = p->sched_class;
3913 __setscheduler(rq, p, policy, param->sched_priority);
3914
3915 if (running)
3916 p->sched_class->set_curr_task(rq);
3917 if (on_rq)
3918 enqueue_task(rq, p, 0);
3919
3920 check_class_changed(rq, p, prev_class, oldprio);
3921 task_rq_unlock(rq, p, &flags);
3922
3923 rt_mutex_adjust_pi(p);
3924
3925 return 0;
3926}
3927
3928/**
3929 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
3930 * @p: the task in question.
3931 * @policy: new policy.
3932 * @param: structure containing the new RT priority.
3933 *
3934 * NOTE that the task may be already dead.
3935 */
3936int sched_setscheduler(struct task_struct *p, int policy,
3937 const struct sched_param *param)
3938{
3939 return __sched_setscheduler(p, policy, param, true);
3940}
3941EXPORT_SYMBOL_GPL(sched_setscheduler);
3942
3943/**
3944 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
3945 * @p: the task in question.
3946 * @policy: new policy.
3947 * @param: structure containing the new RT priority.
3948 *
3949 * Just like sched_setscheduler, only don't bother checking if the
3950 * current context has permission.  For example, this is needed in
3951 * stop_machine(): we create temporary high priority worker threads,
3952 * but our caller might not have that capability.
3953 */
3954int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3955 const struct sched_param *param)
3956{
3957 return __sched_setscheduler(p, policy, param, false);
3958}
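
/*
 * In-kernel usage sketch (illustrative; `worker` stands for a kthread the
 * caller owns). The _nocheck variant skips the capability/rlimit checks,
 * which is what kernel threads want:
 *
 *      struct sched_param sp = { .sched_priority = MAX_RT_PRIO - 1 };
 *
 *      sched_setscheduler_nocheck(worker, SCHED_FIFO, &sp);
 *
 * Anything acting on behalf of userspace should keep going through
 * sched_setscheduler() so the permission checks above still apply.
 */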
3959
3960static int
3961do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3962{
3963 struct sched_param lparam;
3964 struct task_struct *p;
3965 int retval;
3966
3967 if (!param || pid < 0)
3968 return -EINVAL;
3969 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3970 return -EFAULT;
3971
3972 rcu_read_lock();
3973 retval = -ESRCH;
3974 p = find_process_by_pid(pid);
3975 if (p != NULL)
3976 retval = sched_setscheduler(p, policy, &lparam);
3977 rcu_read_unlock();
3978
3979 return retval;
3980}
3981
3982
3983
3984
3985
3986
3987
3988SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3989 struct sched_param __user *, param)
3990{
3991
3992 if (policy < 0)
3993 return -EINVAL;
3994
3995 return do_sched_setscheduler(pid, policy, param);
3996}
3997
3998
3999
4000
4001
4002
4003SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4004{
4005 return do_sched_setscheduler(pid, -1, param);
4006}
4007
4008
4009
4010
4011
4012SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4013{
4014 struct task_struct *p;
4015 int retval;
4016
4017 if (pid < 0)
4018 return -EINVAL;
4019
4020 retval = -ESRCH;
4021 rcu_read_lock();
4022 p = find_process_by_pid(pid);
4023 if (p) {
4024 retval = security_task_getscheduler(p);
4025 if (!retval)
4026 retval = p->policy
4027 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4028 }
4029 rcu_read_unlock();
4030 return retval;
4031}
4032
4033
4034
4035
4036
4037
4038SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4039{
4040 struct sched_param lp;
4041 struct task_struct *p;
4042 int retval;
4043
4044 if (!param || pid < 0)
4045 return -EINVAL;
4046
4047 rcu_read_lock();
4048 p = find_process_by_pid(pid);
4049 retval = -ESRCH;
4050 if (!p)
4051 goto out_unlock;
4052
4053 retval = security_task_getscheduler(p);
4054 if (retval)
4055 goto out_unlock;
4056
4057 lp.sched_priority = p->rt_priority;
4058 rcu_read_unlock();
4059
4060
4061
4062
4063 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4064
4065 return retval;
4066
4067out_unlock:
4068 rcu_read_unlock();
4069 return retval;
4070}
4071
4072long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4073{
4074 cpumask_var_t cpus_allowed, new_mask;
4075 struct task_struct *p;
4076 int retval;
4077
4078 get_online_cpus();
4079 rcu_read_lock();
4080
4081 p = find_process_by_pid(pid);
4082 if (!p) {
4083 rcu_read_unlock();
4084 put_online_cpus();
4085 return -ESRCH;
4086 }
4087
4088
4089 get_task_struct(p);
4090 rcu_read_unlock();
4091
4092 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4093 retval = -ENOMEM;
4094 goto out_put_task;
4095 }
4096 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4097 retval = -ENOMEM;
4098 goto out_free_cpus_allowed;
4099 }
4100 retval = -EPERM;
4101 if (!check_same_owner(p)) {
4102 rcu_read_lock();
4103 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4104 rcu_read_unlock();
4105 goto out_unlock;
4106 }
4107 rcu_read_unlock();
4108 }
4109
4110 retval = security_task_setscheduler(p);
4111 if (retval)
4112 goto out_unlock;
4113
4114 cpuset_cpus_allowed(p, cpus_allowed);
4115 cpumask_and(new_mask, in_mask, cpus_allowed);
4116again:
4117 retval = set_cpus_allowed_ptr(p, new_mask);
4118
4119 if (!retval) {
4120 cpuset_cpus_allowed(p, cpus_allowed);
4121 if (!cpumask_subset(new_mask, cpus_allowed)) {
4122
4123
4124
4125
4126
4127 cpumask_copy(new_mask, cpus_allowed);
4128 goto again;
4129 }
4130 }
4131out_unlock:
4132 free_cpumask_var(new_mask);
4133out_free_cpus_allowed:
4134 free_cpumask_var(cpus_allowed);
4135out_put_task:
4136 put_task_struct(p);
4137 put_online_cpus();
4138 return retval;
4139}
4140
4141static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4142 struct cpumask *new_mask)
4143{
4144 if (len < cpumask_size())
4145 cpumask_clear(new_mask);
4146 else if (len > cpumask_size())
4147 len = cpumask_size();
4148
4149 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4150}
4151
4152/**
4153 * sys_sched_setaffinity - set the cpu affinity of a process
4154 * @pid: pid of the process
4155 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4156 * @user_mask_ptr: user-space pointer to the new cpu mask
4157 */
4158SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4159 unsigned long __user *, user_mask_ptr)
4160{
4161 cpumask_var_t new_mask;
4162 int retval;
4163
4164 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4165 return -ENOMEM;
4166
4167 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4168 if (retval == 0)
4169 retval = sched_setaffinity(pid, new_mask);
4170 free_cpumask_var(new_mask);
4171 return retval;
4172}
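
/*
 * Userspace usage sketch (illustrative), via the glibc wrapper for this
 * syscall: pin the calling thread to CPU 2.
 *
 *      cpu_set_t set;
 *
 *      CPU_ZERO(&set);
 *      CPU_SET(2, &set);
 *      if (sched_setaffinity(0, sizeof(set), &set) == -1)
 *              perror("sched_setaffinity");
 *
 * A pid of 0 means the calling thread; the kernel side above copies the
 * mask in and intersects it with the task's cpuset before applying it.
 */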
4173
4174long sched_getaffinity(pid_t pid, struct cpumask *mask)
4175{
4176 struct task_struct *p;
4177 unsigned long flags;
4178 int retval;
4179
4180 get_online_cpus();
4181 rcu_read_lock();
4182
4183 retval = -ESRCH;
4184 p = find_process_by_pid(pid);
4185 if (!p)
4186 goto out_unlock;
4187
4188 retval = security_task_getscheduler(p);
4189 if (retval)
4190 goto out_unlock;
4191
4192 raw_spin_lock_irqsave(&p->pi_lock, flags);
4193 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
4194 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4195
4196out_unlock:
4197 rcu_read_unlock();
4198 put_online_cpus();
4199
4200 return retval;
4201}
4202
4203/**
4204 * sys_sched_getaffinity - get the cpu affinity of a process
4205 * @pid: pid of the process
4206 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4207 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4208 */
4209SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4210 unsigned long __user *, user_mask_ptr)
4211{
4212 int ret;
4213 cpumask_var_t mask;
4214
4215 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4216 return -EINVAL;
4217 if (len & (sizeof(unsigned long)-1))
4218 return -EINVAL;
4219
4220 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4221 return -ENOMEM;
4222
4223 ret = sched_getaffinity(pid, mask);
4224 if (ret == 0) {
4225 size_t retlen = min_t(size_t, len, cpumask_size());
4226
4227 if (copy_to_user(user_mask_ptr, mask, retlen))
4228 ret = -EFAULT;
4229 else
4230 ret = retlen;
4231 }
4232 free_cpumask_var(mask);
4233
4234 return ret;
4235}
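
/*
 * Note on the return convention: on success the raw syscall returns the
 * number of bytes copied into user_mask_ptr (retlen above), not 0. The
 * glibc wrapper hides this and returns 0, so only callers going through
 * syscall(2) directly need to treat any non-negative value as success.
 */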
4236
4237
4238
4239
4240
4241
4242
4243SYSCALL_DEFINE0(sched_yield)
4244{
4245 struct rq *rq = this_rq_lock();
4246
4247 schedstat_inc(rq, yld_count);
4248 current->sched_class->yield_task(rq);
4249
4250
4251
4252
4253
4254 __release(rq->lock);
4255 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4256 do_raw_spin_unlock(&rq->lock);
4257 sched_preempt_enable_no_resched();
4258
4259 schedule();
4260
4261 return 0;
4262}
4263
4264static inline int should_resched(void)
4265{
4266 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
4267}
4268
4269static void __cond_resched(void)
4270{
4271 add_preempt_count(PREEMPT_ACTIVE);
4272 __schedule();
4273 sub_preempt_count(PREEMPT_ACTIVE);
4274}
4275
4276int __sched _cond_resched(void)
4277{
4278 if (should_resched()) {
4279 __cond_resched();
4280 return 1;
4281 }
4282 return 0;
4283}
4284EXPORT_SYMBOL(_cond_resched);
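
/*
 * Usage sketch (illustrative; `table` and `nr_entries` are placeholders):
 * long kernel-side loops call cond_resched() so that other tasks get to
 * run even on non-preemptible kernels:
 *
 *      for (i = 0; i < nr_entries; i++) {
 *              process(table[i]);
 *              cond_resched();
 *      }
 *
 * cond_resched() ends up here and is a no-op unless a reschedule is
 * actually pending (see should_resched() above).
 */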
4285
4286/*
4287 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
4288 * call schedule, and on return reacquire the lock.
4289 *
4290 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4291 * operations here to prevent schedule() from being called twice (once via
4292 * spin_unlock(), once by hand).
4293 */
4294int __cond_resched_lock(spinlock_t *lock)
4295{
4296 int resched = should_resched();
4297 int ret = 0;
4298
4299 lockdep_assert_held(lock);
4300
4301 if (spin_needbreak(lock) || resched) {
4302 spin_unlock(lock);
4303 if (resched)
4304 __cond_resched();
4305 else
4306 cpu_relax();
4307 ret = 1;
4308 spin_lock(lock);
4309 }
4310 return ret;
4311}
4312EXPORT_SYMBOL(__cond_resched_lock);
4313
4314int __sched __cond_resched_softirq(void)
4315{
4316 BUG_ON(!in_softirq());
4317
4318 if (should_resched()) {
4319 local_bh_enable();
4320 __cond_resched();
4321 local_bh_disable();
4322 return 1;
4323 }
4324 return 0;
4325}
4326EXPORT_SYMBOL(__cond_resched_softirq);
4327
4328/**
4329 * yield - yield the current processor to other threads.
4330 *
4331 * Do not ever use this function, there's a 99% chance you're doing it wrong.
4332 *
4333 * The scheduler is at all times free to pick the calling task as the most
4334 * eligible task to run, if removing the yield() call from your code breaks
4335 * it, it's already broken.
4336 *
4337 * Typical broken usage is:
4338 *
4339 * while (!event)
4340 *      yield();
4341 *
4342 * where one assumes that yield() will let 'the other' process run that will
4343 * make event true. If the current task is a SCHED_FIFO task that will never
4344 * happen. Never use yield() as a progress guarantee!!
4345 *
4346 * If you want to use yield() to wait for something, use wait_event().
4347 * If you want to use yield() to be 'nice' for others, use cond_resched().
4348 * If you still want to use yield(), do not!
4349 */
4350void __sched yield(void)
4351{
4352 set_current_state(TASK_RUNNING);
4353 sys_sched_yield();
4354}
4355EXPORT_SYMBOL(yield);
4356
4357/**
4358 * yield_to - yield the current processor to another thread in
4359 * your thread group, or accelerate that thread toward the
4360 * processor it's on.
4361 * @p: target task
4362 * @preempt: whether task preemption is allowed or not
4363 *
4364 * It's the caller's job to ensure that the target task struct
4365 * can't go away on us before we can do any checks.
4366 *
4367 * Returns true if we indeed boosted the target task.
4368 */
4369bool __sched yield_to(struct task_struct *p, bool preempt)
4370{
4371 struct task_struct *curr = current;
4372 struct rq *rq, *p_rq;
4373 unsigned long flags;
4374 bool yielded = false;
4375
4376 local_irq_save(flags);
4377 rq = this_rq();
4378
4379again:
4380 p_rq = task_rq(p);
4381 double_rq_lock(rq, p_rq);
4382 while (task_rq(p) != p_rq) {
4383 double_rq_unlock(rq, p_rq);
4384 goto again;
4385 }
4386
4387 if (!curr->sched_class->yield_to_task)
4388 goto out;
4389
4390 if (curr->sched_class != p->sched_class)
4391 goto out;
4392
4393 if (task_running(p_rq, p) || p->state)
4394 goto out;
4395
4396 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4397 if (yielded) {
4398 schedstat_inc(rq, yld_count);
4399
4400
4401
4402
4403 if (preempt && rq != p_rq)
4404 resched_task(p_rq->curr);
4405 }
4406
4407out:
4408 double_rq_unlock(rq, p_rq);
4409 local_irq_restore(flags);
4410
4411 if (yielded)
4412 schedule();
4413
4414 return yielded;
4415}
4416EXPORT_SYMBOL_GPL(yield_to);
4417
4418/*
4419 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4420 * that process accounting knows that this is a task in IO wait state.
4421 */
4422void __sched io_schedule(void)
4423{
4424 struct rq *rq = raw_rq();
4425
4426 delayacct_blkio_start();
4427 atomic_inc(&rq->nr_iowait);
4428 blk_flush_plug(current);
4429 current->in_iowait = 1;
4430 schedule();
4431 current->in_iowait = 0;
4432 atomic_dec(&rq->nr_iowait);
4433 delayacct_blkio_end();
4434}
4435EXPORT_SYMBOL(io_schedule);
4436
4437long __sched io_schedule_timeout(long timeout)
4438{
4439 struct rq *rq = raw_rq();
4440 long ret;
4441
4442 delayacct_blkio_start();
4443 atomic_inc(&rq->nr_iowait);
4444 blk_flush_plug(current);
4445 current->in_iowait = 1;
4446 ret = schedule_timeout(timeout);
4447 current->in_iowait = 0;
4448 atomic_dec(&rq->nr_iowait);
4449 delayacct_blkio_end();
4450 return ret;
4451}
4452
4453
4454
4455
4456
4457
4458
4459
4460SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4461{
4462 int ret = -EINVAL;
4463
4464 switch (policy) {
4465 case SCHED_FIFO:
4466 case SCHED_RR:
4467 ret = MAX_USER_RT_PRIO-1;
4468 break;
4469 case SCHED_NORMAL:
4470 case SCHED_BATCH:
4471 case SCHED_IDLE:
4472 ret = 0;
4473 break;
4474 }
4475 return ret;
4476}
4477
4478
4479
4480
4481
4482
4483
4484
4485SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4486{
4487 int ret = -EINVAL;
4488
4489 switch (policy) {
4490 case SCHED_FIFO:
4491 case SCHED_RR:
4492 ret = 1;
4493 break;
4494 case SCHED_NORMAL:
4495 case SCHED_BATCH:
4496 case SCHED_IDLE:
4497 ret = 0;
4498 }
4499 return ret;
4500}
4501
4502/**
4503 * sys_sched_rr_get_interval - return the default timeslice of a process.
4504 * @pid: pid of the process.
4505 * @interval: userspace pointer to the timeslice value.
4506 *
4507 * This syscall writes the default timeslice value of a given process
4508 * into the user-space timespec buffer. A value of '0' means infinity.
4509 */
4510SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4511 struct timespec __user *, interval)
4512{
4513 struct task_struct *p;
4514 unsigned int time_slice;
4515 unsigned long flags;
4516 struct rq *rq;
4517 int retval;
4518 struct timespec t;
4519
4520 if (pid < 0)
4521 return -EINVAL;
4522
4523 retval = -ESRCH;
4524 rcu_read_lock();
4525 p = find_process_by_pid(pid);
4526 if (!p)
4527 goto out_unlock;
4528
4529 retval = security_task_getscheduler(p);
4530 if (retval)
4531 goto out_unlock;
4532
4533 rq = task_rq_lock(p, &flags);
4534 time_slice = p->sched_class->get_rr_interval(rq, p);
4535 task_rq_unlock(rq, p, &flags);
4536
4537 rcu_read_unlock();
4538 jiffies_to_timespec(time_slice, &t);
4539 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4540 return retval;
4541
4542out_unlock:
4543 rcu_read_unlock();
4544 return retval;
4545}
4546
4547static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4548
4549void sched_show_task(struct task_struct *p)
4550{
4551 unsigned long free = 0;
4552 int ppid;
4553 unsigned state;
4554
4555 state = p->state ? __ffs(p->state) + 1 : 0;
4556 printk(KERN_INFO "%-15.15s %c", p->comm,
4557 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4558#if BITS_PER_LONG == 32
4559 if (state == TASK_RUNNING)
4560 printk(KERN_CONT " running ");
4561 else
4562 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4563#else
4564 if (state == TASK_RUNNING)
4565 printk(KERN_CONT " running task ");
4566 else
4567 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4568#endif
4569#ifdef CONFIG_DEBUG_STACK_USAGE
4570 free = stack_not_used(p);
4571#endif
4572 rcu_read_lock();
4573 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4574 rcu_read_unlock();
4575 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4576 task_pid_nr(p), ppid,
4577 (unsigned long)task_thread_info(p)->flags);
4578
4579 show_stack(p, NULL);
4580}
4581
4582void show_state_filter(unsigned long state_filter)
4583{
4584 struct task_struct *g, *p;
4585
4586#if BITS_PER_LONG == 32
4587 printk(KERN_INFO
4588 " task PC stack pid father\n");
4589#else
4590 printk(KERN_INFO
4591 " task PC stack pid father\n");
4592#endif
4593 rcu_read_lock();
4594 do_each_thread(g, p) {
4595
4596
4597
4598
4599 touch_nmi_watchdog();
4600 if (!state_filter || (p->state & state_filter))
4601 sched_show_task(p);
4602 } while_each_thread(g, p);
4603
4604 touch_all_softlockup_watchdogs();
4605
4606#ifdef CONFIG_SCHED_DEBUG
4607 sysrq_sched_debug_show();
4608#endif
4609 rcu_read_unlock();
4610
4611
4612
4613 if (!state_filter)
4614 debug_show_all_locks();
4615}
4616
4617void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4618{
4619 idle->sched_class = &idle_sched_class;
4620}
4621
4622/**
4623 * init_idle - set up an idle thread for a given CPU
4624 * @idle: task in question
4625 * @cpu: cpu the idle task belongs to
4626 *
4627 * NOTE: this function does not set the idle thread's NEED_RESCHED
4628 * flag, to make booting more robust.
4629 */
4630void __cpuinit init_idle(struct task_struct *idle, int cpu)
4631{
4632 struct rq *rq = cpu_rq(cpu);
4633 unsigned long flags;
4634
4635 raw_spin_lock_irqsave(&rq->lock, flags);
4636
4637 __sched_fork(idle);
4638 idle->state = TASK_RUNNING;
4639 idle->se.exec_start = sched_clock();
4640
4641 do_set_cpus_allowed(idle, cpumask_of(cpu));
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652 rcu_read_lock();
4653 __set_task_cpu(idle, cpu);
4654 rcu_read_unlock();
4655
4656 rq->curr = rq->idle = idle;
4657#if defined(CONFIG_SMP)
4658 idle->on_cpu = 1;
4659#endif
4660 raw_spin_unlock_irqrestore(&rq->lock, flags);
4661
4662
4663 task_thread_info(idle)->preempt_count = 0;
4664
4665
4666
4667
4668 idle->sched_class = &idle_sched_class;
4669 ftrace_graph_init_idle_task(idle, cpu);
4670#if defined(CONFIG_SMP)
4671 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4672#endif
4673}
4674
4675#ifdef CONFIG_SMP
4676void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4677{
4678 if (p->sched_class && p->sched_class->set_cpus_allowed)
4679 p->sched_class->set_cpus_allowed(p, new_mask);
4680
4681 cpumask_copy(&p->cpus_allowed, new_mask);
4682 p->nr_cpus_allowed = cpumask_weight(new_mask);
4683}
4684
4685/*
4686 * This is how migration works:
4687 *
4688 * 1) we invoke migration_cpu_stop() on the target CPU using
4689 *    stop_one_cpu().
4690 * 2) stopper starts to run (implicitly forcing the migrated thread
4691 *    off the CPU)
4692 * 3) it checks whether the migrated task is still in the wrong runqueue.
4693 * 4) if it's in the wrong runqueue then the migration thread removes
4694 *    it and puts it into the right queue.
4695 * 5) stopper completes and stop_one_cpu() returns and the migration
4696 *    is done.
4697 */
4698
4699/*
4700 * Change a given task's CPU affinity. Migrate the thread to a
4701 * proper CPU and schedule it away if the CPU it's executing on
4702 * is removed from the allowed bitmask.
4703 *
4704 * NOTE: the caller must have a valid reference to the task, the
4705 * task must not exit() & deallocate itself prematurely. The
4706 * call is not atomic; no spinlocks may be held.
4707 */
4708int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4709{
4710 unsigned long flags;
4711 struct rq *rq;
4712 unsigned int dest_cpu;
4713 int ret = 0;
4714
4715 rq = task_rq_lock(p, &flags);
4716
4717 if (cpumask_equal(&p->cpus_allowed, new_mask))
4718 goto out;
4719
4720 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
4721 ret = -EINVAL;
4722 goto out;
4723 }
4724
4725 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
4726 ret = -EINVAL;
4727 goto out;
4728 }
4729
4730 do_set_cpus_allowed(p, new_mask);
4731
4732
4733 if (cpumask_test_cpu(task_cpu(p), new_mask))
4734 goto out;
4735
4736 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4737 if (p->on_rq) {
4738 struct migration_arg arg = { p, dest_cpu };
4739
4740 task_rq_unlock(rq, p, &flags);
4741 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4742 tlb_migrate_finish(p->mm);
4743 return 0;
4744 }
4745out:
4746 task_rq_unlock(rq, p, &flags);
4747
4748 return ret;
4749}
4750EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
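
/*
 * In-kernel usage sketch (illustrative; `tsk` is a task the caller holds a
 * reference on): restrict a helper thread to CPU 3 and later undo it:
 *
 *      set_cpus_allowed_ptr(tsk, cpumask_of(3));
 *      ...
 *      set_cpus_allowed_ptr(tsk, cpu_possible_mask);
 *
 * If the task is currently running on a now-forbidden CPU, the stopper
 * thread (migration_cpu_stop() below) moves it away before this returns.
 */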
4751
4752/*
4753 * Move (not current) task off this cpu, onto dest cpu. We're doing
4754 * this because either it can't run here any more (set_cpus_allowed()
4755 * away from this CPU, or CPU going down), or because we're
4756 * attempting to rebalance this task on exec (sched_exec).
4757 *
4758 * So we race with normal scheduler movements, but that's OK, as long
4759 * as the task is no longer on this CPU.
4760 *
4761 * Returns non-zero if task was successfully migrated.
4762 */
4763static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4764{
4765 struct rq *rq_dest, *rq_src;
4766 int ret = 0;
4767
4768 if (unlikely(!cpu_active(dest_cpu)))
4769 return ret;
4770
4771 rq_src = cpu_rq(src_cpu);
4772 rq_dest = cpu_rq(dest_cpu);
4773
4774 raw_spin_lock(&p->pi_lock);
4775 double_rq_lock(rq_src, rq_dest);
4776
4777 if (task_cpu(p) != src_cpu)
4778 goto done;
4779
4780 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4781 goto fail;
4782
4783
4784
4785
4786
4787 if (p->on_rq) {
4788 dequeue_task(rq_src, p, 0);
4789 set_task_cpu(p, dest_cpu);
4790 enqueue_task(rq_dest, p, 0);
4791 check_preempt_curr(rq_dest, p, 0);
4792 }
4793done:
4794 ret = 1;
4795fail:
4796 double_rq_unlock(rq_src, rq_dest);
4797 raw_spin_unlock(&p->pi_lock);
4798 return ret;
4799}
4800
4801
4802
4803
4804
4805
4806static int migration_cpu_stop(void *data)
4807{
4808 struct migration_arg *arg = data;
4809
4810
4811
4812
4813
4814 local_irq_disable();
4815 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4816 local_irq_enable();
4817 return 0;
4818}
4819
4820#ifdef CONFIG_HOTPLUG_CPU
4821
4822
4823
4824
4825
4826void idle_task_exit(void)
4827{
4828 struct mm_struct *mm = current->active_mm;
4829
4830 BUG_ON(cpu_online(smp_processor_id()));
4831
4832 if (mm != &init_mm)
4833 switch_mm(mm, &init_mm, current);
4834 mmdrop(mm);
4835}
4836
4837
4838
4839
4840
4841
4842
4843
4844static void calc_load_migrate(struct rq *rq)
4845{
4846 long delta = calc_load_fold_active(rq);
4847 if (delta)
4848 atomic_long_add(delta, &calc_load_tasks);
4849}
4850
4851/*
4852 * Migrate all tasks from the rq, sleeping tasks will be migrated by
4853 * try_to_wake_up()->select_task_rq().
4854 *
4855 * Called with rq->lock held even though we're in stop_machine() and
4856 * there's no concurrency possible; we hold the required locks anyway
4857 * because of lock validation efforts.
4858 */
4859static void migrate_tasks(unsigned int dead_cpu)
4860{
4861 struct rq *rq = cpu_rq(dead_cpu);
4862 struct task_struct *next, *stop = rq->stop;
4863 int dest_cpu;
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874 rq->stop = NULL;
4875
4876 for ( ; ; ) {
4877
4878
4879
4880
4881 if (rq->nr_running == 1)
4882 break;
4883
4884 next = pick_next_task(rq);
4885 BUG_ON(!next);
4886 next->sched_class->put_prev_task(rq, next);
4887
4888
4889 dest_cpu = select_fallback_rq(dead_cpu, next);
4890 raw_spin_unlock(&rq->lock);
4891
4892 __migrate_task(next, dead_cpu, dest_cpu);
4893
4894 raw_spin_lock(&rq->lock);
4895 }
4896
4897 rq->stop = stop;
4898}
4899
4900#endif
4901
4902#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
4903
4904static struct ctl_table sd_ctl_dir[] = {
4905 {
4906 .procname = "sched_domain",
4907 .mode = 0555,
4908 },
4909 {}
4910};
4911
4912static struct ctl_table