1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <asm/mmu_context.h>
36#include <linux/interrupt.h>
37#include <linux/capability.h>
38#include <linux/completion.h>
39#include <linux/kernel_stat.h>
40#include <linux/debug_locks.h>
41#include <linux/perf_event.h>
42#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
45#include <linux/freezer.h>
46#include <linux/vmalloc.h>
47#include <linux/blkdev.h>
48#include <linux/delay.h>
49#include <linux/pid_namespace.h>
50#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
57#include <linux/proc_fs.h>
58#include <linux/seq_file.h>
59#include <linux/sysctl.h>
60#include <linux/syscalls.h>
61#include <linux/times.h>
62#include <linux/tsacct_kern.h>
63#include <linux/kprobes.h>
64#include <linux/delayacct.h>
65#include <linux/unistd.h>
66#include <linux/pagemap.h>
67#include <linux/hrtimer.h>
68#include <linux/tick.h>
69#include <linux/debugfs.h>
70#include <linux/ctype.h>
71#include <linux/ftrace.h>
72#include <linux/slab.h>
73#include <linux/init_task.h>
74#include <linux/binfmts.h>
75
76#include <asm/switch_to.h>
77#include <asm/tlb.h>
78#include <asm/irq_regs.h>
79#include <asm/mutex.h>
80#ifdef CONFIG_PARAVIRT
81#include <asm/paravirt.h>
82#endif
83
84#include "sched.h"
85#include "../workqueue_sched.h"
86#include "../smpboot.h"
87
88#define CREATE_TRACE_POINTS
89#include <trace/events/sched.h>
90
91void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
92{
93 unsigned long delta;
94 ktime_t soft, hard, now;
95
96 for (;;) {
97 if (hrtimer_active(period_timer))
98 break;
99
100 now = hrtimer_cb_get_time(period_timer);
101 hrtimer_forward(period_timer, now, period);
102
103 soft = hrtimer_get_softexpires(period_timer);
104 hard = hrtimer_get_expires(period_timer);
105 delta = ktime_to_ns(ktime_sub(hard, soft));
106 __hrtimer_start_range_ns(period_timer, soft, delta,
107 HRTIMER_MODE_ABS_PINNED, 0);
108 }
109}
110
111DEFINE_MUTEX(sched_domains_mutex);
112DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
113
114static void update_rq_clock_task(struct rq *rq, s64 delta);
115
116void update_rq_clock(struct rq *rq)
117{
118 s64 delta;
119
120 if (rq->skip_clock_update > 0)
121 return;
122
123 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
124 rq->clock += delta;
125 update_rq_clock_task(rq, delta);
126}
127
128
129
130
131
132#define SCHED_FEAT(name, enabled) \
133 (1UL << __SCHED_FEAT_##name) * enabled |
134
135const_debug unsigned int sysctl_sched_features =
136#include "features.h"
137 0;
138
139#undef SCHED_FEAT
140
141#ifdef CONFIG_SCHED_DEBUG
/*
 * Table of feature names: each SCHED_FEAT() entry of features.h is
 * stringified, so index i holds the name of feature bit i.
 */
#define SCHED_FEAT(name, enabled)	\
	#name ,

static const char * const sched_feat_names[] = {
#include "features.h"
};

#undef SCHED_FEAT
150
151static int sched_feat_show(struct seq_file *m, void *v)
152{
153 int i;
154
155 for (i = 0; i < __SCHED_FEAT_NR; i++) {
156 if (!(sysctl_sched_features & (1UL << i)))
157 seq_puts(m, "NO_");
158 seq_printf(m, "%s ", sched_feat_names[i]);
159 }
160 seq_puts(m, "\n");
161
162 return 0;
163}
164
165#ifdef HAVE_JUMP_LABEL
166
167#define jump_label_key__true STATIC_KEY_INIT_TRUE
168#define jump_label_key__false STATIC_KEY_INIT_FALSE
169
170#define SCHED_FEAT(name, enabled) \
171 jump_label_key__##enabled ,
172
173struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
174#include "features.h"
175};
176
177#undef SCHED_FEAT
178
179static void sched_feat_disable(int i)
180{
181 if (static_key_enabled(&sched_feat_keys[i]))
182 static_key_slow_dec(&sched_feat_keys[i]);
183}
184
185static void sched_feat_enable(int i)
186{
187 if (!static_key_enabled(&sched_feat_keys[i]))
188 static_key_slow_inc(&sched_feat_keys[i]);
189}
190#else
/*
 * Without jump labels there is no per-feature static key to flip, so these
 * are empty. (The stray ';' that followed each body was an empty file-scope
 * declaration and has been removed.)
 */
static void sched_feat_disable(int i) { }
static void sched_feat_enable(int i) { }
193#endif
194
195static ssize_t
196sched_feat_write(struct file *filp, const char __user *ubuf,
197 size_t cnt, loff_t *ppos)
198{
199 char buf[64];
200 char *cmp;
201 int neg = 0;
202 int i;
203
204 if (cnt > 63)
205 cnt = 63;
206
207 if (copy_from_user(&buf, ubuf, cnt))
208 return -EFAULT;
209
210 buf[cnt] = 0;
211 cmp = strstrip(buf);
212
213 if (strncmp(cmp, "NO_", 3) == 0) {
214 neg = 1;
215 cmp += 3;
216 }
217
218 for (i = 0; i < __SCHED_FEAT_NR; i++) {
219 if (strcmp(cmp, sched_feat_names[i]) == 0) {
220 if (neg) {
221 sysctl_sched_features &= ~(1UL << i);
222 sched_feat_disable(i);
223 } else {
224 sysctl_sched_features |= (1UL << i);
225 sched_feat_enable(i);
226 }
227 break;
228 }
229 }
230
231 if (i == __SCHED_FEAT_NR)
232 return -EINVAL;
233
234 *ppos += cnt;
235
236 return cnt;
237}
238
239static int sched_feat_open(struct inode *inode, struct file *filp)
240{
241 return single_open(filp, sched_feat_show, NULL);
242}
243
244static const struct file_operations sched_feat_fops = {
245 .open = sched_feat_open,
246 .write = sched_feat_write,
247 .read = seq_read,
248 .llseek = seq_lseek,
249 .release = single_release,
250};
251
252static __init int sched_init_debug(void)
253{
254 debugfs_create_file("sched_features", 0644, NULL, NULL,
255 &sched_feat_fops);
256
257 return 0;
258}
259late_initcall(sched_init_debug);
260#endif
261
262
263
264
265
266const_debug unsigned int sysctl_sched_nr_migrate = 32;
267
268
269
270
271
272
273
274const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
275
276
277
278
279
280unsigned int sysctl_sched_rt_period = 1000000;
281
282__read_mostly int scheduler_running;
283
284
285
286
287
288int sysctl_sched_rt_runtime = 950000;
289
290
291
292
293
294
295static inline struct rq *__task_rq_lock(struct task_struct *p)
296 __acquires(rq->lock)
297{
298 struct rq *rq;
299
300 lockdep_assert_held(&p->pi_lock);
301
302 for (;;) {
303 rq = task_rq(p);
304 raw_spin_lock(&rq->lock);
305 if (likely(rq == task_rq(p)))
306 return rq;
307 raw_spin_unlock(&rq->lock);
308 }
309}
310
311
312
313
314static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
315 __acquires(p->pi_lock)
316 __acquires(rq->lock)
317{
318 struct rq *rq;
319
320 for (;;) {
321 raw_spin_lock_irqsave(&p->pi_lock, *flags);
322 rq = task_rq(p);
323 raw_spin_lock(&rq->lock);
324 if (likely(rq == task_rq(p)))
325 return rq;
326 raw_spin_unlock(&rq->lock);
327 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
328 }
329}
330
331static void __task_rq_unlock(struct rq *rq)
332 __releases(rq->lock)
333{
334 raw_spin_unlock(&rq->lock);
335}
336
337static inline void
338task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
339 __releases(rq->lock)
340 __releases(p->pi_lock)
341{
342 raw_spin_unlock(&rq->lock);
343 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
344}
345
346
347
348
349static struct rq *this_rq_lock(void)
350 __acquires(rq->lock)
351{
352 struct rq *rq;
353
354 local_irq_disable();
355 rq = this_rq();
356 raw_spin_lock(&rq->lock);
357
358 return rq;
359}
360
361#ifdef CONFIG_SCHED_HRTICK
362
363
364
365
366
367
368
369
370
371
372
373static void hrtick_clear(struct rq *rq)
374{
375 if (hrtimer_active(&rq->hrtick_timer))
376 hrtimer_cancel(&rq->hrtick_timer);
377}
378
379
380
381
382
383static enum hrtimer_restart hrtick(struct hrtimer *timer)
384{
385 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
386
387 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
388
389 raw_spin_lock(&rq->lock);
390 update_rq_clock(rq);
391 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
392 raw_spin_unlock(&rq->lock);
393
394 return HRTIMER_NORESTART;
395}
396
397#ifdef CONFIG_SMP
398
399
400
401static void __hrtick_start(void *arg)
402{
403 struct rq *rq = arg;
404
405 raw_spin_lock(&rq->lock);
406 hrtimer_restart(&rq->hrtick_timer);
407 rq->hrtick_csd_pending = 0;
408 raw_spin_unlock(&rq->lock);
409}
410
411
412
413
414
415
416void hrtick_start(struct rq *rq, u64 delay)
417{
418 struct hrtimer *timer = &rq->hrtick_timer;
419 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
420
421 hrtimer_set_expires(timer, time);
422
423 if (rq == this_rq()) {
424 hrtimer_restart(timer);
425 } else if (!rq->hrtick_csd_pending) {
426 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
427 rq->hrtick_csd_pending = 1;
428 }
429}
430
431static int
432hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
433{
434 int cpu = (int)(long)hcpu;
435
436 switch (action) {
437 case CPU_UP_CANCELED:
438 case CPU_UP_CANCELED_FROZEN:
439 case CPU_DOWN_PREPARE:
440 case CPU_DOWN_PREPARE_FROZEN:
441 case CPU_DEAD:
442 case CPU_DEAD_FROZEN:
443 hrtick_clear(cpu_rq(cpu));
444 return NOTIFY_OK;
445 }
446
447 return NOTIFY_DONE;
448}
449
450static __init void init_hrtick(void)
451{
452 hotcpu_notifier(hotplug_hrtick, 0);
453}
454#else
455
456
457
458
459
460void hrtick_start(struct rq *rq, u64 delay)
461{
462 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
463 HRTIMER_MODE_REL_PINNED, 0);
464}
465
/* Non-SMP variant: there is no hotplug notifier to register. */
static inline void init_hrtick(void)
{
	/* intentionally empty */
}
469#endif
470
471static void init_rq_hrtick(struct rq *rq)
472{
473#ifdef CONFIG_SMP
474 rq->hrtick_csd_pending = 0;
475
476 rq->hrtick_csd.flags = 0;
477 rq->hrtick_csd.func = __hrtick_start;
478 rq->hrtick_csd.info = rq;
479#endif
480
481 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
482 rq->hrtick_timer.function = hrtick;
483}
484#else
/* !CONFIG_SCHED_HRTICK: the hrtick helpers compile to nothing. */
static inline void hrtick_clear(struct rq *rq)
{
	/* no hrtick timer to cancel */
}

static inline void init_rq_hrtick(struct rq *rq)
{
	/* no per-rq hrtick state */
}

static inline void init_hrtick(void)
{
	/* nothing to register */
}
496#endif
497
498
499
500
501
502
503
504
505#ifdef CONFIG_SMP
506
/* Fallback when no tsk_is_polling() is provided: no task is ever polling. */
#ifndef tsk_is_polling
# define tsk_is_polling(t) 0
#endif
510
511void resched_task(struct task_struct *p)
512{
513 int cpu;
514
515 assert_raw_spin_locked(&task_rq(p)->lock);
516
517 if (test_tsk_need_resched(p))
518 return;
519
520 set_tsk_need_resched(p);
521
522 cpu = task_cpu(p);
523 if (cpu == smp_processor_id())
524 return;
525
526
527 smp_mb();
528 if (!tsk_is_polling(p))
529 smp_send_reschedule(cpu);
530}
531
532void resched_cpu(int cpu)
533{
534 struct rq *rq = cpu_rq(cpu);
535 unsigned long flags;
536
537 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
538 return;
539 resched_task(cpu_curr(cpu));
540 raw_spin_unlock_irqrestore(&rq->lock, flags);
541}
542
543#ifdef CONFIG_NO_HZ
544
545
546
547
548
549
550
551
552int get_nohz_timer_target(void)
553{
554 int cpu = smp_processor_id();
555 int i;
556 struct sched_domain *sd;
557
558 rcu_read_lock();
559 for_each_domain(cpu, sd) {
560 for_each_cpu(i, sched_domain_span(sd)) {
561 if (!idle_cpu(i)) {
562 cpu = i;
563 goto unlock;
564 }
565 }
566 }
567unlock:
568 rcu_read_unlock();
569 return cpu;
570}
571
572
573
574
575
576
577
578
579
580
581void wake_up_idle_cpu(int cpu)
582{
583 struct rq *rq = cpu_rq(cpu);
584
585 if (cpu == smp_processor_id())
586 return;
587
588
589
590
591
592
593
594
595 if (rq->curr != rq->idle)
596 return;
597
598
599
600
601
602
603 set_tsk_need_resched(rq->idle);
604
605
606 smp_mb();
607 if (!tsk_is_polling(rq->idle))
608 smp_send_reschedule(cpu);
609}
610
611static inline bool got_nohz_idle_kick(void)
612{
613 int cpu = smp_processor_id();
614 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
615}
616
617#else
618
/* !CONFIG_NO_HZ: a nohz idle kick can never have been received. */
static inline bool got_nohz_idle_kick(void)
{
	bool kicked = false;

	return kicked;
}
623
624#endif
625
626void sched_avg_update(struct rq *rq)
627{
628 s64 period = sched_avg_period();
629
630 while ((s64)(rq->clock - rq->age_stamp) > period) {
631
632
633
634
635
636 asm("" : "+rm" (rq->age_stamp));
637 rq->age_stamp += period;
638 rq->rt_avg /= 2;
639 }
640}
641
642#else
643void resched_task(struct task_struct *p)
644{
645 assert_raw_spin_locked(&task_rq(p)->lock);
646 set_tsk_need_resched(p);
647}
648#endif
649
650#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
651 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
652
653
654
655
656
657
658int walk_tg_tree_from(struct task_group *from,
659 tg_visitor down, tg_visitor up, void *data)
660{
661 struct task_group *parent, *child;
662 int ret;
663
664 parent = from;
665
666down:
667 ret = (*down)(parent, data);
668 if (ret)
669 goto out;
670 list_for_each_entry_rcu(child, &parent->children, siblings) {
671 parent = child;
672 goto down;
673
674up:
675 continue;
676 }
677 ret = (*up)(parent, data);
678 if (ret || parent == from)
679 goto out;
680
681 child = parent;
682 parent = parent->parent;
683 if (parent)
684 goto up;
685out:
686 return ret;
687}
688
/* Visitor that does nothing and always reports success (0). */
int tg_nop(struct task_group *tg, void *data)
{
	(void)tg;
	(void)data;

	return 0;
}
693#endif
694
695static void set_load_weight(struct task_struct *p)
696{
697 int prio = p->static_prio - MAX_RT_PRIO;
698 struct load_weight *load = &p->se.load;
699
700
701
702
703 if (p->policy == SCHED_IDLE) {
704 load->weight = scale_load(WEIGHT_IDLEPRIO);
705 load->inv_weight = WMULT_IDLEPRIO;
706 return;
707 }
708
709 load->weight = scale_load(prio_to_weight[prio]);
710 load->inv_weight = prio_to_wmult[prio];
711}
712
713static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
714{
715 update_rq_clock(rq);
716 sched_info_queued(p);
717 p->sched_class->enqueue_task(rq, p, flags);
718}
719
720static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
721{
722 update_rq_clock(rq);
723 sched_info_dequeued(p);
724 p->sched_class->dequeue_task(rq, p, flags);
725}
726
727void activate_task(struct rq *rq, struct task_struct *p, int flags)
728{
729 if (task_contributes_to_load(p))
730 rq->nr_uninterruptible--;
731
732 enqueue_task(rq, p, flags);
733}
734
735void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
736{
737 if (task_contributes_to_load(p))
738 rq->nr_uninterruptible++;
739
740 dequeue_task(rq, p, flags);
741}
742
743static void update_rq_clock_task(struct rq *rq, s64 delta)
744{
745
746
747
748
749#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
750 s64 steal = 0, irq_delta = 0;
751#endif
752#ifdef CONFIG_IRQ_TIME_ACCOUNTING
753 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770 if (irq_delta > delta)
771 irq_delta = delta;
772
773 rq->prev_irq_time += irq_delta;
774 delta -= irq_delta;
775#endif
776#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
777 if (static_key_false((¶virt_steal_rq_enabled))) {
778 u64 st;
779
780 steal = paravirt_steal_clock(cpu_of(rq));
781 steal -= rq->prev_steal_time_rq;
782
783 if (unlikely(steal > delta))
784 steal = delta;
785
786 st = steal_ticks(steal);
787 steal = st * TICK_NSEC;
788
789 rq->prev_steal_time_rq += steal;
790
791 delta -= steal;
792 }
793#endif
794
795 rq->clock_task += delta;
796
797#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
798 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
799 sched_rt_avg_update(rq, irq_delta + steal);
800#endif
801}
802
803void sched_set_stop_task(int cpu, struct task_struct *stop)
804{
805 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
806 struct task_struct *old_stop = cpu_rq(cpu)->stop;
807
808 if (stop) {
809
810
811
812
813
814
815
816
817 sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m);
818
819 stop->sched_class = &stop_sched_class;
820 }
821
822 cpu_rq(cpu)->stop = stop;
823
824 if (old_stop) {
825
826
827
828
829 old_stop->sched_class = &rt_sched_class;
830 }
831}
832
833
834
835
836static inline int __normal_prio(struct task_struct *p)
837{
838 return p->static_prio;
839}
840
841
842
843
844
845
846
847
848static inline int normal_prio(struct task_struct *p)
849{
850 int prio;
851
852 if (task_has_rt_policy(p))
853 prio = MAX_RT_PRIO-1 - p->rt_priority;
854 else
855 prio = __normal_prio(p);
856 return prio;
857}
858
859
860
861
862
863
864
865
866static int effective_prio(struct task_struct *p)
867{
868 p->normal_prio = normal_prio(p);
869
870
871
872
873
874 if (!rt_prio(p->prio))
875 return p->normal_prio;
876 return p->prio;
877}
878
879
880
881
882
/* Non-zero when @p is the task currently running on the CPU it is assigned to. */
inline int task_curr(const struct task_struct *p)
{
	int cpu = task_cpu(p);

	return cpu_curr(cpu) == p;
}
887
888static inline void check_class_changed(struct rq *rq, struct task_struct *p,
889 const struct sched_class *prev_class,
890 int oldprio)
891{
892 if (prev_class != p->sched_class) {
893 if (prev_class->switched_from)
894 prev_class->switched_from(rq, p);
895 p->sched_class->switched_to(rq, p);
896 } else if (oldprio != p->prio)
897 p->sched_class->prio_changed(rq, p, oldprio);
898}
899
900void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
901{
902 const struct sched_class *class;
903
904 if (p->sched_class == rq->curr->sched_class) {
905 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
906 } else {
907 for_each_class(class) {
908 if (class == rq->curr->sched_class)
909 break;
910 if (class == p->sched_class) {
911 resched_task(rq->curr);
912 break;
913 }
914 }
915 }
916
917
918
919
920
921 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
922 rq->skip_clock_update = 1;
923}
924
925#ifdef CONFIG_SMP
926void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
927{
928#ifdef CONFIG_SCHED_DEBUG
929
930
931
932
933 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
934 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
935
936#ifdef CONFIG_LOCKDEP
937
938
939
940
941
942
943
944
945
946
947 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
948 lockdep_is_held(&task_rq(p)->lock)));
949#endif
950#endif
951
952 trace_sched_migrate_task(p, new_cpu);
953
954 if (task_cpu(p) != new_cpu) {
955 p->se.nr_migrations++;
956 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
957 }
958
959 __set_task_cpu(p, new_cpu);
960}
961
/* Argument bundle for a migration request: which task, and to which CPU. */
struct migration_arg {
	struct task_struct *task;
	int dest_cpu;
};

/* Defined outside this part of the file. */
static int migration_cpu_stop(void *data);
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985unsigned long wait_task_inactive(struct task_struct *p, long match_state)
986{
987 unsigned long flags;
988 int running, on_rq;
989 unsigned long ncsw;
990 struct rq *rq;
991
992 for (;;) {
993
994
995
996
997
998
999 rq = task_rq(p);
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012 while (task_running(rq, p)) {
1013 if (match_state && unlikely(p->state != match_state))
1014 return 0;
1015 cpu_relax();
1016 }
1017
1018
1019
1020
1021
1022
1023 rq = task_rq_lock(p, &flags);
1024 trace_sched_wait_task(p);
1025 running = task_running(rq, p);
1026 on_rq = p->on_rq;
1027 ncsw = 0;
1028 if (!match_state || p->state == match_state)
1029 ncsw = p->nvcsw | LONG_MIN;
1030 task_rq_unlock(rq, p, &flags);
1031
1032
1033
1034
1035 if (unlikely(!ncsw))
1036 break;
1037
1038
1039
1040
1041
1042
1043
1044 if (unlikely(running)) {
1045 cpu_relax();
1046 continue;
1047 }
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058 if (unlikely(on_rq)) {
1059 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1060
1061 set_current_state(TASK_UNINTERRUPTIBLE);
1062 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1063 continue;
1064 }
1065
1066
1067
1068
1069
1070
1071 break;
1072 }
1073
1074 return ncsw;
1075}
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
/*
 * kick_process - send a reschedule IPI to the CPU @p is running on.
 *
 * The IPI is only sent when @p is on a CPU other than the current one and
 * is that CPU's current task. Preemption is disabled around the check.
 */
void kick_process(struct task_struct *p)
{
	int target;

	preempt_disable();
	target = task_cpu(p);
	if (target != smp_processor_id()) {
		if (task_curr(p))
			smp_send_reschedule(target);
	}
	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
1101#endif
1102
1103#ifdef CONFIG_SMP
1104
1105
1106
1107static int select_fallback_rq(int cpu, struct task_struct *p)
1108{
1109 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
1110 enum { cpuset, possible, fail } state = cpuset;
1111 int dest_cpu;
1112
1113
1114 for_each_cpu(dest_cpu, nodemask) {
1115 if (!cpu_online(dest_cpu))
1116 continue;
1117 if (!cpu_active(dest_cpu))
1118 continue;
1119 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1120 return dest_cpu;
1121 }
1122
1123 for (;;) {
1124
1125 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1126 if (!cpu_online(dest_cpu))
1127 continue;
1128 if (!cpu_active(dest_cpu))
1129 continue;
1130 goto out;
1131 }
1132
1133 switch (state) {
1134 case cpuset:
1135
1136 cpuset_cpus_allowed_fallback(p);
1137 state = possible;
1138 break;
1139
1140 case possible:
1141 do_set_cpus_allowed(p, cpu_possible_mask);
1142 state = fail;
1143 break;
1144
1145 case fail:
1146 BUG();
1147 break;
1148 }
1149 }
1150
1151out:
1152 if (state != cpuset) {
1153
1154
1155
1156
1157
1158 if (p->mm && printk_ratelimit()) {
1159 printk_sched("process %d (%s) no longer affine to cpu%d\n",
1160 task_pid_nr(p), p->comm, cpu);
1161 }
1162 }
1163
1164 return dest_cpu;
1165}
1166
1167
1168
1169
1170static inline
1171int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
1172{
1173 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1186 !cpu_online(cpu)))
1187 cpu = select_fallback_rq(task_cpu(p), p);
1188
1189 return cpu;
1190}
1191
1192static void update_avg(u64 *avg, u64 sample)
1193{
1194 s64 diff = sample - *avg;
1195 *avg += diff >> 3;
1196}
1197#endif
1198
/*
 * ttwu_stat - wakeup statistics; compiles to nothing without
 * CONFIG_SCHEDSTATS.
 * @p:          task that was woken
 * @cpu:        CPU the task was woken on
 * @wake_flags: WF_* flags of the wakeup
 *
 * On SMP, local and remote wakeups are counted separately. For a remote
 * wakeup, the first sched domain of the current CPU that spans @cpu gets
 * its ttwu_wake_remote counter bumped. WF_MIGRATED and WF_SYNC wakeups have
 * their own counters.
 */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
#ifdef CONFIG_SCHEDSTATS
	struct rq *rq = this_rq();

#ifdef CONFIG_SMP
	int this_cpu = smp_processor_id();

	if (cpu != this_cpu) {
		struct sched_domain *sd;

		schedstat_inc(p, se.statistics.nr_wakeups_remote);
		rcu_read_lock();
		for_each_domain(this_cpu, sd) {
			if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
				continue;
			schedstat_inc(sd, ttwu_wake_remote);
			break;
		}
		rcu_read_unlock();
	} else {
		schedstat_inc(rq, ttwu_local);
		schedstat_inc(p, se.statistics.nr_wakeups_local);
	}

	if (wake_flags & WF_MIGRATED)
		schedstat_inc(p, se.statistics.nr_wakeups_migrate);

#endif /* CONFIG_SMP */

	schedstat_inc(rq, ttwu_count);
	schedstat_inc(p, se.statistics.nr_wakeups);

	if (wake_flags & WF_SYNC)
		schedstat_inc(p, se.statistics.nr_wakeups_sync);

#endif /* CONFIG_SCHEDSTATS */
}
1238
1239static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1240{
1241 activate_task(rq, p, en_flags);
1242 p->on_rq = 1;
1243
1244
1245 if (p->flags & PF_WQ_WORKER)
1246 wq_worker_waking_up(p, cpu_of(rq));
1247}
1248
1249
1250
1251
1252static void
1253ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1254{
1255 trace_sched_wakeup(p, true);
1256 check_preempt_curr(rq, p, wake_flags);
1257
1258 p->state = TASK_RUNNING;
1259#ifdef CONFIG_SMP
1260 if (p->sched_class->task_woken)
1261 p->sched_class->task_woken(rq, p);
1262
1263 if (rq->idle_stamp) {
1264 u64 delta = rq->clock - rq->idle_stamp;
1265 u64 max = 2*sysctl_sched_migration_cost;
1266
1267 if (delta > max)
1268 rq->avg_idle = max;
1269 else
1270 update_avg(&rq->avg_idle, delta);
1271 rq->idle_stamp = 0;
1272 }
1273#endif
1274}
1275
1276static void
1277ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1278{
1279#ifdef CONFIG_SMP
1280 if (p->sched_contributes_to_load)
1281 rq->nr_uninterruptible--;
1282#endif
1283
1284 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
1285 ttwu_do_wakeup(rq, p, wake_flags);
1286}
1287
1288
1289
1290
1291
1292
1293
1294static int ttwu_remote(struct task_struct *p, int wake_flags)
1295{
1296 struct rq *rq;
1297 int ret = 0;
1298
1299 rq = __task_rq_lock(p);
1300 if (p->on_rq) {
1301 ttwu_do_wakeup(rq, p, wake_flags);
1302 ret = 1;
1303 }
1304 __task_rq_unlock(rq);
1305
1306 return ret;
1307}
1308
1309#ifdef CONFIG_SMP
1310static void sched_ttwu_pending(void)
1311{
1312 struct rq *rq = this_rq();
1313 struct llist_node *llist = llist_del_all(&rq->wake_list);
1314 struct task_struct *p;
1315
1316 raw_spin_lock(&rq->lock);
1317
1318 while (llist) {
1319 p = llist_entry(llist, struct task_struct, wake_entry);
1320 llist = llist_next(llist);
1321 ttwu_do_activate(rq, p, 0);
1322 }
1323
1324 raw_spin_unlock(&rq->lock);
1325}
1326
1327void scheduler_ipi(void)
1328{
1329 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1330 return;
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345 irq_enter();
1346 sched_ttwu_pending();
1347
1348
1349
1350
1351 if (unlikely(got_nohz_idle_kick() && !need_resched())) {
1352 this_rq()->idle_balance = 1;
1353 raise_softirq_irqoff(SCHED_SOFTIRQ);
1354 }
1355 irq_exit();
1356}
1357
1358static void ttwu_queue_remote(struct task_struct *p, int cpu)
1359{
1360 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
1361 smp_send_reschedule(cpu);
1362}
1363
1364bool cpus_share_cache(int this_cpu, int that_cpu)
1365{
1366 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1367}
1368#endif
1369
1370static void ttwu_queue(struct task_struct *p, int cpu)
1371{
1372 struct rq *rq = cpu_rq(cpu);
1373
1374#if defined(CONFIG_SMP)
1375 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1376 sched_clock_cpu(cpu);
1377 ttwu_queue_remote(p, cpu);
1378 return;
1379 }
1380#endif
1381
1382 raw_spin_lock(&rq->lock);
1383 ttwu_do_activate(rq, p, 0);
1384 raw_spin_unlock(&rq->lock);
1385}
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402static int
1403try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1404{
1405 unsigned long flags;
1406 int cpu, success = 0;
1407
1408 smp_wmb();
1409 raw_spin_lock_irqsave(&p->pi_lock, flags);
1410 if (!(p->state & state))
1411 goto out;
1412
1413 success = 1;
1414 cpu = task_cpu(p);
1415
1416 if (p->on_rq && ttwu_remote(p, wake_flags))
1417 goto stat;
1418
1419#ifdef CONFIG_SMP
1420
1421
1422
1423
1424 while (p->on_cpu)
1425 cpu_relax();
1426
1427
1428
1429 smp_rmb();
1430
1431 p->sched_contributes_to_load = !!task_contributes_to_load(p);
1432 p->state = TASK_WAKING;
1433
1434 if (p->sched_class->task_waking)
1435 p->sched_class->task_waking(p);
1436
1437 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
1438 if (task_cpu(p) != cpu) {
1439 wake_flags |= WF_MIGRATED;
1440 set_task_cpu(p, cpu);
1441 }
1442#endif
1443
1444 ttwu_queue(p, cpu);
1445stat:
1446 ttwu_stat(p, cpu, wake_flags);
1447out:
1448 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1449
1450 return success;
1451}
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461static void try_to_wake_up_local(struct task_struct *p)
1462{
1463 struct rq *rq = task_rq(p);
1464
1465 BUG_ON(rq != this_rq());
1466 BUG_ON(p == current);
1467 lockdep_assert_held(&rq->lock);
1468
1469 if (!raw_spin_trylock(&p->pi_lock)) {
1470 raw_spin_unlock(&rq->lock);
1471 raw_spin_lock(&p->pi_lock);
1472 raw_spin_lock(&rq->lock);
1473 }
1474
1475 if (!(p->state & TASK_NORMAL))
1476 goto out;
1477
1478 if (!p->on_rq)
1479 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1480
1481 ttwu_do_wakeup(rq, p, 0);
1482 ttwu_stat(p, smp_processor_id(), 0);
1483out:
1484 raw_spin_unlock(&p->pi_lock);
1485}
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498int wake_up_process(struct task_struct *p)
1499{
1500 return try_to_wake_up(p, TASK_ALL, 0);
1501}
1502EXPORT_SYMBOL(wake_up_process);
1503
/* Wake @p only if it is in one of the states in @state; see try_to_wake_up(). */
int wake_up_state(struct task_struct *p, unsigned int state)
{
	int woken = try_to_wake_up(p, state, 0);

	return woken;
}
1508
1509
1510
1511
1512
1513
1514
1515static void __sched_fork(struct task_struct *p)
1516{
1517 p->on_rq = 0;
1518
1519 p->se.on_rq = 0;
1520 p->se.exec_start = 0;
1521 p->se.sum_exec_runtime = 0;
1522 p->se.prev_sum_exec_runtime = 0;
1523 p->se.nr_migrations = 0;
1524 p->se.vruntime = 0;
1525 INIT_LIST_HEAD(&p->se.group_node);
1526
1527#ifdef CONFIG_SCHEDSTATS
1528 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1529#endif
1530
1531 INIT_LIST_HEAD(&p->rt.run_list);
1532
1533#ifdef CONFIG_PREEMPT_NOTIFIERS
1534 INIT_HLIST_HEAD(&p->preempt_notifiers);
1535#endif
1536}
1537
1538
1539
1540
1541void sched_fork(struct task_struct *p)
1542{
1543 unsigned long flags;
1544 int cpu = get_cpu();
1545
1546 __sched_fork(p);
1547
1548
1549
1550
1551
1552 p->state = TASK_RUNNING;
1553
1554
1555
1556
1557 p->prio = current->normal_prio;
1558
1559
1560
1561
1562 if (unlikely(p->sched_reset_on_fork)) {
1563 if (task_has_rt_policy(p)) {
1564 p->policy = SCHED_NORMAL;
1565 p->static_prio = NICE_TO_PRIO(0);
1566 p->rt_priority = 0;
1567 } else if (PRIO_TO_NICE(p->static_prio) < 0)
1568 p->static_prio = NICE_TO_PRIO(0);
1569
1570 p->prio = p->normal_prio = __normal_prio(p);
1571 set_load_weight(p);
1572
1573
1574
1575
1576
1577 p->sched_reset_on_fork = 0;
1578 }
1579
1580 if (!rt_prio(p->prio))
1581 p->sched_class = &fair_sched_class;
1582
1583 if (p->sched_class->task_fork)
1584 p->sched_class->task_fork(p);
1585
1586
1587
1588
1589
1590
1591
1592
1593 raw_spin_lock_irqsave(&p->pi_lock, flags);
1594 set_task_cpu(p, cpu);
1595 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1596
1597#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1598 if (likely(sched_info_on()))
1599 memset(&p->sched_info, 0, sizeof(p->sched_info));
1600#endif
1601#if defined(CONFIG_SMP)
1602 p->on_cpu = 0;
1603#endif
1604#ifdef CONFIG_PREEMPT_COUNT
1605
1606 task_thread_info(p)->preempt_count = 1;
1607#endif
1608#ifdef CONFIG_SMP
1609 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1610#endif
1611
1612 put_cpu();
1613}
1614
1615
1616
1617
1618
1619
1620
1621
1622void wake_up_new_task(struct task_struct *p)
1623{
1624 unsigned long flags;
1625 struct rq *rq;
1626
1627 raw_spin_lock_irqsave(&p->pi_lock, flags);
1628#ifdef CONFIG_SMP
1629
1630
1631
1632
1633
1634 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
1635#endif
1636
1637 rq = __task_rq_lock(p);
1638 activate_task(rq, p, 0);
1639 p->on_rq = 1;
1640 trace_sched_wakeup_new(p, true);
1641 check_preempt_curr(rq, p, WF_FORK);
1642#ifdef CONFIG_SMP
1643 if (p->sched_class->task_woken)
1644 p->sched_class->task_woken(rq, p);
1645#endif
1646 task_rq_unlock(rq, p, &flags);
1647}
1648
1649#ifdef CONFIG_PREEMPT_NOTIFIERS
1650
1651
1652
1653
1654
1655void preempt_notifier_register(struct preempt_notifier *notifier)
1656{
1657 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers);
1658}
1659EXPORT_SYMBOL_GPL(preempt_notifier_register);
1660
1661
1662
1663
1664
1665
1666
1667void preempt_notifier_unregister(struct preempt_notifier *notifier)
1668{
1669 hlist_del(¬ifier->link);
1670}
1671EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1672
1673static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1674{
1675 struct preempt_notifier *notifier;
1676 struct hlist_node *node;
1677
1678 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1679 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1680}
1681
1682static void
1683fire_sched_out_preempt_notifiers(struct task_struct *curr,
1684 struct task_struct *next)
1685{
1686 struct preempt_notifier *notifier;
1687 struct hlist_node *node;
1688
1689 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1690 notifier->ops->sched_out(notifier, next);
1691}
1692
1693#else
1694
/* !CONFIG_PREEMPT_NOTIFIERS: there are no notifiers to call. */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	/* intentionally empty */
}

static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	/* intentionally empty */
}
1704
1705#endif
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
/*
 * prepare_task_switch - bookkeeping done before switching from @prev to
 * @next.
 *
 * In order: switch tracepoint, sched_info accounting, perf sched-out,
 * sched-out preempt notifiers, then the lock and architecture switch
 * preparation hooks.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	/* tracing and accounting for the outgoing task */
	trace_sched_switch(prev, next);
	sched_info_switch(prev, next);
	perf_event_task_sched_out(prev, next);
	fire_sched_out_preempt_notifiers(prev, next);

	/* lock and architecture preparation for the incoming task */
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1748 __releases(rq->lock)
1749{
1750 struct mm_struct *mm = rq->prev_mm;
1751 long prev_state;
1752
1753 rq->prev_mm = NULL;
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766 prev_state = prev->state;
1767 vtime_task_switch(prev);
1768 finish_arch_switch(prev);
1769 perf_event_task_sched_in(prev, current);
1770 finish_lock_switch(rq, prev);
1771 finish_arch_post_lock_switch();
1772
1773 fire_sched_in_preempt_notifiers(current);
1774 if (mm)
1775 mmdrop(mm);
1776 if (unlikely(prev_state == TASK_DEAD)) {
1777
1778
1779
1780
1781 kprobe_flush_task(prev);
1782 put_task_struct(prev);
1783 }
1784}
1785
1786#ifdef CONFIG_SMP
1787
1788
1789static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
1790{
1791 if (prev->sched_class->pre_schedule)
1792 prev->sched_class->pre_schedule(rq, prev);
1793}
1794
1795
1796static inline void post_schedule(struct rq *rq)
1797{
1798 if (rq->post_schedule) {
1799 unsigned long flags;
1800
1801 raw_spin_lock_irqsave(&rq->lock, flags);
1802 if (rq->curr->sched_class->post_schedule)
1803 rq->curr->sched_class->post_schedule(rq);
1804 raw_spin_unlock_irqrestore(&rq->lock, flags);
1805
1806 rq->post_schedule = 0;
1807 }
1808}
1809
1810#else
1811
/* !CONFIG_SMP: no pre-schedule work. */
static inline void pre_schedule(struct rq *rq, struct task_struct *p)
{
	/* intentionally empty */
}
1815
/* !CONFIG_SMP: no post-schedule work. */
static inline void post_schedule(struct rq *rq)
{
	/* intentionally empty */
}
1819
1820#endif
1821
1822
1823
1824
1825
1826asmlinkage void schedule_tail(struct task_struct *prev)
1827 __releases(rq->lock)
1828{
1829 struct rq *rq = this_rq();
1830
1831 finish_task_switch(rq, prev);
1832
1833
1834
1835
1836
1837 post_schedule(rq);
1838
1839#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1840
1841 preempt_enable();
1842#endif
1843 if (current->set_child_tid)
1844 put_user(task_pid_vnr(current), current->set_child_tid);
1845}
1846
1847
1848
1849
1850
1851static inline void
1852context_switch(struct rq *rq, struct task_struct *prev,
1853 struct task_struct *next)
1854{
1855 struct mm_struct *mm, *oldmm;
1856
1857 prepare_task_switch(rq, prev, next);
1858
1859 mm = next->mm;
1860 oldmm = prev->active_mm;
1861
1862
1863
1864
1865
1866 arch_start_context_switch(prev);
1867
1868 if (!mm) {
1869 next->active_mm = oldmm;
1870 atomic_inc(&oldmm->mm_count);
1871 enter_lazy_tlb(oldmm, next);
1872 } else
1873 switch_mm(oldmm, mm, next);
1874
1875 if (!prev->mm) {
1876 prev->active_mm = NULL;
1877 rq->prev_mm = oldmm;
1878 }
1879
1880
1881
1882
1883
1884
1885#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1886 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1887#endif
1888
1889
1890 rcu_switch(prev, next);
1891 switch_to(prev, next, prev);
1892
1893 barrier();
1894
1895
1896
1897
1898
1899 finish_task_switch(this_rq(), prev);
1900}
1901
1902
1903
1904
1905
1906
1907
1908
1909unsigned long nr_running(void)
1910{
1911 unsigned long i, sum = 0;
1912
1913 for_each_online_cpu(i)
1914 sum += cpu_rq(i)->nr_running;
1915
1916 return sum;
1917}
1918
1919unsigned long nr_uninterruptible(void)
1920{
1921 unsigned long i, sum = 0;
1922
1923 for_each_possible_cpu(i)
1924 sum += cpu_rq(i)->nr_uninterruptible;
1925
1926
1927
1928
1929
1930 if (unlikely((long)sum < 0))
1931 sum = 0;
1932
1933 return sum;
1934}
1935
1936unsigned long long nr_context_switches(void)
1937{
1938 int i;
1939 unsigned long long sum = 0;
1940
1941 for_each_possible_cpu(i)
1942 sum += cpu_rq(i)->nr_switches;
1943
1944 return sum;
1945}
1946
1947unsigned long nr_iowait(void)
1948{
1949 unsigned long i, sum = 0;
1950
1951 for_each_possible_cpu(i)
1952 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1953
1954 return sum;
1955}
1956
1957unsigned long nr_iowait_cpu(int cpu)
1958{
1959 struct rq *this = cpu_rq(cpu);
1960 return atomic_read(&this->nr_iowait);
1961}
1962
1963unsigned long this_cpu_load(void)
1964{
1965 struct rq *this = this_rq();
1966 return this->cpu_load[0];
1967}
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018static atomic_long_t calc_load_tasks;
2019static unsigned long calc_load_update;
2020unsigned long avenrun[3];
2021EXPORT_SYMBOL(avenrun);
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2032{
2033 loads[0] = (avenrun[0] + offset) << shift;
2034 loads[1] = (avenrun[1] + offset) << shift;
2035 loads[2] = (avenrun[2] + offset) << shift;
2036}
2037
2038static long calc_load_fold_active(struct rq *this_rq)
2039{
2040 long nr_active, delta = 0;
2041
2042 nr_active = this_rq->nr_running;
2043 nr_active += (long) this_rq->nr_uninterruptible;
2044
2045 if (nr_active != this_rq->calc_load_active) {
2046 delta = nr_active - this_rq->calc_load_active;
2047 this_rq->calc_load_active = nr_active;
2048 }
2049
2050 return delta;
2051}
2052
2053
2054
2055
2056static unsigned long
2057calc_load(unsigned long load, unsigned long exp, unsigned long active)
2058{
2059 load *= exp;
2060 load += active * (FIXED_1 - exp);
2061 load += 1UL << (FSHIFT - 1);
2062 return load >> FSHIFT;
2063}
2064
2065#ifdef CONFIG_NO_HZ
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108static atomic_long_t calc_load_idle[2];
2109static int calc_load_idx;
2110
2111static inline int calc_load_write_idx(void)
2112{
2113 int idx = calc_load_idx;
2114
2115
2116
2117
2118
2119 smp_rmb();
2120
2121
2122
2123
2124
2125 if (!time_before(jiffies, calc_load_update))
2126 idx++;
2127
2128 return idx & 1;
2129}
2130
2131static inline int calc_load_read_idx(void)
2132{
2133 return calc_load_idx & 1;
2134}
2135
2136void calc_load_enter_idle(void)
2137{
2138 struct rq *this_rq = this_rq();
2139 long delta;
2140
2141
2142
2143
2144
2145 delta = calc_load_fold_active(this_rq);
2146 if (delta) {
2147 int idx = calc_load_write_idx();
2148 atomic_long_add(delta, &calc_load_idle[idx]);
2149 }
2150}
2151
2152void calc_load_exit_idle(void)
2153{
2154 struct rq *this_rq = this_rq();
2155
2156
2157
2158
2159 if (time_before(jiffies, this_rq->calc_load_update))
2160 return;
2161
2162
2163
2164
2165
2166
2167 this_rq->calc_load_update = calc_load_update;
2168 if (time_before(jiffies, this_rq->calc_load_update + 10))
2169 this_rq->calc_load_update += LOAD_FREQ;
2170}
2171
2172static long calc_load_fold_idle(void)
2173{
2174 int idx = calc_load_read_idx();
2175 long delta = 0;
2176
2177 if (atomic_long_read(&calc_load_idle[idx]))
2178 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2179
2180 return delta;
2181}
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
/**
 * fixed_power_int - compute x^n in fixed point by repeated squaring
 * @x:         base, fixed-point with @frac_bits fractional bits
 * @frac_bits: number of fractional bits
 * @n:         integer exponent
 *
 * Each multiplication is rounded to nearest by adding half an LSB before
 * shifting back down.  Returns 1.0 (1 << @frac_bits) for @n == 0.
 */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long acc = 1UL << frac_bits;

	while (n) {
		/* Multiply the accumulator in when the low exponent bit is set. */
		if (n & 1) {
			acc *= x;
			acc += 1UL << (frac_bits - 1);
			acc >>= frac_bits;
		}

		n >>= 1;
		if (!n)
			break;

		/* More exponent bits remain: square the base. */
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}

	return acc;
}
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243static unsigned long
2244calc_load_n(unsigned long load, unsigned long exp,
2245 unsigned long active, unsigned int n)
2246{
2247
2248 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
2249}
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260static void calc_global_nohz(void)
2261{
2262 long delta, active, n;
2263
2264 if (!time_before(jiffies, calc_load_update + 10)) {
2265
2266
2267
2268 delta = jiffies - calc_load_update - 10;
2269 n = 1 + (delta / LOAD_FREQ);
2270
2271 active = atomic_long_read(&calc_load_tasks);
2272 active = active > 0 ? active * FIXED_1 : 0;
2273
2274 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2275 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2276 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2277
2278 calc_load_update += n * LOAD_FREQ;
2279 }
2280
2281
2282
2283
2284
2285
2286
2287
2288 smp_wmb();
2289 calc_load_idx++;
2290}
2291#else
2292
/* !CONFIG_NO_HZ: there is never an idle delta to fold. */
static inline long calc_load_fold_idle(void)
{
	return 0;
}

/* !CONFIG_NO_HZ: no missed periods to catch up on. */
static inline void calc_global_nohz(void)
{
}
2295
2296#endif
2297
2298
2299
2300
2301
2302void calc_global_load(unsigned long ticks)
2303{
2304 long active, delta;
2305
2306 if (time_before(jiffies, calc_load_update + 10))
2307 return;
2308
2309
2310
2311
2312 delta = calc_load_fold_idle();
2313 if (delta)
2314 atomic_long_add(delta, &calc_load_tasks);
2315
2316 active = atomic_long_read(&calc_load_tasks);
2317 active = active > 0 ? active * FIXED_1 : 0;
2318
2319 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2320 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2321 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2322
2323 calc_load_update += LOAD_FREQ;
2324
2325
2326
2327
2328 calc_global_nohz();
2329}
2330
2331
2332
2333
2334
2335static void calc_load_account_active(struct rq *this_rq)
2336{
2337 long delta;
2338
2339 if (time_before(jiffies, this_rq->calc_load_update))
2340 return;
2341
2342 delta = calc_load_fold_active(this_rq);
2343 if (delta)
2344 atomic_long_add(delta, &calc_load_tasks);
2345
2346 this_rq->calc_load_update += LOAD_FREQ;
2347}
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380#define DEGRADE_SHIFT 7
2381static const unsigned char
2382 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
2383static const unsigned char
2384 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
2385 {0, 0, 0, 0, 0, 0, 0, 0},
2386 {64, 32, 8, 0, 0, 0, 0, 0},
2387 {96, 72, 40, 12, 1, 0, 0},
2388 {112, 98, 75, 43, 15, 1, 0},
2389 {120, 112, 98, 76, 45, 16, 2} };
2390
2391
2392
2393
2394
2395
2396static unsigned long
2397decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2398{
2399 int j = 0;
2400
2401 if (!missed_updates)
2402 return load;
2403
2404 if (missed_updates >= degrade_zero_ticks[idx])
2405 return 0;
2406
2407 if (idx == 1)
2408 return load >> missed_updates;
2409
2410 while (missed_updates) {
2411 if (missed_updates % 2)
2412 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
2413
2414 missed_updates >>= 1;
2415 j++;
2416 }
2417 return load;
2418}
2419
2420
2421
2422
2423
2424
2425static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2426 unsigned long pending_updates)
2427{
2428 int i, scale;
2429
2430 this_rq->nr_load_updates++;
2431
2432
2433 this_rq->cpu_load[0] = this_load;
2434 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2435 unsigned long old_load, new_load;
2436
2437
2438
2439 old_load = this_rq->cpu_load[i];
2440 old_load = decay_load_missed(old_load, pending_updates - 1, i);
2441 new_load = this_load;
2442
2443
2444
2445
2446
2447 if (new_load > old_load)
2448 new_load += scale - 1;
2449
2450 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
2451 }
2452
2453 sched_avg_update(this_rq);
2454}
2455
2456#ifdef CONFIG_NO_HZ
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474void update_idle_cpu_load(struct rq *this_rq)
2475{
2476 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2477 unsigned long load = this_rq->load.weight;
2478 unsigned long pending_updates;
2479
2480
2481
2482
2483 if (load || curr_jiffies == this_rq->last_load_update_tick)
2484 return;
2485
2486 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2487 this_rq->last_load_update_tick = curr_jiffies;
2488
2489 __update_cpu_load(this_rq, load, pending_updates);
2490}
2491
2492
2493
2494
2495void update_cpu_load_nohz(void)
2496{
2497 struct rq *this_rq = this_rq();
2498 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2499 unsigned long pending_updates;
2500
2501 if (curr_jiffies == this_rq->last_load_update_tick)
2502 return;
2503
2504 raw_spin_lock(&this_rq->lock);
2505 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2506 if (pending_updates) {
2507 this_rq->last_load_update_tick = curr_jiffies;
2508
2509
2510
2511
2512 __update_cpu_load(this_rq, 0, pending_updates);
2513 }
2514 raw_spin_unlock(&this_rq->lock);
2515}
2516#endif
2517
2518
2519
2520
2521static void update_cpu_load_active(struct rq *this_rq)
2522{
2523
2524
2525
2526 this_rq->last_load_update_tick = jiffies;
2527 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2528
2529 calc_load_account_active(this_rq);
2530}
2531
2532#ifdef CONFIG_SMP
2533
2534
2535
2536
2537
2538void sched_exec(void)
2539{
2540 struct task_struct *p = current;
2541 unsigned long flags;
2542 int dest_cpu;
2543
2544 raw_spin_lock_irqsave(&p->pi_lock, flags);
2545 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
2546 if (dest_cpu == smp_processor_id())
2547 goto unlock;
2548
2549 if (likely(cpu_active(dest_cpu))) {
2550 struct migration_arg arg = { p, dest_cpu };
2551
2552 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2553 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2554 return;
2555 }
2556unlock:
2557 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2558}
2559
2560#endif
2561
2562DEFINE_PER_CPU(struct kernel_stat, kstat);
2563DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2564
2565EXPORT_PER_CPU_SYMBOL(kstat);
2566EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2567
2568
2569
2570
2571
2572
2573
2574static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2575{
2576 u64 ns = 0;
2577
2578 if (task_current(rq, p)) {
2579 update_rq_clock(rq);
2580 ns = rq->clock_task - p->se.exec_start;
2581 if ((s64)ns < 0)
2582 ns = 0;
2583 }
2584
2585 return ns;
2586}
2587
2588unsigned long long task_delta_exec(struct task_struct *p)
2589{
2590 unsigned long flags;
2591 struct rq *rq;
2592 u64 ns = 0;
2593
2594 rq = task_rq_lock(p, &flags);
2595 ns = do_task_delta_exec(p, rq);
2596 task_rq_unlock(rq, p, &flags);
2597
2598 return ns;
2599}
2600
2601
2602
2603
2604
2605
2606unsigned long long task_sched_runtime(struct task_struct *p)
2607{
2608 unsigned long flags;
2609 struct rq *rq;
2610 u64 ns = 0;
2611
2612 rq = task_rq_lock(p, &flags);
2613 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2614 task_rq_unlock(rq, p, &flags);
2615
2616 return ns;
2617}
2618
2619
2620
2621
2622
2623void scheduler_tick(void)
2624{
2625 int cpu = smp_processor_id();
2626 struct rq *rq = cpu_rq(cpu);
2627 struct task_struct *curr = rq->curr;
2628
2629 sched_clock_tick();
2630
2631 raw_spin_lock(&rq->lock);
2632 update_rq_clock(rq);
2633 update_cpu_load_active(rq);
2634 curr->sched_class->task_tick(rq, curr, 0);
2635 raw_spin_unlock(&rq->lock);
2636
2637 perf_event_task_tick();
2638
2639#ifdef CONFIG_SMP
2640 rq->idle_balance = idle_cpu(cpu);
2641 trigger_load_balance(rq, cpu);
2642#endif
2643}
2644
2645notrace unsigned long get_parent_ip(unsigned long addr)
2646{
2647 if (in_lock_functions(addr)) {
2648 addr = CALLER_ADDR2;
2649 if (in_lock_functions(addr))
2650 addr = CALLER_ADDR3;
2651 }
2652 return addr;
2653}
2654
2655#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2656 defined(CONFIG_PREEMPT_TRACER))
2657
2658void __kprobes add_preempt_count(int val)
2659{
2660#ifdef CONFIG_DEBUG_PREEMPT
2661
2662
2663
2664 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2665 return;
2666#endif
2667 preempt_count() += val;
2668#ifdef CONFIG_DEBUG_PREEMPT
2669
2670
2671
2672 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2673 PREEMPT_MASK - 10);
2674#endif
2675 if (preempt_count() == val)
2676 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2677}
2678EXPORT_SYMBOL(add_preempt_count);
2679
2680void __kprobes sub_preempt_count(int val)
2681{
2682#ifdef CONFIG_DEBUG_PREEMPT
2683
2684
2685
2686 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
2687 return;
2688
2689
2690
2691 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
2692 !(preempt_count() & PREEMPT_MASK)))
2693 return;
2694#endif
2695
2696 if (preempt_count() == val)
2697 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2698 preempt_count() -= val;
2699}
2700EXPORT_SYMBOL(sub_preempt_count);
2701
2702#endif
2703
2704
2705
2706
2707static noinline void __schedule_bug(struct task_struct *prev)
2708{
2709 if (oops_in_progress)
2710 return;
2711
2712 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
2713 prev->comm, prev->pid, preempt_count());
2714
2715 debug_show_held_locks(prev);
2716 print_modules();
2717 if (irqs_disabled())
2718 print_irqtrace_events(prev);
2719 dump_stack();
2720 add_taint(TAINT_WARN);
2721}
2722
2723
2724
2725
2726static inline void schedule_debug(struct task_struct *prev)
2727{
2728
2729
2730
2731
2732
2733 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
2734 __schedule_bug(prev);
2735 rcu_sleep_check();
2736
2737 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2738
2739 schedstat_inc(this_rq(), sched_count);
2740}
2741
2742static void put_prev_task(struct rq *rq, struct task_struct *prev)
2743{
2744 if (prev->on_rq || rq->skip_clock_update < 0)
2745 update_rq_clock(rq);
2746 prev->sched_class->put_prev_task(rq, prev);
2747}
2748
2749
2750
2751
2752static inline struct task_struct *
2753pick_next_task(struct rq *rq)
2754{
2755 const struct sched_class *class;
2756 struct task_struct *p;
2757
2758
2759
2760
2761
2762 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
2763 p = fair_sched_class.pick_next_task(rq);
2764 if (likely(p))
2765 return p;
2766 }
2767
2768 for_each_class(class) {
2769 p = class->pick_next_task(rq);
2770 if (p)
2771 return p;
2772 }
2773
2774 BUG();
2775}
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814static void __sched __schedule(void)
2815{
2816 struct task_struct *prev, *next;
2817 unsigned long *switch_count;
2818 struct rq *rq;
2819 int cpu;
2820
2821need_resched:
2822 preempt_disable();
2823 cpu = smp_processor_id();
2824 rq = cpu_rq(cpu);
2825 rcu_note_context_switch(cpu);
2826 prev = rq->curr;
2827
2828 schedule_debug(prev);
2829
2830 if (sched_feat(HRTICK))
2831 hrtick_clear(rq);
2832
2833 raw_spin_lock_irq(&rq->lock);
2834
2835 switch_count = &prev->nivcsw;
2836 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2837 if (unlikely(signal_pending_state(prev->state, prev))) {
2838 prev->state = TASK_RUNNING;
2839 } else {
2840 deactivate_task(rq, prev, DEQUEUE_SLEEP);
2841 prev->on_rq = 0;
2842
2843
2844
2845
2846
2847
2848 if (prev->flags & PF_WQ_WORKER) {
2849 struct task_struct *to_wakeup;
2850
2851 to_wakeup = wq_worker_sleeping(prev, cpu);
2852 if (to_wakeup)
2853 try_to_wake_up_local(to_wakeup);
2854 }
2855 }
2856 switch_count = &prev->nvcsw;
2857 }
2858
2859 pre_schedule(rq, prev);
2860
2861 if (unlikely(!rq->nr_running))
2862 idle_balance(cpu, rq);
2863
2864 put_prev_task(rq, prev);
2865 next = pick_next_task(rq);
2866 clear_tsk_need_resched(prev);
2867 rq->skip_clock_update = 0;
2868
2869 if (likely(prev != next)) {
2870 rq->nr_switches++;
2871 rq->curr = next;
2872 ++*switch_count;
2873
2874 context_switch(rq, prev, next);
2875
2876
2877
2878
2879
2880
2881 cpu = smp_processor_id();
2882 rq = cpu_rq(cpu);
2883 } else
2884 raw_spin_unlock_irq(&rq->lock);
2885
2886 post_schedule(rq);
2887
2888 sched_preempt_enable_no_resched();
2889 if (need_resched())
2890 goto need_resched;
2891}
2892
2893static inline void sched_submit_work(struct task_struct *tsk)
2894{
2895 if (!tsk->state || tsk_is_pi_blocked(tsk))
2896 return;
2897
2898
2899
2900
2901 if (blk_needs_flush_plug(tsk))
2902 blk_schedule_flush_plug(tsk);
2903}
2904
2905asmlinkage void __sched schedule(void)
2906{
2907 struct task_struct *tsk = current;
2908
2909 sched_submit_work(tsk);
2910 __schedule();
2911}
2912EXPORT_SYMBOL(schedule);
2913
2914#ifdef CONFIG_RCU_USER_QS
2915asmlinkage void __sched schedule_user(void)
2916{
2917
2918
2919
2920
2921
2922
2923 rcu_user_exit();
2924 schedule();
2925 rcu_user_enter();
2926}
2927#endif
2928
2929
2930
2931
2932
2933
2934void __sched schedule_preempt_disabled(void)
2935{
2936 sched_preempt_enable_no_resched();
2937 schedule();
2938 preempt_disable();
2939}
2940
2941#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
2942
2943static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
2944{
2945 if (lock->owner != owner)
2946 return false;
2947
2948
2949
2950
2951
2952
2953
2954 barrier();
2955
2956 return owner->on_cpu;
2957}
2958
2959
2960
2961
2962
2963int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
2964{
2965 if (!sched_feat(OWNER_SPIN))
2966 return 0;
2967
2968 rcu_read_lock();
2969 while (owner_running(lock, owner)) {
2970 if (need_resched())
2971 break;
2972
2973 arch_mutex_cpu_relax();
2974 }
2975 rcu_read_unlock();
2976
2977
2978
2979
2980
2981
2982 return lock->owner == NULL;
2983}
2984#endif
2985
2986#ifdef CONFIG_PREEMPT
2987
2988
2989
2990
2991
2992asmlinkage void __sched notrace preempt_schedule(void)
2993{
2994 struct thread_info *ti = current_thread_info();
2995
2996
2997
2998
2999
3000 if (likely(ti->preempt_count || irqs_disabled()))
3001 return;
3002
3003 do {
3004 add_preempt_count_notrace(PREEMPT_ACTIVE);
3005 __schedule();
3006 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3007
3008
3009
3010
3011
3012 barrier();
3013 } while (need_resched());
3014}
3015EXPORT_SYMBOL(preempt_schedule);
3016
3017
3018
3019
3020
3021
3022
3023asmlinkage void __sched preempt_schedule_irq(void)
3024{
3025 struct thread_info *ti = current_thread_info();
3026
3027
3028 BUG_ON(ti->preempt_count || !irqs_disabled());
3029
3030 rcu_user_exit();
3031 do {
3032 add_preempt_count(PREEMPT_ACTIVE);
3033 local_irq_enable();
3034 __schedule();
3035 local_irq_disable();
3036 sub_preempt_count(PREEMPT_ACTIVE);
3037
3038
3039
3040
3041
3042 barrier();
3043 } while (need_resched());
3044}
3045
3046#endif
3047
3048int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3049 void *key)
3050{
3051 return try_to_wake_up(curr->private, mode, wake_flags);
3052}
3053EXPORT_SYMBOL(default_wake_function);
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3065 int nr_exclusive, int wake_flags, void *key)
3066{
3067 wait_queue_t *curr, *next;
3068
3069 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3070 unsigned flags = curr->flags;
3071
3072 if (curr->func(curr, mode, wake_flags, key) &&
3073 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3074 break;
3075 }
3076}
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088void __wake_up(wait_queue_head_t *q, unsigned int mode,
3089 int nr_exclusive, void *key)
3090{
3091 unsigned long flags;
3092
3093 spin_lock_irqsave(&q->lock, flags);
3094 __wake_up_common(q, mode, nr_exclusive, 0, key);
3095 spin_unlock_irqrestore(&q->lock, flags);
3096}
3097EXPORT_SYMBOL(__wake_up);
3098
3099
3100
3101
3102void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
3103{
3104 __wake_up_common(q, mode, nr, 0, NULL);
3105}
3106EXPORT_SYMBOL_GPL(__wake_up_locked);
3107
3108void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3109{
3110 __wake_up_common(q, mode, 1, 0, key);
3111}
3112EXPORT_SYMBOL_GPL(__wake_up_locked_key);
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
3132 int nr_exclusive, void *key)
3133{
3134 unsigned long flags;
3135 int wake_flags = WF_SYNC;
3136
3137 if (unlikely(!q))
3138 return;
3139
3140 if (unlikely(!nr_exclusive))
3141 wake_flags = 0;
3142
3143 spin_lock_irqsave(&q->lock, flags);
3144 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
3145 spin_unlock_irqrestore(&q->lock, flags);
3146}
3147EXPORT_SYMBOL_GPL(__wake_up_sync_key);
3148
3149
3150
3151
3152void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3153{
3154 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
3155}
3156EXPORT_SYMBOL_GPL(__wake_up_sync);
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170void complete(struct completion *x)
3171{
3172 unsigned long flags;
3173
3174 spin_lock_irqsave(&x->wait.lock, flags);
3175 x->done++;
3176 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
3177 spin_unlock_irqrestore(&x->wait.lock, flags);
3178}
3179EXPORT_SYMBOL(complete);
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190void complete_all(struct completion *x)
3191{
3192 unsigned long flags;
3193
3194 spin_lock_irqsave(&x->wait.lock, flags);
3195 x->done += UINT_MAX/2;
3196 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
3197 spin_unlock_irqrestore(&x->wait.lock, flags);
3198}
3199EXPORT_SYMBOL(complete_all);
3200
3201static inline long __sched
3202do_wait_for_common(struct completion *x, long timeout, int state)
3203{
3204 if (!x->done) {
3205 DECLARE_WAITQUEUE(wait, current);
3206
3207 __add_wait_queue_tail_exclusive(&x->wait, &wait);
3208 do {
3209 if (signal_pending_state(state, current)) {
3210 timeout = -ERESTARTSYS;
3211 break;
3212 }
3213 __set_current_state(state);
3214 spin_unlock_irq(&x->wait.lock);
3215 timeout = schedule_timeout(timeout);
3216 spin_lock_irq(&x->wait.lock);
3217 } while (!x->done && timeout);
3218 __remove_wait_queue(&x->wait, &wait);
3219 if (!x->done)
3220 return timeout;
3221 }
3222 x->done--;
3223 return timeout ?: 1;
3224}
3225
3226static long __sched
3227wait_for_common(struct completion *x, long timeout, int state)
3228{
3229 might_sleep();
3230
3231 spin_lock_irq(&x->wait.lock);
3232 timeout = do_wait_for_common(x, timeout, state);
3233 spin_unlock_irq(&x->wait.lock);
3234 return timeout;
3235}
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247void __sched wait_for_completion(struct completion *x)
3248{
3249 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3250}
3251EXPORT_SYMBOL(wait_for_completion);
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265unsigned long __sched
3266wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3267{
3268 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
3269}
3270EXPORT_SYMBOL(wait_for_completion_timeout);
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281int __sched wait_for_completion_interruptible(struct completion *x)
3282{
3283 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
3284 if (t == -ERESTARTSYS)
3285 return t;
3286 return 0;
3287}
3288EXPORT_SYMBOL(wait_for_completion_interruptible);
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301long __sched
3302wait_for_completion_interruptible_timeout(struct completion *x,
3303 unsigned long timeout)
3304{
3305 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
3306}
3307EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318int __sched wait_for_completion_killable(struct completion *x)
3319{
3320 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
3321 if (t == -ERESTARTSYS)
3322 return t;
3323 return 0;
3324}
3325EXPORT_SYMBOL(wait_for_completion_killable);
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339long __sched
3340wait_for_completion_killable_timeout(struct completion *x,
3341 unsigned long timeout)
3342{
3343 return wait_for_common(x, timeout, TASK_KILLABLE);
3344}
3345EXPORT_SYMBOL(wait_for_completion_killable_timeout);
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359bool try_wait_for_completion(struct completion *x)
3360{
3361 unsigned long flags;
3362 int ret = 1;
3363
3364 spin_lock_irqsave(&x->wait.lock, flags);
3365 if (!x->done)
3366 ret = 0;
3367 else
3368 x->done--;
3369 spin_unlock_irqrestore(&x->wait.lock, flags);
3370 return ret;
3371}
3372EXPORT_SYMBOL(try_wait_for_completion);
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382bool completion_done(struct completion *x)
3383{
3384 unsigned long flags;
3385 int ret = 1;
3386
3387 spin_lock_irqsave(&x->wait.lock, flags);
3388 if (!x->done)
3389 ret = 0;
3390 spin_unlock_irqrestore(&x->wait.lock, flags);
3391 return ret;
3392}
3393EXPORT_SYMBOL(completion_done);
3394
3395static long __sched
3396sleep_on_common(wait_queue_head_t *q, int state, long timeout)
3397{
3398 unsigned long flags;
3399 wait_queue_t wait;
3400
3401 init_waitqueue_entry(&wait, current);
3402
3403 __set_current_state(state);
3404
3405 spin_lock_irqsave(&q->lock, flags);
3406 __add_wait_queue(q, &wait);
3407 spin_unlock(&q->lock);
3408 timeout = schedule_timeout(timeout);
3409 spin_lock_irq(&q->lock);
3410 __remove_wait_queue(q, &wait);
3411 spin_unlock_irqrestore(&q->lock, flags);
3412
3413 return timeout;
3414}
3415
3416void __sched interruptible_sleep_on(wait_queue_head_t *q)
3417{
3418 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3419}
3420EXPORT_SYMBOL(interruptible_sleep_on);
3421
3422long __sched
3423interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3424{
3425 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
3426}
3427EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3428
3429void __sched sleep_on(wait_queue_head_t *q)
3430{
3431 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3432}
3433EXPORT_SYMBOL(sleep_on);
3434
3435long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3436{
3437 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
3438}
3439EXPORT_SYMBOL(sleep_on_timeout);
3440
3441#ifdef CONFIG_RT_MUTEXES
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453void rt_mutex_setprio(struct task_struct *p, int prio)
3454{
3455 int oldprio, on_rq, running;
3456 struct rq *rq;
3457 const struct sched_class *prev_class;
3458
3459 BUG_ON(prio < 0 || prio > MAX_PRIO);
3460
3461 rq = __task_rq_lock(p);
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475 if (unlikely(p == rq->idle)) {
3476 WARN_ON(p != rq->curr);
3477 WARN_ON(p->pi_blocked_on);
3478 goto out_unlock;
3479 }
3480
3481 trace_sched_pi_setprio(p, prio);
3482 oldprio = p->prio;
3483 prev_class = p->sched_class;
3484 on_rq = p->on_rq;
3485 running = task_current(rq, p);
3486 if (on_rq)
3487 dequeue_task(rq, p, 0);
3488 if (running)
3489 p->sched_class->put_prev_task(rq, p);
3490
3491 if (rt_prio(prio))
3492 p->sched_class = &rt_sched_class;
3493 else
3494 p->sched_class = &fair_sched_class;
3495
3496 p->prio = prio;
3497
3498 if (running)
3499 p->sched_class->set_curr_task(rq);
3500 if (on_rq)
3501 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
3502
3503 check_class_changed(rq, p, prev_class, oldprio);
3504out_unlock:
3505 __task_rq_unlock(rq);
3506}
3507#endif
3508void set_user_nice(struct task_struct *p, long nice)
3509{
3510 int old_prio, delta, on_rq;
3511 unsigned long flags;
3512 struct rq *rq;
3513
3514 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3515 return;
3516
3517
3518
3519
3520 rq = task_rq_lock(p, &flags);
3521
3522
3523
3524
3525
3526
3527 if (task_has_rt_policy(p)) {
3528 p->static_prio = NICE_TO_PRIO(nice);
3529 goto out_unlock;
3530 }
3531 on_rq = p->on_rq;
3532 if (on_rq)
3533 dequeue_task(rq, p, 0);
3534
3535 p->static_prio = NICE_TO_PRIO(nice);
3536 set_load_weight(p);
3537 old_prio = p->prio;
3538 p->prio = effective_prio(p);
3539 delta = p->prio - old_prio;
3540
3541 if (on_rq) {
3542 enqueue_task(rq, p, 0);
3543
3544
3545
3546
3547 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3548 resched_task(rq->curr);
3549 }
3550out_unlock:
3551 task_rq_unlock(rq, p, &flags);
3552}
3553EXPORT_SYMBOL(set_user_nice);
3554
3555
3556
3557
3558
3559
3560int can_nice(const struct task_struct *p, const int nice)
3561{
3562
3563 int nice_rlim = 20 - nice;
3564
3565 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3566 capable(CAP_SYS_NICE));
3567}
3568
3569#ifdef __ARCH_WANT_SYS_NICE
3570
3571
3572
3573
3574
3575
3576
3577
3578SYSCALL_DEFINE1(nice, int, increment)
3579{
3580 long nice, retval;
3581
3582
3583
3584
3585
3586
3587 if (increment < -40)
3588 increment = -40;
3589 if (increment > 40)
3590 increment = 40;
3591
3592 nice = TASK_NICE(current) + increment;
3593 if (nice < -20)
3594 nice = -20;
3595 if (nice > 19)
3596 nice = 19;
3597
3598 if (increment < 0 && !can_nice(current, nice))
3599 return -EPERM;
3600
3601 retval = security_task_setnice(current, nice);
3602 if (retval)
3603 return retval;
3604
3605 set_user_nice(current, nice);
3606 return 0;
3607}
3608
3609#endif
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619int task_prio(const struct task_struct *p)
3620{
3621 return p->prio - MAX_RT_PRIO;
3622}
3623
3624
3625
3626
3627
/**
 * task_nice - return the nice value of a task
 * @p: the task in question
 *
 * Function wrapper around the TASK_NICE() macro.
 */
int task_nice(const struct task_struct *p)
{
	int nice = TASK_NICE(p);

	return nice;
}
EXPORT_SYMBOL(task_nice);
3633
3634
3635
3636
3637
3638int idle_cpu(int cpu)
3639{
3640 struct rq *rq = cpu_rq(cpu);
3641
3642 if (rq->curr != rq->idle)
3643 return 0;
3644
3645 if (rq->nr_running)
3646 return 0;
3647
3648#ifdef CONFIG_SMP
3649 if (!llist_empty(&rq->wake_list))
3650 return 0;
3651#endif
3652
3653 return 1;
3654}
3655
3656
3657
3658
3659
3660struct task_struct *idle_task(int cpu)
3661{
3662 return cpu_rq(cpu)->idle;
3663}
3664
3665
3666
3667
3668
3669static struct task_struct *find_process_by_pid(pid_t pid)
3670{
3671 return pid ? find_task_by_vpid(pid) : current;
3672}
3673
3674
3675static void
3676__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
3677{
3678 p->policy = policy;
3679 p->rt_priority = prio;
3680 p->normal_prio = normal_prio(p);
3681
3682 p->prio = rt_mutex_getprio(p);
3683 if (rt_prio(p->prio))
3684 p->sched_class = &rt_sched_class;
3685 else
3686 p->sched_class = &fair_sched_class;
3687 set_load_weight(p);
3688}
3689
3690
3691
3692
3693static bool check_same_owner(struct task_struct *p)
3694{
3695 const struct cred *cred = current_cred(), *pcred;
3696 bool match;
3697
3698 rcu_read_lock();
3699 pcred = __task_cred(p);
3700 match = (uid_eq(cred->euid, pcred->euid) ||
3701 uid_eq(cred->euid, pcred->uid));
3702 rcu_read_unlock();
3703 return match;
3704}
3705
/*
 * __sched_setscheduler - validate and apply a policy/priority change
 * @p: task to modify
 * @policy: new policy, optionally OR-ed with SCHED_RESET_ON_FORK; a
 *	negative value keeps the task's current policy
 * @param: carries the requested sched_priority
 * @user: when true, capability/rlimit/ownership checks and the
 *	security_task_setscheduler() hook are applied
 *
 * Returns 0 on success (including the "nothing to change" case),
 * -EINVAL or -EPERM on a failed check, or the error returned by the
 * security hook.  Must not be called from interrupt context.
 */
static int __sched_setscheduler(struct task_struct *p, int policy,
				const struct sched_param *param, bool user)
{
	int retval, oldprio, oldpolicy = -1, on_rq, running;
	unsigned long flags;
	const struct sched_class *prev_class;
	struct rq *rq;
	int reset_on_fork;

	/* Calling this from interrupt context is a bug. */
	BUG_ON(in_interrupt());
recheck:
	/* A negative policy means: keep the current policy and flag. */
	if (policy < 0) {
		reset_on_fork = p->sched_reset_on_fork;
		policy = oldpolicy = p->policy;
	} else {
		/* Split the reset-on-fork flag off the policy value. */
		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
		policy &= ~SCHED_RESET_ON_FORK;

		if (policy != SCHED_FIFO && policy != SCHED_RR &&
				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
				policy != SCHED_IDLE)
			return -EINVAL;
	}

	/*
	 * Priority range check: tasks with an mm are limited to
	 * MAX_USER_RT_PRIO-1, tasks without one to MAX_RT_PRIO-1.
	 * RT policies require a non-zero priority, all others require 0.
	 */
	if (param->sched_priority < 0 ||
	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
		return -EINVAL;
	if (rt_policy(policy) != (param->sched_priority != 0))
		return -EINVAL;

	/*
	 * Permission checks for callers without CAP_SYS_NICE.
	 */
	if (user && !capable(CAP_SYS_NICE)) {
		if (rt_policy(policy)) {
			unsigned long rlim_rtprio =
					task_rlimit(p, RLIMIT_RTPRIO);

			/* A zero RLIMIT_RTPRIO forbids changing the policy. */
			if (policy != p->policy && !rlim_rtprio)
				return -EPERM;

			/* Priority may rise only up to RLIMIT_RTPRIO. */
			if (param->sched_priority > p->rt_priority &&
			    param->sched_priority > rlim_rtprio)
				return -EPERM;
		}

		/*
		 * Leaving SCHED_IDLE is allowed only if can_nice() accepts
		 * the task's current nice value.
		 */
		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
			if (!can_nice(p, TASK_NICE(p)))
				return -EPERM;
		}

		/* The caller must match the owner of the target task. */
		if (!check_same_owner(p))
			return -EPERM;

		/* A set reset-on-fork flag may not be cleared here. */
		if (p->sched_reset_on_fork && !reset_on_fork)
			return -EPERM;
	}

	if (user) {
		retval = security_task_setscheduler(p);
		if (retval)
			return retval;
	}

	/*
	 * From here on the task's runqueue is locked; every return path
	 * below must drop it with task_rq_unlock().
	 */
	rq = task_rq_lock(p, &flags);

	/*
	 * The runqueue's stop task may not be changed.
	 */
	if (p == rq->stop) {
		task_rq_unlock(rq, p, &flags);
		return -EINVAL;
	}

	/*
	 * Nothing to do when neither the policy nor (for RT policies) the
	 * priority would change.
	 */
	if (unlikely(policy == p->policy && (!rt_policy(policy) ||
			param->sched_priority == p->rt_priority))) {
		task_rq_unlock(rq, p, &flags);
		return 0;
	}

#ifdef CONFIG_RT_GROUP_SCHED
	if (user) {
		/*
		 * Refuse an RT policy when the task's group has zero RT
		 * runtime, unless the group is an autogroup.
		 */
		if (rt_bandwidth_enabled() && rt_policy(policy) &&
				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
				!task_group_is_autogroup(task_group(p))) {
			task_rq_unlock(rq, p, &flags);
			return -EPERM;
		}
	}
#endif

	/* The policy changed since it was sampled above: start over. */
	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
		policy = oldpolicy = -1;
		task_rq_unlock(rq, p, &flags);
		goto recheck;
	}
	/* Take the task off the queue / CPU while its class may change. */
	on_rq = p->on_rq;
	running = task_current(rq, p);
	if (on_rq)
		dequeue_task(rq, p, 0);
	if (running)
		p->sched_class->put_prev_task(rq, p);

	p->sched_reset_on_fork = reset_on_fork;

	/* Remember the old prio and class for check_class_changed(). */
	oldprio = p->prio;
	prev_class = p->sched_class;
	__setscheduler(rq, p, policy, param->sched_priority);

	/* Put the task back, now under its (possibly new) class. */
	if (running)
		p->sched_class->set_curr_task(rq);
	if (on_rq)
		enqueue_task(rq, p, 0);

	check_class_changed(rq, p, prev_class, oldprio);
	task_rq_unlock(rq, p, &flags);

	/* Called only after the runqueue lock has been dropped. */
	rt_mutex_adjust_pi(p);

	return 0;
}
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867int sched_setscheduler(struct task_struct *p, int policy,
3868 const struct sched_param *param)
3869{
3870 return __sched_setscheduler(p, policy, param, true);
3871}
3872EXPORT_SYMBOL_GPL(sched_setscheduler);
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3886 const struct sched_param *param)
3887{
3888 return __sched_setscheduler(p, policy, param, false);
3889}
3890
3891static int
3892do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3893{
3894 struct sched_param lparam;
3895 struct task_struct *p;
3896 int retval;
3897
3898 if (!param || pid < 0)
3899 return -EINVAL;
3900 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3901 return -EFAULT;
3902
3903 rcu_read_lock();
3904 retval = -ESRCH;
3905 p = find_process_by_pid(pid);
3906 if (p != NULL)
3907 retval = sched_setscheduler(p, policy, &lparam);
3908 rcu_read_unlock();
3909
3910 return retval;
3911}
3912
3913
3914
3915
3916
3917
3918
3919SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3920 struct sched_param __user *, param)
3921{
3922
3923 if (policy < 0)
3924 return -EINVAL;
3925
3926 return do_sched_setscheduler(pid, policy, param);
3927}
3928
3929
3930
3931
3932
3933
3934SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3935{
3936 return do_sched_setscheduler(pid, -1, param);
3937}
3938
3939
3940
3941
3942
3943SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3944{
3945 struct task_struct *p;
3946 int retval;
3947
3948 if (pid < 0)
3949 return -EINVAL;
3950
3951 retval = -ESRCH;
3952 rcu_read_lock();
3953 p = find_process_by_pid(pid);
3954 if (p) {
3955 retval = security_task_getscheduler(p);
3956 if (!retval)
3957 retval = p->policy
3958 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
3959 }
3960 rcu_read_unlock();
3961 return retval;
3962}
3963
3964
3965
3966
3967
3968
3969SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3970{
3971 struct sched_param lp;
3972 struct task_struct *p;
3973 int retval;
3974
3975 if (!param || pid < 0)
3976 return -EINVAL;
3977
3978 rcu_read_lock();
3979 p = find_process_by_pid(pid);
3980 retval = -ESRCH;
3981 if (!p)
3982 goto out_unlock;
3983
3984 retval = security_task_getscheduler(p);
3985 if (retval)
3986 goto out_unlock;
3987
3988 lp.sched_priority = p->rt_priority;
3989 rcu_read_unlock();
3990
3991
3992
3993
3994 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3995
3996 return retval;
3997
3998out_unlock:
3999 rcu_read_unlock();
4000 return retval;
4001}
4002
4003long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4004{
4005 cpumask_var_t cpus_allowed, new_mask;
4006 struct task_struct *p;
4007 int retval;
4008
4009 get_online_cpus();
4010 rcu_read_lock();
4011
4012 p = find_process_by_pid(pid);
4013 if (!p) {
4014 rcu_read_unlock();
4015 put_online_cpus();
4016 return -ESRCH;
4017 }
4018
4019
4020 get_task_struct(p);
4021 rcu_read_unlock();
4022
4023 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4024 retval = -ENOMEM;
4025 goto out_put_task;
4026 }
4027 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4028 retval = -ENOMEM;
4029 goto out_free_cpus_allowed;
4030 }
4031 retval = -EPERM;
4032 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
4033 goto out_unlock;
4034
4035 retval = security_task_setscheduler(p);
4036 if (retval)
4037 goto out_unlock;
4038
4039 cpuset_cpus_allowed(p, cpus_allowed);
4040 cpumask_and(new_mask, in_mask, cpus_allowed);
4041again:
4042 retval = set_cpus_allowed_ptr(p, new_mask);
4043
4044 if (!retval) {
4045 cpuset_cpus_allowed(p, cpus_allowed);
4046 if (!cpumask_subset(new_mask, cpus_allowed)) {
4047
4048
4049
4050
4051
4052 cpumask_copy(new_mask, cpus_allowed);
4053 goto again;
4054 }
4055 }
4056out_unlock:
4057 free_cpumask_var(new_mask);
4058out_free_cpus_allowed:
4059 free_cpumask_var(cpus_allowed);
4060out_put_task:
4061 put_task_struct(p);
4062 put_online_cpus();
4063 return retval;
4064}
4065
4066static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4067 struct cpumask *new_mask)
4068{
4069 if (len < cpumask_size())
4070 cpumask_clear(new_mask);
4071 else if (len > cpumask_size())
4072 len = cpumask_size();
4073
4074 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4075}
4076
4077
4078
4079
4080
4081
4082
4083SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4084 unsigned long __user *, user_mask_ptr)
4085{
4086 cpumask_var_t new_mask;
4087 int retval;
4088
4089 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4090 return -ENOMEM;
4091
4092 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4093 if (retval == 0)
4094 retval = sched_setaffinity(pid, new_mask);
4095 free_cpumask_var(new_mask);
4096 return retval;
4097}
4098
4099long sched_getaffinity(pid_t pid, struct cpumask *mask)
4100{
4101 struct task_struct *p;
4102 unsigned long flags;
4103 int retval;
4104
4105 get_online_cpus();
4106 rcu_read_lock();
4107
4108 retval = -ESRCH;
4109 p = find_process_by_pid(pid);
4110 if (!p)
4111 goto out_unlock;
4112
4113 retval = security_task_getscheduler(p);
4114 if (retval)
4115 goto out_unlock;
4116
4117 raw_spin_lock_irqsave(&p->pi_lock, flags);
4118 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
4119 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4120
4121out_unlock:
4122 rcu_read_unlock();
4123 put_online_cpus();
4124
4125 return retval;
4126}
4127
4128
4129
4130
4131
4132
4133
4134SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4135 unsigned long __user *, user_mask_ptr)
4136{
4137 int ret;
4138 cpumask_var_t mask;
4139
4140 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4141 return -EINVAL;
4142 if (len & (sizeof(unsigned long)-1))
4143 return -EINVAL;
4144
4145 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4146 return -ENOMEM;
4147
4148 ret = sched_getaffinity(pid, mask);
4149 if (ret == 0) {
4150 size_t retlen = min_t(size_t, len, cpumask_size());
4151
4152 if (copy_to_user(user_mask_ptr, mask, retlen))
4153 ret = -EFAULT;
4154 else
4155 ret = retlen;
4156 }
4157 free_cpumask_var(mask);
4158
4159 return ret;
4160}
4161
4162
4163
4164
4165
4166
4167
/**
 * sys_sched_yield - yield the current processor to other tasks
 *
 * Calls the current scheduling class's yield_task() with this CPU's
 * runqueue locked, then releases the lock and calls schedule().
 * Always returns 0.
 */
SYSCALL_DEFINE0(sched_yield)
{
	struct rq *rq = this_rq_lock();

	schedstat_inc(rq, yld_count);
	current->sched_class->yield_task(rq);

	/*
	 * The runqueue lock is released piecewise and preemption is
	 * re-enabled without a reschedule check, because schedule() is
	 * called right below.
	 */
	__release(rq->lock);
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
	do_raw_spin_unlock(&rq->lock);
	sched_preempt_enable_no_resched();

	schedule();

	return 0;
}
4188
4189static inline int should_resched(void)
4190{
4191 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
4192}
4193
/*
 * Call __schedule() with PREEMPT_ACTIVE added to the preempt count for
 * the duration of the call.
 */
static void __cond_resched(void)
{
	add_preempt_count(PREEMPT_ACTIVE);
	__schedule();
	sub_preempt_count(PREEMPT_ACTIVE);
}
4200
4201int __sched _cond_resched(void)
4202{
4203 if (should_resched()) {
4204 __cond_resched();
4205 return 1;
4206 }
4207 return 0;
4208}
4209EXPORT_SYMBOL(_cond_resched);
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219int __cond_resched_lock(spinlock_t *lock)
4220{
4221 int resched = should_resched();
4222 int ret = 0;
4223
4224 lockdep_assert_held(lock);
4225
4226 if (spin_needbreak(lock) || resched) {
4227 spin_unlock(lock);
4228 if (resched)
4229 __cond_resched();
4230 else
4231 cpu_relax();
4232 ret = 1;
4233 spin_lock(lock);
4234 }
4235 return ret;
4236}
4237EXPORT_SYMBOL(__cond_resched_lock);
4238
4239int __sched __cond_resched_softirq(void)
4240{
4241 BUG_ON(!in_softirq());
4242
4243 if (should_resched()) {
4244 local_bh_enable();
4245 __cond_resched();
4246 local_bh_disable();
4247 return 1;
4248 }
4249 return 0;
4250}
4251EXPORT_SYMBOL(__cond_resched_softirq);
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
/**
 * yield - yield the current processor to other tasks
 *
 * Sets the current task state to TASK_RUNNING and then performs the
 * sched_yield system call.
 */
void __sched yield(void)
{
	set_current_state(TASK_RUNNING);
	sys_sched_yield();
}
EXPORT_SYMBOL(yield);
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294bool __sched yield_to(struct task_struct *p, bool preempt)
4295{
4296 struct task_struct *curr = current;
4297 struct rq *rq, *p_rq;
4298 unsigned long flags;
4299 bool yielded = 0;
4300
4301 local_irq_save(flags);
4302 rq = this_rq();
4303
4304again:
4305 p_rq = task_rq(p);
4306 double_rq_lock(rq, p_rq);
4307 while (task_rq(p) != p_rq) {
4308 double_rq_unlock(rq, p_rq);
4309 goto again;
4310 }
4311
4312 if (!curr->sched_class->yield_to_task)
4313 goto out;
4314
4315 if (curr->sched_class != p->sched_class)
4316 goto out;
4317
4318 if (task_running(p_rq, p) || p->state)
4319 goto out;
4320
4321 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4322 if (yielded) {
4323 schedstat_inc(rq, yld_count);
4324
4325
4326
4327
4328 if (preempt && rq != p_rq)
4329 resched_task(p_rq->curr);
4330 }
4331
4332out:
4333 double_rq_unlock(rq, p_rq);
4334 local_irq_restore(flags);
4335
4336 if (yielded)
4337 schedule();
4338
4339 return yielded;
4340}
4341EXPORT_SYMBOL_GPL(yield_to);
4342
4343
4344
4345
4346
/*
 * Call schedule() while the current task is marked as waiting for I/O:
 * the runqueue's nr_iowait counter is raised and current->in_iowait is
 * set for the duration of the call, bracketed by block-I/O delay
 * accounting.
 */
void __sched io_schedule(void)
{
	struct rq *rq = raw_rq();

	delayacct_blkio_start();
	atomic_inc(&rq->nr_iowait);
	/* Flush the task's plugged block I/O before sleeping. */
	blk_flush_plug(current);
	current->in_iowait = 1;
	schedule();
	current->in_iowait = 0;
	/* Decrement the same runqueue that was incremented above. */
	atomic_dec(&rq->nr_iowait);
	delayacct_blkio_end();
}
EXPORT_SYMBOL(io_schedule);
4361
/*
 * Same I/O-wait bookkeeping as io_schedule(), but sleeps through
 * schedule_timeout(@timeout) and returns its result.
 */
long __sched io_schedule_timeout(long timeout)
{
	struct rq *rq = raw_rq();
	long ret;

	delayacct_blkio_start();
	atomic_inc(&rq->nr_iowait);
	/* Flush the task's plugged block I/O before sleeping. */
	blk_flush_plug(current);
	current->in_iowait = 1;
	ret = schedule_timeout(timeout);
	current->in_iowait = 0;
	/* Decrement the same runqueue that was incremented above. */
	atomic_dec(&rq->nr_iowait);
	delayacct_blkio_end();
	return ret;
}
4377
4378
4379
4380
4381
4382
4383
4384
4385SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4386{
4387 int ret = -EINVAL;
4388
4389 switch (policy) {
4390 case SCHED_FIFO:
4391 case SCHED_RR:
4392 ret = MAX_USER_RT_PRIO-1;
4393 break;
4394 case SCHED_NORMAL:
4395 case SCHED_BATCH:
4396 case SCHED_IDLE:
4397 ret = 0;
4398 break;
4399 }
4400 return ret;
4401}
4402
4403
4404
4405
4406
4407
4408
4409
4410SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4411{
4412 int ret = -EINVAL;
4413
4414 switch (policy) {
4415 case SCHED_FIFO:
4416 case SCHED_RR:
4417 ret = 1;
4418 break;
4419 case SCHED_NORMAL:
4420 case SCHED_BATCH:
4421 case SCHED_IDLE:
4422 ret = 0;
4423 }
4424 return ret;
4425}
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4436 struct timespec __user *, interval)
4437{
4438 struct task_struct *p;
4439 unsigned int time_slice;
4440 unsigned long flags;
4441 struct rq *rq;
4442 int retval;
4443 struct timespec t;
4444
4445 if (pid < 0)
4446 return -EINVAL;
4447
4448 retval = -ESRCH;
4449 rcu_read_lock();
4450 p = find_process_by_pid(pid);
4451 if (!p)
4452 goto out_unlock;
4453
4454 retval = security_task_getscheduler(p);
4455 if (retval)
4456 goto out_unlock;
4457
4458 rq = task_rq_lock(p, &flags);
4459 time_slice = p->sched_class->get_rr_interval(rq, p);
4460 task_rq_unlock(rq, p, &flags);
4461
4462 rcu_read_unlock();
4463 jiffies_to_timespec(time_slice, &t);
4464 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4465 return retval;
4466
4467out_unlock:
4468 rcu_read_unlock();
4469 return retval;
4470}
4471
4472static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4473
4474void sched_show_task(struct task_struct *p)
4475{
4476 unsigned long free = 0;
4477 unsigned state;
4478
4479 state = p->state ? __ffs(p->state) + 1 : 0;
4480 printk(KERN_INFO "%-15.15s %c", p->comm,
4481 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4482#if BITS_PER_LONG == 32
4483 if (state == TASK_RUNNING)
4484 printk(KERN_CONT " running ");
4485 else
4486 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4487#else
4488 if (state == TASK_RUNNING)
4489 printk(KERN_CONT " running task ");
4490 else
4491 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4492#endif
4493#ifdef CONFIG_DEBUG_STACK_USAGE
4494 free = stack_not_used(p);
4495#endif
4496 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4497 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
4498 (unsigned long)task_thread_info(p)->flags);
4499
4500 show_stack(p, NULL);
4501}
4502
/**
 * show_state_filter - print all tasks matching a state filter
 * @state_filter: bitmask of task states to show; 0 shows every task
 *
 * Prints a header line, then calls sched_show_task() for each matching
 * thread under the RCU read lock.  When no filter is given, all held
 * locks are dumped afterwards via debug_show_all_locks().
 */
void show_state_filter(unsigned long state_filter)
{
	struct task_struct *g, *p;

#if BITS_PER_LONG == 32
	printk(KERN_INFO
		" task PC stack pid father\n");
#else
	printk(KERN_INFO
		" task PC stack pid father\n");
#endif
	rcu_read_lock();
	do_each_thread(g, p) {
		/*
		 * Touch the NMI watchdog on every iteration of the loop.
		 */
		touch_nmi_watchdog();
		if (!state_filter || (p->state & state_filter))
			sched_show_task(p);
	} while_each_thread(g, p);

	touch_all_softlockup_watchdogs();

#ifdef CONFIG_SCHED_DEBUG
	sysrq_sched_debug_show();
#endif
	rcu_read_unlock();
	/*
	 * The lock dump is done only for an unfiltered listing.
	 */
	if (!state_filter)
		debug_show_all_locks();
}
4537
/* Assign the idle scheduling class to @idle. */
void __cpuinit init_idle_bootup_task(struct task_struct *idle)
{
	idle->sched_class = &idle_sched_class;
}
4542
4543
4544
4545
4546
4547
4548
4549
4550
/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: cpu the idle task belongs to
 *
 * Initializes @idle's scheduler state, restricts it to @cpu, installs
 * it as both the current and the idle task of that CPU's runqueue, and
 * switches it to the idle scheduling class.
 */
void __cpuinit init_idle(struct task_struct *idle, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);

	__sched_fork(idle);
	idle->state = TASK_RUNNING;
	idle->se.exec_start = sched_clock();

	/* The idle task may only run on its own CPU. */
	do_set_cpus_allowed(idle, cpumask_of(cpu));
	/*
	 * __set_task_cpu() is called inside an RCU read-side section
	 * here.
	 */
	rcu_read_lock();
	__set_task_cpu(idle, cpu);
	rcu_read_unlock();

	rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP)
	idle->on_cpu = 1;
#endif
	raw_spin_unlock_irqrestore(&rq->lock, flags);

	/* Reset the idle task's preempt count to zero. */
	task_thread_info(idle)->preempt_count = 0;

	/*
	 * From here on the task belongs to the idle scheduling class.
	 */
	idle->sched_class = &idle_sched_class;
	ftrace_graph_init_idle_task(idle, cpu);
#if defined(CONFIG_SMP)
	/* Name the task "<INIT_TASK_COMM>/<cpu>". */
	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
}
4595
4596#ifdef CONFIG_SMP
4597void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4598{
4599 if (p->sched_class && p->sched_class->set_cpus_allowed)
4600 p->sched_class->set_cpus_allowed(p, new_mask);
4601
4602 cpumask_copy(&p->cpus_allowed, new_mask);
4603 p->nr_cpus_allowed = cpumask_weight(new_mask);
4604}
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
/*
 * set_cpus_allowed_ptr - change the allowed-CPU mask of a task
 * @p: task to modify
 * @new_mask: new allowed-CPU mask
 *
 * Returns 0 on success (including when the mask is unchanged), or
 * -EINVAL when @new_mask contains no active CPU or when @p is a
 * PF_THREAD_BOUND task other than current.
 *
 * If the task's current CPU is not in @new_mask and the task is on a
 * runqueue, it is moved to an active CPU from the new mask by running
 * migration_cpu_stop() through stop_one_cpu().
 */
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
	unsigned long flags;
	struct rq *rq;
	unsigned int dest_cpu;
	int ret = 0;

	rq = task_rq_lock(p, &flags);

	/* Nothing to do when the mask does not change. */
	if (cpumask_equal(&p->cpus_allowed, new_mask))
		goto out;

	/* The new mask must contain at least one active CPU. */
	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
		ret = -EINVAL;
		goto out;
	}

	/* Only the task itself may change a thread-bound task's mask. */
	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
		ret = -EINVAL;
		goto out;
	}

	do_set_cpus_allowed(p, new_mask);

	/* No migration needed if the current CPU is still allowed. */
	if (cpumask_test_cpu(task_cpu(p), new_mask))
		goto out;

	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
	if (p->on_rq) {
		struct migration_arg arg = { p, dest_cpu };
		/* The runqueue lock is dropped before stop_one_cpu(). */
		task_rq_unlock(rq, p, &flags);
		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
		tlb_migrate_finish(p->mm);
		return 0;
	}
out:
	task_rq_unlock(rq, p, &flags);

	return ret;
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
/*
 * Move task @p from @src_cpu to @dest_cpu.
 *
 * Returns 0 when @dest_cpu is not active or is not in the task's
 * allowed mask.  Returns 1 otherwise, which includes the case where the
 * task is no longer on @src_cpu and nothing is done.
 *
 * Takes p->pi_lock and both runqueue locks with the plain (non-irqsave)
 * lock primitives.
 */
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
	struct rq *rq_dest, *rq_src;
	int ret = 0;

	if (unlikely(!cpu_active(dest_cpu)))
		return ret;

	rq_src = cpu_rq(src_cpu);
	rq_dest = cpu_rq(dest_cpu);

	raw_spin_lock(&p->pi_lock);
	double_rq_lock(rq_src, rq_dest);
	/* The task already left src_cpu: report success, do nothing. */
	if (task_cpu(p) != src_cpu)
		goto done;
	/* The destination must be in the task's allowed mask. */
	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
		goto fail;

	/*
	 * Only a task that is on a runqueue is actually moved: dequeue
	 * from the source, switch CPU, enqueue on the destination.
	 */
	if (p->on_rq) {
		dequeue_task(rq_src, p, 0);
		set_task_cpu(p, dest_cpu);
		enqueue_task(rq_dest, p, 0);
		check_preempt_curr(rq_dest, p, 0);
	}
done:
	ret = 1;
fail:
	double_rq_unlock(rq_src, rq_dest);
	raw_spin_unlock(&p->pi_lock);
	return ret;
}
4721
4722
4723
4724
4725
4726
/*
 * Stop-machine callback used by set_cpus_allowed_ptr(): migrate the
 * task named in the migration_arg from the CPU this runs on to
 * arg->dest_cpu.  Always returns 0.
 */
static int migration_cpu_stop(void *data)
{
	struct migration_arg *arg = data;

	/*
	 * Interrupts are disabled around __migrate_task(), which takes
	 * its locks without saving/disabling interrupts itself.
	 */
	local_irq_disable();
	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
	local_irq_enable();
	return 0;
}
4740
4741#ifdef CONFIG_HOTPLUG_CPU
4742
4743
4744
4745
4746
/*
 * Switch the current task's active mm over to init_mm (if it is not
 * already init_mm) and drop the reference on the old mm.  It is a bug
 * to call this while the current CPU is still online.
 */
void idle_task_exit(void)
{
	struct mm_struct *mm = current->active_mm;

	BUG_ON(cpu_online(smp_processor_id()));

	if (mm != &init_mm)
		switch_mm(mm, &init_mm, current);
	mmdrop(mm);
}
4757
4758
4759
4760
4761
4762
4763
4764
4765static void calc_load_migrate(struct rq *rq)
4766{
4767 long delta = calc_load_fold_active(rq);
4768 if (delta)
4769 atomic_long_add(delta, &calc_load_tasks);
4770}
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780static void migrate_tasks(unsigned int dead_cpu)
4781{
4782 struct rq *rq = cpu_rq(dead_cpu);
4783 struct task_struct *next, *stop = rq->stop;
4784 int dest_cpu;
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795 rq->stop = NULL;
4796
4797 for ( ; ; ) {
4798
4799
4800
4801
4802 if (rq->nr_running == 1)
4803 break;
4804
4805 next = pick_next_task(rq);
4806 BUG_ON(!next);
4807 next->sched_class->put_prev_task(rq, next);
4808
4809
4810 dest_cpu = select_fallback_rq(dead_cpu, next);
4811 raw_spin_unlock(&rq->lock);
4812
4813 __migrate_task(next, dead_cpu, dest_cpu);
4814
4815 raw_spin_lock(&rq->lock);
4816 }
4817
4818 rq->stop = stop;
4819}
4820
4821#endif
4822
4823#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
4824
/*
 * "sched_domain" sysctl directory.  Its ->child table is filled in by
 * register_sched_domain_sysctl() and freed again by
 * unregister_sched_domain_sysctl().
 */
static struct ctl_table sd_ctl_dir[] = {
	{
		.procname	= "sched_domain",
		.mode		= 0555,
	},
	{}
};
4832
/* "kernel" sysctl directory that holds sd_ctl_dir as its child. */
static struct ctl_table sd_ctl_root[] = {
	{
		.procname	= "kernel",
		.mode		= 0555,
		.child		= sd_ctl_dir,
	},
	{}
};
4841
4842static struct ctl_table *sd_alloc_ctl_entry(int n)
4843{
4844 struct ctl_table *entry =
4845 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
4846
4847 return entry;
4848}
4849
4850static void sd_free_ctl_entry(struct ctl_table **tablep)
4851{
4852 struct ctl_table *entry;
4853
4854
4855
4856
4857
4858
4859
4860 for (entry = *tablep; entry->mode; entry++) {
4861 if (entry->child)
4862 sd_free_ctl_entry(&entry->child);
4863 if (entry->proc_handler == NULL)
4864 kfree(entry->procname);
4865 }
4866
4867 kfree(*tablep);
4868 *tablep = NULL;
4869}
4870
4871static int min_load_idx = 0;
4872static int max_load_idx = CPU_LOAD_IDX_MAX;
4873
4874static void
4875set_table_entry(struct ctl_table *entry,
4876 const char *procname, void *data, int maxlen,
4877 umode_t mode, proc_handler *proc_handler,
4878 bool load_idx)
4879{
4880 entry->procname = procname;
4881 entry->data = data;
4882 entry->maxlen = maxlen;
4883 entry->mode = mode;
4884 entry->proc_handler = proc_handler;
4885
4886 if (load_idx) {
4887 entry->extra1 = &min_load_idx;
4888 entry->extra2 = &max_load_idx;
4889 }
4890}
4891
4892static struct ctl_table *
4893sd_alloc_ctl_domain_table(struct sched_domain *sd)
4894{
4895 struct ctl_table *table = sd_alloc_ctl_entry(13);
4896
4897 if (table == NULL)
4898 return NULL;
4899
4900 set_table_entry(&table[0], "min_interval", &sd->min_interval,
4901 sizeof(long), 0644, proc_doulongvec_minmax, false);
4902 set_table_entry(&table[1], "max_interval", &sd->max_interval,
4903 sizeof(long), 0644, proc_doulongvec_minmax, false);
4904 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
4905 sizeof(int), 0644, proc_dointvec_minmax, true);
4906 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
4907 sizeof(int), 0644, proc_dointvec_minmax, true);
4908 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
4909 sizeof(int), 0644, proc_dointvec_minmax, true);
4910 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
4911 sizeof(int), 0644, proc_dointvec_minmax, true);
4912 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
4913 sizeof(int), 0644, proc_dointvec_minmax, true);
4914 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
4915 sizeof(int), 0644, proc_dointvec_minmax, false);
4916 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
4917 sizeof(int), 0644, proc_dointvec_minmax, false);
4918 set_table_entry(&table[9], "cache_nice_tries",
4919 &sd->cache_nice_tries,
4920 sizeof(int), 0644, proc_dointvec_minmax, false);
4921 set_table_entry(&table[10], "flags", &sd->flags,
4922 sizeof(int), 0644, proc_dointvec_minmax, false);
4923 set_table_entry(&table[11], "name", sd->name,
4924 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
4925
4926
4927 return table;
4928}
4929
4930static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
4931{
4932 struct ctl_table *entry, *table;
4933 struct sched_domain *sd;
4934 int domain_num = 0, i;
4935 char buf[32];
4936
4937 for_each_domain(cpu, sd)
4938 domain_num++;
4939 entry = table = sd_alloc_ctl_entry(domain_num + 1);
4940 if (table == NULL)
4941 return NULL;
4942
4943 i = 0;
4944 for_each_domain(cpu, sd) {
4945 snprintf(buf, 32, "domain%d", i);
4946 entry->procname = kstrdup(buf, GFP_KERNEL);
4947 entry->mode = 0555;
4948 entry->child = sd_alloc_ctl_domain_table(sd);
4949 entry++;
4950 i++;
4951 }
4952 return table;
4953}
4954
4955static struct ctl_table_header *sd_sysctl_header;
4956static void register_sched_domain_sysctl(void)
4957{
4958 int i, cpu_num = num_possible_cpus();
4959 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
4960 char buf[32];
4961
4962 WARN_ON(sd_ctl_dir[0].child);
4963 sd_ctl_dir[0].child = entry;
4964
4965 if (entry == NULL)
4966 return;
4967
4968 for_each_possible_cpu(i) {
4969 snprintf(buf, 32, "cpu%d", i);
4970 entry->procname = kstrdup(buf, GFP_KERNEL);
4971 entry->mode = 0555;
4972 entry->child = sd_alloc_ctl_cpu_table(i);
4973 entry++;
4974 }
4975
4976 WARN_ON(sd_sysctl_header);
4977 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
4978}
4979
4980
4981static void unregister_sched_domain_sysctl(void)
4982{
4983 if (sd_sysctl_header)
4984 unregister_sysctl_table(sd_sysctl_header);
4985 sd_sysctl_header = NULL;
4986 if (sd_ctl_dir[0].child)
4987 sd_free_ctl_entry(&sd_ctl_dir[0].child);
4988}
4989#else
/* Empty stub used when CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are not both set. */
static void register_sched_domain_sysctl(void)
{
}
/* Empty stub used when CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are not both set. */
static void unregister_sched_domain_sysctl(void)
{
}
4996#endif
4997
4998static void set_rq_online(struct rq *rq)
4999{
5000 if (!rq->online) {
5001 const struct sched_class *class;
5002
5003 cpumask_set_cpu(rq->cpu, rq->rd->online);
5004 rq->online = 1;
5005
5006 for_each_class(class) {
5007 if (class->rq_online)
5008 class->rq_online(rq);
5009 }
5010 }
5011}
5012
5013static void set_rq_offline(struct rq *rq)
5014{
5015 if (rq->online) {
5016 const struct sched_class *class;
5017
5018 for_each_class(class) {
5019 if (class->rq_offline)
5020 class->rq_offline(rq);
5021 }
5022
5023 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5024 rq->online = 0;
5025 }
5026}
5027
5028
5029
5030
5031
/*
 * CPU hotplug notifier callback for the scheduler.
 *
 * @hcpu carries the CPU number.  The CPU_TASKS_FROZEN bit is masked off
 * @action before dispatching.  Always returns NOTIFY_OK, and
 * update_max_interval() is called for every action.
 */
static int __cpuinit
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;
	unsigned long flags;
	struct rq *rq = cpu_rq(cpu);

	switch (action & ~CPU_TASKS_FROZEN) {

	case CPU_UP_PREPARE:
		/* Start the runqueue off with the global calc_load_update. */
		rq->calc_load_update = calc_load_update;
		break;

	case CPU_ONLINE:
		/* Mark the runqueue online if it has a root domain. */
		raw_spin_lock_irqsave(&rq->lock, flags);
		if (rq->rd) {
			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));

			set_rq_online(rq);
		}
		raw_spin_unlock_irqrestore(&rq->lock, flags);
		break;

#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DYING:
		sched_ttwu_pending();
		/* Take the runqueue offline and move its tasks away. */
		raw_spin_lock_irqsave(&rq->lock, flags);
		if (rq->rd) {
			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
			set_rq_offline(rq);
		}
		migrate_tasks(cpu);
		/* Exactly one runnable task must remain after migration. */
		BUG_ON(rq->nr_running != 1);
		raw_spin_unlock_irqrestore(&rq->lock, flags);
		break;

	case CPU_DEAD:
		calc_load_migrate(rq);
		break;
#endif
	}

	update_max_interval();

	return NOTIFY_OK;
}
5080
5081
5082
5083
5084
5085
/*
 * Notifier block that hooks migration_call() into CPU hotplug at
 * priority CPU_PRI_MIGRATION.
 */
static struct notifier_block __cpuinitdata migration_notifier = {
	.notifier_call = migration_call,
	.priority = CPU_PRI_MIGRATION,
};
5090
5091static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5092 unsigned long action, void *hcpu)
5093{
5094 switch (action & ~CPU_TASKS_FROZEN) {
5095 case CPU_STARTING:
5096 case CPU_DOWN_FAILED:
5097 set_cpu_active((long)hcpu, true);
5098 return NOTIFY_OK;
5099 default:
5100 return NOTIFY_DONE;
5101 }
5102}
5103
5104static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
5105 unsigned long action, void *hcpu)
5106{
5107 switch (action & ~CPU_TASKS_FROZEN) {
5108 case CPU_DOWN_PREPARE:
5109 set_cpu_active((long)hcpu, false);
5110 return NOTIFY_OK;
5111 default:
5112 return NOTIFY_DONE;
5113 }
5114}
5115
/*
 * Early initcall: run the migration notifier by hand for the CPU this
 * executes on (CPU_UP_PREPARE, then CPU_ONLINE), then register the
 * migration notifier and the active/inactive CPU notifiers.
 * Always returns 0.
 */
static int __init migration_init(void)
{
	void *cpu = (void *)(long)smp_processor_id();
	int err;

	/* Bring up the current CPU's state before registering. */
	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
	BUG_ON(err == NOTIFY_BAD);
	migration_call(&migration_notifier, CPU_ONLINE, cpu);
	register_cpu_notifier(&migration_notifier);

	/* Notifiers that maintain the active-CPU state. */
	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);

	return 0;
}
early_initcall(migration_init);
5134#endif
5135
5136#ifdef CONFIG_SMP
5137
/* NOTE(review): presumably a scratch cpumask for sched-domain code; its users are outside this chunk - confirm. */
static cpumask_var_t sched_domains_tmpmask;
5139
5140#ifdef CONFIG_SCHED_DEBUG
5141
/* Set to 1 when "sched_debug" is given on the kernel command line. */
static __read_mostly int sched_debug_enabled;

/*
 * Early-param handler for "sched_debug": enables scheduler debugging.
 * The parameter value in @str is ignored.  Always returns 0.
 */
static int __init sched_debug_setup(char *str)
{
	sched_debug_enabled = 1;

	return 0;
}
early_param("sched_debug", sched_debug_setup);
5151
5152static inline bool sched_debug(void)
5153{
5154 return sched_debug_enabled;
5155}
5156
5157static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5158 struct cpumask *groupmask)
5159{
5160 struct sched_group *group = sd->groups;
5161 char str[256];
5162
5163 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5164 cpumask_clear(groupmask);
5165
5166 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5167
5168 if (!(sd->flags & SD_LOAD_BALANCE)) {
5169 printk("does not load-balance\n");
5170 if (sd->parent)
5171 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5172 " has parent");
5173 return -1;
5174 }
5175
5176 printk(KERN_CONT "span %s level %s\n", str, sd->name);
5177
5178 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5179 printk(KERN_ERR "ERROR: domain->span does not contain "
5180 "CPU%d\n", cpu);
5181 }
5182 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5183 printk(KERN_ERR "ERROR: domain->groups does not contain"
5184 " CPU%d\n", cpu);
5185 }
5186
5187 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5188 do {
5189 if (!group) {
5190 printk("\n");
5191 printk(KERN_ERR "ERROR: group is NULL\n");
5192 break;
5193 }
5194
5195
5196
5197
5198
5199
5200 if (!group->sgp->power_orig) {
5201 printk(KERN_CONT "\n");
5202 printk(KERN_ERR "ERROR: domain->cpu_power not "
5203 "set\n");
5204 break;
5205 }
5206
5207 if (!cpumask_weight(sched_group_cpus(group))) {
5208 printk(KERN_CONT "\n");
5209 printk(KERN_ERR "ERROR: empty group\n");
5210 break;
5211 }
5212
5213 if (!(sd->flags & SD_OVERLAP) &&
5214 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5215 printk(KERN_CONT "\n");
5216 printk(KERN_ERR "ERROR: repeated CPUs\n");
5217 break;
5218 }
5219
5220 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5221
5222 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
5223
5224 printk(KERN_CONT " %s", str);
5225 if (group->sgp->power != SCHED_POWER_SCALE) {
5226 printk(KERN_CONT " (cpu_power = %d)",
5227 group->sgp->power);
5228 }
5229
5230 group = group->next;
5231 } while (group != sd->groups);
5232 printk(KERN_CONT "\n");
5233
5234 if (!cpumask_equal(sched_domain_span(sd), groupmask))
5235 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5236
5237 if (sd->parent &&
5238 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5239 printk(KERN_ERR "ERROR: parent span is not a superset "
5240 "of domain->span\n");
5241 return 0;
5242}
5243
5244static void sched_domain_debug(struct sched_domain *sd, int cpu)
5245{
5246 int level = 0;
5247
5248 if (!sched_debug_enabled)
5249 return;
5250
5251 if (!sd) {
5252 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5253 return;
5254 }
5255
5256 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5257
5258 for (;;) {
5259 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5260 break;
5261 level++;
5262 sd = sd->parent;
5263 if (!sd)
5264 break;
5265 }
5266}
5267#else
/* Without CONFIG_SCHED_DEBUG the domain dump compiles away to nothing. */
# define sched_domain_debug(sd, cpu) do { } while (0)
/* Without CONFIG_SCHED_DEBUG debugging can never be enabled. */
static inline bool sched_debug(void)
{
	return false;
}
5273#endif
5274
5275static int sd_degenerate(struct sched_domain *sd)
5276{
5277 if (cpumask_weight(sched_domain_span(sd)) == 1)
5278 return 1;
5279
5280
5281 if (sd->flags & (SD_LOAD_BALANCE |
5282 SD_BALANCE_NEWIDLE |
5283 SD_BALANCE_FORK |
5284 SD_BALANCE_EXEC |
5285 SD_SHARE_CPUPOWER |
5286 SD_SHARE_PKG_RESOURCES)) {
5287 if (sd->groups != sd->groups->next)
5288 return 0;
5289 }
5290
5291
5292 if (sd->flags & (SD_WAKE_AFFINE))
5293 return 0;
5294
5295 return 1;
5296}
5297
5298static int
5299sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5300{
5301 unsigned long cflags = sd->flags, pflags = parent->flags;
5302
5303 if (sd_degenerate(parent))
5304 return 1;
5305
5306 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5307 return 0;
5308
5309
5310 if (parent->groups == parent->groups->next) {
5311 pflags &= ~(SD_LOAD_BALANCE |
5312 SD_BALANCE_NEWIDLE |
5313 SD_BALANCE_FORK |
5314 SD_BALANCE_EXEC |
5315 SD_SHARE_CPUPOWER |
5316 SD_SHARE_PKG_RESOURCES);
5317 if (nr_node_ids == 1)
5318 pflags &= ~SD_SERIALIZE;
5319 }
5320 if (~cflags & pflags)
5321 return 0;
5322
5323 return 1;
5324}
5325
5326static void free_rootdomain(struct rcu_head *rcu)
5327{
5328 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5329
5330 cpupri_cleanup(&rd->cpupri);
5331 free_cpumask_var(rd->rto_mask);
5332 free_cpumask_var(rd->online);
5333 free_cpumask_var(rd->span);
5334 kfree(rd);
5335}
5336
5337static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5338{
5339 struct root_domain *old_rd = NULL;
5340 unsigned long flags;
5341
5342 raw_spin_lock_irqsave(&rq->lock, flags);
5343
5344 if (rq->rd) {
5345 old_rd = rq->rd;
5346
5347 if (cpumask_test_cpu(rq->cpu, old_rd->online))
5348 set_rq_offline(rq);
5349
5350 cpumask_clear_cpu(rq->cpu, old_rd->span);
5351
5352
5353
5354
5355
5356
5357 if (!atomic_dec_and_test(&old_rd->refcount))
5358 old_rd = NULL;
5359 }
5360
5361 atomic_inc(&rd->refcount);
5362 rq->rd = rd;
5363
5364 cpumask_set_cpu(rq->cpu, rd->span);
5365 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5366 set_rq_online(rq);
5367
5368 raw_spin_unlock_irqrestore(&rq->lock, flags);
5369
5370 if (old_rd)
5371 call_rcu_sched(&old_rd->rcu, free_rootdomain);
5372}
5373
5374static int init_rootdomain(struct root_domain *rd)
5375{
5376 memset(rd, 0, sizeof(*rd));
5377
5378 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
5379 goto out;
5380 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
5381 goto free_span;
5382 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5383 goto free_online;
5384
5385 if (cpupri_init(&rd->cpupri) != 0)
5386 goto free_rto_mask;
5387 return 0;
5388
5389free_rto_mask:
5390 free_cpumask_var(rd->rto_mask);
5391free_online:
5392 free_cpumask_var(rd->online);
5393free_span:
5394 free_cpumask_var(rd->span);
5395out:
5396 return -ENOMEM;
5397}
5398
5399
5400
5401
5402
/*
 * The default root domain. Set up by init_defrootdomain(), which gives it
 * an initial reference count of 1.
 */
struct root_domain def_root_domain;
5404
5405static void init_defrootdomain(void)
5406{
5407 init_rootdomain(&def_root_domain);
5408
5409 atomic_set(&def_root_domain.refcount, 1);
5410}
5411
5412static struct root_domain *alloc_rootdomain(void)
5413{
5414 struct root_domain *rd;
5415
5416 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5417 if (!rd)
5418 return NULL;
5419
5420 if (init_rootdomain(rd) != 0) {
5421 kfree(rd);
5422 return NULL;
5423 }
5424
5425 return rd;
5426}
5427
5428static void free_sched_groups(struct sched_group *sg, int free_sgp)
5429{
5430 struct sched_group *tmp, *first;
5431
5432 if (!sg)
5433 return;
5434
5435 first = sg;
5436 do {
5437 tmp = sg->next;
5438
5439 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
5440 kfree(sg->sgp);
5441
5442 kfree(sg);
5443 sg = tmp;
5444 } while (sg != first);
5445}
5446
5447static void free_sched_domain(struct rcu_head *rcu)
5448{
5449 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5450
5451
5452
5453
5454
5455 if (sd->flags & SD_OVERLAP) {
5456 free_sched_groups(sd->groups, 1);
5457 } else if (atomic_dec_and_test(&sd->groups->ref)) {
5458 kfree(sd->groups->sgp);
5459 kfree(sd->groups);
5460 }
5461 kfree(sd);
5462}
5463
5464static void destroy_sched_domain(struct sched_domain *sd, int cpu)
5465{
5466 call_rcu(&sd->rcu, free_sched_domain);
5467}
5468
5469static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5470{
5471 for (; sd; sd = sd->parent)
5472 destroy_sched_domain(sd, cpu);
5473}
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
/*
 * Per-CPU shortcut to the highest sched domain that has
 * SD_SHARE_PKG_RESOURCES set, plus an ID for it: the first CPU in that
 * domain's span, or the CPU's own number when there is no such domain.
 * Both are maintained by update_top_cache_domain().
 */
DEFINE_PER_CPU(struct sched_domain *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_id);
5486
5487static void update_top_cache_domain(int cpu)
5488{
5489 struct sched_domain *sd;
5490 int id = cpu;
5491
5492 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5493 if (sd)
5494 id = cpumask_first(sched_domain_span(sd));
5495
5496 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5497 per_cpu(sd_llc_id, cpu) = id;
5498}
5499
5500
5501
5502
5503
5504static void
5505cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5506{
5507 struct rq *rq = cpu_rq(cpu);
5508 struct sched_domain *tmp;
5509
5510
5511 for (tmp = sd; tmp; ) {
5512 struct sched_domain *parent = tmp->parent;
5513 if (!parent)
5514 break;
5515
5516 if (sd_parent_degenerate(tmp, parent)) {
5517 tmp->parent = parent->parent;
5518 if (parent->parent)
5519 parent->parent->child = tmp;
5520 destroy_sched_domain(parent, cpu);
5521 } else
5522 tmp = tmp->parent;
5523 }
5524
5525 if (sd && sd_degenerate(sd)) {
5526 tmp = sd;
5527 sd = sd->parent;
5528 destroy_sched_domain(tmp, cpu);
5529 if (sd)
5530 sd->child = NULL;
5531 }
5532
5533 sched_domain_debug(sd, cpu);
5534
5535 rq_attach_root(rq, rd);
5536 tmp = rq->sd;
5537 rcu_assign_pointer(rq->sd, sd);
5538 destroy_sched_domains(tmp, cpu);
5539
5540 update_top_cache_domain(cpu);
5541}
5542
5543
/* CPUs listed in the "isolcpus=" boot parameter. */
static cpumask_var_t cpu_isolated_map;
5545
5546
5547static int __init isolated_cpu_setup(char *str)
5548{
5549 alloc_bootmem_cpumask_var(&cpu_isolated_map);
5550 cpulist_parse(str, cpu_isolated_map);
5551 return 1;
5552}
5553
5554__setup("isolcpus=", isolated_cpu_setup);
5555
/* Returns the mask of all CPUs on the NUMA node that @cpu belongs to. */
static const struct cpumask *cpu_cpu_mask(int cpu)
{
	int node = cpu_to_node(cpu);

	return cpumask_of_node(node);
}
5560
/*
 * Per-topology-level storage: for every CPU, a pointer to the sched_domain,
 * sched_group and sched_group_power object of that level. Slots are set to
 * NULL by claim_allocations() once the object is in use.
 */
struct sd_data {
	struct sched_domain **__percpu sd;
	struct sched_group **__percpu sg;
	struct sched_group_power **__percpu sgp;
};
5566
/*
 * Allocation state filled in by __visit_domain_allocation_hell(): a per-CPU
 * array of sched_domain pointers and a freshly allocated root domain.
 */
struct s_data {
	struct sched_domain ** __percpu sd;
	struct root_domain *rd;
};
5571
/*
 * How far __visit_domain_allocation_hell() got. __free_domain_allocs()
 * uses it to undo that stage and, by falling through, every stage below it.
 */
enum s_alloc {
	sa_rootdomain,	/* everything allocated, including the root domain */
	sa_sd,		/* per-CPU sd array allocated; root domain is not */
	sa_sd_storage,	/* only the __sdt_alloc() storage may exist */
	sa_none,	/* nothing to free */
};
5578
struct sched_domain_topology_level;

/* Initialises and returns the sched_domain of topology level @tl for @cpu. */
typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
/* Returns the cpumask associated with @cpu at one topology level. */
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);

/* Flag bit; presumably for sched_domain_topology_level::flags -- TODO confirm. */
#define SDTL_OVERLAP 0x01

/*
 * One level of the scheduler topology table. The field order matters:
 * default_topology[] initialises entries positionally (init, mask).
 */
struct sched_domain_topology_level {
	sched_domain_init_f init;	/* builds the domain for a CPU */
	sched_domain_mask_f mask;	/* CPUs covered at this level */
	int flags;			/* presumably SDTL_* bits -- TODO confirm */
	int numa_level;			/* not used in this part of the file */
	struct sd_data data;		/* per-CPU storage for this level */
};
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
5608{
5609 const struct cpumask *span = sched_domain_span(sd);
5610 struct sd_data *sdd = sd->private;
5611 struct sched_domain *sibling;
5612 int i;
5613
5614 for_each_cpu(i, span) {
5615 sibling = *per_cpu_ptr(sdd->sd, i);
5616 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5617 continue;
5618
5619 cpumask_set_cpu(i, sched_group_mask(sg));
5620 }
5621}
5622
5623
5624
5625
5626
/*
 * Returns the first CPU that is both a member of group @sg and set in the
 * group's mask.
 */
int group_balance_cpu(struct sched_group *sg)
{
	const struct cpumask *members = sched_group_cpus(sg);
	const struct cpumask *allowed = sched_group_mask(sg);

	return cpumask_first_and(members, allowed);
}
5631
5632static int
5633build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5634{
5635 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
5636 const struct cpumask *span = sched_domain_span(sd);
5637 struct cpumask *covered = sched_domains_tmpmask;
5638 struct sd_data *sdd = sd->private;
5639 struct sched_domain *child;
5640 int i;
5641
5642 cpumask_clear(covered);
5643
5644 for_each_cpu(i, span) {
5645 struct cpumask *sg_span;
5646
5647 if (cpumask_test_cpu(i, covered))
5648 continue;
5649
5650 child = *per_cpu_ptr(sdd->sd, i);
5651
5652
5653 if (!cpumask_test_cpu(i, sched_domain_span(child)))
5654 continue;
5655
5656 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5657 GFP_KERNEL, cpu_to_node(cpu));
5658
5659 if (!sg)
5660 goto fail;
5661
5662 sg_span = sched_group_cpus(sg);
5663 if (child->child) {
5664 child = child->child;
5665 cpumask_copy(sg_span, sched_domain_span(child));
5666 } else
5667 cpumask_set_cpu(i, sg_span);
5668
5669 cpumask_or(covered, covered, sg_span);
5670
5671 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
5672 if (atomic_inc_return(&sg->sgp->ref) == 1)
5673 build_group_mask(sd, sg);
5674
5675
5676
5677
5678
5679
5680 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
5681
5682
5683
5684
5685
5686
5687 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
5688 group_balance_cpu(sg) == cpu)
5689 groups = sg;
5690
5691 if (!first)
5692 first = sg;
5693 if (last)
5694 last->next = sg;
5695 last = sg;
5696 last->next = first;
5697 }
5698 sd->groups = groups;
5699
5700 return 0;
5701
5702fail:
5703 free_sched_groups(first, 0);
5704
5705 return -ENOMEM;
5706}
5707
5708static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5709{
5710 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
5711 struct sched_domain *child = sd->child;
5712
5713 if (child)
5714 cpu = cpumask_first(sched_domain_span(child));
5715
5716 if (sg) {
5717 *sg = *per_cpu_ptr(sdd->sg, cpu);
5718 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
5719 atomic_set(&(*sg)->sgp->ref, 1);
5720 }
5721
5722 return cpu;
5723}
5724
5725
5726
5727
5728
5729
5730
5731
5732static int
5733build_sched_groups(struct sched_domain *sd, int cpu)
5734{
5735 struct sched_group *first = NULL, *last = NULL;
5736 struct sd_data *sdd = sd->private;
5737 const struct cpumask *span = sched_domain_span(sd);
5738 struct cpumask *covered;
5739 int i;
5740
5741 get_group(cpu, sdd, &sd->groups);
5742 atomic_inc(&sd->groups->ref);
5743
5744 if (cpu != cpumask_first(sched_domain_span(sd)))
5745 return 0;
5746
5747 lockdep_assert_held(&sched_domains_mutex);
5748 covered = sched_domains_tmpmask;
5749
5750 cpumask_clear(covered);
5751
5752 for_each_cpu(i, span) {
5753 struct sched_group *sg;
5754 int group = get_group(i, sdd, &sg);
5755 int j;
5756
5757 if (cpumask_test_cpu(i, covered))
5758 continue;
5759
5760 cpumask_clear(sched_group_cpus(sg));
5761 sg->sgp->power = 0;
5762 cpumask_setall(sched_group_mask(sg));
5763
5764 for_each_cpu(j, span) {
5765 if (get_group(j, sdd, NULL) != group)
5766 continue;
5767
5768 cpumask_set_cpu(j, covered);
5769 cpumask_set_cpu(j, sched_group_cpus(sg));
5770 }
5771
5772 if (!first)
5773 first = sg;
5774 if (last)
5775 last->next = sg;
5776 last = sg;
5777 }
5778 last->next = first;
5779
5780 return 0;
5781}
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5794{
5795 struct sched_group *sg = sd->groups;
5796
5797 WARN_ON(!sd || !sg);
5798
5799 do {
5800 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
5801 sg = sg->next;
5802 } while (sg != sd->groups);
5803
5804 if (cpu != group_balance_cpu(sg))
5805 return;
5806
5807 update_group_power(sd, cpu);
5808 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
5809}
5810
5811int __weak arch_sd_sibling_asym_packing(void)
5812{
5813 return 0*SD_ASYM_PACKING;
5814}
5815
5816
5817
5818
5819
5820
/*
 * SD_INIT_NAME() records the stringified domain type in sd->name when
 * CONFIG_SCHED_DEBUG is set and expands to nothing otherwise.
 */
#ifdef CONFIG_SCHED_DEBUG
# define SD_INIT_NAME(sd, type) sd->name = #type
#else
# define SD_INIT_NAME(sd, type) do { } while (0)
#endif
5826
/*
 * SD_INIT_FUNC(type) generates sd_init_<type>(tl, cpu): it fetches the
 * sched_domain of @cpu from the topology level's per-CPU storage, resets it
 * to the SD_<type>_INIT template, names it (debug builds only), points
 * sd->private at the level's sd_data and returns it.
 */
#define SD_INIT_FUNC(type) \
static noinline struct sched_domain * \
sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
{ \
	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
	*sd = SD_##type##_INIT; \
	SD_INIT_NAME(sd, type); \
	sd->private = &tl->data; \
	return sd; \
}

/* Instantiate one initialiser per configured topology level. */
SD_INIT_FUNC(CPU)
#ifdef CONFIG_SCHED_SMT
 SD_INIT_FUNC(SIBLING)
#endif
#ifdef CONFIG_SCHED_MC
 SD_INIT_FUNC(MC)
#endif
#ifdef CONFIG_SCHED_BOOK
 SD_INIT_FUNC(BOOK)
#endif
5848
/*
 * Relax level applied when a domain's attributes do not specify one; set
 * via "relax_domain_level=". A negative value means "not configured".
 */
static int default_relax_domain_level = -1;
/* Not written in this part of the file; presumably the highest domain level -- TODO confirm. */
int sched_domain_level_max;
5851
5852static int __init setup_relax_domain_level(char *str)
5853{
5854 if (kstrtoint(str, 0, &default_relax_domain_level))
5855 pr_warn("Unable to set relax_domain_level\n");
5856
5857 return 1;
5858}
5859__setup("relax_domain_level=", setup_relax_domain_level);
5860
5861static void set_domain_attribute(struct sched_domain *sd,
5862 struct sched_domain_attr *attr)
5863{
5864 int request;
5865
5866 if (!attr || attr->relax_domain_level < 0) {
5867 if (default_relax_domain_level < 0)
5868 return;
5869 else
5870 request = default_relax_domain_level;
5871 } else
5872 request = attr->relax_domain_level;
5873 if (request < sd->level) {
5874
5875 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5876 } else {
5877
5878 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5879 }
5880}
5881
/* Forward declarations: used by the allocation helpers below. */
static void __sdt_free(const struct cpumask *cpu_map);
static int __sdt_alloc(const struct cpumask *cpu_map);
5884
5885static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
5886 const struct cpumask *cpu_map)
5887{
5888 switch (what) {
5889 case sa_rootdomain:
5890 if (!atomic_read(&d->rd->refcount))
5891 free_rootdomain(&d->rd->rcu);
5892 case sa_sd:
5893 free_percpu(d->sd);
5894 case sa_sd_storage:
5895 __sdt_free(cpu_map);
5896 case sa_none:
5897 break;
5898 }
5899}
5900
5901static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
5902 const struct cpumask *cpu_map)
5903{
5904 memset(d, 0, sizeof(*d));
5905
5906 if (__sdt_alloc(cpu_map))
5907 return sa_sd_storage;
5908 d->sd = alloc_percpu(struct sched_domain *);
5909 if (!d->sd)
5910 return sa_sd_storage;
5911 d->rd = alloc_rootdomain();
5912 if (!d->rd)
5913 return sa_sd;
5914 return sa_rootdomain;
5915}
5916
5917
5918
5919
5920
5921
5922static void claim_allocations(int cpu, struct sched_domain *sd)
5923{
5924 struct sd_data *sdd = sd->private;
5925
5926 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
5927 *per_cpu_ptr(sdd->sd, cpu) = NULL;
5928
5929 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
5930 *per_cpu_ptr(sdd->sg, cpu) = NULL;
5931
5932 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
5933 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
5934}
5935
5936#ifdef CONFIG_SCHED_SMT
/* Returns the thread-sibling mask of @cpu as reported by the topology code. */
static const struct cpumask *cpu_smt_mask(int cpu)
{
	const struct cpumask *siblings = topology_thread_cpumask(cpu);

	return siblings;
}
5941#endif
5942
5943
5944
5945
/*
 * Default topology table: one { init, mask } entry per configured level,
 * listed in the order SMT, MC, BOOK, CPU and terminated by an entry with a
 * NULL init function.
 */
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ sd_init_SIBLING, cpu_smt_mask, },
#endif
#ifdef CONFIG_SCHED_MC
	{ sd_init_MC, cpu_coregroup_mask, },
#endif
#ifdef CONFIG_SCHED_BOOK
	{ sd_init_BOOK, cpu_book_mask, },
#endif
	{ sd_init_CPU, cpu_cpu_mask, },
	{ NULL, },
};

/* Topology table currently in effect; starts out as default_topology. */
static struct sched_domain_topology_level *sched_domain_topology = default_topology;
5961
5962#ifdef CONFIG_NUMA