1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21#include <linux/mm.h>
22#include <linux/module.h>
23#include <linux/nmi.h>
24#include <linux/init.h>
25#include <asm/uaccess.h>
26#include <linux/highmem.h>
27#include <linux/smp_lock.h>
28#include <asm/mmu_context.h>
29#include <linux/interrupt.h>
30#include <linux/completion.h>
31#include <linux/kernel_stat.h>
32#include <linux/security.h>
33#include <linux/notifier.h>
34#include <linux/profile.h>
35#include <linux/suspend.h>
36#include <linux/blkdev.h>
37#include <linux/delay.h>
38#include <linux/smp.h>
39#include <linux/timer.h>
40#include <linux/rcupdate.h>
41#include <linux/cpu.h>
42#include <linux/percpu.h>
43#include <linux/kthread.h>
44#include <linux/seq_file.h>
45#include <linux/times.h>
46#include <asm/tlb.h>
47
48#include <asm/unistd.h>
49
50#ifdef CONFIG_NUMA
51#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
52#else
53#define cpu_to_node_mask(cpu) (cpu_online_map)
54#endif
55
56
57
58
59
60
61#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
62#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
63#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
64
65
66
67
68
69
70#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
71#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
72#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
73
74
75
76
77#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
78#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
79
80
81
82
83
84
85
86
87#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
88#define DEF_TIMESLICE (100 * HZ / 1000)
89#define ON_RUNQUEUE_WEIGHT 30
90#define CHILD_PENALTY 95
91#define PARENT_PENALTY 100
92#define EXIT_WEIGHT 3
93#define PRIO_BONUS_RATIO 25
94#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
95#define INTERACTIVE_DELTA 2
96#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
97#define STARVATION_LIMIT (MAX_SLEEP_AVG)
98#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
99#define CREDIT_LIMIT 100
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129#define CURRENT_BONUS(p) \
130 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
131 MAX_SLEEP_AVG)
132
133#ifdef CONFIG_SMP
134#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \
135 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
136 num_online_cpus())
137#else
138#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \
139 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
140#endif
141
142#define SCALE(v1,v1_max,v2_max) \
143 (v1) * (v2_max) / (v1_max)
144
145#define DELTA(p) \
146 (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
147
148#define TASK_INTERACTIVE(p) \
149 ((p)->prio <= (p)->static_prio - DELTA(p))
150
151#define INTERACTIVE_SLEEP(p) \
152 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
153 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
154
155#define HIGH_CREDIT(p) \
156 ((p)->interactive_credit > CREDIT_LIMIT)
157
158#define LOW_CREDIT(p) \
159 ((p)->interactive_credit < -CREDIT_LIMIT)
160
161#define TASK_PREEMPTS_CURR(p, rq) \
162 ((p)->prio < (rq)->curr->prio)
163
164
165
166
167
168
169
170
171
172
173#define SCALE_PRIO(x, prio) \
174 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
175
176static unsigned int task_timeslice(task_t *p)
177{
178 if (p->static_prio < NICE_TO_PRIO(0))
179 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
180 else
181 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
182}
183#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
184 < (long long) (sd)->cache_hot_time)
185
186enum idle_type
187{
188 IDLE,
189 NOT_IDLE,
190 NEWLY_IDLE,
191 MAX_IDLE_TYPES
192};
193
194struct sched_domain;
195
196
197
198
199
200#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
201
202typedef struct runqueue runqueue_t;
203
204struct prio_array {
205 unsigned int nr_active;
206 unsigned long bitmap[BITMAP_SIZE];
207 struct list_head queue[MAX_PRIO];
208};
209
210
211
212
213
214
215
216
217struct runqueue {
218 spinlock_t lock;
219
220
221
222
223
224 unsigned long nr_running;
225#ifdef CONFIG_SMP
226 unsigned long cpu_load;
227#endif
228 unsigned long long nr_switches;
229 unsigned long expired_timestamp, nr_uninterruptible;
230 unsigned long long timestamp_last_tick;
231 task_t *curr, *idle;
232 struct mm_struct *prev_mm;
233 prio_array_t *active, *expired, arrays[2];
234 int best_expired_prio;
235 atomic_t nr_iowait;
236
237#ifdef CONFIG_SMP
238 struct sched_domain *sd;
239
240
241 int active_balance;
242 int push_cpu;
243
244 task_t *migration_thread;
245 struct list_head migration_queue;
246#endif
247
248#ifdef CONFIG_SCHEDSTATS
249
250 struct sched_info rq_sched_info;
251
252
253 unsigned long yld_exp_empty;
254 unsigned long yld_act_empty;
255 unsigned long yld_both_empty;
256 unsigned long yld_cnt;
257
258
259 unsigned long sched_noswitch;
260 unsigned long sched_switch;
261 unsigned long sched_cnt;
262 unsigned long sched_goidle;
263
264
265 unsigned long pt_gained[MAX_IDLE_TYPES];
266 unsigned long pt_lost[MAX_IDLE_TYPES];
267
268
269 unsigned long alb_cnt;
270 unsigned long alb_lost;
271 unsigned long alb_gained;
272 unsigned long alb_failed;
273
274
275 unsigned long ttwu_cnt;
276 unsigned long ttwu_attempts;
277 unsigned long ttwu_moved;
278
279
280 unsigned long wunt_cnt;
281 unsigned long wunt_moved;
282
283
284 unsigned long smt_cnt;
285
286
287 unsigned long sbe_cnt;
288#endif
289};
290
291static DEFINE_PER_CPU(struct runqueue, runqueues);
292
293
294
295
296#ifdef CONFIG_SMP
297#define SCHED_LOAD_SCALE 128UL
298
299#define SD_BALANCE_NEWIDLE 1
300#define SD_BALANCE_EXEC 2
301#define SD_WAKE_IDLE 4
302#define SD_WAKE_AFFINE 8
303#define SD_WAKE_BALANCE 16
304#define SD_SHARE_CPUPOWER 32
305
306struct sched_group {
307 struct sched_group *next;
308 cpumask_t cpumask;
309
310
311
312
313
314
315
316 unsigned long cpu_power;
317};
318
319struct sched_domain {
320
321 struct sched_domain *parent;
322 struct sched_group *groups;
323 cpumask_t span;
324 unsigned long min_interval;
325 unsigned long max_interval;
326 unsigned int busy_factor;
327 unsigned int imbalance_pct;
328 unsigned long long cache_hot_time;
329 unsigned int cache_nice_tries;
330 unsigned int per_cpu_gain;
331 int flags;
332
333
334 unsigned long last_balance;
335 unsigned int balance_interval;
336 unsigned int nr_balance_failed;
337
338#ifdef CONFIG_SCHEDSTATS
339
340 unsigned long lb_cnt[MAX_IDLE_TYPES];
341 unsigned long lb_failed[MAX_IDLE_TYPES];
342 unsigned long lb_imbalance[MAX_IDLE_TYPES];
343 unsigned long lb_nobusyg[MAX_IDLE_TYPES];
344 unsigned long lb_nobusyq[MAX_IDLE_TYPES];
345
346
347 unsigned long sbe_attempts;
348 unsigned long sbe_pushed;
349
350
351 unsigned long ttwu_wake_affine;
352 unsigned long ttwu_wake_balance;
353#endif
354};
355
356#ifndef ARCH_HAS_SCHED_TUNE
357#ifdef CONFIG_SCHED_SMT
358#define ARCH_HAS_SCHED_WAKE_IDLE
359
360#define SD_SIBLING_INIT (struct sched_domain) { \
361 .span = CPU_MASK_NONE, \
362 .parent = NULL, \
363 .groups = NULL, \
364 .min_interval = 1, \
365 .max_interval = 2, \
366 .busy_factor = 8, \
367 .imbalance_pct = 110, \
368 .cache_hot_time = 0, \
369 .cache_nice_tries = 0, \
370 .per_cpu_gain = 25, \
371 .flags = SD_BALANCE_NEWIDLE \
372 | SD_BALANCE_EXEC \
373 | SD_WAKE_AFFINE \
374 | SD_WAKE_IDLE \
375 | SD_SHARE_CPUPOWER, \
376 .last_balance = jiffies, \
377 .balance_interval = 1, \
378 .nr_balance_failed = 0, \
379}
380#endif
381
382
383#define SD_CPU_INIT (struct sched_domain) { \
384 .span = CPU_MASK_NONE, \
385 .parent = NULL, \
386 .groups = NULL, \
387 .min_interval = 1, \
388 .max_interval = 4, \
389 .busy_factor = 64, \
390 .imbalance_pct = 125, \
391 .cache_hot_time = cache_decay_ticks*1000000 ? : (5*1000000/2),\
392 .cache_nice_tries = 1, \
393 .per_cpu_gain = 100, \
394 .flags = SD_BALANCE_NEWIDLE \
395 | SD_BALANCE_EXEC \
396 | SD_WAKE_AFFINE \
397 | SD_WAKE_BALANCE, \
398 .last_balance = jiffies, \
399 .balance_interval = 1, \
400 .nr_balance_failed = 0, \
401}
402
403
404#if defined(CONFIG_NUMA) && !defined(SD_NODE_INIT)
405#define SD_NODE_INIT (struct sched_domain) { \
406 .span = CPU_MASK_NONE, \
407 .parent = NULL, \
408 .groups = NULL, \
409 .min_interval = 8, \
410 .max_interval = 32, \
411 .busy_factor = 32, \
412 .imbalance_pct = 125, \
413 .cache_hot_time = (10*1000000), \
414 .cache_nice_tries = 1, \
415 .per_cpu_gain = 100, \
416 .flags = SD_BALANCE_EXEC \
417 | SD_WAKE_BALANCE, \
418 .last_balance = jiffies, \
419 .balance_interval = 1, \
420 .nr_balance_failed = 0, \
421}
422#endif
423#endif
424#endif
425
426
427#define for_each_domain(cpu, domain) \
428 for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
429
430#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
431#define this_rq() (&__get_cpu_var(runqueues))
432#define task_rq(p) cpu_rq(task_cpu(p))
433#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
434
435
436
437
438#ifndef prepare_arch_switch
439# define prepare_arch_switch(rq, next) do { } while (0)
440# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock)
441# define task_running(rq, p) ((rq)->curr == (p))
442#endif
443
444
445
446
447
448
449static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
450{
451 struct runqueue *rq;
452
453repeat_lock_task:
454 local_irq_save(*flags);
455 rq = task_rq(p);
456 spin_lock(&rq->lock);
457 if (unlikely(rq != task_rq(p))) {
458 spin_unlock_irqrestore(&rq->lock, *flags);
459 goto repeat_lock_task;
460 }
461 return rq;
462}
463
464static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
465{
466 spin_unlock_irqrestore(&rq->lock, *flags);
467}
468
469#ifdef CONFIG_SCHEDSTATS
470
471
472
473
474#define SCHEDSTAT_VERSION 10
475
476static int show_schedstat(struct seq_file *seq, void *v)
477{
478 int cpu;
479 enum idle_type itype;
480
481 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
482 seq_printf(seq, "timestamp %lu\n", jiffies);
483 for_each_online_cpu(cpu) {
484 runqueue_t *rq = cpu_rq(cpu);
485#ifdef CONFIG_SMP
486 struct sched_domain *sd;
487 int dcnt = 0;
488#endif
489
490
491 seq_printf(seq,
492 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
493 "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
494 cpu, rq->yld_both_empty,
495 rq->yld_act_empty, rq->yld_exp_empty,
496 rq->yld_cnt, rq->sched_noswitch,
497 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
498 rq->alb_cnt, rq->alb_gained, rq->alb_lost,
499 rq->alb_failed,
500 rq->ttwu_cnt, rq->ttwu_moved, rq->ttwu_attempts,
501 rq->wunt_cnt, rq->wunt_moved,
502 rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time,
503 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
504
505 for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++)
506 seq_printf(seq, " %lu %lu", rq->pt_gained[itype],
507 rq->pt_lost[itype]);
508 seq_printf(seq, "\n");
509
510#ifdef CONFIG_SMP
511
512 for_each_domain(cpu, sd) {
513 char mask_str[NR_CPUS];
514
515 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
516 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
517 for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++) {
518 seq_printf(seq, " %lu %lu %lu %lu %lu",
519 sd->lb_cnt[itype],
520 sd->lb_failed[itype],
521 sd->lb_imbalance[itype],
522 sd->lb_nobusyq[itype],
523 sd->lb_nobusyg[itype]);
524 }
525 seq_printf(seq, " %lu %lu %lu %lu\n",
526 sd->sbe_pushed, sd->sbe_attempts,
527 sd->ttwu_wake_affine, sd->ttwu_wake_balance);
528 }
529#endif
530 }
531 return 0;
532}
533
534static int schedstat_open(struct inode *inode, struct file *file)
535{
536 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
537 char *buf = kmalloc(size, GFP_KERNEL);
538 struct seq_file *m;
539 int res;
540
541 if (!buf)
542 return -ENOMEM;
543 res = single_open(file, show_schedstat, NULL);
544 if (!res) {
545 m = file->private_data;
546 m->buf = buf;
547 m->size = size;
548 } else
549 kfree(buf);
550 return res;
551}
552
553struct file_operations proc_schedstat_operations = {
554 .open = schedstat_open,
555 .read = seq_read,
556 .llseek = seq_lseek,
557 .release = single_release,
558};
559
560# define schedstat_inc(rq, field) rq->field++;
561# define schedstat_add(rq, field, amt) rq->field += amt;
562#else
563# define schedstat_inc(rq, field) do { } while (0);
564# define schedstat_add(rq, field, amt) do { } while (0);
565#endif
566
567
568
569
570static runqueue_t *this_rq_lock(void)
571{
572 runqueue_t *rq;
573
574 local_irq_disable();
575 rq = this_rq();
576 spin_lock(&rq->lock);
577
578 return rq;
579}
580
581static inline void rq_unlock(runqueue_t *rq)
582{
583 spin_unlock_irq(&rq->lock);
584}
585
586#ifdef CONFIG_SCHEDSTATS
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602static inline void sched_info_dequeued(task_t *t)
603{
604 t->sched_info.last_queued = 0;
605}
606
607
608
609
610
611
612static inline void sched_info_arrive(task_t *t)
613{
614 unsigned long now = jiffies, diff = 0;
615 struct runqueue *rq = task_rq(t);
616
617 if (t->sched_info.last_queued)
618 diff = now - t->sched_info.last_queued;
619 sched_info_dequeued(t);
620 t->sched_info.run_delay += diff;
621 t->sched_info.last_arrival = now;
622 t->sched_info.pcnt++;
623
624 if (!rq)
625 return;
626
627 rq->rq_sched_info.run_delay += diff;
628 rq->rq_sched_info.pcnt++;
629}
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646static inline void sched_info_queued(task_t *t)
647{
648 if (!t->sched_info.last_queued)
649 t->sched_info.last_queued = jiffies;
650}
651
652
653
654
655
656static inline void sched_info_depart(task_t *t)
657{
658 struct runqueue *rq = task_rq(t);
659 unsigned long diff = jiffies - t->sched_info.last_arrival;
660
661 t->sched_info.cpu_time += diff;
662
663 if (rq)
664 rq->rq_sched_info.cpu_time += diff;
665}
666
667
668
669
670
671
672static inline void sched_info_switch(task_t *prev, task_t *next)
673{
674 struct runqueue *rq = task_rq(prev);
675
676
677
678
679
680
681 if (prev != rq->idle)
682 sched_info_depart(prev);
683
684 if (next != rq->idle)
685 sched_info_arrive(next);
686}
687#else
688#define sched_info_queued(t) do { } while (0)
689#define sched_info_switch(t, next) do { } while (0)
690#endif
691
692
693
694
695static void dequeue_task(struct task_struct *p, prio_array_t *array)
696{
697 array->nr_active--;
698 list_del(&p->run_list);
699 if (list_empty(array->queue + p->prio))
700 __clear_bit(p->prio, array->bitmap);
701}
702
703static void enqueue_task(struct task_struct *p, prio_array_t *array)
704{
705 sched_info_queued(p);
706 list_add_tail(&p->run_list, array->queue + p->prio);
707 __set_bit(p->prio, array->bitmap);
708 array->nr_active++;
709 p->array = array;
710}
711
712
713
714
715
716
717static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
718{
719 list_add(&p->run_list, array->queue + p->prio);
720 __set_bit(p->prio, array->bitmap);
721 array->nr_active++;
722 p->array = array;
723}
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739static int effective_prio(task_t *p)
740{
741 int bonus, prio;
742
743 if (rt_task(p))
744 return p->prio;
745
746 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
747
748 prio = p->static_prio - bonus;
749 if (prio < MAX_RT_PRIO)
750 prio = MAX_RT_PRIO;
751 if (prio > MAX_PRIO-1)
752 prio = MAX_PRIO-1;
753 return prio;
754}
755
756
757
758
759static inline void __activate_task(task_t *p, runqueue_t *rq)
760{
761 enqueue_task(p, rq->active);
762 rq->nr_running++;
763}
764
765
766
767
768static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
769{
770 enqueue_task_head(p, rq->active);
771 rq->nr_running++;
772}
773
774static void recalc_task_prio(task_t *p, unsigned long long now)
775{
776 unsigned long long __sleep_time = now - p->timestamp;
777 unsigned long sleep_time;
778
779 if (__sleep_time > NS_MAX_SLEEP_AVG)
780 sleep_time = NS_MAX_SLEEP_AVG;
781 else
782 sleep_time = (unsigned long)__sleep_time;
783
784 if (likely(sleep_time > 0)) {
785
786
787
788
789
790
791 if (p->mm && p->activated != -1 &&
792 sleep_time > INTERACTIVE_SLEEP(p)) {
793 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
794 DEF_TIMESLICE);
795 if (!HIGH_CREDIT(p))
796 p->interactive_credit++;
797 } else {
798
799
800
801
802 sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
803
804
805
806
807
808 if (LOW_CREDIT(p) &&
809 sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
810 sleep_time = JIFFIES_TO_NS(task_timeslice(p));
811
812
813
814
815
816
817 if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) {
818 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
819 sleep_time = 0;
820 else if (p->sleep_avg + sleep_time >=
821 INTERACTIVE_SLEEP(p)) {
822 p->sleep_avg = INTERACTIVE_SLEEP(p);
823 sleep_time = 0;
824 }
825 }
826
827
828
829
830
831
832
833
834
835 p->sleep_avg += sleep_time;
836
837 if (p->sleep_avg > NS_MAX_SLEEP_AVG) {
838 p->sleep_avg = NS_MAX_SLEEP_AVG;
839 if (!HIGH_CREDIT(p))
840 p->interactive_credit++;
841 }
842 }
843 }
844
845 p->prio = effective_prio(p);
846}
847
848
849
850
851
852
853
854static void activate_task(task_t *p, runqueue_t *rq, int local)
855{
856 unsigned long long now;
857
858 now = sched_clock();
859#ifdef CONFIG_SMP
860 if (!local) {
861
862 runqueue_t *this_rq = this_rq();
863 now = (now - this_rq->timestamp_last_tick)
864 + rq->timestamp_last_tick;
865 }
866#endif
867
868 recalc_task_prio(p, now);
869
870
871
872
873
874 if (!p->activated) {
875
876
877
878
879
880
881
882 if (in_interrupt())
883 p->activated = 2;
884 else {
885
886
887
888
889 p->activated = 1;
890 }
891 }
892 p->timestamp = now;
893
894 __activate_task(p, rq);
895}
896
897
898
899
900static void deactivate_task(struct task_struct *p, runqueue_t *rq)
901{
902 rq->nr_running--;
903 if (p->state == TASK_UNINTERRUPTIBLE)
904 rq->nr_uninterruptible++;
905 dequeue_task(p, p->array);
906 p->array = NULL;
907}
908
909
910
911
912
913
914
915
916#ifdef CONFIG_SMP
917static void resched_task(task_t *p)
918{
919 int need_resched, nrpolling;
920
921 BUG_ON(!spin_is_locked(&task_rq(p)->lock));
922
923
924 nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
925 need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
926 nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
927
928 if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
929 smp_send_reschedule(task_cpu(p));
930}
931#else
932static inline void resched_task(task_t *p)
933{
934 set_tsk_need_resched(p);
935}
936#endif
937
938
939
940
941
942inline int task_curr(const task_t *p)
943{
944 return cpu_curr(task_cpu(p)) == p;
945}
946
947#ifdef CONFIG_SMP
948enum request_type {
949 REQ_MOVE_TASK,
950 REQ_SET_DOMAIN,
951};
952
953typedef struct {
954 struct list_head list;
955 enum request_type type;
956
957
958 task_t *task;
959 int dest_cpu;
960
961
962 struct sched_domain *sd;
963
964 struct completion done;
965} migration_req_t;
966
967
968
969
970
971static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
972{
973 runqueue_t *rq = task_rq(p);
974
975
976
977
978
979 if (!p->array && !task_running(rq, p)) {
980 set_task_cpu(p, dest_cpu);
981 return 0;
982 }
983
984 init_completion(&req->done);
985 req->type = REQ_MOVE_TASK;
986 req->task = p;
987 req->dest_cpu = dest_cpu;
988 list_add(&req->list, &rq->migration_queue);
989 return 1;
990}
991
992
993
994
995
996
997
998
999
1000
1001void wait_task_inactive(task_t * p)
1002{
1003 unsigned long flags;
1004 runqueue_t *rq;
1005 int preempted;
1006
1007repeat:
1008 rq = task_rq_lock(p, &flags);
1009
1010 if (unlikely(p->array)) {
1011
1012 preempted = !task_running(rq, p);
1013 task_rq_unlock(rq, &flags);
1014 cpu_relax();
1015 if (preempted)
1016 yield();
1017 goto repeat;
1018 }
1019 task_rq_unlock(rq, &flags);
1020}
1021
1022
1023
1024
1025
1026
1027
1028
1029void kick_process(task_t *p)
1030{
1031 int cpu;
1032
1033 preempt_disable();
1034 cpu = task_cpu(p);
1035 if ((cpu != smp_processor_id()) && task_curr(p))
1036 smp_send_reschedule(cpu);
1037 preempt_enable();
1038}
1039
1040EXPORT_SYMBOL_GPL(kick_process);
1041
1042
1043
1044
1045
1046
1047
1048static inline unsigned long source_load(int cpu)
1049{
1050 runqueue_t *rq = cpu_rq(cpu);
1051 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
1052
1053 return min(rq->cpu_load, load_now);
1054}
1055
1056
1057
1058
1059static inline unsigned long target_load(int cpu)
1060{
1061 runqueue_t *rq = cpu_rq(cpu);
1062 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
1063
1064 return max(rq->cpu_load, load_now);
1065}
1066
1067#endif
1068
1069
1070
1071
1072
1073
1074
1075
1076#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1077static int wake_idle(int cpu, task_t *p)
1078{
1079 cpumask_t tmp;
1080 runqueue_t *rq = cpu_rq(cpu);
1081 struct sched_domain *sd;
1082 int i;
1083
1084 if (idle_cpu(cpu))
1085 return cpu;
1086
1087 sd = rq->sd;
1088 if (!(sd->flags & SD_WAKE_IDLE))
1089 return cpu;
1090
1091 cpus_and(tmp, sd->span, cpu_online_map);
1092 cpus_and(tmp, tmp, p->cpus_allowed);
1093
1094 for_each_cpu_mask(i, tmp) {
1095 if (idle_cpu(i))
1096 return i;
1097 }
1098
1099 return cpu;
1100}
1101#else
1102static inline int wake_idle(int cpu, task_t *p)
1103{
1104 return cpu;
1105}
1106#endif
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122static int try_to_wake_up(task_t * p, unsigned int state, int sync)
1123{
1124 int cpu, this_cpu, success = 0;
1125 unsigned long flags;
1126 long old_state;
1127 runqueue_t *rq;
1128#ifdef CONFIG_SMP
1129 unsigned long load, this_load;
1130 struct sched_domain *sd;
1131 int new_cpu;
1132#endif
1133
1134 rq = task_rq_lock(p, &flags);
1135 schedstat_inc(rq, ttwu_cnt);
1136 old_state = p->state;
1137 if (!(old_state & state))
1138 goto out;
1139
1140 if (p->array)
1141 goto out_running;
1142
1143 cpu = task_cpu(p);
1144 this_cpu = smp_processor_id();
1145
1146#ifdef CONFIG_SMP
1147 if (unlikely(task_running(rq, p)))
1148 goto out_activate;
1149
1150 new_cpu = cpu;
1151
1152 if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1153 goto out_set_cpu;
1154
1155 load = source_load(cpu);
1156 this_load = target_load(this_cpu);
1157
1158
1159
1160
1161
1162 if (sync)
1163 this_load -= SCHED_LOAD_SCALE;
1164
1165
1166 if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
1167 goto out_set_cpu;
1168
1169 new_cpu = this_cpu;
1170
1171
1172
1173
1174
1175 for_each_domain(this_cpu, sd) {
1176 unsigned int imbalance;
1177
1178
1179
1180
1181 imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
1182
1183 if ((sd->flags & SD_WAKE_AFFINE) &&
1184 !task_hot(p, rq->timestamp_last_tick, sd)) {
1185
1186
1187
1188
1189 if (cpu_isset(cpu, sd->span)) {
1190 schedstat_inc(sd, ttwu_wake_affine);
1191 goto out_set_cpu;
1192 }
1193 } else if ((sd->flags & SD_WAKE_BALANCE) &&
1194 imbalance*this_load <= 100*load) {
1195
1196
1197
1198
1199 if (cpu_isset(cpu, sd->span)) {
1200 schedstat_inc(sd, ttwu_wake_balance);
1201 goto out_set_cpu;
1202 }
1203 }
1204 }
1205
1206 new_cpu = cpu;
1207out_set_cpu:
1208 schedstat_inc(rq, ttwu_attempts);
1209 new_cpu = wake_idle(new_cpu, p);
1210 if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
1211 schedstat_inc(rq, ttwu_moved);
1212 set_task_cpu(p, new_cpu);
1213 task_rq_unlock(rq, &flags);
1214
1215 rq = task_rq_lock(p, &flags);
1216 old_state = p->state;
1217 if (!(old_state & state))
1218 goto out;
1219 if (p->array)
1220 goto out_running;
1221
1222 this_cpu = smp_processor_id();
1223 cpu = task_cpu(p);
1224 }
1225
1226out_activate:
1227#endif
1228 if (old_state == TASK_UNINTERRUPTIBLE) {
1229 rq->nr_uninterruptible--;
1230
1231
1232
1233
1234 p->activated = -1;
1235 }
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245 activate_task(p, rq, cpu == this_cpu);
1246 if (!sync || cpu != this_cpu) {
1247 if (TASK_PREEMPTS_CURR(p, rq))
1248 resched_task(rq->curr);
1249 }
1250 success = 1;
1251
1252out_running:
1253 p->state = TASK_RUNNING;
1254out:
1255 task_rq_unlock(rq, &flags);
1256
1257 return success;
1258}
1259
1260int fastcall wake_up_process(task_t * p)
1261{
1262 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1263 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1264}
1265
1266EXPORT_SYMBOL(wake_up_process);
1267
1268int fastcall wake_up_state(task_t *p, unsigned int state)
1269{
1270 return try_to_wake_up(p, state, 0);
1271}
1272
1273#ifdef CONFIG_SMP
1274static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1275 struct sched_domain *sd);
1276#endif
1277
1278
1279
1280
1281
1282void fastcall sched_fork(task_t *p)
1283{
1284
1285
1286
1287
1288
1289
1290 p->state = TASK_RUNNING;
1291 INIT_LIST_HEAD(&p->run_list);
1292 p->array = NULL;
1293 spin_lock_init(&p->switch_lock);
1294#ifdef CONFIG_SCHEDSTATS
1295 memset(&p->sched_info, 0, sizeof(p->sched_info));
1296#endif
1297#ifdef CONFIG_PREEMPT
1298
1299
1300
1301
1302
1303
1304 p->thread_info->preempt_count = 1;
1305#endif
1306
1307
1308
1309
1310
1311 local_irq_disable();
1312 p->time_slice = (current->time_slice + 1) >> 1;
1313
1314
1315
1316
1317 p->first_time_slice = 1;
1318 current->time_slice >>= 1;
1319 p->timestamp = sched_clock();
1320 if (unlikely(!current->time_slice)) {
1321
1322
1323
1324
1325
1326 current->time_slice = 1;
1327 preempt_disable();
1328 scheduler_tick(0, 0);
1329 local_irq_enable();
1330 preempt_enable();
1331 } else
1332 local_irq_enable();
1333}
1334
1335
1336
1337
1338
1339
1340
1341
1342void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1343{
1344 unsigned long flags;
1345 int this_cpu, cpu;
1346 runqueue_t *rq, *this_rq;
1347
1348 rq = task_rq_lock(p, &flags);
1349 cpu = task_cpu(p);
1350 this_cpu = smp_processor_id();
1351
1352 BUG_ON(p->state != TASK_RUNNING);
1353
1354 schedstat_inc(rq, wunt_cnt);
1355
1356
1357
1358
1359
1360
1361 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1362 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1363
1364 p->interactive_credit = 0;
1365
1366 p->prio = effective_prio(p);
1367
1368 if (likely(cpu == this_cpu)) {
1369 if (!(clone_flags & CLONE_VM)) {
1370
1371
1372
1373
1374
1375 if (unlikely(!current->array))
1376 __activate_task(p, rq);
1377 else {
1378 p->prio = current->prio;
1379 list_add_tail(&p->run_list, ¤t->run_list);
1380 p->array = current->array;
1381 p->array->nr_active++;
1382 rq->nr_running++;
1383 }
1384 set_need_resched();
1385 } else
1386
1387 __activate_task(p, rq);
1388
1389
1390
1391
1392
1393
1394 this_rq = rq;
1395 } else {
1396 this_rq = cpu_rq(this_cpu);
1397
1398
1399
1400
1401
1402 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
1403 + rq->timestamp_last_tick;
1404 __activate_task(p, rq);
1405 if (TASK_PREEMPTS_CURR(p, rq))
1406 resched_task(rq->curr);
1407
1408 schedstat_inc(rq, wunt_moved);
1409
1410
1411
1412
1413 task_rq_unlock(rq, &flags);
1414 this_rq = task_rq_lock(current, &flags);
1415 }
1416 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1417 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1418 task_rq_unlock(this_rq, &flags);
1419}
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430void fastcall sched_exit(task_t * p)
1431{
1432 unsigned long flags;
1433 runqueue_t *rq;
1434
1435
1436
1437
1438
1439 rq = task_rq_lock(p->parent, &flags);
1440 if (p->first_time_slice) {
1441 p->parent->time_slice += p->time_slice;
1442 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1443 p->parent->time_slice = task_timeslice(p);
1444 }
1445 if (p->sleep_avg < p->parent->sleep_avg)
1446 p->parent->sleep_avg = p->parent->sleep_avg /
1447 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1448 (EXIT_WEIGHT + 1);
1449 task_rq_unlock(rq, &flags);
1450}
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465static void finish_task_switch(task_t *prev)
1466{
1467 runqueue_t *rq = this_rq();
1468 struct mm_struct *mm = rq->prev_mm;
1469 unsigned long prev_task_flags;
1470
1471 rq->prev_mm = NULL;
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484 prev_task_flags = prev->flags;
1485 finish_arch_switch(rq, prev);
1486 if (mm)
1487 mmdrop(mm);
1488 if (unlikely(prev_task_flags & PF_DEAD))
1489 put_task_struct(prev);
1490}
1491
1492
1493
1494
1495
1496asmlinkage void schedule_tail(task_t *prev)
1497{
1498 finish_task_switch(prev);
1499
1500 if (current->set_child_tid)
1501 put_user(current->pid, current->set_child_tid);
1502}
1503
1504
1505
1506
1507
1508static inline
1509task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1510{
1511 struct mm_struct *mm = next->mm;
1512 struct mm_struct *oldmm = prev->active_mm;
1513
1514 if (unlikely(!mm)) {
1515 next->active_mm = oldmm;
1516 atomic_inc(&oldmm->mm_count);
1517 enter_lazy_tlb(oldmm, next);
1518 } else
1519 switch_mm(oldmm, mm, next);
1520
1521 if (unlikely(!prev->mm)) {
1522 prev->active_mm = NULL;
1523 WARN_ON(rq->prev_mm);
1524 rq->prev_mm = oldmm;
1525 }
1526
1527
1528 switch_to(prev, next, prev);
1529
1530 return prev;
1531}
1532
1533
1534
1535
1536
1537
1538
1539
1540unsigned long nr_running(void)
1541{
1542 unsigned long i, sum = 0;
1543
1544 for_each_online_cpu(i)
1545 sum += cpu_rq(i)->nr_running;
1546
1547 return sum;
1548}
1549
1550unsigned long nr_uninterruptible(void)
1551{
1552 unsigned long i, sum = 0;
1553
1554 for_each_cpu(i)
1555 sum += cpu_rq(i)->nr_uninterruptible;
1556
1557 return sum;
1558}
1559
1560unsigned long long nr_context_switches(void)
1561{
1562 unsigned long long i, sum = 0;
1563
1564 for_each_cpu(i)
1565 sum += cpu_rq(i)->nr_switches;
1566
1567 return sum;
1568}
1569
1570unsigned long nr_iowait(void)
1571{
1572 unsigned long i, sum = 0;
1573
1574 for_each_cpu(i)
1575 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1576
1577 return sum;
1578}
1579
1580#ifdef CONFIG_SMP
1581
1582
1583
1584
1585
1586
1587
1588static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1589{
1590 if (rq1 == rq2)
1591 spin_lock(&rq1->lock);
1592 else {
1593 if (rq1 < rq2) {
1594 spin_lock(&rq1->lock);
1595 spin_lock(&rq2->lock);
1596 } else {
1597 spin_lock(&rq2->lock);
1598 spin_lock(&rq1->lock);
1599 }
1600 }
1601}
1602
1603
1604
1605
1606
1607
1608
1609static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1610{
1611 spin_unlock(&rq1->lock);
1612 if (rq1 != rq2)
1613 spin_unlock(&rq2->lock);
1614}
1615
1616
1617
1618
1619static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1620{
1621 if (unlikely(!spin_trylock(&busiest->lock))) {
1622 if (busiest < this_rq) {
1623 spin_unlock(&this_rq->lock);
1624 spin_lock(&busiest->lock);
1625 spin_lock(&this_rq->lock);
1626 } else
1627 spin_lock(&busiest->lock);
1628 }
1629}
1630
1631
1632
1633
1634static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1635 struct sched_domain *sd)
1636{
1637 unsigned long load, min_load, this_load;
1638 int i, min_cpu;
1639 cpumask_t mask;
1640
1641 min_cpu = UINT_MAX;
1642 min_load = ULONG_MAX;
1643
1644 cpus_and(mask, sd->span, cpu_online_map);
1645 cpus_and(mask, mask, p->cpus_allowed);
1646
1647 for_each_cpu_mask(i, mask) {
1648 load = target_load(i);
1649
1650 if (load < min_load) {
1651 min_cpu = i;
1652 min_load = load;
1653
1654
1655 if (!min_load)
1656 break;
1657 }
1658 }
1659
1660
1661 this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671 if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1672 return min_cpu;
1673
1674 return this_cpu;
1675}
1676
1677
1678
1679
1680
1681
1682
1683static void sched_migrate_task(task_t *p, int dest_cpu)
1684{
1685 migration_req_t req;
1686 runqueue_t *rq;
1687 unsigned long flags;
1688
1689 rq = task_rq_lock(p, &flags);
1690 if (!cpu_isset(dest_cpu, p->cpus_allowed)
1691 || unlikely(cpu_is_offline(dest_cpu)))
1692 goto out;
1693
1694 schedstat_inc(rq, smt_cnt);
1695
1696 if (migrate_task(p, dest_cpu, &req)) {
1697
1698 struct task_struct *mt = rq->migration_thread;
1699 get_task_struct(mt);
1700 task_rq_unlock(rq, &flags);
1701 wake_up_process(mt);
1702 put_task_struct(mt);
1703 wait_for_completion(&req.done);
1704 return;
1705 }
1706out:
1707 task_rq_unlock(rq, &flags);
1708}
1709
1710
1711
1712
1713
1714
1715
1716
1717void sched_exec(void)
1718{
1719 struct sched_domain *tmp, *sd = NULL;
1720 int new_cpu, this_cpu = get_cpu();
1721
1722 schedstat_inc(this_rq(), sbe_cnt);
1723
1724 if (this_rq()->nr_running <= 1)
1725 goto out;
1726
1727 for_each_domain(this_cpu, tmp)
1728 if (tmp->flags & SD_BALANCE_EXEC)
1729 sd = tmp;
1730
1731 if (sd) {
1732 schedstat_inc(sd, sbe_attempts);
1733 new_cpu = find_idlest_cpu(current, this_cpu, sd);
1734 if (new_cpu != this_cpu) {
1735 schedstat_inc(sd, sbe_pushed);
1736 put_cpu();
1737 sched_migrate_task(current, new_cpu);
1738 return;
1739 }
1740 }
1741out:
1742 put_cpu();
1743}
1744
1745
1746
1747
1748
1749static inline
1750void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1751 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1752{
1753 dequeue_task(p, src_array);
1754 src_rq->nr_running--;
1755 set_task_cpu(p, this_cpu);
1756 this_rq->nr_running++;
1757 enqueue_task(p, this_array);
1758 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1759 + this_rq->timestamp_last_tick;
1760
1761
1762
1763
1764 if (TASK_PREEMPTS_CURR(p, this_rq))
1765 resched_task(this_rq->curr);
1766}
1767
1768
1769
1770
1771static inline
1772int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1773 struct sched_domain *sd, enum idle_type idle)
1774{
1775
1776
1777
1778
1779
1780
1781 if (task_running(rq, p))
1782 return 0;
1783 if (!cpu_isset(this_cpu, p->cpus_allowed))
1784 return 0;
1785
1786
1787 if (idle == NEWLY_IDLE ||
1788 sd->nr_balance_failed < sd->cache_nice_tries) {
1789 if (task_hot(p, rq->timestamp_last_tick, sd))
1790 return 0;
1791 }
1792
1793 return 1;
1794}
1795
1796
1797
1798
1799
1800
1801
1802
1803static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1804 unsigned long max_nr_move, struct sched_domain *sd,
1805 enum idle_type idle)
1806{
1807 prio_array_t *array, *dst_array;
1808 struct list_head *head, *curr;
1809 int idx, pulled = 0;
1810 task_t *tmp;
1811
1812 if (max_nr_move <= 0 || busiest->nr_running <= 1)
1813 goto out;
1814
1815
1816
1817
1818
1819
1820
1821 if (busiest->expired->nr_active) {
1822 array = busiest->expired;
1823 dst_array = this_rq->expired;
1824 } else {
1825 array = busiest->active;
1826 dst_array = this_rq->active;
1827 }
1828
1829new_array:
1830
1831 idx = 0;
1832skip_bitmap:
1833 if (!idx)
1834 idx = sched_find_first_bit(array->bitmap);
1835 else
1836 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1837 if (idx >= MAX_PRIO) {
1838 if (array == busiest->expired && busiest->active->nr_active) {
1839 array = busiest->active;
1840 dst_array = this_rq->active;
1841 goto new_array;
1842 }
1843 goto out;
1844 }
1845
1846 head = array->queue + idx;
1847 curr = head->prev;
1848skip_queue:
1849 tmp = list_entry(curr, task_t, run_list);
1850
1851 curr = curr->prev;
1852
1853 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
1854 if (curr != head)
1855 goto skip_queue;
1856 idx++;
1857 goto skip_bitmap;
1858 }
1859
1860
1861
1862
1863
1864
1865 schedstat_inc(this_rq, pt_gained[idle]);
1866 schedstat_inc(busiest, pt_lost[idle]);
1867
1868 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1869 pulled++;
1870
1871
1872 if (pulled < max_nr_move) {
1873 if (curr != head)
1874 goto skip_queue;
1875 idx++;
1876 goto skip_bitmap;
1877 }
1878out:
1879 return pulled;
1880}
1881
1882
1883
1884
1885
1886
1887static struct sched_group *
1888find_busiest_group(struct sched_domain *sd, int this_cpu,
1889 unsigned long *imbalance, enum idle_type idle)
1890{
1891 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1892 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1893
1894 max_load = this_load = total_load = total_pwr = 0;
1895
1896 do {
1897 cpumask_t tmp;
1898 unsigned long load;
1899 int local_group;
1900 int i, nr_cpus = 0;
1901
1902 local_group = cpu_isset(this_cpu, group->cpumask);
1903
1904
1905 avg_load = 0;
1906 cpus_and(tmp, group->cpumask, cpu_online_map);
1907 if (unlikely(cpus_empty(tmp)))
1908 goto nextgroup;
1909
1910 for_each_cpu_mask(i, tmp) {
1911
1912 if (local_group)
1913 load = target_load(i);
1914 else
1915 load = source_load(i);
1916
1917 nr_cpus++;
1918 avg_load += load;
1919 }
1920
1921 if (!nr_cpus)
1922 goto nextgroup;
1923
1924 total_load += avg_load;
1925 total_pwr += group->cpu_power;
1926
1927
1928 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1929
1930 if (local_group) {
1931 this_load = avg_load;
1932 this = group;
1933 goto nextgroup;
1934 } else if (avg_load > max_load) {
1935 max_load = avg_load;
1936 busiest = group;
1937 }
1938nextgroup:
1939 group = group->next;
1940 } while (group != sd->groups);
1941
1942 if (!busiest || this_load >= max_load)
1943 goto out_balanced;
1944
1945 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
1946
1947 if (this_load >= avg_load ||
1948 100*max_load <= sd->imbalance_pct*this_load)
1949 goto out_balanced;
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962 *imbalance = min(max_load - avg_load, avg_load - this_load);
1963
1964
1965 *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
1966 / SCHED_LOAD_SCALE;
1967
1968 if (*imbalance < SCHED_LOAD_SCALE - 1) {
1969 unsigned long pwr_now = 0, pwr_move = 0;
1970 unsigned long tmp;
1971
1972 if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
1973 *imbalance = 1;
1974 return busiest;
1975 }
1976
1977
1978
1979
1980
1981
1982
1983 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
1984 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
1985 pwr_now /= SCHED_LOAD_SCALE;
1986
1987
1988 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
1989 if (max_load > tmp)
1990 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
1991 max_load - tmp);
1992
1993
1994 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
1995 if (max_load < tmp)
1996 tmp = max_load;
1997 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
1998 pwr_move /= SCHED_LOAD_SCALE;
1999
2000
2001 if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
2002 goto out_balanced;
2003
2004 *imbalance = 1;
2005 return busiest;
2006 }
2007
2008
2009 *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
2010
2011 return busiest;
2012
2013out_balanced:
2014 if (busiest && (idle == NEWLY_IDLE ||
2015 (idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) {
2016 *imbalance = 1;
2017 return busiest;
2018 }
2019
2020 *imbalance = 0;
2021 return NULL;
2022}
2023
2024
2025
2026
2027static runqueue_t *find_busiest_queue(struct sched_group *group)
2028{
2029 cpumask_t tmp;
2030 unsigned long load, max_load = 0;
2031 runqueue_t *busiest = NULL;
2032 int i;
2033
2034 cpus_and(tmp, group->cpumask, cpu_online_map);
2035 for_each_cpu_mask(i, tmp) {
2036 load = source_load(i);
2037
2038 if (load > max_load) {
2039 max_load = load;
2040 busiest = cpu_rq(i);
2041 }
2042 }
2043
2044 return busiest;
2045}
2046
2047
2048
2049
2050
2051
2052
2053static int load_balance(int this_cpu, runqueue_t *this_rq,
2054 struct sched_domain *sd, enum idle_type idle)
2055{
2056 struct sched_group *group;
2057 runqueue_t *busiest;
2058 unsigned long imbalance;
2059 int nr_moved;
2060
2061 spin_lock(&this_rq->lock);
2062 schedstat_inc(sd, lb_cnt[idle]);
2063
2064 group = find_busiest_group(sd, this_cpu, &imbalance, idle);
2065 if (!group) {
2066 schedstat_inc(sd, lb_nobusyg[idle]);
2067 goto out_balanced;
2068 }
2069
2070 busiest = find_busiest_queue(group);
2071 if (!busiest) {
2072 schedstat_inc(sd, lb_nobusyq[idle]);
2073 goto out_balanced;
2074 }
2075
2076
2077
2078
2079
2080
2081 if (unlikely(busiest == this_rq)) {
2082 WARN_ON(1);
2083 goto out_balanced;
2084 }
2085
2086 schedstat_add(sd, lb_imbalance[idle], imbalance);
2087
2088 nr_moved = 0;
2089 if (busiest->nr_running > 1) {
2090
2091
2092
2093
2094
2095
2096 double_lock_balance(this_rq, busiest);
2097 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2098 imbalance, sd, idle);
2099 spin_unlock(&busiest->lock);
2100 }
2101 spin_unlock(&this_rq->lock);
2102
2103 if (!nr_moved) {
2104 schedstat_inc(sd, lb_failed[idle]);
2105 sd->nr_balance_failed++;
2106
2107 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2108 int wake = 0;
2109
2110 spin_lock(&busiest->lock);
2111 if (!busiest->active_balance) {
2112 busiest->active_balance = 1;
2113 busiest->push_cpu = this_cpu;
2114 wake = 1;
2115 }
2116 spin_unlock(&busiest->lock);
2117 if (wake)
2118 wake_up_process(busiest->migration_thread);
2119
2120
2121
2122
2123
2124 sd->nr_balance_failed = sd->cache_nice_tries;
2125 }
2126 } else
2127 sd->nr_balance_failed = 0;
2128
2129
2130 sd->balance_interval = sd->min_interval;
2131
2132 return nr_moved;
2133
2134out_balanced:
2135 spin_unlock(&this_rq->lock);
2136
2137
2138 if (sd->balance_interval < sd->max_interval)
2139 sd->balance_interval *= 2;
2140
2141 return 0;
2142}
2143
2144
2145
2146
2147
2148
2149
2150
2151static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2152 struct sched_domain *sd)
2153{
2154 struct sched_group *group;
2155 runqueue_t *busiest = NULL;
2156 unsigned long imbalance;
2157 int nr_moved = 0;
2158
2159 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2160 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
2161 if (!group) {
2162 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2163 goto out;
2164 }
2165
2166 busiest = find_busiest_queue(group);
2167 if (!busiest || busiest == this_rq) {
2168 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2169 goto out;
2170 }
2171
2172
2173 double_lock_balance(this_rq, busiest);
2174
2175 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2176 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2177 imbalance, sd, NEWLY_IDLE);
2178 if (!nr_moved)
2179 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2180
2181 spin_unlock(&busiest->lock);
2182
2183out:
2184 return nr_moved;
2185}
2186
2187
2188
2189
2190
2191static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
2192{
2193 struct sched_domain *sd;
2194
2195 for_each_domain(this_cpu, sd) {
2196 if (sd->flags & SD_BALANCE_NEWIDLE) {
2197 if (load_balance_newidle(this_cpu, this_rq, sd)) {
2198
2199 break;
2200 }
2201 }
2202 }
2203}
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
2214{
2215 struct sched_domain *sd;
2216 struct sched_group *group, *busy_group;
2217 int i;
2218
2219 schedstat_inc(busiest, alb_cnt);
2220 if (busiest->nr_running <= 1)
2221 return;
2222
2223 for_each_domain(busiest_cpu, sd)
2224 if (cpu_isset(busiest->push_cpu, sd->span))
2225 break;
2226 if (!sd)
2227 return;
2228
2229 group = sd->groups;
2230 while (!cpu_isset(busiest_cpu, group->cpumask))
2231 group = group->next;
2232 busy_group = group;
2233
2234 group = sd->groups;
2235 do {
2236 cpumask_t tmp;
2237 runqueue_t *rq;
2238 int push_cpu = 0;
2239
2240 if (group == busy_group)
2241 goto next_group;
2242
2243 cpus_and(tmp, group->cpumask, cpu_online_map);
2244 if (!cpus_weight(tmp))
2245 goto next_group;
2246
2247 for_each_cpu_mask(i, tmp) {
2248 if (!idle_cpu(i))
2249 goto next_group;
2250 push_cpu = i;
2251 }
2252
2253 rq = cpu_rq(push_cpu);
2254
2255
2256
2257
2258
2259
2260
2261 if (unlikely(busiest == rq))
2262 goto next_group;
2263 double_lock_balance(busiest, rq);
2264 if (move_tasks(rq, push_cpu, busiest, 1, sd, IDLE)) {
2265 schedstat_inc(busiest, alb_lost);
2266 schedstat_inc(rq, alb_gained);
2267 } else {
2268 schedstat_inc(busiest, alb_failed);
2269 }
2270 spin_unlock(&rq->lock);
2271next_group:
2272 group = group->next;
2273 } while (group != sd->groups);
2274}
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
2287
2288static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2289 enum idle_type idle)
2290{
2291 unsigned long old_load, this_load;
2292 unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2293 struct sched_domain *sd;
2294
2295
2296 old_load = this_rq->cpu_load;
2297 this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2298
2299
2300
2301
2302
2303 if (this_load > old_load)
2304 old_load++;
2305 this_rq->cpu_load = (old_load + this_load) / 2;
2306
2307 for_each_domain(this_cpu, sd) {
2308 unsigned long interval = sd->balance_interval;
2309
2310 if (idle != IDLE)
2311 interval *= sd->busy_factor;
2312
2313
2314 interval = msecs_to_jiffies(interval);
2315 if (unlikely(!interval))
2316 interval = 1;
2317
2318 if (j - sd->last_balance >= interval) {
2319 if (load_balance(this_cpu, this_rq, sd, idle)) {
2320
2321 idle = NOT_IDLE;
2322 }
2323 sd->last_balance += interval;
2324 }
2325 }
2326}
2327#else
2328
2329
2330
2331static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
2332{
2333}
2334static inline void idle_balance(int cpu, runqueue_t *rq)
2335{
2336}
2337#endif
2338
2339static inline int wake_priority_sleeper(runqueue_t *rq)
2340{
2341 int ret = 0;
2342#ifdef CONFIG_SCHED_SMT
2343 spin_lock(&rq->lock);
2344
2345
2346
2347
2348 if (rq->nr_running) {
2349 resched_task(rq->idle);
2350 ret = 1;
2351 }
2352 spin_unlock(&rq->lock);
2353#endif
2354 return ret;
2355}
2356
2357DEFINE_PER_CPU(struct kernel_stat, kstat);
2358
2359EXPORT_PER_CPU_SYMBOL(kstat);
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371#define EXPIRED_STARVING(rq) \
2372 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
2373 (jiffies - (rq)->expired_timestamp >= \
2374 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
2375 ((rq)->curr->static_prio > (rq)->best_expired_prio))
2376
2377
2378
2379
2380
2381
2382
2383
2384void scheduler_tick(int user_ticks, int sys_ticks)
2385{
2386 int cpu = smp_processor_id();
2387 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2388 runqueue_t *rq = this_rq();
2389 task_t *p = current;
2390
2391 rq->timestamp_last_tick = sched_clock();
2392
2393 if (rcu_pending(cpu))
2394 rcu_check_callbacks(cpu, user_ticks);
2395
2396
2397 if (hardirq_count() - HARDIRQ_OFFSET) {
2398 cpustat->irq += sys_ticks;
2399 sys_ticks = 0;
2400 } else if (softirq_count()) {
2401 cpustat->softirq += sys_ticks;
2402 sys_ticks = 0;
2403 }
2404
2405 if (p == rq->idle) {
2406 if (atomic_read(&rq->nr_iowait) > 0)
2407 cpustat->iowait += sys_ticks;
2408 else
2409 cpustat->idle += sys_ticks;
2410 if (wake_priority_sleeper(rq))
2411 goto out;
2412 rebalance_tick(cpu, rq, IDLE);
2413 return;
2414 }
2415 if (TASK_NICE(p) > 0)
2416 cpustat->nice += user_ticks;
2417 else
2418 cpustat->user += user_ticks;
2419 cpustat->system += sys_ticks;
2420
2421
2422 if (p->array != rq->active) {
2423 set_tsk_need_resched(p);
2424 goto out;
2425 }
2426 spin_lock(&rq->lock);
2427
2428
2429
2430
2431
2432
2433
2434 if (rt_task(p)) {
2435
2436
2437
2438
2439 if ((p->policy == SCHED_RR) && !--p->time_slice) {
2440 p->time_slice = task_timeslice(p);
2441 p->first_time_slice = 0;
2442 set_tsk_need_resched(p);
2443
2444
2445 dequeue_task(p, rq->active);
2446 enqueue_task(p, rq->active);
2447 }
2448 goto out_unlock;
2449 }
2450 if (!--p->time_slice) {
2451 dequeue_task(p, rq->active);
2452 set_tsk_need_resched(p);
2453 p->prio = effective_prio(p);
2454 p->time_slice = task_timeslice(p);
2455 p->first_time_slice = 0;
2456
2457 if (!rq->expired_timestamp)
2458 rq->expired_timestamp = jiffies;
2459 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
2460 enqueue_task(p, rq->expired);
2461 if (p->static_prio < rq->best_expired_prio)
2462 rq->best_expired_prio = p->static_prio;
2463 } else
2464 enqueue_task(p, rq->active);
2465 } else {
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
2483 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
2484 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
2485 (p->array == rq->active)) {
2486
2487 dequeue_task(p, rq->active);
2488 set_tsk_need_resched(p);
2489 p->prio = effective_prio(p);
2490 enqueue_task(p, rq->active);
2491 }
2492 }
2493out_unlock:
2494 spin_unlock(&rq->lock);
2495out:
2496 rebalance_tick(cpu, rq, NOT_IDLE);
2497}
2498
2499#ifdef CONFIG_SCHED_SMT
2500static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2501{
2502 struct sched_domain *sd = this_rq->sd;
2503 cpumask_t sibling_map;
2504 int i;
2505
2506 if (!(sd->flags & SD_SHARE_CPUPOWER))
2507 return;
2508
2509
2510
2511
2512
2513
2514 spin_unlock(&this_rq->lock);
2515
2516 cpus_and(sibling_map, sd->span, cpu_online_map);
2517
2518 for_each_cpu_mask(i, sibling_map)
2519 spin_lock(&cpu_rq(i)->lock);
2520
2521
2522
2523
2524 cpu_clear(this_cpu, sibling_map);
2525
2526 for_each_cpu_mask(i, sibling_map) {
2527 runqueue_t *smt_rq = cpu_rq(i);
2528
2529
2530
2531
2532
2533 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
2534 resched_task(smt_rq->idle);
2535 }
2536
2537 for_each_cpu_mask(i, sibling_map)
2538 spin_unlock(&cpu_rq(i)->lock);
2539
2540
2541
2542
2543}
2544
2545static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2546{
2547 struct sched_domain *sd = this_rq->sd;
2548 cpumask_t sibling_map;
2549 prio_array_t *array;
2550 int ret = 0, i;
2551 task_t *p;
2552
2553 if (!(sd->flags & SD_SHARE_CPUPOWER))
2554 return 0;
2555
2556
2557
2558
2559
2560 spin_unlock(&this_rq->lock);
2561 cpus_and(sibling_map, sd->span, cpu_online_map);
2562 for_each_cpu_mask(i, sibling_map)
2563 spin_lock(&cpu_rq(i)->lock);
2564 cpu_clear(this_cpu, sibling_map);
2565
2566
2567
2568
2569
2570 if (!this_rq->nr_running)
2571 goto out_unlock;
2572 array = this_rq->active;
2573 if (!array->nr_active)
2574 array = this_rq->expired;
2575 BUG_ON(!array->nr_active);
2576
2577 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
2578 task_t, run_list);
2579
2580 for_each_cpu_mask(i, sibling_map) {
2581 runqueue_t *smt_rq = cpu_rq(i);
2582 task_t *smt_curr = smt_rq->curr;
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592 if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
2593 task_timeslice(p) || rt_task(smt_curr)) &&
2594 p->mm && smt_curr->mm && !rt_task(p))
2595 ret = 1;
2596
2597
2598
2599
2600
2601
2602 if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
2603 task_timeslice(smt_curr) || rt_task(p)) &&
2604 smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
2605 (smt_curr == smt_rq->idle && smt_rq->nr_running))
2606 resched_task(smt_curr);
2607 }
2608out_unlock:
2609 for_each_cpu_mask(i, sibling_map)
2610 spin_unlock(&cpu_rq(i)->lock);
2611 return ret;
2612}
2613#else
2614static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2615{
2616}
2617
2618static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2619{
2620 return 0;
2621}
2622#endif
2623
2624
2625
2626
2627asmlinkage void __sched schedule(void)
2628{
2629 long *switch_count;
2630 task_t *prev, *next;
2631 runqueue_t *rq;
2632 prio_array_t *array;
2633 struct list_head *queue;
2634 unsigned long long now;
2635 unsigned long run_time;
2636 int cpu, idx;
2637
2638
2639
2640
2641
2642
2643 if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
2644 if (unlikely(in_atomic())) {
2645 printk(KERN_ERR "bad: scheduling while atomic!\n");
2646 dump_stack();
2647 }
2648 }
2649
2650need_resched:
2651 preempt_disable();
2652 prev = current;
2653 rq = this_rq();
2654
2655
2656
2657
2658
2659 if (unlikely(current == rq->idle) && current->state != TASK_RUNNING) {
2660 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
2661 dump_stack();
2662 }
2663
2664 release_kernel_lock(prev);
2665 schedstat_inc(rq, sched_cnt);
2666 now = sched_clock();
2667 if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
2668 run_time = now - prev->timestamp;
2669 else
2670 run_time = NS_MAX_SLEEP_AVG;
2671
2672
2673
2674
2675
2676
2677 if (HIGH_CREDIT(prev))
2678 run_time /= (CURRENT_BONUS(prev) ? : 1);
2679
2680 spin_lock_irq(&rq->lock);
2681
2682
2683
2684
2685
2686 switch_count = &prev->nivcsw;
2687 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2688 switch_count = &prev->nvcsw;
2689 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
2690 unlikely(signal_pending(prev))))
2691 prev->state = TASK_RUNNING;
2692 else
2693 deactivate_task(prev, rq);
2694 }
2695
2696 cpu = smp_processor_id();
2697 if (unlikely(!rq->nr_running)) {
2698go_idle:
2699 idle_balance(cpu, rq);
2700 if (!rq->nr_running) {
2701 next = rq->idle;
2702 rq->expired_timestamp = 0;
2703 wake_sleeping_dependent(cpu, rq);
2704
2705
2706
2707
2708
2709 if (!rq->nr_running)
2710 goto switch_tasks;
2711 }
2712 } else {
2713 if (dependent_sleeper(cpu, rq)) {
2714 schedstat_inc(rq, sched_goidle);
2715 next = rq->idle;
2716 goto switch_tasks;
2717 }
2718
2719
2720
2721
2722
2723 if (unlikely(!rq->nr_running))
2724 goto go_idle;
2725 }
2726
2727 array = rq->active;
2728 if (unlikely(!array->nr_active)) {
2729
2730
2731
2732 schedstat_inc(rq, sched_switch);
2733 rq->active = rq->expired;
2734 rq->expired = array;
2735 array = rq->active;
2736 rq->expired_timestamp = 0;
2737 rq->best_expired_prio = MAX_PRIO;
2738 } else
2739 schedstat_inc(rq, sched_noswitch);
2740
2741 idx = sched_find_first_bit(array->bitmap);
2742 queue = array->queue + idx;
2743 next = list_entry(queue->next, task_t, run_list);
2744
2745 if (!rt_task(next) && next->activated > 0) {
2746 unsigned long long delta = now - next->timestamp;
2747
2748 if (next->activated == 1)
2749 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2750
2751 array = next->array;
2752 dequeue_task(next, array);
2753 recalc_task_prio(next, next->timestamp + delta);
2754 enqueue_task(next, array);
2755 }
2756 next->activated = 0;
2757switch_tasks:
2758 prefetch(next);
2759 clear_tsk_need_resched(prev);
2760 rcu_qsctr_inc(task_cpu(prev));
2761
2762 prev->sleep_avg -= run_time;
2763 if ((long)prev->sleep_avg <= 0) {
2764 prev->sleep_avg = 0;
2765 if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
2766 prev->interactive_credit--;
2767 }
2768 prev->timestamp = prev->last_ran = now;
2769
2770 sched_info_switch(prev, next);
2771 if (likely(prev != next)) {
2772 next->timestamp = now;
2773 rq->nr_switches++;
2774 rq->curr = next;
2775 ++*switch_count;
2776
2777 prepare_arch_switch(rq, next);
2778 prev = context_switch(rq, prev, next);
2779 barrier();
2780
2781 finish_task_switch(prev);
2782 } else
2783 spin_unlock_irq(&rq->lock);
2784
2785 reacquire_kernel_lock(current);
2786 preempt_enable_no_resched();
2787 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2788 goto need_resched;
2789}
2790
2791EXPORT_SYMBOL(schedule);
2792
2793#ifdef CONFIG_PREEMPT
2794
2795
2796
2797
2798
2799asmlinkage void __sched preempt_schedule(void)
2800{
2801 struct thread_info *ti = current_thread_info();
2802
2803
2804
2805
2806
2807 if (unlikely(ti->preempt_count || irqs_disabled()))
2808 return;
2809
2810need_resched:
2811 ti->preempt_count = PREEMPT_ACTIVE;
2812 schedule();
2813 ti->preempt_count = 0;
2814
2815
2816 barrier();
2817 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2818 goto need_resched;
2819}
2820
2821EXPORT_SYMBOL(preempt_schedule);
2822#endif
2823
2824int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
2825{
2826 task_t *p = curr->task;
2827 return try_to_wake_up(p, mode, sync);
2828}
2829
2830EXPORT_SYMBOL(default_wake_function);
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2842 int nr_exclusive, int sync, void *key)
2843{
2844 struct list_head *tmp, *next;
2845
2846 list_for_each_safe(tmp, next, &q->task_list) {
2847 wait_queue_t *curr;
2848 unsigned flags;
2849 curr = list_entry(tmp, wait_queue_t, task_list);
2850 flags = curr->flags;
2851 if (curr->func(curr, mode, sync, key) &&
2852 (flags & WQ_FLAG_EXCLUSIVE) &&
2853 !--nr_exclusive)
2854 break;
2855 }
2856}
2857
2858
2859
2860
2861
2862
2863
2864void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
2865 int nr_exclusive, void *key)
2866{
2867 unsigned long flags;
2868
2869 spin_lock_irqsave(&q->lock, flags);
2870 __wake_up_common(q, mode, nr_exclusive, 0, key);
2871 spin_unlock_irqrestore(&q->lock, flags);
2872}
2873
2874EXPORT_SYMBOL(__wake_up);
2875
2876
2877
2878
2879void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
2880{
2881 __wake_up_common(q, mode, 1, 0, NULL);
2882}
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2898{
2899 unsigned long flags;
2900 int sync = 1;
2901
2902 if (unlikely(!q))
2903 return;
2904
2905 if (unlikely(!nr_exclusive))
2906 sync = 0;
2907
2908 spin_lock_irqsave(&q->lock, flags);
2909 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
2910 spin_unlock_irqrestore(&q->lock, flags);
2911}
2912EXPORT_SYMBOL_GPL(__wake_up_sync);
2913
2914void fastcall complete(struct completion *x)
2915{
2916 unsigned long flags;
2917
2918 spin_lock_irqsave(&x->wait.lock, flags);
2919 x->done++;
2920 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2921 1, 0, NULL);
2922 spin_unlock_irqrestore(&x->wait.lock, flags);
2923}
2924EXPORT_SYMBOL(complete);
2925
2926void fastcall complete_all(struct completion *x)
2927{
2928 unsigned long flags;
2929
2930 spin_lock_irqsave(&x->wait.lock, flags);
2931 x->done += UINT_MAX/2;
2932 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2933 0, 0, NULL);
2934 spin_unlock_irqrestore(&x->wait.lock, flags);
2935}
2936EXPORT_SYMBOL(complete_all);
2937
2938void fastcall __sched wait_for_completion(struct completion *x)
2939{
2940 might_sleep();
2941 spin_lock_irq(&x->wait.lock);
2942 if (!x->done) {
2943 DECLARE_WAITQUEUE(wait, current);
2944
2945 wait.flags |= WQ_FLAG_EXCLUSIVE;
2946 __add_wait_queue_tail(&x->wait, &wait);
2947 do {
2948 __set_current_state(TASK_UNINTERRUPTIBLE);
2949 spin_unlock_irq(&x->wait.lock);
2950 schedule();
2951 spin_lock_irq(&x->wait.lock);
2952 } while (!x->done);
2953 __remove_wait_queue(&x->wait, &wait);
2954 }
2955 x->done--;
2956 spin_unlock_irq(&x->wait.lock);
2957}
2958EXPORT_SYMBOL(wait_for_completion);
2959
2960#define SLEEP_ON_VAR \
2961 unsigned long flags; \
2962 wait_queue_t wait; \
2963 init_waitqueue_entry(&wait, current);
2964
2965#define SLEEP_ON_HEAD \
2966 spin_lock_irqsave(&q->lock,flags); \
2967 __add_wait_queue(q, &wait); \
2968 spin_unlock(&q->lock);
2969
2970#define SLEEP_ON_TAIL \
2971 spin_lock_irq(&q->lock); \
2972 __remove_wait_queue(q, &wait); \
2973 spin_unlock_irqrestore(&q->lock, flags);
2974
2975void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
2976{
2977 SLEEP_ON_VAR
2978
2979 current->state = TASK_INTERRUPTIBLE;
2980
2981 SLEEP_ON_HEAD
2982 schedule();
2983 SLEEP_ON_TAIL
2984}
2985
2986EXPORT_SYMBOL(interruptible_sleep_on);
2987
2988long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
2989{
2990 SLEEP_ON_VAR
2991
2992 current->state = TASK_INTERRUPTIBLE;
2993
2994 SLEEP_ON_HEAD
2995 timeout = schedule_timeout(timeout);
2996 SLEEP_ON_TAIL
2997
2998 return timeout;
2999}
3000
3001EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3002
3003void fastcall __sched sleep_on(wait_queue_head_t *q)
3004{
3005 SLEEP_ON_VAR
3006
3007 current->state = TASK_UNINTERRUPTIBLE;
3008
3009 SLEEP_ON_HEAD
3010 schedule();
3011 SLEEP_ON_TAIL
3012}
3013
3014EXPORT_SYMBOL(sleep_on);
3015
3016long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3017{
3018 SLEEP_ON_VAR
3019
3020 current->state = TASK_UNINTERRUPTIBLE;
3021
3022 SLEEP_ON_HEAD
3023 timeout = schedule_timeout(timeout);
3024 SLEEP_ON_TAIL
3025
3026 return timeout;
3027}
3028
3029EXPORT_SYMBOL(sleep_on_timeout);
3030
3031void set_user_nice(task_t *p, long nice)
3032{
3033 unsigned long flags;
3034 prio_array_t *array;
3035 runqueue_t *rq;
3036 int old_prio, new_prio, delta;
3037
3038 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3039 return;
3040
3041
3042
3043
3044 rq = task_rq_lock(p, &flags);
3045
3046
3047
3048
3049
3050
3051 if (rt_task(p)) {
3052 p->static_prio = NICE_TO_PRIO(nice);
3053 goto out_unlock;
3054 }
3055 array = p->array;
3056 if (array)
3057 dequeue_task(p, array);
3058
3059 old_prio = p->prio;
3060 new_prio = NICE_TO_PRIO(nice);
3061 delta = new_prio - old_prio;
3062 p->static_prio = NICE_TO_PRIO(nice);
3063 p->prio += delta;
3064
3065 if (array) {
3066 enqueue_task(p, array);
3067
3068
3069
3070
3071 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3072 resched_task(rq->curr);
3073 }
3074out_unlock:
3075 task_rq_unlock(rq, &flags);
3076}
3077
3078EXPORT_SYMBOL(set_user_nice);
3079
3080#ifdef __ARCH_WANT_SYS_NICE
3081
3082
3083
3084
3085
3086
3087
3088
3089asmlinkage long sys_nice(int increment)
3090{
3091 int retval;
3092 long nice;
3093
3094
3095
3096
3097
3098
3099 if (increment < 0) {
3100 if (!capable(CAP_SYS_NICE))
3101 return -EPERM;
3102 if (increment < -40)
3103 increment = -40;
3104 }
3105 if (increment > 40)
3106 increment = 40;
3107
3108 nice = PRIO_TO_NICE(current->static_prio) + increment;
3109 if (nice < -20)
3110 nice = -20;
3111 if (nice > 19)
3112 nice = 19;
3113
3114 retval = security_task_setnice(current, nice);
3115 if (retval)
3116 return retval;
3117
3118 set_user_nice(current, nice);
3119 return 0;
3120}
3121
3122#endif
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132int task_prio(const task_t *p)
3133{
3134 return p->prio - MAX_RT_PRIO;
3135}
3136
3137
3138
3139
3140
3141int task_nice(const task_t *p)
3142{
3143 return TASK_NICE(p);
3144}
3145
3146EXPORT_SYMBOL(task_nice);
3147
3148
3149
3150
3151
3152int idle_cpu(int cpu)
3153{
3154 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3155}
3156
3157EXPORT_SYMBOL_GPL(idle_cpu);
3158
3159
3160
3161
3162
3163static inline task_t *find_process_by_pid(pid_t pid)
3164{
3165 return pid ? find_task_by_pid(pid) : current;
3166}
3167
3168
3169static void __setscheduler(struct task_struct *p, int policy, int prio)
3170{
3171 BUG_ON(p->array);
3172 p->policy = policy;
3173 p->rt_priority = prio;
3174 if (policy != SCHED_NORMAL)
3175 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
3176 else
3177 p->prio = p->static_prio;
3178}
3179
3180
3181
3182
3183static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3184{
3185 struct sched_param lp;
3186 int retval = -EINVAL;
3187 int oldprio;
3188 prio_array_t *array;
3189 unsigned long flags;
3190 runqueue_t *rq;
3191 task_t *p;
3192
3193 if (!param || pid < 0)
3194 goto out_nounlock;
3195
3196 retval = -EFAULT;
3197 if (copy_from_user(&lp, param, sizeof(struct sched_param)))
3198 goto out_nounlock;
3199
3200
3201
3202
3203 read_lock_irq(&tasklist_lock);
3204
3205 p = find_process_by_pid(pid);
3206
3207 retval = -ESRCH;
3208 if (!p)
3209 goto out_unlock_tasklist;
3210
3211
3212
3213
3214
3215 rq = task_rq_lock(p, &flags);
3216
3217 if (policy < 0)
3218 policy = p->policy;
3219 else {
3220 retval = -EINVAL;
3221 if (policy != SCHED_FIFO && policy != SCHED_RR &&
3222 policy != SCHED_NORMAL)
3223 goto out_unlock;
3224 }
3225 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3226
3227
3228
3229
3230
3231 retval = -EINVAL;
3232 if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
3233 goto out_unlock;
3234 if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
3235 goto out_unlock;
3236
3237 retval = -EPERM;
3238 if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
3239 !capable(CAP_SYS_NICE))
3240 goto out_unlock;
3241 if ((current->euid != p->euid) && (current->euid != p->uid) &&
3242 !capable(CAP_SYS_NICE))
3243 goto out_unlock;
3244
3245 retval = security_task_setscheduler(p, policy, &lp);
3246 if (retval)
3247 goto out_unlock;
3248
3249 array = p->array;
3250 if (array)
3251 deactivate_task(p, task_rq(p));
3252 retval = 0;
3253 oldprio = p->prio;
3254 __setscheduler(p, policy, lp.sched_priority);
3255 if (array) {
3256 __activate_task(p, task_rq(p));
3257
3258
3259
3260
3261
3262 if (task_running(rq, p)) {
3263 if (p->prio > oldprio)
3264 resched_task(rq->curr);
3265 } else if (TASK_PREEMPTS_CURR(p, rq))
3266 resched_task(rq->curr);
3267 }
3268
3269out_unlock:
3270 task_rq_unlock(rq, &flags);
3271out_unlock_tasklist:
3272 read_unlock_irq(&tasklist_lock);
3273
3274out_nounlock:
3275 return retval;
3276}
3277
3278
3279
3280
3281
3282
3283
3284asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
3285 struct sched_param __user *param)
3286{
3287 return setscheduler(pid, policy, param);
3288}
3289
3290
3291
3292
3293
3294
3295asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
3296{
3297 return setscheduler(pid, -1, param);
3298}
3299
3300
3301
3302
3303
3304asmlinkage long sys_sched_getscheduler(pid_t pid)
3305{
3306 int retval = -EINVAL;
3307 task_t *p;
3308
3309 if (pid < 0)
3310 goto out_nounlock;
3311
3312 retval = -ESRCH;
3313 read_lock(&tasklist_lock);
3314 p = find_process_by_pid(pid);
3315 if (p) {
3316 retval = security_task_getscheduler(p);
3317 if (!retval)
3318 retval = p->policy;
3319 }
3320 read_unlock(&tasklist_lock);
3321
3322out_nounlock:
3323 return retval;
3324}
3325
3326
3327
3328
3329
3330
3331asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
3332{
3333 struct sched_param lp;
3334 int retval = -EINVAL;
3335 task_t *p;
3336
3337 if (!param || pid < 0)
3338 goto out_nounlock;
3339
3340 read_lock(&tasklist_lock);
3341 p = find_process_by_pid(pid);
3342 retval = -ESRCH;
3343 if (!p)
3344 goto out_unlock;
3345
3346 retval = security_task_getscheduler(p);
3347 if (retval)
3348 goto out_unlock;
3349
3350 lp.sched_priority = p->rt_priority;
3351 read_unlock(&tasklist_lock);
3352
3353
3354
3355
3356 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3357
3358out_nounlock:
3359 return retval;
3360
3361out_unlock:
3362 read_unlock(&tasklist_lock);
3363 return retval;
3364}
3365
3366long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3367{
3368 task_t *p;
3369 int retval;
3370
3371 lock_cpu_hotplug();
3372 read_lock(&tasklist_lock);
3373
3374 p = find_process_by_pid(pid);
3375 if (!p) {
3376 read_unlock(&tasklist_lock);
3377 unlock_cpu_hotplug();
3378 return -ESRCH;
3379 }
3380
3381
3382
3383
3384
3385
3386 get_task_struct(p);
3387 read_unlock(&tasklist_lock);
3388
3389 retval = -EPERM;
3390 if ((current->euid != p->euid) && (current->euid != p->uid) &&
3391 !capable(CAP_SYS_NICE))
3392 goto out_unlock;
3393
3394 retval = set_cpus_allowed(p, new_mask);
3395
3396out_unlock:
3397 put_task_struct(p);
3398 unlock_cpu_hotplug();
3399 return retval;
3400}
3401
3402static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3403 cpumask_t *new_mask)
3404{
3405 if (len < sizeof(cpumask_t)) {
3406 memset(new_mask, 0, sizeof(cpumask_t));
3407 } else if (len > sizeof(cpumask_t)) {
3408 len = sizeof(cpumask_t);
3409 }
3410 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
3411}
3412
3413
3414
3415
3416
3417
3418
3419asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
3420 unsigned long __user *user_mask_ptr)
3421{
3422 cpumask_t new_mask;
3423 int retval;
3424
3425 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
3426 if (retval)
3427 return retval;
3428
3429 return sched_setaffinity(pid, new_mask);
3430}
3431
3432
3433
3434
3435
3436
3437
3438
3439cpumask_t cpu_present_map;
3440EXPORT_SYMBOL(cpu_present_map);
3441
3442#ifndef CONFIG_SMP
3443cpumask_t cpu_online_map = CPU_MASK_ALL;
3444cpumask_t cpu_possible_map = CPU_MASK_ALL;
3445#endif
3446
3447long sched_getaffinity(pid_t pid, cpumask_t *mask)
3448{
3449 int retval;
3450 task_t *p;
3451
3452 lock_cpu_hotplug();
3453 read_lock(&tasklist_lock);
3454
3455 retval = -ESRCH;
3456 p = find_process_by_pid(pid);
3457 if (!p)
3458 goto out_unlock;
3459
3460 retval = 0;
3461 cpus_and(*mask, p->cpus_allowed, cpu_possible_map);
3462
3463out_unlock:
3464 read_unlock(&tasklist_lock);
3465 unlock_cpu_hotplug();
3466 if (retval)
3467 return retval;
3468
3469 return 0;
3470}
3471
3472
3473
3474
3475
3476
3477
3478asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
3479 unsigned long __user *user_mask_ptr)
3480{
3481 int ret;
3482 cpumask_t mask;
3483
3484 if (len < sizeof(cpumask_t))
3485 return -EINVAL;
3486
3487 ret = sched_getaffinity(pid, &mask);
3488 if (ret < 0)
3489 return ret;
3490
3491 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
3492 return -EFAULT;
3493
3494 return sizeof(cpumask_t);
3495}
3496
3497
3498
3499
3500
3501
3502
3503
3504asmlinkage long sys_sched_yield(void)
3505{
3506 runqueue_t *rq = this_rq_lock();
3507 prio_array_t *array = current->array;
3508 prio_array_t *target = rq->expired;
3509
3510 schedstat_inc(rq, yld_cnt);
3511
3512
3513
3514
3515
3516
3517
3518 if (rt_task(current))
3519 target = rq->active;
3520
3521 if (current->array->nr_active == 1) {
3522 schedstat_inc(rq, yld_act_empty);
3523 if (!rq->expired->nr_active)
3524 schedstat_inc(rq, yld_both_empty);
3525 } else if (!rq->expired->nr_active)
3526 schedstat_inc(rq, yld_exp_empty);
3527
3528 dequeue_task(current, array);
3529 enqueue_task(current, target);
3530
3531
3532
3533
3534
3535 _raw_spin_unlock(&rq->lock);
3536 preempt_enable_no_resched();
3537
3538 schedule();
3539
3540 return 0;
3541}
3542
3543void __sched __cond_resched(void)
3544{
3545 set_current_state(TASK_RUNNING);
3546 schedule();
3547}
3548
3549EXPORT_SYMBOL(__cond_resched);
3550
3551
3552
3553
3554
3555
3556
3557void __sched yield(void)
3558{
3559 set_current_state(TASK_RUNNING);
3560 sys_sched_yield();
3561}
3562
3563EXPORT_SYMBOL(yield);
3564
3565
3566
3567
3568
3569
3570
3571
3572void __sched io_schedule(void)
3573{
3574 struct runqueue *rq = this_rq();
3575
3576 atomic_inc(&rq->nr_iowait);
3577 schedule();
3578 atomic_dec(&rq->nr_iowait);
3579}
3580
3581EXPORT_SYMBOL(io_schedule);
3582
3583long __sched io_schedule_timeout(long timeout)
3584{
3585 struct runqueue *rq = this_rq();
3586 long ret;
3587
3588 atomic_inc(&rq->nr_iowait);
3589 ret = schedule_timeout(timeout);
3590 atomic_dec(&rq->nr_iowait);
3591 return ret;
3592}
3593
3594
3595
3596
3597
3598
3599
3600
3601asmlinkage long sys_sched_get_priority_max(int policy)
3602{
3603 int ret = -EINVAL;
3604
3605 switch (policy) {
3606 case SCHED_FIFO:
3607 case SCHED_RR:
3608 ret = MAX_USER_RT_PRIO-1;
3609 break;
3610 case SCHED_NORMAL:
3611 ret = 0;
3612 break;
3613 }
3614 return ret;
3615}
3616
3617
3618
3619
3620
3621
3622
3623
3624asmlinkage long sys_sched_get_priority_min(int policy)
3625{
3626 int ret = -EINVAL;
3627
3628 switch (policy) {
3629 case SCHED_FIFO:
3630 case SCHED_RR:
3631 ret = 1;
3632 break;
3633 case SCHED_NORMAL:
3634 ret = 0;
3635 }
3636 return ret;
3637}
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647asmlinkage
3648long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
3649{
3650 int retval = -EINVAL;
3651 struct timespec t;
3652 task_t *p;
3653
3654 if (pid < 0)
3655 goto out_nounlock;
3656
3657 retval = -ESRCH;
3658 read_lock(&tasklist_lock);
3659 p = find_process_by_pid(pid);
3660 if (!p)
3661 goto out_unlock;
3662
3663 retval = security_task_getscheduler(p);
3664 if (retval)
3665 goto out_unlock;
3666
3667 jiffies_to_timespec(p->policy & SCHED_FIFO ?
3668 0 : task_timeslice(p), &t);
3669 read_unlock(&tasklist_lock);
3670 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
3671out_nounlock:
3672 return retval;
3673out_unlock:
3674 read_unlock(&tasklist_lock);
3675 return retval;
3676}
3677
3678static inline struct task_struct *eldest_child(struct task_struct *p)
3679{
3680 if (list_empty(&p->children)) return NULL;
3681 return list_entry(p->children.next,struct task_struct,sibling);
3682}
3683
3684static inline struct task_struct *older_sibling(struct task_struct *p)
3685{
3686 if (p->sibling.prev==&p->parent->children) return NULL;
3687 return list_entry(p->sibling.prev,struct task_struct,sibling);
3688}
3689
3690static inline struct task_struct *younger_sibling(struct task_struct *p)
3691{
3692 if (p->sibling.next==&p->parent->children) return NULL;
3693 return list_entry(p->sibling.next,struct task_struct,sibling);
3694}
3695
3696static void show_task(task_t * p)
3697{
3698 task_t *relative;
3699 unsigned state;
3700 unsigned long free = 0;
3701 static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
3702
3703 printk("%-13.13s ", p->comm);
3704 state = p->state ? __ffs(p->state) + 1 : 0;
3705 if (state < ARRAY_SIZE(stat_nam))
3706 printk(stat_nam[state]);
3707 else
3708 printk("?");
3709#if (BITS_PER_LONG == 32)
3710 if (state == TASK_RUNNING)
3711 printk(" running ");
3712 else
3713 printk(" %08lX ", thread_saved_pc(p));
3714#else
3715 if (state == TASK_RUNNING)
3716 printk(" running task ");
3717 else
3718 printk(" %016lx ", thread_saved_pc(p));
3719#endif
3720#ifdef CONFIG_DEBUG_STACK_USAGE
3721 {
3722 unsigned long * n = (unsigned long *) (p->thread_info+1);
3723 while (!*n)
3724 n++;
3725 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
3726 }
3727#endif
3728 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
3729 if ((relative = eldest_child(p)))
3730 printk("%5d ", relative->pid);
3731 else
3732 printk(" ");
3733 if ((relative = younger_sibling(p)))
3734 printk("%7d", relative->pid);
3735 else
3736 printk(" ");
3737 if ((relative = older_sibling(p)))
3738 printk(" %5d", relative->pid);
3739 else
3740 printk(" ");
3741 if (!p->mm)
3742 printk(" (L-TLB)\n");
3743 else
3744 printk(" (NOTLB)\n");
3745
3746 if (state != TASK_RUNNING)
3747 show_stack(p, NULL);
3748}
3749
3750void show_state(void)
3751{
3752 task_t *g, *p;
3753
3754#if (BITS_PER_LONG == 32)
3755 printk("\n"
3756 " sibling\n");
3757 printk(" task PC pid father child younger older\n");
3758#else
3759 printk("\n"
3760 " sibling\n");
3761 printk(" task PC pid father child younger older\n");
3762#endif
3763 read_lock(&tasklist_lock);
3764 do_each_thread(g, p) {
3765
3766
3767
3768
3769 touch_nmi_watchdog();
3770 show_task(p);
3771 } while_each_thread(g, p);
3772
3773 read_unlock(&tasklist_lock);
3774}
3775
3776void __devinit init_idle(task_t *idle, int cpu)
3777{
3778 runqueue_t *rq = cpu_rq(cpu);
3779 unsigned long flags;
3780
3781 idle->sleep_avg = 0;
3782 idle->interactive_credit = 0;
3783 idle->array = NULL;
3784 idle->prio = MAX_PRIO;
3785 idle->state = TASK_RUNNING;
3786 set_task_cpu(idle, cpu);
3787
3788 spin_lock_irqsave(&rq->lock, flags);
3789 rq->curr = rq->idle = idle;
3790 set_tsk_need_resched(idle);
3791 spin_unlock_irqrestore(&rq->lock, flags);
3792
3793
3794#ifdef CONFIG_PREEMPT
3795 idle->thread_info->preempt_count = (idle->lock_depth >= 0);
3796#else
3797 idle->thread_info->preempt_count = 0;
3798#endif
3799}
3800
3801
3802
3803
3804
3805
3806
3807
3808cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
3809
3810#ifdef CONFIG_SMP
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836int set_cpus_allowed(task_t *p, cpumask_t new_mask)
3837{
3838 unsigned long flags;
3839 int ret = 0;
3840 migration_req_t req;
3841 runqueue_t *rq;
3842
3843 rq = task_rq_lock(p, &flags);
3844 if (!cpus_intersects(new_mask, cpu_online_map)) {
3845 ret = -EINVAL;
3846 goto out;
3847 }
3848
3849 p->cpus_allowed = new_mask;
3850
3851 if (cpu_isset(task_cpu(p), new_mask))
3852 goto out;
3853
3854 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
3855
3856 task_rq_unlock(rq, &flags);
3857 wake_up_process(rq->migration_thread);
3858 wait_for_completion(&req.done);
3859 tlb_migrate_finish(p->mm);
3860 return 0;
3861 }
3862out:
3863 task_rq_unlock(rq, &flags);
3864 return ret;
3865}
3866
3867EXPORT_SYMBOL_GPL(set_cpus_allowed);
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
3879{
3880 runqueue_t *rq_dest, *rq_src;
3881
3882 if (unlikely(cpu_is_offline(dest_cpu)))
3883 return;
3884
3885 rq_src = cpu_rq(src_cpu);
3886 rq_dest = cpu_rq(dest_cpu);
3887
3888 double_rq_lock(rq_src, rq_dest);
3889
3890 if (task_cpu(p) != src_cpu)
3891 goto out;
3892
3893 if (!cpu_isset(dest_cpu, p->cpus_allowed))
3894 goto out;
3895
3896 set_task_cpu(p, dest_cpu);
3897 if (p->array) {
3898
3899
3900
3901
3902
3903
3904 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
3905 + rq_dest->timestamp_last_tick;
3906 deactivate_task(p, rq_src);
3907 activate_task(p, rq_dest, 0);
3908 if (TASK_PREEMPTS_CURR(p, rq_dest))
3909 resched_task(rq_dest->curr);
3910 }
3911
3912out:
3913 double_rq_unlock(rq_src, rq_dest);
3914}
3915
3916
3917
3918
3919
3920
3921static int migration_thread(void * data)
3922{
3923 runqueue_t *rq;
3924 int cpu = (long)data;
3925
3926 rq = cpu_rq(cpu);
3927 BUG_ON(rq->migration_thread != current);
3928
3929 set_current_state(TASK_INTERRUPTIBLE);
3930 while (!kthread_should_stop()) {
3931 struct list_head *head;
3932 migration_req_t *req;
3933
3934 if (current->flags & PF_FREEZE)
3935 refrigerator(PF_FREEZE);
3936
3937 spin_lock_irq(&rq->lock);
3938
3939 if (cpu_is_offline(cpu)) {
3940 spin_unlock_irq(&rq->lock);
3941 goto wait_to_die;
3942 }
3943
3944 if (rq->active_balance) {
3945 active_load_balance(rq, cpu);
3946 rq->active_balance = 0;
3947 }
3948
3949 head = &rq->migration_queue;
3950
3951 if (list_empty(head)) {
3952 spin_unlock_irq(&rq->lock);
3953 schedule();
3954 set_current_state(TASK_INTERRUPTIBLE);
3955 continue;
3956 }
3957 req = list_entry(head->next, migration_req_t, list);
3958 list_del_init(head->next);
3959
3960 if (req->type == REQ_MOVE_TASK) {
3961 spin_unlock(&rq->lock);
3962 __migrate_task(req->task, smp_processor_id(),
3963 req->dest_cpu);
3964 local_irq_enable();
3965 } else if (req->type == REQ_SET_DOMAIN) {
3966 rq->sd = req->sd;
3967 spin_unlock_irq(&rq->lock);
3968 } else {
3969 spin_unlock_irq(&rq->lock);
3970 WARN_ON(1);
3971 }
3972
3973 complete(&req->done);
3974 }
3975 __set_current_state(TASK_RUNNING);
3976 return 0;
3977
3978wait_to_die:
3979
3980 set_current_state(TASK_INTERRUPTIBLE);
3981 while (!kthread_should_stop()) {
3982 schedule();
3983 set_current_state(TASK_INTERRUPTIBLE);
3984 }
3985 __set_current_state(TASK_RUNNING);
3986 return 0;
3987}
3988
3989#ifdef CONFIG_HOTPLUG_CPU
3990
3991static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
3992{
3993 int dest_cpu;
3994 cpumask_t mask;
3995
3996
3997 mask = node_to_cpumask(cpu_to_node(dead_cpu));
3998 cpus_and(mask, mask, tsk->cpus_allowed);
3999 dest_cpu = any_online_cpu(mask);
4000
4001
4002 if (dest_cpu == NR_CPUS)
4003 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4004
4005
4006 if (dest_cpu == NR_CPUS) {
4007 cpus_setall(tsk->cpus_allowed);
4008 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4009
4010
4011
4012
4013
4014
4015 if (tsk->mm && printk_ratelimit())
4016 printk(KERN_INFO "process %d (%s) no "
4017 "longer affine to cpu%d\n",
4018 tsk->pid, tsk->comm, dead_cpu);
4019 }
4020 __migrate_task(tsk, dead_cpu, dest_cpu);
4021}
4022
4023
4024static void migrate_live_tasks(int src_cpu)
4025{
4026 struct task_struct *tsk, *t;
4027
4028 write_lock_irq(&tasklist_lock);
4029
4030 do_each_thread(t, tsk) {
4031 if (tsk == current)
4032 continue;
4033
4034 if (task_cpu(tsk) == src_cpu)
4035 move_task_off_dead_cpu(src_cpu, tsk);
4036 } while_each_thread(t, tsk);
4037
4038 write_unlock_irq(&tasklist_lock);
4039}
4040
4041
4042
4043
4044
4045void sched_idle_next(void)
4046{
4047 int cpu = smp_processor_id();
4048 runqueue_t *rq = this_rq();
4049 struct task_struct *p = rq->idle;
4050 unsigned long flags;
4051
4052
4053 BUG_ON(cpu_online(cpu));
4054
4055
4056
4057
4058 spin_lock_irqsave(&rq->lock, flags);
4059
4060 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4061
4062 __activate_idle_task(p, rq);
4063
4064 spin_unlock_irqrestore(&rq->lock, flags);
4065}
4066
4067static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
4068{
4069 struct runqueue *rq = cpu_rq(dead_cpu);
4070
4071
4072 BUG_ON(tsk->state != TASK_ZOMBIE && tsk->state != TASK_DEAD);
4073
4074
4075 BUG_ON(tsk->flags & PF_DEAD);
4076
4077 get_task_struct(tsk);
4078
4079
4080
4081
4082
4083
4084 spin_unlock_irq(&rq->lock);
4085 move_task_off_dead_cpu(dead_cpu, tsk);
4086 spin_lock_irq(&rq->lock);
4087
4088 put_task_struct(tsk);
4089}
4090
4091
4092static void migrate_dead_tasks(unsigned int dead_cpu)
4093{
4094 unsigned arr, i;
4095 struct runqueue *rq = cpu_rq(dead_cpu);
4096
4097 for (arr = 0; arr < 2; arr++) {
4098 for (i = 0; i < MAX_PRIO; i++) {
4099 struct list_head *list = &rq->arrays[arr].queue[i];
4100 while (!list_empty(list))
4101 migrate_dead(dead_cpu,
4102 list_entry(list->next, task_t,
4103 run_list));
4104 }
4105 }
4106}
4107#endif
4108
4109
4110
4111
4112
4113static int migration_call(struct notifier_block *nfb, unsigned long action,
4114 void *hcpu)
4115{
4116 int cpu = (long)hcpu;
4117 struct task_struct *p;
4118 struct runqueue *rq;
4119 unsigned long flags;
4120
4121 switch (action) {
4122 case CPU_UP_PREPARE:
4123 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
4124 if (IS_ERR(p))
4125 return NOTIFY_BAD;
4126 p->flags |= PF_NOFREEZE;
4127 kthread_bind(p, cpu);
4128
4129 rq = task_rq_lock(p, &flags);
4130 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4131 task_rq_unlock(rq, &flags);
4132 cpu_rq(cpu)->migration_thread = p;
4133 break;
4134 case CPU_ONLINE:
4135
4136 wake_up_process(cpu_rq(cpu)->migration_thread);
4137 break;
4138#ifdef CONFIG_HOTPLUG_CPU
4139 case CPU_UP_CANCELED:
4140
4141 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
4142 kthread_stop(cpu_rq(cpu)->migration_thread);
4143 cpu_rq(cpu)->migration_thread = NULL;
4144 break;
4145 case CPU_DEAD:
4146 migrate_live_tasks(cpu);
4147 rq = cpu_rq(cpu);
4148 kthread_stop(rq->migration_thread);
4149 rq->migration_thread = NULL;
4150
4151 rq = task_rq_lock(rq->idle, &flags);
4152 deactivate_task(rq->idle, rq);
4153 rq->idle->static_prio = MAX_PRIO;
4154 __setscheduler(rq->idle, SCHED_NORMAL, 0);
4155 migrate_dead_tasks(cpu);
4156 task_rq_unlock(rq, &flags);
4157 BUG_ON(rq->nr_running != 0);
4158
4159
4160
4161
4162 spin_lock_irq(&rq->lock);
4163 while (!list_empty(&rq->migration_queue)) {
4164 migration_req_t *req;
4165 req = list_entry(rq->migration_queue.next,
4166 migration_req_t, list);
4167 BUG_ON(req->type != REQ_MOVE_TASK);
4168 list_del_init(&req->list);
4169 complete(&req->done);
4170 }
4171 spin_unlock_irq(&rq->lock);
4172 break;
4173#endif
4174 }
4175 return NOTIFY_OK;
4176}
4177
4178
4179
4180
4181static struct notifier_block __devinitdata migration_notifier = {
4182 .notifier_call = migration_call,
4183 .priority = 10
4184};
4185
4186int __init migration_init(void)
4187{
4188 void *cpu = (void *)(long)smp_processor_id();
4189
4190 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
4191 migration_call(&migration_notifier, CPU_ONLINE, cpu);
4192 register_cpu_notifier(&migration_notifier);
4193 return 0;
4194}
4195#endif
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
4210EXPORT_SYMBOL(kernel_flag);
4211
4212#ifdef CONFIG_SMP
4213
4214static void cpu_attach_domain(struct sched_domain *sd, int cpu)
4215{
4216 migration_req_t req;
4217 unsigned long flags;
4218 runqueue_t *rq = cpu_rq(cpu);
4219 int local = 1;
4220
4221 lock_cpu_hotplug();
4222
4223 spin_lock_irqsave(&rq->lock, flags);
4224
4225 if (cpu == smp_processor_id() || !cpu_online(cpu)) {
4226 rq->sd = sd;
4227 } else {
4228 init_completion(&req.done);
4229 req.type = REQ_SET_DOMAIN;
4230 req.sd = sd;
4231 list_add(&req.list, &rq->migration_queue);
4232 local = 0;
4233 }
4234
4235 spin_unlock_irqrestore(&rq->lock, flags);
4236
4237 if (!local) {
4238 wake_up_process(rq->migration_thread);
4239 wait_for_completion(&req.done);
4240 }
4241
4242 unlock_cpu_hotplug();
4243}
4244
4245
4246
4247
4248
4249
4250#if defined(CONFIG_NUMA) && defined(SD_NODES_PER_DOMAIN)
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261static int __init find_next_best_node(int node, unsigned long *used_nodes)
4262{
4263 int i, n, val, min_val, best_node = 0;
4264
4265 min_val = INT_MAX;
4266
4267 for (i = 0; i < numnodes; i++) {
4268
4269 n = (node + i) % numnodes;
4270
4271
4272 if (test_bit(n, used_nodes))
4273 continue;
4274
4275
4276 val = node_distance(node, i);
4277
4278 if (val < min_val) {
4279 min_val = val;
4280 best_node = n;
4281 }
4282 }
4283
4284 set_bit(best_node, used_nodes);
4285 return best_node;
4286}
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297cpumask_t __init sched_domain_node_span(int node)
4298{
4299 int i;
4300 cpumask_t span;
4301 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
4302
4303 cpus_clear(span);
4304 bitmap_zero(used_nodes, MAX_NUMNODES);
4305
4306 for (i = 0; i < SD_NODES_PER_DOMAIN; i++) {
4307 int next_node = find_next_best_node(node, used_nodes);
4308 cpumask_t nodemask;
4309
4310 nodemask = node_to_cpumask(next_node);
4311 cpus_or(span, span, nodemask);
4312 }
4313
4314 return span;
4315}
4316#else
4317cpumask_t __init sched_domain_node_span(int node)
4318{
4319 return cpu_possible_map;
4320}
4321#endif
4322
4323#ifdef CONFIG_SCHED_SMT
4324static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4325static struct sched_group sched_group_cpus[NR_CPUS];
4326__init static int cpu_to_cpu_group(int cpu)
4327{
4328 return cpu;
4329}
4330#endif
4331
4332static DEFINE_PER_CPU(struct sched_domain, phys_domains);
4333static struct sched_group sched_group_phys[NR_CPUS];
4334__init static int cpu_to_phys_group(int cpu)
4335{
4336#ifdef CONFIG_SCHED_SMT
4337 return first_cpu(cpu_sibling_map[cpu]);
4338#else
4339 return cpu;
4340#endif
4341}
4342
4343#ifdef CONFIG_NUMA
4344
4345static DEFINE_PER_CPU(struct sched_domain, node_domains);
4346static struct sched_group sched_group_nodes[MAX_NUMNODES];
4347__init static int cpu_to_node_group(int cpu)
4348{
4349 return cpu_to_node(cpu);
4350}
4351#endif
4352
4353
4354static struct sched_group sched_group_isolated[NR_CPUS];
4355
4356
4357cpumask_t __initdata cpu_isolated_map = CPU_MASK_NONE;
4358
4359__init static int cpu_to_isolated_group(int cpu)
4360{
4361 return cpu;
4362}
4363
4364
4365static int __init isolated_cpu_setup(char *str)
4366{
4367 int ints[NR_CPUS], i;
4368
4369 str = get_options(str, ARRAY_SIZE(ints), ints);
4370 cpus_clear(cpu_isolated_map);
4371 for (i = 1; i <= ints[0]; i++)
4372 cpu_set(ints[i], cpu_isolated_map);
4373 return 1;
4374}
4375
4376__setup ("isolcpus=", isolated_cpu_setup);
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389__init static void init_sched_build_groups(struct sched_group groups[],
4390 cpumask_t span, int (*group_fn)(int cpu))
4391{
4392 struct sched_group *first = NULL, *last = NULL;
4393 cpumask_t covered = CPU_MASK_NONE;
4394 int i;
4395
4396 for_each_cpu_mask(i, span) {
4397 int group = group_fn(i);
4398 struct sched_group *sg = &groups[group];
4399 int j;
4400
4401 if (cpu_isset(i, covered))
4402 continue;
4403
4404 sg->cpumask = CPU_MASK_NONE;
4405 sg->cpu_power = 0;
4406
4407 for_each_cpu_mask(j, span) {
4408 if (group_fn(j) != group)
4409 continue;
4410
4411 cpu_set(j, covered);
4412 cpu_set(j, sg->cpumask);
4413 }
4414 if (!first)
4415 first = sg;
4416 if (last)
4417 last->next = sg;
4418 last = sg;
4419 }
4420 last->next = first;
4421}
4422
4423__init static void arch_init_sched_domains(void)
4424{
4425 int i;
4426 cpumask_t cpu_default_map;
4427
4428
4429
4430
4431
4432
4433 cpus_complement(cpu_default_map, cpu_isolated_map);
4434 cpus_and(cpu_default_map, cpu_default_map, cpu_possible_map);
4435
4436
4437 for_each_cpu(i) {
4438 int group;
4439 struct sched_domain *sd = NULL, *p;
4440 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
4441
4442 cpus_and(nodemask, nodemask, cpu_default_map);
4443
4444
4445
4446
4447
4448
4449 if (cpu_isset(i, cpu_isolated_map)) {
4450#ifdef CONFIG_SCHED_SMT
4451 sd = &per_cpu(cpu_domains, i);
4452#else
4453 sd = &per_cpu(phys_domains, i);
4454#endif
4455 group = cpu_to_isolated_group(i);
4456 *sd = SD_CPU_INIT;
4457 cpu_set(i, sd->span);
4458 sd->balance_interval = INT_MAX;
4459 sd->flags = 0;
4460 sd->groups = &sched_group_isolated[group];
4461 printk(KERN_INFO "Setting up cpu %d isolated.\n", i);
4462
4463 continue;
4464 }
4465
4466#ifdef CONFIG_NUMA
4467 sd = &per_cpu(node_domains, i);
4468 group = cpu_to_node_group(i);
4469 *sd = SD_NODE_INIT;
4470
4471 sd->span = sched_domain_node_span(i);
4472 cpus_and(sd->span, sd->span, cpu_default_map);
4473 sd->groups = &sched_group_nodes[group];
4474#endif
4475
4476 p = sd;
4477 sd = &per_cpu(phys_domains, i);
4478 group = cpu_to_phys_group(i);
4479 *sd = SD_CPU_INIT;
4480#ifdef CONFIG_NUMA
4481 sd->span = nodemask;
4482#else
4483 sd->span = cpu_possible_map;
4484#endif
4485 sd->parent = p;
4486 sd->groups = &sched_group_phys[group];
4487
4488#ifdef CONFIG_SCHED_SMT
4489 p = sd;
4490 sd = &per_cpu(cpu_domains, i);
4491 group = cpu_to_cpu_group(i);
4492 *sd = SD_SIBLING_INIT;
4493 sd->span = cpu_sibling_map[i];
4494 cpus_and(sd->span, sd->span, cpu_default_map);
4495 sd->parent = p;
4496 sd->groups = &sched_group_cpus[group];
4497#endif
4498 }
4499
4500#ifdef CONFIG_SCHED_SMT
4501
4502 for_each_cpu(i) {
4503 cpumask_t this_sibling_map = cpu_sibling_map[i];
4504 cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
4505 if (i != first_cpu(this_sibling_map))
4506 continue;
4507
4508 init_sched_build_groups(sched_group_cpus, this_sibling_map,
4509 &cpu_to_cpu_group);
4510 }
4511#endif
4512
4513
4514 for_each_cpu_mask(i, cpu_isolated_map) {
4515 cpumask_t mask;
4516 cpus_clear(mask);
4517 cpu_set(i, mask);
4518 init_sched_build_groups(sched_group_isolated, mask,
4519 &cpu_to_isolated_group);
4520 }
4521
4522#ifdef CONFIG_NUMA
4523
4524 for (i = 0; i < MAX_NUMNODES; i++) {
4525 cpumask_t nodemask = node_to_cpumask(i);
4526
4527 cpus_and(nodemask, nodemask, cpu_default_map);
4528 if (cpus_empty(nodemask))
4529 continue;
4530
4531 init_sched_build_groups(sched_group_phys, nodemask,
4532 &cpu_to_phys_group);
4533 }
4534#else
4535 init_sched_build_groups(sched_group_phys, cpu_possible_map,
4536 &cpu_to_phys_group);
4537#endif
4538
4539#ifdef CONFIG_NUMA
4540
4541 init_sched_build_groups(sched_group_nodes, cpu_default_map,
4542 &cpu_to_node_group);
4543#endif
4544
4545
4546 for_each_cpu_mask(i, cpu_default_map) {
4547 int power;
4548 struct sched_domain *sd;
4549#ifdef CONFIG_SCHED_SMT
4550 sd = &per_cpu(cpu_domains, i);
4551 power = SCHED_LOAD_SCALE;
4552 sd->groups->cpu_power = power;
4553#endif
4554
4555 sd = &per_cpu(phys_domains, i);
4556 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
4557 (cpus_weight(sd->groups->cpumask)-1) / 10;
4558 sd->groups->cpu_power = power;
4559
4560#ifdef CONFIG_NUMA
4561 if (i == first_cpu(sd->groups->cpumask)) {
4562
4563 sd = &per_cpu(node_domains, i);
4564 sd->groups->cpu_power += power;
4565 }
4566#endif
4567 }
4568
4569
4570 for_each_cpu(i) {
4571 struct sched_domain *sd;
4572#ifdef CONFIG_SCHED_SMT
4573 sd = &per_cpu(cpu_domains, i);
4574#else
4575 sd = &per_cpu(phys_domains, i);
4576#endif
4577 cpu_attach_domain(sd, i);
4578 }
4579}
4580
4581#undef SCHED_DOMAIN_DEBUG
4582#ifdef SCHED_DOMAIN_DEBUG
4583void sched_domain_debug(void)
4584{
4585 int i;
4586
4587 for_each_cpu(i) {
4588 runqueue_t *rq = cpu_rq(i);
4589 struct sched_domain *sd;
4590 int level = 0;
4591
4592 sd = rq->sd;
4593
4594 printk(KERN_DEBUG "CPU%d: %s\n",
4595 i, (cpu_online(i) ? " online" : "offline"));
4596
4597 do {
4598 int j;
4599 char str[NR_CPUS];
4600 struct sched_group *group = sd->groups;
4601 cpumask_t groupmask;
4602
4603 cpumask_scnprintf(str, NR_CPUS, sd->span);
4604 cpus_clear(groupmask);
4605
4606 printk(KERN_DEBUG);
4607 for (j = 0; j < level + 1; j++)
4608 printk(" ");
4609 printk("domain %d: span %s\n", level, str);
4610
4611 if (!cpu_isset(i, sd->span))
4612 printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i);
4613 if (!cpu_isset(i, group->cpumask))
4614 printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i);
4615 if (!group->cpu_power)
4616 printk(KERN_DEBUG "ERROR domain->cpu_power not set\n");
4617
4618 printk(KERN_DEBUG);
4619 for (j = 0; j < level + 2; j++)
4620 printk(" ");
4621 printk("groups:");
4622 do {
4623 if (!group) {
4624 printk(" ERROR: NULL");
4625 break;
4626 }
4627
4628 if (!cpus_weight(group->cpumask))
4629 printk(" ERROR empty group:");
4630
4631 if (cpus_intersects(groupmask, group->cpumask))
4632 printk(" ERROR repeated CPUs:");
4633
4634 cpus_or(groupmask, groupmask, group->cpumask);
4635
4636 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
4637 printk(" %s", str);
4638
4639 group = group->next;
4640 } while (group != sd->groups);
4641 printk("\n");
4642
4643 if (!cpus_equal(sd->span, groupmask))
4644 printk(KERN_DEBUG "ERROR groups don't span domain->span\n");
4645
4646 level++;
4647 sd = sd->parent;
4648
4649 if (sd) {
4650 if (!cpus_subset(groupmask, sd->span))
4651 printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n");
4652 }
4653
4654 } while (sd);
4655 }
4656}
4657#else
4658#define sched_domain_debug() {}
4659#endif
4660
4661void __init sched_init_smp(void)
4662{
4663 arch_init_sched_domains();
4664 sched_domain_debug();
4665}
4666#else
4667void __init sched_init_smp(void)
4668{
4669}
4670#endif
4671
4672int in_sched_functions(unsigned long addr)
4673{
4674
4675 extern char __sched_text_start[], __sched_text_end[];
4676 return in_lock_functions(addr) ||
4677 (addr >= (unsigned long)__sched_text_start
4678 && addr < (unsigned long)__sched_text_end);
4679}
4680
4681void __init sched_init(void)
4682{
4683 runqueue_t *rq;
4684 int i, j, k;
4685
4686#ifdef CONFIG_SMP
4687
4688 static struct sched_domain sched_domain_init;
4689 static struct sched_group sched_group_init;
4690
4691 memset(&sched_domain_init, 0, sizeof(struct sched_domain));
4692 sched_domain_init.span = CPU_MASK_ALL;
4693 sched_domain_init.groups = &sched_group_init;
4694 sched_domain_init.last_balance = jiffies;
4695 sched_domain_init.balance_interval = INT_MAX;
4696 sched_domain_init.busy_factor = 1;
4697
4698 memset(&sched_group_init, 0, sizeof(struct sched_group));
4699 sched_group_init.cpumask = CPU_MASK_ALL;
4700 sched_group_init.next = &sched_group_init;
4701 sched_group_init.cpu_power = SCHED_LOAD_SCALE;
4702#endif
4703
4704 for (i = 0; i < NR_CPUS; i++) {
4705 prio_array_t *array;
4706
4707 rq = cpu_rq(i);
4708 spin_lock_init(&rq->lock);
4709 rq->active = rq->arrays;
4710 rq->expired = rq->arrays + 1;
4711 rq->best_expired_prio = MAX_PRIO;
4712
4713#ifdef CONFIG_SMP
4714 rq->sd = &sched_domain_init;
4715 rq->cpu_load = 0;
4716 rq->active_balance = 0;
4717 rq->push_cpu = 0;
4718 rq->migration_thread = NULL;
4719 INIT_LIST_HEAD(&rq->migration_queue);
4720#endif
4721 atomic_set(&rq->nr_iowait, 0);
4722
4723 for (j = 0; j < 2; j++) {
4724 array = rq->arrays + j;
4725 for (k = 0; k < MAX_PRIO; k++) {
4726 INIT_LIST_HEAD(array->queue + k);
4727 __clear_bit(k, array->bitmap);
4728 }
4729
4730 __set_bit(MAX_PRIO, array->bitmap);
4731 }
4732 }
4733
4734
4735
4736
4737 atomic_inc(&init_mm.mm_count);
4738 enter_lazy_tlb(&init_mm, current);
4739
4740
4741
4742
4743
4744
4745
4746 init_idle(current, smp_processor_id());
4747}
4748
4749#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4750void __might_sleep(char *file, int line)
4751{
4752#if defined(in_atomic)
4753 static unsigned long prev_jiffy;
4754
4755 if ((in_atomic() || irqs_disabled()) &&
4756 system_state == SYSTEM_RUNNING) {
4757 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
4758 return;
4759 prev_jiffy = jiffies;
4760 printk(KERN_ERR "Debug: sleeping function called from invalid"
4761 " context at %s:%d\n", file, line);
4762 printk("in_atomic():%d, irqs_disabled():%d\n",
4763 in_atomic(), irqs_disabled());
4764 dump_stack();
4765 }
4766#endif
4767}
4768EXPORT_SYMBOL(__might_sleep);
4769#endif
4770