1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21#include <linux/mm.h>
22#include <linux/module.h>
23#include <linux/nmi.h>
24#include <linux/init.h>
25#include <asm/uaccess.h>
26#include <linux/highmem.h>
27#include <linux/smp_lock.h>
28#include <asm/mmu_context.h>
29#include <linux/interrupt.h>
30#include <linux/completion.h>
31#include <linux/kernel_stat.h>
32#include <linux/security.h>
33#include <linux/notifier.h>
34#include <linux/profile.h>
35#include <linux/suspend.h>
36#include <linux/blkdev.h>
37#include <linux/delay.h>
38#include <linux/smp.h>
39#include <linux/threads.h>
40#include <linux/timer.h>
41#include <linux/rcupdate.h>
42#include <linux/cpu.h>
43#include <linux/percpu.h>
44#include <linux/kthread.h>
45#include <linux/seq_file.h>
46#include <linux/syscalls.h>
47#include <linux/times.h>
48#include <asm/tlb.h>
49
50#include <asm/unistd.h>
51
52
53
54
55
56
57#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
58#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
59#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
60
61
62
63
64
65
66#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
67#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
68#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
69
70
71
72
73#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
74#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
75
76
77
78
79
80
81
82
83#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
84#define DEF_TIMESLICE (100 * HZ / 1000)
85#define ON_RUNQUEUE_WEIGHT 30
86#define CHILD_PENALTY 95
87#define PARENT_PENALTY 100
88#define EXIT_WEIGHT 3
89#define PRIO_BONUS_RATIO 25
90#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
91#define INTERACTIVE_DELTA 2
92#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
93#define STARVATION_LIMIT (MAX_SLEEP_AVG)
94#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124#define CURRENT_BONUS(p) \
125 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
126 MAX_SLEEP_AVG)
127
128#define GRANULARITY (10 * HZ / 1000 ? : 1)
129
130#ifdef CONFIG_SMP
131#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
132 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
133 num_online_cpus())
134#else
135#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
136 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
137#endif
138
139#define SCALE(v1,v1_max,v2_max) \
140 (v1) * (v2_max) / (v1_max)
141
142#define DELTA(p) \
143 (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
144
145#define TASK_INTERACTIVE(p) \
146 ((p)->prio <= (p)->static_prio - DELTA(p))
147
148#define INTERACTIVE_SLEEP(p) \
149 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
150 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
151
152#define TASK_PREEMPTS_CURR(p, rq) \
153 ((p)->prio < (rq)->curr->prio)
154
155
156
157
158
159
160
161
162
163
164#define SCALE_PRIO(x, prio) \
165 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
166
167static unsigned int task_timeslice(task_t *p)
168{
169 if (p->static_prio < NICE_TO_PRIO(0))
170 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
171 else
172 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
173}
174#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
175 < (long long) (sd)->cache_hot_time)
176
177
178
179
180
181#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
182
183typedef struct runqueue runqueue_t;
184
185struct prio_array {
186 unsigned int nr_active;
187 unsigned long bitmap[BITMAP_SIZE];
188 struct list_head queue[MAX_PRIO];
189};
190
191
192
193
194
195
196
197
198struct runqueue {
199 spinlock_t lock;
200
201
202
203
204
205 unsigned long nr_running;
206#ifdef CONFIG_SMP
207 unsigned long cpu_load;
208#endif
209 unsigned long long nr_switches;
210
211
212
213
214
215
216
217 unsigned long nr_uninterruptible;
218
219 unsigned long expired_timestamp;
220 unsigned long long timestamp_last_tick;
221 task_t *curr, *idle;
222 struct mm_struct *prev_mm;
223 prio_array_t *active, *expired, arrays[2];
224 int best_expired_prio;
225 atomic_t nr_iowait;
226
227#ifdef CONFIG_SMP
228 struct sched_domain *sd;
229
230
231 int active_balance;
232 int push_cpu;
233
234 task_t *migration_thread;
235 struct list_head migration_queue;
236#endif
237
238#ifdef CONFIG_SCHEDSTATS
239
240 struct sched_info rq_sched_info;
241
242
243 unsigned long yld_exp_empty;
244 unsigned long yld_act_empty;
245 unsigned long yld_both_empty;
246 unsigned long yld_cnt;
247
248
249 unsigned long sched_noswitch;
250 unsigned long sched_switch;
251 unsigned long sched_cnt;
252 unsigned long sched_goidle;
253
254
255 unsigned long pt_gained[MAX_IDLE_TYPES];
256 unsigned long pt_lost[MAX_IDLE_TYPES];
257
258
259 unsigned long alb_cnt;
260 unsigned long alb_lost;
261 unsigned long alb_gained;
262 unsigned long alb_failed;
263
264
265 unsigned long ttwu_cnt;
266 unsigned long ttwu_attempts;
267 unsigned long ttwu_moved;
268
269
270 unsigned long wunt_cnt;
271 unsigned long wunt_moved;
272
273
274 unsigned long smt_cnt;
275
276
277 unsigned long sbe_cnt;
278#endif
279};
280
281static DEFINE_PER_CPU(struct runqueue, runqueues);
282
283#define for_each_domain(cpu, domain) \
284 for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
285
286#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
287#define this_rq() (&__get_cpu_var(runqueues))
288#define task_rq(p) cpu_rq(task_cpu(p))
289#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
290
291
292
293
294#ifndef prepare_arch_switch
295# define prepare_arch_switch(rq, next) do { } while (0)
296# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock)
297# define task_running(rq, p) ((rq)->curr == (p))
298#endif
299
300
301
302
303
304
305static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
306 __acquires(rq->lock)
307{
308 struct runqueue *rq;
309
310repeat_lock_task:
311 local_irq_save(*flags);
312 rq = task_rq(p);
313 spin_lock(&rq->lock);
314 if (unlikely(rq != task_rq(p))) {
315 spin_unlock_irqrestore(&rq->lock, *flags);
316 goto repeat_lock_task;
317 }
318 return rq;
319}
320
321static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
322 __releases(rq->lock)
323{
324 spin_unlock_irqrestore(&rq->lock, *flags);
325}
326
327#ifdef CONFIG_SCHEDSTATS
328
329
330
331
332#define SCHEDSTAT_VERSION 10
333
334static int show_schedstat(struct seq_file *seq, void *v)
335{
336 int cpu;
337 enum idle_type itype;
338
339 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
340 seq_printf(seq, "timestamp %lu\n", jiffies);
341 for_each_online_cpu(cpu) {
342 runqueue_t *rq = cpu_rq(cpu);
343#ifdef CONFIG_SMP
344 struct sched_domain *sd;
345 int dcnt = 0;
346#endif
347
348
349 seq_printf(seq,
350 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
351 "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
352 cpu, rq->yld_both_empty,
353 rq->yld_act_empty, rq->yld_exp_empty,
354 rq->yld_cnt, rq->sched_noswitch,
355 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
356 rq->alb_cnt, rq->alb_gained, rq->alb_lost,
357 rq->alb_failed,
358 rq->ttwu_cnt, rq->ttwu_moved, rq->ttwu_attempts,
359 rq->wunt_cnt, rq->wunt_moved,
360 rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time,
361 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
362
363 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++)
364 seq_printf(seq, " %lu %lu", rq->pt_gained[itype],
365 rq->pt_lost[itype]);
366 seq_printf(seq, "\n");
367
368#ifdef CONFIG_SMP
369
370 for_each_domain(cpu, sd) {
371 char mask_str[NR_CPUS];
372
373 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
374 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
375 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
376 itype++) {
377 seq_printf(seq, " %lu %lu %lu %lu %lu",
378 sd->lb_cnt[itype],
379 sd->lb_failed[itype],
380 sd->lb_imbalance[itype],
381 sd->lb_nobusyq[itype],
382 sd->lb_nobusyg[itype]);
383 }
384 seq_printf(seq, " %lu %lu %lu %lu\n",
385 sd->sbe_pushed, sd->sbe_attempts,
386 sd->ttwu_wake_affine, sd->ttwu_wake_balance);
387 }
388#endif
389 }
390 return 0;
391}
392
393static int schedstat_open(struct inode *inode, struct file *file)
394{
395 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
396 char *buf = kmalloc(size, GFP_KERNEL);
397 struct seq_file *m;
398 int res;
399
400 if (!buf)
401 return -ENOMEM;
402 res = single_open(file, show_schedstat, NULL);
403 if (!res) {
404 m = file->private_data;
405 m->buf = buf;
406 m->size = size;
407 } else
408 kfree(buf);
409 return res;
410}
411
412struct file_operations proc_schedstat_operations = {
413 .open = schedstat_open,
414 .read = seq_read,
415 .llseek = seq_lseek,
416 .release = single_release,
417};
418
419# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
420# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
421#else
422# define schedstat_inc(rq, field) do { } while (0)
423# define schedstat_add(rq, field, amt) do { } while (0)
424#endif
425
426
427
428
429static runqueue_t *this_rq_lock(void)
430 __acquires(rq->lock)
431{
432 runqueue_t *rq;
433
434 local_irq_disable();
435 rq = this_rq();
436 spin_lock(&rq->lock);
437
438 return rq;
439}
440
441#ifdef CONFIG_SCHED_SMT
442static int cpu_and_siblings_are_idle(int cpu)
443{
444 int sib;
445 for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
446 if (idle_cpu(sib))
447 continue;
448 return 0;
449 }
450
451 return 1;
452}
453#else
454#define cpu_and_siblings_are_idle(A) idle_cpu(A)
455#endif
456
457#ifdef CONFIG_SCHEDSTATS
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473static inline void sched_info_dequeued(task_t *t)
474{
475 t->sched_info.last_queued = 0;
476}
477
478
479
480
481
482
483static inline void sched_info_arrive(task_t *t)
484{
485 unsigned long now = jiffies, diff = 0;
486 struct runqueue *rq = task_rq(t);
487
488 if (t->sched_info.last_queued)
489 diff = now - t->sched_info.last_queued;
490 sched_info_dequeued(t);
491 t->sched_info.run_delay += diff;
492 t->sched_info.last_arrival = now;
493 t->sched_info.pcnt++;
494
495 if (!rq)
496 return;
497
498 rq->rq_sched_info.run_delay += diff;
499 rq->rq_sched_info.pcnt++;
500}
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517static inline void sched_info_queued(task_t *t)
518{
519 if (!t->sched_info.last_queued)
520 t->sched_info.last_queued = jiffies;
521}
522
523
524
525
526
527static inline void sched_info_depart(task_t *t)
528{
529 struct runqueue *rq = task_rq(t);
530 unsigned long diff = jiffies - t->sched_info.last_arrival;
531
532 t->sched_info.cpu_time += diff;
533
534 if (rq)
535 rq->rq_sched_info.cpu_time += diff;
536}
537
538
539
540
541
542
543static inline void sched_info_switch(task_t *prev, task_t *next)
544{
545 struct runqueue *rq = task_rq(prev);
546
547
548
549
550
551
552 if (prev != rq->idle)
553 sched_info_depart(prev);
554
555 if (next != rq->idle)
556 sched_info_arrive(next);
557}
558#else
559#define sched_info_queued(t) do { } while (0)
560#define sched_info_switch(t, next) do { } while (0)
561#endif
562
563
564
565
566static void dequeue_task(struct task_struct *p, prio_array_t *array)
567{
568 array->nr_active--;
569 list_del(&p->run_list);
570 if (list_empty(array->queue + p->prio))
571 __clear_bit(p->prio, array->bitmap);
572}
573
574static void enqueue_task(struct task_struct *p, prio_array_t *array)
575{
576 sched_info_queued(p);
577 list_add_tail(&p->run_list, array->queue + p->prio);
578 __set_bit(p->prio, array->bitmap);
579 array->nr_active++;
580 p->array = array;
581}
582
583
584
585
586
587static void requeue_task(struct task_struct *p, prio_array_t *array)
588{
589 list_move_tail(&p->run_list, array->queue + p->prio);
590}
591
592static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
593{
594 list_add(&p->run_list, array->queue + p->prio);
595 __set_bit(p->prio, array->bitmap);
596 array->nr_active++;
597 p->array = array;
598}
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614static int effective_prio(task_t *p)
615{
616 int bonus, prio;
617
618 if (rt_task(p))
619 return p->prio;
620
621 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
622
623 prio = p->static_prio - bonus;
624 if (prio < MAX_RT_PRIO)
625 prio = MAX_RT_PRIO;
626 if (prio > MAX_PRIO-1)
627 prio = MAX_PRIO-1;
628 return prio;
629}
630
631
632
633
634static inline void __activate_task(task_t *p, runqueue_t *rq)
635{
636 enqueue_task(p, rq->active);
637 rq->nr_running++;
638}
639
640
641
642
643static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
644{
645 enqueue_task_head(p, rq->active);
646 rq->nr_running++;
647}
648
649static void recalc_task_prio(task_t *p, unsigned long long now)
650{
651 unsigned long long __sleep_time = now - p->timestamp;
652 unsigned long sleep_time;
653
654 if (__sleep_time > NS_MAX_SLEEP_AVG)
655 sleep_time = NS_MAX_SLEEP_AVG;
656 else
657 sleep_time = (unsigned long)__sleep_time;
658
659 if (likely(sleep_time > 0)) {
660
661
662
663
664
665
666 if (p->mm && p->activated != -1 &&
667 sleep_time > INTERACTIVE_SLEEP(p)) {
668 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
669 DEF_TIMESLICE);
670 } else {
671
672
673
674
675 sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
676
677
678
679
680
681
682 if (p->activated == -1 && p->mm) {
683 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
684 sleep_time = 0;
685 else if (p->sleep_avg + sleep_time >=
686 INTERACTIVE_SLEEP(p)) {
687 p->sleep_avg = INTERACTIVE_SLEEP(p);
688 sleep_time = 0;
689 }
690 }
691
692
693
694
695
696
697
698
699
700 p->sleep_avg += sleep_time;
701
702 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
703 p->sleep_avg = NS_MAX_SLEEP_AVG;
704 }
705 }
706
707 p->prio = effective_prio(p);
708}
709
710
711
712
713
714
715
716static void activate_task(task_t *p, runqueue_t *rq, int local)
717{
718 unsigned long long now;
719
720 now = sched_clock();
721#ifdef CONFIG_SMP
722 if (!local) {
723
724 runqueue_t *this_rq = this_rq();
725 now = (now - this_rq->timestamp_last_tick)
726 + rq->timestamp_last_tick;
727 }
728#endif
729
730 recalc_task_prio(p, now);
731
732
733
734
735
736 if (!p->activated) {
737
738
739
740
741
742
743
744 if (in_interrupt())
745 p->activated = 2;
746 else {
747
748
749
750
751 p->activated = 1;
752 }
753 }
754 p->timestamp = now;
755
756 __activate_task(p, rq);
757}
758
759
760
761
762static void deactivate_task(struct task_struct *p, runqueue_t *rq)
763{
764 rq->nr_running--;
765 dequeue_task(p, p->array);
766 p->array = NULL;
767}
768
769
770
771
772
773
774
775
776#ifdef CONFIG_SMP
777static void resched_task(task_t *p)
778{
779 int need_resched, nrpolling;
780
781 assert_spin_locked(&task_rq(p)->lock);
782
783
784 nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
785 need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
786 nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
787
788 if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
789 smp_send_reschedule(task_cpu(p));
790}
791#else
792static inline void resched_task(task_t *p)
793{
794 set_tsk_need_resched(p);
795}
796#endif
797
798
799
800
801
802inline int task_curr(const task_t *p)
803{
804 return cpu_curr(task_cpu(p)) == p;
805}
806
807#ifdef CONFIG_SMP
808enum request_type {
809 REQ_MOVE_TASK,
810 REQ_SET_DOMAIN,
811};
812
813typedef struct {
814 struct list_head list;
815 enum request_type type;
816
817
818 task_t *task;
819 int dest_cpu;
820
821
822 struct sched_domain *sd;
823
824 struct completion done;
825} migration_req_t;
826
827
828
829
830
831static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
832{
833 runqueue_t *rq = task_rq(p);
834
835
836
837
838
839 if (!p->array && !task_running(rq, p)) {
840 set_task_cpu(p, dest_cpu);
841 return 0;
842 }
843
844 init_completion(&req->done);
845 req->type = REQ_MOVE_TASK;
846 req->task = p;
847 req->dest_cpu = dest_cpu;
848 list_add(&req->list, &rq->migration_queue);
849 return 1;
850}
851
852
853
854
855
856
857
858
859
860
861void wait_task_inactive(task_t * p)
862{
863 unsigned long flags;
864 runqueue_t *rq;
865 int preempted;
866
867repeat:
868 rq = task_rq_lock(p, &flags);
869
870 if (unlikely(p->array || task_running(rq, p))) {
871
872 preempted = !task_running(rq, p);
873 task_rq_unlock(rq, &flags);
874 cpu_relax();
875 if (preempted)
876 yield();
877 goto repeat;
878 }
879 task_rq_unlock(rq, &flags);
880}
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895void kick_process(task_t *p)
896{
897 int cpu;
898
899 preempt_disable();
900 cpu = task_cpu(p);
901 if ((cpu != smp_processor_id()) && task_curr(p))
902 smp_send_reschedule(cpu);
903 preempt_enable();
904}
905
906
907
908
909
910
911
912static inline unsigned long source_load(int cpu)
913{
914 runqueue_t *rq = cpu_rq(cpu);
915 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
916
917 return min(rq->cpu_load, load_now);
918}
919
920
921
922
923static inline unsigned long target_load(int cpu)
924{
925 runqueue_t *rq = cpu_rq(cpu);
926 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
927
928 return max(rq->cpu_load, load_now);
929}
930
931#endif
932
933
934
935
936
937
938
939
940
941#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
942static int wake_idle(int cpu, task_t *p)
943{
944 cpumask_t tmp;
945 struct sched_domain *sd;
946 int i;
947
948 if (idle_cpu(cpu))
949 return cpu;
950
951 for_each_domain(cpu, sd) {
952 if (sd->flags & SD_WAKE_IDLE) {
953 cpus_and(tmp, sd->span, cpu_online_map);
954 cpus_and(tmp, tmp, p->cpus_allowed);
955 for_each_cpu_mask(i, tmp) {
956 if (idle_cpu(i))
957 return i;
958 }
959 }
960 else break;
961 }
962 return cpu;
963}
964#else
965static inline int wake_idle(int cpu, task_t *p)
966{
967 return cpu;
968}
969#endif
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985static int try_to_wake_up(task_t * p, unsigned int state, int sync)
986{
987 int cpu, this_cpu, success = 0;
988 unsigned long flags;
989 long old_state;
990 runqueue_t *rq;
991#ifdef CONFIG_SMP
992 unsigned long load, this_load;
993 struct sched_domain *sd;
994 int new_cpu;
995#endif
996
997 rq = task_rq_lock(p, &flags);
998 schedstat_inc(rq, ttwu_cnt);
999 old_state = p->state;
1000 if (!(old_state & state))
1001 goto out;
1002
1003 if (p->array)
1004 goto out_running;
1005
1006 cpu = task_cpu(p);
1007 this_cpu = smp_processor_id();
1008
1009#ifdef CONFIG_SMP
1010 if (unlikely(task_running(rq, p)))
1011 goto out_activate;
1012
1013 new_cpu = cpu;
1014
1015 if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1016 goto out_set_cpu;
1017
1018 load = source_load(cpu);
1019 this_load = target_load(this_cpu);
1020
1021
1022
1023
1024
1025 if (sync)
1026 this_load -= SCHED_LOAD_SCALE;
1027
1028
1029 if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
1030 goto out_set_cpu;
1031
1032 new_cpu = this_cpu;
1033
1034
1035
1036
1037
1038 for_each_domain(this_cpu, sd) {
1039 unsigned int imbalance;
1040
1041
1042
1043
1044 imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
1045
1046 if ((sd->flags & SD_WAKE_AFFINE) &&
1047 !task_hot(p, rq->timestamp_last_tick, sd)) {
1048
1049
1050
1051
1052 if (cpu_isset(cpu, sd->span)) {
1053 schedstat_inc(sd, ttwu_wake_affine);
1054 goto out_set_cpu;
1055 }
1056 } else if ((sd->flags & SD_WAKE_BALANCE) &&
1057 imbalance*this_load <= 100*load) {
1058
1059
1060
1061
1062 if (cpu_isset(cpu, sd->span)) {
1063 schedstat_inc(sd, ttwu_wake_balance);
1064 goto out_set_cpu;
1065 }
1066 }
1067 }
1068
1069 new_cpu = cpu;
1070out_set_cpu:
1071 schedstat_inc(rq, ttwu_attempts);
1072 new_cpu = wake_idle(new_cpu, p);
1073 if (new_cpu != cpu) {
1074 schedstat_inc(rq, ttwu_moved);
1075 set_task_cpu(p, new_cpu);
1076 task_rq_unlock(rq, &flags);
1077
1078 rq = task_rq_lock(p, &flags);
1079 old_state = p->state;
1080 if (!(old_state & state))
1081 goto out;
1082 if (p->array)
1083 goto out_running;
1084
1085 this_cpu = smp_processor_id();
1086 cpu = task_cpu(p);
1087 }
1088
1089out_activate:
1090#endif
1091 if (old_state == TASK_UNINTERRUPTIBLE) {
1092 rq->nr_uninterruptible--;
1093
1094
1095
1096
1097 p->activated = -1;
1098 }
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108 activate_task(p, rq, cpu == this_cpu);
1109 if (!sync || cpu != this_cpu) {
1110 if (TASK_PREEMPTS_CURR(p, rq))
1111 resched_task(rq->curr);
1112 }
1113 success = 1;
1114
1115out_running:
1116 p->state = TASK_RUNNING;
1117out:
1118 task_rq_unlock(rq, &flags);
1119
1120 return success;
1121}
1122
1123int fastcall wake_up_process(task_t * p)
1124{
1125 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1126 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1127}
1128
1129EXPORT_SYMBOL(wake_up_process);
1130
1131int fastcall wake_up_state(task_t *p, unsigned int state)
1132{
1133 return try_to_wake_up(p, state, 0);
1134}
1135
1136#ifdef CONFIG_SMP
1137static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1138 struct sched_domain *sd);
1139#endif
1140
1141
1142
1143
1144
1145void fastcall sched_fork(task_t *p)
1146{
1147
1148
1149
1150
1151
1152
1153 p->state = TASK_RUNNING;
1154 INIT_LIST_HEAD(&p->run_list);
1155 p->array = NULL;
1156 spin_lock_init(&p->switch_lock);
1157#ifdef CONFIG_SCHEDSTATS
1158 memset(&p->sched_info, 0, sizeof(p->sched_info));
1159#endif
1160#ifdef CONFIG_PREEMPT
1161
1162
1163
1164
1165
1166
1167 p->thread_info->preempt_count = 1;
1168#endif
1169
1170
1171
1172
1173
1174 local_irq_disable();
1175 p->time_slice = (current->time_slice + 1) >> 1;
1176
1177
1178
1179
1180 p->first_time_slice = 1;
1181 current->time_slice >>= 1;
1182 p->timestamp = sched_clock();
1183 if (unlikely(!current->time_slice)) {
1184
1185
1186
1187
1188
1189 current->time_slice = 1;
1190 preempt_disable();
1191 scheduler_tick();
1192 local_irq_enable();
1193 preempt_enable();
1194 } else
1195 local_irq_enable();
1196}
1197
1198
1199
1200
1201
1202
1203
1204
1205void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1206{
1207 unsigned long flags;
1208 int this_cpu, cpu;
1209 runqueue_t *rq, *this_rq;
1210
1211 rq = task_rq_lock(p, &flags);
1212 cpu = task_cpu(p);
1213 this_cpu = smp_processor_id();
1214
1215 BUG_ON(p->state != TASK_RUNNING);
1216
1217 schedstat_inc(rq, wunt_cnt);
1218
1219
1220
1221
1222
1223
1224 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1225 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1226
1227 p->prio = effective_prio(p);
1228
1229 if (likely(cpu == this_cpu)) {
1230 if (!(clone_flags & CLONE_VM)) {
1231
1232
1233
1234
1235
1236 if (unlikely(!current->array))
1237 __activate_task(p, rq);
1238 else {
1239 p->prio = current->prio;
1240 list_add_tail(&p->run_list, ¤t->run_list);
1241 p->array = current->array;
1242 p->array->nr_active++;
1243 rq->nr_running++;
1244 }
1245 set_need_resched();
1246 } else
1247
1248 __activate_task(p, rq);
1249
1250
1251
1252
1253
1254
1255 this_rq = rq;
1256 } else {
1257 this_rq = cpu_rq(this_cpu);
1258
1259
1260
1261
1262
1263 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
1264 + rq->timestamp_last_tick;
1265 __activate_task(p, rq);
1266 if (TASK_PREEMPTS_CURR(p, rq))
1267 resched_task(rq->curr);
1268
1269 schedstat_inc(rq, wunt_moved);
1270
1271
1272
1273
1274 task_rq_unlock(rq, &flags);
1275 this_rq = task_rq_lock(current, &flags);
1276 }
1277 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1278 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1279 task_rq_unlock(this_rq, &flags);
1280}
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291void fastcall sched_exit(task_t * p)
1292{
1293 unsigned long flags;
1294 runqueue_t *rq;
1295
1296
1297
1298
1299
1300 rq = task_rq_lock(p->parent, &flags);
1301 if (p->first_time_slice) {
1302 p->parent->time_slice += p->time_slice;
1303 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1304 p->parent->time_slice = task_timeslice(p);
1305 }
1306 if (p->sleep_avg < p->parent->sleep_avg)
1307 p->parent->sleep_avg = p->parent->sleep_avg /
1308 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1309 (EXIT_WEIGHT + 1);
1310 task_rq_unlock(rq, &flags);
1311}
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326static void finish_task_switch(task_t *prev)
1327 __releases(rq->lock)
1328{
1329 runqueue_t *rq = this_rq();
1330 struct mm_struct *mm = rq->prev_mm;
1331 unsigned long prev_task_flags;
1332
1333 rq->prev_mm = NULL;
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346 prev_task_flags = prev->flags;
1347 finish_arch_switch(rq, prev);
1348 if (mm)
1349 mmdrop(mm);
1350 if (unlikely(prev_task_flags & PF_DEAD))
1351 put_task_struct(prev);
1352}
1353
1354
1355
1356
1357
1358asmlinkage void schedule_tail(task_t *prev)
1359 __releases(rq->lock)
1360{
1361 finish_task_switch(prev);
1362
1363 if (current->set_child_tid)
1364 put_user(current->pid, current->set_child_tid);
1365}
1366
1367
1368
1369
1370
1371static inline
1372task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1373{
1374 struct mm_struct *mm = next->mm;
1375 struct mm_struct *oldmm = prev->active_mm;
1376
1377 if (unlikely(!mm)) {
1378 next->active_mm = oldmm;
1379 atomic_inc(&oldmm->mm_count);
1380 enter_lazy_tlb(oldmm, next);
1381 } else
1382 switch_mm(oldmm, mm, next);
1383
1384 if (unlikely(!prev->mm)) {
1385 prev->active_mm = NULL;
1386 WARN_ON(rq->prev_mm);
1387 rq->prev_mm = oldmm;
1388 }
1389
1390
1391 switch_to(prev, next, prev);
1392
1393 return prev;
1394}
1395
1396
1397
1398
1399
1400
1401
1402
1403unsigned long nr_running(void)
1404{
1405 unsigned long i, sum = 0;
1406
1407 for_each_online_cpu(i)
1408 sum += cpu_rq(i)->nr_running;
1409
1410 return sum;
1411}
1412
1413unsigned long nr_uninterruptible(void)
1414{
1415 unsigned long i, sum = 0;
1416
1417 for_each_cpu(i)
1418 sum += cpu_rq(i)->nr_uninterruptible;
1419
1420
1421
1422
1423
1424 if (unlikely((long)sum < 0))
1425 sum = 0;
1426
1427 return sum;
1428}
1429
1430unsigned long long nr_context_switches(void)
1431{
1432 unsigned long long i, sum = 0;
1433
1434 for_each_cpu(i)
1435 sum += cpu_rq(i)->nr_switches;
1436
1437 return sum;
1438}
1439
1440unsigned long nr_iowait(void)
1441{
1442 unsigned long i, sum = 0;
1443
1444 for_each_cpu(i)
1445 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1446
1447 return sum;
1448}
1449
1450#ifdef CONFIG_SMP
1451
1452
1453
1454
1455
1456
1457
1458static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1459 __acquires(rq1->lock)
1460 __acquires(rq2->lock)
1461{
1462 if (rq1 == rq2) {
1463 spin_lock(&rq1->lock);
1464 __acquire(rq2->lock);
1465 } else {
1466 if (rq1 < rq2) {
1467 spin_lock(&rq1->lock);
1468 spin_lock(&rq2->lock);
1469 } else {
1470 spin_lock(&rq2->lock);
1471 spin_lock(&rq1->lock);
1472 }
1473 }
1474}
1475
1476
1477
1478
1479
1480
1481
1482static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1483 __releases(rq1->lock)
1484 __releases(rq2->lock)
1485{
1486 spin_unlock(&rq1->lock);
1487 if (rq1 != rq2)
1488 spin_unlock(&rq2->lock);
1489 else
1490 __release(rq2->lock);
1491}
1492
1493
1494
1495
1496static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1497 __releases(this_rq->lock)
1498 __acquires(busiest->lock)
1499 __acquires(this_rq->lock)
1500{
1501 if (unlikely(!spin_trylock(&busiest->lock))) {
1502 if (busiest < this_rq) {
1503 spin_unlock(&this_rq->lock);
1504 spin_lock(&busiest->lock);
1505 spin_lock(&this_rq->lock);
1506 } else
1507 spin_lock(&busiest->lock);
1508 }
1509}
1510
1511
1512
1513
1514static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1515 struct sched_domain *sd)
1516{
1517 unsigned long load, min_load, this_load;
1518 int i, min_cpu;
1519 cpumask_t mask;
1520
1521 min_cpu = UINT_MAX;
1522 min_load = ULONG_MAX;
1523
1524 cpus_and(mask, sd->span, p->cpus_allowed);
1525
1526 for_each_cpu_mask(i, mask) {
1527 load = target_load(i);
1528
1529 if (load < min_load) {
1530 min_cpu = i;
1531 min_load = load;
1532
1533
1534 if (!min_load)
1535 break;
1536 }
1537 }
1538
1539
1540 this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550 if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1551 return min_cpu;
1552
1553 return this_cpu;
1554}
1555
1556
1557
1558
1559
1560
1561
1562static void sched_migrate_task(task_t *p, int dest_cpu)
1563{
1564 migration_req_t req;
1565 runqueue_t *rq;
1566 unsigned long flags;
1567
1568 rq = task_rq_lock(p, &flags);
1569 if (!cpu_isset(dest_cpu, p->cpus_allowed)
1570 || unlikely(cpu_is_offline(dest_cpu)))
1571 goto out;
1572
1573 schedstat_inc(rq, smt_cnt);
1574
1575 if (migrate_task(p, dest_cpu, &req)) {
1576
1577 struct task_struct *mt = rq->migration_thread;
1578 get_task_struct(mt);
1579 task_rq_unlock(rq, &flags);
1580 wake_up_process(mt);
1581 put_task_struct(mt);
1582 wait_for_completion(&req.done);
1583 return;
1584 }
1585out:
1586 task_rq_unlock(rq, &flags);
1587}
1588
1589
1590
1591
1592
1593
1594
1595
1596void sched_exec(void)
1597{
1598 struct sched_domain *tmp, *sd = NULL;
1599 int new_cpu, this_cpu = get_cpu();
1600
1601 schedstat_inc(this_rq(), sbe_cnt);
1602
1603 if (this_rq()->nr_running <= 1)
1604 goto out;
1605
1606 for_each_domain(this_cpu, tmp)
1607 if (tmp->flags & SD_BALANCE_EXEC)
1608 sd = tmp;
1609
1610 if (sd) {
1611 schedstat_inc(sd, sbe_attempts);
1612 new_cpu = find_idlest_cpu(current, this_cpu, sd);
1613 if (new_cpu != this_cpu) {
1614 schedstat_inc(sd, sbe_pushed);
1615 put_cpu();
1616 sched_migrate_task(current, new_cpu);
1617 return;
1618 }
1619 }
1620out:
1621 put_cpu();
1622}
1623
1624
1625
1626
1627
1628static inline
1629void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1630 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1631{
1632 dequeue_task(p, src_array);
1633 src_rq->nr_running--;
1634 set_task_cpu(p, this_cpu);
1635 this_rq->nr_running++;
1636 enqueue_task(p, this_array);
1637 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1638 + this_rq->timestamp_last_tick;
1639
1640
1641
1642
1643 if (TASK_PREEMPTS_CURR(p, this_rq))
1644 resched_task(this_rq->curr);
1645}
1646
1647
1648
1649
1650static inline
1651int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1652 struct sched_domain *sd, enum idle_type idle)
1653{
1654
1655
1656
1657
1658
1659
1660 if (task_running(rq, p))
1661 return 0;
1662 if (!cpu_isset(this_cpu, p->cpus_allowed))
1663 return 0;
1664
1665
1666
1667
1668
1669
1670
1671 if (cpu_and_siblings_are_idle(this_cpu) || \
1672 sd->nr_balance_failed > sd->cache_nice_tries)
1673 return 1;
1674
1675 if (task_hot(p, rq->timestamp_last_tick, sd))
1676 return 0;
1677 return 1;
1678}
1679
1680
1681
1682
1683
1684
1685
1686
1687static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1688 unsigned long max_nr_move, struct sched_domain *sd,
1689 enum idle_type idle)
1690{
1691 prio_array_t *array, *dst_array;
1692 struct list_head *head, *curr;
1693 int idx, pulled = 0;
1694 task_t *tmp;
1695
1696 if (max_nr_move <= 0 || busiest->nr_running <= 1)
1697 goto out;
1698
1699
1700
1701
1702
1703
1704
1705 if (busiest->expired->nr_active) {
1706 array = busiest->expired;
1707 dst_array = this_rq->expired;
1708 } else {
1709 array = busiest->active;
1710 dst_array = this_rq->active;
1711 }
1712
1713new_array:
1714
1715 idx = 0;
1716skip_bitmap:
1717 if (!idx)
1718 idx = sched_find_first_bit(array->bitmap);
1719 else
1720 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1721 if (idx >= MAX_PRIO) {
1722 if (array == busiest->expired && busiest->active->nr_active) {
1723 array = busiest->active;
1724 dst_array = this_rq->active;
1725 goto new_array;
1726 }
1727 goto out;
1728 }
1729
1730 head = array->queue + idx;
1731 curr = head->prev;
1732skip_queue:
1733 tmp = list_entry(curr, task_t, run_list);
1734
1735 curr = curr->prev;
1736
1737 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
1738 if (curr != head)
1739 goto skip_queue;
1740 idx++;
1741 goto skip_bitmap;
1742 }
1743
1744
1745
1746
1747
1748
1749 schedstat_inc(this_rq, pt_gained[idle]);
1750 schedstat_inc(busiest, pt_lost[idle]);
1751
1752 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1753 pulled++;
1754
1755
1756 if (pulled < max_nr_move) {
1757 if (curr != head)
1758 goto skip_queue;
1759 idx++;
1760 goto skip_bitmap;
1761 }
1762out:
1763 return pulled;
1764}
1765
1766
1767
1768
1769
1770
1771static struct sched_group *
1772find_busiest_group(struct sched_domain *sd, int this_cpu,
1773 unsigned long *imbalance, enum idle_type idle)
1774{
1775 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1776 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1777
1778 max_load = this_load = total_load = total_pwr = 0;
1779
1780 do {
1781 unsigned long load;
1782 int local_group;
1783 int i, nr_cpus = 0;
1784
1785 local_group = cpu_isset(this_cpu, group->cpumask);
1786
1787
1788 avg_load = 0;
1789
1790 for_each_cpu_mask(i, group->cpumask) {
1791
1792 if (local_group)
1793 load = target_load(i);
1794 else
1795 load = source_load(i);
1796
1797 nr_cpus++;
1798 avg_load += load;
1799 }
1800
1801 if (!nr_cpus)
1802 goto nextgroup;
1803
1804 total_load += avg_load;
1805 total_pwr += group->cpu_power;
1806
1807
1808 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1809
1810 if (local_group) {
1811 this_load = avg_load;
1812 this = group;
1813 goto nextgroup;
1814 } else if (avg_load > max_load) {
1815 max_load = avg_load;
1816 busiest = group;
1817 }
1818nextgroup:
1819 group = group->next;
1820 } while (group != sd->groups);
1821
1822 if (!busiest || this_load >= max_load)
1823 goto out_balanced;
1824
1825 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
1826
1827 if (this_load >= avg_load ||
1828 100*max_load <= sd->imbalance_pct*this_load)
1829 goto out_balanced;
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842 *imbalance = min(max_load - avg_load, avg_load - this_load);
1843
1844
1845 *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
1846 / SCHED_LOAD_SCALE;
1847
1848 if (*imbalance < SCHED_LOAD_SCALE - 1) {
1849 unsigned long pwr_now = 0, pwr_move = 0;
1850 unsigned long tmp;
1851
1852 if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
1853 *imbalance = 1;
1854 return busiest;
1855 }
1856
1857
1858
1859
1860
1861
1862
1863 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
1864 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
1865 pwr_now /= SCHED_LOAD_SCALE;
1866
1867
1868 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
1869 if (max_load > tmp)
1870 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
1871 max_load - tmp);
1872
1873
1874 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
1875 if (max_load < tmp)
1876 tmp = max_load;
1877 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
1878 pwr_move /= SCHED_LOAD_SCALE;
1879
1880
1881 if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
1882 goto out_balanced;
1883
1884 *imbalance = 1;
1885 return busiest;
1886 }
1887
1888
1889 *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
1890
1891 return busiest;
1892
1893out_balanced:
1894 if (busiest && (idle == NEWLY_IDLE ||
1895 (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
1896 *imbalance = 1;
1897 return busiest;
1898 }
1899
1900 *imbalance = 0;
1901 return NULL;
1902}
1903
1904
1905
1906
1907static runqueue_t *find_busiest_queue(struct sched_group *group)
1908{
1909 unsigned long load, max_load = 0;
1910 runqueue_t *busiest = NULL;
1911 int i;
1912
1913 for_each_cpu_mask(i, group->cpumask) {
1914 load = source_load(i);
1915
1916 if (load > max_load) {
1917 max_load = load;
1918 busiest = cpu_rq(i);
1919 }
1920 }
1921
1922 return busiest;
1923}
1924
1925
1926
1927
1928
1929
1930
1931static int load_balance(int this_cpu, runqueue_t *this_rq,
1932 struct sched_domain *sd, enum idle_type idle)
1933{
1934 struct sched_group *group;
1935 runqueue_t *busiest;
1936 unsigned long imbalance;
1937 int nr_moved;
1938
1939 spin_lock(&this_rq->lock);
1940 schedstat_inc(sd, lb_cnt[idle]);
1941
1942 group = find_busiest_group(sd, this_cpu, &imbalance, idle);
1943 if (!group) {
1944 schedstat_inc(sd, lb_nobusyg[idle]);
1945 goto out_balanced;
1946 }
1947
1948 busiest = find_busiest_queue(group);
1949 if (!busiest) {
1950 schedstat_inc(sd, lb_nobusyq[idle]);
1951 goto out_balanced;
1952 }
1953
1954
1955
1956
1957
1958
1959 if (unlikely(busiest == this_rq)) {
1960 WARN_ON(1);
1961 goto out_balanced;
1962 }
1963
1964 schedstat_add(sd, lb_imbalance[idle], imbalance);
1965
1966 nr_moved = 0;
1967 if (busiest->nr_running > 1) {
1968
1969
1970
1971
1972
1973
1974 double_lock_balance(this_rq, busiest);
1975 nr_moved = move_tasks(this_rq, this_cpu, busiest,
1976 imbalance, sd, idle);
1977 spin_unlock(&busiest->lock);
1978 }
1979 spin_unlock(&this_rq->lock);
1980
1981 if (!nr_moved) {
1982 schedstat_inc(sd, lb_failed[idle]);
1983 sd->nr_balance_failed++;
1984
1985 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
1986 int wake = 0;
1987
1988 spin_lock(&busiest->lock);
1989 if (!busiest->active_balance) {
1990 busiest->active_balance = 1;
1991 busiest->push_cpu = this_cpu;
1992 wake = 1;
1993 }
1994 spin_unlock(&busiest->lock);
1995 if (wake)
1996 wake_up_process(busiest->migration_thread);
1997
1998
1999
2000
2001
2002 sd->nr_balance_failed = sd->cache_nice_tries;
2003 }
2004
2005
2006
2007
2008
2009 if (sd->balance_interval < sd->max_interval)
2010 sd->balance_interval++;
2011 } else {
2012 sd->nr_balance_failed = 0;
2013
2014
2015 sd->balance_interval = sd->min_interval;
2016 }
2017
2018 return nr_moved;
2019
2020out_balanced:
2021 spin_unlock(&this_rq->lock);
2022
2023
2024 if (sd->balance_interval < sd->max_interval)
2025 sd->balance_interval *= 2;
2026
2027 return 0;
2028}
2029
2030
2031
2032
2033
2034
2035
2036
2037static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2038 struct sched_domain *sd)
2039{
2040 struct sched_group *group;
2041 runqueue_t *busiest = NULL;
2042 unsigned long imbalance;
2043 int nr_moved = 0;
2044
2045 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2046 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
2047 if (!group) {
2048 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2049 goto out;
2050 }
2051
2052 busiest = find_busiest_queue(group);
2053 if (!busiest || busiest == this_rq) {
2054 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2055 goto out;
2056 }
2057
2058
2059 double_lock_balance(this_rq, busiest);
2060
2061 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2062 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2063 imbalance, sd, NEWLY_IDLE);
2064 if (!nr_moved)
2065 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2066
2067 spin_unlock(&busiest->lock);
2068
2069out:
2070 return nr_moved;
2071}
2072
2073
2074
2075
2076
2077static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
2078{
2079 struct sched_domain *sd;
2080
2081 for_each_domain(this_cpu, sd) {
2082 if (sd->flags & SD_BALANCE_NEWIDLE) {
2083 if (load_balance_newidle(this_cpu, this_rq, sd)) {
2084
2085 break;
2086 }
2087 }
2088 }
2089}
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2100{
2101 struct sched_domain *sd;
2102 struct sched_group *cpu_group;
2103 runqueue_t *target_rq;
2104 cpumask_t visited_cpus;
2105 int cpu;
2106
2107 schedstat_inc(busiest_rq, alb_cnt);
2108
2109
2110
2111
2112 visited_cpus = CPU_MASK_NONE;
2113 for_each_domain(busiest_cpu, sd) {
2114 if (!(sd->flags & SD_LOAD_BALANCE))
2115
2116 break;
2117
2118 cpu_group = sd->groups;
2119 do {
2120 for_each_cpu_mask(cpu, cpu_group->cpumask) {
2121 if (busiest_rq->nr_running <= 1)
2122
2123 return;
2124 if (cpu_isset(cpu, visited_cpus))
2125 continue;
2126 cpu_set(cpu, visited_cpus);
2127 if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
2128 continue;
2129
2130 target_rq = cpu_rq(cpu);
2131
2132
2133
2134
2135
2136 BUG_ON(busiest_rq == target_rq);
2137
2138
2139 double_lock_balance(busiest_rq, target_rq);
2140 if (move_tasks(target_rq, cpu, busiest_rq,
2141 1, sd, SCHED_IDLE)) {
2142 schedstat_inc(busiest_rq, alb_lost);
2143 schedstat_inc(target_rq, alb_gained);
2144 } else {
2145 schedstat_inc(busiest_rq, alb_failed);
2146 }
2147 spin_unlock(&target_rq->lock);
2148 }
2149 cpu_group = cpu_group->next;
2150 } while (cpu_group != sd->groups);
2151 }
2152}
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
2165
2166static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2167 enum idle_type idle)
2168{
2169 unsigned long old_load, this_load;
2170 unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2171 struct sched_domain *sd;
2172
2173
2174 old_load = this_rq->cpu_load;
2175 this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2176
2177
2178
2179
2180
2181 if (this_load > old_load)
2182 old_load++;
2183 this_rq->cpu_load = (old_load + this_load) / 2;
2184
2185 for_each_domain(this_cpu, sd) {
2186 unsigned long interval;
2187
2188 if (!(sd->flags & SD_LOAD_BALANCE))
2189 continue;
2190
2191 interval = sd->balance_interval;
2192 if (idle != SCHED_IDLE)
2193 interval *= sd->busy_factor;
2194
2195
2196 interval = msecs_to_jiffies(interval);
2197 if (unlikely(!interval))
2198 interval = 1;
2199
2200 if (j - sd->last_balance >= interval) {
2201 if (load_balance(this_cpu, this_rq, sd, idle)) {
2202
2203 idle = NOT_IDLE;
2204 }
2205 sd->last_balance += interval;
2206 }
2207 }
2208}
2209#else
2210
2211
2212
2213static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
2214{
2215}
2216static inline void idle_balance(int cpu, runqueue_t *rq)
2217{
2218}
2219#endif
2220
2221static inline int wake_priority_sleeper(runqueue_t *rq)
2222{
2223 int ret = 0;
2224#ifdef CONFIG_SCHED_SMT
2225 spin_lock(&rq->lock);
2226
2227
2228
2229
2230 if (rq->nr_running) {
2231 resched_task(rq->idle);
2232 ret = 1;
2233 }
2234 spin_unlock(&rq->lock);
2235#endif
2236 return ret;
2237}
2238
2239DEFINE_PER_CPU(struct kernel_stat, kstat);
2240
2241EXPORT_PER_CPU_SYMBOL(kstat);
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253#define EXPIRED_STARVING(rq) \
2254 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
2255 (jiffies - (rq)->expired_timestamp >= \
2256 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
2257 ((rq)->curr->static_prio > (rq)->best_expired_prio))
2258
2259
2260
2261
2262
2263
2264static inline void account_it_virt(struct task_struct * p, cputime_t cputime)
2265{
2266 cputime_t it_virt = p->it_virt_value;
2267
2268 if (cputime_gt(it_virt, cputime_zero) &&
2269 cputime_gt(cputime, cputime_zero)) {
2270 if (cputime_ge(cputime, it_virt)) {
2271 it_virt = cputime_add(it_virt, p->it_virt_incr);
2272 send_sig(SIGVTALRM, p, 1);
2273 }
2274 it_virt = cputime_sub(it_virt, cputime);
2275 p->it_virt_value = it_virt;
2276 }
2277}
2278
2279
2280
2281
2282
2283
2284static void account_it_prof(struct task_struct *p, cputime_t cputime)
2285{
2286 cputime_t it_prof = p->it_prof_value;
2287
2288 if (cputime_gt(it_prof, cputime_zero) &&
2289 cputime_gt(cputime, cputime_zero)) {
2290 if (cputime_ge(cputime, it_prof)) {
2291 it_prof = cputime_add(it_prof, p->it_prof_incr);
2292 send_sig(SIGPROF, p, 1);
2293 }
2294 it_prof = cputime_sub(it_prof, cputime);
2295 p->it_prof_value = it_prof;
2296 }
2297}
2298
2299
2300
2301
2302
2303
2304
2305static void check_rlimit(struct task_struct *p, cputime_t cputime)
2306{
2307 cputime_t total, tmp;
2308 unsigned long secs;
2309
2310 total = cputime_add(p->utime, p->stime);
2311 secs = cputime_to_secs(total);
2312 if (unlikely(secs >= p->signal->rlim[RLIMIT_CPU].rlim_cur)) {
2313
2314 tmp = cputime_sub(total, cputime);
2315 if (cputime_to_secs(tmp) < secs)
2316 send_sig(SIGXCPU, p, 1);
2317
2318 if (secs >= p->signal->rlim[RLIMIT_CPU].rlim_max)
2319 send_sig(SIGKILL, p, 1);
2320 }
2321}
2322
2323
2324
2325
2326
2327
2328
2329void account_user_time(struct task_struct *p, cputime_t cputime)
2330{
2331 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2332 cputime64_t tmp;
2333
2334 p->utime = cputime_add(p->utime, cputime);
2335
2336
2337 check_rlimit(p, cputime);
2338 account_it_virt(p, cputime);
2339 account_it_prof(p, cputime);
2340
2341
2342 tmp = cputime_to_cputime64(cputime);
2343 if (TASK_NICE(p) > 0)
2344 cpustat->nice = cputime64_add(cpustat->nice, tmp);
2345 else
2346 cpustat->user = cputime64_add(cpustat->user, tmp);
2347}
2348
2349
2350
2351
2352
2353
2354
2355void account_system_time(struct task_struct *p, int hardirq_offset,
2356 cputime_t cputime)
2357{
2358 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2359 runqueue_t *rq = this_rq();
2360 cputime64_t tmp;
2361
2362 p->stime = cputime_add(p->stime, cputime);
2363
2364
2365 if (likely(p->signal && p->exit_state < EXIT_ZOMBIE)) {
2366 check_rlimit(p, cputime);
2367 account_it_prof(p, cputime);
2368 }
2369
2370
2371 tmp = cputime_to_cputime64(cputime);
2372 if (hardirq_count() - hardirq_offset)
2373 cpustat->irq = cputime64_add(cpustat->irq, tmp);
2374 else if (softirq_count())
2375 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
2376 else if (p != rq->idle)
2377 cpustat->system = cputime64_add(cpustat->system, tmp);
2378 else if (atomic_read(&rq->nr_iowait) > 0)
2379 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2380 else
2381 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2382}
2383
2384
2385
2386
2387
2388
2389void account_steal_time(struct task_struct *p, cputime_t steal)
2390{
2391 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2392 cputime64_t tmp = cputime_to_cputime64(steal);
2393 runqueue_t *rq = this_rq();
2394
2395 if (p == rq->idle) {
2396 p->stime = cputime_add(p->stime, steal);
2397 if (atomic_read(&rq->nr_iowait) > 0)
2398 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2399 else
2400 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2401 } else
2402 cpustat->steal = cputime64_add(cpustat->steal, tmp);
2403}
2404
2405
2406
2407
2408
2409
2410
2411
2412void scheduler_tick(void)
2413{
2414 int cpu = smp_processor_id();
2415 runqueue_t *rq = this_rq();
2416 task_t *p = current;
2417
2418 rq->timestamp_last_tick = sched_clock();
2419
2420 if (p == rq->idle) {
2421 if (wake_priority_sleeper(rq))
2422 goto out;
2423 rebalance_tick(cpu, rq, SCHED_IDLE);
2424 return;
2425 }
2426
2427
2428 if (p->array != rq->active) {
2429 set_tsk_need_resched(p);
2430 goto out;
2431 }
2432 spin_lock(&rq->lock);
2433
2434
2435
2436
2437
2438
2439
2440 if (rt_task(p)) {
2441
2442
2443
2444
2445 if ((p->policy == SCHED_RR) && !--p->time_slice) {
2446 p->time_slice = task_timeslice(p);
2447 p->first_time_slice = 0;
2448 set_tsk_need_resched(p);
2449
2450
2451 requeue_task(p, rq->active);
2452 }
2453 goto out_unlock;
2454 }
2455 if (!--p->time_slice) {
2456 dequeue_task(p, rq->active);
2457 set_tsk_need_resched(p);
2458 p->prio = effective_prio(p);
2459 p->time_slice = task_timeslice(p);
2460 p->first_time_slice = 0;
2461
2462 if (!rq->expired_timestamp)
2463 rq->expired_timestamp = jiffies;
2464 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
2465 enqueue_task(p, rq->expired);
2466 if (p->static_prio < rq->best_expired_prio)
2467 rq->best_expired_prio = p->static_prio;
2468 } else
2469 enqueue_task(p, rq->active);
2470 } else {
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
2488 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
2489 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
2490 (p->array == rq->active)) {
2491
2492 requeue_task(p, rq->active);
2493 set_tsk_need_resched(p);
2494 }
2495 }
2496out_unlock:
2497 spin_unlock(&rq->lock);
2498out:
2499 rebalance_tick(cpu, rq, NOT_IDLE);
2500}
2501
2502#ifdef CONFIG_SCHED_SMT
2503static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2504{
2505 struct sched_domain *sd = this_rq->sd;
2506 cpumask_t sibling_map;
2507 int i;
2508
2509 if (!(sd->flags & SD_SHARE_CPUPOWER))
2510 return;
2511
2512
2513
2514
2515
2516
2517 spin_unlock(&this_rq->lock);
2518
2519 sibling_map = sd->span;
2520
2521 for_each_cpu_mask(i, sibling_map)
2522 spin_lock(&cpu_rq(i)->lock);
2523
2524
2525
2526
2527 cpu_clear(this_cpu, sibling_map);
2528
2529 for_each_cpu_mask(i, sibling_map) {
2530 runqueue_t *smt_rq = cpu_rq(i);
2531
2532
2533
2534
2535
2536 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
2537 resched_task(smt_rq->idle);
2538 }
2539
2540 for_each_cpu_mask(i, sibling_map)
2541 spin_unlock(&cpu_rq(i)->lock);
2542
2543
2544
2545
2546}
2547
2548static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2549{
2550 struct sched_domain *sd = this_rq->sd;
2551 cpumask_t sibling_map;
2552 prio_array_t *array;
2553 int ret = 0, i;
2554 task_t *p;
2555
2556 if (!(sd->flags & SD_SHARE_CPUPOWER))
2557 return 0;
2558
2559
2560
2561
2562
2563 spin_unlock(&this_rq->lock);
2564 sibling_map = sd->span;
2565 for_each_cpu_mask(i, sibling_map)
2566 spin_lock(&cpu_rq(i)->lock);
2567 cpu_clear(this_cpu, sibling_map);
2568
2569
2570
2571
2572
2573 if (!this_rq->nr_running)
2574 goto out_unlock;
2575 array = this_rq->active;
2576 if (!array->nr_active)
2577 array = this_rq->expired;
2578 BUG_ON(!array->nr_active);
2579
2580 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
2581 task_t, run_list);
2582
2583 for_each_cpu_mask(i, sibling_map) {
2584 runqueue_t *smt_rq = cpu_rq(i);
2585 task_t *smt_curr = smt_rq->curr;
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595 if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
2596 task_timeslice(p) || rt_task(smt_curr)) &&
2597 p->mm && smt_curr->mm && !rt_task(p))
2598 ret = 1;
2599
2600
2601
2602
2603
2604
2605 if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
2606 task_timeslice(smt_curr) || rt_task(p)) &&
2607 smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
2608 (smt_curr == smt_rq->idle && smt_rq->nr_running))
2609 resched_task(smt_curr);
2610 }
2611out_unlock:
2612 for_each_cpu_mask(i, sibling_map)
2613 spin_unlock(&cpu_rq(i)->lock);
2614 return ret;
2615}
2616#else
2617static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2618{
2619}
2620
2621static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2622{
2623 return 0;
2624}
2625#endif
2626
2627#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
2628
2629void fastcall add_preempt_count(int val)
2630{
2631
2632
2633
2634 BUG_ON(((int)preempt_count() < 0));
2635 preempt_count() += val;
2636
2637
2638
2639 BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
2640}
2641EXPORT_SYMBOL(add_preempt_count);
2642
2643void fastcall sub_preempt_count(int val)
2644{
2645
2646
2647
2648 BUG_ON(val > preempt_count());
2649
2650
2651
2652 BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
2653 preempt_count() -= val;
2654}
2655EXPORT_SYMBOL(sub_preempt_count);
2656
2657#endif
2658
2659
2660
2661
2662asmlinkage void __sched schedule(void)
2663{
2664 long *switch_count;
2665 task_t *prev, *next;
2666 runqueue_t *rq;
2667 prio_array_t *array;
2668 struct list_head *queue;
2669 unsigned long long now;
2670 unsigned long run_time;
2671 int cpu, idx;
2672
2673
2674
2675
2676
2677
2678 if (likely(!current->exit_state)) {
2679 if (unlikely(in_atomic())) {
2680 printk(KERN_ERR "scheduling while atomic: "
2681 "%s/0x%08x/%d\n",
2682 current->comm, preempt_count(), current->pid);
2683 dump_stack();
2684 }
2685 }
2686 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2687
2688need_resched:
2689 preempt_disable();
2690 prev = current;
2691 release_kernel_lock(prev);
2692need_resched_nonpreemptible:
2693 rq = this_rq();
2694
2695
2696
2697
2698
2699 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
2700 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
2701 dump_stack();
2702 }
2703
2704 schedstat_inc(rq, sched_cnt);
2705 now = sched_clock();
2706 if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
2707 run_time = now - prev->timestamp;
2708 else
2709 run_time = NS_MAX_SLEEP_AVG;
2710
2711
2712
2713
2714
2715 run_time /= (CURRENT_BONUS(prev) ? : 1);
2716
2717 spin_lock_irq(&rq->lock);
2718
2719 if (unlikely(prev->flags & PF_DEAD))
2720 prev->state = EXIT_DEAD;
2721
2722 switch_count = &prev->nivcsw;
2723 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2724 switch_count = &prev->nvcsw;
2725 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
2726 unlikely(signal_pending(prev))))
2727 prev->state = TASK_RUNNING;
2728 else {
2729 if (prev->state == TASK_UNINTERRUPTIBLE)
2730 rq->nr_uninterruptible++;
2731 deactivate_task(prev, rq);
2732 }
2733 }
2734
2735 cpu = smp_processor_id();
2736 if (unlikely(!rq->nr_running)) {
2737go_idle:
2738 idle_balance(cpu, rq);
2739 if (!rq->nr_running) {
2740 next = rq->idle;
2741 rq->expired_timestamp = 0;
2742 wake_sleeping_dependent(cpu, rq);
2743
2744
2745
2746
2747
2748 if (!rq->nr_running)
2749 goto switch_tasks;
2750 }
2751 } else {
2752 if (dependent_sleeper(cpu, rq)) {
2753 next = rq->idle;
2754 goto switch_tasks;
2755 }
2756
2757
2758
2759
2760
2761 if (unlikely(!rq->nr_running))
2762 goto go_idle;
2763 }
2764
2765 array = rq->active;
2766 if (unlikely(!array->nr_active)) {
2767
2768
2769
2770 schedstat_inc(rq, sched_switch);
2771 rq->active = rq->expired;
2772 rq->expired = array;
2773 array = rq->active;
2774 rq->expired_timestamp = 0;
2775 rq->best_expired_prio = MAX_PRIO;
2776 } else
2777 schedstat_inc(rq, sched_noswitch);
2778
2779 idx = sched_find_first_bit(array->bitmap);
2780 queue = array->queue + idx;
2781 next = list_entry(queue->next, task_t, run_list);
2782
2783 if (!rt_task(next) && next->activated > 0) {
2784 unsigned long long delta = now - next->timestamp;
2785
2786 if (next->activated == 1)
2787 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2788
2789 array = next->array;
2790 dequeue_task(next, array);
2791 recalc_task_prio(next, next->timestamp + delta);
2792 enqueue_task(next, array);
2793 }
2794 next->activated = 0;
2795switch_tasks:
2796 if (next == rq->idle)
2797 schedstat_inc(rq, sched_goidle);
2798 prefetch(next);
2799 clear_tsk_need_resched(prev);
2800 rcu_qsctr_inc(task_cpu(prev));
2801
2802 prev->sleep_avg -= run_time;
2803 if ((long)prev->sleep_avg <= 0)
2804 prev->sleep_avg = 0;
2805 prev->timestamp = prev->last_ran = now;
2806
2807 sched_info_switch(prev, next);
2808 if (likely(prev != next)) {
2809 next->timestamp = now;
2810 rq->nr_switches++;
2811 rq->curr = next;
2812 ++*switch_count;
2813
2814 prepare_arch_switch(rq, next);
2815 prev = context_switch(rq, prev, next);
2816 barrier();
2817
2818 finish_task_switch(prev);
2819 } else
2820 spin_unlock_irq(&rq->lock);
2821
2822 prev = current;
2823 if (unlikely(reacquire_kernel_lock(prev) < 0))
2824 goto need_resched_nonpreemptible;
2825 preempt_enable_no_resched();
2826 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2827 goto need_resched;
2828}
2829
2830EXPORT_SYMBOL(schedule);
2831
2832#ifdef CONFIG_PREEMPT
2833
2834
2835
2836
2837
2838asmlinkage void __sched preempt_schedule(void)
2839{
2840 struct thread_info *ti = current_thread_info();
2841#ifdef CONFIG_PREEMPT_BKL
2842 struct task_struct *task = current;
2843 int saved_lock_depth;
2844#endif
2845
2846
2847
2848
2849 if (unlikely(ti->preempt_count || irqs_disabled()))
2850 return;
2851
2852need_resched:
2853 add_preempt_count(PREEMPT_ACTIVE);
2854
2855
2856
2857
2858
2859#ifdef CONFIG_PREEMPT_BKL
2860 saved_lock_depth = task->lock_depth;
2861 task->lock_depth = -1;
2862#endif
2863 schedule();
2864#ifdef CONFIG_PREEMPT_BKL
2865 task->lock_depth = saved_lock_depth;
2866#endif
2867 sub_preempt_count(PREEMPT_ACTIVE);
2868
2869
2870 barrier();
2871 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2872 goto need_resched;
2873}
2874
2875EXPORT_SYMBOL(preempt_schedule);
2876
2877
2878
2879
2880
2881
2882
2883asmlinkage void __sched preempt_schedule_irq(void)
2884{
2885 struct thread_info *ti = current_thread_info();
2886#ifdef CONFIG_PREEMPT_BKL
2887 struct task_struct *task = current;
2888 int saved_lock_depth;
2889#endif
2890
2891 BUG_ON(ti->preempt_count || !irqs_disabled());
2892
2893need_resched:
2894 add_preempt_count(PREEMPT_ACTIVE);
2895
2896
2897
2898
2899
2900#ifdef CONFIG_PREEMPT_BKL
2901 saved_lock_depth = task->lock_depth;
2902 task->lock_depth = -1;
2903#endif
2904 local_irq_enable();
2905 schedule();
2906 local_irq_disable();
2907#ifdef CONFIG_PREEMPT_BKL
2908 task->lock_depth = saved_lock_depth;
2909#endif
2910 sub_preempt_count(PREEMPT_ACTIVE);
2911
2912
2913 barrier();
2914 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2915 goto need_resched;
2916}
2917
2918#endif
2919
2920int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
2921{
2922 task_t *p = curr->task;
2923 return try_to_wake_up(p, mode, sync);
2924}
2925
2926EXPORT_SYMBOL(default_wake_function);
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2938 int nr_exclusive, int sync, void *key)
2939{
2940 struct list_head *tmp, *next;
2941
2942 list_for_each_safe(tmp, next, &q->task_list) {
2943 wait_queue_t *curr;
2944 unsigned flags;
2945 curr = list_entry(tmp, wait_queue_t, task_list);
2946 flags = curr->flags;
2947 if (curr->func(curr, mode, sync, key) &&
2948 (flags & WQ_FLAG_EXCLUSIVE) &&
2949 !--nr_exclusive)
2950 break;
2951 }
2952}
2953
2954
2955
2956
2957
2958
2959
2960void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
2961 int nr_exclusive, void *key)
2962{
2963 unsigned long flags;
2964
2965 spin_lock_irqsave(&q->lock, flags);
2966 __wake_up_common(q, mode, nr_exclusive, 0, key);
2967 spin_unlock_irqrestore(&q->lock, flags);
2968}
2969
2970EXPORT_SYMBOL(__wake_up);
2971
2972
2973
2974
2975void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
2976{
2977 __wake_up_common(q, mode, 1, 0, NULL);
2978}
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2994{
2995 unsigned long flags;
2996 int sync = 1;
2997
2998 if (unlikely(!q))
2999 return;
3000
3001 if (unlikely(!nr_exclusive))
3002 sync = 0;
3003
3004 spin_lock_irqsave(&q->lock, flags);
3005 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3006 spin_unlock_irqrestore(&q->lock, flags);
3007}
3008EXPORT_SYMBOL_GPL(__wake_up_sync);
3009
3010void fastcall complete(struct completion *x)
3011{
3012 unsigned long flags;
3013
3014 spin_lock_irqsave(&x->wait.lock, flags);
3015 x->done++;
3016 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3017 1, 0, NULL);
3018 spin_unlock_irqrestore(&x->wait.lock, flags);
3019}
3020EXPORT_SYMBOL(complete);
3021
3022void fastcall complete_all(struct completion *x)
3023{
3024 unsigned long flags;
3025
3026 spin_lock_irqsave(&x->wait.lock, flags);
3027 x->done += UINT_MAX/2;
3028 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3029 0, 0, NULL);
3030 spin_unlock_irqrestore(&x->wait.lock, flags);
3031}
3032EXPORT_SYMBOL(complete_all);
3033
3034void fastcall __sched wait_for_completion(struct completion *x)
3035{
3036 might_sleep();
3037 spin_lock_irq(&x->wait.lock);
3038 if (!x->done) {
3039 DECLARE_WAITQUEUE(wait, current);
3040
3041 wait.flags |= WQ_FLAG_EXCLUSIVE;
3042 __add_wait_queue_tail(&x->wait, &wait);
3043 do {
3044 __set_current_state(TASK_UNINTERRUPTIBLE);
3045 spin_unlock_irq(&x->wait.lock);
3046 schedule();
3047 spin_lock_irq(&x->wait.lock);
3048 } while (!x->done);
3049 __remove_wait_queue(&x->wait, &wait);
3050 }
3051 x->done--;
3052 spin_unlock_irq(&x->wait.lock);
3053}
3054EXPORT_SYMBOL(wait_for_completion);
3055
3056unsigned long fastcall __sched
3057wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3058{
3059 might_sleep();
3060
3061 spin_lock_irq(&x->wait.lock);
3062 if (!x->done) {
3063 DECLARE_WAITQUEUE(wait, current);
3064
3065 wait.flags |= WQ_FLAG_EXCLUSIVE;
3066 __add_wait_queue_tail(&x->wait, &wait);
3067 do {
3068 __set_current_state(TASK_UNINTERRUPTIBLE);
3069 spin_unlock_irq(&x->wait.lock);
3070 timeout = schedule_timeout(timeout);
3071 spin_lock_irq(&x->wait.lock);
3072 if (!timeout) {
3073 __remove_wait_queue(&x->wait, &wait);
3074 goto out;
3075 }
3076 } while (!x->done);
3077 __remove_wait_queue(&x->wait, &wait);
3078 }
3079 x->done--;
3080out:
3081 spin_unlock_irq(&x->wait.lock);
3082 return timeout;
3083}
3084EXPORT_SYMBOL(wait_for_completion_timeout);
3085
3086int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3087{
3088 int ret = 0;
3089
3090 might_sleep();
3091
3092 spin_lock_irq(&x->wait.lock);
3093 if (!x->done) {
3094 DECLARE_WAITQUEUE(wait, current);
3095
3096 wait.flags |= WQ_FLAG_EXCLUSIVE;
3097 __add_wait_queue_tail(&x->wait, &wait);
3098 do {
3099 if (signal_pending(current)) {
3100 ret = -ERESTARTSYS;
3101 __remove_wait_queue(&x->wait, &wait);
3102 goto out;
3103 }
3104 __set_current_state(TASK_INTERRUPTIBLE);
3105 spin_unlock_irq(&x->wait.lock);
3106 schedule();
3107 spin_lock_irq(&x->wait.lock);
3108 } while (!x->done);
3109 __remove_wait_queue(&x->wait, &wait);
3110 }
3111 x->done--;
3112out:
3113 spin_unlock_irq(&x->wait.lock);
3114
3115 return ret;
3116}
3117EXPORT_SYMBOL(wait_for_completion_interruptible);
3118
3119unsigned long fastcall __sched
3120wait_for_completion_interruptible_timeout(struct completion *x,
3121 unsigned long timeout)
3122{
3123 might_sleep();
3124
3125 spin_lock_irq(&x->wait.lock);
3126 if (!x->done) {
3127 DECLARE_WAITQUEUE(wait, current);
3128
3129 wait.flags |= WQ_FLAG_EXCLUSIVE;
3130 __add_wait_queue_tail(&x->wait, &wait);
3131 do {
3132 if (signal_pending(current)) {
3133 timeout = -ERESTARTSYS;
3134 __remove_wait_queue(&x->wait, &wait);
3135 goto out;
3136 }
3137 __set_current_state(TASK_INTERRUPTIBLE);
3138 spin_unlock_irq(&x->wait.lock);
3139 timeout = schedule_timeout(timeout);
3140 spin_lock_irq(&x->wait.lock);
3141 if (!timeout) {
3142 __remove_wait_queue(&x->wait, &wait);
3143 goto out;
3144 }
3145 } while (!x->done);
3146 __remove_wait_queue(&x->wait, &wait);
3147 }
3148 x->done--;
3149out:
3150 spin_unlock_irq(&x->wait.lock);
3151 return timeout;
3152}
3153EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3154
3155
3156#define SLEEP_ON_VAR \
3157 unsigned long flags; \
3158 wait_queue_t wait; \
3159 init_waitqueue_entry(&wait, current);
3160
3161#define SLEEP_ON_HEAD \
3162 spin_lock_irqsave(&q->lock,flags); \
3163 __add_wait_queue(q, &wait); \
3164 spin_unlock(&q->lock);
3165
3166#define SLEEP_ON_TAIL \
3167 spin_lock_irq(&q->lock); \
3168 __remove_wait_queue(q, &wait); \
3169 spin_unlock_irqrestore(&q->lock, flags);
3170
3171void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3172{
3173 SLEEP_ON_VAR
3174
3175 current->state = TASK_INTERRUPTIBLE;
3176
3177 SLEEP_ON_HEAD
3178 schedule();
3179 SLEEP_ON_TAIL
3180}
3181
3182EXPORT_SYMBOL(interruptible_sleep_on);
3183
3184long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3185{
3186 SLEEP_ON_VAR
3187
3188 current->state = TASK_INTERRUPTIBLE;
3189
3190 SLEEP_ON_HEAD
3191 timeout = schedule_timeout(timeout);
3192 SLEEP_ON_TAIL
3193
3194 return timeout;
3195}
3196
3197EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3198
3199void fastcall __sched sleep_on(wait_queue_head_t *q)
3200{
3201 SLEEP_ON_VAR
3202
3203 current->state = TASK_UNINTERRUPTIBLE;
3204
3205 SLEEP_ON_HEAD
3206 schedule();
3207 SLEEP_ON_TAIL
3208}
3209
3210EXPORT_SYMBOL(sleep_on);
3211
3212long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3213{
3214 SLEEP_ON_VAR
3215
3216 current->state = TASK_UNINTERRUPTIBLE;
3217
3218 SLEEP_ON_HEAD
3219 timeout = schedule_timeout(timeout);
3220 SLEEP_ON_TAIL
3221
3222 return timeout;
3223}
3224
3225EXPORT_SYMBOL(sleep_on_timeout);
3226
3227void set_user_nice(task_t *p, long nice)
3228{
3229 unsigned long flags;
3230 prio_array_t *array;
3231 runqueue_t *rq;
3232 int old_prio, new_prio, delta;
3233
3234 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3235 return;
3236
3237
3238
3239
3240 rq = task_rq_lock(p, &flags);
3241
3242
3243
3244
3245
3246
3247 if (rt_task(p)) {
3248 p->static_prio = NICE_TO_PRIO(nice);
3249 goto out_unlock;
3250 }
3251 array = p->array;
3252 if (array)
3253 dequeue_task(p, array);
3254
3255 old_prio = p->prio;
3256 new_prio = NICE_TO_PRIO(nice);
3257 delta = new_prio - old_prio;
3258 p->static_prio = NICE_TO_PRIO(nice);
3259 p->prio += delta;
3260
3261 if (array) {
3262 enqueue_task(p, array);
3263
3264
3265
3266
3267 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3268 resched_task(rq->curr);
3269 }
3270out_unlock:
3271 task_rq_unlock(rq, &flags);
3272}
3273
3274EXPORT_SYMBOL(set_user_nice);
3275
3276#ifdef __ARCH_WANT_SYS_NICE
3277
3278
3279
3280
3281
3282
3283
3284
3285asmlinkage long sys_nice(int increment)
3286{
3287 int retval;
3288 long nice;
3289
3290
3291
3292
3293
3294
3295 if (increment < 0) {
3296 if (!capable(CAP_SYS_NICE))
3297 return -EPERM;
3298 if (increment < -40)
3299 increment = -40;
3300 }
3301 if (increment > 40)
3302 increment = 40;
3303
3304 nice = PRIO_TO_NICE(current->static_prio) + increment;
3305 if (nice < -20)
3306 nice = -20;
3307 if (nice > 19)
3308 nice = 19;
3309
3310 retval = security_task_setnice(current, nice);
3311 if (retval)
3312 return retval;
3313
3314 set_user_nice(current, nice);
3315 return 0;
3316}
3317
3318#endif
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328int task_prio(const task_t *p)
3329{
3330 return p->prio - MAX_RT_PRIO;
3331}
3332
3333
3334
3335
3336
3337int task_nice(const task_t *p)
3338{
3339 return TASK_NICE(p);
3340}
3341
3342
3343
3344
3345
3346
3347#ifdef CONFIG_COMPAT
3348EXPORT_SYMBOL_GPL(task_nice);
3349#endif
3350
3351
3352
3353
3354
3355int idle_cpu(int cpu)
3356{
3357 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3358}
3359
3360EXPORT_SYMBOL_GPL(idle_cpu);
3361
3362
3363
3364
3365
3366task_t *idle_task(int cpu)
3367{
3368 return cpu_rq(cpu)->idle;
3369}
3370
3371
3372
3373
3374
3375static inline task_t *find_process_by_pid(pid_t pid)
3376{
3377 return pid ? find_task_by_pid(pid) : current;
3378}
3379
3380
3381static void __setscheduler(struct task_struct *p, int policy, int prio)
3382{
3383 BUG_ON(p->array);
3384 p->policy = policy;
3385 p->rt_priority = prio;
3386 if (policy != SCHED_NORMAL)
3387 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
3388 else
3389 p->prio = p->static_prio;
3390}
3391
3392
3393
3394
3395
3396
3397
3398
3399int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param)
3400{
3401 int retval;
3402 int oldprio, oldpolicy = -1;
3403 prio_array_t *array;
3404 unsigned long flags;
3405 runqueue_t *rq;
3406
3407recheck:
3408
3409 if (policy < 0)
3410 policy = oldpolicy = p->policy;
3411 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
3412 policy != SCHED_NORMAL)
3413 return -EINVAL;
3414
3415
3416
3417
3418 if (param->sched_priority < 0 ||
3419 param->sched_priority > MAX_USER_RT_PRIO-1)
3420 return -EINVAL;
3421 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
3422 return -EINVAL;
3423
3424 if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
3425 !capable(CAP_SYS_NICE))
3426 return -EPERM;
3427 if ((current->euid != p->euid) && (current->euid != p->uid) &&
3428 !capable(CAP_SYS_NICE))
3429 return -EPERM;
3430
3431 retval = security_task_setscheduler(p, policy, param);
3432 if (retval)
3433 return retval;
3434
3435
3436
3437
3438 rq = task_rq_lock(p, &flags);
3439
3440 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3441 policy = oldpolicy = -1;
3442 task_rq_unlock(rq, &flags);
3443 goto recheck;
3444 }
3445 array = p->array;
3446 if (array)
3447 deactivate_task(p, rq);
3448 oldprio = p->prio;
3449 __setscheduler(p, policy, param->sched_priority);
3450 if (array) {
3451 __activate_task(p, rq);
3452
3453
3454
3455
3456
3457 if (task_running(rq, p)) {
3458 if (p->prio > oldprio)
3459 resched_task(rq->curr);
3460 } else if (TASK_PREEMPTS_CURR(p, rq))
3461 resched_task(rq->curr);
3462 }
3463 task_rq_unlock(rq, &flags);
3464 return 0;
3465}
3466EXPORT_SYMBOL_GPL(sched_setscheduler);
3467
3468static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3469{
3470 int retval;
3471 struct sched_param lparam;
3472 struct task_struct *p;
3473
3474 if (!param || pid < 0)
3475 return -EINVAL;
3476 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3477 return -EFAULT;
3478 read_lock_irq(&tasklist_lock);
3479 p = find_process_by_pid(pid);
3480 if (!p) {
3481 read_unlock_irq(&tasklist_lock);
3482 return -ESRCH;
3483 }
3484 retval = sched_setscheduler(p, policy, &lparam);
3485 read_unlock_irq(&tasklist_lock);
3486 return retval;
3487}
3488
3489
3490
3491
3492
3493
3494
3495asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
3496 struct sched_param __user *param)
3497{
3498 return do_sched_setscheduler(pid, policy, param);
3499}
3500
3501
3502
3503
3504
3505
3506asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
3507{
3508 return do_sched_setscheduler(pid, -1, param);
3509}
3510
3511
3512
3513
3514
3515asmlinkage long sys_sched_getscheduler(pid_t pid)
3516{
3517 int retval = -EINVAL;
3518 task_t *p;
3519
3520 if (pid < 0)
3521 goto out_nounlock;
3522
3523 retval = -ESRCH;
3524 read_lock(&tasklist_lock);
3525 p = find_process_by_pid(pid);
3526 if (p) {
3527 retval = security_task_getscheduler(p);
3528 if (!retval)
3529 retval = p->policy;
3530 }
3531 read_unlock(&tasklist_lock);
3532
3533out_nounlock:
3534 return retval;
3535}
3536
3537
3538
3539
3540
3541
3542asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
3543{
3544 struct sched_param lp;
3545 int retval = -EINVAL;
3546 task_t *p;
3547
3548 if (!param || pid < 0)
3549 goto out_nounlock;
3550
3551 read_lock(&tasklist_lock);
3552 p = find_process_by_pid(pid);
3553 retval = -ESRCH;
3554 if (!p)
3555 goto out_unlock;
3556
3557 retval = security_task_getscheduler(p);
3558 if (retval)
3559 goto out_unlock;
3560
3561 lp.sched_priority = p->rt_priority;
3562 read_unlock(&tasklist_lock);
3563
3564
3565
3566
3567 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3568
3569out_nounlock:
3570 return retval;
3571
3572out_unlock:
3573 read_unlock(&tasklist_lock);
3574 return retval;
3575}
3576
3577long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3578{
3579 task_t *p;
3580 int retval;
3581
3582 lock_cpu_hotplug();
3583 read_lock(&tasklist_lock);
3584
3585 p = find_process_by_pid(pid);
3586 if (!p) {
3587 read_unlock(&tasklist_lock);
3588 unlock_cpu_hotplug();
3589 return -ESRCH;
3590 }
3591
3592
3593
3594
3595
3596
3597 get_task_struct(p);
3598 read_unlock(&tasklist_lock);
3599
3600 retval = -EPERM;
3601 if ((current->euid != p->euid) && (current->euid != p->uid) &&
3602 !capable(CAP_SYS_NICE))
3603 goto out_unlock;
3604
3605 retval = set_cpus_allowed(p, new_mask);
3606
3607out_unlock:
3608 put_task_struct(p);
3609 unlock_cpu_hotplug();
3610 return retval;
3611}
3612
3613static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3614 cpumask_t *new_mask)
3615{
3616 if (len < sizeof(cpumask_t)) {
3617 memset(new_mask, 0, sizeof(cpumask_t));
3618 } else if (len > sizeof(cpumask_t)) {
3619 len = sizeof(cpumask_t);
3620 }
3621 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
3622}
3623
3624
3625
3626
3627
3628
3629
3630asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
3631 unsigned long __user *user_mask_ptr)
3632{
3633 cpumask_t new_mask;
3634 int retval;
3635
3636 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
3637 if (retval)
3638 return retval;
3639
3640 return sched_setaffinity(pid, new_mask);
3641}
3642
3643
3644
3645
3646
3647
3648
3649
3650cpumask_t cpu_present_map;
3651EXPORT_SYMBOL(cpu_present_map);
3652
3653#ifndef CONFIG_SMP
3654cpumask_t cpu_online_map = CPU_MASK_ALL;
3655cpumask_t cpu_possible_map = CPU_MASK_ALL;
3656#endif
3657
3658long sched_getaffinity(pid_t pid, cpumask_t *mask)
3659{
3660 int retval;
3661 task_t *p;
3662
3663 lock_cpu_hotplug();
3664 read_lock(&tasklist_lock);
3665
3666 retval = -ESRCH;
3667 p = find_process_by_pid(pid);
3668 if (!p)
3669 goto out_unlock;
3670
3671 retval = 0;
3672 cpus_and(*mask, p->cpus_allowed, cpu_possible_map);
3673
3674out_unlock:
3675 read_unlock(&tasklist_lock);
3676 unlock_cpu_hotplug();
3677 if (retval)
3678 return retval;
3679
3680 return 0;
3681}
3682
3683
3684
3685
3686
3687
3688
3689asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
3690 unsigned long __user *user_mask_ptr)
3691{
3692 int ret;
3693 cpumask_t mask;
3694
3695 if (len < sizeof(cpumask_t))
3696 return -EINVAL;
3697
3698 ret = sched_getaffinity(pid, &mask);
3699 if (ret < 0)
3700 return ret;
3701
3702 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
3703 return -EFAULT;
3704
3705 return sizeof(cpumask_t);
3706}
3707
3708
3709
3710
3711
3712
3713
3714
3715asmlinkage long sys_sched_yield(void)
3716{
3717 runqueue_t *rq = this_rq_lock();
3718 prio_array_t *array = current->array;
3719 prio_array_t *target = rq->expired;
3720
3721 schedstat_inc(rq, yld_cnt);
3722
3723
3724
3725
3726
3727
3728
3729 if (rt_task(current))
3730 target = rq->active;
3731
3732 if (current->array->nr_active == 1) {
3733 schedstat_inc(rq, yld_act_empty);
3734 if (!rq->expired->nr_active)
3735 schedstat_inc(rq, yld_both_empty);
3736 } else if (!rq->expired->nr_active)
3737 schedstat_inc(rq, yld_exp_empty);
3738
3739 if (array != target) {
3740 dequeue_task(current, array);
3741 enqueue_task(current, target);
3742 } else
3743
3744
3745
3746 requeue_task(current, array);
3747
3748
3749
3750
3751
3752 __release(rq->lock);
3753 _raw_spin_unlock(&rq->lock);
3754 preempt_enable_no_resched();
3755
3756 schedule();
3757
3758 return 0;
3759}
3760
3761static inline void __cond_resched(void)
3762{
3763 do {
3764 add_preempt_count(PREEMPT_ACTIVE);
3765 schedule();
3766 sub_preempt_count(PREEMPT_ACTIVE);
3767 } while (need_resched());
3768}
3769
3770int __sched cond_resched(void)
3771{
3772 if (need_resched()) {
3773 __cond_resched();
3774 return 1;
3775 }
3776 return 0;
3777}
3778
3779EXPORT_SYMBOL(cond_resched);
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789int cond_resched_lock(spinlock_t * lock)
3790{
3791#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
3792 if (lock->break_lock) {
3793 lock->break_lock = 0;
3794 spin_unlock(lock);
3795 cpu_relax();
3796 spin_lock(lock);
3797 }
3798#endif
3799 if (need_resched()) {
3800 _raw_spin_unlock(lock);
3801 preempt_enable_no_resched();
3802 __cond_resched();
3803 spin_lock(lock);
3804 return 1;
3805 }
3806 return 0;
3807}
3808
3809EXPORT_SYMBOL(cond_resched_lock);
3810
3811int __sched cond_resched_softirq(void)
3812{
3813 BUG_ON(!in_softirq());
3814
3815 if (need_resched()) {
3816 __local_bh_enable();
3817 __cond_resched();
3818 local_bh_disable();
3819 return 1;
3820 }
3821 return 0;
3822}
3823
3824EXPORT_SYMBOL(cond_resched_softirq);
3825
3826
3827
3828
3829
3830
3831
3832
3833void __sched yield(void)
3834{
3835 set_current_state(TASK_RUNNING);
3836 sys_sched_yield();
3837}
3838
3839EXPORT_SYMBOL(yield);
3840
3841
3842
3843
3844
3845
3846
3847
3848void __sched io_schedule(void)
3849{
3850 struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
3851
3852 atomic_inc(&rq->nr_iowait);
3853 schedule();
3854 atomic_dec(&rq->nr_iowait);
3855}
3856
3857EXPORT_SYMBOL(io_schedule);
3858
3859long __sched io_schedule_timeout(long timeout)
3860{
3861 struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
3862 long ret;
3863
3864 atomic_inc(&rq->nr_iowait);
3865 ret = schedule_timeout(timeout);
3866 atomic_dec(&rq->nr_iowait);
3867 return ret;
3868}
3869
3870
3871
3872
3873
3874
3875
3876
3877asmlinkage long sys_sched_get_priority_max(int policy)
3878{
3879 int ret = -EINVAL;
3880
3881 switch (policy) {
3882 case SCHED_FIFO:
3883 case SCHED_RR:
3884 ret = MAX_USER_RT_PRIO-1;
3885 break;
3886 case SCHED_NORMAL:
3887 ret = 0;
3888 break;
3889 }
3890 return ret;
3891}
3892
3893
3894
3895
3896
3897
3898
3899
3900asmlinkage long sys_sched_get_priority_min(int policy)
3901{
3902 int ret = -EINVAL;
3903
3904 switch (policy) {
3905 case SCHED_FIFO:
3906 case SCHED_RR:
3907 ret = 1;
3908 break;
3909 case SCHED_NORMAL:
3910 ret = 0;
3911 }
3912 return ret;
3913}
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923asmlinkage
3924long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
3925{
3926 int retval = -EINVAL;
3927 struct timespec t;
3928 task_t *p;
3929
3930 if (pid < 0)
3931 goto out_nounlock;
3932
3933 retval = -ESRCH;
3934 read_lock(&tasklist_lock);
3935 p = find_process_by_pid(pid);
3936 if (!p)
3937 goto out_unlock;
3938
3939 retval = security_task_getscheduler(p);
3940 if (retval)
3941 goto out_unlock;
3942
3943 jiffies_to_timespec(p->policy & SCHED_FIFO ?
3944 0 : task_timeslice(p), &t);
3945 read_unlock(&tasklist_lock);
3946 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
3947out_nounlock:
3948 return retval;
3949out_unlock:
3950 read_unlock(&tasklist_lock);
3951 return retval;
3952}
3953
3954static inline struct task_struct *eldest_child(struct task_struct *p)
3955{
3956 if (list_empty(&p->children)) return NULL;
3957 return list_entry(p->children.next,struct task_struct,sibling);
3958}
3959
3960static inline struct task_struct *older_sibling(struct task_struct *p)
3961{
3962 if (p->sibling.prev==&p->parent->children) return NULL;
3963 return list_entry(p->sibling.prev,struct task_struct,sibling);
3964}
3965
3966static inline struct task_struct *younger_sibling(struct task_struct *p)
3967{
3968 if (p->sibling.next==&p->parent->children) return NULL;
3969 return list_entry(p->sibling.next,struct task_struct,sibling);
3970}
3971
3972static void show_task(task_t * p)
3973{
3974 task_t *relative;
3975 unsigned state;
3976 unsigned long free = 0;
3977 static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
3978
3979 printk("%-13.13s ", p->comm);
3980 state = p->state ? __ffs(p->state) + 1 : 0;
3981 if (state < ARRAY_SIZE(stat_nam))
3982 printk(stat_nam[state]);
3983 else
3984 printk("?");
3985#if (BITS_PER_LONG == 32)
3986 if (state == TASK_RUNNING)
3987 printk(" running ");
3988 else
3989 printk(" %08lX ", thread_saved_pc(p));
3990#else
3991 if (state == TASK_RUNNING)
3992 printk(" running task ");
3993 else
3994 printk(" %016lx ", thread_saved_pc(p));
3995#endif
3996#ifdef CONFIG_DEBUG_STACK_USAGE
3997 {
3998 unsigned long * n = (unsigned long *) (p->thread_info+1);
3999 while (!*n)
4000 n++;
4001 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
4002 }
4003#endif
4004 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
4005 if ((relative = eldest_child(p)))
4006 printk("%5d ", relative->pid);
4007 else
4008 printk(" ");
4009 if ((relative = younger_sibling(p)))
4010 printk("%7d", relative->pid);
4011 else
4012 printk(" ");
4013 if ((relative = older_sibling(p)))
4014 printk(" %5d", relative->pid);
4015 else
4016 printk(" ");
4017 if (!p->mm)
4018 printk(" (L-TLB)\n");
4019 else
4020 printk(" (NOTLB)\n");
4021
4022 if (state != TASK_RUNNING)
4023 show_stack(p, NULL);
4024}
4025
4026void show_state(void)
4027{
4028 task_t *g, *p;
4029
4030#if (BITS_PER_LONG == 32)
4031 printk("\n"
4032 " sibling\n");
4033 printk(" task PC pid father child younger older\n");
4034#else
4035 printk("\n"
4036 " sibling\n");
4037 printk(" task PC pid father child younger older\n");
4038#endif
4039 read_lock(&tasklist_lock);
4040 do_each_thread(g, p) {
4041
4042
4043
4044
4045 touch_nmi_watchdog();
4046 show_task(p);
4047 } while_each_thread(g, p);
4048
4049 read_unlock(&tasklist_lock);
4050}
4051
4052void __devinit init_idle(task_t *idle, int cpu)
4053{
4054 runqueue_t *rq = cpu_rq(cpu);
4055 unsigned long flags;
4056
4057 idle->sleep_avg = 0;
4058 idle->array = NULL;
4059 idle->prio = MAX_PRIO;
4060 idle->state = TASK_RUNNING;
4061 set_task_cpu(idle, cpu);
4062
4063 spin_lock_irqsave(&rq->lock, flags);
4064 rq->curr = rq->idle = idle;
4065 set_tsk_need_resched(idle);
4066 spin_unlock_irqrestore(&rq->lock, flags);
4067
4068
4069#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4070 idle->thread_info->preempt_count = (idle->lock_depth >= 0);
4071#else
4072 idle->thread_info->preempt_count = 0;
4073#endif
4074}
4075
4076
4077
4078
4079
4080
4081
4082
4083cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4084
4085#ifdef CONFIG_SMP
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111int set_cpus_allowed(task_t *p, cpumask_t new_mask)
4112{
4113 unsigned long flags;
4114 int ret = 0;
4115 migration_req_t req;
4116 runqueue_t *rq;
4117
4118 rq = task_rq_lock(p, &flags);
4119 if (!cpus_intersects(new_mask, cpu_online_map)) {
4120 ret = -EINVAL;
4121 goto out;
4122 }
4123
4124 p->cpus_allowed = new_mask;
4125
4126 if (cpu_isset(task_cpu(p), new_mask))
4127 goto out;
4128
4129 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4130
4131 task_rq_unlock(rq, &flags);
4132 wake_up_process(rq->migration_thread);
4133 wait_for_completion(&req.done);
4134 tlb_migrate_finish(p->mm);
4135 return 0;
4136 }
4137out:
4138 task_rq_unlock(rq, &flags);
4139 return ret;
4140}
4141
4142EXPORT_SYMBOL_GPL(set_cpus_allowed);
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4154{
4155 runqueue_t *rq_dest, *rq_src;
4156
4157 if (unlikely(cpu_is_offline(dest_cpu)))
4158 return;
4159
4160 rq_src = cpu_rq(src_cpu);
4161 rq_dest = cpu_rq(dest_cpu);
4162
4163 double_rq_lock(rq_src, rq_dest);
4164
4165 if (task_cpu(p) != src_cpu)
4166 goto out;
4167
4168 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4169 goto out;
4170
4171 set_task_cpu(p, dest_cpu);
4172 if (p->array) {
4173
4174
4175
4176
4177
4178
4179 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
4180 + rq_dest->timestamp_last_tick;
4181 deactivate_task(p, rq_src);
4182 activate_task(p, rq_dest, 0);
4183 if (TASK_PREEMPTS_CURR(p, rq_dest))
4184 resched_task(rq_dest->curr);
4185 }
4186
4187out:
4188 double_rq_unlock(rq_src, rq_dest);
4189}
4190
4191
4192
4193
4194
4195
4196static int migration_thread(void * data)
4197{
4198 runqueue_t *rq;
4199 int cpu = (long)data;
4200
4201 rq = cpu_rq(cpu);
4202 BUG_ON(rq->migration_thread != current);
4203
4204 set_current_state(TASK_INTERRUPTIBLE);
4205 while (!kthread_should_stop()) {
4206 struct list_head *head;
4207 migration_req_t *req;
4208
4209 if (current->flags & PF_FREEZE)
4210 refrigerator(PF_FREEZE);
4211
4212 spin_lock_irq(&rq->lock);
4213
4214 if (cpu_is_offline(cpu)) {
4215 spin_unlock_irq(&rq->lock);
4216 goto wait_to_die;
4217 }
4218
4219 if (rq->active_balance) {
4220 active_load_balance(rq, cpu);
4221 rq->active_balance = 0;
4222 }
4223
4224 head = &rq->migration_queue;
4225
4226 if (list_empty(head)) {
4227 spin_unlock_irq(&rq->lock);
4228 schedule();
4229 set_current_state(TASK_INTERRUPTIBLE);
4230 continue;
4231 }
4232 req = list_entry(head->next, migration_req_t, list);
4233 list_del_init(head->next);
4234
4235 if (req->type == REQ_MOVE_TASK) {
4236 spin_unlock(&rq->lock);
4237 __migrate_task(req->task, cpu, req->dest_cpu);
4238 local_irq_enable();
4239 } else if (req->type == REQ_SET_DOMAIN) {
4240 rq->sd = req->sd;
4241 spin_unlock_irq(&rq->lock);
4242 } else {
4243 spin_unlock_irq(&rq->lock);
4244 WARN_ON(1);
4245 }
4246
4247 complete(&req->done);
4248 }
4249 __set_current_state(TASK_RUNNING);
4250 return 0;
4251
4252wait_to_die:
4253
4254 set_current_state(TASK_INTERRUPTIBLE);
4255 while (!kthread_should_stop()) {
4256 schedule();
4257 set_current_state(TASK_INTERRUPTIBLE);
4258 }
4259 __set_current_state(TASK_RUNNING);
4260 return 0;
4261}
4262
4263#ifdef CONFIG_HOTPLUG_CPU
4264
4265static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4266{
4267 int dest_cpu;
4268 cpumask_t mask;
4269
4270
4271 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4272 cpus_and(mask, mask, tsk->cpus_allowed);
4273 dest_cpu = any_online_cpu(mask);
4274
4275
4276 if (dest_cpu == NR_CPUS)
4277 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4278
4279
4280 if (dest_cpu == NR_CPUS) {
4281 cpus_setall(tsk->cpus_allowed);
4282 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4283
4284
4285
4286
4287
4288
4289 if (tsk->mm && printk_ratelimit())
4290 printk(KERN_INFO "process %d (%s) no "
4291 "longer affine to cpu%d\n",
4292 tsk->pid, tsk->comm, dead_cpu);
4293 }
4294 __migrate_task(tsk, dead_cpu, dest_cpu);
4295}
4296
4297
4298
4299
4300
4301
4302
4303
4304static void migrate_nr_uninterruptible(runqueue_t *rq_src)
4305{
4306 runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
4307 unsigned long flags;
4308
4309 local_irq_save(flags);
4310 double_rq_lock(rq_src, rq_dest);
4311 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
4312 rq_src->nr_uninterruptible = 0;
4313 double_rq_unlock(rq_src, rq_dest);
4314 local_irq_restore(flags);
4315}
4316
4317
4318static void migrate_live_tasks(int src_cpu)
4319{
4320 struct task_struct *tsk, *t;
4321
4322 write_lock_irq(&tasklist_lock);
4323
4324 do_each_thread(t, tsk) {
4325 if (tsk == current)
4326 continue;
4327
4328 if (task_cpu(tsk) == src_cpu)
4329 move_task_off_dead_cpu(src_cpu, tsk);
4330 } while_each_thread(t, tsk);
4331
4332 write_unlock_irq(&tasklist_lock);
4333}
4334
4335
4336
4337
4338
4339void sched_idle_next(void)
4340{
4341 int cpu = smp_processor_id();
4342 runqueue_t *rq = this_rq();
4343 struct task_struct *p = rq->idle;
4344 unsigned long flags;
4345
4346
4347 BUG_ON(cpu_online(cpu));
4348
4349
4350
4351
4352 spin_lock_irqsave(&rq->lock, flags);
4353
4354 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4355
4356 __activate_idle_task(p, rq);
4357
4358 spin_unlock_irqrestore(&rq->lock, flags);
4359}
4360
4361
4362
4363
4364void idle_task_exit(void)
4365{
4366 struct mm_struct *mm = current->active_mm;
4367
4368 BUG_ON(cpu_online(smp_processor_id()));
4369
4370 if (mm != &init_mm)
4371 switch_mm(mm, &init_mm, current);
4372 mmdrop(mm);
4373}
4374
4375static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
4376{
4377 struct runqueue *rq = cpu_rq(dead_cpu);
4378
4379
4380 BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD);
4381
4382
4383 BUG_ON(tsk->flags & PF_DEAD);
4384
4385 get_task_struct(tsk);
4386
4387
4388
4389
4390
4391
4392 spin_unlock_irq(&rq->lock);
4393 move_task_off_dead_cpu(dead_cpu, tsk);
4394 spin_lock_irq(&rq->lock);
4395
4396 put_task_struct(tsk);
4397}
4398
4399
4400static void migrate_dead_tasks(unsigned int dead_cpu)
4401{
4402 unsigned arr, i;
4403 struct runqueue *rq = cpu_rq(dead_cpu);
4404
4405 for (arr = 0; arr < 2; arr++) {
4406 for (i = 0; i < MAX_PRIO; i++) {
4407 struct list_head *list = &rq->arrays[arr].queue[i];
4408 while (!list_empty(list))
4409 migrate_dead(dead_cpu,
4410 list_entry(list->next, task_t,
4411 run_list));
4412 }
4413 }
4414}
4415#endif
4416
4417
4418
4419
4420
4421static int migration_call(struct notifier_block *nfb, unsigned long action,
4422 void *hcpu)
4423{
4424 int cpu = (long)hcpu;
4425 struct task_struct *p;
4426 struct runqueue *rq;
4427 unsigned long flags;
4428
4429 switch (action) {
4430 case CPU_UP_PREPARE:
4431 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
4432 if (IS_ERR(p))
4433 return NOTIFY_BAD;
4434 p->flags |= PF_NOFREEZE;
4435 kthread_bind(p, cpu);
4436
4437 rq = task_rq_lock(p, &flags);
4438 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4439 task_rq_unlock(rq, &flags);
4440 cpu_rq(cpu)->migration_thread = p;
4441 break;
4442 case CPU_ONLINE:
4443
4444 wake_up_process(cpu_rq(cpu)->migration_thread);
4445 break;
4446#ifdef CONFIG_HOTPLUG_CPU
4447 case CPU_UP_CANCELED:
4448
4449 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
4450 kthread_stop(cpu_rq(cpu)->migration_thread);
4451 cpu_rq(cpu)->migration_thread = NULL;
4452 break;
4453 case CPU_DEAD:
4454 migrate_live_tasks(cpu);
4455 rq = cpu_rq(cpu);
4456 kthread_stop(rq->migration_thread);
4457 rq->migration_thread = NULL;
4458
4459 rq = task_rq_lock(rq->idle, &flags);
4460 deactivate_task(rq->idle, rq);
4461 rq->idle->static_prio = MAX_PRIO;
4462 __setscheduler(rq->idle, SCHED_NORMAL, 0);
4463 migrate_dead_tasks(cpu);
4464 task_rq_unlock(rq, &flags);
4465 migrate_nr_uninterruptible(rq);
4466 BUG_ON(rq->nr_running != 0);
4467
4468
4469
4470
4471 spin_lock_irq(&rq->lock);
4472 while (!list_empty(&rq->migration_queue)) {
4473 migration_req_t *req;
4474 req = list_entry(rq->migration_queue.next,
4475 migration_req_t, list);
4476 BUG_ON(req->type != REQ_MOVE_TASK);
4477 list_del_init(&req->list);
4478 complete(&req->done);
4479 }
4480 spin_unlock_irq(&rq->lock);
4481 break;
4482#endif
4483 }
4484 return NOTIFY_OK;
4485}
4486
4487
4488
4489
4490static struct notifier_block __devinitdata migration_notifier = {
4491 .notifier_call = migration_call,
4492 .priority = 10
4493};
4494
4495int __init migration_init(void)
4496{
4497 void *cpu = (void *)(long)smp_processor_id();
4498
4499 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
4500 migration_call(&migration_notifier, CPU_ONLINE, cpu);
4501 register_cpu_notifier(&migration_notifier);
4502 return 0;
4503}
4504#endif
4505
4506#ifdef CONFIG_SMP
4507#define SCHED_DOMAIN_DEBUG
4508#ifdef SCHED_DOMAIN_DEBUG
4509static void sched_domain_debug(struct sched_domain *sd, int cpu)
4510{
4511 int level = 0;
4512
4513 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
4514
4515 do {
4516 int i;
4517 char str[NR_CPUS];
4518 struct sched_group *group = sd->groups;
4519 cpumask_t groupmask;
4520
4521 cpumask_scnprintf(str, NR_CPUS, sd->span);
4522 cpus_clear(groupmask);
4523
4524 printk(KERN_DEBUG);
4525 for (i = 0; i < level + 1; i++)
4526 printk(" ");
4527 printk("domain %d: ", level);
4528
4529 if (!(sd->flags & SD_LOAD_BALANCE)) {
4530 printk("does not load-balance\n");
4531 if (sd->parent)
4532 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
4533 break;
4534 }
4535
4536 printk("span %s\n", str);
4537
4538 if (!cpu_isset(cpu, sd->span))
4539 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
4540 if (!cpu_isset(cpu, group->cpumask))
4541 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
4542
4543 printk(KERN_DEBUG);
4544 for (i = 0; i < level + 2; i++)
4545 printk(" ");
4546 printk("groups:");
4547 do {
4548 if (!group) {
4549 printk("\n");
4550 printk(KERN_ERR "ERROR: group is NULL\n");
4551 break;
4552 }
4553
4554 if (!group->cpu_power) {
4555 printk("\n");
4556 printk(KERN_ERR "ERROR: domain->cpu_power not set\n");
4557 }
4558
4559 if (!cpus_weight(group->cpumask)) {
4560 printk("\n");
4561 printk(KERN_ERR "ERROR: empty group\n");
4562 }
4563
4564 if (cpus_intersects(groupmask, group->cpumask)) {
4565 printk("\n");
4566 printk(KERN_ERR "ERROR: repeated CPUs\n");
4567 }
4568
4569 cpus_or(groupmask, groupmask, group->cpumask);
4570
4571 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
4572 printk(" %s", str);
4573
4574 group = group->next;
4575 } while (group != sd->groups);
4576 printk("\n");
4577
4578 if (!cpus_equal(sd->span, groupmask))
4579 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
4580
4581 level++;
4582 sd = sd->parent;
4583
4584 if (sd) {
4585 if (!cpus_subset(groupmask, sd->span))
4586 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
4587 }
4588
4589 } while (sd);
4590}
4591#else
4592#define sched_domain_debug(sd, cpu) {}
4593#endif
4594
4595
4596
4597
4598
4599void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
4600{
4601 migration_req_t req;
4602 unsigned long flags;
4603 runqueue_t *rq = cpu_rq(cpu);
4604 int local = 1;
4605
4606 sched_domain_debug(sd, cpu);
4607
4608 spin_lock_irqsave(&rq->lock, flags);
4609
4610 if (cpu == smp_processor_id() || !cpu_online(cpu)) {
4611 rq->sd = sd;
4612 } else {
4613 init_completion(&req.done);
4614 req.type = REQ_SET_DOMAIN;
4615 req.sd = sd;
4616 list_add(&req.list, &rq->migration_queue);
4617 local = 0;
4618 }
4619
4620 spin_unlock_irqrestore(&rq->lock, flags);
4621
4622 if (!local) {
4623 wake_up_process(rq->migration_thread);
4624 wait_for_completion(&req.done);
4625 }
4626}
4627
4628
4629cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
4630
4631
4632static int __init isolated_cpu_setup(char *str)
4633{
4634 int ints[NR_CPUS], i;
4635
4636 str = get_options(str, ARRAY_SIZE(ints), ints);
4637 cpus_clear(cpu_isolated_map);
4638 for (i = 1; i <= ints[0]; i++)
4639 if (ints[i] < NR_CPUS)
4640 cpu_set(ints[i], cpu_isolated_map);
4641 return 1;
4642}
4643
4644__setup ("isolcpus=", isolated_cpu_setup);
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657void __devinit init_sched_build_groups(struct sched_group groups[],
4658 cpumask_t span, int (*group_fn)(int cpu))
4659{
4660 struct sched_group *first = NULL, *last = NULL;
4661 cpumask_t covered = CPU_MASK_NONE;
4662 int i;
4663
4664 for_each_cpu_mask(i, span) {
4665 int group = group_fn(i);
4666 struct sched_group *sg = &groups[group];
4667 int j;
4668
4669 if (cpu_isset(i, covered))
4670 continue;
4671
4672 sg->cpumask = CPU_MASK_NONE;
4673 sg->cpu_power = 0;
4674
4675 for_each_cpu_mask(j, span) {
4676 if (group_fn(j) != group)
4677 continue;
4678
4679 cpu_set(j, covered);
4680 cpu_set(j, sg->cpumask);
4681 }
4682 if (!first)
4683 first = sg;
4684 if (last)
4685 last->next = sg;
4686 last = sg;
4687 }
4688 last->next = first;
4689}
4690
4691
4692#ifdef ARCH_HAS_SCHED_DOMAIN
4693extern void __devinit arch_init_sched_domains(void);
4694extern void __devinit arch_destroy_sched_domains(void);
4695#else
4696#ifdef CONFIG_SCHED_SMT
4697static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4698static struct sched_group sched_group_cpus[NR_CPUS];
4699static int __devinit cpu_to_cpu_group(int cpu)
4700{
4701 return cpu;
4702}
4703#endif
4704
4705static DEFINE_PER_CPU(struct sched_domain, phys_domains);
4706static struct sched_group sched_group_phys[NR_CPUS];
4707static int __devinit cpu_to_phys_group(int cpu)
4708{
4709#ifdef CONFIG_SCHED_SMT
4710 return first_cpu(cpu_sibling_map[cpu]);
4711#else
4712 return cpu;
4713#endif
4714}
4715
4716#ifdef CONFIG_NUMA
4717
4718static DEFINE_PER_CPU(struct sched_domain, node_domains);
4719static struct sched_group sched_group_nodes[MAX_NUMNODES];
4720static int __devinit cpu_to_node_group(int cpu)
4721{
4722 return cpu_to_node(cpu);
4723}
4724#endif
4725
4726#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4727
4728
4729
4730
4731
4732static void check_sibling_maps(void)
4733{
4734 int i, j;
4735
4736 for_each_online_cpu(i) {
4737 for_each_cpu_mask(j, cpu_sibling_map[i]) {
4738 if (cpu_to_node(i) != cpu_to_node(j)) {
4739 printk(KERN_INFO "warning: CPU %d siblings map "
4740 "to different node - isolating "
4741 "them.\n", i);
4742 cpu_sibling_map[i] = cpumask_of_cpu(i);
4743 break;
4744 }
4745 }
4746 }
4747}
4748#endif
4749
4750
4751
4752
4753static void __devinit arch_init_sched_domains(void)
4754{
4755 int i;
4756 cpumask_t cpu_default_map;
4757
4758#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4759 check_sibling_maps();
4760#endif
4761
4762
4763
4764
4765
4766 cpus_complement(cpu_default_map, cpu_isolated_map);
4767 cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
4768
4769
4770
4771
4772 for_each_cpu_mask(i, cpu_default_map) {
4773 int group;
4774 struct sched_domain *sd = NULL, *p;
4775 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
4776
4777 cpus_and(nodemask, nodemask, cpu_default_map);
4778
4779#ifdef CONFIG_NUMA
4780 sd = &per_cpu(node_domains, i);
4781 group = cpu_to_node_group(i);
4782 *sd = SD_NODE_INIT;
4783 sd->span = cpu_default_map;
4784 sd->groups = &sched_group_nodes[group];
4785#endif
4786
4787 p = sd;
4788 sd = &per_cpu(phys_domains, i);
4789 group = cpu_to_phys_group(i);
4790 *sd = SD_CPU_INIT;
4791 sd->span = nodemask;
4792 sd->parent = p;
4793 sd->groups = &sched_group_phys[group];
4794
4795#ifdef CONFIG_SCHED_SMT
4796 p = sd;
4797 sd = &per_cpu(cpu_domains, i);
4798 group = cpu_to_cpu_group(i);
4799 *sd = SD_SIBLING_INIT;
4800 sd->span = cpu_sibling_map[i];
4801 cpus_and(sd->span, sd->span, cpu_default_map);
4802 sd->parent = p;
4803 sd->groups = &sched_group_cpus[group];
4804#endif
4805 }
4806
4807#ifdef CONFIG_SCHED_SMT
4808
4809 for_each_online_cpu(i) {
4810 cpumask_t this_sibling_map = cpu_sibling_map[i];
4811 cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
4812 if (i != first_cpu(this_sibling_map))
4813 continue;
4814
4815 init_sched_build_groups(sched_group_cpus, this_sibling_map,
4816 &cpu_to_cpu_group);
4817 }
4818#endif
4819
4820
4821 for (i = 0; i < MAX_NUMNODES; i++) {
4822 cpumask_t nodemask = node_to_cpumask(i);
4823
4824 cpus_and(nodemask, nodemask, cpu_default_map);
4825 if (cpus_empty(nodemask))
4826 continue;
4827
4828 init_sched_build_groups(sched_group_phys, nodemask,
4829 &cpu_to_phys_group);
4830 }
4831
4832#ifdef CONFIG_NUMA
4833
4834 init_sched_build_groups(sched_group_nodes, cpu_default_map,
4835 &cpu_to_node_group);
4836#endif
4837
4838
4839 for_each_cpu_mask(i, cpu_default_map) {
4840 int power;
4841 struct sched_domain *sd;
4842#ifdef CONFIG_SCHED_SMT
4843 sd = &per_cpu(cpu_domains, i);
4844 power = SCHED_LOAD_SCALE;
4845 sd->groups->cpu_power = power;
4846#endif
4847
4848 sd = &per_cpu(phys_domains, i);
4849 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
4850 (cpus_weight(sd->groups->cpumask)-1) / 10;
4851 sd->groups->cpu_power = power;
4852
4853#ifdef CONFIG_NUMA
4854 if (i == first_cpu(sd->groups->cpumask)) {
4855
4856 sd = &per_cpu(node_domains, i);
4857 sd->groups->cpu_power += power;
4858 }
4859#endif
4860 }
4861
4862
4863 for_each_online_cpu(i) {
4864 struct sched_domain *sd;
4865#ifdef CONFIG_SCHED_SMT
4866 sd = &per_cpu(cpu_domains, i);
4867#else
4868 sd = &per_cpu(phys_domains, i);
4869#endif
4870 cpu_attach_domain(sd, i);
4871 }
4872}
4873
4874#ifdef CONFIG_HOTPLUG_CPU
4875static void __devinit arch_destroy_sched_domains(void)
4876{
4877
4878}
4879#endif
4880
4881#endif
4882
4883
4884
4885
4886
4887
4888static struct sched_domain sched_domain_dummy;
4889
4890#ifdef CONFIG_HOTPLUG_CPU
4891
4892
4893
4894
4895
4896
4897static int update_sched_domains(struct notifier_block *nfb,
4898 unsigned long action, void *hcpu)
4899{
4900 int i;
4901
4902 switch (action) {
4903 case CPU_UP_PREPARE:
4904 case CPU_DOWN_PREPARE:
4905 for_each_online_cpu(i)
4906 cpu_attach_domain(&sched_domain_dummy, i);
4907 arch_destroy_sched_domains();
4908 return NOTIFY_OK;
4909
4910 case CPU_UP_CANCELED:
4911 case CPU_DOWN_FAILED:
4912 case CPU_ONLINE:
4913 case CPU_DEAD:
4914
4915
4916
4917 break;
4918 default:
4919 return NOTIFY_DONE;
4920 }
4921
4922
4923 arch_init_sched_domains();
4924
4925 return NOTIFY_OK;
4926}
4927#endif
4928
4929void __init sched_init_smp(void)
4930{
4931 lock_cpu_hotplug();
4932 arch_init_sched_domains();
4933 unlock_cpu_hotplug();
4934
4935 hotcpu_notifier(update_sched_domains, 0);
4936}
4937#else
4938void __init sched_init_smp(void)
4939{
4940}
4941#endif
4942
4943int in_sched_functions(unsigned long addr)
4944{
4945
4946 extern char __sched_text_start[], __sched_text_end[];
4947 return in_lock_functions(addr) ||
4948 (addr >= (unsigned long)__sched_text_start
4949 && addr < (unsigned long)__sched_text_end);
4950}
4951
4952void __init sched_init(void)
4953{
4954 runqueue_t *rq;
4955 int i, j, k;
4956
4957 for (i = 0; i < NR_CPUS; i++) {
4958 prio_array_t *array;
4959
4960 rq = cpu_rq(i);
4961 spin_lock_init(&rq->lock);
4962 rq->active = rq->arrays;
4963 rq->expired = rq->arrays + 1;
4964 rq->best_expired_prio = MAX_PRIO;
4965
4966#ifdef CONFIG_SMP
4967 rq->sd = &sched_domain_dummy;
4968 rq->cpu_load = 0;
4969 rq->active_balance = 0;
4970 rq->push_cpu = 0;
4971 rq->migration_thread = NULL;
4972 INIT_LIST_HEAD(&rq->migration_queue);
4973#endif
4974 atomic_set(&rq->nr_iowait, 0);
4975
4976 for (j = 0; j < 2; j++) {
4977 array = rq->arrays + j;
4978 for (k = 0; k < MAX_PRIO; k++) {
4979 INIT_LIST_HEAD(array->queue + k);
4980 __clear_bit(k, array->bitmap);
4981 }
4982
4983 __set_bit(MAX_PRIO, array->bitmap);
4984 }
4985 }
4986
4987
4988
4989
4990 atomic_inc(&init_mm.mm_count);
4991 enter_lazy_tlb(&init_mm, current);
4992
4993
4994
4995
4996
4997
4998
4999 init_idle(current, smp_processor_id());
5000}
5001
5002#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
5003void __might_sleep(char *file, int line)
5004{
5005#if defined(in_atomic)
5006 static unsigned long prev_jiffy;
5007
5008 if ((in_atomic() || irqs_disabled()) &&
5009 system_state == SYSTEM_RUNNING && !oops_in_progress) {
5010 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
5011 return;
5012 prev_jiffy = jiffies;
5013 printk(KERN_ERR "Debug: sleeping function called from invalid"
5014 " context at %s:%d\n", file, line);
5015 printk("in_atomic():%d, irqs_disabled():%d\n",
5016 in_atomic(), irqs_disabled());
5017 dump_stack();
5018 }
5019#endif
5020}
5021EXPORT_SYMBOL(__might_sleep);
5022#endif
5023
5024#ifdef CONFIG_MAGIC_SYSRQ
5025void normalize_rt_tasks(void)
5026{
5027 struct task_struct *p;
5028 prio_array_t *array;
5029 unsigned long flags;
5030 runqueue_t *rq;
5031
5032 read_lock_irq(&tasklist_lock);
5033 for_each_process (p) {
5034 if (!rt_task(p))
5035 continue;
5036
5037 rq = task_rq_lock(p, &flags);
5038
5039 array = p->array;
5040 if (array)
5041 deactivate_task(p, task_rq(p));
5042 __setscheduler(p, SCHED_NORMAL, 0);
5043 if (array) {
5044 __activate_task(p, task_rq(p));
5045 resched_task(rq->curr);
5046 }
5047
5048 task_rq_unlock(rq, &flags);
5049 }
5050 read_unlock_irq(&tasklist_lock);
5051}
5052
5053#endif
5054