1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21#include <linux/mm.h>
22#include <linux/module.h>
23#include <linux/nmi.h>
24#include <linux/init.h>
25#include <asm/uaccess.h>
26#include <linux/highmem.h>
27#include <linux/smp_lock.h>
28#include <asm/mmu_context.h>
29#include <linux/interrupt.h>
30#include <linux/completion.h>
31#include <linux/kernel_stat.h>
32#include <linux/security.h>
33#include <linux/notifier.h>
34#include <linux/suspend.h>
35#include <linux/blkdev.h>
36#include <linux/delay.h>
37#include <linux/smp.h>
38#include <linux/timer.h>
39#include <linux/rcupdate.h>
40#include <linux/cpu.h>
41#include <linux/percpu.h>
42#include <linux/kthread.h>
43#include <asm/tlb.h>
44
45#include <asm/unistd.h>
46
/*
 * cpu_to_node_mask(cpu): on NUMA builds this expands to
 * node_to_cpumask(cpu_to_node(cpu)); otherwise it is the online CPU map.
 */
#ifndef CONFIG_NUMA
#define cpu_to_node_mask(cpu)	(cpu_online_map)
#else
#define cpu_to_node_mask(cpu)	node_to_cpumask(cpu_to_node(cpu))
#endif
52
53
54
55
56
57
/*
 * Conversions between a nice value and a static priority.  Nice 0 maps
 * to MAX_RT_PRIO + 20; TASK_NICE() recovers the nice value from a task's
 * static_prio.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
61
62
63
64
65
66
/*
 * "User priority" is a priority with the MAX_RT_PRIO offset removed.
 * MAX_USER_PRIO is the user-priority value of MAX_PRIO.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

/*
 * AVG_TIMESLICE applies the same linear interpolation as BASE_TIMESLICE(),
 * evaluated at the static priority of a nice-0 task.
 */
#define AVG_TIMESLICE	(MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\
			(MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1)))
72
73
74
75
/*
 * Nanosecond <-> jiffy conversions.  One jiffy is (1000000000 / HZ)
 * nanoseconds; both directions use integer arithmetic.
 */
#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
78
79
80
81
82
83
84
85
/*
 * Scheduler tunables.
 *
 * MIN_TIMESLICE and MAX_TIMESLICE are 10 and 200 milliseconds expressed
 * in jiffies.  The remaining constants feed the sleep-average, bonus and
 * interactive-credit macros and functions that follow.
 */
#define MIN_TIMESLICE		( 10 * HZ / 1000)
#define MAX_TIMESLICE		(200 * HZ / 1000)
#define ON_RUNQUEUE_WEIGHT	 30
#define CHILD_PENALTY		 95
#define PARENT_PENALTY		100
#define EXIT_WEIGHT		  3
#define PRIO_BONUS_RATIO	 25
/* The bonus range is PRIO_BONUS_RATIO percent of the user priority range. */
#define MAX_BONUS		(MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
#define INTERACTIVE_DELTA	  2
#define MAX_SLEEP_AVG		(AVG_TIMESLICE * MAX_BONUS)
#define STARVATION_LIMIT	(MAX_SLEEP_AVG)
#define NS_MAX_SLEEP_AVG	(JIFFIES_TO_NS(MAX_SLEEP_AVG))
/* Magnitude threshold tested by HIGH_CREDIT() and LOW_CREDIT(). */
#define CREDIT_LIMIT		100
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
/*
 * CURRENT_BONUS(p): p->sleep_avg (nanoseconds) converted to jiffies and
 * scaled from the range [0, MAX_SLEEP_AVG] onto [0, MAX_BONUS].
 */
#define CURRENT_BONUS(p) \
	(NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
		MAX_SLEEP_AVG)

/*
 * GRANULARITY_UP(p): MIN_TIMESLICE doubled once for every bonus point
 * the task is short of MAX_BONUS beyond the first.  The GNU "?:" form
 * turns a zero shortfall into 1, so the shift count is never negative.
 */
#define GRANULARITY_UP(p)	(MIN_TIMESLICE * \
	(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))

/* On SMP the granularity is additionally multiplied by num_online_cpus(). */
#ifdef CONFIG_SMP
#define TIMESLICE_GRANULARITY(p)	(GRANULARITY_UP(p) * \
					num_online_cpus())
#else
#define TIMESLICE_GRANULARITY(p)	(GRANULARITY_UP(p))
#endif
140
/*
 * SCALE(v1, v1_max, v2_max): rescale v1 from the range [0, v1_max] onto
 * [0, v2_max] using integer arithmetic (multiply first, then divide).
 *
 * The whole expansion is parenthesized so that the macro behaves as a
 * single operand whatever operator sits next to it at the use site.
 */
#define SCALE(v1,v1_max,v2_max) \
	((v1) * (v2_max) / (v1_max))
143
/*
 * DELTA(p): the task's nice value rescaled from a 40-wide range onto
 * MAX_BONUS, plus INTERACTIVE_DELTA.
 */
#define DELTA(p) \
	(SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)

/* True when p's dynamic priority is at least DELTA(p) better than static. */
#define TASK_INTERACTIVE(p) \
	((p)->prio <= (p)->static_prio - DELTA(p))

/* Sleep length, in nanoseconds, derived from MAX_SLEEP_AVG and DELTA(p). */
#define INTERACTIVE_SLEEP(p) \
	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))

/* interactive_credit beyond +CREDIT_LIMIT / below -CREDIT_LIMIT. */
#define HIGH_CREDIT(p) \
	((p)->interactive_credit > CREDIT_LIMIT)

#define LOW_CREDIT(p) \
	((p)->interactive_credit < -CREDIT_LIMIT)

/* A numerically lower prio than the runqueue's current task preempts it. */
#define TASK_PREEMPTS_CURR(p, rq) \
	((p)->prio < (rq)->curr->prio)
162
163
164
165
166
167
168
169
170
171
172
173
174#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
175 ((MAX_TIMESLICE - MIN_TIMESLICE) * \
176 (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1)))
177
178static unsigned int task_timeslice(task_t *p)
179{
180 return BASE_TIMESLICE(p);
181}
182
/*
 * task_hot(p, now, sd): true when less than sd->cache_hot_time has
 * elapsed between p->timestamp and @now.
 */
#define task_hot(p, now, sd) \
	((now) - (p)->timestamp < (sd)->cache_hot_time)
184
185
186
187
188
189#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
190
191typedef struct runqueue runqueue_t;
192
193struct prio_array {
194 unsigned int nr_active;
195 unsigned long bitmap[BITMAP_SIZE];
196 struct list_head queue[MAX_PRIO];
197};
198
199
200
201
202
203
204
205
206struct runqueue {
207 spinlock_t lock;
208
209
210
211
212
213 unsigned long nr_running;
214#ifdef CONFIG_SMP
215 unsigned long cpu_load;
216#endif
217 unsigned long long nr_switches;
218 unsigned long expired_timestamp, nr_uninterruptible;
219 unsigned long long timestamp_last_tick;
220 task_t *curr, *idle;
221 struct mm_struct *prev_mm;
222 prio_array_t *active, *expired, arrays[2];
223 int best_expired_prio;
224 atomic_t nr_iowait;
225
226#ifdef CONFIG_SMP
227 struct sched_domain *sd;
228
229
230 int active_balance;
231 int push_cpu;
232
233 task_t *migration_thread;
234 struct list_head migration_queue;
235#endif
236};
237
238static DEFINE_PER_CPU(struct runqueue, runqueues);
239
240#define for_each_domain(cpu, domain) \
241 for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
242
243#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
244#define this_rq() (&__get_cpu_var(runqueues))
245#define task_rq(p) cpu_rq(task_cpu(p))
246#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
247
248
249
250
/*
 * Default context-switch hooks, used when the architecture does not
 * define prepare_arch_switch: nothing to prepare, the runqueue lock is
 * released (IRQs re-enabled) when the switch finishes, and a task counts
 * as running exactly when it is the runqueue's current task.
 */
#ifndef prepare_arch_switch
# define prepare_arch_switch(rq, next)	do { } while (0)
# define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock)
# define task_running(rq, p)		((rq)->curr == (p))
#endif
256
257
258
259
260
261
262static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
263{
264 struct runqueue *rq;
265
266repeat_lock_task:
267 local_irq_save(*flags);
268 rq = task_rq(p);
269 spin_lock(&rq->lock);
270 if (unlikely(rq != task_rq(p))) {
271 spin_unlock_irqrestore(&rq->lock, *flags);
272 goto repeat_lock_task;
273 }
274 return rq;
275}
276
277static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
278{
279 spin_unlock_irqrestore(&rq->lock, *flags);
280}
281
282
283
284
285static runqueue_t *this_rq_lock(void)
286{
287 runqueue_t *rq;
288
289 local_irq_disable();
290 rq = this_rq();
291 spin_lock(&rq->lock);
292
293 return rq;
294}
295
296static inline void rq_unlock(runqueue_t *rq)
297{
298 spin_unlock_irq(&rq->lock);
299}
300
301
302
303
304static void dequeue_task(struct task_struct *p, prio_array_t *array)
305{
306 array->nr_active--;
307 list_del(&p->run_list);
308 if (list_empty(array->queue + p->prio))
309 __clear_bit(p->prio, array->bitmap);
310}
311
312static void enqueue_task(struct task_struct *p, prio_array_t *array)
313{
314 list_add_tail(&p->run_list, array->queue + p->prio);
315 __set_bit(p->prio, array->bitmap);
316 array->nr_active++;
317 p->array = array;
318}
319
320
321
322
323
324
325static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
326{
327 list_add(&p->run_list, array->queue + p->prio);
328 __set_bit(p->prio, array->bitmap);
329 array->nr_active++;
330 p->array = array;
331}
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347static int effective_prio(task_t *p)
348{
349 int bonus, prio;
350
351 if (rt_task(p))
352 return p->prio;
353
354 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
355
356 prio = p->static_prio - bonus;
357 if (prio < MAX_RT_PRIO)
358 prio = MAX_RT_PRIO;
359 if (prio > MAX_PRIO-1)
360 prio = MAX_PRIO-1;
361 return prio;
362}
363
364
365
366
367static inline void __activate_task(task_t *p, runqueue_t *rq)
368{
369 enqueue_task(p, rq->active);
370 rq->nr_running++;
371}
372
373
374
375
376static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
377{
378 enqueue_task_head(p, rq->active);
379 rq->nr_running++;
380}
381
382static void recalc_task_prio(task_t *p, unsigned long long now)
383{
384 unsigned long long __sleep_time = now - p->timestamp;
385 unsigned long sleep_time;
386
387 if (__sleep_time > NS_MAX_SLEEP_AVG)
388 sleep_time = NS_MAX_SLEEP_AVG;
389 else
390 sleep_time = (unsigned long)__sleep_time;
391
392 if (likely(sleep_time > 0)) {
393
394
395
396
397
398
399 if (p->mm && p->activated != -1 &&
400 sleep_time > INTERACTIVE_SLEEP(p)) {
401 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
402 AVG_TIMESLICE);
403 if (!HIGH_CREDIT(p))
404 p->interactive_credit++;
405 } else {
406
407
408
409
410 sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
411
412
413
414
415
416 if (LOW_CREDIT(p) &&
417 sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
418 sleep_time = JIFFIES_TO_NS(task_timeslice(p));
419
420
421
422
423
424
425 if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) {
426 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
427 sleep_time = 0;
428 else if (p->sleep_avg + sleep_time >=
429 INTERACTIVE_SLEEP(p)) {
430 p->sleep_avg = INTERACTIVE_SLEEP(p);
431 sleep_time = 0;
432 }
433 }
434
435
436
437
438
439
440
441
442
443 p->sleep_avg += sleep_time;
444
445 if (p->sleep_avg > NS_MAX_SLEEP_AVG) {
446 p->sleep_avg = NS_MAX_SLEEP_AVG;
447 if (!HIGH_CREDIT(p))
448 p->interactive_credit++;
449 }
450 }
451 }
452
453 p->prio = effective_prio(p);
454}
455
456
457
458
459
460
461
462static void activate_task(task_t *p, runqueue_t *rq, int local)
463{
464 unsigned long long now;
465
466 now = sched_clock();
467#ifdef CONFIG_SMP
468 if (!local) {
469
470 runqueue_t *this_rq = this_rq();
471 now = (now - this_rq->timestamp_last_tick)
472 + rq->timestamp_last_tick;
473 }
474#endif
475
476 recalc_task_prio(p, now);
477
478
479
480
481
482 if (!p->activated) {
483
484
485
486
487
488
489
490 if (in_interrupt())
491 p->activated = 2;
492 else {
493
494
495
496
497 p->activated = 1;
498 }
499 }
500 p->timestamp = now;
501
502 __activate_task(p, rq);
503}
504
505
506
507
508static void deactivate_task(struct task_struct *p, runqueue_t *rq)
509{
510 rq->nr_running--;
511 if (p->state == TASK_UNINTERRUPTIBLE)
512 rq->nr_uninterruptible++;
513 dequeue_task(p, p->array);
514 p->array = NULL;
515}
516
517
518
519
520
521
522
523
524#ifdef CONFIG_SMP
525static void resched_task(task_t *p)
526{
527 int need_resched, nrpolling;
528
529 preempt_disable();
530
531 nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
532 need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
533 nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
534
535 if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
536 smp_send_reschedule(task_cpu(p));
537 preempt_enable();
538}
539#else
540static inline void resched_task(task_t *p)
541{
542 set_tsk_need_resched(p);
543}
544#endif
545
546
547
548
549
550inline int task_curr(const task_t *p)
551{
552 return cpu_curr(task_cpu(p)) == p;
553}
554
555#ifdef CONFIG_SMP
556enum request_type {
557 REQ_MOVE_TASK,
558 REQ_SET_DOMAIN,
559};
560
561typedef struct {
562 struct list_head list;
563 enum request_type type;
564
565
566 task_t *task;
567 int dest_cpu;
568
569
570 struct sched_domain *sd;
571
572 struct completion done;
573} migration_req_t;
574
575
576
577
578
579static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
580{
581 runqueue_t *rq = task_rq(p);
582
583
584
585
586
587 if (!p->array && !task_running(rq, p)) {
588 set_task_cpu(p, dest_cpu);
589 return 0;
590 }
591
592 init_completion(&req->done);
593 req->type = REQ_MOVE_TASK;
594 req->task = p;
595 req->dest_cpu = dest_cpu;
596 list_add(&req->list, &rq->migration_queue);
597 return 1;
598}
599
600
601
602
603
604
605
606
607
608
609void wait_task_inactive(task_t * p)
610{
611 unsigned long flags;
612 runqueue_t *rq;
613 int preempted;
614
615repeat:
616 rq = task_rq_lock(p, &flags);
617
618 if (unlikely(p->array)) {
619
620 preempted = !task_running(rq, p);
621 task_rq_unlock(rq, &flags);
622 cpu_relax();
623 if (preempted)
624 yield();
625 goto repeat;
626 }
627 task_rq_unlock(rq, &flags);
628}
629
630
631
632
633
634
635
636
637void kick_process(task_t *p)
638{
639 int cpu;
640
641 preempt_disable();
642 cpu = task_cpu(p);
643 if ((cpu != smp_processor_id()) && task_curr(p))
644 smp_send_reschedule(cpu);
645 preempt_enable();
646}
647
648EXPORT_SYMBOL_GPL(kick_process);
649
650
651
652
653
654
655
656static inline unsigned long source_load(int cpu)
657{
658 runqueue_t *rq = cpu_rq(cpu);
659 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
660
661 return min(rq->cpu_load, load_now);
662}
663
664
665
666
667static inline unsigned long target_load(int cpu)
668{
669 runqueue_t *rq = cpu_rq(cpu);
670 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
671
672 return max(rq->cpu_load, load_now);
673}
674
675#endif
676
677
678
679
680
681
682
683
684#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
685static int wake_idle(int cpu, task_t *p)
686{
687 cpumask_t tmp;
688 runqueue_t *rq = cpu_rq(cpu);
689 struct sched_domain *sd;
690 int i;
691
692 if (idle_cpu(cpu))
693 return cpu;
694
695 sd = rq->sd;
696 if (!(sd->flags & SD_WAKE_IDLE))
697 return cpu;
698
699 cpus_and(tmp, sd->span, cpu_online_map);
700 cpus_and(tmp, tmp, p->cpus_allowed);
701
702 for_each_cpu_mask(i, tmp) {
703 if (idle_cpu(i))
704 return i;
705 }
706
707 return cpu;
708}
709#else
710static inline int wake_idle(int cpu, task_t *p)
711{
712 return cpu;
713}
714#endif
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
/*
 * try_to_wake_up - make task p runnable if its state matches 'state'.
 * p:     task to wake
 * state: mask of task states that are allowed to be woken
 * sync:  non-zero for a "sync" wakeup: on SMP, SCHED_LOAD_SCALE is
 *        subtracted from this CPU's load before the placement decision,
 *        and the preemption check is skipped when p ends up on this CPU
 *
 * Returns 1 if this call put p on a runqueue, 0 if p's state did not
 * match or p was already queued (in the latter case p->state is still
 * reset to TASK_RUNNING).
 */
730static int try_to_wake_up(task_t * p, unsigned int state, int sync)
731{
732 int cpu, this_cpu, success = 0;
733 unsigned long flags;
734 long old_state;
735 runqueue_t *rq;
736#ifdef CONFIG_SMP
737 unsigned long load, this_load;
738 struct sched_domain *sd;
739 int new_cpu;
740#endif
741
742 rq = task_rq_lock(p, &flags);
743 old_state = p->state;
/* Not in one of the requested states: nothing to do. */
744 if (!(old_state & state))
745 goto out;
746
/* Already on a runqueue: only the state needs resetting. */
747 if (p->array)
748 goto out_running;
749
750 cpu = task_cpu(p);
751 this_cpu = smp_processor_id();
752
753#ifdef CONFIG_SMP
/* p is still executing on its CPU: activate it there, no migration. */
754 if (unlikely(task_running(rq, p)))
755 goto out_activate;
756
757 new_cpu = cpu;
758
/* Same CPU as the waker, or p may not run here: keep p's CPU. */
759 if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
760 goto out_set_cpu;
761
762 load = source_load(cpu);
763 this_load = target_load(this_cpu);
764
765
766
767
768
/* Sync wakeup: discount one task's worth of load from this CPU. */
769 if (sync)
770 this_load -= SCHED_LOAD_SCALE;
771
772
/* p's CPU is lightly loaded while this one is not: leave p there. */
773 if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
774 goto out_set_cpu;
775
/* Tentatively pick this CPU; the domain walk below must confirm it. */
776 new_cpu = this_cpu;
777
778
779
780
781
782 for_each_domain(this_cpu, sd) {
783 unsigned int imbalance;
784
785
786
787
/* Threshold: imbalance_pct plus half of its excess over 100. */
788 imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
789
790 if ( ((sd->flags & SD_WAKE_AFFINE) &&
791 !task_hot(p, rq->timestamp_last_tick, sd))
792 || ((sd->flags & SD_WAKE_BALANCE) &&
793 imbalance*this_load <= 100*load) ) {
794
795
796
797
/* Pull p here only if this domain also spans p's old CPU. */
798 if (cpu_isset(cpu, sd->span))
799 goto out_set_cpu;
800 }
801 }
802
/* No domain justified the move: fall back to p's old CPU. */
803 new_cpu = cpu;
804out_set_cpu:
805 new_cpu = wake_idle(new_cpu, p);
/*
 * The CPU changed: retarget p, switch to the new runqueue's lock and
 * re-validate p's state, since the old lock was dropped in between.
 */
806 if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
807 set_task_cpu(p, new_cpu);
808 task_rq_unlock(rq, &flags);
809
810 rq = task_rq_lock(p, &flags);
811 old_state = p->state;
812 if (!(old_state & state))
813 goto out;
814 if (p->array)
815 goto out_running;
816
817 this_cpu = smp_processor_id();
818 cpu = task_cpu(p);
819 }
820
821out_activate:
822#endif
/*
 * Waking from uninterruptible sleep: undo the nr_uninterruptible
 * accounting and flag the task with activated == -1, which
 * recalc_task_prio() treats specially.
 */
823 if (old_state == TASK_UNINTERRUPTIBLE) {
824 rq->nr_uninterruptible--;
825
826
827
828
829 p->activated = -1;
830 }
831
832
833
834
835
836
837
838
839
840 activate_task(p, rq, cpu == this_cpu);
/* Skip the preemption check only for a sync wakeup onto this CPU. */
841 if (!sync || cpu != this_cpu) {
842 if (TASK_PREEMPTS_CURR(p, rq))
843 resched_task(rq->curr);
844 }
845 success = 1;
846
847out_running:
848 p->state = TASK_RUNNING;
849out:
850 task_rq_unlock(rq, &flags);
851
852 return success;
853}
854
855int fastcall wake_up_process(task_t * p)
856{
857 return try_to_wake_up(p, TASK_STOPPED |
858 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
859}
860
861EXPORT_SYMBOL(wake_up_process);
862
863int fastcall wake_up_state(task_t *p, unsigned int state)
864{
865 return try_to_wake_up(p, state, 0);
866}
867
868
869
870
871
872void fastcall sched_fork(task_t *p)
873{
874
875
876
877
878
879
880 p->state = TASK_RUNNING;
881 INIT_LIST_HEAD(&p->run_list);
882 p->array = NULL;
883 spin_lock_init(&p->switch_lock);
884#ifdef CONFIG_PREEMPT
885
886
887
888
889
890
891 p->thread_info->preempt_count = 1;
892#endif
893
894
895
896
897
898 local_irq_disable();
899 p->time_slice = (current->time_slice + 1) >> 1;
900
901
902
903
904 p->first_time_slice = 1;
905 current->time_slice >>= 1;
906 p->timestamp = sched_clock();
907 if (!current->time_slice) {
908
909
910
911
912
913 current->time_slice = 1;
914 preempt_disable();
915 scheduler_tick(0, 0);
916 local_irq_enable();
917 preempt_enable();
918 } else
919 local_irq_enable();
920}
921
922
923
924
925
926
927
928void fastcall wake_up_forked_process(task_t * p)
929{
930 unsigned long flags;
931 runqueue_t *rq = task_rq_lock(current, &flags);
932
933 BUG_ON(p->state != TASK_RUNNING);
934
935
936
937
938
939
940 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
941 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
942
943 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
944 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
945
946 p->interactive_credit = 0;
947
948 p->prio = effective_prio(p);
949 set_task_cpu(p, smp_processor_id());
950
951 if (unlikely(!current->array))
952 __activate_task(p, rq);
953 else {
954 p->prio = current->prio;
955 list_add_tail(&p->run_list, ¤t->run_list);
956 p->array = current->array;
957 p->array->nr_active++;
958 rq->nr_running++;
959 }
960 task_rq_unlock(rq, &flags);
961}
962
963
964
965
966
967
968
969
970
971
972void fastcall sched_exit(task_t * p)
973{
974 unsigned long flags;
975 runqueue_t *rq;
976
977 local_irq_save(flags);
978 if (p->first_time_slice) {
979 p->parent->time_slice += p->time_slice;
980 if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
981 p->parent->time_slice = MAX_TIMESLICE;
982 }
983 local_irq_restore(flags);
984
985
986
987
988 rq = task_rq_lock(p->parent, &flags);
989 if (p->sleep_avg < p->parent->sleep_avg)
990 p->parent->sleep_avg = p->parent->sleep_avg /
991 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
992 (EXIT_WEIGHT + 1);
993 task_rq_unlock(rq, &flags);
994}
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009static void finish_task_switch(task_t *prev)
1010{
1011 runqueue_t *rq = this_rq();
1012 struct mm_struct *mm = rq->prev_mm;
1013 unsigned long prev_task_flags;
1014
1015 rq->prev_mm = NULL;
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028 prev_task_flags = prev->flags;
1029 finish_arch_switch(rq, prev);
1030 if (mm)
1031 mmdrop(mm);
1032 if (unlikely(prev_task_flags & PF_DEAD))
1033 put_task_struct(prev);
1034}
1035
1036
1037
1038
1039
1040asmlinkage void schedule_tail(task_t *prev)
1041{
1042 finish_task_switch(prev);
1043
1044 if (current->set_child_tid)
1045 put_user(current->pid, current->set_child_tid);
1046}
1047
1048
1049
1050
1051
1052static inline
1053task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1054{
1055 struct mm_struct *mm = next->mm;
1056 struct mm_struct *oldmm = prev->active_mm;
1057
1058 if (unlikely(!mm)) {
1059 next->active_mm = oldmm;
1060 atomic_inc(&oldmm->mm_count);
1061 enter_lazy_tlb(oldmm, next);
1062 } else
1063 switch_mm(oldmm, mm, next);
1064
1065 if (unlikely(!prev->mm)) {
1066 prev->active_mm = NULL;
1067 WARN_ON(rq->prev_mm);
1068 rq->prev_mm = oldmm;
1069 }
1070
1071
1072 switch_to(prev, next, prev);
1073
1074 return prev;
1075}
1076
1077
1078
1079
1080
1081
1082
1083
1084unsigned long nr_running(void)
1085{
1086 unsigned long i, sum = 0;
1087
1088 for_each_cpu(i)
1089 sum += cpu_rq(i)->nr_running;
1090
1091 return sum;
1092}
1093
1094unsigned long nr_uninterruptible(void)
1095{
1096 unsigned long i, sum = 0;
1097
1098 for_each_cpu(i)
1099 sum += cpu_rq(i)->nr_uninterruptible;
1100
1101 return sum;
1102}
1103
1104unsigned long long nr_context_switches(void)
1105{
1106 unsigned long long i, sum = 0;
1107
1108 for_each_cpu(i)
1109 sum += cpu_rq(i)->nr_switches;
1110
1111 return sum;
1112}
1113
1114unsigned long nr_iowait(void)
1115{
1116 unsigned long i, sum = 0;
1117
1118 for_each_cpu(i)
1119 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1120
1121 return sum;
1122}
1123
1124
1125
1126
1127
1128
1129
1130static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1131{
1132 if (rq1 == rq2)
1133 spin_lock(&rq1->lock);
1134 else {
1135 if (rq1 < rq2) {
1136 spin_lock(&rq1->lock);
1137 spin_lock(&rq2->lock);
1138 } else {
1139 spin_lock(&rq2->lock);
1140 spin_lock(&rq1->lock);
1141 }
1142 }
1143}
1144
1145
1146
1147
1148
1149
1150
1151static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1152{
1153 spin_unlock(&rq1->lock);
1154 if (rq1 != rq2)
1155 spin_unlock(&rq2->lock);
1156}
1157
/*
 * Idle state passed to the balancing code.  NEWLY_IDLE is what
 * load_balance_newidle() passes; can_migrate_task() and
 * find_busiest_group() treat it specially.
 */
enum idle_type {
	IDLE,
	NOT_IDLE,
	NEWLY_IDLE,
};
1164
1165#ifdef CONFIG_SMP
1166
1167
1168
1169
1170static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1171 struct sched_domain *sd)
1172{
1173 unsigned long load, min_load, this_load;
1174 int i, min_cpu;
1175 cpumask_t mask;
1176
1177 min_cpu = UINT_MAX;
1178 min_load = ULONG_MAX;
1179
1180 cpus_and(mask, sd->span, cpu_online_map);
1181 cpus_and(mask, mask, p->cpus_allowed);
1182
1183 for_each_cpu_mask(i, mask) {
1184 load = target_load(i);
1185
1186 if (load < min_load) {
1187 min_cpu = i;
1188 min_load = load;
1189
1190
1191 if (!min_load)
1192 break;
1193 }
1194 }
1195
1196
1197 this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207 if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1208 return min_cpu;
1209
1210 return this_cpu;
1211}
1212
1213
1214
1215
1216
1217
1218
1219
1220void fastcall wake_up_forked_thread(task_t * p)
1221{
1222 unsigned long flags;
1223 int this_cpu = get_cpu(), cpu;
1224 struct sched_domain *tmp, *sd = NULL;
1225 runqueue_t *this_rq = cpu_rq(this_cpu), *rq;
1226
1227
1228
1229
1230
1231 for_each_domain(this_cpu, tmp)
1232 if (tmp->flags & SD_BALANCE_CLONE)
1233 sd = tmp;
1234 if (sd)
1235 cpu = find_idlest_cpu(p, this_cpu, sd);
1236 else
1237 cpu = this_cpu;
1238
1239 local_irq_save(flags);
1240lock_again:
1241 rq = cpu_rq(cpu);
1242 double_rq_lock(this_rq, rq);
1243
1244 BUG_ON(p->state != TASK_RUNNING);
1245
1246
1247
1248
1249
1250
1251 if (unlikely(!cpu_isset(cpu, p->cpus_allowed))) {
1252 cpu = this_cpu;
1253 double_rq_unlock(this_rq, rq);
1254 goto lock_again;
1255 }
1256
1257
1258
1259
1260
1261 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1262 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1263
1264 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1265 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1266
1267 p->interactive_credit = 0;
1268
1269 p->prio = effective_prio(p);
1270 set_task_cpu(p, cpu);
1271
1272 if (cpu == this_cpu) {
1273 if (unlikely(!current->array))
1274 __activate_task(p, rq);
1275 else {
1276 p->prio = current->prio;
1277 list_add_tail(&p->run_list, ¤t->run_list);
1278 p->array = current->array;
1279 p->array->nr_active++;
1280 rq->nr_running++;
1281 }
1282 } else {
1283
1284 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
1285 + rq->timestamp_last_tick;
1286 __activate_task(p, rq);
1287 if (TASK_PREEMPTS_CURR(p, rq))
1288 resched_task(rq->curr);
1289 }
1290
1291 double_rq_unlock(this_rq, rq);
1292 local_irq_restore(flags);
1293 put_cpu();
1294}
1295
1296
1297
1298
1299
1300
1301
1302static void sched_migrate_task(task_t *p, int dest_cpu)
1303{
1304 migration_req_t req;
1305 runqueue_t *rq;
1306 unsigned long flags;
1307
1308 rq = task_rq_lock(p, &flags);
1309 if (!cpu_isset(dest_cpu, p->cpus_allowed)
1310 || unlikely(cpu_is_offline(dest_cpu)))
1311 goto out;
1312
1313
1314 if (migrate_task(p, dest_cpu, &req)) {
1315
1316 struct task_struct *mt = rq->migration_thread;
1317 get_task_struct(mt);
1318 task_rq_unlock(rq, &flags);
1319 wake_up_process(mt);
1320 put_task_struct(mt);
1321 wait_for_completion(&req.done);
1322 return;
1323 }
1324out:
1325 task_rq_unlock(rq, &flags);
1326}
1327
1328
1329
1330
1331
1332
1333
1334
1335void sched_balance_exec(void)
1336{
1337 struct sched_domain *tmp, *sd = NULL;
1338 int new_cpu, this_cpu = get_cpu();
1339
1340
1341 if (this_rq()->nr_running <= 1)
1342 goto out;
1343
1344 for_each_domain(this_cpu, tmp)
1345 if (tmp->flags & SD_BALANCE_EXEC)
1346 sd = tmp;
1347
1348 if (sd) {
1349 new_cpu = find_idlest_cpu(current, this_cpu, sd);
1350 if (new_cpu != this_cpu) {
1351 put_cpu();
1352 sched_migrate_task(current, new_cpu);
1353 return;
1354 }
1355 }
1356out:
1357 put_cpu();
1358}
1359
1360
1361
1362
1363static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1364{
1365 if (unlikely(!spin_trylock(&busiest->lock))) {
1366 if (busiest < this_rq) {
1367 spin_unlock(&this_rq->lock);
1368 spin_lock(&busiest->lock);
1369 spin_lock(&this_rq->lock);
1370 } else
1371 spin_lock(&busiest->lock);
1372 }
1373}
1374
1375
1376
1377
1378
1379static inline
1380void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1381 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1382{
1383 dequeue_task(p, src_array);
1384 src_rq->nr_running--;
1385 set_task_cpu(p, this_cpu);
1386 this_rq->nr_running++;
1387 enqueue_task(p, this_array);
1388 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1389 + this_rq->timestamp_last_tick;
1390
1391
1392
1393
1394 if (TASK_PREEMPTS_CURR(p, this_rq))
1395 resched_task(this_rq->curr);
1396}
1397
1398
1399
1400
1401static inline
1402int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1403 struct sched_domain *sd, enum idle_type idle)
1404{
1405
1406
1407
1408
1409
1410
1411 if (task_running(rq, p))
1412 return 0;
1413 if (!cpu_isset(this_cpu, p->cpus_allowed))
1414 return 0;
1415
1416
1417 if (idle == NEWLY_IDLE ||
1418 sd->nr_balance_failed < sd->cache_nice_tries) {
1419 if (task_hot(p, rq->timestamp_last_tick, sd))
1420 return 0;
1421 }
1422
1423 return 1;
1424}
1425
1426
1427
1428
1429
1430
1431
1432
/*
 * move_tasks - pull up to max_nr_move tasks from 'busiest' to this_rq.
 * this_rq/this_cpu: destination runqueue and its CPU
 * busiest:          source runqueue
 * max_nr_move:      upper bound on the number of tasks to pull
 * sd, idle:         passed through to can_migrate_task()
 *
 * Returns the number of tasks actually pulled.
 * NOTE(review): both runqueue locks are presumably held by the caller
 * (the callers in this file call double_lock_balance() first).
 */
1433static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1434 unsigned long max_nr_move, struct sched_domain *sd,
1435 enum idle_type idle)
1436{
1437 prio_array_t *array, *dst_array;
1438 struct list_head *head, *curr;
1439 int idx, pulled = 0;
1440 task_t *tmp;
1441
/* Nothing requested, or the source has at most one running task. */
1442 if (max_nr_move <= 0 || busiest->nr_running <= 1)
1443 goto out;
1444
1445
1446
1447
1448
1449
1450
/*
 * Start with the expired array when it has tasks, otherwise with the
 * active array.  Tasks land in the matching array of this_rq.
 */
1451 if (busiest->expired->nr_active) {
1452 array = busiest->expired;
1453 dst_array = this_rq->expired;
1454 } else {
1455 array = busiest->active;
1456 dst_array = this_rq->active;
1457 }
1458
1459new_array:
1460
1461 idx = 0;
/* Advance idx to the next priority level that has queued tasks. */
1462skip_bitmap:
1463 if (!idx)
1464 idx = sched_find_first_bit(array->bitmap);
1465 else
1466 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
/* Array exhausted: move on from expired to active once, else stop. */
1467 if (idx >= MAX_PRIO) {
1468 if (array == busiest->expired && busiest->active->nr_active) {
1469 array = busiest->active;
1470 dst_array = this_rq->active;
1471 goto new_array;
1472 }
1473 goto out;
1474 }
1475
/* Walk this priority list backwards, starting at its tail. */
1476 head = array->queue + idx;
1477 curr = head->prev;
1478skip_queue:
1479 tmp = list_entry(curr, task_t, run_list);
1480
/* Step before any pull, since pull_task() unlinks tmp from the list. */
1481 curr = curr->prev;
1482
/* Not migratable: try the next task, or the next priority level. */
1483 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
1484 if (curr != head)
1485 goto skip_queue;
1486 idx++;
1487 goto skip_bitmap;
1488 }
1489 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1490 pulled++;
1491
1492
/* Keep going until the quota is reached. */
1493 if (pulled < max_nr_move) {
1494 if (curr != head)
1495 goto skip_queue;
1496 idx++;
1497 goto skip_bitmap;
1498 }
1499out:
1500 return pulled;
1501}
1502
1503
1504
1505
1506
1507
/*
 * find_busiest_group - find the busiest group in domain sd, as seen from
 * this_cpu, and work out how much load to move.
 * sd:        domain whose circular group list is examined
 * this_cpu:  CPU doing the balancing; its group is the "local" group
 * imbalance: out parameter; always written before returning
 * idle:      idle state of this_cpu
 *
 * Returns the busiest group with *imbalance set to the amount to move,
 * or NULL with *imbalance == 0 when the domain is considered balanced.
 * Exception on the balanced path: for NEWLY_IDLE, or for IDLE when
 * max_load exceeds SCHED_LOAD_SCALE, the busiest group is still returned
 * with *imbalance == 1.
 */
1508static struct sched_group *
1509find_busiest_group(struct sched_domain *sd, int this_cpu,
1510 unsigned long *imbalance, enum idle_type idle)
1511{
1512 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1513 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1514
1515 max_load = this_load = total_load = total_pwr = 0;
1516
/* Walk the circular group list once, starting at sd->groups. */
1517 do {
1518 cpumask_t tmp;
1519 unsigned long load;
1520 int local_group;
1521 int i, nr_cpus = 0;
1522
1523 local_group = cpu_isset(this_cpu, group->cpumask);
1524
1525
/* Sum the load of the group's online CPUs; skip empty groups. */
1526 avg_load = 0;
1527 cpus_and(tmp, group->cpumask, cpu_online_map);
1528 if (unlikely(cpus_empty(tmp)))
1529 goto nextgroup;
1530
1531 for_each_cpu_mask(i, tmp) {
1532
/* Local group uses target_load() (max), others source_load() (min). */
1533 if (local_group)
1534 load = target_load(i);
1535 else
1536 load = source_load(i);
1537
1538 nr_cpus++;
1539 avg_load += load;
1540 }
1541
1542 if (!nr_cpus)
1543 goto nextgroup;
1544
1545 total_load += avg_load;
1546 total_pwr += group->cpu_power;
1547
1548
/* Normalise the group's summed load by its cpu_power. */
1549 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1550
1551 if (local_group) {
1552 this_load = avg_load;
1553 this = group;
1554 goto nextgroup;
1555 } else if (avg_load > max_load) {
1556 max_load = avg_load;
1557 busiest = group;
1558 }
1559nextgroup:
1560 group = group->next;
1561 } while (group != sd->groups);
1562
/* No other group is busier than the local one. */
1563 if (!busiest || this_load >= max_load)
1564 goto out_balanced;
1565
/* Domain-wide average load, normalised by the total cpu_power. */
1566 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
1567
/* Local group at or above average, or busiest within imbalance_pct. */
1568 if (this_load >= avg_load ||
1569 100*max_load <= sd->imbalance_pct*this_load)
1570 goto out_balanced;
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
/*
 * Move no more than would bring the busiest group down to the average
 * or the local group up to it, whichever is smaller.
 */
1583 *imbalance = min(max_load - avg_load, avg_load - this_load);
1584
1585
/* Scale by the smaller of the two groups' cpu_power. */
1586 *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
1587 / SCHED_LOAD_SCALE;
1588
/* Less than (almost) one task's worth: decide whether moving one helps. */
1589 if (*imbalance < SCHED_LOAD_SCALE - 1) {
1590 unsigned long pwr_now = 0, pwr_move = 0;
1591 unsigned long tmp;
1592
/* Gap of at least two tasks: moving one task always helps. */
1593 if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
1594 *imbalance = 1;
1595 return busiest;
1596 }
1597
1598
1599
1600
1601
1602
1603
/* pwr_now: power-weighted load of both groups (per-group load capped). */
1604 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
1605 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
1606 pwr_now /= SCHED_LOAD_SCALE;
1607
1608
/* pwr_move: the same measure after one task leaves the busiest group... */
1609 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
1610 if (max_load > tmp)
1611 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
1612 max_load - tmp);
1613
1614
/* ...and is added to the local group. */
1615 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
1616 if (max_load < tmp)
1617 tmp = max_load;
1618 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
1619 pwr_move /= SCHED_LOAD_SCALE;
1620
1621
/* Require a gain of at least SCHED_LOAD_SCALE/8 to bother moving. */
1622 if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
1623 goto out_balanced;
1624
1625 *imbalance = 1;
1626 return busiest;
1627 }
1628
1629
/* Convert the scaled load imbalance into a task count. */
1630 *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
1631
1632 return busiest;
1633
1634out_balanced:
/* An idle CPU may still pull a single task from the busiest group. */
1635 if (busiest && (idle == NEWLY_IDLE ||
1636 (idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) {
1637 *imbalance = 1;
1638 return busiest;
1639 }
1640
1641 *imbalance = 0;
1642 return NULL;
1643}
1644
1645
1646
1647
1648static runqueue_t *find_busiest_queue(struct sched_group *group)
1649{
1650 cpumask_t tmp;
1651 unsigned long load, max_load = 0;
1652 runqueue_t *busiest = NULL;
1653 int i;
1654
1655 cpus_and(tmp, group->cpumask, cpu_online_map);
1656 for_each_cpu_mask(i, tmp) {
1657 load = source_load(i);
1658
1659 if (load > max_load) {
1660 max_load = load;
1661 busiest = cpu_rq(i);
1662 }
1663 }
1664
1665 return busiest;
1666}
1667
1668
1669
1670
1671
1672
1673
/*
 * load_balance - try to pull tasks onto this_cpu within domain sd.
 * this_cpu/this_rq: the balancing CPU and its runqueue
 * sd:               domain to balance in
 * idle:             idle state of this_cpu, passed to the helpers
 *
 * Takes and releases this_rq->lock itself.  Returns the number of tasks
 * moved (0 when the domain was found balanced).
 *
 * Side effects on sd: nr_balance_failed counts consecutive attempts that
 * moved nothing; balance_interval is reset to min_interval after an
 * attempt and doubled (while below max_interval) when balanced.
 */
1674static int load_balance(int this_cpu, runqueue_t *this_rq,
1675 struct sched_domain *sd, enum idle_type idle)
1676{
1677 struct sched_group *group;
1678 runqueue_t *busiest;
1679 unsigned long imbalance;
1680 int nr_moved;
1681
1682 spin_lock(&this_rq->lock);
1683
1684 group = find_busiest_group(sd, this_cpu, &imbalance, idle);
1685 if (!group)
1686 goto out_balanced;
1687
1688 busiest = find_busiest_queue(group);
1689 if (!busiest)
1690 goto out_balanced;
1691
1692
1693
1694
1695
/* The busiest queue should never be our own; warn and bail out. */
1696 if (unlikely(busiest == this_rq)) {
1697 WARN_ON(1);
1698 goto out_balanced;
1699 }
1700
1701 nr_moved = 0;
/* Only bother locking the other runqueue if it has tasks to spare. */
1702 if (busiest->nr_running > 1) {
1703
1704
1705
1706
1707
1708
1709 double_lock_balance(this_rq, busiest);
1710 nr_moved = move_tasks(this_rq, this_cpu, busiest,
1711 imbalance, sd, idle);
1712 spin_unlock(&busiest->lock);
1713 }
1714 spin_unlock(&this_rq->lock);
1715
1716 if (!nr_moved) {
1717 sd->nr_balance_failed++;
1718
/*
 * Too many failed attempts: ask the busiest CPU's migration thread
 * to push a task towards this CPU (active balancing).
 */
1719 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
1720 int wake = 0;
1721
1722 spin_lock(&busiest->lock);
1723 if (!busiest->active_balance) {
1724 busiest->active_balance = 1;
1725 busiest->push_cpu = this_cpu;
1726 wake = 1;
1727 }
1728 spin_unlock(&busiest->lock);
/* Wake the thread only after busiest->lock has been dropped. */
1729 if (wake)
1730 wake_up_process(busiest->migration_thread);
1731
1732
1733
1734
1735
/* Back off to cache_nice_tries instead of resetting to zero. */
1736 sd->nr_balance_failed = sd->cache_nice_tries;
1737 }
1738 } else
1739 sd->nr_balance_failed = 0;
1740
1741
/* An imbalance was seen: go back to the shortest balance interval. */
1742 sd->balance_interval = sd->min_interval;
1743
1744 return nr_moved;
1745
1746out_balanced:
1747 spin_unlock(&this_rq->lock);
1748
1749
/* Balanced: back off by doubling the interval while below the max. */
1750 if (sd->balance_interval < sd->max_interval)
1751 sd->balance_interval *= 2;
1752
1753 return 0;
1754}
1755
1756
1757
1758
1759
1760
1761
1762
1763static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
1764 struct sched_domain *sd)
1765{
1766 struct sched_group *group;
1767 runqueue_t *busiest = NULL;
1768 unsigned long imbalance;
1769 int nr_moved = 0;
1770
1771 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
1772 if (!group)
1773 goto out;
1774
1775 busiest = find_busiest_queue(group);
1776 if (!busiest || busiest == this_rq)
1777 goto out;
1778
1779
1780 double_lock_balance(this_rq, busiest);
1781
1782 nr_moved = move_tasks(this_rq, this_cpu, busiest,
1783 imbalance, sd, NEWLY_IDLE);
1784
1785 spin_unlock(&busiest->lock);
1786
1787out:
1788 return nr_moved;
1789}
1790
1791
1792
1793
1794
1795static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
1796{
1797 struct sched_domain *sd;
1798
1799 for_each_domain(this_cpu, sd) {
1800 if (sd->flags & SD_BALANCE_NEWIDLE) {
1801 if (load_balance_newidle(this_cpu, this_rq, sd)) {
1802
1803 break;
1804 }
1805 }
1806 }
1807}
1808
1809
1810
1811
1812
1813
1814
1815
1816
/*
 * active_load_balance - push work away from @busiest towards idle CPUs.
 *
 * Called by migration_thread() with busiest->lock held when
 * busiest->active_balance is set. For every other group of the chosen
 * domain whose online CPUs are all idle, one move_tasks() call is made
 * from @busiest to a CPU of that group.
 */
static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
{
	struct sched_domain *sd;
	struct sched_group *group, *busy_group;
	int i;

	/* Nothing to push if at most one task is runnable here. */
	if (busiest->nr_running <= 1)
		return;

	/* Find the first domain of busiest_cpu that spans push_cpu. */
	for_each_domain(busiest_cpu, sd)
		if (cpu_isset(busiest->push_cpu, sd->span))
			break;
	if (!sd) {
		WARN_ON(1);
		return;
	}

	/* Locate the group that contains the busy CPU itself. */
	group = sd->groups;
	while (!cpu_isset(busiest_cpu, group->cpumask))
		group = group->next;
	busy_group = group;

	/* Walk the circular list of groups once. */
	group = sd->groups;
	do {
		cpumask_t tmp;
		runqueue_t *rq;
		int push_cpu = 0;

		if (group == busy_group)
			goto next_group;

		/* Skip groups that have no online CPU. */
		cpus_and(tmp, group->cpumask, cpu_online_map);
		if (!cpus_weight(tmp))
			goto next_group;

		/*
		 * Only groups in which every online CPU is idle qualify;
		 * push_cpu ends up as the last CPU visited in the group.
		 */
		for_each_cpu_mask(i, tmp) {
			if (!idle_cpu(i))
				goto next_group;
			push_cpu = i;
		}

		rq = cpu_rq(push_cpu);

		/*
		 * Guard against the target being the busy queue itself,
		 * which would make double_lock_balance() take the same lock
		 * twice.
		 */
		if (unlikely(busiest == rq))
			goto next_group;
		double_lock_balance(busiest, rq);
		/*
		 * The fourth argument is the one load_balance() uses for the
		 * imbalance; here it is fixed at 1.
		 */
		move_tasks(rq, push_cpu, busiest, 1, sd, IDLE);
		spin_unlock(&rq->lock);
next_group:
		group = group->next;
	} while (group != sd->groups);
}
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
/*
 * Per-CPU offset, in jiffies, that rebalance_tick() adds to the current
 * jiffies value: CPU n gets HZ * n / NR_CPUS. The argument is parenthesized
 * so that expression arguments such as CPU_OFFSET(a + b) expand correctly.
 */
#define CPU_OFFSET(cpu) (HZ * (cpu) / NR_CPUS)
1887
1888static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
1889 enum idle_type idle)
1890{
1891 unsigned long old_load, this_load;
1892 unsigned long j = jiffies + CPU_OFFSET(this_cpu);
1893 struct sched_domain *sd;
1894
1895
1896 old_load = this_rq->cpu_load;
1897 this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
1898
1899
1900
1901
1902
1903 if (this_load > old_load)
1904 old_load++;
1905 this_rq->cpu_load = (old_load + this_load) / 2;
1906
1907 for_each_domain(this_cpu, sd) {
1908 unsigned long interval = sd->balance_interval;
1909
1910 if (idle != IDLE)
1911 interval *= sd->busy_factor;
1912
1913
1914 interval = msecs_to_jiffies(interval);
1915 if (unlikely(!interval))
1916 interval = 1;
1917
1918 if (j - sd->last_balance >= interval) {
1919 if (load_balance(this_cpu, this_rq, sd, idle)) {
1920
1921 idle = NOT_IDLE;
1922 }
1923 sd->last_balance += interval;
1924 }
1925 }
1926}
1927#else
1928
1929
1930
/*
 * Empty stand-ins used in the #else branch of the conditional that guards
 * the real balancing code above: periodic and idle balancing do nothing in
 * this configuration.
 */
static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
{
}
static inline void idle_balance(int cpu, runqueue_t *rq)
{
}
1937#endif
1938
1939static inline int wake_priority_sleeper(runqueue_t *rq)
1940{
1941#ifdef CONFIG_SCHED_SMT
1942
1943
1944
1945
1946 if (rq->nr_running) {
1947 resched_task(rq->idle);
1948 return 1;
1949 }
1950#endif
1951 return 0;
1952}
1953
/*
 * Per-CPU kernel statistics. scheduler_tick() accumulates CPU time into the
 * cpustat member via kstat_this_cpu (presumably this CPU's instance of the
 * variable defined here).
 */
DEFINE_PER_CPU(struct kernel_stat, kstat);

EXPORT_PER_CPU_SYMBOL(kstat);
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
/*
 * EXPIRED_STARVING(rq) is true when either
 *  - STARVATION_LIMIT is non-zero, rq->expired_timestamp is set, and at
 *    least STARVATION_LIMIT * rq->nr_running + 1 jiffies have passed since
 *    that timestamp; or
 *  - the current task's static_prio is numerically greater than
 *    rq->best_expired_prio.
 *
 * scheduler_tick() uses it to decide whether an interactive task whose
 * timeslice ran out must go to the expired array instead of being requeued
 * on the active one.
 */
#define EXPIRED_STARVING(rq) \
	((STARVATION_LIMIT && ((rq)->expired_timestamp && \
		(jiffies - (rq)->expired_timestamp >= \
			STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
			((rq)->curr->static_prio > (rq)->best_expired_prio))
1973
1974
1975
1976
1977
1978
1979
1980
/*
 * scheduler_tick - per-tick accounting and timeslice management for the
 * task currently running on this CPU.
 *
 * @user_ticks: ticks to charge as user (or nice) time
 * @sys_ticks:  ticks to charge as system, irq, softirq, idle or iowait time
 *
 * Also triggers periodic load balancing through rebalance_tick().
 */
void scheduler_tick(int user_ticks, int sys_ticks)
{
	int cpu = smp_processor_id();
	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
	runqueue_t *rq = this_rq();
	task_t *p = current;

	rq->timestamp_last_tick = sched_clock();

	if (rcu_pending(cpu))
		rcu_check_callbacks(cpu, user_ticks);

	/*
	 * One HARDIRQ_OFFSET is subtracted before testing, presumably to
	 * discount the timer interrupt this function itself runs in. Time
	 * spent in (other) hard or soft interrupts is charged there and
	 * removed from sys_ticks.
	 */
	if (hardirq_count() - HARDIRQ_OFFSET) {
		cpustat->irq += sys_ticks;
		sys_ticks = 0;
	} else if (softirq_count()) {
		cpustat->softirq += sys_ticks;
		sys_ticks = 0;
	}

	if (p == rq->idle) {
		/* Idle task: charge iowait if tasks are waiting for I/O. */
		if (atomic_read(&rq->nr_iowait) > 0)
			cpustat->iowait += sys_ticks;
		else
			cpustat->idle += sys_ticks;
		if (wake_priority_sleeper(rq))
			goto out;
		rebalance_tick(cpu, rq, IDLE);
		return;
	}
	if (TASK_NICE(p) > 0)
		cpustat->nice += user_ticks;
	else
		cpustat->user += user_ticks;
	cpustat->system += sys_ticks;

	/*
	 * The task is not on the active array (it may already have expired
	 * but not been switched out yet): just request a reschedule.
	 */
	if (p->array != rq->active) {
		set_tsk_need_resched(p);
		goto out;
	}
	spin_lock(&rq->lock);
	/*
	 * Real-time tasks: only SCHED_RR tasks have their timeslice
	 * decremented here; any other RT policy is left untouched.
	 */
	if (unlikely(rt_task(p))) {
		/*
		 * An RR task whose slice ran out gets a fresh slice and is
		 * requeued on the active array.
		 */
		if ((p->policy == SCHED_RR) && !--p->time_slice) {
			p->time_slice = task_timeslice(p);
			p->first_time_slice = 0;
			set_tsk_need_resched(p);

			/* Requeue on the same (active) array. */
			dequeue_task(p, rq->active);
			enqueue_task(p, rq->active);
		}
		goto out_unlock;
	}
	if (!--p->time_slice) {
		/* Timeslice used up: recompute priority and slice. */
		dequeue_task(p, rq->active);
		set_tsk_need_resched(p);
		p->prio = effective_prio(p);
		p->time_slice = task_timeslice(p);
		p->first_time_slice = 0;

		if (!rq->expired_timestamp)
			rq->expired_timestamp = jiffies;
		/*
		 * Interactive tasks go back on the active array unless the
		 * expired array is starving; everything else expires.
		 */
		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
			enqueue_task(p, rq->expired);
			if (p->static_prio < rq->best_expired_prio)
				rq->best_expired_prio = p->static_prio;
		} else
			enqueue_task(p, rq->active);
	} else {
		/*
		 * Slice not yet used up. An interactive task that has
		 * consumed a whole multiple of TIMESLICE_GRANULARITY(p) and
		 * still has at least that much left is requeued on the
		 * active array with a recomputed priority and marked for
		 * rescheduling - presumably so that it round-robins with
		 * other tasks at the same priority level.
		 */
		if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
			p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
			(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
			(p->array == rq->active)) {

			dequeue_task(p, rq->active);
			set_tsk_need_resched(p);
			p->prio = effective_prio(p);
			enqueue_task(p, rq->active);
		}
	}
out_unlock:
	spin_unlock(&rq->lock);
out:
	rebalance_tick(cpu, rq, NOT_IDLE);
}
2095
2096#ifdef CONFIG_SCHED_SMT
2097static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
2098{
2099 int i;
2100 struct sched_domain *sd = rq->sd;
2101 cpumask_t sibling_map;
2102
2103 if (!(sd->flags & SD_SHARE_CPUPOWER))
2104 return;
2105
2106 cpus_and(sibling_map, sd->span, cpu_online_map);
2107 for_each_cpu_mask(i, sibling_map) {
2108 runqueue_t *smt_rq;
2109
2110 if (i == cpu)
2111 continue;
2112
2113 smt_rq = cpu_rq(i);
2114
2115
2116
2117
2118
2119 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
2120 resched_task(smt_rq->idle);
2121 }
2122}
2123
/*
 * dependent_sleeper - decide whether task @p, about to be run on @cpu,
 * should instead be held back in favour of what a sibling CPU is running.
 *
 * Only acts when the runqueue's domain has SD_SHARE_CPUPOWER set. Returns 1
 * when @p should not run now (schedule() then picks the idle task), 0
 * otherwise. As a side effect it may mark sibling CPUs' current tasks for
 * rescheduling.
 */
static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
{
	struct sched_domain *sd = rq->sd;
	cpumask_t sibling_map;
	int ret = 0, i;

	if (!(sd->flags & SD_SHARE_CPUPOWER))
		return 0;

	cpus_and(sibling_map, sd->span, cpu_online_map);
	for_each_cpu_mask(i, sibling_map) {
		runqueue_t *smt_rq;
		task_t *smt_curr;

		if (i == cpu)
			continue;

		smt_rq = cpu_rq(i);
		smt_curr = smt_rq->curr;

		/*
		 * Hold @p back when the sibling's task is real-time, or its
		 * remaining timeslice scaled by (100 - per_cpu_gain)% still
		 * exceeds @p's full timeslice. This applies only when both
		 * tasks have an mm and @p is not real-time.
		 */
		if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
			task_timeslice(p) || rt_task(smt_curr)) &&
			p->mm && smt_curr->mm && !rt_task(p))
				ret = 1;

		/*
		 * The mirror-image test: reschedule the sibling when @p
		 * outweighs the sibling's task by the same measure, or when
		 * the sibling is idling although its runqueue has runnable
		 * tasks.
		 */
		if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
			task_timeslice(smt_curr) || rt_task(p)) &&
			smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
			(smt_curr == smt_rq->idle && smt_rq->nr_running))
				resched_task(smt_curr);
	}
	return ret;
}
2170#else
/*
 * Stand-ins used when CONFIG_SCHED_SMT is not set: there are no sibling
 * dependencies to handle, so nothing is woken and no task is held back.
 */
static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
{
}

static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
{
	return 0;
}
2179#endif
2180
2181
2182
2183
/*
 * schedule - the main scheduler function: pick the next task to run on
 * this CPU and switch to it.
 */
asmlinkage void __sched schedule(void)
{
	long *switch_count;
	task_t *prev, *next;
	runqueue_t *rq;
	prio_array_t *array;
	struct list_head *queue;
	unsigned long long now;
	unsigned long run_time;
	int cpu, idx;

	/*
	 * Complain about being called from atomic context, except for
	 * tasks in the TASK_DEAD or TASK_ZOMBIE state.
	 */
	if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
		if (unlikely(in_atomic())) {
			printk(KERN_ERR "bad: scheduling while atomic!\n");
			dump_stack();
		}
	}

need_resched:
	preempt_disable();
	prev = current;
	rq = this_rq();

	release_kernel_lock(prev);
	/* Time prev has run since its timestamp, capped at NS_MAX_SLEEP_AVG. */
	now = sched_clock();
	if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
		run_time = now - prev->timestamp;
	else
		run_time = NS_MAX_SLEEP_AVG;

	/*
	 * Tasks with HIGH_CREDIT are charged less: their run time is divided
	 * by their current bonus (or by 1 when the bonus is zero).
	 */
	if (HIGH_CREDIT(prev))
		run_time /= (CURRENT_BONUS(prev) ? : 1);

	spin_lock_irq(&rq->lock);

	/*
	 * Count an involuntary switch by default. If prev is not runnable
	 * and this is not a kernel preemption (PREEMPT_ACTIVE), it is a
	 * voluntary switch and prev leaves the runqueue - unless it is
	 * interruptible with a signal pending, in which case it is made
	 * runnable again.
	 */
	switch_count = &prev->nivcsw;
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		switch_count = &prev->nvcsw;
		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
				unlikely(signal_pending(prev))))
			prev->state = TASK_RUNNING;
		else
			deactivate_task(prev, rq);
	}

	cpu = smp_processor_id();
	if (unlikely(!rq->nr_running)) {
		/* Nothing runnable: try to pull work before going idle. */
		idle_balance(cpu, rq);
		if (!rq->nr_running) {
			next = rq->idle;
			rq->expired_timestamp = 0;
			wake_sleeping_dependent(cpu, rq);
			goto switch_tasks;
		}
	}

	array = rq->active;
	if (unlikely(!array->nr_active)) {
		/*
		 * The active array is empty: swap the active and expired
		 * arrays and reset the starvation tracking.
		 */
		rq->active = rq->expired;
		rq->expired = array;
		array = rq->active;
		rq->expired_timestamp = 0;
		rq->best_expired_prio = MAX_PRIO;
	}

	/* Take the first task of the first non-empty priority list. */
	idx = sched_find_first_bit(array->bitmap);
	queue = array->queue + idx;
	next = list_entry(queue->next, task_t, run_list);

	if (dependent_sleeper(cpu, rq, next)) {
		next = rq->idle;
		goto switch_tasks;
	}

	/*
	 * A non-RT task with a positive 'activated' value gets its priority
	 * recalculated to credit the time it spent waiting on the runqueue;
	 * for activated == 1 only ON_RUNQUEUE_WEIGHT percent of that time
	 * counts.
	 */
	if (!rt_task(next) && next->activated > 0) {
		unsigned long long delta = now - next->timestamp;

		if (next->activated == 1)
			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;

		array = next->array;
		dequeue_task(next, array);
		recalc_task_prio(next, next->timestamp + delta);
		enqueue_task(next, array);
	}
	next->activated = 0;
switch_tasks:
	prefetch(next);
	clear_tsk_need_resched(prev);
	RCU_qsctr(task_cpu(prev))++;

	/*
	 * Charge the run time against prev's sleep average; when it hits
	 * zero, interactive_credit is decremented unless the task is already
	 * at HIGH_CREDIT or LOW_CREDIT.
	 */
	prev->sleep_avg -= run_time;
	if ((long)prev->sleep_avg <= 0) {
		prev->sleep_avg = 0;
		if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
			prev->interactive_credit--;
	}
	prev->timestamp = now;

	if (likely(prev != next)) {
		next->timestamp = now;
		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		/*
		 * rq->lock is not released on this path in this function;
		 * presumably the switch helpers / finish_task_switch() do so
		 * - confirm against their definitions.
		 */
		prepare_arch_switch(rq, next);
		prev = context_switch(rq, prev, next);
		barrier();

		finish_task_switch(prev);
	} else
		spin_unlock_irq(&rq->lock);

	reacquire_kernel_lock(current);
	preempt_enable_no_resched();
	/* Preemption was re-enabled without a resched check; do it here. */
	if (test_thread_flag(TIF_NEED_RESCHED))
		goto need_resched;
}

EXPORT_SYMBOL(schedule);
2321
2322#ifdef CONFIG_PREEMPT
2323
2324
2325
2326
2327
2328asmlinkage void __sched preempt_schedule(void)
2329{
2330 struct thread_info *ti = current_thread_info();
2331
2332
2333
2334
2335
2336 if (unlikely(ti->preempt_count || irqs_disabled()))
2337 return;
2338
2339need_resched:
2340 ti->preempt_count = PREEMPT_ACTIVE;
2341 schedule();
2342 ti->preempt_count = 0;
2343
2344
2345 barrier();
2346 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2347 goto need_resched;
2348}
2349
2350EXPORT_SYMBOL(preempt_schedule);
2351#endif
2352
2353int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
2354{
2355 task_t *p = curr->task;
2356 return try_to_wake_up(p, mode, sync);
2357}
2358
2359EXPORT_SYMBOL(default_wake_function);
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2371 int nr_exclusive, int sync, void *key)
2372{
2373 struct list_head *tmp, *next;
2374
2375 list_for_each_safe(tmp, next, &q->task_list) {
2376 wait_queue_t *curr;
2377 unsigned flags;
2378 curr = list_entry(tmp, wait_queue_t, task_list);
2379 flags = curr->flags;
2380 if (curr->func(curr, mode, sync, key) &&
2381 (flags & WQ_FLAG_EXCLUSIVE) &&
2382 !--nr_exclusive)
2383 break;
2384 }
2385}
2386
2387
2388
2389
2390
2391
2392
2393void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
2394 int nr_exclusive, void *key)
2395{
2396 unsigned long flags;
2397
2398 spin_lock_irqsave(&q->lock, flags);
2399 __wake_up_common(q, mode, nr_exclusive, 0, key);
2400 spin_unlock_irqrestore(&q->lock, flags);
2401}
2402
2403EXPORT_SYMBOL(__wake_up);
2404
2405
2406
2407
/*
 * Same as __wake_up() with nr_exclusive == 1 and a NULL key, but takes no
 * lock itself - as the name says, the caller is expected to hold q->lock
 * already.
 */
void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
{
	__wake_up_common(q, mode, 1, 0, NULL);
}
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2427{
2428 unsigned long flags;
2429 int sync = 1;
2430
2431 if (unlikely(!q))
2432 return;
2433
2434 if (unlikely(!nr_exclusive))
2435 sync = 0;
2436
2437 spin_lock_irqsave(&q->lock, flags);
2438 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
2439 spin_unlock_irqrestore(&q->lock, flags);
2440}
2441EXPORT_SYMBOL_GPL(__wake_up_sync);
2442
2443void fastcall complete(struct completion *x)
2444{
2445 unsigned long flags;
2446
2447 spin_lock_irqsave(&x->wait.lock, flags);
2448 x->done++;
2449 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2450 1, 0, NULL);
2451 spin_unlock_irqrestore(&x->wait.lock, flags);
2452}
2453EXPORT_SYMBOL(complete);
2454
2455void fastcall complete_all(struct completion *x)
2456{
2457 unsigned long flags;
2458
2459 spin_lock_irqsave(&x->wait.lock, flags);
2460 x->done += UINT_MAX/2;
2461 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2462 0, 0, NULL);
2463 spin_unlock_irqrestore(&x->wait.lock, flags);
2464}
2465EXPORT_SYMBOL(complete_all);
2466
2467void fastcall __sched wait_for_completion(struct completion *x)
2468{
2469 might_sleep();
2470 spin_lock_irq(&x->wait.lock);
2471 if (!x->done) {
2472 DECLARE_WAITQUEUE(wait, current);
2473
2474 wait.flags |= WQ_FLAG_EXCLUSIVE;
2475 __add_wait_queue_tail(&x->wait, &wait);
2476 do {
2477 __set_current_state(TASK_UNINTERRUPTIBLE);
2478 spin_unlock_irq(&x->wait.lock);
2479 schedule();
2480 spin_lock_irq(&x->wait.lock);
2481 } while (!x->done);
2482 __remove_wait_queue(&x->wait, &wait);
2483 }
2484 x->done--;
2485 spin_unlock_irq(&x->wait.lock);
2486}
2487EXPORT_SYMBOL(wait_for_completion);
2488
/*
 * Building blocks shared by the sleep_on() family below. They are statement
 * fragments, not expressions, and rely on the caller having a wait queue
 * head pointer named 'q' in scope.
 *
 * SLEEP_ON_VAR declares 'flags' and a wait queue entry 'wait' for current.
 */
#define	SLEEP_ON_VAR					\
	unsigned long flags;				\
	wait_queue_t wait;				\
	init_waitqueue_entry(&wait, current);

/*
 * SLEEP_ON_HEAD adds 'wait' to the queue under q->lock. The lock is dropped
 * with plain spin_unlock(), so the interrupt state saved in 'flags' is not
 * restored here.
 */
#define SLEEP_ON_HEAD					\
	spin_lock_irqsave(&q->lock,flags);		\
	__add_wait_queue(q, &wait);			\
	spin_unlock(&q->lock);

/*
 * SLEEP_ON_TAIL removes 'wait' from the queue again and restores the
 * interrupt state that SLEEP_ON_HEAD saved in 'flags'.
 */
#define	SLEEP_ON_TAIL					\
	spin_lock_irq(&q->lock);			\
	__remove_wait_queue(q, &wait);			\
	spin_unlock_irqrestore(&q->lock, flags);
2503
/*
 * interruptible_sleep_on - put the current task on wait queue @q in
 * TASK_INTERRUPTIBLE state and call schedule(); the task is removed from
 * the queue again once schedule() returns.
 */
void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
{
	SLEEP_ON_VAR

	/* The state is set before the task is queued. */
	current->state = TASK_INTERRUPTIBLE;

	SLEEP_ON_HEAD
	schedule();
	SLEEP_ON_TAIL
}

EXPORT_SYMBOL(interruptible_sleep_on);
2516
/*
 * interruptible_sleep_on_timeout - like interruptible_sleep_on(), but
 * sleeps through schedule_timeout(@timeout).
 *
 * Returns the value returned by schedule_timeout().
 */
long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	SLEEP_ON_VAR

	/* The state is set before the task is queued. */
	current->state = TASK_INTERRUPTIBLE;

	SLEEP_ON_HEAD
	timeout = schedule_timeout(timeout);
	SLEEP_ON_TAIL

	return timeout;
}

EXPORT_SYMBOL(interruptible_sleep_on_timeout);
2531
/*
 * sleep_on - put the current task on wait queue @q in TASK_UNINTERRUPTIBLE
 * state and call schedule(); the task is removed from the queue again once
 * schedule() returns.
 */
void fastcall __sched sleep_on(wait_queue_head_t *q)
{
	SLEEP_ON_VAR

	/* The state is set before the task is queued. */
	current->state = TASK_UNINTERRUPTIBLE;

	SLEEP_ON_HEAD
	schedule();
	SLEEP_ON_TAIL
}

EXPORT_SYMBOL(sleep_on);
2544
/*
 * sleep_on_timeout - like sleep_on(), but sleeps through
 * schedule_timeout(@timeout).
 *
 * Returns the value returned by schedule_timeout().
 */
long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	SLEEP_ON_VAR

	/* The state is set before the task is queued. */
	current->state = TASK_UNINTERRUPTIBLE;

	SLEEP_ON_HEAD
	timeout = schedule_timeout(timeout);
	SLEEP_ON_TAIL

	return timeout;
}

EXPORT_SYMBOL(sleep_on_timeout);
2559
/*
 * set_user_nice - change the nice value of task @p.
 * @p:    task to modify
 * @nice: new nice value; the call is a no-op if it is outside -20..19 or
 *        equal to the task's current nice value
 */
void set_user_nice(task_t *p, long nice)
{
	unsigned long flags;
	prio_array_t *array;
	runqueue_t *rq;
	int old_prio, new_prio, delta;

	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
		return;

	/* The task's runqueue must be locked while its priority changes. */
	rq = task_rq_lock(p, &flags);

	/*
	 * For real-time tasks only static_prio is recorded; p->prio and the
	 * task's queue position are left alone.
	 */
	if (rt_task(p)) {
		p->static_prio = NICE_TO_PRIO(nice);
		goto out_unlock;
	}
	/* A queued task is taken off its array while the priority changes. */
	array = p->array;
	if (array)
		dequeue_task(p, array);

	/*
	 * delta is measured from the current dynamic priority, so after the
	 * adjustment p->prio equals the new static priority.
	 */
	old_prio = p->prio;
	new_prio = NICE_TO_PRIO(nice);
	delta = new_prio - old_prio;
	p->static_prio = NICE_TO_PRIO(nice);
	p->prio += delta;

	if (array) {
		enqueue_task(p, array);

		/*
		 * Reschedule if the task's priority value went down
		 * (delta < 0), or if it went up and the task is currently
		 * running.
		 */
		if (delta < 0 || (delta > 0 && task_running(rq, p)))
			resched_task(rq->curr);
	}
out_unlock:
	task_rq_unlock(rq, &flags);
}

EXPORT_SYMBOL(set_user_nice);
2608
2609#ifdef __ARCH_WANT_SYS_NICE
2610
2611
2612
2613
2614
2615
2616
2617
2618asmlinkage long sys_nice(int increment)
2619{
2620 int retval;
2621 long nice;
2622
2623
2624
2625
2626
2627
2628 if (increment < 0) {
2629 if (!capable(CAP_SYS_NICE))
2630 return -EPERM;
2631 if (increment < -40)
2632 increment = -40;
2633 }
2634 if (increment > 40)
2635 increment = 40;
2636
2637 nice = PRIO_TO_NICE(current->static_prio) + increment;
2638 if (nice < -20)
2639 nice = -20;
2640 if (nice > 19)
2641 nice = 19;
2642
2643 retval = security_task_setnice(current, nice);
2644 if (retval)
2645 return retval;
2646
2647 set_user_nice(current, nice);
2648 return 0;
2649}
2650
2651#endif
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661int task_prio(const task_t *p)
2662{
2663 return p->prio - MAX_RT_PRIO;
2664}
2665
2666
2667
2668
2669
2670int task_nice(const task_t *p)
2671{
2672 return TASK_NICE(p);
2673}
2674
2675EXPORT_SYMBOL(task_nice);
2676
2677
2678
2679
2680
2681int idle_cpu(int cpu)
2682{
2683 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
2684}
2685
2686EXPORT_SYMBOL_GPL(idle_cpu);
2687
2688
2689
2690
2691
2692static inline task_t *find_process_by_pid(pid_t pid)
2693{
2694 return pid ? find_task_by_pid(pid) : current;
2695}
2696
2697
2698static void __setscheduler(struct task_struct *p, int policy, int prio)
2699{
2700 BUG_ON(p->array);
2701 p->policy = policy;
2702 p->rt_priority = prio;
2703 if (policy != SCHED_NORMAL)
2704 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
2705 else
2706 p->prio = p->static_prio;
2707}
2708
2709
2710
2711
/*
 * setscheduler - change the scheduling policy and/or RT priority of a task.
 * @pid:    target pid (0 means the calling task)
 * @policy: new policy, or a negative value to keep the current one
 * @param:  user pointer to the new scheduling parameters
 *
 * Returns 0 on success or a negative errno: -EINVAL for bad arguments,
 * -EFAULT if @param cannot be read, -ESRCH if the task does not exist,
 * -EPERM for insufficient privileges, or the security hook's error.
 */
static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
	struct sched_param lp;
	int retval = -EINVAL;
	int oldprio;
	prio_array_t *array;
	unsigned long flags;
	runqueue_t *rq;
	task_t *p;

	if (!param || pid < 0)
		goto out_nounlock;

	retval = -EFAULT;
	if (copy_from_user(&lp, param, sizeof(struct sched_param)))
		goto out_nounlock;

	/* Hold tasklist_lock for reading while the task is looked up and used. */
	read_lock_irq(&tasklist_lock);

	p = find_process_by_pid(pid);

	retval = -ESRCH;
	if (!p)
		goto out_unlock_tasklist;

	/*
	 * Lock the task's runqueue so it cannot be scheduled or requeued
	 * while its policy and priority are being changed.
	 */
	rq = task_rq_lock(p, &flags);

	if (policy < 0)
		policy = p->policy;
	else {
		retval = -EINVAL;
		if (policy != SCHED_FIFO && policy != SCHED_RR &&
				policy != SCHED_NORMAL)
			goto out_unlock;
	}

	/*
	 * Valid priorities are 0..MAX_USER_RT_PRIO-1, and the priority must
	 * be 0 exactly when the policy is SCHED_NORMAL.
	 */
	retval = -EINVAL;
	if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
		goto out_unlock;
	if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
		goto out_unlock;

	/*
	 * Real-time policies, and changing tasks of other users, require
	 * CAP_SYS_NICE.
	 */
	retval = -EPERM;
	if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
	    !capable(CAP_SYS_NICE))
		goto out_unlock;
	if ((current->euid != p->euid) && (current->euid != p->uid) &&
	    !capable(CAP_SYS_NICE))
		goto out_unlock;

	retval = security_task_setscheduler(p, policy, &lp);
	if (retval)
		goto out_unlock;

	/* __setscheduler() requires the task to be off its array. */
	array = p->array;
	if (array)
		deactivate_task(p, task_rq(p));
	retval = 0;
	oldprio = p->prio;
	__setscheduler(p, policy, lp.sched_priority);
	if (array) {
		__activate_task(p, task_rq(p));
		/*
		 * Reschedule if the task is running and its priority value
		 * went up, or if it is not running but now preempts the
		 * current task of its runqueue.
		 */
		if (task_running(rq, p)) {
			if (p->prio > oldprio)
				resched_task(rq->curr);
		} else if (TASK_PREEMPTS_CURR(p, rq))
			resched_task(rq->curr);
	}

out_unlock:
	task_rq_unlock(rq, &flags);
out_unlock_tasklist:
	read_unlock_irq(&tasklist_lock);

out_nounlock:
	return retval;
}
2805
2806
2807
2808
2809
2810
2811
/*
 * sys_sched_setscheduler - set the scheduling policy and parameters of a
 * task.
 * @pid:    pid of the target task
 * @policy: new policy
 * @param:  user pointer to the new scheduling parameters
 *
 * Thin wrapper around setscheduler(); returns its result.
 */
asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
				       struct sched_param __user *param)
{
	return setscheduler(pid, policy, param);
}
2817
2818
2819
2820
2821
2822
/*
 * sys_sched_setparam - set the scheduling parameters of a task while
 * keeping its current policy.
 * @pid:   pid of the target task
 * @param: user pointer to the new scheduling parameters
 *
 * Passes a policy of -1 to setscheduler(), which it treats as "keep the
 * task's current policy".
 */
asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
{
	return setscheduler(pid, -1, param);
}
2827
2828
2829
2830
2831
2832asmlinkage long sys_sched_getscheduler(pid_t pid)
2833{
2834 int retval = -EINVAL;
2835 task_t *p;
2836
2837 if (pid < 0)
2838 goto out_nounlock;
2839
2840 retval = -ESRCH;
2841 read_lock(&tasklist_lock);
2842 p = find_process_by_pid(pid);
2843 if (p) {
2844 retval = security_task_getscheduler(p);
2845 if (!retval)
2846 retval = p->policy;
2847 }
2848 read_unlock(&tasklist_lock);
2849
2850out_nounlock:
2851 return retval;
2852}
2853
2854
2855
2856
2857
2858
2859asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
2860{
2861 struct sched_param lp;
2862 int retval = -EINVAL;
2863 task_t *p;
2864
2865 if (!param || pid < 0)
2866 goto out_nounlock;
2867
2868 read_lock(&tasklist_lock);
2869 p = find_process_by_pid(pid);
2870 retval = -ESRCH;
2871 if (!p)
2872 goto out_unlock;
2873
2874 retval = security_task_getscheduler(p);
2875 if (retval)
2876 goto out_unlock;
2877
2878 lp.sched_priority = p->rt_priority;
2879 read_unlock(&tasklist_lock);
2880
2881
2882
2883
2884 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
2885
2886out_nounlock:
2887 return retval;
2888
2889out_unlock:
2890 read_unlock(&tasklist_lock);
2891 return retval;
2892}
2893
2894
2895
2896
2897
2898
2899
/*
 * sys_sched_setaffinity - set the CPU affinity mask of a task.
 * @pid:           pid of the target task (0 means the caller)
 * @len:           length in bytes of the mask at @user_mask_ptr; must be at
 *                 least sizeof(cpumask_t)
 * @user_mask_ptr: user pointer to the new CPU mask
 *
 * Returns the result of set_cpus_allowed() on success, or -EINVAL, -EFAULT,
 * -ESRCH or -EPERM.
 */
asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
				      unsigned long __user *user_mask_ptr)
{
	cpumask_t new_mask;
	int retval;
	task_t *p;

	if (len < sizeof(new_mask))
		return -EINVAL;

	if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
		return -EFAULT;

	lock_cpu_hotplug();
	read_lock(&tasklist_lock);

	p = find_process_by_pid(pid);
	if (!p) {
		read_unlock(&tasklist_lock);
		unlock_cpu_hotplug();
		return -ESRCH;
	}

	/*
	 * Take a reference on the task before dropping tasklist_lock, so
	 * the task struct stays valid for the rest of this function
	 * (set_cpus_allowed() may sleep in wait_for_completion()).
	 */
	get_task_struct(p);
	read_unlock(&tasklist_lock);

	/* Changing another user's task requires CAP_SYS_NICE. */
	retval = -EPERM;
	if ((current->euid != p->euid) && (current->euid != p->uid) &&
			!capable(CAP_SYS_NICE))
		goto out_unlock;

	retval = set_cpus_allowed(p, new_mask);

out_unlock:
	put_task_struct(p);
	unlock_cpu_hotplug();
	return retval;
}
2943
2944
2945
2946
2947
2948
2949
2950
/*
 * CPU bitmaps. cpu_present_map is defined and exported here in every
 * configuration; cpu_online_map and cpu_possible_map are only defined here,
 * with all bits set, when CONFIG_SMP is off (presumably SMP builds define
 * them elsewhere - confirm).
 */
cpumask_t cpu_present_map;
EXPORT_SYMBOL(cpu_present_map);

#ifndef CONFIG_SMP
cpumask_t cpu_online_map = CPU_MASK_ALL;
cpumask_t cpu_possible_map = CPU_MASK_ALL;
#endif
2958
2959
2960
2961
2962
2963
2964
2965asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
2966 unsigned long __user *user_mask_ptr)
2967{
2968 unsigned int real_len;
2969 cpumask_t mask;
2970 int retval;
2971 task_t *p;
2972
2973 real_len = sizeof(mask);
2974 if (len < real_len)
2975 return -EINVAL;
2976
2977 lock_cpu_hotplug();
2978 read_lock(&tasklist_lock);
2979
2980 retval = -ESRCH;
2981 p = find_process_by_pid(pid);
2982 if (!p)
2983 goto out_unlock;
2984
2985 retval = 0;
2986 cpus_and(mask, p->cpus_allowed, cpu_possible_map);
2987
2988out_unlock:
2989 read_unlock(&tasklist_lock);
2990 unlock_cpu_hotplug();
2991 if (retval)
2992 return retval;
2993 if (copy_to_user(user_mask_ptr, &mask, real_len))
2994 return -EFAULT;
2995 return real_len;
2996}
2997
2998
2999
3000
3001
3002
3003
3004
/*
 * sys_sched_yield - yield the current processor to other tasks.
 *
 * Moves the calling task to the expired array (or, for a real-time task,
 * requeues it on the active array) and then calls schedule(). Always
 * returns 0.
 */
asmlinkage long sys_sched_yield(void)
{
	runqueue_t *rq = this_rq_lock();
	prio_array_t *array = current->array;
	prio_array_t *target = rq->expired;

	/*
	 * Real-time tasks are never moved to the expired array; they are
	 * requeued on the active array instead.
	 */
	if (unlikely(rt_task(current)))
		target = rq->active;

	dequeue_task(current, array);
	enqueue_task(current, target);

	/*
	 * Drop the lock and re-enable preemption without checking for a
	 * pending reschedule: schedule() is called right below anyway.
	 */
	_raw_spin_unlock(&rq->lock);
	preempt_enable_no_resched();

	schedule();

	return 0;
}
3035
/*
 * __cond_resched - mark the current task runnable and call schedule().
 */
void __sched __cond_resched(void)
{
	set_current_state(TASK_RUNNING);
	schedule();
}

EXPORT_SYMBOL(__cond_resched);
3043
3044
3045
3046
3047
3048
3049
/*
 * yield - in-kernel way to yield the current processor: marks the task
 * runnable and then behaves like the sched_yield() system call.
 */
void __sched yield(void)
{
	set_current_state(TASK_RUNNING);
	sys_sched_yield();
}

EXPORT_SYMBOL(yield);
3057
3058
3059
3060
3061
3062
3063
3064
3065void __sched io_schedule(void)
3066{
3067 struct runqueue *rq = this_rq();
3068
3069 atomic_inc(&rq->nr_iowait);
3070 schedule();
3071 atomic_dec(&rq->nr_iowait);
3072}
3073
3074EXPORT_SYMBOL(io_schedule);
3075
3076long __sched io_schedule_timeout(long timeout)
3077{
3078 struct runqueue *rq = this_rq();
3079 long ret;
3080
3081 atomic_inc(&rq->nr_iowait);
3082 ret = schedule_timeout(timeout);
3083 atomic_dec(&rq->nr_iowait);
3084 return ret;
3085}
3086
3087
3088
3089
3090
3091
3092
3093
3094asmlinkage long sys_sched_get_priority_max(int policy)
3095{
3096 int ret = -EINVAL;
3097
3098 switch (policy) {
3099 case SCHED_FIFO:
3100 case SCHED_RR:
3101 ret = MAX_USER_RT_PRIO-1;
3102 break;
3103 case SCHED_NORMAL:
3104 ret = 0;
3105 break;
3106 }
3107 return ret;
3108}
3109
3110
3111
3112
3113
3114
3115
3116
3117asmlinkage long sys_sched_get_priority_min(int policy)
3118{
3119 int ret = -EINVAL;
3120
3121 switch (policy) {
3122 case SCHED_FIFO:
3123 case SCHED_RR:
3124 ret = 1;
3125 break;
3126 case SCHED_NORMAL:
3127 ret = 0;
3128 }
3129 return ret;
3130}
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140asmlinkage
3141long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
3142{
3143 int retval = -EINVAL;
3144 struct timespec t;
3145 task_t *p;
3146
3147 if (pid < 0)
3148 goto out_nounlock;
3149
3150 retval = -ESRCH;
3151 read_lock(&tasklist_lock);
3152 p = find_process_by_pid(pid);
3153 if (!p)
3154 goto out_unlock;
3155
3156 retval = security_task_getscheduler(p);
3157 if (retval)
3158 goto out_unlock;
3159
3160 jiffies_to_timespec(p->policy & SCHED_FIFO ?
3161 0 : task_timeslice(p), &t);
3162 read_unlock(&tasklist_lock);
3163 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
3164out_nounlock:
3165 return retval;
3166out_unlock:
3167 read_unlock(&tasklist_lock);
3168 return retval;
3169}
3170
3171static inline struct task_struct *eldest_child(struct task_struct *p)
3172{
3173 if (list_empty(&p->children)) return NULL;
3174 return list_entry(p->children.next,struct task_struct,sibling);
3175}
3176
3177static inline struct task_struct *older_sibling(struct task_struct *p)
3178{
3179 if (p->sibling.prev==&p->parent->children) return NULL;
3180 return list_entry(p->sibling.prev,struct task_struct,sibling);
3181}
3182
3183static inline struct task_struct *younger_sibling(struct task_struct *p)
3184{
3185 if (p->sibling.next==&p->parent->children) return NULL;
3186 return list_entry(p->sibling.next,struct task_struct,sibling);
3187}
3188
/*
 * show_task - print one line of state for task @p to the kernel log,
 * followed by its stack trace unless the task is running.
 *
 * The line contains the command name, a state letter, the saved PC (or
 * "running"), the free stack figure (0 unless CONFIG_DEBUG_STACK_USAGE),
 * the pid, the parent's pid, and the pids of the eldest child and of the
 * younger and older siblings where they exist.
 */
static void show_task(task_t * p)
{
	task_t *relative;
	unsigned state;
	unsigned long free = 0;
	static const char *stat_nam[] = { "R", "S", "D", "T", "Z", "W" };

	printk("%-13.13s ", p->comm);
	/* Map the state to an index: 0 for runnable, else lowest set bit + 1. */
	state = p->state ? __ffs(p->state) + 1 : 0;
	if (state < ARRAY_SIZE(stat_nam))
		printk(stat_nam[state]);
	else
		printk("?");
#if (BITS_PER_LONG == 32)
	if (state == TASK_RUNNING)
		printk(" running ");
	else
		printk(" %08lX ", thread_saved_pc(p));
#else
	if (state == TASK_RUNNING)
		printk(" running task ");
	else
		printk(" %016lx ", thread_saved_pc(p));
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
	{
		/*
		 * Scan upwards from just past thread_info for the first
		 * non-zero word; the distance covered is reported as free
		 * stack. Presumably this relies on unused stack being zero.
		 */
		unsigned long * n = (unsigned long *) (p->thread_info+1);
		while (!*n)
			n++;
		free = (unsigned long) n - (unsigned long)(p->thread_info+1);
	}
#endif
	printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
	if ((relative = eldest_child(p)))
		printk("%5d ", relative->pid);
	else
		printk(" ");
	if ((relative = younger_sibling(p)))
		printk("%7d", relative->pid);
	else
		printk(" ");
	if ((relative = older_sibling(p)))
		printk(" %5d", relative->pid);
	else
		printk(" ");
	/* Tasks without an mm are tagged L-TLB, all others NOTLB. */
	if (!p->mm)
		printk(" (L-TLB)\n");
	else
		printk(" (NOTLB)\n");

	if (state != TASK_RUNNING)
		show_stack(p, NULL);
}
3242
/*
 * show_state - print a header and then one show_task() entry for every
 * thread in the system, holding tasklist_lock for reading throughout.
 */
void show_state(void)
{
	task_t *g, *p;

#if (BITS_PER_LONG == 32)
	printk("\n"
	       " sibling\n");
	printk(" task PC pid father child younger older\n");
#else
	printk("\n"
	       " sibling\n");
	printk(" task PC pid father child younger older\n");
#endif
	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		/*
		 * Printing every task can take a long time; poke the NMI
		 * watchdog on each iteration so it does not fire meanwhile.
		 */
		touch_nmi_watchdog();
		show_task(p);
	} while_each_thread(g, p);

	read_unlock(&tasklist_lock);
}
3268
/*
 * init_idle - set up @idle as the idle task of @cpu.
 *
 * Installs the task as both the current and the idle task of the CPU's
 * runqueue, removes it from the runqueue it was on, and marks it for
 * rescheduling.
 */
void __devinit init_idle(task_t *idle, int cpu)
{
	runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
	unsigned long flags;

	local_irq_save(flags);
	/* Both the target runqueue and the task's old one are locked. */
	double_rq_lock(idle_rq, rq);

	idle_rq->curr = idle_rq->idle = idle;
	deactivate_task(idle, rq);
	/* The idle task sits on no priority array and gets prio MAX_PRIO. */
	idle->array = NULL;
	idle->prio = MAX_PRIO;
	idle->state = TASK_RUNNING;
	set_task_cpu(idle, cpu);
	double_rq_unlock(idle_rq, rq);
	set_tsk_need_resched(idle);
	local_irq_restore(flags);

	/*
	 * Set the preempt count: with CONFIG_PREEMPT it is 1 if the task
	 * has a non-negative lock_depth and 0 otherwise; without it, 0.
	 */
#ifdef CONFIG_PREEMPT
	idle->thread_info->preempt_count = (idle->lock_depth >= 0);
#else
	idle->thread_info->preempt_count = 0;
#endif
}
3294
3295
3296
3297
3298
3299
3300
3301
/*
 * CPU mask that starts out empty. NOTE(review): nothing in this part of the
 * file reads or writes it; presumably it tracks CPUs whose periodic tick is
 * stopped - confirm against its users.
 */
cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
3303
3304#ifdef CONFIG_SMP
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
/*
 * set_cpus_allowed - change the set of CPUs task @p may run on.
 * @p:        task to modify
 * @new_mask: new CPU affinity mask
 *
 * If the task's current CPU is not in @new_mask, the task is migrated to
 * an online CPU from the mask; this may involve sleeping until the
 * migration thread has handled the request, so the caller must be able to
 * sleep.
 *
 * Returns 0 on success, or -EINVAL if @new_mask contains no online CPU.
 */
int set_cpus_allowed(task_t *p, cpumask_t new_mask)
{
	unsigned long flags;
	int ret = 0;
	migration_req_t req;
	runqueue_t *rq;

	rq = task_rq_lock(p, &flags);
	if (!cpus_intersects(new_mask, cpu_online_map)) {
		ret = -EINVAL;
		goto out;
	}

	p->cpus_allowed = new_mask;
	/* If the task's current CPU is still allowed, we are done. */
	if (cpu_isset(task_cpu(p), new_mask))
		goto out;

	if (migrate_task(p, any_online_cpu(new_mask), &req)) {
		/*
		 * The migration thread has to do the move: drop the lock,
		 * wake it, and wait for the on-stack request to complete.
		 */
		task_rq_unlock(rq, &flags);
		wake_up_process(rq->migration_thread);
		wait_for_completion(&req.done);
		tlb_migrate_finish(p->mm);
		return 0;
	}
out:
	task_rq_unlock(rq, &flags);
	return ret;
}

EXPORT_SYMBOL_GPL(set_cpus_allowed);
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
3373{
3374 runqueue_t *rq_dest, *rq_src;
3375
3376 if (unlikely(cpu_is_offline(dest_cpu)))
3377 return;
3378
3379 rq_src = cpu_rq(src_cpu);
3380 rq_dest = cpu_rq(dest_cpu);
3381
3382 double_rq_lock(rq_src, rq_dest);
3383
3384 if (task_cpu(p) != src_cpu)
3385 goto out;
3386
3387 if (!cpu_isset(dest_cpu, p->cpus_allowed))
3388 goto out;
3389
3390 set_task_cpu(p, dest_cpu);
3391 if (p->array) {
3392
3393
3394
3395
3396
3397
3398 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
3399 + rq_dest->timestamp_last_tick;
3400 deactivate_task(p, rq_src);
3401 activate_task(p, rq_dest, 0);
3402 if (TASK_PREEMPTS_CURR(p, rq_dest))
3403 resched_task(rq_dest->curr);
3404 }
3405
3406out:
3407 double_rq_unlock(rq_src, rq_dest);
3408}
3409
3410
3411
3412
3413
3414
3415static int migration_thread(void * data)
3416{
3417 runqueue_t *rq;
3418 int cpu = (long)data;
3419
3420 rq = cpu_rq(cpu);
3421 BUG_ON(rq->migration_thread != current);
3422
3423 set_current_state(TASK_INTERRUPTIBLE);
3424 while (!kthread_should_stop()) {
3425 struct list_head *head;
3426 migration_req_t *req;
3427
3428 if (current->flags & PF_FREEZE)
3429 refrigerator(PF_FREEZE);
3430
3431 spin_lock_irq(&rq->lock);
3432
3433 if (cpu_is_offline(cpu)) {
3434 spin_unlock_irq(&rq->lock);
3435 goto wait_to_die;
3436 }
3437
3438 if (rq->active_balance) {
3439 active_load_balance(rq, cpu);
3440 rq->active_balance = 0;
3441 }
3442
3443 head = &rq->migration_queue;
3444
3445 if (list_empty(head)) {
3446 spin_unlock_irq(&rq->lock);
3447 schedule();
3448 set_current_state(TASK_INTERRUPTIBLE);
3449 continue;
3450 }
3451 req = list_entry(head->next, migration_req_t, list);
3452 list_del_init(head->next);
3453
3454 if (req->type == REQ_MOVE_TASK) {
3455 spin_unlock(&rq->lock);
3456 __migrate_task(req->task, smp_processor_id(),
3457 req->dest_cpu);
3458 local_irq_enable();
3459 } else if (req->type == REQ_SET_DOMAIN) {
3460 rq->sd = req->sd;
3461 spin_unlock_irq(&rq->lock);
3462 } else {
3463 spin_unlock_irq(&rq->lock);
3464 WARN_ON(1);
3465 }
3466
3467 complete(&req->done);
3468 }
3469 __set_current_state(TASK_RUNNING);
3470 return 0;
3471
3472wait_to_die:
3473
3474 set_current_state(TASK_INTERRUPTIBLE);
3475 while (!kthread_should_stop()) {
3476 schedule();
3477 set_current_state(TASK_INTERRUPTIBLE);
3478 }
3479 __set_current_state(TASK_RUNNING);
3480 return 0;
3481}
3482
3483#ifdef CONFIG_HOTPLUG_CPU
3484
3485static void migrate_all_tasks(int src_cpu)
3486{
3487 struct task_struct *tsk, *t;
3488 int dest_cpu;
3489 unsigned int node;
3490
3491 write_lock_irq(&tasklist_lock);
3492
3493
3494 node = cpu_to_node(src_cpu);
3495
3496 do_each_thread(t, tsk) {
3497 cpumask_t mask;
3498 if (tsk == current)
3499 continue;
3500
3501 if (task_cpu(tsk) != src_cpu)
3502 continue;
3503
3504
3505
3506
3507
3508 mask = node_to_cpumask(node);
3509 cpus_and(mask, mask, tsk->cpus_allowed);
3510 dest_cpu = any_online_cpu(mask);
3511 if (dest_cpu == NR_CPUS)
3512 dest_cpu = any_online_cpu(tsk->cpus_allowed);
3513 if (dest_cpu == NR_CPUS) {
3514 cpus_setall(tsk->cpus_allowed);
3515 dest_cpu = any_online_cpu(tsk->cpus_allowed);
3516
3517
3518
3519
3520 if (tsk->mm && printk_ratelimit())
3521 printk(KERN_INFO "process %d (%s) no "
3522 "longer affine to cpu%d\n",
3523 tsk->pid, tsk->comm, src_cpu);
3524 }
3525
3526 __migrate_task(tsk, src_cpu, dest_cpu);
3527 } while_each_thread(t, tsk);
3528
3529 write_unlock_irq(&tasklist_lock);
3530}
3531
3532
3533
3534
3535
3536void sched_idle_next(void)
3537{
3538 int cpu = smp_processor_id();
3539 runqueue_t *rq = this_rq();
3540 struct task_struct *p = rq->idle;
3541 unsigned long flags;
3542
3543
3544 BUG_ON(cpu_online(cpu));
3545
3546
3547
3548
3549 spin_lock_irqsave(&rq->lock, flags);
3550
3551 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
3552
3553 __activate_idle_task(p, rq);
3554
3555 spin_unlock_irqrestore(&rq->lock, flags);
3556}
3557#endif
3558
3559
3560
3561
3562
3563static int migration_call(struct notifier_block *nfb, unsigned long action,
3564 void *hcpu)
3565{
3566 int cpu = (long)hcpu;
3567 struct task_struct *p;
3568 struct runqueue *rq;
3569 unsigned long flags;
3570
3571 switch (action) {
3572 case CPU_UP_PREPARE:
3573 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
3574 if (IS_ERR(p))
3575 return NOTIFY_BAD;
3576 p->flags |= PF_NOFREEZE;
3577 kthread_bind(p, cpu);
3578
3579 rq = task_rq_lock(p, &flags);
3580 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
3581 task_rq_unlock(rq, &flags);
3582 cpu_rq(cpu)->migration_thread = p;
3583 break;
3584 case CPU_ONLINE:
3585
3586 wake_up_process(cpu_rq(cpu)->migration_thread);
3587 break;
3588#ifdef CONFIG_HOTPLUG_CPU
3589 case CPU_UP_CANCELED:
3590
3591 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
3592 kthread_stop(cpu_rq(cpu)->migration_thread);
3593 cpu_rq(cpu)->migration_thread = NULL;
3594 break;
3595 case CPU_DEAD:
3596 migrate_all_tasks(cpu);
3597 rq = cpu_rq(cpu);
3598 kthread_stop(rq->migration_thread);
3599 rq->migration_thread = NULL;
3600
3601 rq = task_rq_lock(rq->idle, &flags);
3602 deactivate_task(rq->idle, rq);
3603 rq->idle->static_prio = MAX_PRIO;
3604 __setscheduler(rq->idle, SCHED_NORMAL, 0);
3605 task_rq_unlock(rq, &flags);
3606 BUG_ON(rq->nr_running != 0);
3607
3608
3609
3610
3611 spin_lock_irq(&rq->lock);
3612 while (!list_empty(&rq->migration_queue)) {
3613 migration_req_t *req;
3614 req = list_entry(rq->migration_queue.next,
3615 migration_req_t, list);
3616 BUG_ON(req->type != REQ_MOVE_TASK);
3617 list_del_init(&req->list);
3618 complete(&req->done);
3619 }
3620 spin_unlock_irq(&rq->lock);
3621 break;
3622#endif
3623 }
3624 return NOTIFY_OK;
3625}
3626
3627
3628
3629
3630static struct notifier_block __devinitdata migration_notifier = {
3631 .notifier_call = migration_call,
3632 .priority = 10
3633};
3634
3635int __init migration_init(void)
3636{
3637 void *cpu = (void *)(long)smp_processor_id();
3638
3639 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
3640 migration_call(&migration_notifier, CPU_ONLINE, cpu);
3641 register_cpu_notifier(&migration_notifier);
3642 return 0;
3643}
3644#endif
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
3659EXPORT_SYMBOL(kernel_flag);
3660
3661#ifdef CONFIG_SMP
3662
3663void cpu_attach_domain(struct sched_domain *sd, int cpu)
3664{
3665 migration_req_t req;
3666 unsigned long flags;
3667 runqueue_t *rq = cpu_rq(cpu);
3668 int local = 1;
3669
3670 lock_cpu_hotplug();
3671
3672 spin_lock_irqsave(&rq->lock, flags);
3673
3674 if (cpu == smp_processor_id() || !cpu_online(cpu)) {
3675 rq->sd = sd;
3676 } else {
3677 init_completion(&req.done);
3678 req.type = REQ_SET_DOMAIN;
3679 req.sd = sd;
3680 list_add(&req.list, &rq->migration_queue);
3681 local = 0;
3682 }
3683
3684 spin_unlock_irqrestore(&rq->lock, flags);
3685
3686 if (!local) {
3687 wake_up_process(rq->migration_thread);
3688 wait_for_completion(&req.done);
3689 }
3690
3691 unlock_cpu_hotplug();
3692}
3693
3694#ifdef ARCH_HAS_SCHED_DOMAIN
3695extern void __init arch_init_sched_domains(void);
3696#else
3697static struct sched_group sched_group_cpus[NR_CPUS];
3698static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
3699#ifdef CONFIG_NUMA
3700static struct sched_group sched_group_nodes[MAX_NUMNODES];
3701static DEFINE_PER_CPU(struct sched_domain, node_domains);
3702static void __init arch_init_sched_domains(void)
3703{
3704 int i;
3705 struct sched_group *first_node = NULL, *last_node = NULL;
3706
3707
3708 for_each_cpu(i) {
3709 int node = cpu_to_node(i);
3710 cpumask_t nodemask = node_to_cpumask(node);
3711 struct sched_domain *node_sd = &per_cpu(node_domains, i);
3712 struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
3713
3714 *node_sd = SD_NODE_INIT;
3715 node_sd->span = cpu_possible_map;
3716 node_sd->groups = &sched_group_nodes[cpu_to_node(i)];
3717
3718 *cpu_sd = SD_CPU_INIT;
3719 cpus_and(cpu_sd->span, nodemask, cpu_possible_map);
3720 cpu_sd->groups = &sched_group_cpus[i];
3721 cpu_sd->parent = node_sd;
3722 }
3723
3724
3725 for (i = 0; i < MAX_NUMNODES; i++) {
3726 cpumask_t tmp = node_to_cpumask(i);
3727 cpumask_t nodemask;
3728 struct sched_group *first_cpu = NULL, *last_cpu = NULL;
3729 struct sched_group *node = &sched_group_nodes[i];
3730 int j;
3731
3732 cpus_and(nodemask, tmp, cpu_possible_map);
3733
3734 if (cpus_empty(nodemask))
3735 continue;
3736
3737 node->cpumask = nodemask;
3738 node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask);
3739
3740 for_each_cpu_mask(j, node->cpumask) {
3741 struct sched_group *cpu = &sched_group_cpus[j];
3742
3743 cpus_clear(cpu->cpumask);
3744 cpu_set(j, cpu->cpumask);
3745 cpu->cpu_power = SCHED_LOAD_SCALE;
3746
3747 if (!first_cpu)
3748 first_cpu = cpu;
3749 if (last_cpu)
3750 last_cpu->next = cpu;
3751 last_cpu = cpu;
3752 }
3753 last_cpu->next = first_cpu;
3754
3755 if (!first_node)
3756 first_node = node;
3757 if (last_node)
3758 last_node->next = node;
3759 last_node = node;
3760 }
3761 last_node->next = first_node;
3762
3763 mb();
3764 for_each_cpu(i) {
3765 struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
3766 cpu_attach_domain(cpu_sd, i);
3767 }
3768}
3769
3770#else
3771static void __init arch_init_sched_domains(void)
3772{
3773 int i;
3774 struct sched_group *first_cpu = NULL, *last_cpu = NULL;
3775
3776
3777 for_each_cpu(i) {
3778 struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
3779
3780 *cpu_sd = SD_CPU_INIT;
3781 cpu_sd->span = cpu_possible_map;
3782 cpu_sd->groups = &sched_group_cpus[i];
3783 }
3784
3785
3786 for_each_cpu_mask(i, cpu_possible_map) {
3787 struct sched_group *cpu = &sched_group_cpus[i];
3788
3789 cpus_clear(cpu->cpumask);
3790 cpu_set(i, cpu->cpumask);
3791 cpu->cpu_power = SCHED_LOAD_SCALE;
3792
3793 if (!first_cpu)
3794 first_cpu = cpu;
3795 if (last_cpu)
3796 last_cpu->next = cpu;
3797 last_cpu = cpu;
3798 }
3799 last_cpu->next = first_cpu;
3800
3801 mb();
3802 for_each_cpu(i) {
3803 struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
3804 cpu_attach_domain(cpu_sd, i);
3805 }
3806}
3807
3808#endif
3809#endif
3810
3811#define SCHED_DOMAIN_DEBUG
3812#ifdef SCHED_DOMAIN_DEBUG
3813void sched_domain_debug(void)
3814{
3815 int i;
3816
3817 for_each_cpu(i) {
3818 runqueue_t *rq = cpu_rq(i);
3819 struct sched_domain *sd;
3820 int level = 0;
3821
3822 sd = rq->sd;
3823
3824 printk(KERN_DEBUG "CPU%d: %s\n",
3825 i, (cpu_online(i) ? " online" : "offline"));
3826
3827 do {
3828 int j;
3829 char str[NR_CPUS];
3830 struct sched_group *group = sd->groups;
3831 cpumask_t groupmask;
3832
3833 cpumask_scnprintf(str, NR_CPUS, sd->span);
3834 cpus_clear(groupmask);
3835
3836 printk(KERN_DEBUG);
3837 for (j = 0; j < level + 1; j++)
3838 printk(" ");
3839 printk("domain %d: span %s\n", level, str);
3840
3841 if (!cpu_isset(i, sd->span))
3842 printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i);
3843 if (!cpu_isset(i, group->cpumask))
3844 printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i);
3845 if (!group->cpu_power)
3846 printk(KERN_DEBUG "ERROR domain->cpu_power not set\n");
3847
3848 printk(KERN_DEBUG);
3849 for (j = 0; j < level + 2; j++)
3850 printk(" ");
3851 printk("groups:");
3852 do {
3853 if (!group) {
3854 printk(" ERROR: NULL");
3855 break;
3856 }
3857
3858 if (!cpus_weight(group->cpumask))
3859 printk(" ERROR empty group:");
3860
3861 if (cpus_intersects(groupmask, group->cpumask))
3862 printk(" ERROR repeated CPUs:");
3863
3864 cpus_or(groupmask, groupmask, group->cpumask);
3865
3866 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
3867 printk(" %s", str);
3868
3869 group = group->next;
3870 } while (group != sd->groups);
3871 printk("\n");
3872
3873 if (!cpus_equal(sd->span, groupmask))
3874 printk(KERN_DEBUG "ERROR groups don't span domain->span\n");
3875
3876 level++;
3877 sd = sd->parent;
3878
3879 if (sd) {
3880 if (!cpus_subset(groupmask, sd->span))
3881 printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n");
3882 }
3883
3884 } while (sd);
3885 }
3886}
3887#else
3888#define sched_domain_debug() {}
3889#endif
3890
3891void __init sched_init_smp(void)
3892{
3893 arch_init_sched_domains();
3894 sched_domain_debug();
3895}
3896#else
3897void __init sched_init_smp(void)
3898{
3899}
3900#endif
3901
/*
 * in_sched_functions - test whether @addr lies in the half-open range
 * [__sched_text_start, __sched_text_end).
 *
 * Returns non-zero if it does, 0 otherwise.
 */
int in_sched_functions(unsigned long addr)
{
	/* Range bounds are provided as external symbols. */
	extern char __sched_text_start[], __sched_text_end[];
	unsigned long lo = (unsigned long)__sched_text_start;
	unsigned long hi = (unsigned long)__sched_text_end;

	if (addr < lo)
		return 0;
	return addr < hi;
}
3909
3910void __init sched_init(void)
3911{
3912 runqueue_t *rq;
3913 int i, j, k;
3914
3915#ifdef CONFIG_SMP
3916
3917 static struct sched_domain sched_domain_init;
3918 static struct sched_group sched_group_init;
3919
3920 memset(&sched_domain_init, 0, sizeof(struct sched_domain));
3921 sched_domain_init.span = CPU_MASK_ALL;
3922 sched_domain_init.groups = &sched_group_init;
3923 sched_domain_init.last_balance = jiffies;
3924 sched_domain_init.balance_interval = INT_MAX;
3925 sched_domain_init.busy_factor = 1;
3926
3927 memset(&sched_group_init, 0, sizeof(struct sched_group));
3928 sched_group_init.cpumask = CPU_MASK_ALL;
3929 sched_group_init.next = &sched_group_init;
3930 sched_group_init.cpu_power = SCHED_LOAD_SCALE;
3931#endif
3932
3933 for (i = 0; i < NR_CPUS; i++) {
3934 prio_array_t *array;
3935
3936 rq = cpu_rq(i);
3937 spin_lock_init(&rq->lock);
3938 rq->active = rq->arrays;
3939 rq->expired = rq->arrays + 1;
3940 rq->best_expired_prio = MAX_PRIO;
3941
3942#ifdef CONFIG_SMP
3943 rq->sd = &sched_domain_init;
3944 rq->cpu_load = 0;
3945 rq->active_balance = 0;
3946 rq->push_cpu = 0;
3947 rq->migration_thread = NULL;
3948 INIT_LIST_HEAD(&rq->migration_queue);
3949#endif
3950 atomic_set(&rq->nr_iowait, 0);
3951
3952 for (j = 0; j < 2; j++) {
3953 array = rq->arrays + j;
3954 for (k = 0; k < MAX_PRIO; k++) {
3955 INIT_LIST_HEAD(array->queue + k);
3956 __clear_bit(k, array->bitmap);
3957 }
3958
3959 __set_bit(MAX_PRIO, array->bitmap);
3960 }
3961 }
3962
3963
3964
3965
3966 rq = this_rq();
3967 rq->curr = current;
3968 rq->idle = current;
3969 set_task_cpu(current, smp_processor_id());
3970 wake_up_forked_process(current);
3971
3972
3973
3974
3975 atomic_inc(&init_mm.mm_count);
3976 enter_lazy_tlb(&init_mm, current);
3977}
3978
3979#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
/*
 * __might_sleep - complain when a sleeping function is called from a
 * context that is atomic or has interrupts disabled.
 * @file: source file of the call site
 * @line: line number of the call site
 *
 * Only active once system_state is SYSTEM_RUNNING, and only when the
 * in_atomic macro is defined.  Reports are limited to one per HZ
 * jiffies; each report prints the call site and a stack dump.
 */
void __might_sleep(char *file, int line)
{
#if defined(in_atomic)
	static unsigned long prev_jiffy;	/* time of the last report */

	if (!in_atomic() && !irqs_disabled())
		return;
	if (system_state != SYSTEM_RUNNING)
		return;

	/* Rate limit; prev_jiffy == 0 means nothing has been reported yet. */
	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
		return;
	prev_jiffy = jiffies;

	printk(KERN_ERR "Debug: sleeping function called from invalid"
			" context at %s:%d\n", file, line);
	printk("in_atomic():%d, irqs_disabled():%d\n",
		in_atomic(), irqs_disabled());
	dump_stack();
#endif
}
3998EXPORT_SYMBOL(__might_sleep);
3999#endif
4000
4001
4002#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013void __sched __preempt_spin_lock(spinlock_t *lock)
4014{
4015 if (preempt_count() > 1) {
4016 _raw_spin_lock(lock);
4017 return;
4018 }
4019 do {
4020 preempt_enable();
4021 while (spin_is_locked(lock))
4022 cpu_relax();
4023 preempt_disable();
4024 } while (!_raw_spin_trylock(lock));
4025}
4026
4027EXPORT_SYMBOL(__preempt_spin_lock);
4028
4029void __sched __preempt_write_lock(rwlock_t *lock)
4030{
4031 if (preempt_count() > 1) {
4032 _raw_write_lock(lock);
4033 return;
4034 }
4035
4036 do {
4037 preempt_enable();
4038 while (rwlock_is_locked(lock))
4039 cpu_relax();
4040 preempt_disable();
4041 } while (!_raw_write_trylock(lock));
4042}
4043
4044EXPORT_SYMBOL(__preempt_write_lock);
4045#endif
4046