linux-bk/kernel/sched.c
   1/*
   2 *  kernel/sched.c
   3 *
   4 *  Kernel scheduler and related syscalls
   5 *
   6 *  Copyright (C) 1991-2002  Linus Torvalds
   7 *
   8 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   9 *              make semaphores SMP safe
  10 *  1998-11-19  Implemented schedule_timeout() and related stuff
  11 *              by Andrea Arcangeli
  12 *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
  13 *              hybrid priority-list and round-robin design with
  14 *              an array-switch method of distributing timeslices
  15 *              and per-CPU runqueues.  Cleanups and useful suggestions
  16 *              by Davide Libenzi, preemptible kernel bits by Robert Love.
  17 *  2003-09-03  Interactivity tuning by Con Kolivas.
  18 *  2004-04-02  Scheduler domains code by Nick Piggin
  19 */
  20
  21#include <linux/mm.h>
  22#include <linux/module.h>
  23#include <linux/nmi.h>
  24#include <linux/init.h>
  25#include <asm/uaccess.h>
  26#include <linux/highmem.h>
  27#include <linux/smp_lock.h>
  28#include <asm/mmu_context.h>
  29#include <linux/interrupt.h>
  30#include <linux/completion.h>
  31#include <linux/kernel_stat.h>
  32#include <linux/security.h>
  33#include <linux/notifier.h>
  34#include <linux/profile.h>
  35#include <linux/suspend.h>
  36#include <linux/blkdev.h>
  37#include <linux/delay.h>
  38#include <linux/smp.h>
  39#include <linux/threads.h>
  40#include <linux/timer.h>
  41#include <linux/rcupdate.h>
  42#include <linux/cpu.h>
  43#include <linux/percpu.h>
  44#include <linux/kthread.h>
  45#include <linux/seq_file.h>
  46#include <linux/syscalls.h>
  47#include <linux/times.h>
  48#include <asm/tlb.h>
  49
  50#include <asm/unistd.h>
  51
  52/*
  53 * Convert user-nice values [ -20 ... 0 ... 19 ]
  54 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
  55 * and back.
  56 */
  57#define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
  58#define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
  59#define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
  60
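/*
 * Worked example (illustrative, assuming the usual MAX_RT_PRIO == 100 and
 * MAX_PRIO == 140 from <linux/sched.h>):
 *
 *      NICE_TO_PRIO(-20) == 100     PRIO_TO_NICE(100) == -20
 *      NICE_TO_PRIO(  0) == 120     PRIO_TO_NICE(120) ==   0
 *      NICE_TO_PRIO( 19) == 139     PRIO_TO_NICE(139) ==  19
 */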
  61/*
  62 * 'User priority' is the nice value converted to something we
  63 * can work with better when scaling various scheduler parameters,
  64 * it's a [ 0 ... 39 ] range.
  65 */
  66#define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
  67#define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
  68#define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
  69
  70/*
  71 * Some helpers for converting nanosecond timing to jiffy resolution
  72 */
  73#define NS_TO_JIFFIES(TIME)     ((TIME) / (1000000000 / HZ))
  74#define JIFFIES_TO_NS(TIME)     ((TIME) * (1000000000 / HZ))
  75
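/*
 * For example, with HZ == 1000 one jiffy is 1,000,000 ns, so
 * NS_TO_JIFFIES(5000000) == 5 and JIFFIES_TO_NS(100) == 100,000,000 ns
 * (100 ms); with HZ == 100 the same macros work in 10 ms units.
 */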
  76/*
  77 * These are the 'tuning knobs' of the scheduler:
  78 *
  79 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
  80 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
  81 * Timeslices get refilled after they expire.
  82 */
  83#define MIN_TIMESLICE           max(5 * HZ / 1000, 1)
  84#define DEF_TIMESLICE           (100 * HZ / 1000)
  85#define ON_RUNQUEUE_WEIGHT       30
  86#define CHILD_PENALTY            95
  87#define PARENT_PENALTY          100
  88#define EXIT_WEIGHT               3
  89#define PRIO_BONUS_RATIO         25
  90#define MAX_BONUS               (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
  91#define INTERACTIVE_DELTA         2
  92#define MAX_SLEEP_AVG           (DEF_TIMESLICE * MAX_BONUS)
  93#define STARVATION_LIMIT        (MAX_SLEEP_AVG)
  94#define NS_MAX_SLEEP_AVG        (JIFFIES_TO_NS(MAX_SLEEP_AVG))
  95
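/*
 * Illustrative values, assuming HZ == 1000 (and MAX_USER_PRIO == 40):
 *
 *      MIN_TIMESLICE    = max(5, 1)       =    5 jiffies (5 ms)
 *      DEF_TIMESLICE    = 100             =  100 jiffies (100 ms)
 *      MAX_BONUS        = 40 * 25 / 100   =   10
 *      MAX_SLEEP_AVG    = 100 * 10        = 1000 jiffies (1 s)
 *      NS_MAX_SLEEP_AVG = 1,000,000,000 ns
 */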
  96/*
  97 * If a task is 'interactive' then we reinsert it in the active
  98 * array after it has expired its current timeslice. (it will not
  99 * continue to run immediately, it will still round-robin with
 100 * other interactive tasks.)
 101 *
 102 * This part scales the interactivity limit depending on niceness.
 103 *
 104 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
 105 * Here are a few examples of different nice levels:
 106 *
 107 *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
 108 *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
 109 *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
 110 *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
 111 *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
 112 *
 113 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
 114 *  priority range a task can explore, a value of '1' means the
 115 *  task is rated interactive.)
 116 *
 117 * I.e. nice +19 tasks can never get 'interactive' enough to be
 118 * reinserted into the active array. And only heavily CPU-hog nice -20
 119 * tasks will be expired. Default nice 0 tasks are somewhere between,
 120 * it takes some effort for them to get interactive, but it's not
 121 * too hard.
 122 */
 123
 124#define CURRENT_BONUS(p) \
 125        (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
 126                MAX_SLEEP_AVG)
 127
 128#define GRANULARITY     (10 * HZ / 1000 ? : 1)
 129
 130#ifdef CONFIG_SMP
 131#define TIMESLICE_GRANULARITY(p)        (GRANULARITY * \
 132                (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
 133                        num_online_cpus())
 134#else
 135#define TIMESLICE_GRANULARITY(p)        (GRANULARITY * \
 136                (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
 137#endif
 138
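/*
 * Example, assuming HZ == 1000 (GRANULARITY == 10 jiffies) and the
 * MAX_BONUS == 10 value derived above: a fully interactive task with
 * CURRENT_BONUS(p) == 10 gets a requeueing granularity of 10 ms (times
 * num_online_cpus() on SMP), while a task with CURRENT_BONUS(p) == 5
 * gets 10 * (1 << 4) == 160 ms; more interactive tasks are round-robined
 * on a finer timescale.
 */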
 139#define SCALE(v1,v1_max,v2_max) \
 140        (v1) * (v2_max) / (v1_max)
 141
 142#define DELTA(p) \
 143        (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
 144
 145#define TASK_INTERACTIVE(p) \
 146        ((p)->prio <= (p)->static_prio - DELTA(p))
 147
 148#define INTERACTIVE_SLEEP(p) \
 149        (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
 150                (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
 151
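/*
 * Worked example for a nice 0 task, with the HZ == 1000 values above
 * (MAX_BONUS == 10, MAX_SLEEP_AVG == 1000 jiffies, static_prio == 120):
 *
 *      DELTA(p)             == 0 * 10 / 40 + 2 == 2
 *      TASK_INTERACTIVE(p)  is true once p->prio <= 118, i.e. once the
 *                           task has earned a dynamic bonus of at least +2,
 *                           which needs a sleep_avg of roughly 700 ms or more
 *      INTERACTIVE_SLEEP(p) == JIFFIES_TO_NS(1000 * (5 + 2 + 1) / 10 - 1)
 *                           == 799 ms worth of nanoseconds
 */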
 152#define TASK_PREEMPTS_CURR(p, rq) \
 153        ((p)->prio < (rq)->curr->prio)
 154
 155/*
 156 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
 157 * to time slice values: [800ms ... 100ms ... 5ms]
 158 *
 159 * The higher a thread's priority, the bigger timeslices
 160 * it gets during one round of execution. But even the lowest
 161 * priority thread gets MIN_TIMESLICE worth of execution time.
 162 */
 163
 164#define SCALE_PRIO(x, prio) \
 165        max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
 166
 167static unsigned int task_timeslice(task_t *p)
 168{
 169        if (p->static_prio < NICE_TO_PRIO(0))
 170                return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
 171        else
 172                return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
 173}
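/*
 * Worked example, assuming HZ == 1000, MAX_PRIO == 140 and the
 * MAX_USER_PRIO/2 == 20 divisor above:
 *
 *      nice -20 (prio 100): SCALE_PRIO(400, 100) == 400 * 40 / 20 == 800 ms
 *      nice   0 (prio 120): SCALE_PRIO(100, 120) == 100 * 20 / 20 == 100 ms
 *      nice +19 (prio 139): SCALE_PRIO(100, 139) == max(5, 5)     ==   5 ms
 *
 * which matches the [800ms ... 100ms ... 5ms] range described above.
 */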
 174#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)       \
 175                                < (long long) (sd)->cache_hot_time)
 176
 177/*
 178 * These are the runqueue data structures:
 179 */
 180
 181#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
 182
 183typedef struct runqueue runqueue_t;
 184
 185struct prio_array {
 186        unsigned int nr_active;
 187        unsigned long bitmap[BITMAP_SIZE];
 188        struct list_head queue[MAX_PRIO];
 189};
 190
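/*
 * A minimal sketch of how this structure gives O(1) next-task selection
 * (this mirrors what schedule(), further down in this file, does with the
 * active array; sched_find_first_bit() is the arch-optimised bitmap scan):
 *
 *      int idx = sched_find_first_bit(array->bitmap);
 *      struct list_head *queue = array->queue + idx;
 *      task_t *next = list_entry(queue->next, task_t, run_list);
 */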
 191/*
 192 * This is the main, per-CPU runqueue data structure.
 193 *
 194 * Locking rule: code paths that need to lock multiple runqueues
 195 * (such as the load balancing or the thread migration code) must
 196 * order their lock acquisitions by ascending runqueue address.
 197 */
 198struct runqueue {
 199        spinlock_t lock;
 200
 201        /*
 202         * nr_running and cpu_load should be in the same cacheline because
 203         * remote CPUs use both these fields when doing load calculation.
 204         */
 205        unsigned long nr_running;
 206#ifdef CONFIG_SMP
 207        unsigned long cpu_load;
 208#endif
 209        unsigned long long nr_switches;
 210
 211        /*
 212         * This is part of a global counter where only the total sum
 213         * over all CPUs matters. A task can increase this counter on
 214         * one CPU and if it got migrated afterwards it may decrease
 215         * it on another CPU. Always updated under the runqueue lock:
 216         */
 217        unsigned long nr_uninterruptible;
 218
 219        unsigned long expired_timestamp;
 220        unsigned long long timestamp_last_tick;
 221        task_t *curr, *idle;
 222        struct mm_struct *prev_mm;
 223        prio_array_t *active, *expired, arrays[2];
 224        int best_expired_prio;
 225        atomic_t nr_iowait;
 226
 227#ifdef CONFIG_SMP
 228        struct sched_domain *sd;
 229
 230        /* For active balancing */
 231        int active_balance;
 232        int push_cpu;
 233
 234        task_t *migration_thread;
 235        struct list_head migration_queue;
 236#endif
 237
 238#ifdef CONFIG_SCHEDSTATS
 239        /* latency stats */
 240        struct sched_info rq_sched_info;
 241
 242        /* sys_sched_yield() stats */
 243        unsigned long yld_exp_empty;
 244        unsigned long yld_act_empty;
 245        unsigned long yld_both_empty;
 246        unsigned long yld_cnt;
 247
 248        /* schedule() stats */
 249        unsigned long sched_noswitch;
 250        unsigned long sched_switch;
 251        unsigned long sched_cnt;
 252        unsigned long sched_goidle;
 253
 254        /* pull_task() stats */
 255        unsigned long pt_gained[MAX_IDLE_TYPES];
 256        unsigned long pt_lost[MAX_IDLE_TYPES];
 257
 258        /* active_load_balance() stats */
 259        unsigned long alb_cnt;
 260        unsigned long alb_lost;
 261        unsigned long alb_gained;
 262        unsigned long alb_failed;
 263
 264        /* try_to_wake_up() stats */
 265        unsigned long ttwu_cnt;
 266        unsigned long ttwu_attempts;
 267        unsigned long ttwu_moved;
 268
 269        /* wake_up_new_task() stats */
 270        unsigned long wunt_cnt;
 271        unsigned long wunt_moved;
 272
 273        /* sched_migrate_task() stats */
 274        unsigned long smt_cnt;
 275
 276        /* sched_balance_exec() stats */
 277        unsigned long sbe_cnt;
 278#endif
 279};
 280
 281static DEFINE_PER_CPU(struct runqueue, runqueues);
 282
 283#define for_each_domain(cpu, domain) \
 284        for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
 285
 286#define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
 287#define this_rq()               (&__get_cpu_var(runqueues))
 288#define task_rq(p)              cpu_rq(task_cpu(p))
 289#define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
 290
 291/*
 292 * Default context-switch locking:
 293 */
 294#ifndef prepare_arch_switch
 295# define prepare_arch_switch(rq, next)  do { } while (0)
 296# define finish_arch_switch(rq, next)   spin_unlock_irq(&(rq)->lock)
 297# define task_running(rq, p)            ((rq)->curr == (p))
 298#endif
 299
 300/*
 301 * task_rq_lock - lock the runqueue a given task resides on and disable
 302 * interrupts.  Note the ordering: we can safely lookup the task_rq without
 303 * explicitly disabling preemption.
 304 */
 305static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
 306        __acquires(rq->lock)
 307{
 308        struct runqueue *rq;
 309
 310repeat_lock_task:
 311        local_irq_save(*flags);
 312        rq = task_rq(p);
 313        spin_lock(&rq->lock);
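        /*
         * The task may have been migrated to another CPU between the
         * task_rq() lookup above and taking the lock; if so, drop the
         * lock and retry against the task's new runqueue.
         */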
 314        if (unlikely(rq != task_rq(p))) {
 315                spin_unlock_irqrestore(&rq->lock, *flags);
 316                goto repeat_lock_task;
 317        }
 318        return rq;
 319}
 320
 321static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
 322        __releases(rq->lock)
 323{
 324        spin_unlock_irqrestore(&rq->lock, *flags);
 325}
 326
 327#ifdef CONFIG_SCHEDSTATS
 328/*
 329 * bump this up when changing the output format or the meaning of an existing
 330 * format, so that tools can adapt (or abort)
 331 */
 332#define SCHEDSTAT_VERSION 10
 333
 334static int show_schedstat(struct seq_file *seq, void *v)
 335{
 336        int cpu;
 337        enum idle_type itype;
 338
 339        seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
 340        seq_printf(seq, "timestamp %lu\n", jiffies);
 341        for_each_online_cpu(cpu) {
 342                runqueue_t *rq = cpu_rq(cpu);
 343#ifdef CONFIG_SMP
 344                struct sched_domain *sd;
 345                int dcnt = 0;
 346#endif
 347
 348                /* runqueue-specific stats */
 349                seq_printf(seq,
 350                    "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
 351                    "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
 352                    cpu, rq->yld_both_empty,
 353                    rq->yld_act_empty, rq->yld_exp_empty,
 354                    rq->yld_cnt, rq->sched_noswitch,
 355                    rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
 356                    rq->alb_cnt, rq->alb_gained, rq->alb_lost,
 357                    rq->alb_failed,
 358                    rq->ttwu_cnt, rq->ttwu_moved, rq->ttwu_attempts,
 359                    rq->wunt_cnt, rq->wunt_moved,
 360                    rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time,
 361                    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
 362
 363                for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++)
 364                        seq_printf(seq, " %lu %lu", rq->pt_gained[itype],
 365                                                    rq->pt_lost[itype]);
 366                seq_printf(seq, "\n");
 367
 368#ifdef CONFIG_SMP
 369                /* domain-specific stats */
 370                for_each_domain(cpu, sd) {
 371                        char mask_str[NR_CPUS];
 372
 373                        cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
 374                        seq_printf(seq, "domain%d %s", dcnt++, mask_str);
 375                        for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
 376                                                itype++) {
 377                                seq_printf(seq, " %lu %lu %lu %lu %lu",
 378                                    sd->lb_cnt[itype],
 379                                    sd->lb_failed[itype],
 380                                    sd->lb_imbalance[itype],
 381                                    sd->lb_nobusyq[itype],
 382                                    sd->lb_nobusyg[itype]);
 383                        }
 384                        seq_printf(seq, " %lu %lu %lu %lu\n",
 385                            sd->sbe_pushed, sd->sbe_attempts,
 386                            sd->ttwu_wake_affine, sd->ttwu_wake_balance);
 387                }
 388#endif
 389        }
 390        return 0;
 391}
 392
 393static int schedstat_open(struct inode *inode, struct file *file)
 394{
 395        unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
 396        char *buf = kmalloc(size, GFP_KERNEL);
 397        struct seq_file *m;
 398        int res;
 399
 400        if (!buf)
 401                return -ENOMEM;
 402        res = single_open(file, show_schedstat, NULL);
 403        if (!res) {
 404                m = file->private_data;
 405                m->buf = buf;
 406                m->size = size;
 407        } else
 408                kfree(buf);
 409        return res;
 410}
 411
 412struct file_operations proc_schedstat_operations = {
 413        .open    = schedstat_open,
 414        .read    = seq_read,
 415        .llseek  = seq_lseek,
 416        .release = single_release,
 417};
 418
 419# define schedstat_inc(rq, field)       do { (rq)->field++; } while (0)
 420# define schedstat_add(rq, field, amt)  do { (rq)->field += (amt); } while (0)
 421#else /* !CONFIG_SCHEDSTATS */
 422# define schedstat_inc(rq, field)       do { } while (0)
 423# define schedstat_add(rq, field, amt)  do { } while (0)
 424#endif
 425
 426/*
 427 * this_rq_lock - lock this CPU's runqueue and disable interrupts.
 428 */
 429static runqueue_t *this_rq_lock(void)
 430        __acquires(rq->lock)
 431{
 432        runqueue_t *rq;
 433
 434        local_irq_disable();
 435        rq = this_rq();
 436        spin_lock(&rq->lock);
 437
 438        return rq;
 439}
 440
 441#ifdef CONFIG_SCHED_SMT
 442static int cpu_and_siblings_are_idle(int cpu)
 443{
 444        int sib;
 445        for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
 446                if (idle_cpu(sib))
 447                        continue;
 448                return 0;
 449        }
 450
 451        return 1;
 452}
 453#else
 454#define cpu_and_siblings_are_idle(A) idle_cpu(A)
 455#endif
 456
 457#ifdef CONFIG_SCHEDSTATS
 458/*
 459 * Called when a process is dequeued from the active array and given
 460 * the cpu.  We should note that with the exception of interactive
 461 * tasks, the expired queue will become the active queue after the active
 462 * queue is empty, without explicitly dequeuing and requeuing tasks in the
 463 * expired queue.  (Interactive tasks may be requeued directly to the
 464 * active queue, thus delaying tasks in the expired queue from running;
 465 * see scheduler_tick()).
 466 *
 467 * This function is only called from sched_info_arrive(), rather than
 468 * dequeue_task(). Even though a task may be queued and dequeued multiple
 469 * times as it is shuffled about, we're really interested in knowing how
 470 * long it was from the *first* time it was queued to the time that it
 471 * finally hit a cpu.
 472 */
 473static inline void sched_info_dequeued(task_t *t)
 474{
 475        t->sched_info.last_queued = 0;
 476}
 477
 478/*
 479 * Called when a task finally hits the cpu.  We can now calculate how
 480 * long it was waiting to run.  We also note when it began so that we
 481 * can keep stats on how long its timeslice is.
 482 */
 483static inline void sched_info_arrive(task_t *t)
 484{
 485        unsigned long now = jiffies, diff = 0;
 486        struct runqueue *rq = task_rq(t);
 487
 488        if (t->sched_info.last_queued)
 489                diff = now - t->sched_info.last_queued;
 490        sched_info_dequeued(t);
 491        t->sched_info.run_delay += diff;
 492        t->sched_info.last_arrival = now;
 493        t->sched_info.pcnt++;
 494
 495        if (!rq)
 496                return;
 497
 498        rq->rq_sched_info.run_delay += diff;
 499        rq->rq_sched_info.pcnt++;
 500}
 501
 502/*
 503 * Called when a process is queued into either the active or expired
 504 * array.  The time is noted and later used to determine how long the
 505 * task had to wait to reach the cpu.  Since the expired queue will
 506 * become the active queue after the active queue is empty, without dequeuing
 507 * and requeuing any tasks, we are interested in queuing to either. It
 508 * is unusual but not impossible for tasks to be dequeued and immediately
 509 * requeued in the same or another array: this can happen in sched_yield(),
 510 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
 511 * to runqueue.
 512 *
 513 * This function is only called from enqueue_task(), but also only updates
 514 * the timestamp if it is not already set.  It's assumed that
 515 * sched_info_dequeued() will clear that stamp when appropriate.
 516 */
 517static inline void sched_info_queued(task_t *t)
 518{
 519        if (!t->sched_info.last_queued)
 520                t->sched_info.last_queued = jiffies;
 521}
 522
 523/*
 524 * Called when a process ceases being the active-running process, either
 525 * voluntarily or involuntarily.  Now we can calculate how long we ran.
 526 */
 527static inline void sched_info_depart(task_t *t)
 528{
 529        struct runqueue *rq = task_rq(t);
 530        unsigned long diff = jiffies - t->sched_info.last_arrival;
 531
 532        t->sched_info.cpu_time += diff;
 533
 534        if (rq)
 535                rq->rq_sched_info.cpu_time += diff;
 536}
 537
 538/*
 539 * Called when tasks are switched involuntarily, typically due to expiring
 540 * their time slice.  (This may also be called when switching to or from
 541 * the idle task.)  We are only called when prev != next.
 542 */
 543static inline void sched_info_switch(task_t *prev, task_t *next)
 544{
 545        struct runqueue *rq = task_rq(prev);
 546
 547        /*
 548         * prev now departs the cpu.  It's not interesting to record
 549         * stats about how efficient we were at scheduling the idle
 550         * process, however.
 551         */
 552        if (prev != rq->idle)
 553                sched_info_depart(prev);
 554
 555        if (next != rq->idle)
 556                sched_info_arrive(next);
 557}
 558#else
 559#define sched_info_queued(t)            do { } while (0)
 560#define sched_info_switch(t, next)      do { } while (0)
 561#endif /* CONFIG_SCHEDSTATS */
 562
 563/*
 564 * Adding/removing a task to/from a priority array:
 565 */
 566static void dequeue_task(struct task_struct *p, prio_array_t *array)
 567{
 568        array->nr_active--;
 569        list_del(&p->run_list);
 570        if (list_empty(array->queue + p->prio))
 571                __clear_bit(p->prio, array->bitmap);
 572}
 573
 574static void enqueue_task(struct task_struct *p, prio_array_t *array)
 575{
 576        sched_info_queued(p);
 577        list_add_tail(&p->run_list, array->queue + p->prio);
 578        __set_bit(p->prio, array->bitmap);
 579        array->nr_active++;
 580        p->array = array;
 581}
 582
 583/*
 584 * Put task to the end of the run list without the overhead of dequeue
 585 * followed by enqueue.
 586 */
 587static void requeue_task(struct task_struct *p, prio_array_t *array)
 588{
 589        list_move_tail(&p->run_list, array->queue + p->prio);
 590}
 591
 592static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
 593{
 594        list_add(&p->run_list, array->queue + p->prio);
 595        __set_bit(p->prio, array->bitmap);
 596        array->nr_active++;
 597        p->array = array;
 598}
 599
 600/*
 601 * effective_prio - return the priority that is based on the static
 602 * priority but is modified by bonuses/penalties.
 603 *
 604 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
 605 * into the -5 ... 0 ... +5 bonus/penalty range.
 606 *
 607 * We use 25% of the full 0...39 priority range so that:
 608 *
 609 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
 610 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
 611 *
 612 * Both properties are important to certain workloads.
 613 */
 614static int effective_prio(task_t *p)
 615{
 616        int bonus, prio;
 617
 618        if (rt_task(p))
 619                return p->prio;
 620
 621        bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
 622
 623        prio = p->static_prio - bonus;
 624        if (prio < MAX_RT_PRIO)
 625                prio = MAX_RT_PRIO;
 626        if (prio > MAX_PRIO-1)
 627                prio = MAX_PRIO-1;
 628        return prio;
 629}
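/*
 * Example, with MAX_BONUS == 10: a nice 0 task (static_prio 120) that has
 * banked the maximum sleep_avg gets bonus == +5 and runs at prio 115; the
 * same task with a zero sleep_avg gets bonus == -5 and runs at prio 125.
 * The result is always clamped to [MAX_RT_PRIO, MAX_PRIO-1].
 */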
 630
 631/*
 632 * __activate_task - move a task to the runqueue.
 633 */
 634static inline void __activate_task(task_t *p, runqueue_t *rq)
 635{
 636        enqueue_task(p, rq->active);
 637        rq->nr_running++;
 638}
 639
 640/*
 641 * __activate_idle_task - move idle task to the _front_ of runqueue.
 642 */
 643static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 644{
 645        enqueue_task_head(p, rq->active);
 646        rq->nr_running++;
 647}
 648
 649static void recalc_task_prio(task_t *p, unsigned long long now)
 650{
 651        unsigned long long __sleep_time = now - p->timestamp;
 652        unsigned long sleep_time;
 653
 654        if (__sleep_time > NS_MAX_SLEEP_AVG)
 655                sleep_time = NS_MAX_SLEEP_AVG;
 656        else
 657                sleep_time = (unsigned long)__sleep_time;
 658
 659        if (likely(sleep_time > 0)) {
 660                /*
 661                 * User tasks that sleep a long time are categorised as
 662                 * idle and will get just interactive status to stay active &
 663                 * prevent them suddenly becoming cpu hogs and starving
 664                 * other processes.
 665                 */
 666                if (p->mm && p->activated != -1 &&
 667                        sleep_time > INTERACTIVE_SLEEP(p)) {
 668                                p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
 669                                                DEF_TIMESLICE);
 670                } else {
 671                        /*
 672                         * The lower the sleep avg a task has the more
 673                         * rapidly it will rise with sleep time.
 674                         */
 675                        sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
 676
 677                        /*
 678                         * Tasks waking from uninterruptible sleep are
 679                         * limited in their sleep_avg rise as they
 680                         * are likely to be waiting on I/O
 681                         */
 682                        if (p->activated == -1 && p->mm) {
 683                                if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
 684                                        sleep_time = 0;
 685                                else if (p->sleep_avg + sleep_time >=
 686                                                INTERACTIVE_SLEEP(p)) {
 687                                        p->sleep_avg = INTERACTIVE_SLEEP(p);
 688                                        sleep_time = 0;
 689                                }
 690                        }
 691
 692                        /*
 693                         * This code gives a bonus to interactive tasks.
 694                         *
 695                         * The boost works by updating the 'average sleep time'
 696                         * value here, based on ->timestamp. The more time a
 697                         * task spends sleeping, the higher the average gets -
 698                         * and the higher the priority boost gets as well.
 699                         */
 700                        p->sleep_avg += sleep_time;
 701
 702                        if (p->sleep_avg > NS_MAX_SLEEP_AVG)
 703                                p->sleep_avg = NS_MAX_SLEEP_AVG;
 704                }
 705        }
 706
 707        p->prio = effective_prio(p);
 708}
 709
 710/*
 711 * activate_task - move a task to the runqueue and do priority recalculation
 712 *
 713 * Update all the scheduling statistics stuff. (sleep average
 714 * calculation, priority modifiers, etc.)
 715 */
 716static void activate_task(task_t *p, runqueue_t *rq, int local)
 717{
 718        unsigned long long now;
 719
 720        now = sched_clock();
 721#ifdef CONFIG_SMP
 722        if (!local) {
 723                /* Compensate for drifting sched_clock */
 724                runqueue_t *this_rq = this_rq();
 725                now = (now - this_rq->timestamp_last_tick)
 726                        + rq->timestamp_last_tick;
 727        }
 728#endif
 729
 730        recalc_task_prio(p, now);
 731
 732        /*
 733         * This checks to make sure it's not an uninterruptible task
 734         * that is now waking up.
 735         */
 736        if (!p->activated) {
 737                /*
 738                 * Tasks which were woken up by interrupts (ie. hw events)
 739                 * are most likely of interactive nature. So we give them
 740                 * the credit of extending their sleep time to the period
 741                 * of time they spend on the runqueue, waiting for execution
 742                 * on a CPU, first time around:
 743                 */
 744                if (in_interrupt())
 745                        p->activated = 2;
 746                else {
 747                        /*
 748                         * Normal first-time wakeups get a credit too for
 749                         * on-runqueue time, but it will be weighted down:
 750                         */
 751                        p->activated = 1;
 752                }
 753        }
 754        p->timestamp = now;
 755
 756        __activate_task(p, rq);
 757}
 758
 759/*
 760 * deactivate_task - remove a task from the runqueue.
 761 */
 762static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 763{
 764        rq->nr_running--;
 765        dequeue_task(p, p->array);
 766        p->array = NULL;
 767}
 768
 769/*
 770 * resched_task - mark a task 'to be rescheduled now'.
 771 *
 772 * On UP this means the setting of the need_resched flag, on SMP it
 773 * might also involve a cross-CPU call to trigger the scheduler on
 774 * the target CPU.
 775 */
 776#ifdef CONFIG_SMP
 777static void resched_task(task_t *p)
 778{
 779        int need_resched, nrpolling;
 780
 781        assert_spin_locked(&task_rq(p)->lock);
 782
 783        /* minimise the chance of sending an interrupt to poll_idle() */
 784        nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
 785        need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
 786        nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
 787
 788        if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
 789                smp_send_reschedule(task_cpu(p));
 790}
 791#else
 792static inline void resched_task(task_t *p)
 793{
 794        set_tsk_need_resched(p);
 795}
 796#endif
 797
 798/**
 799 * task_curr - is this task currently executing on a CPU?
 800 * @p: the task in question.
 801 */
 802inline int task_curr(const task_t *p)
 803{
 804        return cpu_curr(task_cpu(p)) == p;
 805}
 806
 807#ifdef CONFIG_SMP
 808enum request_type {
 809        REQ_MOVE_TASK,
 810        REQ_SET_DOMAIN,
 811};
 812
 813typedef struct {
 814        struct list_head list;
 815        enum request_type type;
 816
 817        /* For REQ_MOVE_TASK */
 818        task_t *task;
 819        int dest_cpu;
 820
 821        /* For REQ_SET_DOMAIN */
 822        struct sched_domain *sd;
 823
 824        struct completion done;
 825} migration_req_t;
 826
 827/*
 828 * The task's runqueue lock must be held.
 829 * Returns true if you have to wait for migration thread.
 830 */
 831static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
 832{
 833        runqueue_t *rq = task_rq(p);
 834
 835        /*
 836         * If the task is not on a runqueue (and not running), then
 837         * it is sufficient to simply update the task's cpu field.
 838         */
 839        if (!p->array && !task_running(rq, p)) {
 840                set_task_cpu(p, dest_cpu);
 841                return 0;
 842        }
 843
 844        init_completion(&req->done);
 845        req->type = REQ_MOVE_TASK;
 846        req->task = p;
 847        req->dest_cpu = dest_cpu;
 848        list_add(&req->list, &rq->migration_queue);
 849        return 1;
 850}
 851
 852/*
 853 * wait_task_inactive - wait for a thread to unschedule.
 854 *
 855 * The caller must ensure that the task *will* unschedule sometime soon,
 856 * else this function might spin for a *long* time. This function can't
 857 * be called with interrupts off, or it may introduce deadlock with
 858 * smp_call_function() if an IPI is sent by the same process we are
 859 * waiting to become inactive.
 860 */
 861void wait_task_inactive(task_t * p)
 862{
 863        unsigned long flags;
 864        runqueue_t *rq;
 865        int preempted;
 866
 867repeat:
 868        rq = task_rq_lock(p, &flags);
 869        /* Must be off runqueue entirely, not preempted. */
 870        if (unlikely(p->array || task_running(rq, p))) {
 871                /* If it's preempted, we yield.  It could be a while. */
 872                preempted = !task_running(rq, p);
 873                task_rq_unlock(rq, &flags);
 874                cpu_relax();
 875                if (preempted)
 876                        yield();
 877                goto repeat;
 878        }
 879        task_rq_unlock(rq, &flags);
 880}
 881
 882/***
 883 * kick_process - kick a running thread to enter/exit the kernel
 884 * @p: the to-be-kicked thread
 885 *
 886 * Cause a process which is running on another CPU to enter
 887 * kernel-mode, without any delay. (to get signals handled.)
 888 *
 889 * NOTE: this function doesn't have to take the runqueue lock,
 890 * because all it wants to ensure is that the remote task enters
 891 * the kernel. If the IPI races and the task has been migrated
 892 * to another CPU then no harm is done and the purpose has been
 893 * achieved as well.
 894 */
 895void kick_process(task_t *p)
 896{
 897        int cpu;
 898
 899        preempt_disable();
 900        cpu = task_cpu(p);
 901        if ((cpu != smp_processor_id()) && task_curr(p))
 902                smp_send_reschedule(cpu);
 903        preempt_enable();
 904}
 905
 906/*
 907 * Return a low guess at the load of a migration-source cpu.
 908 *
 909 * We want to under-estimate the load of migration sources, to
 910 * balance conservatively.
 911 */
 912static inline unsigned long source_load(int cpu)
 913{
 914        runqueue_t *rq = cpu_rq(cpu);
 915        unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
 916
 917        return min(rq->cpu_load, load_now);
 918}
 919
 920/*
 921 * Return a high guess at the load of a migration-target cpu
 922 */
 923static inline unsigned long target_load(int cpu)
 924{
 925        runqueue_t *rq = cpu_rq(cpu);
 926        unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
 927
 928        return max(rq->cpu_load, load_now);
 929}
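/*
 * Example, assuming SCHED_LOAD_SCALE == 128 (its usual value): a runqueue
 * with 3 runnable tasks has load_now == 384.  If its decayed rq->cpu_load
 * is 256, source_load() reports the conservative 256 while target_load()
 * reports the pessimistic 384.
 */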
 930
 931#endif
 932
 933/*
 934 * wake_idle() will wake a task on an idle cpu if task->cpu is
 935 * not idle and an idle cpu is available.  The span of cpus to
 936 * search starts with cpus closest then further out as needed,
 937 * so we always favor a closer, idle cpu.
 938 *
 939 * Returns the CPU we should wake onto.
 940 */
 941#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
 942static int wake_idle(int cpu, task_t *p)
 943{
 944        cpumask_t tmp;
 945        struct sched_domain *sd;
 946        int i;
 947
 948        if (idle_cpu(cpu))
 949                return cpu;
 950
 951        for_each_domain(cpu, sd) {
 952                if (sd->flags & SD_WAKE_IDLE) {
 953                        cpus_and(tmp, sd->span, cpu_online_map);
 954                        cpus_and(tmp, tmp, p->cpus_allowed);
 955                        for_each_cpu_mask(i, tmp) {
 956                                if (idle_cpu(i))
 957                                        return i;
 958                        }
 959                }
 960                else break;
 961        }
 962        return cpu;
 963}
 964#else
 965static inline int wake_idle(int cpu, task_t *p)
 966{
 967        return cpu;
 968}
 969#endif
 970
 971/***
 972 * try_to_wake_up - wake up a thread
 973 * @p: the to-be-woken-up thread
 974 * @state: the mask of task states that can be woken
 975 * @sync: do a synchronous wakeup?
 976 *
 977 * Put it on the run-queue if it's not already there. The "current"
 978 * thread is always on the run-queue (except when the actual
 979 * re-schedule is in progress), and as such you're allowed to do
 980 * the simpler "current->state = TASK_RUNNING" to mark yourself
 981 * runnable without the overhead of this.
 982 *
 983 * returns failure only if the task is already active.
 984 */
 985static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 986{
 987        int cpu, this_cpu, success = 0;
 988        unsigned long flags;
 989        long old_state;
 990        runqueue_t *rq;
 991#ifdef CONFIG_SMP
 992        unsigned long load, this_load;
 993        struct sched_domain *sd;
 994        int new_cpu;
 995#endif
 996
 997        rq = task_rq_lock(p, &flags);
 998        schedstat_inc(rq, ttwu_cnt);
 999        old_state = p->state;
1000        if (!(old_state & state))
1001                goto out;
1002
1003        if (p->array)
1004                goto out_running;
1005
1006        cpu = task_cpu(p);
1007        this_cpu = smp_processor_id();
1008
1009#ifdef CONFIG_SMP
1010        if (unlikely(task_running(rq, p)))
1011                goto out_activate;
1012
1013        new_cpu = cpu;
1014
1015        if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1016                goto out_set_cpu;
1017
1018        load = source_load(cpu);
1019        this_load = target_load(this_cpu);
1020
1021        /*
1022         * If sync wakeup then subtract the (maximum possible) effect of
1023         * the currently running task from the load of the current CPU:
1024         */
1025        if (sync)
1026                this_load -= SCHED_LOAD_SCALE;
1027
1028        /* Don't pull the task off an idle CPU to a busy one */
1029        if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
1030                goto out_set_cpu;
1031
1032        new_cpu = this_cpu; /* Wake to this CPU if we can */
1033
1034        /*
1035         * Scan domains for affine wakeup and passive balancing
1036         * possibilities.
1037         */
1038        for_each_domain(this_cpu, sd) {
1039                unsigned int imbalance;
1040                /*
1041                 * Start passive balancing when half the imbalance_pct
1042                 * limit is reached.
1043                 */
1044                imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
1045
1046                if ((sd->flags & SD_WAKE_AFFINE) &&
1047                                !task_hot(p, rq->timestamp_last_tick, sd)) {
1048                        /*
1049                         * This domain has SD_WAKE_AFFINE and p is cache cold
1050                         * in this domain.
1051                         */
1052                        if (cpu_isset(cpu, sd->span)) {
1053                                schedstat_inc(sd, ttwu_wake_affine);
1054                                goto out_set_cpu;
1055                        }
1056                } else if ((sd->flags & SD_WAKE_BALANCE) &&
1057                                imbalance*this_load <= 100*load) {
1058                        /*
1059                         * This domain has SD_WAKE_BALANCE and there is
1060                         * an imbalance.
1061                         */
1062                        if (cpu_isset(cpu, sd->span)) {
1063                                schedstat_inc(sd, ttwu_wake_balance);
1064                                goto out_set_cpu;
1065                        }
1066                }
1067        }
1068
1069        new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1070out_set_cpu:
1071        schedstat_inc(rq, ttwu_attempts);
1072        new_cpu = wake_idle(new_cpu, p);
1073        if (new_cpu != cpu) {
1074                schedstat_inc(rq, ttwu_moved);
1075                set_task_cpu(p, new_cpu);
1076                task_rq_unlock(rq, &flags);
1077                /* might preempt at this point */
1078                rq = task_rq_lock(p, &flags);
1079                old_state = p->state;
1080                if (!(old_state & state))
1081                        goto out;
1082                if (p->array)
1083                        goto out_running;
1084
1085                this_cpu = smp_processor_id();
1086                cpu = task_cpu(p);
1087        }
1088
1089out_activate:
1090#endif /* CONFIG_SMP */
1091        if (old_state == TASK_UNINTERRUPTIBLE) {
1092                rq->nr_uninterruptible--;
1093                /*
1094                 * Tasks on involuntary sleep don't earn
1095                 * sleep_avg beyond just interactive state.
1096                 */
1097                p->activated = -1;
1098        }
1099
1100        /*
1101         * Sync wakeups (i.e. those types of wakeups where the waker
1102         * has indicated that it will leave the CPU in short order)
1103         * don't trigger a preemption, if the woken up task will run on
1104         * this cpu. (in this case the 'I will reschedule' promise of
1105         * the waker guarantees that the freshly woken up task is going
1106         * to be considered on this CPU.)
1107         */
1108        activate_task(p, rq, cpu == this_cpu);
1109        if (!sync || cpu != this_cpu) {
1110                if (TASK_PREEMPTS_CURR(p, rq))
1111                        resched_task(rq->curr);
1112        }
1113        success = 1;
1114
1115out_running:
1116        p->state = TASK_RUNNING;
1117out:
1118        task_rq_unlock(rq, &flags);
1119
1120        return success;
1121}
1122
1123int fastcall wake_up_process(task_t * p)
1124{
1125        return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1126                                 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1127}
1128
1129EXPORT_SYMBOL(wake_up_process);
1130
1131int fastcall wake_up_state(task_t *p, unsigned int state)
1132{
1133        return try_to_wake_up(p, state, 0);
1134}
1135
1136#ifdef CONFIG_SMP
1137static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1138                           struct sched_domain *sd);
1139#endif
1140
1141/*
1142 * Perform scheduler related setup for a newly forked process p.
1143 * p is forked by current.
1144 */
1145void fastcall sched_fork(task_t *p)
1146{
1147        /*
1148         * We mark the process as running here, but have not actually
1149         * inserted it onto the runqueue yet. This guarantees that
1150         * nobody will actually run it, and a signal or other external
1151         * event cannot wake it up and insert it on the runqueue either.
1152         */
1153        p->state = TASK_RUNNING;
1154        INIT_LIST_HEAD(&p->run_list);
1155        p->array = NULL;
1156        spin_lock_init(&p->switch_lock);
1157#ifdef CONFIG_SCHEDSTATS
1158        memset(&p->sched_info, 0, sizeof(p->sched_info));
1159#endif
1160#ifdef CONFIG_PREEMPT
1161        /*
1162         * During context-switch we hold precisely one spinlock, which
1163         * schedule_tail drops. (in the common case it's this_rq()->lock,
1164         * but it also can be p->switch_lock.) So we compensate with a count
1165         * of 1. Also, we want to start with kernel preemption disabled.
1166         */
1167        p->thread_info->preempt_count = 1;
1168#endif
1169        /*
1170         * Share the timeslice between parent and child, thus the
1171         * total amount of pending timeslices in the system doesn't change,
1172         * resulting in more scheduling fairness.
1173         */
1174        local_irq_disable();
1175        p->time_slice = (current->time_slice + 1) >> 1;
1176        /*
1177         * The remainder of the first timeslice might be recovered by
1178         * the parent if the child exits early enough.
1179         */
1180        p->first_time_slice = 1;
1181        current->time_slice >>= 1;
1182        p->timestamp = sched_clock();
1183        if (unlikely(!current->time_slice)) {
1184                /*
1185                 * This case is rare, it happens when the parent has only
1186                 * a single jiffy left from its timeslice. Taking the
1187                 * runqueue lock is not a problem.
1188                 */
1189                current->time_slice = 1;
1190                preempt_disable();
1191                scheduler_tick();
1192                local_irq_enable();
1193                preempt_enable();
1194        } else
1195                local_irq_enable();
1196}
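/*
 * Worked example of the timeslice split above: a parent with 100 jiffies
 * left keeps 50 and the child starts with 50; with 7 jiffies left the
 * child gets 4 and the parent keeps 3, so the total is preserved and any
 * odd leftover jiffy goes to the child.
 */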
1197
1198/*
1199 * wake_up_new_task - wake up a newly created task for the first time.
1200 *
1201 * This function will do some initial scheduler statistics housekeeping
1202 * that must be done for every newly created context, then puts the task
1203 * on the runqueue and wakes it.
1204 */
1205void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1206{
1207        unsigned long flags;
1208        int this_cpu, cpu;
1209        runqueue_t *rq, *this_rq;
1210
1211        rq = task_rq_lock(p, &flags);
1212        cpu = task_cpu(p);
1213        this_cpu = smp_processor_id();
1214
1215        BUG_ON(p->state != TASK_RUNNING);
1216
1217        schedstat_inc(rq, wunt_cnt);
1218        /*
1219         * We decrease the sleep average of forking parents
1220         * and children as well, to keep max-interactive tasks
1221         * from forking tasks that are max-interactive. The parent
1222         * (current) is done further down, under its lock.
1223         */
1224        p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1225                CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1226
1227        p->prio = effective_prio(p);
1228
1229        if (likely(cpu == this_cpu)) {
1230                if (!(clone_flags & CLONE_VM)) {
1231                        /*
1232                         * The VM isn't cloned, so we're in a good position to
1233                         * do child-runs-first in anticipation of an exec. This
1234                         * usually avoids a lot of COW overhead.
1235                         */
1236                        if (unlikely(!current->array))
1237                                __activate_task(p, rq);
1238                        else {
1239                                p->prio = current->prio;
1240                                list_add_tail(&p->run_list, &current->run_list);
1241                                p->array = current->array;
1242                                p->array->nr_active++;
1243                                rq->nr_running++;
1244                        }
1245                        set_need_resched();
1246                } else
1247                        /* Run child last */
1248                        __activate_task(p, rq);
1249                /*
1250                 * We skip the following code due to cpu == this_cpu
1251                 *
1252                 *   task_rq_unlock(rq, &flags);
1253                 *   this_rq = task_rq_lock(current, &flags);
1254                 */
1255                this_rq = rq;
1256        } else {
1257                this_rq = cpu_rq(this_cpu);
1258
1259                /*
1260                 * Not the local CPU - must adjust timestamp. This should
1261                 * get optimised away in the !CONFIG_SMP case.
1262                 */
1263                p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
1264                                        + rq->timestamp_last_tick;
1265                __activate_task(p, rq);
1266                if (TASK_PREEMPTS_CURR(p, rq))
1267                        resched_task(rq->curr);
1268
1269                schedstat_inc(rq, wunt_moved);
1270                /*
1271                 * Parent and child are on different CPUs, now get the
1272                 * parent runqueue to update the parent's ->sleep_avg:
1273                 */
1274                task_rq_unlock(rq, &flags);
1275                this_rq = task_rq_lock(current, &flags);
1276        }
1277        current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1278                PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1279        task_rq_unlock(this_rq, &flags);
1280}
1281
1282/*
1283 * Potentially available exiting-child timeslices are
1284 * retrieved here - this way the parent does not get
1285 * penalized for creating too many threads.
1286 *
1287 * (this cannot be used to 'generate' timeslices
1288 * artificially, because any timeslice recovered here
1289 * was given away by the parent in the first place.)
1290 */
1291void fastcall sched_exit(task_t * p)
1292{
1293        unsigned long flags;
1294        runqueue_t *rq;
1295
1296        /*
1297         * If the child was a (relative-) CPU hog then decrease
1298         * the sleep_avg of the parent as well.
1299         */
1300        rq = task_rq_lock(p->parent, &flags);
1301        if (p->first_time_slice) {
1302                p->parent->time_slice += p->time_slice;
1303                if (unlikely(p->parent->time_slice > task_timeslice(p)))
1304                        p->parent->time_slice = task_timeslice(p);
1305        }
1306        if (p->sleep_avg < p->parent->sleep_avg)
1307                p->parent->sleep_avg = p->parent->sleep_avg /
1308                (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1309                (EXIT_WEIGHT + 1);
1310        task_rq_unlock(rq, &flags);
1311}
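/*
 * With EXIT_WEIGHT == 3 the update above is a 3:1 weighted average,
 * parent->sleep_avg = parent * 3/4 + child * 1/4.  For example, a parent
 * with ~800 ms of sleep_avg whose CPU-hog child exits with ~200 ms ends
 * up with roughly 650 ms.
 */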
1312
1313/**
1314 * finish_task_switch - clean up after a task-switch
1315 * @prev: the thread we just switched away from.
1316 *
1317 * We enter this with the runqueue still locked, and finish_arch_switch()
1318 * will unlock it along with doing any other architecture-specific cleanup
1319 * actions.
1320 *
1321 * Note that we may have delayed dropping an mm in context_switch(). If
1322 * so, we finish that here outside of the runqueue lock.  (Doing it
1323 * with the lock held can cause deadlocks; see schedule() for
1324 * details.)
1325 */
1326static void finish_task_switch(task_t *prev)
1327        __releases(rq->lock)
1328{
1329        runqueue_t *rq = this_rq();
1330        struct mm_struct *mm = rq->prev_mm;
1331        unsigned long prev_task_flags;
1332
1333        rq->prev_mm = NULL;
1334
1335        /*
1336         * A task struct has one reference for the use as "current".
1337         * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
1338         * calls schedule one last time. The schedule call will never return,
1339         * and the scheduled task must drop that reference.
1340         * The test for EXIT_ZOMBIE must occur while the runqueue locks are
1341         * still held, otherwise prev could be scheduled on another cpu, die
1342         * there before we look at prev->state, and then the reference would
1343         * be dropped twice.
1344         *              Manfred Spraul <manfred@colorfullife.com>
1345         */
1346        prev_task_flags = prev->flags;
1347        finish_arch_switch(rq, prev);
1348        if (mm)
1349                mmdrop(mm);
1350        if (unlikely(prev_task_flags & PF_DEAD))
1351                put_task_struct(prev);
1352}
1353
1354/**
1355 * schedule_tail - first thing a freshly forked thread must call.
1356 * @prev: the thread we just switched away from.
1357 */
1358asmlinkage void schedule_tail(task_t *prev)
1359        __releases(rq->lock)
1360{
1361        finish_task_switch(prev);
1362
1363        if (current->set_child_tid)
1364                put_user(current->pid, current->set_child_tid);
1365}
1366
1367/*
1368 * context_switch - switch to the new MM and the new
1369 * thread's register state.
1370 */
1371static inline
1372task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1373{
1374        struct mm_struct *mm = next->mm;
1375        struct mm_struct *oldmm = prev->active_mm;
1376
1377        if (unlikely(!mm)) {
1378                next->active_mm = oldmm;
1379                atomic_inc(&oldmm->mm_count);
1380                enter_lazy_tlb(oldmm, next);
1381        } else
1382                switch_mm(oldmm, mm, next);
1383
1384        if (unlikely(!prev->mm)) {
1385                prev->active_mm = NULL;
1386                WARN_ON(rq->prev_mm);
1387                rq->prev_mm = oldmm;
1388        }
1389
1390        /* Here we just switch the register state and the stack. */
1391        switch_to(prev, next, prev);
1392
1393        return prev;
1394}
1395
1396/*
1397 * nr_running, nr_uninterruptible and nr_context_switches:
1398 *
1399 * externally visible scheduler statistics: current number of runnable
1400 * threads, current number of uninterruptible-sleeping threads, total
1401 * number of context switches performed since bootup.
1402 */
1403unsigned long nr_running(void)
1404{
1405        unsigned long i, sum = 0;
1406
1407        for_each_online_cpu(i)
1408                sum += cpu_rq(i)->nr_running;
1409
1410        return sum;
1411}
1412
1413unsigned long nr_uninterruptible(void)
1414{
1415        unsigned long i, sum = 0;
1416
1417        for_each_cpu(i)
1418                sum += cpu_rq(i)->nr_uninterruptible;
1419
1420        /*
1421         * Since we read the counters lockless, it might be slightly
1422         * inaccurate. Do not allow it to go below zero though:
1423         */
1424        if (unlikely((long)sum < 0))
1425                sum = 0;
1426
1427        return sum;
1428}
1429
1430unsigned long long nr_context_switches(void)
1431{
1432        unsigned long long i, sum = 0;
1433
1434        for_each_cpu(i)
1435                sum += cpu_rq(i)->nr_switches;
1436
1437        return sum;
1438}
1439
1440unsigned long nr_iowait(void)
1441{
1442        unsigned long i, sum = 0;
1443
1444        for_each_cpu(i)
1445                sum += atomic_read(&cpu_rq(i)->nr_iowait);
1446
1447        return sum;
1448}
1449
1450#ifdef CONFIG_SMP
1451
1452/*
1453 * double_rq_lock - safely lock two runqueues
1454 *
1455 * Note this does not disable interrupts like task_rq_lock,
1456 * you need to do so manually before calling.
1457 */
1458static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1459        __acquires(rq1->lock)
1460        __acquires(rq2->lock)
1461{
1462        if (rq1 == rq2) {
1463                spin_lock(&rq1->lock);
1464                __acquire(rq2->lock);   /* Fake it out ;) */
1465        } else {
1466                if (rq1 < rq2) {
1467                        spin_lock(&rq1->lock);
1468                        spin_lock(&rq2->lock);
1469                } else {
1470                        spin_lock(&rq2->lock);
1471                        spin_lock(&rq1->lock);
1472                }
1473        }
1474}
1475
1476/*
1477 * double_rq_unlock - safely unlock two runqueues
1478 *
1479 * Note this does not restore interrupts like task_rq_unlock,
1480 * you need to do so manually after calling.
1481 */
1482static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1483        __releases(rq1->lock)
1484        __releases(rq2->lock)
1485{
1486        spin_unlock(&rq1->lock);
1487        if (rq1 != rq2)
1488                spin_unlock(&rq2->lock);
1489        else
1490                __release(rq2->lock);
1491}
1492
1493/*
1494 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1495 */
1496static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1497        __releases(this_rq->lock)
1498        __acquires(busiest->lock)
1499        __acquires(this_rq->lock)
1500{
1501        if (unlikely(!spin_trylock(&busiest->lock))) {
1502                if (busiest < this_rq) {
1503                        spin_unlock(&this_rq->lock);
1504                        spin_lock(&busiest->lock);
1505                        spin_lock(&this_rq->lock);
1506                } else
1507                        spin_lock(&busiest->lock);
1508        }
1509}
1510
1511/*
1512 * find_idlest_cpu - find the least busy runqueue.
1513 */
1514static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1515                           struct sched_domain *sd)
1516{
1517        unsigned long load, min_load, this_load;
1518        int i, min_cpu;
1519        cpumask_t mask;
1520
1521        min_cpu = UINT_MAX;
1522        min_load = ULONG_MAX;
1523
1524        cpus_and(mask, sd->span, p->cpus_allowed);
1525
1526        for_each_cpu_mask(i, mask) {
1527                load = target_load(i);
1528
1529                if (load < min_load) {
1530                        min_cpu = i;
1531                        min_load = load;
1532
1533                        /* break out early on an idle CPU: */
1534                        if (!min_load)
1535                                break;
1536                }
1537        }
1538
1539        /* add +1 to account for the new task */
1540        this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
1541
1542        /*
1543         * With the addition of the new task to the current
1544         * CPU, would there be an imbalance between this
1545         * CPU and the idlest CPU?
1546         *
1547         * Use half of the balancing threshold - new-context is
1548         * a good opportunity to balance.
1549         */
1550        if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1551                return min_cpu;
1552
1553        return this_cpu;
1554}
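/*
 * Example of the imbalance check above, assuming sd->imbalance_pct == 125
 * (a common default for these domains): the factor is 100 + 25/2 == 112,
 * so the idlest CPU is picked only if min_load * 112 < this_load * 100,
 * i.e. it is roughly 11% or more below this CPU's load (this_load already
 * includes SCHED_LOAD_SCALE for the task being placed).
 */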
1555
1556/*
1557 * If dest_cpu is allowed for this process, migrate the task to it.
1558 * This is accomplished by queuing a migration request on the task's
1559 * runqueue and waking the migration thread, then waiting for the
1560 * migration to complete.
1561 */
1562static void sched_migrate_task(task_t *p, int dest_cpu)
1563{
1564        migration_req_t req;
1565        runqueue_t *rq;
1566        unsigned long flags;
1567
1568        rq = task_rq_lock(p, &flags);
1569        if (!cpu_isset(dest_cpu, p->cpus_allowed)
1570            || unlikely(cpu_is_offline(dest_cpu)))
1571                goto out;
1572
1573        schedstat_inc(rq, smt_cnt);
1574        /* force the process onto the specified CPU */
1575        if (migrate_task(p, dest_cpu, &req)) {
1576                /* Need to wait for migration thread (might exit: take ref). */
1577                struct task_struct *mt = rq->migration_thread;
1578                get_task_struct(mt);
1579                task_rq_unlock(rq, &flags);
1580                wake_up_process(mt);
1581                put_task_struct(mt);
1582                wait_for_completion(&req.done);
1583                return;
1584        }
1585out:
1586        task_rq_unlock(rq, &flags);
1587}
1588
1589/*
1590 * sched_exec(): find the highest-level, exec-balance-capable
1591 * domain and try to migrate the task to the least loaded CPU.
1592 *
1593 * execve() is a valuable balancing opportunity, because at this point
1594 * the task has the smallest effective memory and cache footprint.
1595 */
1596void sched_exec(void)
1597{
1598        struct sched_domain *tmp, *sd = NULL;
1599        int new_cpu, this_cpu = get_cpu();
1600
1601        schedstat_inc(this_rq(), sbe_cnt);
1602        /* Prefer the current CPU if there's only this task running */
1603        if (this_rq()->nr_running <= 1)
1604                goto out;
1605
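            /*
             * Walk up the domain hierarchy; the last domain seen with
             * SD_BALANCE_EXEC set - i.e. the highest such level - is
             * the one used for the balancing attempt below.
             */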
1606        for_each_domain(this_cpu, tmp)
1607                if (tmp->flags & SD_BALANCE_EXEC)
1608                        sd = tmp;
1609
1610        if (sd) {
1611                schedstat_inc(sd, sbe_attempts);
1612                new_cpu = find_idlest_cpu(current, this_cpu, sd);
1613                if (new_cpu != this_cpu) {
1614                        schedstat_inc(sd, sbe_pushed);
1615                        put_cpu();
1616                        sched_migrate_task(current, new_cpu);
1617                        return;
1618                }
1619        }
1620out:
1621        put_cpu();
1622}
1623
1624/*
1625 * pull_task - move a task from a remote runqueue to the local runqueue.
1626 * Both runqueues must be locked.
1627 */
1628static inline
1629void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1630               runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1631{
1632        dequeue_task(p, src_array);
1633        src_rq->nr_running--;
1634        set_task_cpu(p, this_cpu);
1635        this_rq->nr_running++;
1636        enqueue_task(p, this_array);
1637        p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1638                                + this_rq->timestamp_last_tick;
1639        /*
1640         * Note that idle threads have a prio of MAX_PRIO, so this test
1641         * is always true for them (any pulled task preempts the idle thread).
1642         */
1643        if (TASK_PREEMPTS_CURR(p, this_rq))
1644                resched_task(this_rq->curr);
1645}
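    /*
     * The timestamp arithmetic above re-bases p->timestamp from the
     * source CPU's sched_clock() domain into the destination's: only
     * the offset relative to the source runqueue's last tick is kept.
     * E.g. a task that last ran 3ms before the source CPU's last tick
     * ends up stamped 3ms before the destination CPU's last tick.
     */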
1646
1647/*
1648 * can_migrate_task - can task p on runqueue rq be migrated to this_cpu?
1649 */
1650static inline
1651int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1652                     struct sched_domain *sd, enum idle_type idle)
1653{
1654        /*
1655         * We do not migrate tasks that are:
1656         * 1) running (obviously), or
1657         * 2) not allowed onto this CPU (see cpus_allowed), or
1658         * 3) cache-hot on their current CPU.
1659         */
1660        if (task_running(rq, p))
1661                return 0;
1662        if (!cpu_isset(this_cpu, p->cpus_allowed))
1663                return 0;
1664
1665        /*
1666         * Aggressive migration if:
1667         * 1) the [whole] cpu is idle, or
1668         * 2) too many balance attempts have failed.
1669         */
1670
1671        if (cpu_and_siblings_are_idle(this_cpu) ||
1672                        sd->nr_balance_failed > sd->cache_nice_tries)
1673                return 1;
1674
1675        if (task_hot(p, rq->timestamp_last_tick, sd))
1676                return 0;
1677        return 1;
1678}
1679
1680/*
1681 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
1682 * as part of a balancing operation within "domain". Returns the number of
1683 * tasks moved.
1684 *
1685 * Called with both runqueues locked.
1686 */
1687static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1688                      unsigned long max_nr_move, struct sched_domain *sd,
1689                      enum idle_type idle)
1690{
1691        prio_array_t *array, *dst_array;
1692        struct list_head *head, *curr;
1693        int idx, pulled = 0;
1694        task_t *tmp;
1695
1696        if (max_nr_move <= 0 || busiest->nr_running <= 1)
1697                goto out;
1698
1699        /*
1700         * We first consider expired tasks. Those will likely not be
1701         * executed in the near future, and they are most likely to
1702         * be cache-cold, thus switching CPUs has the least effect
1703         * on them.
1704         */
1705        if (busiest->expired->nr_active) {
1706                array = busiest->expired;
1707                dst_array = this_rq->expired;
1708        } else {
1709                array = busiest->active;
1710                dst_array = this_rq->active;
1711        }
1712
1713new_array:
1714        /* Start searching at priority 0: */
1715        idx = 0;
1716skip_bitmap:
1717        if (!idx)
1718                idx = sched_find_first_bit(array->bitmap);
1719        else
1720                idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1721        if (idx >= MAX_PRIO) {
1722                if (array == busiest->expired && busiest->active->nr_active) {
1723                        array = busiest->active;
1724                        dst_array = this_rq->active;
1725                        goto new_array;
1726                }
1727                goto out;
1728        }
1729
1730        head = array->queue + idx;
1731        curr = head->prev;
1732skip_queue:
1733        tmp = list_entry(curr, task_t, run_list);
1734
1735        curr = curr->prev;
1736
1737        if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
1738                if (curr != head)
1739                        goto skip_queue;
1740                idx++;
1741                goto skip_bitmap;
1742        }
1743
1744        /*
1745         * Right now, this is the only place pull_task() is called,
1746         * so we can safely collect pull_task() stats here rather than
1747         * inside pull_task().
1748         */
1749        schedstat_inc(this_rq, pt_gained[idle]);
1750        schedstat_inc(busiest, pt_lost[idle]);
1751
1752        pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1753        pulled++;
1754
1755        /* We only want to steal up to the prescribed number of tasks. */
1756        if (pulled < max_nr_move) {
1757                if (curr != head)
1758                        goto skip_queue;
1759                idx++;
1760                goto skip_bitmap;
1761        }
1762out:
1763        return pulled;
1764}
1765
1766/*
1767 * find_busiest_group finds and returns the busiest CPU group within the
1768 * domain. It calculates and returns the number of tasks which should be
1769 * moved to restore balance via the imbalance parameter.
1770 */
1771static struct sched_group *
1772find_busiest_group(struct sched_domain *sd, int this_cpu,
1773                   unsigned long *imbalance, enum idle_type idle)
1774{
1775        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1776        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1777
1778        max_load = this_load = total_load = total_pwr = 0;
1779
1780        do {
1781                unsigned long load;
1782                int local_group;
1783                int i, nr_cpus = 0;
1784
1785                local_group = cpu_isset(this_cpu, group->cpumask);
1786
1787                /* Tally up the load of all CPUs in the group */
1788                avg_load = 0;
1789
1790                for_each_cpu_mask(i, group->cpumask) {
1791                        /* Bias balancing toward cpus of our domain */
1792                        if (local_group)
1793                                load = target_load(i);
1794                        else
1795                                load = source_load(i);
1796
1797                        nr_cpus++;
1798                        avg_load += load;
1799                }
1800
1801                if (!nr_cpus)
1802                        goto nextgroup;
1803
1804                total_load += avg_load;
1805                total_pwr += group->cpu_power;
1806
1807                /* Adjust by relative CPU power of the group */
1808                avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1809
1810                if (local_group) {
1811                        this_load = avg_load;
1812                        this = group;
1813                        goto nextgroup;
1814                } else if (avg_load > max_load) {
1815                        max_load = avg_load;
1816                        busiest = group;
1817                }
1818nextgroup:
1819                group = group->next;
1820        } while (group != sd->groups);
1821
1822        if (!busiest || this_load >= max_load)
1823                goto out_balanced;
1824
1825        avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
1826
1827        if (this_load >= avg_load ||
1828                        100*max_load <= sd->imbalance_pct*this_load)
1829                goto out_balanced;
1830
1831        /*
1832         * We're trying to get all the cpus to the average_load, so we don't
1833         * want to push ourselves above the average load, nor do we wish to
1834         * reduce the max loaded cpu below the average load, as either of these
1835         * actions would just result in more rebalancing later, and ping-pong
1836         * tasks around. Thus we look for the minimum possible imbalance.
1837         * Negative imbalances (*we* are more loaded than anyone else) will
1838         * be counted as no imbalance for these purposes -- we can't fix that
1839         * by pulling tasks to us.  Be careful of negative numbers as they'll
1840         * appear as very large values with unsigned longs.
1841         */
1842        *imbalance = min(max_load - avg_load, avg_load - this_load);
1843
1844        /* How much load to actually move to equalise the imbalance */
1845        *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
1846                                / SCHED_LOAD_SCALE;
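            /*
             * Worked example (a sketch, assuming SCHED_LOAD_SCALE == 128
             * and two single-CPU groups, each with cpu_power == 128):
             * with max_load == 512 and this_load == 128, avg_load works
             * out to 320, so *imbalance == min(512-320, 320-128) == 192.
             * Scaling by min(cpu_power)/SCHED_LOAD_SCALE leaves 192,
             * which is not below SCHED_LOAD_SCALE-1, so the final
             * division below requests roughly one task's worth of load
             * (192/128 -> 1) to be moved.
             */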
1847
1848        if (*imbalance < SCHED_LOAD_SCALE - 1) {
1849                unsigned long pwr_now = 0, pwr_move = 0;
1850                unsigned long tmp;
1851
1852                if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
1853                        *imbalance = 1;
1854                        return busiest;
1855                }
1856
1857                /*
1858                 * OK, we don't have enough imbalance to justify moving tasks,
1859                 * however we may be able to increase total CPU power used by
1860                 * moving them.
1861                 */
1862
1863                pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
1864                pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
1865                pwr_now /= SCHED_LOAD_SCALE;
1866
1867                /* Amount of load we'd subtract */
1868                tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
1869                if (max_load > tmp)
1870                        pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
1871                                                        max_load - tmp);
1872
1873                /* Amount of load we'd add */
1874                tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
1875                if (max_load < tmp)
1876                        tmp = max_load;
1877                pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
1878                pwr_move /= SCHED_LOAD_SCALE;
1879
1880                /* Move if we gain another 8th of a CPU worth of throughput */
1881                if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
1882                        goto out_balanced;
1883
1884                *imbalance = 1;
1885                return busiest;
1886        }
1887
1888        /* Get rid of the scaling factor, rounding down as we divide */
1889        *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
1890
1891        return busiest;
1892
1893out_balanced:
1894        if (busiest && (idle == NEWLY_IDLE ||
1895                        (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
1896                *imbalance = 1;
1897                return busiest;
1898        }
1899
1900        *imbalance = 0;
1901        return NULL;
1902}
1903
1904/*
1905 * find_busiest_queue - find the busiest runqueue among the cpus in group.
1906 */
1907static runqueue_t *find_busiest_queue(struct sched_group *group)
1908{
1909        unsigned long load, max_load = 0;
1910        runqueue_t *busiest = NULL;
1911        int i;
1912
1913        for_each_cpu_mask(i, group->cpumask) {
1914                load = source_load(i);
1915
1916                if (load > max_load) {
1917                        max_load = load;
1918                        busiest = cpu_rq(i);
1919                }
1920        }
1921
1922        return busiest;
1923}
1924
1925/*
1926 * Check this_cpu to ensure it is balanced within domain. Attempt to move
1927 * tasks if there is an imbalance.
1928 *
1929 * Called with this_rq unlocked.
1930 */
1931static int load_balance(int this_cpu, runqueue_t *this_rq,
1932                        struct sched_domain *sd, enum idle_type idle)
1933{
1934        struct sched_group *group;
1935        runqueue_t *busiest;
1936        unsigned long imbalance;
1937        int nr_moved;
1938
1939        spin_lock(&this_rq->lock);
1940        schedstat_inc(sd, lb_cnt[idle]);
1941
1942        group = find_busiest_group(sd, this_cpu, &imbalance, idle);
1943        if (!group) {
1944                schedstat_inc(sd, lb_nobusyg[idle]);
1945                goto out_balanced;
1946        }
1947
1948        busiest = find_busiest_queue(group);
1949        if (!busiest) {
1950                schedstat_inc(sd, lb_nobusyq[idle]);
1951                goto out_balanced;
1952        }
1953
1954        /*
1955         * This should be "impossible", but since load
1956         * balancing is inherently racy and statistical,
1957         * it could happen in theory.
1958         */
1959        if (unlikely(busiest == this_rq)) {
1960                WARN_ON(1);
1961                goto out_balanced;
1962        }
1963
1964        schedstat_add(sd, lb_imbalance[idle], imbalance);
1965
1966        nr_moved = 0;
1967        if (busiest->nr_running > 1) {
1968                /*
1969                 * Attempt to move tasks. If find_busiest_group has found
1970                 * an imbalance but busiest->nr_running <= 1, the group is
1971                 * still unbalanced. nr_moved simply stays zero, so it is
1972                 * correctly treated as an imbalance.
1973                 */
1974                double_lock_balance(this_rq, busiest);
1975                nr_moved = move_tasks(this_rq, this_cpu, busiest,
1976                                                imbalance, sd, idle);
1977                spin_unlock(&busiest->lock);
1978        }
1979        spin_unlock(&this_rq->lock);
1980
1981        if (!nr_moved) {
1982                schedstat_inc(sd, lb_failed[idle]);
1983                sd->nr_balance_failed++;
1984
1985                if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
1986                        int wake = 0;
1987
1988                        spin_lock(&busiest->lock);
1989                        if (!busiest->active_balance) {
1990                                busiest->active_balance = 1;
1991                                busiest->push_cpu = this_cpu;
1992                                wake = 1;
1993                        }
1994                        spin_unlock(&busiest->lock);
1995                        if (wake)
1996                                wake_up_process(busiest->migration_thread);
1997
1998                        /*
1999                         * We've kicked active balancing, reset the failure
2000                         * counter.
2001                         */
2002                        sd->nr_balance_failed = sd->cache_nice_tries;
2003                }
2004
2005                /*
2006                 * We were unbalanced, but unsuccessful in move_tasks(),
2007                 * so bump the balance_interval to lessen the lock contention.
2008                 */
2009                if (sd->balance_interval < sd->max_interval)
2010                        sd->balance_interval++;
2011        } else {
2012                sd->nr_balance_failed = 0;
2013
2014                /* We were unbalanced, so reset the balancing interval */
2015                sd->balance_interval = sd->min_interval;
2016        }
2017
2018        return nr_moved;
2019
2020out_balanced:
2021        spin_unlock(&this_rq->lock);
2022
2023        /* tune up the balancing interval */
2024        if (sd->balance_interval < sd->max_interval)
2025                sd->balance_interval *= 2;
2026
2027        return 0;
2028}
2029
2030/*
2031 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2032 * tasks if there is an imbalance.
2033 *
2034 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
2035 * this_rq is locked.
2036 */
2037static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2038                                struct sched_domain *sd)
2039{
2040        struct sched_group *group;
2041        runqueue_t *busiest = NULL;
2042        unsigned long imbalance;
2043        int nr_moved = 0;
2044
2045        schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2046        group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
2047        if (!group) {
2048                schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2049                goto out;
2050        }
2051
2052        busiest = find_busiest_queue(group);
2053        if (!busiest || busiest == this_rq) {
2054                schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2055                goto out;
2056        }
2057
2058        /* Attempt to move tasks */
2059        double_lock_balance(this_rq, busiest);
2060
2061        schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2062        nr_moved = move_tasks(this_rq, this_cpu, busiest,
2063                                        imbalance, sd, NEWLY_IDLE);
2064        if (!nr_moved)
2065                schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2066
2067        spin_unlock(&busiest->lock);
2068
2069out:
2070        return nr_moved;
2071}
2072
2073/*
2074 * idle_balance is called by schedule() if this_cpu is about to become
2075 * idle. Attempts to pull tasks from other CPUs.
2076 */
2077static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
2078{
2079        struct sched_domain *sd;
2080
2081        for_each_domain(this_cpu, sd) {
2082                if (sd->flags & SD_BALANCE_NEWIDLE) {
2083                        if (load_balance_newidle(this_cpu, this_rq, sd)) {
2084                                /* We've pulled tasks over so stop searching */
2085                                break;
2086                        }
2087                }
2088        }
2089}
2090
2091/*
2092 * active_load_balance is run by migration threads. It pushes running tasks
2093 * off the busiest CPU onto idle CPUs. It tries to ensure that at least
2094 * one task is running on each physical CPU where possible, and avoids
2095 * physical / logical imbalances.
2096 *
2097 * Called with busiest_rq locked.
2098 */
2099static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2100{
2101        struct sched_domain *sd;
2102        struct sched_group *cpu_group;
2103        runqueue_t *target_rq;
2104        cpumask_t visited_cpus;
2105        int cpu;
2106
2107        schedstat_inc(busiest_rq, alb_cnt);
2108        /*
2109         * Search for suitable CPUs to push tasks to in successively higher
2110         * domains with SD_LOAD_BALANCE set.
2111         */
2112        visited_cpus = CPU_MASK_NONE;
2113        for_each_domain(busiest_cpu, sd) {
2114                if (!(sd->flags & SD_LOAD_BALANCE))
2115                        /* no more domains to search */
2116                        break;
2117
2118                cpu_group = sd->groups;
2119                do {
2120                        for_each_cpu_mask(cpu, cpu_group->cpumask) {
2121                                if (busiest_rq->nr_running <= 1)
2122                                        /* no more tasks left to move */
2123                                        return;
2124                                if (cpu_isset(cpu, visited_cpus))
2125                                        continue;
2126                                cpu_set(cpu, visited_cpus);
2127                                if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
2128                                        continue;
2129
2130                                target_rq = cpu_rq(cpu);
2131                                /*
2132                                 * This condition is "impossible"; if it occurs
2133                                 * we need to fix it.  Originally reported by
2134                                 * Bjorn Helgaas on a 128-cpu setup.
2135                                 */
2136                                BUG_ON(busiest_rq == target_rq);
2137
2138                                /* move a task from busiest_rq to target_rq */
2139                                double_lock_balance(busiest_rq, target_rq);
2140                                if (move_tasks(target_rq, cpu, busiest_rq,
2141                                                1, sd, SCHED_IDLE)) {
2142                                        schedstat_inc(busiest_rq, alb_lost);
2143                                        schedstat_inc(target_rq, alb_gained);
2144                                } else {
2145                                        schedstat_inc(busiest_rq, alb_failed);
2146                                }
2147                                spin_unlock(&target_rq->lock);
2148                        }
2149                        cpu_group = cpu_group->next;
2150                } while (cpu_group != sd->groups);
2151        }
2152}
2153
2154/*
2155 * rebalance_tick will get called every timer tick, on every CPU.
2156 *
2157 * It checks each scheduling domain to see if it is due to be balanced,
2158 * and initiates a balancing operation if so.
2159 *
2160 * Balancing parameters are set up in arch_init_sched_domains.
2161 */
2162
2163/* Don't have all balancing operations going off at once */
2164#define CPU_OFFSET(cpu) (HZ * (cpu) / NR_CPUS)
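    /*
     * E.g. with HZ == 1000 and NR_CPUS == 4, CPU0 balances at offset 0,
     * CPU1 at 250, CPU2 at 500 and CPU3 at 750 jiffies, so the per-CPU
     * rebalance ticks are staggered across the second rather than all
     * firing at once.
     */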
2165
2166static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2167                           enum idle_type idle)
2168{
2169        unsigned long old_load, this_load;
2170        unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2171        struct sched_domain *sd;
2172
2173        /* Update our load */
2174        old_load = this_rq->cpu_load;
2175        this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2176        /*
2177         * Round up the averaging division if load is increasing. This
2178         * prevents us from getting stuck on 9 if the load is 10, for
2179         * example.
2180         */
2181        if (this_load > old_load)
2182                old_load++;
2183        this_rq->cpu_load = (old_load + this_load) / 2;
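            /*
             * E.g. (assuming SCHED_LOAD_SCALE == 128) if cpu_load was
             * 512 (four tasks) and only two tasks remain runnable
             * (this_load == 256), cpu_load decays to (512 + 256)/2 == 384
             * on this tick and keeps halving toward 256 on later ticks.
             */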
2184
2185        for_each_domain(this_cpu, sd) {
2186                unsigned long interval;
2187
2188                if (!(sd->flags & SD_LOAD_BALANCE))
2189                        continue;
2190
2191                interval = sd->balance_interval;
2192                if (idle != SCHED_IDLE)
2193                        interval *= sd->busy_factor;
2194
2195                /* scale ms to jiffies */
2196                interval = msecs_to_jiffies(interval);
2197                if (unlikely(!interval))
2198                        interval = 1;
2199
2200                if (j - sd->last_balance >= interval) {
2201                        if (load_balance(this_cpu, this_rq, sd, idle)) {
2202                                /* We've pulled tasks over so no longer idle */
2203                                idle = NOT_IDLE;
2204                        }
2205                        sd->last_balance += interval;
2206                }
2207        }
2208}
2209#else
2210/*
2211 * on UP we do not need to balance between CPUs:
2212 */
2213static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
2214{
2215}
2216static inline void idle_balance(int cpu, runqueue_t *rq)
2217{
2218}
2219#endif
2220
2221static inline int wake_priority_sleeper(runqueue_t *rq)
2222{
2223        int ret = 0;
2224#ifdef CONFIG_SCHED_SMT
2225        spin_lock(&rq->lock);
2226        /*
2227         * If an SMT sibling task has been put to sleep for priority
2228         * reasons, reschedule the idle task to see if it can now run.
2229         */
2230        if (rq->nr_running) {
2231                resched_task(rq->idle);
2232                ret = 1;
2233        }
2234        spin_unlock(&rq->lock);
2235#endif
2236        return ret;
2237}
2238
2239DEFINE_PER_CPU(struct kernel_stat, kstat);
2240
2241EXPORT_PER_CPU_SYMBOL(kstat);
2242
2243/*
2244 * We place interactive tasks back into the active array, if possible.
2245 *
2246 * To guarantee that this does not starve expired tasks we ignore the
2247 * interactivity of a task if the first expired task had to wait more
2248 * than a 'reasonable' amount of time. This deadline timeout is
2249 * load-dependent, as the frequency of array switches decreases with
2250 * increasing number of running tasks. We also ignore the interactivity
2251 * if a better static_prio task has expired:
2252 */
2253#define EXPIRED_STARVING(rq) \
2254        ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
2255                (jiffies - (rq)->expired_timestamp >= \
2256                        STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
2257                        ((rq)->curr->static_prio > (rq)->best_expired_prio))
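    /*
     * Worked example (a sketch, assuming HZ == 1000, which makes
     * STARVATION_LIMIT roughly one second's worth of jiffies): with four
     * runnable tasks, expired tasks are considered starving - and
     * interactive tasks are no longer re-inserted into the active array -
     * once the first expired task has waited more than about four seconds,
     * or as soon as a task with a better static_prio than the current one
     * has expired.
     */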
2258
2259/*
2260 * Do the virtual cpu time signal calculations.
2261 * @p: the process that the cpu time gets accounted to
2262 * @cputime: the cpu time spent in user space since the last update
2263 */
2264static inline void account_it_virt(struct task_struct * p, cputime_t cputime)
2265{
2266        cputime_t it_virt = p->it_virt_value;
2267
2268        if (cputime_gt(it_virt, cputime_zero) &&
2269            cputime_gt(cputime, cputime_zero)) {
2270                if (cputime_ge(cputime, it_virt)) {
2271                        it_virt = cputime_add(it_virt, p->it_virt_incr);
2272                        send_sig(SIGVTALRM, p, 1);
2273                }
2274                it_virt = cputime_sub(it_virt, cputime);
2275                p->it_virt_value = it_virt;
2276        }
2277}
2278
2279/*
2280 * Do the virtual profiling signal calculations.
2281 * @p: the process that the cpu time gets accounted to
2282 * @cputime: the cpu time spent in user and kernel space since the last update
2283 */
2284static void account_it_prof(struct task_struct *p, cputime_t cputime)
2285{
2286        cputime_t it_prof = p->it_prof_value;
2287
2288        if (cputime_gt(it_prof, cputime_zero) &&
2289            cputime_gt(cputime, cputime_zero)) {
2290                if (cputime_ge(cputime, it_prof)) {
2291                        it_prof = cputime_add(it_prof, p->it_prof_incr);
2292                        send_sig(SIGPROF, p, 1);
2293                }
2294                it_prof = cputime_sub(it_prof, cputime);
2295                p->it_prof_value = it_prof;
2296        }
2297}
2298
2299/*
2300 * Check if the process went over its cputime resource limit after
2301 * some cpu time got added to utime/stime.
2302 * @p: the process that the cpu time gets accounted to
2303 * @cputime: the cpu time spent in user and kernel space since the last update
2304 */
2305static void check_rlimit(struct task_struct *p, cputime_t cputime)
2306{
2307        cputime_t total, tmp;
2308        unsigned long secs;
2309
2310        total = cputime_add(p->utime, p->stime);
2311        secs = cputime_to_secs(total);
2312        if (unlikely(secs >= p->signal->rlim[RLIMIT_CPU].rlim_cur)) {
2313                /* Send SIGXCPU every second. */
2314                tmp = cputime_sub(total, cputime);
2315                if (cputime_to_secs(tmp) < secs)
2316                        send_sig(SIGXCPU, p, 1);
2317                /* and SIGKILL when we go over max.. */
2318                if (secs >= p->signal->rlim[RLIMIT_CPU].rlim_max)
2319                        send_sig(SIGKILL, p, 1);
2320        }
2321}
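    /*
     * E.g. with rlim_cur == 10 seconds: when utime+stime crosses from
     * 9.99s to 10.01s in one update, cputime_to_secs(total) becomes 10
     * while the pre-update total still truncates to 9, so SIGXCPU is
     * sent exactly once for that second; one more signal follows each
     * subsequent second until rlim_max is reached and SIGKILL is sent.
     */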
2322
2323/*
2324 * Account user cpu time to a process.
2325 * @p: the process that the cpu time gets accounted to
2326 * @cputime: the cpu time spent in user space since the
2327 *           last update
2328 */
2329void account_user_time(struct task_struct *p, cputime_t cputime)
2330{
2331        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2332        cputime64_t tmp;
2333
2334        p->utime = cputime_add(p->utime, cputime);
2335
2336        /* Check for signals (SIGVTALRM, SIGPROF, SIGXCPU & SIGKILL). */
2337        check_rlimit(p, cputime);
2338        account_it_virt(p, cputime);
2339        account_it_prof(p, cputime);
2340
2341        /* Add user time to cpustat. */
2342        tmp = cputime_to_cputime64(cputime);
2343        if (TASK_NICE(p) > 0)
2344                cpustat->nice = cputime64_add(cpustat->nice, tmp);
2345        else
2346                cpustat->user = cputime64_add(cpustat->user, tmp);
2347}
2348
2349/*
2350 * Account system cpu time to a process.
2351 * @p: the process that the cpu time gets accounted to
2352 * @hardirq_offset: the offset to subtract from hardirq_count()
2353 * @cputime: the cpu time spent in kernel space since the last update
2354 */
2355void account_system_time(struct task_struct *p, int hardirq_offset,
2356                         cputime_t cputime)
2357{
2358        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2359        runqueue_t *rq = this_rq();
2360        cputime64_t tmp;
2361
2362        p->stime = cputime_add(p->stime, cputime);
2363
2364        /* Check for signals (SIGPROF, SIGXCPU & SIGKILL). */
2365        if (likely(p->signal && p->exit_state < EXIT_ZOMBIE)) {
2366                check_rlimit(p, cputime);
2367                account_it_prof(p, cputime);
2368        }
2369
2370        /* Add system time to cpustat. */
2371        tmp = cputime_to_cputime64(cputime);
2372        if (hardirq_count() - hardirq_offset)
2373                cpustat->irq = cputime64_add(cpustat->irq, tmp);
2374        else if (softirq_count())
2375                cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
2376        else if (p != rq->idle)
2377                cpustat->system = cputime64_add(cpustat->system, tmp);
2378        else if (atomic_read(&rq->nr_iowait) > 0)
2379                cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2380        else
2381                cpustat->idle = cputime64_add(cpustat->idle, tmp);
2382}
2383
2384/*
2385 * Account for involuntary wait time.
2386 * @p: the process from which the cpu time has been stolen
2387 * @steal: the cpu time spent in involuntary wait
2388 */
2389void account_steal_time(struct task_struct *p, cputime_t steal)
2390{
2391        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2392        cputime64_t tmp = cputime_to_cputime64(steal);
2393        runqueue_t *rq = this_rq();
2394
2395        if (p == rq->idle) {
2396                p->stime = cputime_add(p->stime, steal);
2397                if (atomic_read(&rq->nr_iowait) > 0)
2398                        cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2399                else
2400                        cpustat->idle = cputime64_add(cpustat->idle, tmp);
2401        } else
2402                cpustat->steal = cputime64_add(cpustat->steal, tmp);
2403}
2404
2405/*
2406 * This function gets called by the timer code, with HZ frequency.
2407 * We call it with interrupts disabled.
2408 *
2409 * It also gets called by the fork code, when changing the parent's
2410 * timeslices.
2411 */
2412void scheduler_tick(void)
2413{
2414        int cpu = smp_processor_id();
2415        runqueue_t *rq = this_rq();
2416        task_t *p = current;
2417
2418        rq->timestamp_last_tick = sched_clock();
2419
2420        if (p == rq->idle) {
2421                if (wake_priority_sleeper(rq))
2422                        goto out;
2423                rebalance_tick(cpu, rq, SCHED_IDLE);
2424                return;
2425        }
2426
2427        /* Task might have expired already, but not scheduled off yet */
2428        if (p->array != rq->active) {
2429                set_tsk_need_resched(p);
2430                goto out;
2431        }
2432        spin_lock(&rq->lock);
2433        /*
2434         * The task was running during this tick - update the
2435         * time slice counter. Note: we do not update a thread's
2436         * priority until it either goes to sleep or uses up its
2437         * timeslice. This makes it possible for interactive tasks
2438         * to use up their timeslices at their highest priority levels.
2439         */
2440        if (rt_task(p)) {
2441                /*
2442                 * RR tasks need a special form of timeslice management.
2443                 * FIFO tasks have no timeslices.
2444                 */
2445                if ((p->policy == SCHED_RR) && !--p->time_slice) {
2446                        p->time_slice = task_timeslice(p);
2447                        p->first_time_slice = 0;
2448                        set_tsk_need_resched(p);
2449
2450                        /* put it at the end of the queue: */
2451                        requeue_task(p, rq->active);
2452                }
2453                goto out_unlock;
2454        }
2455        if (!--p->time_slice) {
2456                dequeue_task(p, rq->active);
2457                set_tsk_need_resched(p);
2458                p->prio = effective_prio(p);
2459                p->time_slice = task_timeslice(p);
2460                p->first_time_slice = 0;
2461
2462                if (!rq->expired_timestamp)
2463                        rq->expired_timestamp = jiffies;
2464                if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
2465                        enqueue_task(p, rq->expired);
2466                        if (p->static_prio < rq->best_expired_prio)
2467                                rq->best_expired_prio = p->static_prio;
2468                } else
2469                        enqueue_task(p, rq->active);
2470        } else {
2471                /*
2472                 * Prevent a too long timeslice allowing a task to monopolize
2473                 * the CPU. We do this by splitting up the timeslice into
2474                 * smaller pieces.
2475                 *
2476                 * Note: this does not mean the task's timeslices expire or
2477                 * get lost in any way, they just might be preempted by
2478                 * another task of equal priority. (one with higher
2479                 * priority would have preempted this task already.) We
2480                 * requeue this task to the end of the list on this priority
2481                 * level, which is in essence a round-robin of tasks with
2482                 * equal priority.
2483                 *
2484                 * This only applies to tasks in the interactive
2485                 * delta range with at least TIMESLICE_GRANULARITY to requeue.
2486                 */
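                    /*
                     * Illustrative numbers (an assumption, since the
                     * value of TIMESLICE_GRANULARITY(p) is defined
                     * elsewhere): with a 100-tick timeslice and a
                     * 20-tick granularity the task is requeued behind
                     * its priority peers after every 20 ticks of use,
                     * i.e. with 80, 60, 40 and 20 ticks remaining.
                     */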
2487                if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
2488                        p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
2489                        (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
2490                        (p->array == rq->active)) {
2491
2492                        requeue_task(p, rq->active);
2493                        set_tsk_need_resched(p);
2494                }
2495        }
2496out_unlock:
2497        spin_unlock(&rq->lock);
2498out:
2499        rebalance_tick(cpu, rq, NOT_IDLE);
2500}
2501
2502#ifdef CONFIG_SCHED_SMT
2503static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2504{
2505        struct sched_domain *sd = this_rq->sd;
2506        cpumask_t sibling_map;
2507        int i;
2508
2509        if (!(sd->flags & SD_SHARE_CPUPOWER))
2510                return;
2511
2512        /*
2513         * Unlock the current runqueue because we have to lock in
2514         * CPU order to avoid deadlocks. Caller knows that we might
2515         * unlock. We keep IRQs disabled.
2516         */
2517        spin_unlock(&this_rq->lock);
2518
2519        sibling_map = sd->span;
2520
2521        for_each_cpu_mask(i, sibling_map)
2522                spin_lock(&cpu_rq(i)->lock);
2523        /*
2524         * We clear this CPU from the mask. This both simplifies the
2525         * inner loop and keeps this_rq locked when we exit:
2526         */
2527        cpu_clear(this_cpu, sibling_map);
2528
2529        for_each_cpu_mask(i, sibling_map) {
2530                runqueue_t *smt_rq = cpu_rq(i);
2531
2532                /*
2533                 * If an SMT sibling task is sleeping due to priority
2534         * reasons, wake it up now.
2535                 */
2536                if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
2537                        resched_task(smt_rq->idle);
2538        }
2539
2540        for_each_cpu_mask(i, sibling_map)
2541                spin_unlock(&cpu_rq(i)->lock);
2542        /*
2543         * We exit with this_cpu's rq still held and IRQs
2544         * still disabled:
2545         */
2546}
2547
2548static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2549{
2550        struct sched_domain *sd = this_rq->sd;
2551        cpumask_t sibling_map;
2552        prio_array_t *array;
2553        int ret = 0, i;
2554        task_t *p;
2555
2556        if (!(sd->flags & SD_SHARE_CPUPOWER))
2557                return 0;
2558
2559        /*
2560         * The same locking rules and details apply as for
2561         * wake_sleeping_dependent():
2562         */
2563        spin_unlock(&this_rq->lock);
2564        sibling_map = sd->span;
2565        for_each_cpu_mask(i, sibling_map)
2566                spin_lock(&cpu_rq(i)->lock);
2567        cpu_clear(this_cpu, sibling_map);
2568
2569        /*
2570         * Establish next task to be run - it might have gone away because
2571         * we released the runqueue lock above:
2572         */
2573        if (!this_rq->nr_running)
2574                goto out_unlock;
2575        array = this_rq->active;
2576        if (!array->nr_active)
2577                array = this_rq->expired;
2578        BUG_ON(!array->nr_active);
2579
2580        p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
2581                task_t, run_list);
2582
2583        for_each_cpu_mask(i, sibling_map) {
2584                runqueue_t *smt_rq = cpu_rq(i);
2585                task_t *smt_curr = smt_rq->curr;
2586
2587                /*
2588                 * If a user task with lower static priority than the
2589                 * running task on the SMT sibling is trying to schedule,
2590                 * delay it till there is proportionately less timeslice
2591                 * left of the sibling task to prevent a lower priority
2592                 * task from using an unfair proportion of the
2593                 * physical cpu's resources. -ck
2594                 */
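                    /*
                     * E.g. (a sketch, assuming per_cpu_gain == 25): p is
                     * delayed only while 75% of the sibling task's
                     * remaining timeslice still exceeds p's full
                     * timeslice - say 75 of 100 remaining ticks versus a
                     * 70-tick slice for p - or while the sibling runs a
                     * real-time task.
                     */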
2595                if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
2596                        task_timeslice(p) || rt_task(smt_curr)) &&
2597                        p->mm && smt_curr->mm && !rt_task(p))
2598                                ret = 1;
2599
2600                /*
2601                 * Reschedule a lower priority task on the SMT sibling,
2602                 * or wake it up if it has been put to sleep for priority
2603                 * reasons.
2604                 */
2605                if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
2606                        task_timeslice(smt_curr) || rt_task(p)) &&
2607                        smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
2608                        (smt_curr == smt_rq->idle && smt_rq->nr_running))
2609                                resched_task(smt_curr);
2610        }
2611out_unlock:
2612        for_each_cpu_mask(i, sibling_map)
2613                spin_unlock(&cpu_rq(i)->lock);
2614        return ret;
2615}
2616#else
2617static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2618{
2619}
2620
2621static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2622{
2623        return 0;
2624}
2625#endif
2626
2627#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
2628
2629void fastcall add_preempt_count(int val)
2630{
2631        /*
2632         * Underflow?
2633         */
2634        BUG_ON(((int)preempt_count() < 0));
2635        preempt_count() += val;
2636        /*
2637         * Spinlock count overflowing soon?
2638         */
2639        BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
2640}
2641EXPORT_SYMBOL(add_preempt_count);
2642
2643void fastcall sub_preempt_count(int val)
2644{
2645        /*
2646         * Underflow?
2647         */
2648        BUG_ON(val > preempt_count());
2649        /*
2650         * Is the spinlock portion underflowing?
2651         */
2652        BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
2653        preempt_count() -= val;
2654}
2655EXPORT_SYMBOL(sub_preempt_count);
2656
2657#endif
2658
2659/*
2660 * schedule() is the main scheduler function.
2661 */
2662asmlinkage void __sched schedule(void)
2663{
2664        long *switch_count;
2665        task_t *prev, *next;
2666        runqueue_t *rq;
2667        prio_array_t *array;
2668        struct list_head *queue;
2669        unsigned long long now;
2670        unsigned long run_time;
2671        int cpu, idx;
2672
2673        /*
2674         * Test if we are atomic.  Since do_exit() needs to call into
2675         * schedule() atomically, we ignore that path for now.
2676         * Otherwise, whine if we are scheduling when we should not be.
2677         */
2678        if (likely(!current->exit_state)) {
2679                if (unlikely(in_atomic())) {
2680                        printk(KERN_ERR "scheduling while atomic: "
2681                                "%s/0x%08x/%d\n",
2682                                current->comm, preempt_count(), current->pid);
2683                        dump_stack();
2684                }
2685        }
2686        profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2687
2688need_resched:
2689        preempt_disable();
2690        prev = current;
2691        release_kernel_lock(prev);
2692need_resched_nonpreemptible:
2693        rq = this_rq();
2694
2695        /*
2696         * The idle thread is not allowed to schedule!
2697         * Remove this check after it has been exercised a bit.
2698         */
2699        if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
2700                printk(KERN_ERR "bad: scheduling from the idle thread!\n");
2701                dump_stack();
2702        }
2703
2704        schedstat_inc(rq, sched_cnt);
2705        now = sched_clock();
2706        if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
2707                run_time = now - prev->timestamp;
2708        else
2709                run_time = NS_MAX_SLEEP_AVG;
2710
2711        /*
2712         * Tasks are charged proportionately less run_time at high sleep_avg
2713         * to delay them losing their interactive status.
2714         */
2715        run_time /= (CURRENT_BONUS(prev) ? : 1);
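            /*
             * E.g. a fully interactive task (maximum sleep_avg, so
             * CURRENT_BONUS == MAX_BONUS) is only charged 1/MAX_BONUS of
             * its actual run_time here, while a task with no sleep_avg
             * is charged in full; the '?:' guards against a divide by
             * zero when the bonus is 0.
             */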
2716
2717        spin_lock_irq(&rq->lock);
2718
2719        if (unlikely(prev->flags & PF_DEAD))
2720                prev->state = EXIT_DEAD;
2721
2722        switch_count = &prev->nivcsw;
2723        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2724                switch_count = &prev->nvcsw;
2725                if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
2726                                unlikely(signal_pending(prev))))
2727                        prev->state = TASK_RUNNING;
2728                else {
2729                        if (prev->state == TASK_UNINTERRUPTIBLE)
2730                                rq->nr_uninterruptible++;
2731                        deactivate_task(prev, rq);
2732                }
2733        }
2734
2735        cpu = smp_processor_id();
2736        if (unlikely(!rq->nr_running)) {
2737go_idle:
2738                idle_balance(cpu, rq);
2739                if (!rq->nr_running) {
2740                        next = rq->idle;
2741                        rq->expired_timestamp = 0;
2742                        wake_sleeping_dependent(cpu, rq);
2743                        /*
2744                         * wake_sleeping_dependent() might have released
2745                         * the runqueue, so break out if we got new
2746                         * tasks meanwhile:
2747                         */
2748                        if (!rq->nr_running)
2749                                goto switch_tasks;
2750                }
2751        } else {
2752                if (dependent_sleeper(cpu, rq)) {
2753                        next = rq->idle;
2754                        goto switch_tasks;
2755                }
2756                /*
2757                 * dependent_sleeper() releases and reacquires the runqueue
2758                 * lock, hence go into the idle loop if the rq went
2759                 * empty meanwhile:
2760                 */
2761                if (unlikely(!rq->nr_running))
2762                        goto go_idle;
2763        }
2764
2765        array = rq->active;
2766        if (unlikely(!array->nr_active)) {
2767                /*
2768                 * Switch the active and expired arrays.
2769                 */
2770                schedstat_inc(rq, sched_switch);
2771                rq->active = rq->expired;
2772                rq->expired = array;
2773                array = rq->active;
2774                rq->expired_timestamp = 0;
2775                rq->best_expired_prio = MAX_PRIO;
2776        } else
2777                schedstat_inc(rq, sched_noswitch);
2778
2779        idx = sched_find_first_bit(array->bitmap);
2780        queue = array->queue + idx;
2781        next = list_entry(queue->next, task_t, run_list);
2782
2783        if (!rt_task(next) && next->activated > 0) {
2784                unsigned long long delta = now - next->timestamp;
2785
2786                if (next->activated == 1)
2787                        delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2788
2789                array = next->array;
2790                dequeue_task(next, array);
2791                recalc_task_prio(next, next->timestamp + delta);
2792                enqueue_task(next, array);
2793        }
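            /*
             * The scaling above credits an activated == 1 wakeup (a
             * normal wakeup, as opposed to one done from interrupt
             * context) with only about ON_RUNQUEUE_WEIGHT percent -
             * roughly 30% with the default tuning - of the time it
             * spent waiting on the runqueue.
             */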
2794        next->activated = 0;
2795switch_tasks:
2796        if (next == rq->idle)
2797                schedstat_inc(rq, sched_goidle);
2798        prefetch(next);
2799        clear_tsk_need_resched(prev);
2800        rcu_qsctr_inc(task_cpu(prev));
2801
2802        prev->sleep_avg -= run_time;
2803        if ((long)prev->sleep_avg <= 0)
2804                prev->sleep_avg = 0;
2805        prev->timestamp = prev->last_ran = now;
2806
2807        sched_info_switch(prev, next);
2808        if (likely(prev != next)) {
2809                next->timestamp = now;
2810                rq->nr_switches++;
2811                rq->curr = next;
2812                ++*switch_count;
2813
2814                prepare_arch_switch(rq, next);
2815                prev = context_switch(rq, prev, next);
2816                barrier();
2817
2818                finish_task_switch(prev);
2819        } else
2820                spin_unlock_irq(&rq->lock);
2821
2822        prev = current;
2823        if (unlikely(reacquire_kernel_lock(prev) < 0))
2824                goto need_resched_nonpreemptible;
2825        preempt_enable_no_resched();
2826        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2827                goto need_resched;
2828}
2829
2830EXPORT_SYMBOL(schedule);
2831
2832#ifdef CONFIG_PREEMPT
2833/*
2834 * This is the entry point to schedule() from in-kernel preemption
2835 * off of preempt_enable.  Kernel preemptions off of return-from-interrupt
2836 * are handled separately by preempt_schedule_irq() below.
2837 */
2838asmlinkage void __sched preempt_schedule(void)
2839{
2840        struct thread_info *ti = current_thread_info();
2841#ifdef CONFIG_PREEMPT_BKL
2842        struct task_struct *task = current;
2843        int saved_lock_depth;
2844#endif
2845        /*
2846         * If there is a non-zero preempt_count or interrupts are disabled,
2847         * we do not want to preempt the current task.  Just return..
2848         */
2849        if (unlikely(ti->preempt_count || irqs_disabled()))
2850                return;
2851
2852need_resched:
2853        add_preempt_count(PREEMPT_ACTIVE);
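            /*
             * PREEMPT_ACTIVE makes schedule() skip deactivate_task() for
             * a task preempted while setting a non-running state, so a
             * task in the middle of going to sleep is not dequeued
             * prematurely (see the PREEMPT_ACTIVE test in schedule()).
             */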
2854        /*
2855         * We keep the big kernel semaphore locked, but we
2856         * clear ->lock_depth so that schedule() doesn't
2857         * auto-release the semaphore:
2858         */
2859#ifdef CONFIG_PREEMPT_BKL
2860        saved_lock_depth = task->lock_depth;
2861        task->lock_depth = -1;
2862#endif
2863        schedule();
2864#ifdef CONFIG_PREEMPT_BKL
2865        task->lock_depth = saved_lock_depth;
2866#endif
2867        sub_preempt_count(PREEMPT_ACTIVE);
2868
2869        /* we could miss a preemption opportunity between schedule and now */
2870        barrier();
2871        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2872                goto need_resched;
2873}
2874
2875EXPORT_SYMBOL(preempt_schedule);
2876
2877/*
2878 * This is the entry point to schedule() from kernel preemption
2879 * off of irq context.
2880 * Note that this is called and returns with irqs disabled. This
2881 * protects us against recursive calls from irq context.
2882 */
2883asmlinkage void __sched preempt_schedule_irq(void)
2884{
2885        struct thread_info *ti = current_thread_info();
2886#ifdef CONFIG_PREEMPT_BKL
2887        struct task_struct *task = current;
2888        int saved_lock_depth;
2889#endif
2890        /* Catch callers which need to be fixed */
2891        BUG_ON(ti->preempt_count || !irqs_disabled());
2892
2893need_resched:
2894        add_preempt_count(PREEMPT_ACTIVE);
2895        /*
2896         * We keep the big kernel semaphore locked, but we
2897         * clear ->lock_depth so that schedule() doesn't
2898         * auto-release the semaphore:
2899         */
2900#ifdef CONFIG_PREEMPT_BKL
2901        saved_lock_depth = task->lock_depth;
2902        task->lock_depth = -1;
2903#endif
2904        local_irq_enable();
2905        schedule();
2906        local_irq_disable();
2907#ifdef CONFIG_PREEMPT_BKL
2908        task->lock_depth = saved_lock_depth;
2909#endif
2910        sub_preempt_count(PREEMPT_ACTIVE);
2911
2912        /* we could miss a preemption opportunity between schedule and now */
2913        barrier();
2914        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2915                goto need_resched;
2916}
2917
2918#endif /* CONFIG_PREEMPT */
2919
2920int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
2921{
2922        task_t *p = curr->task;
2923        return try_to_wake_up(p, mode, sync);
2924}
2925
2926EXPORT_SYMBOL(default_wake_function);
2927
2928/*
2929 * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
2930 * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
2931 * number) then we wake all the non-exclusive tasks and one exclusive task.
2932 *
2933 * There are circumstances in which we can try to wake a task which has already
2934 * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
2935 * zero in this (rare) case, and we handle it by continuing to scan the queue.
2936 */
2937static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2938                             int nr_exclusive, int sync, void *key)
2939{
2940        struct list_head *tmp, *next;
2941
2942        list_for_each_safe(tmp, next, &q->task_list) {
2943                wait_queue_t *curr;
2944                unsigned flags;
2945                curr = list_entry(tmp, wait_queue_t, task_list);
2946                flags = curr->flags;
2947                if (curr->func(curr, mode, sync, key) &&
2948                    (flags & WQ_FLAG_EXCLUSIVE) &&
2949                    !--nr_exclusive)
2950                        break;
2951        }
2952}
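    /*
     * Example: exclusive waiters are queued at the tail of the list, so
     * with two non-exclusive waiters followed by two exclusive ones and
     * nr_exclusive == 1, both non-exclusive waiters plus the first
     * exclusive waiter are woken and the second exclusive waiter keeps
     * sleeping.
     */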
2953
2954/**
2955 * __wake_up - wake up threads blocked on a waitqueue.
2956 * @q: the waitqueue
2957 * @mode: which threads
2958 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2959 */
2960void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
2961                                int nr_exclusive, void *key)
2962{
2963        unsigned long flags;
2964
2965        spin_lock_irqsave(&q->lock, flags);
2966        __wake_up_common(q, mode, nr_exclusive, 0, key);
2967        spin_unlock_irqrestore(&q->lock, flags);
2968}
2969
2970EXPORT_SYMBOL(__wake_up);
2971
2972/*
2973 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
2974 */
2975void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
2976{
2977        __wake_up_common(q, mode, 1, 0, NULL);
2978}
2979
2980/**
2981 * __wake_up_sync - wake up threads blocked on a waitqueue.
2982 * @q: the waitqueue
2983 * @mode: which threads
2984 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2985 *
2986 * The sync wakeup differs in that the waker knows that it will schedule
2987 * away soon, so while the target thread will be woken up, it will not
2988 * be migrated to another CPU - ie. the two threads are 'synchronized'
2989 * with each other. This can prevent needless bouncing between CPUs.
2990 *
2991 * On UP it can prevent extra preemption.
2992 */
2993void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2994{
2995        unsigned long flags;
2996        int sync = 1;
2997
2998        if (unlikely(!q))
2999                return;
3000
3001        if (unlikely(!nr_exclusive))
3002                sync = 0;
3003
3004        spin_lock_irqsave(&q->lock, flags);
3005        __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3006        spin_unlock_irqrestore(&q->lock, flags);
3007}
3008EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
3009
3010void fastcall complete(struct completion *x)
3011{
3012        unsigned long flags;
3013
3014        spin_lock_irqsave(&x->wait.lock, flags);
3015        x->done++;
3016        __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3017                         1, 0, NULL);
3018        spin_unlock_irqrestore(&x->wait.lock, flags);
3019}
3020EXPORT_SYMBOL(complete);
3021
3022void fastcall complete_all(struct completion *x)
3023{
3024        unsigned long flags;
3025
3026        spin_lock_irqsave(&x->wait.lock, flags);
3027        x->done += UINT_MAX/2;
3028        __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3029                         0, 0, NULL);
3030        spin_unlock_irqrestore(&x->wait.lock, flags);
3031}
3032EXPORT_SYMBOL(complete_all);
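    /*
     * Note that ->done is a counter rather than a flag: a complete()
     * issued before anyone is waiting is remembered and later consumed
     * by a waiter's x->done--.  complete_all() bumps the counter by
     * UINT_MAX/2 so that, in practice, every current and future waiter
     * sees the completion.
     */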
3033
3034void fastcall __sched wait_for_completion(struct completion *x)
3035{
3036        might_sleep();
3037        spin_lock_irq(&x->wait.lock);
3038        if (!x->done) {
3039                DECLARE_WAITQUEUE(wait, current);
3040
3041                wait.flags |= WQ_FLAG_EXCLUSIVE;
3042                __add_wait_queue_tail(&x->wait, &wait);
3043                do {
3044                        __set_current_state(TASK_UNINTERRUPTIBLE);
3045                        spin_unlock_irq(&x->wait.lock);
3046                        schedule();
3047                        spin_lock_irq(&x->wait.lock);
3048                } while (!x->done);
3049                __remove_wait_queue(&x->wait, &wait);
3050        }
3051        x->done--;
3052        spin_unlock_irq(&x->wait.lock);
3053}
3054EXPORT_SYMBOL(wait_for_completion);
3055
3056unsigned long fastcall __sched
3057wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3058{
3059        might_sleep();
3060
3061        spin_lock_irq(&x->wait.lock);
3062        if (!x->done) {
3063                DECLARE_WAITQUEUE(wait, current);
3064
3065                wait.flags |= WQ_FLAG_EXCLUSIVE;
3066                __add_wait_queue_tail(&x->wait, &wait);
3067                do {
3068                        __set_current_state(TASK_UNINTERRUPTIBLE);
3069                        spin_unlock_irq(&x->wait.lock);
3070                        timeout = schedule_timeout(timeout);
3071                        spin_lock_irq(&x->wait.lock);
3072                        if (!timeout) {
3073                                __remove_wait_queue(&x->wait, &wait);
3074                                goto out;
3075                        }
3076                } while (!x->done);
3077                __remove_wait_queue(&x->wait, &wait);
3078        }
3079        x->done--;
3080out:
3081        spin_unlock_irq(&x->wait.lock);
3082        return timeout;
3083}
3084EXPORT_SYMBOL(wait_for_completion_timeout);
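
/*
 * Sketch: the timeout variant returns the remaining jiffies, or 0 if the
 * completion was never signalled.  'done' here is a hypothetical argument.
 */
static int example_wait_one_second(struct completion *done)
{
        unsigned long left = wait_for_completion_timeout(done, HZ);

        return left ? 0 : -ETIMEDOUT;
}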
3085
3086int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3087{
3088        int ret = 0;
3089
3090        might_sleep();
3091
3092        spin_lock_irq(&x->wait.lock);
3093        if (!x->done) {
3094                DECLARE_WAITQUEUE(wait, current);
3095
3096                wait.flags |= WQ_FLAG_EXCLUSIVE;
3097                __add_wait_queue_tail(&x->wait, &wait);
3098                do {
3099                        if (signal_pending(current)) {
3100                                ret = -ERESTARTSYS;
3101                                __remove_wait_queue(&x->wait, &wait);
3102                                goto out;
3103                        }
3104                        __set_current_state(TASK_INTERRUPTIBLE);
3105                        spin_unlock_irq(&x->wait.lock);
3106                        schedule();
3107                        spin_lock_irq(&x->wait.lock);
3108                } while (!x->done);
3109                __remove_wait_queue(&x->wait, &wait);
3110        }
3111        x->done--;
3112out:
3113        spin_unlock_irq(&x->wait.lock);
3114
3115        return ret;
3116}
3117EXPORT_SYMBOL(wait_for_completion_interruptible);
3118
3119unsigned long fastcall __sched
3120wait_for_completion_interruptible_timeout(struct completion *x,
3121                                          unsigned long timeout)
3122{
3123        might_sleep();
3124
3125        spin_lock_irq(&x->wait.lock);
3126        if (!x->done) {
3127                DECLARE_WAITQUEUE(wait, current);
3128
3129                wait.flags |= WQ_FLAG_EXCLUSIVE;
3130                __add_wait_queue_tail(&x->wait, &wait);
3131                do {
3132                        if (signal_pending(current)) {
3133                                timeout = -ERESTARTSYS;
3134                                __remove_wait_queue(&x->wait, &wait);
3135                                goto out;
3136                        }
3137                        __set_current_state(TASK_INTERRUPTIBLE);
3138                        spin_unlock_irq(&x->wait.lock);
3139                        timeout = schedule_timeout(timeout);
3140                        spin_lock_irq(&x->wait.lock);
3141                        if (!timeout) {
3142                                __remove_wait_queue(&x->wait, &wait);
3143                                goto out;
3144                        }
3145                } while (!x->done);
3146                __remove_wait_queue(&x->wait, &wait);
3147        }
3148        x->done--;
3149out:
3150        spin_unlock_irq(&x->wait.lock);
3151        return timeout;
3152}
3153EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3154
3155
3156#define SLEEP_ON_VAR                                    \
3157        unsigned long flags;                            \
3158        wait_queue_t wait;                              \
3159        init_waitqueue_entry(&wait, current);
3160
3161#define SLEEP_ON_HEAD                                   \
3162        spin_lock_irqsave(&q->lock,flags);              \
3163        __add_wait_queue(q, &wait);                     \
3164        spin_unlock(&q->lock);
3165
3166#define SLEEP_ON_TAIL                                   \
3167        spin_lock_irq(&q->lock);                        \
3168        __remove_wait_queue(q, &wait);                  \
3169        spin_unlock_irqrestore(&q->lock, flags);
3170
3171void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3172{
3173        SLEEP_ON_VAR
3174
3175        current->state = TASK_INTERRUPTIBLE;
3176
3177        SLEEP_ON_HEAD
3178        schedule();
3179        SLEEP_ON_TAIL
3180}
3181
3182EXPORT_SYMBOL(interruptible_sleep_on);
3183
3184long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3185{
3186        SLEEP_ON_VAR
3187
3188        current->state = TASK_INTERRUPTIBLE;
3189
3190        SLEEP_ON_HEAD
3191        timeout = schedule_timeout(timeout);
3192        SLEEP_ON_TAIL
3193
3194        return timeout;
3195}
3196
3197EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3198
3199void fastcall __sched sleep_on(wait_queue_head_t *q)
3200{
3201        SLEEP_ON_VAR
3202
3203        current->state = TASK_UNINTERRUPTIBLE;
3204
3205        SLEEP_ON_HEAD
3206        schedule();
3207        SLEEP_ON_TAIL
3208}
3209
3210EXPORT_SYMBOL(sleep_on);
3211
3212long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3213{
3214        SLEEP_ON_VAR
3215
3216        current->state = TASK_UNINTERRUPTIBLE;
3217
3218        SLEEP_ON_HEAD
3219        timeout = schedule_timeout(timeout);
3220        SLEEP_ON_TAIL
3221
3222        return timeout;
3223}
3224
3225EXPORT_SYMBOL(sleep_on_timeout);
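
/*
 * Sketch of how the (legacy) sleep_on() family is typically used.  If the
 * wakeup arrives between the condition test and the enqueue done inside
 * interruptible_sleep_on(), it is lost, which is why new code prefers the
 * wait_event*() macros.  'legacy_wq' and 'legacy_cond' are hypothetical.
 */
static DECLARE_WAIT_QUEUE_HEAD(legacy_wq);
static int legacy_cond;

static void example_legacy_wait(void)
{
        while (!legacy_cond)
                interruptible_sleep_on(&legacy_wq);     /* racy window here */
}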
3226
3227void set_user_nice(task_t *p, long nice)
3228{
3229        unsigned long flags;
3230        prio_array_t *array;
3231        runqueue_t *rq;
3232        int old_prio, new_prio, delta;
3233
3234        if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3235                return;
3236        /*
3237         * We have to be careful, if called from sys_setpriority(),
3238         * the task might be in the middle of scheduling on another CPU.
3239         */
3240        rq = task_rq_lock(p, &flags);
3241        /*
3242         * The RT priorities are set via sched_setscheduler(), but we still
3243         * allow the 'normal' nice value to be set - but as expected
3244         * it won't have any effect on scheduling until the task
3245         * becomes SCHED_NORMAL:
3246         */
3247        if (rt_task(p)) {
3248                p->static_prio = NICE_TO_PRIO(nice);
3249                goto out_unlock;
3250        }
3251        array = p->array;
3252        if (array)
3253                dequeue_task(p, array);
3254
3255        old_prio = p->prio;
3256        new_prio = NICE_TO_PRIO(nice);
3257        delta = new_prio - old_prio;
3258        p->static_prio = NICE_TO_PRIO(nice);
3259        p->prio += delta;
3260
3261        if (array) {
3262                enqueue_task(p, array);
3263                /*
3264                 * If the task increased its priority or is running and
3265                 * lowered its priority, then reschedule its CPU:
3266                 */
3267                if (delta < 0 || (delta > 0 && task_running(rq, p)))
3268                        resched_task(rq->curr);
3269        }
3270out_unlock:
3271        task_rq_unlock(rq, &flags);
3272}
3273
3274EXPORT_SYMBOL(set_user_nice);
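
/*
 * Sketch: kernel threads that want a non-default nice level simply call
 * set_user_nice() on themselves.  example_worker() is a hypothetical
 * kthread body, not an existing caller.
 */
static int example_worker(void *unused)
{
        set_user_nice(current, -5);             /* mildly boost this kthread */

        while (!kthread_should_stop()) {
                /* ... do some periodic work ... */
                set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(HZ);
        }
        return 0;
}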
3275
3276#ifdef __ARCH_WANT_SYS_NICE
3277
3278/*
3279 * sys_nice - change the priority of the current process.
3280 * @increment: priority increment
3281 *
3282 * sys_setpriority is a more generic, but much slower function that
3283 * does similar things.
3284 */
3285asmlinkage long sys_nice(int increment)
3286{
3287        int retval;
3288        long nice;
3289
3290        /*
3291         * Setpriority might change our priority at the same moment.
3292         * We don't have to worry. Conceptually one call occurs first
3293         * and we have a single winner.
3294         */
3295        if (increment < 0) {
3296                if (!capable(CAP_SYS_NICE))
3297                        return -EPERM;
3298                if (increment < -40)
3299                        increment = -40;
3300        }
3301        if (increment > 40)
3302                increment = 40;
3303
3304        nice = PRIO_TO_NICE(current->static_prio) + increment;
3305        if (nice < -20)
3306                nice = -20;
3307        if (nice > 19)
3308                nice = 19;
3309
3310        retval = security_task_setnice(current, nice);
3311        if (retval)
3312                return retval;
3313
3314        set_user_nice(current, nice);
3315        return 0;
3316}
3317
3318#endif
3319
3320/**
3321 * task_prio - return the priority value of a given task.
3322 * @p: the task in question.
3323 *
3324 * This is the priority value as seen by users in /proc.
3325 * RT tasks are offset by -200. Normal tasks are centered
3326 * around 0, value goes from -16 to +15.
3327 */
3328int task_prio(const task_t *p)
3329{
3330        return p->prio - MAX_RT_PRIO;
3331}
3332
3333/**
3334 * task_nice - return the nice value of a given task.
3335 * @p: the task in question.
3336 */
3337int task_nice(const task_t *p)
3338{
3339        return TASK_NICE(p);
3340}
3341
3342/*
3343 * The only users of task_nice are binfmt_elf and binfmt_elf32.
3344 * binfmt_elf is no longer modular, but binfmt_elf32 still is.
3345 * Therefore, task_nice is needed if there is a compat_mode.
3346 */
3347#ifdef CONFIG_COMPAT
3348EXPORT_SYMBOL_GPL(task_nice);
3349#endif
3350
3351/**
3352 * idle_cpu - is a given cpu idle currently?
3353 * @cpu: the processor in question.
3354 */
3355int idle_cpu(int cpu)
3356{
3357        return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3358}
3359
3360EXPORT_SYMBOL_GPL(idle_cpu);
3361
3362/**
3363 * idle_task - return the idle task for a given cpu.
3364 * @cpu: the processor in question.
3365 */
3366task_t *idle_task(int cpu)
3367{
3368        return cpu_rq(cpu)->idle;
3369}
3370
3371/**
3372 * find_process_by_pid - find a process with a matching PID value.
3373 * @pid: the pid in question.
3374 */
3375static inline task_t *find_process_by_pid(pid_t pid)
3376{
3377        return pid ? find_task_by_pid(pid) : current;
3378}
3379
3380/* Actually do priority change: must hold rq lock. */
3381static void __setscheduler(struct task_struct *p, int policy, int prio)
3382{
3383        BUG_ON(p->array);
3384        p->policy = policy;
3385        p->rt_priority = prio;
3386        if (policy != SCHED_NORMAL)
3387                p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
3388        else
3389                p->prio = p->static_prio;
3390}
3391
3392/**
3393 * sched_setscheduler - change the scheduling policy and/or RT priority of
3394 * a thread.
3395 * @p: the task in question.
3396 * @policy: new policy.
3397 * @param: structure containing the new RT priority.
3398 */
3399int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param)
3400{
3401        int retval;
3402        int oldprio, oldpolicy = -1;
3403        prio_array_t *array;
3404        unsigned long flags;
3405        runqueue_t *rq;
3406
3407recheck:
3408        /* double check policy once rq lock held */
3409        if (policy < 0)
3410                policy = oldpolicy = p->policy;
3411        else if (policy != SCHED_FIFO && policy != SCHED_RR &&
3412                                policy != SCHED_NORMAL)
3413                        return -EINVAL;
3414        /*
3415         * Valid priorities for SCHED_FIFO and SCHED_RR are
3416         * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
3417         */
3418        if (param->sched_priority < 0 ||
3419            param->sched_priority > MAX_USER_RT_PRIO-1)
3420                return -EINVAL;
3421        if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
3422                return -EINVAL;
3423
3424        if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
3425            !capable(CAP_SYS_NICE))
3426                return -EPERM;
3427        if ((current->euid != p->euid) && (current->euid != p->uid) &&
3428            !capable(CAP_SYS_NICE))
3429                return -EPERM;
3430
3431        retval = security_task_setscheduler(p, policy, param);
3432        if (retval)
3433                return retval;
3434        /*
3435         * To be able to change p->policy safely, the appropriate
3436         * runqueue lock must be held.
3437         */
3438        rq = task_rq_lock(p, &flags);
3439        /* recheck policy now with rq lock held */
3440        if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3441                policy = oldpolicy = -1;
3442                task_rq_unlock(rq, &flags);
3443                goto recheck;
3444        }
3445        array = p->array;
3446        if (array)
3447                deactivate_task(p, rq);
3448        oldprio = p->prio;
3449        __setscheduler(p, policy, param->sched_priority);
3450        if (array) {
3451                __activate_task(p, rq);
3452                /*
3453                 * Reschedule if we are currently running on this runqueue and
3454                 * our priority decreased, or if we are not currently running on
3455                 * this runqueue and our priority is higher than the current's
3456                 */
3457                if (task_running(rq, p)) {
3458                        if (p->prio > oldprio)
3459                                resched_task(rq->curr);
3460                } else if (TASK_PREEMPTS_CURR(p, rq))
3461                        resched_task(rq->curr);
3462        }
3463        task_rq_unlock(rq, &flags);
3464        return 0;
3465}
3466EXPORT_SYMBOL_GPL(sched_setscheduler);
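
/*
 * Sketch: in-kernel callers switch a task to a real-time policy by filling
 * a struct sched_param, much like the migration-thread setup further down.
 * example_make_fifo() itself is hypothetical.
 */
static int example_make_fifo(task_t *p, int prio)
{
        struct sched_param param = { .sched_priority = prio };

        return sched_setscheduler(p, SCHED_FIFO, &param);
}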
3467
3468static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3469{
3470        int retval;
3471        struct sched_param lparam;
3472        struct task_struct *p;
3473
3474        if (!param || pid < 0)
3475                return -EINVAL;
3476        if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3477                return -EFAULT;
3478        read_lock_irq(&tasklist_lock);
3479        p = find_process_by_pid(pid);
3480        if (!p) {
3481                read_unlock_irq(&tasklist_lock);
3482                return -ESRCH;
3483        }
3484        retval = sched_setscheduler(p, policy, &lparam);
3485        read_unlock_irq(&tasklist_lock);
3486        return retval;
3487}
3488
3489/**
3490 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3491 * @pid: the pid in question.
3492 * @policy: new policy.
3493 * @param: structure containing the new RT priority.
3494 */
3495asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
3496                                       struct sched_param __user *param)
3497{
3498        return do_sched_setscheduler(pid, policy, param);
3499}
3500
3501/**
3502 * sys_sched_setparam - set/change the RT priority of a thread
3503 * @pid: the pid in question.
3504 * @param: structure containing the new RT priority.
3505 */
3506asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
3507{
3508        return do_sched_setscheduler(pid, -1, param);
3509}
3510
3511/**
3512 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3513 * @pid: the pid in question.
3514 */
3515asmlinkage long sys_sched_getscheduler(pid_t pid)
3516{
3517        int retval = -EINVAL;
3518        task_t *p;
3519
3520        if (pid < 0)
3521                goto out_nounlock;
3522
3523        retval = -ESRCH;
3524        read_lock(&tasklist_lock);
3525        p = find_process_by_pid(pid);
3526        if (p) {
3527                retval = security_task_getscheduler(p);
3528                if (!retval)
3529                        retval = p->policy;
3530        }
3531        read_unlock(&tasklist_lock);
3532
3533out_nounlock:
3534        return retval;
3535}
3536
3537/**
3538 * sys_sched_getparam - get the RT priority of a thread
3539 * @pid: the pid in question.
3540 * @param: structure containing the RT priority.
3541 */
3542asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
3543{
3544        struct sched_param lp;
3545        int retval = -EINVAL;
3546        task_t *p;
3547
3548        if (!param || pid < 0)
3549                goto out_nounlock;
3550
3551        read_lock(&tasklist_lock);
3552        p = find_process_by_pid(pid);
3553        retval = -ESRCH;
3554        if (!p)
3555                goto out_unlock;
3556
3557        retval = security_task_getscheduler(p);
3558        if (retval)
3559                goto out_unlock;
3560
3561        lp.sched_priority = p->rt_priority;
3562        read_unlock(&tasklist_lock);
3563
3564        /*
3565         * This one might sleep, we cannot do it with a spinlock held ...
3566         */
3567        retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3568
3569out_nounlock:
3570        return retval;
3571
3572out_unlock:
3573        read_unlock(&tasklist_lock);
3574        return retval;
3575}
3576
3577long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3578{
3579        task_t *p;
3580        int retval;
3581
3582        lock_cpu_hotplug();
3583        read_lock(&tasklist_lock);
3584
3585        p = find_process_by_pid(pid);
3586        if (!p) {
3587                read_unlock(&tasklist_lock);
3588                unlock_cpu_hotplug();
3589                return -ESRCH;
3590        }
3591
3592        /*
3593         * It is not safe to call set_cpus_allowed with the
3594         * tasklist_lock held.  We will bump the task_struct's
3595         * usage count and then drop tasklist_lock.
3596         */
3597        get_task_struct(p);
3598        read_unlock(&tasklist_lock);
3599
3600        retval = -EPERM;
3601        if ((current->euid != p->euid) && (current->euid != p->uid) &&
3602                        !capable(CAP_SYS_NICE))
3603                goto out_unlock;
3604
3605        retval = set_cpus_allowed(p, new_mask);
3606
3607out_unlock:
3608        put_task_struct(p);
3609        unlock_cpu_hotplug();
3610        return retval;
3611}
3612
3613static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3614                             cpumask_t *new_mask)
3615{
3616        if (len < sizeof(cpumask_t)) {
3617                memset(new_mask, 0, sizeof(cpumask_t));
3618        } else if (len > sizeof(cpumask_t)) {
3619                len = sizeof(cpumask_t);
3620        }
3621        return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
3622}
3623
3624/**
3625 * sys_sched_setaffinity - set the cpu affinity of a process
3626 * @pid: pid of the process
3627 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3628 * @user_mask_ptr: user-space pointer to the new cpu mask
3629 */
3630asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
3631                                      unsigned long __user *user_mask_ptr)
3632{
3633        cpumask_t new_mask;
3634        int retval;
3635
3636        retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
3637        if (retval)
3638                return retval;
3639
3640        return sched_setaffinity(pid, new_mask);
3641}
3642
3643/*
3644 * Represents all CPUs present in the system.
3645 * In systems capable of hotplug, this map could dynamically grow
3646 * as new CPUs are detected in the system via any platform-specific
3647 * method, such as ACPI, for example.
3648 */
3649
3650cpumask_t cpu_present_map;
3651EXPORT_SYMBOL(cpu_present_map);
3652
3653#ifndef CONFIG_SMP
3654cpumask_t cpu_online_map = CPU_MASK_ALL;
3655cpumask_t cpu_possible_map = CPU_MASK_ALL;
3656#endif
3657
3658long sched_getaffinity(pid_t pid, cpumask_t *mask)
3659{
3660        int retval;
3661        task_t *p;
3662
3663        lock_cpu_hotplug();
3664        read_lock(&tasklist_lock);
3665
3666        retval = -ESRCH;
3667        p = find_process_by_pid(pid);
3668        if (!p)
3669                goto out_unlock;
3670
3671        retval = 0;
3672        cpus_and(*mask, p->cpus_allowed, cpu_possible_map);
3673
3674out_unlock:
3675        read_unlock(&tasklist_lock);
3676        unlock_cpu_hotplug();
3677        if (retval)
3678                return retval;
3679
3680        return 0;
3681}
3682
3683/**
3684 * sys_sched_getaffinity - get the cpu affinity of a process
3685 * @pid: pid of the process
3686 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3687 * @user_mask_ptr: user-space pointer to hold the current cpu mask
3688 */
3689asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
3690                                      unsigned long __user *user_mask_ptr)
3691{
3692        int ret;
3693        cpumask_t mask;
3694
3695        if (len < sizeof(cpumask_t))
3696                return -EINVAL;
3697
3698        ret = sched_getaffinity(pid, &mask);
3699        if (ret < 0)
3700                return ret;
3701
3702        if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
3703                return -EFAULT;
3704
3705        return sizeof(cpumask_t);
3706}
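
/*
 * Userspace view (illustrative only, hence #if 0): the two syscalls above
 * are normally reached through the glibc sched_setaffinity()/
 * sched_getaffinity() wrappers and the CPU_* macros.  Exact wrapper
 * prototypes vary between glibc versions.
 */
#if 0
#define _GNU_SOURCE
#include <sched.h>

static int pin_to_cpu0(pid_t pid)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);
        if (sched_setaffinity(pid, sizeof(set), &set) < 0)
                return -1;
        return sched_getaffinity(pid, sizeof(set), &set);
}
#endif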
3707
3708/**
3709 * sys_sched_yield - yield the current processor to other threads.
3710 *
3711 * this function yields the current CPU by moving the calling thread
3712 * to the expired array. If there are no other threads running on this
3713 * CPU then this function will return.
3714 */
3715asmlinkage long sys_sched_yield(void)
3716{
3717        runqueue_t *rq = this_rq_lock();
3718        prio_array_t *array = current->array;
3719        prio_array_t *target = rq->expired;
3720
3721        schedstat_inc(rq, yld_cnt);
3722        /*
3723         * We implement yielding by moving the task into the expired
3724         * queue.
3725         *
3726         * (special rule: RT tasks will just roundrobin in the active
3727         *  array.)
3728         */
3729        if (rt_task(current))
3730                target = rq->active;
3731
3732        if (current->array->nr_active == 1) {
3733                schedstat_inc(rq, yld_act_empty);
3734                if (!rq->expired->nr_active)
3735                        schedstat_inc(rq, yld_both_empty);
3736        } else if (!rq->expired->nr_active)
3737                schedstat_inc(rq, yld_exp_empty);
3738
3739        if (array != target) {
3740                dequeue_task(current, array);
3741                enqueue_task(current, target);
3742        } else
3743                /*
3744                 * requeue_task is cheaper so perform that if possible.
3745                 */
3746                requeue_task(current, array);
3747
3748        /*
3749         * Since we are going to call schedule() anyway, there's
3750         * no need to preempt or enable interrupts:
3751         */
3752        __release(rq->lock);
3753        _raw_spin_unlock(&rq->lock);
3754        preempt_enable_no_resched();
3755
3756        schedule();
3757
3758        return 0;
3759}
3760
3761static inline void __cond_resched(void)
3762{
3763        do {
3764                add_preempt_count(PREEMPT_ACTIVE);
3765                schedule();
3766                sub_preempt_count(PREEMPT_ACTIVE);
3767        } while (need_resched());
3768}
3769
3770int __sched cond_resched(void)
3771{
3772        if (need_resched()) {
3773                __cond_resched();
3774                return 1;
3775        }
3776        return 0;
3777}
3778
3779EXPORT_SYMBOL(cond_resched);
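
/*
 * Sketch: long-running kernel loops call cond_resched() periodically so a
 * non-preemptible kernel does not hog the CPU.  do_one_item() is a
 * hypothetical stub standing in for the real work.
 */
static void do_one_item(int i)
{
        /* ... one bounded unit of work ... */
}

static void example_process_many(int nr_items)
{
        int i;

        for (i = 0; i < nr_items; i++) {
                do_one_item(i);
                cond_resched();         /* schedules only if need_resched() */
        }
}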
3780
3781/*
3782 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
3783 * call schedule, and on return reacquire the lock.
3784 *
3785 * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
3786 * operations here to prevent schedule() from being called twice (once via
3787 * spin_unlock(), once by hand).
3788 */
3789int cond_resched_lock(spinlock_t * lock)
3790{
3791#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
3792        if (lock->break_lock) {
3793                lock->break_lock = 0;
3794                spin_unlock(lock);
3795                cpu_relax();
3796                spin_lock(lock);
3797        }
3798#endif
3799        if (need_resched()) {
3800                _raw_spin_unlock(lock);
3801                preempt_enable_no_resched();
3802                __cond_resched();
3803                spin_lock(lock);
3804                return 1;
3805        }
3806        return 0;
3807}
3808
3809EXPORT_SYMBOL(cond_resched_lock);
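
/*
 * Sketch: a long scan done under a spinlock.  cond_resched_lock() may drop
 * and retake 'example_lock', so anything protected by it must be
 * revalidated afterwards.  Both names are hypothetical.
 */
static spinlock_t example_lock = SPIN_LOCK_UNLOCKED;

static void example_long_scan(int nr)
{
        int i;

        spin_lock(&example_lock);
        for (i = 0; i < nr; i++) {
                /* ... examine element i under example_lock ... */
                if (cond_resched_lock(&example_lock)) {
                        /* lock was dropped: state may have changed */
                }
        }
        spin_unlock(&example_lock);
}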
3810
3811int __sched cond_resched_softirq(void)
3812{
3813        BUG_ON(!in_softirq());
3814
3815        if (need_resched()) {
3816                __local_bh_enable();
3817                __cond_resched();
3818                local_bh_disable();
3819                return 1;
3820        }
3821        return 0;
3822}
3823
3824EXPORT_SYMBOL(cond_resched_softirq);
3825
3826
3827/**
3828 * yield - yield the current processor to other threads.
3829 *
3830 * this is a shortcut for kernel-space yielding - it marks the
3831 * thread runnable and calls sys_sched_yield().
3832 */
3833void __sched yield(void)
3834{
3835        set_current_state(TASK_RUNNING);
3836        sys_sched_yield();
3837}
3838
3839EXPORT_SYMBOL(yield);
3840
3841/*
3842 * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
3843 * that process accounting knows that this is a task in IO wait state.
3844 *
3845 * But don't do that if it is a deliberate, throttling IO wait (this task
3846 * has set its backing_dev_info: the queue against which it should throttle)
3847 */
3848void __sched io_schedule(void)
3849{
3850        struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
3851
3852        atomic_inc(&rq->nr_iowait);
3853        schedule();
3854        atomic_dec(&rq->nr_iowait);
3855}
3856
3857EXPORT_SYMBOL(io_schedule);
3858
3859long __sched io_schedule_timeout(long timeout)
3860{
3861        struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
3862        long ret;
3863
3864        atomic_inc(&rq->nr_iowait);
3865        ret = schedule_timeout(timeout);
3866        atomic_dec(&rq->nr_iowait);
3867        return ret;
3868}
3869
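/*
 * Sketch: sleeping for I/O goes through io_schedule() rather than a bare
 * schedule() so the sleep is accounted in nr_iowait.  The wait-queue head
 * and 'io_done' flag are hypothetical.
 */
static void example_wait_for_io(wait_queue_head_t *wq, int *io_done)
{
        DEFINE_WAIT(wait);

        for (;;) {
                prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
                if (*io_done)
                        break;
                io_schedule();          /* counted as iowait, not idle */
        }
        finish_wait(wq, &wait);
}
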
3870/**
3871 * sys_sched_get_priority_max - return maximum RT priority.
3872 * @policy: scheduling class.
3873 *
3874 * this syscall returns the maximum rt_priority that can be used
3875 * by a given scheduling class.
3876 */
3877asmlinkage long sys_sched_get_priority_max(int policy)
3878{
3879        int ret = -EINVAL;
3880
3881        switch (policy) {
3882        case SCHED_FIFO:
3883        case SCHED_RR:
3884                ret = MAX_USER_RT_PRIO-1;
3885                break;
3886        case SCHED_NORMAL:
3887                ret = 0;
3888                break;
3889        }
3890        return ret;
3891}
3892
3893/**
3894 * sys_sched_get_priority_min - return minimum RT priority.
3895 * @policy: scheduling class.
3896 *
3897 * this syscall returns the minimum rt_priority that can be used
3898 * by a given scheduling class.
3899 */
3900asmlinkage long sys_sched_get_priority_min(int policy)
3901{
3902        int ret = -EINVAL;
3903
3904        switch (policy) {
3905        case SCHED_FIFO:
3906        case SCHED_RR:
3907                ret = 1;
3908                break;
3909        case SCHED_NORMAL:
3910                ret = 0;
3911        }
3912        return ret;
3913}
3914
3915/**
3916 * sys_sched_rr_get_interval - return the default timeslice of a process.
3917 * @pid: pid of the process.
3918 * @interval: userspace pointer to the timeslice value.
3919 *
3920 * this syscall writes the default timeslice value of a given process
3921 * into the user-space timespec buffer. A value of '0' means infinity.
3922 */
3923asmlinkage
3924long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
3925{
3926        int retval = -EINVAL;
3927        struct timespec t;
3928        task_t *p;
3929
3930        if (pid < 0)
3931                goto out_nounlock;
3932
3933        retval = -ESRCH;
3934        read_lock(&tasklist_lock);
3935        p = find_process_by_pid(pid);
3936        if (!p)
3937                goto out_unlock;
3938
3939        retval = security_task_getscheduler(p);
3940        if (retval)
3941                goto out_unlock;
3942
3943        jiffies_to_timespec(p->policy == SCHED_FIFO ?
3944                                0 : task_timeslice(p), &t);
3945        read_unlock(&tasklist_lock);
3946        retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
3947out_nounlock:
3948        return retval;
3949out_unlock:
3950        read_unlock(&tasklist_lock);
3951        return retval;
3952}
3953
3954static inline struct task_struct *eldest_child(struct task_struct *p)
3955{
3956        if (list_empty(&p->children)) return NULL;
3957        return list_entry(p->children.next,struct task_struct,sibling);
3958}
3959
3960static inline struct task_struct *older_sibling(struct task_struct *p)
3961{
3962        if (p->sibling.prev==&p->parent->children) return NULL;
3963        return list_entry(p->sibling.prev,struct task_struct,sibling);
3964}
3965
3966static inline struct task_struct *younger_sibling(struct task_struct *p)
3967{
3968        if (p->sibling.next==&p->parent->children) return NULL;
3969        return list_entry(p->sibling.next,struct task_struct,sibling);
3970}
3971
3972static void show_task(task_t * p)
3973{
3974        task_t *relative;
3975        unsigned state;
3976        unsigned long free = 0;
3977        static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
3978
3979        printk("%-13.13s ", p->comm);
3980        state = p->state ? __ffs(p->state) + 1 : 0;
3981        if (state < ARRAY_SIZE(stat_nam))
3982                printk(stat_nam[state]);
3983        else
3984                printk("?");
3985#if (BITS_PER_LONG == 32)
3986        if (state == TASK_RUNNING)
3987                printk(" running ");
3988        else
3989                printk(" %08lX ", thread_saved_pc(p));
3990#else
3991        if (state == TASK_RUNNING)
3992                printk("  running task   ");
3993        else
3994                printk(" %016lx ", thread_saved_pc(p));
3995#endif
3996#ifdef CONFIG_DEBUG_STACK_USAGE
3997        {
3998                unsigned long * n = (unsigned long *) (p->thread_info+1);
3999                while (!*n)
4000                        n++;
4001                free = (unsigned long) n - (unsigned long)(p->thread_info+1);
4002        }
4003#endif
4004        printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
4005        if ((relative = eldest_child(p)))
4006                printk("%5d ", relative->pid);
4007        else
4008                printk("      ");
4009        if ((relative = younger_sibling(p)))
4010                printk("%7d", relative->pid);
4011        else
4012                printk("       ");
4013        if ((relative = older_sibling(p)))
4014                printk(" %5d", relative->pid);
4015        else
4016                printk("      ");
4017        if (!p->mm)
4018                printk(" (L-TLB)\n");
4019        else
4020                printk(" (NOTLB)\n");
4021
4022        if (state != TASK_RUNNING)
4023                show_stack(p, NULL);
4024}
4025
4026void show_state(void)
4027{
4028        task_t *g, *p;
4029
4030#if (BITS_PER_LONG == 32)
4031        printk("\n"
4032               "                                               sibling\n");
4033        printk("  task             PC      pid father child younger older\n");
4034#else
4035        printk("\n"
4036               "                                                       sibling\n");
4037        printk("  task                 PC          pid father child younger older\n");
4038#endif
4039        read_lock(&tasklist_lock);
4040        do_each_thread(g, p) {
4041                /*
4042                 * reset the NMI-timeout, listing all tasks on a slow
4043                 * console might take a lot of time:
4044                 */
4045                touch_nmi_watchdog();
4046                show_task(p);
4047        } while_each_thread(g, p);
4048
4049        read_unlock(&tasklist_lock);
4050}
4051
4052void __devinit init_idle(task_t *idle, int cpu)
4053{
4054        runqueue_t *rq = cpu_rq(cpu);
4055        unsigned long flags;
4056
4057        idle->sleep_avg = 0;
4058        idle->array = NULL;
4059        idle->prio = MAX_PRIO;
4060        idle->state = TASK_RUNNING;
4061        set_task_cpu(idle, cpu);
4062
4063        spin_lock_irqsave(&rq->lock, flags);
4064        rq->curr = rq->idle = idle;
4065        set_tsk_need_resched(idle);
4066        spin_unlock_irqrestore(&rq->lock, flags);
4067
4068        /* Set the preempt count _outside_ the spinlocks! */
4069#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4070        idle->thread_info->preempt_count = (idle->lock_depth >= 0);
4071#else
4072        idle->thread_info->preempt_count = 0;
4073#endif
4074}
4075
4076/*
4077 * In a system that switches off the HZ timer nohz_cpu_mask
4078 * indicates which cpus entered this state. This is used
4079 * in the rcu update to wait only for active cpus. For system
4080 * which do not switch off the HZ timer nohz_cpu_mask should
4081 * always be CPU_MASK_NONE.
4082 */
4083cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4084
4085#ifdef CONFIG_SMP
4086/*
4087 * This is how migration works:
4088 *
4089 * 1) we queue a migration_req_t structure in the source CPU's
4090 *    runqueue and wake up that CPU's migration thread.
4091 * 2) we wait_for_completion() on the request => thread blocks.
4092 * 3) migration thread wakes up (implicitly it forces the migrated
4093 *    thread off the CPU)
4094 * 4) it gets the migration request and checks whether the migrated
4095 *    task is still in the wrong runqueue.
4096 * 5) if it's in the wrong runqueue then the migration thread removes
4097 *    it and puts it into the right queue.
4098 * 6) migration thread complete()s the request.
4099 * 7) we wake up and the migration is done.
4100 */
4101
4102/*
4103 * Change a given task's CPU affinity. Migrate the thread to a
4104 * proper CPU and schedule it away if the CPU it's executing on
4105 * is removed from the allowed bitmask.
4106 *
4107 * NOTE: the caller must have a valid reference to the task, the
4108 * task must not exit() & deallocate itself prematurely.  The
4109 * call is not atomic; no spinlocks may be held.
4110 */
4111int set_cpus_allowed(task_t *p, cpumask_t new_mask)
4112{
4113        unsigned long flags;
4114        int ret = 0;
4115        migration_req_t req;
4116        runqueue_t *rq;
4117
4118        rq = task_rq_lock(p, &flags);
4119        if (!cpus_intersects(new_mask, cpu_online_map)) {
4120                ret = -EINVAL;
4121                goto out;
4122        }
4123
4124        p->cpus_allowed = new_mask;
4125        /* Can the task run on the task's current CPU? If so, we're done */
4126        if (cpu_isset(task_cpu(p), new_mask))
4127                goto out;
4128
4129        if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4130                /* Need help from migration thread: drop lock and wait. */
4131                task_rq_unlock(rq, &flags);
4132                wake_up_process(rq->migration_thread);
4133                wait_for_completion(&req.done);
4134                tlb_migrate_finish(p->mm);
4135                return 0;
4136        }
4137out:
4138        task_rq_unlock(rq, &flags);
4139        return ret;
4140}
4141
4142EXPORT_SYMBOL_GPL(set_cpus_allowed);
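
/*
 * Sketch: pinning a task to a single CPU from kernel code is just a
 * one-bit affinity mask.  example_bind_to_cpu() is hypothetical.
 */
static int example_bind_to_cpu(task_t *p, int cpu)
{
        return set_cpus_allowed(p, cpumask_of_cpu(cpu));
}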
4143
4144/*
4145 * Move (not current) task off this cpu, onto dest cpu.  We're doing
4146 * this because either it can't run here any more (set_cpus_allowed()
4147 * away from this CPU, or CPU going down), or because we're
4148 * attempting to rebalance this task on exec (sched_exec).
4149 *
4150 * So we race with normal scheduler movements, but that's OK, as long
4151 * as the task is no longer on this CPU.
4152 */
4153static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4154{
4155        runqueue_t *rq_dest, *rq_src;
4156
4157        if (unlikely(cpu_is_offline(dest_cpu)))
4158                return;
4159
4160        rq_src = cpu_rq(src_cpu);
4161        rq_dest = cpu_rq(dest_cpu);
4162
4163        double_rq_lock(rq_src, rq_dest);
4164        /* Already moved. */
4165        if (task_cpu(p) != src_cpu)
4166                goto out;
4167        /* Affinity changed (again). */
4168        if (!cpu_isset(dest_cpu, p->cpus_allowed))
4169                goto out;
4170
4171        set_task_cpu(p, dest_cpu);
4172        if (p->array) {
4173                /*
4174                 * Sync timestamp with rq_dest's before activating.
4175                 * The same thing could be achieved by doing this step
4176                 * afterwards, and pretending it was a local activate.
4177                 * This way is cleaner and logically correct.
4178                 */
4179                p->timestamp = p->timestamp - rq_src->timestamp_last_tick
4180                                + rq_dest->timestamp_last_tick;
4181                deactivate_task(p, rq_src);
4182                activate_task(p, rq_dest, 0);
4183                if (TASK_PREEMPTS_CURR(p, rq_dest))
4184                        resched_task(rq_dest->curr);
4185        }
4186
4187out:
4188        double_rq_unlock(rq_src, rq_dest);
4189}
4190
4191/*
4192 * migration_thread - this is a highprio system thread that performs
4193 * thread migration by bumping thread off CPU then 'pushing' onto
4194 * another runqueue.
4195 */
4196static int migration_thread(void * data)
4197{
4198        runqueue_t *rq;
4199        int cpu = (long)data;
4200
4201        rq = cpu_rq(cpu);
4202        BUG_ON(rq->migration_thread != current);
4203
4204        set_current_state(TASK_INTERRUPTIBLE);
4205        while (!kthread_should_stop()) {
4206                struct list_head *head;
4207                migration_req_t *req;
4208
4209                if (current->flags & PF_FREEZE)
4210                        refrigerator(PF_FREEZE);
4211
4212                spin_lock_irq(&rq->lock);
4213
4214                if (cpu_is_offline(cpu)) {
4215                        spin_unlock_irq(&rq->lock);
4216                        goto wait_to_die;
4217                }
4218
4219                if (rq->active_balance) {
4220                        active_load_balance(rq, cpu);
4221                        rq->active_balance = 0;
4222                }
4223
4224                head = &rq->migration_queue;
4225
4226                if (list_empty(head)) {
4227                        spin_unlock_irq(&rq->lock);
4228                        schedule();
4229                        set_current_state(TASK_INTERRUPTIBLE);
4230                        continue;
4231                }
4232                req = list_entry(head->next, migration_req_t, list);
4233                list_del_init(head->next);
4234
4235                if (req->type == REQ_MOVE_TASK) {
4236                        spin_unlock(&rq->lock);
4237                        __migrate_task(req->task, cpu, req->dest_cpu);
4238                        local_irq_enable();
4239                } else if (req->type == REQ_SET_DOMAIN) {
4240                        rq->sd = req->sd;
4241                        spin_unlock_irq(&rq->lock);
4242                } else {
4243                        spin_unlock_irq(&rq->lock);
4244                        WARN_ON(1);
4245                }
4246
4247                complete(&req->done);
4248        }
4249        __set_current_state(TASK_RUNNING);
4250        return 0;
4251
4252wait_to_die:
4253        /* Wait for kthread_stop */
4254        set_current_state(TASK_INTERRUPTIBLE);
4255        while (!kthread_should_stop()) {
4256                schedule();
4257                set_current_state(TASK_INTERRUPTIBLE);
4258        }
4259        __set_current_state(TASK_RUNNING);
4260        return 0;
4261}
4262
4263#ifdef CONFIG_HOTPLUG_CPU
4264/* Figure out where task on dead CPU should go, use force if necessary. */
4265static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4266{
4267        int dest_cpu;
4268        cpumask_t mask;
4269
4270        /* On same node? */
4271        mask = node_to_cpumask(cpu_to_node(dead_cpu));
4272        cpus_and(mask, mask, tsk->cpus_allowed);
4273        dest_cpu = any_online_cpu(mask);
4274
4275        /* On any allowed CPU? */
4276        if (dest_cpu == NR_CPUS)
4277                dest_cpu = any_online_cpu(tsk->cpus_allowed);
4278
4279        /* No more Mr. Nice Guy. */
4280        if (dest_cpu == NR_CPUS) {
4281                cpus_setall(tsk->cpus_allowed);
4282                dest_cpu = any_online_cpu(tsk->cpus_allowed);
4283
4284                /*
4285                 * Don't tell them about moving exiting tasks or
4286                 * kernel threads (both mm NULL), since they never
4287                 * leave kernel.
4288                 */
4289                if (tsk->mm && printk_ratelimit())
4290                        printk(KERN_INFO "process %d (%s) no "
4291                               "longer affine to cpu%d\n",
4292                               tsk->pid, tsk->comm, dead_cpu);
4293        }
4294        __migrate_task(tsk, dead_cpu, dest_cpu);
4295}
4296
4297/*
4298 * While a dead CPU has no uninterruptible tasks queued at this point,
4299 * it might still have a nonzero ->nr_uninterruptible counter, because
4300 * for performance reasons the counter is not strictly tracking tasks to
4301 * their home CPUs. So we just add the counter to another CPU's counter,
4302 * to keep the global sum constant after CPU-down:
4303 */
4304static void migrate_nr_uninterruptible(runqueue_t *rq_src)
4305{
4306        runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
4307        unsigned long flags;
4308
4309        local_irq_save(flags);
4310        double_rq_lock(rq_src, rq_dest);
4311        rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
4312        rq_src->nr_uninterruptible = 0;
4313        double_rq_unlock(rq_src, rq_dest);
4314        local_irq_restore(flags);
4315}
4316
4317/* Run through task list and migrate tasks from the dead cpu. */
4318static void migrate_live_tasks(int src_cpu)
4319{
4320        struct task_struct *tsk, *t;
4321
4322        write_lock_irq(&tasklist_lock);
4323
4324        do_each_thread(t, tsk) {
4325                if (tsk == current)
4326                        continue;
4327
4328                if (task_cpu(tsk) == src_cpu)
4329                        move_task_off_dead_cpu(src_cpu, tsk);
4330        } while_each_thread(t, tsk);
4331
4332        write_unlock_irq(&tasklist_lock);
4333}
4334
4335/* Schedules idle task to be the next runnable task on current CPU.
4336 * It does so by boosting its priority to the highest possible and adding
4337 * it to the _front_ of the runqueue. Used by CPU offline code.
4338 */
4339void sched_idle_next(void)
4340{
4341        int cpu = smp_processor_id();
4342        runqueue_t *rq = this_rq();
4343        struct task_struct *p = rq->idle;
4344        unsigned long flags;
4345
4346        /* cpu has to be offline */
4347        BUG_ON(cpu_online(cpu));
4348
4349        /* Strictly not necessary since the rest of the CPUs are stopped by now
4350         * and interrupts are disabled on the current cpu.
4351         */
4352        spin_lock_irqsave(&rq->lock, flags);
4353
4354        __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4355        /* Add idle task to _front_ of its priority queue */
4356        __activate_idle_task(p, rq);
4357
4358        spin_unlock_irqrestore(&rq->lock, flags);
4359}
4360
4361/* Ensures that the idle task is using init_mm right before its cpu goes
4362 * offline.
4363 */
4364void idle_task_exit(void)
4365{
4366        struct mm_struct *mm = current->active_mm;
4367
4368        BUG_ON(cpu_online(smp_processor_id()));
4369
4370        if (mm != &init_mm)
4371                switch_mm(mm, &init_mm, current);
4372        mmdrop(mm);
4373}
4374
4375static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
4376{
4377        struct runqueue *rq = cpu_rq(dead_cpu);
4378
4379        /* Must be exiting, otherwise would be on tasklist. */
4380        BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD);
4381
4382        /* Cannot have done final schedule yet: would have vanished. */
4383        BUG_ON(tsk->flags & PF_DEAD);
4384
4385        get_task_struct(tsk);
4386
4387        /*
4388         * Drop lock around migration; if someone else moves it,
4389         * that's OK.  No task can be added to this CPU, so iteration is
4390         * fine.
4391         */
4392        spin_unlock_irq(&rq->lock);
4393        move_task_off_dead_cpu(dead_cpu, tsk);
4394        spin_lock_irq(&rq->lock);
4395
4396        put_task_struct(tsk);
4397}
4398
4399/* release_task() removes task from tasklist, so we won't find dead tasks. */
4400static void migrate_dead_tasks(unsigned int dead_cpu)
4401{
4402        unsigned arr, i;
4403        struct runqueue *rq = cpu_rq(dead_cpu);
4404
4405        for (arr = 0; arr < 2; arr++) {
4406                for (i = 0; i < MAX_PRIO; i++) {
4407                        struct list_head *list = &rq->arrays[arr].queue[i];
4408                        while (!list_empty(list))
4409                                migrate_dead(dead_cpu,
4410                                             list_entry(list->next, task_t,
4411                                                        run_list));
4412                }
4413        }
4414}
4415#endif /* CONFIG_HOTPLUG_CPU */
4416
4417/*
4418 * migration_call - callback that gets triggered when a CPU is added.
4419 * Here we can start up the necessary migration thread for the new CPU.
4420 */
4421static int migration_call(struct notifier_block *nfb, unsigned long action,
4422                          void *hcpu)
4423{
4424        int cpu = (long)hcpu;
4425        struct task_struct *p;
4426        struct runqueue *rq;
4427        unsigned long flags;
4428
4429        switch (action) {
4430        case CPU_UP_PREPARE:
4431                p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
4432                if (IS_ERR(p))
4433                        return NOTIFY_BAD;
4434                p->flags |= PF_NOFREEZE;
4435                kthread_bind(p, cpu);
4436                /* Must be high prio: stop_machine expects to yield to it. */
4437                rq = task_rq_lock(p, &flags);
4438                __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4439                task_rq_unlock(rq, &flags);
4440                cpu_rq(cpu)->migration_thread = p;
4441                break;
4442        case CPU_ONLINE:
4443                /* Strictly unnecessary, as first user will wake it. */
4444                wake_up_process(cpu_rq(cpu)->migration_thread);
4445                break;
4446#ifdef CONFIG_HOTPLUG_CPU
4447        case CPU_UP_CANCELED:
4448                /* Unbind it from offline cpu so it can run.  Fall thru. */
4449                kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
4450                kthread_stop(cpu_rq(cpu)->migration_thread);
4451                cpu_rq(cpu)->migration_thread = NULL;
4452                break;
4453        case CPU_DEAD:
4454                migrate_live_tasks(cpu);
4455                rq = cpu_rq(cpu);
4456                kthread_stop(rq->migration_thread);
4457                rq->migration_thread = NULL;
4458                /* Idle task back to normal (off runqueue, low prio) */
4459                rq = task_rq_lock(rq->idle, &flags);
4460                deactivate_task(rq->idle, rq);
4461                rq->idle->static_prio = MAX_PRIO;
4462                __setscheduler(rq->idle, SCHED_NORMAL, 0);
4463                migrate_dead_tasks(cpu);
4464                task_rq_unlock(rq, &flags);
4465                migrate_nr_uninterruptible(rq);
4466                BUG_ON(rq->nr_running != 0);
4467
4468                /* No need to migrate the tasks: it was best-effort if
4469                 * they didn't do lock_cpu_hotplug().  Just wake up
4470                 * the requestors. */
4471                spin_lock_irq(&rq->lock);
4472                while (!list_empty(&rq->migration_queue)) {
4473                        migration_req_t *req;
4474                        req = list_entry(rq->migration_queue.next,
4475                                         migration_req_t, list);
4476                        BUG_ON(req->type != REQ_MOVE_TASK);
4477                        list_del_init(&req->list);
4478                        complete(&req->done);
4479                }
4480                spin_unlock_irq(&rq->lock);
4481                break;
4482#endif
4483        }
4484        return NOTIFY_OK;
4485}
4486
4487/* Register at highest priority so that task migration (migrate_live_tasks)
4488 * happens before everything else.
4489 */
4490static struct notifier_block __devinitdata migration_notifier = {
4491        .notifier_call = migration_call,
4492        .priority = 10
4493};
4494
4495int __init migration_init(void)
4496{
4497        void *cpu = (void *)(long)smp_processor_id();
4498        /* Start one for boot CPU. */
4499        migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
4500        migration_call(&migration_notifier, CPU_ONLINE, cpu);
4501        register_cpu_notifier(&migration_notifier);
4502        return 0;
4503}
4504#endif
4505
4506#ifdef CONFIG_SMP
4507#define SCHED_DOMAIN_DEBUG
4508#ifdef SCHED_DOMAIN_DEBUG
4509static void sched_domain_debug(struct sched_domain *sd, int cpu)
4510{
4511        int level = 0;
4512
4513        printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
4514
4515        do {
4516                int i;
4517                char str[NR_CPUS];
4518                struct sched_group *group = sd->groups;
4519                cpumask_t groupmask;
4520
4521                cpumask_scnprintf(str, NR_CPUS, sd->span);
4522                cpus_clear(groupmask);
4523
4524                printk(KERN_DEBUG);
4525                for (i = 0; i < level + 1; i++)
4526                        printk(" ");
4527                printk("domain %d: ", level);
4528
4529                if (!(sd->flags & SD_LOAD_BALANCE)) {
4530                        printk("does not load-balance\n");
4531                        if (sd->parent)
4532                                printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
4533                        break;
4534                }
4535
4536                printk("span %s\n", str);
4537
4538                if (!cpu_isset(cpu, sd->span))
4539                        printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
4540                if (!cpu_isset(cpu, group->cpumask))
4541                        printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
4542
4543                printk(KERN_DEBUG);
4544                for (i = 0; i < level + 2; i++)
4545                        printk(" ");
4546                printk("groups:");
4547                do {
4548                        if (!group) {
4549                                printk("\n");
4550                                printk(KERN_ERR "ERROR: group is NULL\n");
4551                                break;
4552                        }
4553
4554                        if (!group->cpu_power) {
4555                                printk("\n");
4556                                printk(KERN_ERR "ERROR: domain->cpu_power not set\n");
4557                        }
4558
4559                        if (!cpus_weight(group->cpumask)) {
4560                                printk("\n");
4561                                printk(KERN_ERR "ERROR: empty group\n");
4562                        }
4563
4564                        if (cpus_intersects(groupmask, group->cpumask)) {
4565                                printk("\n");
4566                                printk(KERN_ERR "ERROR: repeated CPUs\n");
4567                        }
4568
4569                        cpus_or(groupmask, groupmask, group->cpumask);
4570
4571                        cpumask_scnprintf(str, NR_CPUS, group->cpumask);
4572                        printk(" %s", str);
4573
4574                        group = group->next;
4575                } while (group != sd->groups);
4576                printk("\n");
4577
4578                if (!cpus_equal(sd->span, groupmask))
4579                        printk(KERN_ERR "ERROR: groups don't span domain->span\n");
4580
4581                level++;
4582                sd = sd->parent;
4583
4584                if (sd) {
4585                        if (!cpus_subset(groupmask, sd->span))
4586                                printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
4587                }
4588
4589        } while (sd);
4590}
4591#else
4592#define sched_domain_debug(sd, cpu) {}
4593#endif
4594
4595/*
4596 * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
4597 * hold the hotplug lock.
4598 */
4599void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
4600{
4601        migration_req_t req;
4602        unsigned long flags;
4603        runqueue_t *rq = cpu_rq(cpu);
4604        int local = 1;
4605
4606        sched_domain_debug(sd, cpu);
4607
4608        spin_lock_irqsave(&rq->lock, flags);
4609
4610        if (cpu == smp_processor_id() || !cpu_online(cpu)) {
4611                rq->sd = sd;
4612        } else {
4613                init_completion(&req.done);
4614                req.type = REQ_SET_DOMAIN;
4615                req.sd = sd;
4616                list_add(&req.list, &rq->migration_queue);
4617                local = 0;
4618        }
4619
4620        spin_unlock_irqrestore(&rq->lock, flags);
4621
4622        if (!local) {
4623                wake_up_process(rq->migration_thread);
4624                wait_for_completion(&req.done);
4625        }
4626}
4627
4628/* cpus with isolated domains */
4629cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
4630
4631/* Setup the mask of cpus configured for isolated domains */
4632static int __init isolated_cpu_setup(char *str)
4633{
4634        int ints[NR_CPUS], i;
4635
4636        str = get_options(str, ARRAY_SIZE(ints), ints);
4637        cpus_clear(cpu_isolated_map);
4638        for (i = 1; i <= ints[0]; i++)
4639                if (ints[i] < NR_CPUS)
4640                        cpu_set(ints[i], cpu_isolated_map);
4641        return 1;
4642}
4643
4644__setup ("isolcpus=", isolated_cpu_setup);
4645
4646/*
4647 * init_sched_build_groups takes an array of groups, the cpumask we wish
4648 * to span, and a pointer to a function which identifies what group a CPU
4649 * belongs to. The return value of group_fn must be a valid index into the
4650 * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we
4651 * keep track of groups covered with a cpumask_t).
4652 *
4653 * init_sched_build_groups will build a circular linked list of the groups
4654 * covered by the given span, and will set each group's ->cpumask correctly,
4655 * and ->cpu_power to 0.
4656 */
4657void __devinit init_sched_build_groups(struct sched_group groups[],
4658                        cpumask_t span, int (*group_fn)(int cpu))
4659{
4660        struct sched_group *first = NULL, *last = NULL;
4661        cpumask_t covered = CPU_MASK_NONE;
4662        int i;
4663
4664        for_each_cpu_mask(i, span) {
4665                int group = group_fn(i);
4666                struct sched_group *sg = &groups[group];
4667                int j;
4668
4669                if (cpu_isset(i, covered))
4670                        continue;
4671
4672                sg->cpumask = CPU_MASK_NONE;
4673                sg->cpu_power = 0;
4674
4675                for_each_cpu_mask(j, span) {
4676                        if (group_fn(j) != group)
4677                                continue;
4678
4679                        cpu_set(j, covered);
4680                        cpu_set(j, sg->cpumask);
4681                }
4682                if (!first)
4683                        first = sg;
4684                if (last)
4685                        last->next = sg;
4686                last = sg;
4687        }
4688        last->next = first;
4689}
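
/*
 * Sketch: a degenerate use of the helper above - every online CPU in its
 * own group.  'example_groups' and example_group_fn() are hypothetical;
 * the real group functions are the cpu_to_*_group() helpers below.
 */
static struct sched_group example_groups[NR_CPUS];

static int example_group_fn(int cpu)
{
        return cpu;
}

static void example_build_groups(void)
{
        init_sched_build_groups(example_groups, cpu_online_map,
                                        example_group_fn);
}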
4690
4691
4692#ifdef ARCH_HAS_SCHED_DOMAIN
4693extern void __devinit arch_init_sched_domains(void);
4694extern void __devinit arch_destroy_sched_domains(void);
4695#else
4696#ifdef CONFIG_SCHED_SMT
4697static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4698static struct sched_group sched_group_cpus[NR_CPUS];
4699static int __devinit cpu_to_cpu_group(int cpu)
4700{
4701        return cpu;
4702}
4703#endif
4704
4705static DEFINE_PER_CPU(struct sched_domain, phys_domains);
4706static struct sched_group sched_group_phys[NR_CPUS];
4707static int __devinit cpu_to_phys_group(int cpu)
4708{
4709#ifdef CONFIG_SCHED_SMT
4710        return first_cpu(cpu_sibling_map[cpu]);
4711#else
4712        return cpu;
4713#endif
4714}
4715
4716#ifdef CONFIG_NUMA
4717
4718static DEFINE_PER_CPU(struct sched_domain, node_domains);
4719static struct sched_group sched_group_nodes[MAX_NUMNODES];
4720static int __devinit cpu_to_node_group(int cpu)
4721{
4722        return cpu_to_node(cpu);
4723}
4724#endif
4725
4726#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4727/*
4728 * The domains setup code relies on siblings not spanning
4729 * multiple nodes. Make sure the architecture has a proper
4730 * siblings map:
4731 */
4732static void check_sibling_maps(void)
4733{
4734        int i, j;
4735
4736        for_each_online_cpu(i) {
4737                for_each_cpu_mask(j, cpu_sibling_map[i]) {
4738                        if (cpu_to_node(i) != cpu_to_node(j)) {
4739                                printk(KERN_INFO "warning: CPU %d's siblings "
4740                                        "map to a different node - "
4741                                        "isolating them.\n", i);
4742                                cpu_sibling_map[i] = cpumask_of_cpu(i);
4743                                break;
4744                        }
4745                }
4746        }
4747}
4748#endif
4749
4750/*
4751 * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
4752 */
4753static void __devinit arch_init_sched_domains(void)
4754{
4755        int i;
4756        cpumask_t cpu_default_map;
4757
4758#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4759        check_sibling_maps();
4760#endif
4761        /*
4762         * Set up the mask of cpus without special case scheduling requirements.
4763         * For now this just excludes isolated cpus, but could be used to
4764         * exclude other special cases in the future.
4765         */
4766        cpus_complement(cpu_default_map, cpu_isolated_map);
4767        cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
4768
4769        /*
4770         * Set up domains. Isolated domains just stay on the dummy domain.
4771         */
4772        for_each_cpu_mask(i, cpu_default_map) {
4773                int group;
4774                struct sched_domain *sd = NULL, *p;
4775                cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
4776
4777                cpus_and(nodemask, nodemask, cpu_default_map);
4778
4779#ifdef CONFIG_NUMA
4780                sd = &per_cpu(node_domains, i);
4781                group = cpu_to_node_group(i);
4782                *sd = SD_NODE_INIT;
4783                sd->span = cpu_default_map;
4784                sd->groups = &sched_group_nodes[group];
4785#endif
4786
4787                p = sd;
4788                sd = &per_cpu(phys_domains, i);
4789                group = cpu_to_phys_group(i);
4790                *sd = SD_CPU_INIT;
4791                sd->span = nodemask;
4792                sd->parent = p;
4793                sd->groups = &sched_group_phys[group];
4794
4795#ifdef CONFIG_SCHED_SMT
4796                p = sd;
4797                sd = &per_cpu(cpu_domains, i);
4798                group = cpu_to_cpu_group(i);
4799                *sd = SD_SIBLING_INIT;
4800                sd->span = cpu_sibling_map[i];
4801                cpus_and(sd->span, sd->span, cpu_default_map);
4802                sd->parent = p;
4803                sd->groups = &sched_group_cpus[group];
4804#endif
4805        }
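
            /*
             * For a hypothetical configuration with both
             * CONFIG_SCHED_SMT and CONFIG_NUMA enabled, the loop above
             * leaves each CPU i with this child-to-parent hierarchy:
             *
             *   cpu_domains[i]      SD_SIBLING_INIT, span = siblings of i
             *     -> phys_domains[i]    SD_CPU_INIT, span = i's node
             *       -> node_domains[i]  SD_NODE_INIT, span = cpu_default_map
             *
             * Each level's ->groups points at the group CPU i belongs
             * to at that level; the groups themselves are built next.
             */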
4806
4807#ifdef CONFIG_SCHED_SMT
4808        /* Set up CPU (sibling) groups */
4809        for_each_online_cpu(i) {
4810                cpumask_t this_sibling_map = cpu_sibling_map[i];
4811                cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
4812                if (i != first_cpu(this_sibling_map))
4813                        continue;
4814
4815                init_sched_build_groups(sched_group_cpus, this_sibling_map,
4816                                                &cpu_to_cpu_group);
4817        }
4818#endif
4819
4820        /* Set up physical groups */
4821        for (i = 0; i < MAX_NUMNODES; i++) {
4822                cpumask_t nodemask = node_to_cpumask(i);
4823
4824                cpus_and(nodemask, nodemask, cpu_default_map);
4825                if (cpus_empty(nodemask))
4826                        continue;
4827
4828                init_sched_build_groups(sched_group_phys, nodemask,
4829                                                &cpu_to_phys_group);
4830        }
4831
4832#ifdef CONFIG_NUMA
4833        /* Set up node groups */
4834        init_sched_build_groups(sched_group_nodes, cpu_default_map,
4835                                        &cpu_to_node_group);
4836#endif
4837
4838        /* Calculate CPU power for physical packages and nodes */
4839        for_each_cpu_mask(i, cpu_default_map) {
4840                int power;
4841                struct sched_domain *sd;
4842#ifdef CONFIG_SCHED_SMT
4843                sd = &per_cpu(cpu_domains, i);
4844                power = SCHED_LOAD_SCALE;
4845                sd->groups->cpu_power = power;
4846#endif
4847
4848                sd = &per_cpu(phys_domains, i);
4849                power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
4850                                (cpus_weight(sd->groups->cpumask)-1) / 10;
4851                sd->groups->cpu_power = power;
4852
4853#ifdef CONFIG_NUMA
4854                if (i == first_cpu(sd->groups->cpumask)) {
4855                        /* Only add "power" once for each physical package. */
4856                        sd = &per_cpu(node_domains, i);
4857                        sd->groups->cpu_power += power;
4858                }
4859#endif
4860        }
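
            /*
             * Worked example, assuming SCHED_LOAD_SCALE is 128: a
             * physical group containing two SMT siblings has
             * cpus_weight() == 2, so its cpu_power becomes
             * 128 + 128 * (2 - 1) / 10 = 140, i.e. the second sibling
             * counts for only ~10% of a CPU, while each sibling-level
             * group keeps exactly SCHED_LOAD_SCALE.  Under NUMA, each
             * node group accumulates one such "power" per physical
             * package it contains.
             */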
4861
4862        /* Attach the domains */
4863        for_each_online_cpu(i) {
4864                struct sched_domain *sd;
4865#ifdef CONFIG_SCHED_SMT
4866                sd = &per_cpu(cpu_domains, i);
4867#else
4868                sd = &per_cpu(phys_domains, i);
4869#endif
4870                cpu_attach_domain(sd, i);
4871        }
4872}
4873
4874#ifdef CONFIG_HOTPLUG_CPU
4875static void __devinit arch_destroy_sched_domains(void)
4876{
4877        /* Do nothing: everything is statically allocated. */
4878}
4879#endif
4880
4881#endif /* ARCH_HAS_SCHED_DOMAIN */
4882
4883/*
4884 * Initial dummy domain for early boot and for CPU hotplug. Being static,
4885 * it is initialized to zero, so all balancing flags are cleared, which is
4886 * what we want.
4887 */
4888static struct sched_domain sched_domain_dummy;
4889
4890#ifdef CONFIG_HOTPLUG_CPU
4891/*
4892 * Force a reinitialization of the sched domains hierarchy.  The domains
4893 * and groups cannot be updated in place without racing with the balancing
4894 * code, so we temporarily attach all running cpus to a "dummy" domain
4895 * which will prevent rebalancing while the sched domains are recalculated.
4896 */
4897static int update_sched_domains(struct notifier_block *nfb,
4898                                unsigned long action, void *hcpu)
4899{
4900        int i;
4901
4902        switch (action) {
4903        case CPU_UP_PREPARE:
4904        case CPU_DOWN_PREPARE:
4905                for_each_online_cpu(i)
4906                        cpu_attach_domain(&sched_domain_dummy, i);
4907                arch_destroy_sched_domains();
4908                return NOTIFY_OK;
4909
4910        case CPU_UP_CANCELED:
4911        case CPU_DOWN_FAILED:
4912        case CPU_ONLINE:
4913        case CPU_DEAD:
4914                /*
4915                 * Fall through and re-initialize the domains.
4916                 */
4917                break;
4918        default:
4919                return NOTIFY_DONE;
4920        }
4921
4922        /* The hotplug lock is already held by cpu_up/cpu_down */
4923        arch_init_sched_domains();
4924
4925        return NOTIFY_OK;
4926}
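
    /*
     * For example, taking a CPU down reaches this notifier twice:
     * first as CPU_DOWN_PREPARE, which parks every online CPU on
     * sched_domain_dummy and calls arch_destroy_sched_domains(), and
     * then as CPU_DEAD (or CPU_DOWN_FAILED), which falls through to
     * arch_init_sched_domains() and rebuilds the hierarchy over the
     * remaining online CPUs.
     */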
4927#endif
4928
4929void __init sched_init_smp(void)
4930{
4931        lock_cpu_hotplug();
4932        arch_init_sched_domains();
4933        unlock_cpu_hotplug();
4934        /* XXX: Theoretical race here - CPU may be hotplugged now */
4935        hotcpu_notifier(update_sched_domains, 0);
4936}
4937#else
4938void __init sched_init_smp(void)
4939{
4940}
4941#endif /* CONFIG_SMP */
4942
4943int in_sched_functions(unsigned long addr)
4944{
4945        /* Linker adds these: start and end of __sched functions */
4946        extern char __sched_text_start[], __sched_text_end[];
4947        return in_lock_functions(addr) ||
4948                (addr >= (unsigned long)__sched_text_start
4949                && addr < (unsigned long)__sched_text_end);
4950}
4951
4952void __init sched_init(void)
4953{
4954        runqueue_t *rq;
4955        int i, j, k;
4956
4957        for (i = 0; i < NR_CPUS; i++) {
4958                prio_array_t *array;
4959
4960                rq = cpu_rq(i);
4961                spin_lock_init(&rq->lock);
4962                rq->active = rq->arrays;
4963                rq->expired = rq->arrays + 1;
4964                rq->best_expired_prio = MAX_PRIO;
4965
4966#ifdef CONFIG_SMP
4967                rq->sd = &sched_domain_dummy;
4968                rq->cpu_load = 0;
4969                rq->active_balance = 0;
4970                rq->push_cpu = 0;
4971                rq->migration_thread = NULL;
4972                INIT_LIST_HEAD(&rq->migration_queue);
4973#endif
4974                atomic_set(&rq->nr_iowait, 0);
4975
4976                for (j = 0; j < 2; j++) {
4977                        array = rq->arrays + j;
4978                        for (k = 0; k < MAX_PRIO; k++) {
4979                                INIT_LIST_HEAD(array->queue + k);
4980                                __clear_bit(k, array->bitmap);
4981                        }
4982                        /* delimiter for bitsearch: always-set sentinel bit */
4983                        __set_bit(MAX_PRIO, array->bitmap);
4984                }
4985        }
4986
4987        /*
4988         * The boot idle thread does lazy MMU switching as well:
4989         */
4990        atomic_inc(&init_mm.mm_count);
4991        enter_lazy_tlb(&init_mm, current);
4992
4993        /*
4994         * Make us the idle thread. Technically, schedule() should not be
4995         * called from this thread; however, somewhere below it might be.
4996         * Because we are the idle thread, we just pick up running again
4997         * when this runqueue becomes "idle".
4998         */
4999        init_idle(current, smp_processor_id());
5000}
5001
5002#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
5003void __might_sleep(char *file, int line)
5004{
5005#if defined(in_atomic)
5006        static unsigned long prev_jiffy;        /* ratelimiting */
5007
5008        if ((in_atomic() || irqs_disabled()) &&
5009            system_state == SYSTEM_RUNNING && !oops_in_progress) {
5010                if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
5011                        return;
5012                prev_jiffy = jiffies;
5013                printk(KERN_ERR "Debug: sleeping function called from invalid"
5014                                " context at %s:%d\n", file, line);
5015                printk("in_atomic():%d, irqs_disabled():%d\n",
5016                        in_atomic(), irqs_disabled());
5017                dump_stack();
5018        }
5019#endif
5020}
5021EXPORT_SYMBOL(__might_sleep);
5022#endif
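
    /*
     * A minimal usage sketch (hypothetical helper, not from this
     * file): functions that may block call might_sleep() on entry,
     * which expands to __might_sleep(__FILE__, __LINE__) when
     * CONFIG_DEBUG_SPINLOCK_SLEEP is set:
     *
     *      void *my_buffer_alloc(size_t size)
     *      {
     *              might_sleep();
     *              return kmalloc(size, GFP_KERNEL);
     *      }
     *
     * Calling such a function under a spinlock or with interrupts
     * disabled then triggers the ratelimited warning above.
     */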
5023
5024#ifdef CONFIG_MAGIC_SYSRQ
5025void normalize_rt_tasks(void)
5026{
5027        struct task_struct *p;
5028        prio_array_t *array;
5029        unsigned long flags;
5030        runqueue_t *rq;
5031
5032        read_lock_irq(&tasklist_lock);
5033        for_each_process (p) {
5034                if (!rt_task(p))
5035                        continue;
5036
5037                rq = task_rq_lock(p, &flags);
5038
5039                array = p->array;
5040                if (array)
5041                        deactivate_task(p, task_rq(p));
5042                __setscheduler(p, SCHED_NORMAL, 0);
5043                if (array) {
5044                        __activate_task(p, task_rq(p));
5045                        resched_task(rq->curr);
5046                }
5047
5048                task_rq_unlock(rq, &flags);
5049        }
5050        read_unlock_irq(&tasklist_lock);
5051}
5052
5053#endif /* CONFIG_MAGIC_SYSRQ */
5054