linux/kernel/sched.c
   1/*
   2 *  kernel/sched.c
   3 *
   4 *  Kernel scheduler and related syscalls
   5 *
   6 *  Copyright (C) 1991-2002  Linus Torvalds
   7 *
   8 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   9 *              make semaphores SMP safe
  10 *  1998-11-19  Implemented schedule_timeout() and related stuff
  11 *              by Andrea Arcangeli
  12 *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
  13 *              hybrid priority-list and round-robin design with
  14 *              an array-switch method of distributing timeslices
  15 *              and per-CPU runqueues.  Cleanups and useful suggestions
  16 *              by Davide Libenzi, preemptible kernel bits by Robert Love.
  17 *  2003-09-03  Interactivity tuning by Con Kolivas.
  18 *  2004-04-02  Scheduler domains code by Nick Piggin
  19 *  2007-04-15  Work begun on replacing all interactivity tuning with a
  20 *              fair scheduling design by Con Kolivas.
  21 *  2007-05-05  Load balancing (smp-nice) and other improvements
  22 *              by Peter Williams
  23 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
  24 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
  25 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
  26 *              Thomas Gleixner, Mike Kravetz
  27 */
  28
  29#include <linux/mm.h>
  30#include <linux/module.h>
  31#include <linux/nmi.h>
  32#include <linux/init.h>
  33#include <linux/uaccess.h>
  34#include <linux/highmem.h>
  35#include <linux/smp_lock.h>
  36#include <asm/mmu_context.h>
  37#include <linux/interrupt.h>
  38#include <linux/capability.h>
  39#include <linux/completion.h>
  40#include <linux/kernel_stat.h>
  41#include <linux/debug_locks.h>
  42#include <linux/security.h>
  43#include <linux/notifier.h>
  44#include <linux/profile.h>
  45#include <linux/freezer.h>
  46#include <linux/vmalloc.h>
  47#include <linux/blkdev.h>
  48#include <linux/delay.h>
  49#include <linux/pid_namespace.h>
  50#include <linux/smp.h>
  51#include <linux/threads.h>
  52#include <linux/timer.h>
  53#include <linux/rcupdate.h>
  54#include <linux/cpu.h>
  55#include <linux/cpuset.h>
  56#include <linux/percpu.h>
  57#include <linux/kthread.h>
  58#include <linux/proc_fs.h>
  59#include <linux/seq_file.h>
  60#include <linux/sysctl.h>
  61#include <linux/syscalls.h>
  62#include <linux/times.h>
  63#include <linux/tsacct_kern.h>
  64#include <linux/kprobes.h>
  65#include <linux/delayacct.h>
  66#include <linux/reciprocal_div.h>
  67#include <linux/unistd.h>
  68#include <linux/pagemap.h>
  69#include <linux/hrtimer.h>
  70#include <linux/tick.h>
  71#include <linux/bootmem.h>
  72#include <linux/debugfs.h>
  73#include <linux/ctype.h>
  74#include <linux/ftrace.h>
  75#include <trace/sched.h>
  76
  77#include <asm/tlb.h>
  78#include <asm/irq_regs.h>
  79
  80#include "sched_cpupri.h"
  81
  82/*
  83 * Convert user-nice values [ -20 ... 0 ... 19 ]
  84 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
  85 * and back.
  86 */
  87#define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
  88#define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
  89#define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
  90
  91/*
  92 * 'User priority' is the nice value converted to something we
   93 * can work with better when scaling various scheduler parameters;
   94 * it is a [ 0 ... 39 ] range.
  95 */
  96#define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
  97#define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
  98#define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
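/*
 * Illustrative examples, assuming the usual MAX_RT_PRIO == 100 and
 * MAX_PRIO == 140:
 *
 *	NICE_TO_PRIO(-20) == 100,  NICE_TO_PRIO(0) == 120,  NICE_TO_PRIO(19) == 139
 *	PRIO_TO_NICE(139) == 19
 *	USER_PRIO(120)    == 20,   MAX_USER_PRIO == 40
 */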
  99
 100/*
 101 * Helpers for converting nanosecond timing to jiffy resolution
 102 */
 103#define NS_TO_JIFFIES(TIME)     ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
 104
 105#define NICE_0_LOAD             SCHED_LOAD_SCALE
 106#define NICE_0_SHIFT            SCHED_LOAD_SHIFT
 107
 108/*
 109 * These are the 'tuning knobs' of the scheduler:
 110 *
 111 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 112 * Timeslices get refilled after they expire.
 113 */
 114#define DEF_TIMESLICE           (100 * HZ / 1000)
 115
 116/*
  117 * single value that denotes runtime == period, i.e. unlimited time.
 118 */
 119#define RUNTIME_INF     ((u64)~0ULL)
 120
 121DEFINE_TRACE(sched_wait_task);
 122DEFINE_TRACE(sched_wakeup);
 123DEFINE_TRACE(sched_wakeup_new);
 124DEFINE_TRACE(sched_switch);
 125DEFINE_TRACE(sched_migrate_task);
 126
 127#ifdef CONFIG_SMP
 128
 129static void double_rq_lock(struct rq *rq1, struct rq *rq2);
 130
 131/*
 132 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
 133 * Since cpu_power is a 'constant', we can use a reciprocal divide.
 134 */
 135static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
 136{
 137        return reciprocal_divide(load, sg->reciprocal_cpu_power);
 138}
 139
 140/*
 141 * Each time a sched group cpu_power is changed,
 142 * we must compute its reciprocal value
 143 */
 144static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
 145{
 146        sg->__cpu_power += val;
 147        sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
 148}
 149#endif
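/*
 * A minimal sketch of the reciprocal-divide idea used above (see
 * lib/reciprocal_div.c for the real reciprocal_value()/reciprocal_divide()
 * helpers); the names and steps below are illustrative only:
 *
 *	r = 0xffffffff / d + 1;			(computed once, when d changes)
 *	q = (u32)(((u64)a * r) >> 32);		(~ a/d, with no divide instruction)
 *
 * Worked example: d = 1024 gives r = 4194304 (== 2^32/1024), and a = 4096
 * then yields (4096ULL * 4194304) >> 32 == 4 == 4096/1024.  Paying one
 * divide in sg_inc_cpu_power() makes every later load/__cpu_power division
 * on the balancing hot path a multiply and a shift.
 */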
 150
 151static inline int rt_policy(int policy)
 152{
 153        if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
 154                return 1;
 155        return 0;
 156}
 157
 158static inline int task_has_rt_policy(struct task_struct *p)
 159{
 160        return rt_policy(p->policy);
 161}
 162
 163/*
 164 * This is the priority-queue data structure of the RT scheduling class:
 165 */
 166struct rt_prio_array {
 167        DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
 168        struct list_head queue[MAX_RT_PRIO];
 169};
 170
 171struct rt_bandwidth {
 172        /* nests inside the rq lock: */
 173        spinlock_t              rt_runtime_lock;
 174        ktime_t                 rt_period;
 175        u64                     rt_runtime;
 176        struct hrtimer          rt_period_timer;
 177};
 178
 179static struct rt_bandwidth def_rt_bandwidth;
 180
 181static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 182
 183static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
 184{
 185        struct rt_bandwidth *rt_b =
 186                container_of(timer, struct rt_bandwidth, rt_period_timer);
 187        ktime_t now;
 188        int overrun;
 189        int idle = 0;
 190
 191        for (;;) {
 192                now = hrtimer_cb_get_time(timer);
 193                overrun = hrtimer_forward(timer, now, rt_b->rt_period);
 194
 195                if (!overrun)
 196                        break;
 197
 198                idle = do_sched_rt_period_timer(rt_b, overrun);
 199        }
 200
 201        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
 202}
 203
 204static
 205void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 206{
 207        rt_b->rt_period = ns_to_ktime(period);
 208        rt_b->rt_runtime = runtime;
 209
 210        spin_lock_init(&rt_b->rt_runtime_lock);
 211
 212        hrtimer_init(&rt_b->rt_period_timer,
 213                        CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 214        rt_b->rt_period_timer.function = sched_rt_period_timer;
 215}
 216
 217static inline int rt_bandwidth_enabled(void)
 218{
 219        return sysctl_sched_rt_runtime >= 0;
 220}
 221
 222static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 223{
 224        ktime_t now;
 225
 226        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 227                return;
 228
 229        if (hrtimer_active(&rt_b->rt_period_timer))
 230                return;
 231
 232        spin_lock(&rt_b->rt_runtime_lock);
 233        for (;;) {
 234                unsigned long delta;
 235                ktime_t soft, hard;
 236
 237                if (hrtimer_active(&rt_b->rt_period_timer))
 238                        break;
 239
 240                now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
 241                hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
 242
 243                soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
 244                hard = hrtimer_get_expires(&rt_b->rt_period_timer);
 245                delta = ktime_to_ns(ktime_sub(hard, soft));
 246                __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
 247                                HRTIMER_MODE_ABS, 0);
 248        }
 249        spin_unlock(&rt_b->rt_runtime_lock);
 250}
 251
 252#ifdef CONFIG_RT_GROUP_SCHED
 253static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
 254{
 255        hrtimer_cancel(&rt_b->rt_period_timer);
 256}
 257#endif
 258
 259/*
 260 * sched_domains_mutex serializes calls to arch_init_sched_domains,
 261 * detach_destroy_domains and partition_sched_domains.
 262 */
 263static DEFINE_MUTEX(sched_domains_mutex);
 264
 265#ifdef CONFIG_GROUP_SCHED
 266
 267#include <linux/cgroup.h>
 268
 269struct cfs_rq;
 270
 271static LIST_HEAD(task_groups);
 272
 273/* task group related information */
 274struct task_group {
 275#ifdef CONFIG_CGROUP_SCHED
 276        struct cgroup_subsys_state css;
 277#endif
 278
 279#ifdef CONFIG_USER_SCHED
 280        uid_t uid;
 281#endif
 282
 283#ifdef CONFIG_FAIR_GROUP_SCHED
 284        /* schedulable entities of this group on each cpu */
 285        struct sched_entity **se;
 286        /* runqueue "owned" by this group on each cpu */
 287        struct cfs_rq **cfs_rq;
 288        unsigned long shares;
 289#endif
 290
 291#ifdef CONFIG_RT_GROUP_SCHED
 292        struct sched_rt_entity **rt_se;
 293        struct rt_rq **rt_rq;
 294
 295        struct rt_bandwidth rt_bandwidth;
 296#endif
 297
 298        struct rcu_head rcu;
 299        struct list_head list;
 300
 301        struct task_group *parent;
 302        struct list_head siblings;
 303        struct list_head children;
 304};
 305
 306#ifdef CONFIG_USER_SCHED
 307
 308/* Helper function to pass uid information to create_sched_user() */
 309void set_tg_uid(struct user_struct *user)
 310{
 311        user->tg->uid = user->uid;
 312}
 313
 314/*
 315 * Root task group.
 316 *      Every UID task group (including init_task_group aka UID-0) will
 317 *      be a child to this group.
 318 */
 319struct task_group root_task_group;
 320
 321#ifdef CONFIG_FAIR_GROUP_SCHED
 322/* Default task group's sched entity on each cpu */
 323static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
 324/* Default task group's cfs_rq on each cpu */
 325static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 326#endif /* CONFIG_FAIR_GROUP_SCHED */
 327
 328#ifdef CONFIG_RT_GROUP_SCHED
 329static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 330static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
 331#endif /* CONFIG_RT_GROUP_SCHED */
 332#else /* !CONFIG_USER_SCHED */
 333#define root_task_group init_task_group
 334#endif /* CONFIG_USER_SCHED */
 335
 336/* task_group_lock serializes add/remove of task groups and also changes to
 337 * a task group's cpu shares.
 338 */
 339static DEFINE_SPINLOCK(task_group_lock);
 340
 341#ifdef CONFIG_SMP
 342static int root_task_group_empty(void)
 343{
 344        return list_empty(&root_task_group.children);
 345}
 346#endif
 347
 348#ifdef CONFIG_FAIR_GROUP_SCHED
 349#ifdef CONFIG_USER_SCHED
 350# define INIT_TASK_GROUP_LOAD   (2*NICE_0_LOAD)
 351#else /* !CONFIG_USER_SCHED */
 352# define INIT_TASK_GROUP_LOAD   NICE_0_LOAD
 353#endif /* CONFIG_USER_SCHED */
 354
 355/*
  356 * A weight of 0 or 1 can cause arithmetic problems.
  357 * The weight of a cfs_rq is the sum of the weights of the entities
  358 * queued on it, so the weight of an entity should not be too large,
  359 * and neither should the shares value of a task group.
 360 * (The default weight is 1024 - so there's no practical
 361 *  limitation from this.)
 362 */
 363#define MIN_SHARES      2
 364#define MAX_SHARES      (1UL << 18)
 365
 366static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 367#endif
 368
 369/* Default task group.
  370 *      Every task in the system belongs to this group at bootup.
 371 */
 372struct task_group init_task_group;
 373
 374/* return group to which a task belongs */
 375static inline struct task_group *task_group(struct task_struct *p)
 376{
 377        struct task_group *tg;
 378
 379#ifdef CONFIG_USER_SCHED
 380        rcu_read_lock();
 381        tg = __task_cred(p)->user->tg;
 382        rcu_read_unlock();
 383#elif defined(CONFIG_CGROUP_SCHED)
 384        tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
 385                                struct task_group, css);
 386#else
 387        tg = &init_task_group;
 388#endif
 389        return tg;
 390}
 391
 392/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
 393static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 394{
 395#ifdef CONFIG_FAIR_GROUP_SCHED
 396        p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
 397        p->se.parent = task_group(p)->se[cpu];
 398#endif
 399
 400#ifdef CONFIG_RT_GROUP_SCHED
 401        p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
 402        p->rt.parent = task_group(p)->rt_se[cpu];
 403#endif
 404}
 405
 406#else
 407
 408#ifdef CONFIG_SMP
 409static int root_task_group_empty(void)
 410{
 411        return 1;
 412}
 413#endif
 414
 415static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
 416static inline struct task_group *task_group(struct task_struct *p)
 417{
 418        return NULL;
 419}
 420
 421#endif  /* CONFIG_GROUP_SCHED */
 422
 423/* CFS-related fields in a runqueue */
 424struct cfs_rq {
 425        struct load_weight load;
 426        unsigned long nr_running;
 427
 428        u64 exec_clock;
 429        u64 min_vruntime;
 430
 431        struct rb_root tasks_timeline;
 432        struct rb_node *rb_leftmost;
 433
 434        struct list_head tasks;
 435        struct list_head *balance_iterator;
 436
 437        /*
  438         * 'curr' points to the currently running entity on this cfs_rq.
  439         * It is set to NULL otherwise (i.e. when none is currently running).
 440         */
 441        struct sched_entity *curr, *next, *last;
 442
 443        unsigned int nr_spread_over;
 444
 445#ifdef CONFIG_FAIR_GROUP_SCHED
 446        struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
 447
 448        /*
 449         * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
 450         * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
 451         * (like users, containers etc.)
 452         *
 453         * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
 454         * list is used during load balance.
 455         */
 456        struct list_head leaf_cfs_rq_list;
 457        struct task_group *tg;  /* group that "owns" this runqueue */
 458
 459#ifdef CONFIG_SMP
 460        /*
 461         * the part of load.weight contributed by tasks
 462         */
 463        unsigned long task_weight;
 464
 465        /*
 466         *   h_load = weight * f(tg)
 467         *
 468         * Where f(tg) is the recursive weight fraction assigned to
 469         * this group.
 470         */
 471        unsigned long h_load;
 472
 473        /*
 474         * this cpu's part of tg->shares
 475         */
 476        unsigned long shares;
 477
 478        /*
 479         * load.weight at the time we set shares
 480         */
 481        unsigned long rq_weight;
 482#endif
 483#endif
 484};
 485
 486/* Real-Time classes' related field in a runqueue: */
 487struct rt_rq {
 488        struct rt_prio_array active;
 489        unsigned long rt_nr_running;
 490#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 491        struct {
 492                int curr; /* highest queued rt task prio */
 493#ifdef CONFIG_SMP
 494                int next; /* next highest */
 495#endif
 496        } highest_prio;
 497#endif
 498#ifdef CONFIG_SMP
 499        unsigned long rt_nr_migratory;
 500        unsigned long rt_nr_total;
 501        int overloaded;
 502        struct plist_head pushable_tasks;
 503#endif
 504        int rt_throttled;
 505        u64 rt_time;
 506        u64 rt_runtime;
 507        /* Nests inside the rq lock: */
 508        spinlock_t rt_runtime_lock;
 509
 510#ifdef CONFIG_RT_GROUP_SCHED
 511        unsigned long rt_nr_boosted;
 512
 513        struct rq *rq;
 514        struct list_head leaf_rt_rq_list;
 515        struct task_group *tg;
 516        struct sched_rt_entity *rt_se;
 517#endif
 518};
 519
 520#ifdef CONFIG_SMP
 521
 522/*
 523 * We add the notion of a root-domain which will be used to define per-domain
 524 * variables. Each exclusive cpuset essentially defines an island domain by
 525 * fully partitioning the member cpus from any other cpuset. Whenever a new
 526 * exclusive cpuset is created, we also create and attach a new root-domain
 527 * object.
 528 *
 529 */
 530struct root_domain {
 531        atomic_t refcount;
 532        cpumask_var_t span;
 533        cpumask_var_t online;
 534
 535        /*
 536         * The "RT overload" flag: it gets set if a CPU has more than
 537         * one runnable RT task.
 538         */
 539        cpumask_var_t rto_mask;
 540        atomic_t rto_count;
 541#ifdef CONFIG_SMP
 542        struct cpupri cpupri;
 543#endif
 544#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 545        /*
  546         * Preferred wake-up cpu nominated by sched_mc balancing, used when
  547         * most cpus in the system are idle, indicating very low overall
  548         * utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2).
 549         */
 550        unsigned int sched_mc_preferred_wakeup_cpu;
 551#endif
 552};
 553
 554/*
 555 * By default the system creates a single root-domain with all cpus as
 556 * members (mimicking the global state we have today).
 557 */
 558static struct root_domain def_root_domain;
 559
 560#endif
 561
 562/*
 563 * This is the main, per-CPU runqueue data structure.
 564 *
  565 * Locking rule: code that needs to lock multiple runqueues (such as
  566 * the load balancing or the thread migration code) must acquire the
  567 * locks in ascending &runqueue order.
 568 */
 569struct rq {
 570        /* runqueue lock: */
 571        spinlock_t lock;
 572
 573        /*
 574         * nr_running and cpu_load should be in the same cacheline because
 575         * remote CPUs use both these fields when doing load calculation.
 576         */
 577        unsigned long nr_running;
 578        #define CPU_LOAD_IDX_MAX 5
 579        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 580#ifdef CONFIG_NO_HZ
 581        unsigned long last_tick_seen;
 582        unsigned char in_nohz_recently;
 583#endif
 584        /* capture load from *all* tasks on this cpu: */
 585        struct load_weight load;
 586        unsigned long nr_load_updates;
 587        u64 nr_switches;
 588
 589        struct cfs_rq cfs;
 590        struct rt_rq rt;
 591
 592#ifdef CONFIG_FAIR_GROUP_SCHED
 593        /* list of leaf cfs_rq on this cpu: */
 594        struct list_head leaf_cfs_rq_list;
 595#endif
 596#ifdef CONFIG_RT_GROUP_SCHED
 597        struct list_head leaf_rt_rq_list;
 598#endif
 599
 600        /*
 601         * This is part of a global counter where only the total sum
 602         * over all CPUs matters. A task can increase this counter on
 603         * one CPU and if it got migrated afterwards it may decrease
 604         * it on another CPU. Always updated under the runqueue lock:
 605         */
 606        unsigned long nr_uninterruptible;
 607
 608        struct task_struct *curr, *idle;
 609        unsigned long next_balance;
 610        struct mm_struct *prev_mm;
 611
 612        u64 clock;
 613
 614        atomic_t nr_iowait;
 615
 616#ifdef CONFIG_SMP
 617        struct root_domain *rd;
 618        struct sched_domain *sd;
 619
 620        unsigned char idle_at_tick;
 621        /* For active balancing */
 622        int active_balance;
 623        int push_cpu;
 624        /* cpu of this runqueue: */
 625        int cpu;
 626        int online;
 627
 628        unsigned long avg_load_per_task;
 629
 630        struct task_struct *migration_thread;
 631        struct list_head migration_queue;
 632#endif
 633
 634#ifdef CONFIG_SCHED_HRTICK
 635#ifdef CONFIG_SMP
 636        int hrtick_csd_pending;
 637        struct call_single_data hrtick_csd;
 638#endif
 639        struct hrtimer hrtick_timer;
 640#endif
 641
 642#ifdef CONFIG_SCHEDSTATS
 643        /* latency stats */
 644        struct sched_info rq_sched_info;
 645        unsigned long long rq_cpu_time;
 646        /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
 647
 648        /* sys_sched_yield() stats */
 649        unsigned int yld_count;
 650
 651        /* schedule() stats */
 652        unsigned int sched_switch;
 653        unsigned int sched_count;
 654        unsigned int sched_goidle;
 655
 656        /* try_to_wake_up() stats */
 657        unsigned int ttwu_count;
 658        unsigned int ttwu_local;
 659
 660        /* BKL stats */
 661        unsigned int bkl_count;
 662#endif
 663};
 664
 665static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 666
 667static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
 668{
 669        rq->curr->sched_class->check_preempt_curr(rq, p, sync);
 670}
 671
 672static inline int cpu_of(struct rq *rq)
 673{
 674#ifdef CONFIG_SMP
 675        return rq->cpu;
 676#else
 677        return 0;
 678#endif
 679}
 680
 681/*
 682 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 683 * See detach_destroy_domains: synchronize_sched for details.
 684 *
 685 * The domain tree of any CPU may only be accessed from within
 686 * preempt-disabled sections.
 687 */
 688#define for_each_domain(cpu, __sd) \
 689        for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
 690
 691#define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
 692#define this_rq()               (&__get_cpu_var(runqueues))
 693#define task_rq(p)              cpu_rq(task_cpu(p))
 694#define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
 695
 696static inline void update_rq_clock(struct rq *rq)
 697{
 698        rq->clock = sched_clock_cpu(cpu_of(rq));
 699}
 700
 701/*
 702 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 703 */
 704#ifdef CONFIG_SCHED_DEBUG
 705# define const_debug __read_mostly
 706#else
 707# define const_debug static const
 708#endif
 709
 710/**
 711 * runqueue_is_locked
 712 *
 713 * Returns true if the current cpu runqueue is locked.
 714 * This interface allows printk to be called with the runqueue lock
 715 * held and know whether or not it is OK to wake up the klogd.
 716 */
 717int runqueue_is_locked(void)
 718{
 719        int cpu = get_cpu();
 720        struct rq *rq = cpu_rq(cpu);
 721        int ret;
 722
 723        ret = spin_is_locked(&rq->lock);
 724        put_cpu();
 725        return ret;
 726}
 727
 728/*
 729 * Debugging: various feature bits
 730 */
 731
 732#define SCHED_FEAT(name, enabled)       \
 733        __SCHED_FEAT_##name ,
 734
 735enum {
 736#include "sched_features.h"
 737};
 738
 739#undef SCHED_FEAT
 740
 741#define SCHED_FEAT(name, enabled)       \
 742        (1UL << __SCHED_FEAT_##name) * enabled |
 743
 744const_debug unsigned int sysctl_sched_features =
 745#include "sched_features.h"
 746        0;
 747
 748#undef SCHED_FEAT
 749
 750#ifdef CONFIG_SCHED_DEBUG
 751#define SCHED_FEAT(name, enabled)       \
 752        #name ,
 753
 754static __read_mostly char *sched_feat_names[] = {
 755#include "sched_features.h"
 756        NULL
 757};
 758
 759#undef SCHED_FEAT
 760
 761static int sched_feat_show(struct seq_file *m, void *v)
 762{
 763        int i;
 764
 765        for (i = 0; sched_feat_names[i]; i++) {
 766                if (!(sysctl_sched_features & (1UL << i)))
 767                        seq_puts(m, "NO_");
 768                seq_printf(m, "%s ", sched_feat_names[i]);
 769        }
 770        seq_puts(m, "\n");
 771
 772        return 0;
 773}
 774
 775static ssize_t
 776sched_feat_write(struct file *filp, const char __user *ubuf,
 777                size_t cnt, loff_t *ppos)
 778{
 779        char buf[64];
 780        char *cmp = buf;
 781        int neg = 0;
 782        int i;
 783
 784        if (cnt > 63)
 785                cnt = 63;
 786
 787        if (copy_from_user(&buf, ubuf, cnt))
 788                return -EFAULT;
 789
 790        buf[cnt] = 0;
 791
 792        if (strncmp(buf, "NO_", 3) == 0) {
 793                neg = 1;
 794                cmp += 3;
 795        }
 796
 797        for (i = 0; sched_feat_names[i]; i++) {
 798                int len = strlen(sched_feat_names[i]);
 799
 800                if (strncmp(cmp, sched_feat_names[i], len) == 0) {
 801                        if (neg)
 802                                sysctl_sched_features &= ~(1UL << i);
 803                        else
 804                                sysctl_sched_features |= (1UL << i);
 805                        break;
 806                }
 807        }
 808
 809        if (!sched_feat_names[i])
 810                return -EINVAL;
 811
 812        filp->f_pos += cnt;
 813
 814        return cnt;
 815}
 816
 817static int sched_feat_open(struct inode *inode, struct file *filp)
 818{
 819        return single_open(filp, sched_feat_show, NULL);
 820}
 821
 822static struct file_operations sched_feat_fops = {
 823        .open           = sched_feat_open,
 824        .write          = sched_feat_write,
 825        .read           = seq_read,
 826        .llseek         = seq_lseek,
 827        .release        = single_release,
 828};
 829
 830static __init int sched_init_debug(void)
 831{
 832        debugfs_create_file("sched_features", 0644, NULL, NULL,
 833                        &sched_feat_fops);
 834
 835        return 0;
 836}
 837late_initcall(sched_init_debug);
 838
 839#endif
 840
 841#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
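/*
 * The repeated includes of "sched_features.h" above are an X-macro pattern:
 * that header contains only SCHED_FEAT(name, enabled) lines (for instance an
 * entry for HRTICK, which sched_feat(HRTICK) tests further down).  Because
 * SCHED_FEAT() is redefined before each include, the same list expands once
 * into the __SCHED_FEAT_* enum, once into the default sysctl_sched_features
 * bitmask, and - under CONFIG_SCHED_DEBUG - once into the sched_feat_names[]
 * table behind the "sched_features" debugfs file created below.
 */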
 842
 843/*
 844 * Number of tasks to iterate in a single balance run.
 845 * Limited because this is done with IRQs disabled.
 846 */
 847const_debug unsigned int sysctl_sched_nr_migrate = 32;
 848
 849/*
 850 * ratelimit for updating the group shares.
 851 * default: 0.25ms
 852 */
 853unsigned int sysctl_sched_shares_ratelimit = 250000;
 854
 855/*
  856 * Inject some fuzziness into changing the per-cpu group shares;
  857 * this avoids remote rq-locks at the expense of fairness.
 858 * default: 4
 859 */
 860unsigned int sysctl_sched_shares_thresh = 4;
 861
 862/*
 863 * period over which we measure -rt task cpu usage in us.
 864 * default: 1s
 865 */
 866unsigned int sysctl_sched_rt_period = 1000000;
 867
 868static __read_mostly int scheduler_running;
 869
 870/*
 871 * part of the period that we allow rt tasks to run in us.
 872 * default: 0.95s
 873 */
 874int sysctl_sched_rt_runtime = 950000;
 875
 876static inline u64 global_rt_period(void)
 877{
 878        return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
 879}
 880
 881static inline u64 global_rt_runtime(void)
 882{
 883        if (sysctl_sched_rt_runtime < 0)
 884                return RUNTIME_INF;
 885
 886        return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 887}
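/*
 * With the defaults above, global_rt_period() is 1,000,000 us * 1,000 =
 * 1,000,000,000 ns and global_rt_runtime() is 950,000,000 ns: realtime
 * tasks may consume at most 0.95s of every 1s period, leaving the rest for
 * non-rt tasks.  Making sysctl_sched_rt_runtime negative turns the limit
 * into RUNTIME_INF, i.e. no rt throttling at all.
 */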
 888
 889#ifndef prepare_arch_switch
 890# define prepare_arch_switch(next)      do { } while (0)
 891#endif
 892#ifndef finish_arch_switch
 893# define finish_arch_switch(prev)       do { } while (0)
 894#endif
 895
 896static inline int task_current(struct rq *rq, struct task_struct *p)
 897{
 898        return rq->curr == p;
 899}
 900
 901#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 902static inline int task_running(struct rq *rq, struct task_struct *p)
 903{
 904        return task_current(rq, p);
 905}
 906
 907static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 908{
 909}
 910
 911static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 912{
 913#ifdef CONFIG_DEBUG_SPINLOCK
 914        /* this is a valid case when another task releases the spinlock */
 915        rq->lock.owner = current;
 916#endif
 917        /*
 918         * If we are tracking spinlock dependencies then we have to
 919         * fix up the runqueue lock - which gets 'carried over' from
 920         * prev into current:
 921         */
 922        spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
 923
 924        spin_unlock_irq(&rq->lock);
 925}
 926
 927#else /* __ARCH_WANT_UNLOCKED_CTXSW */
 928static inline int task_running(struct rq *rq, struct task_struct *p)
 929{
 930#ifdef CONFIG_SMP
 931        return p->oncpu;
 932#else
 933        return task_current(rq, p);
 934#endif
 935}
 936
 937static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 938{
 939#ifdef CONFIG_SMP
 940        /*
 941         * We can optimise this out completely for !SMP, because the
 942         * SMP rebalancing from interrupt is the only thing that cares
 943         * here.
 944         */
 945        next->oncpu = 1;
 946#endif
 947#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 948        spin_unlock_irq(&rq->lock);
 949#else
 950        spin_unlock(&rq->lock);
 951#endif
 952}
 953
 954static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 955{
 956#ifdef CONFIG_SMP
 957        /*
 958         * After ->oncpu is cleared, the task can be moved to a different CPU.
 959         * We must ensure this doesn't happen until the switch is completely
 960         * finished.
 961         */
 962        smp_wmb();
 963        prev->oncpu = 0;
 964#endif
 965#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 966        local_irq_enable();
 967#endif
 968}
 969#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 970
 971/*
 972 * __task_rq_lock - lock the runqueue a given task resides on.
  973 * Must be called with interrupts disabled.
 974 */
 975static inline struct rq *__task_rq_lock(struct task_struct *p)
 976        __acquires(rq->lock)
 977{
 978        for (;;) {
 979                struct rq *rq = task_rq(p);
 980                spin_lock(&rq->lock);
 981                if (likely(rq == task_rq(p)))
 982                        return rq;
 983                spin_unlock(&rq->lock);
 984        }
 985}
 986
 987/*
 988 * task_rq_lock - lock the runqueue a given task resides on and disable
 989 * interrupts. Note the ordering: we can safely lookup the task_rq without
 990 * explicitly disabling preemption.
 991 */
 992static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 993        __acquires(rq->lock)
 994{
 995        struct rq *rq;
 996
 997        for (;;) {
 998                local_irq_save(*flags);
 999                rq = task_rq(p);
1000                spin_lock(&rq->lock);
1001                if (likely(rq == task_rq(p)))
1002                        return rq;
1003                spin_unlock_irqrestore(&rq->lock, *flags);
1004        }
1005}
1006
1007void task_rq_unlock_wait(struct task_struct *p)
1008{
1009        struct rq *rq = task_rq(p);
1010
1011        smp_mb(); /* spin-unlock-wait is not a full memory barrier */
1012        spin_unlock_wait(&rq->lock);
1013}
1014
1015static void __task_rq_unlock(struct rq *rq)
1016        __releases(rq->lock)
1017{
1018        spin_unlock(&rq->lock);
1019}
1020
1021static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
1022        __releases(rq->lock)
1023{
1024        spin_unlock_irqrestore(&rq->lock, *flags);
1025}
1026
1027/*
1028 * this_rq_lock - lock this runqueue and disable interrupts.
1029 */
1030static struct rq *this_rq_lock(void)
1031        __acquires(rq->lock)
1032{
1033        struct rq *rq;
1034
1035        local_irq_disable();
1036        rq = this_rq();
1037        spin_lock(&rq->lock);
1038
1039        return rq;
1040}
1041
1042#ifdef CONFIG_SCHED_HRTICK
1043/*
1044 * Use HR-timers to deliver accurate preemption points.
1045 *
 1046 * It's all a bit involved since we cannot program an hrtimer while holding
 1047 * the rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
1048 * reschedule event.
1049 *
1050 * When we get rescheduled we reprogram the hrtick_timer outside of the
1051 * rq->lock.
1052 */
1053
1054/*
1055 * Use hrtick when:
1056 *  - enabled by features
1057 *  - hrtimer is actually high res
1058 */
1059static inline int hrtick_enabled(struct rq *rq)
1060{
1061        if (!sched_feat(HRTICK))
1062                return 0;
1063        if (!cpu_active(cpu_of(rq)))
1064                return 0;
1065        return hrtimer_is_hres_active(&rq->hrtick_timer);
1066}
1067
1068static void hrtick_clear(struct rq *rq)
1069{
1070        if (hrtimer_active(&rq->hrtick_timer))
1071                hrtimer_cancel(&rq->hrtick_timer);
1072}
1073
1074/*
1075 * High-resolution timer tick.
1076 * Runs from hardirq context with interrupts disabled.
1077 */
1078static enum hrtimer_restart hrtick(struct hrtimer *timer)
1079{
1080        struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1081
1082        WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1083
1084        spin_lock(&rq->lock);
1085        update_rq_clock(rq);
1086        rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1087        spin_unlock(&rq->lock);
1088
1089        return HRTIMER_NORESTART;
1090}
1091
1092#ifdef CONFIG_SMP
1093/*
1094 * called from hardirq (IPI) context
1095 */
1096static void __hrtick_start(void *arg)
1097{
1098        struct rq *rq = arg;
1099
1100        spin_lock(&rq->lock);
1101        hrtimer_restart(&rq->hrtick_timer);
1102        rq->hrtick_csd_pending = 0;
1103        spin_unlock(&rq->lock);
1104}
1105
1106/*
1107 * Called to set the hrtick timer state.
1108 *
1109 * called with rq->lock held and irqs disabled
1110 */
1111static void hrtick_start(struct rq *rq, u64 delay)
1112{
1113        struct hrtimer *timer = &rq->hrtick_timer;
1114        ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1115
1116        hrtimer_set_expires(timer, time);
1117
1118        if (rq == this_rq()) {
1119                hrtimer_restart(timer);
1120        } else if (!rq->hrtick_csd_pending) {
1121                __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1122                rq->hrtick_csd_pending = 1;
1123        }
1124}
1125
1126static int
1127hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1128{
1129        int cpu = (int)(long)hcpu;
1130
1131        switch (action) {
1132        case CPU_UP_CANCELED:
1133        case CPU_UP_CANCELED_FROZEN:
1134        case CPU_DOWN_PREPARE:
1135        case CPU_DOWN_PREPARE_FROZEN:
1136        case CPU_DEAD:
1137        case CPU_DEAD_FROZEN:
1138                hrtick_clear(cpu_rq(cpu));
1139                return NOTIFY_OK;
1140        }
1141
1142        return NOTIFY_DONE;
1143}
1144
1145static __init void init_hrtick(void)
1146{
1147        hotcpu_notifier(hotplug_hrtick, 0);
1148}
1149#else
1150/*
1151 * Called to set the hrtick timer state.
1152 *
1153 * called with rq->lock held and irqs disabled
1154 */
1155static void hrtick_start(struct rq *rq, u64 delay)
1156{
1157        __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1158                        HRTIMER_MODE_REL, 0);
1159}
1160
1161static inline void init_hrtick(void)
1162{
1163}
1164#endif /* CONFIG_SMP */
1165
1166static void init_rq_hrtick(struct rq *rq)
1167{
1168#ifdef CONFIG_SMP
1169        rq->hrtick_csd_pending = 0;
1170
1171        rq->hrtick_csd.flags = 0;
1172        rq->hrtick_csd.func = __hrtick_start;
1173        rq->hrtick_csd.info = rq;
1174#endif
1175
1176        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1177        rq->hrtick_timer.function = hrtick;
1178}
1179#else   /* CONFIG_SCHED_HRTICK */
1180static inline void hrtick_clear(struct rq *rq)
1181{
1182}
1183
1184static inline void init_rq_hrtick(struct rq *rq)
1185{
1186}
1187
1188static inline void init_hrtick(void)
1189{
1190}
1191#endif  /* CONFIG_SCHED_HRTICK */
1192
1193/*
1194 * resched_task - mark a task 'to be rescheduled now'.
1195 *
 1196 * On UP this means setting the need_resched flag; on SMP it
1197 * might also involve a cross-CPU call to trigger the scheduler on
1198 * the target CPU.
1199 */
1200#ifdef CONFIG_SMP
1201
1202#ifndef tsk_is_polling
1203#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1204#endif
1205
1206static void resched_task(struct task_struct *p)
1207{
1208        int cpu;
1209
1210        assert_spin_locked(&task_rq(p)->lock);
1211
1212        if (test_tsk_need_resched(p))
1213                return;
1214
1215        set_tsk_need_resched(p);
1216
1217        cpu = task_cpu(p);
1218        if (cpu == smp_processor_id())
1219                return;
1220
1221        /* NEED_RESCHED must be visible before we test polling */
1222        smp_mb();
1223        if (!tsk_is_polling(p))
1224                smp_send_reschedule(cpu);
1225}
1226
1227static void resched_cpu(int cpu)
1228{
1229        struct rq *rq = cpu_rq(cpu);
1230        unsigned long flags;
1231
1232        if (!spin_trylock_irqsave(&rq->lock, flags))
1233                return;
1234        resched_task(cpu_curr(cpu));
1235        spin_unlock_irqrestore(&rq->lock, flags);
1236}
1237
1238#ifdef CONFIG_NO_HZ
1239/*
1240 * When add_timer_on() enqueues a timer into the timer wheel of an
1241 * idle CPU then this timer might expire before the next timer event
1242 * which is scheduled to wake up that CPU. In case of a completely
1243 * idle system the next event might even be infinite time into the
1244 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1245 * leaves the inner idle loop so the newly added timer is taken into
1246 * account when the CPU goes back to idle and evaluates the timer
1247 * wheel for the next timer event.
1248 */
1249void wake_up_idle_cpu(int cpu)
1250{
1251        struct rq *rq = cpu_rq(cpu);
1252
1253        if (cpu == smp_processor_id())
1254                return;
1255
1256        /*
1257         * This is safe, as this function is called with the timer
1258         * wheel base lock of (cpu) held. When the CPU is on the way
1259         * to idle and has not yet set rq->curr to idle then it will
1260         * be serialized on the timer wheel base lock and take the new
1261         * timer into account automatically.
1262         */
1263        if (rq->curr != rq->idle)
1264                return;
1265
1266        /*
 1267         * We can set TIF_RESCHED on the idle task of the other CPU
 1268         * locklessly. The worst case is that the other CPU runs the
1269         * idle task through an additional NOOP schedule()
1270         */
1271        set_tsk_need_resched(rq->idle);
1272
1273        /* NEED_RESCHED must be visible before we test polling */
1274        smp_mb();
1275        if (!tsk_is_polling(rq->idle))
1276                smp_send_reschedule(cpu);
1277}
1278#endif /* CONFIG_NO_HZ */
1279
1280#else /* !CONFIG_SMP */
1281static void resched_task(struct task_struct *p)
1282{
1283        assert_spin_locked(&task_rq(p)->lock);
1284        set_tsk_need_resched(p);
1285}
1286#endif /* CONFIG_SMP */
1287
1288#if BITS_PER_LONG == 32
1289# define WMULT_CONST    (~0UL)
1290#else
1291# define WMULT_CONST    (1UL << 32)
1292#endif
1293
1294#define WMULT_SHIFT     32
1295
1296/*
1297 * Shift right and round:
1298 */
1299#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1300
1301/*
1302 * delta *= weight / lw
1303 */
1304static unsigned long
1305calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1306                struct load_weight *lw)
1307{
1308        u64 tmp;
1309
1310        if (!lw->inv_weight) {
1311                if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1312                        lw->inv_weight = 1;
1313                else
1314                        lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1315                                / (lw->weight+1);
1316        }
1317
1318        tmp = (u64)delta_exec * weight;
1319        /*
1320         * Check whether we'd overflow the 64-bit multiplication:
1321         */
1322        if (unlikely(tmp > WMULT_CONST))
1323                tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1324                        WMULT_SHIFT/2);
1325        else
1326                tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1327
1328        return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1329}
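/*
 * Worked example for calc_delta_mine(): with delta_exec = 1,000,000 (1ms in
 * ns), weight = NICE_0_LOAD (1024) and lw->weight = 2048, inv_weight is
 * roughly 2^32/2048, tmp = 1,024,000,000 stays below WMULT_CONST, and
 * SRR(tmp * inv_weight, 32) comes out at about 500,000 - i.e. the scaled
 * delta is half the raw one, matching delta * weight / lw->weight.  SRR()
 * itself just adds half the divisor before shifting, so the result is
 * rounded to nearest instead of truncated.
 */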
1330
1331static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1332{
1333        lw->weight += inc;
1334        lw->inv_weight = 0;
1335}
1336
1337static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1338{
1339        lw->weight -= dec;
1340        lw->inv_weight = 0;
1341}
1342
1343/*
1344 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1345 * of tasks with abnormal "nice" values across CPUs the contribution that
1346 * each task makes to its run queue's load is weighted according to its
1347 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1348 * scaled version of the new time slice allocation that they receive on time
1349 * slice expiry etc.
1350 */
1351
1352#define WEIGHT_IDLEPRIO                3
1353#define WMULT_IDLEPRIO         1431655765
1354
1355/*
1356 * Nice levels are multiplicative, with a gentle 10% change for every
1357 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1358 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1359 * that remained on nice 0.
1360 *
1361 * The "10% effect" is relative and cumulative: from _any_ nice level,
1362 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1363 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1364 * If a task goes up by ~10% and another task goes down by ~10% then
1365 * the relative distance between them is ~25%.)
1366 */
1367static const int prio_to_weight[40] = {
1368 /* -20 */     88761,     71755,     56483,     46273,     36291,
1369 /* -15 */     29154,     23254,     18705,     14949,     11916,
1370 /* -10 */      9548,      7620,      6100,      4904,      3906,
1371 /*  -5 */      3121,      2501,      1991,      1586,      1277,
1372 /*   0 */      1024,       820,       655,       526,       423,
1373 /*   5 */       335,       272,       215,       172,       137,
1374 /*  10 */       110,        87,        70,        56,        45,
1375 /*  15 */        36,        29,        23,        18,        15,
1376};
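/*
 * Worked example: two CPU-bound tasks at nice 0 weigh 1024 each and split
 * the CPU 50%/50%.  Renicing one of them to +1 changes the weights to
 * 1024:820, i.e. roughly 55.5%/44.5% - the reniced task loses about 10% of
 * its CPU time, which is exactly the ~1.25 step between neighbouring
 * entries in this table.
 */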
1377
1378/*
1379 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1380 *
1381 * In cases where the weight does not change often, we can use the
 1382 * precalculated inverse to speed up arithmetic by turning divisions
1383 * into multiplications:
1384 */
1385static const u32 prio_to_wmult[40] = {
1386 /* -20 */     48388,     59856,     76040,     92818,    118348,
1387 /* -15 */    147320,    184698,    229616,    287308,    360437,
1388 /* -10 */    449829,    563644,    704093,    875809,   1099582,
1389 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
1390 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
1391 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
1392 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
1393 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1394};
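/*
 * Worked example: prio_to_weight[20] (nice 0) is 1024 and prio_to_wmult[20]
 * is 4194304 == 2^32 / 1024, so set_load_weight() below can seed
 * se.load.inv_weight with the precomputed inverse and calc_delta_mine()
 * can replace the division by the weight with a multiply and a 32-bit
 * shift.
 */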
1395
1396static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1397
1398/*
1399 * runqueue iterator, to support SMP load-balancing between different
1400 * scheduling classes, without having to expose their internal data
1401 * structures to the load-balancing proper:
1402 */
1403struct rq_iterator {
1404        void *arg;
1405        struct task_struct *(*start)(void *);
1406        struct task_struct *(*next)(void *);
1407};
1408
1409#ifdef CONFIG_SMP
1410static unsigned long
1411balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1412              unsigned long max_load_move, struct sched_domain *sd,
1413              enum cpu_idle_type idle, int *all_pinned,
1414              int *this_best_prio, struct rq_iterator *iterator);
1415
1416static int
1417iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1418                   struct sched_domain *sd, enum cpu_idle_type idle,
1419                   struct rq_iterator *iterator);
1420#endif
1421
1422/* Time spent by the tasks of the cpu accounting group executing in ... */
1423enum cpuacct_stat_index {
1424        CPUACCT_STAT_USER,      /* ... user mode */
1425        CPUACCT_STAT_SYSTEM,    /* ... kernel mode */
1426
1427        CPUACCT_STAT_NSTATS,
1428};
1429
1430#ifdef CONFIG_CGROUP_CPUACCT
1431static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1432static void cpuacct_update_stats(struct task_struct *tsk,
1433                enum cpuacct_stat_index idx, cputime_t val);
1434#else
1435static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1436static inline void cpuacct_update_stats(struct task_struct *tsk,
1437                enum cpuacct_stat_index idx, cputime_t val) {}
1438#endif
1439
1440static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1441{
1442        update_load_add(&rq->load, load);
1443}
1444
1445static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1446{
1447        update_load_sub(&rq->load, load);
1448}
1449
1450#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1451typedef int (*tg_visitor)(struct task_group *, void *);
1452
1453/*
1454 * Iterate the full tree, calling @down when first entering a node and @up when
1455 * leaving it for the final time.
1456 */
1457static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1458{
1459        struct task_group *parent, *child;
1460        int ret;
1461
1462        rcu_read_lock();
1463        parent = &root_task_group;
1464down:
1465        ret = (*down)(parent, data);
1466        if (ret)
1467                goto out_unlock;
1468        list_for_each_entry_rcu(child, &parent->children, siblings) {
1469                parent = child;
1470                goto down;
1471
1472up:
1473                continue;
1474        }
1475        ret = (*up)(parent, data);
1476        if (ret)
1477                goto out_unlock;
1478
1479        child = parent;
1480        parent = parent->parent;
1481        if (parent)
1482                goto up;
1483out_unlock:
1484        rcu_read_unlock();
1485
1486        return ret;
1487}
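/*
 * The goto-based loop in walk_tg_tree() is an iterative depth-first walk.
 * A recursive sketch of the same traversal (illustrative only - the
 * function name below is made up, and the kernel version avoids recursion
 * to keep stack depth bounded):
 *
 *	static int walk_tg_tree_sketch(struct task_group *tg,
 *				       tg_visitor down, tg_visitor up, void *data)
 *	{
 *		struct task_group *child;
 *		int ret = (*down)(tg, data);
 *
 *		if (ret)
 *			return ret;
 *		list_for_each_entry_rcu(child, &tg->children, siblings) {
 *			ret = walk_tg_tree_sketch(child, down, up, data);
 *			if (ret)
 *				return ret;
 *		}
 *		return (*up)(tg, data);
 *	}
 */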
1488
1489static int tg_nop(struct task_group *tg, void *data)
1490{
1491        return 0;
1492}
1493#endif
1494
1495#ifdef CONFIG_SMP
1496static unsigned long source_load(int cpu, int type);
1497static unsigned long target_load(int cpu, int type);
1498static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1499
1500static unsigned long cpu_avg_load_per_task(int cpu)
1501{
1502        struct rq *rq = cpu_rq(cpu);
1503        unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1504
1505        if (nr_running)
1506                rq->avg_load_per_task = rq->load.weight / nr_running;
1507        else
1508                rq->avg_load_per_task = 0;
1509
1510        return rq->avg_load_per_task;
1511}
1512
1513#ifdef CONFIG_FAIR_GROUP_SCHED
1514
1515static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1516
1517/*
1518 * Calculate and set the cpu's group shares.
1519 */
1520static void
1521update_group_shares_cpu(struct task_group *tg, int cpu,
1522                        unsigned long sd_shares, unsigned long sd_rq_weight)
1523{
1524        unsigned long shares;
1525        unsigned long rq_weight;
1526
1527        if (!tg->se[cpu])
1528                return;
1529
1530        rq_weight = tg->cfs_rq[cpu]->rq_weight;
1531
1532        /*
1533         *           \Sum shares * rq_weight
1534         * shares =  -----------------------
1535         *               \Sum rq_weight
1536         *
1537         */
1538        shares = (sd_shares * rq_weight) / sd_rq_weight;
1539        shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1540
1541        if (abs(shares - tg->se[cpu]->load.weight) >
1542                        sysctl_sched_shares_thresh) {
1543                struct rq *rq = cpu_rq(cpu);
1544                unsigned long flags;
1545
1546                spin_lock_irqsave(&rq->lock, flags);
1547                tg->cfs_rq[cpu]->shares = shares;
1548
1549                __set_se_shares(tg->se[cpu], shares);
1550                spin_unlock_irqrestore(&rq->lock, flags);
1551        }
1552}
1553
1554/*
 1555 * Re-compute a task group's per-cpu shares over the given domain.
1556 * This needs to be done in a bottom-up fashion because the rq weight of a
1557 * parent group depends on the shares of its child groups.
1558 */
1559static int tg_shares_up(struct task_group *tg, void *data)
1560{
1561        unsigned long weight, rq_weight = 0;
1562        unsigned long shares = 0;
1563        struct sched_domain *sd = data;
1564        int i;
1565
1566        for_each_cpu(i, sched_domain_span(sd)) {
1567                /*
1568                 * If there are currently no tasks on the cpu pretend there
1569                 * is one of average load so that when a new task gets to
1570                 * run here it will not get delayed by group starvation.
1571                 */
1572                weight = tg->cfs_rq[i]->load.weight;
1573                if (!weight)
1574                        weight = NICE_0_LOAD;
1575
1576                tg->cfs_rq[i]->rq_weight = weight;
1577                rq_weight += weight;
1578                shares += tg->cfs_rq[i]->shares;
1579        }
1580
1581        if ((!shares && rq_weight) || shares > tg->shares)
1582                shares = tg->shares;
1583
1584        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1585                shares = tg->shares;
1586
1587        for_each_cpu(i, sched_domain_span(sd))
1588                update_group_shares_cpu(tg, i, shares, rq_weight);
1589
1590        return 0;
1591}
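/*
 * Worked example: a group with tg->shares = 1024 whose runnable weight
 * within the domain is 3072 on one cpu and 1024 on another has its 1024
 * shares split 1024 * 3072/4096 = 768 and 1024 * 1024/4096 = 256
 * respectively; update_group_shares_cpu() only rewrites a cpu's share when
 * it moved by more than sysctl_sched_shares_thresh.
 */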
1592
1593/*
1594 * Compute the cpu's hierarchical load factor for each task group.
1595 * This needs to be done in a top-down fashion because the load of a child
 1596 * group is a fraction of its parent's load.
1597 */
1598static int tg_load_down(struct task_group *tg, void *data)
1599{
1600        unsigned long load;
1601        long cpu = (long)data;
1602
1603        if (!tg->parent) {
1604                load = cpu_rq(cpu)->load.weight;
1605        } else {
1606                load = tg->parent->cfs_rq[cpu]->h_load;
1607                load *= tg->cfs_rq[cpu]->shares;
1608                load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1609        }
1610
1611        tg->cfs_rq[cpu]->h_load = load;
1612
1613        return 0;
1614}
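/*
 * Worked example: if a group's parent has h_load 2048 on this cpu and the
 * parent's cfs_rq weight is 2048, a child owning 512 of those weight units
 * gets h_load = 2048 * 512 / (2048 + 1) ~= 511, i.e. its tasks count for
 * about a quarter of this cpu's load in balancing decisions; the "+ 1"
 * only guards against dividing by zero.
 */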
1615
1616static void update_shares(struct sched_domain *sd)
1617{
1618        u64 now = cpu_clock(raw_smp_processor_id());
1619        s64 elapsed = now - sd->last_update;
1620
1621        if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1622                sd->last_update = now;
1623                walk_tg_tree(tg_nop, tg_shares_up, sd);
1624        }
1625}
1626
1627static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1628{
1629        spin_unlock(&rq->lock);
1630        update_shares(sd);
1631        spin_lock(&rq->lock);
1632}
1633
1634static void update_h_load(long cpu)
1635{
1636        walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1637}
1638
1639#else
1640
1641static inline void update_shares(struct sched_domain *sd)
1642{
1643}
1644
1645static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1646{
1647}
1648
1649#endif
1650
1651#ifdef CONFIG_PREEMPT
1652
1653/*
1654 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1655 * way at the expense of forcing extra atomic operations in all
1656 * invocations.  This assures that the double_lock is acquired using the
1657 * same underlying policy as the spinlock_t on this architecture, which
1658 * reduces latency compared to the unfair variant below.  However, it
1659 * also adds more overhead and therefore may reduce throughput.
1660 */
1661static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1662        __releases(this_rq->lock)
1663        __acquires(busiest->lock)
1664        __acquires(this_rq->lock)
1665{
1666        spin_unlock(&this_rq->lock);
1667        double_rq_lock(this_rq, busiest);
1668
1669        return 1;
1670}
1671
1672#else
1673/*
1674 * Unfair double_lock_balance: Optimizes throughput at the expense of
1675 * latency by eliminating extra atomic operations when the locks are
1676 * already in proper order on entry.  This favors lower cpu-ids and will
1677 * grant the double lock to lower cpus over higher ids under contention,
1678 * regardless of entry order into the function.
1679 */
1680static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1681        __releases(this_rq->lock)
1682        __acquires(busiest->lock)
1683        __acquires(this_rq->lock)
1684{
1685        int ret = 0;
1686
1687        if (unlikely(!spin_trylock(&busiest->lock))) {
1688                if (busiest < this_rq) {
1689                        spin_unlock(&this_rq->lock);
1690                        spin_lock(&busiest->lock);
1691                        spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
1692                        ret = 1;
1693                } else
1694                        spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
1695        }
1696        return ret;
1697}
1698
1699#endif /* CONFIG_PREEMPT */
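/*
 * Both variants above ultimately rely on the usual address-ordering rule
 * for taking two runqueue locks without an ABBA deadlock (double_rq_lock()
 * applies the same rule).  A sketch of the rule, for any two runqueues rq1
 * and rq2:
 *
 *	if (rq1 < rq2) {
 *		spin_lock(&rq1->lock);
 *		spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
 *	} else {
 *		spin_lock(&rq2->lock);
 *		spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
 *	}
 *
 * Two CPUs following this always contend on the lower-addressed lock first,
 * so neither can end up holding the lock the other one is waiting for.
 */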
1700
1701/*
1702 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1703 */
1704static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1705{
1706        if (unlikely(!irqs_disabled())) {
 1707                /* printk() doesn't work well under rq->lock */
1708                spin_unlock(&this_rq->lock);
1709                BUG_ON(1);
1710        }
1711
1712        return _double_lock_balance(this_rq, busiest);
1713}
1714
1715static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1716        __releases(busiest->lock)
1717{
1718        spin_unlock(&busiest->lock);
1719        lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1720}
1721#endif
1722
1723#ifdef CONFIG_FAIR_GROUP_SCHED
1724static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1725{
1726#ifdef CONFIG_SMP
1727        cfs_rq->shares = shares;
1728#endif
1729}
1730#endif
1731
1732#include "sched_stats.h"
1733#include "sched_idletask.c"
1734#include "sched_fair.c"
1735#include "sched_rt.c"
1736#ifdef CONFIG_SCHED_DEBUG
1737# include "sched_debug.c"
1738#endif
1739
1740#define sched_class_highest (&rt_sched_class)
1741#define for_each_class(class) \
1742   for (class = sched_class_highest; class; class = class->next)
1743
1744static void inc_nr_running(struct rq *rq)
1745{
1746        rq->nr_running++;
1747}
1748
1749static void dec_nr_running(struct rq *rq)
1750{
1751        rq->nr_running--;
1752}
1753
1754static void set_load_weight(struct task_struct *p)
1755{
1756        if (task_has_rt_policy(p)) {
1757                p->se.load.weight = prio_to_weight[0] * 2;
1758                p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1759                return;
1760        }
1761
1762        /*
1763         * SCHED_IDLE tasks get minimal weight:
1764         */
1765        if (p->policy == SCHED_IDLE) {
1766                p->se.load.weight = WEIGHT_IDLEPRIO;
1767                p->se.load.inv_weight = WMULT_IDLEPRIO;
1768                return;
1769        }
1770
1771        p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1772        p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1773}
1774
1775static void update_avg(u64 *avg, u64 sample)
1776{
1777        s64 diff = sample - *avg;
1778        *avg += diff >> 3;
1779}
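/*
 * update_avg() is an exponentially weighted moving average with weight 1/8:
 * avg' = avg + (sample - avg)/8.  Worked example: avg = 800 and sample =
 * 1600 give diff = 800, so the new avg is 800 + 800/8 = 900; after eight
 * identical samples the average has covered roughly two thirds of the gap
 * to the new value.
 */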
1780
1781static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1782{
1783        if (wakeup)
1784                p->se.start_runtime = p->se.sum_exec_runtime;
1785
1786        sched_info_queued(p);
1787        p->sched_class->enqueue_task(rq, p, wakeup);
1788        p->se.on_rq = 1;
1789}
1790
1791static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1792{
1793        if (sleep) {
1794                if (p->se.last_wakeup) {
1795                        update_avg(&p->se.avg_overlap,
1796                                p->se.sum_exec_runtime - p->se.last_wakeup);
1797                        p->se.last_wakeup = 0;
1798                } else {
1799                        update_avg(&p->se.avg_wakeup,
1800                                sysctl_sched_wakeup_granularity);
1801                }
1802        }
1803
1804        sched_info_dequeued(p);
1805        p->sched_class->dequeue_task(rq, p, sleep);
1806        p->se.on_rq = 0;
1807}
1808
1809/*
1810 * __normal_prio - return the priority that is based on the static prio
1811 */
1812static inline int __normal_prio(struct task_struct *p)
1813{
1814        return p->static_prio;
1815}
1816
1817/*
1818 * Calculate the expected normal priority: i.e. priority
1819 * without taking RT-inheritance into account. Might be
1820 * boosted by interactivity modifiers. Changes upon fork,
1821 * setprio syscalls, and whenever the interactivity
1822 * estimator recalculates.
1823 */
1824static inline int normal_prio(struct task_struct *p)
1825{
1826        int prio;
1827
1828        if (task_has_rt_policy(p))
1829                prio = MAX_RT_PRIO-1 - p->rt_priority;
1830        else
1831                prio = __normal_prio(p);
1832        return prio;
1833}
1834
1835/*
1836 * Calculate the current priority, i.e. the priority
1837 * taken into account by the scheduler. This value might
1838 * be boosted by RT tasks, or might be boosted by
1839 * interactivity modifiers. Will be RT if the task got
1840 * RT-boosted. If not then it returns p->normal_prio.
1841 */
1842static int effective_prio(struct task_struct *p)
1843{
1844        p->normal_prio = normal_prio(p);
1845        /*
1846         * If the task is RT or was boosted to RT priority,
1847         * keep the priority unchanged. Otherwise, update priority
1848         * to the normal priority:
1849         */
1850        if (!rt_prio(p->prio))
1851                return p->normal_prio;
1852        return p->prio;
1853}
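    /*
     * Summary of the priority fields (editorial): static_prio follows
     * the nice value, normal_prio equals static_prio for fair tasks but
     * is derived from rt_priority for RT-policy tasks, and prio is what
     * the scheduler actually uses; it only differs from normal_prio
     * while the task is PI-boosted to an RT priority.
     */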
1854
1855/*
1856 * activate_task - move a task to the runqueue.
1857 */
1858static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1859{
1860        if (task_contributes_to_load(p))
1861                rq->nr_uninterruptible--;
1862
1863        enqueue_task(rq, p, wakeup);
1864        inc_nr_running(rq);
1865}
1866
1867/*
1868 * deactivate_task - remove a task from the runqueue.
1869 */
1870static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1871{
1872        if (task_contributes_to_load(p))
1873                rq->nr_uninterruptible++;
1874
1875        dequeue_task(rq, p, sleep);
1876        dec_nr_running(rq);
1877}
1878
1879/**
1880 * task_curr - is this task currently executing on a CPU?
1881 * @p: the task in question.
1882 */
1883inline int task_curr(const struct task_struct *p)
1884{
1885        return cpu_curr(task_cpu(p)) == p;
1886}
1887
1888static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1889{
1890        set_task_rq(p, cpu);
1891#ifdef CONFIG_SMP
1892        /*
1893         * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1894         * successfully executed on another CPU. We must ensure that updates of
1895         * per-task data have been completed by this moment.
1896         */
1897        smp_wmb();
1898        task_thread_info(p)->cpu = cpu;
1899#endif
1900}
1901
1902static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1903                                       const struct sched_class *prev_class,
1904                                       int oldprio, int running)
1905{
1906        if (prev_class != p->sched_class) {
1907                if (prev_class->switched_from)
1908                        prev_class->switched_from(rq, p, running);
1909                p->sched_class->switched_to(rq, p, running);
1910        } else
1911                p->sched_class->prio_changed(rq, p, oldprio, running);
1912}
1913
1914#ifdef CONFIG_SMP
1915
1916/* Used instead of source_load when we know the type == 0 */
1917static unsigned long weighted_cpuload(const int cpu)
1918{
1919        return cpu_rq(cpu)->load.weight;
1920}
1921
1922/*
1923 * Is this task likely cache-hot:
1924 */
1925static int
1926task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1927{
1928        s64 delta;
1929
1930        /*
1931         * Buddy candidates are cache hot:
1932         */
1933        if (sched_feat(CACHE_HOT_BUDDY) &&
1934                        (&p->se == cfs_rq_of(&p->se)->next ||
1935                         &p->se == cfs_rq_of(&p->se)->last))
1936                return 1;
1937
1938        if (p->sched_class != &fair_sched_class)
1939                return 0;
1940
1941        if (sysctl_sched_migration_cost == -1)
1942                return 1;
1943        if (sysctl_sched_migration_cost == 0)
1944                return 0;
1945
1946        delta = now - p->se.exec_start;
1947
1948        return delta < (s64)sysctl_sched_migration_cost;
1949}
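    /*
     * Editorial summary of the thresholds above: buddy entities (the
     * cfs_rq's next/last pointers) are always reported hot; for other
     * fair tasks a sysctl_sched_migration_cost of -1 treats everything
     * as cache hot, 0 treats nothing as hot, and any other value
     * compares the time since the task last started executing against
     * that cost.
     */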
1950
1951
1952void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1953{
1954        int old_cpu = task_cpu(p);
1955        struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1956        struct cfs_rq *old_cfsrq = task_cfs_rq(p),
1957                      *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
1958        u64 clock_offset;
1959
1960        clock_offset = old_rq->clock - new_rq->clock;
1961
1962        trace_sched_migrate_task(p, task_cpu(p), new_cpu);
1963
1964#ifdef CONFIG_SCHEDSTATS
1965        if (p->se.wait_start)
1966                p->se.wait_start -= clock_offset;
1967        if (p->se.sleep_start)
1968                p->se.sleep_start -= clock_offset;
1969        if (p->se.block_start)
1970                p->se.block_start -= clock_offset;
1971        if (old_cpu != new_cpu) {
1972                schedstat_inc(p, se.nr_migrations);
1973                if (task_hot(p, old_rq->clock, NULL))
1974                        schedstat_inc(p, se.nr_forced2_migrations);
1975        }
1976#endif
1977        p->se.vruntime -= old_cfsrq->min_vruntime -
1978                                         new_cfsrq->min_vruntime;
1979
1980        __set_task_cpu(p, new_cpu);
1981}
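    /*
     * Note on the vruntime adjustment above (editorial): vruntime is
     * kept relative to the old cfs_rq's min_vruntime and re-based
     * against the new one, so a migrating task carries its relative
     * position across rather than an absolute value that would be
     * meaningless on the destination runqueue.
     */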
1982
1983struct migration_req {
1984        struct list_head list;
1985
1986        struct task_struct *task;
1987        int dest_cpu;
1988
1989        struct completion done;
1990};
1991
1992/*
1993 * The task's runqueue lock must be held.
1994 * Returns true if you have to wait for migration thread.
1995 */
1996static int
1997migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1998{
1999        struct rq *rq = task_rq(p);
2000
2001        /*
2002         * If the task is not on a runqueue (and not running), then
2003         * it is sufficient to simply update the task's cpu field.
2004         */
2005        if (!p->se.on_rq && !task_running(rq, p)) {
2006                set_task_cpu(p, dest_cpu);
2007                return 0;
2008        }
2009
2010        init_completion(&req->done);
2011        req->task = p;
2012        req->dest_cpu = dest_cpu;
2013        list_add(&req->list, &rq->migration_queue);
2014
2015        return 1;
2016}
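    /*
     * Typical calling pattern (see sched_migrate_task() below): if this
     * returns 1, the caller drops the runqueue lock, wakes
     * rq->migration_thread and then waits on req->done.
     */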
2017
2018/*
2019 * wait_task_inactive - wait for a thread to unschedule.
2020 *
2021 * If @match_state is nonzero, it's the @p->state value just checked and
2022 * not expected to change.  If it changes, i.e. @p might have woken up,
2023 * then return zero.  When we succeed in waiting for @p to be off its CPU,
2024 * we return a positive number (its total switch count).  If a second call
2025 * a short while later returns the same number, the caller can be sure that
2026 * @p has remained unscheduled the whole time.
2027 *
2028 * The caller must ensure that the task *will* unschedule sometime soon,
2029 * else this function might spin for a *long* time. This function can't
2030 * be called with interrupts off, or it may introduce deadlock with
2031 * smp_call_function() if an IPI is sent by the same process we are
2032 * waiting to become inactive.
2033 */
2034unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2035{
2036        unsigned long flags;
2037        int running, on_rq;
2038        unsigned long ncsw;
2039        struct rq *rq;
2040
2041        for (;;) {
2042                /*
2043                 * We do the initial early heuristics without holding
2044                 * any task-queue locks at all. We'll only try to get
2045                 * the runqueue lock when things look like they will
2046                 * work out!
2047                 */
2048                rq = task_rq(p);
2049
2050                /*
2051                 * If the task is actively running on another CPU
2052                 * still, just relax and busy-wait without holding
2053                 * any locks.
2054                 *
2055                 * NOTE! Since we don't hold any locks, it's not even
2056                 * guaranteed that "rq" stays as the right runqueue!
2057                 * But we don't care, since "task_running()" will
2058                 * return false if the runqueue has changed and p
2059                 * is actually now running somewhere else!
2060                 */
2061                while (task_running(rq, p)) {
2062                        if (match_state && unlikely(p->state != match_state))
2063                                return 0;
2064                        cpu_relax();
2065                }
2066
2067                /*
2068                 * Ok, time to look more closely! We need the rq
2069                 * lock now, to be *sure*. If we're wrong, we'll
2070                 * just go back and repeat.
2071                 */
2072                rq = task_rq_lock(p, &flags);
2073                trace_sched_wait_task(rq, p);
2074                running = task_running(rq, p);
2075                on_rq = p->se.on_rq;
2076                ncsw = 0;
2077                if (!match_state || p->state == match_state)
2078                        ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2079                task_rq_unlock(rq, &flags);
2080
2081                /*
2082                 * If it changed from the expected state, bail out now.
2083                 */
2084                if (unlikely(!ncsw))
2085                        break;
2086
2087                /*
2088                 * Was it really running after all now that we
2089                 * checked with the proper locks actually held?
2090                 *
2091                 * Oops. Go back and try again..
2092                 */
2093                if (unlikely(running)) {
2094                        cpu_relax();
2095                        continue;
2096                }
2097
2098                /*
2099                 * It's not enough that it's not actively running,
2100                 * it must be off the runqueue _entirely_, and not
2101                 * preempted!
2102                 *
2103                 * So if it was still runnable (but just not actively
2104                 * running right now), it's preempted, and we should
2105                 * yield - it could be a while.
2106                 */
2107                if (unlikely(on_rq)) {
2108                        schedule_timeout_uninterruptible(1);
2109                        continue;
2110                }
2111
2112                /*
2113                 * Ahh, all good. It wasn't running, and it wasn't
2114                 * runnable, which means that it will never become
2115                 * running in the future either. We're all done!
2116                 */
2117                break;
2118        }
2119
2120        return ncsw;
2121}
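    /*
     * Usage sketch (hypothetical caller, following the comment above):
     *
     *	ncsw = wait_task_inactive(p, TASK_UNINTERRUPTIBLE);
     *	... inspect p ...
     *	if (wait_task_inactive(p, TASK_UNINTERRUPTIBLE) != ncsw)
     *		goto retry;	/* p got scheduled in between */
     *
     * Two calls returning the same nonzero switch count mean that p
     * stayed off the CPU for the whole interval.
     */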
2122
2123/***
2124 * kick_process - kick a running thread to enter/exit the kernel
2125 * @p: the to-be-kicked thread
2126 *
2127 * Cause a process which is running on another CPU to enter
2128 * kernel-mode without any delay (e.g. to get signals handled).
2129 *
2130 * NOTE: this function doesn't have to take the runqueue lock,
2131 * because all it wants to ensure is that the remote task enters
2132 * the kernel. If the IPI races and the task has been migrated
2133 * to another CPU then no harm is done and the purpose has been
2134 * achieved as well.
2135 */
2136void kick_process(struct task_struct *p)
2137{
2138        int cpu;
2139
2140        preempt_disable();
2141        cpu = task_cpu(p);
2142        if ((cpu != smp_processor_id()) && task_curr(p))
2143                smp_send_reschedule(cpu);
2144        preempt_enable();
2145}
2146
2147/*
2148 * Return a low guess at the load of a migration-source cpu weighted
2149 * according to the scheduling class and "nice" value.
2150 *
2151 * We want to under-estimate the load of migration sources, to
2152 * balance conservatively.
2153 */
2154static unsigned long source_load(int cpu, int type)
2155{
2156        struct rq *rq = cpu_rq(cpu);
2157        unsigned long total = weighted_cpuload(cpu);
2158
2159        if (type == 0 || !sched_feat(LB_BIAS))
2160                return total;
2161
2162        return min(rq->cpu_load[type-1], total);
2163}
2164
2165/*
2166 * Return a high guess at the load of a migration-target cpu weighted
2167 * according to the scheduling class and "nice" value.
2168 */
2169static unsigned long target_load(int cpu, int type)
2170{
2171        struct rq *rq = cpu_rq(cpu);
2172        unsigned long total = weighted_cpuload(cpu);
2173
2174        if (type == 0 || !sched_feat(LB_BIAS))
2175                return total;
2176
2177        return max(rq->cpu_load[type-1], total);
2178}
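    /*
     * Editorial note: source_load() and target_load() deliberately err
     * in opposite directions - the migration source is estimated with
     * min() (looks lighter) and the target with max() (looks heavier) -
     * so the balancing decisions stay conservative.
     */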
2179
2180/*
2181 * find_idlest_group finds and returns the least busy CPU group within the
2182 * domain.
2183 */
2184static struct sched_group *
2185find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2186{
2187        struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2188        unsigned long min_load = ULONG_MAX, this_load = 0;
2189        int load_idx = sd->forkexec_idx;
2190        int imbalance = 100 + (sd->imbalance_pct-100)/2;
2191
2192        do {
2193                unsigned long load, avg_load;
2194                int local_group;
2195                int i;
2196
2197                /* Skip over this group if it has no CPUs allowed */
2198                if (!cpumask_intersects(sched_group_cpus(group),
2199                                        &p->cpus_allowed))
2200                        continue;
2201
2202                local_group = cpumask_test_cpu(this_cpu,
2203                                               sched_group_cpus(group));
2204
2205                /* Tally up the load of all CPUs in the group */
2206                avg_load = 0;
2207
2208                for_each_cpu(i, sched_group_cpus(group)) {
2209                        /* Bias balancing toward cpus of our domain */
2210                        if (local_group)
2211                                load = source_load(i, load_idx);
2212                        else
2213                                load = target_load(i, load_idx);
2214
2215                        avg_load += load;
2216                }
2217
2218                /* Adjust by relative CPU power of the group */
2219                avg_load = sg_div_cpu_power(group,
2220                                avg_load * SCHED_LOAD_SCALE);
2221
2222                if (local_group) {
2223                        this_load = avg_load;
2224                        this = group;
2225                } else if (avg_load < min_load) {
2226                        min_load = avg_load;
2227                        idlest = group;
2228                }
2229        } while (group = group->next, group != sd->groups);
2230
2231        if (!idlest || 100*this_load < imbalance*min_load)
2232                return NULL;
2233        return idlest;
2234}
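    /*
     * Worked example of the final test above (assuming a typical
     * sd->imbalance_pct of 125, not guaranteed here): imbalance becomes
     * 100 + 25/2 == 112, so the idlest group is only returned when
     * 100 * this_load >= 112 * min_load, i.e. the local group carries
     * at least roughly 12% more load than the idlest one.
     */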
2235
2236/*
2237 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2238 */
2239static int
2240find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2241{
2242        unsigned long load, min_load = ULONG_MAX;
2243        int idlest = -1;
2244        int i;
2245
2246        /* Traverse only the allowed CPUs */
2247        for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2248                load = weighted_cpuload(i);
2249
2250                if (load < min_load || (load == min_load && i == this_cpu)) {
2251                        min_load = load;
2252                        idlest = i;
2253                }
2254        }
2255
2256        return idlest;
2257}
2258
2259/*
2260 * sched_balance_self: balance the current task (running on cpu) in domains
2261 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
2262 * SD_BALANCE_EXEC.
2263 *
2264 * Balance, i.e. select the least loaded group.
2265 *
2266 * Returns the target CPU number, or the same CPU if no balancing is needed.
2267 *
2268 * preempt must be disabled.
2269 */
2270static int sched_balance_self(int cpu, int flag)
2271{
2272        struct task_struct *t = current;
2273        struct sched_domain *tmp, *sd = NULL;
2274
2275        for_each_domain(cpu, tmp) {
2276                /*
2277                 * If power savings logic is enabled for a domain, stop there.
2278                 */
2279                if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2280                        break;
2281                if (tmp->flags & flag)
2282                        sd = tmp;
2283        }
2284
2285        if (sd)
2286                update_shares(sd);
2287
2288        while (sd) {
2289                struct sched_group *group;
2290                int new_cpu, weight;
2291
2292                if (!(sd->flags & flag)) {
2293                        sd = sd->child;
2294                        continue;
2295                }
2296
2297                group = find_idlest_group(sd, t, cpu);
2298                if (!group) {
2299                        sd = sd->child;
2300                        continue;
2301                }
2302
2303                new_cpu = find_idlest_cpu(group, t, cpu);
2304                if (new_cpu == -1 || new_cpu == cpu) {
2305                        /* Now try balancing at a lower domain level of cpu */
2306                        sd = sd->child;
2307                        continue;
2308                }
2309
2310                /* Now try balancing at a lower domain level of new_cpu */
2311                cpu = new_cpu;
2312                weight = cpumask_weight(sched_domain_span(sd));
2313                sd = NULL;
2314                for_each_domain(cpu, tmp) {
2315                        if (weight <= cpumask_weight(sched_domain_span(tmp)))
2316                                break;
2317                        if (tmp->flags & flag)
2318                                sd = tmp;
2319                }
2320                /* while loop will break here if sd == NULL */
2321        }
2322
2323        return cpu;
2324}
2325
2326#endif /* CONFIG_SMP */
2327
2328/***
2329 * try_to_wake_up - wake up a thread
2330 * @p: the to-be-woken-up thread
2331 * @state: the mask of task states that can be woken
2332 * @sync: do a synchronous wakeup?
2333 *
2334 * Put it on the run-queue if it's not already there. The "current"
2335 * thread is always on the run-queue (except when the actual
2336 * re-schedule is in progress), and as such you're allowed to do
2337 * the simpler "current->state = TASK_RUNNING" to mark yourself
2338 * runnable without the overhead of this.
2339 *
2340 * Returns failure only if the task is already active.
2341 */
2342static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2343{
2344        int cpu, orig_cpu, this_cpu, success = 0;
2345        unsigned long flags;
2346        long old_state;
2347        struct rq *rq;
2348
2349        if (!sched_feat(SYNC_WAKEUPS))
2350                sync = 0;
2351
2352#ifdef CONFIG_SMP
2353        if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2354                struct sched_domain *sd;
2355
2356                this_cpu = raw_smp_processor_id();
2357                cpu = task_cpu(p);
2358
2359                for_each_domain(this_cpu, sd) {
2360                        if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2361                                update_shares(sd);
2362                                break;
2363                        }
2364                }
2365        }
2366#endif
2367
2368        smp_wmb();
2369        rq = task_rq_lock(p, &flags);
2370        update_rq_clock(rq);
2371        old_state = p->state;
2372        if (!(old_state & state))
2373                goto out;
2374
2375        if (p->se.on_rq)
2376                goto out_running;
2377
2378        cpu = task_cpu(p);
2379        orig_cpu = cpu;
2380        this_cpu = smp_processor_id();
2381
2382#ifdef CONFIG_SMP
2383        if (unlikely(task_running(rq, p)))
2384                goto out_activate;
2385
2386        cpu = p->sched_class->select_task_rq(p, sync);
2387        if (cpu != orig_cpu) {
2388                set_task_cpu(p, cpu);
2389                task_rq_unlock(rq, &flags);
2390                /* might preempt at this point */
2391                rq = task_rq_lock(p, &flags);
2392                old_state = p->state;
2393                if (!(old_state & state))
2394                        goto out;
2395                if (p->se.on_rq)
2396                        goto out_running;
2397
2398                this_cpu = smp_processor_id();
2399                cpu = task_cpu(p);
2400        }
2401
2402#ifdef CONFIG_SCHEDSTATS
2403        schedstat_inc(rq, ttwu_count);
2404        if (cpu == this_cpu)
2405                schedstat_inc(rq, ttwu_local);
2406        else {
2407                struct sched_domain *sd;
2408                for_each_domain(this_cpu, sd) {
2409                        if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2410                                schedstat_inc(sd, ttwu_wake_remote);
2411                                break;
2412                        }
2413                }
2414        }
2415#endif /* CONFIG_SCHEDSTATS */
2416
2417out_activate:
2418#endif /* CONFIG_SMP */
2419        schedstat_inc(p, se.nr_wakeups);
2420        if (sync)
2421                schedstat_inc(p, se.nr_wakeups_sync);
2422        if (orig_cpu != cpu)
2423                schedstat_inc(p, se.nr_wakeups_migrate);
2424        if (cpu == this_cpu)
2425                schedstat_inc(p, se.nr_wakeups_local);
2426        else
2427                schedstat_inc(p, se.nr_wakeups_remote);
2428        activate_task(rq, p, 1);
2429        success = 1;
2430
2431        /*
2432         * Only attribute actual wakeups done by this task.
2433         */
2434        if (!in_interrupt()) {
2435                struct sched_entity *se = &current->se;
2436                u64 sample = se->sum_exec_runtime;
2437
2438                if (se->last_wakeup)
2439                        sample -= se->last_wakeup;
2440                else
2441                        sample -= se->start_runtime;
2442                update_avg(&se->avg_wakeup, sample);
2443
2444                se->last_wakeup = se->sum_exec_runtime;
2445        }
2446
2447out_running:
2448        trace_sched_wakeup(rq, p, success);
2449        check_preempt_curr(rq, p, sync);
2450
2451        p->state = TASK_RUNNING;
2452#ifdef CONFIG_SMP
2453        if (p->sched_class->task_wake_up)
2454                p->sched_class->task_wake_up(rq, p);
2455#endif
2456out:
2457        task_rq_unlock(rq, &flags);
2458
2459        return success;
2460}
2461
2462int wake_up_process(struct task_struct *p)
2463{
2464        return try_to_wake_up(p, TASK_ALL, 0);
2465}
2466EXPORT_SYMBOL(wake_up_process);
2467
2468int wake_up_state(struct task_struct *p, unsigned int state)
2469{
2470        return try_to_wake_up(p, state, 0);
2471}
2472
2473/*
2474 * Perform scheduler related setup for a newly forked process p.
2475 * p is forked by current.
2476 *
2477 * __sched_fork() is basic setup used by init_idle() too:
2478 */
2479static void __sched_fork(struct task_struct *p)
2480{
2481        p->se.exec_start                = 0;
2482        p->se.sum_exec_runtime          = 0;
2483        p->se.prev_sum_exec_runtime     = 0;
2484        p->se.last_wakeup               = 0;
2485        p->se.avg_overlap               = 0;
2486        p->se.start_runtime             = 0;
2487        p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
2488
2489#ifdef CONFIG_SCHEDSTATS
2490        p->se.wait_start                = 0;
2491        p->se.sum_sleep_runtime         = 0;
2492        p->se.sleep_start               = 0;
2493        p->se.block_start               = 0;
2494        p->se.sleep_max                 = 0;
2495        p->se.block_max                 = 0;
2496        p->se.exec_max                  = 0;
2497        p->se.slice_max                 = 0;
2498        p->se.wait_max                  = 0;
2499#endif
2500
2501        INIT_LIST_HEAD(&p->rt.run_list);
2502        p->se.on_rq = 0;
2503        INIT_LIST_HEAD(&p->se.group_node);
2504
2505#ifdef CONFIG_PREEMPT_NOTIFIERS
2506        INIT_HLIST_HEAD(&p->preempt_notifiers);
2507#endif
2508
2509        /*
2510         * We mark the process as running here, but have not actually
2511         * inserted it onto the runqueue yet. This guarantees that
2512         * nobody will actually run it, and a signal or other external
2513         * event cannot wake it up and insert it on the runqueue either.
2514         */
2515        p->state = TASK_RUNNING;
2516}
2517
2518/*
2519 * fork()/clone()-time setup:
2520 */
2521void sched_fork(struct task_struct *p, int clone_flags)
2522{
2523        int cpu = get_cpu();
2524
2525        __sched_fork(p);
2526
2527#ifdef CONFIG_SMP
2528        cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2529#endif
2530        set_task_cpu(p, cpu);
2531
2532        /*
2533         * Make sure we do not leak PI boosting priority to the child:
2534         */
2535        p->prio = current->normal_prio;
2536        if (!rt_prio(p->prio))
2537                p->sched_class = &fair_sched_class;
2538
2539#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2540        if (likely(sched_info_on()))
2541                memset(&p->sched_info, 0, sizeof(p->sched_info));
2542#endif
2543#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2544        p->oncpu = 0;
2545#endif
2546#ifdef CONFIG_PREEMPT
2547        /* Want to start with kernel preemption disabled. */
2548        task_thread_info(p)->preempt_count = 1;
2549#endif
2550        plist_node_init(&p->pushable_tasks, MAX_PRIO);
2551
2552        put_cpu();
2553}
2554
2555/*
2556 * wake_up_new_task - wake up a newly created task for the first time.
2557 *
2558 * This function will do some initial scheduler statistics housekeeping
2559 * that must be done for every newly created context, then puts the task
2560 * on the runqueue and wakes it.
2561 */
2562void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2563{
2564        unsigned long flags;
2565        struct rq *rq;
2566
2567        rq = task_rq_lock(p, &flags);
2568        BUG_ON(p->state != TASK_RUNNING);
2569        update_rq_clock(rq);
2570
2571        p->prio = effective_prio(p);
2572
2573        if (!p->sched_class->task_new || !current->se.on_rq) {
2574                activate_task(rq, p, 0);
2575        } else {
2576                /*
2577                 * Let the scheduling class do new task startup
2578                 * management (if any):
2579                 */
2580                p->sched_class->task_new(rq, p);
2581                inc_nr_running(rq);
2582        }
2583        trace_sched_wakeup_new(rq, p, 1);
2584        check_preempt_curr(rq, p, 0);
2585#ifdef CONFIG_SMP
2586        if (p->sched_class->task_wake_up)
2587                p->sched_class->task_wake_up(rq, p);
2588#endif
2589        task_rq_unlock(rq, &flags);
2590}
2591
2592#ifdef CONFIG_PREEMPT_NOTIFIERS
2593
2594/**
2595 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2596 * @notifier: notifier struct to register
2597 */
2598void preempt_notifier_register(struct preempt_notifier *notifier)
2599{
2600        hlist_add_head(&notifier->link, &current->preempt_notifiers);
2601}
2602EXPORT_SYMBOL_GPL(preempt_notifier_register);
2603
2604/**
2605 * preempt_notifier_unregister - no longer interested in preemption notifications
2606 * @notifier: notifier struct to unregister
2607 *
2608 * This is safe to call from within a preemption notifier.
2609 */
2610void preempt_notifier_unregister(struct preempt_notifier *notifier)
2611{
2612        hlist_del(&notifier->link);
2613}
2614EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2615
2616static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2617{
2618        struct preempt_notifier *notifier;
2619        struct hlist_node *node;
2620
2621        hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2622                notifier->ops->sched_in(notifier, raw_smp_processor_id());
2623}
2624
2625static void
2626fire_sched_out_preempt_notifiers(struct task_struct *curr,
2627                                 struct task_struct *next)
2628{
2629        struct preempt_notifier *notifier;
2630        struct hlist_node *node;
2631
2632        hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2633                notifier->ops->sched_out(notifier, next);
2634}
2635
2636#else /* !CONFIG_PREEMPT_NOTIFIERS */
2637
2638static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2639{
2640}
2641
2642static void
2643fire_sched_out_preempt_notifiers(struct task_struct *curr,
2644                                 struct task_struct *next)
2645{
2646}
2647
2648#endif /* CONFIG_PREEMPT_NOTIFIERS */
2649
2650/**
2651 * prepare_task_switch - prepare to switch tasks
2652 * @rq: the runqueue preparing to switch
2653 * @prev: the current task that is being switched out
2654 * @next: the task we are going to switch to.
2655 *
2656 * This is called with the rq lock held and interrupts off. It must
2657 * be paired with a subsequent finish_task_switch after the context
2658 * switch.
2659 *
2660 * prepare_task_switch sets up locking and calls architecture specific
2661 * hooks.
2662 */
2663static inline void
2664prepare_task_switch(struct rq *rq, struct task_struct *prev,
2665                    struct task_struct *next)
2666{
2667        fire_sched_out_preempt_notifiers(prev, next);
2668        prepare_lock_switch(rq, next);
2669        prepare_arch_switch(next);
2670}
2671
2672/**
2673 * finish_task_switch - clean up after a task-switch
2674 * @rq: runqueue associated with task-switch
2675 * @prev: the thread we just switched away from.
2676 *
2677 * finish_task_switch must be called after the context switch, paired
2678 * with a prepare_task_switch call before the context switch.
2679 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2680 * and do any other architecture-specific cleanup actions.
2681 *
2682 * Note that we may have delayed dropping an mm in context_switch(). If
2683 * so, we finish that here outside of the runqueue lock. (Doing it
2684 * with the lock held can cause deadlocks; see schedule() for
2685 * details.)
2686 */
2687static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2688        __releases(rq->lock)
2689{
2690        struct mm_struct *mm = rq->prev_mm;
2691        long prev_state;
2692#ifdef CONFIG_SMP
2693        int post_schedule = 0;
2694
2695        if (current->sched_class->needs_post_schedule)
2696                post_schedule = current->sched_class->needs_post_schedule(rq);
2697#endif
2698
2699        rq->prev_mm = NULL;
2700
2701        /*
2702         * A task struct has one reference for its use as "current".
2703         * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2704         * schedule one last time. The schedule call will never return, and
2705         * the scheduled task must drop that reference.
2706         * The test for TASK_DEAD must occur while the runqueue locks are
2707         * still held, otherwise prev could be scheduled on another cpu, die
2708         * there before we look at prev->state, and then the reference would
2709         * be dropped twice.
2710         *              Manfred Spraul <manfred@colorfullife.com>
2711         */
2712        prev_state = prev->state;
2713        finish_arch_switch(prev);
2714        finish_lock_switch(rq, prev);
2715#ifdef CONFIG_SMP
2716        if (post_schedule)
2717                current->sched_class->post_schedule(rq);
2718#endif
2719
2720        fire_sched_in_preempt_notifiers(current);
2721        if (mm)
2722                mmdrop(mm);
2723        if (unlikely(prev_state == TASK_DEAD)) {
2724                /*
2725                 * Remove function-return probe instances associated with this
2726                 * task and put them back on the free list.
2727                 */
2728                kprobe_flush_task(prev);
2729                put_task_struct(prev);
2730        }
2731}
2732
2733/**
2734 * schedule_tail - first thing a freshly forked thread must call.
2735 * @prev: the thread we just switched away from.
2736 */
2737asmlinkage void schedule_tail(struct task_struct *prev)
2738        __releases(rq->lock)
2739{
2740        struct rq *rq = this_rq();
2741
2742        finish_task_switch(rq, prev);
2743#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2744        /* In this case, finish_task_switch does not reenable preemption */
2745        preempt_enable();
2746#endif
2747        if (current->set_child_tid)
2748                put_user(task_pid_vnr(current), current->set_child_tid);
2749}
2750
2751/*
2752 * context_switch - switch to the new MM and the new
2753 * thread's register state.
2754 */
2755static inline void
2756context_switch(struct rq *rq, struct task_struct *prev,
2757               struct task_struct *next)
2758{
2759        struct mm_struct *mm, *oldmm;
2760
2761        prepare_task_switch(rq, prev, next);
2762        trace_sched_switch(rq, prev, next);
2763        mm = next->mm;
2764        oldmm = prev->active_mm;
2765        /*
2766         * For paravirt, this is coupled with an exit in switch_to to
2767         * combine the page table reload and the switch backend into
2768         * one hypercall.
2769         */
2770        arch_enter_lazy_cpu_mode();
2771
2772        if (unlikely(!mm)) {
2773                next->active_mm = oldmm;
2774                atomic_inc(&oldmm->mm_count);
2775                enter_lazy_tlb(oldmm, next);
2776        } else
2777                switch_mm(oldmm, mm, next);
2778
2779        if (unlikely(!prev->mm)) {
2780                prev->active_mm = NULL;
2781                rq->prev_mm = oldmm;
2782        }
2783        /*
2784         * The runqueue lock will be released by the next
2785         * task (which is an invalid locking op, but in the case
2786         * of the scheduler it's an obvious special-case), so we
2787         * do an early lockdep release here:
2788         */
2789#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2790        spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2791#endif
2792
2793        /* Here we just switch the register state and the stack. */
2794        switch_to(prev, next, prev);
2795
2796        barrier();
2797        /*
2798         * this_rq must be evaluated again because prev may have moved
2799         * CPUs since it called schedule(), thus the 'rq' on its stack
2800         * frame will be invalid.
2801         */
2802        finish_task_switch(this_rq(), prev);
2803}
2804
2805/*
2806 * nr_running, nr_uninterruptible and nr_context_switches:
2807 *
2808 * externally visible scheduler statistics: current number of runnable
2809 * threads, current number of uninterruptible-sleeping threads, total
2810 * number of context switches performed since bootup.
2811 */
2812unsigned long nr_running(void)
2813{
2814        unsigned long i, sum = 0;
2815
2816        for_each_online_cpu(i)
2817                sum += cpu_rq(i)->nr_running;
2818
2819        return sum;
2820}
2821
2822unsigned long nr_uninterruptible(void)
2823{
2824        unsigned long i, sum = 0;
2825
2826        for_each_possible_cpu(i)
2827                sum += cpu_rq(i)->nr_uninterruptible;
2828
2829        /*
2830         * Since we read the counters locklessly, the sum might be
2831         * slightly inaccurate. Do not allow it to go below zero though:
2832         */
2833        if (unlikely((long)sum < 0))
2834                sum = 0;
2835
2836        return sum;
2837}
2838
2839unsigned long long nr_context_switches(void)
2840{
2841        int i;
2842        unsigned long long sum = 0;
2843
2844        for_each_possible_cpu(i)
2845                sum += cpu_rq(i)->nr_switches;
2846
2847        return sum;
2848}
2849
2850unsigned long nr_iowait(void)
2851{
2852        unsigned long i, sum = 0;
2853
2854        for_each_possible_cpu(i)
2855                sum += atomic_read(&cpu_rq(i)->nr_iowait);
2856
2857        return sum;
2858}
2859
2860unsigned long nr_active(void)
2861{
2862        unsigned long i, running = 0, uninterruptible = 0;
2863
2864        for_each_online_cpu(i) {
2865                running += cpu_rq(i)->nr_running;
2866                uninterruptible += cpu_rq(i)->nr_uninterruptible;
2867        }
2868
2869        if (unlikely((long)uninterruptible < 0))
2870                uninterruptible = 0;
2871
2872        return running + uninterruptible;
2873}
2874
2875/*
2876 * Update rq->cpu_load[] statistics. This function is usually called every
2877 * scheduler tick (TICK_NSEC).
2878 */
2879static void update_cpu_load(struct rq *this_rq)
2880{
2881        unsigned long this_load = this_rq->load.weight;
2882        int i, scale;
2883
2884        this_rq->nr_load_updates++;
2885
2886        /* Update our load: */
2887        for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2888                unsigned long old_load, new_load;
2889
2890                /* scale is effectively 1 << i now, and >> i divides by scale */
2891
2892                old_load = this_rq->cpu_load[i];
2893                new_load = this_load;
2894                /*
2895                 * Round up the averaging division if load is increasing. This
2896                 * prevents us from getting stuck on 9 if the load is 10, for
2897                 * example.
2898                 */
2899                if (new_load > old_load)
2900                        new_load += scale-1;
2901                this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2902        }
2903}
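    /*
     * Worked example of the decay above (editorial): for i == 2,
     * scale == 4, so cpu_load[2] = (3*old_load + new_load) / 4. Higher
     * indices therefore keep a longer load history, while cpu_load[0]
     * tracks the instantaneous load.
     */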
2904
2905#ifdef CONFIG_SMP
2906
2907/*
2908 * double_rq_lock - safely lock two runqueues
2909 *
2910 * Note this does not disable interrupts like task_rq_lock,
2911 * you need to do so manually before calling.
2912 */
2913static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2914        __acquires(rq1->lock)
2915        __acquires(rq2->lock)
2916{
2917        BUG_ON(!irqs_disabled());
2918        if (rq1 == rq2) {
2919                spin_lock(&rq1->lock);
2920                __acquire(rq2->lock);   /* Fake it out ;) */
2921        } else {
2922                if (rq1 < rq2) {
2923                        spin_lock(&rq1->lock);
2924                        spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
2925                } else {
2926                        spin_lock(&rq2->lock);
2927                        spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
2928                }
2929        }
2930        update_rq_clock(rq1);
2931        update_rq_clock(rq2);
2932}
2933
2934/*
2935 * double_rq_unlock - safely unlock two runqueues
2936 *
2937 * Note this does not restore interrupts like task_rq_unlock,
2938 * you need to do so manually after calling.
2939 */
2940static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2941        __releases(rq1->lock)
2942        __releases(rq2->lock)
2943{
2944        spin_unlock(&rq1->lock);
2945        if (rq1 != rq2)
2946                spin_unlock(&rq2->lock);
2947        else
2948                __release(rq2->lock);
2949}
2950
2951/*
2952 * If dest_cpu is allowed for this process, migrate the task to it.
2953 * This is accomplished by queueing a migration request on the task's
2954 * runqueue and waking the migration thread, then waiting for the
2955 * request to complete.
2956 */
2957static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2958{
2959        struct migration_req req;
2960        unsigned long flags;
2961        struct rq *rq;
2962
2963        rq = task_rq_lock(p, &flags);
2964        if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
2965            || unlikely(!cpu_active(dest_cpu)))
2966                goto out;
2967
2968        /* force the process onto the specified CPU */
2969        if (migrate_task(p, dest_cpu, &req)) {
2970                /* Need to wait for migration thread (might exit: take ref). */
2971                struct task_struct *mt = rq->migration_thread;
2972
2973                get_task_struct(mt);
2974                task_rq_unlock(rq, &flags);
2975                wake_up_process(mt);
2976                put_task_struct(mt);
2977                wait_for_completion(&req.done);
2978
2979                return;
2980        }
2981out:
2982        task_rq_unlock(rq, &flags);
2983}
2984
2985/*
2986 * sched_exec - execve() is a valuable balancing opportunity, because at
2987 * this point the task has the smallest effective memory and cache footprint.
2988 */
2989void sched_exec(void)
2990{
2991        int new_cpu, this_cpu = get_cpu();
2992        new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2993        put_cpu();
2994        if (new_cpu != this_cpu)
2995                sched_migrate_task(current, new_cpu);
2996}
2997
2998/*
2999 * pull_task - move a task from a remote runqueue to the local runqueue.
3000 * Both runqueues must be locked.
3001 */
3002static void pull_task(struct rq *src_rq, struct task_struct *p,
3003                      struct rq *this_rq, int this_cpu)
3004{
3005        deactivate_task(src_rq, p, 0);
3006        set_task_cpu(p, this_cpu);
3007        activate_task(this_rq, p, 0);
3008        /*
3009         * Note that idle threads have a prio of MAX_PRIO, so this test
3010         * is always true for them.
3011         */
3012        check_preempt_curr(this_rq, p, 0);
3013}
3014
3015/*
3016 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3017 */
3018static
3019int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3020                     struct sched_domain *sd, enum cpu_idle_type idle,
3021                     int *all_pinned)
3022{
3023        int tsk_cache_hot = 0;
3024        /*
3025         * We do not migrate tasks that are:
3026         * 1) running (obviously), or
3027         * 2) cannot be migrated to this CPU due to cpus_allowed, or
3028         * 3) are cache-hot on their current CPU.
3029         */
3030        if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3031                schedstat_inc(p, se.nr_failed_migrations_affine);
3032                return 0;
3033        }
3034        *all_pinned = 0;
3035
3036        if (task_running(rq, p)) {
3037                schedstat_inc(p, se.nr_failed_migrations_running);
3038                return 0;
3039        }
3040
3041        /*
3042         * Aggressive migration if:
3043         * 1) task is cache cold, or
3044         * 2) too many balance attempts have failed.
3045         */
3046
3047        tsk_cache_hot = task_hot(p, rq->clock, sd);
3048        if (!tsk_cache_hot ||
3049                sd->nr_balance_failed > sd->cache_nice_tries) {
3050#ifdef CONFIG_SCHEDSTATS
3051                if (tsk_cache_hot) {
3052                        schedstat_inc(sd, lb_hot_gained[idle]);
3053                        schedstat_inc(p, se.nr_forced_migrations);
3054                }
3055#endif
3056                return 1;
3057        }
3058
3059        if (tsk_cache_hot) {
3060                schedstat_inc(p, se.nr_failed_migrations_hot);
3061                return 0;
3062        }
3063        return 1;
3064}
3065
3066static unsigned long
3067balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3068              unsigned long max_load_move, struct sched_domain *sd,
3069              enum cpu_idle_type idle, int *all_pinned,
3070              int *this_best_prio, struct rq_iterator *iterator)
3071{
3072        int loops = 0, pulled = 0, pinned = 0;
3073        struct task_struct *p;
3074        long rem_load_move = max_load_move;
3075
3076        if (max_load_move == 0)
3077                goto out;
3078
3079        pinned = 1;
3080
3081        /*
3082         * Start the load-balancing iterator:
3083         */
3084        p = iterator->start(iterator->arg);
3085next:
3086        if (!p || loops++ > sysctl_sched_nr_migrate)
3087                goto out;
3088
3089        if ((p->se.load.weight >> 1) > rem_load_move ||
3090            !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3091                p = iterator->next(iterator->arg);
3092                goto next;
3093        }
3094
3095        pull_task(busiest, p, this_rq, this_cpu);
3096        pulled++;
3097        rem_load_move -= p->se.load.weight;
3098
3099#ifdef CONFIG_PREEMPT
3100        /*
3101         * NEWIDLE balancing is a source of latency, so preemptible kernels
3102         * will stop after the first task is pulled to minimize the critical
3103         * section.
3104         */
3105        if (idle == CPU_NEWLY_IDLE)
3106                goto out;
3107#endif
3108
3109        /*
3110         * We only want to steal up to the prescribed amount of weighted load.
3111         */
3112        if (rem_load_move > 0) {
3113                if (p->prio < *this_best_prio)
3114                        *this_best_prio = p->prio;
3115                p = iterator->next(iterator->arg);
3116                goto next;
3117        }
3118out:
3119        /*
3120         * Right now, this is one of only two places pull_task() is called,
3121         * so we can safely collect pull_task() stats here rather than
3122         * inside pull_task().
3123         */
3124        schedstat_add(sd, lb_gained[idle], pulled);
3125
3126        if (all_pinned)
3127                *all_pinned = pinned;
3128
3129        return max_load_move - rem_load_move;
3130}
3131
3132/*
3133 * move_tasks tries to move up to max_load_move weighted load from busiest to
3134 * this_rq, as part of a balancing operation within domain "sd".
3135 * Returns 1 if successful and 0 otherwise.
3136 *
3137 * Called with both runqueues locked.
3138 */
3139static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3140                      unsigned long max_load_move,
3141                      struct sched_domain *sd, enum cpu_idle_type idle,
3142                      int *all_pinned)
3143{
3144        const struct sched_class *class = sched_class_highest;
3145        unsigned long total_load_moved = 0;
3146        int this_best_prio = this_rq->curr->prio;
3147
3148        do {
3149                total_load_moved +=
3150                        class->load_balance(this_rq, this_cpu, busiest,
3151                                max_load_move - total_load_moved,
3152                                sd, idle, all_pinned, &this_best_prio);
3153                class = class->next;
3154
3155#ifdef CONFIG_PREEMPT
3156                /*
3157                 * NEWIDLE balancing is a source of latency, so preemptible
3158                 * kernels will stop after the first task is pulled to minimize
3159                 * the critical section.
3160                 */
3161                if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3162                        break;
3163#endif
3164        } while (class && max_load_move > total_load_moved);
3165
3166        return total_load_moved > 0;
3167}
3168
3169static int
3170iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3171                   struct sched_domain *sd, enum cpu_idle_type idle,
3172                   struct rq_iterator *iterator)
3173{
3174        struct task_struct *p = iterator->start(iterator->arg);
3175        int pinned = 0;
3176
3177        while (p) {
3178                if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3179                        pull_task(busiest, p, this_rq, this_cpu);
3180                        /*
3181                         * Right now, this is only the second place pull_task()
3182                         * is called, so we can safely collect pull_task()
3183                         * stats here rather than inside pull_task().
3184                         */
3185                        schedstat_inc(sd, lb_gained[idle]);
3186
3187                        return 1;
3188                }
3189                p = iterator->next(iterator->arg);
3190        }
3191
3192        return 0;
3193}
3194
3195/*
3196 * move_one_task tries to move exactly one task from busiest to this_rq, as
3197 * part of active balancing operations within "domain".
3198 * Returns 1 if successful and 0 otherwise.
3199 *
3200 * Called with both runqueues locked.
3201 */
3202static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3203                         struct sched_domain *sd, enum cpu_idle_type idle)
3204{
3205        const struct sched_class *class;
3206
3207        for (class = sched_class_highest; class; class = class->next)
3208                if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3209                        return 1;
3210
3211        return 0;
3212}
3213/********** Helpers for find_busiest_group ************************/
3214/*
3215 * sd_lb_stats - Structure to store the statistics of a sched_domain
3216 *              during load balancing.
3217 */
3218struct sd_lb_stats {
3219        struct sched_group *busiest; /* Busiest group in this sd */
3220        struct sched_group *this;  /* Local group in this sd */
3221        unsigned long total_load;  /* Total load of all groups in sd */
3222        unsigned long total_pwr;   /*   Total power of all groups in sd */
3223        unsigned long avg_load;    /* Average load across all groups in sd */
3224
3225        /** Statistics of this group */
3226        unsigned long this_load;
3227        unsigned long this_load_per_task;
3228        unsigned long this_nr_running;
3229
3230        /* Statistics of the busiest group */
3231        unsigned long max_load;
3232        unsigned long busiest_load_per_task;
3233        unsigned long busiest_nr_running;
3234
3235        int group_imb; /* Is there an imbalance in this sd? */
3236#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3237        int power_savings_balance; /* Is powersave balance needed for this sd */
3238        struct sched_group *group_min; /* Least loaded group in sd */
3239        struct sched_group *group_leader; /* Group which relieves group_min */
3240        unsigned long min_load_per_task; /* load_per_task in group_min */
3241        unsigned long leader_nr_running; /* Nr running of group_leader */
3242        unsigned long min_nr_running; /* Nr running of group_min */
3243#endif
3244};
3245
3246/*
3247 * sg_lb_stats - stats of a sched_group required for load_balancing
3248 */
3249struct sg_lb_stats {
3250        unsigned long avg_load; /* Avg load across the CPUs of the group */
3251        unsigned long group_load; /* Total load over the CPUs of the group */
3252        unsigned long sum_nr_running; /* Nr tasks running in the group */
3253        unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3254        unsigned long group_capacity;
3255        int group_imb; /* Is there an imbalance in the group? */
3256};
3257
3258/**
3259 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3260 * @group: The group whose first cpu is to be returned.
3261 */
3262static inline unsigned int group_first_cpu(struct sched_group *group)
3263{
3264        return cpumask_first(sched_group_cpus(group));
3265}
3266
3267/**
3268 * get_sd_load_idx - Obtain the load index for a given sched domain.
3269 * @sd: The sched_domain whose load_idx is to be obtained.
3270 * @idle: The idle status of the CPU for whose sd the load_idx is obtained.
3271 */
3272static inline int get_sd_load_idx(struct sched_domain *sd,
3273                                        enum cpu_idle_type idle)
3274{
3275        int load_idx;
3276
3277        switch (idle) {
3278        case CPU_NOT_IDLE:
3279                load_idx = sd->busy_idx;
3280                break;
3281
3282        case CPU_NEWLY_IDLE:
3283                load_idx = sd->newidle_idx;
3284                break;
3285        default:
3286                load_idx = sd->idle_idx;
3287                break;
3288        }
3289
3290        return load_idx;
3291}
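    /*
     * Editorial note: the returned index selects which rq->cpu_load[]
     * history (see update_cpu_load() above) is consulted via
     * source_load()/target_load() during balancing, so busy, newly-idle
     * and idle CPUs can be biased toward different load histories
     * through the per-domain *_idx settings.
     */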
3292
3293
3294#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3295/**
3296 * init_sd_power_savings_stats - Initialize power savings statistics for
3297 * the given sched_domain, during load balancing.
3298 *
3299 * @sd: Sched domain whose power-savings statistics are to be initialized.
3300 * @sds: Variable containing the statistics for sd.
3301 * @idle: Idle status of the CPU at which we're performing load-balancing.
3302 */
3303static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3304        struct sd_lb_stats *sds, enum cpu_idle_type idle)
3305{
3306        /*
3307         * Busy processors will not participate in power savings
3308         * balance.
3309         */
3310        if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3311                sds->power_savings_balance = 0;
3312        else {
3313                sds->power_savings_balance = 1;
3314                sds->min_nr_running = ULONG_MAX;
3315                sds->leader_nr_running = 0;
3316        }
3317}
3318
3319/**
3320 * update_sd_power_savings_stats - Update the power saving stats for a
3321 * sched_domain while performing load balancing.
3322 *
3323 * @group: sched_group belonging to the sched_domain under consideration.
3324 * @sds: Variable containing the statistics of the sched_domain
3325 * @local_group: Does group contain the CPU for which we're performing
3326 *              load balancing?
3327 * @sgs: Variable containing the statistics of the group.
3328 */
3329static inline void update_sd_power_savings_stats(struct sched_group *group,
3330        struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3331{
3332
3333        if (!sds->power_savings_balance)
3334                return;
3335
3336        /*
3337         * If the local group is idle or completely loaded, there is
3338         * no need to do power savings balance at this domain.
3339         */
3340        if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3341                                !sds->this_nr_running))
3342                sds->power_savings_balance = 0;
3343
3344        /*
3345         * If a group is already running at full capacity or idle,
3346         * don't include that group in power savings calculations
3347         */
3348        if (!sds->power_savings_balance ||
3349                sgs->sum_nr_running >= sgs->group_capacity ||
3350                !sgs->sum_nr_running)
3351                return;
3352
3353        /*
3354         * Calculate the group which has the least non-idle load.
3355         * This is the group from which we need to pick up the load
3356         * for saving power.
3357         */
3358        if ((sgs->sum_nr_running < sds->min_nr_running) ||
3359            (sgs->sum_nr_running == sds->min_nr_running &&
3360             group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3361                sds->group_min = group;
3362                sds->min_nr_running = sgs->sum_nr_running;
3363                sds->min_load_per_task = sgs->sum_weighted_load /
3364                                                sgs->sum_nr_running;
3365        }
3366
3367        /*
3368         * Calculate the group which is nearly at its
3369         * capacity but still has some space to pick up load
3370         * from another group and save more power.
3371         */
3372        if (sgs->sum_nr_running > sgs->group_capacity - 1)
3373                return;
3374
3375        if (sgs->sum_nr_running > sds->leader_nr_running ||
3376            (sgs->sum_nr_running == sds->leader_nr_running &&
3377             group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3378                sds->group_leader = group;
3379                sds->leader_nr_running = sgs->sum_nr_running;
3380        }
3381}
3382
3383/**
3384 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3385 * @sds: Variable containing the statistics of the sched_domain
3386 *      under consideration.
3387 * @this_cpu: Cpu at which we're currently performing load-balancing.
3388 * @imbalance: Variable to store the imbalance.
3389 *
3390 * Description:
3391 * Check if we have potential to perform some power-savings balance.
3392 * If yes, set the busiest group to be the least loaded group in the
3393 * sched_domain, so that its CPUs can be put to idle.
3394 *
3395 * Returns 1 if there is potential to perform power-savings balance.
3396 * Else returns 0.
3397 */
3398static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3399                                        int this_cpu, unsigned long *imbalance)
3400{
3401        if (!sds->power_savings_balance)
3402                return 0;
3403
3404        if (sds->this != sds->group_leader ||
3405                        sds->group_leader == sds->group_min)
3406                return 0;
3407
3408        *imbalance = sds->min_load_per_task;
3409        sds->busiest = sds->group_min;
3410
3411        if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3412                cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3413                        group_first_cpu(sds->group_leader);
3414        }
3415
3416        return 1;
3417
3418}
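/*
 * A hypothetical example (illustrative values only): on a two-package box
 * with four cores per package (group_capacity == 4) and
 * SD_POWERSAVINGS_BALANCE enabled, a package running three tasks becomes
 * group_leader (nearly full, but still with room for one more), while a
 * package running a single task becomes group_min.  When this_cpu sits in
 * the leader group, check_power_save_busiest_group() reports group_min as
 * the busiest group with an imbalance of that lone task's load, so the task
 * is pulled over and the nearly idle package's CPUs can go idle.
 */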
3419#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3420static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3421        struct sd_lb_stats *sds, enum cpu_idle_type idle)
3422{
3423        return;
3424}
3425
3426static inline void update_sd_power_savings_stats(struct sched_group *group,
3427        struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3428{
3429        return;
3430}
3431
3432static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3433                                        int this_cpu, unsigned long *imbalance)
3434{
3435        return 0;
3436}
3437#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3438
3439
3440/**
3441 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3442 * @group: sched_group whose statistics are to be updated.
3443 * @this_cpu: Cpu for which load balance is currently performed.
3444 * @idle: Idle status of this_cpu
3445 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3446 * @sd_idle: Idle status of the sched_domain containing group.
3447 * @local_group: Does group contain this_cpu.
3448 * @cpus: Set of cpus considered for load balancing.
3449 * @balance: Should we balance.
3450 * @sgs: variable to hold the statistics for this group.
3451 */
3452static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3453                        enum cpu_idle_type idle, int load_idx, int *sd_idle,
3454                        int local_group, const struct cpumask *cpus,
3455                        int *balance, struct sg_lb_stats *sgs)
3456{
3457        unsigned long load, max_cpu_load, min_cpu_load;
3458        int i;
3459        unsigned int balance_cpu = -1, first_idle_cpu = 0;
3460        unsigned long sum_avg_load_per_task;
3461        unsigned long avg_load_per_task;
3462
3463        if (local_group)
3464                balance_cpu = group_first_cpu(group);
3465
3466        /* Tally up the load of all CPUs in the group */
3467        sum_avg_load_per_task = avg_load_per_task = 0;
3468        max_cpu_load = 0;
3469        min_cpu_load = ~0UL;
3470
3471        for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3472                struct rq *rq = cpu_rq(i);
3473
3474                if (*sd_idle && rq->nr_running)
3475                        *sd_idle = 0;
3476
3477                /* Bias balancing toward cpus of our domain */
3478                if (local_group) {
3479                        if (idle_cpu(i) && !first_idle_cpu) {
3480                                first_idle_cpu = 1;
3481                                balance_cpu = i;
3482                        }
3483
3484                        load = target_load(i, load_idx);
3485                } else {
3486                        load = source_load(i, load_idx);
3487                        if (load > max_cpu_load)
3488                                max_cpu_load = load;
3489                        if (min_cpu_load > load)
3490                                min_cpu_load = load;
3491                }
3492
3493                sgs->group_load += load;
3494                sgs->sum_nr_running += rq->nr_running;
3495                sgs->sum_weighted_load += weighted_cpuload(i);
3496
3497                sum_avg_load_per_task += cpu_avg_load_per_task(i);
3498        }
3499
3500        /*
3501         * The first idle cpu or the first (busiest) cpu in this sched group
3502         * is eligible for doing load balancing at this and higher
3503         * domains. In the newly idle case, we allow all the cpus
3504         * to do the newly idle load balance.
3505         */
3506        if (idle != CPU_NEWLY_IDLE && local_group &&
3507            balance_cpu != this_cpu && balance) {
3508                *balance = 0;
3509                return;
3510        }
3511
3512        /* Adjust by relative CPU power of the group */
3513        sgs->avg_load = sg_div_cpu_power(group,
3514                        sgs->group_load * SCHED_LOAD_SCALE);
3515
3516
3517        /*
3518         * Consider the group unbalanced when the imbalance is larger
3519         * than the average weight of two tasks.
3520         *
3521         * APZ: with cgroup the avg task weight can vary wildly and
3522         *      might not be a suitable number - should we keep a
3523         *      normalized nr_running number somewhere that negates
3524         *      the hierarchy?
3525         */
3526        avg_load_per_task = sg_div_cpu_power(group,
3527                        sum_avg_load_per_task * SCHED_LOAD_SCALE);
3528
3529        if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3530                sgs->group_imb = 1;
3531
3532        sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3533
3534}
3535
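/*
 * A hypothetical example of the statistics computed above, assuming
 * SCHED_LOAD_SCALE == 1024 and the usual per-CPU __cpu_power of 1024:
 * a remote two-CPU group (group->__cpu_power == 2048, group_capacity == 2)
 * where one CPU runs three nice-0 tasks (weighted load 3072) and the other
 * is idle has group_load == 3072 and avg_load == 3072 * 1024 / 2048 == 1536.
 * With sum_avg_load_per_task == 1024, avg_load_per_task == 512, and since
 * max_cpu_load - min_cpu_load == 3072 > 2 * 512, the group is also flagged
 * as group_imb.
 */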
3536/**
3537 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
3538 * @sd: sched_domain whose statistics are to be updated.
3539 * @this_cpu: Cpu for which load balance is currently performed.
3540 * @idle: Idle status of this_cpu
3541 * @sd_idle: Idle status of the sched_domain containing group.
3542 * @cpus: Set of cpus considered for load balancing.
3543 * @balance: Should we balance.
3544 * @sds: variable to hold the statistics for this sched_domain.
3545 */
3546static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3547                        enum cpu_idle_type idle, int *sd_idle,
3548                        const struct cpumask *cpus, int *balance,
3549                        struct sd_lb_stats *sds)
3550{
3551        struct sched_group *group = sd->groups;
3552        struct sg_lb_stats sgs;
3553        int load_idx;
3554
3555        init_sd_power_savings_stats(sd, sds, idle);
3556        load_idx = get_sd_load_idx(sd, idle);
3557
3558        do {
3559                int local_group;
3560
3561                local_group = cpumask_test_cpu(this_cpu,
3562                                               sched_group_cpus(group));
3563                memset(&sgs, 0, sizeof(sgs));
3564                update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
3565                                local_group, cpus, balance, &sgs);
3566
3567                if (local_group && balance && !(*balance))
3568                        return;
3569
3570                sds->total_load += sgs.group_load;
3571                sds->total_pwr += group->__cpu_power;
3572
3573                if (local_group) {
3574                        sds->this_load = sgs.avg_load;
3575                        sds->this = group;
3576                        sds->this_nr_running = sgs.sum_nr_running;
3577                        sds->this_load_per_task = sgs.sum_weighted_load;
3578                } else if (sgs.avg_load > sds->max_load &&
3579                           (sgs.sum_nr_running > sgs.group_capacity ||
3580                                sgs.group_imb)) {
3581                        sds->max_load = sgs.avg_load;
3582                        sds->busiest = group;
3583                        sds->busiest_nr_running = sgs.sum_nr_running;
3584                        sds->busiest_load_per_task = sgs.sum_weighted_load;
3585                        sds->group_imb = sgs.group_imb;
3586                }
3587
3588                update_sd_power_savings_stats(group, sds, local_group, &sgs);
3589                group = group->next;
3590        } while (group != sd->groups);
3591
3592}
3593
3594/**
3595 * fix_small_imbalance - Calculate the minor imbalance that exists
3596 *                      amongst the groups of a sched_domain, during
3597 *                      load balancing.
3598 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3599 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3600 * @imbalance: Variable to store the imbalance.
3601 */
3602static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3603                                int this_cpu, unsigned long *imbalance)
3604{
3605        unsigned long tmp, pwr_now = 0, pwr_move = 0;
3606        unsigned int imbn = 2;
3607
3608        if (sds->this_nr_running) {
3609                sds->this_load_per_task /= sds->this_nr_running;
3610                if (sds->busiest_load_per_task >
3611                                sds->this_load_per_task)
3612                        imbn = 1;
3613        } else
3614                sds->this_load_per_task =
3615                        cpu_avg_load_per_task(this_cpu);
3616
3617        if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3618                        sds->busiest_load_per_task * imbn) {
3619                *imbalance = sds->busiest_load_per_task;
3620                return;
3621        }
3622
3623        /*
3624         * OK, we don't have enough imbalance to justify moving tasks;
3625         * however, we may be able to increase the total CPU power used by
3626         * moving them.
3627         */
3628
3629        pwr_now += sds->busiest->__cpu_power *
3630                        min(sds->busiest_load_per_task, sds->max_load);
3631        pwr_now += sds->this->__cpu_power *
3632                        min(sds->this_load_per_task, sds->this_load);
3633        pwr_now /= SCHED_LOAD_SCALE;
3634
3635        /* Amount of load we'd subtract */
3636        tmp = sg_div_cpu_power(sds->busiest,
3637                        sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3638        if (sds->max_load > tmp)
3639                pwr_move += sds->busiest->__cpu_power *
3640                        min(sds->busiest_load_per_task, sds->max_load - tmp);
3641
3642        /* Amount of load we'd add */
3643        if (sds->max_load * sds->busiest->__cpu_power <
3644                sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3645                tmp = sg_div_cpu_power(sds->this,
3646                        sds->max_load * sds->busiest->__cpu_power);
3647        else
3648                tmp = sg_div_cpu_power(sds->this,
3649                        sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3650        pwr_move += sds->this->__cpu_power *
3651                        min(sds->this_load_per_task, sds->this_load + tmp);
3652        pwr_move /= SCHED_LOAD_SCALE;
3653
3654        /* Move if we gain throughput */
3655        if (pwr_move > pwr_now)
3656                *imbalance = sds->busiest_load_per_task;
3657}
3658
3659/**
3660 * calculate_imbalance - Calculate the amount of imbalance present within the
3661 *                       groups of a given sched_domain during load balance.
3662 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3663 * @this_cpu: Cpu for which currently load balance is being performed.
3664 * @imbalance: The variable to store the imbalance.
3665 */
3666static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3667                unsigned long *imbalance)
3668{
3669        unsigned long max_pull;
3670        /*
3671         * In the presence of smp nice balancing, certain scenarios can have
3672         * max load less than avg load (as we skip the groups at or below
3673         * their cpu_power while calculating max_load).
3674         */
3675        if (sds->max_load < sds->avg_load) {
3676                *imbalance = 0;
3677                return fix_small_imbalance(sds, this_cpu, imbalance);
3678        }
3679
3680        /* Don't want to pull so many tasks that a group would go idle */
3681        max_pull = min(sds->max_load - sds->avg_load,
3682                        sds->max_load - sds->busiest_load_per_task);
3683
3684        /* How much load to actually move to equalise the imbalance */
3685        *imbalance = min(max_pull * sds->busiest->__cpu_power,
3686                (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
3687                        / SCHED_LOAD_SCALE;
3688
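        /*
         * A hypothetical example with SCHED_LOAD_SCALE == 1024 and two
         * single-CPU groups of __cpu_power 1024 each: if the busiest group
         * runs two nice-0 tasks (max_load == 2048, busiest_load_per_task
         * == 1024) and this group is idle (this_load == 0, avg_load ==
         * 1024), then max_pull == 1024 and the formula above yields
         * *imbalance == 1024, i.e. pull exactly one nice-0 task.
         */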
3689        /*
3690         * If *imbalance is less than the average load per runnable task,
3691         * there is no guarantee that any tasks will be moved, so we have
3692         * a think about bumping its value to force at least one task to
3693         * be moved.
3694         */
3695        if (*imbalance < sds->busiest_load_per_task)
3696                return fix_small_imbalance(sds, this_cpu, imbalance);
3697
3698}
3699/******* find_busiest_group() helpers end here *********************/
3700
3701/**
3702 * find_busiest_group - Returns the busiest group within the sched_domain
3703 * if there is an imbalance. If there isn't an imbalance, and
3704 * the user has opted for power-savings, it returns a group whose
3705 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3706 * such a group exists.
3707 *
3708 * Also calculates the amount of weighted load which should be moved
3709 * to restore balance.
3710 *
3711 * @sd: The sched_domain whose busiest group is to be returned.
3712 * @this_cpu: The cpu for which load balancing is currently being performed.
3713 * @imbalance: Variable which stores amount of weighted load which should
3714 *              be moved to restore balance/put a group to idle.
3715 * @idle: The idle status of this_cpu.
3716 * @sd_idle: The idleness of sd
3717 * @cpus: The set of CPUs under consideration for load-balancing.
3718 * @balance: Pointer to a variable indicating if this_cpu
3719 *      is the appropriate cpu to perform load balancing at this level.
3720 *
3721 * Returns:     - the busiest group if imbalance exists.
3722 *              - If no imbalance and user has opted for power-savings balance,
3723 *                 return the least loaded group whose CPUs can be
3724 *                 put to idle by rebalancing its tasks onto our group.
3725 */
3726static struct sched_group *
3727find_busiest_group(struct sched_domain *sd, int this_cpu,
3728                   unsigned long *imbalance, enum cpu_idle_type idle,
3729                   int *sd_idle, const struct cpumask *cpus, int *balance)
3730{
3731        struct sd_lb_stats sds;
3732
3733        memset(&sds, 0, sizeof(sds));
3734
3735        /*
3736         * Compute the various statistics relevant for load balancing at
3737         * this level.
3738         */
3739        update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
3740                                        balance, &sds);
3741
3742        /* Cases where imbalance does not exist from POV of this_cpu */
3743        /* 1) this_cpu is not the appropriate cpu to perform load balancing
3744         *    at this level.
3745         * 2) There is no busy sibling group to pull from.
3746         * 3) This group is the busiest group.
3747         * 4) This group is busier than the average busyness at this
3748         *    sched_domain.
3749         * 5) The imbalance is within the specified limit.
3750         * 6) Any rebalance would lead to ping-pong
3751         */
3752        if (balance && !(*balance))
3753                goto ret;
3754
3755        if (!sds.busiest || sds.busiest_nr_running == 0)
3756                goto out_balanced;
3757
3758        if (sds.this_load >= sds.max_load)
3759                goto out_balanced;
3760
3761        sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3762
3763        if (sds.this_load >= sds.avg_load)
3764                goto out_balanced;
3765
3766        if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3767                goto out_balanced;
3768
3769        sds.busiest_load_per_task /= sds.busiest_nr_running;
3770        if (sds.group_imb)
3771                sds.busiest_load_per_task =
3772                        min(sds.busiest_load_per_task, sds.avg_load);
3773
3774        /*
3775         * We're trying to get all the cpus to the average_load, so we don't
3776         * want to push ourselves above the average load, nor do we wish to
3777         * reduce the max loaded cpu below the average load, as either of these
3778         * actions would just result in more rebalancing later, and ping-pong
3779         * tasks around. Thus we look for the minimum possible imbalance.
3780         * Negative imbalances (*we* are more loaded than anyone else) will
3781         * be counted as no imbalance for these purposes -- we can't fix that
3782         * by pulling tasks to us. Be careful of negative numbers as they'll
3783         * appear as very large values with unsigned longs.
3784         */
3785        if (sds.max_load <= sds.busiest_load_per_task)
3786                goto out_balanced;
3787
3788        /* Looks like there is an imbalance. Compute it */
3789        calculate_imbalance(&sds, this_cpu, imbalance);
3790        return sds.busiest;
3791
3792out_balanced:
3793        /*
3794         * There is no obvious imbalance. But check if we can do some balancing
3795         * to save power.
3796         */
3797        if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
3798                return sds.busiest;
3799ret:
3800        *imbalance = 0;
3801        return NULL;
3802}
3803
3804/*
3805 * find_busiest_queue - find the busiest runqueue among the cpus in group.
3806 */
3807static struct rq *
3808find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3809                   unsigned long imbalance, const struct cpumask *cpus)
3810{
3811        struct rq *busiest = NULL, *rq;
3812        unsigned long max_load = 0;
3813        int i;
3814
3815        for_each_cpu(i, sched_group_cpus(group)) {
3816                unsigned long wl;
3817
3818                if (!cpumask_test_cpu(i, cpus))
3819                        continue;
3820
3821                rq = cpu_rq(i);
3822                wl = weighted_cpuload(i);
3823
3824                if (rq->nr_running == 1 && wl > imbalance)
3825                        continue;
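                /*
                 * (A hypothetical example: a runqueue whose only task has a
                 * weight of 3072 is skipped when the requested imbalance is
                 * just 1024: moving that single task would overshoot the
                 * target and simply reverse the imbalance.)
                 */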
3826
3827                if (wl > max_load) {
3828                        max_load = wl;
3829                        busiest = rq;
3830                }
3831        }
3832
3833        return busiest;
3834}
3835
3836/*
3837 * Max backoff if we encounter pinned tasks. The exact value is pretty
3838 * arbitrary, as long as it is large enough.
3839 */
3840#define MAX_PINNED_INTERVAL     512
3841
3842/* Working cpumask for load_balance and load_balance_newidle. */
3843static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3844
3845/*
3846 * Check this_cpu to ensure it is balanced within domain. Attempt to move
3847 * tasks if there is an imbalance.
3848 */
3849static int load_balance(int this_cpu, struct rq *this_rq,
3850                        struct sched_domain *sd, enum cpu_idle_type idle,
3851                        int *balance)
3852{
3853        int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3854        struct sched_group *group;
3855        unsigned long imbalance;
3856        struct rq *busiest;
3857        unsigned long flags;
3858        struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3859
3860        cpumask_setall(cpus);
3861
3862        /*
3863         * When the power savings policy is enabled for the parent domain, an
3864         * idle sibling can pick up load irrespective of busy siblings. In this
3865         * case, let the state of the idle sibling percolate up as CPU_IDLE,
3866         * instead of portraying it as CPU_NOT_IDLE.
3867         */
3868        if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
3869            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3870                sd_idle = 1;
3871
3872        schedstat_inc(sd, lb_count[idle]);
3873
3874redo:
3875        update_shares(sd);
3876        group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3877                                   cpus, balance);
3878
3879        if (*balance == 0)
3880                goto out_balanced;
3881
3882        if (!group) {
3883                schedstat_inc(sd, lb_nobusyg[idle]);
3884                goto out_balanced;
3885        }
3886
3887        busiest = find_busiest_queue(group, idle, imbalance, cpus);
3888        if (!busiest) {
3889                schedstat_inc(sd, lb_nobusyq[idle]);
3890                goto out_balanced;
3891        }
3892
3893        BUG_ON(busiest == this_rq);
3894
3895        schedstat_add(sd, lb_imbalance[idle], imbalance);
3896
3897        ld_moved = 0;
3898        if (busiest->nr_running > 1) {
3899                /*
3900                 * Attempt to move tasks. If find_busiest_group has found
3901                 * an imbalance but busiest->nr_running <= 1, the group is
3902                 * still unbalanced. ld_moved simply stays zero, so it is
3903                 * correctly treated as an imbalance.
3904                 */
3905                local_irq_save(flags);
3906                double_rq_lock(this_rq, busiest);
3907                ld_moved = move_tasks(this_rq, this_cpu, busiest,
3908                                      imbalance, sd, idle, &all_pinned);
3909                double_rq_unlock(this_rq, busiest);
3910                local_irq_restore(flags);
3911
3912                /*
3913                 * some other cpu did the load balance for us.
3914                 */
3915                if (ld_moved && this_cpu != smp_processor_id())
3916                        resched_cpu(this_cpu);
3917
3918                /* All tasks on this runqueue were pinned by CPU affinity */
3919                if (unlikely(all_pinned)) {
3920                        cpumask_clear_cpu(cpu_of(busiest), cpus);
3921                        if (!cpumask_empty(cpus))
3922                                goto redo;
3923                        goto out_balanced;
3924                }
3925        }
3926
3927        if (!ld_moved) {
3928                schedstat_inc(sd, lb_failed[idle]);
3929                sd->nr_balance_failed++;
3930
3931                if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
3932
3933                        spin_lock_irqsave(&busiest->lock, flags);
3934
3935                        /* don't kick the migration_thread if the curr
3936                         * task on the busiest cpu can't be moved to this_cpu
3937                         */
3938                        if (!cpumask_test_cpu(this_cpu,
3939                                              &busiest->curr->cpus_allowed)) {
3940                                spin_unlock_irqrestore(&busiest->lock, flags);
3941                                all_pinned = 1;
3942                                goto out_one_pinned;
3943                        }
3944
3945                        if (!busiest->active_balance) {
3946                                busiest->active_balance = 1;
3947                                busiest->push_cpu = this_cpu;
3948                                active_balance = 1;
3949                        }
3950                        spin_unlock_irqrestore(&busiest->lock, flags);
3951                        if (active_balance)
3952                                wake_up_process(busiest->migration_thread);
3953
3954                        /*
3955                         * We've kicked active balancing, reset the failure
3956                         * counter.
3957                         */
3958                        sd->nr_balance_failed = sd->cache_nice_tries+1;
3959                }
3960        } else
3961                sd->nr_balance_failed = 0;
3962
3963        if (likely(!active_balance)) {
3964                /* We were unbalanced, so reset the balancing interval */
3965                sd->balance_interval = sd->min_interval;
3966        } else {
3967                /*
3968                 * If we've begun active balancing, start to back off. This
3969                 * case may not be covered by the all_pinned logic if there
3970                 * is only 1 task on the busy runqueue (because we don't call
3971                 * move_tasks).
3972                 */
3973                if (sd->balance_interval < sd->max_interval)
3974                        sd->balance_interval *= 2;
3975        }
3976
3977        if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3978            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3979                ld_moved = -1;
3980
3981        goto out;
3982
3983out_balanced:
3984        schedstat_inc(sd, lb_balanced[idle]);
3985
3986        sd->nr_balance_failed = 0;
3987
3988out_one_pinned:
3989        /* tune up the balancing interval */
3990        if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3991                        (sd->balance_interval < sd->max_interval))
3992                sd->balance_interval *= 2;
3993
3994        if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3995            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3996                ld_moved = -1;
3997        else
3998                ld_moved = 0;
3999out:
4000        if (ld_moved)
4001                update_shares(sd);
4002        return ld_moved;
4003}
4004
4005/*
4006 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4007 * tasks if there is an imbalance.
4008 *
4009 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4010 * this_rq is locked.
4011 */
4012static int
4013load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4014{
4015        struct sched_group *group;
4016        struct rq *busiest = NULL;
4017        unsigned long imbalance;
4018        int ld_moved = 0;
4019        int sd_idle = 0;
4020        int all_pinned = 0;
4021        struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4022
4023        cpumask_setall(cpus);
4024
4025        /*
4026         * When the power savings policy is enabled for the parent domain, an
4027         * idle sibling can pick up load irrespective of busy siblings. In this
4028         * case, let the state of the idle sibling percolate up as IDLE,
4029         * instead of portraying it as CPU_NOT_IDLE.
4030         */
4031        if (sd->flags & SD_SHARE_CPUPOWER &&
4032            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4033                sd_idle = 1;
4034
4035        schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4036redo:
4037        update_shares_locked(this_rq, sd);
4038        group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4039                                   &sd_idle, cpus, NULL);
4040        if (!group) {
4041                schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4042                goto out_balanced;
4043        }
4044
4045        busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4046        if (!busiest) {
4047                schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4048                goto out_balanced;
4049        }
4050
4051        BUG_ON(busiest == this_rq);
4052
4053        schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4054
4055        ld_moved = 0;
4056        if (busiest->nr_running > 1) {
4057                /* Attempt to move tasks */
4058                double_lock_balance(this_rq, busiest);
4059                /* this_rq->clock is already updated */
4060                update_rq_clock(busiest);
4061                ld_moved = move_tasks(this_rq, this_cpu, busiest,
4062                                        imbalance, sd, CPU_NEWLY_IDLE,
4063                                        &all_pinned);
4064                double_unlock_balance(this_rq, busiest);
4065
4066                if (unlikely(all_pinned)) {
4067                        cpumask_clear_cpu(cpu_of(busiest), cpus);
4068                        if (!cpumask_empty(cpus))
4069                                goto redo;
4070                }
4071        }
4072
4073        if (!ld_moved) {
4074                int active_balance = 0;
4075
4076                schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4077                if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4078                    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4079                        return -1;
4080
4081                if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4082                        return -1;
4083
4084                if (sd->nr_balance_failed++ < 2)
4085                        return -1;
4086
4087                /*
4088                 * The only task running on a non-idle cpu can be moved to this
4089                 * cpu in an attempt to completely free up the other CPU
4090                 * package. The method used to move a task in load_balance()
4091                 * has been extended for load_balance_newidle() to speed up
4092                 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2).
4093                 *
4094                 * The package power saving logic comes from
4095                 * find_busiest_group().  If there is no imbalance, then
4096                 * f_b_g() will return NULL.  However, when sched_mc={1,2},
4097                 * f_b_g() will select a group from which a running task may be
4098                 * pulled to this cpu in order to make the other package idle.
4099                 * If there is no opportunity to make a package idle and
4100                 * there is no imbalance, then f_b_g() will return NULL and no
4101                 * action will be taken in load_balance_newidle().
4102                 *
4103                 * Under normal task pull operation due to imbalance, there
4104                 * will be more than one task in the source run queue and
4105                 * move_tasks() will succeed.  ld_moved will be true and this
4106                 * active balance code will not be triggered.
4107                 */
4108
4109                /* Lock busiest in correct order while this_rq is held */
4110                double_lock_balance(this_rq, busiest);
4111
4112                /*
4113                 * don't kick the migration_thread if the curr
4114                 * task on the busiest cpu can't be moved to this_cpu
4115                 */
4116                if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4117                        double_unlock_balance(this_rq, busiest);
4118                        all_pinned = 1;
4119                        return ld_moved;
4120                }
4121
4122                if (!busiest->active_balance) {
4123                        busiest->active_balance = 1;
4124                        busiest->push_cpu = this_cpu;
4125                        active_balance = 1;
4126                }
4127
4128                double_unlock_balance(this_rq, busiest);
4129                /*
4130                 * Should not call ttwu while holding a rq->lock
4131                 */
4132                spin_unlock(&this_rq->lock);
4133                if (active_balance)
4134                        wake_up_process(busiest->migration_thread);
4135                spin_lock(&this_rq->lock);
4136
4137        } else
4138                sd->nr_balance_failed = 0;
4139
4140        update_shares_locked(this_rq, sd);
4141        return ld_moved;
4142
4143out_balanced:
4144        schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4145        if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4146            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4147                return -1;
4148        sd->nr_balance_failed = 0;
4149
4150        return 0;
4151}
4152
4153/*
4154 * idle_balance is called by schedule() if this_cpu is about to become
4155 * idle. Attempts to pull tasks from other CPUs.
4156 */
4157static void idle_balance(int this_cpu, struct rq *this_rq)
4158{
4159        struct sched_domain *sd;
4160        int pulled_task = 0;
4161        unsigned long next_balance = jiffies + HZ;
4162
4163        for_each_domain(this_cpu, sd) {
4164                unsigned long interval;
4165
4166                if (!(sd->flags & SD_LOAD_BALANCE))
4167                        continue;
4168
4169                if (sd->flags & SD_BALANCE_NEWIDLE)
4170                        /* If we've pulled tasks over, stop searching: */
4171                        pulled_task = load_balance_newidle(this_cpu, this_rq,
4172                                                           sd);
4173
4174                interval = msecs_to_jiffies(sd->balance_interval);
4175                if (time_after(next_balance, sd->last_balance + interval))
4176                        next_balance = sd->last_balance + interval;
4177                if (pulled_task)
4178                        break;
4179        }
4180        if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4181                /*
4182                 * We are going idle. next_balance may be set based on
4183                 * a busy processor. So reset next_balance.
4184                 */
4185                this_rq->next_balance = next_balance;
4186        }
4187}
4188
4189/*
4190 * active_load_balance is run by migration threads. It pushes running tasks
4191 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4192 * running on each physical CPU where possible, and avoids physical /
4193 * logical imbalances.
4194 *
4195 * Called with busiest_rq locked.
4196 */
4197static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4198{
4199        int target_cpu = busiest_rq->push_cpu;
4200        struct sched_domain *sd;
4201        struct rq *target_rq;
4202
4203        /* Is there any task to move? */
4204        if (busiest_rq->nr_running <= 1)
4205                return;
4206
4207        target_rq = cpu_rq(target_cpu);
4208
4209        /*
4210         * This condition is "impossible"; if it occurs,
4211         * we need to fix it. Originally reported by
4212         * Bjorn Helgaas on a 128-cpu setup.
4213         */
4214        BUG_ON(busiest_rq == target_rq);
4215
4216        /* move a task from busiest_rq to target_rq */
4217        double_lock_balance(busiest_rq, target_rq);
4218        update_rq_clock(busiest_rq);
4219        update_rq_clock(target_rq);
4220
4221        /* Search for an sd spanning us and the target CPU. */
4222        for_each_domain(target_cpu, sd) {
4223                if ((sd->flags & SD_LOAD_BALANCE) &&
4224                    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4225                                break;
4226        }
4227
4228        if (likely(sd)) {
4229                schedstat_inc(sd, alb_count);
4230
4231                if (move_one_task(target_rq, target_cpu, busiest_rq,
4232                                  sd, CPU_IDLE))
4233                        schedstat_inc(sd, alb_pushed);
4234                else
4235                        schedstat_inc(sd, alb_failed);
4236        }
4237        double_unlock_balance(busiest_rq, target_rq);
4238}
4239
4240#ifdef CONFIG_NO_HZ
4241static struct {
4242        atomic_t load_balancer;
4243        cpumask_var_t cpu_mask;
4244} nohz ____cacheline_aligned = {
4245        .load_balancer = ATOMIC_INIT(-1),
4246};
4247
4248/*
4249 * This routine will try to nominate the ilb (idle load balancing)
4250 * owner among the cpus whose ticks are stopped. The ilb owner does the idle
4251 * load balancing on behalf of all those cpus. If all the cpus in the system
4252 * go into this tickless mode, then there will be no ilb owner (as there is
4253 * no need for one) and all the cpus will sleep till the next wakeup event
4254 * arrives.
4255 *
4256 * For the ilb owner, the tick is not stopped, and this tick is used
4257 * for idle load balancing. The ilb owner still remains part of
4258 * nohz.cpu_mask.
4259 *
4260 * While stopping the tick, this cpu will become the ilb owner if there
4261 * is no other owner, and it remains the owner until it becomes busy
4262 * or all cpus in the system stop their ticks, at which point
4263 * there is no need for an ilb owner.
4264 *
4265 * When the ilb owner becomes busy, it nominates another owner during the
4266 * next busy scheduler_tick().
4267 */
4268int select_nohz_load_balancer(int stop_tick)
4269{
4270        int cpu = smp_processor_id();
4271
4272        if (stop_tick) {
4273                cpu_rq(cpu)->in_nohz_recently = 1;
4274
4275                if (!cpu_active(cpu)) {
4276                        if (atomic_read(&nohz.load_balancer) != cpu)
4277                                return 0;
4278
4279                        /*
4280                         * If we are going offline and still the leader,
4281                         * give up!
4282                         */
4283                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4284                                BUG();
4285
4286                        return 0;
4287                }
4288
4289                cpumask_set_cpu(cpu, nohz.cpu_mask);
4290
4291                /* time for ilb owner also to sleep */
4292                if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4293                        if (atomic_read(&nohz.load_balancer) == cpu)
4294                                atomic_set(&nohz.load_balancer, -1);
4295                        return 0;
4296                }
4297
4298                if (atomic_read(&nohz.load_balancer) == -1) {
4299                        /* make me the ilb owner */
4300                        if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4301                                return 1;
4302                } else if (atomic_read(&nohz.load_balancer) == cpu)
4303                        return 1;
4304        } else {
4305                if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4306                        return 0;
4307
4308                cpumask_clear_cpu(cpu, nohz.cpu_mask);
4309
4310                if (atomic_read(&nohz.load_balancer) == cpu)
4311                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4312                                BUG();
4313        }
4314        return 0;
4315}
4316#endif
4317
4318static DEFINE_SPINLOCK(balancing);
4319
4320/*
4321 * It checks each scheduling domain to see if it is due to be balanced,
4322 * and initiates a balancing operation if so.
4323 *
4324 * Balancing parameters are set up in arch_init_sched_domains.
4325 */
4326static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4327{
4328        int balance = 1;
4329        struct rq *rq = cpu_rq(cpu);
4330        unsigned long interval;
4331        struct sched_domain *sd;
4332        /* Earliest time when we have to do rebalance again */
4333        unsigned long next_balance = jiffies + 60*HZ;
4334        int update_next_balance = 0;
4335        int need_serialize;
4336
4337        for_each_domain(cpu, sd) {
4338                if (!(sd->flags & SD_LOAD_BALANCE))
4339                        continue;
4340
4341                interval = sd->balance_interval;
4342                if (idle != CPU_IDLE)
4343                        interval *= sd->busy_factor;
4344
4345                /* scale ms to jiffies */
4346                interval = msecs_to_jiffies(interval);
4347                if (unlikely(!interval))
4348                        interval = 1;
4349                if (interval > HZ*NR_CPUS/10)
4350                        interval = HZ*NR_CPUS/10;
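                /*
                 * For instance (hypothetical values): a base balance_interval
                 * of 64 ms with a busy_factor of 32 means a busy cpu revisits
                 * this domain only about every 2 seconds, while an idle cpu
                 * keeps the 64 ms interval; the clamp above caps the interval
                 * at NR_CPUS/10 seconds.
                 */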
4351
4352                need_serialize = sd->flags & SD_SERIALIZE;
4353
4354                if (need_serialize) {
4355                        if (!spin_trylock(&balancing))
4356                                goto out;
4357                }
4358
4359                if (time_after_eq(jiffies, sd->last_balance + interval)) {
4360                        if (load_balance(cpu, rq, sd, idle, &balance)) {
4361                                /*
4362                                 * We've pulled tasks over so either we're no
4363                                 * longer idle, or one of our SMT siblings is
4364                                 * not idle.
4365                                 */
4366                                idle = CPU_NOT_IDLE;
4367                        }
4368                        sd->last_balance = jiffies;
4369                }
4370                if (need_serialize)
4371                        spin_unlock(&balancing);
4372out:
4373                if (time_after(next_balance, sd->last_balance + interval)) {
4374                        next_balance = sd->last_balance + interval;
4375                        update_next_balance = 1;
4376                }
4377
4378                /*
4379                 * Stop the load balance at this level. There is another
4380                 * CPU in our sched group which is doing load balancing more
4381                 * actively.
4382                 */
4383                if (!balance)
4384                        break;
4385        }
4386
4387        /*
4388         * next_balance will be updated only when there is a need.
4389         * When the cpu is attached to a null domain, for example, it will not be
4390         * updated.
4391         */
4392        if (likely(update_next_balance))
4393                rq->next_balance = next_balance;
4394}
4395
4396/*
4397 * run_rebalance_domains is triggered when needed from the scheduler tick.
4398 * In the CONFIG_NO_HZ case, the idle load balance owner will do the
4399 * rebalancing for all the cpus whose scheduler ticks are stopped.
4400 */
4401static void run_rebalance_domains(struct softirq_action *h)
4402{
4403        int this_cpu = smp_processor_id();
4404        struct rq *this_rq = cpu_rq(this_cpu);
4405        enum cpu_idle_type idle = this_rq->idle_at_tick ?
4406                                                CPU_IDLE : CPU_NOT_IDLE;
4407
4408        rebalance_domains(this_cpu, idle);
4409
4410#ifdef CONFIG_NO_HZ
4411        /*
4412         * If this cpu is the owner for idle load balancing, then do the
4413         * balancing on behalf of the other idle cpus whose ticks are
4414         * stopped.
4415         */
4416        if (this_rq->idle_at_tick &&
4417            atomic_read(&nohz.load_balancer) == this_cpu) {
4418                struct rq *rq;
4419                int balance_cpu;
4420
4421                for_each_cpu(balance_cpu, nohz.cpu_mask) {
4422                        if (balance_cpu == this_cpu)
4423                                continue;
4424
4425                        /*
4426                         * If this cpu gets work to do, stop the load balancing
4427                         * work being done for other cpus. Next load
4428                         * balancing owner will pick it up.
4429                         */
4430                        if (need_resched())
4431                                break;
4432
4433                        rebalance_domains(balance_cpu, CPU_IDLE);
4434
4435                        rq = cpu_rq(balance_cpu);
4436                        if (time_after(this_rq->next_balance, rq->next_balance))
4437                                this_rq->next_balance = rq->next_balance;
4438                }
4439        }
4440#endif
4441}
4442
4443static inline int on_null_domain(int cpu)
4444{
4445        return !rcu_dereference(cpu_rq(cpu)->sd);
4446}
4447
4448/*
4449 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4450 *
4451 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4452 * idle load balancing owner or decide to stop the periodic load balancing,
4453 * if the whole system is idle.
4454 */
4455static inline void trigger_load_balance(struct rq *rq, int cpu)
4456{
4457#ifdef CONFIG_NO_HZ
4458        /*
4459         * If we were in the nohz mode recently and busy at the current
4460         * scheduler tick, then check if we need to nominate a new idle
4461         * load balancer.
4462         */
4463        if (rq->in_nohz_recently && !rq->idle_at_tick) {
4464                rq->in_nohz_recently = 0;
4465
4466                if (atomic_read(&nohz.load_balancer) == cpu) {
4467                        cpumask_clear_cpu(cpu, nohz.cpu_mask);
4468                        atomic_set(&nohz.load_balancer, -1);
4469                }
4470
4471                if (atomic_read(&nohz.load_balancer) == -1) {
4472                        /*
4473                         * simple selection for now: Nominate the
4474                         * first cpu in the nohz list to be the next
4475                         * ilb owner.
4476                         *
4477                         * TBD: Traverse the sched domains and nominate
4478                         * the nearest cpu in the nohz.cpu_mask.
4479                         */
4480                        int ilb = cpumask_first(nohz.cpu_mask);
4481
4482                        if (ilb < nr_cpu_ids)
4483                                resched_cpu(ilb);
4484                }
4485        }
4486
4487        /*
4488         * If this cpu is idle and doing idle load balancing for all the
4489         * cpus with ticks stopped, is it time for that to stop?
4490         */
4491        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4492            cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4493                resched_cpu(cpu);
4494                return;
4495        }
4496
4497        /*
4498         * If this cpu is idle and the idle load balancing is done by
4499         * someone else, then there is no need to raise the SCHED_SOFTIRQ.
4500         */
4501        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4502            cpumask_test_cpu(cpu, nohz.cpu_mask))
4503                return;
4504#endif
4505        /* Don't need to rebalance while attached to NULL domain */
4506        if (time_after_eq(jiffies, rq->next_balance) &&
4507            likely(!on_null_domain(cpu)))
4508                raise_softirq(SCHED_SOFTIRQ);
4509}
4510
4511#else   /* CONFIG_SMP */
4512
4513/*
4514 * on UP we do not need to balance between CPUs:
4515 */
4516static inline void idle_balance(int cpu, struct rq *rq)
4517{
4518}
4519
4520#endif
4521
4522DEFINE_PER_CPU(struct kernel_stat, kstat);
4523
4524EXPORT_PER_CPU_SYMBOL(kstat);
4525
4526/*
4527 * Return any ns on the sched_clock that have not yet been accounted in
4528 * @p in case that task is currently running.
4529 *
4530 * Called with task_rq_lock() held on @rq.
4531 */
4532static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
4533{
4534        u64 ns = 0;
4535
4536        if (task_current(rq, p)) {
4537                update_rq_clock(rq);
4538                ns = rq->clock - p->se.exec_start;
4539                if ((s64)ns < 0)
4540                        ns = 0;
4541        }
4542
4543        return ns;
4544}
4545
4546unsigned long long task_delta_exec(struct task_struct *p)
4547{
4548        unsigned long flags;
4549        struct rq *rq;
4550        u64 ns = 0;
4551
4552        rq = task_rq_lock(p, &flags);
4553        ns = do_task_delta_exec(p, rq);
4554        task_rq_unlock(rq, &flags);
4555
4556        return ns;
4557}
4558
4559/*
4560 * Return accounted runtime for the task.
4561 * In case the task is currently running, return the runtime plus current's
4562 * pending runtime that has not been accounted yet.
4563 */
4564unsigned long long task_sched_runtime(struct task_struct *p)
4565{
4566        unsigned long flags;
4567        struct rq *rq;
4568        u64 ns = 0;
4569
4570        rq = task_rq_lock(p, &flags);
4571        ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
4572        task_rq_unlock(rq, &flags);
4573
4574        return ns;
4575}
4576
4577/*
4578 * Return sum_exec_runtime for the thread group.
4579 * In case the task is currently running, return the sum plus current's
4580 * pending runtime that has not been accounted yet.
4581 *
4582 * Note that the thread group might have other running tasks as well,
4583 * so the return value does not include pending runtime that other
4584 * running tasks might have.
4585 */
4586unsigned long long thread_group_sched_runtime(struct task_struct *p)
4587{
4588        struct task_cputime totals;
4589        unsigned long flags;
4590        struct rq *rq;
4591        u64 ns;
4592
4593        rq = task_rq_lock(p, &flags);
4594        thread_group_cputime(p, &totals);
4595        ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
4596        task_rq_unlock(rq, &flags);
4597
4598        return ns;
4599}
4600
4601/*
4602 * Account user cpu time to a process.
4603 * @p: the process that the cpu time gets accounted to
4604 * @cputime: the cpu time spent in user space since the last update
4605 * @cputime_scaled: cputime scaled by cpu frequency
4606 */
4607void account_user_time(struct task_struct *p, cputime_t cputime,
4608                       cputime_t cputime_scaled)
4609{
4610        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4611        cputime64_t tmp;
4612
4613        /* Add user time to process. */
4614        p->utime = cputime_add(p->utime, cputime);
4615        p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4616        account_group_user_time(p, cputime);
4617
4618        /* Add user time to cpustat. */
4619        tmp = cputime_to_cputime64(cputime);
4620        if (TASK_NICE(p) > 0)
4621                cpustat->nice = cputime64_add(cpustat->nice, tmp);
4622        else
4623                cpustat->user = cputime64_add(cpustat->user, tmp);
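        /*
         * (This user/nice split is what feeds the separate "user" and "nice"
         * fields of /proc/stat, i.e. the %us and %ni columns shown by tools
         * such as top.)
         */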
4624
4625        cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
4626        /* Account for user time used */
4627        acct_update_integrals(p);
4628}
4629
4630/*
4631 * Account guest cpu time to a process.
4632 * @p: the process that the cpu time gets accounted to
4633 * @cputime: the cpu time spent in virtual machine since the last update
4634 * @cputime_scaled: cputime scaled by cpu frequency
4635 */
4636static void account_guest_time(struct task_struct *p, cputime_t cputime,
4637                               cputime_t cputime_scaled)
4638{
4639        cputime64_t tmp;
4640        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4641
4642        tmp = cputime_to_cputime64(cputime);
4643
4644        /* Add guest time to process. */
4645        p->utime = cputime_add(p->utime, cputime);
4646        p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4647        account_group_user_time(p, cputime);
4648        p->gtime = cputime_add(p->gtime, cputime);
4649
4650        /* Add guest time to cpustat. */
4651        cpustat->user = cputime64_add(cpustat->user, tmp);
4652        cpustat->guest = cputime64_add(cpustat->guest, tmp);
4653}
4654
4655/*
4656 * Account system cpu time to a process.
4657 * @p: the process that the cpu time gets accounted to
4658 * @hardirq_offset: the offset to subtract from hardirq_count()
4659 * @cputime: the cpu time spent in kernel space since the last update
4660 * @cputime_scaled: cputime scaled by cpu frequency
4661 */
4662void account_system_time(struct task_struct *p, int hardirq_offset,
4663                         cputime_t cputime, cputime_t cputime_scaled)
4664{
4665        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4666        cputime64_t tmp;
4667
4668        if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4669                account_guest_time(p, cputime, cputime_scaled);
4670                return;
4671        }
4672
4673        /* Add system time to process. */
4674        p->stime = cputime_add(p->stime, cputime);
4675        p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
4676        account_group_system_time(p, cputime);
4677
4678        /* Add system time to cpustat. */
4679        tmp = cputime_to_cputime64(cputime);
4680        if (hardirq_count() - hardirq_offset)
4681                cpustat->irq = cputime64_add(cpustat->irq, tmp);
4682        else if (softirq_count())
4683                cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
4684        else
4685                cpustat->system = cputime64_add(cpustat->system, tmp);
4686
4687        cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
4688
4689        /* Account for system time used */
4690        acct_update_integrals(p);
4691}
4692
4693/*
4694 * Account for involuntary wait time.
4695 * @cputime: the cpu time spent in involuntary wait
4696 */
4697void account_steal_time(cputime_t cputime)
4698{
4699        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4700        cputime64_t cputime64 = cputime_to_cputime64(cputime);
4701
4702        cpustat->steal = cputime64_add(cpustat->steal, cputime64);
4703}
4704
4705/*
4706 * Account for idle time.
4707 * @cputime: the cpu time spent in idle wait
4708 */
4709void account_idle_time(cputime_t cputime)
4710{
4711        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4712        cputime64_t cputime64 = cputime_to_cputime64(cputime);
4713        struct rq *rq = this_rq();
4714
4715        if (atomic_read(&rq->nr_iowait) > 0)
4716                cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
4717        else
4718                cpustat->idle = cputime64_add(cpustat->idle, cputime64);
4719}
4720
4721#ifndef CONFIG_VIRT_CPU_ACCOUNTING
4722
4723/*
4724 * Account a single tick of cpu time.
4725 * @p: the process that the cpu time gets accounted to
4726 * @user_tick: indicates if the tick is a user or a system tick
4727 */
4728void account_process_tick(struct task_struct *p, int user_tick)
4729{
4730        cputime_t one_jiffy = jiffies_to_cputime(1);
4731        cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
4732        struct rq *rq = this_rq();
4733
4734        if (user_tick)
4735                account_user_time(p, one_jiffy, one_jiffy_scaled);
4736        else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
4737                account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
4738                                    one_jiffy_scaled);
4739        else
4740                account_idle_time(one_jiffy);
4741}
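/*
 * For example (assuming HZ == 100, which is configuration dependent): a tick
 * that interrupts a task running in user space adds one jiffy (10 ms) to that
 * task's utime, while a tick that finds the idle task running, with no
 * hardirq or softirq being served, is accounted as idle time.
 */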
4742
4743/*
4744 * Account multiple ticks of steal time, i.e. cpu time spent in
4745 * involuntary wait.
4746 * @ticks: number of stolen ticks
4747 */
4748void account_steal_ticks(unsigned long ticks)
4749{
4750        account_steal_time(jiffies_to_cputime(ticks));
4751}
4752
4753/*
4754 * Account multiple ticks of idle time.
4755 * @ticks: number of idle ticks
4756 */
4757void account_idle_ticks(unsigned long ticks)
4758{
4759        account_idle_time(jiffies_to_cputime(ticks));
4760}
4761
4762#endif
4763
4764/*
4765 * Use precise platform statistics if available:
4766 */
4767#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4768cputime_t task_utime(struct task_struct *p)
4769{
4770        return p->utime;
4771}
4772
4773cputime_t task_stime(struct task_struct *p)
4774{
4775        return p->stime;
4776}
4777#else
4778cputime_t task_utime(struct task_struct *p)
4779{
4780        clock_t utime = cputime_to_clock_t(p->utime),
4781                total = utime + cputime_to_clock_t(p->stime);
4782        u64 temp;
4783
4784        /*
4785         * Use CFS's precise accounting:
4786         */
4787        temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
4788
4789        if (total) {
4790                temp *= utime;
4791                do_div(temp, total);
4792        }
4793        utime = (clock_t)temp;
4794
4795        p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
4796        return p->prev_utime;
4797}
4798
4799cputime_t task_stime(struct task_struct *p)
4800{
4801        clock_t stime;
4802
4803        /*
4804         * Use CFS's precise accounting. (we subtract utime from
4805         * the total, to make sure the total observed by userspace
4806         * grows monotonically - apps rely on that):
4807         */
4808        stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
4809                        cputime_to_clock_t(task_utime(p));
4810
4811        if (stime >= 0)
4812                p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
4813
4814        return p->prev_stime;
4815}
4816#endif
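
/*
 * Editorial note (not in the original source), a worked example of the
 * scaling above: suppose tick sampling recorded utime = 300 and
 * stime = 100 clock ticks (total = 400), while CFS measured a
 * sum_exec_runtime equivalent to 500 ticks.  task_utime() then reports
 * 500 * 300 / 400 = 375 ticks and task_stime() reports 500 - 375 = 125
 * ticks: the user/system ratio comes from the sampled values, but the
 * total comes from the precise runtime.  prev_utime/prev_stime only ever
 * move these results forward, keeping them monotonic for userspace.
 */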
4817
4818inline cputime_t task_gtime(struct task_struct *p)
4819{
4820        return p->gtime;
4821}
4822
4823/*
4824 * This function gets called by the timer code, with HZ frequency.
4825 * We call it with interrupts disabled.
4829 */
4830void scheduler_tick(void)
4831{
4832        int cpu = smp_processor_id();
4833        struct rq *rq = cpu_rq(cpu);
4834        struct task_struct *curr = rq->curr;
4835
4836        sched_clock_tick();
4837
4838        spin_lock(&rq->lock);
4839        update_rq_clock(rq);
4840        update_cpu_load(rq);
4841        curr->sched_class->task_tick(rq, curr, 0);
4842        spin_unlock(&rq->lock);
4843
4844#ifdef CONFIG_SMP
4845        rq->idle_at_tick = idle_cpu(cpu);
4846        trigger_load_balance(rq, cpu);
4847#endif
4848}
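
/*
 * Editorial note (not in the original source): everything that must run
 * synchronously on the tick (the clock update, cpu load tracking and the
 * class's task_tick() hook, where e.g. CFS checks whether the current
 * task should be preempted) happens under rq->lock above.  The expensive
 * load balancing is only triggered here: trigger_load_balance() raises
 * SCHED_SOFTIRQ when a rebalance is due, and the real work runs later in
 * softirq context.
 */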
4849
4850notrace unsigned long get_parent_ip(unsigned long addr)
4851{
4852        if (in_lock_functions(addr)) {
4853                addr = CALLER_ADDR2;
4854                if (in_lock_functions(addr))
4855                        addr = CALLER_ADDR3;
4856        }
4857        return addr;
4858}
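
/*
 * Editorial note (not in the original source): CALLER_ADDR1/2/3 come from
 * <linux/ftrace.h> and mean "return address 1/2/3 stack frames up".  When
 * preemption is toggled from inside the spinlock primitives, the first
 * return address lies in the lock text section, so the code above walks
 * one or two frames further out and reports the lock's caller rather
 * than the lock implementation itself.
 */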
4859
4860#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4861                                defined(CONFIG_PREEMPT_TRACER))
4862
4863void __kprobes add_preempt_count(int val)
4864{
4865#ifdef CONFIG_DEBUG_PREEMPT
4866        /*
4867         * Underflow?
4868         */
4869        if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4870                return;
4871#endif
4872        preempt_count() += val;
4873#ifdef CONFIG_DEBUG_PREEMPT
4874        /*
4875         * Spinlock count overflowing soon?
4876         */
4877        DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4878                                PREEMPT_MASK - 10);
4879#endif
4880        if (preempt_count() == val)
4881                trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4882}
4883EXPORT_SYMBOL(add_preempt_count);
4884
4885void __kprobes sub_preempt_count(int val)
4886{
4887#ifdef CONFIG_DEBUG_PREEMPT
4888        /*
4889         * Underflow?
4890         */
4891        if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4892                return;
4893        /*
4894         * Is the spinlock portion underflowing?
4895         */
4896        if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4897                        !(preempt_count() & PREEMPT_MASK)))
4898                return;
4899#endif
4900
4901        if (preempt_count() == val)
4902                trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4903        preempt_count() -= val;
4904}
4905EXPORT_SYMBOL(sub_preempt_count);
4906
4907#endif
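
/*
 * Editorial sketch (not in the original source): with CONFIG_PREEMPT plus
 * CONFIG_DEBUG_PREEMPT or the preempt tracer, preempt_disable() and
 * preempt_enable_no_resched() expand (roughly, see <linux/preempt.h>) to
 * the instrumented helpers above.  The example_* names below are made up
 * for illustration only.
 */
#if 0
#define example_preempt_disable() \
do { \
        add_preempt_count(1);   /* may call trace_preempt_off() */ \
        barrier(); \
} while (0)

#define example_preempt_enable_no_resched() \
do { \
        barrier(); \
        sub_preempt_count(1);   /* may call trace_preempt_on() */ \
} while (0)
#endif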
4908
4909/*
4910 * Print a 'scheduling while atomic' bug report:
4911 */
4912static noinline void __schedule_bug(struct task_struct *prev)
4913{
4914        struct pt_regs *regs = get_irq_regs();
4915
4916        printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4917                prev->comm, prev->pid, preempt_count());
4918
4919        debug_show_held_locks(prev);
4920        print_modules();
4921        if (irqs_disabled())
4922                print_irqtrace_events(prev);
4923
4924        if (regs)
4925                show_regs(regs);
4926        else
4927                dump_stack();
4928}
4929
4930/*
4931 * Various schedule()-time debugging checks and statistics:
4932 */
4933static inline void schedule_debug(struct task_struct *prev)
4934{
4935        /*
4936         * Test if we are atomic. Since do_exit() needs to call into
4937         * schedule() atomically, we ignore that path for now.
4938         * Otherwise, whine if we are scheduling when we should not be.
4939         */
4940        if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4941                __schedule_bug(prev);
4942
4943        profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4944
4945        schedstat_inc(this_rq(), sched_count);
4946#ifdef CONFIG_SCHEDSTATS
4947        if (unlikely(prev->lock_depth >= 0)) {
4948                schedstat_inc(this_rq(), bkl_count);
4949                schedstat_inc(prev, sched_info.bkl_count);
4950        }
4951#endif
4952}
4953
4954static void put_prev_task(struct rq *rq, struct task_struct *prev)
4955{
4956        if (prev->state == TASK_RUNNING) {
4957                u64 runtime = prev->se.sum_exec_runtime;
4958
4959                runtime -= prev->se.prev_sum_exec_runtime;
4960                runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
4961
4962                /*
4963                 * In order to avoid avg_overlap growing stale when we are
4964                 * indeed overlapping and hence not getting put to sleep, grow
4965                 * the avg_overlap on preemption.
4966                 *
4967                 * We use the average preemption runtime because that
4968                 * correlates to the amount of cache footprint a task can
4969                 * build up.
4970                 */
4971                update_avg(&prev->se.avg_overlap, runtime);
4972        }
4973        prev->sched_class->put_prev_task(rq, prev);
4974}
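
/*
 * Editorial note (not in the original source): update_avg(), defined
 * earlier in this file, nudges the average one eighth of the way toward
 * each new sample (essentially avg += (sample - avg) >> 3).  Worked
 * example with the clipped preemption runtime above: if
 * prev->se.avg_overlap is 1000ns and the task ran 9000ns before being
 * preempted, the new avg_overlap becomes 1000 + (9000 - 1000) / 8 =
 * 2000ns.  The min_t() clip against 2*sysctl_sched_migration_cost keeps
 * one very long run from inflating the average.
 */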
4975
4976/*
4977 * Pick up the highest-prio task:
4978 */
4979static inline struct task_struct *
4980pick_next_task(struct rq *rq)
4981{
4982        const struct sched_class *class;
4983        struct task_struct *p;
4984
4985        /*
4986         * Optimization: we know that if all tasks are in
4987         * the fair class we can call that function directly:
4988         */
4989        if (likely(rq->nr_running == rq->cfs.nr_running)) {
4990                p = fair_sched_class.pick_next_task(rq);
4991                if (likely(p))
4992                        return p;
4993        }
4994
4995        class = sched_class_highest;
4996        for ( ; ; ) {
4997                p = class->pick_next_task(rq);
4998                if (p)
4999                        return p;
5000                /*
5001                 * Will never be NULL as the idle class always
5002                 * returns a non-NULL p:
5003                 */
5004                class = class->next;
5005        }
5006}
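
/*
 * Editorial note (not in the original source): the loop above walks the
 * singly linked list of scheduling classes in priority order.  In this
 * kernel the chain is:
 *
 *   sched_class_highest == &rt_sched_class
 *       rt_sched_class.next   -> &fair_sched_class
 *       fair_sched_class.next -> &idle_sched_class
 *       idle_sched_class.next -> NULL
 *
 * idle_sched_class.pick_next_task() always returns this CPU's idle task,
 * which is why the loop is guaranteed to return before running off the
 * end of the list.
 */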
5007
5008/*
5009 * __schedule() is the main scheduler function, wrapped by schedule().
5010 */
5011asmlinkage void __sched __schedule(void)
5012{
5013        struct task_struct *prev, *next;
5014        unsigned long *switch_count;
5015        struct rq *rq;
5016        int cpu;
5017
5018        cpu = smp_processor_id();
5019        rq = cpu_rq(cpu);
5020        rcu_qsctr_inc(cpu);
5021        prev = rq->curr;
5022        switch_count = &prev->nivcsw;
5023
5024        release_kernel_lock(prev);
5025need_resched_nonpreemptible:
5026
5027        schedule_debug(prev);
5028
5029        if (sched_feat(HRTICK))
5030                hrtick_clear(rq);
5031
5032        spin_lock_irq(&rq->lock);
5033        update_rq_clock(rq);
5034        clear_tsk_need_resched(prev);
5035
5036        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
5037                if (unlikely(signal_pending_state(prev->state, prev)))
5038                        prev->state = TASK_RUNNING;
5039                else
5040                        deactivate_task(rq, prev, 1);
5041                switch_count = &prev->nvcsw;
5042        }
5043
5044#ifdef CONFIG_SMP
5045        if (prev->sched_class->pre_schedule)
5046                prev->sched_class->pre_schedule(rq, prev);
5047#endif
5048
5049        if (unlikely(!rq->nr_running))
5050                idle_balance(cpu, rq);
5051
5052        put_prev_task(rq, prev);
5053        next = pick_next_task