linux/kernel/sched.c
<<
>>
Prefs
   1/*
   2 *  kernel/sched.c
   3 *
   4 *  Kernel scheduler and related syscalls
   5 *
   6 *  Copyright (C) 1991-2002  Linus Torvalds
   7 *
   8 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   9 *              make semaphores SMP safe
  10 *  1998-11-19  Implemented schedule_timeout() and related stuff
  11 *              by Andrea Arcangeli
  12 *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
  13 *              hybrid priority-list and round-robin design with
  14 *              an array-switch method of distributing timeslices
  15 *              and per-CPU runqueues.  Cleanups and useful suggestions
  16 *              by Davide Libenzi, preemptible kernel bits by Robert Love.
  17 *  2003-09-03  Interactivity tuning by Con Kolivas.
  18 *  2004-04-02  Scheduler domains code by Nick Piggin
  19 *  2007-04-15  Work begun on replacing all interactivity tuning with a
  20 *              fair scheduling design by Con Kolivas.
  21 *  2007-05-05  Load balancing (smp-nice) and other improvements
  22 *              by Peter Williams
  23 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
  24 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
  25 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
  26 *              Thomas Gleixner, Mike Kravetz
  27 */
  28
  29#include <linux/mm.h>
  30#include <linux/module.h>
  31#include <linux/nmi.h>
  32#include <linux/init.h>
  33#include <linux/uaccess.h>
  34#include <linux/highmem.h>
  35#include <linux/smp_lock.h>
  36#include <asm/mmu_context.h>
  37#include <linux/interrupt.h>
  38#include <linux/capability.h>
  39#include <linux/completion.h>
  40#include <linux/kernel_stat.h>
  41#include <linux/debug_locks.h>
  42#include <linux/perf_event.h>
  43#include <linux/security.h>
  44#include <linux/notifier.h>
  45#include <linux/profile.h>
  46#include <linux/freezer.h>
  47#include <linux/vmalloc.h>
  48#include <linux/blkdev.h>
  49#include <linux/delay.h>
  50#include <linux/pid_namespace.h>
  51#include <linux/smp.h>
  52#include <linux/threads.h>
  53#include <linux/timer.h>
  54#include <linux/rcupdate.h>
  55#include <linux/cpu.h>
  56#include <linux/cpuset.h>
  57#include <linux/percpu.h>
  58#include <linux/proc_fs.h>
  59#include <linux/seq_file.h>
  60#include <linux/stop_machine.h>
  61#include <linux/sysctl.h>
  62#include <linux/syscalls.h>
  63#include <linux/times.h>
  64#include <linux/tsacct_kern.h>
  65#include <linux/kprobes.h>
  66#include <linux/delayacct.h>
  67#include <linux/unistd.h>
  68#include <linux/pagemap.h>
  69#include <linux/hrtimer.h>
  70#include <linux/tick.h>
  71#include <linux/debugfs.h>
  72#include <linux/ctype.h>
  73#include <linux/ftrace.h>
  74#include <linux/slab.h>
  75
  76#include <asm/tlb.h>
  77#include <asm/irq_regs.h>
  78
  79#include "sched_cpupri.h"
  80#include "workqueue_sched.h"
  81
  82#define CREATE_TRACE_POINTS
  83#include <trace/events/sched.h>
  84
  85/*
  86 * Convert user-nice values [ -20 ... 0 ... 19 ]
  87 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
  88 * and back.
  89 */
  90#define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
  91#define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
  92#define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
  93
  94/*
  95 * 'User priority' is the nice value converted to something we
  96 * can work with better when scaling various scheduler parameters,
  97 * it's a [ 0 ... 39 ] range.
  98 */
  99#define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
 100#define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
 101#define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
 102
 103/*
 104 * Helpers for converting nanosecond timing to jiffy resolution
 105 */
 106#define NS_TO_JIFFIES(TIME)     ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
 107
 108#define NICE_0_LOAD             SCHED_LOAD_SCALE
 109#define NICE_0_SHIFT            SCHED_LOAD_SHIFT
 110
 111/*
 112 * These are the 'tuning knobs' of the scheduler:
 113 *
 114 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 115 * Timeslices get refilled after they expire.
 116 */
 117#define DEF_TIMESLICE           (100 * HZ / 1000)
 118
 119/*
 120 * single value that denotes runtime == period, ie unlimited time.
 121 */
 122#define RUNTIME_INF     ((u64)~0ULL)
 123
 124static inline int rt_policy(int policy)
 125{
 126        if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
 127                return 1;
 128        return 0;
 129}
 130
 131static inline int task_has_rt_policy(struct task_struct *p)
 132{
 133        return rt_policy(p->policy);
 134}
 135
 136/*
 137 * This is the priority-queue data structure of the RT scheduling class:
 138 */
 139struct rt_prio_array {
 140        DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
 141        struct list_head queue[MAX_RT_PRIO];
 142};
 143
 144struct rt_bandwidth {
 145        /* nests inside the rq lock: */
 146        raw_spinlock_t          rt_runtime_lock;
 147        ktime_t                 rt_period;
 148        u64                     rt_runtime;
 149        struct hrtimer          rt_period_timer;
 150};
 151
 152static struct rt_bandwidth def_rt_bandwidth;
 153
 154static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 155
 156static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
 157{
 158        struct rt_bandwidth *rt_b =
 159                container_of(timer, struct rt_bandwidth, rt_period_timer);
 160        ktime_t now;
 161        int overrun;
 162        int idle = 0;
 163
 164        for (;;) {
 165                now = hrtimer_cb_get_time(timer);
 166                overrun = hrtimer_forward(timer, now, rt_b->rt_period);
 167
 168                if (!overrun)
 169                        break;
 170
 171                idle = do_sched_rt_period_timer(rt_b, overrun);
 172        }
 173
 174        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
 175}
 176
 177static
 178void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 179{
 180        rt_b->rt_period = ns_to_ktime(period);
 181        rt_b->rt_runtime = runtime;
 182
 183        raw_spin_lock_init(&rt_b->rt_runtime_lock);
 184
 185        hrtimer_init(&rt_b->rt_period_timer,
 186                        CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 187        rt_b->rt_period_timer.function = sched_rt_period_timer;
 188}
 189
 190static inline int rt_bandwidth_enabled(void)
 191{
 192        return sysctl_sched_rt_runtime >= 0;
 193}
 194
 195static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 196{
 197        ktime_t now;
 198
 199        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 200                return;
 201
 202        if (hrtimer_active(&rt_b->rt_period_timer))
 203                return;
 204
 205        raw_spin_lock(&rt_b->rt_runtime_lock);
 206        for (;;) {
 207                unsigned long delta;
 208                ktime_t soft, hard;
 209
 210                if (hrtimer_active(&rt_b->rt_period_timer))
 211                        break;
 212
 213                now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
 214                hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
 215
 216                soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
 217                hard = hrtimer_get_expires(&rt_b->rt_period_timer);
 218                delta = ktime_to_ns(ktime_sub(hard, soft));
 219                __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
 220                                HRTIMER_MODE_ABS_PINNED, 0);
 221        }
 222        raw_spin_unlock(&rt_b->rt_runtime_lock);
 223}
 224
 225#ifdef CONFIG_RT_GROUP_SCHED
 226static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
 227{
 228        hrtimer_cancel(&rt_b->rt_period_timer);
 229}
 230#endif
 231
 232/*
 233 * sched_domains_mutex serializes calls to arch_init_sched_domains,
 234 * detach_destroy_domains and partition_sched_domains.
 235 */
 236static DEFINE_MUTEX(sched_domains_mutex);
 237
 238#ifdef CONFIG_CGROUP_SCHED
 239
 240#include <linux/cgroup.h>
 241
 242struct cfs_rq;
 243
 244static LIST_HEAD(task_groups);
 245
 246/* task group related information */
 247struct task_group {
 248        struct cgroup_subsys_state css;
 249
 250#ifdef CONFIG_FAIR_GROUP_SCHED
 251        /* schedulable entities of this group on each cpu */
 252        struct sched_entity **se;
 253        /* runqueue "owned" by this group on each cpu */
 254        struct cfs_rq **cfs_rq;
 255        unsigned long shares;
 256#endif
 257
 258#ifdef CONFIG_RT_GROUP_SCHED
 259        struct sched_rt_entity **rt_se;
 260        struct rt_rq **rt_rq;
 261
 262        struct rt_bandwidth rt_bandwidth;
 263#endif
 264
 265        struct rcu_head rcu;
 266        struct list_head list;
 267
 268        struct task_group *parent;
 269        struct list_head siblings;
 270        struct list_head children;
 271};
 272
 273#define root_task_group init_task_group
 274
 275/* task_group_lock serializes add/remove of task groups and also changes to
 276 * a task group's cpu shares.
 277 */
 278static DEFINE_SPINLOCK(task_group_lock);
 279
 280#ifdef CONFIG_FAIR_GROUP_SCHED
 281
 282#ifdef CONFIG_SMP
 283static int root_task_group_empty(void)
 284{
 285        return list_empty(&root_task_group.children);
 286}
 287#endif
 288
 289# define INIT_TASK_GROUP_LOAD   NICE_0_LOAD
 290
 291/*
 292 * A weight of 0 or 1 can cause arithmetics problems.
 293 * A weight of a cfs_rq is the sum of weights of which entities
 294 * are queued on this cfs_rq, so a weight of a entity should not be
 295 * too large, so as the shares value of a task group.
 296 * (The default weight is 1024 - so there's no practical
 297 *  limitation from this.)
 298 */
 299#define MIN_SHARES      2
 300#define MAX_SHARES      (1UL << 18)
 301
 302static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 303#endif
 304
 305/* Default task group.
 306 *      Every task in system belong to this group at bootup.
 307 */
 308struct task_group init_task_group;
 309
 310#endif  /* CONFIG_CGROUP_SCHED */
 311
 312/* CFS-related fields in a runqueue */
 313struct cfs_rq {
 314        struct load_weight load;
 315        unsigned long nr_running;
 316
 317        u64 exec_clock;
 318        u64 min_vruntime;
 319
 320        struct rb_root tasks_timeline;
 321        struct rb_node *rb_leftmost;
 322
 323        struct list_head tasks;
 324        struct list_head *balance_iterator;
 325
 326        /*
 327         * 'curr' points to currently running entity on this cfs_rq.
 328         * It is set to NULL otherwise (i.e when none are currently running).
 329         */
 330        struct sched_entity *curr, *next, *last;
 331
 332        unsigned int nr_spread_over;
 333
 334#ifdef CONFIG_FAIR_GROUP_SCHED
 335        struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
 336
 337        /*
 338         * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
 339         * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
 340         * (like users, containers etc.)
 341         *
 342         * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
 343         * list is used during load balance.
 344         */
 345        struct list_head leaf_cfs_rq_list;
 346        struct task_group *tg;  /* group that "owns" this runqueue */
 347
 348#ifdef CONFIG_SMP
 349        /*
 350         * the part of load.weight contributed by tasks
 351         */
 352        unsigned long task_weight;
 353
 354        /*
 355         *   h_load = weight * f(tg)
 356         *
 357         * Where f(tg) is the recursive weight fraction assigned to
 358         * this group.
 359         */
 360        unsigned long h_load;
 361
 362        /*
 363         * this cpu's part of tg->shares
 364         */
 365        unsigned long shares;
 366
 367        /*
 368         * load.weight at the time we set shares
 369         */
 370        unsigned long rq_weight;
 371#endif
 372#endif
 373};
 374
 375/* Real-Time classes' related field in a runqueue: */
 376struct rt_rq {
 377        struct rt_prio_array active;
 378        unsigned long rt_nr_running;
 379#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 380        struct {
 381                int curr; /* highest queued rt task prio */
 382#ifdef CONFIG_SMP
 383                int next; /* next highest */
 384#endif
 385        } highest_prio;
 386#endif
 387#ifdef CONFIG_SMP
 388        unsigned long rt_nr_migratory;
 389        unsigned long rt_nr_total;
 390        int overloaded;
 391        struct plist_head pushable_tasks;
 392#endif
 393        int rt_throttled;
 394        u64 rt_time;
 395        u64 rt_runtime;
 396        /* Nests inside the rq lock: */
 397        raw_spinlock_t rt_runtime_lock;
 398
 399#ifdef CONFIG_RT_GROUP_SCHED
 400        unsigned long rt_nr_boosted;
 401
 402        struct rq *rq;
 403        struct list_head leaf_rt_rq_list;
 404        struct task_group *tg;
 405#endif
 406};
 407
 408#ifdef CONFIG_SMP
 409
 410/*
 411 * We add the notion of a root-domain which will be used to define per-domain
 412 * variables. Each exclusive cpuset essentially defines an island domain by
 413 * fully partitioning the member cpus from any other cpuset. Whenever a new
 414 * exclusive cpuset is created, we also create and attach a new root-domain
 415 * object.
 416 *
 417 */
 418struct root_domain {
 419        atomic_t refcount;
 420        cpumask_var_t span;
 421        cpumask_var_t online;
 422
 423        /*
 424         * The "RT overload" flag: it gets set if a CPU has more than
 425         * one runnable RT task.
 426         */
 427        cpumask_var_t rto_mask;
 428        atomic_t rto_count;
 429        struct cpupri cpupri;
 430};
 431
 432/*
 433 * By default the system creates a single root-domain with all cpus as
 434 * members (mimicking the global state we have today).
 435 */
 436static struct root_domain def_root_domain;
 437
 438#endif /* CONFIG_SMP */
 439
 440/*
 441 * This is the main, per-CPU runqueue data structure.
 442 *
 443 * Locking rule: those places that want to lock multiple runqueues
 444 * (such as the load balancing or the thread migration code), lock
 445 * acquire operations must be ordered by ascending &runqueue.
 446 */
 447struct rq {
 448        /* runqueue lock: */
 449        raw_spinlock_t lock;
 450
 451        /*
 452         * nr_running and cpu_load should be in the same cacheline because
 453         * remote CPUs use both these fields when doing load calculation.
 454         */
 455        unsigned long nr_running;
 456        #define CPU_LOAD_IDX_MAX 5
 457        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 458        unsigned long last_load_update_tick;
 459#ifdef CONFIG_NO_HZ
 460        u64 nohz_stamp;
 461        unsigned char nohz_balance_kick;
 462#endif
 463        unsigned int skip_clock_update;
 464
 465        /* capture load from *all* tasks on this cpu: */
 466        struct load_weight load;
 467        unsigned long nr_load_updates;
 468        u64 nr_switches;
 469
 470        struct cfs_rq cfs;
 471        struct rt_rq rt;
 472
 473#ifdef CONFIG_FAIR_GROUP_SCHED
 474        /* list of leaf cfs_rq on this cpu: */
 475        struct list_head leaf_cfs_rq_list;
 476#endif
 477#ifdef CONFIG_RT_GROUP_SCHED
 478        struct list_head leaf_rt_rq_list;
 479#endif
 480
 481        /*
 482         * This is part of a global counter where only the total sum
 483         * over all CPUs matters. A task can increase this counter on
 484         * one CPU and if it got migrated afterwards it may decrease
 485         * it on another CPU. Always updated under the runqueue lock:
 486         */
 487        unsigned long nr_uninterruptible;
 488
 489        struct task_struct *curr, *idle, *stop;
 490        unsigned long next_balance;
 491        struct mm_struct *prev_mm;
 492
 493        u64 clock;
 494        u64 clock_task;
 495
 496        atomic_t nr_iowait;
 497
 498#ifdef CONFIG_SMP
 499        struct root_domain *rd;
 500        struct sched_domain *sd;
 501
 502        unsigned long cpu_power;
 503
 504        unsigned char idle_at_tick;
 505        /* For active balancing */
 506        int post_schedule;
 507        int active_balance;
 508        int push_cpu;
 509        struct cpu_stop_work active_balance_work;
 510        /* cpu of this runqueue: */
 511        int cpu;
 512        int online;
 513
 514        unsigned long avg_load_per_task;
 515
 516        u64 rt_avg;
 517        u64 age_stamp;
 518        u64 idle_stamp;
 519        u64 avg_idle;
 520#endif
 521
 522#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 523        u64 prev_irq_time;
 524#endif
 525
 526        /* calc_load related fields */
 527        unsigned long calc_load_update;
 528        long calc_load_active;
 529
 530#ifdef CONFIG_SCHED_HRTICK
 531#ifdef CONFIG_SMP
 532        int hrtick_csd_pending;
 533        struct call_single_data hrtick_csd;
 534#endif
 535        struct hrtimer hrtick_timer;
 536#endif
 537
 538#ifdef CONFIG_SCHEDSTATS
 539        /* latency stats */
 540        struct sched_info rq_sched_info;
 541        unsigned long long rq_cpu_time;
 542        /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
 543
 544        /* sys_sched_yield() stats */
 545        unsigned int yld_count;
 546
 547        /* schedule() stats */
 548        unsigned int sched_switch;
 549        unsigned int sched_count;
 550        unsigned int sched_goidle;
 551
 552        /* try_to_wake_up() stats */
 553        unsigned int ttwu_count;
 554        unsigned int ttwu_local;
 555
 556        /* BKL stats */
 557        unsigned int bkl_count;
 558#endif
 559};
 560
 561static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 562
 563
 564static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
 565
 566static inline int cpu_of(struct rq *rq)
 567{
 568#ifdef CONFIG_SMP
 569        return rq->cpu;
 570#else
 571        return 0;
 572#endif
 573}
 574
 575#define rcu_dereference_check_sched_domain(p) \
 576        rcu_dereference_check((p), \
 577                              rcu_read_lock_sched_held() || \
 578                              lockdep_is_held(&sched_domains_mutex))
 579
 580/*
 581 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 582 * See detach_destroy_domains: synchronize_sched for details.
 583 *
 584 * The domain tree of any CPU may only be accessed from within
 585 * preempt-disabled sections.
 586 */
 587#define for_each_domain(cpu, __sd) \
 588        for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
 589
 590#define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
 591#define this_rq()               (&__get_cpu_var(runqueues))
 592#define task_rq(p)              cpu_rq(task_cpu(p))
 593#define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
 594#define raw_rq()                (&__raw_get_cpu_var(runqueues))
 595
 596#ifdef CONFIG_CGROUP_SCHED
 597
 598/*
 599 * Return the group to which this tasks belongs.
 600 *
 601 * We use task_subsys_state_check() and extend the RCU verification
 602 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
 603 * holds that lock for each task it moves into the cgroup. Therefore
 604 * by holding that lock, we pin the task to the current cgroup.
 605 */
 606static inline struct task_group *task_group(struct task_struct *p)
 607{
 608        struct cgroup_subsys_state *css;
 609
 610        if (p->flags & PF_EXITING)
 611                return &root_task_group;
 612
 613        css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
 614                        lockdep_is_held(&task_rq(p)->lock));
 615        return container_of(css, struct task_group, css);
 616}
 617
 618/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
 619static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 620{
 621#ifdef CONFIG_FAIR_GROUP_SCHED
 622        p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
 623        p->se.parent = task_group(p)->se[cpu];
 624#endif
 625
 626#ifdef CONFIG_RT_GROUP_SCHED
 627        p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
 628        p->rt.parent = task_group(p)->rt_se[cpu];
 629#endif
 630}
 631
 632#else /* CONFIG_CGROUP_SCHED */
 633
 634static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
 635static inline struct task_group *task_group(struct task_struct *p)
 636{
 637        return NULL;
 638}
 639
 640#endif /* CONFIG_CGROUP_SCHED */
 641
 642static void update_rq_clock_task(struct rq *rq, s64 delta);
 643
 644static void update_rq_clock(struct rq *rq)
 645{
 646        s64 delta;
 647
 648        if (rq->skip_clock_update)
 649                return;
 650
 651        delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
 652        rq->clock += delta;
 653        update_rq_clock_task(rq, delta);
 654}
 655
 656/*
 657 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 658 */
 659#ifdef CONFIG_SCHED_DEBUG
 660# define const_debug __read_mostly
 661#else
 662# define const_debug static const
 663#endif
 664
 665/**
 666 * runqueue_is_locked
 667 * @cpu: the processor in question.
 668 *
 669 * Returns true if the current cpu runqueue is locked.
 670 * This interface allows printk to be called with the runqueue lock
 671 * held and know whether or not it is OK to wake up the klogd.
 672 */
 673int runqueue_is_locked(int cpu)
 674{
 675        return raw_spin_is_locked(&cpu_rq(cpu)->lock);
 676}
 677
 678/*
 679 * Debugging: various feature bits
 680 */
 681
 682#define SCHED_FEAT(name, enabled)       \
 683        __SCHED_FEAT_##name ,
 684
 685enum {
 686#include "sched_features.h"
 687};
 688
 689#undef SCHED_FEAT
 690
 691#define SCHED_FEAT(name, enabled)       \
 692        (1UL << __SCHED_FEAT_##name) * enabled |
 693
 694const_debug unsigned int sysctl_sched_features =
 695#include "sched_features.h"
 696        0;
 697
 698#undef SCHED_FEAT
 699
 700#ifdef CONFIG_SCHED_DEBUG
 701#define SCHED_FEAT(name, enabled)       \
 702        #name ,
 703
 704static __read_mostly char *sched_feat_names[] = {
 705#include "sched_features.h"
 706        NULL
 707};
 708
 709#undef SCHED_FEAT
 710
 711static int sched_feat_show(struct seq_file *m, void *v)
 712{
 713        int i;
 714
 715        for (i = 0; sched_feat_names[i]; i++) {
 716                if (!(sysctl_sched_features & (1UL << i)))
 717                        seq_puts(m, "NO_");
 718                seq_printf(m, "%s ", sched_feat_names[i]);
 719        }
 720        seq_puts(m, "\n");
 721
 722        return 0;
 723}
 724
 725static ssize_t
 726sched_feat_write(struct file *filp, const char __user *ubuf,
 727                size_t cnt, loff_t *ppos)
 728{
 729        char buf[64];
 730        char *cmp;
 731        int neg = 0;
 732        int i;
 733
 734        if (cnt > 63)
 735                cnt = 63;
 736
 737        if (copy_from_user(&buf, ubuf, cnt))
 738                return -EFAULT;
 739
 740        buf[cnt] = 0;
 741        cmp = strstrip(buf);
 742
 743        if (strncmp(buf, "NO_", 3) == 0) {
 744                neg = 1;
 745                cmp += 3;
 746        }
 747
 748        for (i = 0; sched_feat_names[i]; i++) {
 749                if (strcmp(cmp, sched_feat_names[i]) == 0) {
 750                        if (neg)
 751                                sysctl_sched_features &= ~(1UL << i);
 752                        else
 753                                sysctl_sched_features |= (1UL << i);
 754                        break;
 755                }
 756        }
 757
 758        if (!sched_feat_names[i])
 759                return -EINVAL;
 760
 761        *ppos += cnt;
 762
 763        return cnt;
 764}
 765
 766static int sched_feat_open(struct inode *inode, struct file *filp)
 767{
 768        return single_open(filp, sched_feat_show, NULL);
 769}
 770
 771static const struct file_operations sched_feat_fops = {
 772        .open           = sched_feat_open,
 773        .write          = sched_feat_write,
 774        .read           = seq_read,
 775        .llseek         = seq_lseek,
 776        .release        = single_release,
 777};
 778
 779static __init int sched_init_debug(void)
 780{
 781        debugfs_create_file("sched_features", 0644, NULL, NULL,
 782                        &sched_feat_fops);
 783
 784        return 0;
 785}
 786late_initcall(sched_init_debug);
 787
 788#endif
 789
 790#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 791
 792/*
 793 * Number of tasks to iterate in a single balance run.
 794 * Limited because this is done with IRQs disabled.
 795 */
 796const_debug unsigned int sysctl_sched_nr_migrate = 32;
 797
 798/*
 799 * ratelimit for updating the group shares.
 800 * default: 0.25ms
 801 */
 802unsigned int sysctl_sched_shares_ratelimit = 250000;
 803unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
 804
 805/*
 806 * Inject some fuzzyness into changing the per-cpu group shares
 807 * this avoids remote rq-locks at the expense of fairness.
 808 * default: 4
 809 */
 810unsigned int sysctl_sched_shares_thresh = 4;
 811
 812/*
 813 * period over which we average the RT time consumption, measured
 814 * in ms.
 815 *
 816 * default: 1s
 817 */
 818const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
 819
 820/*
 821 * period over which we measure -rt task cpu usage in us.
 822 * default: 1s
 823 */
 824unsigned int sysctl_sched_rt_period = 1000000;
 825
 826static __read_mostly int scheduler_running;
 827
 828/*
 829 * part of the period that we allow rt tasks to run in us.
 830 * default: 0.95s
 831 */
 832int sysctl_sched_rt_runtime = 950000;
 833
 834static inline u64 global_rt_period(void)
 835{
 836        return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
 837}
 838
 839static inline u64 global_rt_runtime(void)
 840{
 841        if (sysctl_sched_rt_runtime < 0)
 842                return RUNTIME_INF;
 843
 844        return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 845}
 846
 847#ifndef prepare_arch_switch
 848# define prepare_arch_switch(next)      do { } while (0)
 849#endif
 850#ifndef finish_arch_switch
 851# define finish_arch_switch(prev)       do { } while (0)
 852#endif
 853
 854static inline int task_current(struct rq *rq, struct task_struct *p)
 855{
 856        return rq->curr == p;
 857}
 858
 859#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 860static inline int task_running(struct rq *rq, struct task_struct *p)
 861{
 862        return task_current(rq, p);
 863}
 864
 865static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 866{
 867}
 868
 869static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 870{
 871#ifdef CONFIG_DEBUG_SPINLOCK
 872        /* this is a valid case when another task releases the spinlock */
 873        rq->lock.owner = current;
 874#endif
 875        /*
 876         * If we are tracking spinlock dependencies then we have to
 877         * fix up the runqueue lock - which gets 'carried over' from
 878         * prev into current:
 879         */
 880        spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
 881
 882        raw_spin_unlock_irq(&rq->lock);
 883}
 884
 885#else /* __ARCH_WANT_UNLOCKED_CTXSW */
 886static inline int task_running(struct rq *rq, struct task_struct *p)
 887{
 888#ifdef CONFIG_SMP
 889        return p->oncpu;
 890#else
 891        return task_current(rq, p);
 892#endif
 893}
 894
 895static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 896{
 897#ifdef CONFIG_SMP
 898        /*
 899         * We can optimise this out completely for !SMP, because the
 900         * SMP rebalancing from interrupt is the only thing that cares
 901         * here.
 902         */
 903        next->oncpu = 1;
 904#endif
 905#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 906        raw_spin_unlock_irq(&rq->lock);
 907#else
 908        raw_spin_unlock(&rq->lock);
 909#endif
 910}
 911
 912static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 913{
 914#ifdef CONFIG_SMP
 915        /*
 916         * After ->oncpu is cleared, the task can be moved to a different CPU.
 917         * We must ensure this doesn't happen until the switch is completely
 918         * finished.
 919         */
 920        smp_wmb();
 921        prev->oncpu = 0;
 922#endif
 923#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 924        local_irq_enable();
 925#endif
 926}
 927#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 928
 929/*
 930 * Check whether the task is waking, we use this to synchronize ->cpus_allowed
 931 * against ttwu().
 932 */
 933static inline int task_is_waking(struct task_struct *p)
 934{
 935        return unlikely(p->state == TASK_WAKING);
 936}
 937
 938/*
 939 * __task_rq_lock - lock the runqueue a given task resides on.
 940 * Must be called interrupts disabled.
 941 */
 942static inline struct rq *__task_rq_lock(struct task_struct *p)
 943        __acquires(rq->lock)
 944{
 945        struct rq *rq;
 946
 947        for (;;) {
 948                rq = task_rq(p);
 949                raw_spin_lock(&rq->lock);
 950                if (likely(rq == task_rq(p)))
 951                        return rq;
 952                raw_spin_unlock(&rq->lock);
 953        }
 954}
 955
 956/*
 957 * task_rq_lock - lock the runqueue a given task resides on and disable
 958 * interrupts. Note the ordering: we can safely lookup the task_rq without
 959 * explicitly disabling preemption.
 960 */
 961static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 962        __acquires(rq->lock)
 963{
 964        struct rq *rq;
 965
 966        for (;;) {
 967                local_irq_save(*flags);
 968                rq = task_rq(p);
 969                raw_spin_lock(&rq->lock);
 970                if (likely(rq == task_rq(p)))
 971                        return rq;
 972                raw_spin_unlock_irqrestore(&rq->lock, *flags);
 973        }
 974}
 975
 976static void __task_rq_unlock(struct rq *rq)
 977        __releases(rq->lock)
 978{
 979        raw_spin_unlock(&rq->lock);
 980}
 981
 982static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
 983        __releases(rq->lock)
 984{
 985        raw_spin_unlock_irqrestore(&rq->lock, *flags);
 986}
 987
 988/*
 989 * this_rq_lock - lock this runqueue and disable interrupts.
 990 */
 991static struct rq *this_rq_lock(void)
 992        __acquires(rq->lock)
 993{
 994        struct rq *rq;
 995
 996        local_irq_disable();
 997        rq = this_rq();
 998        raw_spin_lock(&rq->lock);
 999
1000        return rq;
1001}
1002
1003#ifdef CONFIG_SCHED_HRTICK
1004/*
1005 * Use HR-timers to deliver accurate preemption points.
1006 *
1007 * Its all a bit involved since we cannot program an hrt while holding the
1008 * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
1009 * reschedule event.
1010 *
1011 * When we get rescheduled we reprogram the hrtick_timer outside of the
1012 * rq->lock.
1013 */
1014
1015/*
1016 * Use hrtick when:
1017 *  - enabled by features
1018 *  - hrtimer is actually high res
1019 */
1020static inline int hrtick_enabled(struct rq *rq)
1021{
1022        if (!sched_feat(HRTICK))
1023                return 0;
1024        if (!cpu_active(cpu_of(rq)))
1025                return 0;
1026        return hrtimer_is_hres_active(&rq->hrtick_timer);
1027}
1028
1029static void hrtick_clear(struct rq *rq)
1030{
1031        if (hrtimer_active(&rq->hrtick_timer))
1032                hrtimer_cancel(&rq->hrtick_timer);
1033}
1034
1035/*
1036 * High-resolution timer tick.
1037 * Runs from hardirq context with interrupts disabled.
1038 */
1039static enum hrtimer_restart hrtick(struct hrtimer *timer)
1040{
1041        struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1042
1043        WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1044
1045        raw_spin_lock(&rq->lock);
1046        update_rq_clock(rq);
1047        rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1048        raw_spin_unlock(&rq->lock);
1049
1050        return HRTIMER_NORESTART;
1051}
1052
1053#ifdef CONFIG_SMP
1054/*
1055 * called from hardirq (IPI) context
1056 */
1057static void __hrtick_start(void *arg)
1058{
1059        struct rq *rq = arg;
1060
1061        raw_spin_lock(&rq->lock);
1062        hrtimer_restart(&rq->hrtick_timer);
1063        rq->hrtick_csd_pending = 0;
1064        raw_spin_unlock(&rq->lock);
1065}
1066
1067/*
1068 * Called to set the hrtick timer state.
1069 *
1070 * called with rq->lock held and irqs disabled
1071 */
1072static void hrtick_start(struct rq *rq, u64 delay)
1073{
1074        struct hrtimer *timer = &rq->hrtick_timer;
1075        ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1076
1077        hrtimer_set_expires(timer, time);
1078
1079        if (rq == this_rq()) {
1080                hrtimer_restart(timer);
1081        } else if (!rq->hrtick_csd_pending) {
1082                __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1083                rq->hrtick_csd_pending = 1;
1084        }
1085}
1086
1087static int
1088hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1089{
1090        int cpu = (int)(long)hcpu;
1091
1092        switch (action) {
1093        case CPU_UP_CANCELED:
1094        case CPU_UP_CANCELED_FROZEN:
1095        case CPU_DOWN_PREPARE:
1096        case CPU_DOWN_PREPARE_FROZEN:
1097        case CPU_DEAD:
1098        case CPU_DEAD_FROZEN:
1099                hrtick_clear(cpu_rq(cpu));
1100                return NOTIFY_OK;
1101        }
1102
1103        return NOTIFY_DONE;
1104}
1105
1106static __init void init_hrtick(void)
1107{
1108        hotcpu_notifier(hotplug_hrtick, 0);
1109}
1110#else
1111/*
1112 * Called to set the hrtick timer state.
1113 *
1114 * called with rq->lock held and irqs disabled
1115 */
1116static void hrtick_start(struct rq *rq, u64 delay)
1117{
1118        __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1119                        HRTIMER_MODE_REL_PINNED, 0);
1120}
1121
1122static inline void init_hrtick(void)
1123{
1124}
1125#endif /* CONFIG_SMP */
1126
1127static void init_rq_hrtick(struct rq *rq)
1128{
1129#ifdef CONFIG_SMP
1130        rq->hrtick_csd_pending = 0;
1131
1132        rq->hrtick_csd.flags = 0;
1133        rq->hrtick_csd.func = __hrtick_start;
1134        rq->hrtick_csd.info = rq;
1135#endif
1136
1137        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1138        rq->hrtick_timer.function = hrtick;
1139}
1140#else   /* CONFIG_SCHED_HRTICK */
1141static inline void hrtick_clear(struct rq *rq)
1142{
1143}
1144
1145static inline void init_rq_hrtick(struct rq *rq)
1146{
1147}
1148
1149static inline void init_hrtick(void)
1150{
1151}
1152#endif  /* CONFIG_SCHED_HRTICK */
1153
1154/*
1155 * resched_task - mark a task 'to be rescheduled now'.
1156 *
1157 * On UP this means the setting of the need_resched flag, on SMP it
1158 * might also involve a cross-CPU call to trigger the scheduler on
1159 * the target CPU.
1160 */
1161#ifdef CONFIG_SMP
1162
1163#ifndef tsk_is_polling
1164#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1165#endif
1166
1167static void resched_task(struct task_struct *p)
1168{
1169        int cpu;
1170
1171        assert_raw_spin_locked(&task_rq(p)->lock);
1172
1173        if (test_tsk_need_resched(p))
1174                return;
1175
1176        set_tsk_need_resched(p);
1177
1178        cpu = task_cpu(p);
1179        if (cpu == smp_processor_id())
1180                return;
1181
1182        /* NEED_RESCHED must be visible before we test polling */
1183        smp_mb();
1184        if (!tsk_is_polling(p))
1185                smp_send_reschedule(cpu);
1186}
1187
1188static void resched_cpu(int cpu)
1189{
1190        struct rq *rq = cpu_rq(cpu);
1191        unsigned long flags;
1192
1193        if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1194                return;
1195        resched_task(cpu_curr(cpu));
1196        raw_spin_unlock_irqrestore(&rq->lock, flags);
1197}
1198
1199#ifdef CONFIG_NO_HZ
1200/*
1201 * In the semi idle case, use the nearest busy cpu for migrating timers
1202 * from an idle cpu.  This is good for power-savings.
1203 *
1204 * We don't do similar optimization for completely idle system, as
1205 * selecting an idle cpu will add more delays to the timers than intended
1206 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
1207 */
1208int get_nohz_timer_target(void)
1209{
1210        int cpu = smp_processor_id();
1211        int i;
1212        struct sched_domain *sd;
1213
1214        for_each_domain(cpu, sd) {
1215                for_each_cpu(i, sched_domain_span(sd))
1216                        if (!idle_cpu(i))
1217                                return i;
1218        }
1219        return cpu;
1220}
1221/*
1222 * When add_timer_on() enqueues a timer into the timer wheel of an
1223 * idle CPU then this timer might expire before the next timer event
1224 * which is scheduled to wake up that CPU. In case of a completely
1225 * idle system the next event might even be infinite time into the
1226 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1227 * leaves the inner idle loop so the newly added timer is taken into
1228 * account when the CPU goes back to idle and evaluates the timer
1229 * wheel for the next timer event.
1230 */
1231void wake_up_idle_cpu(int cpu)
1232{
1233        struct rq *rq = cpu_rq(cpu);
1234
1235        if (cpu == smp_processor_id())
1236                return;
1237
1238        /*
1239         * This is safe, as this function is called with the timer
1240         * wheel base lock of (cpu) held. When the CPU is on the way
1241         * to idle and has not yet set rq->curr to idle then it will
1242         * be serialized on the timer wheel base lock and take the new
1243         * timer into account automatically.
1244         */
1245        if (rq->curr != rq->idle)
1246                return;
1247
1248        /*
1249         * We can set TIF_RESCHED on the idle task of the other CPU
1250         * lockless. The worst case is that the other CPU runs the
1251         * idle task through an additional NOOP schedule()
1252         */
1253        set_tsk_need_resched(rq->idle);
1254
1255        /* NEED_RESCHED must be visible before we test polling */
1256        smp_mb();
1257        if (!tsk_is_polling(rq->idle))
1258                smp_send_reschedule(cpu);
1259}
1260
1261#endif /* CONFIG_NO_HZ */
1262
1263static u64 sched_avg_period(void)
1264{
1265        return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1266}
1267
1268static void sched_avg_update(struct rq *rq)
1269{
1270        s64 period = sched_avg_period();
1271
1272        while ((s64)(rq->clock - rq->age_stamp) > period) {
1273                /*
1274                 * Inline assembly required to prevent the compiler
1275                 * optimising this loop into a divmod call.
1276                 * See __iter_div_u64_rem() for another example of this.
1277                 */
1278                asm("" : "+rm" (rq->age_stamp));
1279                rq->age_stamp += period;
1280                rq->rt_avg /= 2;
1281        }
1282}
1283
1284static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1285{
1286        rq->rt_avg += rt_delta;
1287        sched_avg_update(rq);
1288}
1289
1290#else /* !CONFIG_SMP */
1291static void resched_task(struct task_struct *p)
1292{
1293        assert_raw_spin_locked(&task_rq(p)->lock);
1294        set_tsk_need_resched(p);
1295}
1296
1297static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1298{
1299}
1300
1301static void sched_avg_update(struct rq *rq)
1302{
1303}
1304#endif /* CONFIG_SMP */
1305
1306#if BITS_PER_LONG == 32
1307# define WMULT_CONST    (~0UL)
1308#else
1309# define WMULT_CONST    (1UL << 32)
1310#endif
1311
1312#define WMULT_SHIFT     32
1313
1314/*
1315 * Shift right and round:
1316 */
1317#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1318
1319/*
1320 * delta *= weight / lw
1321 */
1322static unsigned long
1323calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1324                struct load_weight *lw)
1325{
1326        u64 tmp;
1327
1328        if (!lw->inv_weight) {
1329                if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1330                        lw->inv_weight = 1;
1331                else
1332                        lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1333                                / (lw->weight+1);
1334        }
1335
1336        tmp = (u64)delta_exec * weight;
1337        /*
1338         * Check whether we'd overflow the 64-bit multiplication:
1339         */
1340        if (unlikely(tmp > WMULT_CONST))
1341                tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1342                        WMULT_SHIFT/2);
1343        else
1344                tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1345
1346        return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1347}
1348
1349static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1350{
1351        lw->weight += inc;
1352        lw->inv_weight = 0;
1353}
1354
1355static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1356{
1357        lw->weight -= dec;
1358        lw->inv_weight = 0;
1359}
1360
1361/*
1362 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1363 * of tasks with abnormal "nice" values across CPUs the contribution that
1364 * each task makes to its run queue's load is weighted according to its
1365 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1366 * scaled version of the new time slice allocation that they receive on time
1367 * slice expiry etc.
1368 */
1369
1370#define WEIGHT_IDLEPRIO                3
1371#define WMULT_IDLEPRIO         1431655765
1372
1373/*
1374 * Nice levels are multiplicative, with a gentle 10% change for every
1375 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1376 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1377 * that remained on nice 0.
1378 *
1379 * The "10% effect" is relative and cumulative: from _any_ nice level,
1380 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1381 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1382 * If a task goes up by ~10% and another task goes down by ~10% then
1383 * the relative distance between them is ~25%.)
1384 */
1385static const int prio_to_weight[40] = {
1386 /* -20 */     88761,     71755,     56483,     46273,     36291,
1387 /* -15 */     29154,     23254,     18705,     14949,     11916,
1388 /* -10 */      9548,      7620,      6100,      4904,      3906,
1389 /*  -5 */      3121,      2501,      1991,      1586,      1277,
1390 /*   0 */      1024,       820,       655,       526,       423,
1391 /*   5 */       335,       272,       215,       172,       137,
1392 /*  10 */       110,        87,        70,        56,        45,
1393 /*  15 */        36,        29,        23,        18,        15,
1394};
1395
1396/*
1397 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1398 *
1399 * In cases where the weight does not change often, we can use the
1400 * precalculated inverse to speed up arithmetics by turning divisions
1401 * into multiplications:
1402 */
1403static const u32 prio_to_wmult[40] = {
1404 /* -20 */     48388,     59856,     76040,     92818,    118348,
1405 /* -15 */    147320,    184698,    229616,    287308,    360437,
1406 /* -10 */    449829,    563644,    704093,    875809,   1099582,
1407 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
1408 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
1409 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
1410 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
1411 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1412};
1413
1414/* Time spent by the tasks of the cpu accounting group executing in ... */
1415enum cpuacct_stat_index {
1416        CPUACCT_STAT_USER,      /* ... user mode */
1417        CPUACCT_STAT_SYSTEM,    /* ... kernel mode */
1418
1419        CPUACCT_STAT_NSTATS,
1420};
1421
1422#ifdef CONFIG_CGROUP_CPUACCT
1423static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1424static void cpuacct_update_stats(struct task_struct *tsk,
1425                enum cpuacct_stat_index idx, cputime_t val);
1426#else
1427static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1428static inline void cpuacct_update_stats(struct task_struct *tsk,
1429                enum cpuacct_stat_index idx, cputime_t val) {}
1430#endif
1431
1432static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1433{
1434        update_load_add(&rq->load, load);
1435}
1436
1437static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1438{
1439        update_load_sub(&rq->load, load);
1440}
1441
1442#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1443typedef int (*tg_visitor)(struct task_group *, void *);
1444
1445/*
1446 * Iterate the full tree, calling @down when first entering a node and @up when
1447 * leaving it for the final time.
1448 */
1449static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1450{
1451        struct task_group *parent, *child;
1452        int ret;
1453
1454        rcu_read_lock();
1455        parent = &root_task_group;
1456down:
1457        ret = (*down)(parent, data);
1458        if (ret)
1459                goto out_unlock;
1460        list_for_each_entry_rcu(child, &parent->children, siblings) {
1461                parent = child;
1462                goto down;
1463
1464up:
1465                continue;
1466        }
1467        ret = (*up)(parent, data);
1468        if (ret)
1469                goto out_unlock;
1470
1471        child = parent;
1472        parent = parent->parent;
1473        if (parent)
1474                goto up;
1475out_unlock:
1476        rcu_read_unlock();
1477
1478        return ret;
1479}
1480
1481static int tg_nop(struct task_group *tg, void *data)
1482{
1483        return 0;
1484}
1485#endif
1486
1487#ifdef CONFIG_SMP
1488/* Used instead of source_load when we know the type == 0 */
1489static unsigned long weighted_cpuload(const int cpu)
1490{
1491        return cpu_rq(cpu)->load.weight;
1492}
1493
1494/*
1495 * Return a low guess at the load of a migration-source cpu weighted
1496 * according to the scheduling class and "nice" value.
1497 *
1498 * We want to under-estimate the load of migration sources, to
1499 * balance conservatively.
1500 */
1501static unsigned long source_load(int cpu, int type)
1502{
1503        struct rq *rq = cpu_rq(cpu);
1504        unsigned long total = weighted_cpuload(cpu);
1505
1506        if (type == 0 || !sched_feat(LB_BIAS))
1507                return total;
1508
1509        return min(rq->cpu_load[type-1], total);
1510}
1511
1512/*
1513 * Return a high guess at the load of a migration-target cpu weighted
1514 * according to the scheduling class and "nice" value.
1515 */
1516static unsigned long target_load(int cpu, int type)
1517{
1518        struct rq *rq = cpu_rq(cpu);
1519        unsigned long total = weighted_cpuload(cpu);
1520
1521        if (type == 0 || !sched_feat(LB_BIAS))
1522                return total;
1523
1524        return max(rq->cpu_load[type-1], total);
1525}
1526
1527static unsigned long power_of(int cpu)
1528{
1529        return cpu_rq(cpu)->cpu_power;
1530}
1531
1532static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1533
1534static unsigned long cpu_avg_load_per_task(int cpu)
1535{
1536        struct rq *rq = cpu_rq(cpu);
1537        unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1538
1539        if (nr_running)
1540                rq->avg_load_per_task = rq->load.weight / nr_running;
1541        else
1542                rq->avg_load_per_task = 0;
1543
1544        return rq->avg_load_per_task;
1545}
1546
1547#ifdef CONFIG_FAIR_GROUP_SCHED
1548
1549static __read_mostly unsigned long __percpu *update_shares_data;
1550
1551static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1552
1553/*
1554 * Calculate and set the cpu's group shares.
1555 */
1556static void update_group_shares_cpu(struct task_group *tg, int cpu,
1557                                    unsigned long sd_shares,
1558                                    unsigned long sd_rq_weight,
1559                                    unsigned long *usd_rq_weight)
1560{
1561        unsigned long shares, rq_weight;
1562        int boost = 0;
1563
1564        rq_weight = usd_rq_weight[cpu];
1565        if (!rq_weight) {
1566                boost = 1;
1567                rq_weight = NICE_0_LOAD;
1568        }
1569
1570        /*
1571         *             \Sum_j shares_j * rq_weight_i
1572         * shares_i =  -----------------------------
1573         *                  \Sum_j rq_weight_j
1574         */
1575        shares = (sd_shares * rq_weight) / sd_rq_weight;
1576        shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1577
1578        if (abs(shares - tg->se[cpu]->load.weight) >
1579                        sysctl_sched_shares_thresh) {
1580                struct rq *rq = cpu_rq(cpu);
1581                unsigned long flags;
1582
1583                raw_spin_lock_irqsave(&rq->lock, flags);
1584                tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1585                tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1586                __set_se_shares(tg->se[cpu], shares);
1587                raw_spin_unlock_irqrestore(&rq->lock, flags);
1588        }
1589}
1590
1591/*
1592 * Re-compute the task group their per cpu shares over the given domain.
1593 * This needs to be done in a bottom-up fashion because the rq weight of a
1594 * parent group depends on the shares of its child groups.
1595 */
1596static int tg_shares_up(struct task_group *tg, void *data)
1597{
1598        unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1599        unsigned long *usd_rq_weight;
1600        struct sched_domain *sd = data;
1601        unsigned long flags;
1602        int i;
1603
1604        if (!tg->se[0])
1605                return 0;
1606
1607        local_irq_save(flags);
1608        usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1609
1610        for_each_cpu(i, sched_domain_span(sd)) {
1611                weight = tg->cfs_rq[i]->load.weight;
1612                usd_rq_weight[i] = weight;
1613
1614                rq_weight += weight;
1615                /*
1616                 * If there are currently no tasks on the cpu pretend there
1617                 * is one of average load so that when a new task gets to
1618                 * run here it will not get delayed by group starvation.
1619                 */
1620                if (!weight)
1621                        weight = NICE_0_LOAD;
1622
1623                sum_weight += weight;
1624                shares += tg->cfs_rq[i]->shares;
1625        }
1626
1627        if (!rq_weight)
1628                rq_weight = sum_weight;
1629
1630        if ((!shares && rq_weight) || shares > tg->shares)
1631                shares = tg->shares;
1632
1633        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1634                shares = tg->shares;
1635
1636        for_each_cpu(i, sched_domain_span(sd))
1637                update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1638
1639        local_irq_restore(flags);
1640
1641        return 0;
1642}
1643
1644/*
1645 * Compute the cpu's hierarchical load factor for each task group.
1646 * This needs to be done in a top-down fashion because the load of a child
1647 * group is a fraction of its parents load.
1648 */
1649static int tg_load_down(struct task_group *tg, void *data)
1650{
1651        unsigned long load;
1652        long cpu = (long)data;
1653
1654        if (!tg->parent) {
1655                load = cpu_rq(cpu)->load.weight;
1656        } else {
1657                load = tg->parent->cfs_rq[cpu]->h_load;
1658                load *= tg->cfs_rq[cpu]->shares;
1659                load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1660        }
1661
1662        tg->cfs_rq[cpu]->h_load = load;
1663
1664        return 0;
1665}
1666
1667static void update_shares(struct sched_domain *sd)
1668{
1669        s64 elapsed;
1670        u64 now;
1671
1672        if (root_task_group_empty())
1673                return;
1674
1675        now = local_clock();
1676        elapsed = now - sd->last_update;
1677
1678        if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1679                sd->last_update = now;
1680                walk_tg_tree(tg_nop, tg_shares_up, sd);
1681        }
1682}
1683
1684static void update_h_load(long cpu)
1685{
1686        walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1687}
1688
1689#else
1690
1691static inline void update_shares(struct sched_domain *sd)
1692{
1693}
1694
1695#endif
1696
1697#ifdef CONFIG_PREEMPT
1698
1699static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1700
1701/*
1702 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1703 * way at the expense of forcing extra atomic operations in all
1704 * invocations.  This assures that the double_lock is acquired using the
1705 * same underlying policy as the spinlock_t on this architecture, which
1706 * reduces latency compared to the unfair variant below.  However, it
1707 * also adds more overhead and therefore may reduce throughput.
1708 */
1709static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1710        __releases(this_rq->lock)
1711        __acquires(busiest->lock)
1712        __acquires(this_rq->lock)
1713{
1714        raw_spin_unlock(&this_rq->lock);
1715        double_rq_lock(this_rq, busiest);
1716
1717        return 1;
1718}
1719
1720#else
1721/*
1722 * Unfair double_lock_balance: Optimizes throughput at the expense of
1723 * latency by eliminating extra atomic operations when the locks are
1724 * already in proper order on entry.  This favors lower cpu-ids and will
1725 * grant the double lock to lower cpus over higher ids under contention,
1726 * regardless of entry order into the function.
1727 */
1728static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1729        __releases(this_rq->lock)
1730        __acquires(busiest->lock)
1731        __acquires(this_rq->lock)
1732{
1733        int ret = 0;
1734
1735        if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1736                if (busiest < this_rq) {
1737                        raw_spin_unlock(&this_rq->lock);
1738                        raw_spin_lock(&busiest->lock);
1739                        raw_spin_lock_nested(&this_rq->lock,
1740                                              SINGLE_DEPTH_NESTING);
1741                        ret = 1;
1742                } else
1743                        raw_spin_lock_nested(&busiest->lock,
1744                                              SINGLE_DEPTH_NESTING);
1745        }
1746        return ret;
1747}
1748
1749#endif /* CONFIG_PREEMPT */
1750
1751/*
1752 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1753 */
1754static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1755{
1756        if (unlikely(!irqs_disabled())) {
1757                /* printk() doesn't work good under rq->lock */
1758                raw_spin_unlock(&this_rq->lock);
1759                BUG_ON(1);
1760        }
1761
1762        return _double_lock_balance(this_rq, busiest);
1763}
1764
1765static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1766        __releases(busiest->lock)
1767{
1768        raw_spin_unlock(&busiest->lock);
1769        lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1770}
1771
1772/*
1773 * double_rq_lock - safely lock two runqueues
1774 *
1775 * Note this does not disable interrupts like task_rq_lock,
1776 * you need to do so manually before calling.
1777 */
1778static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1779        __acquires(rq1->lock)
1780        __acquires(rq2->lock)
1781{
1782        BUG_ON(!irqs_disabled());
1783        if (rq1 == rq2) {
1784                raw_spin_lock(&rq1->lock);
1785                __acquire(rq2->lock);   /* Fake it out ;) */
1786        } else {
1787                if (rq1 < rq2) {
1788                        raw_spin_lock(&rq1->lock);
1789                        raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1790                } else {
1791                        raw_spin_lock(&rq2->lock);
1792                        raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1793                }
1794        }
1795}
1796
1797/*
1798 * double_rq_unlock - safely unlock two runqueues
1799 *
1800 * Note this does not restore interrupts like task_rq_unlock,
1801 * you need to do so manually after calling.
1802 */
1803static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1804        __releases(rq1->lock)
1805        __releases(rq2->lock)
1806{
1807        raw_spin_unlock(&rq1->lock);
1808        if (rq1 != rq2)
1809                raw_spin_unlock(&rq2->lock);
1810        else
1811                __release(rq2->lock);
1812}
1813
1814#endif
1815
1816#ifdef CONFIG_FAIR_GROUP_SCHED
1817static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1818{
1819#ifdef CONFIG_SMP
1820        cfs_rq->shares = shares;
1821#endif
1822}
1823#endif
1824
1825static void calc_load_account_idle(struct rq *this_rq);
1826static void update_sysctl(void);
1827static int get_update_sysctl_factor(void);
1828static void update_cpu_load(struct rq *this_rq);
1829
1830static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1831{
1832        set_task_rq(p, cpu);
1833#ifdef CONFIG_SMP
1834        /*
1835         * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1836         * successfully executed on another CPU. We must ensure that updates of
1837         * per-task data have been completed by this moment.
1838         */
1839        smp_wmb();
1840        task_thread_info(p)->cpu = cpu;
1841#endif
1842}
1843
1844static const struct sched_class rt_sched_class;
1845
1846#define sched_class_highest (&stop_sched_class)
1847#define for_each_class(class) \
1848   for (class = sched_class_highest; class; class = class->next)
1849
1850#include "sched_stats.h"
1851
1852static void inc_nr_running(struct rq *rq)
1853{
1854        rq->nr_running++;
1855}
1856
1857static void dec_nr_running(struct rq *rq)
1858{
1859        rq->nr_running--;
1860}
1861
1862static void set_load_weight(struct task_struct *p)
1863{
1864        /*
1865         * SCHED_IDLE tasks get minimal weight:
1866         */
1867        if (p->policy == SCHED_IDLE) {
1868                p->se.load.weight = WEIGHT_IDLEPRIO;
1869                p->se.load.inv_weight = WMULT_IDLEPRIO;
1870                return;
1871        }
1872
1873        p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1874        p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1875}
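
/*
 * Worked example (the exact weights are taken from the usual prio_to_weight[]
 * table and are an assumption of this note): nice 0 maps to a weight of 1024
 * and every nicer level shrinks the weight by roughly 1.25x.  Two CPU-bound
 * nice-0 tasks therefore split a CPU 50/50, while nice 0 versus nice 5
 * (weight ~335) works out to about 1024 / (1024 + 335) ~= 75% vs 25%.
 */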
1876
1877static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1878{
1879        update_rq_clock(rq);
1880        sched_info_queued(p);
1881        p->sched_class->enqueue_task(rq, p, flags);
1882        p->se.on_rq = 1;
1883}
1884
1885static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1886{
1887        update_rq_clock(rq);
1888        sched_info_dequeued(p);
1889        p->sched_class->dequeue_task(rq, p, flags);
1890        p->se.on_rq = 0;
1891}
1892
1893/*
1894 * activate_task - move a task to the runqueue.
1895 */
1896static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1897{
1898        if (task_contributes_to_load(p))
1899                rq->nr_uninterruptible--;
1900
1901        enqueue_task(rq, p, flags);
1902        inc_nr_running(rq);
1903}
1904
1905/*
1906 * deactivate_task - remove a task from the runqueue.
1907 */
1908static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1909{
1910        if (task_contributes_to_load(p))
1911                rq->nr_uninterruptible++;
1912
1913        dequeue_task(rq, p, flags);
1914        dec_nr_running(rq);
1915}
1916
1917#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1918
1919/*
1920 * There are no locks covering percpu hardirq/softirq time.
1921 * They are only modified in account_system_vtime, on corresponding CPU
1922 * with interrupts disabled. So, writes are safe.
1923 * They are read and saved off onto struct rq in update_rq_clock().
1924 * This may result in another CPU reading this CPU's irq time and racing
1925 * with irq/account_system_vtime on this CPU. We would either get the old or
1926 * the new value, with the side effect of accounting a slice of irq time to
1927 * the wrong task when an irq is in progress while we read rq->clock. That is
1928 * a worthy compromise in place of having locks on each irq in account_system_time.
1929 */
1930static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1931static DEFINE_PER_CPU(u64, cpu_softirq_time);
1932
1933static DEFINE_PER_CPU(u64, irq_start_time);
1934static int sched_clock_irqtime;
1935
1936void enable_sched_clock_irqtime(void)
1937{
1938        sched_clock_irqtime = 1;
1939}
1940
1941void disable_sched_clock_irqtime(void)
1942{
1943        sched_clock_irqtime = 0;
1944}
1945
1946#ifndef CONFIG_64BIT
1947static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1948
1949static inline void irq_time_write_begin(void)
1950{
1951        __this_cpu_inc(irq_time_seq.sequence);
1952        smp_wmb();
1953}
1954
1955static inline void irq_time_write_end(void)
1956{
1957        smp_wmb();
1958        __this_cpu_inc(irq_time_seq.sequence);
1959}
1960
1961static inline u64 irq_time_read(int cpu)
1962{
1963        u64 irq_time;
1964        unsigned seq;
1965
1966        do {
1967                seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1968                irq_time = per_cpu(cpu_softirq_time, cpu) +
1969                           per_cpu(cpu_hardirq_time, cpu);
1970        } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1971
1972        return irq_time;
1973}
1974#else /* CONFIG_64BIT */
1975static inline void irq_time_write_begin(void)
1976{
1977}
1978
1979static inline void irq_time_write_end(void)
1980{
1981}
1982
1983static inline u64 irq_time_read(int cpu)
1984{
1985        return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1986}
1987#endif /* CONFIG_64BIT */
1988
1989/*
1990 * Called before incrementing preempt_count on {soft,}irq_enter
1991 * and before decrementing preempt_count on {soft,}irq_exit.
1992 */
1993void account_system_vtime(struct task_struct *curr)
1994{
1995        unsigned long flags;
1996        s64 delta;
1997        int cpu;
1998
1999        if (!sched_clock_irqtime)
2000                return;
2001
2002        local_irq_save(flags);
2003
2004        cpu = smp_processor_id();
2005        delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
2006        __this_cpu_add(irq_start_time, delta);
2007
2008        irq_time_write_begin();
2009        /*
2010         * We do not account for softirq time from ksoftirqd here.
2011         * We want that time to keep being accounted to the ksoftirqd thread
2012         * itself, so as not to confuse the scheduler with a special task that
2013         * does not consume any time but still wants to run.
2014         */
2015        if (hardirq_count())
2016                __this_cpu_add(cpu_hardirq_time, delta);
2017        else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
2018                __this_cpu_add(cpu_softirq_time, delta);
2019
2020        irq_time_write_end();
2021        local_irq_restore(flags);
2022}
2023EXPORT_SYMBOL_GPL(account_system_vtime);
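
/*
 * Rough sketch of how the generic irq paths pair up with this hook (the
 * exact call sites live in the __irq_enter()/irq_exit() helpers outside this
 * file and are quoted from memory, so treat the details as assumptions):
 */
#if 0
        /* hardirq entry: */
        account_system_vtime(current);          /* close the previous slice */
        add_preempt_count(HARDIRQ_OFFSET);      /* hardirq_count() now set */
        /* ... the handler runs; its time will land in cpu_hardirq_time ... */
        /* hardirq exit: */
        account_system_vtime(current);          /* charge the hardirq slice */
        sub_preempt_count(HARDIRQ_OFFSET);
#endif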
2024
2025static void update_rq_clock_task(struct rq *rq, s64 delta)
2026{
2027        s64 irq_delta;
2028
2029        irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
2030
2031        /*
2032         * Since irq_time is only updated on {soft,}irq_exit, we might run into
2033         * this case when a previous update_rq_clock() happened inside a
2034         * {soft,}irq region.
2035         *
2036         * When this happens, we stop ->clock_task and only update the
2037         * prev_irq_time stamp to account for the part that fit, so that a next
2038         * update will consume the rest. This ensures ->clock_task is
2039         * monotonic.
2040         *
2041         * It does however cause some slight misattribution of {soft,}irq
2042         * time; a more accurate solution would be to update the irq_time using
2043         * the current rq->clock timestamp, except that would require using
2044         * atomic ops.
2045         */
2046        if (irq_delta > delta)
2047                irq_delta = delta;
2048
2049        rq->prev_irq_time += irq_delta;
2050        delta -= irq_delta;
2051        rq->clock_task += delta;
2052
2053        if (irq_delta && sched_feat(NONIRQ_POWER))
2054                sched_rt_avg_update(rq, irq_delta);
2055}
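
/*
 * Worked example of the clamp above: suppose rq->clock advanced by
 * delta = 2,000,000 ns since the last update, but the accumulated irq time
 * grew by irq_delta = 3,000,000 ns because the previous update ran inside a
 * hardirq.  irq_delta is clamped to 2,000,000 ns, so clock_task does not
 * advance this round, prev_irq_time absorbs the 2 ms, and the remaining 1 ms
 * of irq time is consumed by the next update, which keeps clock_task
 * monotonic.
 */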
2056
2057#else /* CONFIG_IRQ_TIME_ACCOUNTING */
2058
2059static void update_rq_clock_task(struct rq *rq, s64 delta)
2060{
2061        rq->clock_task += delta;
2062}
2063
2064#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2065
2066#include "sched_idletask.c"
2067#include "sched_fair.c"
2068#include "sched_rt.c"
2069#include "sched_stoptask.c"
2070#ifdef CONFIG_SCHED_DEBUG
2071# include "sched_debug.c"
2072#endif
2073
2074void sched_set_stop_task(int cpu, struct task_struct *stop)
2075{
2076        struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2077        struct task_struct *old_stop = cpu_rq(cpu)->stop;
2078
2079        if (stop) {
2080                /*
2081                 * Make it appear like a SCHED_FIFO task; it's something
2082                 * userspace knows about and won't get confused about.
2083                 *
2084                 * Also, it will make PI more or less work without too
2085                 * much confusion -- but then, stop work should not
2086                 * rely on PI working anyway.
2087                 */
2088                sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2089
2090                stop->sched_class = &stop_sched_class;
2091        }
2092
2093        cpu_rq(cpu)->stop = stop;
2094
2095        if (old_stop) {
2096                /*
2097                 * Reset it back to a normal scheduling class so that
2098                 * it can die in pieces.
2099                 */
2100                old_stop->sched_class = &rt_sched_class;
2101        }
2102}
2103
2104/*
2105 * __normal_prio - return the priority that is based on the static prio
2106 */
2107static inline int __normal_prio(struct task_struct *p)
2108{
2109        return p->static_prio;
2110}
2111
2112/*
2113 * Calculate the expected normal priority: i.e. priority
2114 * without taking RT-inheritance into account. Might be
2115 * boosted by interactivity modifiers. Changes upon fork,
2116 * setprio syscalls, and whenever the interactivity
2117 * estimator recalculates.
2118 */
2119static inline int normal_prio(struct task_struct *p)
2120{
2121        int prio;
2122
2123        if (task_has_rt_policy(p))
2124                prio = MAX_RT_PRIO-1 - p->rt_priority;
2125        else
2126                prio = __normal_prio(p);
2127        return prio;
2128}
2129
2130/*
2131 * Calculate the current priority, i.e. the priority
2132 * taken into account by the scheduler. This value might
2133 * be boosted by RT tasks, or might be boosted by
2134 * interactivity modifiers. Will be RT if the task got
2135 * RT-boosted. If not then it returns p->normal_prio.
2136 */
2137static int effective_prio(struct task_struct *p)
2138{
2139        p->normal_prio = normal_prio(p);
2140        /*
2141         * If we are RT tasks or we were boosted to RT priority,
2142         * keep the priority unchanged. Otherwise, update priority
2143         * to the normal priority:
2144         */
2145        if (!rt_prio(p->prio))
2146                return p->normal_prio;
2147        return p->prio;
2148}
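
/*
 * Worked example (assuming the usual MAX_RT_PRIO of 100): a SCHED_FIFO task
 * with rt_priority 50 gets normal_prio = 100 - 1 - 50 = 49, while a
 * SCHED_NORMAL task at nice 0 keeps its static_prio of 120.  If that nice-0
 * task is later PI-boosted by an rt-priority waiter, effective_prio() leaves
 * the boosted ->prio (below 100) untouched until the boost is undone.
 */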
2149
2150/**
2151 * task_curr - is this task currently executing on a CPU?
2152 * @p: the task in question.
2153 */
2154inline int task_curr(const struct task_struct *p)
2155{
2156        return cpu_curr(task_cpu(p)) == p;
2157}
2158
2159static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2160                                       const struct sched_class *prev_class,
2161                                       int oldprio, int running)
2162{
2163        if (prev_class != p->sched_class) {
2164                if (prev_class->switched_from)
2165                        prev_class->switched_from(rq, p, running);
2166                p->sched_class->switched_to(rq, p, running);
2167        } else
2168                p->sched_class->prio_changed(rq, p, oldprio, running);
2169}
2170
2171static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2172{
2173        const struct sched_class *class;
2174
2175        if (p->sched_class == rq->curr->sched_class) {
2176                rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2177        } else {
2178                for_each_class(class) {
2179                        if (class == rq->curr->sched_class)
2180                                break;
2181                        if (class == p->sched_class) {
2182                                resched_task(rq->curr);
2183                                break;
2184                        }
2185                }
2186        }
2187
2188        /*
2189         * A queue event has occurred, and we're going to schedule.  In
2190         * this case, we can save a useless back-to-back clock update.
2191         */
2192        if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
2193                rq->skip_clock_update = 1;
2194}
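
/*
 * Example of the class walk above, assuming the class order used by this
 * kernel (stop -> rt -> fair -> idle): when an RT task wakes on a runqueue
 * whose current task is SCHED_NORMAL, the loop reaches the waking task's RT
 * class before the current task's fair class, so rq->curr is rescheduled.
 * When a fair task wakes while an RT task runs, the loop hits the RT class
 * first and breaks without preempting.
 */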
2195
2196#ifdef CONFIG_SMP
2197/*
2198 * Is this task likely cache-hot:
2199 */
2200static int
2201task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2202{
2203        s64 delta;
2204
2205        if (p->sched_class != &fair_sched_class)
2206                return 0;
2207
2208        if (unlikely(p->policy == SCHED_IDLE))
2209                return 0;
2210
2211        /*
2212         * Buddy candidates are cache hot:
2213         */
2214        if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2215                        (&p->se == cfs_rq_of(&p->se)->next ||
2216                         &p->se == cfs_rq_of(&p->se)->last))
2217                return 1;
2218
2219        if (sysctl_sched_migration_cost == -1)
2220                return 1;
2221        if (sysctl_sched_migration_cost == 0)
2222                return 0;
2223
2224        delta = now - p->se.exec_start;
2225
2226        return delta < (s64)sysctl_sched_migration_cost;
2227}
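
/*
 * Example: with the default sysctl_sched_migration_cost of 0.5 ms (an
 * assumption here; the knob is tunable at runtime), a fair task that last
 * ran within the past 0.5 ms is reported as cache-hot, and the load
 * balancer will prefer not to migrate it.
 */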
2228
2229void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2230{
2231#ifdef CONFIG_SCHED_DEBUG
2232        /*
2233         * We should never call set_task_cpu() on a blocked task,
2234         * ttwu() will sort out the placement.
2235         */
2236        WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2237                        !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2238#endif
2239
2240        trace_sched_migrate_task(p, new_cpu);
2241
2242        if (task_cpu(p) != new_cpu) {
2243                p->se.nr_migrations++;
2244                perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2245        }
2246
2247        __set_task_cpu(p, new_cpu);
2248}
2249
2250struct migration_arg {
2251        struct task_struct *task;
2252        int dest_cpu;
2253};
2254
2255static int migration_cpu_stop(void *data);
2256
2257/*
2258 * The task's runqueue lock must be held.
2259 * Returns true if you have to wait for the migration thread.
2260 */
2261static bool migrate_task(struct task_struct *p, int dest_cpu)
2262{
2263        struct rq *rq = task_rq(p);
2264
2265        /*
2266         * If the task is not on a runqueue (and not running), then
2267         * the next wake-up will properly place the task.
2268         */
2269        return p->se.on_rq || task_running(rq, p);
2270}
2271
2272/*
2273 * wait_task_inactive - wait for a thread to unschedule.
2274 *
2275 * If @match_state is nonzero, it's the @p->state value just checked and
2276 * not expected to change.  If it changes, i.e. @p might have woken up,
2277 * then return zero.  When we succeed in waiting for @p to be off its CPU,
2278 * we return a positive number (its total switch count).  If a second call
2279 * a short while later returns the same number, the caller can be sure that
2280 * @p has remained unscheduled the whole time.
2281 *
2282 * The caller must ensure that the task *will* unschedule sometime soon,
2283 * else this function might spin for a *long* time. This function can't
2284 * be called with interrupts off, or it may introduce deadlock with
2285 * smp_call_function() if an IPI is sent by the same process we are
2286 * waiting to become inactive.
2287 */
2288unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2289{
2290        unsigned long flags;
2291        int running, on_rq;
2292        unsigned long ncsw;
2293        struct rq *rq;
2294
2295        for (;;) {
2296                /*
2297                 * We do the initial early heuristics without holding
2298                 * any task-queue locks at all. We'll only try to get
2299                 * the runqueue lock when things look like they will
2300                 * work out!
2301                 */
2302                rq = task_rq(p);
2303
2304                /*
2305                 * If the task is actively running on another CPU
2306                 * still, just relax and busy-wait without holding
2307                 * any locks.
2308                 *
2309                 * NOTE! Since we don't hold any locks, it's not
2310                 * even sure that "rq" stays as the right runqueue!
2311                 * But we don't care, since "task_running()" will
2312                 * return false if the runqueue has changed and p
2313                 * is actually now running somewhere else!
2314                 */
2315                while (task_running(rq, p)) {
2316                        if (match_state && unlikely(p->state != match_state))
2317                                return 0;
2318                        cpu_relax();
2319                }
2320
2321                /*
2322                 * Ok, time to look more closely! We need the rq
2323                 * lock now, to be *sure*. If we're wrong, we'll
2324                 * just go back and repeat.
2325                 */
2326                rq = task_rq_lock(p, &flags);
2327                trace_sched_wait_task(p);
2328                running = task_running(rq, p);
2329                on_rq = p->se.on_rq;
2330                ncsw = 0;
2331                if (!match_state || p->state == match_state)
2332                        ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2333                task_rq_unlock(rq, &flags);
2334
2335                /*
2336                 * If it changed from the expected state, bail out now.
2337                 */
2338                if (unlikely(!ncsw))
2339                        break;
2340
2341                /*
2342                 * Was it really running after all now that we
2343                 * checked with the proper locks actually held?
2344                 *
2345                 * Oops. Go back and try again..
2346                 */
2347                if (unlikely(running)) {
2348                        cpu_relax();
2349                        continue;
2350                }
2351
2352                /*
2353                 * It's not enough that it's not actively running,
2354                 * it must be off the runqueue _entirely_, and not
2355                 * preempted!
2356                 *
2357                 * So if it was still runnable (but just not actively
2358                 * running right now), it's preempted, and we should
2359                 * yield - it could be a while.
2360                 */
2361                if (unlikely(on_rq)) {
2362                        schedule_timeout_uninterruptible(1);
2363                        continue;
2364                }
2365
2366                /*
2367                 * Ahh, all good. It wasn't running, and it wasn't
2368                 * runnable, which means that it will never become
2369                 * running in the future either. We're all done!
2370                 */
2371                break;
2372        }
2373
2374        return ncsw;
2375}
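
/*
 * Usage sketch for the switch-count contract described above (illustrative
 * only; task_stayed_off_cpu() is a made-up helper, not something defined in
 * this file):
 */
#if 0
static int task_stayed_off_cpu(struct task_struct *p)
{
        long state = p->state;          /* the state we just observed */
        unsigned long ncsw;

        ncsw = wait_task_inactive(p, state);
        if (!ncsw)
                return 0;               /* @p changed state, e.g. woke up */
        /* ... inspect the now-unscheduled task ... */
        return wait_task_inactive(p, state) == ncsw;
}
#endif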
2376
2377/***
2378 * kick_process - kick a running thread to enter/exit the kernel
2379 * @p: the to-be-kicked thread
2380 *
2381 * Cause a process which is running on another CPU to enter
2382 * kernel-mode, without any delay (e.g. to get signals handled).
2383 *
2384 * NOTE: this function doesn't have to take the runqueue lock,
2385 * because all it wants to ensure is that the remote task enters
2386 * the kernel. If the IPI races and the task has been migrated
2387 * to another CPU then no harm is done and the purpose has been
2388 * achieved as well.
2389 */
2390void kick_process(struct task_struct *p)
2391{
2392        int cpu;
2393
2394        preempt_disable();
2395        cpu = task_cpu(p);
2396        if ((cpu != smp_processor_id()) && task_curr(p))
2397                smp_send_reschedule(cpu);
2398        preempt_enable();
2399}
2400EXPORT_SYMBOL_GPL(kick_process);
2401#endif /* CONFIG_SMP */
2402
2403/**
2404 * task_oncpu_function_call - call a function on the cpu on which a task runs
2405 * @p:          the task to evaluate
2406 * @func:       the function to be called
2407 * @info:       the function call argument
2408 *
2409 * Calls @func on the CPU on which @p runs, but only if @p is currently
2410 * running.  This might be the current CPU, in which case @func is called directly.
2411 */
2412void task_oncpu_function_call(struct task_struct *p,
2413                              void (*func) (void *info), void *info)
2414{
2415        int cpu;
2416
2417        preempt_disable();
2418        cpu = task_cpu(p);
2419        if (task_curr(p))
2420                smp_call_function_single(cpu, func, info, 1);
2421        preempt_enable();
2422}
2423
2424#ifdef CONFIG_SMP
2425/*
2426 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
2427 */
2428static int select_fallback_rq(int cpu, struct task_struct *p)
2429{
2430        int dest_cpu;
2431        const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2432
2433        /* Look for allowed, online CPU in same node. */
2434        for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2435                if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2436                        return dest_cpu;
2437
2438        /* Any allowed, online CPU? */
2439        dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2440        if (dest_cpu < nr_cpu_ids)
2441                return dest_cpu;
2442
2443        /* No more Mr. Nice Guy. */
2444        if (unlikely(dest_cpu >= nr_cpu_ids)) {
2445                dest_cpu = cpuset_cpus_allowed_fallback(p);
2446                /*
2447                 * Don't tell them about moving exiting tasks or
2448                 * kernel threads (both mm NULL), since they never
2449                 * leave the kernel.
2450                 */
2451                if (p->mm && printk_ratelimit()) {
2452                        printk(KERN_INFO "process %d (%s) no "
2453                               "longer affine to cpu%d\n",
2454                               task_pid_nr(p), p->comm, cpu);
2455                }
2456        }
2457
2458        return dest_cpu;
2459}
2460
2461/*
2462 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
2463 */
2464static inline
2465int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
2466{
2467        int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
2468
2469        /*
2470         * In order not to call set_task_cpu() on a blocking task we need
2471         * to rely on ttwu() to place the task on a valid ->cpus_allowed
2472         * cpu.
2473         *
2474         * Since this is common to all placement strategies, this lives here.
2475         *
2476         * [ this allows ->select_task() to simply return task_cpu(p) and
2477         *   not worry about this generic constraint ]
2478         */
2479        if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2480                     !cpu_online(cpu)))
2481                cpu = select_fallback_rq(task_cpu(p), p);
2482
2483        return cpu;
2484}
2485
2486static void update_avg(u64 *avg, u64 sample)
2487{
2488        s64 diff = sample - *avg;
2489        *avg += diff >> 3;
2490}
2491#endif
2492
2493static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
2494                                 bool is_sync, bool is_migrate, bool is_local,
2495                                 unsigned long en_flags)
2496{
2497        schedstat_inc(p, se.statistics.nr_wakeups);
2498        if (is_sync)
2499                schedstat_inc(p, se.statistics.nr_wakeups_sync);
2500        if (is_migrate)
2501                schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2502        if (is_local)
2503                schedstat_inc(p, se.statistics.nr_wakeups_local);
2504        else
2505                schedstat_inc(p, se.statistics.nr_wakeups_remote);
2506
2507        activate_task(rq, p, en_flags);
2508}
2509
2510static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2511                                        int wake_flags, bool success)
2512{
2513        trace_sched_wakeup(p, success);
2514        check_preempt_curr(rq, p, wake_flags);
2515
2516        p->state = TASK_RUNNING;
2517#ifdef CONFIG_SMP
2518        if (p->sched_class->task_woken)
2519                p->sched_class->task_woken(rq, p);
2520
2521        if (unlikely(rq->idle_stamp)) {
2522                u64 delta = rq->clock - rq->idle_stamp;
2523                u64 max = 2*sysctl_sched_migration_cost;
2524
2525                if (delta > max)
2526                        rq->avg_idle = max;
2527                else
2528                        update_avg(&rq->avg_idle, delta);
2529                rq->idle_stamp = 0;
2530        }
2531#endif
2532        /* if a worker is waking up, notify workqueue */
2533        if ((p->flags & PF_WQ_WORKER) && success)
2534                wq_worker_waking_up(p, cpu_of(rq));
2535}
2536
2537/**
2538 * try_to_wake_up - wake up a thread
2539 * @p: the thread to be awakened
2540 * @state: the mask of task states that can be woken
2541 * @wake_flags: wake modifier flags (WF_*)
2542 *
2543 * Put it on the run-queue if it's not already there. The "current"
2544 * thread is always on the run-queue (except when the actual
2545 * re-schedule is in progress), and as such you're allowed to do
2546 * the simpler "current->state = TASK_RUNNING" to mark yourself
2547 * runnable without the overhead of this.
2548 *
2549 * Returns %true if @p was woken up, %false if it was already running
2550 * or @state didn't match @p's state.
2551 */
2552static int try_to_wake_up(struct task_struct *p, unsigned int state,
2553                          int wake_flags)
2554{
2555        int cpu, orig_cpu, this_cpu, success = 0;
2556        unsigned long flags;
2557        unsigned long en_flags = ENQUEUE_WAKEUP;
2558        struct rq *rq;
2559
2560        this_cpu = get_cpu();
2561
2562        smp_wmb();
2563        rq = task_rq_lock(p, &flags);
2564        if (!(p->state & state))
2565                goto out;
2566
2567        if (p->se.on_rq)
2568                goto out_running;
2569
2570        cpu = task_cpu(p);
2571        orig_cpu = cpu;
2572
2573#ifdef CONFIG_SMP
2574        if (unlikely(task_running(rq, p)))
2575                goto out_activate;
2576
2577        /*
2578         * In order to handle concurrent wakeups and release the rq->lock
2579         * we put the task in TASK_WAKING state.
2580         *
2581         * First fix up the nr_uninterruptible count:
2582         */
2583        if (task_contributes_to_load(p)) {
2584                if (likely(cpu_online(orig_cpu)))
2585                        rq->nr_uninterruptible--;
2586                else
2587                        this_rq()->nr_uninterruptible--;
2588        }
2589        p->state = TASK_WAKING;
2590
2591        if (p->sched_class->task_waking) {
2592                p->sched_class->task_waking(rq, p);
2593                en_flags |= ENQUEUE_WAKING;
2594        }
2595
2596        cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2597        if (cpu != orig_cpu)
2598                set_task_cpu(p, cpu);
2599        __task_rq_unlock(rq);
2600
2601        rq = cpu_rq(cpu);
2602        raw_spin_lock(&rq->lock);
2603
2604        /*
2605         * We migrated the task without holding either rq->lock, however
2606         * since the task is not on the task list itself, nobody else
2607         * will try and migrate the task, hence the rq should match the
2608         * cpu we just moved it to.
2609         */
2610        WARN_ON(task_cpu(p) != cpu);
2611        WARN_ON(p->state != TASK_WAKING);
2612
2613#ifdef CONFIG_SCHEDSTATS
2614        schedstat_inc(rq, ttwu_count);
2615        if (cpu == this_cpu)
2616                schedstat_inc(rq, ttwu_local);
2617        else {
2618                struct sched_domain *sd;
2619                for_each_domain(this_cpu, sd) {
2620                        if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2621                                schedstat_inc(sd, ttwu_wake_remote);
2622                                break;
2623                        }
2624                }
2625        }
2626#endif /* CONFIG_SCHEDSTATS */
2627
2628out_activate:
2629#endif /* CONFIG_SMP */
2630        ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
2631                      cpu == this_cpu, en_flags);
2632        success = 1;
2633out_running:
2634        ttwu_post_activation(p, rq, wake_flags, success);
2635out:
2636        task_rq_unlock(rq, &flags);
2637        put_cpu();
2638
2639        return success;
2640}
2641
2642/**
2643 * try_to_wake_up_local - try to wake up a local task with rq lock held
2644 * @p: the thread to be awakened
2645 *
2646 * Put @p on the run-queue if it's not already there.  The caller must
2647 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2648 * the current task.  this_rq() stays locked over invocation.
2649 */
2650static void try_to_wake_up_local(struct task_struct *p)
2651{
2652        struct rq *rq = task_rq(p);
2653        bool success = false;
2654
2655        BUG_ON(rq != this_rq());
2656        BUG_ON(p == current);
2657        lockdep_assert_held(&rq->lock);
2658
2659        if (!(p->state & TASK_NORMAL))
2660                return;
2661
2662        if (!p->se.on_rq) {
2663                if (likely(!task_running(rq, p))) {
2664                        schedstat_inc(rq, ttwu_count);
2665                        schedstat_inc(rq, ttwu_local);
2666                }
2667                ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
2668                success = true;
2669        }
2670        ttwu_post_activation(p, rq, 0, success);
2671}
2672
2673/**
2674 * wake_up_process - Wake up a specific process
2675 * @p: The process to be woken up.
2676 *
2677 * Attempt to wake up the nominated process and move it to the set of runnable
2678 * processes.  Returns 1 if the process was woken up, 0 if it was already
2679 * running.
2680 *
2681 * It may be assumed that this function implies a write memory barrier before
2682 * changing the task state if and only if any tasks are woken up.
2683 */
2684int wake_up_process(struct task_struct *p)
2685{
2686        return try_to_wake_up(p, TASK_ALL, 0);
2687}
2688EXPORT_SYMBOL(wake_up_process);
2689
2690int wake_up_state(struct task_struct *p, unsigned int state)
2691{
2692        return try_to_wake_up(p, state, 0);
2693}
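
/*
 * Classic sleeper/waker pattern built on the above (a sketch; 'condition'
 * and 'sleeper' are placeholders, not names from this file):
 */
#if 0
        /* sleeper: */
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (condition)
                        break;
                schedule();
        }
        __set_current_state(TASK_RUNNING);

        /* waker: */
        condition = 1;
        wake_up_process(sleeper);       /* returns 0 if already running */
#endif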
2694
2695/*
2696 * Perform scheduler related setup for a newly forked process p.
2697 * p is forked by current.
2698 *
2699 * __sched_fork() is basic setup used by init_idle() too:
2700 */
2701static void __sched_fork(struct task_struct *p)
2702{
2703        p->se.exec_start                = 0;
2704        p->se.sum_exec_runtime          = 0;
2705        p->se.prev_sum_exec_runtime     = 0;
2706        p->se.nr_migrations             = 0;
2707
2708#ifdef CONFIG_SCHEDSTATS
2709        memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2710#endif
2711
2712        INIT_LIST_HEAD(&p->rt.run_list);
2713        p->se.on_rq = 0;
2714        INIT_LIST_HEAD(&p->se.group_node);
2715
2716#ifdef CONFIG_PREEMPT_NOTIFIERS
2717        INIT_HLIST_HEAD(&p->preempt_notifiers);
2718#endif
2719}
2720
2721/*
2722 * fork()/clone()-time setup:
2723 */
2724void sched_fork(struct task_struct *p, int clone_flags)
2725{
2726        int cpu = get_cpu();
2727
2728        __sched_fork(p);
2729        /*
2730         * We mark the process as running here. This guarantees that
2731         * nobody will actually run it, and a signal or other external
2732         * event cannot wake it up and insert it on the runqueue either.
2733         */
2734        p->state = TASK_RUNNING;
2735
2736        /*
2737         * Revert to default priority/policy on fork if requested.
2738         */
2739        if (unlikely(p->sched_reset_on_fork)) {
2740                if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2741                        p->policy = SCHED_NORMAL;
2742                        p->normal_prio = p->static_prio;
2743                }
2744
2745                if (PRIO_TO_NICE(p->static_prio) < 0) {
2746                        p->static_prio = NICE_TO_PRIO(0);
2747                        p->normal_prio = p->static_prio;
2748                        set_load_weight(p);
2749                }
2750
2751                /*
2752                 * We don't need the reset flag anymore after the fork. It has
2753                 * fulfilled its duty:
2754                 */
2755                p->sched_reset_on_fork = 0;
2756        }
2757
2758        /*
2759         * Make sure we do not leak PI boosting priority to the child.
2760         */
2761        p->prio = current->normal_prio;
2762
2763        if (!rt_prio(p->prio))
2764                p->sched_class = &fair_sched_class;
2765
2766        if (p->sched_class->task_fork)
2767                p->sched_class->task_fork(p);
2768
2769        /*
2770         * The child is not yet in the pid-hash so no cgroup attach races,
2771         * and the cgroup is pinned to this child because cgroup_fork()
2772         * is run before sched_fork().
2773         *
2774         * Silence PROVE_RCU.
2775         */
2776        rcu_read_lock();
2777        set_task_cpu(p, cpu);
2778        rcu_read_unlock();
2779
2780#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2781        if (likely(sched_info_on()))
2782                memset(&p->sched_info, 0, sizeof(p->sched_info));
2783#endif
2784#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2785        p->oncpu = 0;
2786#endif
2787#ifdef CONFIG_PREEMPT
2788        /* Want to start with kernel preemption disabled. */
2789        task_thread_info(p)->preempt_count = 1;
2790#endif
2791        plist_node_init(&p->pushable_tasks, MAX_PRIO);
2792
2793        put_cpu();
2794}
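
/*
 * Userspace view of the reset-on-fork handling above (a sketch; as I
 * understand the interface, the SCHED_RESET_ON_FORK flag is OR'ed into the
 * policy passed to sched_setscheduler(2)):
 */
#if 0
        struct sched_param sp = { .sched_priority = 10 };

        /* become SCHED_FIFO now, but have children revert to the defaults */
        sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp);
        if (fork() == 0) {
                /*
                 * child: sched_fork() saw p->sched_reset_on_fork and reset
                 * the policy to SCHED_NORMAL (and any negative nice to 0).
                 */
        }
#endif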
2795
2796/*
2797 * wake_up_new_task - wake up a newly created task for the first time.
2798 *
2799 * This function will do some initial scheduler statistics housekeeping
2800 * that must be done for every newly created context, then puts the task
2801 * on the runqueue and wakes it.
2802 */
2803void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2804{
2805        unsigned long flags;
2806        struct rq *rq;
2807        int cpu __maybe_unused = get_cpu();
2808
2809#ifdef CONFIG_SMP
2810        rq = task_rq_lock(p, &flags);
2811        p->state = TASK_WAKING;
2812
2813        /*
2814         * Fork balancing, do it here and not earlier because:
2815         *  - cpus_allowed can change in the fork path
2816         *  - any previously selected cpu might disappear through hotplug
2817         *
2818         * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2819         * without people poking at ->cpus_allowed.
2820         */
2821        cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
2822        set_task_cpu(p, cpu);
2823
2824        p->state = TASK_RUNNING;
2825        task_rq_unlock(rq, &flags);
2826#endif
2827
2828        rq = task_rq_lock(p, &flags);
2829        activate_task(rq, p, 0);
2830        trace_sched_wakeup_new(p, 1);
2831        check_preempt_curr(rq, p, WF_FORK);
2832#ifdef CONFIG_SMP
2833        if (p->sched_class->task_woken)
2834                p->sched_class->task_woken(rq, p);
2835#endif
2836        task_rq_unlock(rq, &flags);
2837        put_cpu();
2838}
2839
2840#ifdef CONFIG_PREEMPT_NOTIFIERS
2841
2842/**
2843 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2844 * @notifier: notifier struct to register
2845 */
2846void preempt_notifier_register(struct preempt_notifier *notifier)
2847{
2848        hlist_add_head(&notifier->link, &current->preempt_notifiers);
2849}
2850EXPORT_SYMBOL_GPL(preempt_notifier_register);
2851
2852/**
2853 * preempt_notifier_unregister - no longer interested in preemption notifications
2854 * @notifier: notifier struct to unregister
2855 *
2856 * This is safe to call from within a preemption notifier.
2857 */
2858void preempt_notifier_unregister(struct preempt_notifier *notifier)
2859{
2860        hlist_del(&notifier->link);
2861}
2862EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
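
/*
 * Usage sketch (the preempt_ops layout is taken from <linux/preempt.h> as I
 * recall it; the my_* names are made up for illustration):
 */
#if 0
static void my_sched_in(struct preempt_notifier *pn, int cpu)
{
        /* current was just scheduled back in on @cpu */
}

static void my_sched_out(struct preempt_notifier *pn, struct task_struct *next)
{
        /* current is about to be switched out in favour of @next */
}

static struct preempt_ops my_preempt_ops = {
        .sched_in       = my_sched_in,
        .sched_out      = my_sched_out,
};

static struct preempt_notifier my_notifier;

static void my_enable_notifications(void)
{
        /* called by the task that wants to hear about its own switches */
        preempt_notifier_init(&my_notifier, &my_preempt_ops);
        preempt_notifier_register(&my_notifier);
}
#endif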
2863
2864static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2865{
2866        struct preempt_notifier *notifier;
2867        struct hlist_node *node;
2868
2869        hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2870                notifier->ops->sched_in(notifier, raw_smp_processor_id());
2871}
2872
2873static void
2874fire_sched_out_preempt_notifiers(struct task_struct *curr,
2875                                 struct task_struct *next)
2876{
2877        struct preempt_notifier *notifier;
2878        struct hlist_node *node;
2879
2880        hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2881                notifier->ops->sched_out(notifier, next);
2882}
2883
2884#else /* !CONFIG_PREEMPT_NOTIFIERS */
2885
2886static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2887{
2888}
2889
2890static void
2891fire_sched_out_preempt_notifiers(struct task_struct *curr,
2892                                 struct task_struct *next)
2893{
2894}
2895
2896#endif /* CONFIG_PREEMPT_NOTIFIERS */
2897
2898/**
2899 * prepare_task_switch - prepare to switch tasks
2900 * @rq: the runqueue preparing to switch
2901 * @prev: the current task that is being switched out
2902 * @next: the task we are going to switch to.
2903 *
2904 * This is called with the rq lock held and interrupts off. It must
2905 * be paired with a subsequent finish_task_switch after the context
2906 * switch.
2907 *
2908 * prepare_task_switch sets up locking and calls architecture specific
2909 * hooks.
2910 */
2911static inline void
2912prepare_task_switch(struct rq *rq, struct task_struct *prev,
2913                    struct task_struct *next)
2914{
2915        fire_sched_out_preempt_notifiers(prev, next);
2916        prepare_lock_switch(rq, next);
2917        prepare_arch_switch(next);
2918}
2919
2920/**
2921 * finish_task_switch - clean up after a task-switch
2922 * @rq: runqueue associated with task-switch
2923 * @prev: the thread we just switched away from.
2924 *
2925 * finish_task_switch must be called after the context switch, paired
2926 * with a prepare_task_switch call before the context switch.
2927 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2928 * and do any other architecture-specific cleanup actions.
2929 *
2930 * Note that we may have delayed dropping an mm in context_switch(). If
2931 * so, we finish that here outside of the runqueue lock. (Doing it
2932 * with the lock held can cause deadlocks; see schedule() for
2933 * details.)
2934 */
2935static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2936        __releases(rq->lock)
2937{
2938        struct mm_struct *mm = rq->prev_mm;
2939        long prev_state;
2940
2941        rq->prev_mm = NULL;
2942
2943        /*
2944         * A task struct has one reference for the use as "current".
2945         * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2946         * schedule one last time. The schedule call will never return, and
2947         * the scheduled task must drop that reference.
2948         * The test for TASK_DEAD must occur while the runqueue locks are
2949         * still held, otherwise prev could be scheduled on another cpu, die
2950         * there before we look at prev->state, and then the reference would
2951         * be dropped twice.
2952         *              Manfred Spraul <manfred@colorfullife.com>
2953         */
2954        prev_state = prev->state;
2955        finish_arch_switch(prev);
2956#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2957        local_irq_disable();
2958#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2959        perf_event_task_sched_in(current);
2960#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2961        local_irq_enable();
2962#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2963        finish_lock_switch(rq, prev);
2964
2965        fire_sched_in_preempt_notifiers(current);
2966        if (mm)
2967                mmdrop(mm);
2968        if (unlikely(prev_state == TASK_DEAD)) {
2969                /*
2970                 * Remove function-return probe instances associated with this
2971                 * task and put them back on the free list.
2972                 */
2973                kprobe_flush_task(prev);
2974                put_task_struct(prev);
2975        }
2976}
2977
2978#ifdef CONFIG_SMP
2979
2980/* assumes rq->lock is held */
2981static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2982{
2983        if (prev->sched_class->pre_schedule)
2984                prev->sched_class->pre_schedule(rq, prev);
2985}
2986
2987/* rq->lock is NOT held, but preemption is disabled */
2988static inline void post_schedule(struct rq *rq)
2989{
2990        if (rq->post_schedule) {
2991                unsigned long flags;
2992
2993                raw_spin_lock_irqsave(&rq->lock, flags);
2994                if (rq->curr->sched_class->post_schedule)
2995                        rq->curr->sched_class->post_schedule(rq);
2996                raw_spin_unlock_irqrestore(&rq->lock, flags);
2997
2998                rq->post_schedule = 0;
2999        }
3000}
3001
3002#else
3003
3004static inline void pre_schedule(struct rq *rq, struct task_struct *p)
3005{
3006}
3007
3008static inline void post_schedule(struct rq *rq)
3009{
3010}
3011
3012#endif
3013
3014/**
3015 * schedule_tail - first thing a freshly forked thread must call.
3016 * @prev: the thread we just switched away from.
3017 */
3018asmlinkage void schedule_tail(struct task_struct *prev)
3019        __releases(rq->lock)
3020{
3021        struct rq *rq = this_rq();
3022
3023        finish_task_switch(rq, prev);
3024
3025        /*
3026         * FIXME: do we need to worry about rq being invalidated by the
3027         * task_switch?
3028         */
3029        post_schedule(rq);
3030
3031#ifdef __ARCH_WANT_UNLOCKED_CTXSW
3032        /* In this case, finish_task_switch does not reenable preemption */
3033        preempt_enable();
3034#endif
3035        if (current->set_child_tid)
3036                put_user(task_pid_vnr(current), current->set_child_tid);
3037}
3038
3039/*
3040 * context_switch - switch to the new MM and the new
3041 * thread's register state.
3042 */
3043static inline void
3044context_switch(struct rq *rq, struct task_struct *prev,
3045               struct task_struct *next)
3046{
3047        struct mm_struct *mm, *oldmm;
3048
3049        prepare_task_switch(rq, prev, next);
3050        trace_sched_switch(prev, next);
3051        mm = next->mm;
3052        oldmm = prev->active_mm;
3053        /*
3054         * For paravirt, this is coupled with an exit in switch_to to
3055         * combine the page table reload and the switch backend into
3056         * one hypercall.
3057         */
3058        arch_start_context_switch(prev);
3059
3060        if (!mm) {
3061                next->active_mm = oldmm;
3062                atomic_inc(&oldmm->mm_count);
3063                enter_lazy_tlb(oldmm, next);
3064        } else
3065                switch_mm(oldmm, mm, next);
3066
3067        if (!prev->mm) {
3068                prev->active_mm = NULL;
3069                rq->prev_mm = oldmm;
3070        }
3071        /*
3072         * The runqueue lock will be released by the next
3073         * task (which is an invalid locking op, but in the case
3074         * of the scheduler it's an obvious special case), so we
3075         * do an early lockdep release here:
3076         */
3077#ifndef __ARCH_WANT_UNLOCKED_CTXSW
3078        spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3079#endif
3080
3081        /* Here we just switch the register state and the stack. */
3082        switch_to(prev, next, prev);
3083
3084        barrier();
3085        /*
3086         * this_rq must be evaluated again because prev may have moved
3087         * CPUs since it called schedule(), thus the 'rq' on its stack
3088         * frame will be invalid.
3089         */
3090        finish_task_switch(this_rq(), prev);
3091}
3092
3093/*
3094 * nr_running, nr_uninterruptible and nr_context_switches:
3095 *
3096 * externally visible scheduler statistics: current number of runnable
3097 * threads, current number of uninterruptible-sleeping threads, total
3098 * number of context switches performed since bootup.
3099 */
3100unsigned long nr_running(void)
3101{
3102        unsigned long i, sum = 0;
3103
3104        for_each_online_cpu(i)
3105                sum += cpu_rq(i)->nr_running;
3106
3107        return sum;
3108}
3109
3110unsigned long nr_uninterruptible(void)
3111{
3112        unsigned long i, sum = 0;
3113
3114        for_each_possible_cpu(i)
3115                sum += cpu_rq(i)->nr_uninterruptible;
3116
3117        /*
3118         * Since we read the counters locklessly, the sum might be slightly
3119         * inaccurate. Do not allow it to go below zero though:
3120         */
3121        if (unlikely((long)sum < 0))
3122                sum = 0;
3123
3124        return sum;
3125}
3126
3127unsigned long long nr_context_switches(void)
3128{
3129        int i;
3130        unsigned long long sum = 0;
3131
3132        for_each_possible_cpu(i)
3133                sum += cpu_rq(i)->nr_switches;
3134
3135        return sum;
3136}
3137
3138unsigned long nr_iowait(void)
3139{
3140        unsigned long i, sum = 0;
3141
3142        for_each_possible_cpu(i)
3143                sum += atomic_read(&cpu_rq(i)->nr_iowait);
3144
3145        return sum;
3146}
3147
3148unsigned long nr_iowait_cpu(int cpu)
3149{
3150        struct rq *this = cpu_rq(cpu);
3151        return atomic_read(&this->nr_iowait);
3152}
3153
3154unsigned long this_cpu_load(void)
3155{
3156        struct rq *this = this_rq();
3157        return this->cpu_load[0];
3158}
3159
3160
3161/* Variables and functions for calc_load */
3162static atomic_long_t calc_load_tasks;
3163static unsigned long calc_load_update;
3164unsigned long avenrun[3];
3165EXPORT_SYMBOL(avenrun);
3166
3167static long calc_load_fold_active(struct rq *this_rq)
3168{
3169        long nr_active, delta = 0;
3170
3171        nr_active = this_rq->nr_running;
3172        nr_active += (long) this_rq->nr_uninterruptible;
3173
3174        if (nr_active != this_rq->calc_load_active) {
3175                delta = nr_active - this_rq->calc_load_active;
3176                this_rq->calc_load_active = nr_active;
3177        }
3178
3179        return delta;
3180}
3181
3182static unsigned long
3183calc_load(unsigned long load, unsigned long exp, unsigned long active)
3184{
3185        load *= exp;
3186        load += active * (FIXED_1 - exp);
3187        load += 1UL << (FSHIFT - 1);
3188        return load >> FSHIFT;
3189}
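
/*
 * Worked example (using the usual fixed-point constants FSHIFT = 11,
 * FIXED_1 = 2048 and EXP_1 = 1884 from <linux/sched.h>): starting from
 * avenrun[0] = 0 with two runnable tasks, active = 2 * 2048 = 4096, and one
 * LOAD_FREQ interval gives
 *      (0 * 1884 + 4096 * (2048 - 1884) + 1024) >> 11 = 328,
 * i.e. a 1-minute load average of 328 / 2048 ~= 0.16 after the first 5 s.
 */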
3190
3191#ifdef CONFIG_NO_HZ
3192/*
3193 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
3194 *
3195 * When making the ILB scale, we should try to pull this in as well.
3196 */
3197static atomic_long_t calc_load_tasks_idle;
3198
3199static void calc_load_account_idle(struct rq *this_rq)
3200{
3201        long delta;
3202
3203        delta = calc_load_fold_active(this_rq);
3204        if (delta)
3205                atomic_long_add(delta, &calc_load_tasks_idle);
3206}
3207
3208static long calc_load_fold_idle(void)
3209{
3210        long delta = 0;
3211
3212        /*
3213         * It's got a race, we don't care...
3214         */
3215        if (atomic_long_read(&calc_load_tasks_idle))
3216                delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
3217
3218        return delta;
3219}
3220
3221/**
3222 * fixed_power_int - compute: x^n, in O(log n) time
3223 *
3224 * @x:         base of the power
3225 * @frac_bits: fractional bits of @x
3226 * @n:         power to raise @x to.
3227 *
3228 * By exploiting the relation between the definition of the natural power
3229 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
3230 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3231 * (where: n_i \elem {0, 1}, the binary vector representing n),
3232 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3233 * of course trivially computable in O(log_2 n), the length of our binary
3234 * vector.
3235 */
3236static unsigned long
3237fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3238{
3239        unsigned long result = 1UL << frac_bits;
3240
3241        if (n) for (;;) {
3242                if (n & 1) {
3243                        result *= x;
3244                        result += 1UL << (frac_bits - 1);
3245                        result >>= frac_bits;
3246                }
3247                n >>= 1;
3248                if (!n)
3249                        break;
3250                x *= x;
3251                x += 1UL << (frac_bits - 1);
3252                x >>= frac_bits;
3253        }
3254
3255        return result;
3256}
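
/*
 * Worked example of the loop above: for n = 6 (binary 110) the code squares
 * x into x^2 and then x^4, multiplying the running result only on the set
 * bits, so result = x^2 * x^4 = x^6 after two result multiplies and two
 * squarings instead of five plain multiplies.  Every multiply is followed by
 * the usual fixed-point renormalisation (add half, shift right by @frac_bits).
 */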
3257
3258/*
3259 * a1 = a0 * e + a * (1 - e)
3260 *
3261 * a2 = a1 * e + a * (1 - e)
3262 *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3263 *    = a0 * e^2 + a * (1 - e) * (1 + e)
3264 *
3265 * a3 = a2 * e + a * (1 - e)
3266 *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3267 *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3268 *
3269 *  ...
3270 *
3271 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
3272 *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3273 *    = a0 * e^n + a * (1 - e^n)
3274 *
3275 * [1] application of the geometric series:
3276 *
3277 *              n         1 - x^(n+1)
3278 *     S_n := \Sum x^i = -------------
3279 *             i=0          1 - x
3280 */
3281static unsigned long
3282calc_load_n(unsigned long load, unsigned long exp,
3283            unsigned long active, unsigned int n)
3284{
3285
3286        return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3287}
3288
3289/*
3290 * NO_HZ can leave us missing all the per-cpu ticks that would call
3291 * calc_load_account_active(), but since an idle CPU folds its delta into
3292 * calc_load_tasks_idle via calc_load_account_idle(), all we need to do is fold
3293 * in the pending idle delta if our idle period crossed a load cycle boundary.
3294 *
3295 * Once we've updated the global active value, we need to apply the exponential
3296 * weights adjusted to the number of cycles missed.
3297 */
3298static void calc_global_nohz(unsigned long ticks)
3299{
3300        long delta, active, n;
3301
3302        if (time_before(jiffies, calc_load_update))
3303                return;
3304
3305        /*
3306         * If we crossed a calc_load_update boundary, make sure to fold
3307         * any pending idle changes, the respective CPUs might have
3308         * missed the tick driven calc_load_account_active() update
3309         * due to NO_HZ.
3310         */
3311        delta = calc_load_fold_idle();
3312        if (delta)
3313                atomic_long_add(delta, &calc_load_tasks);
3314
3315        /*
3316         * If we were idle for multiple load cycles, apply them.
3317         */
3318        if (ticks >= LOAD_FREQ) {
3319                n = ticks / LOAD_FREQ;
3320
3321                active = atomic_long_read(&calc_load_tasks);
3322                active = active > 0 ? active * FIXED_1 : 0;
3323
3324                avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3325                avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3326                avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3327
3328                calc_load_update += n * LOAD_FREQ;
3329        }
3330
3331        /*
3332         * It's possible the remainder of the above division also crosses
3333         * a LOAD_FREQ period; the regular check in calc_global_load(),
3334         * which comes after this, will take care of that.
3335         *
3336         * Consider us being 11 ticks before a cycle completion, and us
3337         * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
3338         * age us 4 cycles, and the test in calc_global_load() will
3339         * pick up the final one.
3340         */
3341}
3342#else
3343static void calc_load_account_idle(struct rq *this_rq)
3344{
3345}
3346
3347static inline long calc_load_fold_idle(void)
3348{
3349        return 0;
3350}
3351
3352static void calc_global_nohz(unsigned long ticks)
3353{
3354}
3355#endif
3356
3357/**
3358 * get_avenrun - get the load average array
3359 * @loads:      pointer to dest load array
3360 * @offset:     offset to add
3361 * @shift:      shift count to shift the result left
3362 *
3363 * These values are estimates at best, so no need for locking.
3364 */
3365void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3366{
3367        loads[0] = (avenrun[0] + offset) << shift;
3368        loads[1] = (avenrun[1] + offset) << shift;
3369        loads[2] = (avenrun[2] + offset) << shift;
3370}
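
/*
 * Sketch of the main consumer, roughly what fs/proc/loadavg.c does (the
 * LOAD_INT()/LOAD_FRAC() helpers and the FIXED_1/200 rounding offset are
 * quoted from memory and should be treated as assumptions):
 */
#if 0
        unsigned long avnrun[3];

        get_avenrun(avnrun, FIXED_1/200, 0);    /* offset rounds to 2 decimals */
        seq_printf(m,   /* m: the seq_file being generated */
                   "%lu.%02lu %lu.%02lu %lu.%02lu ...\n",
                   LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
                   LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
                   LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));
#endif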
3371
3372/*
3373 * calc_global_load - update the avenrun load estimates 10 ticks after the
3374 * CPUs have updated calc_load_tasks.
3375 */
3376void calc_global_load(unsigned long ticks)
3377{
3378        long active;
3379
3380        calc_global_nohz(ticks);
3381
3382        if (time_before(jiffies, calc_load_update + 10))
3383                return;
3384
3385        active = atomic_long_read(&calc_load_tasks);
3386        active = active > 0 ? active * FIXED_1 : 0;
3387
3388        avenrun[0] = calc_load(avenrun[0], EXP_1, active);
3389        avenrun[1] = calc_load(avenrun[1], EXP_5, active);
3390        avenrun[2] = calc_load(avenrun[2], EXP_15, active);
3391
3392        calc_load_update += LOAD_FREQ;
3393}
3394
3395/*
3396 * Called from update_cpu_load_active() to periodically update this CPU's
3397 * active count.
3398 */
3399static void calc_load_account_active(struct rq *this_rq)
3400{
3401        long delta;
3402
3403        if (time_before(jiffies, this_rq->calc_load_update))
3404                return;
3405
3406        delta  = calc_load_fold_active(this_rq);
3407        delta += calc_load_fold_idle();
3408        if (delta)
3409                atomic_long_add(delta, &calc_load_tasks);
3410
3411        this_rq->calc_load_update += LOAD_FREQ;
3412}
3413
3414/*
3415 * The exact cpuload at various idx values, calculated at every tick, would be
3416 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
3417 *
3418 * If a cpu misses updates for n-1 ticks (as it was idle) and the update gets
3419 * called on the nth tick, when the cpu may be busy, then we have:
3420 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3421 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3422 *
3423 * decay_load_missed() below does efficient calculation of
3424 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3425 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3426 *
3427 * The calculation is approximated on a 128 point scale.
3428 * degrade_zero_ticks is the number of ticks after which load at any
3429 * particular idx is approximated to be zero.
3430 * degrade_factor is a precomputed table, a row for each load idx.
3431 * Each column corresponds to degradation factor for a power of two ticks,
3432 * based on 128 point scale.
3433 * Example:
3434 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3435 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
3436 *
3437 * With this power of 2 load factors, we can degrade the load n times
3438 * by looking at 1 bits in n and doing as many mult/shift instead of
3439 * n mult/shifts needed by the exact degradation.
3440 */
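/*
 * To make the per-tick recursion above concrete: for idx = 2 each update
 * computes load = 3/4 * load + 1/4 * cur_load.  Starting from 0 with a
 * constant cur_load of 128, successive updates in update_cpu_load() below
 * give 32, 56, 74, 88, ... (the division is rounded up while the load is
 * rising), converging on 128; larger idx values converge more slowly and
 * therefore give a longer-term view of the load.  Illustrative numbers only.
 */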
3441#define DEGRADE_SHIFT           7
3442static const unsigned char
3443                degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3444static const unsigned char
3445                degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3446                                        {0, 0, 0, 0, 0, 0, 0, 0},
3447                                        {64, 32, 8, 0, 0, 0, 0, 0},
3448                                        {96, 72, 40, 12, 1, 0, 0},
3449                                        {112, 98, 75, 43, 15, 1, 0},
3450                                        {120, 112, 98, 76, 45, 16, 2} };
3451
3452/*
3453 * Update cpu_load for any missed ticks due to tickless idle. The backlog
3454 * builds up while the CPU is idle, so we just decay the old load without
3455 * adding any new load.
3456 */
3457static unsigned long
3458decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3459{
3460        int j = 0;
3461
3462        if (!missed_updates)
3463                return load;
3464
3465        if (missed_updates >= degrade_zero_ticks[idx])
3466                return 0;
3467
3468        if (idx == 1)
3469                return load >> missed_updates;
3470
3471        while (missed_updates) {
3472                if (missed_updates % 2)
3473                        load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3474
3475                missed_updates >>= 1;
3476                j++;
3477        }
3478        return load;
3479}
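/*
 * Walking through the bit decomposition above: for idx = 2 and
 * missed_updates = 9 (binary 1001) the loop multiplies the load by
 * degrade_factor[2][0] = 96/128 (1 tick) and degrade_factor[2][3] = 12/128
 * (8 ticks), giving load * ~0.07, a close approximation of the exact
 * (3/4)^9 ~= 0.075.
 */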
3480
3481/*
3482 * Update rq->cpu_load[] statistics. This function is usually called every
3483 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3484 * every tick. We fix it up based on jiffies.
3485 */
3486static void update_cpu_load(struct rq *this_rq)
3487{
3488        unsigned long this_load = this_rq->load.weight;
3489        unsigned long curr_jiffies = jiffies;
3490        unsigned long pending_updates;
3491        int i, scale;
3492
3493        this_rq->nr_load_updates++;
3494
3495        /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3496        if (curr_jiffies == this_rq->last_load_update_tick)
3497                return;
3498
3499        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3500        this_rq->last_load_update_tick = curr_jiffies;
3501
3502        /* Update our load: */
3503        this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3504        for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3505                unsigned long old_load, new_load;
3506
3507                /* scale is effectively 1 << i now, and >> i divides by scale */
3508
3509                old_load = this_rq->cpu_load[i];
3510                old_load = decay_load_missed(old_load, pending_updates - 1, i);
3511                new_load = this_load;
3512                /*
3513                 * Round up the averaging division if load is increasing. This
3514                 * prevents us from getting stuck on 9 if the load is 10, for
3515                 * example.
3516                 */
3517                if (new_load > old_load)
3518                        new_load += scale - 1;
3519
3520                this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3521        }
3522
3523        sched_avg_update(this_rq);
3524}
3525
3526static void update_cpu_load_active(struct rq *this_rq)
3527{
3528        update_cpu_load(this_rq);
3529
3530        calc_load_account_active(this_rq);
3531}
3532
3533#ifdef CONFIG_SMP
3534
3535/*
3536 * sched_exec - execve() is a valuable balancing opportunity, because at
3537 * this point the task has the smallest effective memory and cache footprint.
3538 */
3539void sched_exec(void)
3540{
3541        struct task_struct *p = current;
3542        unsigned long flags;
3543        struct rq *rq;
3544        int dest_cpu;
3545
3546        rq = task_rq_lock(p, &flags);
3547        dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
3548        if (dest_cpu == smp_processor_id())
3549                goto unlock;
3550
3551        /*
3552         * select_task_rq() can race against ->cpus_allowed
3553         */
3554        if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3555            likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3556                struct migration_arg arg = { p, dest_cpu };
3557
3558                task_rq_unlock(rq, &flags);
3559                stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
3560                return;
3561        }
3562unlock:
3563        task_rq_unlock(rq, &flags);
3564}
3565
3566#endif
3567
3568DEFINE_PER_CPU(struct kernel_stat, kstat);
3569
3570EXPORT_PER_CPU_SYMBOL(kstat);
3571
3572/*
3573 * Return any ns on the sched_clock that have not yet been accounted to
3574 * @p, in case that task is currently running.
3575 *
3576 * Called with task_rq_lock() held on @rq.
3577 */
3578static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3579{
3580        u64 ns = 0;
3581
3582        if (task_current(rq, p)) {
3583                update_rq_clock(rq);
3584                ns = rq->clock_task - p->se.exec_start;
3585                if ((s64)ns < 0)
3586                        ns = 0;
3587        }
3588
3589        return ns;
3590}
3591
3592unsigned long long task_delta_exec(struct task_struct *p)
3593{
3594        unsigned long flags;
3595        struct rq *rq;
3596        u64 ns = 0;
3597
3598        rq = task_rq_lock(p, &flags);
3599        ns = do_task_delta_exec(p, rq);
3600        task_rq_unlock(rq, &flags);
3601
3602        return ns;
3603}
3604
3605/*
3606 * Return accounted runtime for the task.
3607 * In case the task is currently running, return the runtime plus current's
3608 * pending runtime that has not been accounted yet.
3609 */
3610unsigned long long task_sched_runtime(struct task_struct *p)
3611{
3612        unsigned long flags;
3613        struct rq *rq;
3614        u64 ns = 0;
3615
3616        rq = task_rq_lock(p, &flags);
3617        ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3618        task_rq_unlock(rq, &flags);
3619
3620        return ns;
3621}
3622
3623/*
3624 * Return sum_exec_runtime for the thread group.
3625 * In case the task is currently running, return the sum plus current's
3626 * pending runtime that has not been accounted yet.
3627 *
3628 * Note that the thread group might have other running tasks as well,
3629 * so the return value does not include pending runtime that other
3630 * running tasks might have.
3631 */
3632unsigned long long thread_group_sched_runtime(struct task_struct *p)
3633{
3634        struct task_cputime totals;
3635        unsigned long flags;
3636        struct rq *rq;
3637        u64 ns;
3638
3639        rq = task_rq_lock(p, &flags);
3640        thread_group_cputime(p, &totals);
3641        ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3642        task_rq_unlock(rq, &flags);
3643
3644        return ns;
3645}
3646
3647/*
3648 * Account user cpu time to a process.
3649 * @p: the process that the cpu time gets accounted to
3650 * @cputime: the cpu time spent in user space since the last update
3651 * @cputime_scaled: cputime scaled by cpu frequency
3652 */
3653void account_user_time(struct task_struct *p, cputime_t cputime,
3654                       cputime_t cputime_scaled)
3655{
3656        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3657        cputime64_t tmp;
3658
3659        /* Add user time to process. */
3660        p->utime = cputime_add(p->utime, cputime);
3661        p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3662        account_group_user_time(p, cputime);
3663
3664        /* Add user time to cpustat. */
3665        tmp = cputime_to_cputime64(cputime);
3666        if (TASK_NICE(p) > 0)
3667                cpustat->nice = cputime64_add(cpustat->nice, tmp);
3668        else
3669                cpustat->user = cputime64_add(cpustat->user, tmp);
3670
3671        cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3672        /* Account for user time used */
3673        acct_update_integrals(p);
3674}
3675
3676/*
3677 * Account guest cpu time to a process.
3678 * @p: the process that the cpu time gets accounted to
3679 * @cputime: the cpu time spent in virtual machine since the last update
3680 * @cputime_scaled: cputime scaled by cpu frequency
3681 */
3682static void account_guest_time(struct task_struct *p, cputime_t cputime,
3683                               cputime_t cputime_scaled)
3684{
3685        cputime64_t tmp;
3686        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3687
3688        tmp = cputime_to_cputime64(cputime);
3689
3690        /* Add guest time to process. */
3691        p->utime = cputime_add(p->utime, cputime);
3692        p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3693        account_group_user_time(p, cputime);
3694        p->gtime = cputime_add(p->gtime, cputime);
3695
3696        /* Add guest time to cpustat. */
3697        if (TASK_NICE(p) > 0) {
3698                cpustat->nice = cputime64_add(cpustat->nice, tmp);
3699                cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
3700        } else {
3701                cpustat->user = cputime64_add(cpustat->user, tmp);
3702                cpustat->guest = cputime64_add(cpustat->guest, tmp);
3703        }
3704}
3705
3706/*
3707 * Account system cpu time to a process.
3708 * @p: the process that the cpu time gets accounted to
3709 * @hardirq_offset: the offset to subtract from hardirq_count()
3710 * @cputime: the cpu time spent in kernel space since the last update
3711 * @cputime_scaled: cputime scaled by cpu frequency
3712 */
3713void account_system_time(struct task_struct *p, int hardirq_offset,
3714                         cputime_t cputime, cputime_t cputime_scaled)
3715{
3716        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3717        cputime64_t tmp;
3718
3719        if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3720                account_guest_time(p, cputime, cputime_scaled);
3721                return;
3722        }
3723
3724        /* Add system time to process. */
3725        p->stime = cputime_add(p->stime, cputime);
3726        p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3727        account_group_system_time(p, cputime);
3728
3729        /* Add system time to cpustat. */
3730        tmp = cputime_to_cputime64(cputime);
3731        if (hardirq_count() - hardirq_offset)
3732                cpustat->irq = cputime64_add(cpustat->irq, tmp);
3733        else if (in_serving_softirq())
3734                cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3735        else
3736                cpustat->system = cputime64_add(cpustat->system, tmp);
3737
3738        cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3739
3740        /* Account for system time used */
3741        acct_update_integrals(p);
3742}
3743
3744/*
3745 * Account for involuntary wait time.
3746 * @cputime: the cpu time spent in involuntary wait
3747 */
3748void account_steal_time(cputime_t cputime)
3749{
3750        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3751        cputime64_t cputime64 = cputime_to_cputime64(cputime);
3752
3753        cpustat->steal = cputime64_add(cpustat->steal, cputime64);
3754}
3755
3756/*
3757 * Account for idle time.
3758 * @cputime: the cpu time spent in idle wait
3759 */
3760void account_idle_time(cputime_t cputime)
3761{
3762        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3763        cputime64_t cputime64 = cputime_to_cputime64(cputime);
3764        struct rq *rq = this_rq();
3765
3766        if (atomic_read(&rq->nr_iowait) > 0)
3767                cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
3768        else
3769                cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3770}
3771
3772#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3773
3774/*
3775 * Account a single tick of cpu time.
3776 * @p: the process that the cpu time gets accounted to
3777 * @user_tick: indicates if the tick is a user or a system tick
3778 */
3779void account_process_tick(struct task_struct *p, int user_tick)
3780{
3781        cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3782        struct rq *rq = this_rq();
3783
3784        if (user_tick)
3785                account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3786        else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
3787                account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
3788                                    one_jiffy_scaled);
3789        else
3790                account_idle_time(cputime_one_jiffy);
3791}
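/*
 * Putting the branches above together (illustrative cases): a user-mode
 * tick of a nice 5 task is added to cpustat->nice via account_user_time();
 * a kernel-mode tick of a normal task outside irq/softirq context goes to
 * cpustat->system; a tick that interrupts the idle task while a softirq is
 * being served is charged to cpustat->softirq; and a pure idle tick ends
 * up in cpustat->idle or cpustat->iowait, depending on nr_iowait.
 */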
3792
3793/*
3794 * Account multiple ticks of steal time.
3796 * @ticks: number of stolen ticks
3797 */
3798void account_steal_ticks(unsigned long ticks)
3799{
3800        account_steal_time(jiffies_to_cputime(ticks));
3801}
3802
3803/*
3804 * Account multiple ticks of idle time.
3805 * @ticks: number of idle ticks
3806 */
3807void account_idle_ticks(unsigned long ticks)
3808{
3809        account_idle_time(jiffies_to_cputime(ticks));
3810}
3811
3812#endif
3813
3814/*
3815 * Use precise platform statistics if available:
3816 */
3817#ifdef CONFIG_VIRT_CPU_ACCOUNTING
3818void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3819{
3820        *ut = p->utime;
3821        *st = p->stime;
3822}
3823
3824void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3825{
3826        struct task_cputime cputime;
3827
3828        thread_group_cputime(p, &cputime);
3829
3830        *ut = cputime.utime;
3831        *st = cputime.stime;
3832}
3833#else
3834
3835#ifndef nsecs_to_cputime
3836# define nsecs_to_cputime(__nsecs)      nsecs_to_jiffies(__nsecs)
3837#endif
3838
3839void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3840{
3841        cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
3842
3843        /*
3844         * Use CFS's precise accounting:
3845         */
3846        rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3847
3848        if (total) {
3849                u64 temp = rtime;
3850
3851                temp *= utime;
3852                do_div(temp, total);
3853                utime = (cputime_t)temp;
3854        } else
3855                utime = rtime;
3856
3857        /*
3858         * Compare with previous values, to keep monotonicity:
3859         */
3860        p->prev_utime = max(p->prev_utime, utime);
3861        p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
3862
3863        *ut = p->prev_utime;
3864        *st = p->prev_stime;
3865}
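/*
 * A small numeric example of the scaling above: if p->utime is 4 ticks,
 * p->stime is 8 ticks and CFS measured rtime = 9 ticks of real execution,
 * then utime is scaled to 9 * 4 / 12 = 3 ticks and stime becomes
 * rtime - prev_utime = 6 ticks, so the reported split follows the
 * utime/stime ratio while the total tracks the precise runtime.
 */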
3866
3867/*
3868 * Must be called with siglock held.
3869 */
3870void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3871{
3872        struct signal_struct *sig = p->signal;
3873        struct task_cputime cputime;
3874        cputime_t rtime, utime, total;
3875
3876        thread_group_cputime(p, &cputime);
3877
3878        total = cputime_add(cputime.utime, cputime.stime);
3879        rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3880
3881        if (total) {
3882                u64 temp = rtime;
3883
3884                temp *= cputime.utime;
3885                do_div(temp, total);
3886                utime = (cputime_t)temp;
3887        } else
3888                utime = rtime;
3889
3890        sig->prev_utime = max(sig->prev_utime, utime);
3891        sig->prev_stime = max(sig->prev_stime,
3892                              cputime_sub(rtime, sig->prev_utime));
3893
3894        *ut = sig->prev_utime;
3895        *st = sig->prev_stime;
3896}
3897#endif
3898
3899/*
3900 * This function gets called by the timer code, with HZ frequency.
3901 * We call it with interrupts disabled.
3902 *
3903 * It also gets called by the fork code, when changing the parent's
3904 * timeslices.
3905 */
3906void scheduler_tick(void)
3907{
3908        int cpu = smp_processor_id();
3909        struct rq *rq = cpu_rq(cpu);
3910        struct task_struct *curr = rq->curr;
3911
3912        sched_clock_tick();
3913
3914        raw_spin_lock(&rq->lock);
3915        update_rq_clock(rq);
3916        update_cpu_load_active(rq);
3917        curr->sched_class->task_tick(rq, curr, 0);
3918        raw_spin_unlock(&rq->lock);
3919
3920        perf_event_task_tick();
3921
3922#ifdef CONFIG_SMP
3923        rq->idle_at_tick = idle_cpu(cpu);
3924        trigger_load_balance(rq, cpu);
3925#endif
3926}
3927
3928notrace unsigned long get_parent_ip(unsigned long addr)
3929{
3930        if (in_lock_functions(addr)) {
3931                addr = CALLER_ADDR2;
3932                if (in_lock_functions(addr))
3933                        addr = CALLER_ADDR3;
3934        }
3935        return addr;
3936}
3937
3938#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3939                                defined(CONFIG_PREEMPT_TRACER))
3940
3941void __kprobes add_preempt_count(int val)
3942{
3943#ifdef CONFIG_DEBUG_PREEMPT
3944        /*
3945         * Underflow?
3946         */
3947        if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3948                return;
3949#endif
3950        preempt_count() += val;
3951#ifdef CONFIG_DEBUG_PREEMPT
3952        /*
3953         * Spinlock count overflowing soon?
3954         */
3955        DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3956                                PREEMPT_MASK - 10);
3957#endif
3958        if (preempt_count() == val)
3959                trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3960}
3961EXPORT_SYMBOL(add_preempt_count);
3962
3963void __kprobes sub_preempt_count(int val)
3964{
3965#ifdef CONFIG_DEBUG_PREEMPT
3966        /*
3967         * Underflow?
3968         */
3969        if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3970                return;
3971        /*
3972         * Is the spinlock portion underflowing?
3973         */
3974        if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3975                        !(preempt_count() & PREEMPT_MASK)))
3976                return;
3977#endif
3978
3979        if (preempt_count() == val)
3980                trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3981        preempt_count() -= val;
3982}
3983EXPORT_SYMBOL(sub_preempt_count);
3984
3985#endif
3986
3987/*
3988 * Print scheduling while atomic bug:
3989 */
3990static noinline void __schedule_bug(struct task_struct *prev)
3991{
3992        struct pt_regs *regs = get_irq_regs();
3993
3994        printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3995                prev->comm, prev->pid, preempt_count());
3996
3997        debug_show_held_locks(prev);
3998        print_modules();
3999        if (irqs_disabled())
4000                print_irqtrace_events(prev);
4001
4002        if (regs)
4003                show_regs(regs);
4004        else
4005                dump_stack();
4006}
4007
4008/*
4009 * Various schedule()-time debugging checks and statistics:
4010 */
4011static inline void schedule_debug(struct task_struct *prev)
4012{
4013        /*
4014         * Test if we are atomic. Since do_exit() needs to call into
4015         * schedule() atomically, we ignore that path for now.
4016         * Otherwise, whine if we are scheduling when we should not be.
4017         */
4018        if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4019                __schedule_bug(prev);
4020
4021        profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4022
4023        schedstat_inc(this_rq(), sched_count);
4024#ifdef CONFIG_SCHEDSTATS
4025        if (unlikely(prev->lock_depth >= 0)) {
4026                schedstat_inc(this_rq(), bkl_count);
4027                schedstat_inc(prev, sched_info.bkl_count);
4028        }
4029#endif
4030}
4031
4032static void put_prev_task(struct rq *rq, struct task_struct *prev)
4033{
4034        if (prev->se.on_rq)
4035                update_rq_clock(rq);
4036        prev->sched_class->put_prev_task(rq, prev);
4037}
4038
4039/*
4040 * Pick up the highest-prio task:
4041 */
4042static inline struct task_struct *
4043pick_next_task(struct rq *rq)
4044{
4045        const struct sched_class *class;
4046        struct task_struct *p;
4047
4048        /*
4049         * Optimization: we know that if all tasks are in
4050         * the fair class we can call that function directly:
4051         */
4052        if (likely(rq->nr_running == rq->cfs.nr_running)) {
4053                p = fair_sched_class.pick_next_task(rq);
4054                if (likely(p))
4055                        return p;
4056        }
4057
4058        for_each_class(class) {
4059                p = class->pick_next_task(rq);
4060                if (p)
4061                        return p;
4062        }
4063
4064        BUG(); /* the idle class will always have a runnable task */
4065}
4066
4067/*
4068 * schedule() is the main scheduler function.
4069 */
4070asmlinkage void __sched schedule(void)
4071{
4072        struct task_struct *prev, *next;
4073        unsigned long *switch_count;
4074        struct rq *rq;
4075        int cpu;
4076
4077need_resched:
4078        preempt_disable();
4079        cpu = smp_processor_id();
4080        rq = cpu_rq(cpu);
4081        rcu_note_context_switch(cpu);
4082        prev = rq->curr;
4083
4084        release_kernel_lock(prev);
4085need_resched_nonpreemptible:
4086
4087        schedule_debug(prev);
4088
4089        if (sched_feat(HRTICK))
4090                hrtick_clear(rq);
4091
4092        raw_spin_lock_irq(&rq->lock);
4093
4094        switch_count = &prev->nivcsw;
4095        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
4096                if (unlikely(signal_pending_state(prev->state, prev))) {
4097                        prev->state = TASK_RUNNING;
4098                } else {
4099                        /*
4100                         * If a worker is going to sleep, notify and
4101                         * ask workqueue whether it wants to wake up a
4102                         * task to maintain concurrency.  If so, wake
4103                         * up the task.
4104                         */
4105                        if (prev->flags & PF_WQ_WORKER) {
4106                                struct task_struct *to_wakeup;
4107
4108                                to_wakeup = wq_worker_sleeping(prev, cpu);
4109                                if (to_wakeup)
4110                                        try_to_wake_up_local(to_wakeup);
4111                        }
4112                        deactivate_task(rq, prev, DEQUEUE_SLEEP);
4113                }
4114                switch_count = &prev->nvcsw;
4115        }
4116
4117        pre_schedule(rq, prev);
4118
4119        if (unlikely(!rq->nr_running))
4120                idle_balance(cpu, rq);
4121
4122        put_prev_task(rq, prev);
4123        next = pick_next_task(rq);
4124        clear_tsk_need_resched(prev);
4125        rq->skip_clock_update = 0;
4126
4127        if (likely(prev != next)) {
4128                sched_info_switch(prev, next);
4129                perf_event_task_sched_out(prev, next);
4130
4131                rq->nr_switches++;
4132                rq->curr = next;
4133                ++*switch_count;
4134
4135                context_switch(rq, prev, next); /* unlocks the rq */
4136                /*
4137                 * The context switch has flipped the stack from under us
4138                 * and restored the local variables which were saved when
4139                 * this task called schedule() in the past. prev == current
4140                 * is still correct, but it can be moved to another cpu/rq.
4141                 */
4142                cpu = smp_processor_id();
4143                rq = cpu_rq(cpu);
4144        } else
4145                raw_spin_unlock_irq(&rq->lock);
4146
4147        post_schedule(rq);
4148
4149        if (unlikely(reacquire_kernel_lock(prev)))
4150                goto need_resched_nonpreemptible;
4151
4152        preempt_enable_no_resched();
4153        if (need_resched())
4154                goto need_resched;
4155}
4156EXPORT_SYMBOL(schedule);
4157
4158#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4159/*
4160 * Look out! "owner" is an entirely speculative pointer
4161 * access and not reliable.
4162 */
4163int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4164{
4165        unsigned int cpu;
4166        struct rq *rq;
4167
4168        if (!sched_feat(OWNER_SPIN))
4169                return 0;
4170
4171#ifdef CONFIG_DEBUG_PAGEALLOC
4172        /*
4173         * Need to access the cpu field knowing that
4174         * DEBUG_PAGEALLOC could have unmapped it if
4175         * the mutex owner just released it and exited.
4176         */
4177        if (probe_kernel_address(&owner->cpu, cpu))
4178                return 0;
4179#else
4180        cpu = owner->cpu;
4181#endif
4182
4183        /*
4184         * Even if the access succeeded (likely case),
4185         * the cpu field may no longer be valid.
4186         */
4187        if (cpu >= nr_cpumask_bits)
4188                return 0;
4189
4190        /*
4191         * We need to validate that we can do a
4192         * get_cpu() and that we have the percpu area.
4193         */
4194        if (!cpu_online(cpu))
4195                return 0;
4196
4197        rq = cpu_rq(cpu);
4198
4199        for (;;) {
4200                /*
4201                 * Owner changed, break to re-assess state.
4202                 */
4203                if (lock->owner != owner) {
4204                        /*
4205                         * If the lock has switched to a different owner,
4206                         * we likely have heavy contention. Return 0 to quit
4207                         * optimistic spinning and not contend further:
4208                         */
4209                        if (lock->owner)
4210                                return 0;
4211                        break;
4212                }
4213
4214                /*
4215                 * Is that owner really running on that cpu?
4216                 */
4217                if (task_thread_info(rq->curr) != owner || need_resched())
4218                        return 0;
4219
4220                cpu_relax();
4221        }
4222
4223        return 1;
4224}
4225#endif
4226
4227#ifdef CONFIG_PREEMPT
4228/*
4229 * this is the entry point to schedule() from in-kernel preemption
4230 * off of preempt_enable(). Preemption off the return-from-interrupt path
4231 * is handled separately, by preempt_schedule_irq() below.
4232 */
4233asmlinkage void __sched notrace preempt_schedule(void)
4234{
4235        struct thread_info *ti = current_thread_info();
4236
4237        /*
4238         * If there is a non-zero preempt_count or interrupts are disabled,
4239         * we do not want to preempt the current task. Just return..
4240         */
4241        if (likely(ti->preempt_count || irqs_disabled()))
4242                return;
4243
4244        do {
4245                add_preempt_count_notrace(PREEMPT_ACTIVE);
4246                schedule();
4247                sub_preempt_count_notrace(PREEMPT_ACTIVE);
4248
4249                /*
4250                 * Check again in case we missed a preemption opportunity
4251                 * between schedule and now.
4252                 */
4253                barrier();
4254        } while (need_resched());
4255}
4256EXPORT_SYMBOL(preempt_schedule);
4257
4258/*
4259 * this is the entry point to schedule() from kernel preemption
4260 * off of irq context.
4261 * Note that this is called and returns with irqs disabled. This will
4262 * protect us against recursive calling from irq.
4263 */
4264asmlinkage void __sched preempt_schedule_irq(void)
4265{
4266        struct thread_info *ti = current_thread_info();
4267
4268        /* Catch callers which need to be fixed */
4269        BUG_ON(ti->preempt_count || !irqs_disabled());
4270
4271        do {
4272                add_preempt_count(PREEMPT_ACTIVE);
4273                local_irq_enable();
4274                schedule();
4275                local_irq_disable();
4276                sub_preempt_count(PREEMPT_ACTIVE);
4277
4278                /*
4279                 * Check again in case we missed a preemption opportunity
4280                 * between schedule and now.
4281                 */
4282                barrier();
4283        } while (need_resched());
4284}
4285
4286#endif /* CONFIG_PREEMPT */
4287
4288int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
4289                          void *key)
4290{
4291        return try_to_wake_up(curr->private, mode, wake_flags);
4292}
4293EXPORT_SYMBOL(default_wake_function);
4294
4295/*
4296 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
4297 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
4298 * number) then we wake all the non-exclusive tasks and one exclusive task.
4299 *
4300 * There are circumstances in which we can try to wake a task which has already
4301 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
4302 * zero in this (rare) case, and we handle it by continuing to scan the queue.
4303 */
4304static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4305                        int nr_exclusive, int wake_flags, void *key)
4306{
4307        wait_queue_t *curr, *next;
4308
4309        list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
4310                unsigned flags = curr->flags;
4311
4312                if (curr->func(curr, mode, wake_flags, key) &&
4313                                (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
4314                        break;
4315        }
4316}
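/*
 * For example, with two non-exclusive waiters at the head of the list
 * followed by three exclusive ones (exclusive waiters are normally queued
 * at the tail, cf. __add_wait_queue_tail_exclusive() used below),
 * __wake_up(q, mode, 1, key) wakes both non-exclusive waiters and exactly
 * one exclusive waiter: only a successfully woken WQ_FLAG_EXCLUSIVE entry
 * decrements nr_exclusive and terminates the scan.
 */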
4317
4318/**
4319 * __wake_up - wake up threads blocked on a waitqueue.
4320 * @q: the waitqueue
4321 * @mode: which threads
4322 * @nr_exclusive: how many wake-one or wake-many threads to wake up
4323 * @key: is directly passed to the wakeup function
4324 *
4325 * It may be assumed that this function implies a write memory barrier before
4326 * changing the task state if and only if any tasks are woken up.
4327 */
4328void __wake_up(wait_queue_head_t *q, unsigned int mode,
4329                        int nr_exclusive, void *key)
4330{
4331        unsigned long flags;
4332
4333        spin_lock_irqsave(&q->lock, flags);
4334        __wake_up_common(q, mode, nr_exclusive, 0, key);
4335        spin_unlock_irqrestore(&q->lock, flags);
4336}
4337EXPORT_SYMBOL(__wake_up);
4338
4339/*
4340 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
4341 */
4342void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4343{
4344        __wake_up_common(q, mode, 1, 0, NULL);
4345}
4346EXPORT_SYMBOL_GPL(__wake_up_locked);
4347
4348void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4349{
4350        __wake_up_common(q, mode, 1, 0, key);
4351}
4352
4353/**
4354 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
4355 * @q: the waitqueue
4356 * @mode: which threads
4357 * @nr_exclusive: how many wake-one or wake-many threads to wake up
4358 * @key: opaque value to be passed to wakeup targets
4359 *
4360 * The sync wakeup differs in that the waker knows that it will schedule
4361 * away soon, so while the target thread will be woken up, it will not
4362 * be migrated to another CPU - ie. the two threads are 'synchronized'
4363 * with each other. This can prevent needless bouncing between CPUs.
4364 *
4365 * On UP it can prevent extra preemption.
4366 *
4367 * It may be assumed that this function implies a write memory barrier before
4368 * changing the task state if and only if any tasks are woken up.
4369 */
4370void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
4371                        int nr_exclusive, void *key)
4372{
4373        unsigned long flags;
4374        int wake_flags = WF_SYNC;
4375
4376        if (unlikely(!q))
4377                return;
4378
4379        if (unlikely(!nr_exclusive))
4380                wake_flags = 0;
4381
4382        spin_lock_irqsave(&q->lock, flags);
4383        __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
4384        spin_unlock_irqrestore(&q->lock, flags);
4385}
4386EXPORT_SYMBOL_GPL(__wake_up_sync_key);
4387
4388/*
4389 * __wake_up_sync - see __wake_up_sync_key()
4390 */
4391void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4392{
4393        __wake_up_sync_key(q, mode, nr_exclusive, NULL);
4394}
4395EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
4396
4397/**
4398 * complete: - signals a single thread waiting on this completion
4399 * @x:  holds the state of this particular completion
4400 *
4401 * This will wake up a single thread waiting on this completion. Threads will be
4402 * awakened in the same order in which they were queued.
4403 *
4404 * See also complete_all(), wait_for_completion() and related routines.
4405 *
4406 * It may be assumed that this function implies a write memory barrier before
4407 * changing the task state if and only if any tasks are woken up.
4408 */
4409void complete(struct completion *x)
4410{
4411        unsigned long flags;
4412
4413        spin_lock_irqsave(&x->wait.lock, flags);
4414        x->done++;
4415        __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
4416        spin_unlock_irqrestore(&x->wait.lock, flags);
4417}
4418EXPORT_SYMBOL(complete);
4419
4420/**
4421 * complete_all: - signals all threads waiting on this completion
4422 * @x:  holds the state of this particular completion
4423 *
4424 * This will wake up all threads waiting on this particular completion event.
4425 *
4426 * It may be assumed that this function implies a write memory barrier before
4427 * changing the task state if and only if any tasks are woken up.
4428 */
4429void complete_all(struct completion *x)
4430{
4431        unsigned long flags;
4432
4433        spin_lock_irqsave(&x->wait.lock, flags);
4434        x->done += UINT_MAX/2;
4435        __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
4436        spin_unlock_irqrestore(&x->wait.lock, flags);
4437}
4438EXPORT_SYMBOL(complete_all);
4439
4440static inline long __sched
4441do_wait_for_common(struct completion *x, long timeout, int state)
4442{
4443        if (!x->done) {
4444                DECLARE_WAITQUEUE(wait, current);
4445
4446                __add_wait_queue_tail_exclusive(&x->wait, &wait);
4447                do {
4448                        if (signal_pending_state(state, current)) {
4449                                timeout = -ERESTARTSYS;
4450                                break;
4451                        }
4452                        __set_current_state(state);
4453                        spin_unlock_irq(&x->wait.lock);
4454                        timeout = schedule_timeout(timeout);
4455                        spin_lock_irq(&x->wait.lock);
4456                } while (!x->done && timeout);
4457                __remove_wait_queue(&x->wait, &wait);
4458                if (!x->done)
4459                        return timeout;
4460        }
4461        x->done--;
4462        return timeout ?: 1;
4463}
4464
4465static long __sched
4466wait_for_common(struct completion *x, long timeout, int state)
4467{
4468        might_sleep();
4469
4470        spin_lock_irq(&x->wait.lock);
4471        timeout = do_wait_for_common(x, timeout, state);
4472        spin_unlock_irq(&x->wait.lock);
4473        return timeout;
4474}
4475
4476/**
4477 * wait_for_completion: - waits for completion of a task
4478 * @x:  holds the state of this particular completion
4479 *
4480 * This waits to be signaled for completion of a specific task. It is NOT
4481 * interruptible and there is no timeout.
4482 *
4483 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
4484 * and interrupt capability. Also see complete().
4485 */
4486void __sched wait_for_completion(struct completion *x)
4487{
4488        wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4489}
4490EXPORT_SYMBOL(wait_for_completion);
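/*
 * Typical pairing with complete(), sketched here for illustration (the
 * producer helper name is made up):
 *
 *	struct completion done;
 *
 *	init_completion(&done);
 *	start_async_work(&done);	// hypothetical producer
 *	wait_for_completion(&done);	// sleeps until complete(&done)
 *
 * The producer side calls complete(&done) once the work is finished, or
 * complete_all(&done) when every waiter should be released.
 */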
4491
4492/**
4493 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4494 * @x:  holds the state of this particular completion
4495 * @timeout:  timeout value in jiffies
4496 *
4497 * This waits for either a completion of a specific task to be signaled or for a
4498 * specified timeout to expire. The timeout is in jiffies. It is not
4499 * interruptible.
4500 */
4501unsigned long __sched
4502wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4503{
4504        return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4505}
4506EXPORT_SYMBOL(wait_for_completion_timeout);
4507
4508/**
4509 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4510 * @x:  holds the state of this particular completion
4511 *
4512 * This waits for completion of a specific task to be signaled. It is
4513 * interruptible.
4514 */
4515int __sched wait_for_completion_interruptible(struct completion *x)
4516{
4517        long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4518        if (t == -ERESTARTSYS)
4519                return t;
4520        return 0;
4521}
4522EXPORT_SYMBOL(wait_for_completion_interruptible);
4523
4524/**
4525 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4526 * @x:  holds the state of this particular completion
4527 * @timeout:  timeout value in jiffies
4528 *
4529 * This waits for either a completion of a specific task to be signaled or for a
4530 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4531 */
4532long __sched
4533wait_for_completion_interruptible_timeout(struct completion *x,
4534                                          unsigned long timeout)
4535{
4536        return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4537}
4538EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4539
4540/**
4541 * wait_for_completion_killable: - waits for completion of a task (killable)
4542 * @x:  holds the state of this particular completion
4543 *
4544 * This waits to be signaled for completion of a specific task. It can be
4545 * interrupted by a kill signal.
4546 */
4547int __sched wait_for_completion_killable(struct completion *x)
4548{
4549        long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4550        if (t == -ERESTARTSYS)
4551                return t;
4552        return 0;
4553}
4554EXPORT_SYMBOL(wait_for_completion_killable);
4555
4556/**
4557 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
4558 * @x:  holds the state of this particular completion
4559 * @timeout:  timeout value in jiffies
4560 *
4561 * This waits for either a completion of a specific task to be
4562 * signaled or for a specified timeout to expire. It can be
4563 * interrupted by a kill signal. The timeout is in jiffies.
4564 */
4565long __sched
4566wait_for_completion_killable_timeout(struct completion *x,
4567                                     unsigned long timeout)
4568{
4569        return wait_for_common(x, timeout, TASK_KILLABLE);
4570}
4571EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4572
4573/**
4574 *      try_wait_for_completion - try to decrement a completion without blocking
4575 *      @x:     completion structure
4576 *
4577 *      Returns: 0 if a decrement cannot be done without blocking
4578 *               1 if a decrement succeeded.
4579 *
4580 *      If a completion is being used as a counting completion,
4581 *      attempt to decrement the counter without blocking. This
4582 *      enables us to avoid waiting if the resource the completion
4583 *      is protecting is not available.
4584 */
4585bool try_wait_for_completion(struct completion *x)
4586{
4587        unsigned long flags;
4588        int ret = 1;
4589
4590        spin_lock_irqsave(&x->wait.lock, flags);
4591        if (!x->done)
4592                ret = 0;
4593        else
4594                x->done--;
4595        spin_unlock_irqrestore(&x->wait.lock, flags);
4596        return ret;
4597}
4598EXPORT_SYMBOL(try_wait_for_completion);
4599
4600/**
4601 *      completion_done - Test to see if a completion has any waiters
4602 *      @x:     completion structure
4603 *
4604 *      Returns: 0 if there are waiters (wait_for_completion() in progress)
4605 *               1 if there are no waiters.
4606 *
4607 */
4608bool completion_done(struct completion *x)
4609{
4610        unsigned long flags;
4611        int ret = 1;
4612
4613        spin_lock_irqsave(&x->wait.lock, flags);
4614        if (!x->done)
4615                ret = 0;
4616        spin_unlock_irqrestore(&x->wait.lock, flags);
4617        return ret;
4618}
4619EXPORT_SYMBOL(completion_done);
4620
4621static long __sched
4622sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4623{
4624        unsigned long flags;
4625        wait_queue_t wait;
4626
4627        init_waitqueue_entry(&wait, current);
4628
4629        __set_current_state(state);
4630
4631        spin_lock_irqsave(&q->lock, flags);
4632        __add_wait_queue(q, &wait);
4633        spin_unlock(&q->lock);
4634        timeout = schedule_timeout(timeout);
4635        spin_lock_irq(&q->lock);
4636        __remove_wait_queue(q, &wait);
4637        spin_unlock_irqrestore(&q->lock, flags);
4638
4639        return timeout;
4640}
4641
4642void __sched interruptible_sleep_on(wait_queue_head_t *q)
4643{
4644        sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4645}
4646EXPORT_SYMBOL(interruptible_sleep_on);
4647
4648long __sched
4649interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4650{
4651        return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4652}
4653EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4654
4655void __sched sleep_on(wait_queue_head_t *q)
4656{
4657        sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4658}
4659EXPORT_SYMBOL(sleep_on);
4660
4661long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4662{
4663        return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4664}
4665EXPORT_SYMBOL(sleep_on_timeout);
4666
4667#ifdef CONFIG_RT_MUTEXES
4668
4669/*
4670 * rt_mutex_setprio - set the current priority of a task
4671 * @p: task
4672 * @prio: prio value (kernel-internal form)
4673 *
4674 * This function changes the 'effective' priority of a task. It does
4675 * not touch ->normal_prio like __setscheduler().
4676 *
4677 * Used by the rt_mutex code to implement priority inheritance logic.
4678 */
4679void rt_mutex_setprio(struct task_struct *p, int prio)
4680{
4681        unsigned long flags;
4682        int oldprio, on_rq, running;
4683        struct rq *rq;
4684        const struct sched_class *prev_class;
4685
4686        BUG_ON(prio < 0 || prio > MAX_PRIO);
4687
4688        rq = task_rq_lock(p, &flags);
4689
4690        trace_sched_pi_setprio(p, prio);
4691        oldprio = p->prio;
4692        prev_class = p->sched_class;
4693        on_rq = p->se.on_rq;
4694        running = task_current(rq, p);
4695        if (on_rq)
4696                dequeue_task(rq, p, 0);
4697        if (running)
4698                p->sched_class->put_prev_task(rq, p);
4699
4700        if (rt_prio(prio))
4701                p->sched_class = &rt_sched_class;
4702        else
4703                p->sched_class = &fair_sched_class;
4704
4705        p->prio = prio;
4706
4707        if (running)
4708                p->sched_class->set_curr_task(rq);
4709        if (on_rq) {
4710                enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4711
4712                check_class_changed(rq, p, prev_class, oldprio, running);
4713        }
4714        task_rq_unlock(rq, &flags);
4715}
4716
4717#endif
4718
4719void set_user_nice(struct task_struct *p, long nice)
4720{
4721        int old_prio, delta, on_rq;
4722        unsigned long flags;
4723        struct rq *rq;
4724
4725        if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4726                return;
4727        /*
4728         * We have to be careful: if called from sys_setpriority(),
4729         * the task might be in the middle of scheduling on another CPU.
4730         */
4731        rq = task_rq_lock(p, &flags);
4732        /*
4733         * The RT priorities are set via sched_setscheduler(), but we still
4734         * allow the 'normal' nice value to be set - but as expected
4735         * it won't have any effect on scheduling as long as the task
4736         * remains SCHED_FIFO/SCHED_RR:
4737         */
4738        if (task_has_rt_policy(p)) {
4739                p->static_prio = NICE_TO_PRIO(nice);
4740                goto out_unlock;
4741        }
4742        on_rq = p->se.on_rq;
4743        if (on_rq)
4744                dequeue_task(rq, p, 0);
4745
4746        p->static_prio = NICE_TO_PRIO(nice);
4747        set_load_weight(p);
4748        old_prio = p->prio;
4749        p->prio = effective_prio(p);
4750        delta = p->prio - old_prio;
4751
4752        if (on_rq) {
4753                enqueue_task(rq, p, 0);
4754                /*
4755                 * If the task increased its priority or is running and
4756                 * lowered its priority, then reschedule its CPU:
4757                 */
4758                if (delta < 0 || (delta > 0 && task_running(rq, p)))
4759                        resched_task(rq->curr);
4760        }
4761out_unlock:
4762        task_rq_unlock(rq, &flags);
4763}
4764EXPORT_SYMBOL(set_user_nice);
4765
4766/*
4767 * can_nice - check if a task can reduce its nice value
4768 * @p: task
4769 * @nice: nice value
4770 */
4771int can_nice(const struct task_struct *p, const int nice)
4772{
4773        /* convert nice value [19,-20] to rlimit style value [1,40] */
4774        int nice_rlim = 20 - nice;
4775
4776        return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4777                capable(CAP_SYS_NICE));
4778}
4779
4780#ifdef __ARCH_WANT_SYS_NICE
4781
4782/*
4783 * sys_nice - change the priority of the current process.
4784 * @increment: priority increment
4785 *
4786 * sys_setpriority is a more generic, but much slower function that
4787 * does similar things.
4788 */
4789SYSCALL_DEFINE1(nice, int, increment)
4790{
4791        long nice, retval;
4792
4793        /*
4794         * Setpriority might change our priority at the same moment.
4795         * We don't have to worry. Conceptually one call occurs first
4796         * and we have a single winner.
4797         */
4798        if (increment < -40)
4799                increment = -40;
4800        if (increment > 40)
4801                increment = 40;
4802
4803        nice = TASK_NICE(current) + increment;
4804        if (nice < -20)
4805                nice = -20;
4806        if (nice > 19)
4807                nice = 19;
4808
4809        if (increment < 0 && !can_nice(current, nice))
4810                return -EPERM;
4811
4812        retval = security_task_setnice(current, nice);
4813        if (retval)
4814                return retval;
4815
4816        set_user_nice(current, nice);
4817        return 0;
4818}
4819
4820#endif
4821
4822/**
4823 * task_prio - return the priority value of a given task.
4824 * @p: the task in question.
4825 *
4826 * This is the priority value as seen by users in /proc.
4827 * RT tasks are offset by -200. Normal tasks are centered
4828 * around 0, value goes from -16 to +15.
4829 */
4830int task_prio(const struct task_struct *p)
4831{
4832        return p->prio - MAX_RT_PRIO;
4833}
4834
4835/**
4836 * task_nice - return the nice value of a given task.
4837 * @p: the task in question.
4838 */
4839int task_nice(const struct task_struct *p)
4840{
4841        return TASK_NICE(p);
4842}
4843EXPORT_SYMBOL(task_nice);
4844
4845/**
4846 * idle_cpu - is a given cpu idle currently?
4847 * @cpu: the processor in question.
4848 */
4849int idle_cpu(int cpu)
4850{
4851        return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4852}
4853
4854/**
4855 * idle_task - return the idle task for a given cpu.
4856 * @cpu: the processor in question.
4857 */
4858struct task_struct *idle_task(int cpu)
4859{
4860        return cpu_rq(cpu)->idle;
4861}
4862
4863/**
4864 * find_process_by_pid - find a process with a matching PID value.
4865 * @pid: the pid in question.
4866 */
4867static struct task_struct *find_process_by_pid(pid_t pid)
4868{
4869        return pid ? find_task_by_vpid(pid) : current;
4870}
4871
4872/* Actually do priority change: must hold rq lock. */
4873static void
4874__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4875{
4876        BUG_ON(p->se.on_rq);
4877
4878        p->policy = policy;
4879        p->rt_priority = prio;
4880        p->normal_prio = normal_prio(p);
4881        /* we are holding p->pi_lock already */
4882        p->prio = rt_mutex_getprio(p);
4883        if (rt_prio(p->prio))
4884                p->sched_class = &rt_sched_class;
4885        else
4886                p->sched_class = &fair_sched_class;
4887        set_load_weight(p);
4888}
4889
4890/*
4891 * check the target process has a UID that matches the current process's
4892 */
4893static bool check_same_owner(struct task_struct *p)
4894{
4895        const struct cred *cred = current_cred(), *pcred;
4896        bool match;
4897
4898        rcu_read_lock();
4899        pcred = __task_cred(p);
4900        match = (cred->euid == pcred->euid ||
4901                 cred->euid == pcred->uid);
4902        rcu_read_unlock();
4903        return match;
4904}
4905
4906static int __sched_setscheduler(struct task_struct *p, int policy,
4907                                struct sched_param *param, bool user)
4908{
4909        int retval, oldprio, oldpolicy = -1, on_rq, running;
4910        unsigned long flags;
4911        const struct sched_class *prev_class;
4912        struct rq *rq;
4913        int reset_on_fork;
4914
4915        /* may grab non-irq protected spin_locks */
4916        BUG_ON(in_interrupt());
4917recheck:
4918        /* double check policy once rq lock held */
4919        if (policy < 0) {
4920                reset_on_fork = p->sched_reset_on_fork;
4921                policy = oldpolicy = p->policy;
4922        } else {
4923                reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
4924                policy &= ~SCHED_RESET_ON_FORK;
4925
4926                if (policy != SCHED_FIFO && policy != SCHED_RR &&
4927                                policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4928                                policy != SCHED_IDLE)
4929                        return -EINVAL;
4930        }
4931
4932        /*
4933         * Valid priorities for SCHED_FIFO and SCHED_RR are
4934         * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4935         * SCHED_BATCH and SCHED_IDLE is 0.
4936         */
4937        if (param->sched_priority < 0 ||
4938            (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4939            (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4940                return -EINVAL;
4941        if (rt_policy(policy) != (param->sched_priority != 0))
4942                return -EINVAL;
4943
4944        /*
4945         * Allow unprivileged RT tasks to decrease priority:
4946         */
4947        if (user && !capable(CAP_SYS_NICE)) {
4948                if (rt_policy(policy)) {
4949                        unsigned long rlim_rtprio =
4950                                        task_rlimit(p, RLIMIT_RTPRIO);
4951
4952                        /* can't set/change the rt policy */
4953                        if (policy != p->policy && !rlim_rtprio)
4954                                return -EPERM;
4955
4956                        /* can't increase priority */
4957                        if (param->sched_priority > p->rt_priority &&
4958                            param->sched_priority > rlim_rtprio)
4959                                return -EPERM;
4960                }
4961                /*
4962                 * Like positive nice levels, don't allow tasks to
4963                 * move out of SCHED_IDLE either:
4964                 */
4965                if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4966                        return -EPERM;
4967
4968                /* can't change other user's priorities */
4969                if (!check_same_owner(p))
4970                        return -EPERM;
4971
4972                /* Normal users shall not reset the sched_reset_on_fork flag */
4973                if (p->sched_reset_on_fork && !reset_on_fork)
4974                        return -EPERM;
4975        }
4976
4977        if (user) {
4978                retval = security_task_setscheduler(p);
4979                if (retval)
4980                        return retval;
4981        }
4982
4983        /*
4984         * make sure no PI-waiters arrive (or leave) while we are
4985         * changing the priority of the task:
4986         */
4987        raw_spin_lock_irqsave(&p->pi_lock, flags);
4988        /*
4989         * To be able to change p->policy safely, the appropriate
4990         * runqueue lock must be held.
4991         */
4992        rq = __task_rq_lock(p);
4993
4994        /*
4995         * Changing the policy of the stop threads is a very bad idea
4996         */
4997        if (p == rq->stop) {
4998                __task_rq_unlock(rq);
4999                raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5000                return -EINVAL;
5001        }
5002
5003#ifdef CONFIG_RT_GROUP_SCHED
5004        if (user) {
5005                /*
5006                 * Do not allow realtime tasks into groups that have no runtime
5007                 * assigned.
5008                 */
5009                if (rt_bandwidth_enabled() && rt_policy(policy) &&
5010                                task_group(p)->rt_bandwidth.rt_runtime == 0) {
5011                        __task_rq_unlock(rq);
5012                        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5013                        return -EPERM;
5014                }
5015        }
5016#endif
5017
5018        /* recheck policy now with rq lock held */
5019        if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5020                policy = oldpolicy = -1;
5021                __task_rq_unlock(rq);
5022                raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5023                goto recheck;
5024        }
5025        on_rq = p->se.on_rq;
5026        running = task_current(rq, p);
5027        if (on_rq)
5028                deactivate_task(rq, p, 0);
5029        if (running)
5030                p->sched_class->put_prev_task(rq, p);
5031
5032        p->sched_reset_on_fork = reset_on_fork;
5033
5034        oldprio = p->prio;
5035        prev_class = p->sched_class;
5036        __setscheduler(rq, p, policy, param->sched_priority);
5037
5038        if (running)
5039                p->sched_class->set_curr_task(rq);
5040        if (on_rq) {
5041                activate_task(rq, p, 0);
5042
5043                check_class_changed(rq, p, prev_class, oldprio, running);
5044        }
5045        __task_rq_unlock(rq);
5046        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5047
5048        rt_mutex_adjust_pi(p);
5049
5050        return 0;
5051}
5052
5053/**
5054 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5055 * @p: the task in question.
5056 * @policy: new policy.
5057 * @param: structure containing the new RT priority.
5058 *
5059 * NOTE that the task may already be dead.
5060 */
5061int sched_setscheduler(struct task_struct *p, int policy,
5062                       struct sched_param *param)
5063{
5064        return __sched_setscheduler(p, policy, param, true);
5065}
5066EXPORT_SYMBOL_GPL(sched_setscheduler);
5067
5068/**
5069 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5070 * @p: the task in question.
5071 * @policy: new policy.
5072 * @param: structure containing the new RT priority.
5073 *
5074 * Just like sched_setscheduler, only don't bother checking if the
5075 * current context has permission.  For example, this is needed in
5076 * stop_machine(): we create temporary high priority worker threads,
5077 * but our caller might not have that capability.
5078 */
5079int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5080                               struct sched_param *param)
5081{
5082        return __sched_setscheduler(p, policy, param, false);
5083}
5084
5085static int
5086do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5087{
5088        struct sched_param lparam;
5089        struct task_struct *p;
5090        int retval;
5091
5092        if (!param || pid < 0)
5093                return -EINVAL;
5094        if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5095                return -EFAULT;
5096
5097        rcu_read_lock();
5098        retval = -ESRCH;
5099        p = find_process_by_pid(pid);
5100        if (p != NULL)
5101                retval = sched_setscheduler(p, policy, &lparam);
5102        rcu_read_unlock();
5103
5104        return retval;
5105}
5106
5107/**
5108 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
5109 * @pid: the pid in question.
5110 * @policy: new policy.
5111 * @param: structure containing the new RT priority.
5112 */
5113SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
5114                struct sched_param __user *, param)
5115{
5116        /* negative values for policy are not valid */
5117        if (policy < 0)
5118                return -EINVAL;
5119
5120        return do_sched_setscheduler(pid, policy, param);
5121}
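
/*
 * Userspace view (illustrative, not kernel code): this syscall is normally
 * reached through the glibc sched_setscheduler(2) wrapper, e.g.:
 *
 *      #include <sched.h>
 *
 *      struct sched_param sp = { .sched_priority = 10 };
 *      if (sched_setscheduler(pid, SCHED_FIFO, &sp) == -1)
 *              perror("sched_setscheduler");
 */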
5122
5123/**
5124 * sys_sched_setparam - set/change the RT priority of a thread
5125 * @pid: the pid in question.
5126 * @param: structure containing the new RT priority.
5127 */
5128SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
5129{
5130        return do_sched_setscheduler(pid, -1, param);
5131}
5132
5133/**
5134 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
5135 * @pid: the pid in question.
5136 */
5137SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5138{
5139        struct task_struct *p;
5140        int retval;
5141
5142        if (pid < 0)
5143                return -EINVAL;
5144
5145        retval = -ESRCH;
5146        rcu_read_lock();
5147        p = find_process_by_pid(pid);
5148        if (p) {
5149                retval = security_task_getscheduler(p);
5150                if (!retval)
5151                        retval = p->policy
5152                                | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
5153        }
5154        rcu_read_unlock();
5155        return retval;
5156}
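
/*
 * Note (illustrative): the value handed back to userspace is the policy
 * with SCHED_RESET_ON_FORK possibly OR-ed in, so a caller wanting the bare
 * policy is expected to mask the flag off, e.g.:
 *
 *      int ret = sched_getscheduler(pid);
 *      int policy = ret & ~SCHED_RESET_ON_FORK;
 */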
5157
5158/**
5159 * sys_sched_getparam - get the RT priority of a thread
5160 * @pid: the pid in question.
5161 * @param: structure containing the RT priority.
5162 */
5163SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5164{
5165        struct sched_param lp;
5166        struct task_struct *p;
5167        int retval;
5168
5169        if (!param || pid < 0)
5170                return -EINVAL;
5171
5172        rcu_read_lock();
5173        p = find_process_by_pid(pid);
5174        retval = -ESRCH;
5175        if (!p)
5176                goto out_unlock;
5177
5178        retval = security_task_getscheduler(p);
5179        if (retval)
5180                goto out_unlock;
5181
5182        lp.sched_priority = p->rt_priority;
5183        rcu_read_unlock();
5184
5185        /*
5186         * This one might sleep; we cannot do it with a spinlock held.
5187         */
5188        retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5189
5190        return retval;
5191
5192out_unlock:
5193        rcu_read_unlock();
5194        return retval;
5195}
5196
5197long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5198{
5199        cpumask_var_t cpus_allowed, new_mask;
5200        struct task_struct *p;
5201        int retval;
5202
5203        get_online_cpus();
5204        rcu_read_lock();
5205
5206        p = find_process_by_pid(pid);
5207        if (!p) {
5208                rcu_read_unlock();
5209                put_online_cpus();
5210                return -ESRCH;
5211        }
5212
5213        /* Prevent p from going away */
5214        get_task_struct(p);
5215        rcu_read_unlock();
5216
5217        if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5218                retval = -ENOMEM;
5219                goto out_put_task;
5220        }
5221        if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5222                retval = -ENOMEM;
5223                goto out_free_cpus_allowed;
5224        }
5225        retval = -EPERM;
5226        if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
5227                goto out_unlock;
5228
5229        retval = security_task_setscheduler(p);
5230        if (retval)
5231                goto out_unlock;
5232
5233        cpuset_cpus_allowed(p, cpus_allowed);
5234        cpumask_and(new_mask, in_mask, cpus_allowed);
5235again:
5236        retval = set_cpus_allowed_ptr(p, new_mask);
5237
5238        if (!retval) {
5239                cpuset_cpus_allowed(p, cpus_allowed);
5240                if (!cpumask_subset(new_mask, cpus_allowed)) {
5241                        /*
5242                         * We must have raced with a concurrent cpuset
5243                         * update. Just reset the cpus_allowed to the
5244                         * cpuset's cpus_allowed.
5245                         */
5246                        cpumask_copy(new_mask, cpus_allowed);
5247                        goto again;
5248                }
5249        }
5250out_unlock:
5251        free_cpumask_var(new_mask);
5252out_free_cpus_allowed:
5253        free_cpumask_var(cpus_allowed);
5254out_put_task:
5255        put_task_struct(p);
5256        put_online_cpus();
5257        return retval;
5258}
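
/*
 * Userspace view (illustrative, not kernel code, built with _GNU_SOURCE):
 * pinning a thread to CPU 0 via the sched_setaffinity(2) wrapper, which
 * ends up in the kernel-side sched_setaffinity() above:
 *
 *      #include <sched.h>
 *
 *      cpu_set_t set;
 *      CPU_ZERO(&set);
 *      CPU_SET(0, &set);
 *      if (sched_setaffinity(pid, sizeof(set), &set) == -1)
 *              perror("sched_setaffinity");
 */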
5259
5260static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5261                             struct cpumask *