linux/kernel/sched/core.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  kernel/sched/core.c
   4 *
   5 *  Core kernel scheduler code and related syscalls
   6 *
   7 *  Copyright (C) 1991-2002  Linus Torvalds
   8 */
   9#define CREATE_TRACE_POINTS
  10#include <trace/events/sched.h>
  11#undef CREATE_TRACE_POINTS
  12
  13#include "sched.h"
  14
  15#include <linux/nospec.h>
  16
  17#include <linux/kcov.h>
  18#include <linux/scs.h>
  19
  20#include <asm/switch_to.h>
  21#include <asm/tlb.h>
  22
  23#include "../workqueue_internal.h"
  24#include "../../fs/io-wq.h"
  25#include "../smpboot.h"
  26
  27#include "pelt.h"
  28#include "smp.h"
  29
  30/*
  31 * Export tracepoints that act as a bare tracehook (ie: have no trace event
  32 * associated with them) to allow external modules to probe them.
  33 */
  34EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
  35EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
  36EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
  37EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
  38EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
  39EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
  40EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
  41EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
  42EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
  43EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
  44
  45DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
  46
  47#ifdef CONFIG_SCHED_DEBUG
  48/*
  49 * Debugging: various feature bits
  50 *
  51 * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
   52 * sysctl_sched_features, defined in sched.h, to allow constant propagation
   53 * at compile time and compiler optimization based on the features' defaults.
  54 */
  55#define SCHED_FEAT(name, enabled)       \
  56        (1UL << __SCHED_FEAT_##name) * enabled |
  57const_debug unsigned int sysctl_sched_features =
  58#include "features.h"
  59        0;
  60#undef SCHED_FEAT
  61
  62/*
  63 * Print a warning if need_resched is set for the given duration (if
  64 * LATENCY_WARN is enabled).
  65 *
  66 * If sysctl_resched_latency_warn_once is set, only one warning will be shown
  67 * per boot.
  68 */
  69__read_mostly int sysctl_resched_latency_warn_ms = 100;
  70__read_mostly int sysctl_resched_latency_warn_once = 1;
  71#endif /* CONFIG_SCHED_DEBUG */
  72
  73/*
  74 * Number of tasks to iterate in a single balance run.
  75 * Limited because this is done with IRQs disabled.
  76 */
  77const_debug unsigned int sysctl_sched_nr_migrate = 32;
  78
  79/*
   80 * Period over which we measure -rt task CPU usage, in microseconds.
  81 * default: 1s
  82 */
  83unsigned int sysctl_sched_rt_period = 1000000;
  84
  85__read_mostly int scheduler_running;
  86
  87#ifdef CONFIG_SCHED_CORE
  88
  89DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
  90
  91/* kernel prio, less is more */
  92static inline int __task_prio(struct task_struct *p)
  93{
  94        if (p->sched_class == &stop_sched_class) /* trumps deadline */
  95                return -2;
  96
  97        if (rt_prio(p->prio)) /* includes deadline */
  98                return p->prio; /* [-1, 99] */
  99
 100        if (p->sched_class == &idle_sched_class)
 101                return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
 102
 103        return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
 104}
 105
 106/*
 107 * l(a,b)
 108 * le(a,b) := !l(b,a)
 109 * g(a,b)  := l(b,a)
 110 * ge(a,b) := !l(a,b)
 111 */
 112
 113/* real prio, less is less */
 114static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
 115{
 116
 117        int pa = __task_prio(a), pb = __task_prio(b);
 118
 119        if (-pa < -pb)
 120                return true;
 121
 122        if (-pb < -pa)
 123                return false;
 124
 125        if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
 126                return !dl_time_before(a->dl.deadline, b->dl.deadline);
 127
 128        if (pa == MAX_RT_PRIO + MAX_NICE)       /* fair */
 129                return cfs_prio_less(a, b, in_fi);
 130
 131        return false;
 132}
 133
 134static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
 135{
 136        if (a->core_cookie < b->core_cookie)
 137                return true;
 138
 139        if (a->core_cookie > b->core_cookie)
 140                return false;
 141
 142        /* flip prio, so high prio is leftmost */
 143        if (prio_less(b, a, task_rq(a)->core->core_forceidle))
 144                return true;
 145
 146        return false;
 147}
 148
 149#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)
 150
 151static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
 152{
 153        return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
 154}
 155
 156static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
 157{
 158        const struct task_struct *p = __node_2_sc(node);
 159        unsigned long cookie = (unsigned long)key;
 160
 161        if (cookie < p->core_cookie)
 162                return -1;
 163
 164        if (cookie > p->core_cookie)
 165                return 1;
 166
 167        return 0;
 168}
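
/*
 * Illustrative sketch (user-space C, not kernel code): the ordering the
 * core_tree implements. Tasks sort by cookie first; within one cookie the
 * higher-priority task (smaller kernel prio number) sorts leftmost, so
 * sched_core_find() can return the best match for a cookie with a single
 * leftmost lookup. The DL-deadline and CFS tie-breaks of prio_less() are
 * omitted; struct demo_task and demo_less() are made-up names.
 */
#include <stdbool.h>

struct demo_task {
	unsigned long	cookie;
	int		prio;		/* kernel prio, less is more */
};

static bool demo_less(const struct demo_task *a, const struct demo_task *b)
{
	if (a->cookie != b->cookie)
		return a->cookie < b->cookie;

	/* same cookie: smaller prio number == higher priority == leftmost */
	return a->prio < b->prio;
}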
 169
 170void sched_core_enqueue(struct rq *rq, struct task_struct *p)
 171{
 172        rq->core->core_task_seq++;
 173
 174        if (!p->core_cookie)
 175                return;
 176
 177        rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
 178}
 179
 180void sched_core_dequeue(struct rq *rq, struct task_struct *p)
 181{
 182        rq->core->core_task_seq++;
 183
 184        if (!sched_core_enqueued(p))
 185                return;
 186
 187        rb_erase(&p->core_node, &rq->core_tree);
 188        RB_CLEAR_NODE(&p->core_node);
 189}
 190
 191/*
 192 * Find left-most (aka, highest priority) task matching @cookie.
 193 */
 194static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
 195{
 196        struct rb_node *node;
 197
 198        node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
 199        /*
 200         * The idle task always matches any cookie!
 201         */
 202        if (!node)
 203                return idle_sched_class.pick_task(rq);
 204
 205        return __node_2_sc(node);
 206}
 207
 208static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
 209{
 210        struct rb_node *node = &p->core_node;
 211
 212        node = rb_next(node);
 213        if (!node)
 214                return NULL;
 215
 216        p = container_of(node, struct task_struct, core_node);
 217        if (p->core_cookie != cookie)
 218                return NULL;
 219
 220        return p;
 221}
 222
 223/*
 224 * Magic required such that:
 225 *
 226 *      raw_spin_rq_lock(rq);
 227 *      ...
 228 *      raw_spin_rq_unlock(rq);
 229 *
 230 * ends up locking and unlocking the _same_ lock, and all CPUs
 231 * always agree on what rq has what lock.
 232 *
 233 * XXX entirely possible to selectively enable cores, don't bother for now.
 234 */
 235
 236static DEFINE_MUTEX(sched_core_mutex);
 237static atomic_t sched_core_count;
 238static struct cpumask sched_core_mask;
 239
 240static void sched_core_lock(int cpu, unsigned long *flags)
 241{
 242        const struct cpumask *smt_mask = cpu_smt_mask(cpu);
 243        int t, i = 0;
 244
 245        local_irq_save(*flags);
 246        for_each_cpu(t, smt_mask)
 247                raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
 248}
 249
 250static void sched_core_unlock(int cpu, unsigned long *flags)
 251{
 252        const struct cpumask *smt_mask = cpu_smt_mask(cpu);
 253        int t;
 254
 255        for_each_cpu(t, smt_mask)
 256                raw_spin_unlock(&cpu_rq(t)->__lock);
 257        local_irq_restore(*flags);
 258}
 259
 260static void __sched_core_flip(bool enabled)
 261{
 262        unsigned long flags;
 263        int cpu, t;
 264
 265        cpus_read_lock();
 266
 267        /*
 268         * Toggle the online cores, one by one.
 269         */
 270        cpumask_copy(&sched_core_mask, cpu_online_mask);
 271        for_each_cpu(cpu, &sched_core_mask) {
 272                const struct cpumask *smt_mask = cpu_smt_mask(cpu);
 273
 274                sched_core_lock(cpu, &flags);
 275
 276                for_each_cpu(t, smt_mask)
 277                        cpu_rq(t)->core_enabled = enabled;
 278
 279                sched_core_unlock(cpu, &flags);
 280
 281                cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
 282        }
 283
 284        /*
 285         * Toggle the offline CPUs.
 286         */
 287        cpumask_copy(&sched_core_mask, cpu_possible_mask);
 288        cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
 289
 290        for_each_cpu(cpu, &sched_core_mask)
 291                cpu_rq(cpu)->core_enabled = enabled;
 292
 293        cpus_read_unlock();
 294}
 295
 296static void sched_core_assert_empty(void)
 297{
 298        int cpu;
 299
 300        for_each_possible_cpu(cpu)
 301                WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
 302}
 303
 304static void __sched_core_enable(void)
 305{
 306        static_branch_enable(&__sched_core_enabled);
 307        /*
 308         * Ensure all previous instances of raw_spin_rq_*lock() have finished
 309         * and future ones will observe !sched_core_disabled().
 310         */
 311        synchronize_rcu();
 312        __sched_core_flip(true);
 313        sched_core_assert_empty();
 314}
 315
 316static void __sched_core_disable(void)
 317{
 318        sched_core_assert_empty();
 319        __sched_core_flip(false);
 320        static_branch_disable(&__sched_core_enabled);
 321}
 322
 323void sched_core_get(void)
 324{
 325        if (atomic_inc_not_zero(&sched_core_count))
 326                return;
 327
 328        mutex_lock(&sched_core_mutex);
 329        if (!atomic_read(&sched_core_count))
 330                __sched_core_enable();
 331
 332        smp_mb__before_atomic();
 333        atomic_inc(&sched_core_count);
 334        mutex_unlock(&sched_core_mutex);
 335}
 336
 337static void __sched_core_put(struct work_struct *work)
 338{
 339        if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
 340                __sched_core_disable();
 341                mutex_unlock(&sched_core_mutex);
 342        }
 343}
 344
 345void sched_core_put(void)
 346{
 347        static DECLARE_WORK(_work, __sched_core_put);
 348
 349        /*
 350         * "There can be only one"
 351         *
 352         * Either this is the last one, or we don't actually need to do any
 353         * 'work'. If it is the last *again*, we rely on
 354         * WORK_STRUCT_PENDING_BIT.
 355         */
 356        if (!atomic_add_unless(&sched_core_count, -1, 1))
 357                schedule_work(&_work);
 358}
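
/*
 * Illustrative sketch (user-space C11, not kernel code) of the refcount
 * pattern used by sched_core_get()/sched_core_put(): a lock-free fast path
 * while the count is already non-zero, and a mutex-serialized slow path for
 * the 0 <-> 1 transitions that do the expensive enable/disable work. The
 * kernel version additionally defers the final put to a workqueue; that is
 * left out here. All demo_* names are made up.
 */
#include <pthread.h>
#include <stdatomic.h>

static pthread_mutex_t demo_mutex = PTHREAD_MUTEX_INITIALIZER;
static atomic_int demo_count;

static void demo_enable(void) { /* expensive, runs only on 0 -> 1 */ }
static void demo_disable(void) { /* expensive, runs only on 1 -> 0 */ }

static void demo_get(void)
{
	int old = atomic_load(&demo_count);

	/* fast path: the atomic_inc_not_zero() equivalent */
	while (old) {
		if (atomic_compare_exchange_weak(&demo_count, &old, old + 1))
			return;
	}

	/* slow path: serialize the 0 -> 1 transition */
	pthread_mutex_lock(&demo_mutex);
	if (atomic_load(&demo_count) == 0)
		demo_enable();
	atomic_fetch_add(&demo_count, 1);
	pthread_mutex_unlock(&demo_mutex);
}

static void demo_put(void)
{
	pthread_mutex_lock(&demo_mutex);
	if (atomic_fetch_sub(&demo_count, 1) == 1)
		demo_disable();
	pthread_mutex_unlock(&demo_mutex);
}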
 359
 360#else /* !CONFIG_SCHED_CORE */
 361
 362static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
 363static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
 364
 365#endif /* CONFIG_SCHED_CORE */
 366
 367/*
  368 * Part of the period during which we allow rt tasks to run, in microseconds.
 369 * default: 0.95s
 370 */
 371int sysctl_sched_rt_runtime = 950000;
 372
 373
 374/*
 375 * Serialization rules:
 376 *
 377 * Lock order:
 378 *
 379 *   p->pi_lock
 380 *     rq->lock
 381 *       hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
 382 *
 383 *  rq1->lock
 384 *    rq2->lock  where: rq1 < rq2
 385 *
 386 * Regular state:
 387 *
 388 * Normal scheduling state is serialized by rq->lock. __schedule() takes the
 389 * local CPU's rq->lock, it optionally removes the task from the runqueue and
 390 * always looks at the local rq data structures to find the most eligible task
 391 * to run next.
 392 *
 393 * Task enqueue is also under rq->lock, possibly taken from another CPU.
 394 * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
 395 * the local CPU to avoid bouncing the runqueue state around [ see
 396 * ttwu_queue_wakelist() ]
 397 *
 398 * Task wakeup, specifically wakeups that involve migration, are horribly
 399 * complicated to avoid having to take two rq->locks.
 400 *
 401 * Special state:
 402 *
 403 * System-calls and anything external will use task_rq_lock() which acquires
 404 * both p->pi_lock and rq->lock. As a consequence the state they change is
 405 * stable while holding either lock:
 406 *
 407 *  - sched_setaffinity()/
 408 *    set_cpus_allowed_ptr():   p->cpus_ptr, p->nr_cpus_allowed
 409 *  - set_user_nice():          p->se.load, p->*prio
 410 *  - __sched_setscheduler():   p->sched_class, p->policy, p->*prio,
 411 *                              p->se.load, p->rt_priority,
 412 *                              p->dl.dl_{runtime, deadline, period, flags, bw, density}
 413 *  - sched_setnuma():          p->numa_preferred_nid
 414 *  - sched_move_task()/
 415 *    cpu_cgroup_fork():        p->sched_task_group
 416 *  - uclamp_update_active()    p->uclamp*
 417 *
 418 * p->state <- TASK_*:
 419 *
 420 *   is changed locklessly using set_current_state(), __set_current_state() or
 421 *   set_special_state(), see their respective comments, or by
  422 *   try_to_wake_up(). The latter uses p->pi_lock to serialize against
  423 *   concurrent wakeups of the same task.
 424 *
 425 * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
 426 *
 427 *   is set by activate_task() and cleared by deactivate_task(), under
 428 *   rq->lock. Non-zero indicates the task is runnable, the special
 429 *   ON_RQ_MIGRATING state is used for migration without holding both
 430 *   rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
 431 *
 432 * p->on_cpu <- { 0, 1 }:
 433 *
 434 *   is set by prepare_task() and cleared by finish_task() such that it will be
 435 *   set before p is scheduled-in and cleared after p is scheduled-out, both
 436 *   under rq->lock. Non-zero indicates the task is running on its CPU.
 437 *
 438 *   [ The astute reader will observe that it is possible for two tasks on one
 439 *     CPU to have ->on_cpu = 1 at the same time. ]
 440 *
 441 * task_cpu(p): is changed by set_task_cpu(), the rules are:
 442 *
 443 *  - Don't call set_task_cpu() on a blocked task:
 444 *
  445 *    We don't care what CPU we're not running on; this simplifies hotplug,
  446 *    since the CPU assignment of blocked tasks isn't required to be valid.
 447 *
 448 *  - for try_to_wake_up(), called under p->pi_lock:
 449 *
 450 *    This allows try_to_wake_up() to only take one rq->lock, see its comment.
 451 *
 452 *  - for migration called under rq->lock:
 453 *    [ see task_on_rq_migrating() in task_rq_lock() ]
 454 *
 455 *    o move_queued_task()
 456 *    o detach_task()
 457 *
 458 *  - for migration called under double_rq_lock():
 459 *
 460 *    o __migrate_swap_task()
 461 *    o push_rt_task() / pull_rt_task()
 462 *    o push_dl_task() / pull_dl_task()
 463 *    o dl_task_offline_migration()
 464 *
 465 */
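
/*
 * Illustrative sketch (kernel-style, hypothetical): the canonical "special
 * state" update pattern described above. External/syscall-side code changes
 * per-task scheduler state with both p->pi_lock and rq->lock held via
 * task_rq_lock(), so a reader holding either lock sees a stable value. The
 * ->demo_field member and demo_set_field() are made up for illustration and
 * do not exist in task_struct.
 */
static void demo_set_field(struct task_struct *p, int value)
{
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(p, &rf);	/* p->pi_lock, then rq->lock */
	p->demo_field = value;		/* stable under either lock */
	task_rq_unlock(rq, p, &rf);
}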
 466
 467void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
 468{
 469        raw_spinlock_t *lock;
 470
 471        /* Matches synchronize_rcu() in __sched_core_enable() */
 472        preempt_disable();
 473        if (sched_core_disabled()) {
 474                raw_spin_lock_nested(&rq->__lock, subclass);
 475                /* preempt_count *MUST* be > 1 */
 476                preempt_enable_no_resched();
 477                return;
 478        }
 479
 480        for (;;) {
 481                lock = __rq_lockp(rq);
 482                raw_spin_lock_nested(lock, subclass);
 483                if (likely(lock == __rq_lockp(rq))) {
 484                        /* preempt_count *MUST* be > 1 */
 485                        preempt_enable_no_resched();
 486                        return;
 487                }
 488                raw_spin_unlock(lock);
 489        }
 490}
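
/*
 * Illustrative sketch (user-space C11 + pthreads, not kernel code) of the
 * retry pattern above: load which lock currently guards the object, take it,
 * then re-check that the mapping did not change while we slept. If it did
 * (here: because core scheduling was flipped, changing which lock a rq maps
 * to), drop the stale lock and try again. demo_* names are made up.
 */
#include <pthread.h>
#include <stdatomic.h>

struct demo_obj {
	pthread_mutex_t *_Atomic lockp;	/* which mutex currently guards us */
};

static void demo_lock(struct demo_obj *obj)
{
	pthread_mutex_t *lock;

	for (;;) {
		lock = atomic_load(&obj->lockp);
		pthread_mutex_lock(lock);
		if (lock == atomic_load(&obj->lockp))
			return;			/* mapping still valid */
		pthread_mutex_unlock(lock);	/* raced with a flip, retry */
	}
}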
 491
 492bool raw_spin_rq_trylock(struct rq *rq)
 493{
 494        raw_spinlock_t *lock;
 495        bool ret;
 496
 497        /* Matches synchronize_rcu() in __sched_core_enable() */
 498        preempt_disable();
 499        if (sched_core_disabled()) {
 500                ret = raw_spin_trylock(&rq->__lock);
 501                preempt_enable();
 502                return ret;
 503        }
 504
 505        for (;;) {
 506                lock = __rq_lockp(rq);
 507                ret = raw_spin_trylock(lock);
 508                if (!ret || (likely(lock == __rq_lockp(rq)))) {
 509                        preempt_enable();
 510                        return ret;
 511                }
 512                raw_spin_unlock(lock);
 513        }
 514}
 515
 516void raw_spin_rq_unlock(struct rq *rq)
 517{
 518        raw_spin_unlock(rq_lockp(rq));
 519}
 520
 521#ifdef CONFIG_SMP
 522/*
 523 * double_rq_lock - safely lock two runqueues
 524 */
 525void double_rq_lock(struct rq *rq1, struct rq *rq2)
 526{
 527        lockdep_assert_irqs_disabled();
 528
 529        if (rq_order_less(rq2, rq1))
 530                swap(rq1, rq2);
 531
 532        raw_spin_rq_lock(rq1);
 533        if (__rq_lockp(rq1) == __rq_lockp(rq2))
 534                return;
 535
 536        raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
 537}
 538#endif
 539
 540/*
 541 * __task_rq_lock - lock the rq @p resides on.
 542 */
 543struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 544        __acquires(rq->lock)
 545{
 546        struct rq *rq;
 547
 548        lockdep_assert_held(&p->pi_lock);
 549
 550        for (;;) {
 551                rq = task_rq(p);
 552                raw_spin_rq_lock(rq);
 553                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
 554                        rq_pin_lock(rq, rf);
 555                        return rq;
 556                }
 557                raw_spin_rq_unlock(rq);
 558
 559                while (unlikely(task_on_rq_migrating(p)))
 560                        cpu_relax();
 561        }
 562}
 563
 564/*
 565 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 566 */
 567struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 568        __acquires(p->pi_lock)
 569        __acquires(rq->lock)
 570{
 571        struct rq *rq;
 572
 573        for (;;) {
 574                raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
 575                rq = task_rq(p);
 576                raw_spin_rq_lock(rq);
 577                /*
 578                 *      move_queued_task()              task_rq_lock()
 579                 *
 580                 *      ACQUIRE (rq->lock)
 581                 *      [S] ->on_rq = MIGRATING         [L] rq = task_rq()
 582                 *      WMB (__set_task_cpu())          ACQUIRE (rq->lock);
 583                 *      [S] ->cpu = new_cpu             [L] task_rq()
 584                 *                                      [L] ->on_rq
 585                 *      RELEASE (rq->lock)
 586                 *
 587                 * If we observe the old CPU in task_rq_lock(), the acquire of
 588                 * the old rq->lock will fully serialize against the stores.
 589                 *
 590                 * If we observe the new CPU in task_rq_lock(), the address
 591                 * dependency headed by '[L] rq = task_rq()' and the acquire
 592                 * will pair with the WMB to ensure we then also see migrating.
 593                 */
 594                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
 595                        rq_pin_lock(rq, rf);
 596                        return rq;
 597                }
 598                raw_spin_rq_unlock(rq);
 599                raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 600
 601                while (unlikely(task_on_rq_migrating(p)))
 602                        cpu_relax();
 603        }
 604}
 605
 606/*
 607 * RQ-clock updating methods:
 608 */
 609
 610static void update_rq_clock_task(struct rq *rq, s64 delta)
 611{
 612/*
  613 * In theory, the compiler should just see 0 here, and optimize out the call
 614 * to sched_rt_avg_update. But I don't trust it...
 615 */
 616        s64 __maybe_unused steal = 0, irq_delta = 0;
 617
 618#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 619        irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 620
 621        /*
 622         * Since irq_time is only updated on {soft,}irq_exit, we might run into
 623         * this case when a previous update_rq_clock() happened inside a
 624         * {soft,}irq region.
 625         *
 626         * When this happens, we stop ->clock_task and only update the
 627         * prev_irq_time stamp to account for the part that fit, so that a next
 628         * update will consume the rest. This ensures ->clock_task is
 629         * monotonic.
 630         *
  631 * It does however cause some slight misattribution of {soft,}irq
 632         * time, a more accurate solution would be to update the irq_time using
 633         * the current rq->clock timestamp, except that would require using
 634         * atomic ops.
 635         */
 636        if (irq_delta > delta)
 637                irq_delta = delta;
 638
 639        rq->prev_irq_time += irq_delta;
 640        delta -= irq_delta;
 641#endif
 642#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 643        if (static_key_false((&paravirt_steal_rq_enabled))) {
 644                steal = paravirt_steal_clock(cpu_of(rq));
 645                steal -= rq->prev_steal_time_rq;
 646
 647                if (unlikely(steal > delta))
 648                        steal = delta;
 649
 650                rq->prev_steal_time_rq += steal;
 651                delta -= steal;
 652        }
 653#endif
 654
 655        rq->clock_task += delta;
 656
 657#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
 658        if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
 659                update_irq_load_avg(rq, irq_delta + steal);
 660#endif
 661        update_rq_clock_pelt(rq, delta);
 662}
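
/*
 * Worked example of the accounting above (all numbers invented): suppose
 * update_rq_clock() measured delta = 1,000,000 ns since the last update,
 * irq_time advanced by irq_delta = 300,000 ns and the hypervisor reported
 * 200,000 ns of steal time. Then:
 *
 *	rq->clock      += 1,000,000	(wall progress, unconditional)
 *	rq->clock_task +=   500,000	(1,000,000 - 300,000 - 200,000)
 *
 * so only half of the elapsed window is charged to the running task. Had
 * irq_delta exceeded delta (possible, since irq_time is only sampled at
 * {soft,}irq exit), it would have been clamped to delta and the remainder
 * consumed by the next update, keeping clock_task monotonic.
 */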
 663
 664void update_rq_clock(struct rq *rq)
 665{
 666        s64 delta;
 667
 668        lockdep_assert_rq_held(rq);
 669
 670        if (rq->clock_update_flags & RQCF_ACT_SKIP)
 671                return;
 672
 673#ifdef CONFIG_SCHED_DEBUG
 674        if (sched_feat(WARN_DOUBLE_CLOCK))
 675                SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
 676        rq->clock_update_flags |= RQCF_UPDATED;
 677#endif
 678
 679        delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
 680        if (delta < 0)
 681                return;
 682        rq->clock += delta;
 683        update_rq_clock_task(rq, delta);
 684}
 685
 686#ifdef CONFIG_SCHED_HRTICK
 687/*
 688 * Use HR-timers to deliver accurate preemption points.
 689 */
 690
 691static void hrtick_clear(struct rq *rq)
 692{
 693        if (hrtimer_active(&rq->hrtick_timer))
 694                hrtimer_cancel(&rq->hrtick_timer);
 695}
 696
 697/*
 698 * High-resolution timer tick.
 699 * Runs from hardirq context with interrupts disabled.
 700 */
 701static enum hrtimer_restart hrtick(struct hrtimer *timer)
 702{
 703        struct rq *rq = container_of(timer, struct rq, hrtick_timer);
 704        struct rq_flags rf;
 705
 706        WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
 707
 708        rq_lock(rq, &rf);
 709        update_rq_clock(rq);
 710        rq->curr->sched_class->task_tick(rq, rq->curr, 1);
 711        rq_unlock(rq, &rf);
 712
 713        return HRTIMER_NORESTART;
 714}
 715
 716#ifdef CONFIG_SMP
 717
 718static void __hrtick_restart(struct rq *rq)
 719{
 720        struct hrtimer *timer = &rq->hrtick_timer;
 721        ktime_t time = rq->hrtick_time;
 722
 723        hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
 724}
 725
 726/*
 727 * called from hardirq (IPI) context
 728 */
 729static void __hrtick_start(void *arg)
 730{
 731        struct rq *rq = arg;
 732        struct rq_flags rf;
 733
 734        rq_lock(rq, &rf);
 735        __hrtick_restart(rq);
 736        rq_unlock(rq, &rf);
 737}
 738
 739/*
 740 * Called to set the hrtick timer state.
 741 *
 742 * called with rq->lock held and irqs disabled
 743 */
 744void hrtick_start(struct rq *rq, u64 delay)
 745{
 746        struct hrtimer *timer = &rq->hrtick_timer;
 747        s64 delta;
 748
 749        /*
 750         * Don't schedule slices shorter than 10000ns, that just
 751         * doesn't make sense and can cause timer DoS.
 752         */
 753        delta = max_t(s64, delay, 10000LL);
 754        rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
 755
 756        if (rq == this_rq())
 757                __hrtick_restart(rq);
 758        else
 759                smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
 760}
 761
 762#else
 763/*
 764 * Called to set the hrtick timer state.
 765 *
 766 * called with rq->lock held and irqs disabled
 767 */
 768void hrtick_start(struct rq *rq, u64 delay)
 769{
 770        /*
 771         * Don't schedule slices shorter than 10000ns, that just
 772         * doesn't make sense. Rely on vruntime for fairness.
 773         */
 774        delay = max_t(u64, delay, 10000LL);
 775        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
 776                      HRTIMER_MODE_REL_PINNED_HARD);
 777}
 778
 779#endif /* CONFIG_SMP */
 780
 781static void hrtick_rq_init(struct rq *rq)
 782{
 783#ifdef CONFIG_SMP
 784        INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
 785#endif
 786        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
 787        rq->hrtick_timer.function = hrtick;
 788}
 789#else   /* CONFIG_SCHED_HRTICK */
 790static inline void hrtick_clear(struct rq *rq)
 791{
 792}
 793
 794static inline void hrtick_rq_init(struct rq *rq)
 795{
 796}
 797#endif  /* CONFIG_SCHED_HRTICK */
 798
 799/*
 800 * cmpxchg based fetch_or, macro so it works for different integer types
 801 */
 802#define fetch_or(ptr, mask)                                             \
 803        ({                                                              \
 804                typeof(ptr) _ptr = (ptr);                               \
 805                typeof(mask) _mask = (mask);                            \
 806                typeof(*_ptr) _old, _val = *_ptr;                       \
 807                                                                        \
 808                for (;;) {                                              \
 809                        _old = cmpxchg(_ptr, _val, _val | _mask);       \
 810                        if (_old == _val)                               \
 811                                break;                                  \
 812                        _val = _old;                                    \
 813                }                                                       \
 814        _old;                                                           \
 815})
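
/*
 * Illustrative sketch (user-space C11, not kernel code): fetch_or() is an
 * atomic read-modify-write that ORs in a mask and returns the *old* value,
 * so the caller can tell which bits were already set. That is exactly how
 * set_nr_and_not_polling() below decides whether an IPI is needed: set
 * NEED_RESCHED, then look at the old flags for POLLING. demo_* names and
 * flag values are made up.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define DEMO_NEED_RESCHED	(1u << 0)
#define DEMO_POLLING		(1u << 1)

static bool demo_set_nr_and_not_polling(atomic_uint *flags)
{
	/* C11 spelling of the cmpxchg loop in fetch_or() above */
	unsigned int old = atomic_fetch_or(flags, DEMO_NEED_RESCHED);

	return !(old & DEMO_POLLING);	/* true -> send the reschedule IPI */
}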
 816
 817#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
 818/*
 819 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
 820 * this avoids any races wrt polling state changes and thereby avoids
 821 * spurious IPIs.
 822 */
 823static bool set_nr_and_not_polling(struct task_struct *p)
 824{
 825        struct thread_info *ti = task_thread_info(p);
 826        return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
 827}
 828
 829/*
 830 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 831 *
 832 * If this returns true, then the idle task promises to call
 833 * sched_ttwu_pending() and reschedule soon.
 834 */
 835static bool set_nr_if_polling(struct task_struct *p)
 836{
 837        struct thread_info *ti = task_thread_info(p);
 838        typeof(ti->flags) old, val = READ_ONCE(ti->flags);
 839
 840        for (;;) {
 841                if (!(val & _TIF_POLLING_NRFLAG))
 842                        return false;
 843                if (val & _TIF_NEED_RESCHED)
 844                        return true;
 845                old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
 846                if (old == val)
 847                        break;
 848                val = old;
 849        }
 850        return true;
 851}
 852
 853#else
 854static bool set_nr_and_not_polling(struct task_struct *p)
 855{
 856        set_tsk_need_resched(p);
 857        return true;
 858}
 859
 860#ifdef CONFIG_SMP
 861static bool set_nr_if_polling(struct task_struct *p)
 862{
 863        return false;
 864}
 865#endif
 866#endif
 867
 868static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
 869{
 870        struct wake_q_node *node = &task->wake_q;
 871
 872        /*
  873         * Atomically grab the task; if ->wake_q is already non-NULL, the task is
  874         * already queued (either by us or someone else) and will get the
 875         * wakeup due to that.
 876         *
 877         * In order to ensure that a pending wakeup will observe our pending
 878         * state, even in the failed case, an explicit smp_mb() must be used.
 879         */
 880        smp_mb__before_atomic();
 881        if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
 882                return false;
 883
 884        /*
 885         * The head is context local, there can be no concurrency.
 886         */
 887        *head->lastp = node;
 888        head->lastp = &node->next;
 889        return true;
 890}
 891
 892/**
 893 * wake_q_add() - queue a wakeup for 'later' waking.
 894 * @head: the wake_q_head to add @task to
 895 * @task: the task to queue for 'later' wakeup
 896 *
 897 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 898 * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
 899 * instantly.
 900 *
 901 * This function must be used as-if it were wake_up_process(); IOW the task
 902 * must be ready to be woken at this location.
 903 */
 904void wake_q_add(struct wake_q_head *head, struct task_struct *task)
 905{
 906        if (__wake_q_add(head, task))
 907                get_task_struct(task);
 908}
 909
 910/**
 911 * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
 912 * @head: the wake_q_head to add @task to
 913 * @task: the task to queue for 'later' wakeup
 914 *
 915 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 916 * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
 917 * instantly.
 918 *
 919 * This function must be used as-if it were wake_up_process(); IOW the task
 920 * must be ready to be woken at this location.
 921 *
 922 * This function is essentially a task-safe equivalent to wake_q_add(). Callers
 923 * that already hold reference to @task can call the 'safe' version and trust
  924 * wake_q to do the right thing depending on whether or not the @task is already
 925 * queued for wakeup.
 926 */
 927void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
 928{
 929        if (!__wake_q_add(head, task))
 930                put_task_struct(task);
 931}
 932
 933void wake_up_q(struct wake_q_head *head)
 934{
 935        struct wake_q_node *node = head->first;
 936
 937        while (node != WAKE_Q_TAIL) {
 938                struct task_struct *task;
 939
 940                task = container_of(node, struct task_struct, wake_q);
 941                /* Task can safely be re-inserted now: */
 942                node = node->next;
 943                task->wake_q.next = NULL;
 944
 945                /*
 946                 * wake_up_process() executes a full barrier, which pairs with
 947                 * the queueing in wake_q_add() so as not to miss wakeups.
 948                 */
 949                wake_up_process(task);
 950                put_task_struct(task);
 951        }
 952}
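
/*
 * Illustrative sketch (kernel-style, hypothetical): the typical wake_q usage
 * pattern. Wakeups are recorded while a hot spinlock is held and only issued
 * by wake_up_q() after the lock is dropped, so wake_up_process() never runs
 * under it. struct demo_waitqueue / demo_waiter are made-up types standing in
 * for a real waiter list (futex, rwsem, etc.).
 */
static void demo_wake_all_waiters(struct demo_waitqueue *q)
{
	DEFINE_WAKE_Q(wake_q);
	struct demo_waiter *w;

	spin_lock(&q->lock);
	list_for_each_entry(w, &q->waiters, list)
		wake_q_add(&wake_q, w->task);	/* grabs a task reference */
	spin_unlock(&q->lock);

	wake_up_q(&wake_q);			/* wakeups + reference drops */
}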
 953
 954/*
 955 * resched_curr - mark rq's current task 'to be rescheduled now'.
 956 *
 957 * On UP this means the setting of the need_resched flag, on SMP it
 958 * might also involve a cross-CPU call to trigger the scheduler on
 959 * the target CPU.
 960 */
 961void resched_curr(struct rq *rq)
 962{
 963        struct task_struct *curr = rq->curr;
 964        int cpu;
 965
 966        lockdep_assert_rq_held(rq);
 967
 968        if (test_tsk_need_resched(curr))
 969                return;
 970
 971        cpu = cpu_of(rq);
 972
 973        if (cpu == smp_processor_id()) {
 974                set_tsk_need_resched(curr);
 975                set_preempt_need_resched();
 976                return;
 977        }
 978
 979        if (set_nr_and_not_polling(curr))
 980                smp_send_reschedule(cpu);
 981        else
 982                trace_sched_wake_idle_without_ipi(cpu);
 983}
 984
 985void resched_cpu(int cpu)
 986{
 987        struct rq *rq = cpu_rq(cpu);
 988        unsigned long flags;
 989
 990        raw_spin_rq_lock_irqsave(rq, flags);
 991        if (cpu_online(cpu) || cpu == smp_processor_id())
 992                resched_curr(rq);
 993        raw_spin_rq_unlock_irqrestore(rq, flags);
 994}
 995
 996#ifdef CONFIG_SMP
 997#ifdef CONFIG_NO_HZ_COMMON
 998/*
 999 * In the semi idle case, use the nearest busy CPU for migrating timers
1000 * from an idle CPU.  This is good for power-savings.
1001 *
 1002 * We don't do a similar optimization for a completely idle system, as
 1003 * selecting an idle CPU would add more delay to the timers than intended
 1004 * (that CPU's timer base may not be up to date wrt jiffies etc.).
1005 */
1006int get_nohz_timer_target(void)
1007{
1008        int i, cpu = smp_processor_id(), default_cpu = -1;
1009        struct sched_domain *sd;
1010
1011        if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
1012                if (!idle_cpu(cpu))
1013                        return cpu;
1014                default_cpu = cpu;
1015        }
1016
1017        rcu_read_lock();
1018        for_each_domain(cpu, sd) {
1019                for_each_cpu_and(i, sched_domain_span(sd),
1020                        housekeeping_cpumask(HK_FLAG_TIMER)) {
1021                        if (cpu == i)
1022                                continue;
1023
1024                        if (!idle_cpu(i)) {
1025                                cpu = i;
1026                                goto unlock;
1027                        }
1028                }
1029        }
1030
1031        if (default_cpu == -1)
1032                default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
1033        cpu = default_cpu;
1034unlock:
1035        rcu_read_unlock();
1036        return cpu;
1037}
1038
1039/*
1040 * When add_timer_on() enqueues a timer into the timer wheel of an
1041 * idle CPU then this timer might expire before the next timer event
1042 * which is scheduled to wake up that CPU. In case of a completely
1043 * idle system the next event might even be infinite time into the
1044 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1045 * leaves the inner idle loop so the newly added timer is taken into
1046 * account when the CPU goes back to idle and evaluates the timer
1047 * wheel for the next timer event.
1048 */
1049static void wake_up_idle_cpu(int cpu)
1050{
1051        struct rq *rq = cpu_rq(cpu);
1052
1053        if (cpu == smp_processor_id())
1054                return;
1055
1056        if (set_nr_and_not_polling(rq->idle))
1057                smp_send_reschedule(cpu);
1058        else
1059                trace_sched_wake_idle_without_ipi(cpu);
1060}
1061
1062static bool wake_up_full_nohz_cpu(int cpu)
1063{
1064        /*
1065         * We just need the target to call irq_exit() and re-evaluate
1066         * the next tick. The nohz full kick at least implies that.
1067         * If needed we can still optimize that later with an
1068         * empty IRQ.
1069         */
1070        if (cpu_is_offline(cpu))
1071                return true;  /* Don't try to wake offline CPUs. */
1072        if (tick_nohz_full_cpu(cpu)) {
1073                if (cpu != smp_processor_id() ||
1074                    tick_nohz_tick_stopped())
1075                        tick_nohz_full_kick_cpu(cpu);
1076                return true;
1077        }
1078
1079        return false;
1080}
1081
1082/*
1083 * Wake up the specified CPU.  If the CPU is going offline, it is the
1084 * caller's responsibility to deal with the lost wakeup, for example,
1085 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
1086 */
1087void wake_up_nohz_cpu(int cpu)
1088{
1089        if (!wake_up_full_nohz_cpu(cpu))
1090                wake_up_idle_cpu(cpu);
1091}
1092
1093static void nohz_csd_func(void *info)
1094{
1095        struct rq *rq = info;
1096        int cpu = cpu_of(rq);
1097        unsigned int flags;
1098
1099        /*
1100         * Release the rq::nohz_csd.
1101         */
1102        flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
1103        WARN_ON(!(flags & NOHZ_KICK_MASK));
1104
1105        rq->idle_balance = idle_cpu(cpu);
1106        if (rq->idle_balance && !need_resched()) {
1107                rq->nohz_idle_balance = flags;
1108                raise_softirq_irqoff(SCHED_SOFTIRQ);
1109        }
1110}
1111
1112#endif /* CONFIG_NO_HZ_COMMON */
1113
1114#ifdef CONFIG_NO_HZ_FULL
1115bool sched_can_stop_tick(struct rq *rq)
1116{
1117        int fifo_nr_running;
1118
1119        /* Deadline tasks, even if single, need the tick */
1120        if (rq->dl.dl_nr_running)
1121                return false;
1122
1123        /*
 1124         * If there is more than one RR task, we need the tick to affect the
1125         * actual RR behaviour.
1126         */
1127        if (rq->rt.rr_nr_running) {
1128                if (rq->rt.rr_nr_running == 1)
1129                        return true;
1130                else
1131                        return false;
1132        }
1133
1134        /*
 1135         * If there are no RR tasks but there are FIFO tasks, we can skip the tick: no
1136         * forced preemption between FIFO tasks.
1137         */
1138        fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
1139        if (fifo_nr_running)
1140                return true;
1141
1142        /*
 1143         * If there are no DL, RR or FIFO tasks, only CFS tasks can be left;
1144         * if there's more than one we need the tick for involuntary
1145         * preemption.
1146         */
1147        if (rq->nr_running > 1)
1148                return false;
1149
1150        return true;
1151}
1152#endif /* CONFIG_NO_HZ_FULL */
1153#endif /* CONFIG_SMP */
1154
1155#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1156                        (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1157/*
1158 * Iterate task_group tree rooted at *from, calling @down when first entering a
1159 * node and @up when leaving it for the final time.
1160 *
1161 * Caller must hold rcu_lock or sufficient equivalent.
1162 */
1163int walk_tg_tree_from(struct task_group *from,
1164                             tg_visitor down, tg_visitor up, void *data)
1165{
1166        struct task_group *parent, *child;
1167        int ret;
1168
1169        parent = from;
1170
1171down:
1172        ret = (*down)(parent, data);
1173        if (ret)
1174                goto out;
1175        list_for_each_entry_rcu(child, &parent->children, siblings) {
1176                parent = child;
1177                goto down;
1178
1179up:
1180                continue;
1181        }
1182        ret = (*up)(parent, data);
1183        if (ret || parent == from)
1184                goto out;
1185
1186        child = parent;
1187        parent = parent->parent;
1188        if (parent)
1189                goto up;
1190out:
1191        return ret;
1192}
1193
1194int tg_nop(struct task_group *tg, void *data)
1195{
1196        return 0;
1197}
1198#endif
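
/*
 * Illustrative sketch (kernel-style, hypothetical): a typical
 * walk_tg_tree_from() caller. The @down visitor runs when a group is first
 * entered, @up when its subtree is finished; a non-zero return from either
 * aborts the walk. tg_nop() above is the stock no-op visitor.
 * demo_count_one()/demo_count_groups() are made-up names.
 */
static int demo_count_one(struct task_group *tg, void *data)
{
	(*(int *)data)++;
	return 0;			/* keep walking */
}

static int demo_count_groups(struct task_group *from)
{
	int count = 0;

	rcu_read_lock();		/* the walker requires RCU (or equivalent) */
	walk_tg_tree_from(from, demo_count_one, tg_nop, &count);
	rcu_read_unlock();

	return count;
}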
1199
1200static void set_load_weight(struct task_struct *p, bool update_load)
1201{
1202        int prio = p->static_prio - MAX_RT_PRIO;
1203        struct load_weight *load = &p->se.load;
1204
1205        /*
1206         * SCHED_IDLE tasks get minimal weight:
1207         */
1208        if (task_has_idle_policy(p)) {
1209                load->weight = scale_load(WEIGHT_IDLEPRIO);
1210                load->inv_weight = WMULT_IDLEPRIO;
1211                return;
1212        }
1213
1214        /*
1215         * SCHED_OTHER tasks have to update their load when changing their
1216         * weight
1217         */
1218        if (update_load && p->sched_class == &fair_sched_class) {
1219                reweight_task(p, prio);
1220        } else {
1221                load->weight = scale_load(sched_prio_to_weight[prio]);
1222                load->inv_weight = sched_prio_to_wmult[prio];
1223        }
1224}
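
/*
 * Worked example of what these weights mean for CFS (weights from the
 * sched_prio_to_weight[] table used above; shares are approximate): nice 0
 * maps to weight 1024 and each nice level scales by roughly 1.25x. Two
 * always-runnable tasks on one CPU split time in proportion to their weights:
 *
 *	nice  0 vs nice 0:	1024 / (1024 + 1024)  ~ 50% each
 *	nice  5 vs nice 0:	 335 / ( 335 + 1024)  ~ 25% vs 75%
 *	nice -5 vs nice 0:	3121 / (3121 + 1024)  ~ 75% vs 25%
 *
 * SCHED_IDLE tasks instead get the fixed WEIGHT_IDLEPRIO (3), a share that is
 * negligible next to any normal task.
 */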
1225
1226#ifdef CONFIG_UCLAMP_TASK
1227/*
1228 * Serializes updates of utilization clamp values
1229 *
1230 * The (slow-path) user-space triggers utilization clamp value updates which
1231 * can require updates on (fast-path) scheduler's data structures used to
1232 * support enqueue/dequeue operations.
1233 * While the per-CPU rq lock protects fast-path update operations, user-space
1234 * requests are serialized using a mutex to reduce the risk of conflicting
1235 * updates or API abuses.
1236 */
1237static DEFINE_MUTEX(uclamp_mutex);
1238
1239/* Max allowed minimum utilization */
1240unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
1241
1242/* Max allowed maximum utilization */
1243unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
1244
1245/*
1246 * By default RT tasks run at the maximum performance point/capacity of the
1247 * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
1248 * SCHED_CAPACITY_SCALE.
1249 *
1250 * This knob allows admins to change the default behavior when uclamp is being
1251 * used. In battery powered devices, particularly, running at the maximum
1252 * capacity and frequency will increase energy consumption and shorten the
1253 * battery life.
1254 *
 1255 * This knob only affects RT tasks whose uclamp_se->user_defined == false.
1256 *
1257 * This knob will not override the system default sched_util_clamp_min defined
1258 * above.
1259 */
1260unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
1261
1262/* All clamps are required to be less or equal than these values */
1263static struct uclamp_se uclamp_default[UCLAMP_CNT];
1264
1265/*
1266 * This static key is used to reduce the uclamp overhead in the fast path. It
1267 * primarily disables the call to uclamp_rq_{inc, dec}() in
1268 * enqueue/dequeue_task().
1269 *
1270 * This allows users to continue to enable uclamp in their kernel config with
1271 * minimum uclamp overhead in the fast path.
1272 *
1273 * As soon as userspace modifies any of the uclamp knobs, the static key is
 1274 * enabled, since there are then actual users that make use of the uclamp
 1275 * functionality.
1276 *
1277 * The knobs that would enable this static key are:
1278 *
1279 *   * A task modifying its uclamp value with sched_setattr().
1280 *   * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
1281 *   * An admin modifying the cgroup cpu.uclamp.{min, max}
1282 */
1283DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
1284
1285/* Integer rounded range for each bucket */
1286#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
1287
1288#define for_each_clamp_id(clamp_id) \
1289        for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
1290
1291static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
1292{
1293        return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
1294}
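
/*
 * Worked example of the bucket math, assuming the default
 * CONFIG_UCLAMP_BUCKETS_COUNT=5, i.e. UCLAMP_BUCKET_DELTA =
 * DIV_ROUND_CLOSEST(1024, 5) = 205:
 *
 *	clamp_value    0 ->    0/205 = bucket 0
 *	clamp_value  300 ->  300/205 = bucket 1
 *	clamp_value  820 ->  820/205 = bucket 4
 *	clamp_value 1024 -> 1024/205 = bucket 4
 *
 * With these defaults every legal value already lands in buckets 0..4, so the
 * min_t() against UCLAMP_BUCKETS - 1 is only a guard for other bucket counts.
 */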
1295
1296static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
1297{
1298        if (clamp_id == UCLAMP_MIN)
1299                return 0;
1300        return SCHED_CAPACITY_SCALE;
1301}
1302
1303static inline void uclamp_se_set(struct uclamp_se *uc_se,
1304                                 unsigned int value, bool user_defined)
1305{
1306        uc_se->value = value;
1307        uc_se->bucket_id = uclamp_bucket_id(value);
1308        uc_se->user_defined = user_defined;
1309}
1310
1311static inline unsigned int
1312uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
1313                  unsigned int clamp_value)
1314{
1315        /*
1316         * Avoid blocked utilization pushing up the frequency when we go
1317         * idle (which drops the max-clamp) by retaining the last known
1318         * max-clamp.
1319         */
1320        if (clamp_id == UCLAMP_MAX) {
1321                rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
1322                return clamp_value;
1323        }
1324
1325        return uclamp_none(UCLAMP_MIN);
1326}
1327
1328static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
1329                                     unsigned int clamp_value)
1330{
1331        /* Reset max-clamp retention only on idle exit */
1332        if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1333                return;
1334
1335        WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
1336}
1337
1338static inline
1339unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
1340                                   unsigned int clamp_value)
1341{
1342        struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
1343        int bucket_id = UCLAMP_BUCKETS - 1;
1344
1345        /*
1346         * Since both min and max clamps are max aggregated, find the
1347         * top most bucket with tasks in.
1348         */
1349        for ( ; bucket_id >= 0; bucket_id--) {
1350                if (!bucket[bucket_id].tasks)
1351                        continue;
1352                return bucket[bucket_id].value;
1353        }
1354
1355        /* No tasks -- default clamp values */
1356        return uclamp_idle_value(rq, clamp_id, clamp_value);
1357}
1358
1359static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1360{
1361        unsigned int default_util_min;
1362        struct uclamp_se *uc_se;
1363
1364        lockdep_assert_held(&p->pi_lock);
1365
1366        uc_se = &p->uclamp_req[UCLAMP_MIN];
1367
1368        /* Only sync if user didn't override the default */
1369        if (uc_se->user_defined)
1370                return;
1371
1372        default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1373        uclamp_se_set(uc_se, default_util_min, false);
1374}
1375
1376static void uclamp_update_util_min_rt_default(struct task_struct *p)
1377{
1378        struct rq_flags rf;
1379        struct rq *rq;
1380
1381        if (!rt_task(p))
1382                return;
1383
1384        /* Protect updates to p->uclamp_* */
1385        rq = task_rq_lock(p, &rf);
1386        __uclamp_update_util_min_rt_default(p);
1387        task_rq_unlock(rq, p, &rf);
1388}
1389
1390static void uclamp_sync_util_min_rt_default(void)
1391{
1392        struct task_struct *g, *p;
1393
1394        /*
1395         * copy_process()                       sysctl_uclamp
1396         *                                        uclamp_min_rt = X;
1397         *   write_lock(&tasklist_lock)           read_lock(&tasklist_lock)
1398         *   // link thread                       smp_mb__after_spinlock()
1399         *   write_unlock(&tasklist_lock)         read_unlock(&tasklist_lock);
1400         *   sched_post_fork()                    for_each_process_thread()
1401         *     __uclamp_sync_rt()                   __uclamp_sync_rt()
1402         *
1403         * Ensures that either sched_post_fork() will observe the new
1404         * uclamp_min_rt or for_each_process_thread() will observe the new
1405         * task.
1406         */
1407        read_lock(&tasklist_lock);
1408        smp_mb__after_spinlock();
1409        read_unlock(&tasklist_lock);
1410
1411        rcu_read_lock();
1412        for_each_process_thread(g, p)
1413                uclamp_update_util_min_rt_default(p);
1414        rcu_read_unlock();
1415}
1416
1417static inline struct uclamp_se
1418uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
1419{
1420        /* Copy by value as we could modify it */
1421        struct uclamp_se uc_req = p->uclamp_req[clamp_id];
1422#ifdef CONFIG_UCLAMP_TASK_GROUP
1423        unsigned int tg_min, tg_max, value;
1424
1425        /*
1426         * Tasks in autogroups or root task group will be
1427         * restricted by system defaults.
1428         */
1429        if (task_group_is_autogroup(task_group(p)))
1430                return uc_req;
1431        if (task_group(p) == &root_task_group)
1432                return uc_req;
1433
1434        tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
1435        tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
1436        value = uc_req.value;
1437        value = clamp(value, tg_min, tg_max);
1438        uclamp_se_set(&uc_req, value, false);
1439#endif
1440
1441        return uc_req;
1442}
1443
1444/*
1445 * The effective clamp bucket index of a task depends on, by increasing
1446 * priority:
1447 * - the task specific clamp value, when explicitly requested from userspace
 1448 * - the task group effective clamp value, for tasks neither in the root
 1449 *   group nor in an autogroup
1450 * - the system default clamp value, defined by the sysadmin
1451 */
1452static inline struct uclamp_se
1453uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
1454{
1455        struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
1456        struct uclamp_se uc_max = uclamp_default[clamp_id];
1457
1458        /* System default restrictions always apply */
1459        if (unlikely(uc_req.value > uc_max.value))
1460                return uc_max;
1461
1462        return uc_req;
1463}
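
/*
 * Worked example of the resolution implemented by uclamp_tg_restrict() and
 * uclamp_eff_get() (all numbers invented): a task requests UCLAMP_MIN = 600,
 * its task group allows [uclamp.min, uclamp.max] = [0, 512], and the system
 * default cap (uclamp_default[UCLAMP_MIN], from sysctl_sched_uclamp_util_min)
 * is 1024. The request is first clamped into the group range -> 512; 512 does
 * not exceed the system cap, so the effective UCLAMP_MIN is 512. Had the
 * sysadmin lowered sysctl_sched_uclamp_util_min to 400, the same task would
 * get an effective value of 400.
 */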
1464
1465unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
1466{
1467        struct uclamp_se uc_eff;
1468
1469        /* Task currently refcounted: use back-annotated (effective) value */
1470        if (p->uclamp[clamp_id].active)
1471                return (unsigned long)p->uclamp[clamp_id].value;
1472
1473        uc_eff = uclamp_eff_get(p, clamp_id);
1474
1475        return (unsigned long)uc_eff.value;
1476}
1477
1478/*
1479 * When a task is enqueued on a rq, the clamp bucket currently defined by the
1480 * task's uclamp::bucket_id is refcounted on that rq. This also immediately
1481 * updates the rq's clamp value if required.
1482 *
 1483 * Tasks can have a task-specific value requested from user-space; each bucket
 1484 * tracks the maximum such value of the tasks refcounted in it.
 1485 * This "local max aggregation" allows tracking the exact "requested" value
1486 * for each bucket when all its RUNNABLE tasks require the same clamp.
1487 */
1488static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
1489                                    enum uclamp_id clamp_id)
1490{
1491        struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1492        struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1493        struct uclamp_bucket *bucket;
1494
1495        lockdep_assert_rq_held(rq);
1496
1497        /* Update task effective clamp */
1498        p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
1499
1500        bucket = &uc_rq->bucket[uc_se->bucket_id];
1501        bucket->tasks++;
1502        uc_se->active = true;
1503
1504        uclamp_idle_reset(rq, clamp_id, uc_se->value);
1505
1506        /*
1507         * Local max aggregation: rq buckets always track the max
1508         * "requested" clamp value of its RUNNABLE tasks.
1509         */
1510        if (bucket->tasks == 1 || uc_se->value > bucket->value)
1511                bucket->value = uc_se->value;
1512
1513        if (uc_se->value > READ_ONCE(uc_rq->value))
1514                WRITE_ONCE(uc_rq->value, uc_se->value);
1515}
1516
1517/*
1518 * When a task is dequeued from a rq, the clamp bucket refcounted by the task
1519 * is released. If this is the last task reference counting the rq's max
1520 * active clamp value, then the rq's clamp value is updated.
1521 *
1522 * Both refcounted tasks and rq's cached clamp values are expected to be
1523 * always valid. If it's detected they are not, as defensive programming,
1524 * enforce the expected state and warn.
1525 */
1526static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
1527                                    enum uclamp_id clamp_id)
1528{
1529        struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1530        struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1531        struct uclamp_bucket *bucket;
1532        unsigned int bkt_clamp;
1533        unsigned int rq_clamp;
1534
1535        lockdep_assert_rq_held(rq);
1536
1537        /*
1538         * If sched_uclamp_used was enabled after task @p was enqueued,
1539         * we could end up with unbalanced call to uclamp_rq_dec_id().
1540         *
1541         * In this case the uc_se->active flag should be false since no uclamp
1542         * accounting was performed at enqueue time and we can just return
1543         * here.
1544         *
1545         * Need to be careful of the following enqueue/dequeue ordering
1546         * problem too
1547         *
1548         *      enqueue(taskA)
1549         *      // sched_uclamp_used gets enabled
1550         *      enqueue(taskB)
1551         *      dequeue(taskA)
1552         *      // Must not decrement bucket->tasks here
1553         *      dequeue(taskB)
1554         *
1555         * where we could end up with stale data in uc_se and
1556         * bucket[uc_se->bucket_id].
1557         *
1558         * The following check here eliminates the possibility of such race.
1559         */
1560        if (unlikely(!uc_se->active))
1561                return;
1562
1563        bucket = &uc_rq->bucket[uc_se->bucket_id];
1564
1565        SCHED_WARN_ON(!bucket->tasks);
1566        if (likely(bucket->tasks))
1567                bucket->tasks--;
1568
1569        uc_se->active = false;
1570
1571        /*
1572         * Keep "local max aggregation" simple and accept to (possibly)
1573         * overboost some RUNNABLE tasks in the same bucket.
1574         * The rq clamp bucket value is reset to its base value whenever
1575         * there are no more RUNNABLE tasks refcounting it.
1576         */
1577        if (likely(bucket->tasks))
1578                return;
1579
1580        rq_clamp = READ_ONCE(uc_rq->value);
1581        /*
1582         * Defensive programming: this should never happen. If it happens,
1583         * e.g. due to future modification, warn and fixup the expected value.
1584         */
1585        SCHED_WARN_ON(bucket->value > rq_clamp);
1586        if (bucket->value >= rq_clamp) {
1587                bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
1588                WRITE_ONCE(uc_rq->value, bkt_clamp);
1589        }
1590}
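
/*
 * Worked example of the bucket bookkeeping above (invented values, default
 * 5 buckets of width 205): three RUNNABLE tasks with UCLAMP_MIN requests of
 * 100, 180 and 700 give
 *
 *	bucket 0: tasks = 2, value = 180	(local max of 100 and 180)
 *	bucket 3: tasks = 1, value = 700
 *	rq->uclamp[UCLAMP_MIN].value = 700
 *
 * Dequeueing the 700 task empties bucket 3, so uclamp_rq_max_value() walks
 * down from the top and the rq clamp drops to 180. Dequeueing the 180 task
 * leaves bucket 0 non-empty, so its (now over-boosted) value of 180 is kept
 * until that bucket empties as well; this is the trade-off described above.
 */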
1591
1592static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
1593{
1594        enum uclamp_id clamp_id;
1595
1596        /*
1597         * Avoid any overhead until uclamp is actually used by the userspace.
1598         *
1599         * The condition is constructed such that a NOP is generated when
1600         * sched_uclamp_used is disabled.
1601         */
1602        if (!static_branch_unlikely(&sched_uclamp_used))
1603                return;
1604
1605        if (unlikely(!p->sched_class->uclamp_enabled))
1606                return;
1607
1608        for_each_clamp_id(clamp_id)
1609                uclamp_rq_inc_id(rq, p, clamp_id);
1610
1611        /* Reset clamp idle holding when there is one RUNNABLE task */
1612        if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
1613                rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1614}
1615
1616static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
1617{
1618        enum uclamp_id clamp_id;
1619
1620        /*
1621         * Avoid any overhead until uclamp is actually used by the userspace.
1622         *
1623         * The condition is constructed such that a NOP is generated when
1624         * sched_uclamp_used is disabled.
1625         */
1626        if (!static_branch_unlikely(&sched_uclamp_used))
1627                return;
1628
1629        if (unlikely(!p->sched_class->uclamp_enabled))
1630                return;
1631
1632        for_each_clamp_id(clamp_id)
1633                uclamp_rq_dec_id(rq, p, clamp_id);
1634}
1635
1636static inline void
1637uclamp_update_active(struct task_struct *p)
1638{
1639        enum uclamp_id clamp_id;
1640        struct rq_flags rf;
1641        struct rq *rq;
1642
1643        /*
1644         * Lock the task and the rq where the task is (or was) queued.
1645         *
1646         * We might lock the (previous) rq of a !RUNNABLE task, but that's the
1647         * price to pay to safely serialize util_{min,max} updates with
1648         * enqueues, dequeues and migration operations.
1649         * This is the same locking schema used by __set_cpus_allowed_ptr().
1650         */
1651        rq = task_rq_lock(p, &rf);
1652
1653        /*
1654         * Setting the clamp bucket is serialized by task_rq_lock().
1655         * If the task is not yet RUNNABLE and its task_struct is not
1656         * affecting a valid clamp bucket, the next time it's enqueued,
1657         * it will already see the updated clamp bucket value.
1658         */
1659        for_each_clamp_id(clamp_id) {
1660                if (p->uclamp[clamp_id].active) {
1661                        uclamp_rq_dec_id(rq, p, clamp_id);
1662                        uclamp_rq_inc_id(rq, p, clamp_id);
1663                }
1664        }
1665
1666        task_rq_unlock(rq, p, &rf);
1667}
1668
1669#ifdef CONFIG_UCLAMP_TASK_GROUP
1670static inline void
1671uclamp_update_active_tasks(struct cgroup_subsys_state *css)
1672{
1673        struct css_task_iter it;
1674        struct task_struct *p;
1675
1676        css_task_iter_start(css, 0, &it);
1677        while ((p = css_task_iter_next(&it)))
1678                uclamp_update_active(p);
1679        css_task_iter_end(&it);
1680}
1681
1682static void cpu_util_update_eff(struct cgroup_subsys_state *css);
1683static void uclamp_update_root_tg(void)
1684{
1685        struct task_group *tg = &root_task_group;
1686
1687        uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
1688                      sysctl_sched_uclamp_util_min, false);
1689        uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
1690                      sysctl_sched_uclamp_util_max, false);
1691
1692        rcu_read_lock();
1693        cpu_util_update_eff(&root_task_group.css);
1694        rcu_read_unlock();
1695}
1696#else
1697static void uclamp_update_root_tg(void) { }
1698#endif
1699
1700int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1701                                void *buffer, size_t *lenp, loff_t *ppos)
1702{
1703        bool update_root_tg = false;
1704        int old_min, old_max, old_min_rt;
1705        int result;
1706
1707        mutex_lock(&uclamp_mutex);
1708        old_min = sysctl_sched_uclamp_util_min;
1709        old_max = sysctl_sched_uclamp_util_max;
1710        old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
1711
1712        result = proc_dointvec(table, write, buffer, lenp, ppos);
1713        if (result)
1714                goto undo;
1715        if (!write)
1716                goto done;
1717
1718        if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1719            sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1720            sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1721
1722                result = -EINVAL;
1723                goto undo;
1724        }
1725
1726        if (old_min != sysctl_sched_uclamp_util_min) {
1727                uclamp_se_set(&uclamp_default[UCLAMP_MIN],
1728                              sysctl_sched_uclamp_util_min, false);
1729                update_root_tg = true;
1730        }
1731        if (old_max != sysctl_sched_uclamp_util_max) {
1732                uclamp_se_set(&uclamp_default[UCLAMP_MAX],
1733                              sysctl_sched_uclamp_util_max, false);
1734                update_root_tg = true;
1735        }
1736
1737        if (update_root_tg) {
1738                static_branch_enable(&sched_uclamp_used);
1739                uclamp_update_root_tg();
1740        }
1741
1742        if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1743                static_branch_enable(&sched_uclamp_used);
1744                uclamp_sync_util_min_rt_default();
1745        }
1746
1747        /*
1748         * We update all RUNNABLE tasks only when task groups are in use.
1749         * Otherwise, keep it simple and do just a lazy update at each next
1750         * task enqueue time.
1751         */
1752
1753        goto done;
1754
1755undo:
1756        sysctl_sched_uclamp_util_min = old_min;
1757        sysctl_sched_uclamp_util_max = old_max;
1758        sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
1759done:
1760        mutex_unlock(&uclamp_mutex);
1761
1762        return result;
1763}
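
/*
 * Userspace-facing note (editorial sketch, not part of this file): the
 * handler above is reached by writing the corresponding sysctls, e.g.:
 *
 *	int fd = open("/proc/sys/kernel/sched_util_clamp_min", O_WRONLY);
 *
 *	if (fd >= 0) {
 *		write(fd, "512", 3);	// cap default boosting at ~50% capacity
 *		close(fd);
 *	}
 *
 * An invalid write (min > max, or a value above SCHED_CAPACITY_SCALE) takes
 * the "undo" path above and the previous values are restored.
 */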
1764
1765static int uclamp_validate(struct task_struct *p,
1766                           const struct sched_attr *attr)
1767{
1768        int util_min = p->uclamp_req[UCLAMP_MIN].value;
1769        int util_max = p->uclamp_req[UCLAMP_MAX].value;
1770
1771        if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1772                util_min = attr->sched_util_min;
1773
1774                if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
1775                        return -EINVAL;
1776        }
1777
1778        if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1779                util_max = attr->sched_util_max;
1780
1781                if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
1782                        return -EINVAL;
1783        }
1784
1785        if (util_min != -1 && util_max != -1 && util_min > util_max)
1786                return -EINVAL;
1787
1788        /*
1789         * We have valid uclamp attributes; make sure uclamp is enabled.
1790         *
1791         * We need to do that here, because enabling static branches is a
1792         * blocking operation which obviously cannot be done while holding
1793         * scheduler locks.
1794         */
1795        static_branch_enable(&sched_uclamp_used);
1796
1797        return 0;
1798}
1799
1800static bool uclamp_reset(const struct sched_attr *attr,
1801                         enum uclamp_id clamp_id,
1802                         struct uclamp_se *uc_se)
1803{
1804        /* Reset on sched class change for a non user-defined clamp value. */
1805        if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
1806            !uc_se->user_defined)
1807                return true;
1808
1809        /* Reset on sched_util_{min,max} == -1. */
1810        if (clamp_id == UCLAMP_MIN &&
1811            attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1812            attr->sched_util_min == -1) {
1813                return true;
1814        }
1815
1816        if (clamp_id == UCLAMP_MAX &&
1817            attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1818            attr->sched_util_max == -1) {
1819                return true;
1820        }
1821
1822        return false;
1823}
1824
1825static void __setscheduler_uclamp(struct task_struct *p,
1826                                  const struct sched_attr *attr)
1827{
1828        enum uclamp_id clamp_id;
1829
1830        for_each_clamp_id(clamp_id) {
1831                struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1832                unsigned int value;
1833
1834                if (!uclamp_reset(attr, clamp_id, uc_se))
1835                        continue;
1836
1837                /*
1838                 * RT tasks by default have a 100% boost value that can be modified
1839                 * at runtime.
1840                 */
1841                if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1842                        value = sysctl_sched_uclamp_util_min_rt_default;
1843                else
1844                        value = uclamp_none(clamp_id);
1845
1846                uclamp_se_set(uc_se, value, false);
1847
1848        }
1849
1850        if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
1851                return;
1852
1853        if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1854            attr->sched_util_min != -1) {
1855                uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
1856                              attr->sched_util_min, true);
1857        }
1858
1859        if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1860            attr->sched_util_max != -1) {
1861                uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
1862                              attr->sched_util_max, true);
1863        }
1864}
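
/*
 * Per-task clamp requests reach __setscheduler_uclamp() via sched_setattr(2).
 * Minimal userspace sketch (editorial; assumes struct sched_attr as defined
 * by the UAPI headers, values purely illustrative):
 *
 *	struct sched_attr attr = {
 *		.size		= sizeof(attr),
 *		.sched_flags	= SCHED_FLAG_KEEP_ALL |
 *				  SCHED_FLAG_UTIL_CLAMP_MIN |
 *				  SCHED_FLAG_UTIL_CLAMP_MAX,
 *		.sched_util_min	= 256,	// request at least ~25% of capacity
 *		.sched_util_max	= 768,	// never appear bigger than ~75%
 *	};
 *
 *	if (syscall(SYS_sched_setattr, 0, &attr, 0))	// 0 == current thread
 *		perror("sched_setattr");
 *
 * uclamp_validate() above rejects util_min > util_max and values above
 * SCHED_CAPACITY_SCALE; -1 means "reset to default" and is handled by
 * uclamp_reset().
 */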
1865
1866static void uclamp_fork(struct task_struct *p)
1867{
1868        enum uclamp_id clamp_id;
1869
1870        /*
1871         * We don't need to hold task_rq_lock() when updating p->uclamp_* here
1872         * as the task is still at its early fork stages.
1873         */
1874        for_each_clamp_id(clamp_id)
1875                p->uclamp[clamp_id].active = false;
1876
1877        if (likely(!p->sched_reset_on_fork))
1878                return;
1879
1880        for_each_clamp_id(clamp_id) {
1881                uclamp_se_set(&p->uclamp_req[clamp_id],
1882                              uclamp_none(clamp_id), false);
1883        }
1884}
1885
1886static void uclamp_post_fork(struct task_struct *p)
1887{
1888        uclamp_update_util_min_rt_default(p);
1889}
1890
1891static void __init init_uclamp_rq(struct rq *rq)
1892{
1893        enum uclamp_id clamp_id;
1894        struct uclamp_rq *uc_rq = rq->uclamp;
1895
1896        for_each_clamp_id(clamp_id) {
1897                uc_rq[clamp_id] = (struct uclamp_rq) {
1898                        .value = uclamp_none(clamp_id)
1899                };
1900        }
1901
1902        rq->uclamp_flags = 0;
1903}
1904
1905static void __init init_uclamp(void)
1906{
1907        struct uclamp_se uc_max = {};
1908        enum uclamp_id clamp_id;
1909        int cpu;
1910
1911        for_each_possible_cpu(cpu)
1912                init_uclamp_rq(cpu_rq(cpu));
1913
1914        for_each_clamp_id(clamp_id) {
1915                uclamp_se_set(&init_task.uclamp_req[clamp_id],
1916                              uclamp_none(clamp_id), false);
1917        }
1918
1919        /* System defaults allow max clamp values for both indexes */
1920        uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
1921        for_each_clamp_id(clamp_id) {
1922                uclamp_default[clamp_id] = uc_max;
1923#ifdef CONFIG_UCLAMP_TASK_GROUP
1924                root_task_group.uclamp_req[clamp_id] = uc_max;
1925                root_task_group.uclamp[clamp_id] = uc_max;
1926#endif
1927        }
1928}
1929
1930#else /* CONFIG_UCLAMP_TASK */
1931static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
1932static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
1933static inline int uclamp_validate(struct task_struct *p,
1934                                  const struct sched_attr *attr)
1935{
1936        return -EOPNOTSUPP;
1937}
1938static void __setscheduler_uclamp(struct task_struct *p,
1939                                  const struct sched_attr *attr) { }
1940static inline void uclamp_fork(struct task_struct *p) { }
1941static inline void uclamp_post_fork(struct task_struct *p) { }
1942static inline void init_uclamp(void) { }
1943#endif /* CONFIG_UCLAMP_TASK */
1944
1945bool sched_task_on_rq(struct task_struct *p)
1946{
1947        return task_on_rq_queued(p);
1948}
1949
1950static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1951{
1952        if (!(flags & ENQUEUE_NOCLOCK))
1953                update_rq_clock(rq);
1954
1955        if (!(flags & ENQUEUE_RESTORE)) {
1956                sched_info_enqueue(rq, p);
1957                psi_enqueue(p, flags & ENQUEUE_WAKEUP);
1958        }
1959
1960        uclamp_rq_inc(rq, p);
1961        p->sched_class->enqueue_task(rq, p, flags);
1962
1963        if (sched_core_enabled(rq))
1964                sched_core_enqueue(rq, p);
1965}
1966
1967static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1968{
1969        if (sched_core_enabled(rq))
1970                sched_core_dequeue(rq, p);
1971
1972        if (!(flags & DEQUEUE_NOCLOCK))
1973                update_rq_clock(rq);
1974
1975        if (!(flags & DEQUEUE_SAVE)) {
1976                sched_info_dequeue(rq, p);
1977                psi_dequeue(p, flags & DEQUEUE_SLEEP);
1978        }
1979
1980        uclamp_rq_dec(rq, p);
1981        p->sched_class->dequeue_task(rq, p, flags);
1982}
1983
1984void activate_task(struct rq *rq, struct task_struct *p, int flags)
1985{
1986        enqueue_task(rq, p, flags);
1987
1988        p->on_rq = TASK_ON_RQ_QUEUED;
1989}
1990
1991void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1992{
1993        p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
1994
1995        dequeue_task(rq, p, flags);
1996}
1997
1998static inline int __normal_prio(int policy, int rt_prio, int nice)
1999{
2000        int prio;
2001
2002        if (dl_policy(policy))
2003                prio = MAX_DL_PRIO - 1;
2004        else if (rt_policy(policy))
2005                prio = MAX_RT_PRIO - 1 - rt_prio;
2006        else
2007                prio = NICE_TO_PRIO(nice);
2008
2009        return prio;
2010}
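
/*
 * Worked example of the mapping above (MAX_DL_PRIO == 0, MAX_RT_PRIO == 100,
 * NICE_TO_PRIO(n) == 120 + n); added for illustration:
 *
 *	SCHED_DEADLINE					-> prio  -1
 *	SCHED_FIFO/SCHED_RR, rt_priority 99 (highest)	-> prio   0
 *	SCHED_FIFO/SCHED_RR, rt_priority  1 (lowest)	-> prio  98
 *	SCHED_NORMAL, nice -20				-> prio 100
 *	SCHED_NORMAL, nice   0				-> prio 120
 *	SCHED_NORMAL, nice  19				-> prio 139
 *
 * i.e. a lower kernel prio is more important, and the RT range [0, 99] sits
 * entirely above the fair/nice range [100, 139].
 */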
2011
2012/*
2013 * Calculate the expected normal priority: i.e. priority
2014 * without taking RT-inheritance into account. Might be
2015 * boosted by interactivity modifiers. Changes upon fork,
2016 * setprio syscalls, and whenever the interactivity
2017 * estimator recalculates.
2018 */
2019static inline int normal_prio(struct task_struct *p)
2020{
2021        return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
2022}
2023
2024/*
2025 * Calculate the current priority, i.e. the priority
2026 * taken into account by the scheduler. This value might
2027 * be boosted by RT tasks, or might be boosted by
2028 * interactivity modifiers. Will be RT if the task got
2029 * RT-boosted. If not then it returns p->normal_prio.
2030 */
2031static int effective_prio(struct task_struct *p)
2032{
2033        p->normal_prio = normal_prio(p);
2034        /*
2035         * If we are RT tasks or we were boosted to RT priority,
2036         * keep the priority unchanged. Otherwise, update priority
2037         * to the normal priority:
2038         */
2039        if (!rt_prio(p->prio))
2040                return p->normal_prio;
2041        return p->prio;
2042}
2043
2044/**
2045 * task_curr - is this task currently executing on a CPU?
2046 * @p: the task in question.
2047 *
2048 * Return: 1 if the task is currently executing. 0 otherwise.
2049 */
2050inline int task_curr(const struct task_struct *p)
2051{
2052        return cpu_curr(task_cpu(p)) == p;
2053}
2054
2055/*
2056 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
2057 * use the balance_callback list if you want balancing.
2058 *
2059 * this means any call to check_class_changed() must be followed by a call to
2060 * balance_callback().
2061 */
2062static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2063                                       const struct sched_class *prev_class,
2064                                       int oldprio)
2065{
2066        if (prev_class != p->sched_class) {
2067                if (prev_class->switched_from)
2068                        prev_class->switched_from(rq, p);
2069
2070                p->sched_class->switched_to(rq, p);
2071        } else if (oldprio != p->prio || dl_task(p))
2072                p->sched_class->prio_changed(rq, p, oldprio);
2073}
2074
2075void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2076{
2077        if (p->sched_class == rq->curr->sched_class)
2078                rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2079        else if (p->sched_class > rq->curr->sched_class)
2080                resched_curr(rq);
2081
2082        /*
2083         * A queue event has occurred, and we're going to schedule.  In
2084         * this case, we can save a useless back to back clock update.
2085         */
2086        if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
2087                rq_clock_skip_update(rq);
2088}
2089
2090#ifdef CONFIG_SMP
2091
2092static void
2093__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
2094
2095static int __set_cpus_allowed_ptr(struct task_struct *p,
2096                                  const struct cpumask *new_mask,
2097                                  u32 flags);
2098
2099static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
2100{
2101        if (likely(!p->migration_disabled))
2102                return;
2103
2104        if (p->cpus_ptr != &p->cpus_mask)
2105                return;
2106
2107        /*
2108         * Violates locking rules! see comment in __do_set_cpus_allowed().
2109         */
2110        __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
2111}
2112
2113void migrate_disable(void)
2114{
2115        struct task_struct *p = current;
2116
2117        if (p->migration_disabled) {
2118                p->migration_disabled++;
2119                return;
2120        }
2121
2122        preempt_disable();
2123        this_rq()->nr_pinned++;
2124        p->migration_disabled = 1;
2125        preempt_enable();
2126}
2127EXPORT_SYMBOL_GPL(migrate_disable);
2128
2129void migrate_enable(void)
2130{
2131        struct task_struct *p = current;
2132
2133        if (p->migration_disabled > 1) {
2134                p->migration_disabled--;
2135                return;
2136        }
2137
2138        /*
2139         * Ensure stop_task runs either before or after this, and that
2140         * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
2141         */
2142        preempt_disable();
2143        if (p->cpus_ptr != &p->cpus_mask)
2144                __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
2145        /*
2146         * Mustn't clear migration_disabled() until cpus_ptr points back at the
2147         * regular cpus_mask, otherwise things that race (eg.
2148         * select_fallback_rq) get confused.
2149         */
2150        barrier();
2151        p->migration_disabled = 0;
2152        this_rq()->nr_pinned--;
2153        preempt_enable();
2154}
2155EXPORT_SYMBOL_GPL(migrate_enable);
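
/*
 * Usage sketch (hypothetical caller, not part of this file): a preemptible
 * section that must stay on the current CPU, e.g. on PREEMPT_RT where
 * preempt_disable() is to be avoided:
 *
 *	migrate_disable();
 *	data = this_cpu_ptr(&my_percpu_data);	// hypothetical per-CPU object
 *	...					// may be preempted, but will not
 *						// be migrated off this CPU
 *	migrate_enable();
 *
 * Other tasks can still run on this CPU in between, so per-CPU data usually
 * still needs a local_lock or similar for mutual exclusion.
 */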
2156
2157static inline bool rq_has_pinned_tasks(struct rq *rq)
2158{
2159        return rq->nr_pinned;
2160}
2161
2162/*
2163 * Per-CPU kthreads are allowed to run on !active && online CPUs, see
2164 * __set_cpus_allowed_ptr() and select_fallback_rq().
2165 */
2166static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
2167{
2168        /* When not in the task's cpumask, no point in looking further. */
2169        if (!cpumask_test_cpu(cpu, p->cpus_ptr))
2170                return false;
2171
2172        /* migrate_disabled() must be allowed to finish. */
2173        if (is_migration_disabled(p))
2174                return cpu_online(cpu);
2175
2176        /* Non-kernel threads are not allowed during either the online or the offline transition. */
2177        if (!(p->flags & PF_KTHREAD))
2178                return cpu_active(cpu);
2179
2180        /* KTHREAD_IS_PER_CPU is always allowed. */
2181        if (kthread_is_per_cpu(p))
2182                return cpu_online(cpu);
2183
2184        /* Regular kernel threads don't get to stay during offline. */
2185        if (cpu_dying(cpu))
2186                return false;
2187
2188        /* But are allowed during online. */
2189        return cpu_online(cpu);
2190}
2191
2192/*
2193 * This is how migration works:
2194 *
2195 * 1) we invoke migration_cpu_stop() on the target CPU using
2196 *    stop_one_cpu().
2197 * 2) stopper starts to run (implicitly forcing the migrated thread
2198 *    off the CPU)
2199 * 3) it checks whether the migrated task is still in the wrong runqueue.
2200 * 4) if it's in the wrong runqueue then the migration thread removes
2201 *    it and puts it into the right queue.
2202 * 5) stopper completes and stop_one_cpu() returns and the migration
2203 *    is done.
2204 */
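
/*
 * A synchronous example of this scheme is sched_exec() further down in this
 * file, which (simplified) does:
 *
 *	struct migration_arg arg = { .task = p, .dest_cpu = dest_cpu };
 *
 *	stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
 *
 * i.e. run migration_cpu_stop() on the task's current CPU and wait for the
 * stopper to have moved the task (or decided it doesn't need to).
 */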
2205
2206/*
2207 * move_queued_task - move a queued task to new rq.
2208 *
2209 * Returns (locked) new rq. Old rq's lock is released.
2210 */
2211static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
2212                                   struct task_struct *p, int new_cpu)
2213{
2214        lockdep_assert_rq_held(rq);
2215
2216        deactivate_task(rq, p, DEQUEUE_NOCLOCK);
2217        set_task_cpu(p, new_cpu);
2218        rq_unlock(rq, rf);
2219
2220        rq = cpu_rq(new_cpu);
2221
2222        rq_lock(rq, rf);
2223        BUG_ON(task_cpu(p) != new_cpu);
2224        activate_task(rq, p, 0);
2225        check_preempt_curr(rq, p, 0);
2226
2227        return rq;
2228}
2229
2230struct migration_arg {
2231        struct task_struct              *task;
2232        int                             dest_cpu;
2233        struct set_affinity_pending     *pending;
2234};
2235
2236/*
2237 * @refs: number of wait_for_completion() waiters
2238 * @stop_pending: is @stop_work in use
2239 */
2240struct set_affinity_pending {
2241        refcount_t              refs;
2242        unsigned int            stop_pending;
2243        struct completion       done;
2244        struct cpu_stop_work    stop_work;
2245        struct migration_arg    arg;
2246};
2247
2248/*
2249 * Move (not current) task off this CPU, onto the destination CPU. We're doing
2250 * this because either it can't run here any more (set_cpus_allowed()
2251 * away from this CPU, or CPU going down), or because we're
2252 * attempting to rebalance this task on exec (sched_exec).
2253 *
2254 * So we race with normal scheduler movements, but that's OK, as long
2255 * as the task is no longer on this CPU.
2256 */
2257static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
2258                                 struct task_struct *p, int dest_cpu)
2259{
2260        /* Affinity changed (again). */
2261        if (!is_cpu_allowed(p, dest_cpu))
2262                return rq;
2263
2264        update_rq_clock(rq);
2265        rq = move_queued_task(rq, rf, p, dest_cpu);
2266
2267        return rq;
2268}
2269
2270/*
2271 * migration_cpu_stop - this will be executed by a highprio stopper thread
2272 * and performs thread migration by bumping thread off CPU then
2273 * 'pushing' onto another runqueue.
2274 */
2275static int migration_cpu_stop(void *data)
2276{
2277        struct migration_arg *arg = data;
2278        struct set_affinity_pending *pending = arg->pending;
2279        struct task_struct *p = arg->task;
2280        struct rq *rq = this_rq();
2281        bool complete = false;
2282        struct rq_flags rf;
2283
2284        /*
2285         * The original target CPU might have gone down and we might
2286         * be on another CPU but it doesn't matter.
2287         */
2288        local_irq_save(rf.flags);
2289        /*
2290         * We need to explicitly wake pending tasks before running
2291         * __migrate_task() such that we will not miss enforcing cpus_ptr
2292         * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
2293         */
2294        flush_smp_call_function_from_idle();
2295
2296        raw_spin_lock(&p->pi_lock);
2297        rq_lock(rq, &rf);
2298
2299        /*
2300         * If we were passed a pending, then ->stop_pending was set, thus
2301         * p->migration_pending must have remained stable.
2302         */
2303        WARN_ON_ONCE(pending && pending != p->migration_pending);
2304
2305        /*
2306         * If task_rq(p) != rq, it cannot be migrated here, because we're
2307         * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
2308         * we're holding p->pi_lock.
2309         */
2310        if (task_rq(p) == rq) {
2311                if (is_migration_disabled(p))
2312                        goto out;
2313
2314                if (pending) {
2315                        p->migration_pending = NULL;
2316                        complete = true;
2317
2318                        if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
2319                                goto out;
2320                }
2321
2322                if (task_on_rq_queued(p))
2323                        rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
2324                else
2325                        p->wake_cpu = arg->dest_cpu;
2326
2327                /*
2328                 * XXX __migrate_task() can fail, at which point we might end
2329                 * up running on a dodgy CPU, AFAICT this can only happen
2330                 * during CPU hotplug, at which point we'll get pushed out
2331                 * anyway, so it's probably not a big deal.
2332                 */
2333
2334        } else if (pending) {
2335                /*
2336                 * This happens when we get migrated between migrate_enable()'s
2337                 * preempt_enable() and scheduling the stopper task. At that
2338                 * point we're a regular task again and not current anymore.
2339                 *
2340                 * A !PREEMPT kernel has a giant hole here, which makes it far
2341                 * more likely.
2342                 */
2343
2344                /*
2345                 * The task moved before the stopper got to run. We're holding
2346                 * ->pi_lock, so the allowed mask is stable - if it got
2347                 * somewhere allowed, we're done.
2348                 */
2349                if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
2350                        p->migration_pending = NULL;
2351                        complete = true;
2352                        goto out;
2353                }
2354
2355                /*
2356                 * When migrate_enable() hits a rq mis-match we can't reliably
2357                 * determine is_migration_disabled() and so have to chase after
2358                 * it.
2359                 */
2360                WARN_ON_ONCE(!pending->stop_pending);
2361                task_rq_unlock(rq, p, &rf);
2362                stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
2363                                    &pending->arg, &pending->stop_work);
2364                return 0;
2365        }
2366out:
2367        if (pending)
2368                pending->stop_pending = false;
2369        task_rq_unlock(rq, p, &rf);
2370
2371        if (complete)
2372                complete_all(&pending->done);
2373
2374        return 0;
2375}
2376
2377int push_cpu_stop(void *arg)
2378{
2379        struct rq *lowest_rq = NULL, *rq = this_rq();
2380        struct task_struct *p = arg;
2381
2382        raw_spin_lock_irq(&p->pi_lock);
2383        raw_spin_rq_lock(rq);
2384
2385        if (task_rq(p) != rq)
2386                goto out_unlock;
2387
2388        if (is_migration_disabled(p)) {
2389                p->migration_flags |= MDF_PUSH;
2390                goto out_unlock;
2391        }
2392
2393        p->migration_flags &= ~MDF_PUSH;
2394
2395        if (p->sched_class->find_lock_rq)
2396                lowest_rq = p->sched_class->find_lock_rq(p, rq);
2397
2398        if (!lowest_rq)
2399                goto out_unlock;
2400
2401        // XXX validate p is still the highest prio task
2402        if (task_rq(p) == rq) {
2403                deactivate_task(rq, p, 0);
2404                set_task_cpu(p, lowest_rq->cpu);
2405                activate_task(lowest_rq, p, 0);
2406                resched_curr(lowest_rq);
2407        }
2408
2409        double_unlock_balance(rq, lowest_rq);
2410
2411out_unlock:
2412        rq->push_busy = false;
2413        raw_spin_rq_unlock(rq);
2414        raw_spin_unlock_irq(&p->pi_lock);
2415
2416        put_task_struct(p);
2417        return 0;
2418}
2419
2420/*
2421 * sched_class::set_cpus_allowed must do the below, but is not required to
2422 * actually call this function.
2423 */
2424void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
2425{
2426        if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
2427                p->cpus_ptr = new_mask;
2428                return;
2429        }
2430
2431        cpumask_copy(&p->cpus_mask, new_mask);
2432        p->nr_cpus_allowed = cpumask_weight(new_mask);
2433}
2434
2435static void
2436__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
2437{
2438        struct rq *rq = task_rq(p);
2439        bool queued, running;
2440
2441        /*
2442         * This here violates the locking rules for affinity, since we're only
2443         * supposed to change these variables while holding both rq->lock and
2444         * p->pi_lock.
2445         *
2446         * HOWEVER, it magically works, because ttwu() is the only code that
2447         * accesses these variables under p->pi_lock and only does so after
2448         * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
2449         * before finish_task().
2450         *
2451         * XXX do further audits, this smells like something putrid.
2452         */
2453        if (flags & SCA_MIGRATE_DISABLE)
2454                SCHED_WARN_ON(!p->on_cpu);
2455        else
2456                lockdep_assert_held(&p->pi_lock);
2457
2458        queued = task_on_rq_queued(p);
2459        running = task_current(rq, p);
2460
2461        if (queued) {
2462                /*
2463                 * Because __kthread_bind() calls this on blocked tasks without
2464                 * holding rq->lock.
2465                 */
2466                lockdep_assert_rq_held(rq);
2467                dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
2468        }
2469        if (running)
2470                put_prev_task(rq, p);
2471
2472        p->sched_class->set_cpus_allowed(p, new_mask, flags);
2473
2474        if (queued)
2475                enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
2476        if (running)
2477                set_next_task(rq, p);
2478}
2479
2480void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2481{
2482        __do_set_cpus_allowed(p, new_mask, 0);
2483}
2484
2485/*
2486 * This function is wildly self-concurrent; here be dragons.
2487 *
2488 *
2489 * When given a valid mask, __set_cpus_allowed_ptr() must block until the
2490 * designated task is enqueued on an allowed CPU. If that task is currently
2491 * running, we have to kick it out using the CPU stopper.
2492 *
2493 * Migrate-Disable comes along and tramples all over our nice sandcastle.
2494 * Consider:
2495 *
2496 *     Initial conditions: P0->cpus_mask = [0, 1]
2497 *
2498 *     P0@CPU0                  P1
2499 *
2500 *     migrate_disable();
2501 *     <preempted>
2502 *                              set_cpus_allowed_ptr(P0, [1]);
2503 *
2504 * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
2505 * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
2506 * This means we need the following scheme:
2507 *
2508 *     P0@CPU0                  P1
2509 *
2510 *     migrate_disable();
2511 *     <preempted>
2512 *                              set_cpus_allowed_ptr(P0, [1]);
2513 *                                <blocks>
2514 *     <resumes>
2515 *     migrate_enable();
2516 *       __set_cpus_allowed_ptr();
2517 *       <wakes local stopper>
2518 *                         `--> <woken on migration completion>
2519 *
2520 * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
2521 * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
2522 * task p are serialized by p->pi_lock, which we can leverage: the one that
2523 * should come into effect at the end of the Migrate-Disable region is the last
2524 * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
2525 * but we still need to properly signal those waiting tasks at the appropriate
2526 * moment.
2527 *
2528 * This is implemented using struct set_affinity_pending. The first
2529 * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
2530 * setup an instance of that struct and install it on the targeted task_struct.
2531 * Any and all further callers will reuse that instance. Those then wait for
2532 * a completion signaled at the tail of the CPU stopper callback (1), triggered
2533 * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
2534 *
2535 *
2536 * (1) In the cases covered above. There is one more where the completion is
2537 * signaled within affine_move_task() itself: when a subsequent affinity request
2538 * occurs after the stopper bailed out due to the targeted task still being
2539 * Migrate-Disable. Consider:
2540 *
2541 *     Initial conditions: P0->cpus_mask = [0, 1]
2542 *
2543 *     CPU0               P1                            P2
2544 *     <P0>
2545 *       migrate_disable();
2546 *       <preempted>
2547 *                        set_cpus_allowed_ptr(P0, [1]);
2548 *                          <blocks>
2549 *     <migration/0>
2550 *       migration_cpu_stop()
2551 *         is_migration_disabled()
2552 *           <bails>
2553 *                                                       set_cpus_allowed_ptr(P0, [0, 1]);
2554 *                                                         <signal completion>
2555 *                          <awakes>
2556 *
2557 * Note that the above is safe vs a concurrent migrate_enable(), as any
2558 * pending affinity completion is preceded by an uninstallation of
2559 * p->migration_pending done with p->pi_lock held.
2560 */
2561static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
2562                            int dest_cpu, unsigned int flags)
2563{
2564        struct set_affinity_pending my_pending = { }, *pending = NULL;
2565        bool stop_pending, complete = false;
2566
2567        /* Can the task run on the task's current CPU? If so, we're done */
2568        if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
2569                struct task_struct *push_task = NULL;
2570
2571                if ((flags & SCA_MIGRATE_ENABLE) &&
2572                    (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
2573                        rq->push_busy = true;
2574                        push_task = get_task_struct(p);
2575                }
2576
2577                /*
2578                 * If there are pending waiters, but no pending stop_work,
2579                 * then complete now.
2580                 */
2581                pending = p->migration_pending;
2582                if (pending && !pending->stop_pending) {
2583                        p->migration_pending = NULL;
2584                        complete = true;
2585                }
2586
2587                task_rq_unlock(rq, p, rf);
2588
2589                if (push_task) {
2590                        stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
2591                                            p, &rq->push_work);
2592                }
2593
2594                if (complete)
2595                        complete_all(&pending->done);
2596
2597                return 0;
2598        }
2599
2600        if (!(flags & SCA_MIGRATE_ENABLE)) {
2601                /* serialized by p->pi_lock */
2602                if (!p->migration_pending) {
2603                        /* Install the request */
2604                        refcount_set(&my_pending.refs, 1);
2605                        init_completion(&my_pending.done);
2606                        my_pending.arg = (struct migration_arg) {
2607                                .task = p,
2608                                .dest_cpu = dest_cpu,
2609                                .pending = &my_pending,
2610                        };
2611
2612                        p->migration_pending = &my_pending;
2613                } else {
2614                        pending = p->migration_pending;
2615                        refcount_inc(&pending->refs);
2616                        /*
2617                         * Affinity has changed, but we've already installed a
2618                         * pending. migration_cpu_stop() *must* see this, else
2619                         * we risk a completion of the pending despite having a
2620                         * task on a disallowed CPU.
2621                         *
2622                         * Serialized by p->pi_lock, so this is safe.
2623                         */
2624                        pending->arg.dest_cpu = dest_cpu;
2625                }
2626        }
2627        pending = p->migration_pending;
2628        /*
2629         * - !MIGRATE_ENABLE:
2630         *   we'll have installed a pending if there wasn't one already.
2631         *
2632         * - MIGRATE_ENABLE:
2633         *   we're here because the current CPU isn't matching anymore,
2634         *   the only way that can happen is because of a concurrent
2635         *   set_cpus_allowed_ptr() call, which should then still be
2636         *   pending completion.
2637         *
2638         * Either way, we really should have a @pending here.
2639         */
2640        if (WARN_ON_ONCE(!pending)) {
2641                task_rq_unlock(rq, p, rf);
2642                return -EINVAL;
2643        }
2644
2645        if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
2646                /*
2647                 * MIGRATE_ENABLE gets here because 'p == current'; for anything
2648                 * else we cannot reliably evaluate is_migration_disabled(), so punt
2649                 * and have the stopper function handle it all race-free.
2650                 */
2651                stop_pending = pending->stop_pending;
2652                if (!stop_pending)
2653                        pending->stop_pending = true;
2654
2655                if (flags & SCA_MIGRATE_ENABLE)
2656                        p->migration_flags &= ~MDF_PUSH;
2657
2658                task_rq_unlock(rq, p, rf);
2659
2660                if (!stop_pending) {
2661                        stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
2662                                            &pending->arg, &pending->stop_work);
2663                }
2664
2665                if (flags & SCA_MIGRATE_ENABLE)
2666                        return 0;
2667        } else {
2668
2669                if (!is_migration_disabled(p)) {
2670                        if (task_on_rq_queued(p))
2671                                rq = move_queued_task(rq, rf, p, dest_cpu);
2672
2673                        if (!pending->stop_pending) {
2674                                p->migration_pending = NULL;
2675                                complete = true;
2676                        }
2677                }
2678                task_rq_unlock(rq, p, rf);
2679
2680                if (complete)
2681                        complete_all(&pending->done);
2682        }
2683
2684        wait_for_completion(&pending->done);
2685
2686        if (refcount_dec_and_test(&pending->refs))
2687                wake_up_var(&pending->refs); /* No UaF, just an address */
2688
2689        /*
2690         * Block the original owner of &pending until all subsequent callers
2691         * have seen the completion and decremented the refcount
2692         */
2693        wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
2694
2695        /* ARGH */
2696        WARN_ON_ONCE(my_pending.stop_pending);
2697
2698        return 0;
2699}
2700
2701/*
2702 * Change a given task's CPU affinity. Migrate the thread to a
2703 * proper CPU and schedule it away if the CPU it's executing on
2704 * is removed from the allowed bitmask.
2705 *
2706 * NOTE: the caller must have a valid reference to the task, the
2707 * task must not exit() & deallocate itself prematurely. The
2708 * call is not atomic; no spinlocks may be held.
2709 */
2710static int __set_cpus_allowed_ptr(struct task_struct *p,
2711                                  const struct cpumask *new_mask,
2712                                  u32 flags)
2713{
2714        const struct cpumask *cpu_valid_mask = cpu_active_mask;
2715        unsigned int dest_cpu;
2716        struct rq_flags rf;
2717        struct rq *rq;
2718        int ret = 0;
2719
2720        rq = task_rq_lock(p, &rf);
2721        update_rq_clock(rq);
2722
2723        if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
2724                /*
2725                 * Kernel threads are allowed on online && !active CPUs,
2726                 * however, during cpu-hot-unplug, even these might get pushed
2727                 * away if not KTHREAD_IS_PER_CPU.
2728                 *
2729                 * Specifically, migration_disabled() tasks must not fail the
2730                 * cpumask_any_and_distribute() pick below, esp. so on
2731                 * SCA_MIGRATE_ENABLE, otherwise we'll not call
2732                 * set_cpus_allowed_common() and actually reset p->cpus_ptr.
2733                 */
2734                cpu_valid_mask = cpu_online_mask;
2735        }
2736
2737        /*
2738         * Must re-check here, to close a race against __kthread_bind():
2739         * sched_setaffinity() is not guaranteed to observe the flag.
2740         */
2741        if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
2742                ret = -EINVAL;
2743                goto out;
2744        }
2745
2746        if (!(flags & SCA_MIGRATE_ENABLE)) {
2747                if (cpumask_equal(&p->cpus_mask, new_mask))
2748                        goto out;
2749
2750                if (WARN_ON_ONCE(p == current &&
2751                                 is_migration_disabled(p) &&
2752                                 !cpumask_test_cpu(task_cpu(p), new_mask))) {
2753                        ret = -EBUSY;
2754                        goto out;
2755                }
2756        }
2757
2758        /*
2759         * Picking a ~random cpu helps in cases where we are changing affinity
2760         * for groups of tasks (i.e. cpuset), so that load balancing is not
2761         * immediately required to distribute the tasks within their new mask.
2762         */
2763        dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
2764        if (dest_cpu >= nr_cpu_ids) {
2765                ret = -EINVAL;
2766                goto out;
2767        }
2768
2769        __do_set_cpus_allowed(p, new_mask, flags);
2770
2771        return affine_move_task(rq, p, &rf, dest_cpu, flags);
2772
2773out:
2774        task_rq_unlock(rq, p, &rf);
2775
2776        return ret;
2777}
2778
2779int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
2780{
2781        return __set_cpus_allowed_ptr(p, new_mask, 0);
2782}
2783EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
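
/*
 * Module-side usage sketch (hypothetical worker, not part of this file):
 *
 *	struct task_struct *tsk;
 *
 *	tsk = kthread_create(my_worker_fn, NULL, "my_worker");
 *	if (!IS_ERR(tsk)) {
 *		set_cpus_allowed_ptr(tsk, cpumask_of(2));	// pin to CPU 2
 *		wake_up_process(tsk);
 *	}
 *
 * For strictly per-CPU kthreads, kthread_bind() is the preferred interface;
 * it additionally sets PF_NO_SETAFFINITY so userspace cannot move the thread.
 */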
2784
2785void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2786{
2787#ifdef CONFIG_SCHED_DEBUG
2788        unsigned int state = READ_ONCE(p->__state);
2789
2790        /*
2791         * We should never call set_task_cpu() on a blocked task,
2792         * ttwu() will sort out the placement.
2793         */
2794        WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
2795
2796        /*
2797         * A migrating fair-class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
2798         * because schedstat_wait_{start,end} rebase the migrating task's wait_start
2799         * time relying on p->on_rq.
2800         */
2801        WARN_ON_ONCE(state == TASK_RUNNING &&
2802                     p->sched_class == &fair_sched_class &&
2803                     (p->on_rq && !task_on_rq_migrating(p)));
2804
2805#ifdef CONFIG_LOCKDEP
2806        /*
2807         * The caller should hold either p->pi_lock or rq->lock, when changing
2808         * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2809         *
2810         * sched_move_task() holds both and thus holding either pins the cgroup,
2811         * see task_group().
2812         *
2813         * Furthermore, all task_rq users should acquire both locks, see
2814         * task_rq_lock().
2815         */
2816        WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2817                                      lockdep_is_held(__rq_lockp(task_rq(p)))));
2818#endif
2819        /*
2820         * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
2821         */
2822        WARN_ON_ONCE(!cpu_online(new_cpu));
2823
2824        WARN_ON_ONCE(is_migration_disabled(p));
2825#endif
2826
2827        trace_sched_migrate_task(p, new_cpu);
2828
2829        if (task_cpu(p) != new_cpu) {
2830                if (p->sched_class->migrate_task_rq)
2831                        p->sched_class->migrate_task_rq(p, new_cpu);
2832                p->se.nr_migrations++;
2833                rseq_migrate(p);
2834                perf_event_task_migrate(p);
2835        }
2836
2837        __set_task_cpu(p, new_cpu);
2838}
2839
2840#ifdef CONFIG_NUMA_BALANCING
2841static void __migrate_swap_task(struct task_struct *p, int cpu)
2842{
2843        if (task_on_rq_queued(p)) {
2844                struct rq *src_rq, *dst_rq;
2845                struct rq_flags srf, drf;
2846
2847                src_rq = task_rq(p);
2848                dst_rq = cpu_rq(cpu);
2849
2850                rq_pin_lock(src_rq, &srf);
2851                rq_pin_lock(dst_rq, &drf);
2852
2853                deactivate_task(src_rq, p, 0);
2854                set_task_cpu(p, cpu);
2855                activate_task(dst_rq, p, 0);
2856                check_preempt_curr(dst_rq, p, 0);
2857
2858                rq_unpin_lock(dst_rq, &drf);
2859                rq_unpin_lock(src_rq, &srf);
2860
2861        } else {
2862                /*
2863                 * Task isn't running anymore; make it appear like we migrated
2864                 * it before it went to sleep. This means on wakeup we make the
2865                 * previous CPU our target instead of where it really is.
2866                 */
2867                p->wake_cpu = cpu;
2868        }
2869}
2870
2871struct migration_swap_arg {
2872        struct task_struct *src_task, *dst_task;
2873        int src_cpu, dst_cpu;
2874};
2875
2876static int migrate_swap_stop(void *data)
2877{
2878        struct migration_swap_arg *arg = data;
2879        struct rq *src_rq, *dst_rq;
2880        int ret = -EAGAIN;
2881
2882        if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
2883                return -EAGAIN;
2884
2885        src_rq = cpu_rq(arg->src_cpu);
2886        dst_rq = cpu_rq(arg->dst_cpu);
2887
2888        double_raw_lock(&arg->src_task->pi_lock,
2889                        &arg->dst_task->pi_lock);
2890        double_rq_lock(src_rq, dst_rq);
2891
2892        if (task_cpu(arg->dst_task) != arg->dst_cpu)
2893                goto unlock;
2894
2895        if (task_cpu(arg->src_task) != arg->src_cpu)
2896                goto unlock;
2897
2898        if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
2899                goto unlock;
2900
2901        if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
2902                goto unlock;
2903
2904        __migrate_swap_task(arg->src_task, arg->dst_cpu);
2905        __migrate_swap_task(arg->dst_task, arg->src_cpu);
2906
2907        ret = 0;
2908
2909unlock:
2910        double_rq_unlock(src_rq, dst_rq);
2911        raw_spin_unlock(&arg->dst_task->pi_lock);
2912        raw_spin_unlock(&arg->src_task->pi_lock);
2913
2914        return ret;
2915}
2916
2917/*
2918 * Cross migrate two tasks
2919 */
2920int migrate_swap(struct task_struct *cur, struct task_struct *p,
2921                int target_cpu, int curr_cpu)
2922{
2923        struct migration_swap_arg arg;
2924        int ret = -EINVAL;
2925
2926        arg = (struct migration_swap_arg){
2927                .src_task = cur,
2928                .src_cpu = curr_cpu,
2929                .dst_task = p,
2930                .dst_cpu = target_cpu,
2931        };
2932
2933        if (arg.src_cpu == arg.dst_cpu)
2934                goto out;
2935
2936        /*
2937         * These three tests are all lockless; this is OK since all of them
2938         * will be re-checked with proper locks held further down the line.
2939         */
2940        if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
2941                goto out;
2942
2943        if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
2944                goto out;
2945
2946        if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
2947                goto out;
2948
2949        trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
2950        ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
2951
2952out:
2953        return ret;
2954}
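
/*
 * Caller sketch: the NUMA balancer (task_numa_migrate() in fair.c) is,
 * roughly, the user of this interface:
 *
 *	if (env.best_task)
 *		ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
 *	else
 *		ret = migrate_task_to(p, env.best_cpu);
 *
 * i.e. when the preferred placement already hosts a task worth exchanging
 * with, both tasks are swapped via stop_two_cpus() instead of performing two
 * independent migrations.
 */
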
2955#endif /* CONFIG_NUMA_BALANCING */
2956
2957/*
2958 * wait_task_inactive - wait for a thread to unschedule.
2959 *
2960 * If @match_state is nonzero, it's the @p->state value just checked and
2961 * not expected to change.  If it changes, i.e. @p might have woken up,
2962 * then return zero.  When we succeed in waiting for @p to be off its CPU,
2963 * we return a positive number (its total switch count).  If a second call
2964 * a short while later returns the same number, the caller can be sure that
2965 * @p has remained unscheduled the whole time.
2966 *
2967 * The caller must ensure that the task *will* unschedule sometime soon,
2968 * else this function might spin for a *long* time. This function can't
2969 * be called with interrupts off, or it may introduce deadlock with
2970 * smp_call_function() if an IPI is sent by the same process we are
2971 * waiting to become inactive.
2972 */
2973unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
2974{
2975        int running, queued;
2976        struct rq_flags rf;
2977        unsigned long ncsw;
2978        struct rq *rq;
2979
2980        for (;;) {
2981                /*
2982                 * We do the initial early heuristics without holding
2983                 * any task-queue locks at all. We'll only try to get
2984                 * the runqueue lock when things look like they will
2985                 * work out!
2986                 */
2987                rq = task_rq(p);
2988
2989                /*
2990                 * If the task is actively running on another CPU
2991                 * still, just relax and busy-wait without holding
2992                 * any locks.
2993                 *
2994                 * NOTE! Since we don't hold any locks, it's not
2995                 * even sure that "rq" stays as the right runqueue!
2996                 * But we don't care, since "task_running()" will
2997                 * return false if the runqueue has changed and p
2998                 * is actually now running somewhere else!
2999                 */
3000                while (task_running(rq, p)) {
3001                        if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
3002                                return 0;
3003                        cpu_relax();
3004                }
3005
3006                /*
3007                 * Ok, time to look more closely! We need the rq
3008                 * lock now, to be *sure*. If we're wrong, we'll
3009                 * just go back and repeat.
3010                 */
3011                rq = task_rq_lock(p, &rf);
3012                trace_sched_wait_task(p);
3013                running = task_running(rq, p);
3014                queued = task_on_rq_queued(p);
3015                ncsw = 0;
3016                if (!match_state || READ_ONCE(p->__state) == match_state)
3017                        ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
3018                task_rq_unlock(rq, p, &rf);
3019
3020                /*
3021                 * If it changed from the expected state, bail out now.
3022                 */
3023                if (unlikely(!ncsw))
3024                        break;
3025
3026                /*
3027                 * Was it really running after all now that we
3028                 * checked with the proper locks actually held?
3029                 *
3030                 * Oops. Go back and try again..
3031                 */
3032                if (unlikely(running)) {
3033                        cpu_relax();
3034                        continue;
3035                }
3036
3037                /*
3038                 * It's not enough that it's not actively running,
3039                 * it must be off the runqueue _entirely_, and not
3040                 * preempted!
3041                 *
3042                 * So if it was still runnable (but just not actively
3043                 * running right now), it's preempted, and we should
3044                 * yield - it could be a while.
3045                 */
3046                if (unlikely(queued)) {
3047                        ktime_t to = NSEC_PER_SEC / HZ;
3048
3049                        set_current_state(TASK_UNINTERRUPTIBLE);
3050                        schedule_hrtimeout(&to, HRTIMER_MODE_REL);
3051                        continue;
3052                }
3053
3054                /*
3055                 * Ahh, all good. It wasn't running, and it wasn't
3056                 * runnable, which means that it will never become
3057                 * running in the future either. We're all done!
3058                 */
3059                break;
3060        }
3061
3062        return ncsw;
3063}
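
/*
 * Two-step usage sketch, following the documentation above (editorial;
 * TASK_TRACED is used purely as an example @match_state):
 *
 *	unsigned long ncsw;
 *
 *	ncsw = wait_task_inactive(p, TASK_TRACED);
 *	if (!ncsw)
 *		return -ESRCH;		// state changed under us, e.g. @p woke up
 *
 *	// ... inspect @p while it is off-CPU ...
 *
 *	if (wait_task_inactive(p, TASK_TRACED) != ncsw)
 *		goto retry;		// @p ran in between; our snapshot is stale
 *
 * The switch count has LONG_MIN ORed in, so a successful return is never 0.
 */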
3064
3065/***
3066 * kick_process - kick a running thread to enter/exit the kernel
3067 * @p: the to-be-kicked thread
3068 *
3069 * Cause a process which is running on another CPU to enter
3070 * kernel-mode, without any delay. (to get signals handled.)
3071 *
3072 * NOTE: this function doesn't have to take the runqueue lock,
3073 * because all it wants to ensure is that the remote task enters
3074 * the kernel. If the IPI races and the task has been migrated
3075 * to another CPU then no harm is done and the purpose has been
3076 * achieved as well.
3077 */
3078void kick_process(struct task_struct *p)
3079{
3080        int cpu;
3081
3082        preempt_disable();
3083        cpu = task_cpu(p);
3084        if ((cpu != smp_processor_id()) && task_curr(p))
3085                smp_send_reschedule(cpu);
3086        preempt_enable();
3087}
3088EXPORT_SYMBOL_GPL(kick_process);
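
/*
 * Example caller (sketch): signal delivery.  signal_wake_up_state() sets
 * TIF_SIGPENDING and, if the target could not simply be woken, ends up here
 * so a task currently executing in userspace on another CPU re-enters the
 * kernel and notices the pending signal:
 *
 *	set_tsk_thread_flag(t, TIF_SIGPENDING);
 *	if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
 *		kick_process(t);
 */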
3089
3090/*
3091 * ->cpus_ptr is protected by both rq->lock and p->pi_lock
3092 *
3093 * A few notes on cpu_active vs cpu_online:
3094 *
3095 *  - cpu_active must be a subset of cpu_online
3096 *
3097 *  - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
3098 *    see __set_cpus_allowed_ptr(). At this point the newly online
3099 *    CPU isn't yet part of the sched domains, and balancing will not
3100 *    see it.
3101 *
3102 *  - on CPU-down we clear cpu_active() to mask the sched domains and
3103 *    prevent the load balancer from placing new tasks on the to-be-removed
3104 *    CPU. Existing tasks will remain running there and will be taken
3105 *    off.
3106 *
3107 * This means that fallback selection must not select !active CPUs.
3108 * And can assume that any active CPU must be online. Conversely
3109 * select_task_rq() below may allow selection of !active CPUs in order
3110 * to satisfy the above rules.
3111 */
3112static int select_fallback_rq(int cpu, struct task_struct *p)
3113{
3114        int nid = cpu_to_node(cpu);
3115        const struct cpumask *nodemask = NULL;
3116        enum { cpuset, possible, fail } state = cpuset;
3117        int dest_cpu;
3118
3119        /*
3120         * If the node that the CPU is on has been offlined, cpu_to_node()
3121         * will return -1. There is no CPU on the node, and we should
3122         * select a CPU on another node.
3123         */
3124        if (nid != -1) {
3125                nodemask = cpumask_of_node(nid);
3126
3127                /* Look for allowed, online CPU in same node. */
3128                for_each_cpu(dest_cpu, nodemask) {
3129                        if (!cpu_active(dest_cpu))
3130                                continue;
3131                        if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
3132                                return dest_cpu;
3133                }
3134        }
3135
3136        for (;;) {
3137                /* Any allowed, online CPU? */
3138                for_each_cpu(dest_cpu, p->cpus_ptr) {
3139                        if (!is_cpu_allowed(p, dest_cpu))
3140                                continue;
3141
3142                        goto out;
3143                }
3144
3145                /* No more Mr. Nice Guy. */
3146                switch (state) {
3147                case cpuset:
3148                        if (IS_ENABLED(CONFIG_CPUSETS)) {
3149                                cpuset_cpus_allowed_fallback(p);
3150                                state = possible;
3151                                break;
3152                        }
3153                        fallthrough;
3154                case possible:
3155                        /*
3156                         * XXX When called from select_task_rq() we only
3157                         * hold p->pi_lock and again violate locking order.
3158                         *
3159                         * More yuck to audit.
3160                         */
3161                        do_set_cpus_allowed(p, cpu_possible_mask);
3162                        state = fail;
3163                        break;
3164
3165                case fail:
3166                        BUG();
3167                        break;
3168                }
3169        }
3170
3171out:
3172        if (state != cpuset) {
3173                /*
3174                 * Don't tell them about moving exiting tasks or
3175                 * kernel threads (both mm NULL), since they never
3176                 * leave the kernel.
3177                 */
3178                if (p->mm && printk_ratelimit()) {
3179                        printk_deferred("process %d (%s) no longer affine to cpu%d\n",
3180                                        task_pid_nr(p), p->comm, cpu);
3181                }
3182        }
3183
3184        return dest_cpu;
3185}
3186
3187/*
3188 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
3189 */
3190static inline
3191int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
3192{
3193        lockdep_assert_held(&p->pi_lock);
3194
3195        if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
3196                cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
3197        else
3198                cpu = cpumask_any(p->cpus_ptr);
3199
3200        /*
3201         * In order not to call set_task_cpu() on a blocking task we need
3202         * to rely on ttwu() to place the task on a valid ->cpus_ptr
3203         * CPU.
3204         *
3205         * Since this is common to all placement strategies, this lives here.
3206         *
3207         * [ this allows ->select_task_rq() to simply return task_cpu(p) and
3208         *   not worry about this generic constraint ]
3209         */
3210        if (unlikely(!is_cpu_allowed(p, cpu)))
3211                cpu = select_fallback_rq(task_cpu(p), p);
3212
3213        return cpu;
3214}
3215
3216void sched_set_stop_task(int cpu, struct task_struct *stop)
3217{
3218        static struct lock_class_key stop_pi_lock;
3219        struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
3220        struct task_struct *old_stop = cpu_rq(cpu)->stop;
3221
3222        if (stop) {
3223                /*
3224                 * Make it appear like a SCHED_FIFO task; it's something
3225                 * userspace knows about and won't get confused about.
3226                 *
3227                 * Also, it will make PI more or less work without too
3228                 * much confusion -- but then, stop work should not
3229                 * rely on PI working anyway.
3230                 */
3231                sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
3232
3233                stop->sched_class = &stop_sched_class;
3234
3235                /*
3236                 * The PI code calls rt_mutex_setprio() with ->pi_lock held to
3237                 * adjust the effective priority of a task. As a result,
3238                 * rt_mutex_setprio() can trigger (RT) balancing operations,
3239                 * which can then trigger wakeups of the stop thread to push
3240                 * around the current task.
3241                 *
3242                 * The stop task itself will never be part of the PI-chain, it
3243                 * never blocks, therefore that ->pi_lock recursion is safe.
3244                 * Tell lockdep about this by placing the stop->pi_lock in its
3245                 * own class.
3246                 */
3247                lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
3248        }
3249
3250        cpu_rq(cpu)->stop = stop;
3251
3252        if (old_stop) {
3253                /*
3254                 * Reset it back to a normal scheduling class so that
3255                 * it can die in pieces.
3256                 */
3257                old_stop->sched_class = &rt_sched_class;
3258        }
3259}
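
/*
 * The stop task is not used directly; it is driven by the cpu_stop
 * machinery (kernel/stop_machine.c). A minimal sketch of running work at
 * stop-class priority on a given CPU -- the callback name below is made up
 * for illustration:
 *
 *	static int my_stop_fn(void *arg)
 *	{
 *		return 0;	// runs on the target CPU, preempting everything else
 *	}
 *
 *	stop_one_cpu(cpu, my_stop_fn, NULL);
 *
 * sched_exec() further down uses this interface with migration_cpu_stop().
 */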
3260
3261#else /* CONFIG_SMP */
3262
3263static inline int __set_cpus_allowed_ptr(struct task_struct *p,
3264                                         const struct cpumask *new_mask,
3265                                         u32 flags)
3266{
3267        return set_cpus_allowed_ptr(p, new_mask);
3268}
3269
3270static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
3271
3272static inline bool rq_has_pinned_tasks(struct rq *rq)
3273{
3274        return false;
3275}
3276
3277#endif /* !CONFIG_SMP */
3278
3279static void
3280ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
3281{
3282        struct rq *rq;
3283
3284        if (!schedstat_enabled())
3285                return;
3286
3287        rq = this_rq();
3288
3289#ifdef CONFIG_SMP
3290        if (cpu == rq->cpu) {
3291                __schedstat_inc(rq->ttwu_local);
3292                __schedstat_inc(p->se.statistics.nr_wakeups_local);
3293        } else {
3294                struct sched_domain *sd;
3295
3296                __schedstat_inc(p->se.statistics.nr_wakeups_remote);
3297                rcu_read_lock();
3298                for_each_domain(rq->cpu, sd) {
3299                        if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
3300                                __schedstat_inc(sd->ttwu_wake_remote);
3301                                break;
3302                        }
3303                }
3304                rcu_read_unlock();
3305        }
3306
3307        if (wake_flags & WF_MIGRATED)
3308                __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
3309#endif /* CONFIG_SMP */
3310
3311        __schedstat_inc(rq->ttwu_count);
3312        __schedstat_inc(p->se.statistics.nr_wakeups);
3313
3314        if (wake_flags & WF_SYNC)
3315                __schedstat_inc(p->se.statistics.nr_wakeups_sync);
3316}
3317
3318/*
3319 * Mark the task runnable and perform wakeup-preemption.
3320 */
3321static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
3322                           struct rq_flags *rf)
3323{
3324        check_preempt_curr(rq, p, wake_flags);
3325        WRITE_ONCE(p->__state, TASK_RUNNING);
3326        trace_sched_wakeup(p);
3327
3328#ifdef CONFIG_SMP
3329        if (p->sched_class->task_woken) {
3330                /*
3331                 * Our task @p is fully woken up and running; so it's safe to
3332                 * drop the rq->lock, hereafter rq is only used for statistics.
3333                 */
3334                rq_unpin_lock(rq, rf);
3335                p->sched_class->task_woken(rq, p);
3336                rq_repin_lock(rq, rf);
3337        }
3338
3339        if (rq->idle_stamp) {
3340                u64 delta = rq_clock(rq) - rq->idle_stamp;
3341                u64 max = 2*rq->max_idle_balance_cost;
3342
3343                update_avg(&rq->avg_idle, delta);
3344
3345                if (rq->avg_idle > max)
3346                        rq->avg_idle = max;
3347
3348                rq->wake_stamp = jiffies;
3349                rq->wake_avg_idle = rq->avg_idle / 2;
3350
3351                rq->idle_stamp = 0;
3352        }
3353#endif
3354}
3355
3356static void
3357ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
3358                 struct rq_flags *rf)
3359{
3360        int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
3361
3362        lockdep_assert_rq_held(rq);
3363
3364        if (p->sched_contributes_to_load)
3365                rq->nr_uninterruptible--;
3366
3367#ifdef CONFIG_SMP
3368        if (wake_flags & WF_MIGRATED)
3369                en_flags |= ENQUEUE_MIGRATED;
3370        else
3371#endif
3372        if (p->in_iowait) {
3373                delayacct_blkio_end(p);
3374                atomic_dec(&task_rq(p)->nr_iowait);
3375        }
3376
3377        activate_task(rq, p, en_flags);
3378        ttwu_do_wakeup(rq, p, wake_flags, rf);
3379}
3380
3381/*
3382 * Consider @p being inside a wait loop:
3383 *
3384 *   for (;;) {
3385 *      set_current_state(TASK_UNINTERRUPTIBLE);
3386 *
3387 *      if (CONDITION)
3388 *         break;
3389 *
3390 *      schedule();
3391 *   }
3392 *   __set_current_state(TASK_RUNNING);
3393 *
3394 * and a wakeup arriving between set_current_state() and schedule(). In this
3395 * case @p is still runnable, so all that needs doing is to change p->state
3396 * back to TASK_RUNNING in an atomic manner.
3397 *
3398 * By taking task_rq(p)->lock we serialize against schedule(); if @p->on_rq
3399 * then schedule() must still happen and p->state can be changed to
3400 * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
3401 * need to do a full wakeup with enqueue.
3402 *
3403 * Returns: %true when the wakeup is done,
3404 *          %false otherwise.
3405 */
3406static int ttwu_runnable(struct task_struct *p, int wake_flags)
3407{
3408        struct rq_flags rf;
3409        struct rq *rq;
3410        int ret = 0;
3411
3412        rq = __task_rq_lock(p, &rf);
3413        if (task_on_rq_queued(p)) {
3414                /* check_preempt_curr() may use rq clock */
3415                update_rq_clock(rq);
3416                ttwu_do_wakeup(rq, p, wake_flags, &rf);
3417                ret = 1;
3418        }
3419        __task_rq_unlock(rq, &rf);
3420
3421        return ret;
3422}
3423
3424#ifdef CONFIG_SMP
3425void sched_ttwu_pending(void *arg)
3426{
3427        struct llist_node *llist = arg;
3428        struct rq *rq = this_rq();
3429        struct task_struct *p, *t;
3430        struct rq_flags rf;
3431
3432        if (!llist)
3433                return;
3434
3435        /*
3436         * rq::ttwu_pending is a racy indication of outstanding wakeups.
3437         * It can race such that false negatives are possible, since they
3438         * are shorter lived than false positives would be.
3439         */
3440        WRITE_ONCE(rq->ttwu_pending, 0);
3441
3442        rq_lock_irqsave(rq, &rf);
3443        update_rq_clock(rq);
3444
3445        llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
3446                if (WARN_ON_ONCE(p->on_cpu))
3447                        smp_cond_load_acquire(&p->on_cpu, !VAL);
3448
3449                if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
3450                        set_task_cpu(p, cpu_of(rq));
3451
3452                ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
3453        }
3454
3455        rq_unlock_irqrestore(rq, &rf);
3456}
3457
3458void send_call_function_single_ipi(int cpu)
3459{
3460        struct rq *rq = cpu_rq(cpu);
3461
3462        if (!set_nr_if_polling(rq->idle))
3463                arch_send_call_function_single_ipi(cpu);
3464        else
3465                trace_sched_wake_idle_without_ipi(cpu);
3466}
3467
3468/*
3469 * Queue a task on the target CPU's wake_list and wake the CPU via IPI if
3470 * necessary. The wakee CPU, on receipt of the IPI, will queue the task
3471 * via sched_ttwu_pending() for activation so the wakee incurs the cost
3472 * of the wakeup instead of the waker.
3473 */
3474static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3475{
3476        struct rq *rq = cpu_rq(cpu);
3477
3478        p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
3479
3480        WRITE_ONCE(rq->ttwu_pending, 1);
3481        __smp_call_single_queue(cpu, &p->wake_entry.llist);
3482}
3483
3484void wake_up_if_idle(int cpu)
3485{
3486        struct rq *rq = cpu_rq(cpu);
3487        struct rq_flags rf;
3488
3489        rcu_read_lock();
3490
3491        if (!is_idle_task(rcu_dereference(rq->curr)))
3492                goto out;
3493
3494        if (set_nr_if_polling(rq->idle)) {
3495                trace_sched_wake_idle_without_ipi(cpu);
3496        } else {
3497                rq_lock_irqsave(rq, &rf);
3498                if (is_idle_task(rq->curr))
3499                        smp_send_reschedule(cpu);
3500                /* Else CPU is not idle, do nothing here: */
3501                rq_unlock_irqrestore(rq, &rf);
3502        }
3503
3504out:
3505        rcu_read_unlock();
3506}
3507
3508bool cpus_share_cache(int this_cpu, int that_cpu)
3509{
3510        return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
3511}
3512
3513static inline bool ttwu_queue_cond(int cpu, int wake_flags)
3514{
3515        /*
3516         * Do not complicate things with the async wake_list while the CPU is
3517         * in hotplug state.
3518         */
3519        if (!cpu_active(cpu))
3520                return false;
3521
3522        /*
3523         * If the CPU does not share cache with the current CPU, then queue
3524         * the task on the remote rq's wakelist to avoid accessing remote data.
3525         */
3526        if (!cpus_share_cache(smp_processor_id(), cpu))
3527                return true;
3528
3529        /*
3530         * If the task is descheduling and is the only running task on the
3531         * CPU then use the wakelist to offload the task activation to
3532         * the soon-to-be-idle CPU as the current CPU is likely busy.
3533         * nr_running is checked to avoid unnecessary task stacking.
3534         */
3535        if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
3536                return true;
3537
3538        return false;
3539}
3540
3541static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3542{
3543        if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
3544                if (WARN_ON_ONCE(cpu == smp_processor_id()))
3545                        return false;
3546
3547                sched_clock_cpu(cpu); /* Sync clocks across CPUs */
3548                __ttwu_queue_wakelist(p, cpu, wake_flags);
3549                return true;
3550        }
3551
3552        return false;
3553}
3554
3555#else /* !CONFIG_SMP */
3556
3557static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3558{
3559        return false;
3560}
3561
3562#endif /* CONFIG_SMP */
3563
3564static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
3565{
3566        struct rq *rq = cpu_rq(cpu);
3567        struct rq_flags rf;
3568
3569        if (ttwu_queue_wakelist(p, cpu, wake_flags))
3570                return;
3571
3572        rq_lock(rq, &rf);
3573        update_rq_clock(rq);
3574        ttwu_do_activate(rq, p, wake_flags, &rf);
3575        rq_unlock(rq, &rf);
3576}
3577
3578/*
3579 * Notes on Program-Order guarantees on SMP systems.
3580 *
3581 *  MIGRATION
3582 *
3583 * The basic program-order guarantee on SMP systems is that when a task [t]
3584 * migrates, all its activity on its old CPU [c0] happens-before any subsequent
3585 * execution on its new CPU [c1].
3586 *
3587 * For migration (of runnable tasks) this is provided by the following means:
3588 *
3589 *  A) UNLOCK of the rq(c0)->lock scheduling out task t
3590 *  B) migration for t is required to synchronize *both* rq(c0)->lock and
3591 *     rq(c1)->lock (if not at the same time, then in that order).
3592 *  C) LOCK of the rq(c1)->lock scheduling in task
3593 *
3594 * Release/acquire chaining guarantees that B happens after A and C after B.
3595 * Note: the CPU doing B need not be c0 or c1
3596 *
3597 * Example:
3598 *
3599 *   CPU0            CPU1            CPU2
3600 *
3601 *   LOCK rq(0)->lock
3602 *   sched-out X
3603 *   sched-in Y
3604 *   UNLOCK rq(0)->lock
3605 *
3606 *                                   LOCK rq(0)->lock // orders against CPU0
3607 *                                   dequeue X
3608 *                                   UNLOCK rq(0)->lock
3609 *
3610 *                                   LOCK rq(1)->lock
3611 *                                   enqueue X
3612 *                                   UNLOCK rq(1)->lock
3613 *
3614 *                   LOCK rq(1)->lock // orders against CPU2
3615 *                   sched-out Z
3616 *                   sched-in X
3617 *                   UNLOCK rq(1)->lock
3618 *
3619 *
3620 *  BLOCKING -- aka. SLEEP + WAKEUP
3621 *
3622 * For blocking we (obviously) need to provide the same guarantee as for
3623 * migration. However the means are completely different as there is no lock
3624 * chain to provide order. Instead we do:
3625 *
3626 *   1) smp_store_release(X->on_cpu, 0)   -- finish_task()
3627 *   2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
3628 *
3629 * Example:
3630 *
3631 *   CPU0 (schedule)  CPU1 (try_to_wake_up) CPU2 (schedule)
3632 *
3633 *   LOCK rq(0)->lock LOCK X->pi_lock
3634 *   dequeue X
3635 *   sched-out X
3636 *   smp_store_release(X->on_cpu, 0);
3637 *
3638 *                    smp_cond_load_acquire(&X->on_cpu, !VAL);
3639 *                    X->state = WAKING
3640 *                    set_task_cpu(X,2)
3641 *
3642 *                    LOCK rq(2)->lock
3643 *                    enqueue X
3644 *                    X->state = RUNNING
3645 *                    UNLOCK rq(2)->lock
3646 *
3647 *                                          LOCK rq(2)->lock // orders against CPU1
3648 *                                          sched-out Z
3649 *                                          sched-in X
3650 *                                          UNLOCK rq(2)->lock
3651 *
3652 *                    UNLOCK X->pi_lock
3653 *   UNLOCK rq(0)->lock
3654 *
3655 *
3656 * However, for wakeups there is a second guarantee we must provide, namely we
3657 * must ensure that CONDITION=1 done by the caller can not be reordered with
3658 * accesses to the task state; see try_to_wake_up() and set_current_state().
3659 */
3660
3661/**
3662 * try_to_wake_up - wake up a thread
3663 * @p: the thread to be awakened
3664 * @state: the mask of task states that can be woken
3665 * @wake_flags: wake modifier flags (WF_*)
3666 *
3667 * Conceptually does:
3668 *
3669 *   If (@state & @p->state) @p->state = TASK_RUNNING.
3670 *
3671 * If the task was not queued/runnable, also place it back on a runqueue.
3672 *
3673 * This function is atomic against schedule() which would dequeue the task.
3674 *
3675 * It issues a full memory barrier before accessing @p->state, see the comment
3676 * with set_current_state().
3677 *
3678 * Uses p->pi_lock to serialize against concurrent wake-ups.
3679 *
3680 * Relies on p->pi_lock stabilizing:
3681 *  - p->sched_class
3682 *  - p->cpus_ptr
3683 *  - p->sched_task_group
3684 * in order to do migration, see its use of select_task_rq()/set_task_cpu().
3685 *
3686 * Tries really hard to only take one task_rq(p)->lock for performance.
3687 * Takes rq->lock in:
3688 *  - ttwu_runnable()    -- old rq, unavoidable, see comment there;
3689 *  - ttwu_queue()       -- new rq, for enqueue of the task;
3690 *  - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
3691 *
3692 * As a consequence we race really badly with just about everything. See the
3693 * many memory barriers and their comments for details.
3694 *
3695 * Return: %true if @p->state changes (an actual wakeup was done),
3696 *         %false otherwise.
3697 */
3698static int
3699try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
3700{
3701        unsigned long flags;
3702        int cpu, success = 0;
3703
3704        preempt_disable();
3705        if (p == current) {
3706                /*
3707                 * We're waking current; this means 'p->on_rq' and 'task_cpu(p)
3708                 * == smp_processor_id()'. Together this means we can special
3709                 * case the whole 'p->on_rq && ttwu_runnable()' case below
3710                 * without taking any locks.
3711                 *
3712                 * In particular:
3713                 *  - we rely on Program-Order guarantees for all the ordering,
3714                 *  - we're serialized against set_special_state() by virtue of
3715                 *    it disabling IRQs (this allows not taking ->pi_lock).
3716                 */
3717                if (!(READ_ONCE(p->__state) & state))
3718                        goto out;
3719
3720                success = 1;
3721                trace_sched_waking(p);
3722                WRITE_ONCE(p->__state, TASK_RUNNING);
3723                trace_sched_wakeup(p);
3724                goto out;
3725        }
3726
3727        /*
3728         * If we are going to wake up a thread waiting for CONDITION we
3729         * need to ensure that CONDITION=1 done by the caller can not be
3730         * reordered with p->state check below. This pairs with smp_store_mb()
3731         * in set_current_state() that the waiting thread does.
3732         */
3733        raw_spin_lock_irqsave(&p->pi_lock, flags);
3734        smp_mb__after_spinlock();
3735        if (!(READ_ONCE(p->__state) & state))
3736                goto unlock;
3737
3738        trace_sched_waking(p);
3739
3740        /* We're going to change ->state: */
3741        success = 1;
3742
3743        /*
3744         * Ensure we load p->on_rq _after_ p->state, otherwise it would
3745         * be possible to, falsely, observe p->on_rq == 0 and get stuck
3746         * in smp_cond_load_acquire() below.
3747         *
3748         * sched_ttwu_pending()                 try_to_wake_up()
3749         *   STORE p->on_rq = 1                   LOAD p->state
3750         *   UNLOCK rq->lock
3751         *
3752         * __schedule() (switch to task 'p')
3753         *   LOCK rq->lock                        smp_rmb();
3754         *   smp_mb__after_spinlock();
3755         *   UNLOCK rq->lock
3756         *
3757         * [task p]
3758         *   STORE p->state = UNINTERRUPTIBLE     LOAD p->on_rq
3759         *
3760         * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
3761         * __schedule().  See the comment for smp_mb__after_spinlock().
3762         *
3763         * A similar smp_rmb() lives in try_invoke_on_locked_down_task().
3764         */
3765        smp_rmb();
3766        if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
3767                goto unlock;
3768
3769#ifdef CONFIG_SMP
3770        /*
3771         * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
3772         * possible to, falsely, observe p->on_cpu == 0.
3773         *
3774         * One must be running (->on_cpu == 1) in order to remove oneself
3775         * from the runqueue.
3776         *
3777         * __schedule() (switch to task 'p')    try_to_wake_up()
3778         *   STORE p->on_cpu = 1                  LOAD p->on_rq
3779         *   UNLOCK rq->lock
3780         *
3781         * __schedule() (put 'p' to sleep)
3782         *   LOCK rq->lock                        smp_rmb();
3783         *   smp_mb__after_spinlock();
3784         *   STORE p->on_rq = 0                   LOAD p->on_cpu
3785         *
3786         * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
3787         * __schedule().  See the comment for smp_mb__after_spinlock().
3788         *
3789         * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
3790         * schedule()'s deactivate_task() has 'happened' and p will no longer
3791         * care about its own p->state. See the comment in __schedule().
3792         */
3793        smp_acquire__after_ctrl_dep();
3794
3795        /*
3796         * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
3797         * == 0), which means we need to do an enqueue, change p->state to
3798         * TASK_WAKING such that we can unlock p->pi_lock before doing the
3799         * enqueue, such as ttwu_queue_wakelist().
3800         */
3801        WRITE_ONCE(p->__state, TASK_WAKING);
3802
3803        /*
3804         * If the owning (remote) CPU is still in the middle of schedule() with
3805         * this task as prev, consider queueing p on the remote CPU's wake_list
3806         * which potentially sends an IPI instead of spinning on p->on_cpu to
3807         * let the waker make forward progress. This is safe because IRQs are
3808         * disabled and the IPI will deliver after on_cpu is cleared.
3809         *
3810         * Ensure we load task_cpu(p) after p->on_cpu:
3811         *
3812         * set_task_cpu(p, cpu);
3813         *   STORE p->cpu = @cpu
3814         * __schedule() (switch to task 'p')
3815         *   LOCK rq->lock
3816         *   smp_mb__after_spin_lock()          smp_cond_load_acquire(&p->on_cpu)
3817         *   STORE p->on_cpu = 1                LOAD p->cpu
3818         *
3819         * to ensure we observe the correct CPU on which the task is currently
3820         * scheduling.
3821         */
3822        if (smp_load_acquire(&p->on_cpu) &&
3823            ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
3824                goto unlock;
3825
3826        /*
3827         * If the owning (remote) CPU is still in the middle of schedule() with
3828         * this task as prev, wait until it's done referencing the task.
3829         *
3830         * Pairs with the smp_store_release() in finish_task().
3831         *
3832         * This ensures that tasks getting woken will be fully ordered against
3833         * their previous state and preserve Program Order.
3834         */
3835        smp_cond_load_acquire(&p->on_cpu, !VAL);
3836
3837        cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
3838        if (task_cpu(p) != cpu) {
3839                if (p->in_iowait) {
3840                        delayacct_blkio_end(p);
3841                        atomic_dec(&task_rq(p)->nr_iowait);
3842                }
3843
3844                wake_flags |= WF_MIGRATED;
3845                psi_ttwu_dequeue(p);
3846                set_task_cpu(p, cpu);
3847        }
3848#else
3849        cpu = task_cpu(p);
3850#endif /* CONFIG_SMP */
3851
3852        ttwu_queue(p, cpu, wake_flags);
3853unlock:
3854        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3855out:
3856        if (success)
3857                ttwu_stat(p, task_cpu(p), wake_flags);
3858        preempt_enable();
3859
3860        return success;
3861}
3862
3863/**
3864 * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
3865 * @p: Process for which the function is to be invoked, can be @current.
3866 * @func: Function to invoke.
3867 * @arg: Argument to function.
3868 *
3869 * If the specified task can be quickly locked into a definite state
3870 * (either sleeping or on a given runqueue), arrange to keep it in that
3871 * state while invoking @func(@arg).  This function can use ->on_rq and
3872 * task_curr() to work out what the state is, if required.  Given that
3873 * @func can be invoked with a runqueue lock held, it had better be quite
3874 * lightweight.
3875 *
3876 * Returns:
3877 *      @false if the task slipped out from under the locks.
3878 *      @true if the task was locked onto a runqueue or is sleeping.
3879 *              However, @func can override this by returning @false.
3880 */
3881bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
3882{
3883        struct rq_flags rf;
3884        bool ret = false;
3885        struct rq *rq;
3886
3887        raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3888        if (p->on_rq) {
3889                rq = __task_rq_lock(p, &rf);
3890                if (task_rq(p) == rq)
3891                        ret = func(p, arg);
3892                rq_unlock(rq, &rf);
3893        } else {
3894                switch (READ_ONCE(p->__state)) {
3895                case TASK_RUNNING:
3896                case TASK_WAKING:
3897                        break;
3898                default:
3899                        smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
3900                        if (!p->on_rq)
3901                                ret = func(p, arg);
3902                }
3903        }
3904        raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
3905        return ret;
3906}
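
/*
 * Example shape of a @func callback -- hypothetical, for illustration only;
 * it must stay cheap because it may be invoked with a runqueue lock held:
 *
 *	static bool task_is_blocked(struct task_struct *t, void *arg)
 *	{
 *		*(bool *)arg = !t->on_rq && !task_curr(t);
 *		return true;
 *	}
 *
 *	bool blocked = false;
 *
 *	if (try_invoke_on_locked_down_task(p, task_is_blocked, &blocked))
 *		pr_info("task blocked: %d\n", blocked);
 */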
3907
3908/**
3909 * wake_up_process - Wake up a specific process
3910 * @p: The process to be woken up.
3911 *
3912 * Attempt to wake up the nominated process and move it to the set of runnable
3913 * processes.
3914 *
3915 * Return: 1 if the process was woken up, 0 if it was already running.
3916 *
3917 * This function executes a full memory barrier before accessing the task state.
3918 */
3919int wake_up_process(struct task_struct *p)
3920{
3921        return try_to_wake_up(p, TASK_NORMAL, 0);
3922}
3923EXPORT_SYMBOL(wake_up_process);
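
/*
 * For illustration, the waker side that pairs with the wait loop shown
 * above ttwu_runnable(): first make CONDITION visible, then wake the
 * sleeper -- the required ordering against the sleeper's p->state check is
 * provided by the barriers in try_to_wake_up():
 *
 *	CONDITION = 1;
 *	wake_up_process(p);
 */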
3924
3925int wake_up_state(struct task_struct *p, unsigned int state)
3926{
3927        return try_to_wake_up(p, state, 0);
3928}
3929
3930/*
3931 * Perform scheduler related setup for a newly forked process p.
3932 * p is forked by current.
3933 *
3934 * __sched_fork() is basic setup used by init_idle() too:
3935 */
3936static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
3937{
3938        p->on_rq                        = 0;
3939
3940        p->se.on_rq                     = 0;
3941        p->se.exec_start                = 0;
3942        p->se.sum_exec_runtime          = 0;
3943        p->se.prev_sum_exec_runtime     = 0;
3944        p->se.nr_migrations             = 0;
3945        p->se.vruntime                  = 0;
3946        INIT_LIST_HEAD(&p->se.group_node);
3947
3948#ifdef CONFIG_FAIR_GROUP_SCHED
3949        p->se.cfs_rq                    = NULL;
3950#endif
3951
3952#ifdef CONFIG_SCHEDSTATS
3953        /* Even if schedstat is disabled, there should not be garbage */
3954        memset(&p->se.statistics, 0, sizeof(p->se.statistics));
3955#endif
3956
3957        RB_CLEAR_NODE(&p->dl.rb_node);
3958        init_dl_task_timer(&p->dl);
3959        init_dl_inactive_task_timer(&p->dl);
3960        __dl_clear_params(p);
3961
3962        INIT_LIST_HEAD(&p->rt.run_list);
3963        p->rt.timeout           = 0;
3964        p->rt.time_slice        = sched_rr_timeslice;
3965        p->rt.on_rq             = 0;
3966        p->rt.on_list           = 0;
3967
3968#ifdef CONFIG_PREEMPT_NOTIFIERS
3969        INIT_HLIST_HEAD(&p->preempt_notifiers);
3970#endif
3971
3972#ifdef CONFIG_COMPACTION
3973        p->capture_control = NULL;
3974#endif
3975        init_numa_balancing(clone_flags, p);
3976#ifdef CONFIG_SMP
3977        p->wake_entry.u_flags = CSD_TYPE_TTWU;
3978        p->migration_pending = NULL;
3979#endif
3980}
3981
3982DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
3983
3984#ifdef CONFIG_NUMA_BALANCING
3985
3986void set_numabalancing_state(bool enabled)
3987{
3988        if (enabled)
3989                static_branch_enable(&sched_numa_balancing);
3990        else
3991                static_branch_disable(&sched_numa_balancing);
3992}
3993
3994#ifdef CONFIG_PROC_SYSCTL
3995int sysctl_numa_balancing(struct ctl_table *table, int write,
3996                          void *buffer, size_t *lenp, loff_t *ppos)
3997{
3998        struct ctl_table t;
3999        int err;
4000        int state = static_branch_likely(&sched_numa_balancing);
4001
4002        if (write && !capable(CAP_SYS_ADMIN))
4003                return -EPERM;
4004
4005        t = *table;
4006        t.data = &state;
4007        err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
4008        if (err < 0)
4009                return err;
4010        if (write)
4011                set_numabalancing_state(state);
4012        return err;
4013}
4014#endif
4015#endif
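
/*
 * On CONFIG_NUMA_BALANCING && CONFIG_PROC_SYSCTL kernels the knob above is
 * exposed as /proc/sys/kernel/numa_balancing; for example:
 *
 *	echo 0 > /proc/sys/kernel/numa_balancing
 *
 * disables automatic NUMA balancing at run time.
 */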
4016
4017#ifdef CONFIG_SCHEDSTATS
4018
4019DEFINE_STATIC_KEY_FALSE(sched_schedstats);
4020
4021static void set_schedstats(bool enabled)
4022{
4023        if (enabled)
4024                static_branch_enable(&sched_schedstats);
4025        else
4026                static_branch_disable(&sched_schedstats);
4027}
4028
4029void force_schedstat_enabled(void)
4030{
4031        if (!schedstat_enabled()) {
4032                pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
4033                static_branch_enable(&sched_schedstats);
4034        }
4035}
4036
4037static int __init setup_schedstats(char *str)
4038{
4039        int ret = 0;
4040        if (!str)
4041                goto out;
4042
4043        if (!strcmp(str, "enable")) {
4044                set_schedstats(true);
4045                ret = 1;
4046        } else if (!strcmp(str, "disable")) {
4047                set_schedstats(false);
4048                ret = 1;
4049        }
4050out:
4051        if (!ret)
4052                pr_warn("Unable to parse schedstats=\n");
4053
4054        return ret;
4055}
4056__setup("schedstats=", setup_schedstats);
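
/*
 * Schedstats can therefore be toggled either at boot via the parameter
 * above or at run time via the sysctl handled below; for example:
 *
 *	schedstats=enable		(kernel command line)
 *	echo 1 > /proc/sys/kernel/sched_schedstats
 */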
4057
4058#ifdef CONFIG_PROC_SYSCTL
4059int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
4060                size_t *lenp, loff_t *ppos)
4061{
4062        struct ctl_table t;
4063        int err;
4064        int state = static_branch_likely(&sched_schedstats);
4065
4066        if (write && !capable(CAP_SYS_ADMIN))
4067                return -EPERM;
4068
4069        t = *table;
4070        t.data = &state;
4071        err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
4072        if (err < 0)
4073                return err;
4074        if (write)
4075                set_schedstats(state);
4076        return err;
4077}
4078#endif /* CONFIG_PROC_SYSCTL */
4079#endif /* CONFIG_SCHEDSTATS */
4080
4081/*
4082 * fork()/clone()-time setup:
4083 */
4084int sched_fork(unsigned long clone_flags, struct task_struct *p)
4085{
4086        unsigned long flags;
4087
4088        __sched_fork(clone_flags, p);
4089        /*
4090         * We mark the process as NEW here. This guarantees that
4091         * nobody will actually run it, and a signal or other external
4092         * event cannot wake it up and insert it on the runqueue either.
4093         */
4094        p->__state = TASK_NEW;
4095
4096        /*
4097         * Make sure we do not leak PI boosting priority to the child.
4098         */
4099        p->prio = current->normal_prio;
4100
4101        uclamp_fork(p);
4102
4103        /*
4104         * Revert to default priority/policy on fork if requested.
4105         */
4106        if (unlikely(p->sched_reset_on_fork)) {
4107                if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
4108                        p->policy = SCHED_NORMAL;
4109                        p->static_prio = NICE_TO_PRIO(0);
4110                        p->rt_priority = 0;
4111                } else if (PRIO_TO_NICE(p->static_prio) < 0)
4112                        p->static_prio = NICE_TO_PRIO(0);
4113
4114                p->prio = p->normal_prio = p->static_prio;
4115                set_load_weight(p, false);
4116
4117                /*
4118                 * We don't need the reset flag anymore after the fork. It has
4119                 * fulfilled its duty:
4120                 */
4121                p->sched_reset_on_fork = 0;
4122        }
4123
4124        if (dl_prio(p->prio))
4125                return -EAGAIN;
4126        else if (rt_prio(p->prio))
4127                p->sched_class = &rt_sched_class;
4128        else
4129                p->sched_class = &fair_sched_class;
4130
4131        init_entity_runnable_average(&p->se);
4132
4133        /*
4134         * The child is not yet in the pid-hash so no cgroup attach races,
4135         * and the cgroup is pinned to this child because cgroup_fork()
4136         * runs before sched_fork().
4137         *
4138         * Silence PROVE_RCU.
4139         */
4140        raw_spin_lock_irqsave(&p->pi_lock, flags);
4141        rseq_migrate(p);
4142        /*
4143         * We're setting the CPU for the first time, we don't migrate,
4144         * so use __set_task_cpu().
4145         */
4146        __set_task_cpu(p, smp_processor_id());
4147        if (p->sched_class->task_fork)
4148                p->sched_class->task_fork(p);
4149        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4150
4151#ifdef CONFIG_SCHED_INFO
4152        if (likely(sched_info_on()))
4153                memset(&p->sched_info, 0, sizeof(p->sched_info));
4154#endif
4155#if defined(CONFIG_SMP)
4156        p->on_cpu = 0;
4157#endif
4158        init_task_preempt_count(p);
4159#ifdef CONFIG_SMP
4160        plist_node_init(&p->pushable_tasks, MAX_PRIO);
4161        RB_CLEAR_NODE(&p->pushable_dl_tasks);
4162#endif
4163        return 0;
4164}
4165
4166void sched_post_fork(struct task_struct *p)
4167{
4168        uclamp_post_fork(p);
4169}
4170
4171unsigned long to_ratio(u64 period, u64 runtime)
4172{
4173        if (runtime == RUNTIME_INF)
4174                return BW_UNIT;
4175
4176        /*
4177         * Doing this here saves a lot of checks in all
4178         * the calling paths, and returning zero seems
4179         * safe for them anyway.
4180         */
4181        if (period == 0)
4182                return 0;
4183
4184        return div64_u64(runtime << BW_SHIFT, period);
4185}
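
/*
 * Worked example: with BW_SHIFT = 20 (so BW_UNIT = 1 << 20), a runtime of
 * 500000us over a period of 1000000us gives
 *
 *	to_ratio(1000000, 500000) = (500000 << 20) / 1000000 = 524288
 *
 * i.e. BW_UNIT / 2, a 50% bandwidth ratio.
 */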
4186
4187/*
4188 * wake_up_new_task - wake up a newly created task for the first time.
4189 *
4190 * This function will do some initial scheduler statistics housekeeping
4191 * that must be done for every newly created context, then puts the task
4192 * on the runqueue and wakes it.
4193 */
4194void wake_up_new_task(struct task_struct *p)
4195{
4196        struct rq_flags rf;
4197        struct rq *rq;
4198
4199        raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
4200        WRITE_ONCE(p->__state, TASK_RUNNING);
4201#ifdef CONFIG_SMP
4202        /*
4203         * Fork balancing, do it here and not earlier because:
4204         *  - cpus_ptr can change in the fork path
4205         *  - any previously selected CPU might disappear through hotplug
4206         *
4207         * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
4208         * as we're not fully set-up yet.
4209         */
4210        p->recent_used_cpu = task_cpu(p);
4211        rseq_migrate(p);
4212        __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
4213#endif
4214        rq = __task_rq_lock(p, &rf);
4215        update_rq_clock(rq);
4216        post_init_entity_util_avg(p);
4217
4218        activate_task(rq, p, ENQUEUE_NOCLOCK);
4219        trace_sched_wakeup_new(p);
4220        check_preempt_curr(rq, p, WF_FORK);
4221#ifdef CONFIG_SMP
4222        if (p->sched_class->task_woken) {
4223                /*
4224                 * Nothing relies on rq->lock after this, so it's fine to
4225                 * drop it.
4226                 */
4227                rq_unpin_lock(rq, &rf);
4228                p->sched_class->task_woken(rq, p);
4229                rq_repin_lock(rq, &rf);
4230        }
4231#endif
4232        task_rq_unlock(rq, p, &rf);
4233}
4234
4235#ifdef CONFIG_PREEMPT_NOTIFIERS
4236
4237static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
4238
4239void preempt_notifier_inc(void)
4240{
4241        static_branch_inc(&preempt_notifier_key);
4242}
4243EXPORT_SYMBOL_GPL(preempt_notifier_inc);
4244
4245void preempt_notifier_dec(void)
4246{
4247        static_branch_dec(&preempt_notifier_key);
4248}
4249EXPORT_SYMBOL_GPL(preempt_notifier_dec);
4250
4251/**
4252 * preempt_notifier_register - tell me when current is being preempted & rescheduled
4253 * @notifier: notifier struct to register
4254 */
4255void preempt_notifier_register(struct preempt_notifier *notifier)
4256{
4257        if (!static_branch_unlikely(&preempt_notifier_key))
4258                WARN(1, "registering preempt_notifier while notifiers disabled\n");
4259
4260        hlist_add_head(&notifier->link, &current->preempt_notifiers);
4261}
4262EXPORT_SYMBOL_GPL(preempt_notifier_register);
4263
4264/**
4265 * preempt_notifier_unregister - no longer interested in preemption notifications
4266 * @notifier: notifier struct to unregister
4267 *
4268 * This is *not* safe to call from within a preemption notifier.
4269 */
4270void preempt_notifier_unregister(struct preempt_notifier *notifier)
4271{
4272        hlist_del(&notifier->link);
4273}
4274EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
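
/*
 * A sketch of typical usage -- names are illustrative, KVM being the main
 * in-tree user: fill in preempt_ops, then register a notifier for current
 * to get callbacks when it is scheduled out and back in:
 *
 *	static void my_sched_in(struct preempt_notifier *pn, int cpu)
 *	{
 *		// reload per-CPU state for current
 *	}
 *
 *	static void my_sched_out(struct preempt_notifier *pn,
 *				 struct task_struct *next)
 *	{
 *		// stash per-CPU state of current
 *	}
 *
 *	static struct preempt_ops my_preempt_ops = {
 *		.sched_in  = my_sched_in,
 *		.sched_out = my_sched_out,
 *	};
 *
 *	preempt_notifier_inc();
 *	preempt_notifier_init(&notifier, &my_preempt_ops);
 *	preempt_notifier_register(&notifier);
 */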
4275
4276static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
4277{
4278        struct preempt_notifier *notifier;
4279
4280        hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
4281                notifier->ops->sched_in(notifier, raw_smp_processor_id());
4282}
4283
4284static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
4285{
4286        if (static_branch_unlikely(&preempt_notifier_key))
4287                __fire_sched_in_preempt_notifiers(curr);
4288}
4289
4290static void
4291__fire_sched_out_preempt_notifiers(struct task_struct *curr,
4292                                   struct task_struct *next)
4293{
4294        struct preempt_notifier *notifier;
4295
4296        hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
4297                notifier->ops->sched_out(notifier, next);
4298}
4299
4300static __always_inline void
4301fire_sched_out_preempt_notifiers(struct task_struct *curr,
4302                                 struct task_struct *next)
4303{
4304        if (static_branch_unlikely(&preempt_notifier_key))
4305                __fire_sched_out_preempt_notifiers(curr, next);
4306}
4307
4308#else /* !CONFIG_PREEMPT_NOTIFIERS */
4309
4310static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
4311{
4312}
4313
4314static inline void
4315fire_sched_out_preempt_notifiers(struct task_struct *curr,
4316                                 struct task_struct *next)
4317{
4318}
4319
4320#endif /* CONFIG_PREEMPT_NOTIFIERS */
4321
4322static inline void prepare_task(struct task_struct *next)
4323{
4324#ifdef CONFIG_SMP
4325        /*
4326         * Claim the task as running; we do this before switching to it
4327         * such that any running task will have this set.
4328         *
4329         * See the ttwu() WF_ON_CPU case and its ordering comment.
4330         */
4331        WRITE_ONCE(next->on_cpu, 1);
4332#endif
4333}
4334
4335static inline void finish_task(struct task_struct *prev)
4336{
4337#ifdef CONFIG_SMP
4338        /*
4339         * This must be the very last reference to @prev from this CPU. After
4340         * p->on_cpu is cleared, the task can be moved to a different CPU. We
4341         * must ensure this doesn't happen until the switch is completely
4342         * finished.
4343         *
4344         * In particular, the load of prev->state in finish_task_switch() must
4345         * happen before this.
4346         *
4347         * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
4348         */
4349        smp_store_release(&prev->on_cpu, 0);
4350#endif
4351}
4352
4353#ifdef CONFIG_SMP
4354
4355static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
4356{
4357        void (*func)(struct rq *rq);
4358        struct callback_head *next;
4359
4360        lockdep_assert_rq_held(rq);
4361
4362        while (head) {
4363                func = (void (*)(struct rq *))head->func;
4364                next = head->next;
4365                head->next = NULL;
4366                head = next;
4367
4368                func(rq);
4369        }
4370}
4371
4372static void balance_push(struct rq *rq);
4373
4374struct callback_head balance_push_callback = {
4375        .next = NULL,
4376        .func = (void (*)(struct callback_head *))balance_push,
4377};
4378
4379static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4380{
4381        struct callback_head *head = rq->balance_callback;
4382
4383        lockdep_assert_rq_held(rq);
4384        if (head)
4385                rq->balance_callback = NULL;
4386
4387        return head;
4388}
4389
4390static void __balance_callbacks(struct rq *rq)
4391{
4392        do_balance_callbacks(rq, splice_balance_callbacks(rq));
4393}
4394
4395static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4396{
4397        unsigned long flags;
4398
4399        if (unlikely(head)) {
4400                raw_spin_rq_lock_irqsave(rq, flags);
4401                do_balance_callbacks(rq, head);
4402                raw_spin_rq_unlock_irqrestore(rq, flags);
4403        }
4404}
4405
4406#else
4407
4408static inline void __balance_callbacks(struct rq *rq)
4409{
4410}
4411
4412static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4413{
4414        return NULL;
4415}
4416
4417static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4418{
4419}
4420
4421#endif
4422
4423static inline void
4424prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
4425{
4426        /*
4427         * Since the runqueue lock will be released by the next
4428         * task (which is an invalid locking op but in the case
4429         * of the scheduler it's an obvious special-case), we
4430         * do an early lockdep release here:
4431         */
4432        rq_unpin_lock(rq, rf);
4433        spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_);
4434#ifdef CONFIG_DEBUG_SPINLOCK
4435        /* this is a valid case when another task releases the spinlock */
4436        rq_lockp(rq)->owner = next;
4437#endif
4438}
4439
4440static inline void finish_lock_switch(struct rq *rq)
4441{
4442        /*
4443         * If we are tracking spinlock dependencies then we have to
4444         * fix up the runqueue lock - which gets 'carried over' from
4445         * prev into current:
4446         */
4447        spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
4448        __balance_callbacks(rq);
4449        raw_spin_rq_unlock_irq(rq);
4450}
4451
4452/*
4453 * NOP if the arch has not defined these:
4454 */
4455
4456#ifndef prepare_arch_switch
4457# define prepare_arch_switch(next)      do { } while (0)
4458#endif
4459
4460#ifndef finish_arch_post_lock_switch
4461# define finish_arch_post_lock_switch() do { } while (0)
4462#endif
4463
4464static inline void kmap_local_sched_out(void)
4465{
4466#ifdef CONFIG_KMAP_LOCAL
4467        if (unlikely(current->kmap_ctrl.idx))
4468                __kmap_local_sched_out();
4469#endif
4470}
4471
4472static inline void kmap_local_sched_in(void)
4473{
4474#ifdef CONFIG_KMAP_LOCAL
4475        if (unlikely(current->kmap_ctrl.idx))
4476                __kmap_local_sched_in();
4477#endif
4478}
4479
4480/**
4481 * prepare_task_switch - prepare to switch tasks
4482 * @rq: the runqueue preparing to switch
4483 * @prev: the current task that is being switched out
4484 * @next: the task we are going to switch to.
4485 *
4486 * This is called with the rq lock held and interrupts off. It must
4487 * be paired with a subsequent finish_task_switch after the context
4488 * switch.
4489 *
4490 * prepare_task_switch sets up locking and calls architecture specific
4491 * hooks.
4492 */
4493static inline void
4494prepare_task_switch(struct rq *rq, struct task_struct *prev,
4495                    struct task_struct *next)
4496{
4497        kcov_prepare_switch(prev);
4498        sched_info_switch(rq, prev, next);
4499        perf_event_task_sched_out(prev, next);
4500        rseq_preempt(prev);
4501        fire_sched_out_preempt_notifiers(prev, next);
4502        kmap_local_sched_out();
4503        prepare_task(next);
4504        prepare_arch_switch(next);
4505}
4506
4507/**
4508 * finish_task_switch - clean up after a task-switch
4509 * @prev: the thread we just switched away from.
4510 *
4511 * finish_task_switch must be called after the context switch, paired
4512 * with a prepare_task_switch call before the context switch.
4513 * finish_task_switch will reconcile locking set up by prepare_task_switch,
4514 * and do any other architecture-specific cleanup actions.
4515 *
4516 * Note that we may have delayed dropping an mm in context_switch(). If
4517 * so, we finish that here outside of the runqueue lock. (Doing it
4518 * with the lock held can cause deadlocks; see schedule() for
4519 * details.)
4520 *
4521 * The context switch has flipped the stack from under us and restored the
4522 * local variables which were saved when this task called schedule() in the
4523 * past. prev == current is still correct but we need to recalculate this_rq
4524 * because prev may have moved to another CPU.
4525 */
4526static struct rq *finish_task_switch(struct task_struct *prev)
4527        __releases(rq->lock)
4528{
4529        struct rq *rq = this_rq();
4530        struct mm_struct *mm = rq->prev_mm;
4531        long prev_state;
4532
4533        /*
4534         * The previous task will have left us with a preempt_count of 2
4535         * because it left us after:
4536         *
4537         *      schedule()
4538         *        preempt_disable();                    // 1
4539         *        __schedule()
4540         *          raw_spin_lock_irq(&rq->lock)        // 2
4541         *
4542         * Also, see FORK_PREEMPT_COUNT.
4543         */
4544        if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
4545                      "corrupted preempt_count: %s/%d/0x%x\n",
4546                      current->comm, current->pid, preempt_count()))
4547                preempt_count_set(FORK_PREEMPT_COUNT);
4548
4549        rq->prev_mm = NULL;
4550
4551        /*
4552         * A task struct has one reference for the use as "current".
4553         * If a task dies, then it sets TASK_DEAD in tsk->state and calls
4554         * schedule one last time. The schedule call will never return, and
4555         * the scheduled task must drop that reference.
4556         *
4557         * We must observe prev->state before clearing prev->on_cpu (in
4558         * finish_task), otherwise a concurrent wakeup can get prev
4559         * running on another CPU and we could race with its RUNNING -> DEAD
4560         * transition, resulting in a double drop.
4561         */
4562        prev_state = READ_ONCE(prev->__state);
4563        vtime_task_switch(prev);
4564        perf_event_task_sched_in(prev, current);
4565        finish_task(prev);
4566        tick_nohz_task_switch();
4567        finish_lock_switch(rq);
4568        finish_arch_post_lock_switch();
4569        kcov_finish_switch(current);
4570        /*
4571         * kmap_local_sched_out() is invoked with rq::lock held and
4572         * interrupts disabled. There is no requirement for that, but the
4573         * sched out code does not have an interrupt enabled section.
4574         * Restoring the maps on sched in does not require interrupts being
4575         * disabled either.
4576         */
4577        kmap_local_sched_in();
4578
4579        fire_sched_in_preempt_notifiers(current);
4580        /*
4581         * When switching through a kernel thread, the loop in
4582         * membarrier_{private,global}_expedited() may have observed that
4583         * kernel thread and not issued an IPI. It is therefore possible to
4584         * schedule between user->kernel->user threads without passing through
4585         * switch_mm(). Membarrier requires a barrier after storing to
4586         * rq->curr, before returning to userspace, so provide them here:
4587         *
4588         * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
4589         *   provided by mmdrop(),
4590         * - a sync_core for SYNC_CORE.
4591         */
4592        if (mm) {
4593                membarrier_mm_sync_core_before_usermode(mm);
4594                mmdrop(mm);
4595        }
4596        if (unlikely(prev_state == TASK_DEAD)) {
4597                if (prev->sched_class->task_dead)
4598                        prev->sched_class->task_dead(prev);
4599
4600                /*
4601                 * Remove function-return probe instances associated with this
4602                 * task and put them back on the free list.
4603                 */
4604                kprobe_flush_task(prev);
4605
4606                /* Task is done with its stack. */
4607                put_task_stack(prev);
4608
4609                put_task_struct_rcu_user(prev);
4610        }
4611
4612        return rq;
4613}
4614
4615/**
4616 * schedule_tail - first thing a freshly forked thread must call.
4617 * @prev: the thread we just switched away from.
4618 */
4619asmlinkage __visible void schedule_tail(struct task_struct *prev)
4620        __releases(rq->lock)
4621{
4622        /*
4623         * New tasks start with FORK_PREEMPT_COUNT, see there and
4624         * finish_task_switch() for details.
4625         *
4626         * finish_task_switch() will drop rq->lock() and lower preempt_count
4627         * and the preempt_enable() will end up enabling preemption (on
4628         * PREEMPT_COUNT kernels).
4629         */
4630
4631        finish_task_switch(prev);
4632        preempt_enable();
4633
4634        if (current->set_child_tid)
4635                put_user(task_pid_vnr(current), current->set_child_tid);
4636
4637        calculate_sigpending();
4638}
4639
4640/*
4641 * context_switch - switch to the new MM and the new thread's register state.
4642 */
4643static __always_inline struct rq *
4644context_switch(struct rq *rq, struct task_struct *prev,
4645               struct task_struct *next, struct rq_flags *rf)
4646{
4647        prepare_task_switch(rq, prev, next);
4648
4649        /*
4650         * For paravirt, this is coupled with an exit in switch_to to
4651         * combine the page table reload and the switch backend into
4652         * one hypercall.
4653         */
4654        arch_start_context_switch(prev);
4655
4656        /*
4657         * kernel -> kernel   lazy + transfer active
4658         *   user -> kernel   lazy + mmgrab() active
4659         *
4660         * kernel ->   user   switch + mmdrop() active
4661         *   user ->   user   switch
4662         */
4663        if (!next->mm) {                                // to kernel
4664                enter_lazy_tlb(prev->active_mm, next);
4665
4666                next->active_mm = prev->active_mm;
4667                if (prev->mm)                           // from user
4668                        mmgrab(prev->active_mm);
4669                else
4670                        prev->active_mm = NULL;
4671        } else {                                        // to user
4672                membarrier_switch_mm(rq, prev->active_mm, next->mm);
4673                /*
4674                 * sys_membarrier() requires an smp_mb() between setting
4675                 * rq->curr / membarrier_switch_mm() and returning to userspace.
4676                 *
4677                 * The below provides this either through switch_mm(), or in
4678                 * case 'prev->active_mm == next->mm' through
4679                 * finish_task_switch()'s mmdrop().
4680                 */
4681                switch_mm_irqs_off(prev->active_mm, next->mm, next);
4682
4683                if (!prev->mm) {                        // from kernel
4684                        /* will mmdrop() in finish_task_switch(). */
4685                        rq->prev_mm = prev->active_mm;
4686                        prev->active_mm = NULL;
4687                }
4688        }
4689
4690        rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
4691
4692        prepare_lock_switch(rq, next, rf);
4693
4694        /* Here we just switch the register state and the stack. */
4695        switch_to(prev, next, prev);
4696        barrier();
4697
4698        return finish_task_switch(prev);
4699}
4700
4701/*
4702 * nr_running and nr_context_switches:
4703 *
4704 * externally visible scheduler statistics: current number of runnable
4705 * threads, total number of context switches performed since bootup.
4706 */
4707unsigned int nr_running(void)
4708{
4709        unsigned int i, sum = 0;
4710
4711        for_each_online_cpu(i)
4712                sum += cpu_rq(i)->nr_running;
4713
4714        return sum;
4715}
4716
4717/*
4718 * Check if only the current task is running on the CPU.
4719 *
4720 * Caution: this function does not check that the caller has disabled
4721 * preemption, thus the result might have a time-of-check-to-time-of-use
4722 * race.  The caller is responsible for using it correctly, for example:
4723 *
4724 * - from a non-preemptible section (of course)
4725 *
4726 * - from a thread that is bound to a single CPU
4727 *
4728 * - in a loop with very short iterations (e.g. a polling loop)
4729 */
4730bool single_task_running(void)
4731{
4732        return raw_rq()->nr_running == 1;
4733}
4734EXPORT_SYMBOL(single_task_running);
4735
4736unsigned long long nr_context_switches(void)
4737{
4738        int i;
4739        unsigned long long sum = 0;
4740
4741        for_each_possible_cpu(i)
4742                sum += cpu_rq(i)->nr_switches;
4743
4744        return sum;
4745}
4746
4747/*
4748 * Consumers of these two interfaces, like for example the cpuidle menu
4749 * governor, are using nonsensical data: they prefer shallow idle state
4750 * selection for a CPU that has IO-wait, even though the task might not end up
4751 * running on that CPU when it does become runnable.
4752 */
4753
4754unsigned int nr_iowait_cpu(int cpu)
4755{
4756        return atomic_read(&cpu_rq(cpu)->nr_iowait);
4757}
4758
4759/*
4760 * IO-wait accounting, and how it's mostly bollocks (on SMP).
4761 *
4762 * The idea behind IO-wait accounting is to account the idle time that we could
4763 * have spent running if it were not for IO. That is, if we were to improve the
4764 * storage performance, we'd have a proportional reduction in IO-wait time.
4765 *
4766 * This all works nicely on UP, where, when a task blocks on IO, we account
4767 * idle time as IO-wait, because if the storage were faster, it could've been
4768 * running and we'd not be idle.
4769 *
4770 * This has been extended to SMP, by doing the same for each CPU. This however
4771 * is broken.
4772 *
4773 * Imagine for instance the case where two tasks block on one CPU; only that
4774 * one CPU will have IO-wait accounted, while the other has regular idle, even
4775 * though, if the storage were faster, both could've run at the same time,
4776 * utilising both CPUs.
4777 *
4778 * This means that, when looking globally, the current IO-wait accounting on
4779 * SMP is a lower bound, due to under-accounting.
4780 *
4781 * Worse, since the numbers are provided per CPU, they are sometimes
4782 * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
4783 * associated with any one particular CPU; it can wake up on a CPU other than
4784 * the one it blocked on. This means the per-CPU IO-wait number is meaningless.
4785 *
4786 * Task CPU affinities can make all that even more 'interesting'.
4787 */
4788
4789unsigned int nr_iowait(void)
4790{
4791        unsigned int i, sum = 0;
4792
4793        for_each_possible_cpu(i)
4794                sum += nr_iowait_cpu(i);
4795
4796        return sum;
4797}
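/*
 * Worked example of the "lower bound" claim above: two tasks, both running on
 * CPU0, block on IO, and both CPUs then go idle:
 *
 *	nr_iowait_cpu(0) == 2, nr_iowait_cpu(1) == 0, nr_iowait() == 2
 *
 * Only CPU0's idle time is accounted as IO-wait, although with faster storage
 * both CPUs could have been doing useful work.
 */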
4798
4799#ifdef CONFIG_SMP
4800
4801/*
4802 * sched_exec - execve() is a valuable balancing opportunity, because at
4803 * this point the task has the smallest effective memory and cache footprint.
4804 */
4805void sched_exec(void)
4806{
4807        struct task_struct *p = current;
4808        unsigned long flags;
4809        int dest_cpu;
4810
4811        raw_spin_lock_irqsave(&p->pi_lock, flags);
4812        dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
4813        if (dest_cpu == smp_processor_id())
4814                goto unlock;
4815
4816        if (likely(cpu_active(dest_cpu))) {
4817                struct migration_arg arg = { p, dest_cpu };
4818
4819                raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4820                stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
4821                return;
4822        }
4823unlock:
4824        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4825}
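/*
 * Presumably the only caller is the execve() path in fs/exec.c, which invokes
 * sched_exec() before the new program image is mapped, i.e. exactly where the
 * small-footprint argument above holds.
 */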
4826
4827#endif
4828
4829DEFINE_PER_CPU(struct kernel_stat, kstat);
4830DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
4831
4832EXPORT_PER_CPU_SYMBOL(kstat);
4833EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
4834
4835/*
4836 * The function fair_sched_class.update_curr accesses the struct curr
4837 * and its field curr->exec_start; when called from task_sched_runtime(),
4838 * we observe a high rate of cache misses in practice.
4839 * Prefetching this data results in improved performance.
4840 */
4841static inline void prefetch_curr_exec_start(struct task_struct *p)
4842{
4843#ifdef CONFIG_FAIR_GROUP_SCHED
4844        struct sched_entity *curr = (&p->se)->cfs_rq->curr;
4845#else
4846        struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
4847#endif
4848        prefetch(curr);
4849        prefetch(&curr->exec_start);
4850}
4851
4852/*
4853 * Return accounted runtime for the task.
4854 * In case the task is currently running, return the runtime plus the task's
4855 * pending runtime that has not been accounted for yet.
4856 */
4857unsigned long long task_sched_runtime(struct task_struct *p)
4858{
4859        struct rq_flags rf;
4860        struct rq *rq;
4861        u64 ns;
4862
4863#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
4864        /*
4865         * 64-bit kernels don't need locks to atomically read a 64-bit value,
4866         * so we have an optimization opportunity when the task's delta_exec is 0.
4867         * Reading ->on_cpu is racy, but this is OK.
4868         *
4869         * If we race with it leaving the CPU, we'll take the lock, so we're correct.
4870         * If we race with it entering the CPU, the unaccounted time is 0. This is
4871         * indistinguishable from the read occurring a few cycles earlier.
4872         * If we see ->on_cpu without ->on_rq, the task is leaving, and has
4873         * been accounted, so we're correct here as well.
4874         */
4875        if (!p->on_cpu || !task_on_rq_queued(p))
4876                return p->se.sum_exec_runtime;
4877#endif
4878
4879        rq = task_rq_lock(p, &rf);
4880        /*
4881         * Must be ->curr _and_ ->on_rq.  If dequeued, we would
4882         * project cycles that may never be accounted to this
4883         * thread, breaking clock_gettime().
4884         */
4885        if (task_current(rq, p) && task_on_rq_queued(p)) {
4886                prefetch_curr_exec_start(p);
4887                update_rq_clock(rq);
4888                p->sched_class->update_curr(rq);
4889        }
4890        ns = p->se.sum_exec_runtime;
4891        task_rq_unlock(rq, p, &rf);
4892
4893        return ns;
4894}
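/*
 * The "breaking clock_gettime()" remark above refers to the posix-cpu-timers
 * code, which (for the CPUCLOCK_SCHED clocks) samples this function; from
 * userspace that presumably looks like:
 *
 *	struct timespec ts;
 *
 *	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);	// thread CPU time
 */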
4895
4896#ifdef CONFIG_SCHED_DEBUG
4897static u64 cpu_resched_latency(struct rq *rq)
4898{
4899        int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
4900        u64 resched_latency, now = rq_clock(rq);
4901        static bool warned_once;
4902
4903        if (sysctl_resched_latency_warn_once && warned_once)
4904                return 0;
4905
4906        if (!need_resched() || !latency_warn_ms)
4907                return 0;
4908
4909        if (system_state == SYSTEM_BOOTING)
4910                return 0;
4911
4912        if (!rq->last_seen_need_resched_ns) {
4913                rq->last_seen_need_resched_ns = now;
4914                rq->ticks_without_resched = 0;
4915                return 0;
4916        }
4917
4918        rq->ticks_without_resched++;
4919        resched_latency = now - rq->last_seen_need_resched_ns;
4920        if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
4921                return 0;
4922
4923        warned_once = true;
4924
4925        return resched_latency;
4926}
4927
4928static int __init setup_resched_latency_warn_ms(char *str)
4929{
4930        long val;
4931
4932        if (kstrtol(str, 0, &val)) {
4933                pr_warn("Unable to set resched_latency_warn_ms\n");
4934                return 1;
4935        }
4936
4937        sysctl_resched_latency_warn_ms = val;
4938        return 1;
4939}
4940__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
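/*
 * Example (kernel command line): "resched_latency_warn_ms=500" raises the
 * warning threshold to half a second; "resched_latency_warn_ms=0" disables
 * the warning entirely (see the !latency_warn_ms check above).
 */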
4941#else
4942static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
4943#endif /* CONFIG_SCHED_DEBUG */
4944
4945/*
4946 * This function gets called by the timer code, with HZ frequency.
4947 * We call it with interrupts disabled.
4948 */
4949void scheduler_tick(void)
4950{
4951        int cpu = smp_processor_id();
4952        struct rq *rq = cpu_rq(cpu);
4953        struct task_struct *curr = rq->curr;
4954        struct rq_flags rf;
4955        unsigned long thermal_pressure;
4956        u64 resched_latency;
4957
4958        arch_scale_freq_tick();
4959        sched_clock_tick();
4960
4961        rq_lock(rq, &rf);
4962
4963        update_rq_clock(rq);
4964        thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
4965        update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
4966        curr->sched_class->task_tick(rq, curr, 0);
4967        if (sched_feat(LATENCY_WARN))
4968                resched_latency = cpu_resched_latency(rq);
4969        calc_global_load_tick(rq);
4970
4971        rq_unlock(rq, &rf);
4972
4973        if (sched_feat(LATENCY_WARN) && resched_latency)
4974                resched_latency_warn(cpu, resched_latency);
4975
4976        perf_event_task_tick();
4977
4978#ifdef CONFIG_SMP
4979        rq->idle_balance = idle_cpu(cpu);
4980        trigger_load_balance(rq);
4981#endif
4982}
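/*
 * The "timer code" above is the periodic tick: update_process_times() (in
 * kernel/time/timer.c, at the time of writing) calls scheduler_tick() from the
 * timer interrupt, which is why interrupts are already disabled here.
 */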
4983
4984#ifdef CONFIG_NO_HZ_FULL
4985
4986struct tick_work {
4987        int                     cpu;
4988        atomic_t                state;
4989        struct delayed_work     work;
4990};
4991/* Values for ->state, see diagram below. */
4992#define TICK_SCHED_REMOTE_OFFLINE       0
4993#define TICK_SCHED_REMOTE_OFFLINING     1
4994#define TICK_SCHED_REMOTE_RUNNING       2
4995
4996/*
4997 * State diagram for ->state:
4998 *
4999 *
5000 *          TICK_SCHED_REMOTE_OFFLINE
5001 *                    |   ^
5002 *                    |   |
5003 *                    |   | sched_tick_remote()
5004 *                    |   |
5005 *                    |   |
5006 *                    +--TICK_SCHED_REMOTE_OFFLINING
5007 *                    |   ^
5008 *                    |   |
5009 * sched_tick_start() |   | sched_tick_stop()
5010 *                    |   |
5011 *                    V   |
5012 *          TICK_SCHED_REMOTE_RUNNING
5013 *
5014 *
5015 * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
5016 * and sched_tick_start() are happy to leave the state in RUNNING.
5017 */
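/*
 * Worked example of the transitions above:
 *
 *	sched_tick_start():	OFFLINE   -> RUNNING	(work queued)
 *	sched_tick_stop():	RUNNING   -> OFFLINING
 *	sched_tick_remote():	OFFLINING -> OFFLINE	(atomic_fetch_add_unless()
 *							 decrements; work is not
 *							 requeued)
 *
 * While the state remains RUNNING, sched_tick_remote() leaves it untouched and
 * simply requeues itself once per second.
 */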
5018
5019static struct tick_work __percpu *tick_work_cpu;
5020
5021static void sched_tick_remote(struct work_struct *work)
5022{
5023        struct delayed_work *dwork = to_delayed_work(work);
5024        struct tick_work *twork = container_of(dwork, struct tick_work, work);
5025        int cpu = twork->cpu;
5026        struct rq *rq = cpu_rq(cpu);
5027        struct task_struct *curr;
5028        struct rq_flags rf;
5029        u64 delta;
5030        int os;
5031
5032        /*
5033         * Handle the tick only if it appears the remote CPU is running in full
5034         * dynticks mode. The check is racy by nature, but missing a tick or
5035         * having one too many is no big deal because the scheduler tick updates
5036         * statistics and checks timeslices in a time-independent way, regardless
5037         * of when exactly it is running.
5038         */
5039        if (!tick_nohz_tick_stopped_cpu(cpu))
5040                goto out_requeue;
5041
5042        rq_lock_irq(rq, &rf);
5043        curr = rq->curr;
5044        if (cpu_is_offline(cpu))
5045                goto out_unlock;
5046
5047        update_rq_clock(rq);
5048
5049        if (!is_idle_task(curr)) {
5050                /*
5051                 * Make sure the next tick runs within a reasonable
5052                 * amount of time.
5053                 */
5054                delta = rq_clock_task(rq) - curr->se.exec_start;
5055                WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
5056        }
5057        curr->sched_class->task_tick(rq, curr, 0);
5058
5059        calc_load_nohz_remote(rq);
5060out_unlock:
5061        rq_unlock_irq(rq, &rf);
5062out_requeue:
5063
5064        /*
5065         * Run the remote tick once per second (1Hz). This arbitrary
5066         * frequency is low enough to avoid overload but high enough
5067         * to keep scheduler internal stats reasonably up to date.  But
5068         * first update state to reflect hotplug activity if required.
5069         */
5070        os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
5071        WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
5072        if (os == TICK_SCHED_REMOTE_RUNNING)
5073                queue_delayed_work(system_unbound_wq, dwork, HZ);
5074}
5075
5076static void sched_tick_start(int cpu)
5077{
5078        int os;
5079        struct tick_work *twork;
5080
5081        if (housekeeping_cpu(cpu, HK_FLAG_TICK))
5082                return;
5083
5084        WARN_ON_ONCE(!tick_work_cpu);
5085
5086        twork = per_cpu_ptr(tick_work_cpu, cpu);
5087        os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
5088        WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
5089        if (os == TICK_SCHED_REMOTE_OFFLINE) {
5090                twork->cpu = cpu;
5091                INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
5092                queue_delayed_work(system_unbound_wq, &twork->work, HZ);
5093        }
5094}
5095
5096#ifdef CONFIG_HOTPLUG_CPU
5097static void sched_tick_stop(int cpu)
5098{
5099        struct tick_work *twork;
5100        int os;
5101
5102        if (housekeeping_cpu(cpu, HK_FLAG_TICK))
5103                return;
5104
5105        WARN_ON_ONCE(!tick_work_cpu);
5106
5107        twork = per_cpu_ptr(tick_work_cpu, cpu);
5108        /* There cannot be competing actions, but don't rely on stop-machine. */
5109        os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
5110        WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
5111        /* Don't cancel, as this would mess up the state machine. */
5112}
5113#endif /* CONFIG_HOTPLUG_CPU */
5114
5115int __init sched_tick_offload_init(void)
5116{
5117        tick_work_cpu = alloc_percpu(struct tick_work);
5118        BUG_ON(!tick_work_cpu);
5119        return 0;
5120}
5121
5122#else /* !CONFIG_NO_HZ_FULL */
5123static inline void sched_tick_start(int cpu) { }
5124static inline void sched_tick_stop(int cpu) { }
5125#endif
5126
5127#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
5128                                defined(CONFIG_TRACE_PREEMPT_TOGGLE))
5129/*
5130 * If the value passed in is equal to the current preempt count
5131 * then we just disabled preemption. Start timing the latency.
5132 */
5133static inline void preempt_latency_start(int val)
5134{
5135        if (preempt_count() == val) {
5136                unsigned long ip = get_lock_parent_ip();
5137#ifdef CONFIG_DEBUG_PREEMPT
5138                current->preempt_disable_ip = ip;
5139#endif
5140                trace_preempt_off(CALLER_ADDR0, ip);
5141        }
5142}
5143
5144void preempt_count_add(int val)
5145{
5146#ifdef CONFIG_DEBUG_PREEMPT
5147        /*
5148         * Underflow?
5149         */
5150        if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
5151                return;
5152#endif
5153        __preempt_count_add(val);
5154#ifdef CONFIG_DEBUG_PREEMPT